minimind_sft / trainer_state.json
caoyizhen's picture
Upload folder using huggingface_hub
b3ec11e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 47455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010536297545042672,
"grad_norm": 8.704546928405762,
"learning_rate": 0.0,
"loss": 3.1115,
"step": 1
},
{
"epoch": 0.010536297545042672,
"grad_norm": 0.45912429690361023,
"learning_rate": 2.085967130214918e-05,
"loss": 2.4969,
"step": 100
},
{
"epoch": 0.021072595090085345,
"grad_norm": 0.45048093795776367,
"learning_rate": 4.193004635482512e-05,
"loss": 2.1901,
"step": 200
},
{
"epoch": 0.03160889263512802,
"grad_norm": 0.4515824615955353,
"learning_rate": 6.300042140750104e-05,
"loss": 2.1404,
"step": 300
},
{
"epoch": 0.04214519018017069,
"grad_norm": 0.44400346279144287,
"learning_rate": 8.4070796460177e-05,
"loss": 2.1252,
"step": 400
},
{
"epoch": 0.05268148772521336,
"grad_norm": 0.43594038486480713,
"learning_rate": 0.00010514117151285294,
"loss": 2.1131,
"step": 500
},
{
"epoch": 0.06321778527025604,
"grad_norm": 0.48596614599227905,
"learning_rate": 0.00012621154656552885,
"loss": 2.1137,
"step": 600
},
{
"epoch": 0.0737540828152987,
"grad_norm": 0.4419753849506378,
"learning_rate": 0.0001472819216182048,
"loss": 2.1132,
"step": 700
},
{
"epoch": 0.08429038036034138,
"grad_norm": 0.4662840962409973,
"learning_rate": 0.00016835229667088076,
"loss": 2.0996,
"step": 800
},
{
"epoch": 0.09482667790538404,
"grad_norm": 0.5108239054679871,
"learning_rate": 0.00018942267172355668,
"loss": 2.097,
"step": 900
},
{
"epoch": 0.10536297545042672,
"grad_norm": 0.4582119584083557,
"learning_rate": 0.00021049304677623262,
"loss": 2.1024,
"step": 1000
},
{
"epoch": 0.1158992729954694,
"grad_norm": 0.4549405574798584,
"learning_rate": 0.00023156342182890856,
"loss": 2.0953,
"step": 1100
},
{
"epoch": 0.12643557054051208,
"grad_norm": 0.5161290168762207,
"learning_rate": 0.0002526337968815845,
"loss": 2.0932,
"step": 1200
},
{
"epoch": 0.13697186808555473,
"grad_norm": 0.5365496277809143,
"learning_rate": 0.00027370417193426044,
"loss": 2.0952,
"step": 1300
},
{
"epoch": 0.1475081656305974,
"grad_norm": 0.5388957858085632,
"learning_rate": 0.0002947745469869364,
"loss": 2.0908,
"step": 1400
},
{
"epoch": 0.15804446317564008,
"grad_norm": 0.46142223477363586,
"learning_rate": 0.00031584492203961227,
"loss": 2.0942,
"step": 1500
},
{
"epoch": 0.16858076072068276,
"grad_norm": 0.46338245272636414,
"learning_rate": 0.00033691529709228824,
"loss": 2.0844,
"step": 1600
},
{
"epoch": 0.17911705826572544,
"grad_norm": 0.48414379358291626,
"learning_rate": 0.0003579856721449642,
"loss": 2.1006,
"step": 1700
},
{
"epoch": 0.1896533558107681,
"grad_norm": 0.5171504020690918,
"learning_rate": 0.0003790560471976401,
"loss": 2.0942,
"step": 1800
},
{
"epoch": 0.20018965335581076,
"grad_norm": 0.4922062158584595,
"learning_rate": 0.0004001264222503161,
"loss": 2.0904,
"step": 1900
},
{
"epoch": 0.21072595090085344,
"grad_norm": 0.6023987531661987,
"learning_rate": 0.000421196797302992,
"loss": 2.0961,
"step": 2000
},
{
"epoch": 0.22126224844589612,
"grad_norm": 0.5345625281333923,
"learning_rate": 0.0004422671723556679,
"loss": 2.0952,
"step": 2100
},
{
"epoch": 0.2317985459909388,
"grad_norm": 0.5566565990447998,
"learning_rate": 0.0004633375474083439,
"loss": 2.0977,
"step": 2200
},
{
"epoch": 0.24233484353598145,
"grad_norm": 0.5522327423095703,
"learning_rate": 0.00048440792246101985,
"loss": 2.1047,
"step": 2300
},
{
"epoch": 0.25287114108102415,
"grad_norm": 0.5047522783279419,
"learning_rate": 0.0004997116365733552,
"loss": 2.0979,
"step": 2400
},
{
"epoch": 0.2634074386260668,
"grad_norm": 0.4784170687198639,
"learning_rate": 0.0004986025464708753,
"loss": 2.102,
"step": 2500
},
{
"epoch": 0.27394373617110945,
"grad_norm": 0.5242518782615662,
"learning_rate": 0.0004974934563683953,
"loss": 2.0998,
"step": 2600
},
{
"epoch": 0.28448003371615216,
"grad_norm": 0.47221043705940247,
"learning_rate": 0.0004963843662659154,
"loss": 2.1038,
"step": 2700
},
{
"epoch": 0.2950163312611948,
"grad_norm": 0.6364756226539612,
"learning_rate": 0.0004952752761634355,
"loss": 2.0921,
"step": 2800
},
{
"epoch": 0.3055526288062375,
"grad_norm": 0.4995081424713135,
"learning_rate": 0.0004941661860609556,
"loss": 2.0924,
"step": 2900
},
{
"epoch": 0.31608892635128016,
"grad_norm": 0.4774039685726166,
"learning_rate": 0.0004930570959584757,
"loss": 2.0911,
"step": 3000
},
{
"epoch": 0.3266252238963228,
"grad_norm": 0.4661494195461273,
"learning_rate": 0.0004919480058559958,
"loss": 2.0936,
"step": 3100
},
{
"epoch": 0.3371615214413655,
"grad_norm": 0.4524199366569519,
"learning_rate": 0.0004908389157535158,
"loss": 2.0885,
"step": 3200
},
{
"epoch": 0.34769781898640817,
"grad_norm": 0.5359546542167664,
"learning_rate": 0.0004897298256510359,
"loss": 2.0836,
"step": 3300
},
{
"epoch": 0.3582341165314509,
"grad_norm": 0.46531638503074646,
"learning_rate": 0.0004886207355485559,
"loss": 2.0732,
"step": 3400
},
{
"epoch": 0.3687704140764935,
"grad_norm": 0.5188313722610474,
"learning_rate": 0.0004875116454460761,
"loss": 2.0701,
"step": 3500
},
{
"epoch": 0.3793067116215362,
"grad_norm": 0.4912554621696472,
"learning_rate": 0.0004864025553435961,
"loss": 2.0734,
"step": 3600
},
{
"epoch": 0.3898430091665789,
"grad_norm": 0.46518808603286743,
"learning_rate": 0.00048529346524111617,
"loss": 2.0673,
"step": 3700
},
{
"epoch": 0.40037930671162153,
"grad_norm": 0.46551960706710815,
"learning_rate": 0.00048418437513863627,
"loss": 2.0733,
"step": 3800
},
{
"epoch": 0.41091560425666424,
"grad_norm": 0.543953001499176,
"learning_rate": 0.0004830752850361563,
"loss": 2.066,
"step": 3900
},
{
"epoch": 0.4214519018017069,
"grad_norm": 0.5043941140174866,
"learning_rate": 0.0004819661949336764,
"loss": 2.0662,
"step": 4000
},
{
"epoch": 0.43198819934674954,
"grad_norm": 0.4602905511856079,
"learning_rate": 0.00048085710483119647,
"loss": 2.0704,
"step": 4100
},
{
"epoch": 0.44252449689179224,
"grad_norm": 0.5024540424346924,
"learning_rate": 0.0004797480147287166,
"loss": 2.0654,
"step": 4200
},
{
"epoch": 0.4530607944368349,
"grad_norm": 0.48672983050346375,
"learning_rate": 0.00047863892462623667,
"loss": 2.0572,
"step": 4300
},
{
"epoch": 0.4635970919818776,
"grad_norm": 0.44031229615211487,
"learning_rate": 0.0004775298345237567,
"loss": 2.0672,
"step": 4400
},
{
"epoch": 0.47413338952692025,
"grad_norm": 0.46178367733955383,
"learning_rate": 0.0004764207444212768,
"loss": 2.0617,
"step": 4500
},
{
"epoch": 0.4846696870719629,
"grad_norm": 0.43033888936042786,
"learning_rate": 0.00047531165431879686,
"loss": 2.0545,
"step": 4600
},
{
"epoch": 0.4952059846170056,
"grad_norm": 0.47262778878211975,
"learning_rate": 0.00047420256421631696,
"loss": 2.0581,
"step": 4700
},
{
"epoch": 0.5057422821620483,
"grad_norm": 0.5511783957481384,
"learning_rate": 0.000473093474113837,
"loss": 2.0497,
"step": 4800
},
{
"epoch": 0.5162785797070909,
"grad_norm": 0.5176393985748291,
"learning_rate": 0.00047198438401135705,
"loss": 2.0466,
"step": 4900
},
{
"epoch": 0.5268148772521336,
"grad_norm": 0.4723651111125946,
"learning_rate": 0.00047087529390887716,
"loss": 2.0468,
"step": 5000
},
{
"epoch": 0.5373511747971763,
"grad_norm": 0.4407116174697876,
"learning_rate": 0.00046976620380639726,
"loss": 2.0551,
"step": 5100
},
{
"epoch": 0.5478874723422189,
"grad_norm": 0.4954802989959717,
"learning_rate": 0.00046865711370391736,
"loss": 2.0418,
"step": 5200
},
{
"epoch": 0.5584237698872616,
"grad_norm": 0.43379735946655273,
"learning_rate": 0.0004675480236014374,
"loss": 2.0455,
"step": 5300
},
{
"epoch": 0.5689600674323043,
"grad_norm": 0.4137374758720398,
"learning_rate": 0.00046643893349895745,
"loss": 2.0435,
"step": 5400
},
{
"epoch": 0.5794963649773469,
"grad_norm": 0.42562806606292725,
"learning_rate": 0.00046532984339647755,
"loss": 2.0462,
"step": 5500
},
{
"epoch": 0.5900326625223896,
"grad_norm": 0.49963149428367615,
"learning_rate": 0.0004642207532939976,
"loss": 2.0461,
"step": 5600
},
{
"epoch": 0.6005689600674323,
"grad_norm": 0.4317498803138733,
"learning_rate": 0.0004631116631915177,
"loss": 2.043,
"step": 5700
},
{
"epoch": 0.611105257612475,
"grad_norm": 0.48041588068008423,
"learning_rate": 0.00046200257308903775,
"loss": 2.0407,
"step": 5800
},
{
"epoch": 0.6216415551575176,
"grad_norm": 0.4867211878299713,
"learning_rate": 0.0004608934829865578,
"loss": 2.0399,
"step": 5900
},
{
"epoch": 0.6321778527025603,
"grad_norm": 0.4728844165802002,
"learning_rate": 0.0004597843928840779,
"loss": 2.0481,
"step": 6000
},
{
"epoch": 0.642714150247603,
"grad_norm": 0.42306768894195557,
"learning_rate": 0.000458675302781598,
"loss": 2.036,
"step": 6100
},
{
"epoch": 0.6532504477926456,
"grad_norm": 0.4628433883190155,
"learning_rate": 0.0004575662126791181,
"loss": 2.0316,
"step": 6200
},
{
"epoch": 0.6637867453376883,
"grad_norm": 0.5507206916809082,
"learning_rate": 0.00045645712257663814,
"loss": 2.037,
"step": 6300
},
{
"epoch": 0.674323042882731,
"grad_norm": 0.5245053172111511,
"learning_rate": 0.00045534803247415824,
"loss": 2.0331,
"step": 6400
},
{
"epoch": 0.6848593404277736,
"grad_norm": 0.4395572543144226,
"learning_rate": 0.0004542389423716783,
"loss": 2.0319,
"step": 6500
},
{
"epoch": 0.6953956379728163,
"grad_norm": 0.4458150565624237,
"learning_rate": 0.00045312985226919833,
"loss": 2.0262,
"step": 6600
},
{
"epoch": 0.705931935517859,
"grad_norm": 0.4589666724205017,
"learning_rate": 0.00045202076216671844,
"loss": 2.0281,
"step": 6700
},
{
"epoch": 0.7164682330629017,
"grad_norm": 0.49729079008102417,
"learning_rate": 0.0004509116720642385,
"loss": 2.0254,
"step": 6800
},
{
"epoch": 0.7270045306079443,
"grad_norm": 0.41776230931282043,
"learning_rate": 0.0004498025819617586,
"loss": 2.0274,
"step": 6900
},
{
"epoch": 0.737540828152987,
"grad_norm": 0.5071027278900146,
"learning_rate": 0.0004486934918592787,
"loss": 2.0226,
"step": 7000
},
{
"epoch": 0.7480771256980298,
"grad_norm": 0.47906801104545593,
"learning_rate": 0.00044758440175679873,
"loss": 2.0225,
"step": 7100
},
{
"epoch": 0.7586134232430723,
"grad_norm": 0.501970648765564,
"learning_rate": 0.00044647531165431883,
"loss": 2.0212,
"step": 7200
},
{
"epoch": 0.769149720788115,
"grad_norm": 0.4116053879261017,
"learning_rate": 0.0004453662215518389,
"loss": 2.0259,
"step": 7300
},
{
"epoch": 0.7796860183331578,
"grad_norm": 0.4501636028289795,
"learning_rate": 0.000444257131449359,
"loss": 2.0205,
"step": 7400
},
{
"epoch": 0.7902223158782004,
"grad_norm": 0.5196821093559265,
"learning_rate": 0.000443148041346879,
"loss": 2.0216,
"step": 7500
},
{
"epoch": 0.8007586134232431,
"grad_norm": 0.4288316071033478,
"learning_rate": 0.00044203895124439907,
"loss": 2.0178,
"step": 7600
},
{
"epoch": 0.8112949109682858,
"grad_norm": 0.41823074221611023,
"learning_rate": 0.00044092986114191917,
"loss": 2.0131,
"step": 7700
},
{
"epoch": 0.8218312085133285,
"grad_norm": 0.4197762608528137,
"learning_rate": 0.0004398207710394392,
"loss": 2.0147,
"step": 7800
},
{
"epoch": 0.8323675060583711,
"grad_norm": 0.4367753267288208,
"learning_rate": 0.0004387116809369593,
"loss": 2.024,
"step": 7900
},
{
"epoch": 0.8429038036034138,
"grad_norm": 0.43104997277259827,
"learning_rate": 0.0004376025908344794,
"loss": 2.0123,
"step": 8000
},
{
"epoch": 0.8534401011484565,
"grad_norm": 0.4320082664489746,
"learning_rate": 0.0004364935007319995,
"loss": 2.0126,
"step": 8100
},
{
"epoch": 0.8639763986934991,
"grad_norm": 0.4402988851070404,
"learning_rate": 0.00043538441062951957,
"loss": 2.0127,
"step": 8200
},
{
"epoch": 0.8745126962385418,
"grad_norm": 0.4331250786781311,
"learning_rate": 0.0004342753205270396,
"loss": 2.0138,
"step": 8300
},
{
"epoch": 0.8850489937835845,
"grad_norm": 0.43004143238067627,
"learning_rate": 0.0004331662304245597,
"loss": 2.0034,
"step": 8400
},
{
"epoch": 0.8955852913286271,
"grad_norm": 0.4509132504463196,
"learning_rate": 0.00043205714032207976,
"loss": 2.0058,
"step": 8500
},
{
"epoch": 0.9061215888736698,
"grad_norm": 0.43472710251808167,
"learning_rate": 0.00043094805021959986,
"loss": 2.0042,
"step": 8600
},
{
"epoch": 0.9166578864187125,
"grad_norm": 0.5285255908966064,
"learning_rate": 0.0004298389601171199,
"loss": 2.0018,
"step": 8700
},
{
"epoch": 0.9271941839637552,
"grad_norm": 0.40384572744369507,
"learning_rate": 0.00042872987001463996,
"loss": 2.0079,
"step": 8800
},
{
"epoch": 0.9377304815087978,
"grad_norm": 0.4634927213191986,
"learning_rate": 0.0004276207799121601,
"loss": 1.9999,
"step": 8900
},
{
"epoch": 0.9482667790538405,
"grad_norm": 0.4585327208042145,
"learning_rate": 0.00042651168980968016,
"loss": 2.0076,
"step": 9000
},
{
"epoch": 0.9588030765988832,
"grad_norm": 0.44724905490875244,
"learning_rate": 0.00042540259970720026,
"loss": 2.0055,
"step": 9100
},
{
"epoch": 0.9693393741439258,
"grad_norm": 0.4930990934371948,
"learning_rate": 0.0004242935096047203,
"loss": 2.0034,
"step": 9200
},
{
"epoch": 0.9798756716889685,
"grad_norm": 0.4466867744922638,
"learning_rate": 0.00042318441950224035,
"loss": 2.0046,
"step": 9300
},
{
"epoch": 0.9904119692340112,
"grad_norm": 0.44049832224845886,
"learning_rate": 0.00042207532939976045,
"loss": 2.0015,
"step": 9400
},
{
"epoch": 1.000948266779054,
"grad_norm": 0.4658033549785614,
"learning_rate": 0.0004209662392972805,
"loss": 1.9938,
"step": 9500
},
{
"epoch": 1.0114845643240966,
"grad_norm": 0.45460689067840576,
"learning_rate": 0.0004198571491948006,
"loss": 1.9694,
"step": 9600
},
{
"epoch": 1.022020861869139,
"grad_norm": 0.43489739298820496,
"learning_rate": 0.00041874805909232065,
"loss": 1.9684,
"step": 9700
},
{
"epoch": 1.0325571594141818,
"grad_norm": 0.4363148510456085,
"learning_rate": 0.0004176389689898407,
"loss": 1.961,
"step": 9800
},
{
"epoch": 1.0430934569592245,
"grad_norm": 0.41610002517700195,
"learning_rate": 0.00041652987888736085,
"loss": 1.9694,
"step": 9900
},
{
"epoch": 1.0536297545042672,
"grad_norm": 0.39003250002861023,
"learning_rate": 0.0004154207887848809,
"loss": 1.9636,
"step": 10000
},
{
"epoch": 1.06416605204931,
"grad_norm": 0.36780601739883423,
"learning_rate": 0.000414311698682401,
"loss": 1.9686,
"step": 10100
},
{
"epoch": 1.0747023495943526,
"grad_norm": 0.4296736419200897,
"learning_rate": 0.00041320260857992104,
"loss": 1.9668,
"step": 10200
},
{
"epoch": 1.0852386471393953,
"grad_norm": 0.45763176679611206,
"learning_rate": 0.00041209351847744114,
"loss": 1.9639,
"step": 10300
},
{
"epoch": 1.0957749446844378,
"grad_norm": 0.41805505752563477,
"learning_rate": 0.0004109844283749612,
"loss": 1.9633,
"step": 10400
},
{
"epoch": 1.1063112422294805,
"grad_norm": 0.42308661341667175,
"learning_rate": 0.00040987533827248124,
"loss": 1.9745,
"step": 10500
},
{
"epoch": 1.1168475397745232,
"grad_norm": 0.4240245223045349,
"learning_rate": 0.00040876624817000134,
"loss": 1.9689,
"step": 10600
},
{
"epoch": 1.127383837319566,
"grad_norm": 0.40365278720855713,
"learning_rate": 0.0004076571580675214,
"loss": 1.9653,
"step": 10700
},
{
"epoch": 1.1379201348646086,
"grad_norm": 0.4099302887916565,
"learning_rate": 0.00040654806796504154,
"loss": 1.958,
"step": 10800
},
{
"epoch": 1.1484564324096513,
"grad_norm": 0.4134521186351776,
"learning_rate": 0.0004054389778625616,
"loss": 1.9686,
"step": 10900
},
{
"epoch": 1.158992729954694,
"grad_norm": 0.40292927622795105,
"learning_rate": 0.00040432988776008163,
"loss": 1.9627,
"step": 11000
},
{
"epoch": 1.1695290274997365,
"grad_norm": 0.4272337555885315,
"learning_rate": 0.00040322079765760173,
"loss": 1.9655,
"step": 11100
},
{
"epoch": 1.1800653250447792,
"grad_norm": 0.43145930767059326,
"learning_rate": 0.0004021117075551218,
"loss": 1.963,
"step": 11200
},
{
"epoch": 1.190601622589822,
"grad_norm": 0.39788371324539185,
"learning_rate": 0.0004010026174526419,
"loss": 1.9616,
"step": 11300
},
{
"epoch": 1.2011379201348646,
"grad_norm": 0.45902547240257263,
"learning_rate": 0.0003998935273501619,
"loss": 1.9589,
"step": 11400
},
{
"epoch": 1.2116742176799074,
"grad_norm": 0.4540606141090393,
"learning_rate": 0.00039878443724768197,
"loss": 1.958,
"step": 11500
},
{
"epoch": 1.2222105152249498,
"grad_norm": 0.4402179419994354,
"learning_rate": 0.0003976753471452021,
"loss": 1.9555,
"step": 11600
},
{
"epoch": 1.2327468127699925,
"grad_norm": 0.389726459980011,
"learning_rate": 0.0003965662570427221,
"loss": 1.9512,
"step": 11700
},
{
"epoch": 1.2432831103150352,
"grad_norm": 0.440833181142807,
"learning_rate": 0.0003954571669402423,
"loss": 1.9561,
"step": 11800
},
{
"epoch": 1.253819407860078,
"grad_norm": 0.3972662091255188,
"learning_rate": 0.0003943480768377623,
"loss": 1.965,
"step": 11900
},
{
"epoch": 1.2643557054051207,
"grad_norm": 0.41316962242126465,
"learning_rate": 0.00039323898673528237,
"loss": 1.9522,
"step": 12000
},
{
"epoch": 1.2748920029501634,
"grad_norm": 0.41109901666641235,
"learning_rate": 0.00039212989663280247,
"loss": 1.9553,
"step": 12100
},
{
"epoch": 1.285428300495206,
"grad_norm": 0.4357900023460388,
"learning_rate": 0.0003910208065303225,
"loss": 1.9513,
"step": 12200
},
{
"epoch": 1.2959645980402485,
"grad_norm": 0.3943662941455841,
"learning_rate": 0.0003899117164278426,
"loss": 1.9611,
"step": 12300
},
{
"epoch": 1.3065008955852913,
"grad_norm": 0.39483174681663513,
"learning_rate": 0.00038880262632536266,
"loss": 1.9478,
"step": 12400
},
{
"epoch": 1.317037193130334,
"grad_norm": 0.43672600388526917,
"learning_rate": 0.00038769353622288276,
"loss": 1.9485,
"step": 12500
},
{
"epoch": 1.3275734906753767,
"grad_norm": 0.42754313349723816,
"learning_rate": 0.0003865844461204028,
"loss": 1.9463,
"step": 12600
},
{
"epoch": 1.3381097882204194,
"grad_norm": 0.41211095452308655,
"learning_rate": 0.0003854753560179229,
"loss": 1.95,
"step": 12700
},
{
"epoch": 1.348646085765462,
"grad_norm": 0.3844158947467804,
"learning_rate": 0.000384366265915443,
"loss": 1.9426,
"step": 12800
},
{
"epoch": 1.3591823833105048,
"grad_norm": 0.4544881582260132,
"learning_rate": 0.00038325717581296306,
"loss": 1.951,
"step": 12900
},
{
"epoch": 1.3697186808555473,
"grad_norm": 0.4058513641357422,
"learning_rate": 0.00038214808571048316,
"loss": 1.9521,
"step": 13000
},
{
"epoch": 1.38025497840059,
"grad_norm": 0.38905027508735657,
"learning_rate": 0.0003810389956080032,
"loss": 1.9534,
"step": 13100
},
{
"epoch": 1.3907912759456327,
"grad_norm": 0.4224783182144165,
"learning_rate": 0.00037992990550552325,
"loss": 1.9485,
"step": 13200
},
{
"epoch": 1.4013275734906754,
"grad_norm": 0.3894629180431366,
"learning_rate": 0.00037882081540304335,
"loss": 1.9459,
"step": 13300
},
{
"epoch": 1.411863871035718,
"grad_norm": 0.4435978829860687,
"learning_rate": 0.0003777117253005634,
"loss": 1.9428,
"step": 13400
},
{
"epoch": 1.4224001685807608,
"grad_norm": 0.4090045690536499,
"learning_rate": 0.0003766026351980835,
"loss": 1.951,
"step": 13500
},
{
"epoch": 1.4329364661258035,
"grad_norm": 0.4192126989364624,
"learning_rate": 0.00037549354509560355,
"loss": 1.9498,
"step": 13600
},
{
"epoch": 1.443472763670846,
"grad_norm": 0.399774968624115,
"learning_rate": 0.00037438445499312365,
"loss": 1.9463,
"step": 13700
},
{
"epoch": 1.4540090612158887,
"grad_norm": 0.3659054636955261,
"learning_rate": 0.00037327536489064375,
"loss": 1.9549,
"step": 13800
},
{
"epoch": 1.4645453587609314,
"grad_norm": 0.385452538728714,
"learning_rate": 0.0003721662747881638,
"loss": 1.9472,
"step": 13900
},
{
"epoch": 1.475081656305974,
"grad_norm": 0.3904755413532257,
"learning_rate": 0.0003710571846856839,
"loss": 1.9438,
"step": 14000
},
{
"epoch": 1.4856179538510168,
"grad_norm": 0.3969903290271759,
"learning_rate": 0.00036994809458320394,
"loss": 1.941,
"step": 14100
},
{
"epoch": 1.4961542513960593,
"grad_norm": 0.4201650321483612,
"learning_rate": 0.000368839004480724,
"loss": 1.9451,
"step": 14200
},
{
"epoch": 1.5066905489411022,
"grad_norm": 0.3867323100566864,
"learning_rate": 0.0003677299143782441,
"loss": 1.9463,
"step": 14300
},
{
"epoch": 1.5172268464861447,
"grad_norm": 0.40658488869667053,
"learning_rate": 0.00036662082427576414,
"loss": 1.9461,
"step": 14400
},
{
"epoch": 1.5277631440311874,
"grad_norm": 0.39837929606437683,
"learning_rate": 0.00036551173417328424,
"loss": 1.9517,
"step": 14500
},
{
"epoch": 1.53829944157623,
"grad_norm": 0.42312178015708923,
"learning_rate": 0.00036440264407080434,
"loss": 1.9351,
"step": 14600
},
{
"epoch": 1.5488357391212728,
"grad_norm": 0.4057867228984833,
"learning_rate": 0.00036329355396832444,
"loss": 1.9403,
"step": 14700
},
{
"epoch": 1.5593720366663155,
"grad_norm": 0.39428508281707764,
"learning_rate": 0.0003621844638658445,
"loss": 1.9484,
"step": 14800
},
{
"epoch": 1.569908334211358,
"grad_norm": 0.381671279668808,
"learning_rate": 0.00036107537376336453,
"loss": 1.9399,
"step": 14900
},
{
"epoch": 1.580444631756401,
"grad_norm": 0.4080953598022461,
"learning_rate": 0.00035996628366088463,
"loss": 1.9316,
"step": 15000
},
{
"epoch": 1.5909809293014434,
"grad_norm": 0.3612942397594452,
"learning_rate": 0.0003588571935584047,
"loss": 1.9337,
"step": 15100
},
{
"epoch": 1.6015172268464861,
"grad_norm": 0.37906691431999207,
"learning_rate": 0.0003577481034559248,
"loss": 1.9338,
"step": 15200
},
{
"epoch": 1.6120535243915288,
"grad_norm": 0.4057066738605499,
"learning_rate": 0.0003566390133534448,
"loss": 1.9399,
"step": 15300
},
{
"epoch": 1.6225898219365715,
"grad_norm": 0.396557480096817,
"learning_rate": 0.0003555299232509649,
"loss": 1.9472,
"step": 15400
},
{
"epoch": 1.6331261194816142,
"grad_norm": 0.37647131085395813,
"learning_rate": 0.000354420833148485,
"loss": 1.9368,
"step": 15500
},
{
"epoch": 1.6436624170266567,
"grad_norm": 0.3920493721961975,
"learning_rate": 0.0003533117430460051,
"loss": 1.9407,
"step": 15600
},
{
"epoch": 1.6541987145716996,
"grad_norm": 0.39372900128364563,
"learning_rate": 0.0003522026529435252,
"loss": 1.9327,
"step": 15700
},
{
"epoch": 1.6647350121167421,
"grad_norm": 0.3832472264766693,
"learning_rate": 0.0003510935628410452,
"loss": 1.9365,
"step": 15800
},
{
"epoch": 1.6752713096617848,
"grad_norm": 0.3669210970401764,
"learning_rate": 0.00034998447273856527,
"loss": 1.9323,
"step": 15900
},
{
"epoch": 1.6858076072068275,
"grad_norm": 0.37810054421424866,
"learning_rate": 0.00034887538263608537,
"loss": 1.93,
"step": 16000
},
{
"epoch": 1.6963439047518702,
"grad_norm": 0.3972882330417633,
"learning_rate": 0.0003477662925336054,
"loss": 1.9299,
"step": 16100
},
{
"epoch": 1.706880202296913,
"grad_norm": 0.39600399136543274,
"learning_rate": 0.0003466572024311255,
"loss": 1.9337,
"step": 16200
},
{
"epoch": 1.7174164998419554,
"grad_norm": 0.367546021938324,
"learning_rate": 0.00034554811232864556,
"loss": 1.934,
"step": 16300
},
{
"epoch": 1.7279527973869984,
"grad_norm": 0.43116411566734314,
"learning_rate": 0.00034443902222616566,
"loss": 1.9296,
"step": 16400
},
{
"epoch": 1.7384890949320408,
"grad_norm": 0.41438373923301697,
"learning_rate": 0.00034332993212368577,
"loss": 1.9304,
"step": 16500
},
{
"epoch": 1.7490253924770836,
"grad_norm": 0.387265145778656,
"learning_rate": 0.0003422208420212058,
"loss": 1.9273,
"step": 16600
},
{
"epoch": 1.7595616900221263,
"grad_norm": 0.3982371687889099,
"learning_rate": 0.0003411117519187259,
"loss": 1.9338,
"step": 16700
},
{
"epoch": 1.7700979875671687,
"grad_norm": 0.3915503919124603,
"learning_rate": 0.00034000266181624596,
"loss": 1.9305,
"step": 16800
},
{
"epoch": 1.7806342851122117,
"grad_norm": 0.38060539960861206,
"learning_rate": 0.00033889357171376606,
"loss": 1.927,
"step": 16900
},
{
"epoch": 1.7911705826572542,
"grad_norm": 0.4222376048564911,
"learning_rate": 0.0003377844816112861,
"loss": 1.9311,
"step": 17000
},
{
"epoch": 1.801706880202297,
"grad_norm": 0.3746761381626129,
"learning_rate": 0.00033667539150880615,
"loss": 1.9269,
"step": 17100
},
{
"epoch": 1.8122431777473396,
"grad_norm": 0.3764290511608124,
"learning_rate": 0.00033556630140632625,
"loss": 1.9239,
"step": 17200
},
{
"epoch": 1.8227794752923823,
"grad_norm": 0.3536926209926605,
"learning_rate": 0.0003344572113038463,
"loss": 1.9312,
"step": 17300
},
{
"epoch": 1.833315772837425,
"grad_norm": 0.3796480596065521,
"learning_rate": 0.0003333481212013664,
"loss": 1.9229,
"step": 17400
},
{
"epoch": 1.8438520703824675,
"grad_norm": 0.3728596866130829,
"learning_rate": 0.0003322390310988865,
"loss": 1.9248,
"step": 17500
},
{
"epoch": 1.8543883679275104,
"grad_norm": 0.3622676432132721,
"learning_rate": 0.00033112994099640655,
"loss": 1.9274,
"step": 17600
},
{
"epoch": 1.8649246654725529,
"grad_norm": 0.3914555013179779,
"learning_rate": 0.00033002085089392665,
"loss": 1.917,
"step": 17700
},
{
"epoch": 1.8754609630175956,
"grad_norm": 0.3367026448249817,
"learning_rate": 0.0003289117607914467,
"loss": 1.9213,
"step": 17800
},
{
"epoch": 1.8859972605626383,
"grad_norm": 0.41049453616142273,
"learning_rate": 0.0003278026706889668,
"loss": 1.921,
"step": 17900
},
{
"epoch": 1.896533558107681,
"grad_norm": 0.38005101680755615,
"learning_rate": 0.00032669358058648684,
"loss": 1.9188,
"step": 18000
},
{
"epoch": 1.9070698556527237,
"grad_norm": 0.3855360150337219,
"learning_rate": 0.0003255844904840069,
"loss": 1.9224,
"step": 18100
},
{
"epoch": 1.9176061531977662,
"grad_norm": 0.3764369487762451,
"learning_rate": 0.000324475400381527,
"loss": 1.9221,
"step": 18200
},
{
"epoch": 1.928142450742809,
"grad_norm": 0.3933279514312744,
"learning_rate": 0.00032336631027904704,
"loss": 1.9233,
"step": 18300
},
{
"epoch": 1.9386787482878516,
"grad_norm": 0.3530935049057007,
"learning_rate": 0.0003222572201765672,
"loss": 1.9218,
"step": 18400
},
{
"epoch": 1.9492150458328943,
"grad_norm": 0.36857885122299194,
"learning_rate": 0.00032114813007408724,
"loss": 1.9211,
"step": 18500
},
{
"epoch": 1.959751343377937,
"grad_norm": 0.3870936930179596,
"learning_rate": 0.00032003903997160734,
"loss": 1.919,
"step": 18600
},
{
"epoch": 1.9702876409229797,
"grad_norm": 0.38852736353874207,
"learning_rate": 0.0003189299498691274,
"loss": 1.9137,
"step": 18700
},
{
"epoch": 1.9808239384680224,
"grad_norm": 0.3802979290485382,
"learning_rate": 0.00031782085976664743,
"loss": 1.9238,
"step": 18800
},
{
"epoch": 1.9913602360130649,
"grad_norm": 0.39477866888046265,
"learning_rate": 0.00031671176966416753,
"loss": 1.9226,
"step": 18900
},
{
"epoch": 2.001896533558108,
"grad_norm": 0.39578545093536377,
"learning_rate": 0.0003156026795616876,
"loss": 1.9067,
"step": 19000
},
{
"epoch": 2.0124328311031503,
"grad_norm": 0.3758637607097626,
"learning_rate": 0.0003144935894592077,
"loss": 1.8889,
"step": 19100
},
{
"epoch": 2.0229691286481932,
"grad_norm": 0.3424636125564575,
"learning_rate": 0.00031338449935672773,
"loss": 1.881,
"step": 19200
},
{
"epoch": 2.0335054261932357,
"grad_norm": 0.3473268151283264,
"learning_rate": 0.0003122754092542478,
"loss": 1.8824,
"step": 19300
},
{
"epoch": 2.044041723738278,
"grad_norm": 0.34891676902770996,
"learning_rate": 0.00031116631915176793,
"loss": 1.8876,
"step": 19400
},
{
"epoch": 2.054578021283321,
"grad_norm": 0.40848681330680847,
"learning_rate": 0.000310057229049288,
"loss": 1.8804,
"step": 19500
},
{
"epoch": 2.0651143188283636,
"grad_norm": 0.3565325140953064,
"learning_rate": 0.0003089481389468081,
"loss": 1.8846,
"step": 19600
},
{
"epoch": 2.0756506163734065,
"grad_norm": 0.3714432418346405,
"learning_rate": 0.0003078390488443281,
"loss": 1.8952,
"step": 19700
},
{
"epoch": 2.086186913918449,
"grad_norm": 0.39024487137794495,
"learning_rate": 0.00030672995874184817,
"loss": 1.8886,
"step": 19800
},
{
"epoch": 2.096723211463492,
"grad_norm": 0.37265217304229736,
"learning_rate": 0.00030562086863936827,
"loss": 1.8815,
"step": 19900
},
{
"epoch": 2.1072595090085344,
"grad_norm": 0.4258386194705963,
"learning_rate": 0.0003045117785368883,
"loss": 1.8797,
"step": 20000
},
{
"epoch": 2.117795806553577,
"grad_norm": 0.3775697350502014,
"learning_rate": 0.0003034026884344084,
"loss": 1.8863,
"step": 20100
},
{
"epoch": 2.12833210409862,
"grad_norm": 0.3451697826385498,
"learning_rate": 0.00030229359833192846,
"loss": 1.8812,
"step": 20200
},
{
"epoch": 2.1388684016436623,
"grad_norm": 0.3747578561306,
"learning_rate": 0.00030118450822944857,
"loss": 1.8884,
"step": 20300
},
{
"epoch": 2.1494046991887052,
"grad_norm": 0.35056072473526,
"learning_rate": 0.00030007541812696867,
"loss": 1.8721,
"step": 20400
},
{
"epoch": 2.1599409967337477,
"grad_norm": 0.3892049491405487,
"learning_rate": 0.0002989663280244887,
"loss": 1.8869,
"step": 20500
},
{
"epoch": 2.1704772942787907,
"grad_norm": 0.4040903151035309,
"learning_rate": 0.0002978572379220088,
"loss": 1.8773,
"step": 20600
},
{
"epoch": 2.181013591823833,
"grad_norm": 0.4122794568538666,
"learning_rate": 0.00029674814781952886,
"loss": 1.8858,
"step": 20700
},
{
"epoch": 2.1915498893688756,
"grad_norm": 0.38314470648765564,
"learning_rate": 0.00029563905771704896,
"loss": 1.8887,
"step": 20800
},
{
"epoch": 2.2020861869139186,
"grad_norm": 0.3841986358165741,
"learning_rate": 0.000294529967614569,
"loss": 1.8886,
"step": 20900
},
{
"epoch": 2.212622484458961,
"grad_norm": 0.3989698588848114,
"learning_rate": 0.00029342087751208905,
"loss": 1.8876,
"step": 21000
},
{
"epoch": 2.223158782004004,
"grad_norm": 0.3878525495529175,
"learning_rate": 0.00029231178740960915,
"loss": 1.8831,
"step": 21100
},
{
"epoch": 2.2336950795490464,
"grad_norm": 0.36871328949928284,
"learning_rate": 0.0002912026973071292,
"loss": 1.8869,
"step": 21200
},
{
"epoch": 2.244231377094089,
"grad_norm": 0.3922217786312103,
"learning_rate": 0.00029009360720464936,
"loss": 1.8867,
"step": 21300
},
{
"epoch": 2.254767674639132,
"grad_norm": 0.37641048431396484,
"learning_rate": 0.0002889845171021694,
"loss": 1.8813,
"step": 21400
},
{
"epoch": 2.2653039721841743,
"grad_norm": 0.3834270238876343,
"learning_rate": 0.00028787542699968945,
"loss": 1.8858,
"step": 21500
},
{
"epoch": 2.2758402697292173,
"grad_norm": 0.3613283336162567,
"learning_rate": 0.00028676633689720955,
"loss": 1.8788,
"step": 21600
},
{
"epoch": 2.2863765672742598,
"grad_norm": 0.3932812511920929,
"learning_rate": 0.0002856572467947296,
"loss": 1.8841,
"step": 21700
},
{
"epoch": 2.2969128648193027,
"grad_norm": 0.380537748336792,
"learning_rate": 0.0002845481566922497,
"loss": 1.8867,
"step": 21800
},
{
"epoch": 2.307449162364345,
"grad_norm": 0.35902804136276245,
"learning_rate": 0.00028343906658976974,
"loss": 1.8925,
"step": 21900
},
{
"epoch": 2.317985459909388,
"grad_norm": 0.3631201386451721,
"learning_rate": 0.0002823299764872898,
"loss": 1.8779,
"step": 22000
},
{
"epoch": 2.3285217574544306,
"grad_norm": 0.3709360361099243,
"learning_rate": 0.0002812208863848099,
"loss": 1.877,
"step": 22100
},
{
"epoch": 2.339058054999473,
"grad_norm": 0.35048261284828186,
"learning_rate": 0.00028011179628233,
"loss": 1.8717,
"step": 22200
},
{
"epoch": 2.349594352544516,
"grad_norm": 0.35067349672317505,
"learning_rate": 0.0002790027061798501,
"loss": 1.8778,
"step": 22300
},
{
"epoch": 2.3601306500895585,
"grad_norm": 0.3626950681209564,
"learning_rate": 0.00027789361607737014,
"loss": 1.886,
"step": 22400
},
{
"epoch": 2.370666947634601,
"grad_norm": 0.35151103138923645,
"learning_rate": 0.00027678452597489024,
"loss": 1.8776,
"step": 22500
},
{
"epoch": 2.381203245179644,
"grad_norm": 0.3527145981788635,
"learning_rate": 0.0002756754358724103,
"loss": 1.8786,
"step": 22600
},
{
"epoch": 2.3917395427246864,
"grad_norm": 0.3571159541606903,
"learning_rate": 0.00027456634576993033,
"loss": 1.8704,
"step": 22700
},
{
"epoch": 2.4022758402697293,
"grad_norm": 0.35839220881462097,
"learning_rate": 0.00027345725566745043,
"loss": 1.8815,
"step": 22800
},
{
"epoch": 2.4128121378147718,
"grad_norm": 0.3516599237918854,
"learning_rate": 0.0002723481655649705,
"loss": 1.8745,
"step": 22900
},
{
"epoch": 2.4233484353598147,
"grad_norm": 0.37703123688697815,
"learning_rate": 0.0002712390754624906,
"loss": 1.8717,
"step": 23000
},
{
"epoch": 2.433884732904857,
"grad_norm": 0.35914528369903564,
"learning_rate": 0.00027012998536001063,
"loss": 1.8751,
"step": 23100
},
{
"epoch": 2.4444210304498997,
"grad_norm": 0.379916787147522,
"learning_rate": 0.00026902089525753073,
"loss": 1.8694,
"step": 23200
},
{
"epoch": 2.4549573279949426,
"grad_norm": 0.38764089345932007,
"learning_rate": 0.00026791180515505083,
"loss": 1.8762,
"step": 23300
},
{
"epoch": 2.465493625539985,
"grad_norm": 0.3425200879573822,
"learning_rate": 0.0002668027150525709,
"loss": 1.8765,
"step": 23400
},
{
"epoch": 2.476029923085028,
"grad_norm": 0.37601912021636963,
"learning_rate": 0.000265693624950091,
"loss": 1.8751,
"step": 23500
},
{
"epoch": 2.4865662206300705,
"grad_norm": 0.3854159414768219,
"learning_rate": 0.000264584534847611,
"loss": 1.8746,
"step": 23600
},
{
"epoch": 2.4971025181751134,
"grad_norm": 0.402798593044281,
"learning_rate": 0.00026347544474513107,
"loss": 1.8758,
"step": 23700
},
{
"epoch": 2.507638815720156,
"grad_norm": 0.3488067388534546,
"learning_rate": 0.00026236635464265117,
"loss": 1.8823,
"step": 23800
},
{
"epoch": 2.5181751132651984,
"grad_norm": 0.38071927428245544,
"learning_rate": 0.0002612572645401712,
"loss": 1.8746,
"step": 23900
},
{
"epoch": 2.5287114108102413,
"grad_norm": 0.3481471538543701,
"learning_rate": 0.0002601481744376913,
"loss": 1.8787,
"step": 24000
},
{
"epoch": 2.539247708355284,
"grad_norm": 0.34442374110221863,
"learning_rate": 0.0002590390843352114,
"loss": 1.88,
"step": 24100
},
{
"epoch": 2.5497840059003267,
"grad_norm": 0.34286609292030334,
"learning_rate": 0.00025792999423273147,
"loss": 1.8711,
"step": 24200
},
{
"epoch": 2.560320303445369,
"grad_norm": 0.3455844819545746,
"learning_rate": 0.00025682090413025157,
"loss": 1.8692,
"step": 24300
},
{
"epoch": 2.570856600990412,
"grad_norm": 0.3363890051841736,
"learning_rate": 0.0002557118140277716,
"loss": 1.8723,
"step": 24400
},
{
"epoch": 2.5813928985354546,
"grad_norm": 0.3758355677127838,
"learning_rate": 0.0002546027239252917,
"loss": 1.8786,
"step": 24500
},
{
"epoch": 2.591929196080497,
"grad_norm": 0.3661966621875763,
"learning_rate": 0.00025349363382281176,
"loss": 1.8742,
"step": 24600
},
{
"epoch": 2.60246549362554,
"grad_norm": 0.3269520103931427,
"learning_rate": 0.00025238454372033186,
"loss": 1.8765,
"step": 24700
},
{
"epoch": 2.6130017911705825,
"grad_norm": 0.37588828802108765,
"learning_rate": 0.0002512754536178519,
"loss": 1.8755,
"step": 24800
},
{
"epoch": 2.6235380887156254,
"grad_norm": 0.34371519088745117,
"learning_rate": 0.00025016636351537195,
"loss": 1.8689,
"step": 24900
},
{
"epoch": 2.634074386260668,
"grad_norm": 0.3703347444534302,
"learning_rate": 0.00024905727341289206,
"loss": 1.869,
"step": 25000
},
{
"epoch": 2.644610683805711,
"grad_norm": 0.3689127266407013,
"learning_rate": 0.00024794818331041216,
"loss": 1.8681,
"step": 25100
},
{
"epoch": 2.6551469813507533,
"grad_norm": 0.3827933371067047,
"learning_rate": 0.0002468390932079322,
"loss": 1.8693,
"step": 25200
},
{
"epoch": 2.665683278895796,
"grad_norm": 0.3681269586086273,
"learning_rate": 0.0002457300031054523,
"loss": 1.8668,
"step": 25300
},
{
"epoch": 2.6762195764408387,
"grad_norm": 0.3521827757358551,
"learning_rate": 0.00024462091300297235,
"loss": 1.872,
"step": 25400
},
{
"epoch": 2.6867558739858812,
"grad_norm": 0.35968610644340515,
"learning_rate": 0.00024351182290049245,
"loss": 1.868,
"step": 25500
},
{
"epoch": 2.697292171530924,
"grad_norm": 0.34900325536727905,
"learning_rate": 0.0002424027327980125,
"loss": 1.8639,
"step": 25600
},
{
"epoch": 2.7078284690759666,
"grad_norm": 0.36115318536758423,
"learning_rate": 0.00024129364269553257,
"loss": 1.8666,
"step": 25700
},
{
"epoch": 2.7183647666210096,
"grad_norm": 0.3598721921443939,
"learning_rate": 0.00024018455259305267,
"loss": 1.8588,
"step": 25800
},
{
"epoch": 2.728901064166052,
"grad_norm": 0.3527396619319916,
"learning_rate": 0.00023907546249057275,
"loss": 1.8626,
"step": 25900
},
{
"epoch": 2.7394373617110945,
"grad_norm": 0.3464626967906952,
"learning_rate": 0.00023796637238809282,
"loss": 1.8724,
"step": 26000
},
{
"epoch": 2.7499736592561375,
"grad_norm": 0.36689963936805725,
"learning_rate": 0.0002368572822856129,
"loss": 1.8658,
"step": 26100
},
{
"epoch": 2.76050995680118,
"grad_norm": 0.3785768151283264,
"learning_rate": 0.00023574819218313297,
"loss": 1.8642,
"step": 26200
},
{
"epoch": 2.771046254346223,
"grad_norm": 0.3481883704662323,
"learning_rate": 0.00023463910208065304,
"loss": 1.8561,
"step": 26300
},
{
"epoch": 2.7815825518912654,
"grad_norm": 0.36630862951278687,
"learning_rate": 0.00023353001197817311,
"loss": 1.862,
"step": 26400
},
{
"epoch": 2.7921188494363083,
"grad_norm": 0.35414576530456543,
"learning_rate": 0.0002324209218756932,
"loss": 1.8676,
"step": 26500
},
{
"epoch": 2.8026551469813508,
"grad_norm": 0.3922441601753235,
"learning_rate": 0.00023131183177321326,
"loss": 1.8709,
"step": 26600
},
{
"epoch": 2.8131914445263932,
"grad_norm": 0.34433358907699585,
"learning_rate": 0.00023020274167073334,
"loss": 1.8676,
"step": 26700
},
{
"epoch": 2.823727742071436,
"grad_norm": 0.32512736320495605,
"learning_rate": 0.0002290936515682534,
"loss": 1.8694,
"step": 26800
},
{
"epoch": 2.8342640396164787,
"grad_norm": 0.3611021041870117,
"learning_rate": 0.00022798456146577348,
"loss": 1.8686,
"step": 26900
},
{
"epoch": 2.8448003371615216,
"grad_norm": 0.34630611538887024,
"learning_rate": 0.00022687547136329356,
"loss": 1.8628,
"step": 27000
},
{
"epoch": 2.855336634706564,
"grad_norm": 0.34372755885124207,
"learning_rate": 0.00022576638126081363,
"loss": 1.8613,
"step": 27100
},
{
"epoch": 2.865872932251607,
"grad_norm": 0.3749391436576843,
"learning_rate": 0.00022465729115833373,
"loss": 1.8725,
"step": 27200
},
{
"epoch": 2.8764092297966495,
"grad_norm": 0.3814404606819153,
"learning_rate": 0.00022354820105585378,
"loss": 1.8627,
"step": 27300
},
{
"epoch": 2.886945527341692,
"grad_norm": 0.35840287804603577,
"learning_rate": 0.00022243911095337385,
"loss": 1.8606,
"step": 27400
},
{
"epoch": 2.897481824886735,
"grad_norm": 0.3533620834350586,
"learning_rate": 0.00022133002085089392,
"loss": 1.8665,
"step": 27500
},
{
"epoch": 2.9080181224317774,
"grad_norm": 0.3550478518009186,
"learning_rate": 0.000220220930748414,
"loss": 1.8587,
"step": 27600
},
{
"epoch": 2.9185544199768203,
"grad_norm": 0.3665110468864441,
"learning_rate": 0.0002191118406459341,
"loss": 1.8655,
"step": 27700
},
{
"epoch": 2.929090717521863,
"grad_norm": 0.3647795021533966,
"learning_rate": 0.00021800275054345415,
"loss": 1.8555,
"step": 27800
},
{
"epoch": 2.9396270150669057,
"grad_norm": 0.34207072854042053,
"learning_rate": 0.00021689366044097422,
"loss": 1.8601,
"step": 27900
},
{
"epoch": 2.950163312611948,
"grad_norm": 0.3422704339027405,
"learning_rate": 0.0002157845703384943,
"loss": 1.8553,
"step": 28000
},
{
"epoch": 2.9606996101569907,
"grad_norm": 0.3600524961948395,
"learning_rate": 0.0002146754802360144,
"loss": 1.8597,
"step": 28100
},
{
"epoch": 2.9712359077020336,
"grad_norm": 0.35774359107017517,
"learning_rate": 0.00021356639013353447,
"loss": 1.86,
"step": 28200
},
{
"epoch": 2.981772205247076,
"grad_norm": 0.3582908511161804,
"learning_rate": 0.00021245730003105454,
"loss": 1.8591,
"step": 28300
},
{
"epoch": 2.9923085027921186,
"grad_norm": 0.36876824498176575,
"learning_rate": 0.0002113482099285746,
"loss": 1.8655,
"step": 28400
},
{
"epoch": 3.0028448003371615,
"grad_norm": 0.3600168526172638,
"learning_rate": 0.00021023911982609466,
"loss": 1.8473,
"step": 28500
},
{
"epoch": 3.013381097882204,
"grad_norm": 0.33718979358673096,
"learning_rate": 0.00020913002972361476,
"loss": 1.8256,
"step": 28600
},
{
"epoch": 3.023917395427247,
"grad_norm": 0.3321118950843811,
"learning_rate": 0.00020802093962113484,
"loss": 1.8251,
"step": 28700
},
{
"epoch": 3.0344536929722894,
"grad_norm": 0.34264570474624634,
"learning_rate": 0.0002069118495186549,
"loss": 1.831,
"step": 28800
},
{
"epoch": 3.0449899905173323,
"grad_norm": 0.3522898852825165,
"learning_rate": 0.00020580275941617496,
"loss": 1.8249,
"step": 28900
},
{
"epoch": 3.055526288062375,
"grad_norm": 0.38659289479255676,
"learning_rate": 0.00020469366931369503,
"loss": 1.829,
"step": 29000
},
{
"epoch": 3.0660625856074177,
"grad_norm": 0.3475963771343231,
"learning_rate": 0.00020358457921121513,
"loss": 1.8287,
"step": 29100
},
{
"epoch": 3.07659888315246,
"grad_norm": 0.37323230504989624,
"learning_rate": 0.0002024754891087352,
"loss": 1.827,
"step": 29200
},
{
"epoch": 3.0871351806975027,
"grad_norm": 0.3953257203102112,
"learning_rate": 0.00020136639900625528,
"loss": 1.8303,
"step": 29300
},
{
"epoch": 3.0976714782425456,
"grad_norm": 0.34784358739852905,
"learning_rate": 0.00020025730890377535,
"loss": 1.8225,
"step": 29400
},
{
"epoch": 3.108207775787588,
"grad_norm": 0.3565751314163208,
"learning_rate": 0.0001991482188012954,
"loss": 1.8292,
"step": 29500
},
{
"epoch": 3.118744073332631,
"grad_norm": 0.368730753660202,
"learning_rate": 0.0001980391286988155,
"loss": 1.8357,
"step": 29600
},
{
"epoch": 3.1292803708776735,
"grad_norm": 0.37354937195777893,
"learning_rate": 0.00019693003859633557,
"loss": 1.8276,
"step": 29700
},
{
"epoch": 3.1398166684227165,
"grad_norm": 0.3472649157047272,
"learning_rate": 0.00019582094849385565,
"loss": 1.8335,
"step": 29800
},
{
"epoch": 3.150352965967759,
"grad_norm": 0.35036763548851013,
"learning_rate": 0.00019471185839137572,
"loss": 1.8276,
"step": 29900
},
{
"epoch": 3.1608892635128014,
"grad_norm": 0.3752099573612213,
"learning_rate": 0.0001936027682888958,
"loss": 1.8308,
"step": 30000
},
{
"epoch": 3.1714255610578443,
"grad_norm": 0.337298184633255,
"learning_rate": 0.00019249367818641587,
"loss": 1.8268,
"step": 30100
},
{
"epoch": 3.181961858602887,
"grad_norm": 0.3451649844646454,
"learning_rate": 0.00019138458808393594,
"loss": 1.825,
"step": 30200
},
{
"epoch": 3.1924981561479298,
"grad_norm": 0.36679157614707947,
"learning_rate": 0.00019027549798145602,
"loss": 1.8389,
"step": 30300
},
{
"epoch": 3.2030344536929722,
"grad_norm": 0.34255459904670715,
"learning_rate": 0.0001891664078789761,
"loss": 1.8321,
"step": 30400
},
{
"epoch": 3.213570751238015,
"grad_norm": 0.36408087611198425,
"learning_rate": 0.0001880573177764962,
"loss": 1.8324,
"step": 30500
},
{
"epoch": 3.2241070487830576,
"grad_norm": 0.32933005690574646,
"learning_rate": 0.00018694822767401624,
"loss": 1.8256,
"step": 30600
},
{
"epoch": 3.2346433463281,
"grad_norm": 0.37449416518211365,
"learning_rate": 0.0001858391375715363,
"loss": 1.8332,
"step": 30700
},
{
"epoch": 3.245179643873143,
"grad_norm": 0.32968634366989136,
"learning_rate": 0.00018473004746905638,
"loss": 1.8247,
"step": 30800
},
{
"epoch": 3.2557159414181855,
"grad_norm": 0.3492085635662079,
"learning_rate": 0.00018362095736657646,
"loss": 1.8339,
"step": 30900
},
{
"epoch": 3.2662522389632285,
"grad_norm": 0.37141090631484985,
"learning_rate": 0.00018251186726409656,
"loss": 1.8332,
"step": 31000
},
{
"epoch": 3.276788536508271,
"grad_norm": 0.3904590308666229,
"learning_rate": 0.0001814027771616166,
"loss": 1.827,
"step": 31100
},
{
"epoch": 3.2873248340533134,
"grad_norm": 0.3764263987541199,
"learning_rate": 0.00018029368705913668,
"loss": 1.827,
"step": 31200
},
{
"epoch": 3.2978611315983564,
"grad_norm": 0.36718282103538513,
"learning_rate": 0.00017918459695665675,
"loss": 1.828,
"step": 31300
},
{
"epoch": 3.308397429143399,
"grad_norm": 0.33118733763694763,
"learning_rate": 0.00017807550685417683,
"loss": 1.8304,
"step": 31400
},
{
"epoch": 3.3189337266884418,
"grad_norm": 0.3702305853366852,
"learning_rate": 0.00017696641675169693,
"loss": 1.8313,
"step": 31500
},
{
"epoch": 3.3294700242334843,
"grad_norm": 0.3547195792198181,
"learning_rate": 0.000175857326649217,
"loss": 1.8306,
"step": 31600
},
{
"epoch": 3.340006321778527,
"grad_norm": 0.3350249230861664,
"learning_rate": 0.00017474823654673705,
"loss": 1.8327,
"step": 31700
},
{
"epoch": 3.3505426193235697,
"grad_norm": 0.34737563133239746,
"learning_rate": 0.00017363914644425712,
"loss": 1.8256,
"step": 31800
},
{
"epoch": 3.361078916868612,
"grad_norm": 0.3753857910633087,
"learning_rate": 0.00017253005634177722,
"loss": 1.8304,
"step": 31900
},
{
"epoch": 3.371615214413655,
"grad_norm": 0.34666532278060913,
"learning_rate": 0.0001714209662392973,
"loss": 1.835,
"step": 32000
},
{
"epoch": 3.3821515119586976,
"grad_norm": 0.3317427933216095,
"learning_rate": 0.00017031187613681737,
"loss": 1.8231,
"step": 32100
},
{
"epoch": 3.3926878095037405,
"grad_norm": 0.33654922246932983,
"learning_rate": 0.00016920278603433742,
"loss": 1.8272,
"step": 32200
},
{
"epoch": 3.403224107048783,
"grad_norm": 0.35222548246383667,
"learning_rate": 0.0001680936959318575,
"loss": 1.8254,
"step": 32300
},
{
"epoch": 3.413760404593826,
"grad_norm": 0.3511573374271393,
"learning_rate": 0.0001669846058293776,
"loss": 1.8297,
"step": 32400
},
{
"epoch": 3.4242967021388684,
"grad_norm": 0.35278716683387756,
"learning_rate": 0.00016587551572689766,
"loss": 1.8269,
"step": 32500
},
{
"epoch": 3.434832999683911,
"grad_norm": 0.3196614682674408,
"learning_rate": 0.00016476642562441774,
"loss": 1.8183,
"step": 32600
},
{
"epoch": 3.445369297228954,
"grad_norm": 0.3310936987400055,
"learning_rate": 0.0001636573355219378,
"loss": 1.8234,
"step": 32700
},
{
"epoch": 3.4559055947739963,
"grad_norm": 0.35424286127090454,
"learning_rate": 0.00016254824541945786,
"loss": 1.8306,
"step": 32800
},
{
"epoch": 3.466441892319039,
"grad_norm": 0.3745037913322449,
"learning_rate": 0.00016143915531697796,
"loss": 1.8313,
"step": 32900
},
{
"epoch": 3.4769781898640817,
"grad_norm": 0.3382411599159241,
"learning_rate": 0.00016033006521449803,
"loss": 1.8225,
"step": 33000
},
{
"epoch": 3.4875144874091246,
"grad_norm": 0.33086690306663513,
"learning_rate": 0.0001592209751120181,
"loss": 1.8208,
"step": 33100
},
{
"epoch": 3.498050784954167,
"grad_norm": 0.3586762249469757,
"learning_rate": 0.00015811188500953818,
"loss": 1.8255,
"step": 33200
},
{
"epoch": 3.5085870824992096,
"grad_norm": 0.3511541187763214,
"learning_rate": 0.00015700279490705825,
"loss": 1.8259,
"step": 33300
},
{
"epoch": 3.5191233800442525,
"grad_norm": 0.3497931659221649,
"learning_rate": 0.00015589370480457833,
"loss": 1.8226,
"step": 33400
},
{
"epoch": 3.529659677589295,
"grad_norm": 0.35156911611557007,
"learning_rate": 0.0001547846147020984,
"loss": 1.8231,
"step": 33500
},
{
"epoch": 3.540195975134338,
"grad_norm": 0.34975793957710266,
"learning_rate": 0.00015367552459961847,
"loss": 1.824,
"step": 33600
},
{
"epoch": 3.5507322726793804,
"grad_norm": 0.3560537099838257,
"learning_rate": 0.00015256643449713855,
"loss": 1.8284,
"step": 33700
},
{
"epoch": 3.5612685702244233,
"grad_norm": 0.37322962284088135,
"learning_rate": 0.00015145734439465865,
"loss": 1.8229,
"step": 33800
},
{
"epoch": 3.571804867769466,
"grad_norm": 0.3404606878757477,
"learning_rate": 0.0001503482542921787,
"loss": 1.8295,
"step": 33900
},
{
"epoch": 3.5823411653145083,
"grad_norm": 0.3346281349658966,
"learning_rate": 0.00014923916418969877,
"loss": 1.8221,
"step": 34000
},
{
"epoch": 3.5928774628595512,
"grad_norm": 0.3319614827632904,
"learning_rate": 0.00014813007408721884,
"loss": 1.8225,
"step": 34100
},
{
"epoch": 3.6034137604045937,
"grad_norm": 0.3317611515522003,
"learning_rate": 0.00014702098398473892,
"loss": 1.8175,
"step": 34200
},
{
"epoch": 3.6139500579496366,
"grad_norm": 0.3446439206600189,
"learning_rate": 0.00014591189388225902,
"loss": 1.8283,
"step": 34300
},
{
"epoch": 3.624486355494679,
"grad_norm": 0.32466185092926025,
"learning_rate": 0.0001448028037797791,
"loss": 1.8201,
"step": 34400
},
{
"epoch": 3.635022653039722,
"grad_norm": 0.3251676559448242,
"learning_rate": 0.00014369371367729914,
"loss": 1.8269,
"step": 34500
},
{
"epoch": 3.6455589505847645,
"grad_norm": 0.3591017723083496,
"learning_rate": 0.0001425846235748192,
"loss": 1.8202,
"step": 34600
},
{
"epoch": 3.656095248129807,
"grad_norm": 0.34030893445014954,
"learning_rate": 0.00014147553347233928,
"loss": 1.8185,
"step": 34700
},
{
"epoch": 3.66663154567485,
"grad_norm": 0.35147637128829956,
"learning_rate": 0.00014036644336985939,
"loss": 1.8252,
"step": 34800
},
{
"epoch": 3.6771678432198924,
"grad_norm": 0.3547748327255249,
"learning_rate": 0.00013925735326737946,
"loss": 1.8142,
"step": 34900
},
{
"epoch": 3.6877041407649354,
"grad_norm": 0.3361000716686249,
"learning_rate": 0.0001381482631648995,
"loss": 1.8235,
"step": 35000
},
{
"epoch": 3.698240438309978,
"grad_norm": 0.3312234580516815,
"learning_rate": 0.00013703917306241958,
"loss": 1.8267,
"step": 35100
},
{
"epoch": 3.7087767358550208,
"grad_norm": 0.36078423261642456,
"learning_rate": 0.00013593008295993965,
"loss": 1.8192,
"step": 35200
},
{
"epoch": 3.7193130334000633,
"grad_norm": 0.32330262660980225,
"learning_rate": 0.00013482099285745975,
"loss": 1.8228,
"step": 35300
},
{
"epoch": 3.7298493309451057,
"grad_norm": 0.34211012721061707,
"learning_rate": 0.00013371190275497983,
"loss": 1.8207,
"step": 35400
},
{
"epoch": 3.7403856284901487,
"grad_norm": 0.34478235244750977,
"learning_rate": 0.0001326028126524999,
"loss": 1.8221,
"step": 35500
},
{
"epoch": 3.750921926035191,
"grad_norm": 0.3438977301120758,
"learning_rate": 0.00013149372255001995,
"loss": 1.8214,
"step": 35600
},
{
"epoch": 3.7614582235802336,
"grad_norm": 0.3275744616985321,
"learning_rate": 0.00013038463244754005,
"loss": 1.8153,
"step": 35700
},
{
"epoch": 3.7719945211252766,
"grad_norm": 0.35410231351852417,
"learning_rate": 0.00012927554234506012,
"loss": 1.8144,
"step": 35800
},
{
"epoch": 3.7825308186703195,
"grad_norm": 0.3045212924480438,
"learning_rate": 0.0001281664522425802,
"loss": 1.8162,
"step": 35900
},
{
"epoch": 3.793067116215362,
"grad_norm": 0.32530274987220764,
"learning_rate": 0.00012705736214010027,
"loss": 1.8212,
"step": 36000
},
{
"epoch": 3.8036034137604045,
"grad_norm": 0.35284802317619324,
"learning_rate": 0.00012594827203762032,
"loss": 1.8217,
"step": 36100
},
{
"epoch": 3.8141397113054474,
"grad_norm": 0.35002532601356506,
"learning_rate": 0.00012483918193514042,
"loss": 1.8179,
"step": 36200
},
{
"epoch": 3.82467600885049,
"grad_norm": 0.33642175793647766,
"learning_rate": 0.0001237300918326605,
"loss": 1.8136,
"step": 36300
},
{
"epoch": 3.8352123063955323,
"grad_norm": 0.3203926086425781,
"learning_rate": 0.00012262100173018056,
"loss": 1.8189,
"step": 36400
},
{
"epoch": 3.8457486039405753,
"grad_norm": 0.3277607560157776,
"learning_rate": 0.00012151191162770062,
"loss": 1.813,
"step": 36500
},
{
"epoch": 3.856284901485618,
"grad_norm": 0.3415702283382416,
"learning_rate": 0.00012040282152522071,
"loss": 1.8157,
"step": 36600
},
{
"epoch": 3.8668211990306607,
"grad_norm": 0.33326780796051025,
"learning_rate": 0.00011929373142274079,
"loss": 1.8144,
"step": 36700
},
{
"epoch": 3.877357496575703,
"grad_norm": 0.3394588530063629,
"learning_rate": 0.00011818464132026086,
"loss": 1.8069,
"step": 36800
},
{
"epoch": 3.887893794120746,
"grad_norm": 0.38374754786491394,
"learning_rate": 0.00011707555121778093,
"loss": 1.8076,
"step": 36900
},
{
"epoch": 3.8984300916657886,
"grad_norm": 0.34460264444351196,
"learning_rate": 0.00011596646111530102,
"loss": 1.8129,
"step": 37000
},
{
"epoch": 3.908966389210831,
"grad_norm": 0.3361436724662781,
"learning_rate": 0.00011485737101282108,
"loss": 1.8105,
"step": 37100
},
{
"epoch": 3.919502686755874,
"grad_norm": 0.35143253207206726,
"learning_rate": 0.00011374828091034115,
"loss": 1.8184,
"step": 37200
},
{
"epoch": 3.930038984300917,
"grad_norm": 0.34239351749420166,
"learning_rate": 0.00011263919080786124,
"loss": 1.8061,
"step": 37300
},
{
"epoch": 3.9405752818459594,
"grad_norm": 0.3523593246936798,
"learning_rate": 0.0001115301007053813,
"loss": 1.8092,
"step": 37400
},
{
"epoch": 3.951111579391002,
"grad_norm": 0.36350205540657043,
"learning_rate": 0.00011042101060290139,
"loss": 1.8094,
"step": 37500
},
{
"epoch": 3.961647876936045,
"grad_norm": 0.3419075906276703,
"learning_rate": 0.00010931192050042146,
"loss": 1.8077,
"step": 37600
},
{
"epoch": 3.9721841744810873,
"grad_norm": 0.3350605070590973,
"learning_rate": 0.00010820283039794154,
"loss": 1.8115,
"step": 37700
},
{
"epoch": 3.9827204720261298,
"grad_norm": 0.33970579504966736,
"learning_rate": 0.00010709374029546161,
"loss": 1.8171,
"step": 37800
},
{
"epoch": 3.9932567695711727,
"grad_norm": 0.36339592933654785,
"learning_rate": 0.00010598465019298167,
"loss": 1.8073,
"step": 37900
},
{
"epoch": 4.003793067116216,
"grad_norm": 0.33541393280029297,
"learning_rate": 0.00010487556009050176,
"loss": 1.7981,
"step": 38000
},
{
"epoch": 4.014329364661258,
"grad_norm": 0.36207860708236694,
"learning_rate": 0.00010376646998802183,
"loss": 1.7909,
"step": 38100
},
{
"epoch": 4.024865662206301,
"grad_norm": 0.34258803725242615,
"learning_rate": 0.0001026573798855419,
"loss": 1.7793,
"step": 38200
},
{
"epoch": 4.0354019597513435,
"grad_norm": 0.34286418557167053,
"learning_rate": 0.00010154828978306198,
"loss": 1.7894,
"step": 38300
},
{
"epoch": 4.0459382572963865,
"grad_norm": 0.3334041237831116,
"learning_rate": 0.00010043919968058205,
"loss": 1.7841,
"step": 38400
},
{
"epoch": 4.0564745548414285,
"grad_norm": 0.3277220129966736,
"learning_rate": 9.933010957810213e-05,
"loss": 1.7851,
"step": 38500
},
{
"epoch": 4.067010852386471,
"grad_norm": 0.3734584450721741,
"learning_rate": 9.82210194756222e-05,
"loss": 1.7889,
"step": 38600
},
{
"epoch": 4.077547149931514,
"grad_norm": 0.3457617461681366,
"learning_rate": 9.711192937314229e-05,
"loss": 1.792,
"step": 38700
},
{
"epoch": 4.088083447476556,
"grad_norm": 0.35224205255508423,
"learning_rate": 9.600283927066235e-05,
"loss": 1.7906,
"step": 38800
},
{
"epoch": 4.098619745021599,
"grad_norm": 0.3286111652851105,
"learning_rate": 9.489374916818243e-05,
"loss": 1.7812,
"step": 38900
},
{
"epoch": 4.109156042566642,
"grad_norm": 0.32292017340660095,
"learning_rate": 9.37846590657025e-05,
"loss": 1.7875,
"step": 39000
},
{
"epoch": 4.119692340111685,
"grad_norm": 0.33784738183021545,
"learning_rate": 9.267556896322257e-05,
"loss": 1.785,
"step": 39100
},
{
"epoch": 4.130228637656727,
"grad_norm": 0.33517780900001526,
"learning_rate": 9.156647886074265e-05,
"loss": 1.7926,
"step": 39200
},
{
"epoch": 4.14076493520177,
"grad_norm": 0.340833842754364,
"learning_rate": 9.045738875826271e-05,
"loss": 1.7875,
"step": 39300
},
{
"epoch": 4.151301232746813,
"grad_norm": 0.3653368353843689,
"learning_rate": 8.93482986557828e-05,
"loss": 1.7843,
"step": 39400
},
{
"epoch": 4.161837530291855,
"grad_norm": 0.3394693434238434,
"learning_rate": 8.823920855330288e-05,
"loss": 1.7804,
"step": 39500
},
{
"epoch": 4.172373827836898,
"grad_norm": 0.3323003947734833,
"learning_rate": 8.713011845082295e-05,
"loss": 1.7848,
"step": 39600
},
{
"epoch": 4.182910125381941,
"grad_norm": 0.35341712832450867,
"learning_rate": 8.602102834834302e-05,
"loss": 1.7833,
"step": 39700
},
{
"epoch": 4.193446422926984,
"grad_norm": 0.3553250730037689,
"learning_rate": 8.49119382458631e-05,
"loss": 1.7844,
"step": 39800
},
{
"epoch": 4.203982720472026,
"grad_norm": 0.3491000831127167,
"learning_rate": 8.380284814338317e-05,
"loss": 1.783,
"step": 39900
},
{
"epoch": 4.214519018017069,
"grad_norm": 0.36473289132118225,
"learning_rate": 8.269375804090324e-05,
"loss": 1.7873,
"step": 40000
},
{
"epoch": 4.225055315562112,
"grad_norm": 0.3357420563697815,
"learning_rate": 8.158466793842332e-05,
"loss": 1.7909,
"step": 40100
},
{
"epoch": 4.235591613107154,
"grad_norm": 0.33982038497924805,
"learning_rate": 8.047557783594339e-05,
"loss": 1.7877,
"step": 40200
},
{
"epoch": 4.246127910652197,
"grad_norm": 0.33362457156181335,
"learning_rate": 7.936648773346347e-05,
"loss": 1.7878,
"step": 40300
},
{
"epoch": 4.25666420819724,
"grad_norm": 0.33826008439064026,
"learning_rate": 7.825739763098354e-05,
"loss": 1.7949,
"step": 40400
},
{
"epoch": 4.267200505742283,
"grad_norm": 0.3940160572528839,
"learning_rate": 7.714830752850361e-05,
"loss": 1.7886,
"step": 40500
},
{
"epoch": 4.277736803287325,
"grad_norm": 0.33485040068626404,
"learning_rate": 7.60392174260237e-05,
"loss": 1.7837,
"step": 40600
},
{
"epoch": 4.288273100832368,
"grad_norm": 0.3465060591697693,
"learning_rate": 7.493012732354376e-05,
"loss": 1.7794,
"step": 40700
},
{
"epoch": 4.2988093983774105,
"grad_norm": 0.3455548584461212,
"learning_rate": 7.382103722106385e-05,
"loss": 1.7877,
"step": 40800
},
{
"epoch": 4.3093456959224525,
"grad_norm": 0.33163055777549744,
"learning_rate": 7.271194711858392e-05,
"loss": 1.7861,
"step": 40900
},
{
"epoch": 4.3198819934674955,
"grad_norm": 0.34442830085754395,
"learning_rate": 7.160285701610398e-05,
"loss": 1.7861,
"step": 41000
},
{
"epoch": 4.330418291012538,
"grad_norm": 0.3633157014846802,
"learning_rate": 7.049376691362407e-05,
"loss": 1.7842,
"step": 41100
},
{
"epoch": 4.340954588557581,
"grad_norm": 0.3231643736362457,
"learning_rate": 6.938467681114414e-05,
"loss": 1.7833,
"step": 41200
},
{
"epoch": 4.351490886102623,
"grad_norm": 0.36037677526474,
"learning_rate": 6.827558670866422e-05,
"loss": 1.7836,
"step": 41300
},
{
"epoch": 4.362027183647666,
"grad_norm": 0.3292723000049591,
"learning_rate": 6.716649660618429e-05,
"loss": 1.7806,
"step": 41400
},
{
"epoch": 4.372563481192709,
"grad_norm": 0.37054258584976196,
"learning_rate": 6.605740650370436e-05,
"loss": 1.79,
"step": 41500
},
{
"epoch": 4.383099778737751,
"grad_norm": 0.3358231782913208,
"learning_rate": 6.494831640122444e-05,
"loss": 1.7882,
"step": 41600
},
{
"epoch": 4.393636076282794,
"grad_norm": 0.3368220031261444,
"learning_rate": 6.383922629874451e-05,
"loss": 1.7812,
"step": 41700
},
{
"epoch": 4.404172373827837,
"grad_norm": 0.34333834052085876,
"learning_rate": 6.273013619626458e-05,
"loss": 1.7837,
"step": 41800
},
{
"epoch": 4.41470867137288,
"grad_norm": 0.3434154987335205,
"learning_rate": 6.162104609378466e-05,
"loss": 1.7858,
"step": 41900
},
{
"epoch": 4.425244968917922,
"grad_norm": 0.35153815150260925,
"learning_rate": 6.051195599130473e-05,
"loss": 1.7759,
"step": 42000
},
{
"epoch": 4.435781266462965,
"grad_norm": 0.3414738178253174,
"learning_rate": 5.940286588882481e-05,
"loss": 1.7827,
"step": 42100
},
{
"epoch": 4.446317564008008,
"grad_norm": 0.3285759687423706,
"learning_rate": 5.8293775786344886e-05,
"loss": 1.7826,
"step": 42200
},
{
"epoch": 4.45685386155305,
"grad_norm": 0.35258546471595764,
"learning_rate": 5.718468568386496e-05,
"loss": 1.7883,
"step": 42300
},
{
"epoch": 4.467390159098093,
"grad_norm": 0.33706724643707275,
"learning_rate": 5.607559558138503e-05,
"loss": 1.7786,
"step": 42400
},
{
"epoch": 4.477926456643136,
"grad_norm": 0.3357242941856384,
"learning_rate": 5.496650547890511e-05,
"loss": 1.7904,
"step": 42500
},
{
"epoch": 4.488462754188178,
"grad_norm": 0.3552809953689575,
"learning_rate": 5.385741537642518e-05,
"loss": 1.7858,
"step": 42600
},
{
"epoch": 4.498999051733221,
"grad_norm": 0.3606029450893402,
"learning_rate": 5.2748325273945254e-05,
"loss": 1.7767,
"step": 42700
},
{
"epoch": 4.509535349278264,
"grad_norm": 0.3668212592601776,
"learning_rate": 5.163923517146533e-05,
"loss": 1.7841,
"step": 42800
},
{
"epoch": 4.520071646823307,
"grad_norm": 0.34113767743110657,
"learning_rate": 5.053014506898541e-05,
"loss": 1.7777,
"step": 42900
},
{
"epoch": 4.530607944368349,
"grad_norm": 0.33344870805740356,
"learning_rate": 4.942105496650548e-05,
"loss": 1.7789,
"step": 43000
},
{
"epoch": 4.541144241913392,
"grad_norm": 0.34441855549812317,
"learning_rate": 4.8311964864025556e-05,
"loss": 1.786,
"step": 43100
},
{
"epoch": 4.5516805394584345,
"grad_norm": 0.3361603617668152,
"learning_rate": 4.720287476154563e-05,
"loss": 1.7835,
"step": 43200
},
{
"epoch": 4.5622168370034775,
"grad_norm": 0.3377070426940918,
"learning_rate": 4.60937846590657e-05,
"loss": 1.7842,
"step": 43300
},
{
"epoch": 4.5727531345485195,
"grad_norm": 0.3532165288925171,
"learning_rate": 4.4984694556585777e-05,
"loss": 1.7848,
"step": 43400
},
{
"epoch": 4.583289432093562,
"grad_norm": 0.35418322682380676,
"learning_rate": 4.387560445410585e-05,
"loss": 1.7854,
"step": 43500
},
{
"epoch": 4.593825729638605,
"grad_norm": 0.33272701501846313,
"learning_rate": 4.276651435162593e-05,
"loss": 1.7754,
"step": 43600
},
{
"epoch": 4.604362027183647,
"grad_norm": 0.36113685369491577,
"learning_rate": 4.1657424249146004e-05,
"loss": 1.7752,
"step": 43700
},
{
"epoch": 4.61489832472869,
"grad_norm": 0.34041377902030945,
"learning_rate": 4.054833414666607e-05,
"loss": 1.774,
"step": 43800
},
{
"epoch": 4.625434622273733,
"grad_norm": 0.3422810435295105,
"learning_rate": 3.943924404418615e-05,
"loss": 1.7832,
"step": 43900
},
{
"epoch": 4.635970919818776,
"grad_norm": 0.3397616744041443,
"learning_rate": 3.8330153941706225e-05,
"loss": 1.78,
"step": 44000
},
{
"epoch": 4.646507217363818,
"grad_norm": 0.3389655649662018,
"learning_rate": 3.72210638392263e-05,
"loss": 1.7771,
"step": 44100
},
{
"epoch": 4.657043514908861,
"grad_norm": 0.3590547442436218,
"learning_rate": 3.611197373674637e-05,
"loss": 1.7838,
"step": 44200
},
{
"epoch": 4.667579812453904,
"grad_norm": 0.33880913257598877,
"learning_rate": 3.500288363426645e-05,
"loss": 1.7708,
"step": 44300
},
{
"epoch": 4.678116109998946,
"grad_norm": 0.3376372456550598,
"learning_rate": 3.389379353178653e-05,
"loss": 1.7767,
"step": 44400
},
{
"epoch": 4.688652407543989,
"grad_norm": 0.3335518538951874,
"learning_rate": 3.2784703429306594e-05,
"loss": 1.7784,
"step": 44500
},
{
"epoch": 4.699188705089032,
"grad_norm": 0.37929996848106384,
"learning_rate": 3.167561332682667e-05,
"loss": 1.7714,
"step": 44600
},
{
"epoch": 4.709725002634074,
"grad_norm": 0.3256159722805023,
"learning_rate": 3.056652322434675e-05,
"loss": 1.7824,
"step": 44700
},
{
"epoch": 4.720261300179117,
"grad_norm": 0.34018459916114807,
"learning_rate": 2.9457433121866822e-05,
"loss": 1.7821,
"step": 44800
},
{
"epoch": 4.73079759772416,
"grad_norm": 0.3662751317024231,
"learning_rate": 2.8348343019386895e-05,
"loss": 1.7799,
"step": 44900
},
{
"epoch": 4.741333895269202,
"grad_norm": 0.32580700516700745,
"learning_rate": 2.723925291690697e-05,
"loss": 1.7801,
"step": 45000
},
{
"epoch": 4.751870192814245,
"grad_norm": 0.3326426148414612,
"learning_rate": 2.6130162814427046e-05,
"loss": 1.7824,
"step": 45100
},
{
"epoch": 4.762406490359288,
"grad_norm": 0.3480491042137146,
"learning_rate": 2.502107271194712e-05,
"loss": 1.7738,
"step": 45200
},
{
"epoch": 4.772942787904331,
"grad_norm": 0.3338908553123474,
"learning_rate": 2.3911982609467194e-05,
"loss": 1.7809,
"step": 45300
},
{
"epoch": 4.783479085449373,
"grad_norm": 0.35016825795173645,
"learning_rate": 2.2802892506987267e-05,
"loss": 1.7798,
"step": 45400
},
{
"epoch": 4.794015382994416,
"grad_norm": 0.35119980573654175,
"learning_rate": 2.1693802404507344e-05,
"loss": 1.7772,
"step": 45500
},
{
"epoch": 4.804551680539459,
"grad_norm": 0.34869563579559326,
"learning_rate": 2.0584712302027415e-05,
"loss": 1.7834,
"step": 45600
},
{
"epoch": 4.815087978084501,
"grad_norm": 0.3165900409221649,
"learning_rate": 1.9475622199547492e-05,
"loss": 1.7766,
"step": 45700
},
{
"epoch": 4.8256242756295435,
"grad_norm": 0.33901646733283997,
"learning_rate": 1.836653209706757e-05,
"loss": 1.7781,
"step": 45800
},
{
"epoch": 4.8361605731745865,
"grad_norm": 0.34397250413894653,
"learning_rate": 1.725744199458764e-05,
"loss": 1.7773,
"step": 45900
},
{
"epoch": 4.846696870719629,
"grad_norm": 0.3640625476837158,
"learning_rate": 1.6148351892107716e-05,
"loss": 1.7775,
"step": 46000
},
{
"epoch": 4.857233168264671,
"grad_norm": 0.3395892381668091,
"learning_rate": 1.503926178962779e-05,
"loss": 1.7817,
"step": 46100
},
{
"epoch": 4.867769465809714,
"grad_norm": 0.3353815972805023,
"learning_rate": 1.3930171687147865e-05,
"loss": 1.7759,
"step": 46200
},
{
"epoch": 4.878305763354757,
"grad_norm": 0.34299150109291077,
"learning_rate": 1.2821081584667939e-05,
"loss": 1.779,
"step": 46300
},
{
"epoch": 4.888842060899799,
"grad_norm": 0.34803491830825806,
"learning_rate": 1.1711991482188014e-05,
"loss": 1.7787,
"step": 46400
},
{
"epoch": 4.899378358444842,
"grad_norm": 0.3452516198158264,
"learning_rate": 1.0602901379708088e-05,
"loss": 1.7822,
"step": 46500
},
{
"epoch": 4.909914655989885,
"grad_norm": 0.32334357500076294,
"learning_rate": 9.493811277228162e-06,
"loss": 1.7774,
"step": 46600
},
{
"epoch": 4.920450953534928,
"grad_norm": 0.34011390805244446,
"learning_rate": 8.384721174748237e-06,
"loss": 1.7788,
"step": 46700
},
{
"epoch": 4.93098725107997,
"grad_norm": 0.3399524688720703,
"learning_rate": 7.2756310722683116e-06,
"loss": 1.778,
"step": 46800
},
{
"epoch": 4.941523548625013,
"grad_norm": 0.33615124225616455,
"learning_rate": 6.166540969788386e-06,
"loss": 1.7771,
"step": 46900
},
{
"epoch": 4.952059846170056,
"grad_norm": 0.3466767966747284,
"learning_rate": 5.05745086730846e-06,
"loss": 1.7774,
"step": 47000
},
{
"epoch": 4.962596143715098,
"grad_norm": 0.33684036135673523,
"learning_rate": 3.948360764828534e-06,
"loss": 1.7735,
"step": 47100
},
{
"epoch": 4.973132441260141,
"grad_norm": 0.3275541663169861,
"learning_rate": 2.8392706623486093e-06,
"loss": 1.773,
"step": 47200
},
{
"epoch": 4.983668738805184,
"grad_norm": 0.3321060240268707,
"learning_rate": 1.7301805598686838e-06,
"loss": 1.7764,
"step": 47300
},
{
"epoch": 4.994205036350227,
"grad_norm": 0.3356621265411377,
"learning_rate": 6.210904573887583e-07,
"loss": 1.7762,
"step": 47400
}
],
"logging_steps": 100,
"max_steps": 47455,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.0383510041641344e+17,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}