{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 47455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010536297545042672, "grad_norm": 8.704546928405762, "learning_rate": 0.0, "loss": 3.1115, "step": 1 }, { "epoch": 0.010536297545042672, "grad_norm": 0.45912429690361023, "learning_rate": 2.085967130214918e-05, "loss": 2.4969, "step": 100 }, { "epoch": 0.021072595090085345, "grad_norm": 0.45048093795776367, "learning_rate": 4.193004635482512e-05, "loss": 2.1901, "step": 200 }, { "epoch": 0.03160889263512802, "grad_norm": 0.4515824615955353, "learning_rate": 6.300042140750104e-05, "loss": 2.1404, "step": 300 }, { "epoch": 0.04214519018017069, "grad_norm": 0.44400346279144287, "learning_rate": 8.4070796460177e-05, "loss": 2.1252, "step": 400 }, { "epoch": 0.05268148772521336, "grad_norm": 0.43594038486480713, "learning_rate": 0.00010514117151285294, "loss": 2.1131, "step": 500 }, { "epoch": 0.06321778527025604, "grad_norm": 0.48596614599227905, "learning_rate": 0.00012621154656552885, "loss": 2.1137, "step": 600 }, { "epoch": 0.0737540828152987, "grad_norm": 0.4419753849506378, "learning_rate": 0.0001472819216182048, "loss": 2.1132, "step": 700 }, { "epoch": 0.08429038036034138, "grad_norm": 0.4662840962409973, "learning_rate": 0.00016835229667088076, "loss": 2.0996, "step": 800 }, { "epoch": 0.09482667790538404, "grad_norm": 0.5108239054679871, "learning_rate": 0.00018942267172355668, "loss": 2.097, "step": 900 }, { "epoch": 0.10536297545042672, "grad_norm": 0.4582119584083557, "learning_rate": 0.00021049304677623262, "loss": 2.1024, "step": 1000 }, { "epoch": 0.1158992729954694, "grad_norm": 0.4549405574798584, "learning_rate": 0.00023156342182890856, "loss": 2.0953, "step": 1100 }, { "epoch": 0.12643557054051208, "grad_norm": 0.5161290168762207, "learning_rate": 0.0002526337968815845, "loss": 2.0932, "step": 1200 }, { "epoch": 0.13697186808555473, "grad_norm": 0.5365496277809143, "learning_rate": 0.00027370417193426044, "loss": 2.0952, "step": 1300 }, { "epoch": 0.1475081656305974, "grad_norm": 0.5388957858085632, "learning_rate": 0.0002947745469869364, "loss": 2.0908, "step": 1400 }, { "epoch": 0.15804446317564008, "grad_norm": 0.46142223477363586, "learning_rate": 0.00031584492203961227, "loss": 2.0942, "step": 1500 }, { "epoch": 0.16858076072068276, "grad_norm": 0.46338245272636414, "learning_rate": 0.00033691529709228824, "loss": 2.0844, "step": 1600 }, { "epoch": 0.17911705826572544, "grad_norm": 0.48414379358291626, "learning_rate": 0.0003579856721449642, "loss": 2.1006, "step": 1700 }, { "epoch": 0.1896533558107681, "grad_norm": 0.5171504020690918, "learning_rate": 0.0003790560471976401, "loss": 2.0942, "step": 1800 }, { "epoch": 0.20018965335581076, "grad_norm": 0.4922062158584595, "learning_rate": 0.0004001264222503161, "loss": 2.0904, "step": 1900 }, { "epoch": 0.21072595090085344, "grad_norm": 0.6023987531661987, "learning_rate": 0.000421196797302992, "loss": 2.0961, "step": 2000 }, { "epoch": 0.22126224844589612, "grad_norm": 0.5345625281333923, "learning_rate": 0.0004422671723556679, "loss": 2.0952, "step": 2100 }, { "epoch": 0.2317985459909388, "grad_norm": 0.5566565990447998, "learning_rate": 0.0004633375474083439, "loss": 2.0977, "step": 2200 }, { "epoch": 0.24233484353598145, "grad_norm": 0.5522327423095703, "learning_rate": 0.00048440792246101985, "loss": 2.1047, "step": 2300 }, { "epoch": 0.25287114108102415, "grad_norm": 0.5047522783279419, "learning_rate": 0.0004997116365733552, "loss": 2.0979, "step": 2400 }, { "epoch": 0.2634074386260668, "grad_norm": 0.4784170687198639, "learning_rate": 0.0004986025464708753, "loss": 2.102, "step": 2500 }, { "epoch": 0.27394373617110945, "grad_norm": 0.5242518782615662, "learning_rate": 0.0004974934563683953, "loss": 2.0998, "step": 2600 }, { "epoch": 0.28448003371615216, "grad_norm": 0.47221043705940247, "learning_rate": 0.0004963843662659154, "loss": 2.1038, "step": 2700 }, { "epoch": 0.2950163312611948, "grad_norm": 0.6364756226539612, "learning_rate": 0.0004952752761634355, "loss": 2.0921, "step": 2800 }, { "epoch": 0.3055526288062375, "grad_norm": 0.4995081424713135, "learning_rate": 0.0004941661860609556, "loss": 2.0924, "step": 2900 }, { "epoch": 0.31608892635128016, "grad_norm": 0.4774039685726166, "learning_rate": 0.0004930570959584757, "loss": 2.0911, "step": 3000 }, { "epoch": 0.3266252238963228, "grad_norm": 0.4661494195461273, "learning_rate": 0.0004919480058559958, "loss": 2.0936, "step": 3100 }, { "epoch": 0.3371615214413655, "grad_norm": 0.4524199366569519, "learning_rate": 0.0004908389157535158, "loss": 2.0885, "step": 3200 }, { "epoch": 0.34769781898640817, "grad_norm": 0.5359546542167664, "learning_rate": 0.0004897298256510359, "loss": 2.0836, "step": 3300 }, { "epoch": 0.3582341165314509, "grad_norm": 0.46531638503074646, "learning_rate": 0.0004886207355485559, "loss": 2.0732, "step": 3400 }, { "epoch": 0.3687704140764935, "grad_norm": 0.5188313722610474, "learning_rate": 0.0004875116454460761, "loss": 2.0701, "step": 3500 }, { "epoch": 0.3793067116215362, "grad_norm": 0.4912554621696472, "learning_rate": 0.0004864025553435961, "loss": 2.0734, "step": 3600 }, { "epoch": 0.3898430091665789, "grad_norm": 0.46518808603286743, "learning_rate": 0.00048529346524111617, "loss": 2.0673, "step": 3700 }, { "epoch": 0.40037930671162153, "grad_norm": 0.46551960706710815, "learning_rate": 0.00048418437513863627, "loss": 2.0733, "step": 3800 }, { "epoch": 0.41091560425666424, "grad_norm": 0.543953001499176, "learning_rate": 0.0004830752850361563, "loss": 2.066, "step": 3900 }, { "epoch": 0.4214519018017069, "grad_norm": 0.5043941140174866, "learning_rate": 0.0004819661949336764, "loss": 2.0662, "step": 4000 }, { "epoch": 0.43198819934674954, "grad_norm": 0.4602905511856079, "learning_rate": 0.00048085710483119647, "loss": 2.0704, "step": 4100 }, { "epoch": 0.44252449689179224, "grad_norm": 0.5024540424346924, "learning_rate": 0.0004797480147287166, "loss": 2.0654, "step": 4200 }, { "epoch": 0.4530607944368349, "grad_norm": 0.48672983050346375, "learning_rate": 0.00047863892462623667, "loss": 2.0572, "step": 4300 }, { "epoch": 0.4635970919818776, "grad_norm": 0.44031229615211487, "learning_rate": 0.0004775298345237567, "loss": 2.0672, "step": 4400 }, { "epoch": 0.47413338952692025, "grad_norm": 0.46178367733955383, "learning_rate": 0.0004764207444212768, "loss": 2.0617, "step": 4500 }, { "epoch": 0.4846696870719629, "grad_norm": 0.43033888936042786, "learning_rate": 0.00047531165431879686, "loss": 2.0545, "step": 4600 }, { "epoch": 0.4952059846170056, "grad_norm": 0.47262778878211975, "learning_rate": 0.00047420256421631696, "loss": 2.0581, "step": 4700 }, { "epoch": 0.5057422821620483, "grad_norm": 0.5511783957481384, "learning_rate": 0.000473093474113837, "loss": 2.0497, "step": 4800 }, { "epoch": 0.5162785797070909, "grad_norm": 0.5176393985748291, "learning_rate": 0.00047198438401135705, "loss": 2.0466, "step": 4900 }, { "epoch": 0.5268148772521336, "grad_norm": 0.4723651111125946, "learning_rate": 0.00047087529390887716, "loss": 2.0468, "step": 5000 }, { "epoch": 0.5373511747971763, "grad_norm": 0.4407116174697876, "learning_rate": 0.00046976620380639726, "loss": 2.0551, "step": 5100 }, { "epoch": 0.5478874723422189, "grad_norm": 0.4954802989959717, "learning_rate": 0.00046865711370391736, "loss": 2.0418, "step": 5200 }, { "epoch": 0.5584237698872616, "grad_norm": 0.43379735946655273, "learning_rate": 0.0004675480236014374, "loss": 2.0455, "step": 5300 }, { "epoch": 0.5689600674323043, "grad_norm": 0.4137374758720398, "learning_rate": 0.00046643893349895745, "loss": 2.0435, "step": 5400 }, { "epoch": 0.5794963649773469, "grad_norm": 0.42562806606292725, "learning_rate": 0.00046532984339647755, "loss": 2.0462, "step": 5500 }, { "epoch": 0.5900326625223896, "grad_norm": 0.49963149428367615, "learning_rate": 0.0004642207532939976, "loss": 2.0461, "step": 5600 }, { "epoch": 0.6005689600674323, "grad_norm": 0.4317498803138733, "learning_rate": 0.0004631116631915177, "loss": 2.043, "step": 5700 }, { "epoch": 0.611105257612475, "grad_norm": 0.48041588068008423, "learning_rate": 0.00046200257308903775, "loss": 2.0407, "step": 5800 }, { "epoch": 0.6216415551575176, "grad_norm": 0.4867211878299713, "learning_rate": 0.0004608934829865578, "loss": 2.0399, "step": 5900 }, { "epoch": 0.6321778527025603, "grad_norm": 0.4728844165802002, "learning_rate": 0.0004597843928840779, "loss": 2.0481, "step": 6000 }, { "epoch": 0.642714150247603, "grad_norm": 0.42306768894195557, "learning_rate": 0.000458675302781598, "loss": 2.036, "step": 6100 }, { "epoch": 0.6532504477926456, "grad_norm": 0.4628433883190155, "learning_rate": 0.0004575662126791181, "loss": 2.0316, "step": 6200 }, { "epoch": 0.6637867453376883, "grad_norm": 0.5507206916809082, "learning_rate": 0.00045645712257663814, "loss": 2.037, "step": 6300 }, { "epoch": 0.674323042882731, "grad_norm": 0.5245053172111511, "learning_rate": 0.00045534803247415824, "loss": 2.0331, "step": 6400 }, { "epoch": 0.6848593404277736, "grad_norm": 0.4395572543144226, "learning_rate": 0.0004542389423716783, "loss": 2.0319, "step": 6500 }, { "epoch": 0.6953956379728163, "grad_norm": 0.4458150565624237, "learning_rate": 0.00045312985226919833, "loss": 2.0262, "step": 6600 }, { "epoch": 0.705931935517859, "grad_norm": 0.4589666724205017, "learning_rate": 0.00045202076216671844, "loss": 2.0281, "step": 6700 }, { "epoch": 0.7164682330629017, "grad_norm": 0.49729079008102417, "learning_rate": 0.0004509116720642385, "loss": 2.0254, "step": 6800 }, { "epoch": 0.7270045306079443, "grad_norm": 0.41776230931282043, "learning_rate": 0.0004498025819617586, "loss": 2.0274, "step": 6900 }, { "epoch": 0.737540828152987, "grad_norm": 0.5071027278900146, "learning_rate": 0.0004486934918592787, "loss": 2.0226, "step": 7000 }, { "epoch": 0.7480771256980298, "grad_norm": 0.47906801104545593, "learning_rate": 0.00044758440175679873, "loss": 2.0225, "step": 7100 }, { "epoch": 0.7586134232430723, "grad_norm": 0.501970648765564, "learning_rate": 0.00044647531165431883, "loss": 2.0212, "step": 7200 }, { "epoch": 0.769149720788115, "grad_norm": 0.4116053879261017, "learning_rate": 0.0004453662215518389, "loss": 2.0259, "step": 7300 }, { "epoch": 0.7796860183331578, "grad_norm": 0.4501636028289795, "learning_rate": 0.000444257131449359, "loss": 2.0205, "step": 7400 }, { "epoch": 0.7902223158782004, "grad_norm": 0.5196821093559265, "learning_rate": 0.000443148041346879, "loss": 2.0216, "step": 7500 }, { "epoch": 0.8007586134232431, "grad_norm": 0.4288316071033478, "learning_rate": 0.00044203895124439907, "loss": 2.0178, "step": 7600 }, { "epoch": 0.8112949109682858, "grad_norm": 0.41823074221611023, "learning_rate": 0.00044092986114191917, "loss": 2.0131, "step": 7700 }, { "epoch": 0.8218312085133285, "grad_norm": 0.4197762608528137, "learning_rate": 0.0004398207710394392, "loss": 2.0147, "step": 7800 }, { "epoch": 0.8323675060583711, "grad_norm": 0.4367753267288208, "learning_rate": 0.0004387116809369593, "loss": 2.024, "step": 7900 }, { "epoch": 0.8429038036034138, "grad_norm": 0.43104997277259827, "learning_rate": 0.0004376025908344794, "loss": 2.0123, "step": 8000 }, { "epoch": 0.8534401011484565, "grad_norm": 0.4320082664489746, "learning_rate": 0.0004364935007319995, "loss": 2.0126, "step": 8100 }, { "epoch": 0.8639763986934991, "grad_norm": 0.4402988851070404, "learning_rate": 0.00043538441062951957, "loss": 2.0127, "step": 8200 }, { "epoch": 0.8745126962385418, "grad_norm": 0.4331250786781311, "learning_rate": 0.0004342753205270396, "loss": 2.0138, "step": 8300 }, { "epoch": 0.8850489937835845, "grad_norm": 0.43004143238067627, "learning_rate": 0.0004331662304245597, "loss": 2.0034, "step": 8400 }, { "epoch": 0.8955852913286271, "grad_norm": 0.4509132504463196, "learning_rate": 0.00043205714032207976, "loss": 2.0058, "step": 8500 }, { "epoch": 0.9061215888736698, "grad_norm": 0.43472710251808167, "learning_rate": 0.00043094805021959986, "loss": 2.0042, "step": 8600 }, { "epoch": 0.9166578864187125, "grad_norm": 0.5285255908966064, "learning_rate": 0.0004298389601171199, "loss": 2.0018, "step": 8700 }, { "epoch": 0.9271941839637552, "grad_norm": 0.40384572744369507, "learning_rate": 0.00042872987001463996, "loss": 2.0079, "step": 8800 }, { "epoch": 0.9377304815087978, "grad_norm": 0.4634927213191986, "learning_rate": 0.0004276207799121601, "loss": 1.9999, "step": 8900 }, { "epoch": 0.9482667790538405, "grad_norm": 0.4585327208042145, "learning_rate": 0.00042651168980968016, "loss": 2.0076, "step": 9000 }, { "epoch": 0.9588030765988832, "grad_norm": 0.44724905490875244, "learning_rate": 0.00042540259970720026, "loss": 2.0055, "step": 9100 }, { "epoch": 0.9693393741439258, "grad_norm": 0.4930990934371948, "learning_rate": 0.0004242935096047203, "loss": 2.0034, "step": 9200 }, { "epoch": 0.9798756716889685, "grad_norm": 0.4466867744922638, "learning_rate": 0.00042318441950224035, "loss": 2.0046, "step": 9300 }, { "epoch": 0.9904119692340112, "grad_norm": 0.44049832224845886, "learning_rate": 0.00042207532939976045, "loss": 2.0015, "step": 9400 }, { "epoch": 1.000948266779054, "grad_norm": 0.4658033549785614, "learning_rate": 0.0004209662392972805, "loss": 1.9938, "step": 9500 }, { "epoch": 1.0114845643240966, "grad_norm": 0.45460689067840576, "learning_rate": 0.0004198571491948006, "loss": 1.9694, "step": 9600 }, { "epoch": 1.022020861869139, "grad_norm": 0.43489739298820496, "learning_rate": 0.00041874805909232065, "loss": 1.9684, "step": 9700 }, { "epoch": 1.0325571594141818, "grad_norm": 0.4363148510456085, "learning_rate": 0.0004176389689898407, "loss": 1.961, "step": 9800 }, { "epoch": 1.0430934569592245, "grad_norm": 0.41610002517700195, "learning_rate": 0.00041652987888736085, "loss": 1.9694, "step": 9900 }, { "epoch": 1.0536297545042672, "grad_norm": 0.39003250002861023, "learning_rate": 0.0004154207887848809, "loss": 1.9636, "step": 10000 }, { "epoch": 1.06416605204931, "grad_norm": 0.36780601739883423, "learning_rate": 0.000414311698682401, "loss": 1.9686, "step": 10100 }, { "epoch": 1.0747023495943526, "grad_norm": 0.4296736419200897, "learning_rate": 0.00041320260857992104, "loss": 1.9668, "step": 10200 }, { "epoch": 1.0852386471393953, "grad_norm": 0.45763176679611206, "learning_rate": 0.00041209351847744114, "loss": 1.9639, "step": 10300 }, { "epoch": 1.0957749446844378, "grad_norm": 0.41805505752563477, "learning_rate": 0.0004109844283749612, "loss": 1.9633, "step": 10400 }, { "epoch": 1.1063112422294805, "grad_norm": 0.42308661341667175, "learning_rate": 0.00040987533827248124, "loss": 1.9745, "step": 10500 }, { "epoch": 1.1168475397745232, "grad_norm": 0.4240245223045349, "learning_rate": 0.00040876624817000134, "loss": 1.9689, "step": 10600 }, { "epoch": 1.127383837319566, "grad_norm": 0.40365278720855713, "learning_rate": 0.0004076571580675214, "loss": 1.9653, "step": 10700 }, { "epoch": 1.1379201348646086, "grad_norm": 0.4099302887916565, "learning_rate": 0.00040654806796504154, "loss": 1.958, "step": 10800 }, { "epoch": 1.1484564324096513, "grad_norm": 0.4134521186351776, "learning_rate": 0.0004054389778625616, "loss": 1.9686, "step": 10900 }, { "epoch": 1.158992729954694, "grad_norm": 0.40292927622795105, "learning_rate": 0.00040432988776008163, "loss": 1.9627, "step": 11000 }, { "epoch": 1.1695290274997365, "grad_norm": 0.4272337555885315, "learning_rate": 0.00040322079765760173, "loss": 1.9655, "step": 11100 }, { "epoch": 1.1800653250447792, "grad_norm": 0.43145930767059326, "learning_rate": 0.0004021117075551218, "loss": 1.963, "step": 11200 }, { "epoch": 1.190601622589822, "grad_norm": 0.39788371324539185, "learning_rate": 0.0004010026174526419, "loss": 1.9616, "step": 11300 }, { "epoch": 1.2011379201348646, "grad_norm": 0.45902547240257263, "learning_rate": 0.0003998935273501619, "loss": 1.9589, "step": 11400 }, { "epoch": 1.2116742176799074, "grad_norm": 0.4540606141090393, "learning_rate": 0.00039878443724768197, "loss": 1.958, "step": 11500 }, { "epoch": 1.2222105152249498, "grad_norm": 0.4402179419994354, "learning_rate": 0.0003976753471452021, "loss": 1.9555, "step": 11600 }, { "epoch": 1.2327468127699925, "grad_norm": 0.389726459980011, "learning_rate": 0.0003965662570427221, "loss": 1.9512, "step": 11700 }, { "epoch": 1.2432831103150352, "grad_norm": 0.440833181142807, "learning_rate": 0.0003954571669402423, "loss": 1.9561, "step": 11800 }, { "epoch": 1.253819407860078, "grad_norm": 0.3972662091255188, "learning_rate": 0.0003943480768377623, "loss": 1.965, "step": 11900 }, { "epoch": 1.2643557054051207, "grad_norm": 0.41316962242126465, "learning_rate": 0.00039323898673528237, "loss": 1.9522, "step": 12000 }, { "epoch": 1.2748920029501634, "grad_norm": 0.41109901666641235, "learning_rate": 0.00039212989663280247, "loss": 1.9553, "step": 12100 }, { "epoch": 1.285428300495206, "grad_norm": 0.4357900023460388, "learning_rate": 0.0003910208065303225, "loss": 1.9513, "step": 12200 }, { "epoch": 1.2959645980402485, "grad_norm": 0.3943662941455841, "learning_rate": 0.0003899117164278426, "loss": 1.9611, "step": 12300 }, { "epoch": 1.3065008955852913, "grad_norm": 0.39483174681663513, "learning_rate": 0.00038880262632536266, "loss": 1.9478, "step": 12400 }, { "epoch": 1.317037193130334, "grad_norm": 0.43672600388526917, "learning_rate": 0.00038769353622288276, "loss": 1.9485, "step": 12500 }, { "epoch": 1.3275734906753767, "grad_norm": 0.42754313349723816, "learning_rate": 0.0003865844461204028, "loss": 1.9463, "step": 12600 }, { "epoch": 1.3381097882204194, "grad_norm": 0.41211095452308655, "learning_rate": 0.0003854753560179229, "loss": 1.95, "step": 12700 }, { "epoch": 1.348646085765462, "grad_norm": 0.3844158947467804, "learning_rate": 0.000384366265915443, "loss": 1.9426, "step": 12800 }, { "epoch": 1.3591823833105048, "grad_norm": 0.4544881582260132, "learning_rate": 0.00038325717581296306, "loss": 1.951, "step": 12900 }, { "epoch": 1.3697186808555473, "grad_norm": 0.4058513641357422, "learning_rate": 0.00038214808571048316, "loss": 1.9521, "step": 13000 }, { "epoch": 1.38025497840059, "grad_norm": 0.38905027508735657, "learning_rate": 0.0003810389956080032, "loss": 1.9534, "step": 13100 }, { "epoch": 1.3907912759456327, "grad_norm": 0.4224783182144165, "learning_rate": 0.00037992990550552325, "loss": 1.9485, "step": 13200 }, { "epoch": 1.4013275734906754, "grad_norm": 0.3894629180431366, "learning_rate": 0.00037882081540304335, "loss": 1.9459, "step": 13300 }, { "epoch": 1.411863871035718, "grad_norm": 0.4435978829860687, "learning_rate": 0.0003777117253005634, "loss": 1.9428, "step": 13400 }, { "epoch": 1.4224001685807608, "grad_norm": 0.4090045690536499, "learning_rate": 0.0003766026351980835, "loss": 1.951, "step": 13500 }, { "epoch": 1.4329364661258035, "grad_norm": 0.4192126989364624, "learning_rate": 0.00037549354509560355, "loss": 1.9498, "step": 13600 }, { "epoch": 1.443472763670846, "grad_norm": 0.399774968624115, "learning_rate": 0.00037438445499312365, "loss": 1.9463, "step": 13700 }, { "epoch": 1.4540090612158887, "grad_norm": 0.3659054636955261, "learning_rate": 0.00037327536489064375, "loss": 1.9549, "step": 13800 }, { "epoch": 1.4645453587609314, "grad_norm": 0.385452538728714, "learning_rate": 0.0003721662747881638, "loss": 1.9472, "step": 13900 }, { "epoch": 1.475081656305974, "grad_norm": 0.3904755413532257, "learning_rate": 0.0003710571846856839, "loss": 1.9438, "step": 14000 }, { "epoch": 1.4856179538510168, "grad_norm": 0.3969903290271759, "learning_rate": 0.00036994809458320394, "loss": 1.941, "step": 14100 }, { "epoch": 1.4961542513960593, "grad_norm": 0.4201650321483612, "learning_rate": 0.000368839004480724, "loss": 1.9451, "step": 14200 }, { "epoch": 1.5066905489411022, "grad_norm": 0.3867323100566864, "learning_rate": 0.0003677299143782441, "loss": 1.9463, "step": 14300 }, { "epoch": 1.5172268464861447, "grad_norm": 0.40658488869667053, "learning_rate": 0.00036662082427576414, "loss": 1.9461, "step": 14400 }, { "epoch": 1.5277631440311874, "grad_norm": 0.39837929606437683, "learning_rate": 0.00036551173417328424, "loss": 1.9517, "step": 14500 }, { "epoch": 1.53829944157623, "grad_norm": 0.42312178015708923, "learning_rate": 0.00036440264407080434, "loss": 1.9351, "step": 14600 }, { "epoch": 1.5488357391212728, "grad_norm": 0.4057867228984833, "learning_rate": 0.00036329355396832444, "loss": 1.9403, "step": 14700 }, { "epoch": 1.5593720366663155, "grad_norm": 0.39428508281707764, "learning_rate": 0.0003621844638658445, "loss": 1.9484, "step": 14800 }, { "epoch": 1.569908334211358, "grad_norm": 0.381671279668808, "learning_rate": 0.00036107537376336453, "loss": 1.9399, "step": 14900 }, { "epoch": 1.580444631756401, "grad_norm": 0.4080953598022461, "learning_rate": 0.00035996628366088463, "loss": 1.9316, "step": 15000 }, { "epoch": 1.5909809293014434, "grad_norm": 0.3612942397594452, "learning_rate": 0.0003588571935584047, "loss": 1.9337, "step": 15100 }, { "epoch": 1.6015172268464861, "grad_norm": 0.37906691431999207, "learning_rate": 0.0003577481034559248, "loss": 1.9338, "step": 15200 }, { "epoch": 1.6120535243915288, "grad_norm": 0.4057066738605499, "learning_rate": 0.0003566390133534448, "loss": 1.9399, "step": 15300 }, { "epoch": 1.6225898219365715, "grad_norm": 0.396557480096817, "learning_rate": 0.0003555299232509649, "loss": 1.9472, "step": 15400 }, { "epoch": 1.6331261194816142, "grad_norm": 0.37647131085395813, "learning_rate": 0.000354420833148485, "loss": 1.9368, "step": 15500 }, { "epoch": 1.6436624170266567, "grad_norm": 0.3920493721961975, "learning_rate": 0.0003533117430460051, "loss": 1.9407, "step": 15600 }, { "epoch": 1.6541987145716996, "grad_norm": 0.39372900128364563, "learning_rate": 0.0003522026529435252, "loss": 1.9327, "step": 15700 }, { "epoch": 1.6647350121167421, "grad_norm": 0.3832472264766693, "learning_rate": 0.0003510935628410452, "loss": 1.9365, "step": 15800 }, { "epoch": 1.6752713096617848, "grad_norm": 0.3669210970401764, "learning_rate": 0.00034998447273856527, "loss": 1.9323, "step": 15900 }, { "epoch": 1.6858076072068275, "grad_norm": 0.37810054421424866, "learning_rate": 0.00034887538263608537, "loss": 1.93, "step": 16000 }, { "epoch": 1.6963439047518702, "grad_norm": 0.3972882330417633, "learning_rate": 0.0003477662925336054, "loss": 1.9299, "step": 16100 }, { "epoch": 1.706880202296913, "grad_norm": 0.39600399136543274, "learning_rate": 0.0003466572024311255, "loss": 1.9337, "step": 16200 }, { "epoch": 1.7174164998419554, "grad_norm": 0.367546021938324, "learning_rate": 0.00034554811232864556, "loss": 1.934, "step": 16300 }, { "epoch": 1.7279527973869984, "grad_norm": 0.43116411566734314, "learning_rate": 0.00034443902222616566, "loss": 1.9296, "step": 16400 }, { "epoch": 1.7384890949320408, "grad_norm": 0.41438373923301697, "learning_rate": 0.00034332993212368577, "loss": 1.9304, "step": 16500 }, { "epoch": 1.7490253924770836, "grad_norm": 0.387265145778656, "learning_rate": 0.0003422208420212058, "loss": 1.9273, "step": 16600 }, { "epoch": 1.7595616900221263, "grad_norm": 0.3982371687889099, "learning_rate": 0.0003411117519187259, "loss": 1.9338, "step": 16700 }, { "epoch": 1.7700979875671687, "grad_norm": 0.3915503919124603, "learning_rate": 0.00034000266181624596, "loss": 1.9305, "step": 16800 }, { "epoch": 1.7806342851122117, "grad_norm": 0.38060539960861206, "learning_rate": 0.00033889357171376606, "loss": 1.927, "step": 16900 }, { "epoch": 1.7911705826572542, "grad_norm": 0.4222376048564911, "learning_rate": 0.0003377844816112861, "loss": 1.9311, "step": 17000 }, { "epoch": 1.801706880202297, "grad_norm": 0.3746761381626129, "learning_rate": 0.00033667539150880615, "loss": 1.9269, "step": 17100 }, { "epoch": 1.8122431777473396, "grad_norm": 0.3764290511608124, "learning_rate": 0.00033556630140632625, "loss": 1.9239, "step": 17200 }, { "epoch": 1.8227794752923823, "grad_norm": 0.3536926209926605, "learning_rate": 0.0003344572113038463, "loss": 1.9312, "step": 17300 }, { "epoch": 1.833315772837425, "grad_norm": 0.3796480596065521, "learning_rate": 0.0003333481212013664, "loss": 1.9229, "step": 17400 }, { "epoch": 1.8438520703824675, "grad_norm": 0.3728596866130829, "learning_rate": 0.0003322390310988865, "loss": 1.9248, "step": 17500 }, { "epoch": 1.8543883679275104, "grad_norm": 0.3622676432132721, "learning_rate": 0.00033112994099640655, "loss": 1.9274, "step": 17600 }, { "epoch": 1.8649246654725529, "grad_norm": 0.3914555013179779, "learning_rate": 0.00033002085089392665, "loss": 1.917, "step": 17700 }, { "epoch": 1.8754609630175956, "grad_norm": 0.3367026448249817, "learning_rate": 0.0003289117607914467, "loss": 1.9213, "step": 17800 }, { "epoch": 1.8859972605626383, "grad_norm": 0.41049453616142273, "learning_rate": 0.0003278026706889668, "loss": 1.921, "step": 17900 }, { "epoch": 1.896533558107681, "grad_norm": 0.38005101680755615, "learning_rate": 0.00032669358058648684, "loss": 1.9188, "step": 18000 }, { "epoch": 1.9070698556527237, "grad_norm": 0.3855360150337219, "learning_rate": 0.0003255844904840069, "loss": 1.9224, "step": 18100 }, { "epoch": 1.9176061531977662, "grad_norm": 0.3764369487762451, "learning_rate": 0.000324475400381527, "loss": 1.9221, "step": 18200 }, { "epoch": 1.928142450742809, "grad_norm": 0.3933279514312744, "learning_rate": 0.00032336631027904704, "loss": 1.9233, "step": 18300 }, { "epoch": 1.9386787482878516, "grad_norm": 0.3530935049057007, "learning_rate": 0.0003222572201765672, "loss": 1.9218, "step": 18400 }, { "epoch": 1.9492150458328943, "grad_norm": 0.36857885122299194, "learning_rate": 0.00032114813007408724, "loss": 1.9211, "step": 18500 }, { "epoch": 1.959751343377937, "grad_norm": 0.3870936930179596, "learning_rate": 0.00032003903997160734, "loss": 1.919, "step": 18600 }, { "epoch": 1.9702876409229797, "grad_norm": 0.38852736353874207, "learning_rate": 0.0003189299498691274, "loss": 1.9137, "step": 18700 }, { "epoch": 1.9808239384680224, "grad_norm": 0.3802979290485382, "learning_rate": 0.00031782085976664743, "loss": 1.9238, "step": 18800 }, { "epoch": 1.9913602360130649, "grad_norm": 0.39477866888046265, "learning_rate": 0.00031671176966416753, "loss": 1.9226, "step": 18900 }, { "epoch": 2.001896533558108, "grad_norm": 0.39578545093536377, "learning_rate": 0.0003156026795616876, "loss": 1.9067, "step": 19000 }, { "epoch": 2.0124328311031503, "grad_norm": 0.3758637607097626, "learning_rate": 0.0003144935894592077, "loss": 1.8889, "step": 19100 }, { "epoch": 2.0229691286481932, "grad_norm": 0.3424636125564575, "learning_rate": 0.00031338449935672773, "loss": 1.881, "step": 19200 }, { "epoch": 2.0335054261932357, "grad_norm": 0.3473268151283264, "learning_rate": 0.0003122754092542478, "loss": 1.8824, "step": 19300 }, { "epoch": 2.044041723738278, "grad_norm": 0.34891676902770996, "learning_rate": 0.00031116631915176793, "loss": 1.8876, "step": 19400 }, { "epoch": 2.054578021283321, "grad_norm": 0.40848681330680847, "learning_rate": 0.000310057229049288, "loss": 1.8804, "step": 19500 }, { "epoch": 2.0651143188283636, "grad_norm": 0.3565325140953064, "learning_rate": 0.0003089481389468081, "loss": 1.8846, "step": 19600 }, { "epoch": 2.0756506163734065, "grad_norm": 0.3714432418346405, "learning_rate": 0.0003078390488443281, "loss": 1.8952, "step": 19700 }, { "epoch": 2.086186913918449, "grad_norm": 0.39024487137794495, "learning_rate": 0.00030672995874184817, "loss": 1.8886, "step": 19800 }, { "epoch": 2.096723211463492, "grad_norm": 0.37265217304229736, "learning_rate": 0.00030562086863936827, "loss": 1.8815, "step": 19900 }, { "epoch": 2.1072595090085344, "grad_norm": 0.4258386194705963, "learning_rate": 0.0003045117785368883, "loss": 1.8797, "step": 20000 }, { "epoch": 2.117795806553577, "grad_norm": 0.3775697350502014, "learning_rate": 0.0003034026884344084, "loss": 1.8863, "step": 20100 }, { "epoch": 2.12833210409862, "grad_norm": 0.3451697826385498, "learning_rate": 0.00030229359833192846, "loss": 1.8812, "step": 20200 }, { "epoch": 2.1388684016436623, "grad_norm": 0.3747578561306, "learning_rate": 0.00030118450822944857, "loss": 1.8884, "step": 20300 }, { "epoch": 2.1494046991887052, "grad_norm": 0.35056072473526, "learning_rate": 0.00030007541812696867, "loss": 1.8721, "step": 20400 }, { "epoch": 2.1599409967337477, "grad_norm": 0.3892049491405487, "learning_rate": 0.0002989663280244887, "loss": 1.8869, "step": 20500 }, { "epoch": 2.1704772942787907, "grad_norm": 0.4040903151035309, "learning_rate": 0.0002978572379220088, "loss": 1.8773, "step": 20600 }, { "epoch": 2.181013591823833, "grad_norm": 0.4122794568538666, "learning_rate": 0.00029674814781952886, "loss": 1.8858, "step": 20700 }, { "epoch": 2.1915498893688756, "grad_norm": 0.38314470648765564, "learning_rate": 0.00029563905771704896, "loss": 1.8887, "step": 20800 }, { "epoch": 2.2020861869139186, "grad_norm": 0.3841986358165741, "learning_rate": 0.000294529967614569, "loss": 1.8886, "step": 20900 }, { "epoch": 2.212622484458961, "grad_norm": 0.3989698588848114, "learning_rate": 0.00029342087751208905, "loss": 1.8876, "step": 21000 }, { "epoch": 2.223158782004004, "grad_norm": 0.3878525495529175, "learning_rate": 0.00029231178740960915, "loss": 1.8831, "step": 21100 }, { "epoch": 2.2336950795490464, "grad_norm": 0.36871328949928284, "learning_rate": 0.0002912026973071292, "loss": 1.8869, "step": 21200 }, { "epoch": 2.244231377094089, "grad_norm": 0.3922217786312103, "learning_rate": 0.00029009360720464936, "loss": 1.8867, "step": 21300 }, { "epoch": 2.254767674639132, "grad_norm": 0.37641048431396484, "learning_rate": 0.0002889845171021694, "loss": 1.8813, "step": 21400 }, { "epoch": 2.2653039721841743, "grad_norm": 0.3834270238876343, "learning_rate": 0.00028787542699968945, "loss": 1.8858, "step": 21500 }, { "epoch": 2.2758402697292173, "grad_norm": 0.3613283336162567, "learning_rate": 0.00028676633689720955, "loss": 1.8788, "step": 21600 }, { "epoch": 2.2863765672742598, "grad_norm": 0.3932812511920929, "learning_rate": 0.0002856572467947296, "loss": 1.8841, "step": 21700 }, { "epoch": 2.2969128648193027, "grad_norm": 0.380537748336792, "learning_rate": 0.0002845481566922497, "loss": 1.8867, "step": 21800 }, { "epoch": 2.307449162364345, "grad_norm": 0.35902804136276245, "learning_rate": 0.00028343906658976974, "loss": 1.8925, "step": 21900 }, { "epoch": 2.317985459909388, "grad_norm": 0.3631201386451721, "learning_rate": 0.0002823299764872898, "loss": 1.8779, "step": 22000 }, { "epoch": 2.3285217574544306, "grad_norm": 0.3709360361099243, "learning_rate": 0.0002812208863848099, "loss": 1.877, "step": 22100 }, { "epoch": 2.339058054999473, "grad_norm": 0.35048261284828186, "learning_rate": 0.00028011179628233, "loss": 1.8717, "step": 22200 }, { "epoch": 2.349594352544516, "grad_norm": 0.35067349672317505, "learning_rate": 0.0002790027061798501, "loss": 1.8778, "step": 22300 }, { "epoch": 2.3601306500895585, "grad_norm": 0.3626950681209564, "learning_rate": 0.00027789361607737014, "loss": 1.886, "step": 22400 }, { "epoch": 2.370666947634601, "grad_norm": 0.35151103138923645, "learning_rate": 0.00027678452597489024, "loss": 1.8776, "step": 22500 }, { "epoch": 2.381203245179644, "grad_norm": 0.3527145981788635, "learning_rate": 0.0002756754358724103, "loss": 1.8786, "step": 22600 }, { "epoch": 2.3917395427246864, "grad_norm": 0.3571159541606903, "learning_rate": 0.00027456634576993033, "loss": 1.8704, "step": 22700 }, { "epoch": 2.4022758402697293, "grad_norm": 0.35839220881462097, "learning_rate": 0.00027345725566745043, "loss": 1.8815, "step": 22800 }, { "epoch": 2.4128121378147718, "grad_norm": 0.3516599237918854, "learning_rate": 0.0002723481655649705, "loss": 1.8745, "step": 22900 }, { "epoch": 2.4233484353598147, "grad_norm": 0.37703123688697815, "learning_rate": 0.0002712390754624906, "loss": 1.8717, "step": 23000 }, { "epoch": 2.433884732904857, "grad_norm": 0.35914528369903564, "learning_rate": 0.00027012998536001063, "loss": 1.8751, "step": 23100 }, { "epoch": 2.4444210304498997, "grad_norm": 0.379916787147522, "learning_rate": 0.00026902089525753073, "loss": 1.8694, "step": 23200 }, { "epoch": 2.4549573279949426, "grad_norm": 0.38764089345932007, "learning_rate": 0.00026791180515505083, "loss": 1.8762, "step": 23300 }, { "epoch": 2.465493625539985, "grad_norm": 0.3425200879573822, "learning_rate": 0.0002668027150525709, "loss": 1.8765, "step": 23400 }, { "epoch": 2.476029923085028, "grad_norm": 0.37601912021636963, "learning_rate": 0.000265693624950091, "loss": 1.8751, "step": 23500 }, { "epoch": 2.4865662206300705, "grad_norm": 0.3854159414768219, "learning_rate": 0.000264584534847611, "loss": 1.8746, "step": 23600 }, { "epoch": 2.4971025181751134, "grad_norm": 0.402798593044281, "learning_rate": 0.00026347544474513107, "loss": 1.8758, "step": 23700 }, { "epoch": 2.507638815720156, "grad_norm": 0.3488067388534546, "learning_rate": 0.00026236635464265117, "loss": 1.8823, "step": 23800 }, { "epoch": 2.5181751132651984, "grad_norm": 0.38071927428245544, "learning_rate": 0.0002612572645401712, "loss": 1.8746, "step": 23900 }, { "epoch": 2.5287114108102413, "grad_norm": 0.3481471538543701, "learning_rate": 0.0002601481744376913, "loss": 1.8787, "step": 24000 }, { "epoch": 2.539247708355284, "grad_norm": 0.34442374110221863, "learning_rate": 0.0002590390843352114, "loss": 1.88, "step": 24100 }, { "epoch": 2.5497840059003267, "grad_norm": 0.34286609292030334, "learning_rate": 0.00025792999423273147, "loss": 1.8711, "step": 24200 }, { "epoch": 2.560320303445369, "grad_norm": 0.3455844819545746, "learning_rate": 0.00025682090413025157, "loss": 1.8692, "step": 24300 }, { "epoch": 2.570856600990412, "grad_norm": 0.3363890051841736, "learning_rate": 0.0002557118140277716, "loss": 1.8723, "step": 24400 }, { "epoch": 2.5813928985354546, "grad_norm": 0.3758355677127838, "learning_rate": 0.0002546027239252917, "loss": 1.8786, "step": 24500 }, { "epoch": 2.591929196080497, "grad_norm": 0.3661966621875763, "learning_rate": 0.00025349363382281176, "loss": 1.8742, "step": 24600 }, { "epoch": 2.60246549362554, "grad_norm": 0.3269520103931427, "learning_rate": 0.00025238454372033186, "loss": 1.8765, "step": 24700 }, { "epoch": 2.6130017911705825, "grad_norm": 0.37588828802108765, "learning_rate": 0.0002512754536178519, "loss": 1.8755, "step": 24800 }, { "epoch": 2.6235380887156254, "grad_norm": 0.34371519088745117, "learning_rate": 0.00025016636351537195, "loss": 1.8689, "step": 24900 }, { "epoch": 2.634074386260668, "grad_norm": 0.3703347444534302, "learning_rate": 0.00024905727341289206, "loss": 1.869, "step": 25000 }, { "epoch": 2.644610683805711, "grad_norm": 0.3689127266407013, "learning_rate": 0.00024794818331041216, "loss": 1.8681, "step": 25100 }, { "epoch": 2.6551469813507533, "grad_norm": 0.3827933371067047, "learning_rate": 0.0002468390932079322, "loss": 1.8693, "step": 25200 }, { "epoch": 2.665683278895796, "grad_norm": 0.3681269586086273, "learning_rate": 0.0002457300031054523, "loss": 1.8668, "step": 25300 }, { "epoch": 2.6762195764408387, "grad_norm": 0.3521827757358551, "learning_rate": 0.00024462091300297235, "loss": 1.872, "step": 25400 }, { "epoch": 2.6867558739858812, "grad_norm": 0.35968610644340515, "learning_rate": 0.00024351182290049245, "loss": 1.868, "step": 25500 }, { "epoch": 2.697292171530924, "grad_norm": 0.34900325536727905, "learning_rate": 0.0002424027327980125, "loss": 1.8639, "step": 25600 }, { "epoch": 2.7078284690759666, "grad_norm": 0.36115318536758423, "learning_rate": 0.00024129364269553257, "loss": 1.8666, "step": 25700 }, { "epoch": 2.7183647666210096, "grad_norm": 0.3598721921443939, "learning_rate": 0.00024018455259305267, "loss": 1.8588, "step": 25800 }, { "epoch": 2.728901064166052, "grad_norm": 0.3527396619319916, "learning_rate": 0.00023907546249057275, "loss": 1.8626, "step": 25900 }, { "epoch": 2.7394373617110945, "grad_norm": 0.3464626967906952, "learning_rate": 0.00023796637238809282, "loss": 1.8724, "step": 26000 }, { "epoch": 2.7499736592561375, "grad_norm": 0.36689963936805725, "learning_rate": 0.0002368572822856129, "loss": 1.8658, "step": 26100 }, { "epoch": 2.76050995680118, "grad_norm": 0.3785768151283264, "learning_rate": 0.00023574819218313297, "loss": 1.8642, "step": 26200 }, { "epoch": 2.771046254346223, "grad_norm": 0.3481883704662323, "learning_rate": 0.00023463910208065304, "loss": 1.8561, "step": 26300 }, { "epoch": 2.7815825518912654, "grad_norm": 0.36630862951278687, "learning_rate": 0.00023353001197817311, "loss": 1.862, "step": 26400 }, { "epoch": 2.7921188494363083, "grad_norm": 0.35414576530456543, "learning_rate": 0.0002324209218756932, "loss": 1.8676, "step": 26500 }, { "epoch": 2.8026551469813508, "grad_norm": 0.3922441601753235, "learning_rate": 0.00023131183177321326, "loss": 1.8709, "step": 26600 }, { "epoch": 2.8131914445263932, "grad_norm": 0.34433358907699585, "learning_rate": 0.00023020274167073334, "loss": 1.8676, "step": 26700 }, { "epoch": 2.823727742071436, "grad_norm": 0.32512736320495605, "learning_rate": 0.0002290936515682534, "loss": 1.8694, "step": 26800 }, { "epoch": 2.8342640396164787, "grad_norm": 0.3611021041870117, "learning_rate": 0.00022798456146577348, "loss": 1.8686, "step": 26900 }, { "epoch": 2.8448003371615216, "grad_norm": 0.34630611538887024, "learning_rate": 0.00022687547136329356, "loss": 1.8628, "step": 27000 }, { "epoch": 2.855336634706564, "grad_norm": 0.34372755885124207, "learning_rate": 0.00022576638126081363, "loss": 1.8613, "step": 27100 }, { "epoch": 2.865872932251607, "grad_norm": 0.3749391436576843, "learning_rate": 0.00022465729115833373, "loss": 1.8725, "step": 27200 }, { "epoch": 2.8764092297966495, "grad_norm": 0.3814404606819153, "learning_rate": 0.00022354820105585378, "loss": 1.8627, "step": 27300 }, { "epoch": 2.886945527341692, "grad_norm": 0.35840287804603577, "learning_rate": 0.00022243911095337385, "loss": 1.8606, "step": 27400 }, { "epoch": 2.897481824886735, "grad_norm": 0.3533620834350586, "learning_rate": 0.00022133002085089392, "loss": 1.8665, "step": 27500 }, { "epoch": 2.9080181224317774, "grad_norm": 0.3550478518009186, "learning_rate": 0.000220220930748414, "loss": 1.8587, "step": 27600 }, { "epoch": 2.9185544199768203, "grad_norm": 0.3665110468864441, "learning_rate": 0.0002191118406459341, "loss": 1.8655, "step": 27700 }, { "epoch": 2.929090717521863, "grad_norm": 0.3647795021533966, "learning_rate": 0.00021800275054345415, "loss": 1.8555, "step": 27800 }, { "epoch": 2.9396270150669057, "grad_norm": 0.34207072854042053, "learning_rate": 0.00021689366044097422, "loss": 1.8601, "step": 27900 }, { "epoch": 2.950163312611948, "grad_norm": 0.3422704339027405, "learning_rate": 0.0002157845703384943, "loss": 1.8553, "step": 28000 }, { "epoch": 2.9606996101569907, "grad_norm": 0.3600524961948395, "learning_rate": 0.0002146754802360144, "loss": 1.8597, "step": 28100 }, { "epoch": 2.9712359077020336, "grad_norm": 0.35774359107017517, "learning_rate": 0.00021356639013353447, "loss": 1.86, "step": 28200 }, { "epoch": 2.981772205247076, "grad_norm": 0.3582908511161804, "learning_rate": 0.00021245730003105454, "loss": 1.8591, "step": 28300 }, { "epoch": 2.9923085027921186, "grad_norm": 0.36876824498176575, "learning_rate": 0.0002113482099285746, "loss": 1.8655, "step": 28400 }, { "epoch": 3.0028448003371615, "grad_norm": 0.3600168526172638, "learning_rate": 0.00021023911982609466, "loss": 1.8473, "step": 28500 }, { "epoch": 3.013381097882204, "grad_norm": 0.33718979358673096, "learning_rate": 0.00020913002972361476, "loss": 1.8256, "step": 28600 }, { "epoch": 3.023917395427247, "grad_norm": 0.3321118950843811, "learning_rate": 0.00020802093962113484, "loss": 1.8251, "step": 28700 }, { "epoch": 3.0344536929722894, "grad_norm": 0.34264570474624634, "learning_rate": 0.0002069118495186549, "loss": 1.831, "step": 28800 }, { "epoch": 3.0449899905173323, "grad_norm": 0.3522898852825165, "learning_rate": 0.00020580275941617496, "loss": 1.8249, "step": 28900 }, { "epoch": 3.055526288062375, "grad_norm": 0.38659289479255676, "learning_rate": 0.00020469366931369503, "loss": 1.829, "step": 29000 }, { "epoch": 3.0660625856074177, "grad_norm": 0.3475963771343231, "learning_rate": 0.00020358457921121513, "loss": 1.8287, "step": 29100 }, { "epoch": 3.07659888315246, "grad_norm": 0.37323230504989624, "learning_rate": 0.0002024754891087352, "loss": 1.827, "step": 29200 }, { "epoch": 3.0871351806975027, "grad_norm": 0.3953257203102112, "learning_rate": 0.00020136639900625528, "loss": 1.8303, "step": 29300 }, { "epoch": 3.0976714782425456, "grad_norm": 0.34784358739852905, "learning_rate": 0.00020025730890377535, "loss": 1.8225, "step": 29400 }, { "epoch": 3.108207775787588, "grad_norm": 0.3565751314163208, "learning_rate": 0.0001991482188012954, "loss": 1.8292, "step": 29500 }, { "epoch": 3.118744073332631, "grad_norm": 0.368730753660202, "learning_rate": 0.0001980391286988155, "loss": 1.8357, "step": 29600 }, { "epoch": 3.1292803708776735, "grad_norm": 0.37354937195777893, "learning_rate": 0.00019693003859633557, "loss": 1.8276, "step": 29700 }, { "epoch": 3.1398166684227165, "grad_norm": 0.3472649157047272, "learning_rate": 0.00019582094849385565, "loss": 1.8335, "step": 29800 }, { "epoch": 3.150352965967759, "grad_norm": 0.35036763548851013, "learning_rate": 0.00019471185839137572, "loss": 1.8276, "step": 29900 }, { "epoch": 3.1608892635128014, "grad_norm": 0.3752099573612213, "learning_rate": 0.0001936027682888958, "loss": 1.8308, "step": 30000 }, { "epoch": 3.1714255610578443, "grad_norm": 0.337298184633255, "learning_rate": 0.00019249367818641587, "loss": 1.8268, "step": 30100 }, { "epoch": 3.181961858602887, "grad_norm": 0.3451649844646454, "learning_rate": 0.00019138458808393594, "loss": 1.825, "step": 30200 }, { "epoch": 3.1924981561479298, "grad_norm": 0.36679157614707947, "learning_rate": 0.00019027549798145602, "loss": 1.8389, "step": 30300 }, { "epoch": 3.2030344536929722, "grad_norm": 0.34255459904670715, "learning_rate": 0.0001891664078789761, "loss": 1.8321, "step": 30400 }, { "epoch": 3.213570751238015, "grad_norm": 0.36408087611198425, "learning_rate": 0.0001880573177764962, "loss": 1.8324, "step": 30500 }, { "epoch": 3.2241070487830576, "grad_norm": 0.32933005690574646, "learning_rate": 0.00018694822767401624, "loss": 1.8256, "step": 30600 }, { "epoch": 3.2346433463281, "grad_norm": 0.37449416518211365, "learning_rate": 0.0001858391375715363, "loss": 1.8332, "step": 30700 }, { "epoch": 3.245179643873143, "grad_norm": 0.32968634366989136, "learning_rate": 0.00018473004746905638, "loss": 1.8247, "step": 30800 }, { "epoch": 3.2557159414181855, "grad_norm": 0.3492085635662079, "learning_rate": 0.00018362095736657646, "loss": 1.8339, "step": 30900 }, { "epoch": 3.2662522389632285, "grad_norm": 0.37141090631484985, "learning_rate": 0.00018251186726409656, "loss": 1.8332, "step": 31000 }, { "epoch": 3.276788536508271, "grad_norm": 0.3904590308666229, "learning_rate": 0.0001814027771616166, "loss": 1.827, "step": 31100 }, { "epoch": 3.2873248340533134, "grad_norm": 0.3764263987541199, "learning_rate": 0.00018029368705913668, "loss": 1.827, "step": 31200 }, { "epoch": 3.2978611315983564, "grad_norm": 0.36718282103538513, "learning_rate": 0.00017918459695665675, "loss": 1.828, "step": 31300 }, { "epoch": 3.308397429143399, "grad_norm": 0.33118733763694763, "learning_rate": 0.00017807550685417683, "loss": 1.8304, "step": 31400 }, { "epoch": 3.3189337266884418, "grad_norm": 0.3702305853366852, "learning_rate": 0.00017696641675169693, "loss": 1.8313, "step": 31500 }, { "epoch": 3.3294700242334843, "grad_norm": 0.3547195792198181, "learning_rate": 0.000175857326649217, "loss": 1.8306, "step": 31600 }, { "epoch": 3.340006321778527, "grad_norm": 0.3350249230861664, "learning_rate": 0.00017474823654673705, "loss": 1.8327, "step": 31700 }, { "epoch": 3.3505426193235697, "grad_norm": 0.34737563133239746, "learning_rate": 0.00017363914644425712, "loss": 1.8256, "step": 31800 }, { "epoch": 3.361078916868612, "grad_norm": 0.3753857910633087, "learning_rate": 0.00017253005634177722, "loss": 1.8304, "step": 31900 }, { "epoch": 3.371615214413655, "grad_norm": 0.34666532278060913, "learning_rate": 0.0001714209662392973, "loss": 1.835, "step": 32000 }, { "epoch": 3.3821515119586976, "grad_norm": 0.3317427933216095, "learning_rate": 0.00017031187613681737, "loss": 1.8231, "step": 32100 }, { "epoch": 3.3926878095037405, "grad_norm": 0.33654922246932983, "learning_rate": 0.00016920278603433742, "loss": 1.8272, "step": 32200 }, { "epoch": 3.403224107048783, "grad_norm": 0.35222548246383667, "learning_rate": 0.0001680936959318575, "loss": 1.8254, "step": 32300 }, { "epoch": 3.413760404593826, "grad_norm": 0.3511573374271393, "learning_rate": 0.0001669846058293776, "loss": 1.8297, "step": 32400 }, { "epoch": 3.4242967021388684, "grad_norm": 0.35278716683387756, "learning_rate": 0.00016587551572689766, "loss": 1.8269, "step": 32500 }, { "epoch": 3.434832999683911, "grad_norm": 0.3196614682674408, "learning_rate": 0.00016476642562441774, "loss": 1.8183, "step": 32600 }, { "epoch": 3.445369297228954, "grad_norm": 0.3310936987400055, "learning_rate": 0.0001636573355219378, "loss": 1.8234, "step": 32700 }, { "epoch": 3.4559055947739963, "grad_norm": 0.35424286127090454, "learning_rate": 0.00016254824541945786, "loss": 1.8306, "step": 32800 }, { "epoch": 3.466441892319039, "grad_norm": 0.3745037913322449, "learning_rate": 0.00016143915531697796, "loss": 1.8313, "step": 32900 }, { "epoch": 3.4769781898640817, "grad_norm": 0.3382411599159241, "learning_rate": 0.00016033006521449803, "loss": 1.8225, "step": 33000 }, { "epoch": 3.4875144874091246, "grad_norm": 0.33086690306663513, "learning_rate": 0.0001592209751120181, "loss": 1.8208, "step": 33100 }, { "epoch": 3.498050784954167, "grad_norm": 0.3586762249469757, "learning_rate": 0.00015811188500953818, "loss": 1.8255, "step": 33200 }, { "epoch": 3.5085870824992096, "grad_norm": 0.3511541187763214, "learning_rate": 0.00015700279490705825, "loss": 1.8259, "step": 33300 }, { "epoch": 3.5191233800442525, "grad_norm": 0.3497931659221649, "learning_rate": 0.00015589370480457833, "loss": 1.8226, "step": 33400 }, { "epoch": 3.529659677589295, "grad_norm": 0.35156911611557007, "learning_rate": 0.0001547846147020984, "loss": 1.8231, "step": 33500 }, { "epoch": 3.540195975134338, "grad_norm": 0.34975793957710266, "learning_rate": 0.00015367552459961847, "loss": 1.824, "step": 33600 }, { "epoch": 3.5507322726793804, "grad_norm": 0.3560537099838257, "learning_rate": 0.00015256643449713855, "loss": 1.8284, "step": 33700 }, { "epoch": 3.5612685702244233, "grad_norm": 0.37322962284088135, "learning_rate": 0.00015145734439465865, "loss": 1.8229, "step": 33800 }, { "epoch": 3.571804867769466, "grad_norm": 0.3404606878757477, "learning_rate": 0.0001503482542921787, "loss": 1.8295, "step": 33900 }, { "epoch": 3.5823411653145083, "grad_norm": 0.3346281349658966, "learning_rate": 0.00014923916418969877, "loss": 1.8221, "step": 34000 }, { "epoch": 3.5928774628595512, "grad_norm": 0.3319614827632904, "learning_rate": 0.00014813007408721884, "loss": 1.8225, "step": 34100 }, { "epoch": 3.6034137604045937, "grad_norm": 0.3317611515522003, "learning_rate": 0.00014702098398473892, "loss": 1.8175, "step": 34200 }, { "epoch": 3.6139500579496366, "grad_norm": 0.3446439206600189, "learning_rate": 0.00014591189388225902, "loss": 1.8283, "step": 34300 }, { "epoch": 3.624486355494679, "grad_norm": 0.32466185092926025, "learning_rate": 0.0001448028037797791, "loss": 1.8201, "step": 34400 }, { "epoch": 3.635022653039722, "grad_norm": 0.3251676559448242, "learning_rate": 0.00014369371367729914, "loss": 1.8269, "step": 34500 }, { "epoch": 3.6455589505847645, "grad_norm": 0.3591017723083496, "learning_rate": 0.0001425846235748192, "loss": 1.8202, "step": 34600 }, { "epoch": 3.656095248129807, "grad_norm": 0.34030893445014954, "learning_rate": 0.00014147553347233928, "loss": 1.8185, "step": 34700 }, { "epoch": 3.66663154567485, "grad_norm": 0.35147637128829956, "learning_rate": 0.00014036644336985939, "loss": 1.8252, "step": 34800 }, { "epoch": 3.6771678432198924, "grad_norm": 0.3547748327255249, "learning_rate": 0.00013925735326737946, "loss": 1.8142, "step": 34900 }, { "epoch": 3.6877041407649354, "grad_norm": 0.3361000716686249, "learning_rate": 0.0001381482631648995, "loss": 1.8235, "step": 35000 }, { "epoch": 3.698240438309978, "grad_norm": 0.3312234580516815, "learning_rate": 0.00013703917306241958, "loss": 1.8267, "step": 35100 }, { "epoch": 3.7087767358550208, "grad_norm": 0.36078423261642456, "learning_rate": 0.00013593008295993965, "loss": 1.8192, "step": 35200 }, { "epoch": 3.7193130334000633, "grad_norm": 0.32330262660980225, "learning_rate": 0.00013482099285745975, "loss": 1.8228, "step": 35300 }, { "epoch": 3.7298493309451057, "grad_norm": 0.34211012721061707, "learning_rate": 0.00013371190275497983, "loss": 1.8207, "step": 35400 }, { "epoch": 3.7403856284901487, "grad_norm": 0.34478235244750977, "learning_rate": 0.0001326028126524999, "loss": 1.8221, "step": 35500 }, { "epoch": 3.750921926035191, "grad_norm": 0.3438977301120758, "learning_rate": 0.00013149372255001995, "loss": 1.8214, "step": 35600 }, { "epoch": 3.7614582235802336, "grad_norm": 0.3275744616985321, "learning_rate": 0.00013038463244754005, "loss": 1.8153, "step": 35700 }, { "epoch": 3.7719945211252766, "grad_norm": 0.35410231351852417, "learning_rate": 0.00012927554234506012, "loss": 1.8144, "step": 35800 }, { "epoch": 3.7825308186703195, "grad_norm": 0.3045212924480438, "learning_rate": 0.0001281664522425802, "loss": 1.8162, "step": 35900 }, { "epoch": 3.793067116215362, "grad_norm": 0.32530274987220764, "learning_rate": 0.00012705736214010027, "loss": 1.8212, "step": 36000 }, { "epoch": 3.8036034137604045, "grad_norm": 0.35284802317619324, "learning_rate": 0.00012594827203762032, "loss": 1.8217, "step": 36100 }, { "epoch": 3.8141397113054474, "grad_norm": 0.35002532601356506, "learning_rate": 0.00012483918193514042, "loss": 1.8179, "step": 36200 }, { "epoch": 3.82467600885049, "grad_norm": 0.33642175793647766, "learning_rate": 0.0001237300918326605, "loss": 1.8136, "step": 36300 }, { "epoch": 3.8352123063955323, "grad_norm": 0.3203926086425781, "learning_rate": 0.00012262100173018056, "loss": 1.8189, "step": 36400 }, { "epoch": 3.8457486039405753, "grad_norm": 0.3277607560157776, "learning_rate": 0.00012151191162770062, "loss": 1.813, "step": 36500 }, { "epoch": 3.856284901485618, "grad_norm": 0.3415702283382416, "learning_rate": 0.00012040282152522071, "loss": 1.8157, "step": 36600 }, { "epoch": 3.8668211990306607, "grad_norm": 0.33326780796051025, "learning_rate": 0.00011929373142274079, "loss": 1.8144, "step": 36700 }, { "epoch": 3.877357496575703, "grad_norm": 0.3394588530063629, "learning_rate": 0.00011818464132026086, "loss": 1.8069, "step": 36800 }, { "epoch": 3.887893794120746, "grad_norm": 0.38374754786491394, "learning_rate": 0.00011707555121778093, "loss": 1.8076, "step": 36900 }, { "epoch": 3.8984300916657886, "grad_norm": 0.34460264444351196, "learning_rate": 0.00011596646111530102, "loss": 1.8129, "step": 37000 }, { "epoch": 3.908966389210831, "grad_norm": 0.3361436724662781, "learning_rate": 0.00011485737101282108, "loss": 1.8105, "step": 37100 }, { "epoch": 3.919502686755874, "grad_norm": 0.35143253207206726, "learning_rate": 0.00011374828091034115, "loss": 1.8184, "step": 37200 }, { "epoch": 3.930038984300917, "grad_norm": 0.34239351749420166, "learning_rate": 0.00011263919080786124, "loss": 1.8061, "step": 37300 }, { "epoch": 3.9405752818459594, "grad_norm": 0.3523593246936798, "learning_rate": 0.0001115301007053813, "loss": 1.8092, "step": 37400 }, { "epoch": 3.951111579391002, "grad_norm": 0.36350205540657043, "learning_rate": 0.00011042101060290139, "loss": 1.8094, "step": 37500 }, { "epoch": 3.961647876936045, "grad_norm": 0.3419075906276703, "learning_rate": 0.00010931192050042146, "loss": 1.8077, "step": 37600 }, { "epoch": 3.9721841744810873, "grad_norm": 0.3350605070590973, "learning_rate": 0.00010820283039794154, "loss": 1.8115, "step": 37700 }, { "epoch": 3.9827204720261298, "grad_norm": 0.33970579504966736, "learning_rate": 0.00010709374029546161, "loss": 1.8171, "step": 37800 }, { "epoch": 3.9932567695711727, "grad_norm": 0.36339592933654785, "learning_rate": 0.00010598465019298167, "loss": 1.8073, "step": 37900 }, { "epoch": 4.003793067116216, "grad_norm": 0.33541393280029297, "learning_rate": 0.00010487556009050176, "loss": 1.7981, "step": 38000 }, { "epoch": 4.014329364661258, "grad_norm": 0.36207860708236694, "learning_rate": 0.00010376646998802183, "loss": 1.7909, "step": 38100 }, { "epoch": 4.024865662206301, "grad_norm": 0.34258803725242615, "learning_rate": 0.0001026573798855419, "loss": 1.7793, "step": 38200 }, { "epoch": 4.0354019597513435, "grad_norm": 0.34286418557167053, "learning_rate": 0.00010154828978306198, "loss": 1.7894, "step": 38300 }, { "epoch": 4.0459382572963865, "grad_norm": 0.3334041237831116, "learning_rate": 0.00010043919968058205, "loss": 1.7841, "step": 38400 }, { "epoch": 4.0564745548414285, "grad_norm": 0.3277220129966736, "learning_rate": 9.933010957810213e-05, "loss": 1.7851, "step": 38500 }, { "epoch": 4.067010852386471, "grad_norm": 0.3734584450721741, "learning_rate": 9.82210194756222e-05, "loss": 1.7889, "step": 38600 }, { "epoch": 4.077547149931514, "grad_norm": 0.3457617461681366, "learning_rate": 9.711192937314229e-05, "loss": 1.792, "step": 38700 }, { "epoch": 4.088083447476556, "grad_norm": 0.35224205255508423, "learning_rate": 9.600283927066235e-05, "loss": 1.7906, "step": 38800 }, { "epoch": 4.098619745021599, "grad_norm": 0.3286111652851105, "learning_rate": 9.489374916818243e-05, "loss": 1.7812, "step": 38900 }, { "epoch": 4.109156042566642, "grad_norm": 0.32292017340660095, "learning_rate": 9.37846590657025e-05, "loss": 1.7875, "step": 39000 }, { "epoch": 4.119692340111685, "grad_norm": 0.33784738183021545, "learning_rate": 9.267556896322257e-05, "loss": 1.785, "step": 39100 }, { "epoch": 4.130228637656727, "grad_norm": 0.33517780900001526, "learning_rate": 9.156647886074265e-05, "loss": 1.7926, "step": 39200 }, { "epoch": 4.14076493520177, "grad_norm": 0.340833842754364, "learning_rate": 9.045738875826271e-05, "loss": 1.7875, "step": 39300 }, { "epoch": 4.151301232746813, "grad_norm": 0.3653368353843689, "learning_rate": 8.93482986557828e-05, "loss": 1.7843, "step": 39400 }, { "epoch": 4.161837530291855, "grad_norm": 0.3394693434238434, "learning_rate": 8.823920855330288e-05, "loss": 1.7804, "step": 39500 }, { "epoch": 4.172373827836898, "grad_norm": 0.3323003947734833, "learning_rate": 8.713011845082295e-05, "loss": 1.7848, "step": 39600 }, { "epoch": 4.182910125381941, "grad_norm": 0.35341712832450867, "learning_rate": 8.602102834834302e-05, "loss": 1.7833, "step": 39700 }, { "epoch": 4.193446422926984, "grad_norm": 0.3553250730037689, "learning_rate": 8.49119382458631e-05, "loss": 1.7844, "step": 39800 }, { "epoch": 4.203982720472026, "grad_norm": 0.3491000831127167, "learning_rate": 8.380284814338317e-05, "loss": 1.783, "step": 39900 }, { "epoch": 4.214519018017069, "grad_norm": 0.36473289132118225, "learning_rate": 8.269375804090324e-05, "loss": 1.7873, "step": 40000 }, { "epoch": 4.225055315562112, "grad_norm": 0.3357420563697815, "learning_rate": 8.158466793842332e-05, "loss": 1.7909, "step": 40100 }, { "epoch": 4.235591613107154, "grad_norm": 0.33982038497924805, "learning_rate": 8.047557783594339e-05, "loss": 1.7877, "step": 40200 }, { "epoch": 4.246127910652197, "grad_norm": 0.33362457156181335, "learning_rate": 7.936648773346347e-05, "loss": 1.7878, "step": 40300 }, { "epoch": 4.25666420819724, "grad_norm": 0.33826008439064026, "learning_rate": 7.825739763098354e-05, "loss": 1.7949, "step": 40400 }, { "epoch": 4.267200505742283, "grad_norm": 0.3940160572528839, "learning_rate": 7.714830752850361e-05, "loss": 1.7886, "step": 40500 }, { "epoch": 4.277736803287325, "grad_norm": 0.33485040068626404, "learning_rate": 7.60392174260237e-05, "loss": 1.7837, "step": 40600 }, { "epoch": 4.288273100832368, "grad_norm": 0.3465060591697693, "learning_rate": 7.493012732354376e-05, "loss": 1.7794, "step": 40700 }, { "epoch": 4.2988093983774105, "grad_norm": 0.3455548584461212, "learning_rate": 7.382103722106385e-05, "loss": 1.7877, "step": 40800 }, { "epoch": 4.3093456959224525, "grad_norm": 0.33163055777549744, "learning_rate": 7.271194711858392e-05, "loss": 1.7861, "step": 40900 }, { "epoch": 4.3198819934674955, "grad_norm": 0.34442830085754395, "learning_rate": 7.160285701610398e-05, "loss": 1.7861, "step": 41000 }, { "epoch": 4.330418291012538, "grad_norm": 0.3633157014846802, "learning_rate": 7.049376691362407e-05, "loss": 1.7842, "step": 41100 }, { "epoch": 4.340954588557581, "grad_norm": 0.3231643736362457, "learning_rate": 6.938467681114414e-05, "loss": 1.7833, "step": 41200 }, { "epoch": 4.351490886102623, "grad_norm": 0.36037677526474, "learning_rate": 6.827558670866422e-05, "loss": 1.7836, "step": 41300 }, { "epoch": 4.362027183647666, "grad_norm": 0.3292723000049591, "learning_rate": 6.716649660618429e-05, "loss": 1.7806, "step": 41400 }, { "epoch": 4.372563481192709, "grad_norm": 0.37054258584976196, "learning_rate": 6.605740650370436e-05, "loss": 1.79, "step": 41500 }, { "epoch": 4.383099778737751, "grad_norm": 0.3358231782913208, "learning_rate": 6.494831640122444e-05, "loss": 1.7882, "step": 41600 }, { "epoch": 4.393636076282794, "grad_norm": 0.3368220031261444, "learning_rate": 6.383922629874451e-05, "loss": 1.7812, "step": 41700 }, { "epoch": 4.404172373827837, "grad_norm": 0.34333834052085876, "learning_rate": 6.273013619626458e-05, "loss": 1.7837, "step": 41800 }, { "epoch": 4.41470867137288, "grad_norm": 0.3434154987335205, "learning_rate": 6.162104609378466e-05, "loss": 1.7858, "step": 41900 }, { "epoch": 4.425244968917922, "grad_norm": 0.35153815150260925, "learning_rate": 6.051195599130473e-05, "loss": 1.7759, "step": 42000 }, { "epoch": 4.435781266462965, "grad_norm": 0.3414738178253174, "learning_rate": 5.940286588882481e-05, "loss": 1.7827, "step": 42100 }, { "epoch": 4.446317564008008, "grad_norm": 0.3285759687423706, "learning_rate": 5.8293775786344886e-05, "loss": 1.7826, "step": 42200 }, { "epoch": 4.45685386155305, "grad_norm": 0.35258546471595764, "learning_rate": 5.718468568386496e-05, "loss": 1.7883, "step": 42300 }, { "epoch": 4.467390159098093, "grad_norm": 0.33706724643707275, "learning_rate": 5.607559558138503e-05, "loss": 1.7786, "step": 42400 }, { "epoch": 4.477926456643136, "grad_norm": 0.3357242941856384, "learning_rate": 5.496650547890511e-05, "loss": 1.7904, "step": 42500 }, { "epoch": 4.488462754188178, "grad_norm": 0.3552809953689575, "learning_rate": 5.385741537642518e-05, "loss": 1.7858, "step": 42600 }, { "epoch": 4.498999051733221, "grad_norm": 0.3606029450893402, "learning_rate": 5.2748325273945254e-05, "loss": 1.7767, "step": 42700 }, { "epoch": 4.509535349278264, "grad_norm": 0.3668212592601776, "learning_rate": 5.163923517146533e-05, "loss": 1.7841, "step": 42800 }, { "epoch": 4.520071646823307, "grad_norm": 0.34113767743110657, "learning_rate": 5.053014506898541e-05, "loss": 1.7777, "step": 42900 }, { "epoch": 4.530607944368349, "grad_norm": 0.33344870805740356, "learning_rate": 4.942105496650548e-05, "loss": 1.7789, "step": 43000 }, { "epoch": 4.541144241913392, "grad_norm": 0.34441855549812317, "learning_rate": 4.8311964864025556e-05, "loss": 1.786, "step": 43100 }, { "epoch": 4.5516805394584345, "grad_norm": 0.3361603617668152, "learning_rate": 4.720287476154563e-05, "loss": 1.7835, "step": 43200 }, { "epoch": 4.5622168370034775, "grad_norm": 0.3377070426940918, "learning_rate": 4.60937846590657e-05, "loss": 1.7842, "step": 43300 }, { "epoch": 4.5727531345485195, "grad_norm": 0.3532165288925171, "learning_rate": 4.4984694556585777e-05, "loss": 1.7848, "step": 43400 }, { "epoch": 4.583289432093562, "grad_norm": 0.35418322682380676, "learning_rate": 4.387560445410585e-05, "loss": 1.7854, "step": 43500 }, { "epoch": 4.593825729638605, "grad_norm": 0.33272701501846313, "learning_rate": 4.276651435162593e-05, "loss": 1.7754, "step": 43600 }, { "epoch": 4.604362027183647, "grad_norm": 0.36113685369491577, "learning_rate": 4.1657424249146004e-05, "loss": 1.7752, "step": 43700 }, { "epoch": 4.61489832472869, "grad_norm": 0.34041377902030945, "learning_rate": 4.054833414666607e-05, "loss": 1.774, "step": 43800 }, { "epoch": 4.625434622273733, "grad_norm": 0.3422810435295105, "learning_rate": 3.943924404418615e-05, "loss": 1.7832, "step": 43900 }, { "epoch": 4.635970919818776, "grad_norm": 0.3397616744041443, "learning_rate": 3.8330153941706225e-05, "loss": 1.78, "step": 44000 }, { "epoch": 4.646507217363818, "grad_norm": 0.3389655649662018, "learning_rate": 3.72210638392263e-05, "loss": 1.7771, "step": 44100 }, { "epoch": 4.657043514908861, "grad_norm": 0.3590547442436218, "learning_rate": 3.611197373674637e-05, "loss": 1.7838, "step": 44200 }, { "epoch": 4.667579812453904, "grad_norm": 0.33880913257598877, "learning_rate": 3.500288363426645e-05, "loss": 1.7708, "step": 44300 }, { "epoch": 4.678116109998946, "grad_norm": 0.3376372456550598, "learning_rate": 3.389379353178653e-05, "loss": 1.7767, "step": 44400 }, { "epoch": 4.688652407543989, "grad_norm": 0.3335518538951874, "learning_rate": 3.2784703429306594e-05, "loss": 1.7784, "step": 44500 }, { "epoch": 4.699188705089032, "grad_norm": 0.37929996848106384, "learning_rate": 3.167561332682667e-05, "loss": 1.7714, "step": 44600 }, { "epoch": 4.709725002634074, "grad_norm": 0.3256159722805023, "learning_rate": 3.056652322434675e-05, "loss": 1.7824, "step": 44700 }, { "epoch": 4.720261300179117, "grad_norm": 0.34018459916114807, "learning_rate": 2.9457433121866822e-05, "loss": 1.7821, "step": 44800 }, { "epoch": 4.73079759772416, "grad_norm": 0.3662751317024231, "learning_rate": 2.8348343019386895e-05, "loss": 1.7799, "step": 44900 }, { "epoch": 4.741333895269202, "grad_norm": 0.32580700516700745, "learning_rate": 2.723925291690697e-05, "loss": 1.7801, "step": 45000 }, { "epoch": 4.751870192814245, "grad_norm": 0.3326426148414612, "learning_rate": 2.6130162814427046e-05, "loss": 1.7824, "step": 45100 }, { "epoch": 4.762406490359288, "grad_norm": 0.3480491042137146, "learning_rate": 2.502107271194712e-05, "loss": 1.7738, "step": 45200 }, { "epoch": 4.772942787904331, "grad_norm": 0.3338908553123474, "learning_rate": 2.3911982609467194e-05, "loss": 1.7809, "step": 45300 }, { "epoch": 4.783479085449373, "grad_norm": 0.35016825795173645, "learning_rate": 2.2802892506987267e-05, "loss": 1.7798, "step": 45400 }, { "epoch": 4.794015382994416, "grad_norm": 0.35119980573654175, "learning_rate": 2.1693802404507344e-05, "loss": 1.7772, "step": 45500 }, { "epoch": 4.804551680539459, "grad_norm": 0.34869563579559326, "learning_rate": 2.0584712302027415e-05, "loss": 1.7834, "step": 45600 }, { "epoch": 4.815087978084501, "grad_norm": 0.3165900409221649, "learning_rate": 1.9475622199547492e-05, "loss": 1.7766, "step": 45700 }, { "epoch": 4.8256242756295435, "grad_norm": 0.33901646733283997, "learning_rate": 1.836653209706757e-05, "loss": 1.7781, "step": 45800 }, { "epoch": 4.8361605731745865, "grad_norm": 0.34397250413894653, "learning_rate": 1.725744199458764e-05, "loss": 1.7773, "step": 45900 }, { "epoch": 4.846696870719629, "grad_norm": 0.3640625476837158, "learning_rate": 1.6148351892107716e-05, "loss": 1.7775, "step": 46000 }, { "epoch": 4.857233168264671, "grad_norm": 0.3395892381668091, "learning_rate": 1.503926178962779e-05, "loss": 1.7817, "step": 46100 }, { "epoch": 4.867769465809714, "grad_norm": 0.3353815972805023, "learning_rate": 1.3930171687147865e-05, "loss": 1.7759, "step": 46200 }, { "epoch": 4.878305763354757, "grad_norm": 0.34299150109291077, "learning_rate": 1.2821081584667939e-05, "loss": 1.779, "step": 46300 }, { "epoch": 4.888842060899799, "grad_norm": 0.34803491830825806, "learning_rate": 1.1711991482188014e-05, "loss": 1.7787, "step": 46400 }, { "epoch": 4.899378358444842, "grad_norm": 0.3452516198158264, "learning_rate": 1.0602901379708088e-05, "loss": 1.7822, "step": 46500 }, { "epoch": 4.909914655989885, "grad_norm": 0.32334357500076294, "learning_rate": 9.493811277228162e-06, "loss": 1.7774, "step": 46600 }, { "epoch": 4.920450953534928, "grad_norm": 0.34011390805244446, "learning_rate": 8.384721174748237e-06, "loss": 1.7788, "step": 46700 }, { "epoch": 4.93098725107997, "grad_norm": 0.3399524688720703, "learning_rate": 7.2756310722683116e-06, "loss": 1.778, "step": 46800 }, { "epoch": 4.941523548625013, "grad_norm": 0.33615124225616455, "learning_rate": 6.166540969788386e-06, "loss": 1.7771, "step": 46900 }, { "epoch": 4.952059846170056, "grad_norm": 0.3466767966747284, "learning_rate": 5.05745086730846e-06, "loss": 1.7774, "step": 47000 }, { "epoch": 4.962596143715098, "grad_norm": 0.33684036135673523, "learning_rate": 3.948360764828534e-06, "loss": 1.7735, "step": 47100 }, { "epoch": 4.973132441260141, "grad_norm": 0.3275541663169861, "learning_rate": 2.8392706623486093e-06, "loss": 1.773, "step": 47200 }, { "epoch": 4.983668738805184, "grad_norm": 0.3321060240268707, "learning_rate": 1.7301805598686838e-06, "loss": 1.7764, "step": 47300 }, { "epoch": 4.994205036350227, "grad_norm": 0.3356621265411377, "learning_rate": 6.210904573887583e-07, "loss": 1.7762, "step": 47400 } ], "logging_steps": 100, "max_steps": 47455, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0383510041641344e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }