Safetensors
English
bert
biomedbert-small / trainer_state.json
davidmezzetti's picture
Add model
03666b4
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 726327,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002065185515614868,
"grad_norm": 0.7036675810813904,
"learning_rate": 0.0001996,
"loss": 7.8213,
"step": 500
},
{
"epoch": 0.004130371031229736,
"grad_norm": 1.0079172849655151,
"learning_rate": 0.0003996,
"loss": 6.7143,
"step": 1000
},
{
"epoch": 0.006195556546844603,
"grad_norm": 1.0771256685256958,
"learning_rate": 0.0005996,
"loss": 6.5273,
"step": 1500
},
{
"epoch": 0.008260742062459471,
"grad_norm": 1.1658340692520142,
"learning_rate": 0.0007996,
"loss": 6.3948,
"step": 2000
},
{
"epoch": 0.010325927578074339,
"grad_norm": 1.4879825115203857,
"learning_rate": 0.0009996,
"loss": 5.9994,
"step": 2500
},
{
"epoch": 0.012391113093689206,
"grad_norm": 1.6412945985794067,
"learning_rate": 0.0009993106087504335,
"loss": 4.7896,
"step": 3000
},
{
"epoch": 0.014456298609304074,
"grad_norm": 1.4124268293380737,
"learning_rate": 0.0009986198359552766,
"loss": 3.8145,
"step": 3500
},
{
"epoch": 0.016521484124918943,
"grad_norm": 1.2819844484329224,
"learning_rate": 0.0009979290631601198,
"loss": 3.4161,
"step": 4000
},
{
"epoch": 0.01858666964053381,
"grad_norm": 1.155612826347351,
"learning_rate": 0.0009972382903649629,
"loss": 3.1921,
"step": 4500
},
{
"epoch": 0.020651855156148678,
"grad_norm": 1.1647284030914307,
"learning_rate": 0.0009965475175698062,
"loss": 3.055,
"step": 5000
},
{
"epoch": 0.022717040671763545,
"grad_norm": 1.1216390132904053,
"learning_rate": 0.0009958567447746491,
"loss": 2.9393,
"step": 5500
},
{
"epoch": 0.024782226187378412,
"grad_norm": 1.1327152252197266,
"learning_rate": 0.0009951659719794922,
"loss": 2.8641,
"step": 6000
},
{
"epoch": 0.02684741170299328,
"grad_norm": 1.0822185277938843,
"learning_rate": 0.0009944751991843356,
"loss": 2.7994,
"step": 6500
},
{
"epoch": 0.028912597218608147,
"grad_norm": 1.0448203086853027,
"learning_rate": 0.0009937844263891787,
"loss": 2.7461,
"step": 7000
},
{
"epoch": 0.030977782734223015,
"grad_norm": 1.0597904920578003,
"learning_rate": 0.0009930936535940218,
"loss": 2.7034,
"step": 7500
},
{
"epoch": 0.033042968249837885,
"grad_norm": 1.0737932920455933,
"learning_rate": 0.000992402880798865,
"loss": 2.6603,
"step": 8000
},
{
"epoch": 0.03510815376545275,
"grad_norm": 1.033523440361023,
"learning_rate": 0.000991712108003708,
"loss": 2.6265,
"step": 8500
},
{
"epoch": 0.03717333928106762,
"grad_norm": 0.976208508014679,
"learning_rate": 0.0009910213352085512,
"loss": 2.5961,
"step": 9000
},
{
"epoch": 0.03923852479668249,
"grad_norm": 0.9439292550086975,
"learning_rate": 0.0009903305624133943,
"loss": 2.576,
"step": 9500
},
{
"epoch": 0.041303710312297355,
"grad_norm": 0.9609880447387695,
"learning_rate": 0.0009896397896182376,
"loss": 2.5466,
"step": 10000
},
{
"epoch": 0.04336889582791222,
"grad_norm": 0.9652389883995056,
"learning_rate": 0.0009889490168230807,
"loss": 2.527,
"step": 10500
},
{
"epoch": 0.04543408134352709,
"grad_norm": 1.0029548406600952,
"learning_rate": 0.0009882582440279238,
"loss": 2.5086,
"step": 11000
},
{
"epoch": 0.04749926685914196,
"grad_norm": 0.9536625742912292,
"learning_rate": 0.000987567471232767,
"loss": 2.4902,
"step": 11500
},
{
"epoch": 0.049564452374756825,
"grad_norm": 0.9976128339767456,
"learning_rate": 0.00098687669843761,
"loss": 2.4657,
"step": 12000
},
{
"epoch": 0.05162963789037169,
"grad_norm": 1.0109055042266846,
"learning_rate": 0.0009861859256424532,
"loss": 2.4605,
"step": 12500
},
{
"epoch": 0.05369482340598656,
"grad_norm": 0.9560060501098633,
"learning_rate": 0.0009854951528472963,
"loss": 2.4431,
"step": 13000
},
{
"epoch": 0.05576000892160143,
"grad_norm": 0.9709720015525818,
"learning_rate": 0.0009848043800521396,
"loss": 2.4307,
"step": 13500
},
{
"epoch": 0.057825194437216294,
"grad_norm": 0.9962353706359863,
"learning_rate": 0.0009841136072569828,
"loss": 2.4115,
"step": 14000
},
{
"epoch": 0.05989037995283116,
"grad_norm": 0.9110284447669983,
"learning_rate": 0.0009834228344618259,
"loss": 2.4021,
"step": 14500
},
{
"epoch": 0.06195556546844603,
"grad_norm": 0.9498186707496643,
"learning_rate": 0.000982732061666669,
"loss": 2.3856,
"step": 15000
},
{
"epoch": 0.0640207509840609,
"grad_norm": 0.8862460851669312,
"learning_rate": 0.0009820412888715121,
"loss": 2.3762,
"step": 15500
},
{
"epoch": 0.06608593649967577,
"grad_norm": 0.9397904276847839,
"learning_rate": 0.0009813505160763552,
"loss": 2.3679,
"step": 16000
},
{
"epoch": 0.06815112201529064,
"grad_norm": 0.9054779410362244,
"learning_rate": 0.0009806597432811984,
"loss": 2.3561,
"step": 16500
},
{
"epoch": 0.0702163075309055,
"grad_norm": 0.8556115627288818,
"learning_rate": 0.0009799689704860417,
"loss": 2.353,
"step": 17000
},
{
"epoch": 0.07228149304652037,
"grad_norm": 0.9651133418083191,
"learning_rate": 0.0009792781976908848,
"loss": 2.3416,
"step": 17500
},
{
"epoch": 0.07434667856213524,
"grad_norm": 0.9362500309944153,
"learning_rate": 0.000978587424895728,
"loss": 2.3328,
"step": 18000
},
{
"epoch": 0.07641186407775011,
"grad_norm": 0.9050174951553345,
"learning_rate": 0.000977896652100571,
"loss": 2.3216,
"step": 18500
},
{
"epoch": 0.07847704959336498,
"grad_norm": 0.8851823806762695,
"learning_rate": 0.0009772058793054142,
"loss": 2.3177,
"step": 19000
},
{
"epoch": 0.08054223510897984,
"grad_norm": 0.8814013600349426,
"learning_rate": 0.0009765151065102573,
"loss": 2.3002,
"step": 19500
},
{
"epoch": 0.08260742062459471,
"grad_norm": 0.9048078656196594,
"learning_rate": 0.0009758243337151005,
"loss": 2.305,
"step": 20000
},
{
"epoch": 0.08467260614020958,
"grad_norm": 0.8821763396263123,
"learning_rate": 0.0009751335609199436,
"loss": 2.2925,
"step": 20500
},
{
"epoch": 0.08673779165582444,
"grad_norm": 0.873921811580658,
"learning_rate": 0.0009744427881247867,
"loss": 2.2861,
"step": 21000
},
{
"epoch": 0.08880297717143931,
"grad_norm": 0.8664683699607849,
"learning_rate": 0.0009737520153296299,
"loss": 2.2809,
"step": 21500
},
{
"epoch": 0.09086816268705418,
"grad_norm": 0.9115278720855713,
"learning_rate": 0.0009730612425344731,
"loss": 2.2765,
"step": 22000
},
{
"epoch": 0.09293334820266905,
"grad_norm": 0.875135064125061,
"learning_rate": 0.0009723704697393162,
"loss": 2.2699,
"step": 22500
},
{
"epoch": 0.09499853371828391,
"grad_norm": 0.8888856172561646,
"learning_rate": 0.0009716796969441593,
"loss": 2.2637,
"step": 23000
},
{
"epoch": 0.09706371923389878,
"grad_norm": 0.8921205401420593,
"learning_rate": 0.0009709889241490025,
"loss": 2.2591,
"step": 23500
},
{
"epoch": 0.09912890474951365,
"grad_norm": 0.840370774269104,
"learning_rate": 0.0009702981513538456,
"loss": 2.25,
"step": 24000
},
{
"epoch": 0.10119409026512852,
"grad_norm": 0.8678010702133179,
"learning_rate": 0.0009696073785586888,
"loss": 2.2472,
"step": 24500
},
{
"epoch": 0.10325927578074338,
"grad_norm": 0.8795912265777588,
"learning_rate": 0.0009689166057635319,
"loss": 2.2403,
"step": 25000
},
{
"epoch": 0.10532446129635825,
"grad_norm": 0.909457266330719,
"learning_rate": 0.0009682258329683751,
"loss": 2.2362,
"step": 25500
},
{
"epoch": 0.10738964681197312,
"grad_norm": 0.8739911317825317,
"learning_rate": 0.0009675350601732182,
"loss": 2.2285,
"step": 26000
},
{
"epoch": 0.10945483232758799,
"grad_norm": 0.8885407447814941,
"learning_rate": 0.0009668442873780614,
"loss": 2.2268,
"step": 26500
},
{
"epoch": 0.11152001784320285,
"grad_norm": 0.8348733186721802,
"learning_rate": 0.0009661535145829045,
"loss": 2.2201,
"step": 27000
},
{
"epoch": 0.11358520335881772,
"grad_norm": 0.8733665943145752,
"learning_rate": 0.0009654627417877476,
"loss": 2.219,
"step": 27500
},
{
"epoch": 0.11565038887443259,
"grad_norm": 0.8849397897720337,
"learning_rate": 0.0009647719689925908,
"loss": 2.2115,
"step": 28000
},
{
"epoch": 0.11771557439004746,
"grad_norm": 0.8752795457839966,
"learning_rate": 0.0009640811961974339,
"loss": 2.2051,
"step": 28500
},
{
"epoch": 0.11978075990566232,
"grad_norm": 0.8557031750679016,
"learning_rate": 0.0009633904234022772,
"loss": 2.2029,
"step": 29000
},
{
"epoch": 0.12184594542127719,
"grad_norm": 0.8175500631332397,
"learning_rate": 0.0009626996506071203,
"loss": 2.1967,
"step": 29500
},
{
"epoch": 0.12391113093689206,
"grad_norm": 0.8393064737319946,
"learning_rate": 0.0009620088778119633,
"loss": 2.1949,
"step": 30000
},
{
"epoch": 0.12597631645250693,
"grad_norm": 0.8515117764472961,
"learning_rate": 0.0009613181050168065,
"loss": 2.1909,
"step": 30500
},
{
"epoch": 0.1280415019681218,
"grad_norm": 0.8967178463935852,
"learning_rate": 0.0009606273322216496,
"loss": 2.1858,
"step": 31000
},
{
"epoch": 0.13010668748373666,
"grad_norm": 0.8990112543106079,
"learning_rate": 0.0009599365594264929,
"loss": 2.1814,
"step": 31500
},
{
"epoch": 0.13217187299935154,
"grad_norm": 0.8051102161407471,
"learning_rate": 0.000959245786631336,
"loss": 2.1803,
"step": 32000
},
{
"epoch": 0.1342370585149664,
"grad_norm": 0.8505108952522278,
"learning_rate": 0.0009585550138361792,
"loss": 2.1775,
"step": 32500
},
{
"epoch": 0.13630224403058128,
"grad_norm": 0.8586075305938721,
"learning_rate": 0.0009578642410410222,
"loss": 2.1671,
"step": 33000
},
{
"epoch": 0.13836742954619613,
"grad_norm": 0.830560028553009,
"learning_rate": 0.0009571734682458653,
"loss": 2.1697,
"step": 33500
},
{
"epoch": 0.140432615061811,
"grad_norm": 0.8251802325248718,
"learning_rate": 0.0009564826954507086,
"loss": 2.1646,
"step": 34000
},
{
"epoch": 0.14249780057742586,
"grad_norm": 0.8522030711174011,
"learning_rate": 0.0009557919226555517,
"loss": 2.1609,
"step": 34500
},
{
"epoch": 0.14456298609304075,
"grad_norm": 0.8344951272010803,
"learning_rate": 0.0009551011498603949,
"loss": 2.1542,
"step": 35000
},
{
"epoch": 0.1466281716086556,
"grad_norm": 0.8527629375457764,
"learning_rate": 0.000954410377065238,
"loss": 2.1584,
"step": 35500
},
{
"epoch": 0.14869335712427048,
"grad_norm": 0.8409314155578613,
"learning_rate": 0.0009537196042700811,
"loss": 2.1472,
"step": 36000
},
{
"epoch": 0.15075854263988533,
"grad_norm": 0.8568186163902283,
"learning_rate": 0.0009530288314749243,
"loss": 2.146,
"step": 36500
},
{
"epoch": 0.15282372815550022,
"grad_norm": 0.8142380118370056,
"learning_rate": 0.0009523380586797674,
"loss": 2.1467,
"step": 37000
},
{
"epoch": 0.15488891367111507,
"grad_norm": 0.8309258222579956,
"learning_rate": 0.0009516472858846106,
"loss": 2.142,
"step": 37500
},
{
"epoch": 0.15695409918672995,
"grad_norm": 0.8471255302429199,
"learning_rate": 0.0009509565130894537,
"loss": 2.1425,
"step": 38000
},
{
"epoch": 0.1590192847023448,
"grad_norm": 0.8846974968910217,
"learning_rate": 0.0009502657402942969,
"loss": 2.1377,
"step": 38500
},
{
"epoch": 0.16108447021795969,
"grad_norm": 0.8476696014404297,
"learning_rate": 0.00094957496749914,
"loss": 2.1356,
"step": 39000
},
{
"epoch": 0.16314965573357454,
"grad_norm": 0.8468635678291321,
"learning_rate": 0.0009488841947039832,
"loss": 2.1316,
"step": 39500
},
{
"epoch": 0.16521484124918942,
"grad_norm": 0.8356343507766724,
"learning_rate": 0.0009481934219088263,
"loss": 2.1269,
"step": 40000
},
{
"epoch": 0.16728002676480427,
"grad_norm": 0.7726144790649414,
"learning_rate": 0.0009475026491136694,
"loss": 2.1277,
"step": 40500
},
{
"epoch": 0.16934521228041916,
"grad_norm": 0.8468815684318542,
"learning_rate": 0.0009468118763185126,
"loss": 2.1332,
"step": 41000
},
{
"epoch": 0.171410397796034,
"grad_norm": 0.78179931640625,
"learning_rate": 0.0009461211035233558,
"loss": 2.1195,
"step": 41500
},
{
"epoch": 0.1734755833116489,
"grad_norm": 0.8012422919273376,
"learning_rate": 0.0009454303307281989,
"loss": 2.115,
"step": 42000
},
{
"epoch": 0.17554076882726374,
"grad_norm": 0.8458732962608337,
"learning_rate": 0.000944739557933042,
"loss": 2.1146,
"step": 42500
},
{
"epoch": 0.17760595434287862,
"grad_norm": 0.8353042006492615,
"learning_rate": 0.0009440487851378852,
"loss": 2.1123,
"step": 43000
},
{
"epoch": 0.17967113985849348,
"grad_norm": 0.8672284483909607,
"learning_rate": 0.0009433580123427283,
"loss": 2.1114,
"step": 43500
},
{
"epoch": 0.18173632537410836,
"grad_norm": 0.7824869751930237,
"learning_rate": 0.0009426672395475715,
"loss": 2.1082,
"step": 44000
},
{
"epoch": 0.1838015108897232,
"grad_norm": 0.7976692318916321,
"learning_rate": 0.0009419764667524147,
"loss": 2.1052,
"step": 44500
},
{
"epoch": 0.1858666964053381,
"grad_norm": 0.876670777797699,
"learning_rate": 0.0009412856939572577,
"loss": 2.1033,
"step": 45000
},
{
"epoch": 0.18793188192095295,
"grad_norm": 0.7947434186935425,
"learning_rate": 0.0009405949211621009,
"loss": 2.0971,
"step": 45500
},
{
"epoch": 0.18999706743656783,
"grad_norm": 0.823627233505249,
"learning_rate": 0.000939904148366944,
"loss": 2.0984,
"step": 46000
},
{
"epoch": 0.19206225295218268,
"grad_norm": 0.8043273091316223,
"learning_rate": 0.0009392133755717873,
"loss": 2.0958,
"step": 46500
},
{
"epoch": 0.19412743846779756,
"grad_norm": 0.8782801032066345,
"learning_rate": 0.0009385226027766304,
"loss": 2.0914,
"step": 47000
},
{
"epoch": 0.19619262398341242,
"grad_norm": 0.8043196201324463,
"learning_rate": 0.0009378318299814735,
"loss": 2.0888,
"step": 47500
},
{
"epoch": 0.1982578094990273,
"grad_norm": 0.8064476251602173,
"learning_rate": 0.0009371410571863166,
"loss": 2.0847,
"step": 48000
},
{
"epoch": 0.20032299501464215,
"grad_norm": 0.801071047782898,
"learning_rate": 0.0009364502843911597,
"loss": 2.0844,
"step": 48500
},
{
"epoch": 0.20238818053025703,
"grad_norm": 0.8486244678497314,
"learning_rate": 0.000935759511596003,
"loss": 2.0853,
"step": 49000
},
{
"epoch": 0.2044533660458719,
"grad_norm": 0.813061535358429,
"learning_rate": 0.0009350687388008461,
"loss": 2.0812,
"step": 49500
},
{
"epoch": 0.20651855156148677,
"grad_norm": 0.8625230193138123,
"learning_rate": 0.0009343779660056893,
"loss": 2.0832,
"step": 50000
},
{
"epoch": 0.20858373707710165,
"grad_norm": 0.8224324584007263,
"learning_rate": 0.0009336871932105324,
"loss": 2.0785,
"step": 50500
},
{
"epoch": 0.2106489225927165,
"grad_norm": 0.8722664713859558,
"learning_rate": 0.0009329964204153754,
"loss": 2.074,
"step": 51000
},
{
"epoch": 0.21271410810833138,
"grad_norm": 0.8052055239677429,
"learning_rate": 0.0009323056476202187,
"loss": 2.074,
"step": 51500
},
{
"epoch": 0.21477929362394624,
"grad_norm": 0.8521301746368408,
"learning_rate": 0.0009316148748250618,
"loss": 2.0681,
"step": 52000
},
{
"epoch": 0.21684447913956112,
"grad_norm": 0.846494197845459,
"learning_rate": 0.000930924102029905,
"loss": 2.073,
"step": 52500
},
{
"epoch": 0.21890966465517597,
"grad_norm": 0.8026652336120605,
"learning_rate": 0.0009302333292347481,
"loss": 2.0685,
"step": 53000
},
{
"epoch": 0.22097485017079085,
"grad_norm": 0.8246744871139526,
"learning_rate": 0.0009295425564395913,
"loss": 2.0653,
"step": 53500
},
{
"epoch": 0.2230400356864057,
"grad_norm": 0.8326907157897949,
"learning_rate": 0.0009288517836444344,
"loss": 2.0643,
"step": 54000
},
{
"epoch": 0.2251052212020206,
"grad_norm": 0.7792090177536011,
"learning_rate": 0.0009281610108492775,
"loss": 2.0622,
"step": 54500
},
{
"epoch": 0.22717040671763544,
"grad_norm": 0.8691778779029846,
"learning_rate": 0.0009274702380541207,
"loss": 2.0624,
"step": 55000
},
{
"epoch": 0.22923559223325032,
"grad_norm": 0.7907185554504395,
"learning_rate": 0.0009267794652589638,
"loss": 2.0571,
"step": 55500
},
{
"epoch": 0.23130077774886518,
"grad_norm": 0.8440839052200317,
"learning_rate": 0.000926088692463807,
"loss": 2.0612,
"step": 56000
},
{
"epoch": 0.23336596326448006,
"grad_norm": 0.8027564883232117,
"learning_rate": 0.0009253979196686502,
"loss": 2.054,
"step": 56500
},
{
"epoch": 0.2354311487800949,
"grad_norm": 0.7806565165519714,
"learning_rate": 0.0009247071468734933,
"loss": 2.053,
"step": 57000
},
{
"epoch": 0.2374963342957098,
"grad_norm": 0.8598223328590393,
"learning_rate": 0.0009240163740783364,
"loss": 2.0518,
"step": 57500
},
{
"epoch": 0.23956151981132465,
"grad_norm": 0.8221333622932434,
"learning_rate": 0.0009233256012831795,
"loss": 2.052,
"step": 58000
},
{
"epoch": 0.24162670532693953,
"grad_norm": 0.8474496603012085,
"learning_rate": 0.0009226348284880227,
"loss": 2.0435,
"step": 58500
},
{
"epoch": 0.24369189084255438,
"grad_norm": 0.8255507349967957,
"learning_rate": 0.0009219440556928659,
"loss": 2.045,
"step": 59000
},
{
"epoch": 0.24575707635816926,
"grad_norm": 0.7817030549049377,
"learning_rate": 0.0009212532828977091,
"loss": 2.0472,
"step": 59500
},
{
"epoch": 0.24782226187378412,
"grad_norm": 0.7616594433784485,
"learning_rate": 0.0009205625101025521,
"loss": 2.0424,
"step": 60000
},
{
"epoch": 0.249887447389399,
"grad_norm": 0.8131653070449829,
"learning_rate": 0.0009198717373073953,
"loss": 2.0438,
"step": 60500
},
{
"epoch": 0.25195263290501385,
"grad_norm": 0.7939597368240356,
"learning_rate": 0.0009191809645122384,
"loss": 2.0392,
"step": 61000
},
{
"epoch": 0.2540178184206287,
"grad_norm": 0.823221743106842,
"learning_rate": 0.0009184901917170816,
"loss": 2.0409,
"step": 61500
},
{
"epoch": 0.2560830039362436,
"grad_norm": 0.8100286722183228,
"learning_rate": 0.0009177994189219248,
"loss": 2.0352,
"step": 62000
},
{
"epoch": 0.25814818945185847,
"grad_norm": 0.84886634349823,
"learning_rate": 0.0009171086461267679,
"loss": 2.0395,
"step": 62500
},
{
"epoch": 0.2602133749674733,
"grad_norm": 0.8171844482421875,
"learning_rate": 0.000916417873331611,
"loss": 2.0374,
"step": 63000
},
{
"epoch": 0.2622785604830882,
"grad_norm": 0.8373914957046509,
"learning_rate": 0.0009157271005364541,
"loss": 2.0302,
"step": 63500
},
{
"epoch": 0.2643437459987031,
"grad_norm": 0.8553788065910339,
"learning_rate": 0.0009150363277412974,
"loss": 2.0346,
"step": 64000
},
{
"epoch": 0.26640893151431794,
"grad_norm": 0.8569718599319458,
"learning_rate": 0.0009143455549461405,
"loss": 2.0347,
"step": 64500
},
{
"epoch": 0.2684741170299328,
"grad_norm": 0.8263908624649048,
"learning_rate": 0.0009136547821509836,
"loss": 2.0306,
"step": 65000
},
{
"epoch": 0.27053930254554764,
"grad_norm": 0.8501819372177124,
"learning_rate": 0.0009129640093558268,
"loss": 2.0271,
"step": 65500
},
{
"epoch": 0.27260448806116255,
"grad_norm": 0.8343943357467651,
"learning_rate": 0.0009122732365606698,
"loss": 2.0296,
"step": 66000
},
{
"epoch": 0.2746696735767774,
"grad_norm": 0.8072646856307983,
"learning_rate": 0.000911582463765513,
"loss": 2.0238,
"step": 66500
},
{
"epoch": 0.27673485909239226,
"grad_norm": 0.8142940998077393,
"learning_rate": 0.0009108916909703562,
"loss": 2.0289,
"step": 67000
},
{
"epoch": 0.2788000446080071,
"grad_norm": 0.7751716375350952,
"learning_rate": 0.0009102009181751994,
"loss": 2.0262,
"step": 67500
},
{
"epoch": 0.280865230123622,
"grad_norm": 0.7758037447929382,
"learning_rate": 0.0009095101453800425,
"loss": 2.0202,
"step": 68000
},
{
"epoch": 0.2829304156392369,
"grad_norm": 0.8752540349960327,
"learning_rate": 0.0009088193725848856,
"loss": 2.0195,
"step": 68500
},
{
"epoch": 0.28499560115485173,
"grad_norm": 0.8347713351249695,
"learning_rate": 0.0009081285997897288,
"loss": 2.0187,
"step": 69000
},
{
"epoch": 0.2870607866704666,
"grad_norm": 0.8156507015228271,
"learning_rate": 0.0009074378269945719,
"loss": 2.0157,
"step": 69500
},
{
"epoch": 0.2891259721860815,
"grad_norm": 0.7821555137634277,
"learning_rate": 0.0009067470541994151,
"loss": 2.0152,
"step": 70000
},
{
"epoch": 0.29119115770169635,
"grad_norm": 0.84757399559021,
"learning_rate": 0.0009060562814042582,
"loss": 2.0157,
"step": 70500
},
{
"epoch": 0.2932563432173112,
"grad_norm": 0.8818306922912598,
"learning_rate": 0.0009053655086091014,
"loss": 2.0121,
"step": 71000
},
{
"epoch": 0.29532152873292605,
"grad_norm": 0.8257991671562195,
"learning_rate": 0.0009046747358139446,
"loss": 2.009,
"step": 71500
},
{
"epoch": 0.29738671424854096,
"grad_norm": 0.821416437625885,
"learning_rate": 0.0009039839630187876,
"loss": 2.0094,
"step": 72000
},
{
"epoch": 0.2994518997641558,
"grad_norm": 0.7886099815368652,
"learning_rate": 0.0009032931902236308,
"loss": 2.0067,
"step": 72500
},
{
"epoch": 0.30151708527977067,
"grad_norm": 0.8650347590446472,
"learning_rate": 0.0009026024174284739,
"loss": 2.0046,
"step": 73000
},
{
"epoch": 0.3035822707953855,
"grad_norm": 0.8249508738517761,
"learning_rate": 0.0009019116446333171,
"loss": 2.0044,
"step": 73500
},
{
"epoch": 0.30564745631100043,
"grad_norm": 0.8648396730422974,
"learning_rate": 0.0009012208718381603,
"loss": 2.0061,
"step": 74000
},
{
"epoch": 0.3077126418266153,
"grad_norm": 0.8078823089599609,
"learning_rate": 0.0009005300990430035,
"loss": 2.001,
"step": 74500
},
{
"epoch": 0.30977782734223014,
"grad_norm": 0.8452419638633728,
"learning_rate": 0.0008998393262478465,
"loss": 1.9992,
"step": 75000
},
{
"epoch": 0.31184301285784505,
"grad_norm": 0.7989551424980164,
"learning_rate": 0.0008991485534526896,
"loss": 2.007,
"step": 75500
},
{
"epoch": 0.3139081983734599,
"grad_norm": 0.8734456300735474,
"learning_rate": 0.0008984577806575328,
"loss": 2.0004,
"step": 76000
},
{
"epoch": 0.31597338388907475,
"grad_norm": 0.8965834975242615,
"learning_rate": 0.000897767007862376,
"loss": 2.0034,
"step": 76500
},
{
"epoch": 0.3180385694046896,
"grad_norm": 0.7855513691902161,
"learning_rate": 0.0008970762350672192,
"loss": 1.9932,
"step": 77000
},
{
"epoch": 0.3201037549203045,
"grad_norm": 0.825775682926178,
"learning_rate": 0.0008963854622720623,
"loss": 1.9979,
"step": 77500
},
{
"epoch": 0.32216894043591937,
"grad_norm": 0.7757362127304077,
"learning_rate": 0.0008956946894769054,
"loss": 1.9974,
"step": 78000
},
{
"epoch": 0.3242341259515342,
"grad_norm": 0.8657450675964355,
"learning_rate": 0.0008950039166817485,
"loss": 1.9976,
"step": 78500
},
{
"epoch": 0.3262993114671491,
"grad_norm": 0.8072881102561951,
"learning_rate": 0.0008943131438865916,
"loss": 1.9934,
"step": 79000
},
{
"epoch": 0.328364496982764,
"grad_norm": 0.7893191576004028,
"learning_rate": 0.0008936223710914349,
"loss": 1.9945,
"step": 79500
},
{
"epoch": 0.33042968249837884,
"grad_norm": 0.8834479451179504,
"learning_rate": 0.000892931598296278,
"loss": 1.9914,
"step": 80000
},
{
"epoch": 0.3324948680139937,
"grad_norm": 0.8713655471801758,
"learning_rate": 0.0008922408255011212,
"loss": 1.9946,
"step": 80500
},
{
"epoch": 0.33456005352960855,
"grad_norm": 0.8255290389060974,
"learning_rate": 0.0008915500527059643,
"loss": 1.9914,
"step": 81000
},
{
"epoch": 0.33662523904522346,
"grad_norm": 0.8153598308563232,
"learning_rate": 0.0008908592799108073,
"loss": 1.9861,
"step": 81500
},
{
"epoch": 0.3386904245608383,
"grad_norm": 0.8533855080604553,
"learning_rate": 0.0008901685071156506,
"loss": 1.9887,
"step": 82000
},
{
"epoch": 0.34075561007645316,
"grad_norm": 0.912350594997406,
"learning_rate": 0.0008894777343204937,
"loss": 1.9923,
"step": 82500
},
{
"epoch": 0.342820795592068,
"grad_norm": 0.8206115365028381,
"learning_rate": 0.0008887869615253369,
"loss": 1.9869,
"step": 83000
},
{
"epoch": 0.3448859811076829,
"grad_norm": 0.8313278555870056,
"learning_rate": 0.00088809618873018,
"loss": 1.9863,
"step": 83500
},
{
"epoch": 0.3469511666232978,
"grad_norm": 0.9152906537055969,
"learning_rate": 0.0008874054159350233,
"loss": 1.9786,
"step": 84000
},
{
"epoch": 0.34901635213891263,
"grad_norm": 0.8398587107658386,
"learning_rate": 0.0008867146431398663,
"loss": 1.986,
"step": 84500
},
{
"epoch": 0.3510815376545275,
"grad_norm": 0.8084604144096375,
"learning_rate": 0.0008860238703447094,
"loss": 1.9837,
"step": 85000
},
{
"epoch": 0.3531467231701424,
"grad_norm": 0.7918562889099121,
"learning_rate": 0.0008853330975495526,
"loss": 1.9789,
"step": 85500
},
{
"epoch": 0.35521190868575725,
"grad_norm": 0.8110492825508118,
"learning_rate": 0.0008846423247543957,
"loss": 1.9754,
"step": 86000
},
{
"epoch": 0.3572770942013721,
"grad_norm": 0.7786925435066223,
"learning_rate": 0.000883951551959239,
"loss": 1.9817,
"step": 86500
},
{
"epoch": 0.35934227971698696,
"grad_norm": 0.8928225636482239,
"learning_rate": 0.0008832607791640821,
"loss": 1.9809,
"step": 87000
},
{
"epoch": 0.36140746523260187,
"grad_norm": 0.821860134601593,
"learning_rate": 0.0008825700063689252,
"loss": 1.9759,
"step": 87500
},
{
"epoch": 0.3634726507482167,
"grad_norm": 0.8514395952224731,
"learning_rate": 0.0008818792335737683,
"loss": 1.9739,
"step": 88000
},
{
"epoch": 0.3655378362638316,
"grad_norm": 0.8256642818450928,
"learning_rate": 0.0008811884607786114,
"loss": 1.9743,
"step": 88500
},
{
"epoch": 0.3676030217794464,
"grad_norm": 0.8043322563171387,
"learning_rate": 0.0008804976879834547,
"loss": 1.975,
"step": 89000
},
{
"epoch": 0.36966820729506134,
"grad_norm": 0.8065923452377319,
"learning_rate": 0.0008798069151882978,
"loss": 1.9712,
"step": 89500
},
{
"epoch": 0.3717333928106762,
"grad_norm": 0.8350073099136353,
"learning_rate": 0.000879116142393141,
"loss": 1.9741,
"step": 90000
},
{
"epoch": 0.37379857832629104,
"grad_norm": 0.8081244230270386,
"learning_rate": 0.000878425369597984,
"loss": 1.977,
"step": 90500
},
{
"epoch": 0.3758637638419059,
"grad_norm": 0.7285000681877136,
"learning_rate": 0.0008777345968028272,
"loss": 1.9688,
"step": 91000
},
{
"epoch": 0.3779289493575208,
"grad_norm": 0.8110142350196838,
"learning_rate": 0.0008770438240076704,
"loss": 1.9675,
"step": 91500
},
{
"epoch": 0.37999413487313566,
"grad_norm": 0.8193402886390686,
"learning_rate": 0.0008763530512125135,
"loss": 1.9682,
"step": 92000
},
{
"epoch": 0.3820593203887505,
"grad_norm": 0.8382455110549927,
"learning_rate": 0.0008756622784173567,
"loss": 1.9652,
"step": 92500
},
{
"epoch": 0.38412450590436537,
"grad_norm": 0.7900645732879639,
"learning_rate": 0.0008749715056221998,
"loss": 1.9654,
"step": 93000
},
{
"epoch": 0.3861896914199803,
"grad_norm": 0.7835169434547424,
"learning_rate": 0.0008742807328270429,
"loss": 1.9676,
"step": 93500
},
{
"epoch": 0.38825487693559513,
"grad_norm": 0.8066137433052063,
"learning_rate": 0.000873589960031886,
"loss": 1.9668,
"step": 94000
},
{
"epoch": 0.39032006245121,
"grad_norm": 0.8150152564048767,
"learning_rate": 0.0008728991872367293,
"loss": 1.9683,
"step": 94500
},
{
"epoch": 0.39238524796682483,
"grad_norm": 1.05111825466156,
"learning_rate": 0.0008722084144415724,
"loss": 1.9613,
"step": 95000
},
{
"epoch": 0.39445043348243974,
"grad_norm": 0.8422666788101196,
"learning_rate": 0.0008715176416464155,
"loss": 1.9625,
"step": 95500
},
{
"epoch": 0.3965156189980546,
"grad_norm": 0.8087729215621948,
"learning_rate": 0.0008708268688512587,
"loss": 1.9657,
"step": 96000
},
{
"epoch": 0.39858080451366945,
"grad_norm": 0.8095026612281799,
"learning_rate": 0.0008701360960561017,
"loss": 1.9636,
"step": 96500
},
{
"epoch": 0.4006459900292843,
"grad_norm": 0.7824914455413818,
"learning_rate": 0.000869445323260945,
"loss": 1.9655,
"step": 97000
},
{
"epoch": 0.4027111755448992,
"grad_norm": 0.8077009320259094,
"learning_rate": 0.0008687545504657881,
"loss": 1.9577,
"step": 97500
},
{
"epoch": 0.40477636106051407,
"grad_norm": 0.7984289526939392,
"learning_rate": 0.0008680637776706313,
"loss": 1.9575,
"step": 98000
},
{
"epoch": 0.4068415465761289,
"grad_norm": 0.8378064036369324,
"learning_rate": 0.0008673730048754744,
"loss": 1.9587,
"step": 98500
},
{
"epoch": 0.4089067320917438,
"grad_norm": 0.7952322959899902,
"learning_rate": 0.0008666822320803175,
"loss": 1.9572,
"step": 99000
},
{
"epoch": 0.4109719176073587,
"grad_norm": 0.9045737385749817,
"learning_rate": 0.0008659914592851607,
"loss": 1.957,
"step": 99500
},
{
"epoch": 0.41303710312297354,
"grad_norm": 0.8450877666473389,
"learning_rate": 0.0008653006864900038,
"loss": 1.9573,
"step": 100000
},
{
"epoch": 0.4151022886385884,
"grad_norm": 0.8580604791641235,
"learning_rate": 0.000864609913694847,
"loss": 1.9561,
"step": 100500
},
{
"epoch": 0.4171674741542033,
"grad_norm": 0.8783984780311584,
"learning_rate": 0.0008639191408996901,
"loss": 1.9551,
"step": 101000
},
{
"epoch": 0.41923265966981815,
"grad_norm": 0.7707995772361755,
"learning_rate": 0.0008632283681045334,
"loss": 1.9551,
"step": 101500
},
{
"epoch": 0.421297845185433,
"grad_norm": 0.7902424931526184,
"learning_rate": 0.0008625375953093765,
"loss": 1.9544,
"step": 102000
},
{
"epoch": 0.42336303070104786,
"grad_norm": 0.850943922996521,
"learning_rate": 0.0008618468225142195,
"loss": 1.9556,
"step": 102500
},
{
"epoch": 0.42542821621666277,
"grad_norm": 0.918465793132782,
"learning_rate": 0.0008611560497190627,
"loss": 1.9504,
"step": 103000
},
{
"epoch": 0.4274934017322776,
"grad_norm": 0.8017387390136719,
"learning_rate": 0.0008604652769239058,
"loss": 1.9467,
"step": 103500
},
{
"epoch": 0.4295585872478925,
"grad_norm": 0.8548043370246887,
"learning_rate": 0.000859774504128749,
"loss": 1.9488,
"step": 104000
},
{
"epoch": 0.43162377276350733,
"grad_norm": 0.8529847264289856,
"learning_rate": 0.0008590837313335922,
"loss": 1.9472,
"step": 104500
},
{
"epoch": 0.43368895827912224,
"grad_norm": 0.9331560730934143,
"learning_rate": 0.0008583929585384354,
"loss": 1.9444,
"step": 105000
},
{
"epoch": 0.4357541437947371,
"grad_norm": 0.7767966985702515,
"learning_rate": 0.0008577021857432784,
"loss": 1.9437,
"step": 105500
},
{
"epoch": 0.43781932931035195,
"grad_norm": 0.8031103610992432,
"learning_rate": 0.0008570114129481215,
"loss": 1.9481,
"step": 106000
},
{
"epoch": 0.4398845148259668,
"grad_norm": 0.8879848122596741,
"learning_rate": 0.0008563206401529647,
"loss": 1.9439,
"step": 106500
},
{
"epoch": 0.4419497003415817,
"grad_norm": 0.8233328461647034,
"learning_rate": 0.0008556298673578079,
"loss": 1.9485,
"step": 107000
},
{
"epoch": 0.44401488585719656,
"grad_norm": 0.8277767300605774,
"learning_rate": 0.0008549390945626511,
"loss": 1.9475,
"step": 107500
},
{
"epoch": 0.4460800713728114,
"grad_norm": 0.8291540741920471,
"learning_rate": 0.0008542483217674942,
"loss": 1.9503,
"step": 108000
},
{
"epoch": 0.44814525688842627,
"grad_norm": 0.8007998466491699,
"learning_rate": 0.0008535575489723373,
"loss": 1.9441,
"step": 108500
},
{
"epoch": 0.4502104424040412,
"grad_norm": 0.7802460193634033,
"learning_rate": 0.0008528667761771804,
"loss": 1.9422,
"step": 109000
},
{
"epoch": 0.45227562791965603,
"grad_norm": 0.7900969982147217,
"learning_rate": 0.0008521760033820236,
"loss": 1.9418,
"step": 109500
},
{
"epoch": 0.4543408134352709,
"grad_norm": 0.958767294883728,
"learning_rate": 0.0008514852305868668,
"loss": 1.9431,
"step": 110000
},
{
"epoch": 0.45640599895088574,
"grad_norm": 0.8186129331588745,
"learning_rate": 0.0008507944577917099,
"loss": 1.9368,
"step": 110500
},
{
"epoch": 0.45847118446650065,
"grad_norm": 0.7958455085754395,
"learning_rate": 0.0008501036849965531,
"loss": 1.9368,
"step": 111000
},
{
"epoch": 0.4605363699821155,
"grad_norm": 1.3525958061218262,
"learning_rate": 0.0008494129122013961,
"loss": 1.936,
"step": 111500
},
{
"epoch": 0.46260155549773035,
"grad_norm": 0.8453717827796936,
"learning_rate": 0.0008487221394062394,
"loss": 1.9438,
"step": 112000
},
{
"epoch": 0.4646667410133452,
"grad_norm": 0.8021391034126282,
"learning_rate": 0.0008480313666110825,
"loss": 1.9428,
"step": 112500
},
{
"epoch": 0.4667319265289601,
"grad_norm": 0.8905833959579468,
"learning_rate": 0.0008473405938159256,
"loss": 1.9416,
"step": 113000
},
{
"epoch": 0.46879711204457497,
"grad_norm": 0.789579451084137,
"learning_rate": 0.0008466498210207688,
"loss": 1.9406,
"step": 113500
},
{
"epoch": 0.4708622975601898,
"grad_norm": 0.8398124575614929,
"learning_rate": 0.000845959048225612,
"loss": 1.935,
"step": 114000
},
{
"epoch": 0.4729274830758047,
"grad_norm": 0.8189172148704529,
"learning_rate": 0.0008452682754304551,
"loss": 1.9367,
"step": 114500
},
{
"epoch": 0.4749926685914196,
"grad_norm": 0.7979219555854797,
"learning_rate": 0.0008445775026352982,
"loss": 1.9309,
"step": 115000
},
{
"epoch": 0.47705785410703444,
"grad_norm": 0.9062512516975403,
"learning_rate": 0.0008438867298401414,
"loss": 1.9389,
"step": 115500
},
{
"epoch": 0.4791230396226493,
"grad_norm": 0.9431639909744263,
"learning_rate": 0.0008431959570449845,
"loss": 1.9307,
"step": 116000
},
{
"epoch": 0.48118822513826415,
"grad_norm": 0.8639684319496155,
"learning_rate": 0.0008425051842498276,
"loss": 1.9325,
"step": 116500
},
{
"epoch": 0.48325341065387906,
"grad_norm": 0.8229732513427734,
"learning_rate": 0.0008418144114546709,
"loss": 1.9323,
"step": 117000
},
{
"epoch": 0.4853185961694939,
"grad_norm": 0.789789080619812,
"learning_rate": 0.0008411236386595139,
"loss": 1.9317,
"step": 117500
},
{
"epoch": 0.48738378168510876,
"grad_norm": 0.8473231196403503,
"learning_rate": 0.0008404328658643571,
"loss": 1.9332,
"step": 118000
},
{
"epoch": 0.4894489672007236,
"grad_norm": 0.9255551099777222,
"learning_rate": 0.0008397420930692002,
"loss": 1.9309,
"step": 118500
},
{
"epoch": 0.4915141527163385,
"grad_norm": 0.7924582958221436,
"learning_rate": 0.0008390513202740435,
"loss": 1.9277,
"step": 119000
},
{
"epoch": 0.4935793382319534,
"grad_norm": 0.8907535672187805,
"learning_rate": 0.0008383605474788866,
"loss": 1.9294,
"step": 119500
},
{
"epoch": 0.49564452374756823,
"grad_norm": 0.8191530108451843,
"learning_rate": 0.0008376697746837297,
"loss": 1.9306,
"step": 120000
},
{
"epoch": 0.4977097092631831,
"grad_norm": 0.8925333023071289,
"learning_rate": 0.0008369790018885728,
"loss": 1.9229,
"step": 120500
},
{
"epoch": 0.499774894778798,
"grad_norm": 0.8087531924247742,
"learning_rate": 0.0008362882290934159,
"loss": 1.9243,
"step": 121000
},
{
"epoch": 0.5018400802944128,
"grad_norm": 0.8658357858657837,
"learning_rate": 0.0008355974562982591,
"loss": 1.9296,
"step": 121500
},
{
"epoch": 0.5039052658100277,
"grad_norm": 0.8883163332939148,
"learning_rate": 0.0008349066835031023,
"loss": 1.9228,
"step": 122000
},
{
"epoch": 0.5059704513256426,
"grad_norm": 0.9020292162895203,
"learning_rate": 0.0008342159107079455,
"loss": 1.9242,
"step": 122500
},
{
"epoch": 0.5080356368412574,
"grad_norm": 0.7825981974601746,
"learning_rate": 0.0008335251379127886,
"loss": 1.9244,
"step": 123000
},
{
"epoch": 0.5101008223568723,
"grad_norm": 0.7903372645378113,
"learning_rate": 0.0008328343651176316,
"loss": 1.9263,
"step": 123500
},
{
"epoch": 0.5121660078724872,
"grad_norm": 0.8415020108222961,
"learning_rate": 0.0008321435923224748,
"loss": 1.9243,
"step": 124000
},
{
"epoch": 0.5142311933881021,
"grad_norm": 0.8838851451873779,
"learning_rate": 0.000831452819527318,
"loss": 1.9229,
"step": 124500
},
{
"epoch": 0.5162963789037169,
"grad_norm": 0.8412485718727112,
"learning_rate": 0.0008307620467321612,
"loss": 1.9198,
"step": 125000
},
{
"epoch": 0.5183615644193318,
"grad_norm": 0.8944464921951294,
"learning_rate": 0.0008300712739370043,
"loss": 1.9242,
"step": 125500
},
{
"epoch": 0.5204267499349466,
"grad_norm": 0.8970022797584534,
"learning_rate": 0.0008293805011418475,
"loss": 1.9157,
"step": 126000
},
{
"epoch": 0.5224919354505615,
"grad_norm": 0.7767829895019531,
"learning_rate": 0.0008286897283466905,
"loss": 1.9146,
"step": 126500
},
{
"epoch": 0.5245571209661763,
"grad_norm": 0.9366709589958191,
"learning_rate": 0.0008279989555515337,
"loss": 1.9219,
"step": 127000
},
{
"epoch": 0.5266223064817913,
"grad_norm": 0.813752293586731,
"learning_rate": 0.0008273081827563769,
"loss": 1.9131,
"step": 127500
},
{
"epoch": 0.5286874919974062,
"grad_norm": 0.7913943529129028,
"learning_rate": 0.00082661740996122,
"loss": 1.9151,
"step": 128000
},
{
"epoch": 0.530752677513021,
"grad_norm": 0.7573590278625488,
"learning_rate": 0.0008259266371660632,
"loss": 1.9141,
"step": 128500
},
{
"epoch": 0.5328178630286359,
"grad_norm": 0.8860184550285339,
"learning_rate": 0.0008252358643709063,
"loss": 1.9157,
"step": 129000
},
{
"epoch": 0.5348830485442507,
"grad_norm": 0.7423400282859802,
"learning_rate": 0.0008245450915757495,
"loss": 1.9123,
"step": 129500
},
{
"epoch": 0.5369482340598656,
"grad_norm": 0.7855700254440308,
"learning_rate": 0.0008238543187805926,
"loss": 1.9156,
"step": 130000
},
{
"epoch": 0.5390134195754804,
"grad_norm": 0.7748924493789673,
"learning_rate": 0.0008231635459854357,
"loss": 1.9125,
"step": 130500
},
{
"epoch": 0.5410786050910953,
"grad_norm": 0.823998212814331,
"learning_rate": 0.0008224727731902789,
"loss": 1.914,
"step": 131000
},
{
"epoch": 0.5431437906067103,
"grad_norm": 0.837291955947876,
"learning_rate": 0.000821782000395122,
"loss": 1.9135,
"step": 131500
},
{
"epoch": 0.5452089761223251,
"grad_norm": 0.8040900230407715,
"learning_rate": 0.0008210912275999653,
"loss": 1.9167,
"step": 132000
},
{
"epoch": 0.54727416163794,
"grad_norm": 0.8205652236938477,
"learning_rate": 0.0008204004548048083,
"loss": 1.9111,
"step": 132500
},
{
"epoch": 0.5493393471535548,
"grad_norm": 0.9085518717765808,
"learning_rate": 0.0008197096820096515,
"loss": 1.9082,
"step": 133000
},
{
"epoch": 0.5514045326691697,
"grad_norm": 0.9547085165977478,
"learning_rate": 0.0008190189092144946,
"loss": 1.9066,
"step": 133500
},
{
"epoch": 0.5534697181847845,
"grad_norm": 0.8351136445999146,
"learning_rate": 0.0008183281364193377,
"loss": 1.9152,
"step": 134000
},
{
"epoch": 0.5555349037003994,
"grad_norm": 0.814534068107605,
"learning_rate": 0.000817637363624181,
"loss": 1.9093,
"step": 134500
},
{
"epoch": 0.5576000892160142,
"grad_norm": 0.8208035826683044,
"learning_rate": 0.0008169465908290241,
"loss": 1.9107,
"step": 135000
},
{
"epoch": 0.5596652747316292,
"grad_norm": 0.8544581532478333,
"learning_rate": 0.0008162558180338672,
"loss": 1.9092,
"step": 135500
},
{
"epoch": 0.561730460247244,
"grad_norm": 0.8623299598693848,
"learning_rate": 0.0008155650452387103,
"loss": 1.9097,
"step": 136000
},
{
"epoch": 0.5637956457628589,
"grad_norm": 0.8688506484031677,
"learning_rate": 0.0008148742724435535,
"loss": 1.9103,
"step": 136500
},
{
"epoch": 0.5658608312784738,
"grad_norm": 0.8412228226661682,
"learning_rate": 0.0008141834996483967,
"loss": 1.9074,
"step": 137000
},
{
"epoch": 0.5679260167940886,
"grad_norm": 0.8734971880912781,
"learning_rate": 0.0008134927268532398,
"loss": 1.906,
"step": 137500
},
{
"epoch": 0.5699912023097035,
"grad_norm": 0.8894969820976257,
"learning_rate": 0.000812801954058083,
"loss": 1.9072,
"step": 138000
},
{
"epoch": 0.5720563878253183,
"grad_norm": 0.7939966320991516,
"learning_rate": 0.0008121111812629261,
"loss": 1.908,
"step": 138500
},
{
"epoch": 0.5741215733409332,
"grad_norm": 0.8480666875839233,
"learning_rate": 0.0008114204084677692,
"loss": 1.9031,
"step": 139000
},
{
"epoch": 0.5761867588565481,
"grad_norm": 0.7555306553840637,
"learning_rate": 0.0008107296356726124,
"loss": 1.902,
"step": 139500
},
{
"epoch": 0.578251944372163,
"grad_norm": 0.8896836638450623,
"learning_rate": 0.0008100388628774556,
"loss": 1.9075,
"step": 140000
},
{
"epoch": 0.5803171298877778,
"grad_norm": 0.9097606539726257,
"learning_rate": 0.0008093480900822987,
"loss": 1.9072,
"step": 140500
},
{
"epoch": 0.5823823154033927,
"grad_norm": 0.9234623312950134,
"learning_rate": 0.0008086573172871418,
"loss": 1.9053,
"step": 141000
},
{
"epoch": 0.5844475009190075,
"grad_norm": 0.9804132580757141,
"learning_rate": 0.000807966544491985,
"loss": 1.8995,
"step": 141500
},
{
"epoch": 0.5865126864346224,
"grad_norm": 0.8714466691017151,
"learning_rate": 0.0008072757716968281,
"loss": 1.9036,
"step": 142000
},
{
"epoch": 0.5885778719502373,
"grad_norm": 0.8345698714256287,
"learning_rate": 0.0008065849989016713,
"loss": 1.8996,
"step": 142500
},
{
"epoch": 0.5906430574658521,
"grad_norm": 0.8244128227233887,
"learning_rate": 0.0008058942261065144,
"loss": 1.9,
"step": 143000
},
{
"epoch": 0.5927082429814671,
"grad_norm": 0.8433374166488647,
"learning_rate": 0.0008052034533113576,
"loss": 1.8983,
"step": 143500
},
{
"epoch": 0.5947734284970819,
"grad_norm": 0.9245389699935913,
"learning_rate": 0.0008045126805162007,
"loss": 1.8985,
"step": 144000
},
{
"epoch": 0.5968386140126968,
"grad_norm": 0.8123714923858643,
"learning_rate": 0.0008038219077210439,
"loss": 1.8955,
"step": 144500
},
{
"epoch": 0.5989037995283116,
"grad_norm": 0.834078848361969,
"learning_rate": 0.000803131134925887,
"loss": 1.8985,
"step": 145000
},
{
"epoch": 0.6009689850439265,
"grad_norm": 0.8230902552604675,
"learning_rate": 0.0008024403621307301,
"loss": 1.8993,
"step": 145500
},
{
"epoch": 0.6030341705595413,
"grad_norm": 0.7516800761222839,
"learning_rate": 0.0008017495893355733,
"loss": 1.8954,
"step": 146000
},
{
"epoch": 0.6050993560751562,
"grad_norm": 0.8156014084815979,
"learning_rate": 0.0008010588165404164,
"loss": 1.8934,
"step": 146500
},
{
"epoch": 0.607164541590771,
"grad_norm": 0.8357443809509277,
"learning_rate": 0.0008003680437452597,
"loss": 1.8985,
"step": 147000
},
{
"epoch": 0.609229727106386,
"grad_norm": 0.8833040595054626,
"learning_rate": 0.0007996772709501028,
"loss": 1.8951,
"step": 147500
},
{
"epoch": 0.6112949126220009,
"grad_norm": 0.9052265286445618,
"learning_rate": 0.0007989864981549458,
"loss": 1.8919,
"step": 148000
},
{
"epoch": 0.6133600981376157,
"grad_norm": 0.7939783334732056,
"learning_rate": 0.000798295725359789,
"loss": 1.8982,
"step": 148500
},
{
"epoch": 0.6154252836532306,
"grad_norm": 0.8598021864891052,
"learning_rate": 0.0007976049525646321,
"loss": 1.8955,
"step": 149000
},
{
"epoch": 0.6174904691688454,
"grad_norm": 0.7993877530097961,
"learning_rate": 0.0007969141797694754,
"loss": 1.894,
"step": 149500
},
{
"epoch": 0.6195556546844603,
"grad_norm": 0.8220402002334595,
"learning_rate": 0.0007962234069743185,
"loss": 1.8963,
"step": 150000
},
{
"epoch": 0.6216208402000751,
"grad_norm": 0.9298192262649536,
"learning_rate": 0.0007955326341791617,
"loss": 1.8902,
"step": 150500
},
{
"epoch": 0.6236860257156901,
"grad_norm": 0.7912063002586365,
"learning_rate": 0.0007948418613840047,
"loss": 1.8918,
"step": 151000
},
{
"epoch": 0.625751211231305,
"grad_norm": 0.907156765460968,
"learning_rate": 0.0007941510885888478,
"loss": 1.8929,
"step": 151500
},
{
"epoch": 0.6278163967469198,
"grad_norm": 0.8619490265846252,
"learning_rate": 0.0007934603157936911,
"loss": 1.8881,
"step": 152000
},
{
"epoch": 0.6298815822625347,
"grad_norm": 0.8170045018196106,
"learning_rate": 0.0007927695429985342,
"loss": 1.8879,
"step": 152500
},
{
"epoch": 0.6319467677781495,
"grad_norm": 0.7822418212890625,
"learning_rate": 0.0007920787702033774,
"loss": 1.8956,
"step": 153000
},
{
"epoch": 0.6340119532937644,
"grad_norm": 0.878753125667572,
"learning_rate": 0.0007913879974082205,
"loss": 1.8914,
"step": 153500
},
{
"epoch": 0.6360771388093792,
"grad_norm": 0.8338424563407898,
"learning_rate": 0.0007906972246130636,
"loss": 1.8911,
"step": 154000
},
{
"epoch": 0.6381423243249941,
"grad_norm": 0.8565462827682495,
"learning_rate": 0.0007900064518179068,
"loss": 1.8881,
"step": 154500
},
{
"epoch": 0.640207509840609,
"grad_norm": 0.82133948802948,
"learning_rate": 0.0007893156790227499,
"loss": 1.8884,
"step": 155000
},
{
"epoch": 0.6422726953562239,
"grad_norm": 0.9342901706695557,
"learning_rate": 0.0007886249062275931,
"loss": 1.8865,
"step": 155500
},
{
"epoch": 0.6443378808718387,
"grad_norm": 0.8597960472106934,
"learning_rate": 0.0007879341334324362,
"loss": 1.8893,
"step": 156000
},
{
"epoch": 0.6464030663874536,
"grad_norm": 0.816633939743042,
"learning_rate": 0.0007872433606372795,
"loss": 1.8831,
"step": 156500
},
{
"epoch": 0.6484682519030684,
"grad_norm": 0.8402358293533325,
"learning_rate": 0.0007865525878421225,
"loss": 1.8844,
"step": 157000
},
{
"epoch": 0.6505334374186833,
"grad_norm": 0.8066496253013611,
"learning_rate": 0.0007858618150469657,
"loss": 1.8879,
"step": 157500
},
{
"epoch": 0.6525986229342982,
"grad_norm": 0.7855266332626343,
"learning_rate": 0.0007851710422518088,
"loss": 1.8883,
"step": 158000
},
{
"epoch": 0.654663808449913,
"grad_norm": 0.8272327184677124,
"learning_rate": 0.0007844802694566519,
"loss": 1.8823,
"step": 158500
},
{
"epoch": 0.656728993965528,
"grad_norm": 0.7959176898002625,
"learning_rate": 0.0007837894966614951,
"loss": 1.8862,
"step": 159000
},
{
"epoch": 0.6587941794811428,
"grad_norm": 0.8315137028694153,
"learning_rate": 0.0007830987238663383,
"loss": 1.8839,
"step": 159500
},
{
"epoch": 0.6608593649967577,
"grad_norm": 0.8382706046104431,
"learning_rate": 0.0007824079510711814,
"loss": 1.8838,
"step": 160000
},
{
"epoch": 0.6629245505123725,
"grad_norm": 0.7986578941345215,
"learning_rate": 0.0007817171782760245,
"loss": 1.8825,
"step": 160500
},
{
"epoch": 0.6649897360279874,
"grad_norm": 0.8452582359313965,
"learning_rate": 0.0007810264054808677,
"loss": 1.884,
"step": 161000
},
{
"epoch": 0.6670549215436022,
"grad_norm": 0.86090487241745,
"learning_rate": 0.0007803356326857108,
"loss": 1.8837,
"step": 161500
},
{
"epoch": 0.6691201070592171,
"grad_norm": 0.8608242273330688,
"learning_rate": 0.000779644859890554,
"loss": 1.8791,
"step": 162000
},
{
"epoch": 0.671185292574832,
"grad_norm": 0.8503440618515015,
"learning_rate": 0.0007789540870953972,
"loss": 1.8771,
"step": 162500
},
{
"epoch": 0.6732504780904469,
"grad_norm": 0.7802348136901855,
"learning_rate": 0.0007782633143002402,
"loss": 1.8848,
"step": 163000
},
{
"epoch": 0.6753156636060618,
"grad_norm": 0.9252862930297852,
"learning_rate": 0.0007775725415050834,
"loss": 1.8791,
"step": 163500
},
{
"epoch": 0.6773808491216766,
"grad_norm": 0.8752533793449402,
"learning_rate": 0.0007768817687099265,
"loss": 1.881,
"step": 164000
},
{
"epoch": 0.6794460346372915,
"grad_norm": 0.9123765826225281,
"learning_rate": 0.0007761909959147698,
"loss": 1.8809,
"step": 164500
},
{
"epoch": 0.6815112201529063,
"grad_norm": 0.8338991403579712,
"learning_rate": 0.0007755002231196129,
"loss": 1.8786,
"step": 165000
},
{
"epoch": 0.6835764056685212,
"grad_norm": 0.8287580609321594,
"learning_rate": 0.000774809450324456,
"loss": 1.8797,
"step": 165500
},
{
"epoch": 0.685641591184136,
"grad_norm": 0.8854800462722778,
"learning_rate": 0.0007741186775292991,
"loss": 1.8799,
"step": 166000
},
{
"epoch": 0.6877067766997509,
"grad_norm": 0.8071344494819641,
"learning_rate": 0.0007734279047341422,
"loss": 1.8738,
"step": 166500
},
{
"epoch": 0.6897719622153659,
"grad_norm": 0.8015414476394653,
"learning_rate": 0.0007727371319389855,
"loss": 1.8764,
"step": 167000
},
{
"epoch": 0.6918371477309807,
"grad_norm": 0.8209612965583801,
"learning_rate": 0.0007720463591438286,
"loss": 1.8802,
"step": 167500
},
{
"epoch": 0.6939023332465956,
"grad_norm": 0.8554903268814087,
"learning_rate": 0.0007713555863486718,
"loss": 1.8742,
"step": 168000
},
{
"epoch": 0.6959675187622104,
"grad_norm": 0.8712402582168579,
"learning_rate": 0.0007706648135535149,
"loss": 1.8761,
"step": 168500
},
{
"epoch": 0.6980327042778253,
"grad_norm": 0.8566715121269226,
"learning_rate": 0.0007699740407583579,
"loss": 1.8742,
"step": 169000
},
{
"epoch": 0.7000978897934401,
"grad_norm": 0.8078393340110779,
"learning_rate": 0.0007692832679632012,
"loss": 1.8723,
"step": 169500
},
{
"epoch": 0.702163075309055,
"grad_norm": 0.8996677994728088,
"learning_rate": 0.0007685924951680443,
"loss": 1.8729,
"step": 170000
},
{
"epoch": 0.7042282608246698,
"grad_norm": 0.8081231117248535,
"learning_rate": 0.0007679017223728875,
"loss": 1.8749,
"step": 170500
},
{
"epoch": 0.7062934463402848,
"grad_norm": 0.8042668104171753,
"learning_rate": 0.0007672109495777306,
"loss": 1.869,
"step": 171000
},
{
"epoch": 0.7083586318558996,
"grad_norm": 0.8018625378608704,
"learning_rate": 0.0007665201767825738,
"loss": 1.87,
"step": 171500
},
{
"epoch": 0.7104238173715145,
"grad_norm": 0.8580893874168396,
"learning_rate": 0.0007658294039874169,
"loss": 1.874,
"step": 172000
},
{
"epoch": 0.7124890028871294,
"grad_norm": 0.8910616636276245,
"learning_rate": 0.00076513863119226,
"loss": 1.8749,
"step": 172500
},
{
"epoch": 0.7145541884027442,
"grad_norm": 0.9036041498184204,
"learning_rate": 0.0007644478583971032,
"loss": 1.874,
"step": 173000
},
{
"epoch": 0.7166193739183591,
"grad_norm": 0.8568321466445923,
"learning_rate": 0.0007637570856019463,
"loss": 1.8683,
"step": 173500
},
{
"epoch": 0.7186845594339739,
"grad_norm": 0.7695671916007996,
"learning_rate": 0.0007630663128067895,
"loss": 1.8677,
"step": 174000
},
{
"epoch": 0.7207497449495888,
"grad_norm": 0.9139420390129089,
"learning_rate": 0.0007623755400116327,
"loss": 1.8694,
"step": 174500
},
{
"epoch": 0.7228149304652037,
"grad_norm": 0.8462100625038147,
"learning_rate": 0.0007616847672164757,
"loss": 1.8714,
"step": 175000
},
{
"epoch": 0.7248801159808186,
"grad_norm": 0.8447960615158081,
"learning_rate": 0.0007609939944213189,
"loss": 1.8669,
"step": 175500
},
{
"epoch": 0.7269453014964334,
"grad_norm": 0.810688316822052,
"learning_rate": 0.000760303221626162,
"loss": 1.8637,
"step": 176000
},
{
"epoch": 0.7290104870120483,
"grad_norm": 0.7696130871772766,
"learning_rate": 0.0007596124488310052,
"loss": 1.8658,
"step": 176500
},
{
"epoch": 0.7310756725276631,
"grad_norm": 0.8709802031517029,
"learning_rate": 0.0007589216760358484,
"loss": 1.8664,
"step": 177000
},
{
"epoch": 0.733140858043278,
"grad_norm": 0.8365340828895569,
"learning_rate": 0.0007582309032406916,
"loss": 1.8656,
"step": 177500
},
{
"epoch": 0.7352060435588929,
"grad_norm": 1.0223950147628784,
"learning_rate": 0.0007575401304455346,
"loss": 1.8694,
"step": 178000
},
{
"epoch": 0.7372712290745078,
"grad_norm": 0.829572319984436,
"learning_rate": 0.0007568493576503777,
"loss": 1.8708,
"step": 178500
},
{
"epoch": 0.7393364145901227,
"grad_norm": 0.80058354139328,
"learning_rate": 0.0007561585848552209,
"loss": 1.8687,
"step": 179000
},
{
"epoch": 0.7414016001057375,
"grad_norm": 0.8351449370384216,
"learning_rate": 0.0007554678120600641,
"loss": 1.8639,
"step": 179500
},
{
"epoch": 0.7434667856213524,
"grad_norm": 0.905135989189148,
"learning_rate": 0.0007547770392649073,
"loss": 1.8645,
"step": 180000
},
{
"epoch": 0.7455319711369672,
"grad_norm": 0.8477722406387329,
"learning_rate": 0.0007540862664697504,
"loss": 1.8615,
"step": 180500
},
{
"epoch": 0.7475971566525821,
"grad_norm": 0.9718809723854065,
"learning_rate": 0.0007533954936745935,
"loss": 1.8575,
"step": 181000
},
{
"epoch": 0.7496623421681969,
"grad_norm": 0.9097675681114197,
"learning_rate": 0.0007527047208794366,
"loss": 1.8632,
"step": 181500
},
{
"epoch": 0.7517275276838118,
"grad_norm": 0.9181948900222778,
"learning_rate": 0.0007520139480842798,
"loss": 1.864,
"step": 182000
},
{
"epoch": 0.7537927131994268,
"grad_norm": 0.8058791160583496,
"learning_rate": 0.000751323175289123,
"loss": 1.8585,
"step": 182500
},
{
"epoch": 0.7558578987150416,
"grad_norm": 0.818452775478363,
"learning_rate": 0.0007506324024939661,
"loss": 1.8621,
"step": 183000
},
{
"epoch": 0.7579230842306565,
"grad_norm": 0.8789253830909729,
"learning_rate": 0.0007499416296988093,
"loss": 1.8567,
"step": 183500
},
{
"epoch": 0.7599882697462713,
"grad_norm": 0.8578682541847229,
"learning_rate": 0.0007492508569036523,
"loss": 1.8615,
"step": 184000
},
{
"epoch": 0.7620534552618862,
"grad_norm": 0.8071935176849365,
"learning_rate": 0.0007485600841084956,
"loss": 1.8589,
"step": 184500
},
{
"epoch": 0.764118640777501,
"grad_norm": 0.8300654888153076,
"learning_rate": 0.0007478693113133387,
"loss": 1.8588,
"step": 185000
},
{
"epoch": 0.7661838262931159,
"grad_norm": 0.8175327181816101,
"learning_rate": 0.0007471785385181818,
"loss": 1.8617,
"step": 185500
},
{
"epoch": 0.7682490118087307,
"grad_norm": 0.8610235452651978,
"learning_rate": 0.000746487765723025,
"loss": 1.8561,
"step": 186000
},
{
"epoch": 0.7703141973243457,
"grad_norm": 0.857377290725708,
"learning_rate": 0.0007457969929278681,
"loss": 1.856,
"step": 186500
},
{
"epoch": 0.7723793828399605,
"grad_norm": 0.8002254366874695,
"learning_rate": 0.0007451062201327113,
"loss": 1.8609,
"step": 187000
},
{
"epoch": 0.7744445683555754,
"grad_norm": 0.9635730385780334,
"learning_rate": 0.0007444154473375544,
"loss": 1.8592,
"step": 187500
},
{
"epoch": 0.7765097538711903,
"grad_norm": 0.8251007795333862,
"learning_rate": 0.0007437246745423976,
"loss": 1.8623,
"step": 188000
},
{
"epoch": 0.7785749393868051,
"grad_norm": 0.8280484676361084,
"learning_rate": 0.0007430339017472407,
"loss": 1.855,
"step": 188500
},
{
"epoch": 0.78064012490242,
"grad_norm": 0.7635123133659363,
"learning_rate": 0.0007423431289520838,
"loss": 1.8559,
"step": 189000
},
{
"epoch": 0.7827053104180348,
"grad_norm": 0.7589561939239502,
"learning_rate": 0.0007416523561569271,
"loss": 1.8576,
"step": 189500
},
{
"epoch": 0.7847704959336497,
"grad_norm": 0.876846969127655,
"learning_rate": 0.0007409615833617701,
"loss": 1.8571,
"step": 190000
},
{
"epoch": 0.7868356814492646,
"grad_norm": 0.9164223074913025,
"learning_rate": 0.0007402708105666133,
"loss": 1.8542,
"step": 190500
},
{
"epoch": 0.7889008669648795,
"grad_norm": 0.9645445346832275,
"learning_rate": 0.0007395800377714564,
"loss": 1.8584,
"step": 191000
},
{
"epoch": 0.7909660524804943,
"grad_norm": 0.8780491948127747,
"learning_rate": 0.0007388892649762996,
"loss": 1.862,
"step": 191500
},
{
"epoch": 0.7930312379961092,
"grad_norm": 0.8747962117195129,
"learning_rate": 0.0007381984921811428,
"loss": 1.8544,
"step": 192000
},
{
"epoch": 0.795096423511724,
"grad_norm": 0.8750070333480835,
"learning_rate": 0.0007375077193859859,
"loss": 1.8562,
"step": 192500
},
{
"epoch": 0.7971616090273389,
"grad_norm": 0.7979694604873657,
"learning_rate": 0.000736816946590829,
"loss": 1.8559,
"step": 193000
},
{
"epoch": 0.7992267945429538,
"grad_norm": 0.8153182864189148,
"learning_rate": 0.0007361261737956721,
"loss": 1.8497,
"step": 193500
},
{
"epoch": 0.8012919800585686,
"grad_norm": 0.9015457034111023,
"learning_rate": 0.0007354354010005153,
"loss": 1.85,
"step": 194000
},
{
"epoch": 0.8033571655741836,
"grad_norm": 0.845658540725708,
"learning_rate": 0.0007347446282053585,
"loss": 1.853,
"step": 194500
},
{
"epoch": 0.8054223510897984,
"grad_norm": 0.839846670627594,
"learning_rate": 0.0007340538554102017,
"loss": 1.8511,
"step": 195000
},
{
"epoch": 0.8074875366054133,
"grad_norm": 0.8285427689552307,
"learning_rate": 0.0007333630826150448,
"loss": 1.8512,
"step": 195500
},
{
"epoch": 0.8095527221210281,
"grad_norm": 0.8489523530006409,
"learning_rate": 0.0007326723098198878,
"loss": 1.8538,
"step": 196000
},
{
"epoch": 0.811617907636643,
"grad_norm": 0.8332532644271851,
"learning_rate": 0.000731981537024731,
"loss": 1.8528,
"step": 196500
},
{
"epoch": 0.8136830931522578,
"grad_norm": 0.8185180425643921,
"learning_rate": 0.0007312907642295742,
"loss": 1.8472,
"step": 197000
},
{
"epoch": 0.8157482786678727,
"grad_norm": 0.8716513514518738,
"learning_rate": 0.0007305999914344174,
"loss": 1.8487,
"step": 197500
},
{
"epoch": 0.8178134641834875,
"grad_norm": 0.8488348126411438,
"learning_rate": 0.0007299092186392605,
"loss": 1.8487,
"step": 198000
},
{
"epoch": 0.8198786496991025,
"grad_norm": 0.7853295207023621,
"learning_rate": 0.0007292184458441037,
"loss": 1.8467,
"step": 198500
},
{
"epoch": 0.8219438352147174,
"grad_norm": 0.8092118501663208,
"learning_rate": 0.0007285276730489468,
"loss": 1.8454,
"step": 199000
},
{
"epoch": 0.8240090207303322,
"grad_norm": 0.8414338231086731,
"learning_rate": 0.0007278369002537899,
"loss": 1.8456,
"step": 199500
},
{
"epoch": 0.8260742062459471,
"grad_norm": 0.7936431765556335,
"learning_rate": 0.0007271461274586331,
"loss": 1.8455,
"step": 200000
},
{
"epoch": 0.8281393917615619,
"grad_norm": 0.8624149560928345,
"learning_rate": 0.0007264553546634762,
"loss": 1.8417,
"step": 200500
},
{
"epoch": 0.8302045772771768,
"grad_norm": 0.7787384986877441,
"learning_rate": 0.0007257645818683194,
"loss": 1.8469,
"step": 201000
},
{
"epoch": 0.8322697627927916,
"grad_norm": 0.7881982922554016,
"learning_rate": 0.0007250738090731625,
"loss": 1.8469,
"step": 201500
},
{
"epoch": 0.8343349483084066,
"grad_norm": 0.8017438650131226,
"learning_rate": 0.0007243830362780058,
"loss": 1.8477,
"step": 202000
},
{
"epoch": 0.8364001338240215,
"grad_norm": 0.8839012980461121,
"learning_rate": 0.0007236922634828488,
"loss": 1.8433,
"step": 202500
},
{
"epoch": 0.8384653193396363,
"grad_norm": 0.8032566905021667,
"learning_rate": 0.0007230014906876919,
"loss": 1.845,
"step": 203000
},
{
"epoch": 0.8405305048552512,
"grad_norm": 0.9038089513778687,
"learning_rate": 0.0007223107178925351,
"loss": 1.8425,
"step": 203500
},
{
"epoch": 0.842595690370866,
"grad_norm": 0.9411084651947021,
"learning_rate": 0.0007216199450973782,
"loss": 1.8414,
"step": 204000
},
{
"epoch": 0.8446608758864809,
"grad_norm": 0.8129530549049377,
"learning_rate": 0.0007209291723022215,
"loss": 1.8413,
"step": 204500
},
{
"epoch": 0.8467260614020957,
"grad_norm": 0.7938794493675232,
"learning_rate": 0.0007202383995070646,
"loss": 1.8419,
"step": 205000
},
{
"epoch": 0.8487912469177106,
"grad_norm": 0.9388673305511475,
"learning_rate": 0.0007195476267119077,
"loss": 1.8433,
"step": 205500
},
{
"epoch": 0.8508564324333255,
"grad_norm": 0.9263845682144165,
"learning_rate": 0.0007188568539167508,
"loss": 1.8407,
"step": 206000
},
{
"epoch": 0.8529216179489404,
"grad_norm": 0.7958715558052063,
"learning_rate": 0.0007181660811215939,
"loss": 1.8464,
"step": 206500
},
{
"epoch": 0.8549868034645552,
"grad_norm": 0.7796356081962585,
"learning_rate": 0.0007174753083264372,
"loss": 1.8444,
"step": 207000
},
{
"epoch": 0.8570519889801701,
"grad_norm": 0.8039528727531433,
"learning_rate": 0.0007167845355312803,
"loss": 1.8474,
"step": 207500
},
{
"epoch": 0.859117174495785,
"grad_norm": 0.8445290327072144,
"learning_rate": 0.0007160937627361235,
"loss": 1.8405,
"step": 208000
},
{
"epoch": 0.8611823600113998,
"grad_norm": 0.8098761439323425,
"learning_rate": 0.0007154029899409665,
"loss": 1.8395,
"step": 208500
},
{
"epoch": 0.8632475455270147,
"grad_norm": 0.8343963027000427,
"learning_rate": 0.0007147122171458097,
"loss": 1.8362,
"step": 209000
},
{
"epoch": 0.8653127310426295,
"grad_norm": 0.8452053666114807,
"learning_rate": 0.0007140214443506529,
"loss": 1.8431,
"step": 209500
},
{
"epoch": 0.8673779165582445,
"grad_norm": 0.8454539179801941,
"learning_rate": 0.000713330671555496,
"loss": 1.8327,
"step": 210000
},
{
"epoch": 0.8694431020738593,
"grad_norm": 0.7599641680717468,
"learning_rate": 0.0007126398987603392,
"loss": 1.8335,
"step": 210500
},
{
"epoch": 0.8715082875894742,
"grad_norm": 0.8617073893547058,
"learning_rate": 0.0007119491259651823,
"loss": 1.8381,
"step": 211000
},
{
"epoch": 0.873573473105089,
"grad_norm": 0.8182563781738281,
"learning_rate": 0.0007112583531700254,
"loss": 1.8359,
"step": 211500
},
{
"epoch": 0.8756386586207039,
"grad_norm": 0.8188121318817139,
"learning_rate": 0.0007105675803748686,
"loss": 1.8362,
"step": 212000
},
{
"epoch": 0.8777038441363187,
"grad_norm": 0.7888435125350952,
"learning_rate": 0.0007098768075797118,
"loss": 1.8389,
"step": 212500
},
{
"epoch": 0.8797690296519336,
"grad_norm": 0.8186080455780029,
"learning_rate": 0.0007091860347845549,
"loss": 1.8356,
"step": 213000
},
{
"epoch": 0.8818342151675485,
"grad_norm": 0.7884934544563293,
"learning_rate": 0.000708495261989398,
"loss": 1.8356,
"step": 213500
},
{
"epoch": 0.8838994006831634,
"grad_norm": 0.8210222125053406,
"learning_rate": 0.0007078044891942412,
"loss": 1.8348,
"step": 214000
},
{
"epoch": 0.8859645861987783,
"grad_norm": 0.868903398513794,
"learning_rate": 0.0007071137163990843,
"loss": 1.8349,
"step": 214500
},
{
"epoch": 0.8880297717143931,
"grad_norm": 0.8679877519607544,
"learning_rate": 0.0007064229436039275,
"loss": 1.834,
"step": 215000
},
{
"epoch": 0.890094957230008,
"grad_norm": 0.8414639234542847,
"learning_rate": 0.0007057321708087706,
"loss": 1.8348,
"step": 215500
},
{
"epoch": 0.8921601427456228,
"grad_norm": 0.8036888241767883,
"learning_rate": 0.0007050413980136138,
"loss": 1.8331,
"step": 216000
},
{
"epoch": 0.8942253282612377,
"grad_norm": 0.833270251750946,
"learning_rate": 0.0007043506252184569,
"loss": 1.8317,
"step": 216500
},
{
"epoch": 0.8962905137768525,
"grad_norm": 0.7350865602493286,
"learning_rate": 0.0007036598524233001,
"loss": 1.8305,
"step": 217000
},
{
"epoch": 0.8983556992924674,
"grad_norm": 0.8501140475273132,
"learning_rate": 0.0007029690796281432,
"loss": 1.8278,
"step": 217500
},
{
"epoch": 0.9004208848080824,
"grad_norm": 0.82123202085495,
"learning_rate": 0.0007022783068329863,
"loss": 1.8285,
"step": 218000
},
{
"epoch": 0.9024860703236972,
"grad_norm": 0.8079880475997925,
"learning_rate": 0.0007015875340378295,
"loss": 1.833,
"step": 218500
},
{
"epoch": 0.9045512558393121,
"grad_norm": 0.7871448397636414,
"learning_rate": 0.0007008967612426726,
"loss": 1.8338,
"step": 219000
},
{
"epoch": 0.9066164413549269,
"grad_norm": 0.8511725664138794,
"learning_rate": 0.0007002059884475159,
"loss": 1.8325,
"step": 219500
},
{
"epoch": 0.9086816268705418,
"grad_norm": 0.9022111296653748,
"learning_rate": 0.000699515215652359,
"loss": 1.8292,
"step": 220000
},
{
"epoch": 0.9107468123861566,
"grad_norm": 0.8371003270149231,
"learning_rate": 0.000698824442857202,
"loss": 1.8287,
"step": 220500
},
{
"epoch": 0.9128119979017715,
"grad_norm": 0.824407160282135,
"learning_rate": 0.0006981336700620452,
"loss": 1.8315,
"step": 221000
},
{
"epoch": 0.9148771834173863,
"grad_norm": 0.847411572933197,
"learning_rate": 0.0006974428972668883,
"loss": 1.8333,
"step": 221500
},
{
"epoch": 0.9169423689330013,
"grad_norm": 0.8592170476913452,
"learning_rate": 0.0006967521244717316,
"loss": 1.8234,
"step": 222000
},
{
"epoch": 0.9190075544486161,
"grad_norm": 0.7863643169403076,
"learning_rate": 0.0006960613516765747,
"loss": 1.8364,
"step": 222500
},
{
"epoch": 0.921072739964231,
"grad_norm": 0.7801703214645386,
"learning_rate": 0.0006953705788814179,
"loss": 1.8243,
"step": 223000
},
{
"epoch": 0.9231379254798459,
"grad_norm": 0.8160432577133179,
"learning_rate": 0.0006946798060862609,
"loss": 1.8236,
"step": 223500
},
{
"epoch": 0.9252031109954607,
"grad_norm": 0.825862467288971,
"learning_rate": 0.000693989033291104,
"loss": 1.827,
"step": 224000
},
{
"epoch": 0.9272682965110756,
"grad_norm": 0.8575713634490967,
"learning_rate": 0.0006932982604959473,
"loss": 1.8225,
"step": 224500
},
{
"epoch": 0.9293334820266904,
"grad_norm": 0.7798600792884827,
"learning_rate": 0.0006926074877007904,
"loss": 1.8276,
"step": 225000
},
{
"epoch": 0.9313986675423054,
"grad_norm": 0.8042396903038025,
"learning_rate": 0.0006919167149056336,
"loss": 1.8274,
"step": 225500
},
{
"epoch": 0.9334638530579202,
"grad_norm": 0.8900014758110046,
"learning_rate": 0.0006912259421104767,
"loss": 1.8242,
"step": 226000
},
{
"epoch": 0.9355290385735351,
"grad_norm": 0.8842340111732483,
"learning_rate": 0.0006905351693153198,
"loss": 1.8222,
"step": 226500
},
{
"epoch": 0.9375942240891499,
"grad_norm": 0.8076005578041077,
"learning_rate": 0.000689844396520163,
"loss": 1.824,
"step": 227000
},
{
"epoch": 0.9396594096047648,
"grad_norm": 0.8478308916091919,
"learning_rate": 0.0006891536237250061,
"loss": 1.8213,
"step": 227500
},
{
"epoch": 0.9417245951203796,
"grad_norm": 0.8478752374649048,
"learning_rate": 0.0006884628509298493,
"loss": 1.8271,
"step": 228000
},
{
"epoch": 0.9437897806359945,
"grad_norm": 0.8306804299354553,
"learning_rate": 0.0006877720781346924,
"loss": 1.8257,
"step": 228500
},
{
"epoch": 0.9458549661516094,
"grad_norm": 0.8503381013870239,
"learning_rate": 0.0006870813053395356,
"loss": 1.8196,
"step": 229000
},
{
"epoch": 0.9479201516672243,
"grad_norm": 0.7972338795661926,
"learning_rate": 0.0006863905325443787,
"loss": 1.8219,
"step": 229500
},
{
"epoch": 0.9499853371828392,
"grad_norm": 0.8305501341819763,
"learning_rate": 0.0006856997597492219,
"loss": 1.8204,
"step": 230000
},
{
"epoch": 0.952050522698454,
"grad_norm": 0.8877650499343872,
"learning_rate": 0.000685008986954065,
"loss": 1.8227,
"step": 230500
},
{
"epoch": 0.9541157082140689,
"grad_norm": 0.8762148022651672,
"learning_rate": 0.0006843182141589081,
"loss": 1.8224,
"step": 231000
},
{
"epoch": 0.9561808937296837,
"grad_norm": 0.7984791398048401,
"learning_rate": 0.0006836274413637513,
"loss": 1.8188,
"step": 231500
},
{
"epoch": 0.9582460792452986,
"grad_norm": 0.8119187355041504,
"learning_rate": 0.0006829366685685945,
"loss": 1.8212,
"step": 232000
},
{
"epoch": 0.9603112647609134,
"grad_norm": 0.8037796020507812,
"learning_rate": 0.0006822458957734376,
"loss": 1.8187,
"step": 232500
},
{
"epoch": 0.9623764502765283,
"grad_norm": 0.8950905799865723,
"learning_rate": 0.0006815551229782807,
"loss": 1.816,
"step": 233000
},
{
"epoch": 0.9644416357921433,
"grad_norm": 0.8347873091697693,
"learning_rate": 0.0006808643501831239,
"loss": 1.8218,
"step": 233500
},
{
"epoch": 0.9665068213077581,
"grad_norm": 0.8473377823829651,
"learning_rate": 0.000680173577387967,
"loss": 1.8195,
"step": 234000
},
{
"epoch": 0.968572006823373,
"grad_norm": 0.7937746644020081,
"learning_rate": 0.0006794828045928102,
"loss": 1.8171,
"step": 234500
},
{
"epoch": 0.9706371923389878,
"grad_norm": 0.7679367065429688,
"learning_rate": 0.0006787920317976534,
"loss": 1.8146,
"step": 235000
},
{
"epoch": 0.9727023778546027,
"grad_norm": 0.8515623807907104,
"learning_rate": 0.0006781012590024964,
"loss": 1.8173,
"step": 235500
},
{
"epoch": 0.9747675633702175,
"grad_norm": 0.8188038468360901,
"learning_rate": 0.0006774104862073396,
"loss": 1.817,
"step": 236000
},
{
"epoch": 0.9768327488858324,
"grad_norm": 0.7979288697242737,
"learning_rate": 0.0006767197134121827,
"loss": 1.819,
"step": 236500
},
{
"epoch": 0.9788979344014472,
"grad_norm": 0.8797492980957031,
"learning_rate": 0.000676028940617026,
"loss": 1.8148,
"step": 237000
},
{
"epoch": 0.9809631199170622,
"grad_norm": 0.8223576545715332,
"learning_rate": 0.0006753381678218691,
"loss": 1.815,
"step": 237500
},
{
"epoch": 0.983028305432677,
"grad_norm": 0.8249248266220093,
"learning_rate": 0.0006746473950267122,
"loss": 1.8174,
"step": 238000
},
{
"epoch": 0.9850934909482919,
"grad_norm": 0.8677356243133545,
"learning_rate": 0.0006739566222315553,
"loss": 1.8209,
"step": 238500
},
{
"epoch": 0.9871586764639068,
"grad_norm": 0.829744815826416,
"learning_rate": 0.0006732658494363984,
"loss": 1.8132,
"step": 239000
},
{
"epoch": 0.9892238619795216,
"grad_norm": 0.8238321542739868,
"learning_rate": 0.0006725750766412417,
"loss": 1.8104,
"step": 239500
},
{
"epoch": 0.9912890474951365,
"grad_norm": 0.8242679834365845,
"learning_rate": 0.0006718843038460848,
"loss": 1.8129,
"step": 240000
},
{
"epoch": 0.9933542330107513,
"grad_norm": 0.7887668013572693,
"learning_rate": 0.000671193531050928,
"loss": 1.8156,
"step": 240500
},
{
"epoch": 0.9954194185263662,
"grad_norm": 0.7950047850608826,
"learning_rate": 0.0006705027582557711,
"loss": 1.8158,
"step": 241000
},
{
"epoch": 0.9974846040419811,
"grad_norm": 0.7892596125602722,
"learning_rate": 0.0006698119854606141,
"loss": 1.8122,
"step": 241500
},
{
"epoch": 0.999549789557596,
"grad_norm": 0.8291964530944824,
"learning_rate": 0.0006691212126654574,
"loss": 1.8176,
"step": 242000
},
{
"epoch": 1.0016149750732108,
"grad_norm": 0.7552099227905273,
"learning_rate": 0.0006684304398703005,
"loss": 1.8117,
"step": 242500
},
{
"epoch": 1.0036801605888257,
"grad_norm": 0.9016017317771912,
"learning_rate": 0.0006677396670751437,
"loss": 1.8141,
"step": 243000
},
{
"epoch": 1.0057453461044406,
"grad_norm": 0.7623195648193359,
"learning_rate": 0.0006670488942799868,
"loss": 1.8104,
"step": 243500
},
{
"epoch": 1.0078105316200554,
"grad_norm": 0.8782749772071838,
"learning_rate": 0.00066635812148483,
"loss": 1.813,
"step": 244000
},
{
"epoch": 1.0098757171356703,
"grad_norm": 0.800456702709198,
"learning_rate": 0.000665667348689673,
"loss": 1.8105,
"step": 244500
},
{
"epoch": 1.0119409026512851,
"grad_norm": 0.855076014995575,
"learning_rate": 0.0006649765758945162,
"loss": 1.8081,
"step": 245000
},
{
"epoch": 1.0140060881669,
"grad_norm": 0.8036173582077026,
"learning_rate": 0.0006642858030993594,
"loss": 1.8112,
"step": 245500
},
{
"epoch": 1.0160712736825148,
"grad_norm": 0.8001554012298584,
"learning_rate": 0.0006635950303042025,
"loss": 1.8079,
"step": 246000
},
{
"epoch": 1.0181364591981297,
"grad_norm": 0.8144285082817078,
"learning_rate": 0.0006629042575090457,
"loss": 1.8076,
"step": 246500
},
{
"epoch": 1.0202016447137445,
"grad_norm": 0.8857467174530029,
"learning_rate": 0.0006622134847138889,
"loss": 1.8108,
"step": 247000
},
{
"epoch": 1.0222668302293596,
"grad_norm": 0.7909874320030212,
"learning_rate": 0.000661522711918732,
"loss": 1.8068,
"step": 247500
},
{
"epoch": 1.0243320157449745,
"grad_norm": 0.8089008331298828,
"learning_rate": 0.0006608319391235751,
"loss": 1.8093,
"step": 248000
},
{
"epoch": 1.0263972012605893,
"grad_norm": 0.8550245761871338,
"learning_rate": 0.0006601411663284182,
"loss": 1.8087,
"step": 248500
},
{
"epoch": 1.0284623867762042,
"grad_norm": 0.8594583868980408,
"learning_rate": 0.0006594503935332614,
"loss": 1.8062,
"step": 249000
},
{
"epoch": 1.030527572291819,
"grad_norm": 0.8355042338371277,
"learning_rate": 0.0006587596207381046,
"loss": 1.809,
"step": 249500
},
{
"epoch": 1.0325927578074339,
"grad_norm": 0.8276521563529968,
"learning_rate": 0.0006580688479429478,
"loss": 1.8138,
"step": 250000
},
{
"epoch": 1.0346579433230487,
"grad_norm": 0.8123018741607666,
"learning_rate": 0.0006573780751477908,
"loss": 1.8017,
"step": 250500
},
{
"epoch": 1.0367231288386636,
"grad_norm": 0.7968121767044067,
"learning_rate": 0.000656687302352634,
"loss": 1.8076,
"step": 251000
},
{
"epoch": 1.0387883143542784,
"grad_norm": 0.941233217716217,
"learning_rate": 0.0006559965295574771,
"loss": 1.8059,
"step": 251500
},
{
"epoch": 1.0408534998698933,
"grad_norm": 0.8153935074806213,
"learning_rate": 0.0006553057567623202,
"loss": 1.8065,
"step": 252000
},
{
"epoch": 1.0429186853855081,
"grad_norm": 0.7739303112030029,
"learning_rate": 0.0006546149839671635,
"loss": 1.8043,
"step": 252500
},
{
"epoch": 1.044983870901123,
"grad_norm": 0.8117313385009766,
"learning_rate": 0.0006539242111720066,
"loss": 1.8004,
"step": 253000
},
{
"epoch": 1.0470490564167378,
"grad_norm": 0.8029870390892029,
"learning_rate": 0.0006532334383768497,
"loss": 1.8046,
"step": 253500
},
{
"epoch": 1.0491142419323527,
"grad_norm": 0.8172849416732788,
"learning_rate": 0.0006525426655816928,
"loss": 1.8059,
"step": 254000
},
{
"epoch": 1.0511794274479676,
"grad_norm": 0.7874976992607117,
"learning_rate": 0.0006518518927865361,
"loss": 1.8054,
"step": 254500
},
{
"epoch": 1.0532446129635824,
"grad_norm": 0.7877236604690552,
"learning_rate": 0.0006511611199913792,
"loss": 1.8026,
"step": 255000
},
{
"epoch": 1.0553097984791975,
"grad_norm": 0.8584260940551758,
"learning_rate": 0.0006504703471962223,
"loss": 1.8007,
"step": 255500
},
{
"epoch": 1.0573749839948123,
"grad_norm": 0.8094419240951538,
"learning_rate": 0.0006497795744010655,
"loss": 1.7978,
"step": 256000
},
{
"epoch": 1.0594401695104272,
"grad_norm": 0.7890325784683228,
"learning_rate": 0.0006490888016059086,
"loss": 1.8,
"step": 256500
},
{
"epoch": 1.061505355026042,
"grad_norm": 0.8764976263046265,
"learning_rate": 0.0006483980288107518,
"loss": 1.8046,
"step": 257000
},
{
"epoch": 1.063570540541657,
"grad_norm": 0.8675107359886169,
"learning_rate": 0.0006477072560155949,
"loss": 1.8036,
"step": 257500
},
{
"epoch": 1.0656357260572717,
"grad_norm": 0.7634553909301758,
"learning_rate": 0.0006470164832204381,
"loss": 1.8018,
"step": 258000
},
{
"epoch": 1.0677009115728866,
"grad_norm": 0.7381558418273926,
"learning_rate": 0.0006463257104252812,
"loss": 1.7998,
"step": 258500
},
{
"epoch": 1.0697660970885015,
"grad_norm": 0.9076355695724487,
"learning_rate": 0.0006456349376301243,
"loss": 1.7948,
"step": 259000
},
{
"epoch": 1.0718312826041163,
"grad_norm": 0.9148507118225098,
"learning_rate": 0.0006449441648349676,
"loss": 1.8018,
"step": 259500
},
{
"epoch": 1.0738964681197312,
"grad_norm": 0.7980071902275085,
"learning_rate": 0.0006442533920398106,
"loss": 1.8023,
"step": 260000
},
{
"epoch": 1.075961653635346,
"grad_norm": 0.8765013217926025,
"learning_rate": 0.0006435626192446538,
"loss": 1.8003,
"step": 260500
},
{
"epoch": 1.0780268391509609,
"grad_norm": 0.8422530293464661,
"learning_rate": 0.0006428718464494969,
"loss": 1.802,
"step": 261000
},
{
"epoch": 1.0800920246665757,
"grad_norm": 0.8139374852180481,
"learning_rate": 0.0006421810736543401,
"loss": 1.8038,
"step": 261500
},
{
"epoch": 1.0821572101821906,
"grad_norm": 0.7891634702682495,
"learning_rate": 0.0006414903008591833,
"loss": 1.8047,
"step": 262000
},
{
"epoch": 1.0842223956978054,
"grad_norm": 0.822912335395813,
"learning_rate": 0.0006407995280640264,
"loss": 1.7979,
"step": 262500
},
{
"epoch": 1.0862875812134205,
"grad_norm": 0.8557060360908508,
"learning_rate": 0.0006401087552688695,
"loss": 1.7974,
"step": 263000
},
{
"epoch": 1.0883527667290354,
"grad_norm": 0.8948346972465515,
"learning_rate": 0.0006394179824737126,
"loss": 1.8004,
"step": 263500
},
{
"epoch": 1.0904179522446502,
"grad_norm": 0.795589029788971,
"learning_rate": 0.0006387272096785558,
"loss": 1.7988,
"step": 264000
},
{
"epoch": 1.092483137760265,
"grad_norm": 0.7854675054550171,
"learning_rate": 0.000638036436883399,
"loss": 1.7999,
"step": 264500
},
{
"epoch": 1.09454832327588,
"grad_norm": 0.8320429921150208,
"learning_rate": 0.0006373456640882422,
"loss": 1.7988,
"step": 265000
},
{
"epoch": 1.0966135087914948,
"grad_norm": 0.7923471331596375,
"learning_rate": 0.0006366548912930853,
"loss": 1.7947,
"step": 265500
},
{
"epoch": 1.0986786943071096,
"grad_norm": 0.7947016358375549,
"learning_rate": 0.0006359641184979283,
"loss": 1.7949,
"step": 266000
},
{
"epoch": 1.1007438798227245,
"grad_norm": 0.8632909655570984,
"learning_rate": 0.0006352733457027715,
"loss": 1.7917,
"step": 266500
},
{
"epoch": 1.1028090653383393,
"grad_norm": 0.8080165982246399,
"learning_rate": 0.0006345825729076146,
"loss": 1.7925,
"step": 267000
},
{
"epoch": 1.1048742508539542,
"grad_norm": 0.8370658159255981,
"learning_rate": 0.0006338918001124579,
"loss": 1.7926,
"step": 267500
},
{
"epoch": 1.106939436369569,
"grad_norm": 0.7986084818840027,
"learning_rate": 0.000633201027317301,
"loss": 1.7918,
"step": 268000
},
{
"epoch": 1.109004621885184,
"grad_norm": 0.8623395562171936,
"learning_rate": 0.0006325102545221442,
"loss": 1.7916,
"step": 268500
},
{
"epoch": 1.1110698074007987,
"grad_norm": 0.8417394757270813,
"learning_rate": 0.0006318194817269872,
"loss": 1.7924,
"step": 269000
},
{
"epoch": 1.1131349929164136,
"grad_norm": 0.8161811232566833,
"learning_rate": 0.0006311287089318303,
"loss": 1.7914,
"step": 269500
},
{
"epoch": 1.1152001784320285,
"grad_norm": 0.8812907338142395,
"learning_rate": 0.0006304379361366736,
"loss": 1.7906,
"step": 270000
},
{
"epoch": 1.1172653639476433,
"grad_norm": 0.7669122219085693,
"learning_rate": 0.0006297471633415167,
"loss": 1.7917,
"step": 270500
},
{
"epoch": 1.1193305494632584,
"grad_norm": 0.792958676815033,
"learning_rate": 0.0006290563905463599,
"loss": 1.7916,
"step": 271000
},
{
"epoch": 1.1213957349788732,
"grad_norm": 0.8431819677352905,
"learning_rate": 0.000628365617751203,
"loss": 1.7913,
"step": 271500
},
{
"epoch": 1.123460920494488,
"grad_norm": 0.8096106648445129,
"learning_rate": 0.0006276748449560462,
"loss": 1.7937,
"step": 272000
},
{
"epoch": 1.125526106010103,
"grad_norm": 0.8288501501083374,
"learning_rate": 0.0006269840721608893,
"loss": 1.7928,
"step": 272500
},
{
"epoch": 1.1275912915257178,
"grad_norm": 0.8212178349494934,
"learning_rate": 0.0006262932993657324,
"loss": 1.7892,
"step": 273000
},
{
"epoch": 1.1296564770413327,
"grad_norm": 0.7889783978462219,
"learning_rate": 0.0006256025265705756,
"loss": 1.7865,
"step": 273500
},
{
"epoch": 1.1317216625569475,
"grad_norm": 0.8126891255378723,
"learning_rate": 0.0006249117537754187,
"loss": 1.7884,
"step": 274000
},
{
"epoch": 1.1337868480725624,
"grad_norm": 0.8215599060058594,
"learning_rate": 0.000624220980980262,
"loss": 1.788,
"step": 274500
},
{
"epoch": 1.1358520335881772,
"grad_norm": 0.7967174053192139,
"learning_rate": 0.000623530208185105,
"loss": 1.7886,
"step": 275000
},
{
"epoch": 1.137917219103792,
"grad_norm": 0.8354322910308838,
"learning_rate": 0.0006228394353899482,
"loss": 1.7841,
"step": 275500
},
{
"epoch": 1.139982404619407,
"grad_norm": 0.7985238432884216,
"learning_rate": 0.0006221486625947913,
"loss": 1.7886,
"step": 276000
},
{
"epoch": 1.1420475901350218,
"grad_norm": 0.8069713115692139,
"learning_rate": 0.0006214578897996344,
"loss": 1.7893,
"step": 276500
},
{
"epoch": 1.1441127756506366,
"grad_norm": 0.8244253396987915,
"learning_rate": 0.0006207671170044777,
"loss": 1.7846,
"step": 277000
},
{
"epoch": 1.1461779611662515,
"grad_norm": 0.7911844253540039,
"learning_rate": 0.0006200763442093208,
"loss": 1.782,
"step": 277500
},
{
"epoch": 1.1482431466818666,
"grad_norm": 0.8204144239425659,
"learning_rate": 0.0006193855714141639,
"loss": 1.7877,
"step": 278000
},
{
"epoch": 1.1503083321974814,
"grad_norm": 0.7840794920921326,
"learning_rate": 0.000618694798619007,
"loss": 1.7855,
"step": 278500
},
{
"epoch": 1.1523735177130963,
"grad_norm": 0.7567317485809326,
"learning_rate": 0.0006180040258238502,
"loss": 1.7815,
"step": 279000
},
{
"epoch": 1.1544387032287111,
"grad_norm": 0.8889859914779663,
"learning_rate": 0.0006173132530286934,
"loss": 1.7844,
"step": 279500
},
{
"epoch": 1.156503888744326,
"grad_norm": 0.7965997457504272,
"learning_rate": 0.0006166224802335365,
"loss": 1.7823,
"step": 280000
},
{
"epoch": 1.1585690742599408,
"grad_norm": 0.7915734052658081,
"learning_rate": 0.0006159317074383797,
"loss": 1.7829,
"step": 280500
},
{
"epoch": 1.1606342597755557,
"grad_norm": 0.8453460335731506,
"learning_rate": 0.0006152409346432227,
"loss": 1.7854,
"step": 281000
},
{
"epoch": 1.1626994452911705,
"grad_norm": 0.8347840905189514,
"learning_rate": 0.0006145501618480659,
"loss": 1.7777,
"step": 281500
},
{
"epoch": 1.1647646308067854,
"grad_norm": 0.806870698928833,
"learning_rate": 0.000613859389052909,
"loss": 1.7814,
"step": 282000
},
{
"epoch": 1.1668298163224002,
"grad_norm": 0.7722708582878113,
"learning_rate": 0.0006131686162577523,
"loss": 1.7816,
"step": 282500
},
{
"epoch": 1.168895001838015,
"grad_norm": 0.7900815606117249,
"learning_rate": 0.0006124778434625954,
"loss": 1.78,
"step": 283000
},
{
"epoch": 1.17096018735363,
"grad_norm": 0.8070388436317444,
"learning_rate": 0.0006117870706674385,
"loss": 1.7794,
"step": 283500
},
{
"epoch": 1.1730253728692448,
"grad_norm": 0.8343568444252014,
"learning_rate": 0.0006110962978722816,
"loss": 1.7817,
"step": 284000
},
{
"epoch": 1.1750905583848597,
"grad_norm": 0.7810460329055786,
"learning_rate": 0.0006104055250771247,
"loss": 1.7822,
"step": 284500
},
{
"epoch": 1.1771557439004745,
"grad_norm": 0.8281691670417786,
"learning_rate": 0.000609714752281968,
"loss": 1.7773,
"step": 285000
},
{
"epoch": 1.1792209294160894,
"grad_norm": 0.7959678769111633,
"learning_rate": 0.0006090239794868111,
"loss": 1.7755,
"step": 285500
},
{
"epoch": 1.1812861149317042,
"grad_norm": 0.7893877625465393,
"learning_rate": 0.0006083332066916543,
"loss": 1.7788,
"step": 286000
},
{
"epoch": 1.183351300447319,
"grad_norm": 0.794282078742981,
"learning_rate": 0.0006076424338964974,
"loss": 1.7785,
"step": 286500
},
{
"epoch": 1.1854164859629341,
"grad_norm": 0.833561360836029,
"learning_rate": 0.0006069516611013404,
"loss": 1.778,
"step": 287000
},
{
"epoch": 1.187481671478549,
"grad_norm": 0.7725043296813965,
"learning_rate": 0.0006062608883061837,
"loss": 1.7763,
"step": 287500
},
{
"epoch": 1.1895468569941638,
"grad_norm": 0.8378251194953918,
"learning_rate": 0.0006055701155110268,
"loss": 1.7785,
"step": 288000
},
{
"epoch": 1.1916120425097787,
"grad_norm": 0.8435170650482178,
"learning_rate": 0.00060487934271587,
"loss": 1.7808,
"step": 288500
},
{
"epoch": 1.1936772280253936,
"grad_norm": 0.7910299301147461,
"learning_rate": 0.0006041885699207131,
"loss": 1.7791,
"step": 289000
},
{
"epoch": 1.1957424135410084,
"grad_norm": 0.7965072989463806,
"learning_rate": 0.0006034977971255562,
"loss": 1.7762,
"step": 289500
},
{
"epoch": 1.1978075990566233,
"grad_norm": 0.7592757344245911,
"learning_rate": 0.0006028070243303994,
"loss": 1.7724,
"step": 290000
},
{
"epoch": 1.1998727845722381,
"grad_norm": 0.7980614304542542,
"learning_rate": 0.0006021162515352425,
"loss": 1.7773,
"step": 290500
},
{
"epoch": 1.201937970087853,
"grad_norm": 0.8618481755256653,
"learning_rate": 0.0006014254787400857,
"loss": 1.7773,
"step": 291000
},
{
"epoch": 1.2040031556034678,
"grad_norm": 0.7855138778686523,
"learning_rate": 0.0006007347059449288,
"loss": 1.7775,
"step": 291500
},
{
"epoch": 1.2060683411190827,
"grad_norm": 0.9088487029075623,
"learning_rate": 0.000600043933149772,
"loss": 1.7732,
"step": 292000
},
{
"epoch": 1.2081335266346975,
"grad_norm": 0.7684744000434875,
"learning_rate": 0.0005993531603546152,
"loss": 1.7712,
"step": 292500
},
{
"epoch": 1.2101987121503124,
"grad_norm": 0.8111701607704163,
"learning_rate": 0.0005986623875594582,
"loss": 1.7728,
"step": 293000
},
{
"epoch": 1.2122638976659272,
"grad_norm": 0.7883111834526062,
"learning_rate": 0.0005979716147643014,
"loss": 1.7744,
"step": 293500
},
{
"epoch": 1.2143290831815423,
"grad_norm": 0.888268768787384,
"learning_rate": 0.0005972808419691445,
"loss": 1.7758,
"step": 294000
},
{
"epoch": 1.2163942686971572,
"grad_norm": 1.260141372680664,
"learning_rate": 0.0005965900691739878,
"loss": 1.7733,
"step": 294500
},
{
"epoch": 1.218459454212772,
"grad_norm": 0.7965800166130066,
"learning_rate": 0.0005958992963788309,
"loss": 1.7681,
"step": 295000
},
{
"epoch": 1.2205246397283869,
"grad_norm": 0.8069186806678772,
"learning_rate": 0.0005952085235836741,
"loss": 1.7709,
"step": 295500
},
{
"epoch": 1.2225898252440017,
"grad_norm": 0.7815278172492981,
"learning_rate": 0.0005945177507885171,
"loss": 1.7746,
"step": 296000
},
{
"epoch": 1.2246550107596166,
"grad_norm": 0.8087014555931091,
"learning_rate": 0.0005938269779933602,
"loss": 1.7769,
"step": 296500
},
{
"epoch": 1.2267201962752314,
"grad_norm": 0.8358011245727539,
"learning_rate": 0.0005931362051982034,
"loss": 1.7748,
"step": 297000
},
{
"epoch": 1.2287853817908463,
"grad_norm": 0.7773184776306152,
"learning_rate": 0.0005924454324030466,
"loss": 1.7687,
"step": 297500
},
{
"epoch": 1.2308505673064611,
"grad_norm": 0.8666139245033264,
"learning_rate": 0.0005917546596078898,
"loss": 1.771,
"step": 298000
},
{
"epoch": 1.232915752822076,
"grad_norm": 0.7656075954437256,
"learning_rate": 0.0005910638868127329,
"loss": 1.7668,
"step": 298500
},
{
"epoch": 1.2349809383376908,
"grad_norm": 0.78355473279953,
"learning_rate": 0.000590373114017576,
"loss": 1.7773,
"step": 299000
},
{
"epoch": 1.2370461238533057,
"grad_norm": 0.7689515948295593,
"learning_rate": 0.0005896823412224191,
"loss": 1.7722,
"step": 299500
},
{
"epoch": 1.2391113093689206,
"grad_norm": 0.7110136151313782,
"learning_rate": 0.0005889915684272623,
"loss": 1.7741,
"step": 300000
},
{
"epoch": 1.2411764948845354,
"grad_norm": 0.7835440635681152,
"learning_rate": 0.0005883007956321055,
"loss": 1.7699,
"step": 300500
},
{
"epoch": 1.2432416804001503,
"grad_norm": 0.8212382793426514,
"learning_rate": 0.0005876100228369486,
"loss": 1.7723,
"step": 301000
},
{
"epoch": 1.2453068659157651,
"grad_norm": 1.0025386810302734,
"learning_rate": 0.0005869192500417918,
"loss": 1.774,
"step": 301500
},
{
"epoch": 1.24737205143138,
"grad_norm": 0.8511669635772705,
"learning_rate": 0.0005862284772466348,
"loss": 1.769,
"step": 302000
},
{
"epoch": 1.249437236946995,
"grad_norm": 0.8234755992889404,
"learning_rate": 0.0005855377044514781,
"loss": 1.7718,
"step": 302500
},
{
"epoch": 1.25150242246261,
"grad_norm": 0.7945066094398499,
"learning_rate": 0.0005848469316563212,
"loss": 1.7678,
"step": 303000
},
{
"epoch": 1.2535676079782248,
"grad_norm": 0.7915132641792297,
"learning_rate": 0.0005841561588611643,
"loss": 1.7661,
"step": 303500
},
{
"epoch": 1.2556327934938396,
"grad_norm": 0.7837565541267395,
"learning_rate": 0.0005834653860660075,
"loss": 1.7655,
"step": 304000
},
{
"epoch": 1.2576979790094545,
"grad_norm": 0.8219539523124695,
"learning_rate": 0.0005827746132708506,
"loss": 1.768,
"step": 304500
},
{
"epoch": 1.2597631645250693,
"grad_norm": 0.7561802268028259,
"learning_rate": 0.0005820838404756938,
"loss": 1.7652,
"step": 305000
},
{
"epoch": 1.2618283500406842,
"grad_norm": 0.7804844975471497,
"learning_rate": 0.0005813930676805369,
"loss": 1.767,
"step": 305500
},
{
"epoch": 1.263893535556299,
"grad_norm": 0.7776834964752197,
"learning_rate": 0.0005807022948853801,
"loss": 1.7619,
"step": 306000
},
{
"epoch": 1.2659587210719139,
"grad_norm": 0.7807164192199707,
"learning_rate": 0.0005800115220902232,
"loss": 1.7677,
"step": 306500
},
{
"epoch": 1.2680239065875287,
"grad_norm": 0.7830272316932678,
"learning_rate": 0.0005793207492950663,
"loss": 1.7638,
"step": 307000
},
{
"epoch": 1.2700890921031436,
"grad_norm": 0.7787231206893921,
"learning_rate": 0.0005786299764999096,
"loss": 1.7605,
"step": 307500
},
{
"epoch": 1.2721542776187584,
"grad_norm": 0.7798328399658203,
"learning_rate": 0.0005779392037047526,
"loss": 1.7617,
"step": 308000
},
{
"epoch": 1.2742194631343733,
"grad_norm": 0.8115136027336121,
"learning_rate": 0.0005772484309095958,
"loss": 1.7629,
"step": 308500
},
{
"epoch": 1.2762846486499884,
"grad_norm": 0.7370808720588684,
"learning_rate": 0.0005765576581144389,
"loss": 1.7584,
"step": 309000
},
{
"epoch": 1.2783498341656032,
"grad_norm": 0.7543977499008179,
"learning_rate": 0.0005758668853192822,
"loss": 1.7643,
"step": 309500
},
{
"epoch": 1.280415019681218,
"grad_norm": 0.8601275086402893,
"learning_rate": 0.0005751761125241253,
"loss": 1.7606,
"step": 310000
},
{
"epoch": 1.282480205196833,
"grad_norm": 0.7618570327758789,
"learning_rate": 0.0005744853397289684,
"loss": 1.7604,
"step": 310500
},
{
"epoch": 1.2845453907124478,
"grad_norm": 0.7939981818199158,
"learning_rate": 0.0005737945669338115,
"loss": 1.7632,
"step": 311000
},
{
"epoch": 1.2866105762280626,
"grad_norm": 0.7793112397193909,
"learning_rate": 0.0005731037941386546,
"loss": 1.7631,
"step": 311500
},
{
"epoch": 1.2886757617436775,
"grad_norm": 0.7892638444900513,
"learning_rate": 0.0005724130213434978,
"loss": 1.7576,
"step": 312000
},
{
"epoch": 1.2907409472592923,
"grad_norm": 0.8487522602081299,
"learning_rate": 0.000571722248548341,
"loss": 1.761,
"step": 312500
},
{
"epoch": 1.2928061327749072,
"grad_norm": 0.7883718609809875,
"learning_rate": 0.0005710314757531842,
"loss": 1.7571,
"step": 313000
},
{
"epoch": 1.294871318290522,
"grad_norm": 0.7739648818969727,
"learning_rate": 0.0005703407029580273,
"loss": 1.7595,
"step": 313500
},
{
"epoch": 1.296936503806137,
"grad_norm": 0.7544906139373779,
"learning_rate": 0.0005696499301628703,
"loss": 1.7563,
"step": 314000
},
{
"epoch": 1.2990016893217518,
"grad_norm": 0.7876557111740112,
"learning_rate": 0.0005689591573677135,
"loss": 1.7623,
"step": 314500
},
{
"epoch": 1.3010668748373666,
"grad_norm": 0.8152881860733032,
"learning_rate": 0.0005682683845725567,
"loss": 1.7561,
"step": 315000
},
{
"epoch": 1.3031320603529815,
"grad_norm": 0.8232436180114746,
"learning_rate": 0.0005675776117773999,
"loss": 1.7547,
"step": 315500
},
{
"epoch": 1.3051972458685963,
"grad_norm": 0.8717594742774963,
"learning_rate": 0.000566886838982243,
"loss": 1.7589,
"step": 316000
},
{
"epoch": 1.3072624313842112,
"grad_norm": 0.8005387187004089,
"learning_rate": 0.0005661960661870862,
"loss": 1.7531,
"step": 316500
},
{
"epoch": 1.309327616899826,
"grad_norm": 0.7955303192138672,
"learning_rate": 0.0005655052933919294,
"loss": 1.7581,
"step": 317000
},
{
"epoch": 1.3113928024154409,
"grad_norm": 0.785753071308136,
"learning_rate": 0.0005648145205967724,
"loss": 1.7584,
"step": 317500
},
{
"epoch": 1.3134579879310557,
"grad_norm": 0.775891125202179,
"learning_rate": 0.0005641237478016156,
"loss": 1.755,
"step": 318000
},
{
"epoch": 1.3155231734466706,
"grad_norm": 0.8091910481452942,
"learning_rate": 0.0005634329750064587,
"loss": 1.7562,
"step": 318500
},
{
"epoch": 1.3175883589622857,
"grad_norm": 0.7912936806678772,
"learning_rate": 0.0005627422022113019,
"loss": 1.7593,
"step": 319000
},
{
"epoch": 1.3196535444779005,
"grad_norm": 0.7949129343032837,
"learning_rate": 0.000562051429416145,
"loss": 1.7555,
"step": 319500
},
{
"epoch": 1.3217187299935154,
"grad_norm": 0.784271240234375,
"learning_rate": 0.0005613606566209883,
"loss": 1.7578,
"step": 320000
},
{
"epoch": 1.3237839155091302,
"grad_norm": 0.8425039052963257,
"learning_rate": 0.0005606698838258313,
"loss": 1.7578,
"step": 320500
},
{
"epoch": 1.325849101024745,
"grad_norm": 0.9427282214164734,
"learning_rate": 0.0005599791110306744,
"loss": 1.7553,
"step": 321000
},
{
"epoch": 1.32791428654036,
"grad_norm": 2.0400569438934326,
"learning_rate": 0.0005592883382355176,
"loss": 1.7582,
"step": 321500
},
{
"epoch": 1.3299794720559748,
"grad_norm": 0.8407232761383057,
"learning_rate": 0.0005585975654403607,
"loss": 1.7528,
"step": 322000
},
{
"epoch": 1.3320446575715896,
"grad_norm": 0.8112275004386902,
"learning_rate": 0.000557906792645204,
"loss": 1.7532,
"step": 322500
},
{
"epoch": 1.3341098430872045,
"grad_norm": 0.7587988376617432,
"learning_rate": 0.0005572160198500471,
"loss": 1.7578,
"step": 323000
},
{
"epoch": 1.3361750286028193,
"grad_norm": 0.7508676052093506,
"learning_rate": 0.0005565252470548902,
"loss": 1.7538,
"step": 323500
},
{
"epoch": 1.3382402141184342,
"grad_norm": 0.7507205009460449,
"learning_rate": 0.0005558344742597333,
"loss": 1.7516,
"step": 324000
},
{
"epoch": 1.340305399634049,
"grad_norm": 0.8090864419937134,
"learning_rate": 0.0005551437014645764,
"loss": 1.7537,
"step": 324500
},
{
"epoch": 1.3423705851496641,
"grad_norm": 0.7598680853843689,
"learning_rate": 0.0005544529286694197,
"loss": 1.7528,
"step": 325000
},
{
"epoch": 1.344435770665279,
"grad_norm": 0.795917272567749,
"learning_rate": 0.0005537621558742628,
"loss": 1.7503,
"step": 325500
},
{
"epoch": 1.3465009561808938,
"grad_norm": 0.8306310772895813,
"learning_rate": 0.000553071383079106,
"loss": 1.7519,
"step": 326000
},
{
"epoch": 1.3485661416965087,
"grad_norm": 0.7626925110816956,
"learning_rate": 0.000552380610283949,
"loss": 1.7504,
"step": 326500
},
{
"epoch": 1.3506313272121235,
"grad_norm": 0.8428457975387573,
"learning_rate": 0.0005516898374887922,
"loss": 1.75,
"step": 327000
},
{
"epoch": 1.3526965127277384,
"grad_norm": 0.7658423185348511,
"learning_rate": 0.0005509990646936354,
"loss": 1.7544,
"step": 327500
},
{
"epoch": 1.3547616982433532,
"grad_norm": 0.7995271682739258,
"learning_rate": 0.0005503082918984785,
"loss": 1.7523,
"step": 328000
},
{
"epoch": 1.356826883758968,
"grad_norm": 0.7842050790786743,
"learning_rate": 0.0005496175191033217,
"loss": 1.75,
"step": 328500
},
{
"epoch": 1.358892069274583,
"grad_norm": 0.7905313968658447,
"learning_rate": 0.0005489267463081648,
"loss": 1.7459,
"step": 329000
},
{
"epoch": 1.3609572547901978,
"grad_norm": 0.7925072312355042,
"learning_rate": 0.0005482359735130079,
"loss": 1.7489,
"step": 329500
},
{
"epoch": 1.3630224403058127,
"grad_norm": 0.786297082901001,
"learning_rate": 0.0005475452007178511,
"loss": 1.7511,
"step": 330000
},
{
"epoch": 1.3650876258214275,
"grad_norm": 0.8276360630989075,
"learning_rate": 0.0005468544279226943,
"loss": 1.7501,
"step": 330500
},
{
"epoch": 1.3671528113370424,
"grad_norm": 0.7593994736671448,
"learning_rate": 0.0005461636551275374,
"loss": 1.7495,
"step": 331000
},
{
"epoch": 1.3692179968526572,
"grad_norm": 0.7812342047691345,
"learning_rate": 0.0005454728823323805,
"loss": 1.7444,
"step": 331500
},
{
"epoch": 1.371283182368272,
"grad_norm": 0.8850775957107544,
"learning_rate": 0.0005447821095372237,
"loss": 1.7459,
"step": 332000
},
{
"epoch": 1.373348367883887,
"grad_norm": 0.7758823037147522,
"learning_rate": 0.0005440913367420668,
"loss": 1.7466,
"step": 332500
},
{
"epoch": 1.3754135533995018,
"grad_norm": 0.7858127951622009,
"learning_rate": 0.00054340056394691,
"loss": 1.7484,
"step": 333000
},
{
"epoch": 1.3774787389151166,
"grad_norm": 0.7900636792182922,
"learning_rate": 0.0005427097911517531,
"loss": 1.7463,
"step": 333500
},
{
"epoch": 1.3795439244307315,
"grad_norm": 0.8080687522888184,
"learning_rate": 0.0005420190183565963,
"loss": 1.7465,
"step": 334000
},
{
"epoch": 1.3816091099463466,
"grad_norm": 0.8253558278083801,
"learning_rate": 0.0005413282455614394,
"loss": 1.7429,
"step": 334500
},
{
"epoch": 1.3836742954619614,
"grad_norm": 0.8424259424209595,
"learning_rate": 0.0005406374727662826,
"loss": 1.748,
"step": 335000
},
{
"epoch": 1.3857394809775763,
"grad_norm": 0.7918978929519653,
"learning_rate": 0.0005399466999711257,
"loss": 1.7424,
"step": 335500
},
{
"epoch": 1.3878046664931911,
"grad_norm": 0.7710541486740112,
"learning_rate": 0.0005392559271759688,
"loss": 1.7429,
"step": 336000
},
{
"epoch": 1.389869852008806,
"grad_norm": 0.8952863216400146,
"learning_rate": 0.000538565154380812,
"loss": 1.7379,
"step": 336500
},
{
"epoch": 1.3919350375244208,
"grad_norm": 0.7974414229393005,
"learning_rate": 0.0005378743815856551,
"loss": 1.7442,
"step": 337000
},
{
"epoch": 1.3940002230400357,
"grad_norm": 0.8121086955070496,
"learning_rate": 0.0005371836087904984,
"loss": 1.7417,
"step": 337500
},
{
"epoch": 1.3960654085556505,
"grad_norm": 0.7721625566482544,
"learning_rate": 0.0005364928359953415,
"loss": 1.7399,
"step": 338000
},
{
"epoch": 1.3981305940712654,
"grad_norm": 0.7423729300498962,
"learning_rate": 0.0005358020632001845,
"loss": 1.7451,
"step": 338500
},
{
"epoch": 1.4001957795868802,
"grad_norm": 0.8138153553009033,
"learning_rate": 0.0005351112904050277,
"loss": 1.7405,
"step": 339000
},
{
"epoch": 1.402260965102495,
"grad_norm": 0.8146694302558899,
"learning_rate": 0.0005344205176098708,
"loss": 1.7431,
"step": 339500
},
{
"epoch": 1.40432615061811,
"grad_norm": 0.826502799987793,
"learning_rate": 0.0005337297448147141,
"loss": 1.7389,
"step": 340000
},
{
"epoch": 1.406391336133725,
"grad_norm": 0.7904347777366638,
"learning_rate": 0.0005330389720195572,
"loss": 1.7387,
"step": 340500
},
{
"epoch": 1.4084565216493399,
"grad_norm": 0.7897937297821045,
"learning_rate": 0.0005323481992244004,
"loss": 1.7428,
"step": 341000
},
{
"epoch": 1.4105217071649547,
"grad_norm": 0.8036805391311646,
"learning_rate": 0.0005316574264292434,
"loss": 1.7417,
"step": 341500
},
{
"epoch": 1.4125868926805696,
"grad_norm": 0.7628007531166077,
"learning_rate": 0.0005309666536340865,
"loss": 1.7404,
"step": 342000
},
{
"epoch": 1.4146520781961844,
"grad_norm": 0.8156040906906128,
"learning_rate": 0.0005302758808389298,
"loss": 1.7374,
"step": 342500
},
{
"epoch": 1.4167172637117993,
"grad_norm": 0.8283891081809998,
"learning_rate": 0.0005295851080437729,
"loss": 1.7373,
"step": 343000
},
{
"epoch": 1.4187824492274141,
"grad_norm": 0.8151522278785706,
"learning_rate": 0.0005288943352486161,
"loss": 1.7356,
"step": 343500
},
{
"epoch": 1.420847634743029,
"grad_norm": 0.8706732988357544,
"learning_rate": 0.0005282035624534592,
"loss": 1.7403,
"step": 344000
},
{
"epoch": 1.4229128202586439,
"grad_norm": 0.8165752291679382,
"learning_rate": 0.0005275127896583023,
"loss": 1.7405,
"step": 344500
},
{
"epoch": 1.4249780057742587,
"grad_norm": 0.8452313542366028,
"learning_rate": 0.0005268220168631455,
"loss": 1.7385,
"step": 345000
},
{
"epoch": 1.4270431912898736,
"grad_norm": 0.8291791081428528,
"learning_rate": 0.0005261312440679886,
"loss": 1.7381,
"step": 345500
},
{
"epoch": 1.4291083768054884,
"grad_norm": 0.8294808864593506,
"learning_rate": 0.0005254404712728318,
"loss": 1.7398,
"step": 346000
},
{
"epoch": 1.4311735623211033,
"grad_norm": 0.8119639754295349,
"learning_rate": 0.0005247496984776749,
"loss": 1.7386,
"step": 346500
},
{
"epoch": 1.4332387478367181,
"grad_norm": 0.7947481870651245,
"learning_rate": 0.0005240589256825181,
"loss": 1.7361,
"step": 347000
},
{
"epoch": 1.435303933352333,
"grad_norm": 0.8204724192619324,
"learning_rate": 0.0005233681528873612,
"loss": 1.7405,
"step": 347500
},
{
"epoch": 1.4373691188679478,
"grad_norm": 0.7718450427055359,
"learning_rate": 0.0005226773800922044,
"loss": 1.7398,
"step": 348000
},
{
"epoch": 1.4394343043835627,
"grad_norm": 0.7919915318489075,
"learning_rate": 0.0005219866072970475,
"loss": 1.7334,
"step": 348500
},
{
"epoch": 1.4414994898991775,
"grad_norm": 0.8244622945785522,
"learning_rate": 0.0005212958345018906,
"loss": 1.7363,
"step": 349000
},
{
"epoch": 1.4435646754147924,
"grad_norm": 0.8124867677688599,
"learning_rate": 0.0005206050617067338,
"loss": 1.7368,
"step": 349500
},
{
"epoch": 1.4456298609304072,
"grad_norm": 0.8139218091964722,
"learning_rate": 0.000519914288911577,
"loss": 1.7344,
"step": 350000
},
{
"epoch": 1.4476950464460223,
"grad_norm": 0.7997359037399292,
"learning_rate": 0.0005192235161164201,
"loss": 1.7296,
"step": 350500
},
{
"epoch": 1.4497602319616372,
"grad_norm": 0.8655456304550171,
"learning_rate": 0.0005185327433212632,
"loss": 1.7353,
"step": 351000
},
{
"epoch": 1.451825417477252,
"grad_norm": 0.8099657893180847,
"learning_rate": 0.0005178419705261064,
"loss": 1.7356,
"step": 351500
},
{
"epoch": 1.4538906029928669,
"grad_norm": 0.7905128002166748,
"learning_rate": 0.0005171511977309495,
"loss": 1.7331,
"step": 352000
},
{
"epoch": 1.4559557885084817,
"grad_norm": 0.7679085731506348,
"learning_rate": 0.0005164604249357927,
"loss": 1.7347,
"step": 352500
},
{
"epoch": 1.4580209740240966,
"grad_norm": 0.7984927892684937,
"learning_rate": 0.0005157696521406359,
"loss": 1.7331,
"step": 353000
},
{
"epoch": 1.4600861595397114,
"grad_norm": 0.8509982824325562,
"learning_rate": 0.0005150788793454789,
"loss": 1.7281,
"step": 353500
},
{
"epoch": 1.4621513450553263,
"grad_norm": 0.756581723690033,
"learning_rate": 0.0005143881065503221,
"loss": 1.7272,
"step": 354000
},
{
"epoch": 1.4642165305709411,
"grad_norm": 0.808980405330658,
"learning_rate": 0.0005136973337551652,
"loss": 1.7321,
"step": 354500
},
{
"epoch": 1.466281716086556,
"grad_norm": 0.8383910059928894,
"learning_rate": 0.0005130065609600085,
"loss": 1.7337,
"step": 355000
},
{
"epoch": 1.4683469016021709,
"grad_norm": 0.7818363308906555,
"learning_rate": 0.0005123157881648516,
"loss": 1.7327,
"step": 355500
},
{
"epoch": 1.470412087117786,
"grad_norm": 0.7779876589775085,
"learning_rate": 0.0005116250153696947,
"loss": 1.7317,
"step": 356000
},
{
"epoch": 1.4724772726334008,
"grad_norm": 0.7729701399803162,
"learning_rate": 0.0005109342425745378,
"loss": 1.7382,
"step": 356500
},
{
"epoch": 1.4745424581490156,
"grad_norm": 0.7971392273902893,
"learning_rate": 0.0005102434697793809,
"loss": 1.7274,
"step": 357000
},
{
"epoch": 1.4766076436646305,
"grad_norm": 0.8112899661064148,
"learning_rate": 0.0005095526969842242,
"loss": 1.7297,
"step": 357500
},
{
"epoch": 1.4786728291802453,
"grad_norm": 0.7968249917030334,
"learning_rate": 0.0005088619241890673,
"loss": 1.7293,
"step": 358000
},
{
"epoch": 1.4807380146958602,
"grad_norm": 0.7626878619194031,
"learning_rate": 0.0005081711513939105,
"loss": 1.7308,
"step": 358500
},
{
"epoch": 1.482803200211475,
"grad_norm": 0.7603055834770203,
"learning_rate": 0.0005074803785987536,
"loss": 1.7285,
"step": 359000
},
{
"epoch": 1.48486838572709,
"grad_norm": 0.7844238877296448,
"learning_rate": 0.0005067896058035966,
"loss": 1.7296,
"step": 359500
},
{
"epoch": 1.4869335712427048,
"grad_norm": 0.7728045582771301,
"learning_rate": 0.0005060988330084399,
"loss": 1.7287,
"step": 360000
},
{
"epoch": 1.4889987567583196,
"grad_norm": 0.7845308184623718,
"learning_rate": 0.000505408060213283,
"loss": 1.7249,
"step": 360500
},
{
"epoch": 1.4910639422739345,
"grad_norm": 0.8352622985839844,
"learning_rate": 0.0005047172874181262,
"loss": 1.7259,
"step": 361000
},
{
"epoch": 1.4931291277895493,
"grad_norm": 0.8270286917686462,
"learning_rate": 0.0005040265146229693,
"loss": 1.7291,
"step": 361500
},
{
"epoch": 1.4951943133051642,
"grad_norm": 0.7802717089653015,
"learning_rate": 0.0005033357418278125,
"loss": 1.7243,
"step": 362000
},
{
"epoch": 1.497259498820779,
"grad_norm": 0.7886295914649963,
"learning_rate": 0.0005026449690326556,
"loss": 1.7298,
"step": 362500
},
{
"epoch": 1.4993246843363939,
"grad_norm": 0.8236453533172607,
"learning_rate": 0.0005019541962374987,
"loss": 1.725,
"step": 363000
},
{
"epoch": 1.5013898698520087,
"grad_norm": 0.7767708897590637,
"learning_rate": 0.0005012634234423419,
"loss": 1.7302,
"step": 363500
},
{
"epoch": 1.5034550553676236,
"grad_norm": 0.7658302783966064,
"learning_rate": 0.000500572650647185,
"loss": 1.7227,
"step": 364000
},
{
"epoch": 1.5055202408832384,
"grad_norm": 0.7607765793800354,
"learning_rate": 0.0004998818778520282,
"loss": 1.7292,
"step": 364500
},
{
"epoch": 1.5075854263988533,
"grad_norm": 0.7231427431106567,
"learning_rate": 0.0004991911050568714,
"loss": 1.7257,
"step": 365000
},
{
"epoch": 1.5096506119144681,
"grad_norm": 0.965886116027832,
"learning_rate": 0.0004985003322617145,
"loss": 1.7238,
"step": 365500
},
{
"epoch": 1.511715797430083,
"grad_norm": 0.8113678097724915,
"learning_rate": 0.0004978095594665576,
"loss": 1.7213,
"step": 366000
},
{
"epoch": 1.5137809829456979,
"grad_norm": 0.8098276853561401,
"learning_rate": 0.0004971187866714007,
"loss": 1.7289,
"step": 366500
},
{
"epoch": 1.515846168461313,
"grad_norm": 1.9579529762268066,
"learning_rate": 0.0004964280138762439,
"loss": 1.7273,
"step": 367000
},
{
"epoch": 1.5179113539769278,
"grad_norm": 0.8172611594200134,
"learning_rate": 0.0004957372410810871,
"loss": 1.728,
"step": 367500
},
{
"epoch": 1.5199765394925426,
"grad_norm": 0.7897489666938782,
"learning_rate": 0.0004950464682859302,
"loss": 1.722,
"step": 368000
},
{
"epoch": 1.5220417250081575,
"grad_norm": 0.804604172706604,
"learning_rate": 0.0004943556954907734,
"loss": 1.7228,
"step": 368500
},
{
"epoch": 1.5241069105237723,
"grad_norm": 0.8362699151039124,
"learning_rate": 0.0004936649226956165,
"loss": 1.7204,
"step": 369000
},
{
"epoch": 1.5261720960393872,
"grad_norm": 0.7928584814071655,
"learning_rate": 0.0004929741499004596,
"loss": 1.7232,
"step": 369500
},
{
"epoch": 1.528237281555002,
"grad_norm": 0.8171131610870361,
"learning_rate": 0.0004922833771053028,
"loss": 1.721,
"step": 370000
},
{
"epoch": 1.530302467070617,
"grad_norm": 0.7682649493217468,
"learning_rate": 0.000491592604310146,
"loss": 1.7181,
"step": 370500
},
{
"epoch": 1.532367652586232,
"grad_norm": 0.7589514255523682,
"learning_rate": 0.0004909018315149891,
"loss": 1.7207,
"step": 371000
},
{
"epoch": 1.5344328381018468,
"grad_norm": 0.7927723526954651,
"learning_rate": 0.0004902110587198322,
"loss": 1.7172,
"step": 371500
},
{
"epoch": 1.5364980236174617,
"grad_norm": 0.7993720173835754,
"learning_rate": 0.0004895202859246754,
"loss": 1.7239,
"step": 372000
},
{
"epoch": 1.5385632091330765,
"grad_norm": 0.75545734167099,
"learning_rate": 0.0004888295131295186,
"loss": 1.7193,
"step": 372500
},
{
"epoch": 1.5406283946486914,
"grad_norm": 0.8410167694091797,
"learning_rate": 0.0004881387403343617,
"loss": 1.7212,
"step": 373000
},
{
"epoch": 1.5426935801643062,
"grad_norm": 0.7745389938354492,
"learning_rate": 0.00048744796753920485,
"loss": 1.7178,
"step": 373500
},
{
"epoch": 1.544758765679921,
"grad_norm": 1.1876429319381714,
"learning_rate": 0.00048675719474404797,
"loss": 1.7166,
"step": 374000
},
{
"epoch": 1.546823951195536,
"grad_norm": 0.8236234188079834,
"learning_rate": 0.0004860664219488911,
"loss": 1.7244,
"step": 374500
},
{
"epoch": 1.5488891367111508,
"grad_norm": 0.8301746845245361,
"learning_rate": 0.00048537564915373426,
"loss": 1.7154,
"step": 375000
},
{
"epoch": 1.5509543222267657,
"grad_norm": 0.8341511487960815,
"learning_rate": 0.0004846848763585774,
"loss": 1.7201,
"step": 375500
},
{
"epoch": 1.5530195077423805,
"grad_norm": 0.772774338722229,
"learning_rate": 0.00048399410356342055,
"loss": 1.7194,
"step": 376000
},
{
"epoch": 1.5550846932579954,
"grad_norm": 0.7931101322174072,
"learning_rate": 0.0004833033307682637,
"loss": 1.7218,
"step": 376500
},
{
"epoch": 1.5571498787736102,
"grad_norm": 0.7973618507385254,
"learning_rate": 0.00048261255797310684,
"loss": 1.7132,
"step": 377000
},
{
"epoch": 1.559215064289225,
"grad_norm": 0.7944709062576294,
"learning_rate": 0.00048192178517795,
"loss": 1.7152,
"step": 377500
},
{
"epoch": 1.56128024980484,
"grad_norm": 0.7873803377151489,
"learning_rate": 0.00048123101238279313,
"loss": 1.7136,
"step": 378000
},
{
"epoch": 1.5633454353204548,
"grad_norm": 0.8176526427268982,
"learning_rate": 0.00048054023958763625,
"loss": 1.7146,
"step": 378500
},
{
"epoch": 1.5654106208360696,
"grad_norm": 0.80870121717453,
"learning_rate": 0.0004798494667924794,
"loss": 1.7152,
"step": 379000
},
{
"epoch": 1.5674758063516845,
"grad_norm": 0.8075997829437256,
"learning_rate": 0.0004791586939973226,
"loss": 1.7156,
"step": 379500
},
{
"epoch": 1.5695409918672993,
"grad_norm": 0.7649165391921997,
"learning_rate": 0.0004784679212021657,
"loss": 1.7128,
"step": 380000
},
{
"epoch": 1.5716061773829142,
"grad_norm": 0.7735922932624817,
"learning_rate": 0.0004777771484070089,
"loss": 1.7127,
"step": 380500
},
{
"epoch": 1.573671362898529,
"grad_norm": 0.8073831796646118,
"learning_rate": 0.00047708637561185205,
"loss": 1.7157,
"step": 381000
},
{
"epoch": 1.575736548414144,
"grad_norm": 0.7841485738754272,
"learning_rate": 0.0004763956028166951,
"loss": 1.7164,
"step": 381500
},
{
"epoch": 1.5778017339297588,
"grad_norm": 0.7511780261993408,
"learning_rate": 0.0004757048300215383,
"loss": 1.7146,
"step": 382000
},
{
"epoch": 1.5798669194453738,
"grad_norm": 0.7717761993408203,
"learning_rate": 0.00047501405722638146,
"loss": 1.7141,
"step": 382500
},
{
"epoch": 1.5819321049609887,
"grad_norm": 0.8251765966415405,
"learning_rate": 0.0004743232844312246,
"loss": 1.7146,
"step": 383000
},
{
"epoch": 1.5839972904766035,
"grad_norm": 0.8129590749740601,
"learning_rate": 0.00047363251163606775,
"loss": 1.7151,
"step": 383500
},
{
"epoch": 1.5860624759922184,
"grad_norm": 0.7942067384719849,
"learning_rate": 0.0004729417388409109,
"loss": 1.7145,
"step": 384000
},
{
"epoch": 1.5881276615078332,
"grad_norm": 0.8091747760772705,
"learning_rate": 0.00047225096604575404,
"loss": 1.7102,
"step": 384500
},
{
"epoch": 1.590192847023448,
"grad_norm": 0.8157942295074463,
"learning_rate": 0.00047156019325059716,
"loss": 1.7139,
"step": 385000
},
{
"epoch": 1.592258032539063,
"grad_norm": 0.7791504859924316,
"learning_rate": 0.00047086942045544033,
"loss": 1.7138,
"step": 385500
},
{
"epoch": 1.5943232180546778,
"grad_norm": 0.8184142708778381,
"learning_rate": 0.00047017864766028345,
"loss": 1.7119,
"step": 386000
},
{
"epoch": 1.5963884035702929,
"grad_norm": 0.7216043472290039,
"learning_rate": 0.0004694878748651266,
"loss": 1.7062,
"step": 386500
},
{
"epoch": 1.5984535890859077,
"grad_norm": 0.7634962797164917,
"learning_rate": 0.0004687971020699698,
"loss": 1.7109,
"step": 387000
},
{
"epoch": 1.6005187746015226,
"grad_norm": 0.7682668566703796,
"learning_rate": 0.0004681063292748129,
"loss": 1.7175,
"step": 387500
},
{
"epoch": 1.6025839601171374,
"grad_norm": 0.7891648411750793,
"learning_rate": 0.0004674155564796561,
"loss": 1.7076,
"step": 388000
},
{
"epoch": 1.6046491456327523,
"grad_norm": 0.7735166549682617,
"learning_rate": 0.0004667247836844992,
"loss": 1.7122,
"step": 388500
},
{
"epoch": 1.6067143311483671,
"grad_norm": 0.7342345714569092,
"learning_rate": 0.0004660340108893423,
"loss": 1.7093,
"step": 389000
},
{
"epoch": 1.608779516663982,
"grad_norm": 0.7745596170425415,
"learning_rate": 0.0004653432380941855,
"loss": 1.7072,
"step": 389500
},
{
"epoch": 1.6108447021795969,
"grad_norm": 0.7656903266906738,
"learning_rate": 0.00046465246529902866,
"loss": 1.7114,
"step": 390000
},
{
"epoch": 1.6129098876952117,
"grad_norm": 0.807043731212616,
"learning_rate": 0.0004639616925038718,
"loss": 1.7085,
"step": 390500
},
{
"epoch": 1.6149750732108266,
"grad_norm": 0.7980780601501465,
"learning_rate": 0.00046327091970871495,
"loss": 1.7054,
"step": 391000
},
{
"epoch": 1.6170402587264414,
"grad_norm": 0.7772185802459717,
"learning_rate": 0.0004625801469135581,
"loss": 1.7077,
"step": 391500
},
{
"epoch": 1.6191054442420563,
"grad_norm": 0.7955535054206848,
"learning_rate": 0.0004618893741184012,
"loss": 1.7059,
"step": 392000
},
{
"epoch": 1.6211706297576711,
"grad_norm": 0.7842792868614197,
"learning_rate": 0.00046119860132324436,
"loss": 1.7032,
"step": 392500
},
{
"epoch": 1.623235815273286,
"grad_norm": 0.7722345590591431,
"learning_rate": 0.00046050782852808753,
"loss": 1.7076,
"step": 393000
},
{
"epoch": 1.6253010007889008,
"grad_norm": 0.7836925983428955,
"learning_rate": 0.00045981705573293065,
"loss": 1.701,
"step": 393500
},
{
"epoch": 1.6273661863045157,
"grad_norm": 0.8407610058784485,
"learning_rate": 0.0004591262829377738,
"loss": 1.7054,
"step": 394000
},
{
"epoch": 1.6294313718201305,
"grad_norm": 0.7842757701873779,
"learning_rate": 0.000458435510142617,
"loss": 1.7085,
"step": 394500
},
{
"epoch": 1.6314965573357454,
"grad_norm": 0.7749829292297363,
"learning_rate": 0.0004577447373474601,
"loss": 1.7082,
"step": 395000
},
{
"epoch": 1.6335617428513602,
"grad_norm": 0.7778738141059875,
"learning_rate": 0.0004570539645523032,
"loss": 1.7071,
"step": 395500
},
{
"epoch": 1.635626928366975,
"grad_norm": 0.7654650211334229,
"learning_rate": 0.0004563631917571464,
"loss": 1.7093,
"step": 396000
},
{
"epoch": 1.63769211388259,
"grad_norm": 0.7864561676979065,
"learning_rate": 0.0004556724189619895,
"loss": 1.7035,
"step": 396500
},
{
"epoch": 1.6397572993982048,
"grad_norm": 0.7672191262245178,
"learning_rate": 0.0004549816461668327,
"loss": 1.7052,
"step": 397000
},
{
"epoch": 1.6418224849138197,
"grad_norm": 0.7847920656204224,
"learning_rate": 0.00045429087337167586,
"loss": 1.7033,
"step": 397500
},
{
"epoch": 1.6438876704294345,
"grad_norm": 0.7824931144714355,
"learning_rate": 0.000453600100576519,
"loss": 1.7042,
"step": 398000
},
{
"epoch": 1.6459528559450496,
"grad_norm": 0.7992446422576904,
"learning_rate": 0.00045290932778136215,
"loss": 1.7051,
"step": 398500
},
{
"epoch": 1.6480180414606644,
"grad_norm": 0.7504148483276367,
"learning_rate": 0.00045221855498620527,
"loss": 1.7036,
"step": 399000
},
{
"epoch": 1.6500832269762793,
"grad_norm": 0.8227455019950867,
"learning_rate": 0.0004515277821910484,
"loss": 1.6998,
"step": 399500
},
{
"epoch": 1.6521484124918941,
"grad_norm": 0.7897786498069763,
"learning_rate": 0.00045083700939589156,
"loss": 1.7047,
"step": 400000
},
{
"epoch": 1.654213598007509,
"grad_norm": 0.7825984954833984,
"learning_rate": 0.00045014623660073473,
"loss": 1.7043,
"step": 400500
},
{
"epoch": 1.6562787835231239,
"grad_norm": 0.8071085214614868,
"learning_rate": 0.00044945546380557785,
"loss": 1.7035,
"step": 401000
},
{
"epoch": 1.6583439690387387,
"grad_norm": 0.9406007528305054,
"learning_rate": 0.000448764691010421,
"loss": 1.7011,
"step": 401500
},
{
"epoch": 1.6604091545543538,
"grad_norm": 0.7797788381576538,
"learning_rate": 0.0004480739182152642,
"loss": 1.7024,
"step": 402000
},
{
"epoch": 1.6624743400699686,
"grad_norm": 0.8257543444633484,
"learning_rate": 0.00044738314542010725,
"loss": 1.6958,
"step": 402500
},
{
"epoch": 1.6645395255855835,
"grad_norm": 0.8066025972366333,
"learning_rate": 0.0004466923726249504,
"loss": 1.6998,
"step": 403000
},
{
"epoch": 1.6666047111011983,
"grad_norm": 0.8041613698005676,
"learning_rate": 0.0004460015998297936,
"loss": 1.7053,
"step": 403500
},
{
"epoch": 1.6686698966168132,
"grad_norm": 0.8146698474884033,
"learning_rate": 0.0004453108270346367,
"loss": 1.6969,
"step": 404000
},
{
"epoch": 1.670735082132428,
"grad_norm": 0.7349113821983337,
"learning_rate": 0.0004446200542394799,
"loss": 1.6997,
"step": 404500
},
{
"epoch": 1.672800267648043,
"grad_norm": 0.7776924967765808,
"learning_rate": 0.00044392928144432306,
"loss": 1.6976,
"step": 405000
},
{
"epoch": 1.6748654531636578,
"grad_norm": 0.8118670582771301,
"learning_rate": 0.0004432385086491662,
"loss": 1.7039,
"step": 405500
},
{
"epoch": 1.6769306386792726,
"grad_norm": 0.7525516152381897,
"learning_rate": 0.0004425477358540093,
"loss": 1.7017,
"step": 406000
},
{
"epoch": 1.6789958241948875,
"grad_norm": 0.7723379135131836,
"learning_rate": 0.00044185696305885247,
"loss": 1.6997,
"step": 406500
},
{
"epoch": 1.6810610097105023,
"grad_norm": 0.7601300477981567,
"learning_rate": 0.00044116619026369564,
"loss": 1.6931,
"step": 407000
},
{
"epoch": 1.6831261952261172,
"grad_norm": 0.788893461227417,
"learning_rate": 0.00044047541746853876,
"loss": 1.6941,
"step": 407500
},
{
"epoch": 1.685191380741732,
"grad_norm": 0.8101310133934021,
"learning_rate": 0.00043978464467338193,
"loss": 1.697,
"step": 408000
},
{
"epoch": 1.6872565662573469,
"grad_norm": 0.7839348912239075,
"learning_rate": 0.0004390938718782251,
"loss": 1.7037,
"step": 408500
},
{
"epoch": 1.6893217517729617,
"grad_norm": 1.0003387928009033,
"learning_rate": 0.0004384030990830682,
"loss": 1.6995,
"step": 409000
},
{
"epoch": 1.6913869372885766,
"grad_norm": 0.7542647123336792,
"learning_rate": 0.00043771232628791134,
"loss": 1.6982,
"step": 409500
},
{
"epoch": 1.6934521228041914,
"grad_norm": 0.8054424524307251,
"learning_rate": 0.0004370215534927545,
"loss": 1.6971,
"step": 410000
},
{
"epoch": 1.6955173083198063,
"grad_norm": 0.7765061259269714,
"learning_rate": 0.0004363307806975976,
"loss": 1.6951,
"step": 410500
},
{
"epoch": 1.6975824938354211,
"grad_norm": 0.7774503231048584,
"learning_rate": 0.0004356400079024408,
"loss": 1.6947,
"step": 411000
},
{
"epoch": 1.699647679351036,
"grad_norm": 0.8000075817108154,
"learning_rate": 0.00043494923510728397,
"loss": 1.6937,
"step": 411500
},
{
"epoch": 1.7017128648666509,
"grad_norm": 0.8032427430152893,
"learning_rate": 0.0004342584623121271,
"loss": 1.6921,
"step": 412000
},
{
"epoch": 1.7037780503822657,
"grad_norm": 0.7988405227661133,
"learning_rate": 0.00043356768951697026,
"loss": 1.6968,
"step": 412500
},
{
"epoch": 1.7058432358978806,
"grad_norm": 0.7719324231147766,
"learning_rate": 0.0004328769167218134,
"loss": 1.6912,
"step": 413000
},
{
"epoch": 1.7079084214134954,
"grad_norm": 0.7580344080924988,
"learning_rate": 0.0004321861439266565,
"loss": 1.6951,
"step": 413500
},
{
"epoch": 1.7099736069291105,
"grad_norm": 0.8045200705528259,
"learning_rate": 0.00043149537113149967,
"loss": 1.6921,
"step": 414000
},
{
"epoch": 1.7120387924447253,
"grad_norm": 0.7698059678077698,
"learning_rate": 0.00043080459833634284,
"loss": 1.6929,
"step": 414500
},
{
"epoch": 1.7141039779603402,
"grad_norm": 0.8124533891677856,
"learning_rate": 0.00043011382554118596,
"loss": 1.6918,
"step": 415000
},
{
"epoch": 1.716169163475955,
"grad_norm": 0.7770412564277649,
"learning_rate": 0.00042942305274602913,
"loss": 1.6903,
"step": 415500
},
{
"epoch": 1.71823434899157,
"grad_norm": 0.7901027202606201,
"learning_rate": 0.0004287322799508723,
"loss": 1.6932,
"step": 416000
},
{
"epoch": 1.7202995345071848,
"grad_norm": 0.7586656212806702,
"learning_rate": 0.00042804150715571536,
"loss": 1.6932,
"step": 416500
},
{
"epoch": 1.7223647200227996,
"grad_norm": 0.7596163153648376,
"learning_rate": 0.00042735073436055853,
"loss": 1.6979,
"step": 417000
},
{
"epoch": 1.7244299055384145,
"grad_norm": 0.7645015716552734,
"learning_rate": 0.0004266599615654017,
"loss": 1.6929,
"step": 417500
},
{
"epoch": 1.7264950910540295,
"grad_norm": 0.8256881237030029,
"learning_rate": 0.0004259691887702448,
"loss": 1.691,
"step": 418000
},
{
"epoch": 1.7285602765696444,
"grad_norm": 0.78524249792099,
"learning_rate": 0.000425278415975088,
"loss": 1.69,
"step": 418500
},
{
"epoch": 1.7306254620852592,
"grad_norm": 0.814737856388092,
"learning_rate": 0.00042458764317993117,
"loss": 1.6926,
"step": 419000
},
{
"epoch": 1.732690647600874,
"grad_norm": 0.7561067342758179,
"learning_rate": 0.00042389687038477423,
"loss": 1.6928,
"step": 419500
},
{
"epoch": 1.734755833116489,
"grad_norm": 0.7771661281585693,
"learning_rate": 0.0004232060975896174,
"loss": 1.6863,
"step": 420000
},
{
"epoch": 1.7368210186321038,
"grad_norm": 0.7177093625068665,
"learning_rate": 0.0004225153247944606,
"loss": 1.6931,
"step": 420500
},
{
"epoch": 1.7388862041477187,
"grad_norm": 0.8142688870429993,
"learning_rate": 0.0004218245519993037,
"loss": 1.6895,
"step": 421000
},
{
"epoch": 1.7409513896633335,
"grad_norm": 0.8166112899780273,
"learning_rate": 0.00042113377920414687,
"loss": 1.69,
"step": 421500
},
{
"epoch": 1.7430165751789484,
"grad_norm": 0.7927871942520142,
"learning_rate": 0.00042044300640899004,
"loss": 1.69,
"step": 422000
},
{
"epoch": 1.7450817606945632,
"grad_norm": 0.8192989230155945,
"learning_rate": 0.00041975223361383316,
"loss": 1.6834,
"step": 422500
},
{
"epoch": 1.747146946210178,
"grad_norm": 0.825117290019989,
"learning_rate": 0.00041906146081867627,
"loss": 1.6875,
"step": 423000
},
{
"epoch": 1.749212131725793,
"grad_norm": 0.8357008695602417,
"learning_rate": 0.00041837068802351944,
"loss": 1.6869,
"step": 423500
},
{
"epoch": 1.7512773172414078,
"grad_norm": 0.8047915101051331,
"learning_rate": 0.00041767991522836256,
"loss": 1.6864,
"step": 424000
},
{
"epoch": 1.7533425027570226,
"grad_norm": 0.8068717122077942,
"learning_rate": 0.00041698914243320573,
"loss": 1.6871,
"step": 424500
},
{
"epoch": 1.7554076882726375,
"grad_norm": 0.7879107594490051,
"learning_rate": 0.0004162983696380489,
"loss": 1.6826,
"step": 425000
},
{
"epoch": 1.7574728737882523,
"grad_norm": 0.7748578190803528,
"learning_rate": 0.000415607596842892,
"loss": 1.6831,
"step": 425500
},
{
"epoch": 1.7595380593038672,
"grad_norm": 0.7206512093544006,
"learning_rate": 0.0004149168240477352,
"loss": 1.6884,
"step": 426000
},
{
"epoch": 1.761603244819482,
"grad_norm": 0.7805559039115906,
"learning_rate": 0.0004142260512525783,
"loss": 1.6863,
"step": 426500
},
{
"epoch": 1.763668430335097,
"grad_norm": 0.8146787285804749,
"learning_rate": 0.00041353527845742143,
"loss": 1.6838,
"step": 427000
},
{
"epoch": 1.7657336158507118,
"grad_norm": 0.7216916680335999,
"learning_rate": 0.0004128445056622646,
"loss": 1.6863,
"step": 427500
},
{
"epoch": 1.7677988013663266,
"grad_norm": 0.7865545153617859,
"learning_rate": 0.0004121537328671078,
"loss": 1.6838,
"step": 428000
},
{
"epoch": 1.7698639868819415,
"grad_norm": 0.7617883682250977,
"learning_rate": 0.0004114629600719509,
"loss": 1.684,
"step": 428500
},
{
"epoch": 1.7719291723975563,
"grad_norm": 0.8186792135238647,
"learning_rate": 0.00041077218727679407,
"loss": 1.6828,
"step": 429000
},
{
"epoch": 1.7739943579131714,
"grad_norm": 0.7898605465888977,
"learning_rate": 0.00041008141448163724,
"loss": 1.68,
"step": 429500
},
{
"epoch": 1.7760595434287862,
"grad_norm": 0.7490332126617432,
"learning_rate": 0.0004093906416864803,
"loss": 1.6822,
"step": 430000
},
{
"epoch": 1.778124728944401,
"grad_norm": 0.7616461515426636,
"learning_rate": 0.00040869986889132347,
"loss": 1.6866,
"step": 430500
},
{
"epoch": 1.780189914460016,
"grad_norm": 0.7681095004081726,
"learning_rate": 0.00040800909609616664,
"loss": 1.6811,
"step": 431000
},
{
"epoch": 1.7822550999756308,
"grad_norm": 0.7684192657470703,
"learning_rate": 0.00040731832330100976,
"loss": 1.6862,
"step": 431500
},
{
"epoch": 1.7843202854912457,
"grad_norm": 0.7826496362686157,
"learning_rate": 0.00040662755050585293,
"loss": 1.6859,
"step": 432000
},
{
"epoch": 1.7863854710068605,
"grad_norm": 0.7974809408187866,
"learning_rate": 0.0004059367777106961,
"loss": 1.6833,
"step": 432500
},
{
"epoch": 1.7884506565224754,
"grad_norm": 0.8294712901115417,
"learning_rate": 0.0004052460049155392,
"loss": 1.6832,
"step": 433000
},
{
"epoch": 1.7905158420380904,
"grad_norm": 0.8153785467147827,
"learning_rate": 0.00040455523212038234,
"loss": 1.6811,
"step": 433500
},
{
"epoch": 1.7925810275537053,
"grad_norm": 0.80795818567276,
"learning_rate": 0.0004038644593252255,
"loss": 1.6788,
"step": 434000
},
{
"epoch": 1.7946462130693202,
"grad_norm": 0.7648016214370728,
"learning_rate": 0.00040317368653006863,
"loss": 1.686,
"step": 434500
},
{
"epoch": 1.796711398584935,
"grad_norm": 0.7882753610610962,
"learning_rate": 0.0004024829137349118,
"loss": 1.6792,
"step": 435000
},
{
"epoch": 1.7987765841005499,
"grad_norm": 0.7577452659606934,
"learning_rate": 0.000401792140939755,
"loss": 1.6803,
"step": 435500
},
{
"epoch": 1.8008417696161647,
"grad_norm": 0.7712865471839905,
"learning_rate": 0.0004011013681445981,
"loss": 1.6813,
"step": 436000
},
{
"epoch": 1.8029069551317796,
"grad_norm": 0.7820202708244324,
"learning_rate": 0.00040041059534944126,
"loss": 1.6819,
"step": 436500
},
{
"epoch": 1.8049721406473944,
"grad_norm": 0.7566621899604797,
"learning_rate": 0.0003997198225542844,
"loss": 1.68,
"step": 437000
},
{
"epoch": 1.8070373261630093,
"grad_norm": 0.7587839365005493,
"learning_rate": 0.0003990290497591275,
"loss": 1.6833,
"step": 437500
},
{
"epoch": 1.8091025116786241,
"grad_norm": 0.798997700214386,
"learning_rate": 0.00039833827696397067,
"loss": 1.6797,
"step": 438000
},
{
"epoch": 1.811167697194239,
"grad_norm": 0.7913112044334412,
"learning_rate": 0.00039764750416881384,
"loss": 1.6792,
"step": 438500
},
{
"epoch": 1.8132328827098538,
"grad_norm": 0.7663547992706299,
"learning_rate": 0.00039695673137365696,
"loss": 1.6807,
"step": 439000
},
{
"epoch": 1.8152980682254687,
"grad_norm": 0.77425217628479,
"learning_rate": 0.00039626595857850013,
"loss": 1.6759,
"step": 439500
},
{
"epoch": 1.8173632537410835,
"grad_norm": 0.807633101940155,
"learning_rate": 0.0003955751857833433,
"loss": 1.6777,
"step": 440000
},
{
"epoch": 1.8194284392566984,
"grad_norm": 0.7748910188674927,
"learning_rate": 0.00039488441298818637,
"loss": 1.6794,
"step": 440500
},
{
"epoch": 1.8214936247723132,
"grad_norm": 0.8132478594779968,
"learning_rate": 0.00039419364019302954,
"loss": 1.6777,
"step": 441000
},
{
"epoch": 1.823558810287928,
"grad_norm": 0.7609587907791138,
"learning_rate": 0.0003935028673978727,
"loss": 1.6775,
"step": 441500
},
{
"epoch": 1.825623995803543,
"grad_norm": 0.8203696608543396,
"learning_rate": 0.00039281209460271583,
"loss": 1.675,
"step": 442000
},
{
"epoch": 1.8276891813191578,
"grad_norm": 0.7865495681762695,
"learning_rate": 0.000392121321807559,
"loss": 1.6783,
"step": 442500
},
{
"epoch": 1.8297543668347727,
"grad_norm": 0.7632693648338318,
"learning_rate": 0.0003914305490124022,
"loss": 1.6771,
"step": 443000
},
{
"epoch": 1.8318195523503875,
"grad_norm": 0.790891706943512,
"learning_rate": 0.0003907397762172453,
"loss": 1.6787,
"step": 443500
},
{
"epoch": 1.8338847378660024,
"grad_norm": 0.7918925881385803,
"learning_rate": 0.0003900490034220884,
"loss": 1.6749,
"step": 444000
},
{
"epoch": 1.8359499233816172,
"grad_norm": 0.8381515741348267,
"learning_rate": 0.0003893582306269316,
"loss": 1.6741,
"step": 444500
},
{
"epoch": 1.838015108897232,
"grad_norm": 0.8085419535636902,
"learning_rate": 0.0003886674578317747,
"loss": 1.6697,
"step": 445000
},
{
"epoch": 1.8400802944128472,
"grad_norm": 0.7606683969497681,
"learning_rate": 0.00038797668503661787,
"loss": 1.6745,
"step": 445500
},
{
"epoch": 1.842145479928462,
"grad_norm": 0.7622495889663696,
"learning_rate": 0.00038728591224146104,
"loss": 1.6722,
"step": 446000
},
{
"epoch": 1.8442106654440769,
"grad_norm": 0.8180463910102844,
"learning_rate": 0.00038659513944630416,
"loss": 1.6708,
"step": 446500
},
{
"epoch": 1.8462758509596917,
"grad_norm": 0.7783413529396057,
"learning_rate": 0.00038590436665114733,
"loss": 1.6741,
"step": 447000
},
{
"epoch": 1.8483410364753066,
"grad_norm": 0.7698727250099182,
"learning_rate": 0.00038521359385599045,
"loss": 1.6744,
"step": 447500
},
{
"epoch": 1.8504062219909214,
"grad_norm": 0.7889679670333862,
"learning_rate": 0.00038452282106083357,
"loss": 1.6744,
"step": 448000
},
{
"epoch": 1.8524714075065363,
"grad_norm": 0.8463781476020813,
"learning_rate": 0.00038383204826567674,
"loss": 1.6693,
"step": 448500
},
{
"epoch": 1.8545365930221511,
"grad_norm": 0.7730614542961121,
"learning_rate": 0.0003831412754705199,
"loss": 1.6748,
"step": 449000
},
{
"epoch": 1.8566017785377662,
"grad_norm": 0.7694717049598694,
"learning_rate": 0.00038245050267536303,
"loss": 1.6723,
"step": 449500
},
{
"epoch": 1.858666964053381,
"grad_norm": 0.7720078229904175,
"learning_rate": 0.0003817597298802062,
"loss": 1.6712,
"step": 450000
},
{
"epoch": 1.860732149568996,
"grad_norm": 0.7817273139953613,
"learning_rate": 0.0003810689570850494,
"loss": 1.6703,
"step": 450500
},
{
"epoch": 1.8627973350846108,
"grad_norm": 0.7825304269790649,
"learning_rate": 0.00038037818428989244,
"loss": 1.6688,
"step": 451000
},
{
"epoch": 1.8648625206002256,
"grad_norm": 0.7758463621139526,
"learning_rate": 0.0003796874114947356,
"loss": 1.6657,
"step": 451500
},
{
"epoch": 1.8669277061158405,
"grad_norm": 0.7757241129875183,
"learning_rate": 0.0003789966386995788,
"loss": 1.6734,
"step": 452000
},
{
"epoch": 1.8689928916314553,
"grad_norm": 0.8086944222450256,
"learning_rate": 0.0003783058659044219,
"loss": 1.669,
"step": 452500
},
{
"epoch": 1.8710580771470702,
"grad_norm": 0.7736507058143616,
"learning_rate": 0.00037761509310926507,
"loss": 1.6729,
"step": 453000
},
{
"epoch": 1.873123262662685,
"grad_norm": 0.7895172834396362,
"learning_rate": 0.00037692432031410824,
"loss": 1.6681,
"step": 453500
},
{
"epoch": 1.8751884481782999,
"grad_norm": 0.7610639929771423,
"learning_rate": 0.00037623354751895136,
"loss": 1.6624,
"step": 454000
},
{
"epoch": 1.8772536336939147,
"grad_norm": 0.7881196737289429,
"learning_rate": 0.0003755427747237945,
"loss": 1.6697,
"step": 454500
},
{
"epoch": 1.8793188192095296,
"grad_norm": 0.7839071154594421,
"learning_rate": 0.00037485200192863765,
"loss": 1.6713,
"step": 455000
},
{
"epoch": 1.8813840047251444,
"grad_norm": 0.7790060043334961,
"learning_rate": 0.00037416122913348077,
"loss": 1.6683,
"step": 455500
},
{
"epoch": 1.8834491902407593,
"grad_norm": 0.757612943649292,
"learning_rate": 0.00037347045633832394,
"loss": 1.6662,
"step": 456000
},
{
"epoch": 1.8855143757563742,
"grad_norm": 0.7868499755859375,
"learning_rate": 0.0003727796835431671,
"loss": 1.6666,
"step": 456500
},
{
"epoch": 1.887579561271989,
"grad_norm": 0.8040853142738342,
"learning_rate": 0.00037208891074801023,
"loss": 1.6637,
"step": 457000
},
{
"epoch": 1.8896447467876039,
"grad_norm": 0.7756462693214417,
"learning_rate": 0.0003713981379528534,
"loss": 1.6678,
"step": 457500
},
{
"epoch": 1.8917099323032187,
"grad_norm": 0.781300961971283,
"learning_rate": 0.0003707073651576965,
"loss": 1.6656,
"step": 458000
},
{
"epoch": 1.8937751178188336,
"grad_norm": 0.7810469269752502,
"learning_rate": 0.00037001659236253964,
"loss": 1.6617,
"step": 458500
},
{
"epoch": 1.8958403033344484,
"grad_norm": 0.7562840580940247,
"learning_rate": 0.0003693258195673828,
"loss": 1.6635,
"step": 459000
},
{
"epoch": 1.8979054888500633,
"grad_norm": 0.7803590893745422,
"learning_rate": 0.000368635046772226,
"loss": 1.6689,
"step": 459500
},
{
"epoch": 1.8999706743656781,
"grad_norm": 0.8209202885627747,
"learning_rate": 0.0003679442739770691,
"loss": 1.6632,
"step": 460000
},
{
"epoch": 1.902035859881293,
"grad_norm": 0.7608214020729065,
"learning_rate": 0.00036725350118191227,
"loss": 1.6607,
"step": 460500
},
{
"epoch": 1.904101045396908,
"grad_norm": 0.796277642250061,
"learning_rate": 0.00036656272838675544,
"loss": 1.6653,
"step": 461000
},
{
"epoch": 1.906166230912523,
"grad_norm": 0.796653687953949,
"learning_rate": 0.0003658719555915985,
"loss": 1.6601,
"step": 461500
},
{
"epoch": 1.9082314164281378,
"grad_norm": 0.7833842039108276,
"learning_rate": 0.0003651811827964417,
"loss": 1.662,
"step": 462000
},
{
"epoch": 1.9102966019437526,
"grad_norm": 0.7710606455802917,
"learning_rate": 0.00036449041000128485,
"loss": 1.6616,
"step": 462500
},
{
"epoch": 1.9123617874593675,
"grad_norm": 0.7609611749649048,
"learning_rate": 0.00036379963720612797,
"loss": 1.6612,
"step": 463000
},
{
"epoch": 1.9144269729749823,
"grad_norm": 0.7709171175956726,
"learning_rate": 0.00036310886441097114,
"loss": 1.6657,
"step": 463500
},
{
"epoch": 1.9164921584905972,
"grad_norm": 0.7778812646865845,
"learning_rate": 0.0003624180916158143,
"loss": 1.6663,
"step": 464000
},
{
"epoch": 1.918557344006212,
"grad_norm": 0.7948848605155945,
"learning_rate": 0.0003617273188206574,
"loss": 1.6607,
"step": 464500
},
{
"epoch": 1.920622529521827,
"grad_norm": 0.7880497574806213,
"learning_rate": 0.00036103654602550055,
"loss": 1.6615,
"step": 465000
},
{
"epoch": 1.922687715037442,
"grad_norm": 0.7933222055435181,
"learning_rate": 0.0003603457732303437,
"loss": 1.6622,
"step": 465500
},
{
"epoch": 1.9247529005530568,
"grad_norm": 0.7489884495735168,
"learning_rate": 0.00035965500043518684,
"loss": 1.6594,
"step": 466000
},
{
"epoch": 1.9268180860686717,
"grad_norm": 0.7909550666809082,
"learning_rate": 0.00035896422764003,
"loss": 1.6606,
"step": 466500
},
{
"epoch": 1.9288832715842865,
"grad_norm": 0.8264633417129517,
"learning_rate": 0.0003582734548448732,
"loss": 1.6586,
"step": 467000
},
{
"epoch": 1.9309484570999014,
"grad_norm": 0.8184587359428406,
"learning_rate": 0.00035758268204971635,
"loss": 1.6621,
"step": 467500
},
{
"epoch": 1.9330136426155162,
"grad_norm": 0.78268963098526,
"learning_rate": 0.0003568919092545594,
"loss": 1.6603,
"step": 468000
},
{
"epoch": 1.935078828131131,
"grad_norm": 0.7832273244857788,
"learning_rate": 0.0003562011364594026,
"loss": 1.661,
"step": 468500
},
{
"epoch": 1.937144013646746,
"grad_norm": 0.7547221183776855,
"learning_rate": 0.00035551036366424576,
"loss": 1.6607,
"step": 469000
},
{
"epoch": 1.9392091991623608,
"grad_norm": 0.8514434099197388,
"learning_rate": 0.0003548195908690889,
"loss": 1.6612,
"step": 469500
},
{
"epoch": 1.9412743846779756,
"grad_norm": 0.7895204424858093,
"learning_rate": 0.00035412881807393205,
"loss": 1.6577,
"step": 470000
},
{
"epoch": 1.9433395701935905,
"grad_norm": 0.8596895933151245,
"learning_rate": 0.0003534380452787752,
"loss": 1.6572,
"step": 470500
},
{
"epoch": 1.9454047557092053,
"grad_norm": 0.7693920731544495,
"learning_rate": 0.00035274727248361834,
"loss": 1.6587,
"step": 471000
},
{
"epoch": 1.9474699412248202,
"grad_norm": 0.8171895742416382,
"learning_rate": 0.00035205649968846146,
"loss": 1.6628,
"step": 471500
},
{
"epoch": 1.949535126740435,
"grad_norm": 0.7534123659133911,
"learning_rate": 0.00035136572689330463,
"loss": 1.6561,
"step": 472000
},
{
"epoch": 1.95160031225605,
"grad_norm": 0.7739940881729126,
"learning_rate": 0.00035067495409814775,
"loss": 1.6567,
"step": 472500
},
{
"epoch": 1.9536654977716648,
"grad_norm": 0.7825185656547546,
"learning_rate": 0.0003499841813029909,
"loss": 1.6544,
"step": 473000
},
{
"epoch": 1.9557306832872796,
"grad_norm": 0.7564761638641357,
"learning_rate": 0.0003492934085078341,
"loss": 1.6577,
"step": 473500
},
{
"epoch": 1.9577958688028945,
"grad_norm": 0.808772087097168,
"learning_rate": 0.0003486026357126772,
"loss": 1.655,
"step": 474000
},
{
"epoch": 1.9598610543185093,
"grad_norm": 0.8151499629020691,
"learning_rate": 0.0003479118629175204,
"loss": 1.6579,
"step": 474500
},
{
"epoch": 1.9619262398341242,
"grad_norm": 0.8655403852462769,
"learning_rate": 0.0003472210901223635,
"loss": 1.6532,
"step": 475000
},
{
"epoch": 1.963991425349739,
"grad_norm": 0.7786942720413208,
"learning_rate": 0.0003465303173272066,
"loss": 1.653,
"step": 475500
},
{
"epoch": 1.9660566108653539,
"grad_norm": 0.8005113005638123,
"learning_rate": 0.0003458395445320498,
"loss": 1.6538,
"step": 476000
},
{
"epoch": 1.968121796380969,
"grad_norm": 0.7797335386276245,
"learning_rate": 0.00034514877173689296,
"loss": 1.6567,
"step": 476500
},
{
"epoch": 1.9701869818965838,
"grad_norm": 0.7935357689857483,
"learning_rate": 0.0003444579989417361,
"loss": 1.6551,
"step": 477000
},
{
"epoch": 1.9722521674121987,
"grad_norm": 0.7659555077552795,
"learning_rate": 0.00034376722614657925,
"loss": 1.654,
"step": 477500
},
{
"epoch": 1.9743173529278135,
"grad_norm": 0.7984480857849121,
"learning_rate": 0.0003430764533514224,
"loss": 1.6513,
"step": 478000
},
{
"epoch": 1.9763825384434284,
"grad_norm": 0.7759101986885071,
"learning_rate": 0.0003423856805562655,
"loss": 1.6517,
"step": 478500
},
{
"epoch": 1.9784477239590432,
"grad_norm": 0.7922109961509705,
"learning_rate": 0.00034169490776110866,
"loss": 1.6551,
"step": 479000
},
{
"epoch": 1.980512909474658,
"grad_norm": 0.7864669561386108,
"learning_rate": 0.00034100413496595183,
"loss": 1.6521,
"step": 479500
},
{
"epoch": 1.982578094990273,
"grad_norm": 0.7987329959869385,
"learning_rate": 0.00034031336217079495,
"loss": 1.6531,
"step": 480000
},
{
"epoch": 1.984643280505888,
"grad_norm": 0.7777888774871826,
"learning_rate": 0.0003396225893756381,
"loss": 1.6509,
"step": 480500
},
{
"epoch": 1.9867084660215029,
"grad_norm": 0.7795775532722473,
"learning_rate": 0.0003389318165804813,
"loss": 1.6518,
"step": 481000
},
{
"epoch": 1.9887736515371177,
"grad_norm": 0.7711332440376282,
"learning_rate": 0.0003382410437853244,
"loss": 1.6519,
"step": 481500
},
{
"epoch": 1.9908388370527326,
"grad_norm": 0.8026793003082275,
"learning_rate": 0.0003375502709901675,
"loss": 1.6509,
"step": 482000
},
{
"epoch": 1.9929040225683474,
"grad_norm": 0.7959824204444885,
"learning_rate": 0.0003368594981950107,
"loss": 1.6511,
"step": 482500
},
{
"epoch": 1.9949692080839623,
"grad_norm": 0.7960503697395325,
"learning_rate": 0.0003361687253998538,
"loss": 1.6534,
"step": 483000
},
{
"epoch": 1.9970343935995771,
"grad_norm": 0.8475084900856018,
"learning_rate": 0.000335477952604697,
"loss": 1.6517,
"step": 483500
},
{
"epoch": 1.999099579115192,
"grad_norm": 0.7885191440582275,
"learning_rate": 0.00033478717980954016,
"loss": 1.6531,
"step": 484000
},
{
"epoch": 2.001164764630807,
"grad_norm": 0.7712221145629883,
"learning_rate": 0.0003340964070143833,
"loss": 1.6502,
"step": 484500
},
{
"epoch": 2.0032299501464217,
"grad_norm": 0.7648369669914246,
"learning_rate": 0.00033340563421922645,
"loss": 1.6441,
"step": 485000
},
{
"epoch": 2.0052951356620365,
"grad_norm": 0.8174281120300293,
"learning_rate": 0.00033271486142406957,
"loss": 1.6481,
"step": 485500
},
{
"epoch": 2.0073603211776514,
"grad_norm": 0.7810222506523132,
"learning_rate": 0.0003320240886289127,
"loss": 1.6484,
"step": 486000
},
{
"epoch": 2.0094255066932663,
"grad_norm": 0.8257454633712769,
"learning_rate": 0.00033133331583375586,
"loss": 1.6465,
"step": 486500
},
{
"epoch": 2.011490692208881,
"grad_norm": 0.7819936871528625,
"learning_rate": 0.00033064254303859903,
"loss": 1.6459,
"step": 487000
},
{
"epoch": 2.013555877724496,
"grad_norm": 0.7899196743965149,
"learning_rate": 0.00032995177024344215,
"loss": 1.6459,
"step": 487500
},
{
"epoch": 2.015621063240111,
"grad_norm": 0.8132250905036926,
"learning_rate": 0.0003292609974482853,
"loss": 1.6488,
"step": 488000
},
{
"epoch": 2.0176862487557257,
"grad_norm": 0.803816020488739,
"learning_rate": 0.0003285702246531285,
"loss": 1.6426,
"step": 488500
},
{
"epoch": 2.0197514342713405,
"grad_norm": 0.7602670788764954,
"learning_rate": 0.00032787945185797155,
"loss": 1.6462,
"step": 489000
},
{
"epoch": 2.0218166197869554,
"grad_norm": 0.7873088121414185,
"learning_rate": 0.0003271886790628147,
"loss": 1.6463,
"step": 489500
},
{
"epoch": 2.0238818053025702,
"grad_norm": 0.81231290102005,
"learning_rate": 0.0003264979062676579,
"loss": 1.6477,
"step": 490000
},
{
"epoch": 2.025946990818185,
"grad_norm": 0.8037064671516418,
"learning_rate": 0.000325807133472501,
"loss": 1.6454,
"step": 490500
},
{
"epoch": 2.0280121763338,
"grad_norm": 0.8113204836845398,
"learning_rate": 0.0003251163606773442,
"loss": 1.6449,
"step": 491000
},
{
"epoch": 2.030077361849415,
"grad_norm": 0.7967438101768494,
"learning_rate": 0.00032442558788218736,
"loss": 1.6413,
"step": 491500
},
{
"epoch": 2.0321425473650296,
"grad_norm": 0.7982317805290222,
"learning_rate": 0.0003237348150870305,
"loss": 1.6461,
"step": 492000
},
{
"epoch": 2.0342077328806445,
"grad_norm": 0.786389172077179,
"learning_rate": 0.0003230440422918736,
"loss": 1.6492,
"step": 492500
},
{
"epoch": 2.0362729183962593,
"grad_norm": 0.8528838157653809,
"learning_rate": 0.00032235326949671677,
"loss": 1.6433,
"step": 493000
},
{
"epoch": 2.038338103911874,
"grad_norm": 0.7775473594665527,
"learning_rate": 0.0003216624967015599,
"loss": 1.6445,
"step": 493500
},
{
"epoch": 2.040403289427489,
"grad_norm": 0.7589669227600098,
"learning_rate": 0.00032097172390640306,
"loss": 1.6424,
"step": 494000
},
{
"epoch": 2.0424684749431044,
"grad_norm": 0.7403915524482727,
"learning_rate": 0.00032028095111124623,
"loss": 1.6414,
"step": 494500
},
{
"epoch": 2.044533660458719,
"grad_norm": 0.7815344333648682,
"learning_rate": 0.00031959017831608935,
"loss": 1.6398,
"step": 495000
},
{
"epoch": 2.046598845974334,
"grad_norm": 0.7826516628265381,
"learning_rate": 0.0003188994055209325,
"loss": 1.6404,
"step": 495500
},
{
"epoch": 2.048664031489949,
"grad_norm": 0.8382503986358643,
"learning_rate": 0.00031820863272577564,
"loss": 1.6477,
"step": 496000
},
{
"epoch": 2.0507292170055638,
"grad_norm": 0.8345251679420471,
"learning_rate": 0.00031751785993061875,
"loss": 1.6395,
"step": 496500
},
{
"epoch": 2.0527944025211786,
"grad_norm": 0.7702645659446716,
"learning_rate": 0.0003168270871354619,
"loss": 1.6393,
"step": 497000
},
{
"epoch": 2.0548595880367935,
"grad_norm": 0.7861506938934326,
"learning_rate": 0.0003161363143403051,
"loss": 1.6431,
"step": 497500
},
{
"epoch": 2.0569247735524083,
"grad_norm": 0.8483462929725647,
"learning_rate": 0.0003154455415451482,
"loss": 1.638,
"step": 498000
},
{
"epoch": 2.058989959068023,
"grad_norm": 0.7427666783332825,
"learning_rate": 0.0003147547687499914,
"loss": 1.6398,
"step": 498500
},
{
"epoch": 2.061055144583638,
"grad_norm": 0.8200947046279907,
"learning_rate": 0.00031406399595483456,
"loss": 1.6442,
"step": 499000
},
{
"epoch": 2.063120330099253,
"grad_norm": 0.7826699018478394,
"learning_rate": 0.0003133732231596776,
"loss": 1.6373,
"step": 499500
},
{
"epoch": 2.0651855156148677,
"grad_norm": 0.8340067267417908,
"learning_rate": 0.0003126824503645208,
"loss": 1.6423,
"step": 500000
},
{
"epoch": 2.0672507011304826,
"grad_norm": 0.8408244252204895,
"learning_rate": 0.00031199167756936397,
"loss": 1.6385,
"step": 500500
},
{
"epoch": 2.0693158866460974,
"grad_norm": 0.7903205752372742,
"learning_rate": 0.0003113009047742071,
"loss": 1.64,
"step": 501000
},
{
"epoch": 2.0713810721617123,
"grad_norm": 0.8002933859825134,
"learning_rate": 0.00031061013197905026,
"loss": 1.6406,
"step": 501500
},
{
"epoch": 2.073446257677327,
"grad_norm": 0.7864850759506226,
"learning_rate": 0.00030991935918389343,
"loss": 1.6357,
"step": 502000
},
{
"epoch": 2.075511443192942,
"grad_norm": 0.8161391615867615,
"learning_rate": 0.00030922858638873655,
"loss": 1.6361,
"step": 502500
},
{
"epoch": 2.077576628708557,
"grad_norm": 0.8277705311775208,
"learning_rate": 0.00030853781359357966,
"loss": 1.6375,
"step": 503000
},
{
"epoch": 2.0796418142241717,
"grad_norm": 0.7988829016685486,
"learning_rate": 0.00030784704079842284,
"loss": 1.6327,
"step": 503500
},
{
"epoch": 2.0817069997397866,
"grad_norm": 0.7771642208099365,
"learning_rate": 0.00030715626800326595,
"loss": 1.6331,
"step": 504000
},
{
"epoch": 2.0837721852554014,
"grad_norm": 0.7471011281013489,
"learning_rate": 0.0003064654952081091,
"loss": 1.6352,
"step": 504500
},
{
"epoch": 2.0858373707710163,
"grad_norm": 0.7738475203514099,
"learning_rate": 0.0003057747224129523,
"loss": 1.6401,
"step": 505000
},
{
"epoch": 2.087902556286631,
"grad_norm": 0.7593071460723877,
"learning_rate": 0.0003050839496177954,
"loss": 1.6394,
"step": 505500
},
{
"epoch": 2.089967741802246,
"grad_norm": 0.7778981328010559,
"learning_rate": 0.0003043931768226386,
"loss": 1.6379,
"step": 506000
},
{
"epoch": 2.092032927317861,
"grad_norm": 0.7668618559837341,
"learning_rate": 0.0003037024040274817,
"loss": 1.6374,
"step": 506500
},
{
"epoch": 2.0940981128334757,
"grad_norm": 0.7854458689689636,
"learning_rate": 0.0003030116312323248,
"loss": 1.6314,
"step": 507000
},
{
"epoch": 2.0961632983490905,
"grad_norm": 0.7660508751869202,
"learning_rate": 0.000302320858437168,
"loss": 1.6326,
"step": 507500
},
{
"epoch": 2.0982284838647054,
"grad_norm": 0.759593665599823,
"learning_rate": 0.00030163008564201117,
"loss": 1.6352,
"step": 508000
},
{
"epoch": 2.1002936693803202,
"grad_norm": 0.7907975912094116,
"learning_rate": 0.0003009393128468543,
"loss": 1.6301,
"step": 508500
},
{
"epoch": 2.102358854895935,
"grad_norm": 0.8606127500534058,
"learning_rate": 0.00030024854005169746,
"loss": 1.6329,
"step": 509000
},
{
"epoch": 2.10442404041155,
"grad_norm": 0.788470447063446,
"learning_rate": 0.00029955776725654063,
"loss": 1.6336,
"step": 509500
},
{
"epoch": 2.106489225927165,
"grad_norm": 0.7975521087646484,
"learning_rate": 0.0002988669944613837,
"loss": 1.6354,
"step": 510000
},
{
"epoch": 2.10855441144278,
"grad_norm": 0.8134068250656128,
"learning_rate": 0.00029817622166622686,
"loss": 1.6354,
"step": 510500
},
{
"epoch": 2.110619596958395,
"grad_norm": 0.8084931373596191,
"learning_rate": 0.00029748544887107004,
"loss": 1.6348,
"step": 511000
},
{
"epoch": 2.11268478247401,
"grad_norm": 0.8037887811660767,
"learning_rate": 0.00029679467607591315,
"loss": 1.6315,
"step": 511500
},
{
"epoch": 2.1147499679896247,
"grad_norm": 0.7990454435348511,
"learning_rate": 0.0002961039032807563,
"loss": 1.6297,
"step": 512000
},
{
"epoch": 2.1168151535052395,
"grad_norm": 0.7971472144126892,
"learning_rate": 0.0002954131304855995,
"loss": 1.6312,
"step": 512500
},
{
"epoch": 2.1188803390208544,
"grad_norm": 0.8105595707893372,
"learning_rate": 0.0002947223576904426,
"loss": 1.6291,
"step": 513000
},
{
"epoch": 2.1209455245364692,
"grad_norm": 0.8046666979789734,
"learning_rate": 0.00029403158489528573,
"loss": 1.6323,
"step": 513500
},
{
"epoch": 2.123010710052084,
"grad_norm": 0.8228232860565186,
"learning_rate": 0.0002933408121001289,
"loss": 1.6309,
"step": 514000
},
{
"epoch": 2.125075895567699,
"grad_norm": 0.7555162906646729,
"learning_rate": 0.000292650039304972,
"loss": 1.6305,
"step": 514500
},
{
"epoch": 2.127141081083314,
"grad_norm": 0.7698606848716736,
"learning_rate": 0.0002919592665098152,
"loss": 1.6326,
"step": 515000
},
{
"epoch": 2.1292062665989286,
"grad_norm": 0.7718132138252258,
"learning_rate": 0.00029126849371465837,
"loss": 1.6291,
"step": 515500
},
{
"epoch": 2.1312714521145435,
"grad_norm": 0.7855656147003174,
"learning_rate": 0.0002905777209195015,
"loss": 1.6283,
"step": 516000
},
{
"epoch": 2.1333366376301584,
"grad_norm": 0.8064797520637512,
"learning_rate": 0.00028988694812434466,
"loss": 1.628,
"step": 516500
},
{
"epoch": 2.135401823145773,
"grad_norm": 0.7986974716186523,
"learning_rate": 0.0002891961753291878,
"loss": 1.6288,
"step": 517000
},
{
"epoch": 2.137467008661388,
"grad_norm": 0.8309503197669983,
"learning_rate": 0.0002885054025340309,
"loss": 1.6316,
"step": 517500
},
{
"epoch": 2.139532194177003,
"grad_norm": 0.8169652819633484,
"learning_rate": 0.00028781462973887406,
"loss": 1.6266,
"step": 518000
},
{
"epoch": 2.1415973796926178,
"grad_norm": 0.7754685282707214,
"learning_rate": 0.00028712385694371724,
"loss": 1.6307,
"step": 518500
},
{
"epoch": 2.1436625652082326,
"grad_norm": 0.7740616798400879,
"learning_rate": 0.00028643308414856035,
"loss": 1.6287,
"step": 519000
},
{
"epoch": 2.1457277507238475,
"grad_norm": 0.7874515056610107,
"learning_rate": 0.0002857423113534035,
"loss": 1.6254,
"step": 519500
},
{
"epoch": 2.1477929362394623,
"grad_norm": 0.8042634725570679,
"learning_rate": 0.0002850515385582467,
"loss": 1.6248,
"step": 520000
},
{
"epoch": 2.149858121755077,
"grad_norm": 0.8339025974273682,
"learning_rate": 0.00028436076576308976,
"loss": 1.6293,
"step": 520500
},
{
"epoch": 2.151923307270692,
"grad_norm": 0.822348415851593,
"learning_rate": 0.00028366999296793293,
"loss": 1.6297,
"step": 521000
},
{
"epoch": 2.153988492786307,
"grad_norm": 0.7726020812988281,
"learning_rate": 0.0002829792201727761,
"loss": 1.6291,
"step": 521500
},
{
"epoch": 2.1560536783019217,
"grad_norm": 0.7853801846504211,
"learning_rate": 0.0002822884473776192,
"loss": 1.6255,
"step": 522000
},
{
"epoch": 2.1581188638175366,
"grad_norm": 0.7884477376937866,
"learning_rate": 0.0002815976745824624,
"loss": 1.627,
"step": 522500
},
{
"epoch": 2.1601840493331514,
"grad_norm": 0.8253931999206543,
"learning_rate": 0.00028090690178730557,
"loss": 1.6259,
"step": 523000
},
{
"epoch": 2.1622492348487663,
"grad_norm": 0.7904614210128784,
"learning_rate": 0.00028021612899214863,
"loss": 1.6269,
"step": 523500
},
{
"epoch": 2.164314420364381,
"grad_norm": 0.7632104158401489,
"learning_rate": 0.0002795253561969918,
"loss": 1.6276,
"step": 524000
},
{
"epoch": 2.166379605879996,
"grad_norm": 0.8433115482330322,
"learning_rate": 0.000278834583401835,
"loss": 1.624,
"step": 524500
},
{
"epoch": 2.168444791395611,
"grad_norm": 0.8692212104797363,
"learning_rate": 0.0002781438106066781,
"loss": 1.6217,
"step": 525000
},
{
"epoch": 2.1705099769112257,
"grad_norm": 0.7796012759208679,
"learning_rate": 0.00027745303781152126,
"loss": 1.622,
"step": 525500
},
{
"epoch": 2.172575162426841,
"grad_norm": 0.8118318319320679,
"learning_rate": 0.00027676226501636443,
"loss": 1.6234,
"step": 526000
},
{
"epoch": 2.174640347942456,
"grad_norm": 0.7940993309020996,
"learning_rate": 0.00027607149222120755,
"loss": 1.626,
"step": 526500
},
{
"epoch": 2.1767055334580707,
"grad_norm": 0.797366738319397,
"learning_rate": 0.00027538071942605067,
"loss": 1.6189,
"step": 527000
},
{
"epoch": 2.1787707189736856,
"grad_norm": 0.7807763814926147,
"learning_rate": 0.00027468994663089384,
"loss": 1.6204,
"step": 527500
},
{
"epoch": 2.1808359044893004,
"grad_norm": 0.8015199303627014,
"learning_rate": 0.000273999173835737,
"loss": 1.6214,
"step": 528000
},
{
"epoch": 2.1829010900049153,
"grad_norm": 0.8279714584350586,
"learning_rate": 0.00027330840104058013,
"loss": 1.6238,
"step": 528500
},
{
"epoch": 2.18496627552053,
"grad_norm": 0.7654675245285034,
"learning_rate": 0.0002726176282454233,
"loss": 1.6285,
"step": 529000
},
{
"epoch": 2.187031461036145,
"grad_norm": 0.7837437391281128,
"learning_rate": 0.0002719268554502665,
"loss": 1.6214,
"step": 529500
},
{
"epoch": 2.18909664655176,
"grad_norm": 1.069981336593628,
"learning_rate": 0.0002712360826551096,
"loss": 1.6179,
"step": 530000
},
{
"epoch": 2.1911618320673747,
"grad_norm": 0.7750839591026306,
"learning_rate": 0.0002705453098599527,
"loss": 1.6207,
"step": 530500
},
{
"epoch": 2.1932270175829895,
"grad_norm": 0.7411586046218872,
"learning_rate": 0.0002698545370647959,
"loss": 1.6213,
"step": 531000
},
{
"epoch": 2.1952922030986044,
"grad_norm": 0.8239914774894714,
"learning_rate": 0.000269163764269639,
"loss": 1.6162,
"step": 531500
},
{
"epoch": 2.1973573886142193,
"grad_norm": 0.7895837426185608,
"learning_rate": 0.0002684729914744822,
"loss": 1.6175,
"step": 532000
},
{
"epoch": 2.199422574129834,
"grad_norm": 0.7678940892219543,
"learning_rate": 0.00026778221867932534,
"loss": 1.6182,
"step": 532500
},
{
"epoch": 2.201487759645449,
"grad_norm": 0.7663738131523132,
"learning_rate": 0.00026709144588416846,
"loss": 1.6161,
"step": 533000
},
{
"epoch": 2.203552945161064,
"grad_norm": 0.8070668578147888,
"learning_rate": 0.00026640067308901163,
"loss": 1.6155,
"step": 533500
},
{
"epoch": 2.2056181306766787,
"grad_norm": 0.8500379323959351,
"learning_rate": 0.00026570990029385475,
"loss": 1.6189,
"step": 534000
},
{
"epoch": 2.2076833161922935,
"grad_norm": 0.8292637467384338,
"learning_rate": 0.00026501912749869787,
"loss": 1.6153,
"step": 534500
},
{
"epoch": 2.2097485017079084,
"grad_norm": 0.7907617688179016,
"learning_rate": 0.00026432835470354104,
"loss": 1.6174,
"step": 535000
},
{
"epoch": 2.2118136872235232,
"grad_norm": 0.7643933892250061,
"learning_rate": 0.0002636375819083842,
"loss": 1.6175,
"step": 535500
},
{
"epoch": 2.213878872739138,
"grad_norm": 0.7963258624076843,
"learning_rate": 0.00026294680911322733,
"loss": 1.6184,
"step": 536000
},
{
"epoch": 2.215944058254753,
"grad_norm": 0.7595391273498535,
"learning_rate": 0.0002622560363180705,
"loss": 1.6167,
"step": 536500
},
{
"epoch": 2.218009243770368,
"grad_norm": 0.8099820613861084,
"learning_rate": 0.0002615652635229137,
"loss": 1.6172,
"step": 537000
},
{
"epoch": 2.2200744292859826,
"grad_norm": 0.82416170835495,
"learning_rate": 0.00026087449072775674,
"loss": 1.6141,
"step": 537500
},
{
"epoch": 2.2221396148015975,
"grad_norm": 0.8243468999862671,
"learning_rate": 0.0002601837179325999,
"loss": 1.616,
"step": 538000
},
{
"epoch": 2.2242048003172123,
"grad_norm": 0.8235235214233398,
"learning_rate": 0.0002594929451374431,
"loss": 1.6142,
"step": 538500
},
{
"epoch": 2.226269985832827,
"grad_norm": 0.8147215843200684,
"learning_rate": 0.0002588021723422862,
"loss": 1.6113,
"step": 539000
},
{
"epoch": 2.228335171348442,
"grad_norm": 0.8038352131843567,
"learning_rate": 0.00025811139954712937,
"loss": 1.6198,
"step": 539500
},
{
"epoch": 2.230400356864057,
"grad_norm": 0.7971067428588867,
"learning_rate": 0.00025742062675197254,
"loss": 1.6171,
"step": 540000
},
{
"epoch": 2.2324655423796718,
"grad_norm": 0.8829773664474487,
"learning_rate": 0.00025672985395681566,
"loss": 1.6124,
"step": 540500
},
{
"epoch": 2.2345307278952866,
"grad_norm": 0.8199840784072876,
"learning_rate": 0.0002560390811616588,
"loss": 1.6113,
"step": 541000
},
{
"epoch": 2.236595913410902,
"grad_norm": 0.8040071725845337,
"learning_rate": 0.00025534830836650195,
"loss": 1.617,
"step": 541500
},
{
"epoch": 2.2386610989265168,
"grad_norm": 0.7963501811027527,
"learning_rate": 0.00025465753557134507,
"loss": 1.6092,
"step": 542000
},
{
"epoch": 2.2407262844421316,
"grad_norm": 0.8022527694702148,
"learning_rate": 0.00025396676277618824,
"loss": 1.6121,
"step": 542500
},
{
"epoch": 2.2427914699577465,
"grad_norm": 0.7954930663108826,
"learning_rate": 0.0002532759899810314,
"loss": 1.6151,
"step": 543000
},
{
"epoch": 2.2448566554733613,
"grad_norm": 0.7801050543785095,
"learning_rate": 0.00025258521718587453,
"loss": 1.6093,
"step": 543500
},
{
"epoch": 2.246921840988976,
"grad_norm": 0.7806600332260132,
"learning_rate": 0.0002518944443907177,
"loss": 1.6089,
"step": 544000
},
{
"epoch": 2.248987026504591,
"grad_norm": 0.7561779022216797,
"learning_rate": 0.0002512036715955608,
"loss": 1.6132,
"step": 544500
},
{
"epoch": 2.251052212020206,
"grad_norm": 0.8682865500450134,
"learning_rate": 0.00025051289880040394,
"loss": 1.6124,
"step": 545000
},
{
"epoch": 2.2531173975358207,
"grad_norm": 0.7335362434387207,
"learning_rate": 0.0002498221260052471,
"loss": 1.6115,
"step": 545500
},
{
"epoch": 2.2551825830514356,
"grad_norm": 0.8360188603401184,
"learning_rate": 0.0002491313532100903,
"loss": 1.6157,
"step": 546000
},
{
"epoch": 2.2572477685670504,
"grad_norm": 0.8048787713050842,
"learning_rate": 0.0002484405804149334,
"loss": 1.6138,
"step": 546500
},
{
"epoch": 2.2593129540826653,
"grad_norm": 0.7759965658187866,
"learning_rate": 0.00024774980761977657,
"loss": 1.6112,
"step": 547000
},
{
"epoch": 2.26137813959828,
"grad_norm": 0.8284432888031006,
"learning_rate": 0.0002470590348246197,
"loss": 1.6077,
"step": 547500
},
{
"epoch": 2.263443325113895,
"grad_norm": 0.8492142558097839,
"learning_rate": 0.00024636826202946286,
"loss": 1.6059,
"step": 548000
},
{
"epoch": 2.26550851062951,
"grad_norm": 0.921442449092865,
"learning_rate": 0.000245677489234306,
"loss": 1.6078,
"step": 548500
},
{
"epoch": 2.2675736961451247,
"grad_norm": 0.7907894253730774,
"learning_rate": 0.00024498671643914915,
"loss": 1.6118,
"step": 549000
},
{
"epoch": 2.2696388816607396,
"grad_norm": 0.7716451287269592,
"learning_rate": 0.0002442959436439923,
"loss": 1.6066,
"step": 549500
},
{
"epoch": 2.2717040671763544,
"grad_norm": 0.757423460483551,
"learning_rate": 0.00024360517084883544,
"loss": 1.6098,
"step": 550000
},
{
"epoch": 2.2737692526919693,
"grad_norm": 0.8127204179763794,
"learning_rate": 0.00024291439805367856,
"loss": 1.6089,
"step": 550500
},
{
"epoch": 2.275834438207584,
"grad_norm": 0.7709484100341797,
"learning_rate": 0.00024222362525852173,
"loss": 1.6081,
"step": 551000
},
{
"epoch": 2.277899623723199,
"grad_norm": 0.7922874093055725,
"learning_rate": 0.00024153285246336488,
"loss": 1.6064,
"step": 551500
},
{
"epoch": 2.279964809238814,
"grad_norm": 0.789162814617157,
"learning_rate": 0.00024084207966820802,
"loss": 1.6052,
"step": 552000
},
{
"epoch": 2.2820299947544287,
"grad_norm": 0.8289847373962402,
"learning_rate": 0.00024015130687305117,
"loss": 1.604,
"step": 552500
},
{
"epoch": 2.2840951802700435,
"grad_norm": 0.8505263924598694,
"learning_rate": 0.0002394605340778943,
"loss": 1.6046,
"step": 553000
},
{
"epoch": 2.2861603657856584,
"grad_norm": 0.7633844614028931,
"learning_rate": 0.00023876976128273746,
"loss": 1.6098,
"step": 553500
},
{
"epoch": 2.2882255513012733,
"grad_norm": 0.775978147983551,
"learning_rate": 0.0002380789884875806,
"loss": 1.606,
"step": 554000
},
{
"epoch": 2.290290736816888,
"grad_norm": 0.8002934455871582,
"learning_rate": 0.00023738821569242374,
"loss": 1.5989,
"step": 554500
},
{
"epoch": 2.292355922332503,
"grad_norm": 0.8203332424163818,
"learning_rate": 0.0002366974428972669,
"loss": 1.6076,
"step": 555000
},
{
"epoch": 2.294421107848118,
"grad_norm": 0.7718694806098938,
"learning_rate": 0.00023600667010211006,
"loss": 1.6023,
"step": 555500
},
{
"epoch": 2.296486293363733,
"grad_norm": 0.8252015709877014,
"learning_rate": 0.00023531589730695318,
"loss": 1.6046,
"step": 556000
},
{
"epoch": 2.2985514788793475,
"grad_norm": 0.868835985660553,
"learning_rate": 0.00023462512451179632,
"loss": 1.6013,
"step": 556500
},
{
"epoch": 2.300616664394963,
"grad_norm": 0.8472076058387756,
"learning_rate": 0.0002339343517166395,
"loss": 1.6006,
"step": 557000
},
{
"epoch": 2.3026818499105772,
"grad_norm": 0.7968847751617432,
"learning_rate": 0.00023324357892148261,
"loss": 1.6055,
"step": 557500
},
{
"epoch": 2.3047470354261925,
"grad_norm": 0.8021098375320435,
"learning_rate": 0.00023255280612632576,
"loss": 1.6016,
"step": 558000
},
{
"epoch": 2.3068122209418074,
"grad_norm": 0.852824866771698,
"learning_rate": 0.00023186203333116893,
"loss": 1.6028,
"step": 558500
},
{
"epoch": 2.3088774064574222,
"grad_norm": 0.8099557161331177,
"learning_rate": 0.00023117126053601208,
"loss": 1.6022,
"step": 559000
},
{
"epoch": 2.310942591973037,
"grad_norm": 0.7851099371910095,
"learning_rate": 0.0002304804877408552,
"loss": 1.6012,
"step": 559500
},
{
"epoch": 2.313007777488652,
"grad_norm": 0.7841119170188904,
"learning_rate": 0.00022978971494569837,
"loss": 1.6055,
"step": 560000
},
{
"epoch": 2.315072963004267,
"grad_norm": 0.7725875973701477,
"learning_rate": 0.0002290989421505415,
"loss": 1.6012,
"step": 560500
},
{
"epoch": 2.3171381485198816,
"grad_norm": 0.8066521286964417,
"learning_rate": 0.00022840816935538463,
"loss": 1.6021,
"step": 561000
},
{
"epoch": 2.3192033340354965,
"grad_norm": 0.804887056350708,
"learning_rate": 0.0002277173965602278,
"loss": 1.6006,
"step": 561500
},
{
"epoch": 2.3212685195511114,
"grad_norm": 0.7885397672653198,
"learning_rate": 0.00022702662376507094,
"loss": 1.6035,
"step": 562000
},
{
"epoch": 2.323333705066726,
"grad_norm": 0.7402700781822205,
"learning_rate": 0.00022633585096991406,
"loss": 1.5988,
"step": 562500
},
{
"epoch": 2.325398890582341,
"grad_norm": 0.801807701587677,
"learning_rate": 0.00022564507817475723,
"loss": 1.6025,
"step": 563000
},
{
"epoch": 2.327464076097956,
"grad_norm": 0.7947646379470825,
"learning_rate": 0.00022495430537960038,
"loss": 1.6002,
"step": 563500
},
{
"epoch": 2.3295292616135708,
"grad_norm": 0.8268435001373291,
"learning_rate": 0.00022426353258444352,
"loss": 1.5977,
"step": 564000
},
{
"epoch": 2.3315944471291856,
"grad_norm": 0.8092913627624512,
"learning_rate": 0.00022357275978928667,
"loss": 1.5996,
"step": 564500
},
{
"epoch": 2.3336596326448005,
"grad_norm": 0.7848919630050659,
"learning_rate": 0.00022288198699412981,
"loss": 1.5995,
"step": 565000
},
{
"epoch": 2.3357248181604153,
"grad_norm": 0.802832305431366,
"learning_rate": 0.00022219121419897296,
"loss": 1.5979,
"step": 565500
},
{
"epoch": 2.33779000367603,
"grad_norm": 0.8020511865615845,
"learning_rate": 0.0002215004414038161,
"loss": 1.5981,
"step": 566000
},
{
"epoch": 2.339855189191645,
"grad_norm": 0.8132838606834412,
"learning_rate": 0.00022080966860865925,
"loss": 1.5959,
"step": 566500
},
{
"epoch": 2.34192037470726,
"grad_norm": 0.8069867491722107,
"learning_rate": 0.0002201188958135024,
"loss": 1.599,
"step": 567000
},
{
"epoch": 2.3439855602228747,
"grad_norm": 0.8337593078613281,
"learning_rate": 0.00021942812301834556,
"loss": 1.5981,
"step": 567500
},
{
"epoch": 2.3460507457384896,
"grad_norm": 0.7885046601295471,
"learning_rate": 0.00021873735022318868,
"loss": 1.597,
"step": 568000
},
{
"epoch": 2.3481159312541044,
"grad_norm": 0.8003047108650208,
"learning_rate": 0.00021804657742803183,
"loss": 1.5969,
"step": 568500
},
{
"epoch": 2.3501811167697193,
"grad_norm": 0.7714529037475586,
"learning_rate": 0.000217355804632875,
"loss": 1.5951,
"step": 569000
},
{
"epoch": 2.352246302285334,
"grad_norm": 0.8057835102081299,
"learning_rate": 0.00021666503183771812,
"loss": 1.598,
"step": 569500
},
{
"epoch": 2.354311487800949,
"grad_norm": 0.830685019493103,
"learning_rate": 0.00021597425904256126,
"loss": 1.5947,
"step": 570000
},
{
"epoch": 2.356376673316564,
"grad_norm": 0.7966949939727783,
"learning_rate": 0.00021528348624740443,
"loss": 1.5933,
"step": 570500
},
{
"epoch": 2.3584418588321787,
"grad_norm": 0.8312224745750427,
"learning_rate": 0.00021459271345224758,
"loss": 1.5941,
"step": 571000
},
{
"epoch": 2.3605070443477936,
"grad_norm": 0.8126243948936462,
"learning_rate": 0.0002139019406570907,
"loss": 1.595,
"step": 571500
},
{
"epoch": 2.3625722298634084,
"grad_norm": 0.7867225408554077,
"learning_rate": 0.00021321116786193387,
"loss": 1.5941,
"step": 572000
},
{
"epoch": 2.3646374153790237,
"grad_norm": 0.8437660336494446,
"learning_rate": 0.000212520395066777,
"loss": 1.5937,
"step": 572500
},
{
"epoch": 2.366702600894638,
"grad_norm": 0.7851312160491943,
"learning_rate": 0.00021182962227162016,
"loss": 1.5942,
"step": 573000
},
{
"epoch": 2.3687677864102534,
"grad_norm": 0.8472355008125305,
"learning_rate": 0.0002111388494764633,
"loss": 1.5937,
"step": 573500
},
{
"epoch": 2.3708329719258683,
"grad_norm": 0.7966650128364563,
"learning_rate": 0.00021044807668130645,
"loss": 1.5916,
"step": 574000
},
{
"epoch": 2.372898157441483,
"grad_norm": 0.8345617651939392,
"learning_rate": 0.00020975730388614962,
"loss": 1.5868,
"step": 574500
},
{
"epoch": 2.374963342957098,
"grad_norm": 0.82713383436203,
"learning_rate": 0.00020906653109099274,
"loss": 1.5982,
"step": 575000
},
{
"epoch": 2.377028528472713,
"grad_norm": 0.8211519718170166,
"learning_rate": 0.00020837575829583588,
"loss": 1.5888,
"step": 575500
},
{
"epoch": 2.3790937139883277,
"grad_norm": 0.8414788842201233,
"learning_rate": 0.00020768498550067905,
"loss": 1.5898,
"step": 576000
},
{
"epoch": 2.3811588995039425,
"grad_norm": 0.7635331749916077,
"learning_rate": 0.00020699421270552217,
"loss": 1.5873,
"step": 576500
},
{
"epoch": 2.3832240850195574,
"grad_norm": 0.8028623461723328,
"learning_rate": 0.00020630343991036532,
"loss": 1.5903,
"step": 577000
},
{
"epoch": 2.3852892705351723,
"grad_norm": 0.8185293674468994,
"learning_rate": 0.0002056126671152085,
"loss": 1.5917,
"step": 577500
},
{
"epoch": 2.387354456050787,
"grad_norm": 0.800356924533844,
"learning_rate": 0.00020492189432005163,
"loss": 1.5915,
"step": 578000
},
{
"epoch": 2.389419641566402,
"grad_norm": 0.7916369438171387,
"learning_rate": 0.00020423112152489475,
"loss": 1.5909,
"step": 578500
},
{
"epoch": 2.391484827082017,
"grad_norm": 0.830033540725708,
"learning_rate": 0.00020354034872973792,
"loss": 1.5881,
"step": 579000
},
{
"epoch": 2.3935500125976317,
"grad_norm": 0.7948420643806458,
"learning_rate": 0.00020284957593458107,
"loss": 1.5897,
"step": 579500
},
{
"epoch": 2.3956151981132465,
"grad_norm": 0.818466067314148,
"learning_rate": 0.00020215880313942419,
"loss": 1.5884,
"step": 580000
},
{
"epoch": 2.3976803836288614,
"grad_norm": 0.8161965608596802,
"learning_rate": 0.00020146803034426736,
"loss": 1.5906,
"step": 580500
},
{
"epoch": 2.3997455691444762,
"grad_norm": 0.8100621104240417,
"learning_rate": 0.0002007772575491105,
"loss": 1.5867,
"step": 581000
},
{
"epoch": 2.401810754660091,
"grad_norm": 0.8225206136703491,
"learning_rate": 0.00020008648475395365,
"loss": 1.5912,
"step": 581500
},
{
"epoch": 2.403875940175706,
"grad_norm": 0.8299617767333984,
"learning_rate": 0.0001993957119587968,
"loss": 1.592,
"step": 582000
},
{
"epoch": 2.405941125691321,
"grad_norm": 0.7852752208709717,
"learning_rate": 0.00019870493916363994,
"loss": 1.5859,
"step": 582500
},
{
"epoch": 2.4080063112069356,
"grad_norm": 0.8510515689849854,
"learning_rate": 0.00019801416636848308,
"loss": 1.5868,
"step": 583000
},
{
"epoch": 2.4100714967225505,
"grad_norm": 0.8003944158554077,
"learning_rate": 0.00019732339357332623,
"loss": 1.5839,
"step": 583500
},
{
"epoch": 2.4121366822381654,
"grad_norm": 0.8351225852966309,
"learning_rate": 0.00019663262077816937,
"loss": 1.5853,
"step": 584000
},
{
"epoch": 2.41420186775378,
"grad_norm": 0.8417115211486816,
"learning_rate": 0.00019594184798301252,
"loss": 1.5808,
"step": 584500
},
{
"epoch": 2.416267053269395,
"grad_norm": 0.822975754737854,
"learning_rate": 0.0001952510751878557,
"loss": 1.5828,
"step": 585000
},
{
"epoch": 2.41833223878501,
"grad_norm": 0.8236469030380249,
"learning_rate": 0.0001945603023926988,
"loss": 1.5831,
"step": 585500
},
{
"epoch": 2.4203974243006248,
"grad_norm": 0.8697351217269897,
"learning_rate": 0.00019386952959754195,
"loss": 1.5833,
"step": 586000
},
{
"epoch": 2.4224626098162396,
"grad_norm": 0.7966268658638,
"learning_rate": 0.00019317875680238512,
"loss": 1.5835,
"step": 586500
},
{
"epoch": 2.4245277953318545,
"grad_norm": 0.8148783445358276,
"learning_rate": 0.00019248798400722824,
"loss": 1.5855,
"step": 587000
},
{
"epoch": 2.4265929808474693,
"grad_norm": 0.8134833574295044,
"learning_rate": 0.00019179721121207139,
"loss": 1.5843,
"step": 587500
},
{
"epoch": 2.4286581663630846,
"grad_norm": 0.7940511703491211,
"learning_rate": 0.00019110643841691456,
"loss": 1.5831,
"step": 588000
},
{
"epoch": 2.430723351878699,
"grad_norm": 0.7859951257705688,
"learning_rate": 0.00019041566562175768,
"loss": 1.5884,
"step": 588500
},
{
"epoch": 2.4327885373943143,
"grad_norm": 0.7890865802764893,
"learning_rate": 0.00018972489282660082,
"loss": 1.5807,
"step": 589000
},
{
"epoch": 2.434853722909929,
"grad_norm": 0.7785663604736328,
"learning_rate": 0.000189034120031444,
"loss": 1.581,
"step": 589500
},
{
"epoch": 2.436918908425544,
"grad_norm": 0.8008002638816833,
"learning_rate": 0.00018834334723628714,
"loss": 1.5823,
"step": 590000
},
{
"epoch": 2.438984093941159,
"grad_norm": 0.8359131813049316,
"learning_rate": 0.00018765257444113025,
"loss": 1.5773,
"step": 590500
},
{
"epoch": 2.4410492794567737,
"grad_norm": 0.8443474173545837,
"learning_rate": 0.00018696180164597343,
"loss": 1.5841,
"step": 591000
},
{
"epoch": 2.4431144649723886,
"grad_norm": 0.7927765846252441,
"learning_rate": 0.00018627102885081657,
"loss": 1.5777,
"step": 591500
},
{
"epoch": 2.4451796504880035,
"grad_norm": 0.7933915853500366,
"learning_rate": 0.0001855802560556597,
"loss": 1.5801,
"step": 592000
},
{
"epoch": 2.4472448360036183,
"grad_norm": 0.798565149307251,
"learning_rate": 0.00018488948326050286,
"loss": 1.5797,
"step": 592500
},
{
"epoch": 2.449310021519233,
"grad_norm": 0.8151854276657104,
"learning_rate": 0.000184198710465346,
"loss": 1.5776,
"step": 593000
},
{
"epoch": 2.451375207034848,
"grad_norm": 0.7885642051696777,
"learning_rate": 0.00018350793767018915,
"loss": 1.5798,
"step": 593500
},
{
"epoch": 2.453440392550463,
"grad_norm": 0.8265528082847595,
"learning_rate": 0.0001828171648750323,
"loss": 1.5794,
"step": 594000
},
{
"epoch": 2.4555055780660777,
"grad_norm": 0.8994278311729431,
"learning_rate": 0.00018212639207987544,
"loss": 1.5758,
"step": 594500
},
{
"epoch": 2.4575707635816926,
"grad_norm": 0.7827315330505371,
"learning_rate": 0.00018143561928471859,
"loss": 1.5732,
"step": 595000
},
{
"epoch": 2.4596359490973074,
"grad_norm": 0.7778897285461426,
"learning_rate": 0.00018074484648956173,
"loss": 1.5796,
"step": 595500
},
{
"epoch": 2.4617011346129223,
"grad_norm": 0.7877337336540222,
"learning_rate": 0.00018005407369440487,
"loss": 1.5756,
"step": 596000
},
{
"epoch": 2.463766320128537,
"grad_norm": 0.7807685136795044,
"learning_rate": 0.00017936330089924802,
"loss": 1.5787,
"step": 596500
},
{
"epoch": 2.465831505644152,
"grad_norm": 0.825579047203064,
"learning_rate": 0.0001786725281040912,
"loss": 1.5794,
"step": 597000
},
{
"epoch": 2.467896691159767,
"grad_norm": 0.8047968149185181,
"learning_rate": 0.0001779817553089343,
"loss": 1.5809,
"step": 597500
},
{
"epoch": 2.4699618766753817,
"grad_norm": 0.8542481660842896,
"learning_rate": 0.00017729098251377745,
"loss": 1.5746,
"step": 598000
},
{
"epoch": 2.4720270621909965,
"grad_norm": 0.8317158222198486,
"learning_rate": 0.00017660020971862063,
"loss": 1.5809,
"step": 598500
},
{
"epoch": 2.4740922477066114,
"grad_norm": 0.8227892518043518,
"learning_rate": 0.00017590943692346374,
"loss": 1.5785,
"step": 599000
},
{
"epoch": 2.4761574332222263,
"grad_norm": 0.8336827158927917,
"learning_rate": 0.0001752186641283069,
"loss": 1.5746,
"step": 599500
},
{
"epoch": 2.478222618737841,
"grad_norm": 0.809407651424408,
"learning_rate": 0.00017452789133315006,
"loss": 1.5778,
"step": 600000
},
{
"epoch": 2.480287804253456,
"grad_norm": 0.799867570400238,
"learning_rate": 0.0001738371185379932,
"loss": 1.5762,
"step": 600500
},
{
"epoch": 2.482352989769071,
"grad_norm": 0.826615571975708,
"learning_rate": 0.00017314634574283632,
"loss": 1.5717,
"step": 601000
},
{
"epoch": 2.4844181752846857,
"grad_norm": 0.7937526702880859,
"learning_rate": 0.0001724555729476795,
"loss": 1.579,
"step": 601500
},
{
"epoch": 2.4864833608003005,
"grad_norm": 0.8167052865028381,
"learning_rate": 0.00017176480015252264,
"loss": 1.5727,
"step": 602000
},
{
"epoch": 2.4885485463159154,
"grad_norm": 0.8457524180412292,
"learning_rate": 0.00017107402735736579,
"loss": 1.5684,
"step": 602500
},
{
"epoch": 2.4906137318315302,
"grad_norm": 0.8600340485572815,
"learning_rate": 0.00017038325456220893,
"loss": 1.5767,
"step": 603000
},
{
"epoch": 2.4926789173471455,
"grad_norm": 0.786114513874054,
"learning_rate": 0.00016969248176705207,
"loss": 1.5696,
"step": 603500
},
{
"epoch": 2.49474410286276,
"grad_norm": 0.8081954717636108,
"learning_rate": 0.00016900170897189525,
"loss": 1.5735,
"step": 604000
},
{
"epoch": 2.4968092883783752,
"grad_norm": 0.8113991618156433,
"learning_rate": 0.00016831093617673836,
"loss": 1.5746,
"step": 604500
},
{
"epoch": 2.49887447389399,
"grad_norm": 0.8515011668205261,
"learning_rate": 0.0001676201633815815,
"loss": 1.568,
"step": 605000
},
{
"epoch": 2.500939659409605,
"grad_norm": 0.7948423624038696,
"learning_rate": 0.00016692939058642468,
"loss": 1.5727,
"step": 605500
},
{
"epoch": 2.50300484492522,
"grad_norm": 0.8115394711494446,
"learning_rate": 0.0001662386177912678,
"loss": 1.5704,
"step": 606000
},
{
"epoch": 2.5050700304408346,
"grad_norm": 0.8036853671073914,
"learning_rate": 0.00016554784499611094,
"loss": 1.5684,
"step": 606500
},
{
"epoch": 2.5071352159564495,
"grad_norm": 0.7892432808876038,
"learning_rate": 0.00016485707220095412,
"loss": 1.569,
"step": 607000
},
{
"epoch": 2.5092004014720644,
"grad_norm": 0.7984645366668701,
"learning_rate": 0.00016416629940579726,
"loss": 1.5679,
"step": 607500
},
{
"epoch": 2.511265586987679,
"grad_norm": 0.7996472120285034,
"learning_rate": 0.00016347552661064038,
"loss": 1.5691,
"step": 608000
},
{
"epoch": 2.513330772503294,
"grad_norm": 0.8775748610496521,
"learning_rate": 0.00016278475381548355,
"loss": 1.5707,
"step": 608500
},
{
"epoch": 2.515395958018909,
"grad_norm": 0.8051262497901917,
"learning_rate": 0.0001620939810203267,
"loss": 1.5739,
"step": 609000
},
{
"epoch": 2.5174611435345238,
"grad_norm": 0.8654427528381348,
"learning_rate": 0.0001614032082251698,
"loss": 1.5697,
"step": 609500
},
{
"epoch": 2.5195263290501386,
"grad_norm": 0.8159758448600769,
"learning_rate": 0.00016071243543001298,
"loss": 1.57,
"step": 610000
},
{
"epoch": 2.5215915145657535,
"grad_norm": 0.8165413737297058,
"learning_rate": 0.00016002166263485613,
"loss": 1.569,
"step": 610500
},
{
"epoch": 2.5236567000813683,
"grad_norm": 0.7978746891021729,
"learning_rate": 0.00015933088983969927,
"loss": 1.5659,
"step": 611000
},
{
"epoch": 2.525721885596983,
"grad_norm": 0.781399130821228,
"learning_rate": 0.00015864011704454242,
"loss": 1.5707,
"step": 611500
},
{
"epoch": 2.527787071112598,
"grad_norm": 0.8478353023529053,
"learning_rate": 0.00015794934424938556,
"loss": 1.5704,
"step": 612000
},
{
"epoch": 2.529852256628213,
"grad_norm": 0.846371054649353,
"learning_rate": 0.0001572585714542287,
"loss": 1.5638,
"step": 612500
},
{
"epoch": 2.5319174421438277,
"grad_norm": 0.8290744423866272,
"learning_rate": 0.00015656779865907185,
"loss": 1.5702,
"step": 613000
},
{
"epoch": 2.5339826276594426,
"grad_norm": 0.8195119500160217,
"learning_rate": 0.000155877025863915,
"loss": 1.5677,
"step": 613500
},
{
"epoch": 2.5360478131750575,
"grad_norm": 0.8459944128990173,
"learning_rate": 0.00015518625306875814,
"loss": 1.5662,
"step": 614000
},
{
"epoch": 2.5381129986906723,
"grad_norm": 0.7994758486747742,
"learning_rate": 0.00015449548027360132,
"loss": 1.5676,
"step": 614500
},
{
"epoch": 2.540178184206287,
"grad_norm": 0.7963876724243164,
"learning_rate": 0.00015380470747844443,
"loss": 1.5661,
"step": 615000
},
{
"epoch": 2.542243369721902,
"grad_norm": 0.8234278559684753,
"learning_rate": 0.00015311393468328758,
"loss": 1.5635,
"step": 615500
},
{
"epoch": 2.544308555237517,
"grad_norm": 0.7948046922683716,
"learning_rate": 0.00015242316188813075,
"loss": 1.5631,
"step": 616000
},
{
"epoch": 2.5463737407531317,
"grad_norm": 0.7982361912727356,
"learning_rate": 0.00015173238909297387,
"loss": 1.5685,
"step": 616500
},
{
"epoch": 2.5484389262687466,
"grad_norm": 0.7927718758583069,
"learning_rate": 0.000151041616297817,
"loss": 1.5661,
"step": 617000
},
{
"epoch": 2.5505041117843614,
"grad_norm": 0.8640558123588562,
"learning_rate": 0.00015035084350266018,
"loss": 1.5673,
"step": 617500
},
{
"epoch": 2.5525692972999767,
"grad_norm": 0.8167000412940979,
"learning_rate": 0.0001496600707075033,
"loss": 1.5666,
"step": 618000
},
{
"epoch": 2.554634482815591,
"grad_norm": 0.8331367373466492,
"learning_rate": 0.00014896929791234645,
"loss": 1.5656,
"step": 618500
},
{
"epoch": 2.5566996683312064,
"grad_norm": 0.8466469645500183,
"learning_rate": 0.00014827852511718962,
"loss": 1.562,
"step": 619000
},
{
"epoch": 2.558764853846821,
"grad_norm": 0.7808212637901306,
"learning_rate": 0.00014758775232203276,
"loss": 1.5605,
"step": 619500
},
{
"epoch": 2.560830039362436,
"grad_norm": 0.8436982035636902,
"learning_rate": 0.00014689697952687588,
"loss": 1.5621,
"step": 620000
},
{
"epoch": 2.5628952248780505,
"grad_norm": 0.8526425361633301,
"learning_rate": 0.00014620620673171905,
"loss": 1.566,
"step": 620500
},
{
"epoch": 2.564960410393666,
"grad_norm": 0.8892133831977844,
"learning_rate": 0.0001455154339365622,
"loss": 1.5623,
"step": 621000
},
{
"epoch": 2.5670255959092803,
"grad_norm": 0.8048965930938721,
"learning_rate": 0.00014482466114140532,
"loss": 1.5617,
"step": 621500
},
{
"epoch": 2.5690907814248956,
"grad_norm": 0.8180302977561951,
"learning_rate": 0.0001441338883462485,
"loss": 1.5605,
"step": 622000
},
{
"epoch": 2.5711559669405104,
"grad_norm": 0.795669674873352,
"learning_rate": 0.00014344311555109163,
"loss": 1.5615,
"step": 622500
},
{
"epoch": 2.5732211524561253,
"grad_norm": 0.8272981643676758,
"learning_rate": 0.00014275234275593478,
"loss": 1.5606,
"step": 623000
},
{
"epoch": 2.57528633797174,
"grad_norm": 0.8385244607925415,
"learning_rate": 0.00014206156996077792,
"loss": 1.5628,
"step": 623500
},
{
"epoch": 2.577351523487355,
"grad_norm": 0.8457437753677368,
"learning_rate": 0.00014137079716562107,
"loss": 1.5553,
"step": 624000
},
{
"epoch": 2.57941670900297,
"grad_norm": 0.8497530221939087,
"learning_rate": 0.0001406800243704642,
"loss": 1.5521,
"step": 624500
},
{
"epoch": 2.5814818945185847,
"grad_norm": 0.8231092691421509,
"learning_rate": 0.00013998925157530736,
"loss": 1.5613,
"step": 625000
},
{
"epoch": 2.5835470800341995,
"grad_norm": 0.783505380153656,
"learning_rate": 0.0001392984787801505,
"loss": 1.5577,
"step": 625500
},
{
"epoch": 2.5856122655498144,
"grad_norm": 0.8594375848770142,
"learning_rate": 0.00013860770598499365,
"loss": 1.5603,
"step": 626000
},
{
"epoch": 2.5876774510654292,
"grad_norm": 0.824301540851593,
"learning_rate": 0.00013791693318983682,
"loss": 1.5592,
"step": 626500
},
{
"epoch": 2.589742636581044,
"grad_norm": 0.7970808744430542,
"learning_rate": 0.00013722616039467994,
"loss": 1.5577,
"step": 627000
},
{
"epoch": 2.591807822096659,
"grad_norm": 0.7681635022163391,
"learning_rate": 0.00013653538759952308,
"loss": 1.556,
"step": 627500
},
{
"epoch": 2.593873007612274,
"grad_norm": 0.820792555809021,
"learning_rate": 0.00013584461480436625,
"loss": 1.5567,
"step": 628000
},
{
"epoch": 2.5959381931278886,
"grad_norm": 0.8436790704727173,
"learning_rate": 0.00013515384200920937,
"loss": 1.5562,
"step": 628500
},
{
"epoch": 2.5980033786435035,
"grad_norm": 0.806010901927948,
"learning_rate": 0.00013446306921405252,
"loss": 1.558,
"step": 629000
},
{
"epoch": 2.6000685641591184,
"grad_norm": 0.8049686551094055,
"learning_rate": 0.0001337722964188957,
"loss": 1.5593,
"step": 629500
},
{
"epoch": 2.602133749674733,
"grad_norm": 0.8346471786499023,
"learning_rate": 0.00013308152362373883,
"loss": 1.5551,
"step": 630000
},
{
"epoch": 2.604198935190348,
"grad_norm": 0.8366252779960632,
"learning_rate": 0.00013239075082858195,
"loss": 1.5571,
"step": 630500
},
{
"epoch": 2.606264120705963,
"grad_norm": 0.8249139785766602,
"learning_rate": 0.00013169997803342512,
"loss": 1.554,
"step": 631000
},
{
"epoch": 2.6083293062215778,
"grad_norm": 0.8431522250175476,
"learning_rate": 0.00013100920523826827,
"loss": 1.557,
"step": 631500
},
{
"epoch": 2.6103944917371926,
"grad_norm": 0.8180191516876221,
"learning_rate": 0.00013031843244311138,
"loss": 1.553,
"step": 632000
},
{
"epoch": 2.6124596772528075,
"grad_norm": 0.7824527025222778,
"learning_rate": 0.00012962765964795456,
"loss": 1.558,
"step": 632500
},
{
"epoch": 2.6145248627684223,
"grad_norm": 0.839433491230011,
"learning_rate": 0.0001289368868527977,
"loss": 1.5525,
"step": 633000
},
{
"epoch": 2.6165900482840376,
"grad_norm": 0.9019516110420227,
"learning_rate": 0.00012824611405764087,
"loss": 1.5569,
"step": 633500
},
{
"epoch": 2.618655233799652,
"grad_norm": 0.8029139637947083,
"learning_rate": 0.000127555341262484,
"loss": 1.5552,
"step": 634000
},
{
"epoch": 2.6207204193152673,
"grad_norm": 0.8322605490684509,
"learning_rate": 0.00012686456846732714,
"loss": 1.5566,
"step": 634500
},
{
"epoch": 2.6227856048308817,
"grad_norm": 0.8417773842811584,
"learning_rate": 0.0001261737956721703,
"loss": 1.551,
"step": 635000
},
{
"epoch": 2.624850790346497,
"grad_norm": 0.8202713131904602,
"learning_rate": 0.00012548302287701343,
"loss": 1.5507,
"step": 635500
},
{
"epoch": 2.6269159758621115,
"grad_norm": 0.839905858039856,
"learning_rate": 0.00012479225008185657,
"loss": 1.5495,
"step": 636000
},
{
"epoch": 2.6289811613777267,
"grad_norm": 0.8542851805686951,
"learning_rate": 0.00012410147728669972,
"loss": 1.5504,
"step": 636500
},
{
"epoch": 2.631046346893341,
"grad_norm": 0.8227192163467407,
"learning_rate": 0.0001234107044915429,
"loss": 1.5531,
"step": 637000
},
{
"epoch": 2.6331115324089565,
"grad_norm": 0.8212194442749023,
"learning_rate": 0.000122719931696386,
"loss": 1.5523,
"step": 637500
},
{
"epoch": 2.6351767179245713,
"grad_norm": 0.8629603981971741,
"learning_rate": 0.00012202915890122916,
"loss": 1.5479,
"step": 638000
},
{
"epoch": 2.637241903440186,
"grad_norm": 0.8459728956222534,
"learning_rate": 0.00012133838610607231,
"loss": 1.5481,
"step": 638500
},
{
"epoch": 2.639307088955801,
"grad_norm": 0.8557335734367371,
"learning_rate": 0.00012064761331091545,
"loss": 1.5487,
"step": 639000
},
{
"epoch": 2.641372274471416,
"grad_norm": 0.8298543691635132,
"learning_rate": 0.0001199568405157586,
"loss": 1.5479,
"step": 639500
},
{
"epoch": 2.6434374599870307,
"grad_norm": 0.8238996863365173,
"learning_rate": 0.00011926606772060176,
"loss": 1.5507,
"step": 640000
},
{
"epoch": 2.6455026455026456,
"grad_norm": 0.7995360493659973,
"learning_rate": 0.0001185752949254449,
"loss": 1.5443,
"step": 640500
},
{
"epoch": 2.6475678310182604,
"grad_norm": 0.8611718416213989,
"learning_rate": 0.00011788452213028803,
"loss": 1.5476,
"step": 641000
},
{
"epoch": 2.6496330165338753,
"grad_norm": 0.8229385614395142,
"learning_rate": 0.00011719374933513119,
"loss": 1.545,
"step": 641500
},
{
"epoch": 2.65169820204949,
"grad_norm": 0.8134409785270691,
"learning_rate": 0.00011650297653997434,
"loss": 1.5482,
"step": 642000
},
{
"epoch": 2.653763387565105,
"grad_norm": 0.8563694953918457,
"learning_rate": 0.00011581220374481748,
"loss": 1.5457,
"step": 642500
},
{
"epoch": 2.65582857308072,
"grad_norm": 0.8361693620681763,
"learning_rate": 0.00011512143094966063,
"loss": 1.5462,
"step": 643000
},
{
"epoch": 2.6578937585963347,
"grad_norm": 0.8493614792823792,
"learning_rate": 0.00011443065815450378,
"loss": 1.5463,
"step": 643500
},
{
"epoch": 2.6599589441119496,
"grad_norm": 0.7997604012489319,
"learning_rate": 0.00011373988535934692,
"loss": 1.547,
"step": 644000
},
{
"epoch": 2.6620241296275644,
"grad_norm": 0.8045528531074524,
"learning_rate": 0.00011304911256419006,
"loss": 1.5491,
"step": 644500
},
{
"epoch": 2.6640893151431793,
"grad_norm": 0.8172311186790466,
"learning_rate": 0.00011235833976903322,
"loss": 1.5486,
"step": 645000
},
{
"epoch": 2.666154500658794,
"grad_norm": 0.8630313873291016,
"learning_rate": 0.00011166756697387635,
"loss": 1.5513,
"step": 645500
},
{
"epoch": 2.668219686174409,
"grad_norm": 0.8246090412139893,
"learning_rate": 0.00011097679417871951,
"loss": 1.5461,
"step": 646000
},
{
"epoch": 2.670284871690024,
"grad_norm": 0.8191748857498169,
"learning_rate": 0.00011028602138356265,
"loss": 1.545,
"step": 646500
},
{
"epoch": 2.6723500572056387,
"grad_norm": 1.1739202737808228,
"learning_rate": 0.0001095952485884058,
"loss": 1.5455,
"step": 647000
},
{
"epoch": 2.6744152427212535,
"grad_norm": 0.8145565390586853,
"learning_rate": 0.00010890447579324894,
"loss": 1.5408,
"step": 647500
},
{
"epoch": 2.6764804282368684,
"grad_norm": 0.8613256216049194,
"learning_rate": 0.00010821370299809209,
"loss": 1.5439,
"step": 648000
},
{
"epoch": 2.6785456137524832,
"grad_norm": 0.8024303317070007,
"learning_rate": 0.00010752293020293523,
"loss": 1.5438,
"step": 648500
},
{
"epoch": 2.680610799268098,
"grad_norm": 0.8254972100257874,
"learning_rate": 0.00010683215740777838,
"loss": 1.5458,
"step": 649000
},
{
"epoch": 2.682675984783713,
"grad_norm": 0.815696120262146,
"learning_rate": 0.00010614138461262154,
"loss": 1.542,
"step": 649500
},
{
"epoch": 2.6847411702993282,
"grad_norm": 0.8715610504150391,
"learning_rate": 0.00010545061181746467,
"loss": 1.5415,
"step": 650000
},
{
"epoch": 2.6868063558149426,
"grad_norm": 0.8358045220375061,
"learning_rate": 0.00010475983902230781,
"loss": 1.5419,
"step": 650500
},
{
"epoch": 2.688871541330558,
"grad_norm": 0.7865080237388611,
"learning_rate": 0.00010406906622715097,
"loss": 1.5429,
"step": 651000
},
{
"epoch": 2.6909367268461724,
"grad_norm": 0.8054898381233215,
"learning_rate": 0.0001033782934319941,
"loss": 1.5433,
"step": 651500
},
{
"epoch": 2.6930019123617877,
"grad_norm": 0.8930450081825256,
"learning_rate": 0.00010268752063683726,
"loss": 1.5352,
"step": 652000
},
{
"epoch": 2.695067097877402,
"grad_norm": 0.8042411208152771,
"learning_rate": 0.0001019967478416804,
"loss": 1.5413,
"step": 652500
},
{
"epoch": 2.6971322833930174,
"grad_norm": 0.8400362133979797,
"learning_rate": 0.00010130597504652355,
"loss": 1.5423,
"step": 653000
},
{
"epoch": 2.699197468908632,
"grad_norm": 0.8137294054031372,
"learning_rate": 0.0001006152022513667,
"loss": 1.5432,
"step": 653500
},
{
"epoch": 2.701262654424247,
"grad_norm": 0.8344128727912903,
"learning_rate": 9.992442945620984e-05,
"loss": 1.5398,
"step": 654000
},
{
"epoch": 2.703327839939862,
"grad_norm": 0.849104642868042,
"learning_rate": 9.923365666105298e-05,
"loss": 1.54,
"step": 654500
},
{
"epoch": 2.7053930254554768,
"grad_norm": 0.8286527991294861,
"learning_rate": 9.854288386589613e-05,
"loss": 1.5412,
"step": 655000
},
{
"epoch": 2.7074582109710916,
"grad_norm": 0.8378123641014099,
"learning_rate": 9.785211107073929e-05,
"loss": 1.5361,
"step": 655500
},
{
"epoch": 2.7095233964867065,
"grad_norm": 0.8808925151824951,
"learning_rate": 9.716133827558243e-05,
"loss": 1.5382,
"step": 656000
},
{
"epoch": 2.7115885820023213,
"grad_norm": 0.8783825039863586,
"learning_rate": 9.647056548042558e-05,
"loss": 1.5361,
"step": 656500
},
{
"epoch": 2.713653767517936,
"grad_norm": 0.8051160573959351,
"learning_rate": 9.577979268526872e-05,
"loss": 1.5358,
"step": 657000
},
{
"epoch": 2.715718953033551,
"grad_norm": 0.896801233291626,
"learning_rate": 9.508901989011187e-05,
"loss": 1.5368,
"step": 657500
},
{
"epoch": 2.717784138549166,
"grad_norm": 0.8218420743942261,
"learning_rate": 9.439824709495501e-05,
"loss": 1.537,
"step": 658000
},
{
"epoch": 2.7198493240647807,
"grad_norm": 0.8470411896705627,
"learning_rate": 9.370747429979816e-05,
"loss": 1.5322,
"step": 658500
},
{
"epoch": 2.7219145095803956,
"grad_norm": 0.8505502939224243,
"learning_rate": 9.301670150464131e-05,
"loss": 1.5317,
"step": 659000
},
{
"epoch": 2.7239796950960105,
"grad_norm": 0.8617528080940247,
"learning_rate": 9.232592870948445e-05,
"loss": 1.5375,
"step": 659500
},
{
"epoch": 2.7260448806116253,
"grad_norm": 0.8441663384437561,
"learning_rate": 9.16351559143276e-05,
"loss": 1.5366,
"step": 660000
},
{
"epoch": 2.72811006612724,
"grad_norm": 0.8294611573219299,
"learning_rate": 9.094438311917075e-05,
"loss": 1.5373,
"step": 660500
},
{
"epoch": 2.730175251642855,
"grad_norm": 0.8215169906616211,
"learning_rate": 9.025361032401388e-05,
"loss": 1.5327,
"step": 661000
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.8766931891441345,
"learning_rate": 8.956283752885704e-05,
"loss": 1.5339,
"step": 661500
},
{
"epoch": 2.7343056226740847,
"grad_norm": 0.8456342220306396,
"learning_rate": 8.887206473370018e-05,
"loss": 1.5341,
"step": 662000
},
{
"epoch": 2.7363708081896996,
"grad_norm": 0.8384252786636353,
"learning_rate": 8.818129193854333e-05,
"loss": 1.5338,
"step": 662500
},
{
"epoch": 2.7384359937053144,
"grad_norm": 0.8584861159324646,
"learning_rate": 8.749051914338647e-05,
"loss": 1.5301,
"step": 663000
},
{
"epoch": 2.7405011792209293,
"grad_norm": 0.8463834524154663,
"learning_rate": 8.679974634822962e-05,
"loss": 1.531,
"step": 663500
},
{
"epoch": 2.742566364736544,
"grad_norm": 0.84855055809021,
"learning_rate": 8.610897355307276e-05,
"loss": 1.5332,
"step": 664000
},
{
"epoch": 2.744631550252159,
"grad_norm": 0.8267730474472046,
"learning_rate": 8.541820075791591e-05,
"loss": 1.5337,
"step": 664500
},
{
"epoch": 2.746696735767774,
"grad_norm": 0.8398123383522034,
"learning_rate": 8.472742796275907e-05,
"loss": 1.5327,
"step": 665000
},
{
"epoch": 2.748761921283389,
"grad_norm": 0.8413114547729492,
"learning_rate": 8.40366551676022e-05,
"loss": 1.5355,
"step": 665500
},
{
"epoch": 2.7508271067990036,
"grad_norm": 0.8241723775863647,
"learning_rate": 8.334588237244536e-05,
"loss": 1.533,
"step": 666000
},
{
"epoch": 2.752892292314619,
"grad_norm": 0.8695456981658936,
"learning_rate": 8.26551095772885e-05,
"loss": 1.5347,
"step": 666500
},
{
"epoch": 2.7549574778302333,
"grad_norm": 0.8351263403892517,
"learning_rate": 8.196433678213163e-05,
"loss": 1.53,
"step": 667000
},
{
"epoch": 2.7570226633458486,
"grad_norm": 0.8227745294570923,
"learning_rate": 8.127356398697479e-05,
"loss": 1.53,
"step": 667500
},
{
"epoch": 2.759087848861463,
"grad_norm": 0.8654522895812988,
"learning_rate": 8.058279119181794e-05,
"loss": 1.532,
"step": 668000
},
{
"epoch": 2.7611530343770783,
"grad_norm": 0.819057822227478,
"learning_rate": 7.989201839666108e-05,
"loss": 1.5297,
"step": 668500
},
{
"epoch": 2.763218219892693,
"grad_norm": 0.8575501441955566,
"learning_rate": 7.920124560150422e-05,
"loss": 1.5275,
"step": 669000
},
{
"epoch": 2.765283405408308,
"grad_norm": 0.8428553938865662,
"learning_rate": 7.851047280634738e-05,
"loss": 1.5321,
"step": 669500
},
{
"epoch": 2.767348590923923,
"grad_norm": 0.8702006936073303,
"learning_rate": 7.781970001119051e-05,
"loss": 1.5291,
"step": 670000
},
{
"epoch": 2.7694137764395377,
"grad_norm": 0.8024266958236694,
"learning_rate": 7.712892721603366e-05,
"loss": 1.529,
"step": 670500
},
{
"epoch": 2.7714789619551525,
"grad_norm": 0.862339437007904,
"learning_rate": 7.643815442087682e-05,
"loss": 1.5337,
"step": 671000
},
{
"epoch": 2.7735441474707674,
"grad_norm": 0.8829432725906372,
"learning_rate": 7.574738162571996e-05,
"loss": 1.5243,
"step": 671500
},
{
"epoch": 2.7756093329863822,
"grad_norm": 0.8032020926475525,
"learning_rate": 7.505660883056311e-05,
"loss": 1.525,
"step": 672000
},
{
"epoch": 2.777674518501997,
"grad_norm": 0.8329365849494934,
"learning_rate": 7.436583603540625e-05,
"loss": 1.532,
"step": 672500
},
{
"epoch": 2.779739704017612,
"grad_norm": 0.865728497505188,
"learning_rate": 7.367506324024941e-05,
"loss": 1.5243,
"step": 673000
},
{
"epoch": 2.781804889533227,
"grad_norm": 0.8427261114120483,
"learning_rate": 7.298429044509254e-05,
"loss": 1.5197,
"step": 673500
},
{
"epoch": 2.7838700750488417,
"grad_norm": 0.8444133400917053,
"learning_rate": 7.229351764993569e-05,
"loss": 1.5314,
"step": 674000
},
{
"epoch": 2.7859352605644565,
"grad_norm": 0.8255510330200195,
"learning_rate": 7.160274485477885e-05,
"loss": 1.5275,
"step": 674500
},
{
"epoch": 2.7880004460800714,
"grad_norm": 0.794021487236023,
"learning_rate": 7.091197205962198e-05,
"loss": 1.5237,
"step": 675000
},
{
"epoch": 2.790065631595686,
"grad_norm": 0.8648783564567566,
"learning_rate": 7.022119926446513e-05,
"loss": 1.5221,
"step": 675500
},
{
"epoch": 2.792130817111301,
"grad_norm": 0.8662870526313782,
"learning_rate": 6.953042646930828e-05,
"loss": 1.5239,
"step": 676000
},
{
"epoch": 2.794196002626916,
"grad_norm": 0.8716167211532593,
"learning_rate": 6.883965367415141e-05,
"loss": 1.5284,
"step": 676500
},
{
"epoch": 2.7962611881425308,
"grad_norm": 0.8369839191436768,
"learning_rate": 6.814888087899457e-05,
"loss": 1.5206,
"step": 677000
},
{
"epoch": 2.7983263736581456,
"grad_norm": 0.8716705441474915,
"learning_rate": 6.745810808383771e-05,
"loss": 1.5179,
"step": 677500
},
{
"epoch": 2.8003915591737605,
"grad_norm": 0.8210489153862,
"learning_rate": 6.676733528868086e-05,
"loss": 1.5286,
"step": 678000
},
{
"epoch": 2.8024567446893753,
"grad_norm": 0.8834524750709534,
"learning_rate": 6.6076562493524e-05,
"loss": 1.5271,
"step": 678500
},
{
"epoch": 2.80452193020499,
"grad_norm": 0.858285665512085,
"learning_rate": 6.538578969836716e-05,
"loss": 1.5232,
"step": 679000
},
{
"epoch": 2.806587115720605,
"grad_norm": 0.8696337342262268,
"learning_rate": 6.46950169032103e-05,
"loss": 1.524,
"step": 679500
},
{
"epoch": 2.80865230123622,
"grad_norm": 0.8471727967262268,
"learning_rate": 6.400424410805344e-05,
"loss": 1.523,
"step": 680000
},
{
"epoch": 2.8107174867518347,
"grad_norm": 0.8594076633453369,
"learning_rate": 6.33134713128966e-05,
"loss": 1.5166,
"step": 680500
},
{
"epoch": 2.81278267226745,
"grad_norm": 0.856606662273407,
"learning_rate": 6.262269851773973e-05,
"loss": 1.523,
"step": 681000
},
{
"epoch": 2.8148478577830645,
"grad_norm": 0.8609211444854736,
"learning_rate": 6.193192572258289e-05,
"loss": 1.5209,
"step": 681500
},
{
"epoch": 2.8169130432986798,
"grad_norm": 0.8398802280426025,
"learning_rate": 6.124115292742603e-05,
"loss": 1.5271,
"step": 682000
},
{
"epoch": 2.818978228814294,
"grad_norm": 0.9304519295692444,
"learning_rate": 6.0550380132269176e-05,
"loss": 1.5205,
"step": 682500
},
{
"epoch": 2.8210434143299095,
"grad_norm": 0.8197703957557678,
"learning_rate": 5.985960733711232e-05,
"loss": 1.524,
"step": 683000
},
{
"epoch": 2.823108599845524,
"grad_norm": 0.831089973449707,
"learning_rate": 5.916883454195547e-05,
"loss": 1.5204,
"step": 683500
},
{
"epoch": 2.825173785361139,
"grad_norm": 0.8130340576171875,
"learning_rate": 5.847806174679862e-05,
"loss": 1.5151,
"step": 684000
},
{
"epoch": 2.827238970876754,
"grad_norm": 0.8501649498939514,
"learning_rate": 5.7787288951641755e-05,
"loss": 1.5213,
"step": 684500
},
{
"epoch": 2.829304156392369,
"grad_norm": 0.827510416507721,
"learning_rate": 5.709651615648491e-05,
"loss": 1.5202,
"step": 685000
},
{
"epoch": 2.8313693419079837,
"grad_norm": 0.8375749588012695,
"learning_rate": 5.640574336132805e-05,
"loss": 1.5226,
"step": 685500
},
{
"epoch": 2.8334345274235986,
"grad_norm": 0.8179614543914795,
"learning_rate": 5.57149705661712e-05,
"loss": 1.5174,
"step": 686000
},
{
"epoch": 2.8354997129392134,
"grad_norm": 0.8485569953918457,
"learning_rate": 5.502419777101435e-05,
"loss": 1.5197,
"step": 686500
},
{
"epoch": 2.8375648984548283,
"grad_norm": 0.8839040398597717,
"learning_rate": 5.433342497585749e-05,
"loss": 1.5206,
"step": 687000
},
{
"epoch": 2.839630083970443,
"grad_norm": 0.8560023307800293,
"learning_rate": 5.364265218070064e-05,
"loss": 1.5177,
"step": 687500
},
{
"epoch": 2.841695269486058,
"grad_norm": 0.8139906525611877,
"learning_rate": 5.295187938554378e-05,
"loss": 1.5154,
"step": 688000
},
{
"epoch": 2.843760455001673,
"grad_norm": 0.9361693859100342,
"learning_rate": 5.226110659038693e-05,
"loss": 1.5131,
"step": 688500
},
{
"epoch": 2.8458256405172877,
"grad_norm": 0.8294958472251892,
"learning_rate": 5.157033379523008e-05,
"loss": 1.5194,
"step": 689000
},
{
"epoch": 2.8478908260329026,
"grad_norm": 0.8591476082801819,
"learning_rate": 5.0879561000073224e-05,
"loss": 1.5179,
"step": 689500
},
{
"epoch": 2.8499560115485174,
"grad_norm": 0.8466942310333252,
"learning_rate": 5.018878820491637e-05,
"loss": 1.5122,
"step": 690000
},
{
"epoch": 2.8520211970641323,
"grad_norm": 0.9315714240074158,
"learning_rate": 4.949801540975952e-05,
"loss": 1.5193,
"step": 690500
},
{
"epoch": 2.854086382579747,
"grad_norm": 0.8646622896194458,
"learning_rate": 4.8807242614602665e-05,
"loss": 1.5155,
"step": 691000
},
{
"epoch": 2.856151568095362,
"grad_norm": 0.8958275318145752,
"learning_rate": 4.8116469819445804e-05,
"loss": 1.5172,
"step": 691500
},
{
"epoch": 2.858216753610977,
"grad_norm": 0.8623936176300049,
"learning_rate": 4.7425697024288955e-05,
"loss": 1.5138,
"step": 692000
},
{
"epoch": 2.8602819391265917,
"grad_norm": 0.8689021468162537,
"learning_rate": 4.67349242291321e-05,
"loss": 1.5144,
"step": 692500
},
{
"epoch": 2.8623471246422065,
"grad_norm": 0.8967764973640442,
"learning_rate": 4.6044151433975245e-05,
"loss": 1.5156,
"step": 693000
},
{
"epoch": 2.8644123101578214,
"grad_norm": 0.8540061116218567,
"learning_rate": 4.5353378638818396e-05,
"loss": 1.5161,
"step": 693500
},
{
"epoch": 2.8664774956734362,
"grad_norm": 0.8717928528785706,
"learning_rate": 4.466260584366154e-05,
"loss": 1.5097,
"step": 694000
},
{
"epoch": 2.868542681189051,
"grad_norm": 0.861867368221283,
"learning_rate": 4.397183304850468e-05,
"loss": 1.5117,
"step": 694500
},
{
"epoch": 2.870607866704666,
"grad_norm": 0.8746508955955505,
"learning_rate": 4.328106025334783e-05,
"loss": 1.5116,
"step": 695000
},
{
"epoch": 2.872673052220281,
"grad_norm": 0.856505274772644,
"learning_rate": 4.2590287458190976e-05,
"loss": 1.5105,
"step": 695500
},
{
"epoch": 2.8747382377358957,
"grad_norm": 0.8690941333770752,
"learning_rate": 4.189951466303412e-05,
"loss": 1.511,
"step": 696000
},
{
"epoch": 2.876803423251511,
"grad_norm": 0.8394379019737244,
"learning_rate": 4.120874186787727e-05,
"loss": 1.511,
"step": 696500
},
{
"epoch": 2.8788686087671254,
"grad_norm": 0.847400426864624,
"learning_rate": 4.051796907272042e-05,
"loss": 1.5152,
"step": 697000
},
{
"epoch": 2.8809337942827407,
"grad_norm": 0.8548203706741333,
"learning_rate": 3.982719627756357e-05,
"loss": 1.5103,
"step": 697500
},
{
"epoch": 2.882998979798355,
"grad_norm": 0.9266785979270935,
"learning_rate": 3.913642348240671e-05,
"loss": 1.5172,
"step": 698000
},
{
"epoch": 2.8850641653139704,
"grad_norm": 0.8905568718910217,
"learning_rate": 3.844565068724985e-05,
"loss": 1.5147,
"step": 698500
},
{
"epoch": 2.8871293508295848,
"grad_norm": 0.8947970271110535,
"learning_rate": 3.7754877892093e-05,
"loss": 1.5116,
"step": 699000
},
{
"epoch": 2.8891945363452,
"grad_norm": 0.8671281337738037,
"learning_rate": 3.706410509693615e-05,
"loss": 1.5089,
"step": 699500
},
{
"epoch": 2.8912597218608145,
"grad_norm": 0.8655187487602234,
"learning_rate": 3.637333230177929e-05,
"loss": 1.5079,
"step": 700000
},
{
"epoch": 2.8933249073764298,
"grad_norm": 0.8781392574310303,
"learning_rate": 3.5682559506622444e-05,
"loss": 1.5051,
"step": 700500
},
{
"epoch": 2.8953900928920446,
"grad_norm": 0.8239871859550476,
"learning_rate": 3.499178671146558e-05,
"loss": 1.5135,
"step": 701000
},
{
"epoch": 2.8974552784076595,
"grad_norm": 0.8702250719070435,
"learning_rate": 3.430101391630873e-05,
"loss": 1.5101,
"step": 701500
},
{
"epoch": 2.8995204639232743,
"grad_norm": 0.8681339621543884,
"learning_rate": 3.361024112115188e-05,
"loss": 1.5098,
"step": 702000
},
{
"epoch": 2.901585649438889,
"grad_norm": 0.8929154276847839,
"learning_rate": 3.2919468325995024e-05,
"loss": 1.5115,
"step": 702500
},
{
"epoch": 2.903650834954504,
"grad_norm": 0.8695405125617981,
"learning_rate": 3.222869553083817e-05,
"loss": 1.5073,
"step": 703000
},
{
"epoch": 2.905716020470119,
"grad_norm": 0.8858229517936707,
"learning_rate": 3.153792273568132e-05,
"loss": 1.5081,
"step": 703500
},
{
"epoch": 2.9077812059857338,
"grad_norm": 0.8298658132553101,
"learning_rate": 3.0847149940524465e-05,
"loss": 1.5109,
"step": 704000
},
{
"epoch": 2.9098463915013486,
"grad_norm": 0.9026769399642944,
"learning_rate": 3.015637714536761e-05,
"loss": 1.5036,
"step": 704500
},
{
"epoch": 2.9119115770169635,
"grad_norm": 0.8433796763420105,
"learning_rate": 2.9465604350210755e-05,
"loss": 1.5103,
"step": 705000
},
{
"epoch": 2.9139767625325783,
"grad_norm": 0.8475963473320007,
"learning_rate": 2.87748315550539e-05,
"loss": 1.5084,
"step": 705500
},
{
"epoch": 2.916041948048193,
"grad_norm": 0.8807883262634277,
"learning_rate": 2.8084058759897048e-05,
"loss": 1.5089,
"step": 706000
},
{
"epoch": 2.918107133563808,
"grad_norm": 0.9054199457168579,
"learning_rate": 2.7393285964740193e-05,
"loss": 1.5123,
"step": 706500
},
{
"epoch": 2.920172319079423,
"grad_norm": 0.8661481738090515,
"learning_rate": 2.670251316958334e-05,
"loss": 1.5088,
"step": 707000
},
{
"epoch": 2.9222375045950377,
"grad_norm": 0.8456491231918335,
"learning_rate": 2.6011740374426486e-05,
"loss": 1.5078,
"step": 707500
},
{
"epoch": 2.9243026901106526,
"grad_norm": 0.8700172305107117,
"learning_rate": 2.5320967579269634e-05,
"loss": 1.5056,
"step": 708000
},
{
"epoch": 2.9263678756262674,
"grad_norm": 0.882483184337616,
"learning_rate": 2.463019478411278e-05,
"loss": 1.5118,
"step": 708500
},
{
"epoch": 2.9284330611418823,
"grad_norm": 0.8397735357284546,
"learning_rate": 2.3939421988955924e-05,
"loss": 1.5078,
"step": 709000
},
{
"epoch": 2.930498246657497,
"grad_norm": 0.8614588379859924,
"learning_rate": 2.3248649193799072e-05,
"loss": 1.504,
"step": 709500
},
{
"epoch": 2.932563432173112,
"grad_norm": 0.8456758260726929,
"learning_rate": 2.2557876398642217e-05,
"loss": 1.5068,
"step": 710000
},
{
"epoch": 2.934628617688727,
"grad_norm": 0.8835407495498657,
"learning_rate": 2.1867103603485365e-05,
"loss": 1.503,
"step": 710500
},
{
"epoch": 2.9366938032043417,
"grad_norm": 0.8269529938697815,
"learning_rate": 2.117633080832851e-05,
"loss": 1.5039,
"step": 711000
},
{
"epoch": 2.9387589887199566,
"grad_norm": 0.9135294556617737,
"learning_rate": 2.0485558013171655e-05,
"loss": 1.5067,
"step": 711500
},
{
"epoch": 2.940824174235572,
"grad_norm": 0.8736814856529236,
"learning_rate": 1.9794785218014803e-05,
"loss": 1.5025,
"step": 712000
},
{
"epoch": 2.9428893597511863,
"grad_norm": 0.8207076191902161,
"learning_rate": 1.9104012422857948e-05,
"loss": 1.503,
"step": 712500
},
{
"epoch": 2.9449545452668016,
"grad_norm": 0.8992505669593811,
"learning_rate": 1.8413239627701093e-05,
"loss": 1.5057,
"step": 713000
},
{
"epoch": 2.947019730782416,
"grad_norm": 0.8630014657974243,
"learning_rate": 1.772246683254424e-05,
"loss": 1.5026,
"step": 713500
},
{
"epoch": 2.9490849162980313,
"grad_norm": 0.8466277122497559,
"learning_rate": 1.703169403738739e-05,
"loss": 1.5022,
"step": 714000
},
{
"epoch": 2.9511501018136457,
"grad_norm": 0.8246403932571411,
"learning_rate": 1.6340921242230534e-05,
"loss": 1.5025,
"step": 714500
},
{
"epoch": 2.953215287329261,
"grad_norm": 0.8537036776542664,
"learning_rate": 1.565014844707368e-05,
"loss": 1.5004,
"step": 715000
},
{
"epoch": 2.9552804728448754,
"grad_norm": 0.8644038438796997,
"learning_rate": 1.4959375651916825e-05,
"loss": 1.5003,
"step": 715500
},
{
"epoch": 2.9573456583604907,
"grad_norm": 0.8385940790176392,
"learning_rate": 1.4268602856759972e-05,
"loss": 1.4993,
"step": 716000
},
{
"epoch": 2.9594108438761055,
"grad_norm": 0.8472567796707153,
"learning_rate": 1.3577830061603118e-05,
"loss": 1.503,
"step": 716500
},
{
"epoch": 2.9614760293917204,
"grad_norm": 0.8817070126533508,
"learning_rate": 1.2887057266446265e-05,
"loss": 1.5039,
"step": 717000
},
{
"epoch": 2.9635412149073352,
"grad_norm": 0.8786518573760986,
"learning_rate": 1.219628447128941e-05,
"loss": 1.506,
"step": 717500
},
{
"epoch": 2.96560640042295,
"grad_norm": 0.8719050884246826,
"learning_rate": 1.1505511676132556e-05,
"loss": 1.5004,
"step": 718000
},
{
"epoch": 2.967671585938565,
"grad_norm": 0.9109290242195129,
"learning_rate": 1.0814738880975703e-05,
"loss": 1.5021,
"step": 718500
},
{
"epoch": 2.96973677145418,
"grad_norm": 0.8234292268753052,
"learning_rate": 1.012396608581885e-05,
"loss": 1.5025,
"step": 719000
},
{
"epoch": 2.9718019569697947,
"grad_norm": 0.9141399264335632,
"learning_rate": 9.433193290661996e-06,
"loss": 1.4982,
"step": 719500
},
{
"epoch": 2.9738671424854095,
"grad_norm": 0.8994991183280945,
"learning_rate": 8.74242049550514e-06,
"loss": 1.5012,
"step": 720000
},
{
"epoch": 2.9759323280010244,
"grad_norm": 0.8629069328308105,
"learning_rate": 8.051647700348289e-06,
"loss": 1.5005,
"step": 720500
},
{
"epoch": 2.977997513516639,
"grad_norm": 0.8604488968849182,
"learning_rate": 7.360874905191434e-06,
"loss": 1.497,
"step": 721000
},
{
"epoch": 2.980062699032254,
"grad_norm": 0.8444788455963135,
"learning_rate": 6.67010211003458e-06,
"loss": 1.5015,
"step": 721500
},
{
"epoch": 2.982127884547869,
"grad_norm": 0.844616711139679,
"learning_rate": 5.979329314877727e-06,
"loss": 1.5037,
"step": 722000
},
{
"epoch": 2.9841930700634838,
"grad_norm": 0.8340693712234497,
"learning_rate": 5.288556519720873e-06,
"loss": 1.5002,
"step": 722500
},
{
"epoch": 2.9862582555790986,
"grad_norm": 0.8410211205482483,
"learning_rate": 4.597783724564018e-06,
"loss": 1.4972,
"step": 723000
},
{
"epoch": 2.9883234410947135,
"grad_norm": 0.8680119514465332,
"learning_rate": 3.907010929407165e-06,
"loss": 1.4977,
"step": 723500
},
{
"epoch": 2.9903886266103283,
"grad_norm": 0.8596481084823608,
"learning_rate": 3.2162381342503112e-06,
"loss": 1.5007,
"step": 724000
},
{
"epoch": 2.992453812125943,
"grad_norm": 0.7909371256828308,
"learning_rate": 2.5254653390934573e-06,
"loss": 1.4953,
"step": 724500
},
{
"epoch": 2.994518997641558,
"grad_norm": 0.8666454553604126,
"learning_rate": 1.8346925439366037e-06,
"loss": 1.5011,
"step": 725000
},
{
"epoch": 2.996584183157173,
"grad_norm": 0.8664350509643555,
"learning_rate": 1.1439197487797498e-06,
"loss": 1.5007,
"step": 725500
},
{
"epoch": 2.9986493686727878,
"grad_norm": 0.8779242634773254,
"learning_rate": 4.531469536228961e-07,
"loss": 1.4985,
"step": 726000
},
{
"epoch": 3.0,
"step": 726327,
"total_flos": 1.546067484574894e+18,
"train_loss": 1.7818369276394814,
"train_runtime": 122016.1779,
"train_samples_per_second": 380.973,
"train_steps_per_second": 5.953
}
],
"logging_steps": 500,
"max_steps": 726327,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.546067484574894e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}