N3D-VLM / trainer_state.json
YXXPP's picture
Duplicate from yuxinhk/N3D-VLM
141c0ef
{
"best_global_step": null,
"best_metric": 0.4795108139514923,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 30000,
"global_step": 65896,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015175427947068108,
"grad_norm": 56.41612243652344,
"learning_rate": 1.5022761760242794e-07,
"loss": 8.3409,
"step": 100
},
{
"epoch": 0.0030350855894136215,
"grad_norm": 18.706037521362305,
"learning_rate": 3.019726858877087e-07,
"loss": 5.1846,
"step": 200
},
{
"epoch": 0.004552628384120432,
"grad_norm": 3.0564115047454834,
"learning_rate": 4.537177541729894e-07,
"loss": 1.1549,
"step": 300
},
{
"epoch": 0.006070171178827243,
"grad_norm": 1.5621784925460815,
"learning_rate": 6.054628224582701e-07,
"loss": 0.335,
"step": 400
},
{
"epoch": 0.007587713973534054,
"grad_norm": 1.4899357557296753,
"learning_rate": 7.57207890743551e-07,
"loss": 0.244,
"step": 500
},
{
"epoch": 0.009105256768240864,
"grad_norm": 2.1066386699676514,
"learning_rate": 9.089529590288317e-07,
"loss": 0.2106,
"step": 600
},
{
"epoch": 0.010622799562947675,
"grad_norm": 1.2894740104675293,
"learning_rate": 1.0606980273141124e-06,
"loss": 0.1825,
"step": 700
},
{
"epoch": 0.012140342357654486,
"grad_norm": 2.292886972427368,
"learning_rate": 1.212443095599393e-06,
"loss": 0.1729,
"step": 800
},
{
"epoch": 0.013657885152361297,
"grad_norm": 0.8042752146720886,
"learning_rate": 1.3641881638846738e-06,
"loss": 0.1696,
"step": 900
},
{
"epoch": 0.015175427947068108,
"grad_norm": 1.586381435394287,
"learning_rate": 1.5159332321699546e-06,
"loss": 0.1612,
"step": 1000
},
{
"epoch": 0.016692970741774917,
"grad_norm": 2.209632635116577,
"learning_rate": 1.6676783004552353e-06,
"loss": 0.1693,
"step": 1100
},
{
"epoch": 0.018210513536481728,
"grad_norm": 1.8002041578292847,
"learning_rate": 1.819423368740516e-06,
"loss": 0.1649,
"step": 1200
},
{
"epoch": 0.01972805633118854,
"grad_norm": 0.7994608879089355,
"learning_rate": 1.971168437025797e-06,
"loss": 0.1666,
"step": 1300
},
{
"epoch": 0.02124559912589535,
"grad_norm": 0.8511660099029541,
"learning_rate": 2.1229135053110773e-06,
"loss": 0.1671,
"step": 1400
},
{
"epoch": 0.02276314192060216,
"grad_norm": 1.5392524003982544,
"learning_rate": 2.274658573596358e-06,
"loss": 0.1573,
"step": 1500
},
{
"epoch": 0.024280684715308972,
"grad_norm": 1.7585276365280151,
"learning_rate": 2.426403641881639e-06,
"loss": 0.1589,
"step": 1600
},
{
"epoch": 0.025798227510015783,
"grad_norm": 1.419176697731018,
"learning_rate": 2.57814871016692e-06,
"loss": 0.155,
"step": 1700
},
{
"epoch": 0.027315770304722594,
"grad_norm": 1.071205973625183,
"learning_rate": 2.729893778452201e-06,
"loss": 0.1616,
"step": 1800
},
{
"epoch": 0.028833313099429405,
"grad_norm": 1.5084614753723145,
"learning_rate": 2.8816388467374813e-06,
"loss": 0.1639,
"step": 1900
},
{
"epoch": 0.030350855894136216,
"grad_norm": 0.5268077254295349,
"learning_rate": 3.0333839150227617e-06,
"loss": 0.1531,
"step": 2000
},
{
"epoch": 0.03186839868884302,
"grad_norm": 1.1704555749893188,
"learning_rate": 3.185128983308043e-06,
"loss": 0.1637,
"step": 2100
},
{
"epoch": 0.033385941483549834,
"grad_norm": 8.939970970153809,
"learning_rate": 3.3368740515933235e-06,
"loss": 0.1569,
"step": 2200
},
{
"epoch": 0.034903484278256645,
"grad_norm": 0.9303669929504395,
"learning_rate": 3.488619119878604e-06,
"loss": 0.1561,
"step": 2300
},
{
"epoch": 0.036421027072963456,
"grad_norm": 1.3250867128372192,
"learning_rate": 3.6403641881638852e-06,
"loss": 0.157,
"step": 2400
},
{
"epoch": 0.03793856986767027,
"grad_norm": 0.671481192111969,
"learning_rate": 3.7921092564491657e-06,
"loss": 0.1539,
"step": 2500
},
{
"epoch": 0.03945611266237708,
"grad_norm": 1.0448344945907593,
"learning_rate": 3.9438543247344466e-06,
"loss": 0.1513,
"step": 2600
},
{
"epoch": 0.04097365545708389,
"grad_norm": 0.7382568717002869,
"learning_rate": 4.0955993930197274e-06,
"loss": 0.1515,
"step": 2700
},
{
"epoch": 0.0424911982517907,
"grad_norm": 0.9559106230735779,
"learning_rate": 4.247344461305008e-06,
"loss": 0.1579,
"step": 2800
},
{
"epoch": 0.04400874104649751,
"grad_norm": 1.2689694166183472,
"learning_rate": 4.399089529590288e-06,
"loss": 0.1544,
"step": 2900
},
{
"epoch": 0.04552628384120432,
"grad_norm": 1.0517489910125732,
"learning_rate": 4.55083459787557e-06,
"loss": 0.1558,
"step": 3000
},
{
"epoch": 0.04704382663591113,
"grad_norm": 0.9564648270606995,
"learning_rate": 4.70257966616085e-06,
"loss": 0.1593,
"step": 3100
},
{
"epoch": 0.048561369430617944,
"grad_norm": 5.6809797286987305,
"learning_rate": 4.854324734446131e-06,
"loss": 0.1584,
"step": 3200
},
{
"epoch": 0.050078912225324755,
"grad_norm": 0.6552232503890991,
"learning_rate": 5.006069802731411e-06,
"loss": 0.1547,
"step": 3300
},
{
"epoch": 0.051596455020031566,
"grad_norm": 1.9638985395431519,
"learning_rate": 5.157814871016692e-06,
"loss": 0.1518,
"step": 3400
},
{
"epoch": 0.05311399781473838,
"grad_norm": 1.733556866645813,
"learning_rate": 5.309559939301974e-06,
"loss": 0.1608,
"step": 3500
},
{
"epoch": 0.05463154060944519,
"grad_norm": 2.305605173110962,
"learning_rate": 5.4613050075872545e-06,
"loss": 0.1567,
"step": 3600
},
{
"epoch": 0.056149083404152,
"grad_norm": 1.0986473560333252,
"learning_rate": 5.6130500758725345e-06,
"loss": 0.1621,
"step": 3700
},
{
"epoch": 0.05766662619885881,
"grad_norm": 1.204899549484253,
"learning_rate": 5.764795144157815e-06,
"loss": 0.1612,
"step": 3800
},
{
"epoch": 0.05918416899356562,
"grad_norm": 2.068692684173584,
"learning_rate": 5.916540212443096e-06,
"loss": 0.1612,
"step": 3900
},
{
"epoch": 0.06070171178827243,
"grad_norm": 1.4351601600646973,
"learning_rate": 6.068285280728376e-06,
"loss": 0.1601,
"step": 4000
},
{
"epoch": 0.06221925458297924,
"grad_norm": 0.6356476545333862,
"learning_rate": 6.220030349013657e-06,
"loss": 0.1626,
"step": 4100
},
{
"epoch": 0.06373679737768605,
"grad_norm": 1.5154367685317993,
"learning_rate": 6.371775417298939e-06,
"loss": 0.1613,
"step": 4200
},
{
"epoch": 0.06525434017239286,
"grad_norm": 3.8945472240448,
"learning_rate": 6.52352048558422e-06,
"loss": 0.1595,
"step": 4300
},
{
"epoch": 0.06677188296709967,
"grad_norm": 3.9455080032348633,
"learning_rate": 6.6752655538695e-06,
"loss": 0.1573,
"step": 4400
},
{
"epoch": 0.06828942576180648,
"grad_norm": 0.725102961063385,
"learning_rate": 6.827010622154781e-06,
"loss": 0.1589,
"step": 4500
},
{
"epoch": 0.06980696855651329,
"grad_norm": 0.5262890458106995,
"learning_rate": 6.978755690440061e-06,
"loss": 0.1612,
"step": 4600
},
{
"epoch": 0.0713245113512201,
"grad_norm": 0.8161805272102356,
"learning_rate": 7.130500758725342e-06,
"loss": 0.1571,
"step": 4700
},
{
"epoch": 0.07284205414592691,
"grad_norm": 0.5373992323875427,
"learning_rate": 7.2822458270106225e-06,
"loss": 0.1623,
"step": 4800
},
{
"epoch": 0.07435959694063372,
"grad_norm": 0.9811096787452698,
"learning_rate": 7.433990895295904e-06,
"loss": 0.159,
"step": 4900
},
{
"epoch": 0.07587713973534053,
"grad_norm": 0.9945287108421326,
"learning_rate": 7.585735963581184e-06,
"loss": 0.1514,
"step": 5000
},
{
"epoch": 0.07739468253004735,
"grad_norm": 0.566162645816803,
"learning_rate": 7.737481031866465e-06,
"loss": 0.1606,
"step": 5100
},
{
"epoch": 0.07891222532475416,
"grad_norm": 0.9396035075187683,
"learning_rate": 7.889226100151746e-06,
"loss": 0.1553,
"step": 5200
},
{
"epoch": 0.08042976811946097,
"grad_norm": 2.3921587467193604,
"learning_rate": 8.040971168437027e-06,
"loss": 0.1578,
"step": 5300
},
{
"epoch": 0.08194731091416778,
"grad_norm": 0.6013012528419495,
"learning_rate": 8.192716236722306e-06,
"loss": 0.1638,
"step": 5400
},
{
"epoch": 0.08346485370887459,
"grad_norm": 0.69361412525177,
"learning_rate": 8.344461305007589e-06,
"loss": 0.1607,
"step": 5500
},
{
"epoch": 0.0849823965035814,
"grad_norm": 10.008782386779785,
"learning_rate": 8.49620637329287e-06,
"loss": 0.1623,
"step": 5600
},
{
"epoch": 0.08649993929828821,
"grad_norm": 7.383462429046631,
"learning_rate": 8.64795144157815e-06,
"loss": 0.1609,
"step": 5700
},
{
"epoch": 0.08801748209299502,
"grad_norm": 0.6986634135246277,
"learning_rate": 8.79969650986343e-06,
"loss": 0.1631,
"step": 5800
},
{
"epoch": 0.08953502488770183,
"grad_norm": 0.8260175585746765,
"learning_rate": 8.95144157814871e-06,
"loss": 0.1641,
"step": 5900
},
{
"epoch": 0.09105256768240864,
"grad_norm": 1.675830364227295,
"learning_rate": 9.103186646433991e-06,
"loss": 0.1638,
"step": 6000
},
{
"epoch": 0.09257011047711546,
"grad_norm": 2.092184543609619,
"learning_rate": 9.254931714719272e-06,
"loss": 0.169,
"step": 6100
},
{
"epoch": 0.09408765327182227,
"grad_norm": 0.577240526676178,
"learning_rate": 9.406676783004553e-06,
"loss": 0.1621,
"step": 6200
},
{
"epoch": 0.09560519606652908,
"grad_norm": 2.742009162902832,
"learning_rate": 9.558421851289834e-06,
"loss": 0.1615,
"step": 6300
},
{
"epoch": 0.09712273886123589,
"grad_norm": 0.755526602268219,
"learning_rate": 9.710166919575115e-06,
"loss": 0.1657,
"step": 6400
},
{
"epoch": 0.0986402816559427,
"grad_norm": 0.8038458824157715,
"learning_rate": 9.861911987860396e-06,
"loss": 0.1636,
"step": 6500
},
{
"epoch": 0.10015782445064951,
"grad_norm": 0.8791661858558655,
"learning_rate": 9.99999943176563e-06,
"loss": 0.1645,
"step": 6600
},
{
"epoch": 0.10167536724535632,
"grad_norm": 0.9931176900863647,
"learning_rate": 9.999916652173913e-06,
"loss": 0.1727,
"step": 6700
},
{
"epoch": 0.10319291004006313,
"grad_norm": 0.6823338866233826,
"learning_rate": 9.999693570463897e-06,
"loss": 0.1689,
"step": 6800
},
{
"epoch": 0.10471045283476994,
"grad_norm": 0.7916462421417236,
"learning_rate": 9.999330192895455e-06,
"loss": 0.1637,
"step": 6900
},
{
"epoch": 0.10622799562947675,
"grad_norm": 0.6410049796104431,
"learning_rate": 9.998826529665285e-06,
"loss": 0.1731,
"step": 7000
},
{
"epoch": 0.10774553842418357,
"grad_norm": 1.0823462009429932,
"learning_rate": 9.998182594906624e-06,
"loss": 0.1673,
"step": 7100
},
{
"epoch": 0.10926308121889038,
"grad_norm": 1.3753924369812012,
"learning_rate": 9.997398406688858e-06,
"loss": 0.1625,
"step": 7200
},
{
"epoch": 0.11078062401359719,
"grad_norm": 2.6931698322296143,
"learning_rate": 9.996473987017008e-06,
"loss": 0.167,
"step": 7300
},
{
"epoch": 0.112298166808304,
"grad_norm": 1.7609068155288696,
"learning_rate": 9.995409361831112e-06,
"loss": 0.1645,
"step": 7400
},
{
"epoch": 0.11381570960301081,
"grad_norm": 0.9486920833587646,
"learning_rate": 9.994204561005502e-06,
"loss": 0.1663,
"step": 7500
},
{
"epoch": 0.11533325239771762,
"grad_norm": 1.1171340942382812,
"learning_rate": 9.992859618347963e-06,
"loss": 0.165,
"step": 7600
},
{
"epoch": 0.11685079519242443,
"grad_norm": 1.054910659790039,
"learning_rate": 9.991374571598786e-06,
"loss": 0.1645,
"step": 7700
},
{
"epoch": 0.11836833798713124,
"grad_norm": 1.4396705627441406,
"learning_rate": 9.989749462429707e-06,
"loss": 0.1674,
"step": 7800
},
{
"epoch": 0.11988588078183805,
"grad_norm": 4.53713321685791,
"learning_rate": 9.987984336442738e-06,
"loss": 0.1621,
"step": 7900
},
{
"epoch": 0.12140342357654486,
"grad_norm": 1.5164715051651,
"learning_rate": 9.986079243168885e-06,
"loss": 0.1658,
"step": 8000
},
{
"epoch": 0.12292096637125167,
"grad_norm": 0.6809989213943481,
"learning_rate": 9.984034236066764e-06,
"loss": 0.168,
"step": 8100
},
{
"epoch": 0.12443850916595849,
"grad_norm": 1.363065481185913,
"learning_rate": 9.981849372521101e-06,
"loss": 0.1611,
"step": 8200
},
{
"epoch": 0.12595605196066528,
"grad_norm": 1.713124394416809,
"learning_rate": 9.979524713841111e-06,
"loss": 0.1592,
"step": 8300
},
{
"epoch": 0.1274735947553721,
"grad_norm": 3.671640157699585,
"learning_rate": 9.97706032525879e-06,
"loss": 0.1787,
"step": 8400
},
{
"epoch": 0.1289911375500789,
"grad_norm": 0.6882109045982361,
"learning_rate": 9.97445627592708e-06,
"loss": 0.1663,
"step": 8500
},
{
"epoch": 0.13050868034478572,
"grad_norm": 0.4362512230873108,
"learning_rate": 9.971712638917924e-06,
"loss": 0.1629,
"step": 8600
},
{
"epoch": 0.13202622313949253,
"grad_norm": 0.7280552983283997,
"learning_rate": 9.968829491220221e-06,
"loss": 0.1667,
"step": 8700
},
{
"epoch": 0.13354376593419934,
"grad_norm": 0.7983876466751099,
"learning_rate": 9.965806913737671e-06,
"loss": 0.1656,
"step": 8800
},
{
"epoch": 0.13506130872890615,
"grad_norm": 0.5041903853416443,
"learning_rate": 9.962644991286487e-06,
"loss": 0.1669,
"step": 8900
},
{
"epoch": 0.13657885152361296,
"grad_norm": 4.93981409072876,
"learning_rate": 9.959343812593037e-06,
"loss": 0.1672,
"step": 9000
},
{
"epoch": 0.13809639431831977,
"grad_norm": 1.671231746673584,
"learning_rate": 9.955903470291331e-06,
"loss": 0.1737,
"step": 9100
},
{
"epoch": 0.13961393711302658,
"grad_norm": 1.0303494930267334,
"learning_rate": 9.952324060920446e-06,
"loss": 0.173,
"step": 9200
},
{
"epoch": 0.1411314799077334,
"grad_norm": 1.154575228691101,
"learning_rate": 9.948605684921799e-06,
"loss": 0.1704,
"step": 9300
},
{
"epoch": 0.1426490227024402,
"grad_norm": 2.287119150161743,
"learning_rate": 9.944748446636334e-06,
"loss": 0.1644,
"step": 9400
},
{
"epoch": 0.14416656549714701,
"grad_norm": 0.7774553298950195,
"learning_rate": 9.940752454301597e-06,
"loss": 0.1714,
"step": 9500
},
{
"epoch": 0.14568410829185383,
"grad_norm": 0.48550257086753845,
"learning_rate": 9.936617820048692e-06,
"loss": 0.1615,
"step": 9600
},
{
"epoch": 0.14720165108656064,
"grad_norm": 0.9614207744598389,
"learning_rate": 9.932344659899146e-06,
"loss": 0.1674,
"step": 9700
},
{
"epoch": 0.14871919388126745,
"grad_norm": 0.6777101755142212,
"learning_rate": 9.927933093761638e-06,
"loss": 0.1704,
"step": 9800
},
{
"epoch": 0.15023673667597426,
"grad_norm": 1.3050462007522583,
"learning_rate": 9.923383245428651e-06,
"loss": 0.161,
"step": 9900
},
{
"epoch": 0.15175427947068107,
"grad_norm": 0.702738344669342,
"learning_rate": 9.91869524257298e-06,
"loss": 0.1684,
"step": 10000
},
{
"epoch": 0.15327182226538788,
"grad_norm": 0.5779732465744019,
"learning_rate": 9.91386921674417e-06,
"loss": 0.1681,
"step": 10100
},
{
"epoch": 0.1547893650600947,
"grad_norm": 1.1301337480545044,
"learning_rate": 9.9089053033648e-06,
"loss": 0.1715,
"step": 10200
},
{
"epoch": 0.1563069078548015,
"grad_norm": 0.9821518063545227,
"learning_rate": 9.903803641726713e-06,
"loss": 0.1747,
"step": 10300
},
{
"epoch": 0.1578244506495083,
"grad_norm": 2.881585121154785,
"learning_rate": 9.898564374987075e-06,
"loss": 0.163,
"step": 10400
},
{
"epoch": 0.15934199344421512,
"grad_norm": 1.2632317543029785,
"learning_rate": 9.893187650164384e-06,
"loss": 0.1677,
"step": 10500
},
{
"epoch": 0.16085953623892194,
"grad_norm": 0.5543649792671204,
"learning_rate": 9.887673618134333e-06,
"loss": 0.164,
"step": 10600
},
{
"epoch": 0.16237707903362875,
"grad_norm": 0.6111209988594055,
"learning_rate": 9.882022433625574e-06,
"loss": 0.1584,
"step": 10700
},
{
"epoch": 0.16389462182833556,
"grad_norm": 1.4198076725006104,
"learning_rate": 9.876234255215383e-06,
"loss": 0.1699,
"step": 10800
},
{
"epoch": 0.16541216462304237,
"grad_norm": 0.710670530796051,
"learning_rate": 9.870309245325206e-06,
"loss": 0.1638,
"step": 10900
},
{
"epoch": 0.16692970741774918,
"grad_norm": 0.851134181022644,
"learning_rate": 9.864247570216102e-06,
"loss": 0.1709,
"step": 11000
},
{
"epoch": 0.168447250212456,
"grad_norm": 2.1078672409057617,
"learning_rate": 9.858049399984076e-06,
"loss": 0.1621,
"step": 11100
},
{
"epoch": 0.1699647930071628,
"grad_norm": 1.062860369682312,
"learning_rate": 9.851714908555313e-06,
"loss": 0.1675,
"step": 11200
},
{
"epoch": 0.1714823358018696,
"grad_norm": 0.6492528319358826,
"learning_rate": 9.845244273681287e-06,
"loss": 0.1663,
"step": 11300
},
{
"epoch": 0.17299987859657642,
"grad_norm": 0.5542171001434326,
"learning_rate": 9.838637676933782e-06,
"loss": 0.1616,
"step": 11400
},
{
"epoch": 0.17451742139128323,
"grad_norm": 1.5355699062347412,
"learning_rate": 9.831895303699792e-06,
"loss": 0.171,
"step": 11500
},
{
"epoch": 0.17603496418599004,
"grad_norm": 0.8221864104270935,
"learning_rate": 9.82501734317632e-06,
"loss": 0.1671,
"step": 11600
},
{
"epoch": 0.17755250698069686,
"grad_norm": 9.875476837158203,
"learning_rate": 9.818003988365068e-06,
"loss": 0.1668,
"step": 11700
},
{
"epoch": 0.17907004977540367,
"grad_norm": 1.923878788948059,
"learning_rate": 9.810855436067027e-06,
"loss": 0.1743,
"step": 11800
},
{
"epoch": 0.18058759257011048,
"grad_norm": 1.0459812879562378,
"learning_rate": 9.803571886876943e-06,
"loss": 0.1718,
"step": 11900
},
{
"epoch": 0.1821051353648173,
"grad_norm": 1.3900260925292969,
"learning_rate": 9.7961535451777e-06,
"loss": 0.1706,
"step": 12000
},
{
"epoch": 0.1836226781595241,
"grad_norm": 0.52808678150177,
"learning_rate": 9.788600619134582e-06,
"loss": 0.1704,
"step": 12100
},
{
"epoch": 0.1851402209542309,
"grad_norm": 3.0196077823638916,
"learning_rate": 9.780913320689425e-06,
"loss": 0.1723,
"step": 12200
},
{
"epoch": 0.18665776374893772,
"grad_norm": 0.9358514547348022,
"learning_rate": 9.773091865554673e-06,
"loss": 0.1627,
"step": 12300
},
{
"epoch": 0.18817530654364453,
"grad_norm": 0.5178400874137878,
"learning_rate": 9.765136473207335e-06,
"loss": 0.1669,
"step": 12400
},
{
"epoch": 0.18969284933835134,
"grad_norm": 0.7377560138702393,
"learning_rate": 9.757047366882807e-06,
"loss": 0.1691,
"step": 12500
},
{
"epoch": 0.19121039213305815,
"grad_norm": 0.4387303292751312,
"learning_rate": 9.748824773568626e-06,
"loss": 0.1676,
"step": 12600
},
{
"epoch": 0.19272793492776497,
"grad_norm": 0.7734019756317139,
"learning_rate": 9.740468923998088e-06,
"loss": 0.166,
"step": 12700
},
{
"epoch": 0.19424547772247178,
"grad_norm": 5.757879257202148,
"learning_rate": 9.731980052643782e-06,
"loss": 0.1619,
"step": 12800
},
{
"epoch": 0.1957630205171786,
"grad_norm": 2.1326065063476562,
"learning_rate": 9.723358397711004e-06,
"loss": 0.1646,
"step": 12900
},
{
"epoch": 0.1972805633118854,
"grad_norm": 2.209416151046753,
"learning_rate": 9.71460420113108e-06,
"loss": 0.1695,
"step": 13000
},
{
"epoch": 0.1987981061065922,
"grad_norm": 2.3190994262695312,
"learning_rate": 9.705717708554567e-06,
"loss": 0.1668,
"step": 13100
},
{
"epoch": 0.20031564890129902,
"grad_norm": 1.0803442001342773,
"learning_rate": 9.69669916934437e-06,
"loss": 0.1644,
"step": 13200
},
{
"epoch": 0.20183319169600583,
"grad_norm": 1.2388124465942383,
"learning_rate": 9.687548836568736e-06,
"loss": 0.1688,
"step": 13300
},
{
"epoch": 0.20335073449071264,
"grad_norm": 3.6386444568634033,
"learning_rate": 9.678266966994163e-06,
"loss": 0.1616,
"step": 13400
},
{
"epoch": 0.20486827728541945,
"grad_norm": 0.5421572327613831,
"learning_rate": 9.668853821078184e-06,
"loss": 0.1668,
"step": 13500
},
{
"epoch": 0.20638582008012626,
"grad_norm": 0.970447838306427,
"learning_rate": 9.659309662962061e-06,
"loss": 0.168,
"step": 13600
},
{
"epoch": 0.20790336287483308,
"grad_norm": 0.7634482383728027,
"learning_rate": 9.649634760463383e-06,
"loss": 0.1631,
"step": 13700
},
{
"epoch": 0.2094209056695399,
"grad_norm": 0.7025083899497986,
"learning_rate": 9.639829385068538e-06,
"loss": 0.1607,
"step": 13800
},
{
"epoch": 0.2109384484642467,
"grad_norm": 0.6824074983596802,
"learning_rate": 9.6298938119251e-06,
"loss": 0.1611,
"step": 13900
},
{
"epoch": 0.2124559912589535,
"grad_norm": 0.5447623133659363,
"learning_rate": 9.619828319834105e-06,
"loss": 0.173,
"step": 14000
},
{
"epoch": 0.21397353405366032,
"grad_norm": 0.7672458291053772,
"learning_rate": 9.609633191242239e-06,
"loss": 0.1731,
"step": 14100
},
{
"epoch": 0.21549107684836713,
"grad_norm": 4.314092636108398,
"learning_rate": 9.599308712233895e-06,
"loss": 0.1681,
"step": 14200
},
{
"epoch": 0.21700861964307394,
"grad_norm": 0.6085025072097778,
"learning_rate": 9.588855172523157e-06,
"loss": 0.1721,
"step": 14300
},
{
"epoch": 0.21852616243778075,
"grad_norm": 2.6311752796173096,
"learning_rate": 9.578272865445671e-06,
"loss": 0.1637,
"step": 14400
},
{
"epoch": 0.22004370523248756,
"grad_norm": 0.9000397324562073,
"learning_rate": 9.567562087950403e-06,
"loss": 0.1656,
"step": 14500
},
{
"epoch": 0.22156124802719437,
"grad_norm": 0.8496785163879395,
"learning_rate": 9.55672314059132e-06,
"loss": 0.1688,
"step": 14600
},
{
"epoch": 0.22307879082190119,
"grad_norm": 1.8966537714004517,
"learning_rate": 9.545756327518947e-06,
"loss": 0.1721,
"step": 14700
},
{
"epoch": 0.224596333616608,
"grad_norm": 0.5740467309951782,
"learning_rate": 9.534661956471834e-06,
"loss": 0.162,
"step": 14800
},
{
"epoch": 0.2261138764113148,
"grad_norm": 0.6003543138504028,
"learning_rate": 9.523440338767922e-06,
"loss": 0.1747,
"step": 14900
},
{
"epoch": 0.22763141920602162,
"grad_norm": 2.5062880516052246,
"learning_rate": 9.512091789295807e-06,
"loss": 0.1693,
"step": 15000
},
{
"epoch": 0.22914896200072843,
"grad_norm": 0.5050596594810486,
"learning_rate": 9.500616626505906e-06,
"loss": 0.1648,
"step": 15100
},
{
"epoch": 0.23066650479543524,
"grad_norm": 0.9980542659759521,
"learning_rate": 9.489015172401511e-06,
"loss": 0.1665,
"step": 15200
},
{
"epoch": 0.23218404759014205,
"grad_norm": 0.49514228105545044,
"learning_rate": 9.477287752529772e-06,
"loss": 0.1648,
"step": 15300
},
{
"epoch": 0.23370159038484886,
"grad_norm": 0.6534783244132996,
"learning_rate": 9.46543469597254e-06,
"loss": 0.1676,
"step": 15400
},
{
"epoch": 0.23521913317955567,
"grad_norm": 1.3305740356445312,
"learning_rate": 9.45345633533715e-06,
"loss": 0.1747,
"step": 15500
},
{
"epoch": 0.23673667597426248,
"grad_norm": 1.0036684274673462,
"learning_rate": 9.44135300674708e-06,
"loss": 0.1719,
"step": 15600
},
{
"epoch": 0.2382542187689693,
"grad_norm": 1.052393913269043,
"learning_rate": 9.429125049832518e-06,
"loss": 0.1702,
"step": 15700
},
{
"epoch": 0.2397717615636761,
"grad_norm": 1.700551986694336,
"learning_rate": 9.416772807720835e-06,
"loss": 0.1642,
"step": 15800
},
{
"epoch": 0.24128930435838292,
"grad_norm": 1.1149799823760986,
"learning_rate": 9.404296627026959e-06,
"loss": 0.1707,
"step": 15900
},
{
"epoch": 0.24280684715308973,
"grad_norm": 0.7962595820426941,
"learning_rate": 9.391696857843638e-06,
"loss": 0.1688,
"step": 16000
},
{
"epoch": 0.24432438994779654,
"grad_norm": 2.124986171722412,
"learning_rate": 9.378973853731627e-06,
"loss": 0.1584,
"step": 16100
},
{
"epoch": 0.24584193274250335,
"grad_norm": 2.773843288421631,
"learning_rate": 9.366127971709764e-06,
"loss": 0.168,
"step": 16200
},
{
"epoch": 0.24735947553721016,
"grad_norm": 0.8750647306442261,
"learning_rate": 9.353159572244953e-06,
"loss": 0.1677,
"step": 16300
},
{
"epoch": 0.24887701833191697,
"grad_norm": 1.1571807861328125,
"learning_rate": 9.340069019242038e-06,
"loss": 0.1729,
"step": 16400
},
{
"epoch": 0.2503945611266238,
"grad_norm": 0.78291255235672,
"learning_rate": 9.326856680033609e-06,
"loss": 0.1678,
"step": 16500
},
{
"epoch": 0.25191210392133057,
"grad_norm": 4.446779727935791,
"learning_rate": 9.313522925369678e-06,
"loss": 0.1672,
"step": 16600
},
{
"epoch": 0.2534296467160374,
"grad_norm": 0.4619388282299042,
"learning_rate": 9.300068129407292e-06,
"loss": 0.1663,
"step": 16700
},
{
"epoch": 0.2549471895107442,
"grad_norm": 0.7868841886520386,
"learning_rate": 9.286492669700016e-06,
"loss": 0.1681,
"step": 16800
},
{
"epoch": 0.256464732305451,
"grad_norm": 0.615048885345459,
"learning_rate": 9.272796927187353e-06,
"loss": 0.1686,
"step": 16900
},
{
"epoch": 0.2579822751001578,
"grad_norm": 0.76714688539505,
"learning_rate": 9.258981286184046e-06,
"loss": 0.1646,
"step": 17000
},
{
"epoch": 0.25949981789486465,
"grad_norm": 1.5852404832839966,
"learning_rate": 9.245046134369295e-06,
"loss": 0.1663,
"step": 17100
},
{
"epoch": 0.26101736068957143,
"grad_norm": 0.47872257232666016,
"learning_rate": 9.230991862775884e-06,
"loss": 0.1667,
"step": 17200
},
{
"epoch": 0.26253490348427827,
"grad_norm": 0.9261009097099304,
"learning_rate": 9.216818865779203e-06,
"loss": 0.1687,
"step": 17300
},
{
"epoch": 0.26405244627898505,
"grad_norm": 1.3889875411987305,
"learning_rate": 9.20252754108618e-06,
"loss": 0.1663,
"step": 17400
},
{
"epoch": 0.2655699890736919,
"grad_norm": 1.199637532234192,
"learning_rate": 9.188118289724127e-06,
"loss": 0.1561,
"step": 17500
},
{
"epoch": 0.2670875318683987,
"grad_norm": 2.0619025230407715,
"learning_rate": 9.17359151602948e-06,
"loss": 0.1658,
"step": 17600
},
{
"epoch": 0.2686050746631055,
"grad_norm": 0.5356110334396362,
"learning_rate": 9.158947627636462e-06,
"loss": 0.1579,
"step": 17700
},
{
"epoch": 0.2701226174578123,
"grad_norm": 0.7900151014328003,
"learning_rate": 9.144187035465631e-06,
"loss": 0.1696,
"step": 17800
},
{
"epoch": 0.27164016025251914,
"grad_norm": 0.6425641179084778,
"learning_rate": 9.129310153712365e-06,
"loss": 0.1702,
"step": 17900
},
{
"epoch": 0.2731577030472259,
"grad_norm": 0.46129781007766724,
"learning_rate": 9.114317399835225e-06,
"loss": 0.1662,
"step": 18000
},
{
"epoch": 0.27467524584193276,
"grad_norm": 0.8664289116859436,
"learning_rate": 9.099209194544248e-06,
"loss": 0.1646,
"step": 18100
},
{
"epoch": 0.27619278863663954,
"grad_norm": 2.408888339996338,
"learning_rate": 9.083985961789148e-06,
"loss": 0.1705,
"step": 18200
},
{
"epoch": 0.2777103314313464,
"grad_norm": 0.7840184569358826,
"learning_rate": 9.0686481287474e-06,
"loss": 0.1671,
"step": 18300
},
{
"epoch": 0.27922787422605316,
"grad_norm": 1.1906856298446655,
"learning_rate": 9.053196125812276e-06,
"loss": 0.1666,
"step": 18400
},
{
"epoch": 0.28074541702076,
"grad_norm": 0.7326360940933228,
"learning_rate": 9.037630386580752e-06,
"loss": 0.1694,
"step": 18500
},
{
"epoch": 0.2822629598154668,
"grad_norm": 1.0893489122390747,
"learning_rate": 9.021951347841344e-06,
"loss": 0.1643,
"step": 18600
},
{
"epoch": 0.2837805026101736,
"grad_norm": 0.863768994808197,
"learning_rate": 9.006159449561859e-06,
"loss": 0.1685,
"step": 18700
},
{
"epoch": 0.2852980454048804,
"grad_norm": 0.8099831938743591,
"learning_rate": 8.990255134877037e-06,
"loss": 0.1674,
"step": 18800
},
{
"epoch": 0.28681558819958725,
"grad_norm": 0.7958328723907471,
"learning_rate": 8.974238850076128e-06,
"loss": 0.1654,
"step": 18900
},
{
"epoch": 0.28833313099429403,
"grad_norm": 0.6013241410255432,
"learning_rate": 8.95811104459036e-06,
"loss": 0.1688,
"step": 19000
},
{
"epoch": 0.28985067378900087,
"grad_norm": 0.7762428522109985,
"learning_rate": 8.941872170980333e-06,
"loss": 0.1652,
"step": 19100
},
{
"epoch": 0.29136821658370765,
"grad_norm": 0.7196159958839417,
"learning_rate": 8.925522684923311e-06,
"loss": 0.1716,
"step": 19200
},
{
"epoch": 0.2928857593784145,
"grad_norm": 0.737194836139679,
"learning_rate": 8.909063045200454e-06,
"loss": 0.1534,
"step": 19300
},
{
"epoch": 0.2944033021731213,
"grad_norm": 0.6643932461738586,
"learning_rate": 8.892493713683918e-06,
"loss": 0.1689,
"step": 19400
},
{
"epoch": 0.2959208449678281,
"grad_norm": 0.5714944005012512,
"learning_rate": 8.875815155323923e-06,
"loss": 0.1698,
"step": 19500
},
{
"epoch": 0.2974383877625349,
"grad_norm": 0.9755032658576965,
"learning_rate": 8.85902783813568e-06,
"loss": 0.1688,
"step": 19600
},
{
"epoch": 0.29895593055724173,
"grad_norm": 0.7520804405212402,
"learning_rate": 8.842132233186272e-06,
"loss": 0.1678,
"step": 19700
},
{
"epoch": 0.3004734733519485,
"grad_norm": 1.14603853225708,
"learning_rate": 8.825128814581439e-06,
"loss": 0.1705,
"step": 19800
},
{
"epoch": 0.30199101614665536,
"grad_norm": 1.068724274635315,
"learning_rate": 8.808018059452264e-06,
"loss": 0.1694,
"step": 19900
},
{
"epoch": 0.30350855894136214,
"grad_norm": 1.168589472770691,
"learning_rate": 8.790800447941786e-06,
"loss": 0.1672,
"step": 20000
},
{
"epoch": 0.305026101736069,
"grad_norm": 0.6969729065895081,
"learning_rate": 8.773476463191533e-06,
"loss": 0.1626,
"step": 20100
},
{
"epoch": 0.30654364453077576,
"grad_norm": 0.7738513946533203,
"learning_rate": 8.756046591327963e-06,
"loss": 0.1665,
"step": 20200
},
{
"epoch": 0.3080611873254826,
"grad_norm": 0.5603029131889343,
"learning_rate": 8.738511321448815e-06,
"loss": 0.1724,
"step": 20300
},
{
"epoch": 0.3095787301201894,
"grad_norm": 0.8661625981330872,
"learning_rate": 8.720871145609394e-06,
"loss": 0.1675,
"step": 20400
},
{
"epoch": 0.3110962729148962,
"grad_norm": 1.929918885231018,
"learning_rate": 8.70312655880876e-06,
"loss": 0.163,
"step": 20500
},
{
"epoch": 0.312613815709603,
"grad_norm": 0.903670608997345,
"learning_rate": 8.685278058975832e-06,
"loss": 0.1675,
"step": 20600
},
{
"epoch": 0.31413135850430984,
"grad_norm": 0.876111626625061,
"learning_rate": 8.667326146955431e-06,
"loss": 0.1722,
"step": 20700
},
{
"epoch": 0.3156489012990166,
"grad_norm": 0.5705169439315796,
"learning_rate": 8.649271326494209e-06,
"loss": 0.1605,
"step": 20800
},
{
"epoch": 0.31716644409372347,
"grad_norm": 0.7466210722923279,
"learning_rate": 8.631114104226523e-06,
"loss": 0.165,
"step": 20900
},
{
"epoch": 0.31868398688843025,
"grad_norm": 0.6578019261360168,
"learning_rate": 8.612854989660215e-06,
"loss": 0.1665,
"step": 21000
},
{
"epoch": 0.3202015296831371,
"grad_norm": 0.5048807263374329,
"learning_rate": 8.594494495162317e-06,
"loss": 0.1638,
"step": 21100
},
{
"epoch": 0.32171907247784387,
"grad_norm": 0.7712035179138184,
"learning_rate": 8.576033135944674e-06,
"loss": 0.1671,
"step": 21200
},
{
"epoch": 0.3232366152725507,
"grad_norm": 0.49530839920043945,
"learning_rate": 8.557471430049476e-06,
"loss": 0.1648,
"step": 21300
},
{
"epoch": 0.3247541580672575,
"grad_norm": 1.7516402006149292,
"learning_rate": 8.538809898334743e-06,
"loss": 0.1682,
"step": 21400
},
{
"epoch": 0.32627170086196433,
"grad_norm": 0.9786944389343262,
"learning_rate": 8.520049064459687e-06,
"loss": 0.1674,
"step": 21500
},
{
"epoch": 0.3277892436566711,
"grad_norm": 0.8049986362457275,
"learning_rate": 8.50118945487003e-06,
"loss": 0.1584,
"step": 21600
},
{
"epoch": 0.32930678645137795,
"grad_norm": 1.1162434816360474,
"learning_rate": 8.482231598783231e-06,
"loss": 0.1638,
"step": 21700
},
{
"epoch": 0.33082432924608474,
"grad_norm": 1.419999361038208,
"learning_rate": 8.463176028173632e-06,
"loss": 0.16,
"step": 21800
},
{
"epoch": 0.3323418720407916,
"grad_norm": 0.6773690581321716,
"learning_rate": 8.444023277757527e-06,
"loss": 0.162,
"step": 21900
},
{
"epoch": 0.33385941483549836,
"grad_norm": 0.5430874824523926,
"learning_rate": 8.424773884978169e-06,
"loss": 0.1581,
"step": 22000
},
{
"epoch": 0.3353769576302052,
"grad_norm": 1.547601580619812,
"learning_rate": 8.405428389990678e-06,
"loss": 0.1635,
"step": 22100
},
{
"epoch": 0.336894500424912,
"grad_norm": 2.0339860916137695,
"learning_rate": 8.385987335646889e-06,
"loss": 0.1725,
"step": 22200
},
{
"epoch": 0.3384120432196188,
"grad_norm": 1.0022940635681152,
"learning_rate": 8.366451267480114e-06,
"loss": 0.1634,
"step": 22300
},
{
"epoch": 0.3399295860143256,
"grad_norm": 2.0224385261535645,
"learning_rate": 8.346820733689845e-06,
"loss": 0.1657,
"step": 22400
},
{
"epoch": 0.34144712880903244,
"grad_norm": 0.5869084000587463,
"learning_rate": 8.327096285126356e-06,
"loss": 0.1696,
"step": 22500
},
{
"epoch": 0.3429646716037392,
"grad_norm": 0.8519582152366638,
"learning_rate": 8.307278475275258e-06,
"loss": 0.1627,
"step": 22600
},
{
"epoch": 0.34448221439844606,
"grad_norm": 0.4737469255924225,
"learning_rate": 8.287367860241961e-06,
"loss": 0.1669,
"step": 22700
},
{
"epoch": 0.34599975719315285,
"grad_norm": 1.4706870317459106,
"learning_rate": 8.267364998736073e-06,
"loss": 0.1681,
"step": 22800
},
{
"epoch": 0.3475172999878597,
"grad_norm": 0.866769015789032,
"learning_rate": 8.247270452055718e-06,
"loss": 0.16,
"step": 22900
},
{
"epoch": 0.34903484278256647,
"grad_norm": 1.338789939880371,
"learning_rate": 8.227084784071786e-06,
"loss": 0.1616,
"step": 23000
},
{
"epoch": 0.3505523855772733,
"grad_norm": 1.5837043523788452,
"learning_rate": 8.206808561212119e-06,
"loss": 0.1626,
"step": 23100
},
{
"epoch": 0.3520699283719801,
"grad_norm": 2.0691418647766113,
"learning_rate": 8.1864423524456e-06,
"loss": 0.168,
"step": 23200
},
{
"epoch": 0.3535874711666869,
"grad_norm": 0.9734016060829163,
"learning_rate": 8.165986729266207e-06,
"loss": 0.1643,
"step": 23300
},
{
"epoch": 0.3551050139613937,
"grad_norm": 0.6484026312828064,
"learning_rate": 8.14544226567696e-06,
"loss": 0.1642,
"step": 23400
},
{
"epoch": 0.3566225567561005,
"grad_norm": 0.8267654180526733,
"learning_rate": 8.124809538173816e-06,
"loss": 0.1702,
"step": 23500
},
{
"epoch": 0.35814009955080733,
"grad_norm": 1.2854251861572266,
"learning_rate": 8.104089125729509e-06,
"loss": 0.1714,
"step": 23600
},
{
"epoch": 0.3596576423455141,
"grad_norm": 1.147830605506897,
"learning_rate": 8.083281609777278e-06,
"loss": 0.1622,
"step": 23700
},
{
"epoch": 0.36117518514022096,
"grad_norm": 7.396162509918213,
"learning_rate": 8.06238757419457e-06,
"loss": 0.1746,
"step": 23800
},
{
"epoch": 0.36269272793492774,
"grad_norm": 0.7266018390655518,
"learning_rate": 8.041407605286647e-06,
"loss": 0.1623,
"step": 23900
},
{
"epoch": 0.3642102707296346,
"grad_norm": 0.3472922444343567,
"learning_rate": 8.020342291770143e-06,
"loss": 0.16,
"step": 24000
},
{
"epoch": 0.36572781352434136,
"grad_norm": 0.5582528114318848,
"learning_rate": 7.99919222475653e-06,
"loss": 0.1632,
"step": 24100
},
{
"epoch": 0.3672453563190482,
"grad_norm": 1.1052249670028687,
"learning_rate": 7.977957997735541e-06,
"loss": 0.1628,
"step": 24200
},
{
"epoch": 0.368762899113755,
"grad_norm": 0.760474443435669,
"learning_rate": 7.956640206558517e-06,
"loss": 0.1673,
"step": 24300
},
{
"epoch": 0.3702804419084618,
"grad_norm": 0.8195217251777649,
"learning_rate": 7.935239449421684e-06,
"loss": 0.1665,
"step": 24400
},
{
"epoch": 0.3717979847031686,
"grad_norm": 1.5612919330596924,
"learning_rate": 7.913756326849359e-06,
"loss": 0.1685,
"step": 24500
},
{
"epoch": 0.37331552749787544,
"grad_norm": 1.6288279294967651,
"learning_rate": 7.892191441677115e-06,
"loss": 0.1527,
"step": 24600
},
{
"epoch": 0.3748330702925822,
"grad_norm": 0.48679786920547485,
"learning_rate": 7.870545399034853e-06,
"loss": 0.1608,
"step": 24700
},
{
"epoch": 0.37635061308728907,
"grad_norm": 0.5854870676994324,
"learning_rate": 7.848818806329825e-06,
"loss": 0.1638,
"step": 24800
},
{
"epoch": 0.37786815588199585,
"grad_norm": 1.408368468284607,
"learning_rate": 7.82701227322959e-06,
"loss": 0.1616,
"step": 24900
},
{
"epoch": 0.3793856986767027,
"grad_norm": 0.739921510219574,
"learning_rate": 7.805126411644907e-06,
"loss": 0.1608,
"step": 25000
},
{
"epoch": 0.38090324147140947,
"grad_norm": 0.7832688093185425,
"learning_rate": 7.78316183571256e-06,
"loss": 0.1675,
"step": 25100
},
{
"epoch": 0.3824207842661163,
"grad_norm": 1.3807283639907837,
"learning_rate": 7.761119161778129e-06,
"loss": 0.1639,
"step": 25200
},
{
"epoch": 0.3839383270608231,
"grad_norm": 3.3557193279266357,
"learning_rate": 7.738999008378695e-06,
"loss": 0.1696,
"step": 25300
},
{
"epoch": 0.38545586985552993,
"grad_norm": 1.0659217834472656,
"learning_rate": 7.71680199622548e-06,
"loss": 0.1661,
"step": 25400
},
{
"epoch": 0.3869734126502367,
"grad_norm": 1.3830986022949219,
"learning_rate": 7.694528748186432e-06,
"loss": 0.1564,
"step": 25500
},
{
"epoch": 0.38849095544494355,
"grad_norm": 0.6899144053459167,
"learning_rate": 7.672179889268748e-06,
"loss": 0.1693,
"step": 25600
},
{
"epoch": 0.39000849823965034,
"grad_norm": 0.9374479055404663,
"learning_rate": 7.649756046601327e-06,
"loss": 0.1668,
"step": 25700
},
{
"epoch": 0.3915260410343572,
"grad_norm": 0.6372075080871582,
"learning_rate": 7.627257849417188e-06,
"loss": 0.1597,
"step": 25800
},
{
"epoch": 0.39304358382906396,
"grad_norm": 0.5880036354064941,
"learning_rate": 7.604685929035798e-06,
"loss": 0.162,
"step": 25900
},
{
"epoch": 0.3945611266237708,
"grad_norm": 0.8676182627677917,
"learning_rate": 7.582040918845362e-06,
"loss": 0.1676,
"step": 26000
},
{
"epoch": 0.3960786694184776,
"grad_norm": 0.9364919066429138,
"learning_rate": 7.559323454285055e-06,
"loss": 0.1597,
"step": 26100
},
{
"epoch": 0.3975962122131844,
"grad_norm": 0.6055238842964172,
"learning_rate": 7.53653417282718e-06,
"loss": 0.1593,
"step": 26200
},
{
"epoch": 0.3991137550078912,
"grad_norm": 1.3126447200775146,
"learning_rate": 7.513673713959293e-06,
"loss": 0.1678,
"step": 26300
},
{
"epoch": 0.40063129780259804,
"grad_norm": 0.8760477900505066,
"learning_rate": 7.490742719166248e-06,
"loss": 0.1669,
"step": 26400
},
{
"epoch": 0.4021488405973048,
"grad_norm": 0.8734946846961975,
"learning_rate": 7.467741831912199e-06,
"loss": 0.1672,
"step": 26500
},
{
"epoch": 0.40366638339201166,
"grad_norm": 0.9851352572441101,
"learning_rate": 7.444671697622544e-06,
"loss": 0.1692,
"step": 26600
},
{
"epoch": 0.40518392618671845,
"grad_norm": 1.0815508365631104,
"learning_rate": 7.42153296366582e-06,
"loss": 0.1682,
"step": 26700
},
{
"epoch": 0.4067014689814253,
"grad_norm": 0.7282068133354187,
"learning_rate": 7.398326279335525e-06,
"loss": 0.1571,
"step": 26800
},
{
"epoch": 0.40821901177613207,
"grad_norm": 1.295764684677124,
"learning_rate": 7.37505229583191e-06,
"loss": 0.1635,
"step": 26900
},
{
"epoch": 0.4097365545708389,
"grad_norm": 0.6792795062065125,
"learning_rate": 7.351711666243699e-06,
"loss": 0.1622,
"step": 27000
},
{
"epoch": 0.4112540973655457,
"grad_norm": 0.6477678418159485,
"learning_rate": 7.328305045529764e-06,
"loss": 0.1632,
"step": 27100
},
{
"epoch": 0.41277164016025253,
"grad_norm": 0.544188380241394,
"learning_rate": 7.304833090500749e-06,
"loss": 0.1687,
"step": 27200
},
{
"epoch": 0.4142891829549593,
"grad_norm": 0.7626290917396545,
"learning_rate": 7.281296459800634e-06,
"loss": 0.1623,
"step": 27300
},
{
"epoch": 0.41580672574966615,
"grad_norm": 0.6300278902053833,
"learning_rate": 7.257695813888257e-06,
"loss": 0.164,
"step": 27400
},
{
"epoch": 0.41732426854437293,
"grad_norm": 0.5807547569274902,
"learning_rate": 7.2340318150187825e-06,
"loss": 0.155,
"step": 27500
},
{
"epoch": 0.4188418113390798,
"grad_norm": 0.816822350025177,
"learning_rate": 7.210305127225112e-06,
"loss": 0.1626,
"step": 27600
},
{
"epoch": 0.42035935413378656,
"grad_norm": 0.9591571092605591,
"learning_rate": 7.186516416299255e-06,
"loss": 0.1672,
"step": 27700
},
{
"epoch": 0.4218768969284934,
"grad_norm": 0.5725838541984558,
"learning_rate": 7.162666349773647e-06,
"loss": 0.1613,
"step": 27800
},
{
"epoch": 0.4233944397232002,
"grad_norm": 0.8374956846237183,
"learning_rate": 7.138755596902415e-06,
"loss": 0.1686,
"step": 27900
},
{
"epoch": 0.424911982517907,
"grad_norm": 0.8108429908752441,
"learning_rate": 7.1147848286425995e-06,
"loss": 0.1657,
"step": 28000
},
{
"epoch": 0.4264295253126138,
"grad_norm": 0.5929946899414062,
"learning_rate": 7.090754717635325e-06,
"loss": 0.1595,
"step": 28100
},
{
"epoch": 0.42794706810732064,
"grad_norm": 0.7109673023223877,
"learning_rate": 7.066665938186926e-06,
"loss": 0.1606,
"step": 28200
},
{
"epoch": 0.4294646109020274,
"grad_norm": 1.7905889749526978,
"learning_rate": 7.04251916625003e-06,
"loss": 0.1724,
"step": 28300
},
{
"epoch": 0.43098215369673426,
"grad_norm": 0.7144661545753479,
"learning_rate": 7.018315079404584e-06,
"loss": 0.1628,
"step": 28400
},
{
"epoch": 0.43249969649144104,
"grad_norm": 3.669461965560913,
"learning_rate": 6.994054356838835e-06,
"loss": 0.1596,
"step": 28500
},
{
"epoch": 0.4340172392861479,
"grad_norm": 0.5240535140037537,
"learning_rate": 6.969737679330291e-06,
"loss": 0.1599,
"step": 28600
},
{
"epoch": 0.43553478208085467,
"grad_norm": 0.728648841381073,
"learning_rate": 6.945365729226594e-06,
"loss": 0.1659,
"step": 28700
},
{
"epoch": 0.4370523248755615,
"grad_norm": 1.0085320472717285,
"learning_rate": 6.920939190426392e-06,
"loss": 0.1618,
"step": 28800
},
{
"epoch": 0.4385698676702683,
"grad_norm": 0.7288528084754944,
"learning_rate": 6.89645874836014e-06,
"loss": 0.1629,
"step": 28900
},
{
"epoch": 0.4400874104649751,
"grad_norm": 0.5646519660949707,
"learning_rate": 6.871925089970861e-06,
"loss": 0.1641,
"step": 29000
},
{
"epoch": 0.4416049532596819,
"grad_norm": 3.171855926513672,
"learning_rate": 6.847338903694882e-06,
"loss": 0.1657,
"step": 29100
},
{
"epoch": 0.44312249605438875,
"grad_norm": 0.8458850979804993,
"learning_rate": 6.8227008794425055e-06,
"loss": 0.1638,
"step": 29200
},
{
"epoch": 0.44464003884909553,
"grad_norm": 0.879671037197113,
"learning_rate": 6.798011708578655e-06,
"loss": 0.1587,
"step": 29300
},
{
"epoch": 0.44615758164380237,
"grad_norm": 2.345825433731079,
"learning_rate": 6.773272083903475e-06,
"loss": 0.1654,
"step": 29400
},
{
"epoch": 0.44767512443850915,
"grad_norm": 0.6839026212692261,
"learning_rate": 6.748482699632884e-06,
"loss": 0.1659,
"step": 29500
},
{
"epoch": 0.449192667233216,
"grad_norm": 0.6057868003845215,
"learning_rate": 6.723644251379106e-06,
"loss": 0.1658,
"step": 29600
},
{
"epoch": 0.4507102100279228,
"grad_norm": 0.9070055484771729,
"learning_rate": 6.698757436131138e-06,
"loss": 0.1594,
"step": 29700
},
{
"epoch": 0.4522277528226296,
"grad_norm": 0.7737216353416443,
"learning_rate": 6.673822952235201e-06,
"loss": 0.1661,
"step": 29800
},
{
"epoch": 0.4537452956173364,
"grad_norm": 0.5271407961845398,
"learning_rate": 6.648841499375143e-06,
"loss": 0.1613,
"step": 29900
},
{
"epoch": 0.45526283841204324,
"grad_norm": 0.6933364868164062,
"learning_rate": 6.623813778552796e-06,
"loss": 0.1657,
"step": 30000
},
{
"epoch": 0.45526283841204324,
"eval_loss": 0.5074921250343323,
"eval_runtime": 989.6205,
"eval_samples_per_second": 50.321,
"eval_steps_per_second": 6.29,
"step": 30000
},
{
"epoch": 0.45678038120675,
"grad_norm": 0.5691549181938171,
"learning_rate": 6.59874049206832e-06,
"loss": 0.162,
"step": 30100
},
{
"epoch": 0.45829792400145686,
"grad_norm": 1.0941545963287354,
"learning_rate": 6.573622343500482e-06,
"loss": 0.1696,
"step": 30200
},
{
"epoch": 0.45981546679616364,
"grad_norm": 2.0004982948303223,
"learning_rate": 6.548460037686925e-06,
"loss": 0.1633,
"step": 30300
},
{
"epoch": 0.4613330095908705,
"grad_norm": 0.568308413028717,
"learning_rate": 6.5232542807043765e-06,
"loss": 0.1569,
"step": 30400
},
{
"epoch": 0.46285055238557726,
"grad_norm": 1.2990856170654297,
"learning_rate": 6.498005779848848e-06,
"loss": 0.1583,
"step": 30500
},
{
"epoch": 0.4643680951802841,
"grad_norm": 1.7516120672225952,
"learning_rate": 6.472715243615781e-06,
"loss": 0.1572,
"step": 30600
},
{
"epoch": 0.4658856379749909,
"grad_norm": 0.8424046635627747,
"learning_rate": 6.4473833816801675e-06,
"loss": 0.1599,
"step": 30700
},
{
"epoch": 0.4674031807696977,
"grad_norm": 0.7119935750961304,
"learning_rate": 6.422010904876634e-06,
"loss": 0.1607,
"step": 30800
},
{
"epoch": 0.4689207235644045,
"grad_norm": 0.7812116742134094,
"learning_rate": 6.396598525179495e-06,
"loss": 0.1653,
"step": 30900
},
{
"epoch": 0.47043826635911135,
"grad_norm": 1.2674237489700317,
"learning_rate": 6.371146955682781e-06,
"loss": 0.163,
"step": 31000
},
{
"epoch": 0.47195580915381813,
"grad_norm": 0.7816241979598999,
"learning_rate": 6.34565691058022e-06,
"loss": 0.1575,
"step": 31100
},
{
"epoch": 0.47347335194852497,
"grad_norm": 0.6579769849777222,
"learning_rate": 6.320129105145198e-06,
"loss": 0.1647,
"step": 31200
},
{
"epoch": 0.47499089474323175,
"grad_norm": 0.8202933073043823,
"learning_rate": 6.294564255710695e-06,
"loss": 0.1583,
"step": 31300
},
{
"epoch": 0.4765084375379386,
"grad_norm": 0.5198040008544922,
"learning_rate": 6.26896307964917e-06,
"loss": 0.1598,
"step": 31400
},
{
"epoch": 0.4780259803326454,
"grad_norm": 0.5005636215209961,
"learning_rate": 6.243326295352451e-06,
"loss": 0.1536,
"step": 31500
},
{
"epoch": 0.4795435231273522,
"grad_norm": 1.1730496883392334,
"learning_rate": 6.217654622211553e-06,
"loss": 0.1701,
"step": 31600
},
{
"epoch": 0.481061065922059,
"grad_norm": 0.9473150372505188,
"learning_rate": 6.191948780596511e-06,
"loss": 0.1586,
"step": 31700
},
{
"epoch": 0.48257860871676583,
"grad_norm": 1.1389187574386597,
"learning_rate": 6.166209491836157e-06,
"loss": 0.1567,
"step": 31800
},
{
"epoch": 0.4840961515114726,
"grad_norm": 0.5175455808639526,
"learning_rate": 6.140437478197876e-06,
"loss": 0.1665,
"step": 31900
},
{
"epoch": 0.48561369430617946,
"grad_norm": 0.5194241404533386,
"learning_rate": 6.114633462867344e-06,
"loss": 0.1654,
"step": 32000
},
{
"epoch": 0.48713123710088624,
"grad_norm": 0.6536809802055359,
"learning_rate": 6.088798169928236e-06,
"loss": 0.1591,
"step": 32100
},
{
"epoch": 0.4886487798955931,
"grad_norm": 0.6603504419326782,
"learning_rate": 6.0629323243419006e-06,
"loss": 0.1638,
"step": 32200
},
{
"epoch": 0.49016632269029986,
"grad_norm": 1.4609029293060303,
"learning_rate": 6.037036651927022e-06,
"loss": 0.1639,
"step": 32300
},
{
"epoch": 0.4916838654850067,
"grad_norm": 0.5603197813034058,
"learning_rate": 6.011111879339252e-06,
"loss": 0.1651,
"step": 32400
},
{
"epoch": 0.4932014082797135,
"grad_norm": 7.655277252197266,
"learning_rate": 5.98515873405082e-06,
"loss": 0.1621,
"step": 32500
},
{
"epoch": 0.4947189510744203,
"grad_norm": 0.5782756209373474,
"learning_rate": 5.959177944330118e-06,
"loss": 0.1594,
"step": 32600
},
{
"epoch": 0.4962364938691271,
"grad_norm": 1.0311505794525146,
"learning_rate": 5.933170239221266e-06,
"loss": 0.1586,
"step": 32700
},
{
"epoch": 0.49775403666383394,
"grad_norm": 3.3592989444732666,
"learning_rate": 5.907136348523651e-06,
"loss": 0.1674,
"step": 32800
},
{
"epoch": 0.4992715794585407,
"grad_norm": 0.6535949110984802,
"learning_rate": 5.8810770027714544e-06,
"loss": 0.1548,
"step": 32900
},
{
"epoch": 0.5007891222532476,
"grad_norm": 3.045614004135132,
"learning_rate": 5.8549929332131494e-06,
"loss": 0.1602,
"step": 33000
},
{
"epoch": 0.5023066650479544,
"grad_norm": 0.8914518356323242,
"learning_rate": 5.828884871790977e-06,
"loss": 0.1587,
"step": 33100
},
{
"epoch": 0.5038242078426611,
"grad_norm": 1.4585373401641846,
"learning_rate": 5.802753551120417e-06,
"loss": 0.1643,
"step": 33200
},
{
"epoch": 0.505341750637368,
"grad_norm": 1.471450686454773,
"learning_rate": 5.77659970446962e-06,
"loss": 0.1686,
"step": 33300
},
{
"epoch": 0.5068592934320748,
"grad_norm": 2.6323044300079346,
"learning_rate": 5.750424065738837e-06,
"loss": 0.1596,
"step": 33400
},
{
"epoch": 0.5083768362267816,
"grad_norm": 0.884983241558075,
"learning_rate": 5.724227369439823e-06,
"loss": 0.163,
"step": 33500
},
{
"epoch": 0.5098943790214884,
"grad_norm": 0.4667279124259949,
"learning_rate": 5.69801035067523e-06,
"loss": 0.1597,
"step": 33600
},
{
"epoch": 0.5114119218161952,
"grad_norm": 0.9215657114982605,
"learning_rate": 5.671773745117977e-06,
"loss": 0.1609,
"step": 33700
},
{
"epoch": 0.512929464610902,
"grad_norm": 0.6155304908752441,
"learning_rate": 5.6455182889906e-06,
"loss": 0.1579,
"step": 33800
},
{
"epoch": 0.5144470074056089,
"grad_norm": 0.7917832732200623,
"learning_rate": 5.619244719044605e-06,
"loss": 0.1598,
"step": 33900
},
{
"epoch": 0.5159645502003156,
"grad_norm": 0.7764760255813599,
"learning_rate": 5.5929537725397845e-06,
"loss": 0.1576,
"step": 34000
},
{
"epoch": 0.5174820929950225,
"grad_norm": 0.3823162019252777,
"learning_rate": 5.566646187223535e-06,
"loss": 0.1625,
"step": 34100
},
{
"epoch": 0.5189996357897293,
"grad_norm": 0.46521395444869995,
"learning_rate": 5.5403227013101515e-06,
"loss": 0.1647,
"step": 34200
},
{
"epoch": 0.5205171785844361,
"grad_norm": 0.7488669157028198,
"learning_rate": 5.513984053460112e-06,
"loss": 0.16,
"step": 34300
},
{
"epoch": 0.5220347213791429,
"grad_norm": 0.6630042195320129,
"learning_rate": 5.4876309827593554e-06,
"loss": 0.1632,
"step": 34400
},
{
"epoch": 0.5235522641738497,
"grad_norm": 1.024338960647583,
"learning_rate": 5.461264228698537e-06,
"loss": 0.1542,
"step": 34500
},
{
"epoch": 0.5250698069685565,
"grad_norm": 1.1433038711547852,
"learning_rate": 5.434884531152281e-06,
"loss": 0.1572,
"step": 34600
},
{
"epoch": 0.5265873497632634,
"grad_norm": 0.4922288656234741,
"learning_rate": 5.408492630358414e-06,
"loss": 0.1672,
"step": 34700
},
{
"epoch": 0.5281048925579701,
"grad_norm": 1.3577494621276855,
"learning_rate": 5.3820892668972005e-06,
"loss": 0.1591,
"step": 34800
},
{
"epoch": 0.529622435352677,
"grad_norm": 1.1060514450073242,
"learning_rate": 5.355675181670554e-06,
"loss": 0.164,
"step": 34900
},
{
"epoch": 0.5311399781473838,
"grad_norm": 0.9221227765083313,
"learning_rate": 5.329251115881253e-06,
"loss": 0.1698,
"step": 35000
},
{
"epoch": 0.5326575209420905,
"grad_norm": 0.5531550049781799,
"learning_rate": 5.3028178110121395e-06,
"loss": 0.1629,
"step": 35100
},
{
"epoch": 0.5341750637367974,
"grad_norm": 1.188730001449585,
"learning_rate": 5.276376008805309e-06,
"loss": 0.1589,
"step": 35200
},
{
"epoch": 0.5356926065315042,
"grad_norm": 1.3050966262817383,
"learning_rate": 5.249926451241305e-06,
"loss": 0.1704,
"step": 35300
},
{
"epoch": 0.537210149326211,
"grad_norm": 0.5301448106765747,
"learning_rate": 5.2234698805182885e-06,
"loss": 0.1624,
"step": 35400
},
{
"epoch": 0.5387276921209178,
"grad_norm": 2.257366180419922,
"learning_rate": 5.1970070390312184e-06,
"loss": 0.1575,
"step": 35500
},
{
"epoch": 0.5402452349156246,
"grad_norm": 0.47665533423423767,
"learning_rate": 5.1705386693510175e-06,
"loss": 0.1579,
"step": 35600
},
{
"epoch": 0.5417627777103314,
"grad_norm": 0.8027414083480835,
"learning_rate": 5.144065514203731e-06,
"loss": 0.1555,
"step": 35700
},
{
"epoch": 0.5432803205050383,
"grad_norm": 0.7599822878837585,
"learning_rate": 5.117588316449694e-06,
"loss": 0.1584,
"step": 35800
},
{
"epoch": 0.544797863299745,
"grad_norm": 1.1928893327713013,
"learning_rate": 5.091107819062676e-06,
"loss": 0.1623,
"step": 35900
},
{
"epoch": 0.5463154060944518,
"grad_norm": 0.7790924906730652,
"learning_rate": 5.06462476510904e-06,
"loss": 0.1604,
"step": 36000
},
{
"epoch": 0.5478329488891587,
"grad_norm": 0.8471243381500244,
"learning_rate": 5.038139897726886e-06,
"loss": 0.1632,
"step": 36100
},
{
"epoch": 0.5493504916838655,
"grad_norm": 2.127570390701294,
"learning_rate": 5.011653960105204e-06,
"loss": 0.1567,
"step": 36200
},
{
"epoch": 0.5508680344785722,
"grad_norm": 0.45868635177612305,
"learning_rate": 4.985167695463012e-06,
"loss": 0.1561,
"step": 36300
},
{
"epoch": 0.5523855772732791,
"grad_norm": 0.9626381993293762,
"learning_rate": 4.958681847028508e-06,
"loss": 0.1589,
"step": 36400
},
{
"epoch": 0.5539031200679859,
"grad_norm": 0.9326303005218506,
"learning_rate": 4.932197158018208e-06,
"loss": 0.1606,
"step": 36500
},
{
"epoch": 0.5554206628626928,
"grad_norm": 0.5278806090354919,
"learning_rate": 4.9057143716160945e-06,
"loss": 0.1631,
"step": 36600
},
{
"epoch": 0.5569382056573995,
"grad_norm": 0.6460352540016174,
"learning_rate": 4.879234230952764e-06,
"loss": 0.1603,
"step": 36700
},
{
"epoch": 0.5584557484521063,
"grad_norm": 0.8054344058036804,
"learning_rate": 4.8527574790845635e-06,
"loss": 0.1582,
"step": 36800
},
{
"epoch": 0.5599732912468132,
"grad_norm": 1.1287479400634766,
"learning_rate": 4.826284858972757e-06,
"loss": 0.1584,
"step": 36900
},
{
"epoch": 0.56149083404152,
"grad_norm": 0.7059003710746765,
"learning_rate": 4.7998171134626595e-06,
"loss": 0.1564,
"step": 37000
},
{
"epoch": 0.5630083768362267,
"grad_norm": 0.6124716401100159,
"learning_rate": 4.7733549852628085e-06,
"loss": 0.1607,
"step": 37100
},
{
"epoch": 0.5645259196309336,
"grad_norm": 0.9231120944023132,
"learning_rate": 4.746899216924106e-06,
"loss": 0.1646,
"step": 37200
},
{
"epoch": 0.5660434624256404,
"grad_norm": 1.342411994934082,
"learning_rate": 4.720450550818996e-06,
"loss": 0.1539,
"step": 37300
},
{
"epoch": 0.5675610052203472,
"grad_norm": 1.3646472692489624,
"learning_rate": 4.694009729120626e-06,
"loss": 0.158,
"step": 37400
},
{
"epoch": 0.569078548015054,
"grad_norm": 1.1396809816360474,
"learning_rate": 4.667577493782025e-06,
"loss": 0.1564,
"step": 37500
},
{
"epoch": 0.5705960908097608,
"grad_norm": 0.6573958992958069,
"learning_rate": 4.641154586515277e-06,
"loss": 0.1578,
"step": 37600
},
{
"epoch": 0.5721136336044677,
"grad_norm": 0.7357528805732727,
"learning_rate": 4.614741748770714e-06,
"loss": 0.1597,
"step": 37700
},
{
"epoch": 0.5736311763991745,
"grad_norm": 0.7496752142906189,
"learning_rate": 4.588339721716109e-06,
"loss": 0.154,
"step": 37800
},
{
"epoch": 0.5751487191938812,
"grad_norm": 0.8587321639060974,
"learning_rate": 4.561949246215875e-06,
"loss": 0.1574,
"step": 37900
},
{
"epoch": 0.5766662619885881,
"grad_norm": 0.512344241142273,
"learning_rate": 4.535571062810281e-06,
"loss": 0.1591,
"step": 38000
},
{
"epoch": 0.5781838047832949,
"grad_norm": 0.6572905778884888,
"learning_rate": 4.509205911694666e-06,
"loss": 0.1614,
"step": 38100
},
{
"epoch": 0.5797013475780017,
"grad_norm": 0.7880915999412537,
"learning_rate": 4.482854532698675e-06,
"loss": 0.1626,
"step": 38200
},
{
"epoch": 0.5812188903727085,
"grad_norm": 0.6261263489723206,
"learning_rate": 4.456517665265491e-06,
"loss": 0.1606,
"step": 38300
},
{
"epoch": 0.5827364331674153,
"grad_norm": 2.0507938861846924,
"learning_rate": 4.430196048431093e-06,
"loss": 0.1566,
"step": 38400
},
{
"epoch": 0.5842539759621221,
"grad_norm": 1.5023155212402344,
"learning_rate": 4.403890420803511e-06,
"loss": 0.1613,
"step": 38500
},
{
"epoch": 0.585771518756829,
"grad_norm": 0.5439951419830322,
"learning_rate": 4.377601520542107e-06,
"loss": 0.1555,
"step": 38600
},
{
"epoch": 0.5872890615515357,
"grad_norm": 2.4225058555603027,
"learning_rate": 4.3513300853368565e-06,
"loss": 0.1556,
"step": 38700
},
{
"epoch": 0.5888066043462425,
"grad_norm": 1.1448410749435425,
"learning_rate": 4.32507685238765e-06,
"loss": 0.1544,
"step": 38800
},
{
"epoch": 0.5903241471409494,
"grad_norm": 0.8714995384216309,
"learning_rate": 4.298842558383609e-06,
"loss": 0.1583,
"step": 38900
},
{
"epoch": 0.5918416899356562,
"grad_norm": 0.4775989055633545,
"learning_rate": 4.272627939482406e-06,
"loss": 0.1575,
"step": 39000
},
{
"epoch": 0.593359232730363,
"grad_norm": 0.8010126948356628,
"learning_rate": 4.2464337312896185e-06,
"loss": 0.1588,
"step": 39100
},
{
"epoch": 0.5948767755250698,
"grad_norm": 0.7952308654785156,
"learning_rate": 4.220260668838076e-06,
"loss": 0.1583,
"step": 39200
},
{
"epoch": 0.5963943183197766,
"grad_norm": 0.5506075024604797,
"learning_rate": 4.194109486567242e-06,
"loss": 0.1582,
"step": 39300
},
{
"epoch": 0.5979118611144835,
"grad_norm": 0.8976244926452637,
"learning_rate": 4.167980918302605e-06,
"loss": 0.1564,
"step": 39400
},
{
"epoch": 0.5994294039091902,
"grad_norm": 0.7584324479103088,
"learning_rate": 4.141875697235081e-06,
"loss": 0.1588,
"step": 39500
},
{
"epoch": 0.600946946703897,
"grad_norm": 0.5679917335510254,
"learning_rate": 4.115794555900443e-06,
"loss": 0.1603,
"step": 39600
},
{
"epoch": 0.6024644894986039,
"grad_norm": 2.806525945663452,
"learning_rate": 4.089738226158768e-06,
"loss": 0.1612,
"step": 39700
},
{
"epoch": 0.6039820322933107,
"grad_norm": 0.5877603888511658,
"learning_rate": 4.063707439173894e-06,
"loss": 0.1583,
"step": 39800
},
{
"epoch": 0.6054995750880174,
"grad_norm": 0.8282864093780518,
"learning_rate": 4.0377029253929104e-06,
"loss": 0.1528,
"step": 39900
},
{
"epoch": 0.6070171178827243,
"grad_norm": 0.6813582181930542,
"learning_rate": 4.011725414525653e-06,
"loss": 0.1528,
"step": 40000
},
{
"epoch": 0.6085346606774311,
"grad_norm": 0.43593981862068176,
"learning_rate": 3.985775635524234e-06,
"loss": 0.1546,
"step": 40100
},
{
"epoch": 0.610052203472138,
"grad_norm": 0.5289904475212097,
"learning_rate": 3.959854316562584e-06,
"loss": 0.1523,
"step": 40200
},
{
"epoch": 0.6115697462668447,
"grad_norm": 0.6840155720710754,
"learning_rate": 3.933962185016021e-06,
"loss": 0.1621,
"step": 40300
},
{
"epoch": 0.6130872890615515,
"grad_norm": 0.8754764795303345,
"learning_rate": 3.908099967440838e-06,
"loss": 0.1577,
"step": 40400
},
{
"epoch": 0.6146048318562584,
"grad_norm": 0.7703122496604919,
"learning_rate": 3.882268389553912e-06,
"loss": 0.1574,
"step": 40500
},
{
"epoch": 0.6161223746509652,
"grad_norm": 0.6455814838409424,
"learning_rate": 3.856468176212345e-06,
"loss": 0.1573,
"step": 40600
},
{
"epoch": 0.6176399174456719,
"grad_norm": 5.197359085083008,
"learning_rate": 3.830700051393125e-06,
"loss": 0.1495,
"step": 40700
},
{
"epoch": 0.6191574602403788,
"grad_norm": 1.4284406900405884,
"learning_rate": 3.804964738172803e-06,
"loss": 0.1565,
"step": 40800
},
{
"epoch": 0.6206750030350856,
"grad_norm": 0.9033112525939941,
"learning_rate": 3.7792629587072086e-06,
"loss": 0.1641,
"step": 40900
},
{
"epoch": 0.6221925458297924,
"grad_norm": 1.4378776550292969,
"learning_rate": 3.753595434211187e-06,
"loss": 0.1572,
"step": 41000
},
{
"epoch": 0.6237100886244992,
"grad_norm": 1.1654125452041626,
"learning_rate": 3.7279628849383526e-06,
"loss": 0.1533,
"step": 41100
},
{
"epoch": 0.625227631419206,
"grad_norm": 1.5666859149932861,
"learning_rate": 3.702366030160891e-06,
"loss": 0.1571,
"step": 41200
},
{
"epoch": 0.6267451742139128,
"grad_norm": 1.6572262048721313,
"learning_rate": 3.6768055881493616e-06,
"loss": 0.1546,
"step": 41300
},
{
"epoch": 0.6282627170086197,
"grad_norm": 0.7941910028457642,
"learning_rate": 3.651282276152556e-06,
"loss": 0.1569,
"step": 41400
},
{
"epoch": 0.6297802598033264,
"grad_norm": 2.4592742919921875,
"learning_rate": 3.6257968103773567e-06,
"loss": 0.1548,
"step": 41500
},
{
"epoch": 0.6312978025980333,
"grad_norm": 0.6559963822364807,
"learning_rate": 3.6003499059686564e-06,
"loss": 0.1533,
"step": 41600
},
{
"epoch": 0.6328153453927401,
"grad_norm": 0.561519980430603,
"learning_rate": 3.574942276989273e-06,
"loss": 0.1621,
"step": 41700
},
{
"epoch": 0.6343328881874469,
"grad_norm": 0.48774096369743347,
"learning_rate": 3.5495746363999255e-06,
"loss": 0.153,
"step": 41800
},
{
"epoch": 0.6358504309821537,
"grad_norm": 0.5202430486679077,
"learning_rate": 3.524247696039223e-06,
"loss": 0.154,
"step": 41900
},
{
"epoch": 0.6373679737768605,
"grad_norm": 2.0533859729766846,
"learning_rate": 3.498962166603688e-06,
"loss": 0.1609,
"step": 42000
},
{
"epoch": 0.6388855165715673,
"grad_norm": 0.495179146528244,
"learning_rate": 3.4737187576278175e-06,
"loss": 0.1528,
"step": 42100
},
{
"epoch": 0.6404030593662742,
"grad_norm": 0.5103787779808044,
"learning_rate": 3.4485181774641697e-06,
"loss": 0.164,
"step": 42200
},
{
"epoch": 0.6419206021609809,
"grad_norm": 0.9691652059555054,
"learning_rate": 3.4233611332634874e-06,
"loss": 0.1578,
"step": 42300
},
{
"epoch": 0.6434381449556877,
"grad_norm": 0.7118510603904724,
"learning_rate": 3.3982483309548574e-06,
"loss": 0.1544,
"step": 42400
},
{
"epoch": 0.6449556877503946,
"grad_norm": 1.5103263854980469,
"learning_rate": 3.3731804752258988e-06,
"loss": 0.1564,
"step": 42500
},
{
"epoch": 0.6464732305451014,
"grad_norm": 0.7205672860145569,
"learning_rate": 3.348158269502989e-06,
"loss": 0.1519,
"step": 42600
},
{
"epoch": 0.6479907733398081,
"grad_norm": 1.0135701894760132,
"learning_rate": 3.323182415931525e-06,
"loss": 0.158,
"step": 42700
},
{
"epoch": 0.649508316134515,
"grad_norm": 2.973532199859619,
"learning_rate": 3.2982536153562238e-06,
"loss": 0.1604,
"step": 42800
},
{
"epoch": 0.6510258589292218,
"grad_norm": 1.3079688549041748,
"learning_rate": 3.2733725673014514e-06,
"loss": 0.1594,
"step": 42900
},
{
"epoch": 0.6525434017239287,
"grad_norm": 0.47416195273399353,
"learning_rate": 3.2485399699515936e-06,
"loss": 0.1567,
"step": 43000
},
{
"epoch": 0.6540609445186354,
"grad_norm": 0.927847146987915,
"learning_rate": 3.223756520131471e-06,
"loss": 0.1505,
"step": 43100
},
{
"epoch": 0.6555784873133422,
"grad_norm": 0.8857322335243225,
"learning_rate": 3.1990229132867755e-06,
"loss": 0.1566,
"step": 43200
},
{
"epoch": 0.6570960301080491,
"grad_norm": 1.3829877376556396,
"learning_rate": 3.174339843464567e-06,
"loss": 0.1593,
"step": 43300
},
{
"epoch": 0.6586135729027559,
"grad_norm": 1.6694592237472534,
"learning_rate": 3.1497080032937832e-06,
"loss": 0.1592,
"step": 43400
},
{
"epoch": 0.6601311156974626,
"grad_norm": 0.580848753452301,
"learning_rate": 3.1251280839658215e-06,
"loss": 0.1516,
"step": 43500
},
{
"epoch": 0.6616486584921695,
"grad_norm": 0.8469058275222778,
"learning_rate": 3.1006007752151247e-06,
"loss": 0.1559,
"step": 43600
},
{
"epoch": 0.6631662012868763,
"grad_norm": 1.1796588897705078,
"learning_rate": 3.076126765299844e-06,
"loss": 0.1578,
"step": 43700
},
{
"epoch": 0.6646837440815831,
"grad_norm": 1.4025541543960571,
"learning_rate": 3.0517067409825115e-06,
"loss": 0.1594,
"step": 43800
},
{
"epoch": 0.6662012868762899,
"grad_norm": 0.6020201444625854,
"learning_rate": 3.027341387510781e-06,
"loss": 0.1522,
"step": 43900
},
{
"epoch": 0.6677188296709967,
"grad_norm": 0.46768826246261597,
"learning_rate": 3.0030313885981876e-06,
"loss": 0.1557,
"step": 44000
},
{
"epoch": 0.6692363724657036,
"grad_norm": 0.6836826801300049,
"learning_rate": 2.978777426404975e-06,
"loss": 0.1576,
"step": 44100
},
{
"epoch": 0.6707539152604104,
"grad_norm": 0.8283591270446777,
"learning_rate": 2.9545801815189403e-06,
"loss": 0.1563,
"step": 44200
},
{
"epoch": 0.6722714580551171,
"grad_norm": 0.5825768113136292,
"learning_rate": 2.930440332936345e-06,
"loss": 0.1586,
"step": 44300
},
{
"epoch": 0.673789000849824,
"grad_norm": 0.7869064211845398,
"learning_rate": 2.9063585580428586e-06,
"loss": 0.1555,
"step": 44400
},
{
"epoch": 0.6753065436445308,
"grad_norm": 0.5700305104255676,
"learning_rate": 2.8823355325945545e-06,
"loss": 0.1574,
"step": 44500
},
{
"epoch": 0.6768240864392376,
"grad_norm": 0.5473443865776062,
"learning_rate": 2.8583719306989386e-06,
"loss": 0.1546,
"step": 44600
},
{
"epoch": 0.6783416292339444,
"grad_norm": 0.9259124398231506,
"learning_rate": 2.834468424796044e-06,
"loss": 0.1533,
"step": 44700
},
{
"epoch": 0.6798591720286512,
"grad_norm": 2.3020691871643066,
"learning_rate": 2.8106256856395536e-06,
"loss": 0.1602,
"step": 44800
},
{
"epoch": 0.681376714823358,
"grad_norm": 0.6738184094429016,
"learning_rate": 2.78684438227798e-06,
"loss": 0.1603,
"step": 44900
},
{
"epoch": 0.6828942576180649,
"grad_norm": 0.4569184482097626,
"learning_rate": 2.763125182035898e-06,
"loss": 0.1497,
"step": 45000
},
{
"epoch": 0.6844118004127716,
"grad_norm": 0.7541574835777283,
"learning_rate": 2.7394687504952065e-06,
"loss": 0.1574,
"step": 45100
},
{
"epoch": 0.6859293432074784,
"grad_norm": 1.6138263940811157,
"learning_rate": 2.7158757514764674e-06,
"loss": 0.1587,
"step": 45200
},
{
"epoch": 0.6874468860021853,
"grad_norm": 1.5819748640060425,
"learning_rate": 2.692346847020259e-06,
"loss": 0.1544,
"step": 45300
},
{
"epoch": 0.6889644287968921,
"grad_norm": 1.57101309299469,
"learning_rate": 2.66888269736862e-06,
"loss": 0.1527,
"step": 45400
},
{
"epoch": 0.6904819715915989,
"grad_norm": 0.48757970333099365,
"learning_rate": 2.645483960946501e-06,
"loss": 0.154,
"step": 45500
},
{
"epoch": 0.6919995143863057,
"grad_norm": 0.48652932047843933,
"learning_rate": 2.622151294343308e-06,
"loss": 0.1528,
"step": 45600
},
{
"epoch": 0.6935170571810125,
"grad_norm": 0.6868948936462402,
"learning_rate": 2.5988853522944626e-06,
"loss": 0.1606,
"step": 45700
},
{
"epoch": 0.6950345999757194,
"grad_norm": 0.7019301056861877,
"learning_rate": 2.575686787663041e-06,
"loss": 0.1534,
"step": 45800
},
{
"epoch": 0.6965521427704261,
"grad_norm": 0.9775063991546631,
"learning_rate": 2.552556251421443e-06,
"loss": 0.1518,
"step": 45900
},
{
"epoch": 0.6980696855651329,
"grad_norm": 0.6645476818084717,
"learning_rate": 2.529494392633138e-06,
"loss": 0.1473,
"step": 46000
},
{
"epoch": 0.6995872283598398,
"grad_norm": 2.778902053833008,
"learning_rate": 2.506501858434439e-06,
"loss": 0.145,
"step": 46100
},
{
"epoch": 0.7011047711545466,
"grad_norm": 1.3645416498184204,
"learning_rate": 2.483579294016355e-06,
"loss": 0.1535,
"step": 46200
},
{
"epoch": 0.7026223139492533,
"grad_norm": 0.9883469343185425,
"learning_rate": 2.4607273426064725e-06,
"loss": 0.1538,
"step": 46300
},
{
"epoch": 0.7041398567439602,
"grad_norm": 0.7800496816635132,
"learning_rate": 2.4379466454509236e-06,
"loss": 0.1532,
"step": 46400
},
{
"epoch": 0.705657399538667,
"grad_norm": 0.6158055067062378,
"learning_rate": 2.4152378417963733e-06,
"loss": 0.1551,
"step": 46500
},
{
"epoch": 0.7071749423333737,
"grad_norm": 0.6619333028793335,
"learning_rate": 2.3926015688721e-06,
"loss": 0.1573,
"step": 46600
},
{
"epoch": 0.7086924851280806,
"grad_norm": 0.5126205086708069,
"learning_rate": 2.3700384618720973e-06,
"loss": 0.1469,
"step": 46700
},
{
"epoch": 0.7102100279227874,
"grad_norm": 0.6815778613090515,
"learning_rate": 2.3475491539372596e-06,
"loss": 0.1548,
"step": 46800
},
{
"epoch": 0.7117275707174943,
"grad_norm": 0.8624967336654663,
"learning_rate": 2.325134276137619e-06,
"loss": 0.1563,
"step": 46900
},
{
"epoch": 0.713245113512201,
"grad_norm": 0.5875204205513,
"learning_rate": 2.3027944574546237e-06,
"loss": 0.1541,
"step": 47000
},
{
"epoch": 0.7147626563069078,
"grad_norm": 0.5968221426010132,
"learning_rate": 2.2805303247635035e-06,
"loss": 0.1503,
"step": 47100
},
{
"epoch": 0.7162801991016147,
"grad_norm": 1.5649338960647583,
"learning_rate": 2.258342502815665e-06,
"loss": 0.1483,
"step": 47200
},
{
"epoch": 0.7177977418963215,
"grad_norm": 0.739321231842041,
"learning_rate": 2.2362316142211755e-06,
"loss": 0.1527,
"step": 47300
},
{
"epoch": 0.7193152846910282,
"grad_norm": 1.2006664276123047,
"learning_rate": 2.2141982794312737e-06,
"loss": 0.1586,
"step": 47400
},
{
"epoch": 0.7208328274857351,
"grad_norm": 0.6313862204551697,
"learning_rate": 2.19224311672098e-06,
"loss": 0.1582,
"step": 47500
},
{
"epoch": 0.7223503702804419,
"grad_norm": 1.3812549114227295,
"learning_rate": 2.170366742171727e-06,
"loss": 0.1535,
"step": 47600
},
{
"epoch": 0.7238679130751488,
"grad_norm": 0.39481043815612793,
"learning_rate": 2.148569769654089e-06,
"loss": 0.1522,
"step": 47700
},
{
"epoch": 0.7253854558698555,
"grad_norm": 1.3398104906082153,
"learning_rate": 2.1268528108105424e-06,
"loss": 0.1564,
"step": 47800
},
{
"epoch": 0.7269029986645623,
"grad_norm": 0.962664783000946,
"learning_rate": 2.105216475038314e-06,
"loss": 0.1505,
"step": 47900
},
{
"epoch": 0.7284205414592692,
"grad_norm": 0.7913764119148254,
"learning_rate": 2.0836613694722696e-06,
"loss": 0.1532,
"step": 48000
},
{
"epoch": 0.729938084253976,
"grad_norm": 1.8171051740646362,
"learning_rate": 2.0621880989678895e-06,
"loss": 0.156,
"step": 48100
},
{
"epoch": 0.7314556270486827,
"grad_norm": 1.8144663572311401,
"learning_rate": 2.0407972660842824e-06,
"loss": 0.1517,
"step": 48200
},
{
"epoch": 0.7329731698433896,
"grad_norm": 0.5789530277252197,
"learning_rate": 2.0194894710672908e-06,
"loss": 0.15,
"step": 48300
},
{
"epoch": 0.7344907126380964,
"grad_norm": 2.0675876140594482,
"learning_rate": 1.998265311832634e-06,
"loss": 0.1489,
"step": 48400
},
{
"epoch": 0.7360082554328032,
"grad_norm": 2.8520278930664062,
"learning_rate": 1.9771253839491423e-06,
"loss": 0.1546,
"step": 48500
},
{
"epoch": 0.73752579822751,
"grad_norm": 0.9103041291236877,
"learning_rate": 1.956070280622036e-06,
"loss": 0.1559,
"step": 48600
},
{
"epoch": 0.7390433410222168,
"grad_norm": 0.4860074520111084,
"learning_rate": 1.9351005926762808e-06,
"loss": 0.1548,
"step": 48700
},
{
"epoch": 0.7405608838169236,
"grad_norm": 0.9042688012123108,
"learning_rate": 1.9142169085400175e-06,
"loss": 0.1568,
"step": 48800
},
{
"epoch": 0.7420784266116305,
"grad_norm": 1.0387529134750366,
"learning_rate": 1.8934198142280357e-06,
"loss": 0.1534,
"step": 48900
},
{
"epoch": 0.7435959694063372,
"grad_norm": 0.46772363781929016,
"learning_rate": 1.8727098933253435e-06,
"loss": 0.1512,
"step": 49000
},
{
"epoch": 0.745113512201044,
"grad_norm": 1.0915402173995972,
"learning_rate": 1.8520877269707804e-06,
"loss": 0.1541,
"step": 49100
},
{
"epoch": 0.7466310549957509,
"grad_norm": 0.6211321949958801,
"learning_rate": 1.8315538938407195e-06,
"loss": 0.1553,
"step": 49200
},
{
"epoch": 0.7481485977904577,
"grad_norm": 0.9021114110946655,
"learning_rate": 1.8111089701328205e-06,
"loss": 0.1439,
"step": 49300
},
{
"epoch": 0.7496661405851645,
"grad_norm": 0.821042537689209,
"learning_rate": 1.7907535295498702e-06,
"loss": 0.1525,
"step": 49400
},
{
"epoch": 0.7511836833798713,
"grad_norm": 0.9626114368438721,
"learning_rate": 1.770488143283674e-06,
"loss": 0.1498,
"step": 49500
},
{
"epoch": 0.7527012261745781,
"grad_norm": 0.6211876273155212,
"learning_rate": 1.7503133799990384e-06,
"loss": 0.1549,
"step": 49600
},
{
"epoch": 0.754218768969285,
"grad_norm": 1.3714808225631714,
"learning_rate": 1.7302298058178025e-06,
"loss": 0.1557,
"step": 49700
},
{
"epoch": 0.7557363117639917,
"grad_norm": 0.7492864727973938,
"learning_rate": 1.7102379843029643e-06,
"loss": 0.1619,
"step": 49800
},
{
"epoch": 0.7572538545586985,
"grad_norm": 0.6472452282905579,
"learning_rate": 1.690338476442852e-06,
"loss": 0.1569,
"step": 49900
},
{
"epoch": 0.7587713973534054,
"grad_norm": 1.0946698188781738,
"learning_rate": 1.6705318406353999e-06,
"loss": 0.1505,
"step": 50000
},
{
"epoch": 0.7602889401481122,
"grad_norm": 0.5974524021148682,
"learning_rate": 1.6508186326724607e-06,
"loss": 0.1519,
"step": 50100
},
{
"epoch": 0.7618064829428189,
"grad_norm": 0.8393105864524841,
"learning_rate": 1.6311994057242259e-06,
"loss": 0.1561,
"step": 50200
},
{
"epoch": 0.7633240257375258,
"grad_norm": 3.725569248199463,
"learning_rate": 1.6116747103236902e-06,
"loss": 0.1539,
"step": 50300
},
{
"epoch": 0.7648415685322326,
"grad_norm": 0.8107349872589111,
"learning_rate": 1.5922450943512136e-06,
"loss": 0.1534,
"step": 50400
},
{
"epoch": 0.7663591113269395,
"grad_norm": 1.49680757522583,
"learning_rate": 1.572911103019139e-06,
"loss": 0.1568,
"step": 50500
},
{
"epoch": 0.7678766541216462,
"grad_norm": 0.8140336275100708,
"learning_rate": 1.5536732788564963e-06,
"loss": 0.1525,
"step": 50600
},
{
"epoch": 0.769394196916353,
"grad_norm": 0.5300224423408508,
"learning_rate": 1.5345321616937841e-06,
"loss": 0.1565,
"step": 50700
},
{
"epoch": 0.7709117397110599,
"grad_norm": 1.5161974430084229,
"learning_rate": 1.5154882886478095e-06,
"loss": 0.1541,
"step": 50800
},
{
"epoch": 0.7724292825057667,
"grad_norm": 0.9870403409004211,
"learning_rate": 1.496542194106629e-06,
"loss": 0.1466,
"step": 50900
},
{
"epoch": 0.7739468253004734,
"grad_norm": 1.346680760383606,
"learning_rate": 1.4776944097145413e-06,
"loss": 0.1552,
"step": 51000
},
{
"epoch": 0.7754643680951803,
"grad_norm": 0.9406745433807373,
"learning_rate": 1.4589454643571816e-06,
"loss": 0.148,
"step": 51100
},
{
"epoch": 0.7769819108898871,
"grad_norm": 0.8680304288864136,
"learning_rate": 1.4402958841466664e-06,
"loss": 0.1529,
"step": 51200
},
{
"epoch": 0.778499453684594,
"grad_norm": 1.0303786993026733,
"learning_rate": 1.4217461924068438e-06,
"loss": 0.152,
"step": 51300
},
{
"epoch": 0.7800169964793007,
"grad_norm": 0.7590554356575012,
"learning_rate": 1.4032969096585968e-06,
"loss": 0.1523,
"step": 51400
},
{
"epoch": 0.7815345392740075,
"grad_norm": 0.8220491409301758,
"learning_rate": 1.3849485536052488e-06,
"loss": 0.1485,
"step": 51500
},
{
"epoch": 0.7830520820687144,
"grad_norm": 0.7371506690979004,
"learning_rate": 1.3667016391180231e-06,
"loss": 0.1526,
"step": 51600
},
{
"epoch": 0.7845696248634212,
"grad_norm": 0.9485886693000793,
"learning_rate": 1.3485566782216097e-06,
"loss": 0.1508,
"step": 51700
},
{
"epoch": 0.7860871676581279,
"grad_norm": 0.862425684928894,
"learning_rate": 1.3305141800797827e-06,
"loss": 0.1552,
"step": 51800
},
{
"epoch": 0.7876047104528348,
"grad_norm": 1.46304190158844,
"learning_rate": 1.3125746509811266e-06,
"loss": 0.1556,
"step": 51900
},
{
"epoch": 0.7891222532475416,
"grad_norm": 0.8005903959274292,
"learning_rate": 1.2947385943248165e-06,
"loss": 0.1497,
"step": 52000
},
{
"epoch": 0.7906397960422484,
"grad_norm": 0.5657956004142761,
"learning_rate": 1.2770065106065043e-06,
"loss": 0.1529,
"step": 52100
},
{
"epoch": 0.7921573388369552,
"grad_norm": 0.7030004858970642,
"learning_rate": 1.2593788974042636e-06,
"loss": 0.1495,
"step": 52200
},
{
"epoch": 0.793674881631662,
"grad_norm": 0.7087785601615906,
"learning_rate": 1.2418562493646374e-06,
"loss": 0.1579,
"step": 52300
},
{
"epoch": 0.7951924244263688,
"grad_norm": 0.6476522088050842,
"learning_rate": 1.2244390581887478e-06,
"loss": 0.1597,
"step": 52400
},
{
"epoch": 0.7967099672210757,
"grad_norm": 2.871662139892578,
"learning_rate": 1.2071278126185042e-06,
"loss": 0.1546,
"step": 52500
},
{
"epoch": 0.7982275100157824,
"grad_norm": 0.6594822406768799,
"learning_rate": 1.1899229984228922e-06,
"loss": 0.1516,
"step": 52600
},
{
"epoch": 0.7997450528104892,
"grad_norm": 0.890384316444397,
"learning_rate": 1.1728250983843308e-06,
"loss": 0.1527,
"step": 52700
},
{
"epoch": 0.8012625956051961,
"grad_norm": 0.6064486503601074,
"learning_rate": 1.1558345922851404e-06,
"loss": 0.1497,
"step": 52800
},
{
"epoch": 0.8027801383999029,
"grad_norm": 0.6611379384994507,
"learning_rate": 1.138951956894065e-06,
"loss": 0.1519,
"step": 52900
},
{
"epoch": 0.8042976811946096,
"grad_norm": 0.7390156984329224,
"learning_rate": 1.122177665952906e-06,
"loss": 0.1512,
"step": 53000
},
{
"epoch": 0.8058152239893165,
"grad_norm": 1.1560479402542114,
"learning_rate": 1.1055121901632165e-06,
"loss": 0.1561,
"step": 53100
},
{
"epoch": 0.8073327667840233,
"grad_norm": 1.7963082790374756,
"learning_rate": 1.0889559971731073e-06,
"loss": 0.1522,
"step": 53200
},
{
"epoch": 0.8088503095787302,
"grad_norm": 0.9894265532493591,
"learning_rate": 1.0725095515641088e-06,
"loss": 0.1512,
"step": 53300
},
{
"epoch": 0.8103678523734369,
"grad_norm": 1.401088833808899,
"learning_rate": 1.0561733148381475e-06,
"loss": 0.1565,
"step": 53400
},
{
"epoch": 0.8118853951681437,
"grad_norm": 0.6930254697799683,
"learning_rate": 1.0399477454045875e-06,
"loss": 0.1584,
"step": 53500
},
{
"epoch": 0.8134029379628506,
"grad_norm": 0.8673112392425537,
"learning_rate": 1.023833298567372e-06,
"loss": 0.1543,
"step": 53600
},
{
"epoch": 0.8149204807575574,
"grad_norm": 1.2038730382919312,
"learning_rate": 1.0078304265122425e-06,
"loss": 0.1503,
"step": 53700
},
{
"epoch": 0.8164380235522641,
"grad_norm": 0.9704771637916565,
"learning_rate": 9.919395782940561e-07,
"loss": 0.1518,
"step": 53800
},
{
"epoch": 0.817955566346971,
"grad_norm": 1.151596188545227,
"learning_rate": 9.761611998241766e-07,
"loss": 0.151,
"step": 53900
},
{
"epoch": 0.8194731091416778,
"grad_norm": 0.46842506527900696,
"learning_rate": 9.604957338579724e-07,
"loss": 0.1516,
"step": 54000
},
{
"epoch": 0.8209906519363847,
"grad_norm": 2.784548044204712,
"learning_rate": 9.449436199823797e-07,
"loss": 0.1505,
"step": 54100
},
{
"epoch": 0.8225081947310914,
"grad_norm": 1.2736271619796753,
"learning_rate": 9.295052946035804e-07,
"loss": 0.1511,
"step": 54200
},
{
"epoch": 0.8240257375257982,
"grad_norm": 1.5416733026504517,
"learning_rate": 9.141811909347454e-07,
"loss": 0.1499,
"step": 54300
},
{
"epoch": 0.8255432803205051,
"grad_norm": 0.6435291767120361,
"learning_rate": 8.98971738983882e-07,
"loss": 0.1556,
"step": 54400
},
{
"epoch": 0.8270608231152119,
"grad_norm": 0.45951247215270996,
"learning_rate": 8.838773655417731e-07,
"loss": 0.1516,
"step": 54500
},
{
"epoch": 0.8285783659099186,
"grad_norm": 2.424105167388916,
"learning_rate": 8.688984941699907e-07,
"loss": 0.1528,
"step": 54600
},
{
"epoch": 0.8300959087046255,
"grad_norm": 3.5320215225219727,
"learning_rate": 8.540355451890204e-07,
"loss": 0.1467,
"step": 54700
},
{
"epoch": 0.8316134514993323,
"grad_norm": 0.8505892753601074,
"learning_rate": 8.392889356664563e-07,
"loss": 0.1442,
"step": 54800
},
{
"epoch": 0.8331309942940391,
"grad_norm": 0.7711630463600159,
"learning_rate": 8.246590794053111e-07,
"loss": 0.1539,
"step": 54900
},
{
"epoch": 0.8346485370887459,
"grad_norm": 1.0841100215911865,
"learning_rate": 8.101463869323889e-07,
"loss": 0.1522,
"step": 55000
},
{
"epoch": 0.8361660798834527,
"grad_norm": 0.7660624980926514,
"learning_rate": 7.957512654867805e-07,
"loss": 0.1462,
"step": 55100
},
{
"epoch": 0.8376836226781595,
"grad_norm": 1.2569801807403564,
"learning_rate": 7.81474119008424e-07,
"loss": 0.153,
"step": 55200
},
{
"epoch": 0.8392011654728664,
"grad_norm": 0.9163670539855957,
"learning_rate": 7.673153481267781e-07,
"loss": 0.1513,
"step": 55300
},
{
"epoch": 0.8407187082675731,
"grad_norm": 1.1001312732696533,
"learning_rate": 7.532753501495732e-07,
"loss": 0.1528,
"step": 55400
},
{
"epoch": 0.84223625106228,
"grad_norm": 1.1752309799194336,
"learning_rate": 7.393545190516704e-07,
"loss": 0.1495,
"step": 55500
},
{
"epoch": 0.8437537938569868,
"grad_norm": 1.5676056146621704,
"learning_rate": 7.255532454639968e-07,
"loss": 0.1524,
"step": 55600
},
{
"epoch": 0.8452713366516936,
"grad_norm": 2.6412742137908936,
"learning_rate": 7.118719166625953e-07,
"loss": 0.1534,
"step": 55700
},
{
"epoch": 0.8467888794464004,
"grad_norm": 0.74490886926651,
"learning_rate": 6.983109165577451e-07,
"loss": 0.1497,
"step": 55800
},
{
"epoch": 0.8483064222411072,
"grad_norm": 0.660461962223053,
"learning_rate": 6.848706256832e-07,
"loss": 0.1527,
"step": 55900
},
{
"epoch": 0.849823965035814,
"grad_norm": 1.0362908840179443,
"learning_rate": 6.715514211855007e-07,
"loss": 0.156,
"step": 56000
},
{
"epoch": 0.8513415078305209,
"grad_norm": 1.0476360321044922,
"learning_rate": 6.583536768134008e-07,
"loss": 0.1532,
"step": 56100
},
{
"epoch": 0.8528590506252276,
"grad_norm": 0.7230373620986938,
"learning_rate": 6.452777629073698e-07,
"loss": 0.1513,
"step": 56200
},
{
"epoch": 0.8543765934199344,
"grad_norm": 0.875400960445404,
"learning_rate": 6.3232404638921e-07,
"loss": 0.1509,
"step": 56300
},
{
"epoch": 0.8558941362146413,
"grad_norm": 0.6312329769134521,
"learning_rate": 6.194928907517534e-07,
"loss": 0.1518,
"step": 56400
},
{
"epoch": 0.8574116790093481,
"grad_norm": 0.7422769069671631,
"learning_rate": 6.067846560486646e-07,
"loss": 0.1437,
"step": 56500
},
{
"epoch": 0.8589292218040548,
"grad_norm": 0.7523478269577026,
"learning_rate": 5.941996988843385e-07,
"loss": 0.149,
"step": 56600
},
{
"epoch": 0.8604467645987617,
"grad_norm": 0.9320465326309204,
"learning_rate": 5.817383724038906e-07,
"loss": 0.1526,
"step": 56700
},
{
"epoch": 0.8619643073934685,
"grad_norm": 0.7207502126693726,
"learning_rate": 5.694010262832522e-07,
"loss": 0.1538,
"step": 56800
},
{
"epoch": 0.8634818501881754,
"grad_norm": 4.081971645355225,
"learning_rate": 5.571880067193514e-07,
"loss": 0.1485,
"step": 56900
},
{
"epoch": 0.8649993929828821,
"grad_norm": 0.9618120789527893,
"learning_rate": 5.450996564204053e-07,
"loss": 0.1503,
"step": 57000
},
{
"epoch": 0.8665169357775889,
"grad_norm": 12.091854095458984,
"learning_rate": 5.331363145962981e-07,
"loss": 0.1441,
"step": 57100
},
{
"epoch": 0.8680344785722958,
"grad_norm": 1.4775971174240112,
"learning_rate": 5.212983169490671e-07,
"loss": 0.1517,
"step": 57200
},
{
"epoch": 0.8695520213670026,
"grad_norm": 0.7318031787872314,
"learning_rate": 5.095859956634774e-07,
"loss": 0.1469,
"step": 57300
},
{
"epoch": 0.8710695641617093,
"grad_norm": 0.4980830252170563,
"learning_rate": 4.97999679397706e-07,
"loss": 0.1564,
"step": 57400
},
{
"epoch": 0.8725871069564162,
"grad_norm": 1.0416113138198853,
"learning_rate": 4.865396932741151e-07,
"loss": 0.1514,
"step": 57500
},
{
"epoch": 0.874104649751123,
"grad_norm": 1.5563331842422485,
"learning_rate": 4.7520635887013164e-07,
"loss": 0.1536,
"step": 57600
},
{
"epoch": 0.8756221925458298,
"grad_norm": 0.7019147872924805,
"learning_rate": 4.639999942092205e-07,
"loss": 0.1563,
"step": 57700
},
{
"epoch": 0.8771397353405366,
"grad_norm": 10.20484447479248,
"learning_rate": 4.5292091375196524e-07,
"loss": 0.1538,
"step": 57800
},
{
"epoch": 0.8786572781352434,
"grad_norm": 6.894394397735596,
"learning_rate": 4.4196942838723834e-07,
"loss": 0.1498,
"step": 57900
},
{
"epoch": 0.8801748209299503,
"grad_norm": 1.0151070356369019,
"learning_rate": 4.311458454234829e-07,
"loss": 0.1491,
"step": 58000
},
{
"epoch": 0.881692363724657,
"grad_norm": 15.238383293151855,
"learning_rate": 4.2045046858008367e-07,
"loss": 0.1529,
"step": 58100
},
{
"epoch": 0.8832099065193638,
"grad_norm": 0.8131313323974609,
"learning_rate": 4.098835979788507e-07,
"loss": 0.1509,
"step": 58200
},
{
"epoch": 0.8847274493140707,
"grad_norm": 2.1564486026763916,
"learning_rate": 3.9944553013559153e-07,
"loss": 0.1598,
"step": 58300
},
{
"epoch": 0.8862449921087775,
"grad_norm": 1.2616732120513916,
"learning_rate": 3.891365579517936e-07,
"loss": 0.1538,
"step": 58400
},
{
"epoch": 0.8877625349034842,
"grad_norm": 0.682653546333313,
"learning_rate": 3.7895697070640835e-07,
"loss": 0.1505,
"step": 58500
},
{
"epoch": 0.8892800776981911,
"grad_norm": 0.6538369059562683,
"learning_rate": 3.6890705404772575e-07,
"loss": 0.1527,
"step": 58600
},
{
"epoch": 0.8907976204928979,
"grad_norm": 0.8717949986457825,
"learning_rate": 3.5898708998536866e-07,
"loss": 0.1518,
"step": 58700
},
{
"epoch": 0.8923151632876047,
"grad_norm": 0.9392913579940796,
"learning_rate": 3.491973568823692e-07,
"loss": 0.1537,
"step": 58800
},
{
"epoch": 0.8938327060823115,
"grad_norm": 0.7152376770973206,
"learning_rate": 3.395381294473665e-07,
"loss": 0.152,
"step": 58900
},
{
"epoch": 0.8953502488770183,
"grad_norm": 1.1813652515411377,
"learning_rate": 3.3000967872689135e-07,
"loss": 0.1521,
"step": 59000
},
{
"epoch": 0.8968677916717251,
"grad_norm": 0.9120343327522278,
"learning_rate": 3.206122720977667e-07,
"loss": 0.1574,
"step": 59100
},
{
"epoch": 0.898385334466432,
"grad_norm": 3.3989994525909424,
"learning_rate": 3.1134617325959795e-07,
"loss": 0.1524,
"step": 59200
},
{
"epoch": 0.8999028772611387,
"grad_norm": 1.1465574502944946,
"learning_rate": 3.022116422273802e-07,
"loss": 0.1508,
"step": 59300
},
{
"epoch": 0.9014204200558456,
"grad_norm": 1.940820574760437,
"learning_rate": 2.9320893532419515e-07,
"loss": 0.1496,
"step": 59400
},
{
"epoch": 0.9029379628505524,
"grad_norm": 1.1829354763031006,
"learning_rate": 2.8433830517402505e-07,
"loss": 0.1471,
"step": 59500
},
{
"epoch": 0.9044555056452592,
"grad_norm": 1.006872534751892,
"learning_rate": 2.7560000069465856e-07,
"loss": 0.1461,
"step": 59600
},
{
"epoch": 0.905973048439966,
"grad_norm": 1.5081512928009033,
"learning_rate": 2.6699426709071e-07,
"loss": 0.1444,
"step": 59700
},
{
"epoch": 0.9074905912346728,
"grad_norm": 0.7259085178375244,
"learning_rate": 2.585213458467339e-07,
"loss": 0.1454,
"step": 59800
},
{
"epoch": 0.9090081340293796,
"grad_norm": 0.7794526815414429,
"learning_rate": 2.501814747204551e-07,
"loss": 0.1483,
"step": 59900
},
{
"epoch": 0.9105256768240865,
"grad_norm": 1.0124095678329468,
"learning_rate": 2.4197488773609004e-07,
"loss": 0.1499,
"step": 60000
},
{
"epoch": 0.9105256768240865,
"eval_loss": 0.4795108139514923,
"eval_runtime": 988.4632,
"eval_samples_per_second": 50.38,
"eval_steps_per_second": 6.298,
"step": 60000
},
{
"epoch": 0.9120432196187932,
"grad_norm": 0.6119298934936523,
"learning_rate": 2.3390181517778665e-07,
"loss": 0.1459,
"step": 60100
},
{
"epoch": 0.9135607624135,
"grad_norm": 0.997187614440918,
"learning_rate": 2.2596248358315699e-07,
"loss": 0.1525,
"step": 60200
},
{
"epoch": 0.9150783052082069,
"grad_norm": 0.8470273017883301,
"learning_rate": 2.1815711573692222e-07,
"loss": 0.1485,
"step": 60300
},
{
"epoch": 0.9165958480029137,
"grad_norm": 29.62990951538086,
"learning_rate": 2.104859306646623e-07,
"loss": 0.1497,
"step": 60400
},
{
"epoch": 0.9181133907976204,
"grad_norm": 2.7757768630981445,
"learning_rate": 2.0294914362666895e-07,
"loss": 0.1453,
"step": 60500
},
{
"epoch": 0.9196309335923273,
"grad_norm": 1.834951400756836,
"learning_rate": 1.955469661119047e-07,
"loss": 0.1486,
"step": 60600
},
{
"epoch": 0.9211484763870341,
"grad_norm": 1.0161601305007935,
"learning_rate": 1.8827960583206906e-07,
"loss": 0.1478,
"step": 60700
},
{
"epoch": 0.922666019181741,
"grad_norm": 0.5595516562461853,
"learning_rate": 1.8114726671576988e-07,
"loss": 0.1499,
"step": 60800
},
{
"epoch": 0.9241835619764477,
"grad_norm": 1.4370942115783691,
"learning_rate": 1.7415014890280024e-07,
"loss": 0.1532,
"step": 60900
},
{
"epoch": 0.9257011047711545,
"grad_norm": 1.0193774700164795,
"learning_rate": 1.6728844873852402e-07,
"loss": 0.1533,
"step": 61000
},
{
"epoch": 0.9272186475658614,
"grad_norm": 1.3867748975753784,
"learning_rate": 1.6056235876836413e-07,
"loss": 0.1527,
"step": 61100
},
{
"epoch": 0.9287361903605682,
"grad_norm": 0.9046171307563782,
"learning_rate": 1.5397206773240136e-07,
"loss": 0.1462,
"step": 61200
},
{
"epoch": 0.9302537331552749,
"grad_norm": 0.798395037651062,
"learning_rate": 1.4751776056007583e-07,
"loss": 0.155,
"step": 61300
},
{
"epoch": 0.9317712759499818,
"grad_norm": 1.0858103036880493,
"learning_rate": 1.4119961836500218e-07,
"loss": 0.151,
"step": 61400
},
{
"epoch": 0.9332888187446886,
"grad_norm": 0.8098335266113281,
"learning_rate": 1.3501781843988038e-07,
"loss": 0.1438,
"step": 61500
},
{
"epoch": 0.9348063615393954,
"grad_norm": 1.0074046850204468,
"learning_rate": 1.2897253425152855e-07,
"loss": 0.1452,
"step": 61600
},
{
"epoch": 0.9363239043341022,
"grad_norm": 1.212449312210083,
"learning_rate": 1.230639354360086e-07,
"loss": 0.1464,
"step": 61700
},
{
"epoch": 0.937841447128809,
"grad_norm": 0.6493209600448608,
"learning_rate": 1.1729218779387208e-07,
"loss": 0.1487,
"step": 61800
},
{
"epoch": 0.9393589899235159,
"grad_norm": 0.5616501569747925,
"learning_rate": 1.1165745328550081e-07,
"loss": 0.1537,
"step": 61900
},
{
"epoch": 0.9408765327182227,
"grad_norm": 0.7600206732749939,
"learning_rate": 1.0615989002657034e-07,
"loss": 0.1532,
"step": 62000
},
{
"epoch": 0.9423940755129294,
"grad_norm": 0.9349325895309448,
"learning_rate": 1.0079965228360411e-07,
"loss": 0.1486,
"step": 62100
},
{
"epoch": 0.9439116183076363,
"grad_norm": 1.0201034545898438,
"learning_rate": 9.557689046965302e-08,
"loss": 0.1524,
"step": 62200
},
{
"epoch": 0.9454291611023431,
"grad_norm": 0.7782607078552246,
"learning_rate": 9.049175114006825e-08,
"loss": 0.1523,
"step": 62300
},
{
"epoch": 0.9469467038970499,
"grad_norm": 1.189127802848816,
"learning_rate": 8.55443769883929e-08,
"loss": 0.1591,
"step": 62400
},
{
"epoch": 0.9484642466917567,
"grad_norm": 0.599915623664856,
"learning_rate": 8.07349068423563e-08,
"loss": 0.1541,
"step": 62500
},
{
"epoch": 0.9499817894864635,
"grad_norm": 1.3700199127197266,
"learning_rate": 7.606347565997652e-08,
"loss": 0.148,
"step": 62600
},
{
"epoch": 0.9514993322811703,
"grad_norm": 0.8257864117622375,
"learning_rate": 7.153021452577846e-08,
"loss": 0.1462,
"step": 62700
},
{
"epoch": 0.9530168750758772,
"grad_norm": 0.75310218334198,
"learning_rate": 6.713525064710958e-08,
"loss": 0.1481,
"step": 62800
},
{
"epoch": 0.9545344178705839,
"grad_norm": 1.1317468881607056,
"learning_rate": 6.287870735057488e-08,
"loss": 0.1492,
"step": 62900
},
{
"epoch": 0.9560519606652907,
"grad_norm": 0.7714105248451233,
"learning_rate": 5.8760704078572593e-08,
"loss": 0.1506,
"step": 63000
},
{
"epoch": 0.9575695034599976,
"grad_norm": 1.6061201095581055,
"learning_rate": 5.478135638594617e-08,
"loss": 0.1503,
"step": 63100
},
{
"epoch": 0.9590870462547044,
"grad_norm": 0.6467359066009521,
"learning_rate": 5.094077593673863e-08,
"loss": 0.1448,
"step": 63200
},
{
"epoch": 0.9606045890494112,
"grad_norm": 0.6276798844337463,
"learning_rate": 4.723907050106169e-08,
"loss": 0.1508,
"step": 63300
},
{
"epoch": 0.962122131844118,
"grad_norm": 0.886906087398529,
"learning_rate": 4.3676343952068765e-08,
"loss": 0.1412,
"step": 63400
},
{
"epoch": 0.9636396746388248,
"grad_norm": 0.833092212677002,
"learning_rate": 4.0252696263043956e-08,
"loss": 0.1562,
"step": 63500
},
{
"epoch": 0.9651572174335317,
"grad_norm": 1.2917169332504272,
"learning_rate": 3.696822350459206e-08,
"loss": 0.1514,
"step": 63600
},
{
"epoch": 0.9666747602282384,
"grad_norm": 1.8088849782943726,
"learning_rate": 3.382301784194686e-08,
"loss": 0.1483,
"step": 63700
},
{
"epoch": 0.9681923030229452,
"grad_norm": 2.2121670246124268,
"learning_rate": 3.0817167532383727e-08,
"loss": 0.1434,
"step": 63800
},
{
"epoch": 0.9697098458176521,
"grad_norm": 0.5262467861175537,
"learning_rate": 2.795075692274052e-08,
"loss": 0.1514,
"step": 63900
},
{
"epoch": 0.9712273886123589,
"grad_norm": 0.9518747925758362,
"learning_rate": 2.5223866447055544e-08,
"loss": 0.1475,
"step": 64000
},
{
"epoch": 0.9727449314070656,
"grad_norm": 0.6502547264099121,
"learning_rate": 2.2636572624304964e-08,
"loss": 0.1463,
"step": 64100
},
{
"epoch": 0.9742624742017725,
"grad_norm": 1.0486894845962524,
"learning_rate": 2.018894805626115e-08,
"loss": 0.1543,
"step": 64200
},
{
"epoch": 0.9757800169964793,
"grad_norm": 0.6395145654678345,
"learning_rate": 1.788106142545043e-08,
"loss": 0.1543,
"step": 64300
},
{
"epoch": 0.9772975597911862,
"grad_norm": 4.918582916259766,
"learning_rate": 1.5712977493229088e-08,
"loss": 0.1473,
"step": 64400
},
{
"epoch": 0.9788151025858929,
"grad_norm": 0.7501472234725952,
"learning_rate": 1.3684757097965351e-08,
"loss": 0.1473,
"step": 64500
},
{
"epoch": 0.9803326453805997,
"grad_norm": 4.3455491065979,
"learning_rate": 1.179645715333133e-08,
"loss": 0.1506,
"step": 64600
},
{
"epoch": 0.9818501881753066,
"grad_norm": 1.2910078763961792,
"learning_rate": 1.004813064670651e-08,
"loss": 0.1474,
"step": 64700
},
{
"epoch": 0.9833677309700134,
"grad_norm": 2.1404385566711426,
"learning_rate": 8.439826637691162e-09,
"loss": 0.1517,
"step": 64800
},
{
"epoch": 0.9848852737647201,
"grad_norm": 1.020157814025879,
"learning_rate": 6.971590256729666e-09,
"loss": 0.1517,
"step": 64900
},
{
"epoch": 0.986402816559427,
"grad_norm": 0.48167684674263,
"learning_rate": 5.643462703843749e-09,
"loss": 0.1505,
"step": 65000
},
{
"epoch": 0.9879203593541338,
"grad_norm": 0.8583424091339111,
"learning_rate": 4.455481247476745e-09,
"loss": 0.1554,
"step": 65100
},
{
"epoch": 0.9894379021488406,
"grad_norm": 0.6753373742103577,
"learning_rate": 3.407679223446647e-09,
"loss": 0.1524,
"step": 65200
},
{
"epoch": 0.9909554449435474,
"grad_norm": 0.7973353266716003,
"learning_rate": 2.5000860340124167e-09,
"loss": 0.1569,
"step": 65300
},
{
"epoch": 0.9924729877382542,
"grad_norm": 3.2762601375579834,
"learning_rate": 1.7327271470479746e-09,
"loss": 0.1558,
"step": 65400
},
{
"epoch": 0.993990530532961,
"grad_norm": 1.0592360496520996,
"learning_rate": 1.1056240953283281e-09,
"loss": 0.1491,
"step": 65500
},
{
"epoch": 0.9955080733276679,
"grad_norm": 1.079147458076477,
"learning_rate": 6.18794475923945e-10,
"loss": 0.1515,
"step": 65600
},
{
"epoch": 0.9970256161223746,
"grad_norm": 0.4301016628742218,
"learning_rate": 2.722519497072584e-10,
"loss": 0.1533,
"step": 65700
},
{
"epoch": 0.9985431589170815,
"grad_norm": 3.838508367538452,
"learning_rate": 6.600624097075071e-11,
"loss": 0.1514,
"step": 65800
},
{
"epoch": 1.0,
"step": 65896,
"total_flos": 1.1742044747616118e+20,
"train_loss": 0.1813333111942806,
"train_runtime": 211973.7233,
"train_samples_per_second": 9.948,
"train_steps_per_second": 0.311
}
],
"logging_steps": 100,
"max_steps": 65896,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1742044747616118e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}