diff --git "a/checkpoint-52810/trainer_state.json" "b/checkpoint-52810/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-52810/trainer_state.json" @@ -0,0 +1,37055 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 52810, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009467903806097331, + "grad_norm": 1.3941882848739624, + "learning_rate": 1.999621283847756e-05, + "loss": 0.6629, + "step": 10 + }, + { + "epoch": 0.0018935807612194661, + "grad_norm": 1.248525619506836, + "learning_rate": 1.9992425676955122e-05, + "loss": 0.554, + "step": 20 + }, + { + "epoch": 0.002840371141829199, + "grad_norm": 1.0281944274902344, + "learning_rate": 1.9988638515432685e-05, + "loss": 0.462, + "step": 30 + }, + { + "epoch": 0.0037871615224389322, + "grad_norm": 0.8631435036659241, + "learning_rate": 1.9984851353910245e-05, + "loss": 0.3908, + "step": 40 + }, + { + "epoch": 0.004733951903048665, + "grad_norm": 0.697015106678009, + "learning_rate": 1.9981064192387805e-05, + "loss": 0.3349, + "step": 50 + }, + { + "epoch": 0.005680742283658398, + "grad_norm": 0.6066704988479614, + "learning_rate": 1.997727703086537e-05, + "loss": 0.2918, + "step": 60 + }, + { + "epoch": 0.006627532664268131, + "grad_norm": 0.5035427808761597, + "learning_rate": 1.997348986934293e-05, + "loss": 0.2571, + "step": 70 + }, + { + "epoch": 0.0075743230448778644, + "grad_norm": 0.4318634271621704, + "learning_rate": 1.9969702707820492e-05, + "loss": 0.2356, + "step": 80 + }, + { + "epoch": 0.008521113425487597, + "grad_norm": 0.38201332092285156, + "learning_rate": 1.9965915546298052e-05, + "loss": 0.2173, + "step": 90 + }, + { + "epoch": 0.00946790380609733, + "grad_norm": 0.35830944776535034, + "learning_rate": 1.9962128384775612e-05, + "loss": 0.2039, + "step": 100 + }, + { + "epoch": 0.010414694186707063, + "grad_norm": 0.31125694513320923, + "learning_rate": 1.9958341223253172e-05, + "loss": 0.1964, + "step": 110 + }, + { + "epoch": 0.011361484567316796, + "grad_norm": 0.3396235406398773, + "learning_rate": 1.9954554061730736e-05, + "loss": 0.1988, + "step": 120 + }, + { + "epoch": 0.01230827494792653, + "grad_norm": 0.272406667470932, + "learning_rate": 1.9950766900208296e-05, + "loss": 0.1848, + "step": 130 + }, + { + "epoch": 0.013255065328536262, + "grad_norm": 0.27646365761756897, + "learning_rate": 1.9946979738685856e-05, + "loss": 0.1815, + "step": 140 + }, + { + "epoch": 0.014201855709145995, + "grad_norm": 0.3154810070991516, + "learning_rate": 1.9943192577163416e-05, + "loss": 0.1741, + "step": 150 + }, + { + "epoch": 0.015148646089755729, + "grad_norm": 0.2073918879032135, + "learning_rate": 1.993940541564098e-05, + "loss": 0.1785, + "step": 160 + }, + { + "epoch": 0.01609543647036546, + "grad_norm": 0.23369823396205902, + "learning_rate": 1.993561825411854e-05, + "loss": 0.1729, + "step": 170 + }, + { + "epoch": 0.017042226850975194, + "grad_norm": 0.21863120794296265, + "learning_rate": 1.99318310925961e-05, + "loss": 0.1715, + "step": 180 + }, + { + "epoch": 0.017989017231584928, + "grad_norm": 0.24868248403072357, + "learning_rate": 1.9928043931073663e-05, + "loss": 0.1679, + "step": 190 + }, + { + "epoch": 0.01893580761219466, + "grad_norm": 0.25883275270462036, + "learning_rate": 1.9924256769551223e-05, + "loss": 0.1668, + "step": 200 + }, + { + "epoch": 0.019882597992804393, + "grad_norm": 0.23148854076862335, + "learning_rate": 1.9920469608028786e-05, + "loss": 0.1624, + "step": 210 + }, + { + "epoch": 0.020829388373414127, + "grad_norm": 0.2580638825893402, + "learning_rate": 1.9916682446506346e-05, + "loss": 0.1667, + "step": 220 + }, + { + "epoch": 0.021776178754023857, + "grad_norm": 0.28563928604125977, + "learning_rate": 1.9912895284983906e-05, + "loss": 0.1673, + "step": 230 + }, + { + "epoch": 0.02272296913463359, + "grad_norm": 0.23611009120941162, + "learning_rate": 1.9909108123461467e-05, + "loss": 0.1687, + "step": 240 + }, + { + "epoch": 0.023669759515243326, + "grad_norm": 0.27900001406669617, + "learning_rate": 1.9905320961939027e-05, + "loss": 0.1626, + "step": 250 + }, + { + "epoch": 0.02461654989585306, + "grad_norm": 0.32736676931381226, + "learning_rate": 1.990153380041659e-05, + "loss": 0.1647, + "step": 260 + }, + { + "epoch": 0.02556334027646279, + "grad_norm": 0.25693920254707336, + "learning_rate": 1.989774663889415e-05, + "loss": 0.154, + "step": 270 + }, + { + "epoch": 0.026510130657072525, + "grad_norm": 0.2522478401660919, + "learning_rate": 1.989395947737171e-05, + "loss": 0.1672, + "step": 280 + }, + { + "epoch": 0.02745692103768226, + "grad_norm": 0.20121590793132782, + "learning_rate": 1.9890172315849274e-05, + "loss": 0.1585, + "step": 290 + }, + { + "epoch": 0.02840371141829199, + "grad_norm": 0.2583962380886078, + "learning_rate": 1.9886385154326834e-05, + "loss": 0.1558, + "step": 300 + }, + { + "epoch": 0.029350501798901724, + "grad_norm": 0.2211063653230667, + "learning_rate": 1.9882597992804397e-05, + "loss": 0.1645, + "step": 310 + }, + { + "epoch": 0.030297292179511458, + "grad_norm": 0.2027604579925537, + "learning_rate": 1.9878810831281957e-05, + "loss": 0.1608, + "step": 320 + }, + { + "epoch": 0.03124408256012119, + "grad_norm": 0.2854134738445282, + "learning_rate": 1.9875023669759517e-05, + "loss": 0.1599, + "step": 330 + }, + { + "epoch": 0.03219087294073092, + "grad_norm": 0.18335621058940887, + "learning_rate": 1.9871236508237077e-05, + "loss": 0.1688, + "step": 340 + }, + { + "epoch": 0.03313766332134065, + "grad_norm": 0.24929380416870117, + "learning_rate": 1.986744934671464e-05, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 0.03408445370195039, + "grad_norm": 0.20108000934123993, + "learning_rate": 1.98636621851922e-05, + "loss": 0.156, + "step": 360 + }, + { + "epoch": 0.03503124408256012, + "grad_norm": 0.24530109763145447, + "learning_rate": 1.985987502366976e-05, + "loss": 0.1497, + "step": 370 + }, + { + "epoch": 0.035978034463169856, + "grad_norm": 0.2246169000864029, + "learning_rate": 1.985608786214732e-05, + "loss": 0.1634, + "step": 380 + }, + { + "epoch": 0.03692482484377959, + "grad_norm": 0.2763858735561371, + "learning_rate": 1.985230070062488e-05, + "loss": 0.1592, + "step": 390 + }, + { + "epoch": 0.03787161522438932, + "grad_norm": 0.19560927152633667, + "learning_rate": 1.9848513539102444e-05, + "loss": 0.1488, + "step": 400 + }, + { + "epoch": 0.03881840560499905, + "grad_norm": 0.22450771927833557, + "learning_rate": 1.9844726377580004e-05, + "loss": 0.1575, + "step": 410 + }, + { + "epoch": 0.039765195985608785, + "grad_norm": 0.29325568675994873, + "learning_rate": 1.9840939216057568e-05, + "loss": 0.1591, + "step": 420 + }, + { + "epoch": 0.04071198636621852, + "grad_norm": 0.2534486651420593, + "learning_rate": 1.9837152054535128e-05, + "loss": 0.1588, + "step": 430 + }, + { + "epoch": 0.041658776746828254, + "grad_norm": 0.21553802490234375, + "learning_rate": 1.9833364893012688e-05, + "loss": 0.1572, + "step": 440 + }, + { + "epoch": 0.04260556712743799, + "grad_norm": 0.20270265638828278, + "learning_rate": 1.982957773149025e-05, + "loss": 0.1664, + "step": 450 + }, + { + "epoch": 0.043552357508047715, + "grad_norm": 0.26474466919898987, + "learning_rate": 1.982579056996781e-05, + "loss": 0.1549, + "step": 460 + }, + { + "epoch": 0.04449914788865745, + "grad_norm": 0.23204047977924347, + "learning_rate": 1.982200340844537e-05, + "loss": 0.157, + "step": 470 + }, + { + "epoch": 0.04544593826926718, + "grad_norm": 0.22077463567256927, + "learning_rate": 1.981821624692293e-05, + "loss": 0.1601, + "step": 480 + }, + { + "epoch": 0.04639272864987692, + "grad_norm": 0.24180015921592712, + "learning_rate": 1.9814429085400495e-05, + "loss": 0.1679, + "step": 490 + }, + { + "epoch": 0.04733951903048665, + "grad_norm": 0.20309610664844513, + "learning_rate": 1.9810641923878055e-05, + "loss": 0.15, + "step": 500 + }, + { + "epoch": 0.048286309411096386, + "grad_norm": 0.2009934037923813, + "learning_rate": 1.9806854762355615e-05, + "loss": 0.1614, + "step": 510 + }, + { + "epoch": 0.04923309979170612, + "grad_norm": 0.29323482513427734, + "learning_rate": 1.980306760083318e-05, + "loss": 0.1635, + "step": 520 + }, + { + "epoch": 0.05017989017231585, + "grad_norm": 0.20960304141044617, + "learning_rate": 1.979928043931074e-05, + "loss": 0.1522, + "step": 530 + }, + { + "epoch": 0.05112668055292558, + "grad_norm": 0.22783887386322021, + "learning_rate": 1.97954932777883e-05, + "loss": 0.1573, + "step": 540 + }, + { + "epoch": 0.052073470933535315, + "grad_norm": 0.22936321794986725, + "learning_rate": 1.9791706116265862e-05, + "loss": 0.1436, + "step": 550 + }, + { + "epoch": 0.05302026131414505, + "grad_norm": 0.22300918400287628, + "learning_rate": 1.9787918954743422e-05, + "loss": 0.1546, + "step": 560 + }, + { + "epoch": 0.053967051694754783, + "grad_norm": 0.22048968076705933, + "learning_rate": 1.9784131793220982e-05, + "loss": 0.1416, + "step": 570 + }, + { + "epoch": 0.05491384207536452, + "grad_norm": 0.254375696182251, + "learning_rate": 1.9780344631698542e-05, + "loss": 0.1532, + "step": 580 + }, + { + "epoch": 0.055860632455974245, + "grad_norm": 0.3245167136192322, + "learning_rate": 1.9776557470176106e-05, + "loss": 0.15, + "step": 590 + }, + { + "epoch": 0.05680742283658398, + "grad_norm": 0.21500037610530853, + "learning_rate": 1.9772770308653666e-05, + "loss": 0.1524, + "step": 600 + }, + { + "epoch": 0.05775421321719371, + "grad_norm": 0.19425217807292938, + "learning_rate": 1.9768983147131226e-05, + "loss": 0.1462, + "step": 610 + }, + { + "epoch": 0.05870100359780345, + "grad_norm": 0.23880882561206818, + "learning_rate": 1.9765195985608786e-05, + "loss": 0.1477, + "step": 620 + }, + { + "epoch": 0.05964779397841318, + "grad_norm": 0.2194243222475052, + "learning_rate": 1.976140882408635e-05, + "loss": 0.1501, + "step": 630 + }, + { + "epoch": 0.060594584359022916, + "grad_norm": 0.3316631317138672, + "learning_rate": 1.975762166256391e-05, + "loss": 0.158, + "step": 640 + }, + { + "epoch": 0.06154137473963264, + "grad_norm": 0.23250705003738403, + "learning_rate": 1.9753834501041473e-05, + "loss": 0.1486, + "step": 650 + }, + { + "epoch": 0.06248816512024238, + "grad_norm": 0.211704283952713, + "learning_rate": 1.9750047339519033e-05, + "loss": 0.1487, + "step": 660 + }, + { + "epoch": 0.06343495550085211, + "grad_norm": 0.30874431133270264, + "learning_rate": 1.9746260177996593e-05, + "loss": 0.1529, + "step": 670 + }, + { + "epoch": 0.06438174588146184, + "grad_norm": 0.26447850465774536, + "learning_rate": 1.9742473016474156e-05, + "loss": 0.1521, + "step": 680 + }, + { + "epoch": 0.06532853626207158, + "grad_norm": 0.23446421325206757, + "learning_rate": 1.9738685854951716e-05, + "loss": 0.1525, + "step": 690 + }, + { + "epoch": 0.0662753266426813, + "grad_norm": 0.2225397229194641, + "learning_rate": 1.9734898693429276e-05, + "loss": 0.1476, + "step": 700 + }, + { + "epoch": 0.06722211702329105, + "grad_norm": 0.20426969230175018, + "learning_rate": 1.9731111531906836e-05, + "loss": 0.1564, + "step": 710 + }, + { + "epoch": 0.06816890740390077, + "grad_norm": 0.27150651812553406, + "learning_rate": 1.9727324370384396e-05, + "loss": 0.1459, + "step": 720 + }, + { + "epoch": 0.06911569778451052, + "grad_norm": 0.23939593136310577, + "learning_rate": 1.972353720886196e-05, + "loss": 0.1399, + "step": 730 + }, + { + "epoch": 0.07006248816512024, + "grad_norm": 0.26259246468544006, + "learning_rate": 1.971975004733952e-05, + "loss": 0.1483, + "step": 740 + }, + { + "epoch": 0.07100927854572997, + "grad_norm": 0.22480525076389313, + "learning_rate": 1.971596288581708e-05, + "loss": 0.14, + "step": 750 + }, + { + "epoch": 0.07195606892633971, + "grad_norm": 0.2269698679447174, + "learning_rate": 1.9712175724294643e-05, + "loss": 0.1497, + "step": 760 + }, + { + "epoch": 0.07290285930694944, + "grad_norm": 0.216821551322937, + "learning_rate": 1.9708388562772203e-05, + "loss": 0.1422, + "step": 770 + }, + { + "epoch": 0.07384964968755918, + "grad_norm": 0.24823130667209625, + "learning_rate": 1.9704601401249767e-05, + "loss": 0.1436, + "step": 780 + }, + { + "epoch": 0.0747964400681689, + "grad_norm": 0.3634086847305298, + "learning_rate": 1.9700814239727327e-05, + "loss": 0.1579, + "step": 790 + }, + { + "epoch": 0.07574323044877863, + "grad_norm": 0.24505460262298584, + "learning_rate": 1.9697027078204887e-05, + "loss": 0.1423, + "step": 800 + }, + { + "epoch": 0.07669002082938838, + "grad_norm": 0.30174288153648376, + "learning_rate": 1.9693239916682447e-05, + "loss": 0.1504, + "step": 810 + }, + { + "epoch": 0.0776368112099981, + "grad_norm": 0.2679823637008667, + "learning_rate": 1.968945275516001e-05, + "loss": 0.1428, + "step": 820 + }, + { + "epoch": 0.07858360159060784, + "grad_norm": 0.30597180128097534, + "learning_rate": 1.968566559363757e-05, + "loss": 0.1487, + "step": 830 + }, + { + "epoch": 0.07953039197121757, + "grad_norm": 0.25198864936828613, + "learning_rate": 1.968187843211513e-05, + "loss": 0.1442, + "step": 840 + }, + { + "epoch": 0.08047718235182731, + "grad_norm": 0.23763233423233032, + "learning_rate": 1.967809127059269e-05, + "loss": 0.1437, + "step": 850 + }, + { + "epoch": 0.08142397273243704, + "grad_norm": 0.20261543989181519, + "learning_rate": 1.9674304109070254e-05, + "loss": 0.146, + "step": 860 + }, + { + "epoch": 0.08237076311304677, + "grad_norm": 0.24674880504608154, + "learning_rate": 1.9670516947547814e-05, + "loss": 0.1485, + "step": 870 + }, + { + "epoch": 0.08331755349365651, + "grad_norm": 0.23125316202640533, + "learning_rate": 1.9666729786025377e-05, + "loss": 0.1434, + "step": 880 + }, + { + "epoch": 0.08426434387426623, + "grad_norm": 0.2659028470516205, + "learning_rate": 1.9662942624502938e-05, + "loss": 0.1428, + "step": 890 + }, + { + "epoch": 0.08521113425487598, + "grad_norm": 0.22943943738937378, + "learning_rate": 1.9659155462980498e-05, + "loss": 0.1474, + "step": 900 + }, + { + "epoch": 0.0861579246354857, + "grad_norm": 0.25804975628852844, + "learning_rate": 1.965536830145806e-05, + "loss": 0.1404, + "step": 910 + }, + { + "epoch": 0.08710471501609543, + "grad_norm": 0.26774296164512634, + "learning_rate": 1.965158113993562e-05, + "loss": 0.1433, + "step": 920 + }, + { + "epoch": 0.08805150539670517, + "grad_norm": 0.2502722144126892, + "learning_rate": 1.964779397841318e-05, + "loss": 0.1391, + "step": 930 + }, + { + "epoch": 0.0889982957773149, + "grad_norm": 0.22401435673236847, + "learning_rate": 1.964400681689074e-05, + "loss": 0.145, + "step": 940 + }, + { + "epoch": 0.08994508615792464, + "grad_norm": 0.25525417923927307, + "learning_rate": 1.96402196553683e-05, + "loss": 0.1481, + "step": 950 + }, + { + "epoch": 0.09089187653853437, + "grad_norm": 0.2963325083255768, + "learning_rate": 1.9636432493845865e-05, + "loss": 0.1452, + "step": 960 + }, + { + "epoch": 0.09183866691914411, + "grad_norm": 0.257304847240448, + "learning_rate": 1.9632645332323425e-05, + "loss": 0.1409, + "step": 970 + }, + { + "epoch": 0.09278545729975383, + "grad_norm": 0.3979777991771698, + "learning_rate": 1.9628858170800985e-05, + "loss": 0.1363, + "step": 980 + }, + { + "epoch": 0.09373224768036356, + "grad_norm": 0.2767225205898285, + "learning_rate": 1.9625071009278548e-05, + "loss": 0.1402, + "step": 990 + }, + { + "epoch": 0.0946790380609733, + "grad_norm": 0.22073742747306824, + "learning_rate": 1.9621283847756108e-05, + "loss": 0.1471, + "step": 1000 + }, + { + "epoch": 0.09562582844158303, + "grad_norm": 0.2693091928958893, + "learning_rate": 1.961749668623367e-05, + "loss": 0.1505, + "step": 1010 + }, + { + "epoch": 0.09657261882219277, + "grad_norm": 0.24559836089611053, + "learning_rate": 1.9613709524711232e-05, + "loss": 0.1369, + "step": 1020 + }, + { + "epoch": 0.0975194092028025, + "grad_norm": 0.22167791426181793, + "learning_rate": 1.9609922363188792e-05, + "loss": 0.1348, + "step": 1030 + }, + { + "epoch": 0.09846619958341224, + "grad_norm": 0.23056115210056305, + "learning_rate": 1.9606135201666352e-05, + "loss": 0.1394, + "step": 1040 + }, + { + "epoch": 0.09941298996402197, + "grad_norm": 0.35368019342422485, + "learning_rate": 1.9602348040143915e-05, + "loss": 0.1485, + "step": 1050 + }, + { + "epoch": 0.1003597803446317, + "grad_norm": 0.35556063055992126, + "learning_rate": 1.9598560878621475e-05, + "loss": 0.1489, + "step": 1060 + }, + { + "epoch": 0.10130657072524143, + "grad_norm": 0.2518173158168793, + "learning_rate": 1.9594773717099035e-05, + "loss": 0.1489, + "step": 1070 + }, + { + "epoch": 0.10225336110585116, + "grad_norm": 0.2986575663089752, + "learning_rate": 1.9590986555576595e-05, + "loss": 0.1397, + "step": 1080 + }, + { + "epoch": 0.1032001514864609, + "grad_norm": 0.22278901934623718, + "learning_rate": 1.958719939405416e-05, + "loss": 0.1322, + "step": 1090 + }, + { + "epoch": 0.10414694186707063, + "grad_norm": 0.2492545247077942, + "learning_rate": 1.958341223253172e-05, + "loss": 0.1344, + "step": 1100 + }, + { + "epoch": 0.10509373224768036, + "grad_norm": 0.348187118768692, + "learning_rate": 1.957962507100928e-05, + "loss": 0.15, + "step": 1110 + }, + { + "epoch": 0.1060405226282901, + "grad_norm": 0.3564904034137726, + "learning_rate": 1.9575837909486842e-05, + "loss": 0.1401, + "step": 1120 + }, + { + "epoch": 0.10698731300889983, + "grad_norm": 0.25616133213043213, + "learning_rate": 1.9572050747964402e-05, + "loss": 0.1451, + "step": 1130 + }, + { + "epoch": 0.10793410338950957, + "grad_norm": 0.2531684637069702, + "learning_rate": 1.9568263586441962e-05, + "loss": 0.1426, + "step": 1140 + }, + { + "epoch": 0.1088808937701193, + "grad_norm": 0.2361394166946411, + "learning_rate": 1.9564476424919526e-05, + "loss": 0.1332, + "step": 1150 + }, + { + "epoch": 0.10982768415072904, + "grad_norm": 0.2564547657966614, + "learning_rate": 1.9560689263397086e-05, + "loss": 0.1352, + "step": 1160 + }, + { + "epoch": 0.11077447453133876, + "grad_norm": 0.2713509500026703, + "learning_rate": 1.9556902101874646e-05, + "loss": 0.1346, + "step": 1170 + }, + { + "epoch": 0.11172126491194849, + "grad_norm": 0.3002881705760956, + "learning_rate": 1.9553114940352206e-05, + "loss": 0.1434, + "step": 1180 + }, + { + "epoch": 0.11266805529255823, + "grad_norm": 0.3270877003669739, + "learning_rate": 1.954932777882977e-05, + "loss": 0.1441, + "step": 1190 + }, + { + "epoch": 0.11361484567316796, + "grad_norm": 0.31670162081718445, + "learning_rate": 1.954554061730733e-05, + "loss": 0.142, + "step": 1200 + }, + { + "epoch": 0.1145616360537777, + "grad_norm": 0.290374755859375, + "learning_rate": 1.954175345578489e-05, + "loss": 0.1443, + "step": 1210 + }, + { + "epoch": 0.11550842643438743, + "grad_norm": 0.25081178545951843, + "learning_rate": 1.9537966294262453e-05, + "loss": 0.1308, + "step": 1220 + }, + { + "epoch": 0.11645521681499715, + "grad_norm": 0.29356658458709717, + "learning_rate": 1.9534179132740013e-05, + "loss": 0.1281, + "step": 1230 + }, + { + "epoch": 0.1174020071956069, + "grad_norm": 0.38654467463493347, + "learning_rate": 1.9530391971217577e-05, + "loss": 0.1515, + "step": 1240 + }, + { + "epoch": 0.11834879757621662, + "grad_norm": 0.2073059380054474, + "learning_rate": 1.9526604809695137e-05, + "loss": 0.13, + "step": 1250 + }, + { + "epoch": 0.11929558795682636, + "grad_norm": 0.21164889633655548, + "learning_rate": 1.9522817648172697e-05, + "loss": 0.1325, + "step": 1260 + }, + { + "epoch": 0.12024237833743609, + "grad_norm": 0.28698667883872986, + "learning_rate": 1.9519030486650257e-05, + "loss": 0.1349, + "step": 1270 + }, + { + "epoch": 0.12118916871804583, + "grad_norm": 0.24369283020496368, + "learning_rate": 1.9515243325127817e-05, + "loss": 0.1364, + "step": 1280 + }, + { + "epoch": 0.12213595909865556, + "grad_norm": 0.24442093074321747, + "learning_rate": 1.951145616360538e-05, + "loss": 0.134, + "step": 1290 + }, + { + "epoch": 0.12308274947926529, + "grad_norm": 0.21796512603759766, + "learning_rate": 1.950766900208294e-05, + "loss": 0.1447, + "step": 1300 + }, + { + "epoch": 0.12402953985987503, + "grad_norm": 0.2371196150779724, + "learning_rate": 1.95038818405605e-05, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 0.12497633024048475, + "grad_norm": 0.31082502007484436, + "learning_rate": 1.950009467903806e-05, + "loss": 0.1288, + "step": 1320 + }, + { + "epoch": 0.12592312062109448, + "grad_norm": 0.24794170260429382, + "learning_rate": 1.9496307517515624e-05, + "loss": 0.1293, + "step": 1330 + }, + { + "epoch": 0.12686991100170422, + "grad_norm": 0.21178680658340454, + "learning_rate": 1.9492520355993184e-05, + "loss": 0.1353, + "step": 1340 + }, + { + "epoch": 0.12781670138231396, + "grad_norm": 0.3490561246871948, + "learning_rate": 1.9488733194470747e-05, + "loss": 0.1343, + "step": 1350 + }, + { + "epoch": 0.12876349176292368, + "grad_norm": 0.25812265276908875, + "learning_rate": 1.9484946032948307e-05, + "loss": 0.1304, + "step": 1360 + }, + { + "epoch": 0.12971028214353342, + "grad_norm": 0.2792973220348358, + "learning_rate": 1.9481158871425867e-05, + "loss": 0.135, + "step": 1370 + }, + { + "epoch": 0.13065707252414316, + "grad_norm": 0.4771386384963989, + "learning_rate": 1.947737170990343e-05, + "loss": 0.1367, + "step": 1380 + }, + { + "epoch": 0.1316038629047529, + "grad_norm": 0.2986657917499542, + "learning_rate": 1.947358454838099e-05, + "loss": 0.127, + "step": 1390 + }, + { + "epoch": 0.1325506532853626, + "grad_norm": 0.2369564324617386, + "learning_rate": 1.946979738685855e-05, + "loss": 0.132, + "step": 1400 + }, + { + "epoch": 0.13349744366597235, + "grad_norm": 0.27226126194000244, + "learning_rate": 1.946601022533611e-05, + "loss": 0.1347, + "step": 1410 + }, + { + "epoch": 0.1344442340465821, + "grad_norm": 0.26052626967430115, + "learning_rate": 1.946222306381367e-05, + "loss": 0.1331, + "step": 1420 + }, + { + "epoch": 0.1353910244271918, + "grad_norm": 0.34890538454055786, + "learning_rate": 1.9458435902291234e-05, + "loss": 0.1347, + "step": 1430 + }, + { + "epoch": 0.13633781480780155, + "grad_norm": 0.2817912995815277, + "learning_rate": 1.9454648740768794e-05, + "loss": 0.1307, + "step": 1440 + }, + { + "epoch": 0.1372846051884113, + "grad_norm": 0.2826266884803772, + "learning_rate": 1.9450861579246358e-05, + "loss": 0.1426, + "step": 1450 + }, + { + "epoch": 0.13823139556902103, + "grad_norm": 0.29997652769088745, + "learning_rate": 1.9447074417723918e-05, + "loss": 0.1327, + "step": 1460 + }, + { + "epoch": 0.13917818594963074, + "grad_norm": 0.30327969789505005, + "learning_rate": 1.9443287256201478e-05, + "loss": 0.1326, + "step": 1470 + }, + { + "epoch": 0.14012497633024049, + "grad_norm": 0.2640013098716736, + "learning_rate": 1.943950009467904e-05, + "loss": 0.1376, + "step": 1480 + }, + { + "epoch": 0.14107176671085023, + "grad_norm": 0.24050156772136688, + "learning_rate": 1.94357129331566e-05, + "loss": 0.132, + "step": 1490 + }, + { + "epoch": 0.14201855709145994, + "grad_norm": 0.34889718890190125, + "learning_rate": 1.943192577163416e-05, + "loss": 0.1286, + "step": 1500 + }, + { + "epoch": 0.14296534747206968, + "grad_norm": 0.24513930082321167, + "learning_rate": 1.942813861011172e-05, + "loss": 0.1351, + "step": 1510 + }, + { + "epoch": 0.14391213785267942, + "grad_norm": 0.3242141902446747, + "learning_rate": 1.9424351448589285e-05, + "loss": 0.1357, + "step": 1520 + }, + { + "epoch": 0.14485892823328914, + "grad_norm": 0.25881341099739075, + "learning_rate": 1.9420564287066845e-05, + "loss": 0.1307, + "step": 1530 + }, + { + "epoch": 0.14580571861389888, + "grad_norm": 0.24430468678474426, + "learning_rate": 1.9416777125544405e-05, + "loss": 0.1208, + "step": 1540 + }, + { + "epoch": 0.14675250899450862, + "grad_norm": 0.32587867975234985, + "learning_rate": 1.9412989964021965e-05, + "loss": 0.133, + "step": 1550 + }, + { + "epoch": 0.14769929937511836, + "grad_norm": 0.30574116110801697, + "learning_rate": 1.940920280249953e-05, + "loss": 0.1321, + "step": 1560 + }, + { + "epoch": 0.14864608975572807, + "grad_norm": 0.316580206155777, + "learning_rate": 1.940541564097709e-05, + "loss": 0.1387, + "step": 1570 + }, + { + "epoch": 0.1495928801363378, + "grad_norm": 0.4084049165248871, + "learning_rate": 1.9401628479454652e-05, + "loss": 0.134, + "step": 1580 + }, + { + "epoch": 0.15053967051694755, + "grad_norm": 0.3074231445789337, + "learning_rate": 1.9397841317932212e-05, + "loss": 0.1277, + "step": 1590 + }, + { + "epoch": 0.15148646089755727, + "grad_norm": 0.34363508224487305, + "learning_rate": 1.9394054156409772e-05, + "loss": 0.1305, + "step": 1600 + }, + { + "epoch": 0.152433251278167, + "grad_norm": 0.27845168113708496, + "learning_rate": 1.9390266994887336e-05, + "loss": 0.1259, + "step": 1610 + }, + { + "epoch": 0.15338004165877675, + "grad_norm": 0.2976890504360199, + "learning_rate": 1.9386479833364896e-05, + "loss": 0.1404, + "step": 1620 + }, + { + "epoch": 0.1543268320393865, + "grad_norm": 0.23898760974407196, + "learning_rate": 1.9382692671842456e-05, + "loss": 0.1243, + "step": 1630 + }, + { + "epoch": 0.1552736224199962, + "grad_norm": 0.28521913290023804, + "learning_rate": 1.9378905510320016e-05, + "loss": 0.1265, + "step": 1640 + }, + { + "epoch": 0.15622041280060595, + "grad_norm": 0.34148770570755005, + "learning_rate": 1.9375118348797576e-05, + "loss": 0.1347, + "step": 1650 + }, + { + "epoch": 0.1571672031812157, + "grad_norm": 0.43205875158309937, + "learning_rate": 1.937133118727514e-05, + "loss": 0.1246, + "step": 1660 + }, + { + "epoch": 0.1581139935618254, + "grad_norm": 0.33226126432418823, + "learning_rate": 1.93675440257527e-05, + "loss": 0.1273, + "step": 1670 + }, + { + "epoch": 0.15906078394243514, + "grad_norm": 0.3262731730937958, + "learning_rate": 1.936375686423026e-05, + "loss": 0.1241, + "step": 1680 + }, + { + "epoch": 0.16000757432304488, + "grad_norm": 0.27835527062416077, + "learning_rate": 1.9359969702707823e-05, + "loss": 0.1314, + "step": 1690 + }, + { + "epoch": 0.16095436470365462, + "grad_norm": 0.28034350275993347, + "learning_rate": 1.9356182541185383e-05, + "loss": 0.1297, + "step": 1700 + }, + { + "epoch": 0.16190115508426434, + "grad_norm": 0.2608806788921356, + "learning_rate": 1.9352395379662946e-05, + "loss": 0.1244, + "step": 1710 + }, + { + "epoch": 0.16284794546487408, + "grad_norm": 0.32022422552108765, + "learning_rate": 1.9348608218140506e-05, + "loss": 0.1297, + "step": 1720 + }, + { + "epoch": 0.16379473584548382, + "grad_norm": 0.2928096354007721, + "learning_rate": 1.9344821056618066e-05, + "loss": 0.1246, + "step": 1730 + }, + { + "epoch": 0.16474152622609353, + "grad_norm": 0.3574795126914978, + "learning_rate": 1.9341033895095626e-05, + "loss": 0.1383, + "step": 1740 + }, + { + "epoch": 0.16568831660670327, + "grad_norm": 0.2713389992713928, + "learning_rate": 1.933724673357319e-05, + "loss": 0.1344, + "step": 1750 + }, + { + "epoch": 0.16663510698731301, + "grad_norm": 0.2822892665863037, + "learning_rate": 1.933345957205075e-05, + "loss": 0.1281, + "step": 1760 + }, + { + "epoch": 0.16758189736792276, + "grad_norm": 0.3460506200790405, + "learning_rate": 1.932967241052831e-05, + "loss": 0.1307, + "step": 1770 + }, + { + "epoch": 0.16852868774853247, + "grad_norm": 0.323542058467865, + "learning_rate": 1.932588524900587e-05, + "loss": 0.1362, + "step": 1780 + }, + { + "epoch": 0.1694754781291422, + "grad_norm": 0.30738893151283264, + "learning_rate": 1.9322098087483433e-05, + "loss": 0.1282, + "step": 1790 + }, + { + "epoch": 0.17042226850975195, + "grad_norm": 0.34989845752716064, + "learning_rate": 1.9318310925960994e-05, + "loss": 0.1234, + "step": 1800 + }, + { + "epoch": 0.17136905889036166, + "grad_norm": 0.33833956718444824, + "learning_rate": 1.9314523764438557e-05, + "loss": 0.1313, + "step": 1810 + }, + { + "epoch": 0.1723158492709714, + "grad_norm": 0.3208501636981964, + "learning_rate": 1.9310736602916117e-05, + "loss": 0.136, + "step": 1820 + }, + { + "epoch": 0.17326263965158115, + "grad_norm": 0.38496994972229004, + "learning_rate": 1.9306949441393677e-05, + "loss": 0.1278, + "step": 1830 + }, + { + "epoch": 0.17420943003219086, + "grad_norm": 0.2736138105392456, + "learning_rate": 1.9303162279871237e-05, + "loss": 0.1262, + "step": 1840 + }, + { + "epoch": 0.1751562204128006, + "grad_norm": 0.25814059376716614, + "learning_rate": 1.92993751183488e-05, + "loss": 0.1246, + "step": 1850 + }, + { + "epoch": 0.17610301079341034, + "grad_norm": 0.2308269441127777, + "learning_rate": 1.929558795682636e-05, + "loss": 0.136, + "step": 1860 + }, + { + "epoch": 0.17704980117402008, + "grad_norm": 0.3181948661804199, + "learning_rate": 1.929180079530392e-05, + "loss": 0.1206, + "step": 1870 + }, + { + "epoch": 0.1779965915546298, + "grad_norm": 0.2826565206050873, + "learning_rate": 1.928801363378148e-05, + "loss": 0.1329, + "step": 1880 + }, + { + "epoch": 0.17894338193523954, + "grad_norm": 0.4134462773799896, + "learning_rate": 1.9284226472259044e-05, + "loss": 0.126, + "step": 1890 + }, + { + "epoch": 0.17989017231584928, + "grad_norm": 0.3727562725543976, + "learning_rate": 1.9280439310736604e-05, + "loss": 0.1319, + "step": 1900 + }, + { + "epoch": 0.180836962696459, + "grad_norm": 0.25525522232055664, + "learning_rate": 1.9276652149214164e-05, + "loss": 0.1312, + "step": 1910 + }, + { + "epoch": 0.18178375307706873, + "grad_norm": 0.2545289397239685, + "learning_rate": 1.9272864987691728e-05, + "loss": 0.1367, + "step": 1920 + }, + { + "epoch": 0.18273054345767847, + "grad_norm": 0.3217427730560303, + "learning_rate": 1.9269077826169288e-05, + "loss": 0.1367, + "step": 1930 + }, + { + "epoch": 0.18367733383828821, + "grad_norm": 0.25008267164230347, + "learning_rate": 1.926529066464685e-05, + "loss": 0.1259, + "step": 1940 + }, + { + "epoch": 0.18462412421889793, + "grad_norm": 0.28599464893341064, + "learning_rate": 1.926150350312441e-05, + "loss": 0.1283, + "step": 1950 + }, + { + "epoch": 0.18557091459950767, + "grad_norm": 0.30091628432273865, + "learning_rate": 1.925771634160197e-05, + "loss": 0.1424, + "step": 1960 + }, + { + "epoch": 0.1865177049801174, + "grad_norm": 0.2841487228870392, + "learning_rate": 1.925392918007953e-05, + "loss": 0.1225, + "step": 1970 + }, + { + "epoch": 0.18746449536072712, + "grad_norm": 0.2743360996246338, + "learning_rate": 1.925014201855709e-05, + "loss": 0.1307, + "step": 1980 + }, + { + "epoch": 0.18841128574133686, + "grad_norm": 0.2886848449707031, + "learning_rate": 1.9246354857034655e-05, + "loss": 0.1292, + "step": 1990 + }, + { + "epoch": 0.1893580761219466, + "grad_norm": 0.35317230224609375, + "learning_rate": 1.9242567695512215e-05, + "loss": 0.1357, + "step": 2000 + }, + { + "epoch": 0.19030486650255635, + "grad_norm": 0.2847851514816284, + "learning_rate": 1.9238780533989775e-05, + "loss": 0.1222, + "step": 2010 + }, + { + "epoch": 0.19125165688316606, + "grad_norm": 0.3323739469051361, + "learning_rate": 1.9234993372467335e-05, + "loss": 0.1251, + "step": 2020 + }, + { + "epoch": 0.1921984472637758, + "grad_norm": 0.35399678349494934, + "learning_rate": 1.92312062109449e-05, + "loss": 0.1334, + "step": 2030 + }, + { + "epoch": 0.19314523764438554, + "grad_norm": 0.2679152488708496, + "learning_rate": 1.922741904942246e-05, + "loss": 0.1271, + "step": 2040 + }, + { + "epoch": 0.19409202802499526, + "grad_norm": 0.23062950372695923, + "learning_rate": 1.9223631887900022e-05, + "loss": 0.1259, + "step": 2050 + }, + { + "epoch": 0.195038818405605, + "grad_norm": 0.25115498900413513, + "learning_rate": 1.9219844726377582e-05, + "loss": 0.1283, + "step": 2060 + }, + { + "epoch": 0.19598560878621474, + "grad_norm": 0.2893475294113159, + "learning_rate": 1.9216057564855142e-05, + "loss": 0.1277, + "step": 2070 + }, + { + "epoch": 0.19693239916682448, + "grad_norm": 0.3177919089794159, + "learning_rate": 1.9212270403332705e-05, + "loss": 0.1205, + "step": 2080 + }, + { + "epoch": 0.1978791895474342, + "grad_norm": 0.35728374123573303, + "learning_rate": 1.9208483241810265e-05, + "loss": 0.1345, + "step": 2090 + }, + { + "epoch": 0.19882597992804393, + "grad_norm": 0.3154493570327759, + "learning_rate": 1.9204696080287825e-05, + "loss": 0.1249, + "step": 2100 + }, + { + "epoch": 0.19977277030865367, + "grad_norm": 0.28626155853271484, + "learning_rate": 1.9200908918765386e-05, + "loss": 0.1241, + "step": 2110 + }, + { + "epoch": 0.2007195606892634, + "grad_norm": 0.35578519105911255, + "learning_rate": 1.9197121757242946e-05, + "loss": 0.1339, + "step": 2120 + }, + { + "epoch": 0.20166635106987313, + "grad_norm": 0.2937614917755127, + "learning_rate": 1.919333459572051e-05, + "loss": 0.1289, + "step": 2130 + }, + { + "epoch": 0.20261314145048287, + "grad_norm": 0.30598610639572144, + "learning_rate": 1.918954743419807e-05, + "loss": 0.1265, + "step": 2140 + }, + { + "epoch": 0.20355993183109258, + "grad_norm": 0.2843713164329529, + "learning_rate": 1.9185760272675633e-05, + "loss": 0.1256, + "step": 2150 + }, + { + "epoch": 0.20450672221170232, + "grad_norm": 0.2970130145549774, + "learning_rate": 1.9181973111153193e-05, + "loss": 0.1184, + "step": 2160 + }, + { + "epoch": 0.20545351259231207, + "grad_norm": 0.30623194575309753, + "learning_rate": 1.9178185949630756e-05, + "loss": 0.1272, + "step": 2170 + }, + { + "epoch": 0.2064003029729218, + "grad_norm": 0.3569609224796295, + "learning_rate": 1.9174398788108316e-05, + "loss": 0.1224, + "step": 2180 + }, + { + "epoch": 0.20734709335353152, + "grad_norm": 0.29475313425064087, + "learning_rate": 1.9170611626585876e-05, + "loss": 0.128, + "step": 2190 + }, + { + "epoch": 0.20829388373414126, + "grad_norm": 0.2869347035884857, + "learning_rate": 1.9166824465063436e-05, + "loss": 0.1289, + "step": 2200 + }, + { + "epoch": 0.209240674114751, + "grad_norm": 0.2653000056743622, + "learning_rate": 1.9163037303540996e-05, + "loss": 0.1174, + "step": 2210 + }, + { + "epoch": 0.21018746449536072, + "grad_norm": 0.21829605102539062, + "learning_rate": 1.915925014201856e-05, + "loss": 0.1203, + "step": 2220 + }, + { + "epoch": 0.21113425487597046, + "grad_norm": 0.32572606205940247, + "learning_rate": 1.915546298049612e-05, + "loss": 0.1249, + "step": 2230 + }, + { + "epoch": 0.2120810452565802, + "grad_norm": 0.5544735789299011, + "learning_rate": 1.915167581897368e-05, + "loss": 0.1258, + "step": 2240 + }, + { + "epoch": 0.21302783563718994, + "grad_norm": 0.3420076370239258, + "learning_rate": 1.914788865745124e-05, + "loss": 0.1225, + "step": 2250 + }, + { + "epoch": 0.21397462601779965, + "grad_norm": 0.32641270756721497, + "learning_rate": 1.9144101495928803e-05, + "loss": 0.1334, + "step": 2260 + }, + { + "epoch": 0.2149214163984094, + "grad_norm": 0.3167610168457031, + "learning_rate": 1.9140314334406363e-05, + "loss": 0.1267, + "step": 2270 + }, + { + "epoch": 0.21586820677901913, + "grad_norm": 0.37354180216789246, + "learning_rate": 1.9136527172883927e-05, + "loss": 0.1408, + "step": 2280 + }, + { + "epoch": 0.21681499715962885, + "grad_norm": 0.2738534212112427, + "learning_rate": 1.9132740011361487e-05, + "loss": 0.1231, + "step": 2290 + }, + { + "epoch": 0.2177617875402386, + "grad_norm": 0.3062466084957123, + "learning_rate": 1.9128952849839047e-05, + "loss": 0.1302, + "step": 2300 + }, + { + "epoch": 0.21870857792084833, + "grad_norm": 0.3265305161476135, + "learning_rate": 1.912516568831661e-05, + "loss": 0.121, + "step": 2310 + }, + { + "epoch": 0.21965536830145807, + "grad_norm": 0.3429173231124878, + "learning_rate": 1.912137852679417e-05, + "loss": 0.1344, + "step": 2320 + }, + { + "epoch": 0.22060215868206778, + "grad_norm": 0.2771424949169159, + "learning_rate": 1.911759136527173e-05, + "loss": 0.1188, + "step": 2330 + }, + { + "epoch": 0.22154894906267752, + "grad_norm": 0.29070237278938293, + "learning_rate": 1.911380420374929e-05, + "loss": 0.1218, + "step": 2340 + }, + { + "epoch": 0.22249573944328727, + "grad_norm": 0.2981018126010895, + "learning_rate": 1.911001704222685e-05, + "loss": 0.1361, + "step": 2350 + }, + { + "epoch": 0.22344252982389698, + "grad_norm": 0.3472117781639099, + "learning_rate": 1.9106229880704414e-05, + "loss": 0.1211, + "step": 2360 + }, + { + "epoch": 0.22438932020450672, + "grad_norm": 0.271899551153183, + "learning_rate": 1.9102442719181974e-05, + "loss": 0.133, + "step": 2370 + }, + { + "epoch": 0.22533611058511646, + "grad_norm": 0.34610268473625183, + "learning_rate": 1.9098655557659534e-05, + "loss": 0.1273, + "step": 2380 + }, + { + "epoch": 0.22628290096572617, + "grad_norm": 0.2797755002975464, + "learning_rate": 1.9094868396137097e-05, + "loss": 0.1243, + "step": 2390 + }, + { + "epoch": 0.22722969134633592, + "grad_norm": 0.32911232113838196, + "learning_rate": 1.9091081234614657e-05, + "loss": 0.1184, + "step": 2400 + }, + { + "epoch": 0.22817648172694566, + "grad_norm": 0.27326205372810364, + "learning_rate": 1.908729407309222e-05, + "loss": 0.1177, + "step": 2410 + }, + { + "epoch": 0.2291232721075554, + "grad_norm": 0.2909172475337982, + "learning_rate": 1.908350691156978e-05, + "loss": 0.1177, + "step": 2420 + }, + { + "epoch": 0.2300700624881651, + "grad_norm": 0.2706824839115143, + "learning_rate": 1.907971975004734e-05, + "loss": 0.1202, + "step": 2430 + }, + { + "epoch": 0.23101685286877485, + "grad_norm": 0.31379246711730957, + "learning_rate": 1.90759325885249e-05, + "loss": 0.1164, + "step": 2440 + }, + { + "epoch": 0.2319636432493846, + "grad_norm": 0.33181753754615784, + "learning_rate": 1.9072145427002464e-05, + "loss": 0.1225, + "step": 2450 + }, + { + "epoch": 0.2329104336299943, + "grad_norm": 0.23471438884735107, + "learning_rate": 1.9068358265480025e-05, + "loss": 0.1273, + "step": 2460 + }, + { + "epoch": 0.23385722401060405, + "grad_norm": 0.22987720370292664, + "learning_rate": 1.9064571103957585e-05, + "loss": 0.1291, + "step": 2470 + }, + { + "epoch": 0.2348040143912138, + "grad_norm": 0.26784586906433105, + "learning_rate": 1.9060783942435145e-05, + "loss": 0.1334, + "step": 2480 + }, + { + "epoch": 0.23575080477182353, + "grad_norm": 0.3212006390094757, + "learning_rate": 1.9056996780912708e-05, + "loss": 0.1257, + "step": 2490 + }, + { + "epoch": 0.23669759515243324, + "grad_norm": 0.2957512140274048, + "learning_rate": 1.9053209619390268e-05, + "loss": 0.137, + "step": 2500 + }, + { + "epoch": 0.23764438553304298, + "grad_norm": 0.2556230425834656, + "learning_rate": 1.904942245786783e-05, + "loss": 0.1283, + "step": 2510 + }, + { + "epoch": 0.23859117591365273, + "grad_norm": 0.3201919198036194, + "learning_rate": 1.904563529634539e-05, + "loss": 0.1261, + "step": 2520 + }, + { + "epoch": 0.23953796629426244, + "grad_norm": 0.2649897336959839, + "learning_rate": 1.904184813482295e-05, + "loss": 0.1234, + "step": 2530 + }, + { + "epoch": 0.24048475667487218, + "grad_norm": 0.2947181165218353, + "learning_rate": 1.9038060973300512e-05, + "loss": 0.1229, + "step": 2540 + }, + { + "epoch": 0.24143154705548192, + "grad_norm": 0.2702978253364563, + "learning_rate": 1.9034273811778075e-05, + "loss": 0.13, + "step": 2550 + }, + { + "epoch": 0.24237833743609166, + "grad_norm": 0.2793761193752289, + "learning_rate": 1.9030486650255635e-05, + "loss": 0.1213, + "step": 2560 + }, + { + "epoch": 0.24332512781670138, + "grad_norm": 0.36399927735328674, + "learning_rate": 1.9026699488733195e-05, + "loss": 0.1341, + "step": 2570 + }, + { + "epoch": 0.24427191819731112, + "grad_norm": 0.2572638690471649, + "learning_rate": 1.9022912327210755e-05, + "loss": 0.1205, + "step": 2580 + }, + { + "epoch": 0.24521870857792086, + "grad_norm": 0.5421980023384094, + "learning_rate": 1.901912516568832e-05, + "loss": 0.1418, + "step": 2590 + }, + { + "epoch": 0.24616549895853057, + "grad_norm": 0.3563353419303894, + "learning_rate": 1.901533800416588e-05, + "loss": 0.1109, + "step": 2600 + }, + { + "epoch": 0.2471122893391403, + "grad_norm": 0.2500777244567871, + "learning_rate": 1.901155084264344e-05, + "loss": 0.1221, + "step": 2610 + }, + { + "epoch": 0.24805907971975005, + "grad_norm": 0.2641560137271881, + "learning_rate": 1.9007763681121002e-05, + "loss": 0.1204, + "step": 2620 + }, + { + "epoch": 0.2490058701003598, + "grad_norm": 0.2920982837677002, + "learning_rate": 1.9003976519598562e-05, + "loss": 0.1281, + "step": 2630 + }, + { + "epoch": 0.2499526604809695, + "grad_norm": 0.31924280524253845, + "learning_rate": 1.9000189358076126e-05, + "loss": 0.1245, + "step": 2640 + }, + { + "epoch": 0.2508994508615792, + "grad_norm": 0.30418962240219116, + "learning_rate": 1.8996402196553686e-05, + "loss": 0.1322, + "step": 2650 + }, + { + "epoch": 0.25184624124218896, + "grad_norm": 0.37784865498542786, + "learning_rate": 1.8992615035031246e-05, + "loss": 0.1239, + "step": 2660 + }, + { + "epoch": 0.2527930316227987, + "grad_norm": 0.28060677647590637, + "learning_rate": 1.8988827873508806e-05, + "loss": 0.1282, + "step": 2670 + }, + { + "epoch": 0.25373982200340844, + "grad_norm": 0.3838202953338623, + "learning_rate": 1.8985040711986366e-05, + "loss": 0.1369, + "step": 2680 + }, + { + "epoch": 0.2546866123840182, + "grad_norm": 0.3191165626049042, + "learning_rate": 1.898125355046393e-05, + "loss": 0.1177, + "step": 2690 + }, + { + "epoch": 0.2556334027646279, + "grad_norm": 0.28924256563186646, + "learning_rate": 1.897746638894149e-05, + "loss": 0.1258, + "step": 2700 + }, + { + "epoch": 0.25658019314523767, + "grad_norm": 0.33843380212783813, + "learning_rate": 1.897367922741905e-05, + "loss": 0.1202, + "step": 2710 + }, + { + "epoch": 0.25752698352584735, + "grad_norm": 0.313869446516037, + "learning_rate": 1.8969892065896613e-05, + "loss": 0.1167, + "step": 2720 + }, + { + "epoch": 0.2584737739064571, + "grad_norm": 0.3136950433254242, + "learning_rate": 1.8966104904374173e-05, + "loss": 0.1229, + "step": 2730 + }, + { + "epoch": 0.25942056428706683, + "grad_norm": 0.3564135730266571, + "learning_rate": 1.8962317742851733e-05, + "loss": 0.1269, + "step": 2740 + }, + { + "epoch": 0.2603673546676766, + "grad_norm": 0.28279656171798706, + "learning_rate": 1.8958530581329296e-05, + "loss": 0.1223, + "step": 2750 + }, + { + "epoch": 0.2613141450482863, + "grad_norm": 0.38073378801345825, + "learning_rate": 1.8954743419806857e-05, + "loss": 0.1264, + "step": 2760 + }, + { + "epoch": 0.26226093542889606, + "grad_norm": 0.3409043252468109, + "learning_rate": 1.8950956258284417e-05, + "loss": 0.1241, + "step": 2770 + }, + { + "epoch": 0.2632077258095058, + "grad_norm": 0.3403298854827881, + "learning_rate": 1.894716909676198e-05, + "loss": 0.1297, + "step": 2780 + }, + { + "epoch": 0.2641545161901155, + "grad_norm": 0.29332926869392395, + "learning_rate": 1.894338193523954e-05, + "loss": 0.127, + "step": 2790 + }, + { + "epoch": 0.2651013065707252, + "grad_norm": 0.26315629482269287, + "learning_rate": 1.89395947737171e-05, + "loss": 0.1265, + "step": 2800 + }, + { + "epoch": 0.26604809695133497, + "grad_norm": 0.32936838269233704, + "learning_rate": 1.893580761219466e-05, + "loss": 0.123, + "step": 2810 + }, + { + "epoch": 0.2669948873319447, + "grad_norm": 0.2917657196521759, + "learning_rate": 1.8932020450672224e-05, + "loss": 0.1061, + "step": 2820 + }, + { + "epoch": 0.26794167771255445, + "grad_norm": 0.2700554132461548, + "learning_rate": 1.8928233289149784e-05, + "loss": 0.1099, + "step": 2830 + }, + { + "epoch": 0.2688884680931642, + "grad_norm": 0.29582569003105164, + "learning_rate": 1.8924446127627344e-05, + "loss": 0.1299, + "step": 2840 + }, + { + "epoch": 0.26983525847377393, + "grad_norm": 0.28645405173301697, + "learning_rate": 1.8920658966104907e-05, + "loss": 0.127, + "step": 2850 + }, + { + "epoch": 0.2707820488543836, + "grad_norm": 0.5851916670799255, + "learning_rate": 1.8916871804582467e-05, + "loss": 0.1162, + "step": 2860 + }, + { + "epoch": 0.27172883923499336, + "grad_norm": 0.35853055119514465, + "learning_rate": 1.891308464306003e-05, + "loss": 0.1322, + "step": 2870 + }, + { + "epoch": 0.2726756296156031, + "grad_norm": 0.3862152695655823, + "learning_rate": 1.890929748153759e-05, + "loss": 0.126, + "step": 2880 + }, + { + "epoch": 0.27362241999621284, + "grad_norm": 0.27488529682159424, + "learning_rate": 1.890551032001515e-05, + "loss": 0.1203, + "step": 2890 + }, + { + "epoch": 0.2745692103768226, + "grad_norm": 0.3433137834072113, + "learning_rate": 1.890172315849271e-05, + "loss": 0.1208, + "step": 2900 + }, + { + "epoch": 0.2755160007574323, + "grad_norm": 0.38652151823043823, + "learning_rate": 1.889793599697027e-05, + "loss": 0.1227, + "step": 2910 + }, + { + "epoch": 0.27646279113804206, + "grad_norm": 0.31607943773269653, + "learning_rate": 1.8894148835447834e-05, + "loss": 0.1151, + "step": 2920 + }, + { + "epoch": 0.27740958151865175, + "grad_norm": 0.319286584854126, + "learning_rate": 1.8890361673925394e-05, + "loss": 0.1263, + "step": 2930 + }, + { + "epoch": 0.2783563718992615, + "grad_norm": 0.27554312348365784, + "learning_rate": 1.8886574512402954e-05, + "loss": 0.113, + "step": 2940 + }, + { + "epoch": 0.27930316227987123, + "grad_norm": 0.3367822766304016, + "learning_rate": 1.8882787350880514e-05, + "loss": 0.1137, + "step": 2950 + }, + { + "epoch": 0.28024995266048097, + "grad_norm": 0.3277094066143036, + "learning_rate": 1.8879000189358078e-05, + "loss": 0.1163, + "step": 2960 + }, + { + "epoch": 0.2811967430410907, + "grad_norm": 0.39818716049194336, + "learning_rate": 1.8875213027835638e-05, + "loss": 0.1318, + "step": 2970 + }, + { + "epoch": 0.28214353342170045, + "grad_norm": 0.2829638421535492, + "learning_rate": 1.88714258663132e-05, + "loss": 0.1228, + "step": 2980 + }, + { + "epoch": 0.2830903238023102, + "grad_norm": 0.2840532958507538, + "learning_rate": 1.886763870479076e-05, + "loss": 0.1181, + "step": 2990 + }, + { + "epoch": 0.2840371141829199, + "grad_norm": 0.4064909815788269, + "learning_rate": 1.886385154326832e-05, + "loss": 0.1163, + "step": 3000 + }, + { + "epoch": 0.2849839045635296, + "grad_norm": 0.373701274394989, + "learning_rate": 1.8860064381745885e-05, + "loss": 0.1209, + "step": 3010 + }, + { + "epoch": 0.28593069494413936, + "grad_norm": 0.26867496967315674, + "learning_rate": 1.8856277220223445e-05, + "loss": 0.1264, + "step": 3020 + }, + { + "epoch": 0.2868774853247491, + "grad_norm": 0.3012067675590515, + "learning_rate": 1.8852490058701005e-05, + "loss": 0.1161, + "step": 3030 + }, + { + "epoch": 0.28782427570535885, + "grad_norm": 0.26876571774482727, + "learning_rate": 1.8848702897178565e-05, + "loss": 0.1156, + "step": 3040 + }, + { + "epoch": 0.2887710660859686, + "grad_norm": 0.2857358455657959, + "learning_rate": 1.8844915735656125e-05, + "loss": 0.1286, + "step": 3050 + }, + { + "epoch": 0.28971785646657827, + "grad_norm": 0.2931133210659027, + "learning_rate": 1.884112857413369e-05, + "loss": 0.1203, + "step": 3060 + }, + { + "epoch": 0.290664646847188, + "grad_norm": 0.2736462950706482, + "learning_rate": 1.883734141261125e-05, + "loss": 0.1189, + "step": 3070 + }, + { + "epoch": 0.29161143722779775, + "grad_norm": 0.32691168785095215, + "learning_rate": 1.8833554251088812e-05, + "loss": 0.1231, + "step": 3080 + }, + { + "epoch": 0.2925582276084075, + "grad_norm": 0.3366908133029938, + "learning_rate": 1.8829767089566372e-05, + "loss": 0.1254, + "step": 3090 + }, + { + "epoch": 0.29350501798901724, + "grad_norm": 0.26143720746040344, + "learning_rate": 1.8825979928043932e-05, + "loss": 0.1268, + "step": 3100 + }, + { + "epoch": 0.294451808369627, + "grad_norm": 0.3320617079734802, + "learning_rate": 1.8822192766521496e-05, + "loss": 0.1234, + "step": 3110 + }, + { + "epoch": 0.2953985987502367, + "grad_norm": 0.28845691680908203, + "learning_rate": 1.8818405604999056e-05, + "loss": 0.1201, + "step": 3120 + }, + { + "epoch": 0.2963453891308464, + "grad_norm": 0.31063926219940186, + "learning_rate": 1.8814618443476616e-05, + "loss": 0.1173, + "step": 3130 + }, + { + "epoch": 0.29729217951145614, + "grad_norm": 0.24950078129768372, + "learning_rate": 1.8810831281954176e-05, + "loss": 0.1283, + "step": 3140 + }, + { + "epoch": 0.2982389698920659, + "grad_norm": 0.2675219178199768, + "learning_rate": 1.880704412043174e-05, + "loss": 0.1175, + "step": 3150 + }, + { + "epoch": 0.2991857602726756, + "grad_norm": 0.24839593470096588, + "learning_rate": 1.88032569589093e-05, + "loss": 0.1228, + "step": 3160 + }, + { + "epoch": 0.30013255065328537, + "grad_norm": 0.5048424005508423, + "learning_rate": 1.879946979738686e-05, + "loss": 0.1129, + "step": 3170 + }, + { + "epoch": 0.3010793410338951, + "grad_norm": 0.34761127829551697, + "learning_rate": 1.879568263586442e-05, + "loss": 0.1219, + "step": 3180 + }, + { + "epoch": 0.30202613141450485, + "grad_norm": 0.3296762704849243, + "learning_rate": 1.8791895474341983e-05, + "loss": 0.131, + "step": 3190 + }, + { + "epoch": 0.30297292179511454, + "grad_norm": 0.3156794011592865, + "learning_rate": 1.8788108312819543e-05, + "loss": 0.1151, + "step": 3200 + }, + { + "epoch": 0.3039197121757243, + "grad_norm": 0.29802405834198, + "learning_rate": 1.8784321151297106e-05, + "loss": 0.1267, + "step": 3210 + }, + { + "epoch": 0.304866502556334, + "grad_norm": 0.285553902387619, + "learning_rate": 1.8780533989774666e-05, + "loss": 0.1202, + "step": 3220 + }, + { + "epoch": 0.30581329293694376, + "grad_norm": 0.29214930534362793, + "learning_rate": 1.8776746828252226e-05, + "loss": 0.1183, + "step": 3230 + }, + { + "epoch": 0.3067600833175535, + "grad_norm": 0.3107370138168335, + "learning_rate": 1.877295966672979e-05, + "loss": 0.1187, + "step": 3240 + }, + { + "epoch": 0.30770687369816324, + "grad_norm": 0.3791562020778656, + "learning_rate": 1.876917250520735e-05, + "loss": 0.119, + "step": 3250 + }, + { + "epoch": 0.308653664078773, + "grad_norm": 0.3186882734298706, + "learning_rate": 1.876538534368491e-05, + "loss": 0.1164, + "step": 3260 + }, + { + "epoch": 0.30960045445938267, + "grad_norm": 0.3271386921405792, + "learning_rate": 1.876159818216247e-05, + "loss": 0.1066, + "step": 3270 + }, + { + "epoch": 0.3105472448399924, + "grad_norm": 0.27886903285980225, + "learning_rate": 1.875781102064003e-05, + "loss": 0.1237, + "step": 3280 + }, + { + "epoch": 0.31149403522060215, + "grad_norm": 0.2511722147464752, + "learning_rate": 1.8754023859117593e-05, + "loss": 0.128, + "step": 3290 + }, + { + "epoch": 0.3124408256012119, + "grad_norm": 0.3614468276500702, + "learning_rate": 1.8750236697595153e-05, + "loss": 0.1192, + "step": 3300 + }, + { + "epoch": 0.31338761598182163, + "grad_norm": 0.3021714687347412, + "learning_rate": 1.8746449536072713e-05, + "loss": 0.1235, + "step": 3310 + }, + { + "epoch": 0.3143344063624314, + "grad_norm": 0.31286343932151794, + "learning_rate": 1.8742662374550277e-05, + "loss": 0.1263, + "step": 3320 + }, + { + "epoch": 0.3152811967430411, + "grad_norm": 0.31820154190063477, + "learning_rate": 1.8738875213027837e-05, + "loss": 0.1271, + "step": 3330 + }, + { + "epoch": 0.3162279871236508, + "grad_norm": 0.3829352557659149, + "learning_rate": 1.87350880515054e-05, + "loss": 0.1096, + "step": 3340 + }, + { + "epoch": 0.31717477750426054, + "grad_norm": 0.24418611824512482, + "learning_rate": 1.873130088998296e-05, + "loss": 0.1299, + "step": 3350 + }, + { + "epoch": 0.3181215678848703, + "grad_norm": 0.25075456500053406, + "learning_rate": 1.872751372846052e-05, + "loss": 0.111, + "step": 3360 + }, + { + "epoch": 0.31906835826548, + "grad_norm": 0.41566210985183716, + "learning_rate": 1.872372656693808e-05, + "loss": 0.1201, + "step": 3370 + }, + { + "epoch": 0.32001514864608976, + "grad_norm": 0.32048362493515015, + "learning_rate": 1.8719939405415644e-05, + "loss": 0.1253, + "step": 3380 + }, + { + "epoch": 0.3209619390266995, + "grad_norm": 0.3619791567325592, + "learning_rate": 1.8716152243893204e-05, + "loss": 0.126, + "step": 3390 + }, + { + "epoch": 0.32190872940730925, + "grad_norm": 0.3619197905063629, + "learning_rate": 1.8712365082370764e-05, + "loss": 0.1155, + "step": 3400 + }, + { + "epoch": 0.32285551978791893, + "grad_norm": 0.29323554039001465, + "learning_rate": 1.8708577920848324e-05, + "loss": 0.1118, + "step": 3410 + }, + { + "epoch": 0.3238023101685287, + "grad_norm": 0.3184221386909485, + "learning_rate": 1.8704790759325888e-05, + "loss": 0.1265, + "step": 3420 + }, + { + "epoch": 0.3247491005491384, + "grad_norm": 0.33619818091392517, + "learning_rate": 1.8701003597803448e-05, + "loss": 0.1219, + "step": 3430 + }, + { + "epoch": 0.32569589092974816, + "grad_norm": 0.3194987177848816, + "learning_rate": 1.869721643628101e-05, + "loss": 0.1231, + "step": 3440 + }, + { + "epoch": 0.3266426813103579, + "grad_norm": 0.239166721701622, + "learning_rate": 1.869342927475857e-05, + "loss": 0.1021, + "step": 3450 + }, + { + "epoch": 0.32758947169096764, + "grad_norm": 0.315458208322525, + "learning_rate": 1.868964211323613e-05, + "loss": 0.1284, + "step": 3460 + }, + { + "epoch": 0.3285362620715774, + "grad_norm": 0.3272011876106262, + "learning_rate": 1.868585495171369e-05, + "loss": 0.1158, + "step": 3470 + }, + { + "epoch": 0.32948305245218706, + "grad_norm": 0.38707324862480164, + "learning_rate": 1.8682067790191255e-05, + "loss": 0.1166, + "step": 3480 + }, + { + "epoch": 0.3304298428327968, + "grad_norm": 0.3211500644683838, + "learning_rate": 1.8678280628668815e-05, + "loss": 0.129, + "step": 3490 + }, + { + "epoch": 0.33137663321340655, + "grad_norm": 0.35265690088272095, + "learning_rate": 1.8674493467146375e-05, + "loss": 0.1185, + "step": 3500 + }, + { + "epoch": 0.3323234235940163, + "grad_norm": 0.27759236097335815, + "learning_rate": 1.8670706305623935e-05, + "loss": 0.1313, + "step": 3510 + }, + { + "epoch": 0.33327021397462603, + "grad_norm": 0.3936869502067566, + "learning_rate": 1.8666919144101498e-05, + "loss": 0.1238, + "step": 3520 + }, + { + "epoch": 0.33421700435523577, + "grad_norm": 0.2950145900249481, + "learning_rate": 1.8663131982579058e-05, + "loss": 0.1189, + "step": 3530 + }, + { + "epoch": 0.3351637947358455, + "grad_norm": 0.46038469672203064, + "learning_rate": 1.8659344821056618e-05, + "loss": 0.1254, + "step": 3540 + }, + { + "epoch": 0.3361105851164552, + "grad_norm": 0.3949941098690033, + "learning_rate": 1.8655557659534182e-05, + "loss": 0.1166, + "step": 3550 + }, + { + "epoch": 0.33705737549706494, + "grad_norm": 0.218438521027565, + "learning_rate": 1.8651770498011742e-05, + "loss": 0.1096, + "step": 3560 + }, + { + "epoch": 0.3380041658776747, + "grad_norm": 0.34671449661254883, + "learning_rate": 1.8647983336489305e-05, + "loss": 0.1163, + "step": 3570 + }, + { + "epoch": 0.3389509562582844, + "grad_norm": 0.29920464754104614, + "learning_rate": 1.8644196174966865e-05, + "loss": 0.1261, + "step": 3580 + }, + { + "epoch": 0.33989774663889416, + "grad_norm": 0.2960605025291443, + "learning_rate": 1.8640409013444425e-05, + "loss": 0.1181, + "step": 3590 + }, + { + "epoch": 0.3408445370195039, + "grad_norm": 0.38953059911727905, + "learning_rate": 1.8636621851921985e-05, + "loss": 0.1218, + "step": 3600 + }, + { + "epoch": 0.34179132740011364, + "grad_norm": 0.2678033709526062, + "learning_rate": 1.8632834690399545e-05, + "loss": 0.114, + "step": 3610 + }, + { + "epoch": 0.34273811778072333, + "grad_norm": 0.3713693916797638, + "learning_rate": 1.862904752887711e-05, + "loss": 0.1207, + "step": 3620 + }, + { + "epoch": 0.34368490816133307, + "grad_norm": 0.43956953287124634, + "learning_rate": 1.862526036735467e-05, + "loss": 0.1235, + "step": 3630 + }, + { + "epoch": 0.3446316985419428, + "grad_norm": 0.30412259697914124, + "learning_rate": 1.862147320583223e-05, + "loss": 0.1205, + "step": 3640 + }, + { + "epoch": 0.34557848892255255, + "grad_norm": 0.3188311755657196, + "learning_rate": 1.861768604430979e-05, + "loss": 0.1262, + "step": 3650 + }, + { + "epoch": 0.3465252793031623, + "grad_norm": 0.3703117072582245, + "learning_rate": 1.8613898882787352e-05, + "loss": 0.1157, + "step": 3660 + }, + { + "epoch": 0.34747206968377203, + "grad_norm": 0.314065158367157, + "learning_rate": 1.8610111721264912e-05, + "loss": 0.1174, + "step": 3670 + }, + { + "epoch": 0.3484188600643817, + "grad_norm": 0.3832274377346039, + "learning_rate": 1.8606324559742476e-05, + "loss": 0.1245, + "step": 3680 + }, + { + "epoch": 0.34936565044499146, + "grad_norm": 0.3195415139198303, + "learning_rate": 1.8602537398220036e-05, + "loss": 0.1175, + "step": 3690 + }, + { + "epoch": 0.3503124408256012, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.8598750236697596e-05, + "loss": 0.1166, + "step": 3700 + }, + { + "epoch": 0.35125923120621094, + "grad_norm": 0.43748944997787476, + "learning_rate": 1.859496307517516e-05, + "loss": 0.1247, + "step": 3710 + }, + { + "epoch": 0.3522060215868207, + "grad_norm": 0.313702791929245, + "learning_rate": 1.859117591365272e-05, + "loss": 0.1267, + "step": 3720 + }, + { + "epoch": 0.3531528119674304, + "grad_norm": 0.37444302439689636, + "learning_rate": 1.858738875213028e-05, + "loss": 0.118, + "step": 3730 + }, + { + "epoch": 0.35409960234804017, + "grad_norm": 0.32261762022972107, + "learning_rate": 1.858360159060784e-05, + "loss": 0.1237, + "step": 3740 + }, + { + "epoch": 0.35504639272864985, + "grad_norm": 0.2619532346725464, + "learning_rate": 1.85798144290854e-05, + "loss": 0.1119, + "step": 3750 + }, + { + "epoch": 0.3559931831092596, + "grad_norm": 0.31362977623939514, + "learning_rate": 1.8576027267562963e-05, + "loss": 0.115, + "step": 3760 + }, + { + "epoch": 0.35693997348986933, + "grad_norm": 0.34366849064826965, + "learning_rate": 1.8572240106040523e-05, + "loss": 0.124, + "step": 3770 + }, + { + "epoch": 0.3578867638704791, + "grad_norm": 0.31962850689888, + "learning_rate": 1.8568452944518087e-05, + "loss": 0.1269, + "step": 3780 + }, + { + "epoch": 0.3588335542510888, + "grad_norm": 0.35521697998046875, + "learning_rate": 1.8564665782995647e-05, + "loss": 0.129, + "step": 3790 + }, + { + "epoch": 0.35978034463169856, + "grad_norm": 0.37336549162864685, + "learning_rate": 1.856087862147321e-05, + "loss": 0.1244, + "step": 3800 + }, + { + "epoch": 0.3607271350123083, + "grad_norm": 0.3989594578742981, + "learning_rate": 1.855709145995077e-05, + "loss": 0.1272, + "step": 3810 + }, + { + "epoch": 0.361673925392918, + "grad_norm": 0.2903200387954712, + "learning_rate": 1.855330429842833e-05, + "loss": 0.1198, + "step": 3820 + }, + { + "epoch": 0.3626207157735277, + "grad_norm": 0.2967371940612793, + "learning_rate": 1.854951713690589e-05, + "loss": 0.1171, + "step": 3830 + }, + { + "epoch": 0.36356750615413747, + "grad_norm": 0.33660125732421875, + "learning_rate": 1.854572997538345e-05, + "loss": 0.1186, + "step": 3840 + }, + { + "epoch": 0.3645142965347472, + "grad_norm": 0.27733585238456726, + "learning_rate": 1.8541942813861014e-05, + "loss": 0.1234, + "step": 3850 + }, + { + "epoch": 0.36546108691535695, + "grad_norm": 0.3161308169364929, + "learning_rate": 1.8538155652338574e-05, + "loss": 0.119, + "step": 3860 + }, + { + "epoch": 0.3664078772959667, + "grad_norm": 0.2673567533493042, + "learning_rate": 1.8534368490816134e-05, + "loss": 0.1249, + "step": 3870 + }, + { + "epoch": 0.36735466767657643, + "grad_norm": 0.2674126625061035, + "learning_rate": 1.8530581329293694e-05, + "loss": 0.1212, + "step": 3880 + }, + { + "epoch": 0.3683014580571861, + "grad_norm": 0.4858635663986206, + "learning_rate": 1.8526794167771257e-05, + "loss": 0.123, + "step": 3890 + }, + { + "epoch": 0.36924824843779586, + "grad_norm": 0.35940200090408325, + "learning_rate": 1.8523007006248817e-05, + "loss": 0.1192, + "step": 3900 + }, + { + "epoch": 0.3701950388184056, + "grad_norm": 0.3293863534927368, + "learning_rate": 1.851921984472638e-05, + "loss": 0.1125, + "step": 3910 + }, + { + "epoch": 0.37114182919901534, + "grad_norm": 0.44395145773887634, + "learning_rate": 1.851543268320394e-05, + "loss": 0.1143, + "step": 3920 + }, + { + "epoch": 0.3720886195796251, + "grad_norm": 0.317997545003891, + "learning_rate": 1.85116455216815e-05, + "loss": 0.1155, + "step": 3930 + }, + { + "epoch": 0.3730354099602348, + "grad_norm": 0.3860480785369873, + "learning_rate": 1.8507858360159064e-05, + "loss": 0.1309, + "step": 3940 + }, + { + "epoch": 0.37398220034084456, + "grad_norm": 0.2992136478424072, + "learning_rate": 1.8504071198636624e-05, + "loss": 0.1177, + "step": 3950 + }, + { + "epoch": 0.37492899072145425, + "grad_norm": 0.2919655442237854, + "learning_rate": 1.8500284037114184e-05, + "loss": 0.1184, + "step": 3960 + }, + { + "epoch": 0.375875781102064, + "grad_norm": 0.3052353262901306, + "learning_rate": 1.8496496875591744e-05, + "loss": 0.1317, + "step": 3970 + }, + { + "epoch": 0.37682257148267373, + "grad_norm": 0.3412197530269623, + "learning_rate": 1.8492709714069305e-05, + "loss": 0.1256, + "step": 3980 + }, + { + "epoch": 0.37776936186328347, + "grad_norm": 0.31510818004608154, + "learning_rate": 1.8488922552546868e-05, + "loss": 0.1226, + "step": 3990 + }, + { + "epoch": 0.3787161522438932, + "grad_norm": 0.3692219853401184, + "learning_rate": 1.8485135391024428e-05, + "loss": 0.13, + "step": 4000 + }, + { + "epoch": 0.37966294262450295, + "grad_norm": 0.3020903766155243, + "learning_rate": 1.8481348229501988e-05, + "loss": 0.1188, + "step": 4010 + }, + { + "epoch": 0.3806097330051127, + "grad_norm": 0.36188632249832153, + "learning_rate": 1.847756106797955e-05, + "loss": 0.1203, + "step": 4020 + }, + { + "epoch": 0.3815565233857224, + "grad_norm": 0.3624081313610077, + "learning_rate": 1.847377390645711e-05, + "loss": 0.1289, + "step": 4030 + }, + { + "epoch": 0.3825033137663321, + "grad_norm": 0.2747180461883545, + "learning_rate": 1.8469986744934675e-05, + "loss": 0.1174, + "step": 4040 + }, + { + "epoch": 0.38345010414694186, + "grad_norm": 0.3396485149860382, + "learning_rate": 1.8466199583412235e-05, + "loss": 0.119, + "step": 4050 + }, + { + "epoch": 0.3843968945275516, + "grad_norm": 0.3224307894706726, + "learning_rate": 1.8462412421889795e-05, + "loss": 0.1141, + "step": 4060 + }, + { + "epoch": 0.38534368490816134, + "grad_norm": 0.30942174792289734, + "learning_rate": 1.8458625260367355e-05, + "loss": 0.1165, + "step": 4070 + }, + { + "epoch": 0.3862904752887711, + "grad_norm": 0.2940969467163086, + "learning_rate": 1.845483809884492e-05, + "loss": 0.1162, + "step": 4080 + }, + { + "epoch": 0.3872372656693808, + "grad_norm": 0.35114723443984985, + "learning_rate": 1.845105093732248e-05, + "loss": 0.1306, + "step": 4090 + }, + { + "epoch": 0.3881840560499905, + "grad_norm": 0.42864593863487244, + "learning_rate": 1.844726377580004e-05, + "loss": 0.1226, + "step": 4100 + }, + { + "epoch": 0.38913084643060025, + "grad_norm": 0.3231365978717804, + "learning_rate": 1.84434766142776e-05, + "loss": 0.1194, + "step": 4110 + }, + { + "epoch": 0.39007763681121, + "grad_norm": 0.30363401770591736, + "learning_rate": 1.8439689452755162e-05, + "loss": 0.1075, + "step": 4120 + }, + { + "epoch": 0.39102442719181973, + "grad_norm": 0.39980143308639526, + "learning_rate": 1.8435902291232722e-05, + "loss": 0.1149, + "step": 4130 + }, + { + "epoch": 0.3919712175724295, + "grad_norm": 0.37222790718078613, + "learning_rate": 1.8432115129710286e-05, + "loss": 0.117, + "step": 4140 + }, + { + "epoch": 0.3929180079530392, + "grad_norm": 0.29298582673072815, + "learning_rate": 1.8428327968187846e-05, + "loss": 0.125, + "step": 4150 + }, + { + "epoch": 0.39386479833364896, + "grad_norm": 0.339240700006485, + "learning_rate": 1.8424540806665406e-05, + "loss": 0.1182, + "step": 4160 + }, + { + "epoch": 0.39481158871425864, + "grad_norm": 0.33925309777259827, + "learning_rate": 1.8420753645142966e-05, + "loss": 0.1227, + "step": 4170 + }, + { + "epoch": 0.3957583790948684, + "grad_norm": 0.2940181791782379, + "learning_rate": 1.841696648362053e-05, + "loss": 0.1207, + "step": 4180 + }, + { + "epoch": 0.3967051694754781, + "grad_norm": 0.27949029207229614, + "learning_rate": 1.841317932209809e-05, + "loss": 0.121, + "step": 4190 + }, + { + "epoch": 0.39765195985608787, + "grad_norm": 0.2940124571323395, + "learning_rate": 1.840939216057565e-05, + "loss": 0.1181, + "step": 4200 + }, + { + "epoch": 0.3985987502366976, + "grad_norm": 0.31642553210258484, + "learning_rate": 1.840560499905321e-05, + "loss": 0.125, + "step": 4210 + }, + { + "epoch": 0.39954554061730735, + "grad_norm": 0.23605939745903015, + "learning_rate": 1.8401817837530773e-05, + "loss": 0.1207, + "step": 4220 + }, + { + "epoch": 0.40049233099791703, + "grad_norm": 0.36963286995887756, + "learning_rate": 1.8398030676008333e-05, + "loss": 0.1217, + "step": 4230 + }, + { + "epoch": 0.4014391213785268, + "grad_norm": 0.254743754863739, + "learning_rate": 1.8394243514485893e-05, + "loss": 0.1243, + "step": 4240 + }, + { + "epoch": 0.4023859117591365, + "grad_norm": 0.326809287071228, + "learning_rate": 1.8390456352963456e-05, + "loss": 0.1242, + "step": 4250 + }, + { + "epoch": 0.40333270213974626, + "grad_norm": 0.3072037100791931, + "learning_rate": 1.8386669191441016e-05, + "loss": 0.1195, + "step": 4260 + }, + { + "epoch": 0.404279492520356, + "grad_norm": 0.37460359930992126, + "learning_rate": 1.838288202991858e-05, + "loss": 0.1114, + "step": 4270 + }, + { + "epoch": 0.40522628290096574, + "grad_norm": 0.24516154825687408, + "learning_rate": 1.837909486839614e-05, + "loss": 0.1088, + "step": 4280 + }, + { + "epoch": 0.4061730732815755, + "grad_norm": 0.3073451519012451, + "learning_rate": 1.83753077068737e-05, + "loss": 0.1094, + "step": 4290 + }, + { + "epoch": 0.40711986366218517, + "grad_norm": 0.3263719379901886, + "learning_rate": 1.837152054535126e-05, + "loss": 0.1217, + "step": 4300 + }, + { + "epoch": 0.4080666540427949, + "grad_norm": 0.3760577142238617, + "learning_rate": 1.836773338382882e-05, + "loss": 0.1236, + "step": 4310 + }, + { + "epoch": 0.40901344442340465, + "grad_norm": 0.30230146646499634, + "learning_rate": 1.8363946222306383e-05, + "loss": 0.1188, + "step": 4320 + }, + { + "epoch": 0.4099602348040144, + "grad_norm": 0.4337088465690613, + "learning_rate": 1.8360159060783944e-05, + "loss": 0.1251, + "step": 4330 + }, + { + "epoch": 0.41090702518462413, + "grad_norm": 0.35297223925590515, + "learning_rate": 1.8356371899261504e-05, + "loss": 0.1171, + "step": 4340 + }, + { + "epoch": 0.41185381556523387, + "grad_norm": 0.3118896186351776, + "learning_rate": 1.8352584737739067e-05, + "loss": 0.1216, + "step": 4350 + }, + { + "epoch": 0.4128006059458436, + "grad_norm": 0.47326090931892395, + "learning_rate": 1.8348797576216627e-05, + "loss": 0.1247, + "step": 4360 + }, + { + "epoch": 0.4137473963264533, + "grad_norm": 0.4229494035243988, + "learning_rate": 1.8345010414694187e-05, + "loss": 0.116, + "step": 4370 + }, + { + "epoch": 0.41469418670706304, + "grad_norm": 0.3024285137653351, + "learning_rate": 1.834122325317175e-05, + "loss": 0.1235, + "step": 4380 + }, + { + "epoch": 0.4156409770876728, + "grad_norm": 0.3133857548236847, + "learning_rate": 1.833743609164931e-05, + "loss": 0.1137, + "step": 4390 + }, + { + "epoch": 0.4165877674682825, + "grad_norm": 0.2643986940383911, + "learning_rate": 1.833364893012687e-05, + "loss": 0.1218, + "step": 4400 + }, + { + "epoch": 0.41753455784889226, + "grad_norm": 0.3025350272655487, + "learning_rate": 1.8329861768604434e-05, + "loss": 0.1174, + "step": 4410 + }, + { + "epoch": 0.418481348229502, + "grad_norm": 0.31136399507522583, + "learning_rate": 1.8326074607081994e-05, + "loss": 0.1197, + "step": 4420 + }, + { + "epoch": 0.41942813861011174, + "grad_norm": 0.3025195896625519, + "learning_rate": 1.8322287445559554e-05, + "loss": 0.1119, + "step": 4430 + }, + { + "epoch": 0.42037492899072143, + "grad_norm": 0.23095518350601196, + "learning_rate": 1.8318500284037114e-05, + "loss": 0.1224, + "step": 4440 + }, + { + "epoch": 0.42132171937133117, + "grad_norm": 0.45459040999412537, + "learning_rate": 1.8314713122514674e-05, + "loss": 0.1166, + "step": 4450 + }, + { + "epoch": 0.4222685097519409, + "grad_norm": 0.39385083317756653, + "learning_rate": 1.8310925960992238e-05, + "loss": 0.1238, + "step": 4460 + }, + { + "epoch": 0.42321530013255065, + "grad_norm": 0.2855985462665558, + "learning_rate": 1.8307138799469798e-05, + "loss": 0.1288, + "step": 4470 + }, + { + "epoch": 0.4241620905131604, + "grad_norm": 0.29849758744239807, + "learning_rate": 1.830335163794736e-05, + "loss": 0.1121, + "step": 4480 + }, + { + "epoch": 0.42510888089377014, + "grad_norm": 0.40433022379875183, + "learning_rate": 1.829956447642492e-05, + "loss": 0.1129, + "step": 4490 + }, + { + "epoch": 0.4260556712743799, + "grad_norm": 0.29155251383781433, + "learning_rate": 1.8295777314902485e-05, + "loss": 0.1194, + "step": 4500 + }, + { + "epoch": 0.42700246165498956, + "grad_norm": 0.2591720223426819, + "learning_rate": 1.8291990153380045e-05, + "loss": 0.115, + "step": 4510 + }, + { + "epoch": 0.4279492520355993, + "grad_norm": 0.3257406949996948, + "learning_rate": 1.8288202991857605e-05, + "loss": 0.1137, + "step": 4520 + }, + { + "epoch": 0.42889604241620904, + "grad_norm": 0.3112312853336334, + "learning_rate": 1.8284415830335165e-05, + "loss": 0.1228, + "step": 4530 + }, + { + "epoch": 0.4298428327968188, + "grad_norm": 0.2938838601112366, + "learning_rate": 1.8280628668812725e-05, + "loss": 0.1111, + "step": 4540 + }, + { + "epoch": 0.4307896231774285, + "grad_norm": 0.3180755376815796, + "learning_rate": 1.827684150729029e-05, + "loss": 0.1355, + "step": 4550 + }, + { + "epoch": 0.43173641355803827, + "grad_norm": 0.39242005348205566, + "learning_rate": 1.827305434576785e-05, + "loss": 0.12, + "step": 4560 + }, + { + "epoch": 0.432683203938648, + "grad_norm": 0.3311753273010254, + "learning_rate": 1.826926718424541e-05, + "loss": 0.1233, + "step": 4570 + }, + { + "epoch": 0.4336299943192577, + "grad_norm": 0.26584166288375854, + "learning_rate": 1.826548002272297e-05, + "loss": 0.1149, + "step": 4580 + }, + { + "epoch": 0.43457678469986744, + "grad_norm": 0.282243549823761, + "learning_rate": 1.8261692861200532e-05, + "loss": 0.1295, + "step": 4590 + }, + { + "epoch": 0.4355235750804772, + "grad_norm": 0.4552994668483734, + "learning_rate": 1.8257905699678092e-05, + "loss": 0.1123, + "step": 4600 + }, + { + "epoch": 0.4364703654610869, + "grad_norm": 0.2721503674983978, + "learning_rate": 1.8254118538155655e-05, + "loss": 0.117, + "step": 4610 + }, + { + "epoch": 0.43741715584169666, + "grad_norm": 0.30223289132118225, + "learning_rate": 1.8250331376633215e-05, + "loss": 0.1135, + "step": 4620 + }, + { + "epoch": 0.4383639462223064, + "grad_norm": 0.3740018904209137, + "learning_rate": 1.8246544215110776e-05, + "loss": 0.1199, + "step": 4630 + }, + { + "epoch": 0.43931073660291614, + "grad_norm": 0.25978952646255493, + "learning_rate": 1.824275705358834e-05, + "loss": 0.1128, + "step": 4640 + }, + { + "epoch": 0.4402575269835258, + "grad_norm": 0.2803758680820465, + "learning_rate": 1.82389698920659e-05, + "loss": 0.1174, + "step": 4650 + }, + { + "epoch": 0.44120431736413557, + "grad_norm": 0.2670730948448181, + "learning_rate": 1.823518273054346e-05, + "loss": 0.1126, + "step": 4660 + }, + { + "epoch": 0.4421511077447453, + "grad_norm": 0.22530563175678253, + "learning_rate": 1.823139556902102e-05, + "loss": 0.1259, + "step": 4670 + }, + { + "epoch": 0.44309789812535505, + "grad_norm": 0.3665080964565277, + "learning_rate": 1.822760840749858e-05, + "loss": 0.1191, + "step": 4680 + }, + { + "epoch": 0.4440446885059648, + "grad_norm": 0.2968369722366333, + "learning_rate": 1.8223821245976143e-05, + "loss": 0.1222, + "step": 4690 + }, + { + "epoch": 0.44499147888657453, + "grad_norm": 0.38494449853897095, + "learning_rate": 1.8220034084453703e-05, + "loss": 0.1204, + "step": 4700 + }, + { + "epoch": 0.4459382692671843, + "grad_norm": 0.3491016924381256, + "learning_rate": 1.8216246922931266e-05, + "loss": 0.1261, + "step": 4710 + }, + { + "epoch": 0.44688505964779396, + "grad_norm": 0.5060954689979553, + "learning_rate": 1.8212459761408826e-05, + "loss": 0.125, + "step": 4720 + }, + { + "epoch": 0.4478318500284037, + "grad_norm": 0.36539798974990845, + "learning_rate": 1.8208672599886386e-05, + "loss": 0.1177, + "step": 4730 + }, + { + "epoch": 0.44877864040901344, + "grad_norm": 0.30213627219200134, + "learning_rate": 1.820488543836395e-05, + "loss": 0.1146, + "step": 4740 + }, + { + "epoch": 0.4497254307896232, + "grad_norm": 0.3161025047302246, + "learning_rate": 1.820109827684151e-05, + "loss": 0.1229, + "step": 4750 + }, + { + "epoch": 0.4506722211702329, + "grad_norm": 0.3496663272380829, + "learning_rate": 1.819731111531907e-05, + "loss": 0.1175, + "step": 4760 + }, + { + "epoch": 0.45161901155084266, + "grad_norm": 0.4112556576728821, + "learning_rate": 1.819352395379663e-05, + "loss": 0.1297, + "step": 4770 + }, + { + "epoch": 0.45256580193145235, + "grad_norm": 0.31184515357017517, + "learning_rate": 1.8189736792274193e-05, + "loss": 0.113, + "step": 4780 + }, + { + "epoch": 0.4535125923120621, + "grad_norm": 0.2756892144680023, + "learning_rate": 1.8185949630751753e-05, + "loss": 0.1189, + "step": 4790 + }, + { + "epoch": 0.45445938269267183, + "grad_norm": 0.38942059874534607, + "learning_rate": 1.8182162469229313e-05, + "loss": 0.1213, + "step": 4800 + }, + { + "epoch": 0.4554061730732816, + "grad_norm": 0.38971012830734253, + "learning_rate": 1.8178375307706873e-05, + "loss": 0.1186, + "step": 4810 + }, + { + "epoch": 0.4563529634538913, + "grad_norm": 0.36139774322509766, + "learning_rate": 1.8174588146184437e-05, + "loss": 0.1099, + "step": 4820 + }, + { + "epoch": 0.45729975383450105, + "grad_norm": 0.30894285440444946, + "learning_rate": 1.8170800984661997e-05, + "loss": 0.1125, + "step": 4830 + }, + { + "epoch": 0.4582465442151108, + "grad_norm": 0.39547091722488403, + "learning_rate": 1.816701382313956e-05, + "loss": 0.1159, + "step": 4840 + }, + { + "epoch": 0.4591933345957205, + "grad_norm": 0.35442787408828735, + "learning_rate": 1.816322666161712e-05, + "loss": 0.1199, + "step": 4850 + }, + { + "epoch": 0.4601401249763302, + "grad_norm": 0.31647929549217224, + "learning_rate": 1.815943950009468e-05, + "loss": 0.123, + "step": 4860 + }, + { + "epoch": 0.46108691535693996, + "grad_norm": 0.23882287740707397, + "learning_rate": 1.815565233857224e-05, + "loss": 0.1281, + "step": 4870 + }, + { + "epoch": 0.4620337057375497, + "grad_norm": 0.28302937746047974, + "learning_rate": 1.8151865177049804e-05, + "loss": 0.1178, + "step": 4880 + }, + { + "epoch": 0.46298049611815945, + "grad_norm": 0.30785325169563293, + "learning_rate": 1.8148078015527364e-05, + "loss": 0.115, + "step": 4890 + }, + { + "epoch": 0.4639272864987692, + "grad_norm": 0.291075199842453, + "learning_rate": 1.8144290854004924e-05, + "loss": 0.1119, + "step": 4900 + }, + { + "epoch": 0.46487407687937893, + "grad_norm": 0.3640170395374298, + "learning_rate": 1.8140503692482484e-05, + "loss": 0.1153, + "step": 4910 + }, + { + "epoch": 0.4658208672599886, + "grad_norm": 0.25291305780410767, + "learning_rate": 1.8136716530960047e-05, + "loss": 0.115, + "step": 4920 + }, + { + "epoch": 0.46676765764059835, + "grad_norm": 0.34131917357444763, + "learning_rate": 1.8132929369437607e-05, + "loss": 0.1164, + "step": 4930 + }, + { + "epoch": 0.4677144480212081, + "grad_norm": 0.40944379568099976, + "learning_rate": 1.8129142207915168e-05, + "loss": 0.1251, + "step": 4940 + }, + { + "epoch": 0.46866123840181784, + "grad_norm": 0.36214330792427063, + "learning_rate": 1.812535504639273e-05, + "loss": 0.1176, + "step": 4950 + }, + { + "epoch": 0.4696080287824276, + "grad_norm": 0.2422647923231125, + "learning_rate": 1.812156788487029e-05, + "loss": 0.1047, + "step": 4960 + }, + { + "epoch": 0.4705548191630373, + "grad_norm": 0.33104845881462097, + "learning_rate": 1.8117780723347854e-05, + "loss": 0.1178, + "step": 4970 + }, + { + "epoch": 0.47150160954364706, + "grad_norm": 0.30480530858039856, + "learning_rate": 1.8113993561825415e-05, + "loss": 0.1267, + "step": 4980 + }, + { + "epoch": 0.47244839992425675, + "grad_norm": 0.29709336161613464, + "learning_rate": 1.8110206400302975e-05, + "loss": 0.1172, + "step": 4990 + }, + { + "epoch": 0.4733951903048665, + "grad_norm": 0.3976782560348511, + "learning_rate": 1.8106419238780535e-05, + "loss": 0.1195, + "step": 5000 + }, + { + "epoch": 0.4743419806854762, + "grad_norm": 0.29639732837677, + "learning_rate": 1.8102632077258095e-05, + "loss": 0.12, + "step": 5010 + }, + { + "epoch": 0.47528877106608597, + "grad_norm": 0.31762635707855225, + "learning_rate": 1.8098844915735658e-05, + "loss": 0.1197, + "step": 5020 + }, + { + "epoch": 0.4762355614466957, + "grad_norm": 0.29064318537712097, + "learning_rate": 1.8095057754213218e-05, + "loss": 0.11, + "step": 5030 + }, + { + "epoch": 0.47718235182730545, + "grad_norm": 0.37592342495918274, + "learning_rate": 1.8091270592690778e-05, + "loss": 0.1122, + "step": 5040 + }, + { + "epoch": 0.4781291422079152, + "grad_norm": 0.3141964077949524, + "learning_rate": 1.808748343116834e-05, + "loss": 0.1202, + "step": 5050 + }, + { + "epoch": 0.4790759325885249, + "grad_norm": 0.4769488275051117, + "learning_rate": 1.80836962696459e-05, + "loss": 0.1191, + "step": 5060 + }, + { + "epoch": 0.4800227229691346, + "grad_norm": 0.30844876170158386, + "learning_rate": 1.8079909108123465e-05, + "loss": 0.113, + "step": 5070 + }, + { + "epoch": 0.48096951334974436, + "grad_norm": 0.3101666569709778, + "learning_rate": 1.8076121946601025e-05, + "loss": 0.1099, + "step": 5080 + }, + { + "epoch": 0.4819163037303541, + "grad_norm": 0.2982792854309082, + "learning_rate": 1.8072334785078585e-05, + "loss": 0.1315, + "step": 5090 + }, + { + "epoch": 0.48286309411096384, + "grad_norm": 0.33043378591537476, + "learning_rate": 1.8068547623556145e-05, + "loss": 0.1187, + "step": 5100 + }, + { + "epoch": 0.4838098844915736, + "grad_norm": 0.30517569184303284, + "learning_rate": 1.806476046203371e-05, + "loss": 0.108, + "step": 5110 + }, + { + "epoch": 0.4847566748721833, + "grad_norm": 0.393460750579834, + "learning_rate": 1.806097330051127e-05, + "loss": 0.1212, + "step": 5120 + }, + { + "epoch": 0.485703465252793, + "grad_norm": 0.2593437731266022, + "learning_rate": 1.805718613898883e-05, + "loss": 0.1095, + "step": 5130 + }, + { + "epoch": 0.48665025563340275, + "grad_norm": 0.33833277225494385, + "learning_rate": 1.805339897746639e-05, + "loss": 0.1239, + "step": 5140 + }, + { + "epoch": 0.4875970460140125, + "grad_norm": 0.2983340620994568, + "learning_rate": 1.804961181594395e-05, + "loss": 0.1141, + "step": 5150 + }, + { + "epoch": 0.48854383639462223, + "grad_norm": 0.286362886428833, + "learning_rate": 1.8045824654421512e-05, + "loss": 0.1089, + "step": 5160 + }, + { + "epoch": 0.489490626775232, + "grad_norm": 0.27127572894096375, + "learning_rate": 1.8042037492899072e-05, + "loss": 0.1031, + "step": 5170 + }, + { + "epoch": 0.4904374171558417, + "grad_norm": 0.28876861929893494, + "learning_rate": 1.8038250331376636e-05, + "loss": 0.1184, + "step": 5180 + }, + { + "epoch": 0.49138420753645146, + "grad_norm": 0.33738434314727783, + "learning_rate": 1.8034463169854196e-05, + "loss": 0.1157, + "step": 5190 + }, + { + "epoch": 0.49233099791706114, + "grad_norm": 0.40151458978652954, + "learning_rate": 1.803067600833176e-05, + "loss": 0.1124, + "step": 5200 + }, + { + "epoch": 0.4932777882976709, + "grad_norm": 0.30535009503364563, + "learning_rate": 1.802688884680932e-05, + "loss": 0.1254, + "step": 5210 + }, + { + "epoch": 0.4942245786782806, + "grad_norm": 0.29737281799316406, + "learning_rate": 1.802310168528688e-05, + "loss": 0.1118, + "step": 5220 + }, + { + "epoch": 0.49517136905889036, + "grad_norm": 0.27557647228240967, + "learning_rate": 1.801931452376444e-05, + "loss": 0.1042, + "step": 5230 + }, + { + "epoch": 0.4961181594395001, + "grad_norm": 0.4273216724395752, + "learning_rate": 1.8015527362242e-05, + "loss": 0.1263, + "step": 5240 + }, + { + "epoch": 0.49706494982010985, + "grad_norm": 0.3010781407356262, + "learning_rate": 1.8011740200719563e-05, + "loss": 0.1181, + "step": 5250 + }, + { + "epoch": 0.4980117402007196, + "grad_norm": 0.3851463496685028, + "learning_rate": 1.8007953039197123e-05, + "loss": 0.1186, + "step": 5260 + }, + { + "epoch": 0.4989585305813293, + "grad_norm": 0.2870292365550995, + "learning_rate": 1.8004165877674683e-05, + "loss": 0.1272, + "step": 5270 + }, + { + "epoch": 0.499905320961939, + "grad_norm": 0.3452846109867096, + "learning_rate": 1.8000378716152243e-05, + "loss": 0.1264, + "step": 5280 + }, + { + "epoch": 0.5008521113425488, + "grad_norm": 0.32496556639671326, + "learning_rate": 1.7996591554629807e-05, + "loss": 0.1183, + "step": 5290 + }, + { + "epoch": 0.5017989017231584, + "grad_norm": 0.29744213819503784, + "learning_rate": 1.7992804393107367e-05, + "loss": 0.1225, + "step": 5300 + }, + { + "epoch": 0.5027456921037682, + "grad_norm": 0.3163803815841675, + "learning_rate": 1.798901723158493e-05, + "loss": 0.1197, + "step": 5310 + }, + { + "epoch": 0.5036924824843779, + "grad_norm": 0.2961333096027374, + "learning_rate": 1.798523007006249e-05, + "loss": 0.1266, + "step": 5320 + }, + { + "epoch": 0.5046392728649877, + "grad_norm": 0.41541510820388794, + "learning_rate": 1.798144290854005e-05, + "loss": 0.1249, + "step": 5330 + }, + { + "epoch": 0.5055860632455974, + "grad_norm": 0.3027608096599579, + "learning_rate": 1.7977655747017614e-05, + "loss": 0.1138, + "step": 5340 + }, + { + "epoch": 0.5065328536262071, + "grad_norm": 0.34629517793655396, + "learning_rate": 1.7973868585495174e-05, + "loss": 0.1143, + "step": 5350 + }, + { + "epoch": 0.5074796440068169, + "grad_norm": 0.3278519809246063, + "learning_rate": 1.7970081423972734e-05, + "loss": 0.1258, + "step": 5360 + }, + { + "epoch": 0.5084264343874266, + "grad_norm": 0.39441338181495667, + "learning_rate": 1.7966294262450294e-05, + "loss": 0.1111, + "step": 5370 + }, + { + "epoch": 0.5093732247680364, + "grad_norm": 0.19750452041625977, + "learning_rate": 1.7962507100927854e-05, + "loss": 0.1132, + "step": 5380 + }, + { + "epoch": 0.5103200151486461, + "grad_norm": 0.32753902673721313, + "learning_rate": 1.7958719939405417e-05, + "loss": 0.1089, + "step": 5390 + }, + { + "epoch": 0.5112668055292559, + "grad_norm": 0.283546507358551, + "learning_rate": 1.7954932777882977e-05, + "loss": 0.1187, + "step": 5400 + }, + { + "epoch": 0.5122135959098656, + "grad_norm": 0.2674579322338104, + "learning_rate": 1.795114561636054e-05, + "loss": 0.1144, + "step": 5410 + }, + { + "epoch": 0.5131603862904753, + "grad_norm": 0.3309236168861389, + "learning_rate": 1.79473584548381e-05, + "loss": 0.1087, + "step": 5420 + }, + { + "epoch": 0.5141071766710851, + "grad_norm": 0.3469752371311188, + "learning_rate": 1.794357129331566e-05, + "loss": 0.1207, + "step": 5430 + }, + { + "epoch": 0.5150539670516947, + "grad_norm": 0.28428488969802856, + "learning_rate": 1.7939784131793224e-05, + "loss": 0.1221, + "step": 5440 + }, + { + "epoch": 0.5160007574323044, + "grad_norm": 0.32135307788848877, + "learning_rate": 1.7935996970270784e-05, + "loss": 0.1193, + "step": 5450 + }, + { + "epoch": 0.5169475478129142, + "grad_norm": 0.26351454854011536, + "learning_rate": 1.7932209808748344e-05, + "loss": 0.1218, + "step": 5460 + }, + { + "epoch": 0.5178943381935239, + "grad_norm": 0.2696927785873413, + "learning_rate": 1.7928422647225904e-05, + "loss": 0.1108, + "step": 5470 + }, + { + "epoch": 0.5188411285741337, + "grad_norm": 0.4349839985370636, + "learning_rate": 1.7924635485703468e-05, + "loss": 0.125, + "step": 5480 + }, + { + "epoch": 0.5197879189547434, + "grad_norm": 0.3225243091583252, + "learning_rate": 1.7920848324181028e-05, + "loss": 0.1191, + "step": 5490 + }, + { + "epoch": 0.5207347093353532, + "grad_norm": 0.3667551577091217, + "learning_rate": 1.7917061162658588e-05, + "loss": 0.112, + "step": 5500 + }, + { + "epoch": 0.5216814997159629, + "grad_norm": 0.3296608328819275, + "learning_rate": 1.7913274001136148e-05, + "loss": 0.1181, + "step": 5510 + }, + { + "epoch": 0.5226282900965726, + "grad_norm": 0.3578593134880066, + "learning_rate": 1.790948683961371e-05, + "loss": 0.1193, + "step": 5520 + }, + { + "epoch": 0.5235750804771824, + "grad_norm": 0.3956611752510071, + "learning_rate": 1.790569967809127e-05, + "loss": 0.1203, + "step": 5530 + }, + { + "epoch": 0.5245218708577921, + "grad_norm": 0.34705013036727905, + "learning_rate": 1.7901912516568835e-05, + "loss": 0.118, + "step": 5540 + }, + { + "epoch": 0.5254686612384019, + "grad_norm": 0.30799970030784607, + "learning_rate": 1.7898125355046395e-05, + "loss": 0.117, + "step": 5550 + }, + { + "epoch": 0.5264154516190116, + "grad_norm": 0.2643110156059265, + "learning_rate": 1.7894338193523955e-05, + "loss": 0.1139, + "step": 5560 + }, + { + "epoch": 0.5273622419996212, + "grad_norm": 0.2462998777627945, + "learning_rate": 1.789055103200152e-05, + "loss": 0.1148, + "step": 5570 + }, + { + "epoch": 0.528309032380231, + "grad_norm": 0.3135557472705841, + "learning_rate": 1.788676387047908e-05, + "loss": 0.1123, + "step": 5580 + }, + { + "epoch": 0.5292558227608407, + "grad_norm": 0.3650512397289276, + "learning_rate": 1.788297670895664e-05, + "loss": 0.1127, + "step": 5590 + }, + { + "epoch": 0.5302026131414505, + "grad_norm": 0.24632461369037628, + "learning_rate": 1.78791895474342e-05, + "loss": 0.1248, + "step": 5600 + }, + { + "epoch": 0.5311494035220602, + "grad_norm": 0.3508126139640808, + "learning_rate": 1.787540238591176e-05, + "loss": 0.1208, + "step": 5610 + }, + { + "epoch": 0.5320961939026699, + "grad_norm": 0.33299440145492554, + "learning_rate": 1.7871615224389322e-05, + "loss": 0.1119, + "step": 5620 + }, + { + "epoch": 0.5330429842832797, + "grad_norm": 0.3384663164615631, + "learning_rate": 1.7867828062866882e-05, + "loss": 0.1196, + "step": 5630 + }, + { + "epoch": 0.5339897746638894, + "grad_norm": 0.389332115650177, + "learning_rate": 1.7864040901344442e-05, + "loss": 0.1137, + "step": 5640 + }, + { + "epoch": 0.5349365650444992, + "grad_norm": 0.3215329349040985, + "learning_rate": 1.7860253739822006e-05, + "loss": 0.1155, + "step": 5650 + }, + { + "epoch": 0.5358833554251089, + "grad_norm": 0.32901594042778015, + "learning_rate": 1.7856466578299566e-05, + "loss": 0.1133, + "step": 5660 + }, + { + "epoch": 0.5368301458057186, + "grad_norm": 0.364347904920578, + "learning_rate": 1.785267941677713e-05, + "loss": 0.1272, + "step": 5670 + }, + { + "epoch": 0.5377769361863284, + "grad_norm": 0.29443469643592834, + "learning_rate": 1.784889225525469e-05, + "loss": 0.1136, + "step": 5680 + }, + { + "epoch": 0.5387237265669381, + "grad_norm": 0.3348941206932068, + "learning_rate": 1.784510509373225e-05, + "loss": 0.1164, + "step": 5690 + }, + { + "epoch": 0.5396705169475479, + "grad_norm": 0.26882919669151306, + "learning_rate": 1.784131793220981e-05, + "loss": 0.1211, + "step": 5700 + }, + { + "epoch": 0.5406173073281575, + "grad_norm": 0.22887469828128815, + "learning_rate": 1.7837530770687373e-05, + "loss": 0.1103, + "step": 5710 + }, + { + "epoch": 0.5415640977087672, + "grad_norm": 0.40600207448005676, + "learning_rate": 1.7833743609164933e-05, + "loss": 0.113, + "step": 5720 + }, + { + "epoch": 0.542510888089377, + "grad_norm": 0.3073524534702301, + "learning_rate": 1.7829956447642493e-05, + "loss": 0.104, + "step": 5730 + }, + { + "epoch": 0.5434576784699867, + "grad_norm": 0.3362687826156616, + "learning_rate": 1.7826169286120053e-05, + "loss": 0.1145, + "step": 5740 + }, + { + "epoch": 0.5444044688505965, + "grad_norm": 0.30210235714912415, + "learning_rate": 1.7822382124597616e-05, + "loss": 0.1132, + "step": 5750 + }, + { + "epoch": 0.5453512592312062, + "grad_norm": 0.4413982927799225, + "learning_rate": 1.7818594963075176e-05, + "loss": 0.1256, + "step": 5760 + }, + { + "epoch": 0.5462980496118159, + "grad_norm": 0.32046809792518616, + "learning_rate": 1.781480780155274e-05, + "loss": 0.1178, + "step": 5770 + }, + { + "epoch": 0.5472448399924257, + "grad_norm": 0.36214178800582886, + "learning_rate": 1.78110206400303e-05, + "loss": 0.1179, + "step": 5780 + }, + { + "epoch": 0.5481916303730354, + "grad_norm": 0.2953844666481018, + "learning_rate": 1.780723347850786e-05, + "loss": 0.1229, + "step": 5790 + }, + { + "epoch": 0.5491384207536452, + "grad_norm": 0.36594676971435547, + "learning_rate": 1.780344631698542e-05, + "loss": 0.1182, + "step": 5800 + }, + { + "epoch": 0.5500852111342549, + "grad_norm": 0.38411974906921387, + "learning_rate": 1.7799659155462983e-05, + "loss": 0.1322, + "step": 5810 + }, + { + "epoch": 0.5510320015148646, + "grad_norm": 0.3273877799510956, + "learning_rate": 1.7795871993940543e-05, + "loss": 0.1226, + "step": 5820 + }, + { + "epoch": 0.5519787918954744, + "grad_norm": 0.3491196930408478, + "learning_rate": 1.7792084832418103e-05, + "loss": 0.1172, + "step": 5830 + }, + { + "epoch": 0.5529255822760841, + "grad_norm": 0.27573728561401367, + "learning_rate": 1.7788297670895663e-05, + "loss": 0.117, + "step": 5840 + }, + { + "epoch": 0.5538723726566938, + "grad_norm": 0.31645190715789795, + "learning_rate": 1.7784510509373227e-05, + "loss": 0.1122, + "step": 5850 + }, + { + "epoch": 0.5548191630373035, + "grad_norm": 0.3352598249912262, + "learning_rate": 1.7780723347850787e-05, + "loss": 0.1193, + "step": 5860 + }, + { + "epoch": 0.5557659534179132, + "grad_norm": 0.49130764603614807, + "learning_rate": 1.7776936186328347e-05, + "loss": 0.1306, + "step": 5870 + }, + { + "epoch": 0.556712743798523, + "grad_norm": 0.27824321389198303, + "learning_rate": 1.777314902480591e-05, + "loss": 0.1095, + "step": 5880 + }, + { + "epoch": 0.5576595341791327, + "grad_norm": 0.3100660443305969, + "learning_rate": 1.776936186328347e-05, + "loss": 0.1367, + "step": 5890 + }, + { + "epoch": 0.5586063245597425, + "grad_norm": 0.404834508895874, + "learning_rate": 1.7765574701761034e-05, + "loss": 0.1166, + "step": 5900 + }, + { + "epoch": 0.5595531149403522, + "grad_norm": 0.2933177053928375, + "learning_rate": 1.7761787540238594e-05, + "loss": 0.1202, + "step": 5910 + }, + { + "epoch": 0.5604999053209619, + "grad_norm": 0.3327297270298004, + "learning_rate": 1.7758000378716154e-05, + "loss": 0.1185, + "step": 5920 + }, + { + "epoch": 0.5614466957015717, + "grad_norm": 0.2998553514480591, + "learning_rate": 1.7754213217193714e-05, + "loss": 0.1163, + "step": 5930 + }, + { + "epoch": 0.5623934860821814, + "grad_norm": 0.3321799337863922, + "learning_rate": 1.7750426055671274e-05, + "loss": 0.1192, + "step": 5940 + }, + { + "epoch": 0.5633402764627912, + "grad_norm": 0.2803283929824829, + "learning_rate": 1.7746638894148838e-05, + "loss": 0.1112, + "step": 5950 + }, + { + "epoch": 0.5642870668434009, + "grad_norm": 0.28032103180885315, + "learning_rate": 1.7742851732626398e-05, + "loss": 0.1198, + "step": 5960 + }, + { + "epoch": 0.5652338572240106, + "grad_norm": 0.2938515543937683, + "learning_rate": 1.7739064571103958e-05, + "loss": 0.1118, + "step": 5970 + }, + { + "epoch": 0.5661806476046204, + "grad_norm": 0.33386483788490295, + "learning_rate": 1.773527740958152e-05, + "loss": 0.1095, + "step": 5980 + }, + { + "epoch": 0.56712743798523, + "grad_norm": 0.35478469729423523, + "learning_rate": 1.773149024805908e-05, + "loss": 0.1235, + "step": 5990 + }, + { + "epoch": 0.5680742283658398, + "grad_norm": 0.29097840189933777, + "learning_rate": 1.772770308653664e-05, + "loss": 0.1247, + "step": 6000 + }, + { + "epoch": 0.5690210187464495, + "grad_norm": 0.36649322509765625, + "learning_rate": 1.7723915925014205e-05, + "loss": 0.1206, + "step": 6010 + }, + { + "epoch": 0.5699678091270592, + "grad_norm": 0.3363392651081085, + "learning_rate": 1.7720128763491765e-05, + "loss": 0.1144, + "step": 6020 + }, + { + "epoch": 0.570914599507669, + "grad_norm": 0.31241416931152344, + "learning_rate": 1.7716341601969325e-05, + "loss": 0.1145, + "step": 6030 + }, + { + "epoch": 0.5718613898882787, + "grad_norm": 0.34137117862701416, + "learning_rate": 1.7712554440446888e-05, + "loss": 0.1245, + "step": 6040 + }, + { + "epoch": 0.5728081802688885, + "grad_norm": 0.4431675374507904, + "learning_rate": 1.7708767278924448e-05, + "loss": 0.1218, + "step": 6050 + }, + { + "epoch": 0.5737549706494982, + "grad_norm": 0.28357037901878357, + "learning_rate": 1.7704980117402008e-05, + "loss": 0.1097, + "step": 6060 + }, + { + "epoch": 0.574701761030108, + "grad_norm": 0.35272452235221863, + "learning_rate": 1.770119295587957e-05, + "loss": 0.1246, + "step": 6070 + }, + { + "epoch": 0.5756485514107177, + "grad_norm": 0.29270246624946594, + "learning_rate": 1.769740579435713e-05, + "loss": 0.1067, + "step": 6080 + }, + { + "epoch": 0.5765953417913274, + "grad_norm": 0.3070552349090576, + "learning_rate": 1.7693618632834692e-05, + "loss": 0.1131, + "step": 6090 + }, + { + "epoch": 0.5775421321719372, + "grad_norm": 0.42108526825904846, + "learning_rate": 1.7689831471312252e-05, + "loss": 0.1244, + "step": 6100 + }, + { + "epoch": 0.5784889225525469, + "grad_norm": 0.3098716735839844, + "learning_rate": 1.7686044309789815e-05, + "loss": 0.1226, + "step": 6110 + }, + { + "epoch": 0.5794357129331565, + "grad_norm": 0.2832401990890503, + "learning_rate": 1.7682257148267375e-05, + "loss": 0.1135, + "step": 6120 + }, + { + "epoch": 0.5803825033137663, + "grad_norm": 0.3348044455051422, + "learning_rate": 1.767846998674494e-05, + "loss": 0.1198, + "step": 6130 + }, + { + "epoch": 0.581329293694376, + "grad_norm": 0.3181207776069641, + "learning_rate": 1.76746828252225e-05, + "loss": 0.1148, + "step": 6140 + }, + { + "epoch": 0.5822760840749858, + "grad_norm": 0.46395212411880493, + "learning_rate": 1.767089566370006e-05, + "loss": 0.1257, + "step": 6150 + }, + { + "epoch": 0.5832228744555955, + "grad_norm": 0.3275673985481262, + "learning_rate": 1.766710850217762e-05, + "loss": 0.1176, + "step": 6160 + }, + { + "epoch": 0.5841696648362052, + "grad_norm": 0.28788167238235474, + "learning_rate": 1.766332134065518e-05, + "loss": 0.1076, + "step": 6170 + }, + { + "epoch": 0.585116455216815, + "grad_norm": 0.29496243596076965, + "learning_rate": 1.7659534179132742e-05, + "loss": 0.1083, + "step": 6180 + }, + { + "epoch": 0.5860632455974247, + "grad_norm": 0.2746005356311798, + "learning_rate": 1.7655747017610302e-05, + "loss": 0.1139, + "step": 6190 + }, + { + "epoch": 0.5870100359780345, + "grad_norm": 0.3565037250518799, + "learning_rate": 1.7651959856087863e-05, + "loss": 0.1207, + "step": 6200 + }, + { + "epoch": 0.5879568263586442, + "grad_norm": 0.28780317306518555, + "learning_rate": 1.7648172694565423e-05, + "loss": 0.1172, + "step": 6210 + }, + { + "epoch": 0.588903616739254, + "grad_norm": 0.3613559305667877, + "learning_rate": 1.7644385533042986e-05, + "loss": 0.1152, + "step": 6220 + }, + { + "epoch": 0.5898504071198637, + "grad_norm": 0.3410801589488983, + "learning_rate": 1.7640598371520546e-05, + "loss": 0.1113, + "step": 6230 + }, + { + "epoch": 0.5907971975004734, + "grad_norm": 0.26274338364601135, + "learning_rate": 1.763681120999811e-05, + "loss": 0.1147, + "step": 6240 + }, + { + "epoch": 0.5917439878810832, + "grad_norm": 0.3763429820537567, + "learning_rate": 1.763302404847567e-05, + "loss": 0.1142, + "step": 6250 + }, + { + "epoch": 0.5926907782616928, + "grad_norm": 0.4751342833042145, + "learning_rate": 1.762923688695323e-05, + "loss": 0.1187, + "step": 6260 + }, + { + "epoch": 0.5936375686423025, + "grad_norm": 0.3844728171825409, + "learning_rate": 1.7625449725430793e-05, + "loss": 0.1208, + "step": 6270 + }, + { + "epoch": 0.5945843590229123, + "grad_norm": 0.32040935754776, + "learning_rate": 1.7621662563908353e-05, + "loss": 0.1158, + "step": 6280 + }, + { + "epoch": 0.595531149403522, + "grad_norm": 0.34830090403556824, + "learning_rate": 1.7617875402385913e-05, + "loss": 0.114, + "step": 6290 + }, + { + "epoch": 0.5964779397841318, + "grad_norm": 0.3176250159740448, + "learning_rate": 1.7614088240863473e-05, + "loss": 0.1155, + "step": 6300 + }, + { + "epoch": 0.5974247301647415, + "grad_norm": 0.3590896427631378, + "learning_rate": 1.7610301079341033e-05, + "loss": 0.1153, + "step": 6310 + }, + { + "epoch": 0.5983715205453513, + "grad_norm": 0.2998102903366089, + "learning_rate": 1.7606513917818597e-05, + "loss": 0.1179, + "step": 6320 + }, + { + "epoch": 0.599318310925961, + "grad_norm": 0.31011080741882324, + "learning_rate": 1.7602726756296157e-05, + "loss": 0.1295, + "step": 6330 + }, + { + "epoch": 0.6002651013065707, + "grad_norm": 0.30341920256614685, + "learning_rate": 1.759893959477372e-05, + "loss": 0.1214, + "step": 6340 + }, + { + "epoch": 0.6012118916871805, + "grad_norm": 0.372761607170105, + "learning_rate": 1.759515243325128e-05, + "loss": 0.1208, + "step": 6350 + }, + { + "epoch": 0.6021586820677902, + "grad_norm": 0.33373719453811646, + "learning_rate": 1.759136527172884e-05, + "loss": 0.1034, + "step": 6360 + }, + { + "epoch": 0.6031054724484, + "grad_norm": 0.34052538871765137, + "learning_rate": 1.7587578110206404e-05, + "loss": 0.1103, + "step": 6370 + }, + { + "epoch": 0.6040522628290097, + "grad_norm": 0.28972190618515015, + "learning_rate": 1.7583790948683964e-05, + "loss": 0.121, + "step": 6380 + }, + { + "epoch": 0.6049990532096194, + "grad_norm": 0.29406800866127014, + "learning_rate": 1.7580003787161524e-05, + "loss": 0.1238, + "step": 6390 + }, + { + "epoch": 0.6059458435902291, + "grad_norm": 0.4161909520626068, + "learning_rate": 1.7576216625639084e-05, + "loss": 0.1135, + "step": 6400 + }, + { + "epoch": 0.6068926339708388, + "grad_norm": 0.3617647886276245, + "learning_rate": 1.7572429464116647e-05, + "loss": 0.1175, + "step": 6410 + }, + { + "epoch": 0.6078394243514486, + "grad_norm": 0.33809611201286316, + "learning_rate": 1.7568642302594207e-05, + "loss": 0.1043, + "step": 6420 + }, + { + "epoch": 0.6087862147320583, + "grad_norm": 0.3799417316913605, + "learning_rate": 1.7564855141071767e-05, + "loss": 0.1058, + "step": 6430 + }, + { + "epoch": 0.609733005112668, + "grad_norm": 0.3288790285587311, + "learning_rate": 1.7561067979549327e-05, + "loss": 0.1073, + "step": 6440 + }, + { + "epoch": 0.6106797954932778, + "grad_norm": 0.2925897240638733, + "learning_rate": 1.755728081802689e-05, + "loss": 0.1144, + "step": 6450 + }, + { + "epoch": 0.6116265858738875, + "grad_norm": 0.29820024967193604, + "learning_rate": 1.755349365650445e-05, + "loss": 0.1199, + "step": 6460 + }, + { + "epoch": 0.6125733762544973, + "grad_norm": 0.2849036753177643, + "learning_rate": 1.7549706494982014e-05, + "loss": 0.1192, + "step": 6470 + }, + { + "epoch": 0.613520166635107, + "grad_norm": 0.2699211537837982, + "learning_rate": 1.7545919333459574e-05, + "loss": 0.1087, + "step": 6480 + }, + { + "epoch": 0.6144669570157167, + "grad_norm": 0.31521305441856384, + "learning_rate": 1.7542132171937134e-05, + "loss": 0.1141, + "step": 6490 + }, + { + "epoch": 0.6154137473963265, + "grad_norm": 0.28813159465789795, + "learning_rate": 1.7538345010414694e-05, + "loss": 0.1147, + "step": 6500 + }, + { + "epoch": 0.6163605377769362, + "grad_norm": 0.4241039752960205, + "learning_rate": 1.7534557848892258e-05, + "loss": 0.114, + "step": 6510 + }, + { + "epoch": 0.617307328157546, + "grad_norm": 0.2915937602519989, + "learning_rate": 1.7530770687369818e-05, + "loss": 0.1121, + "step": 6520 + }, + { + "epoch": 0.6182541185381557, + "grad_norm": 0.311350554227829, + "learning_rate": 1.7526983525847378e-05, + "loss": 0.1207, + "step": 6530 + }, + { + "epoch": 0.6192009089187653, + "grad_norm": 0.38931339979171753, + "learning_rate": 1.7523196364324938e-05, + "loss": 0.1166, + "step": 6540 + }, + { + "epoch": 0.6201476992993751, + "grad_norm": 0.3369103968143463, + "learning_rate": 1.75194092028025e-05, + "loss": 0.1208, + "step": 6550 + }, + { + "epoch": 0.6210944896799848, + "grad_norm": 0.2888890504837036, + "learning_rate": 1.751562204128006e-05, + "loss": 0.1224, + "step": 6560 + }, + { + "epoch": 0.6220412800605946, + "grad_norm": 0.4478912949562073, + "learning_rate": 1.751183487975762e-05, + "loss": 0.1218, + "step": 6570 + }, + { + "epoch": 0.6229880704412043, + "grad_norm": 0.34411540627479553, + "learning_rate": 1.7508047718235185e-05, + "loss": 0.1205, + "step": 6580 + }, + { + "epoch": 0.623934860821814, + "grad_norm": 0.2995246648788452, + "learning_rate": 1.7504260556712745e-05, + "loss": 0.109, + "step": 6590 + }, + { + "epoch": 0.6248816512024238, + "grad_norm": 0.2678649425506592, + "learning_rate": 1.750047339519031e-05, + "loss": 0.1122, + "step": 6600 + }, + { + "epoch": 0.6258284415830335, + "grad_norm": 0.3918863832950592, + "learning_rate": 1.749668623366787e-05, + "loss": 0.1132, + "step": 6610 + }, + { + "epoch": 0.6267752319636433, + "grad_norm": 0.3421894311904907, + "learning_rate": 1.749289907214543e-05, + "loss": 0.101, + "step": 6620 + }, + { + "epoch": 0.627722022344253, + "grad_norm": 0.32719123363494873, + "learning_rate": 1.748911191062299e-05, + "loss": 0.1193, + "step": 6630 + }, + { + "epoch": 0.6286688127248627, + "grad_norm": 0.43975284695625305, + "learning_rate": 1.748532474910055e-05, + "loss": 0.1129, + "step": 6640 + }, + { + "epoch": 0.6296156031054725, + "grad_norm": 0.27118706703186035, + "learning_rate": 1.7481537587578112e-05, + "loss": 0.1099, + "step": 6650 + }, + { + "epoch": 0.6305623934860822, + "grad_norm": 0.32624804973602295, + "learning_rate": 1.7477750426055672e-05, + "loss": 0.1249, + "step": 6660 + }, + { + "epoch": 0.631509183866692, + "grad_norm": 0.3094474971294403, + "learning_rate": 1.7473963264533232e-05, + "loss": 0.122, + "step": 6670 + }, + { + "epoch": 0.6324559742473016, + "grad_norm": 0.31549081206321716, + "learning_rate": 1.7470176103010796e-05, + "loss": 0.1085, + "step": 6680 + }, + { + "epoch": 0.6334027646279113, + "grad_norm": 0.24869810044765472, + "learning_rate": 1.7466388941488356e-05, + "loss": 0.1182, + "step": 6690 + }, + { + "epoch": 0.6343495550085211, + "grad_norm": 0.30127742886543274, + "learning_rate": 1.746260177996592e-05, + "loss": 0.1004, + "step": 6700 + }, + { + "epoch": 0.6352963453891308, + "grad_norm": 0.47941163182258606, + "learning_rate": 1.745881461844348e-05, + "loss": 0.1176, + "step": 6710 + }, + { + "epoch": 0.6362431357697406, + "grad_norm": 0.35430312156677246, + "learning_rate": 1.745502745692104e-05, + "loss": 0.1153, + "step": 6720 + }, + { + "epoch": 0.6371899261503503, + "grad_norm": 0.2501283288002014, + "learning_rate": 1.74512402953986e-05, + "loss": 0.1101, + "step": 6730 + }, + { + "epoch": 0.63813671653096, + "grad_norm": 0.3357498049736023, + "learning_rate": 1.7447453133876163e-05, + "loss": 0.1191, + "step": 6740 + }, + { + "epoch": 0.6390835069115698, + "grad_norm": 0.30266913771629333, + "learning_rate": 1.7443665972353723e-05, + "loss": 0.1137, + "step": 6750 + }, + { + "epoch": 0.6400302972921795, + "grad_norm": 0.39451736211776733, + "learning_rate": 1.7439878810831283e-05, + "loss": 0.1113, + "step": 6760 + }, + { + "epoch": 0.6409770876727893, + "grad_norm": 0.33438143134117126, + "learning_rate": 1.7436091649308843e-05, + "loss": 0.118, + "step": 6770 + }, + { + "epoch": 0.641923878053399, + "grad_norm": 0.2927396893501282, + "learning_rate": 1.7432304487786403e-05, + "loss": 0.1172, + "step": 6780 + }, + { + "epoch": 0.6428706684340088, + "grad_norm": 0.3146035969257355, + "learning_rate": 1.7428517326263966e-05, + "loss": 0.1073, + "step": 6790 + }, + { + "epoch": 0.6438174588146185, + "grad_norm": 0.27291423082351685, + "learning_rate": 1.7424730164741526e-05, + "loss": 0.1177, + "step": 6800 + }, + { + "epoch": 0.6447642491952281, + "grad_norm": 0.3319913148880005, + "learning_rate": 1.742094300321909e-05, + "loss": 0.119, + "step": 6810 + }, + { + "epoch": 0.6457110395758379, + "grad_norm": 0.3484410345554352, + "learning_rate": 1.741715584169665e-05, + "loss": 0.1201, + "step": 6820 + }, + { + "epoch": 0.6466578299564476, + "grad_norm": 0.2857920229434967, + "learning_rate": 1.7413368680174213e-05, + "loss": 0.1043, + "step": 6830 + }, + { + "epoch": 0.6476046203370573, + "grad_norm": 0.3543633818626404, + "learning_rate": 1.7409581518651773e-05, + "loss": 0.1148, + "step": 6840 + }, + { + "epoch": 0.6485514107176671, + "grad_norm": 0.2742081582546234, + "learning_rate": 1.7405794357129333e-05, + "loss": 0.1037, + "step": 6850 + }, + { + "epoch": 0.6494982010982768, + "grad_norm": 0.36285659670829773, + "learning_rate": 1.7402007195606894e-05, + "loss": 0.1154, + "step": 6860 + }, + { + "epoch": 0.6504449914788866, + "grad_norm": 0.3244501054286957, + "learning_rate": 1.7398220034084454e-05, + "loss": 0.1115, + "step": 6870 + }, + { + "epoch": 0.6513917818594963, + "grad_norm": 0.3813318610191345, + "learning_rate": 1.7394432872562017e-05, + "loss": 0.1125, + "step": 6880 + }, + { + "epoch": 0.652338572240106, + "grad_norm": 0.34014782309532166, + "learning_rate": 1.7390645711039577e-05, + "loss": 0.1154, + "step": 6890 + }, + { + "epoch": 0.6532853626207158, + "grad_norm": 0.31174853444099426, + "learning_rate": 1.7386858549517137e-05, + "loss": 0.1156, + "step": 6900 + }, + { + "epoch": 0.6542321530013255, + "grad_norm": 0.3397250771522522, + "learning_rate": 1.7383071387994697e-05, + "loss": 0.1113, + "step": 6910 + }, + { + "epoch": 0.6551789433819353, + "grad_norm": 0.3041819632053375, + "learning_rate": 1.737928422647226e-05, + "loss": 0.1235, + "step": 6920 + }, + { + "epoch": 0.656125733762545, + "grad_norm": 0.31160038709640503, + "learning_rate": 1.737549706494982e-05, + "loss": 0.1117, + "step": 6930 + }, + { + "epoch": 0.6570725241431548, + "grad_norm": 0.3039339780807495, + "learning_rate": 1.7371709903427384e-05, + "loss": 0.1164, + "step": 6940 + }, + { + "epoch": 0.6580193145237644, + "grad_norm": 0.4804076850414276, + "learning_rate": 1.7367922741904944e-05, + "loss": 0.1148, + "step": 6950 + }, + { + "epoch": 0.6589661049043741, + "grad_norm": 0.31758782267570496, + "learning_rate": 1.7364135580382504e-05, + "loss": 0.1042, + "step": 6960 + }, + { + "epoch": 0.6599128952849839, + "grad_norm": 0.3608994781970978, + "learning_rate": 1.7360348418860068e-05, + "loss": 0.1173, + "step": 6970 + }, + { + "epoch": 0.6608596856655936, + "grad_norm": 0.32902655005455017, + "learning_rate": 1.7356561257337628e-05, + "loss": 0.1105, + "step": 6980 + }, + { + "epoch": 0.6618064760462034, + "grad_norm": 0.29006704688072205, + "learning_rate": 1.7352774095815188e-05, + "loss": 0.1069, + "step": 6990 + }, + { + "epoch": 0.6627532664268131, + "grad_norm": 0.3214852809906006, + "learning_rate": 1.7348986934292748e-05, + "loss": 0.121, + "step": 7000 + }, + { + "epoch": 0.6637000568074228, + "grad_norm": 0.2860215902328491, + "learning_rate": 1.7345199772770308e-05, + "loss": 0.1135, + "step": 7010 + }, + { + "epoch": 0.6646468471880326, + "grad_norm": 0.2932814359664917, + "learning_rate": 1.734141261124787e-05, + "loss": 0.1047, + "step": 7020 + }, + { + "epoch": 0.6655936375686423, + "grad_norm": 0.310357928276062, + "learning_rate": 1.733762544972543e-05, + "loss": 0.1119, + "step": 7030 + }, + { + "epoch": 0.6665404279492521, + "grad_norm": 0.38796934485435486, + "learning_rate": 1.7333838288202995e-05, + "loss": 0.1136, + "step": 7040 + }, + { + "epoch": 0.6674872183298618, + "grad_norm": 0.3642352819442749, + "learning_rate": 1.7330051126680555e-05, + "loss": 0.1071, + "step": 7050 + }, + { + "epoch": 0.6684340087104715, + "grad_norm": 0.40385109186172485, + "learning_rate": 1.7326263965158115e-05, + "loss": 0.1171, + "step": 7060 + }, + { + "epoch": 0.6693807990910813, + "grad_norm": 0.4555074870586395, + "learning_rate": 1.7322476803635678e-05, + "loss": 0.1205, + "step": 7070 + }, + { + "epoch": 0.670327589471691, + "grad_norm": 0.35308781266212463, + "learning_rate": 1.731868964211324e-05, + "loss": 0.1206, + "step": 7080 + }, + { + "epoch": 0.6712743798523007, + "grad_norm": 0.2664026916027069, + "learning_rate": 1.73149024805908e-05, + "loss": 0.1132, + "step": 7090 + }, + { + "epoch": 0.6722211702329104, + "grad_norm": 0.30298927426338196, + "learning_rate": 1.731111531906836e-05, + "loss": 0.1095, + "step": 7100 + }, + { + "epoch": 0.6731679606135201, + "grad_norm": 0.33091816306114197, + "learning_rate": 1.7307328157545922e-05, + "loss": 0.1047, + "step": 7110 + }, + { + "epoch": 0.6741147509941299, + "grad_norm": 0.31363001465797424, + "learning_rate": 1.7303540996023482e-05, + "loss": 0.1074, + "step": 7120 + }, + { + "epoch": 0.6750615413747396, + "grad_norm": 0.3875551223754883, + "learning_rate": 1.7299753834501042e-05, + "loss": 0.1185, + "step": 7130 + }, + { + "epoch": 0.6760083317553494, + "grad_norm": 0.3871995210647583, + "learning_rate": 1.7295966672978602e-05, + "loss": 0.1232, + "step": 7140 + }, + { + "epoch": 0.6769551221359591, + "grad_norm": 0.3625803589820862, + "learning_rate": 1.7292179511456165e-05, + "loss": 0.1158, + "step": 7150 + }, + { + "epoch": 0.6779019125165688, + "grad_norm": 0.4107898473739624, + "learning_rate": 1.7288392349933726e-05, + "loss": 0.1143, + "step": 7160 + }, + { + "epoch": 0.6788487028971786, + "grad_norm": 0.2983371615409851, + "learning_rate": 1.728460518841129e-05, + "loss": 0.1095, + "step": 7170 + }, + { + "epoch": 0.6797954932777883, + "grad_norm": 0.26490136981010437, + "learning_rate": 1.728081802688885e-05, + "loss": 0.1129, + "step": 7180 + }, + { + "epoch": 0.6807422836583981, + "grad_norm": 0.327644020318985, + "learning_rate": 1.727703086536641e-05, + "loss": 0.1198, + "step": 7190 + }, + { + "epoch": 0.6816890740390078, + "grad_norm": 0.37036940455436707, + "learning_rate": 1.727324370384397e-05, + "loss": 0.127, + "step": 7200 + }, + { + "epoch": 0.6826358644196175, + "grad_norm": 0.30712732672691345, + "learning_rate": 1.7269456542321533e-05, + "loss": 0.1046, + "step": 7210 + }, + { + "epoch": 0.6835826548002273, + "grad_norm": 0.30205264687538147, + "learning_rate": 1.7265669380799093e-05, + "loss": 0.1131, + "step": 7220 + }, + { + "epoch": 0.6845294451808369, + "grad_norm": 0.27775585651397705, + "learning_rate": 1.7261882219276653e-05, + "loss": 0.1251, + "step": 7230 + }, + { + "epoch": 0.6854762355614467, + "grad_norm": 0.3181157410144806, + "learning_rate": 1.7258095057754213e-05, + "loss": 0.1225, + "step": 7240 + }, + { + "epoch": 0.6864230259420564, + "grad_norm": 0.3210241496562958, + "learning_rate": 1.7254307896231776e-05, + "loss": 0.1037, + "step": 7250 + }, + { + "epoch": 0.6873698163226661, + "grad_norm": 0.36600542068481445, + "learning_rate": 1.7250520734709336e-05, + "loss": 0.1176, + "step": 7260 + }, + { + "epoch": 0.6883166067032759, + "grad_norm": 0.455118864774704, + "learning_rate": 1.7246733573186896e-05, + "loss": 0.1241, + "step": 7270 + }, + { + "epoch": 0.6892633970838856, + "grad_norm": 0.3404286205768585, + "learning_rate": 1.724294641166446e-05, + "loss": 0.1216, + "step": 7280 + }, + { + "epoch": 0.6902101874644954, + "grad_norm": 0.33997026085853577, + "learning_rate": 1.723915925014202e-05, + "loss": 0.1202, + "step": 7290 + }, + { + "epoch": 0.6911569778451051, + "grad_norm": 0.3330562710762024, + "learning_rate": 1.7235372088619583e-05, + "loss": 0.1106, + "step": 7300 + }, + { + "epoch": 0.6921037682257148, + "grad_norm": 0.29542532563209534, + "learning_rate": 1.7231584927097143e-05, + "loss": 0.1236, + "step": 7310 + }, + { + "epoch": 0.6930505586063246, + "grad_norm": 0.38714075088500977, + "learning_rate": 1.7227797765574703e-05, + "loss": 0.1203, + "step": 7320 + }, + { + "epoch": 0.6939973489869343, + "grad_norm": 0.36225587129592896, + "learning_rate": 1.7224010604052263e-05, + "loss": 0.1127, + "step": 7330 + }, + { + "epoch": 0.6949441393675441, + "grad_norm": 0.3260248899459839, + "learning_rate": 1.7220223442529823e-05, + "loss": 0.1212, + "step": 7340 + }, + { + "epoch": 0.6958909297481538, + "grad_norm": 0.3012619912624359, + "learning_rate": 1.7216436281007387e-05, + "loss": 0.1196, + "step": 7350 + }, + { + "epoch": 0.6968377201287634, + "grad_norm": 0.3051506280899048, + "learning_rate": 1.7212649119484947e-05, + "loss": 0.1118, + "step": 7360 + }, + { + "epoch": 0.6977845105093732, + "grad_norm": 0.33954405784606934, + "learning_rate": 1.7208861957962507e-05, + "loss": 0.1286, + "step": 7370 + }, + { + "epoch": 0.6987313008899829, + "grad_norm": 0.2781881093978882, + "learning_rate": 1.720507479644007e-05, + "loss": 0.1107, + "step": 7380 + }, + { + "epoch": 0.6996780912705927, + "grad_norm": 0.4584430456161499, + "learning_rate": 1.720128763491763e-05, + "loss": 0.11, + "step": 7390 + }, + { + "epoch": 0.7006248816512024, + "grad_norm": 0.30674710869789124, + "learning_rate": 1.7197500473395194e-05, + "loss": 0.127, + "step": 7400 + }, + { + "epoch": 0.7015716720318121, + "grad_norm": 0.2992197573184967, + "learning_rate": 1.7193713311872754e-05, + "loss": 0.1096, + "step": 7410 + }, + { + "epoch": 0.7025184624124219, + "grad_norm": 0.35374733805656433, + "learning_rate": 1.7189926150350314e-05, + "loss": 0.1053, + "step": 7420 + }, + { + "epoch": 0.7034652527930316, + "grad_norm": 0.4027782082557678, + "learning_rate": 1.7186138988827874e-05, + "loss": 0.1238, + "step": 7430 + }, + { + "epoch": 0.7044120431736414, + "grad_norm": 0.2525242269039154, + "learning_rate": 1.7182351827305437e-05, + "loss": 0.1187, + "step": 7440 + }, + { + "epoch": 0.7053588335542511, + "grad_norm": 0.34557586908340454, + "learning_rate": 1.7178564665782997e-05, + "loss": 0.1149, + "step": 7450 + }, + { + "epoch": 0.7063056239348608, + "grad_norm": 0.3434407413005829, + "learning_rate": 1.7174777504260558e-05, + "loss": 0.1202, + "step": 7460 + }, + { + "epoch": 0.7072524143154706, + "grad_norm": 0.3270295262336731, + "learning_rate": 1.7170990342738118e-05, + "loss": 0.1214, + "step": 7470 + }, + { + "epoch": 0.7081992046960803, + "grad_norm": 0.3213960528373718, + "learning_rate": 1.7167203181215678e-05, + "loss": 0.1166, + "step": 7480 + }, + { + "epoch": 0.7091459950766901, + "grad_norm": 0.4006138741970062, + "learning_rate": 1.716341601969324e-05, + "loss": 0.1221, + "step": 7490 + }, + { + "epoch": 0.7100927854572997, + "grad_norm": 0.36110085248947144, + "learning_rate": 1.71596288581708e-05, + "loss": 0.1167, + "step": 7500 + }, + { + "epoch": 0.7110395758379094, + "grad_norm": 0.3229648172855377, + "learning_rate": 1.7155841696648365e-05, + "loss": 0.1179, + "step": 7510 + }, + { + "epoch": 0.7119863662185192, + "grad_norm": 0.3359043300151825, + "learning_rate": 1.7152054535125925e-05, + "loss": 0.1186, + "step": 7520 + }, + { + "epoch": 0.7129331565991289, + "grad_norm": 0.32050198316574097, + "learning_rate": 1.7148267373603488e-05, + "loss": 0.1256, + "step": 7530 + }, + { + "epoch": 0.7138799469797387, + "grad_norm": 0.31654369831085205, + "learning_rate": 1.7144480212081048e-05, + "loss": 0.1185, + "step": 7540 + }, + { + "epoch": 0.7148267373603484, + "grad_norm": 0.35290828347206116, + "learning_rate": 1.7140693050558608e-05, + "loss": 0.1112, + "step": 7550 + }, + { + "epoch": 0.7157735277409581, + "grad_norm": 0.3996545076370239, + "learning_rate": 1.7136905889036168e-05, + "loss": 0.1186, + "step": 7560 + }, + { + "epoch": 0.7167203181215679, + "grad_norm": 0.33729514479637146, + "learning_rate": 1.7133118727513728e-05, + "loss": 0.1138, + "step": 7570 + }, + { + "epoch": 0.7176671085021776, + "grad_norm": 0.2941891849040985, + "learning_rate": 1.712933156599129e-05, + "loss": 0.113, + "step": 7580 + }, + { + "epoch": 0.7186138988827874, + "grad_norm": 0.28606683015823364, + "learning_rate": 1.7125544404468852e-05, + "loss": 0.1114, + "step": 7590 + }, + { + "epoch": 0.7195606892633971, + "grad_norm": 0.4108836054801941, + "learning_rate": 1.7121757242946412e-05, + "loss": 0.1291, + "step": 7600 + }, + { + "epoch": 0.7205074796440069, + "grad_norm": 0.3045104742050171, + "learning_rate": 1.7117970081423975e-05, + "loss": 0.1146, + "step": 7610 + }, + { + "epoch": 0.7214542700246166, + "grad_norm": 0.3012486398220062, + "learning_rate": 1.7114182919901535e-05, + "loss": 0.1104, + "step": 7620 + }, + { + "epoch": 0.7224010604052263, + "grad_norm": 0.28472423553466797, + "learning_rate": 1.7110395758379095e-05, + "loss": 0.0998, + "step": 7630 + }, + { + "epoch": 0.723347850785836, + "grad_norm": 0.3129308223724365, + "learning_rate": 1.710660859685666e-05, + "loss": 0.1154, + "step": 7640 + }, + { + "epoch": 0.7242946411664457, + "grad_norm": 0.33357781171798706, + "learning_rate": 1.710282143533422e-05, + "loss": 0.1186, + "step": 7650 + }, + { + "epoch": 0.7252414315470554, + "grad_norm": 0.300777792930603, + "learning_rate": 1.709903427381178e-05, + "loss": 0.1217, + "step": 7660 + }, + { + "epoch": 0.7261882219276652, + "grad_norm": 0.3664512634277344, + "learning_rate": 1.7095247112289342e-05, + "loss": 0.1069, + "step": 7670 + }, + { + "epoch": 0.7271350123082749, + "grad_norm": 0.3453938364982605, + "learning_rate": 1.7091459950766902e-05, + "loss": 0.1149, + "step": 7680 + }, + { + "epoch": 0.7280818026888847, + "grad_norm": 0.24504321813583374, + "learning_rate": 1.7087672789244462e-05, + "loss": 0.111, + "step": 7690 + }, + { + "epoch": 0.7290285930694944, + "grad_norm": 0.38600441813468933, + "learning_rate": 1.7083885627722022e-05, + "loss": 0.1129, + "step": 7700 + }, + { + "epoch": 0.7299753834501042, + "grad_norm": 0.35876065492630005, + "learning_rate": 1.7080098466199582e-05, + "loss": 0.1144, + "step": 7710 + }, + { + "epoch": 0.7309221738307139, + "grad_norm": 0.3433103859424591, + "learning_rate": 1.7076311304677146e-05, + "loss": 0.1145, + "step": 7720 + }, + { + "epoch": 0.7318689642113236, + "grad_norm": 0.4363567531108856, + "learning_rate": 1.7072524143154706e-05, + "loss": 0.1068, + "step": 7730 + }, + { + "epoch": 0.7328157545919334, + "grad_norm": 0.2609582841396332, + "learning_rate": 1.706873698163227e-05, + "loss": 0.1039, + "step": 7740 + }, + { + "epoch": 0.7337625449725431, + "grad_norm": 0.251044899225235, + "learning_rate": 1.706494982010983e-05, + "loss": 0.1292, + "step": 7750 + }, + { + "epoch": 0.7347093353531529, + "grad_norm": 0.3957395553588867, + "learning_rate": 1.706116265858739e-05, + "loss": 0.1105, + "step": 7760 + }, + { + "epoch": 0.7356561257337626, + "grad_norm": 0.3416815996170044, + "learning_rate": 1.7057375497064953e-05, + "loss": 0.1149, + "step": 7770 + }, + { + "epoch": 0.7366029161143722, + "grad_norm": 0.3129086196422577, + "learning_rate": 1.7053588335542513e-05, + "loss": 0.1176, + "step": 7780 + }, + { + "epoch": 0.737549706494982, + "grad_norm": 0.3613971471786499, + "learning_rate": 1.7049801174020073e-05, + "loss": 0.1061, + "step": 7790 + }, + { + "epoch": 0.7384964968755917, + "grad_norm": 0.32689687609672546, + "learning_rate": 1.7046014012497633e-05, + "loss": 0.1111, + "step": 7800 + }, + { + "epoch": 0.7394432872562015, + "grad_norm": 0.30839022994041443, + "learning_rate": 1.7042226850975197e-05, + "loss": 0.127, + "step": 7810 + }, + { + "epoch": 0.7403900776368112, + "grad_norm": 0.481495201587677, + "learning_rate": 1.7038439689452757e-05, + "loss": 0.1147, + "step": 7820 + }, + { + "epoch": 0.7413368680174209, + "grad_norm": 0.27305448055267334, + "learning_rate": 1.7034652527930317e-05, + "loss": 0.1233, + "step": 7830 + }, + { + "epoch": 0.7422836583980307, + "grad_norm": 0.35240069031715393, + "learning_rate": 1.7030865366407877e-05, + "loss": 0.114, + "step": 7840 + }, + { + "epoch": 0.7432304487786404, + "grad_norm": 0.39616870880126953, + "learning_rate": 1.702707820488544e-05, + "loss": 0.1218, + "step": 7850 + }, + { + "epoch": 0.7441772391592502, + "grad_norm": 0.2716734707355499, + "learning_rate": 1.7023291043363e-05, + "loss": 0.1211, + "step": 7860 + }, + { + "epoch": 0.7451240295398599, + "grad_norm": 0.26934340596199036, + "learning_rate": 1.7019503881840564e-05, + "loss": 0.1042, + "step": 7870 + }, + { + "epoch": 0.7460708199204696, + "grad_norm": 0.27856406569480896, + "learning_rate": 1.7015716720318124e-05, + "loss": 0.1126, + "step": 7880 + }, + { + "epoch": 0.7470176103010794, + "grad_norm": 0.2654203772544861, + "learning_rate": 1.7011929558795684e-05, + "loss": 0.1286, + "step": 7890 + }, + { + "epoch": 0.7479644006816891, + "grad_norm": 0.29593995213508606, + "learning_rate": 1.7008142397273244e-05, + "loss": 0.1136, + "step": 7900 + }, + { + "epoch": 0.7489111910622988, + "grad_norm": 0.32165074348449707, + "learning_rate": 1.7004355235750807e-05, + "loss": 0.1229, + "step": 7910 + }, + { + "epoch": 0.7498579814429085, + "grad_norm": 0.2874644696712494, + "learning_rate": 1.7000568074228367e-05, + "loss": 0.1071, + "step": 7920 + }, + { + "epoch": 0.7508047718235182, + "grad_norm": 0.3464849293231964, + "learning_rate": 1.6996780912705927e-05, + "loss": 0.1043, + "step": 7930 + }, + { + "epoch": 0.751751562204128, + "grad_norm": 0.27734798192977905, + "learning_rate": 1.6992993751183487e-05, + "loss": 0.1211, + "step": 7940 + }, + { + "epoch": 0.7526983525847377, + "grad_norm": 0.2745431065559387, + "learning_rate": 1.698920658966105e-05, + "loss": 0.1188, + "step": 7950 + }, + { + "epoch": 0.7536451429653475, + "grad_norm": 0.3578427731990814, + "learning_rate": 1.698541942813861e-05, + "loss": 0.1201, + "step": 7960 + }, + { + "epoch": 0.7545919333459572, + "grad_norm": 0.26581892371177673, + "learning_rate": 1.6981632266616174e-05, + "loss": 0.1136, + "step": 7970 + }, + { + "epoch": 0.7555387237265669, + "grad_norm": 0.32221293449401855, + "learning_rate": 1.6977845105093734e-05, + "loss": 0.11, + "step": 7980 + }, + { + "epoch": 0.7564855141071767, + "grad_norm": 0.2992156445980072, + "learning_rate": 1.6974057943571294e-05, + "loss": 0.1014, + "step": 7990 + }, + { + "epoch": 0.7574323044877864, + "grad_norm": 0.3733976483345032, + "learning_rate": 1.6970270782048858e-05, + "loss": 0.1099, + "step": 8000 + }, + { + "epoch": 0.7583790948683962, + "grad_norm": 0.25940683484077454, + "learning_rate": 1.6966483620526418e-05, + "loss": 0.105, + "step": 8010 + }, + { + "epoch": 0.7593258852490059, + "grad_norm": 0.3570631742477417, + "learning_rate": 1.6962696459003978e-05, + "loss": 0.1009, + "step": 8020 + }, + { + "epoch": 0.7602726756296156, + "grad_norm": 0.30129891633987427, + "learning_rate": 1.6958909297481538e-05, + "loss": 0.1156, + "step": 8030 + }, + { + "epoch": 0.7612194660102254, + "grad_norm": 0.3454397916793823, + "learning_rate": 1.6955122135959098e-05, + "loss": 0.1191, + "step": 8040 + }, + { + "epoch": 0.762166256390835, + "grad_norm": 0.2947542071342468, + "learning_rate": 1.695133497443666e-05, + "loss": 0.1053, + "step": 8050 + }, + { + "epoch": 0.7631130467714448, + "grad_norm": 0.3459526300430298, + "learning_rate": 1.694754781291422e-05, + "loss": 0.1187, + "step": 8060 + }, + { + "epoch": 0.7640598371520545, + "grad_norm": 0.3022899031639099, + "learning_rate": 1.694376065139178e-05, + "loss": 0.1118, + "step": 8070 + }, + { + "epoch": 0.7650066275326642, + "grad_norm": 0.3002983331680298, + "learning_rate": 1.6939973489869345e-05, + "loss": 0.121, + "step": 8080 + }, + { + "epoch": 0.765953417913274, + "grad_norm": 0.2755148410797119, + "learning_rate": 1.6936186328346905e-05, + "loss": 0.1182, + "step": 8090 + }, + { + "epoch": 0.7669002082938837, + "grad_norm": 0.34115657210350037, + "learning_rate": 1.693239916682447e-05, + "loss": 0.1171, + "step": 8100 + }, + { + "epoch": 0.7678469986744935, + "grad_norm": 0.24916787445545197, + "learning_rate": 1.692861200530203e-05, + "loss": 0.1161, + "step": 8110 + }, + { + "epoch": 0.7687937890551032, + "grad_norm": 0.27540966868400574, + "learning_rate": 1.692482484377959e-05, + "loss": 0.1125, + "step": 8120 + }, + { + "epoch": 0.769740579435713, + "grad_norm": 0.3216499090194702, + "learning_rate": 1.692103768225715e-05, + "loss": 0.1069, + "step": 8130 + }, + { + "epoch": 0.7706873698163227, + "grad_norm": 0.33225181698799133, + "learning_rate": 1.6917250520734712e-05, + "loss": 0.1101, + "step": 8140 + }, + { + "epoch": 0.7716341601969324, + "grad_norm": 0.3109791576862335, + "learning_rate": 1.6913463359212272e-05, + "loss": 0.1137, + "step": 8150 + }, + { + "epoch": 0.7725809505775422, + "grad_norm": 0.3619652986526489, + "learning_rate": 1.6909676197689832e-05, + "loss": 0.1202, + "step": 8160 + }, + { + "epoch": 0.7735277409581519, + "grad_norm": 0.3975580930709839, + "learning_rate": 1.6905889036167392e-05, + "loss": 0.1161, + "step": 8170 + }, + { + "epoch": 0.7744745313387617, + "grad_norm": 0.35568127036094666, + "learning_rate": 1.6902101874644952e-05, + "loss": 0.1113, + "step": 8180 + }, + { + "epoch": 0.7754213217193713, + "grad_norm": 0.356973260641098, + "learning_rate": 1.6898314713122516e-05, + "loss": 0.1047, + "step": 8190 + }, + { + "epoch": 0.776368112099981, + "grad_norm": 0.4212486445903778, + "learning_rate": 1.6894527551600076e-05, + "loss": 0.1096, + "step": 8200 + }, + { + "epoch": 0.7773149024805908, + "grad_norm": 0.34463343024253845, + "learning_rate": 1.689074039007764e-05, + "loss": 0.1244, + "step": 8210 + }, + { + "epoch": 0.7782616928612005, + "grad_norm": 0.3768672049045563, + "learning_rate": 1.68869532285552e-05, + "loss": 0.1158, + "step": 8220 + }, + { + "epoch": 0.7792084832418102, + "grad_norm": 0.36685991287231445, + "learning_rate": 1.6883166067032763e-05, + "loss": 0.1141, + "step": 8230 + }, + { + "epoch": 0.78015527362242, + "grad_norm": 0.3462464213371277, + "learning_rate": 1.6879378905510323e-05, + "loss": 0.1173, + "step": 8240 + }, + { + "epoch": 0.7811020640030297, + "grad_norm": 0.38483893871307373, + "learning_rate": 1.6875591743987883e-05, + "loss": 0.1107, + "step": 8250 + }, + { + "epoch": 0.7820488543836395, + "grad_norm": 0.3127537667751312, + "learning_rate": 1.6871804582465443e-05, + "loss": 0.1179, + "step": 8260 + }, + { + "epoch": 0.7829956447642492, + "grad_norm": 0.39382901787757874, + "learning_rate": 1.6868017420943003e-05, + "loss": 0.121, + "step": 8270 + }, + { + "epoch": 0.783942435144859, + "grad_norm": 0.35741671919822693, + "learning_rate": 1.6864230259420566e-05, + "loss": 0.1101, + "step": 8280 + }, + { + "epoch": 0.7848892255254687, + "grad_norm": 0.3009890615940094, + "learning_rate": 1.6860443097898126e-05, + "loss": 0.1231, + "step": 8290 + }, + { + "epoch": 0.7858360159060784, + "grad_norm": 0.34346726536750793, + "learning_rate": 1.6856655936375686e-05, + "loss": 0.1222, + "step": 8300 + }, + { + "epoch": 0.7867828062866882, + "grad_norm": 0.43910688161849976, + "learning_rate": 1.685286877485325e-05, + "loss": 0.1129, + "step": 8310 + }, + { + "epoch": 0.7877295966672979, + "grad_norm": 0.36337924003601074, + "learning_rate": 1.684908161333081e-05, + "loss": 0.1164, + "step": 8320 + }, + { + "epoch": 0.7886763870479075, + "grad_norm": 0.3842446208000183, + "learning_rate": 1.6845294451808373e-05, + "loss": 0.12, + "step": 8330 + }, + { + "epoch": 0.7896231774285173, + "grad_norm": 0.4252307713031769, + "learning_rate": 1.6841507290285933e-05, + "loss": 0.1145, + "step": 8340 + }, + { + "epoch": 0.790569967809127, + "grad_norm": 0.3211591839790344, + "learning_rate": 1.6837720128763493e-05, + "loss": 0.1016, + "step": 8350 + }, + { + "epoch": 0.7915167581897368, + "grad_norm": 0.35847267508506775, + "learning_rate": 1.6833932967241053e-05, + "loss": 0.1064, + "step": 8360 + }, + { + "epoch": 0.7924635485703465, + "grad_norm": 0.3097435534000397, + "learning_rate": 1.6830145805718617e-05, + "loss": 0.1195, + "step": 8370 + }, + { + "epoch": 0.7934103389509563, + "grad_norm": 0.43024635314941406, + "learning_rate": 1.6826358644196177e-05, + "loss": 0.106, + "step": 8380 + }, + { + "epoch": 0.794357129331566, + "grad_norm": 0.23815903067588806, + "learning_rate": 1.6822571482673737e-05, + "loss": 0.1055, + "step": 8390 + }, + { + "epoch": 0.7953039197121757, + "grad_norm": 0.32491034269332886, + "learning_rate": 1.6818784321151297e-05, + "loss": 0.1108, + "step": 8400 + }, + { + "epoch": 0.7962507100927855, + "grad_norm": 0.3355867862701416, + "learning_rate": 1.6814997159628857e-05, + "loss": 0.1172, + "step": 8410 + }, + { + "epoch": 0.7971975004733952, + "grad_norm": 0.4188220500946045, + "learning_rate": 1.681120999810642e-05, + "loss": 0.1208, + "step": 8420 + }, + { + "epoch": 0.798144290854005, + "grad_norm": 0.4368966817855835, + "learning_rate": 1.680742283658398e-05, + "loss": 0.1239, + "step": 8430 + }, + { + "epoch": 0.7990910812346147, + "grad_norm": 0.28890368342399597, + "learning_rate": 1.6803635675061544e-05, + "loss": 0.1179, + "step": 8440 + }, + { + "epoch": 0.8000378716152244, + "grad_norm": 0.28585273027420044, + "learning_rate": 1.6799848513539104e-05, + "loss": 0.1128, + "step": 8450 + }, + { + "epoch": 0.8009846619958341, + "grad_norm": 0.32358139753341675, + "learning_rate": 1.6796061352016667e-05, + "loss": 0.1168, + "step": 8460 + }, + { + "epoch": 0.8019314523764438, + "grad_norm": 0.28638800978660583, + "learning_rate": 1.6792274190494228e-05, + "loss": 0.1167, + "step": 8470 + }, + { + "epoch": 0.8028782427570536, + "grad_norm": 0.2773953974246979, + "learning_rate": 1.6788487028971788e-05, + "loss": 0.1208, + "step": 8480 + }, + { + "epoch": 0.8038250331376633, + "grad_norm": 0.34793245792388916, + "learning_rate": 1.6784699867449348e-05, + "loss": 0.1176, + "step": 8490 + }, + { + "epoch": 0.804771823518273, + "grad_norm": 0.3978104591369629, + "learning_rate": 1.6780912705926908e-05, + "loss": 0.1153, + "step": 8500 + }, + { + "epoch": 0.8057186138988828, + "grad_norm": 0.2925117313861847, + "learning_rate": 1.677712554440447e-05, + "loss": 0.1156, + "step": 8510 + }, + { + "epoch": 0.8066654042794925, + "grad_norm": 0.30174776911735535, + "learning_rate": 1.677333838288203e-05, + "loss": 0.1177, + "step": 8520 + }, + { + "epoch": 0.8076121946601023, + "grad_norm": 0.27571502327919006, + "learning_rate": 1.676955122135959e-05, + "loss": 0.1082, + "step": 8530 + }, + { + "epoch": 0.808558985040712, + "grad_norm": 0.34001198410987854, + "learning_rate": 1.676576405983715e-05, + "loss": 0.1208, + "step": 8540 + }, + { + "epoch": 0.8095057754213217, + "grad_norm": 0.3395918309688568, + "learning_rate": 1.6761976898314715e-05, + "loss": 0.1197, + "step": 8550 + }, + { + "epoch": 0.8104525658019315, + "grad_norm": 0.31503212451934814, + "learning_rate": 1.6758189736792275e-05, + "loss": 0.1151, + "step": 8560 + }, + { + "epoch": 0.8113993561825412, + "grad_norm": 0.2970471680164337, + "learning_rate": 1.6754402575269838e-05, + "loss": 0.1102, + "step": 8570 + }, + { + "epoch": 0.812346146563151, + "grad_norm": 0.2834688425064087, + "learning_rate": 1.6750615413747398e-05, + "loss": 0.1163, + "step": 8580 + }, + { + "epoch": 0.8132929369437607, + "grad_norm": 0.4195939600467682, + "learning_rate": 1.6746828252224958e-05, + "loss": 0.1146, + "step": 8590 + }, + { + "epoch": 0.8142397273243703, + "grad_norm": 0.2607402801513672, + "learning_rate": 1.6743041090702522e-05, + "loss": 0.1109, + "step": 8600 + }, + { + "epoch": 0.8151865177049801, + "grad_norm": 0.2828407883644104, + "learning_rate": 1.6739253929180082e-05, + "loss": 0.1215, + "step": 8610 + }, + { + "epoch": 0.8161333080855898, + "grad_norm": 0.2749273180961609, + "learning_rate": 1.6735466767657642e-05, + "loss": 0.1039, + "step": 8620 + }, + { + "epoch": 0.8170800984661996, + "grad_norm": 0.3468409478664398, + "learning_rate": 1.6731679606135202e-05, + "loss": 0.1093, + "step": 8630 + }, + { + "epoch": 0.8180268888468093, + "grad_norm": 0.30343708395957947, + "learning_rate": 1.6727892444612762e-05, + "loss": 0.1089, + "step": 8640 + }, + { + "epoch": 0.818973679227419, + "grad_norm": 0.33821362257003784, + "learning_rate": 1.6724105283090325e-05, + "loss": 0.1059, + "step": 8650 + }, + { + "epoch": 0.8199204696080288, + "grad_norm": 0.3864172101020813, + "learning_rate": 1.6720318121567885e-05, + "loss": 0.1176, + "step": 8660 + }, + { + "epoch": 0.8208672599886385, + "grad_norm": 0.4220865070819855, + "learning_rate": 1.671653096004545e-05, + "loss": 0.1213, + "step": 8670 + }, + { + "epoch": 0.8218140503692483, + "grad_norm": 0.2996737062931061, + "learning_rate": 1.671274379852301e-05, + "loss": 0.11, + "step": 8680 + }, + { + "epoch": 0.822760840749858, + "grad_norm": 0.3411973714828491, + "learning_rate": 1.670895663700057e-05, + "loss": 0.1227, + "step": 8690 + }, + { + "epoch": 0.8237076311304677, + "grad_norm": 0.34379059076309204, + "learning_rate": 1.6705169475478132e-05, + "loss": 0.1144, + "step": 8700 + }, + { + "epoch": 0.8246544215110775, + "grad_norm": 0.3353620171546936, + "learning_rate": 1.6701382313955692e-05, + "loss": 0.1053, + "step": 8710 + }, + { + "epoch": 0.8256012118916872, + "grad_norm": 0.3999831974506378, + "learning_rate": 1.6697595152433252e-05, + "loss": 0.1205, + "step": 8720 + }, + { + "epoch": 0.826548002272297, + "grad_norm": 0.34344691038131714, + "learning_rate": 1.6693807990910813e-05, + "loss": 0.1145, + "step": 8730 + }, + { + "epoch": 0.8274947926529066, + "grad_norm": 0.3627908229827881, + "learning_rate": 1.6690020829388376e-05, + "loss": 0.1125, + "step": 8740 + }, + { + "epoch": 0.8284415830335163, + "grad_norm": 0.40607041120529175, + "learning_rate": 1.6686233667865936e-05, + "loss": 0.1202, + "step": 8750 + }, + { + "epoch": 0.8293883734141261, + "grad_norm": 0.278888076543808, + "learning_rate": 1.6682446506343496e-05, + "loss": 0.1136, + "step": 8760 + }, + { + "epoch": 0.8303351637947358, + "grad_norm": 0.32712674140930176, + "learning_rate": 1.6678659344821056e-05, + "loss": 0.1088, + "step": 8770 + }, + { + "epoch": 0.8312819541753456, + "grad_norm": 0.44736137986183167, + "learning_rate": 1.667487218329862e-05, + "loss": 0.1121, + "step": 8780 + }, + { + "epoch": 0.8322287445559553, + "grad_norm": 0.3264457583427429, + "learning_rate": 1.667108502177618e-05, + "loss": 0.1052, + "step": 8790 + }, + { + "epoch": 0.833175534936565, + "grad_norm": 0.3332121670246124, + "learning_rate": 1.6667297860253743e-05, + "loss": 0.1184, + "step": 8800 + }, + { + "epoch": 0.8341223253171748, + "grad_norm": 0.3029613196849823, + "learning_rate": 1.6663510698731303e-05, + "loss": 0.1133, + "step": 8810 + }, + { + "epoch": 0.8350691156977845, + "grad_norm": 0.33913642168045044, + "learning_rate": 1.6659723537208863e-05, + "loss": 0.1086, + "step": 8820 + }, + { + "epoch": 0.8360159060783943, + "grad_norm": 0.2835783362388611, + "learning_rate": 1.6655936375686423e-05, + "loss": 0.1174, + "step": 8830 + }, + { + "epoch": 0.836962696459004, + "grad_norm": 0.3837894797325134, + "learning_rate": 1.6652149214163987e-05, + "loss": 0.1116, + "step": 8840 + }, + { + "epoch": 0.8379094868396137, + "grad_norm": 0.29306384921073914, + "learning_rate": 1.6648362052641547e-05, + "loss": 0.1149, + "step": 8850 + }, + { + "epoch": 0.8388562772202235, + "grad_norm": 0.3184460997581482, + "learning_rate": 1.6644574891119107e-05, + "loss": 0.1169, + "step": 8860 + }, + { + "epoch": 0.8398030676008332, + "grad_norm": 0.3720015287399292, + "learning_rate": 1.6640787729596667e-05, + "loss": 0.1211, + "step": 8870 + }, + { + "epoch": 0.8407498579814429, + "grad_norm": 0.423492431640625, + "learning_rate": 1.663700056807423e-05, + "loss": 0.1099, + "step": 8880 + }, + { + "epoch": 0.8416966483620526, + "grad_norm": 0.32279732823371887, + "learning_rate": 1.663321340655179e-05, + "loss": 0.1116, + "step": 8890 + }, + { + "epoch": 0.8426434387426623, + "grad_norm": 0.26023128628730774, + "learning_rate": 1.662942624502935e-05, + "loss": 0.101, + "step": 8900 + }, + { + "epoch": 0.8435902291232721, + "grad_norm": 0.34170594811439514, + "learning_rate": 1.6625639083506914e-05, + "loss": 0.1267, + "step": 8910 + }, + { + "epoch": 0.8445370195038818, + "grad_norm": 0.27268069982528687, + "learning_rate": 1.6621851921984474e-05, + "loss": 0.1104, + "step": 8920 + }, + { + "epoch": 0.8454838098844916, + "grad_norm": 0.27401381731033325, + "learning_rate": 1.6618064760462037e-05, + "loss": 0.1003, + "step": 8930 + }, + { + "epoch": 0.8464306002651013, + "grad_norm": 0.3447090983390808, + "learning_rate": 1.6614277598939597e-05, + "loss": 0.1209, + "step": 8940 + }, + { + "epoch": 0.847377390645711, + "grad_norm": 0.3804704546928406, + "learning_rate": 1.6610490437417157e-05, + "loss": 0.1217, + "step": 8950 + }, + { + "epoch": 0.8483241810263208, + "grad_norm": 0.47949594259262085, + "learning_rate": 1.6606703275894717e-05, + "loss": 0.1173, + "step": 8960 + }, + { + "epoch": 0.8492709714069305, + "grad_norm": 0.3841295838356018, + "learning_rate": 1.6602916114372277e-05, + "loss": 0.1079, + "step": 8970 + }, + { + "epoch": 0.8502177617875403, + "grad_norm": 0.29418498277664185, + "learning_rate": 1.659912895284984e-05, + "loss": 0.111, + "step": 8980 + }, + { + "epoch": 0.85116455216815, + "grad_norm": 0.30468037724494934, + "learning_rate": 1.65953417913274e-05, + "loss": 0.116, + "step": 8990 + }, + { + "epoch": 0.8521113425487598, + "grad_norm": 0.2763366997241974, + "learning_rate": 1.659155462980496e-05, + "loss": 0.116, + "step": 9000 + }, + { + "epoch": 0.8530581329293694, + "grad_norm": 0.30675357580184937, + "learning_rate": 1.6587767468282524e-05, + "loss": 0.11, + "step": 9010 + }, + { + "epoch": 0.8540049233099791, + "grad_norm": 0.31186580657958984, + "learning_rate": 1.6583980306760084e-05, + "loss": 0.115, + "step": 9020 + }, + { + "epoch": 0.8549517136905889, + "grad_norm": 0.4842294156551361, + "learning_rate": 1.6580193145237648e-05, + "loss": 0.1288, + "step": 9030 + }, + { + "epoch": 0.8558985040711986, + "grad_norm": 0.3168499767780304, + "learning_rate": 1.6576405983715208e-05, + "loss": 0.1196, + "step": 9040 + }, + { + "epoch": 0.8568452944518083, + "grad_norm": 0.2683272361755371, + "learning_rate": 1.6572618822192768e-05, + "loss": 0.1193, + "step": 9050 + }, + { + "epoch": 0.8577920848324181, + "grad_norm": 0.3049313724040985, + "learning_rate": 1.6568831660670328e-05, + "loss": 0.1079, + "step": 9060 + }, + { + "epoch": 0.8587388752130278, + "grad_norm": 0.31613585352897644, + "learning_rate": 1.656504449914789e-05, + "loss": 0.1108, + "step": 9070 + }, + { + "epoch": 0.8596856655936376, + "grad_norm": 0.34152960777282715, + "learning_rate": 1.656125733762545e-05, + "loss": 0.113, + "step": 9080 + }, + { + "epoch": 0.8606324559742473, + "grad_norm": 0.3691774010658264, + "learning_rate": 1.655747017610301e-05, + "loss": 0.1079, + "step": 9090 + }, + { + "epoch": 0.861579246354857, + "grad_norm": 0.3918764591217041, + "learning_rate": 1.655368301458057e-05, + "loss": 0.1142, + "step": 9100 + }, + { + "epoch": 0.8625260367354668, + "grad_norm": 0.3079935610294342, + "learning_rate": 1.654989585305813e-05, + "loss": 0.114, + "step": 9110 + }, + { + "epoch": 0.8634728271160765, + "grad_norm": 0.31950023770332336, + "learning_rate": 1.6546108691535695e-05, + "loss": 0.103, + "step": 9120 + }, + { + "epoch": 0.8644196174966863, + "grad_norm": 0.2867056429386139, + "learning_rate": 1.6542321530013255e-05, + "loss": 0.1095, + "step": 9130 + }, + { + "epoch": 0.865366407877296, + "grad_norm": 0.4084359109401703, + "learning_rate": 1.653853436849082e-05, + "loss": 0.1185, + "step": 9140 + }, + { + "epoch": 0.8663131982579056, + "grad_norm": 0.3682761490345001, + "learning_rate": 1.653474720696838e-05, + "loss": 0.1198, + "step": 9150 + }, + { + "epoch": 0.8672599886385154, + "grad_norm": 0.3352651596069336, + "learning_rate": 1.6530960045445942e-05, + "loss": 0.1118, + "step": 9160 + }, + { + "epoch": 0.8682067790191251, + "grad_norm": 0.300908625125885, + "learning_rate": 1.6527172883923502e-05, + "loss": 0.1216, + "step": 9170 + }, + { + "epoch": 0.8691535693997349, + "grad_norm": 0.2699333131313324, + "learning_rate": 1.6523385722401062e-05, + "loss": 0.1061, + "step": 9180 + }, + { + "epoch": 0.8701003597803446, + "grad_norm": 0.34169745445251465, + "learning_rate": 1.6519598560878622e-05, + "loss": 0.1124, + "step": 9190 + }, + { + "epoch": 0.8710471501609544, + "grad_norm": 0.29521581530570984, + "learning_rate": 1.6515811399356182e-05, + "loss": 0.1248, + "step": 9200 + }, + { + "epoch": 0.8719939405415641, + "grad_norm": 0.2916036546230316, + "learning_rate": 1.6512024237833746e-05, + "loss": 0.1189, + "step": 9210 + }, + { + "epoch": 0.8729407309221738, + "grad_norm": 0.31541579961776733, + "learning_rate": 1.6508237076311306e-05, + "loss": 0.1112, + "step": 9220 + }, + { + "epoch": 0.8738875213027836, + "grad_norm": 0.9175082445144653, + "learning_rate": 1.6504449914788866e-05, + "loss": 0.1219, + "step": 9230 + }, + { + "epoch": 0.8748343116833933, + "grad_norm": 0.3295627236366272, + "learning_rate": 1.650066275326643e-05, + "loss": 0.1114, + "step": 9240 + }, + { + "epoch": 0.8757811020640031, + "grad_norm": 0.3011680245399475, + "learning_rate": 1.649687559174399e-05, + "loss": 0.1095, + "step": 9250 + }, + { + "epoch": 0.8767278924446128, + "grad_norm": 0.28221869468688965, + "learning_rate": 1.649308843022155e-05, + "loss": 0.1134, + "step": 9260 + }, + { + "epoch": 0.8776746828252225, + "grad_norm": 0.2599260210990906, + "learning_rate": 1.6489301268699113e-05, + "loss": 0.1154, + "step": 9270 + }, + { + "epoch": 0.8786214732058323, + "grad_norm": 0.3351801931858063, + "learning_rate": 1.6485514107176673e-05, + "loss": 0.1115, + "step": 9280 + }, + { + "epoch": 0.8795682635864419, + "grad_norm": 0.256975382566452, + "learning_rate": 1.6481726945654233e-05, + "loss": 0.1101, + "step": 9290 + }, + { + "epoch": 0.8805150539670517, + "grad_norm": 0.33674198389053345, + "learning_rate": 1.6477939784131796e-05, + "loss": 0.1193, + "step": 9300 + }, + { + "epoch": 0.8814618443476614, + "grad_norm": 0.3078038692474365, + "learning_rate": 1.6474152622609356e-05, + "loss": 0.1195, + "step": 9310 + }, + { + "epoch": 0.8824086347282711, + "grad_norm": 0.34881317615509033, + "learning_rate": 1.6470365461086916e-05, + "loss": 0.1133, + "step": 9320 + }, + { + "epoch": 0.8833554251088809, + "grad_norm": 0.32375583052635193, + "learning_rate": 1.6466578299564476e-05, + "loss": 0.1074, + "step": 9330 + }, + { + "epoch": 0.8843022154894906, + "grad_norm": 0.40294790267944336, + "learning_rate": 1.6462791138042037e-05, + "loss": 0.1231, + "step": 9340 + }, + { + "epoch": 0.8852490058701004, + "grad_norm": 0.3399127721786499, + "learning_rate": 1.64590039765196e-05, + "loss": 0.1075, + "step": 9350 + }, + { + "epoch": 0.8861957962507101, + "grad_norm": 0.4025484323501587, + "learning_rate": 1.645521681499716e-05, + "loss": 0.1187, + "step": 9360 + }, + { + "epoch": 0.8871425866313198, + "grad_norm": 0.30942675471305847, + "learning_rate": 1.6451429653474723e-05, + "loss": 0.1056, + "step": 9370 + }, + { + "epoch": 0.8880893770119296, + "grad_norm": 0.28850677609443665, + "learning_rate": 1.6447642491952284e-05, + "loss": 0.1099, + "step": 9380 + }, + { + "epoch": 0.8890361673925393, + "grad_norm": 0.4062662422657013, + "learning_rate": 1.6443855330429844e-05, + "loss": 0.1115, + "step": 9390 + }, + { + "epoch": 0.8899829577731491, + "grad_norm": 0.28927358984947205, + "learning_rate": 1.6440068168907407e-05, + "loss": 0.1236, + "step": 9400 + }, + { + "epoch": 0.8909297481537588, + "grad_norm": 0.3105138838291168, + "learning_rate": 1.6436281007384967e-05, + "loss": 0.1183, + "step": 9410 + }, + { + "epoch": 0.8918765385343685, + "grad_norm": 0.3197774589061737, + "learning_rate": 1.6432493845862527e-05, + "loss": 0.1086, + "step": 9420 + }, + { + "epoch": 0.8928233289149782, + "grad_norm": 0.32746583223342896, + "learning_rate": 1.6428706684340087e-05, + "loss": 0.1059, + "step": 9430 + }, + { + "epoch": 0.8937701192955879, + "grad_norm": 0.37387615442276, + "learning_rate": 1.642491952281765e-05, + "loss": 0.113, + "step": 9440 + }, + { + "epoch": 0.8947169096761977, + "grad_norm": 0.2911737263202667, + "learning_rate": 1.642113236129521e-05, + "loss": 0.1192, + "step": 9450 + }, + { + "epoch": 0.8956637000568074, + "grad_norm": 0.25878214836120605, + "learning_rate": 1.641734519977277e-05, + "loss": 0.112, + "step": 9460 + }, + { + "epoch": 0.8966104904374171, + "grad_norm": 0.3366353511810303, + "learning_rate": 1.641355803825033e-05, + "loss": 0.1107, + "step": 9470 + }, + { + "epoch": 0.8975572808180269, + "grad_norm": 0.3223920166492462, + "learning_rate": 1.6409770876727894e-05, + "loss": 0.117, + "step": 9480 + }, + { + "epoch": 0.8985040711986366, + "grad_norm": 0.32262280583381653, + "learning_rate": 1.6405983715205454e-05, + "loss": 0.1127, + "step": 9490 + }, + { + "epoch": 0.8994508615792464, + "grad_norm": 0.344641774892807, + "learning_rate": 1.6402196553683018e-05, + "loss": 0.1107, + "step": 9500 + }, + { + "epoch": 0.9003976519598561, + "grad_norm": 0.30609458684921265, + "learning_rate": 1.6398409392160578e-05, + "loss": 0.1189, + "step": 9510 + }, + { + "epoch": 0.9013444423404658, + "grad_norm": 0.27467280626296997, + "learning_rate": 1.6394622230638138e-05, + "loss": 0.1277, + "step": 9520 + }, + { + "epoch": 0.9022912327210756, + "grad_norm": 0.31600040197372437, + "learning_rate": 1.6390835069115698e-05, + "loss": 0.1095, + "step": 9530 + }, + { + "epoch": 0.9032380231016853, + "grad_norm": 0.2679688036441803, + "learning_rate": 1.638704790759326e-05, + "loss": 0.1054, + "step": 9540 + }, + { + "epoch": 0.9041848134822951, + "grad_norm": 0.3445126414299011, + "learning_rate": 1.638326074607082e-05, + "loss": 0.1128, + "step": 9550 + }, + { + "epoch": 0.9051316038629047, + "grad_norm": 0.3073931634426117, + "learning_rate": 1.637947358454838e-05, + "loss": 0.116, + "step": 9560 + }, + { + "epoch": 0.9060783942435144, + "grad_norm": 0.36402496695518494, + "learning_rate": 1.637568642302594e-05, + "loss": 0.1106, + "step": 9570 + }, + { + "epoch": 0.9070251846241242, + "grad_norm": 0.34014105796813965, + "learning_rate": 1.6371899261503505e-05, + "loss": 0.1166, + "step": 9580 + }, + { + "epoch": 0.9079719750047339, + "grad_norm": 0.3127152919769287, + "learning_rate": 1.6368112099981065e-05, + "loss": 0.1146, + "step": 9590 + }, + { + "epoch": 0.9089187653853437, + "grad_norm": 0.252822607755661, + "learning_rate": 1.636432493845863e-05, + "loss": 0.1127, + "step": 9600 + }, + { + "epoch": 0.9098655557659534, + "grad_norm": 0.2900446653366089, + "learning_rate": 1.636053777693619e-05, + "loss": 0.1093, + "step": 9610 + }, + { + "epoch": 0.9108123461465631, + "grad_norm": 0.2973507046699524, + "learning_rate": 1.635675061541375e-05, + "loss": 0.1063, + "step": 9620 + }, + { + "epoch": 0.9117591365271729, + "grad_norm": 0.2629620134830475, + "learning_rate": 1.6352963453891312e-05, + "loss": 0.1135, + "step": 9630 + }, + { + "epoch": 0.9127059269077826, + "grad_norm": 0.2994817793369293, + "learning_rate": 1.6349176292368872e-05, + "loss": 0.118, + "step": 9640 + }, + { + "epoch": 0.9136527172883924, + "grad_norm": 0.39161399006843567, + "learning_rate": 1.6345389130846432e-05, + "loss": 0.1134, + "step": 9650 + }, + { + "epoch": 0.9145995076690021, + "grad_norm": 0.3175990581512451, + "learning_rate": 1.6341601969323992e-05, + "loss": 0.1145, + "step": 9660 + }, + { + "epoch": 0.9155462980496119, + "grad_norm": 0.31118014454841614, + "learning_rate": 1.6337814807801552e-05, + "loss": 0.1245, + "step": 9670 + }, + { + "epoch": 0.9164930884302216, + "grad_norm": 0.29309409856796265, + "learning_rate": 1.6334027646279115e-05, + "loss": 0.1103, + "step": 9680 + }, + { + "epoch": 0.9174398788108313, + "grad_norm": 0.2814522385597229, + "learning_rate": 1.6330240484756676e-05, + "loss": 0.1055, + "step": 9690 + }, + { + "epoch": 0.918386669191441, + "grad_norm": 0.2632008492946625, + "learning_rate": 1.6326453323234236e-05, + "loss": 0.0977, + "step": 9700 + }, + { + "epoch": 0.9193334595720507, + "grad_norm": 0.3142138123512268, + "learning_rate": 1.63226661617118e-05, + "loss": 0.1192, + "step": 9710 + }, + { + "epoch": 0.9202802499526604, + "grad_norm": 0.29596778750419617, + "learning_rate": 1.631887900018936e-05, + "loss": 0.1105, + "step": 9720 + }, + { + "epoch": 0.9212270403332702, + "grad_norm": 0.31886908411979675, + "learning_rate": 1.6315091838666923e-05, + "loss": 0.1108, + "step": 9730 + }, + { + "epoch": 0.9221738307138799, + "grad_norm": 0.32292440533638, + "learning_rate": 1.6311304677144483e-05, + "loss": 0.122, + "step": 9740 + }, + { + "epoch": 0.9231206210944897, + "grad_norm": 0.34209519624710083, + "learning_rate": 1.6307517515622043e-05, + "loss": 0.1117, + "step": 9750 + }, + { + "epoch": 0.9240674114750994, + "grad_norm": 0.40027058124542236, + "learning_rate": 1.6303730354099603e-05, + "loss": 0.1177, + "step": 9760 + }, + { + "epoch": 0.9250142018557092, + "grad_norm": 0.3353174924850464, + "learning_rate": 1.6299943192577166e-05, + "loss": 0.1147, + "step": 9770 + }, + { + "epoch": 0.9259609922363189, + "grad_norm": 0.31666451692581177, + "learning_rate": 1.6296156031054726e-05, + "loss": 0.1182, + "step": 9780 + }, + { + "epoch": 0.9269077826169286, + "grad_norm": 0.33596381545066833, + "learning_rate": 1.6292368869532286e-05, + "loss": 0.126, + "step": 9790 + }, + { + "epoch": 0.9278545729975384, + "grad_norm": 0.30454882979393005, + "learning_rate": 1.6288581708009846e-05, + "loss": 0.1176, + "step": 9800 + }, + { + "epoch": 0.9288013633781481, + "grad_norm": 0.3319470286369324, + "learning_rate": 1.6284794546487406e-05, + "loss": 0.117, + "step": 9810 + }, + { + "epoch": 0.9297481537587579, + "grad_norm": 0.30625036358833313, + "learning_rate": 1.628100738496497e-05, + "loss": 0.1104, + "step": 9820 + }, + { + "epoch": 0.9306949441393676, + "grad_norm": 0.2689434885978699, + "learning_rate": 1.627722022344253e-05, + "loss": 0.1075, + "step": 9830 + }, + { + "epoch": 0.9316417345199772, + "grad_norm": 0.34801292419433594, + "learning_rate": 1.6273433061920093e-05, + "loss": 0.1101, + "step": 9840 + }, + { + "epoch": 0.932588524900587, + "grad_norm": 0.379484087228775, + "learning_rate": 1.6269645900397653e-05, + "loss": 0.1106, + "step": 9850 + }, + { + "epoch": 0.9335353152811967, + "grad_norm": 0.24420376121997833, + "learning_rate": 1.6265858738875217e-05, + "loss": 0.1034, + "step": 9860 + }, + { + "epoch": 0.9344821056618065, + "grad_norm": 0.3154924213886261, + "learning_rate": 1.6262071577352777e-05, + "loss": 0.1125, + "step": 9870 + }, + { + "epoch": 0.9354288960424162, + "grad_norm": 0.3439418375492096, + "learning_rate": 1.6258284415830337e-05, + "loss": 0.1131, + "step": 9880 + }, + { + "epoch": 0.9363756864230259, + "grad_norm": 0.3413674533367157, + "learning_rate": 1.6254497254307897e-05, + "loss": 0.1227, + "step": 9890 + }, + { + "epoch": 0.9373224768036357, + "grad_norm": 0.3342759311199188, + "learning_rate": 1.6250710092785457e-05, + "loss": 0.1107, + "step": 9900 + }, + { + "epoch": 0.9382692671842454, + "grad_norm": 0.28516536951065063, + "learning_rate": 1.624692293126302e-05, + "loss": 0.1141, + "step": 9910 + }, + { + "epoch": 0.9392160575648552, + "grad_norm": 0.33588477969169617, + "learning_rate": 1.624313576974058e-05, + "loss": 0.1111, + "step": 9920 + }, + { + "epoch": 0.9401628479454649, + "grad_norm": 0.23833739757537842, + "learning_rate": 1.623934860821814e-05, + "loss": 0.1101, + "step": 9930 + }, + { + "epoch": 0.9411096383260746, + "grad_norm": 0.2724970579147339, + "learning_rate": 1.6235561446695704e-05, + "loss": 0.1149, + "step": 9940 + }, + { + "epoch": 0.9420564287066844, + "grad_norm": 0.3115902543067932, + "learning_rate": 1.6231774285173264e-05, + "loss": 0.1131, + "step": 9950 + }, + { + "epoch": 0.9430032190872941, + "grad_norm": 0.2824494242668152, + "learning_rate": 1.6227987123650827e-05, + "loss": 0.1086, + "step": 9960 + }, + { + "epoch": 0.9439500094679039, + "grad_norm": 0.2815304398536682, + "learning_rate": 1.6224199962128387e-05, + "loss": 0.1151, + "step": 9970 + }, + { + "epoch": 0.9448967998485135, + "grad_norm": 0.24285712838172913, + "learning_rate": 1.6220412800605947e-05, + "loss": 0.1058, + "step": 9980 + }, + { + "epoch": 0.9458435902291232, + "grad_norm": 0.35544589161872864, + "learning_rate": 1.6216625639083508e-05, + "loss": 0.1113, + "step": 9990 + }, + { + "epoch": 0.946790380609733, + "grad_norm": 0.3292807638645172, + "learning_rate": 1.621283847756107e-05, + "loss": 0.13, + "step": 10000 + }, + { + "epoch": 0.9477371709903427, + "grad_norm": 0.3442451059818268, + "learning_rate": 1.620905131603863e-05, + "loss": 0.1217, + "step": 10010 + }, + { + "epoch": 0.9486839613709525, + "grad_norm": 0.2950438857078552, + "learning_rate": 1.620526415451619e-05, + "loss": 0.1069, + "step": 10020 + }, + { + "epoch": 0.9496307517515622, + "grad_norm": 0.29165175557136536, + "learning_rate": 1.620147699299375e-05, + "loss": 0.111, + "step": 10030 + }, + { + "epoch": 0.9505775421321719, + "grad_norm": 0.3380615711212158, + "learning_rate": 1.619768983147131e-05, + "loss": 0.1299, + "step": 10040 + }, + { + "epoch": 0.9515243325127817, + "grad_norm": 0.28867748379707336, + "learning_rate": 1.6193902669948875e-05, + "loss": 0.1143, + "step": 10050 + }, + { + "epoch": 0.9524711228933914, + "grad_norm": 0.3240725100040436, + "learning_rate": 1.6190115508426435e-05, + "loss": 0.1108, + "step": 10060 + }, + { + "epoch": 0.9534179132740012, + "grad_norm": 0.2736086845397949, + "learning_rate": 1.6186328346903998e-05, + "loss": 0.1127, + "step": 10070 + }, + { + "epoch": 0.9543647036546109, + "grad_norm": 0.3198991119861603, + "learning_rate": 1.6182541185381558e-05, + "loss": 0.1077, + "step": 10080 + }, + { + "epoch": 0.9553114940352206, + "grad_norm": 0.32814446091651917, + "learning_rate": 1.6178754023859118e-05, + "loss": 0.1157, + "step": 10090 + }, + { + "epoch": 0.9562582844158304, + "grad_norm": 0.3374975919723511, + "learning_rate": 1.617496686233668e-05, + "loss": 0.1134, + "step": 10100 + }, + { + "epoch": 0.95720507479644, + "grad_norm": 0.24537509679794312, + "learning_rate": 1.617117970081424e-05, + "loss": 0.1169, + "step": 10110 + }, + { + "epoch": 0.9581518651770498, + "grad_norm": 0.29897022247314453, + "learning_rate": 1.6167392539291802e-05, + "loss": 0.1153, + "step": 10120 + }, + { + "epoch": 0.9590986555576595, + "grad_norm": 0.3204127550125122, + "learning_rate": 1.6163605377769362e-05, + "loss": 0.1089, + "step": 10130 + }, + { + "epoch": 0.9600454459382692, + "grad_norm": 0.2863658666610718, + "learning_rate": 1.6159818216246925e-05, + "loss": 0.121, + "step": 10140 + }, + { + "epoch": 0.960992236318879, + "grad_norm": 0.3890479803085327, + "learning_rate": 1.6156031054724485e-05, + "loss": 0.1148, + "step": 10150 + }, + { + "epoch": 0.9619390266994887, + "grad_norm": 0.3382028341293335, + "learning_rate": 1.6152243893202045e-05, + "loss": 0.1195, + "step": 10160 + }, + { + "epoch": 0.9628858170800985, + "grad_norm": 0.27825555205345154, + "learning_rate": 1.6148456731679605e-05, + "loss": 0.1097, + "step": 10170 + }, + { + "epoch": 0.9638326074607082, + "grad_norm": 0.292385995388031, + "learning_rate": 1.614466957015717e-05, + "loss": 0.1158, + "step": 10180 + }, + { + "epoch": 0.9647793978413179, + "grad_norm": 0.3470708429813385, + "learning_rate": 1.614088240863473e-05, + "loss": 0.1291, + "step": 10190 + }, + { + "epoch": 0.9657261882219277, + "grad_norm": 0.34543946385383606, + "learning_rate": 1.6137095247112292e-05, + "loss": 0.122, + "step": 10200 + }, + { + "epoch": 0.9666729786025374, + "grad_norm": 0.2738993465900421, + "learning_rate": 1.6133308085589852e-05, + "loss": 0.1215, + "step": 10210 + }, + { + "epoch": 0.9676197689831472, + "grad_norm": 0.28177791833877563, + "learning_rate": 1.6129520924067412e-05, + "loss": 0.1255, + "step": 10220 + }, + { + "epoch": 0.9685665593637569, + "grad_norm": 0.2852981984615326, + "learning_rate": 1.6125733762544972e-05, + "loss": 0.1039, + "step": 10230 + }, + { + "epoch": 0.9695133497443666, + "grad_norm": 0.3322448432445526, + "learning_rate": 1.6121946601022536e-05, + "loss": 0.1114, + "step": 10240 + }, + { + "epoch": 0.9704601401249763, + "grad_norm": 0.3602607548236847, + "learning_rate": 1.6118159439500096e-05, + "loss": 0.1167, + "step": 10250 + }, + { + "epoch": 0.971406930505586, + "grad_norm": 0.33711227774620056, + "learning_rate": 1.6114372277977656e-05, + "loss": 0.1051, + "step": 10260 + }, + { + "epoch": 0.9723537208861958, + "grad_norm": 0.39165088534355164, + "learning_rate": 1.6110585116455216e-05, + "loss": 0.1157, + "step": 10270 + }, + { + "epoch": 0.9733005112668055, + "grad_norm": 0.32423534989356995, + "learning_rate": 1.610679795493278e-05, + "loss": 0.1048, + "step": 10280 + }, + { + "epoch": 0.9742473016474152, + "grad_norm": 0.31623855233192444, + "learning_rate": 1.610301079341034e-05, + "loss": 0.1123, + "step": 10290 + }, + { + "epoch": 0.975194092028025, + "grad_norm": 0.3637257516384125, + "learning_rate": 1.6099223631887903e-05, + "loss": 0.1087, + "step": 10300 + }, + { + "epoch": 0.9761408824086347, + "grad_norm": 0.5228608846664429, + "learning_rate": 1.6095436470365463e-05, + "loss": 0.1167, + "step": 10310 + }, + { + "epoch": 0.9770876727892445, + "grad_norm": 0.3268279731273651, + "learning_rate": 1.6091649308843023e-05, + "loss": 0.1231, + "step": 10320 + }, + { + "epoch": 0.9780344631698542, + "grad_norm": 0.2953413426876068, + "learning_rate": 1.6087862147320586e-05, + "loss": 0.1084, + "step": 10330 + }, + { + "epoch": 0.978981253550464, + "grad_norm": 0.26272329688072205, + "learning_rate": 1.6084074985798147e-05, + "loss": 0.109, + "step": 10340 + }, + { + "epoch": 0.9799280439310737, + "grad_norm": 0.28040027618408203, + "learning_rate": 1.6080287824275707e-05, + "loss": 0.1086, + "step": 10350 + }, + { + "epoch": 0.9808748343116834, + "grad_norm": 0.28157317638397217, + "learning_rate": 1.6076500662753267e-05, + "loss": 0.1056, + "step": 10360 + }, + { + "epoch": 0.9818216246922932, + "grad_norm": 0.2670665979385376, + "learning_rate": 1.6072713501230827e-05, + "loss": 0.116, + "step": 10370 + }, + { + "epoch": 0.9827684150729029, + "grad_norm": 0.36410754919052124, + "learning_rate": 1.606892633970839e-05, + "loss": 0.1142, + "step": 10380 + }, + { + "epoch": 0.9837152054535125, + "grad_norm": 0.32198601961135864, + "learning_rate": 1.606513917818595e-05, + "loss": 0.1098, + "step": 10390 + }, + { + "epoch": 0.9846619958341223, + "grad_norm": 0.3849109709262848, + "learning_rate": 1.606135201666351e-05, + "loss": 0.1116, + "step": 10400 + }, + { + "epoch": 0.985608786214732, + "grad_norm": 0.344265878200531, + "learning_rate": 1.6057564855141074e-05, + "loss": 0.1124, + "step": 10410 + }, + { + "epoch": 0.9865555765953418, + "grad_norm": 0.33011743426322937, + "learning_rate": 1.6053777693618634e-05, + "loss": 0.1032, + "step": 10420 + }, + { + "epoch": 0.9875023669759515, + "grad_norm": 0.4829758405685425, + "learning_rate": 1.6049990532096197e-05, + "loss": 0.1041, + "step": 10430 + }, + { + "epoch": 0.9884491573565612, + "grad_norm": 0.3808184862136841, + "learning_rate": 1.6046203370573757e-05, + "loss": 0.1116, + "step": 10440 + }, + { + "epoch": 0.989395947737171, + "grad_norm": 0.31177645921707153, + "learning_rate": 1.6042416209051317e-05, + "loss": 0.1195, + "step": 10450 + }, + { + "epoch": 0.9903427381177807, + "grad_norm": 0.30874019861221313, + "learning_rate": 1.6038629047528877e-05, + "loss": 0.1122, + "step": 10460 + }, + { + "epoch": 0.9912895284983905, + "grad_norm": 0.286725789308548, + "learning_rate": 1.603484188600644e-05, + "loss": 0.1127, + "step": 10470 + }, + { + "epoch": 0.9922363188790002, + "grad_norm": 0.30968090891838074, + "learning_rate": 1.6031054724484e-05, + "loss": 0.1119, + "step": 10480 + }, + { + "epoch": 0.99318310925961, + "grad_norm": 0.29032912850379944, + "learning_rate": 1.602726756296156e-05, + "loss": 0.1139, + "step": 10490 + }, + { + "epoch": 0.9941298996402197, + "grad_norm": 0.3562324643135071, + "learning_rate": 1.602348040143912e-05, + "loss": 0.1163, + "step": 10500 + }, + { + "epoch": 0.9950766900208294, + "grad_norm": 0.28674376010894775, + "learning_rate": 1.6019693239916684e-05, + "loss": 0.1055, + "step": 10510 + }, + { + "epoch": 0.9960234804014392, + "grad_norm": 0.2935590147972107, + "learning_rate": 1.6015906078394244e-05, + "loss": 0.1043, + "step": 10520 + }, + { + "epoch": 0.9969702707820488, + "grad_norm": 0.3492721617221832, + "learning_rate": 1.6012118916871804e-05, + "loss": 0.1146, + "step": 10530 + }, + { + "epoch": 0.9979170611626585, + "grad_norm": 0.30205583572387695, + "learning_rate": 1.6008331755349368e-05, + "loss": 0.107, + "step": 10540 + }, + { + "epoch": 0.9988638515432683, + "grad_norm": 0.3231695294380188, + "learning_rate": 1.6004544593826928e-05, + "loss": 0.1152, + "step": 10550 + }, + { + "epoch": 0.999810641923878, + "grad_norm": 0.31855645775794983, + "learning_rate": 1.600075743230449e-05, + "loss": 0.1111, + "step": 10560 + }, + { + "epoch": 1.0, + "eval_f1_micro": 0.3363452934467936, + "eval_loss": 0.11279743909835815, + "eval_precision": 0.6191527157237444, + "eval_recall": 0.23088503924363168, + "eval_runtime": 345.408, + "eval_samples_per_second": 122.305, + "eval_steps_per_second": 7.646, + "step": 10562 + }, + { + "epoch": 1.0007574323044879, + "grad_norm": 0.3501763641834259, + "learning_rate": 1.599697027078205e-05, + "loss": 0.1054, + "step": 10570 + }, + { + "epoch": 1.0017042226850976, + "grad_norm": 0.2885648012161255, + "learning_rate": 1.599318310925961e-05, + "loss": 0.1037, + "step": 10580 + }, + { + "epoch": 1.0026510130657071, + "grad_norm": 0.24593491852283478, + "learning_rate": 1.598939594773717e-05, + "loss": 0.0999, + "step": 10590 + }, + { + "epoch": 1.0035978034463169, + "grad_norm": 0.2742719054222107, + "learning_rate": 1.598560878621473e-05, + "loss": 0.1129, + "step": 10600 + }, + { + "epoch": 1.0045445938269266, + "grad_norm": 0.221815824508667, + "learning_rate": 1.5981821624692295e-05, + "loss": 0.1023, + "step": 10610 + }, + { + "epoch": 1.0054913842075364, + "grad_norm": 0.2318876087665558, + "learning_rate": 1.5978034463169855e-05, + "loss": 0.1098, + "step": 10620 + }, + { + "epoch": 1.006438174588146, + "grad_norm": 0.25267088413238525, + "learning_rate": 1.5974247301647415e-05, + "loss": 0.1091, + "step": 10630 + }, + { + "epoch": 1.0073849649687558, + "grad_norm": 0.3592093884944916, + "learning_rate": 1.597046014012498e-05, + "loss": 0.1121, + "step": 10640 + }, + { + "epoch": 1.0083317553493656, + "grad_norm": 0.2729831337928772, + "learning_rate": 1.596667297860254e-05, + "loss": 0.1002, + "step": 10650 + }, + { + "epoch": 1.0092785457299753, + "grad_norm": 0.24247096478939056, + "learning_rate": 1.5962885817080102e-05, + "loss": 0.1015, + "step": 10660 + }, + { + "epoch": 1.010225336110585, + "grad_norm": 0.2638133466243744, + "learning_rate": 1.5959098655557662e-05, + "loss": 0.1011, + "step": 10670 + }, + { + "epoch": 1.0111721264911948, + "grad_norm": 0.377765029668808, + "learning_rate": 1.5955311494035222e-05, + "loss": 0.1101, + "step": 10680 + }, + { + "epoch": 1.0121189168718046, + "grad_norm": 0.3203154504299164, + "learning_rate": 1.5951524332512782e-05, + "loss": 0.1197, + "step": 10690 + }, + { + "epoch": 1.0130657072524143, + "grad_norm": 0.35998988151550293, + "learning_rate": 1.5947737170990346e-05, + "loss": 0.1005, + "step": 10700 + }, + { + "epoch": 1.014012497633024, + "grad_norm": 0.28926974534988403, + "learning_rate": 1.5943950009467906e-05, + "loss": 0.1066, + "step": 10710 + }, + { + "epoch": 1.0149592880136338, + "grad_norm": 0.3111324608325958, + "learning_rate": 1.5940162847945466e-05, + "loss": 0.1067, + "step": 10720 + }, + { + "epoch": 1.0159060783942435, + "grad_norm": 0.27141204476356506, + "learning_rate": 1.5936375686423026e-05, + "loss": 0.1054, + "step": 10730 + }, + { + "epoch": 1.0168528687748533, + "grad_norm": 0.3853040039539337, + "learning_rate": 1.5932588524900586e-05, + "loss": 0.113, + "step": 10740 + }, + { + "epoch": 1.017799659155463, + "grad_norm": 0.3320479094982147, + "learning_rate": 1.592880136337815e-05, + "loss": 0.1027, + "step": 10750 + }, + { + "epoch": 1.0187464495360727, + "grad_norm": 0.38209497928619385, + "learning_rate": 1.592501420185571e-05, + "loss": 0.1289, + "step": 10760 + }, + { + "epoch": 1.0196932399166825, + "grad_norm": 0.42024433612823486, + "learning_rate": 1.5921227040333273e-05, + "loss": 0.1168, + "step": 10770 + }, + { + "epoch": 1.0206400302972922, + "grad_norm": 0.2727624177932739, + "learning_rate": 1.5917439878810833e-05, + "loss": 0.1107, + "step": 10780 + }, + { + "epoch": 1.021586820677902, + "grad_norm": 0.39503511786460876, + "learning_rate": 1.5913652717288393e-05, + "loss": 0.1126, + "step": 10790 + }, + { + "epoch": 1.0225336110585117, + "grad_norm": 0.27807143330574036, + "learning_rate": 1.5909865555765956e-05, + "loss": 0.1132, + "step": 10800 + }, + { + "epoch": 1.0234804014391214, + "grad_norm": 0.3399743139743805, + "learning_rate": 1.5906078394243516e-05, + "loss": 0.1136, + "step": 10810 + }, + { + "epoch": 1.0244271918197312, + "grad_norm": 0.3346470296382904, + "learning_rate": 1.5902291232721076e-05, + "loss": 0.1144, + "step": 10820 + }, + { + "epoch": 1.025373982200341, + "grad_norm": 0.33610638976097107, + "learning_rate": 1.5898504071198636e-05, + "loss": 0.1088, + "step": 10830 + }, + { + "epoch": 1.0263207725809507, + "grad_norm": 0.33162784576416016, + "learning_rate": 1.58947169096762e-05, + "loss": 0.1026, + "step": 10840 + }, + { + "epoch": 1.0272675629615604, + "grad_norm": 0.3138902485370636, + "learning_rate": 1.589092974815376e-05, + "loss": 0.1161, + "step": 10850 + }, + { + "epoch": 1.0282143533421702, + "grad_norm": 0.3033609688282013, + "learning_rate": 1.588714258663132e-05, + "loss": 0.0992, + "step": 10860 + }, + { + "epoch": 1.0291611437227797, + "grad_norm": 0.3374888300895691, + "learning_rate": 1.5883355425108883e-05, + "loss": 0.122, + "step": 10870 + }, + { + "epoch": 1.0301079341033894, + "grad_norm": 0.294697642326355, + "learning_rate": 1.5879568263586443e-05, + "loss": 0.1126, + "step": 10880 + }, + { + "epoch": 1.0310547244839992, + "grad_norm": 0.3098808526992798, + "learning_rate": 1.5875781102064003e-05, + "loss": 0.1078, + "step": 10890 + }, + { + "epoch": 1.032001514864609, + "grad_norm": 0.23213276267051697, + "learning_rate": 1.5871993940541567e-05, + "loss": 0.1113, + "step": 10900 + }, + { + "epoch": 1.0329483052452186, + "grad_norm": 0.30921486020088196, + "learning_rate": 1.5868206779019127e-05, + "loss": 0.118, + "step": 10910 + }, + { + "epoch": 1.0338950956258284, + "grad_norm": 0.29103872179985046, + "learning_rate": 1.5864419617496687e-05, + "loss": 0.1108, + "step": 10920 + }, + { + "epoch": 1.0348418860064381, + "grad_norm": 0.34007346630096436, + "learning_rate": 1.5860632455974247e-05, + "loss": 0.1153, + "step": 10930 + }, + { + "epoch": 1.0357886763870479, + "grad_norm": 0.3389047682285309, + "learning_rate": 1.585684529445181e-05, + "loss": 0.1126, + "step": 10940 + }, + { + "epoch": 1.0367354667676576, + "grad_norm": 0.29568174481391907, + "learning_rate": 1.585305813292937e-05, + "loss": 0.1059, + "step": 10950 + }, + { + "epoch": 1.0376822571482673, + "grad_norm": 0.38442787528038025, + "learning_rate": 1.584927097140693e-05, + "loss": 0.1051, + "step": 10960 + }, + { + "epoch": 1.038629047528877, + "grad_norm": 0.3523216247558594, + "learning_rate": 1.584548380988449e-05, + "loss": 0.1061, + "step": 10970 + }, + { + "epoch": 1.0395758379094868, + "grad_norm": 0.3302572965621948, + "learning_rate": 1.5841696648362054e-05, + "loss": 0.1105, + "step": 10980 + }, + { + "epoch": 1.0405226282900966, + "grad_norm": 0.32812613248825073, + "learning_rate": 1.5837909486839614e-05, + "loss": 0.1081, + "step": 10990 + }, + { + "epoch": 1.0414694186707063, + "grad_norm": 0.33464688062667847, + "learning_rate": 1.5834122325317178e-05, + "loss": 0.1139, + "step": 11000 + }, + { + "epoch": 1.042416209051316, + "grad_norm": 0.29011818766593933, + "learning_rate": 1.5830335163794738e-05, + "loss": 0.1016, + "step": 11010 + }, + { + "epoch": 1.0433629994319258, + "grad_norm": 0.26706334948539734, + "learning_rate": 1.5826548002272298e-05, + "loss": 0.1128, + "step": 11020 + }, + { + "epoch": 1.0443097898125355, + "grad_norm": 0.3417057991027832, + "learning_rate": 1.582276084074986e-05, + "loss": 0.1046, + "step": 11030 + }, + { + "epoch": 1.0452565801931453, + "grad_norm": 0.4278360903263092, + "learning_rate": 1.581897367922742e-05, + "loss": 0.1078, + "step": 11040 + }, + { + "epoch": 1.046203370573755, + "grad_norm": 0.34714770317077637, + "learning_rate": 1.581518651770498e-05, + "loss": 0.1127, + "step": 11050 + }, + { + "epoch": 1.0471501609543648, + "grad_norm": 0.2509910464286804, + "learning_rate": 1.581139935618254e-05, + "loss": 0.1104, + "step": 11060 + }, + { + "epoch": 1.0480969513349745, + "grad_norm": 0.27627503871917725, + "learning_rate": 1.58076121946601e-05, + "loss": 0.1139, + "step": 11070 + }, + { + "epoch": 1.0490437417155842, + "grad_norm": 0.3637837767601013, + "learning_rate": 1.5803825033137665e-05, + "loss": 0.1117, + "step": 11080 + }, + { + "epoch": 1.049990532096194, + "grad_norm": 0.22421805560588837, + "learning_rate": 1.5800037871615225e-05, + "loss": 0.0998, + "step": 11090 + }, + { + "epoch": 1.0509373224768037, + "grad_norm": 0.5345719456672668, + "learning_rate": 1.5796250710092785e-05, + "loss": 0.1088, + "step": 11100 + }, + { + "epoch": 1.0518841128574135, + "grad_norm": 0.29915082454681396, + "learning_rate": 1.5792463548570348e-05, + "loss": 0.1158, + "step": 11110 + }, + { + "epoch": 1.0528309032380232, + "grad_norm": 0.2868350148200989, + "learning_rate": 1.578867638704791e-05, + "loss": 0.1053, + "step": 11120 + }, + { + "epoch": 1.053777693618633, + "grad_norm": 0.3004460632801056, + "learning_rate": 1.5784889225525472e-05, + "loss": 0.1086, + "step": 11130 + }, + { + "epoch": 1.0547244839992427, + "grad_norm": 0.37865251302719116, + "learning_rate": 1.5781102064003032e-05, + "loss": 0.1094, + "step": 11140 + }, + { + "epoch": 1.0556712743798522, + "grad_norm": 0.37065449357032776, + "learning_rate": 1.5777314902480592e-05, + "loss": 0.1105, + "step": 11150 + }, + { + "epoch": 1.056618064760462, + "grad_norm": 0.3341347575187683, + "learning_rate": 1.5773527740958152e-05, + "loss": 0.1, + "step": 11160 + }, + { + "epoch": 1.0575648551410717, + "grad_norm": 0.34143874049186707, + "learning_rate": 1.5769740579435715e-05, + "loss": 0.104, + "step": 11170 + }, + { + "epoch": 1.0585116455216814, + "grad_norm": 0.39380621910095215, + "learning_rate": 1.5765953417913275e-05, + "loss": 0.1102, + "step": 11180 + }, + { + "epoch": 1.0594584359022912, + "grad_norm": 0.2955397367477417, + "learning_rate": 1.5762166256390835e-05, + "loss": 0.1201, + "step": 11190 + }, + { + "epoch": 1.060405226282901, + "grad_norm": 0.33224380016326904, + "learning_rate": 1.5758379094868395e-05, + "loss": 0.1158, + "step": 11200 + }, + { + "epoch": 1.0613520166635106, + "grad_norm": 0.3783450722694397, + "learning_rate": 1.575459193334596e-05, + "loss": 0.1129, + "step": 11210 + }, + { + "epoch": 1.0622988070441204, + "grad_norm": 0.3262997567653656, + "learning_rate": 1.575080477182352e-05, + "loss": 0.1029, + "step": 11220 + }, + { + "epoch": 1.0632455974247301, + "grad_norm": 0.38155102729797363, + "learning_rate": 1.5747017610301082e-05, + "loss": 0.1021, + "step": 11230 + }, + { + "epoch": 1.0641923878053399, + "grad_norm": 0.3473179042339325, + "learning_rate": 1.5743230448778642e-05, + "loss": 0.1022, + "step": 11240 + }, + { + "epoch": 1.0651391781859496, + "grad_norm": 0.3675661087036133, + "learning_rate": 1.5739443287256203e-05, + "loss": 0.1015, + "step": 11250 + }, + { + "epoch": 1.0660859685665593, + "grad_norm": 0.3818339705467224, + "learning_rate": 1.5735656125733766e-05, + "loss": 0.1092, + "step": 11260 + }, + { + "epoch": 1.067032758947169, + "grad_norm": 0.321243554353714, + "learning_rate": 1.5731868964211326e-05, + "loss": 0.1006, + "step": 11270 + }, + { + "epoch": 1.0679795493277788, + "grad_norm": 0.42666223645210266, + "learning_rate": 1.5728081802688886e-05, + "loss": 0.1087, + "step": 11280 + }, + { + "epoch": 1.0689263397083886, + "grad_norm": 0.42080941796302795, + "learning_rate": 1.5724294641166446e-05, + "loss": 0.1068, + "step": 11290 + }, + { + "epoch": 1.0698731300889983, + "grad_norm": 0.3022630512714386, + "learning_rate": 1.5720507479644006e-05, + "loss": 0.104, + "step": 11300 + }, + { + "epoch": 1.070819920469608, + "grad_norm": 0.45780861377716064, + "learning_rate": 1.571672031812157e-05, + "loss": 0.0998, + "step": 11310 + }, + { + "epoch": 1.0717667108502178, + "grad_norm": 0.33300885558128357, + "learning_rate": 1.571293315659913e-05, + "loss": 0.1107, + "step": 11320 + }, + { + "epoch": 1.0727135012308275, + "grad_norm": 0.3255704641342163, + "learning_rate": 1.570914599507669e-05, + "loss": 0.1127, + "step": 11330 + }, + { + "epoch": 1.0736602916114373, + "grad_norm": 0.3382815420627594, + "learning_rate": 1.5705358833554253e-05, + "loss": 0.1043, + "step": 11340 + }, + { + "epoch": 1.074607081992047, + "grad_norm": 0.27806100249290466, + "learning_rate": 1.5701571672031813e-05, + "loss": 0.1143, + "step": 11350 + }, + { + "epoch": 1.0755538723726568, + "grad_norm": 0.27662235498428345, + "learning_rate": 1.5697784510509377e-05, + "loss": 0.0919, + "step": 11360 + }, + { + "epoch": 1.0765006627532665, + "grad_norm": 0.32525408267974854, + "learning_rate": 1.5693997348986937e-05, + "loss": 0.1128, + "step": 11370 + }, + { + "epoch": 1.0774474531338762, + "grad_norm": 0.2928694188594818, + "learning_rate": 1.5690210187464497e-05, + "loss": 0.1114, + "step": 11380 + }, + { + "epoch": 1.078394243514486, + "grad_norm": 0.32543128728866577, + "learning_rate": 1.5686423025942057e-05, + "loss": 0.1113, + "step": 11390 + }, + { + "epoch": 1.0793410338950957, + "grad_norm": 0.3159891366958618, + "learning_rate": 1.568263586441962e-05, + "loss": 0.1111, + "step": 11400 + }, + { + "epoch": 1.0802878242757052, + "grad_norm": 0.6057949066162109, + "learning_rate": 1.567884870289718e-05, + "loss": 0.1102, + "step": 11410 + }, + { + "epoch": 1.081234614656315, + "grad_norm": 0.43823304772377014, + "learning_rate": 1.567506154137474e-05, + "loss": 0.1154, + "step": 11420 + }, + { + "epoch": 1.0821814050369247, + "grad_norm": 0.37226566672325134, + "learning_rate": 1.56712743798523e-05, + "loss": 0.1087, + "step": 11430 + }, + { + "epoch": 1.0831281954175345, + "grad_norm": 0.41247767210006714, + "learning_rate": 1.566748721832986e-05, + "loss": 0.1091, + "step": 11440 + }, + { + "epoch": 1.0840749857981442, + "grad_norm": 0.3546197712421417, + "learning_rate": 1.5663700056807424e-05, + "loss": 0.0982, + "step": 11450 + }, + { + "epoch": 1.085021776178754, + "grad_norm": 0.32991188764572144, + "learning_rate": 1.5659912895284984e-05, + "loss": 0.1032, + "step": 11460 + }, + { + "epoch": 1.0859685665593637, + "grad_norm": 0.3210223317146301, + "learning_rate": 1.5656125733762547e-05, + "loss": 0.1192, + "step": 11470 + }, + { + "epoch": 1.0869153569399734, + "grad_norm": 0.2751235067844391, + "learning_rate": 1.5652338572240107e-05, + "loss": 0.1184, + "step": 11480 + }, + { + "epoch": 1.0878621473205832, + "grad_norm": 0.45019084215164185, + "learning_rate": 1.564855141071767e-05, + "loss": 0.114, + "step": 11490 + }, + { + "epoch": 1.088808937701193, + "grad_norm": 0.3328302502632141, + "learning_rate": 1.564476424919523e-05, + "loss": 0.111, + "step": 11500 + }, + { + "epoch": 1.0897557280818027, + "grad_norm": 0.3211038410663605, + "learning_rate": 1.564097708767279e-05, + "loss": 0.1133, + "step": 11510 + }, + { + "epoch": 1.0907025184624124, + "grad_norm": 0.3597552180290222, + "learning_rate": 1.563718992615035e-05, + "loss": 0.1108, + "step": 11520 + }, + { + "epoch": 1.0916493088430221, + "grad_norm": 0.3381154239177704, + "learning_rate": 1.563340276462791e-05, + "loss": 0.1077, + "step": 11530 + }, + { + "epoch": 1.0925960992236319, + "grad_norm": 0.3959258198738098, + "learning_rate": 1.5629615603105474e-05, + "loss": 0.1208, + "step": 11540 + }, + { + "epoch": 1.0935428896042416, + "grad_norm": 0.3280211091041565, + "learning_rate": 1.5625828441583034e-05, + "loss": 0.1039, + "step": 11550 + }, + { + "epoch": 1.0944896799848514, + "grad_norm": 0.27923819422721863, + "learning_rate": 1.5622041280060595e-05, + "loss": 0.1066, + "step": 11560 + }, + { + "epoch": 1.095436470365461, + "grad_norm": 0.29647937417030334, + "learning_rate": 1.5618254118538158e-05, + "loss": 0.1001, + "step": 11570 + }, + { + "epoch": 1.0963832607460708, + "grad_norm": 0.41365155577659607, + "learning_rate": 1.5614466957015718e-05, + "loss": 0.1123, + "step": 11580 + }, + { + "epoch": 1.0973300511266806, + "grad_norm": 0.29978007078170776, + "learning_rate": 1.561067979549328e-05, + "loss": 0.1168, + "step": 11590 + }, + { + "epoch": 1.0982768415072903, + "grad_norm": 0.3278815746307373, + "learning_rate": 1.560689263397084e-05, + "loss": 0.1126, + "step": 11600 + }, + { + "epoch": 1.0992236318879, + "grad_norm": 0.34750813245773315, + "learning_rate": 1.56031054724484e-05, + "loss": 0.1062, + "step": 11610 + }, + { + "epoch": 1.1001704222685098, + "grad_norm": 0.5691556930541992, + "learning_rate": 1.559931831092596e-05, + "loss": 0.0999, + "step": 11620 + }, + { + "epoch": 1.1011172126491195, + "grad_norm": 0.43018022179603577, + "learning_rate": 1.5595531149403525e-05, + "loss": 0.1154, + "step": 11630 + }, + { + "epoch": 1.1020640030297293, + "grad_norm": 0.3963812589645386, + "learning_rate": 1.5591743987881085e-05, + "loss": 0.1001, + "step": 11640 + }, + { + "epoch": 1.103010793410339, + "grad_norm": 0.2910940945148468, + "learning_rate": 1.5587956826358645e-05, + "loss": 0.1131, + "step": 11650 + }, + { + "epoch": 1.1039575837909488, + "grad_norm": 0.42120805382728577, + "learning_rate": 1.5584169664836205e-05, + "loss": 0.1139, + "step": 11660 + }, + { + "epoch": 1.1049043741715585, + "grad_norm": 0.33243581652641296, + "learning_rate": 1.5580382503313765e-05, + "loss": 0.1058, + "step": 11670 + }, + { + "epoch": 1.1058511645521683, + "grad_norm": 0.360920250415802, + "learning_rate": 1.557659534179133e-05, + "loss": 0.1033, + "step": 11680 + }, + { + "epoch": 1.106797954932778, + "grad_norm": 0.3652898073196411, + "learning_rate": 1.557280818026889e-05, + "loss": 0.1107, + "step": 11690 + }, + { + "epoch": 1.1077447453133875, + "grad_norm": 0.3319729268550873, + "learning_rate": 1.5569021018746452e-05, + "loss": 0.1037, + "step": 11700 + }, + { + "epoch": 1.1086915356939973, + "grad_norm": 0.4759555757045746, + "learning_rate": 1.5565233857224012e-05, + "loss": 0.1157, + "step": 11710 + }, + { + "epoch": 1.109638326074607, + "grad_norm": 0.30491140484809875, + "learning_rate": 1.5561446695701572e-05, + "loss": 0.0995, + "step": 11720 + }, + { + "epoch": 1.1105851164552167, + "grad_norm": 0.3831186592578888, + "learning_rate": 1.5557659534179136e-05, + "loss": 0.1128, + "step": 11730 + }, + { + "epoch": 1.1115319068358265, + "grad_norm": 0.3806062936782837, + "learning_rate": 1.5553872372656696e-05, + "loss": 0.0988, + "step": 11740 + }, + { + "epoch": 1.1124786972164362, + "grad_norm": 0.3348376452922821, + "learning_rate": 1.5550085211134256e-05, + "loss": 0.1121, + "step": 11750 + }, + { + "epoch": 1.113425487597046, + "grad_norm": 0.3526676297187805, + "learning_rate": 1.5546298049611816e-05, + "loss": 0.1134, + "step": 11760 + }, + { + "epoch": 1.1143722779776557, + "grad_norm": 0.4545170068740845, + "learning_rate": 1.554251088808938e-05, + "loss": 0.1104, + "step": 11770 + }, + { + "epoch": 1.1153190683582654, + "grad_norm": 0.2890951335430145, + "learning_rate": 1.553872372656694e-05, + "loss": 0.1009, + "step": 11780 + }, + { + "epoch": 1.1162658587388752, + "grad_norm": 0.3283509612083435, + "learning_rate": 1.55349365650445e-05, + "loss": 0.097, + "step": 11790 + }, + { + "epoch": 1.117212649119485, + "grad_norm": 0.39825356006622314, + "learning_rate": 1.553114940352206e-05, + "loss": 0.108, + "step": 11800 + }, + { + "epoch": 1.1181594395000947, + "grad_norm": 0.3061431348323822, + "learning_rate": 1.5527362241999623e-05, + "loss": 0.1115, + "step": 11810 + }, + { + "epoch": 1.1191062298807044, + "grad_norm": 0.3241097629070282, + "learning_rate": 1.5523575080477183e-05, + "loss": 0.104, + "step": 11820 + }, + { + "epoch": 1.1200530202613141, + "grad_norm": 0.4179798662662506, + "learning_rate": 1.5519787918954746e-05, + "loss": 0.1111, + "step": 11830 + }, + { + "epoch": 1.1209998106419239, + "grad_norm": 0.52249675989151, + "learning_rate": 1.5516000757432306e-05, + "loss": 0.1128, + "step": 11840 + }, + { + "epoch": 1.1219466010225336, + "grad_norm": 0.42467227578163147, + "learning_rate": 1.5512213595909866e-05, + "loss": 0.1127, + "step": 11850 + }, + { + "epoch": 1.1228933914031434, + "grad_norm": 0.344828337430954, + "learning_rate": 1.5508426434387427e-05, + "loss": 0.1084, + "step": 11860 + }, + { + "epoch": 1.1238401817837531, + "grad_norm": 0.30831944942474365, + "learning_rate": 1.550463927286499e-05, + "loss": 0.1149, + "step": 11870 + }, + { + "epoch": 1.1247869721643629, + "grad_norm": 0.4202735424041748, + "learning_rate": 1.550085211134255e-05, + "loss": 0.1233, + "step": 11880 + }, + { + "epoch": 1.1257337625449726, + "grad_norm": 0.46457767486572266, + "learning_rate": 1.549706494982011e-05, + "loss": 0.1151, + "step": 11890 + }, + { + "epoch": 1.1266805529255823, + "grad_norm": 0.41014325618743896, + "learning_rate": 1.549327778829767e-05, + "loss": 0.1046, + "step": 11900 + }, + { + "epoch": 1.127627343306192, + "grad_norm": 0.3300694227218628, + "learning_rate": 1.5489490626775234e-05, + "loss": 0.1044, + "step": 11910 + }, + { + "epoch": 1.1285741336868018, + "grad_norm": 0.3922729194164276, + "learning_rate": 1.5485703465252794e-05, + "loss": 0.0988, + "step": 11920 + }, + { + "epoch": 1.1295209240674116, + "grad_norm": 0.36446335911750793, + "learning_rate": 1.5481916303730357e-05, + "loss": 0.1082, + "step": 11930 + }, + { + "epoch": 1.1304677144480213, + "grad_norm": 0.3100954294204712, + "learning_rate": 1.5478129142207917e-05, + "loss": 0.1062, + "step": 11940 + }, + { + "epoch": 1.1314145048286308, + "grad_norm": 0.3029150366783142, + "learning_rate": 1.5474341980685477e-05, + "loss": 0.1166, + "step": 11950 + }, + { + "epoch": 1.1323612952092406, + "grad_norm": 0.287994384765625, + "learning_rate": 1.547055481916304e-05, + "loss": 0.1009, + "step": 11960 + }, + { + "epoch": 1.1333080855898503, + "grad_norm": 0.3634592294692993, + "learning_rate": 1.54667676576406e-05, + "loss": 0.1107, + "step": 11970 + }, + { + "epoch": 1.13425487597046, + "grad_norm": 0.39057114720344543, + "learning_rate": 1.546298049611816e-05, + "loss": 0.1117, + "step": 11980 + }, + { + "epoch": 1.1352016663510698, + "grad_norm": 0.3468126356601715, + "learning_rate": 1.545919333459572e-05, + "loss": 0.1161, + "step": 11990 + }, + { + "epoch": 1.1361484567316795, + "grad_norm": 0.43798407912254333, + "learning_rate": 1.545540617307328e-05, + "loss": 0.1152, + "step": 12000 + }, + { + "epoch": 1.1370952471122893, + "grad_norm": 0.2758103907108307, + "learning_rate": 1.5451619011550844e-05, + "loss": 0.1125, + "step": 12010 + }, + { + "epoch": 1.138042037492899, + "grad_norm": 0.3223762512207031, + "learning_rate": 1.5447831850028404e-05, + "loss": 0.1206, + "step": 12020 + }, + { + "epoch": 1.1389888278735087, + "grad_norm": 0.2902495563030243, + "learning_rate": 1.5444044688505964e-05, + "loss": 0.1007, + "step": 12030 + }, + { + "epoch": 1.1399356182541185, + "grad_norm": 0.253961980342865, + "learning_rate": 1.5440257526983528e-05, + "loss": 0.0983, + "step": 12040 + }, + { + "epoch": 1.1408824086347282, + "grad_norm": 0.41474276781082153, + "learning_rate": 1.5436470365461088e-05, + "loss": 0.1021, + "step": 12050 + }, + { + "epoch": 1.141829199015338, + "grad_norm": 0.3824501037597656, + "learning_rate": 1.543268320393865e-05, + "loss": 0.1171, + "step": 12060 + }, + { + "epoch": 1.1427759893959477, + "grad_norm": 0.39116084575653076, + "learning_rate": 1.542889604241621e-05, + "loss": 0.1052, + "step": 12070 + }, + { + "epoch": 1.1437227797765575, + "grad_norm": 0.7037903070449829, + "learning_rate": 1.542510888089377e-05, + "loss": 0.0976, + "step": 12080 + }, + { + "epoch": 1.1446695701571672, + "grad_norm": 0.4843405485153198, + "learning_rate": 1.542132171937133e-05, + "loss": 0.1128, + "step": 12090 + }, + { + "epoch": 1.145616360537777, + "grad_norm": 0.36582592129707336, + "learning_rate": 1.5417534557848895e-05, + "loss": 0.1073, + "step": 12100 + }, + { + "epoch": 1.1465631509183867, + "grad_norm": 0.36037132143974304, + "learning_rate": 1.5413747396326455e-05, + "loss": 0.1109, + "step": 12110 + }, + { + "epoch": 1.1475099412989964, + "grad_norm": 0.30198612809181213, + "learning_rate": 1.5409960234804015e-05, + "loss": 0.109, + "step": 12120 + }, + { + "epoch": 1.1484567316796062, + "grad_norm": 0.32701000571250916, + "learning_rate": 1.5406173073281575e-05, + "loss": 0.1, + "step": 12130 + }, + { + "epoch": 1.149403522060216, + "grad_norm": 0.35844531655311584, + "learning_rate": 1.540238591175914e-05, + "loss": 0.1204, + "step": 12140 + }, + { + "epoch": 1.1503503124408256, + "grad_norm": 0.32474035024642944, + "learning_rate": 1.53985987502367e-05, + "loss": 0.1059, + "step": 12150 + }, + { + "epoch": 1.1512971028214354, + "grad_norm": 0.3637523651123047, + "learning_rate": 1.539481158871426e-05, + "loss": 0.1058, + "step": 12160 + }, + { + "epoch": 1.1522438932020451, + "grad_norm": 0.33581167459487915, + "learning_rate": 1.5391024427191822e-05, + "loss": 0.1123, + "step": 12170 + }, + { + "epoch": 1.1531906835826549, + "grad_norm": 0.2559072971343994, + "learning_rate": 1.5387237265669382e-05, + "loss": 0.1103, + "step": 12180 + }, + { + "epoch": 1.1541374739632646, + "grad_norm": 0.40483200550079346, + "learning_rate": 1.5383450104146945e-05, + "loss": 0.0937, + "step": 12190 + }, + { + "epoch": 1.1550842643438743, + "grad_norm": 0.29024216532707214, + "learning_rate": 1.5379662942624505e-05, + "loss": 0.1039, + "step": 12200 + }, + { + "epoch": 1.156031054724484, + "grad_norm": 0.3387871980667114, + "learning_rate": 1.5375875781102066e-05, + "loss": 0.1055, + "step": 12210 + }, + { + "epoch": 1.1569778451050938, + "grad_norm": 0.33456581830978394, + "learning_rate": 1.5372088619579626e-05, + "loss": 0.1102, + "step": 12220 + }, + { + "epoch": 1.1579246354857036, + "grad_norm": 0.25301578640937805, + "learning_rate": 1.5368301458057186e-05, + "loss": 0.1114, + "step": 12230 + }, + { + "epoch": 1.1588714258663133, + "grad_norm": 0.33042630553245544, + "learning_rate": 1.536451429653475e-05, + "loss": 0.11, + "step": 12240 + }, + { + "epoch": 1.159818216246923, + "grad_norm": 0.2601511478424072, + "learning_rate": 1.536072713501231e-05, + "loss": 0.1023, + "step": 12250 + }, + { + "epoch": 1.1607650066275326, + "grad_norm": 0.3771694004535675, + "learning_rate": 1.535693997348987e-05, + "loss": 0.1173, + "step": 12260 + }, + { + "epoch": 1.1617117970081423, + "grad_norm": 0.3950464725494385, + "learning_rate": 1.5353152811967433e-05, + "loss": 0.1156, + "step": 12270 + }, + { + "epoch": 1.162658587388752, + "grad_norm": 0.36990073323249817, + "learning_rate": 1.5349365650444993e-05, + "loss": 0.1082, + "step": 12280 + }, + { + "epoch": 1.1636053777693618, + "grad_norm": 0.4734830856323242, + "learning_rate": 1.5345578488922556e-05, + "loss": 0.1025, + "step": 12290 + }, + { + "epoch": 1.1645521681499715, + "grad_norm": 0.40759214758872986, + "learning_rate": 1.5341791327400116e-05, + "loss": 0.1204, + "step": 12300 + }, + { + "epoch": 1.1654989585305813, + "grad_norm": 0.30610671639442444, + "learning_rate": 1.5338004165877676e-05, + "loss": 0.1032, + "step": 12310 + }, + { + "epoch": 1.166445748911191, + "grad_norm": 0.3321300446987152, + "learning_rate": 1.5334217004355236e-05, + "loss": 0.114, + "step": 12320 + }, + { + "epoch": 1.1673925392918008, + "grad_norm": 0.31097903847694397, + "learning_rate": 1.53304298428328e-05, + "loss": 0.1151, + "step": 12330 + }, + { + "epoch": 1.1683393296724105, + "grad_norm": 0.32177960872650146, + "learning_rate": 1.532664268131036e-05, + "loss": 0.1137, + "step": 12340 + }, + { + "epoch": 1.1692861200530202, + "grad_norm": 0.3149729371070862, + "learning_rate": 1.532285551978792e-05, + "loss": 0.1197, + "step": 12350 + }, + { + "epoch": 1.17023291043363, + "grad_norm": 0.3405247628688812, + "learning_rate": 1.531906835826548e-05, + "loss": 0.1027, + "step": 12360 + }, + { + "epoch": 1.1711797008142397, + "grad_norm": 0.34930819272994995, + "learning_rate": 1.531528119674304e-05, + "loss": 0.1085, + "step": 12370 + }, + { + "epoch": 1.1721264911948495, + "grad_norm": 0.4254821240901947, + "learning_rate": 1.5311494035220603e-05, + "loss": 0.1117, + "step": 12380 + }, + { + "epoch": 1.1730732815754592, + "grad_norm": 0.380534827709198, + "learning_rate": 1.5307706873698163e-05, + "loss": 0.1112, + "step": 12390 + }, + { + "epoch": 1.174020071956069, + "grad_norm": 0.28946927189826965, + "learning_rate": 1.5303919712175727e-05, + "loss": 0.1039, + "step": 12400 + }, + { + "epoch": 1.1749668623366787, + "grad_norm": 0.2585335671901703, + "learning_rate": 1.5300132550653287e-05, + "loss": 0.0979, + "step": 12410 + }, + { + "epoch": 1.1759136527172884, + "grad_norm": 0.4145877957344055, + "learning_rate": 1.5296345389130847e-05, + "loss": 0.1072, + "step": 12420 + }, + { + "epoch": 1.1768604430978982, + "grad_norm": 0.3534521162509918, + "learning_rate": 1.529255822760841e-05, + "loss": 0.1125, + "step": 12430 + }, + { + "epoch": 1.177807233478508, + "grad_norm": 0.312716007232666, + "learning_rate": 1.528877106608597e-05, + "loss": 0.1124, + "step": 12440 + }, + { + "epoch": 1.1787540238591177, + "grad_norm": 0.3371047377586365, + "learning_rate": 1.528498390456353e-05, + "loss": 0.1013, + "step": 12450 + }, + { + "epoch": 1.1797008142397274, + "grad_norm": 0.3125591278076172, + "learning_rate": 1.528119674304109e-05, + "loss": 0.1095, + "step": 12460 + }, + { + "epoch": 1.1806476046203371, + "grad_norm": 0.3226798474788666, + "learning_rate": 1.5277409581518654e-05, + "loss": 0.111, + "step": 12470 + }, + { + "epoch": 1.1815943950009469, + "grad_norm": 0.3581395745277405, + "learning_rate": 1.5273622419996214e-05, + "loss": 0.1096, + "step": 12480 + }, + { + "epoch": 1.1825411853815566, + "grad_norm": 0.3344843089580536, + "learning_rate": 1.5269835258473774e-05, + "loss": 0.1111, + "step": 12490 + }, + { + "epoch": 1.1834879757621661, + "grad_norm": 0.40416479110717773, + "learning_rate": 1.5266048096951337e-05, + "loss": 0.1065, + "step": 12500 + }, + { + "epoch": 1.1844347661427759, + "grad_norm": 0.3474705219268799, + "learning_rate": 1.5262260935428897e-05, + "loss": 0.1017, + "step": 12510 + }, + { + "epoch": 1.1853815565233856, + "grad_norm": 0.374860554933548, + "learning_rate": 1.525847377390646e-05, + "loss": 0.1007, + "step": 12520 + }, + { + "epoch": 1.1863283469039954, + "grad_norm": 0.33770567178726196, + "learning_rate": 1.525468661238402e-05, + "loss": 0.1054, + "step": 12530 + }, + { + "epoch": 1.187275137284605, + "grad_norm": 0.3405020833015442, + "learning_rate": 1.5250899450861581e-05, + "loss": 0.1152, + "step": 12540 + }, + { + "epoch": 1.1882219276652148, + "grad_norm": 0.3371250033378601, + "learning_rate": 1.5247112289339141e-05, + "loss": 0.1026, + "step": 12550 + }, + { + "epoch": 1.1891687180458246, + "grad_norm": 0.4953118860721588, + "learning_rate": 1.5243325127816701e-05, + "loss": 0.1165, + "step": 12560 + }, + { + "epoch": 1.1901155084264343, + "grad_norm": 0.33394649624824524, + "learning_rate": 1.5239537966294265e-05, + "loss": 0.1181, + "step": 12570 + }, + { + "epoch": 1.191062298807044, + "grad_norm": 0.3339691758155823, + "learning_rate": 1.5235750804771825e-05, + "loss": 0.106, + "step": 12580 + }, + { + "epoch": 1.1920090891876538, + "grad_norm": 0.25260743498802185, + "learning_rate": 1.5231963643249386e-05, + "loss": 0.1196, + "step": 12590 + }, + { + "epoch": 1.1929558795682635, + "grad_norm": 0.307058185338974, + "learning_rate": 1.5228176481726946e-05, + "loss": 0.1159, + "step": 12600 + }, + { + "epoch": 1.1939026699488733, + "grad_norm": 0.3390623927116394, + "learning_rate": 1.5224389320204508e-05, + "loss": 0.1045, + "step": 12610 + }, + { + "epoch": 1.194849460329483, + "grad_norm": 0.357327401638031, + "learning_rate": 1.522060215868207e-05, + "loss": 0.0975, + "step": 12620 + }, + { + "epoch": 1.1957962507100928, + "grad_norm": 0.29861733317375183, + "learning_rate": 1.521681499715963e-05, + "loss": 0.1007, + "step": 12630 + }, + { + "epoch": 1.1967430410907025, + "grad_norm": 0.3431881368160248, + "learning_rate": 1.521302783563719e-05, + "loss": 0.1111, + "step": 12640 + }, + { + "epoch": 1.1976898314713122, + "grad_norm": 0.29756271839141846, + "learning_rate": 1.5209240674114752e-05, + "loss": 0.1014, + "step": 12650 + }, + { + "epoch": 1.198636621851922, + "grad_norm": 0.34353452920913696, + "learning_rate": 1.5205453512592313e-05, + "loss": 0.1107, + "step": 12660 + }, + { + "epoch": 1.1995834122325317, + "grad_norm": 0.31465011835098267, + "learning_rate": 1.5201666351069875e-05, + "loss": 0.1031, + "step": 12670 + }, + { + "epoch": 1.2005302026131415, + "grad_norm": 0.333942174911499, + "learning_rate": 1.5197879189547435e-05, + "loss": 0.1042, + "step": 12680 + }, + { + "epoch": 1.2014769929937512, + "grad_norm": 0.3030639588832855, + "learning_rate": 1.5194092028024995e-05, + "loss": 0.1015, + "step": 12690 + }, + { + "epoch": 1.202423783374361, + "grad_norm": 0.3866049349308014, + "learning_rate": 1.5190304866502557e-05, + "loss": 0.1069, + "step": 12700 + }, + { + "epoch": 1.2033705737549707, + "grad_norm": 0.3382660448551178, + "learning_rate": 1.5186517704980119e-05, + "loss": 0.1053, + "step": 12710 + }, + { + "epoch": 1.2043173641355804, + "grad_norm": 0.4327520430088043, + "learning_rate": 1.518273054345768e-05, + "loss": 0.1057, + "step": 12720 + }, + { + "epoch": 1.2052641545161902, + "grad_norm": 0.30221450328826904, + "learning_rate": 1.517894338193524e-05, + "loss": 0.1132, + "step": 12730 + }, + { + "epoch": 1.2062109448968, + "grad_norm": 0.39057356119155884, + "learning_rate": 1.51751562204128e-05, + "loss": 0.1081, + "step": 12740 + }, + { + "epoch": 1.2071577352774097, + "grad_norm": 0.31764695048332214, + "learning_rate": 1.5171369058890364e-05, + "loss": 0.1174, + "step": 12750 + }, + { + "epoch": 1.2081045256580194, + "grad_norm": 0.4214201271533966, + "learning_rate": 1.5167581897367924e-05, + "loss": 0.1076, + "step": 12760 + }, + { + "epoch": 1.2090513160386291, + "grad_norm": 0.4078432321548462, + "learning_rate": 1.5163794735845486e-05, + "loss": 0.1035, + "step": 12770 + }, + { + "epoch": 1.2099981064192389, + "grad_norm": 0.2991807460784912, + "learning_rate": 1.5160007574323046e-05, + "loss": 0.1172, + "step": 12780 + }, + { + "epoch": 1.2109448967998486, + "grad_norm": 0.3427339494228363, + "learning_rate": 1.5156220412800606e-05, + "loss": 0.1145, + "step": 12790 + }, + { + "epoch": 1.2118916871804584, + "grad_norm": 0.3660449981689453, + "learning_rate": 1.515243325127817e-05, + "loss": 0.1126, + "step": 12800 + }, + { + "epoch": 1.2128384775610679, + "grad_norm": 0.2685498595237732, + "learning_rate": 1.514864608975573e-05, + "loss": 0.1074, + "step": 12810 + }, + { + "epoch": 1.2137852679416776, + "grad_norm": 0.4036467969417572, + "learning_rate": 1.514485892823329e-05, + "loss": 0.1129, + "step": 12820 + }, + { + "epoch": 1.2147320583222874, + "grad_norm": 0.28507718443870544, + "learning_rate": 1.5141071766710851e-05, + "loss": 0.1054, + "step": 12830 + }, + { + "epoch": 1.215678848702897, + "grad_norm": 0.32858598232269287, + "learning_rate": 1.5137284605188411e-05, + "loss": 0.1081, + "step": 12840 + }, + { + "epoch": 1.2166256390835068, + "grad_norm": 0.3813497722148895, + "learning_rate": 1.5133497443665975e-05, + "loss": 0.1152, + "step": 12850 + }, + { + "epoch": 1.2175724294641166, + "grad_norm": 0.31586164236068726, + "learning_rate": 1.5129710282143535e-05, + "loss": 0.1076, + "step": 12860 + }, + { + "epoch": 1.2185192198447263, + "grad_norm": 0.44221732020378113, + "learning_rate": 1.5125923120621095e-05, + "loss": 0.1092, + "step": 12870 + }, + { + "epoch": 1.219466010225336, + "grad_norm": 0.4099266231060028, + "learning_rate": 1.5122135959098657e-05, + "loss": 0.1171, + "step": 12880 + }, + { + "epoch": 1.2204128006059458, + "grad_norm": 0.2790457606315613, + "learning_rate": 1.5118348797576218e-05, + "loss": 0.1051, + "step": 12890 + }, + { + "epoch": 1.2213595909865556, + "grad_norm": 0.40873250365257263, + "learning_rate": 1.511456163605378e-05, + "loss": 0.1068, + "step": 12900 + }, + { + "epoch": 1.2223063813671653, + "grad_norm": 0.34618380665779114, + "learning_rate": 1.511077447453134e-05, + "loss": 0.1111, + "step": 12910 + }, + { + "epoch": 1.223253171747775, + "grad_norm": 0.3440490961074829, + "learning_rate": 1.51069873130089e-05, + "loss": 0.1128, + "step": 12920 + }, + { + "epoch": 1.2241999621283848, + "grad_norm": 0.34496238827705383, + "learning_rate": 1.5103200151486462e-05, + "loss": 0.0993, + "step": 12930 + }, + { + "epoch": 1.2251467525089945, + "grad_norm": 0.3635864853858948, + "learning_rate": 1.5099412989964024e-05, + "loss": 0.0994, + "step": 12940 + }, + { + "epoch": 1.2260935428896043, + "grad_norm": 0.3953652083873749, + "learning_rate": 1.5095625828441585e-05, + "loss": 0.1148, + "step": 12950 + }, + { + "epoch": 1.227040333270214, + "grad_norm": 0.31408146023750305, + "learning_rate": 1.5091838666919145e-05, + "loss": 0.0987, + "step": 12960 + }, + { + "epoch": 1.2279871236508237, + "grad_norm": 0.47466138005256653, + "learning_rate": 1.5088051505396706e-05, + "loss": 0.1049, + "step": 12970 + }, + { + "epoch": 1.2289339140314335, + "grad_norm": 0.36139219999313354, + "learning_rate": 1.5084264343874267e-05, + "loss": 0.1182, + "step": 12980 + }, + { + "epoch": 1.2298807044120432, + "grad_norm": 0.31297993659973145, + "learning_rate": 1.5080477182351829e-05, + "loss": 0.1084, + "step": 12990 + }, + { + "epoch": 1.230827494792653, + "grad_norm": 0.35729625821113586, + "learning_rate": 1.5076690020829389e-05, + "loss": 0.1043, + "step": 13000 + }, + { + "epoch": 1.2317742851732627, + "grad_norm": 0.4169113039970398, + "learning_rate": 1.507290285930695e-05, + "loss": 0.1033, + "step": 13010 + }, + { + "epoch": 1.2327210755538724, + "grad_norm": 0.39845699071884155, + "learning_rate": 1.506911569778451e-05, + "loss": 0.1143, + "step": 13020 + }, + { + "epoch": 1.2336678659344822, + "grad_norm": 0.270967960357666, + "learning_rate": 1.5065328536262074e-05, + "loss": 0.1063, + "step": 13030 + }, + { + "epoch": 1.234614656315092, + "grad_norm": 0.3269328474998474, + "learning_rate": 1.5061541374739634e-05, + "loss": 0.0983, + "step": 13040 + }, + { + "epoch": 1.2355614466957014, + "grad_norm": 0.3136744499206543, + "learning_rate": 1.5057754213217194e-05, + "loss": 0.0919, + "step": 13050 + }, + { + "epoch": 1.2365082370763112, + "grad_norm": 0.3122495710849762, + "learning_rate": 1.5053967051694756e-05, + "loss": 0.1076, + "step": 13060 + }, + { + "epoch": 1.237455027456921, + "grad_norm": 0.4010438024997711, + "learning_rate": 1.5050179890172316e-05, + "loss": 0.1116, + "step": 13070 + }, + { + "epoch": 1.2384018178375307, + "grad_norm": 0.4302702248096466, + "learning_rate": 1.504639272864988e-05, + "loss": 0.1219, + "step": 13080 + }, + { + "epoch": 1.2393486082181404, + "grad_norm": 0.35059264302253723, + "learning_rate": 1.504260556712744e-05, + "loss": 0.12, + "step": 13090 + }, + { + "epoch": 1.2402953985987502, + "grad_norm": 0.30450910329818726, + "learning_rate": 1.5038818405605e-05, + "loss": 0.1018, + "step": 13100 + }, + { + "epoch": 1.24124218897936, + "grad_norm": 0.3359741270542145, + "learning_rate": 1.5035031244082561e-05, + "loss": 0.1174, + "step": 13110 + }, + { + "epoch": 1.2421889793599696, + "grad_norm": 0.2931978702545166, + "learning_rate": 1.5031244082560122e-05, + "loss": 0.1101, + "step": 13120 + }, + { + "epoch": 1.2431357697405794, + "grad_norm": 0.27383294701576233, + "learning_rate": 1.5027456921037685e-05, + "loss": 0.1108, + "step": 13130 + }, + { + "epoch": 1.2440825601211891, + "grad_norm": 0.33028000593185425, + "learning_rate": 1.5023669759515245e-05, + "loss": 0.1055, + "step": 13140 + }, + { + "epoch": 1.2450293505017989, + "grad_norm": 0.3667839765548706, + "learning_rate": 1.5019882597992805e-05, + "loss": 0.1099, + "step": 13150 + }, + { + "epoch": 1.2459761408824086, + "grad_norm": 0.3085506558418274, + "learning_rate": 1.5016095436470367e-05, + "loss": 0.1124, + "step": 13160 + }, + { + "epoch": 1.2469229312630183, + "grad_norm": 0.3162235617637634, + "learning_rate": 1.5012308274947929e-05, + "loss": 0.1133, + "step": 13170 + }, + { + "epoch": 1.247869721643628, + "grad_norm": 0.3025159239768982, + "learning_rate": 1.5008521113425489e-05, + "loss": 0.1079, + "step": 13180 + }, + { + "epoch": 1.2488165120242378, + "grad_norm": 0.3852561414241791, + "learning_rate": 1.500473395190305e-05, + "loss": 0.1041, + "step": 13190 + }, + { + "epoch": 1.2497633024048476, + "grad_norm": 0.3863688111305237, + "learning_rate": 1.500094679038061e-05, + "loss": 0.1081, + "step": 13200 + }, + { + "epoch": 1.2507100927854573, + "grad_norm": 0.42680519819259644, + "learning_rate": 1.499715962885817e-05, + "loss": 0.1011, + "step": 13210 + }, + { + "epoch": 1.251656883166067, + "grad_norm": 0.3259578049182892, + "learning_rate": 1.4993372467335734e-05, + "loss": 0.1116, + "step": 13220 + }, + { + "epoch": 1.2526036735466768, + "grad_norm": 0.3074738383293152, + "learning_rate": 1.4989585305813294e-05, + "loss": 0.1133, + "step": 13230 + }, + { + "epoch": 1.2535504639272865, + "grad_norm": 0.4082379639148712, + "learning_rate": 1.4985798144290856e-05, + "loss": 0.1107, + "step": 13240 + }, + { + "epoch": 1.2544972543078963, + "grad_norm": 0.35676753520965576, + "learning_rate": 1.4982010982768416e-05, + "loss": 0.1068, + "step": 13250 + }, + { + "epoch": 1.255444044688506, + "grad_norm": 0.34967681765556335, + "learning_rate": 1.4978223821245976e-05, + "loss": 0.1081, + "step": 13260 + }, + { + "epoch": 1.2563908350691158, + "grad_norm": 0.37591052055358887, + "learning_rate": 1.497443665972354e-05, + "loss": 0.1185, + "step": 13270 + }, + { + "epoch": 1.2573376254497255, + "grad_norm": 0.3394329845905304, + "learning_rate": 1.49706494982011e-05, + "loss": 0.1147, + "step": 13280 + }, + { + "epoch": 1.2582844158303352, + "grad_norm": 0.26928722858428955, + "learning_rate": 1.4966862336678661e-05, + "loss": 0.1077, + "step": 13290 + }, + { + "epoch": 1.259231206210945, + "grad_norm": 0.35786399245262146, + "learning_rate": 1.4963075175156221e-05, + "loss": 0.1024, + "step": 13300 + }, + { + "epoch": 1.2601779965915547, + "grad_norm": 0.2981221377849579, + "learning_rate": 1.4959288013633784e-05, + "loss": 0.1101, + "step": 13310 + }, + { + "epoch": 1.2611247869721645, + "grad_norm": 0.33681800961494446, + "learning_rate": 1.4955500852111345e-05, + "loss": 0.1158, + "step": 13320 + }, + { + "epoch": 1.2620715773527742, + "grad_norm": 0.3591706156730652, + "learning_rate": 1.4951713690588905e-05, + "loss": 0.114, + "step": 13330 + }, + { + "epoch": 1.263018367733384, + "grad_norm": 0.35171523690223694, + "learning_rate": 1.4947926529066466e-05, + "loss": 0.1057, + "step": 13340 + }, + { + "epoch": 1.2639651581139937, + "grad_norm": 0.36935341358184814, + "learning_rate": 1.4944139367544026e-05, + "loss": 0.1111, + "step": 13350 + }, + { + "epoch": 1.2649119484946034, + "grad_norm": 0.3233521580696106, + "learning_rate": 1.4940352206021588e-05, + "loss": 0.1015, + "step": 13360 + }, + { + "epoch": 1.2658587388752132, + "grad_norm": 0.408893346786499, + "learning_rate": 1.493656504449915e-05, + "loss": 0.1197, + "step": 13370 + }, + { + "epoch": 1.2668055292558227, + "grad_norm": 0.38509801030158997, + "learning_rate": 1.493277788297671e-05, + "loss": 0.1036, + "step": 13380 + }, + { + "epoch": 1.2677523196364324, + "grad_norm": 0.3889254331588745, + "learning_rate": 1.492899072145427e-05, + "loss": 0.1089, + "step": 13390 + }, + { + "epoch": 1.2686991100170422, + "grad_norm": 0.3505185544490814, + "learning_rate": 1.4925203559931832e-05, + "loss": 0.1083, + "step": 13400 + }, + { + "epoch": 1.269645900397652, + "grad_norm": 0.3532257676124573, + "learning_rate": 1.4921416398409393e-05, + "loss": 0.1123, + "step": 13410 + }, + { + "epoch": 1.2705926907782616, + "grad_norm": 0.4355742633342743, + "learning_rate": 1.4917629236886955e-05, + "loss": 0.1194, + "step": 13420 + }, + { + "epoch": 1.2715394811588714, + "grad_norm": 0.3455640971660614, + "learning_rate": 1.4913842075364515e-05, + "loss": 0.1062, + "step": 13430 + }, + { + "epoch": 1.2724862715394811, + "grad_norm": 0.420749306678772, + "learning_rate": 1.4910054913842075e-05, + "loss": 0.1005, + "step": 13440 + }, + { + "epoch": 1.2734330619200909, + "grad_norm": 0.4022374451160431, + "learning_rate": 1.4906267752319639e-05, + "loss": 0.1089, + "step": 13450 + }, + { + "epoch": 1.2743798523007006, + "grad_norm": 0.31215524673461914, + "learning_rate": 1.4902480590797199e-05, + "loss": 0.1126, + "step": 13460 + }, + { + "epoch": 1.2753266426813104, + "grad_norm": 0.37119176983833313, + "learning_rate": 1.489869342927476e-05, + "loss": 0.1088, + "step": 13470 + }, + { + "epoch": 1.27627343306192, + "grad_norm": 0.31066417694091797, + "learning_rate": 1.489490626775232e-05, + "loss": 0.1089, + "step": 13480 + }, + { + "epoch": 1.2772202234425298, + "grad_norm": 0.3264316916465759, + "learning_rate": 1.489111910622988e-05, + "loss": 0.1151, + "step": 13490 + }, + { + "epoch": 1.2781670138231396, + "grad_norm": 0.2913450002670288, + "learning_rate": 1.4887331944707444e-05, + "loss": 0.1135, + "step": 13500 + }, + { + "epoch": 1.2791138042037493, + "grad_norm": 0.3532226085662842, + "learning_rate": 1.4883544783185004e-05, + "loss": 0.1064, + "step": 13510 + }, + { + "epoch": 1.280060594584359, + "grad_norm": 0.3865976929664612, + "learning_rate": 1.4879757621662566e-05, + "loss": 0.1015, + "step": 13520 + }, + { + "epoch": 1.2810073849649688, + "grad_norm": 0.37290051579475403, + "learning_rate": 1.4875970460140126e-05, + "loss": 0.0982, + "step": 13530 + }, + { + "epoch": 1.2819541753455785, + "grad_norm": 0.4275021553039551, + "learning_rate": 1.4872183298617686e-05, + "loss": 0.1177, + "step": 13540 + }, + { + "epoch": 1.2829009657261883, + "grad_norm": 0.30412721633911133, + "learning_rate": 1.486839613709525e-05, + "loss": 0.1116, + "step": 13550 + }, + { + "epoch": 1.283847756106798, + "grad_norm": 0.31443697214126587, + "learning_rate": 1.486460897557281e-05, + "loss": 0.1128, + "step": 13560 + }, + { + "epoch": 1.2847945464874078, + "grad_norm": 0.23646575212478638, + "learning_rate": 1.486082181405037e-05, + "loss": 0.107, + "step": 13570 + }, + { + "epoch": 1.2857413368680175, + "grad_norm": 0.4218645691871643, + "learning_rate": 1.4857034652527931e-05, + "loss": 0.1021, + "step": 13580 + }, + { + "epoch": 1.286688127248627, + "grad_norm": 0.3243103623390198, + "learning_rate": 1.4853247491005493e-05, + "loss": 0.1037, + "step": 13590 + }, + { + "epoch": 1.2876349176292368, + "grad_norm": 0.3228418529033661, + "learning_rate": 1.4849460329483055e-05, + "loss": 0.1002, + "step": 13600 + }, + { + "epoch": 1.2885817080098465, + "grad_norm": 0.3226083517074585, + "learning_rate": 1.4845673167960615e-05, + "loss": 0.1123, + "step": 13610 + }, + { + "epoch": 1.2895284983904562, + "grad_norm": 0.39514216780662537, + "learning_rate": 1.4841886006438175e-05, + "loss": 0.1105, + "step": 13620 + }, + { + "epoch": 1.290475288771066, + "grad_norm": 0.31612706184387207, + "learning_rate": 1.4838098844915737e-05, + "loss": 0.1117, + "step": 13630 + }, + { + "epoch": 1.2914220791516757, + "grad_norm": 0.3052833378314972, + "learning_rate": 1.4834311683393298e-05, + "loss": 0.108, + "step": 13640 + }, + { + "epoch": 1.2923688695322855, + "grad_norm": 0.32270005345344543, + "learning_rate": 1.483052452187086e-05, + "loss": 0.107, + "step": 13650 + }, + { + "epoch": 1.2933156599128952, + "grad_norm": 0.3900124430656433, + "learning_rate": 1.482673736034842e-05, + "loss": 0.1161, + "step": 13660 + }, + { + "epoch": 1.294262450293505, + "grad_norm": 0.3898795247077942, + "learning_rate": 1.482295019882598e-05, + "loss": 0.109, + "step": 13670 + }, + { + "epoch": 1.2952092406741147, + "grad_norm": 0.3327631652355194, + "learning_rate": 1.4819163037303542e-05, + "loss": 0.1065, + "step": 13680 + }, + { + "epoch": 1.2961560310547244, + "grad_norm": 0.2671869695186615, + "learning_rate": 1.4815375875781104e-05, + "loss": 0.1151, + "step": 13690 + }, + { + "epoch": 1.2971028214353342, + "grad_norm": 0.36693522334098816, + "learning_rate": 1.4811588714258665e-05, + "loss": 0.1253, + "step": 13700 + }, + { + "epoch": 1.298049611815944, + "grad_norm": 0.37485846877098083, + "learning_rate": 1.4807801552736225e-05, + "loss": 0.1075, + "step": 13710 + }, + { + "epoch": 1.2989964021965537, + "grad_norm": 0.3453369736671448, + "learning_rate": 1.4804014391213785e-05, + "loss": 0.1098, + "step": 13720 + }, + { + "epoch": 1.2999431925771634, + "grad_norm": 0.29103124141693115, + "learning_rate": 1.4800227229691349e-05, + "loss": 0.1063, + "step": 13730 + }, + { + "epoch": 1.3008899829577731, + "grad_norm": 0.5688313245773315, + "learning_rate": 1.4796440068168909e-05, + "loss": 0.1125, + "step": 13740 + }, + { + "epoch": 1.3018367733383829, + "grad_norm": 0.4438546597957611, + "learning_rate": 1.4792652906646469e-05, + "loss": 0.1085, + "step": 13750 + }, + { + "epoch": 1.3027835637189926, + "grad_norm": 0.39846259355545044, + "learning_rate": 1.478886574512403e-05, + "loss": 0.1084, + "step": 13760 + }, + { + "epoch": 1.3037303540996024, + "grad_norm": 0.362238347530365, + "learning_rate": 1.478507858360159e-05, + "loss": 0.1084, + "step": 13770 + }, + { + "epoch": 1.304677144480212, + "grad_norm": 0.4283710718154907, + "learning_rate": 1.4781291422079154e-05, + "loss": 0.1117, + "step": 13780 + }, + { + "epoch": 1.3056239348608218, + "grad_norm": 0.2781156301498413, + "learning_rate": 1.4777504260556714e-05, + "loss": 0.1013, + "step": 13790 + }, + { + "epoch": 1.3065707252414316, + "grad_norm": 0.31439098715782166, + "learning_rate": 1.4773717099034274e-05, + "loss": 0.1084, + "step": 13800 + }, + { + "epoch": 1.3075175156220413, + "grad_norm": 0.4003913998603821, + "learning_rate": 1.4769929937511836e-05, + "loss": 0.1096, + "step": 13810 + }, + { + "epoch": 1.308464306002651, + "grad_norm": 0.39215022325515747, + "learning_rate": 1.4766142775989396e-05, + "loss": 0.1009, + "step": 13820 + }, + { + "epoch": 1.3094110963832608, + "grad_norm": 0.5251069068908691, + "learning_rate": 1.476235561446696e-05, + "loss": 0.11, + "step": 13830 + }, + { + "epoch": 1.3103578867638705, + "grad_norm": 0.4530577063560486, + "learning_rate": 1.475856845294452e-05, + "loss": 0.104, + "step": 13840 + }, + { + "epoch": 1.3113046771444803, + "grad_norm": 0.3963939845561981, + "learning_rate": 1.475478129142208e-05, + "loss": 0.1095, + "step": 13850 + }, + { + "epoch": 1.31225146752509, + "grad_norm": 0.2831088602542877, + "learning_rate": 1.4750994129899641e-05, + "loss": 0.0973, + "step": 13860 + }, + { + "epoch": 1.3131982579056998, + "grad_norm": 0.3418594002723694, + "learning_rate": 1.4747206968377203e-05, + "loss": 0.099, + "step": 13870 + }, + { + "epoch": 1.3141450482863095, + "grad_norm": 0.35267600417137146, + "learning_rate": 1.4743419806854765e-05, + "loss": 0.106, + "step": 13880 + }, + { + "epoch": 1.3150918386669193, + "grad_norm": 0.35564425587654114, + "learning_rate": 1.4739632645332325e-05, + "loss": 0.121, + "step": 13890 + }, + { + "epoch": 1.316038629047529, + "grad_norm": 0.37613263726234436, + "learning_rate": 1.4735845483809885e-05, + "loss": 0.1255, + "step": 13900 + }, + { + "epoch": 1.3169854194281387, + "grad_norm": 0.3647848665714264, + "learning_rate": 1.4732058322287445e-05, + "loss": 0.1113, + "step": 13910 + }, + { + "epoch": 1.3179322098087485, + "grad_norm": 0.4477173686027527, + "learning_rate": 1.4728271160765008e-05, + "loss": 0.1099, + "step": 13920 + }, + { + "epoch": 1.318879000189358, + "grad_norm": 0.3369513154029846, + "learning_rate": 1.4724483999242569e-05, + "loss": 0.108, + "step": 13930 + }, + { + "epoch": 1.3198257905699677, + "grad_norm": 0.32130277156829834, + "learning_rate": 1.472069683772013e-05, + "loss": 0.1066, + "step": 13940 + }, + { + "epoch": 1.3207725809505775, + "grad_norm": 0.3958997428417206, + "learning_rate": 1.471690967619769e-05, + "loss": 0.1084, + "step": 13950 + }, + { + "epoch": 1.3217193713311872, + "grad_norm": 0.27947115898132324, + "learning_rate": 1.4713122514675254e-05, + "loss": 0.1051, + "step": 13960 + }, + { + "epoch": 1.322666161711797, + "grad_norm": 0.2896656095981598, + "learning_rate": 1.4709335353152814e-05, + "loss": 0.0985, + "step": 13970 + }, + { + "epoch": 1.3236129520924067, + "grad_norm": 0.3368990123271942, + "learning_rate": 1.4705548191630374e-05, + "loss": 0.1037, + "step": 13980 + }, + { + "epoch": 1.3245597424730164, + "grad_norm": 0.4216310977935791, + "learning_rate": 1.4701761030107936e-05, + "loss": 0.1084, + "step": 13990 + }, + { + "epoch": 1.3255065328536262, + "grad_norm": 0.29863274097442627, + "learning_rate": 1.4697973868585496e-05, + "loss": 0.1108, + "step": 14000 + }, + { + "epoch": 1.326453323234236, + "grad_norm": 0.3625233769416809, + "learning_rate": 1.4694186707063059e-05, + "loss": 0.1151, + "step": 14010 + }, + { + "epoch": 1.3274001136148457, + "grad_norm": 0.3410845696926117, + "learning_rate": 1.4690399545540619e-05, + "loss": 0.1129, + "step": 14020 + }, + { + "epoch": 1.3283469039954554, + "grad_norm": 0.4437841773033142, + "learning_rate": 1.468661238401818e-05, + "loss": 0.112, + "step": 14030 + }, + { + "epoch": 1.3292936943760651, + "grad_norm": 0.3411904573440552, + "learning_rate": 1.4682825222495741e-05, + "loss": 0.1099, + "step": 14040 + }, + { + "epoch": 1.330240484756675, + "grad_norm": 0.36379116773605347, + "learning_rate": 1.4679038060973301e-05, + "loss": 0.1121, + "step": 14050 + }, + { + "epoch": 1.3311872751372846, + "grad_norm": 0.36595743894577026, + "learning_rate": 1.4675250899450864e-05, + "loss": 0.1061, + "step": 14060 + }, + { + "epoch": 1.3321340655178944, + "grad_norm": 0.3377189040184021, + "learning_rate": 1.4671463737928424e-05, + "loss": 0.1058, + "step": 14070 + }, + { + "epoch": 1.3330808558985041, + "grad_norm": 0.321304589509964, + "learning_rate": 1.4667676576405985e-05, + "loss": 0.116, + "step": 14080 + }, + { + "epoch": 1.3340276462791139, + "grad_norm": 0.3352893590927124, + "learning_rate": 1.4663889414883545e-05, + "loss": 0.1121, + "step": 14090 + }, + { + "epoch": 1.3349744366597236, + "grad_norm": 0.3742593228816986, + "learning_rate": 1.4660102253361108e-05, + "loss": 0.1076, + "step": 14100 + }, + { + "epoch": 1.3359212270403333, + "grad_norm": 0.2847396731376648, + "learning_rate": 1.4656315091838668e-05, + "loss": 0.1029, + "step": 14110 + }, + { + "epoch": 1.336868017420943, + "grad_norm": 0.36076590418815613, + "learning_rate": 1.465252793031623e-05, + "loss": 0.0999, + "step": 14120 + }, + { + "epoch": 1.3378148078015528, + "grad_norm": 0.3037276566028595, + "learning_rate": 1.464874076879379e-05, + "loss": 0.107, + "step": 14130 + }, + { + "epoch": 1.3387615981821623, + "grad_norm": 0.41113731265068054, + "learning_rate": 1.464495360727135e-05, + "loss": 0.1035, + "step": 14140 + }, + { + "epoch": 1.339708388562772, + "grad_norm": 0.3692213296890259, + "learning_rate": 1.4641166445748913e-05, + "loss": 0.1125, + "step": 14150 + }, + { + "epoch": 1.3406551789433818, + "grad_norm": 0.31594541668891907, + "learning_rate": 1.4637379284226473e-05, + "loss": 0.1016, + "step": 14160 + }, + { + "epoch": 1.3416019693239916, + "grad_norm": 0.3881237506866455, + "learning_rate": 1.4633592122704035e-05, + "loss": 0.1063, + "step": 14170 + }, + { + "epoch": 1.3425487597046013, + "grad_norm": 0.34778666496276855, + "learning_rate": 1.4629804961181595e-05, + "loss": 0.1044, + "step": 14180 + }, + { + "epoch": 1.343495550085211, + "grad_norm": 0.38068491220474243, + "learning_rate": 1.4626017799659155e-05, + "loss": 0.1088, + "step": 14190 + }, + { + "epoch": 1.3444423404658208, + "grad_norm": 0.379708856344223, + "learning_rate": 1.4622230638136719e-05, + "loss": 0.1065, + "step": 14200 + }, + { + "epoch": 1.3453891308464305, + "grad_norm": 0.47989198565483093, + "learning_rate": 1.4618443476614279e-05, + "loss": 0.1083, + "step": 14210 + }, + { + "epoch": 1.3463359212270403, + "grad_norm": 0.3261682689189911, + "learning_rate": 1.461465631509184e-05, + "loss": 0.1131, + "step": 14220 + }, + { + "epoch": 1.34728271160765, + "grad_norm": 0.27690833806991577, + "learning_rate": 1.46108691535694e-05, + "loss": 0.1107, + "step": 14230 + }, + { + "epoch": 1.3482295019882597, + "grad_norm": 0.3644765317440033, + "learning_rate": 1.4607081992046962e-05, + "loss": 0.1078, + "step": 14240 + }, + { + "epoch": 1.3491762923688695, + "grad_norm": 0.40070703625679016, + "learning_rate": 1.4603294830524524e-05, + "loss": 0.101, + "step": 14250 + }, + { + "epoch": 1.3501230827494792, + "grad_norm": 0.4271068871021271, + "learning_rate": 1.4599507669002084e-05, + "loss": 0.0979, + "step": 14260 + }, + { + "epoch": 1.351069873130089, + "grad_norm": 0.3761310875415802, + "learning_rate": 1.4595720507479644e-05, + "loss": 0.1151, + "step": 14270 + }, + { + "epoch": 1.3520166635106987, + "grad_norm": 0.4187332093715668, + "learning_rate": 1.4591933345957206e-05, + "loss": 0.1112, + "step": 14280 + }, + { + "epoch": 1.3529634538913085, + "grad_norm": 0.2853829264640808, + "learning_rate": 1.4588146184434768e-05, + "loss": 0.1051, + "step": 14290 + }, + { + "epoch": 1.3539102442719182, + "grad_norm": 0.4218035638332367, + "learning_rate": 1.458435902291233e-05, + "loss": 0.1153, + "step": 14300 + }, + { + "epoch": 1.354857034652528, + "grad_norm": 0.46550899744033813, + "learning_rate": 1.458057186138989e-05, + "loss": 0.1111, + "step": 14310 + }, + { + "epoch": 1.3558038250331377, + "grad_norm": 0.3463427722454071, + "learning_rate": 1.457678469986745e-05, + "loss": 0.1056, + "step": 14320 + }, + { + "epoch": 1.3567506154137474, + "grad_norm": 0.35000738501548767, + "learning_rate": 1.4572997538345011e-05, + "loss": 0.1043, + "step": 14330 + }, + { + "epoch": 1.3576974057943572, + "grad_norm": 0.3538738191127777, + "learning_rate": 1.4569210376822573e-05, + "loss": 0.1177, + "step": 14340 + }, + { + "epoch": 1.358644196174967, + "grad_norm": 0.33624202013015747, + "learning_rate": 1.4565423215300135e-05, + "loss": 0.1143, + "step": 14350 + }, + { + "epoch": 1.3595909865555766, + "grad_norm": 0.3360025882720947, + "learning_rate": 1.4561636053777695e-05, + "loss": 0.1113, + "step": 14360 + }, + { + "epoch": 1.3605377769361864, + "grad_norm": 0.400475412607193, + "learning_rate": 1.4557848892255255e-05, + "loss": 0.106, + "step": 14370 + }, + { + "epoch": 1.3614845673167961, + "grad_norm": 0.318945050239563, + "learning_rate": 1.4554061730732818e-05, + "loss": 0.123, + "step": 14380 + }, + { + "epoch": 1.3624313576974059, + "grad_norm": 0.3328794836997986, + "learning_rate": 1.4550274569210378e-05, + "loss": 0.1189, + "step": 14390 + }, + { + "epoch": 1.3633781480780156, + "grad_norm": 0.3011545240879059, + "learning_rate": 1.454648740768794e-05, + "loss": 0.1053, + "step": 14400 + }, + { + "epoch": 1.3643249384586253, + "grad_norm": 0.3530985116958618, + "learning_rate": 1.45427002461655e-05, + "loss": 0.1049, + "step": 14410 + }, + { + "epoch": 1.365271728839235, + "grad_norm": 0.31439208984375, + "learning_rate": 1.453891308464306e-05, + "loss": 0.1172, + "step": 14420 + }, + { + "epoch": 1.3662185192198448, + "grad_norm": 0.2875416874885559, + "learning_rate": 1.4535125923120624e-05, + "loss": 0.1038, + "step": 14430 + }, + { + "epoch": 1.3671653096004546, + "grad_norm": 0.36960655450820923, + "learning_rate": 1.4531338761598184e-05, + "loss": 0.1199, + "step": 14440 + }, + { + "epoch": 1.3681120999810643, + "grad_norm": 0.34971728920936584, + "learning_rate": 1.4527551600075744e-05, + "loss": 0.1161, + "step": 14450 + }, + { + "epoch": 1.369058890361674, + "grad_norm": 0.31043538451194763, + "learning_rate": 1.4523764438553305e-05, + "loss": 0.1091, + "step": 14460 + }, + { + "epoch": 1.3700056807422838, + "grad_norm": 0.3760201632976532, + "learning_rate": 1.4519977277030865e-05, + "loss": 0.1136, + "step": 14470 + }, + { + "epoch": 1.3709524711228933, + "grad_norm": 0.3248160779476166, + "learning_rate": 1.4516190115508429e-05, + "loss": 0.1093, + "step": 14480 + }, + { + "epoch": 1.371899261503503, + "grad_norm": 0.4631254971027374, + "learning_rate": 1.4512402953985989e-05, + "loss": 0.1025, + "step": 14490 + }, + { + "epoch": 1.3728460518841128, + "grad_norm": 0.31244543194770813, + "learning_rate": 1.4508615792463549e-05, + "loss": 0.1068, + "step": 14500 + }, + { + "epoch": 1.3737928422647225, + "grad_norm": 0.35992249846458435, + "learning_rate": 1.450482863094111e-05, + "loss": 0.1116, + "step": 14510 + }, + { + "epoch": 1.3747396326453323, + "grad_norm": 0.4582676291465759, + "learning_rate": 1.4501041469418672e-05, + "loss": 0.1007, + "step": 14520 + }, + { + "epoch": 1.375686423025942, + "grad_norm": 0.3570781648159027, + "learning_rate": 1.4497254307896234e-05, + "loss": 0.1125, + "step": 14530 + }, + { + "epoch": 1.3766332134065518, + "grad_norm": 0.29250195622444153, + "learning_rate": 1.4493467146373794e-05, + "loss": 0.1095, + "step": 14540 + }, + { + "epoch": 1.3775800037871615, + "grad_norm": 0.4136636257171631, + "learning_rate": 1.4489679984851354e-05, + "loss": 0.1174, + "step": 14550 + }, + { + "epoch": 1.3785267941677712, + "grad_norm": 0.3163720369338989, + "learning_rate": 1.4485892823328916e-05, + "loss": 0.1028, + "step": 14560 + }, + { + "epoch": 1.379473584548381, + "grad_norm": 0.43609631061553955, + "learning_rate": 1.4482105661806478e-05, + "loss": 0.1098, + "step": 14570 + }, + { + "epoch": 1.3804203749289907, + "grad_norm": 0.364156037569046, + "learning_rate": 1.447831850028404e-05, + "loss": 0.111, + "step": 14580 + }, + { + "epoch": 1.3813671653096005, + "grad_norm": 0.3868848383426666, + "learning_rate": 1.44745313387616e-05, + "loss": 0.1288, + "step": 14590 + }, + { + "epoch": 1.3823139556902102, + "grad_norm": 0.30255648493766785, + "learning_rate": 1.447074417723916e-05, + "loss": 0.1122, + "step": 14600 + }, + { + "epoch": 1.38326074607082, + "grad_norm": 0.4294314682483673, + "learning_rate": 1.4466957015716721e-05, + "loss": 0.1088, + "step": 14610 + }, + { + "epoch": 1.3842075364514297, + "grad_norm": 0.3444744050502777, + "learning_rate": 1.4463169854194283e-05, + "loss": 0.106, + "step": 14620 + }, + { + "epoch": 1.3851543268320394, + "grad_norm": 0.3731893002986908, + "learning_rate": 1.4459382692671843e-05, + "loss": 0.1074, + "step": 14630 + }, + { + "epoch": 1.3861011172126492, + "grad_norm": 0.30317267775535583, + "learning_rate": 1.4455595531149405e-05, + "loss": 0.1052, + "step": 14640 + }, + { + "epoch": 1.387047907593259, + "grad_norm": 0.3629275858402252, + "learning_rate": 1.4451808369626965e-05, + "loss": 0.1024, + "step": 14650 + }, + { + "epoch": 1.3879946979738687, + "grad_norm": 0.3758595883846283, + "learning_rate": 1.4448021208104528e-05, + "loss": 0.1152, + "step": 14660 + }, + { + "epoch": 1.3889414883544784, + "grad_norm": 0.3961845636367798, + "learning_rate": 1.4444234046582088e-05, + "loss": 0.1112, + "step": 14670 + }, + { + "epoch": 1.3898882787350881, + "grad_norm": 0.3658260405063629, + "learning_rate": 1.4440446885059648e-05, + "loss": 0.1094, + "step": 14680 + }, + { + "epoch": 1.3908350691156977, + "grad_norm": 0.3775550425052643, + "learning_rate": 1.443665972353721e-05, + "loss": 0.1136, + "step": 14690 + }, + { + "epoch": 1.3917818594963074, + "grad_norm": 0.30337873101234436, + "learning_rate": 1.443287256201477e-05, + "loss": 0.1073, + "step": 14700 + }, + { + "epoch": 1.3927286498769171, + "grad_norm": 0.36682024598121643, + "learning_rate": 1.4429085400492334e-05, + "loss": 0.1066, + "step": 14710 + }, + { + "epoch": 1.3936754402575269, + "grad_norm": 0.5544394254684448, + "learning_rate": 1.4425298238969894e-05, + "loss": 0.1134, + "step": 14720 + }, + { + "epoch": 1.3946222306381366, + "grad_norm": 0.3843153715133667, + "learning_rate": 1.4421511077447454e-05, + "loss": 0.1048, + "step": 14730 + }, + { + "epoch": 1.3955690210187464, + "grad_norm": 0.40148961544036865, + "learning_rate": 1.4417723915925016e-05, + "loss": 0.1068, + "step": 14740 + }, + { + "epoch": 1.396515811399356, + "grad_norm": 0.30784979462623596, + "learning_rate": 1.4413936754402576e-05, + "loss": 0.1017, + "step": 14750 + }, + { + "epoch": 1.3974626017799658, + "grad_norm": 0.30099502205848694, + "learning_rate": 1.4410149592880139e-05, + "loss": 0.1049, + "step": 14760 + }, + { + "epoch": 1.3984093921605756, + "grad_norm": 0.30565211176872253, + "learning_rate": 1.4406362431357699e-05, + "loss": 0.1109, + "step": 14770 + }, + { + "epoch": 1.3993561825411853, + "grad_norm": 0.33714359998703003, + "learning_rate": 1.4402575269835259e-05, + "loss": 0.1105, + "step": 14780 + }, + { + "epoch": 1.400302972921795, + "grad_norm": 0.420604407787323, + "learning_rate": 1.4398788108312821e-05, + "loss": 0.0969, + "step": 14790 + }, + { + "epoch": 1.4012497633024048, + "grad_norm": 0.32643163204193115, + "learning_rate": 1.4395000946790383e-05, + "loss": 0.0992, + "step": 14800 + }, + { + "epoch": 1.4021965536830145, + "grad_norm": 0.4230499267578125, + "learning_rate": 1.4391213785267943e-05, + "loss": 0.1153, + "step": 14810 + }, + { + "epoch": 1.4031433440636243, + "grad_norm": 0.4465436637401581, + "learning_rate": 1.4387426623745504e-05, + "loss": 0.1096, + "step": 14820 + }, + { + "epoch": 1.404090134444234, + "grad_norm": 0.37599942088127136, + "learning_rate": 1.4383639462223064e-05, + "loss": 0.1142, + "step": 14830 + }, + { + "epoch": 1.4050369248248438, + "grad_norm": 0.340032160282135, + "learning_rate": 1.4379852300700625e-05, + "loss": 0.106, + "step": 14840 + }, + { + "epoch": 1.4059837152054535, + "grad_norm": 0.35222116112709045, + "learning_rate": 1.4376065139178188e-05, + "loss": 0.1116, + "step": 14850 + }, + { + "epoch": 1.4069305055860633, + "grad_norm": 0.3487146496772766, + "learning_rate": 1.4372277977655748e-05, + "loss": 0.1183, + "step": 14860 + }, + { + "epoch": 1.407877295966673, + "grad_norm": 0.3430015742778778, + "learning_rate": 1.436849081613331e-05, + "loss": 0.1118, + "step": 14870 + }, + { + "epoch": 1.4088240863472827, + "grad_norm": 0.2878393828868866, + "learning_rate": 1.436470365461087e-05, + "loss": 0.106, + "step": 14880 + }, + { + "epoch": 1.4097708767278925, + "grad_norm": 0.31863144040107727, + "learning_rate": 1.436091649308843e-05, + "loss": 0.1075, + "step": 14890 + }, + { + "epoch": 1.4107176671085022, + "grad_norm": 0.3831043243408203, + "learning_rate": 1.4357129331565993e-05, + "loss": 0.1032, + "step": 14900 + }, + { + "epoch": 1.411664457489112, + "grad_norm": 0.39178845286369324, + "learning_rate": 1.4353342170043553e-05, + "loss": 0.1145, + "step": 14910 + }, + { + "epoch": 1.4126112478697217, + "grad_norm": 0.34716176986694336, + "learning_rate": 1.4349555008521115e-05, + "loss": 0.1067, + "step": 14920 + }, + { + "epoch": 1.4135580382503314, + "grad_norm": 0.30537331104278564, + "learning_rate": 1.4345767846998675e-05, + "loss": 0.1123, + "step": 14930 + }, + { + "epoch": 1.4145048286309412, + "grad_norm": 0.4346376657485962, + "learning_rate": 1.4341980685476239e-05, + "loss": 0.1033, + "step": 14940 + }, + { + "epoch": 1.415451619011551, + "grad_norm": 0.33927392959594727, + "learning_rate": 1.4338193523953799e-05, + "loss": 0.1059, + "step": 14950 + }, + { + "epoch": 1.4163984093921607, + "grad_norm": 0.3123628795146942, + "learning_rate": 1.4334406362431359e-05, + "loss": 0.1012, + "step": 14960 + }, + { + "epoch": 1.4173451997727704, + "grad_norm": 0.36858904361724854, + "learning_rate": 1.433061920090892e-05, + "loss": 0.1102, + "step": 14970 + }, + { + "epoch": 1.4182919901533801, + "grad_norm": 0.29052573442459106, + "learning_rate": 1.432683203938648e-05, + "loss": 0.1007, + "step": 14980 + }, + { + "epoch": 1.4192387805339899, + "grad_norm": 0.41456088423728943, + "learning_rate": 1.4323044877864042e-05, + "loss": 0.1134, + "step": 14990 + }, + { + "epoch": 1.4201855709145996, + "grad_norm": 0.333921879529953, + "learning_rate": 1.4319257716341604e-05, + "loss": 0.1013, + "step": 15000 + }, + { + "epoch": 1.4211323612952094, + "grad_norm": 0.36147958040237427, + "learning_rate": 1.4315470554819164e-05, + "loss": 0.1115, + "step": 15010 + }, + { + "epoch": 1.422079151675819, + "grad_norm": 0.37708374857902527, + "learning_rate": 1.4311683393296724e-05, + "loss": 0.1092, + "step": 15020 + }, + { + "epoch": 1.4230259420564286, + "grad_norm": 0.3549210727214813, + "learning_rate": 1.4307896231774286e-05, + "loss": 0.1065, + "step": 15030 + }, + { + "epoch": 1.4239727324370384, + "grad_norm": 0.35664400458335876, + "learning_rate": 1.4304109070251848e-05, + "loss": 0.1124, + "step": 15040 + }, + { + "epoch": 1.424919522817648, + "grad_norm": 0.33038046956062317, + "learning_rate": 1.430032190872941e-05, + "loss": 0.0909, + "step": 15050 + }, + { + "epoch": 1.4258663131982579, + "grad_norm": 0.46176019310951233, + "learning_rate": 1.429653474720697e-05, + "loss": 0.1019, + "step": 15060 + }, + { + "epoch": 1.4268131035788676, + "grad_norm": 0.4170028269290924, + "learning_rate": 1.429274758568453e-05, + "loss": 0.109, + "step": 15070 + }, + { + "epoch": 1.4277598939594773, + "grad_norm": 0.42809316515922546, + "learning_rate": 1.4288960424162093e-05, + "loss": 0.1107, + "step": 15080 + }, + { + "epoch": 1.428706684340087, + "grad_norm": 0.2779739499092102, + "learning_rate": 1.4285173262639653e-05, + "loss": 0.1163, + "step": 15090 + }, + { + "epoch": 1.4296534747206968, + "grad_norm": 0.37938955426216125, + "learning_rate": 1.4281386101117215e-05, + "loss": 0.1082, + "step": 15100 + }, + { + "epoch": 1.4306002651013066, + "grad_norm": 0.3991941511631012, + "learning_rate": 1.4277598939594775e-05, + "loss": 0.1017, + "step": 15110 + }, + { + "epoch": 1.4315470554819163, + "grad_norm": 0.31340453028678894, + "learning_rate": 1.4273811778072335e-05, + "loss": 0.1174, + "step": 15120 + }, + { + "epoch": 1.432493845862526, + "grad_norm": 0.5108481645584106, + "learning_rate": 1.4270024616549898e-05, + "loss": 0.1105, + "step": 15130 + }, + { + "epoch": 1.4334406362431358, + "grad_norm": 0.4092690944671631, + "learning_rate": 1.4266237455027458e-05, + "loss": 0.1144, + "step": 15140 + }, + { + "epoch": 1.4343874266237455, + "grad_norm": 0.32388442754745483, + "learning_rate": 1.426245029350502e-05, + "loss": 0.1156, + "step": 15150 + }, + { + "epoch": 1.4353342170043553, + "grad_norm": 0.3596822917461395, + "learning_rate": 1.425866313198258e-05, + "loss": 0.1048, + "step": 15160 + }, + { + "epoch": 1.436281007384965, + "grad_norm": 0.3799607455730438, + "learning_rate": 1.425487597046014e-05, + "loss": 0.1118, + "step": 15170 + }, + { + "epoch": 1.4372277977655747, + "grad_norm": 0.3617545962333679, + "learning_rate": 1.4251088808937703e-05, + "loss": 0.1067, + "step": 15180 + }, + { + "epoch": 1.4381745881461845, + "grad_norm": 0.37102869153022766, + "learning_rate": 1.4247301647415264e-05, + "loss": 0.1164, + "step": 15190 + }, + { + "epoch": 1.4391213785267942, + "grad_norm": 0.347303181886673, + "learning_rate": 1.4243514485892824e-05, + "loss": 0.1087, + "step": 15200 + }, + { + "epoch": 1.440068168907404, + "grad_norm": 0.3193877041339874, + "learning_rate": 1.4239727324370385e-05, + "loss": 0.1154, + "step": 15210 + }, + { + "epoch": 1.4410149592880137, + "grad_norm": 0.502229630947113, + "learning_rate": 1.4235940162847947e-05, + "loss": 0.1136, + "step": 15220 + }, + { + "epoch": 1.4419617496686234, + "grad_norm": 0.4352882504463196, + "learning_rate": 1.4232153001325509e-05, + "loss": 0.1054, + "step": 15230 + }, + { + "epoch": 1.442908540049233, + "grad_norm": 0.3593296706676483, + "learning_rate": 1.4228365839803069e-05, + "loss": 0.1083, + "step": 15240 + }, + { + "epoch": 1.4438553304298427, + "grad_norm": 0.3581804931163788, + "learning_rate": 1.4224578678280629e-05, + "loss": 0.1136, + "step": 15250 + }, + { + "epoch": 1.4448021208104525, + "grad_norm": 0.272034615278244, + "learning_rate": 1.422079151675819e-05, + "loss": 0.0999, + "step": 15260 + }, + { + "epoch": 1.4457489111910622, + "grad_norm": 0.35017022490501404, + "learning_rate": 1.4217004355235752e-05, + "loss": 0.1154, + "step": 15270 + }, + { + "epoch": 1.446695701571672, + "grad_norm": 0.3309600055217743, + "learning_rate": 1.4213217193713314e-05, + "loss": 0.1033, + "step": 15280 + }, + { + "epoch": 1.4476424919522817, + "grad_norm": 0.3215767443180084, + "learning_rate": 1.4209430032190874e-05, + "loss": 0.1088, + "step": 15290 + }, + { + "epoch": 1.4485892823328914, + "grad_norm": 0.3672831356525421, + "learning_rate": 1.4205642870668434e-05, + "loss": 0.1059, + "step": 15300 + }, + { + "epoch": 1.4495360727135012, + "grad_norm": 0.3399512767791748, + "learning_rate": 1.4201855709145996e-05, + "loss": 0.1054, + "step": 15310 + }, + { + "epoch": 1.450482863094111, + "grad_norm": 0.2895975708961487, + "learning_rate": 1.4198068547623558e-05, + "loss": 0.11, + "step": 15320 + }, + { + "epoch": 1.4514296534747206, + "grad_norm": 0.38185954093933105, + "learning_rate": 1.419428138610112e-05, + "loss": 0.1186, + "step": 15330 + }, + { + "epoch": 1.4523764438553304, + "grad_norm": 0.3593434691429138, + "learning_rate": 1.419049422457868e-05, + "loss": 0.0985, + "step": 15340 + }, + { + "epoch": 1.4533232342359401, + "grad_norm": 0.38047701120376587, + "learning_rate": 1.418670706305624e-05, + "loss": 0.1052, + "step": 15350 + }, + { + "epoch": 1.4542700246165499, + "grad_norm": 0.34423530101776123, + "learning_rate": 1.4182919901533803e-05, + "loss": 0.1071, + "step": 15360 + }, + { + "epoch": 1.4552168149971596, + "grad_norm": 0.27197402715682983, + "learning_rate": 1.4179132740011363e-05, + "loss": 0.1089, + "step": 15370 + }, + { + "epoch": 1.4561636053777693, + "grad_norm": 0.3939764201641083, + "learning_rate": 1.4175345578488923e-05, + "loss": 0.1069, + "step": 15380 + }, + { + "epoch": 1.457110395758379, + "grad_norm": 0.30846384167671204, + "learning_rate": 1.4171558416966485e-05, + "loss": 0.1062, + "step": 15390 + }, + { + "epoch": 1.4580571861389888, + "grad_norm": 0.30915161967277527, + "learning_rate": 1.4167771255444045e-05, + "loss": 0.0926, + "step": 15400 + }, + { + "epoch": 1.4590039765195986, + "grad_norm": 0.33449786901474, + "learning_rate": 1.4163984093921608e-05, + "loss": 0.1106, + "step": 15410 + }, + { + "epoch": 1.4599507669002083, + "grad_norm": 0.27424463629722595, + "learning_rate": 1.4160196932399168e-05, + "loss": 0.1045, + "step": 15420 + }, + { + "epoch": 1.460897557280818, + "grad_norm": 0.2804360091686249, + "learning_rate": 1.4156409770876728e-05, + "loss": 0.1177, + "step": 15430 + }, + { + "epoch": 1.4618443476614278, + "grad_norm": 0.35507193207740784, + "learning_rate": 1.415262260935429e-05, + "loss": 0.1058, + "step": 15440 + }, + { + "epoch": 1.4627911380420375, + "grad_norm": 0.3734015226364136, + "learning_rate": 1.414883544783185e-05, + "loss": 0.1046, + "step": 15450 + }, + { + "epoch": 1.4637379284226473, + "grad_norm": 0.29109764099121094, + "learning_rate": 1.4145048286309414e-05, + "loss": 0.1055, + "step": 15460 + }, + { + "epoch": 1.464684718803257, + "grad_norm": 0.3349165916442871, + "learning_rate": 1.4141261124786974e-05, + "loss": 0.1029, + "step": 15470 + }, + { + "epoch": 1.4656315091838668, + "grad_norm": 0.3439183235168457, + "learning_rate": 1.4137473963264534e-05, + "loss": 0.1088, + "step": 15480 + }, + { + "epoch": 1.4665782995644765, + "grad_norm": 0.31033629179000854, + "learning_rate": 1.4133686801742095e-05, + "loss": 0.1001, + "step": 15490 + }, + { + "epoch": 1.4675250899450862, + "grad_norm": 0.3343507945537567, + "learning_rate": 1.4129899640219657e-05, + "loss": 0.1085, + "step": 15500 + }, + { + "epoch": 1.468471880325696, + "grad_norm": 0.3156386911869049, + "learning_rate": 1.4126112478697219e-05, + "loss": 0.1126, + "step": 15510 + }, + { + "epoch": 1.4694186707063057, + "grad_norm": 0.3920190632343292, + "learning_rate": 1.4122325317174779e-05, + "loss": 0.1108, + "step": 15520 + }, + { + "epoch": 1.4703654610869155, + "grad_norm": 0.2956010699272156, + "learning_rate": 1.4118538155652339e-05, + "loss": 0.0993, + "step": 15530 + }, + { + "epoch": 1.4713122514675252, + "grad_norm": 0.363127201795578, + "learning_rate": 1.4114750994129899e-05, + "loss": 0.0994, + "step": 15540 + }, + { + "epoch": 1.472259041848135, + "grad_norm": 0.3261021673679352, + "learning_rate": 1.4110963832607463e-05, + "loss": 0.1056, + "step": 15550 + }, + { + "epoch": 1.4732058322287447, + "grad_norm": 0.37980979681015015, + "learning_rate": 1.4107176671085023e-05, + "loss": 0.1151, + "step": 15560 + }, + { + "epoch": 1.4741526226093544, + "grad_norm": 0.35632893443107605, + "learning_rate": 1.4103389509562584e-05, + "loss": 0.1067, + "step": 15570 + }, + { + "epoch": 1.475099412989964, + "grad_norm": 0.3723986744880676, + "learning_rate": 1.4099602348040144e-05, + "loss": 0.1052, + "step": 15580 + }, + { + "epoch": 1.4760462033705737, + "grad_norm": 0.3155215382575989, + "learning_rate": 1.4095815186517704e-05, + "loss": 0.1047, + "step": 15590 + }, + { + "epoch": 1.4769929937511834, + "grad_norm": 0.36644500494003296, + "learning_rate": 1.4092028024995268e-05, + "loss": 0.1038, + "step": 15600 + }, + { + "epoch": 1.4779397841317932, + "grad_norm": 0.3605925738811493, + "learning_rate": 1.4088240863472828e-05, + "loss": 0.111, + "step": 15610 + }, + { + "epoch": 1.478886574512403, + "grad_norm": 0.39198005199432373, + "learning_rate": 1.408445370195039e-05, + "loss": 0.1148, + "step": 15620 + }, + { + "epoch": 1.4798333648930126, + "grad_norm": 0.36386531591415405, + "learning_rate": 1.408066654042795e-05, + "loss": 0.1043, + "step": 15630 + }, + { + "epoch": 1.4807801552736224, + "grad_norm": 0.44443851709365845, + "learning_rate": 1.4076879378905513e-05, + "loss": 0.1029, + "step": 15640 + }, + { + "epoch": 1.4817269456542321, + "grad_norm": 0.4252612590789795, + "learning_rate": 1.4073092217383073e-05, + "loss": 0.1065, + "step": 15650 + }, + { + "epoch": 1.4826737360348419, + "grad_norm": 0.3278532028198242, + "learning_rate": 1.4069305055860633e-05, + "loss": 0.1099, + "step": 15660 + }, + { + "epoch": 1.4836205264154516, + "grad_norm": 0.30991101264953613, + "learning_rate": 1.4065517894338195e-05, + "loss": 0.1065, + "step": 15670 + }, + { + "epoch": 1.4845673167960614, + "grad_norm": 0.39886489510536194, + "learning_rate": 1.4061730732815755e-05, + "loss": 0.11, + "step": 15680 + }, + { + "epoch": 1.485514107176671, + "grad_norm": 0.3394196331501007, + "learning_rate": 1.4057943571293318e-05, + "loss": 0.1159, + "step": 15690 + }, + { + "epoch": 1.4864608975572808, + "grad_norm": 0.32270580530166626, + "learning_rate": 1.4054156409770879e-05, + "loss": 0.118, + "step": 15700 + }, + { + "epoch": 1.4874076879378906, + "grad_norm": 0.3170892000198364, + "learning_rate": 1.4050369248248439e-05, + "loss": 0.1064, + "step": 15710 + }, + { + "epoch": 1.4883544783185003, + "grad_norm": 0.4652809202671051, + "learning_rate": 1.4046582086725999e-05, + "loss": 0.1251, + "step": 15720 + }, + { + "epoch": 1.48930126869911, + "grad_norm": 0.34130120277404785, + "learning_rate": 1.404279492520356e-05, + "loss": 0.1066, + "step": 15730 + }, + { + "epoch": 1.4902480590797198, + "grad_norm": 0.34349262714385986, + "learning_rate": 1.4039007763681122e-05, + "loss": 0.1042, + "step": 15740 + }, + { + "epoch": 1.4911948494603295, + "grad_norm": 0.38822877407073975, + "learning_rate": 1.4035220602158684e-05, + "loss": 0.1116, + "step": 15750 + }, + { + "epoch": 1.4921416398409393, + "grad_norm": 0.31036290526390076, + "learning_rate": 1.4031433440636244e-05, + "loss": 0.0986, + "step": 15760 + }, + { + "epoch": 1.493088430221549, + "grad_norm": 0.33042672276496887, + "learning_rate": 1.4027646279113804e-05, + "loss": 0.1002, + "step": 15770 + }, + { + "epoch": 1.4940352206021588, + "grad_norm": 0.3388994038105011, + "learning_rate": 1.4023859117591367e-05, + "loss": 0.1034, + "step": 15780 + }, + { + "epoch": 1.4949820109827683, + "grad_norm": 0.44724178314208984, + "learning_rate": 1.4020071956068927e-05, + "loss": 0.1108, + "step": 15790 + }, + { + "epoch": 1.495928801363378, + "grad_norm": 0.358551949262619, + "learning_rate": 1.401628479454649e-05, + "loss": 0.1043, + "step": 15800 + }, + { + "epoch": 1.4968755917439878, + "grad_norm": 0.32363811135292053, + "learning_rate": 1.401249763302405e-05, + "loss": 0.098, + "step": 15810 + }, + { + "epoch": 1.4978223821245975, + "grad_norm": 0.3617938160896301, + "learning_rate": 1.400871047150161e-05, + "loss": 0.1137, + "step": 15820 + }, + { + "epoch": 1.4987691725052072, + "grad_norm": 0.33651456236839294, + "learning_rate": 1.4004923309979173e-05, + "loss": 0.1052, + "step": 15830 + }, + { + "epoch": 1.499715962885817, + "grad_norm": 0.3093239963054657, + "learning_rate": 1.4001136148456733e-05, + "loss": 0.106, + "step": 15840 + }, + { + "epoch": 1.5006627532664267, + "grad_norm": 0.36069008708000183, + "learning_rate": 1.3997348986934295e-05, + "loss": 0.1059, + "step": 15850 + }, + { + "epoch": 1.5016095436470365, + "grad_norm": 0.35160404443740845, + "learning_rate": 1.3993561825411855e-05, + "loss": 0.1072, + "step": 15860 + }, + { + "epoch": 1.5025563340276462, + "grad_norm": 0.3883945345878601, + "learning_rate": 1.3989774663889415e-05, + "loss": 0.106, + "step": 15870 + }, + { + "epoch": 1.503503124408256, + "grad_norm": 0.4188264012336731, + "learning_rate": 1.3985987502366978e-05, + "loss": 0.1135, + "step": 15880 + }, + { + "epoch": 1.5044499147888657, + "grad_norm": 0.3728765547275543, + "learning_rate": 1.3982200340844538e-05, + "loss": 0.1204, + "step": 15890 + }, + { + "epoch": 1.5053967051694754, + "grad_norm": 0.30158689618110657, + "learning_rate": 1.3978413179322098e-05, + "loss": 0.1085, + "step": 15900 + }, + { + "epoch": 1.5063434955500852, + "grad_norm": 0.30330297350883484, + "learning_rate": 1.397462601779966e-05, + "loss": 0.1121, + "step": 15910 + }, + { + "epoch": 1.507290285930695, + "grad_norm": 0.36662396788597107, + "learning_rate": 1.3970838856277222e-05, + "loss": 0.1103, + "step": 15920 + }, + { + "epoch": 1.5082370763113047, + "grad_norm": 0.4485093653202057, + "learning_rate": 1.3967051694754783e-05, + "loss": 0.1229, + "step": 15930 + }, + { + "epoch": 1.5091838666919144, + "grad_norm": 0.34482085704803467, + "learning_rate": 1.3963264533232343e-05, + "loss": 0.1104, + "step": 15940 + }, + { + "epoch": 1.5101306570725241, + "grad_norm": 0.5053472518920898, + "learning_rate": 1.3959477371709904e-05, + "loss": 0.1076, + "step": 15950 + }, + { + "epoch": 1.5110774474531339, + "grad_norm": 0.34632086753845215, + "learning_rate": 1.3955690210187465e-05, + "loss": 0.0972, + "step": 15960 + }, + { + "epoch": 1.5120242378337436, + "grad_norm": 0.3810611367225647, + "learning_rate": 1.3951903048665027e-05, + "loss": 0.1073, + "step": 15970 + }, + { + "epoch": 1.5129710282143534, + "grad_norm": 0.364725798368454, + "learning_rate": 1.3948115887142589e-05, + "loss": 0.1093, + "step": 15980 + }, + { + "epoch": 1.513917818594963, + "grad_norm": 0.32327204942703247, + "learning_rate": 1.3944328725620149e-05, + "loss": 0.1197, + "step": 15990 + }, + { + "epoch": 1.5148646089755728, + "grad_norm": 0.4384889602661133, + "learning_rate": 1.3940541564097709e-05, + "loss": 0.1057, + "step": 16000 + }, + { + "epoch": 1.5158113993561826, + "grad_norm": 0.29192763566970825, + "learning_rate": 1.393675440257527e-05, + "loss": 0.1043, + "step": 16010 + }, + { + "epoch": 1.5167581897367923, + "grad_norm": 0.37460416555404663, + "learning_rate": 1.3932967241052832e-05, + "loss": 0.1147, + "step": 16020 + }, + { + "epoch": 1.517704980117402, + "grad_norm": 0.29063111543655396, + "learning_rate": 1.3929180079530394e-05, + "loss": 0.0987, + "step": 16030 + }, + { + "epoch": 1.5186517704980118, + "grad_norm": 0.3550906777381897, + "learning_rate": 1.3925392918007954e-05, + "loss": 0.1092, + "step": 16040 + }, + { + "epoch": 1.5195985608786216, + "grad_norm": 0.3875524699687958, + "learning_rate": 1.3921605756485514e-05, + "loss": 0.1118, + "step": 16050 + }, + { + "epoch": 1.5205453512592313, + "grad_norm": 0.29305535554885864, + "learning_rate": 1.3917818594963078e-05, + "loss": 0.1153, + "step": 16060 + }, + { + "epoch": 1.521492141639841, + "grad_norm": 0.28514429926872253, + "learning_rate": 1.3914031433440638e-05, + "loss": 0.11, + "step": 16070 + }, + { + "epoch": 1.5224389320204508, + "grad_norm": 0.426382452249527, + "learning_rate": 1.3910244271918198e-05, + "loss": 0.1287, + "step": 16080 + }, + { + "epoch": 1.5233857224010605, + "grad_norm": 0.381534218788147, + "learning_rate": 1.390645711039576e-05, + "loss": 0.095, + "step": 16090 + }, + { + "epoch": 1.5243325127816703, + "grad_norm": 0.3806873559951782, + "learning_rate": 1.390266994887332e-05, + "loss": 0.1051, + "step": 16100 + }, + { + "epoch": 1.52527930316228, + "grad_norm": 0.29320812225341797, + "learning_rate": 1.3898882787350883e-05, + "loss": 0.109, + "step": 16110 + }, + { + "epoch": 1.5262260935428897, + "grad_norm": 0.32169806957244873, + "learning_rate": 1.3895095625828443e-05, + "loss": 0.0986, + "step": 16120 + }, + { + "epoch": 1.5271728839234995, + "grad_norm": 0.40018782019615173, + "learning_rate": 1.3891308464306003e-05, + "loss": 0.1125, + "step": 16130 + }, + { + "epoch": 1.5281196743041092, + "grad_norm": 0.3513498604297638, + "learning_rate": 1.3887521302783565e-05, + "loss": 0.1134, + "step": 16140 + }, + { + "epoch": 1.529066464684719, + "grad_norm": 0.3241359293460846, + "learning_rate": 1.3883734141261125e-05, + "loss": 0.1014, + "step": 16150 + }, + { + "epoch": 1.5300132550653285, + "grad_norm": 0.3017043173313141, + "learning_rate": 1.3879946979738688e-05, + "loss": 0.1074, + "step": 16160 + }, + { + "epoch": 1.5309600454459382, + "grad_norm": 0.3104637563228607, + "learning_rate": 1.3876159818216248e-05, + "loss": 0.105, + "step": 16170 + }, + { + "epoch": 1.531906835826548, + "grad_norm": 0.36349010467529297, + "learning_rate": 1.3872372656693808e-05, + "loss": 0.1064, + "step": 16180 + }, + { + "epoch": 1.5328536262071577, + "grad_norm": 0.35329166054725647, + "learning_rate": 1.386858549517137e-05, + "loss": 0.1091, + "step": 16190 + }, + { + "epoch": 1.5338004165877674, + "grad_norm": 0.36299046874046326, + "learning_rate": 1.3864798333648932e-05, + "loss": 0.1067, + "step": 16200 + }, + { + "epoch": 1.5347472069683772, + "grad_norm": 0.46190503239631653, + "learning_rate": 1.3861011172126494e-05, + "loss": 0.1032, + "step": 16210 + }, + { + "epoch": 1.535693997348987, + "grad_norm": 0.3034611642360687, + "learning_rate": 1.3857224010604054e-05, + "loss": 0.1112, + "step": 16220 + }, + { + "epoch": 1.5366407877295967, + "grad_norm": 0.3905613422393799, + "learning_rate": 1.3853436849081614e-05, + "loss": 0.1063, + "step": 16230 + }, + { + "epoch": 1.5375875781102064, + "grad_norm": 0.34141162037849426, + "learning_rate": 1.3849649687559175e-05, + "loss": 0.1279, + "step": 16240 + }, + { + "epoch": 1.5385343684908162, + "grad_norm": 0.3614260256290436, + "learning_rate": 1.3845862526036737e-05, + "loss": 0.1103, + "step": 16250 + }, + { + "epoch": 1.539481158871426, + "grad_norm": 0.43407753109931946, + "learning_rate": 1.3842075364514297e-05, + "loss": 0.1082, + "step": 16260 + }, + { + "epoch": 1.5404279492520356, + "grad_norm": 0.39427345991134644, + "learning_rate": 1.3838288202991859e-05, + "loss": 0.1121, + "step": 16270 + }, + { + "epoch": 1.5413747396326454, + "grad_norm": 0.34145137667655945, + "learning_rate": 1.3834501041469419e-05, + "loss": 0.0989, + "step": 16280 + }, + { + "epoch": 1.5423215300132551, + "grad_norm": 0.5783584713935852, + "learning_rate": 1.3830713879946979e-05, + "loss": 0.094, + "step": 16290 + }, + { + "epoch": 1.5432683203938646, + "grad_norm": 0.31390732526779175, + "learning_rate": 1.3826926718424543e-05, + "loss": 0.1118, + "step": 16300 + }, + { + "epoch": 1.5442151107744744, + "grad_norm": 0.3206731379032135, + "learning_rate": 1.3823139556902103e-05, + "loss": 0.1121, + "step": 16310 + }, + { + "epoch": 1.5451619011550841, + "grad_norm": 0.47795799374580383, + "learning_rate": 1.3819352395379664e-05, + "loss": 0.1062, + "step": 16320 + }, + { + "epoch": 1.5461086915356939, + "grad_norm": 0.36711663007736206, + "learning_rate": 1.3815565233857224e-05, + "loss": 0.1116, + "step": 16330 + }, + { + "epoch": 1.5470554819163036, + "grad_norm": 0.37463775277137756, + "learning_rate": 1.3811778072334788e-05, + "loss": 0.104, + "step": 16340 + }, + { + "epoch": 1.5480022722969133, + "grad_norm": 0.3962617814540863, + "learning_rate": 1.3807990910812348e-05, + "loss": 0.1018, + "step": 16350 + }, + { + "epoch": 1.548949062677523, + "grad_norm": 0.31192144751548767, + "learning_rate": 1.3804203749289908e-05, + "loss": 0.1057, + "step": 16360 + }, + { + "epoch": 1.5498958530581328, + "grad_norm": 0.3603304326534271, + "learning_rate": 1.380041658776747e-05, + "loss": 0.1028, + "step": 16370 + }, + { + "epoch": 1.5508426434387426, + "grad_norm": 0.35720306634902954, + "learning_rate": 1.379662942624503e-05, + "loss": 0.1126, + "step": 16380 + }, + { + "epoch": 1.5517894338193523, + "grad_norm": 0.3946895897388458, + "learning_rate": 1.3792842264722593e-05, + "loss": 0.1091, + "step": 16390 + }, + { + "epoch": 1.552736224199962, + "grad_norm": 0.3394911587238312, + "learning_rate": 1.3789055103200153e-05, + "loss": 0.1128, + "step": 16400 + }, + { + "epoch": 1.5536830145805718, + "grad_norm": 0.36512428522109985, + "learning_rate": 1.3785267941677713e-05, + "loss": 0.1059, + "step": 16410 + }, + { + "epoch": 1.5546298049611815, + "grad_norm": 0.417980819940567, + "learning_rate": 1.3781480780155275e-05, + "loss": 0.116, + "step": 16420 + }, + { + "epoch": 1.5555765953417913, + "grad_norm": 0.36091163754463196, + "learning_rate": 1.3777693618632835e-05, + "loss": 0.1078, + "step": 16430 + }, + { + "epoch": 1.556523385722401, + "grad_norm": 0.29771095514297485, + "learning_rate": 1.3773906457110397e-05, + "loss": 0.1051, + "step": 16440 + }, + { + "epoch": 1.5574701761030108, + "grad_norm": 0.39906641840934753, + "learning_rate": 1.3770119295587958e-05, + "loss": 0.1164, + "step": 16450 + }, + { + "epoch": 1.5584169664836205, + "grad_norm": 0.35542377829551697, + "learning_rate": 1.3766332134065519e-05, + "loss": 0.0973, + "step": 16460 + }, + { + "epoch": 1.5593637568642302, + "grad_norm": 0.3593170642852783, + "learning_rate": 1.3762544972543079e-05, + "loss": 0.1127, + "step": 16470 + }, + { + "epoch": 1.56031054724484, + "grad_norm": 0.34388262033462524, + "learning_rate": 1.3758757811020642e-05, + "loss": 0.1107, + "step": 16480 + }, + { + "epoch": 1.5612573376254497, + "grad_norm": 0.38229548931121826, + "learning_rate": 1.3754970649498202e-05, + "loss": 0.1057, + "step": 16490 + }, + { + "epoch": 1.5622041280060595, + "grad_norm": 0.38837409019470215, + "learning_rate": 1.3751183487975764e-05, + "loss": 0.1151, + "step": 16500 + }, + { + "epoch": 1.5631509183866692, + "grad_norm": 0.3817042112350464, + "learning_rate": 1.3747396326453324e-05, + "loss": 0.1076, + "step": 16510 + }, + { + "epoch": 1.564097708767279, + "grad_norm": 0.5364170074462891, + "learning_rate": 1.3743609164930884e-05, + "loss": 0.11, + "step": 16520 + }, + { + "epoch": 1.5650444991478887, + "grad_norm": 0.30082473158836365, + "learning_rate": 1.3739822003408447e-05, + "loss": 0.125, + "step": 16530 + }, + { + "epoch": 1.5659912895284984, + "grad_norm": 0.28244081139564514, + "learning_rate": 1.3736034841886007e-05, + "loss": 0.1131, + "step": 16540 + }, + { + "epoch": 1.5669380799091082, + "grad_norm": 0.34583744406700134, + "learning_rate": 1.3732247680363569e-05, + "loss": 0.0999, + "step": 16550 + }, + { + "epoch": 1.567884870289718, + "grad_norm": 0.30678942799568176, + "learning_rate": 1.372846051884113e-05, + "loss": 0.1166, + "step": 16560 + }, + { + "epoch": 1.5688316606703276, + "grad_norm": 0.3636748194694519, + "learning_rate": 1.372467335731869e-05, + "loss": 0.1055, + "step": 16570 + }, + { + "epoch": 1.5697784510509374, + "grad_norm": 0.4010995030403137, + "learning_rate": 1.3720886195796253e-05, + "loss": 0.1131, + "step": 16580 + }, + { + "epoch": 1.5707252414315471, + "grad_norm": 0.43221795558929443, + "learning_rate": 1.3717099034273813e-05, + "loss": 0.1149, + "step": 16590 + }, + { + "epoch": 1.5716720318121569, + "grad_norm": 0.3476791977882385, + "learning_rate": 1.3713311872751374e-05, + "loss": 0.1044, + "step": 16600 + }, + { + "epoch": 1.5726188221927666, + "grad_norm": 0.36815622448921204, + "learning_rate": 1.3709524711228935e-05, + "loss": 0.1139, + "step": 16610 + }, + { + "epoch": 1.5735656125733763, + "grad_norm": 0.42477601766586304, + "learning_rate": 1.3705737549706496e-05, + "loss": 0.1132, + "step": 16620 + }, + { + "epoch": 1.574512402953986, + "grad_norm": 0.35573610663414, + "learning_rate": 1.3701950388184058e-05, + "loss": 0.1137, + "step": 16630 + }, + { + "epoch": 1.5754591933345958, + "grad_norm": 0.38760095834732056, + "learning_rate": 1.3698163226661618e-05, + "loss": 0.114, + "step": 16640 + }, + { + "epoch": 1.5764059837152056, + "grad_norm": 0.34179815649986267, + "learning_rate": 1.3694376065139178e-05, + "loss": 0.1087, + "step": 16650 + }, + { + "epoch": 1.5773527740958153, + "grad_norm": 0.3777365982532501, + "learning_rate": 1.369058890361674e-05, + "loss": 0.118, + "step": 16660 + }, + { + "epoch": 1.578299564476425, + "grad_norm": 0.3674735724925995, + "learning_rate": 1.3686801742094302e-05, + "loss": 0.1049, + "step": 16670 + }, + { + "epoch": 1.5792463548570348, + "grad_norm": 0.3519558608531952, + "learning_rate": 1.3683014580571863e-05, + "loss": 0.1135, + "step": 16680 + }, + { + "epoch": 1.5801931452376445, + "grad_norm": 0.3152370750904083, + "learning_rate": 1.3679227419049423e-05, + "loss": 0.1034, + "step": 16690 + }, + { + "epoch": 1.5811399356182543, + "grad_norm": 0.40113574266433716, + "learning_rate": 1.3675440257526983e-05, + "loss": 0.111, + "step": 16700 + }, + { + "epoch": 1.5820867259988638, + "grad_norm": 0.3402666747570038, + "learning_rate": 1.3671653096004547e-05, + "loss": 0.1102, + "step": 16710 + }, + { + "epoch": 1.5830335163794735, + "grad_norm": 0.33529624342918396, + "learning_rate": 1.3667865934482107e-05, + "loss": 0.1112, + "step": 16720 + }, + { + "epoch": 1.5839803067600833, + "grad_norm": 0.30931976437568665, + "learning_rate": 1.3664078772959669e-05, + "loss": 0.1048, + "step": 16730 + }, + { + "epoch": 1.584927097140693, + "grad_norm": 0.3059307932853699, + "learning_rate": 1.3660291611437229e-05, + "loss": 0.1111, + "step": 16740 + }, + { + "epoch": 1.5858738875213028, + "grad_norm": 0.41115766763687134, + "learning_rate": 1.3656504449914789e-05, + "loss": 0.11, + "step": 16750 + }, + { + "epoch": 1.5868206779019125, + "grad_norm": 0.3679191768169403, + "learning_rate": 1.3652717288392352e-05, + "loss": 0.1011, + "step": 16760 + }, + { + "epoch": 1.5877674682825222, + "grad_norm": 0.32862117886543274, + "learning_rate": 1.3648930126869912e-05, + "loss": 0.1108, + "step": 16770 + }, + { + "epoch": 1.588714258663132, + "grad_norm": 0.25885462760925293, + "learning_rate": 1.3645142965347474e-05, + "loss": 0.1102, + "step": 16780 + }, + { + "epoch": 1.5896610490437417, + "grad_norm": 0.401241272687912, + "learning_rate": 1.3641355803825034e-05, + "loss": 0.1099, + "step": 16790 + }, + { + "epoch": 1.5906078394243515, + "grad_norm": 0.3936287760734558, + "learning_rate": 1.3637568642302594e-05, + "loss": 0.1048, + "step": 16800 + }, + { + "epoch": 1.5915546298049612, + "grad_norm": 0.41480880975723267, + "learning_rate": 1.3633781480780158e-05, + "loss": 0.1053, + "step": 16810 + }, + { + "epoch": 1.592501420185571, + "grad_norm": 0.42180556058883667, + "learning_rate": 1.3629994319257718e-05, + "loss": 0.1123, + "step": 16820 + }, + { + "epoch": 1.5934482105661807, + "grad_norm": 0.3735913634300232, + "learning_rate": 1.3626207157735278e-05, + "loss": 0.1139, + "step": 16830 + }, + { + "epoch": 1.5943950009467904, + "grad_norm": 0.38873815536499023, + "learning_rate": 1.362241999621284e-05, + "loss": 0.1036, + "step": 16840 + }, + { + "epoch": 1.5953417913274, + "grad_norm": 0.3312840461730957, + "learning_rate": 1.3618632834690401e-05, + "loss": 0.1036, + "step": 16850 + }, + { + "epoch": 1.5962885817080097, + "grad_norm": 0.344212144613266, + "learning_rate": 1.3614845673167963e-05, + "loss": 0.1139, + "step": 16860 + }, + { + "epoch": 1.5972353720886194, + "grad_norm": 0.3515395224094391, + "learning_rate": 1.3611058511645523e-05, + "loss": 0.1082, + "step": 16870 + }, + { + "epoch": 1.5981821624692292, + "grad_norm": 0.30493828654289246, + "learning_rate": 1.3607271350123083e-05, + "loss": 0.1117, + "step": 16880 + }, + { + "epoch": 1.599128952849839, + "grad_norm": 0.29671570658683777, + "learning_rate": 1.3603484188600645e-05, + "loss": 0.104, + "step": 16890 + }, + { + "epoch": 1.6000757432304487, + "grad_norm": 0.3543199598789215, + "learning_rate": 1.3599697027078206e-05, + "loss": 0.1084, + "step": 16900 + }, + { + "epoch": 1.6010225336110584, + "grad_norm": 0.4090178310871124, + "learning_rate": 1.3595909865555768e-05, + "loss": 0.1056, + "step": 16910 + }, + { + "epoch": 1.6019693239916681, + "grad_norm": 0.35988327860832214, + "learning_rate": 1.3592122704033328e-05, + "loss": 0.1055, + "step": 16920 + }, + { + "epoch": 1.6029161143722779, + "grad_norm": 0.4046424925327301, + "learning_rate": 1.3588335542510888e-05, + "loss": 0.1097, + "step": 16930 + }, + { + "epoch": 1.6038629047528876, + "grad_norm": 0.330355167388916, + "learning_rate": 1.358454838098845e-05, + "loss": 0.1104, + "step": 16940 + }, + { + "epoch": 1.6048096951334974, + "grad_norm": 0.3554498553276062, + "learning_rate": 1.3580761219466012e-05, + "loss": 0.0995, + "step": 16950 + }, + { + "epoch": 1.605756485514107, + "grad_norm": 0.3867628872394562, + "learning_rate": 1.3576974057943574e-05, + "loss": 0.1146, + "step": 16960 + }, + { + "epoch": 1.6067032758947168, + "grad_norm": 0.3537595272064209, + "learning_rate": 1.3573186896421134e-05, + "loss": 0.1039, + "step": 16970 + }, + { + "epoch": 1.6076500662753266, + "grad_norm": 0.33180224895477295, + "learning_rate": 1.3569399734898694e-05, + "loss": 0.1067, + "step": 16980 + }, + { + "epoch": 1.6085968566559363, + "grad_norm": 0.36756113171577454, + "learning_rate": 1.3565612573376257e-05, + "loss": 0.107, + "step": 16990 + }, + { + "epoch": 1.609543647036546, + "grad_norm": 0.33853188157081604, + "learning_rate": 1.3561825411853817e-05, + "loss": 0.1221, + "step": 17000 + }, + { + "epoch": 1.6104904374171558, + "grad_norm": 0.2996186316013336, + "learning_rate": 1.3558038250331377e-05, + "loss": 0.1043, + "step": 17010 + }, + { + "epoch": 1.6114372277977655, + "grad_norm": 0.3746570944786072, + "learning_rate": 1.3554251088808939e-05, + "loss": 0.1121, + "step": 17020 + }, + { + "epoch": 1.6123840181783753, + "grad_norm": 0.3464304804801941, + "learning_rate": 1.3550463927286499e-05, + "loss": 0.1071, + "step": 17030 + }, + { + "epoch": 1.613330808558985, + "grad_norm": 0.3381328284740448, + "learning_rate": 1.3546676765764062e-05, + "loss": 0.1097, + "step": 17040 + }, + { + "epoch": 1.6142775989395948, + "grad_norm": 0.3596116304397583, + "learning_rate": 1.3542889604241622e-05, + "loss": 0.106, + "step": 17050 + }, + { + "epoch": 1.6152243893202045, + "grad_norm": 0.3500080406665802, + "learning_rate": 1.3539102442719182e-05, + "loss": 0.101, + "step": 17060 + }, + { + "epoch": 1.6161711797008143, + "grad_norm": 0.30177420377731323, + "learning_rate": 1.3535315281196744e-05, + "loss": 0.1022, + "step": 17070 + }, + { + "epoch": 1.617117970081424, + "grad_norm": 0.3444776237010956, + "learning_rate": 1.3531528119674304e-05, + "loss": 0.1164, + "step": 17080 + }, + { + "epoch": 1.6180647604620337, + "grad_norm": 0.3821253180503845, + "learning_rate": 1.3527740958151868e-05, + "loss": 0.1183, + "step": 17090 + }, + { + "epoch": 1.6190115508426435, + "grad_norm": 0.39384153485298157, + "learning_rate": 1.3523953796629428e-05, + "loss": 0.0943, + "step": 17100 + }, + { + "epoch": 1.6199583412232532, + "grad_norm": 0.4101506769657135, + "learning_rate": 1.3520166635106988e-05, + "loss": 0.1035, + "step": 17110 + }, + { + "epoch": 1.620905131603863, + "grad_norm": 0.4028509557247162, + "learning_rate": 1.351637947358455e-05, + "loss": 0.1063, + "step": 17120 + }, + { + "epoch": 1.6218519219844727, + "grad_norm": 0.33712711930274963, + "learning_rate": 1.3512592312062111e-05, + "loss": 0.1069, + "step": 17130 + }, + { + "epoch": 1.6227987123650824, + "grad_norm": 0.3235998749732971, + "learning_rate": 1.3508805150539673e-05, + "loss": 0.1101, + "step": 17140 + }, + { + "epoch": 1.6237455027456922, + "grad_norm": 0.33888810873031616, + "learning_rate": 1.3505017989017233e-05, + "loss": 0.1136, + "step": 17150 + }, + { + "epoch": 1.624692293126302, + "grad_norm": 0.351136714220047, + "learning_rate": 1.3501230827494793e-05, + "loss": 0.109, + "step": 17160 + }, + { + "epoch": 1.6256390835069117, + "grad_norm": 0.3584769070148468, + "learning_rate": 1.3497443665972353e-05, + "loss": 0.108, + "step": 17170 + }, + { + "epoch": 1.6265858738875214, + "grad_norm": 0.39292648434638977, + "learning_rate": 1.3493656504449917e-05, + "loss": 0.1024, + "step": 17180 + }, + { + "epoch": 1.6275326642681311, + "grad_norm": 0.33220037817955017, + "learning_rate": 1.3489869342927477e-05, + "loss": 0.1042, + "step": 17190 + }, + { + "epoch": 1.6284794546487409, + "grad_norm": 0.3405236303806305, + "learning_rate": 1.3486082181405038e-05, + "loss": 0.1156, + "step": 17200 + }, + { + "epoch": 1.6294262450293506, + "grad_norm": 0.31284892559051514, + "learning_rate": 1.3482295019882598e-05, + "loss": 0.1054, + "step": 17210 + }, + { + "epoch": 1.6303730354099604, + "grad_norm": 0.34259989857673645, + "learning_rate": 1.3478507858360159e-05, + "loss": 0.1017, + "step": 17220 + }, + { + "epoch": 1.63131982579057, + "grad_norm": 0.37608641386032104, + "learning_rate": 1.3474720696837722e-05, + "loss": 0.1173, + "step": 17230 + }, + { + "epoch": 1.6322666161711799, + "grad_norm": 0.38453209400177, + "learning_rate": 1.3470933535315282e-05, + "loss": 0.1049, + "step": 17240 + }, + { + "epoch": 1.6332134065517896, + "grad_norm": 0.27649539709091187, + "learning_rate": 1.3467146373792844e-05, + "loss": 0.1031, + "step": 17250 + }, + { + "epoch": 1.6341601969323991, + "grad_norm": 0.30744364857673645, + "learning_rate": 1.3463359212270404e-05, + "loss": 0.1038, + "step": 17260 + }, + { + "epoch": 1.6351069873130089, + "grad_norm": 0.30624982714653015, + "learning_rate": 1.3459572050747967e-05, + "loss": 0.1157, + "step": 17270 + }, + { + "epoch": 1.6360537776936186, + "grad_norm": 0.40259572863578796, + "learning_rate": 1.3455784889225527e-05, + "loss": 0.1124, + "step": 17280 + }, + { + "epoch": 1.6370005680742283, + "grad_norm": 0.3984117805957794, + "learning_rate": 1.3451997727703087e-05, + "loss": 0.1047, + "step": 17290 + }, + { + "epoch": 1.637947358454838, + "grad_norm": 0.34300553798675537, + "learning_rate": 1.3448210566180649e-05, + "loss": 0.1167, + "step": 17300 + }, + { + "epoch": 1.6388941488354478, + "grad_norm": 0.3874812424182892, + "learning_rate": 1.3444423404658209e-05, + "loss": 0.0996, + "step": 17310 + }, + { + "epoch": 1.6398409392160576, + "grad_norm": 0.31231194734573364, + "learning_rate": 1.3440636243135773e-05, + "loss": 0.1049, + "step": 17320 + }, + { + "epoch": 1.6407877295966673, + "grad_norm": 0.35242578387260437, + "learning_rate": 1.3436849081613333e-05, + "loss": 0.1014, + "step": 17330 + }, + { + "epoch": 1.641734519977277, + "grad_norm": 0.3678164482116699, + "learning_rate": 1.3433061920090893e-05, + "loss": 0.1019, + "step": 17340 + }, + { + "epoch": 1.6426813103578868, + "grad_norm": 0.500481903553009, + "learning_rate": 1.3429274758568453e-05, + "loss": 0.111, + "step": 17350 + }, + { + "epoch": 1.6436281007384965, + "grad_norm": 0.35619476437568665, + "learning_rate": 1.3425487597046014e-05, + "loss": 0.1058, + "step": 17360 + }, + { + "epoch": 1.6445748911191063, + "grad_norm": 0.28489652276039124, + "learning_rate": 1.3421700435523576e-05, + "loss": 0.107, + "step": 17370 + }, + { + "epoch": 1.645521681499716, + "grad_norm": 0.3987047076225281, + "learning_rate": 1.3417913274001138e-05, + "loss": 0.1102, + "step": 17380 + }, + { + "epoch": 1.6464684718803257, + "grad_norm": 0.29614952206611633, + "learning_rate": 1.3414126112478698e-05, + "loss": 0.1016, + "step": 17390 + }, + { + "epoch": 1.6474152622609353, + "grad_norm": 0.4422844648361206, + "learning_rate": 1.3410338950956258e-05, + "loss": 0.1109, + "step": 17400 + }, + { + "epoch": 1.648362052641545, + "grad_norm": 0.485542356967926, + "learning_rate": 1.3406551789433821e-05, + "loss": 0.125, + "step": 17410 + }, + { + "epoch": 1.6493088430221547, + "grad_norm": 0.3143191337585449, + "learning_rate": 1.3402764627911382e-05, + "loss": 0.0982, + "step": 17420 + }, + { + "epoch": 1.6502556334027645, + "grad_norm": 0.34800443053245544, + "learning_rate": 1.3398977466388943e-05, + "loss": 0.1222, + "step": 17430 + }, + { + "epoch": 1.6512024237833742, + "grad_norm": 0.4274339973926544, + "learning_rate": 1.3395190304866503e-05, + "loss": 0.1128, + "step": 17440 + }, + { + "epoch": 1.652149214163984, + "grad_norm": 0.35616034269332886, + "learning_rate": 1.3391403143344063e-05, + "loss": 0.1101, + "step": 17450 + }, + { + "epoch": 1.6530960045445937, + "grad_norm": 0.34067031741142273, + "learning_rate": 1.3387615981821627e-05, + "loss": 0.1077, + "step": 17460 + }, + { + "epoch": 1.6540427949252035, + "grad_norm": 0.367699533700943, + "learning_rate": 1.3383828820299187e-05, + "loss": 0.1086, + "step": 17470 + }, + { + "epoch": 1.6549895853058132, + "grad_norm": 0.34683769941329956, + "learning_rate": 1.3380041658776749e-05, + "loss": 0.1103, + "step": 17480 + }, + { + "epoch": 1.655936375686423, + "grad_norm": 0.31630000472068787, + "learning_rate": 1.3376254497254309e-05, + "loss": 0.1099, + "step": 17490 + }, + { + "epoch": 1.6568831660670327, + "grad_norm": 0.30700427293777466, + "learning_rate": 1.3372467335731869e-05, + "loss": 0.1102, + "step": 17500 + }, + { + "epoch": 1.6578299564476424, + "grad_norm": 0.3312118351459503, + "learning_rate": 1.3368680174209432e-05, + "loss": 0.109, + "step": 17510 + }, + { + "epoch": 1.6587767468282522, + "grad_norm": 0.3572567403316498, + "learning_rate": 1.3364893012686992e-05, + "loss": 0.1098, + "step": 17520 + }, + { + "epoch": 1.659723537208862, + "grad_norm": 0.35258644819259644, + "learning_rate": 1.3361105851164552e-05, + "loss": 0.1026, + "step": 17530 + }, + { + "epoch": 1.6606703275894716, + "grad_norm": 0.4267875552177429, + "learning_rate": 1.3357318689642114e-05, + "loss": 0.1126, + "step": 17540 + }, + { + "epoch": 1.6616171179700814, + "grad_norm": 0.4066093862056732, + "learning_rate": 1.3353531528119676e-05, + "loss": 0.1037, + "step": 17550 + }, + { + "epoch": 1.6625639083506911, + "grad_norm": 0.39801302552223206, + "learning_rate": 1.3349744366597237e-05, + "loss": 0.1166, + "step": 17560 + }, + { + "epoch": 1.6635106987313009, + "grad_norm": 0.3564015030860901, + "learning_rate": 1.3345957205074798e-05, + "loss": 0.1115, + "step": 17570 + }, + { + "epoch": 1.6644574891119106, + "grad_norm": 0.3493454158306122, + "learning_rate": 1.3342170043552358e-05, + "loss": 0.1121, + "step": 17580 + }, + { + "epoch": 1.6654042794925203, + "grad_norm": 0.33144834637641907, + "learning_rate": 1.333838288202992e-05, + "loss": 0.1018, + "step": 17590 + }, + { + "epoch": 1.66635106987313, + "grad_norm": 0.295213907957077, + "learning_rate": 1.3334595720507481e-05, + "loss": 0.1048, + "step": 17600 + }, + { + "epoch": 1.6672978602537398, + "grad_norm": 0.35501471161842346, + "learning_rate": 1.3330808558985043e-05, + "loss": 0.1038, + "step": 17610 + }, + { + "epoch": 1.6682446506343496, + "grad_norm": 0.373725026845932, + "learning_rate": 1.3327021397462603e-05, + "loss": 0.1049, + "step": 17620 + }, + { + "epoch": 1.6691914410149593, + "grad_norm": 0.35085785388946533, + "learning_rate": 1.3323234235940163e-05, + "loss": 0.1079, + "step": 17630 + }, + { + "epoch": 1.670138231395569, + "grad_norm": 0.28753459453582764, + "learning_rate": 1.3319447074417725e-05, + "loss": 0.0992, + "step": 17640 + }, + { + "epoch": 1.6710850217761788, + "grad_norm": 0.3040468990802765, + "learning_rate": 1.3315659912895286e-05, + "loss": 0.1077, + "step": 17650 + }, + { + "epoch": 1.6720318121567885, + "grad_norm": 0.2903306782245636, + "learning_rate": 1.3311872751372848e-05, + "loss": 0.1025, + "step": 17660 + }, + { + "epoch": 1.6729786025373983, + "grad_norm": 0.377331018447876, + "learning_rate": 1.3308085589850408e-05, + "loss": 0.1116, + "step": 17670 + }, + { + "epoch": 1.673925392918008, + "grad_norm": 0.3333212435245514, + "learning_rate": 1.3304298428327968e-05, + "loss": 0.1141, + "step": 17680 + }, + { + "epoch": 1.6748721832986178, + "grad_norm": 0.342886358499527, + "learning_rate": 1.3300511266805532e-05, + "loss": 0.1043, + "step": 17690 + }, + { + "epoch": 1.6758189736792275, + "grad_norm": 0.3984515368938446, + "learning_rate": 1.3296724105283092e-05, + "loss": 0.1001, + "step": 17700 + }, + { + "epoch": 1.6767657640598372, + "grad_norm": 0.32483118772506714, + "learning_rate": 1.3292936943760652e-05, + "loss": 0.1174, + "step": 17710 + }, + { + "epoch": 1.677712554440447, + "grad_norm": 0.4028964638710022, + "learning_rate": 1.3289149782238214e-05, + "loss": 0.1056, + "step": 17720 + }, + { + "epoch": 1.6786593448210567, + "grad_norm": 0.5397863984107971, + "learning_rate": 1.3285362620715774e-05, + "loss": 0.1081, + "step": 17730 + }, + { + "epoch": 1.6796061352016665, + "grad_norm": 0.34457170963287354, + "learning_rate": 1.3281575459193337e-05, + "loss": 0.1088, + "step": 17740 + }, + { + "epoch": 1.6805529255822762, + "grad_norm": 0.3310573101043701, + "learning_rate": 1.3277788297670897e-05, + "loss": 0.107, + "step": 17750 + }, + { + "epoch": 1.681499715962886, + "grad_norm": 0.475350558757782, + "learning_rate": 1.3274001136148457e-05, + "loss": 0.1054, + "step": 17760 + }, + { + "epoch": 1.6824465063434957, + "grad_norm": 0.32560041546821594, + "learning_rate": 1.3270213974626019e-05, + "loss": 0.0932, + "step": 17770 + }, + { + "epoch": 1.6833932967241054, + "grad_norm": 0.2892504930496216, + "learning_rate": 1.3266426813103579e-05, + "loss": 0.107, + "step": 17780 + }, + { + "epoch": 1.6843400871047152, + "grad_norm": 0.4393444359302521, + "learning_rate": 1.3262639651581142e-05, + "loss": 0.103, + "step": 17790 + }, + { + "epoch": 1.685286877485325, + "grad_norm": 0.3609474003314972, + "learning_rate": 1.3258852490058702e-05, + "loss": 0.1035, + "step": 17800 + }, + { + "epoch": 1.6862336678659344, + "grad_norm": 0.3656294643878937, + "learning_rate": 1.3255065328536262e-05, + "loss": 0.1054, + "step": 17810 + }, + { + "epoch": 1.6871804582465442, + "grad_norm": 0.32531723380088806, + "learning_rate": 1.3251278167013824e-05, + "loss": 0.1109, + "step": 17820 + }, + { + "epoch": 1.688127248627154, + "grad_norm": 0.42885199189186096, + "learning_rate": 1.3247491005491386e-05, + "loss": 0.1159, + "step": 17830 + }, + { + "epoch": 1.6890740390077637, + "grad_norm": 0.34545162320137024, + "learning_rate": 1.3243703843968948e-05, + "loss": 0.1137, + "step": 17840 + }, + { + "epoch": 1.6900208293883734, + "grad_norm": 0.45957455039024353, + "learning_rate": 1.3239916682446508e-05, + "loss": 0.1126, + "step": 17850 + }, + { + "epoch": 1.6909676197689831, + "grad_norm": 0.3322799503803253, + "learning_rate": 1.3236129520924068e-05, + "loss": 0.113, + "step": 17860 + }, + { + "epoch": 1.6919144101495929, + "grad_norm": 0.4249228239059448, + "learning_rate": 1.323234235940163e-05, + "loss": 0.1114, + "step": 17870 + }, + { + "epoch": 1.6928612005302026, + "grad_norm": 0.3494950830936432, + "learning_rate": 1.3228555197879191e-05, + "loss": 0.1083, + "step": 17880 + }, + { + "epoch": 1.6938079909108124, + "grad_norm": 0.34661203622817993, + "learning_rate": 1.3224768036356751e-05, + "loss": 0.108, + "step": 17890 + }, + { + "epoch": 1.694754781291422, + "grad_norm": 0.35037049651145935, + "learning_rate": 1.3220980874834313e-05, + "loss": 0.1079, + "step": 17900 + }, + { + "epoch": 1.6957015716720318, + "grad_norm": 0.3720286190509796, + "learning_rate": 1.3217193713311873e-05, + "loss": 0.1062, + "step": 17910 + }, + { + "epoch": 1.6966483620526416, + "grad_norm": 0.37974655628204346, + "learning_rate": 1.3213406551789433e-05, + "loss": 0.0993, + "step": 17920 + }, + { + "epoch": 1.6975951524332513, + "grad_norm": 0.28876063227653503, + "learning_rate": 1.3209619390266997e-05, + "loss": 0.0907, + "step": 17930 + }, + { + "epoch": 1.698541942813861, + "grad_norm": 0.33278247714042664, + "learning_rate": 1.3205832228744557e-05, + "loss": 0.1087, + "step": 17940 + }, + { + "epoch": 1.6994887331944706, + "grad_norm": 0.2906360924243927, + "learning_rate": 1.3202045067222118e-05, + "loss": 0.1123, + "step": 17950 + }, + { + "epoch": 1.7004355235750803, + "grad_norm": 0.32706132531166077, + "learning_rate": 1.3198257905699678e-05, + "loss": 0.1011, + "step": 17960 + }, + { + "epoch": 1.70138231395569, + "grad_norm": 0.3792358934879303, + "learning_rate": 1.3194470744177242e-05, + "loss": 0.104, + "step": 17970 + }, + { + "epoch": 1.7023291043362998, + "grad_norm": 0.5245835185050964, + "learning_rate": 1.3190683582654802e-05, + "loss": 0.1009, + "step": 17980 + }, + { + "epoch": 1.7032758947169095, + "grad_norm": 0.3694629669189453, + "learning_rate": 1.3186896421132362e-05, + "loss": 0.1104, + "step": 17990 + }, + { + "epoch": 1.7042226850975193, + "grad_norm": 0.27885186672210693, + "learning_rate": 1.3183109259609924e-05, + "loss": 0.1059, + "step": 18000 + }, + { + "epoch": 1.705169475478129, + "grad_norm": 0.5485479831695557, + "learning_rate": 1.3179322098087484e-05, + "loss": 0.115, + "step": 18010 + }, + { + "epoch": 1.7061162658587388, + "grad_norm": 0.4096299707889557, + "learning_rate": 1.3175534936565047e-05, + "loss": 0.1192, + "step": 18020 + }, + { + "epoch": 1.7070630562393485, + "grad_norm": 0.3202039301395416, + "learning_rate": 1.3171747775042607e-05, + "loss": 0.0996, + "step": 18030 + }, + { + "epoch": 1.7080098466199582, + "grad_norm": 0.6081573367118835, + "learning_rate": 1.3167960613520167e-05, + "loss": 0.1155, + "step": 18040 + }, + { + "epoch": 1.708956637000568, + "grad_norm": 0.37756624817848206, + "learning_rate": 1.3164173451997729e-05, + "loss": 0.1093, + "step": 18050 + }, + { + "epoch": 1.7099034273811777, + "grad_norm": 0.4071192443370819, + "learning_rate": 1.3160386290475289e-05, + "loss": 0.1026, + "step": 18060 + }, + { + "epoch": 1.7108502177617875, + "grad_norm": 0.3811383545398712, + "learning_rate": 1.315659912895285e-05, + "loss": 0.0979, + "step": 18070 + }, + { + "epoch": 1.7117970081423972, + "grad_norm": 0.32877886295318604, + "learning_rate": 1.3152811967430413e-05, + "loss": 0.1148, + "step": 18080 + }, + { + "epoch": 1.712743798523007, + "grad_norm": 0.42139747738838196, + "learning_rate": 1.3149024805907973e-05, + "loss": 0.1129, + "step": 18090 + }, + { + "epoch": 1.7136905889036167, + "grad_norm": 0.2595408260822296, + "learning_rate": 1.3145237644385533e-05, + "loss": 0.1049, + "step": 18100 + }, + { + "epoch": 1.7146373792842264, + "grad_norm": 0.38902461528778076, + "learning_rate": 1.3141450482863096e-05, + "loss": 0.1105, + "step": 18110 + }, + { + "epoch": 1.7155841696648362, + "grad_norm": 0.3774598240852356, + "learning_rate": 1.3137663321340656e-05, + "loss": 0.114, + "step": 18120 + }, + { + "epoch": 1.716530960045446, + "grad_norm": 0.29289478063583374, + "learning_rate": 1.3133876159818218e-05, + "loss": 0.0974, + "step": 18130 + }, + { + "epoch": 1.7174777504260557, + "grad_norm": 0.383760541677475, + "learning_rate": 1.3130088998295778e-05, + "loss": 0.1078, + "step": 18140 + }, + { + "epoch": 1.7184245408066654, + "grad_norm": 0.33194929361343384, + "learning_rate": 1.3126301836773338e-05, + "loss": 0.1192, + "step": 18150 + }, + { + "epoch": 1.7193713311872751, + "grad_norm": 0.41859087347984314, + "learning_rate": 1.3122514675250901e-05, + "loss": 0.1125, + "step": 18160 + }, + { + "epoch": 1.7203181215678849, + "grad_norm": 0.3526773750782013, + "learning_rate": 1.3118727513728461e-05, + "loss": 0.1102, + "step": 18170 + }, + { + "epoch": 1.7212649119484946, + "grad_norm": 0.3202621042728424, + "learning_rate": 1.3114940352206023e-05, + "loss": 0.1103, + "step": 18180 + }, + { + "epoch": 1.7222117023291044, + "grad_norm": 0.37917235493659973, + "learning_rate": 1.3111153190683583e-05, + "loss": 0.1153, + "step": 18190 + }, + { + "epoch": 1.723158492709714, + "grad_norm": 0.41578903794288635, + "learning_rate": 1.3107366029161143e-05, + "loss": 0.1048, + "step": 18200 + }, + { + "epoch": 1.7241052830903238, + "grad_norm": 0.33654308319091797, + "learning_rate": 1.3103578867638707e-05, + "loss": 0.1089, + "step": 18210 + }, + { + "epoch": 1.7250520734709336, + "grad_norm": 0.3018990457057953, + "learning_rate": 1.3099791706116267e-05, + "loss": 0.1097, + "step": 18220 + }, + { + "epoch": 1.7259988638515433, + "grad_norm": 0.36322179436683655, + "learning_rate": 1.3096004544593829e-05, + "loss": 0.0998, + "step": 18230 + }, + { + "epoch": 1.726945654232153, + "grad_norm": 0.3884753882884979, + "learning_rate": 1.3092217383071389e-05, + "loss": 0.118, + "step": 18240 + }, + { + "epoch": 1.7278924446127628, + "grad_norm": 0.31357282400131226, + "learning_rate": 1.308843022154895e-05, + "loss": 0.1103, + "step": 18250 + }, + { + "epoch": 1.7288392349933726, + "grad_norm": 0.3505796492099762, + "learning_rate": 1.3084643060026512e-05, + "loss": 0.1044, + "step": 18260 + }, + { + "epoch": 1.7297860253739823, + "grad_norm": 0.37499359250068665, + "learning_rate": 1.3080855898504072e-05, + "loss": 0.1157, + "step": 18270 + }, + { + "epoch": 1.730732815754592, + "grad_norm": 0.42914479970932007, + "learning_rate": 1.3077068736981632e-05, + "loss": 0.116, + "step": 18280 + }, + { + "epoch": 1.7316796061352018, + "grad_norm": 0.3526166081428528, + "learning_rate": 1.3073281575459194e-05, + "loss": 0.1052, + "step": 18290 + }, + { + "epoch": 1.7326263965158115, + "grad_norm": 0.2813660204410553, + "learning_rate": 1.3069494413936756e-05, + "loss": 0.1033, + "step": 18300 + }, + { + "epoch": 1.7335731868964213, + "grad_norm": 0.3836597800254822, + "learning_rate": 1.3065707252414317e-05, + "loss": 0.1045, + "step": 18310 + }, + { + "epoch": 1.734519977277031, + "grad_norm": 0.41530275344848633, + "learning_rate": 1.3061920090891877e-05, + "loss": 0.0978, + "step": 18320 + }, + { + "epoch": 1.7354667676576407, + "grad_norm": 0.3948558270931244, + "learning_rate": 1.3058132929369438e-05, + "loss": 0.1063, + "step": 18330 + }, + { + "epoch": 1.7364135580382505, + "grad_norm": 0.34573134779930115, + "learning_rate": 1.3054345767847e-05, + "loss": 0.1063, + "step": 18340 + }, + { + "epoch": 1.7373603484188602, + "grad_norm": 0.41142192482948303, + "learning_rate": 1.3050558606324561e-05, + "loss": 0.116, + "step": 18350 + }, + { + "epoch": 1.7383071387994697, + "grad_norm": 0.34815993905067444, + "learning_rate": 1.3046771444802123e-05, + "loss": 0.1106, + "step": 18360 + }, + { + "epoch": 1.7392539291800795, + "grad_norm": 0.3826438784599304, + "learning_rate": 1.3042984283279683e-05, + "loss": 0.1089, + "step": 18370 + }, + { + "epoch": 1.7402007195606892, + "grad_norm": 0.3685339093208313, + "learning_rate": 1.3039197121757243e-05, + "loss": 0.103, + "step": 18380 + }, + { + "epoch": 1.741147509941299, + "grad_norm": 0.33224424719810486, + "learning_rate": 1.3035409960234806e-05, + "loss": 0.1075, + "step": 18390 + }, + { + "epoch": 1.7420943003219087, + "grad_norm": 0.4313163757324219, + "learning_rate": 1.3031622798712366e-05, + "loss": 0.1098, + "step": 18400 + }, + { + "epoch": 1.7430410907025184, + "grad_norm": 0.3623204827308655, + "learning_rate": 1.3027835637189928e-05, + "loss": 0.1059, + "step": 18410 + }, + { + "epoch": 1.7439878810831282, + "grad_norm": 0.44435903429985046, + "learning_rate": 1.3024048475667488e-05, + "loss": 0.1094, + "step": 18420 + }, + { + "epoch": 1.744934671463738, + "grad_norm": 0.41246020793914795, + "learning_rate": 1.3020261314145048e-05, + "loss": 0.1143, + "step": 18430 + }, + { + "epoch": 1.7458814618443477, + "grad_norm": 0.3123202323913574, + "learning_rate": 1.3016474152622612e-05, + "loss": 0.1027, + "step": 18440 + }, + { + "epoch": 1.7468282522249574, + "grad_norm": 0.5212844014167786, + "learning_rate": 1.3012686991100172e-05, + "loss": 0.1157, + "step": 18450 + }, + { + "epoch": 1.7477750426055672, + "grad_norm": 0.43488380312919617, + "learning_rate": 1.3008899829577732e-05, + "loss": 0.1086, + "step": 18460 + }, + { + "epoch": 1.748721832986177, + "grad_norm": 0.3901176452636719, + "learning_rate": 1.3005112668055293e-05, + "loss": 0.1078, + "step": 18470 + }, + { + "epoch": 1.7496686233667866, + "grad_norm": 0.2928845286369324, + "learning_rate": 1.3001325506532854e-05, + "loss": 0.1166, + "step": 18480 + }, + { + "epoch": 1.7506154137473964, + "grad_norm": 0.340564101934433, + "learning_rate": 1.2997538345010417e-05, + "loss": 0.1087, + "step": 18490 + }, + { + "epoch": 1.751562204128006, + "grad_norm": 0.3431471288204193, + "learning_rate": 1.2993751183487977e-05, + "loss": 0.1094, + "step": 18500 + }, + { + "epoch": 1.7525089945086156, + "grad_norm": 0.35341206192970276, + "learning_rate": 1.2989964021965537e-05, + "loss": 0.0969, + "step": 18510 + }, + { + "epoch": 1.7534557848892254, + "grad_norm": 0.3784622550010681, + "learning_rate": 1.2986176860443099e-05, + "loss": 0.1087, + "step": 18520 + }, + { + "epoch": 1.7544025752698351, + "grad_norm": 0.3758784532546997, + "learning_rate": 1.298238969892066e-05, + "loss": 0.1087, + "step": 18530 + }, + { + "epoch": 1.7553493656504449, + "grad_norm": 0.31814923882484436, + "learning_rate": 1.2978602537398222e-05, + "loss": 0.1035, + "step": 18540 + }, + { + "epoch": 1.7562961560310546, + "grad_norm": 0.28400737047195435, + "learning_rate": 1.2974815375875782e-05, + "loss": 0.1037, + "step": 18550 + }, + { + "epoch": 1.7572429464116643, + "grad_norm": 0.3504233956336975, + "learning_rate": 1.2971028214353342e-05, + "loss": 0.1049, + "step": 18560 + }, + { + "epoch": 1.758189736792274, + "grad_norm": 0.3481403589248657, + "learning_rate": 1.2967241052830904e-05, + "loss": 0.1078, + "step": 18570 + }, + { + "epoch": 1.7591365271728838, + "grad_norm": 0.3445204794406891, + "learning_rate": 1.2963453891308466e-05, + "loss": 0.1093, + "step": 18580 + }, + { + "epoch": 1.7600833175534936, + "grad_norm": 0.3659699559211731, + "learning_rate": 1.2959666729786028e-05, + "loss": 0.1137, + "step": 18590 + }, + { + "epoch": 1.7610301079341033, + "grad_norm": 0.35296133160591125, + "learning_rate": 1.2955879568263588e-05, + "loss": 0.107, + "step": 18600 + }, + { + "epoch": 1.761976898314713, + "grad_norm": 0.30524009466171265, + "learning_rate": 1.2952092406741148e-05, + "loss": 0.1009, + "step": 18610 + }, + { + "epoch": 1.7629236886953228, + "grad_norm": 0.3883345127105713, + "learning_rate": 1.2948305245218708e-05, + "loss": 0.1173, + "step": 18620 + }, + { + "epoch": 1.7638704790759325, + "grad_norm": 0.2975183427333832, + "learning_rate": 1.2944518083696271e-05, + "loss": 0.117, + "step": 18630 + }, + { + "epoch": 1.7648172694565423, + "grad_norm": 0.35033711791038513, + "learning_rate": 1.2940730922173831e-05, + "loss": 0.1081, + "step": 18640 + }, + { + "epoch": 1.765764059837152, + "grad_norm": 0.3302345871925354, + "learning_rate": 1.2936943760651393e-05, + "loss": 0.1063, + "step": 18650 + }, + { + "epoch": 1.7667108502177618, + "grad_norm": 0.30319857597351074, + "learning_rate": 1.2933156599128953e-05, + "loss": 0.0999, + "step": 18660 + }, + { + "epoch": 1.7676576405983715, + "grad_norm": 0.39417383074760437, + "learning_rate": 1.2929369437606516e-05, + "loss": 0.1059, + "step": 18670 + }, + { + "epoch": 1.7686044309789812, + "grad_norm": 0.39419564604759216, + "learning_rate": 1.2925582276084077e-05, + "loss": 0.1081, + "step": 18680 + }, + { + "epoch": 1.769551221359591, + "grad_norm": 0.30187517404556274, + "learning_rate": 1.2921795114561637e-05, + "loss": 0.1064, + "step": 18690 + }, + { + "epoch": 1.7704980117402007, + "grad_norm": 0.3162728548049927, + "learning_rate": 1.2918007953039198e-05, + "loss": 0.1037, + "step": 18700 + }, + { + "epoch": 1.7714448021208105, + "grad_norm": 0.31417393684387207, + "learning_rate": 1.2914220791516758e-05, + "loss": 0.0992, + "step": 18710 + }, + { + "epoch": 1.7723915925014202, + "grad_norm": 0.3579910695552826, + "learning_rate": 1.2910433629994322e-05, + "loss": 0.1048, + "step": 18720 + }, + { + "epoch": 1.77333838288203, + "grad_norm": 0.4408959746360779, + "learning_rate": 1.2906646468471882e-05, + "loss": 0.1051, + "step": 18730 + }, + { + "epoch": 1.7742851732626397, + "grad_norm": 0.32164129614830017, + "learning_rate": 1.2902859306949442e-05, + "loss": 0.1097, + "step": 18740 + }, + { + "epoch": 1.7752319636432494, + "grad_norm": 0.28425174951553345, + "learning_rate": 1.2899072145427004e-05, + "loss": 0.11, + "step": 18750 + }, + { + "epoch": 1.7761787540238592, + "grad_norm": 0.3800603449344635, + "learning_rate": 1.2895284983904564e-05, + "loss": 0.1081, + "step": 18760 + }, + { + "epoch": 1.777125544404469, + "grad_norm": 0.3860698938369751, + "learning_rate": 1.2891497822382127e-05, + "loss": 0.1078, + "step": 18770 + }, + { + "epoch": 1.7780723347850786, + "grad_norm": 0.3523519039154053, + "learning_rate": 1.2887710660859687e-05, + "loss": 0.1115, + "step": 18780 + }, + { + "epoch": 1.7790191251656884, + "grad_norm": 0.5016382932662964, + "learning_rate": 1.2883923499337247e-05, + "loss": 0.1106, + "step": 18790 + }, + { + "epoch": 1.7799659155462981, + "grad_norm": 0.34837406873703003, + "learning_rate": 1.2880136337814807e-05, + "loss": 0.1046, + "step": 18800 + }, + { + "epoch": 1.7809127059269079, + "grad_norm": 0.34380877017974854, + "learning_rate": 1.287634917629237e-05, + "loss": 0.1028, + "step": 18810 + }, + { + "epoch": 1.7818594963075176, + "grad_norm": 0.32999187707901, + "learning_rate": 1.287256201476993e-05, + "loss": 0.0998, + "step": 18820 + }, + { + "epoch": 1.7828062866881274, + "grad_norm": 0.36616331338882446, + "learning_rate": 1.2868774853247493e-05, + "loss": 0.0974, + "step": 18830 + }, + { + "epoch": 1.783753077068737, + "grad_norm": 0.3709386885166168, + "learning_rate": 1.2864987691725053e-05, + "loss": 0.1101, + "step": 18840 + }, + { + "epoch": 1.7846998674493468, + "grad_norm": 0.3592256009578705, + "learning_rate": 1.2861200530202613e-05, + "loss": 0.1136, + "step": 18850 + }, + { + "epoch": 1.7856466578299566, + "grad_norm": 0.532692015171051, + "learning_rate": 1.2857413368680176e-05, + "loss": 0.1119, + "step": 18860 + }, + { + "epoch": 1.7865934482105663, + "grad_norm": 0.36976680159568787, + "learning_rate": 1.2853626207157736e-05, + "loss": 0.1098, + "step": 18870 + }, + { + "epoch": 1.787540238591176, + "grad_norm": 0.452336847782135, + "learning_rate": 1.2849839045635298e-05, + "loss": 0.1029, + "step": 18880 + }, + { + "epoch": 1.7884870289717858, + "grad_norm": 0.44521087408065796, + "learning_rate": 1.2846051884112858e-05, + "loss": 0.111, + "step": 18890 + }, + { + "epoch": 1.7894338193523955, + "grad_norm": 0.34995198249816895, + "learning_rate": 1.2842264722590418e-05, + "loss": 0.1091, + "step": 18900 + }, + { + "epoch": 1.790380609733005, + "grad_norm": 0.3400953710079193, + "learning_rate": 1.2838477561067981e-05, + "loss": 0.1009, + "step": 18910 + }, + { + "epoch": 1.7913274001136148, + "grad_norm": 0.37189778685569763, + "learning_rate": 1.2834690399545541e-05, + "loss": 0.1034, + "step": 18920 + }, + { + "epoch": 1.7922741904942245, + "grad_norm": 0.3677590787410736, + "learning_rate": 1.2830903238023103e-05, + "loss": 0.1039, + "step": 18930 + }, + { + "epoch": 1.7932209808748343, + "grad_norm": 0.32119473814964294, + "learning_rate": 1.2827116076500663e-05, + "loss": 0.1073, + "step": 18940 + }, + { + "epoch": 1.794167771255444, + "grad_norm": 0.3377484679222107, + "learning_rate": 1.2823328914978227e-05, + "loss": 0.1176, + "step": 18950 + }, + { + "epoch": 1.7951145616360538, + "grad_norm": 0.2578378915786743, + "learning_rate": 1.2819541753455787e-05, + "loss": 0.0994, + "step": 18960 + }, + { + "epoch": 1.7960613520166635, + "grad_norm": 0.34310463070869446, + "learning_rate": 1.2815754591933347e-05, + "loss": 0.1026, + "step": 18970 + }, + { + "epoch": 1.7970081423972732, + "grad_norm": 0.33588457107543945, + "learning_rate": 1.2811967430410907e-05, + "loss": 0.1039, + "step": 18980 + }, + { + "epoch": 1.797954932777883, + "grad_norm": 0.3982495367527008, + "learning_rate": 1.2808180268888469e-05, + "loss": 0.1154, + "step": 18990 + }, + { + "epoch": 1.7989017231584927, + "grad_norm": 0.4286862909793854, + "learning_rate": 1.280439310736603e-05, + "loss": 0.1143, + "step": 19000 + }, + { + "epoch": 1.7998485135391025, + "grad_norm": 0.37063848972320557, + "learning_rate": 1.2800605945843592e-05, + "loss": 0.1077, + "step": 19010 + }, + { + "epoch": 1.8007953039197122, + "grad_norm": 0.4058857262134552, + "learning_rate": 1.2796818784321152e-05, + "loss": 0.1165, + "step": 19020 + }, + { + "epoch": 1.801742094300322, + "grad_norm": 0.3987621068954468, + "learning_rate": 1.2793031622798712e-05, + "loss": 0.1118, + "step": 19030 + }, + { + "epoch": 1.8026888846809317, + "grad_norm": 0.41691696643829346, + "learning_rate": 1.2789244461276274e-05, + "loss": 0.1178, + "step": 19040 + }, + { + "epoch": 1.8036356750615412, + "grad_norm": 0.4202606678009033, + "learning_rate": 1.2785457299753836e-05, + "loss": 0.1091, + "step": 19050 + }, + { + "epoch": 1.804582465442151, + "grad_norm": 0.39799049496650696, + "learning_rate": 1.2781670138231397e-05, + "loss": 0.1096, + "step": 19060 + }, + { + "epoch": 1.8055292558227607, + "grad_norm": 0.3521597385406494, + "learning_rate": 1.2777882976708957e-05, + "loss": 0.1062, + "step": 19070 + }, + { + "epoch": 1.8064760462033704, + "grad_norm": 0.3163425326347351, + "learning_rate": 1.2774095815186517e-05, + "loss": 0.1077, + "step": 19080 + }, + { + "epoch": 1.8074228365839802, + "grad_norm": 0.3193988800048828, + "learning_rate": 1.2770308653664081e-05, + "loss": 0.1037, + "step": 19090 + }, + { + "epoch": 1.80836962696459, + "grad_norm": 0.34172874689102173, + "learning_rate": 1.2766521492141641e-05, + "loss": 0.1086, + "step": 19100 + }, + { + "epoch": 1.8093164173451997, + "grad_norm": 0.3774087429046631, + "learning_rate": 1.2762734330619203e-05, + "loss": 0.1042, + "step": 19110 + }, + { + "epoch": 1.8102632077258094, + "grad_norm": 0.4073845148086548, + "learning_rate": 1.2758947169096763e-05, + "loss": 0.1124, + "step": 19120 + }, + { + "epoch": 1.8112099981064191, + "grad_norm": 0.2993011474609375, + "learning_rate": 1.2755160007574323e-05, + "loss": 0.0977, + "step": 19130 + }, + { + "epoch": 1.8121567884870289, + "grad_norm": 0.3964138925075531, + "learning_rate": 1.2751372846051886e-05, + "loss": 0.1123, + "step": 19140 + }, + { + "epoch": 1.8131035788676386, + "grad_norm": 0.39932727813720703, + "learning_rate": 1.2747585684529446e-05, + "loss": 0.1022, + "step": 19150 + }, + { + "epoch": 1.8140503692482484, + "grad_norm": 0.24434198439121246, + "learning_rate": 1.2743798523007006e-05, + "loss": 0.0994, + "step": 19160 + }, + { + "epoch": 1.814997159628858, + "grad_norm": 0.431255042552948, + "learning_rate": 1.2740011361484568e-05, + "loss": 0.1157, + "step": 19170 + }, + { + "epoch": 1.8159439500094678, + "grad_norm": 0.35100871324539185, + "learning_rate": 1.2736224199962128e-05, + "loss": 0.1129, + "step": 19180 + }, + { + "epoch": 1.8168907403900776, + "grad_norm": 0.357479989528656, + "learning_rate": 1.2732437038439692e-05, + "loss": 0.1034, + "step": 19190 + }, + { + "epoch": 1.8178375307706873, + "grad_norm": 0.31612756848335266, + "learning_rate": 1.2728649876917252e-05, + "loss": 0.1147, + "step": 19200 + }, + { + "epoch": 1.818784321151297, + "grad_norm": 0.3879270851612091, + "learning_rate": 1.2724862715394812e-05, + "loss": 0.1071, + "step": 19210 + }, + { + "epoch": 1.8197311115319068, + "grad_norm": 0.3949774205684662, + "learning_rate": 1.2721075553872373e-05, + "loss": 0.1131, + "step": 19220 + }, + { + "epoch": 1.8206779019125165, + "grad_norm": 0.3450389802455902, + "learning_rate": 1.2717288392349935e-05, + "loss": 0.1172, + "step": 19230 + }, + { + "epoch": 1.8216246922931263, + "grad_norm": 0.30399641394615173, + "learning_rate": 1.2713501230827497e-05, + "loss": 0.1061, + "step": 19240 + }, + { + "epoch": 1.822571482673736, + "grad_norm": 0.3874061703681946, + "learning_rate": 1.2709714069305057e-05, + "loss": 0.1176, + "step": 19250 + }, + { + "epoch": 1.8235182730543458, + "grad_norm": 0.3001534640789032, + "learning_rate": 1.2705926907782617e-05, + "loss": 0.1109, + "step": 19260 + }, + { + "epoch": 1.8244650634349555, + "grad_norm": 0.3698557913303375, + "learning_rate": 1.2702139746260179e-05, + "loss": 0.1094, + "step": 19270 + }, + { + "epoch": 1.8254118538155653, + "grad_norm": 0.33681055903434753, + "learning_rate": 1.269835258473774e-05, + "loss": 0.1043, + "step": 19280 + }, + { + "epoch": 1.826358644196175, + "grad_norm": 0.33553245663642883, + "learning_rate": 1.2694565423215302e-05, + "loss": 0.109, + "step": 19290 + }, + { + "epoch": 1.8273054345767847, + "grad_norm": 0.3944529592990875, + "learning_rate": 1.2690778261692862e-05, + "loss": 0.1056, + "step": 19300 + }, + { + "epoch": 1.8282522249573945, + "grad_norm": 0.34673944115638733, + "learning_rate": 1.2686991100170422e-05, + "loss": 0.098, + "step": 19310 + }, + { + "epoch": 1.8291990153380042, + "grad_norm": 0.46945884823799133, + "learning_rate": 1.2683203938647984e-05, + "loss": 0.0998, + "step": 19320 + }, + { + "epoch": 1.830145805718614, + "grad_norm": 0.3739577531814575, + "learning_rate": 1.2679416777125546e-05, + "loss": 0.1151, + "step": 19330 + }, + { + "epoch": 1.8310925960992237, + "grad_norm": 0.32211214303970337, + "learning_rate": 1.2675629615603106e-05, + "loss": 0.122, + "step": 19340 + }, + { + "epoch": 1.8320393864798334, + "grad_norm": 0.3556137979030609, + "learning_rate": 1.2671842454080668e-05, + "loss": 0.1027, + "step": 19350 + }, + { + "epoch": 1.8329861768604432, + "grad_norm": 0.3246605098247528, + "learning_rate": 1.2668055292558228e-05, + "loss": 0.1168, + "step": 19360 + }, + { + "epoch": 1.833932967241053, + "grad_norm": 0.3900707960128784, + "learning_rate": 1.2664268131035791e-05, + "loss": 0.1166, + "step": 19370 + }, + { + "epoch": 1.8348797576216627, + "grad_norm": 0.4111650288105011, + "learning_rate": 1.2660480969513351e-05, + "loss": 0.1055, + "step": 19380 + }, + { + "epoch": 1.8358265480022724, + "grad_norm": 0.29402846097946167, + "learning_rate": 1.2656693807990911e-05, + "loss": 0.1027, + "step": 19390 + }, + { + "epoch": 1.8367733383828821, + "grad_norm": 0.4428441822528839, + "learning_rate": 1.2652906646468473e-05, + "loss": 0.1067, + "step": 19400 + }, + { + "epoch": 1.837720128763492, + "grad_norm": 0.40585818886756897, + "learning_rate": 1.2649119484946033e-05, + "loss": 0.1085, + "step": 19410 + }, + { + "epoch": 1.8386669191441016, + "grad_norm": 0.34224626421928406, + "learning_rate": 1.2645332323423596e-05, + "loss": 0.1152, + "step": 19420 + }, + { + "epoch": 1.8396137095247114, + "grad_norm": 0.34518611431121826, + "learning_rate": 1.2641545161901156e-05, + "loss": 0.1073, + "step": 19430 + }, + { + "epoch": 1.8405604999053211, + "grad_norm": 0.4500565826892853, + "learning_rate": 1.2637758000378717e-05, + "loss": 0.1136, + "step": 19440 + }, + { + "epoch": 1.8415072902859309, + "grad_norm": 0.3417593538761139, + "learning_rate": 1.2633970838856278e-05, + "loss": 0.1154, + "step": 19450 + }, + { + "epoch": 1.8424540806665404, + "grad_norm": 0.3833911120891571, + "learning_rate": 1.263018367733384e-05, + "loss": 0.1125, + "step": 19460 + }, + { + "epoch": 1.8434008710471501, + "grad_norm": 0.34328556060791016, + "learning_rate": 1.2626396515811402e-05, + "loss": 0.1094, + "step": 19470 + }, + { + "epoch": 1.8443476614277599, + "grad_norm": 0.3361929953098297, + "learning_rate": 1.2622609354288962e-05, + "loss": 0.1104, + "step": 19480 + }, + { + "epoch": 1.8452944518083696, + "grad_norm": 0.3236779272556305, + "learning_rate": 1.2618822192766522e-05, + "loss": 0.1153, + "step": 19490 + }, + { + "epoch": 1.8462412421889793, + "grad_norm": 0.33372533321380615, + "learning_rate": 1.2615035031244084e-05, + "loss": 0.1055, + "step": 19500 + }, + { + "epoch": 1.847188032569589, + "grad_norm": 0.28622475266456604, + "learning_rate": 1.2611247869721645e-05, + "loss": 0.1072, + "step": 19510 + }, + { + "epoch": 1.8481348229501988, + "grad_norm": 0.45177966356277466, + "learning_rate": 1.2607460708199205e-05, + "loss": 0.1142, + "step": 19520 + }, + { + "epoch": 1.8490816133308086, + "grad_norm": 0.40346699953079224, + "learning_rate": 1.2603673546676767e-05, + "loss": 0.1147, + "step": 19530 + }, + { + "epoch": 1.8500284037114183, + "grad_norm": 0.3747116029262543, + "learning_rate": 1.2599886385154327e-05, + "loss": 0.1143, + "step": 19540 + }, + { + "epoch": 1.850975194092028, + "grad_norm": 0.37933945655822754, + "learning_rate": 1.2596099223631887e-05, + "loss": 0.1021, + "step": 19550 + }, + { + "epoch": 1.8519219844726378, + "grad_norm": 0.34756040573120117, + "learning_rate": 1.259231206210945e-05, + "loss": 0.1165, + "step": 19560 + }, + { + "epoch": 1.8528687748532475, + "grad_norm": 0.32047250866889954, + "learning_rate": 1.258852490058701e-05, + "loss": 0.1062, + "step": 19570 + }, + { + "epoch": 1.8538155652338573, + "grad_norm": 0.3668143153190613, + "learning_rate": 1.2584737739064572e-05, + "loss": 0.1149, + "step": 19580 + }, + { + "epoch": 1.854762355614467, + "grad_norm": 0.3343770205974579, + "learning_rate": 1.2580950577542133e-05, + "loss": 0.1055, + "step": 19590 + }, + { + "epoch": 1.8557091459950765, + "grad_norm": 0.3434257507324219, + "learning_rate": 1.2577163416019696e-05, + "loss": 0.1035, + "step": 19600 + }, + { + "epoch": 1.8566559363756863, + "grad_norm": 0.3178967535495758, + "learning_rate": 1.2573376254497256e-05, + "loss": 0.0951, + "step": 19610 + }, + { + "epoch": 1.857602726756296, + "grad_norm": 0.3695988059043884, + "learning_rate": 1.2569589092974816e-05, + "loss": 0.1224, + "step": 19620 + }, + { + "epoch": 1.8585495171369057, + "grad_norm": 0.3733889162540436, + "learning_rate": 1.2565801931452378e-05, + "loss": 0.1029, + "step": 19630 + }, + { + "epoch": 1.8594963075175155, + "grad_norm": 0.3214036822319031, + "learning_rate": 1.2562014769929938e-05, + "loss": 0.115, + "step": 19640 + }, + { + "epoch": 1.8604430978981252, + "grad_norm": 0.2950606942176819, + "learning_rate": 1.2558227608407501e-05, + "loss": 0.1078, + "step": 19650 + }, + { + "epoch": 1.861389888278735, + "grad_norm": 0.37673529982566833, + "learning_rate": 1.2554440446885061e-05, + "loss": 0.1129, + "step": 19660 + }, + { + "epoch": 1.8623366786593447, + "grad_norm": 0.36980605125427246, + "learning_rate": 1.2550653285362621e-05, + "loss": 0.1034, + "step": 19670 + }, + { + "epoch": 1.8632834690399545, + "grad_norm": 0.2896386384963989, + "learning_rate": 1.2546866123840183e-05, + "loss": 0.1164, + "step": 19680 + }, + { + "epoch": 1.8642302594205642, + "grad_norm": 0.299468457698822, + "learning_rate": 1.2543078962317743e-05, + "loss": 0.1094, + "step": 19690 + }, + { + "epoch": 1.865177049801174, + "grad_norm": 0.4312908351421356, + "learning_rate": 1.2539291800795305e-05, + "loss": 0.1132, + "step": 19700 + }, + { + "epoch": 1.8661238401817837, + "grad_norm": 0.29344403743743896, + "learning_rate": 1.2535504639272867e-05, + "loss": 0.1007, + "step": 19710 + }, + { + "epoch": 1.8670706305623934, + "grad_norm": 0.37645217776298523, + "learning_rate": 1.2531717477750427e-05, + "loss": 0.1109, + "step": 19720 + }, + { + "epoch": 1.8680174209430032, + "grad_norm": 0.40150484442710876, + "learning_rate": 1.2527930316227987e-05, + "loss": 0.1068, + "step": 19730 + }, + { + "epoch": 1.868964211323613, + "grad_norm": 0.3141026198863983, + "learning_rate": 1.252414315470555e-05, + "loss": 0.1082, + "step": 19740 + }, + { + "epoch": 1.8699110017042226, + "grad_norm": 0.4085179269313812, + "learning_rate": 1.252035599318311e-05, + "loss": 0.1016, + "step": 19750 + }, + { + "epoch": 1.8708577920848324, + "grad_norm": 0.33192178606987, + "learning_rate": 1.2516568831660672e-05, + "loss": 0.1054, + "step": 19760 + }, + { + "epoch": 1.8718045824654421, + "grad_norm": 0.3281863033771515, + "learning_rate": 1.2512781670138232e-05, + "loss": 0.1093, + "step": 19770 + }, + { + "epoch": 1.8727513728460519, + "grad_norm": 0.27537137269973755, + "learning_rate": 1.2508994508615792e-05, + "loss": 0.1078, + "step": 19780 + }, + { + "epoch": 1.8736981632266616, + "grad_norm": 0.34461522102355957, + "learning_rate": 1.2505207347093356e-05, + "loss": 0.1146, + "step": 19790 + }, + { + "epoch": 1.8746449536072713, + "grad_norm": 0.3809272050857544, + "learning_rate": 1.2501420185570916e-05, + "loss": 0.1051, + "step": 19800 + }, + { + "epoch": 1.875591743987881, + "grad_norm": 0.4907462000846863, + "learning_rate": 1.2497633024048477e-05, + "loss": 0.1085, + "step": 19810 + }, + { + "epoch": 1.8765385343684908, + "grad_norm": 0.32635852694511414, + "learning_rate": 1.2493845862526037e-05, + "loss": 0.111, + "step": 19820 + }, + { + "epoch": 1.8774853247491006, + "grad_norm": 0.41782793402671814, + "learning_rate": 1.2490058701003597e-05, + "loss": 0.1049, + "step": 19830 + }, + { + "epoch": 1.8784321151297103, + "grad_norm": 0.34212571382522583, + "learning_rate": 1.2486271539481161e-05, + "loss": 0.1073, + "step": 19840 + }, + { + "epoch": 1.87937890551032, + "grad_norm": 0.3190506100654602, + "learning_rate": 1.2482484377958721e-05, + "loss": 0.0979, + "step": 19850 + }, + { + "epoch": 1.8803256958909298, + "grad_norm": 0.34766045212745667, + "learning_rate": 1.2478697216436283e-05, + "loss": 0.1017, + "step": 19860 + }, + { + "epoch": 1.8812724862715395, + "grad_norm": 0.3498804271221161, + "learning_rate": 1.2474910054913843e-05, + "loss": 0.103, + "step": 19870 + }, + { + "epoch": 1.8822192766521493, + "grad_norm": 0.38651737570762634, + "learning_rate": 1.2471122893391404e-05, + "loss": 0.1176, + "step": 19880 + }, + { + "epoch": 1.883166067032759, + "grad_norm": 0.2624821364879608, + "learning_rate": 1.2467335731868966e-05, + "loss": 0.1112, + "step": 19890 + }, + { + "epoch": 1.8841128574133688, + "grad_norm": 0.3932912349700928, + "learning_rate": 1.2463548570346526e-05, + "loss": 0.1133, + "step": 19900 + }, + { + "epoch": 1.8850596477939785, + "grad_norm": 0.3405761420726776, + "learning_rate": 1.2459761408824086e-05, + "loss": 0.1193, + "step": 19910 + }, + { + "epoch": 1.8860064381745882, + "grad_norm": 0.31691908836364746, + "learning_rate": 1.2455974247301648e-05, + "loss": 0.1105, + "step": 19920 + }, + { + "epoch": 1.886953228555198, + "grad_norm": 0.3382716178894043, + "learning_rate": 1.245218708577921e-05, + "loss": 0.106, + "step": 19930 + }, + { + "epoch": 1.8879000189358077, + "grad_norm": 0.3085930645465851, + "learning_rate": 1.2448399924256772e-05, + "loss": 0.0963, + "step": 19940 + }, + { + "epoch": 1.8888468093164175, + "grad_norm": 0.3540954887866974, + "learning_rate": 1.2444612762734332e-05, + "loss": 0.1054, + "step": 19950 + }, + { + "epoch": 1.8897935996970272, + "grad_norm": 0.37571460008621216, + "learning_rate": 1.2440825601211892e-05, + "loss": 0.1063, + "step": 19960 + }, + { + "epoch": 1.890740390077637, + "grad_norm": 0.4311580955982208, + "learning_rate": 1.2437038439689453e-05, + "loss": 0.1046, + "step": 19970 + }, + { + "epoch": 1.8916871804582467, + "grad_norm": 0.4840514659881592, + "learning_rate": 1.2433251278167015e-05, + "loss": 0.1126, + "step": 19980 + }, + { + "epoch": 1.8926339708388564, + "grad_norm": 0.33492323756217957, + "learning_rate": 1.2429464116644577e-05, + "loss": 0.1036, + "step": 19990 + }, + { + "epoch": 1.8935807612194662, + "grad_norm": 0.3419467806816101, + "learning_rate": 1.2425676955122137e-05, + "loss": 0.1008, + "step": 20000 + }, + { + "epoch": 1.894527551600076, + "grad_norm": 0.35411712527275085, + "learning_rate": 1.2421889793599697e-05, + "loss": 0.0967, + "step": 20010 + }, + { + "epoch": 1.8954743419806854, + "grad_norm": 0.508010983467102, + "learning_rate": 1.241810263207726e-05, + "loss": 0.1017, + "step": 20020 + }, + { + "epoch": 1.8964211323612952, + "grad_norm": 0.32599472999572754, + "learning_rate": 1.241431547055482e-05, + "loss": 0.1, + "step": 20030 + }, + { + "epoch": 1.897367922741905, + "grad_norm": 0.401747465133667, + "learning_rate": 1.2410528309032382e-05, + "loss": 0.1251, + "step": 20040 + }, + { + "epoch": 1.8983147131225147, + "grad_norm": 0.37528371810913086, + "learning_rate": 1.2406741147509942e-05, + "loss": 0.1113, + "step": 20050 + }, + { + "epoch": 1.8992615035031244, + "grad_norm": 0.396750271320343, + "learning_rate": 1.2402953985987502e-05, + "loss": 0.1058, + "step": 20060 + }, + { + "epoch": 1.9002082938837341, + "grad_norm": 0.38264715671539307, + "learning_rate": 1.2399166824465066e-05, + "loss": 0.1173, + "step": 20070 + }, + { + "epoch": 1.9011550842643439, + "grad_norm": 0.2985268235206604, + "learning_rate": 1.2395379662942626e-05, + "loss": 0.1064, + "step": 20080 + }, + { + "epoch": 1.9021018746449536, + "grad_norm": 0.40297383069992065, + "learning_rate": 1.2391592501420186e-05, + "loss": 0.1114, + "step": 20090 + }, + { + "epoch": 1.9030486650255634, + "grad_norm": 0.344043493270874, + "learning_rate": 1.2387805339897748e-05, + "loss": 0.1038, + "step": 20100 + }, + { + "epoch": 1.903995455406173, + "grad_norm": 0.35050851106643677, + "learning_rate": 1.2384018178375308e-05, + "loss": 0.103, + "step": 20110 + }, + { + "epoch": 1.9049422457867828, + "grad_norm": 0.35159409046173096, + "learning_rate": 1.2380231016852871e-05, + "loss": 0.1114, + "step": 20120 + }, + { + "epoch": 1.9058890361673926, + "grad_norm": 0.3516301214694977, + "learning_rate": 1.2376443855330431e-05, + "loss": 0.1026, + "step": 20130 + }, + { + "epoch": 1.9068358265480023, + "grad_norm": 0.3457273542881012, + "learning_rate": 1.2372656693807991e-05, + "loss": 0.104, + "step": 20140 + }, + { + "epoch": 1.9077826169286118, + "grad_norm": 0.39187681674957275, + "learning_rate": 1.2368869532285553e-05, + "loss": 0.1091, + "step": 20150 + }, + { + "epoch": 1.9087294073092216, + "grad_norm": 0.41298121213912964, + "learning_rate": 1.2365082370763115e-05, + "loss": 0.11, + "step": 20160 + }, + { + "epoch": 1.9096761976898313, + "grad_norm": 0.47788843512535095, + "learning_rate": 1.2361295209240676e-05, + "loss": 0.1054, + "step": 20170 + }, + { + "epoch": 1.910622988070441, + "grad_norm": 0.45805585384368896, + "learning_rate": 1.2357508047718236e-05, + "loss": 0.1067, + "step": 20180 + }, + { + "epoch": 1.9115697784510508, + "grad_norm": 0.27484065294265747, + "learning_rate": 1.2353720886195796e-05, + "loss": 0.1078, + "step": 20190 + }, + { + "epoch": 1.9125165688316605, + "grad_norm": 0.39492151141166687, + "learning_rate": 1.2349933724673358e-05, + "loss": 0.1123, + "step": 20200 + }, + { + "epoch": 1.9134633592122703, + "grad_norm": 0.29254859685897827, + "learning_rate": 1.234614656315092e-05, + "loss": 0.1099, + "step": 20210 + }, + { + "epoch": 1.91441014959288, + "grad_norm": 0.41775816679000854, + "learning_rate": 1.2342359401628482e-05, + "loss": 0.106, + "step": 20220 + }, + { + "epoch": 1.9153569399734898, + "grad_norm": 0.30593568086624146, + "learning_rate": 1.2338572240106042e-05, + "loss": 0.1139, + "step": 20230 + }, + { + "epoch": 1.9163037303540995, + "grad_norm": 0.38288331031799316, + "learning_rate": 1.2334785078583602e-05, + "loss": 0.111, + "step": 20240 + }, + { + "epoch": 1.9172505207347093, + "grad_norm": 0.3679775297641754, + "learning_rate": 1.2330997917061162e-05, + "loss": 0.108, + "step": 20250 + }, + { + "epoch": 1.918197311115319, + "grad_norm": 0.33158156275749207, + "learning_rate": 1.2327210755538725e-05, + "loss": 0.1157, + "step": 20260 + }, + { + "epoch": 1.9191441014959287, + "grad_norm": 0.32633069157600403, + "learning_rate": 1.2323423594016285e-05, + "loss": 0.0991, + "step": 20270 + }, + { + "epoch": 1.9200908918765385, + "grad_norm": 0.3557127118110657, + "learning_rate": 1.2319636432493847e-05, + "loss": 0.1048, + "step": 20280 + }, + { + "epoch": 1.9210376822571482, + "grad_norm": 0.30199623107910156, + "learning_rate": 1.2315849270971407e-05, + "loss": 0.1035, + "step": 20290 + }, + { + "epoch": 1.921984472637758, + "grad_norm": 0.3319079279899597, + "learning_rate": 1.231206210944897e-05, + "loss": 0.1095, + "step": 20300 + }, + { + "epoch": 1.9229312630183677, + "grad_norm": 0.3735467195510864, + "learning_rate": 1.230827494792653e-05, + "loss": 0.1107, + "step": 20310 + }, + { + "epoch": 1.9238780533989774, + "grad_norm": 0.3348022401332855, + "learning_rate": 1.230448778640409e-05, + "loss": 0.0992, + "step": 20320 + }, + { + "epoch": 1.9248248437795872, + "grad_norm": 0.37244561314582825, + "learning_rate": 1.2300700624881652e-05, + "loss": 0.1044, + "step": 20330 + }, + { + "epoch": 1.925771634160197, + "grad_norm": 0.36787769198417664, + "learning_rate": 1.2296913463359212e-05, + "loss": 0.0992, + "step": 20340 + }, + { + "epoch": 1.9267184245408067, + "grad_norm": 0.4016842842102051, + "learning_rate": 1.2293126301836776e-05, + "loss": 0.109, + "step": 20350 + }, + { + "epoch": 1.9276652149214164, + "grad_norm": 0.39530324935913086, + "learning_rate": 1.2289339140314336e-05, + "loss": 0.1057, + "step": 20360 + }, + { + "epoch": 1.9286120053020261, + "grad_norm": 0.4053460955619812, + "learning_rate": 1.2285551978791896e-05, + "loss": 0.1048, + "step": 20370 + }, + { + "epoch": 1.9295587956826359, + "grad_norm": 0.4953003525733948, + "learning_rate": 1.2281764817269458e-05, + "loss": 0.1077, + "step": 20380 + }, + { + "epoch": 1.9305055860632456, + "grad_norm": 0.4115254878997803, + "learning_rate": 1.2277977655747018e-05, + "loss": 0.1184, + "step": 20390 + }, + { + "epoch": 1.9314523764438554, + "grad_norm": 0.27491551637649536, + "learning_rate": 1.2274190494224581e-05, + "loss": 0.1114, + "step": 20400 + }, + { + "epoch": 1.932399166824465, + "grad_norm": 0.3053814470767975, + "learning_rate": 1.2270403332702141e-05, + "loss": 0.1023, + "step": 20410 + }, + { + "epoch": 1.9333459572050748, + "grad_norm": 0.3862939178943634, + "learning_rate": 1.2266616171179701e-05, + "loss": 0.11, + "step": 20420 + }, + { + "epoch": 1.9342927475856846, + "grad_norm": 0.3790120482444763, + "learning_rate": 1.2262829009657261e-05, + "loss": 0.1107, + "step": 20430 + }, + { + "epoch": 1.9352395379662943, + "grad_norm": 0.35550233721733093, + "learning_rate": 1.2259041848134825e-05, + "loss": 0.0923, + "step": 20440 + }, + { + "epoch": 1.936186328346904, + "grad_norm": 0.3539644777774811, + "learning_rate": 1.2255254686612385e-05, + "loss": 0.1112, + "step": 20450 + }, + { + "epoch": 1.9371331187275138, + "grad_norm": 0.4248740077018738, + "learning_rate": 1.2251467525089947e-05, + "loss": 0.1124, + "step": 20460 + }, + { + "epoch": 1.9380799091081236, + "grad_norm": 0.305600643157959, + "learning_rate": 1.2247680363567507e-05, + "loss": 0.1069, + "step": 20470 + }, + { + "epoch": 1.9390266994887333, + "grad_norm": 0.3527030348777771, + "learning_rate": 1.2243893202045067e-05, + "loss": 0.1074, + "step": 20480 + }, + { + "epoch": 1.939973489869343, + "grad_norm": 0.38150569796562195, + "learning_rate": 1.224010604052263e-05, + "loss": 0.0973, + "step": 20490 + }, + { + "epoch": 1.9409202802499528, + "grad_norm": 0.2797296643257141, + "learning_rate": 1.223631887900019e-05, + "loss": 0.1099, + "step": 20500 + }, + { + "epoch": 1.9418670706305625, + "grad_norm": 0.38830894231796265, + "learning_rate": 1.2232531717477752e-05, + "loss": 0.1088, + "step": 20510 + }, + { + "epoch": 1.9428138610111723, + "grad_norm": 0.3484981954097748, + "learning_rate": 1.2228744555955312e-05, + "loss": 0.1084, + "step": 20520 + }, + { + "epoch": 1.943760651391782, + "grad_norm": 0.45552173256874084, + "learning_rate": 1.2224957394432872e-05, + "loss": 0.1053, + "step": 20530 + }, + { + "epoch": 1.9447074417723917, + "grad_norm": 0.324050635099411, + "learning_rate": 1.2221170232910435e-05, + "loss": 0.0998, + "step": 20540 + }, + { + "epoch": 1.9456542321530015, + "grad_norm": 0.35266685485839844, + "learning_rate": 1.2217383071387996e-05, + "loss": 0.1083, + "step": 20550 + }, + { + "epoch": 1.9466010225336112, + "grad_norm": 0.4260408878326416, + "learning_rate": 1.2213595909865557e-05, + "loss": 0.1085, + "step": 20560 + }, + { + "epoch": 1.9475478129142207, + "grad_norm": 0.3600156009197235, + "learning_rate": 1.2209808748343117e-05, + "loss": 0.1134, + "step": 20570 + }, + { + "epoch": 1.9484946032948305, + "grad_norm": 0.3416011929512024, + "learning_rate": 1.220602158682068e-05, + "loss": 0.097, + "step": 20580 + }, + { + "epoch": 1.9494413936754402, + "grad_norm": 0.3432440459728241, + "learning_rate": 1.220223442529824e-05, + "loss": 0.1001, + "step": 20590 + }, + { + "epoch": 1.95038818405605, + "grad_norm": 0.32206058502197266, + "learning_rate": 1.2198447263775801e-05, + "loss": 0.1012, + "step": 20600 + }, + { + "epoch": 1.9513349744366597, + "grad_norm": 0.382606565952301, + "learning_rate": 1.2194660102253361e-05, + "loss": 0.1126, + "step": 20610 + }, + { + "epoch": 1.9522817648172694, + "grad_norm": 0.3285946249961853, + "learning_rate": 1.2190872940730923e-05, + "loss": 0.1086, + "step": 20620 + }, + { + "epoch": 1.9532285551978792, + "grad_norm": 0.30580756068229675, + "learning_rate": 1.2187085779208484e-05, + "loss": 0.1067, + "step": 20630 + }, + { + "epoch": 1.954175345578489, + "grad_norm": 0.31543785333633423, + "learning_rate": 1.2183298617686046e-05, + "loss": 0.1095, + "step": 20640 + }, + { + "epoch": 1.9551221359590987, + "grad_norm": 0.4250805974006653, + "learning_rate": 1.2179511456163606e-05, + "loss": 0.1133, + "step": 20650 + }, + { + "epoch": 1.9560689263397084, + "grad_norm": 0.3352222442626953, + "learning_rate": 1.2175724294641166e-05, + "loss": 0.0984, + "step": 20660 + }, + { + "epoch": 1.9570157167203182, + "grad_norm": 0.3887184262275696, + "learning_rate": 1.2171937133118728e-05, + "loss": 0.1153, + "step": 20670 + }, + { + "epoch": 1.957962507100928, + "grad_norm": 0.37712177634239197, + "learning_rate": 1.216814997159629e-05, + "loss": 0.1165, + "step": 20680 + }, + { + "epoch": 1.9589092974815376, + "grad_norm": 0.3524229824542999, + "learning_rate": 1.2164362810073851e-05, + "loss": 0.1024, + "step": 20690 + }, + { + "epoch": 1.9598560878621472, + "grad_norm": 0.3581532835960388, + "learning_rate": 1.2160575648551412e-05, + "loss": 0.1115, + "step": 20700 + }, + { + "epoch": 1.960802878242757, + "grad_norm": 0.30578964948654175, + "learning_rate": 1.2156788487028972e-05, + "loss": 0.1082, + "step": 20710 + }, + { + "epoch": 1.9617496686233666, + "grad_norm": 0.3593074381351471, + "learning_rate": 1.2153001325506535e-05, + "loss": 0.1068, + "step": 20720 + }, + { + "epoch": 1.9626964590039764, + "grad_norm": 0.3663315773010254, + "learning_rate": 1.2149214163984095e-05, + "loss": 0.1089, + "step": 20730 + }, + { + "epoch": 1.9636432493845861, + "grad_norm": 0.2964952886104584, + "learning_rate": 1.2145427002461657e-05, + "loss": 0.1073, + "step": 20740 + }, + { + "epoch": 1.9645900397651959, + "grad_norm": 0.38032329082489014, + "learning_rate": 1.2141639840939217e-05, + "loss": 0.1098, + "step": 20750 + }, + { + "epoch": 1.9655368301458056, + "grad_norm": 0.4049619436264038, + "learning_rate": 1.2137852679416777e-05, + "loss": 0.1162, + "step": 20760 + }, + { + "epoch": 1.9664836205264153, + "grad_norm": 0.39563053846359253, + "learning_rate": 1.213406551789434e-05, + "loss": 0.1091, + "step": 20770 + }, + { + "epoch": 1.967430410907025, + "grad_norm": 0.2906529903411865, + "learning_rate": 1.21302783563719e-05, + "loss": 0.1143, + "step": 20780 + }, + { + "epoch": 1.9683772012876348, + "grad_norm": 0.30797553062438965, + "learning_rate": 1.212649119484946e-05, + "loss": 0.104, + "step": 20790 + }, + { + "epoch": 1.9693239916682446, + "grad_norm": 0.3349768817424774, + "learning_rate": 1.2122704033327022e-05, + "loss": 0.1071, + "step": 20800 + }, + { + "epoch": 1.9702707820488543, + "grad_norm": 0.3945980370044708, + "learning_rate": 1.2118916871804582e-05, + "loss": 0.1191, + "step": 20810 + }, + { + "epoch": 1.971217572429464, + "grad_norm": 0.35294216871261597, + "learning_rate": 1.2115129710282146e-05, + "loss": 0.105, + "step": 20820 + }, + { + "epoch": 1.9721643628100738, + "grad_norm": 0.38833561539649963, + "learning_rate": 1.2111342548759706e-05, + "loss": 0.12, + "step": 20830 + }, + { + "epoch": 1.9731111531906835, + "grad_norm": 0.2893434464931488, + "learning_rate": 1.2107555387237266e-05, + "loss": 0.104, + "step": 20840 + }, + { + "epoch": 1.9740579435712933, + "grad_norm": 0.2558499276638031, + "learning_rate": 1.2103768225714828e-05, + "loss": 0.1068, + "step": 20850 + }, + { + "epoch": 1.975004733951903, + "grad_norm": 0.2818835973739624, + "learning_rate": 1.209998106419239e-05, + "loss": 0.11, + "step": 20860 + }, + { + "epoch": 1.9759515243325128, + "grad_norm": 0.29642802476882935, + "learning_rate": 1.2096193902669951e-05, + "loss": 0.1008, + "step": 20870 + }, + { + "epoch": 1.9768983147131225, + "grad_norm": 0.3638819456100464, + "learning_rate": 1.2092406741147511e-05, + "loss": 0.1125, + "step": 20880 + }, + { + "epoch": 1.9778451050937322, + "grad_norm": 0.4232320487499237, + "learning_rate": 1.2088619579625071e-05, + "loss": 0.1142, + "step": 20890 + }, + { + "epoch": 1.978791895474342, + "grad_norm": 0.26880526542663574, + "learning_rate": 1.2084832418102633e-05, + "loss": 0.1018, + "step": 20900 + }, + { + "epoch": 1.9797386858549517, + "grad_norm": 0.35345229506492615, + "learning_rate": 1.2081045256580195e-05, + "loss": 0.0994, + "step": 20910 + }, + { + "epoch": 1.9806854762355615, + "grad_norm": 0.30742210149765015, + "learning_rate": 1.2077258095057756e-05, + "loss": 0.1117, + "step": 20920 + }, + { + "epoch": 1.9816322666161712, + "grad_norm": 0.37625306844711304, + "learning_rate": 1.2073470933535316e-05, + "loss": 0.106, + "step": 20930 + }, + { + "epoch": 1.982579056996781, + "grad_norm": 0.35394927859306335, + "learning_rate": 1.2069683772012876e-05, + "loss": 0.1196, + "step": 20940 + }, + { + "epoch": 1.9835258473773907, + "grad_norm": 0.4539976119995117, + "learning_rate": 1.2065896610490438e-05, + "loss": 0.1081, + "step": 20950 + }, + { + "epoch": 1.9844726377580004, + "grad_norm": 0.37135881185531616, + "learning_rate": 1.2062109448968e-05, + "loss": 0.1178, + "step": 20960 + }, + { + "epoch": 1.9854194281386102, + "grad_norm": 0.29517096281051636, + "learning_rate": 1.205832228744556e-05, + "loss": 0.1093, + "step": 20970 + }, + { + "epoch": 1.98636621851922, + "grad_norm": 0.3522278666496277, + "learning_rate": 1.2054535125923122e-05, + "loss": 0.1173, + "step": 20980 + }, + { + "epoch": 1.9873130088998296, + "grad_norm": 0.3382992744445801, + "learning_rate": 1.2050747964400682e-05, + "loss": 0.1081, + "step": 20990 + }, + { + "epoch": 1.9882597992804394, + "grad_norm": 0.3614194989204407, + "learning_rate": 1.2046960802878245e-05, + "loss": 0.103, + "step": 21000 + }, + { + "epoch": 1.9892065896610491, + "grad_norm": 0.29812687635421753, + "learning_rate": 1.2043173641355805e-05, + "loss": 0.105, + "step": 21010 + }, + { + "epoch": 1.9901533800416589, + "grad_norm": 0.5184694528579712, + "learning_rate": 1.2039386479833365e-05, + "loss": 0.1009, + "step": 21020 + }, + { + "epoch": 1.9911001704222686, + "grad_norm": 0.4298660159111023, + "learning_rate": 1.2035599318310927e-05, + "loss": 0.1127, + "step": 21030 + }, + { + "epoch": 1.9920469608028784, + "grad_norm": 0.32132789492607117, + "learning_rate": 1.2031812156788487e-05, + "loss": 0.1092, + "step": 21040 + }, + { + "epoch": 1.992993751183488, + "grad_norm": 0.4802221953868866, + "learning_rate": 1.202802499526605e-05, + "loss": 0.1194, + "step": 21050 + }, + { + "epoch": 1.9939405415640978, + "grad_norm": 0.32533472776412964, + "learning_rate": 1.202423783374361e-05, + "loss": 0.0933, + "step": 21060 + }, + { + "epoch": 1.9948873319447076, + "grad_norm": 0.3043247163295746, + "learning_rate": 1.202045067222117e-05, + "loss": 0.1093, + "step": 21070 + }, + { + "epoch": 1.9958341223253173, + "grad_norm": 0.36251574754714966, + "learning_rate": 1.2016663510698732e-05, + "loss": 0.1098, + "step": 21080 + }, + { + "epoch": 1.996780912705927, + "grad_norm": 0.40744641423225403, + "learning_rate": 1.2012876349176292e-05, + "loss": 0.1181, + "step": 21090 + }, + { + "epoch": 1.9977277030865368, + "grad_norm": 0.2982351779937744, + "learning_rate": 1.2009089187653856e-05, + "loss": 0.0982, + "step": 21100 + }, + { + "epoch": 1.9986744934671465, + "grad_norm": 0.4201706051826477, + "learning_rate": 1.2005302026131416e-05, + "loss": 0.1118, + "step": 21110 + }, + { + "epoch": 1.999621283847756, + "grad_norm": 0.26378563046455383, + "learning_rate": 1.2001514864608976e-05, + "loss": 0.1048, + "step": 21120 + }, + { + "epoch": 2.0, + "eval_f1_micro": 0.35933516585247743, + "eval_loss": 0.11070390790700912, + "eval_precision": 0.6185036281623848, + "eval_recall": 0.2532268101250577, + "eval_runtime": 339.0084, + "eval_samples_per_second": 124.613, + "eval_steps_per_second": 7.79, + "step": 21124 + }, + { + "epoch": 2.000568074228366, + "grad_norm": 0.3789435625076294, + "learning_rate": 1.1997727703086538e-05, + "loss": 0.1128, + "step": 21130 + }, + { + "epoch": 2.0015148646089758, + "grad_norm": 0.3707931637763977, + "learning_rate": 1.19939405415641e-05, + "loss": 0.1088, + "step": 21140 + }, + { + "epoch": 2.0024616549895855, + "grad_norm": 0.30625054240226746, + "learning_rate": 1.199015338004166e-05, + "loss": 0.0942, + "step": 21150 + }, + { + "epoch": 2.0034084453701952, + "grad_norm": 0.28570687770843506, + "learning_rate": 1.1986366218519221e-05, + "loss": 0.1063, + "step": 21160 + }, + { + "epoch": 2.004355235750805, + "grad_norm": 0.313829630613327, + "learning_rate": 1.1982579056996781e-05, + "loss": 0.0991, + "step": 21170 + }, + { + "epoch": 2.0053020261314143, + "grad_norm": 0.2512275278568268, + "learning_rate": 1.1978791895474341e-05, + "loss": 0.097, + "step": 21180 + }, + { + "epoch": 2.006248816512024, + "grad_norm": 0.3135952055454254, + "learning_rate": 1.1975004733951905e-05, + "loss": 0.1055, + "step": 21190 + }, + { + "epoch": 2.0071956068926338, + "grad_norm": 0.35371828079223633, + "learning_rate": 1.1971217572429465e-05, + "loss": 0.1033, + "step": 21200 + }, + { + "epoch": 2.0081423972732435, + "grad_norm": 0.31105470657348633, + "learning_rate": 1.1967430410907027e-05, + "loss": 0.1014, + "step": 21210 + }, + { + "epoch": 2.0090891876538532, + "grad_norm": 0.32313773036003113, + "learning_rate": 1.1963643249384587e-05, + "loss": 0.094, + "step": 21220 + }, + { + "epoch": 2.010035978034463, + "grad_norm": 0.31268981099128723, + "learning_rate": 1.1959856087862147e-05, + "loss": 0.0951, + "step": 21230 + }, + { + "epoch": 2.0109827684150727, + "grad_norm": 0.38261574506759644, + "learning_rate": 1.195606892633971e-05, + "loss": 0.0923, + "step": 21240 + }, + { + "epoch": 2.0119295587956825, + "grad_norm": 0.3849813640117645, + "learning_rate": 1.195228176481727e-05, + "loss": 0.1121, + "step": 21250 + }, + { + "epoch": 2.012876349176292, + "grad_norm": 0.4521833062171936, + "learning_rate": 1.1948494603294832e-05, + "loss": 0.1121, + "step": 21260 + }, + { + "epoch": 2.013823139556902, + "grad_norm": 0.3331480622291565, + "learning_rate": 1.1944707441772392e-05, + "loss": 0.0995, + "step": 21270 + }, + { + "epoch": 2.0147699299375117, + "grad_norm": 0.33220431208610535, + "learning_rate": 1.1940920280249955e-05, + "loss": 0.1018, + "step": 21280 + }, + { + "epoch": 2.0157167203181214, + "grad_norm": 0.37220412492752075, + "learning_rate": 1.1937133118727515e-05, + "loss": 0.1042, + "step": 21290 + }, + { + "epoch": 2.016663510698731, + "grad_norm": 0.3874574303627014, + "learning_rate": 1.1933345957205075e-05, + "loss": 0.1085, + "step": 21300 + }, + { + "epoch": 2.017610301079341, + "grad_norm": 0.3492327928543091, + "learning_rate": 1.1929558795682637e-05, + "loss": 0.098, + "step": 21310 + }, + { + "epoch": 2.0185570914599507, + "grad_norm": 0.3867335617542267, + "learning_rate": 1.1925771634160197e-05, + "loss": 0.1019, + "step": 21320 + }, + { + "epoch": 2.0195038818405604, + "grad_norm": 0.386625200510025, + "learning_rate": 1.1921984472637759e-05, + "loss": 0.1014, + "step": 21330 + }, + { + "epoch": 2.02045067222117, + "grad_norm": 0.3031746447086334, + "learning_rate": 1.191819731111532e-05, + "loss": 0.1009, + "step": 21340 + }, + { + "epoch": 2.02139746260178, + "grad_norm": 0.3026321530342102, + "learning_rate": 1.191441014959288e-05, + "loss": 0.1019, + "step": 21350 + }, + { + "epoch": 2.0223442529823896, + "grad_norm": 0.35251784324645996, + "learning_rate": 1.1910622988070441e-05, + "loss": 0.095, + "step": 21360 + }, + { + "epoch": 2.0232910433629994, + "grad_norm": 0.34639063477516174, + "learning_rate": 1.1906835826548003e-05, + "loss": 0.1065, + "step": 21370 + }, + { + "epoch": 2.024237833743609, + "grad_norm": 0.34346747398376465, + "learning_rate": 1.1903048665025564e-05, + "loss": 0.1038, + "step": 21380 + }, + { + "epoch": 2.025184624124219, + "grad_norm": 0.3798876702785492, + "learning_rate": 1.1899261503503126e-05, + "loss": 0.1022, + "step": 21390 + }, + { + "epoch": 2.0261314145048286, + "grad_norm": 0.4110786020755768, + "learning_rate": 1.1895474341980686e-05, + "loss": 0.1039, + "step": 21400 + }, + { + "epoch": 2.0270782048854383, + "grad_norm": 0.3044571578502655, + "learning_rate": 1.1891687180458246e-05, + "loss": 0.0952, + "step": 21410 + }, + { + "epoch": 2.028024995266048, + "grad_norm": 0.39002007246017456, + "learning_rate": 1.188790001893581e-05, + "loss": 0.1061, + "step": 21420 + }, + { + "epoch": 2.028971785646658, + "grad_norm": 0.3900294899940491, + "learning_rate": 1.188411285741337e-05, + "loss": 0.11, + "step": 21430 + }, + { + "epoch": 2.0299185760272676, + "grad_norm": 0.36899328231811523, + "learning_rate": 1.1880325695890931e-05, + "loss": 0.0922, + "step": 21440 + }, + { + "epoch": 2.0308653664078773, + "grad_norm": 0.35091322660446167, + "learning_rate": 1.1876538534368491e-05, + "loss": 0.0999, + "step": 21450 + }, + { + "epoch": 2.031812156788487, + "grad_norm": 0.40123164653778076, + "learning_rate": 1.1872751372846052e-05, + "loss": 0.0946, + "step": 21460 + }, + { + "epoch": 2.0327589471690968, + "grad_norm": 0.2955148220062256, + "learning_rate": 1.1868964211323615e-05, + "loss": 0.1001, + "step": 21470 + }, + { + "epoch": 2.0337057375497065, + "grad_norm": 0.38906633853912354, + "learning_rate": 1.1865177049801175e-05, + "loss": 0.1002, + "step": 21480 + }, + { + "epoch": 2.0346525279303163, + "grad_norm": 0.46951401233673096, + "learning_rate": 1.1861389888278737e-05, + "loss": 0.108, + "step": 21490 + }, + { + "epoch": 2.035599318310926, + "grad_norm": 0.46195003390312195, + "learning_rate": 1.1857602726756297e-05, + "loss": 0.1145, + "step": 21500 + }, + { + "epoch": 2.0365461086915357, + "grad_norm": 0.4008125364780426, + "learning_rate": 1.1853815565233857e-05, + "loss": 0.1036, + "step": 21510 + }, + { + "epoch": 2.0374928990721455, + "grad_norm": 0.4968140721321106, + "learning_rate": 1.185002840371142e-05, + "loss": 0.1086, + "step": 21520 + }, + { + "epoch": 2.038439689452755, + "grad_norm": 0.39275693893432617, + "learning_rate": 1.184624124218898e-05, + "loss": 0.109, + "step": 21530 + }, + { + "epoch": 2.039386479833365, + "grad_norm": 0.34367573261260986, + "learning_rate": 1.184245408066654e-05, + "loss": 0.1009, + "step": 21540 + }, + { + "epoch": 2.0403332702139747, + "grad_norm": 0.4184512794017792, + "learning_rate": 1.1838666919144102e-05, + "loss": 0.1024, + "step": 21550 + }, + { + "epoch": 2.0412800605945844, + "grad_norm": 0.4419008493423462, + "learning_rate": 1.1834879757621664e-05, + "loss": 0.1033, + "step": 21560 + }, + { + "epoch": 2.042226850975194, + "grad_norm": 0.37311434745788574, + "learning_rate": 1.1831092596099226e-05, + "loss": 0.0988, + "step": 21570 + }, + { + "epoch": 2.043173641355804, + "grad_norm": 0.39084023237228394, + "learning_rate": 1.1827305434576786e-05, + "loss": 0.0997, + "step": 21580 + }, + { + "epoch": 2.0441204317364137, + "grad_norm": 0.3225145637989044, + "learning_rate": 1.1823518273054346e-05, + "loss": 0.1041, + "step": 21590 + }, + { + "epoch": 2.0450672221170234, + "grad_norm": 0.44645923376083374, + "learning_rate": 1.1819731111531907e-05, + "loss": 0.1047, + "step": 21600 + }, + { + "epoch": 2.046014012497633, + "grad_norm": 0.39939242601394653, + "learning_rate": 1.181594395000947e-05, + "loss": 0.097, + "step": 21610 + }, + { + "epoch": 2.046960802878243, + "grad_norm": 0.41991719603538513, + "learning_rate": 1.1812156788487031e-05, + "loss": 0.1054, + "step": 21620 + }, + { + "epoch": 2.0479075932588526, + "grad_norm": 0.4075891971588135, + "learning_rate": 1.1808369626964591e-05, + "loss": 0.0975, + "step": 21630 + }, + { + "epoch": 2.0488543836394624, + "grad_norm": 0.3350875675678253, + "learning_rate": 1.1804582465442151e-05, + "loss": 0.0968, + "step": 21640 + }, + { + "epoch": 2.049801174020072, + "grad_norm": 0.33716604113578796, + "learning_rate": 1.1800795303919713e-05, + "loss": 0.1, + "step": 21650 + }, + { + "epoch": 2.050747964400682, + "grad_norm": 0.37090936303138733, + "learning_rate": 1.1797008142397275e-05, + "loss": 0.105, + "step": 21660 + }, + { + "epoch": 2.0516947547812916, + "grad_norm": 0.33265426754951477, + "learning_rate": 1.1793220980874836e-05, + "loss": 0.1001, + "step": 21670 + }, + { + "epoch": 2.0526415451619013, + "grad_norm": 0.3445303440093994, + "learning_rate": 1.1789433819352396e-05, + "loss": 0.0987, + "step": 21680 + }, + { + "epoch": 2.053588335542511, + "grad_norm": 0.42871901392936707, + "learning_rate": 1.1785646657829956e-05, + "loss": 0.0973, + "step": 21690 + }, + { + "epoch": 2.054535125923121, + "grad_norm": 0.3626950979232788, + "learning_rate": 1.178185949630752e-05, + "loss": 0.1076, + "step": 21700 + }, + { + "epoch": 2.0554819163037306, + "grad_norm": 0.4158441126346588, + "learning_rate": 1.177807233478508e-05, + "loss": 0.1075, + "step": 21710 + }, + { + "epoch": 2.0564287066843403, + "grad_norm": 0.43088018894195557, + "learning_rate": 1.177428517326264e-05, + "loss": 0.0975, + "step": 21720 + }, + { + "epoch": 2.0573754970649496, + "grad_norm": 0.3257809281349182, + "learning_rate": 1.1770498011740202e-05, + "loss": 0.1001, + "step": 21730 + }, + { + "epoch": 2.0583222874455593, + "grad_norm": 0.3736395537853241, + "learning_rate": 1.1766710850217762e-05, + "loss": 0.0882, + "step": 21740 + }, + { + "epoch": 2.059269077826169, + "grad_norm": 0.3264060318470001, + "learning_rate": 1.1762923688695325e-05, + "loss": 0.1097, + "step": 21750 + }, + { + "epoch": 2.060215868206779, + "grad_norm": 0.4029650092124939, + "learning_rate": 1.1759136527172885e-05, + "loss": 0.105, + "step": 21760 + }, + { + "epoch": 2.0611626585873886, + "grad_norm": 0.4287700653076172, + "learning_rate": 1.1755349365650445e-05, + "loss": 0.1034, + "step": 21770 + }, + { + "epoch": 2.0621094489679983, + "grad_norm": 0.43112558126449585, + "learning_rate": 1.1751562204128007e-05, + "loss": 0.1001, + "step": 21780 + }, + { + "epoch": 2.063056239348608, + "grad_norm": 0.34501007199287415, + "learning_rate": 1.1747775042605567e-05, + "loss": 0.0823, + "step": 21790 + }, + { + "epoch": 2.064003029729218, + "grad_norm": 0.36909836530685425, + "learning_rate": 1.174398788108313e-05, + "loss": 0.0943, + "step": 21800 + }, + { + "epoch": 2.0649498201098275, + "grad_norm": 0.3652644753456116, + "learning_rate": 1.174020071956069e-05, + "loss": 0.0984, + "step": 21810 + }, + { + "epoch": 2.0658966104904373, + "grad_norm": 0.40247103571891785, + "learning_rate": 1.173641355803825e-05, + "loss": 0.1, + "step": 21820 + }, + { + "epoch": 2.066843400871047, + "grad_norm": 0.437529057264328, + "learning_rate": 1.1732626396515812e-05, + "loss": 0.0996, + "step": 21830 + }, + { + "epoch": 2.0677901912516568, + "grad_norm": 0.4239822030067444, + "learning_rate": 1.1728839234993374e-05, + "loss": 0.1022, + "step": 21840 + }, + { + "epoch": 2.0687369816322665, + "grad_norm": 0.3921540677547455, + "learning_rate": 1.1725052073470936e-05, + "loss": 0.1009, + "step": 21850 + }, + { + "epoch": 2.0696837720128762, + "grad_norm": 0.32198435068130493, + "learning_rate": 1.1721264911948496e-05, + "loss": 0.0923, + "step": 21860 + }, + { + "epoch": 2.070630562393486, + "grad_norm": 0.4083239734172821, + "learning_rate": 1.1717477750426056e-05, + "loss": 0.1084, + "step": 21870 + }, + { + "epoch": 2.0715773527740957, + "grad_norm": 0.47515833377838135, + "learning_rate": 1.1713690588903616e-05, + "loss": 0.0953, + "step": 21880 + }, + { + "epoch": 2.0725241431547055, + "grad_norm": 0.3879965841770172, + "learning_rate": 1.170990342738118e-05, + "loss": 0.0956, + "step": 21890 + }, + { + "epoch": 2.073470933535315, + "grad_norm": 0.34762468934059143, + "learning_rate": 1.170611626585874e-05, + "loss": 0.1116, + "step": 21900 + }, + { + "epoch": 2.074417723915925, + "grad_norm": 0.3374441862106323, + "learning_rate": 1.1702329104336301e-05, + "loss": 0.1126, + "step": 21910 + }, + { + "epoch": 2.0753645142965347, + "grad_norm": 0.3270218074321747, + "learning_rate": 1.1698541942813861e-05, + "loss": 0.0973, + "step": 21920 + }, + { + "epoch": 2.0763113046771444, + "grad_norm": 0.39837753772735596, + "learning_rate": 1.1694754781291421e-05, + "loss": 0.1041, + "step": 21930 + }, + { + "epoch": 2.077258095057754, + "grad_norm": 0.3634657561779022, + "learning_rate": 1.1690967619768985e-05, + "loss": 0.1133, + "step": 21940 + }, + { + "epoch": 2.078204885438364, + "grad_norm": 0.41098886728286743, + "learning_rate": 1.1687180458246545e-05, + "loss": 0.1103, + "step": 21950 + }, + { + "epoch": 2.0791516758189736, + "grad_norm": 0.3962465822696686, + "learning_rate": 1.1683393296724107e-05, + "loss": 0.0996, + "step": 21960 + }, + { + "epoch": 2.0800984661995834, + "grad_norm": 0.34035059809684753, + "learning_rate": 1.1679606135201667e-05, + "loss": 0.0988, + "step": 21970 + }, + { + "epoch": 2.081045256580193, + "grad_norm": 0.5173091888427734, + "learning_rate": 1.167581897367923e-05, + "loss": 0.1018, + "step": 21980 + }, + { + "epoch": 2.081992046960803, + "grad_norm": 0.4123232662677765, + "learning_rate": 1.167203181215679e-05, + "loss": 0.102, + "step": 21990 + }, + { + "epoch": 2.0829388373414126, + "grad_norm": 0.4414355158805847, + "learning_rate": 1.166824465063435e-05, + "loss": 0.1068, + "step": 22000 + }, + { + "epoch": 2.0838856277220223, + "grad_norm": 0.4595598578453064, + "learning_rate": 1.1664457489111912e-05, + "loss": 0.1072, + "step": 22010 + }, + { + "epoch": 2.084832418102632, + "grad_norm": 0.33270683884620667, + "learning_rate": 1.1660670327589472e-05, + "loss": 0.1039, + "step": 22020 + }, + { + "epoch": 2.085779208483242, + "grad_norm": 0.2970442771911621, + "learning_rate": 1.1656883166067035e-05, + "loss": 0.0969, + "step": 22030 + }, + { + "epoch": 2.0867259988638516, + "grad_norm": 0.3258838355541229, + "learning_rate": 1.1653096004544595e-05, + "loss": 0.097, + "step": 22040 + }, + { + "epoch": 2.0876727892444613, + "grad_norm": 0.3754783272743225, + "learning_rate": 1.1649308843022155e-05, + "loss": 0.1031, + "step": 22050 + }, + { + "epoch": 2.088619579625071, + "grad_norm": 0.38050562143325806, + "learning_rate": 1.1645521681499715e-05, + "loss": 0.1027, + "step": 22060 + }, + { + "epoch": 2.089566370005681, + "grad_norm": 0.2980318069458008, + "learning_rate": 1.1641734519977277e-05, + "loss": 0.0978, + "step": 22070 + }, + { + "epoch": 2.0905131603862905, + "grad_norm": 0.4399225115776062, + "learning_rate": 1.1637947358454839e-05, + "loss": 0.1075, + "step": 22080 + }, + { + "epoch": 2.0914599507669003, + "grad_norm": 0.4211890697479248, + "learning_rate": 1.16341601969324e-05, + "loss": 0.1011, + "step": 22090 + }, + { + "epoch": 2.09240674114751, + "grad_norm": 0.4464966058731079, + "learning_rate": 1.163037303540996e-05, + "loss": 0.1032, + "step": 22100 + }, + { + "epoch": 2.0933535315281198, + "grad_norm": 0.40521979331970215, + "learning_rate": 1.162658587388752e-05, + "loss": 0.1041, + "step": 22110 + }, + { + "epoch": 2.0943003219087295, + "grad_norm": 0.33831852674484253, + "learning_rate": 1.1622798712365084e-05, + "loss": 0.102, + "step": 22120 + }, + { + "epoch": 2.0952471122893392, + "grad_norm": 0.3584233522415161, + "learning_rate": 1.1619011550842644e-05, + "loss": 0.1055, + "step": 22130 + }, + { + "epoch": 2.096193902669949, + "grad_norm": 0.3828630745410919, + "learning_rate": 1.1615224389320206e-05, + "loss": 0.115, + "step": 22140 + }, + { + "epoch": 2.0971406930505587, + "grad_norm": 0.4157094359397888, + "learning_rate": 1.1611437227797766e-05, + "loss": 0.1081, + "step": 22150 + }, + { + "epoch": 2.0980874834311685, + "grad_norm": 0.39398008584976196, + "learning_rate": 1.1607650066275326e-05, + "loss": 0.0993, + "step": 22160 + }, + { + "epoch": 2.099034273811778, + "grad_norm": 0.4000839591026306, + "learning_rate": 1.160386290475289e-05, + "loss": 0.0977, + "step": 22170 + }, + { + "epoch": 2.099981064192388, + "grad_norm": 0.39173951745033264, + "learning_rate": 1.160007574323045e-05, + "loss": 0.0993, + "step": 22180 + }, + { + "epoch": 2.1009278545729977, + "grad_norm": 0.43835166096687317, + "learning_rate": 1.1596288581708011e-05, + "loss": 0.1078, + "step": 22190 + }, + { + "epoch": 2.1018746449536074, + "grad_norm": 0.383684903383255, + "learning_rate": 1.1592501420185571e-05, + "loss": 0.0987, + "step": 22200 + }, + { + "epoch": 2.102821435334217, + "grad_norm": 0.3146076798439026, + "learning_rate": 1.1588714258663131e-05, + "loss": 0.0955, + "step": 22210 + }, + { + "epoch": 2.103768225714827, + "grad_norm": 0.4743645191192627, + "learning_rate": 1.1584927097140695e-05, + "loss": 0.1029, + "step": 22220 + }, + { + "epoch": 2.1047150160954367, + "grad_norm": 0.4409228563308716, + "learning_rate": 1.1581139935618255e-05, + "loss": 0.1088, + "step": 22230 + }, + { + "epoch": 2.1056618064760464, + "grad_norm": 0.36643436551094055, + "learning_rate": 1.1577352774095815e-05, + "loss": 0.0978, + "step": 22240 + }, + { + "epoch": 2.106608596856656, + "grad_norm": 0.33801764249801636, + "learning_rate": 1.1573565612573377e-05, + "loss": 0.0965, + "step": 22250 + }, + { + "epoch": 2.107555387237266, + "grad_norm": 0.3886876404285431, + "learning_rate": 1.1569778451050938e-05, + "loss": 0.1043, + "step": 22260 + }, + { + "epoch": 2.1085021776178756, + "grad_norm": 0.36893683671951294, + "learning_rate": 1.15659912895285e-05, + "loss": 0.1088, + "step": 22270 + }, + { + "epoch": 2.1094489679984854, + "grad_norm": 0.4243239760398865, + "learning_rate": 1.156220412800606e-05, + "loss": 0.1071, + "step": 22280 + }, + { + "epoch": 2.1103957583790947, + "grad_norm": 0.38450294733047485, + "learning_rate": 1.155841696648362e-05, + "loss": 0.1044, + "step": 22290 + }, + { + "epoch": 2.1113425487597044, + "grad_norm": 0.4414058327674866, + "learning_rate": 1.1554629804961182e-05, + "loss": 0.0988, + "step": 22300 + }, + { + "epoch": 2.112289339140314, + "grad_norm": 0.5164914131164551, + "learning_rate": 1.1550842643438744e-05, + "loss": 0.1067, + "step": 22310 + }, + { + "epoch": 2.113236129520924, + "grad_norm": 0.5250256657600403, + "learning_rate": 1.1547055481916306e-05, + "loss": 0.0878, + "step": 22320 + }, + { + "epoch": 2.1141829199015336, + "grad_norm": 0.3771630823612213, + "learning_rate": 1.1543268320393866e-05, + "loss": 0.0995, + "step": 22330 + }, + { + "epoch": 2.1151297102821434, + "grad_norm": 0.3611592948436737, + "learning_rate": 1.1539481158871426e-05, + "loss": 0.0974, + "step": 22340 + }, + { + "epoch": 2.116076500662753, + "grad_norm": 0.3579205572605133, + "learning_rate": 1.1535693997348989e-05, + "loss": 0.0936, + "step": 22350 + }, + { + "epoch": 2.117023291043363, + "grad_norm": 0.3400232493877411, + "learning_rate": 1.1531906835826549e-05, + "loss": 0.1052, + "step": 22360 + }, + { + "epoch": 2.1179700814239726, + "grad_norm": 0.404937744140625, + "learning_rate": 1.1528119674304111e-05, + "loss": 0.0953, + "step": 22370 + }, + { + "epoch": 2.1189168718045823, + "grad_norm": 0.37508904933929443, + "learning_rate": 1.1524332512781671e-05, + "loss": 0.0947, + "step": 22380 + }, + { + "epoch": 2.119863662185192, + "grad_norm": 0.3533603847026825, + "learning_rate": 1.1520545351259231e-05, + "loss": 0.0974, + "step": 22390 + }, + { + "epoch": 2.120810452565802, + "grad_norm": 0.5213316082954407, + "learning_rate": 1.1516758189736794e-05, + "loss": 0.1033, + "step": 22400 + }, + { + "epoch": 2.1217572429464115, + "grad_norm": 0.33669185638427734, + "learning_rate": 1.1512971028214354e-05, + "loss": 0.1032, + "step": 22410 + }, + { + "epoch": 2.1227040333270213, + "grad_norm": 0.40858665108680725, + "learning_rate": 1.1509183866691915e-05, + "loss": 0.0935, + "step": 22420 + }, + { + "epoch": 2.123650823707631, + "grad_norm": 0.4377613067626953, + "learning_rate": 1.1505396705169476e-05, + "loss": 0.1039, + "step": 22430 + }, + { + "epoch": 2.1245976140882408, + "grad_norm": 0.4569412171840668, + "learning_rate": 1.1501609543647036e-05, + "loss": 0.1047, + "step": 22440 + }, + { + "epoch": 2.1255444044688505, + "grad_norm": 0.33584511280059814, + "learning_rate": 1.14978223821246e-05, + "loss": 0.1045, + "step": 22450 + }, + { + "epoch": 2.1264911948494603, + "grad_norm": 0.3816305994987488, + "learning_rate": 1.149403522060216e-05, + "loss": 0.0912, + "step": 22460 + }, + { + "epoch": 2.12743798523007, + "grad_norm": 0.39922207593917847, + "learning_rate": 1.149024805907972e-05, + "loss": 0.1083, + "step": 22470 + }, + { + "epoch": 2.1283847756106797, + "grad_norm": 0.3402602970600128, + "learning_rate": 1.1486460897557282e-05, + "loss": 0.1036, + "step": 22480 + }, + { + "epoch": 2.1293315659912895, + "grad_norm": 0.45204728841781616, + "learning_rate": 1.1482673736034843e-05, + "loss": 0.1006, + "step": 22490 + }, + { + "epoch": 2.130278356371899, + "grad_norm": 0.3794861435890198, + "learning_rate": 1.1478886574512405e-05, + "loss": 0.1013, + "step": 22500 + }, + { + "epoch": 2.131225146752509, + "grad_norm": 0.341086745262146, + "learning_rate": 1.1475099412989965e-05, + "loss": 0.1117, + "step": 22510 + }, + { + "epoch": 2.1321719371331187, + "grad_norm": 0.3587258756160736, + "learning_rate": 1.1471312251467525e-05, + "loss": 0.099, + "step": 22520 + }, + { + "epoch": 2.1331187275137284, + "grad_norm": 0.267326295375824, + "learning_rate": 1.1467525089945087e-05, + "loss": 0.0919, + "step": 22530 + }, + { + "epoch": 2.134065517894338, + "grad_norm": 0.3860250413417816, + "learning_rate": 1.1463737928422649e-05, + "loss": 0.1005, + "step": 22540 + }, + { + "epoch": 2.135012308274948, + "grad_norm": 0.4012054204940796, + "learning_rate": 1.145995076690021e-05, + "loss": 0.1085, + "step": 22550 + }, + { + "epoch": 2.1359590986555577, + "grad_norm": 0.40226098895072937, + "learning_rate": 1.145616360537777e-05, + "loss": 0.0968, + "step": 22560 + }, + { + "epoch": 2.1369058890361674, + "grad_norm": 0.3890402615070343, + "learning_rate": 1.145237644385533e-05, + "loss": 0.0997, + "step": 22570 + }, + { + "epoch": 2.137852679416777, + "grad_norm": 0.3543037176132202, + "learning_rate": 1.1448589282332892e-05, + "loss": 0.1072, + "step": 22580 + }, + { + "epoch": 2.138799469797387, + "grad_norm": 0.3767646253108978, + "learning_rate": 1.1444802120810454e-05, + "loss": 0.1018, + "step": 22590 + }, + { + "epoch": 2.1397462601779966, + "grad_norm": 0.5453084707260132, + "learning_rate": 1.1441014959288014e-05, + "loss": 0.1013, + "step": 22600 + }, + { + "epoch": 2.1406930505586064, + "grad_norm": 0.434987336397171, + "learning_rate": 1.1437227797765576e-05, + "loss": 0.0935, + "step": 22610 + }, + { + "epoch": 2.141639840939216, + "grad_norm": 0.3639536201953888, + "learning_rate": 1.1433440636243136e-05, + "loss": 0.1021, + "step": 22620 + }, + { + "epoch": 2.142586631319826, + "grad_norm": 0.4960155487060547, + "learning_rate": 1.14296534747207e-05, + "loss": 0.0851, + "step": 22630 + }, + { + "epoch": 2.1435334217004356, + "grad_norm": 0.47204598784446716, + "learning_rate": 1.142586631319826e-05, + "loss": 0.1004, + "step": 22640 + }, + { + "epoch": 2.1444802120810453, + "grad_norm": 0.3582194745540619, + "learning_rate": 1.142207915167582e-05, + "loss": 0.1055, + "step": 22650 + }, + { + "epoch": 2.145427002461655, + "grad_norm": 0.3919258713722229, + "learning_rate": 1.1418291990153381e-05, + "loss": 0.1082, + "step": 22660 + }, + { + "epoch": 2.146373792842265, + "grad_norm": 0.43927210569381714, + "learning_rate": 1.1414504828630941e-05, + "loss": 0.1002, + "step": 22670 + }, + { + "epoch": 2.1473205832228746, + "grad_norm": 0.371798038482666, + "learning_rate": 1.1410717667108505e-05, + "loss": 0.0947, + "step": 22680 + }, + { + "epoch": 2.1482673736034843, + "grad_norm": 0.3531559705734253, + "learning_rate": 1.1406930505586065e-05, + "loss": 0.0998, + "step": 22690 + }, + { + "epoch": 2.149214163984094, + "grad_norm": 0.36792996525764465, + "learning_rate": 1.1403143344063625e-05, + "loss": 0.0975, + "step": 22700 + }, + { + "epoch": 2.150160954364704, + "grad_norm": 0.48282304406166077, + "learning_rate": 1.1399356182541186e-05, + "loss": 0.1017, + "step": 22710 + }, + { + "epoch": 2.1511077447453135, + "grad_norm": 0.3662376403808594, + "learning_rate": 1.1395569021018746e-05, + "loss": 0.0957, + "step": 22720 + }, + { + "epoch": 2.1520545351259233, + "grad_norm": 0.46066606044769287, + "learning_rate": 1.139178185949631e-05, + "loss": 0.108, + "step": 22730 + }, + { + "epoch": 2.153001325506533, + "grad_norm": 0.3750786781311035, + "learning_rate": 1.138799469797387e-05, + "loss": 0.104, + "step": 22740 + }, + { + "epoch": 2.1539481158871427, + "grad_norm": 0.40401408076286316, + "learning_rate": 1.138420753645143e-05, + "loss": 0.0964, + "step": 22750 + }, + { + "epoch": 2.1548949062677525, + "grad_norm": 0.4238393008708954, + "learning_rate": 1.1380420374928992e-05, + "loss": 0.0976, + "step": 22760 + }, + { + "epoch": 2.1558416966483622, + "grad_norm": 0.43456730246543884, + "learning_rate": 1.1376633213406554e-05, + "loss": 0.1018, + "step": 22770 + }, + { + "epoch": 2.156788487028972, + "grad_norm": 0.36559218168258667, + "learning_rate": 1.1372846051884114e-05, + "loss": 0.1044, + "step": 22780 + }, + { + "epoch": 2.1577352774095817, + "grad_norm": 0.41886982321739197, + "learning_rate": 1.1369058890361675e-05, + "loss": 0.1046, + "step": 22790 + }, + { + "epoch": 2.1586820677901915, + "grad_norm": 0.47367626428604126, + "learning_rate": 1.1365271728839235e-05, + "loss": 0.1025, + "step": 22800 + }, + { + "epoch": 2.159628858170801, + "grad_norm": 0.38137686252593994, + "learning_rate": 1.1361484567316795e-05, + "loss": 0.1115, + "step": 22810 + }, + { + "epoch": 2.1605756485514105, + "grad_norm": 0.44069352746009827, + "learning_rate": 1.1357697405794359e-05, + "loss": 0.1125, + "step": 22820 + }, + { + "epoch": 2.1615224389320202, + "grad_norm": 0.39172229170799255, + "learning_rate": 1.1353910244271919e-05, + "loss": 0.099, + "step": 22830 + }, + { + "epoch": 2.16246922931263, + "grad_norm": 0.3726031184196472, + "learning_rate": 1.135012308274948e-05, + "loss": 0.1056, + "step": 22840 + }, + { + "epoch": 2.1634160196932397, + "grad_norm": 0.3669355511665344, + "learning_rate": 1.134633592122704e-05, + "loss": 0.0933, + "step": 22850 + }, + { + "epoch": 2.1643628100738495, + "grad_norm": 0.34460747241973877, + "learning_rate": 1.13425487597046e-05, + "loss": 0.1067, + "step": 22860 + }, + { + "epoch": 2.165309600454459, + "grad_norm": 0.35513612627983093, + "learning_rate": 1.1338761598182164e-05, + "loss": 0.1112, + "step": 22870 + }, + { + "epoch": 2.166256390835069, + "grad_norm": 0.33671122789382935, + "learning_rate": 1.1334974436659724e-05, + "loss": 0.1029, + "step": 22880 + }, + { + "epoch": 2.1672031812156787, + "grad_norm": 0.35488536953926086, + "learning_rate": 1.1331187275137286e-05, + "loss": 0.1062, + "step": 22890 + }, + { + "epoch": 2.1681499715962884, + "grad_norm": 0.49665024876594543, + "learning_rate": 1.1327400113614846e-05, + "loss": 0.1067, + "step": 22900 + }, + { + "epoch": 2.169096761976898, + "grad_norm": 0.326875776052475, + "learning_rate": 1.132361295209241e-05, + "loss": 0.1, + "step": 22910 + }, + { + "epoch": 2.170043552357508, + "grad_norm": 0.4280208945274353, + "learning_rate": 1.131982579056997e-05, + "loss": 0.0971, + "step": 22920 + }, + { + "epoch": 2.1709903427381176, + "grad_norm": 0.3841555714607239, + "learning_rate": 1.131603862904753e-05, + "loss": 0.1064, + "step": 22930 + }, + { + "epoch": 2.1719371331187274, + "grad_norm": 0.5070419311523438, + "learning_rate": 1.1312251467525091e-05, + "loss": 0.101, + "step": 22940 + }, + { + "epoch": 2.172883923499337, + "grad_norm": 0.3213464915752411, + "learning_rate": 1.1308464306002651e-05, + "loss": 0.1038, + "step": 22950 + }, + { + "epoch": 2.173830713879947, + "grad_norm": 0.33351173996925354, + "learning_rate": 1.1304677144480213e-05, + "loss": 0.1018, + "step": 22960 + }, + { + "epoch": 2.1747775042605566, + "grad_norm": 0.4228649139404297, + "learning_rate": 1.1300889982957775e-05, + "loss": 0.0955, + "step": 22970 + }, + { + "epoch": 2.1757242946411663, + "grad_norm": 0.3855364918708801, + "learning_rate": 1.1297102821435335e-05, + "loss": 0.104, + "step": 22980 + }, + { + "epoch": 2.176671085021776, + "grad_norm": 0.340480238199234, + "learning_rate": 1.1293315659912895e-05, + "loss": 0.0951, + "step": 22990 + }, + { + "epoch": 2.177617875402386, + "grad_norm": 0.38249287009239197, + "learning_rate": 1.1289528498390457e-05, + "loss": 0.1, + "step": 23000 + }, + { + "epoch": 2.1785646657829956, + "grad_norm": 0.40845736861228943, + "learning_rate": 1.1285741336868018e-05, + "loss": 0.1086, + "step": 23010 + }, + { + "epoch": 2.1795114561636053, + "grad_norm": 0.34934353828430176, + "learning_rate": 1.128195417534558e-05, + "loss": 0.0902, + "step": 23020 + }, + { + "epoch": 2.180458246544215, + "grad_norm": 0.3424789607524872, + "learning_rate": 1.127816701382314e-05, + "loss": 0.101, + "step": 23030 + }, + { + "epoch": 2.181405036924825, + "grad_norm": 0.4287693500518799, + "learning_rate": 1.12743798523007e-05, + "loss": 0.0964, + "step": 23040 + }, + { + "epoch": 2.1823518273054345, + "grad_norm": 0.40323200821876526, + "learning_rate": 1.1270592690778264e-05, + "loss": 0.105, + "step": 23050 + }, + { + "epoch": 2.1832986176860443, + "grad_norm": 0.4015294015407562, + "learning_rate": 1.1266805529255824e-05, + "loss": 0.1071, + "step": 23060 + }, + { + "epoch": 2.184245408066654, + "grad_norm": 0.33209195733070374, + "learning_rate": 1.1263018367733385e-05, + "loss": 0.1095, + "step": 23070 + }, + { + "epoch": 2.1851921984472638, + "grad_norm": 0.32889801263809204, + "learning_rate": 1.1259231206210946e-05, + "loss": 0.1038, + "step": 23080 + }, + { + "epoch": 2.1861389888278735, + "grad_norm": 0.4167686104774475, + "learning_rate": 1.1255444044688506e-05, + "loss": 0.1093, + "step": 23090 + }, + { + "epoch": 2.1870857792084832, + "grad_norm": 0.3739524483680725, + "learning_rate": 1.1251656883166069e-05, + "loss": 0.0927, + "step": 23100 + }, + { + "epoch": 2.188032569589093, + "grad_norm": 0.31450697779655457, + "learning_rate": 1.1247869721643629e-05, + "loss": 0.0995, + "step": 23110 + }, + { + "epoch": 2.1889793599697027, + "grad_norm": 0.4176722466945648, + "learning_rate": 1.124408256012119e-05, + "loss": 0.1056, + "step": 23120 + }, + { + "epoch": 2.1899261503503125, + "grad_norm": 0.4337596595287323, + "learning_rate": 1.1240295398598751e-05, + "loss": 0.0984, + "step": 23130 + }, + { + "epoch": 2.190872940730922, + "grad_norm": 0.42897704243659973, + "learning_rate": 1.1236508237076311e-05, + "loss": 0.112, + "step": 23140 + }, + { + "epoch": 2.191819731111532, + "grad_norm": 0.3697352111339569, + "learning_rate": 1.1232721075553874e-05, + "loss": 0.1104, + "step": 23150 + }, + { + "epoch": 2.1927665214921417, + "grad_norm": 0.38484689593315125, + "learning_rate": 1.1228933914031434e-05, + "loss": 0.1043, + "step": 23160 + }, + { + "epoch": 2.1937133118727514, + "grad_norm": 0.4187999367713928, + "learning_rate": 1.1225146752508994e-05, + "loss": 0.0929, + "step": 23170 + }, + { + "epoch": 2.194660102253361, + "grad_norm": 0.38299185037612915, + "learning_rate": 1.1221359590986556e-05, + "loss": 0.0971, + "step": 23180 + }, + { + "epoch": 2.195606892633971, + "grad_norm": 0.3589703142642975, + "learning_rate": 1.1217572429464118e-05, + "loss": 0.1065, + "step": 23190 + }, + { + "epoch": 2.1965536830145806, + "grad_norm": 0.37426868081092834, + "learning_rate": 1.121378526794168e-05, + "loss": 0.1031, + "step": 23200 + }, + { + "epoch": 2.1975004733951904, + "grad_norm": 0.40484800934791565, + "learning_rate": 1.120999810641924e-05, + "loss": 0.0995, + "step": 23210 + }, + { + "epoch": 2.1984472637758, + "grad_norm": 0.355503112077713, + "learning_rate": 1.12062109448968e-05, + "loss": 0.0865, + "step": 23220 + }, + { + "epoch": 2.19939405415641, + "grad_norm": 0.32087182998657227, + "learning_rate": 1.1202423783374362e-05, + "loss": 0.0965, + "step": 23230 + }, + { + "epoch": 2.2003408445370196, + "grad_norm": 0.4689808189868927, + "learning_rate": 1.1198636621851923e-05, + "loss": 0.1116, + "step": 23240 + }, + { + "epoch": 2.2012876349176294, + "grad_norm": 0.4498715400695801, + "learning_rate": 1.1194849460329485e-05, + "loss": 0.0951, + "step": 23250 + }, + { + "epoch": 2.202234425298239, + "grad_norm": 0.41492167115211487, + "learning_rate": 1.1191062298807045e-05, + "loss": 0.1072, + "step": 23260 + }, + { + "epoch": 2.203181215678849, + "grad_norm": 0.3707394599914551, + "learning_rate": 1.1187275137284605e-05, + "loss": 0.1012, + "step": 23270 + }, + { + "epoch": 2.2041280060594586, + "grad_norm": 0.29027289152145386, + "learning_rate": 1.1183487975762167e-05, + "loss": 0.1106, + "step": 23280 + }, + { + "epoch": 2.2050747964400683, + "grad_norm": 0.4856210947036743, + "learning_rate": 1.1179700814239729e-05, + "loss": 0.1114, + "step": 23290 + }, + { + "epoch": 2.206021586820678, + "grad_norm": 0.3140472173690796, + "learning_rate": 1.117591365271729e-05, + "loss": 0.1064, + "step": 23300 + }, + { + "epoch": 2.206968377201288, + "grad_norm": 0.33930498361587524, + "learning_rate": 1.117212649119485e-05, + "loss": 0.0924, + "step": 23310 + }, + { + "epoch": 2.2079151675818975, + "grad_norm": 0.37622299790382385, + "learning_rate": 1.116833932967241e-05, + "loss": 0.1065, + "step": 23320 + }, + { + "epoch": 2.2088619579625073, + "grad_norm": 0.3196406066417694, + "learning_rate": 1.1164552168149974e-05, + "loss": 0.1021, + "step": 23330 + }, + { + "epoch": 2.209808748343117, + "grad_norm": 0.5100225210189819, + "learning_rate": 1.1160765006627534e-05, + "loss": 0.0962, + "step": 23340 + }, + { + "epoch": 2.2107555387237268, + "grad_norm": 0.3671920895576477, + "learning_rate": 1.1156977845105094e-05, + "loss": 0.1023, + "step": 23350 + }, + { + "epoch": 2.2117023291043365, + "grad_norm": 0.45640867948532104, + "learning_rate": 1.1153190683582656e-05, + "loss": 0.1033, + "step": 23360 + }, + { + "epoch": 2.2126491194849462, + "grad_norm": 0.36683550477027893, + "learning_rate": 1.1149403522060216e-05, + "loss": 0.1012, + "step": 23370 + }, + { + "epoch": 2.213595909865556, + "grad_norm": 0.30357277393341064, + "learning_rate": 1.114561636053778e-05, + "loss": 0.1083, + "step": 23380 + }, + { + "epoch": 2.2145427002461657, + "grad_norm": 0.4162294268608093, + "learning_rate": 1.114182919901534e-05, + "loss": 0.0986, + "step": 23390 + }, + { + "epoch": 2.215489490626775, + "grad_norm": 0.45273682475090027, + "learning_rate": 1.11380420374929e-05, + "loss": 0.1074, + "step": 23400 + }, + { + "epoch": 2.2164362810073848, + "grad_norm": 0.36101433634757996, + "learning_rate": 1.1134254875970461e-05, + "loss": 0.1018, + "step": 23410 + }, + { + "epoch": 2.2173830713879945, + "grad_norm": 0.3455662727355957, + "learning_rate": 1.1130467714448021e-05, + "loss": 0.1075, + "step": 23420 + }, + { + "epoch": 2.2183298617686042, + "grad_norm": 0.34743842482566833, + "learning_rate": 1.1126680552925585e-05, + "loss": 0.0967, + "step": 23430 + }, + { + "epoch": 2.219276652149214, + "grad_norm": 0.560983419418335, + "learning_rate": 1.1122893391403145e-05, + "loss": 0.1049, + "step": 23440 + }, + { + "epoch": 2.2202234425298237, + "grad_norm": 0.4330439865589142, + "learning_rate": 1.1119106229880705e-05, + "loss": 0.1065, + "step": 23450 + }, + { + "epoch": 2.2211702329104335, + "grad_norm": 0.32083675265312195, + "learning_rate": 1.1115319068358266e-05, + "loss": 0.0957, + "step": 23460 + }, + { + "epoch": 2.222117023291043, + "grad_norm": 0.4112713634967804, + "learning_rate": 1.1111531906835828e-05, + "loss": 0.1034, + "step": 23470 + }, + { + "epoch": 2.223063813671653, + "grad_norm": 0.3654922842979431, + "learning_rate": 1.110774474531339e-05, + "loss": 0.095, + "step": 23480 + }, + { + "epoch": 2.2240106040522627, + "grad_norm": 0.3552573621273041, + "learning_rate": 1.110395758379095e-05, + "loss": 0.1026, + "step": 23490 + }, + { + "epoch": 2.2249573944328724, + "grad_norm": 0.34025436639785767, + "learning_rate": 1.110017042226851e-05, + "loss": 0.0899, + "step": 23500 + }, + { + "epoch": 2.225904184813482, + "grad_norm": 0.4973716139793396, + "learning_rate": 1.109638326074607e-05, + "loss": 0.1019, + "step": 23510 + }, + { + "epoch": 2.226850975194092, + "grad_norm": 0.4082499146461487, + "learning_rate": 1.1092596099223633e-05, + "loss": 0.1037, + "step": 23520 + }, + { + "epoch": 2.2277977655747017, + "grad_norm": 0.4043087959289551, + "learning_rate": 1.1088808937701194e-05, + "loss": 0.1039, + "step": 23530 + }, + { + "epoch": 2.2287445559553114, + "grad_norm": 0.38824453949928284, + "learning_rate": 1.1085021776178755e-05, + "loss": 0.1082, + "step": 23540 + }, + { + "epoch": 2.229691346335921, + "grad_norm": 0.39685913920402527, + "learning_rate": 1.1081234614656315e-05, + "loss": 0.1018, + "step": 23550 + }, + { + "epoch": 2.230638136716531, + "grad_norm": 0.3705897927284241, + "learning_rate": 1.1077447453133875e-05, + "loss": 0.0942, + "step": 23560 + }, + { + "epoch": 2.2315849270971406, + "grad_norm": 0.34453701972961426, + "learning_rate": 1.1073660291611439e-05, + "loss": 0.0989, + "step": 23570 + }, + { + "epoch": 2.2325317174777504, + "grad_norm": 0.3718234896659851, + "learning_rate": 1.1069873130088999e-05, + "loss": 0.1031, + "step": 23580 + }, + { + "epoch": 2.23347850785836, + "grad_norm": 0.41039136052131653, + "learning_rate": 1.106608596856656e-05, + "loss": 0.1032, + "step": 23590 + }, + { + "epoch": 2.23442529823897, + "grad_norm": 0.3343423008918762, + "learning_rate": 1.106229880704412e-05, + "loss": 0.0923, + "step": 23600 + }, + { + "epoch": 2.2353720886195796, + "grad_norm": 0.47895216941833496, + "learning_rate": 1.1058511645521684e-05, + "loss": 0.0983, + "step": 23610 + }, + { + "epoch": 2.2363188790001893, + "grad_norm": 0.4448593258857727, + "learning_rate": 1.1054724483999244e-05, + "loss": 0.1053, + "step": 23620 + }, + { + "epoch": 2.237265669380799, + "grad_norm": 0.3424699008464813, + "learning_rate": 1.1050937322476804e-05, + "loss": 0.1031, + "step": 23630 + }, + { + "epoch": 2.238212459761409, + "grad_norm": 0.39542269706726074, + "learning_rate": 1.1047150160954366e-05, + "loss": 0.1019, + "step": 23640 + }, + { + "epoch": 2.2391592501420186, + "grad_norm": 0.46471700072288513, + "learning_rate": 1.1043362999431926e-05, + "loss": 0.1024, + "step": 23650 + }, + { + "epoch": 2.2401060405226283, + "grad_norm": 0.3723284900188446, + "learning_rate": 1.103957583790949e-05, + "loss": 0.1063, + "step": 23660 + }, + { + "epoch": 2.241052830903238, + "grad_norm": 0.43616223335266113, + "learning_rate": 1.103578867638705e-05, + "loss": 0.0981, + "step": 23670 + }, + { + "epoch": 2.2419996212838478, + "grad_norm": 0.48921120166778564, + "learning_rate": 1.103200151486461e-05, + "loss": 0.0993, + "step": 23680 + }, + { + "epoch": 2.2429464116644575, + "grad_norm": 0.38076233863830566, + "learning_rate": 1.102821435334217e-05, + "loss": 0.0946, + "step": 23690 + }, + { + "epoch": 2.2438932020450673, + "grad_norm": 0.4395969808101654, + "learning_rate": 1.1024427191819731e-05, + "loss": 0.1043, + "step": 23700 + }, + { + "epoch": 2.244839992425677, + "grad_norm": 0.4211001396179199, + "learning_rate": 1.1020640030297293e-05, + "loss": 0.1045, + "step": 23710 + }, + { + "epoch": 2.2457867828062867, + "grad_norm": 0.47736597061157227, + "learning_rate": 1.1016852868774855e-05, + "loss": 0.0977, + "step": 23720 + }, + { + "epoch": 2.2467335731868965, + "grad_norm": 0.37812915444374084, + "learning_rate": 1.1013065707252415e-05, + "loss": 0.1043, + "step": 23730 + }, + { + "epoch": 2.2476803635675062, + "grad_norm": 0.3703843057155609, + "learning_rate": 1.1009278545729975e-05, + "loss": 0.0907, + "step": 23740 + }, + { + "epoch": 2.248627153948116, + "grad_norm": 0.5733705759048462, + "learning_rate": 1.1005491384207538e-05, + "loss": 0.1081, + "step": 23750 + }, + { + "epoch": 2.2495739443287257, + "grad_norm": 0.6033251881599426, + "learning_rate": 1.1001704222685098e-05, + "loss": 0.0984, + "step": 23760 + }, + { + "epoch": 2.2505207347093354, + "grad_norm": 0.39114996790885925, + "learning_rate": 1.099791706116266e-05, + "loss": 0.0993, + "step": 23770 + }, + { + "epoch": 2.251467525089945, + "grad_norm": 0.19909757375717163, + "learning_rate": 1.099412989964022e-05, + "loss": 0.0931, + "step": 23780 + }, + { + "epoch": 2.252414315470555, + "grad_norm": 0.36311134696006775, + "learning_rate": 1.099034273811778e-05, + "loss": 0.0998, + "step": 23790 + }, + { + "epoch": 2.2533611058511647, + "grad_norm": 0.392431378364563, + "learning_rate": 1.0986555576595344e-05, + "loss": 0.111, + "step": 23800 + }, + { + "epoch": 2.2543078962317744, + "grad_norm": 0.46047669649124146, + "learning_rate": 1.0982768415072904e-05, + "loss": 0.106, + "step": 23810 + }, + { + "epoch": 2.255254686612384, + "grad_norm": 0.42195338010787964, + "learning_rate": 1.0978981253550465e-05, + "loss": 0.1035, + "step": 23820 + }, + { + "epoch": 2.256201476992994, + "grad_norm": 0.3718126118183136, + "learning_rate": 1.0975194092028025e-05, + "loss": 0.1002, + "step": 23830 + }, + { + "epoch": 2.2571482673736036, + "grad_norm": 0.42515698075294495, + "learning_rate": 1.0971406930505586e-05, + "loss": 0.0956, + "step": 23840 + }, + { + "epoch": 2.2580950577542134, + "grad_norm": 0.3577660620212555, + "learning_rate": 1.0967619768983149e-05, + "loss": 0.0911, + "step": 23850 + }, + { + "epoch": 2.259041848134823, + "grad_norm": 0.3810538649559021, + "learning_rate": 1.0963832607460709e-05, + "loss": 0.0925, + "step": 23860 + }, + { + "epoch": 2.259988638515433, + "grad_norm": 0.3717814087867737, + "learning_rate": 1.0960045445938269e-05, + "loss": 0.1065, + "step": 23870 + }, + { + "epoch": 2.2609354288960426, + "grad_norm": 0.3299460709095001, + "learning_rate": 1.095625828441583e-05, + "loss": 0.101, + "step": 23880 + }, + { + "epoch": 2.2618822192766523, + "grad_norm": 0.44920241832733154, + "learning_rate": 1.0952471122893393e-05, + "loss": 0.0995, + "step": 23890 + }, + { + "epoch": 2.2628290096572616, + "grad_norm": 0.475917786359787, + "learning_rate": 1.0948683961370954e-05, + "loss": 0.1048, + "step": 23900 + }, + { + "epoch": 2.2637758000378714, + "grad_norm": 0.34341323375701904, + "learning_rate": 1.0944896799848514e-05, + "loss": 0.0944, + "step": 23910 + }, + { + "epoch": 2.264722590418481, + "grad_norm": 0.4694386124610901, + "learning_rate": 1.0941109638326074e-05, + "loss": 0.1032, + "step": 23920 + }, + { + "epoch": 2.265669380799091, + "grad_norm": 0.3619329631328583, + "learning_rate": 1.0937322476803636e-05, + "loss": 0.1046, + "step": 23930 + }, + { + "epoch": 2.2666161711797006, + "grad_norm": 0.4530966877937317, + "learning_rate": 1.0933535315281198e-05, + "loss": 0.1078, + "step": 23940 + }, + { + "epoch": 2.2675629615603103, + "grad_norm": 0.44397875666618347, + "learning_rate": 1.092974815375876e-05, + "loss": 0.1055, + "step": 23950 + }, + { + "epoch": 2.26850975194092, + "grad_norm": 0.3822941482067108, + "learning_rate": 1.092596099223632e-05, + "loss": 0.0984, + "step": 23960 + }, + { + "epoch": 2.26945654232153, + "grad_norm": 0.41480833292007446, + "learning_rate": 1.092217383071388e-05, + "loss": 0.0986, + "step": 23970 + }, + { + "epoch": 2.2704033327021396, + "grad_norm": 0.44778960943222046, + "learning_rate": 1.0918386669191441e-05, + "loss": 0.0938, + "step": 23980 + }, + { + "epoch": 2.2713501230827493, + "grad_norm": 0.36493951082229614, + "learning_rate": 1.0914599507669003e-05, + "loss": 0.114, + "step": 23990 + }, + { + "epoch": 2.272296913463359, + "grad_norm": 0.3928702175617218, + "learning_rate": 1.0910812346146565e-05, + "loss": 0.099, + "step": 24000 + }, + { + "epoch": 2.273243703843969, + "grad_norm": 0.3912307024002075, + "learning_rate": 1.0907025184624125e-05, + "loss": 0.1041, + "step": 24010 + }, + { + "epoch": 2.2741904942245785, + "grad_norm": 0.4148852229118347, + "learning_rate": 1.0903238023101685e-05, + "loss": 0.1105, + "step": 24020 + }, + { + "epoch": 2.2751372846051883, + "grad_norm": 0.35756590962409973, + "learning_rate": 1.0899450861579249e-05, + "loss": 0.1032, + "step": 24030 + }, + { + "epoch": 2.276084074985798, + "grad_norm": 0.3831791281700134, + "learning_rate": 1.0895663700056809e-05, + "loss": 0.1013, + "step": 24040 + }, + { + "epoch": 2.2770308653664078, + "grad_norm": 0.40133121609687805, + "learning_rate": 1.0891876538534369e-05, + "loss": 0.1076, + "step": 24050 + }, + { + "epoch": 2.2779776557470175, + "grad_norm": 0.38974645733833313, + "learning_rate": 1.088808937701193e-05, + "loss": 0.102, + "step": 24060 + }, + { + "epoch": 2.2789244461276272, + "grad_norm": 0.3199298679828644, + "learning_rate": 1.088430221548949e-05, + "loss": 0.1021, + "step": 24070 + }, + { + "epoch": 2.279871236508237, + "grad_norm": 0.3961081802845001, + "learning_rate": 1.0880515053967054e-05, + "loss": 0.1049, + "step": 24080 + }, + { + "epoch": 2.2808180268888467, + "grad_norm": 0.41309094429016113, + "learning_rate": 1.0876727892444614e-05, + "loss": 0.102, + "step": 24090 + }, + { + "epoch": 2.2817648172694565, + "grad_norm": 0.465978741645813, + "learning_rate": 1.0872940730922174e-05, + "loss": 0.1078, + "step": 24100 + }, + { + "epoch": 2.282711607650066, + "grad_norm": 0.4430433511734009, + "learning_rate": 1.0869153569399736e-05, + "loss": 0.1085, + "step": 24110 + }, + { + "epoch": 2.283658398030676, + "grad_norm": 0.30565348267555237, + "learning_rate": 1.0865366407877296e-05, + "loss": 0.0987, + "step": 24120 + }, + { + "epoch": 2.2846051884112857, + "grad_norm": 0.39248156547546387, + "learning_rate": 1.086157924635486e-05, + "loss": 0.1028, + "step": 24130 + }, + { + "epoch": 2.2855519787918954, + "grad_norm": 0.2839440405368805, + "learning_rate": 1.085779208483242e-05, + "loss": 0.1027, + "step": 24140 + }, + { + "epoch": 2.286498769172505, + "grad_norm": 0.4470391571521759, + "learning_rate": 1.085400492330998e-05, + "loss": 0.1004, + "step": 24150 + }, + { + "epoch": 2.287445559553115, + "grad_norm": 0.388384610414505, + "learning_rate": 1.0850217761787541e-05, + "loss": 0.1032, + "step": 24160 + }, + { + "epoch": 2.2883923499337246, + "grad_norm": 0.4492109417915344, + "learning_rate": 1.0846430600265103e-05, + "loss": 0.0996, + "step": 24170 + }, + { + "epoch": 2.2893391403143344, + "grad_norm": 0.5113523602485657, + "learning_rate": 1.0842643438742664e-05, + "loss": 0.1005, + "step": 24180 + }, + { + "epoch": 2.290285930694944, + "grad_norm": 0.385354220867157, + "learning_rate": 1.0838856277220225e-05, + "loss": 0.0923, + "step": 24190 + }, + { + "epoch": 2.291232721075554, + "grad_norm": 0.32123634219169617, + "learning_rate": 1.0835069115697785e-05, + "loss": 0.0998, + "step": 24200 + }, + { + "epoch": 2.2921795114561636, + "grad_norm": 0.343923956155777, + "learning_rate": 1.0831281954175346e-05, + "loss": 0.1069, + "step": 24210 + }, + { + "epoch": 2.2931263018367734, + "grad_norm": 0.4596140682697296, + "learning_rate": 1.0827494792652908e-05, + "loss": 0.1073, + "step": 24220 + }, + { + "epoch": 2.294073092217383, + "grad_norm": 0.41971439123153687, + "learning_rate": 1.0823707631130468e-05, + "loss": 0.1001, + "step": 24230 + }, + { + "epoch": 2.295019882597993, + "grad_norm": 0.42049020528793335, + "learning_rate": 1.081992046960803e-05, + "loss": 0.1014, + "step": 24240 + }, + { + "epoch": 2.2959666729786026, + "grad_norm": 0.3357348144054413, + "learning_rate": 1.081613330808559e-05, + "loss": 0.1056, + "step": 24250 + }, + { + "epoch": 2.2969134633592123, + "grad_norm": 0.4161593019962311, + "learning_rate": 1.081234614656315e-05, + "loss": 0.0925, + "step": 24260 + }, + { + "epoch": 2.297860253739822, + "grad_norm": 0.4063127934932709, + "learning_rate": 1.0808558985040713e-05, + "loss": 0.1007, + "step": 24270 + }, + { + "epoch": 2.298807044120432, + "grad_norm": 0.4100823402404785, + "learning_rate": 1.0804771823518273e-05, + "loss": 0.0959, + "step": 24280 + }, + { + "epoch": 2.2997538345010415, + "grad_norm": 0.37574973702430725, + "learning_rate": 1.0800984661995835e-05, + "loss": 0.0992, + "step": 24290 + }, + { + "epoch": 2.3007006248816513, + "grad_norm": 0.3953949213027954, + "learning_rate": 1.0797197500473395e-05, + "loss": 0.0973, + "step": 24300 + }, + { + "epoch": 2.301647415262261, + "grad_norm": 0.41357365250587463, + "learning_rate": 1.0793410338950959e-05, + "loss": 0.0964, + "step": 24310 + }, + { + "epoch": 2.3025942056428708, + "grad_norm": 0.5206488966941833, + "learning_rate": 1.0789623177428519e-05, + "loss": 0.0944, + "step": 24320 + }, + { + "epoch": 2.3035409960234805, + "grad_norm": 0.4037434458732605, + "learning_rate": 1.0785836015906079e-05, + "loss": 0.1056, + "step": 24330 + }, + { + "epoch": 2.3044877864040902, + "grad_norm": 0.3638472557067871, + "learning_rate": 1.078204885438364e-05, + "loss": 0.105, + "step": 24340 + }, + { + "epoch": 2.3054345767847, + "grad_norm": 0.31433483958244324, + "learning_rate": 1.07782616928612e-05, + "loss": 0.0918, + "step": 24350 + }, + { + "epoch": 2.3063813671653097, + "grad_norm": 0.4167768359184265, + "learning_rate": 1.0774474531338764e-05, + "loss": 0.1058, + "step": 24360 + }, + { + "epoch": 2.3073281575459195, + "grad_norm": 0.3920457661151886, + "learning_rate": 1.0770687369816324e-05, + "loss": 0.1039, + "step": 24370 + }, + { + "epoch": 2.308274947926529, + "grad_norm": 0.40075308084487915, + "learning_rate": 1.0766900208293884e-05, + "loss": 0.1016, + "step": 24380 + }, + { + "epoch": 2.309221738307139, + "grad_norm": 0.33713826537132263, + "learning_rate": 1.0763113046771446e-05, + "loss": 0.1019, + "step": 24390 + }, + { + "epoch": 2.3101685286877487, + "grad_norm": 0.3890605568885803, + "learning_rate": 1.0759325885249006e-05, + "loss": 0.1052, + "step": 24400 + }, + { + "epoch": 2.3111153190683584, + "grad_norm": 0.39935365319252014, + "learning_rate": 1.0755538723726568e-05, + "loss": 0.1035, + "step": 24410 + }, + { + "epoch": 2.312062109448968, + "grad_norm": 0.45820632576942444, + "learning_rate": 1.075175156220413e-05, + "loss": 0.1042, + "step": 24420 + }, + { + "epoch": 2.313008899829578, + "grad_norm": 0.39744850993156433, + "learning_rate": 1.074796440068169e-05, + "loss": 0.107, + "step": 24430 + }, + { + "epoch": 2.3139556902101877, + "grad_norm": 0.3054029643535614, + "learning_rate": 1.074417723915925e-05, + "loss": 0.0961, + "step": 24440 + }, + { + "epoch": 2.3149024805907974, + "grad_norm": 0.39057713747024536, + "learning_rate": 1.0740390077636813e-05, + "loss": 0.0982, + "step": 24450 + }, + { + "epoch": 2.315849270971407, + "grad_norm": 0.3561437726020813, + "learning_rate": 1.0736602916114373e-05, + "loss": 0.1011, + "step": 24460 + }, + { + "epoch": 2.316796061352017, + "grad_norm": 0.5833640694618225, + "learning_rate": 1.0732815754591935e-05, + "loss": 0.0979, + "step": 24470 + }, + { + "epoch": 2.3177428517326266, + "grad_norm": 0.37024566531181335, + "learning_rate": 1.0729028593069495e-05, + "loss": 0.097, + "step": 24480 + }, + { + "epoch": 2.3186896421132364, + "grad_norm": 0.4453885853290558, + "learning_rate": 1.0725241431547055e-05, + "loss": 0.0997, + "step": 24490 + }, + { + "epoch": 2.319636432493846, + "grad_norm": 0.3976321220397949, + "learning_rate": 1.0721454270024618e-05, + "loss": 0.1111, + "step": 24500 + }, + { + "epoch": 2.320583222874456, + "grad_norm": 0.4668462872505188, + "learning_rate": 1.0717667108502178e-05, + "loss": 0.1026, + "step": 24510 + }, + { + "epoch": 2.321530013255065, + "grad_norm": 0.39205145835876465, + "learning_rate": 1.071387994697974e-05, + "loss": 0.0995, + "step": 24520 + }, + { + "epoch": 2.322476803635675, + "grad_norm": 0.5295231342315674, + "learning_rate": 1.07100927854573e-05, + "loss": 0.0996, + "step": 24530 + }, + { + "epoch": 2.3234235940162846, + "grad_norm": 0.41702479124069214, + "learning_rate": 1.070630562393486e-05, + "loss": 0.0925, + "step": 24540 + }, + { + "epoch": 2.3243703843968944, + "grad_norm": 0.41526830196380615, + "learning_rate": 1.0702518462412424e-05, + "loss": 0.1025, + "step": 24550 + }, + { + "epoch": 2.325317174777504, + "grad_norm": 0.5049123167991638, + "learning_rate": 1.0698731300889984e-05, + "loss": 0.1003, + "step": 24560 + }, + { + "epoch": 2.326263965158114, + "grad_norm": 0.35912126302719116, + "learning_rate": 1.0694944139367545e-05, + "loss": 0.0987, + "step": 24570 + }, + { + "epoch": 2.3272107555387236, + "grad_norm": 0.4618116319179535, + "learning_rate": 1.0691156977845105e-05, + "loss": 0.0898, + "step": 24580 + }, + { + "epoch": 2.3281575459193333, + "grad_norm": 0.6204038262367249, + "learning_rate": 1.0687369816322667e-05, + "loss": 0.1065, + "step": 24590 + }, + { + "epoch": 2.329104336299943, + "grad_norm": 0.43588438630104065, + "learning_rate": 1.0683582654800229e-05, + "loss": 0.0994, + "step": 24600 + }, + { + "epoch": 2.330051126680553, + "grad_norm": 0.4884413480758667, + "learning_rate": 1.0679795493277789e-05, + "loss": 0.1034, + "step": 24610 + }, + { + "epoch": 2.3309979170611625, + "grad_norm": 0.4790765643119812, + "learning_rate": 1.0676008331755349e-05, + "loss": 0.103, + "step": 24620 + }, + { + "epoch": 2.3319447074417723, + "grad_norm": 0.4357077181339264, + "learning_rate": 1.067222117023291e-05, + "loss": 0.1067, + "step": 24630 + }, + { + "epoch": 2.332891497822382, + "grad_norm": 0.4674258530139923, + "learning_rate": 1.0668434008710473e-05, + "loss": 0.1077, + "step": 24640 + }, + { + "epoch": 2.3338382882029918, + "grad_norm": 0.3695948123931885, + "learning_rate": 1.0664646847188034e-05, + "loss": 0.1057, + "step": 24650 + }, + { + "epoch": 2.3347850785836015, + "grad_norm": 0.4131108522415161, + "learning_rate": 1.0660859685665594e-05, + "loss": 0.0913, + "step": 24660 + }, + { + "epoch": 2.3357318689642113, + "grad_norm": 0.42493703961372375, + "learning_rate": 1.0657072524143154e-05, + "loss": 0.1159, + "step": 24670 + }, + { + "epoch": 2.336678659344821, + "grad_norm": 0.33579689264297485, + "learning_rate": 1.0653285362620716e-05, + "loss": 0.1058, + "step": 24680 + }, + { + "epoch": 2.3376254497254307, + "grad_norm": 0.32977592945098877, + "learning_rate": 1.0649498201098278e-05, + "loss": 0.1109, + "step": 24690 + }, + { + "epoch": 2.3385722401060405, + "grad_norm": 0.5081207752227783, + "learning_rate": 1.064571103957584e-05, + "loss": 0.1073, + "step": 24700 + }, + { + "epoch": 2.33951903048665, + "grad_norm": 0.4069708585739136, + "learning_rate": 1.06419238780534e-05, + "loss": 0.0989, + "step": 24710 + }, + { + "epoch": 2.34046582086726, + "grad_norm": 0.522319495677948, + "learning_rate": 1.063813671653096e-05, + "loss": 0.1065, + "step": 24720 + }, + { + "epoch": 2.3414126112478697, + "grad_norm": 0.4592416286468506, + "learning_rate": 1.0634349555008523e-05, + "loss": 0.099, + "step": 24730 + }, + { + "epoch": 2.3423594016284794, + "grad_norm": 0.4711533486843109, + "learning_rate": 1.0630562393486083e-05, + "loss": 0.1073, + "step": 24740 + }, + { + "epoch": 2.343306192009089, + "grad_norm": 0.47300341725349426, + "learning_rate": 1.0626775231963645e-05, + "loss": 0.0981, + "step": 24750 + }, + { + "epoch": 2.344252982389699, + "grad_norm": 0.43047043681144714, + "learning_rate": 1.0622988070441205e-05, + "loss": 0.1011, + "step": 24760 + }, + { + "epoch": 2.3451997727703087, + "grad_norm": 0.44232434034347534, + "learning_rate": 1.0619200908918765e-05, + "loss": 0.1123, + "step": 24770 + }, + { + "epoch": 2.3461465631509184, + "grad_norm": 0.37191641330718994, + "learning_rate": 1.0615413747396328e-05, + "loss": 0.1089, + "step": 24780 + }, + { + "epoch": 2.347093353531528, + "grad_norm": 0.3949145972728729, + "learning_rate": 1.0611626585873888e-05, + "loss": 0.0954, + "step": 24790 + }, + { + "epoch": 2.348040143912138, + "grad_norm": 0.4205836057662964, + "learning_rate": 1.0607839424351449e-05, + "loss": 0.1051, + "step": 24800 + }, + { + "epoch": 2.3489869342927476, + "grad_norm": 0.37778088450431824, + "learning_rate": 1.060405226282901e-05, + "loss": 0.1046, + "step": 24810 + }, + { + "epoch": 2.3499337246733574, + "grad_norm": 0.49821335077285767, + "learning_rate": 1.060026510130657e-05, + "loss": 0.0951, + "step": 24820 + }, + { + "epoch": 2.350880515053967, + "grad_norm": 0.4278693199157715, + "learning_rate": 1.0596477939784134e-05, + "loss": 0.1021, + "step": 24830 + }, + { + "epoch": 2.351827305434577, + "grad_norm": 0.33827054500579834, + "learning_rate": 1.0592690778261694e-05, + "loss": 0.0946, + "step": 24840 + }, + { + "epoch": 2.3527740958151866, + "grad_norm": 0.38886088132858276, + "learning_rate": 1.0588903616739254e-05, + "loss": 0.1012, + "step": 24850 + }, + { + "epoch": 2.3537208861957963, + "grad_norm": 0.40593603253364563, + "learning_rate": 1.0585116455216816e-05, + "loss": 0.0967, + "step": 24860 + }, + { + "epoch": 2.354667676576406, + "grad_norm": 0.45751744508743286, + "learning_rate": 1.0581329293694377e-05, + "loss": 0.1053, + "step": 24870 + }, + { + "epoch": 2.355614466957016, + "grad_norm": 0.5506783127784729, + "learning_rate": 1.0577542132171939e-05, + "loss": 0.1005, + "step": 24880 + }, + { + "epoch": 2.3565612573376256, + "grad_norm": 0.39473041892051697, + "learning_rate": 1.05737549706495e-05, + "loss": 0.1158, + "step": 24890 + }, + { + "epoch": 2.3575080477182353, + "grad_norm": 0.4578403830528259, + "learning_rate": 1.056996780912706e-05, + "loss": 0.1007, + "step": 24900 + }, + { + "epoch": 2.358454838098845, + "grad_norm": 0.4987157881259918, + "learning_rate": 1.0566180647604621e-05, + "loss": 0.106, + "step": 24910 + }, + { + "epoch": 2.359401628479455, + "grad_norm": 0.4185522794723511, + "learning_rate": 1.0562393486082183e-05, + "loss": 0.1079, + "step": 24920 + }, + { + "epoch": 2.3603484188600645, + "grad_norm": 0.5091202259063721, + "learning_rate": 1.0558606324559744e-05, + "loss": 0.0975, + "step": 24930 + }, + { + "epoch": 2.3612952092406743, + "grad_norm": 0.3753291666507721, + "learning_rate": 1.0554819163037304e-05, + "loss": 0.0955, + "step": 24940 + }, + { + "epoch": 2.362241999621284, + "grad_norm": 0.5717144012451172, + "learning_rate": 1.0551032001514865e-05, + "loss": 0.1101, + "step": 24950 + }, + { + "epoch": 2.3631887900018937, + "grad_norm": 0.3344045877456665, + "learning_rate": 1.0547244839992425e-05, + "loss": 0.0962, + "step": 24960 + }, + { + "epoch": 2.3641355803825035, + "grad_norm": 0.41045841574668884, + "learning_rate": 1.0543457678469988e-05, + "loss": 0.0969, + "step": 24970 + }, + { + "epoch": 2.3650823707631132, + "grad_norm": 0.37063470482826233, + "learning_rate": 1.0539670516947548e-05, + "loss": 0.1095, + "step": 24980 + }, + { + "epoch": 2.366029161143723, + "grad_norm": 0.43232864141464233, + "learning_rate": 1.053588335542511e-05, + "loss": 0.1046, + "step": 24990 + }, + { + "epoch": 2.3669759515243323, + "grad_norm": 0.3335595726966858, + "learning_rate": 1.053209619390267e-05, + "loss": 0.0973, + "step": 25000 + }, + { + "epoch": 2.367922741904942, + "grad_norm": 0.3094400465488434, + "learning_rate": 1.0528309032380233e-05, + "loss": 0.103, + "step": 25010 + }, + { + "epoch": 2.3688695322855517, + "grad_norm": 0.4552101790904999, + "learning_rate": 1.0524521870857793e-05, + "loss": 0.1036, + "step": 25020 + }, + { + "epoch": 2.3698163226661615, + "grad_norm": 0.44783899188041687, + "learning_rate": 1.0520734709335353e-05, + "loss": 0.0968, + "step": 25030 + }, + { + "epoch": 2.3707631130467712, + "grad_norm": 0.39976587891578674, + "learning_rate": 1.0516947547812915e-05, + "loss": 0.1006, + "step": 25040 + }, + { + "epoch": 2.371709903427381, + "grad_norm": 0.4066604673862457, + "learning_rate": 1.0513160386290475e-05, + "loss": 0.1012, + "step": 25050 + }, + { + "epoch": 2.3726566938079907, + "grad_norm": 0.3684154152870178, + "learning_rate": 1.0509373224768039e-05, + "loss": 0.1036, + "step": 25060 + }, + { + "epoch": 2.3736034841886005, + "grad_norm": 0.44917765259742737, + "learning_rate": 1.0505586063245599e-05, + "loss": 0.0948, + "step": 25070 + }, + { + "epoch": 2.37455027456921, + "grad_norm": 0.41735780239105225, + "learning_rate": 1.0501798901723159e-05, + "loss": 0.0976, + "step": 25080 + }, + { + "epoch": 2.37549706494982, + "grad_norm": 0.4906679093837738, + "learning_rate": 1.049801174020072e-05, + "loss": 0.1085, + "step": 25090 + }, + { + "epoch": 2.3764438553304297, + "grad_norm": 0.36485058069229126, + "learning_rate": 1.0494224578678282e-05, + "loss": 0.1058, + "step": 25100 + }, + { + "epoch": 2.3773906457110394, + "grad_norm": 0.41198959946632385, + "learning_rate": 1.0490437417155844e-05, + "loss": 0.1033, + "step": 25110 + }, + { + "epoch": 2.378337436091649, + "grad_norm": 0.3697530925273895, + "learning_rate": 1.0486650255633404e-05, + "loss": 0.1002, + "step": 25120 + }, + { + "epoch": 2.379284226472259, + "grad_norm": 0.3377944529056549, + "learning_rate": 1.0482863094110964e-05, + "loss": 0.1087, + "step": 25130 + }, + { + "epoch": 2.3802310168528686, + "grad_norm": 0.3125544786453247, + "learning_rate": 1.0479075932588524e-05, + "loss": 0.0987, + "step": 25140 + }, + { + "epoch": 2.3811778072334784, + "grad_norm": 0.5143665075302124, + "learning_rate": 1.0475288771066088e-05, + "loss": 0.1159, + "step": 25150 + }, + { + "epoch": 2.382124597614088, + "grad_norm": 0.6163843274116516, + "learning_rate": 1.0471501609543648e-05, + "loss": 0.1035, + "step": 25160 + }, + { + "epoch": 2.383071387994698, + "grad_norm": 0.37617814540863037, + "learning_rate": 1.046771444802121e-05, + "loss": 0.1069, + "step": 25170 + }, + { + "epoch": 2.3840181783753076, + "grad_norm": 0.33066341280937195, + "learning_rate": 1.046392728649877e-05, + "loss": 0.0948, + "step": 25180 + }, + { + "epoch": 2.3849649687559173, + "grad_norm": 0.4607396125793457, + "learning_rate": 1.046014012497633e-05, + "loss": 0.1102, + "step": 25190 + }, + { + "epoch": 2.385911759136527, + "grad_norm": 0.513787031173706, + "learning_rate": 1.0456352963453893e-05, + "loss": 0.1104, + "step": 25200 + }, + { + "epoch": 2.386858549517137, + "grad_norm": 0.4283960461616516, + "learning_rate": 1.0452565801931453e-05, + "loss": 0.1061, + "step": 25210 + }, + { + "epoch": 2.3878053398977466, + "grad_norm": 0.4610239267349243, + "learning_rate": 1.0448778640409015e-05, + "loss": 0.1001, + "step": 25220 + }, + { + "epoch": 2.3887521302783563, + "grad_norm": 0.44825395941734314, + "learning_rate": 1.0444991478886575e-05, + "loss": 0.103, + "step": 25230 + }, + { + "epoch": 2.389698920658966, + "grad_norm": 0.5586211681365967, + "learning_rate": 1.0441204317364138e-05, + "loss": 0.1103, + "step": 25240 + }, + { + "epoch": 2.390645711039576, + "grad_norm": 0.49608463048934937, + "learning_rate": 1.0437417155841698e-05, + "loss": 0.108, + "step": 25250 + }, + { + "epoch": 2.3915925014201855, + "grad_norm": 0.44712910056114197, + "learning_rate": 1.0433629994319258e-05, + "loss": 0.1046, + "step": 25260 + }, + { + "epoch": 2.3925392918007953, + "grad_norm": 0.39927998185157776, + "learning_rate": 1.042984283279682e-05, + "loss": 0.0994, + "step": 25270 + }, + { + "epoch": 2.393486082181405, + "grad_norm": 0.3669414222240448, + "learning_rate": 1.042605567127438e-05, + "loss": 0.1054, + "step": 25280 + }, + { + "epoch": 2.3944328725620148, + "grad_norm": 0.369989812374115, + "learning_rate": 1.0422268509751943e-05, + "loss": 0.0967, + "step": 25290 + }, + { + "epoch": 2.3953796629426245, + "grad_norm": 0.41249990463256836, + "learning_rate": 1.0418481348229504e-05, + "loss": 0.1009, + "step": 25300 + }, + { + "epoch": 2.3963264533232342, + "grad_norm": 0.4730875492095947, + "learning_rate": 1.0414694186707064e-05, + "loss": 0.1078, + "step": 25310 + }, + { + "epoch": 2.397273243703844, + "grad_norm": 0.3919539451599121, + "learning_rate": 1.0410907025184624e-05, + "loss": 0.101, + "step": 25320 + }, + { + "epoch": 2.3982200340844537, + "grad_norm": 0.3141326308250427, + "learning_rate": 1.0407119863662185e-05, + "loss": 0.0976, + "step": 25330 + }, + { + "epoch": 2.3991668244650635, + "grad_norm": 0.5199115872383118, + "learning_rate": 1.0403332702139747e-05, + "loss": 0.0982, + "step": 25340 + }, + { + "epoch": 2.400113614845673, + "grad_norm": 0.4684145152568817, + "learning_rate": 1.0399545540617309e-05, + "loss": 0.1113, + "step": 25350 + }, + { + "epoch": 2.401060405226283, + "grad_norm": 0.3334536552429199, + "learning_rate": 1.0395758379094869e-05, + "loss": 0.1024, + "step": 25360 + }, + { + "epoch": 2.4020071956068927, + "grad_norm": 0.4707101583480835, + "learning_rate": 1.0391971217572429e-05, + "loss": 0.0954, + "step": 25370 + }, + { + "epoch": 2.4029539859875024, + "grad_norm": 0.40972524881362915, + "learning_rate": 1.0388184056049992e-05, + "loss": 0.0996, + "step": 25380 + }, + { + "epoch": 2.403900776368112, + "grad_norm": 0.41831696033477783, + "learning_rate": 1.0384396894527552e-05, + "loss": 0.0975, + "step": 25390 + }, + { + "epoch": 2.404847566748722, + "grad_norm": 0.3924408555030823, + "learning_rate": 1.0380609733005114e-05, + "loss": 0.0992, + "step": 25400 + }, + { + "epoch": 2.4057943571293317, + "grad_norm": 0.30749720335006714, + "learning_rate": 1.0376822571482674e-05, + "loss": 0.0926, + "step": 25410 + }, + { + "epoch": 2.4067411475099414, + "grad_norm": 0.5540740489959717, + "learning_rate": 1.0373035409960234e-05, + "loss": 0.1105, + "step": 25420 + }, + { + "epoch": 2.407687937890551, + "grad_norm": 0.3289966285228729, + "learning_rate": 1.0369248248437798e-05, + "loss": 0.0948, + "step": 25430 + }, + { + "epoch": 2.408634728271161, + "grad_norm": 0.4328863322734833, + "learning_rate": 1.0365461086915358e-05, + "loss": 0.1089, + "step": 25440 + }, + { + "epoch": 2.4095815186517706, + "grad_norm": 0.40920546650886536, + "learning_rate": 1.036167392539292e-05, + "loss": 0.1039, + "step": 25450 + }, + { + "epoch": 2.4105283090323804, + "grad_norm": 0.3753580152988434, + "learning_rate": 1.035788676387048e-05, + "loss": 0.1008, + "step": 25460 + }, + { + "epoch": 2.41147509941299, + "grad_norm": 0.39004644751548767, + "learning_rate": 1.035409960234804e-05, + "loss": 0.0979, + "step": 25470 + }, + { + "epoch": 2.4124218897936, + "grad_norm": 0.4152458608150482, + "learning_rate": 1.0350312440825603e-05, + "loss": 0.0988, + "step": 25480 + }, + { + "epoch": 2.4133686801742096, + "grad_norm": 0.3045837879180908, + "learning_rate": 1.0346525279303163e-05, + "loss": 0.0986, + "step": 25490 + }, + { + "epoch": 2.4143154705548193, + "grad_norm": 0.4492361545562744, + "learning_rate": 1.0342738117780723e-05, + "loss": 0.1012, + "step": 25500 + }, + { + "epoch": 2.415262260935429, + "grad_norm": 0.35907822847366333, + "learning_rate": 1.0338950956258285e-05, + "loss": 0.1057, + "step": 25510 + }, + { + "epoch": 2.416209051316039, + "grad_norm": 0.39662420749664307, + "learning_rate": 1.0335163794735847e-05, + "loss": 0.097, + "step": 25520 + }, + { + "epoch": 2.4171558416966485, + "grad_norm": 0.3447544276714325, + "learning_rate": 1.0331376633213408e-05, + "loss": 0.0984, + "step": 25530 + }, + { + "epoch": 2.4181026320772583, + "grad_norm": 0.3469262719154358, + "learning_rate": 1.0327589471690968e-05, + "loss": 0.0924, + "step": 25540 + }, + { + "epoch": 2.419049422457868, + "grad_norm": 0.4851413369178772, + "learning_rate": 1.0323802310168528e-05, + "loss": 0.0913, + "step": 25550 + }, + { + "epoch": 2.4199962128384778, + "grad_norm": 0.4492618143558502, + "learning_rate": 1.032001514864609e-05, + "loss": 0.102, + "step": 25560 + }, + { + "epoch": 2.4209430032190875, + "grad_norm": 0.35623621940612793, + "learning_rate": 1.0316227987123652e-05, + "loss": 0.102, + "step": 25570 + }, + { + "epoch": 2.4218897935996972, + "grad_norm": 0.3780003488063812, + "learning_rate": 1.0312440825601214e-05, + "loss": 0.1039, + "step": 25580 + }, + { + "epoch": 2.422836583980307, + "grad_norm": 0.32068032026290894, + "learning_rate": 1.0308653664078774e-05, + "loss": 0.1024, + "step": 25590 + }, + { + "epoch": 2.4237833743609167, + "grad_norm": 0.49494946002960205, + "learning_rate": 1.0304866502556334e-05, + "loss": 0.1009, + "step": 25600 + }, + { + "epoch": 2.4247301647415265, + "grad_norm": 0.38599666953086853, + "learning_rate": 1.0301079341033896e-05, + "loss": 0.0999, + "step": 25610 + }, + { + "epoch": 2.4256769551221358, + "grad_norm": 0.4353082776069641, + "learning_rate": 1.0297292179511457e-05, + "loss": 0.0963, + "step": 25620 + }, + { + "epoch": 2.4266237455027455, + "grad_norm": 0.46552932262420654, + "learning_rate": 1.0293505017989019e-05, + "loss": 0.1018, + "step": 25630 + }, + { + "epoch": 2.4275705358833553, + "grad_norm": 0.45082664489746094, + "learning_rate": 1.0289717856466579e-05, + "loss": 0.0984, + "step": 25640 + }, + { + "epoch": 2.428517326263965, + "grad_norm": 0.4540878236293793, + "learning_rate": 1.028593069494414e-05, + "loss": 0.0968, + "step": 25650 + }, + { + "epoch": 2.4294641166445747, + "grad_norm": 0.43615978956222534, + "learning_rate": 1.0282143533421703e-05, + "loss": 0.1084, + "step": 25660 + }, + { + "epoch": 2.4304109070251845, + "grad_norm": 0.3706541359424591, + "learning_rate": 1.0278356371899263e-05, + "loss": 0.1014, + "step": 25670 + }, + { + "epoch": 2.431357697405794, + "grad_norm": 0.37163105607032776, + "learning_rate": 1.0274569210376823e-05, + "loss": 0.1007, + "step": 25680 + }, + { + "epoch": 2.432304487786404, + "grad_norm": 0.40251484513282776, + "learning_rate": 1.0270782048854384e-05, + "loss": 0.0975, + "step": 25690 + }, + { + "epoch": 2.4332512781670137, + "grad_norm": 0.49345412850379944, + "learning_rate": 1.0266994887331944e-05, + "loss": 0.1049, + "step": 25700 + }, + { + "epoch": 2.4341980685476234, + "grad_norm": 0.4760383665561676, + "learning_rate": 1.0263207725809508e-05, + "loss": 0.1036, + "step": 25710 + }, + { + "epoch": 2.435144858928233, + "grad_norm": 0.3557586669921875, + "learning_rate": 1.0259420564287068e-05, + "loss": 0.1005, + "step": 25720 + }, + { + "epoch": 2.436091649308843, + "grad_norm": 0.40292495489120483, + "learning_rate": 1.0255633402764628e-05, + "loss": 0.1133, + "step": 25730 + }, + { + "epoch": 2.4370384396894527, + "grad_norm": 0.47368377447128296, + "learning_rate": 1.025184624124219e-05, + "loss": 0.1011, + "step": 25740 + }, + { + "epoch": 2.4379852300700624, + "grad_norm": 0.36609113216400146, + "learning_rate": 1.024805907971975e-05, + "loss": 0.1008, + "step": 25750 + }, + { + "epoch": 2.438932020450672, + "grad_norm": 0.46925249695777893, + "learning_rate": 1.0244271918197313e-05, + "loss": 0.1067, + "step": 25760 + }, + { + "epoch": 2.439878810831282, + "grad_norm": 0.4214388132095337, + "learning_rate": 1.0240484756674873e-05, + "loss": 0.098, + "step": 25770 + }, + { + "epoch": 2.4408256012118916, + "grad_norm": 0.40375715494155884, + "learning_rate": 1.0236697595152433e-05, + "loss": 0.1016, + "step": 25780 + }, + { + "epoch": 2.4417723915925014, + "grad_norm": 0.4981019198894501, + "learning_rate": 1.0232910433629995e-05, + "loss": 0.0995, + "step": 25790 + }, + { + "epoch": 2.442719181973111, + "grad_norm": 0.363440603017807, + "learning_rate": 1.0229123272107557e-05, + "loss": 0.1016, + "step": 25800 + }, + { + "epoch": 2.443665972353721, + "grad_norm": 0.3500191569328308, + "learning_rate": 1.0225336110585119e-05, + "loss": 0.0983, + "step": 25810 + }, + { + "epoch": 2.4446127627343306, + "grad_norm": 0.4750962555408478, + "learning_rate": 1.0221548949062679e-05, + "loss": 0.1119, + "step": 25820 + }, + { + "epoch": 2.4455595531149403, + "grad_norm": 0.4848047196865082, + "learning_rate": 1.0217761787540239e-05, + "loss": 0.1047, + "step": 25830 + }, + { + "epoch": 2.44650634349555, + "grad_norm": 0.370795875787735, + "learning_rate": 1.02139746260178e-05, + "loss": 0.102, + "step": 25840 + }, + { + "epoch": 2.44745313387616, + "grad_norm": 0.32790902256965637, + "learning_rate": 1.0210187464495362e-05, + "loss": 0.1046, + "step": 25850 + }, + { + "epoch": 2.4483999242567696, + "grad_norm": 0.45114025473594666, + "learning_rate": 1.0206400302972922e-05, + "loss": 0.1025, + "step": 25860 + }, + { + "epoch": 2.4493467146373793, + "grad_norm": 0.45215851068496704, + "learning_rate": 1.0202613141450484e-05, + "loss": 0.0962, + "step": 25870 + }, + { + "epoch": 2.450293505017989, + "grad_norm": 0.5089224576950073, + "learning_rate": 1.0198825979928044e-05, + "loss": 0.0939, + "step": 25880 + }, + { + "epoch": 2.451240295398599, + "grad_norm": 0.4181041121482849, + "learning_rate": 1.0195038818405604e-05, + "loss": 0.0912, + "step": 25890 + }, + { + "epoch": 2.4521870857792085, + "grad_norm": 0.39679673314094543, + "learning_rate": 1.0191251656883167e-05, + "loss": 0.1028, + "step": 25900 + }, + { + "epoch": 2.4531338761598183, + "grad_norm": 0.434073269367218, + "learning_rate": 1.0187464495360728e-05, + "loss": 0.1004, + "step": 25910 + }, + { + "epoch": 2.454080666540428, + "grad_norm": 0.38664939999580383, + "learning_rate": 1.018367733383829e-05, + "loss": 0.1054, + "step": 25920 + }, + { + "epoch": 2.4550274569210377, + "grad_norm": 0.3743431568145752, + "learning_rate": 1.017989017231585e-05, + "loss": 0.1084, + "step": 25930 + }, + { + "epoch": 2.4559742473016475, + "grad_norm": 0.33121728897094727, + "learning_rate": 1.0176103010793413e-05, + "loss": 0.1053, + "step": 25940 + }, + { + "epoch": 2.4569210376822572, + "grad_norm": 0.40912964940071106, + "learning_rate": 1.0172315849270973e-05, + "loss": 0.1018, + "step": 25950 + }, + { + "epoch": 2.457867828062867, + "grad_norm": 0.36607125401496887, + "learning_rate": 1.0168528687748533e-05, + "loss": 0.0844, + "step": 25960 + }, + { + "epoch": 2.4588146184434767, + "grad_norm": 0.3174314796924591, + "learning_rate": 1.0164741526226095e-05, + "loss": 0.0992, + "step": 25970 + }, + { + "epoch": 2.4597614088240864, + "grad_norm": 0.47616884112358093, + "learning_rate": 1.0160954364703655e-05, + "loss": 0.0967, + "step": 25980 + }, + { + "epoch": 2.460708199204696, + "grad_norm": 0.44378870725631714, + "learning_rate": 1.0157167203181218e-05, + "loss": 0.1127, + "step": 25990 + }, + { + "epoch": 2.461654989585306, + "grad_norm": 0.37386101484298706, + "learning_rate": 1.0153380041658778e-05, + "loss": 0.103, + "step": 26000 + }, + { + "epoch": 2.4626017799659157, + "grad_norm": 0.3897585868835449, + "learning_rate": 1.0149592880136338e-05, + "loss": 0.1054, + "step": 26010 + }, + { + "epoch": 2.4635485703465254, + "grad_norm": 0.39490073919296265, + "learning_rate": 1.01458057186139e-05, + "loss": 0.0868, + "step": 26020 + }, + { + "epoch": 2.464495360727135, + "grad_norm": 0.3911207616329193, + "learning_rate": 1.014201855709146e-05, + "loss": 0.0994, + "step": 26030 + }, + { + "epoch": 2.465442151107745, + "grad_norm": 0.4419346749782562, + "learning_rate": 1.0138231395569022e-05, + "loss": 0.1015, + "step": 26040 + }, + { + "epoch": 2.4663889414883546, + "grad_norm": 0.38570520281791687, + "learning_rate": 1.0134444234046583e-05, + "loss": 0.1021, + "step": 26050 + }, + { + "epoch": 2.4673357318689644, + "grad_norm": 0.477276474237442, + "learning_rate": 1.0130657072524144e-05, + "loss": 0.115, + "step": 26060 + }, + { + "epoch": 2.468282522249574, + "grad_norm": 0.5218283534049988, + "learning_rate": 1.0126869911001704e-05, + "loss": 0.1097, + "step": 26070 + }, + { + "epoch": 2.469229312630184, + "grad_norm": 0.4555433690547943, + "learning_rate": 1.0123082749479267e-05, + "loss": 0.0965, + "step": 26080 + }, + { + "epoch": 2.4701761030107936, + "grad_norm": 0.44638416171073914, + "learning_rate": 1.0119295587956827e-05, + "loss": 0.1144, + "step": 26090 + }, + { + "epoch": 2.471122893391403, + "grad_norm": 0.4732908010482788, + "learning_rate": 1.0115508426434389e-05, + "loss": 0.1105, + "step": 26100 + }, + { + "epoch": 2.4720696837720126, + "grad_norm": 0.359528511762619, + "learning_rate": 1.0111721264911949e-05, + "loss": 0.1026, + "step": 26110 + }, + { + "epoch": 2.4730164741526224, + "grad_norm": 0.37714725732803345, + "learning_rate": 1.0107934103389509e-05, + "loss": 0.1029, + "step": 26120 + }, + { + "epoch": 2.473963264533232, + "grad_norm": 0.40320953726768494, + "learning_rate": 1.0104146941867072e-05, + "loss": 0.1089, + "step": 26130 + }, + { + "epoch": 2.474910054913842, + "grad_norm": 0.44699448347091675, + "learning_rate": 1.0100359780344632e-05, + "loss": 0.0999, + "step": 26140 + }, + { + "epoch": 2.4758568452944516, + "grad_norm": 0.4459933638572693, + "learning_rate": 1.0096572618822194e-05, + "loss": 0.1066, + "step": 26150 + }, + { + "epoch": 2.4768036356750613, + "grad_norm": 0.4070841670036316, + "learning_rate": 1.0092785457299754e-05, + "loss": 0.0888, + "step": 26160 + }, + { + "epoch": 2.477750426055671, + "grad_norm": 0.3742724359035492, + "learning_rate": 1.0088998295777314e-05, + "loss": 0.1042, + "step": 26170 + }, + { + "epoch": 2.478697216436281, + "grad_norm": 0.392292857170105, + "learning_rate": 1.0085211134254878e-05, + "loss": 0.1116, + "step": 26180 + }, + { + "epoch": 2.4796440068168906, + "grad_norm": 0.471701979637146, + "learning_rate": 1.0081423972732438e-05, + "loss": 0.1021, + "step": 26190 + }, + { + "epoch": 2.4805907971975003, + "grad_norm": 0.4960010349750519, + "learning_rate": 1.007763681121e-05, + "loss": 0.1079, + "step": 26200 + }, + { + "epoch": 2.48153758757811, + "grad_norm": 0.39826032519340515, + "learning_rate": 1.007384964968756e-05, + "loss": 0.1085, + "step": 26210 + }, + { + "epoch": 2.48248437795872, + "grad_norm": 0.6029541492462158, + "learning_rate": 1.0070062488165121e-05, + "loss": 0.1003, + "step": 26220 + }, + { + "epoch": 2.4834311683393295, + "grad_norm": 0.3961453437805176, + "learning_rate": 1.0066275326642683e-05, + "loss": 0.104, + "step": 26230 + }, + { + "epoch": 2.4843779587199393, + "grad_norm": 0.3852611482143402, + "learning_rate": 1.0062488165120243e-05, + "loss": 0.0978, + "step": 26240 + }, + { + "epoch": 2.485324749100549, + "grad_norm": 0.38596782088279724, + "learning_rate": 1.0058701003597803e-05, + "loss": 0.1016, + "step": 26250 + }, + { + "epoch": 2.4862715394811588, + "grad_norm": 0.4030351936817169, + "learning_rate": 1.0054913842075365e-05, + "loss": 0.0972, + "step": 26260 + }, + { + "epoch": 2.4872183298617685, + "grad_norm": 0.3803381621837616, + "learning_rate": 1.0051126680552927e-05, + "loss": 0.1077, + "step": 26270 + }, + { + "epoch": 2.4881651202423782, + "grad_norm": 0.36874496936798096, + "learning_rate": 1.0047339519030488e-05, + "loss": 0.0997, + "step": 26280 + }, + { + "epoch": 2.489111910622988, + "grad_norm": 0.3608757257461548, + "learning_rate": 1.0043552357508048e-05, + "loss": 0.1122, + "step": 26290 + }, + { + "epoch": 2.4900587010035977, + "grad_norm": 0.4197673499584198, + "learning_rate": 1.0039765195985608e-05, + "loss": 0.106, + "step": 26300 + }, + { + "epoch": 2.4910054913842075, + "grad_norm": 0.2702406644821167, + "learning_rate": 1.003597803446317e-05, + "loss": 0.0992, + "step": 26310 + }, + { + "epoch": 2.491952281764817, + "grad_norm": 0.41988605260849, + "learning_rate": 1.0032190872940732e-05, + "loss": 0.1146, + "step": 26320 + }, + { + "epoch": 2.492899072145427, + "grad_norm": 0.36301714181900024, + "learning_rate": 1.0028403711418294e-05, + "loss": 0.1033, + "step": 26330 + }, + { + "epoch": 2.4938458625260367, + "grad_norm": 0.3015279471874237, + "learning_rate": 1.0024616549895854e-05, + "loss": 0.1009, + "step": 26340 + }, + { + "epoch": 2.4947926529066464, + "grad_norm": 0.35299044847488403, + "learning_rate": 1.0020829388373414e-05, + "loss": 0.1005, + "step": 26350 + }, + { + "epoch": 2.495739443287256, + "grad_norm": 0.4299270212650299, + "learning_rate": 1.0017042226850977e-05, + "loss": 0.101, + "step": 26360 + }, + { + "epoch": 2.496686233667866, + "grad_norm": 0.37819668650627136, + "learning_rate": 1.0013255065328537e-05, + "loss": 0.1016, + "step": 26370 + }, + { + "epoch": 2.4976330240484756, + "grad_norm": 0.3589572012424469, + "learning_rate": 1.0009467903806099e-05, + "loss": 0.1035, + "step": 26380 + }, + { + "epoch": 2.4985798144290854, + "grad_norm": 0.46611204743385315, + "learning_rate": 1.0005680742283659e-05, + "loss": 0.0983, + "step": 26390 + }, + { + "epoch": 2.499526604809695, + "grad_norm": 0.3120664358139038, + "learning_rate": 1.0001893580761219e-05, + "loss": 0.0976, + "step": 26400 + }, + { + "epoch": 2.500473395190305, + "grad_norm": 0.5002902746200562, + "learning_rate": 9.99810641923878e-06, + "loss": 0.1132, + "step": 26410 + }, + { + "epoch": 2.5014201855709146, + "grad_norm": 0.4584709405899048, + "learning_rate": 9.994319257716343e-06, + "loss": 0.1046, + "step": 26420 + }, + { + "epoch": 2.5023669759515244, + "grad_norm": 0.3668809235095978, + "learning_rate": 9.990532096193903e-06, + "loss": 0.1001, + "step": 26430 + }, + { + "epoch": 2.503313766332134, + "grad_norm": 0.3679928183555603, + "learning_rate": 9.986744934671464e-06, + "loss": 0.1007, + "step": 26440 + }, + { + "epoch": 2.504260556712744, + "grad_norm": 0.5092624425888062, + "learning_rate": 9.982957773149026e-06, + "loss": 0.0932, + "step": 26450 + }, + { + "epoch": 2.5052073470933536, + "grad_norm": 0.441669762134552, + "learning_rate": 9.979170611626586e-06, + "loss": 0.1069, + "step": 26460 + }, + { + "epoch": 2.5061541374739633, + "grad_norm": 0.3531561493873596, + "learning_rate": 9.975383450104148e-06, + "loss": 0.099, + "step": 26470 + }, + { + "epoch": 2.507100927854573, + "grad_norm": 0.37312039732933044, + "learning_rate": 9.971596288581708e-06, + "loss": 0.0952, + "step": 26480 + }, + { + "epoch": 2.508047718235183, + "grad_norm": 0.34380894899368286, + "learning_rate": 9.96780912705927e-06, + "loss": 0.0982, + "step": 26490 + }, + { + "epoch": 2.5089945086157925, + "grad_norm": 0.4413946568965912, + "learning_rate": 9.964021965536831e-06, + "loss": 0.0978, + "step": 26500 + }, + { + "epoch": 2.5099412989964023, + "grad_norm": 0.31311383843421936, + "learning_rate": 9.960234804014393e-06, + "loss": 0.0958, + "step": 26510 + }, + { + "epoch": 2.510888089377012, + "grad_norm": 0.6745802164077759, + "learning_rate": 9.956447642491953e-06, + "loss": 0.1069, + "step": 26520 + }, + { + "epoch": 2.5118348797576218, + "grad_norm": 0.4960954189300537, + "learning_rate": 9.952660480969513e-06, + "loss": 0.1153, + "step": 26530 + }, + { + "epoch": 2.5127816701382315, + "grad_norm": 0.41089656949043274, + "learning_rate": 9.948873319447075e-06, + "loss": 0.104, + "step": 26540 + }, + { + "epoch": 2.5137284605188412, + "grad_norm": 0.4213186502456665, + "learning_rate": 9.945086157924637e-06, + "loss": 0.097, + "step": 26550 + }, + { + "epoch": 2.514675250899451, + "grad_norm": 0.5175119638442993, + "learning_rate": 9.941298996402199e-06, + "loss": 0.101, + "step": 26560 + }, + { + "epoch": 2.5156220412800607, + "grad_norm": 0.4612017273902893, + "learning_rate": 9.937511834879759e-06, + "loss": 0.1023, + "step": 26570 + }, + { + "epoch": 2.5165688316606705, + "grad_norm": 0.3659135699272156, + "learning_rate": 9.93372467335732e-06, + "loss": 0.0975, + "step": 26580 + }, + { + "epoch": 2.51751562204128, + "grad_norm": 0.38637617230415344, + "learning_rate": 9.92993751183488e-06, + "loss": 0.0971, + "step": 26590 + }, + { + "epoch": 2.51846241242189, + "grad_norm": 0.493749737739563, + "learning_rate": 9.92615035031244e-06, + "loss": 0.1004, + "step": 26600 + }, + { + "epoch": 2.5194092028024997, + "grad_norm": 0.36117368936538696, + "learning_rate": 9.922363188790002e-06, + "loss": 0.1025, + "step": 26610 + }, + { + "epoch": 2.5203559931831094, + "grad_norm": 0.36743780970573425, + "learning_rate": 9.918576027267564e-06, + "loss": 0.0948, + "step": 26620 + }, + { + "epoch": 2.521302783563719, + "grad_norm": 0.40864408016204834, + "learning_rate": 9.914788865745126e-06, + "loss": 0.1033, + "step": 26630 + }, + { + "epoch": 2.522249573944329, + "grad_norm": 0.41886240243911743, + "learning_rate": 9.911001704222686e-06, + "loss": 0.0943, + "step": 26640 + }, + { + "epoch": 2.5231963643249387, + "grad_norm": 0.42005595564842224, + "learning_rate": 9.907214542700247e-06, + "loss": 0.1059, + "step": 26650 + }, + { + "epoch": 2.5241431547055484, + "grad_norm": 0.4357232451438904, + "learning_rate": 9.903427381177807e-06, + "loss": 0.1176, + "step": 26660 + }, + { + "epoch": 2.525089945086158, + "grad_norm": 0.3606035113334656, + "learning_rate": 9.89964021965537e-06, + "loss": 0.0933, + "step": 26670 + }, + { + "epoch": 2.526036735466768, + "grad_norm": 0.40714558959007263, + "learning_rate": 9.895853058132931e-06, + "loss": 0.1028, + "step": 26680 + }, + { + "epoch": 2.5269835258473776, + "grad_norm": 0.4298784136772156, + "learning_rate": 9.892065896610491e-06, + "loss": 0.0944, + "step": 26690 + }, + { + "epoch": 2.5279303162279874, + "grad_norm": 0.46234458684921265, + "learning_rate": 9.888278735088053e-06, + "loss": 0.1018, + "step": 26700 + }, + { + "epoch": 2.528877106608597, + "grad_norm": 0.3541281819343567, + "learning_rate": 9.884491573565613e-06, + "loss": 0.0992, + "step": 26710 + }, + { + "epoch": 2.529823896989207, + "grad_norm": 0.4032933712005615, + "learning_rate": 9.880704412043175e-06, + "loss": 0.1043, + "step": 26720 + }, + { + "epoch": 2.5307706873698166, + "grad_norm": 0.39109474420547485, + "learning_rate": 9.876917250520736e-06, + "loss": 0.1141, + "step": 26730 + }, + { + "epoch": 2.5317174777504263, + "grad_norm": 0.36314740777015686, + "learning_rate": 9.873130088998296e-06, + "loss": 0.1048, + "step": 26740 + }, + { + "epoch": 2.5326642681310356, + "grad_norm": 0.42757153511047363, + "learning_rate": 9.869342927475858e-06, + "loss": 0.0944, + "step": 26750 + }, + { + "epoch": 2.5336110585116454, + "grad_norm": 0.43337103724479675, + "learning_rate": 9.865555765953418e-06, + "loss": 0.1059, + "step": 26760 + }, + { + "epoch": 2.534557848892255, + "grad_norm": 0.615561842918396, + "learning_rate": 9.86176860443098e-06, + "loss": 0.1011, + "step": 26770 + }, + { + "epoch": 2.535504639272865, + "grad_norm": 0.3941485583782196, + "learning_rate": 9.85798144290854e-06, + "loss": 0.1097, + "step": 26780 + }, + { + "epoch": 2.5364514296534746, + "grad_norm": 0.5565608739852905, + "learning_rate": 9.854194281386102e-06, + "loss": 0.1034, + "step": 26790 + }, + { + "epoch": 2.5373982200340843, + "grad_norm": 0.3893771469593048, + "learning_rate": 9.850407119863663e-06, + "loss": 0.105, + "step": 26800 + }, + { + "epoch": 2.538345010414694, + "grad_norm": 0.36800989508628845, + "learning_rate": 9.846619958341223e-06, + "loss": 0.1047, + "step": 26810 + }, + { + "epoch": 2.539291800795304, + "grad_norm": 0.3624418377876282, + "learning_rate": 9.842832796818785e-06, + "loss": 0.0904, + "step": 26820 + }, + { + "epoch": 2.5402385911759136, + "grad_norm": 0.4615015387535095, + "learning_rate": 9.839045635296345e-06, + "loss": 0.1125, + "step": 26830 + }, + { + "epoch": 2.5411853815565233, + "grad_norm": 0.4052465558052063, + "learning_rate": 9.835258473773907e-06, + "loss": 0.102, + "step": 26840 + }, + { + "epoch": 2.542132171937133, + "grad_norm": 0.3502803444862366, + "learning_rate": 9.831471312251469e-06, + "loss": 0.1073, + "step": 26850 + }, + { + "epoch": 2.5430789623177428, + "grad_norm": 0.3207018971443176, + "learning_rate": 9.82768415072903e-06, + "loss": 0.0975, + "step": 26860 + }, + { + "epoch": 2.5440257526983525, + "grad_norm": 0.7084227204322815, + "learning_rate": 9.82389698920659e-06, + "loss": 0.103, + "step": 26870 + }, + { + "epoch": 2.5449725430789623, + "grad_norm": 0.4235849976539612, + "learning_rate": 9.82010982768415e-06, + "loss": 0.101, + "step": 26880 + }, + { + "epoch": 2.545919333459572, + "grad_norm": 0.43773093819618225, + "learning_rate": 9.816322666161712e-06, + "loss": 0.0937, + "step": 26890 + }, + { + "epoch": 2.5468661238401817, + "grad_norm": 0.42589303851127625, + "learning_rate": 9.812535504639274e-06, + "loss": 0.0968, + "step": 26900 + }, + { + "epoch": 2.5478129142207915, + "grad_norm": 0.41181427240371704, + "learning_rate": 9.808748343116836e-06, + "loss": 0.1082, + "step": 26910 + }, + { + "epoch": 2.548759704601401, + "grad_norm": 0.4209132492542267, + "learning_rate": 9.804961181594396e-06, + "loss": 0.1005, + "step": 26920 + }, + { + "epoch": 2.549706494982011, + "grad_norm": 0.445416659116745, + "learning_rate": 9.801174020071958e-06, + "loss": 0.0934, + "step": 26930 + }, + { + "epoch": 2.5506532853626207, + "grad_norm": 0.392095148563385, + "learning_rate": 9.797386858549518e-06, + "loss": 0.1022, + "step": 26940 + }, + { + "epoch": 2.5516000757432304, + "grad_norm": 0.46347376704216003, + "learning_rate": 9.79359969702708e-06, + "loss": 0.1007, + "step": 26950 + }, + { + "epoch": 2.55254686612384, + "grad_norm": 0.37809625267982483, + "learning_rate": 9.78981253550464e-06, + "loss": 0.1021, + "step": 26960 + }, + { + "epoch": 2.55349365650445, + "grad_norm": 0.3950808346271515, + "learning_rate": 9.786025373982201e-06, + "loss": 0.1119, + "step": 26970 + }, + { + "epoch": 2.5544404468850597, + "grad_norm": 0.4668138325214386, + "learning_rate": 9.782238212459763e-06, + "loss": 0.1073, + "step": 26980 + }, + { + "epoch": 2.5553872372656694, + "grad_norm": 0.5842813849449158, + "learning_rate": 9.778451050937323e-06, + "loss": 0.1027, + "step": 26990 + }, + { + "epoch": 2.556334027646279, + "grad_norm": 0.4209204316139221, + "learning_rate": 9.774663889414885e-06, + "loss": 0.1093, + "step": 27000 + }, + { + "epoch": 2.557280818026889, + "grad_norm": 0.3692864775657654, + "learning_rate": 9.770876727892445e-06, + "loss": 0.099, + "step": 27010 + }, + { + "epoch": 2.5582276084074986, + "grad_norm": 0.4216177761554718, + "learning_rate": 9.767089566370007e-06, + "loss": 0.0859, + "step": 27020 + }, + { + "epoch": 2.5591743987881084, + "grad_norm": 0.3516777753829956, + "learning_rate": 9.763302404847568e-06, + "loss": 0.0966, + "step": 27030 + }, + { + "epoch": 2.560121189168718, + "grad_norm": 0.43485236167907715, + "learning_rate": 9.759515243325128e-06, + "loss": 0.1052, + "step": 27040 + }, + { + "epoch": 2.561067979549328, + "grad_norm": 0.3804272711277008, + "learning_rate": 9.75572808180269e-06, + "loss": 0.1069, + "step": 27050 + }, + { + "epoch": 2.5620147699299376, + "grad_norm": 0.7018653750419617, + "learning_rate": 9.75194092028025e-06, + "loss": 0.1099, + "step": 27060 + }, + { + "epoch": 2.5629615603105473, + "grad_norm": 0.37473297119140625, + "learning_rate": 9.748153758757812e-06, + "loss": 0.1122, + "step": 27070 + }, + { + "epoch": 2.563908350691157, + "grad_norm": 0.4419376850128174, + "learning_rate": 9.744366597235374e-06, + "loss": 0.1087, + "step": 27080 + }, + { + "epoch": 2.564855141071767, + "grad_norm": 0.4710046648979187, + "learning_rate": 9.740579435712934e-06, + "loss": 0.0974, + "step": 27090 + }, + { + "epoch": 2.5658019314523766, + "grad_norm": 0.32833173871040344, + "learning_rate": 9.736792274190495e-06, + "loss": 0.1037, + "step": 27100 + }, + { + "epoch": 2.5667487218329863, + "grad_norm": 0.44157248735427856, + "learning_rate": 9.733005112668055e-06, + "loss": 0.1031, + "step": 27110 + }, + { + "epoch": 2.567695512213596, + "grad_norm": 0.370108038187027, + "learning_rate": 9.729217951145617e-06, + "loss": 0.1024, + "step": 27120 + }, + { + "epoch": 2.568642302594206, + "grad_norm": 0.4278753697872162, + "learning_rate": 9.725430789623179e-06, + "loss": 0.107, + "step": 27130 + }, + { + "epoch": 2.5695890929748155, + "grad_norm": 0.3966275751590729, + "learning_rate": 9.721643628100739e-06, + "loss": 0.1007, + "step": 27140 + }, + { + "epoch": 2.5705358833554253, + "grad_norm": 0.4091242253780365, + "learning_rate": 9.7178564665783e-06, + "loss": 0.092, + "step": 27150 + }, + { + "epoch": 2.571482673736035, + "grad_norm": 0.47113844752311707, + "learning_rate": 9.71406930505586e-06, + "loss": 0.1154, + "step": 27160 + }, + { + "epoch": 2.5724294641166443, + "grad_norm": 0.4091624915599823, + "learning_rate": 9.710282143533423e-06, + "loss": 0.0941, + "step": 27170 + }, + { + "epoch": 2.573376254497254, + "grad_norm": 0.3549819886684418, + "learning_rate": 9.706494982010983e-06, + "loss": 0.1018, + "step": 27180 + }, + { + "epoch": 2.574323044877864, + "grad_norm": 0.4599911868572235, + "learning_rate": 9.702707820488544e-06, + "loss": 0.0995, + "step": 27190 + }, + { + "epoch": 2.5752698352584735, + "grad_norm": 0.39354264736175537, + "learning_rate": 9.698920658966106e-06, + "loss": 0.1001, + "step": 27200 + }, + { + "epoch": 2.5762166256390833, + "grad_norm": 0.6075589060783386, + "learning_rate": 9.695133497443668e-06, + "loss": 0.0988, + "step": 27210 + }, + { + "epoch": 2.577163416019693, + "grad_norm": 0.48551851511001587, + "learning_rate": 9.691346335921228e-06, + "loss": 0.1041, + "step": 27220 + }, + { + "epoch": 2.5781102064003028, + "grad_norm": 0.4040965139865875, + "learning_rate": 9.687559174398788e-06, + "loss": 0.1081, + "step": 27230 + }, + { + "epoch": 2.5790569967809125, + "grad_norm": 0.36837923526763916, + "learning_rate": 9.68377201287635e-06, + "loss": 0.0972, + "step": 27240 + }, + { + "epoch": 2.5800037871615222, + "grad_norm": 0.5304983258247375, + "learning_rate": 9.679984851353911e-06, + "loss": 0.0971, + "step": 27250 + }, + { + "epoch": 2.580950577542132, + "grad_norm": 0.4093058705329895, + "learning_rate": 9.676197689831473e-06, + "loss": 0.0941, + "step": 27260 + }, + { + "epoch": 2.5818973679227417, + "grad_norm": 0.4903060793876648, + "learning_rate": 9.672410528309033e-06, + "loss": 0.1164, + "step": 27270 + }, + { + "epoch": 2.5828441583033515, + "grad_norm": 0.41253364086151123, + "learning_rate": 9.668623366786595e-06, + "loss": 0.1075, + "step": 27280 + }, + { + "epoch": 2.583790948683961, + "grad_norm": 0.4047181010246277, + "learning_rate": 9.664836205264155e-06, + "loss": 0.0968, + "step": 27290 + }, + { + "epoch": 2.584737739064571, + "grad_norm": 0.5090596079826355, + "learning_rate": 9.661049043741717e-06, + "loss": 0.1035, + "step": 27300 + }, + { + "epoch": 2.5856845294451807, + "grad_norm": 0.378775030374527, + "learning_rate": 9.657261882219278e-06, + "loss": 0.1019, + "step": 27310 + }, + { + "epoch": 2.5866313198257904, + "grad_norm": 0.45318761467933655, + "learning_rate": 9.653474720696839e-06, + "loss": 0.107, + "step": 27320 + }, + { + "epoch": 2.5875781102064, + "grad_norm": 0.35085242986679077, + "learning_rate": 9.6496875591744e-06, + "loss": 0.1012, + "step": 27330 + }, + { + "epoch": 2.58852490058701, + "grad_norm": 0.31217142939567566, + "learning_rate": 9.64590039765196e-06, + "loss": 0.1014, + "step": 27340 + }, + { + "epoch": 2.5894716909676196, + "grad_norm": 0.3551445007324219, + "learning_rate": 9.642113236129522e-06, + "loss": 0.0934, + "step": 27350 + }, + { + "epoch": 2.5904184813482294, + "grad_norm": 0.37578850984573364, + "learning_rate": 9.638326074607082e-06, + "loss": 0.1089, + "step": 27360 + }, + { + "epoch": 2.591365271728839, + "grad_norm": 0.3457893133163452, + "learning_rate": 9.634538913084644e-06, + "loss": 0.1089, + "step": 27370 + }, + { + "epoch": 2.592312062109449, + "grad_norm": 0.5415358543395996, + "learning_rate": 9.630751751562206e-06, + "loss": 0.1047, + "step": 27380 + }, + { + "epoch": 2.5932588524900586, + "grad_norm": 0.4887268543243408, + "learning_rate": 9.626964590039766e-06, + "loss": 0.1154, + "step": 27390 + }, + { + "epoch": 2.5942056428706683, + "grad_norm": 0.348284512758255, + "learning_rate": 9.623177428517327e-06, + "loss": 0.0981, + "step": 27400 + }, + { + "epoch": 2.595152433251278, + "grad_norm": 0.3595815896987915, + "learning_rate": 9.619390266994887e-06, + "loss": 0.1144, + "step": 27410 + }, + { + "epoch": 2.596099223631888, + "grad_norm": 0.4330320656299591, + "learning_rate": 9.61560310547245e-06, + "loss": 0.1115, + "step": 27420 + }, + { + "epoch": 2.5970460140124976, + "grad_norm": 0.40695780515670776, + "learning_rate": 9.611815943950011e-06, + "loss": 0.1044, + "step": 27430 + }, + { + "epoch": 2.5979928043931073, + "grad_norm": 0.3428425192832947, + "learning_rate": 9.608028782427571e-06, + "loss": 0.0991, + "step": 27440 + }, + { + "epoch": 2.598939594773717, + "grad_norm": 0.4021109938621521, + "learning_rate": 9.604241620905133e-06, + "loss": 0.1029, + "step": 27450 + }, + { + "epoch": 2.599886385154327, + "grad_norm": 0.4028293490409851, + "learning_rate": 9.600454459382693e-06, + "loss": 0.106, + "step": 27460 + }, + { + "epoch": 2.6008331755349365, + "grad_norm": 0.4052664041519165, + "learning_rate": 9.596667297860255e-06, + "loss": 0.0996, + "step": 27470 + }, + { + "epoch": 2.6017799659155463, + "grad_norm": 0.4120866060256958, + "learning_rate": 9.592880136337816e-06, + "loss": 0.0961, + "step": 27480 + }, + { + "epoch": 2.602726756296156, + "grad_norm": 0.455219566822052, + "learning_rate": 9.589092974815378e-06, + "loss": 0.1044, + "step": 27490 + }, + { + "epoch": 2.6036735466767658, + "grad_norm": 0.45610177516937256, + "learning_rate": 9.585305813292938e-06, + "loss": 0.1042, + "step": 27500 + }, + { + "epoch": 2.6046203370573755, + "grad_norm": 0.42534270882606506, + "learning_rate": 9.581518651770498e-06, + "loss": 0.0997, + "step": 27510 + }, + { + "epoch": 2.6055671274379852, + "grad_norm": 0.45785534381866455, + "learning_rate": 9.57773149024806e-06, + "loss": 0.0963, + "step": 27520 + }, + { + "epoch": 2.606513917818595, + "grad_norm": 0.3305295705795288, + "learning_rate": 9.57394432872562e-06, + "loss": 0.1055, + "step": 27530 + }, + { + "epoch": 2.6074607081992047, + "grad_norm": 0.3611030578613281, + "learning_rate": 9.570157167203182e-06, + "loss": 0.1057, + "step": 27540 + }, + { + "epoch": 2.6084074985798145, + "grad_norm": 0.4136884808540344, + "learning_rate": 9.566370005680743e-06, + "loss": 0.1059, + "step": 27550 + }, + { + "epoch": 2.609354288960424, + "grad_norm": 0.5084044933319092, + "learning_rate": 9.562582844158305e-06, + "loss": 0.1014, + "step": 27560 + }, + { + "epoch": 2.610301079341034, + "grad_norm": 0.4754091799259186, + "learning_rate": 9.558795682635865e-06, + "loss": 0.1052, + "step": 27570 + }, + { + "epoch": 2.6112478697216437, + "grad_norm": 0.47309383749961853, + "learning_rate": 9.555008521113425e-06, + "loss": 0.1032, + "step": 27580 + }, + { + "epoch": 2.6121946601022534, + "grad_norm": 0.40714767575263977, + "learning_rate": 9.551221359590987e-06, + "loss": 0.1038, + "step": 27590 + }, + { + "epoch": 2.613141450482863, + "grad_norm": 0.5299632549285889, + "learning_rate": 9.547434198068549e-06, + "loss": 0.1031, + "step": 27600 + }, + { + "epoch": 2.614088240863473, + "grad_norm": 0.4505506157875061, + "learning_rate": 9.54364703654611e-06, + "loss": 0.1007, + "step": 27610 + }, + { + "epoch": 2.6150350312440827, + "grad_norm": 0.40479910373687744, + "learning_rate": 9.53985987502367e-06, + "loss": 0.0999, + "step": 27620 + }, + { + "epoch": 2.6159818216246924, + "grad_norm": 0.4297819137573242, + "learning_rate": 9.536072713501232e-06, + "loss": 0.0984, + "step": 27630 + }, + { + "epoch": 2.616928612005302, + "grad_norm": 0.48748689889907837, + "learning_rate": 9.532285551978792e-06, + "loss": 0.1042, + "step": 27640 + }, + { + "epoch": 2.617875402385912, + "grad_norm": 0.3475923240184784, + "learning_rate": 9.528498390456354e-06, + "loss": 0.1109, + "step": 27650 + }, + { + "epoch": 2.6188221927665216, + "grad_norm": 0.4447574019432068, + "learning_rate": 9.524711228933916e-06, + "loss": 0.0934, + "step": 27660 + }, + { + "epoch": 2.6197689831471314, + "grad_norm": 0.44557541608810425, + "learning_rate": 9.520924067411476e-06, + "loss": 0.1007, + "step": 27670 + }, + { + "epoch": 2.620715773527741, + "grad_norm": 0.32018914818763733, + "learning_rate": 9.517136905889038e-06, + "loss": 0.0991, + "step": 27680 + }, + { + "epoch": 2.621662563908351, + "grad_norm": 0.5814476609230042, + "learning_rate": 9.513349744366598e-06, + "loss": 0.1069, + "step": 27690 + }, + { + "epoch": 2.6226093542889606, + "grad_norm": 0.3966127634048462, + "learning_rate": 9.50956258284416e-06, + "loss": 0.0957, + "step": 27700 + }, + { + "epoch": 2.6235561446695703, + "grad_norm": 0.420902281999588, + "learning_rate": 9.50577542132172e-06, + "loss": 0.1155, + "step": 27710 + }, + { + "epoch": 2.62450293505018, + "grad_norm": 0.4212897717952728, + "learning_rate": 9.501988259799281e-06, + "loss": 0.101, + "step": 27720 + }, + { + "epoch": 2.62544972543079, + "grad_norm": 0.40939393639564514, + "learning_rate": 9.498201098276843e-06, + "loss": 0.0981, + "step": 27730 + }, + { + "epoch": 2.6263965158113995, + "grad_norm": 0.5749149918556213, + "learning_rate": 9.494413936754403e-06, + "loss": 0.1014, + "step": 27740 + }, + { + "epoch": 2.6273433061920093, + "grad_norm": 0.5041487812995911, + "learning_rate": 9.490626775231965e-06, + "loss": 0.099, + "step": 27750 + }, + { + "epoch": 2.628290096572619, + "grad_norm": 0.41404733061790466, + "learning_rate": 9.486839613709525e-06, + "loss": 0.0995, + "step": 27760 + }, + { + "epoch": 2.6292368869532288, + "grad_norm": 0.41370484232902527, + "learning_rate": 9.483052452187086e-06, + "loss": 0.0932, + "step": 27770 + }, + { + "epoch": 2.6301836773338385, + "grad_norm": 0.3428075909614563, + "learning_rate": 9.479265290664648e-06, + "loss": 0.0948, + "step": 27780 + }, + { + "epoch": 2.6311304677144483, + "grad_norm": 0.4095135033130646, + "learning_rate": 9.475478129142208e-06, + "loss": 0.1154, + "step": 27790 + }, + { + "epoch": 2.632077258095058, + "grad_norm": 0.40821513533592224, + "learning_rate": 9.47169096761977e-06, + "loss": 0.0991, + "step": 27800 + }, + { + "epoch": 2.6330240484756677, + "grad_norm": 0.46019446849823, + "learning_rate": 9.46790380609733e-06, + "loss": 0.1024, + "step": 27810 + }, + { + "epoch": 2.6339708388562775, + "grad_norm": 0.4173412024974823, + "learning_rate": 9.464116644574892e-06, + "loss": 0.1104, + "step": 27820 + }, + { + "epoch": 2.634917629236887, + "grad_norm": 0.39484941959381104, + "learning_rate": 9.460329483052454e-06, + "loss": 0.1056, + "step": 27830 + }, + { + "epoch": 2.635864419617497, + "grad_norm": 0.41165807843208313, + "learning_rate": 9.456542321530015e-06, + "loss": 0.1044, + "step": 27840 + }, + { + "epoch": 2.6368112099981063, + "grad_norm": 0.38451337814331055, + "learning_rate": 9.452755160007575e-06, + "loss": 0.0996, + "step": 27850 + }, + { + "epoch": 2.637758000378716, + "grad_norm": 0.382441908121109, + "learning_rate": 9.448967998485135e-06, + "loss": 0.1104, + "step": 27860 + }, + { + "epoch": 2.6387047907593257, + "grad_norm": 0.32897433638572693, + "learning_rate": 9.445180836962697e-06, + "loss": 0.0944, + "step": 27870 + }, + { + "epoch": 2.6396515811399355, + "grad_norm": 0.4390844404697418, + "learning_rate": 9.441393675440257e-06, + "loss": 0.1113, + "step": 27880 + }, + { + "epoch": 2.640598371520545, + "grad_norm": 0.38016265630722046, + "learning_rate": 9.437606513917819e-06, + "loss": 0.1019, + "step": 27890 + }, + { + "epoch": 2.641545161901155, + "grad_norm": 0.4154101610183716, + "learning_rate": 9.43381935239538e-06, + "loss": 0.1052, + "step": 27900 + }, + { + "epoch": 2.6424919522817647, + "grad_norm": 0.4230835437774658, + "learning_rate": 9.430032190872942e-06, + "loss": 0.108, + "step": 27910 + }, + { + "epoch": 2.6434387426623744, + "grad_norm": 0.41765904426574707, + "learning_rate": 9.426245029350502e-06, + "loss": 0.1081, + "step": 27920 + }, + { + "epoch": 2.644385533042984, + "grad_norm": 0.3967103362083435, + "learning_rate": 9.422457867828063e-06, + "loss": 0.0963, + "step": 27930 + }, + { + "epoch": 2.645332323423594, + "grad_norm": 0.39287203550338745, + "learning_rate": 9.418670706305624e-06, + "loss": 0.1028, + "step": 27940 + }, + { + "epoch": 2.6462791138042037, + "grad_norm": 0.38825488090515137, + "learning_rate": 9.414883544783186e-06, + "loss": 0.1122, + "step": 27950 + }, + { + "epoch": 2.6472259041848134, + "grad_norm": 0.3866090178489685, + "learning_rate": 9.411096383260748e-06, + "loss": 0.1035, + "step": 27960 + }, + { + "epoch": 2.648172694565423, + "grad_norm": 0.5346670150756836, + "learning_rate": 9.407309221738308e-06, + "loss": 0.1051, + "step": 27970 + }, + { + "epoch": 2.649119484946033, + "grad_norm": 0.3844970464706421, + "learning_rate": 9.40352206021587e-06, + "loss": 0.1082, + "step": 27980 + }, + { + "epoch": 2.6500662753266426, + "grad_norm": 0.39131975173950195, + "learning_rate": 9.39973489869343e-06, + "loss": 0.1047, + "step": 27990 + }, + { + "epoch": 2.6510130657072524, + "grad_norm": 0.3815428912639618, + "learning_rate": 9.395947737170991e-06, + "loss": 0.1039, + "step": 28000 + }, + { + "epoch": 2.651959856087862, + "grad_norm": 0.42498138546943665, + "learning_rate": 9.392160575648553e-06, + "loss": 0.1035, + "step": 28010 + }, + { + "epoch": 2.652906646468472, + "grad_norm": 0.32439425587654114, + "learning_rate": 9.388373414126113e-06, + "loss": 0.1077, + "step": 28020 + }, + { + "epoch": 2.6538534368490816, + "grad_norm": 0.40164288878440857, + "learning_rate": 9.384586252603675e-06, + "loss": 0.1024, + "step": 28030 + }, + { + "epoch": 2.6548002272296913, + "grad_norm": 0.4523821771144867, + "learning_rate": 9.380799091081235e-06, + "loss": 0.1006, + "step": 28040 + }, + { + "epoch": 2.655747017610301, + "grad_norm": 0.4752205014228821, + "learning_rate": 9.377011929558797e-06, + "loss": 0.0968, + "step": 28050 + }, + { + "epoch": 2.656693807990911, + "grad_norm": 0.3431170880794525, + "learning_rate": 9.373224768036357e-06, + "loss": 0.1011, + "step": 28060 + }, + { + "epoch": 2.6576405983715206, + "grad_norm": 0.37541231513023376, + "learning_rate": 9.369437606513918e-06, + "loss": 0.1063, + "step": 28070 + }, + { + "epoch": 2.6585873887521303, + "grad_norm": 0.3426431715488434, + "learning_rate": 9.36565044499148e-06, + "loss": 0.1008, + "step": 28080 + }, + { + "epoch": 2.65953417913274, + "grad_norm": 0.2883263826370239, + "learning_rate": 9.36186328346904e-06, + "loss": 0.0901, + "step": 28090 + }, + { + "epoch": 2.66048096951335, + "grad_norm": 0.5053014755249023, + "learning_rate": 9.358076121946602e-06, + "loss": 0.1126, + "step": 28100 + }, + { + "epoch": 2.6614277598939595, + "grad_norm": 0.5336264967918396, + "learning_rate": 9.354288960424162e-06, + "loss": 0.1027, + "step": 28110 + }, + { + "epoch": 2.6623745502745693, + "grad_norm": 0.3703152537345886, + "learning_rate": 9.350501798901724e-06, + "loss": 0.0943, + "step": 28120 + }, + { + "epoch": 2.663321340655179, + "grad_norm": 0.35078686475753784, + "learning_rate": 9.346714637379286e-06, + "loss": 0.1003, + "step": 28130 + }, + { + "epoch": 2.6642681310357887, + "grad_norm": 0.6297541260719299, + "learning_rate": 9.342927475856846e-06, + "loss": 0.1065, + "step": 28140 + }, + { + "epoch": 2.6652149214163985, + "grad_norm": 0.4121268093585968, + "learning_rate": 9.339140314334407e-06, + "loss": 0.0953, + "step": 28150 + }, + { + "epoch": 2.6661617117970082, + "grad_norm": 0.3645578622817993, + "learning_rate": 9.335353152811967e-06, + "loss": 0.095, + "step": 28160 + }, + { + "epoch": 2.667108502177618, + "grad_norm": 0.36650314927101135, + "learning_rate": 9.331565991289529e-06, + "loss": 0.0971, + "step": 28170 + }, + { + "epoch": 2.6680552925582277, + "grad_norm": 0.41033312678337097, + "learning_rate": 9.327778829767091e-06, + "loss": 0.1084, + "step": 28180 + }, + { + "epoch": 2.6690020829388375, + "grad_norm": 0.3698018193244934, + "learning_rate": 9.323991668244653e-06, + "loss": 0.0964, + "step": 28190 + }, + { + "epoch": 2.669948873319447, + "grad_norm": 0.4807288944721222, + "learning_rate": 9.320204506722213e-06, + "loss": 0.0984, + "step": 28200 + }, + { + "epoch": 2.670895663700057, + "grad_norm": 0.41571494936943054, + "learning_rate": 9.316417345199773e-06, + "loss": 0.1018, + "step": 28210 + }, + { + "epoch": 2.6718424540806667, + "grad_norm": 0.48666346073150635, + "learning_rate": 9.312630183677334e-06, + "loss": 0.0965, + "step": 28220 + }, + { + "epoch": 2.6727892444612764, + "grad_norm": 0.39642250537872314, + "learning_rate": 9.308843022154895e-06, + "loss": 0.0924, + "step": 28230 + }, + { + "epoch": 2.673736034841886, + "grad_norm": 0.5108928680419922, + "learning_rate": 9.305055860632456e-06, + "loss": 0.0957, + "step": 28240 + }, + { + "epoch": 2.674682825222496, + "grad_norm": 0.36331531405448914, + "learning_rate": 9.301268699110018e-06, + "loss": 0.1091, + "step": 28250 + }, + { + "epoch": 2.6756296156031056, + "grad_norm": 0.42903733253479004, + "learning_rate": 9.29748153758758e-06, + "loss": 0.0933, + "step": 28260 + }, + { + "epoch": 2.676576405983715, + "grad_norm": 0.4590227007865906, + "learning_rate": 9.29369437606514e-06, + "loss": 0.1076, + "step": 28270 + }, + { + "epoch": 2.6775231963643247, + "grad_norm": 0.3357694447040558, + "learning_rate": 9.2899072145427e-06, + "loss": 0.0952, + "step": 28280 + }, + { + "epoch": 2.6784699867449344, + "grad_norm": 0.441007137298584, + "learning_rate": 9.286120053020262e-06, + "loss": 0.096, + "step": 28290 + }, + { + "epoch": 2.679416777125544, + "grad_norm": 0.47577551007270813, + "learning_rate": 9.282332891497823e-06, + "loss": 0.1044, + "step": 28300 + }, + { + "epoch": 2.680363567506154, + "grad_norm": 0.3424263298511505, + "learning_rate": 9.278545729975385e-06, + "loss": 0.1007, + "step": 28310 + }, + { + "epoch": 2.6813103578867636, + "grad_norm": 0.46387240290641785, + "learning_rate": 9.274758568452945e-06, + "loss": 0.1009, + "step": 28320 + }, + { + "epoch": 2.6822571482673734, + "grad_norm": 0.3691031336784363, + "learning_rate": 9.270971406930507e-06, + "loss": 0.0984, + "step": 28330 + }, + { + "epoch": 2.683203938647983, + "grad_norm": 0.42378664016723633, + "learning_rate": 9.267184245408067e-06, + "loss": 0.1095, + "step": 28340 + }, + { + "epoch": 2.684150729028593, + "grad_norm": 0.4089430570602417, + "learning_rate": 9.263397083885629e-06, + "loss": 0.1112, + "step": 28350 + }, + { + "epoch": 2.6850975194092026, + "grad_norm": 0.41893911361694336, + "learning_rate": 9.25960992236319e-06, + "loss": 0.1007, + "step": 28360 + }, + { + "epoch": 2.6860443097898123, + "grad_norm": 0.40327945351600647, + "learning_rate": 9.25582276084075e-06, + "loss": 0.113, + "step": 28370 + }, + { + "epoch": 2.686991100170422, + "grad_norm": 0.39842647314071655, + "learning_rate": 9.252035599318312e-06, + "loss": 0.1074, + "step": 28380 + }, + { + "epoch": 2.687937890551032, + "grad_norm": 0.36703211069107056, + "learning_rate": 9.248248437795872e-06, + "loss": 0.1043, + "step": 28390 + }, + { + "epoch": 2.6888846809316416, + "grad_norm": 0.44750773906707764, + "learning_rate": 9.244461276273434e-06, + "loss": 0.1092, + "step": 28400 + }, + { + "epoch": 2.6898314713122513, + "grad_norm": 0.42375195026397705, + "learning_rate": 9.240674114750994e-06, + "loss": 0.1054, + "step": 28410 + }, + { + "epoch": 2.690778261692861, + "grad_norm": 0.4005579650402069, + "learning_rate": 9.236886953228556e-06, + "loss": 0.1114, + "step": 28420 + }, + { + "epoch": 2.691725052073471, + "grad_norm": 0.3783735930919647, + "learning_rate": 9.233099791706118e-06, + "loss": 0.0922, + "step": 28430 + }, + { + "epoch": 2.6926718424540805, + "grad_norm": 0.3596361577510834, + "learning_rate": 9.229312630183678e-06, + "loss": 0.1046, + "step": 28440 + }, + { + "epoch": 2.6936186328346903, + "grad_norm": 0.3943907916545868, + "learning_rate": 9.22552546866124e-06, + "loss": 0.1097, + "step": 28450 + }, + { + "epoch": 2.6945654232153, + "grad_norm": 0.3750338852405548, + "learning_rate": 9.2217383071388e-06, + "loss": 0.1066, + "step": 28460 + }, + { + "epoch": 2.6955122135959098, + "grad_norm": 0.34630802273750305, + "learning_rate": 9.217951145616361e-06, + "loss": 0.0975, + "step": 28470 + }, + { + "epoch": 2.6964590039765195, + "grad_norm": 0.3319561779499054, + "learning_rate": 9.214163984093923e-06, + "loss": 0.0873, + "step": 28480 + }, + { + "epoch": 2.6974057943571292, + "grad_norm": 0.4126971364021301, + "learning_rate": 9.210376822571483e-06, + "loss": 0.1056, + "step": 28490 + }, + { + "epoch": 2.698352584737739, + "grad_norm": 0.35686981678009033, + "learning_rate": 9.206589661049045e-06, + "loss": 0.0919, + "step": 28500 + }, + { + "epoch": 2.6992993751183487, + "grad_norm": 0.3125929832458496, + "learning_rate": 9.202802499526605e-06, + "loss": 0.1018, + "step": 28510 + }, + { + "epoch": 2.7002461654989585, + "grad_norm": 0.47137147188186646, + "learning_rate": 9.199015338004166e-06, + "loss": 0.0992, + "step": 28520 + }, + { + "epoch": 2.701192955879568, + "grad_norm": 0.34128260612487793, + "learning_rate": 9.195228176481728e-06, + "loss": 0.1002, + "step": 28530 + }, + { + "epoch": 2.702139746260178, + "grad_norm": 0.3971148133277893, + "learning_rate": 9.19144101495929e-06, + "loss": 0.0989, + "step": 28540 + }, + { + "epoch": 2.7030865366407877, + "grad_norm": 0.5022692680358887, + "learning_rate": 9.18765385343685e-06, + "loss": 0.1152, + "step": 28550 + }, + { + "epoch": 2.7040333270213974, + "grad_norm": 0.4363287687301636, + "learning_rate": 9.18386669191441e-06, + "loss": 0.1099, + "step": 28560 + }, + { + "epoch": 2.704980117402007, + "grad_norm": 0.46551355719566345, + "learning_rate": 9.180079530391972e-06, + "loss": 0.1023, + "step": 28570 + }, + { + "epoch": 2.705926907782617, + "grad_norm": 0.32491248846054077, + "learning_rate": 9.176292368869534e-06, + "loss": 0.0966, + "step": 28580 + }, + { + "epoch": 2.7068736981632266, + "grad_norm": 0.46695515513420105, + "learning_rate": 9.172505207347094e-06, + "loss": 0.1006, + "step": 28590 + }, + { + "epoch": 2.7078204885438364, + "grad_norm": 0.38846078515052795, + "learning_rate": 9.168718045824655e-06, + "loss": 0.11, + "step": 28600 + }, + { + "epoch": 2.708767278924446, + "grad_norm": 0.4434100091457367, + "learning_rate": 9.164930884302217e-06, + "loss": 0.1016, + "step": 28610 + }, + { + "epoch": 2.709714069305056, + "grad_norm": 0.5759299993515015, + "learning_rate": 9.161143722779777e-06, + "loss": 0.101, + "step": 28620 + }, + { + "epoch": 2.7106608596856656, + "grad_norm": 0.4500638246536255, + "learning_rate": 9.157356561257337e-06, + "loss": 0.1084, + "step": 28630 + }, + { + "epoch": 2.7116076500662754, + "grad_norm": 0.4416857659816742, + "learning_rate": 9.153569399734899e-06, + "loss": 0.1003, + "step": 28640 + }, + { + "epoch": 2.712554440446885, + "grad_norm": 0.35741469264030457, + "learning_rate": 9.14978223821246e-06, + "loss": 0.1061, + "step": 28650 + }, + { + "epoch": 2.713501230827495, + "grad_norm": 0.39139777421951294, + "learning_rate": 9.145995076690022e-06, + "loss": 0.0985, + "step": 28660 + }, + { + "epoch": 2.7144480212081046, + "grad_norm": 0.778073787689209, + "learning_rate": 9.142207915167582e-06, + "loss": 0.1092, + "step": 28670 + }, + { + "epoch": 2.7153948115887143, + "grad_norm": 0.4162473976612091, + "learning_rate": 9.138420753645144e-06, + "loss": 0.1075, + "step": 28680 + }, + { + "epoch": 2.716341601969324, + "grad_norm": 0.4190244972705841, + "learning_rate": 9.134633592122704e-06, + "loss": 0.1046, + "step": 28690 + }, + { + "epoch": 2.717288392349934, + "grad_norm": 0.4655373990535736, + "learning_rate": 9.130846430600266e-06, + "loss": 0.1061, + "step": 28700 + }, + { + "epoch": 2.7182351827305435, + "grad_norm": 0.43157628178596497, + "learning_rate": 9.127059269077828e-06, + "loss": 0.104, + "step": 28710 + }, + { + "epoch": 2.7191819731111533, + "grad_norm": 0.35439619421958923, + "learning_rate": 9.123272107555388e-06, + "loss": 0.0998, + "step": 28720 + }, + { + "epoch": 2.720128763491763, + "grad_norm": 0.4340245723724365, + "learning_rate": 9.11948494603295e-06, + "loss": 0.1002, + "step": 28730 + }, + { + "epoch": 2.7210755538723728, + "grad_norm": 0.40614044666290283, + "learning_rate": 9.11569778451051e-06, + "loss": 0.0958, + "step": 28740 + }, + { + "epoch": 2.7220223442529825, + "grad_norm": 0.4191066324710846, + "learning_rate": 9.111910622988071e-06, + "loss": 0.0966, + "step": 28750 + }, + { + "epoch": 2.7229691346335922, + "grad_norm": 0.4126628637313843, + "learning_rate": 9.108123461465633e-06, + "loss": 0.1123, + "step": 28760 + }, + { + "epoch": 2.723915925014202, + "grad_norm": 0.515906035900116, + "learning_rate": 9.104336299943193e-06, + "loss": 0.1094, + "step": 28770 + }, + { + "epoch": 2.7248627153948117, + "grad_norm": 0.3944433331489563, + "learning_rate": 9.100549138420755e-06, + "loss": 0.1047, + "step": 28780 + }, + { + "epoch": 2.7258095057754215, + "grad_norm": 0.5118011832237244, + "learning_rate": 9.096761976898315e-06, + "loss": 0.1007, + "step": 28790 + }, + { + "epoch": 2.726756296156031, + "grad_norm": 0.4848870038986206, + "learning_rate": 9.092974815375877e-06, + "loss": 0.1019, + "step": 28800 + }, + { + "epoch": 2.727703086536641, + "grad_norm": 0.5556014776229858, + "learning_rate": 9.089187653853437e-06, + "loss": 0.1018, + "step": 28810 + }, + { + "epoch": 2.7286498769172507, + "grad_norm": 0.4127148687839508, + "learning_rate": 9.085400492330998e-06, + "loss": 0.1007, + "step": 28820 + }, + { + "epoch": 2.7295966672978604, + "grad_norm": 0.4346880614757538, + "learning_rate": 9.08161333080856e-06, + "loss": 0.094, + "step": 28830 + }, + { + "epoch": 2.73054345767847, + "grad_norm": 0.41865330934524536, + "learning_rate": 9.07782616928612e-06, + "loss": 0.0938, + "step": 28840 + }, + { + "epoch": 2.73149024805908, + "grad_norm": 0.3504531681537628, + "learning_rate": 9.074039007763682e-06, + "loss": 0.096, + "step": 28850 + }, + { + "epoch": 2.7324370384396897, + "grad_norm": 0.43708714842796326, + "learning_rate": 9.070251846241242e-06, + "loss": 0.1066, + "step": 28860 + }, + { + "epoch": 2.7333838288202994, + "grad_norm": 0.3688562512397766, + "learning_rate": 9.066464684718804e-06, + "loss": 0.0983, + "step": 28870 + }, + { + "epoch": 2.734330619200909, + "grad_norm": 0.41222694516181946, + "learning_rate": 9.062677523196365e-06, + "loss": 0.1142, + "step": 28880 + }, + { + "epoch": 2.735277409581519, + "grad_norm": 0.329771488904953, + "learning_rate": 9.058890361673927e-06, + "loss": 0.092, + "step": 28890 + }, + { + "epoch": 2.7362241999621286, + "grad_norm": 0.3156600594520569, + "learning_rate": 9.055103200151487e-06, + "loss": 0.1007, + "step": 28900 + }, + { + "epoch": 2.7371709903427384, + "grad_norm": 0.48856040835380554, + "learning_rate": 9.051316038629047e-06, + "loss": 0.1107, + "step": 28910 + }, + { + "epoch": 2.738117780723348, + "grad_norm": 0.39420148730278015, + "learning_rate": 9.047528877106609e-06, + "loss": 0.097, + "step": 28920 + }, + { + "epoch": 2.739064571103958, + "grad_norm": 0.4240681827068329, + "learning_rate": 9.04374171558417e-06, + "loss": 0.1077, + "step": 28930 + }, + { + "epoch": 2.7400113614845676, + "grad_norm": 0.43309491872787476, + "learning_rate": 9.039954554061733e-06, + "loss": 0.1095, + "step": 28940 + }, + { + "epoch": 2.740958151865177, + "grad_norm": 0.48016998171806335, + "learning_rate": 9.036167392539293e-06, + "loss": 0.096, + "step": 28950 + }, + { + "epoch": 2.7419049422457866, + "grad_norm": 0.34501051902770996, + "learning_rate": 9.032380231016854e-06, + "loss": 0.1063, + "step": 28960 + }, + { + "epoch": 2.7428517326263964, + "grad_norm": 0.42231935262680054, + "learning_rate": 9.028593069494414e-06, + "loss": 0.1087, + "step": 28970 + }, + { + "epoch": 2.743798523007006, + "grad_norm": 0.3782620429992676, + "learning_rate": 9.024805907971974e-06, + "loss": 0.099, + "step": 28980 + }, + { + "epoch": 2.744745313387616, + "grad_norm": 0.4469917118549347, + "learning_rate": 9.021018746449536e-06, + "loss": 0.096, + "step": 28990 + }, + { + "epoch": 2.7456921037682256, + "grad_norm": 0.3554760217666626, + "learning_rate": 9.017231584927098e-06, + "loss": 0.0918, + "step": 29000 + }, + { + "epoch": 2.7466388941488353, + "grad_norm": 0.3817310929298401, + "learning_rate": 9.01344442340466e-06, + "loss": 0.1054, + "step": 29010 + }, + { + "epoch": 2.747585684529445, + "grad_norm": 0.37116482853889465, + "learning_rate": 9.00965726188222e-06, + "loss": 0.1005, + "step": 29020 + }, + { + "epoch": 2.748532474910055, + "grad_norm": 0.41311806440353394, + "learning_rate": 9.005870100359781e-06, + "loss": 0.1086, + "step": 29030 + }, + { + "epoch": 2.7494792652906646, + "grad_norm": 0.648277223110199, + "learning_rate": 9.002082938837342e-06, + "loss": 0.1081, + "step": 29040 + }, + { + "epoch": 2.7504260556712743, + "grad_norm": 0.39448094367980957, + "learning_rate": 8.998295777314903e-06, + "loss": 0.105, + "step": 29050 + }, + { + "epoch": 2.751372846051884, + "grad_norm": 0.4399425983428955, + "learning_rate": 8.994508615792465e-06, + "loss": 0.1081, + "step": 29060 + }, + { + "epoch": 2.7523196364324938, + "grad_norm": 0.4444071650505066, + "learning_rate": 8.990721454270025e-06, + "loss": 0.0978, + "step": 29070 + }, + { + "epoch": 2.7532664268131035, + "grad_norm": 0.5003328919410706, + "learning_rate": 8.986934292747587e-06, + "loss": 0.1057, + "step": 29080 + }, + { + "epoch": 2.7542132171937133, + "grad_norm": 0.42164021730422974, + "learning_rate": 8.983147131225147e-06, + "loss": 0.1144, + "step": 29090 + }, + { + "epoch": 2.755160007574323, + "grad_norm": 0.43083667755126953, + "learning_rate": 8.979359969702709e-06, + "loss": 0.0995, + "step": 29100 + }, + { + "epoch": 2.7561067979549327, + "grad_norm": 0.4107886552810669, + "learning_rate": 8.97557280818027e-06, + "loss": 0.104, + "step": 29110 + }, + { + "epoch": 2.7570535883355425, + "grad_norm": 0.3801655173301697, + "learning_rate": 8.97178564665783e-06, + "loss": 0.0987, + "step": 29120 + }, + { + "epoch": 2.7580003787161522, + "grad_norm": 0.38127654790878296, + "learning_rate": 8.967998485135392e-06, + "loss": 0.1015, + "step": 29130 + }, + { + "epoch": 2.758947169096762, + "grad_norm": 0.3987577259540558, + "learning_rate": 8.964211323612952e-06, + "loss": 0.11, + "step": 29140 + }, + { + "epoch": 2.7598939594773717, + "grad_norm": 0.4438229203224182, + "learning_rate": 8.960424162090514e-06, + "loss": 0.1074, + "step": 29150 + }, + { + "epoch": 2.7608407498579814, + "grad_norm": 0.4837617874145508, + "learning_rate": 8.956637000568074e-06, + "loss": 0.0993, + "step": 29160 + }, + { + "epoch": 2.761787540238591, + "grad_norm": 0.3521498739719391, + "learning_rate": 8.952849839045636e-06, + "loss": 0.1008, + "step": 29170 + }, + { + "epoch": 2.762734330619201, + "grad_norm": 0.40667611360549927, + "learning_rate": 8.949062677523197e-06, + "loss": 0.1001, + "step": 29180 + }, + { + "epoch": 2.7636811209998107, + "grad_norm": 0.420484334230423, + "learning_rate": 8.94527551600076e-06, + "loss": 0.1032, + "step": 29190 + }, + { + "epoch": 2.7646279113804204, + "grad_norm": 0.49111509323120117, + "learning_rate": 8.94148835447832e-06, + "loss": 0.1025, + "step": 29200 + }, + { + "epoch": 2.76557470176103, + "grad_norm": 0.4495706260204315, + "learning_rate": 8.93770119295588e-06, + "loss": 0.1005, + "step": 29210 + }, + { + "epoch": 2.76652149214164, + "grad_norm": 0.45926108956336975, + "learning_rate": 8.933914031433441e-06, + "loss": 0.0989, + "step": 29220 + }, + { + "epoch": 2.7674682825222496, + "grad_norm": 0.4548151195049286, + "learning_rate": 8.930126869911003e-06, + "loss": 0.0988, + "step": 29230 + }, + { + "epoch": 2.7684150729028594, + "grad_norm": 0.47408318519592285, + "learning_rate": 8.926339708388565e-06, + "loss": 0.0912, + "step": 29240 + }, + { + "epoch": 2.769361863283469, + "grad_norm": 0.41850709915161133, + "learning_rate": 8.922552546866125e-06, + "loss": 0.0955, + "step": 29250 + }, + { + "epoch": 2.770308653664079, + "grad_norm": 0.4139980673789978, + "learning_rate": 8.918765385343686e-06, + "loss": 0.0994, + "step": 29260 + }, + { + "epoch": 2.7712554440446886, + "grad_norm": 0.4438539743423462, + "learning_rate": 8.914978223821246e-06, + "loss": 0.1026, + "step": 29270 + }, + { + "epoch": 2.7722022344252983, + "grad_norm": 0.34790053963661194, + "learning_rate": 8.911191062298808e-06, + "loss": 0.103, + "step": 29280 + }, + { + "epoch": 2.773149024805908, + "grad_norm": 0.48705199360847473, + "learning_rate": 8.90740390077637e-06, + "loss": 0.1025, + "step": 29290 + }, + { + "epoch": 2.774095815186518, + "grad_norm": 0.46644020080566406, + "learning_rate": 8.90361673925393e-06, + "loss": 0.1061, + "step": 29300 + }, + { + "epoch": 2.7750426055671276, + "grad_norm": 0.28609853982925415, + "learning_rate": 8.899829577731492e-06, + "loss": 0.105, + "step": 29310 + }, + { + "epoch": 2.7759893959477373, + "grad_norm": 0.47828933596611023, + "learning_rate": 8.896042416209052e-06, + "loss": 0.1074, + "step": 29320 + }, + { + "epoch": 2.776936186328347, + "grad_norm": 0.530474841594696, + "learning_rate": 8.892255254686613e-06, + "loss": 0.0944, + "step": 29330 + }, + { + "epoch": 2.777882976708957, + "grad_norm": 0.46874600648880005, + "learning_rate": 8.888468093164174e-06, + "loss": 0.0999, + "step": 29340 + }, + { + "epoch": 2.7788297670895665, + "grad_norm": 0.4388161301612854, + "learning_rate": 8.884680931641735e-06, + "loss": 0.092, + "step": 29350 + }, + { + "epoch": 2.7797765574701763, + "grad_norm": 0.35012832283973694, + "learning_rate": 8.880893770119297e-06, + "loss": 0.1041, + "step": 29360 + }, + { + "epoch": 2.7807233478507856, + "grad_norm": 0.4594600796699524, + "learning_rate": 8.877106608596857e-06, + "loss": 0.096, + "step": 29370 + }, + { + "epoch": 2.7816701382313953, + "grad_norm": 0.3499717116355896, + "learning_rate": 8.873319447074419e-06, + "loss": 0.0911, + "step": 29380 + }, + { + "epoch": 2.782616928612005, + "grad_norm": 0.431905597448349, + "learning_rate": 8.869532285551979e-06, + "loss": 0.0965, + "step": 29390 + }, + { + "epoch": 2.783563718992615, + "grad_norm": 0.3726125955581665, + "learning_rate": 8.86574512402954e-06, + "loss": 0.0958, + "step": 29400 + }, + { + "epoch": 2.7845105093732245, + "grad_norm": 0.38136884570121765, + "learning_rate": 8.861957962507102e-06, + "loss": 0.1041, + "step": 29410 + }, + { + "epoch": 2.7854572997538343, + "grad_norm": 0.398968368768692, + "learning_rate": 8.858170800984662e-06, + "loss": 0.0988, + "step": 29420 + }, + { + "epoch": 2.786404090134444, + "grad_norm": 0.4601745307445526, + "learning_rate": 8.854383639462224e-06, + "loss": 0.1027, + "step": 29430 + }, + { + "epoch": 2.7873508805150538, + "grad_norm": 0.34036746621131897, + "learning_rate": 8.850596477939784e-06, + "loss": 0.1155, + "step": 29440 + }, + { + "epoch": 2.7882976708956635, + "grad_norm": 0.3742331266403198, + "learning_rate": 8.846809316417346e-06, + "loss": 0.1044, + "step": 29450 + }, + { + "epoch": 2.7892444612762732, + "grad_norm": 0.40490207076072693, + "learning_rate": 8.843022154894908e-06, + "loss": 0.1057, + "step": 29460 + }, + { + "epoch": 2.790191251656883, + "grad_norm": 0.37119242548942566, + "learning_rate": 8.83923499337247e-06, + "loss": 0.099, + "step": 29470 + }, + { + "epoch": 2.7911380420374927, + "grad_norm": 0.36027467250823975, + "learning_rate": 8.83544783185003e-06, + "loss": 0.0929, + "step": 29480 + }, + { + "epoch": 2.7920848324181025, + "grad_norm": 0.4557902216911316, + "learning_rate": 8.83166067032759e-06, + "loss": 0.0963, + "step": 29490 + }, + { + "epoch": 2.793031622798712, + "grad_norm": 0.4901174306869507, + "learning_rate": 8.827873508805151e-06, + "loss": 0.1129, + "step": 29500 + }, + { + "epoch": 2.793978413179322, + "grad_norm": 0.5360562205314636, + "learning_rate": 8.824086347282711e-06, + "loss": 0.1027, + "step": 29510 + }, + { + "epoch": 2.7949252035599317, + "grad_norm": 0.36238235235214233, + "learning_rate": 8.820299185760273e-06, + "loss": 0.0943, + "step": 29520 + }, + { + "epoch": 2.7958719939405414, + "grad_norm": 0.39753592014312744, + "learning_rate": 8.816512024237835e-06, + "loss": 0.1038, + "step": 29530 + }, + { + "epoch": 2.796818784321151, + "grad_norm": 0.4467945098876953, + "learning_rate": 8.812724862715397e-06, + "loss": 0.098, + "step": 29540 + }, + { + "epoch": 2.797765574701761, + "grad_norm": 0.3594754636287689, + "learning_rate": 8.808937701192957e-06, + "loss": 0.1085, + "step": 29550 + }, + { + "epoch": 2.7987123650823706, + "grad_norm": 0.4449863135814667, + "learning_rate": 8.805150539670517e-06, + "loss": 0.1032, + "step": 29560 + }, + { + "epoch": 2.7996591554629804, + "grad_norm": 0.45748332142829895, + "learning_rate": 8.801363378148078e-06, + "loss": 0.0975, + "step": 29570 + }, + { + "epoch": 2.80060594584359, + "grad_norm": 0.42559105157852173, + "learning_rate": 8.79757621662564e-06, + "loss": 0.101, + "step": 29580 + }, + { + "epoch": 2.8015527362242, + "grad_norm": 0.3052559494972229, + "learning_rate": 8.793789055103202e-06, + "loss": 0.0929, + "step": 29590 + }, + { + "epoch": 2.8024995266048096, + "grad_norm": 0.30140575766563416, + "learning_rate": 8.790001893580762e-06, + "loss": 0.0913, + "step": 29600 + }, + { + "epoch": 2.8034463169854194, + "grad_norm": 0.5412850975990295, + "learning_rate": 8.786214732058324e-06, + "loss": 0.1077, + "step": 29610 + }, + { + "epoch": 2.804393107366029, + "grad_norm": 0.3537708520889282, + "learning_rate": 8.782427570535884e-06, + "loss": 0.0994, + "step": 29620 + }, + { + "epoch": 2.805339897746639, + "grad_norm": 0.38238513469696045, + "learning_rate": 8.778640409013445e-06, + "loss": 0.1039, + "step": 29630 + }, + { + "epoch": 2.8062866881272486, + "grad_norm": 0.40344172716140747, + "learning_rate": 8.774853247491007e-06, + "loss": 0.0976, + "step": 29640 + }, + { + "epoch": 2.8072334785078583, + "grad_norm": 0.33826422691345215, + "learning_rate": 8.771066085968567e-06, + "loss": 0.0959, + "step": 29650 + }, + { + "epoch": 2.808180268888468, + "grad_norm": 0.42276862263679504, + "learning_rate": 8.767278924446129e-06, + "loss": 0.1051, + "step": 29660 + }, + { + "epoch": 2.809127059269078, + "grad_norm": 0.48476433753967285, + "learning_rate": 8.763491762923689e-06, + "loss": 0.1041, + "step": 29670 + }, + { + "epoch": 2.8100738496496875, + "grad_norm": 0.37548232078552246, + "learning_rate": 8.75970460140125e-06, + "loss": 0.0978, + "step": 29680 + }, + { + "epoch": 2.8110206400302973, + "grad_norm": 0.4841581881046295, + "learning_rate": 8.75591743987881e-06, + "loss": 0.0984, + "step": 29690 + }, + { + "epoch": 2.811967430410907, + "grad_norm": 0.8658079504966736, + "learning_rate": 8.752130278356373e-06, + "loss": 0.1072, + "step": 29700 + }, + { + "epoch": 2.8129142207915168, + "grad_norm": 0.4591039717197418, + "learning_rate": 8.748343116833934e-06, + "loss": 0.1086, + "step": 29710 + }, + { + "epoch": 2.8138610111721265, + "grad_norm": 0.4760480225086212, + "learning_rate": 8.744555955311494e-06, + "loss": 0.097, + "step": 29720 + }, + { + "epoch": 2.8148078015527362, + "grad_norm": 0.3787853717803955, + "learning_rate": 8.740768793789056e-06, + "loss": 0.1036, + "step": 29730 + }, + { + "epoch": 2.815754591933346, + "grad_norm": 0.46122756600379944, + "learning_rate": 8.736981632266616e-06, + "loss": 0.0953, + "step": 29740 + }, + { + "epoch": 2.8167013823139557, + "grad_norm": 0.37929031252861023, + "learning_rate": 8.733194470744178e-06, + "loss": 0.0986, + "step": 29750 + }, + { + "epoch": 2.8176481726945655, + "grad_norm": 0.3670518696308136, + "learning_rate": 8.72940730922174e-06, + "loss": 0.1008, + "step": 29760 + }, + { + "epoch": 2.818594963075175, + "grad_norm": 0.35238727927207947, + "learning_rate": 8.7256201476993e-06, + "loss": 0.1015, + "step": 29770 + }, + { + "epoch": 2.819541753455785, + "grad_norm": 0.42419156432151794, + "learning_rate": 8.721832986176861e-06, + "loss": 0.094, + "step": 29780 + }, + { + "epoch": 2.8204885438363947, + "grad_norm": 0.3655274510383606, + "learning_rate": 8.718045824654421e-06, + "loss": 0.1012, + "step": 29790 + }, + { + "epoch": 2.8214353342170044, + "grad_norm": 0.42500266432762146, + "learning_rate": 8.714258663131983e-06, + "loss": 0.107, + "step": 29800 + }, + { + "epoch": 2.822382124597614, + "grad_norm": 0.43933090567588806, + "learning_rate": 8.710471501609545e-06, + "loss": 0.1035, + "step": 29810 + }, + { + "epoch": 2.823328914978224, + "grad_norm": 0.4164799153804779, + "learning_rate": 8.706684340087107e-06, + "loss": 0.1074, + "step": 29820 + }, + { + "epoch": 2.8242757053588337, + "grad_norm": 0.46152105927467346, + "learning_rate": 8.702897178564667e-06, + "loss": 0.1, + "step": 29830 + }, + { + "epoch": 2.8252224957394434, + "grad_norm": 0.4320620000362396, + "learning_rate": 8.699110017042227e-06, + "loss": 0.1056, + "step": 29840 + }, + { + "epoch": 2.826169286120053, + "grad_norm": 0.42175158858299255, + "learning_rate": 8.695322855519789e-06, + "loss": 0.1018, + "step": 29850 + }, + { + "epoch": 2.827116076500663, + "grad_norm": 0.43998804688453674, + "learning_rate": 8.691535693997349e-06, + "loss": 0.0971, + "step": 29860 + }, + { + "epoch": 2.8280628668812726, + "grad_norm": 0.4695793688297272, + "learning_rate": 8.68774853247491e-06, + "loss": 0.1024, + "step": 29870 + }, + { + "epoch": 2.8290096572618824, + "grad_norm": 0.3865763545036316, + "learning_rate": 8.683961370952472e-06, + "loss": 0.1038, + "step": 29880 + }, + { + "epoch": 2.829956447642492, + "grad_norm": 0.3920170068740845, + "learning_rate": 8.680174209430034e-06, + "loss": 0.1063, + "step": 29890 + }, + { + "epoch": 2.830903238023102, + "grad_norm": 0.3742087781429291, + "learning_rate": 8.676387047907594e-06, + "loss": 0.1004, + "step": 29900 + }, + { + "epoch": 2.8318500284037116, + "grad_norm": 0.40059271454811096, + "learning_rate": 8.672599886385154e-06, + "loss": 0.0996, + "step": 29910 + }, + { + "epoch": 2.8327968187843213, + "grad_norm": 0.3583625257015228, + "learning_rate": 8.668812724862716e-06, + "loss": 0.0944, + "step": 29920 + }, + { + "epoch": 2.833743609164931, + "grad_norm": 0.4899672865867615, + "learning_rate": 8.665025563340277e-06, + "loss": 0.1033, + "step": 29930 + }, + { + "epoch": 2.834690399545541, + "grad_norm": 0.3681374788284302, + "learning_rate": 8.661238401817839e-06, + "loss": 0.0964, + "step": 29940 + }, + { + "epoch": 2.8356371899261505, + "grad_norm": 0.4126521050930023, + "learning_rate": 8.6574512402954e-06, + "loss": 0.086, + "step": 29950 + }, + { + "epoch": 2.8365839803067603, + "grad_norm": 0.5693396925926208, + "learning_rate": 8.653664078772961e-06, + "loss": 0.11, + "step": 29960 + }, + { + "epoch": 2.83753077068737, + "grad_norm": 0.5090595483779907, + "learning_rate": 8.649876917250521e-06, + "loss": 0.1099, + "step": 29970 + }, + { + "epoch": 2.8384775610679798, + "grad_norm": 0.49548888206481934, + "learning_rate": 8.646089755728083e-06, + "loss": 0.1036, + "step": 29980 + }, + { + "epoch": 2.8394243514485895, + "grad_norm": 0.3540317416191101, + "learning_rate": 8.642302594205644e-06, + "loss": 0.101, + "step": 29990 + }, + { + "epoch": 2.8403711418291993, + "grad_norm": 0.3879123032093048, + "learning_rate": 8.638515432683205e-06, + "loss": 0.094, + "step": 30000 + }, + { + "epoch": 2.841317932209809, + "grad_norm": 0.39417970180511475, + "learning_rate": 8.634728271160766e-06, + "loss": 0.0965, + "step": 30010 + }, + { + "epoch": 2.8422647225904187, + "grad_norm": 0.38096165657043457, + "learning_rate": 8.630941109638326e-06, + "loss": 0.1065, + "step": 30020 + }, + { + "epoch": 2.8432115129710285, + "grad_norm": 0.3729986548423767, + "learning_rate": 8.627153948115888e-06, + "loss": 0.1002, + "step": 30030 + }, + { + "epoch": 2.844158303351638, + "grad_norm": 0.37190115451812744, + "learning_rate": 8.623366786593448e-06, + "loss": 0.1023, + "step": 30040 + }, + { + "epoch": 2.8451050937322475, + "grad_norm": 0.2926923334598541, + "learning_rate": 8.61957962507101e-06, + "loss": 0.1041, + "step": 30050 + }, + { + "epoch": 2.8460518841128573, + "grad_norm": 0.4673272371292114, + "learning_rate": 8.615792463548572e-06, + "loss": 0.1016, + "step": 30060 + }, + { + "epoch": 2.846998674493467, + "grad_norm": 0.4333568215370178, + "learning_rate": 8.612005302026132e-06, + "loss": 0.0999, + "step": 30070 + }, + { + "epoch": 2.8479454648740767, + "grad_norm": 0.385219931602478, + "learning_rate": 8.608218140503693e-06, + "loss": 0.0973, + "step": 30080 + }, + { + "epoch": 2.8488922552546865, + "grad_norm": 0.3480224311351776, + "learning_rate": 8.604430978981253e-06, + "loss": 0.1044, + "step": 30090 + }, + { + "epoch": 2.849839045635296, + "grad_norm": 0.3795575499534607, + "learning_rate": 8.600643817458815e-06, + "loss": 0.0946, + "step": 30100 + }, + { + "epoch": 2.850785836015906, + "grad_norm": 0.3406654894351959, + "learning_rate": 8.596856655936377e-06, + "loss": 0.1109, + "step": 30110 + }, + { + "epoch": 2.8517326263965157, + "grad_norm": 0.4390551745891571, + "learning_rate": 8.593069494413937e-06, + "loss": 0.105, + "step": 30120 + }, + { + "epoch": 2.8526794167771254, + "grad_norm": 0.3838212490081787, + "learning_rate": 8.589282332891499e-06, + "loss": 0.1058, + "step": 30130 + }, + { + "epoch": 2.853626207157735, + "grad_norm": 0.5091719627380371, + "learning_rate": 8.585495171369059e-06, + "loss": 0.1062, + "step": 30140 + }, + { + "epoch": 2.854572997538345, + "grad_norm": 0.4112211763858795, + "learning_rate": 8.58170800984662e-06, + "loss": 0.1028, + "step": 30150 + }, + { + "epoch": 2.8555197879189547, + "grad_norm": 0.3643788695335388, + "learning_rate": 8.577920848324182e-06, + "loss": 0.0972, + "step": 30160 + }, + { + "epoch": 2.8564665782995644, + "grad_norm": 0.38067877292633057, + "learning_rate": 8.574133686801744e-06, + "loss": 0.1092, + "step": 30170 + }, + { + "epoch": 2.857413368680174, + "grad_norm": 0.38998544216156006, + "learning_rate": 8.570346525279304e-06, + "loss": 0.1089, + "step": 30180 + }, + { + "epoch": 2.858360159060784, + "grad_norm": 0.4681199789047241, + "learning_rate": 8.566559363756864e-06, + "loss": 0.0997, + "step": 30190 + }, + { + "epoch": 2.8593069494413936, + "grad_norm": 0.4034975469112396, + "learning_rate": 8.562772202234426e-06, + "loss": 0.1003, + "step": 30200 + }, + { + "epoch": 2.8602537398220034, + "grad_norm": 0.42309436202049255, + "learning_rate": 8.558985040711988e-06, + "loss": 0.1071, + "step": 30210 + }, + { + "epoch": 2.861200530202613, + "grad_norm": 0.42179518938064575, + "learning_rate": 8.555197879189548e-06, + "loss": 0.1047, + "step": 30220 + }, + { + "epoch": 2.862147320583223, + "grad_norm": 0.4579227566719055, + "learning_rate": 8.55141071766711e-06, + "loss": 0.1007, + "step": 30230 + }, + { + "epoch": 2.8630941109638326, + "grad_norm": 0.4164641499519348, + "learning_rate": 8.547623556144671e-06, + "loss": 0.1086, + "step": 30240 + }, + { + "epoch": 2.8640409013444423, + "grad_norm": 0.3335796296596527, + "learning_rate": 8.543836394622231e-06, + "loss": 0.0975, + "step": 30250 + }, + { + "epoch": 2.864987691725052, + "grad_norm": 0.4051174819469452, + "learning_rate": 8.540049233099791e-06, + "loss": 0.112, + "step": 30260 + }, + { + "epoch": 2.865934482105662, + "grad_norm": 0.3602439761161804, + "learning_rate": 8.536262071577353e-06, + "loss": 0.0976, + "step": 30270 + }, + { + "epoch": 2.8668812724862716, + "grad_norm": 0.48511287569999695, + "learning_rate": 8.532474910054915e-06, + "loss": 0.0992, + "step": 30280 + }, + { + "epoch": 2.8678280628668813, + "grad_norm": 0.5073041915893555, + "learning_rate": 8.528687748532476e-06, + "loss": 0.0991, + "step": 30290 + }, + { + "epoch": 2.868774853247491, + "grad_norm": 1.1230666637420654, + "learning_rate": 8.524900587010037e-06, + "loss": 0.1025, + "step": 30300 + }, + { + "epoch": 2.869721643628101, + "grad_norm": 0.4232262670993805, + "learning_rate": 8.521113425487598e-06, + "loss": 0.1106, + "step": 30310 + }, + { + "epoch": 2.8706684340087105, + "grad_norm": 0.4858565032482147, + "learning_rate": 8.517326263965158e-06, + "loss": 0.1069, + "step": 30320 + }, + { + "epoch": 2.8716152243893203, + "grad_norm": 0.4056932330131531, + "learning_rate": 8.51353910244272e-06, + "loss": 0.1081, + "step": 30330 + }, + { + "epoch": 2.87256201476993, + "grad_norm": 0.3665787875652313, + "learning_rate": 8.509751940920282e-06, + "loss": 0.0968, + "step": 30340 + }, + { + "epoch": 2.8735088051505397, + "grad_norm": 0.5439797639846802, + "learning_rate": 8.505964779397842e-06, + "loss": 0.0961, + "step": 30350 + }, + { + "epoch": 2.8744555955311495, + "grad_norm": 0.37947776913642883, + "learning_rate": 8.502177617875404e-06, + "loss": 0.11, + "step": 30360 + }, + { + "epoch": 2.8754023859117592, + "grad_norm": 0.5098512768745422, + "learning_rate": 8.498390456352964e-06, + "loss": 0.1019, + "step": 30370 + }, + { + "epoch": 2.876349176292369, + "grad_norm": 0.43195387721061707, + "learning_rate": 8.494603294830525e-06, + "loss": 0.1083, + "step": 30380 + }, + { + "epoch": 2.8772959666729787, + "grad_norm": 0.534127414226532, + "learning_rate": 8.490816133308087e-06, + "loss": 0.1054, + "step": 30390 + }, + { + "epoch": 2.8782427570535885, + "grad_norm": 0.3190009295940399, + "learning_rate": 8.487028971785647e-06, + "loss": 0.1028, + "step": 30400 + }, + { + "epoch": 2.879189547434198, + "grad_norm": 0.31885769963264465, + "learning_rate": 8.483241810263209e-06, + "loss": 0.0895, + "step": 30410 + }, + { + "epoch": 2.880136337814808, + "grad_norm": 0.37694260478019714, + "learning_rate": 8.479454648740769e-06, + "loss": 0.1054, + "step": 30420 + }, + { + "epoch": 2.8810831281954177, + "grad_norm": 0.38295096158981323, + "learning_rate": 8.47566748721833e-06, + "loss": 0.1009, + "step": 30430 + }, + { + "epoch": 2.8820299185760274, + "grad_norm": 0.4697856903076172, + "learning_rate": 8.47188032569589e-06, + "loss": 0.1041, + "step": 30440 + }, + { + "epoch": 2.882976708956637, + "grad_norm": 0.4540785551071167, + "learning_rate": 8.468093164173452e-06, + "loss": 0.1118, + "step": 30450 + }, + { + "epoch": 2.883923499337247, + "grad_norm": 0.3804856538772583, + "learning_rate": 8.464306002651014e-06, + "loss": 0.0938, + "step": 30460 + }, + { + "epoch": 2.884870289717856, + "grad_norm": 0.46269491314888, + "learning_rate": 8.460518841128574e-06, + "loss": 0.104, + "step": 30470 + }, + { + "epoch": 2.885817080098466, + "grad_norm": 0.4231495261192322, + "learning_rate": 8.456731679606136e-06, + "loss": 0.104, + "step": 30480 + }, + { + "epoch": 2.8867638704790757, + "grad_norm": 0.37176263332366943, + "learning_rate": 8.452944518083696e-06, + "loss": 0.1049, + "step": 30490 + }, + { + "epoch": 2.8877106608596854, + "grad_norm": 0.39734789729118347, + "learning_rate": 8.449157356561258e-06, + "loss": 0.1014, + "step": 30500 + }, + { + "epoch": 2.888657451240295, + "grad_norm": 0.49146899580955505, + "learning_rate": 8.44537019503882e-06, + "loss": 0.1101, + "step": 30510 + }, + { + "epoch": 2.889604241620905, + "grad_norm": 0.4609476625919342, + "learning_rate": 8.441583033516381e-06, + "loss": 0.0973, + "step": 30520 + }, + { + "epoch": 2.8905510320015146, + "grad_norm": 0.44791117310523987, + "learning_rate": 8.437795871993941e-06, + "loss": 0.1032, + "step": 30530 + }, + { + "epoch": 2.8914978223821244, + "grad_norm": 0.3995889127254486, + "learning_rate": 8.434008710471501e-06, + "loss": 0.0989, + "step": 30540 + }, + { + "epoch": 2.892444612762734, + "grad_norm": 0.4357839822769165, + "learning_rate": 8.430221548949063e-06, + "loss": 0.0979, + "step": 30550 + }, + { + "epoch": 2.893391403143344, + "grad_norm": 0.5497676730155945, + "learning_rate": 8.426434387426625e-06, + "loss": 0.1051, + "step": 30560 + }, + { + "epoch": 2.8943381935239536, + "grad_norm": 0.43459513783454895, + "learning_rate": 8.422647225904187e-06, + "loss": 0.1036, + "step": 30570 + }, + { + "epoch": 2.8952849839045633, + "grad_norm": 0.3618446886539459, + "learning_rate": 8.418860064381747e-06, + "loss": 0.0987, + "step": 30580 + }, + { + "epoch": 2.896231774285173, + "grad_norm": 0.43975988030433655, + "learning_rate": 8.415072902859308e-06, + "loss": 0.1078, + "step": 30590 + }, + { + "epoch": 2.897178564665783, + "grad_norm": 0.460006445646286, + "learning_rate": 8.411285741336868e-06, + "loss": 0.0949, + "step": 30600 + }, + { + "epoch": 2.8981253550463926, + "grad_norm": 0.6009049415588379, + "learning_rate": 8.407498579814429e-06, + "loss": 0.1095, + "step": 30610 + }, + { + "epoch": 2.8990721454270023, + "grad_norm": 0.4352884888648987, + "learning_rate": 8.40371141829199e-06, + "loss": 0.1125, + "step": 30620 + }, + { + "epoch": 2.900018935807612, + "grad_norm": 0.46387022733688354, + "learning_rate": 8.399924256769552e-06, + "loss": 0.1033, + "step": 30630 + }, + { + "epoch": 2.900965726188222, + "grad_norm": 0.3929131031036377, + "learning_rate": 8.396137095247114e-06, + "loss": 0.1045, + "step": 30640 + }, + { + "epoch": 2.9019125165688315, + "grad_norm": 0.29404574632644653, + "learning_rate": 8.392349933724674e-06, + "loss": 0.0872, + "step": 30650 + }, + { + "epoch": 2.9028593069494413, + "grad_norm": 0.5976027250289917, + "learning_rate": 8.388562772202236e-06, + "loss": 0.1156, + "step": 30660 + }, + { + "epoch": 2.903806097330051, + "grad_norm": 0.2850901782512665, + "learning_rate": 8.384775610679796e-06, + "loss": 0.0923, + "step": 30670 + }, + { + "epoch": 2.9047528877106608, + "grad_norm": 0.4848046898841858, + "learning_rate": 8.380988449157357e-06, + "loss": 0.1097, + "step": 30680 + }, + { + "epoch": 2.9056996780912705, + "grad_norm": 0.4893626272678375, + "learning_rate": 8.377201287634919e-06, + "loss": 0.1069, + "step": 30690 + }, + { + "epoch": 2.9066464684718802, + "grad_norm": 0.44268444180488586, + "learning_rate": 8.373414126112479e-06, + "loss": 0.0992, + "step": 30700 + }, + { + "epoch": 2.90759325885249, + "grad_norm": 0.44177380204200745, + "learning_rate": 8.369626964590041e-06, + "loss": 0.1033, + "step": 30710 + }, + { + "epoch": 2.9085400492330997, + "grad_norm": 0.37980490922927856, + "learning_rate": 8.365839803067601e-06, + "loss": 0.1114, + "step": 30720 + }, + { + "epoch": 2.9094868396137095, + "grad_norm": 0.32962408661842346, + "learning_rate": 8.362052641545163e-06, + "loss": 0.1095, + "step": 30730 + }, + { + "epoch": 2.910433629994319, + "grad_norm": 0.39001163840293884, + "learning_rate": 8.358265480022724e-06, + "loss": 0.1004, + "step": 30740 + }, + { + "epoch": 2.911380420374929, + "grad_norm": 0.4204920828342438, + "learning_rate": 8.354478318500284e-06, + "loss": 0.0961, + "step": 30750 + }, + { + "epoch": 2.9123272107555387, + "grad_norm": 0.397558331489563, + "learning_rate": 8.350691156977846e-06, + "loss": 0.0963, + "step": 30760 + }, + { + "epoch": 2.9132740011361484, + "grad_norm": 0.4277651906013489, + "learning_rate": 8.346903995455406e-06, + "loss": 0.1019, + "step": 30770 + }, + { + "epoch": 2.914220791516758, + "grad_norm": 0.4084283411502838, + "learning_rate": 8.343116833932968e-06, + "loss": 0.1075, + "step": 30780 + }, + { + "epoch": 2.915167581897368, + "grad_norm": 0.41574668884277344, + "learning_rate": 8.339329672410528e-06, + "loss": 0.0916, + "step": 30790 + }, + { + "epoch": 2.9161143722779777, + "grad_norm": 0.419190376996994, + "learning_rate": 8.33554251088809e-06, + "loss": 0.0954, + "step": 30800 + }, + { + "epoch": 2.9170611626585874, + "grad_norm": 0.5709701180458069, + "learning_rate": 8.331755349365652e-06, + "loss": 0.1003, + "step": 30810 + }, + { + "epoch": 2.918007953039197, + "grad_norm": 0.2899015545845032, + "learning_rate": 8.327968187843212e-06, + "loss": 0.0999, + "step": 30820 + }, + { + "epoch": 2.918954743419807, + "grad_norm": 0.40506407618522644, + "learning_rate": 8.324181026320773e-06, + "loss": 0.1095, + "step": 30830 + }, + { + "epoch": 2.9199015338004166, + "grad_norm": 0.4087386429309845, + "learning_rate": 8.320393864798333e-06, + "loss": 0.1019, + "step": 30840 + }, + { + "epoch": 2.9208483241810264, + "grad_norm": 0.45319652557373047, + "learning_rate": 8.316606703275895e-06, + "loss": 0.1031, + "step": 30850 + }, + { + "epoch": 2.921795114561636, + "grad_norm": 0.44078201055526733, + "learning_rate": 8.312819541753457e-06, + "loss": 0.1081, + "step": 30860 + }, + { + "epoch": 2.922741904942246, + "grad_norm": 0.34192386269569397, + "learning_rate": 8.309032380231019e-06, + "loss": 0.0939, + "step": 30870 + }, + { + "epoch": 2.9236886953228556, + "grad_norm": 0.38007766008377075, + "learning_rate": 8.305245218708579e-06, + "loss": 0.09, + "step": 30880 + }, + { + "epoch": 2.9246354857034653, + "grad_norm": 0.45848166942596436, + "learning_rate": 8.301458057186139e-06, + "loss": 0.0951, + "step": 30890 + }, + { + "epoch": 2.925582276084075, + "grad_norm": 0.3923320472240448, + "learning_rate": 8.2976708956637e-06, + "loss": 0.098, + "step": 30900 + }, + { + "epoch": 2.926529066464685, + "grad_norm": 0.4490647614002228, + "learning_rate": 8.293883734141262e-06, + "loss": 0.1143, + "step": 30910 + }, + { + "epoch": 2.9274758568452945, + "grad_norm": 0.4360518455505371, + "learning_rate": 8.290096572618824e-06, + "loss": 0.109, + "step": 30920 + }, + { + "epoch": 2.9284226472259043, + "grad_norm": 0.41733619570732117, + "learning_rate": 8.286309411096384e-06, + "loss": 0.1024, + "step": 30930 + }, + { + "epoch": 2.929369437606514, + "grad_norm": 0.4506927728652954, + "learning_rate": 8.282522249573946e-06, + "loss": 0.0936, + "step": 30940 + }, + { + "epoch": 2.9303162279871238, + "grad_norm": 0.46986424922943115, + "learning_rate": 8.278735088051506e-06, + "loss": 0.1047, + "step": 30950 + }, + { + "epoch": 2.9312630183677335, + "grad_norm": 0.4629628360271454, + "learning_rate": 8.274947926529066e-06, + "loss": 0.1027, + "step": 30960 + }, + { + "epoch": 2.9322098087483432, + "grad_norm": 0.48981261253356934, + "learning_rate": 8.271160765006628e-06, + "loss": 0.1122, + "step": 30970 + }, + { + "epoch": 2.933156599128953, + "grad_norm": 0.4340340793132782, + "learning_rate": 8.26737360348419e-06, + "loss": 0.1002, + "step": 30980 + }, + { + "epoch": 2.9341033895095627, + "grad_norm": 0.36658594012260437, + "learning_rate": 8.263586441961751e-06, + "loss": 0.1019, + "step": 30990 + }, + { + "epoch": 2.9350501798901725, + "grad_norm": 0.36050164699554443, + "learning_rate": 8.259799280439311e-06, + "loss": 0.1025, + "step": 31000 + }, + { + "epoch": 2.935996970270782, + "grad_norm": 0.4739544093608856, + "learning_rate": 8.256012118916873e-06, + "loss": 0.106, + "step": 31010 + }, + { + "epoch": 2.936943760651392, + "grad_norm": 0.4080391228199005, + "learning_rate": 8.252224957394433e-06, + "loss": 0.1066, + "step": 31020 + }, + { + "epoch": 2.9378905510320017, + "grad_norm": 0.46320927143096924, + "learning_rate": 8.248437795871995e-06, + "loss": 0.1067, + "step": 31030 + }, + { + "epoch": 2.9388373414126114, + "grad_norm": 0.4874909222126007, + "learning_rate": 8.244650634349556e-06, + "loss": 0.1084, + "step": 31040 + }, + { + "epoch": 2.939784131793221, + "grad_norm": 0.4553222060203552, + "learning_rate": 8.240863472827116e-06, + "loss": 0.1105, + "step": 31050 + }, + { + "epoch": 2.940730922173831, + "grad_norm": 0.37739482522010803, + "learning_rate": 8.237076311304678e-06, + "loss": 0.1, + "step": 31060 + }, + { + "epoch": 2.9416777125544407, + "grad_norm": 0.4049690067768097, + "learning_rate": 8.233289149782238e-06, + "loss": 0.1007, + "step": 31070 + }, + { + "epoch": 2.9426245029350504, + "grad_norm": 0.4319703280925751, + "learning_rate": 8.2295019882598e-06, + "loss": 0.1003, + "step": 31080 + }, + { + "epoch": 2.94357129331566, + "grad_norm": 0.410161554813385, + "learning_rate": 8.225714826737362e-06, + "loss": 0.0976, + "step": 31090 + }, + { + "epoch": 2.94451808369627, + "grad_norm": 0.4579450190067291, + "learning_rate": 8.221927665214922e-06, + "loss": 0.1117, + "step": 31100 + }, + { + "epoch": 2.9454648740768796, + "grad_norm": 0.4039101302623749, + "learning_rate": 8.218140503692484e-06, + "loss": 0.0941, + "step": 31110 + }, + { + "epoch": 2.9464116644574894, + "grad_norm": 0.3221927583217621, + "learning_rate": 8.214353342170044e-06, + "loss": 0.0926, + "step": 31120 + }, + { + "epoch": 2.947358454838099, + "grad_norm": 0.44777631759643555, + "learning_rate": 8.210566180647605e-06, + "loss": 0.091, + "step": 31130 + }, + { + "epoch": 2.948305245218709, + "grad_norm": 0.4191173315048218, + "learning_rate": 8.206779019125165e-06, + "loss": 0.1019, + "step": 31140 + }, + { + "epoch": 2.949252035599318, + "grad_norm": 0.3635985255241394, + "learning_rate": 8.202991857602727e-06, + "loss": 0.0973, + "step": 31150 + }, + { + "epoch": 2.950198825979928, + "grad_norm": 0.4057210087776184, + "learning_rate": 8.199204696080289e-06, + "loss": 0.1028, + "step": 31160 + }, + { + "epoch": 2.9511456163605376, + "grad_norm": 0.41892239451408386, + "learning_rate": 8.195417534557849e-06, + "loss": 0.0964, + "step": 31170 + }, + { + "epoch": 2.9520924067411474, + "grad_norm": 0.4387721121311188, + "learning_rate": 8.19163037303541e-06, + "loss": 0.111, + "step": 31180 + }, + { + "epoch": 2.953039197121757, + "grad_norm": 0.3993951082229614, + "learning_rate": 8.18784321151297e-06, + "loss": 0.106, + "step": 31190 + }, + { + "epoch": 2.953985987502367, + "grad_norm": 0.39773261547088623, + "learning_rate": 8.184056049990532e-06, + "loss": 0.1029, + "step": 31200 + }, + { + "epoch": 2.9549327778829766, + "grad_norm": 0.4858965575695038, + "learning_rate": 8.180268888468094e-06, + "loss": 0.1036, + "step": 31210 + }, + { + "epoch": 2.9558795682635863, + "grad_norm": 0.4160551130771637, + "learning_rate": 8.176481726945656e-06, + "loss": 0.1045, + "step": 31220 + }, + { + "epoch": 2.956826358644196, + "grad_norm": 0.34373006224632263, + "learning_rate": 8.172694565423216e-06, + "loss": 0.0965, + "step": 31230 + }, + { + "epoch": 2.957773149024806, + "grad_norm": 0.416771799325943, + "learning_rate": 8.168907403900776e-06, + "loss": 0.1053, + "step": 31240 + }, + { + "epoch": 2.9587199394054156, + "grad_norm": 0.31747400760650635, + "learning_rate": 8.165120242378338e-06, + "loss": 0.0999, + "step": 31250 + }, + { + "epoch": 2.9596667297860253, + "grad_norm": 0.3813968598842621, + "learning_rate": 8.1613330808559e-06, + "loss": 0.1069, + "step": 31260 + }, + { + "epoch": 2.960613520166635, + "grad_norm": 0.48185887932777405, + "learning_rate": 8.157545919333461e-06, + "loss": 0.1049, + "step": 31270 + }, + { + "epoch": 2.961560310547245, + "grad_norm": 0.4670296311378479, + "learning_rate": 8.153758757811021e-06, + "loss": 0.0993, + "step": 31280 + }, + { + "epoch": 2.9625071009278545, + "grad_norm": 0.41335439682006836, + "learning_rate": 8.149971596288583e-06, + "loss": 0.1024, + "step": 31290 + }, + { + "epoch": 2.9634538913084643, + "grad_norm": 0.5939239263534546, + "learning_rate": 8.146184434766143e-06, + "loss": 0.1149, + "step": 31300 + }, + { + "epoch": 2.964400681689074, + "grad_norm": 0.4662041962146759, + "learning_rate": 8.142397273243703e-06, + "loss": 0.0962, + "step": 31310 + }, + { + "epoch": 2.9653474720696837, + "grad_norm": 0.44135046005249023, + "learning_rate": 8.138610111721265e-06, + "loss": 0.1022, + "step": 31320 + }, + { + "epoch": 2.9662942624502935, + "grad_norm": 0.37937572598457336, + "learning_rate": 8.134822950198827e-06, + "loss": 0.0962, + "step": 31330 + }, + { + "epoch": 2.9672410528309032, + "grad_norm": 0.5369215607643127, + "learning_rate": 8.131035788676388e-06, + "loss": 0.1068, + "step": 31340 + }, + { + "epoch": 2.968187843211513, + "grad_norm": 0.40610820055007935, + "learning_rate": 8.127248627153948e-06, + "loss": 0.1103, + "step": 31350 + }, + { + "epoch": 2.9691346335921227, + "grad_norm": 0.37899497151374817, + "learning_rate": 8.12346146563151e-06, + "loss": 0.1, + "step": 31360 + }, + { + "epoch": 2.9700814239727324, + "grad_norm": 0.387104332447052, + "learning_rate": 8.11967430410907e-06, + "loss": 0.1107, + "step": 31370 + }, + { + "epoch": 2.971028214353342, + "grad_norm": 0.43317079544067383, + "learning_rate": 8.115887142586632e-06, + "loss": 0.0966, + "step": 31380 + }, + { + "epoch": 2.971975004733952, + "grad_norm": 0.4942995309829712, + "learning_rate": 8.112099981064194e-06, + "loss": 0.1081, + "step": 31390 + }, + { + "epoch": 2.9729217951145617, + "grad_norm": 0.43864768743515015, + "learning_rate": 8.108312819541754e-06, + "loss": 0.1045, + "step": 31400 + }, + { + "epoch": 2.9738685854951714, + "grad_norm": 0.4242566227912903, + "learning_rate": 8.104525658019316e-06, + "loss": 0.1189, + "step": 31410 + }, + { + "epoch": 2.974815375875781, + "grad_norm": 0.3926277756690979, + "learning_rate": 8.100738496496876e-06, + "loss": 0.1059, + "step": 31420 + }, + { + "epoch": 2.975762166256391, + "grad_norm": 0.4414288401603699, + "learning_rate": 8.096951334974437e-06, + "loss": 0.1075, + "step": 31430 + }, + { + "epoch": 2.9767089566370006, + "grad_norm": 0.39135444164276123, + "learning_rate": 8.093164173451999e-06, + "loss": 0.1038, + "step": 31440 + }, + { + "epoch": 2.9776557470176104, + "grad_norm": 0.3688848912715912, + "learning_rate": 8.089377011929559e-06, + "loss": 0.1032, + "step": 31450 + }, + { + "epoch": 2.97860253739822, + "grad_norm": 0.3285348117351532, + "learning_rate": 8.08558985040712e-06, + "loss": 0.0941, + "step": 31460 + }, + { + "epoch": 2.97954932777883, + "grad_norm": 0.4666126072406769, + "learning_rate": 8.081802688884681e-06, + "loss": 0.1117, + "step": 31470 + }, + { + "epoch": 2.9804961181594396, + "grad_norm": 0.3238498568534851, + "learning_rate": 8.078015527362243e-06, + "loss": 0.1034, + "step": 31480 + }, + { + "epoch": 2.9814429085400493, + "grad_norm": 0.3677513003349304, + "learning_rate": 8.074228365839803e-06, + "loss": 0.1045, + "step": 31490 + }, + { + "epoch": 2.982389698920659, + "grad_norm": 0.4916920065879822, + "learning_rate": 8.070441204317364e-06, + "loss": 0.1064, + "step": 31500 + }, + { + "epoch": 2.983336489301269, + "grad_norm": 0.47723665833473206, + "learning_rate": 8.066654042794926e-06, + "loss": 0.1071, + "step": 31510 + }, + { + "epoch": 2.9842832796818786, + "grad_norm": 0.43712523579597473, + "learning_rate": 8.062866881272486e-06, + "loss": 0.106, + "step": 31520 + }, + { + "epoch": 2.9852300700624883, + "grad_norm": 0.40085798501968384, + "learning_rate": 8.059079719750048e-06, + "loss": 0.1053, + "step": 31530 + }, + { + "epoch": 2.986176860443098, + "grad_norm": 0.39306896924972534, + "learning_rate": 8.055292558227608e-06, + "loss": 0.104, + "step": 31540 + }, + { + "epoch": 2.987123650823708, + "grad_norm": 0.38417014479637146, + "learning_rate": 8.05150539670517e-06, + "loss": 0.0958, + "step": 31550 + }, + { + "epoch": 2.9880704412043175, + "grad_norm": 0.5245347619056702, + "learning_rate": 8.047718235182731e-06, + "loss": 0.104, + "step": 31560 + }, + { + "epoch": 2.989017231584927, + "grad_norm": 0.5753111839294434, + "learning_rate": 8.043931073660293e-06, + "loss": 0.1133, + "step": 31570 + }, + { + "epoch": 2.9899640219655366, + "grad_norm": 0.4295615553855896, + "learning_rate": 8.040143912137853e-06, + "loss": 0.0972, + "step": 31580 + }, + { + "epoch": 2.9909108123461463, + "grad_norm": 0.37313562631607056, + "learning_rate": 8.036356750615413e-06, + "loss": 0.1035, + "step": 31590 + }, + { + "epoch": 2.991857602726756, + "grad_norm": 0.4777378439903259, + "learning_rate": 8.032569589092975e-06, + "loss": 0.1051, + "step": 31600 + }, + { + "epoch": 2.992804393107366, + "grad_norm": 0.5843793153762817, + "learning_rate": 8.028782427570537e-06, + "loss": 0.1132, + "step": 31610 + }, + { + "epoch": 2.9937511834879755, + "grad_norm": 0.38907530903816223, + "learning_rate": 8.024995266048099e-06, + "loss": 0.0922, + "step": 31620 + }, + { + "epoch": 2.9946979738685853, + "grad_norm": 0.45201992988586426, + "learning_rate": 8.021208104525659e-06, + "loss": 0.1044, + "step": 31630 + }, + { + "epoch": 2.995644764249195, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.01742094300322e-06, + "loss": 0.1043, + "step": 31640 + }, + { + "epoch": 2.9965915546298048, + "grad_norm": 0.3565918803215027, + "learning_rate": 8.01363378148078e-06, + "loss": 0.0996, + "step": 31650 + }, + { + "epoch": 2.9975383450104145, + "grad_norm": 0.4547620117664337, + "learning_rate": 8.009846619958342e-06, + "loss": 0.103, + "step": 31660 + }, + { + "epoch": 2.9984851353910242, + "grad_norm": 0.41886186599731445, + "learning_rate": 8.006059458435902e-06, + "loss": 0.1025, + "step": 31670 + }, + { + "epoch": 2.999431925771634, + "grad_norm": 0.40914952754974365, + "learning_rate": 8.002272296913464e-06, + "loss": 0.105, + "step": 31680 + }, + { + "epoch": 3.0, + "eval_f1_micro": 0.3799037598890792, + "eval_loss": 0.11170154809951782, + "eval_precision": 0.58840372226199, + "eval_recall": 0.2805066543549391, + "eval_runtime": 333.7804, + "eval_samples_per_second": 126.565, + "eval_steps_per_second": 7.912, + "step": 31686 + }, + { + "epoch": 3.0003787161522437, + "grad_norm": 0.3885652720928192, + "learning_rate": 7.998485135391026e-06, + "loss": 0.1001, + "step": 31690 + }, + { + "epoch": 3.0013255065328535, + "grad_norm": 0.38781455159187317, + "learning_rate": 7.994697973868586e-06, + "loss": 0.0948, + "step": 31700 + }, + { + "epoch": 3.002272296913463, + "grad_norm": 0.3879997432231903, + "learning_rate": 7.990910812346147e-06, + "loss": 0.1031, + "step": 31710 + }, + { + "epoch": 3.003219087294073, + "grad_norm": 0.4263533055782318, + "learning_rate": 7.987123650823708e-06, + "loss": 0.0944, + "step": 31720 + }, + { + "epoch": 3.0041658776746827, + "grad_norm": 0.3836594521999359, + "learning_rate": 7.98333648930127e-06, + "loss": 0.096, + "step": 31730 + }, + { + "epoch": 3.0051126680552924, + "grad_norm": 0.4415520429611206, + "learning_rate": 7.979549327778831e-06, + "loss": 0.0971, + "step": 31740 + }, + { + "epoch": 3.006059458435902, + "grad_norm": 0.3368381857872009, + "learning_rate": 7.975762166256391e-06, + "loss": 0.0865, + "step": 31750 + }, + { + "epoch": 3.007006248816512, + "grad_norm": 0.47972026467323303, + "learning_rate": 7.971975004733953e-06, + "loss": 0.0996, + "step": 31760 + }, + { + "epoch": 3.0079530391971216, + "grad_norm": 0.44021525979042053, + "learning_rate": 7.968187843211513e-06, + "loss": 0.102, + "step": 31770 + }, + { + "epoch": 3.0088998295777314, + "grad_norm": 0.4182315170764923, + "learning_rate": 7.964400681689075e-06, + "loss": 0.0991, + "step": 31780 + }, + { + "epoch": 3.009846619958341, + "grad_norm": 0.39695531129837036, + "learning_rate": 7.960613520166636e-06, + "loss": 0.1008, + "step": 31790 + }, + { + "epoch": 3.010793410338951, + "grad_norm": 0.36578354239463806, + "learning_rate": 7.956826358644196e-06, + "loss": 0.089, + "step": 31800 + }, + { + "epoch": 3.0117402007195606, + "grad_norm": 0.5266786813735962, + "learning_rate": 7.953039197121758e-06, + "loss": 0.0971, + "step": 31810 + }, + { + "epoch": 3.0126869911001704, + "grad_norm": 0.37619754672050476, + "learning_rate": 7.949252035599318e-06, + "loss": 0.0951, + "step": 31820 + }, + { + "epoch": 3.01363378148078, + "grad_norm": 0.4449104368686676, + "learning_rate": 7.94546487407688e-06, + "loss": 0.1009, + "step": 31830 + }, + { + "epoch": 3.01458057186139, + "grad_norm": 0.44374340772628784, + "learning_rate": 7.941677712554442e-06, + "loss": 0.0898, + "step": 31840 + }, + { + "epoch": 3.0155273622419996, + "grad_norm": 0.4777348041534424, + "learning_rate": 7.937890551032002e-06, + "loss": 0.0909, + "step": 31850 + }, + { + "epoch": 3.0164741526226093, + "grad_norm": 0.4121871888637543, + "learning_rate": 7.934103389509563e-06, + "loss": 0.0946, + "step": 31860 + }, + { + "epoch": 3.017420943003219, + "grad_norm": 0.38763758540153503, + "learning_rate": 7.930316227987124e-06, + "loss": 0.0976, + "step": 31870 + }, + { + "epoch": 3.018367733383829, + "grad_norm": 0.35308313369750977, + "learning_rate": 7.926529066464685e-06, + "loss": 0.0887, + "step": 31880 + }, + { + "epoch": 3.0193145237644385, + "grad_norm": 0.47998329997062683, + "learning_rate": 7.922741904942245e-06, + "loss": 0.0919, + "step": 31890 + }, + { + "epoch": 3.0202613141450483, + "grad_norm": 0.4650914669036865, + "learning_rate": 7.918954743419807e-06, + "loss": 0.0915, + "step": 31900 + }, + { + "epoch": 3.021208104525658, + "grad_norm": 0.3876132369041443, + "learning_rate": 7.915167581897369e-06, + "loss": 0.0955, + "step": 31910 + }, + { + "epoch": 3.0221548949062678, + "grad_norm": 0.524383544921875, + "learning_rate": 7.91138042037493e-06, + "loss": 0.0975, + "step": 31920 + }, + { + "epoch": 3.0231016852868775, + "grad_norm": 0.42943963408470154, + "learning_rate": 7.90759325885249e-06, + "loss": 0.0814, + "step": 31930 + }, + { + "epoch": 3.0240484756674872, + "grad_norm": 0.4390105903148651, + "learning_rate": 7.90380609733005e-06, + "loss": 0.0999, + "step": 31940 + }, + { + "epoch": 3.024995266048097, + "grad_norm": 0.46827784180641174, + "learning_rate": 7.900018935807612e-06, + "loss": 0.0896, + "step": 31950 + }, + { + "epoch": 3.0259420564287067, + "grad_norm": 0.45552289485931396, + "learning_rate": 7.896231774285174e-06, + "loss": 0.1136, + "step": 31960 + }, + { + "epoch": 3.0268888468093165, + "grad_norm": 0.4800921082496643, + "learning_rate": 7.892444612762736e-06, + "loss": 0.0991, + "step": 31970 + }, + { + "epoch": 3.027835637189926, + "grad_norm": 0.484578400850296, + "learning_rate": 7.888657451240296e-06, + "loss": 0.0985, + "step": 31980 + }, + { + "epoch": 3.028782427570536, + "grad_norm": 0.4732307195663452, + "learning_rate": 7.884870289717858e-06, + "loss": 0.091, + "step": 31990 + }, + { + "epoch": 3.0297292179511457, + "grad_norm": 0.4047934114933014, + "learning_rate": 7.881083128195418e-06, + "loss": 0.0935, + "step": 32000 + }, + { + "epoch": 3.0306760083317554, + "grad_norm": 0.32263126969337463, + "learning_rate": 7.87729596667298e-06, + "loss": 0.0917, + "step": 32010 + }, + { + "epoch": 3.031622798712365, + "grad_norm": 0.40960872173309326, + "learning_rate": 7.873508805150541e-06, + "loss": 0.095, + "step": 32020 + }, + { + "epoch": 3.032569589092975, + "grad_norm": 0.30367690324783325, + "learning_rate": 7.869721643628101e-06, + "loss": 0.0926, + "step": 32030 + }, + { + "epoch": 3.0335163794735847, + "grad_norm": 0.348137766122818, + "learning_rate": 7.865934482105663e-06, + "loss": 0.0897, + "step": 32040 + }, + { + "epoch": 3.0344631698541944, + "grad_norm": 0.45152419805526733, + "learning_rate": 7.862147320583223e-06, + "loss": 0.098, + "step": 32050 + }, + { + "epoch": 3.035409960234804, + "grad_norm": 0.47513529658317566, + "learning_rate": 7.858360159060785e-06, + "loss": 0.1028, + "step": 32060 + }, + { + "epoch": 3.036356750615414, + "grad_norm": 0.4300587773323059, + "learning_rate": 7.854572997538345e-06, + "loss": 0.0911, + "step": 32070 + }, + { + "epoch": 3.0373035409960236, + "grad_norm": 0.47339463233947754, + "learning_rate": 7.850785836015907e-06, + "loss": 0.1049, + "step": 32080 + }, + { + "epoch": 3.0382503313766334, + "grad_norm": 0.42077648639678955, + "learning_rate": 7.846998674493468e-06, + "loss": 0.0898, + "step": 32090 + }, + { + "epoch": 3.039197121757243, + "grad_norm": 0.4841423034667969, + "learning_rate": 7.843211512971028e-06, + "loss": 0.0965, + "step": 32100 + }, + { + "epoch": 3.040143912137853, + "grad_norm": 0.6945616006851196, + "learning_rate": 7.83942435144859e-06, + "loss": 0.0957, + "step": 32110 + }, + { + "epoch": 3.0410907025184626, + "grad_norm": 0.573688805103302, + "learning_rate": 7.83563718992615e-06, + "loss": 0.0892, + "step": 32120 + }, + { + "epoch": 3.0420374928990723, + "grad_norm": 0.4494647979736328, + "learning_rate": 7.831850028403712e-06, + "loss": 0.0929, + "step": 32130 + }, + { + "epoch": 3.042984283279682, + "grad_norm": 0.42762213945388794, + "learning_rate": 7.828062866881274e-06, + "loss": 0.0937, + "step": 32140 + }, + { + "epoch": 3.043931073660292, + "grad_norm": 0.6574509143829346, + "learning_rate": 7.824275705358835e-06, + "loss": 0.1021, + "step": 32150 + }, + { + "epoch": 3.0448778640409015, + "grad_norm": 0.5261147022247314, + "learning_rate": 7.820488543836395e-06, + "loss": 0.0967, + "step": 32160 + }, + { + "epoch": 3.0458246544215113, + "grad_norm": 0.509415864944458, + "learning_rate": 7.816701382313956e-06, + "loss": 0.0968, + "step": 32170 + }, + { + "epoch": 3.046771444802121, + "grad_norm": 0.5804086327552795, + "learning_rate": 7.812914220791517e-06, + "loss": 0.0962, + "step": 32180 + }, + { + "epoch": 3.0477182351827303, + "grad_norm": 0.5601807236671448, + "learning_rate": 7.809127059269079e-06, + "loss": 0.0982, + "step": 32190 + }, + { + "epoch": 3.04866502556334, + "grad_norm": 0.42098745703697205, + "learning_rate": 7.80533989774664e-06, + "loss": 0.0931, + "step": 32200 + }, + { + "epoch": 3.04961181594395, + "grad_norm": 0.36883658170700073, + "learning_rate": 7.8015527362242e-06, + "loss": 0.1013, + "step": 32210 + }, + { + "epoch": 3.0505586063245596, + "grad_norm": 0.4977787137031555, + "learning_rate": 7.797765574701763e-06, + "loss": 0.0951, + "step": 32220 + }, + { + "epoch": 3.0515053967051693, + "grad_norm": 0.49785080552101135, + "learning_rate": 7.793978413179323e-06, + "loss": 0.0951, + "step": 32230 + }, + { + "epoch": 3.052452187085779, + "grad_norm": 0.40809834003448486, + "learning_rate": 7.790191251656883e-06, + "loss": 0.0918, + "step": 32240 + }, + { + "epoch": 3.0533989774663888, + "grad_norm": 0.39890798926353455, + "learning_rate": 7.786404090134444e-06, + "loss": 0.0907, + "step": 32250 + }, + { + "epoch": 3.0543457678469985, + "grad_norm": 0.40804323554039, + "learning_rate": 7.782616928612006e-06, + "loss": 0.0976, + "step": 32260 + }, + { + "epoch": 3.0552925582276083, + "grad_norm": 0.48888227343559265, + "learning_rate": 7.778829767089568e-06, + "loss": 0.0928, + "step": 32270 + }, + { + "epoch": 3.056239348608218, + "grad_norm": 0.5478982329368591, + "learning_rate": 7.775042605567128e-06, + "loss": 0.1114, + "step": 32280 + }, + { + "epoch": 3.0571861389888277, + "grad_norm": 0.4051608145236969, + "learning_rate": 7.77125544404469e-06, + "loss": 0.098, + "step": 32290 + }, + { + "epoch": 3.0581329293694375, + "grad_norm": 0.38355210423469543, + "learning_rate": 7.76746828252225e-06, + "loss": 0.0954, + "step": 32300 + }, + { + "epoch": 3.059079719750047, + "grad_norm": 0.46852338314056396, + "learning_rate": 7.763681120999811e-06, + "loss": 0.0961, + "step": 32310 + }, + { + "epoch": 3.060026510130657, + "grad_norm": 0.45753785967826843, + "learning_rate": 7.759893959477373e-06, + "loss": 0.0942, + "step": 32320 + }, + { + "epoch": 3.0609733005112667, + "grad_norm": 0.41343867778778076, + "learning_rate": 7.756106797954933e-06, + "loss": 0.1011, + "step": 32330 + }, + { + "epoch": 3.0619200908918764, + "grad_norm": 0.41205528378486633, + "learning_rate": 7.752319636432495e-06, + "loss": 0.0954, + "step": 32340 + }, + { + "epoch": 3.062866881272486, + "grad_norm": 0.45434126257896423, + "learning_rate": 7.748532474910055e-06, + "loss": 0.0873, + "step": 32350 + }, + { + "epoch": 3.063813671653096, + "grad_norm": 0.45336347818374634, + "learning_rate": 7.744745313387617e-06, + "loss": 0.0993, + "step": 32360 + }, + { + "epoch": 3.0647604620337057, + "grad_norm": 0.4804220199584961, + "learning_rate": 7.740958151865179e-06, + "loss": 0.0911, + "step": 32370 + }, + { + "epoch": 3.0657072524143154, + "grad_norm": 0.43861210346221924, + "learning_rate": 7.737170990342739e-06, + "loss": 0.0841, + "step": 32380 + }, + { + "epoch": 3.066654042794925, + "grad_norm": 0.4974420964717865, + "learning_rate": 7.7333838288203e-06, + "loss": 0.0982, + "step": 32390 + }, + { + "epoch": 3.067600833175535, + "grad_norm": 0.571367621421814, + "learning_rate": 7.72959666729786e-06, + "loss": 0.1027, + "step": 32400 + }, + { + "epoch": 3.0685476235561446, + "grad_norm": 0.4368377923965454, + "learning_rate": 7.725809505775422e-06, + "loss": 0.0971, + "step": 32410 + }, + { + "epoch": 3.0694944139367544, + "grad_norm": 0.5460110306739807, + "learning_rate": 7.722022344252982e-06, + "loss": 0.094, + "step": 32420 + }, + { + "epoch": 3.070441204317364, + "grad_norm": 0.4684341847896576, + "learning_rate": 7.718235182730544e-06, + "loss": 0.1011, + "step": 32430 + }, + { + "epoch": 3.071387994697974, + "grad_norm": 0.7317419648170471, + "learning_rate": 7.714448021208106e-06, + "loss": 0.0971, + "step": 32440 + }, + { + "epoch": 3.0723347850785836, + "grad_norm": 0.3783852159976959, + "learning_rate": 7.710660859685666e-06, + "loss": 0.0983, + "step": 32450 + }, + { + "epoch": 3.0732815754591933, + "grad_norm": 0.4761520326137543, + "learning_rate": 7.706873698163227e-06, + "loss": 0.0923, + "step": 32460 + }, + { + "epoch": 3.074228365839803, + "grad_norm": 0.6367383599281311, + "learning_rate": 7.703086536640787e-06, + "loss": 0.1035, + "step": 32470 + }, + { + "epoch": 3.075175156220413, + "grad_norm": 0.36390215158462524, + "learning_rate": 7.69929937511835e-06, + "loss": 0.0949, + "step": 32480 + }, + { + "epoch": 3.0761219466010226, + "grad_norm": 0.41366979479789734, + "learning_rate": 7.695512213595911e-06, + "loss": 0.0998, + "step": 32490 + }, + { + "epoch": 3.0770687369816323, + "grad_norm": 0.493579238653183, + "learning_rate": 7.691725052073473e-06, + "loss": 0.0885, + "step": 32500 + }, + { + "epoch": 3.078015527362242, + "grad_norm": 0.5382437109947205, + "learning_rate": 7.687937890551033e-06, + "loss": 0.1057, + "step": 32510 + }, + { + "epoch": 3.078962317742852, + "grad_norm": 0.4758876860141754, + "learning_rate": 7.684150729028593e-06, + "loss": 0.0985, + "step": 32520 + }, + { + "epoch": 3.0799091081234615, + "grad_norm": 0.3409093916416168, + "learning_rate": 7.680363567506155e-06, + "loss": 0.0913, + "step": 32530 + }, + { + "epoch": 3.0808558985040713, + "grad_norm": 0.3846043050289154, + "learning_rate": 7.676576405983716e-06, + "loss": 0.1007, + "step": 32540 + }, + { + "epoch": 3.081802688884681, + "grad_norm": 0.5030474662780762, + "learning_rate": 7.672789244461278e-06, + "loss": 0.0932, + "step": 32550 + }, + { + "epoch": 3.0827494792652907, + "grad_norm": 0.5268787145614624, + "learning_rate": 7.669002082938838e-06, + "loss": 0.1036, + "step": 32560 + }, + { + "epoch": 3.0836962696459005, + "grad_norm": 0.49069228768348694, + "learning_rate": 7.6652149214164e-06, + "loss": 0.1017, + "step": 32570 + }, + { + "epoch": 3.0846430600265102, + "grad_norm": 0.44971224665641785, + "learning_rate": 7.66142775989396e-06, + "loss": 0.1015, + "step": 32580 + }, + { + "epoch": 3.08558985040712, + "grad_norm": 0.3594251573085785, + "learning_rate": 7.65764059837152e-06, + "loss": 0.0984, + "step": 32590 + }, + { + "epoch": 3.0865366407877297, + "grad_norm": 0.49989503622055054, + "learning_rate": 7.653853436849082e-06, + "loss": 0.1014, + "step": 32600 + }, + { + "epoch": 3.0874834311683395, + "grad_norm": 0.4332210123538971, + "learning_rate": 7.650066275326643e-06, + "loss": 0.0901, + "step": 32610 + }, + { + "epoch": 3.088430221548949, + "grad_norm": 0.4731793999671936, + "learning_rate": 7.646279113804205e-06, + "loss": 0.0934, + "step": 32620 + }, + { + "epoch": 3.089377011929559, + "grad_norm": 0.5124056339263916, + "learning_rate": 7.642491952281765e-06, + "loss": 0.096, + "step": 32630 + }, + { + "epoch": 3.0903238023101687, + "grad_norm": 0.40301021933555603, + "learning_rate": 7.638704790759327e-06, + "loss": 0.0938, + "step": 32640 + }, + { + "epoch": 3.0912705926907784, + "grad_norm": 0.35446423292160034, + "learning_rate": 7.634917629236887e-06, + "loss": 0.0888, + "step": 32650 + }, + { + "epoch": 3.092217383071388, + "grad_norm": 0.43065381050109863, + "learning_rate": 7.631130467714449e-06, + "loss": 0.0855, + "step": 32660 + }, + { + "epoch": 3.093164173451998, + "grad_norm": 0.41086545586586, + "learning_rate": 7.62734330619201e-06, + "loss": 0.0959, + "step": 32670 + }, + { + "epoch": 3.0941109638326076, + "grad_norm": 0.5487180948257446, + "learning_rate": 7.6235561446695705e-06, + "loss": 0.0874, + "step": 32680 + }, + { + "epoch": 3.0950577542132174, + "grad_norm": 0.4843038320541382, + "learning_rate": 7.619768983147132e-06, + "loss": 0.0943, + "step": 32690 + }, + { + "epoch": 3.096004544593827, + "grad_norm": 0.49333345890045166, + "learning_rate": 7.615981821624693e-06, + "loss": 0.0942, + "step": 32700 + }, + { + "epoch": 3.096951334974437, + "grad_norm": 0.4573739767074585, + "learning_rate": 7.612194660102254e-06, + "loss": 0.0971, + "step": 32710 + }, + { + "epoch": 3.0978981253550466, + "grad_norm": 0.5025661587715149, + "learning_rate": 7.608407498579815e-06, + "loss": 0.0946, + "step": 32720 + }, + { + "epoch": 3.098844915735656, + "grad_norm": 0.3712051212787628, + "learning_rate": 7.604620337057376e-06, + "loss": 0.095, + "step": 32730 + }, + { + "epoch": 3.0997917061162656, + "grad_norm": 0.47726795077323914, + "learning_rate": 7.600833175534938e-06, + "loss": 0.0951, + "step": 32740 + }, + { + "epoch": 3.1007384964968754, + "grad_norm": 0.42070797085762024, + "learning_rate": 7.597046014012498e-06, + "loss": 0.0957, + "step": 32750 + }, + { + "epoch": 3.101685286877485, + "grad_norm": 0.4767979681491852, + "learning_rate": 7.593258852490059e-06, + "loss": 0.1072, + "step": 32760 + }, + { + "epoch": 3.102632077258095, + "grad_norm": 0.522358238697052, + "learning_rate": 7.58947169096762e-06, + "loss": 0.0874, + "step": 32770 + }, + { + "epoch": 3.1035788676387046, + "grad_norm": 0.36201608180999756, + "learning_rate": 7.585684529445182e-06, + "loss": 0.1002, + "step": 32780 + }, + { + "epoch": 3.1045256580193143, + "grad_norm": 0.4515857696533203, + "learning_rate": 7.581897367922743e-06, + "loss": 0.099, + "step": 32790 + }, + { + "epoch": 3.105472448399924, + "grad_norm": 0.3814016878604889, + "learning_rate": 7.578110206400303e-06, + "loss": 0.0921, + "step": 32800 + }, + { + "epoch": 3.106419238780534, + "grad_norm": 0.41360601782798767, + "learning_rate": 7.574323044877865e-06, + "loss": 0.0864, + "step": 32810 + }, + { + "epoch": 3.1073660291611436, + "grad_norm": 0.35619574785232544, + "learning_rate": 7.570535883355426e-06, + "loss": 0.0928, + "step": 32820 + }, + { + "epoch": 3.1083128195417533, + "grad_norm": 0.3933326005935669, + "learning_rate": 7.566748721832987e-06, + "loss": 0.0877, + "step": 32830 + }, + { + "epoch": 3.109259609922363, + "grad_norm": 0.45120447874069214, + "learning_rate": 7.5629615603105474e-06, + "loss": 0.0911, + "step": 32840 + }, + { + "epoch": 3.110206400302973, + "grad_norm": 0.5016258358955383, + "learning_rate": 7.559174398788109e-06, + "loss": 0.0954, + "step": 32850 + }, + { + "epoch": 3.1111531906835825, + "grad_norm": 0.45662152767181396, + "learning_rate": 7.55538723726567e-06, + "loss": 0.0894, + "step": 32860 + }, + { + "epoch": 3.1120999810641923, + "grad_norm": 0.4399343729019165, + "learning_rate": 7.551600075743231e-06, + "loss": 0.0987, + "step": 32870 + }, + { + "epoch": 3.113046771444802, + "grad_norm": 0.4458087980747223, + "learning_rate": 7.547812914220793e-06, + "loss": 0.0917, + "step": 32880 + }, + { + "epoch": 3.1139935618254118, + "grad_norm": 0.3969976603984833, + "learning_rate": 7.544025752698353e-06, + "loss": 0.1044, + "step": 32890 + }, + { + "epoch": 3.1149403522060215, + "grad_norm": 0.4304681718349457, + "learning_rate": 7.5402385911759145e-06, + "loss": 0.0945, + "step": 32900 + }, + { + "epoch": 3.1158871425866312, + "grad_norm": 0.4242285192012787, + "learning_rate": 7.536451429653475e-06, + "loss": 0.0909, + "step": 32910 + }, + { + "epoch": 3.116833932967241, + "grad_norm": 0.5326030254364014, + "learning_rate": 7.532664268131037e-06, + "loss": 0.101, + "step": 32920 + }, + { + "epoch": 3.1177807233478507, + "grad_norm": 0.47638463973999023, + "learning_rate": 7.528877106608597e-06, + "loss": 0.0937, + "step": 32930 + }, + { + "epoch": 3.1187275137284605, + "grad_norm": 0.32601669430732727, + "learning_rate": 7.525089945086158e-06, + "loss": 0.0916, + "step": 32940 + }, + { + "epoch": 3.11967430410907, + "grad_norm": 0.5212317109107971, + "learning_rate": 7.52130278356372e-06, + "loss": 0.0955, + "step": 32950 + }, + { + "epoch": 3.12062109448968, + "grad_norm": 0.5516402125358582, + "learning_rate": 7.517515622041281e-06, + "loss": 0.1028, + "step": 32960 + }, + { + "epoch": 3.1215678848702897, + "grad_norm": 0.4505993127822876, + "learning_rate": 7.5137284605188425e-06, + "loss": 0.0967, + "step": 32970 + }, + { + "epoch": 3.1225146752508994, + "grad_norm": 0.4699845314025879, + "learning_rate": 7.5099412989964025e-06, + "loss": 0.0915, + "step": 32980 + }, + { + "epoch": 3.123461465631509, + "grad_norm": 0.39752283692359924, + "learning_rate": 7.506154137473964e-06, + "loss": 0.0974, + "step": 32990 + }, + { + "epoch": 3.124408256012119, + "grad_norm": 0.40322551131248474, + "learning_rate": 7.502366975951525e-06, + "loss": 0.1015, + "step": 33000 + }, + { + "epoch": 3.1253550463927287, + "grad_norm": 0.38083702325820923, + "learning_rate": 7.498579814429085e-06, + "loss": 0.0952, + "step": 33010 + }, + { + "epoch": 3.1263018367733384, + "grad_norm": 0.44498032331466675, + "learning_rate": 7.494792652906647e-06, + "loss": 0.0836, + "step": 33020 + }, + { + "epoch": 3.127248627153948, + "grad_norm": 0.45370641350746155, + "learning_rate": 7.491005491384208e-06, + "loss": 0.0993, + "step": 33030 + }, + { + "epoch": 3.128195417534558, + "grad_norm": 0.49418359994888306, + "learning_rate": 7.48721832986177e-06, + "loss": 0.0945, + "step": 33040 + }, + { + "epoch": 3.1291422079151676, + "grad_norm": 0.38631823658943176, + "learning_rate": 7.4834311683393305e-06, + "loss": 0.0982, + "step": 33050 + }, + { + "epoch": 3.1300889982957774, + "grad_norm": 0.4079272150993347, + "learning_rate": 7.479644006816892e-06, + "loss": 0.0911, + "step": 33060 + }, + { + "epoch": 3.131035788676387, + "grad_norm": 0.5161040425300598, + "learning_rate": 7.475856845294452e-06, + "loss": 0.1054, + "step": 33070 + }, + { + "epoch": 3.131982579056997, + "grad_norm": 0.4277031719684601, + "learning_rate": 7.472069683772013e-06, + "loss": 0.0893, + "step": 33080 + }, + { + "epoch": 3.1329293694376066, + "grad_norm": 0.3898018002510071, + "learning_rate": 7.468282522249575e-06, + "loss": 0.1023, + "step": 33090 + }, + { + "epoch": 3.1338761598182163, + "grad_norm": 0.48282235860824585, + "learning_rate": 7.464495360727135e-06, + "loss": 0.0853, + "step": 33100 + }, + { + "epoch": 3.134822950198826, + "grad_norm": 0.5204094052314758, + "learning_rate": 7.460708199204697e-06, + "loss": 0.089, + "step": 33110 + }, + { + "epoch": 3.135769740579436, + "grad_norm": 0.49214330315589905, + "learning_rate": 7.456921037682258e-06, + "loss": 0.0978, + "step": 33120 + }, + { + "epoch": 3.1367165309600455, + "grad_norm": 0.4669604003429413, + "learning_rate": 7.453133876159819e-06, + "loss": 0.0986, + "step": 33130 + }, + { + "epoch": 3.1376633213406553, + "grad_norm": 0.4839509427547455, + "learning_rate": 7.44934671463738e-06, + "loss": 0.1034, + "step": 33140 + }, + { + "epoch": 3.138610111721265, + "grad_norm": 0.5297679901123047, + "learning_rate": 7.44555955311494e-06, + "loss": 0.1051, + "step": 33150 + }, + { + "epoch": 3.1395569021018748, + "grad_norm": 0.4383249580860138, + "learning_rate": 7.441772391592502e-06, + "loss": 0.1006, + "step": 33160 + }, + { + "epoch": 3.1405036924824845, + "grad_norm": 0.48694220185279846, + "learning_rate": 7.437985230070063e-06, + "loss": 0.097, + "step": 33170 + }, + { + "epoch": 3.1414504828630943, + "grad_norm": 0.5828351974487305, + "learning_rate": 7.434198068547625e-06, + "loss": 0.0959, + "step": 33180 + }, + { + "epoch": 3.142397273243704, + "grad_norm": 0.4270647168159485, + "learning_rate": 7.430410907025185e-06, + "loss": 0.1049, + "step": 33190 + }, + { + "epoch": 3.1433440636243137, + "grad_norm": 0.5030734539031982, + "learning_rate": 7.4266237455027465e-06, + "loss": 0.0996, + "step": 33200 + }, + { + "epoch": 3.1442908540049235, + "grad_norm": 0.46380719542503357, + "learning_rate": 7.422836583980307e-06, + "loss": 0.0947, + "step": 33210 + }, + { + "epoch": 3.145237644385533, + "grad_norm": 0.4121900796890259, + "learning_rate": 7.419049422457868e-06, + "loss": 0.0987, + "step": 33220 + }, + { + "epoch": 3.146184434766143, + "grad_norm": 0.48857617378234863, + "learning_rate": 7.41526226093543e-06, + "loss": 0.0987, + "step": 33230 + }, + { + "epoch": 3.1471312251467527, + "grad_norm": 0.4057711660861969, + "learning_rate": 7.41147509941299e-06, + "loss": 0.0938, + "step": 33240 + }, + { + "epoch": 3.1480780155273624, + "grad_norm": 0.3517865836620331, + "learning_rate": 7.407687937890552e-06, + "loss": 0.0943, + "step": 33250 + }, + { + "epoch": 3.149024805907972, + "grad_norm": 0.529962420463562, + "learning_rate": 7.403900776368113e-06, + "loss": 0.1027, + "step": 33260 + }, + { + "epoch": 3.149971596288582, + "grad_norm": 0.4298302233219147, + "learning_rate": 7.4001136148456744e-06, + "loss": 0.0916, + "step": 33270 + }, + { + "epoch": 3.1509183866691917, + "grad_norm": 0.34590843319892883, + "learning_rate": 7.3963264533232345e-06, + "loss": 0.1008, + "step": 33280 + }, + { + "epoch": 3.1518651770498014, + "grad_norm": 0.35897600650787354, + "learning_rate": 7.392539291800795e-06, + "loss": 0.0982, + "step": 33290 + }, + { + "epoch": 3.1528119674304107, + "grad_norm": 0.4598219692707062, + "learning_rate": 7.388752130278357e-06, + "loss": 0.096, + "step": 33300 + }, + { + "epoch": 3.1537587578110204, + "grad_norm": 0.5004033446311951, + "learning_rate": 7.384964968755918e-06, + "loss": 0.0921, + "step": 33310 + }, + { + "epoch": 3.15470554819163, + "grad_norm": 0.49685394763946533, + "learning_rate": 7.38117780723348e-06, + "loss": 0.1034, + "step": 33320 + }, + { + "epoch": 3.15565233857224, + "grad_norm": 0.5572829842567444, + "learning_rate": 7.37739064571104e-06, + "loss": 0.0979, + "step": 33330 + }, + { + "epoch": 3.1565991289528497, + "grad_norm": 0.4883466362953186, + "learning_rate": 7.3736034841886016e-06, + "loss": 0.0931, + "step": 33340 + }, + { + "epoch": 3.1575459193334594, + "grad_norm": 0.434589147567749, + "learning_rate": 7.3698163226661625e-06, + "loss": 0.0954, + "step": 33350 + }, + { + "epoch": 3.158492709714069, + "grad_norm": 0.4334976375102997, + "learning_rate": 7.3660291611437225e-06, + "loss": 0.0976, + "step": 33360 + }, + { + "epoch": 3.159439500094679, + "grad_norm": 0.4388626217842102, + "learning_rate": 7.362241999621284e-06, + "loss": 0.1022, + "step": 33370 + }, + { + "epoch": 3.1603862904752886, + "grad_norm": 0.5536752939224243, + "learning_rate": 7.358454838098845e-06, + "loss": 0.0956, + "step": 33380 + }, + { + "epoch": 3.1613330808558984, + "grad_norm": 0.528320848941803, + "learning_rate": 7.354667676576407e-06, + "loss": 0.0998, + "step": 33390 + }, + { + "epoch": 3.162279871236508, + "grad_norm": 0.40832021832466125, + "learning_rate": 7.350880515053968e-06, + "loss": 0.0959, + "step": 33400 + }, + { + "epoch": 3.163226661617118, + "grad_norm": 0.4212343394756317, + "learning_rate": 7.3470933535315295e-06, + "loss": 0.0933, + "step": 33410 + }, + { + "epoch": 3.1641734519977276, + "grad_norm": 0.49932220578193665, + "learning_rate": 7.34330619200909e-06, + "loss": 0.1002, + "step": 33420 + }, + { + "epoch": 3.1651202423783373, + "grad_norm": 0.4118240475654602, + "learning_rate": 7.3395190304866505e-06, + "loss": 0.0856, + "step": 33430 + }, + { + "epoch": 3.166067032758947, + "grad_norm": 0.5317455530166626, + "learning_rate": 7.335731868964212e-06, + "loss": 0.0918, + "step": 33440 + }, + { + "epoch": 3.167013823139557, + "grad_norm": 0.5176566243171692, + "learning_rate": 7.331944707441772e-06, + "loss": 0.0998, + "step": 33450 + }, + { + "epoch": 3.1679606135201666, + "grad_norm": 0.4694359600543976, + "learning_rate": 7.328157545919334e-06, + "loss": 0.1001, + "step": 33460 + }, + { + "epoch": 3.1689074039007763, + "grad_norm": 0.44935131072998047, + "learning_rate": 7.324370384396895e-06, + "loss": 0.0954, + "step": 33470 + }, + { + "epoch": 3.169854194281386, + "grad_norm": 0.49862948060035706, + "learning_rate": 7.320583222874457e-06, + "loss": 0.1028, + "step": 33480 + }, + { + "epoch": 3.170800984661996, + "grad_norm": 0.5710641145706177, + "learning_rate": 7.3167960613520176e-06, + "loss": 0.0985, + "step": 33490 + }, + { + "epoch": 3.1717477750426055, + "grad_norm": 0.44302472472190857, + "learning_rate": 7.313008899829578e-06, + "loss": 0.103, + "step": 33500 + }, + { + "epoch": 3.1726945654232153, + "grad_norm": 0.5398033857345581, + "learning_rate": 7.309221738307139e-06, + "loss": 0.0995, + "step": 33510 + }, + { + "epoch": 3.173641355803825, + "grad_norm": 0.4514923393726349, + "learning_rate": 7.3054345767847e-06, + "loss": 0.0991, + "step": 33520 + }, + { + "epoch": 3.1745881461844347, + "grad_norm": 0.5368667840957642, + "learning_rate": 7.301647415262262e-06, + "loss": 0.099, + "step": 33530 + }, + { + "epoch": 3.1755349365650445, + "grad_norm": 0.46442773938179016, + "learning_rate": 7.297860253739822e-06, + "loss": 0.1082, + "step": 33540 + }, + { + "epoch": 3.1764817269456542, + "grad_norm": 0.5406079888343811, + "learning_rate": 7.294073092217384e-06, + "loss": 0.0905, + "step": 33550 + }, + { + "epoch": 3.177428517326264, + "grad_norm": 0.531243085861206, + "learning_rate": 7.290285930694945e-06, + "loss": 0.0882, + "step": 33560 + }, + { + "epoch": 3.1783753077068737, + "grad_norm": 0.4771122336387634, + "learning_rate": 7.286498769172506e-06, + "loss": 0.101, + "step": 33570 + }, + { + "epoch": 3.1793220980874835, + "grad_norm": 0.5824273824691772, + "learning_rate": 7.282711607650067e-06, + "loss": 0.0921, + "step": 33580 + }, + { + "epoch": 3.180268888468093, + "grad_norm": 0.5940106511116028, + "learning_rate": 7.278924446127627e-06, + "loss": 0.1092, + "step": 33590 + }, + { + "epoch": 3.181215678848703, + "grad_norm": 0.4634968042373657, + "learning_rate": 7.275137284605189e-06, + "loss": 0.095, + "step": 33600 + }, + { + "epoch": 3.1821624692293127, + "grad_norm": 0.5046654939651489, + "learning_rate": 7.27135012308275e-06, + "loss": 0.0964, + "step": 33610 + }, + { + "epoch": 3.1831092596099224, + "grad_norm": 0.46652933955192566, + "learning_rate": 7.267562961560312e-06, + "loss": 0.0892, + "step": 33620 + }, + { + "epoch": 3.184056049990532, + "grad_norm": 0.5102341175079346, + "learning_rate": 7.263775800037872e-06, + "loss": 0.093, + "step": 33630 + }, + { + "epoch": 3.185002840371142, + "grad_norm": 0.5113751292228699, + "learning_rate": 7.259988638515433e-06, + "loss": 0.1, + "step": 33640 + }, + { + "epoch": 3.1859496307517516, + "grad_norm": 0.537693202495575, + "learning_rate": 7.2562014769929944e-06, + "loss": 0.096, + "step": 33650 + }, + { + "epoch": 3.1868964211323614, + "grad_norm": 0.5255269408226013, + "learning_rate": 7.252414315470555e-06, + "loss": 0.0978, + "step": 33660 + }, + { + "epoch": 3.187843211512971, + "grad_norm": 0.4506100118160248, + "learning_rate": 7.248627153948117e-06, + "loss": 0.096, + "step": 33670 + }, + { + "epoch": 3.188790001893581, + "grad_norm": 0.39353442192077637, + "learning_rate": 7.244839992425677e-06, + "loss": 0.0947, + "step": 33680 + }, + { + "epoch": 3.1897367922741906, + "grad_norm": 0.5483629703521729, + "learning_rate": 7.241052830903239e-06, + "loss": 0.0959, + "step": 33690 + }, + { + "epoch": 3.1906835826548003, + "grad_norm": 0.42552974820137024, + "learning_rate": 7.2372656693808e-06, + "loss": 0.0984, + "step": 33700 + }, + { + "epoch": 3.19163037303541, + "grad_norm": 0.6715368032455444, + "learning_rate": 7.233478507858361e-06, + "loss": 0.0949, + "step": 33710 + }, + { + "epoch": 3.19257716341602, + "grad_norm": 0.40398454666137695, + "learning_rate": 7.2296913463359216e-06, + "loss": 0.0889, + "step": 33720 + }, + { + "epoch": 3.1935239537966296, + "grad_norm": 0.42471370100975037, + "learning_rate": 7.2259041848134825e-06, + "loss": 0.0875, + "step": 33730 + }, + { + "epoch": 3.1944707441772393, + "grad_norm": 0.37097862362861633, + "learning_rate": 7.222117023291044e-06, + "loss": 0.0979, + "step": 33740 + }, + { + "epoch": 3.195417534557849, + "grad_norm": 0.514901340007782, + "learning_rate": 7.218329861768605e-06, + "loss": 0.0887, + "step": 33750 + }, + { + "epoch": 3.196364324938459, + "grad_norm": 0.5147632956504822, + "learning_rate": 7.214542700246167e-06, + "loss": 0.0991, + "step": 33760 + }, + { + "epoch": 3.1973111153190685, + "grad_norm": 0.48616480827331543, + "learning_rate": 7.210755538723727e-06, + "loss": 0.0976, + "step": 33770 + }, + { + "epoch": 3.1982579056996783, + "grad_norm": 0.5624353885650635, + "learning_rate": 7.206968377201288e-06, + "loss": 0.0962, + "step": 33780 + }, + { + "epoch": 3.199204696080288, + "grad_norm": 0.4632226526737213, + "learning_rate": 7.2031812156788495e-06, + "loss": 0.0989, + "step": 33790 + }, + { + "epoch": 3.2001514864608978, + "grad_norm": 0.403451144695282, + "learning_rate": 7.1993940541564104e-06, + "loss": 0.0923, + "step": 33800 + }, + { + "epoch": 3.2010982768415075, + "grad_norm": 0.37990331649780273, + "learning_rate": 7.195606892633971e-06, + "loss": 0.1006, + "step": 33810 + }, + { + "epoch": 3.202045067222117, + "grad_norm": 0.5115433931350708, + "learning_rate": 7.191819731111532e-06, + "loss": 0.0984, + "step": 33820 + }, + { + "epoch": 3.2029918576027265, + "grad_norm": 0.4312513470649719, + "learning_rate": 7.188032569589094e-06, + "loss": 0.0887, + "step": 33830 + }, + { + "epoch": 3.2039386479833363, + "grad_norm": 0.46648553013801575, + "learning_rate": 7.184245408066655e-06, + "loss": 0.0893, + "step": 33840 + }, + { + "epoch": 3.204885438363946, + "grad_norm": 0.4689205586910248, + "learning_rate": 7.180458246544215e-06, + "loss": 0.0991, + "step": 33850 + }, + { + "epoch": 3.2058322287445558, + "grad_norm": 0.3541924059391022, + "learning_rate": 7.176671085021777e-06, + "loss": 0.0871, + "step": 33860 + }, + { + "epoch": 3.2067790191251655, + "grad_norm": 0.45448562502861023, + "learning_rate": 7.1728839234993376e-06, + "loss": 0.0977, + "step": 33870 + }, + { + "epoch": 3.2077258095057752, + "grad_norm": 0.39853841066360474, + "learning_rate": 7.169096761976899e-06, + "loss": 0.0911, + "step": 33880 + }, + { + "epoch": 3.208672599886385, + "grad_norm": 0.6429105997085571, + "learning_rate": 7.16530960045446e-06, + "loss": 0.105, + "step": 33890 + }, + { + "epoch": 3.2096193902669947, + "grad_norm": 0.3810679614543915, + "learning_rate": 7.161522438932021e-06, + "loss": 0.0865, + "step": 33900 + }, + { + "epoch": 3.2105661806476045, + "grad_norm": 0.43609294295310974, + "learning_rate": 7.157735277409582e-06, + "loss": 0.0947, + "step": 33910 + }, + { + "epoch": 3.211512971028214, + "grad_norm": 0.644542932510376, + "learning_rate": 7.153948115887143e-06, + "loss": 0.1029, + "step": 33920 + }, + { + "epoch": 3.212459761408824, + "grad_norm": 0.4697587490081787, + "learning_rate": 7.150160954364705e-06, + "loss": 0.1001, + "step": 33930 + }, + { + "epoch": 3.2134065517894337, + "grad_norm": 0.5083484649658203, + "learning_rate": 7.146373792842265e-06, + "loss": 0.1025, + "step": 33940 + }, + { + "epoch": 3.2143533421700434, + "grad_norm": 0.4288027882575989, + "learning_rate": 7.142586631319826e-06, + "loss": 0.1035, + "step": 33950 + }, + { + "epoch": 3.215300132550653, + "grad_norm": 0.38893651962280273, + "learning_rate": 7.138799469797387e-06, + "loss": 0.0987, + "step": 33960 + }, + { + "epoch": 3.216246922931263, + "grad_norm": 0.4421164393424988, + "learning_rate": 7.135012308274949e-06, + "loss": 0.1082, + "step": 33970 + }, + { + "epoch": 3.2171937133118726, + "grad_norm": 0.4417242109775543, + "learning_rate": 7.13122514675251e-06, + "loss": 0.0949, + "step": 33980 + }, + { + "epoch": 3.2181405036924824, + "grad_norm": 0.3973984718322754, + "learning_rate": 7.12743798523007e-06, + "loss": 0.0934, + "step": 33990 + }, + { + "epoch": 3.219087294073092, + "grad_norm": 0.39709991216659546, + "learning_rate": 7.123650823707632e-06, + "loss": 0.0937, + "step": 34000 + }, + { + "epoch": 3.220034084453702, + "grad_norm": 0.42711204290390015, + "learning_rate": 7.119863662185193e-06, + "loss": 0.0929, + "step": 34010 + }, + { + "epoch": 3.2209808748343116, + "grad_norm": 0.48301854729652405, + "learning_rate": 7.116076500662754e-06, + "loss": 0.1032, + "step": 34020 + }, + { + "epoch": 3.2219276652149214, + "grad_norm": 0.47069182991981506, + "learning_rate": 7.1122893391403144e-06, + "loss": 0.0938, + "step": 34030 + }, + { + "epoch": 3.222874455595531, + "grad_norm": 0.36594197154045105, + "learning_rate": 7.108502177617876e-06, + "loss": 0.088, + "step": 34040 + }, + { + "epoch": 3.223821245976141, + "grad_norm": 0.42151713371276855, + "learning_rate": 7.104715016095437e-06, + "loss": 0.0945, + "step": 34050 + }, + { + "epoch": 3.2247680363567506, + "grad_norm": 0.37251588702201843, + "learning_rate": 7.100927854572998e-06, + "loss": 0.0957, + "step": 34060 + }, + { + "epoch": 3.2257148267373603, + "grad_norm": 0.46561744809150696, + "learning_rate": 7.09714069305056e-06, + "loss": 0.0978, + "step": 34070 + }, + { + "epoch": 3.22666161711797, + "grad_norm": 0.5953252911567688, + "learning_rate": 7.09335353152812e-06, + "loss": 0.1009, + "step": 34080 + }, + { + "epoch": 3.22760840749858, + "grad_norm": 0.5023998022079468, + "learning_rate": 7.0895663700056815e-06, + "loss": 0.1021, + "step": 34090 + }, + { + "epoch": 3.2285551978791895, + "grad_norm": 0.49502333998680115, + "learning_rate": 7.085779208483242e-06, + "loss": 0.0945, + "step": 34100 + }, + { + "epoch": 3.2295019882597993, + "grad_norm": 0.5286855697631836, + "learning_rate": 7.081992046960804e-06, + "loss": 0.0947, + "step": 34110 + }, + { + "epoch": 3.230448778640409, + "grad_norm": 0.4840443730354309, + "learning_rate": 7.078204885438364e-06, + "loss": 0.0993, + "step": 34120 + }, + { + "epoch": 3.2313955690210188, + "grad_norm": 0.4057762920856476, + "learning_rate": 7.074417723915925e-06, + "loss": 0.0879, + "step": 34130 + }, + { + "epoch": 3.2323423594016285, + "grad_norm": 0.6441464424133301, + "learning_rate": 7.070630562393487e-06, + "loss": 0.1014, + "step": 34140 + }, + { + "epoch": 3.2332891497822382, + "grad_norm": 0.4571669399738312, + "learning_rate": 7.066843400871048e-06, + "loss": 0.1046, + "step": 34150 + }, + { + "epoch": 3.234235940162848, + "grad_norm": 0.4196922779083252, + "learning_rate": 7.0630562393486095e-06, + "loss": 0.0954, + "step": 34160 + }, + { + "epoch": 3.2351827305434577, + "grad_norm": 0.3758500814437866, + "learning_rate": 7.0592690778261695e-06, + "loss": 0.0786, + "step": 34170 + }, + { + "epoch": 3.2361295209240675, + "grad_norm": 0.47714629769325256, + "learning_rate": 7.055481916303731e-06, + "loss": 0.1078, + "step": 34180 + }, + { + "epoch": 3.237076311304677, + "grad_norm": 0.4457477033138275, + "learning_rate": 7.051694754781292e-06, + "loss": 0.0975, + "step": 34190 + }, + { + "epoch": 3.238023101685287, + "grad_norm": 0.469585120677948, + "learning_rate": 7.047907593258852e-06, + "loss": 0.0982, + "step": 34200 + }, + { + "epoch": 3.2389698920658967, + "grad_norm": 0.4486069083213806, + "learning_rate": 7.044120431736414e-06, + "loss": 0.0991, + "step": 34210 + }, + { + "epoch": 3.2399166824465064, + "grad_norm": 0.5082442164421082, + "learning_rate": 7.040333270213975e-06, + "loss": 0.1033, + "step": 34220 + }, + { + "epoch": 3.240863472827116, + "grad_norm": 0.4870776832103729, + "learning_rate": 7.036546108691537e-06, + "loss": 0.0845, + "step": 34230 + }, + { + "epoch": 3.241810263207726, + "grad_norm": 0.5926299095153809, + "learning_rate": 7.0327589471690975e-06, + "loss": 0.0862, + "step": 34240 + }, + { + "epoch": 3.2427570535883357, + "grad_norm": 0.4643210470676422, + "learning_rate": 7.028971785646659e-06, + "loss": 0.0884, + "step": 34250 + }, + { + "epoch": 3.2437038439689454, + "grad_norm": 0.46026501059532166, + "learning_rate": 7.025184624124219e-06, + "loss": 0.0967, + "step": 34260 + }, + { + "epoch": 3.244650634349555, + "grad_norm": 0.4218289852142334, + "learning_rate": 7.02139746260178e-06, + "loss": 0.0884, + "step": 34270 + }, + { + "epoch": 3.245597424730165, + "grad_norm": 0.4901501536369324, + "learning_rate": 7.017610301079342e-06, + "loss": 0.1004, + "step": 34280 + }, + { + "epoch": 3.2465442151107746, + "grad_norm": 0.39414164423942566, + "learning_rate": 7.013823139556902e-06, + "loss": 0.0874, + "step": 34290 + }, + { + "epoch": 3.2474910054913844, + "grad_norm": 0.4422838091850281, + "learning_rate": 7.010035978034464e-06, + "loss": 0.0935, + "step": 34300 + }, + { + "epoch": 3.248437795871994, + "grad_norm": 0.41335728764533997, + "learning_rate": 7.006248816512025e-06, + "loss": 0.0939, + "step": 34310 + }, + { + "epoch": 3.249384586252604, + "grad_norm": 0.5088624954223633, + "learning_rate": 7.002461654989586e-06, + "loss": 0.1019, + "step": 34320 + }, + { + "epoch": 3.2503313766332136, + "grad_norm": 0.6368246078491211, + "learning_rate": 6.998674493467147e-06, + "loss": 0.1008, + "step": 34330 + }, + { + "epoch": 3.2512781670138233, + "grad_norm": 0.47923582792282104, + "learning_rate": 6.994887331944707e-06, + "loss": 0.1007, + "step": 34340 + }, + { + "epoch": 3.252224957394433, + "grad_norm": 0.3951050341129303, + "learning_rate": 6.991100170422269e-06, + "loss": 0.1036, + "step": 34350 + }, + { + "epoch": 3.253171747775043, + "grad_norm": 0.4178468883037567, + "learning_rate": 6.98731300889983e-06, + "loss": 0.0983, + "step": 34360 + }, + { + "epoch": 3.2541185381556526, + "grad_norm": 0.4253663420677185, + "learning_rate": 6.983525847377392e-06, + "loss": 0.0855, + "step": 34370 + }, + { + "epoch": 3.2550653285362623, + "grad_norm": 0.47536349296569824, + "learning_rate": 6.979738685854952e-06, + "loss": 0.0934, + "step": 34380 + }, + { + "epoch": 3.256012118916872, + "grad_norm": 0.49580034613609314, + "learning_rate": 6.9759515243325135e-06, + "loss": 0.0931, + "step": 34390 + }, + { + "epoch": 3.2569589092974818, + "grad_norm": 0.5044097900390625, + "learning_rate": 6.972164362810074e-06, + "loss": 0.0866, + "step": 34400 + }, + { + "epoch": 3.2579056996780915, + "grad_norm": 0.5280126333236694, + "learning_rate": 6.968377201287635e-06, + "loss": 0.0879, + "step": 34410 + }, + { + "epoch": 3.258852490058701, + "grad_norm": 0.5161647200584412, + "learning_rate": 6.964590039765197e-06, + "loss": 0.1006, + "step": 34420 + }, + { + "epoch": 3.2597992804393106, + "grad_norm": 0.48363789916038513, + "learning_rate": 6.960802878242757e-06, + "loss": 0.0898, + "step": 34430 + }, + { + "epoch": 3.2607460708199203, + "grad_norm": 0.47649186849594116, + "learning_rate": 6.957015716720319e-06, + "loss": 0.0884, + "step": 34440 + }, + { + "epoch": 3.26169286120053, + "grad_norm": 0.535963237285614, + "learning_rate": 6.95322855519788e-06, + "loss": 0.1064, + "step": 34450 + }, + { + "epoch": 3.2626396515811398, + "grad_norm": 0.5076953768730164, + "learning_rate": 6.9494413936754415e-06, + "loss": 0.0916, + "step": 34460 + }, + { + "epoch": 3.2635864419617495, + "grad_norm": 0.4224216639995575, + "learning_rate": 6.9456542321530015e-06, + "loss": 0.0994, + "step": 34470 + }, + { + "epoch": 3.2645332323423593, + "grad_norm": 0.6081414222717285, + "learning_rate": 6.941867070630562e-06, + "loss": 0.0992, + "step": 34480 + }, + { + "epoch": 3.265480022722969, + "grad_norm": 0.5121296644210815, + "learning_rate": 6.938079909108124e-06, + "loss": 0.0913, + "step": 34490 + }, + { + "epoch": 3.2664268131035787, + "grad_norm": 0.4258511960506439, + "learning_rate": 6.934292747585685e-06, + "loss": 0.0968, + "step": 34500 + }, + { + "epoch": 3.2673736034841885, + "grad_norm": 0.5431085228919983, + "learning_rate": 6.930505586063247e-06, + "loss": 0.0998, + "step": 34510 + }, + { + "epoch": 3.2683203938647982, + "grad_norm": 0.5170323252677917, + "learning_rate": 6.926718424540807e-06, + "loss": 0.0922, + "step": 34520 + }, + { + "epoch": 3.269267184245408, + "grad_norm": 0.5068280100822449, + "learning_rate": 6.922931263018369e-06, + "loss": 0.0962, + "step": 34530 + }, + { + "epoch": 3.2702139746260177, + "grad_norm": 0.5490415692329407, + "learning_rate": 6.9191441014959295e-06, + "loss": 0.0962, + "step": 34540 + }, + { + "epoch": 3.2711607650066274, + "grad_norm": 0.4179328382015228, + "learning_rate": 6.9153569399734895e-06, + "loss": 0.0972, + "step": 34550 + }, + { + "epoch": 3.272107555387237, + "grad_norm": 0.5323558449745178, + "learning_rate": 6.911569778451051e-06, + "loss": 0.0989, + "step": 34560 + }, + { + "epoch": 3.273054345767847, + "grad_norm": 0.4823920726776123, + "learning_rate": 6.907782616928612e-06, + "loss": 0.0944, + "step": 34570 + }, + { + "epoch": 3.2740011361484567, + "grad_norm": 0.3574821352958679, + "learning_rate": 6.903995455406174e-06, + "loss": 0.0992, + "step": 34580 + }, + { + "epoch": 3.2749479265290664, + "grad_norm": 0.4525294005870819, + "learning_rate": 6.900208293883735e-06, + "loss": 0.1004, + "step": 34590 + }, + { + "epoch": 3.275894716909676, + "grad_norm": 0.521497905254364, + "learning_rate": 6.8964211323612966e-06, + "loss": 0.0967, + "step": 34600 + }, + { + "epoch": 3.276841507290286, + "grad_norm": 0.4805905222892761, + "learning_rate": 6.892633970838857e-06, + "loss": 0.1, + "step": 34610 + }, + { + "epoch": 3.2777882976708956, + "grad_norm": 0.6537460684776306, + "learning_rate": 6.8888468093164175e-06, + "loss": 0.0977, + "step": 34620 + }, + { + "epoch": 3.2787350880515054, + "grad_norm": 0.6015987396240234, + "learning_rate": 6.885059647793979e-06, + "loss": 0.0972, + "step": 34630 + }, + { + "epoch": 3.279681878432115, + "grad_norm": 0.6600093841552734, + "learning_rate": 6.881272486271539e-06, + "loss": 0.0933, + "step": 34640 + }, + { + "epoch": 3.280628668812725, + "grad_norm": 0.5536496043205261, + "learning_rate": 6.877485324749101e-06, + "loss": 0.1031, + "step": 34650 + }, + { + "epoch": 3.2815754591933346, + "grad_norm": 0.4587319791316986, + "learning_rate": 6.873698163226662e-06, + "loss": 0.0879, + "step": 34660 + }, + { + "epoch": 3.2825222495739443, + "grad_norm": 0.6899036169052124, + "learning_rate": 6.869911001704224e-06, + "loss": 0.0961, + "step": 34670 + }, + { + "epoch": 3.283469039954554, + "grad_norm": 0.488418310880661, + "learning_rate": 6.8661238401817846e-06, + "loss": 0.1059, + "step": 34680 + }, + { + "epoch": 3.284415830335164, + "grad_norm": 0.42573970556259155, + "learning_rate": 6.862336678659345e-06, + "loss": 0.1001, + "step": 34690 + }, + { + "epoch": 3.2853626207157736, + "grad_norm": 0.5093210339546204, + "learning_rate": 6.858549517136906e-06, + "loss": 0.0984, + "step": 34700 + }, + { + "epoch": 3.2863094110963833, + "grad_norm": 0.5334013104438782, + "learning_rate": 6.854762355614467e-06, + "loss": 0.1044, + "step": 34710 + }, + { + "epoch": 3.287256201476993, + "grad_norm": 0.5470275282859802, + "learning_rate": 6.850975194092029e-06, + "loss": 0.0953, + "step": 34720 + }, + { + "epoch": 3.288202991857603, + "grad_norm": 0.4627389907836914, + "learning_rate": 6.847188032569589e-06, + "loss": 0.0926, + "step": 34730 + }, + { + "epoch": 3.2891497822382125, + "grad_norm": 0.4343864917755127, + "learning_rate": 6.843400871047151e-06, + "loss": 0.1108, + "step": 34740 + }, + { + "epoch": 3.2900965726188223, + "grad_norm": 0.4533028304576874, + "learning_rate": 6.839613709524712e-06, + "loss": 0.1036, + "step": 34750 + }, + { + "epoch": 3.291043362999432, + "grad_norm": 0.5282678008079529, + "learning_rate": 6.8358265480022734e-06, + "loss": 0.1052, + "step": 34760 + }, + { + "epoch": 3.2919901533800418, + "grad_norm": 0.4647209644317627, + "learning_rate": 6.832039386479834e-06, + "loss": 0.0928, + "step": 34770 + }, + { + "epoch": 3.2929369437606515, + "grad_norm": 0.5247597694396973, + "learning_rate": 6.828252224957394e-06, + "loss": 0.0964, + "step": 34780 + }, + { + "epoch": 3.2938837341412612, + "grad_norm": 0.5702017545700073, + "learning_rate": 6.824465063434956e-06, + "loss": 0.1065, + "step": 34790 + }, + { + "epoch": 3.294830524521871, + "grad_norm": 0.5594719052314758, + "learning_rate": 6.820677901912517e-06, + "loss": 0.0919, + "step": 34800 + }, + { + "epoch": 3.2957773149024807, + "grad_norm": 0.4289184510707855, + "learning_rate": 6.816890740390079e-06, + "loss": 0.0946, + "step": 34810 + }, + { + "epoch": 3.2967241052830905, + "grad_norm": 0.4763040542602539, + "learning_rate": 6.813103578867639e-06, + "loss": 0.0976, + "step": 34820 + }, + { + "epoch": 3.2976708956637, + "grad_norm": 0.4474315643310547, + "learning_rate": 6.8093164173452006e-06, + "loss": 0.0918, + "step": 34830 + }, + { + "epoch": 3.29861768604431, + "grad_norm": 0.5152229070663452, + "learning_rate": 6.8055292558227615e-06, + "loss": 0.0953, + "step": 34840 + }, + { + "epoch": 3.2995644764249197, + "grad_norm": 0.43370160460472107, + "learning_rate": 6.801742094300322e-06, + "loss": 0.0957, + "step": 34850 + }, + { + "epoch": 3.3005112668055294, + "grad_norm": 0.4983856976032257, + "learning_rate": 6.797954932777884e-06, + "loss": 0.0954, + "step": 34860 + }, + { + "epoch": 3.301458057186139, + "grad_norm": 0.43842777609825134, + "learning_rate": 6.794167771255444e-06, + "loss": 0.0968, + "step": 34870 + }, + { + "epoch": 3.302404847566749, + "grad_norm": 0.41550055146217346, + "learning_rate": 6.790380609733006e-06, + "loss": 0.0988, + "step": 34880 + }, + { + "epoch": 3.3033516379473586, + "grad_norm": 0.45826441049575806, + "learning_rate": 6.786593448210567e-06, + "loss": 0.0923, + "step": 34890 + }, + { + "epoch": 3.304298428327968, + "grad_norm": 0.5877769589424133, + "learning_rate": 6.7828062866881285e-06, + "loss": 0.0995, + "step": 34900 + }, + { + "epoch": 3.3052452187085777, + "grad_norm": 0.5494678616523743, + "learning_rate": 6.779019125165689e-06, + "loss": 0.0944, + "step": 34910 + }, + { + "epoch": 3.3061920090891874, + "grad_norm": 0.48389655351638794, + "learning_rate": 6.7752319636432495e-06, + "loss": 0.1098, + "step": 34920 + }, + { + "epoch": 3.307138799469797, + "grad_norm": 0.4657025635242462, + "learning_rate": 6.771444802120811e-06, + "loss": 0.0941, + "step": 34930 + }, + { + "epoch": 3.308085589850407, + "grad_norm": 0.4596572816371918, + "learning_rate": 6.767657640598372e-06, + "loss": 0.0945, + "step": 34940 + }, + { + "epoch": 3.3090323802310166, + "grad_norm": 0.5223503708839417, + "learning_rate": 6.763870479075934e-06, + "loss": 0.1009, + "step": 34950 + }, + { + "epoch": 3.3099791706116264, + "grad_norm": 0.6406366229057312, + "learning_rate": 6.760083317553494e-06, + "loss": 0.0995, + "step": 34960 + }, + { + "epoch": 3.310925960992236, + "grad_norm": 0.4907243549823761, + "learning_rate": 6.756296156031056e-06, + "loss": 0.0971, + "step": 34970 + }, + { + "epoch": 3.311872751372846, + "grad_norm": 0.49459734559059143, + "learning_rate": 6.7525089945086166e-06, + "loss": 0.0962, + "step": 34980 + }, + { + "epoch": 3.3128195417534556, + "grad_norm": 0.46839937567710876, + "learning_rate": 6.748721832986177e-06, + "loss": 0.0904, + "step": 34990 + }, + { + "epoch": 3.3137663321340654, + "grad_norm": 0.4720109701156616, + "learning_rate": 6.744934671463738e-06, + "loss": 0.1, + "step": 35000 + }, + { + "epoch": 3.314713122514675, + "grad_norm": 0.49172693490982056, + "learning_rate": 6.741147509941299e-06, + "loss": 0.0925, + "step": 35010 + }, + { + "epoch": 3.315659912895285, + "grad_norm": 0.4704832434654236, + "learning_rate": 6.737360348418861e-06, + "loss": 0.1016, + "step": 35020 + }, + { + "epoch": 3.3166067032758946, + "grad_norm": 0.4528552293777466, + "learning_rate": 6.733573186896422e-06, + "loss": 0.1003, + "step": 35030 + }, + { + "epoch": 3.3175534936565043, + "grad_norm": 0.44715917110443115, + "learning_rate": 6.729786025373984e-06, + "loss": 0.0942, + "step": 35040 + }, + { + "epoch": 3.318500284037114, + "grad_norm": 0.4122602641582489, + "learning_rate": 6.725998863851544e-06, + "loss": 0.0961, + "step": 35050 + }, + { + "epoch": 3.319447074417724, + "grad_norm": 0.4888918101787567, + "learning_rate": 6.7222117023291046e-06, + "loss": 0.1032, + "step": 35060 + }, + { + "epoch": 3.3203938647983335, + "grad_norm": 0.527289092540741, + "learning_rate": 6.718424540806666e-06, + "loss": 0.0961, + "step": 35070 + }, + { + "epoch": 3.3213406551789433, + "grad_norm": 0.47067925333976746, + "learning_rate": 6.714637379284226e-06, + "loss": 0.0933, + "step": 35080 + }, + { + "epoch": 3.322287445559553, + "grad_norm": 0.4370437562465668, + "learning_rate": 6.710850217761788e-06, + "loss": 0.0973, + "step": 35090 + }, + { + "epoch": 3.3232342359401628, + "grad_norm": 0.41316908597946167, + "learning_rate": 6.707063056239349e-06, + "loss": 0.0961, + "step": 35100 + }, + { + "epoch": 3.3241810263207725, + "grad_norm": 0.4926522672176361, + "learning_rate": 6.703275894716911e-06, + "loss": 0.1046, + "step": 35110 + }, + { + "epoch": 3.3251278167013822, + "grad_norm": 0.5030735731124878, + "learning_rate": 6.699488733194472e-06, + "loss": 0.0908, + "step": 35120 + }, + { + "epoch": 3.326074607081992, + "grad_norm": 0.4518188238143921, + "learning_rate": 6.695701571672032e-06, + "loss": 0.0955, + "step": 35130 + }, + { + "epoch": 3.3270213974626017, + "grad_norm": 0.43322181701660156, + "learning_rate": 6.6919144101495934e-06, + "loss": 0.0939, + "step": 35140 + }, + { + "epoch": 3.3279681878432115, + "grad_norm": 0.4343359172344208, + "learning_rate": 6.688127248627154e-06, + "loss": 0.0971, + "step": 35150 + }, + { + "epoch": 3.328914978223821, + "grad_norm": 0.48641207814216614, + "learning_rate": 6.684340087104716e-06, + "loss": 0.0996, + "step": 35160 + }, + { + "epoch": 3.329861768604431, + "grad_norm": 0.5804575085639954, + "learning_rate": 6.680552925582276e-06, + "loss": 0.0993, + "step": 35170 + }, + { + "epoch": 3.3308085589850407, + "grad_norm": 0.6237295866012573, + "learning_rate": 6.676765764059838e-06, + "loss": 0.0888, + "step": 35180 + }, + { + "epoch": 3.3317553493656504, + "grad_norm": 0.44823551177978516, + "learning_rate": 6.672978602537399e-06, + "loss": 0.0896, + "step": 35190 + }, + { + "epoch": 3.33270213974626, + "grad_norm": 0.617464005947113, + "learning_rate": 6.66919144101496e-06, + "loss": 0.1059, + "step": 35200 + }, + { + "epoch": 3.33364893012687, + "grad_norm": 0.5996119379997253, + "learning_rate": 6.665404279492521e-06, + "loss": 0.0877, + "step": 35210 + }, + { + "epoch": 3.3345957205074797, + "grad_norm": 0.6480814814567566, + "learning_rate": 6.6616171179700815e-06, + "loss": 0.1111, + "step": 35220 + }, + { + "epoch": 3.3355425108880894, + "grad_norm": 0.5987730622291565, + "learning_rate": 6.657829956447643e-06, + "loss": 0.0859, + "step": 35230 + }, + { + "epoch": 3.336489301268699, + "grad_norm": 0.5056741237640381, + "learning_rate": 6.654042794925204e-06, + "loss": 0.0893, + "step": 35240 + }, + { + "epoch": 3.337436091649309, + "grad_norm": 0.5164422988891602, + "learning_rate": 6.650255633402766e-06, + "loss": 0.0987, + "step": 35250 + }, + { + "epoch": 3.3383828820299186, + "grad_norm": 0.4121684432029724, + "learning_rate": 6.646468471880326e-06, + "loss": 0.0892, + "step": 35260 + }, + { + "epoch": 3.3393296724105284, + "grad_norm": 0.47107455134391785, + "learning_rate": 6.642681310357887e-06, + "loss": 0.0872, + "step": 35270 + }, + { + "epoch": 3.340276462791138, + "grad_norm": 0.5224701166152954, + "learning_rate": 6.6388941488354485e-06, + "loss": 0.1134, + "step": 35280 + }, + { + "epoch": 3.341223253171748, + "grad_norm": 0.45090168714523315, + "learning_rate": 6.6351069873130094e-06, + "loss": 0.093, + "step": 35290 + }, + { + "epoch": 3.3421700435523576, + "grad_norm": 0.2751280963420868, + "learning_rate": 6.631319825790571e-06, + "loss": 0.082, + "step": 35300 + }, + { + "epoch": 3.3431168339329673, + "grad_norm": 0.5314486622810364, + "learning_rate": 6.627532664268131e-06, + "loss": 0.0954, + "step": 35310 + }, + { + "epoch": 3.344063624313577, + "grad_norm": 0.4997088611125946, + "learning_rate": 6.623745502745693e-06, + "loss": 0.093, + "step": 35320 + }, + { + "epoch": 3.345010414694187, + "grad_norm": 0.40355512499809265, + "learning_rate": 6.619958341223254e-06, + "loss": 0.0949, + "step": 35330 + }, + { + "epoch": 3.3459572050747965, + "grad_norm": 0.4665563106536865, + "learning_rate": 6.616171179700815e-06, + "loss": 0.0909, + "step": 35340 + }, + { + "epoch": 3.3469039954554063, + "grad_norm": 0.4175027012825012, + "learning_rate": 6.612384018178376e-06, + "loss": 0.0892, + "step": 35350 + }, + { + "epoch": 3.347850785836016, + "grad_norm": 0.5048304796218872, + "learning_rate": 6.6085968566559366e-06, + "loss": 0.0952, + "step": 35360 + }, + { + "epoch": 3.3487975762166258, + "grad_norm": 0.3938494026660919, + "learning_rate": 6.604809695133498e-06, + "loss": 0.0874, + "step": 35370 + }, + { + "epoch": 3.3497443665972355, + "grad_norm": 0.4948226809501648, + "learning_rate": 6.601022533611059e-06, + "loss": 0.1051, + "step": 35380 + }, + { + "epoch": 3.3506911569778453, + "grad_norm": 0.4628227651119232, + "learning_rate": 6.597235372088621e-06, + "loss": 0.1034, + "step": 35390 + }, + { + "epoch": 3.351637947358455, + "grad_norm": 0.4768291711807251, + "learning_rate": 6.593448210566181e-06, + "loss": 0.0944, + "step": 35400 + }, + { + "epoch": 3.3525847377390647, + "grad_norm": 0.5109726190567017, + "learning_rate": 6.589661049043742e-06, + "loss": 0.098, + "step": 35410 + }, + { + "epoch": 3.3535315281196745, + "grad_norm": 0.49673202633857727, + "learning_rate": 6.585873887521304e-06, + "loss": 0.0965, + "step": 35420 + }, + { + "epoch": 3.354478318500284, + "grad_norm": 0.46202951669692993, + "learning_rate": 6.5820867259988645e-06, + "loss": 0.0868, + "step": 35430 + }, + { + "epoch": 3.355425108880894, + "grad_norm": 0.5012629628181458, + "learning_rate": 6.578299564476425e-06, + "loss": 0.1084, + "step": 35440 + }, + { + "epoch": 3.3563718992615037, + "grad_norm": 0.44407936930656433, + "learning_rate": 6.574512402953986e-06, + "loss": 0.0925, + "step": 35450 + }, + { + "epoch": 3.3573186896421134, + "grad_norm": 0.6375394463539124, + "learning_rate": 6.570725241431548e-06, + "loss": 0.1051, + "step": 35460 + }, + { + "epoch": 3.358265480022723, + "grad_norm": 0.4182000756263733, + "learning_rate": 6.566938079909109e-06, + "loss": 0.1004, + "step": 35470 + }, + { + "epoch": 3.359212270403333, + "grad_norm": 0.40456700325012207, + "learning_rate": 6.563150918386669e-06, + "loss": 0.0903, + "step": 35480 + }, + { + "epoch": 3.3601590607839427, + "grad_norm": 0.47193360328674316, + "learning_rate": 6.559363756864231e-06, + "loss": 0.1049, + "step": 35490 + }, + { + "epoch": 3.3611058511645524, + "grad_norm": 0.44254809617996216, + "learning_rate": 6.555576595341792e-06, + "loss": 0.0924, + "step": 35500 + }, + { + "epoch": 3.362052641545162, + "grad_norm": 0.44690197706222534, + "learning_rate": 6.551789433819353e-06, + "loss": 0.1009, + "step": 35510 + }, + { + "epoch": 3.3629994319257714, + "grad_norm": 0.32038065791130066, + "learning_rate": 6.548002272296914e-06, + "loss": 0.0964, + "step": 35520 + }, + { + "epoch": 3.363946222306381, + "grad_norm": 0.42337891459465027, + "learning_rate": 6.544215110774475e-06, + "loss": 0.0983, + "step": 35530 + }, + { + "epoch": 3.364893012686991, + "grad_norm": 0.45714786648750305, + "learning_rate": 6.540427949252036e-06, + "loss": 0.0878, + "step": 35540 + }, + { + "epoch": 3.3658398030676007, + "grad_norm": 0.4319290220737457, + "learning_rate": 6.536640787729597e-06, + "loss": 0.0933, + "step": 35550 + }, + { + "epoch": 3.3667865934482104, + "grad_norm": 0.5740752816200256, + "learning_rate": 6.532853626207159e-06, + "loss": 0.0961, + "step": 35560 + }, + { + "epoch": 3.36773338382882, + "grad_norm": 0.5247994661331177, + "learning_rate": 6.529066464684719e-06, + "loss": 0.0912, + "step": 35570 + }, + { + "epoch": 3.36868017420943, + "grad_norm": 0.5578654408454895, + "learning_rate": 6.5252793031622805e-06, + "loss": 0.0926, + "step": 35580 + }, + { + "epoch": 3.3696269645900396, + "grad_norm": 0.4753609895706177, + "learning_rate": 6.521492141639841e-06, + "loss": 0.0999, + "step": 35590 + }, + { + "epoch": 3.3705737549706494, + "grad_norm": 0.4210730195045471, + "learning_rate": 6.517704980117403e-06, + "loss": 0.0935, + "step": 35600 + }, + { + "epoch": 3.371520545351259, + "grad_norm": 0.5239378809928894, + "learning_rate": 6.513917818594964e-06, + "loss": 0.0998, + "step": 35610 + }, + { + "epoch": 3.372467335731869, + "grad_norm": 0.554095447063446, + "learning_rate": 6.510130657072524e-06, + "loss": 0.0968, + "step": 35620 + }, + { + "epoch": 3.3734141261124786, + "grad_norm": 0.5286445617675781, + "learning_rate": 6.506343495550086e-06, + "loss": 0.0887, + "step": 35630 + }, + { + "epoch": 3.3743609164930883, + "grad_norm": 0.4211771488189697, + "learning_rate": 6.502556334027647e-06, + "loss": 0.0887, + "step": 35640 + }, + { + "epoch": 3.375307706873698, + "grad_norm": 0.4598631262779236, + "learning_rate": 6.4987691725052085e-06, + "loss": 0.095, + "step": 35650 + }, + { + "epoch": 3.376254497254308, + "grad_norm": 0.4372754991054535, + "learning_rate": 6.4949820109827685e-06, + "loss": 0.0942, + "step": 35660 + }, + { + "epoch": 3.3772012876349176, + "grad_norm": 0.5444381833076477, + "learning_rate": 6.49119484946033e-06, + "loss": 0.09, + "step": 35670 + }, + { + "epoch": 3.3781480780155273, + "grad_norm": 0.597549557685852, + "learning_rate": 6.487407687937891e-06, + "loss": 0.0895, + "step": 35680 + }, + { + "epoch": 3.379094868396137, + "grad_norm": 0.4873766005039215, + "learning_rate": 6.483620526415452e-06, + "loss": 0.0912, + "step": 35690 + }, + { + "epoch": 3.380041658776747, + "grad_norm": 0.4911908507347107, + "learning_rate": 6.479833364893014e-06, + "loss": 0.1113, + "step": 35700 + }, + { + "epoch": 3.3809884491573565, + "grad_norm": 0.5038026571273804, + "learning_rate": 6.476046203370574e-06, + "loss": 0.0958, + "step": 35710 + }, + { + "epoch": 3.3819352395379663, + "grad_norm": 0.4452175498008728, + "learning_rate": 6.472259041848136e-06, + "loss": 0.0898, + "step": 35720 + }, + { + "epoch": 3.382882029918576, + "grad_norm": 0.4882689416408539, + "learning_rate": 6.4684718803256965e-06, + "loss": 0.0961, + "step": 35730 + }, + { + "epoch": 3.3838288202991857, + "grad_norm": 0.4825945794582367, + "learning_rate": 6.464684718803258e-06, + "loss": 0.0926, + "step": 35740 + }, + { + "epoch": 3.3847756106797955, + "grad_norm": 0.4384516477584839, + "learning_rate": 6.460897557280818e-06, + "loss": 0.0864, + "step": 35750 + }, + { + "epoch": 3.3857224010604052, + "grad_norm": 0.5509099960327148, + "learning_rate": 6.457110395758379e-06, + "loss": 0.1007, + "step": 35760 + }, + { + "epoch": 3.386669191441015, + "grad_norm": 0.6205552220344543, + "learning_rate": 6.453323234235941e-06, + "loss": 0.0955, + "step": 35770 + }, + { + "epoch": 3.3876159818216247, + "grad_norm": 0.42575883865356445, + "learning_rate": 6.449536072713502e-06, + "loss": 0.0985, + "step": 35780 + }, + { + "epoch": 3.3885627722022345, + "grad_norm": 0.5736665725708008, + "learning_rate": 6.4457489111910636e-06, + "loss": 0.0954, + "step": 35790 + }, + { + "epoch": 3.389509562582844, + "grad_norm": 0.416550874710083, + "learning_rate": 6.441961749668624e-06, + "loss": 0.0929, + "step": 35800 + }, + { + "epoch": 3.390456352963454, + "grad_norm": 0.4721170961856842, + "learning_rate": 6.438174588146185e-06, + "loss": 0.0939, + "step": 35810 + }, + { + "epoch": 3.3914031433440637, + "grad_norm": 0.44223690032958984, + "learning_rate": 6.434387426623746e-06, + "loss": 0.0905, + "step": 35820 + }, + { + "epoch": 3.3923499337246734, + "grad_norm": 0.48436668515205383, + "learning_rate": 6.430600265101306e-06, + "loss": 0.1023, + "step": 35830 + }, + { + "epoch": 3.393296724105283, + "grad_norm": 0.48289725184440613, + "learning_rate": 6.426813103578868e-06, + "loss": 0.096, + "step": 35840 + }, + { + "epoch": 3.394243514485893, + "grad_norm": 0.5580629110336304, + "learning_rate": 6.423025942056429e-06, + "loss": 0.1035, + "step": 35850 + }, + { + "epoch": 3.3951903048665026, + "grad_norm": 0.33182501792907715, + "learning_rate": 6.419238780533991e-06, + "loss": 0.0972, + "step": 35860 + }, + { + "epoch": 3.3961370952471124, + "grad_norm": 0.5468883514404297, + "learning_rate": 6.415451619011552e-06, + "loss": 0.1044, + "step": 35870 + }, + { + "epoch": 3.397083885627722, + "grad_norm": 0.4226824939250946, + "learning_rate": 6.411664457489113e-06, + "loss": 0.0929, + "step": 35880 + }, + { + "epoch": 3.398030676008332, + "grad_norm": 0.46152549982070923, + "learning_rate": 6.407877295966673e-06, + "loss": 0.1023, + "step": 35890 + }, + { + "epoch": 3.3989774663889416, + "grad_norm": 0.5353741645812988, + "learning_rate": 6.404090134444234e-06, + "loss": 0.1086, + "step": 35900 + }, + { + "epoch": 3.3999242567695513, + "grad_norm": 0.43531525135040283, + "learning_rate": 6.400302972921796e-06, + "loss": 0.1058, + "step": 35910 + }, + { + "epoch": 3.400871047150161, + "grad_norm": 0.5732455253601074, + "learning_rate": 6.396515811399356e-06, + "loss": 0.0953, + "step": 35920 + }, + { + "epoch": 3.401817837530771, + "grad_norm": 0.453078955411911, + "learning_rate": 6.392728649876918e-06, + "loss": 0.1015, + "step": 35930 + }, + { + "epoch": 3.4027646279113806, + "grad_norm": 0.48207634687423706, + "learning_rate": 6.388941488354479e-06, + "loss": 0.1028, + "step": 35940 + }, + { + "epoch": 3.4037114182919903, + "grad_norm": 0.5186272263526917, + "learning_rate": 6.3851543268320405e-06, + "loss": 0.0925, + "step": 35950 + }, + { + "epoch": 3.4046582086726, + "grad_norm": 0.5188668370246887, + "learning_rate": 6.381367165309601e-06, + "loss": 0.1047, + "step": 35960 + }, + { + "epoch": 3.40560499905321, + "grad_norm": 0.42555683851242065, + "learning_rate": 6.377580003787161e-06, + "loss": 0.1046, + "step": 35970 + }, + { + "epoch": 3.4065517894338195, + "grad_norm": 0.615644633769989, + "learning_rate": 6.373792842264723e-06, + "loss": 0.1068, + "step": 35980 + }, + { + "epoch": 3.4074985798144293, + "grad_norm": 0.5070286393165588, + "learning_rate": 6.370005680742284e-06, + "loss": 0.0975, + "step": 35990 + }, + { + "epoch": 3.4084453701950386, + "grad_norm": 0.5518382787704468, + "learning_rate": 6.366218519219846e-06, + "loss": 0.099, + "step": 36000 + }, + { + "epoch": 3.4093921605756483, + "grad_norm": 0.45891568064689636, + "learning_rate": 6.362431357697406e-06, + "loss": 0.0898, + "step": 36010 + }, + { + "epoch": 3.410338950956258, + "grad_norm": 0.5730379223823547, + "learning_rate": 6.358644196174968e-06, + "loss": 0.0962, + "step": 36020 + }, + { + "epoch": 3.411285741336868, + "grad_norm": 0.3990994691848755, + "learning_rate": 6.3548570346525285e-06, + "loss": 0.0846, + "step": 36030 + }, + { + "epoch": 3.4122325317174775, + "grad_norm": 0.5104419589042664, + "learning_rate": 6.351069873130089e-06, + "loss": 0.1066, + "step": 36040 + }, + { + "epoch": 3.4131793220980873, + "grad_norm": 0.46190160512924194, + "learning_rate": 6.347282711607651e-06, + "loss": 0.0952, + "step": 36050 + }, + { + "epoch": 3.414126112478697, + "grad_norm": 0.4073682129383087, + "learning_rate": 6.343495550085211e-06, + "loss": 0.0966, + "step": 36060 + }, + { + "epoch": 3.4150729028593068, + "grad_norm": 0.5171728730201721, + "learning_rate": 6.339708388562773e-06, + "loss": 0.1023, + "step": 36070 + }, + { + "epoch": 3.4160196932399165, + "grad_norm": 0.4601489305496216, + "learning_rate": 6.335921227040334e-06, + "loss": 0.0856, + "step": 36080 + }, + { + "epoch": 3.4169664836205262, + "grad_norm": 0.4700734317302704, + "learning_rate": 6.3321340655178956e-06, + "loss": 0.0939, + "step": 36090 + }, + { + "epoch": 3.417913274001136, + "grad_norm": 0.5556718111038208, + "learning_rate": 6.328346903995456e-06, + "loss": 0.084, + "step": 36100 + }, + { + "epoch": 3.4188600643817457, + "grad_norm": 0.5419147610664368, + "learning_rate": 6.3245597424730165e-06, + "loss": 0.0897, + "step": 36110 + }, + { + "epoch": 3.4198068547623555, + "grad_norm": 0.5299506783485413, + "learning_rate": 6.320772580950578e-06, + "loss": 0.0937, + "step": 36120 + }, + { + "epoch": 3.420753645142965, + "grad_norm": 0.4458409249782562, + "learning_rate": 6.316985419428139e-06, + "loss": 0.0833, + "step": 36130 + }, + { + "epoch": 3.421700435523575, + "grad_norm": 0.35562217235565186, + "learning_rate": 6.313198257905701e-06, + "loss": 0.1003, + "step": 36140 + }, + { + "epoch": 3.4226472259041847, + "grad_norm": 0.40143030881881714, + "learning_rate": 6.309411096383261e-06, + "loss": 0.0882, + "step": 36150 + }, + { + "epoch": 3.4235940162847944, + "grad_norm": 0.479168176651001, + "learning_rate": 6.305623934860823e-06, + "loss": 0.0951, + "step": 36160 + }, + { + "epoch": 3.424540806665404, + "grad_norm": 0.5207576751708984, + "learning_rate": 6.3018367733383836e-06, + "loss": 0.1059, + "step": 36170 + }, + { + "epoch": 3.425487597046014, + "grad_norm": 0.46082577109336853, + "learning_rate": 6.298049611815944e-06, + "loss": 0.0926, + "step": 36180 + }, + { + "epoch": 3.4264343874266237, + "grad_norm": 0.45255082845687866, + "learning_rate": 6.294262450293505e-06, + "loss": 0.0921, + "step": 36190 + }, + { + "epoch": 3.4273811778072334, + "grad_norm": 0.4582348167896271, + "learning_rate": 6.290475288771066e-06, + "loss": 0.1018, + "step": 36200 + }, + { + "epoch": 3.428327968187843, + "grad_norm": 0.5190147757530212, + "learning_rate": 6.286688127248628e-06, + "loss": 0.0997, + "step": 36210 + }, + { + "epoch": 3.429274758568453, + "grad_norm": 0.5631082653999329, + "learning_rate": 6.282900965726189e-06, + "loss": 0.094, + "step": 36220 + }, + { + "epoch": 3.4302215489490626, + "grad_norm": 0.45488786697387695, + "learning_rate": 6.279113804203751e-06, + "loss": 0.0881, + "step": 36230 + }, + { + "epoch": 3.4311683393296724, + "grad_norm": 0.4592917263507843, + "learning_rate": 6.275326642681311e-06, + "loss": 0.1066, + "step": 36240 + }, + { + "epoch": 3.432115129710282, + "grad_norm": 0.49670347571372986, + "learning_rate": 6.271539481158872e-06, + "loss": 0.0913, + "step": 36250 + }, + { + "epoch": 3.433061920090892, + "grad_norm": 0.5354011654853821, + "learning_rate": 6.267752319636433e-06, + "loss": 0.0948, + "step": 36260 + }, + { + "epoch": 3.4340087104715016, + "grad_norm": 0.5139631628990173, + "learning_rate": 6.263965158113993e-06, + "loss": 0.1055, + "step": 36270 + }, + { + "epoch": 3.4349555008521113, + "grad_norm": 0.5732100009918213, + "learning_rate": 6.260177996591555e-06, + "loss": 0.1018, + "step": 36280 + }, + { + "epoch": 3.435902291232721, + "grad_norm": 0.6240257620811462, + "learning_rate": 6.256390835069116e-06, + "loss": 0.1047, + "step": 36290 + }, + { + "epoch": 3.436849081613331, + "grad_norm": 0.4386384189128876, + "learning_rate": 6.252603673546678e-06, + "loss": 0.1011, + "step": 36300 + }, + { + "epoch": 3.4377958719939405, + "grad_norm": 0.4700325131416321, + "learning_rate": 6.248816512024239e-06, + "loss": 0.0957, + "step": 36310 + }, + { + "epoch": 3.4387426623745503, + "grad_norm": 0.41461843252182007, + "learning_rate": 6.245029350501799e-06, + "loss": 0.096, + "step": 36320 + }, + { + "epoch": 3.43968945275516, + "grad_norm": 0.4554722309112549, + "learning_rate": 6.2412421889793605e-06, + "loss": 0.0852, + "step": 36330 + }, + { + "epoch": 3.4406362431357698, + "grad_norm": 0.6045703887939453, + "learning_rate": 6.237455027456921e-06, + "loss": 0.0948, + "step": 36340 + }, + { + "epoch": 3.4415830335163795, + "grad_norm": 0.48058703541755676, + "learning_rate": 6.233667865934483e-06, + "loss": 0.094, + "step": 36350 + }, + { + "epoch": 3.4425298238969892, + "grad_norm": 0.4483596086502075, + "learning_rate": 6.229880704412043e-06, + "loss": 0.0971, + "step": 36360 + }, + { + "epoch": 3.443476614277599, + "grad_norm": 0.43034300208091736, + "learning_rate": 6.226093542889605e-06, + "loss": 0.1038, + "step": 36370 + }, + { + "epoch": 3.4444234046582087, + "grad_norm": 0.6224856972694397, + "learning_rate": 6.222306381367166e-06, + "loss": 0.1106, + "step": 36380 + }, + { + "epoch": 3.4453701950388185, + "grad_norm": 0.4863092005252838, + "learning_rate": 6.218519219844727e-06, + "loss": 0.0938, + "step": 36390 + }, + { + "epoch": 3.446316985419428, + "grad_norm": 0.38047119975090027, + "learning_rate": 6.2147320583222884e-06, + "loss": 0.0935, + "step": 36400 + }, + { + "epoch": 3.447263775800038, + "grad_norm": 0.5247942209243774, + "learning_rate": 6.2109448967998485e-06, + "loss": 0.0943, + "step": 36410 + }, + { + "epoch": 3.4482105661806477, + "grad_norm": 0.5480045080184937, + "learning_rate": 6.20715773527741e-06, + "loss": 0.098, + "step": 36420 + }, + { + "epoch": 3.4491573565612574, + "grad_norm": 0.4587515890598297, + "learning_rate": 6.203370573754971e-06, + "loss": 0.0984, + "step": 36430 + }, + { + "epoch": 3.450104146941867, + "grad_norm": 0.48768389225006104, + "learning_rate": 6.199583412232533e-06, + "loss": 0.1042, + "step": 36440 + }, + { + "epoch": 3.451050937322477, + "grad_norm": 0.4601198434829712, + "learning_rate": 6.195796250710093e-06, + "loss": 0.0974, + "step": 36450 + }, + { + "epoch": 3.4519977277030867, + "grad_norm": 0.48066088557243347, + "learning_rate": 6.192009089187654e-06, + "loss": 0.0966, + "step": 36460 + }, + { + "epoch": 3.4529445180836964, + "grad_norm": 0.42125844955444336, + "learning_rate": 6.1882219276652155e-06, + "loss": 0.0948, + "step": 36470 + }, + { + "epoch": 3.453891308464306, + "grad_norm": 0.5926153659820557, + "learning_rate": 6.1844347661427764e-06, + "loss": 0.0921, + "step": 36480 + }, + { + "epoch": 3.454838098844916, + "grad_norm": 0.5392172336578369, + "learning_rate": 6.180647604620338e-06, + "loss": 0.0964, + "step": 36490 + }, + { + "epoch": 3.4557848892255256, + "grad_norm": 0.4178329110145569, + "learning_rate": 6.176860443097898e-06, + "loss": 0.103, + "step": 36500 + }, + { + "epoch": 3.4567316796061354, + "grad_norm": 0.45059117674827576, + "learning_rate": 6.17307328157546e-06, + "loss": 0.0995, + "step": 36510 + }, + { + "epoch": 3.457678469986745, + "grad_norm": 0.5537148118019104, + "learning_rate": 6.169286120053021e-06, + "loss": 0.0894, + "step": 36520 + }, + { + "epoch": 3.458625260367355, + "grad_norm": 0.469635933637619, + "learning_rate": 6.165498958530581e-06, + "loss": 0.0955, + "step": 36530 + }, + { + "epoch": 3.4595720507479646, + "grad_norm": 0.4903915822505951, + "learning_rate": 6.161711797008143e-06, + "loss": 0.0952, + "step": 36540 + }, + { + "epoch": 3.4605188411285743, + "grad_norm": 0.4877898097038269, + "learning_rate": 6.1579246354857036e-06, + "loss": 0.0915, + "step": 36550 + }, + { + "epoch": 3.461465631509184, + "grad_norm": 0.4818441867828369, + "learning_rate": 6.154137473963265e-06, + "loss": 0.0957, + "step": 36560 + }, + { + "epoch": 3.462412421889794, + "grad_norm": 0.4799627661705017, + "learning_rate": 6.150350312440826e-06, + "loss": 0.1026, + "step": 36570 + }, + { + "epoch": 3.4633592122704036, + "grad_norm": 0.4602954685688019, + "learning_rate": 6.146563150918388e-06, + "loss": 0.0894, + "step": 36580 + }, + { + "epoch": 3.4643060026510133, + "grad_norm": 0.4297948479652405, + "learning_rate": 6.142775989395948e-06, + "loss": 0.0999, + "step": 36590 + }, + { + "epoch": 3.465252793031623, + "grad_norm": 0.4892706871032715, + "learning_rate": 6.138988827873509e-06, + "loss": 0.0953, + "step": 36600 + }, + { + "epoch": 3.4661995834122328, + "grad_norm": 0.5648135542869568, + "learning_rate": 6.135201666351071e-06, + "loss": 0.1001, + "step": 36610 + }, + { + "epoch": 3.467146373792842, + "grad_norm": 0.4892693758010864, + "learning_rate": 6.131414504828631e-06, + "loss": 0.0975, + "step": 36620 + }, + { + "epoch": 3.468093164173452, + "grad_norm": 0.4457409679889679, + "learning_rate": 6.1276273433061924e-06, + "loss": 0.0948, + "step": 36630 + }, + { + "epoch": 3.4690399545540616, + "grad_norm": 0.4559725821018219, + "learning_rate": 6.123840181783753e-06, + "loss": 0.0892, + "step": 36640 + }, + { + "epoch": 3.4699867449346713, + "grad_norm": 0.5138303637504578, + "learning_rate": 6.120053020261315e-06, + "loss": 0.0914, + "step": 36650 + }, + { + "epoch": 3.470933535315281, + "grad_norm": 0.4566517472267151, + "learning_rate": 6.116265858738876e-06, + "loss": 0.0971, + "step": 36660 + }, + { + "epoch": 3.471880325695891, + "grad_norm": 0.4594644010066986, + "learning_rate": 6.112478697216436e-06, + "loss": 0.0996, + "step": 36670 + }, + { + "epoch": 3.4728271160765005, + "grad_norm": 0.46614035964012146, + "learning_rate": 6.108691535693998e-06, + "loss": 0.1154, + "step": 36680 + }, + { + "epoch": 3.4737739064571103, + "grad_norm": 0.4599017798900604, + "learning_rate": 6.104904374171559e-06, + "loss": 0.0995, + "step": 36690 + }, + { + "epoch": 3.47472069683772, + "grad_norm": 0.4371439814567566, + "learning_rate": 6.10111721264912e-06, + "loss": 0.0844, + "step": 36700 + }, + { + "epoch": 3.4756674872183297, + "grad_norm": 0.5069402456283569, + "learning_rate": 6.0973300511266805e-06, + "loss": 0.0952, + "step": 36710 + }, + { + "epoch": 3.4766142775989395, + "grad_norm": 0.4560225307941437, + "learning_rate": 6.093542889604242e-06, + "loss": 0.0983, + "step": 36720 + }, + { + "epoch": 3.4775610679795492, + "grad_norm": 0.4633190929889679, + "learning_rate": 6.089755728081803e-06, + "loss": 0.0921, + "step": 36730 + }, + { + "epoch": 3.478507858360159, + "grad_norm": 0.3294411301612854, + "learning_rate": 6.085968566559364e-06, + "loss": 0.0891, + "step": 36740 + }, + { + "epoch": 3.4794546487407687, + "grad_norm": 0.3967805504798889, + "learning_rate": 6.082181405036926e-06, + "loss": 0.0894, + "step": 36750 + }, + { + "epoch": 3.4804014391213784, + "grad_norm": 0.4573502242565155, + "learning_rate": 6.078394243514486e-06, + "loss": 0.1026, + "step": 36760 + }, + { + "epoch": 3.481348229501988, + "grad_norm": 0.46853652596473694, + "learning_rate": 6.0746070819920475e-06, + "loss": 0.0921, + "step": 36770 + }, + { + "epoch": 3.482295019882598, + "grad_norm": 0.5473374128341675, + "learning_rate": 6.070819920469608e-06, + "loss": 0.0936, + "step": 36780 + }, + { + "epoch": 3.4832418102632077, + "grad_norm": 0.3934822976589203, + "learning_rate": 6.06703275894717e-06, + "loss": 0.0878, + "step": 36790 + }, + { + "epoch": 3.4841886006438174, + "grad_norm": 0.4433002173900604, + "learning_rate": 6.06324559742473e-06, + "loss": 0.0899, + "step": 36800 + }, + { + "epoch": 3.485135391024427, + "grad_norm": 0.5189225673675537, + "learning_rate": 6.059458435902291e-06, + "loss": 0.1038, + "step": 36810 + }, + { + "epoch": 3.486082181405037, + "grad_norm": 0.5562026500701904, + "learning_rate": 6.055671274379853e-06, + "loss": 0.0998, + "step": 36820 + }, + { + "epoch": 3.4870289717856466, + "grad_norm": 0.45497822761535645, + "learning_rate": 6.051884112857414e-06, + "loss": 0.1049, + "step": 36830 + }, + { + "epoch": 3.4879757621662564, + "grad_norm": 0.5216988921165466, + "learning_rate": 6.0480969513349755e-06, + "loss": 0.1033, + "step": 36840 + }, + { + "epoch": 3.488922552546866, + "grad_norm": 0.4457246959209442, + "learning_rate": 6.0443097898125355e-06, + "loss": 0.0898, + "step": 36850 + }, + { + "epoch": 3.489869342927476, + "grad_norm": 0.4475347697734833, + "learning_rate": 6.040522628290097e-06, + "loss": 0.1023, + "step": 36860 + }, + { + "epoch": 3.4908161333080856, + "grad_norm": 0.4940994381904602, + "learning_rate": 6.036735466767658e-06, + "loss": 0.0953, + "step": 36870 + }, + { + "epoch": 3.4917629236886953, + "grad_norm": 0.3915945291519165, + "learning_rate": 6.032948305245219e-06, + "loss": 0.0953, + "step": 36880 + }, + { + "epoch": 3.492709714069305, + "grad_norm": 0.5811683535575867, + "learning_rate": 6.02916114372278e-06, + "loss": 0.0955, + "step": 36890 + }, + { + "epoch": 3.493656504449915, + "grad_norm": 0.4573379158973694, + "learning_rate": 6.025373982200341e-06, + "loss": 0.0916, + "step": 36900 + }, + { + "epoch": 3.4946032948305246, + "grad_norm": 0.6304858922958374, + "learning_rate": 6.021586820677903e-06, + "loss": 0.0964, + "step": 36910 + }, + { + "epoch": 3.4955500852111343, + "grad_norm": 0.37654414772987366, + "learning_rate": 6.0177996591554635e-06, + "loss": 0.083, + "step": 36920 + }, + { + "epoch": 3.496496875591744, + "grad_norm": 0.5535511374473572, + "learning_rate": 6.014012497633025e-06, + "loss": 0.1011, + "step": 36930 + }, + { + "epoch": 3.497443665972354, + "grad_norm": 0.4015631675720215, + "learning_rate": 6.010225336110585e-06, + "loss": 0.087, + "step": 36940 + }, + { + "epoch": 3.4983904563529635, + "grad_norm": 0.5389937162399292, + "learning_rate": 6.006438174588146e-06, + "loss": 0.0943, + "step": 36950 + }, + { + "epoch": 3.4993372467335733, + "grad_norm": 0.48296236991882324, + "learning_rate": 6.002651013065708e-06, + "loss": 0.0984, + "step": 36960 + }, + { + "epoch": 3.500284037114183, + "grad_norm": 0.36838439106941223, + "learning_rate": 5.998863851543269e-06, + "loss": 0.0859, + "step": 36970 + }, + { + "epoch": 3.5012308274947928, + "grad_norm": 0.5506975054740906, + "learning_rate": 5.99507669002083e-06, + "loss": 0.1207, + "step": 36980 + }, + { + "epoch": 3.5021776178754025, + "grad_norm": 0.4506598114967346, + "learning_rate": 5.991289528498391e-06, + "loss": 0.0949, + "step": 36990 + }, + { + "epoch": 3.5031244082560122, + "grad_norm": 0.4446566104888916, + "learning_rate": 5.987502366975952e-06, + "loss": 0.1041, + "step": 37000 + }, + { + "epoch": 3.504071198636622, + "grad_norm": 0.43818366527557373, + "learning_rate": 5.983715205453513e-06, + "loss": 0.0986, + "step": 37010 + }, + { + "epoch": 3.5050179890172317, + "grad_norm": 0.4997681975364685, + "learning_rate": 5.979928043931073e-06, + "loss": 0.0918, + "step": 37020 + }, + { + "epoch": 3.5059647793978415, + "grad_norm": 0.5976042747497559, + "learning_rate": 5.976140882408635e-06, + "loss": 0.101, + "step": 37030 + }, + { + "epoch": 3.506911569778451, + "grad_norm": 0.43398550152778625, + "learning_rate": 5.972353720886196e-06, + "loss": 0.0949, + "step": 37040 + }, + { + "epoch": 3.507858360159061, + "grad_norm": 0.43026724457740784, + "learning_rate": 5.968566559363758e-06, + "loss": 0.0977, + "step": 37050 + }, + { + "epoch": 3.5088051505396707, + "grad_norm": 0.5097009539604187, + "learning_rate": 5.964779397841319e-06, + "loss": 0.0977, + "step": 37060 + }, + { + "epoch": 3.50975194092028, + "grad_norm": 0.36898836493492126, + "learning_rate": 5.9609922363188795e-06, + "loss": 0.0881, + "step": 37070 + }, + { + "epoch": 3.5106987313008897, + "grad_norm": 0.44765183329582214, + "learning_rate": 5.95720507479644e-06, + "loss": 0.0946, + "step": 37080 + }, + { + "epoch": 3.5116455216814995, + "grad_norm": 0.4629915654659271, + "learning_rate": 5.953417913274001e-06, + "loss": 0.0886, + "step": 37090 + }, + { + "epoch": 3.512592312062109, + "grad_norm": 0.43138405680656433, + "learning_rate": 5.949630751751563e-06, + "loss": 0.0945, + "step": 37100 + }, + { + "epoch": 3.513539102442719, + "grad_norm": 0.4578106105327606, + "learning_rate": 5.945843590229123e-06, + "loss": 0.1139, + "step": 37110 + }, + { + "epoch": 3.5144858928233287, + "grad_norm": 0.5025810599327087, + "learning_rate": 5.942056428706685e-06, + "loss": 0.1039, + "step": 37120 + }, + { + "epoch": 3.5154326832039384, + "grad_norm": 0.5416958332061768, + "learning_rate": 5.938269267184246e-06, + "loss": 0.0826, + "step": 37130 + }, + { + "epoch": 3.516379473584548, + "grad_norm": 0.4320995509624481, + "learning_rate": 5.9344821056618075e-06, + "loss": 0.1061, + "step": 37140 + }, + { + "epoch": 3.517326263965158, + "grad_norm": 0.4534364938735962, + "learning_rate": 5.930694944139368e-06, + "loss": 0.0876, + "step": 37150 + }, + { + "epoch": 3.5182730543457676, + "grad_norm": 0.5116265416145325, + "learning_rate": 5.926907782616928e-06, + "loss": 0.0907, + "step": 37160 + }, + { + "epoch": 3.5192198447263774, + "grad_norm": 0.582007646560669, + "learning_rate": 5.92312062109449e-06, + "loss": 0.1016, + "step": 37170 + }, + { + "epoch": 3.520166635106987, + "grad_norm": 0.5855867862701416, + "learning_rate": 5.919333459572051e-06, + "loss": 0.1076, + "step": 37180 + }, + { + "epoch": 3.521113425487597, + "grad_norm": 0.5251570343971252, + "learning_rate": 5.915546298049613e-06, + "loss": 0.0893, + "step": 37190 + }, + { + "epoch": 3.5220602158682066, + "grad_norm": 0.48873370885849, + "learning_rate": 5.911759136527173e-06, + "loss": 0.0926, + "step": 37200 + }, + { + "epoch": 3.5230070062488164, + "grad_norm": 0.4588128328323364, + "learning_rate": 5.907971975004735e-06, + "loss": 0.0877, + "step": 37210 + }, + { + "epoch": 3.523953796629426, + "grad_norm": 0.509213924407959, + "learning_rate": 5.9041848134822955e-06, + "loss": 0.0963, + "step": 37220 + }, + { + "epoch": 3.524900587010036, + "grad_norm": 0.4476909637451172, + "learning_rate": 5.900397651959856e-06, + "loss": 0.0957, + "step": 37230 + }, + { + "epoch": 3.5258473773906456, + "grad_norm": 0.4278342127799988, + "learning_rate": 5.896610490437418e-06, + "loss": 0.1033, + "step": 37240 + }, + { + "epoch": 3.5267941677712553, + "grad_norm": 0.41186821460723877, + "learning_rate": 5.892823328914978e-06, + "loss": 0.0943, + "step": 37250 + }, + { + "epoch": 3.527740958151865, + "grad_norm": 0.3761973977088928, + "learning_rate": 5.88903616739254e-06, + "loss": 0.1, + "step": 37260 + }, + { + "epoch": 3.528687748532475, + "grad_norm": 0.4176313281059265, + "learning_rate": 5.885249005870101e-06, + "loss": 0.0896, + "step": 37270 + }, + { + "epoch": 3.5296345389130845, + "grad_norm": 0.43153753876686096, + "learning_rate": 5.8814618443476626e-06, + "loss": 0.0949, + "step": 37280 + }, + { + "epoch": 3.5305813292936943, + "grad_norm": 0.49732184410095215, + "learning_rate": 5.877674682825223e-06, + "loss": 0.0931, + "step": 37290 + }, + { + "epoch": 3.531528119674304, + "grad_norm": 0.6734378933906555, + "learning_rate": 5.8738875213027835e-06, + "loss": 0.102, + "step": 37300 + }, + { + "epoch": 3.5324749100549138, + "grad_norm": 0.48934412002563477, + "learning_rate": 5.870100359780345e-06, + "loss": 0.0953, + "step": 37310 + }, + { + "epoch": 3.5334217004355235, + "grad_norm": 0.5085573196411133, + "learning_rate": 5.866313198257906e-06, + "loss": 0.1059, + "step": 37320 + }, + { + "epoch": 3.5343684908161332, + "grad_norm": 0.5069712996482849, + "learning_rate": 5.862526036735468e-06, + "loss": 0.0994, + "step": 37330 + }, + { + "epoch": 3.535315281196743, + "grad_norm": 0.5646600127220154, + "learning_rate": 5.858738875213028e-06, + "loss": 0.0982, + "step": 37340 + }, + { + "epoch": 3.5362620715773527, + "grad_norm": 0.3952558636665344, + "learning_rate": 5.85495171369059e-06, + "loss": 0.0928, + "step": 37350 + }, + { + "epoch": 3.5372088619579625, + "grad_norm": 0.45193013548851013, + "learning_rate": 5.851164552168151e-06, + "loss": 0.099, + "step": 37360 + }, + { + "epoch": 3.538155652338572, + "grad_norm": 0.493637353181839, + "learning_rate": 5.847377390645711e-06, + "loss": 0.0988, + "step": 37370 + }, + { + "epoch": 3.539102442719182, + "grad_norm": 0.5344982743263245, + "learning_rate": 5.843590229123272e-06, + "loss": 0.0946, + "step": 37380 + }, + { + "epoch": 3.5400492330997917, + "grad_norm": 0.4482509195804596, + "learning_rate": 5.839803067600833e-06, + "loss": 0.0998, + "step": 37390 + }, + { + "epoch": 3.5409960234804014, + "grad_norm": 0.4465737044811249, + "learning_rate": 5.836015906078395e-06, + "loss": 0.095, + "step": 37400 + }, + { + "epoch": 3.541942813861011, + "grad_norm": 0.6820048689842224, + "learning_rate": 5.832228744555956e-06, + "loss": 0.0971, + "step": 37410 + }, + { + "epoch": 3.542889604241621, + "grad_norm": 0.40104493498802185, + "learning_rate": 5.828441583033518e-06, + "loss": 0.0931, + "step": 37420 + }, + { + "epoch": 3.5438363946222307, + "grad_norm": 0.42221519351005554, + "learning_rate": 5.824654421511078e-06, + "loss": 0.0942, + "step": 37430 + }, + { + "epoch": 3.5447831850028404, + "grad_norm": 0.5034583210945129, + "learning_rate": 5.820867259988639e-06, + "loss": 0.1013, + "step": 37440 + }, + { + "epoch": 3.54572997538345, + "grad_norm": 0.42112550139427185, + "learning_rate": 5.8170800984662e-06, + "loss": 0.1039, + "step": 37450 + }, + { + "epoch": 3.54667676576406, + "grad_norm": 0.494607537984848, + "learning_rate": 5.81329293694376e-06, + "loss": 0.0986, + "step": 37460 + }, + { + "epoch": 3.5476235561446696, + "grad_norm": 0.449089378118515, + "learning_rate": 5.809505775421322e-06, + "loss": 0.0919, + "step": 37470 + }, + { + "epoch": 3.5485703465252794, + "grad_norm": 0.5358659625053406, + "learning_rate": 5.805718613898883e-06, + "loss": 0.1047, + "step": 37480 + }, + { + "epoch": 3.549517136905889, + "grad_norm": 0.43697452545166016, + "learning_rate": 5.801931452376445e-06, + "loss": 0.102, + "step": 37490 + }, + { + "epoch": 3.550463927286499, + "grad_norm": 0.48515307903289795, + "learning_rate": 5.798144290854006e-06, + "loss": 0.0946, + "step": 37500 + }, + { + "epoch": 3.5514107176671086, + "grad_norm": 0.5111203789710999, + "learning_rate": 5.794357129331566e-06, + "loss": 0.0979, + "step": 37510 + }, + { + "epoch": 3.5523575080477183, + "grad_norm": 0.47968727350234985, + "learning_rate": 5.7905699678091275e-06, + "loss": 0.0949, + "step": 37520 + }, + { + "epoch": 3.553304298428328, + "grad_norm": 0.36792320013046265, + "learning_rate": 5.786782806286688e-06, + "loss": 0.1011, + "step": 37530 + }, + { + "epoch": 3.554251088808938, + "grad_norm": 0.6238874197006226, + "learning_rate": 5.78299564476425e-06, + "loss": 0.1021, + "step": 37540 + }, + { + "epoch": 3.5551978791895475, + "grad_norm": 0.47502410411834717, + "learning_rate": 5.77920848324181e-06, + "loss": 0.0954, + "step": 37550 + }, + { + "epoch": 3.5561446695701573, + "grad_norm": 0.5178982615470886, + "learning_rate": 5.775421321719372e-06, + "loss": 0.1071, + "step": 37560 + }, + { + "epoch": 3.557091459950767, + "grad_norm": 0.5157225131988525, + "learning_rate": 5.771634160196933e-06, + "loss": 0.0962, + "step": 37570 + }, + { + "epoch": 3.5580382503313768, + "grad_norm": 0.4980148673057556, + "learning_rate": 5.7678469986744945e-06, + "loss": 0.1032, + "step": 37580 + }, + { + "epoch": 3.5589850407119865, + "grad_norm": 0.6088648438453674, + "learning_rate": 5.7640598371520554e-06, + "loss": 0.0972, + "step": 37590 + }, + { + "epoch": 3.5599318310925963, + "grad_norm": 0.42747214436531067, + "learning_rate": 5.7602726756296155e-06, + "loss": 0.0916, + "step": 37600 + }, + { + "epoch": 3.560878621473206, + "grad_norm": 0.5847950577735901, + "learning_rate": 5.756485514107177e-06, + "loss": 0.0955, + "step": 37610 + }, + { + "epoch": 3.5618254118538157, + "grad_norm": 0.5192122459411621, + "learning_rate": 5.752698352584738e-06, + "loss": 0.0986, + "step": 37620 + }, + { + "epoch": 3.5627722022344255, + "grad_norm": 0.506806492805481, + "learning_rate": 5.7489111910623e-06, + "loss": 0.0968, + "step": 37630 + }, + { + "epoch": 3.563718992615035, + "grad_norm": 0.4985431730747223, + "learning_rate": 5.74512402953986e-06, + "loss": 0.1011, + "step": 37640 + }, + { + "epoch": 3.564665782995645, + "grad_norm": 0.776935338973999, + "learning_rate": 5.741336868017422e-06, + "loss": 0.0975, + "step": 37650 + }, + { + "epoch": 3.5656125733762547, + "grad_norm": 0.46507251262664795, + "learning_rate": 5.7375497064949826e-06, + "loss": 0.0898, + "step": 37660 + }, + { + "epoch": 3.5665593637568644, + "grad_norm": 0.450090229511261, + "learning_rate": 5.7337625449725435e-06, + "loss": 0.111, + "step": 37670 + }, + { + "epoch": 3.567506154137474, + "grad_norm": 0.4301955997943878, + "learning_rate": 5.729975383450105e-06, + "loss": 0.0987, + "step": 37680 + }, + { + "epoch": 3.568452944518084, + "grad_norm": 0.36564552783966064, + "learning_rate": 5.726188221927665e-06, + "loss": 0.102, + "step": 37690 + }, + { + "epoch": 3.5693997348986937, + "grad_norm": 0.6277235150337219, + "learning_rate": 5.722401060405227e-06, + "loss": 0.1052, + "step": 37700 + }, + { + "epoch": 3.5703465252793034, + "grad_norm": 0.5736376643180847, + "learning_rate": 5.718613898882788e-06, + "loss": 0.109, + "step": 37710 + }, + { + "epoch": 3.571293315659913, + "grad_norm": 0.5228685736656189, + "learning_rate": 5.71482673736035e-06, + "loss": 0.1, + "step": 37720 + }, + { + "epoch": 3.572240106040523, + "grad_norm": 0.32450515031814575, + "learning_rate": 5.71103957583791e-06, + "loss": 0.1014, + "step": 37730 + }, + { + "epoch": 3.5731868964211326, + "grad_norm": 0.4867997169494629, + "learning_rate": 5.707252414315471e-06, + "loss": 0.0964, + "step": 37740 + }, + { + "epoch": 3.5741336868017424, + "grad_norm": 0.4821121394634247, + "learning_rate": 5.703465252793032e-06, + "loss": 0.0894, + "step": 37750 + }, + { + "epoch": 3.5750804771823517, + "grad_norm": 0.609014630317688, + "learning_rate": 5.699678091270593e-06, + "loss": 0.106, + "step": 37760 + }, + { + "epoch": 3.5760272675629614, + "grad_norm": 0.5423181056976318, + "learning_rate": 5.695890929748155e-06, + "loss": 0.1029, + "step": 37770 + }, + { + "epoch": 3.576974057943571, + "grad_norm": 0.464769572019577, + "learning_rate": 5.692103768225715e-06, + "loss": 0.1011, + "step": 37780 + }, + { + "epoch": 3.577920848324181, + "grad_norm": 0.4220525920391083, + "learning_rate": 5.688316606703277e-06, + "loss": 0.0923, + "step": 37790 + }, + { + "epoch": 3.5788676387047906, + "grad_norm": 0.4703858494758606, + "learning_rate": 5.684529445180838e-06, + "loss": 0.0998, + "step": 37800 + }, + { + "epoch": 3.5798144290854004, + "grad_norm": 0.5060818791389465, + "learning_rate": 5.680742283658398e-06, + "loss": 0.1134, + "step": 37810 + }, + { + "epoch": 3.58076121946601, + "grad_norm": 0.4083239436149597, + "learning_rate": 5.6769551221359594e-06, + "loss": 0.0953, + "step": 37820 + }, + { + "epoch": 3.58170800984662, + "grad_norm": 0.5225698947906494, + "learning_rate": 5.67316796061352e-06, + "loss": 0.0917, + "step": 37830 + }, + { + "epoch": 3.5826548002272296, + "grad_norm": 0.5536400675773621, + "learning_rate": 5.669380799091082e-06, + "loss": 0.1023, + "step": 37840 + }, + { + "epoch": 3.5836015906078393, + "grad_norm": 0.4061962366104126, + "learning_rate": 5.665593637568643e-06, + "loss": 0.0982, + "step": 37850 + }, + { + "epoch": 3.584548380988449, + "grad_norm": 0.48493853211402893, + "learning_rate": 5.661806476046205e-06, + "loss": 0.0926, + "step": 37860 + }, + { + "epoch": 3.585495171369059, + "grad_norm": 0.4256500005722046, + "learning_rate": 5.658019314523765e-06, + "loss": 0.1008, + "step": 37870 + }, + { + "epoch": 3.5864419617496686, + "grad_norm": 0.529008150100708, + "learning_rate": 5.654232153001326e-06, + "loss": 0.094, + "step": 37880 + }, + { + "epoch": 3.5873887521302783, + "grad_norm": 0.6380013227462769, + "learning_rate": 5.650444991478887e-06, + "loss": 0.0955, + "step": 37890 + }, + { + "epoch": 3.588335542510888, + "grad_norm": 0.4689798057079315, + "learning_rate": 5.6466578299564475e-06, + "loss": 0.0963, + "step": 37900 + }, + { + "epoch": 3.589282332891498, + "grad_norm": 0.5896109938621521, + "learning_rate": 5.642870668434009e-06, + "loss": 0.0981, + "step": 37910 + }, + { + "epoch": 3.5902291232721075, + "grad_norm": 0.4753442704677582, + "learning_rate": 5.63908350691157e-06, + "loss": 0.1036, + "step": 37920 + }, + { + "epoch": 3.5911759136527173, + "grad_norm": 0.40374627709388733, + "learning_rate": 5.635296345389132e-06, + "loss": 0.0946, + "step": 37930 + }, + { + "epoch": 3.592122704033327, + "grad_norm": 0.5366222858428955, + "learning_rate": 5.631509183866693e-06, + "loss": 0.0969, + "step": 37940 + }, + { + "epoch": 3.5930694944139367, + "grad_norm": 0.5004488825798035, + "learning_rate": 5.627722022344253e-06, + "loss": 0.0981, + "step": 37950 + }, + { + "epoch": 3.5940162847945465, + "grad_norm": 0.43726542592048645, + "learning_rate": 5.6239348608218145e-06, + "loss": 0.0961, + "step": 37960 + }, + { + "epoch": 3.5949630751751562, + "grad_norm": 0.4899952709674835, + "learning_rate": 5.6201476992993754e-06, + "loss": 0.103, + "step": 37970 + }, + { + "epoch": 3.595909865555766, + "grad_norm": 0.5843260288238525, + "learning_rate": 5.616360537776937e-06, + "loss": 0.1004, + "step": 37980 + }, + { + "epoch": 3.5968566559363757, + "grad_norm": 0.43663713335990906, + "learning_rate": 5.612573376254497e-06, + "loss": 0.1021, + "step": 37990 + }, + { + "epoch": 3.5978034463169855, + "grad_norm": 0.44171518087387085, + "learning_rate": 5.608786214732059e-06, + "loss": 0.1001, + "step": 38000 + }, + { + "epoch": 3.598750236697595, + "grad_norm": 0.5805891752243042, + "learning_rate": 5.60499905320962e-06, + "loss": 0.1009, + "step": 38010 + }, + { + "epoch": 3.599697027078205, + "grad_norm": 0.43290090560913086, + "learning_rate": 5.601211891687181e-06, + "loss": 0.1025, + "step": 38020 + }, + { + "epoch": 3.6006438174588147, + "grad_norm": 0.4977321922779083, + "learning_rate": 5.5974247301647425e-06, + "loss": 0.0943, + "step": 38030 + }, + { + "epoch": 3.6015906078394244, + "grad_norm": 0.4680572748184204, + "learning_rate": 5.5936375686423026e-06, + "loss": 0.0977, + "step": 38040 + }, + { + "epoch": 3.602537398220034, + "grad_norm": 0.567624568939209, + "learning_rate": 5.589850407119864e-06, + "loss": 0.0968, + "step": 38050 + }, + { + "epoch": 3.603484188600644, + "grad_norm": 0.41031312942504883, + "learning_rate": 5.586063245597425e-06, + "loss": 0.091, + "step": 38060 + }, + { + "epoch": 3.6044309789812536, + "grad_norm": 0.5081103444099426, + "learning_rate": 5.582276084074987e-06, + "loss": 0.1007, + "step": 38070 + }, + { + "epoch": 3.6053777693618634, + "grad_norm": 0.5523340106010437, + "learning_rate": 5.578488922552547e-06, + "loss": 0.0965, + "step": 38080 + }, + { + "epoch": 3.606324559742473, + "grad_norm": 0.49104392528533936, + "learning_rate": 5.574701761030108e-06, + "loss": 0.0952, + "step": 38090 + }, + { + "epoch": 3.607271350123083, + "grad_norm": 0.37221527099609375, + "learning_rate": 5.57091459950767e-06, + "loss": 0.0991, + "step": 38100 + }, + { + "epoch": 3.6082181405036926, + "grad_norm": 0.43933555483818054, + "learning_rate": 5.5671274379852305e-06, + "loss": 0.0957, + "step": 38110 + }, + { + "epoch": 3.6091649308843023, + "grad_norm": 0.5173497796058655, + "learning_rate": 5.563340276462792e-06, + "loss": 0.095, + "step": 38120 + }, + { + "epoch": 3.610111721264912, + "grad_norm": 0.43472984433174133, + "learning_rate": 5.559553114940352e-06, + "loss": 0.101, + "step": 38130 + }, + { + "epoch": 3.611058511645522, + "grad_norm": 0.5377931594848633, + "learning_rate": 5.555765953417914e-06, + "loss": 0.0956, + "step": 38140 + }, + { + "epoch": 3.6120053020261316, + "grad_norm": 0.35924702882766724, + "learning_rate": 5.551978791895475e-06, + "loss": 0.0932, + "step": 38150 + }, + { + "epoch": 3.6129520924067413, + "grad_norm": 0.6031462550163269, + "learning_rate": 5.548191630373035e-06, + "loss": 0.0976, + "step": 38160 + }, + { + "epoch": 3.6138988827873506, + "grad_norm": 0.435293048620224, + "learning_rate": 5.544404468850597e-06, + "loss": 0.1001, + "step": 38170 + }, + { + "epoch": 3.6148456731679603, + "grad_norm": 0.5811923742294312, + "learning_rate": 5.540617307328158e-06, + "loss": 0.1128, + "step": 38180 + }, + { + "epoch": 3.61579246354857, + "grad_norm": 0.49129927158355713, + "learning_rate": 5.536830145805719e-06, + "loss": 0.0932, + "step": 38190 + }, + { + "epoch": 3.61673925392918, + "grad_norm": 0.41449084877967834, + "learning_rate": 5.53304298428328e-06, + "loss": 0.097, + "step": 38200 + }, + { + "epoch": 3.6176860443097896, + "grad_norm": 0.5685412287712097, + "learning_rate": 5.529255822760842e-06, + "loss": 0.0907, + "step": 38210 + }, + { + "epoch": 3.6186328346903993, + "grad_norm": 0.5093902945518494, + "learning_rate": 5.525468661238402e-06, + "loss": 0.0942, + "step": 38220 + }, + { + "epoch": 3.619579625071009, + "grad_norm": 0.5617339015007019, + "learning_rate": 5.521681499715963e-06, + "loss": 0.1015, + "step": 38230 + }, + { + "epoch": 3.620526415451619, + "grad_norm": 0.47863510251045227, + "learning_rate": 5.517894338193525e-06, + "loss": 0.0952, + "step": 38240 + }, + { + "epoch": 3.6214732058322285, + "grad_norm": 0.6331188678741455, + "learning_rate": 5.514107176671085e-06, + "loss": 0.111, + "step": 38250 + }, + { + "epoch": 3.6224199962128383, + "grad_norm": 0.45131048560142517, + "learning_rate": 5.5103200151486465e-06, + "loss": 0.0915, + "step": 38260 + }, + { + "epoch": 3.623366786593448, + "grad_norm": 0.4484195411205292, + "learning_rate": 5.506532853626207e-06, + "loss": 0.0964, + "step": 38270 + }, + { + "epoch": 3.6243135769740578, + "grad_norm": 0.5122811198234558, + "learning_rate": 5.502745692103769e-06, + "loss": 0.1005, + "step": 38280 + }, + { + "epoch": 3.6252603673546675, + "grad_norm": 0.5720915794372559, + "learning_rate": 5.49895853058133e-06, + "loss": 0.0957, + "step": 38290 + }, + { + "epoch": 3.6262071577352772, + "grad_norm": 0.4863225221633911, + "learning_rate": 5.49517136905889e-06, + "loss": 0.0901, + "step": 38300 + }, + { + "epoch": 3.627153948115887, + "grad_norm": 0.5356199145317078, + "learning_rate": 5.491384207536452e-06, + "loss": 0.0994, + "step": 38310 + }, + { + "epoch": 3.6281007384964967, + "grad_norm": 0.46760469675064087, + "learning_rate": 5.487597046014013e-06, + "loss": 0.1005, + "step": 38320 + }, + { + "epoch": 3.6290475288771065, + "grad_norm": 0.39269837737083435, + "learning_rate": 5.4838098844915745e-06, + "loss": 0.0925, + "step": 38330 + }, + { + "epoch": 3.629994319257716, + "grad_norm": 0.49656835198402405, + "learning_rate": 5.4800227229691345e-06, + "loss": 0.0932, + "step": 38340 + }, + { + "epoch": 3.630941109638326, + "grad_norm": 0.4261341691017151, + "learning_rate": 5.476235561446696e-06, + "loss": 0.0933, + "step": 38350 + }, + { + "epoch": 3.6318879000189357, + "grad_norm": 0.5111949443817139, + "learning_rate": 5.472448399924257e-06, + "loss": 0.1001, + "step": 38360 + }, + { + "epoch": 3.6328346903995454, + "grad_norm": 0.4922689199447632, + "learning_rate": 5.468661238401818e-06, + "loss": 0.0957, + "step": 38370 + }, + { + "epoch": 3.633781480780155, + "grad_norm": 0.5869754552841187, + "learning_rate": 5.46487407687938e-06, + "loss": 0.0937, + "step": 38380 + }, + { + "epoch": 3.634728271160765, + "grad_norm": 0.6243377327919006, + "learning_rate": 5.46108691535694e-06, + "loss": 0.0976, + "step": 38390 + }, + { + "epoch": 3.6356750615413747, + "grad_norm": 0.5943287014961243, + "learning_rate": 5.457299753834502e-06, + "loss": 0.0877, + "step": 38400 + }, + { + "epoch": 3.6366218519219844, + "grad_norm": 0.5626177191734314, + "learning_rate": 5.4535125923120625e-06, + "loss": 0.1154, + "step": 38410 + }, + { + "epoch": 3.637568642302594, + "grad_norm": 0.48984989523887634, + "learning_rate": 5.449725430789624e-06, + "loss": 0.081, + "step": 38420 + }, + { + "epoch": 3.638515432683204, + "grad_norm": 0.3821601867675781, + "learning_rate": 5.445938269267184e-06, + "loss": 0.0934, + "step": 38430 + }, + { + "epoch": 3.6394622230638136, + "grad_norm": 0.38352078199386597, + "learning_rate": 5.442151107744745e-06, + "loss": 0.0917, + "step": 38440 + }, + { + "epoch": 3.6404090134444234, + "grad_norm": 0.505770206451416, + "learning_rate": 5.438363946222307e-06, + "loss": 0.1035, + "step": 38450 + }, + { + "epoch": 3.641355803825033, + "grad_norm": 0.3925401568412781, + "learning_rate": 5.434576784699868e-06, + "loss": 0.0872, + "step": 38460 + }, + { + "epoch": 3.642302594205643, + "grad_norm": 0.40184637904167175, + "learning_rate": 5.43078962317743e-06, + "loss": 0.0957, + "step": 38470 + }, + { + "epoch": 3.6432493845862526, + "grad_norm": 0.4207031726837158, + "learning_rate": 5.42700246165499e-06, + "loss": 0.095, + "step": 38480 + }, + { + "epoch": 3.6441961749668623, + "grad_norm": 0.5287842750549316, + "learning_rate": 5.423215300132551e-06, + "loss": 0.0976, + "step": 38490 + }, + { + "epoch": 3.645142965347472, + "grad_norm": 0.5332127809524536, + "learning_rate": 5.419428138610112e-06, + "loss": 0.0938, + "step": 38500 + }, + { + "epoch": 3.646089755728082, + "grad_norm": 0.5463581085205078, + "learning_rate": 5.415640977087673e-06, + "loss": 0.1012, + "step": 38510 + }, + { + "epoch": 3.6470365461086915, + "grad_norm": 0.42161843180656433, + "learning_rate": 5.411853815565234e-06, + "loss": 0.1035, + "step": 38520 + }, + { + "epoch": 3.6479833364893013, + "grad_norm": 0.42370715737342834, + "learning_rate": 5.408066654042795e-06, + "loss": 0.1005, + "step": 38530 + }, + { + "epoch": 3.648930126869911, + "grad_norm": 0.48021796345710754, + "learning_rate": 5.404279492520357e-06, + "loss": 0.0997, + "step": 38540 + }, + { + "epoch": 3.6498769172505208, + "grad_norm": 0.4591447710990906, + "learning_rate": 5.400492330997918e-06, + "loss": 0.0933, + "step": 38550 + }, + { + "epoch": 3.6508237076311305, + "grad_norm": 0.5126654505729675, + "learning_rate": 5.396705169475479e-06, + "loss": 0.0943, + "step": 38560 + }, + { + "epoch": 3.6517704980117403, + "grad_norm": 0.39282628893852234, + "learning_rate": 5.392918007953039e-06, + "loss": 0.1074, + "step": 38570 + }, + { + "epoch": 3.65271728839235, + "grad_norm": 0.5055245161056519, + "learning_rate": 5.3891308464306e-06, + "loss": 0.0951, + "step": 38580 + }, + { + "epoch": 3.6536640787729597, + "grad_norm": 0.45934176445007324, + "learning_rate": 5.385343684908162e-06, + "loss": 0.0912, + "step": 38590 + }, + { + "epoch": 3.6546108691535695, + "grad_norm": 0.5393901467323303, + "learning_rate": 5.381556523385723e-06, + "loss": 0.0963, + "step": 38600 + }, + { + "epoch": 3.655557659534179, + "grad_norm": 0.47153976559638977, + "learning_rate": 5.377769361863284e-06, + "loss": 0.0953, + "step": 38610 + }, + { + "epoch": 3.656504449914789, + "grad_norm": 0.5694108605384827, + "learning_rate": 5.373982200340845e-06, + "loss": 0.109, + "step": 38620 + }, + { + "epoch": 3.6574512402953987, + "grad_norm": 0.49688521027565, + "learning_rate": 5.3701950388184065e-06, + "loss": 0.0938, + "step": 38630 + }, + { + "epoch": 3.6583980306760084, + "grad_norm": 0.5291985273361206, + "learning_rate": 5.366407877295967e-06, + "loss": 0.0899, + "step": 38640 + }, + { + "epoch": 3.659344821056618, + "grad_norm": 0.4994890093803406, + "learning_rate": 5.362620715773527e-06, + "loss": 0.0931, + "step": 38650 + }, + { + "epoch": 3.660291611437228, + "grad_norm": 0.5147172808647156, + "learning_rate": 5.358833554251089e-06, + "loss": 0.0946, + "step": 38660 + }, + { + "epoch": 3.6612384018178377, + "grad_norm": 0.36024779081344604, + "learning_rate": 5.35504639272865e-06, + "loss": 0.085, + "step": 38670 + }, + { + "epoch": 3.6621851921984474, + "grad_norm": 0.5125046372413635, + "learning_rate": 5.351259231206212e-06, + "loss": 0.0917, + "step": 38680 + }, + { + "epoch": 3.663131982579057, + "grad_norm": 0.5628103613853455, + "learning_rate": 5.347472069683773e-06, + "loss": 0.0934, + "step": 38690 + }, + { + "epoch": 3.664078772959667, + "grad_norm": 0.390716016292572, + "learning_rate": 5.343684908161334e-06, + "loss": 0.096, + "step": 38700 + }, + { + "epoch": 3.6650255633402766, + "grad_norm": 0.41418376564979553, + "learning_rate": 5.3398977466388945e-06, + "loss": 0.095, + "step": 38710 + }, + { + "epoch": 3.6659723537208864, + "grad_norm": 0.4665653109550476, + "learning_rate": 5.336110585116455e-06, + "loss": 0.0858, + "step": 38720 + }, + { + "epoch": 3.666919144101496, + "grad_norm": 0.35321033000946045, + "learning_rate": 5.332323423594017e-06, + "loss": 0.0907, + "step": 38730 + }, + { + "epoch": 3.667865934482106, + "grad_norm": 0.5768105387687683, + "learning_rate": 5.328536262071577e-06, + "loss": 0.106, + "step": 38740 + }, + { + "epoch": 3.6688127248627156, + "grad_norm": 0.383003294467926, + "learning_rate": 5.324749100549139e-06, + "loss": 0.0893, + "step": 38750 + }, + { + "epoch": 3.6697595152433253, + "grad_norm": 0.5730847120285034, + "learning_rate": 5.3209619390267e-06, + "loss": 0.1072, + "step": 38760 + }, + { + "epoch": 3.670706305623935, + "grad_norm": 0.5745497345924377, + "learning_rate": 5.3171747775042616e-06, + "loss": 0.1006, + "step": 38770 + }, + { + "epoch": 3.671653096004545, + "grad_norm": 0.5345969796180725, + "learning_rate": 5.3133876159818225e-06, + "loss": 0.1059, + "step": 38780 + }, + { + "epoch": 3.6725998863851546, + "grad_norm": 0.5308682918548584, + "learning_rate": 5.3096004544593825e-06, + "loss": 0.1005, + "step": 38790 + }, + { + "epoch": 3.6735466767657643, + "grad_norm": 0.4583006203174591, + "learning_rate": 5.305813292936944e-06, + "loss": 0.0957, + "step": 38800 + }, + { + "epoch": 3.674493467146374, + "grad_norm": 0.511604905128479, + "learning_rate": 5.302026131414505e-06, + "loss": 0.1052, + "step": 38810 + }, + { + "epoch": 3.675440257526984, + "grad_norm": 0.42457523941993713, + "learning_rate": 5.298238969892067e-06, + "loss": 0.099, + "step": 38820 + }, + { + "epoch": 3.6763870479075935, + "grad_norm": 0.3803611993789673, + "learning_rate": 5.294451808369627e-06, + "loss": 0.102, + "step": 38830 + }, + { + "epoch": 3.6773338382882033, + "grad_norm": 0.4064808785915375, + "learning_rate": 5.290664646847189e-06, + "loss": 0.0926, + "step": 38840 + }, + { + "epoch": 3.678280628668813, + "grad_norm": 0.4240824580192566, + "learning_rate": 5.28687748532475e-06, + "loss": 0.0975, + "step": 38850 + }, + { + "epoch": 3.6792274190494223, + "grad_norm": 0.4897139370441437, + "learning_rate": 5.2830903238023105e-06, + "loss": 0.1003, + "step": 38860 + }, + { + "epoch": 3.680174209430032, + "grad_norm": 0.5453110933303833, + "learning_rate": 5.279303162279872e-06, + "loss": 0.0876, + "step": 38870 + }, + { + "epoch": 3.681120999810642, + "grad_norm": 0.5848513245582581, + "learning_rate": 5.275516000757432e-06, + "loss": 0.094, + "step": 38880 + }, + { + "epoch": 3.6820677901912515, + "grad_norm": 0.5227193832397461, + "learning_rate": 5.271728839234994e-06, + "loss": 0.0932, + "step": 38890 + }, + { + "epoch": 3.6830145805718613, + "grad_norm": 0.4724016487598419, + "learning_rate": 5.267941677712555e-06, + "loss": 0.0948, + "step": 38900 + }, + { + "epoch": 3.683961370952471, + "grad_norm": 0.39952653646469116, + "learning_rate": 5.264154516190117e-06, + "loss": 0.0876, + "step": 38910 + }, + { + "epoch": 3.6849081613330807, + "grad_norm": 0.5192437767982483, + "learning_rate": 5.260367354667677e-06, + "loss": 0.0963, + "step": 38920 + }, + { + "epoch": 3.6858549517136905, + "grad_norm": 0.4506594240665436, + "learning_rate": 5.256580193145238e-06, + "loss": 0.099, + "step": 38930 + }, + { + "epoch": 3.6868017420943002, + "grad_norm": 0.5142754316329956, + "learning_rate": 5.252793031622799e-06, + "loss": 0.09, + "step": 38940 + }, + { + "epoch": 3.68774853247491, + "grad_norm": 0.4741668105125427, + "learning_rate": 5.24900587010036e-06, + "loss": 0.0977, + "step": 38950 + }, + { + "epoch": 3.6886953228555197, + "grad_norm": 0.40883010625839233, + "learning_rate": 5.245218708577922e-06, + "loss": 0.0943, + "step": 38960 + }, + { + "epoch": 3.6896421132361295, + "grad_norm": 0.4715437591075897, + "learning_rate": 5.241431547055482e-06, + "loss": 0.0937, + "step": 38970 + }, + { + "epoch": 3.690588903616739, + "grad_norm": 0.44934457540512085, + "learning_rate": 5.237644385533044e-06, + "loss": 0.0993, + "step": 38980 + }, + { + "epoch": 3.691535693997349, + "grad_norm": 0.5743240714073181, + "learning_rate": 5.233857224010605e-06, + "loss": 0.0994, + "step": 38990 + }, + { + "epoch": 3.6924824843779587, + "grad_norm": 0.5706230998039246, + "learning_rate": 5.230070062488165e-06, + "loss": 0.1102, + "step": 39000 + }, + { + "epoch": 3.6934292747585684, + "grad_norm": 0.6386612057685852, + "learning_rate": 5.2262829009657265e-06, + "loss": 0.1075, + "step": 39010 + }, + { + "epoch": 3.694376065139178, + "grad_norm": 0.5190657377243042, + "learning_rate": 5.222495739443287e-06, + "loss": 0.0912, + "step": 39020 + }, + { + "epoch": 3.695322855519788, + "grad_norm": 0.4563543200492859, + "learning_rate": 5.218708577920849e-06, + "loss": 0.0967, + "step": 39030 + }, + { + "epoch": 3.6962696459003976, + "grad_norm": 0.477840393781662, + "learning_rate": 5.21492141639841e-06, + "loss": 0.0921, + "step": 39040 + }, + { + "epoch": 3.6972164362810074, + "grad_norm": 0.46329066157341003, + "learning_rate": 5.211134254875972e-06, + "loss": 0.0922, + "step": 39050 + }, + { + "epoch": 3.698163226661617, + "grad_norm": 0.46231696009635925, + "learning_rate": 5.207347093353532e-06, + "loss": 0.0902, + "step": 39060 + }, + { + "epoch": 3.699110017042227, + "grad_norm": 0.4845748245716095, + "learning_rate": 5.203559931831093e-06, + "loss": 0.0913, + "step": 39070 + }, + { + "epoch": 3.7000568074228366, + "grad_norm": 0.4496608078479767, + "learning_rate": 5.1997727703086544e-06, + "loss": 0.1088, + "step": 39080 + }, + { + "epoch": 3.7010035978034463, + "grad_norm": 0.5793826580047607, + "learning_rate": 5.1959856087862145e-06, + "loss": 0.0963, + "step": 39090 + }, + { + "epoch": 3.701950388184056, + "grad_norm": 0.43485772609710693, + "learning_rate": 5.192198447263776e-06, + "loss": 0.0919, + "step": 39100 + }, + { + "epoch": 3.702897178564666, + "grad_norm": 0.47092369198799133, + "learning_rate": 5.188411285741337e-06, + "loss": 0.1123, + "step": 39110 + }, + { + "epoch": 3.7038439689452756, + "grad_norm": 0.5703817009925842, + "learning_rate": 5.184624124218899e-06, + "loss": 0.1002, + "step": 39120 + }, + { + "epoch": 3.7047907593258853, + "grad_norm": 0.5016136765480042, + "learning_rate": 5.18083696269646e-06, + "loss": 0.091, + "step": 39130 + }, + { + "epoch": 3.705737549706495, + "grad_norm": 0.5749192237854004, + "learning_rate": 5.17704980117402e-06, + "loss": 0.0987, + "step": 39140 + }, + { + "epoch": 3.706684340087105, + "grad_norm": 0.4935661554336548, + "learning_rate": 5.1732626396515816e-06, + "loss": 0.103, + "step": 39150 + }, + { + "epoch": 3.7076311304677145, + "grad_norm": 0.4583258032798767, + "learning_rate": 5.1694754781291425e-06, + "loss": 0.1013, + "step": 39160 + }, + { + "epoch": 3.7085779208483243, + "grad_norm": 0.6873877644538879, + "learning_rate": 5.165688316606704e-06, + "loss": 0.1042, + "step": 39170 + }, + { + "epoch": 3.709524711228934, + "grad_norm": 0.4328051507472992, + "learning_rate": 5.161901155084264e-06, + "loss": 0.0933, + "step": 39180 + }, + { + "epoch": 3.7104715016095438, + "grad_norm": 0.4692210257053375, + "learning_rate": 5.158113993561826e-06, + "loss": 0.0967, + "step": 39190 + }, + { + "epoch": 3.7114182919901535, + "grad_norm": 0.48374322056770325, + "learning_rate": 5.154326832039387e-06, + "loss": 0.1008, + "step": 39200 + }, + { + "epoch": 3.7123650823707632, + "grad_norm": 0.3715035021305084, + "learning_rate": 5.150539670516948e-06, + "loss": 0.1025, + "step": 39210 + }, + { + "epoch": 3.713311872751373, + "grad_norm": 0.4053986370563507, + "learning_rate": 5.1467525089945095e-06, + "loss": 0.0927, + "step": 39220 + }, + { + "epoch": 3.7142586631319827, + "grad_norm": 0.44326600432395935, + "learning_rate": 5.14296534747207e-06, + "loss": 0.1005, + "step": 39230 + }, + { + "epoch": 3.7152054535125925, + "grad_norm": 0.4053700268268585, + "learning_rate": 5.139178185949631e-06, + "loss": 0.0878, + "step": 39240 + }, + { + "epoch": 3.716152243893202, + "grad_norm": 0.6459446549415588, + "learning_rate": 5.135391024427192e-06, + "loss": 0.098, + "step": 39250 + }, + { + "epoch": 3.717099034273812, + "grad_norm": 0.5322126746177673, + "learning_rate": 5.131603862904754e-06, + "loss": 0.0916, + "step": 39260 + }, + { + "epoch": 3.7180458246544212, + "grad_norm": 0.4378093481063843, + "learning_rate": 5.127816701382314e-06, + "loss": 0.0913, + "step": 39270 + }, + { + "epoch": 3.718992615035031, + "grad_norm": 0.46981629729270935, + "learning_rate": 5.124029539859875e-06, + "loss": 0.1043, + "step": 39280 + }, + { + "epoch": 3.7199394054156407, + "grad_norm": 0.5976932048797607, + "learning_rate": 5.120242378337437e-06, + "loss": 0.0986, + "step": 39290 + }, + { + "epoch": 3.7208861957962505, + "grad_norm": 0.48355865478515625, + "learning_rate": 5.1164552168149975e-06, + "loss": 0.0873, + "step": 39300 + }, + { + "epoch": 3.72183298617686, + "grad_norm": 0.49303942918777466, + "learning_rate": 5.112668055292559e-06, + "loss": 0.0943, + "step": 39310 + }, + { + "epoch": 3.72277977655747, + "grad_norm": 0.41613462567329407, + "learning_rate": 5.108880893770119e-06, + "loss": 0.0976, + "step": 39320 + }, + { + "epoch": 3.7237265669380797, + "grad_norm": 0.5080744624137878, + "learning_rate": 5.105093732247681e-06, + "loss": 0.0889, + "step": 39330 + }, + { + "epoch": 3.7246733573186894, + "grad_norm": 0.47723624110221863, + "learning_rate": 5.101306570725242e-06, + "loss": 0.1033, + "step": 39340 + }, + { + "epoch": 3.725620147699299, + "grad_norm": 0.37608417868614197, + "learning_rate": 5.097519409202802e-06, + "loss": 0.0966, + "step": 39350 + }, + { + "epoch": 3.726566938079909, + "grad_norm": 0.38140979409217834, + "learning_rate": 5.093732247680364e-06, + "loss": 0.0972, + "step": 39360 + }, + { + "epoch": 3.7275137284605186, + "grad_norm": 0.39315181970596313, + "learning_rate": 5.089945086157925e-06, + "loss": 0.0953, + "step": 39370 + }, + { + "epoch": 3.7284605188411284, + "grad_norm": 0.525129497051239, + "learning_rate": 5.086157924635486e-06, + "loss": 0.0933, + "step": 39380 + }, + { + "epoch": 3.729407309221738, + "grad_norm": 0.464629203081131, + "learning_rate": 5.082370763113047e-06, + "loss": 0.095, + "step": 39390 + }, + { + "epoch": 3.730354099602348, + "grad_norm": 0.46547815203666687, + "learning_rate": 5.078583601590609e-06, + "loss": 0.0894, + "step": 39400 + }, + { + "epoch": 3.7313008899829576, + "grad_norm": 0.5702111124992371, + "learning_rate": 5.074796440068169e-06, + "loss": 0.0897, + "step": 39410 + }, + { + "epoch": 3.7322476803635674, + "grad_norm": 0.45302221179008484, + "learning_rate": 5.07100927854573e-06, + "loss": 0.0917, + "step": 39420 + }, + { + "epoch": 3.733194470744177, + "grad_norm": 0.5478273034095764, + "learning_rate": 5.067222117023292e-06, + "loss": 0.0968, + "step": 39430 + }, + { + "epoch": 3.734141261124787, + "grad_norm": 0.5107770562171936, + "learning_rate": 5.063434955500852e-06, + "loss": 0.0977, + "step": 39440 + }, + { + "epoch": 3.7350880515053966, + "grad_norm": 0.6436827182769775, + "learning_rate": 5.0596477939784135e-06, + "loss": 0.1011, + "step": 39450 + }, + { + "epoch": 3.7360348418860063, + "grad_norm": 0.49567675590515137, + "learning_rate": 5.0558606324559744e-06, + "loss": 0.0942, + "step": 39460 + }, + { + "epoch": 3.736981632266616, + "grad_norm": 0.5632124543190002, + "learning_rate": 5.052073470933536e-06, + "loss": 0.0965, + "step": 39470 + }, + { + "epoch": 3.737928422647226, + "grad_norm": 0.47855645418167114, + "learning_rate": 5.048286309411097e-06, + "loss": 0.0938, + "step": 39480 + }, + { + "epoch": 3.7388752130278355, + "grad_norm": 0.4823874533176422, + "learning_rate": 5.044499147888657e-06, + "loss": 0.1016, + "step": 39490 + }, + { + "epoch": 3.7398220034084453, + "grad_norm": 0.4572794437408447, + "learning_rate": 5.040711986366219e-06, + "loss": 0.0936, + "step": 39500 + }, + { + "epoch": 3.740768793789055, + "grad_norm": 0.4439483880996704, + "learning_rate": 5.03692482484378e-06, + "loss": 0.0971, + "step": 39510 + }, + { + "epoch": 3.7417155841696648, + "grad_norm": 0.3832974433898926, + "learning_rate": 5.0331376633213415e-06, + "loss": 0.0943, + "step": 39520 + }, + { + "epoch": 3.7426623745502745, + "grad_norm": 0.40241408348083496, + "learning_rate": 5.0293505017989016e-06, + "loss": 0.0983, + "step": 39530 + }, + { + "epoch": 3.7436091649308842, + "grad_norm": 0.43205514550209045, + "learning_rate": 5.025563340276463e-06, + "loss": 0.0973, + "step": 39540 + }, + { + "epoch": 3.744555955311494, + "grad_norm": 0.6235865950584412, + "learning_rate": 5.021776178754024e-06, + "loss": 0.0899, + "step": 39550 + }, + { + "epoch": 3.7455027456921037, + "grad_norm": 0.48281237483024597, + "learning_rate": 5.017989017231585e-06, + "loss": 0.0924, + "step": 39560 + }, + { + "epoch": 3.7464495360727135, + "grad_norm": 0.4790968894958496, + "learning_rate": 5.014201855709147e-06, + "loss": 0.0955, + "step": 39570 + }, + { + "epoch": 3.747396326453323, + "grad_norm": 0.46759265661239624, + "learning_rate": 5.010414694186707e-06, + "loss": 0.0945, + "step": 39580 + }, + { + "epoch": 3.748343116833933, + "grad_norm": 0.5322169661521912, + "learning_rate": 5.006627532664269e-06, + "loss": 0.1096, + "step": 39590 + }, + { + "epoch": 3.7492899072145427, + "grad_norm": 0.46110084652900696, + "learning_rate": 5.0028403711418295e-06, + "loss": 0.0896, + "step": 39600 + }, + { + "epoch": 3.7502366975951524, + "grad_norm": 0.4692782461643219, + "learning_rate": 4.99905320961939e-06, + "loss": 0.1023, + "step": 39610 + }, + { + "epoch": 3.751183487975762, + "grad_norm": 0.5090218186378479, + "learning_rate": 4.995266048096951e-06, + "loss": 0.0952, + "step": 39620 + }, + { + "epoch": 3.752130278356372, + "grad_norm": 0.41407740116119385, + "learning_rate": 4.991478886574513e-06, + "loss": 0.0829, + "step": 39630 + }, + { + "epoch": 3.7530770687369817, + "grad_norm": 0.5581246614456177, + "learning_rate": 4.987691725052074e-06, + "loss": 0.0991, + "step": 39640 + }, + { + "epoch": 3.7540238591175914, + "grad_norm": 0.5363439321517944, + "learning_rate": 4.983904563529635e-06, + "loss": 0.0975, + "step": 39650 + }, + { + "epoch": 3.754970649498201, + "grad_norm": 0.45378658175468445, + "learning_rate": 4.980117402007197e-06, + "loss": 0.0937, + "step": 39660 + }, + { + "epoch": 3.755917439878811, + "grad_norm": 0.41703352332115173, + "learning_rate": 4.976330240484757e-06, + "loss": 0.0931, + "step": 39670 + }, + { + "epoch": 3.7568642302594206, + "grad_norm": 0.5335865020751953, + "learning_rate": 4.972543078962318e-06, + "loss": 0.1045, + "step": 39680 + }, + { + "epoch": 3.7578110206400304, + "grad_norm": 0.6401326060295105, + "learning_rate": 4.968755917439879e-06, + "loss": 0.0875, + "step": 39690 + }, + { + "epoch": 3.75875781102064, + "grad_norm": 0.5189956426620483, + "learning_rate": 4.96496875591744e-06, + "loss": 0.0914, + "step": 39700 + }, + { + "epoch": 3.75970460140125, + "grad_norm": 0.5471692085266113, + "learning_rate": 4.961181594395001e-06, + "loss": 0.1065, + "step": 39710 + }, + { + "epoch": 3.7606513917818596, + "grad_norm": 0.5299888253211975, + "learning_rate": 4.957394432872563e-06, + "loss": 0.1024, + "step": 39720 + }, + { + "epoch": 3.7615981821624693, + "grad_norm": 0.38434287905693054, + "learning_rate": 4.953607271350124e-06, + "loss": 0.0994, + "step": 39730 + }, + { + "epoch": 3.762544972543079, + "grad_norm": 0.45277974009513855, + "learning_rate": 4.949820109827685e-06, + "loss": 0.0953, + "step": 39740 + }, + { + "epoch": 3.763491762923689, + "grad_norm": 0.5209444761276245, + "learning_rate": 4.9460329483052455e-06, + "loss": 0.0917, + "step": 39750 + }, + { + "epoch": 3.7644385533042986, + "grad_norm": 0.37321043014526367, + "learning_rate": 4.942245786782806e-06, + "loss": 0.0951, + "step": 39760 + }, + { + "epoch": 3.7653853436849083, + "grad_norm": 0.5020799040794373, + "learning_rate": 4.938458625260368e-06, + "loss": 0.0967, + "step": 39770 + }, + { + "epoch": 3.766332134065518, + "grad_norm": 0.624764084815979, + "learning_rate": 4.934671463737929e-06, + "loss": 0.1014, + "step": 39780 + }, + { + "epoch": 3.7672789244461278, + "grad_norm": 0.4501070976257324, + "learning_rate": 4.93088430221549e-06, + "loss": 0.0932, + "step": 39790 + }, + { + "epoch": 3.7682257148267375, + "grad_norm": 0.4746125042438507, + "learning_rate": 4.927097140693051e-06, + "loss": 0.0907, + "step": 39800 + }, + { + "epoch": 3.7691725052073473, + "grad_norm": 0.5339996814727783, + "learning_rate": 4.923309979170612e-06, + "loss": 0.0976, + "step": 39810 + }, + { + "epoch": 3.770119295587957, + "grad_norm": 0.495650976896286, + "learning_rate": 4.919522817648173e-06, + "loss": 0.1006, + "step": 39820 + }, + { + "epoch": 3.7710660859685667, + "grad_norm": 0.4445793926715851, + "learning_rate": 4.915735656125734e-06, + "loss": 0.0875, + "step": 39830 + }, + { + "epoch": 3.7720128763491765, + "grad_norm": 0.3669644892215729, + "learning_rate": 4.911948494603295e-06, + "loss": 0.0893, + "step": 39840 + }, + { + "epoch": 3.772959666729786, + "grad_norm": 0.5043411254882812, + "learning_rate": 4.908161333080856e-06, + "loss": 0.103, + "step": 39850 + }, + { + "epoch": 3.773906457110396, + "grad_norm": 0.4899156987667084, + "learning_rate": 4.904374171558418e-06, + "loss": 0.1061, + "step": 39860 + }, + { + "epoch": 3.7748532474910057, + "grad_norm": 0.4879787564277649, + "learning_rate": 4.900587010035979e-06, + "loss": 0.0988, + "step": 39870 + }, + { + "epoch": 3.7758000378716154, + "grad_norm": 0.48512783646583557, + "learning_rate": 4.89679984851354e-06, + "loss": 0.1002, + "step": 39880 + }, + { + "epoch": 3.776746828252225, + "grad_norm": 0.5323088765144348, + "learning_rate": 4.893012686991101e-06, + "loss": 0.0943, + "step": 39890 + }, + { + "epoch": 3.777693618632835, + "grad_norm": 0.44328346848487854, + "learning_rate": 4.8892255254686615e-06, + "loss": 0.1019, + "step": 39900 + }, + { + "epoch": 3.7786404090134447, + "grad_norm": 0.4099396765232086, + "learning_rate": 4.885438363946222e-06, + "loss": 0.0961, + "step": 39910 + }, + { + "epoch": 3.7795871993940544, + "grad_norm": 0.4763758182525635, + "learning_rate": 4.881651202423784e-06, + "loss": 0.0974, + "step": 39920 + }, + { + "epoch": 3.780533989774664, + "grad_norm": 0.417901873588562, + "learning_rate": 4.877864040901345e-06, + "loss": 0.1075, + "step": 39930 + }, + { + "epoch": 3.781480780155274, + "grad_norm": 0.5062112808227539, + "learning_rate": 4.874076879378906e-06, + "loss": 0.0879, + "step": 39940 + }, + { + "epoch": 3.7824275705358836, + "grad_norm": 0.4354439377784729, + "learning_rate": 4.870289717856467e-06, + "loss": 0.1048, + "step": 39950 + }, + { + "epoch": 3.783374360916493, + "grad_norm": 0.3767815828323364, + "learning_rate": 4.866502556334028e-06, + "loss": 0.0985, + "step": 39960 + }, + { + "epoch": 3.7843211512971027, + "grad_norm": 0.48620229959487915, + "learning_rate": 4.8627153948115895e-06, + "loss": 0.1029, + "step": 39970 + }, + { + "epoch": 3.7852679416777124, + "grad_norm": 0.40215253829956055, + "learning_rate": 4.85892823328915e-06, + "loss": 0.0974, + "step": 39980 + }, + { + "epoch": 3.786214732058322, + "grad_norm": 0.43390804529190063, + "learning_rate": 4.855141071766711e-06, + "loss": 0.1016, + "step": 39990 + }, + { + "epoch": 3.787161522438932, + "grad_norm": 0.48430293798446655, + "learning_rate": 4.851353910244272e-06, + "loss": 0.0981, + "step": 40000 + }, + { + "epoch": 3.7881083128195416, + "grad_norm": 0.4889768362045288, + "learning_rate": 4.847566748721834e-06, + "loss": 0.0992, + "step": 40010 + }, + { + "epoch": 3.7890551032001514, + "grad_norm": 0.5483484268188477, + "learning_rate": 4.843779587199394e-06, + "loss": 0.096, + "step": 40020 + }, + { + "epoch": 3.790001893580761, + "grad_norm": 0.4289323687553406, + "learning_rate": 4.839992425676956e-06, + "loss": 0.0913, + "step": 40030 + }, + { + "epoch": 3.790948683961371, + "grad_norm": 0.5495353937149048, + "learning_rate": 4.836205264154517e-06, + "loss": 0.1083, + "step": 40040 + }, + { + "epoch": 3.7918954743419806, + "grad_norm": 0.5764693021774292, + "learning_rate": 4.8324181026320775e-06, + "loss": 0.0974, + "step": 40050 + }, + { + "epoch": 3.7928422647225903, + "grad_norm": 0.46960946917533875, + "learning_rate": 4.828630941109639e-06, + "loss": 0.0879, + "step": 40060 + }, + { + "epoch": 3.7937890551032, + "grad_norm": 0.4713224768638611, + "learning_rate": 4.8248437795872e-06, + "loss": 0.0894, + "step": 40070 + }, + { + "epoch": 3.79473584548381, + "grad_norm": 0.5814439654350281, + "learning_rate": 4.821056618064761e-06, + "loss": 0.0995, + "step": 40080 + }, + { + "epoch": 3.7956826358644196, + "grad_norm": 0.5569701790809631, + "learning_rate": 4.817269456542322e-06, + "loss": 0.103, + "step": 40090 + }, + { + "epoch": 3.7966294262450293, + "grad_norm": 0.4573192894458771, + "learning_rate": 4.813482295019883e-06, + "loss": 0.0901, + "step": 40100 + }, + { + "epoch": 3.797576216625639, + "grad_norm": 0.5137414932250977, + "learning_rate": 4.809695133497444e-06, + "loss": 0.0958, + "step": 40110 + }, + { + "epoch": 3.798523007006249, + "grad_norm": 0.5065782070159912, + "learning_rate": 4.8059079719750055e-06, + "loss": 0.1015, + "step": 40120 + }, + { + "epoch": 3.7994697973868585, + "grad_norm": 0.512101411819458, + "learning_rate": 4.802120810452566e-06, + "loss": 0.0904, + "step": 40130 + }, + { + "epoch": 3.8004165877674683, + "grad_norm": 0.4687511920928955, + "learning_rate": 4.798333648930127e-06, + "loss": 0.0917, + "step": 40140 + }, + { + "epoch": 3.801363378148078, + "grad_norm": 0.5853365063667297, + "learning_rate": 4.794546487407689e-06, + "loss": 0.1092, + "step": 40150 + }, + { + "epoch": 3.8023101685286878, + "grad_norm": 0.4754956066608429, + "learning_rate": 4.790759325885249e-06, + "loss": 0.0892, + "step": 40160 + }, + { + "epoch": 3.8032569589092975, + "grad_norm": 0.48234477639198303, + "learning_rate": 4.78697216436281e-06, + "loss": 0.1014, + "step": 40170 + }, + { + "epoch": 3.8042037492899072, + "grad_norm": 0.4432053864002228, + "learning_rate": 4.783185002840372e-06, + "loss": 0.0928, + "step": 40180 + }, + { + "epoch": 3.805150539670517, + "grad_norm": 0.6404855847358704, + "learning_rate": 4.779397841317933e-06, + "loss": 0.1047, + "step": 40190 + }, + { + "epoch": 3.8060973300511267, + "grad_norm": 0.509357750415802, + "learning_rate": 4.7756106797954935e-06, + "loss": 0.1058, + "step": 40200 + }, + { + "epoch": 3.8070441204317365, + "grad_norm": 0.6530634760856628, + "learning_rate": 4.771823518273055e-06, + "loss": 0.0989, + "step": 40210 + }, + { + "epoch": 3.807990910812346, + "grad_norm": 0.5133205652236938, + "learning_rate": 4.768036356750616e-06, + "loss": 0.1008, + "step": 40220 + }, + { + "epoch": 3.808937701192956, + "grad_norm": 0.4522642493247986, + "learning_rate": 4.764249195228177e-06, + "loss": 0.0926, + "step": 40230 + }, + { + "epoch": 3.8098844915735657, + "grad_norm": 0.5123722553253174, + "learning_rate": 4.760462033705738e-06, + "loss": 0.0945, + "step": 40240 + }, + { + "epoch": 3.8108312819541754, + "grad_norm": 0.5599709749221802, + "learning_rate": 4.756674872183299e-06, + "loss": 0.0934, + "step": 40250 + }, + { + "epoch": 3.811778072334785, + "grad_norm": 0.5186089873313904, + "learning_rate": 4.75288771066086e-06, + "loss": 0.0897, + "step": 40260 + }, + { + "epoch": 3.812724862715395, + "grad_norm": 0.4753962755203247, + "learning_rate": 4.7491005491384215e-06, + "loss": 0.0854, + "step": 40270 + }, + { + "epoch": 3.8136716530960046, + "grad_norm": 0.5974533557891846, + "learning_rate": 4.745313387615982e-06, + "loss": 0.1008, + "step": 40280 + }, + { + "epoch": 3.8146184434766144, + "grad_norm": 0.39401939511299133, + "learning_rate": 4.741526226093543e-06, + "loss": 0.0982, + "step": 40290 + }, + { + "epoch": 3.815565233857224, + "grad_norm": 0.3924501836299896, + "learning_rate": 4.737739064571104e-06, + "loss": 0.0974, + "step": 40300 + }, + { + "epoch": 3.816512024237834, + "grad_norm": 0.4209947884082794, + "learning_rate": 4.733951903048665e-06, + "loss": 0.0874, + "step": 40310 + }, + { + "epoch": 3.8174588146184436, + "grad_norm": 0.3790121078491211, + "learning_rate": 4.730164741526227e-06, + "loss": 0.0893, + "step": 40320 + }, + { + "epoch": 3.8184056049990533, + "grad_norm": 0.4306885302066803, + "learning_rate": 4.726377580003788e-06, + "loss": 0.094, + "step": 40330 + }, + { + "epoch": 3.819352395379663, + "grad_norm": 0.49594056606292725, + "learning_rate": 4.7225904184813486e-06, + "loss": 0.091, + "step": 40340 + }, + { + "epoch": 3.820299185760273, + "grad_norm": 0.47653913497924805, + "learning_rate": 4.7188032569589095e-06, + "loss": 0.0933, + "step": 40350 + }, + { + "epoch": 3.8212459761408826, + "grad_norm": 0.5557363629341125, + "learning_rate": 4.715016095436471e-06, + "loss": 0.0967, + "step": 40360 + }, + { + "epoch": 3.822192766521492, + "grad_norm": 0.5122710466384888, + "learning_rate": 4.711228933914031e-06, + "loss": 0.0988, + "step": 40370 + }, + { + "epoch": 3.8231395569021016, + "grad_norm": 0.4322662353515625, + "learning_rate": 4.707441772391593e-06, + "loss": 0.1034, + "step": 40380 + }, + { + "epoch": 3.8240863472827114, + "grad_norm": 0.4525582492351532, + "learning_rate": 4.703654610869154e-06, + "loss": 0.0991, + "step": 40390 + }, + { + "epoch": 3.825033137663321, + "grad_norm": 0.473119854927063, + "learning_rate": 4.699867449346715e-06, + "loss": 0.0967, + "step": 40400 + }, + { + "epoch": 3.825979928043931, + "grad_norm": 0.43417078256607056, + "learning_rate": 4.6960802878242765e-06, + "loss": 0.0953, + "step": 40410 + }, + { + "epoch": 3.8269267184245406, + "grad_norm": 0.4016440510749817, + "learning_rate": 4.6922931263018374e-06, + "loss": 0.0943, + "step": 40420 + }, + { + "epoch": 3.8278735088051503, + "grad_norm": 0.4091476798057556, + "learning_rate": 4.688505964779398e-06, + "loss": 0.0974, + "step": 40430 + }, + { + "epoch": 3.82882029918576, + "grad_norm": 0.44425806403160095, + "learning_rate": 4.684718803256959e-06, + "loss": 0.0925, + "step": 40440 + }, + { + "epoch": 3.82976708956637, + "grad_norm": 0.4543251693248749, + "learning_rate": 4.68093164173452e-06, + "loss": 0.0906, + "step": 40450 + }, + { + "epoch": 3.8307138799469795, + "grad_norm": 0.4300362169742584, + "learning_rate": 4.677144480212081e-06, + "loss": 0.0908, + "step": 40460 + }, + { + "epoch": 3.8316606703275893, + "grad_norm": 0.5083745718002319, + "learning_rate": 4.673357318689643e-06, + "loss": 0.0934, + "step": 40470 + }, + { + "epoch": 3.832607460708199, + "grad_norm": 0.45083117485046387, + "learning_rate": 4.669570157167204e-06, + "loss": 0.0906, + "step": 40480 + }, + { + "epoch": 3.8335542510888088, + "grad_norm": 0.43083450198173523, + "learning_rate": 4.6657829956447646e-06, + "loss": 0.0904, + "step": 40490 + }, + { + "epoch": 3.8345010414694185, + "grad_norm": 0.48674798011779785, + "learning_rate": 4.661995834122326e-06, + "loss": 0.0954, + "step": 40500 + }, + { + "epoch": 3.8354478318500282, + "grad_norm": 0.5845592021942139, + "learning_rate": 4.658208672599886e-06, + "loss": 0.1006, + "step": 40510 + }, + { + "epoch": 3.836394622230638, + "grad_norm": 0.4620465934276581, + "learning_rate": 4.654421511077447e-06, + "loss": 0.0905, + "step": 40520 + }, + { + "epoch": 3.8373414126112477, + "grad_norm": 0.5082583427429199, + "learning_rate": 4.650634349555009e-06, + "loss": 0.1009, + "step": 40530 + }, + { + "epoch": 3.8382882029918575, + "grad_norm": 0.4657653272151947, + "learning_rate": 4.64684718803257e-06, + "loss": 0.0937, + "step": 40540 + }, + { + "epoch": 3.839234993372467, + "grad_norm": 0.4716366231441498, + "learning_rate": 4.643060026510131e-06, + "loss": 0.0924, + "step": 40550 + }, + { + "epoch": 3.840181783753077, + "grad_norm": 0.5509023666381836, + "learning_rate": 4.6392728649876925e-06, + "loss": 0.0934, + "step": 40560 + }, + { + "epoch": 3.8411285741336867, + "grad_norm": 0.4932687282562256, + "learning_rate": 4.6354857034652534e-06, + "loss": 0.0921, + "step": 40570 + }, + { + "epoch": 3.8420753645142964, + "grad_norm": 0.5577378869056702, + "learning_rate": 4.631698541942814e-06, + "loss": 0.0957, + "step": 40580 + }, + { + "epoch": 3.843022154894906, + "grad_norm": 0.4173696041107178, + "learning_rate": 4.627911380420375e-06, + "loss": 0.0981, + "step": 40590 + }, + { + "epoch": 3.843968945275516, + "grad_norm": 0.5011758208274841, + "learning_rate": 4.624124218897936e-06, + "loss": 0.0972, + "step": 40600 + }, + { + "epoch": 3.8449157356561257, + "grad_norm": 0.4696127772331238, + "learning_rate": 4.620337057375497e-06, + "loss": 0.0893, + "step": 40610 + }, + { + "epoch": 3.8458625260367354, + "grad_norm": 0.4647819697856903, + "learning_rate": 4.616549895853059e-06, + "loss": 0.0876, + "step": 40620 + }, + { + "epoch": 3.846809316417345, + "grad_norm": 0.4026613235473633, + "learning_rate": 4.61276273433062e-06, + "loss": 0.0959, + "step": 40630 + }, + { + "epoch": 3.847756106797955, + "grad_norm": 0.5512098670005798, + "learning_rate": 4.6089755728081806e-06, + "loss": 0.0947, + "step": 40640 + }, + { + "epoch": 3.8487028971785646, + "grad_norm": 0.44850537180900574, + "learning_rate": 4.6051884112857414e-06, + "loss": 0.0863, + "step": 40650 + }, + { + "epoch": 3.8496496875591744, + "grad_norm": 0.4348522126674652, + "learning_rate": 4.601401249763302e-06, + "loss": 0.1017, + "step": 40660 + }, + { + "epoch": 3.850596477939784, + "grad_norm": 0.5444629192352295, + "learning_rate": 4.597614088240864e-06, + "loss": 0.0927, + "step": 40670 + }, + { + "epoch": 3.851543268320394, + "grad_norm": 0.5211396813392639, + "learning_rate": 4.593826926718425e-06, + "loss": 0.1035, + "step": 40680 + }, + { + "epoch": 3.8524900587010036, + "grad_norm": 0.4562847912311554, + "learning_rate": 4.590039765195986e-06, + "loss": 0.089, + "step": 40690 + }, + { + "epoch": 3.8534368490816133, + "grad_norm": 0.46698418259620667, + "learning_rate": 4.586252603673547e-06, + "loss": 0.1002, + "step": 40700 + }, + { + "epoch": 3.854383639462223, + "grad_norm": 0.421149879693985, + "learning_rate": 4.5824654421511085e-06, + "loss": 0.0984, + "step": 40710 + }, + { + "epoch": 3.855330429842833, + "grad_norm": 0.4744303822517395, + "learning_rate": 4.5786782806286686e-06, + "loss": 0.0939, + "step": 40720 + }, + { + "epoch": 3.8562772202234425, + "grad_norm": 0.498382568359375, + "learning_rate": 4.57489111910623e-06, + "loss": 0.1012, + "step": 40730 + }, + { + "epoch": 3.8572240106040523, + "grad_norm": 0.4662265479564667, + "learning_rate": 4.571103957583791e-06, + "loss": 0.1025, + "step": 40740 + }, + { + "epoch": 3.858170800984662, + "grad_norm": 0.4596439003944397, + "learning_rate": 4.567316796061352e-06, + "loss": 0.1004, + "step": 40750 + }, + { + "epoch": 3.8591175913652718, + "grad_norm": 0.4291379153728485, + "learning_rate": 4.563529634538914e-06, + "loss": 0.0911, + "step": 40760 + }, + { + "epoch": 3.8600643817458815, + "grad_norm": 0.5993068218231201, + "learning_rate": 4.559742473016475e-06, + "loss": 0.1015, + "step": 40770 + }, + { + "epoch": 3.8610111721264913, + "grad_norm": 0.4954267740249634, + "learning_rate": 4.555955311494036e-06, + "loss": 0.0964, + "step": 40780 + }, + { + "epoch": 3.861957962507101, + "grad_norm": 0.5256617069244385, + "learning_rate": 4.5521681499715965e-06, + "loss": 0.1025, + "step": 40790 + }, + { + "epoch": 3.8629047528877107, + "grad_norm": 0.4482620060443878, + "learning_rate": 4.5483809884491574e-06, + "loss": 0.092, + "step": 40800 + }, + { + "epoch": 3.8638515432683205, + "grad_norm": 0.4422076344490051, + "learning_rate": 4.544593826926718e-06, + "loss": 0.0924, + "step": 40810 + }, + { + "epoch": 3.86479833364893, + "grad_norm": 0.4300210773944855, + "learning_rate": 4.54080666540428e-06, + "loss": 0.0921, + "step": 40820 + }, + { + "epoch": 3.86574512402954, + "grad_norm": 0.6585009694099426, + "learning_rate": 4.537019503881841e-06, + "loss": 0.0913, + "step": 40830 + }, + { + "epoch": 3.8666919144101497, + "grad_norm": 0.4743376672267914, + "learning_rate": 4.533232342359402e-06, + "loss": 0.0921, + "step": 40840 + }, + { + "epoch": 3.8676387047907594, + "grad_norm": 0.49737921357154846, + "learning_rate": 4.529445180836964e-06, + "loss": 0.1085, + "step": 40850 + }, + { + "epoch": 3.868585495171369, + "grad_norm": 0.49208909273147583, + "learning_rate": 4.525658019314524e-06, + "loss": 0.0999, + "step": 40860 + }, + { + "epoch": 3.869532285551979, + "grad_norm": 0.5970167517662048, + "learning_rate": 4.521870857792085e-06, + "loss": 0.1033, + "step": 40870 + }, + { + "epoch": 3.8704790759325887, + "grad_norm": 0.5238867998123169, + "learning_rate": 4.518083696269646e-06, + "loss": 0.0958, + "step": 40880 + }, + { + "epoch": 3.8714258663131984, + "grad_norm": 0.4827011227607727, + "learning_rate": 4.514296534747207e-06, + "loss": 0.1031, + "step": 40890 + }, + { + "epoch": 3.872372656693808, + "grad_norm": 0.5186099410057068, + "learning_rate": 4.510509373224768e-06, + "loss": 0.1, + "step": 40900 + }, + { + "epoch": 3.873319447074418, + "grad_norm": 0.41698890924453735, + "learning_rate": 4.50672221170233e-06, + "loss": 0.0982, + "step": 40910 + }, + { + "epoch": 3.8742662374550276, + "grad_norm": 0.5530847311019897, + "learning_rate": 4.502935050179891e-06, + "loss": 0.0983, + "step": 40920 + }, + { + "epoch": 3.8752130278356374, + "grad_norm": 0.519140899181366, + "learning_rate": 4.499147888657452e-06, + "loss": 0.0957, + "step": 40930 + }, + { + "epoch": 3.876159818216247, + "grad_norm": 0.46212148666381836, + "learning_rate": 4.4953607271350125e-06, + "loss": 0.0904, + "step": 40940 + }, + { + "epoch": 3.877106608596857, + "grad_norm": 0.5408146381378174, + "learning_rate": 4.4915735656125734e-06, + "loss": 0.1085, + "step": 40950 + }, + { + "epoch": 3.8780533989774666, + "grad_norm": 0.5476594567298889, + "learning_rate": 4.487786404090135e-06, + "loss": 0.0984, + "step": 40960 + }, + { + "epoch": 3.8790001893580763, + "grad_norm": 0.45062705874443054, + "learning_rate": 4.483999242567696e-06, + "loss": 0.0981, + "step": 40970 + }, + { + "epoch": 3.879946979738686, + "grad_norm": 0.47842487692832947, + "learning_rate": 4.480212081045257e-06, + "loss": 0.0979, + "step": 40980 + }, + { + "epoch": 3.880893770119296, + "grad_norm": 0.47004956007003784, + "learning_rate": 4.476424919522818e-06, + "loss": 0.0996, + "step": 40990 + }, + { + "epoch": 3.8818405604999056, + "grad_norm": 0.5024536848068237, + "learning_rate": 4.47263775800038e-06, + "loss": 0.0903, + "step": 41000 + }, + { + "epoch": 3.8827873508805153, + "grad_norm": 0.4794340133666992, + "learning_rate": 4.46885059647794e-06, + "loss": 0.0961, + "step": 41010 + }, + { + "epoch": 3.883734141261125, + "grad_norm": 0.4689330756664276, + "learning_rate": 4.465063434955501e-06, + "loss": 0.1038, + "step": 41020 + }, + { + "epoch": 3.884680931641735, + "grad_norm": 0.6886982321739197, + "learning_rate": 4.461276273433062e-06, + "loss": 0.1095, + "step": 41030 + }, + { + "epoch": 3.8856277220223445, + "grad_norm": 0.48080509901046753, + "learning_rate": 4.457489111910623e-06, + "loss": 0.0917, + "step": 41040 + }, + { + "epoch": 3.8865745124029543, + "grad_norm": 0.4674771726131439, + "learning_rate": 4.453701950388185e-06, + "loss": 0.0889, + "step": 41050 + }, + { + "epoch": 3.8875213027835636, + "grad_norm": 0.6061664819717407, + "learning_rate": 4.449914788865746e-06, + "loss": 0.0952, + "step": 41060 + }, + { + "epoch": 3.8884680931641733, + "grad_norm": 0.5249764323234558, + "learning_rate": 4.446127627343307e-06, + "loss": 0.1027, + "step": 41070 + }, + { + "epoch": 3.889414883544783, + "grad_norm": 0.5209566354751587, + "learning_rate": 4.442340465820868e-06, + "loss": 0.0985, + "step": 41080 + }, + { + "epoch": 3.890361673925393, + "grad_norm": 0.6610578894615173, + "learning_rate": 4.4385533042984285e-06, + "loss": 0.1029, + "step": 41090 + }, + { + "epoch": 3.8913084643060025, + "grad_norm": 0.46757304668426514, + "learning_rate": 4.434766142775989e-06, + "loss": 0.0956, + "step": 41100 + }, + { + "epoch": 3.8922552546866123, + "grad_norm": 0.47179147601127625, + "learning_rate": 4.430978981253551e-06, + "loss": 0.106, + "step": 41110 + }, + { + "epoch": 3.893202045067222, + "grad_norm": 0.6236637830734253, + "learning_rate": 4.427191819731112e-06, + "loss": 0.1017, + "step": 41120 + }, + { + "epoch": 3.8941488354478317, + "grad_norm": 0.48480746150016785, + "learning_rate": 4.423404658208673e-06, + "loss": 0.0922, + "step": 41130 + }, + { + "epoch": 3.8950956258284415, + "grad_norm": 0.4346686601638794, + "learning_rate": 4.419617496686235e-06, + "loss": 0.0944, + "step": 41140 + }, + { + "epoch": 3.8960424162090512, + "grad_norm": 0.39827197790145874, + "learning_rate": 4.415830335163795e-06, + "loss": 0.0913, + "step": 41150 + }, + { + "epoch": 3.896989206589661, + "grad_norm": 0.395149827003479, + "learning_rate": 4.412043173641356e-06, + "loss": 0.0964, + "step": 41160 + }, + { + "epoch": 3.8979359969702707, + "grad_norm": 0.47335392236709595, + "learning_rate": 4.408256012118917e-06, + "loss": 0.0892, + "step": 41170 + }, + { + "epoch": 3.8988827873508805, + "grad_norm": 0.47912123799324036, + "learning_rate": 4.404468850596478e-06, + "loss": 0.0965, + "step": 41180 + }, + { + "epoch": 3.89982957773149, + "grad_norm": 0.4129624664783478, + "learning_rate": 4.400681689074039e-06, + "loss": 0.0887, + "step": 41190 + }, + { + "epoch": 3.9007763681121, + "grad_norm": 0.5240882635116577, + "learning_rate": 4.396894527551601e-06, + "loss": 0.0967, + "step": 41200 + }, + { + "epoch": 3.9017231584927097, + "grad_norm": 0.5136657953262329, + "learning_rate": 4.393107366029162e-06, + "loss": 0.0956, + "step": 41210 + }, + { + "epoch": 3.9026699488733194, + "grad_norm": 0.5962952971458435, + "learning_rate": 4.389320204506723e-06, + "loss": 0.0984, + "step": 41220 + }, + { + "epoch": 3.903616739253929, + "grad_norm": 0.37900274991989136, + "learning_rate": 4.385533042984284e-06, + "loss": 0.092, + "step": 41230 + }, + { + "epoch": 3.904563529634539, + "grad_norm": 0.5577122569084167, + "learning_rate": 4.3817458814618445e-06, + "loss": 0.1054, + "step": 41240 + }, + { + "epoch": 3.9055103200151486, + "grad_norm": 0.4894963800907135, + "learning_rate": 4.377958719939405e-06, + "loss": 0.1018, + "step": 41250 + }, + { + "epoch": 3.9064571103957584, + "grad_norm": 0.43864747881889343, + "learning_rate": 4.374171558416967e-06, + "loss": 0.0965, + "step": 41260 + }, + { + "epoch": 3.907403900776368, + "grad_norm": 0.45537394285202026, + "learning_rate": 4.370384396894528e-06, + "loss": 0.0887, + "step": 41270 + }, + { + "epoch": 3.908350691156978, + "grad_norm": 0.5147911310195923, + "learning_rate": 4.366597235372089e-06, + "loss": 0.0912, + "step": 41280 + }, + { + "epoch": 3.9092974815375876, + "grad_norm": 0.5520839691162109, + "learning_rate": 4.36281007384965e-06, + "loss": 0.099, + "step": 41290 + }, + { + "epoch": 3.9102442719181973, + "grad_norm": 0.5957738757133484, + "learning_rate": 4.359022912327211e-06, + "loss": 0.1005, + "step": 41300 + }, + { + "epoch": 3.911191062298807, + "grad_norm": 0.5196476578712463, + "learning_rate": 4.3552357508047725e-06, + "loss": 0.1033, + "step": 41310 + }, + { + "epoch": 3.912137852679417, + "grad_norm": 0.59003746509552, + "learning_rate": 4.351448589282333e-06, + "loss": 0.0987, + "step": 41320 + }, + { + "epoch": 3.9130846430600266, + "grad_norm": 0.5245180726051331, + "learning_rate": 4.347661427759894e-06, + "loss": 0.0974, + "step": 41330 + }, + { + "epoch": 3.9140314334406363, + "grad_norm": 0.5251783728599548, + "learning_rate": 4.343874266237455e-06, + "loss": 0.0985, + "step": 41340 + }, + { + "epoch": 3.914978223821246, + "grad_norm": 0.46796226501464844, + "learning_rate": 4.340087104715017e-06, + "loss": 0.1035, + "step": 41350 + }, + { + "epoch": 3.915925014201856, + "grad_norm": 0.5118393898010254, + "learning_rate": 4.336299943192577e-06, + "loss": 0.0924, + "step": 41360 + }, + { + "epoch": 3.9168718045824655, + "grad_norm": 0.5003322958946228, + "learning_rate": 4.332512781670139e-06, + "loss": 0.0982, + "step": 41370 + }, + { + "epoch": 3.9178185949630753, + "grad_norm": 0.6285452246665955, + "learning_rate": 4.3287256201477e-06, + "loss": 0.0941, + "step": 41380 + }, + { + "epoch": 3.918765385343685, + "grad_norm": 0.4589633345603943, + "learning_rate": 4.3249384586252605e-06, + "loss": 0.0941, + "step": 41390 + }, + { + "epoch": 3.9197121757242948, + "grad_norm": 0.48504549264907837, + "learning_rate": 4.321151297102822e-06, + "loss": 0.0986, + "step": 41400 + }, + { + "epoch": 3.9206589661049045, + "grad_norm": 0.5638332366943359, + "learning_rate": 4.317364135580383e-06, + "loss": 0.1092, + "step": 41410 + }, + { + "epoch": 3.9216057564855142, + "grad_norm": 0.42241472005844116, + "learning_rate": 4.313576974057944e-06, + "loss": 0.1062, + "step": 41420 + }, + { + "epoch": 3.922552546866124, + "grad_norm": 0.46967965364456177, + "learning_rate": 4.309789812535505e-06, + "loss": 0.0994, + "step": 41430 + }, + { + "epoch": 3.9234993372467337, + "grad_norm": 0.4713064432144165, + "learning_rate": 4.306002651013066e-06, + "loss": 0.1011, + "step": 41440 + }, + { + "epoch": 3.9244461276273435, + "grad_norm": 0.5808030366897583, + "learning_rate": 4.302215489490627e-06, + "loss": 0.0967, + "step": 41450 + }, + { + "epoch": 3.925392918007953, + "grad_norm": 0.5431656241416931, + "learning_rate": 4.2984283279681885e-06, + "loss": 0.0967, + "step": 41460 + }, + { + "epoch": 3.9263397083885625, + "grad_norm": 0.5076019167900085, + "learning_rate": 4.294641166445749e-06, + "loss": 0.0957, + "step": 41470 + }, + { + "epoch": 3.9272864987691722, + "grad_norm": 0.6491959691047668, + "learning_rate": 4.29085400492331e-06, + "loss": 0.0995, + "step": 41480 + }, + { + "epoch": 3.928233289149782, + "grad_norm": 0.46067243814468384, + "learning_rate": 4.287066843400872e-06, + "loss": 0.0989, + "step": 41490 + }, + { + "epoch": 3.9291800795303917, + "grad_norm": 0.5454583764076233, + "learning_rate": 4.283279681878432e-06, + "loss": 0.1021, + "step": 41500 + }, + { + "epoch": 3.9301268699110015, + "grad_norm": 0.6414952278137207, + "learning_rate": 4.279492520355994e-06, + "loss": 0.0944, + "step": 41510 + }, + { + "epoch": 3.931073660291611, + "grad_norm": 0.4030686914920807, + "learning_rate": 4.275705358833555e-06, + "loss": 0.0901, + "step": 41520 + }, + { + "epoch": 3.932020450672221, + "grad_norm": 0.4818320572376251, + "learning_rate": 4.271918197311116e-06, + "loss": 0.0925, + "step": 41530 + }, + { + "epoch": 3.9329672410528307, + "grad_norm": 0.5245049595832825, + "learning_rate": 4.2681310357886765e-06, + "loss": 0.0972, + "step": 41540 + }, + { + "epoch": 3.9339140314334404, + "grad_norm": 0.492740273475647, + "learning_rate": 4.264343874266238e-06, + "loss": 0.0995, + "step": 41550 + }, + { + "epoch": 3.93486082181405, + "grad_norm": 0.5344408750534058, + "learning_rate": 4.260556712743799e-06, + "loss": 0.0982, + "step": 41560 + }, + { + "epoch": 3.93580761219466, + "grad_norm": 0.4735774099826813, + "learning_rate": 4.25676955122136e-06, + "loss": 0.0895, + "step": 41570 + }, + { + "epoch": 3.9367544025752697, + "grad_norm": 0.4924392104148865, + "learning_rate": 4.252982389698921e-06, + "loss": 0.0966, + "step": 41580 + }, + { + "epoch": 3.9377011929558794, + "grad_norm": 0.4154967963695526, + "learning_rate": 4.249195228176482e-06, + "loss": 0.0941, + "step": 41590 + }, + { + "epoch": 3.938647983336489, + "grad_norm": 0.5360844731330872, + "learning_rate": 4.2454080666540436e-06, + "loss": 0.0954, + "step": 41600 + }, + { + "epoch": 3.939594773717099, + "grad_norm": 0.5082783102989197, + "learning_rate": 4.2416209051316045e-06, + "loss": 0.0958, + "step": 41610 + }, + { + "epoch": 3.9405415640977086, + "grad_norm": 0.5777152180671692, + "learning_rate": 4.237833743609165e-06, + "loss": 0.1034, + "step": 41620 + }, + { + "epoch": 3.9414883544783184, + "grad_norm": 0.40907520055770874, + "learning_rate": 4.234046582086726e-06, + "loss": 0.0997, + "step": 41630 + }, + { + "epoch": 3.942435144858928, + "grad_norm": 0.34812426567077637, + "learning_rate": 4.230259420564287e-06, + "loss": 0.0912, + "step": 41640 + }, + { + "epoch": 3.943381935239538, + "grad_norm": 0.5555279850959778, + "learning_rate": 4.226472259041848e-06, + "loss": 0.0933, + "step": 41650 + }, + { + "epoch": 3.9443287256201476, + "grad_norm": 0.45173120498657227, + "learning_rate": 4.22268509751941e-06, + "loss": 0.09, + "step": 41660 + }, + { + "epoch": 3.9452755160007573, + "grad_norm": 0.4487028121948242, + "learning_rate": 4.218897935996971e-06, + "loss": 0.0951, + "step": 41670 + }, + { + "epoch": 3.946222306381367, + "grad_norm": 0.4836462736129761, + "learning_rate": 4.215110774474532e-06, + "loss": 0.0978, + "step": 41680 + }, + { + "epoch": 3.947169096761977, + "grad_norm": 0.521132230758667, + "learning_rate": 4.211323612952093e-06, + "loss": 0.0945, + "step": 41690 + }, + { + "epoch": 3.9481158871425865, + "grad_norm": 0.6219900846481323, + "learning_rate": 4.207536451429654e-06, + "loss": 0.1016, + "step": 41700 + }, + { + "epoch": 3.9490626775231963, + "grad_norm": 0.4954199194908142, + "learning_rate": 4.203749289907214e-06, + "loss": 0.0984, + "step": 41710 + }, + { + "epoch": 3.950009467903806, + "grad_norm": 0.5494492053985596, + "learning_rate": 4.199962128384776e-06, + "loss": 0.0959, + "step": 41720 + }, + { + "epoch": 3.9509562582844158, + "grad_norm": 0.4500981569290161, + "learning_rate": 4.196174966862337e-06, + "loss": 0.0954, + "step": 41730 + }, + { + "epoch": 3.9519030486650255, + "grad_norm": 0.4959108531475067, + "learning_rate": 4.192387805339898e-06, + "loss": 0.1015, + "step": 41740 + }, + { + "epoch": 3.9528498390456352, + "grad_norm": 0.514580249786377, + "learning_rate": 4.1886006438174595e-06, + "loss": 0.1017, + "step": 41750 + }, + { + "epoch": 3.953796629426245, + "grad_norm": 0.6242176294326782, + "learning_rate": 4.1848134822950204e-06, + "loss": 0.1032, + "step": 41760 + }, + { + "epoch": 3.9547434198068547, + "grad_norm": 0.5583734512329102, + "learning_rate": 4.181026320772581e-06, + "loss": 0.0922, + "step": 41770 + }, + { + "epoch": 3.9556902101874645, + "grad_norm": 0.5744973421096802, + "learning_rate": 4.177239159250142e-06, + "loss": 0.1132, + "step": 41780 + }, + { + "epoch": 3.956637000568074, + "grad_norm": 0.48197564482688904, + "learning_rate": 4.173451997727703e-06, + "loss": 0.1009, + "step": 41790 + }, + { + "epoch": 3.957583790948684, + "grad_norm": 0.44193461537361145, + "learning_rate": 4.169664836205264e-06, + "loss": 0.089, + "step": 41800 + }, + { + "epoch": 3.9585305813292937, + "grad_norm": 0.5327918529510498, + "learning_rate": 4.165877674682826e-06, + "loss": 0.0966, + "step": 41810 + }, + { + "epoch": 3.9594773717099034, + "grad_norm": 0.5483490824699402, + "learning_rate": 4.162090513160387e-06, + "loss": 0.1069, + "step": 41820 + }, + { + "epoch": 3.960424162090513, + "grad_norm": 0.4684647023677826, + "learning_rate": 4.1583033516379476e-06, + "loss": 0.0917, + "step": 41830 + }, + { + "epoch": 3.961370952471123, + "grad_norm": 0.5732463598251343, + "learning_rate": 4.154516190115509e-06, + "loss": 0.0979, + "step": 41840 + }, + { + "epoch": 3.9623177428517327, + "grad_norm": 0.4299609959125519, + "learning_rate": 4.150729028593069e-06, + "loss": 0.0904, + "step": 41850 + }, + { + "epoch": 3.9632645332323424, + "grad_norm": 0.5590096116065979, + "learning_rate": 4.146941867070631e-06, + "loss": 0.0963, + "step": 41860 + }, + { + "epoch": 3.964211323612952, + "grad_norm": 0.47193509340286255, + "learning_rate": 4.143154705548192e-06, + "loss": 0.1036, + "step": 41870 + }, + { + "epoch": 3.965158113993562, + "grad_norm": 0.4498583674430847, + "learning_rate": 4.139367544025753e-06, + "loss": 0.0872, + "step": 41880 + }, + { + "epoch": 3.9661049043741716, + "grad_norm": 0.5629134178161621, + "learning_rate": 4.135580382503314e-06, + "loss": 0.0999, + "step": 41890 + }, + { + "epoch": 3.9670516947547814, + "grad_norm": 0.5969022512435913, + "learning_rate": 4.1317932209808755e-06, + "loss": 0.1046, + "step": 41900 + }, + { + "epoch": 3.967998485135391, + "grad_norm": 0.7509692311286926, + "learning_rate": 4.1280060594584364e-06, + "loss": 0.1002, + "step": 41910 + }, + { + "epoch": 3.968945275516001, + "grad_norm": 0.5245600938796997, + "learning_rate": 4.124218897935997e-06, + "loss": 0.1006, + "step": 41920 + }, + { + "epoch": 3.9698920658966106, + "grad_norm": 0.5322967767715454, + "learning_rate": 4.120431736413558e-06, + "loss": 0.0946, + "step": 41930 + }, + { + "epoch": 3.9708388562772203, + "grad_norm": 0.3742385804653168, + "learning_rate": 4.116644574891119e-06, + "loss": 0.0973, + "step": 41940 + }, + { + "epoch": 3.97178564665783, + "grad_norm": 0.7984851002693176, + "learning_rate": 4.112857413368681e-06, + "loss": 0.1026, + "step": 41950 + }, + { + "epoch": 3.97273243703844, + "grad_norm": 0.5663356184959412, + "learning_rate": 4.109070251846242e-06, + "loss": 0.0899, + "step": 41960 + }, + { + "epoch": 3.9736792274190496, + "grad_norm": 0.596892237663269, + "learning_rate": 4.105283090323803e-06, + "loss": 0.0942, + "step": 41970 + }, + { + "epoch": 3.9746260177996593, + "grad_norm": 0.4649079144001007, + "learning_rate": 4.1014959288013636e-06, + "loss": 0.1016, + "step": 41980 + }, + { + "epoch": 3.975572808180269, + "grad_norm": 0.4916474521160126, + "learning_rate": 4.0977087672789245e-06, + "loss": 0.0997, + "step": 41990 + }, + { + "epoch": 3.9765195985608788, + "grad_norm": 0.5276491641998291, + "learning_rate": 4.093921605756485e-06, + "loss": 0.0989, + "step": 42000 + }, + { + "epoch": 3.9774663889414885, + "grad_norm": 0.4202950596809387, + "learning_rate": 4.090134444234047e-06, + "loss": 0.0949, + "step": 42010 + }, + { + "epoch": 3.9784131793220983, + "grad_norm": 0.4367360770702362, + "learning_rate": 4.086347282711608e-06, + "loss": 0.0954, + "step": 42020 + }, + { + "epoch": 3.979359969702708, + "grad_norm": 0.4967721700668335, + "learning_rate": 4.082560121189169e-06, + "loss": 0.1072, + "step": 42030 + }, + { + "epoch": 3.9803067600833177, + "grad_norm": 0.4512579143047333, + "learning_rate": 4.078772959666731e-06, + "loss": 0.0905, + "step": 42040 + }, + { + "epoch": 3.9812535504639275, + "grad_norm": 0.5061561465263367, + "learning_rate": 4.0749857981442915e-06, + "loss": 0.0954, + "step": 42050 + }, + { + "epoch": 3.9822003408445372, + "grad_norm": 0.4698218107223511, + "learning_rate": 4.0711986366218516e-06, + "loss": 0.1007, + "step": 42060 + }, + { + "epoch": 3.983147131225147, + "grad_norm": 0.6213154196739197, + "learning_rate": 4.067411475099413e-06, + "loss": 0.0922, + "step": 42070 + }, + { + "epoch": 3.9840939216057567, + "grad_norm": 0.5239683985710144, + "learning_rate": 4.063624313576974e-06, + "loss": 0.0936, + "step": 42080 + }, + { + "epoch": 3.9850407119863664, + "grad_norm": 0.4962012469768524, + "learning_rate": 4.059837152054535e-06, + "loss": 0.1056, + "step": 42090 + }, + { + "epoch": 3.985987502366976, + "grad_norm": 0.3595285415649414, + "learning_rate": 4.056049990532097e-06, + "loss": 0.0968, + "step": 42100 + }, + { + "epoch": 3.986934292747586, + "grad_norm": 0.40510043501853943, + "learning_rate": 4.052262829009658e-06, + "loss": 0.0974, + "step": 42110 + }, + { + "epoch": 3.9878810831281957, + "grad_norm": 0.6077454090118408, + "learning_rate": 4.048475667487219e-06, + "loss": 0.0937, + "step": 42120 + }, + { + "epoch": 3.9888278735088054, + "grad_norm": 0.5463760495185852, + "learning_rate": 4.0446885059647795e-06, + "loss": 0.0941, + "step": 42130 + }, + { + "epoch": 3.989774663889415, + "grad_norm": 0.548797070980072, + "learning_rate": 4.0409013444423404e-06, + "loss": 0.0925, + "step": 42140 + }, + { + "epoch": 3.990721454270025, + "grad_norm": 0.5181418657302856, + "learning_rate": 4.037114182919901e-06, + "loss": 0.1075, + "step": 42150 + }, + { + "epoch": 3.991668244650634, + "grad_norm": 0.49558815360069275, + "learning_rate": 4.033327021397463e-06, + "loss": 0.0938, + "step": 42160 + }, + { + "epoch": 3.992615035031244, + "grad_norm": 0.4663916230201721, + "learning_rate": 4.029539859875024e-06, + "loss": 0.0906, + "step": 42170 + }, + { + "epoch": 3.9935618254118537, + "grad_norm": 0.5485295057296753, + "learning_rate": 4.025752698352585e-06, + "loss": 0.0894, + "step": 42180 + }, + { + "epoch": 3.9945086157924634, + "grad_norm": 0.6362839341163635, + "learning_rate": 4.021965536830147e-06, + "loss": 0.0995, + "step": 42190 + }, + { + "epoch": 3.995455406173073, + "grad_norm": 0.499687522649765, + "learning_rate": 4.018178375307707e-06, + "loss": 0.0889, + "step": 42200 + }, + { + "epoch": 3.996402196553683, + "grad_norm": 0.4681762158870697, + "learning_rate": 4.014391213785268e-06, + "loss": 0.0959, + "step": 42210 + }, + { + "epoch": 3.9973489869342926, + "grad_norm": 0.400115430355072, + "learning_rate": 4.010604052262829e-06, + "loss": 0.0889, + "step": 42220 + }, + { + "epoch": 3.9982957773149024, + "grad_norm": 0.5818225145339966, + "learning_rate": 4.00681689074039e-06, + "loss": 0.0921, + "step": 42230 + }, + { + "epoch": 3.999242567695512, + "grad_norm": 0.4878033995628357, + "learning_rate": 4.003029729217951e-06, + "loss": 0.1001, + "step": 42240 + }, + { + "epoch": 4.0, + "eval_f1_micro": 0.3810792804796802, + "eval_loss": 0.11376741528511047, + "eval_precision": 0.5667181865018032, + "eval_recall": 0.28705060521508724, + "eval_runtime": 344.9502, + "eval_samples_per_second": 122.467, + "eval_steps_per_second": 7.656, + "step": 42248 + }, + { + "epoch": 4.000189358076122, + "grad_norm": 0.49115651845932007, + "learning_rate": 3.999242567695513e-06, + "loss": 0.0971, + "step": 42250 + }, + { + "epoch": 4.001136148456732, + "grad_norm": 0.39359888434410095, + "learning_rate": 3.995455406173074e-06, + "loss": 0.0859, + "step": 42260 + }, + { + "epoch": 4.002082938837342, + "grad_norm": 0.5498154759407043, + "learning_rate": 3.991668244650635e-06, + "loss": 0.0983, + "step": 42270 + }, + { + "epoch": 4.0030297292179515, + "grad_norm": 0.3986906409263611, + "learning_rate": 3.9878810831281955e-06, + "loss": 0.087, + "step": 42280 + }, + { + "epoch": 4.003976519598561, + "grad_norm": 0.49595123529434204, + "learning_rate": 3.9840939216057564e-06, + "loss": 0.0904, + "step": 42290 + }, + { + "epoch": 4.004923309979171, + "grad_norm": 0.5356326103210449, + "learning_rate": 3.980306760083318e-06, + "loss": 0.0977, + "step": 42300 + }, + { + "epoch": 4.005870100359781, + "grad_norm": 0.4728410840034485, + "learning_rate": 3.976519598560879e-06, + "loss": 0.0972, + "step": 42310 + }, + { + "epoch": 4.0068168907403905, + "grad_norm": 0.5084603428840637, + "learning_rate": 3.97273243703844e-06, + "loss": 0.0911, + "step": 42320 + }, + { + "epoch": 4.007763681121, + "grad_norm": 0.4655855596065521, + "learning_rate": 3.968945275516001e-06, + "loss": 0.0995, + "step": 42330 + }, + { + "epoch": 4.00871047150161, + "grad_norm": 0.4194011092185974, + "learning_rate": 3.965158113993562e-06, + "loss": 0.0991, + "step": 42340 + }, + { + "epoch": 4.00965726188222, + "grad_norm": 0.42159825563430786, + "learning_rate": 3.961370952471123e-06, + "loss": 0.0902, + "step": 42350 + }, + { + "epoch": 4.010604052262829, + "grad_norm": 0.4718884825706482, + "learning_rate": 3.957583790948684e-06, + "loss": 0.0951, + "step": 42360 + }, + { + "epoch": 4.011550842643438, + "grad_norm": 0.4827013909816742, + "learning_rate": 3.953796629426245e-06, + "loss": 0.0913, + "step": 42370 + }, + { + "epoch": 4.012497633024048, + "grad_norm": 0.517824113368988, + "learning_rate": 3.950009467903806e-06, + "loss": 0.1011, + "step": 42380 + }, + { + "epoch": 4.013444423404658, + "grad_norm": 0.6215113401412964, + "learning_rate": 3.946222306381368e-06, + "loss": 0.09, + "step": 42390 + }, + { + "epoch": 4.0143912137852675, + "grad_norm": 0.39003387093544006, + "learning_rate": 3.942435144858929e-06, + "loss": 0.097, + "step": 42400 + }, + { + "epoch": 4.015338004165877, + "grad_norm": 0.6039709448814392, + "learning_rate": 3.93864798333649e-06, + "loss": 0.0943, + "step": 42410 + }, + { + "epoch": 4.016284794546487, + "grad_norm": 0.3276531398296356, + "learning_rate": 3.934860821814051e-06, + "loss": 0.0876, + "step": 42420 + }, + { + "epoch": 4.017231584927097, + "grad_norm": 0.5303285717964172, + "learning_rate": 3.9310736602916115e-06, + "loss": 0.0961, + "step": 42430 + }, + { + "epoch": 4.0181783753077065, + "grad_norm": 0.4780145585536957, + "learning_rate": 3.927286498769172e-06, + "loss": 0.1001, + "step": 42440 + }, + { + "epoch": 4.019125165688316, + "grad_norm": 0.4328151345252991, + "learning_rate": 3.923499337246734e-06, + "loss": 0.0844, + "step": 42450 + }, + { + "epoch": 4.020071956068926, + "grad_norm": 0.4836941957473755, + "learning_rate": 3.919712175724295e-06, + "loss": 0.0974, + "step": 42460 + }, + { + "epoch": 4.021018746449536, + "grad_norm": 0.42942121624946594, + "learning_rate": 3.915925014201856e-06, + "loss": 0.0867, + "step": 42470 + }, + { + "epoch": 4.0219655368301455, + "grad_norm": 0.5348581671714783, + "learning_rate": 3.912137852679418e-06, + "loss": 0.0902, + "step": 42480 + }, + { + "epoch": 4.022912327210755, + "grad_norm": 0.4421156644821167, + "learning_rate": 3.908350691156978e-06, + "loss": 0.0909, + "step": 42490 + }, + { + "epoch": 4.023859117591365, + "grad_norm": 0.4744870662689209, + "learning_rate": 3.9045635296345395e-06, + "loss": 0.0938, + "step": 42500 + }, + { + "epoch": 4.024805907971975, + "grad_norm": 0.4022984206676483, + "learning_rate": 3.9007763681121e-06, + "loss": 0.0963, + "step": 42510 + }, + { + "epoch": 4.025752698352584, + "grad_norm": 0.5710201859474182, + "learning_rate": 3.896989206589661e-06, + "loss": 0.085, + "step": 42520 + }, + { + "epoch": 4.026699488733194, + "grad_norm": 0.46365416049957275, + "learning_rate": 3.893202045067222e-06, + "loss": 0.0856, + "step": 42530 + }, + { + "epoch": 4.027646279113804, + "grad_norm": 0.5811256170272827, + "learning_rate": 3.889414883544784e-06, + "loss": 0.0884, + "step": 42540 + }, + { + "epoch": 4.028593069494414, + "grad_norm": 0.6684948205947876, + "learning_rate": 3.885627722022345e-06, + "loss": 0.0943, + "step": 42550 + }, + { + "epoch": 4.029539859875023, + "grad_norm": 0.5227164626121521, + "learning_rate": 3.881840560499906e-06, + "loss": 0.0913, + "step": 42560 + }, + { + "epoch": 4.030486650255633, + "grad_norm": 0.47879600524902344, + "learning_rate": 3.878053398977467e-06, + "loss": 0.0914, + "step": 42570 + }, + { + "epoch": 4.031433440636243, + "grad_norm": 0.48690658807754517, + "learning_rate": 3.8742662374550275e-06, + "loss": 0.0975, + "step": 42580 + }, + { + "epoch": 4.032380231016853, + "grad_norm": 0.44016072154045105, + "learning_rate": 3.870479075932589e-06, + "loss": 0.0899, + "step": 42590 + }, + { + "epoch": 4.033327021397462, + "grad_norm": 0.6146271824836731, + "learning_rate": 3.86669191441015e-06, + "loss": 0.0915, + "step": 42600 + }, + { + "epoch": 4.034273811778072, + "grad_norm": 0.46333184838294983, + "learning_rate": 3.862904752887711e-06, + "loss": 0.0877, + "step": 42610 + }, + { + "epoch": 4.035220602158682, + "grad_norm": 0.6286950707435608, + "learning_rate": 3.859117591365272e-06, + "loss": 0.0972, + "step": 42620 + }, + { + "epoch": 4.036167392539292, + "grad_norm": 0.4000542163848877, + "learning_rate": 3.855330429842833e-06, + "loss": 0.0875, + "step": 42630 + }, + { + "epoch": 4.037114182919901, + "grad_norm": 0.44887304306030273, + "learning_rate": 3.851543268320394e-06, + "loss": 0.0939, + "step": 42640 + }, + { + "epoch": 4.038060973300511, + "grad_norm": 0.4717208743095398, + "learning_rate": 3.8477561067979555e-06, + "loss": 0.0938, + "step": 42650 + }, + { + "epoch": 4.039007763681121, + "grad_norm": 0.5275483131408691, + "learning_rate": 3.843968945275516e-06, + "loss": 0.0834, + "step": 42660 + }, + { + "epoch": 4.0399545540617305, + "grad_norm": 0.5291388630867004, + "learning_rate": 3.840181783753077e-06, + "loss": 0.0809, + "step": 42670 + }, + { + "epoch": 4.04090134444234, + "grad_norm": 0.49929943680763245, + "learning_rate": 3.836394622230639e-06, + "loss": 0.0963, + "step": 42680 + }, + { + "epoch": 4.04184813482295, + "grad_norm": 0.65031498670578, + "learning_rate": 3.8326074607082e-06, + "loss": 0.0855, + "step": 42690 + }, + { + "epoch": 4.04279492520356, + "grad_norm": 0.4142380654811859, + "learning_rate": 3.82882029918576e-06, + "loss": 0.0999, + "step": 42700 + }, + { + "epoch": 4.0437417155841695, + "grad_norm": 0.4350908100605011, + "learning_rate": 3.825033137663322e-06, + "loss": 0.0862, + "step": 42710 + }, + { + "epoch": 4.044688505964779, + "grad_norm": 0.5821089744567871, + "learning_rate": 3.821245976140883e-06, + "loss": 0.0968, + "step": 42720 + }, + { + "epoch": 4.045635296345389, + "grad_norm": 0.5138930678367615, + "learning_rate": 3.8174588146184435e-06, + "loss": 0.0857, + "step": 42730 + }, + { + "epoch": 4.046582086725999, + "grad_norm": 0.4821796417236328, + "learning_rate": 3.813671653096005e-06, + "loss": 0.0989, + "step": 42740 + }, + { + "epoch": 4.0475288771066085, + "grad_norm": 0.41542673110961914, + "learning_rate": 3.809884491573566e-06, + "loss": 0.0864, + "step": 42750 + }, + { + "epoch": 4.048475667487218, + "grad_norm": 0.49766823649406433, + "learning_rate": 3.806097330051127e-06, + "loss": 0.0998, + "step": 42760 + }, + { + "epoch": 4.049422457867828, + "grad_norm": 0.4959116280078888, + "learning_rate": 3.802310168528688e-06, + "loss": 0.0874, + "step": 42770 + }, + { + "epoch": 4.050369248248438, + "grad_norm": 0.40544602274894714, + "learning_rate": 3.798523007006249e-06, + "loss": 0.0962, + "step": 42780 + }, + { + "epoch": 4.051316038629047, + "grad_norm": 0.33507898449897766, + "learning_rate": 3.79473584548381e-06, + "loss": 0.0804, + "step": 42790 + }, + { + "epoch": 4.052262829009657, + "grad_norm": 0.55490642786026, + "learning_rate": 3.7909486839613715e-06, + "loss": 0.0981, + "step": 42800 + }, + { + "epoch": 4.053209619390267, + "grad_norm": 0.5510737299919128, + "learning_rate": 3.7871615224389324e-06, + "loss": 0.088, + "step": 42810 + }, + { + "epoch": 4.054156409770877, + "grad_norm": 0.43902432918548584, + "learning_rate": 3.7833743609164937e-06, + "loss": 0.0914, + "step": 42820 + }, + { + "epoch": 4.055103200151486, + "grad_norm": 0.6550730466842651, + "learning_rate": 3.7795871993940546e-06, + "loss": 0.0906, + "step": 42830 + }, + { + "epoch": 4.056049990532096, + "grad_norm": 0.5181649923324585, + "learning_rate": 3.7758000378716155e-06, + "loss": 0.0989, + "step": 42840 + }, + { + "epoch": 4.056996780912706, + "grad_norm": 0.479004830121994, + "learning_rate": 3.7720128763491764e-06, + "loss": 0.0845, + "step": 42850 + }, + { + "epoch": 4.057943571293316, + "grad_norm": 0.565980076789856, + "learning_rate": 3.7682257148267377e-06, + "loss": 0.0969, + "step": 42860 + }, + { + "epoch": 4.058890361673925, + "grad_norm": 0.5291234254837036, + "learning_rate": 3.7644385533042986e-06, + "loss": 0.0919, + "step": 42870 + }, + { + "epoch": 4.059837152054535, + "grad_norm": 0.4307625889778137, + "learning_rate": 3.76065139178186e-06, + "loss": 0.0916, + "step": 42880 + }, + { + "epoch": 4.060783942435145, + "grad_norm": 0.5084070563316345, + "learning_rate": 3.7568642302594212e-06, + "loss": 0.0903, + "step": 42890 + }, + { + "epoch": 4.061730732815755, + "grad_norm": 0.4158044159412384, + "learning_rate": 3.753077068736982e-06, + "loss": 0.0969, + "step": 42900 + }, + { + "epoch": 4.062677523196364, + "grad_norm": 0.4868123233318329, + "learning_rate": 3.7492899072145426e-06, + "loss": 0.0872, + "step": 42910 + }, + { + "epoch": 4.063624313576974, + "grad_norm": 0.5304853320121765, + "learning_rate": 3.745502745692104e-06, + "loss": 0.0999, + "step": 42920 + }, + { + "epoch": 4.064571103957584, + "grad_norm": 0.5248091816902161, + "learning_rate": 3.7417155841696652e-06, + "loss": 0.0868, + "step": 42930 + }, + { + "epoch": 4.0655178943381935, + "grad_norm": 0.4787554144859314, + "learning_rate": 3.737928422647226e-06, + "loss": 0.0874, + "step": 42940 + }, + { + "epoch": 4.066464684718803, + "grad_norm": 0.5181282758712769, + "learning_rate": 3.7341412611247875e-06, + "loss": 0.088, + "step": 42950 + }, + { + "epoch": 4.067411475099413, + "grad_norm": 0.49244225025177, + "learning_rate": 3.7303540996023484e-06, + "loss": 0.0858, + "step": 42960 + }, + { + "epoch": 4.068358265480023, + "grad_norm": 0.6987329721450806, + "learning_rate": 3.7265669380799097e-06, + "loss": 0.0864, + "step": 42970 + }, + { + "epoch": 4.0693050558606325, + "grad_norm": 0.5377588868141174, + "learning_rate": 3.72277977655747e-06, + "loss": 0.0928, + "step": 42980 + }, + { + "epoch": 4.070251846241242, + "grad_norm": 0.4637696146965027, + "learning_rate": 3.7189926150350315e-06, + "loss": 0.0864, + "step": 42990 + }, + { + "epoch": 4.071198636621852, + "grad_norm": 0.43769657611846924, + "learning_rate": 3.7152054535125924e-06, + "loss": 0.085, + "step": 43000 + }, + { + "epoch": 4.072145427002462, + "grad_norm": 0.5931176543235779, + "learning_rate": 3.7114182919901537e-06, + "loss": 0.1043, + "step": 43010 + }, + { + "epoch": 4.0730922173830715, + "grad_norm": 0.45635858178138733, + "learning_rate": 3.707631130467715e-06, + "loss": 0.0967, + "step": 43020 + }, + { + "epoch": 4.074039007763681, + "grad_norm": 0.49210211634635925, + "learning_rate": 3.703843968945276e-06, + "loss": 0.0806, + "step": 43030 + }, + { + "epoch": 4.074985798144291, + "grad_norm": 0.6789916157722473, + "learning_rate": 3.7000568074228372e-06, + "loss": 0.0991, + "step": 43040 + }, + { + "epoch": 4.075932588524901, + "grad_norm": 0.44278761744499207, + "learning_rate": 3.6962696459003977e-06, + "loss": 0.0774, + "step": 43050 + }, + { + "epoch": 4.07687937890551, + "grad_norm": 0.5981815457344055, + "learning_rate": 3.692482484377959e-06, + "loss": 0.0935, + "step": 43060 + }, + { + "epoch": 4.07782616928612, + "grad_norm": 0.6542117595672607, + "learning_rate": 3.68869532285552e-06, + "loss": 0.098, + "step": 43070 + }, + { + "epoch": 4.07877295966673, + "grad_norm": 0.5480513572692871, + "learning_rate": 3.6849081613330812e-06, + "loss": 0.1004, + "step": 43080 + }, + { + "epoch": 4.07971975004734, + "grad_norm": 0.4663204848766327, + "learning_rate": 3.681120999810642e-06, + "loss": 0.0915, + "step": 43090 + }, + { + "epoch": 4.080666540427949, + "grad_norm": 0.5434608459472656, + "learning_rate": 3.6773338382882035e-06, + "loss": 0.0831, + "step": 43100 + }, + { + "epoch": 4.081613330808559, + "grad_norm": 0.44176313281059265, + "learning_rate": 3.6735466767657648e-06, + "loss": 0.0872, + "step": 43110 + }, + { + "epoch": 4.082560121189169, + "grad_norm": 0.5268293619155884, + "learning_rate": 3.6697595152433252e-06, + "loss": 0.0984, + "step": 43120 + }, + { + "epoch": 4.083506911569779, + "grad_norm": 0.6170424222946167, + "learning_rate": 3.665972353720886e-06, + "loss": 0.0946, + "step": 43130 + }, + { + "epoch": 4.084453701950388, + "grad_norm": 0.45425689220428467, + "learning_rate": 3.6621851921984475e-06, + "loss": 0.0731, + "step": 43140 + }, + { + "epoch": 4.085400492330998, + "grad_norm": 0.4029372036457062, + "learning_rate": 3.6583980306760088e-06, + "loss": 0.0899, + "step": 43150 + }, + { + "epoch": 4.086347282711608, + "grad_norm": 0.6184468865394592, + "learning_rate": 3.6546108691535697e-06, + "loss": 0.0877, + "step": 43160 + }, + { + "epoch": 4.087294073092218, + "grad_norm": 0.49466800689697266, + "learning_rate": 3.650823707631131e-06, + "loss": 0.0795, + "step": 43170 + }, + { + "epoch": 4.088240863472827, + "grad_norm": 0.625234842300415, + "learning_rate": 3.647036546108692e-06, + "loss": 0.0829, + "step": 43180 + }, + { + "epoch": 4.089187653853437, + "grad_norm": 0.5789068937301636, + "learning_rate": 3.643249384586253e-06, + "loss": 0.0903, + "step": 43190 + }, + { + "epoch": 4.090134444234047, + "grad_norm": 0.5432450175285339, + "learning_rate": 3.6394622230638137e-06, + "loss": 0.0932, + "step": 43200 + }, + { + "epoch": 4.091081234614657, + "grad_norm": 0.4496850371360779, + "learning_rate": 3.635675061541375e-06, + "loss": 0.1028, + "step": 43210 + }, + { + "epoch": 4.092028024995266, + "grad_norm": 0.5556248426437378, + "learning_rate": 3.631887900018936e-06, + "loss": 0.0884, + "step": 43220 + }, + { + "epoch": 4.092974815375876, + "grad_norm": 0.4554558992385864, + "learning_rate": 3.6281007384964972e-06, + "loss": 0.0915, + "step": 43230 + }, + { + "epoch": 4.093921605756486, + "grad_norm": 0.5424495339393616, + "learning_rate": 3.6243135769740585e-06, + "loss": 0.0902, + "step": 43240 + }, + { + "epoch": 4.0948683961370955, + "grad_norm": 0.5642138719558716, + "learning_rate": 3.6205264154516194e-06, + "loss": 0.0981, + "step": 43250 + }, + { + "epoch": 4.095815186517705, + "grad_norm": 0.45746803283691406, + "learning_rate": 3.6167392539291803e-06, + "loss": 0.085, + "step": 43260 + }, + { + "epoch": 4.096761976898315, + "grad_norm": 0.34574341773986816, + "learning_rate": 3.6129520924067412e-06, + "loss": 0.0862, + "step": 43270 + }, + { + "epoch": 4.097708767278925, + "grad_norm": 0.5801050662994385, + "learning_rate": 3.6091649308843026e-06, + "loss": 0.0977, + "step": 43280 + }, + { + "epoch": 4.0986555576595345, + "grad_norm": 0.4668993353843689, + "learning_rate": 3.6053777693618634e-06, + "loss": 0.0949, + "step": 43290 + }, + { + "epoch": 4.099602348040144, + "grad_norm": 0.4316027760505676, + "learning_rate": 3.6015906078394248e-06, + "loss": 0.0861, + "step": 43300 + }, + { + "epoch": 4.100549138420754, + "grad_norm": 0.4293339252471924, + "learning_rate": 3.5978034463169857e-06, + "loss": 0.0851, + "step": 43310 + }, + { + "epoch": 4.101495928801364, + "grad_norm": 0.5610676407814026, + "learning_rate": 3.594016284794547e-06, + "loss": 0.0951, + "step": 43320 + }, + { + "epoch": 4.1024427191819735, + "grad_norm": 0.5009633302688599, + "learning_rate": 3.5902291232721075e-06, + "loss": 0.0776, + "step": 43330 + }, + { + "epoch": 4.103389509562583, + "grad_norm": 0.4935799241065979, + "learning_rate": 3.5864419617496688e-06, + "loss": 0.0954, + "step": 43340 + }, + { + "epoch": 4.104336299943193, + "grad_norm": 0.5310682058334351, + "learning_rate": 3.58265480022723e-06, + "loss": 0.0908, + "step": 43350 + }, + { + "epoch": 4.105283090323803, + "grad_norm": 0.5049786567687988, + "learning_rate": 3.578867638704791e-06, + "loss": 0.0912, + "step": 43360 + }, + { + "epoch": 4.106229880704412, + "grad_norm": 0.5558485984802246, + "learning_rate": 3.5750804771823523e-06, + "loss": 0.0943, + "step": 43370 + }, + { + "epoch": 4.107176671085022, + "grad_norm": 0.4917635917663574, + "learning_rate": 3.571293315659913e-06, + "loss": 0.0899, + "step": 43380 + }, + { + "epoch": 4.108123461465632, + "grad_norm": 0.4739043712615967, + "learning_rate": 3.5675061541374745e-06, + "loss": 0.0873, + "step": 43390 + }, + { + "epoch": 4.109070251846242, + "grad_norm": 0.5204480886459351, + "learning_rate": 3.563718992615035e-06, + "loss": 0.1029, + "step": 43400 + }, + { + "epoch": 4.110017042226851, + "grad_norm": 0.4711787700653076, + "learning_rate": 3.5599318310925963e-06, + "loss": 0.0947, + "step": 43410 + }, + { + "epoch": 4.110963832607461, + "grad_norm": 0.5298397541046143, + "learning_rate": 3.5561446695701572e-06, + "loss": 0.0915, + "step": 43420 + }, + { + "epoch": 4.111910622988071, + "grad_norm": 0.6017003059387207, + "learning_rate": 3.5523575080477185e-06, + "loss": 0.0902, + "step": 43430 + }, + { + "epoch": 4.112857413368681, + "grad_norm": 0.4537605345249176, + "learning_rate": 3.54857034652528e-06, + "loss": 0.0843, + "step": 43440 + }, + { + "epoch": 4.1138042037492895, + "grad_norm": 0.5435331463813782, + "learning_rate": 3.5447831850028408e-06, + "loss": 0.0977, + "step": 43450 + }, + { + "epoch": 4.114750994129899, + "grad_norm": 0.5554385185241699, + "learning_rate": 3.540996023480402e-06, + "loss": 0.0895, + "step": 43460 + }, + { + "epoch": 4.115697784510509, + "grad_norm": 0.5493789315223694, + "learning_rate": 3.5372088619579626e-06, + "loss": 0.0863, + "step": 43470 + }, + { + "epoch": 4.116644574891119, + "grad_norm": 0.5053249597549438, + "learning_rate": 3.533421700435524e-06, + "loss": 0.0876, + "step": 43480 + }, + { + "epoch": 4.117591365271728, + "grad_norm": 0.5336869955062866, + "learning_rate": 3.5296345389130848e-06, + "loss": 0.0879, + "step": 43490 + }, + { + "epoch": 4.118538155652338, + "grad_norm": 0.5733828544616699, + "learning_rate": 3.525847377390646e-06, + "loss": 0.0983, + "step": 43500 + }, + { + "epoch": 4.119484946032948, + "grad_norm": 0.6210169792175293, + "learning_rate": 3.522060215868207e-06, + "loss": 0.0854, + "step": 43510 + }, + { + "epoch": 4.120431736413558, + "grad_norm": 0.4751220941543579, + "learning_rate": 3.5182730543457683e-06, + "loss": 0.0949, + "step": 43520 + }, + { + "epoch": 4.121378526794167, + "grad_norm": 0.5846556425094604, + "learning_rate": 3.5144858928233296e-06, + "loss": 0.0801, + "step": 43530 + }, + { + "epoch": 4.122325317174777, + "grad_norm": 0.5414957404136658, + "learning_rate": 3.51069873130089e-06, + "loss": 0.0845, + "step": 43540 + }, + { + "epoch": 4.123272107555387, + "grad_norm": 0.47205281257629395, + "learning_rate": 3.506911569778451e-06, + "loss": 0.0982, + "step": 43550 + }, + { + "epoch": 4.124218897935997, + "grad_norm": 0.5458006858825684, + "learning_rate": 3.5031244082560123e-06, + "loss": 0.0885, + "step": 43560 + }, + { + "epoch": 4.125165688316606, + "grad_norm": 0.5724362730979919, + "learning_rate": 3.4993372467335736e-06, + "loss": 0.086, + "step": 43570 + }, + { + "epoch": 4.126112478697216, + "grad_norm": 0.3912672996520996, + "learning_rate": 3.4955500852111345e-06, + "loss": 0.0897, + "step": 43580 + }, + { + "epoch": 4.127059269077826, + "grad_norm": 0.4875558316707611, + "learning_rate": 3.491762923688696e-06, + "loss": 0.0947, + "step": 43590 + }, + { + "epoch": 4.128006059458436, + "grad_norm": 0.46745261549949646, + "learning_rate": 3.4879757621662567e-06, + "loss": 0.0883, + "step": 43600 + }, + { + "epoch": 4.128952849839045, + "grad_norm": 0.6316899657249451, + "learning_rate": 3.4841886006438176e-06, + "loss": 0.1003, + "step": 43610 + }, + { + "epoch": 4.129899640219655, + "grad_norm": 0.44248083233833313, + "learning_rate": 3.4804014391213785e-06, + "loss": 0.0972, + "step": 43620 + }, + { + "epoch": 4.130846430600265, + "grad_norm": 0.6286366581916809, + "learning_rate": 3.47661427759894e-06, + "loss": 0.0857, + "step": 43630 + }, + { + "epoch": 4.1317932209808745, + "grad_norm": 0.46254751086235046, + "learning_rate": 3.4728271160765008e-06, + "loss": 0.0873, + "step": 43640 + }, + { + "epoch": 4.132740011361484, + "grad_norm": 0.591060221195221, + "learning_rate": 3.469039954554062e-06, + "loss": 0.0979, + "step": 43650 + }, + { + "epoch": 4.133686801742094, + "grad_norm": 0.5578216910362244, + "learning_rate": 3.4652527930316234e-06, + "loss": 0.1034, + "step": 43660 + }, + { + "epoch": 4.134633592122704, + "grad_norm": 0.7366402745246887, + "learning_rate": 3.4614656315091843e-06, + "loss": 0.1012, + "step": 43670 + }, + { + "epoch": 4.1355803825033135, + "grad_norm": 0.5355960130691528, + "learning_rate": 3.4576784699867448e-06, + "loss": 0.0971, + "step": 43680 + }, + { + "epoch": 4.136527172883923, + "grad_norm": 0.5104373693466187, + "learning_rate": 3.453891308464306e-06, + "loss": 0.0926, + "step": 43690 + }, + { + "epoch": 4.137473963264533, + "grad_norm": 0.48333603143692017, + "learning_rate": 3.4501041469418674e-06, + "loss": 0.099, + "step": 43700 + }, + { + "epoch": 4.138420753645143, + "grad_norm": 0.5138222575187683, + "learning_rate": 3.4463169854194283e-06, + "loss": 0.0983, + "step": 43710 + }, + { + "epoch": 4.1393675440257525, + "grad_norm": 0.49473053216934204, + "learning_rate": 3.4425298238969896e-06, + "loss": 0.0889, + "step": 43720 + }, + { + "epoch": 4.140314334406362, + "grad_norm": 0.6268186569213867, + "learning_rate": 3.4387426623745505e-06, + "loss": 0.0898, + "step": 43730 + }, + { + "epoch": 4.141261124786972, + "grad_norm": 0.4563818871974945, + "learning_rate": 3.434955500852112e-06, + "loss": 0.0878, + "step": 43740 + }, + { + "epoch": 4.142207915167582, + "grad_norm": 0.4766491651535034, + "learning_rate": 3.4311683393296723e-06, + "loss": 0.0866, + "step": 43750 + }, + { + "epoch": 4.143154705548191, + "grad_norm": 0.5177786946296692, + "learning_rate": 3.4273811778072336e-06, + "loss": 0.087, + "step": 43760 + }, + { + "epoch": 4.144101495928801, + "grad_norm": 0.5356361865997314, + "learning_rate": 3.4235940162847945e-06, + "loss": 0.0851, + "step": 43770 + }, + { + "epoch": 4.145048286309411, + "grad_norm": 0.5455963015556335, + "learning_rate": 3.419806854762356e-06, + "loss": 0.0932, + "step": 43780 + }, + { + "epoch": 4.145995076690021, + "grad_norm": 0.501149594783783, + "learning_rate": 3.416019693239917e-06, + "loss": 0.0876, + "step": 43790 + }, + { + "epoch": 4.14694186707063, + "grad_norm": 0.5715905427932739, + "learning_rate": 3.412232531717478e-06, + "loss": 0.0959, + "step": 43800 + }, + { + "epoch": 4.14788865745124, + "grad_norm": 0.5404168367385864, + "learning_rate": 3.4084453701950394e-06, + "loss": 0.0905, + "step": 43810 + }, + { + "epoch": 4.14883544783185, + "grad_norm": 0.4844471514225006, + "learning_rate": 3.4046582086726003e-06, + "loss": 0.0814, + "step": 43820 + }, + { + "epoch": 4.14978223821246, + "grad_norm": 0.4519600570201874, + "learning_rate": 3.400871047150161e-06, + "loss": 0.0831, + "step": 43830 + }, + { + "epoch": 4.150729028593069, + "grad_norm": 0.5062319040298462, + "learning_rate": 3.397083885627722e-06, + "loss": 0.0879, + "step": 43840 + }, + { + "epoch": 4.151675818973679, + "grad_norm": 0.4504513740539551, + "learning_rate": 3.3932967241052834e-06, + "loss": 0.0981, + "step": 43850 + }, + { + "epoch": 4.152622609354289, + "grad_norm": 0.6103249788284302, + "learning_rate": 3.3895095625828443e-06, + "loss": 0.0992, + "step": 43860 + }, + { + "epoch": 4.153569399734899, + "grad_norm": 0.5387426018714905, + "learning_rate": 3.3857224010604056e-06, + "loss": 0.0953, + "step": 43870 + }, + { + "epoch": 4.154516190115508, + "grad_norm": 0.5584139823913574, + "learning_rate": 3.381935239537967e-06, + "loss": 0.0872, + "step": 43880 + }, + { + "epoch": 4.155462980496118, + "grad_norm": 0.4367479383945465, + "learning_rate": 3.378148078015528e-06, + "loss": 0.0927, + "step": 43890 + }, + { + "epoch": 4.156409770876728, + "grad_norm": 0.5904217958450317, + "learning_rate": 3.3743609164930883e-06, + "loss": 0.0887, + "step": 43900 + }, + { + "epoch": 4.1573565612573375, + "grad_norm": 0.6446182131767273, + "learning_rate": 3.3705737549706496e-06, + "loss": 0.0908, + "step": 43910 + }, + { + "epoch": 4.158303351637947, + "grad_norm": 0.38347458839416504, + "learning_rate": 3.366786593448211e-06, + "loss": 0.0941, + "step": 43920 + }, + { + "epoch": 4.159250142018557, + "grad_norm": 0.5349393486976624, + "learning_rate": 3.362999431925772e-06, + "loss": 0.0929, + "step": 43930 + }, + { + "epoch": 4.160196932399167, + "grad_norm": 0.5757917165756226, + "learning_rate": 3.359212270403333e-06, + "loss": 0.0875, + "step": 43940 + }, + { + "epoch": 4.1611437227797765, + "grad_norm": 0.34581947326660156, + "learning_rate": 3.355425108880894e-06, + "loss": 0.0951, + "step": 43950 + }, + { + "epoch": 4.162090513160386, + "grad_norm": 0.4834064245223999, + "learning_rate": 3.3516379473584554e-06, + "loss": 0.0894, + "step": 43960 + }, + { + "epoch": 4.163037303540996, + "grad_norm": 0.5049462914466858, + "learning_rate": 3.347850785836016e-06, + "loss": 0.0941, + "step": 43970 + }, + { + "epoch": 4.163984093921606, + "grad_norm": 0.6113541722297668, + "learning_rate": 3.344063624313577e-06, + "loss": 0.093, + "step": 43980 + }, + { + "epoch": 4.1649308843022155, + "grad_norm": 0.6100361943244934, + "learning_rate": 3.340276462791138e-06, + "loss": 0.0983, + "step": 43990 + }, + { + "epoch": 4.165877674682825, + "grad_norm": 0.7216998934745789, + "learning_rate": 3.3364893012686994e-06, + "loss": 0.0979, + "step": 44000 + }, + { + "epoch": 4.166824465063435, + "grad_norm": 0.789558470249176, + "learning_rate": 3.3327021397462607e-06, + "loss": 0.1005, + "step": 44010 + }, + { + "epoch": 4.167771255444045, + "grad_norm": 0.4637594223022461, + "learning_rate": 3.3289149782238216e-06, + "loss": 0.0846, + "step": 44020 + }, + { + "epoch": 4.168718045824654, + "grad_norm": 0.49285727739334106, + "learning_rate": 3.325127816701383e-06, + "loss": 0.0794, + "step": 44030 + }, + { + "epoch": 4.169664836205264, + "grad_norm": 0.43726247549057007, + "learning_rate": 3.3213406551789434e-06, + "loss": 0.081, + "step": 44040 + }, + { + "epoch": 4.170611626585874, + "grad_norm": 0.5689728856086731, + "learning_rate": 3.3175534936565047e-06, + "loss": 0.093, + "step": 44050 + }, + { + "epoch": 4.171558416966484, + "grad_norm": 0.5433076620101929, + "learning_rate": 3.3137663321340656e-06, + "loss": 0.0962, + "step": 44060 + }, + { + "epoch": 4.172505207347093, + "grad_norm": 0.47096943855285645, + "learning_rate": 3.309979170611627e-06, + "loss": 0.0893, + "step": 44070 + }, + { + "epoch": 4.173451997727703, + "grad_norm": 0.50560462474823, + "learning_rate": 3.306192009089188e-06, + "loss": 0.0866, + "step": 44080 + }, + { + "epoch": 4.174398788108313, + "grad_norm": 0.6455307602882385, + "learning_rate": 3.302404847566749e-06, + "loss": 0.102, + "step": 44090 + }, + { + "epoch": 4.175345578488923, + "grad_norm": 0.5444678664207458, + "learning_rate": 3.2986176860443105e-06, + "loss": 0.0829, + "step": 44100 + }, + { + "epoch": 4.176292368869532, + "grad_norm": 0.5178965330123901, + "learning_rate": 3.294830524521871e-06, + "loss": 0.0884, + "step": 44110 + }, + { + "epoch": 4.177239159250142, + "grad_norm": 0.5928695797920227, + "learning_rate": 3.2910433629994323e-06, + "loss": 0.0952, + "step": 44120 + }, + { + "epoch": 4.178185949630752, + "grad_norm": 0.47902634739875793, + "learning_rate": 3.287256201476993e-06, + "loss": 0.1003, + "step": 44130 + }, + { + "epoch": 4.179132740011362, + "grad_norm": 0.4959571957588196, + "learning_rate": 3.2834690399545545e-06, + "loss": 0.091, + "step": 44140 + }, + { + "epoch": 4.180079530391971, + "grad_norm": 0.5497419834136963, + "learning_rate": 3.2796818784321154e-06, + "loss": 0.0969, + "step": 44150 + }, + { + "epoch": 4.181026320772581, + "grad_norm": 0.4828396141529083, + "learning_rate": 3.2758947169096767e-06, + "loss": 0.0869, + "step": 44160 + }, + { + "epoch": 4.181973111153191, + "grad_norm": 0.46545299887657166, + "learning_rate": 3.2721075553872376e-06, + "loss": 0.0841, + "step": 44170 + }, + { + "epoch": 4.1829199015338006, + "grad_norm": 0.6381782293319702, + "learning_rate": 3.2683203938647985e-06, + "loss": 0.0926, + "step": 44180 + }, + { + "epoch": 4.18386669191441, + "grad_norm": 0.5599507093429565, + "learning_rate": 3.2645332323423594e-06, + "loss": 0.0921, + "step": 44190 + }, + { + "epoch": 4.18481348229502, + "grad_norm": 0.5431086421012878, + "learning_rate": 3.2607460708199207e-06, + "loss": 0.095, + "step": 44200 + }, + { + "epoch": 4.18576027267563, + "grad_norm": 0.6155442595481873, + "learning_rate": 3.256958909297482e-06, + "loss": 0.101, + "step": 44210 + }, + { + "epoch": 4.1867070630562395, + "grad_norm": 0.489976704120636, + "learning_rate": 3.253171747775043e-06, + "loss": 0.0864, + "step": 44220 + }, + { + "epoch": 4.187653853436849, + "grad_norm": 0.6099218130111694, + "learning_rate": 3.2493845862526042e-06, + "loss": 0.0933, + "step": 44230 + }, + { + "epoch": 4.188600643817459, + "grad_norm": 0.5284793376922607, + "learning_rate": 3.245597424730165e-06, + "loss": 0.0897, + "step": 44240 + }, + { + "epoch": 4.189547434198069, + "grad_norm": 0.5145465731620789, + "learning_rate": 3.241810263207726e-06, + "loss": 0.0909, + "step": 44250 + }, + { + "epoch": 4.1904942245786785, + "grad_norm": 0.6255049109458923, + "learning_rate": 3.238023101685287e-06, + "loss": 0.0878, + "step": 44260 + }, + { + "epoch": 4.191441014959288, + "grad_norm": 0.5650139451026917, + "learning_rate": 3.2342359401628482e-06, + "loss": 0.09, + "step": 44270 + }, + { + "epoch": 4.192387805339898, + "grad_norm": 0.5437065362930298, + "learning_rate": 3.230448778640409e-06, + "loss": 0.0881, + "step": 44280 + }, + { + "epoch": 4.193334595720508, + "grad_norm": 0.5027974843978882, + "learning_rate": 3.2266616171179705e-06, + "loss": 0.0874, + "step": 44290 + }, + { + "epoch": 4.1942813861011174, + "grad_norm": 0.484749436378479, + "learning_rate": 3.2228744555955318e-06, + "loss": 0.0866, + "step": 44300 + }, + { + "epoch": 4.195228176481727, + "grad_norm": 0.46499377489089966, + "learning_rate": 3.2190872940730927e-06, + "loss": 0.0879, + "step": 44310 + }, + { + "epoch": 4.196174966862337, + "grad_norm": 0.6362327933311462, + "learning_rate": 3.215300132550653e-06, + "loss": 0.1016, + "step": 44320 + }, + { + "epoch": 4.197121757242947, + "grad_norm": 0.5938698053359985, + "learning_rate": 3.2115129710282145e-06, + "loss": 0.0904, + "step": 44330 + }, + { + "epoch": 4.198068547623556, + "grad_norm": 0.5677269101142883, + "learning_rate": 3.207725809505776e-06, + "loss": 0.096, + "step": 44340 + }, + { + "epoch": 4.199015338004166, + "grad_norm": 0.5697394609451294, + "learning_rate": 3.2039386479833367e-06, + "loss": 0.0902, + "step": 44350 + }, + { + "epoch": 4.199962128384776, + "grad_norm": 0.7053269743919373, + "learning_rate": 3.200151486460898e-06, + "loss": 0.0922, + "step": 44360 + }, + { + "epoch": 4.200908918765386, + "grad_norm": 0.5635038018226624, + "learning_rate": 3.196364324938459e-06, + "loss": 0.0905, + "step": 44370 + }, + { + "epoch": 4.201855709145995, + "grad_norm": 0.7025290131568909, + "learning_rate": 3.1925771634160202e-06, + "loss": 0.0978, + "step": 44380 + }, + { + "epoch": 4.202802499526605, + "grad_norm": 0.44185054302215576, + "learning_rate": 3.1887900018935807e-06, + "loss": 0.0942, + "step": 44390 + }, + { + "epoch": 4.203749289907215, + "grad_norm": 0.5605813264846802, + "learning_rate": 3.185002840371142e-06, + "loss": 0.0846, + "step": 44400 + }, + { + "epoch": 4.204696080287825, + "grad_norm": 0.541060745716095, + "learning_rate": 3.181215678848703e-06, + "loss": 0.0915, + "step": 44410 + }, + { + "epoch": 4.205642870668434, + "grad_norm": 0.6130930185317993, + "learning_rate": 3.1774285173262642e-06, + "loss": 0.088, + "step": 44420 + }, + { + "epoch": 4.206589661049044, + "grad_norm": 0.4869426190853119, + "learning_rate": 3.1736413558038256e-06, + "loss": 0.086, + "step": 44430 + }, + { + "epoch": 4.207536451429654, + "grad_norm": 0.5549150109291077, + "learning_rate": 3.1698541942813865e-06, + "loss": 0.1032, + "step": 44440 + }, + { + "epoch": 4.208483241810264, + "grad_norm": 0.4869174063205719, + "learning_rate": 3.1660670327589478e-06, + "loss": 0.0957, + "step": 44450 + }, + { + "epoch": 4.209430032190873, + "grad_norm": 0.43977847695350647, + "learning_rate": 3.1622798712365082e-06, + "loss": 0.0936, + "step": 44460 + }, + { + "epoch": 4.210376822571483, + "grad_norm": 0.502215564250946, + "learning_rate": 3.1584927097140696e-06, + "loss": 0.0909, + "step": 44470 + }, + { + "epoch": 4.211323612952093, + "grad_norm": 0.53812175989151, + "learning_rate": 3.1547055481916305e-06, + "loss": 0.0854, + "step": 44480 + }, + { + "epoch": 4.2122704033327025, + "grad_norm": 0.4930614233016968, + "learning_rate": 3.1509183866691918e-06, + "loss": 0.0879, + "step": 44490 + }, + { + "epoch": 4.213217193713312, + "grad_norm": 0.4674643576145172, + "learning_rate": 3.1471312251467527e-06, + "loss": 0.0847, + "step": 44500 + }, + { + "epoch": 4.214163984093922, + "grad_norm": 0.5616251826286316, + "learning_rate": 3.143344063624314e-06, + "loss": 0.0863, + "step": 44510 + }, + { + "epoch": 4.215110774474532, + "grad_norm": 0.6773747801780701, + "learning_rate": 3.1395569021018753e-06, + "loss": 0.0955, + "step": 44520 + }, + { + "epoch": 4.2160575648551415, + "grad_norm": 0.4281054735183716, + "learning_rate": 3.135769740579436e-06, + "loss": 0.0897, + "step": 44530 + }, + { + "epoch": 4.217004355235751, + "grad_norm": 0.48651763796806335, + "learning_rate": 3.1319825790569967e-06, + "loss": 0.0889, + "step": 44540 + }, + { + "epoch": 4.217951145616361, + "grad_norm": 0.4472315013408661, + "learning_rate": 3.128195417534558e-06, + "loss": 0.0892, + "step": 44550 + }, + { + "epoch": 4.218897935996971, + "grad_norm": 0.732990026473999, + "learning_rate": 3.1244082560121193e-06, + "loss": 0.0944, + "step": 44560 + }, + { + "epoch": 4.2198447263775805, + "grad_norm": 0.6399824023246765, + "learning_rate": 3.1206210944896802e-06, + "loss": 0.0925, + "step": 44570 + }, + { + "epoch": 4.220791516758189, + "grad_norm": 0.5704659819602966, + "learning_rate": 3.1168339329672415e-06, + "loss": 0.0967, + "step": 44580 + }, + { + "epoch": 4.221738307138799, + "grad_norm": 0.4731642007827759, + "learning_rate": 3.1130467714448024e-06, + "loss": 0.0763, + "step": 44590 + }, + { + "epoch": 4.222685097519409, + "grad_norm": 0.4535263180732727, + "learning_rate": 3.1092596099223633e-06, + "loss": 0.0871, + "step": 44600 + }, + { + "epoch": 4.2236318879000185, + "grad_norm": 0.5828526020050049, + "learning_rate": 3.1054724483999242e-06, + "loss": 0.0829, + "step": 44610 + }, + { + "epoch": 4.224578678280628, + "grad_norm": 0.6800204515457153, + "learning_rate": 3.1016852868774856e-06, + "loss": 0.0786, + "step": 44620 + }, + { + "epoch": 4.225525468661238, + "grad_norm": 0.49945423007011414, + "learning_rate": 3.0978981253550465e-06, + "loss": 0.09, + "step": 44630 + }, + { + "epoch": 4.226472259041848, + "grad_norm": 0.5540609955787659, + "learning_rate": 3.0941109638326078e-06, + "loss": 0.0906, + "step": 44640 + }, + { + "epoch": 4.2274190494224575, + "grad_norm": 0.47390490770339966, + "learning_rate": 3.090323802310169e-06, + "loss": 0.0908, + "step": 44650 + }, + { + "epoch": 4.228365839803067, + "grad_norm": 0.5915796756744385, + "learning_rate": 3.08653664078773e-06, + "loss": 0.0917, + "step": 44660 + }, + { + "epoch": 4.229312630183677, + "grad_norm": 0.5430005192756653, + "learning_rate": 3.0827494792652905e-06, + "loss": 0.0953, + "step": 44670 + }, + { + "epoch": 4.230259420564287, + "grad_norm": 0.6260116696357727, + "learning_rate": 3.0789623177428518e-06, + "loss": 0.0943, + "step": 44680 + }, + { + "epoch": 4.2312062109448965, + "grad_norm": 0.6242892146110535, + "learning_rate": 3.075175156220413e-06, + "loss": 0.0951, + "step": 44690 + }, + { + "epoch": 4.232153001325506, + "grad_norm": 0.5777116417884827, + "learning_rate": 3.071387994697974e-06, + "loss": 0.0939, + "step": 44700 + }, + { + "epoch": 4.233099791706116, + "grad_norm": 0.638270914554596, + "learning_rate": 3.0676008331755353e-06, + "loss": 0.0942, + "step": 44710 + }, + { + "epoch": 4.234046582086726, + "grad_norm": 0.5501429438591003, + "learning_rate": 3.0638136716530962e-06, + "loss": 0.1015, + "step": 44720 + }, + { + "epoch": 4.234993372467335, + "grad_norm": 0.5100410580635071, + "learning_rate": 3.0600265101306575e-06, + "loss": 0.0945, + "step": 44730 + }, + { + "epoch": 4.235940162847945, + "grad_norm": 0.4826267659664154, + "learning_rate": 3.056239348608218e-06, + "loss": 0.0921, + "step": 44740 + }, + { + "epoch": 4.236886953228555, + "grad_norm": 0.6098191738128662, + "learning_rate": 3.0524521870857793e-06, + "loss": 0.0859, + "step": 44750 + }, + { + "epoch": 4.237833743609165, + "grad_norm": 0.5259220600128174, + "learning_rate": 3.0486650255633402e-06, + "loss": 0.0962, + "step": 44760 + }, + { + "epoch": 4.238780533989774, + "grad_norm": 0.6392509937286377, + "learning_rate": 3.0448778640409015e-06, + "loss": 0.1002, + "step": 44770 + }, + { + "epoch": 4.239727324370384, + "grad_norm": 0.673291802406311, + "learning_rate": 3.041090702518463e-06, + "loss": 0.0896, + "step": 44780 + }, + { + "epoch": 4.240674114750994, + "grad_norm": 0.5085720419883728, + "learning_rate": 3.0373035409960238e-06, + "loss": 0.0902, + "step": 44790 + }, + { + "epoch": 4.241620905131604, + "grad_norm": 0.5305391550064087, + "learning_rate": 3.033516379473585e-06, + "loss": 0.0941, + "step": 44800 + }, + { + "epoch": 4.242567695512213, + "grad_norm": 0.4959773123264313, + "learning_rate": 3.0297292179511456e-06, + "loss": 0.0939, + "step": 44810 + }, + { + "epoch": 4.243514485892823, + "grad_norm": 0.6148700714111328, + "learning_rate": 3.025942056428707e-06, + "loss": 0.1018, + "step": 44820 + }, + { + "epoch": 4.244461276273433, + "grad_norm": 0.5363166928291321, + "learning_rate": 3.0221548949062678e-06, + "loss": 0.0916, + "step": 44830 + }, + { + "epoch": 4.245408066654043, + "grad_norm": 0.5104047656059265, + "learning_rate": 3.018367733383829e-06, + "loss": 0.0917, + "step": 44840 + }, + { + "epoch": 4.246354857034652, + "grad_norm": 0.5864166021347046, + "learning_rate": 3.01458057186139e-06, + "loss": 0.0875, + "step": 44850 + }, + { + "epoch": 4.247301647415262, + "grad_norm": 0.6654096245765686, + "learning_rate": 3.0107934103389513e-06, + "loss": 0.0902, + "step": 44860 + }, + { + "epoch": 4.248248437795872, + "grad_norm": 0.49257123470306396, + "learning_rate": 3.0070062488165126e-06, + "loss": 0.0956, + "step": 44870 + }, + { + "epoch": 4.2491952281764815, + "grad_norm": 0.4794057309627533, + "learning_rate": 3.003219087294073e-06, + "loss": 0.0885, + "step": 44880 + }, + { + "epoch": 4.250142018557091, + "grad_norm": 0.5510391592979431, + "learning_rate": 2.9994319257716344e-06, + "loss": 0.0933, + "step": 44890 + }, + { + "epoch": 4.251088808937701, + "grad_norm": 0.5095749497413635, + "learning_rate": 2.9956447642491953e-06, + "loss": 0.1042, + "step": 44900 + }, + { + "epoch": 4.252035599318311, + "grad_norm": 0.5804000496864319, + "learning_rate": 2.9918576027267566e-06, + "loss": 0.0976, + "step": 44910 + }, + { + "epoch": 4.2529823896989205, + "grad_norm": 0.47743797302246094, + "learning_rate": 2.9880704412043175e-06, + "loss": 0.0949, + "step": 44920 + }, + { + "epoch": 4.25392918007953, + "grad_norm": 0.5327832102775574, + "learning_rate": 2.984283279681879e-06, + "loss": 0.0912, + "step": 44930 + }, + { + "epoch": 4.25487597046014, + "grad_norm": 0.6161805391311646, + "learning_rate": 2.9804961181594398e-06, + "loss": 0.0891, + "step": 44940 + }, + { + "epoch": 4.25582276084075, + "grad_norm": 0.6115200519561768, + "learning_rate": 2.9767089566370006e-06, + "loss": 0.1009, + "step": 44950 + }, + { + "epoch": 4.2567695512213595, + "grad_norm": 0.5854682922363281, + "learning_rate": 2.9729217951145615e-06, + "loss": 0.0943, + "step": 44960 + }, + { + "epoch": 4.257716341601969, + "grad_norm": 0.4380771219730377, + "learning_rate": 2.969134633592123e-06, + "loss": 0.0829, + "step": 44970 + }, + { + "epoch": 4.258663131982579, + "grad_norm": 0.5447367429733276, + "learning_rate": 2.965347472069684e-06, + "loss": 0.0949, + "step": 44980 + }, + { + "epoch": 4.259609922363189, + "grad_norm": 0.4817304313182831, + "learning_rate": 2.961560310547245e-06, + "loss": 0.0893, + "step": 44990 + }, + { + "epoch": 4.260556712743798, + "grad_norm": 0.37457752227783203, + "learning_rate": 2.9577731490248064e-06, + "loss": 0.0844, + "step": 45000 + }, + { + "epoch": 4.261503503124408, + "grad_norm": 0.5563647747039795, + "learning_rate": 2.9539859875023673e-06, + "loss": 0.094, + "step": 45010 + }, + { + "epoch": 4.262450293505018, + "grad_norm": 0.5248158574104309, + "learning_rate": 2.950198825979928e-06, + "loss": 0.0954, + "step": 45020 + }, + { + "epoch": 4.263397083885628, + "grad_norm": 0.5105319023132324, + "learning_rate": 2.946411664457489e-06, + "loss": 0.0929, + "step": 45030 + }, + { + "epoch": 4.264343874266237, + "grad_norm": 0.4475686252117157, + "learning_rate": 2.9426245029350504e-06, + "loss": 0.0855, + "step": 45040 + }, + { + "epoch": 4.265290664646847, + "grad_norm": 0.46911075711250305, + "learning_rate": 2.9388373414126113e-06, + "loss": 0.0874, + "step": 45050 + }, + { + "epoch": 4.266237455027457, + "grad_norm": 0.579980731010437, + "learning_rate": 2.9350501798901726e-06, + "loss": 0.0953, + "step": 45060 + }, + { + "epoch": 4.267184245408067, + "grad_norm": 0.5501284599304199, + "learning_rate": 2.931263018367734e-06, + "loss": 0.0919, + "step": 45070 + }, + { + "epoch": 4.268131035788676, + "grad_norm": 0.44050830602645874, + "learning_rate": 2.927475856845295e-06, + "loss": 0.0929, + "step": 45080 + }, + { + "epoch": 4.269077826169286, + "grad_norm": 0.6507890820503235, + "learning_rate": 2.9236886953228553e-06, + "loss": 0.0945, + "step": 45090 + }, + { + "epoch": 4.270024616549896, + "grad_norm": 0.5835983753204346, + "learning_rate": 2.9199015338004166e-06, + "loss": 0.0931, + "step": 45100 + }, + { + "epoch": 4.270971406930506, + "grad_norm": 0.6553801894187927, + "learning_rate": 2.916114372277978e-06, + "loss": 0.0948, + "step": 45110 + }, + { + "epoch": 4.271918197311115, + "grad_norm": 0.4947971701622009, + "learning_rate": 2.912327210755539e-06, + "loss": 0.0886, + "step": 45120 + }, + { + "epoch": 4.272864987691725, + "grad_norm": 0.6035920977592468, + "learning_rate": 2.9085400492331e-06, + "loss": 0.0933, + "step": 45130 + }, + { + "epoch": 4.273811778072335, + "grad_norm": 0.46540600061416626, + "learning_rate": 2.904752887710661e-06, + "loss": 0.0925, + "step": 45140 + }, + { + "epoch": 4.2747585684529446, + "grad_norm": 0.5728728771209717, + "learning_rate": 2.9009657261882224e-06, + "loss": 0.0954, + "step": 45150 + }, + { + "epoch": 4.275705358833554, + "grad_norm": 0.5419313311576843, + "learning_rate": 2.897178564665783e-06, + "loss": 0.0843, + "step": 45160 + }, + { + "epoch": 4.276652149214164, + "grad_norm": 0.41763198375701904, + "learning_rate": 2.893391403143344e-06, + "loss": 0.0896, + "step": 45170 + }, + { + "epoch": 4.277598939594774, + "grad_norm": 0.4943348467350006, + "learning_rate": 2.889604241620905e-06, + "loss": 0.092, + "step": 45180 + }, + { + "epoch": 4.2785457299753835, + "grad_norm": 0.5788520574569702, + "learning_rate": 2.8858170800984664e-06, + "loss": 0.0891, + "step": 45190 + }, + { + "epoch": 4.279492520355993, + "grad_norm": 0.4970496594905853, + "learning_rate": 2.8820299185760277e-06, + "loss": 0.0962, + "step": 45200 + }, + { + "epoch": 4.280439310736603, + "grad_norm": 0.6108376979827881, + "learning_rate": 2.8782427570535886e-06, + "loss": 0.0921, + "step": 45210 + }, + { + "epoch": 4.281386101117213, + "grad_norm": 0.55495685338974, + "learning_rate": 2.87445559553115e-06, + "loss": 0.0909, + "step": 45220 + }, + { + "epoch": 4.2823328914978225, + "grad_norm": 0.5994839668273926, + "learning_rate": 2.870668434008711e-06, + "loss": 0.0983, + "step": 45230 + }, + { + "epoch": 4.283279681878432, + "grad_norm": 0.5374669432640076, + "learning_rate": 2.8668812724862717e-06, + "loss": 0.0994, + "step": 45240 + }, + { + "epoch": 4.284226472259042, + "grad_norm": 0.5874958038330078, + "learning_rate": 2.8630941109638326e-06, + "loss": 0.0955, + "step": 45250 + }, + { + "epoch": 4.285173262639652, + "grad_norm": 0.6345770359039307, + "learning_rate": 2.859306949441394e-06, + "loss": 0.0882, + "step": 45260 + }, + { + "epoch": 4.2861200530202614, + "grad_norm": 0.5831238627433777, + "learning_rate": 2.855519787918955e-06, + "loss": 0.093, + "step": 45270 + }, + { + "epoch": 4.287066843400871, + "grad_norm": 0.6263869404792786, + "learning_rate": 2.851732626396516e-06, + "loss": 0.0918, + "step": 45280 + }, + { + "epoch": 4.288013633781481, + "grad_norm": 0.5837584733963013, + "learning_rate": 2.8479454648740775e-06, + "loss": 0.0931, + "step": 45290 + }, + { + "epoch": 4.288960424162091, + "grad_norm": 0.4665307104587555, + "learning_rate": 2.8441583033516384e-06, + "loss": 0.0907, + "step": 45300 + }, + { + "epoch": 4.2899072145427, + "grad_norm": 0.5143084526062012, + "learning_rate": 2.840371141829199e-06, + "loss": 0.1047, + "step": 45310 + }, + { + "epoch": 4.29085400492331, + "grad_norm": 0.6588802933692932, + "learning_rate": 2.83658398030676e-06, + "loss": 0.0964, + "step": 45320 + }, + { + "epoch": 4.29180079530392, + "grad_norm": 0.6557096242904663, + "learning_rate": 2.8327968187843215e-06, + "loss": 0.0914, + "step": 45330 + }, + { + "epoch": 4.29274758568453, + "grad_norm": 0.5459699630737305, + "learning_rate": 2.8290096572618824e-06, + "loss": 0.0905, + "step": 45340 + }, + { + "epoch": 4.293694376065139, + "grad_norm": 0.5911346673965454, + "learning_rate": 2.8252224957394437e-06, + "loss": 0.088, + "step": 45350 + }, + { + "epoch": 4.294641166445749, + "grad_norm": 0.5709453821182251, + "learning_rate": 2.8214353342170046e-06, + "loss": 0.103, + "step": 45360 + }, + { + "epoch": 4.295587956826359, + "grad_norm": 0.7081350684165955, + "learning_rate": 2.817648172694566e-06, + "loss": 0.0754, + "step": 45370 + }, + { + "epoch": 4.296534747206969, + "grad_norm": 0.529180645942688, + "learning_rate": 2.8138610111721264e-06, + "loss": 0.0876, + "step": 45380 + }, + { + "epoch": 4.297481537587578, + "grad_norm": 0.5527861714363098, + "learning_rate": 2.8100738496496877e-06, + "loss": 0.085, + "step": 45390 + }, + { + "epoch": 4.298428327968188, + "grad_norm": 0.5242286920547485, + "learning_rate": 2.8062866881272486e-06, + "loss": 0.0858, + "step": 45400 + }, + { + "epoch": 4.299375118348798, + "grad_norm": 0.5668191313743591, + "learning_rate": 2.80249952660481e-06, + "loss": 0.0865, + "step": 45410 + }, + { + "epoch": 4.300321908729408, + "grad_norm": 0.3957083821296692, + "learning_rate": 2.7987123650823713e-06, + "loss": 0.0847, + "step": 45420 + }, + { + "epoch": 4.301268699110017, + "grad_norm": 0.458646297454834, + "learning_rate": 2.794925203559932e-06, + "loss": 0.0982, + "step": 45430 + }, + { + "epoch": 4.302215489490627, + "grad_norm": 0.45908018946647644, + "learning_rate": 2.7911380420374935e-06, + "loss": 0.0922, + "step": 45440 + }, + { + "epoch": 4.303162279871237, + "grad_norm": 0.6364571452140808, + "learning_rate": 2.787350880515054e-06, + "loss": 0.0889, + "step": 45450 + }, + { + "epoch": 4.3041090702518465, + "grad_norm": 0.5598451495170593, + "learning_rate": 2.7835637189926153e-06, + "loss": 0.0781, + "step": 45460 + }, + { + "epoch": 4.305055860632456, + "grad_norm": 0.44022366404533386, + "learning_rate": 2.779776557470176e-06, + "loss": 0.0911, + "step": 45470 + }, + { + "epoch": 4.306002651013066, + "grad_norm": 0.6158979535102844, + "learning_rate": 2.7759893959477375e-06, + "loss": 0.0982, + "step": 45480 + }, + { + "epoch": 4.306949441393676, + "grad_norm": 0.6025349497795105, + "learning_rate": 2.7722022344252984e-06, + "loss": 0.1083, + "step": 45490 + }, + { + "epoch": 4.3078962317742855, + "grad_norm": 0.5192539691925049, + "learning_rate": 2.7684150729028597e-06, + "loss": 0.0909, + "step": 45500 + }, + { + "epoch": 4.308843022154895, + "grad_norm": 0.592406153678894, + "learning_rate": 2.764627911380421e-06, + "loss": 0.0974, + "step": 45510 + }, + { + "epoch": 4.309789812535505, + "grad_norm": 0.3901938199996948, + "learning_rate": 2.7608407498579815e-06, + "loss": 0.0904, + "step": 45520 + }, + { + "epoch": 4.310736602916115, + "grad_norm": 0.5858525037765503, + "learning_rate": 2.7570535883355424e-06, + "loss": 0.0891, + "step": 45530 + }, + { + "epoch": 4.3116833932967245, + "grad_norm": 0.7031994462013245, + "learning_rate": 2.7532664268131037e-06, + "loss": 0.1008, + "step": 45540 + }, + { + "epoch": 4.312630183677334, + "grad_norm": 0.4433521628379822, + "learning_rate": 2.749479265290665e-06, + "loss": 0.0892, + "step": 45550 + }, + { + "epoch": 4.313576974057944, + "grad_norm": 0.4655055105686188, + "learning_rate": 2.745692103768226e-06, + "loss": 0.0889, + "step": 45560 + }, + { + "epoch": 4.314523764438554, + "grad_norm": 0.5043991804122925, + "learning_rate": 2.7419049422457872e-06, + "loss": 0.0945, + "step": 45570 + }, + { + "epoch": 4.315470554819163, + "grad_norm": 0.4252864420413971, + "learning_rate": 2.738117780723348e-06, + "loss": 0.0956, + "step": 45580 + }, + { + "epoch": 4.316417345199773, + "grad_norm": 0.6047373414039612, + "learning_rate": 2.734330619200909e-06, + "loss": 0.0986, + "step": 45590 + }, + { + "epoch": 4.317364135580383, + "grad_norm": 0.4813744127750397, + "learning_rate": 2.73054345767847e-06, + "loss": 0.0969, + "step": 45600 + }, + { + "epoch": 4.318310925960993, + "grad_norm": 0.4027179777622223, + "learning_rate": 2.7267562961560313e-06, + "loss": 0.09, + "step": 45610 + }, + { + "epoch": 4.319257716341602, + "grad_norm": 0.682832658290863, + "learning_rate": 2.722969134633592e-06, + "loss": 0.0912, + "step": 45620 + }, + { + "epoch": 4.320204506722211, + "grad_norm": 0.6026838421821594, + "learning_rate": 2.7191819731111535e-06, + "loss": 0.0972, + "step": 45630 + }, + { + "epoch": 4.321151297102821, + "grad_norm": 0.5975409746170044, + "learning_rate": 2.715394811588715e-06, + "loss": 0.085, + "step": 45640 + }, + { + "epoch": 4.322098087483431, + "grad_norm": 0.5089371800422668, + "learning_rate": 2.7116076500662757e-06, + "loss": 0.0953, + "step": 45650 + }, + { + "epoch": 4.3230448778640405, + "grad_norm": 0.4977569580078125, + "learning_rate": 2.7078204885438366e-06, + "loss": 0.1012, + "step": 45660 + }, + { + "epoch": 4.32399166824465, + "grad_norm": 0.7111302614212036, + "learning_rate": 2.7040333270213975e-06, + "loss": 0.0931, + "step": 45670 + }, + { + "epoch": 4.32493845862526, + "grad_norm": 0.4907667934894562, + "learning_rate": 2.700246165498959e-06, + "loss": 0.0789, + "step": 45680 + }, + { + "epoch": 4.32588524900587, + "grad_norm": 0.5581902265548706, + "learning_rate": 2.6964590039765197e-06, + "loss": 0.0841, + "step": 45690 + }, + { + "epoch": 4.326832039386479, + "grad_norm": 0.5252041220664978, + "learning_rate": 2.692671842454081e-06, + "loss": 0.0931, + "step": 45700 + }, + { + "epoch": 4.327778829767089, + "grad_norm": 0.6080724000930786, + "learning_rate": 2.688884680931642e-06, + "loss": 0.0977, + "step": 45710 + }, + { + "epoch": 4.328725620147699, + "grad_norm": 0.4370320737361908, + "learning_rate": 2.6850975194092032e-06, + "loss": 0.09, + "step": 45720 + }, + { + "epoch": 4.329672410528309, + "grad_norm": 0.4596753418445587, + "learning_rate": 2.6813103578867637e-06, + "loss": 0.0828, + "step": 45730 + }, + { + "epoch": 4.330619200908918, + "grad_norm": 0.5326861143112183, + "learning_rate": 2.677523196364325e-06, + "loss": 0.0834, + "step": 45740 + }, + { + "epoch": 4.331565991289528, + "grad_norm": 0.5321633815765381, + "learning_rate": 2.6737360348418863e-06, + "loss": 0.0885, + "step": 45750 + }, + { + "epoch": 4.332512781670138, + "grad_norm": 0.5375781655311584, + "learning_rate": 2.6699488733194472e-06, + "loss": 0.0909, + "step": 45760 + }, + { + "epoch": 4.333459572050748, + "grad_norm": 0.5236536264419556, + "learning_rate": 2.6661617117970086e-06, + "loss": 0.0964, + "step": 45770 + }, + { + "epoch": 4.334406362431357, + "grad_norm": 0.4828406274318695, + "learning_rate": 2.6623745502745695e-06, + "loss": 0.0861, + "step": 45780 + }, + { + "epoch": 4.335353152811967, + "grad_norm": 0.4819585382938385, + "learning_rate": 2.6585873887521308e-06, + "loss": 0.087, + "step": 45790 + }, + { + "epoch": 4.336299943192577, + "grad_norm": 0.489366739988327, + "learning_rate": 2.6548002272296913e-06, + "loss": 0.0862, + "step": 45800 + }, + { + "epoch": 4.337246733573187, + "grad_norm": 0.4460524320602417, + "learning_rate": 2.6510130657072526e-06, + "loss": 0.0917, + "step": 45810 + }, + { + "epoch": 4.338193523953796, + "grad_norm": 0.562499463558197, + "learning_rate": 2.6472259041848135e-06, + "loss": 0.0886, + "step": 45820 + }, + { + "epoch": 4.339140314334406, + "grad_norm": 0.4999210834503174, + "learning_rate": 2.643438742662375e-06, + "loss": 0.0786, + "step": 45830 + }, + { + "epoch": 4.340087104715016, + "grad_norm": 0.5683452486991882, + "learning_rate": 2.639651581139936e-06, + "loss": 0.1064, + "step": 45840 + }, + { + "epoch": 4.3410338950956255, + "grad_norm": 0.6871407628059387, + "learning_rate": 2.635864419617497e-06, + "loss": 0.0964, + "step": 45850 + }, + { + "epoch": 4.341980685476235, + "grad_norm": 0.5813667178153992, + "learning_rate": 2.6320772580950583e-06, + "loss": 0.0922, + "step": 45860 + }, + { + "epoch": 4.342927475856845, + "grad_norm": 0.6370223760604858, + "learning_rate": 2.628290096572619e-06, + "loss": 0.0942, + "step": 45870 + }, + { + "epoch": 4.343874266237455, + "grad_norm": 0.4169613718986511, + "learning_rate": 2.62450293505018e-06, + "loss": 0.0964, + "step": 45880 + }, + { + "epoch": 4.3448210566180645, + "grad_norm": 0.5772116780281067, + "learning_rate": 2.620715773527741e-06, + "loss": 0.0938, + "step": 45890 + }, + { + "epoch": 4.345767846998674, + "grad_norm": 0.5945174098014832, + "learning_rate": 2.6169286120053023e-06, + "loss": 0.0913, + "step": 45900 + }, + { + "epoch": 4.346714637379284, + "grad_norm": 0.6152166724205017, + "learning_rate": 2.6131414504828632e-06, + "loss": 0.097, + "step": 45910 + }, + { + "epoch": 4.347661427759894, + "grad_norm": 0.6659379601478577, + "learning_rate": 2.6093542889604246e-06, + "loss": 0.0872, + "step": 45920 + }, + { + "epoch": 4.3486082181405035, + "grad_norm": 0.5365942120552063, + "learning_rate": 2.605567127437986e-06, + "loss": 0.0881, + "step": 45930 + }, + { + "epoch": 4.349555008521113, + "grad_norm": 0.5780148506164551, + "learning_rate": 2.6017799659155463e-06, + "loss": 0.0986, + "step": 45940 + }, + { + "epoch": 4.350501798901723, + "grad_norm": 0.530954122543335, + "learning_rate": 2.5979928043931072e-06, + "loss": 0.0837, + "step": 45950 + }, + { + "epoch": 4.351448589282333, + "grad_norm": 0.49948498606681824, + "learning_rate": 2.5942056428706686e-06, + "loss": 0.0903, + "step": 45960 + }, + { + "epoch": 4.352395379662942, + "grad_norm": 0.5024592876434326, + "learning_rate": 2.59041848134823e-06, + "loss": 0.0984, + "step": 45970 + }, + { + "epoch": 4.353342170043552, + "grad_norm": 0.6981245279312134, + "learning_rate": 2.5866313198257908e-06, + "loss": 0.0929, + "step": 45980 + }, + { + "epoch": 4.354288960424162, + "grad_norm": 0.5182442665100098, + "learning_rate": 2.582844158303352e-06, + "loss": 0.0947, + "step": 45990 + }, + { + "epoch": 4.355235750804772, + "grad_norm": 0.4673933684825897, + "learning_rate": 2.579056996780913e-06, + "loss": 0.0954, + "step": 46000 + }, + { + "epoch": 4.356182541185381, + "grad_norm": 0.5071808695793152, + "learning_rate": 2.575269835258474e-06, + "loss": 0.0925, + "step": 46010 + }, + { + "epoch": 4.357129331565991, + "grad_norm": 0.7042328715324402, + "learning_rate": 2.571482673736035e-06, + "loss": 0.0966, + "step": 46020 + }, + { + "epoch": 4.358076121946601, + "grad_norm": 0.4986877739429474, + "learning_rate": 2.567695512213596e-06, + "loss": 0.0865, + "step": 46030 + }, + { + "epoch": 4.359022912327211, + "grad_norm": 0.6330245137214661, + "learning_rate": 2.563908350691157e-06, + "loss": 0.0896, + "step": 46040 + }, + { + "epoch": 4.35996970270782, + "grad_norm": 0.4807395935058594, + "learning_rate": 2.5601211891687183e-06, + "loss": 0.0935, + "step": 46050 + }, + { + "epoch": 4.36091649308843, + "grad_norm": 0.5322906374931335, + "learning_rate": 2.5563340276462796e-06, + "loss": 0.0862, + "step": 46060 + }, + { + "epoch": 4.36186328346904, + "grad_norm": 0.6274631023406982, + "learning_rate": 2.5525468661238405e-06, + "loss": 0.0954, + "step": 46070 + }, + { + "epoch": 4.36281007384965, + "grad_norm": 0.5602927207946777, + "learning_rate": 2.548759704601401e-06, + "loss": 0.0784, + "step": 46080 + }, + { + "epoch": 4.363756864230259, + "grad_norm": 0.517584502696991, + "learning_rate": 2.5449725430789623e-06, + "loss": 0.083, + "step": 46090 + }, + { + "epoch": 4.364703654610869, + "grad_norm": 0.6797549724578857, + "learning_rate": 2.5411853815565237e-06, + "loss": 0.1039, + "step": 46100 + }, + { + "epoch": 4.365650444991479, + "grad_norm": 0.5814264416694641, + "learning_rate": 2.5373982200340846e-06, + "loss": 0.0909, + "step": 46110 + }, + { + "epoch": 4.3665972353720885, + "grad_norm": 0.6943686604499817, + "learning_rate": 2.533611058511646e-06, + "loss": 0.0994, + "step": 46120 + }, + { + "epoch": 4.367544025752698, + "grad_norm": 0.554736316204071, + "learning_rate": 2.5298238969892068e-06, + "loss": 0.0923, + "step": 46130 + }, + { + "epoch": 4.368490816133308, + "grad_norm": 0.5204230546951294, + "learning_rate": 2.526036735466768e-06, + "loss": 0.0977, + "step": 46140 + }, + { + "epoch": 4.369437606513918, + "grad_norm": 0.6369757056236267, + "learning_rate": 2.5222495739443286e-06, + "loss": 0.0994, + "step": 46150 + }, + { + "epoch": 4.3703843968945275, + "grad_norm": 0.6230180263519287, + "learning_rate": 2.51846241242189e-06, + "loss": 0.0898, + "step": 46160 + }, + { + "epoch": 4.371331187275137, + "grad_norm": 0.5078501105308533, + "learning_rate": 2.5146752508994508e-06, + "loss": 0.0942, + "step": 46170 + }, + { + "epoch": 4.372277977655747, + "grad_norm": 0.5081248879432678, + "learning_rate": 2.510888089377012e-06, + "loss": 0.0845, + "step": 46180 + }, + { + "epoch": 4.373224768036357, + "grad_norm": 0.5332440733909607, + "learning_rate": 2.5071009278545734e-06, + "loss": 0.0962, + "step": 46190 + }, + { + "epoch": 4.3741715584169665, + "grad_norm": 0.44974285364151, + "learning_rate": 2.5033137663321343e-06, + "loss": 0.0843, + "step": 46200 + }, + { + "epoch": 4.375118348797576, + "grad_norm": 0.500975489616394, + "learning_rate": 2.499526604809695e-06, + "loss": 0.0938, + "step": 46210 + }, + { + "epoch": 4.376065139178186, + "grad_norm": 0.5823422074317932, + "learning_rate": 2.4957394432872565e-06, + "loss": 0.085, + "step": 46220 + }, + { + "epoch": 4.377011929558796, + "grad_norm": 0.4945879280567169, + "learning_rate": 2.4919522817648174e-06, + "loss": 0.0902, + "step": 46230 + }, + { + "epoch": 4.377958719939405, + "grad_norm": 0.4944179356098175, + "learning_rate": 2.4881651202423783e-06, + "loss": 0.1018, + "step": 46240 + }, + { + "epoch": 4.378905510320015, + "grad_norm": 0.5736690163612366, + "learning_rate": 2.4843779587199396e-06, + "loss": 0.0926, + "step": 46250 + }, + { + "epoch": 4.379852300700625, + "grad_norm": 0.5215355753898621, + "learning_rate": 2.4805907971975005e-06, + "loss": 0.09, + "step": 46260 + }, + { + "epoch": 4.380799091081235, + "grad_norm": 0.5324762463569641, + "learning_rate": 2.476803635675062e-06, + "loss": 0.093, + "step": 46270 + }, + { + "epoch": 4.381745881461844, + "grad_norm": 0.4614587128162384, + "learning_rate": 2.4730164741526228e-06, + "loss": 0.0891, + "step": 46280 + }, + { + "epoch": 4.382692671842454, + "grad_norm": 0.4408273994922638, + "learning_rate": 2.469229312630184e-06, + "loss": 0.0968, + "step": 46290 + }, + { + "epoch": 4.383639462223064, + "grad_norm": 0.4188309907913208, + "learning_rate": 2.465442151107745e-06, + "loss": 0.0792, + "step": 46300 + }, + { + "epoch": 4.384586252603674, + "grad_norm": 0.5207238793373108, + "learning_rate": 2.461654989585306e-06, + "loss": 0.092, + "step": 46310 + }, + { + "epoch": 4.385533042984283, + "grad_norm": 0.5580665469169617, + "learning_rate": 2.457867828062867e-06, + "loss": 0.089, + "step": 46320 + }, + { + "epoch": 4.386479833364893, + "grad_norm": 0.6222299933433533, + "learning_rate": 2.454080666540428e-06, + "loss": 0.0925, + "step": 46330 + }, + { + "epoch": 4.387426623745503, + "grad_norm": 0.5015923976898193, + "learning_rate": 2.4502935050179894e-06, + "loss": 0.0992, + "step": 46340 + }, + { + "epoch": 4.388373414126113, + "grad_norm": 0.5996383428573608, + "learning_rate": 2.4465063434955503e-06, + "loss": 0.0803, + "step": 46350 + }, + { + "epoch": 4.389320204506722, + "grad_norm": 0.5574952960014343, + "learning_rate": 2.442719181973111e-06, + "loss": 0.0826, + "step": 46360 + }, + { + "epoch": 4.390266994887332, + "grad_norm": 0.6148197054862976, + "learning_rate": 2.4389320204506725e-06, + "loss": 0.0994, + "step": 46370 + }, + { + "epoch": 4.391213785267942, + "grad_norm": 0.5188667178153992, + "learning_rate": 2.4351448589282334e-06, + "loss": 0.0922, + "step": 46380 + }, + { + "epoch": 4.392160575648552, + "grad_norm": 0.5669800043106079, + "learning_rate": 2.4313576974057947e-06, + "loss": 0.0964, + "step": 46390 + }, + { + "epoch": 4.393107366029161, + "grad_norm": 0.5456835031509399, + "learning_rate": 2.4275705358833556e-06, + "loss": 0.0906, + "step": 46400 + }, + { + "epoch": 4.394054156409771, + "grad_norm": 0.4881961941719055, + "learning_rate": 2.423783374360917e-06, + "loss": 0.0896, + "step": 46410 + }, + { + "epoch": 4.395000946790381, + "grad_norm": 0.42108121514320374, + "learning_rate": 2.419996212838478e-06, + "loss": 0.1075, + "step": 46420 + }, + { + "epoch": 4.3959477371709905, + "grad_norm": 0.5552435517311096, + "learning_rate": 2.4162090513160387e-06, + "loss": 0.096, + "step": 46430 + }, + { + "epoch": 4.3968945275516, + "grad_norm": 0.6532780528068542, + "learning_rate": 2.4124218897936e-06, + "loss": 0.0881, + "step": 46440 + }, + { + "epoch": 4.39784131793221, + "grad_norm": 0.4636865258216858, + "learning_rate": 2.408634728271161e-06, + "loss": 0.0916, + "step": 46450 + }, + { + "epoch": 4.39878810831282, + "grad_norm": 0.4746432304382324, + "learning_rate": 2.404847566748722e-06, + "loss": 0.0911, + "step": 46460 + }, + { + "epoch": 4.3997348986934295, + "grad_norm": 0.6184666752815247, + "learning_rate": 2.401060405226283e-06, + "loss": 0.1002, + "step": 46470 + }, + { + "epoch": 4.400681689074039, + "grad_norm": 0.6605782508850098, + "learning_rate": 2.3972732437038445e-06, + "loss": 0.0955, + "step": 46480 + }, + { + "epoch": 4.401628479454649, + "grad_norm": 0.5042291283607483, + "learning_rate": 2.393486082181405e-06, + "loss": 0.0885, + "step": 46490 + }, + { + "epoch": 4.402575269835259, + "grad_norm": 0.5106028318405151, + "learning_rate": 2.3896989206589663e-06, + "loss": 0.0945, + "step": 46500 + }, + { + "epoch": 4.4035220602158684, + "grad_norm": 0.5014446973800659, + "learning_rate": 2.3859117591365276e-06, + "loss": 0.0889, + "step": 46510 + }, + { + "epoch": 4.404468850596478, + "grad_norm": 0.5890821814537048, + "learning_rate": 2.3821245976140885e-06, + "loss": 0.0865, + "step": 46520 + }, + { + "epoch": 4.405415640977088, + "grad_norm": 0.416118860244751, + "learning_rate": 2.3783374360916494e-06, + "loss": 0.0877, + "step": 46530 + }, + { + "epoch": 4.406362431357698, + "grad_norm": 0.7998496890068054, + "learning_rate": 2.3745502745692107e-06, + "loss": 0.0817, + "step": 46540 + }, + { + "epoch": 4.407309221738307, + "grad_norm": 0.48212817311286926, + "learning_rate": 2.3707631130467716e-06, + "loss": 0.0901, + "step": 46550 + }, + { + "epoch": 4.408256012118917, + "grad_norm": 0.44966939091682434, + "learning_rate": 2.3669759515243325e-06, + "loss": 0.0983, + "step": 46560 + }, + { + "epoch": 4.409202802499527, + "grad_norm": 0.4153594672679901, + "learning_rate": 2.363188790001894e-06, + "loss": 0.0952, + "step": 46570 + }, + { + "epoch": 4.410149592880137, + "grad_norm": 0.42512208223342896, + "learning_rate": 2.3594016284794547e-06, + "loss": 0.0979, + "step": 46580 + }, + { + "epoch": 4.411096383260746, + "grad_norm": 0.49382078647613525, + "learning_rate": 2.3556144669570156e-06, + "loss": 0.0817, + "step": 46590 + }, + { + "epoch": 4.412043173641356, + "grad_norm": 0.48640701174736023, + "learning_rate": 2.351827305434577e-06, + "loss": 0.0823, + "step": 46600 + }, + { + "epoch": 4.412989964021966, + "grad_norm": 0.48631972074508667, + "learning_rate": 2.3480401439121383e-06, + "loss": 0.09, + "step": 46610 + }, + { + "epoch": 4.413936754402576, + "grad_norm": 0.6041695475578308, + "learning_rate": 2.344252982389699e-06, + "loss": 0.0924, + "step": 46620 + }, + { + "epoch": 4.414883544783185, + "grad_norm": 0.41758155822753906, + "learning_rate": 2.34046582086726e-06, + "loss": 0.082, + "step": 46630 + }, + { + "epoch": 4.415830335163795, + "grad_norm": 0.47783786058425903, + "learning_rate": 2.3366786593448214e-06, + "loss": 0.0934, + "step": 46640 + }, + { + "epoch": 4.416777125544405, + "grad_norm": 0.5115928053855896, + "learning_rate": 2.3328914978223823e-06, + "loss": 0.0831, + "step": 46650 + }, + { + "epoch": 4.417723915925015, + "grad_norm": 0.5559064745903015, + "learning_rate": 2.329104336299943e-06, + "loss": 0.0944, + "step": 46660 + }, + { + "epoch": 4.418670706305624, + "grad_norm": 0.6243152618408203, + "learning_rate": 2.3253171747775045e-06, + "loss": 0.0967, + "step": 46670 + }, + { + "epoch": 4.419617496686234, + "grad_norm": 0.567179799079895, + "learning_rate": 2.3215300132550654e-06, + "loss": 0.0927, + "step": 46680 + }, + { + "epoch": 4.420564287066844, + "grad_norm": 0.6107397079467773, + "learning_rate": 2.3177428517326267e-06, + "loss": 0.0885, + "step": 46690 + }, + { + "epoch": 4.4215110774474535, + "grad_norm": 0.7441450953483582, + "learning_rate": 2.3139556902101876e-06, + "loss": 0.089, + "step": 46700 + }, + { + "epoch": 4.422457867828063, + "grad_norm": 0.49230247735977173, + "learning_rate": 2.3101685286877485e-06, + "loss": 0.0904, + "step": 46710 + }, + { + "epoch": 4.423404658208673, + "grad_norm": 0.5044174194335938, + "learning_rate": 2.30638136716531e-06, + "loss": 0.099, + "step": 46720 + }, + { + "epoch": 4.424351448589283, + "grad_norm": 0.5213074684143066, + "learning_rate": 2.3025942056428707e-06, + "loss": 0.1036, + "step": 46730 + }, + { + "epoch": 4.4252982389698925, + "grad_norm": 0.5848998427391052, + "learning_rate": 2.298807044120432e-06, + "loss": 0.0934, + "step": 46740 + }, + { + "epoch": 4.426245029350502, + "grad_norm": 0.5705507397651672, + "learning_rate": 2.295019882597993e-06, + "loss": 0.0944, + "step": 46750 + }, + { + "epoch": 4.427191819731112, + "grad_norm": 0.5704928040504456, + "learning_rate": 2.2912327210755543e-06, + "loss": 0.0906, + "step": 46760 + }, + { + "epoch": 4.428138610111722, + "grad_norm": 0.46121203899383545, + "learning_rate": 2.287445559553115e-06, + "loss": 0.0931, + "step": 46770 + }, + { + "epoch": 4.4290854004923315, + "grad_norm": 0.5511819124221802, + "learning_rate": 2.283658398030676e-06, + "loss": 0.0842, + "step": 46780 + }, + { + "epoch": 4.430032190872941, + "grad_norm": 0.6521610021591187, + "learning_rate": 2.2798712365082374e-06, + "loss": 0.0952, + "step": 46790 + }, + { + "epoch": 4.43097898125355, + "grad_norm": 0.48299136757850647, + "learning_rate": 2.2760840749857983e-06, + "loss": 0.0887, + "step": 46800 + }, + { + "epoch": 4.43192577163416, + "grad_norm": 0.5433517694473267, + "learning_rate": 2.272296913463359e-06, + "loss": 0.0947, + "step": 46810 + }, + { + "epoch": 4.4328725620147695, + "grad_norm": 0.4225456118583679, + "learning_rate": 2.2685097519409205e-06, + "loss": 0.0901, + "step": 46820 + }, + { + "epoch": 4.433819352395379, + "grad_norm": 0.5814269781112671, + "learning_rate": 2.264722590418482e-06, + "loss": 0.0831, + "step": 46830 + }, + { + "epoch": 4.434766142775989, + "grad_norm": 0.42077985405921936, + "learning_rate": 2.2609354288960427e-06, + "loss": 0.0984, + "step": 46840 + }, + { + "epoch": 4.435712933156599, + "grad_norm": 0.4770452380180359, + "learning_rate": 2.2571482673736036e-06, + "loss": 0.0832, + "step": 46850 + }, + { + "epoch": 4.4366597235372085, + "grad_norm": 0.47910043597221375, + "learning_rate": 2.253361105851165e-06, + "loss": 0.0877, + "step": 46860 + }, + { + "epoch": 4.437606513917818, + "grad_norm": 0.5875367522239685, + "learning_rate": 2.249573944328726e-06, + "loss": 0.0954, + "step": 46870 + }, + { + "epoch": 4.438553304298428, + "grad_norm": 0.4312451183795929, + "learning_rate": 2.2457867828062867e-06, + "loss": 0.0939, + "step": 46880 + }, + { + "epoch": 4.439500094679038, + "grad_norm": 0.5338612794876099, + "learning_rate": 2.241999621283848e-06, + "loss": 0.0901, + "step": 46890 + }, + { + "epoch": 4.4404468850596475, + "grad_norm": 0.516419529914856, + "learning_rate": 2.238212459761409e-06, + "loss": 0.0938, + "step": 46900 + }, + { + "epoch": 4.441393675440257, + "grad_norm": 0.7233615517616272, + "learning_rate": 2.23442529823897e-06, + "loss": 0.0881, + "step": 46910 + }, + { + "epoch": 4.442340465820867, + "grad_norm": 0.6186655163764954, + "learning_rate": 2.230638136716531e-06, + "loss": 0.0892, + "step": 46920 + }, + { + "epoch": 4.443287256201477, + "grad_norm": 0.5014260411262512, + "learning_rate": 2.2268509751940925e-06, + "loss": 0.0906, + "step": 46930 + }, + { + "epoch": 4.444234046582086, + "grad_norm": 0.5914987325668335, + "learning_rate": 2.2230638136716534e-06, + "loss": 0.0945, + "step": 46940 + }, + { + "epoch": 4.445180836962696, + "grad_norm": 0.5450988411903381, + "learning_rate": 2.2192766521492143e-06, + "loss": 0.0895, + "step": 46950 + }, + { + "epoch": 4.446127627343306, + "grad_norm": 0.5281708240509033, + "learning_rate": 2.2154894906267756e-06, + "loss": 0.0864, + "step": 46960 + }, + { + "epoch": 4.447074417723916, + "grad_norm": 0.5472949743270874, + "learning_rate": 2.2117023291043365e-06, + "loss": 0.0948, + "step": 46970 + }, + { + "epoch": 4.448021208104525, + "grad_norm": 0.6187710165977478, + "learning_rate": 2.2079151675818974e-06, + "loss": 0.0931, + "step": 46980 + }, + { + "epoch": 4.448967998485135, + "grad_norm": 0.4200582802295685, + "learning_rate": 2.2041280060594587e-06, + "loss": 0.0873, + "step": 46990 + }, + { + "epoch": 4.449914788865745, + "grad_norm": 0.4873574674129486, + "learning_rate": 2.2003408445370196e-06, + "loss": 0.0942, + "step": 47000 + }, + { + "epoch": 4.450861579246355, + "grad_norm": 0.4789665639400482, + "learning_rate": 2.196553683014581e-06, + "loss": 0.0916, + "step": 47010 + }, + { + "epoch": 4.451808369626964, + "grad_norm": 0.41863638162612915, + "learning_rate": 2.192766521492142e-06, + "loss": 0.0944, + "step": 47020 + }, + { + "epoch": 4.452755160007574, + "grad_norm": 0.6053740382194519, + "learning_rate": 2.1889793599697027e-06, + "loss": 0.0953, + "step": 47030 + }, + { + "epoch": 4.453701950388184, + "grad_norm": 0.613536536693573, + "learning_rate": 2.185192198447264e-06, + "loss": 0.0982, + "step": 47040 + }, + { + "epoch": 4.454648740768794, + "grad_norm": 0.5732889771461487, + "learning_rate": 2.181405036924825e-06, + "loss": 0.0998, + "step": 47050 + }, + { + "epoch": 4.455595531149403, + "grad_norm": 0.5707571506500244, + "learning_rate": 2.1776178754023862e-06, + "loss": 0.0998, + "step": 47060 + }, + { + "epoch": 4.456542321530013, + "grad_norm": 0.5420945882797241, + "learning_rate": 2.173830713879947e-06, + "loss": 0.1015, + "step": 47070 + }, + { + "epoch": 4.457489111910623, + "grad_norm": 0.49488115310668945, + "learning_rate": 2.1700435523575085e-06, + "loss": 0.0854, + "step": 47080 + }, + { + "epoch": 4.4584359022912325, + "grad_norm": 0.6199977993965149, + "learning_rate": 2.1662563908350694e-06, + "loss": 0.0994, + "step": 47090 + }, + { + "epoch": 4.459382692671842, + "grad_norm": 0.6206436157226562, + "learning_rate": 2.1624692293126302e-06, + "loss": 0.0893, + "step": 47100 + }, + { + "epoch": 4.460329483052452, + "grad_norm": 0.5212389826774597, + "learning_rate": 2.1586820677901916e-06, + "loss": 0.0995, + "step": 47110 + }, + { + "epoch": 4.461276273433062, + "grad_norm": 0.4194228947162628, + "learning_rate": 2.1548949062677525e-06, + "loss": 0.0898, + "step": 47120 + }, + { + "epoch": 4.4622230638136715, + "grad_norm": 0.6620252728462219, + "learning_rate": 2.1511077447453134e-06, + "loss": 0.0957, + "step": 47130 + }, + { + "epoch": 4.463169854194281, + "grad_norm": 0.6501525640487671, + "learning_rate": 2.1473205832228747e-06, + "loss": 0.0942, + "step": 47140 + }, + { + "epoch": 4.464116644574891, + "grad_norm": 0.46277403831481934, + "learning_rate": 2.143533421700436e-06, + "loss": 0.0901, + "step": 47150 + }, + { + "epoch": 4.465063434955501, + "grad_norm": 0.5377644896507263, + "learning_rate": 2.139746260177997e-06, + "loss": 0.0952, + "step": 47160 + }, + { + "epoch": 4.4660102253361105, + "grad_norm": 0.4474472403526306, + "learning_rate": 2.135959098655558e-06, + "loss": 0.085, + "step": 47170 + }, + { + "epoch": 4.46695701571672, + "grad_norm": 0.7432825565338135, + "learning_rate": 2.132171937133119e-06, + "loss": 0.0975, + "step": 47180 + }, + { + "epoch": 4.46790380609733, + "grad_norm": 0.6087955832481384, + "learning_rate": 2.12838477561068e-06, + "loss": 0.0924, + "step": 47190 + }, + { + "epoch": 4.46885059647794, + "grad_norm": 0.43486493825912476, + "learning_rate": 2.124597614088241e-06, + "loss": 0.0937, + "step": 47200 + }, + { + "epoch": 4.469797386858549, + "grad_norm": 0.597850501537323, + "learning_rate": 2.1208104525658022e-06, + "loss": 0.0919, + "step": 47210 + }, + { + "epoch": 4.470744177239159, + "grad_norm": 0.5512962937355042, + "learning_rate": 2.117023291043363e-06, + "loss": 0.0919, + "step": 47220 + }, + { + "epoch": 4.471690967619769, + "grad_norm": 0.6809868216514587, + "learning_rate": 2.113236129520924e-06, + "loss": 0.0883, + "step": 47230 + }, + { + "epoch": 4.472637758000379, + "grad_norm": 0.5824252963066101, + "learning_rate": 2.1094489679984853e-06, + "loss": 0.0948, + "step": 47240 + }, + { + "epoch": 4.473584548380988, + "grad_norm": 0.5019168853759766, + "learning_rate": 2.1056618064760467e-06, + "loss": 0.0888, + "step": 47250 + }, + { + "epoch": 4.474531338761598, + "grad_norm": 0.7237482666969299, + "learning_rate": 2.101874644953607e-06, + "loss": 0.0959, + "step": 47260 + }, + { + "epoch": 4.475478129142208, + "grad_norm": 0.5009791851043701, + "learning_rate": 2.0980874834311685e-06, + "loss": 0.0878, + "step": 47270 + }, + { + "epoch": 4.476424919522818, + "grad_norm": 0.6152024269104004, + "learning_rate": 2.0943003219087298e-06, + "loss": 0.1019, + "step": 47280 + }, + { + "epoch": 4.477371709903427, + "grad_norm": 0.589474618434906, + "learning_rate": 2.0905131603862907e-06, + "loss": 0.089, + "step": 47290 + }, + { + "epoch": 4.478318500284037, + "grad_norm": 0.5134758353233337, + "learning_rate": 2.0867259988638516e-06, + "loss": 0.0772, + "step": 47300 + }, + { + "epoch": 4.479265290664647, + "grad_norm": 0.5028401613235474, + "learning_rate": 2.082938837341413e-06, + "loss": 0.0933, + "step": 47310 + }, + { + "epoch": 4.480212081045257, + "grad_norm": 0.5507858395576477, + "learning_rate": 2.0791516758189738e-06, + "loss": 0.0952, + "step": 47320 + }, + { + "epoch": 4.481158871425866, + "grad_norm": 0.5241835713386536, + "learning_rate": 2.0753645142965347e-06, + "loss": 0.095, + "step": 47330 + }, + { + "epoch": 4.482105661806476, + "grad_norm": 0.4093499183654785, + "learning_rate": 2.071577352774096e-06, + "loss": 0.0999, + "step": 47340 + }, + { + "epoch": 4.483052452187086, + "grad_norm": 0.4314521253108978, + "learning_rate": 2.067790191251657e-06, + "loss": 0.0847, + "step": 47350 + }, + { + "epoch": 4.4839992425676956, + "grad_norm": 0.5057846903800964, + "learning_rate": 2.0640030297292182e-06, + "loss": 0.0932, + "step": 47360 + }, + { + "epoch": 4.484946032948305, + "grad_norm": 0.49404215812683105, + "learning_rate": 2.060215868206779e-06, + "loss": 0.0854, + "step": 47370 + }, + { + "epoch": 4.485892823328915, + "grad_norm": 0.46423012018203735, + "learning_rate": 2.0564287066843404e-06, + "loss": 0.0762, + "step": 47380 + }, + { + "epoch": 4.486839613709525, + "grad_norm": 0.568574070930481, + "learning_rate": 2.0526415451619013e-06, + "loss": 0.0926, + "step": 47390 + }, + { + "epoch": 4.4877864040901345, + "grad_norm": 0.45246657729148865, + "learning_rate": 2.0488543836394622e-06, + "loss": 0.0875, + "step": 47400 + }, + { + "epoch": 4.488733194470744, + "grad_norm": 0.6380802392959595, + "learning_rate": 2.0450672221170235e-06, + "loss": 0.0916, + "step": 47410 + }, + { + "epoch": 4.489679984851354, + "grad_norm": 0.6181575059890747, + "learning_rate": 2.0412800605945844e-06, + "loss": 0.1019, + "step": 47420 + }, + { + "epoch": 4.490626775231964, + "grad_norm": 0.47934210300445557, + "learning_rate": 2.0374928990721458e-06, + "loss": 0.0916, + "step": 47430 + }, + { + "epoch": 4.4915735656125735, + "grad_norm": 0.3730686902999878, + "learning_rate": 2.0337057375497067e-06, + "loss": 0.0926, + "step": 47440 + }, + { + "epoch": 4.492520355993183, + "grad_norm": 0.524203896522522, + "learning_rate": 2.0299185760272676e-06, + "loss": 0.0851, + "step": 47450 + }, + { + "epoch": 4.493467146373793, + "grad_norm": 0.602016806602478, + "learning_rate": 2.026131414504829e-06, + "loss": 0.1, + "step": 47460 + }, + { + "epoch": 4.494413936754403, + "grad_norm": 0.5126527547836304, + "learning_rate": 2.0223442529823898e-06, + "loss": 0.0947, + "step": 47470 + }, + { + "epoch": 4.4953607271350124, + "grad_norm": 0.5978550314903259, + "learning_rate": 2.0185570914599507e-06, + "loss": 0.0912, + "step": 47480 + }, + { + "epoch": 4.496307517515622, + "grad_norm": 0.6456809043884277, + "learning_rate": 2.014769929937512e-06, + "loss": 0.0906, + "step": 47490 + }, + { + "epoch": 4.497254307896232, + "grad_norm": 0.6243536472320557, + "learning_rate": 2.0109827684150733e-06, + "loss": 0.0892, + "step": 47500 + }, + { + "epoch": 4.498201098276842, + "grad_norm": 0.6239606738090515, + "learning_rate": 2.007195606892634e-06, + "loss": 0.1018, + "step": 47510 + }, + { + "epoch": 4.499147888657451, + "grad_norm": 0.5502533316612244, + "learning_rate": 2.003408445370195e-06, + "loss": 0.0876, + "step": 47520 + }, + { + "epoch": 4.500094679038061, + "grad_norm": 0.5233820676803589, + "learning_rate": 1.9996212838477564e-06, + "loss": 0.0894, + "step": 47530 + }, + { + "epoch": 4.501041469418671, + "grad_norm": 0.5940799713134766, + "learning_rate": 1.9958341223253173e-06, + "loss": 0.093, + "step": 47540 + }, + { + "epoch": 4.501988259799281, + "grad_norm": 0.4981343150138855, + "learning_rate": 1.9920469608028782e-06, + "loss": 0.0985, + "step": 47550 + }, + { + "epoch": 4.50293505017989, + "grad_norm": 0.5547967553138733, + "learning_rate": 1.9882597992804395e-06, + "loss": 0.0964, + "step": 47560 + }, + { + "epoch": 4.5038818405605, + "grad_norm": 0.5440186262130737, + "learning_rate": 1.9844726377580004e-06, + "loss": 0.0894, + "step": 47570 + }, + { + "epoch": 4.50482863094111, + "grad_norm": 0.4961601197719574, + "learning_rate": 1.9806854762355613e-06, + "loss": 0.0971, + "step": 47580 + }, + { + "epoch": 4.50577542132172, + "grad_norm": 0.533189594745636, + "learning_rate": 1.9768983147131226e-06, + "loss": 0.0958, + "step": 47590 + }, + { + "epoch": 4.506722211702329, + "grad_norm": 0.6105077862739563, + "learning_rate": 1.973111153190684e-06, + "loss": 0.1035, + "step": 47600 + }, + { + "epoch": 4.507669002082939, + "grad_norm": 0.48340150713920593, + "learning_rate": 1.969323991668245e-06, + "loss": 0.0964, + "step": 47610 + }, + { + "epoch": 4.508615792463549, + "grad_norm": 0.4797910451889038, + "learning_rate": 1.9655368301458058e-06, + "loss": 0.087, + "step": 47620 + }, + { + "epoch": 4.509562582844159, + "grad_norm": 0.4579121470451355, + "learning_rate": 1.961749668623367e-06, + "loss": 0.0753, + "step": 47630 + }, + { + "epoch": 4.510509373224768, + "grad_norm": 0.44602665305137634, + "learning_rate": 1.957962507100928e-06, + "loss": 0.0839, + "step": 47640 + }, + { + "epoch": 4.511456163605378, + "grad_norm": 0.5439406633377075, + "learning_rate": 1.954175345578489e-06, + "loss": 0.0939, + "step": 47650 + }, + { + "epoch": 4.512402953985988, + "grad_norm": 0.49075302481651306, + "learning_rate": 1.95038818405605e-06, + "loss": 0.0858, + "step": 47660 + }, + { + "epoch": 4.5133497443665975, + "grad_norm": 0.6823569536209106, + "learning_rate": 1.946601022533611e-06, + "loss": 0.0905, + "step": 47670 + }, + { + "epoch": 4.514296534747207, + "grad_norm": 0.6025480031967163, + "learning_rate": 1.9428138610111724e-06, + "loss": 0.0906, + "step": 47680 + }, + { + "epoch": 4.515243325127817, + "grad_norm": 0.5409242510795593, + "learning_rate": 1.9390266994887333e-06, + "loss": 0.0802, + "step": 47690 + }, + { + "epoch": 4.516190115508427, + "grad_norm": 0.7054596543312073, + "learning_rate": 1.9352395379662946e-06, + "loss": 0.0928, + "step": 47700 + }, + { + "epoch": 4.5171369058890365, + "grad_norm": 0.5168089270591736, + "learning_rate": 1.9314523764438555e-06, + "loss": 0.0991, + "step": 47710 + }, + { + "epoch": 4.518083696269646, + "grad_norm": 0.4419559836387634, + "learning_rate": 1.9276652149214164e-06, + "loss": 0.0989, + "step": 47720 + }, + { + "epoch": 4.519030486650256, + "grad_norm": 0.38130053877830505, + "learning_rate": 1.9238780533989777e-06, + "loss": 0.0909, + "step": 47730 + }, + { + "epoch": 4.519977277030866, + "grad_norm": 0.4823101758956909, + "learning_rate": 1.9200908918765386e-06, + "loss": 0.0903, + "step": 47740 + }, + { + "epoch": 4.5209240674114755, + "grad_norm": 0.47707557678222656, + "learning_rate": 1.9163037303541e-06, + "loss": 0.0872, + "step": 47750 + }, + { + "epoch": 4.521870857792085, + "grad_norm": 0.549338161945343, + "learning_rate": 1.912516568831661e-06, + "loss": 0.0893, + "step": 47760 + }, + { + "epoch": 4.522817648172695, + "grad_norm": 0.4239804148674011, + "learning_rate": 1.9087294073092218e-06, + "loss": 0.1028, + "step": 47770 + }, + { + "epoch": 4.523764438553305, + "grad_norm": 0.5911729335784912, + "learning_rate": 1.904942245786783e-06, + "loss": 0.0959, + "step": 47780 + }, + { + "epoch": 4.524711228933914, + "grad_norm": 0.635185956954956, + "learning_rate": 1.901155084264344e-06, + "loss": 0.0912, + "step": 47790 + }, + { + "epoch": 4.525658019314523, + "grad_norm": 0.43398451805114746, + "learning_rate": 1.897367922741905e-06, + "loss": 0.09, + "step": 47800 + }, + { + "epoch": 4.526604809695133, + "grad_norm": 0.4836176931858063, + "learning_rate": 1.8935807612194662e-06, + "loss": 0.0944, + "step": 47810 + }, + { + "epoch": 4.527551600075743, + "grad_norm": 0.5630194544792175, + "learning_rate": 1.8897935996970273e-06, + "loss": 0.0997, + "step": 47820 + }, + { + "epoch": 4.5284983904563525, + "grad_norm": 0.6335233449935913, + "learning_rate": 1.8860064381745882e-06, + "loss": 0.1002, + "step": 47830 + }, + { + "epoch": 4.529445180836962, + "grad_norm": 0.5927964448928833, + "learning_rate": 1.8822192766521493e-06, + "loss": 0.0922, + "step": 47840 + }, + { + "epoch": 4.530391971217572, + "grad_norm": 0.6638123393058777, + "learning_rate": 1.8784321151297106e-06, + "loss": 0.0957, + "step": 47850 + }, + { + "epoch": 4.531338761598182, + "grad_norm": 0.5143892765045166, + "learning_rate": 1.8746449536072713e-06, + "loss": 0.0997, + "step": 47860 + }, + { + "epoch": 4.5322855519787915, + "grad_norm": 0.6001013517379761, + "learning_rate": 1.8708577920848326e-06, + "loss": 0.0843, + "step": 47870 + }, + { + "epoch": 4.533232342359401, + "grad_norm": 0.5932102799415588, + "learning_rate": 1.8670706305623937e-06, + "loss": 0.0833, + "step": 47880 + }, + { + "epoch": 4.534179132740011, + "grad_norm": 0.5315927863121033, + "learning_rate": 1.8632834690399548e-06, + "loss": 0.0888, + "step": 47890 + }, + { + "epoch": 4.535125923120621, + "grad_norm": 0.47217097878456116, + "learning_rate": 1.8594963075175157e-06, + "loss": 0.0947, + "step": 47900 + }, + { + "epoch": 4.53607271350123, + "grad_norm": 0.554823100566864, + "learning_rate": 1.8557091459950768e-06, + "loss": 0.087, + "step": 47910 + }, + { + "epoch": 4.53701950388184, + "grad_norm": 0.4658668339252472, + "learning_rate": 1.851921984472638e-06, + "loss": 0.097, + "step": 47920 + }, + { + "epoch": 4.53796629426245, + "grad_norm": 0.6078159213066101, + "learning_rate": 1.8481348229501988e-06, + "loss": 0.0929, + "step": 47930 + }, + { + "epoch": 4.53891308464306, + "grad_norm": 0.42829376459121704, + "learning_rate": 1.84434766142776e-06, + "loss": 0.091, + "step": 47940 + }, + { + "epoch": 4.539859875023669, + "grad_norm": 0.5195884704589844, + "learning_rate": 1.840560499905321e-06, + "loss": 0.0881, + "step": 47950 + }, + { + "epoch": 4.540806665404279, + "grad_norm": 0.6427383422851562, + "learning_rate": 1.8367733383828824e-06, + "loss": 0.0967, + "step": 47960 + }, + { + "epoch": 4.541753455784889, + "grad_norm": 0.4192584753036499, + "learning_rate": 1.832986176860443e-06, + "loss": 0.0976, + "step": 47970 + }, + { + "epoch": 4.542700246165499, + "grad_norm": 0.5321178436279297, + "learning_rate": 1.8291990153380044e-06, + "loss": 0.0993, + "step": 47980 + }, + { + "epoch": 4.543647036546108, + "grad_norm": 0.5339673757553101, + "learning_rate": 1.8254118538155655e-06, + "loss": 0.0901, + "step": 47990 + }, + { + "epoch": 4.544593826926718, + "grad_norm": 0.5013030767440796, + "learning_rate": 1.8216246922931264e-06, + "loss": 0.0915, + "step": 48000 + }, + { + "epoch": 4.545540617307328, + "grad_norm": 0.5032514929771423, + "learning_rate": 1.8178375307706875e-06, + "loss": 0.0938, + "step": 48010 + }, + { + "epoch": 4.546487407687938, + "grad_norm": 0.48303863406181335, + "learning_rate": 1.8140503692482486e-06, + "loss": 0.0889, + "step": 48020 + }, + { + "epoch": 4.547434198068547, + "grad_norm": 0.5052141547203064, + "learning_rate": 1.8102632077258097e-06, + "loss": 0.0935, + "step": 48030 + }, + { + "epoch": 4.548380988449157, + "grad_norm": 0.5991350412368774, + "learning_rate": 1.8064760462033706e-06, + "loss": 0.0912, + "step": 48040 + }, + { + "epoch": 4.549327778829767, + "grad_norm": 0.5054553747177124, + "learning_rate": 1.8026888846809317e-06, + "loss": 0.0939, + "step": 48050 + }, + { + "epoch": 4.5502745692103765, + "grad_norm": 0.5401033759117126, + "learning_rate": 1.7989017231584928e-06, + "loss": 0.0977, + "step": 48060 + }, + { + "epoch": 4.551221359590986, + "grad_norm": 0.5015616416931152, + "learning_rate": 1.7951145616360537e-06, + "loss": 0.0882, + "step": 48070 + }, + { + "epoch": 4.552168149971596, + "grad_norm": 0.6362848877906799, + "learning_rate": 1.791327400113615e-06, + "loss": 0.095, + "step": 48080 + }, + { + "epoch": 4.553114940352206, + "grad_norm": 0.5170758962631226, + "learning_rate": 1.7875402385911762e-06, + "loss": 0.0863, + "step": 48090 + }, + { + "epoch": 4.5540617307328155, + "grad_norm": 0.5425009727478027, + "learning_rate": 1.7837530770687373e-06, + "loss": 0.0876, + "step": 48100 + }, + { + "epoch": 4.555008521113425, + "grad_norm": 0.5243383049964905, + "learning_rate": 1.7799659155462982e-06, + "loss": 0.0919, + "step": 48110 + }, + { + "epoch": 4.555955311494035, + "grad_norm": 0.8553285002708435, + "learning_rate": 1.7761787540238593e-06, + "loss": 0.0934, + "step": 48120 + }, + { + "epoch": 4.556902101874645, + "grad_norm": 0.548096239566803, + "learning_rate": 1.7723915925014204e-06, + "loss": 0.0834, + "step": 48130 + }, + { + "epoch": 4.5578488922552545, + "grad_norm": 0.7237502336502075, + "learning_rate": 1.7686044309789813e-06, + "loss": 0.0891, + "step": 48140 + }, + { + "epoch": 4.558795682635864, + "grad_norm": 0.5385095477104187, + "learning_rate": 1.7648172694565424e-06, + "loss": 0.0946, + "step": 48150 + }, + { + "epoch": 4.559742473016474, + "grad_norm": 0.577634871006012, + "learning_rate": 1.7610301079341035e-06, + "loss": 0.0991, + "step": 48160 + }, + { + "epoch": 4.560689263397084, + "grad_norm": 0.4976717531681061, + "learning_rate": 1.7572429464116648e-06, + "loss": 0.0842, + "step": 48170 + }, + { + "epoch": 4.561636053777693, + "grad_norm": 0.5380846261978149, + "learning_rate": 1.7534557848892255e-06, + "loss": 0.0935, + "step": 48180 + }, + { + "epoch": 4.562582844158303, + "grad_norm": 0.5093594789505005, + "learning_rate": 1.7496686233667868e-06, + "loss": 0.0862, + "step": 48190 + }, + { + "epoch": 4.563529634538913, + "grad_norm": 0.6310622096061707, + "learning_rate": 1.745881461844348e-06, + "loss": 0.1006, + "step": 48200 + }, + { + "epoch": 4.564476424919523, + "grad_norm": 0.47840484976768494, + "learning_rate": 1.7420943003219088e-06, + "loss": 0.0926, + "step": 48210 + }, + { + "epoch": 4.565423215300132, + "grad_norm": 0.5460345149040222, + "learning_rate": 1.73830713879947e-06, + "loss": 0.092, + "step": 48220 + }, + { + "epoch": 4.566370005680742, + "grad_norm": 0.46515849232673645, + "learning_rate": 1.734519977277031e-06, + "loss": 0.0905, + "step": 48230 + }, + { + "epoch": 4.567316796061352, + "grad_norm": 0.45555320382118225, + "learning_rate": 1.7307328157545921e-06, + "loss": 0.0841, + "step": 48240 + }, + { + "epoch": 4.568263586441962, + "grad_norm": 0.587450385093689, + "learning_rate": 1.726945654232153e-06, + "loss": 0.0927, + "step": 48250 + }, + { + "epoch": 4.569210376822571, + "grad_norm": 0.5282682776451111, + "learning_rate": 1.7231584927097142e-06, + "loss": 0.0789, + "step": 48260 + }, + { + "epoch": 4.570157167203181, + "grad_norm": 0.847499668598175, + "learning_rate": 1.7193713311872753e-06, + "loss": 0.0904, + "step": 48270 + }, + { + "epoch": 4.571103957583791, + "grad_norm": 0.5478895306587219, + "learning_rate": 1.7155841696648362e-06, + "loss": 0.0991, + "step": 48280 + }, + { + "epoch": 4.572050747964401, + "grad_norm": 0.49527424573898315, + "learning_rate": 1.7117970081423973e-06, + "loss": 0.089, + "step": 48290 + }, + { + "epoch": 4.57299753834501, + "grad_norm": 0.4628439247608185, + "learning_rate": 1.7080098466199586e-06, + "loss": 0.0934, + "step": 48300 + }, + { + "epoch": 4.57394432872562, + "grad_norm": 0.5064544677734375, + "learning_rate": 1.7042226850975197e-06, + "loss": 0.1002, + "step": 48310 + }, + { + "epoch": 4.57489111910623, + "grad_norm": 0.5350474715232849, + "learning_rate": 1.7004355235750806e-06, + "loss": 0.0985, + "step": 48320 + }, + { + "epoch": 4.5758379094868395, + "grad_norm": 0.45474421977996826, + "learning_rate": 1.6966483620526417e-06, + "loss": 0.0882, + "step": 48330 + }, + { + "epoch": 4.576784699867449, + "grad_norm": 0.480854332447052, + "learning_rate": 1.6928612005302028e-06, + "loss": 0.0882, + "step": 48340 + }, + { + "epoch": 4.577731490248059, + "grad_norm": 0.5241674780845642, + "learning_rate": 1.689074039007764e-06, + "loss": 0.0966, + "step": 48350 + }, + { + "epoch": 4.578678280628669, + "grad_norm": 0.5988481640815735, + "learning_rate": 1.6852868774853248e-06, + "loss": 0.0881, + "step": 48360 + }, + { + "epoch": 4.5796250710092785, + "grad_norm": 0.5592998266220093, + "learning_rate": 1.681499715962886e-06, + "loss": 0.084, + "step": 48370 + }, + { + "epoch": 4.580571861389888, + "grad_norm": 0.5882778167724609, + "learning_rate": 1.677712554440447e-06, + "loss": 0.0969, + "step": 48380 + }, + { + "epoch": 4.581518651770498, + "grad_norm": 0.5792142748832703, + "learning_rate": 1.673925392918008e-06, + "loss": 0.084, + "step": 48390 + }, + { + "epoch": 4.582465442151108, + "grad_norm": 0.586322009563446, + "learning_rate": 1.670138231395569e-06, + "loss": 0.0948, + "step": 48400 + }, + { + "epoch": 4.5834122325317175, + "grad_norm": 0.4606407880783081, + "learning_rate": 1.6663510698731304e-06, + "loss": 0.0815, + "step": 48410 + }, + { + "epoch": 4.584359022912327, + "grad_norm": 0.5143293142318726, + "learning_rate": 1.6625639083506915e-06, + "loss": 0.0904, + "step": 48420 + }, + { + "epoch": 4.585305813292937, + "grad_norm": 0.5697513818740845, + "learning_rate": 1.6587767468282524e-06, + "loss": 0.0917, + "step": 48430 + }, + { + "epoch": 4.586252603673547, + "grad_norm": 0.568105161190033, + "learning_rate": 1.6549895853058135e-06, + "loss": 0.0918, + "step": 48440 + }, + { + "epoch": 4.587199394054156, + "grad_norm": 0.5528663396835327, + "learning_rate": 1.6512024237833746e-06, + "loss": 0.094, + "step": 48450 + }, + { + "epoch": 4.588146184434766, + "grad_norm": 0.5852027535438538, + "learning_rate": 1.6474152622609355e-06, + "loss": 0.099, + "step": 48460 + }, + { + "epoch": 4.589092974815376, + "grad_norm": 0.7316393256187439, + "learning_rate": 1.6436281007384966e-06, + "loss": 0.099, + "step": 48470 + }, + { + "epoch": 4.590039765195986, + "grad_norm": 0.5696329474449158, + "learning_rate": 1.6398409392160577e-06, + "loss": 0.1, + "step": 48480 + }, + { + "epoch": 4.590986555576595, + "grad_norm": 0.4912358522415161, + "learning_rate": 1.6360537776936188e-06, + "loss": 0.0913, + "step": 48490 + }, + { + "epoch": 4.591933345957205, + "grad_norm": 0.6251103281974792, + "learning_rate": 1.6322666161711797e-06, + "loss": 0.0927, + "step": 48500 + }, + { + "epoch": 4.592880136337815, + "grad_norm": 0.6149888038635254, + "learning_rate": 1.628479454648741e-06, + "loss": 0.1018, + "step": 48510 + }, + { + "epoch": 4.593826926718425, + "grad_norm": 0.39401429891586304, + "learning_rate": 1.6246922931263021e-06, + "loss": 0.0951, + "step": 48520 + }, + { + "epoch": 4.594773717099034, + "grad_norm": 0.585252046585083, + "learning_rate": 1.620905131603863e-06, + "loss": 0.0879, + "step": 48530 + }, + { + "epoch": 4.595720507479644, + "grad_norm": 0.5147320628166199, + "learning_rate": 1.6171179700814241e-06, + "loss": 0.0849, + "step": 48540 + }, + { + "epoch": 4.596667297860254, + "grad_norm": 0.5566774010658264, + "learning_rate": 1.6133308085589852e-06, + "loss": 0.0954, + "step": 48550 + }, + { + "epoch": 4.597614088240864, + "grad_norm": 0.5427478551864624, + "learning_rate": 1.6095436470365463e-06, + "loss": 0.0856, + "step": 48560 + }, + { + "epoch": 4.598560878621473, + "grad_norm": 0.4353178143501282, + "learning_rate": 1.6057564855141072e-06, + "loss": 0.0823, + "step": 48570 + }, + { + "epoch": 4.599507669002083, + "grad_norm": 0.49626612663269043, + "learning_rate": 1.6019693239916683e-06, + "loss": 0.0825, + "step": 48580 + }, + { + "epoch": 4.600454459382693, + "grad_norm": 0.5396493077278137, + "learning_rate": 1.5981821624692295e-06, + "loss": 0.0931, + "step": 48590 + }, + { + "epoch": 4.601401249763303, + "grad_norm": 0.6535559892654419, + "learning_rate": 1.5943950009467904e-06, + "loss": 0.0897, + "step": 48600 + }, + { + "epoch": 4.602348040143912, + "grad_norm": 0.4790814518928528, + "learning_rate": 1.5906078394243515e-06, + "loss": 0.0877, + "step": 48610 + }, + { + "epoch": 4.603294830524522, + "grad_norm": 0.5527889132499695, + "learning_rate": 1.5868206779019128e-06, + "loss": 0.0879, + "step": 48620 + }, + { + "epoch": 4.604241620905132, + "grad_norm": 0.5145699977874756, + "learning_rate": 1.5830335163794739e-06, + "loss": 0.0951, + "step": 48630 + }, + { + "epoch": 4.6051884112857415, + "grad_norm": 0.5361632108688354, + "learning_rate": 1.5792463548570348e-06, + "loss": 0.0875, + "step": 48640 + }, + { + "epoch": 4.606135201666351, + "grad_norm": 0.4808364808559418, + "learning_rate": 1.5754591933345959e-06, + "loss": 0.0941, + "step": 48650 + }, + { + "epoch": 4.607081992046961, + "grad_norm": 0.46790459752082825, + "learning_rate": 1.571672031812157e-06, + "loss": 0.0998, + "step": 48660 + }, + { + "epoch": 4.608028782427571, + "grad_norm": 0.5993084907531738, + "learning_rate": 1.567884870289718e-06, + "loss": 0.0939, + "step": 48670 + }, + { + "epoch": 4.6089755728081805, + "grad_norm": 0.5489649772644043, + "learning_rate": 1.564097708767279e-06, + "loss": 0.0902, + "step": 48680 + }, + { + "epoch": 4.60992236318879, + "grad_norm": 0.4542575776576996, + "learning_rate": 1.5603105472448401e-06, + "loss": 0.0962, + "step": 48690 + }, + { + "epoch": 4.6108691535694, + "grad_norm": 0.5660474300384521, + "learning_rate": 1.5565233857224012e-06, + "loss": 0.1011, + "step": 48700 + }, + { + "epoch": 4.61181594395001, + "grad_norm": 0.36919817328453064, + "learning_rate": 1.5527362241999621e-06, + "loss": 0.0955, + "step": 48710 + }, + { + "epoch": 4.6127627343306195, + "grad_norm": 0.44812771677970886, + "learning_rate": 1.5489490626775232e-06, + "loss": 0.0887, + "step": 48720 + }, + { + "epoch": 4.613709524711229, + "grad_norm": 0.48133018612861633, + "learning_rate": 1.5451619011550845e-06, + "loss": 0.0901, + "step": 48730 + }, + { + "epoch": 4.614656315091839, + "grad_norm": 0.5825098156929016, + "learning_rate": 1.5413747396326452e-06, + "loss": 0.0916, + "step": 48740 + }, + { + "epoch": 4.615603105472449, + "grad_norm": 0.49545857310295105, + "learning_rate": 1.5375875781102066e-06, + "loss": 0.0859, + "step": 48750 + }, + { + "epoch": 4.616549895853058, + "grad_norm": 0.6294886469841003, + "learning_rate": 1.5338004165877677e-06, + "loss": 0.0948, + "step": 48760 + }, + { + "epoch": 4.617496686233668, + "grad_norm": 0.4956039488315582, + "learning_rate": 1.5300132550653288e-06, + "loss": 0.0992, + "step": 48770 + }, + { + "epoch": 4.618443476614278, + "grad_norm": 0.5199247598648071, + "learning_rate": 1.5262260935428897e-06, + "loss": 0.0933, + "step": 48780 + }, + { + "epoch": 4.619390266994888, + "grad_norm": 0.6329244375228882, + "learning_rate": 1.5224389320204508e-06, + "loss": 0.0943, + "step": 48790 + }, + { + "epoch": 4.620337057375497, + "grad_norm": 0.43671914935112, + "learning_rate": 1.5186517704980119e-06, + "loss": 0.0833, + "step": 48800 + }, + { + "epoch": 4.621283847756107, + "grad_norm": 0.7302520871162415, + "learning_rate": 1.5148646089755728e-06, + "loss": 0.0935, + "step": 48810 + }, + { + "epoch": 4.622230638136717, + "grad_norm": 0.5845633149147034, + "learning_rate": 1.5110774474531339e-06, + "loss": 0.0968, + "step": 48820 + }, + { + "epoch": 4.623177428517327, + "grad_norm": 0.4450472891330719, + "learning_rate": 1.507290285930695e-06, + "loss": 0.0898, + "step": 48830 + }, + { + "epoch": 4.624124218897936, + "grad_norm": 0.4358493685722351, + "learning_rate": 1.5035031244082563e-06, + "loss": 0.0884, + "step": 48840 + }, + { + "epoch": 4.625071009278546, + "grad_norm": 0.4955228865146637, + "learning_rate": 1.4997159628858172e-06, + "loss": 0.1026, + "step": 48850 + }, + { + "epoch": 4.626017799659156, + "grad_norm": 0.5216303467750549, + "learning_rate": 1.4959288013633783e-06, + "loss": 0.0972, + "step": 48860 + }, + { + "epoch": 4.626964590039766, + "grad_norm": 0.5317696928977966, + "learning_rate": 1.4921416398409394e-06, + "loss": 0.0997, + "step": 48870 + }, + { + "epoch": 4.627911380420375, + "grad_norm": 0.6008380055427551, + "learning_rate": 1.4883544783185003e-06, + "loss": 0.0917, + "step": 48880 + }, + { + "epoch": 4.628858170800985, + "grad_norm": 0.46178102493286133, + "learning_rate": 1.4845673167960614e-06, + "loss": 0.0947, + "step": 48890 + }, + { + "epoch": 4.629804961181595, + "grad_norm": 0.46813449263572693, + "learning_rate": 1.4807801552736225e-06, + "loss": 0.0991, + "step": 48900 + }, + { + "epoch": 4.6307517515622045, + "grad_norm": 0.5440060496330261, + "learning_rate": 1.4769929937511836e-06, + "loss": 0.0876, + "step": 48910 + }, + { + "epoch": 4.631698541942814, + "grad_norm": 0.6925702691078186, + "learning_rate": 1.4732058322287445e-06, + "loss": 0.0983, + "step": 48920 + }, + { + "epoch": 4.632645332323424, + "grad_norm": 0.5300967693328857, + "learning_rate": 1.4694186707063057e-06, + "loss": 0.0946, + "step": 48930 + }, + { + "epoch": 4.633592122704034, + "grad_norm": 0.6564103364944458, + "learning_rate": 1.465631509183867e-06, + "loss": 0.1005, + "step": 48940 + }, + { + "epoch": 4.6345389130846435, + "grad_norm": 0.7219815254211426, + "learning_rate": 1.4618443476614277e-06, + "loss": 0.0886, + "step": 48950 + }, + { + "epoch": 4.635485703465253, + "grad_norm": 0.486798495054245, + "learning_rate": 1.458057186138989e-06, + "loss": 0.0898, + "step": 48960 + }, + { + "epoch": 4.636432493845863, + "grad_norm": 0.5655710697174072, + "learning_rate": 1.45427002461655e-06, + "loss": 0.0954, + "step": 48970 + }, + { + "epoch": 4.637379284226473, + "grad_norm": 0.5758100152015686, + "learning_rate": 1.4504828630941112e-06, + "loss": 0.0938, + "step": 48980 + }, + { + "epoch": 4.6383260746070825, + "grad_norm": 0.4616601765155792, + "learning_rate": 1.446695701571672e-06, + "loss": 0.0915, + "step": 48990 + }, + { + "epoch": 4.639272864987692, + "grad_norm": 0.6310985088348389, + "learning_rate": 1.4429085400492332e-06, + "loss": 0.0851, + "step": 49000 + }, + { + "epoch": 4.640219655368302, + "grad_norm": 0.46795547008514404, + "learning_rate": 1.4391213785267943e-06, + "loss": 0.0965, + "step": 49010 + }, + { + "epoch": 4.641166445748912, + "grad_norm": 0.546395480632782, + "learning_rate": 1.4353342170043554e-06, + "loss": 0.096, + "step": 49020 + }, + { + "epoch": 4.6421132361295205, + "grad_norm": 0.5430060029029846, + "learning_rate": 1.4315470554819163e-06, + "loss": 0.0937, + "step": 49030 + }, + { + "epoch": 4.64306002651013, + "grad_norm": 0.5058007836341858, + "learning_rate": 1.4277598939594774e-06, + "loss": 0.0872, + "step": 49040 + }, + { + "epoch": 4.64400681689074, + "grad_norm": 0.7336437702178955, + "learning_rate": 1.4239727324370387e-06, + "loss": 0.0842, + "step": 49050 + }, + { + "epoch": 4.64495360727135, + "grad_norm": 0.4892963469028473, + "learning_rate": 1.4201855709145994e-06, + "loss": 0.0875, + "step": 49060 + }, + { + "epoch": 4.6459003976519595, + "grad_norm": 0.4091499447822571, + "learning_rate": 1.4163984093921607e-06, + "loss": 0.1047, + "step": 49070 + }, + { + "epoch": 4.646847188032569, + "grad_norm": 0.6091591715812683, + "learning_rate": 1.4126112478697219e-06, + "loss": 0.097, + "step": 49080 + }, + { + "epoch": 4.647793978413179, + "grad_norm": 0.5314242839813232, + "learning_rate": 1.408824086347283e-06, + "loss": 0.0954, + "step": 49090 + }, + { + "epoch": 4.648740768793789, + "grad_norm": 0.569369912147522, + "learning_rate": 1.4050369248248439e-06, + "loss": 0.1066, + "step": 49100 + }, + { + "epoch": 4.6496875591743985, + "grad_norm": 0.5993393659591675, + "learning_rate": 1.401249763302405e-06, + "loss": 0.0892, + "step": 49110 + }, + { + "epoch": 4.650634349555008, + "grad_norm": 0.5521275401115417, + "learning_rate": 1.397462601779966e-06, + "loss": 0.0922, + "step": 49120 + }, + { + "epoch": 4.651581139935618, + "grad_norm": 0.3962774872779846, + "learning_rate": 1.393675440257527e-06, + "loss": 0.0928, + "step": 49130 + }, + { + "epoch": 4.652527930316228, + "grad_norm": 0.5043448805809021, + "learning_rate": 1.389888278735088e-06, + "loss": 0.1001, + "step": 49140 + }, + { + "epoch": 4.653474720696837, + "grad_norm": 0.5763162970542908, + "learning_rate": 1.3861011172126492e-06, + "loss": 0.0874, + "step": 49150 + }, + { + "epoch": 4.654421511077447, + "grad_norm": 0.46237945556640625, + "learning_rate": 1.3823139556902105e-06, + "loss": 0.0857, + "step": 49160 + }, + { + "epoch": 4.655368301458057, + "grad_norm": 0.5338969826698303, + "learning_rate": 1.3785267941677712e-06, + "loss": 0.0896, + "step": 49170 + }, + { + "epoch": 4.656315091838667, + "grad_norm": 0.4410754144191742, + "learning_rate": 1.3747396326453325e-06, + "loss": 0.0927, + "step": 49180 + }, + { + "epoch": 4.657261882219276, + "grad_norm": 0.5380188822746277, + "learning_rate": 1.3709524711228936e-06, + "loss": 0.0926, + "step": 49190 + }, + { + "epoch": 4.658208672599886, + "grad_norm": 0.5101576447486877, + "learning_rate": 1.3671653096004545e-06, + "loss": 0.0946, + "step": 49200 + }, + { + "epoch": 4.659155462980496, + "grad_norm": 0.49067002534866333, + "learning_rate": 1.3633781480780156e-06, + "loss": 0.0882, + "step": 49210 + }, + { + "epoch": 4.660102253361106, + "grad_norm": 0.5032888054847717, + "learning_rate": 1.3595909865555767e-06, + "loss": 0.0909, + "step": 49220 + }, + { + "epoch": 4.661049043741715, + "grad_norm": 0.4651202857494354, + "learning_rate": 1.3558038250331378e-06, + "loss": 0.0969, + "step": 49230 + }, + { + "epoch": 4.661995834122325, + "grad_norm": 0.42336854338645935, + "learning_rate": 1.3520166635106987e-06, + "loss": 0.0834, + "step": 49240 + }, + { + "epoch": 4.662942624502935, + "grad_norm": 0.5473812222480774, + "learning_rate": 1.3482295019882598e-06, + "loss": 0.0901, + "step": 49250 + }, + { + "epoch": 4.663889414883545, + "grad_norm": 0.5984389781951904, + "learning_rate": 1.344442340465821e-06, + "loss": 0.0965, + "step": 49260 + }, + { + "epoch": 4.664836205264154, + "grad_norm": 0.5616798400878906, + "learning_rate": 1.3406551789433819e-06, + "loss": 0.0972, + "step": 49270 + }, + { + "epoch": 4.665782995644764, + "grad_norm": 0.5246201753616333, + "learning_rate": 1.3368680174209432e-06, + "loss": 0.0831, + "step": 49280 + }, + { + "epoch": 4.666729786025374, + "grad_norm": 0.5149614214897156, + "learning_rate": 1.3330808558985043e-06, + "loss": 0.0932, + "step": 49290 + }, + { + "epoch": 4.6676765764059835, + "grad_norm": 0.5594366192817688, + "learning_rate": 1.3292936943760654e-06, + "loss": 0.0884, + "step": 49300 + }, + { + "epoch": 4.668623366786593, + "grad_norm": 0.4831557869911194, + "learning_rate": 1.3255065328536263e-06, + "loss": 0.0873, + "step": 49310 + }, + { + "epoch": 4.669570157167203, + "grad_norm": 0.47912707924842834, + "learning_rate": 1.3217193713311874e-06, + "loss": 0.0916, + "step": 49320 + }, + { + "epoch": 4.670516947547813, + "grad_norm": 0.525671124458313, + "learning_rate": 1.3179322098087485e-06, + "loss": 0.0933, + "step": 49330 + }, + { + "epoch": 4.6714637379284225, + "grad_norm": 0.4452461898326874, + "learning_rate": 1.3141450482863094e-06, + "loss": 0.09, + "step": 49340 + }, + { + "epoch": 4.672410528309032, + "grad_norm": 0.5709280371665955, + "learning_rate": 1.3103578867638705e-06, + "loss": 0.0957, + "step": 49350 + }, + { + "epoch": 4.673357318689642, + "grad_norm": 0.48085644841194153, + "learning_rate": 1.3065707252414316e-06, + "loss": 0.0907, + "step": 49360 + }, + { + "epoch": 4.674304109070252, + "grad_norm": 0.5009046196937561, + "learning_rate": 1.302783563718993e-06, + "loss": 0.094, + "step": 49370 + }, + { + "epoch": 4.6752508994508615, + "grad_norm": 0.5080695152282715, + "learning_rate": 1.2989964021965536e-06, + "loss": 0.0946, + "step": 49380 + }, + { + "epoch": 4.676197689831471, + "grad_norm": 0.5521093010902405, + "learning_rate": 1.295209240674115e-06, + "loss": 0.0906, + "step": 49390 + }, + { + "epoch": 4.677144480212081, + "grad_norm": 0.5247788429260254, + "learning_rate": 1.291422079151676e-06, + "loss": 0.0859, + "step": 49400 + }, + { + "epoch": 4.678091270592691, + "grad_norm": 0.6452503204345703, + "learning_rate": 1.287634917629237e-06, + "loss": 0.1062, + "step": 49410 + }, + { + "epoch": 4.6790380609733, + "grad_norm": 0.5267934799194336, + "learning_rate": 1.283847756106798e-06, + "loss": 0.0984, + "step": 49420 + }, + { + "epoch": 4.67998485135391, + "grad_norm": 0.5547208786010742, + "learning_rate": 1.2800605945843592e-06, + "loss": 0.0953, + "step": 49430 + }, + { + "epoch": 4.68093164173452, + "grad_norm": 0.6340035796165466, + "learning_rate": 1.2762734330619203e-06, + "loss": 0.088, + "step": 49440 + }, + { + "epoch": 4.68187843211513, + "grad_norm": 0.4992550313472748, + "learning_rate": 1.2724862715394812e-06, + "loss": 0.0956, + "step": 49450 + }, + { + "epoch": 4.682825222495739, + "grad_norm": 0.5360420346260071, + "learning_rate": 1.2686991100170423e-06, + "loss": 0.0925, + "step": 49460 + }, + { + "epoch": 4.683772012876349, + "grad_norm": 0.5174246430397034, + "learning_rate": 1.2649119484946034e-06, + "loss": 0.0936, + "step": 49470 + }, + { + "epoch": 4.684718803256959, + "grad_norm": 0.514578104019165, + "learning_rate": 1.2611247869721643e-06, + "loss": 0.0926, + "step": 49480 + }, + { + "epoch": 4.685665593637569, + "grad_norm": 0.41404736042022705, + "learning_rate": 1.2573376254497254e-06, + "loss": 0.0923, + "step": 49490 + }, + { + "epoch": 4.686612384018178, + "grad_norm": 0.648402214050293, + "learning_rate": 1.2535504639272867e-06, + "loss": 0.0917, + "step": 49500 + }, + { + "epoch": 4.687559174398788, + "grad_norm": 0.45198550820350647, + "learning_rate": 1.2497633024048476e-06, + "loss": 0.0917, + "step": 49510 + }, + { + "epoch": 4.688505964779398, + "grad_norm": 0.5977778434753418, + "learning_rate": 1.2459761408824087e-06, + "loss": 0.0857, + "step": 49520 + }, + { + "epoch": 4.689452755160008, + "grad_norm": 0.5270425081253052, + "learning_rate": 1.2421889793599698e-06, + "loss": 0.0884, + "step": 49530 + }, + { + "epoch": 4.690399545540617, + "grad_norm": 0.5529866218566895, + "learning_rate": 1.238401817837531e-06, + "loss": 0.0892, + "step": 49540 + }, + { + "epoch": 4.691346335921227, + "grad_norm": 0.5758625268936157, + "learning_rate": 1.234614656315092e-06, + "loss": 0.0875, + "step": 49550 + }, + { + "epoch": 4.692293126301837, + "grad_norm": 0.40848255157470703, + "learning_rate": 1.230827494792653e-06, + "loss": 0.0908, + "step": 49560 + }, + { + "epoch": 4.6932399166824466, + "grad_norm": 0.7464273571968079, + "learning_rate": 1.227040333270214e-06, + "loss": 0.0883, + "step": 49570 + }, + { + "epoch": 4.694186707063056, + "grad_norm": 0.5910485982894897, + "learning_rate": 1.2232531717477752e-06, + "loss": 0.1056, + "step": 49580 + }, + { + "epoch": 4.695133497443666, + "grad_norm": 0.6291195750236511, + "learning_rate": 1.2194660102253363e-06, + "loss": 0.0964, + "step": 49590 + }, + { + "epoch": 4.696080287824276, + "grad_norm": 0.5667415857315063, + "learning_rate": 1.2156788487028974e-06, + "loss": 0.0881, + "step": 49600 + }, + { + "epoch": 4.6970270782048855, + "grad_norm": 0.5887658596038818, + "learning_rate": 1.2118916871804585e-06, + "loss": 0.1005, + "step": 49610 + }, + { + "epoch": 4.697973868585495, + "grad_norm": 0.6646347045898438, + "learning_rate": 1.2081045256580194e-06, + "loss": 0.1049, + "step": 49620 + }, + { + "epoch": 4.698920658966105, + "grad_norm": 0.6309397220611572, + "learning_rate": 1.2043173641355805e-06, + "loss": 0.0966, + "step": 49630 + }, + { + "epoch": 4.699867449346715, + "grad_norm": 0.47189533710479736, + "learning_rate": 1.2005302026131416e-06, + "loss": 0.0959, + "step": 49640 + }, + { + "epoch": 4.7008142397273245, + "grad_norm": 0.550778329372406, + "learning_rate": 1.1967430410907025e-06, + "loss": 0.0953, + "step": 49650 + }, + { + "epoch": 4.701761030107934, + "grad_norm": 0.5566471219062805, + "learning_rate": 1.1929558795682638e-06, + "loss": 0.0952, + "step": 49660 + }, + { + "epoch": 4.702707820488544, + "grad_norm": 0.4658328592777252, + "learning_rate": 1.1891687180458247e-06, + "loss": 0.0936, + "step": 49670 + }, + { + "epoch": 4.703654610869154, + "grad_norm": 0.49258533120155334, + "learning_rate": 1.1853815565233858e-06, + "loss": 0.0836, + "step": 49680 + }, + { + "epoch": 4.7046014012497634, + "grad_norm": 0.4751582741737366, + "learning_rate": 1.181594395000947e-06, + "loss": 0.0952, + "step": 49690 + }, + { + "epoch": 4.705548191630373, + "grad_norm": 0.3998904824256897, + "learning_rate": 1.1778072334785078e-06, + "loss": 0.0842, + "step": 49700 + }, + { + "epoch": 4.706494982010983, + "grad_norm": 0.48084378242492676, + "learning_rate": 1.1740200719560691e-06, + "loss": 0.0868, + "step": 49710 + }, + { + "epoch": 4.707441772391593, + "grad_norm": 0.6053601503372192, + "learning_rate": 1.17023291043363e-06, + "loss": 0.093, + "step": 49720 + }, + { + "epoch": 4.708388562772202, + "grad_norm": 0.6152405142784119, + "learning_rate": 1.1664457489111911e-06, + "loss": 0.0939, + "step": 49730 + }, + { + "epoch": 4.709335353152812, + "grad_norm": 0.5622721910476685, + "learning_rate": 1.1626585873887522e-06, + "loss": 0.0994, + "step": 49740 + }, + { + "epoch": 4.710282143533422, + "grad_norm": 0.4358634650707245, + "learning_rate": 1.1588714258663134e-06, + "loss": 0.091, + "step": 49750 + }, + { + "epoch": 4.711228933914032, + "grad_norm": 0.6170639395713806, + "learning_rate": 1.1550842643438743e-06, + "loss": 0.0965, + "step": 49760 + }, + { + "epoch": 4.712175724294641, + "grad_norm": 0.49188241362571716, + "learning_rate": 1.1512971028214354e-06, + "loss": 0.0889, + "step": 49770 + }, + { + "epoch": 4.713122514675251, + "grad_norm": 0.5877458453178406, + "learning_rate": 1.1475099412989965e-06, + "loss": 0.0862, + "step": 49780 + }, + { + "epoch": 4.714069305055861, + "grad_norm": 0.6240220665931702, + "learning_rate": 1.1437227797765576e-06, + "loss": 0.1044, + "step": 49790 + }, + { + "epoch": 4.715016095436471, + "grad_norm": 0.5297105312347412, + "learning_rate": 1.1399356182541187e-06, + "loss": 0.0975, + "step": 49800 + }, + { + "epoch": 4.71596288581708, + "grad_norm": 0.5716924071311951, + "learning_rate": 1.1361484567316796e-06, + "loss": 0.0828, + "step": 49810 + }, + { + "epoch": 4.71690967619769, + "grad_norm": 0.516631543636322, + "learning_rate": 1.132361295209241e-06, + "loss": 0.1032, + "step": 49820 + }, + { + "epoch": 4.7178564665783, + "grad_norm": 0.5588654279708862, + "learning_rate": 1.1285741336868018e-06, + "loss": 0.0918, + "step": 49830 + }, + { + "epoch": 4.71880325695891, + "grad_norm": 0.6855382919311523, + "learning_rate": 1.124786972164363e-06, + "loss": 0.0926, + "step": 49840 + }, + { + "epoch": 4.719750047339519, + "grad_norm": 0.5521231293678284, + "learning_rate": 1.120999810641924e-06, + "loss": 0.0953, + "step": 49850 + }, + { + "epoch": 4.720696837720129, + "grad_norm": 0.4538123905658722, + "learning_rate": 1.117212649119485e-06, + "loss": 0.086, + "step": 49860 + }, + { + "epoch": 4.721643628100739, + "grad_norm": 0.6084619760513306, + "learning_rate": 1.1134254875970462e-06, + "loss": 0.0997, + "step": 49870 + }, + { + "epoch": 4.7225904184813485, + "grad_norm": 0.5663161277770996, + "learning_rate": 1.1096383260746071e-06, + "loss": 0.0975, + "step": 49880 + }, + { + "epoch": 4.723537208861958, + "grad_norm": 0.6240411996841431, + "learning_rate": 1.1058511645521682e-06, + "loss": 0.0902, + "step": 49890 + }, + { + "epoch": 4.724483999242568, + "grad_norm": 0.5772308111190796, + "learning_rate": 1.1020640030297293e-06, + "loss": 0.0963, + "step": 49900 + }, + { + "epoch": 4.725430789623178, + "grad_norm": 0.5128609538078308, + "learning_rate": 1.0982768415072905e-06, + "loss": 0.093, + "step": 49910 + }, + { + "epoch": 4.7263775800037875, + "grad_norm": 0.5880887508392334, + "learning_rate": 1.0944896799848514e-06, + "loss": 0.099, + "step": 49920 + }, + { + "epoch": 4.727324370384397, + "grad_norm": 0.5767456293106079, + "learning_rate": 1.0907025184624125e-06, + "loss": 0.0891, + "step": 49930 + }, + { + "epoch": 4.728271160765007, + "grad_norm": 0.49970582127571106, + "learning_rate": 1.0869153569399736e-06, + "loss": 0.1076, + "step": 49940 + }, + { + "epoch": 4.729217951145617, + "grad_norm": 0.5847047567367554, + "learning_rate": 1.0831281954175347e-06, + "loss": 0.0907, + "step": 49950 + }, + { + "epoch": 4.7301647415262265, + "grad_norm": 0.5303926467895508, + "learning_rate": 1.0793410338950958e-06, + "loss": 0.0815, + "step": 49960 + }, + { + "epoch": 4.731111531906836, + "grad_norm": 0.5159422159194946, + "learning_rate": 1.0755538723726567e-06, + "loss": 0.0913, + "step": 49970 + }, + { + "epoch": 4.732058322287446, + "grad_norm": 0.5303027033805847, + "learning_rate": 1.071766710850218e-06, + "loss": 0.1028, + "step": 49980 + }, + { + "epoch": 4.733005112668056, + "grad_norm": 0.5290799140930176, + "learning_rate": 1.067979549327779e-06, + "loss": 0.0884, + "step": 49990 + }, + { + "epoch": 4.7339519030486645, + "grad_norm": 0.5512375831604004, + "learning_rate": 1.06419238780534e-06, + "loss": 0.091, + "step": 50000 + }, + { + "epoch": 4.734898693429274, + "grad_norm": 0.5875411629676819, + "learning_rate": 1.0604052262829011e-06, + "loss": 0.0945, + "step": 50010 + }, + { + "epoch": 4.735845483809884, + "grad_norm": 0.4429018199443817, + "learning_rate": 1.056618064760462e-06, + "loss": 0.0963, + "step": 50020 + }, + { + "epoch": 4.736792274190494, + "grad_norm": 0.46107998490333557, + "learning_rate": 1.0528309032380233e-06, + "loss": 0.0905, + "step": 50030 + }, + { + "epoch": 4.7377390645711035, + "grad_norm": 0.5040816068649292, + "learning_rate": 1.0490437417155842e-06, + "loss": 0.0918, + "step": 50040 + }, + { + "epoch": 4.738685854951713, + "grad_norm": 0.7133190631866455, + "learning_rate": 1.0452565801931453e-06, + "loss": 0.0888, + "step": 50050 + }, + { + "epoch": 4.739632645332323, + "grad_norm": 0.5230016112327576, + "learning_rate": 1.0414694186707064e-06, + "loss": 0.0957, + "step": 50060 + }, + { + "epoch": 4.740579435712933, + "grad_norm": 0.5962569117546082, + "learning_rate": 1.0376822571482673e-06, + "loss": 0.09, + "step": 50070 + }, + { + "epoch": 4.7415262260935425, + "grad_norm": 0.6349098086357117, + "learning_rate": 1.0338950956258284e-06, + "loss": 0.0887, + "step": 50080 + }, + { + "epoch": 4.742473016474152, + "grad_norm": 0.5445008277893066, + "learning_rate": 1.0301079341033896e-06, + "loss": 0.0943, + "step": 50090 + }, + { + "epoch": 4.743419806854762, + "grad_norm": 0.6481450200080872, + "learning_rate": 1.0263207725809507e-06, + "loss": 0.0951, + "step": 50100 + }, + { + "epoch": 4.744366597235372, + "grad_norm": 0.44210201501846313, + "learning_rate": 1.0225336110585118e-06, + "loss": 0.0906, + "step": 50110 + }, + { + "epoch": 4.745313387615981, + "grad_norm": 0.5137997269630432, + "learning_rate": 1.0187464495360729e-06, + "loss": 0.0924, + "step": 50120 + }, + { + "epoch": 4.746260177996591, + "grad_norm": 0.6586678624153137, + "learning_rate": 1.0149592880136338e-06, + "loss": 0.0893, + "step": 50130 + }, + { + "epoch": 4.747206968377201, + "grad_norm": 0.46986591815948486, + "learning_rate": 1.0111721264911949e-06, + "loss": 0.0896, + "step": 50140 + }, + { + "epoch": 4.748153758757811, + "grad_norm": 0.5114980936050415, + "learning_rate": 1.007384964968756e-06, + "loss": 0.0951, + "step": 50150 + }, + { + "epoch": 4.74910054913842, + "grad_norm": 0.6802167296409607, + "learning_rate": 1.003597803446317e-06, + "loss": 0.0996, + "step": 50160 + }, + { + "epoch": 4.75004733951903, + "grad_norm": 0.46343374252319336, + "learning_rate": 9.998106419238782e-07, + "loss": 0.0969, + "step": 50170 + }, + { + "epoch": 4.75099412989964, + "grad_norm": 0.5714529752731323, + "learning_rate": 9.960234804014391e-07, + "loss": 0.0931, + "step": 50180 + }, + { + "epoch": 4.75194092028025, + "grad_norm": 0.6029381155967712, + "learning_rate": 9.922363188790002e-07, + "loss": 0.0856, + "step": 50190 + }, + { + "epoch": 4.752887710660859, + "grad_norm": 0.5086356997489929, + "learning_rate": 9.884491573565613e-07, + "loss": 0.0866, + "step": 50200 + }, + { + "epoch": 4.753834501041469, + "grad_norm": 0.6164889931678772, + "learning_rate": 9.846619958341224e-07, + "loss": 0.0884, + "step": 50210 + }, + { + "epoch": 4.754781291422079, + "grad_norm": 0.5729707479476929, + "learning_rate": 9.808748343116835e-07, + "loss": 0.09, + "step": 50220 + }, + { + "epoch": 4.755728081802689, + "grad_norm": 0.3943701982498169, + "learning_rate": 9.770876727892444e-07, + "loss": 0.0936, + "step": 50230 + }, + { + "epoch": 4.756674872183298, + "grad_norm": 0.4985075891017914, + "learning_rate": 9.733005112668055e-07, + "loss": 0.0933, + "step": 50240 + }, + { + "epoch": 4.757621662563908, + "grad_norm": 0.4944251775741577, + "learning_rate": 9.695133497443667e-07, + "loss": 0.0931, + "step": 50250 + }, + { + "epoch": 4.758568452944518, + "grad_norm": 0.6448076963424683, + "learning_rate": 9.657261882219278e-07, + "loss": 0.095, + "step": 50260 + }, + { + "epoch": 4.7595152433251275, + "grad_norm": 0.4403682053089142, + "learning_rate": 9.619390266994889e-07, + "loss": 0.0919, + "step": 50270 + }, + { + "epoch": 4.760462033705737, + "grad_norm": 0.49853387475013733, + "learning_rate": 9.5815186517705e-07, + "loss": 0.0899, + "step": 50280 + }, + { + "epoch": 4.761408824086347, + "grad_norm": 0.5590994358062744, + "learning_rate": 9.543647036546109e-07, + "loss": 0.0904, + "step": 50290 + }, + { + "epoch": 4.762355614466957, + "grad_norm": 0.555829644203186, + "learning_rate": 9.50577542132172e-07, + "loss": 0.0903, + "step": 50300 + }, + { + "epoch": 4.7633024048475665, + "grad_norm": 0.5076011419296265, + "learning_rate": 9.467903806097331e-07, + "loss": 0.0881, + "step": 50310 + }, + { + "epoch": 4.764249195228176, + "grad_norm": 0.5209220051765442, + "learning_rate": 9.430032190872941e-07, + "loss": 0.0877, + "step": 50320 + }, + { + "epoch": 4.765195985608786, + "grad_norm": 0.591728687286377, + "learning_rate": 9.392160575648553e-07, + "loss": 0.0913, + "step": 50330 + }, + { + "epoch": 4.766142775989396, + "grad_norm": 0.6746892333030701, + "learning_rate": 9.354288960424163e-07, + "loss": 0.1078, + "step": 50340 + }, + { + "epoch": 4.7670895663700055, + "grad_norm": 0.5808528065681458, + "learning_rate": 9.316417345199774e-07, + "loss": 0.0884, + "step": 50350 + }, + { + "epoch": 4.768036356750615, + "grad_norm": 0.49978432059288025, + "learning_rate": 9.278545729975384e-07, + "loss": 0.096, + "step": 50360 + }, + { + "epoch": 4.768983147131225, + "grad_norm": 0.5775094628334045, + "learning_rate": 9.240674114750994e-07, + "loss": 0.083, + "step": 50370 + }, + { + "epoch": 4.769929937511835, + "grad_norm": 0.5577254891395569, + "learning_rate": 9.202802499526605e-07, + "loss": 0.0953, + "step": 50380 + }, + { + "epoch": 4.770876727892444, + "grad_norm": 0.45562031865119934, + "learning_rate": 9.164930884302215e-07, + "loss": 0.0793, + "step": 50390 + }, + { + "epoch": 4.771823518273054, + "grad_norm": 0.594912588596344, + "learning_rate": 9.127059269077827e-07, + "loss": 0.0974, + "step": 50400 + }, + { + "epoch": 4.772770308653664, + "grad_norm": 0.5201529860496521, + "learning_rate": 9.089187653853438e-07, + "loss": 0.082, + "step": 50410 + }, + { + "epoch": 4.773717099034274, + "grad_norm": 0.5733709335327148, + "learning_rate": 9.051316038629049e-07, + "loss": 0.085, + "step": 50420 + }, + { + "epoch": 4.774663889414883, + "grad_norm": 0.5458856225013733, + "learning_rate": 9.013444423404659e-07, + "loss": 0.0922, + "step": 50430 + }, + { + "epoch": 4.775610679795493, + "grad_norm": 0.6832992434501648, + "learning_rate": 8.975572808180269e-07, + "loss": 0.098, + "step": 50440 + }, + { + "epoch": 4.776557470176103, + "grad_norm": 0.5935986638069153, + "learning_rate": 8.937701192955881e-07, + "loss": 0.0934, + "step": 50450 + }, + { + "epoch": 4.777504260556713, + "grad_norm": 0.4659850001335144, + "learning_rate": 8.899829577731491e-07, + "loss": 0.0904, + "step": 50460 + }, + { + "epoch": 4.778451050937322, + "grad_norm": 0.5135877132415771, + "learning_rate": 8.861957962507102e-07, + "loss": 0.0974, + "step": 50470 + }, + { + "epoch": 4.779397841317932, + "grad_norm": 0.5025619268417358, + "learning_rate": 8.824086347282712e-07, + "loss": 0.0916, + "step": 50480 + }, + { + "epoch": 4.780344631698542, + "grad_norm": 0.5468993186950684, + "learning_rate": 8.786214732058324e-07, + "loss": 0.0944, + "step": 50490 + }, + { + "epoch": 4.781291422079152, + "grad_norm": 0.46831241250038147, + "learning_rate": 8.748343116833934e-07, + "loss": 0.0931, + "step": 50500 + }, + { + "epoch": 4.782238212459761, + "grad_norm": 0.5284664630889893, + "learning_rate": 8.710471501609544e-07, + "loss": 0.0966, + "step": 50510 + }, + { + "epoch": 4.783185002840371, + "grad_norm": 0.5607685446739197, + "learning_rate": 8.672599886385155e-07, + "loss": 0.1012, + "step": 50520 + }, + { + "epoch": 4.784131793220981, + "grad_norm": 0.6038400530815125, + "learning_rate": 8.634728271160765e-07, + "loss": 0.0884, + "step": 50530 + }, + { + "epoch": 4.7850785836015906, + "grad_norm": 0.5385847091674805, + "learning_rate": 8.596856655936376e-07, + "loss": 0.1006, + "step": 50540 + }, + { + "epoch": 4.7860253739822, + "grad_norm": 0.5663634538650513, + "learning_rate": 8.558985040711986e-07, + "loss": 0.0893, + "step": 50550 + }, + { + "epoch": 4.78697216436281, + "grad_norm": 0.5027875900268555, + "learning_rate": 8.521113425487598e-07, + "loss": 0.0979, + "step": 50560 + }, + { + "epoch": 4.78791895474342, + "grad_norm": 0.5991628170013428, + "learning_rate": 8.483241810263208e-07, + "loss": 0.0864, + "step": 50570 + }, + { + "epoch": 4.7888657451240295, + "grad_norm": 0.5896027684211731, + "learning_rate": 8.44537019503882e-07, + "loss": 0.0851, + "step": 50580 + }, + { + "epoch": 4.789812535504639, + "grad_norm": 0.5217905640602112, + "learning_rate": 8.40749857981443e-07, + "loss": 0.0859, + "step": 50590 + }, + { + "epoch": 4.790759325885249, + "grad_norm": 0.5823928117752075, + "learning_rate": 8.36962696459004e-07, + "loss": 0.1002, + "step": 50600 + }, + { + "epoch": 4.791706116265859, + "grad_norm": 0.49396812915802, + "learning_rate": 8.331755349365652e-07, + "loss": 0.0977, + "step": 50610 + }, + { + "epoch": 4.7926529066464685, + "grad_norm": 0.6325224041938782, + "learning_rate": 8.293883734141262e-07, + "loss": 0.0994, + "step": 50620 + }, + { + "epoch": 4.793599697027078, + "grad_norm": 0.3896678388118744, + "learning_rate": 8.256012118916873e-07, + "loss": 0.0823, + "step": 50630 + }, + { + "epoch": 4.794546487407688, + "grad_norm": 0.5307965278625488, + "learning_rate": 8.218140503692483e-07, + "loss": 0.0837, + "step": 50640 + }, + { + "epoch": 4.795493277788298, + "grad_norm": 0.47504711151123047, + "learning_rate": 8.180268888468094e-07, + "loss": 0.0868, + "step": 50650 + }, + { + "epoch": 4.7964400681689074, + "grad_norm": 0.432176411151886, + "learning_rate": 8.142397273243705e-07, + "loss": 0.0866, + "step": 50660 + }, + { + "epoch": 4.797386858549517, + "grad_norm": 0.5039948225021362, + "learning_rate": 8.104525658019315e-07, + "loss": 0.1057, + "step": 50670 + }, + { + "epoch": 4.798333648930127, + "grad_norm": 0.6106250882148743, + "learning_rate": 8.066654042794926e-07, + "loss": 0.0948, + "step": 50680 + }, + { + "epoch": 4.799280439310737, + "grad_norm": 0.6046525835990906, + "learning_rate": 8.028782427570536e-07, + "loss": 0.0845, + "step": 50690 + }, + { + "epoch": 4.800227229691346, + "grad_norm": 0.6091911792755127, + "learning_rate": 7.990910812346147e-07, + "loss": 0.0823, + "step": 50700 + }, + { + "epoch": 4.801174020071956, + "grad_norm": 0.4663586914539337, + "learning_rate": 7.953039197121757e-07, + "loss": 0.0902, + "step": 50710 + }, + { + "epoch": 4.802120810452566, + "grad_norm": 0.6265435814857483, + "learning_rate": 7.915167581897369e-07, + "loss": 0.1029, + "step": 50720 + }, + { + "epoch": 4.803067600833176, + "grad_norm": 0.5090877413749695, + "learning_rate": 7.877295966672979e-07, + "loss": 0.0911, + "step": 50730 + }, + { + "epoch": 4.804014391213785, + "grad_norm": 0.7234902381896973, + "learning_rate": 7.83942435144859e-07, + "loss": 0.0988, + "step": 50740 + }, + { + "epoch": 4.804961181594395, + "grad_norm": 0.46384572982788086, + "learning_rate": 7.801552736224201e-07, + "loss": 0.0962, + "step": 50750 + }, + { + "epoch": 4.805907971975005, + "grad_norm": 0.6380556225776672, + "learning_rate": 7.763681120999811e-07, + "loss": 0.0934, + "step": 50760 + }, + { + "epoch": 4.806854762355615, + "grad_norm": 0.6317399144172668, + "learning_rate": 7.725809505775423e-07, + "loss": 0.0901, + "step": 50770 + }, + { + "epoch": 4.807801552736224, + "grad_norm": 0.6209624409675598, + "learning_rate": 7.687937890551033e-07, + "loss": 0.0879, + "step": 50780 + }, + { + "epoch": 4.808748343116834, + "grad_norm": 0.45979273319244385, + "learning_rate": 7.650066275326644e-07, + "loss": 0.0969, + "step": 50790 + }, + { + "epoch": 4.809695133497444, + "grad_norm": 0.5645127892494202, + "learning_rate": 7.612194660102254e-07, + "loss": 0.0925, + "step": 50800 + }, + { + "epoch": 4.810641923878054, + "grad_norm": 0.5529855489730835, + "learning_rate": 7.574323044877864e-07, + "loss": 0.0911, + "step": 50810 + }, + { + "epoch": 4.811588714258663, + "grad_norm": 0.6504641175270081, + "learning_rate": 7.536451429653475e-07, + "loss": 0.0852, + "step": 50820 + }, + { + "epoch": 4.812535504639273, + "grad_norm": 0.6094434261322021, + "learning_rate": 7.498579814429086e-07, + "loss": 0.0921, + "step": 50830 + }, + { + "epoch": 4.813482295019883, + "grad_norm": 0.6175083518028259, + "learning_rate": 7.460708199204697e-07, + "loss": 0.0906, + "step": 50840 + }, + { + "epoch": 4.8144290854004925, + "grad_norm": 0.4623327851295471, + "learning_rate": 7.422836583980307e-07, + "loss": 0.0888, + "step": 50850 + }, + { + "epoch": 4.815375875781102, + "grad_norm": 0.47847726941108704, + "learning_rate": 7.384964968755918e-07, + "loss": 0.0926, + "step": 50860 + }, + { + "epoch": 4.816322666161712, + "grad_norm": 0.49766966700553894, + "learning_rate": 7.347093353531528e-07, + "loss": 0.0964, + "step": 50870 + }, + { + "epoch": 4.817269456542322, + "grad_norm": 0.6062334179878235, + "learning_rate": 7.309221738307138e-07, + "loss": 0.0953, + "step": 50880 + }, + { + "epoch": 4.8182162469229315, + "grad_norm": 0.5141642093658447, + "learning_rate": 7.27135012308275e-07, + "loss": 0.095, + "step": 50890 + }, + { + "epoch": 4.819163037303541, + "grad_norm": 0.5506232380867004, + "learning_rate": 7.23347850785836e-07, + "loss": 0.0992, + "step": 50900 + }, + { + "epoch": 4.820109827684151, + "grad_norm": 0.4917549788951874, + "learning_rate": 7.195606892633972e-07, + "loss": 0.0851, + "step": 50910 + }, + { + "epoch": 4.821056618064761, + "grad_norm": 0.5677406787872314, + "learning_rate": 7.157735277409582e-07, + "loss": 0.0859, + "step": 50920 + }, + { + "epoch": 4.8220034084453705, + "grad_norm": 0.45178139209747314, + "learning_rate": 7.119863662185194e-07, + "loss": 0.097, + "step": 50930 + }, + { + "epoch": 4.82295019882598, + "grad_norm": 0.5761244297027588, + "learning_rate": 7.081992046960804e-07, + "loss": 0.0885, + "step": 50940 + }, + { + "epoch": 4.82389698920659, + "grad_norm": 0.4566514790058136, + "learning_rate": 7.044120431736415e-07, + "loss": 0.1038, + "step": 50950 + }, + { + "epoch": 4.8248437795872, + "grad_norm": 0.4556839168071747, + "learning_rate": 7.006248816512025e-07, + "loss": 0.085, + "step": 50960 + }, + { + "epoch": 4.825790569967809, + "grad_norm": 0.45772460103034973, + "learning_rate": 6.968377201287635e-07, + "loss": 0.0882, + "step": 50970 + }, + { + "epoch": 4.826737360348419, + "grad_norm": 0.7039437890052795, + "learning_rate": 6.930505586063246e-07, + "loss": 0.09, + "step": 50980 + }, + { + "epoch": 4.827684150729029, + "grad_norm": 0.44567543268203735, + "learning_rate": 6.892633970838856e-07, + "loss": 0.0843, + "step": 50990 + }, + { + "epoch": 4.828630941109639, + "grad_norm": 0.6256598830223083, + "learning_rate": 6.854762355614468e-07, + "loss": 0.0886, + "step": 51000 + }, + { + "epoch": 4.829577731490248, + "grad_norm": 0.6110970377922058, + "learning_rate": 6.816890740390078e-07, + "loss": 0.0879, + "step": 51010 + }, + { + "epoch": 4.830524521870858, + "grad_norm": 0.594294548034668, + "learning_rate": 6.779019125165689e-07, + "loss": 0.0857, + "step": 51020 + }, + { + "epoch": 4.831471312251468, + "grad_norm": 0.5996149778366089, + "learning_rate": 6.741147509941299e-07, + "loss": 0.0977, + "step": 51030 + }, + { + "epoch": 4.832418102632078, + "grad_norm": 0.46598851680755615, + "learning_rate": 6.703275894716909e-07, + "loss": 0.0779, + "step": 51040 + }, + { + "epoch": 4.833364893012687, + "grad_norm": 0.5769855380058289, + "learning_rate": 6.665404279492521e-07, + "loss": 0.0891, + "step": 51050 + }, + { + "epoch": 4.834311683393297, + "grad_norm": 0.4835968017578125, + "learning_rate": 6.627532664268131e-07, + "loss": 0.1008, + "step": 51060 + }, + { + "epoch": 4.835258473773907, + "grad_norm": 0.5602681040763855, + "learning_rate": 6.589661049043743e-07, + "loss": 0.0915, + "step": 51070 + }, + { + "epoch": 4.836205264154517, + "grad_norm": 0.47379618883132935, + "learning_rate": 6.551789433819353e-07, + "loss": 0.0871, + "step": 51080 + }, + { + "epoch": 4.837152054535126, + "grad_norm": 0.6135598421096802, + "learning_rate": 6.513917818594965e-07, + "loss": 0.0923, + "step": 51090 + }, + { + "epoch": 4.838098844915736, + "grad_norm": 0.7021900415420532, + "learning_rate": 6.476046203370575e-07, + "loss": 0.0874, + "step": 51100 + }, + { + "epoch": 4.839045635296346, + "grad_norm": 0.561215341091156, + "learning_rate": 6.438174588146185e-07, + "loss": 0.0867, + "step": 51110 + }, + { + "epoch": 4.8399924256769555, + "grad_norm": 0.49121299386024475, + "learning_rate": 6.400302972921796e-07, + "loss": 0.0854, + "step": 51120 + }, + { + "epoch": 4.840939216057565, + "grad_norm": 0.5570336580276489, + "learning_rate": 6.362431357697406e-07, + "loss": 0.1007, + "step": 51130 + }, + { + "epoch": 4.841886006438175, + "grad_norm": 0.8793237805366516, + "learning_rate": 6.324559742473017e-07, + "loss": 0.0888, + "step": 51140 + }, + { + "epoch": 4.842832796818785, + "grad_norm": 0.47588610649108887, + "learning_rate": 6.286688127248627e-07, + "loss": 0.0858, + "step": 51150 + }, + { + "epoch": 4.8437795871993945, + "grad_norm": 0.44155871868133545, + "learning_rate": 6.248816512024238e-07, + "loss": 0.084, + "step": 51160 + }, + { + "epoch": 4.844726377580004, + "grad_norm": 0.5633000731468201, + "learning_rate": 6.210944896799849e-07, + "loss": 0.1027, + "step": 51170 + }, + { + "epoch": 4.845673167960614, + "grad_norm": 0.44512370228767395, + "learning_rate": 6.17307328157546e-07, + "loss": 0.0938, + "step": 51180 + }, + { + "epoch": 4.846619958341224, + "grad_norm": 0.5156192183494568, + "learning_rate": 6.13520166635107e-07, + "loss": 0.0919, + "step": 51190 + }, + { + "epoch": 4.8475667487218335, + "grad_norm": 0.6136853098869324, + "learning_rate": 6.097330051126681e-07, + "loss": 0.0896, + "step": 51200 + }, + { + "epoch": 4.848513539102443, + "grad_norm": 0.6080178618431091, + "learning_rate": 6.059458435902292e-07, + "loss": 0.0972, + "step": 51210 + }, + { + "epoch": 4.849460329483053, + "grad_norm": 0.5267219543457031, + "learning_rate": 6.021586820677902e-07, + "loss": 0.0873, + "step": 51220 + }, + { + "epoch": 4.850407119863662, + "grad_norm": 0.5620561838150024, + "learning_rate": 5.983715205453512e-07, + "loss": 0.0993, + "step": 51230 + }, + { + "epoch": 4.8513539102442715, + "grad_norm": 0.5927213430404663, + "learning_rate": 5.945843590229124e-07, + "loss": 0.0996, + "step": 51240 + }, + { + "epoch": 4.852300700624881, + "grad_norm": 0.5121906399726868, + "learning_rate": 5.907971975004735e-07, + "loss": 0.0854, + "step": 51250 + }, + { + "epoch": 4.853247491005491, + "grad_norm": 0.546197772026062, + "learning_rate": 5.870100359780346e-07, + "loss": 0.0863, + "step": 51260 + }, + { + "epoch": 4.854194281386101, + "grad_norm": 0.5386756658554077, + "learning_rate": 5.832228744555956e-07, + "loss": 0.0932, + "step": 51270 + }, + { + "epoch": 4.8551410717667105, + "grad_norm": 0.5520429611206055, + "learning_rate": 5.794357129331567e-07, + "loss": 0.098, + "step": 51280 + }, + { + "epoch": 4.85608786214732, + "grad_norm": 0.5074911713600159, + "learning_rate": 5.756485514107177e-07, + "loss": 0.0906, + "step": 51290 + }, + { + "epoch": 4.85703465252793, + "grad_norm": 0.6046466827392578, + "learning_rate": 5.718613898882788e-07, + "loss": 0.0918, + "step": 51300 + }, + { + "epoch": 4.85798144290854, + "grad_norm": 0.5096051096916199, + "learning_rate": 5.680742283658398e-07, + "loss": 0.0943, + "step": 51310 + }, + { + "epoch": 4.8589282332891495, + "grad_norm": 0.5715464353561401, + "learning_rate": 5.642870668434009e-07, + "loss": 0.093, + "step": 51320 + }, + { + "epoch": 4.859875023669759, + "grad_norm": 0.540324330329895, + "learning_rate": 5.60499905320962e-07, + "loss": 0.0941, + "step": 51330 + }, + { + "epoch": 4.860821814050369, + "grad_norm": 0.5595934391021729, + "learning_rate": 5.567127437985231e-07, + "loss": 0.0846, + "step": 51340 + }, + { + "epoch": 4.861768604430979, + "grad_norm": 0.5944637060165405, + "learning_rate": 5.529255822760841e-07, + "loss": 0.0862, + "step": 51350 + }, + { + "epoch": 4.862715394811588, + "grad_norm": 0.5972811579704285, + "learning_rate": 5.491384207536452e-07, + "loss": 0.0895, + "step": 51360 + }, + { + "epoch": 4.863662185192198, + "grad_norm": 0.6000321507453918, + "learning_rate": 5.453512592312062e-07, + "loss": 0.0938, + "step": 51370 + }, + { + "epoch": 4.864608975572808, + "grad_norm": 0.5122269988059998, + "learning_rate": 5.415640977087673e-07, + "loss": 0.0908, + "step": 51380 + }, + { + "epoch": 4.865555765953418, + "grad_norm": 0.534418523311615, + "learning_rate": 5.377769361863283e-07, + "loss": 0.0884, + "step": 51390 + }, + { + "epoch": 4.866502556334027, + "grad_norm": 0.5945922136306763, + "learning_rate": 5.339897746638894e-07, + "loss": 0.0935, + "step": 51400 + }, + { + "epoch": 4.867449346714637, + "grad_norm": 0.6264474391937256, + "learning_rate": 5.302026131414506e-07, + "loss": 0.0939, + "step": 51410 + }, + { + "epoch": 4.868396137095247, + "grad_norm": 0.5093408823013306, + "learning_rate": 5.264154516190117e-07, + "loss": 0.083, + "step": 51420 + }, + { + "epoch": 4.869342927475857, + "grad_norm": 0.48322850465774536, + "learning_rate": 5.226282900965727e-07, + "loss": 0.0948, + "step": 51430 + }, + { + "epoch": 4.870289717856466, + "grad_norm": 0.5509017109870911, + "learning_rate": 5.188411285741337e-07, + "loss": 0.0864, + "step": 51440 + }, + { + "epoch": 4.871236508237076, + "grad_norm": 0.6064262986183167, + "learning_rate": 5.150539670516948e-07, + "loss": 0.0866, + "step": 51450 + }, + { + "epoch": 4.872183298617686, + "grad_norm": 0.5785761475563049, + "learning_rate": 5.112668055292559e-07, + "loss": 0.0941, + "step": 51460 + }, + { + "epoch": 4.873130088998296, + "grad_norm": 0.6652457118034363, + "learning_rate": 5.074796440068169e-07, + "loss": 0.0828, + "step": 51470 + }, + { + "epoch": 4.874076879378905, + "grad_norm": 0.5285108089447021, + "learning_rate": 5.03692482484378e-07, + "loss": 0.0979, + "step": 51480 + }, + { + "epoch": 4.875023669759515, + "grad_norm": 0.4987425208091736, + "learning_rate": 4.999053209619391e-07, + "loss": 0.1033, + "step": 51490 + }, + { + "epoch": 4.875970460140125, + "grad_norm": 0.6044142842292786, + "learning_rate": 4.961181594395001e-07, + "loss": 0.0953, + "step": 51500 + }, + { + "epoch": 4.8769172505207345, + "grad_norm": 0.5120283961296082, + "learning_rate": 4.923309979170612e-07, + "loss": 0.0954, + "step": 51510 + }, + { + "epoch": 4.877864040901344, + "grad_norm": 0.4522251784801483, + "learning_rate": 4.885438363946222e-07, + "loss": 0.0936, + "step": 51520 + }, + { + "epoch": 4.878810831281954, + "grad_norm": 0.5160796642303467, + "learning_rate": 4.847566748721833e-07, + "loss": 0.0902, + "step": 51530 + }, + { + "epoch": 4.879757621662564, + "grad_norm": 0.4830280840396881, + "learning_rate": 4.809695133497444e-07, + "loss": 0.0887, + "step": 51540 + }, + { + "epoch": 4.8807044120431735, + "grad_norm": 0.5954660177230835, + "learning_rate": 4.771823518273054e-07, + "loss": 0.0994, + "step": 51550 + }, + { + "epoch": 4.881651202423783, + "grad_norm": 0.5464210510253906, + "learning_rate": 4.7339519030486655e-07, + "loss": 0.0893, + "step": 51560 + }, + { + "epoch": 4.882597992804393, + "grad_norm": 0.7138330936431885, + "learning_rate": 4.6960802878242765e-07, + "loss": 0.0908, + "step": 51570 + }, + { + "epoch": 4.883544783185003, + "grad_norm": 0.5895309448242188, + "learning_rate": 4.658208672599887e-07, + "loss": 0.0942, + "step": 51580 + }, + { + "epoch": 4.8844915735656125, + "grad_norm": 0.6211789846420288, + "learning_rate": 4.620337057375497e-07, + "loss": 0.0981, + "step": 51590 + }, + { + "epoch": 4.885438363946222, + "grad_norm": 0.4954429864883423, + "learning_rate": 4.5824654421511077e-07, + "loss": 0.1031, + "step": 51600 + }, + { + "epoch": 4.886385154326832, + "grad_norm": 0.550929605960846, + "learning_rate": 4.544593826926719e-07, + "loss": 0.0878, + "step": 51610 + }, + { + "epoch": 4.887331944707442, + "grad_norm": 0.5120697617530823, + "learning_rate": 4.5067222117023293e-07, + "loss": 0.0861, + "step": 51620 + }, + { + "epoch": 4.888278735088051, + "grad_norm": 0.5993196368217468, + "learning_rate": 4.4688505964779404e-07, + "loss": 0.0898, + "step": 51630 + }, + { + "epoch": 4.889225525468661, + "grad_norm": 0.627377450466156, + "learning_rate": 4.430978981253551e-07, + "loss": 0.0916, + "step": 51640 + }, + { + "epoch": 4.890172315849271, + "grad_norm": 0.4231056272983551, + "learning_rate": 4.393107366029162e-07, + "loss": 0.0837, + "step": 51650 + }, + { + "epoch": 4.891119106229881, + "grad_norm": 0.4605785310268402, + "learning_rate": 4.355235750804772e-07, + "loss": 0.1033, + "step": 51660 + }, + { + "epoch": 4.89206589661049, + "grad_norm": 0.57819002866745, + "learning_rate": 4.3173641355803826e-07, + "loss": 0.0947, + "step": 51670 + }, + { + "epoch": 4.8930126869911, + "grad_norm": 0.48372822999954224, + "learning_rate": 4.279492520355993e-07, + "loss": 0.104, + "step": 51680 + }, + { + "epoch": 4.89395947737171, + "grad_norm": 0.5683338642120361, + "learning_rate": 4.241620905131604e-07, + "loss": 0.1037, + "step": 51690 + }, + { + "epoch": 4.89490626775232, + "grad_norm": 0.5659564733505249, + "learning_rate": 4.203749289907215e-07, + "loss": 0.0902, + "step": 51700 + }, + { + "epoch": 4.895853058132929, + "grad_norm": 0.5283416509628296, + "learning_rate": 4.165877674682826e-07, + "loss": 0.0962, + "step": 51710 + }, + { + "epoch": 4.896799848513539, + "grad_norm": 0.5911533236503601, + "learning_rate": 4.1280060594584364e-07, + "loss": 0.0885, + "step": 51720 + }, + { + "epoch": 4.897746638894149, + "grad_norm": 0.5418230891227722, + "learning_rate": 4.090134444234047e-07, + "loss": 0.092, + "step": 51730 + }, + { + "epoch": 4.898693429274759, + "grad_norm": 0.6199533939361572, + "learning_rate": 4.0522628290096575e-07, + "loss": 0.0958, + "step": 51740 + }, + { + "epoch": 4.899640219655368, + "grad_norm": 0.4867435097694397, + "learning_rate": 4.014391213785268e-07, + "loss": 0.1006, + "step": 51750 + }, + { + "epoch": 4.900587010035978, + "grad_norm": 0.6489893198013306, + "learning_rate": 3.9765195985608786e-07, + "loss": 0.0924, + "step": 51760 + }, + { + "epoch": 4.901533800416588, + "grad_norm": 0.5418457388877869, + "learning_rate": 3.9386479833364897e-07, + "loss": 0.0877, + "step": 51770 + }, + { + "epoch": 4.902480590797198, + "grad_norm": 0.4436706006526947, + "learning_rate": 3.9007763681121003e-07, + "loss": 0.0911, + "step": 51780 + }, + { + "epoch": 4.903427381177807, + "grad_norm": 0.6262994408607483, + "learning_rate": 3.8629047528877114e-07, + "loss": 0.0925, + "step": 51790 + }, + { + "epoch": 4.904374171558417, + "grad_norm": 0.5168532729148865, + "learning_rate": 3.825033137663322e-07, + "loss": 0.0854, + "step": 51800 + }, + { + "epoch": 4.905320961939027, + "grad_norm": 0.578430712223053, + "learning_rate": 3.787161522438932e-07, + "loss": 0.0937, + "step": 51810 + }, + { + "epoch": 4.9062677523196365, + "grad_norm": 0.49829116463661194, + "learning_rate": 3.749289907214543e-07, + "loss": 0.0913, + "step": 51820 + }, + { + "epoch": 4.907214542700246, + "grad_norm": 0.6826627254486084, + "learning_rate": 3.7114182919901536e-07, + "loss": 0.0982, + "step": 51830 + }, + { + "epoch": 4.908161333080856, + "grad_norm": 0.670319139957428, + "learning_rate": 3.673546676765764e-07, + "loss": 0.0905, + "step": 51840 + }, + { + "epoch": 4.909108123461466, + "grad_norm": 0.5203000903129578, + "learning_rate": 3.635675061541375e-07, + "loss": 0.0947, + "step": 51850 + }, + { + "epoch": 4.9100549138420755, + "grad_norm": 0.6277847290039062, + "learning_rate": 3.597803446316986e-07, + "loss": 0.09, + "step": 51860 + }, + { + "epoch": 4.911001704222685, + "grad_norm": 0.4402257204055786, + "learning_rate": 3.559931831092597e-07, + "loss": 0.0889, + "step": 51870 + }, + { + "epoch": 4.911948494603295, + "grad_norm": 0.5525423884391785, + "learning_rate": 3.5220602158682074e-07, + "loss": 0.0873, + "step": 51880 + }, + { + "epoch": 4.912895284983905, + "grad_norm": 0.5541327595710754, + "learning_rate": 3.4841886006438174e-07, + "loss": 0.0901, + "step": 51890 + }, + { + "epoch": 4.9138420753645145, + "grad_norm": 0.7090700268745422, + "learning_rate": 3.446316985419428e-07, + "loss": 0.099, + "step": 51900 + }, + { + "epoch": 4.914788865745124, + "grad_norm": 0.45772573351860046, + "learning_rate": 3.408445370195039e-07, + "loss": 0.0945, + "step": 51910 + }, + { + "epoch": 4.915735656125734, + "grad_norm": 0.5060674548149109, + "learning_rate": 3.3705737549706496e-07, + "loss": 0.0889, + "step": 51920 + }, + { + "epoch": 4.916682446506344, + "grad_norm": 0.5407578349113464, + "learning_rate": 3.3327021397462607e-07, + "loss": 0.0935, + "step": 51930 + }, + { + "epoch": 4.917629236886953, + "grad_norm": 0.5680246353149414, + "learning_rate": 3.294830524521871e-07, + "loss": 0.0903, + "step": 51940 + }, + { + "epoch": 4.918576027267563, + "grad_norm": 0.4451833963394165, + "learning_rate": 3.2569589092974823e-07, + "loss": 0.0966, + "step": 51950 + }, + { + "epoch": 4.919522817648173, + "grad_norm": 0.5037840008735657, + "learning_rate": 3.2190872940730924e-07, + "loss": 0.0854, + "step": 51960 + }, + { + "epoch": 4.920469608028783, + "grad_norm": 0.614662766456604, + "learning_rate": 3.181215678848703e-07, + "loss": 0.093, + "step": 51970 + }, + { + "epoch": 4.921416398409392, + "grad_norm": 0.5942671895027161, + "learning_rate": 3.1433440636243135e-07, + "loss": 0.0971, + "step": 51980 + }, + { + "epoch": 4.922363188790002, + "grad_norm": 0.5831498503684998, + "learning_rate": 3.1054724483999246e-07, + "loss": 0.0887, + "step": 51990 + }, + { + "epoch": 4.923309979170612, + "grad_norm": 0.5798229575157166, + "learning_rate": 3.067600833175535e-07, + "loss": 0.0842, + "step": 52000 + }, + { + "epoch": 4.924256769551222, + "grad_norm": 0.44426295161247253, + "learning_rate": 3.029729217951146e-07, + "loss": 0.0889, + "step": 52010 + }, + { + "epoch": 4.925203559931831, + "grad_norm": 0.7449190616607666, + "learning_rate": 2.991857602726756e-07, + "loss": 0.0945, + "step": 52020 + }, + { + "epoch": 4.926150350312441, + "grad_norm": 0.5424138307571411, + "learning_rate": 2.9539859875023673e-07, + "loss": 0.0899, + "step": 52030 + }, + { + "epoch": 4.927097140693051, + "grad_norm": 0.5666702389717102, + "learning_rate": 2.916114372277978e-07, + "loss": 0.0969, + "step": 52040 + }, + { + "epoch": 4.928043931073661, + "grad_norm": 0.5209141373634338, + "learning_rate": 2.8782427570535884e-07, + "loss": 0.1057, + "step": 52050 + }, + { + "epoch": 4.92899072145427, + "grad_norm": 0.5510291457176208, + "learning_rate": 2.840371141829199e-07, + "loss": 0.0918, + "step": 52060 + }, + { + "epoch": 4.92993751183488, + "grad_norm": 0.46501290798187256, + "learning_rate": 2.80249952660481e-07, + "loss": 0.0858, + "step": 52070 + }, + { + "epoch": 4.93088430221549, + "grad_norm": 0.4720863103866577, + "learning_rate": 2.7646279113804206e-07, + "loss": 0.0912, + "step": 52080 + }, + { + "epoch": 4.9318310925960995, + "grad_norm": 0.48449814319610596, + "learning_rate": 2.726756296156031e-07, + "loss": 0.0955, + "step": 52090 + }, + { + "epoch": 4.932777882976709, + "grad_norm": 0.4848043918609619, + "learning_rate": 2.6888846809316417e-07, + "loss": 0.0839, + "step": 52100 + }, + { + "epoch": 4.933724673357319, + "grad_norm": 0.5846676826477051, + "learning_rate": 2.651013065707253e-07, + "loss": 0.1018, + "step": 52110 + }, + { + "epoch": 4.934671463737929, + "grad_norm": 0.5330713391304016, + "learning_rate": 2.6131414504828633e-07, + "loss": 0.0993, + "step": 52120 + }, + { + "epoch": 4.9356182541185385, + "grad_norm": 0.6754573583602905, + "learning_rate": 2.575269835258474e-07, + "loss": 0.0799, + "step": 52130 + }, + { + "epoch": 4.936565044499148, + "grad_norm": 0.6761156916618347, + "learning_rate": 2.5373982200340844e-07, + "loss": 0.091, + "step": 52140 + }, + { + "epoch": 4.937511834879758, + "grad_norm": 0.5080304741859436, + "learning_rate": 2.4995266048096955e-07, + "loss": 0.0866, + "step": 52150 + }, + { + "epoch": 4.938458625260368, + "grad_norm": 0.6686791777610779, + "learning_rate": 2.461654989585306e-07, + "loss": 0.0964, + "step": 52160 + }, + { + "epoch": 4.9394054156409775, + "grad_norm": 0.7539730668067932, + "learning_rate": 2.4237833743609166e-07, + "loss": 0.1112, + "step": 52170 + }, + { + "epoch": 4.940352206021587, + "grad_norm": 0.5886653661727905, + "learning_rate": 2.385911759136527e-07, + "loss": 0.0957, + "step": 52180 + }, + { + "epoch": 4.941298996402197, + "grad_norm": 0.7578022480010986, + "learning_rate": 2.3480401439121383e-07, + "loss": 0.0875, + "step": 52190 + }, + { + "epoch": 4.942245786782806, + "grad_norm": 0.584598958492279, + "learning_rate": 2.3101685286877486e-07, + "loss": 0.0958, + "step": 52200 + }, + { + "epoch": 4.9431925771634155, + "grad_norm": 0.6678522229194641, + "learning_rate": 2.2722969134633594e-07, + "loss": 0.0882, + "step": 52210 + }, + { + "epoch": 4.944139367544025, + "grad_norm": 0.49344995617866516, + "learning_rate": 2.2344252982389702e-07, + "loss": 0.0962, + "step": 52220 + }, + { + "epoch": 4.945086157924635, + "grad_norm": 0.5028517842292786, + "learning_rate": 2.196553683014581e-07, + "loss": 0.081, + "step": 52230 + }, + { + "epoch": 4.946032948305245, + "grad_norm": 0.5575220584869385, + "learning_rate": 2.1586820677901913e-07, + "loss": 0.0837, + "step": 52240 + }, + { + "epoch": 4.9469797386858545, + "grad_norm": 0.733795166015625, + "learning_rate": 2.120810452565802e-07, + "loss": 0.1, + "step": 52250 + }, + { + "epoch": 4.947926529066464, + "grad_norm": 0.5502561926841736, + "learning_rate": 2.082938837341413e-07, + "loss": 0.0875, + "step": 52260 + }, + { + "epoch": 4.948873319447074, + "grad_norm": 0.6022515296936035, + "learning_rate": 2.0450672221170235e-07, + "loss": 0.0782, + "step": 52270 + }, + { + "epoch": 4.949820109827684, + "grad_norm": 0.6461271643638611, + "learning_rate": 2.007195606892634e-07, + "loss": 0.099, + "step": 52280 + }, + { + "epoch": 4.9507669002082935, + "grad_norm": 0.5273721814155579, + "learning_rate": 1.9693239916682449e-07, + "loss": 0.0911, + "step": 52290 + }, + { + "epoch": 4.951713690588903, + "grad_norm": 0.4946170151233673, + "learning_rate": 1.9314523764438557e-07, + "loss": 0.0851, + "step": 52300 + }, + { + "epoch": 4.952660480969513, + "grad_norm": 0.5599589347839355, + "learning_rate": 1.893580761219466e-07, + "loss": 0.0975, + "step": 52310 + }, + { + "epoch": 4.953607271350123, + "grad_norm": 0.565903902053833, + "learning_rate": 1.8557091459950768e-07, + "loss": 0.0834, + "step": 52320 + }, + { + "epoch": 4.954554061730732, + "grad_norm": 0.6564542055130005, + "learning_rate": 1.8178375307706876e-07, + "loss": 0.0998, + "step": 52330 + }, + { + "epoch": 4.955500852111342, + "grad_norm": 0.5040982961654663, + "learning_rate": 1.7799659155462984e-07, + "loss": 0.0962, + "step": 52340 + }, + { + "epoch": 4.956447642491952, + "grad_norm": 0.6264041662216187, + "learning_rate": 1.7420943003219087e-07, + "loss": 0.0967, + "step": 52350 + }, + { + "epoch": 4.957394432872562, + "grad_norm": 0.6681390404701233, + "learning_rate": 1.7042226850975195e-07, + "loss": 0.0926, + "step": 52360 + }, + { + "epoch": 4.958341223253171, + "grad_norm": 0.6040511727333069, + "learning_rate": 1.6663510698731304e-07, + "loss": 0.0899, + "step": 52370 + }, + { + "epoch": 4.959288013633781, + "grad_norm": 0.6360688209533691, + "learning_rate": 1.6284794546487412e-07, + "loss": 0.1003, + "step": 52380 + }, + { + "epoch": 4.960234804014391, + "grad_norm": 0.5660171508789062, + "learning_rate": 1.5906078394243515e-07, + "loss": 0.1008, + "step": 52390 + }, + { + "epoch": 4.961181594395001, + "grad_norm": 0.5813804864883423, + "learning_rate": 1.5527362241999623e-07, + "loss": 0.093, + "step": 52400 + }, + { + "epoch": 4.96212838477561, + "grad_norm": 0.5464773774147034, + "learning_rate": 1.514864608975573e-07, + "loss": 0.1, + "step": 52410 + }, + { + "epoch": 4.96307517515622, + "grad_norm": 0.6224668622016907, + "learning_rate": 1.4769929937511836e-07, + "loss": 0.0953, + "step": 52420 + }, + { + "epoch": 4.96402196553683, + "grad_norm": 0.5166348814964294, + "learning_rate": 1.4391213785267942e-07, + "loss": 0.0877, + "step": 52430 + }, + { + "epoch": 4.96496875591744, + "grad_norm": 0.5422220826148987, + "learning_rate": 1.401249763302405e-07, + "loss": 0.0908, + "step": 52440 + }, + { + "epoch": 4.965915546298049, + "grad_norm": 0.5344117879867554, + "learning_rate": 1.3633781480780156e-07, + "loss": 0.0997, + "step": 52450 + }, + { + "epoch": 4.966862336678659, + "grad_norm": 0.46766147017478943, + "learning_rate": 1.3255065328536264e-07, + "loss": 0.0754, + "step": 52460 + }, + { + "epoch": 4.967809127059269, + "grad_norm": 0.5525385141372681, + "learning_rate": 1.287634917629237e-07, + "loss": 0.09, + "step": 52470 + }, + { + "epoch": 4.9687559174398785, + "grad_norm": 0.5809475183486938, + "learning_rate": 1.2497633024048478e-07, + "loss": 0.0963, + "step": 52480 + }, + { + "epoch": 4.969702707820488, + "grad_norm": 0.6854071021080017, + "learning_rate": 1.2118916871804583e-07, + "loss": 0.1061, + "step": 52490 + }, + { + "epoch": 4.970649498201098, + "grad_norm": 0.5188122391700745, + "learning_rate": 1.1740200719560691e-07, + "loss": 0.0921, + "step": 52500 + }, + { + "epoch": 4.971596288581708, + "grad_norm": 0.5574137568473816, + "learning_rate": 1.1361484567316797e-07, + "loss": 0.0948, + "step": 52510 + }, + { + "epoch": 4.9725430789623175, + "grad_norm": 0.44761478900909424, + "learning_rate": 1.0982768415072905e-07, + "loss": 0.0905, + "step": 52520 + }, + { + "epoch": 4.973489869342927, + "grad_norm": 0.5985987782478333, + "learning_rate": 1.060405226282901e-07, + "loss": 0.0954, + "step": 52530 + }, + { + "epoch": 4.974436659723537, + "grad_norm": 0.4684286415576935, + "learning_rate": 1.0225336110585117e-07, + "loss": 0.0884, + "step": 52540 + }, + { + "epoch": 4.975383450104147, + "grad_norm": 0.5261790156364441, + "learning_rate": 9.846619958341224e-08, + "loss": 0.0846, + "step": 52550 + }, + { + "epoch": 4.9763302404847565, + "grad_norm": 0.4636407196521759, + "learning_rate": 9.46790380609733e-08, + "loss": 0.0934, + "step": 52560 + }, + { + "epoch": 4.977277030865366, + "grad_norm": 0.5825064182281494, + "learning_rate": 9.089187653853438e-08, + "loss": 0.0837, + "step": 52570 + }, + { + "epoch": 4.978223821245976, + "grad_norm": 0.538732647895813, + "learning_rate": 8.710471501609544e-08, + "loss": 0.095, + "step": 52580 + }, + { + "epoch": 4.979170611626586, + "grad_norm": 0.6379238963127136, + "learning_rate": 8.331755349365652e-08, + "loss": 0.0948, + "step": 52590 + }, + { + "epoch": 4.980117402007195, + "grad_norm": 0.5082975029945374, + "learning_rate": 7.953039197121757e-08, + "loss": 0.0952, + "step": 52600 + }, + { + "epoch": 4.981064192387805, + "grad_norm": 0.5919937491416931, + "learning_rate": 7.574323044877865e-08, + "loss": 0.0916, + "step": 52610 + }, + { + "epoch": 4.982010982768415, + "grad_norm": 0.654510498046875, + "learning_rate": 7.195606892633971e-08, + "loss": 0.0862, + "step": 52620 + }, + { + "epoch": 4.982957773149025, + "grad_norm": 0.6276813745498657, + "learning_rate": 6.816890740390078e-08, + "loss": 0.0938, + "step": 52630 + }, + { + "epoch": 4.983904563529634, + "grad_norm": 0.5056014060974121, + "learning_rate": 6.438174588146185e-08, + "loss": 0.0918, + "step": 52640 + }, + { + "epoch": 4.984851353910244, + "grad_norm": 0.49018388986587524, + "learning_rate": 6.059458435902292e-08, + "loss": 0.0882, + "step": 52650 + }, + { + "epoch": 4.985798144290854, + "grad_norm": 0.5894728899002075, + "learning_rate": 5.6807422836583984e-08, + "loss": 0.0977, + "step": 52660 + }, + { + "epoch": 4.986744934671464, + "grad_norm": 0.5480173230171204, + "learning_rate": 5.302026131414505e-08, + "loss": 0.0968, + "step": 52670 + }, + { + "epoch": 4.987691725052073, + "grad_norm": 0.576349139213562, + "learning_rate": 4.923309979170612e-08, + "loss": 0.0972, + "step": 52680 + }, + { + "epoch": 4.988638515432683, + "grad_norm": 0.4693711996078491, + "learning_rate": 4.544593826926719e-08, + "loss": 0.0839, + "step": 52690 + }, + { + "epoch": 4.989585305813293, + "grad_norm": 0.3827964961528778, + "learning_rate": 4.165877674682826e-08, + "loss": 0.0773, + "step": 52700 + }, + { + "epoch": 4.990532096193903, + "grad_norm": 0.5371719598770142, + "learning_rate": 3.787161522438933e-08, + "loss": 0.0852, + "step": 52710 + }, + { + "epoch": 4.991478886574512, + "grad_norm": 0.5461321473121643, + "learning_rate": 3.408445370195039e-08, + "loss": 0.0826, + "step": 52720 + }, + { + "epoch": 4.992425676955122, + "grad_norm": 0.5288814902305603, + "learning_rate": 3.029729217951146e-08, + "loss": 0.1059, + "step": 52730 + }, + { + "epoch": 4.993372467335732, + "grad_norm": 0.6059397459030151, + "learning_rate": 2.6510130657072527e-08, + "loss": 0.0953, + "step": 52740 + }, + { + "epoch": 4.9943192577163416, + "grad_norm": 0.6540989279747009, + "learning_rate": 2.2722969134633595e-08, + "loss": 0.0852, + "step": 52750 + }, + { + "epoch": 4.995266048096951, + "grad_norm": 0.5415211319923401, + "learning_rate": 1.8935807612194664e-08, + "loss": 0.0922, + "step": 52760 + }, + { + "epoch": 4.996212838477561, + "grad_norm": 0.715372622013092, + "learning_rate": 1.514864608975573e-08, + "loss": 0.0942, + "step": 52770 + }, + { + "epoch": 4.997159628858171, + "grad_norm": 0.5425447225570679, + "learning_rate": 1.1361484567316798e-08, + "loss": 0.0904, + "step": 52780 + }, + { + "epoch": 4.9981064192387805, + "grad_norm": 0.6264485716819763, + "learning_rate": 7.574323044877864e-09, + "loss": 0.089, + "step": 52790 + }, + { + "epoch": 4.99905320961939, + "grad_norm": 0.5500597953796387, + "learning_rate": 3.787161522438932e-09, + "loss": 0.1056, + "step": 52800 + }, + { + "epoch": 5.0, + "grad_norm": 1.679782748222351, + "learning_rate": 0.0, + "loss": 0.0979, + "step": 52810 + }, + { + "epoch": 5.0, + "eval_f1_micro": 0.3872254461700786, + "eval_loss": 0.11608477681875229, + "eval_precision": 0.5393189964157706, + "eval_recall": 0.3020454864805187, + "eval_runtime": 367.8742, + "eval_samples_per_second": 114.835, + "eval_steps_per_second": 7.179, + "step": 52810 + } + ], + "logging_steps": 10, + "max_steps": 52810, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.55886064391168e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}