diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,195503 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999820939352159, + "eval_steps": 500, + "global_step": 27923, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.581212956828478e-05, + "grad_norm": 10.962271690368652, + "learning_rate": 2.386634844868735e-07, + "loss": 3.4992, + "step": 1 + }, + { + "epoch": 7.162425913656956e-05, + "grad_norm": 27.797842025756836, + "learning_rate": 4.77326968973747e-07, + "loss": 3.1642, + "step": 2 + }, + { + "epoch": 0.00010743638870485433, + "grad_norm": 9.58411979675293, + "learning_rate": 7.159904534606206e-07, + "loss": 3.0653, + "step": 3 + }, + { + "epoch": 0.00014324851827313912, + "grad_norm": 20.217010498046875, + "learning_rate": 9.54653937947494e-07, + "loss": 2.3841, + "step": 4 + }, + { + "epoch": 0.00017906064784142388, + "grad_norm": 7.880495071411133, + "learning_rate": 1.1933174224343676e-06, + "loss": 2.9288, + "step": 5 + }, + { + "epoch": 0.00021487277740970867, + "grad_norm": 51.29137420654297, + "learning_rate": 1.4319809069212413e-06, + "loss": 3.7625, + "step": 6 + }, + { + "epoch": 0.00025068490697799346, + "grad_norm": 8.99178409576416, + "learning_rate": 1.6706443914081146e-06, + "loss": 3.156, + "step": 7 + }, + { + "epoch": 0.00028649703654627824, + "grad_norm": 7.409815788269043, + "learning_rate": 1.909307875894988e-06, + "loss": 3.0058, + "step": 8 + }, + { + "epoch": 0.000322309166114563, + "grad_norm": 26.714468002319336, + "learning_rate": 2.1479713603818614e-06, + "loss": 3.0568, + "step": 9 + }, + { + "epoch": 0.00035812129568284776, + "grad_norm": 9.890267372131348, + "learning_rate": 2.386634844868735e-06, + "loss": 2.9411, + "step": 10 + }, + { + "epoch": 0.00039393342525113255, + "grad_norm": 6.088392734527588, + "learning_rate": 2.625298329355609e-06, + "loss": 2.6308, + "step": 11 + }, + { + "epoch": 0.00042974555481941734, + "grad_norm": 5.2657365798950195, + "learning_rate": 2.8639618138424826e-06, + "loss": 2.3566, + "step": 12 + }, + { + "epoch": 0.0004655576843877021, + "grad_norm": 14.518162727355957, + "learning_rate": 3.1026252983293554e-06, + "loss": 3.2695, + "step": 13 + }, + { + "epoch": 0.0005013698139559869, + "grad_norm": 42.87174987792969, + "learning_rate": 3.341288782816229e-06, + "loss": 3.3732, + "step": 14 + }, + { + "epoch": 0.0005371819435242717, + "grad_norm": 5.752267837524414, + "learning_rate": 3.579952267303103e-06, + "loss": 2.6332, + "step": 15 + }, + { + "epoch": 0.0005729940730925565, + "grad_norm": 5.9656572341918945, + "learning_rate": 3.818615751789976e-06, + "loss": 2.5045, + "step": 16 + }, + { + "epoch": 0.0006088062026608413, + "grad_norm": 15.938884735107422, + "learning_rate": 4.05727923627685e-06, + "loss": 3.2342, + "step": 17 + }, + { + "epoch": 0.000644618332229126, + "grad_norm": 12.065786361694336, + "learning_rate": 4.295942720763723e-06, + "loss": 2.4276, + "step": 18 + }, + { + "epoch": 0.0006804304617974107, + "grad_norm": 9.022994995117188, + "learning_rate": 4.5346062052505965e-06, + "loss": 3.5128, + "step": 19 + }, + { + "epoch": 0.0007162425913656955, + "grad_norm": 8.21558952331543, + "learning_rate": 4.77326968973747e-06, + "loss": 2.6403, + "step": 20 + }, + { + "epoch": 0.0007520547209339803, + "grad_norm": 6.674815654754639, + "learning_rate": 5.011933174224344e-06, + "loss": 2.9362, + "step": 21 + }, + { + "epoch": 0.0007878668505022651, + "grad_norm": 8.615503311157227, + "learning_rate": 5.250596658711218e-06, + "loss": 3.4083, + "step": 22 + }, + { + "epoch": 0.0008236789800705499, + "grad_norm": 5.087034225463867, + "learning_rate": 5.489260143198091e-06, + "loss": 2.6269, + "step": 23 + }, + { + "epoch": 0.0008594911096388347, + "grad_norm": 5.468980312347412, + "learning_rate": 5.727923627684965e-06, + "loss": 2.7802, + "step": 24 + }, + { + "epoch": 0.0008953032392071195, + "grad_norm": 4.205332279205322, + "learning_rate": 5.966587112171838e-06, + "loss": 2.6768, + "step": 25 + }, + { + "epoch": 0.0009311153687754042, + "grad_norm": 2.8609237670898438, + "learning_rate": 6.205250596658711e-06, + "loss": 2.3405, + "step": 26 + }, + { + "epoch": 0.000966927498343689, + "grad_norm": 4.893277168273926, + "learning_rate": 6.4439140811455855e-06, + "loss": 2.9049, + "step": 27 + }, + { + "epoch": 0.0010027396279119738, + "grad_norm": 2.7744317054748535, + "learning_rate": 6.682577565632458e-06, + "loss": 2.3621, + "step": 28 + }, + { + "epoch": 0.0010385517574802586, + "grad_norm": 4.931545257568359, + "learning_rate": 6.921241050119331e-06, + "loss": 2.8006, + "step": 29 + }, + { + "epoch": 0.0010743638870485434, + "grad_norm": 2.949612855911255, + "learning_rate": 7.159904534606206e-06, + "loss": 2.7418, + "step": 30 + }, + { + "epoch": 0.0011101760166168282, + "grad_norm": 6.12894868850708, + "learning_rate": 7.398568019093079e-06, + "loss": 3.279, + "step": 31 + }, + { + "epoch": 0.001145988146185113, + "grad_norm": 4.933887004852295, + "learning_rate": 7.637231503579952e-06, + "loss": 3.0113, + "step": 32 + }, + { + "epoch": 0.0011818002757533978, + "grad_norm": 3.533989906311035, + "learning_rate": 7.875894988066825e-06, + "loss": 2.4149, + "step": 33 + }, + { + "epoch": 0.0012176124053216825, + "grad_norm": 5.942819118499756, + "learning_rate": 8.1145584725537e-06, + "loss": 2.7419, + "step": 34 + }, + { + "epoch": 0.0012534245348899673, + "grad_norm": 5.094061374664307, + "learning_rate": 8.353221957040573e-06, + "loss": 3.1046, + "step": 35 + }, + { + "epoch": 0.001289236664458252, + "grad_norm": 4.425570487976074, + "learning_rate": 8.591885441527446e-06, + "loss": 2.9092, + "step": 36 + }, + { + "epoch": 0.0013250487940265367, + "grad_norm": 3.693305253982544, + "learning_rate": 8.83054892601432e-06, + "loss": 2.7984, + "step": 37 + }, + { + "epoch": 0.0013608609235948215, + "grad_norm": 4.119627475738525, + "learning_rate": 9.069212410501193e-06, + "loss": 2.4779, + "step": 38 + }, + { + "epoch": 0.0013966730531631063, + "grad_norm": 4.064864158630371, + "learning_rate": 9.307875894988068e-06, + "loss": 2.5798, + "step": 39 + }, + { + "epoch": 0.001432485182731391, + "grad_norm": 6.357635974884033, + "learning_rate": 9.54653937947494e-06, + "loss": 2.9759, + "step": 40 + }, + { + "epoch": 0.0014682973122996758, + "grad_norm": 4.885557651519775, + "learning_rate": 9.785202863961815e-06, + "loss": 2.6862, + "step": 41 + }, + { + "epoch": 0.0015041094418679606, + "grad_norm": 4.694367408752441, + "learning_rate": 1.0023866348448688e-05, + "loss": 2.8069, + "step": 42 + }, + { + "epoch": 0.0015399215714362454, + "grad_norm": 6.279577732086182, + "learning_rate": 1.026252983293556e-05, + "loss": 2.5923, + "step": 43 + }, + { + "epoch": 0.0015757337010045302, + "grad_norm": 3.011857032775879, + "learning_rate": 1.0501193317422435e-05, + "loss": 2.4332, + "step": 44 + }, + { + "epoch": 0.001611545830572815, + "grad_norm": 5.952685832977295, + "learning_rate": 1.0739856801909308e-05, + "loss": 3.1646, + "step": 45 + }, + { + "epoch": 0.0016473579601410998, + "grad_norm": 3.988813877105713, + "learning_rate": 1.0978520286396183e-05, + "loss": 2.7031, + "step": 46 + }, + { + "epoch": 0.0016831700897093846, + "grad_norm": 4.060389041900635, + "learning_rate": 1.1217183770883056e-05, + "loss": 2.7739, + "step": 47 + }, + { + "epoch": 0.0017189822192776693, + "grad_norm": 3.857228994369507, + "learning_rate": 1.145584725536993e-05, + "loss": 2.5465, + "step": 48 + }, + { + "epoch": 0.0017547943488459541, + "grad_norm": 2.791111469268799, + "learning_rate": 1.1694510739856803e-05, + "loss": 2.465, + "step": 49 + }, + { + "epoch": 0.001790606478414239, + "grad_norm": 3.5531413555145264, + "learning_rate": 1.1933174224343676e-05, + "loss": 2.6838, + "step": 50 + }, + { + "epoch": 0.0018264186079825237, + "grad_norm": 3.4963018894195557, + "learning_rate": 1.2171837708830549e-05, + "loss": 2.4643, + "step": 51 + }, + { + "epoch": 0.0018622307375508085, + "grad_norm": 2.362746238708496, + "learning_rate": 1.2410501193317422e-05, + "loss": 2.2958, + "step": 52 + }, + { + "epoch": 0.0018980428671190933, + "grad_norm": 2.81494140625, + "learning_rate": 1.2649164677804295e-05, + "loss": 2.6014, + "step": 53 + }, + { + "epoch": 0.001933854996687378, + "grad_norm": 2.551485061645508, + "learning_rate": 1.2887828162291171e-05, + "loss": 2.5691, + "step": 54 + }, + { + "epoch": 0.0019696671262556626, + "grad_norm": 7.383233547210693, + "learning_rate": 1.3126491646778044e-05, + "loss": 3.3642, + "step": 55 + }, + { + "epoch": 0.0020054792558239476, + "grad_norm": 6.4138360023498535, + "learning_rate": 1.3365155131264917e-05, + "loss": 3.1544, + "step": 56 + }, + { + "epoch": 0.002041291385392232, + "grad_norm": 3.4359352588653564, + "learning_rate": 1.360381861575179e-05, + "loss": 2.7802, + "step": 57 + }, + { + "epoch": 0.0020771035149605172, + "grad_norm": 1.671030879020691, + "learning_rate": 1.3842482100238662e-05, + "loss": 2.0855, + "step": 58 + }, + { + "epoch": 0.002112915644528802, + "grad_norm": 2.736416816711426, + "learning_rate": 1.4081145584725539e-05, + "loss": 2.5001, + "step": 59 + }, + { + "epoch": 0.002148727774097087, + "grad_norm": 2.362579584121704, + "learning_rate": 1.4319809069212412e-05, + "loss": 2.5147, + "step": 60 + }, + { + "epoch": 0.0021845399036653714, + "grad_norm": 4.88362455368042, + "learning_rate": 1.4558472553699284e-05, + "loss": 2.7335, + "step": 61 + }, + { + "epoch": 0.0022203520332336564, + "grad_norm": 3.396545171737671, + "learning_rate": 1.4797136038186157e-05, + "loss": 2.2491, + "step": 62 + }, + { + "epoch": 0.002256164162801941, + "grad_norm": 2.045161008834839, + "learning_rate": 1.5035799522673034e-05, + "loss": 2.1031, + "step": 63 + }, + { + "epoch": 0.002291976292370226, + "grad_norm": 2.4404568672180176, + "learning_rate": 1.5274463007159905e-05, + "loss": 2.4767, + "step": 64 + }, + { + "epoch": 0.0023277884219385105, + "grad_norm": 1.991403341293335, + "learning_rate": 1.551312649164678e-05, + "loss": 2.2476, + "step": 65 + }, + { + "epoch": 0.0023636005515067955, + "grad_norm": 2.6608493328094482, + "learning_rate": 1.575178997613365e-05, + "loss": 2.4446, + "step": 66 + }, + { + "epoch": 0.00239941268107508, + "grad_norm": 4.269506454467773, + "learning_rate": 1.5990453460620525e-05, + "loss": 2.6558, + "step": 67 + }, + { + "epoch": 0.002435224810643365, + "grad_norm": 2.757368564605713, + "learning_rate": 1.62291169451074e-05, + "loss": 2.4896, + "step": 68 + }, + { + "epoch": 0.0024710369402116497, + "grad_norm": 3.7774527072906494, + "learning_rate": 1.6467780429594274e-05, + "loss": 2.6252, + "step": 69 + }, + { + "epoch": 0.0025068490697799347, + "grad_norm": 2.459183692932129, + "learning_rate": 1.6706443914081145e-05, + "loss": 2.3044, + "step": 70 + }, + { + "epoch": 0.0025426611993482192, + "grad_norm": 1.71919584274292, + "learning_rate": 1.694510739856802e-05, + "loss": 2.2353, + "step": 71 + }, + { + "epoch": 0.002578473328916504, + "grad_norm": 2.3338592052459717, + "learning_rate": 1.718377088305489e-05, + "loss": 2.3211, + "step": 72 + }, + { + "epoch": 0.002614285458484789, + "grad_norm": 4.527637958526611, + "learning_rate": 1.742243436754177e-05, + "loss": 2.5248, + "step": 73 + }, + { + "epoch": 0.0026500975880530734, + "grad_norm": 4.273586273193359, + "learning_rate": 1.766109785202864e-05, + "loss": 2.7464, + "step": 74 + }, + { + "epoch": 0.0026859097176213584, + "grad_norm": 2.748335838317871, + "learning_rate": 1.7899761336515515e-05, + "loss": 2.4126, + "step": 75 + }, + { + "epoch": 0.002721721847189643, + "grad_norm": 4.273103713989258, + "learning_rate": 1.8138424821002386e-05, + "loss": 2.5641, + "step": 76 + }, + { + "epoch": 0.002757533976757928, + "grad_norm": 2.175398826599121, + "learning_rate": 1.837708830548926e-05, + "loss": 2.335, + "step": 77 + }, + { + "epoch": 0.0027933461063262125, + "grad_norm": 1.991686463356018, + "learning_rate": 1.8615751789976135e-05, + "loss": 2.1709, + "step": 78 + }, + { + "epoch": 0.0028291582358944975, + "grad_norm": 2.3759572505950928, + "learning_rate": 1.885441527446301e-05, + "loss": 2.2788, + "step": 79 + }, + { + "epoch": 0.002864970365462782, + "grad_norm": 2.7693066596984863, + "learning_rate": 1.909307875894988e-05, + "loss": 2.2775, + "step": 80 + }, + { + "epoch": 0.002900782495031067, + "grad_norm": 3.2281436920166016, + "learning_rate": 1.9331742243436756e-05, + "loss": 2.453, + "step": 81 + }, + { + "epoch": 0.0029365946245993517, + "grad_norm": 2.9001846313476562, + "learning_rate": 1.957040572792363e-05, + "loss": 2.4301, + "step": 82 + }, + { + "epoch": 0.0029724067541676367, + "grad_norm": 4.701358795166016, + "learning_rate": 1.98090692124105e-05, + "loss": 2.3049, + "step": 83 + }, + { + "epoch": 0.0030082188837359213, + "grad_norm": 3.314978837966919, + "learning_rate": 2.0047732696897376e-05, + "loss": 2.5192, + "step": 84 + }, + { + "epoch": 0.0030440310133042063, + "grad_norm": 3.295654535293579, + "learning_rate": 2.0286396181384247e-05, + "loss": 2.3816, + "step": 85 + }, + { + "epoch": 0.003079843142872491, + "grad_norm": 4.113006591796875, + "learning_rate": 2.052505966587112e-05, + "loss": 2.3823, + "step": 86 + }, + { + "epoch": 0.003115655272440776, + "grad_norm": 6.134260654449463, + "learning_rate": 2.0763723150357996e-05, + "loss": 2.9283, + "step": 87 + }, + { + "epoch": 0.0031514674020090604, + "grad_norm": 3.5906167030334473, + "learning_rate": 2.100238663484487e-05, + "loss": 2.7102, + "step": 88 + }, + { + "epoch": 0.0031872795315773454, + "grad_norm": 2.333726167678833, + "learning_rate": 2.1241050119331742e-05, + "loss": 2.0401, + "step": 89 + }, + { + "epoch": 0.00322309166114563, + "grad_norm": 2.3526418209075928, + "learning_rate": 2.1479713603818617e-05, + "loss": 2.0289, + "step": 90 + }, + { + "epoch": 0.003258903790713915, + "grad_norm": 4.23220157623291, + "learning_rate": 2.171837708830549e-05, + "loss": 2.6101, + "step": 91 + }, + { + "epoch": 0.0032947159202821995, + "grad_norm": 2.601986885070801, + "learning_rate": 2.1957040572792366e-05, + "loss": 2.1812, + "step": 92 + }, + { + "epoch": 0.0033305280498504846, + "grad_norm": 3.328922748565674, + "learning_rate": 2.2195704057279237e-05, + "loss": 2.4358, + "step": 93 + }, + { + "epoch": 0.003366340179418769, + "grad_norm": 3.1595191955566406, + "learning_rate": 2.243436754176611e-05, + "loss": 2.243, + "step": 94 + }, + { + "epoch": 0.003402152308987054, + "grad_norm": 3.6129677295684814, + "learning_rate": 2.2673031026252983e-05, + "loss": 1.9055, + "step": 95 + }, + { + "epoch": 0.0034379644385553387, + "grad_norm": 2.5094430446624756, + "learning_rate": 2.291169451073986e-05, + "loss": 1.9977, + "step": 96 + }, + { + "epoch": 0.0034737765681236233, + "grad_norm": 5.377303600311279, + "learning_rate": 2.3150357995226732e-05, + "loss": 2.5395, + "step": 97 + }, + { + "epoch": 0.0035095886976919083, + "grad_norm": 5.151118755340576, + "learning_rate": 2.3389021479713606e-05, + "loss": 2.4771, + "step": 98 + }, + { + "epoch": 0.003545400827260193, + "grad_norm": 3.074028968811035, + "learning_rate": 2.3627684964200477e-05, + "loss": 2.1024, + "step": 99 + }, + { + "epoch": 0.003581212956828478, + "grad_norm": 2.7826621532440186, + "learning_rate": 2.3866348448687352e-05, + "loss": 2.4725, + "step": 100 + }, + { + "epoch": 0.0036170250863967624, + "grad_norm": 4.709033012390137, + "learning_rate": 2.4105011933174227e-05, + "loss": 2.6447, + "step": 101 + }, + { + "epoch": 0.0036528372159650474, + "grad_norm": 5.2452311515808105, + "learning_rate": 2.4343675417661098e-05, + "loss": 2.373, + "step": 102 + }, + { + "epoch": 0.003688649345533332, + "grad_norm": 2.4772093296051025, + "learning_rate": 2.4582338902147972e-05, + "loss": 2.3534, + "step": 103 + }, + { + "epoch": 0.003724461475101617, + "grad_norm": 4.724703788757324, + "learning_rate": 2.4821002386634844e-05, + "loss": 2.6285, + "step": 104 + }, + { + "epoch": 0.0037602736046699016, + "grad_norm": 2.5867512226104736, + "learning_rate": 2.5059665871121718e-05, + "loss": 2.2477, + "step": 105 + }, + { + "epoch": 0.0037960857342381866, + "grad_norm": 4.422124862670898, + "learning_rate": 2.529832935560859e-05, + "loss": 2.3933, + "step": 106 + }, + { + "epoch": 0.003831897863806471, + "grad_norm": 3.257288932800293, + "learning_rate": 2.5536992840095464e-05, + "loss": 1.8866, + "step": 107 + }, + { + "epoch": 0.003867709993374756, + "grad_norm": 3.285341501235962, + "learning_rate": 2.5775656324582342e-05, + "loss": 2.3934, + "step": 108 + }, + { + "epoch": 0.0039035221229430407, + "grad_norm": 3.1677615642547607, + "learning_rate": 2.6014319809069216e-05, + "loss": 2.4064, + "step": 109 + }, + { + "epoch": 0.003939334252511325, + "grad_norm": 5.5628461837768555, + "learning_rate": 2.6252983293556088e-05, + "loss": 2.5679, + "step": 110 + }, + { + "epoch": 0.003975146382079611, + "grad_norm": 3.9429681301116943, + "learning_rate": 2.6491646778042962e-05, + "loss": 2.5226, + "step": 111 + }, + { + "epoch": 0.004010958511647895, + "grad_norm": 3.8189384937286377, + "learning_rate": 2.6730310262529833e-05, + "loss": 2.3188, + "step": 112 + }, + { + "epoch": 0.00404677064121618, + "grad_norm": 3.60325026512146, + "learning_rate": 2.6968973747016708e-05, + "loss": 2.3814, + "step": 113 + }, + { + "epoch": 0.004082582770784464, + "grad_norm": 4.04654598236084, + "learning_rate": 2.720763723150358e-05, + "loss": 2.0409, + "step": 114 + }, + { + "epoch": 0.00411839490035275, + "grad_norm": 2.3440001010894775, + "learning_rate": 2.7446300715990454e-05, + "loss": 2.1214, + "step": 115 + }, + { + "epoch": 0.0041542070299210344, + "grad_norm": 2.3456714153289795, + "learning_rate": 2.7684964200477325e-05, + "loss": 1.7589, + "step": 116 + }, + { + "epoch": 0.004190019159489319, + "grad_norm": 3.4626646041870117, + "learning_rate": 2.7923627684964203e-05, + "loss": 2.4336, + "step": 117 + }, + { + "epoch": 0.004225831289057604, + "grad_norm": 2.838393211364746, + "learning_rate": 2.8162291169451077e-05, + "loss": 1.9198, + "step": 118 + }, + { + "epoch": 0.004261643418625889, + "grad_norm": 3.0637154579162598, + "learning_rate": 2.840095465393795e-05, + "loss": 2.0948, + "step": 119 + }, + { + "epoch": 0.004297455548194174, + "grad_norm": 3.7256317138671875, + "learning_rate": 2.8639618138424823e-05, + "loss": 2.4361, + "step": 120 + }, + { + "epoch": 0.004333267677762458, + "grad_norm": 3.2016754150390625, + "learning_rate": 2.8878281622911694e-05, + "loss": 2.1261, + "step": 121 + }, + { + "epoch": 0.004369079807330743, + "grad_norm": 4.712606906890869, + "learning_rate": 2.911694510739857e-05, + "loss": 2.0613, + "step": 122 + }, + { + "epoch": 0.004404891936899027, + "grad_norm": 3.5016684532165527, + "learning_rate": 2.935560859188544e-05, + "loss": 2.1061, + "step": 123 + }, + { + "epoch": 0.004440704066467313, + "grad_norm": 2.8278017044067383, + "learning_rate": 2.9594272076372315e-05, + "loss": 2.5463, + "step": 124 + }, + { + "epoch": 0.004476516196035597, + "grad_norm": 3.4511027336120605, + "learning_rate": 2.983293556085919e-05, + "loss": 2.4129, + "step": 125 + }, + { + "epoch": 0.004512328325603882, + "grad_norm": 2.2344186305999756, + "learning_rate": 3.0071599045346067e-05, + "loss": 2.1861, + "step": 126 + }, + { + "epoch": 0.0045481404551721664, + "grad_norm": 2.4487826824188232, + "learning_rate": 3.031026252983294e-05, + "loss": 1.985, + "step": 127 + }, + { + "epoch": 0.004583952584740452, + "grad_norm": 3.2509713172912598, + "learning_rate": 3.054892601431981e-05, + "loss": 2.2934, + "step": 128 + }, + { + "epoch": 0.0046197647143087365, + "grad_norm": 1.7187750339508057, + "learning_rate": 3.0787589498806684e-05, + "loss": 1.9708, + "step": 129 + }, + { + "epoch": 0.004655576843877021, + "grad_norm": 5.076966285705566, + "learning_rate": 3.102625298329356e-05, + "loss": 2.6119, + "step": 130 + }, + { + "epoch": 0.004691388973445306, + "grad_norm": 3.493896245956421, + "learning_rate": 3.126491646778043e-05, + "loss": 2.5397, + "step": 131 + }, + { + "epoch": 0.004727201103013591, + "grad_norm": 2.5739636421203613, + "learning_rate": 3.15035799522673e-05, + "loss": 2.1216, + "step": 132 + }, + { + "epoch": 0.004763013232581876, + "grad_norm": 2.577495574951172, + "learning_rate": 3.1742243436754176e-05, + "loss": 2.1758, + "step": 133 + }, + { + "epoch": 0.00479882536215016, + "grad_norm": 2.5767970085144043, + "learning_rate": 3.198090692124105e-05, + "loss": 2.2066, + "step": 134 + }, + { + "epoch": 0.004834637491718445, + "grad_norm": 3.6485743522644043, + "learning_rate": 3.2219570405727925e-05, + "loss": 2.4, + "step": 135 + }, + { + "epoch": 0.00487044962128673, + "grad_norm": 3.2032318115234375, + "learning_rate": 3.24582338902148e-05, + "loss": 2.323, + "step": 136 + }, + { + "epoch": 0.004906261750855015, + "grad_norm": 3.5218751430511475, + "learning_rate": 3.2696897374701674e-05, + "loss": 2.1281, + "step": 137 + }, + { + "epoch": 0.004942073880423299, + "grad_norm": 3.35833477973938, + "learning_rate": 3.293556085918855e-05, + "loss": 2.3306, + "step": 138 + }, + { + "epoch": 0.004977886009991584, + "grad_norm": 2.5601823329925537, + "learning_rate": 3.3174224343675416e-05, + "loss": 2.0496, + "step": 139 + }, + { + "epoch": 0.005013698139559869, + "grad_norm": 5.0960373878479, + "learning_rate": 3.341288782816229e-05, + "loss": 2.6784, + "step": 140 + }, + { + "epoch": 0.005049510269128154, + "grad_norm": 3.276700973510742, + "learning_rate": 3.3651551312649165e-05, + "loss": 2.0709, + "step": 141 + }, + { + "epoch": 0.0050853223986964385, + "grad_norm": 3.034759283065796, + "learning_rate": 3.389021479713604e-05, + "loss": 2.1616, + "step": 142 + }, + { + "epoch": 0.005121134528264723, + "grad_norm": 3.5838255882263184, + "learning_rate": 3.4128878281622915e-05, + "loss": 2.5447, + "step": 143 + }, + { + "epoch": 0.005156946657833008, + "grad_norm": 3.239966630935669, + "learning_rate": 3.436754176610978e-05, + "loss": 2.4755, + "step": 144 + }, + { + "epoch": 0.005192758787401293, + "grad_norm": 3.4092092514038086, + "learning_rate": 3.4606205250596664e-05, + "loss": 2.2712, + "step": 145 + }, + { + "epoch": 0.005228570916969578, + "grad_norm": 2.1897785663604736, + "learning_rate": 3.484486873508354e-05, + "loss": 2.1974, + "step": 146 + }, + { + "epoch": 0.005264383046537862, + "grad_norm": 3.396059513092041, + "learning_rate": 3.5083532219570406e-05, + "loss": 2.4299, + "step": 147 + }, + { + "epoch": 0.005300195176106147, + "grad_norm": 4.3964104652404785, + "learning_rate": 3.532219570405728e-05, + "loss": 2.2913, + "step": 148 + }, + { + "epoch": 0.005336007305674432, + "grad_norm": 2.8789312839508057, + "learning_rate": 3.5560859188544155e-05, + "loss": 1.6942, + "step": 149 + }, + { + "epoch": 0.005371819435242717, + "grad_norm": 1.7784701585769653, + "learning_rate": 3.579952267303103e-05, + "loss": 1.9099, + "step": 150 + }, + { + "epoch": 0.005407631564811001, + "grad_norm": 3.5115861892700195, + "learning_rate": 3.60381861575179e-05, + "loss": 2.4334, + "step": 151 + }, + { + "epoch": 0.005443443694379286, + "grad_norm": 2.5266120433807373, + "learning_rate": 3.627684964200477e-05, + "loss": 2.1036, + "step": 152 + }, + { + "epoch": 0.005479255823947571, + "grad_norm": 2.6542105674743652, + "learning_rate": 3.651551312649165e-05, + "loss": 2.1469, + "step": 153 + }, + { + "epoch": 0.005515067953515856, + "grad_norm": 6.56311559677124, + "learning_rate": 3.675417661097852e-05, + "loss": 2.9395, + "step": 154 + }, + { + "epoch": 0.0055508800830841405, + "grad_norm": 2.516366720199585, + "learning_rate": 3.6992840095465396e-05, + "loss": 1.9398, + "step": 155 + }, + { + "epoch": 0.005586692212652425, + "grad_norm": 2.376039981842041, + "learning_rate": 3.723150357995227e-05, + "loss": 2.1975, + "step": 156 + }, + { + "epoch": 0.0056225043422207105, + "grad_norm": 3.064851760864258, + "learning_rate": 3.7470167064439145e-05, + "loss": 2.2792, + "step": 157 + }, + { + "epoch": 0.005658316471788995, + "grad_norm": 3.0628392696380615, + "learning_rate": 3.770883054892602e-05, + "loss": 2.1432, + "step": 158 + }, + { + "epoch": 0.00569412860135728, + "grad_norm": 4.342270851135254, + "learning_rate": 3.794749403341289e-05, + "loss": 2.2726, + "step": 159 + }, + { + "epoch": 0.005729940730925564, + "grad_norm": 3.9450583457946777, + "learning_rate": 3.818615751789976e-05, + "loss": 2.2563, + "step": 160 + }, + { + "epoch": 0.00576575286049385, + "grad_norm": 2.65295672416687, + "learning_rate": 3.8424821002386637e-05, + "loss": 2.279, + "step": 161 + }, + { + "epoch": 0.005801564990062134, + "grad_norm": 3.084883451461792, + "learning_rate": 3.866348448687351e-05, + "loss": 2.347, + "step": 162 + }, + { + "epoch": 0.005837377119630419, + "grad_norm": 2.0659403800964355, + "learning_rate": 3.8902147971360386e-05, + "loss": 1.982, + "step": 163 + }, + { + "epoch": 0.005873189249198703, + "grad_norm": 2.8707988262176514, + "learning_rate": 3.914081145584726e-05, + "loss": 2.2617, + "step": 164 + }, + { + "epoch": 0.005909001378766989, + "grad_norm": 2.436180353164673, + "learning_rate": 3.9379474940334135e-05, + "loss": 2.0672, + "step": 165 + }, + { + "epoch": 0.005944813508335273, + "grad_norm": 3.461872100830078, + "learning_rate": 3.9618138424821e-05, + "loss": 2.024, + "step": 166 + }, + { + "epoch": 0.005980625637903558, + "grad_norm": 2.2728688716888428, + "learning_rate": 3.985680190930788e-05, + "loss": 1.7771, + "step": 167 + }, + { + "epoch": 0.0060164377674718425, + "grad_norm": 2.9046618938446045, + "learning_rate": 4.009546539379475e-05, + "loss": 2.1021, + "step": 168 + }, + { + "epoch": 0.006052249897040127, + "grad_norm": 3.4231221675872803, + "learning_rate": 4.0334128878281626e-05, + "loss": 2.4282, + "step": 169 + }, + { + "epoch": 0.0060880620266084125, + "grad_norm": 2.6314404010772705, + "learning_rate": 4.0572792362768494e-05, + "loss": 2.5756, + "step": 170 + }, + { + "epoch": 0.006123874156176697, + "grad_norm": 1.9490439891815186, + "learning_rate": 4.081145584725537e-05, + "loss": 1.9626, + "step": 171 + }, + { + "epoch": 0.006159686285744982, + "grad_norm": 2.614915370941162, + "learning_rate": 4.105011933174224e-05, + "loss": 2.0636, + "step": 172 + }, + { + "epoch": 0.006195498415313266, + "grad_norm": 2.4821505546569824, + "learning_rate": 4.1288782816229125e-05, + "loss": 2.2238, + "step": 173 + }, + { + "epoch": 0.006231310544881552, + "grad_norm": 3.056910991668701, + "learning_rate": 4.152744630071599e-05, + "loss": 1.9857, + "step": 174 + }, + { + "epoch": 0.006267122674449836, + "grad_norm": 3.037914514541626, + "learning_rate": 4.176610978520287e-05, + "loss": 2.0045, + "step": 175 + }, + { + "epoch": 0.006302934804018121, + "grad_norm": 2.768630266189575, + "learning_rate": 4.200477326968974e-05, + "loss": 2.2729, + "step": 176 + }, + { + "epoch": 0.006338746933586405, + "grad_norm": 6.603919506072998, + "learning_rate": 4.2243436754176616e-05, + "loss": 2.4431, + "step": 177 + }, + { + "epoch": 0.006374559063154691, + "grad_norm": 2.241738796234131, + "learning_rate": 4.2482100238663484e-05, + "loss": 2.1826, + "step": 178 + }, + { + "epoch": 0.006410371192722975, + "grad_norm": 2.1560134887695312, + "learning_rate": 4.272076372315036e-05, + "loss": 2.041, + "step": 179 + }, + { + "epoch": 0.00644618332229126, + "grad_norm": 2.806037187576294, + "learning_rate": 4.295942720763723e-05, + "loss": 2.1842, + "step": 180 + }, + { + "epoch": 0.0064819954518595445, + "grad_norm": 3.3710579872131348, + "learning_rate": 4.319809069212411e-05, + "loss": 1.8961, + "step": 181 + }, + { + "epoch": 0.00651780758142783, + "grad_norm": 2.774545192718506, + "learning_rate": 4.343675417661098e-05, + "loss": 2.1378, + "step": 182 + }, + { + "epoch": 0.0065536197109961145, + "grad_norm": 4.045608043670654, + "learning_rate": 4.367541766109786e-05, + "loss": 2.3613, + "step": 183 + }, + { + "epoch": 0.006589431840564399, + "grad_norm": 4.307875633239746, + "learning_rate": 4.391408114558473e-05, + "loss": 2.5546, + "step": 184 + }, + { + "epoch": 0.006625243970132684, + "grad_norm": 2.564769744873047, + "learning_rate": 4.41527446300716e-05, + "loss": 2.0365, + "step": 185 + }, + { + "epoch": 0.006661056099700969, + "grad_norm": 3.592845916748047, + "learning_rate": 4.4391408114558474e-05, + "loss": 2.0478, + "step": 186 + }, + { + "epoch": 0.006696868229269254, + "grad_norm": 2.4689202308654785, + "learning_rate": 4.463007159904535e-05, + "loss": 2.099, + "step": 187 + }, + { + "epoch": 0.006732680358837538, + "grad_norm": 2.8406245708465576, + "learning_rate": 4.486873508353222e-05, + "loss": 1.972, + "step": 188 + }, + { + "epoch": 0.006768492488405823, + "grad_norm": 3.0729517936706543, + "learning_rate": 4.510739856801909e-05, + "loss": 2.3021, + "step": 189 + }, + { + "epoch": 0.006804304617974108, + "grad_norm": 2.427234649658203, + "learning_rate": 4.5346062052505965e-05, + "loss": 2.1842, + "step": 190 + }, + { + "epoch": 0.006840116747542393, + "grad_norm": 2.7807438373565674, + "learning_rate": 4.5584725536992847e-05, + "loss": 2.2948, + "step": 191 + }, + { + "epoch": 0.006875928877110677, + "grad_norm": 3.589139699935913, + "learning_rate": 4.582338902147972e-05, + "loss": 2.3455, + "step": 192 + }, + { + "epoch": 0.006911741006678962, + "grad_norm": 3.561444044113159, + "learning_rate": 4.606205250596659e-05, + "loss": 1.9343, + "step": 193 + }, + { + "epoch": 0.0069475531362472465, + "grad_norm": 2.3971855640411377, + "learning_rate": 4.6300715990453463e-05, + "loss": 2.144, + "step": 194 + }, + { + "epoch": 0.006983365265815532, + "grad_norm": 1.9098275899887085, + "learning_rate": 4.653937947494034e-05, + "loss": 1.8509, + "step": 195 + }, + { + "epoch": 0.0070191773953838165, + "grad_norm": 2.244335412979126, + "learning_rate": 4.677804295942721e-05, + "loss": 1.922, + "step": 196 + }, + { + "epoch": 0.007054989524952101, + "grad_norm": 2.8799850940704346, + "learning_rate": 4.701670644391408e-05, + "loss": 1.9339, + "step": 197 + }, + { + "epoch": 0.007090801654520386, + "grad_norm": 2.483138084411621, + "learning_rate": 4.7255369928400955e-05, + "loss": 2.0267, + "step": 198 + }, + { + "epoch": 0.007126613784088671, + "grad_norm": 4.479922294616699, + "learning_rate": 4.749403341288783e-05, + "loss": 2.345, + "step": 199 + }, + { + "epoch": 0.007162425913656956, + "grad_norm": 2.4808027744293213, + "learning_rate": 4.7732696897374704e-05, + "loss": 2.1023, + "step": 200 + }, + { + "epoch": 0.00719823804322524, + "grad_norm": 3.2008464336395264, + "learning_rate": 4.797136038186158e-05, + "loss": 2.1962, + "step": 201 + }, + { + "epoch": 0.007234050172793525, + "grad_norm": 3.3301191329956055, + "learning_rate": 4.821002386634845e-05, + "loss": 2.0843, + "step": 202 + }, + { + "epoch": 0.00726986230236181, + "grad_norm": 2.2607581615448, + "learning_rate": 4.844868735083533e-05, + "loss": 1.812, + "step": 203 + }, + { + "epoch": 0.007305674431930095, + "grad_norm": 2.308560371398926, + "learning_rate": 4.8687350835322196e-05, + "loss": 2.1753, + "step": 204 + }, + { + "epoch": 0.007341486561498379, + "grad_norm": 1.8823864459991455, + "learning_rate": 4.892601431980907e-05, + "loss": 2.0771, + "step": 205 + }, + { + "epoch": 0.007377298691066664, + "grad_norm": 2.2979161739349365, + "learning_rate": 4.9164677804295945e-05, + "loss": 2.1303, + "step": 206 + }, + { + "epoch": 0.007413110820634949, + "grad_norm": 2.1987791061401367, + "learning_rate": 4.940334128878282e-05, + "loss": 2.1973, + "step": 207 + }, + { + "epoch": 0.007448922950203234, + "grad_norm": 2.7911312580108643, + "learning_rate": 4.964200477326969e-05, + "loss": 2.2152, + "step": 208 + }, + { + "epoch": 0.0074847350797715186, + "grad_norm": 4.45490837097168, + "learning_rate": 4.988066825775656e-05, + "loss": 2.0745, + "step": 209 + }, + { + "epoch": 0.007520547209339803, + "grad_norm": 3.342639684677124, + "learning_rate": 5.0119331742243436e-05, + "loss": 2.3235, + "step": 210 + }, + { + "epoch": 0.0075563593389080886, + "grad_norm": 2.095116376876831, + "learning_rate": 5.035799522673032e-05, + "loss": 1.9779, + "step": 211 + }, + { + "epoch": 0.007592171468476373, + "grad_norm": 2.7536773681640625, + "learning_rate": 5.059665871121718e-05, + "loss": 2.11, + "step": 212 + }, + { + "epoch": 0.007627983598044658, + "grad_norm": 2.3608434200286865, + "learning_rate": 5.083532219570406e-05, + "loss": 2.0265, + "step": 213 + }, + { + "epoch": 0.007663795727612942, + "grad_norm": 2.917818307876587, + "learning_rate": 5.107398568019093e-05, + "loss": 1.9071, + "step": 214 + }, + { + "epoch": 0.007699607857181228, + "grad_norm": 3.007936716079712, + "learning_rate": 5.131264916467781e-05, + "loss": 2.0646, + "step": 215 + }, + { + "epoch": 0.007735419986749512, + "grad_norm": 2.02176833152771, + "learning_rate": 5.1551312649164684e-05, + "loss": 1.9931, + "step": 216 + }, + { + "epoch": 0.007771232116317797, + "grad_norm": 2.787811040878296, + "learning_rate": 5.178997613365155e-05, + "loss": 2.1347, + "step": 217 + }, + { + "epoch": 0.007807044245886081, + "grad_norm": 3.3416104316711426, + "learning_rate": 5.202863961813843e-05, + "loss": 2.1722, + "step": 218 + }, + { + "epoch": 0.007842856375454366, + "grad_norm": 3.348334312438965, + "learning_rate": 5.22673031026253e-05, + "loss": 2.2807, + "step": 219 + }, + { + "epoch": 0.00787866850502265, + "grad_norm": 5.854157447814941, + "learning_rate": 5.2505966587112175e-05, + "loss": 2.7781, + "step": 220 + }, + { + "epoch": 0.007914480634590935, + "grad_norm": 2.1232118606567383, + "learning_rate": 5.274463007159904e-05, + "loss": 2.0395, + "step": 221 + }, + { + "epoch": 0.007950292764159221, + "grad_norm": 3.457007646560669, + "learning_rate": 5.2983293556085924e-05, + "loss": 2.2375, + "step": 222 + }, + { + "epoch": 0.007986104893727506, + "grad_norm": 3.5690152645111084, + "learning_rate": 5.322195704057279e-05, + "loss": 2.5559, + "step": 223 + }, + { + "epoch": 0.00802191702329579, + "grad_norm": 2.673306941986084, + "learning_rate": 5.346062052505967e-05, + "loss": 2.2094, + "step": 224 + }, + { + "epoch": 0.008057729152864075, + "grad_norm": 2.7809441089630127, + "learning_rate": 5.369928400954655e-05, + "loss": 1.9878, + "step": 225 + }, + { + "epoch": 0.00809354128243236, + "grad_norm": 3.0256967544555664, + "learning_rate": 5.3937947494033416e-05, + "loss": 1.8963, + "step": 226 + }, + { + "epoch": 0.008129353412000644, + "grad_norm": 3.9648005962371826, + "learning_rate": 5.417661097852029e-05, + "loss": 2.0862, + "step": 227 + }, + { + "epoch": 0.008165165541568929, + "grad_norm": 2.245849609375, + "learning_rate": 5.441527446300716e-05, + "loss": 1.7293, + "step": 228 + }, + { + "epoch": 0.008200977671137213, + "grad_norm": 2.413544178009033, + "learning_rate": 5.465393794749404e-05, + "loss": 2.0762, + "step": 229 + }, + { + "epoch": 0.0082367898007055, + "grad_norm": 3.0067107677459717, + "learning_rate": 5.489260143198091e-05, + "loss": 2.1954, + "step": 230 + }, + { + "epoch": 0.008272601930273784, + "grad_norm": 3.096357583999634, + "learning_rate": 5.513126491646778e-05, + "loss": 2.2491, + "step": 231 + }, + { + "epoch": 0.008308414059842069, + "grad_norm": 2.373617172241211, + "learning_rate": 5.536992840095465e-05, + "loss": 2.1228, + "step": 232 + }, + { + "epoch": 0.008344226189410353, + "grad_norm": 2.055159091949463, + "learning_rate": 5.560859188544153e-05, + "loss": 1.9121, + "step": 233 + }, + { + "epoch": 0.008380038318978638, + "grad_norm": 3.7004029750823975, + "learning_rate": 5.5847255369928406e-05, + "loss": 2.3244, + "step": 234 + }, + { + "epoch": 0.008415850448546923, + "grad_norm": 2.1520016193389893, + "learning_rate": 5.6085918854415273e-05, + "loss": 2.1335, + "step": 235 + }, + { + "epoch": 0.008451662578115207, + "grad_norm": 4.440451622009277, + "learning_rate": 5.6324582338902155e-05, + "loss": 2.6448, + "step": 236 + }, + { + "epoch": 0.008487474707683492, + "grad_norm": 1.7631467580795288, + "learning_rate": 5.656324582338902e-05, + "loss": 2.0462, + "step": 237 + }, + { + "epoch": 0.008523286837251778, + "grad_norm": 3.681589365005493, + "learning_rate": 5.68019093078759e-05, + "loss": 2.3295, + "step": 238 + }, + { + "epoch": 0.008559098966820063, + "grad_norm": 2.199773073196411, + "learning_rate": 5.7040572792362765e-05, + "loss": 1.9294, + "step": 239 + }, + { + "epoch": 0.008594911096388347, + "grad_norm": 2.9512124061584473, + "learning_rate": 5.7279236276849646e-05, + "loss": 1.9352, + "step": 240 + }, + { + "epoch": 0.008630723225956632, + "grad_norm": 2.6109206676483154, + "learning_rate": 5.7517899761336514e-05, + "loss": 2.1316, + "step": 241 + }, + { + "epoch": 0.008666535355524916, + "grad_norm": 2.3685696125030518, + "learning_rate": 5.775656324582339e-05, + "loss": 2.1921, + "step": 242 + }, + { + "epoch": 0.008702347485093201, + "grad_norm": 3.183547019958496, + "learning_rate": 5.799522673031027e-05, + "loss": 2.0476, + "step": 243 + }, + { + "epoch": 0.008738159614661485, + "grad_norm": 2.3643414974212646, + "learning_rate": 5.823389021479714e-05, + "loss": 2.3041, + "step": 244 + }, + { + "epoch": 0.00877397174422977, + "grad_norm": 2.646777868270874, + "learning_rate": 5.847255369928402e-05, + "loss": 1.8089, + "step": 245 + }, + { + "epoch": 0.008809783873798055, + "grad_norm": 1.8658462762832642, + "learning_rate": 5.871121718377088e-05, + "loss": 2.2097, + "step": 246 + }, + { + "epoch": 0.008845596003366341, + "grad_norm": 3.2187111377716064, + "learning_rate": 5.894988066825776e-05, + "loss": 2.3646, + "step": 247 + }, + { + "epoch": 0.008881408132934625, + "grad_norm": 2.9397201538085938, + "learning_rate": 5.918854415274463e-05, + "loss": 1.8528, + "step": 248 + }, + { + "epoch": 0.00891722026250291, + "grad_norm": 2.3005692958831787, + "learning_rate": 5.942720763723151e-05, + "loss": 1.6524, + "step": 249 + }, + { + "epoch": 0.008953032392071195, + "grad_norm": 2.3031492233276367, + "learning_rate": 5.966587112171838e-05, + "loss": 1.9533, + "step": 250 + }, + { + "epoch": 0.00898884452163948, + "grad_norm": 1.9477695226669312, + "learning_rate": 5.990453460620525e-05, + "loss": 1.8504, + "step": 251 + }, + { + "epoch": 0.009024656651207764, + "grad_norm": 2.6072261333465576, + "learning_rate": 6.0143198090692134e-05, + "loss": 2.2766, + "step": 252 + }, + { + "epoch": 0.009060468780776048, + "grad_norm": 1.8509987592697144, + "learning_rate": 6.0381861575179e-05, + "loss": 1.9699, + "step": 253 + }, + { + "epoch": 0.009096280910344333, + "grad_norm": 2.4382662773132324, + "learning_rate": 6.062052505966588e-05, + "loss": 2.0657, + "step": 254 + }, + { + "epoch": 0.00913209303991262, + "grad_norm": 2.3414926528930664, + "learning_rate": 6.0859188544152745e-05, + "loss": 2.0522, + "step": 255 + }, + { + "epoch": 0.009167905169480904, + "grad_norm": 2.583178997039795, + "learning_rate": 6.109785202863962e-05, + "loss": 2.0446, + "step": 256 + }, + { + "epoch": 0.009203717299049188, + "grad_norm": 4.623299598693848, + "learning_rate": 6.133651551312649e-05, + "loss": 2.2111, + "step": 257 + }, + { + "epoch": 0.009239529428617473, + "grad_norm": 2.9717419147491455, + "learning_rate": 6.157517899761337e-05, + "loss": 2.0214, + "step": 258 + }, + { + "epoch": 0.009275341558185757, + "grad_norm": 2.3703761100769043, + "learning_rate": 6.181384248210024e-05, + "loss": 2.0451, + "step": 259 + }, + { + "epoch": 0.009311153687754042, + "grad_norm": 2.0671181678771973, + "learning_rate": 6.205250596658712e-05, + "loss": 1.9399, + "step": 260 + }, + { + "epoch": 0.009346965817322327, + "grad_norm": 2.3600852489471436, + "learning_rate": 6.2291169451074e-05, + "loss": 1.9703, + "step": 261 + }, + { + "epoch": 0.009382777946890611, + "grad_norm": 1.8281196355819702, + "learning_rate": 6.252983293556087e-05, + "loss": 1.9562, + "step": 262 + }, + { + "epoch": 0.009418590076458897, + "grad_norm": 3.4217169284820557, + "learning_rate": 6.276849642004773e-05, + "loss": 2.1296, + "step": 263 + }, + { + "epoch": 0.009454402206027182, + "grad_norm": 5.549176216125488, + "learning_rate": 6.30071599045346e-05, + "loss": 2.3743, + "step": 264 + }, + { + "epoch": 0.009490214335595467, + "grad_norm": 1.707136869430542, + "learning_rate": 6.324582338902148e-05, + "loss": 1.9212, + "step": 265 + }, + { + "epoch": 0.009526026465163751, + "grad_norm": 2.4918057918548584, + "learning_rate": 6.348448687350835e-05, + "loss": 2.0175, + "step": 266 + }, + { + "epoch": 0.009561838594732036, + "grad_norm": 2.368617057800293, + "learning_rate": 6.372315035799523e-05, + "loss": 1.8226, + "step": 267 + }, + { + "epoch": 0.00959765072430032, + "grad_norm": 2.9346976280212402, + "learning_rate": 6.39618138424821e-05, + "loss": 2.3414, + "step": 268 + }, + { + "epoch": 0.009633462853868605, + "grad_norm": 2.7401106357574463, + "learning_rate": 6.420047732696898e-05, + "loss": 1.8663, + "step": 269 + }, + { + "epoch": 0.00966927498343689, + "grad_norm": 2.8707168102264404, + "learning_rate": 6.443914081145585e-05, + "loss": 2.2932, + "step": 270 + }, + { + "epoch": 0.009705087113005174, + "grad_norm": 2.1876916885375977, + "learning_rate": 6.467780429594272e-05, + "loss": 1.986, + "step": 271 + }, + { + "epoch": 0.00974089924257346, + "grad_norm": 2.1263930797576904, + "learning_rate": 6.49164677804296e-05, + "loss": 1.8861, + "step": 272 + }, + { + "epoch": 0.009776711372141745, + "grad_norm": 1.8206520080566406, + "learning_rate": 6.515513126491647e-05, + "loss": 1.8851, + "step": 273 + }, + { + "epoch": 0.00981252350171003, + "grad_norm": 2.8688876628875732, + "learning_rate": 6.539379474940335e-05, + "loss": 1.8439, + "step": 274 + }, + { + "epoch": 0.009848335631278314, + "grad_norm": 2.3246121406555176, + "learning_rate": 6.563245823389022e-05, + "loss": 2.1826, + "step": 275 + }, + { + "epoch": 0.009884147760846599, + "grad_norm": 2.6065704822540283, + "learning_rate": 6.58711217183771e-05, + "loss": 2.229, + "step": 276 + }, + { + "epoch": 0.009919959890414883, + "grad_norm": 2.4256670475006104, + "learning_rate": 6.610978520286396e-05, + "loss": 2.0945, + "step": 277 + }, + { + "epoch": 0.009955772019983168, + "grad_norm": 1.7812882661819458, + "learning_rate": 6.634844868735083e-05, + "loss": 2.255, + "step": 278 + }, + { + "epoch": 0.009991584149551452, + "grad_norm": 1.8207054138183594, + "learning_rate": 6.65871121718377e-05, + "loss": 1.6961, + "step": 279 + }, + { + "epoch": 0.010027396279119739, + "grad_norm": 1.6386990547180176, + "learning_rate": 6.682577565632458e-05, + "loss": 1.856, + "step": 280 + }, + { + "epoch": 0.010063208408688023, + "grad_norm": 3.287531614303589, + "learning_rate": 6.706443914081146e-05, + "loss": 2.3542, + "step": 281 + }, + { + "epoch": 0.010099020538256308, + "grad_norm": 2.7156126499176025, + "learning_rate": 6.730310262529833e-05, + "loss": 1.8437, + "step": 282 + }, + { + "epoch": 0.010134832667824592, + "grad_norm": 2.3014869689941406, + "learning_rate": 6.754176610978521e-05, + "loss": 2.447, + "step": 283 + }, + { + "epoch": 0.010170644797392877, + "grad_norm": 2.7459521293640137, + "learning_rate": 6.778042959427208e-05, + "loss": 2.2216, + "step": 284 + }, + { + "epoch": 0.010206456926961162, + "grad_norm": 2.640103816986084, + "learning_rate": 6.801909307875896e-05, + "loss": 2.0094, + "step": 285 + }, + { + "epoch": 0.010242269056529446, + "grad_norm": 3.3041248321533203, + "learning_rate": 6.825775656324583e-05, + "loss": 2.6193, + "step": 286 + }, + { + "epoch": 0.01027808118609773, + "grad_norm": 2.390202522277832, + "learning_rate": 6.84964200477327e-05, + "loss": 2.4139, + "step": 287 + }, + { + "epoch": 0.010313893315666015, + "grad_norm": 2.879079580307007, + "learning_rate": 6.873508353221956e-05, + "loss": 2.1419, + "step": 288 + }, + { + "epoch": 0.010349705445234302, + "grad_norm": 5.001767635345459, + "learning_rate": 6.897374701670645e-05, + "loss": 2.108, + "step": 289 + }, + { + "epoch": 0.010385517574802586, + "grad_norm": 4.854882717132568, + "learning_rate": 6.921241050119333e-05, + "loss": 2.4277, + "step": 290 + }, + { + "epoch": 0.01042132970437087, + "grad_norm": 2.883726119995117, + "learning_rate": 6.94510739856802e-05, + "loss": 2.0774, + "step": 291 + }, + { + "epoch": 0.010457141833939155, + "grad_norm": 2.1024317741394043, + "learning_rate": 6.968973747016708e-05, + "loss": 2.1162, + "step": 292 + }, + { + "epoch": 0.01049295396350744, + "grad_norm": 2.173642873764038, + "learning_rate": 6.992840095465394e-05, + "loss": 1.9923, + "step": 293 + }, + { + "epoch": 0.010528766093075724, + "grad_norm": 2.249166250228882, + "learning_rate": 7.016706443914081e-05, + "loss": 2.0456, + "step": 294 + }, + { + "epoch": 0.010564578222644009, + "grad_norm": 2.782399892807007, + "learning_rate": 7.040572792362768e-05, + "loss": 1.6617, + "step": 295 + }, + { + "epoch": 0.010600390352212294, + "grad_norm": 2.9986255168914795, + "learning_rate": 7.064439140811456e-05, + "loss": 2.0264, + "step": 296 + }, + { + "epoch": 0.01063620248178058, + "grad_norm": 1.7165770530700684, + "learning_rate": 7.088305489260143e-05, + "loss": 1.9433, + "step": 297 + }, + { + "epoch": 0.010672014611348864, + "grad_norm": 2.9197936058044434, + "learning_rate": 7.112171837708831e-05, + "loss": 1.9866, + "step": 298 + }, + { + "epoch": 0.010707826740917149, + "grad_norm": 3.9878487586975098, + "learning_rate": 7.136038186157519e-05, + "loss": 2.2649, + "step": 299 + }, + { + "epoch": 0.010743638870485434, + "grad_norm": 2.6345372200012207, + "learning_rate": 7.159904534606206e-05, + "loss": 1.8249, + "step": 300 + }, + { + "epoch": 0.010779451000053718, + "grad_norm": 2.4214723110198975, + "learning_rate": 7.183770883054893e-05, + "loss": 2.0411, + "step": 301 + }, + { + "epoch": 0.010815263129622003, + "grad_norm": 3.2533586025238037, + "learning_rate": 7.20763723150358e-05, + "loss": 2.0109, + "step": 302 + }, + { + "epoch": 0.010851075259190287, + "grad_norm": 2.417447566986084, + "learning_rate": 7.231503579952268e-05, + "loss": 1.996, + "step": 303 + }, + { + "epoch": 0.010886887388758572, + "grad_norm": 3.2113840579986572, + "learning_rate": 7.255369928400954e-05, + "loss": 2.1155, + "step": 304 + }, + { + "epoch": 0.010922699518326858, + "grad_norm": 2.6941497325897217, + "learning_rate": 7.279236276849643e-05, + "loss": 2.1457, + "step": 305 + }, + { + "epoch": 0.010958511647895143, + "grad_norm": 1.8207515478134155, + "learning_rate": 7.30310262529833e-05, + "loss": 1.9003, + "step": 306 + }, + { + "epoch": 0.010994323777463427, + "grad_norm": 3.3273072242736816, + "learning_rate": 7.326968973747017e-05, + "loss": 2.2891, + "step": 307 + }, + { + "epoch": 0.011030135907031712, + "grad_norm": 1.6675465106964111, + "learning_rate": 7.350835322195704e-05, + "loss": 1.9006, + "step": 308 + }, + { + "epoch": 0.011065948036599996, + "grad_norm": 2.2108771800994873, + "learning_rate": 7.374701670644391e-05, + "loss": 2.0193, + "step": 309 + }, + { + "epoch": 0.011101760166168281, + "grad_norm": 1.6703202724456787, + "learning_rate": 7.398568019093079e-05, + "loss": 1.7437, + "step": 310 + }, + { + "epoch": 0.011137572295736566, + "grad_norm": 2.188809394836426, + "learning_rate": 7.422434367541766e-05, + "loss": 1.97, + "step": 311 + }, + { + "epoch": 0.01117338442530485, + "grad_norm": 2.2568600177764893, + "learning_rate": 7.446300715990454e-05, + "loss": 2.1866, + "step": 312 + }, + { + "epoch": 0.011209196554873135, + "grad_norm": 2.189673662185669, + "learning_rate": 7.470167064439141e-05, + "loss": 1.8975, + "step": 313 + }, + { + "epoch": 0.011245008684441421, + "grad_norm": 1.8501036167144775, + "learning_rate": 7.494033412887829e-05, + "loss": 2.0104, + "step": 314 + }, + { + "epoch": 0.011280820814009706, + "grad_norm": 3.4657652378082275, + "learning_rate": 7.517899761336516e-05, + "loss": 2.1677, + "step": 315 + }, + { + "epoch": 0.01131663294357799, + "grad_norm": 1.7842155694961548, + "learning_rate": 7.541766109785204e-05, + "loss": 2.162, + "step": 316 + }, + { + "epoch": 0.011352445073146275, + "grad_norm": 2.6685571670532227, + "learning_rate": 7.565632458233891e-05, + "loss": 2.2024, + "step": 317 + }, + { + "epoch": 0.01138825720271456, + "grad_norm": 2.2792065143585205, + "learning_rate": 7.589498806682577e-05, + "loss": 2.0813, + "step": 318 + }, + { + "epoch": 0.011424069332282844, + "grad_norm": 1.8221955299377441, + "learning_rate": 7.613365155131266e-05, + "loss": 1.9769, + "step": 319 + }, + { + "epoch": 0.011459881461851128, + "grad_norm": 4.3000922203063965, + "learning_rate": 7.637231503579952e-05, + "loss": 2.301, + "step": 320 + }, + { + "epoch": 0.011495693591419413, + "grad_norm": 1.8403431177139282, + "learning_rate": 7.66109785202864e-05, + "loss": 1.9934, + "step": 321 + }, + { + "epoch": 0.0115315057209877, + "grad_norm": 2.3018696308135986, + "learning_rate": 7.684964200477327e-05, + "loss": 1.9003, + "step": 322 + }, + { + "epoch": 0.011567317850555984, + "grad_norm": 2.887930393218994, + "learning_rate": 7.708830548926015e-05, + "loss": 2.5743, + "step": 323 + }, + { + "epoch": 0.011603129980124268, + "grad_norm": 1.5166923999786377, + "learning_rate": 7.732696897374702e-05, + "loss": 1.8831, + "step": 324 + }, + { + "epoch": 0.011638942109692553, + "grad_norm": 2.4193575382232666, + "learning_rate": 7.756563245823389e-05, + "loss": 2.0361, + "step": 325 + }, + { + "epoch": 0.011674754239260838, + "grad_norm": 1.8523463010787964, + "learning_rate": 7.780429594272077e-05, + "loss": 1.9684, + "step": 326 + }, + { + "epoch": 0.011710566368829122, + "grad_norm": 2.535493850708008, + "learning_rate": 7.804295942720764e-05, + "loss": 2.1888, + "step": 327 + }, + { + "epoch": 0.011746378498397407, + "grad_norm": 2.134873390197754, + "learning_rate": 7.828162291169452e-05, + "loss": 1.826, + "step": 328 + }, + { + "epoch": 0.011782190627965691, + "grad_norm": 2.6702308654785156, + "learning_rate": 7.852028639618139e-05, + "loss": 1.9467, + "step": 329 + }, + { + "epoch": 0.011818002757533978, + "grad_norm": 1.7101609706878662, + "learning_rate": 7.875894988066827e-05, + "loss": 2.0146, + "step": 330 + }, + { + "epoch": 0.011853814887102262, + "grad_norm": 1.9963518381118774, + "learning_rate": 7.899761336515514e-05, + "loss": 1.8542, + "step": 331 + }, + { + "epoch": 0.011889627016670547, + "grad_norm": 2.23158597946167, + "learning_rate": 7.9236276849642e-05, + "loss": 1.895, + "step": 332 + }, + { + "epoch": 0.011925439146238831, + "grad_norm": 2.7184247970581055, + "learning_rate": 7.947494033412887e-05, + "loss": 2.2446, + "step": 333 + }, + { + "epoch": 0.011961251275807116, + "grad_norm": 2.315206527709961, + "learning_rate": 7.971360381861575e-05, + "loss": 2.2946, + "step": 334 + }, + { + "epoch": 0.0119970634053754, + "grad_norm": 2.8590247631073, + "learning_rate": 7.995226730310262e-05, + "loss": 2.0968, + "step": 335 + }, + { + "epoch": 0.012032875534943685, + "grad_norm": 2.333500623703003, + "learning_rate": 8.01909307875895e-05, + "loss": 2.2555, + "step": 336 + }, + { + "epoch": 0.01206868766451197, + "grad_norm": 1.6746714115142822, + "learning_rate": 8.042959427207638e-05, + "loss": 1.9062, + "step": 337 + }, + { + "epoch": 0.012104499794080254, + "grad_norm": 2.6714026927948, + "learning_rate": 8.066825775656325e-05, + "loss": 2.5086, + "step": 338 + }, + { + "epoch": 0.01214031192364854, + "grad_norm": 4.634089469909668, + "learning_rate": 8.090692124105012e-05, + "loss": 2.0091, + "step": 339 + }, + { + "epoch": 0.012176124053216825, + "grad_norm": 2.3018248081207275, + "learning_rate": 8.114558472553699e-05, + "loss": 2.1707, + "step": 340 + }, + { + "epoch": 0.01221193618278511, + "grad_norm": 2.3117361068725586, + "learning_rate": 8.138424821002387e-05, + "loss": 2.0692, + "step": 341 + }, + { + "epoch": 0.012247748312353394, + "grad_norm": 2.316091299057007, + "learning_rate": 8.162291169451074e-05, + "loss": 2.1788, + "step": 342 + }, + { + "epoch": 0.012283560441921679, + "grad_norm": 1.7986754179000854, + "learning_rate": 8.186157517899762e-05, + "loss": 2.0328, + "step": 343 + }, + { + "epoch": 0.012319372571489963, + "grad_norm": 2.5290677547454834, + "learning_rate": 8.210023866348449e-05, + "loss": 2.2345, + "step": 344 + }, + { + "epoch": 0.012355184701058248, + "grad_norm": 1.8213863372802734, + "learning_rate": 8.233890214797137e-05, + "loss": 1.833, + "step": 345 + }, + { + "epoch": 0.012390996830626532, + "grad_norm": 2.757683753967285, + "learning_rate": 8.257756563245825e-05, + "loss": 2.2434, + "step": 346 + }, + { + "epoch": 0.012426808960194819, + "grad_norm": 2.7653613090515137, + "learning_rate": 8.28162291169451e-05, + "loss": 1.9301, + "step": 347 + }, + { + "epoch": 0.012462621089763103, + "grad_norm": 2.544404983520508, + "learning_rate": 8.305489260143198e-05, + "loss": 2.1534, + "step": 348 + }, + { + "epoch": 0.012498433219331388, + "grad_norm": 2.3338091373443604, + "learning_rate": 8.329355608591885e-05, + "loss": 2.1738, + "step": 349 + }, + { + "epoch": 0.012534245348899672, + "grad_norm": 2.3454766273498535, + "learning_rate": 8.353221957040573e-05, + "loss": 2.1207, + "step": 350 + }, + { + "epoch": 0.012570057478467957, + "grad_norm": 2.4842939376831055, + "learning_rate": 8.37708830548926e-05, + "loss": 1.9311, + "step": 351 + }, + { + "epoch": 0.012605869608036242, + "grad_norm": 3.1244163513183594, + "learning_rate": 8.400954653937948e-05, + "loss": 1.8968, + "step": 352 + }, + { + "epoch": 0.012641681737604526, + "grad_norm": 1.9500224590301514, + "learning_rate": 8.424821002386635e-05, + "loss": 1.9746, + "step": 353 + }, + { + "epoch": 0.01267749386717281, + "grad_norm": 2.537527561187744, + "learning_rate": 8.448687350835323e-05, + "loss": 1.8227, + "step": 354 + }, + { + "epoch": 0.012713305996741097, + "grad_norm": 1.9497244358062744, + "learning_rate": 8.47255369928401e-05, + "loss": 1.9673, + "step": 355 + }, + { + "epoch": 0.012749118126309382, + "grad_norm": 2.9225804805755615, + "learning_rate": 8.496420047732697e-05, + "loss": 2.253, + "step": 356 + }, + { + "epoch": 0.012784930255877666, + "grad_norm": 2.1374170780181885, + "learning_rate": 8.520286396181385e-05, + "loss": 1.8416, + "step": 357 + }, + { + "epoch": 0.01282074238544595, + "grad_norm": 2.318568468093872, + "learning_rate": 8.544152744630072e-05, + "loss": 2.1213, + "step": 358 + }, + { + "epoch": 0.012856554515014235, + "grad_norm": 1.6215802431106567, + "learning_rate": 8.56801909307876e-05, + "loss": 1.9469, + "step": 359 + }, + { + "epoch": 0.01289236664458252, + "grad_norm": 2.4686646461486816, + "learning_rate": 8.591885441527447e-05, + "loss": 2.2494, + "step": 360 + }, + { + "epoch": 0.012928178774150804, + "grad_norm": 1.9479748010635376, + "learning_rate": 8.615751789976135e-05, + "loss": 2.0661, + "step": 361 + }, + { + "epoch": 0.012963990903719089, + "grad_norm": 2.651142120361328, + "learning_rate": 8.639618138424822e-05, + "loss": 2.2081, + "step": 362 + }, + { + "epoch": 0.012999803033287374, + "grad_norm": 2.298008441925049, + "learning_rate": 8.663484486873508e-05, + "loss": 2.0353, + "step": 363 + }, + { + "epoch": 0.01303561516285566, + "grad_norm": 2.419844388961792, + "learning_rate": 8.687350835322196e-05, + "loss": 1.8743, + "step": 364 + }, + { + "epoch": 0.013071427292423944, + "grad_norm": 1.65255868434906, + "learning_rate": 8.711217183770883e-05, + "loss": 1.9959, + "step": 365 + }, + { + "epoch": 0.013107239421992229, + "grad_norm": 2.8441193103790283, + "learning_rate": 8.735083532219571e-05, + "loss": 1.9528, + "step": 366 + }, + { + "epoch": 0.013143051551560514, + "grad_norm": 2.9021151065826416, + "learning_rate": 8.758949880668258e-05, + "loss": 1.7156, + "step": 367 + }, + { + "epoch": 0.013178863681128798, + "grad_norm": 2.2067577838897705, + "learning_rate": 8.782816229116946e-05, + "loss": 1.9428, + "step": 368 + }, + { + "epoch": 0.013214675810697083, + "grad_norm": 1.5661609172821045, + "learning_rate": 8.806682577565633e-05, + "loss": 1.7973, + "step": 369 + }, + { + "epoch": 0.013250487940265367, + "grad_norm": 2.014951467514038, + "learning_rate": 8.83054892601432e-05, + "loss": 2.0513, + "step": 370 + }, + { + "epoch": 0.013286300069833652, + "grad_norm": 2.001088857650757, + "learning_rate": 8.854415274463007e-05, + "loss": 2.0834, + "step": 371 + }, + { + "epoch": 0.013322112199401938, + "grad_norm": 3.38238787651062, + "learning_rate": 8.878281622911695e-05, + "loss": 1.9486, + "step": 372 + }, + { + "epoch": 0.013357924328970223, + "grad_norm": 2.687730550765991, + "learning_rate": 8.902147971360383e-05, + "loss": 2.0036, + "step": 373 + }, + { + "epoch": 0.013393736458538507, + "grad_norm": 2.138909101486206, + "learning_rate": 8.92601431980907e-05, + "loss": 1.8376, + "step": 374 + }, + { + "epoch": 0.013429548588106792, + "grad_norm": 2.748356580734253, + "learning_rate": 8.949880668257758e-05, + "loss": 2.4684, + "step": 375 + }, + { + "epoch": 0.013465360717675076, + "grad_norm": 1.895658016204834, + "learning_rate": 8.973747016706445e-05, + "loss": 2.1803, + "step": 376 + }, + { + "epoch": 0.013501172847243361, + "grad_norm": 2.7984509468078613, + "learning_rate": 8.997613365155131e-05, + "loss": 2.166, + "step": 377 + }, + { + "epoch": 0.013536984976811646, + "grad_norm": 4.1571173667907715, + "learning_rate": 9.021479713603818e-05, + "loss": 2.1634, + "step": 378 + }, + { + "epoch": 0.01357279710637993, + "grad_norm": 2.012040853500366, + "learning_rate": 9.045346062052506e-05, + "loss": 1.6297, + "step": 379 + }, + { + "epoch": 0.013608609235948217, + "grad_norm": 1.9994226694107056, + "learning_rate": 9.069212410501193e-05, + "loss": 1.7911, + "step": 380 + }, + { + "epoch": 0.013644421365516501, + "grad_norm": 2.6208250522613525, + "learning_rate": 9.093078758949881e-05, + "loss": 1.94, + "step": 381 + }, + { + "epoch": 0.013680233495084786, + "grad_norm": 3.200495719909668, + "learning_rate": 9.116945107398569e-05, + "loss": 1.9532, + "step": 382 + }, + { + "epoch": 0.01371604562465307, + "grad_norm": 1.4798295497894287, + "learning_rate": 9.140811455847256e-05, + "loss": 2.0838, + "step": 383 + }, + { + "epoch": 0.013751857754221355, + "grad_norm": 2.466656446456909, + "learning_rate": 9.164677804295944e-05, + "loss": 2.0837, + "step": 384 + }, + { + "epoch": 0.01378766988378964, + "grad_norm": 2.8002874851226807, + "learning_rate": 9.18854415274463e-05, + "loss": 2.3469, + "step": 385 + }, + { + "epoch": 0.013823482013357924, + "grad_norm": 2.267125368118286, + "learning_rate": 9.212410501193318e-05, + "loss": 2.1056, + "step": 386 + }, + { + "epoch": 0.013859294142926208, + "grad_norm": 2.0536201000213623, + "learning_rate": 9.236276849642005e-05, + "loss": 2.2548, + "step": 387 + }, + { + "epoch": 0.013895106272494493, + "grad_norm": 1.349810242652893, + "learning_rate": 9.260143198090693e-05, + "loss": 2.0713, + "step": 388 + }, + { + "epoch": 0.01393091840206278, + "grad_norm": 2.174856424331665, + "learning_rate": 9.28400954653938e-05, + "loss": 1.9262, + "step": 389 + }, + { + "epoch": 0.013966730531631064, + "grad_norm": 1.979978322982788, + "learning_rate": 9.307875894988068e-05, + "loss": 2.2743, + "step": 390 + }, + { + "epoch": 0.014002542661199349, + "grad_norm": 1.4002437591552734, + "learning_rate": 9.331742243436754e-05, + "loss": 1.8055, + "step": 391 + }, + { + "epoch": 0.014038354790767633, + "grad_norm": 1.8472005128860474, + "learning_rate": 9.355608591885443e-05, + "loss": 2.1736, + "step": 392 + }, + { + "epoch": 0.014074166920335918, + "grad_norm": 1.950952172279358, + "learning_rate": 9.379474940334129e-05, + "loss": 2.1521, + "step": 393 + }, + { + "epoch": 0.014109979049904202, + "grad_norm": 1.622833251953125, + "learning_rate": 9.403341288782816e-05, + "loss": 1.8347, + "step": 394 + }, + { + "epoch": 0.014145791179472487, + "grad_norm": 2.020577907562256, + "learning_rate": 9.427207637231504e-05, + "loss": 2.0858, + "step": 395 + }, + { + "epoch": 0.014181603309040771, + "grad_norm": 1.5022003650665283, + "learning_rate": 9.451073985680191e-05, + "loss": 1.8131, + "step": 396 + }, + { + "epoch": 0.014217415438609058, + "grad_norm": 1.293748140335083, + "learning_rate": 9.474940334128879e-05, + "loss": 1.7116, + "step": 397 + }, + { + "epoch": 0.014253227568177342, + "grad_norm": 3.8732552528381348, + "learning_rate": 9.498806682577566e-05, + "loss": 2.359, + "step": 398 + }, + { + "epoch": 0.014289039697745627, + "grad_norm": 2.218346118927002, + "learning_rate": 9.522673031026254e-05, + "loss": 2.3042, + "step": 399 + }, + { + "epoch": 0.014324851827313911, + "grad_norm": 1.8491095304489136, + "learning_rate": 9.546539379474941e-05, + "loss": 1.8915, + "step": 400 + }, + { + "epoch": 0.014360663956882196, + "grad_norm": 2.9908032417297363, + "learning_rate": 9.570405727923628e-05, + "loss": 2.0991, + "step": 401 + }, + { + "epoch": 0.01439647608645048, + "grad_norm": 6.480159282684326, + "learning_rate": 9.594272076372316e-05, + "loss": 2.5134, + "step": 402 + }, + { + "epoch": 0.014432288216018765, + "grad_norm": 1.737426996231079, + "learning_rate": 9.618138424821003e-05, + "loss": 1.8967, + "step": 403 + }, + { + "epoch": 0.01446810034558705, + "grad_norm": 1.8148224353790283, + "learning_rate": 9.64200477326969e-05, + "loss": 1.7998, + "step": 404 + }, + { + "epoch": 0.014503912475155336, + "grad_norm": 2.05126690864563, + "learning_rate": 9.665871121718377e-05, + "loss": 1.8313, + "step": 405 + }, + { + "epoch": 0.01453972460472362, + "grad_norm": 3.2550830841064453, + "learning_rate": 9.689737470167066e-05, + "loss": 2.2962, + "step": 406 + }, + { + "epoch": 0.014575536734291905, + "grad_norm": 2.001216411590576, + "learning_rate": 9.713603818615752e-05, + "loss": 2.031, + "step": 407 + }, + { + "epoch": 0.01461134886386019, + "grad_norm": 1.8144034147262573, + "learning_rate": 9.737470167064439e-05, + "loss": 1.675, + "step": 408 + }, + { + "epoch": 0.014647160993428474, + "grad_norm": 2.0882346630096436, + "learning_rate": 9.761336515513126e-05, + "loss": 2.0904, + "step": 409 + }, + { + "epoch": 0.014682973122996759, + "grad_norm": 2.1910388469696045, + "learning_rate": 9.785202863961814e-05, + "loss": 1.8463, + "step": 410 + }, + { + "epoch": 0.014718785252565043, + "grad_norm": 1.9278587102890015, + "learning_rate": 9.809069212410502e-05, + "loss": 2.0166, + "step": 411 + }, + { + "epoch": 0.014754597382133328, + "grad_norm": 2.418215751647949, + "learning_rate": 9.832935560859189e-05, + "loss": 2.3441, + "step": 412 + }, + { + "epoch": 0.014790409511701613, + "grad_norm": 1.1429497003555298, + "learning_rate": 9.856801909307877e-05, + "loss": 1.628, + "step": 413 + }, + { + "epoch": 0.014826221641269899, + "grad_norm": 2.355159044265747, + "learning_rate": 9.880668257756564e-05, + "loss": 2.1252, + "step": 414 + }, + { + "epoch": 0.014862033770838183, + "grad_norm": 1.5233746767044067, + "learning_rate": 9.90453460620525e-05, + "loss": 1.9219, + "step": 415 + }, + { + "epoch": 0.014897845900406468, + "grad_norm": 1.9501924514770508, + "learning_rate": 9.928400954653937e-05, + "loss": 1.6977, + "step": 416 + }, + { + "epoch": 0.014933658029974753, + "grad_norm": 2.0013651847839355, + "learning_rate": 9.952267303102626e-05, + "loss": 2.1085, + "step": 417 + }, + { + "epoch": 0.014969470159543037, + "grad_norm": 2.1112027168273926, + "learning_rate": 9.976133651551312e-05, + "loss": 2.0397, + "step": 418 + }, + { + "epoch": 0.015005282289111322, + "grad_norm": 1.9288190603256226, + "learning_rate": 0.0001, + "loss": 2.0879, + "step": 419 + }, + { + "epoch": 0.015041094418679606, + "grad_norm": 2.2626709938049316, + "learning_rate": 0.00010023866348448687, + "loss": 2.2272, + "step": 420 + }, + { + "epoch": 0.01507690654824789, + "grad_norm": 1.7174164056777954, + "learning_rate": 0.00010047732696897377, + "loss": 1.9436, + "step": 421 + }, + { + "epoch": 0.015112718677816177, + "grad_norm": 2.5441973209381104, + "learning_rate": 0.00010071599045346064, + "loss": 2.2915, + "step": 422 + }, + { + "epoch": 0.015148530807384462, + "grad_norm": 1.7958893775939941, + "learning_rate": 0.0001009546539379475, + "loss": 1.9233, + "step": 423 + }, + { + "epoch": 0.015184342936952746, + "grad_norm": 2.6941654682159424, + "learning_rate": 0.00010119331742243436, + "loss": 2.0398, + "step": 424 + }, + { + "epoch": 0.01522015506652103, + "grad_norm": 2.385948657989502, + "learning_rate": 0.00010143198090692125, + "loss": 2.2552, + "step": 425 + }, + { + "epoch": 0.015255967196089315, + "grad_norm": 2.532027244567871, + "learning_rate": 0.00010167064439140812, + "loss": 2.3462, + "step": 426 + }, + { + "epoch": 0.0152917793256576, + "grad_norm": 1.7785552740097046, + "learning_rate": 0.00010190930787589499, + "loss": 2.1202, + "step": 427 + }, + { + "epoch": 0.015327591455225885, + "grad_norm": 4.266390323638916, + "learning_rate": 0.00010214797136038186, + "loss": 1.7885, + "step": 428 + }, + { + "epoch": 0.015363403584794169, + "grad_norm": 1.9129207134246826, + "learning_rate": 0.00010238663484486875, + "loss": 1.9795, + "step": 429 + }, + { + "epoch": 0.015399215714362455, + "grad_norm": 2.3568286895751953, + "learning_rate": 0.00010262529832935562, + "loss": 2.1897, + "step": 430 + }, + { + "epoch": 0.01543502784393074, + "grad_norm": 2.463308811187744, + "learning_rate": 0.00010286396181384249, + "loss": 2.191, + "step": 431 + }, + { + "epoch": 0.015470839973499025, + "grad_norm": 2.999436855316162, + "learning_rate": 0.00010310262529832937, + "loss": 2.1384, + "step": 432 + }, + { + "epoch": 0.01550665210306731, + "grad_norm": 2.8959248065948486, + "learning_rate": 0.00010334128878281624, + "loss": 2.5383, + "step": 433 + }, + { + "epoch": 0.015542464232635594, + "grad_norm": 1.6721136569976807, + "learning_rate": 0.0001035799522673031, + "loss": 2.1312, + "step": 434 + }, + { + "epoch": 0.015578276362203878, + "grad_norm": 2.227095365524292, + "learning_rate": 0.00010381861575178997, + "loss": 1.9225, + "step": 435 + }, + { + "epoch": 0.015614088491772163, + "grad_norm": 2.0040481090545654, + "learning_rate": 0.00010405727923627687, + "loss": 1.9404, + "step": 436 + }, + { + "epoch": 0.01564990062134045, + "grad_norm": 2.118105173110962, + "learning_rate": 0.00010429594272076373, + "loss": 2.0456, + "step": 437 + }, + { + "epoch": 0.015685712750908732, + "grad_norm": 1.629913568496704, + "learning_rate": 0.0001045346062052506, + "loss": 1.9724, + "step": 438 + }, + { + "epoch": 0.01572152488047702, + "grad_norm": 1.544683575630188, + "learning_rate": 0.00010477326968973748, + "loss": 2.3597, + "step": 439 + }, + { + "epoch": 0.0157573370100453, + "grad_norm": 2.007746934890747, + "learning_rate": 0.00010501193317422435, + "loss": 2.1682, + "step": 440 + }, + { + "epoch": 0.015793149139613587, + "grad_norm": 2.2837698459625244, + "learning_rate": 0.00010525059665871122, + "loss": 2.0777, + "step": 441 + }, + { + "epoch": 0.01582896126918187, + "grad_norm": 2.146202802658081, + "learning_rate": 0.00010548926014319809, + "loss": 2.0073, + "step": 442 + }, + { + "epoch": 0.015864773398750157, + "grad_norm": 2.1583595275878906, + "learning_rate": 0.00010572792362768498, + "loss": 1.7767, + "step": 443 + }, + { + "epoch": 0.015900585528318443, + "grad_norm": 2.5789666175842285, + "learning_rate": 0.00010596658711217185, + "loss": 2.2289, + "step": 444 + }, + { + "epoch": 0.015936397657886726, + "grad_norm": 2.8515987396240234, + "learning_rate": 0.00010620525059665872, + "loss": 2.1816, + "step": 445 + }, + { + "epoch": 0.015972209787455012, + "grad_norm": 2.2290050983428955, + "learning_rate": 0.00010644391408114558, + "loss": 2.1232, + "step": 446 + }, + { + "epoch": 0.016008021917023295, + "grad_norm": 1.601799726486206, + "learning_rate": 0.00010668257756563247, + "loss": 2.1464, + "step": 447 + }, + { + "epoch": 0.01604383404659158, + "grad_norm": 1.905332088470459, + "learning_rate": 0.00010692124105011933, + "loss": 1.8095, + "step": 448 + }, + { + "epoch": 0.016079646176159864, + "grad_norm": 2.2088615894317627, + "learning_rate": 0.0001071599045346062, + "loss": 2.4517, + "step": 449 + }, + { + "epoch": 0.01611545830572815, + "grad_norm": 1.835677146911621, + "learning_rate": 0.0001073985680190931, + "loss": 2.1423, + "step": 450 + }, + { + "epoch": 0.016151270435296437, + "grad_norm": 1.556067705154419, + "learning_rate": 0.00010763723150357996, + "loss": 1.8783, + "step": 451 + }, + { + "epoch": 0.01618708256486472, + "grad_norm": 2.0234811305999756, + "learning_rate": 0.00010787589498806683, + "loss": 2.0476, + "step": 452 + }, + { + "epoch": 0.016222894694433006, + "grad_norm": 2.6546096801757812, + "learning_rate": 0.0001081145584725537, + "loss": 1.9325, + "step": 453 + }, + { + "epoch": 0.01625870682400129, + "grad_norm": 1.8908900022506714, + "learning_rate": 0.00010835322195704058, + "loss": 1.9481, + "step": 454 + }, + { + "epoch": 0.016294518953569575, + "grad_norm": 1.8198904991149902, + "learning_rate": 0.00010859188544152745, + "loss": 1.918, + "step": 455 + }, + { + "epoch": 0.016330331083137858, + "grad_norm": 1.573096752166748, + "learning_rate": 0.00010883054892601432, + "loss": 2.0613, + "step": 456 + }, + { + "epoch": 0.016366143212706144, + "grad_norm": 1.9454529285430908, + "learning_rate": 0.00010906921241050121, + "loss": 1.9583, + "step": 457 + }, + { + "epoch": 0.016401955342274427, + "grad_norm": 1.7274153232574463, + "learning_rate": 0.00010930787589498808, + "loss": 1.7381, + "step": 458 + }, + { + "epoch": 0.016437767471842713, + "grad_norm": 1.4126781225204468, + "learning_rate": 0.00010954653937947495, + "loss": 1.7259, + "step": 459 + }, + { + "epoch": 0.016473579601411, + "grad_norm": 2.398538589477539, + "learning_rate": 0.00010978520286396181, + "loss": 2.2239, + "step": 460 + }, + { + "epoch": 0.016509391730979282, + "grad_norm": 1.7049897909164429, + "learning_rate": 0.0001100238663484487, + "loss": 1.7488, + "step": 461 + }, + { + "epoch": 0.01654520386054757, + "grad_norm": 1.9099923372268677, + "learning_rate": 0.00011026252983293556, + "loss": 1.8568, + "step": 462 + }, + { + "epoch": 0.01658101599011585, + "grad_norm": 1.5508618354797363, + "learning_rate": 0.00011050119331742243, + "loss": 1.9427, + "step": 463 + }, + { + "epoch": 0.016616828119684138, + "grad_norm": 3.0816149711608887, + "learning_rate": 0.0001107398568019093, + "loss": 2.3965, + "step": 464 + }, + { + "epoch": 0.01665264024925242, + "grad_norm": 1.8753119707107544, + "learning_rate": 0.0001109785202863962, + "loss": 1.9468, + "step": 465 + }, + { + "epoch": 0.016688452378820707, + "grad_norm": 1.9726189374923706, + "learning_rate": 0.00011121718377088306, + "loss": 1.7688, + "step": 466 + }, + { + "epoch": 0.01672426450838899, + "grad_norm": 1.5842738151550293, + "learning_rate": 0.00011145584725536993, + "loss": 1.8443, + "step": 467 + }, + { + "epoch": 0.016760076637957276, + "grad_norm": 2.3097848892211914, + "learning_rate": 0.00011169451073985681, + "loss": 2.4292, + "step": 468 + }, + { + "epoch": 0.016795888767525562, + "grad_norm": 1.7131446599960327, + "learning_rate": 0.00011193317422434368, + "loss": 1.9989, + "step": 469 + }, + { + "epoch": 0.016831700897093845, + "grad_norm": 1.544679045677185, + "learning_rate": 0.00011217183770883055, + "loss": 1.9196, + "step": 470 + }, + { + "epoch": 0.01686751302666213, + "grad_norm": 2.726846218109131, + "learning_rate": 0.00011241050119331741, + "loss": 2.1967, + "step": 471 + }, + { + "epoch": 0.016903325156230414, + "grad_norm": 3.2844207286834717, + "learning_rate": 0.00011264916467780431, + "loss": 1.8285, + "step": 472 + }, + { + "epoch": 0.0169391372857987, + "grad_norm": 1.295161247253418, + "learning_rate": 0.00011288782816229118, + "loss": 1.9555, + "step": 473 + }, + { + "epoch": 0.016974949415366983, + "grad_norm": 2.582324266433716, + "learning_rate": 0.00011312649164677805, + "loss": 2.1132, + "step": 474 + }, + { + "epoch": 0.01701076154493527, + "grad_norm": 1.7749077081680298, + "learning_rate": 0.00011336515513126493, + "loss": 2.0865, + "step": 475 + }, + { + "epoch": 0.017046573674503556, + "grad_norm": 2.0523269176483154, + "learning_rate": 0.0001136038186157518, + "loss": 2.2231, + "step": 476 + }, + { + "epoch": 0.01708238580407184, + "grad_norm": 3.3681750297546387, + "learning_rate": 0.00011384248210023866, + "loss": 2.6565, + "step": 477 + }, + { + "epoch": 0.017118197933640125, + "grad_norm": 2.334472417831421, + "learning_rate": 0.00011408114558472553, + "loss": 2.1217, + "step": 478 + }, + { + "epoch": 0.017154010063208408, + "grad_norm": 2.677645444869995, + "learning_rate": 0.00011431980906921242, + "loss": 2.2404, + "step": 479 + }, + { + "epoch": 0.017189822192776694, + "grad_norm": 2.0976386070251465, + "learning_rate": 0.00011455847255369929, + "loss": 2.2931, + "step": 480 + }, + { + "epoch": 0.017225634322344977, + "grad_norm": 1.753631591796875, + "learning_rate": 0.00011479713603818616, + "loss": 1.9063, + "step": 481 + }, + { + "epoch": 0.017261446451913263, + "grad_norm": 1.7365729808807373, + "learning_rate": 0.00011503579952267303, + "loss": 1.9847, + "step": 482 + }, + { + "epoch": 0.017297258581481546, + "grad_norm": 2.4236369132995605, + "learning_rate": 0.00011527446300715991, + "loss": 1.9288, + "step": 483 + }, + { + "epoch": 0.017333070711049833, + "grad_norm": 1.8561084270477295, + "learning_rate": 0.00011551312649164678, + "loss": 1.743, + "step": 484 + }, + { + "epoch": 0.01736888284061812, + "grad_norm": 1.5186161994934082, + "learning_rate": 0.00011575178997613365, + "loss": 2.0968, + "step": 485 + }, + { + "epoch": 0.017404694970186402, + "grad_norm": 2.043816089630127, + "learning_rate": 0.00011599045346062054, + "loss": 2.2149, + "step": 486 + }, + { + "epoch": 0.017440507099754688, + "grad_norm": 2.2540132999420166, + "learning_rate": 0.00011622911694510741, + "loss": 2.089, + "step": 487 + }, + { + "epoch": 0.01747631922932297, + "grad_norm": 1.8196076154708862, + "learning_rate": 0.00011646778042959428, + "loss": 2.051, + "step": 488 + }, + { + "epoch": 0.017512131358891257, + "grad_norm": 1.6149146556854248, + "learning_rate": 0.00011670644391408114, + "loss": 1.991, + "step": 489 + }, + { + "epoch": 0.01754794348845954, + "grad_norm": 1.9776016473770142, + "learning_rate": 0.00011694510739856804, + "loss": 2.2675, + "step": 490 + }, + { + "epoch": 0.017583755618027826, + "grad_norm": 2.040417432785034, + "learning_rate": 0.0001171837708830549, + "loss": 2.0448, + "step": 491 + }, + { + "epoch": 0.01761956774759611, + "grad_norm": 1.90510094165802, + "learning_rate": 0.00011742243436754176, + "loss": 2.0016, + "step": 492 + }, + { + "epoch": 0.017655379877164395, + "grad_norm": 1.4178807735443115, + "learning_rate": 0.00011766109785202863, + "loss": 2.1013, + "step": 493 + }, + { + "epoch": 0.017691192006732682, + "grad_norm": 3.005915641784668, + "learning_rate": 0.00011789976133651552, + "loss": 2.3571, + "step": 494 + }, + { + "epoch": 0.017727004136300965, + "grad_norm": 1.7287548780441284, + "learning_rate": 0.00011813842482100239, + "loss": 1.8362, + "step": 495 + }, + { + "epoch": 0.01776281626586925, + "grad_norm": 2.2629318237304688, + "learning_rate": 0.00011837708830548926, + "loss": 2.0679, + "step": 496 + }, + { + "epoch": 0.017798628395437534, + "grad_norm": 2.3710145950317383, + "learning_rate": 0.00011861575178997615, + "loss": 2.2409, + "step": 497 + }, + { + "epoch": 0.01783444052500582, + "grad_norm": 2.7783234119415283, + "learning_rate": 0.00011885441527446302, + "loss": 2.2683, + "step": 498 + }, + { + "epoch": 0.017870252654574103, + "grad_norm": 2.343177556991577, + "learning_rate": 0.00011909307875894989, + "loss": 1.9737, + "step": 499 + }, + { + "epoch": 0.01790606478414239, + "grad_norm": 1.3416262865066528, + "learning_rate": 0.00011933174224343676, + "loss": 1.7674, + "step": 500 + }, + { + "epoch": 0.017941876913710676, + "grad_norm": 2.2444570064544678, + "learning_rate": 0.00011957040572792364, + "loss": 2.3294, + "step": 501 + }, + { + "epoch": 0.01797768904327896, + "grad_norm": 2.3115077018737793, + "learning_rate": 0.0001198090692124105, + "loss": 2.3566, + "step": 502 + }, + { + "epoch": 0.018013501172847245, + "grad_norm": 1.9268572330474854, + "learning_rate": 0.00012004773269689737, + "loss": 1.8323, + "step": 503 + }, + { + "epoch": 0.018049313302415528, + "grad_norm": 3.032489061355591, + "learning_rate": 0.00012028639618138427, + "loss": 1.9294, + "step": 504 + }, + { + "epoch": 0.018085125431983814, + "grad_norm": 1.1588308811187744, + "learning_rate": 0.00012052505966587114, + "loss": 1.9001, + "step": 505 + }, + { + "epoch": 0.018120937561552097, + "grad_norm": 1.5301387310028076, + "learning_rate": 0.000120763723150358, + "loss": 1.9317, + "step": 506 + }, + { + "epoch": 0.018156749691120383, + "grad_norm": 2.175445079803467, + "learning_rate": 0.00012100238663484487, + "loss": 1.7914, + "step": 507 + }, + { + "epoch": 0.018192561820688666, + "grad_norm": 1.8067626953125, + "learning_rate": 0.00012124105011933175, + "loss": 1.9488, + "step": 508 + }, + { + "epoch": 0.018228373950256952, + "grad_norm": 2.2004895210266113, + "learning_rate": 0.00012147971360381862, + "loss": 1.9453, + "step": 509 + }, + { + "epoch": 0.01826418607982524, + "grad_norm": 1.2897497415542603, + "learning_rate": 0.00012171837708830549, + "loss": 1.8825, + "step": 510 + }, + { + "epoch": 0.01829999820939352, + "grad_norm": 1.6512136459350586, + "learning_rate": 0.00012195704057279236, + "loss": 2.1278, + "step": 511 + }, + { + "epoch": 0.018335810338961808, + "grad_norm": 1.7471429109573364, + "learning_rate": 0.00012219570405727924, + "loss": 1.994, + "step": 512 + }, + { + "epoch": 0.01837162246853009, + "grad_norm": 1.75509774684906, + "learning_rate": 0.0001224343675417661, + "loss": 1.9485, + "step": 513 + }, + { + "epoch": 0.018407434598098377, + "grad_norm": 1.647789478302002, + "learning_rate": 0.00012267303102625297, + "loss": 1.8526, + "step": 514 + }, + { + "epoch": 0.01844324672766666, + "grad_norm": 1.4581859111785889, + "learning_rate": 0.00012291169451073987, + "loss": 1.9781, + "step": 515 + }, + { + "epoch": 0.018479058857234946, + "grad_norm": 1.680022120475769, + "learning_rate": 0.00012315035799522674, + "loss": 1.9252, + "step": 516 + }, + { + "epoch": 0.01851487098680323, + "grad_norm": 1.8121978044509888, + "learning_rate": 0.0001233890214797136, + "loss": 1.5773, + "step": 517 + }, + { + "epoch": 0.018550683116371515, + "grad_norm": 2.650664806365967, + "learning_rate": 0.00012362768496420047, + "loss": 2.358, + "step": 518 + }, + { + "epoch": 0.0185864952459398, + "grad_norm": 1.9430338144302368, + "learning_rate": 0.00012386634844868737, + "loss": 2.1443, + "step": 519 + }, + { + "epoch": 0.018622307375508084, + "grad_norm": 1.728421688079834, + "learning_rate": 0.00012410501193317423, + "loss": 1.8808, + "step": 520 + }, + { + "epoch": 0.01865811950507637, + "grad_norm": 2.1015219688415527, + "learning_rate": 0.0001243436754176611, + "loss": 1.8069, + "step": 521 + }, + { + "epoch": 0.018693931634644653, + "grad_norm": 3.219109058380127, + "learning_rate": 0.000124582338902148, + "loss": 1.9532, + "step": 522 + }, + { + "epoch": 0.01872974376421294, + "grad_norm": 1.7970722913742065, + "learning_rate": 0.00012482100238663487, + "loss": 2.1112, + "step": 523 + }, + { + "epoch": 0.018765555893781222, + "grad_norm": 2.274960517883301, + "learning_rate": 0.00012505966587112173, + "loss": 1.6838, + "step": 524 + }, + { + "epoch": 0.01880136802334951, + "grad_norm": 1.3818376064300537, + "learning_rate": 0.0001252983293556086, + "loss": 1.9166, + "step": 525 + }, + { + "epoch": 0.018837180152917795, + "grad_norm": 1.6130000352859497, + "learning_rate": 0.00012553699284009547, + "loss": 2.2296, + "step": 526 + }, + { + "epoch": 0.018872992282486078, + "grad_norm": 1.9274317026138306, + "learning_rate": 0.00012577565632458234, + "loss": 2.098, + "step": 527 + }, + { + "epoch": 0.018908804412054364, + "grad_norm": 1.789971947669983, + "learning_rate": 0.0001260143198090692, + "loss": 1.7194, + "step": 528 + }, + { + "epoch": 0.018944616541622647, + "grad_norm": 1.8698501586914062, + "learning_rate": 0.00012625298329355607, + "loss": 2.3349, + "step": 529 + }, + { + "epoch": 0.018980428671190933, + "grad_norm": 1.8852440118789673, + "learning_rate": 0.00012649164677804297, + "loss": 1.918, + "step": 530 + }, + { + "epoch": 0.019016240800759216, + "grad_norm": 2.258939743041992, + "learning_rate": 0.00012673031026252983, + "loss": 2.038, + "step": 531 + }, + { + "epoch": 0.019052052930327502, + "grad_norm": 1.3770029544830322, + "learning_rate": 0.0001269689737470167, + "loss": 1.8361, + "step": 532 + }, + { + "epoch": 0.019087865059895785, + "grad_norm": 1.3910750150680542, + "learning_rate": 0.0001272076372315036, + "loss": 2.0795, + "step": 533 + }, + { + "epoch": 0.01912367718946407, + "grad_norm": 1.963090181350708, + "learning_rate": 0.00012744630071599047, + "loss": 2.2053, + "step": 534 + }, + { + "epoch": 0.019159489319032358, + "grad_norm": 2.1430723667144775, + "learning_rate": 0.00012768496420047733, + "loss": 1.7194, + "step": 535 + }, + { + "epoch": 0.01919530144860064, + "grad_norm": 1.6133919954299927, + "learning_rate": 0.0001279236276849642, + "loss": 1.821, + "step": 536 + }, + { + "epoch": 0.019231113578168927, + "grad_norm": 1.79860258102417, + "learning_rate": 0.0001281622911694511, + "loss": 1.9787, + "step": 537 + }, + { + "epoch": 0.01926692570773721, + "grad_norm": 1.847287893295288, + "learning_rate": 0.00012840095465393796, + "loss": 2.0806, + "step": 538 + }, + { + "epoch": 0.019302737837305496, + "grad_norm": 1.458694338798523, + "learning_rate": 0.00012863961813842483, + "loss": 2.1716, + "step": 539 + }, + { + "epoch": 0.01933854996687378, + "grad_norm": 2.063096761703491, + "learning_rate": 0.0001288782816229117, + "loss": 1.9751, + "step": 540 + }, + { + "epoch": 0.019374362096442065, + "grad_norm": 1.8983663320541382, + "learning_rate": 0.00012911694510739857, + "loss": 1.9129, + "step": 541 + }, + { + "epoch": 0.019410174226010348, + "grad_norm": 2.726100444793701, + "learning_rate": 0.00012935560859188543, + "loss": 2.3521, + "step": 542 + }, + { + "epoch": 0.019445986355578634, + "grad_norm": 1.60848069190979, + "learning_rate": 0.0001295942720763723, + "loss": 1.673, + "step": 543 + }, + { + "epoch": 0.01948179848514692, + "grad_norm": 1.7081117630004883, + "learning_rate": 0.0001298329355608592, + "loss": 2.2052, + "step": 544 + }, + { + "epoch": 0.019517610614715204, + "grad_norm": 1.2822455167770386, + "learning_rate": 0.00013007159904534607, + "loss": 1.8536, + "step": 545 + }, + { + "epoch": 0.01955342274428349, + "grad_norm": 1.9416059255599976, + "learning_rate": 0.00013031026252983293, + "loss": 2.3582, + "step": 546 + }, + { + "epoch": 0.019589234873851773, + "grad_norm": 1.0728063583374023, + "learning_rate": 0.0001305489260143198, + "loss": 1.6316, + "step": 547 + }, + { + "epoch": 0.01962504700342006, + "grad_norm": 1.4513641595840454, + "learning_rate": 0.0001307875894988067, + "loss": 1.9188, + "step": 548 + }, + { + "epoch": 0.019660859132988342, + "grad_norm": 1.5579917430877686, + "learning_rate": 0.00013102625298329356, + "loss": 1.758, + "step": 549 + }, + { + "epoch": 0.019696671262556628, + "grad_norm": 1.304106593132019, + "learning_rate": 0.00013126491646778043, + "loss": 1.9442, + "step": 550 + }, + { + "epoch": 0.01973248339212491, + "grad_norm": 1.6284905672073364, + "learning_rate": 0.00013150357995226733, + "loss": 2.0928, + "step": 551 + }, + { + "epoch": 0.019768295521693197, + "grad_norm": 2.598228931427002, + "learning_rate": 0.0001317422434367542, + "loss": 1.9553, + "step": 552 + }, + { + "epoch": 0.019804107651261484, + "grad_norm": 2.1815645694732666, + "learning_rate": 0.00013198090692124106, + "loss": 2.2695, + "step": 553 + }, + { + "epoch": 0.019839919780829766, + "grad_norm": 1.7056655883789062, + "learning_rate": 0.00013221957040572793, + "loss": 1.9625, + "step": 554 + }, + { + "epoch": 0.019875731910398053, + "grad_norm": 1.9924041032791138, + "learning_rate": 0.00013245823389021482, + "loss": 2.1729, + "step": 555 + }, + { + "epoch": 0.019911544039966336, + "grad_norm": 1.7806648015975952, + "learning_rate": 0.00013269689737470167, + "loss": 1.845, + "step": 556 + }, + { + "epoch": 0.019947356169534622, + "grad_norm": 1.9855705499649048, + "learning_rate": 0.00013293556085918853, + "loss": 1.8858, + "step": 557 + }, + { + "epoch": 0.019983168299102905, + "grad_norm": 2.107879161834717, + "learning_rate": 0.0001331742243436754, + "loss": 2.1954, + "step": 558 + }, + { + "epoch": 0.02001898042867119, + "grad_norm": 1.5310916900634766, + "learning_rate": 0.0001334128878281623, + "loss": 2.0236, + "step": 559 + }, + { + "epoch": 0.020054792558239477, + "grad_norm": 1.5715898275375366, + "learning_rate": 0.00013365155131264916, + "loss": 2.3037, + "step": 560 + }, + { + "epoch": 0.02009060468780776, + "grad_norm": 1.846575379371643, + "learning_rate": 0.00013389021479713603, + "loss": 1.9472, + "step": 561 + }, + { + "epoch": 0.020126416817376046, + "grad_norm": 1.5027644634246826, + "learning_rate": 0.00013412887828162293, + "loss": 1.8978, + "step": 562 + }, + { + "epoch": 0.02016222894694433, + "grad_norm": 2.3320515155792236, + "learning_rate": 0.0001343675417661098, + "loss": 2.0998, + "step": 563 + }, + { + "epoch": 0.020198041076512616, + "grad_norm": 1.4879544973373413, + "learning_rate": 0.00013460620525059666, + "loss": 2.0195, + "step": 564 + }, + { + "epoch": 0.0202338532060809, + "grad_norm": 1.6790188550949097, + "learning_rate": 0.00013484486873508353, + "loss": 2.0034, + "step": 565 + }, + { + "epoch": 0.020269665335649185, + "grad_norm": 1.3368083238601685, + "learning_rate": 0.00013508353221957042, + "loss": 1.853, + "step": 566 + }, + { + "epoch": 0.020305477465217468, + "grad_norm": 2.40515398979187, + "learning_rate": 0.0001353221957040573, + "loss": 1.6864, + "step": 567 + }, + { + "epoch": 0.020341289594785754, + "grad_norm": 3.6255276203155518, + "learning_rate": 0.00013556085918854416, + "loss": 2.0015, + "step": 568 + }, + { + "epoch": 0.02037710172435404, + "grad_norm": 1.3990453481674194, + "learning_rate": 0.00013579952267303105, + "loss": 1.9736, + "step": 569 + }, + { + "epoch": 0.020412913853922323, + "grad_norm": 1.815877079963684, + "learning_rate": 0.00013603818615751792, + "loss": 2.0062, + "step": 570 + }, + { + "epoch": 0.02044872598349061, + "grad_norm": 2.521155595779419, + "learning_rate": 0.0001362768496420048, + "loss": 2.2658, + "step": 571 + }, + { + "epoch": 0.020484538113058892, + "grad_norm": 1.6638094186782837, + "learning_rate": 0.00013651551312649166, + "loss": 2.0926, + "step": 572 + }, + { + "epoch": 0.02052035024262718, + "grad_norm": 1.9844567775726318, + "learning_rate": 0.00013675417661097853, + "loss": 2.2214, + "step": 573 + }, + { + "epoch": 0.02055616237219546, + "grad_norm": 1.8711490631103516, + "learning_rate": 0.0001369928400954654, + "loss": 1.9504, + "step": 574 + }, + { + "epoch": 0.020591974501763748, + "grad_norm": 2.035768985748291, + "learning_rate": 0.00013723150357995226, + "loss": 1.8715, + "step": 575 + }, + { + "epoch": 0.02062778663133203, + "grad_norm": 1.6506410837173462, + "learning_rate": 0.00013747016706443913, + "loss": 2.1253, + "step": 576 + }, + { + "epoch": 0.020663598760900317, + "grad_norm": 1.5059458017349243, + "learning_rate": 0.00013770883054892602, + "loss": 1.9463, + "step": 577 + }, + { + "epoch": 0.020699410890468603, + "grad_norm": 2.002347230911255, + "learning_rate": 0.0001379474940334129, + "loss": 2.3318, + "step": 578 + }, + { + "epoch": 0.020735223020036886, + "grad_norm": 2.4913315773010254, + "learning_rate": 0.00013818615751789976, + "loss": 2.0054, + "step": 579 + }, + { + "epoch": 0.020771035149605172, + "grad_norm": 1.9572597742080688, + "learning_rate": 0.00013842482100238665, + "loss": 2.24, + "step": 580 + }, + { + "epoch": 0.020806847279173455, + "grad_norm": 2.6330034732818604, + "learning_rate": 0.00013866348448687352, + "loss": 2.0444, + "step": 581 + }, + { + "epoch": 0.02084265940874174, + "grad_norm": 2.075014591217041, + "learning_rate": 0.0001389021479713604, + "loss": 2.1214, + "step": 582 + }, + { + "epoch": 0.020878471538310024, + "grad_norm": 1.4259493350982666, + "learning_rate": 0.00013914081145584726, + "loss": 1.6096, + "step": 583 + }, + { + "epoch": 0.02091428366787831, + "grad_norm": 2.2150115966796875, + "learning_rate": 0.00013937947494033415, + "loss": 2.0084, + "step": 584 + }, + { + "epoch": 0.020950095797446597, + "grad_norm": 2.1223316192626953, + "learning_rate": 0.00013961813842482102, + "loss": 2.2784, + "step": 585 + }, + { + "epoch": 0.02098590792701488, + "grad_norm": 1.5994501113891602, + "learning_rate": 0.0001398568019093079, + "loss": 2.2577, + "step": 586 + }, + { + "epoch": 0.021021720056583166, + "grad_norm": 1.6532303094863892, + "learning_rate": 0.00014009546539379476, + "loss": 1.965, + "step": 587 + }, + { + "epoch": 0.02105753218615145, + "grad_norm": 1.9446479082107544, + "learning_rate": 0.00014033412887828162, + "loss": 2.207, + "step": 588 + }, + { + "epoch": 0.021093344315719735, + "grad_norm": 1.3874430656433105, + "learning_rate": 0.0001405727923627685, + "loss": 2.0884, + "step": 589 + }, + { + "epoch": 0.021129156445288018, + "grad_norm": 1.4150493144989014, + "learning_rate": 0.00014081145584725536, + "loss": 1.8492, + "step": 590 + }, + { + "epoch": 0.021164968574856304, + "grad_norm": 2.022547483444214, + "learning_rate": 0.00014105011933174225, + "loss": 1.9806, + "step": 591 + }, + { + "epoch": 0.021200780704424587, + "grad_norm": 2.3493235111236572, + "learning_rate": 0.00014128878281622912, + "loss": 2.4232, + "step": 592 + }, + { + "epoch": 0.021236592833992873, + "grad_norm": 1.6374825239181519, + "learning_rate": 0.000141527446300716, + "loss": 1.9931, + "step": 593 + }, + { + "epoch": 0.02127240496356116, + "grad_norm": 1.9927897453308105, + "learning_rate": 0.00014176610978520286, + "loss": 1.9837, + "step": 594 + }, + { + "epoch": 0.021308217093129442, + "grad_norm": 2.0843703746795654, + "learning_rate": 0.00014200477326968975, + "loss": 2.2173, + "step": 595 + }, + { + "epoch": 0.02134402922269773, + "grad_norm": 2.041806936264038, + "learning_rate": 0.00014224343675417662, + "loss": 2.1302, + "step": 596 + }, + { + "epoch": 0.02137984135226601, + "grad_norm": 2.7673728466033936, + "learning_rate": 0.0001424821002386635, + "loss": 2.6909, + "step": 597 + }, + { + "epoch": 0.021415653481834298, + "grad_norm": 1.646714210510254, + "learning_rate": 0.00014272076372315038, + "loss": 2.1574, + "step": 598 + }, + { + "epoch": 0.02145146561140258, + "grad_norm": 1.549854040145874, + "learning_rate": 0.00014295942720763725, + "loss": 2.1436, + "step": 599 + }, + { + "epoch": 0.021487277740970867, + "grad_norm": 2.0806381702423096, + "learning_rate": 0.00014319809069212412, + "loss": 1.6527, + "step": 600 + }, + { + "epoch": 0.02152308987053915, + "grad_norm": 1.1766951084136963, + "learning_rate": 0.000143436754176611, + "loss": 1.5967, + "step": 601 + }, + { + "epoch": 0.021558902000107436, + "grad_norm": 1.7814152240753174, + "learning_rate": 0.00014367541766109785, + "loss": 1.8676, + "step": 602 + }, + { + "epoch": 0.021594714129675723, + "grad_norm": 1.4170676469802856, + "learning_rate": 0.00014391408114558472, + "loss": 1.9861, + "step": 603 + }, + { + "epoch": 0.021630526259244005, + "grad_norm": 1.3304917812347412, + "learning_rate": 0.0001441527446300716, + "loss": 1.7847, + "step": 604 + }, + { + "epoch": 0.02166633838881229, + "grad_norm": 1.353492259979248, + "learning_rate": 0.00014439140811455846, + "loss": 1.9458, + "step": 605 + }, + { + "epoch": 0.021702150518380574, + "grad_norm": 1.9017729759216309, + "learning_rate": 0.00014463007159904535, + "loss": 2.2398, + "step": 606 + }, + { + "epoch": 0.02173796264794886, + "grad_norm": 1.3059676885604858, + "learning_rate": 0.00014486873508353222, + "loss": 1.7462, + "step": 607 + }, + { + "epoch": 0.021773774777517144, + "grad_norm": 1.7438218593597412, + "learning_rate": 0.0001451073985680191, + "loss": 2.2967, + "step": 608 + }, + { + "epoch": 0.02180958690708543, + "grad_norm": 1.7737010717391968, + "learning_rate": 0.00014534606205250598, + "loss": 1.9835, + "step": 609 + }, + { + "epoch": 0.021845399036653716, + "grad_norm": 1.6507363319396973, + "learning_rate": 0.00014558472553699285, + "loss": 1.7111, + "step": 610 + }, + { + "epoch": 0.021881211166222, + "grad_norm": 1.657170295715332, + "learning_rate": 0.00014582338902147972, + "loss": 2.1349, + "step": 611 + }, + { + "epoch": 0.021917023295790285, + "grad_norm": 2.128286838531494, + "learning_rate": 0.0001460620525059666, + "loss": 2.0744, + "step": 612 + }, + { + "epoch": 0.021952835425358568, + "grad_norm": 1.6936157941818237, + "learning_rate": 0.00014630071599045348, + "loss": 2.2095, + "step": 613 + }, + { + "epoch": 0.021988647554926855, + "grad_norm": 1.3525550365447998, + "learning_rate": 0.00014653937947494035, + "loss": 1.9486, + "step": 614 + }, + { + "epoch": 0.022024459684495137, + "grad_norm": 1.4725382328033447, + "learning_rate": 0.00014677804295942722, + "loss": 2.0803, + "step": 615 + }, + { + "epoch": 0.022060271814063424, + "grad_norm": 1.4054583311080933, + "learning_rate": 0.00014701670644391409, + "loss": 1.6478, + "step": 616 + }, + { + "epoch": 0.022096083943631706, + "grad_norm": 1.6811931133270264, + "learning_rate": 0.00014725536992840095, + "loss": 2.2387, + "step": 617 + }, + { + "epoch": 0.022131896073199993, + "grad_norm": 2.0554606914520264, + "learning_rate": 0.00014749403341288782, + "loss": 1.9838, + "step": 618 + }, + { + "epoch": 0.02216770820276828, + "grad_norm": 1.9794045686721802, + "learning_rate": 0.0001477326968973747, + "loss": 2.1976, + "step": 619 + }, + { + "epoch": 0.022203520332336562, + "grad_norm": 1.6070706844329834, + "learning_rate": 0.00014797136038186158, + "loss": 2.0375, + "step": 620 + }, + { + "epoch": 0.022239332461904848, + "grad_norm": 1.8594540357589722, + "learning_rate": 0.00014821002386634845, + "loss": 2.2466, + "step": 621 + }, + { + "epoch": 0.02227514459147313, + "grad_norm": 2.2988474369049072, + "learning_rate": 0.00014844868735083532, + "loss": 2.0812, + "step": 622 + }, + { + "epoch": 0.022310956721041417, + "grad_norm": 1.9472583532333374, + "learning_rate": 0.0001486873508353222, + "loss": 1.7122, + "step": 623 + }, + { + "epoch": 0.0223467688506097, + "grad_norm": 2.4133808612823486, + "learning_rate": 0.00014892601431980908, + "loss": 2.0218, + "step": 624 + }, + { + "epoch": 0.022382580980177987, + "grad_norm": 1.960550308227539, + "learning_rate": 0.00014916467780429595, + "loss": 1.6735, + "step": 625 + }, + { + "epoch": 0.02241839310974627, + "grad_norm": 1.429086446762085, + "learning_rate": 0.00014940334128878282, + "loss": 1.8953, + "step": 626 + }, + { + "epoch": 0.022454205239314556, + "grad_norm": 2.099316120147705, + "learning_rate": 0.0001496420047732697, + "loss": 2.0225, + "step": 627 + }, + { + "epoch": 0.022490017368882842, + "grad_norm": 1.6147634983062744, + "learning_rate": 0.00014988066825775658, + "loss": 1.9627, + "step": 628 + }, + { + "epoch": 0.022525829498451125, + "grad_norm": 3.438903570175171, + "learning_rate": 0.00015011933174224345, + "loss": 2.2949, + "step": 629 + }, + { + "epoch": 0.02256164162801941, + "grad_norm": 2.2410268783569336, + "learning_rate": 0.00015035799522673032, + "loss": 2.1214, + "step": 630 + }, + { + "epoch": 0.022597453757587694, + "grad_norm": 1.8363879919052124, + "learning_rate": 0.0001505966587112172, + "loss": 1.6187, + "step": 631 + }, + { + "epoch": 0.02263326588715598, + "grad_norm": 1.890332579612732, + "learning_rate": 0.00015083532219570408, + "loss": 2.244, + "step": 632 + }, + { + "epoch": 0.022669078016724263, + "grad_norm": 2.458477258682251, + "learning_rate": 0.00015107398568019092, + "loss": 2.4482, + "step": 633 + }, + { + "epoch": 0.02270489014629255, + "grad_norm": 1.5786428451538086, + "learning_rate": 0.00015131264916467781, + "loss": 1.9677, + "step": 634 + }, + { + "epoch": 0.022740702275860836, + "grad_norm": 1.7664666175842285, + "learning_rate": 0.00015155131264916468, + "loss": 2.1842, + "step": 635 + }, + { + "epoch": 0.02277651440542912, + "grad_norm": 1.6327040195465088, + "learning_rate": 0.00015178997613365155, + "loss": 1.7608, + "step": 636 + }, + { + "epoch": 0.022812326534997405, + "grad_norm": 1.7589260339736938, + "learning_rate": 0.00015202863961813842, + "loss": 1.9472, + "step": 637 + }, + { + "epoch": 0.022848138664565688, + "grad_norm": 3.3443808555603027, + "learning_rate": 0.0001522673031026253, + "loss": 1.894, + "step": 638 + }, + { + "epoch": 0.022883950794133974, + "grad_norm": 1.7203983068466187, + "learning_rate": 0.00015250596658711218, + "loss": 1.616, + "step": 639 + }, + { + "epoch": 0.022919762923702257, + "grad_norm": 2.1859230995178223, + "learning_rate": 0.00015274463007159905, + "loss": 1.9724, + "step": 640 + }, + { + "epoch": 0.022955575053270543, + "grad_norm": 1.5211198329925537, + "learning_rate": 0.00015298329355608592, + "loss": 2.0265, + "step": 641 + }, + { + "epoch": 0.022991387182838826, + "grad_norm": 1.601146936416626, + "learning_rate": 0.0001532219570405728, + "loss": 1.9278, + "step": 642 + }, + { + "epoch": 0.023027199312407112, + "grad_norm": 1.6272515058517456, + "learning_rate": 0.00015346062052505968, + "loss": 1.8003, + "step": 643 + }, + { + "epoch": 0.0230630114419754, + "grad_norm": 2.8821325302124023, + "learning_rate": 0.00015369928400954655, + "loss": 2.173, + "step": 644 + }, + { + "epoch": 0.02309882357154368, + "grad_norm": 1.6864391565322876, + "learning_rate": 0.00015393794749403344, + "loss": 2.0027, + "step": 645 + }, + { + "epoch": 0.023134635701111968, + "grad_norm": 1.6988509893417358, + "learning_rate": 0.0001541766109785203, + "loss": 2.1044, + "step": 646 + }, + { + "epoch": 0.02317044783068025, + "grad_norm": 1.5111042261123657, + "learning_rate": 0.00015441527446300718, + "loss": 1.9445, + "step": 647 + }, + { + "epoch": 0.023206259960248537, + "grad_norm": 1.5460760593414307, + "learning_rate": 0.00015465393794749404, + "loss": 1.9057, + "step": 648 + }, + { + "epoch": 0.02324207208981682, + "grad_norm": 2.1058406829833984, + "learning_rate": 0.0001548926014319809, + "loss": 2.2557, + "step": 649 + }, + { + "epoch": 0.023277884219385106, + "grad_norm": 2.062448263168335, + "learning_rate": 0.00015513126491646778, + "loss": 2.3574, + "step": 650 + }, + { + "epoch": 0.02331369634895339, + "grad_norm": 1.2844550609588623, + "learning_rate": 0.00015536992840095465, + "loss": 2.0645, + "step": 651 + }, + { + "epoch": 0.023349508478521675, + "grad_norm": 1.7018535137176514, + "learning_rate": 0.00015560859188544154, + "loss": 2.0612, + "step": 652 + }, + { + "epoch": 0.02338532060808996, + "grad_norm": 2.8740715980529785, + "learning_rate": 0.0001558472553699284, + "loss": 2.0078, + "step": 653 + }, + { + "epoch": 0.023421132737658244, + "grad_norm": 2.4455902576446533, + "learning_rate": 0.00015608591885441528, + "loss": 1.9655, + "step": 654 + }, + { + "epoch": 0.02345694486722653, + "grad_norm": 1.9727590084075928, + "learning_rate": 0.00015632458233890215, + "loss": 2.1078, + "step": 655 + }, + { + "epoch": 0.023492756996794813, + "grad_norm": 1.6687122583389282, + "learning_rate": 0.00015656324582338904, + "loss": 2.0421, + "step": 656 + }, + { + "epoch": 0.0235285691263631, + "grad_norm": 2.6429245471954346, + "learning_rate": 0.0001568019093078759, + "loss": 2.3127, + "step": 657 + }, + { + "epoch": 0.023564381255931383, + "grad_norm": 2.1367905139923096, + "learning_rate": 0.00015704057279236278, + "loss": 2.2663, + "step": 658 + }, + { + "epoch": 0.02360019338549967, + "grad_norm": 1.8748291730880737, + "learning_rate": 0.00015727923627684964, + "loss": 1.6464, + "step": 659 + }, + { + "epoch": 0.023636005515067955, + "grad_norm": 1.3424737453460693, + "learning_rate": 0.00015751789976133654, + "loss": 1.9569, + "step": 660 + }, + { + "epoch": 0.023671817644636238, + "grad_norm": 1.6000856161117554, + "learning_rate": 0.0001577565632458234, + "loss": 1.9856, + "step": 661 + }, + { + "epoch": 0.023707629774204524, + "grad_norm": 1.6837635040283203, + "learning_rate": 0.00015799522673031027, + "loss": 1.8739, + "step": 662 + }, + { + "epoch": 0.023743441903772807, + "grad_norm": 1.6797798871994019, + "learning_rate": 0.00015823389021479714, + "loss": 1.9404, + "step": 663 + }, + { + "epoch": 0.023779254033341093, + "grad_norm": 1.217158317565918, + "learning_rate": 0.000158472553699284, + "loss": 2.0562, + "step": 664 + }, + { + "epoch": 0.023815066162909376, + "grad_norm": 1.4763151407241821, + "learning_rate": 0.00015871121718377088, + "loss": 1.8707, + "step": 665 + }, + { + "epoch": 0.023850878292477663, + "grad_norm": 1.5236214399337769, + "learning_rate": 0.00015894988066825775, + "loss": 1.7224, + "step": 666 + }, + { + "epoch": 0.023886690422045945, + "grad_norm": 1.8331769704818726, + "learning_rate": 0.00015918854415274464, + "loss": 2.1492, + "step": 667 + }, + { + "epoch": 0.02392250255161423, + "grad_norm": 1.3603184223175049, + "learning_rate": 0.0001594272076372315, + "loss": 2.0236, + "step": 668 + }, + { + "epoch": 0.023958314681182518, + "grad_norm": 1.4922575950622559, + "learning_rate": 0.00015966587112171838, + "loss": 1.7288, + "step": 669 + }, + { + "epoch": 0.0239941268107508, + "grad_norm": 1.5984807014465332, + "learning_rate": 0.00015990453460620524, + "loss": 2.249, + "step": 670 + }, + { + "epoch": 0.024029938940319087, + "grad_norm": 1.735472321510315, + "learning_rate": 0.00016014319809069214, + "loss": 1.7396, + "step": 671 + }, + { + "epoch": 0.02406575106988737, + "grad_norm": 1.040790319442749, + "learning_rate": 0.000160381861575179, + "loss": 1.7578, + "step": 672 + }, + { + "epoch": 0.024101563199455656, + "grad_norm": 1.53122878074646, + "learning_rate": 0.00016062052505966587, + "loss": 1.7265, + "step": 673 + }, + { + "epoch": 0.02413737532902394, + "grad_norm": 3.3196752071380615, + "learning_rate": 0.00016085918854415277, + "loss": 2.035, + "step": 674 + }, + { + "epoch": 0.024173187458592225, + "grad_norm": 1.471759557723999, + "learning_rate": 0.00016109785202863964, + "loss": 1.9174, + "step": 675 + }, + { + "epoch": 0.02420899958816051, + "grad_norm": 1.670170545578003, + "learning_rate": 0.0001613365155131265, + "loss": 2.0755, + "step": 676 + }, + { + "epoch": 0.024244811717728795, + "grad_norm": 1.6233552694320679, + "learning_rate": 0.00016157517899761337, + "loss": 1.8244, + "step": 677 + }, + { + "epoch": 0.02428062384729708, + "grad_norm": 3.0857901573181152, + "learning_rate": 0.00016181384248210024, + "loss": 2.8201, + "step": 678 + }, + { + "epoch": 0.024316435976865364, + "grad_norm": 1.895977258682251, + "learning_rate": 0.0001620525059665871, + "loss": 1.9378, + "step": 679 + }, + { + "epoch": 0.02435224810643365, + "grad_norm": 1.9350051879882812, + "learning_rate": 0.00016229116945107398, + "loss": 2.0559, + "step": 680 + }, + { + "epoch": 0.024388060236001933, + "grad_norm": 2.0227410793304443, + "learning_rate": 0.00016252983293556087, + "loss": 2.2404, + "step": 681 + }, + { + "epoch": 0.02442387236557022, + "grad_norm": 1.1764206886291504, + "learning_rate": 0.00016276849642004774, + "loss": 1.8332, + "step": 682 + }, + { + "epoch": 0.024459684495138502, + "grad_norm": 2.8057363033294678, + "learning_rate": 0.0001630071599045346, + "loss": 2.0951, + "step": 683 + }, + { + "epoch": 0.02449549662470679, + "grad_norm": 1.5808459520339966, + "learning_rate": 0.00016324582338902147, + "loss": 2.0856, + "step": 684 + }, + { + "epoch": 0.024531308754275075, + "grad_norm": 2.126241683959961, + "learning_rate": 0.00016348448687350837, + "loss": 1.9048, + "step": 685 + }, + { + "epoch": 0.024567120883843357, + "grad_norm": 1.388526201248169, + "learning_rate": 0.00016372315035799524, + "loss": 1.8886, + "step": 686 + }, + { + "epoch": 0.024602933013411644, + "grad_norm": 1.420893907546997, + "learning_rate": 0.0001639618138424821, + "loss": 1.9233, + "step": 687 + }, + { + "epoch": 0.024638745142979927, + "grad_norm": 1.5350810289382935, + "learning_rate": 0.00016420047732696897, + "loss": 2.1028, + "step": 688 + }, + { + "epoch": 0.024674557272548213, + "grad_norm": 1.220397710800171, + "learning_rate": 0.00016443914081145587, + "loss": 1.9763, + "step": 689 + }, + { + "epoch": 0.024710369402116496, + "grad_norm": 1.369965672492981, + "learning_rate": 0.00016467780429594274, + "loss": 2.074, + "step": 690 + }, + { + "epoch": 0.024746181531684782, + "grad_norm": 2.692288398742676, + "learning_rate": 0.0001649164677804296, + "loss": 2.4718, + "step": 691 + }, + { + "epoch": 0.024781993661253065, + "grad_norm": 1.9050400257110596, + "learning_rate": 0.0001651551312649165, + "loss": 2.2467, + "step": 692 + }, + { + "epoch": 0.02481780579082135, + "grad_norm": 1.706419825553894, + "learning_rate": 0.00016539379474940334, + "loss": 1.7131, + "step": 693 + }, + { + "epoch": 0.024853617920389637, + "grad_norm": 1.5882086753845215, + "learning_rate": 0.0001656324582338902, + "loss": 2.0457, + "step": 694 + }, + { + "epoch": 0.02488943004995792, + "grad_norm": 1.5576844215393066, + "learning_rate": 0.00016587112171837707, + "loss": 1.7335, + "step": 695 + }, + { + "epoch": 0.024925242179526207, + "grad_norm": 1.3042941093444824, + "learning_rate": 0.00016610978520286397, + "loss": 1.7833, + "step": 696 + }, + { + "epoch": 0.02496105430909449, + "grad_norm": 1.5248041152954102, + "learning_rate": 0.00016634844868735084, + "loss": 1.8416, + "step": 697 + }, + { + "epoch": 0.024996866438662776, + "grad_norm": 3.5359699726104736, + "learning_rate": 0.0001665871121718377, + "loss": 2.8096, + "step": 698 + }, + { + "epoch": 0.02503267856823106, + "grad_norm": 1.6402137279510498, + "learning_rate": 0.0001668257756563246, + "loss": 2.0446, + "step": 699 + }, + { + "epoch": 0.025068490697799345, + "grad_norm": 1.531969428062439, + "learning_rate": 0.00016706443914081147, + "loss": 1.9463, + "step": 700 + }, + { + "epoch": 0.025104302827367628, + "grad_norm": 2.4962432384490967, + "learning_rate": 0.00016730310262529834, + "loss": 2.3453, + "step": 701 + }, + { + "epoch": 0.025140114956935914, + "grad_norm": 1.790243148803711, + "learning_rate": 0.0001675417661097852, + "loss": 2.0987, + "step": 702 + }, + { + "epoch": 0.0251759270865042, + "grad_norm": 1.7493826150894165, + "learning_rate": 0.0001677804295942721, + "loss": 1.7627, + "step": 703 + }, + { + "epoch": 0.025211739216072483, + "grad_norm": 1.5792869329452515, + "learning_rate": 0.00016801909307875897, + "loss": 1.7321, + "step": 704 + }, + { + "epoch": 0.02524755134564077, + "grad_norm": 1.3130288124084473, + "learning_rate": 0.00016825775656324583, + "loss": 2.0553, + "step": 705 + }, + { + "epoch": 0.025283363475209052, + "grad_norm": 1.769005537033081, + "learning_rate": 0.0001684964200477327, + "loss": 2.0413, + "step": 706 + }, + { + "epoch": 0.02531917560477734, + "grad_norm": 1.4005001783370972, + "learning_rate": 0.0001687350835322196, + "loss": 1.8354, + "step": 707 + }, + { + "epoch": 0.02535498773434562, + "grad_norm": 1.722219705581665, + "learning_rate": 0.00016897374701670646, + "loss": 2.1277, + "step": 708 + }, + { + "epoch": 0.025390799863913908, + "grad_norm": 1.9481533765792847, + "learning_rate": 0.00016921241050119333, + "loss": 2.1439, + "step": 709 + }, + { + "epoch": 0.025426611993482194, + "grad_norm": 1.281445026397705, + "learning_rate": 0.0001694510739856802, + "loss": 2.1655, + "step": 710 + }, + { + "epoch": 0.025462424123050477, + "grad_norm": 1.7158055305480957, + "learning_rate": 0.00016968973747016707, + "loss": 2.3323, + "step": 711 + }, + { + "epoch": 0.025498236252618763, + "grad_norm": 1.4600533246994019, + "learning_rate": 0.00016992840095465394, + "loss": 1.9099, + "step": 712 + }, + { + "epoch": 0.025534048382187046, + "grad_norm": 1.8186371326446533, + "learning_rate": 0.0001701670644391408, + "loss": 1.8916, + "step": 713 + }, + { + "epoch": 0.025569860511755332, + "grad_norm": 1.6856729984283447, + "learning_rate": 0.0001704057279236277, + "loss": 1.8507, + "step": 714 + }, + { + "epoch": 0.025605672641323615, + "grad_norm": 1.6164995431900024, + "learning_rate": 0.00017064439140811457, + "loss": 1.7116, + "step": 715 + }, + { + "epoch": 0.0256414847708919, + "grad_norm": 1.3906581401824951, + "learning_rate": 0.00017088305489260143, + "loss": 2.0011, + "step": 716 + }, + { + "epoch": 0.025677296900460184, + "grad_norm": 1.490162968635559, + "learning_rate": 0.00017112171837708833, + "loss": 2.2157, + "step": 717 + }, + { + "epoch": 0.02571310903002847, + "grad_norm": 1.9008619785308838, + "learning_rate": 0.0001713603818615752, + "loss": 2.0239, + "step": 718 + }, + { + "epoch": 0.025748921159596757, + "grad_norm": 1.612532377243042, + "learning_rate": 0.00017159904534606206, + "loss": 2.0756, + "step": 719 + }, + { + "epoch": 0.02578473328916504, + "grad_norm": 1.4195812940597534, + "learning_rate": 0.00017183770883054893, + "loss": 2.0478, + "step": 720 + }, + { + "epoch": 0.025820545418733326, + "grad_norm": 1.613661289215088, + "learning_rate": 0.00017207637231503583, + "loss": 2.3087, + "step": 721 + }, + { + "epoch": 0.02585635754830161, + "grad_norm": 1.243811845779419, + "learning_rate": 0.0001723150357995227, + "loss": 1.9042, + "step": 722 + }, + { + "epoch": 0.025892169677869895, + "grad_norm": 1.657544493675232, + "learning_rate": 0.00017255369928400956, + "loss": 1.998, + "step": 723 + }, + { + "epoch": 0.025927981807438178, + "grad_norm": 2.2602198123931885, + "learning_rate": 0.00017279236276849643, + "loss": 2.2622, + "step": 724 + }, + { + "epoch": 0.025963793937006464, + "grad_norm": 1.8787742853164673, + "learning_rate": 0.0001730310262529833, + "loss": 2.1541, + "step": 725 + }, + { + "epoch": 0.025999606066574747, + "grad_norm": 1.5164096355438232, + "learning_rate": 0.00017326968973747017, + "loss": 1.8409, + "step": 726 + }, + { + "epoch": 0.026035418196143034, + "grad_norm": 1.5882408618927002, + "learning_rate": 0.00017350835322195703, + "loss": 2.0037, + "step": 727 + }, + { + "epoch": 0.02607123032571132, + "grad_norm": 2.327857732772827, + "learning_rate": 0.00017374701670644393, + "loss": 2.1629, + "step": 728 + }, + { + "epoch": 0.026107042455279603, + "grad_norm": 2.476983070373535, + "learning_rate": 0.0001739856801909308, + "loss": 2.3573, + "step": 729 + }, + { + "epoch": 0.02614285458484789, + "grad_norm": 2.473822593688965, + "learning_rate": 0.00017422434367541766, + "loss": 2.3806, + "step": 730 + }, + { + "epoch": 0.026178666714416172, + "grad_norm": 2.2253236770629883, + "learning_rate": 0.00017446300715990453, + "loss": 2.1016, + "step": 731 + }, + { + "epoch": 0.026214478843984458, + "grad_norm": 2.6160786151885986, + "learning_rate": 0.00017470167064439143, + "loss": 1.7466, + "step": 732 + }, + { + "epoch": 0.02625029097355274, + "grad_norm": 1.6734675168991089, + "learning_rate": 0.0001749403341288783, + "loss": 1.9496, + "step": 733 + }, + { + "epoch": 0.026286103103121027, + "grad_norm": 2.585047721862793, + "learning_rate": 0.00017517899761336516, + "loss": 2.2801, + "step": 734 + }, + { + "epoch": 0.026321915232689314, + "grad_norm": 2.7207157611846924, + "learning_rate": 0.00017541766109785203, + "loss": 1.9879, + "step": 735 + }, + { + "epoch": 0.026357727362257596, + "grad_norm": 1.3990424871444702, + "learning_rate": 0.00017565632458233893, + "loss": 2.206, + "step": 736 + }, + { + "epoch": 0.026393539491825883, + "grad_norm": 1.7303647994995117, + "learning_rate": 0.0001758949880668258, + "loss": 2.1254, + "step": 737 + }, + { + "epoch": 0.026429351621394166, + "grad_norm": 1.4347045421600342, + "learning_rate": 0.00017613365155131266, + "loss": 2.0242, + "step": 738 + }, + { + "epoch": 0.026465163750962452, + "grad_norm": 2.1285247802734375, + "learning_rate": 0.00017637231503579953, + "loss": 1.7194, + "step": 739 + }, + { + "epoch": 0.026500975880530735, + "grad_norm": 1.445741891860962, + "learning_rate": 0.0001766109785202864, + "loss": 1.951, + "step": 740 + }, + { + "epoch": 0.02653678801009902, + "grad_norm": 1.6613547801971436, + "learning_rate": 0.00017684964200477326, + "loss": 1.9932, + "step": 741 + }, + { + "epoch": 0.026572600139667304, + "grad_norm": 1.2473163604736328, + "learning_rate": 0.00017708830548926013, + "loss": 2.0447, + "step": 742 + }, + { + "epoch": 0.02660841226923559, + "grad_norm": 1.4733924865722656, + "learning_rate": 0.00017732696897374703, + "loss": 1.9246, + "step": 743 + }, + { + "epoch": 0.026644224398803876, + "grad_norm": 1.6123089790344238, + "learning_rate": 0.0001775656324582339, + "loss": 1.7595, + "step": 744 + }, + { + "epoch": 0.02668003652837216, + "grad_norm": 1.358934998512268, + "learning_rate": 0.00017780429594272076, + "loss": 1.9185, + "step": 745 + }, + { + "epoch": 0.026715848657940446, + "grad_norm": 1.765520691871643, + "learning_rate": 0.00017804295942720766, + "loss": 1.8516, + "step": 746 + }, + { + "epoch": 0.02675166078750873, + "grad_norm": 2.589219331741333, + "learning_rate": 0.00017828162291169453, + "loss": 1.9495, + "step": 747 + }, + { + "epoch": 0.026787472917077015, + "grad_norm": 1.5191903114318848, + "learning_rate": 0.0001785202863961814, + "loss": 2.121, + "step": 748 + }, + { + "epoch": 0.026823285046645298, + "grad_norm": 2.5188381671905518, + "learning_rate": 0.00017875894988066826, + "loss": 2.154, + "step": 749 + }, + { + "epoch": 0.026859097176213584, + "grad_norm": 1.1737215518951416, + "learning_rate": 0.00017899761336515516, + "loss": 1.6, + "step": 750 + }, + { + "epoch": 0.026894909305781867, + "grad_norm": 1.4904850721359253, + "learning_rate": 0.00017923627684964202, + "loss": 2.1385, + "step": 751 + }, + { + "epoch": 0.026930721435350153, + "grad_norm": 1.7092859745025635, + "learning_rate": 0.0001794749403341289, + "loss": 2.4254, + "step": 752 + }, + { + "epoch": 0.02696653356491844, + "grad_norm": 1.8370356559753418, + "learning_rate": 0.00017971360381861576, + "loss": 2.0809, + "step": 753 + }, + { + "epoch": 0.027002345694486722, + "grad_norm": 1.6396968364715576, + "learning_rate": 0.00017995226730310263, + "loss": 2.0178, + "step": 754 + }, + { + "epoch": 0.02703815782405501, + "grad_norm": 1.7405651807785034, + "learning_rate": 0.0001801909307875895, + "loss": 2.1105, + "step": 755 + }, + { + "epoch": 0.02707396995362329, + "grad_norm": 1.99338698387146, + "learning_rate": 0.00018042959427207636, + "loss": 2.1981, + "step": 756 + }, + { + "epoch": 0.027109782083191578, + "grad_norm": 1.7673587799072266, + "learning_rate": 0.00018066825775656326, + "loss": 2.343, + "step": 757 + }, + { + "epoch": 0.02714559421275986, + "grad_norm": 2.105565309524536, + "learning_rate": 0.00018090692124105013, + "loss": 2.0746, + "step": 758 + }, + { + "epoch": 0.027181406342328147, + "grad_norm": 1.3338245153427124, + "learning_rate": 0.000181145584725537, + "loss": 1.9739, + "step": 759 + }, + { + "epoch": 0.027217218471896433, + "grad_norm": 1.8781588077545166, + "learning_rate": 0.00018138424821002386, + "loss": 2.0575, + "step": 760 + }, + { + "epoch": 0.027253030601464716, + "grad_norm": 1.1376395225524902, + "learning_rate": 0.00018162291169451076, + "loss": 1.8376, + "step": 761 + }, + { + "epoch": 0.027288842731033002, + "grad_norm": 1.6982131004333496, + "learning_rate": 0.00018186157517899762, + "loss": 2.3264, + "step": 762 + }, + { + "epoch": 0.027324654860601285, + "grad_norm": 1.391781210899353, + "learning_rate": 0.0001821002386634845, + "loss": 2.0107, + "step": 763 + }, + { + "epoch": 0.02736046699016957, + "grad_norm": 1.4247759580612183, + "learning_rate": 0.00018233890214797139, + "loss": 1.8508, + "step": 764 + }, + { + "epoch": 0.027396279119737854, + "grad_norm": 1.8408862352371216, + "learning_rate": 0.00018257756563245825, + "loss": 1.9833, + "step": 765 + }, + { + "epoch": 0.02743209124930614, + "grad_norm": 2.116222381591797, + "learning_rate": 0.00018281622911694512, + "loss": 2.0208, + "step": 766 + }, + { + "epoch": 0.027467903378874423, + "grad_norm": 1.4811813831329346, + "learning_rate": 0.000183054892601432, + "loss": 1.9656, + "step": 767 + }, + { + "epoch": 0.02750371550844271, + "grad_norm": 1.1970711946487427, + "learning_rate": 0.00018329355608591888, + "loss": 2.0476, + "step": 768 + }, + { + "epoch": 0.027539527638010996, + "grad_norm": 1.6708041429519653, + "learning_rate": 0.00018353221957040575, + "loss": 2.0781, + "step": 769 + }, + { + "epoch": 0.02757533976757928, + "grad_norm": 1.3129223585128784, + "learning_rate": 0.0001837708830548926, + "loss": 1.7613, + "step": 770 + }, + { + "epoch": 0.027611151897147565, + "grad_norm": 1.6428635120391846, + "learning_rate": 0.00018400954653937946, + "loss": 1.9753, + "step": 771 + }, + { + "epoch": 0.027646964026715848, + "grad_norm": 1.3336197137832642, + "learning_rate": 0.00018424821002386636, + "loss": 1.8005, + "step": 772 + }, + { + "epoch": 0.027682776156284134, + "grad_norm": 1.9145218133926392, + "learning_rate": 0.00018448687350835322, + "loss": 1.8981, + "step": 773 + }, + { + "epoch": 0.027718588285852417, + "grad_norm": 1.456026315689087, + "learning_rate": 0.0001847255369928401, + "loss": 2.0075, + "step": 774 + }, + { + "epoch": 0.027754400415420703, + "grad_norm": 1.5138424634933472, + "learning_rate": 0.00018496420047732699, + "loss": 2.2897, + "step": 775 + }, + { + "epoch": 0.027790212544988986, + "grad_norm": 1.3635684251785278, + "learning_rate": 0.00018520286396181385, + "loss": 1.8127, + "step": 776 + }, + { + "epoch": 0.027826024674557272, + "grad_norm": 2.9247641563415527, + "learning_rate": 0.00018544152744630072, + "loss": 2.0023, + "step": 777 + }, + { + "epoch": 0.02786183680412556, + "grad_norm": 2.099508285522461, + "learning_rate": 0.0001856801909307876, + "loss": 1.8379, + "step": 778 + }, + { + "epoch": 0.02789764893369384, + "grad_norm": 1.8196276426315308, + "learning_rate": 0.00018591885441527448, + "loss": 2.3346, + "step": 779 + }, + { + "epoch": 0.027933461063262128, + "grad_norm": 1.5537874698638916, + "learning_rate": 0.00018615751789976135, + "loss": 2.0353, + "step": 780 + }, + { + "epoch": 0.02796927319283041, + "grad_norm": 1.42340087890625, + "learning_rate": 0.00018639618138424822, + "loss": 1.8038, + "step": 781 + }, + { + "epoch": 0.028005085322398697, + "grad_norm": 2.276153802871704, + "learning_rate": 0.0001866348448687351, + "loss": 1.9849, + "step": 782 + }, + { + "epoch": 0.02804089745196698, + "grad_norm": 1.2486404180526733, + "learning_rate": 0.00018687350835322198, + "loss": 1.8228, + "step": 783 + }, + { + "epoch": 0.028076709581535266, + "grad_norm": 2.1660590171813965, + "learning_rate": 0.00018711217183770885, + "loss": 1.8634, + "step": 784 + }, + { + "epoch": 0.028112521711103552, + "grad_norm": 1.5528640747070312, + "learning_rate": 0.00018735083532219572, + "loss": 2.2286, + "step": 785 + }, + { + "epoch": 0.028148333840671835, + "grad_norm": 1.3248339891433716, + "learning_rate": 0.00018758949880668259, + "loss": 2.0061, + "step": 786 + }, + { + "epoch": 0.02818414597024012, + "grad_norm": 1.7929283380508423, + "learning_rate": 0.00018782816229116945, + "loss": 2.3811, + "step": 787 + }, + { + "epoch": 0.028219958099808404, + "grad_norm": 1.729906678199768, + "learning_rate": 0.00018806682577565632, + "loss": 1.8049, + "step": 788 + }, + { + "epoch": 0.02825577022937669, + "grad_norm": 1.7999067306518555, + "learning_rate": 0.0001883054892601432, + "loss": 1.9822, + "step": 789 + }, + { + "epoch": 0.028291582358944974, + "grad_norm": 1.5044533014297485, + "learning_rate": 0.00018854415274463008, + "loss": 1.8759, + "step": 790 + }, + { + "epoch": 0.02832739448851326, + "grad_norm": 1.5658330917358398, + "learning_rate": 0.00018878281622911695, + "loss": 2.1563, + "step": 791 + }, + { + "epoch": 0.028363206618081543, + "grad_norm": 1.8020349740982056, + "learning_rate": 0.00018902147971360382, + "loss": 2.1909, + "step": 792 + }, + { + "epoch": 0.02839901874764983, + "grad_norm": 1.3612864017486572, + "learning_rate": 0.00018926014319809071, + "loss": 1.7796, + "step": 793 + }, + { + "epoch": 0.028434830877218115, + "grad_norm": 1.2705191373825073, + "learning_rate": 0.00018949880668257758, + "loss": 2.071, + "step": 794 + }, + { + "epoch": 0.028470643006786398, + "grad_norm": 1.3243223428726196, + "learning_rate": 0.00018973747016706445, + "loss": 2.1743, + "step": 795 + }, + { + "epoch": 0.028506455136354684, + "grad_norm": 1.805530309677124, + "learning_rate": 0.00018997613365155132, + "loss": 1.9458, + "step": 796 + }, + { + "epoch": 0.028542267265922967, + "grad_norm": 1.6725308895111084, + "learning_rate": 0.0001902147971360382, + "loss": 1.9733, + "step": 797 + }, + { + "epoch": 0.028578079395491254, + "grad_norm": 2.9171323776245117, + "learning_rate": 0.00019045346062052508, + "loss": 1.8562, + "step": 798 + }, + { + "epoch": 0.028613891525059536, + "grad_norm": 1.2938868999481201, + "learning_rate": 0.00019069212410501195, + "loss": 2.2139, + "step": 799 + }, + { + "epoch": 0.028649703654627823, + "grad_norm": 1.9000681638717651, + "learning_rate": 0.00019093078758949882, + "loss": 1.9803, + "step": 800 + }, + { + "epoch": 0.028685515784196106, + "grad_norm": 1.895714282989502, + "learning_rate": 0.00019116945107398568, + "loss": 1.8723, + "step": 801 + }, + { + "epoch": 0.028721327913764392, + "grad_norm": 1.1832196712493896, + "learning_rate": 0.00019140811455847255, + "loss": 1.7096, + "step": 802 + }, + { + "epoch": 0.028757140043332678, + "grad_norm": 1.382149577140808, + "learning_rate": 0.00019164677804295942, + "loss": 1.7541, + "step": 803 + }, + { + "epoch": 0.02879295217290096, + "grad_norm": 1.5034754276275635, + "learning_rate": 0.00019188544152744631, + "loss": 2.3756, + "step": 804 + }, + { + "epoch": 0.028828764302469247, + "grad_norm": 1.5388857126235962, + "learning_rate": 0.00019212410501193318, + "loss": 1.9153, + "step": 805 + }, + { + "epoch": 0.02886457643203753, + "grad_norm": 2.118048667907715, + "learning_rate": 0.00019236276849642005, + "loss": 2.2704, + "step": 806 + }, + { + "epoch": 0.028900388561605816, + "grad_norm": 1.6065908670425415, + "learning_rate": 0.00019260143198090692, + "loss": 2.0055, + "step": 807 + }, + { + "epoch": 0.0289362006911741, + "grad_norm": 2.0480384826660156, + "learning_rate": 0.0001928400954653938, + "loss": 2.033, + "step": 808 + }, + { + "epoch": 0.028972012820742386, + "grad_norm": 2.066574811935425, + "learning_rate": 0.00019307875894988068, + "loss": 2.2037, + "step": 809 + }, + { + "epoch": 0.029007824950310672, + "grad_norm": 1.3903077840805054, + "learning_rate": 0.00019331742243436755, + "loss": 1.7733, + "step": 810 + }, + { + "epoch": 0.029043637079878955, + "grad_norm": 2.8067617416381836, + "learning_rate": 0.00019355608591885444, + "loss": 1.7627, + "step": 811 + }, + { + "epoch": 0.02907944920944724, + "grad_norm": 1.9488036632537842, + "learning_rate": 0.0001937947494033413, + "loss": 1.9536, + "step": 812 + }, + { + "epoch": 0.029115261339015524, + "grad_norm": 1.4101473093032837, + "learning_rate": 0.00019403341288782818, + "loss": 2.1673, + "step": 813 + }, + { + "epoch": 0.02915107346858381, + "grad_norm": 1.2836867570877075, + "learning_rate": 0.00019427207637231505, + "loss": 1.7867, + "step": 814 + }, + { + "epoch": 0.029186885598152093, + "grad_norm": 1.4783426523208618, + "learning_rate": 0.00019451073985680191, + "loss": 2.1039, + "step": 815 + }, + { + "epoch": 0.02922269772772038, + "grad_norm": 1.3614662885665894, + "learning_rate": 0.00019474940334128878, + "loss": 2.0084, + "step": 816 + }, + { + "epoch": 0.029258509857288662, + "grad_norm": 1.2852891683578491, + "learning_rate": 0.00019498806682577565, + "loss": 1.8364, + "step": 817 + }, + { + "epoch": 0.02929432198685695, + "grad_norm": 1.6663340330123901, + "learning_rate": 0.00019522673031026252, + "loss": 2.1946, + "step": 818 + }, + { + "epoch": 0.029330134116425235, + "grad_norm": 2.088148832321167, + "learning_rate": 0.0001954653937947494, + "loss": 2.1451, + "step": 819 + }, + { + "epoch": 0.029365946245993518, + "grad_norm": 1.2781169414520264, + "learning_rate": 0.00019570405727923628, + "loss": 1.8716, + "step": 820 + }, + { + "epoch": 0.029401758375561804, + "grad_norm": 1.7055004835128784, + "learning_rate": 0.00019594272076372315, + "loss": 2.1419, + "step": 821 + }, + { + "epoch": 0.029437570505130087, + "grad_norm": 2.0636518001556396, + "learning_rate": 0.00019618138424821004, + "loss": 2.2445, + "step": 822 + }, + { + "epoch": 0.029473382634698373, + "grad_norm": 1.557982325553894, + "learning_rate": 0.0001964200477326969, + "loss": 1.8256, + "step": 823 + }, + { + "epoch": 0.029509194764266656, + "grad_norm": 1.3213343620300293, + "learning_rate": 0.00019665871121718378, + "loss": 1.6728, + "step": 824 + }, + { + "epoch": 0.029545006893834942, + "grad_norm": 2.1603245735168457, + "learning_rate": 0.00019689737470167065, + "loss": 2.0918, + "step": 825 + }, + { + "epoch": 0.029580819023403225, + "grad_norm": 1.5868972539901733, + "learning_rate": 0.00019713603818615754, + "loss": 2.0132, + "step": 826 + }, + { + "epoch": 0.02961663115297151, + "grad_norm": 2.6218371391296387, + "learning_rate": 0.0001973747016706444, + "loss": 1.97, + "step": 827 + }, + { + "epoch": 0.029652443282539798, + "grad_norm": 1.8237935304641724, + "learning_rate": 0.00019761336515513128, + "loss": 1.8931, + "step": 828 + }, + { + "epoch": 0.02968825541210808, + "grad_norm": 1.9808688163757324, + "learning_rate": 0.00019785202863961817, + "loss": 1.9166, + "step": 829 + }, + { + "epoch": 0.029724067541676367, + "grad_norm": 1.4820518493652344, + "learning_rate": 0.000198090692124105, + "loss": 1.9317, + "step": 830 + }, + { + "epoch": 0.02975987967124465, + "grad_norm": 2.2204062938690186, + "learning_rate": 0.00019832935560859188, + "loss": 2.0906, + "step": 831 + }, + { + "epoch": 0.029795691800812936, + "grad_norm": 1.4470603466033936, + "learning_rate": 0.00019856801909307875, + "loss": 1.9674, + "step": 832 + }, + { + "epoch": 0.02983150393038122, + "grad_norm": 1.444513201713562, + "learning_rate": 0.00019880668257756564, + "loss": 1.9251, + "step": 833 + }, + { + "epoch": 0.029867316059949505, + "grad_norm": 1.6284515857696533, + "learning_rate": 0.0001990453460620525, + "loss": 1.8075, + "step": 834 + }, + { + "epoch": 0.02990312818951779, + "grad_norm": 1.885413646697998, + "learning_rate": 0.00019928400954653938, + "loss": 2.0856, + "step": 835 + }, + { + "epoch": 0.029938940319086074, + "grad_norm": 1.7370575666427612, + "learning_rate": 0.00019952267303102625, + "loss": 1.7621, + "step": 836 + }, + { + "epoch": 0.02997475244865436, + "grad_norm": 2.0905561447143555, + "learning_rate": 0.00019976133651551314, + "loss": 2.5441, + "step": 837 + }, + { + "epoch": 0.030010564578222643, + "grad_norm": 1.5622786283493042, + "learning_rate": 0.0002, + "loss": 1.9866, + "step": 838 + }, + { + "epoch": 0.03004637670779093, + "grad_norm": 1.1882922649383545, + "learning_rate": 0.0001999999993273145, + "loss": 1.807, + "step": 839 + }, + { + "epoch": 0.030082188837359213, + "grad_norm": 1.3145676851272583, + "learning_rate": 0.000199999997309258, + "loss": 1.9652, + "step": 840 + }, + { + "epoch": 0.0301180009669275, + "grad_norm": 1.4804152250289917, + "learning_rate": 0.00019999999394583053, + "loss": 2.1154, + "step": 841 + }, + { + "epoch": 0.03015381309649578, + "grad_norm": 1.6515103578567505, + "learning_rate": 0.00019999998923703213, + "loss": 1.7606, + "step": 842 + }, + { + "epoch": 0.030189625226064068, + "grad_norm": 2.046954393386841, + "learning_rate": 0.00019999998318286286, + "loss": 1.9643, + "step": 843 + }, + { + "epoch": 0.030225437355632354, + "grad_norm": 2.260840654373169, + "learning_rate": 0.0001999999757833228, + "loss": 2.0192, + "step": 844 + }, + { + "epoch": 0.030261249485200637, + "grad_norm": 2.859248161315918, + "learning_rate": 0.00019999996703841207, + "loss": 1.9989, + "step": 845 + }, + { + "epoch": 0.030297061614768923, + "grad_norm": 1.7310001850128174, + "learning_rate": 0.00019999995694813073, + "loss": 2.0006, + "step": 846 + }, + { + "epoch": 0.030332873744337206, + "grad_norm": 1.423134207725525, + "learning_rate": 0.00019999994551247901, + "loss": 1.7585, + "step": 847 + }, + { + "epoch": 0.030368685873905493, + "grad_norm": 1.5323225259780884, + "learning_rate": 0.000199999932731457, + "loss": 2.1375, + "step": 848 + }, + { + "epoch": 0.030404498003473775, + "grad_norm": 1.53788423538208, + "learning_rate": 0.00019999991860506492, + "loss": 1.9595, + "step": 849 + }, + { + "epoch": 0.03044031013304206, + "grad_norm": 1.2339884042739868, + "learning_rate": 0.00019999990313330286, + "loss": 2.0193, + "step": 850 + }, + { + "epoch": 0.030476122262610345, + "grad_norm": 1.9879727363586426, + "learning_rate": 0.00019999988631617114, + "loss": 1.5824, + "step": 851 + }, + { + "epoch": 0.03051193439217863, + "grad_norm": 1.245160460472107, + "learning_rate": 0.00019999986815366993, + "loss": 1.7495, + "step": 852 + }, + { + "epoch": 0.030547746521746917, + "grad_norm": 1.278892159461975, + "learning_rate": 0.0001999998486457995, + "loss": 2.207, + "step": 853 + }, + { + "epoch": 0.0305835586513152, + "grad_norm": 1.3570001125335693, + "learning_rate": 0.00019999982779256005, + "loss": 1.7336, + "step": 854 + }, + { + "epoch": 0.030619370780883486, + "grad_norm": 1.5163108110427856, + "learning_rate": 0.00019999980559395195, + "loss": 2.1173, + "step": 855 + }, + { + "epoch": 0.03065518291045177, + "grad_norm": 1.1349409818649292, + "learning_rate": 0.00019999978204997545, + "loss": 1.8362, + "step": 856 + }, + { + "epoch": 0.030690995040020055, + "grad_norm": 1.9292572736740112, + "learning_rate": 0.00019999975716063087, + "loss": 2.0006, + "step": 857 + }, + { + "epoch": 0.030726807169588338, + "grad_norm": 1.8030412197113037, + "learning_rate": 0.0001999997309259185, + "loss": 2.0975, + "step": 858 + }, + { + "epoch": 0.030762619299156625, + "grad_norm": 1.3017148971557617, + "learning_rate": 0.0001999997033458388, + "loss": 1.9676, + "step": 859 + }, + { + "epoch": 0.03079843142872491, + "grad_norm": 1.4081376791000366, + "learning_rate": 0.00019999967442039206, + "loss": 1.985, + "step": 860 + }, + { + "epoch": 0.030834243558293194, + "grad_norm": 1.465991735458374, + "learning_rate": 0.0001999996441495787, + "loss": 1.7478, + "step": 861 + }, + { + "epoch": 0.03087005568786148, + "grad_norm": 1.6048805713653564, + "learning_rate": 0.0001999996125333991, + "loss": 2.0733, + "step": 862 + }, + { + "epoch": 0.030905867817429763, + "grad_norm": 1.3127105236053467, + "learning_rate": 0.00019999957957185375, + "loss": 1.9589, + "step": 863 + }, + { + "epoch": 0.03094167994699805, + "grad_norm": 1.815430760383606, + "learning_rate": 0.000199999545264943, + "loss": 2.246, + "step": 864 + }, + { + "epoch": 0.030977492076566332, + "grad_norm": 1.334842324256897, + "learning_rate": 0.00019999950961266738, + "loss": 1.8453, + "step": 865 + }, + { + "epoch": 0.03101330420613462, + "grad_norm": 1.6176645755767822, + "learning_rate": 0.00019999947261502735, + "loss": 2.0127, + "step": 866 + }, + { + "epoch": 0.0310491163357029, + "grad_norm": 1.444711446762085, + "learning_rate": 0.0001999994342720234, + "loss": 1.7625, + "step": 867 + }, + { + "epoch": 0.031084928465271187, + "grad_norm": 1.4336127042770386, + "learning_rate": 0.00019999939458365605, + "loss": 1.799, + "step": 868 + }, + { + "epoch": 0.031120740594839474, + "grad_norm": 1.5195722579956055, + "learning_rate": 0.00019999935354992582, + "loss": 1.9003, + "step": 869 + }, + { + "epoch": 0.031156552724407757, + "grad_norm": 2.2031455039978027, + "learning_rate": 0.0001999993111708333, + "loss": 2.1488, + "step": 870 + }, + { + "epoch": 0.031192364853976043, + "grad_norm": 1.6597728729248047, + "learning_rate": 0.00019999926744637903, + "loss": 1.9993, + "step": 871 + }, + { + "epoch": 0.031228176983544326, + "grad_norm": 1.707897424697876, + "learning_rate": 0.0001999992223765636, + "loss": 2.0831, + "step": 872 + }, + { + "epoch": 0.03126398911311261, + "grad_norm": 1.9093265533447266, + "learning_rate": 0.00019999917596138765, + "loss": 2.2365, + "step": 873 + }, + { + "epoch": 0.0312998012426809, + "grad_norm": 2.5550174713134766, + "learning_rate": 0.00019999912820085176, + "loss": 2.1307, + "step": 874 + }, + { + "epoch": 0.03133561337224918, + "grad_norm": 1.5057379007339478, + "learning_rate": 0.0001999990790949566, + "loss": 2.0101, + "step": 875 + }, + { + "epoch": 0.031371425501817464, + "grad_norm": 1.5937858819961548, + "learning_rate": 0.0001999990286437028, + "loss": 2.0092, + "step": 876 + }, + { + "epoch": 0.031407237631385754, + "grad_norm": 1.1650123596191406, + "learning_rate": 0.00019999897684709104, + "loss": 1.8871, + "step": 877 + }, + { + "epoch": 0.03144304976095404, + "grad_norm": 2.418663501739502, + "learning_rate": 0.00019999892370512208, + "loss": 2.106, + "step": 878 + }, + { + "epoch": 0.03147886189052232, + "grad_norm": 1.407043218612671, + "learning_rate": 0.00019999886921779657, + "loss": 1.9746, + "step": 879 + }, + { + "epoch": 0.0315146740200906, + "grad_norm": 1.4505114555358887, + "learning_rate": 0.00019999881338511526, + "loss": 1.9175, + "step": 880 + }, + { + "epoch": 0.03155048614965889, + "grad_norm": 1.4700239896774292, + "learning_rate": 0.0001999987562070789, + "loss": 1.9464, + "step": 881 + }, + { + "epoch": 0.031586298279227175, + "grad_norm": 1.1745959520339966, + "learning_rate": 0.00019999869768368828, + "loss": 1.7727, + "step": 882 + }, + { + "epoch": 0.03162211040879546, + "grad_norm": 1.303771734237671, + "learning_rate": 0.0001999986378149442, + "loss": 2.056, + "step": 883 + }, + { + "epoch": 0.03165792253836374, + "grad_norm": 1.8219273090362549, + "learning_rate": 0.00019999857660084737, + "loss": 1.768, + "step": 884 + }, + { + "epoch": 0.03169373466793203, + "grad_norm": 1.5007424354553223, + "learning_rate": 0.00019999851404139873, + "loss": 1.9576, + "step": 885 + }, + { + "epoch": 0.03172954679750031, + "grad_norm": 1.8747928142547607, + "learning_rate": 0.00019999845013659906, + "loss": 1.8605, + "step": 886 + }, + { + "epoch": 0.031765358927068596, + "grad_norm": 1.2296186685562134, + "learning_rate": 0.00019999838488644924, + "loss": 2.0188, + "step": 887 + }, + { + "epoch": 0.031801171056636886, + "grad_norm": 1.2861896753311157, + "learning_rate": 0.00019999831829095013, + "loss": 2.0889, + "step": 888 + }, + { + "epoch": 0.03183698318620517, + "grad_norm": 1.5891250371932983, + "learning_rate": 0.00019999825035010263, + "loss": 1.9346, + "step": 889 + }, + { + "epoch": 0.03187279531577345, + "grad_norm": 1.4180065393447876, + "learning_rate": 0.00019999818106390766, + "loss": 1.7997, + "step": 890 + }, + { + "epoch": 0.031908607445341734, + "grad_norm": 1.7038114070892334, + "learning_rate": 0.0001999981104323662, + "loss": 1.7128, + "step": 891 + }, + { + "epoch": 0.031944419574910024, + "grad_norm": 1.518619179725647, + "learning_rate": 0.00019999803845547907, + "loss": 1.802, + "step": 892 + }, + { + "epoch": 0.03198023170447831, + "grad_norm": 1.3319896459579468, + "learning_rate": 0.00019999796513324735, + "loss": 1.8977, + "step": 893 + }, + { + "epoch": 0.03201604383404659, + "grad_norm": 1.7978893518447876, + "learning_rate": 0.00019999789046567203, + "loss": 1.8772, + "step": 894 + }, + { + "epoch": 0.03205185596361488, + "grad_norm": 2.5152993202209473, + "learning_rate": 0.00019999781445275406, + "loss": 1.9364, + "step": 895 + }, + { + "epoch": 0.03208766809318316, + "grad_norm": 1.4267518520355225, + "learning_rate": 0.0001999977370944945, + "loss": 2.3957, + "step": 896 + }, + { + "epoch": 0.032123480222751445, + "grad_norm": 1.907285451889038, + "learning_rate": 0.00019999765839089434, + "loss": 1.6652, + "step": 897 + }, + { + "epoch": 0.03215929235231973, + "grad_norm": 1.9558316469192505, + "learning_rate": 0.00019999757834195472, + "loss": 1.9541, + "step": 898 + }, + { + "epoch": 0.03219510448188802, + "grad_norm": 1.6896051168441772, + "learning_rate": 0.00019999749694767666, + "loss": 1.722, + "step": 899 + }, + { + "epoch": 0.0322309166114563, + "grad_norm": 2.4404208660125732, + "learning_rate": 0.0001999974142080612, + "loss": 2.4644, + "step": 900 + }, + { + "epoch": 0.03226672874102458, + "grad_norm": 1.4918023347854614, + "learning_rate": 0.00019999733012310958, + "loss": 1.9803, + "step": 901 + }, + { + "epoch": 0.03230254087059287, + "grad_norm": 1.9694902896881104, + "learning_rate": 0.00019999724469282288, + "loss": 1.9535, + "step": 902 + }, + { + "epoch": 0.032338353000161156, + "grad_norm": 1.778144359588623, + "learning_rate": 0.00019999715791720223, + "loss": 1.8126, + "step": 903 + }, + { + "epoch": 0.03237416512972944, + "grad_norm": 1.5544754266738892, + "learning_rate": 0.00019999706979624877, + "loss": 2.0249, + "step": 904 + }, + { + "epoch": 0.03240997725929772, + "grad_norm": 2.4764208793640137, + "learning_rate": 0.00019999698032996377, + "loss": 2.1015, + "step": 905 + }, + { + "epoch": 0.03244578938886601, + "grad_norm": 1.926424503326416, + "learning_rate": 0.00019999688951834836, + "loss": 1.8125, + "step": 906 + }, + { + "epoch": 0.032481601518434294, + "grad_norm": 2.0330772399902344, + "learning_rate": 0.0001999967973614038, + "loss": 2.0531, + "step": 907 + }, + { + "epoch": 0.03251741364800258, + "grad_norm": 2.0009052753448486, + "learning_rate": 0.00019999670385913133, + "loss": 2.2191, + "step": 908 + }, + { + "epoch": 0.03255322577757086, + "grad_norm": 1.3479782342910767, + "learning_rate": 0.00019999660901153218, + "loss": 1.7542, + "step": 909 + }, + { + "epoch": 0.03258903790713915, + "grad_norm": 1.28345787525177, + "learning_rate": 0.00019999651281860762, + "loss": 1.9944, + "step": 910 + }, + { + "epoch": 0.03262485003670743, + "grad_norm": 1.4357455968856812, + "learning_rate": 0.00019999641528035898, + "loss": 2.0591, + "step": 911 + }, + { + "epoch": 0.032660662166275715, + "grad_norm": 1.6938787698745728, + "learning_rate": 0.0001999963163967876, + "loss": 2.2751, + "step": 912 + }, + { + "epoch": 0.032696474295844005, + "grad_norm": 1.267437219619751, + "learning_rate": 0.00019999621616789473, + "loss": 1.8928, + "step": 913 + }, + { + "epoch": 0.03273228642541229, + "grad_norm": 1.2574923038482666, + "learning_rate": 0.00019999611459368174, + "loss": 1.793, + "step": 914 + }, + { + "epoch": 0.03276809855498057, + "grad_norm": 1.0232667922973633, + "learning_rate": 0.00019999601167415006, + "loss": 1.8391, + "step": 915 + }, + { + "epoch": 0.032803910684548854, + "grad_norm": 2.3581461906433105, + "learning_rate": 0.000199995907409301, + "loss": 2.4764, + "step": 916 + }, + { + "epoch": 0.032839722814117144, + "grad_norm": 1.5463016033172607, + "learning_rate": 0.000199995801799136, + "loss": 1.9629, + "step": 917 + }, + { + "epoch": 0.032875534943685426, + "grad_norm": 2.3027689456939697, + "learning_rate": 0.00019999569484365645, + "loss": 1.9298, + "step": 918 + }, + { + "epoch": 0.03291134707325371, + "grad_norm": 1.575325846672058, + "learning_rate": 0.00019999558654286385, + "loss": 1.8611, + "step": 919 + }, + { + "epoch": 0.032947159202822, + "grad_norm": 2.7732012271881104, + "learning_rate": 0.0001999954768967596, + "loss": 2.2494, + "step": 920 + }, + { + "epoch": 0.03298297133239028, + "grad_norm": 2.0338802337646484, + "learning_rate": 0.0001999953659053452, + "loss": 1.8992, + "step": 921 + }, + { + "epoch": 0.033018783461958565, + "grad_norm": 1.5755022764205933, + "learning_rate": 0.0001999952535686221, + "loss": 2.1845, + "step": 922 + }, + { + "epoch": 0.03305459559152685, + "grad_norm": 1.2108244895935059, + "learning_rate": 0.00019999513988659188, + "loss": 1.6432, + "step": 923 + }, + { + "epoch": 0.03309040772109514, + "grad_norm": 2.210641860961914, + "learning_rate": 0.00019999502485925605, + "loss": 2.4049, + "step": 924 + }, + { + "epoch": 0.03312621985066342, + "grad_norm": 1.6159656047821045, + "learning_rate": 0.00019999490848661612, + "loss": 1.6542, + "step": 925 + }, + { + "epoch": 0.0331620319802317, + "grad_norm": 1.2654778957366943, + "learning_rate": 0.00019999479076867368, + "loss": 1.991, + "step": 926 + }, + { + "epoch": 0.03319784410979999, + "grad_norm": 1.8192824125289917, + "learning_rate": 0.00019999467170543031, + "loss": 1.9816, + "step": 927 + }, + { + "epoch": 0.033233656239368276, + "grad_norm": 1.5657546520233154, + "learning_rate": 0.00019999455129688764, + "loss": 1.8487, + "step": 928 + }, + { + "epoch": 0.03326946836893656, + "grad_norm": 1.4876744747161865, + "learning_rate": 0.00019999442954304729, + "loss": 1.7911, + "step": 929 + }, + { + "epoch": 0.03330528049850484, + "grad_norm": 1.8739250898361206, + "learning_rate": 0.00019999430644391082, + "loss": 1.9183, + "step": 930 + }, + { + "epoch": 0.03334109262807313, + "grad_norm": 1.3608872890472412, + "learning_rate": 0.00019999418199947994, + "loss": 1.8478, + "step": 931 + }, + { + "epoch": 0.033376904757641414, + "grad_norm": 1.5598634481430054, + "learning_rate": 0.00019999405620975636, + "loss": 2.0591, + "step": 932 + }, + { + "epoch": 0.0334127168872097, + "grad_norm": 2.9095773696899414, + "learning_rate": 0.00019999392907474174, + "loss": 2.0628, + "step": 933 + }, + { + "epoch": 0.03344852901677798, + "grad_norm": 1.6652559041976929, + "learning_rate": 0.00019999380059443773, + "loss": 1.9522, + "step": 934 + }, + { + "epoch": 0.03348434114634627, + "grad_norm": 1.8389030694961548, + "learning_rate": 0.00019999367076884616, + "loss": 1.8165, + "step": 935 + }, + { + "epoch": 0.03352015327591455, + "grad_norm": 1.5243996381759644, + "learning_rate": 0.00019999353959796872, + "loss": 1.9362, + "step": 936 + }, + { + "epoch": 0.033555965405482835, + "grad_norm": 1.587996006011963, + "learning_rate": 0.0001999934070818072, + "loss": 1.9498, + "step": 937 + }, + { + "epoch": 0.033591777535051125, + "grad_norm": 1.3211830854415894, + "learning_rate": 0.00019999327322036336, + "loss": 1.8008, + "step": 938 + }, + { + "epoch": 0.03362758966461941, + "grad_norm": 1.775557518005371, + "learning_rate": 0.00019999313801363902, + "loss": 2.0888, + "step": 939 + }, + { + "epoch": 0.03366340179418769, + "grad_norm": 1.5863924026489258, + "learning_rate": 0.00019999300146163597, + "loss": 1.7688, + "step": 940 + }, + { + "epoch": 0.03369921392375597, + "grad_norm": 2.072439193725586, + "learning_rate": 0.00019999286356435608, + "loss": 1.9447, + "step": 941 + }, + { + "epoch": 0.03373502605332426, + "grad_norm": 1.8205657005310059, + "learning_rate": 0.0001999927243218012, + "loss": 1.852, + "step": 942 + }, + { + "epoch": 0.033770838182892546, + "grad_norm": 2.22773814201355, + "learning_rate": 0.0001999925837339732, + "loss": 1.898, + "step": 943 + }, + { + "epoch": 0.03380665031246083, + "grad_norm": 1.3842288255691528, + "learning_rate": 0.00019999244180087395, + "loss": 2.0167, + "step": 944 + }, + { + "epoch": 0.03384246244202912, + "grad_norm": 1.9286949634552002, + "learning_rate": 0.00019999229852250537, + "loss": 1.9268, + "step": 945 + }, + { + "epoch": 0.0338782745715974, + "grad_norm": 2.1182754039764404, + "learning_rate": 0.00019999215389886942, + "loss": 2.0501, + "step": 946 + }, + { + "epoch": 0.033914086701165684, + "grad_norm": 1.6029773950576782, + "learning_rate": 0.000199992007929968, + "loss": 1.9263, + "step": 947 + }, + { + "epoch": 0.03394989883073397, + "grad_norm": 1.2110930681228638, + "learning_rate": 0.0001999918606158031, + "loss": 1.9129, + "step": 948 + }, + { + "epoch": 0.03398571096030226, + "grad_norm": 1.507839322090149, + "learning_rate": 0.0001999917119563767, + "loss": 1.9622, + "step": 949 + }, + { + "epoch": 0.03402152308987054, + "grad_norm": 1.6795588731765747, + "learning_rate": 0.00019999156195169078, + "loss": 1.864, + "step": 950 + }, + { + "epoch": 0.03405733521943882, + "grad_norm": 1.9231281280517578, + "learning_rate": 0.0001999914106017474, + "loss": 2.1288, + "step": 951 + }, + { + "epoch": 0.03409314734900711, + "grad_norm": 1.7240833044052124, + "learning_rate": 0.00019999125790654855, + "loss": 1.7942, + "step": 952 + }, + { + "epoch": 0.034128959478575395, + "grad_norm": 1.4035639762878418, + "learning_rate": 0.0001999911038660963, + "loss": 1.9428, + "step": 953 + }, + { + "epoch": 0.03416477160814368, + "grad_norm": 1.2519588470458984, + "learning_rate": 0.00019999094848039274, + "loss": 1.7459, + "step": 954 + }, + { + "epoch": 0.03420058373771196, + "grad_norm": 1.201668620109558, + "learning_rate": 0.00019999079174943995, + "loss": 1.6607, + "step": 955 + }, + { + "epoch": 0.03423639586728025, + "grad_norm": 1.6537754535675049, + "learning_rate": 0.00019999063367324003, + "loss": 2.0089, + "step": 956 + }, + { + "epoch": 0.03427220799684853, + "grad_norm": 2.3439764976501465, + "learning_rate": 0.0001999904742517951, + "loss": 2.1381, + "step": 957 + }, + { + "epoch": 0.034308020126416816, + "grad_norm": 1.219859004020691, + "learning_rate": 0.00019999031348510733, + "loss": 2.0814, + "step": 958 + }, + { + "epoch": 0.0343438322559851, + "grad_norm": 1.969250202178955, + "learning_rate": 0.00019999015137317887, + "loss": 2.0864, + "step": 959 + }, + { + "epoch": 0.03437964438555339, + "grad_norm": 2.401399850845337, + "learning_rate": 0.0001999899879160119, + "loss": 1.9438, + "step": 960 + }, + { + "epoch": 0.03441545651512167, + "grad_norm": 1.4327278137207031, + "learning_rate": 0.00019998982311360863, + "loss": 1.9214, + "step": 961 + }, + { + "epoch": 0.034451268644689954, + "grad_norm": 2.319035768508911, + "learning_rate": 0.00019998965696597126, + "loss": 2.3354, + "step": 962 + }, + { + "epoch": 0.034487080774258244, + "grad_norm": 1.1964077949523926, + "learning_rate": 0.00019998948947310202, + "loss": 1.9042, + "step": 963 + }, + { + "epoch": 0.03452289290382653, + "grad_norm": 1.8281431198120117, + "learning_rate": 0.0001999893206350032, + "loss": 1.9747, + "step": 964 + }, + { + "epoch": 0.03455870503339481, + "grad_norm": 1.1979892253875732, + "learning_rate": 0.00019998915045167702, + "loss": 1.6347, + "step": 965 + }, + { + "epoch": 0.03459451716296309, + "grad_norm": 1.382490873336792, + "learning_rate": 0.0001999889789231258, + "loss": 2.0574, + "step": 966 + }, + { + "epoch": 0.03463032929253138, + "grad_norm": 1.8725450038909912, + "learning_rate": 0.00019998880604935187, + "loss": 1.9832, + "step": 967 + }, + { + "epoch": 0.034666141422099665, + "grad_norm": 1.4053820371627808, + "learning_rate": 0.00019998863183035752, + "loss": 1.5536, + "step": 968 + }, + { + "epoch": 0.03470195355166795, + "grad_norm": 2.361921787261963, + "learning_rate": 0.0001999884562661451, + "loss": 2.347, + "step": 969 + }, + { + "epoch": 0.03473776568123624, + "grad_norm": 1.5455695390701294, + "learning_rate": 0.00019998827935671697, + "loss": 1.5136, + "step": 970 + }, + { + "epoch": 0.03477357781080452, + "grad_norm": 1.388515830039978, + "learning_rate": 0.00019998810110207553, + "loss": 1.7991, + "step": 971 + }, + { + "epoch": 0.034809389940372804, + "grad_norm": 1.5065860748291016, + "learning_rate": 0.00019998792150222316, + "loss": 2.0703, + "step": 972 + }, + { + "epoch": 0.034845202069941086, + "grad_norm": 2.07222580909729, + "learning_rate": 0.0001999877405571623, + "loss": 1.9729, + "step": 973 + }, + { + "epoch": 0.034881014199509376, + "grad_norm": 1.977397084236145, + "learning_rate": 0.00019998755826689535, + "loss": 2.313, + "step": 974 + }, + { + "epoch": 0.03491682632907766, + "grad_norm": 1.2814414501190186, + "learning_rate": 0.00019998737463142478, + "loss": 1.7694, + "step": 975 + }, + { + "epoch": 0.03495263845864594, + "grad_norm": 2.0901002883911133, + "learning_rate": 0.00019998718965075305, + "loss": 2.186, + "step": 976 + }, + { + "epoch": 0.03498845058821423, + "grad_norm": 1.2248455286026, + "learning_rate": 0.00019998700332488265, + "loss": 1.7233, + "step": 977 + }, + { + "epoch": 0.035024262717782514, + "grad_norm": 1.0053720474243164, + "learning_rate": 0.00019998681565381611, + "loss": 1.5713, + "step": 978 + }, + { + "epoch": 0.0350600748473508, + "grad_norm": 1.413627028465271, + "learning_rate": 0.00019998662663755595, + "loss": 1.908, + "step": 979 + }, + { + "epoch": 0.03509588697691908, + "grad_norm": 1.2800226211547852, + "learning_rate": 0.00019998643627610466, + "loss": 1.83, + "step": 980 + }, + { + "epoch": 0.03513169910648737, + "grad_norm": 1.8141751289367676, + "learning_rate": 0.00019998624456946492, + "loss": 1.6135, + "step": 981 + }, + { + "epoch": 0.03516751123605565, + "grad_norm": 1.8859864473342896, + "learning_rate": 0.00019998605151763917, + "loss": 2.0094, + "step": 982 + }, + { + "epoch": 0.035203323365623936, + "grad_norm": 1.6842153072357178, + "learning_rate": 0.00019998585712063008, + "loss": 1.8373, + "step": 983 + }, + { + "epoch": 0.03523913549519222, + "grad_norm": 1.4130045175552368, + "learning_rate": 0.00019998566137844026, + "loss": 2.2282, + "step": 984 + }, + { + "epoch": 0.03527494762476051, + "grad_norm": 2.0987722873687744, + "learning_rate": 0.00019998546429107235, + "loss": 2.1985, + "step": 985 + }, + { + "epoch": 0.03531075975432879, + "grad_norm": 1.7609018087387085, + "learning_rate": 0.00019998526585852898, + "loss": 2.0455, + "step": 986 + }, + { + "epoch": 0.035346571883897074, + "grad_norm": 1.6820363998413086, + "learning_rate": 0.00019998506608081282, + "loss": 2.0726, + "step": 987 + }, + { + "epoch": 0.035382384013465364, + "grad_norm": 2.3095192909240723, + "learning_rate": 0.00019998486495792657, + "loss": 1.9518, + "step": 988 + }, + { + "epoch": 0.035418196143033646, + "grad_norm": 1.5085362195968628, + "learning_rate": 0.00019998466248987294, + "loss": 2.0056, + "step": 989 + }, + { + "epoch": 0.03545400827260193, + "grad_norm": 1.2365782260894775, + "learning_rate": 0.00019998445867665463, + "loss": 2.0304, + "step": 990 + }, + { + "epoch": 0.03548982040217021, + "grad_norm": 1.5272630453109741, + "learning_rate": 0.0001999842535182744, + "loss": 1.9765, + "step": 991 + }, + { + "epoch": 0.0355256325317385, + "grad_norm": 1.3069766759872437, + "learning_rate": 0.00019998404701473504, + "loss": 2.0625, + "step": 992 + }, + { + "epoch": 0.035561444661306785, + "grad_norm": 1.3852381706237793, + "learning_rate": 0.00019998383916603927, + "loss": 2.0252, + "step": 993 + }, + { + "epoch": 0.03559725679087507, + "grad_norm": 2.0161495208740234, + "learning_rate": 0.00019998362997218993, + "loss": 2.197, + "step": 994 + }, + { + "epoch": 0.03563306892044336, + "grad_norm": 1.4747849702835083, + "learning_rate": 0.0001999834194331898, + "loss": 2.02, + "step": 995 + }, + { + "epoch": 0.03566888105001164, + "grad_norm": 2.3067164421081543, + "learning_rate": 0.00019998320754904177, + "loss": 2.6484, + "step": 996 + }, + { + "epoch": 0.03570469317957992, + "grad_norm": 1.9584856033325195, + "learning_rate": 0.0001999829943197486, + "loss": 1.8812, + "step": 997 + }, + { + "epoch": 0.035740505309148206, + "grad_norm": 1.4480103254318237, + "learning_rate": 0.00019998277974531326, + "loss": 1.9695, + "step": 998 + }, + { + "epoch": 0.035776317438716496, + "grad_norm": 1.4160569906234741, + "learning_rate": 0.00019998256382573856, + "loss": 2.1651, + "step": 999 + }, + { + "epoch": 0.03581212956828478, + "grad_norm": 1.8961275815963745, + "learning_rate": 0.0001999823465610274, + "loss": 2.2543, + "step": 1000 + }, + { + "epoch": 0.03584794169785306, + "grad_norm": 1.6168714761734009, + "learning_rate": 0.0001999821279511828, + "loss": 1.8404, + "step": 1001 + }, + { + "epoch": 0.03588375382742135, + "grad_norm": 1.4820830821990967, + "learning_rate": 0.0001999819079962076, + "loss": 1.7631, + "step": 1002 + }, + { + "epoch": 0.035919565956989634, + "grad_norm": 1.6105800867080688, + "learning_rate": 0.0001999816866961048, + "loss": 2.187, + "step": 1003 + }, + { + "epoch": 0.03595537808655792, + "grad_norm": 1.5409481525421143, + "learning_rate": 0.00019998146405087738, + "loss": 1.8704, + "step": 1004 + }, + { + "epoch": 0.0359911902161262, + "grad_norm": 1.9356262683868408, + "learning_rate": 0.00019998124006052832, + "loss": 1.9801, + "step": 1005 + }, + { + "epoch": 0.03602700234569449, + "grad_norm": 1.5342742204666138, + "learning_rate": 0.00019998101472506064, + "loss": 2.0569, + "step": 1006 + }, + { + "epoch": 0.03606281447526277, + "grad_norm": 1.746179223060608, + "learning_rate": 0.00019998078804447738, + "loss": 2.0516, + "step": 1007 + }, + { + "epoch": 0.036098626604831055, + "grad_norm": 1.6185777187347412, + "learning_rate": 0.00019998056001878158, + "loss": 1.9516, + "step": 1008 + }, + { + "epoch": 0.03613443873439934, + "grad_norm": 1.815090537071228, + "learning_rate": 0.0001999803306479763, + "loss": 1.9385, + "step": 1009 + }, + { + "epoch": 0.03617025086396763, + "grad_norm": 1.338623285293579, + "learning_rate": 0.00019998009993206462, + "loss": 1.8261, + "step": 1010 + }, + { + "epoch": 0.03620606299353591, + "grad_norm": 1.5827144384384155, + "learning_rate": 0.0001999798678710497, + "loss": 1.7827, + "step": 1011 + }, + { + "epoch": 0.03624187512310419, + "grad_norm": 1.5187708139419556, + "learning_rate": 0.00019997963446493461, + "loss": 2.2097, + "step": 1012 + }, + { + "epoch": 0.03627768725267248, + "grad_norm": 1.234924554824829, + "learning_rate": 0.00019997939971372252, + "loss": 1.7321, + "step": 1013 + }, + { + "epoch": 0.036313499382240766, + "grad_norm": 1.9228838682174683, + "learning_rate": 0.00019997916361741655, + "loss": 2.0538, + "step": 1014 + }, + { + "epoch": 0.03634931151180905, + "grad_norm": 1.6233713626861572, + "learning_rate": 0.0001999789261760199, + "loss": 2.216, + "step": 1015 + }, + { + "epoch": 0.03638512364137733, + "grad_norm": 1.3126624822616577, + "learning_rate": 0.00019997868738953577, + "loss": 1.7939, + "step": 1016 + }, + { + "epoch": 0.03642093577094562, + "grad_norm": 1.9218897819519043, + "learning_rate": 0.00019997844725796733, + "loss": 2.1702, + "step": 1017 + }, + { + "epoch": 0.036456747900513904, + "grad_norm": 1.6152637004852295, + "learning_rate": 0.0001999782057813179, + "loss": 1.9599, + "step": 1018 + }, + { + "epoch": 0.03649256003008219, + "grad_norm": 1.2749489545822144, + "learning_rate": 0.00019997796295959065, + "loss": 1.9781, + "step": 1019 + }, + { + "epoch": 0.03652837215965048, + "grad_norm": 1.346900224685669, + "learning_rate": 0.00019997771879278883, + "loss": 1.8683, + "step": 1020 + }, + { + "epoch": 0.03656418428921876, + "grad_norm": 1.3836091756820679, + "learning_rate": 0.00019997747328091584, + "loss": 1.7748, + "step": 1021 + }, + { + "epoch": 0.03659999641878704, + "grad_norm": 1.3248271942138672, + "learning_rate": 0.00019997722642397484, + "loss": 2.0045, + "step": 1022 + }, + { + "epoch": 0.036635808548355325, + "grad_norm": 1.6735011339187622, + "learning_rate": 0.00019997697822196926, + "loss": 1.9793, + "step": 1023 + }, + { + "epoch": 0.036671620677923615, + "grad_norm": 2.2710859775543213, + "learning_rate": 0.00019997672867490238, + "loss": 1.8652, + "step": 1024 + }, + { + "epoch": 0.0367074328074919, + "grad_norm": 1.8319227695465088, + "learning_rate": 0.0001999764777827776, + "loss": 2.1901, + "step": 1025 + }, + { + "epoch": 0.03674324493706018, + "grad_norm": 1.5663321018218994, + "learning_rate": 0.00019997622554559824, + "loss": 1.9499, + "step": 1026 + }, + { + "epoch": 0.03677905706662847, + "grad_norm": 1.2792712450027466, + "learning_rate": 0.00019997597196336775, + "loss": 2.1186, + "step": 1027 + }, + { + "epoch": 0.03681486919619675, + "grad_norm": 1.6958818435668945, + "learning_rate": 0.00019997571703608952, + "loss": 1.9744, + "step": 1028 + }, + { + "epoch": 0.036850681325765036, + "grad_norm": 1.8845629692077637, + "learning_rate": 0.00019997546076376695, + "loss": 2.0421, + "step": 1029 + }, + { + "epoch": 0.03688649345533332, + "grad_norm": 2.2948620319366455, + "learning_rate": 0.00019997520314640356, + "loss": 2.1033, + "step": 1030 + }, + { + "epoch": 0.03692230558490161, + "grad_norm": 1.8859891891479492, + "learning_rate": 0.00019997494418400272, + "loss": 1.7958, + "step": 1031 + }, + { + "epoch": 0.03695811771446989, + "grad_norm": 1.6800627708435059, + "learning_rate": 0.00019997468387656796, + "loss": 2.0049, + "step": 1032 + }, + { + "epoch": 0.036993929844038174, + "grad_norm": 2.595055341720581, + "learning_rate": 0.00019997442222410283, + "loss": 2.2109, + "step": 1033 + }, + { + "epoch": 0.03702974197360646, + "grad_norm": 1.660823941230774, + "learning_rate": 0.0001999741592266108, + "loss": 1.9758, + "step": 1034 + }, + { + "epoch": 0.03706555410317475, + "grad_norm": 2.998129367828369, + "learning_rate": 0.0001999738948840954, + "loss": 1.9876, + "step": 1035 + }, + { + "epoch": 0.03710136623274303, + "grad_norm": 1.9035344123840332, + "learning_rate": 0.0001999736291965602, + "loss": 2.2105, + "step": 1036 + }, + { + "epoch": 0.03713717836231131, + "grad_norm": 1.6378567218780518, + "learning_rate": 0.00019997336216400876, + "loss": 1.985, + "step": 1037 + }, + { + "epoch": 0.0371729904918796, + "grad_norm": 2.2616209983825684, + "learning_rate": 0.00019997309378644472, + "loss": 2.1327, + "step": 1038 + }, + { + "epoch": 0.037208802621447885, + "grad_norm": 1.207955241203308, + "learning_rate": 0.00019997282406387167, + "loss": 1.7478, + "step": 1039 + }, + { + "epoch": 0.03724461475101617, + "grad_norm": 1.3184400796890259, + "learning_rate": 0.00019997255299629318, + "loss": 1.6028, + "step": 1040 + }, + { + "epoch": 0.03728042688058445, + "grad_norm": 1.752772569656372, + "learning_rate": 0.00019997228058371298, + "loss": 1.9437, + "step": 1041 + }, + { + "epoch": 0.03731623901015274, + "grad_norm": 1.965004563331604, + "learning_rate": 0.00019997200682613468, + "loss": 2.4803, + "step": 1042 + }, + { + "epoch": 0.037352051139721024, + "grad_norm": 1.6652560234069824, + "learning_rate": 0.00019997173172356202, + "loss": 2.1181, + "step": 1043 + }, + { + "epoch": 0.037387863269289306, + "grad_norm": 1.3832405805587769, + "learning_rate": 0.00019997145527599864, + "loss": 2.0115, + "step": 1044 + }, + { + "epoch": 0.037423675398857596, + "grad_norm": 1.9811400175094604, + "learning_rate": 0.00019997117748344825, + "loss": 2.0448, + "step": 1045 + }, + { + "epoch": 0.03745948752842588, + "grad_norm": 1.222140908241272, + "learning_rate": 0.00019997089834591466, + "loss": 1.8386, + "step": 1046 + }, + { + "epoch": 0.03749529965799416, + "grad_norm": 1.2641700506210327, + "learning_rate": 0.00019997061786340158, + "loss": 1.7789, + "step": 1047 + }, + { + "epoch": 0.037531111787562445, + "grad_norm": 2.9230399131774902, + "learning_rate": 0.00019997033603591277, + "loss": 2.0249, + "step": 1048 + }, + { + "epoch": 0.037566923917130735, + "grad_norm": 1.4298175573349, + "learning_rate": 0.00019997005286345208, + "loss": 1.6235, + "step": 1049 + }, + { + "epoch": 0.03760273604669902, + "grad_norm": 2.0398781299591064, + "learning_rate": 0.00019996976834602324, + "loss": 2.2335, + "step": 1050 + }, + { + "epoch": 0.0376385481762673, + "grad_norm": 2.0259313583374023, + "learning_rate": 0.00019996948248363015, + "loss": 2.1829, + "step": 1051 + }, + { + "epoch": 0.03767436030583559, + "grad_norm": 1.503429651260376, + "learning_rate": 0.0001999691952762766, + "loss": 1.8359, + "step": 1052 + }, + { + "epoch": 0.03771017243540387, + "grad_norm": 1.3695441484451294, + "learning_rate": 0.00019996890672396652, + "loss": 1.8916, + "step": 1053 + }, + { + "epoch": 0.037745984564972156, + "grad_norm": 2.213430881500244, + "learning_rate": 0.0001999686168267037, + "loss": 2.0709, + "step": 1054 + }, + { + "epoch": 0.03778179669454044, + "grad_norm": 1.7021877765655518, + "learning_rate": 0.0001999683255844921, + "loss": 1.8707, + "step": 1055 + }, + { + "epoch": 0.03781760882410873, + "grad_norm": 1.7013872861862183, + "learning_rate": 0.00019996803299733565, + "loss": 1.8043, + "step": 1056 + }, + { + "epoch": 0.03785342095367701, + "grad_norm": 2.1002440452575684, + "learning_rate": 0.00019996773906523827, + "loss": 1.7684, + "step": 1057 + }, + { + "epoch": 0.037889233083245294, + "grad_norm": 1.6813231706619263, + "learning_rate": 0.0001999674437882039, + "loss": 1.9193, + "step": 1058 + }, + { + "epoch": 0.03792504521281358, + "grad_norm": 1.3570369482040405, + "learning_rate": 0.0001999671471662365, + "loss": 1.7127, + "step": 1059 + }, + { + "epoch": 0.03796085734238187, + "grad_norm": 3.0057036876678467, + "learning_rate": 0.0001999668491993401, + "loss": 2.5179, + "step": 1060 + }, + { + "epoch": 0.03799666947195015, + "grad_norm": 1.5875788927078247, + "learning_rate": 0.00019996654988751867, + "loss": 1.9084, + "step": 1061 + }, + { + "epoch": 0.03803248160151843, + "grad_norm": 4.081822395324707, + "learning_rate": 0.0001999662492307763, + "loss": 2.2503, + "step": 1062 + }, + { + "epoch": 0.03806829373108672, + "grad_norm": 1.6430948972702026, + "learning_rate": 0.000199965947229117, + "loss": 1.6658, + "step": 1063 + }, + { + "epoch": 0.038104105860655005, + "grad_norm": 1.858124017715454, + "learning_rate": 0.0001999656438825448, + "loss": 1.8926, + "step": 1064 + }, + { + "epoch": 0.03813991799022329, + "grad_norm": 1.7443957328796387, + "learning_rate": 0.0001999653391910638, + "loss": 1.8012, + "step": 1065 + }, + { + "epoch": 0.03817573011979157, + "grad_norm": 1.912501335144043, + "learning_rate": 0.00019996503315467811, + "loss": 1.8669, + "step": 1066 + }, + { + "epoch": 0.03821154224935986, + "grad_norm": 1.8698126077651978, + "learning_rate": 0.00019996472577339186, + "loss": 2.3256, + "step": 1067 + }, + { + "epoch": 0.03824735437892814, + "grad_norm": 1.444370985031128, + "learning_rate": 0.00019996441704720917, + "loss": 1.724, + "step": 1068 + }, + { + "epoch": 0.038283166508496426, + "grad_norm": 2.0056838989257812, + "learning_rate": 0.00019996410697613418, + "loss": 1.8959, + "step": 1069 + }, + { + "epoch": 0.038318978638064716, + "grad_norm": 1.4529732465744019, + "learning_rate": 0.0001999637955601711, + "loss": 2.0943, + "step": 1070 + }, + { + "epoch": 0.038354790767633, + "grad_norm": 1.5077389478683472, + "learning_rate": 0.00019996348279932406, + "loss": 1.6368, + "step": 1071 + }, + { + "epoch": 0.03839060289720128, + "grad_norm": 1.458603858947754, + "learning_rate": 0.0001999631686935973, + "loss": 2.1051, + "step": 1072 + }, + { + "epoch": 0.038426415026769564, + "grad_norm": 1.7620093822479248, + "learning_rate": 0.0001999628532429951, + "loss": 2.2006, + "step": 1073 + }, + { + "epoch": 0.038462227156337854, + "grad_norm": 1.673052430152893, + "learning_rate": 0.00019996253644752158, + "loss": 1.9259, + "step": 1074 + }, + { + "epoch": 0.03849803928590614, + "grad_norm": 1.3855998516082764, + "learning_rate": 0.00019996221830718115, + "loss": 1.9441, + "step": 1075 + }, + { + "epoch": 0.03853385141547442, + "grad_norm": 1.4624788761138916, + "learning_rate": 0.00019996189882197797, + "loss": 1.9249, + "step": 1076 + }, + { + "epoch": 0.0385696635450427, + "grad_norm": 2.3567802906036377, + "learning_rate": 0.0001999615779919164, + "loss": 1.8877, + "step": 1077 + }, + { + "epoch": 0.03860547567461099, + "grad_norm": 1.6427150964736938, + "learning_rate": 0.0001999612558170007, + "loss": 2.2207, + "step": 1078 + }, + { + "epoch": 0.038641287804179275, + "grad_norm": 1.4462939500808716, + "learning_rate": 0.0001999609322972353, + "loss": 1.9105, + "step": 1079 + }, + { + "epoch": 0.03867709993374756, + "grad_norm": 1.3780089616775513, + "learning_rate": 0.00019996060743262447, + "loss": 1.9752, + "step": 1080 + }, + { + "epoch": 0.03871291206331585, + "grad_norm": 1.4172141551971436, + "learning_rate": 0.00019996028122317257, + "loss": 1.9202, + "step": 1081 + }, + { + "epoch": 0.03874872419288413, + "grad_norm": 1.6056876182556152, + "learning_rate": 0.00019995995366888408, + "loss": 2.0305, + "step": 1082 + }, + { + "epoch": 0.03878453632245241, + "grad_norm": 1.9504435062408447, + "learning_rate": 0.00019995962476976336, + "loss": 1.7623, + "step": 1083 + }, + { + "epoch": 0.038820348452020696, + "grad_norm": 1.192026138305664, + "learning_rate": 0.00019995929452581478, + "loss": 2.051, + "step": 1084 + }, + { + "epoch": 0.038856160581588986, + "grad_norm": 2.1449334621429443, + "learning_rate": 0.00019995896293704285, + "loss": 1.9216, + "step": 1085 + }, + { + "epoch": 0.03889197271115727, + "grad_norm": 1.6051068305969238, + "learning_rate": 0.00019995863000345202, + "loss": 1.7794, + "step": 1086 + }, + { + "epoch": 0.03892778484072555, + "grad_norm": 1.9584460258483887, + "learning_rate": 0.00019995829572504677, + "loss": 1.84, + "step": 1087 + }, + { + "epoch": 0.03896359697029384, + "grad_norm": 1.3237011432647705, + "learning_rate": 0.00019995796010183157, + "loss": 1.9595, + "step": 1088 + }, + { + "epoch": 0.038999409099862124, + "grad_norm": 1.275923728942871, + "learning_rate": 0.00019995762313381095, + "loss": 1.8077, + "step": 1089 + }, + { + "epoch": 0.03903522122943041, + "grad_norm": 1.2776323556900024, + "learning_rate": 0.00019995728482098945, + "loss": 1.798, + "step": 1090 + }, + { + "epoch": 0.03907103335899869, + "grad_norm": 1.3595540523529053, + "learning_rate": 0.00019995694516337164, + "loss": 2.0992, + "step": 1091 + }, + { + "epoch": 0.03910684548856698, + "grad_norm": 1.3693207502365112, + "learning_rate": 0.00019995660416096206, + "loss": 1.9173, + "step": 1092 + }, + { + "epoch": 0.03914265761813526, + "grad_norm": 1.6067368984222412, + "learning_rate": 0.00019995626181376527, + "loss": 1.9124, + "step": 1093 + }, + { + "epoch": 0.039178469747703545, + "grad_norm": 1.5119096040725708, + "learning_rate": 0.00019995591812178596, + "loss": 2.2585, + "step": 1094 + }, + { + "epoch": 0.039214281877271835, + "grad_norm": 1.692460060119629, + "learning_rate": 0.00019995557308502866, + "loss": 1.9322, + "step": 1095 + }, + { + "epoch": 0.03925009400684012, + "grad_norm": 2.6956653594970703, + "learning_rate": 0.00019995522670349808, + "loss": 2.2339, + "step": 1096 + }, + { + "epoch": 0.0392859061364084, + "grad_norm": 1.5072599649429321, + "learning_rate": 0.00019995487897719888, + "loss": 2.0304, + "step": 1097 + }, + { + "epoch": 0.039321718265976684, + "grad_norm": 1.522534728050232, + "learning_rate": 0.00019995452990613567, + "loss": 1.7841, + "step": 1098 + }, + { + "epoch": 0.03935753039554497, + "grad_norm": 3.1163241863250732, + "learning_rate": 0.00019995417949031323, + "loss": 2.0245, + "step": 1099 + }, + { + "epoch": 0.039393342525113256, + "grad_norm": 2.7022454738616943, + "learning_rate": 0.00019995382772973623, + "loss": 2.064, + "step": 1100 + }, + { + "epoch": 0.03942915465468154, + "grad_norm": 1.7741551399230957, + "learning_rate": 0.00019995347462440938, + "loss": 1.8704, + "step": 1101 + }, + { + "epoch": 0.03946496678424982, + "grad_norm": 2.4735567569732666, + "learning_rate": 0.0001999531201743375, + "loss": 2.2503, + "step": 1102 + }, + { + "epoch": 0.03950077891381811, + "grad_norm": 2.0130088329315186, + "learning_rate": 0.0001999527643795253, + "loss": 2.0751, + "step": 1103 + }, + { + "epoch": 0.039536591043386395, + "grad_norm": 1.6272753477096558, + "learning_rate": 0.00019995240723997757, + "loss": 2.1966, + "step": 1104 + }, + { + "epoch": 0.03957240317295468, + "grad_norm": 1.5757356882095337, + "learning_rate": 0.00019995204875569914, + "loss": 1.7593, + "step": 1105 + }, + { + "epoch": 0.03960821530252297, + "grad_norm": 1.7916936874389648, + "learning_rate": 0.00019995168892669485, + "loss": 1.9154, + "step": 1106 + }, + { + "epoch": 0.03964402743209125, + "grad_norm": 1.6669237613677979, + "learning_rate": 0.00019995132775296948, + "loss": 2.0772, + "step": 1107 + }, + { + "epoch": 0.03967983956165953, + "grad_norm": 1.3182016611099243, + "learning_rate": 0.00019995096523452795, + "loss": 1.6875, + "step": 1108 + }, + { + "epoch": 0.039715651691227816, + "grad_norm": 1.801761269569397, + "learning_rate": 0.0001999506013713751, + "loss": 1.8703, + "step": 1109 + }, + { + "epoch": 0.039751463820796105, + "grad_norm": 1.1991573572158813, + "learning_rate": 0.0001999502361635158, + "loss": 2.0746, + "step": 1110 + }, + { + "epoch": 0.03978727595036439, + "grad_norm": 1.6609413623809814, + "learning_rate": 0.00019994986961095504, + "loss": 2.0265, + "step": 1111 + }, + { + "epoch": 0.03982308807993267, + "grad_norm": 1.425610065460205, + "learning_rate": 0.0001999495017136977, + "loss": 1.757, + "step": 1112 + }, + { + "epoch": 0.03985890020950096, + "grad_norm": 1.956095576286316, + "learning_rate": 0.00019994913247174876, + "loss": 2.0188, + "step": 1113 + }, + { + "epoch": 0.039894712339069244, + "grad_norm": 1.771166205406189, + "learning_rate": 0.00019994876188511314, + "loss": 2.2367, + "step": 1114 + }, + { + "epoch": 0.03993052446863753, + "grad_norm": 1.6214959621429443, + "learning_rate": 0.00019994838995379585, + "loss": 1.8904, + "step": 1115 + }, + { + "epoch": 0.03996633659820581, + "grad_norm": 1.3988220691680908, + "learning_rate": 0.0001999480166778019, + "loss": 1.7266, + "step": 1116 + }, + { + "epoch": 0.0400021487277741, + "grad_norm": 1.1129361391067505, + "learning_rate": 0.00019994764205713631, + "loss": 1.9944, + "step": 1117 + }, + { + "epoch": 0.04003796085734238, + "grad_norm": 1.3052695989608765, + "learning_rate": 0.00019994726609180415, + "loss": 1.8591, + "step": 1118 + }, + { + "epoch": 0.040073772986910665, + "grad_norm": 1.9215950965881348, + "learning_rate": 0.00019994688878181044, + "loss": 2.0272, + "step": 1119 + }, + { + "epoch": 0.040109585116478955, + "grad_norm": 1.3812592029571533, + "learning_rate": 0.0001999465101271602, + "loss": 1.8704, + "step": 1120 + }, + { + "epoch": 0.04014539724604724, + "grad_norm": 1.5294747352600098, + "learning_rate": 0.00019994613012785868, + "loss": 1.8968, + "step": 1121 + }, + { + "epoch": 0.04018120937561552, + "grad_norm": 1.8965163230895996, + "learning_rate": 0.00019994574878391084, + "loss": 2.0483, + "step": 1122 + }, + { + "epoch": 0.0402170215051838, + "grad_norm": 1.571008324623108, + "learning_rate": 0.00019994536609532187, + "loss": 1.5518, + "step": 1123 + }, + { + "epoch": 0.04025283363475209, + "grad_norm": 2.02764630317688, + "learning_rate": 0.00019994498206209695, + "loss": 1.5406, + "step": 1124 + }, + { + "epoch": 0.040288645764320376, + "grad_norm": 1.47233247756958, + "learning_rate": 0.0001999445966842412, + "loss": 1.9526, + "step": 1125 + }, + { + "epoch": 0.04032445789388866, + "grad_norm": 1.4829063415527344, + "learning_rate": 0.00019994420996175983, + "loss": 1.8804, + "step": 1126 + }, + { + "epoch": 0.04036027002345694, + "grad_norm": 1.8773620128631592, + "learning_rate": 0.00019994382189465802, + "loss": 2.0499, + "step": 1127 + }, + { + "epoch": 0.04039608215302523, + "grad_norm": 2.0403313636779785, + "learning_rate": 0.000199943432482941, + "loss": 1.7639, + "step": 1128 + }, + { + "epoch": 0.040431894282593514, + "grad_norm": 2.01694917678833, + "learning_rate": 0.00019994304172661403, + "loss": 2.5179, + "step": 1129 + }, + { + "epoch": 0.0404677064121618, + "grad_norm": 1.556822419166565, + "learning_rate": 0.00019994264962568234, + "loss": 2.0218, + "step": 1130 + }, + { + "epoch": 0.04050351854173009, + "grad_norm": 2.926945924758911, + "learning_rate": 0.00019994225618015125, + "loss": 2.1212, + "step": 1131 + }, + { + "epoch": 0.04053933067129837, + "grad_norm": 2.010427713394165, + "learning_rate": 0.000199941861390026, + "loss": 2.0122, + "step": 1132 + }, + { + "epoch": 0.04057514280086665, + "grad_norm": 1.425561547279358, + "learning_rate": 0.0001999414652553119, + "loss": 1.9191, + "step": 1133 + }, + { + "epoch": 0.040610954930434935, + "grad_norm": 1.2319929599761963, + "learning_rate": 0.00019994106777601432, + "loss": 1.7172, + "step": 1134 + }, + { + "epoch": 0.040646767060003225, + "grad_norm": 1.7433674335479736, + "learning_rate": 0.00019994066895213857, + "loss": 1.8323, + "step": 1135 + }, + { + "epoch": 0.04068257918957151, + "grad_norm": 1.2644333839416504, + "learning_rate": 0.00019994026878369003, + "loss": 1.8999, + "step": 1136 + }, + { + "epoch": 0.04071839131913979, + "grad_norm": 1.5659446716308594, + "learning_rate": 0.00019993986727067414, + "loss": 1.7909, + "step": 1137 + }, + { + "epoch": 0.04075420344870808, + "grad_norm": 2.269122362136841, + "learning_rate": 0.0001999394644130962, + "loss": 2.0758, + "step": 1138 + }, + { + "epoch": 0.04079001557827636, + "grad_norm": 1.7199034690856934, + "learning_rate": 0.00019993906021096168, + "loss": 1.8833, + "step": 1139 + }, + { + "epoch": 0.040825827707844646, + "grad_norm": 1.6728589534759521, + "learning_rate": 0.00019993865466427603, + "loss": 1.8298, + "step": 1140 + }, + { + "epoch": 0.04086163983741293, + "grad_norm": 2.495798349380493, + "learning_rate": 0.00019993824777304469, + "loss": 1.639, + "step": 1141 + }, + { + "epoch": 0.04089745196698122, + "grad_norm": 1.8760560750961304, + "learning_rate": 0.0001999378395372731, + "loss": 1.5388, + "step": 1142 + }, + { + "epoch": 0.0409332640965495, + "grad_norm": 1.9759643077850342, + "learning_rate": 0.00019993742995696686, + "loss": 2.1794, + "step": 1143 + }, + { + "epoch": 0.040969076226117784, + "grad_norm": 1.5241389274597168, + "learning_rate": 0.0001999370190321314, + "loss": 1.9345, + "step": 1144 + }, + { + "epoch": 0.041004888355686074, + "grad_norm": 1.6700853109359741, + "learning_rate": 0.0001999366067627722, + "loss": 1.6506, + "step": 1145 + }, + { + "epoch": 0.04104070048525436, + "grad_norm": 1.6093263626098633, + "learning_rate": 0.0001999361931488949, + "loss": 1.9827, + "step": 1146 + }, + { + "epoch": 0.04107651261482264, + "grad_norm": 1.726548194885254, + "learning_rate": 0.00019993577819050505, + "loss": 1.9425, + "step": 1147 + }, + { + "epoch": 0.04111232474439092, + "grad_norm": 1.4879534244537354, + "learning_rate": 0.00019993536188760817, + "loss": 1.8571, + "step": 1148 + }, + { + "epoch": 0.04114813687395921, + "grad_norm": 2.0006051063537598, + "learning_rate": 0.00019993494424020992, + "loss": 1.7725, + "step": 1149 + }, + { + "epoch": 0.041183949003527495, + "grad_norm": 1.6221108436584473, + "learning_rate": 0.00019993452524831592, + "loss": 1.8815, + "step": 1150 + }, + { + "epoch": 0.04121976113309578, + "grad_norm": 1.5773656368255615, + "learning_rate": 0.0001999341049119318, + "loss": 1.7668, + "step": 1151 + }, + { + "epoch": 0.04125557326266406, + "grad_norm": 1.7638592720031738, + "learning_rate": 0.00019993368323106315, + "loss": 1.9433, + "step": 1152 + }, + { + "epoch": 0.04129138539223235, + "grad_norm": 1.4809341430664062, + "learning_rate": 0.0001999332602057157, + "loss": 1.6965, + "step": 1153 + }, + { + "epoch": 0.041327197521800633, + "grad_norm": 1.7813769578933716, + "learning_rate": 0.0001999328358358952, + "loss": 1.8184, + "step": 1154 + }, + { + "epoch": 0.041363009651368916, + "grad_norm": 1.9684855937957764, + "learning_rate": 0.00019993241012160727, + "loss": 1.9786, + "step": 1155 + }, + { + "epoch": 0.041398821780937206, + "grad_norm": 1.7000457048416138, + "learning_rate": 0.00019993198306285766, + "loss": 1.742, + "step": 1156 + }, + { + "epoch": 0.04143463391050549, + "grad_norm": 1.2915273904800415, + "learning_rate": 0.0001999315546596521, + "loss": 1.774, + "step": 1157 + }, + { + "epoch": 0.04147044604007377, + "grad_norm": 1.3124654293060303, + "learning_rate": 0.0001999311249119964, + "loss": 1.9188, + "step": 1158 + }, + { + "epoch": 0.041506258169642055, + "grad_norm": 1.339920997619629, + "learning_rate": 0.0001999306938198963, + "loss": 1.9007, + "step": 1159 + }, + { + "epoch": 0.041542070299210344, + "grad_norm": 1.8238441944122314, + "learning_rate": 0.00019993026138335763, + "loss": 2.3424, + "step": 1160 + }, + { + "epoch": 0.04157788242877863, + "grad_norm": 1.7708998918533325, + "learning_rate": 0.0001999298276023862, + "loss": 2.0156, + "step": 1161 + }, + { + "epoch": 0.04161369455834691, + "grad_norm": 1.3194271326065063, + "learning_rate": 0.00019992939247698784, + "loss": 1.7528, + "step": 1162 + }, + { + "epoch": 0.0416495066879152, + "grad_norm": 1.9072855710983276, + "learning_rate": 0.00019992895600716838, + "loss": 1.944, + "step": 1163 + }, + { + "epoch": 0.04168531881748348, + "grad_norm": 1.2703431844711304, + "learning_rate": 0.00019992851819293373, + "loss": 1.8563, + "step": 1164 + }, + { + "epoch": 0.041721130947051766, + "grad_norm": 1.5858049392700195, + "learning_rate": 0.00019992807903428976, + "loss": 2.0275, + "step": 1165 + }, + { + "epoch": 0.04175694307662005, + "grad_norm": 1.2060877084732056, + "learning_rate": 0.0001999276385312424, + "loss": 1.811, + "step": 1166 + }, + { + "epoch": 0.04179275520618834, + "grad_norm": 1.198927640914917, + "learning_rate": 0.00019992719668379753, + "loss": 1.9143, + "step": 1167 + }, + { + "epoch": 0.04182856733575662, + "grad_norm": 2.0849688053131104, + "learning_rate": 0.00019992675349196114, + "loss": 2.4221, + "step": 1168 + }, + { + "epoch": 0.041864379465324904, + "grad_norm": 1.9343287944793701, + "learning_rate": 0.0001999263089557392, + "loss": 2.047, + "step": 1169 + }, + { + "epoch": 0.041900191594893194, + "grad_norm": 1.5256718397140503, + "learning_rate": 0.00019992586307513767, + "loss": 2.1052, + "step": 1170 + }, + { + "epoch": 0.041936003724461476, + "grad_norm": 1.3801934719085693, + "learning_rate": 0.00019992541585016254, + "loss": 1.6852, + "step": 1171 + }, + { + "epoch": 0.04197181585402976, + "grad_norm": 1.4266293048858643, + "learning_rate": 0.0001999249672808198, + "loss": 1.9, + "step": 1172 + }, + { + "epoch": 0.04200762798359804, + "grad_norm": 1.2615375518798828, + "learning_rate": 0.00019992451736711554, + "loss": 2.0813, + "step": 1173 + }, + { + "epoch": 0.04204344011316633, + "grad_norm": 1.4383363723754883, + "learning_rate": 0.00019992406610905582, + "loss": 1.6228, + "step": 1174 + }, + { + "epoch": 0.042079252242734615, + "grad_norm": 1.2527996301651, + "learning_rate": 0.00019992361350664663, + "loss": 1.8077, + "step": 1175 + }, + { + "epoch": 0.0421150643723029, + "grad_norm": 1.6648797988891602, + "learning_rate": 0.00019992315955989415, + "loss": 2.051, + "step": 1176 + }, + { + "epoch": 0.04215087650187118, + "grad_norm": 1.7546112537384033, + "learning_rate": 0.00019992270426880446, + "loss": 1.871, + "step": 1177 + }, + { + "epoch": 0.04218668863143947, + "grad_norm": 1.7627754211425781, + "learning_rate": 0.00019992224763338366, + "loss": 1.7231, + "step": 1178 + }, + { + "epoch": 0.04222250076100775, + "grad_norm": 1.8482389450073242, + "learning_rate": 0.00019992178965363787, + "loss": 1.698, + "step": 1179 + }, + { + "epoch": 0.042258312890576036, + "grad_norm": 1.2556909322738647, + "learning_rate": 0.00019992133032957336, + "loss": 1.9038, + "step": 1180 + }, + { + "epoch": 0.042294125020144326, + "grad_norm": 1.856414794921875, + "learning_rate": 0.0001999208696611962, + "loss": 1.7274, + "step": 1181 + }, + { + "epoch": 0.04232993714971261, + "grad_norm": 1.5887939929962158, + "learning_rate": 0.00019992040764851263, + "loss": 1.7081, + "step": 1182 + }, + { + "epoch": 0.04236574927928089, + "grad_norm": 1.9165586233139038, + "learning_rate": 0.00019991994429152888, + "loss": 1.8554, + "step": 1183 + }, + { + "epoch": 0.042401561408849174, + "grad_norm": 2.281977653503418, + "learning_rate": 0.00019991947959025112, + "loss": 2.5105, + "step": 1184 + }, + { + "epoch": 0.042437373538417464, + "grad_norm": 1.645222544670105, + "learning_rate": 0.0001999190135446857, + "loss": 2.0212, + "step": 1185 + }, + { + "epoch": 0.04247318566798575, + "grad_norm": 1.8959003686904907, + "learning_rate": 0.00019991854615483882, + "loss": 2.2363, + "step": 1186 + }, + { + "epoch": 0.04250899779755403, + "grad_norm": 1.5478854179382324, + "learning_rate": 0.00019991807742071678, + "loss": 1.9947, + "step": 1187 + }, + { + "epoch": 0.04254480992712232, + "grad_norm": 1.8765732049942017, + "learning_rate": 0.0001999176073423259, + "loss": 1.7691, + "step": 1188 + }, + { + "epoch": 0.0425806220566906, + "grad_norm": 1.8564085960388184, + "learning_rate": 0.00019991713591967252, + "loss": 1.7588, + "step": 1189 + }, + { + "epoch": 0.042616434186258885, + "grad_norm": 1.8056801557540894, + "learning_rate": 0.00019991666315276292, + "loss": 2.027, + "step": 1190 + }, + { + "epoch": 0.04265224631582717, + "grad_norm": 1.8482861518859863, + "learning_rate": 0.0001999161890416035, + "loss": 1.8143, + "step": 1191 + }, + { + "epoch": 0.04268805844539546, + "grad_norm": 1.6742010116577148, + "learning_rate": 0.00019991571358620068, + "loss": 2.162, + "step": 1192 + }, + { + "epoch": 0.04272387057496374, + "grad_norm": 1.4236180782318115, + "learning_rate": 0.0001999152367865608, + "loss": 1.8787, + "step": 1193 + }, + { + "epoch": 0.04275968270453202, + "grad_norm": 1.2990878820419312, + "learning_rate": 0.0001999147586426903, + "loss": 1.8382, + "step": 1194 + }, + { + "epoch": 0.04279549483410031, + "grad_norm": 1.8911601305007935, + "learning_rate": 0.00019991427915459558, + "loss": 1.8486, + "step": 1195 + }, + { + "epoch": 0.042831306963668596, + "grad_norm": 1.2889195680618286, + "learning_rate": 0.0001999137983222831, + "loss": 1.9353, + "step": 1196 + }, + { + "epoch": 0.04286711909323688, + "grad_norm": 1.7363015413284302, + "learning_rate": 0.0001999133161457594, + "loss": 1.7446, + "step": 1197 + }, + { + "epoch": 0.04290293122280516, + "grad_norm": 1.3156371116638184, + "learning_rate": 0.00019991283262503083, + "loss": 1.9038, + "step": 1198 + }, + { + "epoch": 0.04293874335237345, + "grad_norm": 1.2407479286193848, + "learning_rate": 0.00019991234776010406, + "loss": 1.8502, + "step": 1199 + }, + { + "epoch": 0.042974555481941734, + "grad_norm": 1.3936583995819092, + "learning_rate": 0.0001999118615509855, + "loss": 1.5118, + "step": 1200 + }, + { + "epoch": 0.04301036761151002, + "grad_norm": 1.2559468746185303, + "learning_rate": 0.00019991137399768166, + "loss": 1.9056, + "step": 1201 + }, + { + "epoch": 0.0430461797410783, + "grad_norm": 2.0051515102386475, + "learning_rate": 0.00019991088510019924, + "loss": 1.8493, + "step": 1202 + }, + { + "epoch": 0.04308199187064659, + "grad_norm": 1.2575451135635376, + "learning_rate": 0.0001999103948585447, + "loss": 1.4424, + "step": 1203 + }, + { + "epoch": 0.04311780400021487, + "grad_norm": 1.596622109413147, + "learning_rate": 0.00019990990327272467, + "loss": 2.141, + "step": 1204 + }, + { + "epoch": 0.043153616129783155, + "grad_norm": 1.6816015243530273, + "learning_rate": 0.00019990941034274577, + "loss": 1.962, + "step": 1205 + }, + { + "epoch": 0.043189428259351445, + "grad_norm": 1.785096287727356, + "learning_rate": 0.00019990891606861463, + "loss": 1.5008, + "step": 1206 + }, + { + "epoch": 0.04322524038891973, + "grad_norm": 1.742447853088379, + "learning_rate": 0.0001999084204503379, + "loss": 1.8909, + "step": 1207 + }, + { + "epoch": 0.04326105251848801, + "grad_norm": 1.6396584510803223, + "learning_rate": 0.00019990792348792224, + "loss": 1.8476, + "step": 1208 + }, + { + "epoch": 0.043296864648056294, + "grad_norm": 1.9370383024215698, + "learning_rate": 0.00019990742518137436, + "loss": 2.1081, + "step": 1209 + }, + { + "epoch": 0.04333267677762458, + "grad_norm": 2.301330089569092, + "learning_rate": 0.00019990692553070093, + "loss": 2.0177, + "step": 1210 + }, + { + "epoch": 0.043368488907192866, + "grad_norm": 1.2723262310028076, + "learning_rate": 0.0001999064245359087, + "loss": 1.6058, + "step": 1211 + }, + { + "epoch": 0.04340430103676115, + "grad_norm": 1.3464289903640747, + "learning_rate": 0.00019990592219700437, + "loss": 1.7071, + "step": 1212 + }, + { + "epoch": 0.04344011316632944, + "grad_norm": 1.548187017440796, + "learning_rate": 0.00019990541851399476, + "loss": 1.9317, + "step": 1213 + }, + { + "epoch": 0.04347592529589772, + "grad_norm": 1.9910813570022583, + "learning_rate": 0.00019990491348688657, + "loss": 2.1553, + "step": 1214 + }, + { + "epoch": 0.043511737425466004, + "grad_norm": 1.2897824048995972, + "learning_rate": 0.00019990440711568666, + "loss": 1.831, + "step": 1215 + }, + { + "epoch": 0.04354754955503429, + "grad_norm": 1.9216833114624023, + "learning_rate": 0.00019990389940040184, + "loss": 1.9973, + "step": 1216 + }, + { + "epoch": 0.04358336168460258, + "grad_norm": 2.3305580615997314, + "learning_rate": 0.0001999033903410389, + "loss": 1.934, + "step": 1217 + }, + { + "epoch": 0.04361917381417086, + "grad_norm": 1.5975700616836548, + "learning_rate": 0.00019990287993760473, + "loss": 1.829, + "step": 1218 + }, + { + "epoch": 0.04365498594373914, + "grad_norm": 1.2177056074142456, + "learning_rate": 0.00019990236819010615, + "loss": 1.7145, + "step": 1219 + }, + { + "epoch": 0.04369079807330743, + "grad_norm": 1.501842737197876, + "learning_rate": 0.0001999018550985501, + "loss": 1.9816, + "step": 1220 + }, + { + "epoch": 0.043726610202875715, + "grad_norm": 2.3533644676208496, + "learning_rate": 0.00019990134066294338, + "loss": 2.2937, + "step": 1221 + }, + { + "epoch": 0.043762422332444, + "grad_norm": 1.9165318012237549, + "learning_rate": 0.00019990082488329308, + "loss": 2.0315, + "step": 1222 + }, + { + "epoch": 0.04379823446201228, + "grad_norm": 1.4523035287857056, + "learning_rate": 0.000199900307759606, + "loss": 2.2864, + "step": 1223 + }, + { + "epoch": 0.04383404659158057, + "grad_norm": 1.336516261100769, + "learning_rate": 0.00019989978929188914, + "loss": 1.9243, + "step": 1224 + }, + { + "epoch": 0.043869858721148854, + "grad_norm": 1.738339900970459, + "learning_rate": 0.00019989926948014945, + "loss": 1.7763, + "step": 1225 + }, + { + "epoch": 0.043905670850717136, + "grad_norm": 2.4466097354888916, + "learning_rate": 0.000199898748324394, + "loss": 2.1759, + "step": 1226 + }, + { + "epoch": 0.04394148298028542, + "grad_norm": 1.6366623640060425, + "learning_rate": 0.00019989822582462972, + "loss": 1.8826, + "step": 1227 + }, + { + "epoch": 0.04397729510985371, + "grad_norm": 1.563489556312561, + "learning_rate": 0.00019989770198086367, + "loss": 1.6678, + "step": 1228 + }, + { + "epoch": 0.04401310723942199, + "grad_norm": 1.8374176025390625, + "learning_rate": 0.0001998971767931029, + "loss": 1.7684, + "step": 1229 + }, + { + "epoch": 0.044048919368990275, + "grad_norm": 1.9586941003799438, + "learning_rate": 0.0001998966502613545, + "loss": 2.132, + "step": 1230 + }, + { + "epoch": 0.044084731498558564, + "grad_norm": 1.2699816226959229, + "learning_rate": 0.0001998961223856255, + "loss": 1.9222, + "step": 1231 + }, + { + "epoch": 0.04412054362812685, + "grad_norm": 1.6662757396697998, + "learning_rate": 0.00019989559316592305, + "loss": 1.6608, + "step": 1232 + }, + { + "epoch": 0.04415635575769513, + "grad_norm": 1.3291850090026855, + "learning_rate": 0.00019989506260225426, + "loss": 2.0618, + "step": 1233 + }, + { + "epoch": 0.04419216788726341, + "grad_norm": 2.277132272720337, + "learning_rate": 0.00019989453069462623, + "loss": 1.7717, + "step": 1234 + }, + { + "epoch": 0.0442279800168317, + "grad_norm": 1.6302454471588135, + "learning_rate": 0.00019989399744304616, + "loss": 2.0715, + "step": 1235 + }, + { + "epoch": 0.044263792146399986, + "grad_norm": 1.285477876663208, + "learning_rate": 0.0001998934628475212, + "loss": 1.5437, + "step": 1236 + }, + { + "epoch": 0.04429960427596827, + "grad_norm": 2.632361888885498, + "learning_rate": 0.00019989292690805854, + "loss": 1.8132, + "step": 1237 + }, + { + "epoch": 0.04433541640553656, + "grad_norm": 1.5045816898345947, + "learning_rate": 0.00019989238962466542, + "loss": 1.8698, + "step": 1238 + }, + { + "epoch": 0.04437122853510484, + "grad_norm": 1.6610292196273804, + "learning_rate": 0.00019989185099734903, + "loss": 1.8337, + "step": 1239 + }, + { + "epoch": 0.044407040664673124, + "grad_norm": 1.562566876411438, + "learning_rate": 0.00019989131102611667, + "loss": 1.9222, + "step": 1240 + }, + { + "epoch": 0.04444285279424141, + "grad_norm": 2.671149969100952, + "learning_rate": 0.00019989076971097555, + "loss": 1.8991, + "step": 1241 + }, + { + "epoch": 0.044478664923809696, + "grad_norm": 2.1588051319122314, + "learning_rate": 0.00019989022705193299, + "loss": 1.9221, + "step": 1242 + }, + { + "epoch": 0.04451447705337798, + "grad_norm": 1.949323296546936, + "learning_rate": 0.00019988968304899624, + "loss": 2.0363, + "step": 1243 + }, + { + "epoch": 0.04455028918294626, + "grad_norm": 1.7885140180587769, + "learning_rate": 0.00019988913770217269, + "loss": 1.8394, + "step": 1244 + }, + { + "epoch": 0.04458610131251455, + "grad_norm": 1.935902714729309, + "learning_rate": 0.00019988859101146962, + "loss": 2.0142, + "step": 1245 + }, + { + "epoch": 0.044621913442082835, + "grad_norm": 1.9690736532211304, + "learning_rate": 0.00019988804297689438, + "loss": 1.8359, + "step": 1246 + }, + { + "epoch": 0.04465772557165112, + "grad_norm": 1.5136144161224365, + "learning_rate": 0.0001998874935984544, + "loss": 1.9559, + "step": 1247 + }, + { + "epoch": 0.0446935377012194, + "grad_norm": 1.742443561553955, + "learning_rate": 0.00019988694287615704, + "loss": 1.9451, + "step": 1248 + }, + { + "epoch": 0.04472934983078769, + "grad_norm": 1.5419561862945557, + "learning_rate": 0.0001998863908100097, + "loss": 1.7377, + "step": 1249 + }, + { + "epoch": 0.04476516196035597, + "grad_norm": 1.9197380542755127, + "learning_rate": 0.00019988583740001984, + "loss": 1.7331, + "step": 1250 + }, + { + "epoch": 0.044800974089924256, + "grad_norm": 2.089766502380371, + "learning_rate": 0.00019988528264619485, + "loss": 1.8769, + "step": 1251 + }, + { + "epoch": 0.04483678621949254, + "grad_norm": 1.1897839307785034, + "learning_rate": 0.00019988472654854222, + "loss": 1.9805, + "step": 1252 + }, + { + "epoch": 0.04487259834906083, + "grad_norm": 2.2890515327453613, + "learning_rate": 0.00019988416910706947, + "loss": 2.2311, + "step": 1253 + }, + { + "epoch": 0.04490841047862911, + "grad_norm": 1.2708755731582642, + "learning_rate": 0.00019988361032178403, + "loss": 1.7924, + "step": 1254 + }, + { + "epoch": 0.044944222608197394, + "grad_norm": 1.8061339855194092, + "learning_rate": 0.00019988305019269346, + "loss": 1.7318, + "step": 1255 + }, + { + "epoch": 0.044980034737765684, + "grad_norm": 1.5179500579833984, + "learning_rate": 0.00019988248871980532, + "loss": 1.6443, + "step": 1256 + }, + { + "epoch": 0.04501584686733397, + "grad_norm": 1.4282045364379883, + "learning_rate": 0.0001998819259031271, + "loss": 1.8233, + "step": 1257 + }, + { + "epoch": 0.04505165899690225, + "grad_norm": 1.1604241132736206, + "learning_rate": 0.00019988136174266643, + "loss": 1.8747, + "step": 1258 + }, + { + "epoch": 0.04508747112647053, + "grad_norm": 2.2839112281799316, + "learning_rate": 0.00019988079623843087, + "loss": 2.0667, + "step": 1259 + }, + { + "epoch": 0.04512328325603882, + "grad_norm": 2.5396506786346436, + "learning_rate": 0.000199880229390428, + "loss": 1.9539, + "step": 1260 + }, + { + "epoch": 0.045159095385607105, + "grad_norm": 1.8665293455123901, + "learning_rate": 0.00019987966119866554, + "loss": 1.7051, + "step": 1261 + }, + { + "epoch": 0.04519490751517539, + "grad_norm": 2.028208017349243, + "learning_rate": 0.00019987909166315103, + "loss": 1.7659, + "step": 1262 + }, + { + "epoch": 0.04523071964474368, + "grad_norm": 2.1394529342651367, + "learning_rate": 0.00019987852078389218, + "loss": 1.6273, + "step": 1263 + }, + { + "epoch": 0.04526653177431196, + "grad_norm": 2.826866626739502, + "learning_rate": 0.00019987794856089668, + "loss": 2.0664, + "step": 1264 + }, + { + "epoch": 0.04530234390388024, + "grad_norm": 1.5801968574523926, + "learning_rate": 0.0001998773749941722, + "loss": 2.1104, + "step": 1265 + }, + { + "epoch": 0.045338156033448526, + "grad_norm": 1.9282563924789429, + "learning_rate": 0.00019987680008372647, + "loss": 1.9696, + "step": 1266 + }, + { + "epoch": 0.045373968163016816, + "grad_norm": 2.0032389163970947, + "learning_rate": 0.00019987622382956722, + "loss": 1.8998, + "step": 1267 + }, + { + "epoch": 0.0454097802925851, + "grad_norm": 1.5961376428604126, + "learning_rate": 0.00019987564623170226, + "loss": 1.9436, + "step": 1268 + }, + { + "epoch": 0.04544559242215338, + "grad_norm": 1.54341721534729, + "learning_rate": 0.00019987506729013927, + "loss": 1.8716, + "step": 1269 + }, + { + "epoch": 0.04548140455172167, + "grad_norm": 2.1734094619750977, + "learning_rate": 0.0001998744870048861, + "loss": 1.7878, + "step": 1270 + }, + { + "epoch": 0.045517216681289954, + "grad_norm": 1.2922240495681763, + "learning_rate": 0.0001998739053759505, + "loss": 1.7814, + "step": 1271 + }, + { + "epoch": 0.04555302881085824, + "grad_norm": 1.7018311023712158, + "learning_rate": 0.00019987332240334037, + "loss": 1.9387, + "step": 1272 + }, + { + "epoch": 0.04558884094042652, + "grad_norm": 1.7782323360443115, + "learning_rate": 0.00019987273808706347, + "loss": 1.7283, + "step": 1273 + }, + { + "epoch": 0.04562465306999481, + "grad_norm": 1.5118334293365479, + "learning_rate": 0.00019987215242712775, + "loss": 1.7824, + "step": 1274 + }, + { + "epoch": 0.04566046519956309, + "grad_norm": 1.525036096572876, + "learning_rate": 0.00019987156542354103, + "loss": 1.7238, + "step": 1275 + }, + { + "epoch": 0.045696277329131375, + "grad_norm": 1.968327522277832, + "learning_rate": 0.00019987097707631124, + "loss": 2.0203, + "step": 1276 + }, + { + "epoch": 0.04573208945869966, + "grad_norm": 1.6078931093215942, + "learning_rate": 0.00019987038738544625, + "loss": 1.4283, + "step": 1277 + }, + { + "epoch": 0.04576790158826795, + "grad_norm": 1.5513797998428345, + "learning_rate": 0.00019986979635095402, + "loss": 1.811, + "step": 1278 + }, + { + "epoch": 0.04580371371783623, + "grad_norm": 1.5902775526046753, + "learning_rate": 0.00019986920397284253, + "loss": 1.8791, + "step": 1279 + }, + { + "epoch": 0.045839525847404514, + "grad_norm": 1.7717511653900146, + "learning_rate": 0.0001998686102511197, + "loss": 2.1192, + "step": 1280 + }, + { + "epoch": 0.0458753379769728, + "grad_norm": 1.5827869176864624, + "learning_rate": 0.00019986801518579353, + "loss": 1.8253, + "step": 1281 + }, + { + "epoch": 0.045911150106541086, + "grad_norm": 2.1853511333465576, + "learning_rate": 0.00019986741877687207, + "loss": 1.8936, + "step": 1282 + }, + { + "epoch": 0.04594696223610937, + "grad_norm": 2.5177838802337646, + "learning_rate": 0.00019986682102436328, + "loss": 1.8655, + "step": 1283 + }, + { + "epoch": 0.04598277436567765, + "grad_norm": 1.5945955514907837, + "learning_rate": 0.00019986622192827525, + "loss": 2.0518, + "step": 1284 + }, + { + "epoch": 0.04601858649524594, + "grad_norm": 1.603678584098816, + "learning_rate": 0.000199865621488616, + "loss": 2.0015, + "step": 1285 + }, + { + "epoch": 0.046054398624814225, + "grad_norm": 1.593038558959961, + "learning_rate": 0.00019986501970539367, + "loss": 1.9955, + "step": 1286 + }, + { + "epoch": 0.04609021075438251, + "grad_norm": 1.295050024986267, + "learning_rate": 0.0001998644165786163, + "loss": 1.8378, + "step": 1287 + }, + { + "epoch": 0.0461260228839508, + "grad_norm": 1.8528512716293335, + "learning_rate": 0.00019986381210829199, + "loss": 2.0828, + "step": 1288 + }, + { + "epoch": 0.04616183501351908, + "grad_norm": 1.5784971714019775, + "learning_rate": 0.00019986320629442893, + "loss": 1.728, + "step": 1289 + }, + { + "epoch": 0.04619764714308736, + "grad_norm": 1.469783902168274, + "learning_rate": 0.00019986259913703526, + "loss": 1.6864, + "step": 1290 + }, + { + "epoch": 0.046233459272655646, + "grad_norm": 1.7400978803634644, + "learning_rate": 0.00019986199063611913, + "loss": 1.7075, + "step": 1291 + }, + { + "epoch": 0.046269271402223935, + "grad_norm": 1.7644522190093994, + "learning_rate": 0.0001998613807916887, + "loss": 2.161, + "step": 1292 + }, + { + "epoch": 0.04630508353179222, + "grad_norm": 2.0017952919006348, + "learning_rate": 0.00019986076960375223, + "loss": 1.9667, + "step": 1293 + }, + { + "epoch": 0.0463408956613605, + "grad_norm": 1.2968863248825073, + "learning_rate": 0.00019986015707231788, + "loss": 1.8902, + "step": 1294 + }, + { + "epoch": 0.04637670779092879, + "grad_norm": 1.572568416595459, + "learning_rate": 0.00019985954319739392, + "loss": 1.9558, + "step": 1295 + }, + { + "epoch": 0.046412519920497074, + "grad_norm": 1.356263518333435, + "learning_rate": 0.00019985892797898865, + "loss": 1.8009, + "step": 1296 + }, + { + "epoch": 0.04644833205006536, + "grad_norm": 1.2009116411209106, + "learning_rate": 0.00019985831141711033, + "loss": 1.8879, + "step": 1297 + }, + { + "epoch": 0.04648414417963364, + "grad_norm": 1.1283270120620728, + "learning_rate": 0.00019985769351176723, + "loss": 1.7523, + "step": 1298 + }, + { + "epoch": 0.04651995630920193, + "grad_norm": 1.6041669845581055, + "learning_rate": 0.00019985707426296764, + "loss": 1.823, + "step": 1299 + }, + { + "epoch": 0.04655576843877021, + "grad_norm": 1.4035066366195679, + "learning_rate": 0.00019985645367071993, + "loss": 1.6812, + "step": 1300 + }, + { + "epoch": 0.046591580568338495, + "grad_norm": 1.4297456741333008, + "learning_rate": 0.00019985583173503244, + "loss": 1.7875, + "step": 1301 + }, + { + "epoch": 0.04662739269790678, + "grad_norm": 1.2370003461837769, + "learning_rate": 0.00019985520845591356, + "loss": 1.7563, + "step": 1302 + }, + { + "epoch": 0.04666320482747507, + "grad_norm": 1.1710233688354492, + "learning_rate": 0.00019985458383337164, + "loss": 1.467, + "step": 1303 + }, + { + "epoch": 0.04669901695704335, + "grad_norm": 1.7941819429397583, + "learning_rate": 0.0001998539578674151, + "loss": 1.7955, + "step": 1304 + }, + { + "epoch": 0.04673482908661163, + "grad_norm": 1.6461565494537354, + "learning_rate": 0.00019985333055805236, + "loss": 1.6701, + "step": 1305 + }, + { + "epoch": 0.04677064121617992, + "grad_norm": 1.215015172958374, + "learning_rate": 0.00019985270190529187, + "loss": 1.6302, + "step": 1306 + }, + { + "epoch": 0.046806453345748206, + "grad_norm": 3.887476921081543, + "learning_rate": 0.00019985207190914206, + "loss": 1.687, + "step": 1307 + }, + { + "epoch": 0.04684226547531649, + "grad_norm": 1.9764825105667114, + "learning_rate": 0.00019985144056961141, + "loss": 2.1641, + "step": 1308 + }, + { + "epoch": 0.04687807760488477, + "grad_norm": 1.4992644786834717, + "learning_rate": 0.00019985080788670847, + "loss": 2.0364, + "step": 1309 + }, + { + "epoch": 0.04691388973445306, + "grad_norm": 2.266002655029297, + "learning_rate": 0.00019985017386044167, + "loss": 1.8181, + "step": 1310 + }, + { + "epoch": 0.046949701864021344, + "grad_norm": 2.006157636642456, + "learning_rate": 0.00019984953849081958, + "loss": 1.9546, + "step": 1311 + }, + { + "epoch": 0.04698551399358963, + "grad_norm": 1.8313133716583252, + "learning_rate": 0.00019984890177785077, + "loss": 1.9454, + "step": 1312 + }, + { + "epoch": 0.04702132612315792, + "grad_norm": 2.395103931427002, + "learning_rate": 0.00019984826372154374, + "loss": 1.9007, + "step": 1313 + }, + { + "epoch": 0.0470571382527262, + "grad_norm": 1.6291462182998657, + "learning_rate": 0.00019984762432190717, + "loss": 1.7289, + "step": 1314 + }, + { + "epoch": 0.04709295038229448, + "grad_norm": 1.8135216236114502, + "learning_rate": 0.00019984698357894957, + "loss": 2.0032, + "step": 1315 + }, + { + "epoch": 0.047128762511862765, + "grad_norm": 1.8101180791854858, + "learning_rate": 0.00019984634149267962, + "loss": 1.8555, + "step": 1316 + }, + { + "epoch": 0.047164574641431055, + "grad_norm": 1.441382646560669, + "learning_rate": 0.00019984569806310592, + "loss": 1.596, + "step": 1317 + }, + { + "epoch": 0.04720038677099934, + "grad_norm": 1.289527416229248, + "learning_rate": 0.00019984505329023717, + "loss": 1.6556, + "step": 1318 + }, + { + "epoch": 0.04723619890056762, + "grad_norm": 2.152682304382324, + "learning_rate": 0.000199844407174082, + "loss": 2.0037, + "step": 1319 + }, + { + "epoch": 0.04727201103013591, + "grad_norm": 1.3137688636779785, + "learning_rate": 0.00019984375971464913, + "loss": 1.4988, + "step": 1320 + }, + { + "epoch": 0.04730782315970419, + "grad_norm": 2.3812808990478516, + "learning_rate": 0.00019984311091194725, + "loss": 2.0996, + "step": 1321 + }, + { + "epoch": 0.047343635289272476, + "grad_norm": 1.7241475582122803, + "learning_rate": 0.0001998424607659851, + "loss": 1.7122, + "step": 1322 + }, + { + "epoch": 0.04737944741884076, + "grad_norm": 1.3988949060440063, + "learning_rate": 0.00019984180927677146, + "loss": 1.6472, + "step": 1323 + }, + { + "epoch": 0.04741525954840905, + "grad_norm": 1.1787669658660889, + "learning_rate": 0.00019984115644431502, + "loss": 1.76, + "step": 1324 + }, + { + "epoch": 0.04745107167797733, + "grad_norm": 2.1474668979644775, + "learning_rate": 0.00019984050226862462, + "loss": 1.9005, + "step": 1325 + }, + { + "epoch": 0.047486883807545614, + "grad_norm": 1.143454670906067, + "learning_rate": 0.00019983984674970905, + "loss": 1.629, + "step": 1326 + }, + { + "epoch": 0.0475226959371139, + "grad_norm": 1.7645509243011475, + "learning_rate": 0.00019983918988757715, + "loss": 2.044, + "step": 1327 + }, + { + "epoch": 0.04755850806668219, + "grad_norm": 1.4978320598602295, + "learning_rate": 0.0001998385316822377, + "loss": 1.9889, + "step": 1328 + }, + { + "epoch": 0.04759432019625047, + "grad_norm": 1.7313258647918701, + "learning_rate": 0.0001998378721336996, + "loss": 2.0302, + "step": 1329 + }, + { + "epoch": 0.04763013232581875, + "grad_norm": 2.19752836227417, + "learning_rate": 0.0001998372112419717, + "loss": 2.1565, + "step": 1330 + }, + { + "epoch": 0.04766594445538704, + "grad_norm": 1.5281023979187012, + "learning_rate": 0.00019983654900706293, + "loss": 1.7766, + "step": 1331 + }, + { + "epoch": 0.047701756584955325, + "grad_norm": 1.273146152496338, + "learning_rate": 0.00019983588542898218, + "loss": 1.3964, + "step": 1332 + }, + { + "epoch": 0.04773756871452361, + "grad_norm": 1.923797607421875, + "learning_rate": 0.00019983522050773833, + "loss": 2.0575, + "step": 1333 + }, + { + "epoch": 0.04777338084409189, + "grad_norm": 2.0104284286499023, + "learning_rate": 0.00019983455424334038, + "loss": 2.059, + "step": 1334 + }, + { + "epoch": 0.04780919297366018, + "grad_norm": 1.3276079893112183, + "learning_rate": 0.0001998338866357973, + "loss": 1.7016, + "step": 1335 + }, + { + "epoch": 0.04784500510322846, + "grad_norm": 1.1934891939163208, + "learning_rate": 0.00019983321768511801, + "loss": 1.7879, + "step": 1336 + }, + { + "epoch": 0.047880817232796746, + "grad_norm": 1.8240382671356201, + "learning_rate": 0.00019983254739131158, + "loss": 1.9411, + "step": 1337 + }, + { + "epoch": 0.047916629362365036, + "grad_norm": 1.425972819328308, + "learning_rate": 0.000199831875754387, + "loss": 1.7801, + "step": 1338 + }, + { + "epoch": 0.04795244149193332, + "grad_norm": 1.7534042596817017, + "learning_rate": 0.00019983120277435333, + "loss": 1.6276, + "step": 1339 + }, + { + "epoch": 0.0479882536215016, + "grad_norm": 1.249691128730774, + "learning_rate": 0.00019983052845121954, + "loss": 2.0146, + "step": 1340 + }, + { + "epoch": 0.048024065751069885, + "grad_norm": 1.7718329429626465, + "learning_rate": 0.00019982985278499483, + "loss": 1.8151, + "step": 1341 + }, + { + "epoch": 0.048059877880638174, + "grad_norm": 2.18198561668396, + "learning_rate": 0.0001998291757756882, + "loss": 2.0883, + "step": 1342 + }, + { + "epoch": 0.04809569001020646, + "grad_norm": 1.7033090591430664, + "learning_rate": 0.00019982849742330875, + "loss": 1.8902, + "step": 1343 + }, + { + "epoch": 0.04813150213977474, + "grad_norm": 1.6161448955535889, + "learning_rate": 0.00019982781772786564, + "loss": 1.9358, + "step": 1344 + }, + { + "epoch": 0.04816731426934303, + "grad_norm": 2.100522041320801, + "learning_rate": 0.00019982713668936805, + "loss": 1.798, + "step": 1345 + }, + { + "epoch": 0.04820312639891131, + "grad_norm": 2.4044883251190186, + "learning_rate": 0.00019982645430782506, + "loss": 2.3565, + "step": 1346 + }, + { + "epoch": 0.048238938528479595, + "grad_norm": 1.768221378326416, + "learning_rate": 0.00019982577058324589, + "loss": 1.9625, + "step": 1347 + }, + { + "epoch": 0.04827475065804788, + "grad_norm": 1.4462714195251465, + "learning_rate": 0.00019982508551563978, + "loss": 1.7243, + "step": 1348 + }, + { + "epoch": 0.04831056278761617, + "grad_norm": 2.0050547122955322, + "learning_rate": 0.00019982439910501588, + "loss": 1.57, + "step": 1349 + }, + { + "epoch": 0.04834637491718445, + "grad_norm": 1.5143630504608154, + "learning_rate": 0.0001998237113513835, + "loss": 1.708, + "step": 1350 + }, + { + "epoch": 0.048382187046752734, + "grad_norm": 1.6981946229934692, + "learning_rate": 0.00019982302225475182, + "loss": 2.1118, + "step": 1351 + }, + { + "epoch": 0.04841799917632102, + "grad_norm": 1.5785638093948364, + "learning_rate": 0.0001998223318151301, + "loss": 1.6616, + "step": 1352 + }, + { + "epoch": 0.048453811305889306, + "grad_norm": 1.7291991710662842, + "learning_rate": 0.00019982164003252772, + "loss": 1.8341, + "step": 1353 + }, + { + "epoch": 0.04848962343545759, + "grad_norm": 2.4716439247131348, + "learning_rate": 0.0001998209469069539, + "loss": 2.0056, + "step": 1354 + }, + { + "epoch": 0.04852543556502587, + "grad_norm": 2.480513334274292, + "learning_rate": 0.00019982025243841804, + "loss": 2.1853, + "step": 1355 + }, + { + "epoch": 0.04856124769459416, + "grad_norm": 2.5003104209899902, + "learning_rate": 0.00019981955662692942, + "loss": 2.2649, + "step": 1356 + }, + { + "epoch": 0.048597059824162445, + "grad_norm": 1.948498010635376, + "learning_rate": 0.00019981885947249742, + "loss": 1.8958, + "step": 1357 + }, + { + "epoch": 0.04863287195373073, + "grad_norm": 1.2755658626556396, + "learning_rate": 0.0001998181609751314, + "loss": 1.4372, + "step": 1358 + }, + { + "epoch": 0.04866868408329901, + "grad_norm": 1.3627718687057495, + "learning_rate": 0.00019981746113484082, + "loss": 1.8786, + "step": 1359 + }, + { + "epoch": 0.0487044962128673, + "grad_norm": 1.4305596351623535, + "learning_rate": 0.00019981675995163505, + "loss": 1.7181, + "step": 1360 + }, + { + "epoch": 0.04874030834243558, + "grad_norm": 1.3786169290542603, + "learning_rate": 0.00019981605742552352, + "loss": 1.6552, + "step": 1361 + }, + { + "epoch": 0.048776120472003866, + "grad_norm": 2.100572109222412, + "learning_rate": 0.00019981535355651569, + "loss": 1.9594, + "step": 1362 + }, + { + "epoch": 0.048811932601572156, + "grad_norm": 2.110886573791504, + "learning_rate": 0.00019981464834462103, + "loss": 2.0575, + "step": 1363 + }, + { + "epoch": 0.04884774473114044, + "grad_norm": 1.4668046236038208, + "learning_rate": 0.00019981394178984903, + "loss": 1.7259, + "step": 1364 + }, + { + "epoch": 0.04888355686070872, + "grad_norm": 1.8313469886779785, + "learning_rate": 0.0001998132338922092, + "loss": 1.9957, + "step": 1365 + }, + { + "epoch": 0.048919368990277004, + "grad_norm": 1.4437497854232788, + "learning_rate": 0.00019981252465171102, + "loss": 1.8628, + "step": 1366 + }, + { + "epoch": 0.048955181119845294, + "grad_norm": 1.769682765007019, + "learning_rate": 0.0001998118140683641, + "loss": 1.8354, + "step": 1367 + }, + { + "epoch": 0.04899099324941358, + "grad_norm": 1.4478873014450073, + "learning_rate": 0.00019981110214217798, + "loss": 1.9215, + "step": 1368 + }, + { + "epoch": 0.04902680537898186, + "grad_norm": 1.3279414176940918, + "learning_rate": 0.00019981038887316221, + "loss": 1.6335, + "step": 1369 + }, + { + "epoch": 0.04906261750855015, + "grad_norm": 1.3693251609802246, + "learning_rate": 0.00019980967426132642, + "loss": 1.9246, + "step": 1370 + }, + { + "epoch": 0.04909842963811843, + "grad_norm": 1.9764519929885864, + "learning_rate": 0.0001998089583066802, + "loss": 1.881, + "step": 1371 + }, + { + "epoch": 0.049134241767686715, + "grad_norm": 2.154409646987915, + "learning_rate": 0.00019980824100923318, + "loss": 1.8377, + "step": 1372 + }, + { + "epoch": 0.049170053897255, + "grad_norm": 1.876036524772644, + "learning_rate": 0.00019980752236899502, + "loss": 1.976, + "step": 1373 + }, + { + "epoch": 0.04920586602682329, + "grad_norm": 1.0766030550003052, + "learning_rate": 0.00019980680238597542, + "loss": 1.7499, + "step": 1374 + }, + { + "epoch": 0.04924167815639157, + "grad_norm": 1.5491044521331787, + "learning_rate": 0.000199806081060184, + "loss": 1.9504, + "step": 1375 + }, + { + "epoch": 0.04927749028595985, + "grad_norm": 1.6977920532226562, + "learning_rate": 0.00019980535839163053, + "loss": 1.6755, + "step": 1376 + }, + { + "epoch": 0.049313302415528136, + "grad_norm": 2.227975606918335, + "learning_rate": 0.00019980463438032468, + "loss": 2.0132, + "step": 1377 + }, + { + "epoch": 0.049349114545096426, + "grad_norm": 1.4667011499404907, + "learning_rate": 0.0001998039090262762, + "loss": 1.8883, + "step": 1378 + }, + { + "epoch": 0.04938492667466471, + "grad_norm": 1.5283125638961792, + "learning_rate": 0.0001998031823294949, + "loss": 1.8843, + "step": 1379 + }, + { + "epoch": 0.04942073880423299, + "grad_norm": 1.5257043838500977, + "learning_rate": 0.0001998024542899905, + "loss": 1.8381, + "step": 1380 + }, + { + "epoch": 0.04945655093380128, + "grad_norm": 1.9778072834014893, + "learning_rate": 0.00019980172490777283, + "loss": 1.9028, + "step": 1381 + }, + { + "epoch": 0.049492363063369564, + "grad_norm": 1.3122044801712036, + "learning_rate": 0.00019980099418285166, + "loss": 1.6511, + "step": 1382 + }, + { + "epoch": 0.04952817519293785, + "grad_norm": 1.8901945352554321, + "learning_rate": 0.00019980026211523686, + "loss": 1.6198, + "step": 1383 + }, + { + "epoch": 0.04956398732250613, + "grad_norm": 1.79074227809906, + "learning_rate": 0.00019979952870493824, + "loss": 1.8086, + "step": 1384 + }, + { + "epoch": 0.04959979945207442, + "grad_norm": 1.6266603469848633, + "learning_rate": 0.00019979879395196575, + "loss": 1.6447, + "step": 1385 + }, + { + "epoch": 0.0496356115816427, + "grad_norm": 1.4488394260406494, + "learning_rate": 0.00019979805785632916, + "loss": 2.0551, + "step": 1386 + }, + { + "epoch": 0.049671423711210985, + "grad_norm": 3.32766056060791, + "learning_rate": 0.00019979732041803847, + "loss": 2.1713, + "step": 1387 + }, + { + "epoch": 0.049707235840779275, + "grad_norm": 1.405636191368103, + "learning_rate": 0.00019979658163710355, + "loss": 1.6463, + "step": 1388 + }, + { + "epoch": 0.04974304797034756, + "grad_norm": 1.9788293838500977, + "learning_rate": 0.00019979584151353437, + "loss": 1.9736, + "step": 1389 + }, + { + "epoch": 0.04977886009991584, + "grad_norm": 2.140531539916992, + "learning_rate": 0.00019979510004734083, + "loss": 1.9015, + "step": 1390 + }, + { + "epoch": 0.049814672229484123, + "grad_norm": 2.0656092166900635, + "learning_rate": 0.00019979435723853296, + "loss": 1.8466, + "step": 1391 + }, + { + "epoch": 0.04985048435905241, + "grad_norm": 1.4488940238952637, + "learning_rate": 0.00019979361308712073, + "loss": 1.7686, + "step": 1392 + }, + { + "epoch": 0.049886296488620696, + "grad_norm": 2.0686659812927246, + "learning_rate": 0.00019979286759311423, + "loss": 1.8533, + "step": 1393 + }, + { + "epoch": 0.04992210861818898, + "grad_norm": 2.168595314025879, + "learning_rate": 0.00019979212075652334, + "loss": 2.1408, + "step": 1394 + }, + { + "epoch": 0.04995792074775727, + "grad_norm": 1.4077308177947998, + "learning_rate": 0.00019979137257735823, + "loss": 1.8534, + "step": 1395 + }, + { + "epoch": 0.04999373287732555, + "grad_norm": 1.5167765617370605, + "learning_rate": 0.0001997906230556289, + "loss": 1.7809, + "step": 1396 + }, + { + "epoch": 0.050029545006893834, + "grad_norm": 2.342071771621704, + "learning_rate": 0.00019978987219134545, + "loss": 1.8212, + "step": 1397 + }, + { + "epoch": 0.05006535713646212, + "grad_norm": 1.417253851890564, + "learning_rate": 0.000199789119984518, + "loss": 2.1048, + "step": 1398 + }, + { + "epoch": 0.05010116926603041, + "grad_norm": 1.8908305168151855, + "learning_rate": 0.0001997883664351567, + "loss": 1.951, + "step": 1399 + }, + { + "epoch": 0.05013698139559869, + "grad_norm": 2.8274893760681152, + "learning_rate": 0.00019978761154327158, + "loss": 2.3188, + "step": 1400 + }, + { + "epoch": 0.05017279352516697, + "grad_norm": 1.62851083278656, + "learning_rate": 0.0001997868553088729, + "loss": 1.9725, + "step": 1401 + }, + { + "epoch": 0.050208605654735255, + "grad_norm": 2.5111052989959717, + "learning_rate": 0.00019978609773197082, + "loss": 2.2518, + "step": 1402 + }, + { + "epoch": 0.050244417784303545, + "grad_norm": 1.185157299041748, + "learning_rate": 0.00019978533881257547, + "loss": 1.675, + "step": 1403 + }, + { + "epoch": 0.05028022991387183, + "grad_norm": 1.112566590309143, + "learning_rate": 0.0001997845785506971, + "loss": 1.7595, + "step": 1404 + }, + { + "epoch": 0.05031604204344011, + "grad_norm": 1.807395339012146, + "learning_rate": 0.00019978381694634595, + "loss": 1.9387, + "step": 1405 + }, + { + "epoch": 0.0503518541730084, + "grad_norm": 1.576179027557373, + "learning_rate": 0.00019978305399953228, + "loss": 1.6791, + "step": 1406 + }, + { + "epoch": 0.050387666302576684, + "grad_norm": 1.4238735437393188, + "learning_rate": 0.0001997822897102663, + "loss": 1.8154, + "step": 1407 + }, + { + "epoch": 0.050423478432144966, + "grad_norm": 1.8287601470947266, + "learning_rate": 0.00019978152407855833, + "loss": 1.8966, + "step": 1408 + }, + { + "epoch": 0.05045929056171325, + "grad_norm": 1.644075632095337, + "learning_rate": 0.00019978075710441867, + "loss": 1.8428, + "step": 1409 + }, + { + "epoch": 0.05049510269128154, + "grad_norm": 1.835912823677063, + "learning_rate": 0.0001997799887878576, + "loss": 1.8366, + "step": 1410 + }, + { + "epoch": 0.05053091482084982, + "grad_norm": 1.9325989484786987, + "learning_rate": 0.0001997792191288855, + "loss": 1.534, + "step": 1411 + }, + { + "epoch": 0.050566726950418105, + "grad_norm": 1.7003639936447144, + "learning_rate": 0.00019977844812751273, + "loss": 1.8233, + "step": 1412 + }, + { + "epoch": 0.050602539079986394, + "grad_norm": 1.4617531299591064, + "learning_rate": 0.00019977767578374965, + "loss": 1.8938, + "step": 1413 + }, + { + "epoch": 0.05063835120955468, + "grad_norm": 1.3252395391464233, + "learning_rate": 0.0001997769020976066, + "loss": 1.9042, + "step": 1414 + }, + { + "epoch": 0.05067416333912296, + "grad_norm": 1.881447672843933, + "learning_rate": 0.0001997761270690941, + "loss": 1.8318, + "step": 1415 + }, + { + "epoch": 0.05070997546869124, + "grad_norm": 2.197857618331909, + "learning_rate": 0.00019977535069822246, + "loss": 1.7475, + "step": 1416 + }, + { + "epoch": 0.05074578759825953, + "grad_norm": 1.581891655921936, + "learning_rate": 0.0001997745729850022, + "loss": 1.7239, + "step": 1417 + }, + { + "epoch": 0.050781599727827816, + "grad_norm": 1.7551162242889404, + "learning_rate": 0.00019977379392944377, + "loss": 1.8272, + "step": 1418 + }, + { + "epoch": 0.0508174118573961, + "grad_norm": 1.9000366926193237, + "learning_rate": 0.00019977301353155764, + "loss": 2.1745, + "step": 1419 + }, + { + "epoch": 0.05085322398696439, + "grad_norm": 1.7322187423706055, + "learning_rate": 0.00019977223179135428, + "loss": 1.9419, + "step": 1420 + }, + { + "epoch": 0.05088903611653267, + "grad_norm": 1.9449477195739746, + "learning_rate": 0.0001997714487088443, + "loss": 1.7893, + "step": 1421 + }, + { + "epoch": 0.050924848246100954, + "grad_norm": 1.3871630430221558, + "learning_rate": 0.0001997706642840381, + "loss": 1.9459, + "step": 1422 + }, + { + "epoch": 0.05096066037566924, + "grad_norm": 1.2880792617797852, + "learning_rate": 0.00019976987851694634, + "loss": 1.7042, + "step": 1423 + }, + { + "epoch": 0.050996472505237526, + "grad_norm": 2.2730605602264404, + "learning_rate": 0.00019976909140757956, + "loss": 1.744, + "step": 1424 + }, + { + "epoch": 0.05103228463480581, + "grad_norm": 2.3292646408081055, + "learning_rate": 0.00019976830295594832, + "loss": 2.1704, + "step": 1425 + }, + { + "epoch": 0.05106809676437409, + "grad_norm": 2.2148680686950684, + "learning_rate": 0.0001997675131620633, + "loss": 1.7412, + "step": 1426 + }, + { + "epoch": 0.051103908893942375, + "grad_norm": 2.116534471511841, + "learning_rate": 0.00019976672202593506, + "loss": 1.7296, + "step": 1427 + }, + { + "epoch": 0.051139721023510665, + "grad_norm": 1.278947114944458, + "learning_rate": 0.00019976592954757427, + "loss": 1.4632, + "step": 1428 + }, + { + "epoch": 0.05117553315307895, + "grad_norm": 1.4900505542755127, + "learning_rate": 0.00019976513572699157, + "loss": 1.8935, + "step": 1429 + }, + { + "epoch": 0.05121134528264723, + "grad_norm": 1.7054023742675781, + "learning_rate": 0.00019976434056419767, + "loss": 1.681, + "step": 1430 + }, + { + "epoch": 0.05124715741221552, + "grad_norm": 2.0350306034088135, + "learning_rate": 0.00019976354405920328, + "loss": 1.8399, + "step": 1431 + }, + { + "epoch": 0.0512829695417838, + "grad_norm": 1.9636918306350708, + "learning_rate": 0.00019976274621201907, + "loss": 2.0888, + "step": 1432 + }, + { + "epoch": 0.051318781671352086, + "grad_norm": 1.9971915483474731, + "learning_rate": 0.00019976194702265578, + "loss": 1.7744, + "step": 1433 + }, + { + "epoch": 0.05135459380092037, + "grad_norm": 2.1631476879119873, + "learning_rate": 0.00019976114649112418, + "loss": 2.0092, + "step": 1434 + }, + { + "epoch": 0.05139040593048866, + "grad_norm": 3.263909339904785, + "learning_rate": 0.00019976034461743504, + "loss": 2.3351, + "step": 1435 + }, + { + "epoch": 0.05142621806005694, + "grad_norm": 1.6741386651992798, + "learning_rate": 0.00019975954140159915, + "loss": 1.7882, + "step": 1436 + }, + { + "epoch": 0.051462030189625224, + "grad_norm": 1.8211408853530884, + "learning_rate": 0.0001997587368436273, + "loss": 1.6995, + "step": 1437 + }, + { + "epoch": 0.051497842319193514, + "grad_norm": 1.880556344985962, + "learning_rate": 0.00019975793094353036, + "loss": 2.0151, + "step": 1438 + }, + { + "epoch": 0.0515336544487618, + "grad_norm": 1.701261281967163, + "learning_rate": 0.0001997571237013191, + "loss": 1.6038, + "step": 1439 + }, + { + "epoch": 0.05156946657833008, + "grad_norm": 1.4975484609603882, + "learning_rate": 0.00019975631511700442, + "loss": 1.5615, + "step": 1440 + }, + { + "epoch": 0.05160527870789836, + "grad_norm": 1.6058564186096191, + "learning_rate": 0.00019975550519059723, + "loss": 2.0746, + "step": 1441 + }, + { + "epoch": 0.05164109083746665, + "grad_norm": 1.5407977104187012, + "learning_rate": 0.00019975469392210834, + "loss": 1.893, + "step": 1442 + }, + { + "epoch": 0.051676902967034935, + "grad_norm": 2.4655606746673584, + "learning_rate": 0.00019975388131154875, + "loss": 2.4228, + "step": 1443 + }, + { + "epoch": 0.05171271509660322, + "grad_norm": 2.1224660873413086, + "learning_rate": 0.00019975306735892936, + "loss": 1.84, + "step": 1444 + }, + { + "epoch": 0.05174852722617151, + "grad_norm": 1.920393943786621, + "learning_rate": 0.00019975225206426113, + "loss": 1.8256, + "step": 1445 + }, + { + "epoch": 0.05178433935573979, + "grad_norm": 1.4037801027297974, + "learning_rate": 0.000199751435427555, + "loss": 1.6021, + "step": 1446 + }, + { + "epoch": 0.05182015148530807, + "grad_norm": 2.118389844894409, + "learning_rate": 0.000199750617448822, + "loss": 1.7848, + "step": 1447 + }, + { + "epoch": 0.051855963614876356, + "grad_norm": 1.7762823104858398, + "learning_rate": 0.0001997497981280731, + "loss": 1.8298, + "step": 1448 + }, + { + "epoch": 0.051891775744444646, + "grad_norm": 2.352095127105713, + "learning_rate": 0.0001997489774653193, + "loss": 1.9828, + "step": 1449 + }, + { + "epoch": 0.05192758787401293, + "grad_norm": 3.4863343238830566, + "learning_rate": 0.00019974815546057172, + "loss": 2.2459, + "step": 1450 + }, + { + "epoch": 0.05196340000358121, + "grad_norm": 2.1996467113494873, + "learning_rate": 0.00019974733211384135, + "loss": 1.7053, + "step": 1451 + }, + { + "epoch": 0.051999212133149494, + "grad_norm": 1.548524022102356, + "learning_rate": 0.0001997465074251393, + "loss": 1.6912, + "step": 1452 + }, + { + "epoch": 0.052035024262717784, + "grad_norm": 1.4116077423095703, + "learning_rate": 0.00019974568139447666, + "loss": 1.6227, + "step": 1453 + }, + { + "epoch": 0.05207083639228607, + "grad_norm": 1.6894891262054443, + "learning_rate": 0.00019974485402186453, + "loss": 1.599, + "step": 1454 + }, + { + "epoch": 0.05210664852185435, + "grad_norm": 2.850682497024536, + "learning_rate": 0.00019974402530731407, + "loss": 1.929, + "step": 1455 + }, + { + "epoch": 0.05214246065142264, + "grad_norm": 2.964691400527954, + "learning_rate": 0.0001997431952508364, + "loss": 1.4999, + "step": 1456 + }, + { + "epoch": 0.05217827278099092, + "grad_norm": 2.3899643421173096, + "learning_rate": 0.00019974236385244268, + "loss": 1.8135, + "step": 1457 + }, + { + "epoch": 0.052214084910559205, + "grad_norm": 3.404496431350708, + "learning_rate": 0.00019974153111214414, + "loss": 2.2774, + "step": 1458 + }, + { + "epoch": 0.05224989704012749, + "grad_norm": 1.3085155487060547, + "learning_rate": 0.00019974069702995194, + "loss": 1.7506, + "step": 1459 + }, + { + "epoch": 0.05228570916969578, + "grad_norm": 1.9195306301116943, + "learning_rate": 0.00019973986160587732, + "loss": 2.0551, + "step": 1460 + }, + { + "epoch": 0.05232152129926406, + "grad_norm": 1.2769794464111328, + "learning_rate": 0.0001997390248399315, + "loss": 1.6659, + "step": 1461 + }, + { + "epoch": 0.052357333428832344, + "grad_norm": 1.5254219770431519, + "learning_rate": 0.00019973818673212578, + "loss": 1.3987, + "step": 1462 + }, + { + "epoch": 0.05239314555840063, + "grad_norm": 1.952865481376648, + "learning_rate": 0.00019973734728247143, + "loss": 1.6659, + "step": 1463 + }, + { + "epoch": 0.052428957687968916, + "grad_norm": 1.9222220182418823, + "learning_rate": 0.0001997365064909797, + "loss": 1.6806, + "step": 1464 + }, + { + "epoch": 0.0524647698175372, + "grad_norm": 1.1028681993484497, + "learning_rate": 0.0001997356643576619, + "loss": 1.8794, + "step": 1465 + }, + { + "epoch": 0.05250058194710548, + "grad_norm": 1.5954684019088745, + "learning_rate": 0.00019973482088252943, + "loss": 1.6585, + "step": 1466 + }, + { + "epoch": 0.05253639407667377, + "grad_norm": 2.3468151092529297, + "learning_rate": 0.00019973397606559354, + "loss": 1.9015, + "step": 1467 + }, + { + "epoch": 0.052572206206242054, + "grad_norm": 1.6654571294784546, + "learning_rate": 0.0001997331299068657, + "loss": 1.9589, + "step": 1468 + }, + { + "epoch": 0.05260801833581034, + "grad_norm": 1.9264411926269531, + "learning_rate": 0.00019973228240635722, + "loss": 1.9634, + "step": 1469 + }, + { + "epoch": 0.05264383046537863, + "grad_norm": 1.5382975339889526, + "learning_rate": 0.00019973143356407952, + "loss": 1.7071, + "step": 1470 + }, + { + "epoch": 0.05267964259494691, + "grad_norm": 1.8750600814819336, + "learning_rate": 0.00019973058338004407, + "loss": 1.8874, + "step": 1471 + }, + { + "epoch": 0.05271545472451519, + "grad_norm": 2.4676332473754883, + "learning_rate": 0.00019972973185426222, + "loss": 2.0747, + "step": 1472 + }, + { + "epoch": 0.052751266854083476, + "grad_norm": 1.5208827257156372, + "learning_rate": 0.0001997288789867455, + "loss": 1.8408, + "step": 1473 + }, + { + "epoch": 0.052787078983651765, + "grad_norm": 2.3117616176605225, + "learning_rate": 0.0001997280247775053, + "loss": 1.6432, + "step": 1474 + }, + { + "epoch": 0.05282289111322005, + "grad_norm": 1.715316653251648, + "learning_rate": 0.0001997271692265532, + "loss": 1.494, + "step": 1475 + }, + { + "epoch": 0.05285870324278833, + "grad_norm": 1.7642710208892822, + "learning_rate": 0.0001997263123339007, + "loss": 1.6121, + "step": 1476 + }, + { + "epoch": 0.052894515372356614, + "grad_norm": 1.6220943927764893, + "learning_rate": 0.00019972545409955927, + "loss": 1.4739, + "step": 1477 + }, + { + "epoch": 0.052930327501924904, + "grad_norm": 1.6621453762054443, + "learning_rate": 0.0001997245945235405, + "loss": 2.0465, + "step": 1478 + }, + { + "epoch": 0.052966139631493186, + "grad_norm": 1.6994413137435913, + "learning_rate": 0.00019972373360585598, + "loss": 1.8201, + "step": 1479 + }, + { + "epoch": 0.05300195176106147, + "grad_norm": 1.6507011651992798, + "learning_rate": 0.0001997228713465172, + "loss": 1.7818, + "step": 1480 + }, + { + "epoch": 0.05303776389062976, + "grad_norm": 2.683297872543335, + "learning_rate": 0.00019972200774553587, + "loss": 2.1151, + "step": 1481 + }, + { + "epoch": 0.05307357602019804, + "grad_norm": 1.821467399597168, + "learning_rate": 0.00019972114280292355, + "loss": 1.759, + "step": 1482 + }, + { + "epoch": 0.053109388149766325, + "grad_norm": 1.6540615558624268, + "learning_rate": 0.00019972027651869186, + "loss": 1.6425, + "step": 1483 + }, + { + "epoch": 0.05314520027933461, + "grad_norm": 1.185801386833191, + "learning_rate": 0.0001997194088928525, + "loss": 1.598, + "step": 1484 + }, + { + "epoch": 0.0531810124089029, + "grad_norm": 1.5758860111236572, + "learning_rate": 0.0001997185399254171, + "loss": 1.8597, + "step": 1485 + }, + { + "epoch": 0.05321682453847118, + "grad_norm": 1.5715807676315308, + "learning_rate": 0.00019971766961639738, + "loss": 1.6007, + "step": 1486 + }, + { + "epoch": 0.05325263666803946, + "grad_norm": 2.0418407917022705, + "learning_rate": 0.00019971679796580504, + "loss": 2.1591, + "step": 1487 + }, + { + "epoch": 0.05328844879760775, + "grad_norm": 1.2468957901000977, + "learning_rate": 0.00019971592497365184, + "loss": 1.6722, + "step": 1488 + }, + { + "epoch": 0.053324260927176036, + "grad_norm": 1.5520515441894531, + "learning_rate": 0.00019971505063994948, + "loss": 1.7877, + "step": 1489 + }, + { + "epoch": 0.05336007305674432, + "grad_norm": 1.990512728691101, + "learning_rate": 0.0001997141749647097, + "loss": 1.8976, + "step": 1490 + }, + { + "epoch": 0.0533958851863126, + "grad_norm": 1.7781422138214111, + "learning_rate": 0.00019971329794794436, + "loss": 2.0116, + "step": 1491 + }, + { + "epoch": 0.05343169731588089, + "grad_norm": 1.6702563762664795, + "learning_rate": 0.0001997124195896652, + "loss": 1.7436, + "step": 1492 + }, + { + "epoch": 0.053467509445449174, + "grad_norm": 2.4951159954071045, + "learning_rate": 0.00019971153988988406, + "loss": 1.7853, + "step": 1493 + }, + { + "epoch": 0.05350332157501746, + "grad_norm": 1.4511957168579102, + "learning_rate": 0.00019971065884861276, + "loss": 1.6299, + "step": 1494 + }, + { + "epoch": 0.05353913370458575, + "grad_norm": 1.593092679977417, + "learning_rate": 0.00019970977646586319, + "loss": 2.1281, + "step": 1495 + }, + { + "epoch": 0.05357494583415403, + "grad_norm": 2.2723426818847656, + "learning_rate": 0.00019970889274164715, + "loss": 1.4845, + "step": 1496 + }, + { + "epoch": 0.05361075796372231, + "grad_norm": 2.1010289192199707, + "learning_rate": 0.00019970800767597663, + "loss": 1.995, + "step": 1497 + }, + { + "epoch": 0.053646570093290595, + "grad_norm": 1.3042644262313843, + "learning_rate": 0.00019970712126886342, + "loss": 2.1141, + "step": 1498 + }, + { + "epoch": 0.053682382222858885, + "grad_norm": 1.9947757720947266, + "learning_rate": 0.00019970623352031952, + "loss": 1.8331, + "step": 1499 + }, + { + "epoch": 0.05371819435242717, + "grad_norm": 2.3290586471557617, + "learning_rate": 0.00019970534443035688, + "loss": 1.6654, + "step": 1500 + }, + { + "epoch": 0.05375400648199545, + "grad_norm": 3.0495433807373047, + "learning_rate": 0.00019970445399898745, + "loss": 1.8413, + "step": 1501 + }, + { + "epoch": 0.05378981861156373, + "grad_norm": 2.522512435913086, + "learning_rate": 0.0001997035622262232, + "loss": 1.8241, + "step": 1502 + }, + { + "epoch": 0.05382563074113202, + "grad_norm": 2.1961100101470947, + "learning_rate": 0.00019970266911207608, + "loss": 1.6946, + "step": 1503 + }, + { + "epoch": 0.053861442870700306, + "grad_norm": 1.8775655031204224, + "learning_rate": 0.00019970177465655818, + "loss": 1.8042, + "step": 1504 + }, + { + "epoch": 0.05389725500026859, + "grad_norm": 3.8398947715759277, + "learning_rate": 0.00019970087885968154, + "loss": 2.0227, + "step": 1505 + }, + { + "epoch": 0.05393306712983688, + "grad_norm": 1.3387746810913086, + "learning_rate": 0.00019969998172145815, + "loss": 1.6747, + "step": 1506 + }, + { + "epoch": 0.05396887925940516, + "grad_norm": 1.517892599105835, + "learning_rate": 0.00019969908324190012, + "loss": 1.8804, + "step": 1507 + }, + { + "epoch": 0.054004691388973444, + "grad_norm": 1.1590577363967896, + "learning_rate": 0.0001996981834210195, + "loss": 1.6391, + "step": 1508 + }, + { + "epoch": 0.05404050351854173, + "grad_norm": 2.3728671073913574, + "learning_rate": 0.00019969728225882846, + "loss": 1.6555, + "step": 1509 + }, + { + "epoch": 0.05407631564811002, + "grad_norm": 1.6440011262893677, + "learning_rate": 0.0001996963797553391, + "loss": 1.7857, + "step": 1510 + }, + { + "epoch": 0.0541121277776783, + "grad_norm": 1.4567910432815552, + "learning_rate": 0.0001996954759105635, + "loss": 1.9391, + "step": 1511 + }, + { + "epoch": 0.05414793990724658, + "grad_norm": 1.946500539779663, + "learning_rate": 0.00019969457072451392, + "loss": 1.7353, + "step": 1512 + }, + { + "epoch": 0.05418375203681487, + "grad_norm": 2.7675740718841553, + "learning_rate": 0.00019969366419720245, + "loss": 2.133, + "step": 1513 + }, + { + "epoch": 0.054219564166383155, + "grad_norm": 2.4415030479431152, + "learning_rate": 0.00019969275632864133, + "loss": 1.9024, + "step": 1514 + }, + { + "epoch": 0.05425537629595144, + "grad_norm": 1.4757903814315796, + "learning_rate": 0.0001996918471188428, + "loss": 1.7258, + "step": 1515 + }, + { + "epoch": 0.05429118842551972, + "grad_norm": 1.7123427391052246, + "learning_rate": 0.00019969093656781902, + "loss": 1.7771, + "step": 1516 + }, + { + "epoch": 0.05432700055508801, + "grad_norm": 1.3325016498565674, + "learning_rate": 0.00019969002467558228, + "loss": 1.7978, + "step": 1517 + }, + { + "epoch": 0.05436281268465629, + "grad_norm": 1.9106487035751343, + "learning_rate": 0.00019968911144214486, + "loss": 1.8826, + "step": 1518 + }, + { + "epoch": 0.054398624814224576, + "grad_norm": 2.213944435119629, + "learning_rate": 0.00019968819686751906, + "loss": 1.9959, + "step": 1519 + }, + { + "epoch": 0.054434436943792866, + "grad_norm": 2.123725175857544, + "learning_rate": 0.00019968728095171715, + "loss": 2.1413, + "step": 1520 + }, + { + "epoch": 0.05447024907336115, + "grad_norm": 1.2367082834243774, + "learning_rate": 0.00019968636369475142, + "loss": 1.6893, + "step": 1521 + }, + { + "epoch": 0.05450606120292943, + "grad_norm": 1.6155388355255127, + "learning_rate": 0.00019968544509663428, + "loss": 1.9238, + "step": 1522 + }, + { + "epoch": 0.054541873332497715, + "grad_norm": 2.1532177925109863, + "learning_rate": 0.00019968452515737805, + "loss": 1.8312, + "step": 1523 + }, + { + "epoch": 0.054577685462066004, + "grad_norm": 2.00732421875, + "learning_rate": 0.00019968360387699513, + "loss": 1.8616, + "step": 1524 + }, + { + "epoch": 0.05461349759163429, + "grad_norm": 2.6617205142974854, + "learning_rate": 0.00019968268125549794, + "loss": 1.9007, + "step": 1525 + }, + { + "epoch": 0.05464930972120257, + "grad_norm": 1.5837981700897217, + "learning_rate": 0.0001996817572928988, + "loss": 1.8432, + "step": 1526 + }, + { + "epoch": 0.05468512185077085, + "grad_norm": 1.387168526649475, + "learning_rate": 0.0001996808319892102, + "loss": 1.7933, + "step": 1527 + }, + { + "epoch": 0.05472093398033914, + "grad_norm": 1.664420247077942, + "learning_rate": 0.00019967990534444462, + "loss": 1.8014, + "step": 1528 + }, + { + "epoch": 0.054756746109907425, + "grad_norm": 1.184078574180603, + "learning_rate": 0.00019967897735861446, + "loss": 1.451, + "step": 1529 + }, + { + "epoch": 0.05479255823947571, + "grad_norm": 2.2369894981384277, + "learning_rate": 0.00019967804803173227, + "loss": 1.5848, + "step": 1530 + }, + { + "epoch": 0.054828370369044, + "grad_norm": 1.4040497541427612, + "learning_rate": 0.00019967711736381048, + "loss": 1.8061, + "step": 1531 + }, + { + "epoch": 0.05486418249861228, + "grad_norm": 2.491568088531494, + "learning_rate": 0.00019967618535486164, + "loss": 1.9725, + "step": 1532 + }, + { + "epoch": 0.054899994628180564, + "grad_norm": 1.9477348327636719, + "learning_rate": 0.00019967525200489833, + "loss": 1.6782, + "step": 1533 + }, + { + "epoch": 0.054935806757748847, + "grad_norm": 1.5153226852416992, + "learning_rate": 0.00019967431731393308, + "loss": 1.8764, + "step": 1534 + }, + { + "epoch": 0.054971618887317136, + "grad_norm": 2.029557704925537, + "learning_rate": 0.00019967338128197847, + "loss": 1.8136, + "step": 1535 + }, + { + "epoch": 0.05500743101688542, + "grad_norm": 2.0749216079711914, + "learning_rate": 0.00019967244390904708, + "loss": 1.9191, + "step": 1536 + }, + { + "epoch": 0.0550432431464537, + "grad_norm": 1.8724786043167114, + "learning_rate": 0.0001996715051951515, + "loss": 1.7789, + "step": 1537 + }, + { + "epoch": 0.05507905527602199, + "grad_norm": 1.8553539514541626, + "learning_rate": 0.0001996705651403044, + "loss": 1.9742, + "step": 1538 + }, + { + "epoch": 0.055114867405590275, + "grad_norm": 1.4529833793640137, + "learning_rate": 0.0001996696237445184, + "loss": 1.5149, + "step": 1539 + }, + { + "epoch": 0.05515067953515856, + "grad_norm": 1.6312896013259888, + "learning_rate": 0.0001996686810078062, + "loss": 1.8811, + "step": 1540 + }, + { + "epoch": 0.05518649166472684, + "grad_norm": 2.4548962116241455, + "learning_rate": 0.00019966773693018045, + "loss": 1.9498, + "step": 1541 + }, + { + "epoch": 0.05522230379429513, + "grad_norm": 1.8022762537002563, + "learning_rate": 0.00019966679151165384, + "loss": 1.9692, + "step": 1542 + }, + { + "epoch": 0.05525811592386341, + "grad_norm": 2.094322919845581, + "learning_rate": 0.00019966584475223913, + "loss": 1.7657, + "step": 1543 + }, + { + "epoch": 0.055293928053431696, + "grad_norm": 1.8352880477905273, + "learning_rate": 0.00019966489665194904, + "loss": 1.7019, + "step": 1544 + }, + { + "epoch": 0.055329740182999985, + "grad_norm": 3.184751033782959, + "learning_rate": 0.0001996639472107963, + "loss": 2.2141, + "step": 1545 + }, + { + "epoch": 0.05536555231256827, + "grad_norm": 1.6981626749038696, + "learning_rate": 0.00019966299642879375, + "loss": 1.7203, + "step": 1546 + }, + { + "epoch": 0.05540136444213655, + "grad_norm": 1.5179368257522583, + "learning_rate": 0.00019966204430595412, + "loss": 1.907, + "step": 1547 + }, + { + "epoch": 0.055437176571704834, + "grad_norm": 3.7298059463500977, + "learning_rate": 0.00019966109084229024, + "loss": 1.9523, + "step": 1548 + }, + { + "epoch": 0.055472988701273124, + "grad_norm": 2.229259729385376, + "learning_rate": 0.00019966013603781493, + "loss": 1.9006, + "step": 1549 + }, + { + "epoch": 0.05550880083084141, + "grad_norm": 1.454759120941162, + "learning_rate": 0.00019965917989254103, + "loss": 1.7634, + "step": 1550 + }, + { + "epoch": 0.05554461296040969, + "grad_norm": 1.9848576784133911, + "learning_rate": 0.00019965822240648143, + "loss": 2.0198, + "step": 1551 + }, + { + "epoch": 0.05558042508997797, + "grad_norm": 1.571907877922058, + "learning_rate": 0.00019965726357964902, + "loss": 1.6818, + "step": 1552 + }, + { + "epoch": 0.05561623721954626, + "grad_norm": 2.924750328063965, + "learning_rate": 0.00019965630341205664, + "loss": 1.8103, + "step": 1553 + }, + { + "epoch": 0.055652049349114545, + "grad_norm": 1.6184571981430054, + "learning_rate": 0.00019965534190371725, + "loss": 1.8719, + "step": 1554 + }, + { + "epoch": 0.05568786147868283, + "grad_norm": 1.6031067371368408, + "learning_rate": 0.0001996543790546438, + "loss": 2.0676, + "step": 1555 + }, + { + "epoch": 0.05572367360825112, + "grad_norm": 2.796915054321289, + "learning_rate": 0.00019965341486484923, + "loss": 1.7383, + "step": 1556 + }, + { + "epoch": 0.0557594857378194, + "grad_norm": 3.0277481079101562, + "learning_rate": 0.00019965244933434648, + "loss": 1.8378, + "step": 1557 + }, + { + "epoch": 0.05579529786738768, + "grad_norm": 1.5920354127883911, + "learning_rate": 0.00019965148246314858, + "loss": 1.9057, + "step": 1558 + }, + { + "epoch": 0.055831109996955966, + "grad_norm": 2.056974172592163, + "learning_rate": 0.00019965051425126852, + "loss": 1.6761, + "step": 1559 + }, + { + "epoch": 0.055866922126524256, + "grad_norm": 1.310811161994934, + "learning_rate": 0.00019964954469871936, + "loss": 1.5442, + "step": 1560 + }, + { + "epoch": 0.05590273425609254, + "grad_norm": 2.0601959228515625, + "learning_rate": 0.0001996485738055141, + "loss": 1.8756, + "step": 1561 + }, + { + "epoch": 0.05593854638566082, + "grad_norm": 1.5039955377578735, + "learning_rate": 0.00019964760157166578, + "loss": 1.9241, + "step": 1562 + }, + { + "epoch": 0.05597435851522911, + "grad_norm": 2.9783098697662354, + "learning_rate": 0.00019964662799718753, + "loss": 1.995, + "step": 1563 + }, + { + "epoch": 0.056010170644797394, + "grad_norm": 2.375992774963379, + "learning_rate": 0.00019964565308209248, + "loss": 2.1893, + "step": 1564 + }, + { + "epoch": 0.05604598277436568, + "grad_norm": 1.5130183696746826, + "learning_rate": 0.00019964467682639364, + "loss": 1.5716, + "step": 1565 + }, + { + "epoch": 0.05608179490393396, + "grad_norm": 1.9226343631744385, + "learning_rate": 0.00019964369923010424, + "loss": 1.9059, + "step": 1566 + }, + { + "epoch": 0.05611760703350225, + "grad_norm": 3.1932523250579834, + "learning_rate": 0.00019964272029323742, + "loss": 2.0947, + "step": 1567 + }, + { + "epoch": 0.05615341916307053, + "grad_norm": 1.9467480182647705, + "learning_rate": 0.00019964174001580628, + "loss": 2.0824, + "step": 1568 + }, + { + "epoch": 0.056189231292638815, + "grad_norm": 1.4615256786346436, + "learning_rate": 0.00019964075839782407, + "loss": 1.7063, + "step": 1569 + }, + { + "epoch": 0.056225043422207105, + "grad_norm": 2.083310604095459, + "learning_rate": 0.000199639775439304, + "loss": 2.1004, + "step": 1570 + }, + { + "epoch": 0.05626085555177539, + "grad_norm": 2.23530650138855, + "learning_rate": 0.00019963879114025926, + "loss": 1.7032, + "step": 1571 + }, + { + "epoch": 0.05629666768134367, + "grad_norm": 1.425076961517334, + "learning_rate": 0.00019963780550070315, + "loss": 1.7797, + "step": 1572 + }, + { + "epoch": 0.05633247981091195, + "grad_norm": 2.6845715045928955, + "learning_rate": 0.00019963681852064883, + "loss": 1.8535, + "step": 1573 + }, + { + "epoch": 0.05636829194048024, + "grad_norm": 2.078648567199707, + "learning_rate": 0.0001996358302001097, + "loss": 1.6292, + "step": 1574 + }, + { + "epoch": 0.056404104070048526, + "grad_norm": 2.747649908065796, + "learning_rate": 0.00019963484053909896, + "loss": 1.877, + "step": 1575 + }, + { + "epoch": 0.05643991619961681, + "grad_norm": 2.103194236755371, + "learning_rate": 0.00019963384953762995, + "loss": 1.8571, + "step": 1576 + }, + { + "epoch": 0.05647572832918509, + "grad_norm": 1.530626654624939, + "learning_rate": 0.00019963285719571604, + "loss": 1.5274, + "step": 1577 + }, + { + "epoch": 0.05651154045875338, + "grad_norm": 1.7835010290145874, + "learning_rate": 0.00019963186351337054, + "loss": 1.678, + "step": 1578 + }, + { + "epoch": 0.056547352588321664, + "grad_norm": 1.6494067907333374, + "learning_rate": 0.00019963086849060684, + "loss": 1.6927, + "step": 1579 + }, + { + "epoch": 0.05658316471788995, + "grad_norm": 2.0887563228607178, + "learning_rate": 0.0001996298721274383, + "loss": 1.8169, + "step": 1580 + }, + { + "epoch": 0.05661897684745824, + "grad_norm": 1.8033913373947144, + "learning_rate": 0.00019962887442387834, + "loss": 1.7722, + "step": 1581 + }, + { + "epoch": 0.05665478897702652, + "grad_norm": 2.5214874744415283, + "learning_rate": 0.0001996278753799404, + "loss": 1.7613, + "step": 1582 + }, + { + "epoch": 0.0566906011065948, + "grad_norm": 1.961786150932312, + "learning_rate": 0.00019962687499563793, + "loss": 1.8367, + "step": 1583 + }, + { + "epoch": 0.056726413236163085, + "grad_norm": 2.0807454586029053, + "learning_rate": 0.00019962587327098435, + "loss": 1.7686, + "step": 1584 + }, + { + "epoch": 0.056762225365731375, + "grad_norm": 1.7898211479187012, + "learning_rate": 0.00019962487020599315, + "loss": 1.2548, + "step": 1585 + }, + { + "epoch": 0.05679803749529966, + "grad_norm": 1.7042467594146729, + "learning_rate": 0.00019962386580067782, + "loss": 1.6725, + "step": 1586 + }, + { + "epoch": 0.05683384962486794, + "grad_norm": 1.3166635036468506, + "learning_rate": 0.00019962286005505188, + "loss": 1.9565, + "step": 1587 + }, + { + "epoch": 0.05686966175443623, + "grad_norm": 1.8134676218032837, + "learning_rate": 0.00019962185296912887, + "loss": 1.7871, + "step": 1588 + }, + { + "epoch": 0.056905473884004514, + "grad_norm": 1.3248748779296875, + "learning_rate": 0.00019962084454292235, + "loss": 1.7441, + "step": 1589 + }, + { + "epoch": 0.056941286013572796, + "grad_norm": 2.7689707279205322, + "learning_rate": 0.00019961983477644583, + "loss": 1.7567, + "step": 1590 + }, + { + "epoch": 0.05697709814314108, + "grad_norm": 1.4286848306655884, + "learning_rate": 0.00019961882366971296, + "loss": 1.5907, + "step": 1591 + }, + { + "epoch": 0.05701291027270937, + "grad_norm": 1.531070590019226, + "learning_rate": 0.00019961781122273734, + "loss": 1.8205, + "step": 1592 + }, + { + "epoch": 0.05704872240227765, + "grad_norm": 3.7116222381591797, + "learning_rate": 0.00019961679743553252, + "loss": 1.7679, + "step": 1593 + }, + { + "epoch": 0.057084534531845935, + "grad_norm": 1.6121838092803955, + "learning_rate": 0.0001996157823081122, + "loss": 1.728, + "step": 1594 + }, + { + "epoch": 0.057120346661414224, + "grad_norm": 1.6923259496688843, + "learning_rate": 0.00019961476584049004, + "loss": 1.5902, + "step": 1595 + }, + { + "epoch": 0.05715615879098251, + "grad_norm": 1.7286845445632935, + "learning_rate": 0.00019961374803267968, + "loss": 1.8698, + "step": 1596 + }, + { + "epoch": 0.05719197092055079, + "grad_norm": 1.6534048318862915, + "learning_rate": 0.00019961272888469484, + "loss": 1.8303, + "step": 1597 + }, + { + "epoch": 0.05722778305011907, + "grad_norm": 2.573026180267334, + "learning_rate": 0.00019961170839654922, + "loss": 1.8427, + "step": 1598 + }, + { + "epoch": 0.05726359517968736, + "grad_norm": 1.91075599193573, + "learning_rate": 0.00019961068656825656, + "loss": 1.7374, + "step": 1599 + }, + { + "epoch": 0.057299407309255646, + "grad_norm": 1.761756181716919, + "learning_rate": 0.0001996096633998306, + "loss": 1.9732, + "step": 1600 + }, + { + "epoch": 0.05733521943882393, + "grad_norm": 1.8080767393112183, + "learning_rate": 0.0001996086388912851, + "loss": 1.5933, + "step": 1601 + }, + { + "epoch": 0.05737103156839221, + "grad_norm": 2.0051701068878174, + "learning_rate": 0.00019960761304263386, + "loss": 1.8956, + "step": 1602 + }, + { + "epoch": 0.0574068436979605, + "grad_norm": 2.3873651027679443, + "learning_rate": 0.0001996065858538907, + "loss": 1.7061, + "step": 1603 + }, + { + "epoch": 0.057442655827528784, + "grad_norm": 1.7301465272903442, + "learning_rate": 0.00019960555732506937, + "loss": 1.5289, + "step": 1604 + }, + { + "epoch": 0.05747846795709707, + "grad_norm": 1.9653252363204956, + "learning_rate": 0.00019960452745618375, + "loss": 1.6136, + "step": 1605 + }, + { + "epoch": 0.057514280086665356, + "grad_norm": 1.3157404661178589, + "learning_rate": 0.0001996034962472477, + "loss": 1.833, + "step": 1606 + }, + { + "epoch": 0.05755009221623364, + "grad_norm": 1.559603214263916, + "learning_rate": 0.0001996024636982751, + "loss": 1.6462, + "step": 1607 + }, + { + "epoch": 0.05758590434580192, + "grad_norm": 1.8252371549606323, + "learning_rate": 0.0001996014298092798, + "loss": 1.6637, + "step": 1608 + }, + { + "epoch": 0.057621716475370205, + "grad_norm": 1.6523933410644531, + "learning_rate": 0.00019960039458027576, + "loss": 1.6047, + "step": 1609 + }, + { + "epoch": 0.057657528604938495, + "grad_norm": 1.5484936237335205, + "learning_rate": 0.00019959935801127686, + "loss": 1.7067, + "step": 1610 + }, + { + "epoch": 0.05769334073450678, + "grad_norm": 2.451167583465576, + "learning_rate": 0.00019959832010229712, + "loss": 1.419, + "step": 1611 + }, + { + "epoch": 0.05772915286407506, + "grad_norm": 1.6821362972259521, + "learning_rate": 0.0001995972808533504, + "loss": 2.039, + "step": 1612 + }, + { + "epoch": 0.05776496499364335, + "grad_norm": 1.6939760446548462, + "learning_rate": 0.00019959624026445077, + "loss": 1.6131, + "step": 1613 + }, + { + "epoch": 0.05780077712321163, + "grad_norm": 1.5620489120483398, + "learning_rate": 0.0001995951983356122, + "loss": 1.6838, + "step": 1614 + }, + { + "epoch": 0.057836589252779916, + "grad_norm": 2.6445093154907227, + "learning_rate": 0.0001995941550668487, + "loss": 1.8336, + "step": 1615 + }, + { + "epoch": 0.0578724013823482, + "grad_norm": 1.760237693786621, + "learning_rate": 0.00019959311045817432, + "loss": 1.6198, + "step": 1616 + }, + { + "epoch": 0.05790821351191649, + "grad_norm": 1.3533624410629272, + "learning_rate": 0.00019959206450960307, + "loss": 1.7068, + "step": 1617 + }, + { + "epoch": 0.05794402564148477, + "grad_norm": 2.5859081745147705, + "learning_rate": 0.0001995910172211491, + "loss": 1.9612, + "step": 1618 + }, + { + "epoch": 0.057979837771053054, + "grad_norm": 1.4971524477005005, + "learning_rate": 0.0001995899685928264, + "loss": 1.7951, + "step": 1619 + }, + { + "epoch": 0.058015649900621344, + "grad_norm": 1.8958418369293213, + "learning_rate": 0.0001995889186246492, + "loss": 1.5899, + "step": 1620 + }, + { + "epoch": 0.05805146203018963, + "grad_norm": 2.1714656352996826, + "learning_rate": 0.0001995878673166315, + "loss": 1.5648, + "step": 1621 + }, + { + "epoch": 0.05808727415975791, + "grad_norm": 2.7349958419799805, + "learning_rate": 0.00019958681466878756, + "loss": 1.9454, + "step": 1622 + }, + { + "epoch": 0.05812308628932619, + "grad_norm": 2.072175979614258, + "learning_rate": 0.00019958576068113145, + "loss": 1.5197, + "step": 1623 + }, + { + "epoch": 0.05815889841889448, + "grad_norm": 2.4490714073181152, + "learning_rate": 0.00019958470535367742, + "loss": 1.5733, + "step": 1624 + }, + { + "epoch": 0.058194710548462765, + "grad_norm": 1.668999195098877, + "learning_rate": 0.00019958364868643958, + "loss": 1.5685, + "step": 1625 + }, + { + "epoch": 0.05823052267803105, + "grad_norm": 1.6361305713653564, + "learning_rate": 0.00019958259067943225, + "loss": 1.9354, + "step": 1626 + }, + { + "epoch": 0.05826633480759933, + "grad_norm": 1.7708817720413208, + "learning_rate": 0.0001995815313326696, + "loss": 1.6394, + "step": 1627 + }, + { + "epoch": 0.05830214693716762, + "grad_norm": 1.748575210571289, + "learning_rate": 0.0001995804706461659, + "loss": 1.8433, + "step": 1628 + }, + { + "epoch": 0.0583379590667359, + "grad_norm": 1.4690053462982178, + "learning_rate": 0.0001995794086199354, + "loss": 1.5964, + "step": 1629 + }, + { + "epoch": 0.058373771196304186, + "grad_norm": 1.3967971801757812, + "learning_rate": 0.00019957834525399242, + "loss": 1.657, + "step": 1630 + }, + { + "epoch": 0.058409583325872476, + "grad_norm": 1.4848518371582031, + "learning_rate": 0.00019957728054835125, + "loss": 1.8209, + "step": 1631 + }, + { + "epoch": 0.05844539545544076, + "grad_norm": 2.1335513591766357, + "learning_rate": 0.0001995762145030262, + "loss": 1.7809, + "step": 1632 + }, + { + "epoch": 0.05848120758500904, + "grad_norm": 3.323810577392578, + "learning_rate": 0.00019957514711803164, + "loss": 1.4154, + "step": 1633 + }, + { + "epoch": 0.058517019714577324, + "grad_norm": 1.775437593460083, + "learning_rate": 0.00019957407839338193, + "loss": 1.6845, + "step": 1634 + }, + { + "epoch": 0.058552831844145614, + "grad_norm": 2.64595365524292, + "learning_rate": 0.00019957300832909144, + "loss": 1.5673, + "step": 1635 + }, + { + "epoch": 0.0585886439737139, + "grad_norm": 1.9590623378753662, + "learning_rate": 0.00019957193692517455, + "loss": 1.7878, + "step": 1636 + }, + { + "epoch": 0.05862445610328218, + "grad_norm": 1.5874866247177124, + "learning_rate": 0.00019957086418164567, + "loss": 1.5769, + "step": 1637 + }, + { + "epoch": 0.05866026823285047, + "grad_norm": 2.0571467876434326, + "learning_rate": 0.00019956979009851927, + "loss": 1.5945, + "step": 1638 + }, + { + "epoch": 0.05869608036241875, + "grad_norm": 1.961224913597107, + "learning_rate": 0.00019956871467580978, + "loss": 1.8875, + "step": 1639 + }, + { + "epoch": 0.058731892491987035, + "grad_norm": 1.3154363632202148, + "learning_rate": 0.00019956763791353165, + "loss": 1.4418, + "step": 1640 + }, + { + "epoch": 0.05876770462155532, + "grad_norm": 1.3892945051193237, + "learning_rate": 0.00019956655981169942, + "loss": 1.7232, + "step": 1641 + }, + { + "epoch": 0.05880351675112361, + "grad_norm": 1.9630827903747559, + "learning_rate": 0.00019956548037032752, + "loss": 1.8138, + "step": 1642 + }, + { + "epoch": 0.05883932888069189, + "grad_norm": 1.364547848701477, + "learning_rate": 0.00019956439958943053, + "loss": 1.8376, + "step": 1643 + }, + { + "epoch": 0.058875141010260174, + "grad_norm": 1.8685306310653687, + "learning_rate": 0.00019956331746902298, + "loss": 1.8356, + "step": 1644 + }, + { + "epoch": 0.05891095313982846, + "grad_norm": 2.003082036972046, + "learning_rate": 0.00019956223400911943, + "loss": 1.7245, + "step": 1645 + }, + { + "epoch": 0.058946765269396746, + "grad_norm": 2.334925651550293, + "learning_rate": 0.00019956114920973442, + "loss": 1.9533, + "step": 1646 + }, + { + "epoch": 0.05898257739896503, + "grad_norm": 2.2185535430908203, + "learning_rate": 0.00019956006307088258, + "loss": 1.6938, + "step": 1647 + }, + { + "epoch": 0.05901838952853331, + "grad_norm": 1.9847118854522705, + "learning_rate": 0.00019955897559257853, + "loss": 1.7853, + "step": 1648 + }, + { + "epoch": 0.0590542016581016, + "grad_norm": 2.876863956451416, + "learning_rate": 0.00019955788677483686, + "loss": 1.6786, + "step": 1649 + }, + { + "epoch": 0.059090013787669884, + "grad_norm": 1.6888865232467651, + "learning_rate": 0.00019955679661767226, + "loss": 1.4412, + "step": 1650 + }, + { + "epoch": 0.05912582591723817, + "grad_norm": 2.452845335006714, + "learning_rate": 0.0001995557051210994, + "loss": 1.7187, + "step": 1651 + }, + { + "epoch": 0.05916163804680645, + "grad_norm": 1.8312182426452637, + "learning_rate": 0.0001995546122851329, + "loss": 1.9031, + "step": 1652 + }, + { + "epoch": 0.05919745017637474, + "grad_norm": 1.6904535293579102, + "learning_rate": 0.00019955351810978754, + "loss": 1.9099, + "step": 1653 + }, + { + "epoch": 0.05923326230594302, + "grad_norm": 1.750017762184143, + "learning_rate": 0.000199552422595078, + "loss": 1.8949, + "step": 1654 + }, + { + "epoch": 0.059269074435511306, + "grad_norm": 1.7873555421829224, + "learning_rate": 0.000199551325741019, + "loss": 1.6408, + "step": 1655 + }, + { + "epoch": 0.059304886565079595, + "grad_norm": 2.2488410472869873, + "learning_rate": 0.00019955022754762535, + "loss": 2.0474, + "step": 1656 + }, + { + "epoch": 0.05934069869464788, + "grad_norm": 1.8136587142944336, + "learning_rate": 0.0001995491280149118, + "loss": 1.9466, + "step": 1657 + }, + { + "epoch": 0.05937651082421616, + "grad_norm": 1.3177629709243774, + "learning_rate": 0.00019954802714289315, + "loss": 1.7, + "step": 1658 + }, + { + "epoch": 0.059412322953784444, + "grad_norm": 1.3142410516738892, + "learning_rate": 0.0001995469249315842, + "loss": 1.6965, + "step": 1659 + }, + { + "epoch": 0.059448135083352734, + "grad_norm": 1.6144740581512451, + "learning_rate": 0.00019954582138099978, + "loss": 1.8903, + "step": 1660 + }, + { + "epoch": 0.059483947212921016, + "grad_norm": 1.29384446144104, + "learning_rate": 0.00019954471649115475, + "loss": 1.614, + "step": 1661 + }, + { + "epoch": 0.0595197593424893, + "grad_norm": 1.294191598892212, + "learning_rate": 0.00019954361026206394, + "loss": 1.5788, + "step": 1662 + }, + { + "epoch": 0.05955557147205759, + "grad_norm": 1.5210322141647339, + "learning_rate": 0.00019954250269374227, + "loss": 1.5188, + "step": 1663 + }, + { + "epoch": 0.05959138360162587, + "grad_norm": 2.4071707725524902, + "learning_rate": 0.0001995413937862046, + "loss": 1.7177, + "step": 1664 + }, + { + "epoch": 0.059627195731194155, + "grad_norm": 1.5951025485992432, + "learning_rate": 0.0001995402835394659, + "loss": 1.5914, + "step": 1665 + }, + { + "epoch": 0.05966300786076244, + "grad_norm": 2.301365852355957, + "learning_rate": 0.00019953917195354105, + "loss": 1.6783, + "step": 1666 + }, + { + "epoch": 0.05969881999033073, + "grad_norm": 1.3707059621810913, + "learning_rate": 0.00019953805902844508, + "loss": 1.9201, + "step": 1667 + }, + { + "epoch": 0.05973463211989901, + "grad_norm": 1.7182759046554565, + "learning_rate": 0.00019953694476419293, + "loss": 1.8426, + "step": 1668 + }, + { + "epoch": 0.05977044424946729, + "grad_norm": 1.4055579900741577, + "learning_rate": 0.00019953582916079957, + "loss": 1.4242, + "step": 1669 + }, + { + "epoch": 0.05980625637903558, + "grad_norm": 1.643748164176941, + "learning_rate": 0.00019953471221827998, + "loss": 1.7121, + "step": 1670 + }, + { + "epoch": 0.059842068508603866, + "grad_norm": 1.5763031244277954, + "learning_rate": 0.00019953359393664927, + "loss": 1.564, + "step": 1671 + }, + { + "epoch": 0.05987788063817215, + "grad_norm": 1.790825605392456, + "learning_rate": 0.0001995324743159224, + "loss": 1.5434, + "step": 1672 + }, + { + "epoch": 0.05991369276774043, + "grad_norm": 2.168774127960205, + "learning_rate": 0.00019953135335611452, + "loss": 1.6468, + "step": 1673 + }, + { + "epoch": 0.05994950489730872, + "grad_norm": 1.6681972742080688, + "learning_rate": 0.00019953023105724068, + "loss": 1.9108, + "step": 1674 + }, + { + "epoch": 0.059985317026877004, + "grad_norm": 1.9940334558486938, + "learning_rate": 0.00019952910741931592, + "loss": 1.5585, + "step": 1675 + }, + { + "epoch": 0.06002112915644529, + "grad_norm": 2.587232828140259, + "learning_rate": 0.0001995279824423554, + "loss": 1.8064, + "step": 1676 + }, + { + "epoch": 0.06005694128601357, + "grad_norm": 1.9780170917510986, + "learning_rate": 0.0001995268561263743, + "loss": 1.8043, + "step": 1677 + }, + { + "epoch": 0.06009275341558186, + "grad_norm": 1.620010256767273, + "learning_rate": 0.00019952572847138772, + "loss": 1.865, + "step": 1678 + }, + { + "epoch": 0.06012856554515014, + "grad_norm": 2.4111506938934326, + "learning_rate": 0.00019952459947741082, + "loss": 1.9095, + "step": 1679 + }, + { + "epoch": 0.060164377674718425, + "grad_norm": 2.624497652053833, + "learning_rate": 0.00019952346914445883, + "loss": 1.902, + "step": 1680 + }, + { + "epoch": 0.060200189804286715, + "grad_norm": 1.8209869861602783, + "learning_rate": 0.00019952233747254691, + "loss": 1.7628, + "step": 1681 + }, + { + "epoch": 0.060236001933855, + "grad_norm": 1.5332977771759033, + "learning_rate": 0.00019952120446169037, + "loss": 1.6418, + "step": 1682 + }, + { + "epoch": 0.06027181406342328, + "grad_norm": 1.8602702617645264, + "learning_rate": 0.00019952007011190433, + "loss": 1.918, + "step": 1683 + }, + { + "epoch": 0.06030762619299156, + "grad_norm": 1.2004653215408325, + "learning_rate": 0.00019951893442320416, + "loss": 1.5955, + "step": 1684 + }, + { + "epoch": 0.06034343832255985, + "grad_norm": 1.2076934576034546, + "learning_rate": 0.0001995177973956051, + "loss": 1.7053, + "step": 1685 + }, + { + "epoch": 0.060379250452128136, + "grad_norm": 2.4891135692596436, + "learning_rate": 0.00019951665902912243, + "loss": 1.902, + "step": 1686 + }, + { + "epoch": 0.06041506258169642, + "grad_norm": 1.7342336177825928, + "learning_rate": 0.00019951551932377148, + "loss": 1.7217, + "step": 1687 + }, + { + "epoch": 0.06045087471126471, + "grad_norm": 2.313361406326294, + "learning_rate": 0.00019951437827956758, + "loss": 1.5658, + "step": 1688 + }, + { + "epoch": 0.06048668684083299, + "grad_norm": 1.3781776428222656, + "learning_rate": 0.0001995132358965261, + "loss": 1.5952, + "step": 1689 + }, + { + "epoch": 0.060522498970401274, + "grad_norm": 2.3602588176727295, + "learning_rate": 0.00019951209217466238, + "loss": 1.8622, + "step": 1690 + }, + { + "epoch": 0.06055831109996956, + "grad_norm": 2.0375075340270996, + "learning_rate": 0.0001995109471139918, + "loss": 1.6982, + "step": 1691 + }, + { + "epoch": 0.06059412322953785, + "grad_norm": 1.8000264167785645, + "learning_rate": 0.0001995098007145298, + "loss": 1.8062, + "step": 1692 + }, + { + "epoch": 0.06062993535910613, + "grad_norm": 1.7053439617156982, + "learning_rate": 0.00019950865297629184, + "loss": 1.789, + "step": 1693 + }, + { + "epoch": 0.06066574748867441, + "grad_norm": 3.238233804702759, + "learning_rate": 0.00019950750389929328, + "loss": 2.2441, + "step": 1694 + }, + { + "epoch": 0.0607015596182427, + "grad_norm": 1.5956525802612305, + "learning_rate": 0.0001995063534835496, + "loss": 1.8782, + "step": 1695 + }, + { + "epoch": 0.060737371747810985, + "grad_norm": 2.0592153072357178, + "learning_rate": 0.0001995052017290763, + "loss": 1.846, + "step": 1696 + }, + { + "epoch": 0.06077318387737927, + "grad_norm": 1.7917789220809937, + "learning_rate": 0.00019950404863588883, + "loss": 1.7443, + "step": 1697 + }, + { + "epoch": 0.06080899600694755, + "grad_norm": 1.8193100690841675, + "learning_rate": 0.00019950289420400278, + "loss": 2.2193, + "step": 1698 + }, + { + "epoch": 0.06084480813651584, + "grad_norm": 1.6950918436050415, + "learning_rate": 0.00019950173843343364, + "loss": 1.9775, + "step": 1699 + }, + { + "epoch": 0.06088062026608412, + "grad_norm": 1.3480393886566162, + "learning_rate": 0.00019950058132419692, + "loss": 1.5773, + "step": 1700 + }, + { + "epoch": 0.060916432395652406, + "grad_norm": 2.6421098709106445, + "learning_rate": 0.00019949942287630825, + "loss": 1.8098, + "step": 1701 + }, + { + "epoch": 0.06095224452522069, + "grad_norm": 1.7833307981491089, + "learning_rate": 0.00019949826308978316, + "loss": 1.758, + "step": 1702 + }, + { + "epoch": 0.06098805665478898, + "grad_norm": 2.0496833324432373, + "learning_rate": 0.0001994971019646373, + "loss": 1.7918, + "step": 1703 + }, + { + "epoch": 0.06102386878435726, + "grad_norm": 2.668501377105713, + "learning_rate": 0.0001994959395008863, + "loss": 1.8106, + "step": 1704 + }, + { + "epoch": 0.061059680913925544, + "grad_norm": 1.7229629755020142, + "learning_rate": 0.00019949477569854575, + "loss": 1.7763, + "step": 1705 + }, + { + "epoch": 0.061095493043493834, + "grad_norm": 2.5283937454223633, + "learning_rate": 0.00019949361055763133, + "loss": 1.9133, + "step": 1706 + }, + { + "epoch": 0.06113130517306212, + "grad_norm": 2.3446457386016846, + "learning_rate": 0.00019949244407815875, + "loss": 1.5309, + "step": 1707 + }, + { + "epoch": 0.0611671173026304, + "grad_norm": 1.1809579133987427, + "learning_rate": 0.00019949127626014363, + "loss": 1.4884, + "step": 1708 + }, + { + "epoch": 0.06120292943219868, + "grad_norm": 1.2034236192703247, + "learning_rate": 0.00019949010710360173, + "loss": 1.5259, + "step": 1709 + }, + { + "epoch": 0.06123874156176697, + "grad_norm": 2.4860334396362305, + "learning_rate": 0.0001994889366085488, + "loss": 1.9134, + "step": 1710 + }, + { + "epoch": 0.061274553691335255, + "grad_norm": 1.7052736282348633, + "learning_rate": 0.00019948776477500053, + "loss": 1.8427, + "step": 1711 + }, + { + "epoch": 0.06131036582090354, + "grad_norm": 1.8961480855941772, + "learning_rate": 0.0001994865916029727, + "loss": 1.6836, + "step": 1712 + }, + { + "epoch": 0.06134617795047183, + "grad_norm": 1.532853603363037, + "learning_rate": 0.00019948541709248116, + "loss": 1.7492, + "step": 1713 + }, + { + "epoch": 0.06138199008004011, + "grad_norm": 1.632897138595581, + "learning_rate": 0.00019948424124354163, + "loss": 1.7434, + "step": 1714 + }, + { + "epoch": 0.061417802209608394, + "grad_norm": 2.043008804321289, + "learning_rate": 0.00019948306405616996, + "loss": 1.8477, + "step": 1715 + }, + { + "epoch": 0.061453614339176676, + "grad_norm": 1.3530820608139038, + "learning_rate": 0.00019948188553038198, + "loss": 1.9704, + "step": 1716 + }, + { + "epoch": 0.061489426468744966, + "grad_norm": 1.3419357538223267, + "learning_rate": 0.0001994807056661936, + "loss": 1.6517, + "step": 1717 + }, + { + "epoch": 0.06152523859831325, + "grad_norm": 1.2969164848327637, + "learning_rate": 0.00019947952446362058, + "loss": 1.8418, + "step": 1718 + }, + { + "epoch": 0.06156105072788153, + "grad_norm": 2.865709066390991, + "learning_rate": 0.00019947834192267892, + "loss": 1.862, + "step": 1719 + }, + { + "epoch": 0.06159686285744982, + "grad_norm": 1.7418867349624634, + "learning_rate": 0.00019947715804338447, + "loss": 1.8999, + "step": 1720 + }, + { + "epoch": 0.061632674987018105, + "grad_norm": 2.2759385108947754, + "learning_rate": 0.00019947597282575318, + "loss": 1.6186, + "step": 1721 + }, + { + "epoch": 0.06166848711658639, + "grad_norm": 1.5044995546340942, + "learning_rate": 0.00019947478626980097, + "loss": 1.7092, + "step": 1722 + }, + { + "epoch": 0.06170429924615467, + "grad_norm": 1.620406150817871, + "learning_rate": 0.00019947359837554384, + "loss": 1.6904, + "step": 1723 + }, + { + "epoch": 0.06174011137572296, + "grad_norm": 1.9609277248382568, + "learning_rate": 0.00019947240914299776, + "loss": 1.6958, + "step": 1724 + }, + { + "epoch": 0.06177592350529124, + "grad_norm": 2.1059086322784424, + "learning_rate": 0.00019947121857217875, + "loss": 1.7419, + "step": 1725 + }, + { + "epoch": 0.061811735634859526, + "grad_norm": 2.0715277194976807, + "learning_rate": 0.00019947002666310276, + "loss": 1.6952, + "step": 1726 + }, + { + "epoch": 0.06184754776442781, + "grad_norm": 1.6598073244094849, + "learning_rate": 0.00019946883341578588, + "loss": 1.6857, + "step": 1727 + }, + { + "epoch": 0.0618833598939961, + "grad_norm": 1.5604004859924316, + "learning_rate": 0.00019946763883024415, + "loss": 1.6462, + "step": 1728 + }, + { + "epoch": 0.06191917202356438, + "grad_norm": 2.105992555618286, + "learning_rate": 0.00019946644290649368, + "loss": 1.77, + "step": 1729 + }, + { + "epoch": 0.061954984153132664, + "grad_norm": 2.2045366764068604, + "learning_rate": 0.00019946524564455048, + "loss": 2.0856, + "step": 1730 + }, + { + "epoch": 0.061990796282700954, + "grad_norm": 1.7695341110229492, + "learning_rate": 0.00019946404704443072, + "loss": 1.7301, + "step": 1731 + }, + { + "epoch": 0.06202660841226924, + "grad_norm": 1.6013516187667847, + "learning_rate": 0.00019946284710615052, + "loss": 1.5589, + "step": 1732 + }, + { + "epoch": 0.06206242054183752, + "grad_norm": 1.9727685451507568, + "learning_rate": 0.00019946164582972594, + "loss": 1.8941, + "step": 1733 + }, + { + "epoch": 0.0620982326714058, + "grad_norm": 1.8168443441390991, + "learning_rate": 0.0001994604432151733, + "loss": 1.5806, + "step": 1734 + }, + { + "epoch": 0.06213404480097409, + "grad_norm": 1.5395933389663696, + "learning_rate": 0.00019945923926250865, + "loss": 1.88, + "step": 1735 + }, + { + "epoch": 0.062169856930542375, + "grad_norm": 1.3251415491104126, + "learning_rate": 0.00019945803397174823, + "loss": 1.6235, + "step": 1736 + }, + { + "epoch": 0.06220566906011066, + "grad_norm": 1.7758703231811523, + "learning_rate": 0.00019945682734290825, + "loss": 1.7609, + "step": 1737 + }, + { + "epoch": 0.06224148118967895, + "grad_norm": 1.366156816482544, + "learning_rate": 0.00019945561937600496, + "loss": 1.629, + "step": 1738 + }, + { + "epoch": 0.06227729331924723, + "grad_norm": 2.5820953845977783, + "learning_rate": 0.0001994544100710546, + "loss": 1.6179, + "step": 1739 + }, + { + "epoch": 0.06231310544881551, + "grad_norm": 1.8584883213043213, + "learning_rate": 0.0001994531994280734, + "loss": 1.8395, + "step": 1740 + }, + { + "epoch": 0.062348917578383796, + "grad_norm": 1.4980369806289673, + "learning_rate": 0.00019945198744707776, + "loss": 1.8487, + "step": 1741 + }, + { + "epoch": 0.062384729707952086, + "grad_norm": 2.2591607570648193, + "learning_rate": 0.00019945077412808387, + "loss": 1.8087, + "step": 1742 + }, + { + "epoch": 0.06242054183752037, + "grad_norm": 4.082409381866455, + "learning_rate": 0.0001994495594711081, + "loss": 1.7609, + "step": 1743 + }, + { + "epoch": 0.06245635396708865, + "grad_norm": 1.8433887958526611, + "learning_rate": 0.0001994483434761668, + "loss": 1.6445, + "step": 1744 + }, + { + "epoch": 0.06249216609665694, + "grad_norm": 1.5228683948516846, + "learning_rate": 0.0001994471261432763, + "loss": 1.8867, + "step": 1745 + }, + { + "epoch": 0.06252797822622522, + "grad_norm": 1.2005826234817505, + "learning_rate": 0.00019944590747245298, + "loss": 1.7023, + "step": 1746 + }, + { + "epoch": 0.06256379035579351, + "grad_norm": 1.6530516147613525, + "learning_rate": 0.0001994446874637133, + "loss": 1.9049, + "step": 1747 + }, + { + "epoch": 0.0625996024853618, + "grad_norm": 2.352060556411743, + "learning_rate": 0.00019944346611707356, + "loss": 1.6247, + "step": 1748 + }, + { + "epoch": 0.06263541461493008, + "grad_norm": 2.1047065258026123, + "learning_rate": 0.0001994422434325503, + "loss": 1.6373, + "step": 1749 + }, + { + "epoch": 0.06267122674449836, + "grad_norm": 1.7999935150146484, + "learning_rate": 0.00019944101941015994, + "loss": 1.7002, + "step": 1750 + }, + { + "epoch": 0.06270703887406665, + "grad_norm": 1.833964467048645, + "learning_rate": 0.0001994397940499189, + "loss": 1.6692, + "step": 1751 + }, + { + "epoch": 0.06274285100363493, + "grad_norm": 1.7198271751403809, + "learning_rate": 0.00019943856735184373, + "loss": 1.9069, + "step": 1752 + }, + { + "epoch": 0.06277866313320321, + "grad_norm": 2.55334734916687, + "learning_rate": 0.00019943733931595086, + "loss": 1.789, + "step": 1753 + }, + { + "epoch": 0.06281447526277151, + "grad_norm": 1.8098565340042114, + "learning_rate": 0.0001994361099422569, + "loss": 1.7267, + "step": 1754 + }, + { + "epoch": 0.06285028739233979, + "grad_norm": 1.1642190217971802, + "learning_rate": 0.00019943487923077831, + "loss": 1.4918, + "step": 1755 + }, + { + "epoch": 0.06288609952190807, + "grad_norm": 2.4731671810150146, + "learning_rate": 0.0001994336471815317, + "loss": 1.7026, + "step": 1756 + }, + { + "epoch": 0.06292191165147636, + "grad_norm": 1.744199514389038, + "learning_rate": 0.00019943241379453364, + "loss": 1.6319, + "step": 1757 + }, + { + "epoch": 0.06295772378104464, + "grad_norm": 2.089003562927246, + "learning_rate": 0.00019943117906980068, + "loss": 1.5497, + "step": 1758 + }, + { + "epoch": 0.06299353591061292, + "grad_norm": 1.1575398445129395, + "learning_rate": 0.00019942994300734947, + "loss": 1.7987, + "step": 1759 + }, + { + "epoch": 0.0630293480401812, + "grad_norm": 1.753517746925354, + "learning_rate": 0.00019942870560719664, + "loss": 1.7975, + "step": 1760 + }, + { + "epoch": 0.06306516016974949, + "grad_norm": 2.7385034561157227, + "learning_rate": 0.00019942746686935883, + "loss": 1.641, + "step": 1761 + }, + { + "epoch": 0.06310097229931778, + "grad_norm": 3.074314594268799, + "learning_rate": 0.0001994262267938527, + "loss": 1.6846, + "step": 1762 + }, + { + "epoch": 0.06313678442888607, + "grad_norm": 1.2961831092834473, + "learning_rate": 0.00019942498538069495, + "loss": 1.7411, + "step": 1763 + }, + { + "epoch": 0.06317259655845435, + "grad_norm": 2.032168388366699, + "learning_rate": 0.00019942374262990224, + "loss": 1.8094, + "step": 1764 + }, + { + "epoch": 0.06320840868802263, + "grad_norm": 1.753873586654663, + "learning_rate": 0.00019942249854149136, + "loss": 1.7005, + "step": 1765 + }, + { + "epoch": 0.06324422081759092, + "grad_norm": 2.4005773067474365, + "learning_rate": 0.000199421253115479, + "loss": 1.8196, + "step": 1766 + }, + { + "epoch": 0.0632800329471592, + "grad_norm": 1.66982102394104, + "learning_rate": 0.0001994200063518819, + "loss": 1.7387, + "step": 1767 + }, + { + "epoch": 0.06331584507672748, + "grad_norm": 1.7754584550857544, + "learning_rate": 0.0001994187582507169, + "loss": 1.7563, + "step": 1768 + }, + { + "epoch": 0.06335165720629578, + "grad_norm": 2.168553590774536, + "learning_rate": 0.0001994175088120007, + "loss": 1.5715, + "step": 1769 + }, + { + "epoch": 0.06338746933586406, + "grad_norm": 2.892993450164795, + "learning_rate": 0.00019941625803575019, + "loss": 1.6514, + "step": 1770 + }, + { + "epoch": 0.06342328146543234, + "grad_norm": 1.5328547954559326, + "learning_rate": 0.00019941500592198216, + "loss": 1.7424, + "step": 1771 + }, + { + "epoch": 0.06345909359500063, + "grad_norm": 1.4240583181381226, + "learning_rate": 0.00019941375247071346, + "loss": 1.7308, + "step": 1772 + }, + { + "epoch": 0.06349490572456891, + "grad_norm": 1.379332184791565, + "learning_rate": 0.00019941249768196093, + "loss": 1.6878, + "step": 1773 + }, + { + "epoch": 0.06353071785413719, + "grad_norm": 1.5497807264328003, + "learning_rate": 0.0001994112415557415, + "loss": 1.6895, + "step": 1774 + }, + { + "epoch": 0.06356652998370547, + "grad_norm": 2.0320279598236084, + "learning_rate": 0.00019940998409207205, + "loss": 1.9394, + "step": 1775 + }, + { + "epoch": 0.06360234211327377, + "grad_norm": 1.5920579433441162, + "learning_rate": 0.00019940872529096947, + "loss": 1.4275, + "step": 1776 + }, + { + "epoch": 0.06363815424284205, + "grad_norm": 1.7592054605484009, + "learning_rate": 0.00019940746515245073, + "loss": 1.8923, + "step": 1777 + }, + { + "epoch": 0.06367396637241034, + "grad_norm": 1.5628163814544678, + "learning_rate": 0.0001994062036765328, + "loss": 1.9191, + "step": 1778 + }, + { + "epoch": 0.06370977850197862, + "grad_norm": 2.930593729019165, + "learning_rate": 0.0001994049408632326, + "loss": 1.7287, + "step": 1779 + }, + { + "epoch": 0.0637455906315469, + "grad_norm": 2.2003262042999268, + "learning_rate": 0.00019940367671256716, + "loss": 1.8437, + "step": 1780 + }, + { + "epoch": 0.06378140276111519, + "grad_norm": 1.2991431951522827, + "learning_rate": 0.00019940241122455346, + "loss": 1.8275, + "step": 1781 + }, + { + "epoch": 0.06381721489068347, + "grad_norm": 1.6640808582305908, + "learning_rate": 0.00019940114439920853, + "loss": 1.7935, + "step": 1782 + }, + { + "epoch": 0.06385302702025177, + "grad_norm": 1.9864023923873901, + "learning_rate": 0.00019939987623654944, + "loss": 1.7738, + "step": 1783 + }, + { + "epoch": 0.06388883914982005, + "grad_norm": 1.4357529878616333, + "learning_rate": 0.0001993986067365932, + "loss": 1.7587, + "step": 1784 + }, + { + "epoch": 0.06392465127938833, + "grad_norm": 1.5999445915222168, + "learning_rate": 0.00019939733589935694, + "loss": 1.7795, + "step": 1785 + }, + { + "epoch": 0.06396046340895661, + "grad_norm": 2.447521448135376, + "learning_rate": 0.00019939606372485776, + "loss": 1.9686, + "step": 1786 + }, + { + "epoch": 0.0639962755385249, + "grad_norm": 1.655842900276184, + "learning_rate": 0.00019939479021311273, + "loss": 2.0018, + "step": 1787 + }, + { + "epoch": 0.06403208766809318, + "grad_norm": 1.4144165515899658, + "learning_rate": 0.000199393515364139, + "loss": 1.5992, + "step": 1788 + }, + { + "epoch": 0.06406789979766146, + "grad_norm": 2.399679183959961, + "learning_rate": 0.00019939223917795373, + "loss": 1.7488, + "step": 1789 + }, + { + "epoch": 0.06410371192722976, + "grad_norm": 2.1720006465911865, + "learning_rate": 0.00019939096165457411, + "loss": 1.8954, + "step": 1790 + }, + { + "epoch": 0.06413952405679804, + "grad_norm": 1.701594352722168, + "learning_rate": 0.00019938968279401727, + "loss": 1.9236, + "step": 1791 + }, + { + "epoch": 0.06417533618636632, + "grad_norm": 1.7471704483032227, + "learning_rate": 0.0001993884025963005, + "loss": 1.9107, + "step": 1792 + }, + { + "epoch": 0.06421114831593461, + "grad_norm": 1.7068006992340088, + "learning_rate": 0.00019938712106144096, + "loss": 1.9236, + "step": 1793 + }, + { + "epoch": 0.06424696044550289, + "grad_norm": 1.5164170265197754, + "learning_rate": 0.0001993858381894559, + "loss": 2.014, + "step": 1794 + }, + { + "epoch": 0.06428277257507117, + "grad_norm": 1.6961904764175415, + "learning_rate": 0.00019938455398036257, + "loss": 2.0187, + "step": 1795 + }, + { + "epoch": 0.06431858470463946, + "grad_norm": 1.8207290172576904, + "learning_rate": 0.00019938326843417826, + "loss": 1.533, + "step": 1796 + }, + { + "epoch": 0.06435439683420775, + "grad_norm": 2.8234851360321045, + "learning_rate": 0.0001993819815509203, + "loss": 1.7141, + "step": 1797 + }, + { + "epoch": 0.06439020896377604, + "grad_norm": 2.3623080253601074, + "learning_rate": 0.00019938069333060593, + "loss": 2.0943, + "step": 1798 + }, + { + "epoch": 0.06442602109334432, + "grad_norm": 1.3733361959457397, + "learning_rate": 0.00019937940377325256, + "loss": 1.6068, + "step": 1799 + }, + { + "epoch": 0.0644618332229126, + "grad_norm": 1.828696370124817, + "learning_rate": 0.0001993781128788775, + "loss": 1.6581, + "step": 1800 + }, + { + "epoch": 0.06449764535248088, + "grad_norm": 1.8768473863601685, + "learning_rate": 0.00019937682064749811, + "loss": 1.8659, + "step": 1801 + }, + { + "epoch": 0.06453345748204917, + "grad_norm": 1.796538233757019, + "learning_rate": 0.0001993755270791318, + "loss": 1.6696, + "step": 1802 + }, + { + "epoch": 0.06456926961161745, + "grad_norm": 2.022562026977539, + "learning_rate": 0.00019937423217379594, + "loss": 1.5342, + "step": 1803 + }, + { + "epoch": 0.06460508174118575, + "grad_norm": 1.4857107400894165, + "learning_rate": 0.00019937293593150796, + "loss": 1.3543, + "step": 1804 + }, + { + "epoch": 0.06464089387075403, + "grad_norm": 1.9994350671768188, + "learning_rate": 0.00019937163835228534, + "loss": 1.6907, + "step": 1805 + }, + { + "epoch": 0.06467670600032231, + "grad_norm": 1.9123289585113525, + "learning_rate": 0.0001993703394361455, + "loss": 1.5374, + "step": 1806 + }, + { + "epoch": 0.0647125181298906, + "grad_norm": 1.8468797206878662, + "learning_rate": 0.0001993690391831059, + "loss": 1.4841, + "step": 1807 + }, + { + "epoch": 0.06474833025945888, + "grad_norm": 2.5288283824920654, + "learning_rate": 0.00019936773759318408, + "loss": 1.5264, + "step": 1808 + }, + { + "epoch": 0.06478414238902716, + "grad_norm": 1.552699089050293, + "learning_rate": 0.0001993664346663975, + "loss": 1.6451, + "step": 1809 + }, + { + "epoch": 0.06481995451859544, + "grad_norm": 1.2891765832901, + "learning_rate": 0.00019936513040276371, + "loss": 1.4697, + "step": 1810 + }, + { + "epoch": 0.06485576664816373, + "grad_norm": 2.329200267791748, + "learning_rate": 0.00019936382480230028, + "loss": 1.55, + "step": 1811 + }, + { + "epoch": 0.06489157877773202, + "grad_norm": 1.7297874689102173, + "learning_rate": 0.00019936251786502478, + "loss": 1.4764, + "step": 1812 + }, + { + "epoch": 0.0649273909073003, + "grad_norm": 1.8127317428588867, + "learning_rate": 0.00019936120959095473, + "loss": 1.5498, + "step": 1813 + }, + { + "epoch": 0.06496320303686859, + "grad_norm": 1.7599560022354126, + "learning_rate": 0.0001993598999801078, + "loss": 1.7308, + "step": 1814 + }, + { + "epoch": 0.06499901516643687, + "grad_norm": 2.6168103218078613, + "learning_rate": 0.00019935858903250155, + "loss": 1.8455, + "step": 1815 + }, + { + "epoch": 0.06503482729600515, + "grad_norm": 2.163510799407959, + "learning_rate": 0.00019935727674815369, + "loss": 1.6476, + "step": 1816 + }, + { + "epoch": 0.06507063942557344, + "grad_norm": 1.5857428312301636, + "learning_rate": 0.0001993559631270818, + "loss": 1.771, + "step": 1817 + }, + { + "epoch": 0.06510645155514172, + "grad_norm": 2.8085968494415283, + "learning_rate": 0.0001993546481693036, + "loss": 1.7182, + "step": 1818 + }, + { + "epoch": 0.06514226368471002, + "grad_norm": 1.4190795421600342, + "learning_rate": 0.00019935333187483676, + "loss": 1.6694, + "step": 1819 + }, + { + "epoch": 0.0651780758142783, + "grad_norm": 1.375416874885559, + "learning_rate": 0.000199352014243699, + "loss": 1.7377, + "step": 1820 + }, + { + "epoch": 0.06521388794384658, + "grad_norm": 2.3879919052124023, + "learning_rate": 0.00019935069527590805, + "loss": 1.703, + "step": 1821 + }, + { + "epoch": 0.06524970007341487, + "grad_norm": 1.3292384147644043, + "learning_rate": 0.00019934937497148163, + "loss": 1.657, + "step": 1822 + }, + { + "epoch": 0.06528551220298315, + "grad_norm": 1.2860294580459595, + "learning_rate": 0.00019934805333043752, + "loss": 1.6298, + "step": 1823 + }, + { + "epoch": 0.06532132433255143, + "grad_norm": 1.6525280475616455, + "learning_rate": 0.00019934673035279353, + "loss": 1.8054, + "step": 1824 + }, + { + "epoch": 0.06535713646211971, + "grad_norm": 1.3846570253372192, + "learning_rate": 0.00019934540603856743, + "loss": 1.6464, + "step": 1825 + }, + { + "epoch": 0.06539294859168801, + "grad_norm": 1.7888760566711426, + "learning_rate": 0.000199344080387777, + "loss": 1.9671, + "step": 1826 + }, + { + "epoch": 0.0654287607212563, + "grad_norm": 1.9460965394973755, + "learning_rate": 0.00019934275340044013, + "loss": 1.7569, + "step": 1827 + }, + { + "epoch": 0.06546457285082458, + "grad_norm": 2.1091198921203613, + "learning_rate": 0.0001993414250765747, + "loss": 1.6684, + "step": 1828 + }, + { + "epoch": 0.06550038498039286, + "grad_norm": 1.813179612159729, + "learning_rate": 0.0001993400954161985, + "loss": 1.6355, + "step": 1829 + }, + { + "epoch": 0.06553619710996114, + "grad_norm": 1.5191233158111572, + "learning_rate": 0.00019933876441932943, + "loss": 1.7589, + "step": 1830 + }, + { + "epoch": 0.06557200923952942, + "grad_norm": 2.3240489959716797, + "learning_rate": 0.00019933743208598546, + "loss": 1.6427, + "step": 1831 + }, + { + "epoch": 0.06560782136909771, + "grad_norm": 1.6643846035003662, + "learning_rate": 0.00019933609841618445, + "loss": 1.8053, + "step": 1832 + }, + { + "epoch": 0.065643633498666, + "grad_norm": 2.014047145843506, + "learning_rate": 0.0001993347634099444, + "loss": 1.3374, + "step": 1833 + }, + { + "epoch": 0.06567944562823429, + "grad_norm": 1.8650319576263428, + "learning_rate": 0.00019933342706728323, + "loss": 1.9769, + "step": 1834 + }, + { + "epoch": 0.06571525775780257, + "grad_norm": 1.386111855506897, + "learning_rate": 0.0001993320893882189, + "loss": 1.2967, + "step": 1835 + }, + { + "epoch": 0.06575106988737085, + "grad_norm": 1.4114490747451782, + "learning_rate": 0.00019933075037276949, + "loss": 1.5003, + "step": 1836 + }, + { + "epoch": 0.06578688201693914, + "grad_norm": 1.2881723642349243, + "learning_rate": 0.00019932941002095294, + "loss": 2.0214, + "step": 1837 + }, + { + "epoch": 0.06582269414650742, + "grad_norm": 1.827712059020996, + "learning_rate": 0.00019932806833278726, + "loss": 1.4548, + "step": 1838 + }, + { + "epoch": 0.0658585062760757, + "grad_norm": 2.9244232177734375, + "learning_rate": 0.0001993267253082906, + "loss": 1.6894, + "step": 1839 + }, + { + "epoch": 0.065894318405644, + "grad_norm": 1.268809199333191, + "learning_rate": 0.00019932538094748098, + "loss": 1.5501, + "step": 1840 + }, + { + "epoch": 0.06593013053521228, + "grad_norm": 1.1067763566970825, + "learning_rate": 0.00019932403525037642, + "loss": 1.6284, + "step": 1841 + }, + { + "epoch": 0.06596594266478056, + "grad_norm": 2.2531087398529053, + "learning_rate": 0.00019932268821699513, + "loss": 1.7967, + "step": 1842 + }, + { + "epoch": 0.06600175479434885, + "grad_norm": 2.0509607791900635, + "learning_rate": 0.0001993213398473552, + "loss": 1.2527, + "step": 1843 + }, + { + "epoch": 0.06603756692391713, + "grad_norm": 2.42293381690979, + "learning_rate": 0.00019931999014147472, + "loss": 1.8235, + "step": 1844 + }, + { + "epoch": 0.06607337905348541, + "grad_norm": 1.7757221460342407, + "learning_rate": 0.0001993186390993719, + "loss": 2.0193, + "step": 1845 + }, + { + "epoch": 0.0661091911830537, + "grad_norm": 1.6632944345474243, + "learning_rate": 0.0001993172867210649, + "loss": 1.8347, + "step": 1846 + }, + { + "epoch": 0.06614500331262199, + "grad_norm": 2.6417300701141357, + "learning_rate": 0.00019931593300657192, + "loss": 2.097, + "step": 1847 + }, + { + "epoch": 0.06618081544219027, + "grad_norm": 2.0074949264526367, + "learning_rate": 0.00019931457795591118, + "loss": 1.5325, + "step": 1848 + }, + { + "epoch": 0.06621662757175856, + "grad_norm": 1.9241271018981934, + "learning_rate": 0.00019931322156910088, + "loss": 1.7955, + "step": 1849 + }, + { + "epoch": 0.06625243970132684, + "grad_norm": 1.4078401327133179, + "learning_rate": 0.0001993118638461593, + "loss": 1.6546, + "step": 1850 + }, + { + "epoch": 0.06628825183089512, + "grad_norm": 1.9178889989852905, + "learning_rate": 0.00019931050478710468, + "loss": 1.7818, + "step": 1851 + }, + { + "epoch": 0.0663240639604634, + "grad_norm": 2.1118886470794678, + "learning_rate": 0.00019930914439195534, + "loss": 1.7646, + "step": 1852 + }, + { + "epoch": 0.06635987609003169, + "grad_norm": 2.849180221557617, + "learning_rate": 0.00019930778266072957, + "loss": 1.6227, + "step": 1853 + }, + { + "epoch": 0.06639568821959999, + "grad_norm": 2.1016101837158203, + "learning_rate": 0.00019930641959344566, + "loss": 1.263, + "step": 1854 + }, + { + "epoch": 0.06643150034916827, + "grad_norm": 1.8419221639633179, + "learning_rate": 0.00019930505519012197, + "loss": 1.5502, + "step": 1855 + }, + { + "epoch": 0.06646731247873655, + "grad_norm": 2.3808183670043945, + "learning_rate": 0.0001993036894507769, + "loss": 1.8669, + "step": 1856 + }, + { + "epoch": 0.06650312460830483, + "grad_norm": 1.957215666770935, + "learning_rate": 0.00019930232237542873, + "loss": 1.7474, + "step": 1857 + }, + { + "epoch": 0.06653893673787312, + "grad_norm": 1.7244082689285278, + "learning_rate": 0.0001993009539640959, + "loss": 1.6863, + "step": 1858 + }, + { + "epoch": 0.0665747488674414, + "grad_norm": 1.3676176071166992, + "learning_rate": 0.00019929958421679685, + "loss": 1.6681, + "step": 1859 + }, + { + "epoch": 0.06661056099700968, + "grad_norm": 1.637736439704895, + "learning_rate": 0.00019929821313354997, + "loss": 1.8369, + "step": 1860 + }, + { + "epoch": 0.06664637312657797, + "grad_norm": 1.4250011444091797, + "learning_rate": 0.00019929684071437373, + "loss": 1.7336, + "step": 1861 + }, + { + "epoch": 0.06668218525614626, + "grad_norm": 1.5548502206802368, + "learning_rate": 0.00019929546695928658, + "loss": 1.8462, + "step": 1862 + }, + { + "epoch": 0.06671799738571454, + "grad_norm": 1.8258202075958252, + "learning_rate": 0.000199294091868307, + "loss": 1.6896, + "step": 1863 + }, + { + "epoch": 0.06675380951528283, + "grad_norm": 1.2990753650665283, + "learning_rate": 0.0001992927154414535, + "loss": 1.6857, + "step": 1864 + }, + { + "epoch": 0.06678962164485111, + "grad_norm": 1.2354774475097656, + "learning_rate": 0.00019929133767874454, + "loss": 1.5531, + "step": 1865 + }, + { + "epoch": 0.0668254337744194, + "grad_norm": 1.4240732192993164, + "learning_rate": 0.0001992899585801988, + "loss": 1.8465, + "step": 1866 + }, + { + "epoch": 0.06686124590398768, + "grad_norm": 1.7457743883132935, + "learning_rate": 0.0001992885781458347, + "loss": 1.7631, + "step": 1867 + }, + { + "epoch": 0.06689705803355596, + "grad_norm": 1.6174548864364624, + "learning_rate": 0.0001992871963756708, + "loss": 1.5175, + "step": 1868 + }, + { + "epoch": 0.06693287016312426, + "grad_norm": 1.0970457792282104, + "learning_rate": 0.00019928581326972582, + "loss": 1.6494, + "step": 1869 + }, + { + "epoch": 0.06696868229269254, + "grad_norm": 1.3978979587554932, + "learning_rate": 0.00019928442882801825, + "loss": 1.5092, + "step": 1870 + }, + { + "epoch": 0.06700449442226082, + "grad_norm": 1.5778833627700806, + "learning_rate": 0.00019928304305056677, + "loss": 1.7631, + "step": 1871 + }, + { + "epoch": 0.0670403065518291, + "grad_norm": 1.6992747783660889, + "learning_rate": 0.00019928165593739, + "loss": 1.8914, + "step": 1872 + }, + { + "epoch": 0.06707611868139739, + "grad_norm": 1.8649605512619019, + "learning_rate": 0.00019928026748850663, + "loss": 1.8694, + "step": 1873 + }, + { + "epoch": 0.06711193081096567, + "grad_norm": 1.2663445472717285, + "learning_rate": 0.00019927887770393533, + "loss": 1.757, + "step": 1874 + }, + { + "epoch": 0.06714774294053395, + "grad_norm": 1.696965217590332, + "learning_rate": 0.0001992774865836948, + "loss": 1.7698, + "step": 1875 + }, + { + "epoch": 0.06718355507010225, + "grad_norm": 2.0695016384124756, + "learning_rate": 0.0001992760941278037, + "loss": 1.6721, + "step": 1876 + }, + { + "epoch": 0.06721936719967053, + "grad_norm": 1.6472669839859009, + "learning_rate": 0.00019927470033628087, + "loss": 1.5199, + "step": 1877 + }, + { + "epoch": 0.06725517932923882, + "grad_norm": 3.179795742034912, + "learning_rate": 0.00019927330520914496, + "loss": 1.4682, + "step": 1878 + }, + { + "epoch": 0.0672909914588071, + "grad_norm": 1.6882466077804565, + "learning_rate": 0.00019927190874641478, + "loss": 1.8529, + "step": 1879 + }, + { + "epoch": 0.06732680358837538, + "grad_norm": 1.6953445672988892, + "learning_rate": 0.00019927051094810913, + "loss": 1.9795, + "step": 1880 + }, + { + "epoch": 0.06736261571794366, + "grad_norm": 1.3161673545837402, + "learning_rate": 0.00019926911181424682, + "loss": 1.7497, + "step": 1881 + }, + { + "epoch": 0.06739842784751195, + "grad_norm": 1.8265464305877686, + "learning_rate": 0.00019926771134484662, + "loss": 1.5825, + "step": 1882 + }, + { + "epoch": 0.06743423997708024, + "grad_norm": 2.6041676998138428, + "learning_rate": 0.00019926630953992746, + "loss": 1.9153, + "step": 1883 + }, + { + "epoch": 0.06747005210664853, + "grad_norm": 2.21744441986084, + "learning_rate": 0.00019926490639950812, + "loss": 1.8852, + "step": 1884 + }, + { + "epoch": 0.06750586423621681, + "grad_norm": 2.1859962940216064, + "learning_rate": 0.00019926350192360753, + "loss": 1.441, + "step": 1885 + }, + { + "epoch": 0.06754167636578509, + "grad_norm": 1.6695815324783325, + "learning_rate": 0.00019926209611224454, + "loss": 1.5176, + "step": 1886 + }, + { + "epoch": 0.06757748849535337, + "grad_norm": 1.3804771900177002, + "learning_rate": 0.00019926068896543807, + "loss": 1.5485, + "step": 1887 + }, + { + "epoch": 0.06761330062492166, + "grad_norm": 1.9427920579910278, + "learning_rate": 0.0001992592804832071, + "loss": 1.6327, + "step": 1888 + }, + { + "epoch": 0.06764911275448994, + "grad_norm": 1.2968367338180542, + "learning_rate": 0.00019925787066557053, + "loss": 1.7162, + "step": 1889 + }, + { + "epoch": 0.06768492488405824, + "grad_norm": 1.447363257408142, + "learning_rate": 0.00019925645951254735, + "loss": 1.6678, + "step": 1890 + }, + { + "epoch": 0.06772073701362652, + "grad_norm": 1.4006659984588623, + "learning_rate": 0.00019925504702415653, + "loss": 1.7161, + "step": 1891 + }, + { + "epoch": 0.0677565491431948, + "grad_norm": 1.27907395362854, + "learning_rate": 0.00019925363320041708, + "loss": 1.5937, + "step": 1892 + }, + { + "epoch": 0.06779236127276309, + "grad_norm": 1.2862017154693604, + "learning_rate": 0.00019925221804134805, + "loss": 1.6961, + "step": 1893 + }, + { + "epoch": 0.06782817340233137, + "grad_norm": 2.0907084941864014, + "learning_rate": 0.0001992508015469684, + "loss": 1.5839, + "step": 1894 + }, + { + "epoch": 0.06786398553189965, + "grad_norm": 2.4519996643066406, + "learning_rate": 0.00019924938371729728, + "loss": 1.772, + "step": 1895 + }, + { + "epoch": 0.06789979766146793, + "grad_norm": 2.106823205947876, + "learning_rate": 0.00019924796455235373, + "loss": 1.9488, + "step": 1896 + }, + { + "epoch": 0.06793560979103623, + "grad_norm": 1.8093961477279663, + "learning_rate": 0.00019924654405215682, + "loss": 1.5178, + "step": 1897 + }, + { + "epoch": 0.06797142192060451, + "grad_norm": 2.3577721118927, + "learning_rate": 0.00019924512221672572, + "loss": 1.6168, + "step": 1898 + }, + { + "epoch": 0.0680072340501728, + "grad_norm": 1.4707239866256714, + "learning_rate": 0.00019924369904607945, + "loss": 1.6463, + "step": 1899 + }, + { + "epoch": 0.06804304617974108, + "grad_norm": 1.9914259910583496, + "learning_rate": 0.00019924227454023728, + "loss": 1.6292, + "step": 1900 + }, + { + "epoch": 0.06807885830930936, + "grad_norm": 1.2515363693237305, + "learning_rate": 0.0001992408486992183, + "loss": 1.754, + "step": 1901 + }, + { + "epoch": 0.06811467043887764, + "grad_norm": 2.06872296333313, + "learning_rate": 0.00019923942152304169, + "loss": 1.3609, + "step": 1902 + }, + { + "epoch": 0.06815048256844593, + "grad_norm": 1.7905656099319458, + "learning_rate": 0.0001992379930117267, + "loss": 1.7038, + "step": 1903 + }, + { + "epoch": 0.06818629469801422, + "grad_norm": 1.7369552850723267, + "learning_rate": 0.00019923656316529252, + "loss": 1.5613, + "step": 1904 + }, + { + "epoch": 0.06822210682758251, + "grad_norm": 1.4253979921340942, + "learning_rate": 0.00019923513198375837, + "loss": 1.53, + "step": 1905 + }, + { + "epoch": 0.06825791895715079, + "grad_norm": 1.3576210737228394, + "learning_rate": 0.00019923369946714354, + "loss": 1.9039, + "step": 1906 + }, + { + "epoch": 0.06829373108671907, + "grad_norm": 2.030214548110962, + "learning_rate": 0.00019923226561546726, + "loss": 1.5617, + "step": 1907 + }, + { + "epoch": 0.06832954321628736, + "grad_norm": 2.2546305656433105, + "learning_rate": 0.00019923083042874885, + "loss": 2.2339, + "step": 1908 + }, + { + "epoch": 0.06836535534585564, + "grad_norm": 2.327280044555664, + "learning_rate": 0.00019922939390700767, + "loss": 1.5703, + "step": 1909 + }, + { + "epoch": 0.06840116747542392, + "grad_norm": 2.571589708328247, + "learning_rate": 0.00019922795605026295, + "loss": 1.6462, + "step": 1910 + }, + { + "epoch": 0.0684369796049922, + "grad_norm": 1.4403351545333862, + "learning_rate": 0.00019922651685853407, + "loss": 1.6363, + "step": 1911 + }, + { + "epoch": 0.0684727917345605, + "grad_norm": 2.2027952671051025, + "learning_rate": 0.0001992250763318404, + "loss": 1.6944, + "step": 1912 + }, + { + "epoch": 0.06850860386412878, + "grad_norm": 2.0483181476593018, + "learning_rate": 0.00019922363447020134, + "loss": 2.0797, + "step": 1913 + }, + { + "epoch": 0.06854441599369707, + "grad_norm": 1.3529871702194214, + "learning_rate": 0.00019922219127363624, + "loss": 1.6436, + "step": 1914 + }, + { + "epoch": 0.06858022812326535, + "grad_norm": 1.9239258766174316, + "learning_rate": 0.00019922074674216456, + "loss": 1.5636, + "step": 1915 + }, + { + "epoch": 0.06861604025283363, + "grad_norm": 2.6495797634124756, + "learning_rate": 0.00019921930087580573, + "loss": 1.9851, + "step": 1916 + }, + { + "epoch": 0.06865185238240192, + "grad_norm": 1.5434764623641968, + "learning_rate": 0.00019921785367457917, + "loss": 1.8326, + "step": 1917 + }, + { + "epoch": 0.0686876645119702, + "grad_norm": 1.602738857269287, + "learning_rate": 0.00019921640513850437, + "loss": 1.5884, + "step": 1918 + }, + { + "epoch": 0.0687234766415385, + "grad_norm": 1.9453281164169312, + "learning_rate": 0.00019921495526760083, + "loss": 1.6429, + "step": 1919 + }, + { + "epoch": 0.06875928877110678, + "grad_norm": 1.6963716745376587, + "learning_rate": 0.00019921350406188805, + "loss": 1.642, + "step": 1920 + }, + { + "epoch": 0.06879510090067506, + "grad_norm": 1.7032421827316284, + "learning_rate": 0.00019921205152138556, + "loss": 1.5783, + "step": 1921 + }, + { + "epoch": 0.06883091303024334, + "grad_norm": 1.9125220775604248, + "learning_rate": 0.00019921059764611284, + "loss": 1.9889, + "step": 1922 + }, + { + "epoch": 0.06886672515981163, + "grad_norm": 1.6646088361740112, + "learning_rate": 0.00019920914243608956, + "loss": 1.7274, + "step": 1923 + }, + { + "epoch": 0.06890253728937991, + "grad_norm": 1.3573722839355469, + "learning_rate": 0.0001992076858913352, + "loss": 1.3907, + "step": 1924 + }, + { + "epoch": 0.06893834941894819, + "grad_norm": 1.8801019191741943, + "learning_rate": 0.0001992062280118694, + "loss": 1.8737, + "step": 1925 + }, + { + "epoch": 0.06897416154851649, + "grad_norm": 1.404797911643982, + "learning_rate": 0.0001992047687977118, + "loss": 1.5543, + "step": 1926 + }, + { + "epoch": 0.06900997367808477, + "grad_norm": 2.4929728507995605, + "learning_rate": 0.00019920330824888197, + "loss": 1.8148, + "step": 1927 + }, + { + "epoch": 0.06904578580765305, + "grad_norm": 1.5967156887054443, + "learning_rate": 0.0001992018463653996, + "loss": 1.7098, + "step": 1928 + }, + { + "epoch": 0.06908159793722134, + "grad_norm": 1.629167079925537, + "learning_rate": 0.00019920038314728434, + "loss": 1.6494, + "step": 1929 + }, + { + "epoch": 0.06911741006678962, + "grad_norm": 1.807207703590393, + "learning_rate": 0.00019919891859455588, + "loss": 1.8693, + "step": 1930 + }, + { + "epoch": 0.0691532221963579, + "grad_norm": 1.9468473196029663, + "learning_rate": 0.00019919745270723395, + "loss": 1.7264, + "step": 1931 + }, + { + "epoch": 0.06918903432592619, + "grad_norm": 1.7176305055618286, + "learning_rate": 0.00019919598548533824, + "loss": 1.8964, + "step": 1932 + }, + { + "epoch": 0.06922484645549448, + "grad_norm": 2.3279569149017334, + "learning_rate": 0.00019919451692888848, + "loss": 1.7493, + "step": 1933 + }, + { + "epoch": 0.06926065858506276, + "grad_norm": 2.4404523372650146, + "learning_rate": 0.00019919304703790446, + "loss": 1.8917, + "step": 1934 + }, + { + "epoch": 0.06929647071463105, + "grad_norm": 1.455718755722046, + "learning_rate": 0.00019919157581240596, + "loss": 1.6557, + "step": 1935 + }, + { + "epoch": 0.06933228284419933, + "grad_norm": 2.637948751449585, + "learning_rate": 0.00019919010325241275, + "loss": 1.4312, + "step": 1936 + }, + { + "epoch": 0.06936809497376761, + "grad_norm": 1.4223843812942505, + "learning_rate": 0.00019918862935794463, + "loss": 2.0284, + "step": 1937 + }, + { + "epoch": 0.0694039071033359, + "grad_norm": 2.231555461883545, + "learning_rate": 0.00019918715412902142, + "loss": 1.9927, + "step": 1938 + }, + { + "epoch": 0.06943971923290418, + "grad_norm": 1.6516687870025635, + "learning_rate": 0.00019918567756566305, + "loss": 1.5935, + "step": 1939 + }, + { + "epoch": 0.06947553136247248, + "grad_norm": 1.9006787538528442, + "learning_rate": 0.0001991841996678893, + "loss": 1.2902, + "step": 1940 + }, + { + "epoch": 0.06951134349204076, + "grad_norm": 2.0806474685668945, + "learning_rate": 0.0001991827204357201, + "loss": 1.3603, + "step": 1941 + }, + { + "epoch": 0.06954715562160904, + "grad_norm": 1.810377836227417, + "learning_rate": 0.0001991812398691753, + "loss": 1.6762, + "step": 1942 + }, + { + "epoch": 0.06958296775117732, + "grad_norm": 1.6597081422805786, + "learning_rate": 0.00019917975796827488, + "loss": 1.6292, + "step": 1943 + }, + { + "epoch": 0.06961877988074561, + "grad_norm": 1.8197176456451416, + "learning_rate": 0.00019917827473303875, + "loss": 1.5692, + "step": 1944 + }, + { + "epoch": 0.06965459201031389, + "grad_norm": 1.2852716445922852, + "learning_rate": 0.00019917679016348685, + "loss": 1.6318, + "step": 1945 + }, + { + "epoch": 0.06969040413988217, + "grad_norm": 1.7549954652786255, + "learning_rate": 0.00019917530425963916, + "loss": 1.8909, + "step": 1946 + }, + { + "epoch": 0.06972621626945047, + "grad_norm": 1.5758832693099976, + "learning_rate": 0.0001991738170215157, + "loss": 1.5813, + "step": 1947 + }, + { + "epoch": 0.06976202839901875, + "grad_norm": 2.270517587661743, + "learning_rate": 0.00019917232844913644, + "loss": 1.8209, + "step": 1948 + }, + { + "epoch": 0.06979784052858704, + "grad_norm": 2.2871134281158447, + "learning_rate": 0.00019917083854252142, + "loss": 1.7456, + "step": 1949 + }, + { + "epoch": 0.06983365265815532, + "grad_norm": 1.7921063899993896, + "learning_rate": 0.00019916934730169073, + "loss": 1.5367, + "step": 1950 + }, + { + "epoch": 0.0698694647877236, + "grad_norm": 1.8753576278686523, + "learning_rate": 0.00019916785472666435, + "loss": 1.6617, + "step": 1951 + }, + { + "epoch": 0.06990527691729188, + "grad_norm": 2.1969332695007324, + "learning_rate": 0.0001991663608174624, + "loss": 1.4074, + "step": 1952 + }, + { + "epoch": 0.06994108904686017, + "grad_norm": 1.6860895156860352, + "learning_rate": 0.000199164865574105, + "loss": 1.9061, + "step": 1953 + }, + { + "epoch": 0.06997690117642846, + "grad_norm": 1.7256300449371338, + "learning_rate": 0.00019916336899661224, + "loss": 1.7184, + "step": 1954 + }, + { + "epoch": 0.07001271330599675, + "grad_norm": 1.6683882474899292, + "learning_rate": 0.00019916187108500428, + "loss": 1.6855, + "step": 1955 + }, + { + "epoch": 0.07004852543556503, + "grad_norm": 1.431535243988037, + "learning_rate": 0.00019916037183930122, + "loss": 1.8098, + "step": 1956 + }, + { + "epoch": 0.07008433756513331, + "grad_norm": 2.2073168754577637, + "learning_rate": 0.00019915887125952327, + "loss": 1.5942, + "step": 1957 + }, + { + "epoch": 0.0701201496947016, + "grad_norm": 1.7813090085983276, + "learning_rate": 0.00019915736934569066, + "loss": 1.7848, + "step": 1958 + }, + { + "epoch": 0.07015596182426988, + "grad_norm": 1.7625131607055664, + "learning_rate": 0.0001991558660978235, + "loss": 1.6697, + "step": 1959 + }, + { + "epoch": 0.07019177395383816, + "grad_norm": 1.319684386253357, + "learning_rate": 0.0001991543615159421, + "loss": 1.8004, + "step": 1960 + }, + { + "epoch": 0.07022758608340644, + "grad_norm": 1.4817261695861816, + "learning_rate": 0.00019915285560006662, + "loss": 1.7313, + "step": 1961 + }, + { + "epoch": 0.07026339821297474, + "grad_norm": 1.8547471761703491, + "learning_rate": 0.00019915134835021738, + "loss": 1.8569, + "step": 1962 + }, + { + "epoch": 0.07029921034254302, + "grad_norm": 1.6080464124679565, + "learning_rate": 0.00019914983976641466, + "loss": 1.4447, + "step": 1963 + }, + { + "epoch": 0.0703350224721113, + "grad_norm": 1.452889323234558, + "learning_rate": 0.00019914832984867874, + "loss": 1.6103, + "step": 1964 + }, + { + "epoch": 0.07037083460167959, + "grad_norm": 1.8950872421264648, + "learning_rate": 0.0001991468185970299, + "loss": 1.7781, + "step": 1965 + }, + { + "epoch": 0.07040664673124787, + "grad_norm": 1.769492745399475, + "learning_rate": 0.00019914530601148855, + "loss": 1.708, + "step": 1966 + }, + { + "epoch": 0.07044245886081615, + "grad_norm": 1.1686387062072754, + "learning_rate": 0.000199143792092075, + "loss": 1.6729, + "step": 1967 + }, + { + "epoch": 0.07047827099038444, + "grad_norm": 1.3533289432525635, + "learning_rate": 0.00019914227683880958, + "loss": 1.6236, + "step": 1968 + }, + { + "epoch": 0.07051408311995273, + "grad_norm": 1.8892587423324585, + "learning_rate": 0.0001991407602517127, + "loss": 1.7556, + "step": 1969 + }, + { + "epoch": 0.07054989524952102, + "grad_norm": 1.8320790529251099, + "learning_rate": 0.00019913924233080482, + "loss": 1.4439, + "step": 1970 + }, + { + "epoch": 0.0705857073790893, + "grad_norm": 2.168354034423828, + "learning_rate": 0.0001991377230761063, + "loss": 1.8053, + "step": 1971 + }, + { + "epoch": 0.07062151950865758, + "grad_norm": 1.4819917678833008, + "learning_rate": 0.00019913620248763756, + "loss": 1.6767, + "step": 1972 + }, + { + "epoch": 0.07065733163822586, + "grad_norm": 1.7416330575942993, + "learning_rate": 0.0001991346805654191, + "loss": 1.7843, + "step": 1973 + }, + { + "epoch": 0.07069314376779415, + "grad_norm": 2.728843927383423, + "learning_rate": 0.00019913315730947143, + "loss": 1.7568, + "step": 1974 + }, + { + "epoch": 0.07072895589736243, + "grad_norm": 1.4552961587905884, + "learning_rate": 0.00019913163271981495, + "loss": 1.3882, + "step": 1975 + }, + { + "epoch": 0.07076476802693073, + "grad_norm": 1.9386544227600098, + "learning_rate": 0.00019913010679647027, + "loss": 1.5637, + "step": 1976 + }, + { + "epoch": 0.07080058015649901, + "grad_norm": 3.1994082927703857, + "learning_rate": 0.00019912857953945784, + "loss": 1.7099, + "step": 1977 + }, + { + "epoch": 0.07083639228606729, + "grad_norm": 1.4596532583236694, + "learning_rate": 0.00019912705094879827, + "loss": 1.5022, + "step": 1978 + }, + { + "epoch": 0.07087220441563558, + "grad_norm": 1.96254563331604, + "learning_rate": 0.00019912552102451206, + "loss": 1.6754, + "step": 1979 + }, + { + "epoch": 0.07090801654520386, + "grad_norm": 1.8212988376617432, + "learning_rate": 0.00019912398976661984, + "loss": 1.6848, + "step": 1980 + }, + { + "epoch": 0.07094382867477214, + "grad_norm": 1.8282296657562256, + "learning_rate": 0.0001991224571751422, + "loss": 1.6715, + "step": 1981 + }, + { + "epoch": 0.07097964080434042, + "grad_norm": 2.6742913722991943, + "learning_rate": 0.00019912092325009975, + "loss": 1.4798, + "step": 1982 + }, + { + "epoch": 0.07101545293390872, + "grad_norm": 2.115812063217163, + "learning_rate": 0.00019911938799151315, + "loss": 1.8264, + "step": 1983 + }, + { + "epoch": 0.071051265063477, + "grad_norm": 1.2611589431762695, + "learning_rate": 0.00019911785139940303, + "loss": 1.6957, + "step": 1984 + }, + { + "epoch": 0.07108707719304529, + "grad_norm": 1.0900263786315918, + "learning_rate": 0.00019911631347379008, + "loss": 1.563, + "step": 1985 + }, + { + "epoch": 0.07112288932261357, + "grad_norm": 1.8478407859802246, + "learning_rate": 0.00019911477421469495, + "loss": 1.6451, + "step": 1986 + }, + { + "epoch": 0.07115870145218185, + "grad_norm": 2.0498433113098145, + "learning_rate": 0.0001991132336221384, + "loss": 1.4951, + "step": 1987 + }, + { + "epoch": 0.07119451358175014, + "grad_norm": 1.6745867729187012, + "learning_rate": 0.00019911169169614117, + "loss": 1.7767, + "step": 1988 + }, + { + "epoch": 0.07123032571131842, + "grad_norm": 1.856895923614502, + "learning_rate": 0.00019911014843672394, + "loss": 1.5433, + "step": 1989 + }, + { + "epoch": 0.07126613784088671, + "grad_norm": 1.378844976425171, + "learning_rate": 0.0001991086038439075, + "loss": 1.7412, + "step": 1990 + }, + { + "epoch": 0.071301949970455, + "grad_norm": 1.3555171489715576, + "learning_rate": 0.00019910705791771263, + "loss": 1.5704, + "step": 1991 + }, + { + "epoch": 0.07133776210002328, + "grad_norm": 1.2136765718460083, + "learning_rate": 0.00019910551065816017, + "loss": 1.6872, + "step": 1992 + }, + { + "epoch": 0.07137357422959156, + "grad_norm": 2.4193828105926514, + "learning_rate": 0.0001991039620652709, + "loss": 1.693, + "step": 1993 + }, + { + "epoch": 0.07140938635915985, + "grad_norm": 1.5262309312820435, + "learning_rate": 0.00019910241213906565, + "loss": 1.6757, + "step": 1994 + }, + { + "epoch": 0.07144519848872813, + "grad_norm": 1.5123639106750488, + "learning_rate": 0.00019910086087956527, + "loss": 1.556, + "step": 1995 + }, + { + "epoch": 0.07148101061829641, + "grad_norm": 1.8455959558486938, + "learning_rate": 0.00019909930828679063, + "loss": 1.3509, + "step": 1996 + }, + { + "epoch": 0.07151682274786471, + "grad_norm": 1.946921706199646, + "learning_rate": 0.0001990977543607626, + "loss": 1.7793, + "step": 1997 + }, + { + "epoch": 0.07155263487743299, + "grad_norm": 2.3060195446014404, + "learning_rate": 0.00019909619910150216, + "loss": 1.8093, + "step": 1998 + }, + { + "epoch": 0.07158844700700127, + "grad_norm": 1.4833565950393677, + "learning_rate": 0.0001990946425090302, + "loss": 1.5529, + "step": 1999 + }, + { + "epoch": 0.07162425913656956, + "grad_norm": 1.3298144340515137, + "learning_rate": 0.00019909308458336759, + "loss": 1.7373, + "step": 2000 + }, + { + "epoch": 0.07166007126613784, + "grad_norm": 1.6872481107711792, + "learning_rate": 0.0001990915253245354, + "loss": 1.7242, + "step": 2001 + }, + { + "epoch": 0.07169588339570612, + "grad_norm": 1.7926667928695679, + "learning_rate": 0.0001990899647325545, + "loss": 1.6452, + "step": 2002 + }, + { + "epoch": 0.0717316955252744, + "grad_norm": 1.7860251665115356, + "learning_rate": 0.000199088402807446, + "loss": 1.6085, + "step": 2003 + }, + { + "epoch": 0.0717675076548427, + "grad_norm": 2.7218711376190186, + "learning_rate": 0.00019908683954923082, + "loss": 1.9528, + "step": 2004 + }, + { + "epoch": 0.07180331978441098, + "grad_norm": 2.7485177516937256, + "learning_rate": 0.00019908527495793004, + "loss": 1.5438, + "step": 2005 + }, + { + "epoch": 0.07183913191397927, + "grad_norm": 2.6853740215301514, + "learning_rate": 0.00019908370903356468, + "loss": 1.7663, + "step": 2006 + }, + { + "epoch": 0.07187494404354755, + "grad_norm": 1.6789944171905518, + "learning_rate": 0.00019908214177615584, + "loss": 1.919, + "step": 2007 + }, + { + "epoch": 0.07191075617311583, + "grad_norm": 2.4104981422424316, + "learning_rate": 0.00019908057318572458, + "loss": 1.3113, + "step": 2008 + }, + { + "epoch": 0.07194656830268412, + "grad_norm": 1.2793664932250977, + "learning_rate": 0.000199079003262292, + "loss": 1.4809, + "step": 2009 + }, + { + "epoch": 0.0719823804322524, + "grad_norm": 1.5413146018981934, + "learning_rate": 0.00019907743200587926, + "loss": 1.5148, + "step": 2010 + }, + { + "epoch": 0.07201819256182068, + "grad_norm": 1.1401458978652954, + "learning_rate": 0.00019907585941650747, + "loss": 1.733, + "step": 2011 + }, + { + "epoch": 0.07205400469138898, + "grad_norm": 1.4969557523727417, + "learning_rate": 0.00019907428549419777, + "loss": 1.7206, + "step": 2012 + }, + { + "epoch": 0.07208981682095726, + "grad_norm": 1.5214531421661377, + "learning_rate": 0.00019907271023897138, + "loss": 1.8043, + "step": 2013 + }, + { + "epoch": 0.07212562895052554, + "grad_norm": 1.6244412660598755, + "learning_rate": 0.00019907113365084947, + "loss": 1.7689, + "step": 2014 + }, + { + "epoch": 0.07216144108009383, + "grad_norm": 1.6461654901504517, + "learning_rate": 0.0001990695557298532, + "loss": 1.5748, + "step": 2015 + }, + { + "epoch": 0.07219725320966211, + "grad_norm": 1.6046760082244873, + "learning_rate": 0.0001990679764760039, + "loss": 1.8966, + "step": 2016 + }, + { + "epoch": 0.07223306533923039, + "grad_norm": 1.2768783569335938, + "learning_rate": 0.00019906639588932276, + "loss": 1.6619, + "step": 2017 + }, + { + "epoch": 0.07226887746879868, + "grad_norm": 1.3331384658813477, + "learning_rate": 0.00019906481396983103, + "loss": 1.6712, + "step": 2018 + }, + { + "epoch": 0.07230468959836697, + "grad_norm": 1.4289017915725708, + "learning_rate": 0.00019906323071755005, + "loss": 1.7709, + "step": 2019 + }, + { + "epoch": 0.07234050172793526, + "grad_norm": 1.8819347620010376, + "learning_rate": 0.00019906164613250104, + "loss": 1.9157, + "step": 2020 + }, + { + "epoch": 0.07237631385750354, + "grad_norm": 1.380541443824768, + "learning_rate": 0.00019906006021470538, + "loss": 1.5728, + "step": 2021 + }, + { + "epoch": 0.07241212598707182, + "grad_norm": 1.3870627880096436, + "learning_rate": 0.00019905847296418437, + "loss": 1.6477, + "step": 2022 + }, + { + "epoch": 0.0724479381166401, + "grad_norm": 1.8019845485687256, + "learning_rate": 0.0001990568843809594, + "loss": 1.4675, + "step": 2023 + }, + { + "epoch": 0.07248375024620839, + "grad_norm": 1.3413184881210327, + "learning_rate": 0.00019905529446505183, + "loss": 1.588, + "step": 2024 + }, + { + "epoch": 0.07251956237577667, + "grad_norm": 1.5961971282958984, + "learning_rate": 0.00019905370321648302, + "loss": 1.6738, + "step": 2025 + }, + { + "epoch": 0.07255537450534497, + "grad_norm": 2.0965704917907715, + "learning_rate": 0.00019905211063527442, + "loss": 1.5145, + "step": 2026 + }, + { + "epoch": 0.07259118663491325, + "grad_norm": 1.2881782054901123, + "learning_rate": 0.00019905051672144746, + "loss": 1.6991, + "step": 2027 + }, + { + "epoch": 0.07262699876448153, + "grad_norm": 1.2879416942596436, + "learning_rate": 0.00019904892147502352, + "loss": 1.795, + "step": 2028 + }, + { + "epoch": 0.07266281089404981, + "grad_norm": 1.2281581163406372, + "learning_rate": 0.00019904732489602417, + "loss": 1.5426, + "step": 2029 + }, + { + "epoch": 0.0726986230236181, + "grad_norm": 1.7174360752105713, + "learning_rate": 0.00019904572698447077, + "loss": 1.6864, + "step": 2030 + }, + { + "epoch": 0.07273443515318638, + "grad_norm": 1.7847421169281006, + "learning_rate": 0.00019904412774038487, + "loss": 1.6732, + "step": 2031 + }, + { + "epoch": 0.07277024728275466, + "grad_norm": 1.5283077955245972, + "learning_rate": 0.000199042527163788, + "loss": 1.7374, + "step": 2032 + }, + { + "epoch": 0.07280605941232296, + "grad_norm": 1.7698839902877808, + "learning_rate": 0.0001990409252547017, + "loss": 1.8254, + "step": 2033 + }, + { + "epoch": 0.07284187154189124, + "grad_norm": 1.3195661306381226, + "learning_rate": 0.0001990393220131475, + "loss": 1.8756, + "step": 2034 + }, + { + "epoch": 0.07287768367145953, + "grad_norm": 1.8096301555633545, + "learning_rate": 0.00019903771743914696, + "loss": 1.4076, + "step": 2035 + }, + { + "epoch": 0.07291349580102781, + "grad_norm": 1.5130648612976074, + "learning_rate": 0.00019903611153272168, + "loss": 1.2862, + "step": 2036 + }, + { + "epoch": 0.07294930793059609, + "grad_norm": 2.1466357707977295, + "learning_rate": 0.0001990345042938933, + "loss": 1.6471, + "step": 2037 + }, + { + "epoch": 0.07298512006016437, + "grad_norm": 1.3187168836593628, + "learning_rate": 0.00019903289572268336, + "loss": 1.6238, + "step": 2038 + }, + { + "epoch": 0.07302093218973266, + "grad_norm": 1.909706473350525, + "learning_rate": 0.0001990312858191136, + "loss": 1.6599, + "step": 2039 + }, + { + "epoch": 0.07305674431930095, + "grad_norm": 1.4692286252975464, + "learning_rate": 0.0001990296745832056, + "loss": 1.7824, + "step": 2040 + }, + { + "epoch": 0.07309255644886924, + "grad_norm": 1.2459222078323364, + "learning_rate": 0.00019902806201498106, + "loss": 1.7793, + "step": 2041 + }, + { + "epoch": 0.07312836857843752, + "grad_norm": 1.560295581817627, + "learning_rate": 0.0001990264481144617, + "loss": 1.7668, + "step": 2042 + }, + { + "epoch": 0.0731641807080058, + "grad_norm": 1.6913843154907227, + "learning_rate": 0.00019902483288166922, + "loss": 1.6637, + "step": 2043 + }, + { + "epoch": 0.07319999283757408, + "grad_norm": 1.3807941675186157, + "learning_rate": 0.00019902321631662533, + "loss": 1.5641, + "step": 2044 + }, + { + "epoch": 0.07323580496714237, + "grad_norm": 2.5262813568115234, + "learning_rate": 0.0001990215984193518, + "loss": 1.4331, + "step": 2045 + }, + { + "epoch": 0.07327161709671065, + "grad_norm": 1.294885277748108, + "learning_rate": 0.00019901997918987042, + "loss": 1.7085, + "step": 2046 + }, + { + "epoch": 0.07330742922627895, + "grad_norm": 1.254361867904663, + "learning_rate": 0.0001990183586282029, + "loss": 1.5558, + "step": 2047 + }, + { + "epoch": 0.07334324135584723, + "grad_norm": 1.6477164030075073, + "learning_rate": 0.00019901673673437112, + "loss": 1.6353, + "step": 2048 + }, + { + "epoch": 0.07337905348541551, + "grad_norm": 2.1215851306915283, + "learning_rate": 0.00019901511350839686, + "loss": 2.1082, + "step": 2049 + }, + { + "epoch": 0.0734148656149838, + "grad_norm": 1.359099268913269, + "learning_rate": 0.00019901348895030196, + "loss": 1.8811, + "step": 2050 + }, + { + "epoch": 0.07345067774455208, + "grad_norm": 1.2427027225494385, + "learning_rate": 0.0001990118630601083, + "loss": 1.7966, + "step": 2051 + }, + { + "epoch": 0.07348648987412036, + "grad_norm": 2.6582934856414795, + "learning_rate": 0.00019901023583783776, + "loss": 1.7436, + "step": 2052 + }, + { + "epoch": 0.07352230200368864, + "grad_norm": 2.8324716091156006, + "learning_rate": 0.00019900860728351216, + "loss": 1.4201, + "step": 2053 + }, + { + "epoch": 0.07355811413325694, + "grad_norm": 1.3952288627624512, + "learning_rate": 0.00019900697739715347, + "loss": 1.7557, + "step": 2054 + }, + { + "epoch": 0.07359392626282522, + "grad_norm": 1.437296748161316, + "learning_rate": 0.00019900534617878365, + "loss": 1.7427, + "step": 2055 + }, + { + "epoch": 0.0736297383923935, + "grad_norm": 2.397494077682495, + "learning_rate": 0.0001990037136284246, + "loss": 1.6845, + "step": 2056 + }, + { + "epoch": 0.07366555052196179, + "grad_norm": 1.7484171390533447, + "learning_rate": 0.00019900207974609822, + "loss": 1.5456, + "step": 2057 + }, + { + "epoch": 0.07370136265153007, + "grad_norm": 1.8446656465530396, + "learning_rate": 0.00019900044453182662, + "loss": 1.7223, + "step": 2058 + }, + { + "epoch": 0.07373717478109836, + "grad_norm": 1.7288898229599, + "learning_rate": 0.00019899880798563172, + "loss": 1.6006, + "step": 2059 + }, + { + "epoch": 0.07377298691066664, + "grad_norm": 2.0306966304779053, + "learning_rate": 0.00019899717010753558, + "loss": 1.6236, + "step": 2060 + }, + { + "epoch": 0.07380879904023492, + "grad_norm": 1.7749613523483276, + "learning_rate": 0.0001989955308975602, + "loss": 1.7825, + "step": 2061 + }, + { + "epoch": 0.07384461116980322, + "grad_norm": 1.8691487312316895, + "learning_rate": 0.00019899389035572763, + "loss": 1.8369, + "step": 2062 + }, + { + "epoch": 0.0738804232993715, + "grad_norm": 1.8923448324203491, + "learning_rate": 0.00019899224848205998, + "loss": 1.7227, + "step": 2063 + }, + { + "epoch": 0.07391623542893978, + "grad_norm": 1.278397798538208, + "learning_rate": 0.0001989906052765793, + "loss": 1.5775, + "step": 2064 + }, + { + "epoch": 0.07395204755850807, + "grad_norm": 1.4953033924102783, + "learning_rate": 0.00019898896073930776, + "loss": 1.7734, + "step": 2065 + }, + { + "epoch": 0.07398785968807635, + "grad_norm": 1.5151599645614624, + "learning_rate": 0.00019898731487026742, + "loss": 1.6955, + "step": 2066 + }, + { + "epoch": 0.07402367181764463, + "grad_norm": 1.816370964050293, + "learning_rate": 0.00019898566766948038, + "loss": 1.5642, + "step": 2067 + }, + { + "epoch": 0.07405948394721291, + "grad_norm": 1.5039783716201782, + "learning_rate": 0.00019898401913696892, + "loss": 1.492, + "step": 2068 + }, + { + "epoch": 0.07409529607678121, + "grad_norm": 1.5567251443862915, + "learning_rate": 0.00019898236927275517, + "loss": 1.8026, + "step": 2069 + }, + { + "epoch": 0.0741311082063495, + "grad_norm": 2.611471176147461, + "learning_rate": 0.0001989807180768613, + "loss": 1.8471, + "step": 2070 + }, + { + "epoch": 0.07416692033591778, + "grad_norm": 1.6151888370513916, + "learning_rate": 0.00019897906554930956, + "loss": 1.6592, + "step": 2071 + }, + { + "epoch": 0.07420273246548606, + "grad_norm": 2.0822031497955322, + "learning_rate": 0.00019897741169012213, + "loss": 1.8254, + "step": 2072 + }, + { + "epoch": 0.07423854459505434, + "grad_norm": 1.3963028192520142, + "learning_rate": 0.00019897575649932135, + "loss": 1.6788, + "step": 2073 + }, + { + "epoch": 0.07427435672462263, + "grad_norm": 1.506298542022705, + "learning_rate": 0.0001989740999769294, + "loss": 1.6061, + "step": 2074 + }, + { + "epoch": 0.07431016885419091, + "grad_norm": 1.5757018327713013, + "learning_rate": 0.0001989724421229686, + "loss": 1.8792, + "step": 2075 + }, + { + "epoch": 0.0743459809837592, + "grad_norm": 1.4515763521194458, + "learning_rate": 0.00019897078293746128, + "loss": 1.7142, + "step": 2076 + }, + { + "epoch": 0.07438179311332749, + "grad_norm": 2.8991751670837402, + "learning_rate": 0.0001989691224204297, + "loss": 1.757, + "step": 2077 + }, + { + "epoch": 0.07441760524289577, + "grad_norm": 1.243881344795227, + "learning_rate": 0.0001989674605718963, + "loss": 1.6967, + "step": 2078 + }, + { + "epoch": 0.07445341737246405, + "grad_norm": 2.1475493907928467, + "learning_rate": 0.00019896579739188335, + "loss": 1.7954, + "step": 2079 + }, + { + "epoch": 0.07448922950203234, + "grad_norm": 1.5451580286026, + "learning_rate": 0.00019896413288041323, + "loss": 1.6241, + "step": 2080 + }, + { + "epoch": 0.07452504163160062, + "grad_norm": 1.7768465280532837, + "learning_rate": 0.00019896246703750837, + "loss": 1.6177, + "step": 2081 + }, + { + "epoch": 0.0745608537611689, + "grad_norm": 1.5759029388427734, + "learning_rate": 0.00019896079986319118, + "loss": 1.8915, + "step": 2082 + }, + { + "epoch": 0.0745966658907372, + "grad_norm": 1.9887797832489014, + "learning_rate": 0.00019895913135748407, + "loss": 1.6981, + "step": 2083 + }, + { + "epoch": 0.07463247802030548, + "grad_norm": 1.390030860900879, + "learning_rate": 0.0001989574615204095, + "loss": 1.5878, + "step": 2084 + }, + { + "epoch": 0.07466829014987376, + "grad_norm": 1.6922458410263062, + "learning_rate": 0.0001989557903519899, + "loss": 1.4414, + "step": 2085 + }, + { + "epoch": 0.07470410227944205, + "grad_norm": 2.327239990234375, + "learning_rate": 0.0001989541178522478, + "loss": 1.753, + "step": 2086 + }, + { + "epoch": 0.07473991440901033, + "grad_norm": 1.5192580223083496, + "learning_rate": 0.0001989524440212057, + "loss": 1.5827, + "step": 2087 + }, + { + "epoch": 0.07477572653857861, + "grad_norm": 1.4516956806182861, + "learning_rate": 0.00019895076885888613, + "loss": 1.4345, + "step": 2088 + }, + { + "epoch": 0.0748115386681469, + "grad_norm": 1.7795008420944214, + "learning_rate": 0.00019894909236531158, + "loss": 1.3601, + "step": 2089 + }, + { + "epoch": 0.07484735079771519, + "grad_norm": 1.4862284660339355, + "learning_rate": 0.0001989474145405046, + "loss": 1.6291, + "step": 2090 + }, + { + "epoch": 0.07488316292728348, + "grad_norm": 1.9060381650924683, + "learning_rate": 0.00019894573538448783, + "loss": 1.9217, + "step": 2091 + }, + { + "epoch": 0.07491897505685176, + "grad_norm": 1.4660017490386963, + "learning_rate": 0.0001989440548972838, + "loss": 1.5406, + "step": 2092 + }, + { + "epoch": 0.07495478718642004, + "grad_norm": 1.3798494338989258, + "learning_rate": 0.0001989423730789151, + "loss": 1.4285, + "step": 2093 + }, + { + "epoch": 0.07499059931598832, + "grad_norm": 1.282052755355835, + "learning_rate": 0.00019894068992940448, + "loss": 1.7026, + "step": 2094 + }, + { + "epoch": 0.0750264114455566, + "grad_norm": 3.1386232376098633, + "learning_rate": 0.00019893900544877443, + "loss": 1.7159, + "step": 2095 + }, + { + "epoch": 0.07506222357512489, + "grad_norm": 1.8487104177474976, + "learning_rate": 0.00019893731963704773, + "loss": 1.4776, + "step": 2096 + }, + { + "epoch": 0.07509803570469319, + "grad_norm": 1.7013107538223267, + "learning_rate": 0.000198935632494247, + "loss": 1.9181, + "step": 2097 + }, + { + "epoch": 0.07513384783426147, + "grad_norm": 1.594666600227356, + "learning_rate": 0.00019893394402039496, + "loss": 1.7788, + "step": 2098 + }, + { + "epoch": 0.07516965996382975, + "grad_norm": 1.4459965229034424, + "learning_rate": 0.00019893225421551428, + "loss": 1.6597, + "step": 2099 + }, + { + "epoch": 0.07520547209339803, + "grad_norm": 2.2312240600585938, + "learning_rate": 0.0001989305630796278, + "loss": 1.5934, + "step": 2100 + }, + { + "epoch": 0.07524128422296632, + "grad_norm": 1.4842166900634766, + "learning_rate": 0.00019892887061275815, + "loss": 1.7209, + "step": 2101 + }, + { + "epoch": 0.0752770963525346, + "grad_norm": 2.3655948638916016, + "learning_rate": 0.00019892717681492815, + "loss": 1.5201, + "step": 2102 + }, + { + "epoch": 0.07531290848210288, + "grad_norm": 1.5361484289169312, + "learning_rate": 0.00019892548168616063, + "loss": 1.6127, + "step": 2103 + }, + { + "epoch": 0.07534872061167118, + "grad_norm": 2.2546679973602295, + "learning_rate": 0.00019892378522647834, + "loss": 1.3852, + "step": 2104 + }, + { + "epoch": 0.07538453274123946, + "grad_norm": 1.4017812013626099, + "learning_rate": 0.00019892208743590412, + "loss": 1.4618, + "step": 2105 + }, + { + "epoch": 0.07542034487080775, + "grad_norm": 1.399861216545105, + "learning_rate": 0.00019892038831446085, + "loss": 1.2668, + "step": 2106 + }, + { + "epoch": 0.07545615700037603, + "grad_norm": 1.8196576833724976, + "learning_rate": 0.0001989186878621713, + "loss": 1.7222, + "step": 2107 + }, + { + "epoch": 0.07549196912994431, + "grad_norm": 1.6640617847442627, + "learning_rate": 0.00019891698607905843, + "loss": 1.5111, + "step": 2108 + }, + { + "epoch": 0.0755277812595126, + "grad_norm": 1.4460179805755615, + "learning_rate": 0.0001989152829651451, + "loss": 1.7697, + "step": 2109 + }, + { + "epoch": 0.07556359338908088, + "grad_norm": 2.077554941177368, + "learning_rate": 0.00019891357852045422, + "loss": 1.5551, + "step": 2110 + }, + { + "epoch": 0.07559940551864916, + "grad_norm": 2.658271312713623, + "learning_rate": 0.00019891187274500874, + "loss": 1.9367, + "step": 2111 + }, + { + "epoch": 0.07563521764821746, + "grad_norm": 1.6757729053497314, + "learning_rate": 0.0001989101656388316, + "loss": 1.5789, + "step": 2112 + }, + { + "epoch": 0.07567102977778574, + "grad_norm": 1.1914464235305786, + "learning_rate": 0.00019890845720194576, + "loss": 1.6118, + "step": 2113 + }, + { + "epoch": 0.07570684190735402, + "grad_norm": 1.547780990600586, + "learning_rate": 0.00019890674743437424, + "loss": 1.8614, + "step": 2114 + }, + { + "epoch": 0.0757426540369223, + "grad_norm": 2.2313425540924072, + "learning_rate": 0.00019890503633614, + "loss": 1.5434, + "step": 2115 + }, + { + "epoch": 0.07577846616649059, + "grad_norm": 1.3298251628875732, + "learning_rate": 0.00019890332390726606, + "loss": 1.6958, + "step": 2116 + }, + { + "epoch": 0.07581427829605887, + "grad_norm": 1.4307975769042969, + "learning_rate": 0.00019890161014777546, + "loss": 1.4825, + "step": 2117 + }, + { + "epoch": 0.07585009042562715, + "grad_norm": 3.3594889640808105, + "learning_rate": 0.0001988998950576913, + "loss": 1.695, + "step": 2118 + }, + { + "epoch": 0.07588590255519545, + "grad_norm": 1.8578377962112427, + "learning_rate": 0.00019889817863703662, + "loss": 1.7816, + "step": 2119 + }, + { + "epoch": 0.07592171468476373, + "grad_norm": 1.5581252574920654, + "learning_rate": 0.0001988964608858345, + "loss": 1.889, + "step": 2120 + }, + { + "epoch": 0.07595752681433202, + "grad_norm": 1.5691393613815308, + "learning_rate": 0.00019889474180410805, + "loss": 1.5541, + "step": 2121 + }, + { + "epoch": 0.0759933389439003, + "grad_norm": 1.5680208206176758, + "learning_rate": 0.00019889302139188044, + "loss": 1.5807, + "step": 2122 + }, + { + "epoch": 0.07602915107346858, + "grad_norm": 1.4914289712905884, + "learning_rate": 0.00019889129964917478, + "loss": 1.8856, + "step": 2123 + }, + { + "epoch": 0.07606496320303686, + "grad_norm": 1.8300025463104248, + "learning_rate": 0.00019888957657601425, + "loss": 1.6327, + "step": 2124 + }, + { + "epoch": 0.07610077533260515, + "grad_norm": 1.6856653690338135, + "learning_rate": 0.00019888785217242206, + "loss": 1.5229, + "step": 2125 + }, + { + "epoch": 0.07613658746217344, + "grad_norm": 1.4205312728881836, + "learning_rate": 0.00019888612643842132, + "loss": 1.5932, + "step": 2126 + }, + { + "epoch": 0.07617239959174173, + "grad_norm": 1.4903684854507446, + "learning_rate": 0.00019888439937403534, + "loss": 1.7948, + "step": 2127 + }, + { + "epoch": 0.07620821172131001, + "grad_norm": 1.770424485206604, + "learning_rate": 0.0001988826709792873, + "loss": 1.8025, + "step": 2128 + }, + { + "epoch": 0.07624402385087829, + "grad_norm": 1.9164286851882935, + "learning_rate": 0.00019888094125420044, + "loss": 1.7622, + "step": 2129 + }, + { + "epoch": 0.07627983598044658, + "grad_norm": 2.1902430057525635, + "learning_rate": 0.00019887921019879812, + "loss": 1.94, + "step": 2130 + }, + { + "epoch": 0.07631564811001486, + "grad_norm": 1.942354679107666, + "learning_rate": 0.00019887747781310356, + "loss": 1.6829, + "step": 2131 + }, + { + "epoch": 0.07635146023958314, + "grad_norm": 2.437299966812134, + "learning_rate": 0.00019887574409714005, + "loss": 1.8157, + "step": 2132 + }, + { + "epoch": 0.07638727236915144, + "grad_norm": 1.7092030048370361, + "learning_rate": 0.00019887400905093096, + "loss": 1.8297, + "step": 2133 + }, + { + "epoch": 0.07642308449871972, + "grad_norm": 1.8825984001159668, + "learning_rate": 0.00019887227267449963, + "loss": 1.739, + "step": 2134 + }, + { + "epoch": 0.076458896628288, + "grad_norm": 1.9578901529312134, + "learning_rate": 0.00019887053496786937, + "loss": 1.772, + "step": 2135 + }, + { + "epoch": 0.07649470875785629, + "grad_norm": 0.9991260170936584, + "learning_rate": 0.00019886879593106365, + "loss": 1.3756, + "step": 2136 + }, + { + "epoch": 0.07653052088742457, + "grad_norm": 1.4741476774215698, + "learning_rate": 0.00019886705556410576, + "loss": 1.4068, + "step": 2137 + }, + { + "epoch": 0.07656633301699285, + "grad_norm": 1.7536135911941528, + "learning_rate": 0.0001988653138670192, + "loss": 1.7613, + "step": 2138 + }, + { + "epoch": 0.07660214514656113, + "grad_norm": 1.7891243696212769, + "learning_rate": 0.00019886357083982734, + "loss": 1.5153, + "step": 2139 + }, + { + "epoch": 0.07663795727612943, + "grad_norm": 1.2617411613464355, + "learning_rate": 0.0001988618264825537, + "loss": 1.597, + "step": 2140 + }, + { + "epoch": 0.07667376940569771, + "grad_norm": 1.4487390518188477, + "learning_rate": 0.00019886008079522167, + "loss": 1.6301, + "step": 2141 + }, + { + "epoch": 0.076709581535266, + "grad_norm": 1.4209492206573486, + "learning_rate": 0.0001988583337778548, + "loss": 1.6636, + "step": 2142 + }, + { + "epoch": 0.07674539366483428, + "grad_norm": 1.6687378883361816, + "learning_rate": 0.00019885658543047655, + "loss": 1.7059, + "step": 2143 + }, + { + "epoch": 0.07678120579440256, + "grad_norm": 1.7952252626419067, + "learning_rate": 0.00019885483575311045, + "loss": 2.0421, + "step": 2144 + }, + { + "epoch": 0.07681701792397085, + "grad_norm": 1.4075217247009277, + "learning_rate": 0.00019885308474578008, + "loss": 1.5278, + "step": 2145 + }, + { + "epoch": 0.07685283005353913, + "grad_norm": 1.9520444869995117, + "learning_rate": 0.00019885133240850892, + "loss": 1.9848, + "step": 2146 + }, + { + "epoch": 0.07688864218310743, + "grad_norm": 2.3137176036834717, + "learning_rate": 0.00019884957874132065, + "loss": 2.0316, + "step": 2147 + }, + { + "epoch": 0.07692445431267571, + "grad_norm": 1.3930728435516357, + "learning_rate": 0.00019884782374423877, + "loss": 1.6199, + "step": 2148 + }, + { + "epoch": 0.07696026644224399, + "grad_norm": 1.7720856666564941, + "learning_rate": 0.00019884606741728692, + "loss": 1.4512, + "step": 2149 + }, + { + "epoch": 0.07699607857181227, + "grad_norm": 1.3794426918029785, + "learning_rate": 0.00019884430976048877, + "loss": 1.8775, + "step": 2150 + }, + { + "epoch": 0.07703189070138056, + "grad_norm": 1.3538085222244263, + "learning_rate": 0.00019884255077386788, + "loss": 1.4677, + "step": 2151 + }, + { + "epoch": 0.07706770283094884, + "grad_norm": 1.767635703086853, + "learning_rate": 0.000198840790457448, + "loss": 1.4348, + "step": 2152 + }, + { + "epoch": 0.07710351496051712, + "grad_norm": 1.4579626321792603, + "learning_rate": 0.00019883902881125278, + "loss": 1.756, + "step": 2153 + }, + { + "epoch": 0.0771393270900854, + "grad_norm": 1.3255012035369873, + "learning_rate": 0.00019883726583530594, + "loss": 1.6512, + "step": 2154 + }, + { + "epoch": 0.0771751392196537, + "grad_norm": 1.5879746675491333, + "learning_rate": 0.00019883550152963113, + "loss": 1.6947, + "step": 2155 + }, + { + "epoch": 0.07721095134922198, + "grad_norm": 1.8822317123413086, + "learning_rate": 0.00019883373589425215, + "loss": 1.702, + "step": 2156 + }, + { + "epoch": 0.07724676347879027, + "grad_norm": 2.246804714202881, + "learning_rate": 0.00019883196892919275, + "loss": 1.6123, + "step": 2157 + }, + { + "epoch": 0.07728257560835855, + "grad_norm": 1.7975350618362427, + "learning_rate": 0.00019883020063447672, + "loss": 1.6843, + "step": 2158 + }, + { + "epoch": 0.07731838773792683, + "grad_norm": 1.6588022708892822, + "learning_rate": 0.00019882843101012778, + "loss": 1.7215, + "step": 2159 + }, + { + "epoch": 0.07735419986749512, + "grad_norm": 1.990808367729187, + "learning_rate": 0.00019882666005616978, + "loss": 1.729, + "step": 2160 + }, + { + "epoch": 0.0773900119970634, + "grad_norm": 2.0240907669067383, + "learning_rate": 0.00019882488777262655, + "loss": 1.299, + "step": 2161 + }, + { + "epoch": 0.0774258241266317, + "grad_norm": 1.273319959640503, + "learning_rate": 0.00019882311415952194, + "loss": 1.2768, + "step": 2162 + }, + { + "epoch": 0.07746163625619998, + "grad_norm": 3.0127789974212646, + "learning_rate": 0.00019882133921687983, + "loss": 1.7046, + "step": 2163 + }, + { + "epoch": 0.07749744838576826, + "grad_norm": 1.886521577835083, + "learning_rate": 0.00019881956294472405, + "loss": 1.62, + "step": 2164 + }, + { + "epoch": 0.07753326051533654, + "grad_norm": 1.6154439449310303, + "learning_rate": 0.00019881778534307852, + "loss": 1.5143, + "step": 2165 + }, + { + "epoch": 0.07756907264490483, + "grad_norm": 2.255967617034912, + "learning_rate": 0.0001988160064119671, + "loss": 1.6454, + "step": 2166 + }, + { + "epoch": 0.07760488477447311, + "grad_norm": 2.1320412158966064, + "learning_rate": 0.00019881422615141385, + "loss": 1.6159, + "step": 2167 + }, + { + "epoch": 0.07764069690404139, + "grad_norm": 1.293569803237915, + "learning_rate": 0.00019881244456144262, + "loss": 1.7265, + "step": 2168 + }, + { + "epoch": 0.07767650903360969, + "grad_norm": 1.4700331687927246, + "learning_rate": 0.00019881066164207742, + "loss": 1.6755, + "step": 2169 + }, + { + "epoch": 0.07771232116317797, + "grad_norm": 2.1993863582611084, + "learning_rate": 0.0001988088773933422, + "loss": 1.7127, + "step": 2170 + }, + { + "epoch": 0.07774813329274625, + "grad_norm": 2.8095591068267822, + "learning_rate": 0.000198807091815261, + "loss": 1.6606, + "step": 2171 + }, + { + "epoch": 0.07778394542231454, + "grad_norm": 1.5221842527389526, + "learning_rate": 0.00019880530490785784, + "loss": 1.543, + "step": 2172 + }, + { + "epoch": 0.07781975755188282, + "grad_norm": 1.5565546751022339, + "learning_rate": 0.00019880351667115673, + "loss": 1.3438, + "step": 2173 + }, + { + "epoch": 0.0778555696814511, + "grad_norm": 1.7649588584899902, + "learning_rate": 0.00019880172710518178, + "loss": 1.7558, + "step": 2174 + }, + { + "epoch": 0.07789138181101939, + "grad_norm": 2.050035238265991, + "learning_rate": 0.00019879993620995702, + "loss": 1.6594, + "step": 2175 + }, + { + "epoch": 0.07792719394058768, + "grad_norm": 1.595670461654663, + "learning_rate": 0.00019879814398550657, + "loss": 2.0138, + "step": 2176 + }, + { + "epoch": 0.07796300607015597, + "grad_norm": 1.8371297121047974, + "learning_rate": 0.00019879635043185454, + "loss": 1.6345, + "step": 2177 + }, + { + "epoch": 0.07799881819972425, + "grad_norm": 1.5137767791748047, + "learning_rate": 0.00019879455554902502, + "loss": 1.8678, + "step": 2178 + }, + { + "epoch": 0.07803463032929253, + "grad_norm": 1.4583414793014526, + "learning_rate": 0.00019879275933704224, + "loss": 1.7769, + "step": 2179 + }, + { + "epoch": 0.07807044245886081, + "grad_norm": 1.8222814798355103, + "learning_rate": 0.00019879096179593027, + "loss": 1.7373, + "step": 2180 + }, + { + "epoch": 0.0781062545884291, + "grad_norm": 1.8903967142105103, + "learning_rate": 0.00019878916292571334, + "loss": 2.003, + "step": 2181 + }, + { + "epoch": 0.07814206671799738, + "grad_norm": 2.2535934448242188, + "learning_rate": 0.00019878736272641568, + "loss": 1.661, + "step": 2182 + }, + { + "epoch": 0.07817787884756568, + "grad_norm": 1.6508339643478394, + "learning_rate": 0.00019878556119806148, + "loss": 1.3687, + "step": 2183 + }, + { + "epoch": 0.07821369097713396, + "grad_norm": 1.6056736707687378, + "learning_rate": 0.00019878375834067496, + "loss": 1.3639, + "step": 2184 + }, + { + "epoch": 0.07824950310670224, + "grad_norm": 1.481017827987671, + "learning_rate": 0.0001987819541542804, + "loss": 1.9444, + "step": 2185 + }, + { + "epoch": 0.07828531523627053, + "grad_norm": 1.7288168668746948, + "learning_rate": 0.0001987801486389021, + "loss": 1.9123, + "step": 2186 + }, + { + "epoch": 0.07832112736583881, + "grad_norm": 2.189270257949829, + "learning_rate": 0.00019877834179456424, + "loss": 1.5186, + "step": 2187 + }, + { + "epoch": 0.07835693949540709, + "grad_norm": 1.4086220264434814, + "learning_rate": 0.00019877653362129126, + "loss": 1.5263, + "step": 2188 + }, + { + "epoch": 0.07839275162497537, + "grad_norm": 1.423419713973999, + "learning_rate": 0.00019877472411910745, + "loss": 1.6879, + "step": 2189 + }, + { + "epoch": 0.07842856375454367, + "grad_norm": 1.8120468854904175, + "learning_rate": 0.0001987729132880371, + "loss": 1.725, + "step": 2190 + }, + { + "epoch": 0.07846437588411195, + "grad_norm": 1.2922435998916626, + "learning_rate": 0.00019877110112810463, + "loss": 1.5935, + "step": 2191 + }, + { + "epoch": 0.07850018801368024, + "grad_norm": 1.484522819519043, + "learning_rate": 0.00019876928763933437, + "loss": 1.5979, + "step": 2192 + }, + { + "epoch": 0.07853600014324852, + "grad_norm": 1.5501383543014526, + "learning_rate": 0.00019876747282175078, + "loss": 1.6291, + "step": 2193 + }, + { + "epoch": 0.0785718122728168, + "grad_norm": 1.7386257648468018, + "learning_rate": 0.00019876565667537824, + "loss": 1.5312, + "step": 2194 + }, + { + "epoch": 0.07860762440238508, + "grad_norm": 1.6912481784820557, + "learning_rate": 0.00019876383920024117, + "loss": 1.5414, + "step": 2195 + }, + { + "epoch": 0.07864343653195337, + "grad_norm": 1.5818910598754883, + "learning_rate": 0.00019876202039636405, + "loss": 1.8032, + "step": 2196 + }, + { + "epoch": 0.07867924866152166, + "grad_norm": 1.9245262145996094, + "learning_rate": 0.00019876020026377136, + "loss": 1.3893, + "step": 2197 + }, + { + "epoch": 0.07871506079108995, + "grad_norm": 1.422369360923767, + "learning_rate": 0.00019875837880248756, + "loss": 1.6805, + "step": 2198 + }, + { + "epoch": 0.07875087292065823, + "grad_norm": 1.8367788791656494, + "learning_rate": 0.00019875655601253714, + "loss": 1.7323, + "step": 2199 + }, + { + "epoch": 0.07878668505022651, + "grad_norm": 1.3831416368484497, + "learning_rate": 0.00019875473189394463, + "loss": 1.5551, + "step": 2200 + }, + { + "epoch": 0.0788224971797948, + "grad_norm": 1.5825157165527344, + "learning_rate": 0.00019875290644673463, + "loss": 1.7588, + "step": 2201 + }, + { + "epoch": 0.07885830930936308, + "grad_norm": 1.0356022119522095, + "learning_rate": 0.00019875107967093163, + "loss": 1.7062, + "step": 2202 + }, + { + "epoch": 0.07889412143893136, + "grad_norm": 1.5128309726715088, + "learning_rate": 0.00019874925156656024, + "loss": 1.6296, + "step": 2203 + }, + { + "epoch": 0.07892993356849964, + "grad_norm": 2.1393330097198486, + "learning_rate": 0.00019874742213364506, + "loss": 1.8722, + "step": 2204 + }, + { + "epoch": 0.07896574569806794, + "grad_norm": 1.531294345855713, + "learning_rate": 0.00019874559137221068, + "loss": 1.5297, + "step": 2205 + }, + { + "epoch": 0.07900155782763622, + "grad_norm": 1.653439998626709, + "learning_rate": 0.00019874375928228175, + "loss": 1.5084, + "step": 2206 + }, + { + "epoch": 0.0790373699572045, + "grad_norm": 1.6008787155151367, + "learning_rate": 0.00019874192586388288, + "loss": 1.4493, + "step": 2207 + }, + { + "epoch": 0.07907318208677279, + "grad_norm": 1.8827781677246094, + "learning_rate": 0.00019874009111703878, + "loss": 1.6156, + "step": 2208 + }, + { + "epoch": 0.07910899421634107, + "grad_norm": 1.7337536811828613, + "learning_rate": 0.00019873825504177414, + "loss": 1.7076, + "step": 2209 + }, + { + "epoch": 0.07914480634590935, + "grad_norm": 1.7455438375473022, + "learning_rate": 0.0001987364176381136, + "loss": 1.4984, + "step": 2210 + }, + { + "epoch": 0.07918061847547764, + "grad_norm": 1.7682832479476929, + "learning_rate": 0.00019873457890608198, + "loss": 1.7267, + "step": 2211 + }, + { + "epoch": 0.07921643060504593, + "grad_norm": 1.8477669954299927, + "learning_rate": 0.0001987327388457039, + "loss": 1.8183, + "step": 2212 + }, + { + "epoch": 0.07925224273461422, + "grad_norm": 2.0341742038726807, + "learning_rate": 0.0001987308974570042, + "loss": 1.5857, + "step": 2213 + }, + { + "epoch": 0.0792880548641825, + "grad_norm": 1.6047203540802002, + "learning_rate": 0.0001987290547400076, + "loss": 1.4304, + "step": 2214 + }, + { + "epoch": 0.07932386699375078, + "grad_norm": 1.6200206279754639, + "learning_rate": 0.000198727210694739, + "loss": 1.3838, + "step": 2215 + }, + { + "epoch": 0.07935967912331907, + "grad_norm": 1.4143503904342651, + "learning_rate": 0.00019872536532122305, + "loss": 1.4999, + "step": 2216 + }, + { + "epoch": 0.07939549125288735, + "grad_norm": 1.7010365724563599, + "learning_rate": 0.0001987235186194847, + "loss": 1.9153, + "step": 2217 + }, + { + "epoch": 0.07943130338245563, + "grad_norm": 1.9490126371383667, + "learning_rate": 0.00019872167058954874, + "loss": 1.5034, + "step": 2218 + }, + { + "epoch": 0.07946711551202393, + "grad_norm": 1.496788740158081, + "learning_rate": 0.00019871982123144004, + "loss": 1.7765, + "step": 2219 + }, + { + "epoch": 0.07950292764159221, + "grad_norm": 1.3413259983062744, + "learning_rate": 0.00019871797054518347, + "loss": 1.6507, + "step": 2220 + }, + { + "epoch": 0.0795387397711605, + "grad_norm": 2.239360809326172, + "learning_rate": 0.00019871611853080397, + "loss": 1.8348, + "step": 2221 + }, + { + "epoch": 0.07957455190072878, + "grad_norm": 1.3430578708648682, + "learning_rate": 0.00019871426518832644, + "loss": 1.7134, + "step": 2222 + }, + { + "epoch": 0.07961036403029706, + "grad_norm": 1.7762681245803833, + "learning_rate": 0.00019871241051777576, + "loss": 1.8258, + "step": 2223 + }, + { + "epoch": 0.07964617615986534, + "grad_norm": 3.437670946121216, + "learning_rate": 0.00019871055451917694, + "loss": 1.598, + "step": 2224 + }, + { + "epoch": 0.07968198828943363, + "grad_norm": 1.7100651264190674, + "learning_rate": 0.00019870869719255496, + "loss": 1.5688, + "step": 2225 + }, + { + "epoch": 0.07971780041900192, + "grad_norm": 1.7335412502288818, + "learning_rate": 0.00019870683853793474, + "loss": 1.4256, + "step": 2226 + }, + { + "epoch": 0.0797536125485702, + "grad_norm": 1.4160618782043457, + "learning_rate": 0.00019870497855534137, + "loss": 1.6183, + "step": 2227 + }, + { + "epoch": 0.07978942467813849, + "grad_norm": 1.4244014024734497, + "learning_rate": 0.00019870311724479983, + "loss": 1.7856, + "step": 2228 + }, + { + "epoch": 0.07982523680770677, + "grad_norm": 1.4283264875411987, + "learning_rate": 0.00019870125460633514, + "loss": 1.4689, + "step": 2229 + }, + { + "epoch": 0.07986104893727505, + "grad_norm": 1.4369860887527466, + "learning_rate": 0.00019869939063997243, + "loss": 1.6801, + "step": 2230 + }, + { + "epoch": 0.07989686106684334, + "grad_norm": 1.8066489696502686, + "learning_rate": 0.00019869752534573668, + "loss": 1.4535, + "step": 2231 + }, + { + "epoch": 0.07993267319641162, + "grad_norm": 2.8576557636260986, + "learning_rate": 0.00019869565872365308, + "loss": 1.7293, + "step": 2232 + }, + { + "epoch": 0.07996848532597992, + "grad_norm": 1.3145085573196411, + "learning_rate": 0.00019869379077374667, + "loss": 1.7171, + "step": 2233 + }, + { + "epoch": 0.0800042974555482, + "grad_norm": 2.2998170852661133, + "learning_rate": 0.00019869192149604264, + "loss": 1.6707, + "step": 2234 + }, + { + "epoch": 0.08004010958511648, + "grad_norm": 1.755245566368103, + "learning_rate": 0.0001986900508905661, + "loss": 1.5189, + "step": 2235 + }, + { + "epoch": 0.08007592171468476, + "grad_norm": 1.8190783262252808, + "learning_rate": 0.00019868817895734222, + "loss": 1.7181, + "step": 2236 + }, + { + "epoch": 0.08011173384425305, + "grad_norm": 1.505210280418396, + "learning_rate": 0.00019868630569639618, + "loss": 1.5393, + "step": 2237 + }, + { + "epoch": 0.08014754597382133, + "grad_norm": 2.068084478378296, + "learning_rate": 0.0001986844311077532, + "loss": 1.5744, + "step": 2238 + }, + { + "epoch": 0.08018335810338961, + "grad_norm": 1.9969322681427002, + "learning_rate": 0.0001986825551914385, + "loss": 1.507, + "step": 2239 + }, + { + "epoch": 0.08021917023295791, + "grad_norm": 3.0546181201934814, + "learning_rate": 0.00019868067794747728, + "loss": 1.8212, + "step": 2240 + }, + { + "epoch": 0.08025498236252619, + "grad_norm": 1.3269965648651123, + "learning_rate": 0.00019867879937589486, + "loss": 1.6769, + "step": 2241 + }, + { + "epoch": 0.08029079449209447, + "grad_norm": 1.6832436323165894, + "learning_rate": 0.0001986769194767165, + "loss": 1.7213, + "step": 2242 + }, + { + "epoch": 0.08032660662166276, + "grad_norm": 1.4924203157424927, + "learning_rate": 0.00019867503824996745, + "loss": 1.6567, + "step": 2243 + }, + { + "epoch": 0.08036241875123104, + "grad_norm": 1.7548980712890625, + "learning_rate": 0.00019867315569567303, + "loss": 1.571, + "step": 2244 + }, + { + "epoch": 0.08039823088079932, + "grad_norm": 2.1904239654541016, + "learning_rate": 0.0001986712718138586, + "loss": 1.6707, + "step": 2245 + }, + { + "epoch": 0.0804340430103676, + "grad_norm": 1.3834635019302368, + "learning_rate": 0.00019866938660454949, + "loss": 1.5114, + "step": 2246 + }, + { + "epoch": 0.0804698551399359, + "grad_norm": 1.6187025308609009, + "learning_rate": 0.00019866750006777102, + "loss": 1.6336, + "step": 2247 + }, + { + "epoch": 0.08050566726950419, + "grad_norm": 1.5296133756637573, + "learning_rate": 0.00019866561220354862, + "loss": 1.6381, + "step": 2248 + }, + { + "epoch": 0.08054147939907247, + "grad_norm": 1.5018010139465332, + "learning_rate": 0.0001986637230119077, + "loss": 1.5588, + "step": 2249 + }, + { + "epoch": 0.08057729152864075, + "grad_norm": 1.3861911296844482, + "learning_rate": 0.00019866183249287364, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 0.08061310365820903, + "grad_norm": 1.2317131757736206, + "learning_rate": 0.00019865994064647188, + "loss": 1.4665, + "step": 2251 + }, + { + "epoch": 0.08064891578777732, + "grad_norm": 1.849687933921814, + "learning_rate": 0.0001986580474727279, + "loss": 1.5341, + "step": 2252 + }, + { + "epoch": 0.0806847279173456, + "grad_norm": 1.3923594951629639, + "learning_rate": 0.00019865615297166714, + "loss": 1.7859, + "step": 2253 + }, + { + "epoch": 0.08072054004691388, + "grad_norm": 1.4301036596298218, + "learning_rate": 0.0001986542571433151, + "loss": 1.8605, + "step": 2254 + }, + { + "epoch": 0.08075635217648218, + "grad_norm": 1.6380246877670288, + "learning_rate": 0.00019865235998769727, + "loss": 1.7913, + "step": 2255 + }, + { + "epoch": 0.08079216430605046, + "grad_norm": 2.1478631496429443, + "learning_rate": 0.0001986504615048392, + "loss": 1.5993, + "step": 2256 + }, + { + "epoch": 0.08082797643561875, + "grad_norm": 1.3344511985778809, + "learning_rate": 0.0001986485616947664, + "loss": 1.718, + "step": 2257 + }, + { + "epoch": 0.08086378856518703, + "grad_norm": 1.8746141195297241, + "learning_rate": 0.00019864666055750452, + "loss": 1.6871, + "step": 2258 + }, + { + "epoch": 0.08089960069475531, + "grad_norm": 1.6850662231445312, + "learning_rate": 0.000198644758093079, + "loss": 1.8947, + "step": 2259 + }, + { + "epoch": 0.0809354128243236, + "grad_norm": 1.7207536697387695, + "learning_rate": 0.00019864285430151553, + "loss": 1.6498, + "step": 2260 + }, + { + "epoch": 0.08097122495389188, + "grad_norm": 1.387789011001587, + "learning_rate": 0.00019864094918283968, + "loss": 1.5868, + "step": 2261 + }, + { + "epoch": 0.08100703708346017, + "grad_norm": 1.4519275426864624, + "learning_rate": 0.0001986390427370771, + "loss": 1.4775, + "step": 2262 + }, + { + "epoch": 0.08104284921302846, + "grad_norm": 1.7079511880874634, + "learning_rate": 0.00019863713496425347, + "loss": 1.4262, + "step": 2263 + }, + { + "epoch": 0.08107866134259674, + "grad_norm": 2.308537483215332, + "learning_rate": 0.0001986352258643944, + "loss": 1.5502, + "step": 2264 + }, + { + "epoch": 0.08111447347216502, + "grad_norm": 1.9541075229644775, + "learning_rate": 0.00019863331543752558, + "loss": 1.6206, + "step": 2265 + }, + { + "epoch": 0.0811502856017333, + "grad_norm": 1.6608632802963257, + "learning_rate": 0.00019863140368367273, + "loss": 1.5943, + "step": 2266 + }, + { + "epoch": 0.08118609773130159, + "grad_norm": 1.6765403747558594, + "learning_rate": 0.00019862949060286158, + "loss": 1.852, + "step": 2267 + }, + { + "epoch": 0.08122190986086987, + "grad_norm": 1.4967211484909058, + "learning_rate": 0.00019862757619511784, + "loss": 1.4903, + "step": 2268 + }, + { + "epoch": 0.08125772199043817, + "grad_norm": 1.465468168258667, + "learning_rate": 0.0001986256604604673, + "loss": 1.5695, + "step": 2269 + }, + { + "epoch": 0.08129353412000645, + "grad_norm": 1.5892409086227417, + "learning_rate": 0.0001986237433989357, + "loss": 1.6337, + "step": 2270 + }, + { + "epoch": 0.08132934624957473, + "grad_norm": 1.3970975875854492, + "learning_rate": 0.0001986218250105489, + "loss": 1.6544, + "step": 2271 + }, + { + "epoch": 0.08136515837914302, + "grad_norm": 2.048312187194824, + "learning_rate": 0.0001986199052953326, + "loss": 1.4003, + "step": 2272 + }, + { + "epoch": 0.0814009705087113, + "grad_norm": 1.864230751991272, + "learning_rate": 0.0001986179842533127, + "loss": 1.4919, + "step": 2273 + }, + { + "epoch": 0.08143678263827958, + "grad_norm": 1.4341579675674438, + "learning_rate": 0.00019861606188451502, + "loss": 1.5127, + "step": 2274 + }, + { + "epoch": 0.08147259476784786, + "grad_norm": 1.5005865097045898, + "learning_rate": 0.00019861413818896546, + "loss": 1.6661, + "step": 2275 + }, + { + "epoch": 0.08150840689741616, + "grad_norm": 1.1538370847702026, + "learning_rate": 0.00019861221316668984, + "loss": 1.4567, + "step": 2276 + }, + { + "epoch": 0.08154421902698444, + "grad_norm": 1.613469123840332, + "learning_rate": 0.0001986102868177141, + "loss": 1.7542, + "step": 2277 + }, + { + "epoch": 0.08158003115655273, + "grad_norm": 1.6482155323028564, + "learning_rate": 0.0001986083591420642, + "loss": 1.8616, + "step": 2278 + }, + { + "epoch": 0.08161584328612101, + "grad_norm": 1.7545753717422485, + "learning_rate": 0.00019860643013976597, + "loss": 1.4438, + "step": 2279 + }, + { + "epoch": 0.08165165541568929, + "grad_norm": 2.3131508827209473, + "learning_rate": 0.00019860449981084545, + "loss": 1.6012, + "step": 2280 + }, + { + "epoch": 0.08168746754525757, + "grad_norm": 1.8130494356155396, + "learning_rate": 0.00019860256815532854, + "loss": 1.2831, + "step": 2281 + }, + { + "epoch": 0.08172327967482586, + "grad_norm": 1.7892794609069824, + "learning_rate": 0.0001986006351732413, + "loss": 1.5343, + "step": 2282 + }, + { + "epoch": 0.08175909180439415, + "grad_norm": 1.544555902481079, + "learning_rate": 0.00019859870086460965, + "loss": 1.708, + "step": 2283 + }, + { + "epoch": 0.08179490393396244, + "grad_norm": 2.1215953826904297, + "learning_rate": 0.0001985967652294597, + "loss": 1.7509, + "step": 2284 + }, + { + "epoch": 0.08183071606353072, + "grad_norm": 1.4176714420318604, + "learning_rate": 0.00019859482826781744, + "loss": 1.5661, + "step": 2285 + }, + { + "epoch": 0.081866528193099, + "grad_norm": 1.7044744491577148, + "learning_rate": 0.00019859288997970895, + "loss": 1.7327, + "step": 2286 + }, + { + "epoch": 0.08190234032266729, + "grad_norm": 1.9832544326782227, + "learning_rate": 0.0001985909503651603, + "loss": 1.4708, + "step": 2287 + }, + { + "epoch": 0.08193815245223557, + "grad_norm": 1.8185745477676392, + "learning_rate": 0.0001985890094241976, + "loss": 1.7814, + "step": 2288 + }, + { + "epoch": 0.08197396458180385, + "grad_norm": 1.557418942451477, + "learning_rate": 0.0001985870671568469, + "loss": 1.7562, + "step": 2289 + }, + { + "epoch": 0.08200977671137215, + "grad_norm": 1.7640334367752075, + "learning_rate": 0.00019858512356313445, + "loss": 1.4121, + "step": 2290 + }, + { + "epoch": 0.08204558884094043, + "grad_norm": 1.6936959028244019, + "learning_rate": 0.00019858317864308628, + "loss": 1.6866, + "step": 2291 + }, + { + "epoch": 0.08208140097050871, + "grad_norm": 1.587742805480957, + "learning_rate": 0.0001985812323967286, + "loss": 1.6117, + "step": 2292 + }, + { + "epoch": 0.082117213100077, + "grad_norm": 1.4579888582229614, + "learning_rate": 0.00019857928482408763, + "loss": 1.7537, + "step": 2293 + }, + { + "epoch": 0.08215302522964528, + "grad_norm": 1.2377456426620483, + "learning_rate": 0.00019857733592518954, + "loss": 1.2892, + "step": 2294 + }, + { + "epoch": 0.08218883735921356, + "grad_norm": 2.1315197944641113, + "learning_rate": 0.00019857538570006053, + "loss": 1.7453, + "step": 2295 + }, + { + "epoch": 0.08222464948878185, + "grad_norm": 2.444917678833008, + "learning_rate": 0.00019857343414872685, + "loss": 1.8851, + "step": 2296 + }, + { + "epoch": 0.08226046161835014, + "grad_norm": 2.752296209335327, + "learning_rate": 0.0001985714812712148, + "loss": 1.7393, + "step": 2297 + }, + { + "epoch": 0.08229627374791842, + "grad_norm": 1.5612289905548096, + "learning_rate": 0.0001985695270675506, + "loss": 1.5019, + "step": 2298 + }, + { + "epoch": 0.08233208587748671, + "grad_norm": 1.6323474645614624, + "learning_rate": 0.00019856757153776058, + "loss": 1.6256, + "step": 2299 + }, + { + "epoch": 0.08236789800705499, + "grad_norm": 2.15275239944458, + "learning_rate": 0.000198565614681871, + "loss": 1.4934, + "step": 2300 + }, + { + "epoch": 0.08240371013662327, + "grad_norm": 2.2830469608306885, + "learning_rate": 0.0001985636564999082, + "loss": 1.8445, + "step": 2301 + }, + { + "epoch": 0.08243952226619156, + "grad_norm": 1.283078670501709, + "learning_rate": 0.00019856169699189856, + "loss": 1.6053, + "step": 2302 + }, + { + "epoch": 0.08247533439575984, + "grad_norm": 2.0136337280273438, + "learning_rate": 0.00019855973615786842, + "loss": 1.7864, + "step": 2303 + }, + { + "epoch": 0.08251114652532812, + "grad_norm": 1.7052009105682373, + "learning_rate": 0.0001985577739978442, + "loss": 1.2617, + "step": 2304 + }, + { + "epoch": 0.08254695865489642, + "grad_norm": 1.6163235902786255, + "learning_rate": 0.0001985558105118522, + "loss": 1.575, + "step": 2305 + }, + { + "epoch": 0.0825827707844647, + "grad_norm": 1.2028448581695557, + "learning_rate": 0.00019855384569991892, + "loss": 1.575, + "step": 2306 + }, + { + "epoch": 0.08261858291403298, + "grad_norm": 1.1882579326629639, + "learning_rate": 0.0001985518795620708, + "loss": 1.7627, + "step": 2307 + }, + { + "epoch": 0.08265439504360127, + "grad_norm": 1.8147140741348267, + "learning_rate": 0.0001985499120983342, + "loss": 1.8266, + "step": 2308 + }, + { + "epoch": 0.08269020717316955, + "grad_norm": 1.386354684829712, + "learning_rate": 0.00019854794330873568, + "loss": 1.6571, + "step": 2309 + }, + { + "epoch": 0.08272601930273783, + "grad_norm": 2.043935775756836, + "learning_rate": 0.00019854597319330175, + "loss": 1.6551, + "step": 2310 + }, + { + "epoch": 0.08276183143230612, + "grad_norm": 1.749604344367981, + "learning_rate": 0.00019854400175205883, + "loss": 1.7481, + "step": 2311 + }, + { + "epoch": 0.08279764356187441, + "grad_norm": 1.7542747259140015, + "learning_rate": 0.00019854202898503346, + "loss": 1.4622, + "step": 2312 + }, + { + "epoch": 0.0828334556914427, + "grad_norm": 1.2417621612548828, + "learning_rate": 0.00019854005489225224, + "loss": 1.7303, + "step": 2313 + }, + { + "epoch": 0.08286926782101098, + "grad_norm": 1.5347504615783691, + "learning_rate": 0.00019853807947374166, + "loss": 1.866, + "step": 2314 + }, + { + "epoch": 0.08290507995057926, + "grad_norm": 1.5232512950897217, + "learning_rate": 0.0001985361027295283, + "loss": 1.5288, + "step": 2315 + }, + { + "epoch": 0.08294089208014754, + "grad_norm": 1.9294697046279907, + "learning_rate": 0.00019853412465963883, + "loss": 1.5636, + "step": 2316 + }, + { + "epoch": 0.08297670420971583, + "grad_norm": 1.68217933177948, + "learning_rate": 0.0001985321452640998, + "loss": 1.547, + "step": 2317 + }, + { + "epoch": 0.08301251633928411, + "grad_norm": 2.1691644191741943, + "learning_rate": 0.00019853016454293785, + "loss": 1.6431, + "step": 2318 + }, + { + "epoch": 0.0830483284688524, + "grad_norm": 1.4858300685882568, + "learning_rate": 0.00019852818249617963, + "loss": 1.7581, + "step": 2319 + }, + { + "epoch": 0.08308414059842069, + "grad_norm": 1.6669294834136963, + "learning_rate": 0.0001985261991238518, + "loss": 1.4139, + "step": 2320 + }, + { + "epoch": 0.08311995272798897, + "grad_norm": 1.9712706804275513, + "learning_rate": 0.00019852421442598107, + "loss": 1.7628, + "step": 2321 + }, + { + "epoch": 0.08315576485755725, + "grad_norm": 2.3515987396240234, + "learning_rate": 0.0001985222284025941, + "loss": 1.9166, + "step": 2322 + }, + { + "epoch": 0.08319157698712554, + "grad_norm": 1.5739303827285767, + "learning_rate": 0.00019852024105371764, + "loss": 1.7737, + "step": 2323 + }, + { + "epoch": 0.08322738911669382, + "grad_norm": 1.4964985847473145, + "learning_rate": 0.0001985182523793784, + "loss": 1.3749, + "step": 2324 + }, + { + "epoch": 0.0832632012462621, + "grad_norm": 1.9431986808776855, + "learning_rate": 0.00019851626237960316, + "loss": 1.7314, + "step": 2325 + }, + { + "epoch": 0.0832990133758304, + "grad_norm": 1.4260855913162231, + "learning_rate": 0.00019851427105441874, + "loss": 1.6923, + "step": 2326 + }, + { + "epoch": 0.08333482550539868, + "grad_norm": 2.3210411071777344, + "learning_rate": 0.00019851227840385184, + "loss": 1.6412, + "step": 2327 + }, + { + "epoch": 0.08337063763496697, + "grad_norm": 1.6222409009933472, + "learning_rate": 0.00019851028442792928, + "loss": 1.7586, + "step": 2328 + }, + { + "epoch": 0.08340644976453525, + "grad_norm": 1.9236634969711304, + "learning_rate": 0.00019850828912667794, + "loss": 1.8013, + "step": 2329 + }, + { + "epoch": 0.08344226189410353, + "grad_norm": 1.6394051313400269, + "learning_rate": 0.0001985062925001246, + "loss": 1.5605, + "step": 2330 + }, + { + "epoch": 0.08347807402367181, + "grad_norm": 1.769962191581726, + "learning_rate": 0.0001985042945482962, + "loss": 1.6226, + "step": 2331 + }, + { + "epoch": 0.0835138861532401, + "grad_norm": 1.7239434719085693, + "learning_rate": 0.00019850229527121956, + "loss": 1.5937, + "step": 2332 + }, + { + "epoch": 0.0835496982828084, + "grad_norm": 1.6355654001235962, + "learning_rate": 0.00019850029466892161, + "loss": 1.7279, + "step": 2333 + }, + { + "epoch": 0.08358551041237668, + "grad_norm": 2.545929193496704, + "learning_rate": 0.00019849829274142924, + "loss": 1.472, + "step": 2334 + }, + { + "epoch": 0.08362132254194496, + "grad_norm": 2.478335380554199, + "learning_rate": 0.00019849628948876943, + "loss": 2.0368, + "step": 2335 + }, + { + "epoch": 0.08365713467151324, + "grad_norm": 1.5632565021514893, + "learning_rate": 0.00019849428491096904, + "loss": 1.5616, + "step": 2336 + }, + { + "epoch": 0.08369294680108152, + "grad_norm": 1.9258129596710205, + "learning_rate": 0.0001984922790080551, + "loss": 1.7881, + "step": 2337 + }, + { + "epoch": 0.08372875893064981, + "grad_norm": 1.5572803020477295, + "learning_rate": 0.0001984902717800546, + "loss": 1.4918, + "step": 2338 + }, + { + "epoch": 0.08376457106021809, + "grad_norm": 1.6713584661483765, + "learning_rate": 0.00019848826322699456, + "loss": 1.5777, + "step": 2339 + }, + { + "epoch": 0.08380038318978639, + "grad_norm": 1.7208898067474365, + "learning_rate": 0.000198486253348902, + "loss": 1.4879, + "step": 2340 + }, + { + "epoch": 0.08383619531935467, + "grad_norm": 3.210822343826294, + "learning_rate": 0.0001984842421458039, + "loss": 1.8145, + "step": 2341 + }, + { + "epoch": 0.08387200744892295, + "grad_norm": 1.435042142868042, + "learning_rate": 0.00019848222961772733, + "loss": 1.7716, + "step": 2342 + }, + { + "epoch": 0.08390781957849124, + "grad_norm": 1.7752784490585327, + "learning_rate": 0.00019848021576469944, + "loss": 1.6036, + "step": 2343 + }, + { + "epoch": 0.08394363170805952, + "grad_norm": 1.3135857582092285, + "learning_rate": 0.00019847820058674728, + "loss": 1.6017, + "step": 2344 + }, + { + "epoch": 0.0839794438376278, + "grad_norm": 2.173564910888672, + "learning_rate": 0.00019847618408389792, + "loss": 1.7796, + "step": 2345 + }, + { + "epoch": 0.08401525596719608, + "grad_norm": 2.061150312423706, + "learning_rate": 0.00019847416625617855, + "loss": 1.7466, + "step": 2346 + }, + { + "epoch": 0.08405106809676438, + "grad_norm": 1.2300703525543213, + "learning_rate": 0.0001984721471036163, + "loss": 1.6624, + "step": 2347 + }, + { + "epoch": 0.08408688022633266, + "grad_norm": 2.010895252227783, + "learning_rate": 0.00019847012662623832, + "loss": 1.5831, + "step": 2348 + }, + { + "epoch": 0.08412269235590095, + "grad_norm": 1.3102781772613525, + "learning_rate": 0.00019846810482407182, + "loss": 1.7545, + "step": 2349 + }, + { + "epoch": 0.08415850448546923, + "grad_norm": 1.5911515951156616, + "learning_rate": 0.00019846608169714398, + "loss": 1.3566, + "step": 2350 + }, + { + "epoch": 0.08419431661503751, + "grad_norm": 2.5576908588409424, + "learning_rate": 0.00019846405724548204, + "loss": 1.5922, + "step": 2351 + }, + { + "epoch": 0.0842301287446058, + "grad_norm": 1.8098796606063843, + "learning_rate": 0.00019846203146911318, + "loss": 1.8392, + "step": 2352 + }, + { + "epoch": 0.08426594087417408, + "grad_norm": 1.828391432762146, + "learning_rate": 0.00019846000436806471, + "loss": 1.9675, + "step": 2353 + }, + { + "epoch": 0.08430175300374236, + "grad_norm": 1.516196846961975, + "learning_rate": 0.00019845797594236387, + "loss": 1.596, + "step": 2354 + }, + { + "epoch": 0.08433756513331066, + "grad_norm": 1.5106902122497559, + "learning_rate": 0.00019845594619203797, + "loss": 1.6667, + "step": 2355 + }, + { + "epoch": 0.08437337726287894, + "grad_norm": 1.6493037939071655, + "learning_rate": 0.00019845391511711435, + "loss": 1.5757, + "step": 2356 + }, + { + "epoch": 0.08440918939244722, + "grad_norm": 1.7102857828140259, + "learning_rate": 0.00019845188271762029, + "loss": 1.6952, + "step": 2357 + }, + { + "epoch": 0.0844450015220155, + "grad_norm": 1.6236246824264526, + "learning_rate": 0.0001984498489935831, + "loss": 1.542, + "step": 2358 + }, + { + "epoch": 0.08448081365158379, + "grad_norm": 1.3867934942245483, + "learning_rate": 0.00019844781394503022, + "loss": 1.5048, + "step": 2359 + }, + { + "epoch": 0.08451662578115207, + "grad_norm": 2.9754772186279297, + "learning_rate": 0.00019844577757198898, + "loss": 1.6699, + "step": 2360 + }, + { + "epoch": 0.08455243791072035, + "grad_norm": 1.5601086616516113, + "learning_rate": 0.00019844373987448676, + "loss": 1.5336, + "step": 2361 + }, + { + "epoch": 0.08458825004028865, + "grad_norm": 1.989410638809204, + "learning_rate": 0.00019844170085255104, + "loss": 1.5408, + "step": 2362 + }, + { + "epoch": 0.08462406216985693, + "grad_norm": 1.0998789072036743, + "learning_rate": 0.0001984396605062092, + "loss": 1.5359, + "step": 2363 + }, + { + "epoch": 0.08465987429942522, + "grad_norm": 2.2412939071655273, + "learning_rate": 0.00019843761883548872, + "loss": 1.448, + "step": 2364 + }, + { + "epoch": 0.0846956864289935, + "grad_norm": 1.2157700061798096, + "learning_rate": 0.00019843557584041705, + "loss": 1.4621, + "step": 2365 + }, + { + "epoch": 0.08473149855856178, + "grad_norm": 1.5647035837173462, + "learning_rate": 0.0001984335315210217, + "loss": 1.8045, + "step": 2366 + }, + { + "epoch": 0.08476731068813007, + "grad_norm": 1.3524651527404785, + "learning_rate": 0.00019843148587733012, + "loss": 1.778, + "step": 2367 + }, + { + "epoch": 0.08480312281769835, + "grad_norm": 1.6187690496444702, + "learning_rate": 0.00019842943890936986, + "loss": 1.6222, + "step": 2368 + }, + { + "epoch": 0.08483893494726664, + "grad_norm": 3.2864222526550293, + "learning_rate": 0.00019842739061716848, + "loss": 1.8601, + "step": 2369 + }, + { + "epoch": 0.08487474707683493, + "grad_norm": 1.786603569984436, + "learning_rate": 0.00019842534100075355, + "loss": 1.9872, + "step": 2370 + }, + { + "epoch": 0.08491055920640321, + "grad_norm": 1.6398714780807495, + "learning_rate": 0.00019842329006015255, + "loss": 1.5213, + "step": 2371 + }, + { + "epoch": 0.0849463713359715, + "grad_norm": 1.576285719871521, + "learning_rate": 0.0001984212377953932, + "loss": 1.9445, + "step": 2372 + }, + { + "epoch": 0.08498218346553978, + "grad_norm": 1.1794153451919556, + "learning_rate": 0.00019841918420650302, + "loss": 1.6204, + "step": 2373 + }, + { + "epoch": 0.08501799559510806, + "grad_norm": 1.7451518774032593, + "learning_rate": 0.00019841712929350965, + "loss": 1.7095, + "step": 2374 + }, + { + "epoch": 0.08505380772467634, + "grad_norm": 1.3258010149002075, + "learning_rate": 0.0001984150730564408, + "loss": 1.8024, + "step": 2375 + }, + { + "epoch": 0.08508961985424464, + "grad_norm": 1.6353540420532227, + "learning_rate": 0.00019841301549532409, + "loss": 1.7716, + "step": 2376 + }, + { + "epoch": 0.08512543198381292, + "grad_norm": 1.306315302848816, + "learning_rate": 0.00019841095661018716, + "loss": 1.6493, + "step": 2377 + }, + { + "epoch": 0.0851612441133812, + "grad_norm": 1.7025679349899292, + "learning_rate": 0.00019840889640105775, + "loss": 1.8987, + "step": 2378 + }, + { + "epoch": 0.08519705624294949, + "grad_norm": 2.471372127532959, + "learning_rate": 0.00019840683486796362, + "loss": 1.5562, + "step": 2379 + }, + { + "epoch": 0.08523286837251777, + "grad_norm": 1.2974969148635864, + "learning_rate": 0.00019840477201093243, + "loss": 1.6721, + "step": 2380 + }, + { + "epoch": 0.08526868050208605, + "grad_norm": 1.7710561752319336, + "learning_rate": 0.00019840270782999197, + "loss": 1.6326, + "step": 2381 + }, + { + "epoch": 0.08530449263165434, + "grad_norm": 1.3455687761306763, + "learning_rate": 0.00019840064232517, + "loss": 1.7118, + "step": 2382 + }, + { + "epoch": 0.08534030476122263, + "grad_norm": 1.3794631958007812, + "learning_rate": 0.0001983985754964943, + "loss": 1.5416, + "step": 2383 + }, + { + "epoch": 0.08537611689079092, + "grad_norm": 1.446144938468933, + "learning_rate": 0.00019839650734399276, + "loss": 1.7194, + "step": 2384 + }, + { + "epoch": 0.0854119290203592, + "grad_norm": 1.3965164422988892, + "learning_rate": 0.0001983944378676931, + "loss": 1.5514, + "step": 2385 + }, + { + "epoch": 0.08544774114992748, + "grad_norm": 1.8332926034927368, + "learning_rate": 0.00019839236706762318, + "loss": 1.9941, + "step": 2386 + }, + { + "epoch": 0.08548355327949576, + "grad_norm": 2.237839460372925, + "learning_rate": 0.00019839029494381086, + "loss": 1.5284, + "step": 2387 + }, + { + "epoch": 0.08551936540906405, + "grad_norm": 1.652097463607788, + "learning_rate": 0.0001983882214962841, + "loss": 1.8933, + "step": 2388 + }, + { + "epoch": 0.08555517753863233, + "grad_norm": 1.1951125860214233, + "learning_rate": 0.00019838614672507067, + "loss": 1.5353, + "step": 2389 + }, + { + "epoch": 0.08559098966820063, + "grad_norm": 1.575891375541687, + "learning_rate": 0.00019838407063019857, + "loss": 1.9357, + "step": 2390 + }, + { + "epoch": 0.08562680179776891, + "grad_norm": 1.398511290550232, + "learning_rate": 0.0001983819932116957, + "loss": 1.6936, + "step": 2391 + }, + { + "epoch": 0.08566261392733719, + "grad_norm": 1.8677663803100586, + "learning_rate": 0.00019837991446959005, + "loss": 1.8502, + "step": 2392 + }, + { + "epoch": 0.08569842605690547, + "grad_norm": 1.2182732820510864, + "learning_rate": 0.0001983778344039095, + "loss": 1.4945, + "step": 2393 + }, + { + "epoch": 0.08573423818647376, + "grad_norm": 1.4245761632919312, + "learning_rate": 0.00019837575301468211, + "loss": 1.8349, + "step": 2394 + }, + { + "epoch": 0.08577005031604204, + "grad_norm": 1.4133869409561157, + "learning_rate": 0.00019837367030193587, + "loss": 1.7366, + "step": 2395 + }, + { + "epoch": 0.08580586244561032, + "grad_norm": 2.530789852142334, + "learning_rate": 0.00019837158626569878, + "loss": 2.013, + "step": 2396 + }, + { + "epoch": 0.08584167457517862, + "grad_norm": 1.37733793258667, + "learning_rate": 0.0001983695009059989, + "loss": 1.7408, + "step": 2397 + }, + { + "epoch": 0.0858774867047469, + "grad_norm": 1.6104776859283447, + "learning_rate": 0.00019836741422286425, + "loss": 1.5966, + "step": 2398 + }, + { + "epoch": 0.08591329883431519, + "grad_norm": 1.435455322265625, + "learning_rate": 0.00019836532621632293, + "loss": 1.5554, + "step": 2399 + }, + { + "epoch": 0.08594911096388347, + "grad_norm": 1.2741247415542603, + "learning_rate": 0.000198363236886403, + "loss": 1.3838, + "step": 2400 + }, + { + "epoch": 0.08598492309345175, + "grad_norm": 1.459208369255066, + "learning_rate": 0.00019836114623313265, + "loss": 1.6547, + "step": 2401 + }, + { + "epoch": 0.08602073522302003, + "grad_norm": 2.1880502700805664, + "learning_rate": 0.00019835905425653994, + "loss": 1.3995, + "step": 2402 + }, + { + "epoch": 0.08605654735258832, + "grad_norm": 2.445836305618286, + "learning_rate": 0.00019835696095665302, + "loss": 1.9346, + "step": 2403 + }, + { + "epoch": 0.0860923594821566, + "grad_norm": 2.707475185394287, + "learning_rate": 0.00019835486633350006, + "loss": 1.8816, + "step": 2404 + }, + { + "epoch": 0.0861281716117249, + "grad_norm": 1.7417428493499756, + "learning_rate": 0.00019835277038710928, + "loss": 1.5025, + "step": 2405 + }, + { + "epoch": 0.08616398374129318, + "grad_norm": 1.2903729677200317, + "learning_rate": 0.00019835067311750878, + "loss": 1.5742, + "step": 2406 + }, + { + "epoch": 0.08619979587086146, + "grad_norm": 1.8366937637329102, + "learning_rate": 0.00019834857452472686, + "loss": 1.5567, + "step": 2407 + }, + { + "epoch": 0.08623560800042974, + "grad_norm": 1.5683704614639282, + "learning_rate": 0.00019834647460879174, + "loss": 1.3516, + "step": 2408 + }, + { + "epoch": 0.08627142012999803, + "grad_norm": 1.3883376121520996, + "learning_rate": 0.00019834437336973165, + "loss": 1.6229, + "step": 2409 + }, + { + "epoch": 0.08630723225956631, + "grad_norm": 2.0175423622131348, + "learning_rate": 0.00019834227080757488, + "loss": 1.6116, + "step": 2410 + }, + { + "epoch": 0.0863430443891346, + "grad_norm": 1.6487847566604614, + "learning_rate": 0.0001983401669223497, + "loss": 1.674, + "step": 2411 + }, + { + "epoch": 0.08637885651870289, + "grad_norm": 1.7350643873214722, + "learning_rate": 0.00019833806171408442, + "loss": 1.7811, + "step": 2412 + }, + { + "epoch": 0.08641466864827117, + "grad_norm": 1.6598446369171143, + "learning_rate": 0.0001983359551828074, + "loss": 1.6918, + "step": 2413 + }, + { + "epoch": 0.08645048077783946, + "grad_norm": 5.592057228088379, + "learning_rate": 0.0001983338473285469, + "loss": 1.6704, + "step": 2414 + }, + { + "epoch": 0.08648629290740774, + "grad_norm": 1.5842877626419067, + "learning_rate": 0.00019833173815133134, + "loss": 1.4705, + "step": 2415 + }, + { + "epoch": 0.08652210503697602, + "grad_norm": 1.3319464921951294, + "learning_rate": 0.0001983296276511891, + "loss": 1.6647, + "step": 2416 + }, + { + "epoch": 0.0865579171665443, + "grad_norm": 1.8691999912261963, + "learning_rate": 0.00019832751582814855, + "loss": 1.83, + "step": 2417 + }, + { + "epoch": 0.08659372929611259, + "grad_norm": 2.428990125656128, + "learning_rate": 0.0001983254026822381, + "loss": 1.6859, + "step": 2418 + }, + { + "epoch": 0.08662954142568088, + "grad_norm": 2.839390516281128, + "learning_rate": 0.0001983232882134862, + "loss": 1.4562, + "step": 2419 + }, + { + "epoch": 0.08666535355524917, + "grad_norm": 1.6174662113189697, + "learning_rate": 0.00019832117242192128, + "loss": 1.4465, + "step": 2420 + }, + { + "epoch": 0.08670116568481745, + "grad_norm": 1.4822794198989868, + "learning_rate": 0.0001983190553075718, + "loss": 1.4847, + "step": 2421 + }, + { + "epoch": 0.08673697781438573, + "grad_norm": 1.9179503917694092, + "learning_rate": 0.00019831693687046627, + "loss": 1.8159, + "step": 2422 + }, + { + "epoch": 0.08677278994395402, + "grad_norm": 3.1191704273223877, + "learning_rate": 0.00019831481711063314, + "loss": 1.8706, + "step": 2423 + }, + { + "epoch": 0.0868086020735223, + "grad_norm": 1.2060275077819824, + "learning_rate": 0.000198312696028101, + "loss": 1.5291, + "step": 2424 + }, + { + "epoch": 0.08684441420309058, + "grad_norm": 1.2168149948120117, + "learning_rate": 0.00019831057362289833, + "loss": 1.5218, + "step": 2425 + }, + { + "epoch": 0.08688022633265888, + "grad_norm": 2.720611095428467, + "learning_rate": 0.00019830844989505373, + "loss": 1.8275, + "step": 2426 + }, + { + "epoch": 0.08691603846222716, + "grad_norm": 1.599690318107605, + "learning_rate": 0.00019830632484459573, + "loss": 1.4722, + "step": 2427 + }, + { + "epoch": 0.08695185059179544, + "grad_norm": 1.5266869068145752, + "learning_rate": 0.00019830419847155292, + "loss": 1.5618, + "step": 2428 + }, + { + "epoch": 0.08698766272136373, + "grad_norm": 2.5052943229675293, + "learning_rate": 0.00019830207077595392, + "loss": 1.9813, + "step": 2429 + }, + { + "epoch": 0.08702347485093201, + "grad_norm": 2.039900779724121, + "learning_rate": 0.00019829994175782738, + "loss": 1.7817, + "step": 2430 + }, + { + "epoch": 0.08705928698050029, + "grad_norm": 1.7723355293273926, + "learning_rate": 0.0001982978114172019, + "loss": 1.8192, + "step": 2431 + }, + { + "epoch": 0.08709509911006857, + "grad_norm": 1.4781324863433838, + "learning_rate": 0.0001982956797541062, + "loss": 1.5802, + "step": 2432 + }, + { + "epoch": 0.08713091123963687, + "grad_norm": 1.499817967414856, + "learning_rate": 0.0001982935467685689, + "loss": 1.7145, + "step": 2433 + }, + { + "epoch": 0.08716672336920515, + "grad_norm": 1.2955687046051025, + "learning_rate": 0.0001982914124606187, + "loss": 1.4076, + "step": 2434 + }, + { + "epoch": 0.08720253549877344, + "grad_norm": 1.1812690496444702, + "learning_rate": 0.00019828927683028435, + "loss": 1.6311, + "step": 2435 + }, + { + "epoch": 0.08723834762834172, + "grad_norm": 1.2968649864196777, + "learning_rate": 0.00019828713987759454, + "loss": 1.377, + "step": 2436 + }, + { + "epoch": 0.08727415975791, + "grad_norm": 1.2757093906402588, + "learning_rate": 0.00019828500160257807, + "loss": 1.7077, + "step": 2437 + }, + { + "epoch": 0.08730997188747829, + "grad_norm": 1.4437847137451172, + "learning_rate": 0.0001982828620052637, + "loss": 1.4253, + "step": 2438 + }, + { + "epoch": 0.08734578401704657, + "grad_norm": 1.79239821434021, + "learning_rate": 0.00019828072108568016, + "loss": 1.8537, + "step": 2439 + }, + { + "epoch": 0.08738159614661486, + "grad_norm": 1.204079031944275, + "learning_rate": 0.0001982785788438563, + "loss": 1.5567, + "step": 2440 + }, + { + "epoch": 0.08741740827618315, + "grad_norm": 2.0811383724212646, + "learning_rate": 0.00019827643527982095, + "loss": 1.806, + "step": 2441 + }, + { + "epoch": 0.08745322040575143, + "grad_norm": 1.5582301616668701, + "learning_rate": 0.00019827429039360293, + "loss": 1.7261, + "step": 2442 + }, + { + "epoch": 0.08748903253531971, + "grad_norm": 1.1033960580825806, + "learning_rate": 0.00019827214418523107, + "loss": 1.7938, + "step": 2443 + }, + { + "epoch": 0.087524844664888, + "grad_norm": 1.3593032360076904, + "learning_rate": 0.0001982699966547343, + "loss": 1.5597, + "step": 2444 + }, + { + "epoch": 0.08756065679445628, + "grad_norm": 1.574497938156128, + "learning_rate": 0.00019826784780214147, + "loss": 1.3391, + "step": 2445 + }, + { + "epoch": 0.08759646892402456, + "grad_norm": 2.002777338027954, + "learning_rate": 0.0001982656976274815, + "loss": 1.3655, + "step": 2446 + }, + { + "epoch": 0.08763228105359286, + "grad_norm": 2.2492942810058594, + "learning_rate": 0.00019826354613078332, + "loss": 1.948, + "step": 2447 + }, + { + "epoch": 0.08766809318316114, + "grad_norm": 1.3055859804153442, + "learning_rate": 0.0001982613933120759, + "loss": 1.6317, + "step": 2448 + }, + { + "epoch": 0.08770390531272942, + "grad_norm": 2.0596539974212646, + "learning_rate": 0.00019825923917138818, + "loss": 1.8837, + "step": 2449 + }, + { + "epoch": 0.08773971744229771, + "grad_norm": 1.592313289642334, + "learning_rate": 0.0001982570837087491, + "loss": 1.5169, + "step": 2450 + }, + { + "epoch": 0.08777552957186599, + "grad_norm": 1.5785194635391235, + "learning_rate": 0.00019825492692418774, + "loss": 1.5085, + "step": 2451 + }, + { + "epoch": 0.08781134170143427, + "grad_norm": 1.0770231485366821, + "learning_rate": 0.00019825276881773308, + "loss": 1.5045, + "step": 2452 + }, + { + "epoch": 0.08784715383100256, + "grad_norm": 1.7435587644577026, + "learning_rate": 0.00019825060938941414, + "loss": 1.7879, + "step": 2453 + }, + { + "epoch": 0.08788296596057084, + "grad_norm": 1.482975959777832, + "learning_rate": 0.00019824844863925998, + "loss": 1.722, + "step": 2454 + }, + { + "epoch": 0.08791877809013914, + "grad_norm": 1.9586156606674194, + "learning_rate": 0.0001982462865672997, + "loss": 1.6066, + "step": 2455 + }, + { + "epoch": 0.08795459021970742, + "grad_norm": 1.9015264511108398, + "learning_rate": 0.00019824412317356234, + "loss": 1.4479, + "step": 2456 + }, + { + "epoch": 0.0879904023492757, + "grad_norm": 1.943438172340393, + "learning_rate": 0.00019824195845807703, + "loss": 1.5558, + "step": 2457 + }, + { + "epoch": 0.08802621447884398, + "grad_norm": 1.8411654233932495, + "learning_rate": 0.00019823979242087288, + "loss": 1.3188, + "step": 2458 + }, + { + "epoch": 0.08806202660841227, + "grad_norm": 1.542543888092041, + "learning_rate": 0.00019823762506197907, + "loss": 1.7211, + "step": 2459 + }, + { + "epoch": 0.08809783873798055, + "grad_norm": 1.1874290704727173, + "learning_rate": 0.0001982354563814247, + "loss": 1.4066, + "step": 2460 + }, + { + "epoch": 0.08813365086754883, + "grad_norm": 2.709726095199585, + "learning_rate": 0.000198233286379239, + "loss": 1.8428, + "step": 2461 + }, + { + "epoch": 0.08816946299711713, + "grad_norm": 1.5549380779266357, + "learning_rate": 0.00019823111505545114, + "loss": 1.537, + "step": 2462 + }, + { + "epoch": 0.08820527512668541, + "grad_norm": 2.0994925498962402, + "learning_rate": 0.00019822894241009037, + "loss": 1.4763, + "step": 2463 + }, + { + "epoch": 0.0882410872562537, + "grad_norm": 1.4595978260040283, + "learning_rate": 0.00019822676844318582, + "loss": 1.6066, + "step": 2464 + }, + { + "epoch": 0.08827689938582198, + "grad_norm": 1.3861949443817139, + "learning_rate": 0.00019822459315476686, + "loss": 1.5408, + "step": 2465 + }, + { + "epoch": 0.08831271151539026, + "grad_norm": 1.820631980895996, + "learning_rate": 0.00019822241654486266, + "loss": 1.9246, + "step": 2466 + }, + { + "epoch": 0.08834852364495854, + "grad_norm": 2.3096110820770264, + "learning_rate": 0.00019822023861350256, + "loss": 1.8691, + "step": 2467 + }, + { + "epoch": 0.08838433577452683, + "grad_norm": 1.4352091550827026, + "learning_rate": 0.00019821805936071584, + "loss": 1.6238, + "step": 2468 + }, + { + "epoch": 0.08842014790409512, + "grad_norm": 1.4374312162399292, + "learning_rate": 0.00019821587878653184, + "loss": 1.6068, + "step": 2469 + }, + { + "epoch": 0.0884559600336634, + "grad_norm": 1.8493610620498657, + "learning_rate": 0.00019821369689097988, + "loss": 1.6721, + "step": 2470 + }, + { + "epoch": 0.08849177216323169, + "grad_norm": 1.8211166858673096, + "learning_rate": 0.00019821151367408927, + "loss": 1.3961, + "step": 2471 + }, + { + "epoch": 0.08852758429279997, + "grad_norm": 2.59659743309021, + "learning_rate": 0.00019820932913588947, + "loss": 1.8369, + "step": 2472 + }, + { + "epoch": 0.08856339642236825, + "grad_norm": 1.3814586400985718, + "learning_rate": 0.00019820714327640983, + "loss": 1.2698, + "step": 2473 + }, + { + "epoch": 0.08859920855193654, + "grad_norm": 1.4654902219772339, + "learning_rate": 0.00019820495609567976, + "loss": 1.6292, + "step": 2474 + }, + { + "epoch": 0.08863502068150482, + "grad_norm": 1.361952543258667, + "learning_rate": 0.00019820276759372867, + "loss": 1.9109, + "step": 2475 + }, + { + "epoch": 0.08867083281107312, + "grad_norm": 1.8597922325134277, + "learning_rate": 0.00019820057777058598, + "loss": 1.5002, + "step": 2476 + }, + { + "epoch": 0.0887066449406414, + "grad_norm": 1.6137717962265015, + "learning_rate": 0.00019819838662628122, + "loss": 1.6735, + "step": 2477 + }, + { + "epoch": 0.08874245707020968, + "grad_norm": 1.753365397453308, + "learning_rate": 0.00019819619416084385, + "loss": 1.8299, + "step": 2478 + }, + { + "epoch": 0.08877826919977796, + "grad_norm": 1.4413976669311523, + "learning_rate": 0.00019819400037430332, + "loss": 1.5304, + "step": 2479 + }, + { + "epoch": 0.08881408132934625, + "grad_norm": 1.604013442993164, + "learning_rate": 0.0001981918052666892, + "loss": 1.5374, + "step": 2480 + }, + { + "epoch": 0.08884989345891453, + "grad_norm": 2.0614285469055176, + "learning_rate": 0.00019818960883803097, + "loss": 1.5122, + "step": 2481 + }, + { + "epoch": 0.08888570558848281, + "grad_norm": 1.5279723405838013, + "learning_rate": 0.00019818741108835824, + "loss": 1.3099, + "step": 2482 + }, + { + "epoch": 0.08892151771805111, + "grad_norm": 2.621316432952881, + "learning_rate": 0.00019818521201770052, + "loss": 1.8033, + "step": 2483 + }, + { + "epoch": 0.08895732984761939, + "grad_norm": 1.9032632112503052, + "learning_rate": 0.00019818301162608743, + "loss": 1.7269, + "step": 2484 + }, + { + "epoch": 0.08899314197718768, + "grad_norm": 1.6183884143829346, + "learning_rate": 0.00019818080991354858, + "loss": 1.4296, + "step": 2485 + }, + { + "epoch": 0.08902895410675596, + "grad_norm": 1.493016004562378, + "learning_rate": 0.00019817860688011357, + "loss": 1.558, + "step": 2486 + }, + { + "epoch": 0.08906476623632424, + "grad_norm": 1.3361989259719849, + "learning_rate": 0.00019817640252581202, + "loss": 1.3134, + "step": 2487 + }, + { + "epoch": 0.08910057836589252, + "grad_norm": 2.0151519775390625, + "learning_rate": 0.00019817419685067364, + "loss": 2.0462, + "step": 2488 + }, + { + "epoch": 0.08913639049546081, + "grad_norm": 1.6625710725784302, + "learning_rate": 0.00019817198985472807, + "loss": 1.5406, + "step": 2489 + }, + { + "epoch": 0.0891722026250291, + "grad_norm": 1.9225102663040161, + "learning_rate": 0.00019816978153800504, + "loss": 1.7719, + "step": 2490 + }, + { + "epoch": 0.08920801475459739, + "grad_norm": 1.7934019565582275, + "learning_rate": 0.00019816757190053416, + "loss": 1.6073, + "step": 2491 + }, + { + "epoch": 0.08924382688416567, + "grad_norm": 1.8220930099487305, + "learning_rate": 0.00019816536094234528, + "loss": 1.7796, + "step": 2492 + }, + { + "epoch": 0.08927963901373395, + "grad_norm": 1.3911093473434448, + "learning_rate": 0.00019816314866346807, + "loss": 1.4431, + "step": 2493 + }, + { + "epoch": 0.08931545114330224, + "grad_norm": 1.3377423286437988, + "learning_rate": 0.00019816093506393233, + "loss": 1.6441, + "step": 2494 + }, + { + "epoch": 0.08935126327287052, + "grad_norm": 2.121514320373535, + "learning_rate": 0.00019815872014376784, + "loss": 1.6375, + "step": 2495 + }, + { + "epoch": 0.0893870754024388, + "grad_norm": 2.1512534618377686, + "learning_rate": 0.00019815650390300434, + "loss": 1.8018, + "step": 2496 + }, + { + "epoch": 0.0894228875320071, + "grad_norm": 1.6939420700073242, + "learning_rate": 0.00019815428634167176, + "loss": 1.2262, + "step": 2497 + }, + { + "epoch": 0.08945869966157538, + "grad_norm": 1.2576205730438232, + "learning_rate": 0.00019815206745979981, + "loss": 1.6021, + "step": 2498 + }, + { + "epoch": 0.08949451179114366, + "grad_norm": 1.6956746578216553, + "learning_rate": 0.00019814984725741842, + "loss": 1.5378, + "step": 2499 + }, + { + "epoch": 0.08953032392071195, + "grad_norm": 1.830729365348816, + "learning_rate": 0.00019814762573455743, + "loss": 1.9099, + "step": 2500 + }, + { + "epoch": 0.08956613605028023, + "grad_norm": 1.476012110710144, + "learning_rate": 0.00019814540289124675, + "loss": 1.3992, + "step": 2501 + }, + { + "epoch": 0.08960194817984851, + "grad_norm": 2.3555774688720703, + "learning_rate": 0.00019814317872751626, + "loss": 1.9159, + "step": 2502 + }, + { + "epoch": 0.0896377603094168, + "grad_norm": 1.2655014991760254, + "learning_rate": 0.0001981409532433959, + "loss": 1.7051, + "step": 2503 + }, + { + "epoch": 0.08967357243898508, + "grad_norm": 1.7695574760437012, + "learning_rate": 0.00019813872643891563, + "loss": 1.7181, + "step": 2504 + }, + { + "epoch": 0.08970938456855337, + "grad_norm": 1.8145886659622192, + "learning_rate": 0.00019813649831410535, + "loss": 1.5635, + "step": 2505 + }, + { + "epoch": 0.08974519669812166, + "grad_norm": 1.3770238161087036, + "learning_rate": 0.00019813426886899509, + "loss": 1.2597, + "step": 2506 + }, + { + "epoch": 0.08978100882768994, + "grad_norm": 2.0309836864471436, + "learning_rate": 0.00019813203810361483, + "loss": 1.4215, + "step": 2507 + }, + { + "epoch": 0.08981682095725822, + "grad_norm": 1.656128168106079, + "learning_rate": 0.00019812980601799458, + "loss": 1.5579, + "step": 2508 + }, + { + "epoch": 0.0898526330868265, + "grad_norm": 1.2524288892745972, + "learning_rate": 0.00019812757261216435, + "loss": 1.7402, + "step": 2509 + }, + { + "epoch": 0.08988844521639479, + "grad_norm": 1.3505594730377197, + "learning_rate": 0.0001981253378861542, + "loss": 1.6665, + "step": 2510 + }, + { + "epoch": 0.08992425734596307, + "grad_norm": 2.9887189865112305, + "learning_rate": 0.00019812310183999423, + "loss": 1.9483, + "step": 2511 + }, + { + "epoch": 0.08996006947553137, + "grad_norm": 1.1449756622314453, + "learning_rate": 0.00019812086447371446, + "loss": 1.6492, + "step": 2512 + }, + { + "epoch": 0.08999588160509965, + "grad_norm": 1.5508500337600708, + "learning_rate": 0.00019811862578734507, + "loss": 1.9411, + "step": 2513 + }, + { + "epoch": 0.09003169373466793, + "grad_norm": 1.6970720291137695, + "learning_rate": 0.0001981163857809161, + "loss": 1.7011, + "step": 2514 + }, + { + "epoch": 0.09006750586423622, + "grad_norm": 1.9046391248703003, + "learning_rate": 0.00019811414445445772, + "loss": 1.4795, + "step": 2515 + }, + { + "epoch": 0.0901033179938045, + "grad_norm": 1.9710471630096436, + "learning_rate": 0.00019811190180800013, + "loss": 1.8442, + "step": 2516 + }, + { + "epoch": 0.09013913012337278, + "grad_norm": 1.5773948431015015, + "learning_rate": 0.0001981096578415734, + "loss": 1.569, + "step": 2517 + }, + { + "epoch": 0.09017494225294106, + "grad_norm": 1.4161051511764526, + "learning_rate": 0.00019810741255520782, + "loss": 1.4553, + "step": 2518 + }, + { + "epoch": 0.09021075438250936, + "grad_norm": 1.7593234777450562, + "learning_rate": 0.0001981051659489335, + "loss": 1.7854, + "step": 2519 + }, + { + "epoch": 0.09024656651207764, + "grad_norm": 1.5166726112365723, + "learning_rate": 0.00019810291802278078, + "loss": 1.403, + "step": 2520 + }, + { + "epoch": 0.09028237864164593, + "grad_norm": 1.500707983970642, + "learning_rate": 0.00019810066877677982, + "loss": 2.0719, + "step": 2521 + }, + { + "epoch": 0.09031819077121421, + "grad_norm": 2.0746140480041504, + "learning_rate": 0.00019809841821096086, + "loss": 1.3798, + "step": 2522 + }, + { + "epoch": 0.09035400290078249, + "grad_norm": 1.631017804145813, + "learning_rate": 0.00019809616632535427, + "loss": 1.6161, + "step": 2523 + }, + { + "epoch": 0.09038981503035078, + "grad_norm": 1.3964747190475464, + "learning_rate": 0.00019809391311999028, + "loss": 1.7918, + "step": 2524 + }, + { + "epoch": 0.09042562715991906, + "grad_norm": 1.8602650165557861, + "learning_rate": 0.00019809165859489922, + "loss": 1.7201, + "step": 2525 + }, + { + "epoch": 0.09046143928948736, + "grad_norm": 1.7261444330215454, + "learning_rate": 0.00019808940275011145, + "loss": 1.7615, + "step": 2526 + }, + { + "epoch": 0.09049725141905564, + "grad_norm": 1.6224415302276611, + "learning_rate": 0.00019808714558565727, + "loss": 1.1881, + "step": 2527 + }, + { + "epoch": 0.09053306354862392, + "grad_norm": 1.8537174463272095, + "learning_rate": 0.00019808488710156707, + "loss": 1.7823, + "step": 2528 + }, + { + "epoch": 0.0905688756781922, + "grad_norm": 2.799915313720703, + "learning_rate": 0.0001980826272978712, + "loss": 1.5794, + "step": 2529 + }, + { + "epoch": 0.09060468780776049, + "grad_norm": 1.8718516826629639, + "learning_rate": 0.00019808036617460016, + "loss": 1.6623, + "step": 2530 + }, + { + "epoch": 0.09064049993732877, + "grad_norm": 1.234557867050171, + "learning_rate": 0.00019807810373178425, + "loss": 1.5993, + "step": 2531 + }, + { + "epoch": 0.09067631206689705, + "grad_norm": 1.8687853813171387, + "learning_rate": 0.000198075839969454, + "loss": 1.6352, + "step": 2532 + }, + { + "epoch": 0.09071212419646535, + "grad_norm": 1.8785006999969482, + "learning_rate": 0.00019807357488763985, + "loss": 1.7187, + "step": 2533 + }, + { + "epoch": 0.09074793632603363, + "grad_norm": 1.3195416927337646, + "learning_rate": 0.00019807130848637224, + "loss": 1.7064, + "step": 2534 + }, + { + "epoch": 0.09078374845560191, + "grad_norm": 2.186288833618164, + "learning_rate": 0.00019806904076568165, + "loss": 1.5541, + "step": 2535 + }, + { + "epoch": 0.0908195605851702, + "grad_norm": 1.6952531337738037, + "learning_rate": 0.00019806677172559865, + "loss": 1.3094, + "step": 2536 + }, + { + "epoch": 0.09085537271473848, + "grad_norm": 4.69820499420166, + "learning_rate": 0.00019806450136615372, + "loss": 1.5628, + "step": 2537 + }, + { + "epoch": 0.09089118484430676, + "grad_norm": 1.3726061582565308, + "learning_rate": 0.0001980622296873774, + "loss": 1.7239, + "step": 2538 + }, + { + "epoch": 0.09092699697387505, + "grad_norm": 1.5278609991073608, + "learning_rate": 0.0001980599566893003, + "loss": 2.0344, + "step": 2539 + }, + { + "epoch": 0.09096280910344334, + "grad_norm": 1.4546926021575928, + "learning_rate": 0.00019805768237195296, + "loss": 1.671, + "step": 2540 + }, + { + "epoch": 0.09099862123301163, + "grad_norm": 2.053382396697998, + "learning_rate": 0.00019805540673536597, + "loss": 1.7412, + "step": 2541 + }, + { + "epoch": 0.09103443336257991, + "grad_norm": 2.1475985050201416, + "learning_rate": 0.00019805312977956997, + "loss": 1.3434, + "step": 2542 + }, + { + "epoch": 0.09107024549214819, + "grad_norm": 2.8895585536956787, + "learning_rate": 0.0001980508515045956, + "loss": 1.545, + "step": 2543 + }, + { + "epoch": 0.09110605762171647, + "grad_norm": 1.509041428565979, + "learning_rate": 0.00019804857191047353, + "loss": 1.936, + "step": 2544 + }, + { + "epoch": 0.09114186975128476, + "grad_norm": 2.280407190322876, + "learning_rate": 0.00019804629099723435, + "loss": 1.9202, + "step": 2545 + }, + { + "epoch": 0.09117768188085304, + "grad_norm": 2.5649526119232178, + "learning_rate": 0.00019804400876490883, + "loss": 1.7697, + "step": 2546 + }, + { + "epoch": 0.09121349401042134, + "grad_norm": 2.6087114810943604, + "learning_rate": 0.00019804172521352761, + "loss": 1.6824, + "step": 2547 + }, + { + "epoch": 0.09124930613998962, + "grad_norm": 1.8538936376571655, + "learning_rate": 0.00019803944034312148, + "loss": 1.6512, + "step": 2548 + }, + { + "epoch": 0.0912851182695579, + "grad_norm": 2.424487352371216, + "learning_rate": 0.0001980371541537211, + "loss": 1.4249, + "step": 2549 + }, + { + "epoch": 0.09132093039912619, + "grad_norm": 1.9333043098449707, + "learning_rate": 0.0001980348666453573, + "loss": 1.8535, + "step": 2550 + }, + { + "epoch": 0.09135674252869447, + "grad_norm": 1.7312296628952026, + "learning_rate": 0.00019803257781806082, + "loss": 1.7266, + "step": 2551 + }, + { + "epoch": 0.09139255465826275, + "grad_norm": 1.5011173486709595, + "learning_rate": 0.00019803028767186246, + "loss": 1.7465, + "step": 2552 + }, + { + "epoch": 0.09142836678783103, + "grad_norm": 2.4707727432250977, + "learning_rate": 0.000198027996206793, + "loss": 1.6072, + "step": 2553 + }, + { + "epoch": 0.09146417891739932, + "grad_norm": 3.124415159225464, + "learning_rate": 0.0001980257034228833, + "loss": 1.8117, + "step": 2554 + }, + { + "epoch": 0.09149999104696761, + "grad_norm": 1.7809544801712036, + "learning_rate": 0.00019802340932016424, + "loss": 1.7285, + "step": 2555 + }, + { + "epoch": 0.0915358031765359, + "grad_norm": 1.9475129842758179, + "learning_rate": 0.00019802111389866664, + "loss": 1.6601, + "step": 2556 + }, + { + "epoch": 0.09157161530610418, + "grad_norm": 1.3887070417404175, + "learning_rate": 0.00019801881715842136, + "loss": 1.5281, + "step": 2557 + }, + { + "epoch": 0.09160742743567246, + "grad_norm": 1.341725468635559, + "learning_rate": 0.00019801651909945935, + "loss": 1.5264, + "step": 2558 + }, + { + "epoch": 0.09164323956524074, + "grad_norm": 1.8760310411453247, + "learning_rate": 0.0001980142197218115, + "loss": 1.6517, + "step": 2559 + }, + { + "epoch": 0.09167905169480903, + "grad_norm": 2.533712863922119, + "learning_rate": 0.0001980119190255088, + "loss": 1.6518, + "step": 2560 + }, + { + "epoch": 0.09171486382437731, + "grad_norm": 2.026277780532837, + "learning_rate": 0.0001980096170105821, + "loss": 1.5654, + "step": 2561 + }, + { + "epoch": 0.0917506759539456, + "grad_norm": 2.1434755325317383, + "learning_rate": 0.00019800731367706248, + "loss": 1.7728, + "step": 2562 + }, + { + "epoch": 0.09178648808351389, + "grad_norm": 1.6237499713897705, + "learning_rate": 0.0001980050090249808, + "loss": 1.651, + "step": 2563 + }, + { + "epoch": 0.09182230021308217, + "grad_norm": 1.460422396659851, + "learning_rate": 0.0001980027030543682, + "loss": 1.5255, + "step": 2564 + }, + { + "epoch": 0.09185811234265046, + "grad_norm": 1.3004599809646606, + "learning_rate": 0.00019800039576525562, + "loss": 1.7805, + "step": 2565 + }, + { + "epoch": 0.09189392447221874, + "grad_norm": 1.7502729892730713, + "learning_rate": 0.00019799808715767413, + "loss": 1.8473, + "step": 2566 + }, + { + "epoch": 0.09192973660178702, + "grad_norm": 1.482386827468872, + "learning_rate": 0.00019799577723165479, + "loss": 1.647, + "step": 2567 + }, + { + "epoch": 0.0919655487313553, + "grad_norm": 2.2665460109710693, + "learning_rate": 0.0001979934659872287, + "loss": 1.8273, + "step": 2568 + }, + { + "epoch": 0.0920013608609236, + "grad_norm": 1.5462538003921509, + "learning_rate": 0.00019799115342442687, + "loss": 1.6563, + "step": 2569 + }, + { + "epoch": 0.09203717299049188, + "grad_norm": 1.4237371683120728, + "learning_rate": 0.0001979888395432805, + "loss": 1.7318, + "step": 2570 + }, + { + "epoch": 0.09207298512006017, + "grad_norm": 1.7229256629943848, + "learning_rate": 0.00019798652434382068, + "loss": 1.7655, + "step": 2571 + }, + { + "epoch": 0.09210879724962845, + "grad_norm": 1.699660062789917, + "learning_rate": 0.0001979842078260786, + "loss": 1.4854, + "step": 2572 + }, + { + "epoch": 0.09214460937919673, + "grad_norm": 1.2170562744140625, + "learning_rate": 0.00019798188999008536, + "loss": 1.2979, + "step": 2573 + }, + { + "epoch": 0.09218042150876501, + "grad_norm": 1.5595293045043945, + "learning_rate": 0.00019797957083587218, + "loss": 1.4223, + "step": 2574 + }, + { + "epoch": 0.0922162336383333, + "grad_norm": 1.6665608882904053, + "learning_rate": 0.00019797725036347025, + "loss": 1.5744, + "step": 2575 + }, + { + "epoch": 0.0922520457679016, + "grad_norm": 1.5953999757766724, + "learning_rate": 0.00019797492857291085, + "loss": 1.4263, + "step": 2576 + }, + { + "epoch": 0.09228785789746988, + "grad_norm": 1.8051010370254517, + "learning_rate": 0.00019797260546422512, + "loss": 2.0402, + "step": 2577 + }, + { + "epoch": 0.09232367002703816, + "grad_norm": 2.281867027282715, + "learning_rate": 0.00019797028103744438, + "loss": 1.7244, + "step": 2578 + }, + { + "epoch": 0.09235948215660644, + "grad_norm": 1.6062850952148438, + "learning_rate": 0.00019796795529259986, + "loss": 1.6111, + "step": 2579 + }, + { + "epoch": 0.09239529428617473, + "grad_norm": 1.4826256036758423, + "learning_rate": 0.0001979656282297229, + "loss": 1.442, + "step": 2580 + }, + { + "epoch": 0.09243110641574301, + "grad_norm": 2.0289320945739746, + "learning_rate": 0.00019796329984884473, + "loss": 1.986, + "step": 2581 + }, + { + "epoch": 0.09246691854531129, + "grad_norm": 1.5660099983215332, + "learning_rate": 0.00019796097014999678, + "loss": 1.7942, + "step": 2582 + }, + { + "epoch": 0.09250273067487959, + "grad_norm": 1.9263335466384888, + "learning_rate": 0.0001979586391332103, + "loss": 1.7872, + "step": 2583 + }, + { + "epoch": 0.09253854280444787, + "grad_norm": 1.0887078046798706, + "learning_rate": 0.0001979563067985167, + "loss": 1.591, + "step": 2584 + }, + { + "epoch": 0.09257435493401615, + "grad_norm": 1.3899720907211304, + "learning_rate": 0.00019795397314594735, + "loss": 1.4923, + "step": 2585 + }, + { + "epoch": 0.09261016706358444, + "grad_norm": 1.539726972579956, + "learning_rate": 0.00019795163817553363, + "loss": 1.5147, + "step": 2586 + }, + { + "epoch": 0.09264597919315272, + "grad_norm": 2.2429847717285156, + "learning_rate": 0.000197949301887307, + "loss": 1.6219, + "step": 2587 + }, + { + "epoch": 0.092681791322721, + "grad_norm": 1.6642656326293945, + "learning_rate": 0.00019794696428129883, + "loss": 1.3772, + "step": 2588 + }, + { + "epoch": 0.09271760345228929, + "grad_norm": 1.2310926914215088, + "learning_rate": 0.0001979446253575406, + "loss": 1.5615, + "step": 2589 + }, + { + "epoch": 0.09275341558185758, + "grad_norm": 2.3761885166168213, + "learning_rate": 0.00019794228511606376, + "loss": 1.5446, + "step": 2590 + }, + { + "epoch": 0.09278922771142586, + "grad_norm": 1.7739567756652832, + "learning_rate": 0.00019793994355689985, + "loss": 1.5243, + "step": 2591 + }, + { + "epoch": 0.09282503984099415, + "grad_norm": 1.9059218168258667, + "learning_rate": 0.0001979376006800803, + "loss": 1.5244, + "step": 2592 + }, + { + "epoch": 0.09286085197056243, + "grad_norm": 2.342510938644409, + "learning_rate": 0.00019793525648563668, + "loss": 1.6978, + "step": 2593 + }, + { + "epoch": 0.09289666410013071, + "grad_norm": 1.4292327165603638, + "learning_rate": 0.0001979329109736005, + "loss": 1.78, + "step": 2594 + }, + { + "epoch": 0.092932476229699, + "grad_norm": 2.344578504562378, + "learning_rate": 0.00019793056414400332, + "loss": 1.8671, + "step": 2595 + }, + { + "epoch": 0.09296828835926728, + "grad_norm": 1.9726289510726929, + "learning_rate": 0.00019792821599687676, + "loss": 1.5574, + "step": 2596 + }, + { + "epoch": 0.09300410048883558, + "grad_norm": 1.629486322402954, + "learning_rate": 0.00019792586653225237, + "loss": 1.5039, + "step": 2597 + }, + { + "epoch": 0.09303991261840386, + "grad_norm": 1.6008448600769043, + "learning_rate": 0.00019792351575016173, + "loss": 1.5486, + "step": 2598 + }, + { + "epoch": 0.09307572474797214, + "grad_norm": 1.9387474060058594, + "learning_rate": 0.0001979211636506365, + "loss": 1.5127, + "step": 2599 + }, + { + "epoch": 0.09311153687754042, + "grad_norm": 2.0888354778289795, + "learning_rate": 0.0001979188102337083, + "loss": 1.6695, + "step": 2600 + }, + { + "epoch": 0.0931473490071087, + "grad_norm": 1.6669467687606812, + "learning_rate": 0.00019791645549940886, + "loss": 1.6549, + "step": 2601 + }, + { + "epoch": 0.09318316113667699, + "grad_norm": 1.3782527446746826, + "learning_rate": 0.0001979140994477698, + "loss": 1.6938, + "step": 2602 + }, + { + "epoch": 0.09321897326624527, + "grad_norm": 1.7212045192718506, + "learning_rate": 0.00019791174207882284, + "loss": 1.3562, + "step": 2603 + }, + { + "epoch": 0.09325478539581356, + "grad_norm": 1.2293366193771362, + "learning_rate": 0.00019790938339259967, + "loss": 1.65, + "step": 2604 + }, + { + "epoch": 0.09329059752538185, + "grad_norm": 1.813463568687439, + "learning_rate": 0.00019790702338913204, + "loss": 1.5209, + "step": 2605 + }, + { + "epoch": 0.09332640965495013, + "grad_norm": 1.8608582019805908, + "learning_rate": 0.0001979046620684517, + "loss": 1.5321, + "step": 2606 + }, + { + "epoch": 0.09336222178451842, + "grad_norm": 2.023027181625366, + "learning_rate": 0.00019790229943059045, + "loss": 1.6548, + "step": 2607 + }, + { + "epoch": 0.0933980339140867, + "grad_norm": 1.4660706520080566, + "learning_rate": 0.00019789993547558, + "loss": 1.7817, + "step": 2608 + }, + { + "epoch": 0.09343384604365498, + "grad_norm": 1.455670714378357, + "learning_rate": 0.00019789757020345224, + "loss": 1.6645, + "step": 2609 + }, + { + "epoch": 0.09346965817322327, + "grad_norm": 1.8113867044448853, + "learning_rate": 0.00019789520361423893, + "loss": 1.7678, + "step": 2610 + }, + { + "epoch": 0.09350547030279155, + "grad_norm": 1.1966438293457031, + "learning_rate": 0.00019789283570797192, + "loss": 1.7918, + "step": 2611 + }, + { + "epoch": 0.09354128243235985, + "grad_norm": 1.6652956008911133, + "learning_rate": 0.0001978904664846831, + "loss": 1.5339, + "step": 2612 + }, + { + "epoch": 0.09357709456192813, + "grad_norm": 1.756519079208374, + "learning_rate": 0.00019788809594440432, + "loss": 1.5227, + "step": 2613 + }, + { + "epoch": 0.09361290669149641, + "grad_norm": 1.711736798286438, + "learning_rate": 0.00019788572408716747, + "loss": 1.8176, + "step": 2614 + }, + { + "epoch": 0.0936487188210647, + "grad_norm": 2.1952521800994873, + "learning_rate": 0.00019788335091300448, + "loss": 1.7203, + "step": 2615 + }, + { + "epoch": 0.09368453095063298, + "grad_norm": 1.3425790071487427, + "learning_rate": 0.00019788097642194725, + "loss": 1.7342, + "step": 2616 + }, + { + "epoch": 0.09372034308020126, + "grad_norm": 1.7646204233169556, + "learning_rate": 0.00019787860061402774, + "loss": 1.5705, + "step": 2617 + }, + { + "epoch": 0.09375615520976954, + "grad_norm": 2.6315975189208984, + "learning_rate": 0.00019787622348927793, + "loss": 1.5904, + "step": 2618 + }, + { + "epoch": 0.09379196733933784, + "grad_norm": 2.2786176204681396, + "learning_rate": 0.00019787384504772976, + "loss": 1.4719, + "step": 2619 + }, + { + "epoch": 0.09382777946890612, + "grad_norm": 3.4464972019195557, + "learning_rate": 0.00019787146528941528, + "loss": 2.1013, + "step": 2620 + }, + { + "epoch": 0.0938635915984744, + "grad_norm": 1.4851138591766357, + "learning_rate": 0.00019786908421436645, + "loss": 1.6989, + "step": 2621 + }, + { + "epoch": 0.09389940372804269, + "grad_norm": 1.547202229499817, + "learning_rate": 0.00019786670182261534, + "loss": 1.8292, + "step": 2622 + }, + { + "epoch": 0.09393521585761097, + "grad_norm": 1.8076845407485962, + "learning_rate": 0.00019786431811419402, + "loss": 1.5035, + "step": 2623 + }, + { + "epoch": 0.09397102798717925, + "grad_norm": 2.3341751098632812, + "learning_rate": 0.0001978619330891345, + "loss": 2.0307, + "step": 2624 + }, + { + "epoch": 0.09400684011674754, + "grad_norm": 2.7499022483825684, + "learning_rate": 0.0001978595467474689, + "loss": 1.51, + "step": 2625 + }, + { + "epoch": 0.09404265224631583, + "grad_norm": 1.925464153289795, + "learning_rate": 0.00019785715908922938, + "loss": 1.4552, + "step": 2626 + }, + { + "epoch": 0.09407846437588412, + "grad_norm": 2.460425853729248, + "learning_rate": 0.00019785477011444798, + "loss": 1.5344, + "step": 2627 + }, + { + "epoch": 0.0941142765054524, + "grad_norm": 1.5089377164840698, + "learning_rate": 0.00019785237982315686, + "loss": 1.6439, + "step": 2628 + }, + { + "epoch": 0.09415008863502068, + "grad_norm": 1.501597285270691, + "learning_rate": 0.0001978499882153882, + "loss": 1.5609, + "step": 2629 + }, + { + "epoch": 0.09418590076458896, + "grad_norm": 1.284104824066162, + "learning_rate": 0.00019784759529117415, + "loss": 1.4742, + "step": 2630 + }, + { + "epoch": 0.09422171289415725, + "grad_norm": 2.3818297386169434, + "learning_rate": 0.000197845201050547, + "loss": 1.5001, + "step": 2631 + }, + { + "epoch": 0.09425752502372553, + "grad_norm": 1.6480205059051514, + "learning_rate": 0.0001978428054935388, + "loss": 1.5732, + "step": 2632 + }, + { + "epoch": 0.09429333715329383, + "grad_norm": 2.133923053741455, + "learning_rate": 0.00019784040862018184, + "loss": 1.5003, + "step": 2633 + }, + { + "epoch": 0.09432914928286211, + "grad_norm": 1.5461385250091553, + "learning_rate": 0.00019783801043050844, + "loss": 1.4475, + "step": 2634 + }, + { + "epoch": 0.09436496141243039, + "grad_norm": 1.5656708478927612, + "learning_rate": 0.0001978356109245508, + "loss": 1.5606, + "step": 2635 + }, + { + "epoch": 0.09440077354199868, + "grad_norm": 1.454704999923706, + "learning_rate": 0.00019783321010234122, + "loss": 1.643, + "step": 2636 + }, + { + "epoch": 0.09443658567156696, + "grad_norm": 1.9194536209106445, + "learning_rate": 0.000197830807963912, + "loss": 1.4815, + "step": 2637 + }, + { + "epoch": 0.09447239780113524, + "grad_norm": 1.1359739303588867, + "learning_rate": 0.00019782840450929543, + "loss": 1.6183, + "step": 2638 + }, + { + "epoch": 0.09450820993070352, + "grad_norm": 1.4093936681747437, + "learning_rate": 0.00019782599973852387, + "loss": 1.7041, + "step": 2639 + }, + { + "epoch": 0.09454402206027182, + "grad_norm": 1.9672014713287354, + "learning_rate": 0.0001978235936516297, + "loss": 1.4104, + "step": 2640 + }, + { + "epoch": 0.0945798341898401, + "grad_norm": 1.2967243194580078, + "learning_rate": 0.0001978211862486452, + "loss": 1.5651, + "step": 2641 + }, + { + "epoch": 0.09461564631940839, + "grad_norm": 1.4249886274337769, + "learning_rate": 0.00019781877752960285, + "loss": 1.6584, + "step": 2642 + }, + { + "epoch": 0.09465145844897667, + "grad_norm": 1.4482895135879517, + "learning_rate": 0.00019781636749453504, + "loss": 1.6222, + "step": 2643 + }, + { + "epoch": 0.09468727057854495, + "grad_norm": 1.9295979738235474, + "learning_rate": 0.00019781395614347415, + "loss": 1.778, + "step": 2644 + }, + { + "epoch": 0.09472308270811323, + "grad_norm": 1.6828997135162354, + "learning_rate": 0.0001978115434764527, + "loss": 1.602, + "step": 2645 + }, + { + "epoch": 0.09475889483768152, + "grad_norm": 1.602980375289917, + "learning_rate": 0.00019780912949350307, + "loss": 1.5946, + "step": 2646 + }, + { + "epoch": 0.09479470696724981, + "grad_norm": 1.2852801084518433, + "learning_rate": 0.00019780671419465776, + "loss": 1.3936, + "step": 2647 + }, + { + "epoch": 0.0948305190968181, + "grad_norm": 3.3350985050201416, + "learning_rate": 0.00019780429757994928, + "loss": 1.587, + "step": 2648 + }, + { + "epoch": 0.09486633122638638, + "grad_norm": 1.130829930305481, + "learning_rate": 0.00019780187964941011, + "loss": 1.479, + "step": 2649 + }, + { + "epoch": 0.09490214335595466, + "grad_norm": 1.76201593875885, + "learning_rate": 0.00019779946040307284, + "loss": 1.5925, + "step": 2650 + }, + { + "epoch": 0.09493795548552295, + "grad_norm": 2.2327420711517334, + "learning_rate": 0.00019779703984096998, + "loss": 1.6929, + "step": 2651 + }, + { + "epoch": 0.09497376761509123, + "grad_norm": 2.0052194595336914, + "learning_rate": 0.00019779461796313408, + "loss": 1.4586, + "step": 2652 + }, + { + "epoch": 0.09500957974465951, + "grad_norm": 1.6383410692214966, + "learning_rate": 0.00019779219476959777, + "loss": 1.4045, + "step": 2653 + }, + { + "epoch": 0.0950453918742278, + "grad_norm": 2.5757429599761963, + "learning_rate": 0.0001977897702603936, + "loss": 1.7264, + "step": 2654 + }, + { + "epoch": 0.09508120400379609, + "grad_norm": 2.200880289077759, + "learning_rate": 0.0001977873444355542, + "loss": 1.7437, + "step": 2655 + }, + { + "epoch": 0.09511701613336437, + "grad_norm": 2.070451498031616, + "learning_rate": 0.00019778491729511224, + "loss": 1.634, + "step": 2656 + }, + { + "epoch": 0.09515282826293266, + "grad_norm": 1.4518322944641113, + "learning_rate": 0.00019778248883910035, + "loss": 1.7425, + "step": 2657 + }, + { + "epoch": 0.09518864039250094, + "grad_norm": 1.1998945474624634, + "learning_rate": 0.0001977800590675512, + "loss": 1.4792, + "step": 2658 + }, + { + "epoch": 0.09522445252206922, + "grad_norm": 1.4403979778289795, + "learning_rate": 0.0001977776279804975, + "loss": 1.5377, + "step": 2659 + }, + { + "epoch": 0.0952602646516375, + "grad_norm": 1.753049373626709, + "learning_rate": 0.0001977751955779719, + "loss": 1.1242, + "step": 2660 + }, + { + "epoch": 0.09529607678120579, + "grad_norm": 2.60457706451416, + "learning_rate": 0.00019777276186000716, + "loss": 1.3977, + "step": 2661 + }, + { + "epoch": 0.09533188891077408, + "grad_norm": 2.1619999408721924, + "learning_rate": 0.00019777032682663607, + "loss": 1.8488, + "step": 2662 + }, + { + "epoch": 0.09536770104034237, + "grad_norm": 1.300704002380371, + "learning_rate": 0.00019776789047789133, + "loss": 1.7853, + "step": 2663 + }, + { + "epoch": 0.09540351316991065, + "grad_norm": 1.6537222862243652, + "learning_rate": 0.0001977654528138057, + "loss": 1.5626, + "step": 2664 + }, + { + "epoch": 0.09543932529947893, + "grad_norm": 1.322972059249878, + "learning_rate": 0.00019776301383441207, + "loss": 1.5224, + "step": 2665 + }, + { + "epoch": 0.09547513742904722, + "grad_norm": 1.761858582496643, + "learning_rate": 0.00019776057353974315, + "loss": 1.7065, + "step": 2666 + }, + { + "epoch": 0.0955109495586155, + "grad_norm": 1.886605143547058, + "learning_rate": 0.00019775813192983183, + "loss": 1.2394, + "step": 2667 + }, + { + "epoch": 0.09554676168818378, + "grad_norm": 2.51098370552063, + "learning_rate": 0.00019775568900471096, + "loss": 1.3918, + "step": 2668 + }, + { + "epoch": 0.09558257381775208, + "grad_norm": 2.1192142963409424, + "learning_rate": 0.00019775324476441336, + "loss": 1.4485, + "step": 2669 + }, + { + "epoch": 0.09561838594732036, + "grad_norm": 2.188304901123047, + "learning_rate": 0.00019775079920897196, + "loss": 1.7083, + "step": 2670 + }, + { + "epoch": 0.09565419807688864, + "grad_norm": 1.3558300733566284, + "learning_rate": 0.00019774835233841965, + "loss": 1.7486, + "step": 2671 + }, + { + "epoch": 0.09569001020645693, + "grad_norm": 1.7164361476898193, + "learning_rate": 0.00019774590415278933, + "loss": 1.4583, + "step": 2672 + }, + { + "epoch": 0.09572582233602521, + "grad_norm": 1.419765830039978, + "learning_rate": 0.00019774345465211398, + "loss": 2.0221, + "step": 2673 + }, + { + "epoch": 0.09576163446559349, + "grad_norm": 1.728421926498413, + "learning_rate": 0.00019774100383642651, + "loss": 1.3956, + "step": 2674 + }, + { + "epoch": 0.09579744659516178, + "grad_norm": 2.3539481163024902, + "learning_rate": 0.0001977385517057599, + "loss": 1.6057, + "step": 2675 + }, + { + "epoch": 0.09583325872473007, + "grad_norm": 2.140263319015503, + "learning_rate": 0.00019773609826014718, + "loss": 1.7721, + "step": 2676 + }, + { + "epoch": 0.09586907085429835, + "grad_norm": 1.8717950582504272, + "learning_rate": 0.0001977336434996213, + "loss": 1.3382, + "step": 2677 + }, + { + "epoch": 0.09590488298386664, + "grad_norm": 1.4856847524642944, + "learning_rate": 0.00019773118742421532, + "loss": 1.4095, + "step": 2678 + }, + { + "epoch": 0.09594069511343492, + "grad_norm": 2.1832220554351807, + "learning_rate": 0.00019772873003396228, + "loss": 1.6058, + "step": 2679 + }, + { + "epoch": 0.0959765072430032, + "grad_norm": 1.603605031967163, + "learning_rate": 0.00019772627132889526, + "loss": 1.5258, + "step": 2680 + }, + { + "epoch": 0.09601231937257149, + "grad_norm": 1.6799068450927734, + "learning_rate": 0.00019772381130904728, + "loss": 1.6569, + "step": 2681 + }, + { + "epoch": 0.09604813150213977, + "grad_norm": 1.3893667459487915, + "learning_rate": 0.0001977213499744515, + "loss": 1.6977, + "step": 2682 + }, + { + "epoch": 0.09608394363170807, + "grad_norm": 2.7440836429595947, + "learning_rate": 0.00019771888732514098, + "loss": 1.495, + "step": 2683 + }, + { + "epoch": 0.09611975576127635, + "grad_norm": 1.2910890579223633, + "learning_rate": 0.00019771642336114892, + "loss": 1.1927, + "step": 2684 + }, + { + "epoch": 0.09615556789084463, + "grad_norm": 1.6925691366195679, + "learning_rate": 0.0001977139580825084, + "loss": 1.2957, + "step": 2685 + }, + { + "epoch": 0.09619138002041291, + "grad_norm": 2.3331246376037598, + "learning_rate": 0.0001977114914892526, + "loss": 1.7952, + "step": 2686 + }, + { + "epoch": 0.0962271921499812, + "grad_norm": 2.0909788608551025, + "learning_rate": 0.00019770902358141478, + "loss": 1.4007, + "step": 2687 + }, + { + "epoch": 0.09626300427954948, + "grad_norm": 1.5967774391174316, + "learning_rate": 0.00019770655435902805, + "loss": 1.4887, + "step": 2688 + }, + { + "epoch": 0.09629881640911776, + "grad_norm": 1.9621723890304565, + "learning_rate": 0.00019770408382212564, + "loss": 1.1734, + "step": 2689 + }, + { + "epoch": 0.09633462853868606, + "grad_norm": 1.2921861410140991, + "learning_rate": 0.00019770161197074084, + "loss": 1.6445, + "step": 2690 + }, + { + "epoch": 0.09637044066825434, + "grad_norm": 1.2130435705184937, + "learning_rate": 0.00019769913880490688, + "loss": 1.6201, + "step": 2691 + }, + { + "epoch": 0.09640625279782263, + "grad_norm": 1.7521440982818604, + "learning_rate": 0.000197696664324657, + "loss": 1.6516, + "step": 2692 + }, + { + "epoch": 0.09644206492739091, + "grad_norm": 1.5702006816864014, + "learning_rate": 0.00019769418853002454, + "loss": 1.2543, + "step": 2693 + }, + { + "epoch": 0.09647787705695919, + "grad_norm": 1.7723594903945923, + "learning_rate": 0.0001976917114210428, + "loss": 1.5552, + "step": 2694 + }, + { + "epoch": 0.09651368918652747, + "grad_norm": 1.4246082305908203, + "learning_rate": 0.00019768923299774506, + "loss": 1.6263, + "step": 2695 + }, + { + "epoch": 0.09654950131609576, + "grad_norm": 1.6311163902282715, + "learning_rate": 0.00019768675326016475, + "loss": 1.5833, + "step": 2696 + }, + { + "epoch": 0.09658531344566405, + "grad_norm": 1.1841009855270386, + "learning_rate": 0.00019768427220833514, + "loss": 1.3689, + "step": 2697 + }, + { + "epoch": 0.09662112557523234, + "grad_norm": 1.5306531190872192, + "learning_rate": 0.00019768178984228967, + "loss": 1.8065, + "step": 2698 + }, + { + "epoch": 0.09665693770480062, + "grad_norm": 1.5561325550079346, + "learning_rate": 0.00019767930616206174, + "loss": 1.6343, + "step": 2699 + }, + { + "epoch": 0.0966927498343689, + "grad_norm": 2.01039457321167, + "learning_rate": 0.00019767682116768472, + "loss": 1.4323, + "step": 2700 + }, + { + "epoch": 0.09672856196393718, + "grad_norm": 1.3528622388839722, + "learning_rate": 0.00019767433485919206, + "loss": 1.5957, + "step": 2701 + }, + { + "epoch": 0.09676437409350547, + "grad_norm": 1.388312816619873, + "learning_rate": 0.0001976718472366172, + "loss": 1.5354, + "step": 2702 + }, + { + "epoch": 0.09680018622307375, + "grad_norm": 1.7916830778121948, + "learning_rate": 0.00019766935829999363, + "loss": 1.7827, + "step": 2703 + }, + { + "epoch": 0.09683599835264203, + "grad_norm": 2.035825729370117, + "learning_rate": 0.00019766686804935488, + "loss": 1.52, + "step": 2704 + }, + { + "epoch": 0.09687181048221033, + "grad_norm": 1.4044291973114014, + "learning_rate": 0.00019766437648473435, + "loss": 1.778, + "step": 2705 + }, + { + "epoch": 0.09690762261177861, + "grad_norm": 1.3465924263000488, + "learning_rate": 0.00019766188360616563, + "loss": 1.6834, + "step": 2706 + }, + { + "epoch": 0.0969434347413469, + "grad_norm": 1.3629565238952637, + "learning_rate": 0.00019765938941368222, + "loss": 1.7601, + "step": 2707 + }, + { + "epoch": 0.09697924687091518, + "grad_norm": 1.55349600315094, + "learning_rate": 0.00019765689390731773, + "loss": 1.8671, + "step": 2708 + }, + { + "epoch": 0.09701505900048346, + "grad_norm": 1.5286920070648193, + "learning_rate": 0.0001976543970871057, + "loss": 1.6034, + "step": 2709 + }, + { + "epoch": 0.09705087113005174, + "grad_norm": 1.5274664163589478, + "learning_rate": 0.0001976518989530797, + "loss": 1.7297, + "step": 2710 + }, + { + "epoch": 0.09708668325962003, + "grad_norm": 1.5408055782318115, + "learning_rate": 0.00019764939950527336, + "loss": 1.5561, + "step": 2711 + }, + { + "epoch": 0.09712249538918832, + "grad_norm": 1.8871009349822998, + "learning_rate": 0.0001976468987437203, + "loss": 1.5792, + "step": 2712 + }, + { + "epoch": 0.0971583075187566, + "grad_norm": 1.9408537149429321, + "learning_rate": 0.0001976443966684542, + "loss": 1.6264, + "step": 2713 + }, + { + "epoch": 0.09719411964832489, + "grad_norm": 1.510097861289978, + "learning_rate": 0.00019764189327950869, + "loss": 1.4198, + "step": 2714 + }, + { + "epoch": 0.09722993177789317, + "grad_norm": 2.1380035877227783, + "learning_rate": 0.00019763938857691744, + "loss": 1.4642, + "step": 2715 + }, + { + "epoch": 0.09726574390746145, + "grad_norm": 1.4172416925430298, + "learning_rate": 0.00019763688256071418, + "loss": 1.9416, + "step": 2716 + }, + { + "epoch": 0.09730155603702974, + "grad_norm": 1.5462101697921753, + "learning_rate": 0.0001976343752309326, + "loss": 1.573, + "step": 2717 + }, + { + "epoch": 0.09733736816659802, + "grad_norm": 1.9838913679122925, + "learning_rate": 0.00019763186658760645, + "loss": 1.7277, + "step": 2718 + }, + { + "epoch": 0.09737318029616632, + "grad_norm": 1.450179100036621, + "learning_rate": 0.00019762935663076946, + "loss": 1.7552, + "step": 2719 + }, + { + "epoch": 0.0974089924257346, + "grad_norm": 1.3784503936767578, + "learning_rate": 0.00019762684536045542, + "loss": 1.443, + "step": 2720 + }, + { + "epoch": 0.09744480455530288, + "grad_norm": 2.3167805671691895, + "learning_rate": 0.00019762433277669807, + "loss": 1.8773, + "step": 2721 + }, + { + "epoch": 0.09748061668487117, + "grad_norm": 2.927114486694336, + "learning_rate": 0.00019762181887953128, + "loss": 1.7365, + "step": 2722 + }, + { + "epoch": 0.09751642881443945, + "grad_norm": 2.3608787059783936, + "learning_rate": 0.00019761930366898883, + "loss": 1.6947, + "step": 2723 + }, + { + "epoch": 0.09755224094400773, + "grad_norm": 1.6629176139831543, + "learning_rate": 0.0001976167871451046, + "loss": 1.6643, + "step": 2724 + }, + { + "epoch": 0.09758805307357601, + "grad_norm": 1.806070327758789, + "learning_rate": 0.00019761426930791238, + "loss": 2.0347, + "step": 2725 + }, + { + "epoch": 0.09762386520314431, + "grad_norm": 2.381452798843384, + "learning_rate": 0.00019761175015744605, + "loss": 1.5381, + "step": 2726 + }, + { + "epoch": 0.0976596773327126, + "grad_norm": 1.5285437107086182, + "learning_rate": 0.0001976092296937396, + "loss": 1.9806, + "step": 2727 + }, + { + "epoch": 0.09769548946228088, + "grad_norm": 1.324546456336975, + "learning_rate": 0.00019760670791682685, + "loss": 1.7321, + "step": 2728 + }, + { + "epoch": 0.09773130159184916, + "grad_norm": 1.9074602127075195, + "learning_rate": 0.00019760418482674173, + "loss": 1.6216, + "step": 2729 + }, + { + "epoch": 0.09776711372141744, + "grad_norm": 1.6975016593933105, + "learning_rate": 0.0001976016604235182, + "loss": 1.7029, + "step": 2730 + }, + { + "epoch": 0.09780292585098573, + "grad_norm": 1.9731216430664062, + "learning_rate": 0.00019759913470719024, + "loss": 1.5059, + "step": 2731 + }, + { + "epoch": 0.09783873798055401, + "grad_norm": 1.32267165184021, + "learning_rate": 0.00019759660767779184, + "loss": 1.4878, + "step": 2732 + }, + { + "epoch": 0.0978745501101223, + "grad_norm": 1.2084392309188843, + "learning_rate": 0.00019759407933535693, + "loss": 1.6624, + "step": 2733 + }, + { + "epoch": 0.09791036223969059, + "grad_norm": 2.081899404525757, + "learning_rate": 0.0001975915496799196, + "loss": 1.7721, + "step": 2734 + }, + { + "epoch": 0.09794617436925887, + "grad_norm": 2.2579267024993896, + "learning_rate": 0.00019758901871151383, + "loss": 1.3182, + "step": 2735 + }, + { + "epoch": 0.09798198649882715, + "grad_norm": 1.7588104009628296, + "learning_rate": 0.00019758648643017373, + "loss": 1.2959, + "step": 2736 + }, + { + "epoch": 0.09801779862839544, + "grad_norm": 1.9816704988479614, + "learning_rate": 0.0001975839528359333, + "loss": 1.4635, + "step": 2737 + }, + { + "epoch": 0.09805361075796372, + "grad_norm": 1.833520770072937, + "learning_rate": 0.00019758141792882667, + "loss": 1.4191, + "step": 2738 + }, + { + "epoch": 0.098089422887532, + "grad_norm": 1.7605247497558594, + "learning_rate": 0.00019757888170888793, + "loss": 1.3866, + "step": 2739 + }, + { + "epoch": 0.0981252350171003, + "grad_norm": 2.2199249267578125, + "learning_rate": 0.0001975763441761512, + "loss": 1.7087, + "step": 2740 + }, + { + "epoch": 0.09816104714666858, + "grad_norm": 1.999975562095642, + "learning_rate": 0.00019757380533065065, + "loss": 1.8079, + "step": 2741 + }, + { + "epoch": 0.09819685927623686, + "grad_norm": 1.5539270639419556, + "learning_rate": 0.00019757126517242038, + "loss": 1.7708, + "step": 2742 + }, + { + "epoch": 0.09823267140580515, + "grad_norm": 1.6064941883087158, + "learning_rate": 0.0001975687237014946, + "loss": 1.7572, + "step": 2743 + }, + { + "epoch": 0.09826848353537343, + "grad_norm": 1.6434097290039062, + "learning_rate": 0.0001975661809179075, + "loss": 1.6864, + "step": 2744 + }, + { + "epoch": 0.09830429566494171, + "grad_norm": 1.3108112812042236, + "learning_rate": 0.0001975636368216933, + "loss": 1.4519, + "step": 2745 + }, + { + "epoch": 0.09834010779451, + "grad_norm": 1.4596123695373535, + "learning_rate": 0.0001975610914128862, + "loss": 1.8373, + "step": 2746 + }, + { + "epoch": 0.09837591992407829, + "grad_norm": 1.9536592960357666, + "learning_rate": 0.00019755854469152045, + "loss": 1.2695, + "step": 2747 + }, + { + "epoch": 0.09841173205364658, + "grad_norm": 1.4774552583694458, + "learning_rate": 0.00019755599665763037, + "loss": 1.4301, + "step": 2748 + }, + { + "epoch": 0.09844754418321486, + "grad_norm": 1.8306077718734741, + "learning_rate": 0.00019755344731125013, + "loss": 1.7258, + "step": 2749 + }, + { + "epoch": 0.09848335631278314, + "grad_norm": 1.3312674760818481, + "learning_rate": 0.00019755089665241413, + "loss": 1.6966, + "step": 2750 + }, + { + "epoch": 0.09851916844235142, + "grad_norm": 1.9812297821044922, + "learning_rate": 0.00019754834468115664, + "loss": 1.7699, + "step": 2751 + }, + { + "epoch": 0.0985549805719197, + "grad_norm": 1.5077332258224487, + "learning_rate": 0.00019754579139751198, + "loss": 1.82, + "step": 2752 + }, + { + "epoch": 0.09859079270148799, + "grad_norm": 1.7618402242660522, + "learning_rate": 0.00019754323680151457, + "loss": 1.4261, + "step": 2753 + }, + { + "epoch": 0.09862660483105627, + "grad_norm": 2.45039439201355, + "learning_rate": 0.00019754068089319869, + "loss": 1.6046, + "step": 2754 + }, + { + "epoch": 0.09866241696062457, + "grad_norm": 1.6822230815887451, + "learning_rate": 0.00019753812367259878, + "loss": 1.4612, + "step": 2755 + }, + { + "epoch": 0.09869822909019285, + "grad_norm": 1.5892387628555298, + "learning_rate": 0.00019753556513974922, + "loss": 1.5376, + "step": 2756 + }, + { + "epoch": 0.09873404121976113, + "grad_norm": 1.8028465509414673, + "learning_rate": 0.00019753300529468446, + "loss": 1.6662, + "step": 2757 + }, + { + "epoch": 0.09876985334932942, + "grad_norm": 1.4199618101119995, + "learning_rate": 0.00019753044413743892, + "loss": 1.7844, + "step": 2758 + }, + { + "epoch": 0.0988056654788977, + "grad_norm": 2.244751453399658, + "learning_rate": 0.00019752788166804702, + "loss": 1.7328, + "step": 2759 + }, + { + "epoch": 0.09884147760846598, + "grad_norm": 1.673805594444275, + "learning_rate": 0.0001975253178865433, + "loss": 1.6421, + "step": 2760 + }, + { + "epoch": 0.09887728973803427, + "grad_norm": 1.4417093992233276, + "learning_rate": 0.00019752275279296227, + "loss": 1.6409, + "step": 2761 + }, + { + "epoch": 0.09891310186760256, + "grad_norm": 1.2416033744812012, + "learning_rate": 0.00019752018638733836, + "loss": 1.673, + "step": 2762 + }, + { + "epoch": 0.09894891399717085, + "grad_norm": 1.481547236442566, + "learning_rate": 0.00019751761866970612, + "loss": 1.7885, + "step": 2763 + }, + { + "epoch": 0.09898472612673913, + "grad_norm": 1.45943284034729, + "learning_rate": 0.00019751504964010016, + "loss": 1.9124, + "step": 2764 + }, + { + "epoch": 0.09902053825630741, + "grad_norm": 1.5683441162109375, + "learning_rate": 0.00019751247929855495, + "loss": 1.3896, + "step": 2765 + }, + { + "epoch": 0.0990563503858757, + "grad_norm": 1.5302742719650269, + "learning_rate": 0.0001975099076451051, + "loss": 1.5202, + "step": 2766 + }, + { + "epoch": 0.09909216251544398, + "grad_norm": 2.1754040718078613, + "learning_rate": 0.00019750733467978525, + "loss": 1.6422, + "step": 2767 + }, + { + "epoch": 0.09912797464501226, + "grad_norm": 2.2145087718963623, + "learning_rate": 0.00019750476040262998, + "loss": 2.0339, + "step": 2768 + }, + { + "epoch": 0.09916378677458056, + "grad_norm": 1.463472843170166, + "learning_rate": 0.00019750218481367392, + "loss": 1.6032, + "step": 2769 + }, + { + "epoch": 0.09919959890414884, + "grad_norm": 1.4679081439971924, + "learning_rate": 0.00019749960791295174, + "loss": 1.702, + "step": 2770 + }, + { + "epoch": 0.09923541103371712, + "grad_norm": 1.6745630502700806, + "learning_rate": 0.0001974970297004981, + "loss": 1.872, + "step": 2771 + }, + { + "epoch": 0.0992712231632854, + "grad_norm": 1.3749425411224365, + "learning_rate": 0.0001974944501763477, + "loss": 1.5595, + "step": 2772 + }, + { + "epoch": 0.09930703529285369, + "grad_norm": 1.8216643333435059, + "learning_rate": 0.0001974918693405352, + "loss": 1.5808, + "step": 2773 + }, + { + "epoch": 0.09934284742242197, + "grad_norm": 1.7504353523254395, + "learning_rate": 0.0001974892871930954, + "loss": 1.6087, + "step": 2774 + }, + { + "epoch": 0.09937865955199025, + "grad_norm": 1.5744881629943848, + "learning_rate": 0.00019748670373406294, + "loss": 1.7731, + "step": 2775 + }, + { + "epoch": 0.09941447168155855, + "grad_norm": 2.4953949451446533, + "learning_rate": 0.00019748411896347267, + "loss": 1.6093, + "step": 2776 + }, + { + "epoch": 0.09945028381112683, + "grad_norm": 1.4541890621185303, + "learning_rate": 0.00019748153288135932, + "loss": 1.7734, + "step": 2777 + }, + { + "epoch": 0.09948609594069512, + "grad_norm": 1.250166654586792, + "learning_rate": 0.0001974789454877577, + "loss": 1.3861, + "step": 2778 + }, + { + "epoch": 0.0995219080702634, + "grad_norm": 1.8420655727386475, + "learning_rate": 0.0001974763567827026, + "loss": 1.5565, + "step": 2779 + }, + { + "epoch": 0.09955772019983168, + "grad_norm": 1.5533993244171143, + "learning_rate": 0.00019747376676622878, + "loss": 1.5603, + "step": 2780 + }, + { + "epoch": 0.09959353232939996, + "grad_norm": 1.4248117208480835, + "learning_rate": 0.00019747117543837125, + "loss": 1.8146, + "step": 2781 + }, + { + "epoch": 0.09962934445896825, + "grad_norm": 1.520105242729187, + "learning_rate": 0.00019746858279916476, + "loss": 1.3316, + "step": 2782 + }, + { + "epoch": 0.09966515658853654, + "grad_norm": 1.8025250434875488, + "learning_rate": 0.0001974659888486442, + "loss": 1.903, + "step": 2783 + }, + { + "epoch": 0.09970096871810483, + "grad_norm": 1.3632969856262207, + "learning_rate": 0.0001974633935868445, + "loss": 1.41, + "step": 2784 + }, + { + "epoch": 0.09973678084767311, + "grad_norm": 1.2988522052764893, + "learning_rate": 0.00019746079701380055, + "loss": 1.6359, + "step": 2785 + }, + { + "epoch": 0.09977259297724139, + "grad_norm": 2.9737942218780518, + "learning_rate": 0.00019745819912954732, + "loss": 1.5823, + "step": 2786 + }, + { + "epoch": 0.09980840510680968, + "grad_norm": 1.4505422115325928, + "learning_rate": 0.00019745559993411966, + "loss": 1.5382, + "step": 2787 + }, + { + "epoch": 0.09984421723637796, + "grad_norm": 1.6142643690109253, + "learning_rate": 0.00019745299942755266, + "loss": 1.6859, + "step": 2788 + }, + { + "epoch": 0.09988002936594624, + "grad_norm": 1.6316615343093872, + "learning_rate": 0.00019745039760988127, + "loss": 1.6503, + "step": 2789 + }, + { + "epoch": 0.09991584149551454, + "grad_norm": 2.2728500366210938, + "learning_rate": 0.00019744779448114047, + "loss": 1.2704, + "step": 2790 + }, + { + "epoch": 0.09995165362508282, + "grad_norm": 1.512316346168518, + "learning_rate": 0.00019744519004136527, + "loss": 1.269, + "step": 2791 + }, + { + "epoch": 0.0999874657546511, + "grad_norm": 1.5691752433776855, + "learning_rate": 0.00019744258429059075, + "loss": 1.6717, + "step": 2792 + }, + { + "epoch": 0.10002327788421939, + "grad_norm": 1.5791586637496948, + "learning_rate": 0.00019743997722885198, + "loss": 1.6228, + "step": 2793 + }, + { + "epoch": 0.10005909001378767, + "grad_norm": 1.5813857316970825, + "learning_rate": 0.00019743736885618395, + "loss": 1.3792, + "step": 2794 + }, + { + "epoch": 0.10009490214335595, + "grad_norm": 2.116391897201538, + "learning_rate": 0.00019743475917262187, + "loss": 1.5369, + "step": 2795 + }, + { + "epoch": 0.10013071427292423, + "grad_norm": 1.7743563652038574, + "learning_rate": 0.00019743214817820074, + "loss": 1.677, + "step": 2796 + }, + { + "epoch": 0.10016652640249253, + "grad_norm": 1.637967586517334, + "learning_rate": 0.00019742953587295573, + "loss": 1.6838, + "step": 2797 + }, + { + "epoch": 0.10020233853206081, + "grad_norm": 1.7225396633148193, + "learning_rate": 0.000197426922256922, + "loss": 1.7073, + "step": 2798 + }, + { + "epoch": 0.1002381506616291, + "grad_norm": 1.4051920175552368, + "learning_rate": 0.00019742430733013473, + "loss": 1.3682, + "step": 2799 + }, + { + "epoch": 0.10027396279119738, + "grad_norm": 1.5960766077041626, + "learning_rate": 0.00019742169109262904, + "loss": 1.8122, + "step": 2800 + }, + { + "epoch": 0.10030977492076566, + "grad_norm": 1.5492359399795532, + "learning_rate": 0.00019741907354444018, + "loss": 1.656, + "step": 2801 + }, + { + "epoch": 0.10034558705033395, + "grad_norm": 1.6034247875213623, + "learning_rate": 0.00019741645468560336, + "loss": 1.8924, + "step": 2802 + }, + { + "epoch": 0.10038139917990223, + "grad_norm": 2.0048415660858154, + "learning_rate": 0.00019741383451615376, + "loss": 1.6452, + "step": 2803 + }, + { + "epoch": 0.10041721130947051, + "grad_norm": 1.9224190711975098, + "learning_rate": 0.0001974112130361267, + "loss": 1.747, + "step": 2804 + }, + { + "epoch": 0.10045302343903881, + "grad_norm": 1.7946642637252808, + "learning_rate": 0.0001974085902455574, + "loss": 1.6644, + "step": 2805 + }, + { + "epoch": 0.10048883556860709, + "grad_norm": 1.3062515258789062, + "learning_rate": 0.0001974059661444812, + "loss": 1.6385, + "step": 2806 + }, + { + "epoch": 0.10052464769817537, + "grad_norm": 1.9280710220336914, + "learning_rate": 0.00019740334073293334, + "loss": 1.7102, + "step": 2807 + }, + { + "epoch": 0.10056045982774366, + "grad_norm": 1.6444439888000488, + "learning_rate": 0.0001974007140109492, + "loss": 1.645, + "step": 2808 + }, + { + "epoch": 0.10059627195731194, + "grad_norm": 2.063418388366699, + "learning_rate": 0.00019739808597856405, + "loss": 1.9611, + "step": 2809 + }, + { + "epoch": 0.10063208408688022, + "grad_norm": 1.3895442485809326, + "learning_rate": 0.0001973954566358133, + "loss": 1.5975, + "step": 2810 + }, + { + "epoch": 0.1006678962164485, + "grad_norm": 1.1396634578704834, + "learning_rate": 0.0001973928259827323, + "loss": 1.5006, + "step": 2811 + }, + { + "epoch": 0.1007037083460168, + "grad_norm": 2.067750930786133, + "learning_rate": 0.0001973901940193565, + "loss": 1.4993, + "step": 2812 + }, + { + "epoch": 0.10073952047558508, + "grad_norm": 2.5891499519348145, + "learning_rate": 0.00019738756074572127, + "loss": 1.7023, + "step": 2813 + }, + { + "epoch": 0.10077533260515337, + "grad_norm": 1.5110852718353271, + "learning_rate": 0.00019738492616186198, + "loss": 1.7311, + "step": 2814 + }, + { + "epoch": 0.10081114473472165, + "grad_norm": 1.5566610097885132, + "learning_rate": 0.00019738229026781414, + "loss": 1.4559, + "step": 2815 + }, + { + "epoch": 0.10084695686428993, + "grad_norm": 1.310876488685608, + "learning_rate": 0.00019737965306361322, + "loss": 1.5475, + "step": 2816 + }, + { + "epoch": 0.10088276899385822, + "grad_norm": 1.5883020162582397, + "learning_rate": 0.00019737701454929468, + "loss": 1.6171, + "step": 2817 + }, + { + "epoch": 0.1009185811234265, + "grad_norm": 1.3175413608551025, + "learning_rate": 0.000197374374724894, + "loss": 1.6509, + "step": 2818 + }, + { + "epoch": 0.1009543932529948, + "grad_norm": 1.287886619567871, + "learning_rate": 0.0001973717335904467, + "loss": 1.5315, + "step": 2819 + }, + { + "epoch": 0.10099020538256308, + "grad_norm": 2.7122435569763184, + "learning_rate": 0.00019736909114598833, + "loss": 1.8696, + "step": 2820 + }, + { + "epoch": 0.10102601751213136, + "grad_norm": 1.4318008422851562, + "learning_rate": 0.00019736644739155445, + "loss": 1.4318, + "step": 2821 + }, + { + "epoch": 0.10106182964169964, + "grad_norm": 1.3406039476394653, + "learning_rate": 0.00019736380232718062, + "loss": 1.6633, + "step": 2822 + }, + { + "epoch": 0.10109764177126793, + "grad_norm": 1.6335771083831787, + "learning_rate": 0.00019736115595290238, + "loss": 1.7371, + "step": 2823 + }, + { + "epoch": 0.10113345390083621, + "grad_norm": 1.8981465101242065, + "learning_rate": 0.00019735850826875542, + "loss": 1.596, + "step": 2824 + }, + { + "epoch": 0.10116926603040449, + "grad_norm": 1.0624903440475464, + "learning_rate": 0.0001973558592747753, + "loss": 1.4223, + "step": 2825 + }, + { + "epoch": 0.10120507815997279, + "grad_norm": 1.5947264432907104, + "learning_rate": 0.00019735320897099764, + "loss": 1.9196, + "step": 2826 + }, + { + "epoch": 0.10124089028954107, + "grad_norm": 1.1546331644058228, + "learning_rate": 0.00019735055735745817, + "loss": 1.6058, + "step": 2827 + }, + { + "epoch": 0.10127670241910935, + "grad_norm": 2.101027011871338, + "learning_rate": 0.0001973479044341925, + "loss": 1.6675, + "step": 2828 + }, + { + "epoch": 0.10131251454867764, + "grad_norm": 2.380446672439575, + "learning_rate": 0.00019734525020123639, + "loss": 1.8698, + "step": 2829 + }, + { + "epoch": 0.10134832667824592, + "grad_norm": 2.061709403991699, + "learning_rate": 0.00019734259465862546, + "loss": 1.6063, + "step": 2830 + }, + { + "epoch": 0.1013841388078142, + "grad_norm": 1.974861741065979, + "learning_rate": 0.0001973399378063955, + "loss": 1.3884, + "step": 2831 + }, + { + "epoch": 0.10141995093738249, + "grad_norm": 1.3313992023468018, + "learning_rate": 0.00019733727964458221, + "loss": 1.4632, + "step": 2832 + }, + { + "epoch": 0.10145576306695078, + "grad_norm": 1.6623539924621582, + "learning_rate": 0.00019733462017322142, + "loss": 1.8361, + "step": 2833 + }, + { + "epoch": 0.10149157519651907, + "grad_norm": 1.7105774879455566, + "learning_rate": 0.00019733195939234882, + "loss": 1.7204, + "step": 2834 + }, + { + "epoch": 0.10152738732608735, + "grad_norm": 1.2900364398956299, + "learning_rate": 0.00019732929730200031, + "loss": 1.6921, + "step": 2835 + }, + { + "epoch": 0.10156319945565563, + "grad_norm": 1.5834048986434937, + "learning_rate": 0.00019732663390221162, + "loss": 1.876, + "step": 2836 + }, + { + "epoch": 0.10159901158522391, + "grad_norm": 1.4001473188400269, + "learning_rate": 0.0001973239691930186, + "loss": 1.6452, + "step": 2837 + }, + { + "epoch": 0.1016348237147922, + "grad_norm": 2.6770882606506348, + "learning_rate": 0.00019732130317445714, + "loss": 1.6828, + "step": 2838 + }, + { + "epoch": 0.10167063584436048, + "grad_norm": 1.2641379833221436, + "learning_rate": 0.00019731863584656308, + "loss": 1.5711, + "step": 2839 + }, + { + "epoch": 0.10170644797392878, + "grad_norm": 1.3252092599868774, + "learning_rate": 0.0001973159672093723, + "loss": 1.8273, + "step": 2840 + }, + { + "epoch": 0.10174226010349706, + "grad_norm": 1.4117766618728638, + "learning_rate": 0.00019731329726292073, + "loss": 1.4361, + "step": 2841 + }, + { + "epoch": 0.10177807223306534, + "grad_norm": 1.7400825023651123, + "learning_rate": 0.00019731062600724424, + "loss": 1.5425, + "step": 2842 + }, + { + "epoch": 0.10181388436263362, + "grad_norm": 1.2664161920547485, + "learning_rate": 0.0001973079534423788, + "loss": 1.7458, + "step": 2843 + }, + { + "epoch": 0.10184969649220191, + "grad_norm": 1.4911015033721924, + "learning_rate": 0.00019730527956836035, + "loss": 1.6534, + "step": 2844 + }, + { + "epoch": 0.10188550862177019, + "grad_norm": 1.494632601737976, + "learning_rate": 0.00019730260438522492, + "loss": 1.4687, + "step": 2845 + }, + { + "epoch": 0.10192132075133847, + "grad_norm": 2.710315227508545, + "learning_rate": 0.00019729992789300845, + "loss": 1.6687, + "step": 2846 + }, + { + "epoch": 0.10195713288090676, + "grad_norm": 1.292092204093933, + "learning_rate": 0.00019729725009174693, + "loss": 1.6175, + "step": 2847 + }, + { + "epoch": 0.10199294501047505, + "grad_norm": 1.6304455995559692, + "learning_rate": 0.00019729457098147647, + "loss": 1.4029, + "step": 2848 + }, + { + "epoch": 0.10202875714004334, + "grad_norm": 1.7625586986541748, + "learning_rate": 0.000197291890562233, + "loss": 1.9333, + "step": 2849 + }, + { + "epoch": 0.10206456926961162, + "grad_norm": 2.0807228088378906, + "learning_rate": 0.00019728920883405263, + "loss": 1.5159, + "step": 2850 + }, + { + "epoch": 0.1021003813991799, + "grad_norm": 1.6510517597198486, + "learning_rate": 0.00019728652579697152, + "loss": 1.5753, + "step": 2851 + }, + { + "epoch": 0.10213619352874818, + "grad_norm": 1.9343862533569336, + "learning_rate": 0.00019728384145102564, + "loss": 1.615, + "step": 2852 + }, + { + "epoch": 0.10217200565831647, + "grad_norm": 1.403560996055603, + "learning_rate": 0.00019728115579625117, + "loss": 1.4167, + "step": 2853 + }, + { + "epoch": 0.10220781778788475, + "grad_norm": 1.5644052028656006, + "learning_rate": 0.00019727846883268427, + "loss": 1.7456, + "step": 2854 + }, + { + "epoch": 0.10224362991745305, + "grad_norm": 1.6829103231430054, + "learning_rate": 0.00019727578056036101, + "loss": 1.7578, + "step": 2855 + }, + { + "epoch": 0.10227944204702133, + "grad_norm": 1.861466646194458, + "learning_rate": 0.0001972730909793176, + "loss": 1.5786, + "step": 2856 + }, + { + "epoch": 0.10231525417658961, + "grad_norm": 1.6972744464874268, + "learning_rate": 0.00019727040008959024, + "loss": 1.6002, + "step": 2857 + }, + { + "epoch": 0.1023510663061579, + "grad_norm": 1.5097100734710693, + "learning_rate": 0.00019726770789121512, + "loss": 1.5889, + "step": 2858 + }, + { + "epoch": 0.10238687843572618, + "grad_norm": 1.354004979133606, + "learning_rate": 0.00019726501438422842, + "loss": 1.6782, + "step": 2859 + }, + { + "epoch": 0.10242269056529446, + "grad_norm": 1.700211763381958, + "learning_rate": 0.00019726231956866645, + "loss": 1.3758, + "step": 2860 + }, + { + "epoch": 0.10245850269486274, + "grad_norm": 2.7783384323120117, + "learning_rate": 0.00019725962344456543, + "loss": 1.5725, + "step": 2861 + }, + { + "epoch": 0.10249431482443104, + "grad_norm": 1.8978086709976196, + "learning_rate": 0.00019725692601196162, + "loss": 1.6209, + "step": 2862 + }, + { + "epoch": 0.10253012695399932, + "grad_norm": 2.073197841644287, + "learning_rate": 0.00019725422727089132, + "loss": 1.3348, + "step": 2863 + }, + { + "epoch": 0.1025659390835676, + "grad_norm": 1.9262858629226685, + "learning_rate": 0.00019725152722139085, + "loss": 1.8382, + "step": 2864 + }, + { + "epoch": 0.10260175121313589, + "grad_norm": 1.9772917032241821, + "learning_rate": 0.00019724882586349653, + "loss": 1.7399, + "step": 2865 + }, + { + "epoch": 0.10263756334270417, + "grad_norm": 1.9056910276412964, + "learning_rate": 0.00019724612319724468, + "loss": 1.5189, + "step": 2866 + }, + { + "epoch": 0.10267337547227245, + "grad_norm": 1.3025951385498047, + "learning_rate": 0.0001972434192226717, + "loss": 1.6968, + "step": 2867 + }, + { + "epoch": 0.10270918760184074, + "grad_norm": 1.772749900817871, + "learning_rate": 0.00019724071393981393, + "loss": 1.3956, + "step": 2868 + }, + { + "epoch": 0.10274499973140903, + "grad_norm": 2.921922445297241, + "learning_rate": 0.00019723800734870782, + "loss": 1.7918, + "step": 2869 + }, + { + "epoch": 0.10278081186097732, + "grad_norm": 1.5922387838363647, + "learning_rate": 0.00019723529944938974, + "loss": 1.8264, + "step": 2870 + }, + { + "epoch": 0.1028166239905456, + "grad_norm": 1.159803867340088, + "learning_rate": 0.00019723259024189612, + "loss": 1.4858, + "step": 2871 + }, + { + "epoch": 0.10285243612011388, + "grad_norm": 1.4431113004684448, + "learning_rate": 0.0001972298797262634, + "loss": 1.6408, + "step": 2872 + }, + { + "epoch": 0.10288824824968217, + "grad_norm": 1.711513876914978, + "learning_rate": 0.0001972271679025281, + "loss": 1.6958, + "step": 2873 + }, + { + "epoch": 0.10292406037925045, + "grad_norm": 1.448328971862793, + "learning_rate": 0.00019722445477072666, + "loss": 1.4791, + "step": 2874 + }, + { + "epoch": 0.10295987250881873, + "grad_norm": 1.213527798652649, + "learning_rate": 0.00019722174033089557, + "loss": 1.5364, + "step": 2875 + }, + { + "epoch": 0.10299568463838703, + "grad_norm": 1.4906288385391235, + "learning_rate": 0.0001972190245830714, + "loss": 1.4892, + "step": 2876 + }, + { + "epoch": 0.10303149676795531, + "grad_norm": 1.639464020729065, + "learning_rate": 0.00019721630752729064, + "loss": 1.7839, + "step": 2877 + }, + { + "epoch": 0.1030673088975236, + "grad_norm": 2.0337164402008057, + "learning_rate": 0.00019721358916358986, + "loss": 1.5573, + "step": 2878 + }, + { + "epoch": 0.10310312102709188, + "grad_norm": 1.2845540046691895, + "learning_rate": 0.00019721086949200565, + "loss": 1.6272, + "step": 2879 + }, + { + "epoch": 0.10313893315666016, + "grad_norm": 1.317577838897705, + "learning_rate": 0.00019720814851257457, + "loss": 1.5472, + "step": 2880 + }, + { + "epoch": 0.10317474528622844, + "grad_norm": 1.3626635074615479, + "learning_rate": 0.00019720542622533323, + "loss": 1.6389, + "step": 2881 + }, + { + "epoch": 0.10321055741579672, + "grad_norm": 1.4564090967178345, + "learning_rate": 0.0001972027026303183, + "loss": 1.5697, + "step": 2882 + }, + { + "epoch": 0.10324636954536502, + "grad_norm": 1.459882140159607, + "learning_rate": 0.00019719997772756637, + "loss": 1.4884, + "step": 2883 + }, + { + "epoch": 0.1032821816749333, + "grad_norm": 1.6414357423782349, + "learning_rate": 0.00019719725151711413, + "loss": 2.0682, + "step": 2884 + }, + { + "epoch": 0.10331799380450159, + "grad_norm": 1.225942850112915, + "learning_rate": 0.00019719452399899823, + "loss": 1.3942, + "step": 2885 + }, + { + "epoch": 0.10335380593406987, + "grad_norm": 1.67031991481781, + "learning_rate": 0.00019719179517325538, + "loss": 1.4331, + "step": 2886 + }, + { + "epoch": 0.10338961806363815, + "grad_norm": 2.3105432987213135, + "learning_rate": 0.00019718906503992233, + "loss": 1.7211, + "step": 2887 + }, + { + "epoch": 0.10342543019320644, + "grad_norm": 1.6651228666305542, + "learning_rate": 0.00019718633359903573, + "loss": 1.3568, + "step": 2888 + }, + { + "epoch": 0.10346124232277472, + "grad_norm": 2.5842862129211426, + "learning_rate": 0.00019718360085063238, + "loss": 1.6142, + "step": 2889 + }, + { + "epoch": 0.10349705445234302, + "grad_norm": 1.3132528066635132, + "learning_rate": 0.00019718086679474905, + "loss": 1.5248, + "step": 2890 + }, + { + "epoch": 0.1035328665819113, + "grad_norm": 1.5135085582733154, + "learning_rate": 0.0001971781314314225, + "loss": 1.4578, + "step": 2891 + }, + { + "epoch": 0.10356867871147958, + "grad_norm": 1.8713313341140747, + "learning_rate": 0.00019717539476068959, + "loss": 1.5403, + "step": 2892 + }, + { + "epoch": 0.10360449084104786, + "grad_norm": 1.3445090055465698, + "learning_rate": 0.00019717265678258702, + "loss": 1.6932, + "step": 2893 + }, + { + "epoch": 0.10364030297061615, + "grad_norm": 2.2110817432403564, + "learning_rate": 0.00019716991749715174, + "loss": 1.6663, + "step": 2894 + }, + { + "epoch": 0.10367611510018443, + "grad_norm": 1.4373912811279297, + "learning_rate": 0.00019716717690442055, + "loss": 1.6872, + "step": 2895 + }, + { + "epoch": 0.10371192722975271, + "grad_norm": 1.3239036798477173, + "learning_rate": 0.00019716443500443034, + "loss": 1.4225, + "step": 2896 + }, + { + "epoch": 0.103747739359321, + "grad_norm": 1.6182363033294678, + "learning_rate": 0.00019716169179721799, + "loss": 1.7373, + "step": 2897 + }, + { + "epoch": 0.10378355148888929, + "grad_norm": 1.3850167989730835, + "learning_rate": 0.00019715894728282037, + "loss": 1.5345, + "step": 2898 + }, + { + "epoch": 0.10381936361845757, + "grad_norm": 1.4673134088516235, + "learning_rate": 0.00019715620146127448, + "loss": 1.5932, + "step": 2899 + }, + { + "epoch": 0.10385517574802586, + "grad_norm": 1.7944717407226562, + "learning_rate": 0.0001971534543326172, + "loss": 1.6443, + "step": 2900 + }, + { + "epoch": 0.10389098787759414, + "grad_norm": 1.4910426139831543, + "learning_rate": 0.0001971507058968855, + "loss": 1.7324, + "step": 2901 + }, + { + "epoch": 0.10392680000716242, + "grad_norm": 2.181478261947632, + "learning_rate": 0.00019714795615411644, + "loss": 1.6161, + "step": 2902 + }, + { + "epoch": 0.1039626121367307, + "grad_norm": 1.2145978212356567, + "learning_rate": 0.00019714520510434686, + "loss": 1.5633, + "step": 2903 + }, + { + "epoch": 0.10399842426629899, + "grad_norm": 1.4202697277069092, + "learning_rate": 0.0001971424527476139, + "loss": 1.3827, + "step": 2904 + }, + { + "epoch": 0.10403423639586729, + "grad_norm": 2.2743844985961914, + "learning_rate": 0.0001971396990839545, + "loss": 1.6353, + "step": 2905 + }, + { + "epoch": 0.10407004852543557, + "grad_norm": 1.5119394063949585, + "learning_rate": 0.0001971369441134058, + "loss": 1.4954, + "step": 2906 + }, + { + "epoch": 0.10410586065500385, + "grad_norm": 2.039295196533203, + "learning_rate": 0.00019713418783600477, + "loss": 1.4147, + "step": 2907 + }, + { + "epoch": 0.10414167278457213, + "grad_norm": 1.9588508605957031, + "learning_rate": 0.00019713143025178856, + "loss": 1.7196, + "step": 2908 + }, + { + "epoch": 0.10417748491414042, + "grad_norm": 1.776458978652954, + "learning_rate": 0.00019712867136079427, + "loss": 1.5977, + "step": 2909 + }, + { + "epoch": 0.1042132970437087, + "grad_norm": 1.6060348749160767, + "learning_rate": 0.00019712591116305896, + "loss": 1.8558, + "step": 2910 + }, + { + "epoch": 0.10424910917327698, + "grad_norm": 1.828564167022705, + "learning_rate": 0.0001971231496586198, + "loss": 1.6377, + "step": 2911 + }, + { + "epoch": 0.10428492130284528, + "grad_norm": 2.2467479705810547, + "learning_rate": 0.00019712038684751394, + "loss": 1.6065, + "step": 2912 + }, + { + "epoch": 0.10432073343241356, + "grad_norm": 1.4483753442764282, + "learning_rate": 0.0001971176227297786, + "loss": 1.4403, + "step": 2913 + }, + { + "epoch": 0.10435654556198184, + "grad_norm": 1.480074405670166, + "learning_rate": 0.00019711485730545086, + "loss": 1.4702, + "step": 2914 + }, + { + "epoch": 0.10439235769155013, + "grad_norm": 2.7046217918395996, + "learning_rate": 0.000197112090574568, + "loss": 1.5385, + "step": 2915 + }, + { + "epoch": 0.10442816982111841, + "grad_norm": 1.4356821775436401, + "learning_rate": 0.00019710932253716722, + "loss": 1.2486, + "step": 2916 + }, + { + "epoch": 0.1044639819506867, + "grad_norm": 1.5573153495788574, + "learning_rate": 0.00019710655319328578, + "loss": 1.8457, + "step": 2917 + }, + { + "epoch": 0.10449979408025498, + "grad_norm": 1.470349669456482, + "learning_rate": 0.00019710378254296092, + "loss": 2.0201, + "step": 2918 + }, + { + "epoch": 0.10453560620982327, + "grad_norm": 1.6238548755645752, + "learning_rate": 0.00019710101058622993, + "loss": 1.279, + "step": 2919 + }, + { + "epoch": 0.10457141833939156, + "grad_norm": 1.5568904876708984, + "learning_rate": 0.00019709823732313008, + "loss": 1.6239, + "step": 2920 + }, + { + "epoch": 0.10460723046895984, + "grad_norm": 2.827681064605713, + "learning_rate": 0.0001970954627536987, + "loss": 1.8256, + "step": 2921 + }, + { + "epoch": 0.10464304259852812, + "grad_norm": 1.9860001802444458, + "learning_rate": 0.00019709268687797312, + "loss": 1.6688, + "step": 2922 + }, + { + "epoch": 0.1046788547280964, + "grad_norm": 1.3693649768829346, + "learning_rate": 0.00019708990969599067, + "loss": 1.4567, + "step": 2923 + }, + { + "epoch": 0.10471466685766469, + "grad_norm": 1.5036529302597046, + "learning_rate": 0.00019708713120778873, + "loss": 1.7974, + "step": 2924 + }, + { + "epoch": 0.10475047898723297, + "grad_norm": 1.446396827697754, + "learning_rate": 0.00019708435141340465, + "loss": 1.5247, + "step": 2925 + }, + { + "epoch": 0.10478629111680127, + "grad_norm": 1.1922216415405273, + "learning_rate": 0.00019708157031287588, + "loss": 1.8338, + "step": 2926 + }, + { + "epoch": 0.10482210324636955, + "grad_norm": 1.8486875295639038, + "learning_rate": 0.0001970787879062398, + "loss": 1.359, + "step": 2927 + }, + { + "epoch": 0.10485791537593783, + "grad_norm": 1.4665194749832153, + "learning_rate": 0.00019707600419353383, + "loss": 1.6474, + "step": 2928 + }, + { + "epoch": 0.10489372750550612, + "grad_norm": 1.7647480964660645, + "learning_rate": 0.00019707321917479547, + "loss": 1.6358, + "step": 2929 + }, + { + "epoch": 0.1049295396350744, + "grad_norm": 1.694216012954712, + "learning_rate": 0.00019707043285006214, + "loss": 1.618, + "step": 2930 + }, + { + "epoch": 0.10496535176464268, + "grad_norm": 1.8111251592636108, + "learning_rate": 0.00019706764521937138, + "loss": 1.6625, + "step": 2931 + }, + { + "epoch": 0.10500116389421096, + "grad_norm": 1.5972100496292114, + "learning_rate": 0.00019706485628276062, + "loss": 1.5874, + "step": 2932 + }, + { + "epoch": 0.10503697602377926, + "grad_norm": 1.9588450193405151, + "learning_rate": 0.00019706206604026746, + "loss": 1.5872, + "step": 2933 + }, + { + "epoch": 0.10507278815334754, + "grad_norm": 1.1568310260772705, + "learning_rate": 0.00019705927449192937, + "loss": 1.4907, + "step": 2934 + }, + { + "epoch": 0.10510860028291583, + "grad_norm": 1.5908012390136719, + "learning_rate": 0.00019705648163778397, + "loss": 1.5814, + "step": 2935 + }, + { + "epoch": 0.10514441241248411, + "grad_norm": 1.7401422262191772, + "learning_rate": 0.00019705368747786878, + "loss": 1.9683, + "step": 2936 + }, + { + "epoch": 0.10518022454205239, + "grad_norm": 1.7295998334884644, + "learning_rate": 0.00019705089201222143, + "loss": 1.3394, + "step": 2937 + }, + { + "epoch": 0.10521603667162067, + "grad_norm": 1.5581072568893433, + "learning_rate": 0.00019704809524087952, + "loss": 1.7824, + "step": 2938 + }, + { + "epoch": 0.10525184880118896, + "grad_norm": 1.2611238956451416, + "learning_rate": 0.00019704529716388068, + "loss": 1.5758, + "step": 2939 + }, + { + "epoch": 0.10528766093075725, + "grad_norm": 1.469044804573059, + "learning_rate": 0.00019704249778126253, + "loss": 1.5392, + "step": 2940 + }, + { + "epoch": 0.10532347306032554, + "grad_norm": 1.7068703174591064, + "learning_rate": 0.00019703969709306273, + "loss": 1.6083, + "step": 2941 + }, + { + "epoch": 0.10535928518989382, + "grad_norm": 1.3602216243743896, + "learning_rate": 0.000197036895099319, + "loss": 1.9103, + "step": 2942 + }, + { + "epoch": 0.1053950973194621, + "grad_norm": 1.29122793674469, + "learning_rate": 0.000197034091800069, + "loss": 1.9111, + "step": 2943 + }, + { + "epoch": 0.10543090944903039, + "grad_norm": 1.4069241285324097, + "learning_rate": 0.00019703128719535047, + "loss": 1.6298, + "step": 2944 + }, + { + "epoch": 0.10546672157859867, + "grad_norm": 1.6360414028167725, + "learning_rate": 0.00019702848128520112, + "loss": 1.5577, + "step": 2945 + }, + { + "epoch": 0.10550253370816695, + "grad_norm": 2.210772752761841, + "learning_rate": 0.00019702567406965874, + "loss": 1.8814, + "step": 2946 + }, + { + "epoch": 0.10553834583773523, + "grad_norm": 1.9793405532836914, + "learning_rate": 0.00019702286554876107, + "loss": 1.8386, + "step": 2947 + }, + { + "epoch": 0.10557415796730353, + "grad_norm": 1.657800316810608, + "learning_rate": 0.00019702005572254586, + "loss": 1.603, + "step": 2948 + }, + { + "epoch": 0.10560997009687181, + "grad_norm": 2.0030272006988525, + "learning_rate": 0.00019701724459105096, + "loss": 1.615, + "step": 2949 + }, + { + "epoch": 0.1056457822264401, + "grad_norm": 1.4756288528442383, + "learning_rate": 0.0001970144321543142, + "loss": 1.713, + "step": 2950 + }, + { + "epoch": 0.10568159435600838, + "grad_norm": 1.9561413526535034, + "learning_rate": 0.00019701161841237337, + "loss": 1.5548, + "step": 2951 + }, + { + "epoch": 0.10571740648557666, + "grad_norm": 2.031585216522217, + "learning_rate": 0.00019700880336526635, + "loss": 1.9009, + "step": 2952 + }, + { + "epoch": 0.10575321861514494, + "grad_norm": 1.3856836557388306, + "learning_rate": 0.000197005987013031, + "loss": 1.9266, + "step": 2953 + }, + { + "epoch": 0.10578903074471323, + "grad_norm": 1.4809298515319824, + "learning_rate": 0.00019700316935570525, + "loss": 1.7208, + "step": 2954 + }, + { + "epoch": 0.10582484287428152, + "grad_norm": 1.3451876640319824, + "learning_rate": 0.00019700035039332697, + "loss": 1.6572, + "step": 2955 + }, + { + "epoch": 0.10586065500384981, + "grad_norm": 1.6890588998794556, + "learning_rate": 0.00019699753012593412, + "loss": 1.6136, + "step": 2956 + }, + { + "epoch": 0.10589646713341809, + "grad_norm": 1.5177700519561768, + "learning_rate": 0.0001969947085535646, + "loss": 1.2336, + "step": 2957 + }, + { + "epoch": 0.10593227926298637, + "grad_norm": 2.2147626876831055, + "learning_rate": 0.00019699188567625639, + "loss": 1.7097, + "step": 2958 + }, + { + "epoch": 0.10596809139255466, + "grad_norm": 1.2537357807159424, + "learning_rate": 0.00019698906149404746, + "loss": 1.6603, + "step": 2959 + }, + { + "epoch": 0.10600390352212294, + "grad_norm": 1.5088328123092651, + "learning_rate": 0.00019698623600697583, + "loss": 1.6384, + "step": 2960 + }, + { + "epoch": 0.10603971565169122, + "grad_norm": 1.7185791730880737, + "learning_rate": 0.0001969834092150795, + "loss": 1.9186, + "step": 2961 + }, + { + "epoch": 0.10607552778125952, + "grad_norm": 1.3850542306900024, + "learning_rate": 0.0001969805811183965, + "loss": 1.7726, + "step": 2962 + }, + { + "epoch": 0.1061113399108278, + "grad_norm": 1.59743332862854, + "learning_rate": 0.00019697775171696486, + "loss": 1.6747, + "step": 2963 + }, + { + "epoch": 0.10614715204039608, + "grad_norm": 1.8517948389053345, + "learning_rate": 0.00019697492101082266, + "loss": 1.6965, + "step": 2964 + }, + { + "epoch": 0.10618296416996437, + "grad_norm": 1.5334497690200806, + "learning_rate": 0.000196972089000008, + "loss": 1.6025, + "step": 2965 + }, + { + "epoch": 0.10621877629953265, + "grad_norm": 1.2686636447906494, + "learning_rate": 0.00019696925568455894, + "loss": 1.5366, + "step": 2966 + }, + { + "epoch": 0.10625458842910093, + "grad_norm": 1.518660306930542, + "learning_rate": 0.00019696642106451368, + "loss": 1.8495, + "step": 2967 + }, + { + "epoch": 0.10629040055866922, + "grad_norm": 1.365371584892273, + "learning_rate": 0.00019696358513991027, + "loss": 1.8094, + "step": 2968 + }, + { + "epoch": 0.10632621268823751, + "grad_norm": 1.5992499589920044, + "learning_rate": 0.0001969607479107869, + "loss": 1.6623, + "step": 2969 + }, + { + "epoch": 0.1063620248178058, + "grad_norm": 1.7085697650909424, + "learning_rate": 0.00019695790937718176, + "loss": 1.8765, + "step": 2970 + }, + { + "epoch": 0.10639783694737408, + "grad_norm": 2.420846700668335, + "learning_rate": 0.00019695506953913298, + "loss": 2.0682, + "step": 2971 + }, + { + "epoch": 0.10643364907694236, + "grad_norm": 2.160022735595703, + "learning_rate": 0.0001969522283966788, + "loss": 1.9651, + "step": 2972 + }, + { + "epoch": 0.10646946120651064, + "grad_norm": 1.9768903255462646, + "learning_rate": 0.00019694938594985747, + "loss": 1.5112, + "step": 2973 + }, + { + "epoch": 0.10650527333607893, + "grad_norm": 1.8344073295593262, + "learning_rate": 0.00019694654219870722, + "loss": 1.7646, + "step": 2974 + }, + { + "epoch": 0.10654108546564721, + "grad_norm": 1.4202852249145508, + "learning_rate": 0.00019694369714326625, + "loss": 1.6141, + "step": 2975 + }, + { + "epoch": 0.1065768975952155, + "grad_norm": 1.8785535097122192, + "learning_rate": 0.00019694085078357293, + "loss": 1.7087, + "step": 2976 + }, + { + "epoch": 0.10661270972478379, + "grad_norm": 1.394516944885254, + "learning_rate": 0.00019693800311966549, + "loss": 1.5792, + "step": 2977 + }, + { + "epoch": 0.10664852185435207, + "grad_norm": 1.6537799835205078, + "learning_rate": 0.00019693515415158223, + "loss": 1.6773, + "step": 2978 + }, + { + "epoch": 0.10668433398392035, + "grad_norm": 1.922308325767517, + "learning_rate": 0.00019693230387936154, + "loss": 1.7615, + "step": 2979 + }, + { + "epoch": 0.10672014611348864, + "grad_norm": 1.586806058883667, + "learning_rate": 0.00019692945230304174, + "loss": 1.7742, + "step": 2980 + }, + { + "epoch": 0.10675595824305692, + "grad_norm": 2.630951166152954, + "learning_rate": 0.00019692659942266118, + "loss": 2.197, + "step": 2981 + }, + { + "epoch": 0.1067917703726252, + "grad_norm": 1.6085656881332397, + "learning_rate": 0.00019692374523825823, + "loss": 1.4823, + "step": 2982 + }, + { + "epoch": 0.1068275825021935, + "grad_norm": 1.7773700952529907, + "learning_rate": 0.00019692088974987133, + "loss": 1.8165, + "step": 2983 + }, + { + "epoch": 0.10686339463176178, + "grad_norm": 1.6625611782073975, + "learning_rate": 0.0001969180329575389, + "loss": 1.6183, + "step": 2984 + }, + { + "epoch": 0.10689920676133007, + "grad_norm": 3.302849292755127, + "learning_rate": 0.0001969151748612993, + "loss": 1.8682, + "step": 2985 + }, + { + "epoch": 0.10693501889089835, + "grad_norm": 1.2780555486679077, + "learning_rate": 0.00019691231546119107, + "loss": 1.3038, + "step": 2986 + }, + { + "epoch": 0.10697083102046663, + "grad_norm": 1.9470473527908325, + "learning_rate": 0.00019690945475725266, + "loss": 1.4459, + "step": 2987 + }, + { + "epoch": 0.10700664315003491, + "grad_norm": 1.5665614604949951, + "learning_rate": 0.0001969065927495225, + "loss": 1.6486, + "step": 2988 + }, + { + "epoch": 0.1070424552796032, + "grad_norm": 1.4052174091339111, + "learning_rate": 0.00019690372943803914, + "loss": 1.7583, + "step": 2989 + }, + { + "epoch": 0.1070782674091715, + "grad_norm": 2.11560320854187, + "learning_rate": 0.00019690086482284112, + "loss": 1.4927, + "step": 2990 + }, + { + "epoch": 0.10711407953873978, + "grad_norm": 1.2599753141403198, + "learning_rate": 0.00019689799890396694, + "loss": 1.5102, + "step": 2991 + }, + { + "epoch": 0.10714989166830806, + "grad_norm": 2.6709375381469727, + "learning_rate": 0.0001968951316814552, + "loss": 1.7188, + "step": 2992 + }, + { + "epoch": 0.10718570379787634, + "grad_norm": 2.060213565826416, + "learning_rate": 0.0001968922631553444, + "loss": 1.5915, + "step": 2993 + }, + { + "epoch": 0.10722151592744462, + "grad_norm": 1.579801082611084, + "learning_rate": 0.00019688939332567325, + "loss": 1.5262, + "step": 2994 + }, + { + "epoch": 0.10725732805701291, + "grad_norm": 1.5131771564483643, + "learning_rate": 0.00019688652219248021, + "loss": 1.2072, + "step": 2995 + }, + { + "epoch": 0.10729314018658119, + "grad_norm": 1.1854443550109863, + "learning_rate": 0.00019688364975580406, + "loss": 1.556, + "step": 2996 + }, + { + "epoch": 0.10732895231614947, + "grad_norm": 1.2820968627929688, + "learning_rate": 0.00019688077601568332, + "loss": 1.4393, + "step": 2997 + }, + { + "epoch": 0.10736476444571777, + "grad_norm": 1.5455936193466187, + "learning_rate": 0.00019687790097215675, + "loss": 1.6942, + "step": 2998 + }, + { + "epoch": 0.10740057657528605, + "grad_norm": 1.7165623903274536, + "learning_rate": 0.00019687502462526296, + "loss": 1.8651, + "step": 2999 + }, + { + "epoch": 0.10743638870485434, + "grad_norm": 1.641993522644043, + "learning_rate": 0.00019687214697504068, + "loss": 1.7097, + "step": 3000 + }, + { + "epoch": 0.10747220083442262, + "grad_norm": 1.5092154741287231, + "learning_rate": 0.00019686926802152862, + "loss": 1.6165, + "step": 3001 + }, + { + "epoch": 0.1075080129639909, + "grad_norm": 1.4830560684204102, + "learning_rate": 0.0001968663877647655, + "loss": 1.6553, + "step": 3002 + }, + { + "epoch": 0.10754382509355918, + "grad_norm": 1.6284724473953247, + "learning_rate": 0.0001968635062047901, + "loss": 1.7557, + "step": 3003 + }, + { + "epoch": 0.10757963722312747, + "grad_norm": 1.5482624769210815, + "learning_rate": 0.00019686062334164114, + "loss": 1.531, + "step": 3004 + }, + { + "epoch": 0.10761544935269576, + "grad_norm": 1.963331937789917, + "learning_rate": 0.00019685773917535747, + "loss": 1.5902, + "step": 3005 + }, + { + "epoch": 0.10765126148226405, + "grad_norm": 1.459810495376587, + "learning_rate": 0.00019685485370597781, + "loss": 1.6869, + "step": 3006 + }, + { + "epoch": 0.10768707361183233, + "grad_norm": 1.9397635459899902, + "learning_rate": 0.00019685196693354108, + "loss": 1.9816, + "step": 3007 + }, + { + "epoch": 0.10772288574140061, + "grad_norm": 1.316998839378357, + "learning_rate": 0.00019684907885808602, + "loss": 1.4956, + "step": 3008 + }, + { + "epoch": 0.1077586978709689, + "grad_norm": 1.8278248310089111, + "learning_rate": 0.00019684618947965157, + "loss": 1.7439, + "step": 3009 + }, + { + "epoch": 0.10779451000053718, + "grad_norm": 1.5549050569534302, + "learning_rate": 0.00019684329879827655, + "loss": 1.7759, + "step": 3010 + }, + { + "epoch": 0.10783032213010546, + "grad_norm": 1.860183596611023, + "learning_rate": 0.00019684040681399988, + "loss": 1.7717, + "step": 3011 + }, + { + "epoch": 0.10786613425967376, + "grad_norm": 1.4960808753967285, + "learning_rate": 0.0001968375135268604, + "loss": 1.8338, + "step": 3012 + }, + { + "epoch": 0.10790194638924204, + "grad_norm": 2.194607734680176, + "learning_rate": 0.00019683461893689713, + "loss": 1.3071, + "step": 3013 + }, + { + "epoch": 0.10793775851881032, + "grad_norm": 1.5040713548660278, + "learning_rate": 0.00019683172304414895, + "loss": 1.9321, + "step": 3014 + }, + { + "epoch": 0.1079735706483786, + "grad_norm": 1.7146315574645996, + "learning_rate": 0.00019682882584865486, + "loss": 1.6084, + "step": 3015 + }, + { + "epoch": 0.10800938277794689, + "grad_norm": 1.6277421712875366, + "learning_rate": 0.0001968259273504538, + "loss": 1.296, + "step": 3016 + }, + { + "epoch": 0.10804519490751517, + "grad_norm": 3.6533689498901367, + "learning_rate": 0.0001968230275495848, + "loss": 1.5586, + "step": 3017 + }, + { + "epoch": 0.10808100703708345, + "grad_norm": 1.6158928871154785, + "learning_rate": 0.00019682012644608684, + "loss": 1.9503, + "step": 3018 + }, + { + "epoch": 0.10811681916665175, + "grad_norm": 1.6434288024902344, + "learning_rate": 0.000196817224039999, + "loss": 1.5552, + "step": 3019 + }, + { + "epoch": 0.10815263129622003, + "grad_norm": 1.395654320716858, + "learning_rate": 0.00019681432033136025, + "loss": 1.6248, + "step": 3020 + }, + { + "epoch": 0.10818844342578832, + "grad_norm": 1.5744826793670654, + "learning_rate": 0.00019681141532020973, + "loss": 1.4838, + "step": 3021 + }, + { + "epoch": 0.1082242555553566, + "grad_norm": 1.7468857765197754, + "learning_rate": 0.00019680850900658648, + "loss": 1.8954, + "step": 3022 + }, + { + "epoch": 0.10826006768492488, + "grad_norm": 2.2361481189727783, + "learning_rate": 0.00019680560139052962, + "loss": 1.5789, + "step": 3023 + }, + { + "epoch": 0.10829587981449317, + "grad_norm": 1.6525112390518188, + "learning_rate": 0.00019680269247207826, + "loss": 1.705, + "step": 3024 + }, + { + "epoch": 0.10833169194406145, + "grad_norm": 1.5771716833114624, + "learning_rate": 0.00019679978225127154, + "loss": 1.5043, + "step": 3025 + }, + { + "epoch": 0.10836750407362974, + "grad_norm": 1.917914867401123, + "learning_rate": 0.00019679687072814863, + "loss": 1.8099, + "step": 3026 + }, + { + "epoch": 0.10840331620319803, + "grad_norm": 1.7994537353515625, + "learning_rate": 0.00019679395790274867, + "loss": 1.8708, + "step": 3027 + }, + { + "epoch": 0.10843912833276631, + "grad_norm": 1.5328046083450317, + "learning_rate": 0.00019679104377511085, + "loss": 1.6289, + "step": 3028 + }, + { + "epoch": 0.1084749404623346, + "grad_norm": 1.5930429697036743, + "learning_rate": 0.0001967881283452744, + "loss": 1.6851, + "step": 3029 + }, + { + "epoch": 0.10851075259190288, + "grad_norm": 1.4349143505096436, + "learning_rate": 0.00019678521161327854, + "loss": 1.7717, + "step": 3030 + }, + { + "epoch": 0.10854656472147116, + "grad_norm": 1.4315009117126465, + "learning_rate": 0.0001967822935791625, + "loss": 1.8928, + "step": 3031 + }, + { + "epoch": 0.10858237685103944, + "grad_norm": 1.4336453676223755, + "learning_rate": 0.0001967793742429655, + "loss": 1.9097, + "step": 3032 + }, + { + "epoch": 0.10861818898060774, + "grad_norm": 1.6382025480270386, + "learning_rate": 0.00019677645360472693, + "loss": 1.6168, + "step": 3033 + }, + { + "epoch": 0.10865400111017602, + "grad_norm": 2.577434778213501, + "learning_rate": 0.00019677353166448595, + "loss": 1.7761, + "step": 3034 + }, + { + "epoch": 0.1086898132397443, + "grad_norm": 1.9029229879379272, + "learning_rate": 0.00019677060842228193, + "loss": 1.594, + "step": 3035 + }, + { + "epoch": 0.10872562536931259, + "grad_norm": 1.9777616262435913, + "learning_rate": 0.00019676768387815423, + "loss": 1.8283, + "step": 3036 + }, + { + "epoch": 0.10876143749888087, + "grad_norm": 1.5141868591308594, + "learning_rate": 0.00019676475803214217, + "loss": 1.3587, + "step": 3037 + }, + { + "epoch": 0.10879724962844915, + "grad_norm": 1.609864354133606, + "learning_rate": 0.0001967618308842851, + "loss": 1.7622, + "step": 3038 + }, + { + "epoch": 0.10883306175801744, + "grad_norm": 2.1204848289489746, + "learning_rate": 0.00019675890243462237, + "loss": 1.6448, + "step": 3039 + }, + { + "epoch": 0.10886887388758573, + "grad_norm": 1.144871473312378, + "learning_rate": 0.00019675597268319344, + "loss": 1.6545, + "step": 3040 + }, + { + "epoch": 0.10890468601715401, + "grad_norm": 1.7205898761749268, + "learning_rate": 0.00019675304163003772, + "loss": 1.7451, + "step": 3041 + }, + { + "epoch": 0.1089404981467223, + "grad_norm": 1.9791665077209473, + "learning_rate": 0.00019675010927519462, + "loss": 1.871, + "step": 3042 + }, + { + "epoch": 0.10897631027629058, + "grad_norm": 1.382914423942566, + "learning_rate": 0.0001967471756187036, + "loss": 1.6498, + "step": 3043 + }, + { + "epoch": 0.10901212240585886, + "grad_norm": 2.2286922931671143, + "learning_rate": 0.0001967442406606041, + "loss": 1.5953, + "step": 3044 + }, + { + "epoch": 0.10904793453542715, + "grad_norm": 1.5717313289642334, + "learning_rate": 0.00019674130440093567, + "loss": 1.7754, + "step": 3045 + }, + { + "epoch": 0.10908374666499543, + "grad_norm": 1.2834662199020386, + "learning_rate": 0.00019673836683973777, + "loss": 1.546, + "step": 3046 + }, + { + "epoch": 0.10911955879456371, + "grad_norm": 2.3408381938934326, + "learning_rate": 0.00019673542797704992, + "loss": 1.6439, + "step": 3047 + }, + { + "epoch": 0.10915537092413201, + "grad_norm": 2.09751558303833, + "learning_rate": 0.00019673248781291167, + "loss": 1.7954, + "step": 3048 + }, + { + "epoch": 0.10919118305370029, + "grad_norm": 1.5235075950622559, + "learning_rate": 0.00019672954634736257, + "loss": 2.0121, + "step": 3049 + }, + { + "epoch": 0.10922699518326857, + "grad_norm": 2.3699159622192383, + "learning_rate": 0.00019672660358044218, + "loss": 1.5584, + "step": 3050 + }, + { + "epoch": 0.10926280731283686, + "grad_norm": 1.5464084148406982, + "learning_rate": 0.00019672365951219013, + "loss": 1.2083, + "step": 3051 + }, + { + "epoch": 0.10929861944240514, + "grad_norm": 1.4740662574768066, + "learning_rate": 0.00019672071414264598, + "loss": 1.6632, + "step": 3052 + }, + { + "epoch": 0.10933443157197342, + "grad_norm": 1.5872660875320435, + "learning_rate": 0.0001967177674718494, + "loss": 1.7066, + "step": 3053 + }, + { + "epoch": 0.1093702437015417, + "grad_norm": 1.4718883037567139, + "learning_rate": 0.00019671481949984002, + "loss": 1.7492, + "step": 3054 + }, + { + "epoch": 0.10940605583111, + "grad_norm": 1.7044392824172974, + "learning_rate": 0.0001967118702266575, + "loss": 1.3809, + "step": 3055 + }, + { + "epoch": 0.10944186796067829, + "grad_norm": 1.6310226917266846, + "learning_rate": 0.0001967089196523415, + "loss": 1.8413, + "step": 3056 + }, + { + "epoch": 0.10947768009024657, + "grad_norm": 1.6687250137329102, + "learning_rate": 0.00019670596777693176, + "loss": 1.4604, + "step": 3057 + }, + { + "epoch": 0.10951349221981485, + "grad_norm": 1.9419200420379639, + "learning_rate": 0.00019670301460046795, + "loss": 1.7134, + "step": 3058 + }, + { + "epoch": 0.10954930434938313, + "grad_norm": 1.624732255935669, + "learning_rate": 0.0001967000601229898, + "loss": 1.6658, + "step": 3059 + }, + { + "epoch": 0.10958511647895142, + "grad_norm": 2.9077885150909424, + "learning_rate": 0.00019669710434453707, + "loss": 1.8957, + "step": 3060 + }, + { + "epoch": 0.1096209286085197, + "grad_norm": 1.5916715860366821, + "learning_rate": 0.00019669414726514956, + "loss": 1.8633, + "step": 3061 + }, + { + "epoch": 0.109656740738088, + "grad_norm": 1.584800124168396, + "learning_rate": 0.000196691188884867, + "loss": 1.3014, + "step": 3062 + }, + { + "epoch": 0.10969255286765628, + "grad_norm": 1.7780625820159912, + "learning_rate": 0.00019668822920372922, + "loss": 1.7299, + "step": 3063 + }, + { + "epoch": 0.10972836499722456, + "grad_norm": 1.7964451313018799, + "learning_rate": 0.00019668526822177605, + "loss": 1.5913, + "step": 3064 + }, + { + "epoch": 0.10976417712679284, + "grad_norm": 1.9102052450180054, + "learning_rate": 0.00019668230593904734, + "loss": 1.3066, + "step": 3065 + }, + { + "epoch": 0.10979998925636113, + "grad_norm": 1.4517691135406494, + "learning_rate": 0.00019667934235558285, + "loss": 1.4668, + "step": 3066 + }, + { + "epoch": 0.10983580138592941, + "grad_norm": 1.3721327781677246, + "learning_rate": 0.00019667637747142257, + "loss": 1.8355, + "step": 3067 + }, + { + "epoch": 0.10987161351549769, + "grad_norm": 2.1370456218719482, + "learning_rate": 0.0001966734112866063, + "loss": 1.386, + "step": 3068 + }, + { + "epoch": 0.10990742564506599, + "grad_norm": 1.892778992652893, + "learning_rate": 0.00019667044380117398, + "loss": 1.4856, + "step": 3069 + }, + { + "epoch": 0.10994323777463427, + "grad_norm": 1.4679338932037354, + "learning_rate": 0.00019666747501516553, + "loss": 1.7188, + "step": 3070 + }, + { + "epoch": 0.10997904990420256, + "grad_norm": 1.373197317123413, + "learning_rate": 0.00019666450492862093, + "loss": 1.3631, + "step": 3071 + }, + { + "epoch": 0.11001486203377084, + "grad_norm": 1.1402403116226196, + "learning_rate": 0.0001966615335415801, + "loss": 1.5227, + "step": 3072 + }, + { + "epoch": 0.11005067416333912, + "grad_norm": 1.6488516330718994, + "learning_rate": 0.000196658560854083, + "loss": 1.7497, + "step": 3073 + }, + { + "epoch": 0.1100864862929074, + "grad_norm": 1.3037346601486206, + "learning_rate": 0.00019665558686616965, + "loss": 1.8412, + "step": 3074 + }, + { + "epoch": 0.11012229842247569, + "grad_norm": 1.519590139389038, + "learning_rate": 0.00019665261157788004, + "loss": 1.7108, + "step": 3075 + }, + { + "epoch": 0.11015811055204398, + "grad_norm": 1.3518164157867432, + "learning_rate": 0.00019664963498925423, + "loss": 1.6536, + "step": 3076 + }, + { + "epoch": 0.11019392268161227, + "grad_norm": 3.0712289810180664, + "learning_rate": 0.00019664665710033226, + "loss": 1.6875, + "step": 3077 + }, + { + "epoch": 0.11022973481118055, + "grad_norm": 1.832874059677124, + "learning_rate": 0.0001966436779111542, + "loss": 1.8305, + "step": 3078 + }, + { + "epoch": 0.11026554694074883, + "grad_norm": 1.384520411491394, + "learning_rate": 0.00019664069742176006, + "loss": 1.6199, + "step": 3079 + }, + { + "epoch": 0.11030135907031711, + "grad_norm": 1.634929895401001, + "learning_rate": 0.00019663771563219006, + "loss": 1.6368, + "step": 3080 + }, + { + "epoch": 0.1103371711998854, + "grad_norm": 1.5089815855026245, + "learning_rate": 0.00019663473254248417, + "loss": 1.8269, + "step": 3081 + }, + { + "epoch": 0.11037298332945368, + "grad_norm": 1.8302271366119385, + "learning_rate": 0.00019663174815268266, + "loss": 1.7455, + "step": 3082 + }, + { + "epoch": 0.11040879545902198, + "grad_norm": 1.9949239492416382, + "learning_rate": 0.0001966287624628256, + "loss": 1.8868, + "step": 3083 + }, + { + "epoch": 0.11044460758859026, + "grad_norm": 1.749061107635498, + "learning_rate": 0.0001966257754729532, + "loss": 1.9831, + "step": 3084 + }, + { + "epoch": 0.11048041971815854, + "grad_norm": 1.5313340425491333, + "learning_rate": 0.00019662278718310562, + "loss": 1.6813, + "step": 3085 + }, + { + "epoch": 0.11051623184772683, + "grad_norm": 2.523949146270752, + "learning_rate": 0.0001966197975933231, + "loss": 1.4592, + "step": 3086 + }, + { + "epoch": 0.11055204397729511, + "grad_norm": 1.696785807609558, + "learning_rate": 0.0001966168067036458, + "loss": 1.4651, + "step": 3087 + }, + { + "epoch": 0.11058785610686339, + "grad_norm": 2.151517391204834, + "learning_rate": 0.000196613814514114, + "loss": 1.7816, + "step": 3088 + }, + { + "epoch": 0.11062366823643167, + "grad_norm": 1.1579859256744385, + "learning_rate": 0.00019661082102476795, + "loss": 1.4892, + "step": 3089 + }, + { + "epoch": 0.11065948036599997, + "grad_norm": 1.6856324672698975, + "learning_rate": 0.00019660782623564792, + "loss": 1.6944, + "step": 3090 + }, + { + "epoch": 0.11069529249556825, + "grad_norm": 1.9886382818222046, + "learning_rate": 0.0001966048301467942, + "loss": 1.5386, + "step": 3091 + }, + { + "epoch": 0.11073110462513654, + "grad_norm": 1.2798269987106323, + "learning_rate": 0.0001966018327582471, + "loss": 1.5971, + "step": 3092 + }, + { + "epoch": 0.11076691675470482, + "grad_norm": 1.2525361776351929, + "learning_rate": 0.00019659883407004697, + "loss": 1.6204, + "step": 3093 + }, + { + "epoch": 0.1108027288842731, + "grad_norm": 1.73500657081604, + "learning_rate": 0.00019659583408223412, + "loss": 1.7557, + "step": 3094 + }, + { + "epoch": 0.11083854101384139, + "grad_norm": 1.7497897148132324, + "learning_rate": 0.00019659283279484891, + "loss": 1.7348, + "step": 3095 + }, + { + "epoch": 0.11087435314340967, + "grad_norm": 1.3274827003479004, + "learning_rate": 0.00019658983020793175, + "loss": 1.4419, + "step": 3096 + }, + { + "epoch": 0.11091016527297795, + "grad_norm": 2.2989590167999268, + "learning_rate": 0.000196586826321523, + "loss": 1.9745, + "step": 3097 + }, + { + "epoch": 0.11094597740254625, + "grad_norm": 2.2532341480255127, + "learning_rate": 0.0001965838211356631, + "loss": 1.8956, + "step": 3098 + }, + { + "epoch": 0.11098178953211453, + "grad_norm": 2.0518624782562256, + "learning_rate": 0.00019658081465039246, + "loss": 1.681, + "step": 3099 + }, + { + "epoch": 0.11101760166168281, + "grad_norm": 1.6704341173171997, + "learning_rate": 0.00019657780686575157, + "loss": 1.7184, + "step": 3100 + }, + { + "epoch": 0.1110534137912511, + "grad_norm": 1.0749001502990723, + "learning_rate": 0.00019657479778178083, + "loss": 1.6137, + "step": 3101 + }, + { + "epoch": 0.11108922592081938, + "grad_norm": 1.7141863107681274, + "learning_rate": 0.00019657178739852075, + "loss": 1.484, + "step": 3102 + }, + { + "epoch": 0.11112503805038766, + "grad_norm": 1.4108766317367554, + "learning_rate": 0.00019656877571601187, + "loss": 1.8166, + "step": 3103 + }, + { + "epoch": 0.11116085017995594, + "grad_norm": 1.4703015089035034, + "learning_rate": 0.00019656576273429467, + "loss": 1.796, + "step": 3104 + }, + { + "epoch": 0.11119666230952424, + "grad_norm": 1.2321605682373047, + "learning_rate": 0.0001965627484534097, + "loss": 1.6408, + "step": 3105 + }, + { + "epoch": 0.11123247443909252, + "grad_norm": 1.6735345125198364, + "learning_rate": 0.0001965597328733975, + "loss": 1.6995, + "step": 3106 + }, + { + "epoch": 0.1112682865686608, + "grad_norm": 1.1838451623916626, + "learning_rate": 0.00019655671599429865, + "loss": 1.663, + "step": 3107 + }, + { + "epoch": 0.11130409869822909, + "grad_norm": 1.8111246824264526, + "learning_rate": 0.0001965536978161537, + "loss": 1.7964, + "step": 3108 + }, + { + "epoch": 0.11133991082779737, + "grad_norm": 2.363241195678711, + "learning_rate": 0.00019655067833900333, + "loss": 1.5979, + "step": 3109 + }, + { + "epoch": 0.11137572295736566, + "grad_norm": 1.3892968893051147, + "learning_rate": 0.00019654765756288813, + "loss": 1.9266, + "step": 3110 + }, + { + "epoch": 0.11141153508693394, + "grad_norm": 2.5642900466918945, + "learning_rate": 0.00019654463548784873, + "loss": 1.6784, + "step": 3111 + }, + { + "epoch": 0.11144734721650223, + "grad_norm": 1.9628474712371826, + "learning_rate": 0.00019654161211392576, + "loss": 1.5582, + "step": 3112 + }, + { + "epoch": 0.11148315934607052, + "grad_norm": 1.6404321193695068, + "learning_rate": 0.00019653858744115996, + "loss": 1.4472, + "step": 3113 + }, + { + "epoch": 0.1115189714756388, + "grad_norm": 1.7177790403366089, + "learning_rate": 0.00019653556146959197, + "loss": 1.4041, + "step": 3114 + }, + { + "epoch": 0.11155478360520708, + "grad_norm": 1.340905785560608, + "learning_rate": 0.00019653253419926254, + "loss": 1.7599, + "step": 3115 + }, + { + "epoch": 0.11159059573477537, + "grad_norm": 1.997868537902832, + "learning_rate": 0.00019652950563021237, + "loss": 1.6725, + "step": 3116 + }, + { + "epoch": 0.11162640786434365, + "grad_norm": 1.5993092060089111, + "learning_rate": 0.00019652647576248223, + "loss": 1.6371, + "step": 3117 + }, + { + "epoch": 0.11166221999391193, + "grad_norm": 1.9976245164871216, + "learning_rate": 0.00019652344459611287, + "loss": 1.8941, + "step": 3118 + }, + { + "epoch": 0.11169803212348023, + "grad_norm": 1.9197741746902466, + "learning_rate": 0.00019652041213114504, + "loss": 1.9326, + "step": 3119 + }, + { + "epoch": 0.11173384425304851, + "grad_norm": 1.5483609437942505, + "learning_rate": 0.0001965173783676196, + "loss": 1.3853, + "step": 3120 + }, + { + "epoch": 0.1117696563826168, + "grad_norm": 1.5547926425933838, + "learning_rate": 0.0001965143433055773, + "loss": 1.769, + "step": 3121 + }, + { + "epoch": 0.11180546851218508, + "grad_norm": 1.8922780752182007, + "learning_rate": 0.00019651130694505904, + "loss": 1.7482, + "step": 3122 + }, + { + "epoch": 0.11184128064175336, + "grad_norm": 1.4488351345062256, + "learning_rate": 0.00019650826928610564, + "loss": 1.8088, + "step": 3123 + }, + { + "epoch": 0.11187709277132164, + "grad_norm": 1.4341516494750977, + "learning_rate": 0.00019650523032875791, + "loss": 1.361, + "step": 3124 + }, + { + "epoch": 0.11191290490088993, + "grad_norm": 1.3100446462631226, + "learning_rate": 0.00019650219007305686, + "loss": 1.53, + "step": 3125 + }, + { + "epoch": 0.11194871703045822, + "grad_norm": 1.9011170864105225, + "learning_rate": 0.00019649914851904327, + "loss": 1.5894, + "step": 3126 + }, + { + "epoch": 0.1119845291600265, + "grad_norm": 1.6128778457641602, + "learning_rate": 0.0001964961056667581, + "loss": 1.7066, + "step": 3127 + }, + { + "epoch": 0.11202034128959479, + "grad_norm": 1.8269716501235962, + "learning_rate": 0.00019649306151624235, + "loss": 1.4852, + "step": 3128 + }, + { + "epoch": 0.11205615341916307, + "grad_norm": 1.937235713005066, + "learning_rate": 0.0001964900160675369, + "loss": 1.5758, + "step": 3129 + }, + { + "epoch": 0.11209196554873135, + "grad_norm": 1.8874437808990479, + "learning_rate": 0.00019648696932068272, + "loss": 1.98, + "step": 3130 + }, + { + "epoch": 0.11212777767829964, + "grad_norm": 1.6971936225891113, + "learning_rate": 0.0001964839212757209, + "loss": 1.5301, + "step": 3131 + }, + { + "epoch": 0.11216358980786792, + "grad_norm": 1.5790627002716064, + "learning_rate": 0.00019648087193269232, + "loss": 1.6124, + "step": 3132 + }, + { + "epoch": 0.11219940193743622, + "grad_norm": 1.3589112758636475, + "learning_rate": 0.00019647782129163805, + "loss": 1.6747, + "step": 3133 + }, + { + "epoch": 0.1122352140670045, + "grad_norm": 1.376923680305481, + "learning_rate": 0.00019647476935259916, + "loss": 1.5895, + "step": 3134 + }, + { + "epoch": 0.11227102619657278, + "grad_norm": 2.1595699787139893, + "learning_rate": 0.0001964717161156167, + "loss": 1.5737, + "step": 3135 + }, + { + "epoch": 0.11230683832614106, + "grad_norm": 1.3219338655471802, + "learning_rate": 0.00019646866158073173, + "loss": 1.5445, + "step": 3136 + }, + { + "epoch": 0.11234265045570935, + "grad_norm": 1.44645357131958, + "learning_rate": 0.00019646560574798535, + "loss": 1.2135, + "step": 3137 + }, + { + "epoch": 0.11237846258527763, + "grad_norm": 1.1569671630859375, + "learning_rate": 0.0001964625486174187, + "loss": 1.3581, + "step": 3138 + }, + { + "epoch": 0.11241427471484591, + "grad_norm": 1.5105891227722168, + "learning_rate": 0.00019645949018907283, + "loss": 1.5849, + "step": 3139 + }, + { + "epoch": 0.11245008684441421, + "grad_norm": 1.767600655555725, + "learning_rate": 0.000196456430462989, + "loss": 1.3581, + "step": 3140 + }, + { + "epoch": 0.11248589897398249, + "grad_norm": 1.3685157299041748, + "learning_rate": 0.00019645336943920828, + "loss": 1.7175, + "step": 3141 + }, + { + "epoch": 0.11252171110355078, + "grad_norm": 1.4004133939743042, + "learning_rate": 0.00019645030711777192, + "loss": 1.2914, + "step": 3142 + }, + { + "epoch": 0.11255752323311906, + "grad_norm": 1.5391340255737305, + "learning_rate": 0.0001964472434987211, + "loss": 1.7839, + "step": 3143 + }, + { + "epoch": 0.11259333536268734, + "grad_norm": 1.2137620449066162, + "learning_rate": 0.00019644417858209702, + "loss": 1.5005, + "step": 3144 + }, + { + "epoch": 0.11262914749225562, + "grad_norm": 1.874804139137268, + "learning_rate": 0.00019644111236794088, + "loss": 1.8163, + "step": 3145 + }, + { + "epoch": 0.1126649596218239, + "grad_norm": 1.817853569984436, + "learning_rate": 0.000196438044856294, + "loss": 1.6768, + "step": 3146 + }, + { + "epoch": 0.11270077175139219, + "grad_norm": 1.3849821090698242, + "learning_rate": 0.0001964349760471976, + "loss": 1.5519, + "step": 3147 + }, + { + "epoch": 0.11273658388096049, + "grad_norm": 1.526769995689392, + "learning_rate": 0.00019643190594069302, + "loss": 1.939, + "step": 3148 + }, + { + "epoch": 0.11277239601052877, + "grad_norm": 2.315960645675659, + "learning_rate": 0.00019642883453682152, + "loss": 1.8201, + "step": 3149 + }, + { + "epoch": 0.11280820814009705, + "grad_norm": 1.7398210763931274, + "learning_rate": 0.00019642576183562444, + "loss": 1.4776, + "step": 3150 + }, + { + "epoch": 0.11284402026966533, + "grad_norm": 1.768480658531189, + "learning_rate": 0.00019642268783714312, + "loss": 1.9995, + "step": 3151 + }, + { + "epoch": 0.11287983239923362, + "grad_norm": 1.9272271394729614, + "learning_rate": 0.0001964196125414189, + "loss": 1.6427, + "step": 3152 + }, + { + "epoch": 0.1129156445288019, + "grad_norm": 1.451150894165039, + "learning_rate": 0.0001964165359484932, + "loss": 1.6079, + "step": 3153 + }, + { + "epoch": 0.11295145665837018, + "grad_norm": 1.640161395072937, + "learning_rate": 0.00019641345805840733, + "loss": 1.5618, + "step": 3154 + }, + { + "epoch": 0.11298726878793848, + "grad_norm": 1.5714844465255737, + "learning_rate": 0.00019641037887120277, + "loss": 1.7151, + "step": 3155 + }, + { + "epoch": 0.11302308091750676, + "grad_norm": 1.5246632099151611, + "learning_rate": 0.00019640729838692092, + "loss": 1.425, + "step": 3156 + }, + { + "epoch": 0.11305889304707505, + "grad_norm": 1.6431224346160889, + "learning_rate": 0.00019640421660560323, + "loss": 1.6421, + "step": 3157 + }, + { + "epoch": 0.11309470517664333, + "grad_norm": 1.2764418125152588, + "learning_rate": 0.00019640113352729116, + "loss": 1.5922, + "step": 3158 + }, + { + "epoch": 0.11313051730621161, + "grad_norm": 1.4131271839141846, + "learning_rate": 0.00019639804915202617, + "loss": 1.5558, + "step": 3159 + }, + { + "epoch": 0.1131663294357799, + "grad_norm": 1.229662537574768, + "learning_rate": 0.0001963949634798498, + "loss": 1.5588, + "step": 3160 + }, + { + "epoch": 0.11320214156534818, + "grad_norm": 1.4876227378845215, + "learning_rate": 0.0001963918765108035, + "loss": 1.4488, + "step": 3161 + }, + { + "epoch": 0.11323795369491647, + "grad_norm": 1.2596564292907715, + "learning_rate": 0.00019638878824492886, + "loss": 1.4179, + "step": 3162 + }, + { + "epoch": 0.11327376582448476, + "grad_norm": 1.4421182870864868, + "learning_rate": 0.0001963856986822674, + "loss": 1.3701, + "step": 3163 + }, + { + "epoch": 0.11330957795405304, + "grad_norm": 1.4923369884490967, + "learning_rate": 0.00019638260782286072, + "loss": 1.3928, + "step": 3164 + }, + { + "epoch": 0.11334539008362132, + "grad_norm": 1.701507806777954, + "learning_rate": 0.00019637951566675035, + "loss": 1.8446, + "step": 3165 + }, + { + "epoch": 0.1133812022131896, + "grad_norm": 1.7031058073043823, + "learning_rate": 0.00019637642221397792, + "loss": 1.6509, + "step": 3166 + }, + { + "epoch": 0.11341701434275789, + "grad_norm": 1.7792160511016846, + "learning_rate": 0.00019637332746458506, + "loss": 1.7611, + "step": 3167 + }, + { + "epoch": 0.11345282647232617, + "grad_norm": 2.105922222137451, + "learning_rate": 0.00019637023141861338, + "loss": 1.716, + "step": 3168 + }, + { + "epoch": 0.11348863860189447, + "grad_norm": 2.09403920173645, + "learning_rate": 0.00019636713407610455, + "loss": 1.5287, + "step": 3169 + }, + { + "epoch": 0.11352445073146275, + "grad_norm": 1.582002878189087, + "learning_rate": 0.0001963640354371002, + "loss": 2.0743, + "step": 3170 + }, + { + "epoch": 0.11356026286103103, + "grad_norm": 1.5988606214523315, + "learning_rate": 0.00019636093550164208, + "loss": 1.6238, + "step": 3171 + }, + { + "epoch": 0.11359607499059932, + "grad_norm": 1.5815061330795288, + "learning_rate": 0.00019635783426977187, + "loss": 2.0104, + "step": 3172 + }, + { + "epoch": 0.1136318871201676, + "grad_norm": 1.422127366065979, + "learning_rate": 0.00019635473174153128, + "loss": 1.7212, + "step": 3173 + }, + { + "epoch": 0.11366769924973588, + "grad_norm": 1.4015790224075317, + "learning_rate": 0.00019635162791696212, + "loss": 1.672, + "step": 3174 + }, + { + "epoch": 0.11370351137930416, + "grad_norm": 1.4458675384521484, + "learning_rate": 0.00019634852279610602, + "loss": 1.6764, + "step": 3175 + }, + { + "epoch": 0.11373932350887246, + "grad_norm": 2.8560314178466797, + "learning_rate": 0.00019634541637900487, + "loss": 1.6277, + "step": 3176 + }, + { + "epoch": 0.11377513563844074, + "grad_norm": 1.758179783821106, + "learning_rate": 0.0001963423086657004, + "loss": 1.4349, + "step": 3177 + }, + { + "epoch": 0.11381094776800903, + "grad_norm": 1.4987274408340454, + "learning_rate": 0.00019633919965623444, + "loss": 1.7514, + "step": 3178 + }, + { + "epoch": 0.11384675989757731, + "grad_norm": 1.8860074281692505, + "learning_rate": 0.0001963360893506488, + "loss": 1.4095, + "step": 3179 + }, + { + "epoch": 0.11388257202714559, + "grad_norm": 1.5295324325561523, + "learning_rate": 0.0001963329777489854, + "loss": 1.479, + "step": 3180 + }, + { + "epoch": 0.11391838415671388, + "grad_norm": 1.5854120254516602, + "learning_rate": 0.00019632986485128602, + "loss": 1.6353, + "step": 3181 + }, + { + "epoch": 0.11395419628628216, + "grad_norm": 1.5801342725753784, + "learning_rate": 0.00019632675065759254, + "loss": 1.4634, + "step": 3182 + }, + { + "epoch": 0.11399000841585046, + "grad_norm": 1.8172426223754883, + "learning_rate": 0.0001963236351679469, + "loss": 1.7716, + "step": 3183 + }, + { + "epoch": 0.11402582054541874, + "grad_norm": 1.4854059219360352, + "learning_rate": 0.00019632051838239099, + "loss": 1.6442, + "step": 3184 + }, + { + "epoch": 0.11406163267498702, + "grad_norm": 1.682544231414795, + "learning_rate": 0.00019631740030096677, + "loss": 1.9143, + "step": 3185 + }, + { + "epoch": 0.1140974448045553, + "grad_norm": 1.281859040260315, + "learning_rate": 0.00019631428092371612, + "loss": 1.5915, + "step": 3186 + }, + { + "epoch": 0.11413325693412359, + "grad_norm": 1.3597787618637085, + "learning_rate": 0.00019631116025068112, + "loss": 1.6747, + "step": 3187 + }, + { + "epoch": 0.11416906906369187, + "grad_norm": 1.622419834136963, + "learning_rate": 0.00019630803828190368, + "loss": 1.848, + "step": 3188 + }, + { + "epoch": 0.11420488119326015, + "grad_norm": 1.744403600692749, + "learning_rate": 0.00019630491501742577, + "loss": 1.5405, + "step": 3189 + }, + { + "epoch": 0.11424069332282845, + "grad_norm": 1.3016765117645264, + "learning_rate": 0.00019630179045728946, + "loss": 1.5965, + "step": 3190 + }, + { + "epoch": 0.11427650545239673, + "grad_norm": 1.3310681581497192, + "learning_rate": 0.00019629866460153683, + "loss": 1.5548, + "step": 3191 + }, + { + "epoch": 0.11431231758196501, + "grad_norm": 2.2537686824798584, + "learning_rate": 0.00019629553745020983, + "loss": 1.2023, + "step": 3192 + }, + { + "epoch": 0.1143481297115333, + "grad_norm": 1.3856066465377808, + "learning_rate": 0.00019629240900335062, + "loss": 1.5476, + "step": 3193 + }, + { + "epoch": 0.11438394184110158, + "grad_norm": 1.8118864297866821, + "learning_rate": 0.00019628927926100125, + "loss": 1.4654, + "step": 3194 + }, + { + "epoch": 0.11441975397066986, + "grad_norm": 1.674796223640442, + "learning_rate": 0.0001962861482232038, + "loss": 1.549, + "step": 3195 + }, + { + "epoch": 0.11445556610023815, + "grad_norm": 1.317910075187683, + "learning_rate": 0.00019628301589000047, + "loss": 1.3573, + "step": 3196 + }, + { + "epoch": 0.11449137822980643, + "grad_norm": 2.1432268619537354, + "learning_rate": 0.00019627988226143334, + "loss": 1.477, + "step": 3197 + }, + { + "epoch": 0.11452719035937473, + "grad_norm": 1.5118340253829956, + "learning_rate": 0.00019627674733754458, + "loss": 1.5706, + "step": 3198 + }, + { + "epoch": 0.11456300248894301, + "grad_norm": 1.699642539024353, + "learning_rate": 0.00019627361111837637, + "loss": 1.4643, + "step": 3199 + }, + { + "epoch": 0.11459881461851129, + "grad_norm": 1.292831301689148, + "learning_rate": 0.00019627047360397092, + "loss": 1.8203, + "step": 3200 + }, + { + "epoch": 0.11463462674807957, + "grad_norm": 1.6774110794067383, + "learning_rate": 0.00019626733479437042, + "loss": 1.9212, + "step": 3201 + }, + { + "epoch": 0.11467043887764786, + "grad_norm": 1.5225844383239746, + "learning_rate": 0.0001962641946896171, + "loss": 1.6182, + "step": 3202 + }, + { + "epoch": 0.11470625100721614, + "grad_norm": 2.3992981910705566, + "learning_rate": 0.0001962610532897532, + "loss": 1.9434, + "step": 3203 + }, + { + "epoch": 0.11474206313678442, + "grad_norm": 1.9705089330673218, + "learning_rate": 0.00019625791059482106, + "loss": 1.4532, + "step": 3204 + }, + { + "epoch": 0.11477787526635272, + "grad_norm": 2.258025884628296, + "learning_rate": 0.00019625476660486285, + "loss": 1.3525, + "step": 3205 + }, + { + "epoch": 0.114813687395921, + "grad_norm": 1.7987005710601807, + "learning_rate": 0.0001962516213199209, + "loss": 1.7462, + "step": 3206 + }, + { + "epoch": 0.11484949952548928, + "grad_norm": 1.720633864402771, + "learning_rate": 0.00019624847474003756, + "loss": 1.7263, + "step": 3207 + }, + { + "epoch": 0.11488531165505757, + "grad_norm": 1.8208423852920532, + "learning_rate": 0.00019624532686525513, + "loss": 1.5331, + "step": 3208 + }, + { + "epoch": 0.11492112378462585, + "grad_norm": 1.2464954853057861, + "learning_rate": 0.000196242177695616, + "loss": 1.4239, + "step": 3209 + }, + { + "epoch": 0.11495693591419413, + "grad_norm": 1.676194429397583, + "learning_rate": 0.0001962390272311625, + "loss": 1.8397, + "step": 3210 + }, + { + "epoch": 0.11499274804376242, + "grad_norm": 1.4178789854049683, + "learning_rate": 0.00019623587547193703, + "loss": 1.525, + "step": 3211 + }, + { + "epoch": 0.11502856017333071, + "grad_norm": 1.856972098350525, + "learning_rate": 0.00019623272241798198, + "loss": 1.9044, + "step": 3212 + }, + { + "epoch": 0.115064372302899, + "grad_norm": 2.108940362930298, + "learning_rate": 0.0001962295680693398, + "loss": 1.7134, + "step": 3213 + }, + { + "epoch": 0.11510018443246728, + "grad_norm": 1.6334648132324219, + "learning_rate": 0.0001962264124260529, + "loss": 1.5537, + "step": 3214 + }, + { + "epoch": 0.11513599656203556, + "grad_norm": 1.7307848930358887, + "learning_rate": 0.00019622325548816373, + "loss": 1.7397, + "step": 3215 + }, + { + "epoch": 0.11517180869160384, + "grad_norm": 1.7003225088119507, + "learning_rate": 0.0001962200972557148, + "loss": 1.7081, + "step": 3216 + }, + { + "epoch": 0.11520762082117213, + "grad_norm": 1.5408225059509277, + "learning_rate": 0.00019621693772874855, + "loss": 1.3395, + "step": 3217 + }, + { + "epoch": 0.11524343295074041, + "grad_norm": 1.5445457696914673, + "learning_rate": 0.00019621377690730754, + "loss": 1.5551, + "step": 3218 + }, + { + "epoch": 0.1152792450803087, + "grad_norm": 2.64323353767395, + "learning_rate": 0.00019621061479143425, + "loss": 1.662, + "step": 3219 + }, + { + "epoch": 0.11531505720987699, + "grad_norm": 1.4125784635543823, + "learning_rate": 0.00019620745138117124, + "loss": 1.498, + "step": 3220 + }, + { + "epoch": 0.11535086933944527, + "grad_norm": 1.3482048511505127, + "learning_rate": 0.00019620428667656108, + "loss": 1.6449, + "step": 3221 + }, + { + "epoch": 0.11538668146901356, + "grad_norm": 1.5159589052200317, + "learning_rate": 0.00019620112067764636, + "loss": 1.7357, + "step": 3222 + }, + { + "epoch": 0.11542249359858184, + "grad_norm": 1.3427600860595703, + "learning_rate": 0.0001961979533844696, + "loss": 1.4208, + "step": 3223 + }, + { + "epoch": 0.11545830572815012, + "grad_norm": 2.6894454956054688, + "learning_rate": 0.0001961947847970735, + "loss": 1.7024, + "step": 3224 + }, + { + "epoch": 0.1154941178577184, + "grad_norm": 2.293968677520752, + "learning_rate": 0.00019619161491550065, + "loss": 1.2496, + "step": 3225 + }, + { + "epoch": 0.1155299299872867, + "grad_norm": 2.2852323055267334, + "learning_rate": 0.00019618844373979372, + "loss": 1.6852, + "step": 3226 + }, + { + "epoch": 0.11556574211685498, + "grad_norm": 1.5715569257736206, + "learning_rate": 0.0001961852712699953, + "loss": 1.4904, + "step": 3227 + }, + { + "epoch": 0.11560155424642327, + "grad_norm": 1.5239616632461548, + "learning_rate": 0.00019618209750614813, + "loss": 1.4977, + "step": 3228 + }, + { + "epoch": 0.11563736637599155, + "grad_norm": 1.4947649240493774, + "learning_rate": 0.00019617892244829495, + "loss": 1.5866, + "step": 3229 + }, + { + "epoch": 0.11567317850555983, + "grad_norm": 1.623744010925293, + "learning_rate": 0.0001961757460964784, + "loss": 1.5114, + "step": 3230 + }, + { + "epoch": 0.11570899063512811, + "grad_norm": 1.7694364786148071, + "learning_rate": 0.00019617256845074125, + "loss": 1.6273, + "step": 3231 + }, + { + "epoch": 0.1157448027646964, + "grad_norm": 1.5195114612579346, + "learning_rate": 0.00019616938951112623, + "loss": 1.3845, + "step": 3232 + }, + { + "epoch": 0.1157806148942647, + "grad_norm": 1.6811881065368652, + "learning_rate": 0.00019616620927767614, + "loss": 1.3784, + "step": 3233 + }, + { + "epoch": 0.11581642702383298, + "grad_norm": 1.8384026288986206, + "learning_rate": 0.00019616302775043377, + "loss": 1.6615, + "step": 3234 + }, + { + "epoch": 0.11585223915340126, + "grad_norm": 1.5395166873931885, + "learning_rate": 0.00019615984492944187, + "loss": 1.7524, + "step": 3235 + }, + { + "epoch": 0.11588805128296954, + "grad_norm": 1.3467326164245605, + "learning_rate": 0.00019615666081474332, + "loss": 1.6619, + "step": 3236 + }, + { + "epoch": 0.11592386341253783, + "grad_norm": 1.4491621255874634, + "learning_rate": 0.00019615347540638092, + "loss": 1.5844, + "step": 3237 + }, + { + "epoch": 0.11595967554210611, + "grad_norm": 2.1446890830993652, + "learning_rate": 0.00019615028870439752, + "loss": 1.6285, + "step": 3238 + }, + { + "epoch": 0.11599548767167439, + "grad_norm": 2.09091854095459, + "learning_rate": 0.00019614710070883602, + "loss": 1.5445, + "step": 3239 + }, + { + "epoch": 0.11603129980124269, + "grad_norm": 1.3230056762695312, + "learning_rate": 0.00019614391141973934, + "loss": 1.5391, + "step": 3240 + }, + { + "epoch": 0.11606711193081097, + "grad_norm": 1.5421323776245117, + "learning_rate": 0.00019614072083715028, + "loss": 1.4999, + "step": 3241 + }, + { + "epoch": 0.11610292406037925, + "grad_norm": 1.7281304597854614, + "learning_rate": 0.00019613752896111187, + "loss": 1.5916, + "step": 3242 + }, + { + "epoch": 0.11613873618994754, + "grad_norm": 2.072479724884033, + "learning_rate": 0.00019613433579166706, + "loss": 1.922, + "step": 3243 + }, + { + "epoch": 0.11617454831951582, + "grad_norm": 1.6558066606521606, + "learning_rate": 0.0001961311413288587, + "loss": 1.6862, + "step": 3244 + }, + { + "epoch": 0.1162103604490841, + "grad_norm": 2.008986711502075, + "learning_rate": 0.00019612794557272983, + "loss": 1.5542, + "step": 3245 + }, + { + "epoch": 0.11624617257865238, + "grad_norm": 1.22840416431427, + "learning_rate": 0.00019612474852332348, + "loss": 1.4519, + "step": 3246 + }, + { + "epoch": 0.11628198470822067, + "grad_norm": 1.6340930461883545, + "learning_rate": 0.00019612155018068264, + "loss": 1.5469, + "step": 3247 + }, + { + "epoch": 0.11631779683778896, + "grad_norm": 2.318631887435913, + "learning_rate": 0.00019611835054485032, + "loss": 1.7494, + "step": 3248 + }, + { + "epoch": 0.11635360896735725, + "grad_norm": 1.6667400598526, + "learning_rate": 0.00019611514961586957, + "loss": 1.3893, + "step": 3249 + }, + { + "epoch": 0.11638942109692553, + "grad_norm": 1.4194461107254028, + "learning_rate": 0.00019611194739378344, + "loss": 1.8322, + "step": 3250 + }, + { + "epoch": 0.11642523322649381, + "grad_norm": 1.7950221300125122, + "learning_rate": 0.00019610874387863508, + "loss": 1.4541, + "step": 3251 + }, + { + "epoch": 0.1164610453560621, + "grad_norm": 1.682098627090454, + "learning_rate": 0.00019610553907046748, + "loss": 1.6563, + "step": 3252 + }, + { + "epoch": 0.11649685748563038, + "grad_norm": 2.29400372505188, + "learning_rate": 0.0001961023329693239, + "loss": 1.7833, + "step": 3253 + }, + { + "epoch": 0.11653266961519866, + "grad_norm": 1.656288981437683, + "learning_rate": 0.00019609912557524734, + "loss": 1.5191, + "step": 3254 + }, + { + "epoch": 0.11656848174476696, + "grad_norm": 1.3235529661178589, + "learning_rate": 0.000196095916888281, + "loss": 1.539, + "step": 3255 + }, + { + "epoch": 0.11660429387433524, + "grad_norm": 1.794012188911438, + "learning_rate": 0.00019609270690846807, + "loss": 1.7339, + "step": 3256 + }, + { + "epoch": 0.11664010600390352, + "grad_norm": 1.4943112134933472, + "learning_rate": 0.00019608949563585174, + "loss": 1.5602, + "step": 3257 + }, + { + "epoch": 0.1166759181334718, + "grad_norm": 2.628278970718384, + "learning_rate": 0.00019608628307047517, + "loss": 1.7145, + "step": 3258 + }, + { + "epoch": 0.11671173026304009, + "grad_norm": 1.359542965888977, + "learning_rate": 0.0001960830692123816, + "loss": 1.3189, + "step": 3259 + }, + { + "epoch": 0.11674754239260837, + "grad_norm": 1.5178779363632202, + "learning_rate": 0.00019607985406161425, + "loss": 1.527, + "step": 3260 + }, + { + "epoch": 0.11678335452217666, + "grad_norm": 1.6285661458969116, + "learning_rate": 0.00019607663761821644, + "loss": 1.8477, + "step": 3261 + }, + { + "epoch": 0.11681916665174495, + "grad_norm": 1.4759529829025269, + "learning_rate": 0.0001960734198822314, + "loss": 1.6696, + "step": 3262 + }, + { + "epoch": 0.11685497878131323, + "grad_norm": 1.8590339422225952, + "learning_rate": 0.0001960702008537024, + "loss": 1.5988, + "step": 3263 + }, + { + "epoch": 0.11689079091088152, + "grad_norm": 2.2385072708129883, + "learning_rate": 0.00019606698053267277, + "loss": 1.7862, + "step": 3264 + }, + { + "epoch": 0.1169266030404498, + "grad_norm": 1.7401012182235718, + "learning_rate": 0.00019606375891918583, + "loss": 1.8007, + "step": 3265 + }, + { + "epoch": 0.11696241517001808, + "grad_norm": 1.3024067878723145, + "learning_rate": 0.00019606053601328496, + "loss": 1.5707, + "step": 3266 + }, + { + "epoch": 0.11699822729958637, + "grad_norm": 2.206632375717163, + "learning_rate": 0.00019605731181501342, + "loss": 1.4116, + "step": 3267 + }, + { + "epoch": 0.11703403942915465, + "grad_norm": 1.3507251739501953, + "learning_rate": 0.00019605408632441474, + "loss": 1.6662, + "step": 3268 + }, + { + "epoch": 0.11706985155872295, + "grad_norm": 1.4132224321365356, + "learning_rate": 0.00019605085954153218, + "loss": 1.7236, + "step": 3269 + }, + { + "epoch": 0.11710566368829123, + "grad_norm": 1.848435878753662, + "learning_rate": 0.00019604763146640922, + "loss": 1.774, + "step": 3270 + }, + { + "epoch": 0.11714147581785951, + "grad_norm": 1.356055736541748, + "learning_rate": 0.00019604440209908925, + "loss": 1.9218, + "step": 3271 + }, + { + "epoch": 0.1171772879474278, + "grad_norm": 1.9709981679916382, + "learning_rate": 0.00019604117143961575, + "loss": 1.5058, + "step": 3272 + }, + { + "epoch": 0.11721310007699608, + "grad_norm": 1.8755549192428589, + "learning_rate": 0.00019603793948803216, + "loss": 1.4719, + "step": 3273 + }, + { + "epoch": 0.11724891220656436, + "grad_norm": 2.5782339572906494, + "learning_rate": 0.000196034706244382, + "loss": 1.5972, + "step": 3274 + }, + { + "epoch": 0.11728472433613264, + "grad_norm": 1.4497991800308228, + "learning_rate": 0.0001960314717087087, + "loss": 1.4209, + "step": 3275 + }, + { + "epoch": 0.11732053646570094, + "grad_norm": 1.4412992000579834, + "learning_rate": 0.00019602823588105585, + "loss": 1.517, + "step": 3276 + }, + { + "epoch": 0.11735634859526922, + "grad_norm": 1.4440698623657227, + "learning_rate": 0.000196024998761467, + "loss": 1.5662, + "step": 3277 + }, + { + "epoch": 0.1173921607248375, + "grad_norm": 3.968775987625122, + "learning_rate": 0.00019602176034998556, + "loss": 2.1371, + "step": 3278 + }, + { + "epoch": 0.11742797285440579, + "grad_norm": 1.4765177965164185, + "learning_rate": 0.00019601852064665524, + "loss": 1.6333, + "step": 3279 + }, + { + "epoch": 0.11746378498397407, + "grad_norm": 1.5928294658660889, + "learning_rate": 0.0001960152796515196, + "loss": 1.6321, + "step": 3280 + }, + { + "epoch": 0.11749959711354235, + "grad_norm": 1.4952478408813477, + "learning_rate": 0.00019601203736462219, + "loss": 1.7204, + "step": 3281 + }, + { + "epoch": 0.11753540924311064, + "grad_norm": 2.540161609649658, + "learning_rate": 0.00019600879378600666, + "loss": 1.6557, + "step": 3282 + }, + { + "epoch": 0.11757122137267893, + "grad_norm": 1.3990188837051392, + "learning_rate": 0.0001960055489157167, + "loss": 1.6378, + "step": 3283 + }, + { + "epoch": 0.11760703350224722, + "grad_norm": 1.4970078468322754, + "learning_rate": 0.00019600230275379588, + "loss": 1.7807, + "step": 3284 + }, + { + "epoch": 0.1176428456318155, + "grad_norm": 1.9799847602844238, + "learning_rate": 0.0001959990553002879, + "loss": 1.6429, + "step": 3285 + }, + { + "epoch": 0.11767865776138378, + "grad_norm": 1.5458717346191406, + "learning_rate": 0.0001959958065552365, + "loss": 1.8378, + "step": 3286 + }, + { + "epoch": 0.11771446989095206, + "grad_norm": 1.1473616361618042, + "learning_rate": 0.0001959925565186853, + "loss": 1.4595, + "step": 3287 + }, + { + "epoch": 0.11775028202052035, + "grad_norm": 1.4953563213348389, + "learning_rate": 0.00019598930519067813, + "loss": 1.6126, + "step": 3288 + }, + { + "epoch": 0.11778609415008863, + "grad_norm": 1.4678362607955933, + "learning_rate": 0.00019598605257125864, + "loss": 1.492, + "step": 3289 + }, + { + "epoch": 0.11782190627965693, + "grad_norm": 2.026249408721924, + "learning_rate": 0.0001959827986604706, + "loss": 1.6692, + "step": 3290 + }, + { + "epoch": 0.11785771840922521, + "grad_norm": 1.462710976600647, + "learning_rate": 0.00019597954345835787, + "loss": 1.4171, + "step": 3291 + }, + { + "epoch": 0.11789353053879349, + "grad_norm": 1.6118189096450806, + "learning_rate": 0.00019597628696496418, + "loss": 1.7512, + "step": 3292 + }, + { + "epoch": 0.11792934266836178, + "grad_norm": 1.621272087097168, + "learning_rate": 0.0001959730291803333, + "loss": 1.9068, + "step": 3293 + }, + { + "epoch": 0.11796515479793006, + "grad_norm": 1.9732425212860107, + "learning_rate": 0.00019596977010450915, + "loss": 1.4781, + "step": 3294 + }, + { + "epoch": 0.11800096692749834, + "grad_norm": 1.2705248594284058, + "learning_rate": 0.00019596650973753555, + "loss": 1.9466, + "step": 3295 + }, + { + "epoch": 0.11803677905706662, + "grad_norm": 2.1659154891967773, + "learning_rate": 0.00019596324807945632, + "loss": 1.526, + "step": 3296 + }, + { + "epoch": 0.1180725911866349, + "grad_norm": 1.6250512599945068, + "learning_rate": 0.00019595998513031537, + "loss": 1.6662, + "step": 3297 + }, + { + "epoch": 0.1181084033162032, + "grad_norm": 3.29927659034729, + "learning_rate": 0.00019595672089015663, + "loss": 1.7103, + "step": 3298 + }, + { + "epoch": 0.11814421544577149, + "grad_norm": 1.6981123685836792, + "learning_rate": 0.00019595345535902394, + "loss": 1.2357, + "step": 3299 + }, + { + "epoch": 0.11818002757533977, + "grad_norm": 1.8738186359405518, + "learning_rate": 0.0001959501885369613, + "loss": 1.6669, + "step": 3300 + }, + { + "epoch": 0.11821583970490805, + "grad_norm": 2.400287628173828, + "learning_rate": 0.00019594692042401263, + "loss": 1.6469, + "step": 3301 + }, + { + "epoch": 0.11825165183447633, + "grad_norm": 2.324725389480591, + "learning_rate": 0.00019594365102022193, + "loss": 1.8658, + "step": 3302 + }, + { + "epoch": 0.11828746396404462, + "grad_norm": 1.470152735710144, + "learning_rate": 0.00019594038032563315, + "loss": 1.7331, + "step": 3303 + }, + { + "epoch": 0.1183232760936129, + "grad_norm": 1.8536076545715332, + "learning_rate": 0.0001959371083402903, + "loss": 1.6881, + "step": 3304 + }, + { + "epoch": 0.1183590882231812, + "grad_norm": 1.580949306488037, + "learning_rate": 0.00019593383506423743, + "loss": 1.5505, + "step": 3305 + }, + { + "epoch": 0.11839490035274948, + "grad_norm": 1.4899119138717651, + "learning_rate": 0.00019593056049751852, + "loss": 1.5014, + "step": 3306 + }, + { + "epoch": 0.11843071248231776, + "grad_norm": 1.3627156019210815, + "learning_rate": 0.0001959272846401777, + "loss": 1.502, + "step": 3307 + }, + { + "epoch": 0.11846652461188605, + "grad_norm": 2.0167927742004395, + "learning_rate": 0.000195924007492259, + "loss": 1.6057, + "step": 3308 + }, + { + "epoch": 0.11850233674145433, + "grad_norm": 2.2905960083007812, + "learning_rate": 0.00019592072905380648, + "loss": 1.6687, + "step": 3309 + }, + { + "epoch": 0.11853814887102261, + "grad_norm": 2.684022903442383, + "learning_rate": 0.00019591744932486428, + "loss": 1.6046, + "step": 3310 + }, + { + "epoch": 0.1185739610005909, + "grad_norm": 1.62175452709198, + "learning_rate": 0.00019591416830547657, + "loss": 1.5254, + "step": 3311 + }, + { + "epoch": 0.11860977313015919, + "grad_norm": 1.9423574209213257, + "learning_rate": 0.0001959108859956874, + "loss": 1.6332, + "step": 3312 + }, + { + "epoch": 0.11864558525972747, + "grad_norm": 2.0534324645996094, + "learning_rate": 0.00019590760239554097, + "loss": 1.6084, + "step": 3313 + }, + { + "epoch": 0.11868139738929576, + "grad_norm": 1.9046193361282349, + "learning_rate": 0.00019590431750508153, + "loss": 1.585, + "step": 3314 + }, + { + "epoch": 0.11871720951886404, + "grad_norm": 1.8338521718978882, + "learning_rate": 0.00019590103132435314, + "loss": 1.8236, + "step": 3315 + }, + { + "epoch": 0.11875302164843232, + "grad_norm": 1.7030360698699951, + "learning_rate": 0.00019589774385340007, + "loss": 1.716, + "step": 3316 + }, + { + "epoch": 0.1187888337780006, + "grad_norm": 2.5421273708343506, + "learning_rate": 0.0001958944550922666, + "loss": 1.6587, + "step": 3317 + }, + { + "epoch": 0.11882464590756889, + "grad_norm": 1.5858945846557617, + "learning_rate": 0.0001958911650409969, + "loss": 1.4128, + "step": 3318 + }, + { + "epoch": 0.11886045803713718, + "grad_norm": 1.6112788915634155, + "learning_rate": 0.0001958878736996353, + "loss": 1.5822, + "step": 3319 + }, + { + "epoch": 0.11889627016670547, + "grad_norm": 1.3577380180358887, + "learning_rate": 0.00019588458106822602, + "loss": 1.5731, + "step": 3320 + }, + { + "epoch": 0.11893208229627375, + "grad_norm": 1.960248351097107, + "learning_rate": 0.00019588128714681337, + "loss": 1.6318, + "step": 3321 + }, + { + "epoch": 0.11896789442584203, + "grad_norm": 1.8858251571655273, + "learning_rate": 0.0001958779919354417, + "loss": 1.8675, + "step": 3322 + }, + { + "epoch": 0.11900370655541032, + "grad_norm": 1.6242177486419678, + "learning_rate": 0.00019587469543415532, + "loss": 1.5496, + "step": 3323 + }, + { + "epoch": 0.1190395186849786, + "grad_norm": 1.738443374633789, + "learning_rate": 0.00019587139764299857, + "loss": 1.614, + "step": 3324 + }, + { + "epoch": 0.11907533081454688, + "grad_norm": 1.9253852367401123, + "learning_rate": 0.00019586809856201586, + "loss": 1.8159, + "step": 3325 + }, + { + "epoch": 0.11911114294411518, + "grad_norm": 1.6207756996154785, + "learning_rate": 0.00019586479819125153, + "loss": 1.3749, + "step": 3326 + }, + { + "epoch": 0.11914695507368346, + "grad_norm": 1.6630465984344482, + "learning_rate": 0.00019586149653074997, + "loss": 1.3199, + "step": 3327 + }, + { + "epoch": 0.11918276720325174, + "grad_norm": 1.2815748453140259, + "learning_rate": 0.00019585819358055567, + "loss": 1.6019, + "step": 3328 + }, + { + "epoch": 0.11921857933282003, + "grad_norm": 2.2043771743774414, + "learning_rate": 0.00019585488934071302, + "loss": 1.7924, + "step": 3329 + }, + { + "epoch": 0.11925439146238831, + "grad_norm": 1.9640440940856934, + "learning_rate": 0.00019585158381126645, + "loss": 1.4117, + "step": 3330 + }, + { + "epoch": 0.11929020359195659, + "grad_norm": 1.911688208580017, + "learning_rate": 0.00019584827699226044, + "loss": 1.6101, + "step": 3331 + }, + { + "epoch": 0.11932601572152488, + "grad_norm": 1.7753043174743652, + "learning_rate": 0.00019584496888373955, + "loss": 1.353, + "step": 3332 + }, + { + "epoch": 0.11936182785109317, + "grad_norm": 1.6217472553253174, + "learning_rate": 0.00019584165948574822, + "loss": 1.6926, + "step": 3333 + }, + { + "epoch": 0.11939763998066145, + "grad_norm": 1.9915986061096191, + "learning_rate": 0.00019583834879833097, + "loss": 1.3721, + "step": 3334 + }, + { + "epoch": 0.11943345211022974, + "grad_norm": 2.1937718391418457, + "learning_rate": 0.0001958350368215324, + "loss": 1.526, + "step": 3335 + }, + { + "epoch": 0.11946926423979802, + "grad_norm": 1.4065150022506714, + "learning_rate": 0.00019583172355539698, + "loss": 1.5645, + "step": 3336 + }, + { + "epoch": 0.1195050763693663, + "grad_norm": 1.9046217203140259, + "learning_rate": 0.00019582840899996936, + "loss": 1.4022, + "step": 3337 + }, + { + "epoch": 0.11954088849893459, + "grad_norm": 1.5891504287719727, + "learning_rate": 0.00019582509315529408, + "loss": 1.7865, + "step": 3338 + }, + { + "epoch": 0.11957670062850287, + "grad_norm": 2.062664270401001, + "learning_rate": 0.0001958217760214158, + "loss": 1.8436, + "step": 3339 + }, + { + "epoch": 0.11961251275807117, + "grad_norm": 1.4193004369735718, + "learning_rate": 0.00019581845759837914, + "loss": 1.5553, + "step": 3340 + }, + { + "epoch": 0.11964832488763945, + "grad_norm": 1.8979054689407349, + "learning_rate": 0.0001958151378862287, + "loss": 1.4616, + "step": 3341 + }, + { + "epoch": 0.11968413701720773, + "grad_norm": 1.922569990158081, + "learning_rate": 0.00019581181688500918, + "loss": 1.7021, + "step": 3342 + }, + { + "epoch": 0.11971994914677601, + "grad_norm": 1.8692396879196167, + "learning_rate": 0.00019580849459476527, + "loss": 1.5885, + "step": 3343 + }, + { + "epoch": 0.1197557612763443, + "grad_norm": 1.5603022575378418, + "learning_rate": 0.00019580517101554164, + "loss": 1.4791, + "step": 3344 + }, + { + "epoch": 0.11979157340591258, + "grad_norm": 1.3422173261642456, + "learning_rate": 0.00019580184614738299, + "loss": 1.626, + "step": 3345 + }, + { + "epoch": 0.11982738553548086, + "grad_norm": 1.8114674091339111, + "learning_rate": 0.0001957985199903341, + "loss": 1.5095, + "step": 3346 + }, + { + "epoch": 0.11986319766504915, + "grad_norm": 1.594317078590393, + "learning_rate": 0.00019579519254443967, + "loss": 1.7669, + "step": 3347 + }, + { + "epoch": 0.11989900979461744, + "grad_norm": 1.1270697116851807, + "learning_rate": 0.00019579186380974455, + "loss": 1.3657, + "step": 3348 + }, + { + "epoch": 0.11993482192418572, + "grad_norm": 1.7906159162521362, + "learning_rate": 0.0001957885337862934, + "loss": 1.7797, + "step": 3349 + }, + { + "epoch": 0.11997063405375401, + "grad_norm": 1.6569995880126953, + "learning_rate": 0.00019578520247413113, + "loss": 1.7222, + "step": 3350 + }, + { + "epoch": 0.12000644618332229, + "grad_norm": 1.458554744720459, + "learning_rate": 0.0001957818698733025, + "loss": 1.7073, + "step": 3351 + }, + { + "epoch": 0.12004225831289057, + "grad_norm": 1.9747871160507202, + "learning_rate": 0.00019577853598385235, + "loss": 1.495, + "step": 3352 + }, + { + "epoch": 0.12007807044245886, + "grad_norm": 1.7646852731704712, + "learning_rate": 0.00019577520080582556, + "loss": 1.6767, + "step": 3353 + }, + { + "epoch": 0.12011388257202714, + "grad_norm": 1.4243603944778442, + "learning_rate": 0.00019577186433926698, + "loss": 1.8968, + "step": 3354 + }, + { + "epoch": 0.12014969470159544, + "grad_norm": 1.5458492040634155, + "learning_rate": 0.00019576852658422146, + "loss": 1.7073, + "step": 3355 + }, + { + "epoch": 0.12018550683116372, + "grad_norm": 2.496417760848999, + "learning_rate": 0.000195765187540734, + "loss": 2.0073, + "step": 3356 + }, + { + "epoch": 0.120221318960732, + "grad_norm": 2.2858903408050537, + "learning_rate": 0.00019576184720884946, + "loss": 1.893, + "step": 3357 + }, + { + "epoch": 0.12025713109030028, + "grad_norm": 1.6809884309768677, + "learning_rate": 0.00019575850558861278, + "loss": 1.597, + "step": 3358 + }, + { + "epoch": 0.12029294321986857, + "grad_norm": 2.288938283920288, + "learning_rate": 0.00019575516268006892, + "loss": 1.594, + "step": 3359 + }, + { + "epoch": 0.12032875534943685, + "grad_norm": 1.5342596769332886, + "learning_rate": 0.00019575181848326289, + "loss": 1.4432, + "step": 3360 + }, + { + "epoch": 0.12036456747900513, + "grad_norm": 1.6115968227386475, + "learning_rate": 0.00019574847299823965, + "loss": 1.6213, + "step": 3361 + }, + { + "epoch": 0.12040037960857343, + "grad_norm": 1.2919033765792847, + "learning_rate": 0.00019574512622504416, + "loss": 1.6027, + "step": 3362 + }, + { + "epoch": 0.12043619173814171, + "grad_norm": 1.2876076698303223, + "learning_rate": 0.00019574177816372154, + "loss": 1.541, + "step": 3363 + }, + { + "epoch": 0.12047200386771, + "grad_norm": 1.2301689386367798, + "learning_rate": 0.0001957384288143168, + "loss": 1.733, + "step": 3364 + }, + { + "epoch": 0.12050781599727828, + "grad_norm": 2.5293233394622803, + "learning_rate": 0.000195735078176875, + "loss": 1.7144, + "step": 3365 + }, + { + "epoch": 0.12054362812684656, + "grad_norm": 2.2179653644561768, + "learning_rate": 0.0001957317262514412, + "loss": 1.673, + "step": 3366 + }, + { + "epoch": 0.12057944025641484, + "grad_norm": 2.00532603263855, + "learning_rate": 0.00019572837303806048, + "loss": 1.7745, + "step": 3367 + }, + { + "epoch": 0.12061525238598313, + "grad_norm": 1.687103271484375, + "learning_rate": 0.00019572501853677802, + "loss": 1.4363, + "step": 3368 + }, + { + "epoch": 0.12065106451555142, + "grad_norm": 1.7787339687347412, + "learning_rate": 0.0001957216627476389, + "loss": 1.5124, + "step": 3369 + }, + { + "epoch": 0.1206868766451197, + "grad_norm": 2.2750205993652344, + "learning_rate": 0.0001957183056706883, + "loss": 1.6114, + "step": 3370 + }, + { + "epoch": 0.12072268877468799, + "grad_norm": 2.279995918273926, + "learning_rate": 0.0001957149473059713, + "loss": 1.5944, + "step": 3371 + }, + { + "epoch": 0.12075850090425627, + "grad_norm": 1.3898695707321167, + "learning_rate": 0.0001957115876535332, + "loss": 1.7184, + "step": 3372 + }, + { + "epoch": 0.12079431303382455, + "grad_norm": 1.9735623598098755, + "learning_rate": 0.00019570822671341915, + "loss": 1.6018, + "step": 3373 + }, + { + "epoch": 0.12083012516339284, + "grad_norm": 1.697853684425354, + "learning_rate": 0.00019570486448567437, + "loss": 1.6601, + "step": 3374 + }, + { + "epoch": 0.12086593729296112, + "grad_norm": 1.8338154554367065, + "learning_rate": 0.00019570150097034404, + "loss": 1.7257, + "step": 3375 + }, + { + "epoch": 0.12090174942252942, + "grad_norm": 1.4777474403381348, + "learning_rate": 0.0001956981361674735, + "loss": 1.7592, + "step": 3376 + }, + { + "epoch": 0.1209375615520977, + "grad_norm": 1.6526165008544922, + "learning_rate": 0.00019569477007710798, + "loss": 1.714, + "step": 3377 + }, + { + "epoch": 0.12097337368166598, + "grad_norm": 1.5610233545303345, + "learning_rate": 0.00019569140269929276, + "loss": 1.7781, + "step": 3378 + }, + { + "epoch": 0.12100918581123427, + "grad_norm": 1.9586926698684692, + "learning_rate": 0.00019568803403407315, + "loss": 1.7737, + "step": 3379 + }, + { + "epoch": 0.12104499794080255, + "grad_norm": 1.7385483980178833, + "learning_rate": 0.00019568466408149447, + "loss": 1.3575, + "step": 3380 + }, + { + "epoch": 0.12108081007037083, + "grad_norm": 1.839732050895691, + "learning_rate": 0.00019568129284160203, + "loss": 1.5505, + "step": 3381 + }, + { + "epoch": 0.12111662219993911, + "grad_norm": 1.285846471786499, + "learning_rate": 0.00019567792031444125, + "loss": 1.4664, + "step": 3382 + }, + { + "epoch": 0.12115243432950741, + "grad_norm": 1.6386572122573853, + "learning_rate": 0.00019567454650005749, + "loss": 1.6598, + "step": 3383 + }, + { + "epoch": 0.1211882464590757, + "grad_norm": 1.7839397192001343, + "learning_rate": 0.00019567117139849605, + "loss": 1.7173, + "step": 3384 + }, + { + "epoch": 0.12122405858864398, + "grad_norm": 1.632829189300537, + "learning_rate": 0.00019566779500980247, + "loss": 1.3093, + "step": 3385 + }, + { + "epoch": 0.12125987071821226, + "grad_norm": 1.8660407066345215, + "learning_rate": 0.00019566441733402207, + "loss": 1.2346, + "step": 3386 + }, + { + "epoch": 0.12129568284778054, + "grad_norm": 1.885238766670227, + "learning_rate": 0.00019566103837120036, + "loss": 1.3926, + "step": 3387 + }, + { + "epoch": 0.12133149497734882, + "grad_norm": 1.5017540454864502, + "learning_rate": 0.00019565765812138274, + "loss": 1.4036, + "step": 3388 + }, + { + "epoch": 0.12136730710691711, + "grad_norm": 1.5726633071899414, + "learning_rate": 0.00019565427658461474, + "loss": 1.624, + "step": 3389 + }, + { + "epoch": 0.1214031192364854, + "grad_norm": 1.7805520296096802, + "learning_rate": 0.00019565089376094184, + "loss": 1.7118, + "step": 3390 + }, + { + "epoch": 0.12143893136605369, + "grad_norm": 1.9089933633804321, + "learning_rate": 0.0001956475096504095, + "loss": 1.7371, + "step": 3391 + }, + { + "epoch": 0.12147474349562197, + "grad_norm": 1.7601195573806763, + "learning_rate": 0.00019564412425306338, + "loss": 1.9506, + "step": 3392 + }, + { + "epoch": 0.12151055562519025, + "grad_norm": 1.944966197013855, + "learning_rate": 0.00019564073756894889, + "loss": 1.8287, + "step": 3393 + }, + { + "epoch": 0.12154636775475854, + "grad_norm": 2.336825370788574, + "learning_rate": 0.00019563734959811163, + "loss": 1.5147, + "step": 3394 + }, + { + "epoch": 0.12158217988432682, + "grad_norm": 2.415855646133423, + "learning_rate": 0.00019563396034059724, + "loss": 1.4703, + "step": 3395 + }, + { + "epoch": 0.1216179920138951, + "grad_norm": 1.4387824535369873, + "learning_rate": 0.00019563056979645123, + "loss": 1.4512, + "step": 3396 + }, + { + "epoch": 0.12165380414346338, + "grad_norm": 2.0192601680755615, + "learning_rate": 0.00019562717796571929, + "loss": 1.536, + "step": 3397 + }, + { + "epoch": 0.12168961627303168, + "grad_norm": 1.5521689653396606, + "learning_rate": 0.00019562378484844697, + "loss": 1.6861, + "step": 3398 + }, + { + "epoch": 0.12172542840259996, + "grad_norm": 1.3953551054000854, + "learning_rate": 0.00019562039044468, + "loss": 1.713, + "step": 3399 + }, + { + "epoch": 0.12176124053216825, + "grad_norm": 2.014117956161499, + "learning_rate": 0.00019561699475446401, + "loss": 1.4568, + "step": 3400 + }, + { + "epoch": 0.12179705266173653, + "grad_norm": 1.4220621585845947, + "learning_rate": 0.00019561359777784472, + "loss": 1.4072, + "step": 3401 + }, + { + "epoch": 0.12183286479130481, + "grad_norm": 1.6513803005218506, + "learning_rate": 0.0001956101995148678, + "loss": 1.6897, + "step": 3402 + }, + { + "epoch": 0.1218686769208731, + "grad_norm": 1.61029052734375, + "learning_rate": 0.00019560679996557894, + "loss": 1.6601, + "step": 3403 + }, + { + "epoch": 0.12190448905044138, + "grad_norm": 1.2982370853424072, + "learning_rate": 0.00019560339913002396, + "loss": 1.6648, + "step": 3404 + }, + { + "epoch": 0.12194030118000967, + "grad_norm": 1.9422756433486938, + "learning_rate": 0.00019559999700824852, + "loss": 1.8368, + "step": 3405 + }, + { + "epoch": 0.12197611330957796, + "grad_norm": 1.4068212509155273, + "learning_rate": 0.00019559659360029845, + "loss": 1.681, + "step": 3406 + }, + { + "epoch": 0.12201192543914624, + "grad_norm": 2.2662994861602783, + "learning_rate": 0.0001955931889062195, + "loss": 1.7048, + "step": 3407 + }, + { + "epoch": 0.12204773756871452, + "grad_norm": 1.8388558626174927, + "learning_rate": 0.00019558978292605754, + "loss": 1.4593, + "step": 3408 + }, + { + "epoch": 0.1220835496982828, + "grad_norm": 2.5157718658447266, + "learning_rate": 0.00019558637565985834, + "loss": 1.5091, + "step": 3409 + }, + { + "epoch": 0.12211936182785109, + "grad_norm": 1.798851728439331, + "learning_rate": 0.00019558296710766774, + "loss": 1.5104, + "step": 3410 + }, + { + "epoch": 0.12215517395741937, + "grad_norm": 1.3769530057907104, + "learning_rate": 0.00019557955726953163, + "loss": 1.516, + "step": 3411 + }, + { + "epoch": 0.12219098608698767, + "grad_norm": 1.429823398590088, + "learning_rate": 0.00019557614614549586, + "loss": 1.6896, + "step": 3412 + }, + { + "epoch": 0.12222679821655595, + "grad_norm": 2.1437742710113525, + "learning_rate": 0.00019557273373560632, + "loss": 1.8087, + "step": 3413 + }, + { + "epoch": 0.12226261034612423, + "grad_norm": 1.2321090698242188, + "learning_rate": 0.00019556932003990892, + "loss": 1.331, + "step": 3414 + }, + { + "epoch": 0.12229842247569252, + "grad_norm": 1.5394012928009033, + "learning_rate": 0.0001955659050584496, + "loss": 1.6945, + "step": 3415 + }, + { + "epoch": 0.1223342346052608, + "grad_norm": 1.67526376247406, + "learning_rate": 0.0001955624887912743, + "loss": 1.3599, + "step": 3416 + }, + { + "epoch": 0.12237004673482908, + "grad_norm": 1.7456480264663696, + "learning_rate": 0.00019555907123842902, + "loss": 1.6028, + "step": 3417 + }, + { + "epoch": 0.12240585886439737, + "grad_norm": 1.8807971477508545, + "learning_rate": 0.00019555565239995966, + "loss": 1.4512, + "step": 3418 + }, + { + "epoch": 0.12244167099396566, + "grad_norm": 1.4449418783187866, + "learning_rate": 0.00019555223227591225, + "loss": 1.6143, + "step": 3419 + }, + { + "epoch": 0.12247748312353395, + "grad_norm": 1.2141433954238892, + "learning_rate": 0.0001955488108663328, + "loss": 1.6618, + "step": 3420 + }, + { + "epoch": 0.12251329525310223, + "grad_norm": 1.769530177116394, + "learning_rate": 0.00019554538817126739, + "loss": 1.5924, + "step": 3421 + }, + { + "epoch": 0.12254910738267051, + "grad_norm": 1.607627272605896, + "learning_rate": 0.000195541964190762, + "loss": 1.64, + "step": 3422 + }, + { + "epoch": 0.1225849195122388, + "grad_norm": 2.0021603107452393, + "learning_rate": 0.00019553853892486273, + "loss": 1.5966, + "step": 3423 + }, + { + "epoch": 0.12262073164180708, + "grad_norm": 1.4065313339233398, + "learning_rate": 0.00019553511237361564, + "loss": 1.7312, + "step": 3424 + }, + { + "epoch": 0.12265654377137536, + "grad_norm": 2.360572338104248, + "learning_rate": 0.00019553168453706685, + "loss": 1.6099, + "step": 3425 + }, + { + "epoch": 0.12269235590094366, + "grad_norm": 1.807173728942871, + "learning_rate": 0.00019552825541526247, + "loss": 1.4694, + "step": 3426 + }, + { + "epoch": 0.12272816803051194, + "grad_norm": 1.3030319213867188, + "learning_rate": 0.00019552482500824865, + "loss": 1.8069, + "step": 3427 + }, + { + "epoch": 0.12276398016008022, + "grad_norm": 1.7905423641204834, + "learning_rate": 0.0001955213933160715, + "loss": 1.5229, + "step": 3428 + }, + { + "epoch": 0.1227997922896485, + "grad_norm": 2.1524462699890137, + "learning_rate": 0.00019551796033877726, + "loss": 1.3083, + "step": 3429 + }, + { + "epoch": 0.12283560441921679, + "grad_norm": 1.6406817436218262, + "learning_rate": 0.00019551452607641205, + "loss": 1.8781, + "step": 3430 + }, + { + "epoch": 0.12287141654878507, + "grad_norm": 1.5875840187072754, + "learning_rate": 0.0001955110905290221, + "loss": 1.9099, + "step": 3431 + }, + { + "epoch": 0.12290722867835335, + "grad_norm": 1.7724831104278564, + "learning_rate": 0.00019550765369665362, + "loss": 1.4552, + "step": 3432 + }, + { + "epoch": 0.12294304080792165, + "grad_norm": 1.4344149827957153, + "learning_rate": 0.00019550421557935286, + "loss": 1.7283, + "step": 3433 + }, + { + "epoch": 0.12297885293748993, + "grad_norm": 1.5454742908477783, + "learning_rate": 0.00019550077617716606, + "loss": 1.7259, + "step": 3434 + }, + { + "epoch": 0.12301466506705822, + "grad_norm": 1.3281104564666748, + "learning_rate": 0.00019549733549013954, + "loss": 1.674, + "step": 3435 + }, + { + "epoch": 0.1230504771966265, + "grad_norm": 1.5898027420043945, + "learning_rate": 0.0001954938935183195, + "loss": 1.7844, + "step": 3436 + }, + { + "epoch": 0.12308628932619478, + "grad_norm": 1.7751864194869995, + "learning_rate": 0.00019549045026175232, + "loss": 1.6153, + "step": 3437 + }, + { + "epoch": 0.12312210145576306, + "grad_norm": 1.5239864587783813, + "learning_rate": 0.00019548700572048433, + "loss": 1.5856, + "step": 3438 + }, + { + "epoch": 0.12315791358533135, + "grad_norm": 1.585067629814148, + "learning_rate": 0.00019548355989456182, + "loss": 1.5034, + "step": 3439 + }, + { + "epoch": 0.12319372571489964, + "grad_norm": 1.449587345123291, + "learning_rate": 0.0001954801127840312, + "loss": 1.4557, + "step": 3440 + }, + { + "epoch": 0.12322953784446793, + "grad_norm": 1.8552396297454834, + "learning_rate": 0.00019547666438893879, + "loss": 1.4238, + "step": 3441 + }, + { + "epoch": 0.12326534997403621, + "grad_norm": 1.5274230241775513, + "learning_rate": 0.00019547321470933103, + "loss": 1.4542, + "step": 3442 + }, + { + "epoch": 0.12330116210360449, + "grad_norm": 1.3685420751571655, + "learning_rate": 0.00019546976374525433, + "loss": 1.5404, + "step": 3443 + }, + { + "epoch": 0.12333697423317277, + "grad_norm": 1.6194121837615967, + "learning_rate": 0.0001954663114967551, + "loss": 1.2244, + "step": 3444 + }, + { + "epoch": 0.12337278636274106, + "grad_norm": 1.4230560064315796, + "learning_rate": 0.0001954628579638798, + "loss": 1.0127, + "step": 3445 + }, + { + "epoch": 0.12340859849230934, + "grad_norm": 1.3849716186523438, + "learning_rate": 0.0001954594031466749, + "loss": 1.425, + "step": 3446 + }, + { + "epoch": 0.12344441062187762, + "grad_norm": 1.335742473602295, + "learning_rate": 0.00019545594704518682, + "loss": 1.7514, + "step": 3447 + }, + { + "epoch": 0.12348022275144592, + "grad_norm": 1.256226897239685, + "learning_rate": 0.00019545248965946216, + "loss": 1.6755, + "step": 3448 + }, + { + "epoch": 0.1235160348810142, + "grad_norm": 1.8748904466629028, + "learning_rate": 0.00019544903098954732, + "loss": 1.516, + "step": 3449 + }, + { + "epoch": 0.12355184701058249, + "grad_norm": 1.9898053407669067, + "learning_rate": 0.0001954455710354889, + "loss": 1.5757, + "step": 3450 + }, + { + "epoch": 0.12358765914015077, + "grad_norm": 2.141721725463867, + "learning_rate": 0.00019544210979733343, + "loss": 1.9781, + "step": 3451 + }, + { + "epoch": 0.12362347126971905, + "grad_norm": 1.461661696434021, + "learning_rate": 0.0001954386472751275, + "loss": 1.8818, + "step": 3452 + }, + { + "epoch": 0.12365928339928733, + "grad_norm": 2.044703245162964, + "learning_rate": 0.0001954351834689177, + "loss": 1.8225, + "step": 3453 + }, + { + "epoch": 0.12369509552885562, + "grad_norm": 2.4182326793670654, + "learning_rate": 0.0001954317183787506, + "loss": 1.6916, + "step": 3454 + }, + { + "epoch": 0.12373090765842391, + "grad_norm": 1.5588754415512085, + "learning_rate": 0.00019542825200467279, + "loss": 1.7322, + "step": 3455 + }, + { + "epoch": 0.1237667197879922, + "grad_norm": 1.1768525838851929, + "learning_rate": 0.00019542478434673096, + "loss": 1.5594, + "step": 3456 + }, + { + "epoch": 0.12380253191756048, + "grad_norm": 1.7234669923782349, + "learning_rate": 0.00019542131540497174, + "loss": 1.7051, + "step": 3457 + }, + { + "epoch": 0.12383834404712876, + "grad_norm": 1.9066272974014282, + "learning_rate": 0.00019541784517944182, + "loss": 1.7679, + "step": 3458 + }, + { + "epoch": 0.12387415617669705, + "grad_norm": 1.688675045967102, + "learning_rate": 0.0001954143736701879, + "loss": 1.2632, + "step": 3459 + }, + { + "epoch": 0.12390996830626533, + "grad_norm": 1.6447263956069946, + "learning_rate": 0.0001954109008772566, + "loss": 1.7467, + "step": 3460 + }, + { + "epoch": 0.12394578043583361, + "grad_norm": 1.7118195295333862, + "learning_rate": 0.00019540742680069473, + "loss": 1.6544, + "step": 3461 + }, + { + "epoch": 0.12398159256540191, + "grad_norm": 2.0889880657196045, + "learning_rate": 0.000195403951440549, + "loss": 1.8585, + "step": 3462 + }, + { + "epoch": 0.12401740469497019, + "grad_norm": 1.5095477104187012, + "learning_rate": 0.00019540047479686616, + "loss": 1.5888, + "step": 3463 + }, + { + "epoch": 0.12405321682453847, + "grad_norm": 1.3710155487060547, + "learning_rate": 0.00019539699686969302, + "loss": 1.4073, + "step": 3464 + }, + { + "epoch": 0.12408902895410676, + "grad_norm": 1.261867642402649, + "learning_rate": 0.0001953935176590763, + "loss": 1.7322, + "step": 3465 + }, + { + "epoch": 0.12412484108367504, + "grad_norm": 1.395007848739624, + "learning_rate": 0.00019539003716506287, + "loss": 1.5827, + "step": 3466 + }, + { + "epoch": 0.12416065321324332, + "grad_norm": 1.8088184595108032, + "learning_rate": 0.0001953865553876995, + "loss": 2.0731, + "step": 3467 + }, + { + "epoch": 0.1241964653428116, + "grad_norm": 2.2658514976501465, + "learning_rate": 0.00019538307232703313, + "loss": 1.6118, + "step": 3468 + }, + { + "epoch": 0.1242322774723799, + "grad_norm": 1.3344290256500244, + "learning_rate": 0.0001953795879831105, + "loss": 1.6023, + "step": 3469 + }, + { + "epoch": 0.12426808960194818, + "grad_norm": 1.413332462310791, + "learning_rate": 0.00019537610235597857, + "loss": 1.5516, + "step": 3470 + }, + { + "epoch": 0.12430390173151647, + "grad_norm": 1.3064593076705933, + "learning_rate": 0.00019537261544568421, + "loss": 1.3519, + "step": 3471 + }, + { + "epoch": 0.12433971386108475, + "grad_norm": 4.2692060470581055, + "learning_rate": 0.00019536912725227432, + "loss": 1.7083, + "step": 3472 + }, + { + "epoch": 0.12437552599065303, + "grad_norm": 1.505118489265442, + "learning_rate": 0.00019536563777579585, + "loss": 1.597, + "step": 3473 + }, + { + "epoch": 0.12441133812022132, + "grad_norm": 1.7473787069320679, + "learning_rate": 0.0001953621470162957, + "loss": 1.7033, + "step": 3474 + }, + { + "epoch": 0.1244471502497896, + "grad_norm": 1.8221546411514282, + "learning_rate": 0.00019535865497382094, + "loss": 1.4684, + "step": 3475 + }, + { + "epoch": 0.1244829623793579, + "grad_norm": 1.8060365915298462, + "learning_rate": 0.00019535516164841842, + "loss": 1.6477, + "step": 3476 + }, + { + "epoch": 0.12451877450892618, + "grad_norm": 1.790130853652954, + "learning_rate": 0.00019535166704013522, + "loss": 1.5853, + "step": 3477 + }, + { + "epoch": 0.12455458663849446, + "grad_norm": 2.9032599925994873, + "learning_rate": 0.00019534817114901833, + "loss": 1.7533, + "step": 3478 + }, + { + "epoch": 0.12459039876806274, + "grad_norm": 1.4391133785247803, + "learning_rate": 0.0001953446739751148, + "loss": 1.6817, + "step": 3479 + }, + { + "epoch": 0.12462621089763103, + "grad_norm": 1.8639107942581177, + "learning_rate": 0.00019534117551847166, + "loss": 1.6716, + "step": 3480 + }, + { + "epoch": 0.12466202302719931, + "grad_norm": 1.7020066976547241, + "learning_rate": 0.000195337675779136, + "loss": 1.7672, + "step": 3481 + }, + { + "epoch": 0.12469783515676759, + "grad_norm": 2.505441188812256, + "learning_rate": 0.00019533417475715487, + "loss": 1.6561, + "step": 3482 + }, + { + "epoch": 0.12473364728633589, + "grad_norm": 1.5666439533233643, + "learning_rate": 0.0001953306724525754, + "loss": 1.6065, + "step": 3483 + }, + { + "epoch": 0.12476945941590417, + "grad_norm": 1.5428463220596313, + "learning_rate": 0.00019532716886544468, + "loss": 1.4111, + "step": 3484 + }, + { + "epoch": 0.12480527154547245, + "grad_norm": 1.5650322437286377, + "learning_rate": 0.0001953236639958099, + "loss": 1.5319, + "step": 3485 + }, + { + "epoch": 0.12484108367504074, + "grad_norm": 1.1498960256576538, + "learning_rate": 0.00019532015784371818, + "loss": 1.8039, + "step": 3486 + }, + { + "epoch": 0.12487689580460902, + "grad_norm": 1.6411759853363037, + "learning_rate": 0.00019531665040921668, + "loss": 1.812, + "step": 3487 + }, + { + "epoch": 0.1249127079341773, + "grad_norm": 1.3807140588760376, + "learning_rate": 0.00019531314169235259, + "loss": 1.1548, + "step": 3488 + }, + { + "epoch": 0.12494852006374559, + "grad_norm": 1.5563311576843262, + "learning_rate": 0.00019530963169317312, + "loss": 1.2558, + "step": 3489 + }, + { + "epoch": 0.12498433219331388, + "grad_norm": 1.7699103355407715, + "learning_rate": 0.0001953061204117255, + "loss": 1.6063, + "step": 3490 + }, + { + "epoch": 0.12502014432288217, + "grad_norm": 1.5972926616668701, + "learning_rate": 0.00019530260784805697, + "loss": 1.3708, + "step": 3491 + }, + { + "epoch": 0.12505595645245043, + "grad_norm": 1.3855398893356323, + "learning_rate": 0.00019529909400221475, + "loss": 1.3582, + "step": 3492 + }, + { + "epoch": 0.12509176858201873, + "grad_norm": 1.3573073148727417, + "learning_rate": 0.00019529557887424618, + "loss": 1.8298, + "step": 3493 + }, + { + "epoch": 0.12512758071158703, + "grad_norm": 1.5848575830459595, + "learning_rate": 0.00019529206246419854, + "loss": 1.5766, + "step": 3494 + }, + { + "epoch": 0.1251633928411553, + "grad_norm": 1.3935266733169556, + "learning_rate": 0.00019528854477211908, + "loss": 1.5095, + "step": 3495 + }, + { + "epoch": 0.1251992049707236, + "grad_norm": 1.3671562671661377, + "learning_rate": 0.0001952850257980552, + "loss": 1.5969, + "step": 3496 + }, + { + "epoch": 0.12523501710029186, + "grad_norm": 1.2846484184265137, + "learning_rate": 0.00019528150554205419, + "loss": 1.4534, + "step": 3497 + }, + { + "epoch": 0.12527082922986016, + "grad_norm": 1.6687588691711426, + "learning_rate": 0.00019527798400416338, + "loss": 1.5611, + "step": 3498 + }, + { + "epoch": 0.12530664135942843, + "grad_norm": 1.379241704940796, + "learning_rate": 0.00019527446118443025, + "loss": 1.4491, + "step": 3499 + }, + { + "epoch": 0.12534245348899672, + "grad_norm": 1.4438045024871826, + "learning_rate": 0.00019527093708290215, + "loss": 1.6858, + "step": 3500 + }, + { + "epoch": 0.12537826561856502, + "grad_norm": 1.3740583658218384, + "learning_rate": 0.00019526741169962643, + "loss": 1.8549, + "step": 3501 + }, + { + "epoch": 0.1254140777481333, + "grad_norm": 2.178952932357788, + "learning_rate": 0.00019526388503465062, + "loss": 1.8759, + "step": 3502 + }, + { + "epoch": 0.1254498898777016, + "grad_norm": 1.6737861633300781, + "learning_rate": 0.00019526035708802207, + "loss": 1.5266, + "step": 3503 + }, + { + "epoch": 0.12548570200726986, + "grad_norm": 1.7721725702285767, + "learning_rate": 0.00019525682785978833, + "loss": 1.7037, + "step": 3504 + }, + { + "epoch": 0.12552151413683815, + "grad_norm": 1.7052780389785767, + "learning_rate": 0.00019525329734999683, + "loss": 1.7986, + "step": 3505 + }, + { + "epoch": 0.12555732626640642, + "grad_norm": 1.8097865581512451, + "learning_rate": 0.0001952497655586951, + "loss": 1.397, + "step": 3506 + }, + { + "epoch": 0.12559313839597472, + "grad_norm": 1.6215720176696777, + "learning_rate": 0.00019524623248593062, + "loss": 1.4687, + "step": 3507 + }, + { + "epoch": 0.12562895052554302, + "grad_norm": 1.4546492099761963, + "learning_rate": 0.00019524269813175096, + "loss": 1.3458, + "step": 3508 + }, + { + "epoch": 0.12566476265511128, + "grad_norm": 2.1986570358276367, + "learning_rate": 0.00019523916249620363, + "loss": 1.5225, + "step": 3509 + }, + { + "epoch": 0.12570057478467958, + "grad_norm": 3.118424892425537, + "learning_rate": 0.0001952356255793362, + "loss": 1.679, + "step": 3510 + }, + { + "epoch": 0.12573638691424785, + "grad_norm": 1.657961368560791, + "learning_rate": 0.00019523208738119632, + "loss": 1.482, + "step": 3511 + }, + { + "epoch": 0.12577219904381615, + "grad_norm": 1.8398610353469849, + "learning_rate": 0.00019522854790183152, + "loss": 1.649, + "step": 3512 + }, + { + "epoch": 0.12580801117338442, + "grad_norm": 1.3558249473571777, + "learning_rate": 0.00019522500714128942, + "loss": 1.5596, + "step": 3513 + }, + { + "epoch": 0.1258438233029527, + "grad_norm": 1.444551706314087, + "learning_rate": 0.0001952214650996177, + "loss": 1.3201, + "step": 3514 + }, + { + "epoch": 0.125879635432521, + "grad_norm": 1.4728481769561768, + "learning_rate": 0.000195217921776864, + "loss": 1.6087, + "step": 3515 + }, + { + "epoch": 0.12591544756208928, + "grad_norm": 1.3532110452651978, + "learning_rate": 0.000195214377173076, + "loss": 1.6132, + "step": 3516 + }, + { + "epoch": 0.12595125969165757, + "grad_norm": 1.7189764976501465, + "learning_rate": 0.00019521083128830137, + "loss": 1.508, + "step": 3517 + }, + { + "epoch": 0.12598707182122584, + "grad_norm": 1.3803995847702026, + "learning_rate": 0.0001952072841225878, + "loss": 1.7025, + "step": 3518 + }, + { + "epoch": 0.12602288395079414, + "grad_norm": 1.9265297651290894, + "learning_rate": 0.00019520373567598304, + "loss": 1.4115, + "step": 3519 + }, + { + "epoch": 0.1260586960803624, + "grad_norm": 2.2055609226226807, + "learning_rate": 0.0001952001859485348, + "loss": 1.8266, + "step": 3520 + }, + { + "epoch": 0.1260945082099307, + "grad_norm": 1.7643824815750122, + "learning_rate": 0.0001951966349402909, + "loss": 1.5893, + "step": 3521 + }, + { + "epoch": 0.12613032033949897, + "grad_norm": 2.026763677597046, + "learning_rate": 0.00019519308265129903, + "loss": 1.4778, + "step": 3522 + }, + { + "epoch": 0.12616613246906727, + "grad_norm": 1.664231538772583, + "learning_rate": 0.00019518952908160705, + "loss": 1.6736, + "step": 3523 + }, + { + "epoch": 0.12620194459863557, + "grad_norm": 1.765260934829712, + "learning_rate": 0.00019518597423126273, + "loss": 1.665, + "step": 3524 + }, + { + "epoch": 0.12623775672820384, + "grad_norm": 2.3637137413024902, + "learning_rate": 0.0001951824181003139, + "loss": 1.3418, + "step": 3525 + }, + { + "epoch": 0.12627356885777213, + "grad_norm": 1.4499986171722412, + "learning_rate": 0.00019517886068880843, + "loss": 1.6264, + "step": 3526 + }, + { + "epoch": 0.1263093809873404, + "grad_norm": 1.7675611972808838, + "learning_rate": 0.00019517530199679415, + "loss": 1.6114, + "step": 3527 + }, + { + "epoch": 0.1263451931169087, + "grad_norm": 1.6797280311584473, + "learning_rate": 0.00019517174202431895, + "loss": 1.4893, + "step": 3528 + }, + { + "epoch": 0.12638100524647697, + "grad_norm": 1.2751450538635254, + "learning_rate": 0.00019516818077143071, + "loss": 1.6812, + "step": 3529 + }, + { + "epoch": 0.12641681737604527, + "grad_norm": 2.196251630783081, + "learning_rate": 0.00019516461823817737, + "loss": 1.3997, + "step": 3530 + }, + { + "epoch": 0.12645262950561356, + "grad_norm": 2.467259645462036, + "learning_rate": 0.00019516105442460684, + "loss": 1.4916, + "step": 3531 + }, + { + "epoch": 0.12648844163518183, + "grad_norm": 1.8031963109970093, + "learning_rate": 0.0001951574893307671, + "loss": 1.8051, + "step": 3532 + }, + { + "epoch": 0.12652425376475013, + "grad_norm": 1.3685870170593262, + "learning_rate": 0.00019515392295670604, + "loss": 1.4656, + "step": 3533 + }, + { + "epoch": 0.1265600658943184, + "grad_norm": 1.3247365951538086, + "learning_rate": 0.00019515035530247172, + "loss": 1.5153, + "step": 3534 + }, + { + "epoch": 0.1265958780238867, + "grad_norm": 1.341870665550232, + "learning_rate": 0.0001951467863681121, + "loss": 1.5531, + "step": 3535 + }, + { + "epoch": 0.12663169015345496, + "grad_norm": 1.5447837114334106, + "learning_rate": 0.00019514321615367517, + "loss": 1.8168, + "step": 3536 + }, + { + "epoch": 0.12666750228302326, + "grad_norm": 1.3823332786560059, + "learning_rate": 0.000195139644659209, + "loss": 1.6153, + "step": 3537 + }, + { + "epoch": 0.12670331441259156, + "grad_norm": 1.7624273300170898, + "learning_rate": 0.00019513607188476168, + "loss": 1.5328, + "step": 3538 + }, + { + "epoch": 0.12673912654215982, + "grad_norm": 1.4588311910629272, + "learning_rate": 0.00019513249783038118, + "loss": 1.7005, + "step": 3539 + }, + { + "epoch": 0.12677493867172812, + "grad_norm": 2.036802291870117, + "learning_rate": 0.00019512892249611566, + "loss": 1.8805, + "step": 3540 + }, + { + "epoch": 0.1268107508012964, + "grad_norm": 2.2579665184020996, + "learning_rate": 0.00019512534588201318, + "loss": 1.3699, + "step": 3541 + }, + { + "epoch": 0.1268465629308647, + "grad_norm": 2.078040599822998, + "learning_rate": 0.00019512176798812189, + "loss": 1.6924, + "step": 3542 + }, + { + "epoch": 0.12688237506043296, + "grad_norm": 1.5221152305603027, + "learning_rate": 0.0001951181888144899, + "loss": 1.8049, + "step": 3543 + }, + { + "epoch": 0.12691818719000125, + "grad_norm": 1.3810714483261108, + "learning_rate": 0.00019511460836116537, + "loss": 1.6305, + "step": 3544 + }, + { + "epoch": 0.12695399931956955, + "grad_norm": 1.5885570049285889, + "learning_rate": 0.00019511102662819648, + "loss": 1.6947, + "step": 3545 + }, + { + "epoch": 0.12698981144913782, + "grad_norm": 2.6954009532928467, + "learning_rate": 0.0001951074436156314, + "loss": 1.5527, + "step": 3546 + }, + { + "epoch": 0.12702562357870611, + "grad_norm": 2.2410197257995605, + "learning_rate": 0.00019510385932351837, + "loss": 1.5252, + "step": 3547 + }, + { + "epoch": 0.12706143570827438, + "grad_norm": 1.5985091924667358, + "learning_rate": 0.00019510027375190556, + "loss": 1.6583, + "step": 3548 + }, + { + "epoch": 0.12709724783784268, + "grad_norm": 1.7822718620300293, + "learning_rate": 0.00019509668690084126, + "loss": 1.5017, + "step": 3549 + }, + { + "epoch": 0.12713305996741095, + "grad_norm": 1.6479328870773315, + "learning_rate": 0.00019509309877037369, + "loss": 1.4035, + "step": 3550 + }, + { + "epoch": 0.12716887209697925, + "grad_norm": 2.219636917114258, + "learning_rate": 0.00019508950936055115, + "loss": 1.8554, + "step": 3551 + }, + { + "epoch": 0.12720468422654754, + "grad_norm": 2.312058925628662, + "learning_rate": 0.0001950859186714219, + "loss": 1.6133, + "step": 3552 + }, + { + "epoch": 0.1272404963561158, + "grad_norm": 1.3517247438430786, + "learning_rate": 0.00019508232670303427, + "loss": 1.5924, + "step": 3553 + }, + { + "epoch": 0.1272763084856841, + "grad_norm": 1.835817813873291, + "learning_rate": 0.00019507873345543658, + "loss": 1.4991, + "step": 3554 + }, + { + "epoch": 0.12731212061525238, + "grad_norm": 1.174762487411499, + "learning_rate": 0.00019507513892867717, + "loss": 1.5491, + "step": 3555 + }, + { + "epoch": 0.12734793274482067, + "grad_norm": 1.6903138160705566, + "learning_rate": 0.0001950715431228044, + "loss": 1.526, + "step": 3556 + }, + { + "epoch": 0.12738374487438894, + "grad_norm": 2.1673574447631836, + "learning_rate": 0.0001950679460378667, + "loss": 1.7603, + "step": 3557 + }, + { + "epoch": 0.12741955700395724, + "grad_norm": 1.705032229423523, + "learning_rate": 0.00019506434767391237, + "loss": 1.7594, + "step": 3558 + }, + { + "epoch": 0.12745536913352554, + "grad_norm": 1.64605712890625, + "learning_rate": 0.00019506074803098987, + "loss": 1.9528, + "step": 3559 + }, + { + "epoch": 0.1274911812630938, + "grad_norm": 1.3241467475891113, + "learning_rate": 0.00019505714710914764, + "loss": 1.5991, + "step": 3560 + }, + { + "epoch": 0.1275269933926621, + "grad_norm": 2.728872299194336, + "learning_rate": 0.0001950535449084341, + "loss": 1.6143, + "step": 3561 + }, + { + "epoch": 0.12756280552223037, + "grad_norm": 1.7435359954833984, + "learning_rate": 0.0001950499414288977, + "loss": 1.4116, + "step": 3562 + }, + { + "epoch": 0.12759861765179867, + "grad_norm": 1.7278475761413574, + "learning_rate": 0.000195046336670587, + "loss": 1.6202, + "step": 3563 + }, + { + "epoch": 0.12763442978136694, + "grad_norm": 1.4856594800949097, + "learning_rate": 0.0001950427306335504, + "loss": 1.85, + "step": 3564 + }, + { + "epoch": 0.12767024191093523, + "grad_norm": 1.2689180374145508, + "learning_rate": 0.00019503912331783648, + "loss": 1.5936, + "step": 3565 + }, + { + "epoch": 0.12770605404050353, + "grad_norm": 2.304764747619629, + "learning_rate": 0.00019503551472349373, + "loss": 1.5894, + "step": 3566 + }, + { + "epoch": 0.1277418661700718, + "grad_norm": 1.8257946968078613, + "learning_rate": 0.0001950319048505707, + "loss": 1.3694, + "step": 3567 + }, + { + "epoch": 0.1277776782996401, + "grad_norm": 1.283836841583252, + "learning_rate": 0.000195028293699116, + "loss": 1.537, + "step": 3568 + }, + { + "epoch": 0.12781349042920837, + "grad_norm": 2.9690396785736084, + "learning_rate": 0.0001950246812691782, + "loss": 1.5115, + "step": 3569 + }, + { + "epoch": 0.12784930255877666, + "grad_norm": 2.2096030712127686, + "learning_rate": 0.00019502106756080583, + "loss": 1.8623, + "step": 3570 + }, + { + "epoch": 0.12788511468834493, + "grad_norm": 1.6043503284454346, + "learning_rate": 0.00019501745257404762, + "loss": 1.5075, + "step": 3571 + }, + { + "epoch": 0.12792092681791323, + "grad_norm": 1.8550293445587158, + "learning_rate": 0.00019501383630895211, + "loss": 1.6319, + "step": 3572 + }, + { + "epoch": 0.12795673894748152, + "grad_norm": 1.1330084800720215, + "learning_rate": 0.00019501021876556802, + "loss": 1.7045, + "step": 3573 + }, + { + "epoch": 0.1279925510770498, + "grad_norm": 3.4932503700256348, + "learning_rate": 0.00019500659994394398, + "loss": 1.5967, + "step": 3574 + }, + { + "epoch": 0.1280283632066181, + "grad_norm": 2.301506280899048, + "learning_rate": 0.0001950029798441287, + "loss": 1.7702, + "step": 3575 + }, + { + "epoch": 0.12806417533618636, + "grad_norm": 1.4132013320922852, + "learning_rate": 0.00019499935846617084, + "loss": 1.8051, + "step": 3576 + }, + { + "epoch": 0.12809998746575466, + "grad_norm": 1.813435435295105, + "learning_rate": 0.0001949957358101192, + "loss": 1.5849, + "step": 3577 + }, + { + "epoch": 0.12813579959532292, + "grad_norm": 1.2465287446975708, + "learning_rate": 0.00019499211187602242, + "loss": 1.5895, + "step": 3578 + }, + { + "epoch": 0.12817161172489122, + "grad_norm": 1.4146263599395752, + "learning_rate": 0.0001949884866639293, + "loss": 1.9067, + "step": 3579 + }, + { + "epoch": 0.12820742385445952, + "grad_norm": 1.7771110534667969, + "learning_rate": 0.00019498486017388865, + "loss": 1.59, + "step": 3580 + }, + { + "epoch": 0.1282432359840278, + "grad_norm": 2.2220492362976074, + "learning_rate": 0.00019498123240594924, + "loss": 1.4147, + "step": 3581 + }, + { + "epoch": 0.12827904811359608, + "grad_norm": 2.194570779800415, + "learning_rate": 0.00019497760336015984, + "loss": 1.7702, + "step": 3582 + }, + { + "epoch": 0.12831486024316435, + "grad_norm": 1.2068736553192139, + "learning_rate": 0.0001949739730365693, + "loss": 1.6503, + "step": 3583 + }, + { + "epoch": 0.12835067237273265, + "grad_norm": 1.3843286037445068, + "learning_rate": 0.0001949703414352265, + "loss": 1.6336, + "step": 3584 + }, + { + "epoch": 0.12838648450230092, + "grad_norm": 1.7587592601776123, + "learning_rate": 0.0001949667085561802, + "loss": 1.6254, + "step": 3585 + }, + { + "epoch": 0.12842229663186921, + "grad_norm": 2.3941118717193604, + "learning_rate": 0.00019496307439947937, + "loss": 1.7615, + "step": 3586 + }, + { + "epoch": 0.1284581087614375, + "grad_norm": 1.4519942998886108, + "learning_rate": 0.00019495943896517286, + "loss": 1.3183, + "step": 3587 + }, + { + "epoch": 0.12849392089100578, + "grad_norm": 1.6823445558547974, + "learning_rate": 0.0001949558022533096, + "loss": 1.4198, + "step": 3588 + }, + { + "epoch": 0.12852973302057408, + "grad_norm": 1.5577532052993774, + "learning_rate": 0.00019495216426393847, + "loss": 1.6609, + "step": 3589 + }, + { + "epoch": 0.12856554515014235, + "grad_norm": 1.4570772647857666, + "learning_rate": 0.0001949485249971085, + "loss": 1.5659, + "step": 3590 + }, + { + "epoch": 0.12860135727971064, + "grad_norm": 1.4191895723342896, + "learning_rate": 0.00019494488445286856, + "loss": 1.4708, + "step": 3591 + }, + { + "epoch": 0.1286371694092789, + "grad_norm": 1.8212336301803589, + "learning_rate": 0.00019494124263126766, + "loss": 1.5248, + "step": 3592 + }, + { + "epoch": 0.1286729815388472, + "grad_norm": 1.7171404361724854, + "learning_rate": 0.00019493759953235484, + "loss": 1.7069, + "step": 3593 + }, + { + "epoch": 0.1287087936684155, + "grad_norm": 1.546820878982544, + "learning_rate": 0.00019493395515617908, + "loss": 1.3911, + "step": 3594 + }, + { + "epoch": 0.12874460579798377, + "grad_norm": 1.5426114797592163, + "learning_rate": 0.00019493030950278937, + "loss": 1.7267, + "step": 3595 + }, + { + "epoch": 0.12878041792755207, + "grad_norm": 1.4243674278259277, + "learning_rate": 0.00019492666257223484, + "loss": 1.8226, + "step": 3596 + }, + { + "epoch": 0.12881623005712034, + "grad_norm": 1.364989161491394, + "learning_rate": 0.00019492301436456447, + "loss": 1.6399, + "step": 3597 + }, + { + "epoch": 0.12885204218668864, + "grad_norm": 2.2802083492279053, + "learning_rate": 0.00019491936487982744, + "loss": 1.6192, + "step": 3598 + }, + { + "epoch": 0.1288878543162569, + "grad_norm": 1.3957548141479492, + "learning_rate": 0.00019491571411807274, + "loss": 1.6369, + "step": 3599 + }, + { + "epoch": 0.1289236664458252, + "grad_norm": 1.914682149887085, + "learning_rate": 0.00019491206207934955, + "loss": 1.7215, + "step": 3600 + }, + { + "epoch": 0.1289594785753935, + "grad_norm": 2.625183582305908, + "learning_rate": 0.00019490840876370703, + "loss": 1.367, + "step": 3601 + }, + { + "epoch": 0.12899529070496177, + "grad_norm": 1.3641928434371948, + "learning_rate": 0.00019490475417119425, + "loss": 1.7807, + "step": 3602 + }, + { + "epoch": 0.12903110283453006, + "grad_norm": 1.6439179182052612, + "learning_rate": 0.00019490109830186042, + "loss": 1.4354, + "step": 3603 + }, + { + "epoch": 0.12906691496409833, + "grad_norm": 1.4001127481460571, + "learning_rate": 0.00019489744115575475, + "loss": 1.5427, + "step": 3604 + }, + { + "epoch": 0.12910272709366663, + "grad_norm": 1.2241640090942383, + "learning_rate": 0.00019489378273292643, + "loss": 1.4639, + "step": 3605 + }, + { + "epoch": 0.1291385392232349, + "grad_norm": 1.4079821109771729, + "learning_rate": 0.00019489012303342462, + "loss": 1.6541, + "step": 3606 + }, + { + "epoch": 0.1291743513528032, + "grad_norm": 2.0953893661499023, + "learning_rate": 0.00019488646205729864, + "loss": 1.5438, + "step": 3607 + }, + { + "epoch": 0.1292101634823715, + "grad_norm": 2.2923357486724854, + "learning_rate": 0.00019488279980459772, + "loss": 1.6912, + "step": 3608 + }, + { + "epoch": 0.12924597561193976, + "grad_norm": 1.4191970825195312, + "learning_rate": 0.00019487913627537108, + "loss": 1.8267, + "step": 3609 + }, + { + "epoch": 0.12928178774150806, + "grad_norm": 2.6938745975494385, + "learning_rate": 0.00019487547146966808, + "loss": 1.7512, + "step": 3610 + }, + { + "epoch": 0.12931759987107633, + "grad_norm": 2.0451958179473877, + "learning_rate": 0.00019487180538753796, + "loss": 1.7794, + "step": 3611 + }, + { + "epoch": 0.12935341200064462, + "grad_norm": 1.861045241355896, + "learning_rate": 0.0001948681380290301, + "loss": 1.8672, + "step": 3612 + }, + { + "epoch": 0.1293892241302129, + "grad_norm": 1.9906060695648193, + "learning_rate": 0.0001948644693941938, + "loss": 1.4666, + "step": 3613 + }, + { + "epoch": 0.1294250362597812, + "grad_norm": 2.1408982276916504, + "learning_rate": 0.00019486079948307844, + "loss": 1.5275, + "step": 3614 + }, + { + "epoch": 0.12946084838934946, + "grad_norm": 1.689831018447876, + "learning_rate": 0.00019485712829573338, + "loss": 1.6761, + "step": 3615 + }, + { + "epoch": 0.12949666051891776, + "grad_norm": 1.6339595317840576, + "learning_rate": 0.000194853455832208, + "loss": 1.477, + "step": 3616 + }, + { + "epoch": 0.12953247264848605, + "grad_norm": 1.2168866395950317, + "learning_rate": 0.00019484978209255175, + "loss": 1.1758, + "step": 3617 + }, + { + "epoch": 0.12956828477805432, + "grad_norm": 3.002838134765625, + "learning_rate": 0.00019484610707681403, + "loss": 1.5886, + "step": 3618 + }, + { + "epoch": 0.12960409690762262, + "grad_norm": 1.7156847715377808, + "learning_rate": 0.00019484243078504428, + "loss": 1.5088, + "step": 3619 + }, + { + "epoch": 0.1296399090371909, + "grad_norm": 1.3578027486801147, + "learning_rate": 0.00019483875321729194, + "loss": 1.5774, + "step": 3620 + }, + { + "epoch": 0.12967572116675918, + "grad_norm": 1.3243157863616943, + "learning_rate": 0.00019483507437360653, + "loss": 1.7064, + "step": 3621 + }, + { + "epoch": 0.12971153329632745, + "grad_norm": 2.185899257659912, + "learning_rate": 0.0001948313942540375, + "loss": 1.7627, + "step": 3622 + }, + { + "epoch": 0.12974734542589575, + "grad_norm": 1.9846500158309937, + "learning_rate": 0.00019482771285863438, + "loss": 1.5378, + "step": 3623 + }, + { + "epoch": 0.12978315755546405, + "grad_norm": 2.1353330612182617, + "learning_rate": 0.00019482403018744674, + "loss": 1.6318, + "step": 3624 + }, + { + "epoch": 0.12981896968503231, + "grad_norm": 1.6027969121932983, + "learning_rate": 0.00019482034624052408, + "loss": 1.935, + "step": 3625 + }, + { + "epoch": 0.1298547818146006, + "grad_norm": 1.7356218099594116, + "learning_rate": 0.00019481666101791594, + "loss": 1.593, + "step": 3626 + }, + { + "epoch": 0.12989059394416888, + "grad_norm": 1.551705002784729, + "learning_rate": 0.00019481297451967195, + "loss": 1.7691, + "step": 3627 + }, + { + "epoch": 0.12992640607373718, + "grad_norm": 2.0163631439208984, + "learning_rate": 0.0001948092867458417, + "loss": 1.4032, + "step": 3628 + }, + { + "epoch": 0.12996221820330545, + "grad_norm": 1.6839312314987183, + "learning_rate": 0.00019480559769647477, + "loss": 1.8452, + "step": 3629 + }, + { + "epoch": 0.12999803033287374, + "grad_norm": 1.5859227180480957, + "learning_rate": 0.00019480190737162083, + "loss": 1.5909, + "step": 3630 + }, + { + "epoch": 0.13003384246244204, + "grad_norm": 1.7990963459014893, + "learning_rate": 0.0001947982157713295, + "loss": 1.5817, + "step": 3631 + }, + { + "epoch": 0.1300696545920103, + "grad_norm": 1.4903086423873901, + "learning_rate": 0.00019479452289565048, + "loss": 1.8922, + "step": 3632 + }, + { + "epoch": 0.1301054667215786, + "grad_norm": 1.210903525352478, + "learning_rate": 0.00019479082874463338, + "loss": 1.5365, + "step": 3633 + }, + { + "epoch": 0.13014127885114687, + "grad_norm": 1.3731626272201538, + "learning_rate": 0.000194787133318328, + "loss": 1.7997, + "step": 3634 + }, + { + "epoch": 0.13017709098071517, + "grad_norm": 1.7975584268569946, + "learning_rate": 0.000194783436616784, + "loss": 1.4367, + "step": 3635 + }, + { + "epoch": 0.13021290311028344, + "grad_norm": 1.2568280696868896, + "learning_rate": 0.00019477973864005113, + "loss": 1.8788, + "step": 3636 + }, + { + "epoch": 0.13024871523985174, + "grad_norm": 1.7492560148239136, + "learning_rate": 0.0001947760393881791, + "loss": 1.3877, + "step": 3637 + }, + { + "epoch": 0.13028452736942003, + "grad_norm": 2.4165797233581543, + "learning_rate": 0.00019477233886121772, + "loss": 1.7056, + "step": 3638 + }, + { + "epoch": 0.1303203394989883, + "grad_norm": 1.5210332870483398, + "learning_rate": 0.00019476863705921677, + "loss": 1.8208, + "step": 3639 + }, + { + "epoch": 0.1303561516285566, + "grad_norm": 1.419649600982666, + "learning_rate": 0.00019476493398222608, + "loss": 1.4835, + "step": 3640 + }, + { + "epoch": 0.13039196375812487, + "grad_norm": 2.1755502223968506, + "learning_rate": 0.0001947612296302954, + "loss": 1.441, + "step": 3641 + }, + { + "epoch": 0.13042777588769316, + "grad_norm": 1.332960844039917, + "learning_rate": 0.00019475752400347464, + "loss": 1.6522, + "step": 3642 + }, + { + "epoch": 0.13046358801726143, + "grad_norm": 1.6385389566421509, + "learning_rate": 0.00019475381710181363, + "loss": 1.3294, + "step": 3643 + }, + { + "epoch": 0.13049940014682973, + "grad_norm": 2.0935511589050293, + "learning_rate": 0.0001947501089253622, + "loss": 1.9683, + "step": 3644 + }, + { + "epoch": 0.13053521227639803, + "grad_norm": 1.5718079805374146, + "learning_rate": 0.00019474639947417028, + "loss": 1.4642, + "step": 3645 + }, + { + "epoch": 0.1305710244059663, + "grad_norm": 1.8546572923660278, + "learning_rate": 0.0001947426887482878, + "loss": 1.6069, + "step": 3646 + }, + { + "epoch": 0.1306068365355346, + "grad_norm": 1.3184876441955566, + "learning_rate": 0.0001947389767477646, + "loss": 1.666, + "step": 3647 + }, + { + "epoch": 0.13064264866510286, + "grad_norm": 1.41603422164917, + "learning_rate": 0.00019473526347265073, + "loss": 1.588, + "step": 3648 + }, + { + "epoch": 0.13067846079467116, + "grad_norm": 1.5340843200683594, + "learning_rate": 0.00019473154892299608, + "loss": 1.848, + "step": 3649 + }, + { + "epoch": 0.13071427292423943, + "grad_norm": 2.2189242839813232, + "learning_rate": 0.00019472783309885057, + "loss": 1.9326, + "step": 3650 + }, + { + "epoch": 0.13075008505380772, + "grad_norm": 2.0368833541870117, + "learning_rate": 0.0001947241160002643, + "loss": 1.7715, + "step": 3651 + }, + { + "epoch": 0.13078589718337602, + "grad_norm": 1.550632357597351, + "learning_rate": 0.00019472039762728728, + "loss": 1.5666, + "step": 3652 + }, + { + "epoch": 0.1308217093129443, + "grad_norm": 1.8009685277938843, + "learning_rate": 0.00019471667797996944, + "loss": 1.5932, + "step": 3653 + }, + { + "epoch": 0.1308575214425126, + "grad_norm": 1.6331686973571777, + "learning_rate": 0.00019471295705836088, + "loss": 1.4965, + "step": 3654 + }, + { + "epoch": 0.13089333357208086, + "grad_norm": 1.6497012376785278, + "learning_rate": 0.00019470923486251165, + "loss": 1.638, + "step": 3655 + }, + { + "epoch": 0.13092914570164915, + "grad_norm": 1.526160717010498, + "learning_rate": 0.00019470551139247184, + "loss": 1.7185, + "step": 3656 + }, + { + "epoch": 0.13096495783121742, + "grad_norm": 1.3786871433258057, + "learning_rate": 0.00019470178664829154, + "loss": 1.6264, + "step": 3657 + }, + { + "epoch": 0.13100076996078572, + "grad_norm": 1.2619601488113403, + "learning_rate": 0.00019469806063002082, + "loss": 1.5839, + "step": 3658 + }, + { + "epoch": 0.13103658209035401, + "grad_norm": 2.778254270553589, + "learning_rate": 0.0001946943333377099, + "loss": 1.5246, + "step": 3659 + }, + { + "epoch": 0.13107239421992228, + "grad_norm": 1.5448185205459595, + "learning_rate": 0.00019469060477140886, + "loss": 1.4715, + "step": 3660 + }, + { + "epoch": 0.13110820634949058, + "grad_norm": 1.5126322507858276, + "learning_rate": 0.00019468687493116784, + "loss": 1.6236, + "step": 3661 + }, + { + "epoch": 0.13114401847905885, + "grad_norm": 2.1006526947021484, + "learning_rate": 0.00019468314381703708, + "loss": 1.8905, + "step": 3662 + }, + { + "epoch": 0.13117983060862715, + "grad_norm": 1.9070483446121216, + "learning_rate": 0.00019467941142906674, + "loss": 1.6353, + "step": 3663 + }, + { + "epoch": 0.13121564273819541, + "grad_norm": 1.40544855594635, + "learning_rate": 0.00019467567776730707, + "loss": 1.5716, + "step": 3664 + }, + { + "epoch": 0.1312514548677637, + "grad_norm": 1.3862591981887817, + "learning_rate": 0.00019467194283180828, + "loss": 1.5309, + "step": 3665 + }, + { + "epoch": 0.131287266997332, + "grad_norm": 1.2076510190963745, + "learning_rate": 0.0001946682066226206, + "loss": 1.585, + "step": 3666 + }, + { + "epoch": 0.13132307912690028, + "grad_norm": 2.244685411453247, + "learning_rate": 0.0001946644691397943, + "loss": 1.6969, + "step": 3667 + }, + { + "epoch": 0.13135889125646857, + "grad_norm": 1.1528452634811401, + "learning_rate": 0.00019466073038337968, + "loss": 1.2363, + "step": 3668 + }, + { + "epoch": 0.13139470338603684, + "grad_norm": 1.7110793590545654, + "learning_rate": 0.00019465699035342706, + "loss": 1.6413, + "step": 3669 + }, + { + "epoch": 0.13143051551560514, + "grad_norm": 1.4075368642807007, + "learning_rate": 0.00019465324904998672, + "loss": 1.6554, + "step": 3670 + }, + { + "epoch": 0.1314663276451734, + "grad_norm": 1.4957016706466675, + "learning_rate": 0.000194649506473109, + "loss": 1.6191, + "step": 3671 + }, + { + "epoch": 0.1315021397747417, + "grad_norm": 1.4293609857559204, + "learning_rate": 0.00019464576262284426, + "loss": 1.3673, + "step": 3672 + }, + { + "epoch": 0.13153795190431, + "grad_norm": 1.4260005950927734, + "learning_rate": 0.00019464201749924288, + "loss": 1.7648, + "step": 3673 + }, + { + "epoch": 0.13157376403387827, + "grad_norm": 1.628811240196228, + "learning_rate": 0.00019463827110235523, + "loss": 1.5441, + "step": 3674 + }, + { + "epoch": 0.13160957616344657, + "grad_norm": 2.7041733264923096, + "learning_rate": 0.00019463452343223173, + "loss": 1.7595, + "step": 3675 + }, + { + "epoch": 0.13164538829301484, + "grad_norm": 4.314607620239258, + "learning_rate": 0.00019463077448892278, + "loss": 1.4025, + "step": 3676 + }, + { + "epoch": 0.13168120042258313, + "grad_norm": 1.7181106805801392, + "learning_rate": 0.0001946270242724788, + "loss": 1.495, + "step": 3677 + }, + { + "epoch": 0.1317170125521514, + "grad_norm": 1.686544418334961, + "learning_rate": 0.0001946232727829503, + "loss": 1.727, + "step": 3678 + }, + { + "epoch": 0.1317528246817197, + "grad_norm": 1.6624183654785156, + "learning_rate": 0.00019461952002038771, + "loss": 1.7125, + "step": 3679 + }, + { + "epoch": 0.131788636811288, + "grad_norm": 2.172788381576538, + "learning_rate": 0.0001946157659848415, + "loss": 2.2068, + "step": 3680 + }, + { + "epoch": 0.13182444894085626, + "grad_norm": 1.7004787921905518, + "learning_rate": 0.00019461201067636226, + "loss": 1.8583, + "step": 3681 + }, + { + "epoch": 0.13186026107042456, + "grad_norm": 1.7210794687271118, + "learning_rate": 0.00019460825409500042, + "loss": 1.7554, + "step": 3682 + }, + { + "epoch": 0.13189607319999283, + "grad_norm": 1.6098159551620483, + "learning_rate": 0.00019460449624080655, + "loss": 1.6517, + "step": 3683 + }, + { + "epoch": 0.13193188532956113, + "grad_norm": 2.078428030014038, + "learning_rate": 0.00019460073711383125, + "loss": 1.4448, + "step": 3684 + }, + { + "epoch": 0.1319676974591294, + "grad_norm": 1.5644986629486084, + "learning_rate": 0.00019459697671412503, + "loss": 1.7364, + "step": 3685 + }, + { + "epoch": 0.1320035095886977, + "grad_norm": 1.9476972818374634, + "learning_rate": 0.0001945932150417385, + "loss": 1.4418, + "step": 3686 + }, + { + "epoch": 0.132039321718266, + "grad_norm": 2.0070180892944336, + "learning_rate": 0.0001945894520967223, + "loss": 1.8668, + "step": 3687 + }, + { + "epoch": 0.13207513384783426, + "grad_norm": 2.056800127029419, + "learning_rate": 0.00019458568787912703, + "loss": 1.6064, + "step": 3688 + }, + { + "epoch": 0.13211094597740256, + "grad_norm": 1.6706658601760864, + "learning_rate": 0.00019458192238900335, + "loss": 1.5147, + "step": 3689 + }, + { + "epoch": 0.13214675810697082, + "grad_norm": 1.309191346168518, + "learning_rate": 0.00019457815562640187, + "loss": 1.1176, + "step": 3690 + }, + { + "epoch": 0.13218257023653912, + "grad_norm": 1.6902896165847778, + "learning_rate": 0.00019457438759137334, + "loss": 1.4857, + "step": 3691 + }, + { + "epoch": 0.1322183823661074, + "grad_norm": 1.9160804748535156, + "learning_rate": 0.00019457061828396838, + "loss": 1.4037, + "step": 3692 + }, + { + "epoch": 0.1322541944956757, + "grad_norm": 1.564273476600647, + "learning_rate": 0.00019456684770423777, + "loss": 1.6198, + "step": 3693 + }, + { + "epoch": 0.13229000662524398, + "grad_norm": 1.7293407917022705, + "learning_rate": 0.00019456307585223218, + "loss": 1.5622, + "step": 3694 + }, + { + "epoch": 0.13232581875481225, + "grad_norm": 1.3648868799209595, + "learning_rate": 0.00019455930272800243, + "loss": 1.507, + "step": 3695 + }, + { + "epoch": 0.13236163088438055, + "grad_norm": 1.312423825263977, + "learning_rate": 0.00019455552833159918, + "loss": 1.7455, + "step": 3696 + }, + { + "epoch": 0.13239744301394882, + "grad_norm": 1.8402488231658936, + "learning_rate": 0.00019455175266307328, + "loss": 1.6363, + "step": 3697 + }, + { + "epoch": 0.13243325514351711, + "grad_norm": 1.179580807685852, + "learning_rate": 0.00019454797572247552, + "loss": 1.6652, + "step": 3698 + }, + { + "epoch": 0.13246906727308538, + "grad_norm": 1.4366496801376343, + "learning_rate": 0.0001945441975098567, + "loss": 1.7087, + "step": 3699 + }, + { + "epoch": 0.13250487940265368, + "grad_norm": 1.816963791847229, + "learning_rate": 0.00019454041802526766, + "loss": 1.7501, + "step": 3700 + }, + { + "epoch": 0.13254069153222198, + "grad_norm": 1.6543554067611694, + "learning_rate": 0.00019453663726875923, + "loss": 1.6763, + "step": 3701 + }, + { + "epoch": 0.13257650366179025, + "grad_norm": 1.448135256767273, + "learning_rate": 0.0001945328552403823, + "loss": 1.6681, + "step": 3702 + }, + { + "epoch": 0.13261231579135854, + "grad_norm": 1.8160815238952637, + "learning_rate": 0.00019452907194018776, + "loss": 1.5419, + "step": 3703 + }, + { + "epoch": 0.1326481279209268, + "grad_norm": 1.41280198097229, + "learning_rate": 0.00019452528736822646, + "loss": 1.7881, + "step": 3704 + }, + { + "epoch": 0.1326839400504951, + "grad_norm": 1.6581374406814575, + "learning_rate": 0.00019452150152454936, + "loss": 2.032, + "step": 3705 + }, + { + "epoch": 0.13271975218006338, + "grad_norm": 1.9039281606674194, + "learning_rate": 0.0001945177144092074, + "loss": 1.3711, + "step": 3706 + }, + { + "epoch": 0.13275556430963167, + "grad_norm": 2.1422500610351562, + "learning_rate": 0.0001945139260222515, + "loss": 1.5075, + "step": 3707 + }, + { + "epoch": 0.13279137643919997, + "grad_norm": 2.3555521965026855, + "learning_rate": 0.00019451013636373262, + "loss": 1.6693, + "step": 3708 + }, + { + "epoch": 0.13282718856876824, + "grad_norm": 1.4542937278747559, + "learning_rate": 0.00019450634543370177, + "loss": 1.6695, + "step": 3709 + }, + { + "epoch": 0.13286300069833654, + "grad_norm": 1.687001347541809, + "learning_rate": 0.00019450255323220995, + "loss": 1.5687, + "step": 3710 + }, + { + "epoch": 0.1328988128279048, + "grad_norm": 1.497179388999939, + "learning_rate": 0.00019449875975930818, + "loss": 2.1073, + "step": 3711 + }, + { + "epoch": 0.1329346249574731, + "grad_norm": 2.0575437545776367, + "learning_rate": 0.00019449496501504747, + "loss": 1.7413, + "step": 3712 + }, + { + "epoch": 0.13297043708704137, + "grad_norm": 1.6596516370773315, + "learning_rate": 0.0001944911689994789, + "loss": 1.4723, + "step": 3713 + }, + { + "epoch": 0.13300624921660967, + "grad_norm": 1.3024641275405884, + "learning_rate": 0.0001944873717126536, + "loss": 1.4606, + "step": 3714 + }, + { + "epoch": 0.13304206134617794, + "grad_norm": 1.9574958086013794, + "learning_rate": 0.00019448357315462255, + "loss": 1.7971, + "step": 3715 + }, + { + "epoch": 0.13307787347574623, + "grad_norm": 1.5044560432434082, + "learning_rate": 0.00019447977332543687, + "loss": 1.4529, + "step": 3716 + }, + { + "epoch": 0.13311368560531453, + "grad_norm": 2.2105259895324707, + "learning_rate": 0.00019447597222514772, + "loss": 1.9443, + "step": 3717 + }, + { + "epoch": 0.1331494977348828, + "grad_norm": 2.5844597816467285, + "learning_rate": 0.00019447216985380626, + "loss": 1.3201, + "step": 3718 + }, + { + "epoch": 0.1331853098644511, + "grad_norm": 1.6001721620559692, + "learning_rate": 0.0001944683662114636, + "loss": 1.6278, + "step": 3719 + }, + { + "epoch": 0.13322112199401936, + "grad_norm": 1.7248762845993042, + "learning_rate": 0.00019446456129817093, + "loss": 1.6102, + "step": 3720 + }, + { + "epoch": 0.13325693412358766, + "grad_norm": 1.187751054763794, + "learning_rate": 0.00019446075511397943, + "loss": 1.4295, + "step": 3721 + }, + { + "epoch": 0.13329274625315593, + "grad_norm": 2.0701000690460205, + "learning_rate": 0.0001944569476589403, + "loss": 1.7675, + "step": 3722 + }, + { + "epoch": 0.13332855838272423, + "grad_norm": 1.876980185508728, + "learning_rate": 0.00019445313893310482, + "loss": 1.7526, + "step": 3723 + }, + { + "epoch": 0.13336437051229252, + "grad_norm": 2.1986751556396484, + "learning_rate": 0.00019444932893652417, + "loss": 1.3216, + "step": 3724 + }, + { + "epoch": 0.1334001826418608, + "grad_norm": 2.8376963138580322, + "learning_rate": 0.00019444551766924963, + "loss": 1.9192, + "step": 3725 + }, + { + "epoch": 0.1334359947714291, + "grad_norm": 1.2646297216415405, + "learning_rate": 0.00019444170513133248, + "loss": 1.6767, + "step": 3726 + }, + { + "epoch": 0.13347180690099736, + "grad_norm": 1.789505124092102, + "learning_rate": 0.00019443789132282403, + "loss": 1.7196, + "step": 3727 + }, + { + "epoch": 0.13350761903056566, + "grad_norm": 1.2923089265823364, + "learning_rate": 0.0001944340762437755, + "loss": 1.2456, + "step": 3728 + }, + { + "epoch": 0.13354343116013392, + "grad_norm": 2.154646635055542, + "learning_rate": 0.00019443025989423834, + "loss": 1.5132, + "step": 3729 + }, + { + "epoch": 0.13357924328970222, + "grad_norm": 1.5187957286834717, + "learning_rate": 0.00019442644227426383, + "loss": 1.5191, + "step": 3730 + }, + { + "epoch": 0.13361505541927052, + "grad_norm": 1.3650364875793457, + "learning_rate": 0.00019442262338390337, + "loss": 1.3481, + "step": 3731 + }, + { + "epoch": 0.1336508675488388, + "grad_norm": 2.002079963684082, + "learning_rate": 0.00019441880322320824, + "loss": 1.5562, + "step": 3732 + }, + { + "epoch": 0.13368667967840708, + "grad_norm": 1.4634652137756348, + "learning_rate": 0.00019441498179222997, + "loss": 1.8203, + "step": 3733 + }, + { + "epoch": 0.13372249180797535, + "grad_norm": 2.2729015350341797, + "learning_rate": 0.00019441115909101986, + "loss": 1.4976, + "step": 3734 + }, + { + "epoch": 0.13375830393754365, + "grad_norm": 1.93559992313385, + "learning_rate": 0.0001944073351196294, + "loss": 1.4768, + "step": 3735 + }, + { + "epoch": 0.13379411606711192, + "grad_norm": 1.4857501983642578, + "learning_rate": 0.00019440350987811003, + "loss": 1.5598, + "step": 3736 + }, + { + "epoch": 0.13382992819668021, + "grad_norm": 1.494903564453125, + "learning_rate": 0.0001943996833665132, + "loss": 1.5704, + "step": 3737 + }, + { + "epoch": 0.1338657403262485, + "grad_norm": 1.9642722606658936, + "learning_rate": 0.0001943958555848904, + "loss": 1.855, + "step": 3738 + }, + { + "epoch": 0.13390155245581678, + "grad_norm": 2.271066904067993, + "learning_rate": 0.00019439202653329313, + "loss": 1.9326, + "step": 3739 + }, + { + "epoch": 0.13393736458538508, + "grad_norm": 1.980333924293518, + "learning_rate": 0.00019438819621177289, + "loss": 1.9477, + "step": 3740 + }, + { + "epoch": 0.13397317671495335, + "grad_norm": 1.602170467376709, + "learning_rate": 0.00019438436462038125, + "loss": 1.434, + "step": 3741 + }, + { + "epoch": 0.13400898884452164, + "grad_norm": 2.0307059288024902, + "learning_rate": 0.00019438053175916968, + "loss": 1.5559, + "step": 3742 + }, + { + "epoch": 0.1340448009740899, + "grad_norm": 2.126970052719116, + "learning_rate": 0.00019437669762818985, + "loss": 1.8426, + "step": 3743 + }, + { + "epoch": 0.1340806131036582, + "grad_norm": 2.3032383918762207, + "learning_rate": 0.00019437286222749326, + "loss": 1.7689, + "step": 3744 + }, + { + "epoch": 0.1341164252332265, + "grad_norm": 1.8578029870986938, + "learning_rate": 0.00019436902555713153, + "loss": 1.8085, + "step": 3745 + }, + { + "epoch": 0.13415223736279477, + "grad_norm": 1.2118780612945557, + "learning_rate": 0.00019436518761715632, + "loss": 1.6056, + "step": 3746 + }, + { + "epoch": 0.13418804949236307, + "grad_norm": 2.220696210861206, + "learning_rate": 0.0001943613484076192, + "loss": 1.4916, + "step": 3747 + }, + { + "epoch": 0.13422386162193134, + "grad_norm": 1.5976279973983765, + "learning_rate": 0.0001943575079285719, + "loss": 1.6949, + "step": 3748 + }, + { + "epoch": 0.13425967375149964, + "grad_norm": 1.7551590204238892, + "learning_rate": 0.000194353666180066, + "loss": 1.5253, + "step": 3749 + }, + { + "epoch": 0.1342954858810679, + "grad_norm": 1.204624891281128, + "learning_rate": 0.00019434982316215326, + "loss": 1.5708, + "step": 3750 + }, + { + "epoch": 0.1343312980106362, + "grad_norm": 1.5681825876235962, + "learning_rate": 0.00019434597887488532, + "loss": 1.4955, + "step": 3751 + }, + { + "epoch": 0.1343671101402045, + "grad_norm": 1.9825056791305542, + "learning_rate": 0.00019434213331831398, + "loss": 1.8734, + "step": 3752 + }, + { + "epoch": 0.13440292226977277, + "grad_norm": 1.5142807960510254, + "learning_rate": 0.00019433828649249087, + "loss": 1.4563, + "step": 3753 + }, + { + "epoch": 0.13443873439934106, + "grad_norm": 1.6803392171859741, + "learning_rate": 0.00019433443839746785, + "loss": 1.6086, + "step": 3754 + }, + { + "epoch": 0.13447454652890933, + "grad_norm": 1.265977144241333, + "learning_rate": 0.00019433058903329663, + "loss": 1.7027, + "step": 3755 + }, + { + "epoch": 0.13451035865847763, + "grad_norm": 2.002089500427246, + "learning_rate": 0.00019432673840002898, + "loss": 1.8738, + "step": 3756 + }, + { + "epoch": 0.1345461707880459, + "grad_norm": 1.433565378189087, + "learning_rate": 0.00019432288649771676, + "loss": 1.3665, + "step": 3757 + }, + { + "epoch": 0.1345819829176142, + "grad_norm": 1.1154769659042358, + "learning_rate": 0.0001943190333264118, + "loss": 1.6499, + "step": 3758 + }, + { + "epoch": 0.1346177950471825, + "grad_norm": 1.8043376207351685, + "learning_rate": 0.0001943151788861659, + "loss": 1.6887, + "step": 3759 + }, + { + "epoch": 0.13465360717675076, + "grad_norm": 1.4192312955856323, + "learning_rate": 0.0001943113231770309, + "loss": 1.3811, + "step": 3760 + }, + { + "epoch": 0.13468941930631906, + "grad_norm": 1.6293889284133911, + "learning_rate": 0.0001943074661990587, + "loss": 1.9191, + "step": 3761 + }, + { + "epoch": 0.13472523143588733, + "grad_norm": 1.9068212509155273, + "learning_rate": 0.0001943036079523012, + "loss": 1.6092, + "step": 3762 + }, + { + "epoch": 0.13476104356545562, + "grad_norm": 1.3996855020523071, + "learning_rate": 0.00019429974843681032, + "loss": 1.4419, + "step": 3763 + }, + { + "epoch": 0.1347968556950239, + "grad_norm": 1.6263465881347656, + "learning_rate": 0.0001942958876526379, + "loss": 1.7413, + "step": 3764 + }, + { + "epoch": 0.1348326678245922, + "grad_norm": 2.075838327407837, + "learning_rate": 0.000194292025599836, + "loss": 1.5274, + "step": 3765 + }, + { + "epoch": 0.1348684799541605, + "grad_norm": 2.207247734069824, + "learning_rate": 0.00019428816227845652, + "loss": 1.9061, + "step": 3766 + }, + { + "epoch": 0.13490429208372876, + "grad_norm": 1.719639778137207, + "learning_rate": 0.0001942842976885514, + "loss": 1.6978, + "step": 3767 + }, + { + "epoch": 0.13494010421329705, + "grad_norm": 1.641737461090088, + "learning_rate": 0.00019428043183017274, + "loss": 1.807, + "step": 3768 + }, + { + "epoch": 0.13497591634286532, + "grad_norm": 4.124390125274658, + "learning_rate": 0.00019427656470337242, + "loss": 1.825, + "step": 3769 + }, + { + "epoch": 0.13501172847243362, + "grad_norm": 1.5481641292572021, + "learning_rate": 0.00019427269630820258, + "loss": 1.5993, + "step": 3770 + }, + { + "epoch": 0.1350475406020019, + "grad_norm": 1.4922468662261963, + "learning_rate": 0.00019426882664471515, + "loss": 1.5663, + "step": 3771 + }, + { + "epoch": 0.13508335273157018, + "grad_norm": 1.7647703886032104, + "learning_rate": 0.00019426495571296234, + "loss": 1.6529, + "step": 3772 + }, + { + "epoch": 0.13511916486113848, + "grad_norm": 1.8872132301330566, + "learning_rate": 0.00019426108351299607, + "loss": 1.8358, + "step": 3773 + }, + { + "epoch": 0.13515497699070675, + "grad_norm": 1.7756673097610474, + "learning_rate": 0.00019425721004486852, + "loss": 1.8599, + "step": 3774 + }, + { + "epoch": 0.13519078912027505, + "grad_norm": 1.4053908586502075, + "learning_rate": 0.00019425333530863182, + "loss": 1.4622, + "step": 3775 + }, + { + "epoch": 0.13522660124984331, + "grad_norm": 1.7477492094039917, + "learning_rate": 0.00019424945930433807, + "loss": 1.8449, + "step": 3776 + }, + { + "epoch": 0.1352624133794116, + "grad_norm": 1.654231309890747, + "learning_rate": 0.0001942455820320394, + "loss": 1.4055, + "step": 3777 + }, + { + "epoch": 0.13529822550897988, + "grad_norm": 1.1583056449890137, + "learning_rate": 0.00019424170349178802, + "loss": 1.6276, + "step": 3778 + }, + { + "epoch": 0.13533403763854818, + "grad_norm": 1.3777235746383667, + "learning_rate": 0.00019423782368363604, + "loss": 1.471, + "step": 3779 + }, + { + "epoch": 0.13536984976811647, + "grad_norm": 1.4694525003433228, + "learning_rate": 0.00019423394260763573, + "loss": 1.4806, + "step": 3780 + }, + { + "epoch": 0.13540566189768474, + "grad_norm": 1.5091458559036255, + "learning_rate": 0.00019423006026383926, + "loss": 1.7003, + "step": 3781 + }, + { + "epoch": 0.13544147402725304, + "grad_norm": 1.9628468751907349, + "learning_rate": 0.0001942261766522989, + "loss": 1.8922, + "step": 3782 + }, + { + "epoch": 0.1354772861568213, + "grad_norm": 1.4558885097503662, + "learning_rate": 0.00019422229177306686, + "loss": 1.4158, + "step": 3783 + }, + { + "epoch": 0.1355130982863896, + "grad_norm": 2.17854380607605, + "learning_rate": 0.0001942184056261954, + "loss": 1.5416, + "step": 3784 + }, + { + "epoch": 0.13554891041595787, + "grad_norm": 1.5657025575637817, + "learning_rate": 0.00019421451821173685, + "loss": 1.768, + "step": 3785 + }, + { + "epoch": 0.13558472254552617, + "grad_norm": 1.424683928489685, + "learning_rate": 0.0001942106295297435, + "loss": 1.6767, + "step": 3786 + }, + { + "epoch": 0.13562053467509447, + "grad_norm": 1.3735336065292358, + "learning_rate": 0.00019420673958026762, + "loss": 1.6295, + "step": 3787 + }, + { + "epoch": 0.13565634680466274, + "grad_norm": 1.9221537113189697, + "learning_rate": 0.0001942028483633616, + "loss": 1.9549, + "step": 3788 + }, + { + "epoch": 0.13569215893423103, + "grad_norm": 1.564569354057312, + "learning_rate": 0.00019419895587907777, + "loss": 1.6037, + "step": 3789 + }, + { + "epoch": 0.1357279710637993, + "grad_norm": 1.569985032081604, + "learning_rate": 0.0001941950621274685, + "loss": 1.8659, + "step": 3790 + }, + { + "epoch": 0.1357637831933676, + "grad_norm": 1.9685579538345337, + "learning_rate": 0.00019419116710858614, + "loss": 1.7621, + "step": 3791 + }, + { + "epoch": 0.13579959532293587, + "grad_norm": 1.6017086505889893, + "learning_rate": 0.00019418727082248316, + "loss": 1.4238, + "step": 3792 + }, + { + "epoch": 0.13583540745250416, + "grad_norm": 1.52402925491333, + "learning_rate": 0.00019418337326921193, + "loss": 1.4991, + "step": 3793 + }, + { + "epoch": 0.13587121958207246, + "grad_norm": 1.6972289085388184, + "learning_rate": 0.0001941794744488249, + "loss": 1.7073, + "step": 3794 + }, + { + "epoch": 0.13590703171164073, + "grad_norm": 1.2555439472198486, + "learning_rate": 0.0001941755743613745, + "loss": 1.6406, + "step": 3795 + }, + { + "epoch": 0.13594284384120903, + "grad_norm": 2.1183104515075684, + "learning_rate": 0.00019417167300691328, + "loss": 1.6005, + "step": 3796 + }, + { + "epoch": 0.1359786559707773, + "grad_norm": 1.3762356042861938, + "learning_rate": 0.00019416777038549362, + "loss": 1.3172, + "step": 3797 + }, + { + "epoch": 0.1360144681003456, + "grad_norm": 1.4184659719467163, + "learning_rate": 0.00019416386649716812, + "loss": 1.5459, + "step": 3798 + }, + { + "epoch": 0.13605028022991386, + "grad_norm": 1.5762983560562134, + "learning_rate": 0.0001941599613419892, + "loss": 1.3168, + "step": 3799 + }, + { + "epoch": 0.13608609235948216, + "grad_norm": 2.1073853969573975, + "learning_rate": 0.00019415605492000953, + "loss": 1.9473, + "step": 3800 + }, + { + "epoch": 0.13612190448905045, + "grad_norm": 1.605509638786316, + "learning_rate": 0.00019415214723128154, + "loss": 1.5105, + "step": 3801 + }, + { + "epoch": 0.13615771661861872, + "grad_norm": 2.1277925968170166, + "learning_rate": 0.0001941482382758579, + "loss": 1.621, + "step": 3802 + }, + { + "epoch": 0.13619352874818702, + "grad_norm": 1.3043378591537476, + "learning_rate": 0.00019414432805379113, + "loss": 1.389, + "step": 3803 + }, + { + "epoch": 0.1362293408777553, + "grad_norm": 2.173734664916992, + "learning_rate": 0.00019414041656513385, + "loss": 1.4075, + "step": 3804 + }, + { + "epoch": 0.1362651530073236, + "grad_norm": 1.3967981338500977, + "learning_rate": 0.0001941365038099387, + "loss": 1.708, + "step": 3805 + }, + { + "epoch": 0.13630096513689186, + "grad_norm": 1.6393210887908936, + "learning_rate": 0.00019413258978825834, + "loss": 1.6744, + "step": 3806 + }, + { + "epoch": 0.13633677726646015, + "grad_norm": 1.854275107383728, + "learning_rate": 0.0001941286745001454, + "loss": 1.5661, + "step": 3807 + }, + { + "epoch": 0.13637258939602845, + "grad_norm": 1.3104915618896484, + "learning_rate": 0.00019412475794565256, + "loss": 1.3521, + "step": 3808 + }, + { + "epoch": 0.13640840152559672, + "grad_norm": 1.714754343032837, + "learning_rate": 0.00019412084012483249, + "loss": 1.662, + "step": 3809 + }, + { + "epoch": 0.13644421365516501, + "grad_norm": 2.037449359893799, + "learning_rate": 0.00019411692103773795, + "loss": 1.4034, + "step": 3810 + }, + { + "epoch": 0.13648002578473328, + "grad_norm": 1.4741345643997192, + "learning_rate": 0.00019411300068442167, + "loss": 1.7963, + "step": 3811 + }, + { + "epoch": 0.13651583791430158, + "grad_norm": 1.921120524406433, + "learning_rate": 0.0001941090790649363, + "loss": 1.6161, + "step": 3812 + }, + { + "epoch": 0.13655165004386985, + "grad_norm": 1.4448705911636353, + "learning_rate": 0.00019410515617933468, + "loss": 1.2904, + "step": 3813 + }, + { + "epoch": 0.13658746217343815, + "grad_norm": 1.8026163578033447, + "learning_rate": 0.0001941012320276696, + "loss": 1.9121, + "step": 3814 + }, + { + "epoch": 0.13662327430300641, + "grad_norm": 1.457109808921814, + "learning_rate": 0.0001940973066099938, + "loss": 1.4424, + "step": 3815 + }, + { + "epoch": 0.1366590864325747, + "grad_norm": 2.4692842960357666, + "learning_rate": 0.00019409337992636015, + "loss": 1.5736, + "step": 3816 + }, + { + "epoch": 0.136694898562143, + "grad_norm": 2.4115750789642334, + "learning_rate": 0.0001940894519768214, + "loss": 1.4381, + "step": 3817 + }, + { + "epoch": 0.13673071069171128, + "grad_norm": 1.905058741569519, + "learning_rate": 0.00019408552276143045, + "loss": 1.8125, + "step": 3818 + }, + { + "epoch": 0.13676652282127957, + "grad_norm": 1.6240928173065186, + "learning_rate": 0.00019408159228024018, + "loss": 1.5705, + "step": 3819 + }, + { + "epoch": 0.13680233495084784, + "grad_norm": 1.6123714447021484, + "learning_rate": 0.00019407766053330342, + "loss": 1.4053, + "step": 3820 + }, + { + "epoch": 0.13683814708041614, + "grad_norm": 1.5208178758621216, + "learning_rate": 0.00019407372752067308, + "loss": 1.3531, + "step": 3821 + }, + { + "epoch": 0.1368739592099844, + "grad_norm": 1.6093798875808716, + "learning_rate": 0.0001940697932424021, + "loss": 1.5774, + "step": 3822 + }, + { + "epoch": 0.1369097713395527, + "grad_norm": 1.2067844867706299, + "learning_rate": 0.0001940658576985434, + "loss": 1.4985, + "step": 3823 + }, + { + "epoch": 0.136945583469121, + "grad_norm": 1.312362551689148, + "learning_rate": 0.0001940619208891499, + "loss": 1.6762, + "step": 3824 + }, + { + "epoch": 0.13698139559868927, + "grad_norm": 1.438751459121704, + "learning_rate": 0.0001940579828142746, + "loss": 1.6947, + "step": 3825 + }, + { + "epoch": 0.13701720772825757, + "grad_norm": 1.2554978132247925, + "learning_rate": 0.00019405404347397047, + "loss": 1.5329, + "step": 3826 + }, + { + "epoch": 0.13705301985782584, + "grad_norm": 1.9145128726959229, + "learning_rate": 0.0001940501028682905, + "loss": 1.7587, + "step": 3827 + }, + { + "epoch": 0.13708883198739413, + "grad_norm": 1.6156679391860962, + "learning_rate": 0.00019404616099728773, + "loss": 1.5496, + "step": 3828 + }, + { + "epoch": 0.1371246441169624, + "grad_norm": 1.4059901237487793, + "learning_rate": 0.00019404221786101513, + "loss": 1.4649, + "step": 3829 + }, + { + "epoch": 0.1371604562465307, + "grad_norm": 1.4880584478378296, + "learning_rate": 0.0001940382734595258, + "loss": 1.5342, + "step": 3830 + }, + { + "epoch": 0.137196268376099, + "grad_norm": 2.3095836639404297, + "learning_rate": 0.00019403432779287286, + "loss": 1.7837, + "step": 3831 + }, + { + "epoch": 0.13723208050566726, + "grad_norm": 1.6353516578674316, + "learning_rate": 0.00019403038086110926, + "loss": 1.7732, + "step": 3832 + }, + { + "epoch": 0.13726789263523556, + "grad_norm": 1.5463842153549194, + "learning_rate": 0.00019402643266428822, + "loss": 1.7355, + "step": 3833 + }, + { + "epoch": 0.13730370476480383, + "grad_norm": 2.119309186935425, + "learning_rate": 0.00019402248320246282, + "loss": 2.0842, + "step": 3834 + }, + { + "epoch": 0.13733951689437213, + "grad_norm": 1.909306287765503, + "learning_rate": 0.00019401853247568614, + "loss": 1.5683, + "step": 3835 + }, + { + "epoch": 0.1373753290239404, + "grad_norm": 1.564666509628296, + "learning_rate": 0.00019401458048401145, + "loss": 1.3599, + "step": 3836 + }, + { + "epoch": 0.1374111411535087, + "grad_norm": 1.7415882349014282, + "learning_rate": 0.0001940106272274918, + "loss": 1.4302, + "step": 3837 + }, + { + "epoch": 0.137446953283077, + "grad_norm": 1.8725720643997192, + "learning_rate": 0.00019400667270618046, + "loss": 1.3951, + "step": 3838 + }, + { + "epoch": 0.13748276541264526, + "grad_norm": 1.7075797319412231, + "learning_rate": 0.00019400271692013058, + "loss": 1.9641, + "step": 3839 + }, + { + "epoch": 0.13751857754221355, + "grad_norm": 2.7534332275390625, + "learning_rate": 0.0001939987598693954, + "loss": 1.6384, + "step": 3840 + }, + { + "epoch": 0.13755438967178182, + "grad_norm": 2.2370193004608154, + "learning_rate": 0.00019399480155402813, + "loss": 1.57, + "step": 3841 + }, + { + "epoch": 0.13759020180135012, + "grad_norm": 1.1581882238388062, + "learning_rate": 0.0001939908419740821, + "loss": 1.664, + "step": 3842 + }, + { + "epoch": 0.1376260139309184, + "grad_norm": 1.3554637432098389, + "learning_rate": 0.0001939868811296105, + "loss": 1.2336, + "step": 3843 + }, + { + "epoch": 0.1376618260604867, + "grad_norm": 1.4079585075378418, + "learning_rate": 0.00019398291902066666, + "loss": 1.5603, + "step": 3844 + }, + { + "epoch": 0.13769763819005498, + "grad_norm": 1.8186653852462769, + "learning_rate": 0.00019397895564730386, + "loss": 1.3144, + "step": 3845 + }, + { + "epoch": 0.13773345031962325, + "grad_norm": 2.0624194145202637, + "learning_rate": 0.00019397499100957542, + "loss": 1.974, + "step": 3846 + }, + { + "epoch": 0.13776926244919155, + "grad_norm": 2.5842959880828857, + "learning_rate": 0.00019397102510753473, + "loss": 1.5397, + "step": 3847 + }, + { + "epoch": 0.13780507457875982, + "grad_norm": 1.7426347732543945, + "learning_rate": 0.0001939670579412351, + "loss": 1.8237, + "step": 3848 + }, + { + "epoch": 0.13784088670832811, + "grad_norm": 1.6106834411621094, + "learning_rate": 0.00019396308951072992, + "loss": 1.6794, + "step": 3849 + }, + { + "epoch": 0.13787669883789638, + "grad_norm": 2.3994948863983154, + "learning_rate": 0.00019395911981607254, + "loss": 1.4646, + "step": 3850 + }, + { + "epoch": 0.13791251096746468, + "grad_norm": 1.4603257179260254, + "learning_rate": 0.00019395514885731644, + "loss": 1.5009, + "step": 3851 + }, + { + "epoch": 0.13794832309703298, + "grad_norm": 2.004852056503296, + "learning_rate": 0.000193951176634515, + "loss": 1.6899, + "step": 3852 + }, + { + "epoch": 0.13798413522660125, + "grad_norm": 1.9660876989364624, + "learning_rate": 0.00019394720314772166, + "loss": 1.7038, + "step": 3853 + }, + { + "epoch": 0.13801994735616954, + "grad_norm": 1.419790506362915, + "learning_rate": 0.00019394322839698988, + "loss": 1.8069, + "step": 3854 + }, + { + "epoch": 0.1380557594857378, + "grad_norm": 1.6597111225128174, + "learning_rate": 0.00019393925238237313, + "loss": 1.8452, + "step": 3855 + }, + { + "epoch": 0.1380915716153061, + "grad_norm": 1.4487488269805908, + "learning_rate": 0.00019393527510392494, + "loss": 1.7923, + "step": 3856 + }, + { + "epoch": 0.13812738374487438, + "grad_norm": 1.6511144638061523, + "learning_rate": 0.0001939312965616988, + "loss": 1.6654, + "step": 3857 + }, + { + "epoch": 0.13816319587444267, + "grad_norm": 1.8880691528320312, + "learning_rate": 0.0001939273167557482, + "loss": 1.666, + "step": 3858 + }, + { + "epoch": 0.13819900800401097, + "grad_norm": 1.333387017250061, + "learning_rate": 0.00019392333568612672, + "loss": 1.6742, + "step": 3859 + }, + { + "epoch": 0.13823482013357924, + "grad_norm": 1.4086413383483887, + "learning_rate": 0.00019391935335288788, + "loss": 1.5357, + "step": 3860 + }, + { + "epoch": 0.13827063226314754, + "grad_norm": 2.5079047679901123, + "learning_rate": 0.00019391536975608533, + "loss": 1.6305, + "step": 3861 + }, + { + "epoch": 0.1383064443927158, + "grad_norm": 1.7378469705581665, + "learning_rate": 0.0001939113848957726, + "loss": 1.2147, + "step": 3862 + }, + { + "epoch": 0.1383422565222841, + "grad_norm": 2.0225167274475098, + "learning_rate": 0.00019390739877200335, + "loss": 1.5807, + "step": 3863 + }, + { + "epoch": 0.13837806865185237, + "grad_norm": 1.6210397481918335, + "learning_rate": 0.00019390341138483117, + "loss": 1.6289, + "step": 3864 + }, + { + "epoch": 0.13841388078142067, + "grad_norm": 1.4240684509277344, + "learning_rate": 0.0001938994227343097, + "loss": 1.7307, + "step": 3865 + }, + { + "epoch": 0.13844969291098896, + "grad_norm": 1.4018386602401733, + "learning_rate": 0.00019389543282049263, + "loss": 1.4844, + "step": 3866 + }, + { + "epoch": 0.13848550504055723, + "grad_norm": 1.4546011686325073, + "learning_rate": 0.0001938914416434336, + "loss": 1.6839, + "step": 3867 + }, + { + "epoch": 0.13852131717012553, + "grad_norm": 1.5144541263580322, + "learning_rate": 0.00019388744920318638, + "loss": 1.6554, + "step": 3868 + }, + { + "epoch": 0.1385571292996938, + "grad_norm": 1.9731286764144897, + "learning_rate": 0.00019388345549980462, + "loss": 1.8439, + "step": 3869 + }, + { + "epoch": 0.1385929414292621, + "grad_norm": 1.4650535583496094, + "learning_rate": 0.00019387946053334206, + "loss": 1.836, + "step": 3870 + }, + { + "epoch": 0.13862875355883036, + "grad_norm": 2.14497447013855, + "learning_rate": 0.00019387546430385246, + "loss": 1.5419, + "step": 3871 + }, + { + "epoch": 0.13866456568839866, + "grad_norm": 1.929789423942566, + "learning_rate": 0.00019387146681138957, + "loss": 1.6334, + "step": 3872 + }, + { + "epoch": 0.13870037781796696, + "grad_norm": 1.5620695352554321, + "learning_rate": 0.00019386746805600717, + "loss": 1.5802, + "step": 3873 + }, + { + "epoch": 0.13873618994753523, + "grad_norm": 1.579016089439392, + "learning_rate": 0.00019386346803775909, + "loss": 1.5735, + "step": 3874 + }, + { + "epoch": 0.13877200207710352, + "grad_norm": 2.0494682788848877, + "learning_rate": 0.00019385946675669913, + "loss": 1.6366, + "step": 3875 + }, + { + "epoch": 0.1388078142066718, + "grad_norm": 1.5577737092971802, + "learning_rate": 0.0001938554642128811, + "loss": 1.5314, + "step": 3876 + }, + { + "epoch": 0.1388436263362401, + "grad_norm": 1.546035647392273, + "learning_rate": 0.00019385146040635886, + "loss": 1.6867, + "step": 3877 + }, + { + "epoch": 0.13887943846580836, + "grad_norm": 1.4088118076324463, + "learning_rate": 0.00019384745533718628, + "loss": 1.2657, + "step": 3878 + }, + { + "epoch": 0.13891525059537665, + "grad_norm": 2.4216670989990234, + "learning_rate": 0.00019384344900541723, + "loss": 1.448, + "step": 3879 + }, + { + "epoch": 0.13895106272494495, + "grad_norm": 2.032590627670288, + "learning_rate": 0.00019383944141110565, + "loss": 1.607, + "step": 3880 + }, + { + "epoch": 0.13898687485451322, + "grad_norm": 1.4707528352737427, + "learning_rate": 0.00019383543255430542, + "loss": 1.3947, + "step": 3881 + }, + { + "epoch": 0.13902268698408152, + "grad_norm": 1.8237826824188232, + "learning_rate": 0.00019383142243507048, + "loss": 2.0583, + "step": 3882 + }, + { + "epoch": 0.13905849911364979, + "grad_norm": 1.9654194116592407, + "learning_rate": 0.00019382741105345482, + "loss": 1.6511, + "step": 3883 + }, + { + "epoch": 0.13909431124321808, + "grad_norm": 1.2782480716705322, + "learning_rate": 0.0001938233984095123, + "loss": 1.8372, + "step": 3884 + }, + { + "epoch": 0.13913012337278635, + "grad_norm": 1.2892425060272217, + "learning_rate": 0.00019381938450329704, + "loss": 1.8873, + "step": 3885 + }, + { + "epoch": 0.13916593550235465, + "grad_norm": 1.9381794929504395, + "learning_rate": 0.00019381536933486295, + "loss": 1.8379, + "step": 3886 + }, + { + "epoch": 0.13920174763192295, + "grad_norm": 1.7396278381347656, + "learning_rate": 0.0001938113529042641, + "loss": 1.7356, + "step": 3887 + }, + { + "epoch": 0.13923755976149121, + "grad_norm": 1.13014817237854, + "learning_rate": 0.0001938073352115545, + "loss": 1.4796, + "step": 3888 + }, + { + "epoch": 0.1392733718910595, + "grad_norm": 1.9644767045974731, + "learning_rate": 0.00019380331625678821, + "loss": 1.7187, + "step": 3889 + }, + { + "epoch": 0.13930918402062778, + "grad_norm": 1.3784605264663696, + "learning_rate": 0.00019379929604001927, + "loss": 1.5731, + "step": 3890 + }, + { + "epoch": 0.13934499615019608, + "grad_norm": 2.058908700942993, + "learning_rate": 0.00019379527456130183, + "loss": 1.7203, + "step": 3891 + }, + { + "epoch": 0.13938080827976435, + "grad_norm": 1.9625152349472046, + "learning_rate": 0.00019379125182068994, + "loss": 1.4671, + "step": 3892 + }, + { + "epoch": 0.13941662040933264, + "grad_norm": 2.0643458366394043, + "learning_rate": 0.00019378722781823772, + "loss": 1.6485, + "step": 3893 + }, + { + "epoch": 0.13945243253890094, + "grad_norm": 1.778398871421814, + "learning_rate": 0.00019378320255399934, + "loss": 1.6492, + "step": 3894 + }, + { + "epoch": 0.1394882446684692, + "grad_norm": 1.6145321130752563, + "learning_rate": 0.00019377917602802897, + "loss": 1.6214, + "step": 3895 + }, + { + "epoch": 0.1395240567980375, + "grad_norm": 2.243457794189453, + "learning_rate": 0.00019377514824038073, + "loss": 1.9855, + "step": 3896 + }, + { + "epoch": 0.13955986892760577, + "grad_norm": 2.51615047454834, + "learning_rate": 0.00019377111919110883, + "loss": 2.0496, + "step": 3897 + }, + { + "epoch": 0.13959568105717407, + "grad_norm": 1.4384043216705322, + "learning_rate": 0.00019376708888026747, + "loss": 1.7474, + "step": 3898 + }, + { + "epoch": 0.13963149318674234, + "grad_norm": 2.0922720432281494, + "learning_rate": 0.0001937630573079109, + "loss": 1.5602, + "step": 3899 + }, + { + "epoch": 0.13966730531631064, + "grad_norm": 1.4650601148605347, + "learning_rate": 0.0001937590244740933, + "loss": 1.3567, + "step": 3900 + }, + { + "epoch": 0.13970311744587893, + "grad_norm": 1.8897500038146973, + "learning_rate": 0.000193754990378869, + "loss": 1.6301, + "step": 3901 + }, + { + "epoch": 0.1397389295754472, + "grad_norm": 1.7540127038955688, + "learning_rate": 0.00019375095502229223, + "loss": 1.4029, + "step": 3902 + }, + { + "epoch": 0.1397747417050155, + "grad_norm": 1.8361109495162964, + "learning_rate": 0.0001937469184044173, + "loss": 1.6593, + "step": 3903 + }, + { + "epoch": 0.13981055383458377, + "grad_norm": 1.9301716089248657, + "learning_rate": 0.0001937428805252985, + "loss": 1.3712, + "step": 3904 + }, + { + "epoch": 0.13984636596415206, + "grad_norm": 1.518929123878479, + "learning_rate": 0.00019373884138499018, + "loss": 1.5331, + "step": 3905 + }, + { + "epoch": 0.13988217809372033, + "grad_norm": 1.6392289400100708, + "learning_rate": 0.00019373480098354665, + "loss": 1.5763, + "step": 3906 + }, + { + "epoch": 0.13991799022328863, + "grad_norm": 1.7571324110031128, + "learning_rate": 0.00019373075932102227, + "loss": 1.6839, + "step": 3907 + }, + { + "epoch": 0.13995380235285693, + "grad_norm": 1.5957682132720947, + "learning_rate": 0.00019372671639747145, + "loss": 1.7607, + "step": 3908 + }, + { + "epoch": 0.1399896144824252, + "grad_norm": 1.305310845375061, + "learning_rate": 0.00019372267221294854, + "loss": 1.679, + "step": 3909 + }, + { + "epoch": 0.1400254266119935, + "grad_norm": 1.584241271018982, + "learning_rate": 0.00019371862676750796, + "loss": 1.6078, + "step": 3910 + }, + { + "epoch": 0.14006123874156176, + "grad_norm": 1.5107513666152954, + "learning_rate": 0.00019371458006120417, + "loss": 1.7136, + "step": 3911 + }, + { + "epoch": 0.14009705087113006, + "grad_norm": 1.7936160564422607, + "learning_rate": 0.00019371053209409157, + "loss": 1.5745, + "step": 3912 + }, + { + "epoch": 0.14013286300069833, + "grad_norm": 2.0149476528167725, + "learning_rate": 0.00019370648286622466, + "loss": 1.3959, + "step": 3913 + }, + { + "epoch": 0.14016867513026662, + "grad_norm": 1.791279673576355, + "learning_rate": 0.00019370243237765787, + "loss": 1.2805, + "step": 3914 + }, + { + "epoch": 0.1402044872598349, + "grad_norm": 1.2142484188079834, + "learning_rate": 0.00019369838062844577, + "loss": 1.6181, + "step": 3915 + }, + { + "epoch": 0.1402402993894032, + "grad_norm": 1.787173867225647, + "learning_rate": 0.00019369432761864278, + "loss": 1.5593, + "step": 3916 + }, + { + "epoch": 0.14027611151897149, + "grad_norm": 1.2792410850524902, + "learning_rate": 0.00019369027334830346, + "loss": 1.5291, + "step": 3917 + }, + { + "epoch": 0.14031192364853975, + "grad_norm": 1.5265432596206665, + "learning_rate": 0.00019368621781748238, + "loss": 1.7252, + "step": 3918 + }, + { + "epoch": 0.14034773577810805, + "grad_norm": 1.5280210971832275, + "learning_rate": 0.0001936821610262341, + "loss": 1.6944, + "step": 3919 + }, + { + "epoch": 0.14038354790767632, + "grad_norm": 1.6217960119247437, + "learning_rate": 0.00019367810297461313, + "loss": 1.4182, + "step": 3920 + }, + { + "epoch": 0.14041936003724462, + "grad_norm": 1.484212875366211, + "learning_rate": 0.00019367404366267416, + "loss": 1.4251, + "step": 3921 + }, + { + "epoch": 0.14045517216681289, + "grad_norm": 1.711181402206421, + "learning_rate": 0.0001936699830904718, + "loss": 1.7121, + "step": 3922 + }, + { + "epoch": 0.14049098429638118, + "grad_norm": 1.52628755569458, + "learning_rate": 0.00019366592125806057, + "loss": 1.5609, + "step": 3923 + }, + { + "epoch": 0.14052679642594948, + "grad_norm": 1.480896234512329, + "learning_rate": 0.00019366185816549524, + "loss": 1.4094, + "step": 3924 + }, + { + "epoch": 0.14056260855551775, + "grad_norm": 1.4317753314971924, + "learning_rate": 0.0001936577938128304, + "loss": 1.6869, + "step": 3925 + }, + { + "epoch": 0.14059842068508605, + "grad_norm": 2.088813543319702, + "learning_rate": 0.00019365372820012077, + "loss": 1.4735, + "step": 3926 + }, + { + "epoch": 0.14063423281465431, + "grad_norm": 1.6882331371307373, + "learning_rate": 0.00019364966132742102, + "loss": 1.7869, + "step": 3927 + }, + { + "epoch": 0.1406700449442226, + "grad_norm": 2.3925135135650635, + "learning_rate": 0.00019364559319478585, + "loss": 1.5204, + "step": 3928 + }, + { + "epoch": 0.14070585707379088, + "grad_norm": 1.3298742771148682, + "learning_rate": 0.00019364152380227007, + "loss": 1.3791, + "step": 3929 + }, + { + "epoch": 0.14074166920335918, + "grad_norm": 2.0857841968536377, + "learning_rate": 0.00019363745314992836, + "loss": 1.6491, + "step": 3930 + }, + { + "epoch": 0.14077748133292747, + "grad_norm": 1.499003291130066, + "learning_rate": 0.00019363338123781548, + "loss": 1.5969, + "step": 3931 + }, + { + "epoch": 0.14081329346249574, + "grad_norm": 1.8543964624404907, + "learning_rate": 0.00019362930806598625, + "loss": 1.5286, + "step": 3932 + }, + { + "epoch": 0.14084910559206404, + "grad_norm": 1.2838963270187378, + "learning_rate": 0.00019362523363449546, + "loss": 1.3795, + "step": 3933 + }, + { + "epoch": 0.1408849177216323, + "grad_norm": 1.6175882816314697, + "learning_rate": 0.0001936211579433979, + "loss": 1.5956, + "step": 3934 + }, + { + "epoch": 0.1409207298512006, + "grad_norm": 1.3847663402557373, + "learning_rate": 0.00019361708099274844, + "loss": 1.7353, + "step": 3935 + }, + { + "epoch": 0.14095654198076887, + "grad_norm": 1.8384443521499634, + "learning_rate": 0.00019361300278260193, + "loss": 2.0406, + "step": 3936 + }, + { + "epoch": 0.14099235411033717, + "grad_norm": 1.939363718032837, + "learning_rate": 0.00019360892331301316, + "loss": 1.7603, + "step": 3937 + }, + { + "epoch": 0.14102816623990547, + "grad_norm": 1.2415847778320312, + "learning_rate": 0.00019360484258403713, + "loss": 1.7003, + "step": 3938 + }, + { + "epoch": 0.14106397836947374, + "grad_norm": 3.8942644596099854, + "learning_rate": 0.00019360076059572867, + "loss": 1.3907, + "step": 3939 + }, + { + "epoch": 0.14109979049904203, + "grad_norm": 1.311605453491211, + "learning_rate": 0.0001935966773481427, + "loss": 1.7552, + "step": 3940 + }, + { + "epoch": 0.1411356026286103, + "grad_norm": 1.316116452217102, + "learning_rate": 0.00019359259284133418, + "loss": 1.4707, + "step": 3941 + }, + { + "epoch": 0.1411714147581786, + "grad_norm": 1.855420470237732, + "learning_rate": 0.00019358850707535804, + "loss": 1.445, + "step": 3942 + }, + { + "epoch": 0.14120722688774687, + "grad_norm": 1.1825437545776367, + "learning_rate": 0.00019358442005026926, + "loss": 1.3708, + "step": 3943 + }, + { + "epoch": 0.14124303901731516, + "grad_norm": 1.6518439054489136, + "learning_rate": 0.0001935803317661228, + "loss": 1.6204, + "step": 3944 + }, + { + "epoch": 0.14127885114688346, + "grad_norm": 1.6733884811401367, + "learning_rate": 0.0001935762422229737, + "loss": 1.7793, + "step": 3945 + }, + { + "epoch": 0.14131466327645173, + "grad_norm": 1.8255501985549927, + "learning_rate": 0.00019357215142087699, + "loss": 1.3537, + "step": 3946 + }, + { + "epoch": 0.14135047540602003, + "grad_norm": 1.6905624866485596, + "learning_rate": 0.0001935680593598877, + "loss": 1.4878, + "step": 3947 + }, + { + "epoch": 0.1413862875355883, + "grad_norm": 1.3700306415557861, + "learning_rate": 0.00019356396604006083, + "loss": 1.3288, + "step": 3948 + }, + { + "epoch": 0.1414220996651566, + "grad_norm": 1.2963467836380005, + "learning_rate": 0.00019355987146145147, + "loss": 1.5802, + "step": 3949 + }, + { + "epoch": 0.14145791179472486, + "grad_norm": 1.2930432558059692, + "learning_rate": 0.00019355577562411473, + "loss": 1.592, + "step": 3950 + }, + { + "epoch": 0.14149372392429316, + "grad_norm": 1.4524013996124268, + "learning_rate": 0.00019355167852810575, + "loss": 1.6285, + "step": 3951 + }, + { + "epoch": 0.14152953605386145, + "grad_norm": 1.399037480354309, + "learning_rate": 0.00019354758017347957, + "loss": 1.5855, + "step": 3952 + }, + { + "epoch": 0.14156534818342972, + "grad_norm": 1.4224966764450073, + "learning_rate": 0.00019354348056029136, + "loss": 1.9141, + "step": 3953 + }, + { + "epoch": 0.14160116031299802, + "grad_norm": 1.995407223701477, + "learning_rate": 0.0001935393796885963, + "loss": 1.6105, + "step": 3954 + }, + { + "epoch": 0.1416369724425663, + "grad_norm": 1.7793134450912476, + "learning_rate": 0.00019353527755844953, + "loss": 1.6095, + "step": 3955 + }, + { + "epoch": 0.14167278457213459, + "grad_norm": 1.9654982089996338, + "learning_rate": 0.00019353117416990627, + "loss": 1.6372, + "step": 3956 + }, + { + "epoch": 0.14170859670170285, + "grad_norm": 1.6730225086212158, + "learning_rate": 0.0001935270695230217, + "loss": 1.3259, + "step": 3957 + }, + { + "epoch": 0.14174440883127115, + "grad_norm": 1.6628282070159912, + "learning_rate": 0.00019352296361785105, + "loss": 1.6113, + "step": 3958 + }, + { + "epoch": 0.14178022096083945, + "grad_norm": 1.5034743547439575, + "learning_rate": 0.00019351885645444957, + "loss": 1.5158, + "step": 3959 + }, + { + "epoch": 0.14181603309040772, + "grad_norm": 2.691425323486328, + "learning_rate": 0.0001935147480328725, + "loss": 1.6924, + "step": 3960 + }, + { + "epoch": 0.141851845219976, + "grad_norm": 1.750555157661438, + "learning_rate": 0.0001935106383531751, + "loss": 1.6288, + "step": 3961 + }, + { + "epoch": 0.14188765734954428, + "grad_norm": 2.0469725131988525, + "learning_rate": 0.00019350652741541272, + "loss": 1.7488, + "step": 3962 + }, + { + "epoch": 0.14192346947911258, + "grad_norm": 1.605979323387146, + "learning_rate": 0.00019350241521964062, + "loss": 1.6378, + "step": 3963 + }, + { + "epoch": 0.14195928160868085, + "grad_norm": 1.7067230939865112, + "learning_rate": 0.00019349830176591408, + "loss": 1.778, + "step": 3964 + }, + { + "epoch": 0.14199509373824915, + "grad_norm": 2.101348400115967, + "learning_rate": 0.00019349418705428854, + "loss": 1.4173, + "step": 3965 + }, + { + "epoch": 0.14203090586781744, + "grad_norm": 1.571287989616394, + "learning_rate": 0.0001934900710848193, + "loss": 1.6081, + "step": 3966 + }, + { + "epoch": 0.1420667179973857, + "grad_norm": 1.2210115194320679, + "learning_rate": 0.00019348595385756178, + "loss": 1.1689, + "step": 3967 + }, + { + "epoch": 0.142102530126954, + "grad_norm": 1.7547118663787842, + "learning_rate": 0.00019348183537257131, + "loss": 1.652, + "step": 3968 + }, + { + "epoch": 0.14213834225652228, + "grad_norm": 2.7394344806671143, + "learning_rate": 0.00019347771562990332, + "loss": 1.8632, + "step": 3969 + }, + { + "epoch": 0.14217415438609057, + "grad_norm": 1.5795948505401611, + "learning_rate": 0.00019347359462961326, + "loss": 1.5562, + "step": 3970 + }, + { + "epoch": 0.14220996651565884, + "grad_norm": 1.3725916147232056, + "learning_rate": 0.00019346947237175655, + "loss": 1.4961, + "step": 3971 + }, + { + "epoch": 0.14224577864522714, + "grad_norm": 1.5435025691986084, + "learning_rate": 0.00019346534885638866, + "loss": 1.877, + "step": 3972 + }, + { + "epoch": 0.14228159077479544, + "grad_norm": 1.6002931594848633, + "learning_rate": 0.00019346122408356507, + "loss": 1.6769, + "step": 3973 + }, + { + "epoch": 0.1423174029043637, + "grad_norm": 1.736573576927185, + "learning_rate": 0.00019345709805334123, + "loss": 1.5778, + "step": 3974 + }, + { + "epoch": 0.142353215033932, + "grad_norm": 1.5255703926086426, + "learning_rate": 0.00019345297076577272, + "loss": 1.8338, + "step": 3975 + }, + { + "epoch": 0.14238902716350027, + "grad_norm": 1.9835999011993408, + "learning_rate": 0.00019344884222091503, + "loss": 1.6607, + "step": 3976 + }, + { + "epoch": 0.14242483929306857, + "grad_norm": 1.5903600454330444, + "learning_rate": 0.00019344471241882372, + "loss": 1.5658, + "step": 3977 + }, + { + "epoch": 0.14246065142263684, + "grad_norm": 1.982418179512024, + "learning_rate": 0.0001934405813595543, + "loss": 1.7213, + "step": 3978 + }, + { + "epoch": 0.14249646355220513, + "grad_norm": 1.5569443702697754, + "learning_rate": 0.00019343644904316242, + "loss": 1.6512, + "step": 3979 + }, + { + "epoch": 0.14253227568177343, + "grad_norm": 1.252939224243164, + "learning_rate": 0.0001934323154697036, + "loss": 1.307, + "step": 3980 + }, + { + "epoch": 0.1425680878113417, + "grad_norm": 1.7318528890609741, + "learning_rate": 0.00019342818063923357, + "loss": 1.6283, + "step": 3981 + }, + { + "epoch": 0.14260389994091, + "grad_norm": 1.3830797672271729, + "learning_rate": 0.00019342404455180784, + "loss": 1.5416, + "step": 3982 + }, + { + "epoch": 0.14263971207047826, + "grad_norm": 1.8082152605056763, + "learning_rate": 0.00019341990720748208, + "loss": 1.4456, + "step": 3983 + }, + { + "epoch": 0.14267552420004656, + "grad_norm": 1.605903148651123, + "learning_rate": 0.000193415768606312, + "loss": 1.9392, + "step": 3984 + }, + { + "epoch": 0.14271133632961483, + "grad_norm": 1.7564297914505005, + "learning_rate": 0.00019341162874835326, + "loss": 1.9917, + "step": 3985 + }, + { + "epoch": 0.14274714845918313, + "grad_norm": 1.354068398475647, + "learning_rate": 0.00019340748763366152, + "loss": 1.6126, + "step": 3986 + }, + { + "epoch": 0.14278296058875142, + "grad_norm": 1.6402426958084106, + "learning_rate": 0.00019340334526229253, + "loss": 1.7782, + "step": 3987 + }, + { + "epoch": 0.1428187727183197, + "grad_norm": 1.6833535432815552, + "learning_rate": 0.00019339920163430202, + "loss": 1.6701, + "step": 3988 + }, + { + "epoch": 0.142854584847888, + "grad_norm": 2.3586952686309814, + "learning_rate": 0.0001933950567497457, + "loss": 1.6699, + "step": 3989 + }, + { + "epoch": 0.14289039697745626, + "grad_norm": 2.0771920680999756, + "learning_rate": 0.0001933909106086794, + "loss": 1.6299, + "step": 3990 + }, + { + "epoch": 0.14292620910702455, + "grad_norm": 1.3921229839324951, + "learning_rate": 0.00019338676321115883, + "loss": 1.4079, + "step": 3991 + }, + { + "epoch": 0.14296202123659282, + "grad_norm": 1.5394184589385986, + "learning_rate": 0.00019338261455723984, + "loss": 1.3872, + "step": 3992 + }, + { + "epoch": 0.14299783336616112, + "grad_norm": 1.6472933292388916, + "learning_rate": 0.00019337846464697825, + "loss": 1.3228, + "step": 3993 + }, + { + "epoch": 0.14303364549572942, + "grad_norm": 1.6850037574768066, + "learning_rate": 0.00019337431348042983, + "loss": 1.5189, + "step": 3994 + }, + { + "epoch": 0.14306945762529769, + "grad_norm": 3.3073441982269287, + "learning_rate": 0.00019337016105765048, + "loss": 1.5217, + "step": 3995 + }, + { + "epoch": 0.14310526975486598, + "grad_norm": 1.2370121479034424, + "learning_rate": 0.00019336600737869603, + "loss": 1.89, + "step": 3996 + }, + { + "epoch": 0.14314108188443425, + "grad_norm": 1.7099319696426392, + "learning_rate": 0.00019336185244362244, + "loss": 1.5129, + "step": 3997 + }, + { + "epoch": 0.14317689401400255, + "grad_norm": 1.7119805812835693, + "learning_rate": 0.0001933576962524855, + "loss": 1.3498, + "step": 3998 + }, + { + "epoch": 0.14321270614357082, + "grad_norm": 1.3768866062164307, + "learning_rate": 0.0001933535388053412, + "loss": 1.4905, + "step": 3999 + }, + { + "epoch": 0.1432485182731391, + "grad_norm": 2.029284715652466, + "learning_rate": 0.00019334938010224546, + "loss": 1.8039, + "step": 4000 + }, + { + "epoch": 0.1432843304027074, + "grad_norm": 1.4423243999481201, + "learning_rate": 0.0001933452201432542, + "loss": 1.7993, + "step": 4001 + }, + { + "epoch": 0.14332014253227568, + "grad_norm": 1.3633579015731812, + "learning_rate": 0.00019334105892842342, + "loss": 1.5531, + "step": 4002 + }, + { + "epoch": 0.14335595466184398, + "grad_norm": 1.5188398361206055, + "learning_rate": 0.00019333689645780912, + "loss": 1.7493, + "step": 4003 + }, + { + "epoch": 0.14339176679141225, + "grad_norm": 1.5770578384399414, + "learning_rate": 0.00019333273273146721, + "loss": 1.5965, + "step": 4004 + }, + { + "epoch": 0.14342757892098054, + "grad_norm": 1.7399095296859741, + "learning_rate": 0.00019332856774945383, + "loss": 1.6626, + "step": 4005 + }, + { + "epoch": 0.1434633910505488, + "grad_norm": 1.923545241355896, + "learning_rate": 0.00019332440151182493, + "loss": 1.245, + "step": 4006 + }, + { + "epoch": 0.1434992031801171, + "grad_norm": 1.3939712047576904, + "learning_rate": 0.00019332023401863658, + "loss": 1.3326, + "step": 4007 + }, + { + "epoch": 0.1435350153096854, + "grad_norm": 1.7758983373641968, + "learning_rate": 0.00019331606526994488, + "loss": 1.603, + "step": 4008 + }, + { + "epoch": 0.14357082743925367, + "grad_norm": 1.7037540674209595, + "learning_rate": 0.0001933118952658059, + "loss": 1.5085, + "step": 4009 + }, + { + "epoch": 0.14360663956882197, + "grad_norm": 1.764907956123352, + "learning_rate": 0.00019330772400627573, + "loss": 1.1214, + "step": 4010 + }, + { + "epoch": 0.14364245169839024, + "grad_norm": 1.9461796283721924, + "learning_rate": 0.00019330355149141046, + "loss": 1.5362, + "step": 4011 + }, + { + "epoch": 0.14367826382795854, + "grad_norm": 2.126535415649414, + "learning_rate": 0.00019329937772126626, + "loss": 1.9574, + "step": 4012 + }, + { + "epoch": 0.1437140759575268, + "grad_norm": 1.2146317958831787, + "learning_rate": 0.0001932952026958993, + "loss": 1.3589, + "step": 4013 + }, + { + "epoch": 0.1437498880870951, + "grad_norm": 1.6272441148757935, + "learning_rate": 0.00019329102641536575, + "loss": 1.4191, + "step": 4014 + }, + { + "epoch": 0.14378570021666337, + "grad_norm": 1.698939561843872, + "learning_rate": 0.00019328684887972173, + "loss": 1.6507, + "step": 4015 + }, + { + "epoch": 0.14382151234623167, + "grad_norm": 2.0913240909576416, + "learning_rate": 0.00019328267008902352, + "loss": 1.5051, + "step": 4016 + }, + { + "epoch": 0.14385732447579996, + "grad_norm": 1.6525379419326782, + "learning_rate": 0.00019327849004332728, + "loss": 1.7334, + "step": 4017 + }, + { + "epoch": 0.14389313660536823, + "grad_norm": 1.996968388557434, + "learning_rate": 0.0001932743087426893, + "loss": 1.6867, + "step": 4018 + }, + { + "epoch": 0.14392894873493653, + "grad_norm": 1.8276373147964478, + "learning_rate": 0.00019327012618716583, + "loss": 1.6823, + "step": 4019 + }, + { + "epoch": 0.1439647608645048, + "grad_norm": 1.7384955883026123, + "learning_rate": 0.00019326594237681311, + "loss": 1.9046, + "step": 4020 + }, + { + "epoch": 0.1440005729940731, + "grad_norm": 1.7132830619812012, + "learning_rate": 0.00019326175731168742, + "loss": 1.8533, + "step": 4021 + }, + { + "epoch": 0.14403638512364136, + "grad_norm": 1.467646837234497, + "learning_rate": 0.00019325757099184507, + "loss": 1.2429, + "step": 4022 + }, + { + "epoch": 0.14407219725320966, + "grad_norm": 1.7912276983261108, + "learning_rate": 0.00019325338341734245, + "loss": 1.685, + "step": 4023 + }, + { + "epoch": 0.14410800938277796, + "grad_norm": 1.2599592208862305, + "learning_rate": 0.00019324919458823582, + "loss": 1.5783, + "step": 4024 + }, + { + "epoch": 0.14414382151234623, + "grad_norm": 2.230529308319092, + "learning_rate": 0.00019324500450458153, + "loss": 1.7649, + "step": 4025 + }, + { + "epoch": 0.14417963364191452, + "grad_norm": 1.7477943897247314, + "learning_rate": 0.000193240813166436, + "loss": 1.5066, + "step": 4026 + }, + { + "epoch": 0.1442154457714828, + "grad_norm": 2.1602365970611572, + "learning_rate": 0.0001932366205738556, + "loss": 1.3825, + "step": 4027 + }, + { + "epoch": 0.1442512579010511, + "grad_norm": 1.7186968326568604, + "learning_rate": 0.00019323242672689676, + "loss": 1.6016, + "step": 4028 + }, + { + "epoch": 0.14428707003061936, + "grad_norm": 1.2913117408752441, + "learning_rate": 0.00019322823162561586, + "loss": 1.6547, + "step": 4029 + }, + { + "epoch": 0.14432288216018765, + "grad_norm": 2.078582525253296, + "learning_rate": 0.00019322403527006937, + "loss": 1.5363, + "step": 4030 + }, + { + "epoch": 0.14435869428975595, + "grad_norm": 1.4359171390533447, + "learning_rate": 0.00019321983766031373, + "loss": 1.8056, + "step": 4031 + }, + { + "epoch": 0.14439450641932422, + "grad_norm": 1.497952938079834, + "learning_rate": 0.00019321563879640542, + "loss": 1.421, + "step": 4032 + }, + { + "epoch": 0.14443031854889252, + "grad_norm": 1.6174252033233643, + "learning_rate": 0.00019321143867840091, + "loss": 1.5336, + "step": 4033 + }, + { + "epoch": 0.14446613067846079, + "grad_norm": 1.4377790689468384, + "learning_rate": 0.00019320723730635676, + "loss": 1.8566, + "step": 4034 + }, + { + "epoch": 0.14450194280802908, + "grad_norm": 2.628340482711792, + "learning_rate": 0.00019320303468032944, + "loss": 1.6271, + "step": 4035 + }, + { + "epoch": 0.14453775493759735, + "grad_norm": 1.5452464818954468, + "learning_rate": 0.00019319883080037552, + "loss": 1.7898, + "step": 4036 + }, + { + "epoch": 0.14457356706716565, + "grad_norm": 1.258744478225708, + "learning_rate": 0.00019319462566655155, + "loss": 1.727, + "step": 4037 + }, + { + "epoch": 0.14460937919673394, + "grad_norm": 1.481390357017517, + "learning_rate": 0.0001931904192789141, + "loss": 1.6695, + "step": 4038 + }, + { + "epoch": 0.1446451913263022, + "grad_norm": 1.921749234199524, + "learning_rate": 0.00019318621163751974, + "loss": 1.4772, + "step": 4039 + }, + { + "epoch": 0.1446810034558705, + "grad_norm": 1.3307794332504272, + "learning_rate": 0.00019318200274242515, + "loss": 1.7675, + "step": 4040 + }, + { + "epoch": 0.14471681558543878, + "grad_norm": 1.9500494003295898, + "learning_rate": 0.0001931777925936869, + "loss": 1.2512, + "step": 4041 + }, + { + "epoch": 0.14475262771500708, + "grad_norm": 1.4262527227401733, + "learning_rate": 0.00019317358119136163, + "loss": 1.5668, + "step": 4042 + }, + { + "epoch": 0.14478843984457535, + "grad_norm": 2.077671766281128, + "learning_rate": 0.000193169368535506, + "loss": 1.6646, + "step": 4043 + }, + { + "epoch": 0.14482425197414364, + "grad_norm": 2.521920680999756, + "learning_rate": 0.00019316515462617672, + "loss": 1.9527, + "step": 4044 + }, + { + "epoch": 0.14486006410371194, + "grad_norm": 2.3683269023895264, + "learning_rate": 0.00019316093946343044, + "loss": 1.3807, + "step": 4045 + }, + { + "epoch": 0.1448958762332802, + "grad_norm": 2.656454086303711, + "learning_rate": 0.00019315672304732388, + "loss": 1.38, + "step": 4046 + }, + { + "epoch": 0.1449316883628485, + "grad_norm": 2.149718999862671, + "learning_rate": 0.0001931525053779138, + "loss": 1.4746, + "step": 4047 + }, + { + "epoch": 0.14496750049241677, + "grad_norm": 1.8854492902755737, + "learning_rate": 0.00019314828645525692, + "loss": 1.5098, + "step": 4048 + }, + { + "epoch": 0.14500331262198507, + "grad_norm": 1.668682336807251, + "learning_rate": 0.00019314406627940996, + "loss": 1.3973, + "step": 4049 + }, + { + "epoch": 0.14503912475155334, + "grad_norm": 2.1312713623046875, + "learning_rate": 0.00019313984485042976, + "loss": 1.6263, + "step": 4050 + }, + { + "epoch": 0.14507493688112164, + "grad_norm": 1.5961402654647827, + "learning_rate": 0.0001931356221683731, + "loss": 1.6585, + "step": 4051 + }, + { + "epoch": 0.14511074901068993, + "grad_norm": 1.9257919788360596, + "learning_rate": 0.00019313139823329677, + "loss": 1.5084, + "step": 4052 + }, + { + "epoch": 0.1451465611402582, + "grad_norm": 1.2645249366760254, + "learning_rate": 0.00019312717304525762, + "loss": 1.4746, + "step": 4053 + }, + { + "epoch": 0.1451823732698265, + "grad_norm": 2.637812614440918, + "learning_rate": 0.00019312294660431246, + "loss": 2.0501, + "step": 4054 + }, + { + "epoch": 0.14521818539939477, + "grad_norm": 1.7504860162734985, + "learning_rate": 0.00019311871891051818, + "loss": 1.5642, + "step": 4055 + }, + { + "epoch": 0.14525399752896306, + "grad_norm": 1.7490657567977905, + "learning_rate": 0.00019311448996393163, + "loss": 1.4698, + "step": 4056 + }, + { + "epoch": 0.14528980965853133, + "grad_norm": 1.9256107807159424, + "learning_rate": 0.00019311025976460978, + "loss": 1.5543, + "step": 4057 + }, + { + "epoch": 0.14532562178809963, + "grad_norm": 1.8809314966201782, + "learning_rate": 0.00019310602831260944, + "loss": 1.6738, + "step": 4058 + }, + { + "epoch": 0.14536143391766793, + "grad_norm": 1.5549511909484863, + "learning_rate": 0.0001931017956079876, + "loss": 1.8064, + "step": 4059 + }, + { + "epoch": 0.1453972460472362, + "grad_norm": 1.6281778812408447, + "learning_rate": 0.0001930975616508012, + "loss": 1.6228, + "step": 4060 + }, + { + "epoch": 0.1454330581768045, + "grad_norm": 1.7993215322494507, + "learning_rate": 0.00019309332644110722, + "loss": 1.9817, + "step": 4061 + }, + { + "epoch": 0.14546887030637276, + "grad_norm": 1.8530840873718262, + "learning_rate": 0.0001930890899789626, + "loss": 1.3957, + "step": 4062 + }, + { + "epoch": 0.14550468243594106, + "grad_norm": 1.63814115524292, + "learning_rate": 0.0001930848522644243, + "loss": 1.6432, + "step": 4063 + }, + { + "epoch": 0.14554049456550933, + "grad_norm": 1.4699954986572266, + "learning_rate": 0.00019308061329754942, + "loss": 1.6721, + "step": 4064 + }, + { + "epoch": 0.14557630669507762, + "grad_norm": 1.9068433046340942, + "learning_rate": 0.00019307637307839498, + "loss": 1.4541, + "step": 4065 + }, + { + "epoch": 0.14561211882464592, + "grad_norm": 2.003532886505127, + "learning_rate": 0.00019307213160701798, + "loss": 1.5527, + "step": 4066 + }, + { + "epoch": 0.1456479309542142, + "grad_norm": 1.656280755996704, + "learning_rate": 0.0001930678888834755, + "loss": 1.4751, + "step": 4067 + }, + { + "epoch": 0.14568374308378249, + "grad_norm": 1.5825142860412598, + "learning_rate": 0.00019306364490782462, + "loss": 1.749, + "step": 4068 + }, + { + "epoch": 0.14571955521335075, + "grad_norm": 1.796554446220398, + "learning_rate": 0.00019305939968012245, + "loss": 1.6165, + "step": 4069 + }, + { + "epoch": 0.14575536734291905, + "grad_norm": 1.4555829763412476, + "learning_rate": 0.00019305515320042611, + "loss": 1.5499, + "step": 4070 + }, + { + "epoch": 0.14579117947248732, + "grad_norm": 1.407891869544983, + "learning_rate": 0.00019305090546879267, + "loss": 1.4587, + "step": 4071 + }, + { + "epoch": 0.14582699160205562, + "grad_norm": 1.5201910734176636, + "learning_rate": 0.00019304665648527935, + "loss": 1.4193, + "step": 4072 + }, + { + "epoch": 0.1458628037316239, + "grad_norm": 1.4494929313659668, + "learning_rate": 0.00019304240624994328, + "loss": 1.7287, + "step": 4073 + }, + { + "epoch": 0.14589861586119218, + "grad_norm": 2.1447548866271973, + "learning_rate": 0.00019303815476284168, + "loss": 1.4123, + "step": 4074 + }, + { + "epoch": 0.14593442799076048, + "grad_norm": 2.7469265460968018, + "learning_rate": 0.0001930339020240317, + "loss": 1.7491, + "step": 4075 + }, + { + "epoch": 0.14597024012032875, + "grad_norm": 1.5956698656082153, + "learning_rate": 0.00019302964803357057, + "loss": 1.7761, + "step": 4076 + }, + { + "epoch": 0.14600605224989704, + "grad_norm": 1.6789740324020386, + "learning_rate": 0.00019302539279151553, + "loss": 1.571, + "step": 4077 + }, + { + "epoch": 0.1460418643794653, + "grad_norm": 1.620974063873291, + "learning_rate": 0.00019302113629792383, + "loss": 1.1998, + "step": 4078 + }, + { + "epoch": 0.1460776765090336, + "grad_norm": 1.7867196798324585, + "learning_rate": 0.0001930168785528527, + "loss": 1.6286, + "step": 4079 + }, + { + "epoch": 0.1461134886386019, + "grad_norm": 1.3706187009811401, + "learning_rate": 0.00019301261955635948, + "loss": 1.7489, + "step": 4080 + }, + { + "epoch": 0.14614930076817018, + "grad_norm": 1.409914493560791, + "learning_rate": 0.00019300835930850143, + "loss": 1.466, + "step": 4081 + }, + { + "epoch": 0.14618511289773847, + "grad_norm": 1.6765789985656738, + "learning_rate": 0.0001930040978093359, + "loss": 1.4924, + "step": 4082 + }, + { + "epoch": 0.14622092502730674, + "grad_norm": 1.7411587238311768, + "learning_rate": 0.00019299983505892016, + "loss": 1.467, + "step": 4083 + }, + { + "epoch": 0.14625673715687504, + "grad_norm": 2.216538906097412, + "learning_rate": 0.00019299557105731166, + "loss": 1.4325, + "step": 4084 + }, + { + "epoch": 0.1462925492864433, + "grad_norm": 1.6220505237579346, + "learning_rate": 0.00019299130580456765, + "loss": 1.5235, + "step": 4085 + }, + { + "epoch": 0.1463283614160116, + "grad_norm": 1.5487260818481445, + "learning_rate": 0.0001929870393007456, + "loss": 1.7881, + "step": 4086 + }, + { + "epoch": 0.1463641735455799, + "grad_norm": 1.7539433240890503, + "learning_rate": 0.00019298277154590284, + "loss": 1.3377, + "step": 4087 + }, + { + "epoch": 0.14639998567514817, + "grad_norm": 1.9030964374542236, + "learning_rate": 0.0001929785025400969, + "loss": 1.5991, + "step": 4088 + }, + { + "epoch": 0.14643579780471647, + "grad_norm": 1.8642268180847168, + "learning_rate": 0.0001929742322833851, + "loss": 1.5565, + "step": 4089 + }, + { + "epoch": 0.14647160993428474, + "grad_norm": 2.047663450241089, + "learning_rate": 0.00019296996077582492, + "loss": 1.4049, + "step": 4090 + }, + { + "epoch": 0.14650742206385303, + "grad_norm": 1.8870108127593994, + "learning_rate": 0.00019296568801747385, + "loss": 1.6984, + "step": 4091 + }, + { + "epoch": 0.1465432341934213, + "grad_norm": 1.8283346891403198, + "learning_rate": 0.00019296141400838938, + "loss": 1.5549, + "step": 4092 + }, + { + "epoch": 0.1465790463229896, + "grad_norm": 1.5765265226364136, + "learning_rate": 0.00019295713874862896, + "loss": 1.3471, + "step": 4093 + }, + { + "epoch": 0.1466148584525579, + "grad_norm": 1.2447913885116577, + "learning_rate": 0.0001929528622382502, + "loss": 1.5457, + "step": 4094 + }, + { + "epoch": 0.14665067058212616, + "grad_norm": 1.6574128866195679, + "learning_rate": 0.00019294858447731054, + "loss": 1.4687, + "step": 4095 + }, + { + "epoch": 0.14668648271169446, + "grad_norm": 2.061941385269165, + "learning_rate": 0.0001929443054658676, + "loss": 1.8521, + "step": 4096 + }, + { + "epoch": 0.14672229484126273, + "grad_norm": 1.5628901720046997, + "learning_rate": 0.00019294002520397888, + "loss": 1.6355, + "step": 4097 + }, + { + "epoch": 0.14675810697083103, + "grad_norm": 1.9049891233444214, + "learning_rate": 0.000192935743691702, + "loss": 1.5275, + "step": 4098 + }, + { + "epoch": 0.1467939191003993, + "grad_norm": 1.6401816606521606, + "learning_rate": 0.00019293146092909462, + "loss": 1.5944, + "step": 4099 + }, + { + "epoch": 0.1468297312299676, + "grad_norm": 1.786863923072815, + "learning_rate": 0.00019292717691621428, + "loss": 1.579, + "step": 4100 + }, + { + "epoch": 0.1468655433595359, + "grad_norm": 1.6657721996307373, + "learning_rate": 0.00019292289165311863, + "loss": 1.6496, + "step": 4101 + }, + { + "epoch": 0.14690135548910416, + "grad_norm": 1.5139931440353394, + "learning_rate": 0.00019291860513986534, + "loss": 1.8056, + "step": 4102 + }, + { + "epoch": 0.14693716761867245, + "grad_norm": 2.9963998794555664, + "learning_rate": 0.0001929143173765121, + "loss": 1.6924, + "step": 4103 + }, + { + "epoch": 0.14697297974824072, + "grad_norm": 1.4028496742248535, + "learning_rate": 0.00019291002836311654, + "loss": 1.7265, + "step": 4104 + }, + { + "epoch": 0.14700879187780902, + "grad_norm": 1.964406132698059, + "learning_rate": 0.0001929057380997364, + "loss": 1.3505, + "step": 4105 + }, + { + "epoch": 0.1470446040073773, + "grad_norm": 1.6631462574005127, + "learning_rate": 0.0001929014465864294, + "loss": 1.5789, + "step": 4106 + }, + { + "epoch": 0.14708041613694559, + "grad_norm": 1.476067066192627, + "learning_rate": 0.00019289715382325327, + "loss": 1.4211, + "step": 4107 + }, + { + "epoch": 0.14711622826651388, + "grad_norm": 2.226459264755249, + "learning_rate": 0.00019289285981026577, + "loss": 1.8789, + "step": 4108 + }, + { + "epoch": 0.14715204039608215, + "grad_norm": 1.4748985767364502, + "learning_rate": 0.00019288856454752464, + "loss": 1.4772, + "step": 4109 + }, + { + "epoch": 0.14718785252565045, + "grad_norm": 1.3329429626464844, + "learning_rate": 0.0001928842680350877, + "loss": 1.4615, + "step": 4110 + }, + { + "epoch": 0.14722366465521872, + "grad_norm": 1.498295783996582, + "learning_rate": 0.00019287997027301275, + "loss": 1.5852, + "step": 4111 + }, + { + "epoch": 0.147259476784787, + "grad_norm": 1.641626238822937, + "learning_rate": 0.00019287567126135763, + "loss": 1.3915, + "step": 4112 + }, + { + "epoch": 0.14729528891435528, + "grad_norm": 1.6395761966705322, + "learning_rate": 0.00019287137100018013, + "loss": 2.055, + "step": 4113 + }, + { + "epoch": 0.14733110104392358, + "grad_norm": 1.7023566961288452, + "learning_rate": 0.00019286706948953812, + "loss": 1.8299, + "step": 4114 + }, + { + "epoch": 0.14736691317349185, + "grad_norm": 1.3379522562026978, + "learning_rate": 0.00019286276672948952, + "loss": 1.6071, + "step": 4115 + }, + { + "epoch": 0.14740272530306014, + "grad_norm": 1.9350535869598389, + "learning_rate": 0.00019285846272009213, + "loss": 1.64, + "step": 4116 + }, + { + "epoch": 0.14743853743262844, + "grad_norm": 1.9311000108718872, + "learning_rate": 0.00019285415746140392, + "loss": 1.7087, + "step": 4117 + }, + { + "epoch": 0.1474743495621967, + "grad_norm": 1.5109360218048096, + "learning_rate": 0.0001928498509534828, + "loss": 1.5686, + "step": 4118 + }, + { + "epoch": 0.147510161691765, + "grad_norm": 1.3077818155288696, + "learning_rate": 0.0001928455431963867, + "loss": 1.488, + "step": 4119 + }, + { + "epoch": 0.14754597382133328, + "grad_norm": 1.6297913789749146, + "learning_rate": 0.00019284123419017357, + "loss": 1.8312, + "step": 4120 + }, + { + "epoch": 0.14758178595090157, + "grad_norm": 1.9442793130874634, + "learning_rate": 0.0001928369239349014, + "loss": 1.6814, + "step": 4121 + }, + { + "epoch": 0.14761759808046984, + "grad_norm": 1.7997331619262695, + "learning_rate": 0.00019283261243062817, + "loss": 1.6751, + "step": 4122 + }, + { + "epoch": 0.14765341021003814, + "grad_norm": 1.4392348527908325, + "learning_rate": 0.0001928282996774119, + "loss": 1.5335, + "step": 4123 + }, + { + "epoch": 0.14768922233960644, + "grad_norm": 1.7808204889297485, + "learning_rate": 0.00019282398567531058, + "loss": 1.5752, + "step": 4124 + }, + { + "epoch": 0.1477250344691747, + "grad_norm": 1.8829363584518433, + "learning_rate": 0.00019281967042438227, + "loss": 1.5472, + "step": 4125 + }, + { + "epoch": 0.147760846598743, + "grad_norm": 1.8297572135925293, + "learning_rate": 0.000192815353924685, + "loss": 1.6218, + "step": 4126 + }, + { + "epoch": 0.14779665872831127, + "grad_norm": 1.4455066919326782, + "learning_rate": 0.0001928110361762769, + "loss": 1.5734, + "step": 4127 + }, + { + "epoch": 0.14783247085787957, + "grad_norm": 1.5757455825805664, + "learning_rate": 0.000192806717179216, + "loss": 1.561, + "step": 4128 + }, + { + "epoch": 0.14786828298744784, + "grad_norm": 1.614942193031311, + "learning_rate": 0.00019280239693356048, + "loss": 1.5828, + "step": 4129 + }, + { + "epoch": 0.14790409511701613, + "grad_norm": 1.358445405960083, + "learning_rate": 0.0001927980754393684, + "loss": 1.7682, + "step": 4130 + }, + { + "epoch": 0.14793990724658443, + "grad_norm": 2.172544240951538, + "learning_rate": 0.00019279375269669785, + "loss": 1.5469, + "step": 4131 + }, + { + "epoch": 0.1479757193761527, + "grad_norm": 1.9252666234970093, + "learning_rate": 0.00019278942870560713, + "loss": 1.8746, + "step": 4132 + }, + { + "epoch": 0.148011531505721, + "grad_norm": 1.5914863348007202, + "learning_rate": 0.0001927851034661543, + "loss": 1.3982, + "step": 4133 + }, + { + "epoch": 0.14804734363528926, + "grad_norm": 1.750622272491455, + "learning_rate": 0.0001927807769783976, + "loss": 1.4232, + "step": 4134 + }, + { + "epoch": 0.14808315576485756, + "grad_norm": 2.3878767490386963, + "learning_rate": 0.0001927764492423952, + "loss": 1.6743, + "step": 4135 + }, + { + "epoch": 0.14811896789442583, + "grad_norm": 1.5392639636993408, + "learning_rate": 0.0001927721202582054, + "loss": 1.5216, + "step": 4136 + }, + { + "epoch": 0.14815478002399413, + "grad_norm": 1.3905284404754639, + "learning_rate": 0.00019276779002588634, + "loss": 1.6186, + "step": 4137 + }, + { + "epoch": 0.14819059215356242, + "grad_norm": 1.1709767580032349, + "learning_rate": 0.00019276345854549634, + "loss": 1.6602, + "step": 4138 + }, + { + "epoch": 0.1482264042831307, + "grad_norm": 1.6870055198669434, + "learning_rate": 0.00019275912581709367, + "loss": 1.8051, + "step": 4139 + }, + { + "epoch": 0.148262216412699, + "grad_norm": 1.4576029777526855, + "learning_rate": 0.0001927547918407366, + "loss": 1.6819, + "step": 4140 + }, + { + "epoch": 0.14829802854226726, + "grad_norm": 1.3755351305007935, + "learning_rate": 0.00019275045661648344, + "loss": 1.6577, + "step": 4141 + }, + { + "epoch": 0.14833384067183555, + "grad_norm": 1.6038774251937866, + "learning_rate": 0.00019274612014439258, + "loss": 1.6699, + "step": 4142 + }, + { + "epoch": 0.14836965280140382, + "grad_norm": 1.760777473449707, + "learning_rate": 0.00019274178242452224, + "loss": 1.6629, + "step": 4143 + }, + { + "epoch": 0.14840546493097212, + "grad_norm": 2.301041841506958, + "learning_rate": 0.0001927374434569309, + "loss": 1.7042, + "step": 4144 + }, + { + "epoch": 0.14844127706054042, + "grad_norm": 1.588143229484558, + "learning_rate": 0.00019273310324167687, + "loss": 1.8193, + "step": 4145 + }, + { + "epoch": 0.14847708919010869, + "grad_norm": 1.4113539457321167, + "learning_rate": 0.00019272876177881852, + "loss": 1.3863, + "step": 4146 + }, + { + "epoch": 0.14851290131967698, + "grad_norm": 2.251722812652588, + "learning_rate": 0.00019272441906841432, + "loss": 1.5958, + "step": 4147 + }, + { + "epoch": 0.14854871344924525, + "grad_norm": 1.7772836685180664, + "learning_rate": 0.00019272007511052266, + "loss": 1.7736, + "step": 4148 + }, + { + "epoch": 0.14858452557881355, + "grad_norm": 1.554674744606018, + "learning_rate": 0.000192715729905202, + "loss": 1.6975, + "step": 4149 + }, + { + "epoch": 0.14862033770838182, + "grad_norm": 1.403333306312561, + "learning_rate": 0.00019271138345251077, + "loss": 1.7412, + "step": 4150 + }, + { + "epoch": 0.1486561498379501, + "grad_norm": 1.7260535955429077, + "learning_rate": 0.00019270703575250748, + "loss": 1.5755, + "step": 4151 + }, + { + "epoch": 0.1486919619675184, + "grad_norm": 2.66255784034729, + "learning_rate": 0.0001927026868052506, + "loss": 1.34, + "step": 4152 + }, + { + "epoch": 0.14872777409708668, + "grad_norm": 1.3532320261001587, + "learning_rate": 0.00019269833661079866, + "loss": 1.6399, + "step": 4153 + }, + { + "epoch": 0.14876358622665498, + "grad_norm": 3.2480742931365967, + "learning_rate": 0.00019269398516921015, + "loss": 2.3012, + "step": 4154 + }, + { + "epoch": 0.14879939835622324, + "grad_norm": 1.9341964721679688, + "learning_rate": 0.00019268963248054367, + "loss": 1.818, + "step": 4155 + }, + { + "epoch": 0.14883521048579154, + "grad_norm": 1.7220872640609741, + "learning_rate": 0.00019268527854485773, + "loss": 1.5048, + "step": 4156 + }, + { + "epoch": 0.1488710226153598, + "grad_norm": 1.5572283267974854, + "learning_rate": 0.0001926809233622109, + "loss": 1.9191, + "step": 4157 + }, + { + "epoch": 0.1489068347449281, + "grad_norm": 2.1267168521881104, + "learning_rate": 0.0001926765669326618, + "loss": 1.3807, + "step": 4158 + }, + { + "epoch": 0.1489426468744964, + "grad_norm": 2.1267030239105225, + "learning_rate": 0.00019267220925626907, + "loss": 1.7096, + "step": 4159 + }, + { + "epoch": 0.14897845900406467, + "grad_norm": 1.8242478370666504, + "learning_rate": 0.00019266785033309128, + "loss": 1.2927, + "step": 4160 + }, + { + "epoch": 0.14901427113363297, + "grad_norm": 1.9160438776016235, + "learning_rate": 0.0001926634901631871, + "loss": 1.6361, + "step": 4161 + }, + { + "epoch": 0.14905008326320124, + "grad_norm": 2.731289863586426, + "learning_rate": 0.00019265912874661515, + "loss": 1.3703, + "step": 4162 + }, + { + "epoch": 0.14908589539276954, + "grad_norm": 1.2577362060546875, + "learning_rate": 0.0001926547660834342, + "loss": 1.7034, + "step": 4163 + }, + { + "epoch": 0.1491217075223378, + "grad_norm": 2.197948932647705, + "learning_rate": 0.00019265040217370286, + "loss": 1.4072, + "step": 4164 + }, + { + "epoch": 0.1491575196519061, + "grad_norm": 1.9935760498046875, + "learning_rate": 0.0001926460370174799, + "loss": 1.3064, + "step": 4165 + }, + { + "epoch": 0.1491933317814744, + "grad_norm": 1.7972018718719482, + "learning_rate": 0.00019264167061482397, + "loss": 1.4971, + "step": 4166 + }, + { + "epoch": 0.14922914391104267, + "grad_norm": 1.4733848571777344, + "learning_rate": 0.0001926373029657939, + "loss": 1.6895, + "step": 4167 + }, + { + "epoch": 0.14926495604061096, + "grad_norm": 1.9356968402862549, + "learning_rate": 0.00019263293407044838, + "loss": 1.792, + "step": 4168 + }, + { + "epoch": 0.14930076817017923, + "grad_norm": 1.5070140361785889, + "learning_rate": 0.00019262856392884625, + "loss": 1.6962, + "step": 4169 + }, + { + "epoch": 0.14933658029974753, + "grad_norm": 1.9698539972305298, + "learning_rate": 0.00019262419254104628, + "loss": 1.7214, + "step": 4170 + }, + { + "epoch": 0.1493723924293158, + "grad_norm": 1.7777533531188965, + "learning_rate": 0.00019261981990710723, + "loss": 1.6434, + "step": 4171 + }, + { + "epoch": 0.1494082045588841, + "grad_norm": 1.478132963180542, + "learning_rate": 0.000192615446027088, + "loss": 1.493, + "step": 4172 + }, + { + "epoch": 0.1494440166884524, + "grad_norm": 1.4548566341400146, + "learning_rate": 0.00019261107090104743, + "loss": 1.5991, + "step": 4173 + }, + { + "epoch": 0.14947982881802066, + "grad_norm": 1.5640240907669067, + "learning_rate": 0.00019260669452904433, + "loss": 1.3423, + "step": 4174 + }, + { + "epoch": 0.14951564094758896, + "grad_norm": 1.3845349550247192, + "learning_rate": 0.00019260231691113763, + "loss": 1.8214, + "step": 4175 + }, + { + "epoch": 0.14955145307715723, + "grad_norm": 3.009308099746704, + "learning_rate": 0.00019259793804738619, + "loss": 1.6082, + "step": 4176 + }, + { + "epoch": 0.14958726520672552, + "grad_norm": 1.3763492107391357, + "learning_rate": 0.00019259355793784893, + "loss": 1.6317, + "step": 4177 + }, + { + "epoch": 0.1496230773362938, + "grad_norm": 1.6250135898590088, + "learning_rate": 0.00019258917658258483, + "loss": 1.7019, + "step": 4178 + }, + { + "epoch": 0.1496588894658621, + "grad_norm": 1.358526349067688, + "learning_rate": 0.00019258479398165273, + "loss": 1.6496, + "step": 4179 + }, + { + "epoch": 0.14969470159543039, + "grad_norm": 2.3600802421569824, + "learning_rate": 0.00019258041013511167, + "loss": 1.5639, + "step": 4180 + }, + { + "epoch": 0.14973051372499865, + "grad_norm": 2.442429542541504, + "learning_rate": 0.00019257602504302063, + "loss": 1.8827, + "step": 4181 + }, + { + "epoch": 0.14976632585456695, + "grad_norm": 2.7102713584899902, + "learning_rate": 0.0001925716387054386, + "loss": 1.8546, + "step": 4182 + }, + { + "epoch": 0.14980213798413522, + "grad_norm": 2.178633689880371, + "learning_rate": 0.00019256725112242455, + "loss": 2.0248, + "step": 4183 + }, + { + "epoch": 0.14983795011370352, + "grad_norm": 1.9340856075286865, + "learning_rate": 0.00019256286229403754, + "loss": 1.5944, + "step": 4184 + }, + { + "epoch": 0.14987376224327179, + "grad_norm": 1.5424754619598389, + "learning_rate": 0.00019255847222033663, + "loss": 1.711, + "step": 4185 + }, + { + "epoch": 0.14990957437284008, + "grad_norm": 2.0131659507751465, + "learning_rate": 0.00019255408090138086, + "loss": 1.7849, + "step": 4186 + }, + { + "epoch": 0.14994538650240838, + "grad_norm": 1.3488818407058716, + "learning_rate": 0.00019254968833722934, + "loss": 1.567, + "step": 4187 + }, + { + "epoch": 0.14998119863197665, + "grad_norm": 1.4595369100570679, + "learning_rate": 0.0001925452945279411, + "loss": 1.5943, + "step": 4188 + }, + { + "epoch": 0.15001701076154494, + "grad_norm": 2.130014419555664, + "learning_rate": 0.00019254089947357534, + "loss": 1.6016, + "step": 4189 + }, + { + "epoch": 0.1500528228911132, + "grad_norm": 1.4605454206466675, + "learning_rate": 0.00019253650317419113, + "loss": 1.6543, + "step": 4190 + }, + { + "epoch": 0.1500886350206815, + "grad_norm": 1.6974399089813232, + "learning_rate": 0.0001925321056298476, + "loss": 1.473, + "step": 4191 + }, + { + "epoch": 0.15012444715024978, + "grad_norm": 1.9349365234375, + "learning_rate": 0.000192527706840604, + "loss": 1.4904, + "step": 4192 + }, + { + "epoch": 0.15016025927981808, + "grad_norm": 1.4719817638397217, + "learning_rate": 0.00019252330680651945, + "loss": 1.6002, + "step": 4193 + }, + { + "epoch": 0.15019607140938637, + "grad_norm": 1.5579456090927124, + "learning_rate": 0.0001925189055276531, + "loss": 1.2423, + "step": 4194 + }, + { + "epoch": 0.15023188353895464, + "grad_norm": 1.1475658416748047, + "learning_rate": 0.00019251450300406426, + "loss": 1.4654, + "step": 4195 + }, + { + "epoch": 0.15026769566852294, + "grad_norm": 2.370964527130127, + "learning_rate": 0.00019251009923581213, + "loss": 1.782, + "step": 4196 + }, + { + "epoch": 0.1503035077980912, + "grad_norm": 1.2127045392990112, + "learning_rate": 0.0001925056942229559, + "loss": 1.5602, + "step": 4197 + }, + { + "epoch": 0.1503393199276595, + "grad_norm": 1.3405200242996216, + "learning_rate": 0.00019250128796555492, + "loss": 1.7569, + "step": 4198 + }, + { + "epoch": 0.15037513205722777, + "grad_norm": 1.5121461153030396, + "learning_rate": 0.0001924968804636684, + "loss": 1.5301, + "step": 4199 + }, + { + "epoch": 0.15041094418679607, + "grad_norm": 1.8312376737594604, + "learning_rate": 0.0001924924717173557, + "loss": 1.5413, + "step": 4200 + }, + { + "epoch": 0.15044675631636437, + "grad_norm": 1.5825523138046265, + "learning_rate": 0.00019248806172667606, + "loss": 1.7082, + "step": 4201 + }, + { + "epoch": 0.15048256844593264, + "grad_norm": 1.664043664932251, + "learning_rate": 0.00019248365049168888, + "loss": 1.6665, + "step": 4202 + }, + { + "epoch": 0.15051838057550093, + "grad_norm": 1.4566935300827026, + "learning_rate": 0.00019247923801245345, + "loss": 1.4875, + "step": 4203 + }, + { + "epoch": 0.1505541927050692, + "grad_norm": 1.3645408153533936, + "learning_rate": 0.0001924748242890292, + "loss": 1.578, + "step": 4204 + }, + { + "epoch": 0.1505900048346375, + "grad_norm": 1.360119104385376, + "learning_rate": 0.00019247040932147546, + "loss": 1.6422, + "step": 4205 + }, + { + "epoch": 0.15062581696420577, + "grad_norm": 1.9559032917022705, + "learning_rate": 0.00019246599310985163, + "loss": 1.64, + "step": 4206 + }, + { + "epoch": 0.15066162909377406, + "grad_norm": 1.3689755201339722, + "learning_rate": 0.0001924615756542171, + "loss": 1.5453, + "step": 4207 + }, + { + "epoch": 0.15069744122334236, + "grad_norm": 2.615208864212036, + "learning_rate": 0.0001924571569546314, + "loss": 1.7152, + "step": 4208 + }, + { + "epoch": 0.15073325335291063, + "grad_norm": 1.3856520652770996, + "learning_rate": 0.00019245273701115387, + "loss": 1.5176, + "step": 4209 + }, + { + "epoch": 0.15076906548247893, + "grad_norm": 1.9545027017593384, + "learning_rate": 0.00019244831582384406, + "loss": 1.38, + "step": 4210 + }, + { + "epoch": 0.1508048776120472, + "grad_norm": 1.9589897394180298, + "learning_rate": 0.0001924438933927614, + "loss": 1.5865, + "step": 4211 + }, + { + "epoch": 0.1508406897416155, + "grad_norm": 1.9281854629516602, + "learning_rate": 0.00019243946971796535, + "loss": 1.2587, + "step": 4212 + }, + { + "epoch": 0.15087650187118376, + "grad_norm": 2.179154396057129, + "learning_rate": 0.00019243504479951552, + "loss": 1.6455, + "step": 4213 + }, + { + "epoch": 0.15091231400075206, + "grad_norm": 1.6948529481887817, + "learning_rate": 0.00019243061863747138, + "loss": 1.5106, + "step": 4214 + }, + { + "epoch": 0.15094812613032033, + "grad_norm": 1.5750123262405396, + "learning_rate": 0.0001924261912318925, + "loss": 1.7466, + "step": 4215 + }, + { + "epoch": 0.15098393825988862, + "grad_norm": 1.5252922773361206, + "learning_rate": 0.00019242176258283845, + "loss": 1.4174, + "step": 4216 + }, + { + "epoch": 0.15101975038945692, + "grad_norm": 1.4911702871322632, + "learning_rate": 0.00019241733269036878, + "loss": 1.792, + "step": 4217 + }, + { + "epoch": 0.1510555625190252, + "grad_norm": 2.6139893531799316, + "learning_rate": 0.0001924129015545431, + "loss": 1.496, + "step": 4218 + }, + { + "epoch": 0.15109137464859348, + "grad_norm": 1.4543615579605103, + "learning_rate": 0.00019240846917542107, + "loss": 1.3647, + "step": 4219 + }, + { + "epoch": 0.15112718677816175, + "grad_norm": 2.1806602478027344, + "learning_rate": 0.00019240403555306225, + "loss": 1.8613, + "step": 4220 + }, + { + "epoch": 0.15116299890773005, + "grad_norm": 1.576529622077942, + "learning_rate": 0.00019239960068752633, + "loss": 1.5466, + "step": 4221 + }, + { + "epoch": 0.15119881103729832, + "grad_norm": 1.780757188796997, + "learning_rate": 0.00019239516457887298, + "loss": 1.732, + "step": 4222 + }, + { + "epoch": 0.15123462316686662, + "grad_norm": 1.6536751985549927, + "learning_rate": 0.00019239072722716186, + "loss": 1.5458, + "step": 4223 + }, + { + "epoch": 0.1512704352964349, + "grad_norm": 3.4212424755096436, + "learning_rate": 0.0001923862886324527, + "loss": 1.6249, + "step": 4224 + }, + { + "epoch": 0.15130624742600318, + "grad_norm": 2.4516830444335938, + "learning_rate": 0.00019238184879480518, + "loss": 1.722, + "step": 4225 + }, + { + "epoch": 0.15134205955557148, + "grad_norm": 1.8271950483322144, + "learning_rate": 0.00019237740771427906, + "loss": 1.3643, + "step": 4226 + }, + { + "epoch": 0.15137787168513975, + "grad_norm": 2.7619059085845947, + "learning_rate": 0.00019237296539093408, + "loss": 1.786, + "step": 4227 + }, + { + "epoch": 0.15141368381470804, + "grad_norm": 2.0190656185150146, + "learning_rate": 0.00019236852182482998, + "loss": 1.1294, + "step": 4228 + }, + { + "epoch": 0.1514494959442763, + "grad_norm": 2.5435988903045654, + "learning_rate": 0.0001923640770160266, + "loss": 1.965, + "step": 4229 + }, + { + "epoch": 0.1514853080738446, + "grad_norm": 1.3937095403671265, + "learning_rate": 0.00019235963096458366, + "loss": 1.7719, + "step": 4230 + }, + { + "epoch": 0.1515211202034129, + "grad_norm": 1.7603644132614136, + "learning_rate": 0.00019235518367056106, + "loss": 1.9055, + "step": 4231 + }, + { + "epoch": 0.15155693233298118, + "grad_norm": 1.535352349281311, + "learning_rate": 0.0001923507351340186, + "loss": 1.7588, + "step": 4232 + }, + { + "epoch": 0.15159274446254947, + "grad_norm": 1.4259074926376343, + "learning_rate": 0.00019234628535501607, + "loss": 1.5549, + "step": 4233 + }, + { + "epoch": 0.15162855659211774, + "grad_norm": 1.2318248748779297, + "learning_rate": 0.00019234183433361344, + "loss": 1.6689, + "step": 4234 + }, + { + "epoch": 0.15166436872168604, + "grad_norm": 1.5642166137695312, + "learning_rate": 0.0001923373820698705, + "loss": 1.5143, + "step": 4235 + }, + { + "epoch": 0.1517001808512543, + "grad_norm": 2.0307435989379883, + "learning_rate": 0.00019233292856384723, + "loss": 1.6097, + "step": 4236 + }, + { + "epoch": 0.1517359929808226, + "grad_norm": 1.8250125646591187, + "learning_rate": 0.00019232847381560347, + "loss": 1.7707, + "step": 4237 + }, + { + "epoch": 0.1517718051103909, + "grad_norm": 1.4493552446365356, + "learning_rate": 0.00019232401782519923, + "loss": 1.683, + "step": 4238 + }, + { + "epoch": 0.15180761723995917, + "grad_norm": 1.6072957515716553, + "learning_rate": 0.0001923195605926944, + "loss": 1.5616, + "step": 4239 + }, + { + "epoch": 0.15184342936952747, + "grad_norm": 2.072479724884033, + "learning_rate": 0.00019231510211814896, + "loss": 1.5442, + "step": 4240 + }, + { + "epoch": 0.15187924149909574, + "grad_norm": 2.20919132232666, + "learning_rate": 0.0001923106424016229, + "loss": 1.8976, + "step": 4241 + }, + { + "epoch": 0.15191505362866403, + "grad_norm": 1.0948854684829712, + "learning_rate": 0.00019230618144317624, + "loss": 1.4725, + "step": 4242 + }, + { + "epoch": 0.1519508657582323, + "grad_norm": 1.5968300104141235, + "learning_rate": 0.00019230171924286896, + "loss": 1.4642, + "step": 4243 + }, + { + "epoch": 0.1519866778878006, + "grad_norm": 1.6487561464309692, + "learning_rate": 0.0001922972558007611, + "loss": 1.7243, + "step": 4244 + }, + { + "epoch": 0.1520224900173689, + "grad_norm": 1.4044201374053955, + "learning_rate": 0.00019229279111691272, + "loss": 1.6046, + "step": 4245 + }, + { + "epoch": 0.15205830214693716, + "grad_norm": 1.3864117860794067, + "learning_rate": 0.0001922883251913839, + "loss": 1.6748, + "step": 4246 + }, + { + "epoch": 0.15209411427650546, + "grad_norm": 2.1303157806396484, + "learning_rate": 0.00019228385802423469, + "loss": 1.7468, + "step": 4247 + }, + { + "epoch": 0.15212992640607373, + "grad_norm": 2.085751533508301, + "learning_rate": 0.0001922793896155252, + "loss": 1.9624, + "step": 4248 + }, + { + "epoch": 0.15216573853564203, + "grad_norm": 1.5157827138900757, + "learning_rate": 0.00019227491996531558, + "loss": 1.6942, + "step": 4249 + }, + { + "epoch": 0.1522015506652103, + "grad_norm": 2.0757040977478027, + "learning_rate": 0.00019227044907366595, + "loss": 1.4183, + "step": 4250 + }, + { + "epoch": 0.1522373627947786, + "grad_norm": 1.8636008501052856, + "learning_rate": 0.00019226597694063638, + "loss": 1.5945, + "step": 4251 + }, + { + "epoch": 0.1522731749243469, + "grad_norm": 1.7461687326431274, + "learning_rate": 0.0001922615035662872, + "loss": 1.5256, + "step": 4252 + }, + { + "epoch": 0.15230898705391516, + "grad_norm": 1.3671756982803345, + "learning_rate": 0.00019225702895067843, + "loss": 1.4619, + "step": 4253 + }, + { + "epoch": 0.15234479918348345, + "grad_norm": 1.5393552780151367, + "learning_rate": 0.00019225255309387036, + "loss": 1.728, + "step": 4254 + }, + { + "epoch": 0.15238061131305172, + "grad_norm": 2.7406222820281982, + "learning_rate": 0.00019224807599592318, + "loss": 1.7323, + "step": 4255 + }, + { + "epoch": 0.15241642344262002, + "grad_norm": 1.4855775833129883, + "learning_rate": 0.00019224359765689713, + "loss": 1.7204, + "step": 4256 + }, + { + "epoch": 0.1524522355721883, + "grad_norm": 1.3277029991149902, + "learning_rate": 0.00019223911807685244, + "loss": 1.5232, + "step": 4257 + }, + { + "epoch": 0.15248804770175658, + "grad_norm": 1.8301455974578857, + "learning_rate": 0.00019223463725584944, + "loss": 1.5253, + "step": 4258 + }, + { + "epoch": 0.15252385983132488, + "grad_norm": 1.7147241830825806, + "learning_rate": 0.00019223015519394834, + "loss": 1.6035, + "step": 4259 + }, + { + "epoch": 0.15255967196089315, + "grad_norm": 2.0909276008605957, + "learning_rate": 0.00019222567189120947, + "loss": 1.3304, + "step": 4260 + }, + { + "epoch": 0.15259548409046145, + "grad_norm": 2.0987610816955566, + "learning_rate": 0.00019222118734769317, + "loss": 1.3313, + "step": 4261 + }, + { + "epoch": 0.15263129622002972, + "grad_norm": 2.262599229812622, + "learning_rate": 0.00019221670156345971, + "loss": 1.5674, + "step": 4262 + }, + { + "epoch": 0.152667108349598, + "grad_norm": 1.5876708030700684, + "learning_rate": 0.00019221221453856954, + "loss": 1.7479, + "step": 4263 + }, + { + "epoch": 0.15270292047916628, + "grad_norm": 1.6196600198745728, + "learning_rate": 0.00019220772627308292, + "loss": 1.5288, + "step": 4264 + }, + { + "epoch": 0.15273873260873458, + "grad_norm": 1.6314212083816528, + "learning_rate": 0.00019220323676706028, + "loss": 1.6841, + "step": 4265 + }, + { + "epoch": 0.15277454473830288, + "grad_norm": 2.1747360229492188, + "learning_rate": 0.00019219874602056204, + "loss": 1.8169, + "step": 4266 + }, + { + "epoch": 0.15281035686787114, + "grad_norm": 1.5729329586029053, + "learning_rate": 0.0001921942540336486, + "loss": 1.5527, + "step": 4267 + }, + { + "epoch": 0.15284616899743944, + "grad_norm": 2.512627124786377, + "learning_rate": 0.00019218976080638043, + "loss": 2.074, + "step": 4268 + }, + { + "epoch": 0.1528819811270077, + "grad_norm": 1.6818349361419678, + "learning_rate": 0.0001921852663388179, + "loss": 1.5731, + "step": 4269 + }, + { + "epoch": 0.152917793256576, + "grad_norm": 1.7092454433441162, + "learning_rate": 0.0001921807706310215, + "loss": 1.5929, + "step": 4270 + }, + { + "epoch": 0.15295360538614428, + "grad_norm": 1.1693865060806274, + "learning_rate": 0.00019217627368305176, + "loss": 1.7299, + "step": 4271 + }, + { + "epoch": 0.15298941751571257, + "grad_norm": 1.8270220756530762, + "learning_rate": 0.0001921717754949692, + "loss": 1.6689, + "step": 4272 + }, + { + "epoch": 0.15302522964528087, + "grad_norm": 1.5601493120193481, + "learning_rate": 0.00019216727606683425, + "loss": 1.5714, + "step": 4273 + }, + { + "epoch": 0.15306104177484914, + "grad_norm": 1.6041169166564941, + "learning_rate": 0.00019216277539870752, + "loss": 1.5596, + "step": 4274 + }, + { + "epoch": 0.15309685390441743, + "grad_norm": 1.5292716026306152, + "learning_rate": 0.00019215827349064948, + "loss": 1.6235, + "step": 4275 + }, + { + "epoch": 0.1531326660339857, + "grad_norm": 2.1486856937408447, + "learning_rate": 0.00019215377034272074, + "loss": 1.7512, + "step": 4276 + }, + { + "epoch": 0.153168478163554, + "grad_norm": 1.281849980354309, + "learning_rate": 0.00019214926595498196, + "loss": 1.5935, + "step": 4277 + }, + { + "epoch": 0.15320429029312227, + "grad_norm": 2.1205649375915527, + "learning_rate": 0.0001921447603274936, + "loss": 1.6512, + "step": 4278 + }, + { + "epoch": 0.15324010242269057, + "grad_norm": 1.811474323272705, + "learning_rate": 0.0001921402534603164, + "loss": 1.7293, + "step": 4279 + }, + { + "epoch": 0.15327591455225886, + "grad_norm": 1.8452061414718628, + "learning_rate": 0.00019213574535351092, + "loss": 1.6552, + "step": 4280 + }, + { + "epoch": 0.15331172668182713, + "grad_norm": 2.1379551887512207, + "learning_rate": 0.00019213123600713783, + "loss": 1.6141, + "step": 4281 + }, + { + "epoch": 0.15334753881139543, + "grad_norm": 1.5196058750152588, + "learning_rate": 0.0001921267254212578, + "loss": 1.5217, + "step": 4282 + }, + { + "epoch": 0.1533833509409637, + "grad_norm": 1.5301889181137085, + "learning_rate": 0.00019212221359593152, + "loss": 1.2763, + "step": 4283 + }, + { + "epoch": 0.153419163070532, + "grad_norm": 1.7867982387542725, + "learning_rate": 0.00019211770053121968, + "loss": 1.7346, + "step": 4284 + }, + { + "epoch": 0.15345497520010026, + "grad_norm": 2.244570016860962, + "learning_rate": 0.000192113186227183, + "loss": 1.5217, + "step": 4285 + }, + { + "epoch": 0.15349078732966856, + "grad_norm": 2.4065821170806885, + "learning_rate": 0.0001921086706838822, + "loss": 1.4745, + "step": 4286 + }, + { + "epoch": 0.15352659945923686, + "grad_norm": 1.6467057466506958, + "learning_rate": 0.0001921041539013781, + "loss": 1.5964, + "step": 4287 + }, + { + "epoch": 0.15356241158880513, + "grad_norm": 1.5443044900894165, + "learning_rate": 0.00019209963587973138, + "loss": 1.8898, + "step": 4288 + }, + { + "epoch": 0.15359822371837342, + "grad_norm": 2.1365907192230225, + "learning_rate": 0.00019209511661900285, + "loss": 1.4256, + "step": 4289 + }, + { + "epoch": 0.1536340358479417, + "grad_norm": 1.4355337619781494, + "learning_rate": 0.00019209059611925336, + "loss": 1.4913, + "step": 4290 + }, + { + "epoch": 0.15366984797751, + "grad_norm": 1.4051859378814697, + "learning_rate": 0.00019208607438054364, + "loss": 1.547, + "step": 4291 + }, + { + "epoch": 0.15370566010707826, + "grad_norm": 1.3148490190505981, + "learning_rate": 0.0001920815514029346, + "loss": 1.7636, + "step": 4292 + }, + { + "epoch": 0.15374147223664655, + "grad_norm": 1.518978476524353, + "learning_rate": 0.00019207702718648705, + "loss": 1.4276, + "step": 4293 + }, + { + "epoch": 0.15377728436621485, + "grad_norm": 1.5426995754241943, + "learning_rate": 0.00019207250173126187, + "loss": 1.5931, + "step": 4294 + }, + { + "epoch": 0.15381309649578312, + "grad_norm": 1.3305436372756958, + "learning_rate": 0.00019206797503731996, + "loss": 1.5414, + "step": 4295 + }, + { + "epoch": 0.15384890862535142, + "grad_norm": 1.3565562963485718, + "learning_rate": 0.0001920634471047222, + "loss": 1.4051, + "step": 4296 + }, + { + "epoch": 0.15388472075491968, + "grad_norm": 1.4529234170913696, + "learning_rate": 0.0001920589179335295, + "loss": 1.7174, + "step": 4297 + }, + { + "epoch": 0.15392053288448798, + "grad_norm": 1.6896700859069824, + "learning_rate": 0.00019205438752380283, + "loss": 1.5858, + "step": 4298 + }, + { + "epoch": 0.15395634501405625, + "grad_norm": 2.139944314956665, + "learning_rate": 0.00019204985587560307, + "loss": 1.871, + "step": 4299 + }, + { + "epoch": 0.15399215714362455, + "grad_norm": 2.2128217220306396, + "learning_rate": 0.00019204532298899127, + "loss": 1.7976, + "step": 4300 + }, + { + "epoch": 0.15402796927319284, + "grad_norm": 1.833006501197815, + "learning_rate": 0.0001920407888640284, + "loss": 1.8781, + "step": 4301 + }, + { + "epoch": 0.1540637814027611, + "grad_norm": 1.8609815835952759, + "learning_rate": 0.00019203625350077541, + "loss": 1.89, + "step": 4302 + }, + { + "epoch": 0.1540995935323294, + "grad_norm": 1.195251703262329, + "learning_rate": 0.00019203171689929333, + "loss": 1.6183, + "step": 4303 + }, + { + "epoch": 0.15413540566189768, + "grad_norm": 1.212415099143982, + "learning_rate": 0.00019202717905964325, + "loss": 1.4988, + "step": 4304 + }, + { + "epoch": 0.15417121779146598, + "grad_norm": 1.2291085720062256, + "learning_rate": 0.00019202263998188617, + "loss": 1.6022, + "step": 4305 + }, + { + "epoch": 0.15420702992103424, + "grad_norm": 1.3216090202331543, + "learning_rate": 0.00019201809966608316, + "loss": 1.5615, + "step": 4306 + }, + { + "epoch": 0.15424284205060254, + "grad_norm": 2.6345887184143066, + "learning_rate": 0.0001920135581122953, + "loss": 1.4008, + "step": 4307 + }, + { + "epoch": 0.1542786541801708, + "grad_norm": 2.237443208694458, + "learning_rate": 0.00019200901532058376, + "loss": 1.5771, + "step": 4308 + }, + { + "epoch": 0.1543144663097391, + "grad_norm": 1.5229663848876953, + "learning_rate": 0.00019200447129100954, + "loss": 1.4401, + "step": 4309 + }, + { + "epoch": 0.1543502784393074, + "grad_norm": 1.4248111248016357, + "learning_rate": 0.00019199992602363385, + "loss": 1.6815, + "step": 4310 + }, + { + "epoch": 0.15438609056887567, + "grad_norm": 1.6188117265701294, + "learning_rate": 0.00019199537951851788, + "loss": 1.7073, + "step": 4311 + }, + { + "epoch": 0.15442190269844397, + "grad_norm": 1.5151900053024292, + "learning_rate": 0.0001919908317757227, + "loss": 1.5666, + "step": 4312 + }, + { + "epoch": 0.15445771482801224, + "grad_norm": 2.0137276649475098, + "learning_rate": 0.00019198628279530952, + "loss": 1.9482, + "step": 4313 + }, + { + "epoch": 0.15449352695758053, + "grad_norm": 2.2357749938964844, + "learning_rate": 0.00019198173257733961, + "loss": 1.5095, + "step": 4314 + }, + { + "epoch": 0.1545293390871488, + "grad_norm": 1.722032070159912, + "learning_rate": 0.00019197718112187409, + "loss": 1.2756, + "step": 4315 + }, + { + "epoch": 0.1545651512167171, + "grad_norm": 1.9286119937896729, + "learning_rate": 0.00019197262842897425, + "loss": 1.2332, + "step": 4316 + }, + { + "epoch": 0.1546009633462854, + "grad_norm": 1.6464585065841675, + "learning_rate": 0.00019196807449870133, + "loss": 1.768, + "step": 4317 + }, + { + "epoch": 0.15463677547585367, + "grad_norm": 1.3432539701461792, + "learning_rate": 0.00019196351933111662, + "loss": 1.3892, + "step": 4318 + }, + { + "epoch": 0.15467258760542196, + "grad_norm": 1.8034002780914307, + "learning_rate": 0.00019195896292628138, + "loss": 1.5942, + "step": 4319 + }, + { + "epoch": 0.15470839973499023, + "grad_norm": 2.2822370529174805, + "learning_rate": 0.00019195440528425688, + "loss": 1.6211, + "step": 4320 + }, + { + "epoch": 0.15474421186455853, + "grad_norm": 1.6064921617507935, + "learning_rate": 0.00019194984640510447, + "loss": 1.4712, + "step": 4321 + }, + { + "epoch": 0.1547800239941268, + "grad_norm": 1.5366289615631104, + "learning_rate": 0.00019194528628888554, + "loss": 1.847, + "step": 4322 + }, + { + "epoch": 0.1548158361236951, + "grad_norm": 1.3502074480056763, + "learning_rate": 0.00019194072493566134, + "loss": 1.9242, + "step": 4323 + }, + { + "epoch": 0.1548516482532634, + "grad_norm": 1.394463300704956, + "learning_rate": 0.00019193616234549328, + "loss": 1.4996, + "step": 4324 + }, + { + "epoch": 0.15488746038283166, + "grad_norm": 1.9019322395324707, + "learning_rate": 0.00019193159851844276, + "loss": 1.5957, + "step": 4325 + }, + { + "epoch": 0.15492327251239996, + "grad_norm": 1.9424875974655151, + "learning_rate": 0.00019192703345457114, + "loss": 1.6297, + "step": 4326 + }, + { + "epoch": 0.15495908464196823, + "grad_norm": 1.5669649839401245, + "learning_rate": 0.00019192246715393988, + "loss": 1.7531, + "step": 4327 + }, + { + "epoch": 0.15499489677153652, + "grad_norm": 1.4756217002868652, + "learning_rate": 0.0001919178996166104, + "loss": 1.7796, + "step": 4328 + }, + { + "epoch": 0.1550307089011048, + "grad_norm": 1.9263927936553955, + "learning_rate": 0.00019191333084264412, + "loss": 1.907, + "step": 4329 + }, + { + "epoch": 0.1550665210306731, + "grad_norm": 1.990947961807251, + "learning_rate": 0.00019190876083210258, + "loss": 1.7693, + "step": 4330 + }, + { + "epoch": 0.15510233316024138, + "grad_norm": 1.4774898290634155, + "learning_rate": 0.00019190418958504716, + "loss": 1.6509, + "step": 4331 + }, + { + "epoch": 0.15513814528980965, + "grad_norm": 1.6892197132110596, + "learning_rate": 0.00019189961710153948, + "loss": 1.7677, + "step": 4332 + }, + { + "epoch": 0.15517395741937795, + "grad_norm": 1.8601287603378296, + "learning_rate": 0.00019189504338164095, + "loss": 1.5163, + "step": 4333 + }, + { + "epoch": 0.15520976954894622, + "grad_norm": 2.1573426723480225, + "learning_rate": 0.00019189046842541316, + "loss": 1.9389, + "step": 4334 + }, + { + "epoch": 0.15524558167851452, + "grad_norm": 2.2533397674560547, + "learning_rate": 0.00019188589223291763, + "loss": 1.6179, + "step": 4335 + }, + { + "epoch": 0.15528139380808278, + "grad_norm": 1.462187647819519, + "learning_rate": 0.00019188131480421595, + "loss": 1.5003, + "step": 4336 + }, + { + "epoch": 0.15531720593765108, + "grad_norm": 1.378979206085205, + "learning_rate": 0.0001918767361393697, + "loss": 1.674, + "step": 4337 + }, + { + "epoch": 0.15535301806721938, + "grad_norm": 1.7389543056488037, + "learning_rate": 0.00019187215623844053, + "loss": 1.5242, + "step": 4338 + }, + { + "epoch": 0.15538883019678765, + "grad_norm": 1.7959777116775513, + "learning_rate": 0.00019186757510148995, + "loss": 1.5021, + "step": 4339 + }, + { + "epoch": 0.15542464232635594, + "grad_norm": 1.8340651988983154, + "learning_rate": 0.00019186299272857965, + "loss": 1.6096, + "step": 4340 + }, + { + "epoch": 0.1554604544559242, + "grad_norm": 1.2857201099395752, + "learning_rate": 0.0001918584091197713, + "loss": 1.5341, + "step": 4341 + }, + { + "epoch": 0.1554962665854925, + "grad_norm": 1.4234918355941772, + "learning_rate": 0.00019185382427512653, + "loss": 1.3857, + "step": 4342 + }, + { + "epoch": 0.15553207871506078, + "grad_norm": 1.4376306533813477, + "learning_rate": 0.00019184923819470703, + "loss": 1.683, + "step": 4343 + }, + { + "epoch": 0.15556789084462908, + "grad_norm": 1.7839604616165161, + "learning_rate": 0.0001918446508785745, + "loss": 1.4875, + "step": 4344 + }, + { + "epoch": 0.15560370297419737, + "grad_norm": 2.093585968017578, + "learning_rate": 0.00019184006232679068, + "loss": 1.708, + "step": 4345 + }, + { + "epoch": 0.15563951510376564, + "grad_norm": 1.6490429639816284, + "learning_rate": 0.00019183547253941733, + "loss": 1.4749, + "step": 4346 + }, + { + "epoch": 0.15567532723333394, + "grad_norm": 1.4258878231048584, + "learning_rate": 0.0001918308815165161, + "loss": 1.5839, + "step": 4347 + }, + { + "epoch": 0.1557111393629022, + "grad_norm": 1.3602529764175415, + "learning_rate": 0.0001918262892581488, + "loss": 1.4586, + "step": 4348 + }, + { + "epoch": 0.1557469514924705, + "grad_norm": 2.043414354324341, + "learning_rate": 0.00019182169576437724, + "loss": 1.6374, + "step": 4349 + }, + { + "epoch": 0.15578276362203877, + "grad_norm": 1.3405836820602417, + "learning_rate": 0.00019181710103526321, + "loss": 1.7475, + "step": 4350 + }, + { + "epoch": 0.15581857575160707, + "grad_norm": 1.6657061576843262, + "learning_rate": 0.00019181250507086854, + "loss": 1.4629, + "step": 4351 + }, + { + "epoch": 0.15585438788117537, + "grad_norm": 1.7391798496246338, + "learning_rate": 0.00019180790787125504, + "loss": 1.6842, + "step": 4352 + }, + { + "epoch": 0.15589020001074363, + "grad_norm": 1.8803669214248657, + "learning_rate": 0.00019180330943648454, + "loss": 1.7868, + "step": 4353 + }, + { + "epoch": 0.15592601214031193, + "grad_norm": 1.6519395112991333, + "learning_rate": 0.00019179870976661895, + "loss": 1.5024, + "step": 4354 + }, + { + "epoch": 0.1559618242698802, + "grad_norm": 1.9108158349990845, + "learning_rate": 0.0001917941088617201, + "loss": 1.5709, + "step": 4355 + }, + { + "epoch": 0.1559976363994485, + "grad_norm": 1.362729787826538, + "learning_rate": 0.00019178950672184996, + "loss": 1.6681, + "step": 4356 + }, + { + "epoch": 0.15603344852901677, + "grad_norm": 1.1817349195480347, + "learning_rate": 0.0001917849033470704, + "loss": 1.6116, + "step": 4357 + }, + { + "epoch": 0.15606926065858506, + "grad_norm": 1.8846251964569092, + "learning_rate": 0.00019178029873744335, + "loss": 1.6507, + "step": 4358 + }, + { + "epoch": 0.15610507278815336, + "grad_norm": 1.3459134101867676, + "learning_rate": 0.00019177569289303078, + "loss": 1.5352, + "step": 4359 + }, + { + "epoch": 0.15614088491772163, + "grad_norm": 1.8612664937973022, + "learning_rate": 0.00019177108581389462, + "loss": 1.7641, + "step": 4360 + }, + { + "epoch": 0.15617669704728993, + "grad_norm": 2.042421817779541, + "learning_rate": 0.0001917664775000969, + "loss": 1.5105, + "step": 4361 + }, + { + "epoch": 0.1562125091768582, + "grad_norm": 1.3293653726577759, + "learning_rate": 0.00019176186795169956, + "loss": 1.5479, + "step": 4362 + }, + { + "epoch": 0.1562483213064265, + "grad_norm": 1.579801321029663, + "learning_rate": 0.0001917572571687647, + "loss": 1.5348, + "step": 4363 + }, + { + "epoch": 0.15628413343599476, + "grad_norm": 1.945090889930725, + "learning_rate": 0.00019175264515135427, + "loss": 1.7108, + "step": 4364 + }, + { + "epoch": 0.15631994556556306, + "grad_norm": 1.8579822778701782, + "learning_rate": 0.00019174803189953035, + "loss": 2.0244, + "step": 4365 + }, + { + "epoch": 0.15635575769513135, + "grad_norm": 1.6764092445373535, + "learning_rate": 0.00019174341741335504, + "loss": 1.4705, + "step": 4366 + }, + { + "epoch": 0.15639156982469962, + "grad_norm": 1.4812837839126587, + "learning_rate": 0.00019173880169289035, + "loss": 1.6117, + "step": 4367 + }, + { + "epoch": 0.15642738195426792, + "grad_norm": 1.7119643688201904, + "learning_rate": 0.00019173418473819844, + "loss": 1.6282, + "step": 4368 + }, + { + "epoch": 0.1564631940838362, + "grad_norm": 2.8325753211975098, + "learning_rate": 0.0001917295665493414, + "loss": 1.4901, + "step": 4369 + }, + { + "epoch": 0.15649900621340448, + "grad_norm": 1.717469334602356, + "learning_rate": 0.00019172494712638136, + "loss": 1.7564, + "step": 4370 + }, + { + "epoch": 0.15653481834297275, + "grad_norm": 3.0501840114593506, + "learning_rate": 0.0001917203264693805, + "loss": 1.5696, + "step": 4371 + }, + { + "epoch": 0.15657063047254105, + "grad_norm": 2.0719053745269775, + "learning_rate": 0.0001917157045784009, + "loss": 1.5447, + "step": 4372 + }, + { + "epoch": 0.15660644260210935, + "grad_norm": 2.070021390914917, + "learning_rate": 0.00019171108145350484, + "loss": 1.8622, + "step": 4373 + }, + { + "epoch": 0.15664225473167762, + "grad_norm": 1.451671838760376, + "learning_rate": 0.00019170645709475447, + "loss": 1.3056, + "step": 4374 + }, + { + "epoch": 0.1566780668612459, + "grad_norm": 1.7899963855743408, + "learning_rate": 0.00019170183150221201, + "loss": 1.3817, + "step": 4375 + }, + { + "epoch": 0.15671387899081418, + "grad_norm": 2.0643718242645264, + "learning_rate": 0.00019169720467593972, + "loss": 1.751, + "step": 4376 + }, + { + "epoch": 0.15674969112038248, + "grad_norm": 1.62465238571167, + "learning_rate": 0.0001916925766159998, + "loss": 1.5946, + "step": 4377 + }, + { + "epoch": 0.15678550324995075, + "grad_norm": 1.3463857173919678, + "learning_rate": 0.0001916879473224545, + "loss": 1.3212, + "step": 4378 + }, + { + "epoch": 0.15682131537951904, + "grad_norm": 1.371046543121338, + "learning_rate": 0.00019168331679536623, + "loss": 1.8971, + "step": 4379 + }, + { + "epoch": 0.15685712750908734, + "grad_norm": 1.3901337385177612, + "learning_rate": 0.00019167868503479712, + "loss": 1.6782, + "step": 4380 + }, + { + "epoch": 0.1568929396386556, + "grad_norm": 1.584809422492981, + "learning_rate": 0.00019167405204080956, + "loss": 1.5364, + "step": 4381 + }, + { + "epoch": 0.1569287517682239, + "grad_norm": 1.3930258750915527, + "learning_rate": 0.00019166941781346592, + "loss": 1.2805, + "step": 4382 + }, + { + "epoch": 0.15696456389779218, + "grad_norm": 2.2190914154052734, + "learning_rate": 0.0001916647823528285, + "loss": 1.8246, + "step": 4383 + }, + { + "epoch": 0.15700037602736047, + "grad_norm": 2.376316547393799, + "learning_rate": 0.00019166014565895966, + "loss": 1.8629, + "step": 4384 + }, + { + "epoch": 0.15703618815692874, + "grad_norm": 1.8719273805618286, + "learning_rate": 0.0001916555077319218, + "loss": 1.642, + "step": 4385 + }, + { + "epoch": 0.15707200028649704, + "grad_norm": 1.8520594835281372, + "learning_rate": 0.0001916508685717773, + "loss": 1.6127, + "step": 4386 + }, + { + "epoch": 0.15710781241606533, + "grad_norm": 1.3201779127120972, + "learning_rate": 0.0001916462281785886, + "loss": 1.5689, + "step": 4387 + }, + { + "epoch": 0.1571436245456336, + "grad_norm": 2.1866772174835205, + "learning_rate": 0.0001916415865524181, + "loss": 1.6067, + "step": 4388 + }, + { + "epoch": 0.1571794366752019, + "grad_norm": 1.5497702360153198, + "learning_rate": 0.00019163694369332825, + "loss": 1.3363, + "step": 4389 + }, + { + "epoch": 0.15721524880477017, + "grad_norm": 1.541739583015442, + "learning_rate": 0.00019163229960138156, + "loss": 1.3411, + "step": 4390 + }, + { + "epoch": 0.15725106093433847, + "grad_norm": 1.7324460744857788, + "learning_rate": 0.00019162765427664045, + "loss": 1.6654, + "step": 4391 + }, + { + "epoch": 0.15728687306390673, + "grad_norm": 1.6018710136413574, + "learning_rate": 0.00019162300771916746, + "loss": 1.6184, + "step": 4392 + }, + { + "epoch": 0.15732268519347503, + "grad_norm": 1.3000634908676147, + "learning_rate": 0.00019161835992902507, + "loss": 1.5956, + "step": 4393 + }, + { + "epoch": 0.15735849732304333, + "grad_norm": 1.8652485609054565, + "learning_rate": 0.00019161371090627583, + "loss": 1.5652, + "step": 4394 + }, + { + "epoch": 0.1573943094526116, + "grad_norm": 1.4492467641830444, + "learning_rate": 0.00019160906065098228, + "loss": 1.4706, + "step": 4395 + }, + { + "epoch": 0.1574301215821799, + "grad_norm": 1.5934580564498901, + "learning_rate": 0.00019160440916320698, + "loss": 1.6347, + "step": 4396 + }, + { + "epoch": 0.15746593371174816, + "grad_norm": 1.5383509397506714, + "learning_rate": 0.00019159975644301256, + "loss": 1.7649, + "step": 4397 + }, + { + "epoch": 0.15750174584131646, + "grad_norm": 1.6854779720306396, + "learning_rate": 0.00019159510249046154, + "loss": 1.6996, + "step": 4398 + }, + { + "epoch": 0.15753755797088473, + "grad_norm": 1.5617847442626953, + "learning_rate": 0.00019159044730561656, + "loss": 1.7046, + "step": 4399 + }, + { + "epoch": 0.15757337010045303, + "grad_norm": 1.5053625106811523, + "learning_rate": 0.00019158579088854026, + "loss": 1.6323, + "step": 4400 + }, + { + "epoch": 0.15760918223002132, + "grad_norm": 1.4751994609832764, + "learning_rate": 0.0001915811332392953, + "loss": 1.6127, + "step": 4401 + }, + { + "epoch": 0.1576449943595896, + "grad_norm": 1.9280856847763062, + "learning_rate": 0.00019157647435794428, + "loss": 1.2965, + "step": 4402 + }, + { + "epoch": 0.1576808064891579, + "grad_norm": 2.083721160888672, + "learning_rate": 0.00019157181424454996, + "loss": 1.899, + "step": 4403 + }, + { + "epoch": 0.15771661861872616, + "grad_norm": 1.2885043621063232, + "learning_rate": 0.00019156715289917497, + "loss": 1.5413, + "step": 4404 + }, + { + "epoch": 0.15775243074829445, + "grad_norm": 1.6024374961853027, + "learning_rate": 0.0001915624903218821, + "loss": 1.4269, + "step": 4405 + }, + { + "epoch": 0.15778824287786272, + "grad_norm": 2.1277174949645996, + "learning_rate": 0.00019155782651273398, + "loss": 1.9484, + "step": 4406 + }, + { + "epoch": 0.15782405500743102, + "grad_norm": 1.5377131700515747, + "learning_rate": 0.00019155316147179342, + "loss": 1.5133, + "step": 4407 + }, + { + "epoch": 0.1578598671369993, + "grad_norm": 1.6273584365844727, + "learning_rate": 0.00019154849519912318, + "loss": 1.7354, + "step": 4408 + }, + { + "epoch": 0.15789567926656758, + "grad_norm": 2.1824471950531006, + "learning_rate": 0.00019154382769478602, + "loss": 1.6386, + "step": 4409 + }, + { + "epoch": 0.15793149139613588, + "grad_norm": 1.4899530410766602, + "learning_rate": 0.00019153915895884474, + "loss": 1.5148, + "step": 4410 + }, + { + "epoch": 0.15796730352570415, + "grad_norm": 1.584786057472229, + "learning_rate": 0.00019153448899136212, + "loss": 1.3899, + "step": 4411 + }, + { + "epoch": 0.15800311565527245, + "grad_norm": 1.3985209465026855, + "learning_rate": 0.00019152981779240106, + "loss": 1.7417, + "step": 4412 + }, + { + "epoch": 0.15803892778484072, + "grad_norm": 1.8065505027770996, + "learning_rate": 0.00019152514536202437, + "loss": 1.1059, + "step": 4413 + }, + { + "epoch": 0.158074739914409, + "grad_norm": 1.7250769138336182, + "learning_rate": 0.0001915204717002949, + "loss": 1.819, + "step": 4414 + }, + { + "epoch": 0.15811055204397728, + "grad_norm": 1.400039553642273, + "learning_rate": 0.00019151579680727553, + "loss": 1.586, + "step": 4415 + }, + { + "epoch": 0.15814636417354558, + "grad_norm": 2.593348264694214, + "learning_rate": 0.00019151112068302917, + "loss": 1.9007, + "step": 4416 + }, + { + "epoch": 0.15818217630311388, + "grad_norm": 1.6109260320663452, + "learning_rate": 0.0001915064433276187, + "loss": 1.4546, + "step": 4417 + }, + { + "epoch": 0.15821798843268214, + "grad_norm": 1.6492670774459839, + "learning_rate": 0.0001915017647411071, + "loss": 1.5743, + "step": 4418 + }, + { + "epoch": 0.15825380056225044, + "grad_norm": 2.3043372631073, + "learning_rate": 0.00019149708492355728, + "loss": 2.1891, + "step": 4419 + }, + { + "epoch": 0.1582896126918187, + "grad_norm": 1.7343984842300415, + "learning_rate": 0.0001914924038750322, + "loss": 1.817, + "step": 4420 + }, + { + "epoch": 0.158325424821387, + "grad_norm": 1.7777115106582642, + "learning_rate": 0.00019148772159559486, + "loss": 1.4785, + "step": 4421 + }, + { + "epoch": 0.15836123695095528, + "grad_norm": 1.8788021802902222, + "learning_rate": 0.00019148303808530818, + "loss": 1.4365, + "step": 4422 + }, + { + "epoch": 0.15839704908052357, + "grad_norm": 1.6759637594223022, + "learning_rate": 0.00019147835334423527, + "loss": 1.5425, + "step": 4423 + }, + { + "epoch": 0.15843286121009187, + "grad_norm": 1.5373001098632812, + "learning_rate": 0.0001914736673724391, + "loss": 1.4686, + "step": 4424 + }, + { + "epoch": 0.15846867333966014, + "grad_norm": 1.998205542564392, + "learning_rate": 0.00019146898016998273, + "loss": 1.7219, + "step": 4425 + }, + { + "epoch": 0.15850448546922843, + "grad_norm": 1.5967192649841309, + "learning_rate": 0.00019146429173692923, + "loss": 1.4809, + "step": 4426 + }, + { + "epoch": 0.1585402975987967, + "grad_norm": 1.7499881982803345, + "learning_rate": 0.00019145960207334165, + "loss": 1.4854, + "step": 4427 + }, + { + "epoch": 0.158576109728365, + "grad_norm": 1.3718316555023193, + "learning_rate": 0.00019145491117928312, + "loss": 1.7586, + "step": 4428 + }, + { + "epoch": 0.15861192185793327, + "grad_norm": 1.6473380327224731, + "learning_rate": 0.00019145021905481673, + "loss": 1.8481, + "step": 4429 + }, + { + "epoch": 0.15864773398750157, + "grad_norm": 1.416988730430603, + "learning_rate": 0.00019144552570000558, + "loss": 1.5625, + "step": 4430 + }, + { + "epoch": 0.15868354611706986, + "grad_norm": 1.5032716989517212, + "learning_rate": 0.00019144083111491284, + "loss": 1.511, + "step": 4431 + }, + { + "epoch": 0.15871935824663813, + "grad_norm": 1.6411312818527222, + "learning_rate": 0.0001914361352996017, + "loss": 1.3405, + "step": 4432 + }, + { + "epoch": 0.15875517037620643, + "grad_norm": 1.9214622974395752, + "learning_rate": 0.00019143143825413526, + "loss": 1.6389, + "step": 4433 + }, + { + "epoch": 0.1587909825057747, + "grad_norm": 1.5103355646133423, + "learning_rate": 0.00019142673997857678, + "loss": 1.4921, + "step": 4434 + }, + { + "epoch": 0.158826794635343, + "grad_norm": 1.3821022510528564, + "learning_rate": 0.00019142204047298945, + "loss": 1.4057, + "step": 4435 + }, + { + "epoch": 0.15886260676491126, + "grad_norm": 2.0215821266174316, + "learning_rate": 0.00019141733973743644, + "loss": 1.7836, + "step": 4436 + }, + { + "epoch": 0.15889841889447956, + "grad_norm": 1.9990155696868896, + "learning_rate": 0.0001914126377719811, + "loss": 1.508, + "step": 4437 + }, + { + "epoch": 0.15893423102404786, + "grad_norm": 1.4154623746871948, + "learning_rate": 0.00019140793457668665, + "loss": 1.4155, + "step": 4438 + }, + { + "epoch": 0.15897004315361613, + "grad_norm": 1.843180775642395, + "learning_rate": 0.0001914032301516163, + "loss": 1.6476, + "step": 4439 + }, + { + "epoch": 0.15900585528318442, + "grad_norm": 1.4247946739196777, + "learning_rate": 0.0001913985244968334, + "loss": 1.4814, + "step": 4440 + }, + { + "epoch": 0.1590416674127527, + "grad_norm": 1.6840612888336182, + "learning_rate": 0.00019139381761240127, + "loss": 1.5899, + "step": 4441 + }, + { + "epoch": 0.159077479542321, + "grad_norm": 1.6597590446472168, + "learning_rate": 0.00019138910949838321, + "loss": 1.4572, + "step": 4442 + }, + { + "epoch": 0.15911329167188926, + "grad_norm": 1.8821818828582764, + "learning_rate": 0.0001913844001548425, + "loss": 1.6773, + "step": 4443 + }, + { + "epoch": 0.15914910380145755, + "grad_norm": 1.239434838294983, + "learning_rate": 0.00019137968958184265, + "loss": 1.5669, + "step": 4444 + }, + { + "epoch": 0.15918491593102585, + "grad_norm": 1.371330738067627, + "learning_rate": 0.00019137497777944691, + "loss": 1.2934, + "step": 4445 + }, + { + "epoch": 0.15922072806059412, + "grad_norm": 2.062408447265625, + "learning_rate": 0.00019137026474771874, + "loss": 1.3111, + "step": 4446 + }, + { + "epoch": 0.15925654019016242, + "grad_norm": 2.3759512901306152, + "learning_rate": 0.00019136555048672145, + "loss": 1.4451, + "step": 4447 + }, + { + "epoch": 0.15929235231973068, + "grad_norm": 2.019629716873169, + "learning_rate": 0.0001913608349965186, + "loss": 1.6197, + "step": 4448 + }, + { + "epoch": 0.15932816444929898, + "grad_norm": 1.272171139717102, + "learning_rate": 0.0001913561182771735, + "loss": 1.6714, + "step": 4449 + }, + { + "epoch": 0.15936397657886725, + "grad_norm": 1.7505909204483032, + "learning_rate": 0.00019135140032874973, + "loss": 1.5886, + "step": 4450 + }, + { + "epoch": 0.15939978870843555, + "grad_norm": 1.6725510358810425, + "learning_rate": 0.00019134668115131068, + "loss": 1.6745, + "step": 4451 + }, + { + "epoch": 0.15943560083800384, + "grad_norm": 1.7542790174484253, + "learning_rate": 0.00019134196074491988, + "loss": 1.584, + "step": 4452 + }, + { + "epoch": 0.1594714129675721, + "grad_norm": 1.595931053161621, + "learning_rate": 0.00019133723910964078, + "loss": 1.5579, + "step": 4453 + }, + { + "epoch": 0.1595072250971404, + "grad_norm": 2.000361204147339, + "learning_rate": 0.00019133251624553696, + "loss": 1.5491, + "step": 4454 + }, + { + "epoch": 0.15954303722670868, + "grad_norm": 1.6233510971069336, + "learning_rate": 0.00019132779215267197, + "loss": 1.7638, + "step": 4455 + }, + { + "epoch": 0.15957884935627697, + "grad_norm": 1.5333667993545532, + "learning_rate": 0.00019132306683110933, + "loss": 1.4675, + "step": 4456 + }, + { + "epoch": 0.15961466148584524, + "grad_norm": 1.6372523307800293, + "learning_rate": 0.0001913183402809126, + "loss": 1.8686, + "step": 4457 + }, + { + "epoch": 0.15965047361541354, + "grad_norm": 1.6973719596862793, + "learning_rate": 0.00019131361250214541, + "loss": 1.4094, + "step": 4458 + }, + { + "epoch": 0.15968628574498184, + "grad_norm": 1.5486109256744385, + "learning_rate": 0.00019130888349487134, + "loss": 1.2735, + "step": 4459 + }, + { + "epoch": 0.1597220978745501, + "grad_norm": 1.7678258419036865, + "learning_rate": 0.00019130415325915406, + "loss": 1.454, + "step": 4460 + }, + { + "epoch": 0.1597579100041184, + "grad_norm": 1.7122492790222168, + "learning_rate": 0.00019129942179505713, + "loss": 1.3684, + "step": 4461 + }, + { + "epoch": 0.15979372213368667, + "grad_norm": 1.4079463481903076, + "learning_rate": 0.00019129468910264428, + "loss": 1.6309, + "step": 4462 + }, + { + "epoch": 0.15982953426325497, + "grad_norm": 2.084505319595337, + "learning_rate": 0.00019128995518197912, + "loss": 1.6246, + "step": 4463 + }, + { + "epoch": 0.15986534639282324, + "grad_norm": 1.141055941581726, + "learning_rate": 0.00019128522003312537, + "loss": 1.2627, + "step": 4464 + }, + { + "epoch": 0.15990115852239153, + "grad_norm": 1.7960704565048218, + "learning_rate": 0.00019128048365614676, + "loss": 1.5314, + "step": 4465 + }, + { + "epoch": 0.15993697065195983, + "grad_norm": 1.8227843046188354, + "learning_rate": 0.00019127574605110693, + "loss": 1.8467, + "step": 4466 + }, + { + "epoch": 0.1599727827815281, + "grad_norm": 1.508379340171814, + "learning_rate": 0.00019127100721806975, + "loss": 1.6528, + "step": 4467 + }, + { + "epoch": 0.1600085949110964, + "grad_norm": 1.5757975578308105, + "learning_rate": 0.00019126626715709885, + "loss": 1.25, + "step": 4468 + }, + { + "epoch": 0.16004440704066467, + "grad_norm": 1.5202853679656982, + "learning_rate": 0.00019126152586825806, + "loss": 1.4711, + "step": 4469 + }, + { + "epoch": 0.16008021917023296, + "grad_norm": 1.781907558441162, + "learning_rate": 0.00019125678335161117, + "loss": 1.4699, + "step": 4470 + }, + { + "epoch": 0.16011603129980123, + "grad_norm": 1.2357120513916016, + "learning_rate": 0.00019125203960722198, + "loss": 1.5946, + "step": 4471 + }, + { + "epoch": 0.16015184342936953, + "grad_norm": 1.6788946390151978, + "learning_rate": 0.00019124729463515427, + "loss": 1.719, + "step": 4472 + }, + { + "epoch": 0.16018765555893782, + "grad_norm": 1.5136687755584717, + "learning_rate": 0.00019124254843547195, + "loss": 1.5618, + "step": 4473 + }, + { + "epoch": 0.1602234676885061, + "grad_norm": 1.810028314590454, + "learning_rate": 0.0001912378010082388, + "loss": 1.5755, + "step": 4474 + }, + { + "epoch": 0.1602592798180744, + "grad_norm": 4.77049446105957, + "learning_rate": 0.00019123305235351873, + "loss": 1.4884, + "step": 4475 + }, + { + "epoch": 0.16029509194764266, + "grad_norm": 1.4742318391799927, + "learning_rate": 0.00019122830247137563, + "loss": 1.3881, + "step": 4476 + }, + { + "epoch": 0.16033090407721096, + "grad_norm": 2.340515375137329, + "learning_rate": 0.00019122355136187342, + "loss": 1.8475, + "step": 4477 + }, + { + "epoch": 0.16036671620677923, + "grad_norm": 1.6513471603393555, + "learning_rate": 0.00019121879902507595, + "loss": 1.6164, + "step": 4478 + }, + { + "epoch": 0.16040252833634752, + "grad_norm": 1.3290749788284302, + "learning_rate": 0.00019121404546104724, + "loss": 1.6316, + "step": 4479 + }, + { + "epoch": 0.16043834046591582, + "grad_norm": 1.52192223072052, + "learning_rate": 0.00019120929066985122, + "loss": 1.2523, + "step": 4480 + }, + { + "epoch": 0.1604741525954841, + "grad_norm": 2.044299602508545, + "learning_rate": 0.0001912045346515518, + "loss": 1.4638, + "step": 4481 + }, + { + "epoch": 0.16050996472505238, + "grad_norm": 1.4951385259628296, + "learning_rate": 0.00019119977740621305, + "loss": 1.476, + "step": 4482 + }, + { + "epoch": 0.16054577685462065, + "grad_norm": 1.5891962051391602, + "learning_rate": 0.0001911950189338989, + "loss": 1.1198, + "step": 4483 + }, + { + "epoch": 0.16058158898418895, + "grad_norm": 1.8488585948944092, + "learning_rate": 0.00019119025923467343, + "loss": 1.5125, + "step": 4484 + }, + { + "epoch": 0.16061740111375722, + "grad_norm": 2.9444382190704346, + "learning_rate": 0.00019118549830860065, + "loss": 1.4347, + "step": 4485 + }, + { + "epoch": 0.16065321324332552, + "grad_norm": 2.6969316005706787, + "learning_rate": 0.0001911807361557446, + "loss": 1.4744, + "step": 4486 + }, + { + "epoch": 0.1606890253728938, + "grad_norm": 1.69938063621521, + "learning_rate": 0.00019117597277616932, + "loss": 1.5696, + "step": 4487 + }, + { + "epoch": 0.16072483750246208, + "grad_norm": 1.9011529684066772, + "learning_rate": 0.00019117120816993899, + "loss": 1.6207, + "step": 4488 + }, + { + "epoch": 0.16076064963203038, + "grad_norm": 1.8051317930221558, + "learning_rate": 0.00019116644233711764, + "loss": 1.5994, + "step": 4489 + }, + { + "epoch": 0.16079646176159865, + "grad_norm": 1.7188615798950195, + "learning_rate": 0.0001911616752777694, + "loss": 1.7194, + "step": 4490 + }, + { + "epoch": 0.16083227389116694, + "grad_norm": 1.4117614030838013, + "learning_rate": 0.0001911569069919584, + "loss": 1.666, + "step": 4491 + }, + { + "epoch": 0.1608680860207352, + "grad_norm": 1.8532236814498901, + "learning_rate": 0.00019115213747974882, + "loss": 1.2589, + "step": 4492 + }, + { + "epoch": 0.1609038981503035, + "grad_norm": 1.4849985837936401, + "learning_rate": 0.0001911473667412048, + "loss": 1.5491, + "step": 4493 + }, + { + "epoch": 0.1609397102798718, + "grad_norm": 1.5304160118103027, + "learning_rate": 0.00019114259477639057, + "loss": 1.656, + "step": 4494 + }, + { + "epoch": 0.16097552240944007, + "grad_norm": 1.8665733337402344, + "learning_rate": 0.00019113782158537024, + "loss": 1.447, + "step": 4495 + }, + { + "epoch": 0.16101133453900837, + "grad_norm": 1.4369174242019653, + "learning_rate": 0.0001911330471682081, + "loss": 1.3837, + "step": 4496 + }, + { + "epoch": 0.16104714666857664, + "grad_norm": 1.2710061073303223, + "learning_rate": 0.00019112827152496835, + "loss": 1.7857, + "step": 4497 + }, + { + "epoch": 0.16108295879814494, + "grad_norm": 1.0543251037597656, + "learning_rate": 0.00019112349465571525, + "loss": 1.6002, + "step": 4498 + }, + { + "epoch": 0.1611187709277132, + "grad_norm": 1.3556406497955322, + "learning_rate": 0.0001911187165605131, + "loss": 1.5894, + "step": 4499 + }, + { + "epoch": 0.1611545830572815, + "grad_norm": 1.9065412282943726, + "learning_rate": 0.00019111393723942615, + "loss": 1.5288, + "step": 4500 + }, + { + "epoch": 0.1611903951868498, + "grad_norm": 1.592268705368042, + "learning_rate": 0.00019110915669251868, + "loss": 1.5041, + "step": 4501 + }, + { + "epoch": 0.16122620731641807, + "grad_norm": 1.5915298461914062, + "learning_rate": 0.00019110437491985505, + "loss": 1.9453, + "step": 4502 + }, + { + "epoch": 0.16126201944598637, + "grad_norm": 2.055844306945801, + "learning_rate": 0.00019109959192149955, + "loss": 1.405, + "step": 4503 + }, + { + "epoch": 0.16129783157555463, + "grad_norm": 1.386273980140686, + "learning_rate": 0.0001910948076975166, + "loss": 1.3482, + "step": 4504 + }, + { + "epoch": 0.16133364370512293, + "grad_norm": 1.3630104064941406, + "learning_rate": 0.00019109002224797046, + "loss": 1.8159, + "step": 4505 + }, + { + "epoch": 0.1613694558346912, + "grad_norm": 1.701704978942871, + "learning_rate": 0.00019108523557292558, + "loss": 1.5629, + "step": 4506 + }, + { + "epoch": 0.1614052679642595, + "grad_norm": 1.483626127243042, + "learning_rate": 0.00019108044767244636, + "loss": 1.6765, + "step": 4507 + }, + { + "epoch": 0.16144108009382777, + "grad_norm": 1.3464930057525635, + "learning_rate": 0.0001910756585465972, + "loss": 1.7481, + "step": 4508 + }, + { + "epoch": 0.16147689222339606, + "grad_norm": 1.5823675394058228, + "learning_rate": 0.0001910708681954425, + "loss": 1.5115, + "step": 4509 + }, + { + "epoch": 0.16151270435296436, + "grad_norm": 1.8616117238998413, + "learning_rate": 0.00019106607661904682, + "loss": 1.6684, + "step": 4510 + }, + { + "epoch": 0.16154851648253263, + "grad_norm": 1.4790433645248413, + "learning_rate": 0.00019106128381747448, + "loss": 1.528, + "step": 4511 + }, + { + "epoch": 0.16158432861210092, + "grad_norm": 1.3721879720687866, + "learning_rate": 0.00019105648979079006, + "loss": 1.6033, + "step": 4512 + }, + { + "epoch": 0.1616201407416692, + "grad_norm": 1.1864506006240845, + "learning_rate": 0.000191051694539058, + "loss": 1.5174, + "step": 4513 + }, + { + "epoch": 0.1616559528712375, + "grad_norm": 1.2634984254837036, + "learning_rate": 0.0001910468980623428, + "loss": 1.5987, + "step": 4514 + }, + { + "epoch": 0.16169176500080576, + "grad_norm": 2.0381622314453125, + "learning_rate": 0.0001910421003607091, + "loss": 1.6299, + "step": 4515 + }, + { + "epoch": 0.16172757713037406, + "grad_norm": 1.4539151191711426, + "learning_rate": 0.00019103730143422135, + "loss": 1.8375, + "step": 4516 + }, + { + "epoch": 0.16176338925994235, + "grad_norm": 1.7480626106262207, + "learning_rate": 0.00019103250128294413, + "loss": 1.608, + "step": 4517 + }, + { + "epoch": 0.16179920138951062, + "grad_norm": 1.822675347328186, + "learning_rate": 0.00019102769990694208, + "loss": 1.5056, + "step": 4518 + }, + { + "epoch": 0.16183501351907892, + "grad_norm": 1.5207996368408203, + "learning_rate": 0.00019102289730627968, + "loss": 1.5263, + "step": 4519 + }, + { + "epoch": 0.1618708256486472, + "grad_norm": 2.3780176639556885, + "learning_rate": 0.0001910180934810216, + "loss": 1.581, + "step": 4520 + }, + { + "epoch": 0.16190663777821548, + "grad_norm": 2.004570484161377, + "learning_rate": 0.0001910132884312325, + "loss": 1.853, + "step": 4521 + }, + { + "epoch": 0.16194244990778375, + "grad_norm": 1.402849793434143, + "learning_rate": 0.00019100848215697705, + "loss": 1.5757, + "step": 4522 + }, + { + "epoch": 0.16197826203735205, + "grad_norm": 1.324273943901062, + "learning_rate": 0.00019100367465831983, + "loss": 1.5214, + "step": 4523 + }, + { + "epoch": 0.16201407416692035, + "grad_norm": 1.5259591341018677, + "learning_rate": 0.00019099886593532554, + "loss": 1.4345, + "step": 4524 + }, + { + "epoch": 0.16204988629648862, + "grad_norm": 1.5737764835357666, + "learning_rate": 0.00019099405598805888, + "loss": 1.5467, + "step": 4525 + }, + { + "epoch": 0.1620856984260569, + "grad_norm": 1.5883179903030396, + "learning_rate": 0.0001909892448165846, + "loss": 1.553, + "step": 4526 + }, + { + "epoch": 0.16212151055562518, + "grad_norm": 1.3239035606384277, + "learning_rate": 0.0001909844324209674, + "loss": 1.5387, + "step": 4527 + }, + { + "epoch": 0.16215732268519348, + "grad_norm": 1.957148790359497, + "learning_rate": 0.00019097961880127203, + "loss": 1.5983, + "step": 4528 + }, + { + "epoch": 0.16219313481476175, + "grad_norm": 1.2890499830245972, + "learning_rate": 0.0001909748039575632, + "loss": 1.6041, + "step": 4529 + }, + { + "epoch": 0.16222894694433004, + "grad_norm": 1.505007266998291, + "learning_rate": 0.00019096998788990574, + "loss": 1.3721, + "step": 4530 + }, + { + "epoch": 0.16226475907389834, + "grad_norm": 1.8835853338241577, + "learning_rate": 0.00019096517059836448, + "loss": 1.8609, + "step": 4531 + }, + { + "epoch": 0.1623005712034666, + "grad_norm": 1.509743571281433, + "learning_rate": 0.00019096035208300416, + "loss": 1.5606, + "step": 4532 + }, + { + "epoch": 0.1623363833330349, + "grad_norm": 1.2899866104125977, + "learning_rate": 0.00019095553234388962, + "loss": 1.7222, + "step": 4533 + }, + { + "epoch": 0.16237219546260317, + "grad_norm": 1.679097294807434, + "learning_rate": 0.00019095071138108575, + "loss": 1.5859, + "step": 4534 + }, + { + "epoch": 0.16240800759217147, + "grad_norm": 1.3049614429473877, + "learning_rate": 0.00019094588919465734, + "loss": 1.4242, + "step": 4535 + }, + { + "epoch": 0.16244381972173974, + "grad_norm": 2.140238046646118, + "learning_rate": 0.0001909410657846693, + "loss": 1.7717, + "step": 4536 + }, + { + "epoch": 0.16247963185130804, + "grad_norm": 1.8646196126937866, + "learning_rate": 0.00019093624115118656, + "loss": 1.5395, + "step": 4537 + }, + { + "epoch": 0.16251544398087633, + "grad_norm": 1.333450198173523, + "learning_rate": 0.00019093141529427396, + "loss": 1.5125, + "step": 4538 + }, + { + "epoch": 0.1625512561104446, + "grad_norm": 1.132122278213501, + "learning_rate": 0.00019092658821399648, + "loss": 1.4367, + "step": 4539 + }, + { + "epoch": 0.1625870682400129, + "grad_norm": 1.6168757677078247, + "learning_rate": 0.00019092175991041905, + "loss": 1.6936, + "step": 4540 + }, + { + "epoch": 0.16262288036958117, + "grad_norm": 1.564820647239685, + "learning_rate": 0.0001909169303836066, + "loss": 1.645, + "step": 4541 + }, + { + "epoch": 0.16265869249914947, + "grad_norm": 1.6114917993545532, + "learning_rate": 0.00019091209963362416, + "loss": 1.4573, + "step": 4542 + }, + { + "epoch": 0.16269450462871773, + "grad_norm": 2.2553696632385254, + "learning_rate": 0.00019090726766053667, + "loss": 1.7323, + "step": 4543 + }, + { + "epoch": 0.16273031675828603, + "grad_norm": 1.2172681093215942, + "learning_rate": 0.00019090243446440915, + "loss": 1.5023, + "step": 4544 + }, + { + "epoch": 0.16276612888785433, + "grad_norm": 1.509480357170105, + "learning_rate": 0.0001908976000453066, + "loss": 1.6955, + "step": 4545 + }, + { + "epoch": 0.1628019410174226, + "grad_norm": 1.7382971048355103, + "learning_rate": 0.00019089276440329415, + "loss": 1.4946, + "step": 4546 + }, + { + "epoch": 0.1628377531469909, + "grad_norm": 2.00982928276062, + "learning_rate": 0.00019088792753843675, + "loss": 1.7347, + "step": 4547 + }, + { + "epoch": 0.16287356527655916, + "grad_norm": 1.5432096719741821, + "learning_rate": 0.00019088308945079956, + "loss": 1.8206, + "step": 4548 + }, + { + "epoch": 0.16290937740612746, + "grad_norm": 1.4766311645507812, + "learning_rate": 0.00019087825014044762, + "loss": 1.6214, + "step": 4549 + }, + { + "epoch": 0.16294518953569573, + "grad_norm": 2.6827495098114014, + "learning_rate": 0.00019087340960744604, + "loss": 1.6215, + "step": 4550 + }, + { + "epoch": 0.16298100166526402, + "grad_norm": 1.2720814943313599, + "learning_rate": 0.00019086856785185992, + "loss": 1.5734, + "step": 4551 + }, + { + "epoch": 0.16301681379483232, + "grad_norm": 1.7849035263061523, + "learning_rate": 0.0001908637248737545, + "loss": 1.5433, + "step": 4552 + }, + { + "epoch": 0.1630526259244006, + "grad_norm": 1.7423144578933716, + "learning_rate": 0.00019085888067319485, + "loss": 1.4252, + "step": 4553 + }, + { + "epoch": 0.1630884380539689, + "grad_norm": 2.395569324493408, + "learning_rate": 0.00019085403525024612, + "loss": 1.5359, + "step": 4554 + }, + { + "epoch": 0.16312425018353716, + "grad_norm": 1.3138221502304077, + "learning_rate": 0.00019084918860497356, + "loss": 1.4022, + "step": 4555 + }, + { + "epoch": 0.16316006231310545, + "grad_norm": 1.5145282745361328, + "learning_rate": 0.00019084434073744238, + "loss": 1.108, + "step": 4556 + }, + { + "epoch": 0.16319587444267372, + "grad_norm": 1.376220703125, + "learning_rate": 0.00019083949164771773, + "loss": 1.4966, + "step": 4557 + }, + { + "epoch": 0.16323168657224202, + "grad_norm": 1.3827080726623535, + "learning_rate": 0.00019083464133586492, + "loss": 1.6534, + "step": 4558 + }, + { + "epoch": 0.16326749870181032, + "grad_norm": 1.524684190750122, + "learning_rate": 0.00019082978980194918, + "loss": 1.7182, + "step": 4559 + }, + { + "epoch": 0.16330331083137858, + "grad_norm": 1.8181837797164917, + "learning_rate": 0.00019082493704603576, + "loss": 1.631, + "step": 4560 + }, + { + "epoch": 0.16333912296094688, + "grad_norm": 1.2133936882019043, + "learning_rate": 0.00019082008306819001, + "loss": 1.7072, + "step": 4561 + }, + { + "epoch": 0.16337493509051515, + "grad_norm": 1.3571362495422363, + "learning_rate": 0.00019081522786847717, + "loss": 1.6741, + "step": 4562 + }, + { + "epoch": 0.16341074722008345, + "grad_norm": 1.4400066137313843, + "learning_rate": 0.0001908103714469626, + "loss": 1.394, + "step": 4563 + }, + { + "epoch": 0.16344655934965172, + "grad_norm": 2.2084951400756836, + "learning_rate": 0.00019080551380371157, + "loss": 1.7241, + "step": 4564 + }, + { + "epoch": 0.16348237147922, + "grad_norm": 1.2983593940734863, + "learning_rate": 0.0001908006549387895, + "loss": 1.4786, + "step": 4565 + }, + { + "epoch": 0.1635181836087883, + "grad_norm": 1.8585529327392578, + "learning_rate": 0.00019079579485226176, + "loss": 1.5445, + "step": 4566 + }, + { + "epoch": 0.16355399573835658, + "grad_norm": 1.7613205909729004, + "learning_rate": 0.0001907909335441937, + "loss": 1.54, + "step": 4567 + }, + { + "epoch": 0.16358980786792487, + "grad_norm": 1.85093355178833, + "learning_rate": 0.00019078607101465078, + "loss": 1.5251, + "step": 4568 + }, + { + "epoch": 0.16362561999749314, + "grad_norm": 2.1224894523620605, + "learning_rate": 0.00019078120726369834, + "loss": 1.6325, + "step": 4569 + }, + { + "epoch": 0.16366143212706144, + "grad_norm": 1.862607717514038, + "learning_rate": 0.00019077634229140188, + "loss": 1.4159, + "step": 4570 + }, + { + "epoch": 0.1636972442566297, + "grad_norm": 1.5398863554000854, + "learning_rate": 0.0001907714760978268, + "loss": 1.6412, + "step": 4571 + }, + { + "epoch": 0.163733056386198, + "grad_norm": 1.6998611688613892, + "learning_rate": 0.0001907666086830386, + "loss": 1.6833, + "step": 4572 + }, + { + "epoch": 0.1637688685157663, + "grad_norm": 1.262229561805725, + "learning_rate": 0.0001907617400471028, + "loss": 1.3874, + "step": 4573 + }, + { + "epoch": 0.16380468064533457, + "grad_norm": 1.8390412330627441, + "learning_rate": 0.00019075687019008483, + "loss": 1.7563, + "step": 4574 + }, + { + "epoch": 0.16384049277490287, + "grad_norm": 1.7868350744247437, + "learning_rate": 0.00019075199911205024, + "loss": 1.5629, + "step": 4575 + }, + { + "epoch": 0.16387630490447114, + "grad_norm": 1.949973464012146, + "learning_rate": 0.00019074712681306456, + "loss": 1.9253, + "step": 4576 + }, + { + "epoch": 0.16391211703403943, + "grad_norm": 1.4536129236221313, + "learning_rate": 0.00019074225329319337, + "loss": 1.5995, + "step": 4577 + }, + { + "epoch": 0.1639479291636077, + "grad_norm": 1.5427803993225098, + "learning_rate": 0.00019073737855250218, + "loss": 1.5241, + "step": 4578 + }, + { + "epoch": 0.163983741293176, + "grad_norm": 1.3218435049057007, + "learning_rate": 0.00019073250259105663, + "loss": 1.6316, + "step": 4579 + }, + { + "epoch": 0.1640195534227443, + "grad_norm": 1.3886061906814575, + "learning_rate": 0.00019072762540892226, + "loss": 1.5922, + "step": 4580 + }, + { + "epoch": 0.16405536555231257, + "grad_norm": 1.7809525728225708, + "learning_rate": 0.00019072274700616474, + "loss": 1.951, + "step": 4581 + }, + { + "epoch": 0.16409117768188086, + "grad_norm": 1.7092533111572266, + "learning_rate": 0.00019071786738284968, + "loss": 1.4504, + "step": 4582 + }, + { + "epoch": 0.16412698981144913, + "grad_norm": 1.6752238273620605, + "learning_rate": 0.00019071298653904276, + "loss": 1.6263, + "step": 4583 + }, + { + "epoch": 0.16416280194101743, + "grad_norm": 1.514328122138977, + "learning_rate": 0.00019070810447480957, + "loss": 1.6375, + "step": 4584 + }, + { + "epoch": 0.1641986140705857, + "grad_norm": 1.2303587198257446, + "learning_rate": 0.00019070322119021588, + "loss": 1.4602, + "step": 4585 + }, + { + "epoch": 0.164234426200154, + "grad_norm": 2.415630578994751, + "learning_rate": 0.00019069833668532732, + "loss": 1.6044, + "step": 4586 + }, + { + "epoch": 0.1642702383297223, + "grad_norm": 1.291359543800354, + "learning_rate": 0.00019069345096020966, + "loss": 1.5712, + "step": 4587 + }, + { + "epoch": 0.16430605045929056, + "grad_norm": 1.2763800621032715, + "learning_rate": 0.00019068856401492857, + "loss": 1.6667, + "step": 4588 + }, + { + "epoch": 0.16434186258885886, + "grad_norm": 1.4778591394424438, + "learning_rate": 0.00019068367584954986, + "loss": 1.3605, + "step": 4589 + }, + { + "epoch": 0.16437767471842712, + "grad_norm": 1.095234990119934, + "learning_rate": 0.00019067878646413923, + "loss": 1.5766, + "step": 4590 + }, + { + "epoch": 0.16441348684799542, + "grad_norm": 1.2776097059249878, + "learning_rate": 0.0001906738958587625, + "loss": 1.5516, + "step": 4591 + }, + { + "epoch": 0.1644492989775637, + "grad_norm": 1.524300456047058, + "learning_rate": 0.00019066900403348551, + "loss": 1.4556, + "step": 4592 + }, + { + "epoch": 0.164485111107132, + "grad_norm": 1.618364691734314, + "learning_rate": 0.000190664110988374, + "loss": 1.6515, + "step": 4593 + }, + { + "epoch": 0.16452092323670028, + "grad_norm": 2.83412766456604, + "learning_rate": 0.00019065921672349384, + "loss": 1.6118, + "step": 4594 + }, + { + "epoch": 0.16455673536626855, + "grad_norm": 1.6449576616287231, + "learning_rate": 0.00019065432123891083, + "loss": 1.5997, + "step": 4595 + }, + { + "epoch": 0.16459254749583685, + "grad_norm": 1.6248342990875244, + "learning_rate": 0.00019064942453469086, + "loss": 1.6638, + "step": 4596 + }, + { + "epoch": 0.16462835962540512, + "grad_norm": 2.3031158447265625, + "learning_rate": 0.0001906445266108998, + "loss": 1.7916, + "step": 4597 + }, + { + "epoch": 0.16466417175497342, + "grad_norm": 1.7779685258865356, + "learning_rate": 0.0001906396274676036, + "loss": 1.4638, + "step": 4598 + }, + { + "epoch": 0.16469998388454168, + "grad_norm": 1.4672049283981323, + "learning_rate": 0.00019063472710486814, + "loss": 1.6929, + "step": 4599 + }, + { + "epoch": 0.16473579601410998, + "grad_norm": 1.7978802919387817, + "learning_rate": 0.0001906298255227593, + "loss": 1.6518, + "step": 4600 + }, + { + "epoch": 0.16477160814367828, + "grad_norm": 1.8508810997009277, + "learning_rate": 0.00019062492272134307, + "loss": 1.4408, + "step": 4601 + }, + { + "epoch": 0.16480742027324655, + "grad_norm": 1.8320178985595703, + "learning_rate": 0.0001906200187006854, + "loss": 1.5152, + "step": 4602 + }, + { + "epoch": 0.16484323240281484, + "grad_norm": 1.634345531463623, + "learning_rate": 0.0001906151134608523, + "loss": 1.6666, + "step": 4603 + }, + { + "epoch": 0.1648790445323831, + "grad_norm": 1.9694124460220337, + "learning_rate": 0.0001906102070019097, + "loss": 1.6432, + "step": 4604 + }, + { + "epoch": 0.1649148566619514, + "grad_norm": 2.6619157791137695, + "learning_rate": 0.00019060529932392366, + "loss": 1.4241, + "step": 4605 + }, + { + "epoch": 0.16495066879151968, + "grad_norm": 2.111093282699585, + "learning_rate": 0.00019060039042696016, + "loss": 1.5643, + "step": 4606 + }, + { + "epoch": 0.16498648092108797, + "grad_norm": 1.5327450037002563, + "learning_rate": 0.00019059548031108528, + "loss": 1.507, + "step": 4607 + }, + { + "epoch": 0.16502229305065624, + "grad_norm": 2.1236813068389893, + "learning_rate": 0.0001905905689763651, + "loss": 1.3831, + "step": 4608 + }, + { + "epoch": 0.16505810518022454, + "grad_norm": 1.4303845167160034, + "learning_rate": 0.00019058565642286567, + "loss": 1.1834, + "step": 4609 + }, + { + "epoch": 0.16509391730979284, + "grad_norm": 1.145059585571289, + "learning_rate": 0.00019058074265065303, + "loss": 1.4526, + "step": 4610 + }, + { + "epoch": 0.1651297294393611, + "grad_norm": 2.205949068069458, + "learning_rate": 0.00019057582765979341, + "loss": 1.4746, + "step": 4611 + }, + { + "epoch": 0.1651655415689294, + "grad_norm": 1.787413477897644, + "learning_rate": 0.00019057091145035281, + "loss": 1.5251, + "step": 4612 + }, + { + "epoch": 0.16520135369849767, + "grad_norm": 1.5059257745742798, + "learning_rate": 0.00019056599402239742, + "loss": 1.5406, + "step": 4613 + }, + { + "epoch": 0.16523716582806597, + "grad_norm": 1.2862091064453125, + "learning_rate": 0.00019056107537599343, + "loss": 1.5867, + "step": 4614 + }, + { + "epoch": 0.16527297795763424, + "grad_norm": 1.640185832977295, + "learning_rate": 0.000190556155511207, + "loss": 1.9078, + "step": 4615 + }, + { + "epoch": 0.16530879008720253, + "grad_norm": 2.0387260913848877, + "learning_rate": 0.00019055123442810427, + "loss": 1.6649, + "step": 4616 + }, + { + "epoch": 0.16534460221677083, + "grad_norm": 1.985213041305542, + "learning_rate": 0.00019054631212675152, + "loss": 1.2348, + "step": 4617 + }, + { + "epoch": 0.1653804143463391, + "grad_norm": 1.5583789348602295, + "learning_rate": 0.00019054138860721492, + "loss": 1.7628, + "step": 4618 + }, + { + "epoch": 0.1654162264759074, + "grad_norm": 2.322619915008545, + "learning_rate": 0.00019053646386956073, + "loss": 1.385, + "step": 4619 + }, + { + "epoch": 0.16545203860547567, + "grad_norm": 1.4217337369918823, + "learning_rate": 0.00019053153791385516, + "loss": 1.5673, + "step": 4620 + }, + { + "epoch": 0.16548785073504396, + "grad_norm": 2.825695276260376, + "learning_rate": 0.00019052661074016458, + "loss": 1.5057, + "step": 4621 + }, + { + "epoch": 0.16552366286461223, + "grad_norm": 1.6821850538253784, + "learning_rate": 0.0001905216823485552, + "loss": 1.4592, + "step": 4622 + }, + { + "epoch": 0.16555947499418053, + "grad_norm": 1.3392614126205444, + "learning_rate": 0.00019051675273909336, + "loss": 1.7243, + "step": 4623 + }, + { + "epoch": 0.16559528712374882, + "grad_norm": 1.9273262023925781, + "learning_rate": 0.00019051182191184537, + "loss": 1.3487, + "step": 4624 + }, + { + "epoch": 0.1656310992533171, + "grad_norm": 1.8148261308670044, + "learning_rate": 0.00019050688986687754, + "loss": 1.5975, + "step": 4625 + }, + { + "epoch": 0.1656669113828854, + "grad_norm": 1.3820042610168457, + "learning_rate": 0.00019050195660425627, + "loss": 1.465, + "step": 4626 + }, + { + "epoch": 0.16570272351245366, + "grad_norm": 1.627036690711975, + "learning_rate": 0.00019049702212404793, + "loss": 1.3036, + "step": 4627 + }, + { + "epoch": 0.16573853564202196, + "grad_norm": 1.241610050201416, + "learning_rate": 0.00019049208642631885, + "loss": 1.6127, + "step": 4628 + }, + { + "epoch": 0.16577434777159022, + "grad_norm": 1.7306376695632935, + "learning_rate": 0.00019048714951113552, + "loss": 1.775, + "step": 4629 + }, + { + "epoch": 0.16581015990115852, + "grad_norm": 1.5658414363861084, + "learning_rate": 0.00019048221137856427, + "loss": 1.8022, + "step": 4630 + }, + { + "epoch": 0.16584597203072682, + "grad_norm": 1.4824405908584595, + "learning_rate": 0.0001904772720286716, + "loss": 1.5788, + "step": 4631 + }, + { + "epoch": 0.1658817841602951, + "grad_norm": 1.6523913145065308, + "learning_rate": 0.00019047233146152393, + "loss": 1.4036, + "step": 4632 + }, + { + "epoch": 0.16591759628986338, + "grad_norm": 1.4244657754898071, + "learning_rate": 0.00019046738967718778, + "loss": 1.6885, + "step": 4633 + }, + { + "epoch": 0.16595340841943165, + "grad_norm": 1.679327368736267, + "learning_rate": 0.00019046244667572957, + "loss": 1.6083, + "step": 4634 + }, + { + "epoch": 0.16598922054899995, + "grad_norm": 1.6196266412734985, + "learning_rate": 0.00019045750245721583, + "loss": 1.3341, + "step": 4635 + }, + { + "epoch": 0.16602503267856822, + "grad_norm": 2.6375415325164795, + "learning_rate": 0.00019045255702171307, + "loss": 1.7689, + "step": 4636 + }, + { + "epoch": 0.16606084480813652, + "grad_norm": 1.9008333683013916, + "learning_rate": 0.00019044761036928783, + "loss": 1.6576, + "step": 4637 + }, + { + "epoch": 0.1660966569377048, + "grad_norm": 2.3040616512298584, + "learning_rate": 0.00019044266250000668, + "loss": 1.7239, + "step": 4638 + }, + { + "epoch": 0.16613246906727308, + "grad_norm": 2.4334566593170166, + "learning_rate": 0.00019043771341393614, + "loss": 1.7195, + "step": 4639 + }, + { + "epoch": 0.16616828119684138, + "grad_norm": 1.575951337814331, + "learning_rate": 0.00019043276311114283, + "loss": 1.6378, + "step": 4640 + }, + { + "epoch": 0.16620409332640965, + "grad_norm": 2.337109327316284, + "learning_rate": 0.00019042781159169336, + "loss": 1.6753, + "step": 4641 + }, + { + "epoch": 0.16623990545597794, + "grad_norm": 1.561690092086792, + "learning_rate": 0.00019042285885565433, + "loss": 1.6058, + "step": 4642 + }, + { + "epoch": 0.1662757175855462, + "grad_norm": 1.7889922857284546, + "learning_rate": 0.00019041790490309233, + "loss": 1.1429, + "step": 4643 + }, + { + "epoch": 0.1663115297151145, + "grad_norm": 1.3240993022918701, + "learning_rate": 0.00019041294973407412, + "loss": 1.641, + "step": 4644 + }, + { + "epoch": 0.1663473418446828, + "grad_norm": 2.112696409225464, + "learning_rate": 0.00019040799334866626, + "loss": 1.8524, + "step": 4645 + }, + { + "epoch": 0.16638315397425107, + "grad_norm": 1.8621598482131958, + "learning_rate": 0.00019040303574693545, + "loss": 1.724, + "step": 4646 + }, + { + "epoch": 0.16641896610381937, + "grad_norm": 1.5603737831115723, + "learning_rate": 0.00019039807692894842, + "loss": 1.4181, + "step": 4647 + }, + { + "epoch": 0.16645477823338764, + "grad_norm": 1.5125117301940918, + "learning_rate": 0.00019039311689477185, + "loss": 1.628, + "step": 4648 + }, + { + "epoch": 0.16649059036295594, + "grad_norm": 1.4379773139953613, + "learning_rate": 0.0001903881556444725, + "loss": 1.5146, + "step": 4649 + }, + { + "epoch": 0.1665264024925242, + "grad_norm": 2.8193790912628174, + "learning_rate": 0.00019038319317811714, + "loss": 1.4629, + "step": 4650 + }, + { + "epoch": 0.1665622146220925, + "grad_norm": 1.4412678480148315, + "learning_rate": 0.00019037822949577248, + "loss": 1.2369, + "step": 4651 + }, + { + "epoch": 0.1665980267516608, + "grad_norm": 1.3727669715881348, + "learning_rate": 0.00019037326459750534, + "loss": 1.6593, + "step": 4652 + }, + { + "epoch": 0.16663383888122907, + "grad_norm": 1.2576712369918823, + "learning_rate": 0.00019036829848338246, + "loss": 1.4383, + "step": 4653 + }, + { + "epoch": 0.16666965101079736, + "grad_norm": 1.55268132686615, + "learning_rate": 0.00019036333115347073, + "loss": 1.6322, + "step": 4654 + }, + { + "epoch": 0.16670546314036563, + "grad_norm": 1.6275209188461304, + "learning_rate": 0.00019035836260783691, + "loss": 1.4125, + "step": 4655 + }, + { + "epoch": 0.16674127526993393, + "grad_norm": 1.3718266487121582, + "learning_rate": 0.00019035339284654787, + "loss": 1.3941, + "step": 4656 + }, + { + "epoch": 0.1667770873995022, + "grad_norm": 1.859714388847351, + "learning_rate": 0.0001903484218696705, + "loss": 1.5406, + "step": 4657 + }, + { + "epoch": 0.1668128995290705, + "grad_norm": 1.5116417407989502, + "learning_rate": 0.00019034344967727165, + "loss": 1.6361, + "step": 4658 + }, + { + "epoch": 0.1668487116586388, + "grad_norm": 1.9010447263717651, + "learning_rate": 0.0001903384762694182, + "loss": 1.7493, + "step": 4659 + }, + { + "epoch": 0.16688452378820706, + "grad_norm": 1.5057936906814575, + "learning_rate": 0.0001903335016461771, + "loss": 1.3977, + "step": 4660 + }, + { + "epoch": 0.16692033591777536, + "grad_norm": 1.8938969373703003, + "learning_rate": 0.00019032852580761527, + "loss": 1.6976, + "step": 4661 + }, + { + "epoch": 0.16695614804734363, + "grad_norm": 1.1892553567886353, + "learning_rate": 0.00019032354875379962, + "loss": 1.2591, + "step": 4662 + }, + { + "epoch": 0.16699196017691192, + "grad_norm": 1.9757999181747437, + "learning_rate": 0.00019031857048479713, + "loss": 1.305, + "step": 4663 + }, + { + "epoch": 0.1670277723064802, + "grad_norm": 1.8454691171646118, + "learning_rate": 0.00019031359100067478, + "loss": 1.7795, + "step": 4664 + }, + { + "epoch": 0.1670635844360485, + "grad_norm": 2.3742315769195557, + "learning_rate": 0.00019030861030149956, + "loss": 1.4219, + "step": 4665 + }, + { + "epoch": 0.1670993965656168, + "grad_norm": 1.6165446043014526, + "learning_rate": 0.00019030362838733846, + "loss": 1.3658, + "step": 4666 + }, + { + "epoch": 0.16713520869518506, + "grad_norm": 1.2845289707183838, + "learning_rate": 0.00019029864525825857, + "loss": 1.5783, + "step": 4667 + }, + { + "epoch": 0.16717102082475335, + "grad_norm": 1.3073277473449707, + "learning_rate": 0.00019029366091432684, + "loss": 1.6852, + "step": 4668 + }, + { + "epoch": 0.16720683295432162, + "grad_norm": 1.8444633483886719, + "learning_rate": 0.0001902886753556104, + "loss": 1.3777, + "step": 4669 + }, + { + "epoch": 0.16724264508388992, + "grad_norm": 1.9535983800888062, + "learning_rate": 0.0001902836885821763, + "loss": 1.5784, + "step": 4670 + }, + { + "epoch": 0.1672784572134582, + "grad_norm": 1.5261894464492798, + "learning_rate": 0.00019027870059409158, + "loss": 1.3801, + "step": 4671 + }, + { + "epoch": 0.16731426934302648, + "grad_norm": 1.40703284740448, + "learning_rate": 0.00019027371139142342, + "loss": 1.4457, + "step": 4672 + }, + { + "epoch": 0.16735008147259478, + "grad_norm": 1.388258695602417, + "learning_rate": 0.00019026872097423894, + "loss": 1.3877, + "step": 4673 + }, + { + "epoch": 0.16738589360216305, + "grad_norm": 1.8786894083023071, + "learning_rate": 0.00019026372934260525, + "loss": 1.318, + "step": 4674 + }, + { + "epoch": 0.16742170573173135, + "grad_norm": 1.9680266380310059, + "learning_rate": 0.0001902587364965895, + "loss": 1.36, + "step": 4675 + }, + { + "epoch": 0.16745751786129962, + "grad_norm": 1.4114983081817627, + "learning_rate": 0.00019025374243625888, + "loss": 1.7629, + "step": 4676 + }, + { + "epoch": 0.1674933299908679, + "grad_norm": 1.4164817333221436, + "learning_rate": 0.0001902487471616806, + "loss": 1.935, + "step": 4677 + }, + { + "epoch": 0.16752914212043618, + "grad_norm": 1.973514437675476, + "learning_rate": 0.00019024375067292181, + "loss": 1.3236, + "step": 4678 + }, + { + "epoch": 0.16756495425000448, + "grad_norm": 1.940759301185608, + "learning_rate": 0.00019023875297004977, + "loss": 1.7045, + "step": 4679 + }, + { + "epoch": 0.16760076637957277, + "grad_norm": 1.5399664640426636, + "learning_rate": 0.0001902337540531317, + "loss": 1.8139, + "step": 4680 + }, + { + "epoch": 0.16763657850914104, + "grad_norm": 1.957043170928955, + "learning_rate": 0.00019022875392223486, + "loss": 1.6672, + "step": 4681 + }, + { + "epoch": 0.16767239063870934, + "grad_norm": 1.677278757095337, + "learning_rate": 0.00019022375257742656, + "loss": 1.8589, + "step": 4682 + }, + { + "epoch": 0.1677082027682776, + "grad_norm": 1.3992509841918945, + "learning_rate": 0.000190218750018774, + "loss": 1.3775, + "step": 4683 + }, + { + "epoch": 0.1677440148978459, + "grad_norm": 3.1266071796417236, + "learning_rate": 0.00019021374624634456, + "loss": 1.5442, + "step": 4684 + }, + { + "epoch": 0.16777982702741417, + "grad_norm": 1.9289811849594116, + "learning_rate": 0.0001902087412602055, + "loss": 1.4118, + "step": 4685 + }, + { + "epoch": 0.16781563915698247, + "grad_norm": 1.3202590942382812, + "learning_rate": 0.00019020373506042424, + "loss": 1.3978, + "step": 4686 + }, + { + "epoch": 0.16785145128655077, + "grad_norm": 1.4680845737457275, + "learning_rate": 0.00019019872764706804, + "loss": 1.8426, + "step": 4687 + }, + { + "epoch": 0.16788726341611904, + "grad_norm": 1.5509357452392578, + "learning_rate": 0.00019019371902020434, + "loss": 1.6945, + "step": 4688 + }, + { + "epoch": 0.16792307554568733, + "grad_norm": 2.0995917320251465, + "learning_rate": 0.00019018870917990045, + "loss": 1.9977, + "step": 4689 + }, + { + "epoch": 0.1679588876752556, + "grad_norm": 1.4658147096633911, + "learning_rate": 0.00019018369812622384, + "loss": 1.7161, + "step": 4690 + }, + { + "epoch": 0.1679946998048239, + "grad_norm": 1.6705026626586914, + "learning_rate": 0.0001901786858592419, + "loss": 1.464, + "step": 4691 + }, + { + "epoch": 0.16803051193439217, + "grad_norm": 2.0256388187408447, + "learning_rate": 0.00019017367237902206, + "loss": 1.2567, + "step": 4692 + }, + { + "epoch": 0.16806632406396046, + "grad_norm": 1.6162030696868896, + "learning_rate": 0.00019016865768563176, + "loss": 1.7855, + "step": 4693 + }, + { + "epoch": 0.16810213619352876, + "grad_norm": 1.8909845352172852, + "learning_rate": 0.0001901636417791385, + "loss": 1.7189, + "step": 4694 + }, + { + "epoch": 0.16813794832309703, + "grad_norm": 1.6075421571731567, + "learning_rate": 0.00019015862465960974, + "loss": 1.7273, + "step": 4695 + }, + { + "epoch": 0.16817376045266533, + "grad_norm": 1.5975663661956787, + "learning_rate": 0.00019015360632711298, + "loss": 1.8074, + "step": 4696 + }, + { + "epoch": 0.1682095725822336, + "grad_norm": 1.947001576423645, + "learning_rate": 0.00019014858678171573, + "loss": 1.5853, + "step": 4697 + }, + { + "epoch": 0.1682453847118019, + "grad_norm": 2.0930190086364746, + "learning_rate": 0.0001901435660234855, + "loss": 1.6109, + "step": 4698 + }, + { + "epoch": 0.16828119684137016, + "grad_norm": 1.5381509065628052, + "learning_rate": 0.00019013854405248992, + "loss": 1.6881, + "step": 4699 + }, + { + "epoch": 0.16831700897093846, + "grad_norm": 1.4447983503341675, + "learning_rate": 0.0001901335208687965, + "loss": 1.5999, + "step": 4700 + }, + { + "epoch": 0.16835282110050676, + "grad_norm": 1.7236464023590088, + "learning_rate": 0.00019012849647247277, + "loss": 1.3609, + "step": 4701 + }, + { + "epoch": 0.16838863323007502, + "grad_norm": 1.4596633911132812, + "learning_rate": 0.0001901234708635864, + "loss": 1.5683, + "step": 4702 + }, + { + "epoch": 0.16842444535964332, + "grad_norm": 1.497183084487915, + "learning_rate": 0.00019011844404220497, + "loss": 1.502, + "step": 4703 + }, + { + "epoch": 0.1684602574892116, + "grad_norm": 1.4851388931274414, + "learning_rate": 0.00019011341600839616, + "loss": 1.4912, + "step": 4704 + }, + { + "epoch": 0.1684960696187799, + "grad_norm": 1.5417276620864868, + "learning_rate": 0.00019010838676222755, + "loss": 1.7476, + "step": 4705 + }, + { + "epoch": 0.16853188174834816, + "grad_norm": 2.275900363922119, + "learning_rate": 0.00019010335630376682, + "loss": 1.2967, + "step": 4706 + }, + { + "epoch": 0.16856769387791645, + "grad_norm": 1.881659984588623, + "learning_rate": 0.00019009832463308168, + "loss": 1.9507, + "step": 4707 + }, + { + "epoch": 0.16860350600748472, + "grad_norm": 2.0647614002227783, + "learning_rate": 0.00019009329175023978, + "loss": 1.6887, + "step": 4708 + }, + { + "epoch": 0.16863931813705302, + "grad_norm": 1.464247226715088, + "learning_rate": 0.00019008825765530886, + "loss": 1.6486, + "step": 4709 + }, + { + "epoch": 0.16867513026662131, + "grad_norm": 1.4748529195785522, + "learning_rate": 0.00019008322234835662, + "loss": 1.5222, + "step": 4710 + }, + { + "epoch": 0.16871094239618958, + "grad_norm": 1.4888544082641602, + "learning_rate": 0.00019007818582945086, + "loss": 1.3543, + "step": 4711 + }, + { + "epoch": 0.16874675452575788, + "grad_norm": 1.683009386062622, + "learning_rate": 0.00019007314809865928, + "loss": 1.7678, + "step": 4712 + }, + { + "epoch": 0.16878256665532615, + "grad_norm": 2.0621697902679443, + "learning_rate": 0.00019006810915604967, + "loss": 1.7165, + "step": 4713 + }, + { + "epoch": 0.16881837878489445, + "grad_norm": 2.0385336875915527, + "learning_rate": 0.00019006306900168983, + "loss": 1.6773, + "step": 4714 + }, + { + "epoch": 0.16885419091446272, + "grad_norm": 1.4429692029953003, + "learning_rate": 0.00019005802763564757, + "loss": 1.3534, + "step": 4715 + }, + { + "epoch": 0.168890003044031, + "grad_norm": 1.8206114768981934, + "learning_rate": 0.00019005298505799073, + "loss": 1.6491, + "step": 4716 + }, + { + "epoch": 0.1689258151735993, + "grad_norm": 1.4741144180297852, + "learning_rate": 0.00019004794126878713, + "loss": 1.8381, + "step": 4717 + }, + { + "epoch": 0.16896162730316758, + "grad_norm": 1.921205759048462, + "learning_rate": 0.00019004289626810462, + "loss": 1.6051, + "step": 4718 + }, + { + "epoch": 0.16899743943273587, + "grad_norm": 1.435024619102478, + "learning_rate": 0.00019003785005601112, + "loss": 1.5402, + "step": 4719 + }, + { + "epoch": 0.16903325156230414, + "grad_norm": 1.4565465450286865, + "learning_rate": 0.00019003280263257447, + "loss": 1.6903, + "step": 4720 + }, + { + "epoch": 0.16906906369187244, + "grad_norm": 1.364829182624817, + "learning_rate": 0.0001900277539978626, + "loss": 1.5461, + "step": 4721 + }, + { + "epoch": 0.1691048758214407, + "grad_norm": 1.5410417318344116, + "learning_rate": 0.0001900227041519434, + "loss": 1.4193, + "step": 4722 + }, + { + "epoch": 0.169140687951009, + "grad_norm": 1.8465443849563599, + "learning_rate": 0.00019001765309488487, + "loss": 1.8538, + "step": 4723 + }, + { + "epoch": 0.1691765000805773, + "grad_norm": 1.8663067817687988, + "learning_rate": 0.00019001260082675492, + "loss": 1.8133, + "step": 4724 + }, + { + "epoch": 0.16921231221014557, + "grad_norm": 1.7289997339248657, + "learning_rate": 0.00019000754734762153, + "loss": 1.6781, + "step": 4725 + }, + { + "epoch": 0.16924812433971387, + "grad_norm": 1.981075644493103, + "learning_rate": 0.0001900024926575527, + "loss": 1.5013, + "step": 4726 + }, + { + "epoch": 0.16928393646928214, + "grad_norm": 1.7916004657745361, + "learning_rate": 0.0001899974367566164, + "loss": 1.5879, + "step": 4727 + }, + { + "epoch": 0.16931974859885043, + "grad_norm": 2.0685083866119385, + "learning_rate": 0.00018999237964488074, + "loss": 1.428, + "step": 4728 + }, + { + "epoch": 0.1693555607284187, + "grad_norm": 1.6859298944473267, + "learning_rate": 0.0001899873213224136, + "loss": 1.5873, + "step": 4729 + }, + { + "epoch": 0.169391372857987, + "grad_norm": 1.8945560455322266, + "learning_rate": 0.0001899822617892832, + "loss": 1.8325, + "step": 4730 + }, + { + "epoch": 0.1694271849875553, + "grad_norm": 1.4337220191955566, + "learning_rate": 0.0001899772010455575, + "loss": 1.5093, + "step": 4731 + }, + { + "epoch": 0.16946299711712356, + "grad_norm": 1.2983472347259521, + "learning_rate": 0.00018997213909130464, + "loss": 1.2281, + "step": 4732 + }, + { + "epoch": 0.16949880924669186, + "grad_norm": 2.4921884536743164, + "learning_rate": 0.0001899670759265927, + "loss": 1.8038, + "step": 4733 + }, + { + "epoch": 0.16953462137626013, + "grad_norm": 2.076310634613037, + "learning_rate": 0.00018996201155148983, + "loss": 1.4219, + "step": 4734 + }, + { + "epoch": 0.16957043350582843, + "grad_norm": 1.6718064546585083, + "learning_rate": 0.0001899569459660641, + "loss": 1.8481, + "step": 4735 + }, + { + "epoch": 0.1696062456353967, + "grad_norm": 1.3863492012023926, + "learning_rate": 0.0001899518791703837, + "loss": 1.4694, + "step": 4736 + }, + { + "epoch": 0.169642057764965, + "grad_norm": 1.388734221458435, + "learning_rate": 0.0001899468111645168, + "loss": 1.6998, + "step": 4737 + }, + { + "epoch": 0.1696778698945333, + "grad_norm": 2.2914044857025146, + "learning_rate": 0.00018994174194853161, + "loss": 1.5079, + "step": 4738 + }, + { + "epoch": 0.16971368202410156, + "grad_norm": 1.6915277242660522, + "learning_rate": 0.00018993667152249625, + "loss": 1.5748, + "step": 4739 + }, + { + "epoch": 0.16974949415366986, + "grad_norm": 1.5572466850280762, + "learning_rate": 0.00018993159988647901, + "loss": 1.8074, + "step": 4740 + }, + { + "epoch": 0.16978530628323812, + "grad_norm": 1.6799663305282593, + "learning_rate": 0.0001899265270405481, + "loss": 1.6571, + "step": 4741 + }, + { + "epoch": 0.16982111841280642, + "grad_norm": 1.6444222927093506, + "learning_rate": 0.00018992145298477175, + "loss": 1.4765, + "step": 4742 + }, + { + "epoch": 0.1698569305423747, + "grad_norm": 2.0587503910064697, + "learning_rate": 0.00018991637771921825, + "loss": 2.0098, + "step": 4743 + }, + { + "epoch": 0.169892742671943, + "grad_norm": 1.695487141609192, + "learning_rate": 0.00018991130124395585, + "loss": 1.8735, + "step": 4744 + }, + { + "epoch": 0.16992855480151128, + "grad_norm": 1.8474738597869873, + "learning_rate": 0.0001899062235590529, + "loss": 1.5526, + "step": 4745 + }, + { + "epoch": 0.16996436693107955, + "grad_norm": 1.5542140007019043, + "learning_rate": 0.00018990114466457768, + "loss": 1.7612, + "step": 4746 + }, + { + "epoch": 0.17000017906064785, + "grad_norm": 1.6513017416000366, + "learning_rate": 0.0001898960645605985, + "loss": 1.5931, + "step": 4747 + }, + { + "epoch": 0.17003599119021612, + "grad_norm": 2.3618006706237793, + "learning_rate": 0.00018989098324718375, + "loss": 1.5834, + "step": 4748 + }, + { + "epoch": 0.17007180331978441, + "grad_norm": 1.3786261081695557, + "learning_rate": 0.00018988590072440176, + "loss": 1.6837, + "step": 4749 + }, + { + "epoch": 0.17010761544935268, + "grad_norm": 1.8375169038772583, + "learning_rate": 0.00018988081699232095, + "loss": 1.726, + "step": 4750 + }, + { + "epoch": 0.17014342757892098, + "grad_norm": 1.6182897090911865, + "learning_rate": 0.00018987573205100965, + "loss": 1.6424, + "step": 4751 + }, + { + "epoch": 0.17017923970848928, + "grad_norm": 1.8573373556137085, + "learning_rate": 0.00018987064590053634, + "loss": 2.111, + "step": 4752 + }, + { + "epoch": 0.17021505183805755, + "grad_norm": 1.8004847764968872, + "learning_rate": 0.0001898655585409694, + "loss": 1.5171, + "step": 4753 + }, + { + "epoch": 0.17025086396762584, + "grad_norm": 1.9292305707931519, + "learning_rate": 0.00018986046997237726, + "loss": 1.4811, + "step": 4754 + }, + { + "epoch": 0.1702866760971941, + "grad_norm": 1.3681687116622925, + "learning_rate": 0.00018985538019482842, + "loss": 1.6534, + "step": 4755 + }, + { + "epoch": 0.1703224882267624, + "grad_norm": 2.1036245822906494, + "learning_rate": 0.00018985028920839137, + "loss": 1.6907, + "step": 4756 + }, + { + "epoch": 0.17035830035633068, + "grad_norm": 1.4106781482696533, + "learning_rate": 0.00018984519701313455, + "loss": 1.5812, + "step": 4757 + }, + { + "epoch": 0.17039411248589897, + "grad_norm": 1.4277902841567993, + "learning_rate": 0.0001898401036091265, + "loss": 1.5905, + "step": 4758 + }, + { + "epoch": 0.17042992461546727, + "grad_norm": 1.5859612226486206, + "learning_rate": 0.00018983500899643577, + "loss": 1.9137, + "step": 4759 + }, + { + "epoch": 0.17046573674503554, + "grad_norm": 1.3639755249023438, + "learning_rate": 0.00018982991317513084, + "loss": 1.4867, + "step": 4760 + }, + { + "epoch": 0.17050154887460384, + "grad_norm": 2.493811845779419, + "learning_rate": 0.0001898248161452803, + "loss": 1.4949, + "step": 4761 + }, + { + "epoch": 0.1705373610041721, + "grad_norm": 1.4881577491760254, + "learning_rate": 0.00018981971790695275, + "loss": 1.5378, + "step": 4762 + }, + { + "epoch": 0.1705731731337404, + "grad_norm": 1.687936544418335, + "learning_rate": 0.0001898146184602167, + "loss": 1.6787, + "step": 4763 + }, + { + "epoch": 0.17060898526330867, + "grad_norm": 1.3773003816604614, + "learning_rate": 0.0001898095178051409, + "loss": 1.4484, + "step": 4764 + }, + { + "epoch": 0.17064479739287697, + "grad_norm": 1.2580550909042358, + "learning_rate": 0.0001898044159417938, + "loss": 1.5878, + "step": 4765 + }, + { + "epoch": 0.17068060952244526, + "grad_norm": 1.4113050699234009, + "learning_rate": 0.00018979931287024416, + "loss": 1.2658, + "step": 4766 + }, + { + "epoch": 0.17071642165201353, + "grad_norm": 2.062539577484131, + "learning_rate": 0.00018979420859056062, + "loss": 1.7766, + "step": 4767 + }, + { + "epoch": 0.17075223378158183, + "grad_norm": 2.4932713508605957, + "learning_rate": 0.0001897891031028118, + "loss": 1.9452, + "step": 4768 + }, + { + "epoch": 0.1707880459111501, + "grad_norm": 1.9760795831680298, + "learning_rate": 0.0001897839964070664, + "loss": 1.4249, + "step": 4769 + }, + { + "epoch": 0.1708238580407184, + "grad_norm": 2.0726683139801025, + "learning_rate": 0.00018977888850339319, + "loss": 1.7297, + "step": 4770 + }, + { + "epoch": 0.17085967017028666, + "grad_norm": 1.313335657119751, + "learning_rate": 0.0001897737793918608, + "loss": 1.5473, + "step": 4771 + }, + { + "epoch": 0.17089548229985496, + "grad_norm": 1.480515956878662, + "learning_rate": 0.00018976866907253803, + "loss": 1.2051, + "step": 4772 + }, + { + "epoch": 0.17093129442942326, + "grad_norm": 1.6562939882278442, + "learning_rate": 0.0001897635575454936, + "loss": 1.6465, + "step": 4773 + }, + { + "epoch": 0.17096710655899153, + "grad_norm": 1.5738717317581177, + "learning_rate": 0.0001897584448107963, + "loss": 1.7648, + "step": 4774 + }, + { + "epoch": 0.17100291868855982, + "grad_norm": 1.9761050939559937, + "learning_rate": 0.0001897533308685149, + "loss": 1.6079, + "step": 4775 + }, + { + "epoch": 0.1710387308181281, + "grad_norm": 1.6524931192398071, + "learning_rate": 0.00018974821571871822, + "loss": 1.5807, + "step": 4776 + }, + { + "epoch": 0.1710745429476964, + "grad_norm": 1.2474535703659058, + "learning_rate": 0.00018974309936147502, + "loss": 1.3729, + "step": 4777 + }, + { + "epoch": 0.17111035507726466, + "grad_norm": 1.4474204778671265, + "learning_rate": 0.00018973798179685423, + "loss": 1.4076, + "step": 4778 + }, + { + "epoch": 0.17114616720683296, + "grad_norm": 1.3599224090576172, + "learning_rate": 0.0001897328630249246, + "loss": 1.5103, + "step": 4779 + }, + { + "epoch": 0.17118197933640125, + "grad_norm": 1.535048484802246, + "learning_rate": 0.0001897277430457551, + "loss": 1.3841, + "step": 4780 + }, + { + "epoch": 0.17121779146596952, + "grad_norm": 1.823050856590271, + "learning_rate": 0.00018972262185941452, + "loss": 1.7171, + "step": 4781 + }, + { + "epoch": 0.17125360359553782, + "grad_norm": 2.32497239112854, + "learning_rate": 0.00018971749946597178, + "loss": 1.4991, + "step": 4782 + }, + { + "epoch": 0.1712894157251061, + "grad_norm": 1.427659511566162, + "learning_rate": 0.00018971237586549587, + "loss": 1.5099, + "step": 4783 + }, + { + "epoch": 0.17132522785467438, + "grad_norm": 1.4457180500030518, + "learning_rate": 0.00018970725105805562, + "loss": 1.5058, + "step": 4784 + }, + { + "epoch": 0.17136103998424265, + "grad_norm": 1.7552998065948486, + "learning_rate": 0.00018970212504372004, + "loss": 1.5277, + "step": 4785 + }, + { + "epoch": 0.17139685211381095, + "grad_norm": 1.6383708715438843, + "learning_rate": 0.00018969699782255808, + "loss": 1.8856, + "step": 4786 + }, + { + "epoch": 0.17143266424337925, + "grad_norm": 1.7544535398483276, + "learning_rate": 0.0001896918693946387, + "loss": 1.4428, + "step": 4787 + }, + { + "epoch": 0.17146847637294751, + "grad_norm": 1.2920863628387451, + "learning_rate": 0.0001896867397600309, + "loss": 1.7034, + "step": 4788 + }, + { + "epoch": 0.1715042885025158, + "grad_norm": 2.2302498817443848, + "learning_rate": 0.00018968160891880373, + "loss": 1.9788, + "step": 4789 + }, + { + "epoch": 0.17154010063208408, + "grad_norm": 1.1154465675354004, + "learning_rate": 0.00018967647687102618, + "loss": 1.2938, + "step": 4790 + }, + { + "epoch": 0.17157591276165238, + "grad_norm": 2.1247408390045166, + "learning_rate": 0.00018967134361676732, + "loss": 1.5288, + "step": 4791 + }, + { + "epoch": 0.17161172489122065, + "grad_norm": 1.328395128250122, + "learning_rate": 0.00018966620915609618, + "loss": 1.5552, + "step": 4792 + }, + { + "epoch": 0.17164753702078894, + "grad_norm": 2.552074432373047, + "learning_rate": 0.00018966107348908188, + "loss": 1.4236, + "step": 4793 + }, + { + "epoch": 0.17168334915035724, + "grad_norm": 1.6310802698135376, + "learning_rate": 0.00018965593661579347, + "loss": 1.6218, + "step": 4794 + }, + { + "epoch": 0.1717191612799255, + "grad_norm": 1.940028429031372, + "learning_rate": 0.00018965079853630007, + "loss": 1.7298, + "step": 4795 + }, + { + "epoch": 0.1717549734094938, + "grad_norm": 1.7597527503967285, + "learning_rate": 0.00018964565925067085, + "loss": 1.7404, + "step": 4796 + }, + { + "epoch": 0.17179078553906207, + "grad_norm": 1.7288353443145752, + "learning_rate": 0.0001896405187589749, + "loss": 1.4283, + "step": 4797 + }, + { + "epoch": 0.17182659766863037, + "grad_norm": 1.8926104307174683, + "learning_rate": 0.0001896353770612814, + "loss": 1.7908, + "step": 4798 + }, + { + "epoch": 0.17186240979819864, + "grad_norm": 2.0064892768859863, + "learning_rate": 0.00018963023415765956, + "loss": 1.5197, + "step": 4799 + }, + { + "epoch": 0.17189822192776694, + "grad_norm": 1.9775582551956177, + "learning_rate": 0.00018962509004817846, + "loss": 1.7128, + "step": 4800 + }, + { + "epoch": 0.17193403405733523, + "grad_norm": 1.5334006547927856, + "learning_rate": 0.00018961994473290744, + "loss": 1.5295, + "step": 4801 + }, + { + "epoch": 0.1719698461869035, + "grad_norm": 1.7124851942062378, + "learning_rate": 0.00018961479821191562, + "loss": 1.5951, + "step": 4802 + }, + { + "epoch": 0.1720056583164718, + "grad_norm": 1.847869634628296, + "learning_rate": 0.00018960965048527232, + "loss": 1.6884, + "step": 4803 + }, + { + "epoch": 0.17204147044604007, + "grad_norm": 1.9723440408706665, + "learning_rate": 0.00018960450155304677, + "loss": 1.4846, + "step": 4804 + }, + { + "epoch": 0.17207728257560836, + "grad_norm": 1.7727763652801514, + "learning_rate": 0.00018959935141530821, + "loss": 1.5497, + "step": 4805 + }, + { + "epoch": 0.17211309470517663, + "grad_norm": 1.8374532461166382, + "learning_rate": 0.00018959420007212593, + "loss": 1.2231, + "step": 4806 + }, + { + "epoch": 0.17214890683474493, + "grad_norm": 1.577351450920105, + "learning_rate": 0.0001895890475235693, + "loss": 1.5661, + "step": 4807 + }, + { + "epoch": 0.1721847189643132, + "grad_norm": 1.4347150325775146, + "learning_rate": 0.00018958389376970758, + "loss": 1.644, + "step": 4808 + }, + { + "epoch": 0.1722205310938815, + "grad_norm": 1.2892824411392212, + "learning_rate": 0.00018957873881061014, + "loss": 1.407, + "step": 4809 + }, + { + "epoch": 0.1722563432234498, + "grad_norm": 1.7509137392044067, + "learning_rate": 0.00018957358264634627, + "loss": 1.8629, + "step": 4810 + }, + { + "epoch": 0.17229215535301806, + "grad_norm": 1.6735416650772095, + "learning_rate": 0.0001895684252769854, + "loss": 1.3715, + "step": 4811 + }, + { + "epoch": 0.17232796748258636, + "grad_norm": 1.627367615699768, + "learning_rate": 0.00018956326670259695, + "loss": 1.3836, + "step": 4812 + }, + { + "epoch": 0.17236377961215463, + "grad_norm": 1.8383601903915405, + "learning_rate": 0.00018955810692325025, + "loss": 1.6612, + "step": 4813 + }, + { + "epoch": 0.17239959174172292, + "grad_norm": 1.8072162866592407, + "learning_rate": 0.00018955294593901476, + "loss": 1.7922, + "step": 4814 + }, + { + "epoch": 0.1724354038712912, + "grad_norm": 1.667475938796997, + "learning_rate": 0.00018954778374995988, + "loss": 1.4375, + "step": 4815 + }, + { + "epoch": 0.1724712160008595, + "grad_norm": 1.460464358329773, + "learning_rate": 0.00018954262035615505, + "loss": 1.8729, + "step": 4816 + }, + { + "epoch": 0.1725070281304278, + "grad_norm": 1.6498515605926514, + "learning_rate": 0.0001895374557576698, + "loss": 1.7033, + "step": 4817 + }, + { + "epoch": 0.17254284025999606, + "grad_norm": 1.6040598154067993, + "learning_rate": 0.00018953228995457355, + "loss": 1.5721, + "step": 4818 + }, + { + "epoch": 0.17257865238956435, + "grad_norm": 1.5885403156280518, + "learning_rate": 0.00018952712294693585, + "loss": 1.6447, + "step": 4819 + }, + { + "epoch": 0.17261446451913262, + "grad_norm": 1.9566494226455688, + "learning_rate": 0.0001895219547348262, + "loss": 1.316, + "step": 4820 + }, + { + "epoch": 0.17265027664870092, + "grad_norm": 2.1236560344696045, + "learning_rate": 0.0001895167853183141, + "loss": 1.8277, + "step": 4821 + }, + { + "epoch": 0.1726860887782692, + "grad_norm": 2.2870054244995117, + "learning_rate": 0.00018951161469746915, + "loss": 1.4881, + "step": 4822 + }, + { + "epoch": 0.17272190090783748, + "grad_norm": 1.455335259437561, + "learning_rate": 0.00018950644287236084, + "loss": 1.8649, + "step": 4823 + }, + { + "epoch": 0.17275771303740578, + "grad_norm": 1.4912785291671753, + "learning_rate": 0.00018950126984305885, + "loss": 1.4861, + "step": 4824 + }, + { + "epoch": 0.17279352516697405, + "grad_norm": 1.774718165397644, + "learning_rate": 0.0001894960956096327, + "loss": 1.4081, + "step": 4825 + }, + { + "epoch": 0.17282933729654235, + "grad_norm": 1.8963184356689453, + "learning_rate": 0.000189490920172152, + "loss": 1.7869, + "step": 4826 + }, + { + "epoch": 0.17286514942611061, + "grad_norm": 1.4983773231506348, + "learning_rate": 0.00018948574353068643, + "loss": 1.7589, + "step": 4827 + }, + { + "epoch": 0.1729009615556789, + "grad_norm": 1.7496103048324585, + "learning_rate": 0.0001894805656853056, + "loss": 1.6636, + "step": 4828 + }, + { + "epoch": 0.17293677368524718, + "grad_norm": 1.442915678024292, + "learning_rate": 0.00018947538663607918, + "loss": 1.2341, + "step": 4829 + }, + { + "epoch": 0.17297258581481548, + "grad_norm": 2.0862679481506348, + "learning_rate": 0.00018947020638307687, + "loss": 1.6988, + "step": 4830 + }, + { + "epoch": 0.17300839794438377, + "grad_norm": 1.8809422254562378, + "learning_rate": 0.0001894650249263683, + "loss": 1.7472, + "step": 4831 + }, + { + "epoch": 0.17304421007395204, + "grad_norm": 1.4286421537399292, + "learning_rate": 0.00018945984226602326, + "loss": 1.7746, + "step": 4832 + }, + { + "epoch": 0.17308002220352034, + "grad_norm": 1.5385067462921143, + "learning_rate": 0.00018945465840211143, + "loss": 1.7762, + "step": 4833 + }, + { + "epoch": 0.1731158343330886, + "grad_norm": 1.8565199375152588, + "learning_rate": 0.00018944947333470252, + "loss": 1.6201, + "step": 4834 + }, + { + "epoch": 0.1731516464626569, + "grad_norm": 1.7545292377471924, + "learning_rate": 0.0001894442870638664, + "loss": 1.468, + "step": 4835 + }, + { + "epoch": 0.17318745859222517, + "grad_norm": 2.246614456176758, + "learning_rate": 0.00018943909958967273, + "loss": 1.7679, + "step": 4836 + }, + { + "epoch": 0.17322327072179347, + "grad_norm": 1.7241929769515991, + "learning_rate": 0.00018943391091219133, + "loss": 1.6106, + "step": 4837 + }, + { + "epoch": 0.17325908285136177, + "grad_norm": 1.5218449831008911, + "learning_rate": 0.00018942872103149206, + "loss": 1.7442, + "step": 4838 + }, + { + "epoch": 0.17329489498093004, + "grad_norm": 1.9982221126556396, + "learning_rate": 0.00018942352994764464, + "loss": 1.6432, + "step": 4839 + }, + { + "epoch": 0.17333070711049833, + "grad_norm": 2.3945224285125732, + "learning_rate": 0.00018941833766071903, + "loss": 1.7619, + "step": 4840 + }, + { + "epoch": 0.1733665192400666, + "grad_norm": 1.7721819877624512, + "learning_rate": 0.00018941314417078502, + "loss": 1.5272, + "step": 4841 + }, + { + "epoch": 0.1734023313696349, + "grad_norm": 1.349071979522705, + "learning_rate": 0.00018940794947791247, + "loss": 1.3999, + "step": 4842 + }, + { + "epoch": 0.17343814349920317, + "grad_norm": 1.555430293083191, + "learning_rate": 0.0001894027535821713, + "loss": 1.5955, + "step": 4843 + }, + { + "epoch": 0.17347395562877146, + "grad_norm": 1.8791706562042236, + "learning_rate": 0.0001893975564836314, + "loss": 1.5432, + "step": 4844 + }, + { + "epoch": 0.17350976775833976, + "grad_norm": 2.6219892501831055, + "learning_rate": 0.00018939235818236268, + "loss": 2.0688, + "step": 4845 + }, + { + "epoch": 0.17354557988790803, + "grad_norm": 2.0889711380004883, + "learning_rate": 0.00018938715867843512, + "loss": 1.4781, + "step": 4846 + }, + { + "epoch": 0.17358139201747633, + "grad_norm": 1.7666796445846558, + "learning_rate": 0.0001893819579719186, + "loss": 1.4148, + "step": 4847 + }, + { + "epoch": 0.1736172041470446, + "grad_norm": 2.062335968017578, + "learning_rate": 0.00018937675606288317, + "loss": 1.4671, + "step": 4848 + }, + { + "epoch": 0.1736530162766129, + "grad_norm": 1.5806844234466553, + "learning_rate": 0.00018937155295139878, + "loss": 1.7065, + "step": 4849 + }, + { + "epoch": 0.17368882840618116, + "grad_norm": 1.7525285482406616, + "learning_rate": 0.00018936634863753537, + "loss": 1.5066, + "step": 4850 + }, + { + "epoch": 0.17372464053574946, + "grad_norm": 1.3670426607131958, + "learning_rate": 0.00018936114312136307, + "loss": 1.5515, + "step": 4851 + }, + { + "epoch": 0.17376045266531776, + "grad_norm": 1.4752345085144043, + "learning_rate": 0.0001893559364029518, + "loss": 1.4657, + "step": 4852 + }, + { + "epoch": 0.17379626479488602, + "grad_norm": 1.8495032787322998, + "learning_rate": 0.00018935072848237172, + "loss": 1.6514, + "step": 4853 + }, + { + "epoch": 0.17383207692445432, + "grad_norm": 2.1843106746673584, + "learning_rate": 0.00018934551935969284, + "loss": 1.4489, + "step": 4854 + }, + { + "epoch": 0.1738678890540226, + "grad_norm": 1.5473827123641968, + "learning_rate": 0.00018934030903498518, + "loss": 1.5436, + "step": 4855 + }, + { + "epoch": 0.1739037011835909, + "grad_norm": 1.4357026815414429, + "learning_rate": 0.00018933509750831897, + "loss": 1.6933, + "step": 4856 + }, + { + "epoch": 0.17393951331315916, + "grad_norm": 1.5008283853530884, + "learning_rate": 0.00018932988477976423, + "loss": 1.6496, + "step": 4857 + }, + { + "epoch": 0.17397532544272745, + "grad_norm": 1.748734474182129, + "learning_rate": 0.0001893246708493911, + "loss": 1.6295, + "step": 4858 + }, + { + "epoch": 0.17401113757229575, + "grad_norm": 2.9435081481933594, + "learning_rate": 0.0001893194557172698, + "loss": 1.3209, + "step": 4859 + }, + { + "epoch": 0.17404694970186402, + "grad_norm": 1.4894545078277588, + "learning_rate": 0.0001893142393834704, + "loss": 1.7977, + "step": 4860 + }, + { + "epoch": 0.17408276183143231, + "grad_norm": 2.0931038856506348, + "learning_rate": 0.00018930902184806313, + "loss": 1.9863, + "step": 4861 + }, + { + "epoch": 0.17411857396100058, + "grad_norm": 1.7065221071243286, + "learning_rate": 0.00018930380311111815, + "loss": 1.4548, + "step": 4862 + }, + { + "epoch": 0.17415438609056888, + "grad_norm": 1.5173895359039307, + "learning_rate": 0.0001892985831727057, + "loss": 1.607, + "step": 4863 + }, + { + "epoch": 0.17419019822013715, + "grad_norm": 1.4496746063232422, + "learning_rate": 0.000189293362032896, + "loss": 1.4503, + "step": 4864 + }, + { + "epoch": 0.17422601034970545, + "grad_norm": 1.7697582244873047, + "learning_rate": 0.00018928813969175932, + "loss": 1.6072, + "step": 4865 + }, + { + "epoch": 0.17426182247927374, + "grad_norm": 1.6433504819869995, + "learning_rate": 0.00018928291614936585, + "loss": 1.7286, + "step": 4866 + }, + { + "epoch": 0.174297634608842, + "grad_norm": 1.8499150276184082, + "learning_rate": 0.00018927769140578593, + "loss": 1.5328, + "step": 4867 + }, + { + "epoch": 0.1743334467384103, + "grad_norm": 2.2025258541107178, + "learning_rate": 0.00018927246546108985, + "loss": 1.755, + "step": 4868 + }, + { + "epoch": 0.17436925886797858, + "grad_norm": 2.001037120819092, + "learning_rate": 0.00018926723831534789, + "loss": 1.4364, + "step": 4869 + }, + { + "epoch": 0.17440507099754687, + "grad_norm": 1.4569237232208252, + "learning_rate": 0.00018926200996863038, + "loss": 1.8061, + "step": 4870 + }, + { + "epoch": 0.17444088312711514, + "grad_norm": 1.2852599620819092, + "learning_rate": 0.00018925678042100766, + "loss": 1.2263, + "step": 4871 + }, + { + "epoch": 0.17447669525668344, + "grad_norm": 1.772127628326416, + "learning_rate": 0.0001892515496725501, + "loss": 1.4574, + "step": 4872 + }, + { + "epoch": 0.17451250738625174, + "grad_norm": 1.3789273500442505, + "learning_rate": 0.00018924631772332807, + "loss": 1.4344, + "step": 4873 + }, + { + "epoch": 0.17454831951582, + "grad_norm": 1.595746397972107, + "learning_rate": 0.00018924108457341195, + "loss": 1.9308, + "step": 4874 + }, + { + "epoch": 0.1745841316453883, + "grad_norm": 1.9921436309814453, + "learning_rate": 0.00018923585022287214, + "loss": 1.5553, + "step": 4875 + }, + { + "epoch": 0.17461994377495657, + "grad_norm": 1.2532308101654053, + "learning_rate": 0.00018923061467177908, + "loss": 1.5337, + "step": 4876 + }, + { + "epoch": 0.17465575590452487, + "grad_norm": 1.189430832862854, + "learning_rate": 0.0001892253779202032, + "loss": 1.4088, + "step": 4877 + }, + { + "epoch": 0.17469156803409314, + "grad_norm": 2.0476043224334717, + "learning_rate": 0.00018922013996821492, + "loss": 1.2659, + "step": 4878 + }, + { + "epoch": 0.17472738016366143, + "grad_norm": 2.3595757484436035, + "learning_rate": 0.0001892149008158848, + "loss": 1.4595, + "step": 4879 + }, + { + "epoch": 0.17476319229322973, + "grad_norm": 1.562600016593933, + "learning_rate": 0.00018920966046328324, + "loss": 1.5422, + "step": 4880 + }, + { + "epoch": 0.174799004422798, + "grad_norm": 1.5140035152435303, + "learning_rate": 0.00018920441891048077, + "loss": 1.628, + "step": 4881 + }, + { + "epoch": 0.1748348165523663, + "grad_norm": 1.220018744468689, + "learning_rate": 0.00018919917615754792, + "loss": 1.4796, + "step": 4882 + }, + { + "epoch": 0.17487062868193456, + "grad_norm": 1.8878225088119507, + "learning_rate": 0.00018919393220455518, + "loss": 1.2749, + "step": 4883 + }, + { + "epoch": 0.17490644081150286, + "grad_norm": 1.5586538314819336, + "learning_rate": 0.00018918868705157318, + "loss": 1.4179, + "step": 4884 + }, + { + "epoch": 0.17494225294107113, + "grad_norm": 1.4380314350128174, + "learning_rate": 0.00018918344069867243, + "loss": 1.5866, + "step": 4885 + }, + { + "epoch": 0.17497806507063943, + "grad_norm": 3.5423998832702637, + "learning_rate": 0.00018917819314592351, + "loss": 1.9557, + "step": 4886 + }, + { + "epoch": 0.17501387720020772, + "grad_norm": 1.9124844074249268, + "learning_rate": 0.00018917294439339705, + "loss": 1.517, + "step": 4887 + }, + { + "epoch": 0.175049689329776, + "grad_norm": 1.3516485691070557, + "learning_rate": 0.0001891676944411636, + "loss": 1.5067, + "step": 4888 + }, + { + "epoch": 0.1750855014593443, + "grad_norm": 1.858694076538086, + "learning_rate": 0.00018916244328929388, + "loss": 1.4675, + "step": 4889 + }, + { + "epoch": 0.17512131358891256, + "grad_norm": 1.8786976337432861, + "learning_rate": 0.00018915719093785848, + "loss": 1.5355, + "step": 4890 + }, + { + "epoch": 0.17515712571848085, + "grad_norm": 1.3482768535614014, + "learning_rate": 0.00018915193738692812, + "loss": 1.6203, + "step": 4891 + }, + { + "epoch": 0.17519293784804912, + "grad_norm": 1.3710525035858154, + "learning_rate": 0.00018914668263657342, + "loss": 1.3993, + "step": 4892 + }, + { + "epoch": 0.17522874997761742, + "grad_norm": 1.5821046829223633, + "learning_rate": 0.00018914142668686505, + "loss": 1.5148, + "step": 4893 + }, + { + "epoch": 0.17526456210718572, + "grad_norm": 2.2228548526763916, + "learning_rate": 0.0001891361695378738, + "loss": 1.9352, + "step": 4894 + }, + { + "epoch": 0.175300374236754, + "grad_norm": 3.116131544113159, + "learning_rate": 0.00018913091118967037, + "loss": 1.8747, + "step": 4895 + }, + { + "epoch": 0.17533618636632228, + "grad_norm": 1.6172984838485718, + "learning_rate": 0.00018912565164232552, + "loss": 1.3994, + "step": 4896 + }, + { + "epoch": 0.17537199849589055, + "grad_norm": 1.4550762176513672, + "learning_rate": 0.00018912039089590997, + "loss": 1.4414, + "step": 4897 + }, + { + "epoch": 0.17540781062545885, + "grad_norm": 2.1344704627990723, + "learning_rate": 0.00018911512895049452, + "loss": 1.6229, + "step": 4898 + }, + { + "epoch": 0.17544362275502712, + "grad_norm": 1.5842924118041992, + "learning_rate": 0.00018910986580614997, + "loss": 1.325, + "step": 4899 + }, + { + "epoch": 0.17547943488459541, + "grad_norm": 1.6697801351547241, + "learning_rate": 0.00018910460146294707, + "loss": 1.5925, + "step": 4900 + }, + { + "epoch": 0.1755152470141637, + "grad_norm": 1.509250521659851, + "learning_rate": 0.00018909933592095674, + "loss": 1.4572, + "step": 4901 + }, + { + "epoch": 0.17555105914373198, + "grad_norm": 1.783841609954834, + "learning_rate": 0.0001890940691802498, + "loss": 1.6234, + "step": 4902 + }, + { + "epoch": 0.17558687127330028, + "grad_norm": 1.7891788482666016, + "learning_rate": 0.00018908880124089702, + "loss": 1.458, + "step": 4903 + }, + { + "epoch": 0.17562268340286855, + "grad_norm": 2.8010456562042236, + "learning_rate": 0.0001890835321029694, + "loss": 1.5074, + "step": 4904 + }, + { + "epoch": 0.17565849553243684, + "grad_norm": 1.094394564628601, + "learning_rate": 0.00018907826176653772, + "loss": 1.3326, + "step": 4905 + }, + { + "epoch": 0.1756943076620051, + "grad_norm": 1.3641574382781982, + "learning_rate": 0.00018907299023167293, + "loss": 1.7497, + "step": 4906 + }, + { + "epoch": 0.1757301197915734, + "grad_norm": 1.4977530241012573, + "learning_rate": 0.00018906771749844595, + "loss": 1.5956, + "step": 4907 + }, + { + "epoch": 0.17576593192114168, + "grad_norm": 1.44773268699646, + "learning_rate": 0.00018906244356692775, + "loss": 1.7386, + "step": 4908 + }, + { + "epoch": 0.17580174405070997, + "grad_norm": 1.46619713306427, + "learning_rate": 0.00018905716843718926, + "loss": 1.6829, + "step": 4909 + }, + { + "epoch": 0.17583755618027827, + "grad_norm": 4.56627893447876, + "learning_rate": 0.00018905189210930142, + "loss": 1.5221, + "step": 4910 + }, + { + "epoch": 0.17587336830984654, + "grad_norm": 1.9148731231689453, + "learning_rate": 0.00018904661458333524, + "loss": 1.5686, + "step": 4911 + }, + { + "epoch": 0.17590918043941484, + "grad_norm": 1.6884065866470337, + "learning_rate": 0.00018904133585936173, + "loss": 1.531, + "step": 4912 + }, + { + "epoch": 0.1759449925689831, + "grad_norm": 1.323333740234375, + "learning_rate": 0.0001890360559374519, + "loss": 1.7735, + "step": 4913 + }, + { + "epoch": 0.1759808046985514, + "grad_norm": 2.1087372303009033, + "learning_rate": 0.00018903077481767676, + "loss": 1.305, + "step": 4914 + }, + { + "epoch": 0.17601661682811967, + "grad_norm": 2.0046706199645996, + "learning_rate": 0.00018902549250010743, + "loss": 1.5707, + "step": 4915 + }, + { + "epoch": 0.17605242895768797, + "grad_norm": 2.4085426330566406, + "learning_rate": 0.0001890202089848149, + "loss": 1.6196, + "step": 4916 + }, + { + "epoch": 0.17608824108725626, + "grad_norm": 1.8258897066116333, + "learning_rate": 0.00018901492427187032, + "loss": 1.5813, + "step": 4917 + }, + { + "epoch": 0.17612405321682453, + "grad_norm": 1.4927031993865967, + "learning_rate": 0.0001890096383613447, + "loss": 1.6213, + "step": 4918 + }, + { + "epoch": 0.17615986534639283, + "grad_norm": 1.136582612991333, + "learning_rate": 0.00018900435125330923, + "loss": 1.4248, + "step": 4919 + }, + { + "epoch": 0.1761956774759611, + "grad_norm": 1.3010669946670532, + "learning_rate": 0.00018899906294783504, + "loss": 1.6543, + "step": 4920 + }, + { + "epoch": 0.1762314896055294, + "grad_norm": 1.9157588481903076, + "learning_rate": 0.00018899377344499328, + "loss": 1.5228, + "step": 4921 + }, + { + "epoch": 0.17626730173509766, + "grad_norm": 2.504683494567871, + "learning_rate": 0.00018898848274485506, + "loss": 2.0403, + "step": 4922 + }, + { + "epoch": 0.17630311386466596, + "grad_norm": 1.409203052520752, + "learning_rate": 0.00018898319084749158, + "loss": 1.2702, + "step": 4923 + }, + { + "epoch": 0.17633892599423426, + "grad_norm": 2.0909388065338135, + "learning_rate": 0.00018897789775297404, + "loss": 1.8806, + "step": 4924 + }, + { + "epoch": 0.17637473812380253, + "grad_norm": 1.3817533254623413, + "learning_rate": 0.0001889726034613737, + "loss": 1.5544, + "step": 4925 + }, + { + "epoch": 0.17641055025337082, + "grad_norm": 1.9548993110656738, + "learning_rate": 0.00018896730797276175, + "loss": 1.6944, + "step": 4926 + }, + { + "epoch": 0.1764463623829391, + "grad_norm": 1.3529974222183228, + "learning_rate": 0.00018896201128720938, + "loss": 1.471, + "step": 4927 + }, + { + "epoch": 0.1764821745125074, + "grad_norm": 1.5662931203842163, + "learning_rate": 0.00018895671340478796, + "loss": 1.5855, + "step": 4928 + }, + { + "epoch": 0.17651798664207566, + "grad_norm": 1.9040552377700806, + "learning_rate": 0.00018895141432556867, + "loss": 1.4631, + "step": 4929 + }, + { + "epoch": 0.17655379877164395, + "grad_norm": 2.3355555534362793, + "learning_rate": 0.00018894611404962283, + "loss": 1.6409, + "step": 4930 + }, + { + "epoch": 0.17658961090121225, + "grad_norm": 1.3861925601959229, + "learning_rate": 0.0001889408125770218, + "loss": 1.5785, + "step": 4931 + }, + { + "epoch": 0.17662542303078052, + "grad_norm": 1.3609862327575684, + "learning_rate": 0.00018893550990783684, + "loss": 1.4329, + "step": 4932 + }, + { + "epoch": 0.17666123516034882, + "grad_norm": 1.739033818244934, + "learning_rate": 0.00018893020604213932, + "loss": 1.6007, + "step": 4933 + }, + { + "epoch": 0.1766970472899171, + "grad_norm": 2.2016334533691406, + "learning_rate": 0.00018892490098000055, + "loss": 1.4416, + "step": 4934 + }, + { + "epoch": 0.17673285941948538, + "grad_norm": 1.7724460363388062, + "learning_rate": 0.00018891959472149198, + "loss": 1.4571, + "step": 4935 + }, + { + "epoch": 0.17676867154905365, + "grad_norm": 2.670675754547119, + "learning_rate": 0.00018891428726668495, + "loss": 1.5139, + "step": 4936 + }, + { + "epoch": 0.17680448367862195, + "grad_norm": 1.693932294845581, + "learning_rate": 0.00018890897861565086, + "loss": 2.0678, + "step": 4937 + }, + { + "epoch": 0.17684029580819025, + "grad_norm": 1.6531952619552612, + "learning_rate": 0.00018890366876846119, + "loss": 1.7776, + "step": 4938 + }, + { + "epoch": 0.17687610793775851, + "grad_norm": 2.104870080947876, + "learning_rate": 0.00018889835772518731, + "loss": 1.5346, + "step": 4939 + }, + { + "epoch": 0.1769119200673268, + "grad_norm": 1.7648040056228638, + "learning_rate": 0.00018889304548590067, + "loss": 1.8308, + "step": 4940 + }, + { + "epoch": 0.17694773219689508, + "grad_norm": 1.3178796768188477, + "learning_rate": 0.00018888773205067282, + "loss": 1.7215, + "step": 4941 + }, + { + "epoch": 0.17698354432646338, + "grad_norm": 1.2029863595962524, + "learning_rate": 0.00018888241741957514, + "loss": 1.5563, + "step": 4942 + }, + { + "epoch": 0.17701935645603165, + "grad_norm": 2.1883883476257324, + "learning_rate": 0.00018887710159267923, + "loss": 1.7515, + "step": 4943 + }, + { + "epoch": 0.17705516858559994, + "grad_norm": 1.2338509559631348, + "learning_rate": 0.00018887178457005653, + "loss": 1.4304, + "step": 4944 + }, + { + "epoch": 0.17709098071516824, + "grad_norm": 1.7541881799697876, + "learning_rate": 0.00018886646635177864, + "loss": 1.7429, + "step": 4945 + }, + { + "epoch": 0.1771267928447365, + "grad_norm": 1.4852969646453857, + "learning_rate": 0.00018886114693791704, + "loss": 1.4427, + "step": 4946 + }, + { + "epoch": 0.1771626049743048, + "grad_norm": 1.5804656744003296, + "learning_rate": 0.00018885582632854333, + "loss": 1.431, + "step": 4947 + }, + { + "epoch": 0.17719841710387307, + "grad_norm": 1.3191192150115967, + "learning_rate": 0.00018885050452372912, + "loss": 1.5655, + "step": 4948 + }, + { + "epoch": 0.17723422923344137, + "grad_norm": 2.596315383911133, + "learning_rate": 0.00018884518152354596, + "loss": 1.496, + "step": 4949 + }, + { + "epoch": 0.17727004136300964, + "grad_norm": 1.8313298225402832, + "learning_rate": 0.00018883985732806547, + "loss": 1.5148, + "step": 4950 + }, + { + "epoch": 0.17730585349257794, + "grad_norm": 1.5022848844528198, + "learning_rate": 0.00018883453193735932, + "loss": 1.4678, + "step": 4951 + }, + { + "epoch": 0.17734166562214623, + "grad_norm": 1.8129926919937134, + "learning_rate": 0.00018882920535149913, + "loss": 1.5432, + "step": 4952 + }, + { + "epoch": 0.1773774777517145, + "grad_norm": 1.5845561027526855, + "learning_rate": 0.00018882387757055655, + "loss": 1.9524, + "step": 4953 + }, + { + "epoch": 0.1774132898812828, + "grad_norm": 1.0808457136154175, + "learning_rate": 0.00018881854859460328, + "loss": 1.643, + "step": 4954 + }, + { + "epoch": 0.17744910201085107, + "grad_norm": 1.7996083498001099, + "learning_rate": 0.00018881321842371103, + "loss": 1.6271, + "step": 4955 + }, + { + "epoch": 0.17748491414041936, + "grad_norm": 1.7075645923614502, + "learning_rate": 0.00018880788705795144, + "loss": 1.8461, + "step": 4956 + }, + { + "epoch": 0.17752072626998763, + "grad_norm": 1.3339205980300903, + "learning_rate": 0.00018880255449739634, + "loss": 1.5211, + "step": 4957 + }, + { + "epoch": 0.17755653839955593, + "grad_norm": 1.3496900796890259, + "learning_rate": 0.00018879722074211736, + "loss": 1.5706, + "step": 4958 + }, + { + "epoch": 0.17759235052912423, + "grad_norm": 1.2548480033874512, + "learning_rate": 0.00018879188579218635, + "loss": 1.5067, + "step": 4959 + }, + { + "epoch": 0.1776281626586925, + "grad_norm": 1.7974036931991577, + "learning_rate": 0.000188786549647675, + "loss": 1.38, + "step": 4960 + }, + { + "epoch": 0.1776639747882608, + "grad_norm": 1.8285739421844482, + "learning_rate": 0.0001887812123086552, + "loss": 1.8508, + "step": 4961 + }, + { + "epoch": 0.17769978691782906, + "grad_norm": 1.8094350099563599, + "learning_rate": 0.0001887758737751987, + "loss": 1.4213, + "step": 4962 + }, + { + "epoch": 0.17773559904739736, + "grad_norm": 1.3000043630599976, + "learning_rate": 0.00018877053404737734, + "loss": 1.6764, + "step": 4963 + }, + { + "epoch": 0.17777141117696563, + "grad_norm": 1.428836703300476, + "learning_rate": 0.00018876519312526293, + "loss": 1.6104, + "step": 4964 + }, + { + "epoch": 0.17780722330653392, + "grad_norm": 1.7616479396820068, + "learning_rate": 0.00018875985100892738, + "loss": 1.2714, + "step": 4965 + }, + { + "epoch": 0.17784303543610222, + "grad_norm": 1.2152507305145264, + "learning_rate": 0.0001887545076984425, + "loss": 1.6792, + "step": 4966 + }, + { + "epoch": 0.1778788475656705, + "grad_norm": 1.5839030742645264, + "learning_rate": 0.0001887491631938802, + "loss": 1.2338, + "step": 4967 + }, + { + "epoch": 0.17791465969523879, + "grad_norm": 3.108227014541626, + "learning_rate": 0.0001887438174953124, + "loss": 1.7161, + "step": 4968 + }, + { + "epoch": 0.17795047182480705, + "grad_norm": 1.628882884979248, + "learning_rate": 0.000188738470602811, + "loss": 1.9082, + "step": 4969 + }, + { + "epoch": 0.17798628395437535, + "grad_norm": 1.6627739667892456, + "learning_rate": 0.00018873312251644793, + "loss": 1.5598, + "step": 4970 + }, + { + "epoch": 0.17802209608394362, + "grad_norm": 1.4799766540527344, + "learning_rate": 0.00018872777323629514, + "loss": 1.3478, + "step": 4971 + }, + { + "epoch": 0.17805790821351192, + "grad_norm": 2.2663679122924805, + "learning_rate": 0.00018872242276242465, + "loss": 1.8249, + "step": 4972 + }, + { + "epoch": 0.17809372034308021, + "grad_norm": 1.7324576377868652, + "learning_rate": 0.0001887170710949084, + "loss": 1.9787, + "step": 4973 + }, + { + "epoch": 0.17812953247264848, + "grad_norm": 1.4873781204223633, + "learning_rate": 0.00018871171823381836, + "loss": 1.9714, + "step": 4974 + }, + { + "epoch": 0.17816534460221678, + "grad_norm": 1.8465592861175537, + "learning_rate": 0.00018870636417922662, + "loss": 1.4609, + "step": 4975 + }, + { + "epoch": 0.17820115673178505, + "grad_norm": 2.0763323307037354, + "learning_rate": 0.00018870100893120516, + "loss": 1.8321, + "step": 4976 + }, + { + "epoch": 0.17823696886135335, + "grad_norm": 1.5704597234725952, + "learning_rate": 0.00018869565248982607, + "loss": 1.4162, + "step": 4977 + }, + { + "epoch": 0.17827278099092161, + "grad_norm": 1.4963994026184082, + "learning_rate": 0.00018869029485516135, + "loss": 1.2262, + "step": 4978 + }, + { + "epoch": 0.1783085931204899, + "grad_norm": 1.768388271331787, + "learning_rate": 0.0001886849360272831, + "loss": 1.5942, + "step": 4979 + }, + { + "epoch": 0.1783444052500582, + "grad_norm": 1.202764868736267, + "learning_rate": 0.00018867957600626344, + "loss": 1.7769, + "step": 4980 + }, + { + "epoch": 0.17838021737962648, + "grad_norm": 1.4093730449676514, + "learning_rate": 0.0001886742147921745, + "loss": 1.6073, + "step": 4981 + }, + { + "epoch": 0.17841602950919477, + "grad_norm": 1.5123848915100098, + "learning_rate": 0.0001886688523850884, + "loss": 1.5704, + "step": 4982 + }, + { + "epoch": 0.17845184163876304, + "grad_norm": 1.5131558179855347, + "learning_rate": 0.0001886634887850772, + "loss": 1.7871, + "step": 4983 + }, + { + "epoch": 0.17848765376833134, + "grad_norm": 1.548387050628662, + "learning_rate": 0.00018865812399221317, + "loss": 1.6158, + "step": 4984 + }, + { + "epoch": 0.1785234658978996, + "grad_norm": 1.310096263885498, + "learning_rate": 0.00018865275800656844, + "loss": 1.6574, + "step": 4985 + }, + { + "epoch": 0.1785592780274679, + "grad_norm": 1.429352045059204, + "learning_rate": 0.00018864739082821518, + "loss": 1.6101, + "step": 4986 + }, + { + "epoch": 0.1785950901570362, + "grad_norm": 1.6432214975357056, + "learning_rate": 0.00018864202245722566, + "loss": 1.7259, + "step": 4987 + }, + { + "epoch": 0.17863090228660447, + "grad_norm": 1.8350661993026733, + "learning_rate": 0.00018863665289367204, + "loss": 1.6486, + "step": 4988 + }, + { + "epoch": 0.17866671441617277, + "grad_norm": 1.4883739948272705, + "learning_rate": 0.0001886312821376266, + "loss": 1.5008, + "step": 4989 + }, + { + "epoch": 0.17870252654574104, + "grad_norm": 1.758785605430603, + "learning_rate": 0.00018862591018916155, + "loss": 1.5733, + "step": 4990 + }, + { + "epoch": 0.17873833867530933, + "grad_norm": 1.595152497291565, + "learning_rate": 0.00018862053704834925, + "loss": 1.4687, + "step": 4991 + }, + { + "epoch": 0.1787741508048776, + "grad_norm": 2.3240230083465576, + "learning_rate": 0.00018861516271526191, + "loss": 1.4684, + "step": 4992 + }, + { + "epoch": 0.1788099629344459, + "grad_norm": 1.5655384063720703, + "learning_rate": 0.00018860978718997185, + "loss": 1.8508, + "step": 4993 + }, + { + "epoch": 0.1788457750640142, + "grad_norm": 1.9290118217468262, + "learning_rate": 0.00018860441047255144, + "loss": 1.6378, + "step": 4994 + }, + { + "epoch": 0.17888158719358246, + "grad_norm": 1.8785892724990845, + "learning_rate": 0.00018859903256307297, + "loss": 1.8559, + "step": 4995 + }, + { + "epoch": 0.17891739932315076, + "grad_norm": 1.6702314615249634, + "learning_rate": 0.00018859365346160877, + "loss": 1.6963, + "step": 4996 + }, + { + "epoch": 0.17895321145271903, + "grad_norm": 2.097177743911743, + "learning_rate": 0.00018858827316823126, + "loss": 1.4722, + "step": 4997 + }, + { + "epoch": 0.17898902358228733, + "grad_norm": 1.577393889427185, + "learning_rate": 0.0001885828916830128, + "loss": 1.5131, + "step": 4998 + }, + { + "epoch": 0.1790248357118556, + "grad_norm": 2.048555374145508, + "learning_rate": 0.00018857750900602583, + "loss": 1.8445, + "step": 4999 + }, + { + "epoch": 0.1790606478414239, + "grad_norm": 1.83433997631073, + "learning_rate": 0.00018857212513734268, + "loss": 1.348, + "step": 5000 + }, + { + "epoch": 0.17909645997099216, + "grad_norm": 1.6511319875717163, + "learning_rate": 0.00018856674007703585, + "loss": 1.7947, + "step": 5001 + }, + { + "epoch": 0.17913227210056046, + "grad_norm": 1.9509414434432983, + "learning_rate": 0.0001885613538251778, + "loss": 1.6304, + "step": 5002 + }, + { + "epoch": 0.17916808423012875, + "grad_norm": 1.4391506910324097, + "learning_rate": 0.00018855596638184095, + "loss": 1.5117, + "step": 5003 + }, + { + "epoch": 0.17920389635969702, + "grad_norm": 1.9456406831741333, + "learning_rate": 0.0001885505777470978, + "loss": 1.6562, + "step": 5004 + }, + { + "epoch": 0.17923970848926532, + "grad_norm": 1.2658982276916504, + "learning_rate": 0.00018854518792102084, + "loss": 1.4669, + "step": 5005 + }, + { + "epoch": 0.1792755206188336, + "grad_norm": 1.4607932567596436, + "learning_rate": 0.00018853979690368263, + "loss": 1.8091, + "step": 5006 + }, + { + "epoch": 0.17931133274840189, + "grad_norm": 1.4325487613677979, + "learning_rate": 0.0001885344046951556, + "loss": 1.6542, + "step": 5007 + }, + { + "epoch": 0.17934714487797015, + "grad_norm": 1.477703332901001, + "learning_rate": 0.00018852901129551243, + "loss": 1.5072, + "step": 5008 + }, + { + "epoch": 0.17938295700753845, + "grad_norm": 2.172150135040283, + "learning_rate": 0.00018852361670482556, + "loss": 1.7455, + "step": 5009 + }, + { + "epoch": 0.17941876913710675, + "grad_norm": 1.7485425472259521, + "learning_rate": 0.00018851822092316763, + "loss": 1.8077, + "step": 5010 + }, + { + "epoch": 0.17945458126667502, + "grad_norm": 1.443163514137268, + "learning_rate": 0.00018851282395061122, + "loss": 1.3942, + "step": 5011 + }, + { + "epoch": 0.17949039339624331, + "grad_norm": 1.8629781007766724, + "learning_rate": 0.00018850742578722894, + "loss": 1.6342, + "step": 5012 + }, + { + "epoch": 0.17952620552581158, + "grad_norm": 2.062121629714966, + "learning_rate": 0.00018850202643309338, + "loss": 1.6435, + "step": 5013 + }, + { + "epoch": 0.17956201765537988, + "grad_norm": 1.654205083847046, + "learning_rate": 0.00018849662588827723, + "loss": 1.4998, + "step": 5014 + }, + { + "epoch": 0.17959782978494815, + "grad_norm": 2.017118453979492, + "learning_rate": 0.00018849122415285315, + "loss": 1.6594, + "step": 5015 + }, + { + "epoch": 0.17963364191451645, + "grad_norm": 1.514277696609497, + "learning_rate": 0.00018848582122689376, + "loss": 1.5694, + "step": 5016 + }, + { + "epoch": 0.17966945404408474, + "grad_norm": 1.299932599067688, + "learning_rate": 0.0001884804171104718, + "loss": 1.6352, + "step": 5017 + }, + { + "epoch": 0.179705266173653, + "grad_norm": 2.4143078327178955, + "learning_rate": 0.00018847501180365995, + "loss": 1.9637, + "step": 5018 + }, + { + "epoch": 0.1797410783032213, + "grad_norm": 1.4530065059661865, + "learning_rate": 0.000188469605306531, + "loss": 1.6145, + "step": 5019 + }, + { + "epoch": 0.17977689043278958, + "grad_norm": 1.9624786376953125, + "learning_rate": 0.00018846419761915753, + "loss": 1.3612, + "step": 5020 + }, + { + "epoch": 0.17981270256235787, + "grad_norm": 1.5147650241851807, + "learning_rate": 0.00018845878874161249, + "loss": 1.127, + "step": 5021 + }, + { + "epoch": 0.17984851469192614, + "grad_norm": 1.562705636024475, + "learning_rate": 0.00018845337867396848, + "loss": 1.6414, + "step": 5022 + }, + { + "epoch": 0.17988432682149444, + "grad_norm": 1.8817349672317505, + "learning_rate": 0.0001884479674162984, + "loss": 1.8362, + "step": 5023 + }, + { + "epoch": 0.17992013895106274, + "grad_norm": 1.2586948871612549, + "learning_rate": 0.00018844255496867497, + "loss": 1.4049, + "step": 5024 + }, + { + "epoch": 0.179955951080631, + "grad_norm": 1.6511366367340088, + "learning_rate": 0.00018843714133117106, + "loss": 1.6536, + "step": 5025 + }, + { + "epoch": 0.1799917632101993, + "grad_norm": 1.656224012374878, + "learning_rate": 0.0001884317265038595, + "loss": 1.6308, + "step": 5026 + }, + { + "epoch": 0.18002757533976757, + "grad_norm": 2.134478807449341, + "learning_rate": 0.00018842631048681313, + "loss": 1.3212, + "step": 5027 + }, + { + "epoch": 0.18006338746933587, + "grad_norm": 1.6021968126296997, + "learning_rate": 0.00018842089328010482, + "loss": 1.7667, + "step": 5028 + }, + { + "epoch": 0.18009919959890414, + "grad_norm": 2.7165796756744385, + "learning_rate": 0.00018841547488380745, + "loss": 1.7146, + "step": 5029 + }, + { + "epoch": 0.18013501172847243, + "grad_norm": 1.9333155155181885, + "learning_rate": 0.00018841005529799388, + "loss": 1.6024, + "step": 5030 + }, + { + "epoch": 0.18017082385804073, + "grad_norm": 1.365800380706787, + "learning_rate": 0.00018840463452273707, + "loss": 1.4516, + "step": 5031 + }, + { + "epoch": 0.180206635987609, + "grad_norm": 1.4623908996582031, + "learning_rate": 0.00018839921255810996, + "loss": 1.6984, + "step": 5032 + }, + { + "epoch": 0.1802424481171773, + "grad_norm": 1.6998884677886963, + "learning_rate": 0.00018839378940418544, + "loss": 1.4623, + "step": 5033 + }, + { + "epoch": 0.18027826024674556, + "grad_norm": 2.1139144897460938, + "learning_rate": 0.00018838836506103652, + "loss": 1.5678, + "step": 5034 + }, + { + "epoch": 0.18031407237631386, + "grad_norm": 2.4886868000030518, + "learning_rate": 0.00018838293952873616, + "loss": 1.8086, + "step": 5035 + }, + { + "epoch": 0.18034988450588213, + "grad_norm": 1.4761202335357666, + "learning_rate": 0.0001883775128073573, + "loss": 1.8566, + "step": 5036 + }, + { + "epoch": 0.18038569663545043, + "grad_norm": 1.5243207216262817, + "learning_rate": 0.0001883720848969731, + "loss": 1.5345, + "step": 5037 + }, + { + "epoch": 0.18042150876501872, + "grad_norm": 2.0490150451660156, + "learning_rate": 0.00018836665579765642, + "loss": 1.3139, + "step": 5038 + }, + { + "epoch": 0.180457320894587, + "grad_norm": 1.9801737070083618, + "learning_rate": 0.0001883612255094804, + "loss": 1.6783, + "step": 5039 + }, + { + "epoch": 0.1804931330241553, + "grad_norm": 1.588937520980835, + "learning_rate": 0.00018835579403251806, + "loss": 1.4642, + "step": 5040 + }, + { + "epoch": 0.18052894515372356, + "grad_norm": 1.7436013221740723, + "learning_rate": 0.00018835036136684248, + "loss": 1.7724, + "step": 5041 + }, + { + "epoch": 0.18056475728329185, + "grad_norm": 1.4341626167297363, + "learning_rate": 0.00018834492751252678, + "loss": 1.5823, + "step": 5042 + }, + { + "epoch": 0.18060056941286012, + "grad_norm": 1.5773844718933105, + "learning_rate": 0.000188339492469644, + "loss": 1.5664, + "step": 5043 + }, + { + "epoch": 0.18063638154242842, + "grad_norm": 2.491431474685669, + "learning_rate": 0.0001883340562382673, + "loss": 2.0062, + "step": 5044 + }, + { + "epoch": 0.18067219367199672, + "grad_norm": 1.8883206844329834, + "learning_rate": 0.00018832861881846983, + "loss": 1.6656, + "step": 5045 + }, + { + "epoch": 0.18070800580156499, + "grad_norm": 1.7385650873184204, + "learning_rate": 0.00018832318021032472, + "loss": 1.637, + "step": 5046 + }, + { + "epoch": 0.18074381793113328, + "grad_norm": 1.5121614933013916, + "learning_rate": 0.00018831774041390517, + "loss": 1.1038, + "step": 5047 + }, + { + "epoch": 0.18077963006070155, + "grad_norm": 1.1999789476394653, + "learning_rate": 0.00018831229942928434, + "loss": 1.5409, + "step": 5048 + }, + { + "epoch": 0.18081544219026985, + "grad_norm": 1.249335527420044, + "learning_rate": 0.0001883068572565354, + "loss": 1.3306, + "step": 5049 + }, + { + "epoch": 0.18085125431983812, + "grad_norm": 1.4928827285766602, + "learning_rate": 0.00018830141389573166, + "loss": 1.699, + "step": 5050 + }, + { + "epoch": 0.18088706644940641, + "grad_norm": 1.2425192594528198, + "learning_rate": 0.00018829596934694624, + "loss": 1.6774, + "step": 5051 + }, + { + "epoch": 0.1809228785789747, + "grad_norm": 1.6975491046905518, + "learning_rate": 0.0001882905236102525, + "loss": 1.6176, + "step": 5052 + }, + { + "epoch": 0.18095869070854298, + "grad_norm": 1.3166990280151367, + "learning_rate": 0.0001882850766857236, + "loss": 1.3818, + "step": 5053 + }, + { + "epoch": 0.18099450283811128, + "grad_norm": 1.856614351272583, + "learning_rate": 0.0001882796285734329, + "loss": 1.5886, + "step": 5054 + }, + { + "epoch": 0.18103031496767955, + "grad_norm": 1.364363670349121, + "learning_rate": 0.0001882741792734537, + "loss": 1.4846, + "step": 5055 + }, + { + "epoch": 0.18106612709724784, + "grad_norm": 1.5802984237670898, + "learning_rate": 0.00018826872878585925, + "loss": 1.5028, + "step": 5056 + }, + { + "epoch": 0.1811019392268161, + "grad_norm": 1.4129685163497925, + "learning_rate": 0.0001882632771107229, + "loss": 1.6633, + "step": 5057 + }, + { + "epoch": 0.1811377513563844, + "grad_norm": 1.0658329725265503, + "learning_rate": 0.00018825782424811802, + "loss": 1.4006, + "step": 5058 + }, + { + "epoch": 0.1811735634859527, + "grad_norm": 1.5389435291290283, + "learning_rate": 0.00018825237019811796, + "loss": 1.7775, + "step": 5059 + }, + { + "epoch": 0.18120937561552097, + "grad_norm": 1.2066320180892944, + "learning_rate": 0.0001882469149607961, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 0.18124518774508927, + "grad_norm": 1.2888737916946411, + "learning_rate": 0.00018824145853622582, + "loss": 1.6816, + "step": 5061 + }, + { + "epoch": 0.18128099987465754, + "grad_norm": 1.9192713499069214, + "learning_rate": 0.00018823600092448054, + "loss": 1.6496, + "step": 5062 + }, + { + "epoch": 0.18131681200422584, + "grad_norm": 1.499048113822937, + "learning_rate": 0.0001882305421256337, + "loss": 1.4216, + "step": 5063 + }, + { + "epoch": 0.1813526241337941, + "grad_norm": 1.7019413709640503, + "learning_rate": 0.0001882250821397587, + "loss": 1.5245, + "step": 5064 + }, + { + "epoch": 0.1813884362633624, + "grad_norm": 1.9641376733779907, + "learning_rate": 0.00018821962096692905, + "loss": 1.622, + "step": 5065 + }, + { + "epoch": 0.1814242483929307, + "grad_norm": 2.3913800716400146, + "learning_rate": 0.00018821415860721818, + "loss": 1.2029, + "step": 5066 + }, + { + "epoch": 0.18146006052249897, + "grad_norm": 1.3694190979003906, + "learning_rate": 0.0001882086950606996, + "loss": 1.6648, + "step": 5067 + }, + { + "epoch": 0.18149587265206726, + "grad_norm": 1.5379868745803833, + "learning_rate": 0.0001882032303274468, + "loss": 1.4357, + "step": 5068 + }, + { + "epoch": 0.18153168478163553, + "grad_norm": 1.7823456525802612, + "learning_rate": 0.0001881977644075333, + "loss": 1.4252, + "step": 5069 + }, + { + "epoch": 0.18156749691120383, + "grad_norm": 1.8627572059631348, + "learning_rate": 0.00018819229730103267, + "loss": 1.6875, + "step": 5070 + }, + { + "epoch": 0.1816033090407721, + "grad_norm": 1.8082793951034546, + "learning_rate": 0.00018818682900801842, + "loss": 1.451, + "step": 5071 + }, + { + "epoch": 0.1816391211703404, + "grad_norm": 1.3875577449798584, + "learning_rate": 0.00018818135952856414, + "loss": 1.6158, + "step": 5072 + }, + { + "epoch": 0.1816749332999087, + "grad_norm": 1.2505030632019043, + "learning_rate": 0.00018817588886274345, + "loss": 1.7009, + "step": 5073 + }, + { + "epoch": 0.18171074542947696, + "grad_norm": 1.660299301147461, + "learning_rate": 0.00018817041701062987, + "loss": 1.666, + "step": 5074 + }, + { + "epoch": 0.18174655755904526, + "grad_norm": 1.854568600654602, + "learning_rate": 0.00018816494397229708, + "loss": 1.6123, + "step": 5075 + }, + { + "epoch": 0.18178236968861353, + "grad_norm": 2.6197986602783203, + "learning_rate": 0.0001881594697478187, + "loss": 1.5637, + "step": 5076 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.150740623474121, + "learning_rate": 0.00018815399433726835, + "loss": 1.6449, + "step": 5077 + }, + { + "epoch": 0.1818539939477501, + "grad_norm": 3.9189467430114746, + "learning_rate": 0.00018814851774071972, + "loss": 1.6728, + "step": 5078 + }, + { + "epoch": 0.1818898060773184, + "grad_norm": 1.6135289669036865, + "learning_rate": 0.00018814303995824653, + "loss": 1.6037, + "step": 5079 + }, + { + "epoch": 0.18192561820688669, + "grad_norm": 1.3914783000946045, + "learning_rate": 0.00018813756098992238, + "loss": 1.5109, + "step": 5080 + }, + { + "epoch": 0.18196143033645495, + "grad_norm": 1.7650524377822876, + "learning_rate": 0.00018813208083582106, + "loss": 1.44, + "step": 5081 + }, + { + "epoch": 0.18199724246602325, + "grad_norm": 1.4060581922531128, + "learning_rate": 0.00018812659949601627, + "loss": 1.5721, + "step": 5082 + }, + { + "epoch": 0.18203305459559152, + "grad_norm": 1.284258246421814, + "learning_rate": 0.00018812111697058174, + "loss": 1.5441, + "step": 5083 + }, + { + "epoch": 0.18206886672515982, + "grad_norm": 2.5756237506866455, + "learning_rate": 0.0001881156332595913, + "loss": 1.463, + "step": 5084 + }, + { + "epoch": 0.18210467885472809, + "grad_norm": 1.4949012994766235, + "learning_rate": 0.00018811014836311865, + "loss": 1.4694, + "step": 5085 + }, + { + "epoch": 0.18214049098429638, + "grad_norm": 1.3897700309753418, + "learning_rate": 0.00018810466228123758, + "loss": 1.5629, + "step": 5086 + }, + { + "epoch": 0.18217630311386468, + "grad_norm": 1.605708360671997, + "learning_rate": 0.00018809917501402197, + "loss": 1.4193, + "step": 5087 + }, + { + "epoch": 0.18221211524343295, + "grad_norm": 1.3950620889663696, + "learning_rate": 0.00018809368656154556, + "loss": 1.4448, + "step": 5088 + }, + { + "epoch": 0.18224792737300125, + "grad_norm": 1.950050950050354, + "learning_rate": 0.00018808819692388225, + "loss": 1.4997, + "step": 5089 + }, + { + "epoch": 0.18228373950256951, + "grad_norm": 1.9414753913879395, + "learning_rate": 0.00018808270610110584, + "loss": 1.3708, + "step": 5090 + }, + { + "epoch": 0.1823195516321378, + "grad_norm": 1.3951416015625, + "learning_rate": 0.0001880772140932903, + "loss": 1.6992, + "step": 5091 + }, + { + "epoch": 0.18235536376170608, + "grad_norm": 1.6158993244171143, + "learning_rate": 0.00018807172090050945, + "loss": 1.6278, + "step": 5092 + }, + { + "epoch": 0.18239117589127438, + "grad_norm": 1.6947451829910278, + "learning_rate": 0.00018806622652283713, + "loss": 1.5909, + "step": 5093 + }, + { + "epoch": 0.18242698802084267, + "grad_norm": 1.4441710710525513, + "learning_rate": 0.0001880607309603474, + "loss": 1.4248, + "step": 5094 + }, + { + "epoch": 0.18246280015041094, + "grad_norm": 1.387609839439392, + "learning_rate": 0.00018805523421311411, + "loss": 1.6221, + "step": 5095 + }, + { + "epoch": 0.18249861227997924, + "grad_norm": 1.3998216390609741, + "learning_rate": 0.0001880497362812112, + "loss": 1.7985, + "step": 5096 + }, + { + "epoch": 0.1825344244095475, + "grad_norm": 2.511861562728882, + "learning_rate": 0.00018804423716471268, + "loss": 1.4662, + "step": 5097 + }, + { + "epoch": 0.1825702365391158, + "grad_norm": 1.66793954372406, + "learning_rate": 0.00018803873686369253, + "loss": 1.3022, + "step": 5098 + }, + { + "epoch": 0.18260604866868407, + "grad_norm": 1.4244165420532227, + "learning_rate": 0.00018803323537822472, + "loss": 1.5109, + "step": 5099 + }, + { + "epoch": 0.18264186079825237, + "grad_norm": 1.2689194679260254, + "learning_rate": 0.00018802773270838329, + "loss": 1.4388, + "step": 5100 + }, + { + "epoch": 0.18267767292782064, + "grad_norm": 1.5355168581008911, + "learning_rate": 0.0001880222288542423, + "loss": 1.3934, + "step": 5101 + }, + { + "epoch": 0.18271348505738894, + "grad_norm": 1.695492148399353, + "learning_rate": 0.0001880167238158757, + "loss": 1.6925, + "step": 5102 + }, + { + "epoch": 0.18274929718695723, + "grad_norm": 1.5693861246109009, + "learning_rate": 0.00018801121759335764, + "loss": 1.6833, + "step": 5103 + }, + { + "epoch": 0.1827851093165255, + "grad_norm": 1.766347050666809, + "learning_rate": 0.0001880057101867622, + "loss": 1.4459, + "step": 5104 + }, + { + "epoch": 0.1828209214460938, + "grad_norm": 1.3743687868118286, + "learning_rate": 0.00018800020159616342, + "loss": 1.7515, + "step": 5105 + }, + { + "epoch": 0.18285673357566207, + "grad_norm": 1.5413109064102173, + "learning_rate": 0.00018799469182163544, + "loss": 1.5712, + "step": 5106 + }, + { + "epoch": 0.18289254570523036, + "grad_norm": 2.0827620029449463, + "learning_rate": 0.00018798918086325236, + "loss": 1.8366, + "step": 5107 + }, + { + "epoch": 0.18292835783479863, + "grad_norm": 1.954444408416748, + "learning_rate": 0.0001879836687210884, + "loss": 1.8004, + "step": 5108 + }, + { + "epoch": 0.18296416996436693, + "grad_norm": 1.6314371824264526, + "learning_rate": 0.00018797815539521763, + "loss": 1.6223, + "step": 5109 + }, + { + "epoch": 0.18299998209393523, + "grad_norm": 1.643937349319458, + "learning_rate": 0.00018797264088571427, + "loss": 1.9055, + "step": 5110 + }, + { + "epoch": 0.1830357942235035, + "grad_norm": 2.121596574783325, + "learning_rate": 0.0001879671251926525, + "loss": 1.7183, + "step": 5111 + }, + { + "epoch": 0.1830716063530718, + "grad_norm": 1.6858540773391724, + "learning_rate": 0.00018796160831610655, + "loss": 1.7874, + "step": 5112 + }, + { + "epoch": 0.18310741848264006, + "grad_norm": 1.651357889175415, + "learning_rate": 0.00018795609025615062, + "loss": 1.6096, + "step": 5113 + }, + { + "epoch": 0.18314323061220836, + "grad_norm": 1.636696457862854, + "learning_rate": 0.00018795057101285895, + "loss": 1.7267, + "step": 5114 + }, + { + "epoch": 0.18317904274177663, + "grad_norm": 2.019561529159546, + "learning_rate": 0.0001879450505863058, + "loss": 1.5251, + "step": 5115 + }, + { + "epoch": 0.18321485487134492, + "grad_norm": 1.834036946296692, + "learning_rate": 0.00018793952897656544, + "loss": 1.4337, + "step": 5116 + }, + { + "epoch": 0.18325066700091322, + "grad_norm": 1.3989968299865723, + "learning_rate": 0.00018793400618371213, + "loss": 1.2123, + "step": 5117 + }, + { + "epoch": 0.1832864791304815, + "grad_norm": 1.2780959606170654, + "learning_rate": 0.0001879284822078202, + "loss": 1.4671, + "step": 5118 + }, + { + "epoch": 0.18332229126004979, + "grad_norm": 2.15604567527771, + "learning_rate": 0.00018792295704896396, + "loss": 1.5538, + "step": 5119 + }, + { + "epoch": 0.18335810338961805, + "grad_norm": 1.4170936346054077, + "learning_rate": 0.00018791743070721776, + "loss": 1.9131, + "step": 5120 + }, + { + "epoch": 0.18339391551918635, + "grad_norm": 1.7687115669250488, + "learning_rate": 0.0001879119031826559, + "loss": 1.6419, + "step": 5121 + }, + { + "epoch": 0.18342972764875462, + "grad_norm": 1.4007076025009155, + "learning_rate": 0.00018790637447535283, + "loss": 1.701, + "step": 5122 + }, + { + "epoch": 0.18346553977832292, + "grad_norm": 1.8551342487335205, + "learning_rate": 0.00018790084458538285, + "loss": 1.7679, + "step": 5123 + }, + { + "epoch": 0.1835013519078912, + "grad_norm": 1.933009386062622, + "learning_rate": 0.0001878953135128204, + "loss": 1.6205, + "step": 5124 + }, + { + "epoch": 0.18353716403745948, + "grad_norm": 1.9234230518341064, + "learning_rate": 0.00018788978125773987, + "loss": 1.4022, + "step": 5125 + }, + { + "epoch": 0.18357297616702778, + "grad_norm": 1.5829157829284668, + "learning_rate": 0.0001878842478202157, + "loss": 1.4791, + "step": 5126 + }, + { + "epoch": 0.18360878829659605, + "grad_norm": 1.4500535726547241, + "learning_rate": 0.00018787871320032236, + "loss": 1.6684, + "step": 5127 + }, + { + "epoch": 0.18364460042616434, + "grad_norm": 2.3648834228515625, + "learning_rate": 0.0001878731773981343, + "loss": 1.9178, + "step": 5128 + }, + { + "epoch": 0.18368041255573261, + "grad_norm": 2.026014566421509, + "learning_rate": 0.00018786764041372594, + "loss": 1.7238, + "step": 5129 + }, + { + "epoch": 0.1837162246853009, + "grad_norm": 2.3626532554626465, + "learning_rate": 0.00018786210224717184, + "loss": 1.6493, + "step": 5130 + }, + { + "epoch": 0.1837520368148692, + "grad_norm": 1.9708654880523682, + "learning_rate": 0.0001878565628985465, + "loss": 1.7803, + "step": 5131 + }, + { + "epoch": 0.18378784894443748, + "grad_norm": 2.1790413856506348, + "learning_rate": 0.00018785102236792444, + "loss": 1.5954, + "step": 5132 + }, + { + "epoch": 0.18382366107400577, + "grad_norm": 1.556118369102478, + "learning_rate": 0.00018784548065538018, + "loss": 1.7467, + "step": 5133 + }, + { + "epoch": 0.18385947320357404, + "grad_norm": 1.646291971206665, + "learning_rate": 0.0001878399377609883, + "loss": 1.3901, + "step": 5134 + }, + { + "epoch": 0.18389528533314234, + "grad_norm": 1.8888112306594849, + "learning_rate": 0.00018783439368482335, + "loss": 1.8621, + "step": 5135 + }, + { + "epoch": 0.1839310974627106, + "grad_norm": 1.5839077234268188, + "learning_rate": 0.00018782884842695992, + "loss": 1.3945, + "step": 5136 + }, + { + "epoch": 0.1839669095922789, + "grad_norm": 1.7424887418746948, + "learning_rate": 0.00018782330198747265, + "loss": 1.5983, + "step": 5137 + }, + { + "epoch": 0.1840027217218472, + "grad_norm": 1.7385514974594116, + "learning_rate": 0.0001878177543664361, + "loss": 1.8294, + "step": 5138 + }, + { + "epoch": 0.18403853385141547, + "grad_norm": 1.573832392692566, + "learning_rate": 0.00018781220556392497, + "loss": 1.6636, + "step": 5139 + }, + { + "epoch": 0.18407434598098377, + "grad_norm": 2.315880537033081, + "learning_rate": 0.00018780665558001388, + "loss": 1.6846, + "step": 5140 + }, + { + "epoch": 0.18411015811055204, + "grad_norm": 1.4497591257095337, + "learning_rate": 0.00018780110441477752, + "loss": 1.4998, + "step": 5141 + }, + { + "epoch": 0.18414597024012033, + "grad_norm": 1.6672792434692383, + "learning_rate": 0.00018779555206829054, + "loss": 1.6283, + "step": 5142 + }, + { + "epoch": 0.1841817823696886, + "grad_norm": 1.4344167709350586, + "learning_rate": 0.00018778999854062765, + "loss": 1.3823, + "step": 5143 + }, + { + "epoch": 0.1842175944992569, + "grad_norm": 1.8563214540481567, + "learning_rate": 0.00018778444383186357, + "loss": 1.5636, + "step": 5144 + }, + { + "epoch": 0.1842534066288252, + "grad_norm": 1.5528720617294312, + "learning_rate": 0.00018777888794207302, + "loss": 1.5238, + "step": 5145 + }, + { + "epoch": 0.18428921875839346, + "grad_norm": 1.825433373451233, + "learning_rate": 0.0001877733308713308, + "loss": 1.3471, + "step": 5146 + }, + { + "epoch": 0.18432503088796176, + "grad_norm": 1.9533745050430298, + "learning_rate": 0.00018776777261971162, + "loss": 1.5054, + "step": 5147 + }, + { + "epoch": 0.18436084301753003, + "grad_norm": 2.1287336349487305, + "learning_rate": 0.00018776221318729026, + "loss": 1.8556, + "step": 5148 + }, + { + "epoch": 0.18439665514709833, + "grad_norm": 3.0732877254486084, + "learning_rate": 0.00018775665257414153, + "loss": 1.6433, + "step": 5149 + }, + { + "epoch": 0.1844324672766666, + "grad_norm": 1.8303265571594238, + "learning_rate": 0.00018775109078034022, + "loss": 1.4745, + "step": 5150 + }, + { + "epoch": 0.1844682794062349, + "grad_norm": 1.7558493614196777, + "learning_rate": 0.00018774552780596117, + "loss": 1.6524, + "step": 5151 + }, + { + "epoch": 0.1845040915358032, + "grad_norm": 1.354691505432129, + "learning_rate": 0.00018773996365107926, + "loss": 1.0427, + "step": 5152 + }, + { + "epoch": 0.18453990366537146, + "grad_norm": 1.8739055395126343, + "learning_rate": 0.00018773439831576929, + "loss": 1.2968, + "step": 5153 + }, + { + "epoch": 0.18457571579493975, + "grad_norm": 1.5816646814346313, + "learning_rate": 0.00018772883180010616, + "loss": 1.5516, + "step": 5154 + }, + { + "epoch": 0.18461152792450802, + "grad_norm": 1.3845489025115967, + "learning_rate": 0.0001877232641041648, + "loss": 1.3547, + "step": 5155 + }, + { + "epoch": 0.18464734005407632, + "grad_norm": 1.0942845344543457, + "learning_rate": 0.00018771769522802004, + "loss": 1.6058, + "step": 5156 + }, + { + "epoch": 0.1846831521836446, + "grad_norm": 2.4216606616973877, + "learning_rate": 0.00018771212517174686, + "loss": 1.8014, + "step": 5157 + }, + { + "epoch": 0.18471896431321289, + "grad_norm": 2.278639554977417, + "learning_rate": 0.00018770655393542012, + "loss": 1.574, + "step": 5158 + }, + { + "epoch": 0.18475477644278118, + "grad_norm": 1.3788586854934692, + "learning_rate": 0.0001877009815191149, + "loss": 1.4745, + "step": 5159 + }, + { + "epoch": 0.18479058857234945, + "grad_norm": 1.4675894975662231, + "learning_rate": 0.00018769540792290608, + "loss": 1.5777, + "step": 5160 + }, + { + "epoch": 0.18482640070191775, + "grad_norm": 1.862516164779663, + "learning_rate": 0.00018768983314686866, + "loss": 1.5242, + "step": 5161 + }, + { + "epoch": 0.18486221283148602, + "grad_norm": 1.9851064682006836, + "learning_rate": 0.00018768425719107765, + "loss": 1.5826, + "step": 5162 + }, + { + "epoch": 0.1848980249610543, + "grad_norm": 1.6969146728515625, + "learning_rate": 0.00018767868005560806, + "loss": 1.6246, + "step": 5163 + }, + { + "epoch": 0.18493383709062258, + "grad_norm": 1.3387763500213623, + "learning_rate": 0.0001876731017405349, + "loss": 1.4774, + "step": 5164 + }, + { + "epoch": 0.18496964922019088, + "grad_norm": 1.9716835021972656, + "learning_rate": 0.0001876675222459333, + "loss": 1.3885, + "step": 5165 + }, + { + "epoch": 0.18500546134975918, + "grad_norm": 3.473733901977539, + "learning_rate": 0.0001876619415718782, + "loss": 1.4541, + "step": 5166 + }, + { + "epoch": 0.18504127347932744, + "grad_norm": 1.877964973449707, + "learning_rate": 0.00018765635971844483, + "loss": 1.563, + "step": 5167 + }, + { + "epoch": 0.18507708560889574, + "grad_norm": 2.445122718811035, + "learning_rate": 0.00018765077668570816, + "loss": 1.7484, + "step": 5168 + }, + { + "epoch": 0.185112897738464, + "grad_norm": 1.877567172050476, + "learning_rate": 0.00018764519247374336, + "loss": 1.5732, + "step": 5169 + }, + { + "epoch": 0.1851487098680323, + "grad_norm": 2.211268663406372, + "learning_rate": 0.00018763960708262557, + "loss": 1.684, + "step": 5170 + }, + { + "epoch": 0.18518452199760058, + "grad_norm": 2.7970688343048096, + "learning_rate": 0.0001876340205124299, + "loss": 1.4222, + "step": 5171 + }, + { + "epoch": 0.18522033412716887, + "grad_norm": 1.5880224704742432, + "learning_rate": 0.00018762843276323151, + "loss": 1.824, + "step": 5172 + }, + { + "epoch": 0.18525614625673717, + "grad_norm": 1.6042425632476807, + "learning_rate": 0.0001876228438351056, + "loss": 1.6025, + "step": 5173 + }, + { + "epoch": 0.18529195838630544, + "grad_norm": 1.544500470161438, + "learning_rate": 0.00018761725372812735, + "loss": 1.7088, + "step": 5174 + }, + { + "epoch": 0.18532777051587374, + "grad_norm": 2.324397325515747, + "learning_rate": 0.00018761166244237197, + "loss": 1.5568, + "step": 5175 + }, + { + "epoch": 0.185363582645442, + "grad_norm": 3.774132013320923, + "learning_rate": 0.00018760606997791468, + "loss": 1.2232, + "step": 5176 + }, + { + "epoch": 0.1853993947750103, + "grad_norm": 1.6885324716567993, + "learning_rate": 0.00018760047633483074, + "loss": 1.7845, + "step": 5177 + }, + { + "epoch": 0.18543520690457857, + "grad_norm": 1.3358042240142822, + "learning_rate": 0.00018759488151319539, + "loss": 1.4645, + "step": 5178 + }, + { + "epoch": 0.18547101903414687, + "grad_norm": 1.6287620067596436, + "learning_rate": 0.00018758928551308385, + "loss": 1.5811, + "step": 5179 + }, + { + "epoch": 0.18550683116371516, + "grad_norm": 1.919089674949646, + "learning_rate": 0.0001875836883345715, + "loss": 1.4477, + "step": 5180 + }, + { + "epoch": 0.18554264329328343, + "grad_norm": 1.681052803993225, + "learning_rate": 0.00018757808997773358, + "loss": 1.3891, + "step": 5181 + }, + { + "epoch": 0.18557845542285173, + "grad_norm": 1.7119311094284058, + "learning_rate": 0.00018757249044264542, + "loss": 1.5623, + "step": 5182 + }, + { + "epoch": 0.18561426755242, + "grad_norm": 1.74424147605896, + "learning_rate": 0.00018756688972938239, + "loss": 1.5426, + "step": 5183 + }, + { + "epoch": 0.1856500796819883, + "grad_norm": 1.6085398197174072, + "learning_rate": 0.0001875612878380198, + "loss": 1.9637, + "step": 5184 + }, + { + "epoch": 0.18568589181155656, + "grad_norm": 1.232580542564392, + "learning_rate": 0.00018755568476863302, + "loss": 1.7271, + "step": 5185 + }, + { + "epoch": 0.18572170394112486, + "grad_norm": 1.6930204629898071, + "learning_rate": 0.00018755008052129743, + "loss": 1.7984, + "step": 5186 + }, + { + "epoch": 0.18575751607069316, + "grad_norm": 2.1576590538024902, + "learning_rate": 0.00018754447509608847, + "loss": 1.378, + "step": 5187 + }, + { + "epoch": 0.18579332820026143, + "grad_norm": 1.799008846282959, + "learning_rate": 0.0001875388684930815, + "loss": 1.2764, + "step": 5188 + }, + { + "epoch": 0.18582914032982972, + "grad_norm": 1.5881048440933228, + "learning_rate": 0.00018753326071235197, + "loss": 1.6597, + "step": 5189 + }, + { + "epoch": 0.185864952459398, + "grad_norm": 2.208606719970703, + "learning_rate": 0.00018752765175397533, + "loss": 1.6626, + "step": 5190 + }, + { + "epoch": 0.1859007645889663, + "grad_norm": 1.4758398532867432, + "learning_rate": 0.00018752204161802706, + "loss": 1.6252, + "step": 5191 + }, + { + "epoch": 0.18593657671853456, + "grad_norm": 1.7125110626220703, + "learning_rate": 0.00018751643030458256, + "loss": 1.5789, + "step": 5192 + }, + { + "epoch": 0.18597238884810285, + "grad_norm": 2.279865264892578, + "learning_rate": 0.00018751081781371743, + "loss": 1.5543, + "step": 5193 + }, + { + "epoch": 0.18600820097767115, + "grad_norm": 1.559960126876831, + "learning_rate": 0.00018750520414550711, + "loss": 1.4844, + "step": 5194 + }, + { + "epoch": 0.18604401310723942, + "grad_norm": 1.2859904766082764, + "learning_rate": 0.00018749958930002717, + "loss": 1.5597, + "step": 5195 + }, + { + "epoch": 0.18607982523680772, + "grad_norm": 2.042282819747925, + "learning_rate": 0.00018749397327735308, + "loss": 1.2362, + "step": 5196 + }, + { + "epoch": 0.18611563736637599, + "grad_norm": 2.044483184814453, + "learning_rate": 0.00018748835607756045, + "loss": 1.631, + "step": 5197 + }, + { + "epoch": 0.18615144949594428, + "grad_norm": 1.2718511819839478, + "learning_rate": 0.00018748273770072485, + "loss": 1.5658, + "step": 5198 + }, + { + "epoch": 0.18618726162551255, + "grad_norm": 1.6728475093841553, + "learning_rate": 0.00018747711814692185, + "loss": 1.897, + "step": 5199 + }, + { + "epoch": 0.18622307375508085, + "grad_norm": 1.7691491842269897, + "learning_rate": 0.00018747149741622706, + "loss": 1.6203, + "step": 5200 + }, + { + "epoch": 0.18625888588464912, + "grad_norm": 1.5065457820892334, + "learning_rate": 0.0001874658755087161, + "loss": 1.805, + "step": 5201 + }, + { + "epoch": 0.1862946980142174, + "grad_norm": 1.8144469261169434, + "learning_rate": 0.00018746025242446463, + "loss": 1.5208, + "step": 5202 + }, + { + "epoch": 0.1863305101437857, + "grad_norm": 1.7388842105865479, + "learning_rate": 0.00018745462816354826, + "loss": 1.4172, + "step": 5203 + }, + { + "epoch": 0.18636632227335398, + "grad_norm": 1.3475241661071777, + "learning_rate": 0.0001874490027260427, + "loss": 1.6845, + "step": 5204 + }, + { + "epoch": 0.18640213440292228, + "grad_norm": 2.016415596008301, + "learning_rate": 0.0001874433761120236, + "loss": 1.4557, + "step": 5205 + }, + { + "epoch": 0.18643794653249054, + "grad_norm": 1.5078966617584229, + "learning_rate": 0.00018743774832156667, + "loss": 1.5403, + "step": 5206 + }, + { + "epoch": 0.18647375866205884, + "grad_norm": 1.3218117952346802, + "learning_rate": 0.0001874321193547476, + "loss": 1.5534, + "step": 5207 + }, + { + "epoch": 0.1865095707916271, + "grad_norm": 1.9156447649002075, + "learning_rate": 0.00018742648921164215, + "loss": 1.4724, + "step": 5208 + }, + { + "epoch": 0.1865453829211954, + "grad_norm": 2.4503142833709717, + "learning_rate": 0.00018742085789232607, + "loss": 1.4975, + "step": 5209 + }, + { + "epoch": 0.1865811950507637, + "grad_norm": 1.653955101966858, + "learning_rate": 0.0001874152253968751, + "loss": 1.5087, + "step": 5210 + }, + { + "epoch": 0.18661700718033197, + "grad_norm": 1.6351701021194458, + "learning_rate": 0.00018740959172536506, + "loss": 1.471, + "step": 5211 + }, + { + "epoch": 0.18665281930990027, + "grad_norm": 1.942446231842041, + "learning_rate": 0.0001874039568778717, + "loss": 1.5784, + "step": 5212 + }, + { + "epoch": 0.18668863143946854, + "grad_norm": 1.5409560203552246, + "learning_rate": 0.0001873983208544708, + "loss": 1.7789, + "step": 5213 + }, + { + "epoch": 0.18672444356903684, + "grad_norm": 1.5599496364593506, + "learning_rate": 0.00018739268365523828, + "loss": 1.3787, + "step": 5214 + }, + { + "epoch": 0.1867602556986051, + "grad_norm": 2.1475934982299805, + "learning_rate": 0.00018738704528024994, + "loss": 1.8485, + "step": 5215 + }, + { + "epoch": 0.1867960678281734, + "grad_norm": 1.8166731595993042, + "learning_rate": 0.00018738140572958155, + "loss": 1.5655, + "step": 5216 + }, + { + "epoch": 0.1868318799577417, + "grad_norm": 2.244706630706787, + "learning_rate": 0.00018737576500330914, + "loss": 1.4576, + "step": 5217 + }, + { + "epoch": 0.18686769208730997, + "grad_norm": 1.2443066835403442, + "learning_rate": 0.00018737012310150847, + "loss": 1.3797, + "step": 5218 + }, + { + "epoch": 0.18690350421687826, + "grad_norm": 1.4576160907745361, + "learning_rate": 0.00018736448002425554, + "loss": 1.4719, + "step": 5219 + }, + { + "epoch": 0.18693931634644653, + "grad_norm": 1.2665461301803589, + "learning_rate": 0.00018735883577162619, + "loss": 1.5914, + "step": 5220 + }, + { + "epoch": 0.18697512847601483, + "grad_norm": 1.9680484533309937, + "learning_rate": 0.0001873531903436964, + "loss": 1.7045, + "step": 5221 + }, + { + "epoch": 0.1870109406055831, + "grad_norm": 1.387949824333191, + "learning_rate": 0.00018734754374054207, + "loss": 1.4011, + "step": 5222 + }, + { + "epoch": 0.1870467527351514, + "grad_norm": 1.9003175497055054, + "learning_rate": 0.0001873418959622393, + "loss": 1.443, + "step": 5223 + }, + { + "epoch": 0.1870825648647197, + "grad_norm": 1.8332993984222412, + "learning_rate": 0.0001873362470088639, + "loss": 1.7366, + "step": 5224 + }, + { + "epoch": 0.18711837699428796, + "grad_norm": 1.9775099754333496, + "learning_rate": 0.00018733059688049198, + "loss": 1.574, + "step": 5225 + }, + { + "epoch": 0.18715418912385626, + "grad_norm": 2.4329521656036377, + "learning_rate": 0.00018732494557719952, + "loss": 1.8489, + "step": 5226 + }, + { + "epoch": 0.18719000125342453, + "grad_norm": 1.4437566995620728, + "learning_rate": 0.00018731929309906254, + "loss": 1.4442, + "step": 5227 + }, + { + "epoch": 0.18722581338299282, + "grad_norm": 1.4024875164031982, + "learning_rate": 0.00018731363944615717, + "loss": 1.2043, + "step": 5228 + }, + { + "epoch": 0.1872616255125611, + "grad_norm": 1.692596197128296, + "learning_rate": 0.00018730798461855938, + "loss": 1.4935, + "step": 5229 + }, + { + "epoch": 0.1872974376421294, + "grad_norm": 1.579940676689148, + "learning_rate": 0.00018730232861634524, + "loss": 1.5019, + "step": 5230 + }, + { + "epoch": 0.18733324977169769, + "grad_norm": 1.8518500328063965, + "learning_rate": 0.0001872966714395909, + "loss": 1.5695, + "step": 5231 + }, + { + "epoch": 0.18736906190126595, + "grad_norm": 1.7087852954864502, + "learning_rate": 0.00018729101308837245, + "loss": 1.844, + "step": 5232 + }, + { + "epoch": 0.18740487403083425, + "grad_norm": 1.8843986988067627, + "learning_rate": 0.000187285353562766, + "loss": 1.4705, + "step": 5233 + }, + { + "epoch": 0.18744068616040252, + "grad_norm": 1.5346555709838867, + "learning_rate": 0.00018727969286284776, + "loss": 1.4533, + "step": 5234 + }, + { + "epoch": 0.18747649828997082, + "grad_norm": 1.526907205581665, + "learning_rate": 0.0001872740309886938, + "loss": 1.6521, + "step": 5235 + }, + { + "epoch": 0.18751231041953909, + "grad_norm": 2.257167100906372, + "learning_rate": 0.00018726836794038035, + "loss": 1.4737, + "step": 5236 + }, + { + "epoch": 0.18754812254910738, + "grad_norm": 2.471599817276001, + "learning_rate": 0.00018726270371798357, + "loss": 1.5256, + "step": 5237 + }, + { + "epoch": 0.18758393467867568, + "grad_norm": 1.6534587144851685, + "learning_rate": 0.00018725703832157966, + "loss": 1.7539, + "step": 5238 + }, + { + "epoch": 0.18761974680824395, + "grad_norm": 1.419296383857727, + "learning_rate": 0.00018725137175124482, + "loss": 1.5579, + "step": 5239 + }, + { + "epoch": 0.18765555893781224, + "grad_norm": 1.8545119762420654, + "learning_rate": 0.0001872457040070554, + "loss": 1.6909, + "step": 5240 + }, + { + "epoch": 0.1876913710673805, + "grad_norm": 1.6722489595413208, + "learning_rate": 0.0001872400350890875, + "loss": 1.4801, + "step": 5241 + }, + { + "epoch": 0.1877271831969488, + "grad_norm": 1.4976879358291626, + "learning_rate": 0.00018723436499741748, + "loss": 1.3575, + "step": 5242 + }, + { + "epoch": 0.18776299532651708, + "grad_norm": 1.9052339792251587, + "learning_rate": 0.0001872286937321216, + "loss": 1.2981, + "step": 5243 + }, + { + "epoch": 0.18779880745608538, + "grad_norm": 1.9531632661819458, + "learning_rate": 0.00018722302129327618, + "loss": 1.4758, + "step": 5244 + }, + { + "epoch": 0.18783461958565367, + "grad_norm": 1.5285097360610962, + "learning_rate": 0.0001872173476809575, + "loss": 1.39, + "step": 5245 + }, + { + "epoch": 0.18787043171522194, + "grad_norm": 2.232374906539917, + "learning_rate": 0.00018721167289524195, + "loss": 1.7108, + "step": 5246 + }, + { + "epoch": 0.18790624384479024, + "grad_norm": 1.175122618675232, + "learning_rate": 0.0001872059969362058, + "loss": 1.578, + "step": 5247 + }, + { + "epoch": 0.1879420559743585, + "grad_norm": 1.2910648584365845, + "learning_rate": 0.00018720031980392544, + "loss": 1.4514, + "step": 5248 + }, + { + "epoch": 0.1879778681039268, + "grad_norm": 2.2122230529785156, + "learning_rate": 0.0001871946414984773, + "loss": 1.5838, + "step": 5249 + }, + { + "epoch": 0.18801368023349507, + "grad_norm": 1.6212471723556519, + "learning_rate": 0.00018718896201993767, + "loss": 1.103, + "step": 5250 + }, + { + "epoch": 0.18804949236306337, + "grad_norm": 1.6619919538497925, + "learning_rate": 0.00018718328136838305, + "loss": 1.4881, + "step": 5251 + }, + { + "epoch": 0.18808530449263167, + "grad_norm": 1.6589463949203491, + "learning_rate": 0.00018717759954388986, + "loss": 1.4259, + "step": 5252 + }, + { + "epoch": 0.18812111662219994, + "grad_norm": 1.471097707748413, + "learning_rate": 0.00018717191654653452, + "loss": 1.6129, + "step": 5253 + }, + { + "epoch": 0.18815692875176823, + "grad_norm": 1.1274768114089966, + "learning_rate": 0.00018716623237639347, + "loss": 1.4241, + "step": 5254 + }, + { + "epoch": 0.1881927408813365, + "grad_norm": 1.8204070329666138, + "learning_rate": 0.00018716054703354318, + "loss": 1.6479, + "step": 5255 + }, + { + "epoch": 0.1882285530109048, + "grad_norm": 2.424353837966919, + "learning_rate": 0.0001871548605180602, + "loss": 1.5044, + "step": 5256 + }, + { + "epoch": 0.18826436514047307, + "grad_norm": 1.7272015810012817, + "learning_rate": 0.00018714917283002094, + "loss": 1.9286, + "step": 5257 + }, + { + "epoch": 0.18830017727004136, + "grad_norm": 1.4467058181762695, + "learning_rate": 0.000187143483969502, + "loss": 1.7475, + "step": 5258 + }, + { + "epoch": 0.18833598939960966, + "grad_norm": 1.6046894788742065, + "learning_rate": 0.00018713779393657993, + "loss": 1.5029, + "step": 5259 + }, + { + "epoch": 0.18837180152917793, + "grad_norm": 1.3539988994598389, + "learning_rate": 0.00018713210273133118, + "loss": 1.5756, + "step": 5260 + }, + { + "epoch": 0.18840761365874623, + "grad_norm": 1.6449440717697144, + "learning_rate": 0.00018712641035383243, + "loss": 1.4489, + "step": 5261 + }, + { + "epoch": 0.1884434257883145, + "grad_norm": 1.5281622409820557, + "learning_rate": 0.00018712071680416017, + "loss": 1.6258, + "step": 5262 + }, + { + "epoch": 0.1884792379178828, + "grad_norm": 1.7914692163467407, + "learning_rate": 0.00018711502208239108, + "loss": 1.2722, + "step": 5263 + }, + { + "epoch": 0.18851505004745106, + "grad_norm": 1.9788846969604492, + "learning_rate": 0.0001871093261886017, + "loss": 1.3999, + "step": 5264 + }, + { + "epoch": 0.18855086217701936, + "grad_norm": 1.66983962059021, + "learning_rate": 0.00018710362912286872, + "loss": 1.7335, + "step": 5265 + }, + { + "epoch": 0.18858667430658765, + "grad_norm": 2.1798553466796875, + "learning_rate": 0.00018709793088526877, + "loss": 1.5971, + "step": 5266 + }, + { + "epoch": 0.18862248643615592, + "grad_norm": 2.0857560634613037, + "learning_rate": 0.0001870922314758785, + "loss": 1.4487, + "step": 5267 + }, + { + "epoch": 0.18865829856572422, + "grad_norm": 1.8404710292816162, + "learning_rate": 0.0001870865308947746, + "loss": 1.5343, + "step": 5268 + }, + { + "epoch": 0.1886941106952925, + "grad_norm": 1.9581575393676758, + "learning_rate": 0.00018708082914203376, + "loss": 1.4325, + "step": 5269 + }, + { + "epoch": 0.18872992282486079, + "grad_norm": 1.5912389755249023, + "learning_rate": 0.0001870751262177327, + "loss": 1.503, + "step": 5270 + }, + { + "epoch": 0.18876573495442905, + "grad_norm": 2.4967665672302246, + "learning_rate": 0.00018706942212194812, + "loss": 1.4733, + "step": 5271 + }, + { + "epoch": 0.18880154708399735, + "grad_norm": 1.365278959274292, + "learning_rate": 0.0001870637168547568, + "loss": 1.6569, + "step": 5272 + }, + { + "epoch": 0.18883735921356565, + "grad_norm": 1.912571668624878, + "learning_rate": 0.00018705801041623546, + "loss": 1.7692, + "step": 5273 + }, + { + "epoch": 0.18887317134313392, + "grad_norm": 2.0421035289764404, + "learning_rate": 0.0001870523028064609, + "loss": 1.4651, + "step": 5274 + }, + { + "epoch": 0.1889089834727022, + "grad_norm": 1.6724461317062378, + "learning_rate": 0.00018704659402550986, + "loss": 1.6736, + "step": 5275 + }, + { + "epoch": 0.18894479560227048, + "grad_norm": 2.037821054458618, + "learning_rate": 0.0001870408840734592, + "loss": 1.492, + "step": 5276 + }, + { + "epoch": 0.18898060773183878, + "grad_norm": 1.94366455078125, + "learning_rate": 0.00018703517295038573, + "loss": 1.3209, + "step": 5277 + }, + { + "epoch": 0.18901641986140705, + "grad_norm": 1.6621423959732056, + "learning_rate": 0.00018702946065636623, + "loss": 1.4488, + "step": 5278 + }, + { + "epoch": 0.18905223199097534, + "grad_norm": 2.5038325786590576, + "learning_rate": 0.00018702374719147766, + "loss": 1.2706, + "step": 5279 + }, + { + "epoch": 0.18908804412054364, + "grad_norm": 1.4643855094909668, + "learning_rate": 0.00018701803255579677, + "loss": 1.7987, + "step": 5280 + }, + { + "epoch": 0.1891238562501119, + "grad_norm": 1.5782333612442017, + "learning_rate": 0.00018701231674940054, + "loss": 1.7855, + "step": 5281 + }, + { + "epoch": 0.1891596683796802, + "grad_norm": 1.8929588794708252, + "learning_rate": 0.0001870065997723658, + "loss": 1.6753, + "step": 5282 + }, + { + "epoch": 0.18919548050924848, + "grad_norm": 1.534611463546753, + "learning_rate": 0.00018700088162476952, + "loss": 1.439, + "step": 5283 + }, + { + "epoch": 0.18923129263881677, + "grad_norm": 1.16856849193573, + "learning_rate": 0.00018699516230668856, + "loss": 1.6246, + "step": 5284 + }, + { + "epoch": 0.18926710476838504, + "grad_norm": 1.5402215719223022, + "learning_rate": 0.00018698944181819993, + "loss": 1.3181, + "step": 5285 + }, + { + "epoch": 0.18930291689795334, + "grad_norm": 1.398967981338501, + "learning_rate": 0.00018698372015938058, + "loss": 1.9008, + "step": 5286 + }, + { + "epoch": 0.18933872902752164, + "grad_norm": 1.6031793355941772, + "learning_rate": 0.00018697799733030746, + "loss": 1.5932, + "step": 5287 + }, + { + "epoch": 0.1893745411570899, + "grad_norm": 1.5183573961257935, + "learning_rate": 0.00018697227333105756, + "loss": 1.6072, + "step": 5288 + }, + { + "epoch": 0.1894103532866582, + "grad_norm": 2.281196117401123, + "learning_rate": 0.00018696654816170795, + "loss": 1.6894, + "step": 5289 + }, + { + "epoch": 0.18944616541622647, + "grad_norm": 1.6974093914031982, + "learning_rate": 0.0001869608218223356, + "loss": 1.2243, + "step": 5290 + }, + { + "epoch": 0.18948197754579477, + "grad_norm": 1.6468058824539185, + "learning_rate": 0.0001869550943130175, + "loss": 1.2508, + "step": 5291 + }, + { + "epoch": 0.18951778967536304, + "grad_norm": 1.3340821266174316, + "learning_rate": 0.00018694936563383086, + "loss": 1.3316, + "step": 5292 + }, + { + "epoch": 0.18955360180493133, + "grad_norm": 1.515095591545105, + "learning_rate": 0.00018694363578485262, + "loss": 1.6026, + "step": 5293 + }, + { + "epoch": 0.18958941393449963, + "grad_norm": 1.9505254030227661, + "learning_rate": 0.00018693790476615992, + "loss": 1.5093, + "step": 5294 + }, + { + "epoch": 0.1896252260640679, + "grad_norm": 1.6551986932754517, + "learning_rate": 0.00018693217257782985, + "loss": 1.2679, + "step": 5295 + }, + { + "epoch": 0.1896610381936362, + "grad_norm": 1.6577612161636353, + "learning_rate": 0.00018692643921993952, + "loss": 1.583, + "step": 5296 + }, + { + "epoch": 0.18969685032320446, + "grad_norm": 1.423334002494812, + "learning_rate": 0.0001869207046925661, + "loss": 1.508, + "step": 5297 + }, + { + "epoch": 0.18973266245277276, + "grad_norm": 1.5805872678756714, + "learning_rate": 0.0001869149689957867, + "loss": 1.7373, + "step": 5298 + }, + { + "epoch": 0.18976847458234103, + "grad_norm": 1.982264518737793, + "learning_rate": 0.0001869092321296785, + "loss": 1.4873, + "step": 5299 + }, + { + "epoch": 0.18980428671190933, + "grad_norm": 1.3803657293319702, + "learning_rate": 0.00018690349409431872, + "loss": 1.347, + "step": 5300 + }, + { + "epoch": 0.1898400988414776, + "grad_norm": 1.725968360900879, + "learning_rate": 0.00018689775488978452, + "loss": 1.6085, + "step": 5301 + }, + { + "epoch": 0.1898759109710459, + "grad_norm": 2.5847837924957275, + "learning_rate": 0.0001868920145161531, + "loss": 1.5907, + "step": 5302 + }, + { + "epoch": 0.1899117231006142, + "grad_norm": 1.841369390487671, + "learning_rate": 0.0001868862729735017, + "loss": 1.8148, + "step": 5303 + }, + { + "epoch": 0.18994753523018246, + "grad_norm": 2.197054147720337, + "learning_rate": 0.00018688053026190757, + "loss": 1.2587, + "step": 5304 + }, + { + "epoch": 0.18998334735975075, + "grad_norm": 1.7132951021194458, + "learning_rate": 0.000186874786381448, + "loss": 1.5339, + "step": 5305 + }, + { + "epoch": 0.19001915948931902, + "grad_norm": 1.6471575498580933, + "learning_rate": 0.0001868690413322002, + "loss": 1.5682, + "step": 5306 + }, + { + "epoch": 0.19005497161888732, + "grad_norm": 1.9123592376708984, + "learning_rate": 0.00018686329511424153, + "loss": 1.7367, + "step": 5307 + }, + { + "epoch": 0.1900907837484556, + "grad_norm": 2.1285805702209473, + "learning_rate": 0.00018685754772764928, + "loss": 1.7673, + "step": 5308 + }, + { + "epoch": 0.19012659587802389, + "grad_norm": 1.352774739265442, + "learning_rate": 0.00018685179917250072, + "loss": 1.6098, + "step": 5309 + }, + { + "epoch": 0.19016240800759218, + "grad_norm": 1.3797439336776733, + "learning_rate": 0.0001868460494488733, + "loss": 1.5094, + "step": 5310 + }, + { + "epoch": 0.19019822013716045, + "grad_norm": 2.124342918395996, + "learning_rate": 0.00018684029855684425, + "loss": 1.6705, + "step": 5311 + }, + { + "epoch": 0.19023403226672875, + "grad_norm": 1.2310181856155396, + "learning_rate": 0.00018683454649649103, + "loss": 1.4928, + "step": 5312 + }, + { + "epoch": 0.19026984439629702, + "grad_norm": 1.4215664863586426, + "learning_rate": 0.00018682879326789098, + "loss": 1.6286, + "step": 5313 + }, + { + "epoch": 0.1903056565258653, + "grad_norm": 1.8718503713607788, + "learning_rate": 0.00018682303887112154, + "loss": 1.4652, + "step": 5314 + }, + { + "epoch": 0.19034146865543358, + "grad_norm": 2.483689308166504, + "learning_rate": 0.00018681728330626008, + "loss": 1.8905, + "step": 5315 + }, + { + "epoch": 0.19037728078500188, + "grad_norm": 1.4977774620056152, + "learning_rate": 0.00018681152657338404, + "loss": 1.8194, + "step": 5316 + }, + { + "epoch": 0.19041309291457018, + "grad_norm": 1.7027922868728638, + "learning_rate": 0.00018680576867257095, + "loss": 1.6571, + "step": 5317 + }, + { + "epoch": 0.19044890504413844, + "grad_norm": 1.6885054111480713, + "learning_rate": 0.00018680000960389818, + "loss": 1.4111, + "step": 5318 + }, + { + "epoch": 0.19048471717370674, + "grad_norm": 1.3242998123168945, + "learning_rate": 0.00018679424936744323, + "loss": 1.3061, + "step": 5319 + }, + { + "epoch": 0.190520529303275, + "grad_norm": 2.3705389499664307, + "learning_rate": 0.00018678848796328362, + "loss": 1.2376, + "step": 5320 + }, + { + "epoch": 0.1905563414328433, + "grad_norm": 1.4017654657363892, + "learning_rate": 0.00018678272539149687, + "loss": 1.2629, + "step": 5321 + }, + { + "epoch": 0.19059215356241158, + "grad_norm": 1.6854712963104248, + "learning_rate": 0.00018677696165216048, + "loss": 1.6078, + "step": 5322 + }, + { + "epoch": 0.19062796569197987, + "grad_norm": 1.8608250617980957, + "learning_rate": 0.000186771196745352, + "loss": 1.7155, + "step": 5323 + }, + { + "epoch": 0.19066377782154817, + "grad_norm": 1.982079029083252, + "learning_rate": 0.000186765430671149, + "loss": 1.529, + "step": 5324 + }, + { + "epoch": 0.19069958995111644, + "grad_norm": 1.904109239578247, + "learning_rate": 0.00018675966342962904, + "loss": 1.5579, + "step": 5325 + }, + { + "epoch": 0.19073540208068474, + "grad_norm": 1.7409393787384033, + "learning_rate": 0.00018675389502086976, + "loss": 1.6718, + "step": 5326 + }, + { + "epoch": 0.190771214210253, + "grad_norm": 1.7952065467834473, + "learning_rate": 0.00018674812544494865, + "loss": 1.6698, + "step": 5327 + }, + { + "epoch": 0.1908070263398213, + "grad_norm": 2.428290367126465, + "learning_rate": 0.00018674235470194348, + "loss": 1.7883, + "step": 5328 + }, + { + "epoch": 0.19084283846938957, + "grad_norm": 1.6777397394180298, + "learning_rate": 0.0001867365827919318, + "loss": 1.6194, + "step": 5329 + }, + { + "epoch": 0.19087865059895787, + "grad_norm": 2.2093400955200195, + "learning_rate": 0.00018673080971499126, + "loss": 1.4838, + "step": 5330 + }, + { + "epoch": 0.19091446272852616, + "grad_norm": 1.6835341453552246, + "learning_rate": 0.00018672503547119957, + "loss": 1.4022, + "step": 5331 + }, + { + "epoch": 0.19095027485809443, + "grad_norm": 1.4746954441070557, + "learning_rate": 0.00018671926006063442, + "loss": 1.2283, + "step": 5332 + }, + { + "epoch": 0.19098608698766273, + "grad_norm": 2.170846939086914, + "learning_rate": 0.00018671348348337343, + "loss": 1.8196, + "step": 5333 + }, + { + "epoch": 0.191021899117231, + "grad_norm": 1.571633219718933, + "learning_rate": 0.00018670770573949442, + "loss": 1.5886, + "step": 5334 + }, + { + "epoch": 0.1910577112467993, + "grad_norm": 1.9823776483535767, + "learning_rate": 0.00018670192682907505, + "loss": 1.6207, + "step": 5335 + }, + { + "epoch": 0.19109352337636756, + "grad_norm": 1.2739793062210083, + "learning_rate": 0.00018669614675219308, + "loss": 1.5255, + "step": 5336 + }, + { + "epoch": 0.19112933550593586, + "grad_norm": 1.8321658372879028, + "learning_rate": 0.0001866903655089263, + "loss": 1.7024, + "step": 5337 + }, + { + "epoch": 0.19116514763550416, + "grad_norm": 1.5952870845794678, + "learning_rate": 0.00018668458309935247, + "loss": 1.5503, + "step": 5338 + }, + { + "epoch": 0.19120095976507243, + "grad_norm": 1.4116911888122559, + "learning_rate": 0.0001866787995235494, + "loss": 1.8354, + "step": 5339 + }, + { + "epoch": 0.19123677189464072, + "grad_norm": 1.6893408298492432, + "learning_rate": 0.00018667301478159489, + "loss": 1.7062, + "step": 5340 + }, + { + "epoch": 0.191272584024209, + "grad_norm": 1.6460639238357544, + "learning_rate": 0.00018666722887356673, + "loss": 1.7274, + "step": 5341 + }, + { + "epoch": 0.1913083961537773, + "grad_norm": 1.753871202468872, + "learning_rate": 0.00018666144179954283, + "loss": 2.059, + "step": 5342 + }, + { + "epoch": 0.19134420828334556, + "grad_norm": 1.788332223892212, + "learning_rate": 0.00018665565355960103, + "loss": 1.2346, + "step": 5343 + }, + { + "epoch": 0.19138002041291385, + "grad_norm": 1.3304243087768555, + "learning_rate": 0.0001866498641538192, + "loss": 1.7645, + "step": 5344 + }, + { + "epoch": 0.19141583254248215, + "grad_norm": 1.1889979839324951, + "learning_rate": 0.00018664407358227517, + "loss": 1.5125, + "step": 5345 + }, + { + "epoch": 0.19145164467205042, + "grad_norm": 2.2318155765533447, + "learning_rate": 0.0001866382818450469, + "loss": 1.4744, + "step": 5346 + }, + { + "epoch": 0.19148745680161872, + "grad_norm": 1.7189171314239502, + "learning_rate": 0.00018663248894221232, + "loss": 1.466, + "step": 5347 + }, + { + "epoch": 0.19152326893118699, + "grad_norm": 1.3835846185684204, + "learning_rate": 0.00018662669487384936, + "loss": 1.6789, + "step": 5348 + }, + { + "epoch": 0.19155908106075528, + "grad_norm": 2.947704315185547, + "learning_rate": 0.00018662089964003594, + "loss": 1.5365, + "step": 5349 + }, + { + "epoch": 0.19159489319032355, + "grad_norm": 1.4203107357025146, + "learning_rate": 0.00018661510324085003, + "loss": 1.6378, + "step": 5350 + }, + { + "epoch": 0.19163070531989185, + "grad_norm": 1.2650010585784912, + "learning_rate": 0.00018660930567636968, + "loss": 1.4558, + "step": 5351 + }, + { + "epoch": 0.19166651744946014, + "grad_norm": 1.485657811164856, + "learning_rate": 0.00018660350694667282, + "loss": 1.5388, + "step": 5352 + }, + { + "epoch": 0.1917023295790284, + "grad_norm": 1.648183822631836, + "learning_rate": 0.00018659770705183748, + "loss": 1.3599, + "step": 5353 + }, + { + "epoch": 0.1917381417085967, + "grad_norm": 1.3641210794448853, + "learning_rate": 0.00018659190599194168, + "loss": 1.336, + "step": 5354 + }, + { + "epoch": 0.19177395383816498, + "grad_norm": 1.6528867483139038, + "learning_rate": 0.0001865861037670635, + "loss": 1.5748, + "step": 5355 + }, + { + "epoch": 0.19180976596773328, + "grad_norm": 1.54402756690979, + "learning_rate": 0.00018658030037728098, + "loss": 1.495, + "step": 5356 + }, + { + "epoch": 0.19184557809730154, + "grad_norm": 1.9251580238342285, + "learning_rate": 0.00018657449582267218, + "loss": 1.5426, + "step": 5357 + }, + { + "epoch": 0.19188139022686984, + "grad_norm": 2.060964345932007, + "learning_rate": 0.00018656869010331523, + "loss": 1.7168, + "step": 5358 + }, + { + "epoch": 0.19191720235643814, + "grad_norm": 1.657238483428955, + "learning_rate": 0.00018656288321928824, + "loss": 1.2452, + "step": 5359 + }, + { + "epoch": 0.1919530144860064, + "grad_norm": 1.5768673419952393, + "learning_rate": 0.0001865570751706693, + "loss": 1.5858, + "step": 5360 + }, + { + "epoch": 0.1919888266155747, + "grad_norm": 1.583263635635376, + "learning_rate": 0.00018655126595753654, + "loss": 1.2867, + "step": 5361 + }, + { + "epoch": 0.19202463874514297, + "grad_norm": 1.5505911111831665, + "learning_rate": 0.00018654545557996816, + "loss": 1.5733, + "step": 5362 + }, + { + "epoch": 0.19206045087471127, + "grad_norm": 1.305277705192566, + "learning_rate": 0.0001865396440380423, + "loss": 1.6863, + "step": 5363 + }, + { + "epoch": 0.19209626300427954, + "grad_norm": 2.091987133026123, + "learning_rate": 0.00018653383133183718, + "loss": 1.9752, + "step": 5364 + }, + { + "epoch": 0.19213207513384783, + "grad_norm": 1.7626006603240967, + "learning_rate": 0.00018652801746143097, + "loss": 1.6712, + "step": 5365 + }, + { + "epoch": 0.19216788726341613, + "grad_norm": 1.4710909128189087, + "learning_rate": 0.00018652220242690187, + "loss": 1.7792, + "step": 5366 + }, + { + "epoch": 0.1922036993929844, + "grad_norm": 1.8461323976516724, + "learning_rate": 0.00018651638622832817, + "loss": 1.3979, + "step": 5367 + }, + { + "epoch": 0.1922395115225527, + "grad_norm": 1.7423018217086792, + "learning_rate": 0.00018651056886578808, + "loss": 1.5811, + "step": 5368 + }, + { + "epoch": 0.19227532365212097, + "grad_norm": 2.2182414531707764, + "learning_rate": 0.00018650475033935992, + "loss": 1.6277, + "step": 5369 + }, + { + "epoch": 0.19231113578168926, + "grad_norm": 1.7264537811279297, + "learning_rate": 0.00018649893064912187, + "loss": 1.6134, + "step": 5370 + }, + { + "epoch": 0.19234694791125753, + "grad_norm": 2.0035877227783203, + "learning_rate": 0.00018649310979515228, + "loss": 1.6803, + "step": 5371 + }, + { + "epoch": 0.19238276004082583, + "grad_norm": 1.4782098531723022, + "learning_rate": 0.0001864872877775295, + "loss": 1.7665, + "step": 5372 + }, + { + "epoch": 0.19241857217039413, + "grad_norm": 1.79098641872406, + "learning_rate": 0.00018648146459633182, + "loss": 1.3728, + "step": 5373 + }, + { + "epoch": 0.1924543842999624, + "grad_norm": 1.7012203931808472, + "learning_rate": 0.00018647564025163756, + "loss": 1.6528, + "step": 5374 + }, + { + "epoch": 0.1924901964295307, + "grad_norm": 1.1748199462890625, + "learning_rate": 0.00018646981474352515, + "loss": 1.5076, + "step": 5375 + }, + { + "epoch": 0.19252600855909896, + "grad_norm": 1.2763925790786743, + "learning_rate": 0.0001864639880720729, + "loss": 1.4572, + "step": 5376 + }, + { + "epoch": 0.19256182068866726, + "grad_norm": 1.897787094116211, + "learning_rate": 0.0001864581602373592, + "loss": 1.6753, + "step": 5377 + }, + { + "epoch": 0.19259763281823553, + "grad_norm": 1.900539755821228, + "learning_rate": 0.00018645233123946252, + "loss": 1.3106, + "step": 5378 + }, + { + "epoch": 0.19263344494780382, + "grad_norm": 1.413638949394226, + "learning_rate": 0.0001864465010784612, + "loss": 1.4018, + "step": 5379 + }, + { + "epoch": 0.19266925707737212, + "grad_norm": 1.6315624713897705, + "learning_rate": 0.00018644066975443373, + "loss": 1.6498, + "step": 5380 + }, + { + "epoch": 0.1927050692069404, + "grad_norm": 1.668734073638916, + "learning_rate": 0.00018643483726745857, + "loss": 1.2368, + "step": 5381 + }, + { + "epoch": 0.19274088133650868, + "grad_norm": 2.507664203643799, + "learning_rate": 0.00018642900361761413, + "loss": 1.5805, + "step": 5382 + }, + { + "epoch": 0.19277669346607695, + "grad_norm": 2.0644752979278564, + "learning_rate": 0.00018642316880497893, + "loss": 1.6608, + "step": 5383 + }, + { + "epoch": 0.19281250559564525, + "grad_norm": 2.1488327980041504, + "learning_rate": 0.00018641733282963153, + "loss": 1.9823, + "step": 5384 + }, + { + "epoch": 0.19284831772521352, + "grad_norm": 1.6034685373306274, + "learning_rate": 0.00018641149569165034, + "loss": 1.5834, + "step": 5385 + }, + { + "epoch": 0.19288412985478182, + "grad_norm": 1.9179775714874268, + "learning_rate": 0.00018640565739111393, + "loss": 1.534, + "step": 5386 + }, + { + "epoch": 0.1929199419843501, + "grad_norm": 1.6921823024749756, + "learning_rate": 0.0001863998179281009, + "loss": 1.5869, + "step": 5387 + }, + { + "epoch": 0.19295575411391838, + "grad_norm": 1.5110647678375244, + "learning_rate": 0.0001863939773026897, + "loss": 1.4647, + "step": 5388 + }, + { + "epoch": 0.19299156624348668, + "grad_norm": 1.7196460962295532, + "learning_rate": 0.00018638813551495901, + "loss": 1.7801, + "step": 5389 + }, + { + "epoch": 0.19302737837305495, + "grad_norm": 1.4753451347351074, + "learning_rate": 0.0001863822925649874, + "loss": 1.8287, + "step": 5390 + }, + { + "epoch": 0.19306319050262324, + "grad_norm": 1.5066410303115845, + "learning_rate": 0.00018637644845285344, + "loss": 1.4221, + "step": 5391 + }, + { + "epoch": 0.1930990026321915, + "grad_norm": 1.68746817111969, + "learning_rate": 0.00018637060317863583, + "loss": 1.9171, + "step": 5392 + }, + { + "epoch": 0.1931348147617598, + "grad_norm": 1.6595805883407593, + "learning_rate": 0.00018636475674241315, + "loss": 1.6928, + "step": 5393 + }, + { + "epoch": 0.1931706268913281, + "grad_norm": 1.706078290939331, + "learning_rate": 0.00018635890914426404, + "loss": 1.618, + "step": 5394 + }, + { + "epoch": 0.19320643902089638, + "grad_norm": 1.5801364183425903, + "learning_rate": 0.00018635306038426724, + "loss": 1.5143, + "step": 5395 + }, + { + "epoch": 0.19324225115046467, + "grad_norm": 2.3066155910491943, + "learning_rate": 0.00018634721046250139, + "loss": 1.4796, + "step": 5396 + }, + { + "epoch": 0.19327806328003294, + "grad_norm": 1.6973354816436768, + "learning_rate": 0.00018634135937904518, + "loss": 1.7408, + "step": 5397 + }, + { + "epoch": 0.19331387540960124, + "grad_norm": 1.4387764930725098, + "learning_rate": 0.00018633550713397737, + "loss": 1.4763, + "step": 5398 + }, + { + "epoch": 0.1933496875391695, + "grad_norm": 1.51828134059906, + "learning_rate": 0.00018632965372737665, + "loss": 1.71, + "step": 5399 + }, + { + "epoch": 0.1933854996687378, + "grad_norm": 2.4643683433532715, + "learning_rate": 0.00018632379915932185, + "loss": 1.6933, + "step": 5400 + }, + { + "epoch": 0.19342131179830607, + "grad_norm": 1.6648520231246948, + "learning_rate": 0.00018631794342989163, + "loss": 1.2457, + "step": 5401 + }, + { + "epoch": 0.19345712392787437, + "grad_norm": 1.4097614288330078, + "learning_rate": 0.00018631208653916486, + "loss": 1.503, + "step": 5402 + }, + { + "epoch": 0.19349293605744267, + "grad_norm": 1.4005558490753174, + "learning_rate": 0.0001863062284872203, + "loss": 1.2093, + "step": 5403 + }, + { + "epoch": 0.19352874818701093, + "grad_norm": 1.770641803741455, + "learning_rate": 0.00018630036927413672, + "loss": 1.409, + "step": 5404 + }, + { + "epoch": 0.19356456031657923, + "grad_norm": 1.9547629356384277, + "learning_rate": 0.00018629450889999302, + "loss": 1.5299, + "step": 5405 + }, + { + "epoch": 0.1936003724461475, + "grad_norm": 1.4061657190322876, + "learning_rate": 0.000186288647364868, + "loss": 1.4272, + "step": 5406 + }, + { + "epoch": 0.1936361845757158, + "grad_norm": 2.2210681438446045, + "learning_rate": 0.00018628278466884055, + "loss": 1.4814, + "step": 5407 + }, + { + "epoch": 0.19367199670528407, + "grad_norm": 1.4909852743148804, + "learning_rate": 0.00018627692081198954, + "loss": 1.4706, + "step": 5408 + }, + { + "epoch": 0.19370780883485236, + "grad_norm": 1.9401264190673828, + "learning_rate": 0.00018627105579439382, + "loss": 1.2785, + "step": 5409 + }, + { + "epoch": 0.19374362096442066, + "grad_norm": 1.8306653499603271, + "learning_rate": 0.00018626518961613236, + "loss": 1.3549, + "step": 5410 + }, + { + "epoch": 0.19377943309398893, + "grad_norm": 1.7504512071609497, + "learning_rate": 0.00018625932227728402, + "loss": 1.6292, + "step": 5411 + }, + { + "epoch": 0.19381524522355723, + "grad_norm": 1.2952179908752441, + "learning_rate": 0.00018625345377792777, + "loss": 1.5527, + "step": 5412 + }, + { + "epoch": 0.1938510573531255, + "grad_norm": 1.9367178678512573, + "learning_rate": 0.0001862475841181426, + "loss": 1.611, + "step": 5413 + }, + { + "epoch": 0.1938868694826938, + "grad_norm": 2.24019455909729, + "learning_rate": 0.00018624171329800738, + "loss": 1.7476, + "step": 5414 + }, + { + "epoch": 0.19392268161226206, + "grad_norm": 1.6576284170150757, + "learning_rate": 0.00018623584131760118, + "loss": 1.7381, + "step": 5415 + }, + { + "epoch": 0.19395849374183036, + "grad_norm": 1.9396398067474365, + "learning_rate": 0.00018622996817700295, + "loss": 1.5697, + "step": 5416 + }, + { + "epoch": 0.19399430587139865, + "grad_norm": 1.9461170434951782, + "learning_rate": 0.00018622409387629175, + "loss": 1.4934, + "step": 5417 + }, + { + "epoch": 0.19403011800096692, + "grad_norm": 1.8357774019241333, + "learning_rate": 0.0001862182184155466, + "loss": 1.7385, + "step": 5418 + }, + { + "epoch": 0.19406593013053522, + "grad_norm": 1.4890779256820679, + "learning_rate": 0.00018621234179484647, + "loss": 1.5282, + "step": 5419 + }, + { + "epoch": 0.1941017422601035, + "grad_norm": 2.412010431289673, + "learning_rate": 0.00018620646401427054, + "loss": 1.7779, + "step": 5420 + }, + { + "epoch": 0.19413755438967178, + "grad_norm": 2.704232692718506, + "learning_rate": 0.00018620058507389783, + "loss": 1.5965, + "step": 5421 + }, + { + "epoch": 0.19417336651924005, + "grad_norm": 1.7113075256347656, + "learning_rate": 0.00018619470497380745, + "loss": 1.7815, + "step": 5422 + }, + { + "epoch": 0.19420917864880835, + "grad_norm": 1.8636949062347412, + "learning_rate": 0.00018618882371407847, + "loss": 1.6106, + "step": 5423 + }, + { + "epoch": 0.19424499077837665, + "grad_norm": 1.3164920806884766, + "learning_rate": 0.00018618294129479007, + "loss": 1.3721, + "step": 5424 + }, + { + "epoch": 0.19428080290794492, + "grad_norm": 1.5221353769302368, + "learning_rate": 0.00018617705771602132, + "loss": 1.4453, + "step": 5425 + }, + { + "epoch": 0.1943166150375132, + "grad_norm": 2.4712367057800293, + "learning_rate": 0.00018617117297785145, + "loss": 1.9291, + "step": 5426 + }, + { + "epoch": 0.19435242716708148, + "grad_norm": 1.8102853298187256, + "learning_rate": 0.00018616528708035958, + "loss": 1.6105, + "step": 5427 + }, + { + "epoch": 0.19438823929664978, + "grad_norm": 1.8387972116470337, + "learning_rate": 0.00018615940002362496, + "loss": 1.4088, + "step": 5428 + }, + { + "epoch": 0.19442405142621805, + "grad_norm": 1.8663088083267212, + "learning_rate": 0.0001861535118077267, + "loss": 1.5058, + "step": 5429 + }, + { + "epoch": 0.19445986355578634, + "grad_norm": 2.4539215564727783, + "learning_rate": 0.0001861476224327441, + "loss": 1.8223, + "step": 5430 + }, + { + "epoch": 0.19449567568535464, + "grad_norm": 1.7828904390335083, + "learning_rate": 0.00018614173189875636, + "loss": 1.4433, + "step": 5431 + }, + { + "epoch": 0.1945314878149229, + "grad_norm": 2.0361180305480957, + "learning_rate": 0.0001861358402058427, + "loss": 1.5065, + "step": 5432 + }, + { + "epoch": 0.1945672999444912, + "grad_norm": 1.5484020709991455, + "learning_rate": 0.00018612994735408246, + "loss": 1.7755, + "step": 5433 + }, + { + "epoch": 0.19460311207405948, + "grad_norm": 1.351025938987732, + "learning_rate": 0.00018612405334355488, + "loss": 1.5675, + "step": 5434 + }, + { + "epoch": 0.19463892420362777, + "grad_norm": 1.856544852256775, + "learning_rate": 0.00018611815817433925, + "loss": 1.8301, + "step": 5435 + }, + { + "epoch": 0.19467473633319604, + "grad_norm": 1.3813837766647339, + "learning_rate": 0.00018611226184651484, + "loss": 1.7018, + "step": 5436 + }, + { + "epoch": 0.19471054846276434, + "grad_norm": 1.4931066036224365, + "learning_rate": 0.00018610636436016106, + "loss": 1.5255, + "step": 5437 + }, + { + "epoch": 0.19474636059233263, + "grad_norm": 1.5089569091796875, + "learning_rate": 0.00018610046571535723, + "loss": 1.4895, + "step": 5438 + }, + { + "epoch": 0.1947821727219009, + "grad_norm": 1.7368457317352295, + "learning_rate": 0.00018609456591218266, + "loss": 1.1445, + "step": 5439 + }, + { + "epoch": 0.1948179848514692, + "grad_norm": 1.6913045644760132, + "learning_rate": 0.0001860886649507168, + "loss": 1.5326, + "step": 5440 + }, + { + "epoch": 0.19485379698103747, + "grad_norm": 1.6324273347854614, + "learning_rate": 0.00018608276283103896, + "loss": 1.7677, + "step": 5441 + }, + { + "epoch": 0.19488960911060577, + "grad_norm": 1.6233586072921753, + "learning_rate": 0.0001860768595532286, + "loss": 1.1984, + "step": 5442 + }, + { + "epoch": 0.19492542124017403, + "grad_norm": 1.4947038888931274, + "learning_rate": 0.00018607095511736515, + "loss": 1.7798, + "step": 5443 + }, + { + "epoch": 0.19496123336974233, + "grad_norm": 1.6145468950271606, + "learning_rate": 0.00018606504952352798, + "loss": 1.4282, + "step": 5444 + }, + { + "epoch": 0.19499704549931063, + "grad_norm": 2.4696900844573975, + "learning_rate": 0.00018605914277179664, + "loss": 1.8904, + "step": 5445 + }, + { + "epoch": 0.1950328576288789, + "grad_norm": 1.766887903213501, + "learning_rate": 0.00018605323486225049, + "loss": 1.6336, + "step": 5446 + }, + { + "epoch": 0.1950686697584472, + "grad_norm": 1.9337714910507202, + "learning_rate": 0.00018604732579496908, + "loss": 1.5009, + "step": 5447 + }, + { + "epoch": 0.19510448188801546, + "grad_norm": 1.4915785789489746, + "learning_rate": 0.0001860414155700319, + "loss": 1.552, + "step": 5448 + }, + { + "epoch": 0.19514029401758376, + "grad_norm": 1.450589656829834, + "learning_rate": 0.00018603550418751845, + "loss": 1.4491, + "step": 5449 + }, + { + "epoch": 0.19517610614715203, + "grad_norm": 2.733760118484497, + "learning_rate": 0.0001860295916475083, + "loss": 1.539, + "step": 5450 + }, + { + "epoch": 0.19521191827672033, + "grad_norm": 1.459010362625122, + "learning_rate": 0.00018602367795008093, + "loss": 1.7806, + "step": 5451 + }, + { + "epoch": 0.19524773040628862, + "grad_norm": 1.4586896896362305, + "learning_rate": 0.00018601776309531593, + "loss": 1.4384, + "step": 5452 + }, + { + "epoch": 0.1952835425358569, + "grad_norm": 1.6691211462020874, + "learning_rate": 0.00018601184708329292, + "loss": 1.5412, + "step": 5453 + }, + { + "epoch": 0.1953193546654252, + "grad_norm": 2.4468443393707275, + "learning_rate": 0.00018600592991409141, + "loss": 1.5801, + "step": 5454 + }, + { + "epoch": 0.19535516679499346, + "grad_norm": 2.1524693965911865, + "learning_rate": 0.00018600001158779108, + "loss": 1.6428, + "step": 5455 + }, + { + "epoch": 0.19539097892456175, + "grad_norm": 1.6459256410598755, + "learning_rate": 0.00018599409210447152, + "loss": 1.6684, + "step": 5456 + }, + { + "epoch": 0.19542679105413002, + "grad_norm": 1.3564072847366333, + "learning_rate": 0.0001859881714642124, + "loss": 1.6664, + "step": 5457 + }, + { + "epoch": 0.19546260318369832, + "grad_norm": 1.9026246070861816, + "learning_rate": 0.00018598224966709332, + "loss": 1.6176, + "step": 5458 + }, + { + "epoch": 0.19549841531326662, + "grad_norm": 1.4654345512390137, + "learning_rate": 0.00018597632671319398, + "loss": 1.2719, + "step": 5459 + }, + { + "epoch": 0.19553422744283488, + "grad_norm": 1.4894424676895142, + "learning_rate": 0.0001859704026025941, + "loss": 1.519, + "step": 5460 + }, + { + "epoch": 0.19557003957240318, + "grad_norm": 1.355757713317871, + "learning_rate": 0.0001859644773353733, + "loss": 1.4525, + "step": 5461 + }, + { + "epoch": 0.19560585170197145, + "grad_norm": 1.8707811832427979, + "learning_rate": 0.00018595855091161137, + "loss": 1.4504, + "step": 5462 + }, + { + "epoch": 0.19564166383153975, + "grad_norm": 1.3696109056472778, + "learning_rate": 0.00018595262333138802, + "loss": 1.7364, + "step": 5463 + }, + { + "epoch": 0.19567747596110802, + "grad_norm": 1.756722092628479, + "learning_rate": 0.000185946694594783, + "loss": 1.9198, + "step": 5464 + }, + { + "epoch": 0.1957132880906763, + "grad_norm": 1.6089026927947998, + "learning_rate": 0.000185940764701876, + "loss": 1.4237, + "step": 5465 + }, + { + "epoch": 0.1957491002202446, + "grad_norm": 2.058664560317993, + "learning_rate": 0.00018593483365274694, + "loss": 1.4842, + "step": 5466 + }, + { + "epoch": 0.19578491234981288, + "grad_norm": 1.3057408332824707, + "learning_rate": 0.00018592890144747553, + "loss": 1.4789, + "step": 5467 + }, + { + "epoch": 0.19582072447938118, + "grad_norm": 2.4167938232421875, + "learning_rate": 0.00018592296808614156, + "loss": 1.9637, + "step": 5468 + }, + { + "epoch": 0.19585653660894944, + "grad_norm": 1.7090920209884644, + "learning_rate": 0.0001859170335688249, + "loss": 1.3604, + "step": 5469 + }, + { + "epoch": 0.19589234873851774, + "grad_norm": 1.6871206760406494, + "learning_rate": 0.0001859110978956054, + "loss": 1.3423, + "step": 5470 + }, + { + "epoch": 0.195928160868086, + "grad_norm": 1.3249101638793945, + "learning_rate": 0.00018590516106656288, + "loss": 1.6105, + "step": 5471 + }, + { + "epoch": 0.1959639729976543, + "grad_norm": 2.58074688911438, + "learning_rate": 0.00018589922308177723, + "loss": 1.5327, + "step": 5472 + }, + { + "epoch": 0.1959997851272226, + "grad_norm": 1.6249611377716064, + "learning_rate": 0.0001858932839413283, + "loss": 1.4159, + "step": 5473 + }, + { + "epoch": 0.19603559725679087, + "grad_norm": 1.391710877418518, + "learning_rate": 0.0001858873436452961, + "loss": 1.2536, + "step": 5474 + }, + { + "epoch": 0.19607140938635917, + "grad_norm": 1.7863149642944336, + "learning_rate": 0.0001858814021937604, + "loss": 1.2555, + "step": 5475 + }, + { + "epoch": 0.19610722151592744, + "grad_norm": 2.6330697536468506, + "learning_rate": 0.0001858754595868013, + "loss": 1.3862, + "step": 5476 + }, + { + "epoch": 0.19614303364549573, + "grad_norm": 1.5848230123519897, + "learning_rate": 0.0001858695158244986, + "loss": 1.36, + "step": 5477 + }, + { + "epoch": 0.196178845775064, + "grad_norm": 2.69571852684021, + "learning_rate": 0.00018586357090693233, + "loss": 2.119, + "step": 5478 + }, + { + "epoch": 0.1962146579046323, + "grad_norm": 1.6064741611480713, + "learning_rate": 0.0001858576248341825, + "loss": 1.4919, + "step": 5479 + }, + { + "epoch": 0.1962504700342006, + "grad_norm": 3.198639154434204, + "learning_rate": 0.00018585167760632905, + "loss": 1.6805, + "step": 5480 + }, + { + "epoch": 0.19628628216376887, + "grad_norm": 1.5669358968734741, + "learning_rate": 0.00018584572922345202, + "loss": 1.42, + "step": 5481 + }, + { + "epoch": 0.19632209429333716, + "grad_norm": 1.6380369663238525, + "learning_rate": 0.00018583977968563144, + "loss": 1.4726, + "step": 5482 + }, + { + "epoch": 0.19635790642290543, + "grad_norm": 1.8254414796829224, + "learning_rate": 0.00018583382899294736, + "loss": 1.4393, + "step": 5483 + }, + { + "epoch": 0.19639371855247373, + "grad_norm": 1.700395941734314, + "learning_rate": 0.00018582787714547982, + "loss": 1.3454, + "step": 5484 + }, + { + "epoch": 0.196429530682042, + "grad_norm": 1.4598270654678345, + "learning_rate": 0.0001858219241433089, + "loss": 1.7212, + "step": 5485 + }, + { + "epoch": 0.1964653428116103, + "grad_norm": 1.7634484767913818, + "learning_rate": 0.0001858159699865147, + "loss": 1.47, + "step": 5486 + }, + { + "epoch": 0.1965011549411786, + "grad_norm": 1.5036978721618652, + "learning_rate": 0.00018581001467517734, + "loss": 1.8402, + "step": 5487 + }, + { + "epoch": 0.19653696707074686, + "grad_norm": 2.055551767349243, + "learning_rate": 0.00018580405820937688, + "loss": 2.0814, + "step": 5488 + }, + { + "epoch": 0.19657277920031516, + "grad_norm": 1.3088536262512207, + "learning_rate": 0.0001857981005891935, + "loss": 1.5177, + "step": 5489 + }, + { + "epoch": 0.19660859132988343, + "grad_norm": 1.9697926044464111, + "learning_rate": 0.00018579214181470736, + "loss": 1.3903, + "step": 5490 + }, + { + "epoch": 0.19664440345945172, + "grad_norm": 1.503617286682129, + "learning_rate": 0.00018578618188599863, + "loss": 1.5119, + "step": 5491 + }, + { + "epoch": 0.19668021558902, + "grad_norm": 2.178367853164673, + "learning_rate": 0.00018578022080314747, + "loss": 1.3386, + "step": 5492 + }, + { + "epoch": 0.1967160277185883, + "grad_norm": 1.7492157220840454, + "learning_rate": 0.00018577425856623408, + "loss": 1.5961, + "step": 5493 + }, + { + "epoch": 0.19675183984815658, + "grad_norm": 1.7134050130844116, + "learning_rate": 0.00018576829517533868, + "loss": 1.3217, + "step": 5494 + }, + { + "epoch": 0.19678765197772485, + "grad_norm": 1.4412091970443726, + "learning_rate": 0.00018576233063054151, + "loss": 1.5938, + "step": 5495 + }, + { + "epoch": 0.19682346410729315, + "grad_norm": 1.636104702949524, + "learning_rate": 0.00018575636493192282, + "loss": 1.8974, + "step": 5496 + }, + { + "epoch": 0.19685927623686142, + "grad_norm": 1.2458416223526, + "learning_rate": 0.00018575039807956282, + "loss": 1.6478, + "step": 5497 + }, + { + "epoch": 0.19689508836642972, + "grad_norm": 2.1648898124694824, + "learning_rate": 0.00018574443007354186, + "loss": 1.4979, + "step": 5498 + }, + { + "epoch": 0.19693090049599798, + "grad_norm": 1.4280565977096558, + "learning_rate": 0.00018573846091394017, + "loss": 1.4956, + "step": 5499 + }, + { + "epoch": 0.19696671262556628, + "grad_norm": 1.6333988904953003, + "learning_rate": 0.00018573249060083812, + "loss": 1.5715, + "step": 5500 + }, + { + "epoch": 0.19700252475513455, + "grad_norm": 1.6430987119674683, + "learning_rate": 0.00018572651913431596, + "loss": 1.699, + "step": 5501 + }, + { + "epoch": 0.19703833688470285, + "grad_norm": 1.433403730392456, + "learning_rate": 0.00018572054651445408, + "loss": 1.5459, + "step": 5502 + }, + { + "epoch": 0.19707414901427114, + "grad_norm": 2.47914981842041, + "learning_rate": 0.00018571457274133279, + "loss": 1.7359, + "step": 5503 + }, + { + "epoch": 0.1971099611438394, + "grad_norm": 1.688639760017395, + "learning_rate": 0.0001857085978150325, + "loss": 1.7368, + "step": 5504 + }, + { + "epoch": 0.1971457732734077, + "grad_norm": 1.4147371053695679, + "learning_rate": 0.0001857026217356336, + "loss": 1.4653, + "step": 5505 + }, + { + "epoch": 0.19718158540297598, + "grad_norm": 1.7496812343597412, + "learning_rate": 0.00018569664450321645, + "loss": 1.5381, + "step": 5506 + }, + { + "epoch": 0.19721739753254428, + "grad_norm": 1.178428292274475, + "learning_rate": 0.00018569066611786152, + "loss": 1.559, + "step": 5507 + }, + { + "epoch": 0.19725320966211254, + "grad_norm": 1.813152551651001, + "learning_rate": 0.00018568468657964918, + "loss": 1.4128, + "step": 5508 + }, + { + "epoch": 0.19728902179168084, + "grad_norm": 1.3909872770309448, + "learning_rate": 0.00018567870588865994, + "loss": 1.6463, + "step": 5509 + }, + { + "epoch": 0.19732483392124914, + "grad_norm": 1.5748597383499146, + "learning_rate": 0.0001856727240449742, + "loss": 1.7144, + "step": 5510 + }, + { + "epoch": 0.1973606460508174, + "grad_norm": 1.5605316162109375, + "learning_rate": 0.0001856667410486725, + "loss": 1.8678, + "step": 5511 + }, + { + "epoch": 0.1973964581803857, + "grad_norm": 1.2146022319793701, + "learning_rate": 0.00018566075689983527, + "loss": 1.7029, + "step": 5512 + }, + { + "epoch": 0.19743227030995397, + "grad_norm": 1.2470415830612183, + "learning_rate": 0.00018565477159854306, + "loss": 1.7266, + "step": 5513 + }, + { + "epoch": 0.19746808243952227, + "grad_norm": 1.8456315994262695, + "learning_rate": 0.00018564878514487637, + "loss": 1.4474, + "step": 5514 + }, + { + "epoch": 0.19750389456909054, + "grad_norm": 1.6164729595184326, + "learning_rate": 0.0001856427975389158, + "loss": 1.8401, + "step": 5515 + }, + { + "epoch": 0.19753970669865883, + "grad_norm": 2.2162938117980957, + "learning_rate": 0.00018563680878074182, + "loss": 1.4896, + "step": 5516 + }, + { + "epoch": 0.19757551882822713, + "grad_norm": 1.71847403049469, + "learning_rate": 0.00018563081887043505, + "loss": 1.5033, + "step": 5517 + }, + { + "epoch": 0.1976113309577954, + "grad_norm": 1.6429033279418945, + "learning_rate": 0.00018562482780807606, + "loss": 1.4215, + "step": 5518 + }, + { + "epoch": 0.1976471430873637, + "grad_norm": 1.78325355052948, + "learning_rate": 0.00018561883559374548, + "loss": 1.4578, + "step": 5519 + }, + { + "epoch": 0.19768295521693197, + "grad_norm": 2.1721127033233643, + "learning_rate": 0.0001856128422275239, + "loss": 1.6051, + "step": 5520 + }, + { + "epoch": 0.19771876734650026, + "grad_norm": 1.7439571619033813, + "learning_rate": 0.00018560684770949198, + "loss": 1.6344, + "step": 5521 + }, + { + "epoch": 0.19775457947606853, + "grad_norm": 2.096254348754883, + "learning_rate": 0.0001856008520397303, + "loss": 1.6342, + "step": 5522 + }, + { + "epoch": 0.19779039160563683, + "grad_norm": 2.046941041946411, + "learning_rate": 0.00018559485521831958, + "loss": 1.6274, + "step": 5523 + }, + { + "epoch": 0.19782620373520513, + "grad_norm": 1.634363055229187, + "learning_rate": 0.00018558885724534054, + "loss": 1.4636, + "step": 5524 + }, + { + "epoch": 0.1978620158647734, + "grad_norm": 1.593102216720581, + "learning_rate": 0.00018558285812087378, + "loss": 1.6564, + "step": 5525 + }, + { + "epoch": 0.1978978279943417, + "grad_norm": 1.5818394422531128, + "learning_rate": 0.0001855768578450001, + "loss": 1.6222, + "step": 5526 + }, + { + "epoch": 0.19793364012390996, + "grad_norm": 1.5421195030212402, + "learning_rate": 0.00018557085641780018, + "loss": 1.6816, + "step": 5527 + }, + { + "epoch": 0.19796945225347826, + "grad_norm": 1.7133044004440308, + "learning_rate": 0.0001855648538393547, + "loss": 1.7907, + "step": 5528 + }, + { + "epoch": 0.19800526438304653, + "grad_norm": 3.0813517570495605, + "learning_rate": 0.00018555885010974454, + "loss": 1.4835, + "step": 5529 + }, + { + "epoch": 0.19804107651261482, + "grad_norm": 2.0917038917541504, + "learning_rate": 0.00018555284522905042, + "loss": 1.6455, + "step": 5530 + }, + { + "epoch": 0.19807688864218312, + "grad_norm": 1.5909149646759033, + "learning_rate": 0.00018554683919735313, + "loss": 1.6325, + "step": 5531 + }, + { + "epoch": 0.1981127007717514, + "grad_norm": 1.382629156112671, + "learning_rate": 0.0001855408320147334, + "loss": 1.5228, + "step": 5532 + }, + { + "epoch": 0.19814851290131968, + "grad_norm": 1.8413569927215576, + "learning_rate": 0.00018553482368127217, + "loss": 1.6239, + "step": 5533 + }, + { + "epoch": 0.19818432503088795, + "grad_norm": 1.6792410612106323, + "learning_rate": 0.0001855288141970502, + "loss": 1.5107, + "step": 5534 + }, + { + "epoch": 0.19822013716045625, + "grad_norm": 1.9996005296707153, + "learning_rate": 0.00018552280356214838, + "loss": 1.5626, + "step": 5535 + }, + { + "epoch": 0.19825594929002452, + "grad_norm": 1.782104253768921, + "learning_rate": 0.00018551679177664755, + "loss": 1.4933, + "step": 5536 + }, + { + "epoch": 0.19829176141959282, + "grad_norm": 1.5002424716949463, + "learning_rate": 0.0001855107788406286, + "loss": 1.6503, + "step": 5537 + }, + { + "epoch": 0.1983275735491611, + "grad_norm": 1.3976023197174072, + "learning_rate": 0.0001855047647541724, + "loss": 1.5177, + "step": 5538 + }, + { + "epoch": 0.19836338567872938, + "grad_norm": 1.560597538948059, + "learning_rate": 0.00018549874951735988, + "loss": 1.7587, + "step": 5539 + }, + { + "epoch": 0.19839919780829768, + "grad_norm": 1.5787992477416992, + "learning_rate": 0.00018549273313027198, + "loss": 1.5259, + "step": 5540 + }, + { + "epoch": 0.19843500993786595, + "grad_norm": 1.5140466690063477, + "learning_rate": 0.00018548671559298963, + "loss": 1.6429, + "step": 5541 + }, + { + "epoch": 0.19847082206743424, + "grad_norm": 1.7304792404174805, + "learning_rate": 0.00018548069690559383, + "loss": 1.6498, + "step": 5542 + }, + { + "epoch": 0.1985066341970025, + "grad_norm": 2.0761163234710693, + "learning_rate": 0.00018547467706816546, + "loss": 1.4024, + "step": 5543 + }, + { + "epoch": 0.1985424463265708, + "grad_norm": 1.838121771812439, + "learning_rate": 0.00018546865608078559, + "loss": 1.7005, + "step": 5544 + }, + { + "epoch": 0.1985782584561391, + "grad_norm": 1.4359781742095947, + "learning_rate": 0.0001854626339435352, + "loss": 1.7055, + "step": 5545 + }, + { + "epoch": 0.19861407058570738, + "grad_norm": 1.474090576171875, + "learning_rate": 0.0001854566106564953, + "loss": 1.4552, + "step": 5546 + }, + { + "epoch": 0.19864988271527567, + "grad_norm": 1.9340765476226807, + "learning_rate": 0.00018545058621974693, + "loss": 1.5708, + "step": 5547 + }, + { + "epoch": 0.19868569484484394, + "grad_norm": 1.570233941078186, + "learning_rate": 0.00018544456063337116, + "loss": 1.4817, + "step": 5548 + }, + { + "epoch": 0.19872150697441224, + "grad_norm": 1.671566128730774, + "learning_rate": 0.00018543853389744905, + "loss": 1.593, + "step": 5549 + }, + { + "epoch": 0.1987573191039805, + "grad_norm": 1.457958459854126, + "learning_rate": 0.00018543250601206165, + "loss": 1.7928, + "step": 5550 + }, + { + "epoch": 0.1987931312335488, + "grad_norm": 1.4195009469985962, + "learning_rate": 0.00018542647697729009, + "loss": 1.4611, + "step": 5551 + }, + { + "epoch": 0.1988289433631171, + "grad_norm": 1.9061682224273682, + "learning_rate": 0.00018542044679321549, + "loss": 1.582, + "step": 5552 + }, + { + "epoch": 0.19886475549268537, + "grad_norm": 1.399892807006836, + "learning_rate": 0.00018541441545991892, + "loss": 1.5558, + "step": 5553 + }, + { + "epoch": 0.19890056762225367, + "grad_norm": 1.7510415315628052, + "learning_rate": 0.00018540838297748162, + "loss": 1.3359, + "step": 5554 + }, + { + "epoch": 0.19893637975182193, + "grad_norm": 1.2389678955078125, + "learning_rate": 0.0001854023493459847, + "loss": 1.3077, + "step": 5555 + }, + { + "epoch": 0.19897219188139023, + "grad_norm": 1.1171600818634033, + "learning_rate": 0.00018539631456550927, + "loss": 1.1792, + "step": 5556 + }, + { + "epoch": 0.1990080040109585, + "grad_norm": 1.5652157068252563, + "learning_rate": 0.00018539027863613664, + "loss": 1.5377, + "step": 5557 + }, + { + "epoch": 0.1990438161405268, + "grad_norm": 1.341874361038208, + "learning_rate": 0.0001853842415579479, + "loss": 1.6419, + "step": 5558 + }, + { + "epoch": 0.1990796282700951, + "grad_norm": 2.2000255584716797, + "learning_rate": 0.0001853782033310244, + "loss": 1.5106, + "step": 5559 + }, + { + "epoch": 0.19911544039966336, + "grad_norm": 1.4743818044662476, + "learning_rate": 0.00018537216395544723, + "loss": 1.5688, + "step": 5560 + }, + { + "epoch": 0.19915125252923166, + "grad_norm": 2.2175040245056152, + "learning_rate": 0.00018536612343129778, + "loss": 1.505, + "step": 5561 + }, + { + "epoch": 0.19918706465879993, + "grad_norm": 1.6958733797073364, + "learning_rate": 0.0001853600817586572, + "loss": 1.762, + "step": 5562 + }, + { + "epoch": 0.19922287678836822, + "grad_norm": 1.9428437948226929, + "learning_rate": 0.00018535403893760684, + "loss": 1.6962, + "step": 5563 + }, + { + "epoch": 0.1992586889179365, + "grad_norm": 1.8468763828277588, + "learning_rate": 0.00018534799496822802, + "loss": 1.464, + "step": 5564 + }, + { + "epoch": 0.1992945010475048, + "grad_norm": 1.587731122970581, + "learning_rate": 0.00018534194985060198, + "loss": 1.6521, + "step": 5565 + }, + { + "epoch": 0.1993303131770731, + "grad_norm": 1.6171441078186035, + "learning_rate": 0.0001853359035848101, + "loss": 1.6767, + "step": 5566 + }, + { + "epoch": 0.19936612530664136, + "grad_norm": 2.446106195449829, + "learning_rate": 0.0001853298561709337, + "loss": 1.5723, + "step": 5567 + }, + { + "epoch": 0.19940193743620965, + "grad_norm": 1.455934762954712, + "learning_rate": 0.0001853238076090542, + "loss": 1.6693, + "step": 5568 + }, + { + "epoch": 0.19943774956577792, + "grad_norm": 1.3956012725830078, + "learning_rate": 0.00018531775789925288, + "loss": 1.5087, + "step": 5569 + }, + { + "epoch": 0.19947356169534622, + "grad_norm": 1.2480905055999756, + "learning_rate": 0.00018531170704161117, + "loss": 1.4799, + "step": 5570 + }, + { + "epoch": 0.1995093738249145, + "grad_norm": 2.1647627353668213, + "learning_rate": 0.00018530565503621052, + "loss": 1.5694, + "step": 5571 + }, + { + "epoch": 0.19954518595448278, + "grad_norm": 2.104330062866211, + "learning_rate": 0.00018529960188313233, + "loss": 1.3568, + "step": 5572 + }, + { + "epoch": 0.19958099808405108, + "grad_norm": 1.5048644542694092, + "learning_rate": 0.000185293547582458, + "loss": 1.4253, + "step": 5573 + }, + { + "epoch": 0.19961681021361935, + "grad_norm": 1.904709815979004, + "learning_rate": 0.000185287492134269, + "loss": 1.7133, + "step": 5574 + }, + { + "epoch": 0.19965262234318765, + "grad_norm": 2.1871891021728516, + "learning_rate": 0.00018528143553864682, + "loss": 1.4204, + "step": 5575 + }, + { + "epoch": 0.19968843447275592, + "grad_norm": 1.9296875, + "learning_rate": 0.00018527537779567294, + "loss": 1.8226, + "step": 5576 + }, + { + "epoch": 0.1997242466023242, + "grad_norm": 1.6656574010849, + "learning_rate": 0.00018526931890542882, + "loss": 1.4225, + "step": 5577 + }, + { + "epoch": 0.19976005873189248, + "grad_norm": 1.660765290260315, + "learning_rate": 0.00018526325886799601, + "loss": 1.3672, + "step": 5578 + }, + { + "epoch": 0.19979587086146078, + "grad_norm": 1.7996066808700562, + "learning_rate": 0.00018525719768345606, + "loss": 1.7538, + "step": 5579 + }, + { + "epoch": 0.19983168299102907, + "grad_norm": 2.2600836753845215, + "learning_rate": 0.00018525113535189047, + "loss": 2.0307, + "step": 5580 + }, + { + "epoch": 0.19986749512059734, + "grad_norm": 1.627731204032898, + "learning_rate": 0.00018524507187338082, + "loss": 1.7304, + "step": 5581 + }, + { + "epoch": 0.19990330725016564, + "grad_norm": 1.4446642398834229, + "learning_rate": 0.00018523900724800872, + "loss": 1.4868, + "step": 5582 + }, + { + "epoch": 0.1999391193797339, + "grad_norm": 1.7544971704483032, + "learning_rate": 0.00018523294147585568, + "loss": 1.373, + "step": 5583 + }, + { + "epoch": 0.1999749315093022, + "grad_norm": 1.6773810386657715, + "learning_rate": 0.00018522687455700337, + "loss": 1.5266, + "step": 5584 + }, + { + "epoch": 0.20001074363887048, + "grad_norm": 1.7085083723068237, + "learning_rate": 0.0001852208064915334, + "loss": 1.6517, + "step": 5585 + }, + { + "epoch": 0.20004655576843877, + "grad_norm": 1.6179802417755127, + "learning_rate": 0.00018521473727952742, + "loss": 1.577, + "step": 5586 + }, + { + "epoch": 0.20008236789800707, + "grad_norm": 1.6990835666656494, + "learning_rate": 0.00018520866692106703, + "loss": 1.7635, + "step": 5587 + }, + { + "epoch": 0.20011818002757534, + "grad_norm": 1.5353055000305176, + "learning_rate": 0.00018520259541623398, + "loss": 1.6554, + "step": 5588 + }, + { + "epoch": 0.20015399215714363, + "grad_norm": 1.858586072921753, + "learning_rate": 0.0001851965227651099, + "loss": 1.5469, + "step": 5589 + }, + { + "epoch": 0.2001898042867119, + "grad_norm": 1.5021321773529053, + "learning_rate": 0.00018519044896777648, + "loss": 1.8475, + "step": 5590 + }, + { + "epoch": 0.2002256164162802, + "grad_norm": 1.355085015296936, + "learning_rate": 0.0001851843740243155, + "loss": 1.6449, + "step": 5591 + }, + { + "epoch": 0.20026142854584847, + "grad_norm": 1.4706250429153442, + "learning_rate": 0.00018517829793480861, + "loss": 1.6929, + "step": 5592 + }, + { + "epoch": 0.20029724067541677, + "grad_norm": 2.08127498626709, + "learning_rate": 0.0001851722206993376, + "loss": 1.5465, + "step": 5593 + }, + { + "epoch": 0.20033305280498506, + "grad_norm": 2.2466256618499756, + "learning_rate": 0.00018516614231798423, + "loss": 1.8191, + "step": 5594 + }, + { + "epoch": 0.20036886493455333, + "grad_norm": 1.2826673984527588, + "learning_rate": 0.00018516006279083026, + "loss": 1.7799, + "step": 5595 + }, + { + "epoch": 0.20040467706412163, + "grad_norm": 1.394559621810913, + "learning_rate": 0.0001851539821179575, + "loss": 1.4522, + "step": 5596 + }, + { + "epoch": 0.2004404891936899, + "grad_norm": 1.4629602432250977, + "learning_rate": 0.00018514790029944777, + "loss": 1.7201, + "step": 5597 + }, + { + "epoch": 0.2004763013232582, + "grad_norm": 1.7223575115203857, + "learning_rate": 0.00018514181733538285, + "loss": 1.4502, + "step": 5598 + }, + { + "epoch": 0.20051211345282646, + "grad_norm": 1.4611260890960693, + "learning_rate": 0.00018513573322584463, + "loss": 1.6936, + "step": 5599 + }, + { + "epoch": 0.20054792558239476, + "grad_norm": 1.4663982391357422, + "learning_rate": 0.0001851296479709149, + "loss": 1.7965, + "step": 5600 + }, + { + "epoch": 0.20058373771196303, + "grad_norm": 1.651157259941101, + "learning_rate": 0.00018512356157067558, + "loss": 1.5611, + "step": 5601 + }, + { + "epoch": 0.20061954984153132, + "grad_norm": 1.538913369178772, + "learning_rate": 0.00018511747402520857, + "loss": 1.1651, + "step": 5602 + }, + { + "epoch": 0.20065536197109962, + "grad_norm": 1.3776566982269287, + "learning_rate": 0.0001851113853345957, + "loss": 1.676, + "step": 5603 + }, + { + "epoch": 0.2006911741006679, + "grad_norm": 1.6608750820159912, + "learning_rate": 0.00018510529549891895, + "loss": 1.6722, + "step": 5604 + }, + { + "epoch": 0.2007269862302362, + "grad_norm": 2.144692897796631, + "learning_rate": 0.00018509920451826022, + "loss": 1.2254, + "step": 5605 + }, + { + "epoch": 0.20076279835980446, + "grad_norm": 1.9433279037475586, + "learning_rate": 0.00018509311239270145, + "loss": 1.4043, + "step": 5606 + }, + { + "epoch": 0.20079861048937275, + "grad_norm": 1.8131729364395142, + "learning_rate": 0.00018508701912232464, + "loss": 1.4766, + "step": 5607 + }, + { + "epoch": 0.20083442261894102, + "grad_norm": 1.691941261291504, + "learning_rate": 0.00018508092470721175, + "loss": 1.3085, + "step": 5608 + }, + { + "epoch": 0.20087023474850932, + "grad_norm": 1.7415412664413452, + "learning_rate": 0.0001850748291474447, + "loss": 1.6581, + "step": 5609 + }, + { + "epoch": 0.20090604687807762, + "grad_norm": 1.5415544509887695, + "learning_rate": 0.00018506873244310563, + "loss": 1.3757, + "step": 5610 + }, + { + "epoch": 0.20094185900764588, + "grad_norm": 1.8270162343978882, + "learning_rate": 0.00018506263459427648, + "loss": 1.4954, + "step": 5611 + }, + { + "epoch": 0.20097767113721418, + "grad_norm": 1.324442982673645, + "learning_rate": 0.00018505653560103928, + "loss": 1.5829, + "step": 5612 + }, + { + "epoch": 0.20101348326678245, + "grad_norm": 1.567414402961731, + "learning_rate": 0.00018505043546347612, + "loss": 1.3768, + "step": 5613 + }, + { + "epoch": 0.20104929539635075, + "grad_norm": 1.504832148551941, + "learning_rate": 0.00018504433418166908, + "loss": 1.4962, + "step": 5614 + }, + { + "epoch": 0.20108510752591902, + "grad_norm": 1.508852481842041, + "learning_rate": 0.00018503823175570021, + "loss": 1.6095, + "step": 5615 + }, + { + "epoch": 0.2011209196554873, + "grad_norm": 1.5696232318878174, + "learning_rate": 0.00018503212818565161, + "loss": 1.3963, + "step": 5616 + }, + { + "epoch": 0.2011567317850556, + "grad_norm": 1.629343867301941, + "learning_rate": 0.00018502602347160544, + "loss": 1.4996, + "step": 5617 + }, + { + "epoch": 0.20119254391462388, + "grad_norm": 1.793253779411316, + "learning_rate": 0.00018501991761364376, + "loss": 1.4764, + "step": 5618 + }, + { + "epoch": 0.20122835604419217, + "grad_norm": 1.5109243392944336, + "learning_rate": 0.00018501381061184876, + "loss": 1.71, + "step": 5619 + }, + { + "epoch": 0.20126416817376044, + "grad_norm": 1.6845890283584595, + "learning_rate": 0.0001850077024663026, + "loss": 1.411, + "step": 5620 + }, + { + "epoch": 0.20129998030332874, + "grad_norm": 1.3381202220916748, + "learning_rate": 0.00018500159317708749, + "loss": 1.9312, + "step": 5621 + }, + { + "epoch": 0.201335792432897, + "grad_norm": 1.788730502128601, + "learning_rate": 0.00018499548274428557, + "loss": 1.4369, + "step": 5622 + }, + { + "epoch": 0.2013716045624653, + "grad_norm": 1.5795271396636963, + "learning_rate": 0.00018498937116797904, + "loss": 1.4818, + "step": 5623 + }, + { + "epoch": 0.2014074166920336, + "grad_norm": 1.4016799926757812, + "learning_rate": 0.0001849832584482502, + "loss": 1.6474, + "step": 5624 + }, + { + "epoch": 0.20144322882160187, + "grad_norm": 1.7416629791259766, + "learning_rate": 0.00018497714458518122, + "loss": 1.6052, + "step": 5625 + }, + { + "epoch": 0.20147904095117017, + "grad_norm": 1.3918477296829224, + "learning_rate": 0.00018497102957885434, + "loss": 1.349, + "step": 5626 + }, + { + "epoch": 0.20151485308073844, + "grad_norm": 1.4549763202667236, + "learning_rate": 0.0001849649134293519, + "loss": 1.5576, + "step": 5627 + }, + { + "epoch": 0.20155066521030673, + "grad_norm": 1.955651044845581, + "learning_rate": 0.00018495879613675612, + "loss": 1.4563, + "step": 5628 + }, + { + "epoch": 0.201586477339875, + "grad_norm": 1.9450856447219849, + "learning_rate": 0.00018495267770114935, + "loss": 1.3856, + "step": 5629 + }, + { + "epoch": 0.2016222894694433, + "grad_norm": 1.2987256050109863, + "learning_rate": 0.00018494655812261387, + "loss": 1.5538, + "step": 5630 + }, + { + "epoch": 0.2016581015990116, + "grad_norm": 1.7535088062286377, + "learning_rate": 0.00018494043740123202, + "loss": 1.7828, + "step": 5631 + }, + { + "epoch": 0.20169391372857987, + "grad_norm": 1.5175572633743286, + "learning_rate": 0.00018493431553708614, + "loss": 1.6587, + "step": 5632 + }, + { + "epoch": 0.20172972585814816, + "grad_norm": 1.82554292678833, + "learning_rate": 0.0001849281925302586, + "loss": 1.6141, + "step": 5633 + }, + { + "epoch": 0.20176553798771643, + "grad_norm": 1.5592626333236694, + "learning_rate": 0.0001849220683808318, + "loss": 1.5767, + "step": 5634 + }, + { + "epoch": 0.20180135011728473, + "grad_norm": 1.3777804374694824, + "learning_rate": 0.00018491594308888814, + "loss": 1.4966, + "step": 5635 + }, + { + "epoch": 0.201837162246853, + "grad_norm": 1.6428275108337402, + "learning_rate": 0.00018490981665450994, + "loss": 1.6563, + "step": 5636 + }, + { + "epoch": 0.2018729743764213, + "grad_norm": 1.416022777557373, + "learning_rate": 0.00018490368907777974, + "loss": 1.8694, + "step": 5637 + }, + { + "epoch": 0.2019087865059896, + "grad_norm": 1.6186103820800781, + "learning_rate": 0.0001848975603587799, + "loss": 1.5395, + "step": 5638 + }, + { + "epoch": 0.20194459863555786, + "grad_norm": 1.5777395963668823, + "learning_rate": 0.00018489143049759286, + "loss": 1.6282, + "step": 5639 + }, + { + "epoch": 0.20198041076512616, + "grad_norm": 1.8486963510513306, + "learning_rate": 0.00018488529949430116, + "loss": 1.5511, + "step": 5640 + }, + { + "epoch": 0.20201622289469442, + "grad_norm": 2.059964418411255, + "learning_rate": 0.00018487916734898722, + "loss": 1.5742, + "step": 5641 + }, + { + "epoch": 0.20205203502426272, + "grad_norm": 2.6553430557250977, + "learning_rate": 0.0001848730340617336, + "loss": 1.5722, + "step": 5642 + }, + { + "epoch": 0.202087847153831, + "grad_norm": 1.6141916513442993, + "learning_rate": 0.00018486689963262277, + "loss": 1.6907, + "step": 5643 + }, + { + "epoch": 0.2021236592833993, + "grad_norm": 1.7546532154083252, + "learning_rate": 0.00018486076406173726, + "loss": 1.4523, + "step": 5644 + }, + { + "epoch": 0.20215947141296758, + "grad_norm": 1.4749776124954224, + "learning_rate": 0.00018485462734915966, + "loss": 1.5986, + "step": 5645 + }, + { + "epoch": 0.20219528354253585, + "grad_norm": 1.5348141193389893, + "learning_rate": 0.0001848484894949725, + "loss": 1.7698, + "step": 5646 + }, + { + "epoch": 0.20223109567210415, + "grad_norm": 1.4728552103042603, + "learning_rate": 0.00018484235049925836, + "loss": 1.6859, + "step": 5647 + }, + { + "epoch": 0.20226690780167242, + "grad_norm": 2.210272789001465, + "learning_rate": 0.00018483621036209983, + "loss": 1.907, + "step": 5648 + }, + { + "epoch": 0.20230271993124072, + "grad_norm": 1.673674464225769, + "learning_rate": 0.0001848300690835795, + "loss": 1.665, + "step": 5649 + }, + { + "epoch": 0.20233853206080898, + "grad_norm": 1.225216269493103, + "learning_rate": 0.00018482392666378003, + "loss": 1.5294, + "step": 5650 + }, + { + "epoch": 0.20237434419037728, + "grad_norm": 1.159347414970398, + "learning_rate": 0.00018481778310278405, + "loss": 1.4917, + "step": 5651 + }, + { + "epoch": 0.20241015631994558, + "grad_norm": 1.6550319194793701, + "learning_rate": 0.0001848116384006742, + "loss": 1.6163, + "step": 5652 + }, + { + "epoch": 0.20244596844951385, + "grad_norm": 1.5937591791152954, + "learning_rate": 0.00018480549255753313, + "loss": 1.4051, + "step": 5653 + }, + { + "epoch": 0.20248178057908214, + "grad_norm": 1.5911157131195068, + "learning_rate": 0.0001847993455734436, + "loss": 1.6417, + "step": 5654 + }, + { + "epoch": 0.2025175927086504, + "grad_norm": 1.6294200420379639, + "learning_rate": 0.00018479319744848821, + "loss": 1.7678, + "step": 5655 + }, + { + "epoch": 0.2025534048382187, + "grad_norm": 2.7911853790283203, + "learning_rate": 0.00018478704818274976, + "loss": 1.693, + "step": 5656 + }, + { + "epoch": 0.20258921696778698, + "grad_norm": 1.8016701936721802, + "learning_rate": 0.00018478089777631092, + "loss": 1.608, + "step": 5657 + }, + { + "epoch": 0.20262502909735527, + "grad_norm": 1.3736586570739746, + "learning_rate": 0.00018477474622925449, + "loss": 1.5401, + "step": 5658 + }, + { + "epoch": 0.20266084122692357, + "grad_norm": 1.949476718902588, + "learning_rate": 0.00018476859354166317, + "loss": 1.7005, + "step": 5659 + }, + { + "epoch": 0.20269665335649184, + "grad_norm": 1.9096870422363281, + "learning_rate": 0.0001847624397136198, + "loss": 1.6647, + "step": 5660 + }, + { + "epoch": 0.20273246548606014, + "grad_norm": 1.571459412574768, + "learning_rate": 0.0001847562847452071, + "loss": 1.6983, + "step": 5661 + }, + { + "epoch": 0.2027682776156284, + "grad_norm": 1.6884174346923828, + "learning_rate": 0.0001847501286365079, + "loss": 1.4849, + "step": 5662 + }, + { + "epoch": 0.2028040897451967, + "grad_norm": 1.2368390560150146, + "learning_rate": 0.00018474397138760508, + "loss": 1.5239, + "step": 5663 + }, + { + "epoch": 0.20283990187476497, + "grad_norm": 2.1719744205474854, + "learning_rate": 0.00018473781299858146, + "loss": 1.7039, + "step": 5664 + }, + { + "epoch": 0.20287571400433327, + "grad_norm": 1.8494501113891602, + "learning_rate": 0.0001847316534695198, + "loss": 1.8157, + "step": 5665 + }, + { + "epoch": 0.20291152613390157, + "grad_norm": 1.4276621341705322, + "learning_rate": 0.0001847254928005031, + "loss": 1.8173, + "step": 5666 + }, + { + "epoch": 0.20294733826346983, + "grad_norm": 1.520430088043213, + "learning_rate": 0.00018471933099161415, + "loss": 1.6106, + "step": 5667 + }, + { + "epoch": 0.20298315039303813, + "grad_norm": 1.8959459066390991, + "learning_rate": 0.00018471316804293594, + "loss": 1.7846, + "step": 5668 + }, + { + "epoch": 0.2030189625226064, + "grad_norm": 1.4433817863464355, + "learning_rate": 0.00018470700395455125, + "loss": 1.9018, + "step": 5669 + }, + { + "epoch": 0.2030547746521747, + "grad_norm": 1.5954314470291138, + "learning_rate": 0.00018470083872654312, + "loss": 1.2725, + "step": 5670 + }, + { + "epoch": 0.20309058678174297, + "grad_norm": 1.5294626951217651, + "learning_rate": 0.00018469467235899444, + "loss": 1.5162, + "step": 5671 + }, + { + "epoch": 0.20312639891131126, + "grad_norm": 1.519560694694519, + "learning_rate": 0.00018468850485198822, + "loss": 1.5687, + "step": 5672 + }, + { + "epoch": 0.20316221104087956, + "grad_norm": 2.257767915725708, + "learning_rate": 0.00018468233620560739, + "loss": 1.3524, + "step": 5673 + }, + { + "epoch": 0.20319802317044783, + "grad_norm": 1.5276037454605103, + "learning_rate": 0.00018467616641993498, + "loss": 1.6337, + "step": 5674 + }, + { + "epoch": 0.20323383530001612, + "grad_norm": 1.8103950023651123, + "learning_rate": 0.00018466999549505392, + "loss": 1.4483, + "step": 5675 + }, + { + "epoch": 0.2032696474295844, + "grad_norm": 1.5021846294403076, + "learning_rate": 0.00018466382343104734, + "loss": 1.6137, + "step": 5676 + }, + { + "epoch": 0.2033054595591527, + "grad_norm": 1.6001121997833252, + "learning_rate": 0.00018465765022799823, + "loss": 1.3985, + "step": 5677 + }, + { + "epoch": 0.20334127168872096, + "grad_norm": 1.44736647605896, + "learning_rate": 0.00018465147588598958, + "loss": 1.2853, + "step": 5678 + }, + { + "epoch": 0.20337708381828926, + "grad_norm": 1.746927261352539, + "learning_rate": 0.00018464530040510456, + "loss": 1.6917, + "step": 5679 + }, + { + "epoch": 0.20341289594785755, + "grad_norm": 1.9499015808105469, + "learning_rate": 0.0001846391237854262, + "loss": 1.7338, + "step": 5680 + }, + { + "epoch": 0.20344870807742582, + "grad_norm": 1.527923583984375, + "learning_rate": 0.0001846329460270376, + "loss": 1.5749, + "step": 5681 + }, + { + "epoch": 0.20348452020699412, + "grad_norm": 1.8569077253341675, + "learning_rate": 0.0001846267671300219, + "loss": 1.7941, + "step": 5682 + }, + { + "epoch": 0.2035203323365624, + "grad_norm": 1.7201156616210938, + "learning_rate": 0.00018462058709446216, + "loss": 1.7085, + "step": 5683 + }, + { + "epoch": 0.20355614446613068, + "grad_norm": 3.1380867958068848, + "learning_rate": 0.00018461440592044165, + "loss": 1.887, + "step": 5684 + }, + { + "epoch": 0.20359195659569895, + "grad_norm": 1.3634907007217407, + "learning_rate": 0.00018460822360804338, + "loss": 1.6883, + "step": 5685 + }, + { + "epoch": 0.20362776872526725, + "grad_norm": 1.4703376293182373, + "learning_rate": 0.00018460204015735064, + "loss": 1.5107, + "step": 5686 + }, + { + "epoch": 0.20366358085483555, + "grad_norm": 1.7667032480239868, + "learning_rate": 0.00018459585556844656, + "loss": 1.829, + "step": 5687 + }, + { + "epoch": 0.20369939298440382, + "grad_norm": 2.221970796585083, + "learning_rate": 0.00018458966984141438, + "loss": 1.3804, + "step": 5688 + }, + { + "epoch": 0.2037352051139721, + "grad_norm": 2.146343231201172, + "learning_rate": 0.00018458348297633727, + "loss": 1.6312, + "step": 5689 + }, + { + "epoch": 0.20377101724354038, + "grad_norm": 1.7487691640853882, + "learning_rate": 0.00018457729497329853, + "loss": 1.38, + "step": 5690 + }, + { + "epoch": 0.20380682937310868, + "grad_norm": 1.7649887800216675, + "learning_rate": 0.0001845711058323814, + "loss": 1.6627, + "step": 5691 + }, + { + "epoch": 0.20384264150267695, + "grad_norm": 1.3033711910247803, + "learning_rate": 0.0001845649155536691, + "loss": 1.6029, + "step": 5692 + }, + { + "epoch": 0.20387845363224524, + "grad_norm": 1.5455960035324097, + "learning_rate": 0.00018455872413724496, + "loss": 1.573, + "step": 5693 + }, + { + "epoch": 0.2039142657618135, + "grad_norm": 1.7799946069717407, + "learning_rate": 0.00018455253158319225, + "loss": 1.4815, + "step": 5694 + }, + { + "epoch": 0.2039500778913818, + "grad_norm": 1.9104725122451782, + "learning_rate": 0.00018454633789159427, + "loss": 1.684, + "step": 5695 + }, + { + "epoch": 0.2039858900209501, + "grad_norm": 1.4825611114501953, + "learning_rate": 0.0001845401430625344, + "loss": 1.5976, + "step": 5696 + }, + { + "epoch": 0.20402170215051837, + "grad_norm": 1.7644739151000977, + "learning_rate": 0.00018453394709609598, + "loss": 2.0426, + "step": 5697 + }, + { + "epoch": 0.20405751428008667, + "grad_norm": 1.5432175397872925, + "learning_rate": 0.0001845277499923623, + "loss": 1.8283, + "step": 5698 + }, + { + "epoch": 0.20409332640965494, + "grad_norm": 2.520534038543701, + "learning_rate": 0.0001845215517514168, + "loss": 1.7116, + "step": 5699 + }, + { + "epoch": 0.20412913853922324, + "grad_norm": 1.333713173866272, + "learning_rate": 0.0001845153523733428, + "loss": 1.5942, + "step": 5700 + }, + { + "epoch": 0.2041649506687915, + "grad_norm": 1.654269814491272, + "learning_rate": 0.00018450915185822382, + "loss": 1.8312, + "step": 5701 + }, + { + "epoch": 0.2042007627983598, + "grad_norm": 1.8361196517944336, + "learning_rate": 0.00018450295020614317, + "loss": 1.592, + "step": 5702 + }, + { + "epoch": 0.2042365749279281, + "grad_norm": 1.52461576461792, + "learning_rate": 0.00018449674741718433, + "loss": 1.8999, + "step": 5703 + }, + { + "epoch": 0.20427238705749637, + "grad_norm": 1.7082089185714722, + "learning_rate": 0.00018449054349143072, + "loss": 1.4507, + "step": 5704 + }, + { + "epoch": 0.20430819918706467, + "grad_norm": 1.8044872283935547, + "learning_rate": 0.0001844843384289659, + "loss": 1.9495, + "step": 5705 + }, + { + "epoch": 0.20434401131663293, + "grad_norm": 2.0654454231262207, + "learning_rate": 0.00018447813222987323, + "loss": 2.0662, + "step": 5706 + }, + { + "epoch": 0.20437982344620123, + "grad_norm": 2.090245008468628, + "learning_rate": 0.00018447192489423625, + "loss": 1.7459, + "step": 5707 + }, + { + "epoch": 0.2044156355757695, + "grad_norm": 1.994872808456421, + "learning_rate": 0.00018446571642213852, + "loss": 2.0094, + "step": 5708 + }, + { + "epoch": 0.2044514477053378, + "grad_norm": 1.3152283430099487, + "learning_rate": 0.0001844595068136635, + "loss": 1.6219, + "step": 5709 + }, + { + "epoch": 0.2044872598349061, + "grad_norm": 1.7344331741333008, + "learning_rate": 0.0001844532960688948, + "loss": 1.5429, + "step": 5710 + }, + { + "epoch": 0.20452307196447436, + "grad_norm": 1.7377667427062988, + "learning_rate": 0.0001844470841879159, + "loss": 1.6329, + "step": 5711 + }, + { + "epoch": 0.20455888409404266, + "grad_norm": 1.2515347003936768, + "learning_rate": 0.00018444087117081042, + "loss": 1.5964, + "step": 5712 + }, + { + "epoch": 0.20459469622361093, + "grad_norm": 1.538106918334961, + "learning_rate": 0.00018443465701766196, + "loss": 1.323, + "step": 5713 + }, + { + "epoch": 0.20463050835317922, + "grad_norm": 1.9756258726119995, + "learning_rate": 0.0001844284417285541, + "loss": 1.729, + "step": 5714 + }, + { + "epoch": 0.2046663204827475, + "grad_norm": 1.4631593227386475, + "learning_rate": 0.00018442222530357043, + "loss": 1.7244, + "step": 5715 + }, + { + "epoch": 0.2047021326123158, + "grad_norm": 1.497786045074463, + "learning_rate": 0.00018441600774279465, + "loss": 1.6359, + "step": 5716 + }, + { + "epoch": 0.2047379447418841, + "grad_norm": 1.5431510210037231, + "learning_rate": 0.00018440978904631032, + "loss": 1.3245, + "step": 5717 + }, + { + "epoch": 0.20477375687145236, + "grad_norm": 1.3295104503631592, + "learning_rate": 0.00018440356921420122, + "loss": 1.5629, + "step": 5718 + }, + { + "epoch": 0.20480956900102065, + "grad_norm": 1.2782552242279053, + "learning_rate": 0.00018439734824655092, + "loss": 1.3756, + "step": 5719 + }, + { + "epoch": 0.20484538113058892, + "grad_norm": 1.6534874439239502, + "learning_rate": 0.00018439112614344322, + "loss": 1.3338, + "step": 5720 + }, + { + "epoch": 0.20488119326015722, + "grad_norm": 1.5594265460968018, + "learning_rate": 0.0001843849029049617, + "loss": 1.5184, + "step": 5721 + }, + { + "epoch": 0.2049170053897255, + "grad_norm": 1.6822896003723145, + "learning_rate": 0.00018437867853119023, + "loss": 1.4627, + "step": 5722 + }, + { + "epoch": 0.20495281751929378, + "grad_norm": 2.03576397895813, + "learning_rate": 0.00018437245302221244, + "loss": 1.646, + "step": 5723 + }, + { + "epoch": 0.20498862964886208, + "grad_norm": 1.5040608644485474, + "learning_rate": 0.00018436622637811215, + "loss": 1.596, + "step": 5724 + }, + { + "epoch": 0.20502444177843035, + "grad_norm": 1.8175115585327148, + "learning_rate": 0.0001843599985989731, + "loss": 1.4633, + "step": 5725 + }, + { + "epoch": 0.20506025390799865, + "grad_norm": 2.4232726097106934, + "learning_rate": 0.0001843537696848791, + "loss": 1.5774, + "step": 5726 + }, + { + "epoch": 0.20509606603756692, + "grad_norm": 1.5448403358459473, + "learning_rate": 0.0001843475396359139, + "loss": 1.234, + "step": 5727 + }, + { + "epoch": 0.2051318781671352, + "grad_norm": 1.332551121711731, + "learning_rate": 0.00018434130845216138, + "loss": 1.4809, + "step": 5728 + }, + { + "epoch": 0.20516769029670348, + "grad_norm": 1.351884365081787, + "learning_rate": 0.00018433507613370534, + "loss": 1.569, + "step": 5729 + }, + { + "epoch": 0.20520350242627178, + "grad_norm": 1.5869837999343872, + "learning_rate": 0.00018432884268062964, + "loss": 1.7137, + "step": 5730 + }, + { + "epoch": 0.20523931455584007, + "grad_norm": 1.690000295639038, + "learning_rate": 0.00018432260809301816, + "loss": 1.5449, + "step": 5731 + }, + { + "epoch": 0.20527512668540834, + "grad_norm": 1.3282488584518433, + "learning_rate": 0.00018431637237095472, + "loss": 1.3576, + "step": 5732 + }, + { + "epoch": 0.20531093881497664, + "grad_norm": 2.1671175956726074, + "learning_rate": 0.00018431013551452327, + "loss": 1.5015, + "step": 5733 + }, + { + "epoch": 0.2053467509445449, + "grad_norm": 1.4850959777832031, + "learning_rate": 0.0001843038975238077, + "loss": 1.2529, + "step": 5734 + }, + { + "epoch": 0.2053825630741132, + "grad_norm": 1.4230232238769531, + "learning_rate": 0.00018429765839889193, + "loss": 1.3751, + "step": 5735 + }, + { + "epoch": 0.20541837520368147, + "grad_norm": 1.605311632156372, + "learning_rate": 0.0001842914181398599, + "loss": 1.6034, + "step": 5736 + }, + { + "epoch": 0.20545418733324977, + "grad_norm": 1.7204928398132324, + "learning_rate": 0.00018428517674679557, + "loss": 1.3672, + "step": 5737 + }, + { + "epoch": 0.20548999946281807, + "grad_norm": 1.6784799098968506, + "learning_rate": 0.0001842789342197829, + "loss": 1.5203, + "step": 5738 + }, + { + "epoch": 0.20552581159238634, + "grad_norm": 1.939692497253418, + "learning_rate": 0.00018427269055890588, + "loss": 1.9134, + "step": 5739 + }, + { + "epoch": 0.20556162372195463, + "grad_norm": 1.4052371978759766, + "learning_rate": 0.00018426644576424855, + "loss": 1.5995, + "step": 5740 + }, + { + "epoch": 0.2055974358515229, + "grad_norm": 1.1106784343719482, + "learning_rate": 0.00018426019983589482, + "loss": 1.5371, + "step": 5741 + }, + { + "epoch": 0.2056332479810912, + "grad_norm": 1.7211228609085083, + "learning_rate": 0.00018425395277392882, + "loss": 1.6038, + "step": 5742 + }, + { + "epoch": 0.20566906011065947, + "grad_norm": 1.6395955085754395, + "learning_rate": 0.0001842477045784346, + "loss": 1.4582, + "step": 5743 + }, + { + "epoch": 0.20570487224022777, + "grad_norm": 1.9705967903137207, + "learning_rate": 0.00018424145524949614, + "loss": 1.7143, + "step": 5744 + }, + { + "epoch": 0.20574068436979606, + "grad_norm": 3.6009767055511475, + "learning_rate": 0.00018423520478719758, + "loss": 1.5917, + "step": 5745 + }, + { + "epoch": 0.20577649649936433, + "grad_norm": 1.7975378036499023, + "learning_rate": 0.00018422895319162298, + "loss": 1.4714, + "step": 5746 + }, + { + "epoch": 0.20581230862893263, + "grad_norm": 2.167494535446167, + "learning_rate": 0.0001842227004628565, + "loss": 1.4169, + "step": 5747 + }, + { + "epoch": 0.2058481207585009, + "grad_norm": 2.0247933864593506, + "learning_rate": 0.00018421644660098217, + "loss": 1.6403, + "step": 5748 + }, + { + "epoch": 0.2058839328880692, + "grad_norm": 1.7096855640411377, + "learning_rate": 0.00018421019160608424, + "loss": 1.5459, + "step": 5749 + }, + { + "epoch": 0.20591974501763746, + "grad_norm": 2.594543933868408, + "learning_rate": 0.00018420393547824676, + "loss": 1.4367, + "step": 5750 + }, + { + "epoch": 0.20595555714720576, + "grad_norm": 2.801156520843506, + "learning_rate": 0.000184197678217554, + "loss": 1.5627, + "step": 5751 + }, + { + "epoch": 0.20599136927677406, + "grad_norm": 1.9084726572036743, + "learning_rate": 0.00018419141982409001, + "loss": 1.3815, + "step": 5752 + }, + { + "epoch": 0.20602718140634232, + "grad_norm": 1.531064748764038, + "learning_rate": 0.00018418516029793916, + "loss": 1.639, + "step": 5753 + }, + { + "epoch": 0.20606299353591062, + "grad_norm": 1.4335132837295532, + "learning_rate": 0.00018417889963918548, + "loss": 1.849, + "step": 5754 + }, + { + "epoch": 0.2060988056654789, + "grad_norm": 1.271621584892273, + "learning_rate": 0.00018417263784791335, + "loss": 1.7477, + "step": 5755 + }, + { + "epoch": 0.2061346177950472, + "grad_norm": 1.3262522220611572, + "learning_rate": 0.0001841663749242069, + "loss": 1.7865, + "step": 5756 + }, + { + "epoch": 0.20617042992461546, + "grad_norm": 1.6832629442214966, + "learning_rate": 0.0001841601108681505, + "loss": 1.3944, + "step": 5757 + }, + { + "epoch": 0.20620624205418375, + "grad_norm": 1.295717477798462, + "learning_rate": 0.00018415384567982833, + "loss": 1.6328, + "step": 5758 + }, + { + "epoch": 0.20624205418375205, + "grad_norm": 1.605613350868225, + "learning_rate": 0.0001841475793593247, + "loss": 1.4366, + "step": 5759 + }, + { + "epoch": 0.20627786631332032, + "grad_norm": 1.8435920476913452, + "learning_rate": 0.00018414131190672394, + "loss": 1.5914, + "step": 5760 + }, + { + "epoch": 0.20631367844288862, + "grad_norm": 1.62568199634552, + "learning_rate": 0.00018413504332211037, + "loss": 1.3896, + "step": 5761 + }, + { + "epoch": 0.20634949057245688, + "grad_norm": 1.6363476514816284, + "learning_rate": 0.00018412877360556834, + "loss": 1.624, + "step": 5762 + }, + { + "epoch": 0.20638530270202518, + "grad_norm": 1.4820305109024048, + "learning_rate": 0.00018412250275718218, + "loss": 1.5039, + "step": 5763 + }, + { + "epoch": 0.20642111483159345, + "grad_norm": 1.8757277727127075, + "learning_rate": 0.00018411623077703624, + "loss": 1.7367, + "step": 5764 + }, + { + "epoch": 0.20645692696116175, + "grad_norm": 2.476335287094116, + "learning_rate": 0.0001841099576652149, + "loss": 1.8845, + "step": 5765 + }, + { + "epoch": 0.20649273909073004, + "grad_norm": 1.4801583290100098, + "learning_rate": 0.00018410368342180263, + "loss": 1.4245, + "step": 5766 + }, + { + "epoch": 0.2065285512202983, + "grad_norm": 1.6733896732330322, + "learning_rate": 0.00018409740804688373, + "loss": 1.3733, + "step": 5767 + }, + { + "epoch": 0.2065643633498666, + "grad_norm": 1.4926596879959106, + "learning_rate": 0.0001840911315405427, + "loss": 1.4559, + "step": 5768 + }, + { + "epoch": 0.20660017547943488, + "grad_norm": 1.447648048400879, + "learning_rate": 0.00018408485390286397, + "loss": 1.5615, + "step": 5769 + }, + { + "epoch": 0.20663598760900317, + "grad_norm": 1.9112826585769653, + "learning_rate": 0.00018407857513393197, + "loss": 1.665, + "step": 5770 + }, + { + "epoch": 0.20667179973857144, + "grad_norm": 2.436891794204712, + "learning_rate": 0.00018407229523383122, + "loss": 1.681, + "step": 5771 + }, + { + "epoch": 0.20670761186813974, + "grad_norm": 1.7354581356048584, + "learning_rate": 0.00018406601420264618, + "loss": 1.4212, + "step": 5772 + }, + { + "epoch": 0.20674342399770804, + "grad_norm": 1.4902445077896118, + "learning_rate": 0.00018405973204046135, + "loss": 1.6882, + "step": 5773 + }, + { + "epoch": 0.2067792361272763, + "grad_norm": 1.775509238243103, + "learning_rate": 0.00018405344874736126, + "loss": 1.3206, + "step": 5774 + }, + { + "epoch": 0.2068150482568446, + "grad_norm": 1.1771329641342163, + "learning_rate": 0.00018404716432343044, + "loss": 1.4992, + "step": 5775 + }, + { + "epoch": 0.20685086038641287, + "grad_norm": 1.741278052330017, + "learning_rate": 0.0001840408787687534, + "loss": 1.2352, + "step": 5776 + }, + { + "epoch": 0.20688667251598117, + "grad_norm": 2.0680439472198486, + "learning_rate": 0.0001840345920834148, + "loss": 1.5474, + "step": 5777 + }, + { + "epoch": 0.20692248464554944, + "grad_norm": 1.8485260009765625, + "learning_rate": 0.00018402830426749914, + "loss": 1.3399, + "step": 5778 + }, + { + "epoch": 0.20695829677511773, + "grad_norm": 1.583206057548523, + "learning_rate": 0.00018402201532109102, + "loss": 1.6553, + "step": 5779 + }, + { + "epoch": 0.20699410890468603, + "grad_norm": 1.9147899150848389, + "learning_rate": 0.00018401572524427505, + "loss": 1.9019, + "step": 5780 + }, + { + "epoch": 0.2070299210342543, + "grad_norm": 1.735785961151123, + "learning_rate": 0.0001840094340371359, + "loss": 1.778, + "step": 5781 + }, + { + "epoch": 0.2070657331638226, + "grad_norm": 1.9432623386383057, + "learning_rate": 0.00018400314169975818, + "loss": 1.5151, + "step": 5782 + }, + { + "epoch": 0.20710154529339087, + "grad_norm": 1.6666193008422852, + "learning_rate": 0.00018399684823222653, + "loss": 1.2603, + "step": 5783 + }, + { + "epoch": 0.20713735742295916, + "grad_norm": 1.642317771911621, + "learning_rate": 0.00018399055363462562, + "loss": 1.4213, + "step": 5784 + }, + { + "epoch": 0.20717316955252743, + "grad_norm": 2.0764431953430176, + "learning_rate": 0.0001839842579070402, + "loss": 1.278, + "step": 5785 + }, + { + "epoch": 0.20720898168209573, + "grad_norm": 1.3511927127838135, + "learning_rate": 0.0001839779610495549, + "loss": 1.4598, + "step": 5786 + }, + { + "epoch": 0.20724479381166402, + "grad_norm": 1.897929072380066, + "learning_rate": 0.00018397166306225444, + "loss": 1.6758, + "step": 5787 + }, + { + "epoch": 0.2072806059412323, + "grad_norm": 1.5352063179016113, + "learning_rate": 0.00018396536394522359, + "loss": 1.655, + "step": 5788 + }, + { + "epoch": 0.2073164180708006, + "grad_norm": 1.7674046754837036, + "learning_rate": 0.00018395906369854704, + "loss": 1.4136, + "step": 5789 + }, + { + "epoch": 0.20735223020036886, + "grad_norm": 1.2308050394058228, + "learning_rate": 0.00018395276232230964, + "loss": 1.4874, + "step": 5790 + }, + { + "epoch": 0.20738804232993716, + "grad_norm": 1.8490521907806396, + "learning_rate": 0.00018394645981659608, + "loss": 1.4895, + "step": 5791 + }, + { + "epoch": 0.20742385445950542, + "grad_norm": 1.8193918466567993, + "learning_rate": 0.00018394015618149122, + "loss": 1.3781, + "step": 5792 + }, + { + "epoch": 0.20745966658907372, + "grad_norm": 1.4143835306167603, + "learning_rate": 0.00018393385141707977, + "loss": 1.5742, + "step": 5793 + }, + { + "epoch": 0.207495478718642, + "grad_norm": 1.3196909427642822, + "learning_rate": 0.00018392754552344666, + "loss": 1.3337, + "step": 5794 + }, + { + "epoch": 0.2075312908482103, + "grad_norm": 1.3538028001785278, + "learning_rate": 0.00018392123850067668, + "loss": 1.5709, + "step": 5795 + }, + { + "epoch": 0.20756710297777858, + "grad_norm": 1.751012921333313, + "learning_rate": 0.00018391493034885468, + "loss": 1.4149, + "step": 5796 + }, + { + "epoch": 0.20760291510734685, + "grad_norm": 1.4756646156311035, + "learning_rate": 0.00018390862106806554, + "loss": 1.5291, + "step": 5797 + }, + { + "epoch": 0.20763872723691515, + "grad_norm": 1.8159650564193726, + "learning_rate": 0.00018390231065839414, + "loss": 1.5351, + "step": 5798 + }, + { + "epoch": 0.20767453936648342, + "grad_norm": 1.8627691268920898, + "learning_rate": 0.00018389599911992538, + "loss": 1.4553, + "step": 5799 + }, + { + "epoch": 0.20771035149605171, + "grad_norm": 1.5494425296783447, + "learning_rate": 0.00018388968645274416, + "loss": 1.4181, + "step": 5800 + }, + { + "epoch": 0.20774616362561998, + "grad_norm": 1.2609368562698364, + "learning_rate": 0.00018388337265693542, + "loss": 1.474, + "step": 5801 + }, + { + "epoch": 0.20778197575518828, + "grad_norm": 1.9432854652404785, + "learning_rate": 0.0001838770577325841, + "loss": 1.5919, + "step": 5802 + }, + { + "epoch": 0.20781778788475658, + "grad_norm": 1.5743569135665894, + "learning_rate": 0.00018387074167977517, + "loss": 1.4351, + "step": 5803 + }, + { + "epoch": 0.20785360001432485, + "grad_norm": 1.721057653427124, + "learning_rate": 0.00018386442449859358, + "loss": 1.538, + "step": 5804 + }, + { + "epoch": 0.20788941214389314, + "grad_norm": 2.5399715900421143, + "learning_rate": 0.00018385810618912435, + "loss": 1.5953, + "step": 5805 + }, + { + "epoch": 0.2079252242734614, + "grad_norm": 1.6732019186019897, + "learning_rate": 0.00018385178675145246, + "loss": 1.371, + "step": 5806 + }, + { + "epoch": 0.2079610364030297, + "grad_norm": 1.5420591831207275, + "learning_rate": 0.00018384546618566296, + "loss": 1.3819, + "step": 5807 + }, + { + "epoch": 0.20799684853259798, + "grad_norm": 2.665620803833008, + "learning_rate": 0.00018383914449184084, + "loss": 1.7292, + "step": 5808 + }, + { + "epoch": 0.20803266066216627, + "grad_norm": 1.8281484842300415, + "learning_rate": 0.0001838328216700712, + "loss": 1.3194, + "step": 5809 + }, + { + "epoch": 0.20806847279173457, + "grad_norm": 3.399212598800659, + "learning_rate": 0.00018382649772043908, + "loss": 1.9008, + "step": 5810 + }, + { + "epoch": 0.20810428492130284, + "grad_norm": 1.844014286994934, + "learning_rate": 0.00018382017264302955, + "loss": 1.7792, + "step": 5811 + }, + { + "epoch": 0.20814009705087114, + "grad_norm": 1.7047966718673706, + "learning_rate": 0.0001838138464379277, + "loss": 1.3427, + "step": 5812 + }, + { + "epoch": 0.2081759091804394, + "grad_norm": 1.635097861289978, + "learning_rate": 0.0001838075191052187, + "loss": 1.8786, + "step": 5813 + }, + { + "epoch": 0.2082117213100077, + "grad_norm": 1.8636630773544312, + "learning_rate": 0.0001838011906449876, + "loss": 1.6942, + "step": 5814 + }, + { + "epoch": 0.20824753343957597, + "grad_norm": 1.397093653678894, + "learning_rate": 0.0001837948610573196, + "loss": 1.4005, + "step": 5815 + }, + { + "epoch": 0.20828334556914427, + "grad_norm": 1.8946696519851685, + "learning_rate": 0.0001837885303422998, + "loss": 1.748, + "step": 5816 + }, + { + "epoch": 0.20831915769871256, + "grad_norm": 1.4679687023162842, + "learning_rate": 0.00018378219850001345, + "loss": 1.4857, + "step": 5817 + }, + { + "epoch": 0.20835496982828083, + "grad_norm": 1.5090714693069458, + "learning_rate": 0.00018377586553054565, + "loss": 1.7174, + "step": 5818 + }, + { + "epoch": 0.20839078195784913, + "grad_norm": 1.7879695892333984, + "learning_rate": 0.00018376953143398167, + "loss": 1.6207, + "step": 5819 + }, + { + "epoch": 0.2084265940874174, + "grad_norm": 2.3632588386535645, + "learning_rate": 0.00018376319621040668, + "loss": 1.4637, + "step": 5820 + }, + { + "epoch": 0.2084624062169857, + "grad_norm": 2.151449680328369, + "learning_rate": 0.00018375685985990594, + "loss": 2.0277, + "step": 5821 + }, + { + "epoch": 0.20849821834655397, + "grad_norm": 1.7777212858200073, + "learning_rate": 0.00018375052238256466, + "loss": 1.7095, + "step": 5822 + }, + { + "epoch": 0.20853403047612226, + "grad_norm": 2.0421993732452393, + "learning_rate": 0.00018374418377846817, + "loss": 1.8641, + "step": 5823 + }, + { + "epoch": 0.20856984260569056, + "grad_norm": 1.6348986625671387, + "learning_rate": 0.0001837378440477017, + "loss": 1.6932, + "step": 5824 + }, + { + "epoch": 0.20860565473525883, + "grad_norm": 1.481870174407959, + "learning_rate": 0.00018373150319035055, + "loss": 1.6623, + "step": 5825 + }, + { + "epoch": 0.20864146686482712, + "grad_norm": 1.5054895877838135, + "learning_rate": 0.00018372516120650003, + "loss": 1.579, + "step": 5826 + }, + { + "epoch": 0.2086772789943954, + "grad_norm": 1.6486055850982666, + "learning_rate": 0.00018371881809623545, + "loss": 1.4551, + "step": 5827 + }, + { + "epoch": 0.2087130911239637, + "grad_norm": 1.4928501844406128, + "learning_rate": 0.0001837124738596422, + "loss": 1.8089, + "step": 5828 + }, + { + "epoch": 0.20874890325353196, + "grad_norm": 1.265228509902954, + "learning_rate": 0.00018370612849680557, + "loss": 1.6432, + "step": 5829 + }, + { + "epoch": 0.20878471538310026, + "grad_norm": 1.9759631156921387, + "learning_rate": 0.00018369978200781094, + "loss": 1.6254, + "step": 5830 + }, + { + "epoch": 0.20882052751266855, + "grad_norm": 1.7568730115890503, + "learning_rate": 0.00018369343439274372, + "loss": 1.8442, + "step": 5831 + }, + { + "epoch": 0.20885633964223682, + "grad_norm": 1.8063440322875977, + "learning_rate": 0.0001836870856516893, + "loss": 1.4209, + "step": 5832 + }, + { + "epoch": 0.20889215177180512, + "grad_norm": 1.3789457082748413, + "learning_rate": 0.0001836807357847331, + "loss": 1.4017, + "step": 5833 + }, + { + "epoch": 0.2089279639013734, + "grad_norm": 1.9040539264678955, + "learning_rate": 0.00018367438479196055, + "loss": 1.7246, + "step": 5834 + }, + { + "epoch": 0.20896377603094168, + "grad_norm": 2.3700289726257324, + "learning_rate": 0.00018366803267345704, + "loss": 1.5969, + "step": 5835 + }, + { + "epoch": 0.20899958816050995, + "grad_norm": 1.2375210523605347, + "learning_rate": 0.0001836616794293081, + "loss": 1.5293, + "step": 5836 + }, + { + "epoch": 0.20903540029007825, + "grad_norm": 1.7296556234359741, + "learning_rate": 0.00018365532505959918, + "loss": 1.5033, + "step": 5837 + }, + { + "epoch": 0.20907121241964655, + "grad_norm": 1.7073261737823486, + "learning_rate": 0.00018364896956441577, + "loss": 1.6766, + "step": 5838 + }, + { + "epoch": 0.20910702454921481, + "grad_norm": 1.5448671579360962, + "learning_rate": 0.00018364261294384336, + "loss": 1.5826, + "step": 5839 + }, + { + "epoch": 0.2091428366787831, + "grad_norm": 1.3705118894577026, + "learning_rate": 0.0001836362551979675, + "loss": 1.5987, + "step": 5840 + }, + { + "epoch": 0.20917864880835138, + "grad_norm": 1.785382866859436, + "learning_rate": 0.00018362989632687374, + "loss": 1.7167, + "step": 5841 + }, + { + "epoch": 0.20921446093791968, + "grad_norm": 1.6626938581466675, + "learning_rate": 0.00018362353633064754, + "loss": 1.5908, + "step": 5842 + }, + { + "epoch": 0.20925027306748795, + "grad_norm": 1.831031084060669, + "learning_rate": 0.00018361717520937458, + "loss": 1.8772, + "step": 5843 + }, + { + "epoch": 0.20928608519705624, + "grad_norm": 1.589511752128601, + "learning_rate": 0.00018361081296314037, + "loss": 1.815, + "step": 5844 + }, + { + "epoch": 0.20932189732662454, + "grad_norm": 1.4295203685760498, + "learning_rate": 0.0001836044495920305, + "loss": 1.4799, + "step": 5845 + }, + { + "epoch": 0.2093577094561928, + "grad_norm": 1.112631916999817, + "learning_rate": 0.00018359808509613062, + "loss": 1.3803, + "step": 5846 + }, + { + "epoch": 0.2093935215857611, + "grad_norm": 1.5145599842071533, + "learning_rate": 0.00018359171947552631, + "loss": 1.3917, + "step": 5847 + }, + { + "epoch": 0.20942933371532937, + "grad_norm": 2.381051778793335, + "learning_rate": 0.00018358535273030327, + "loss": 1.5099, + "step": 5848 + }, + { + "epoch": 0.20946514584489767, + "grad_norm": 1.7032874822616577, + "learning_rate": 0.0001835789848605471, + "loss": 1.3714, + "step": 5849 + }, + { + "epoch": 0.20950095797446594, + "grad_norm": 1.7090039253234863, + "learning_rate": 0.00018357261586634353, + "loss": 1.5838, + "step": 5850 + }, + { + "epoch": 0.20953677010403424, + "grad_norm": 1.4482029676437378, + "learning_rate": 0.00018356624574777822, + "loss": 1.5396, + "step": 5851 + }, + { + "epoch": 0.20957258223360253, + "grad_norm": 1.815578579902649, + "learning_rate": 0.0001835598745049368, + "loss": 1.5823, + "step": 5852 + }, + { + "epoch": 0.2096083943631708, + "grad_norm": 1.842309594154358, + "learning_rate": 0.00018355350213790513, + "loss": 1.5635, + "step": 5853 + }, + { + "epoch": 0.2096442064927391, + "grad_norm": 1.4999361038208008, + "learning_rate": 0.00018354712864676885, + "loss": 1.5074, + "step": 5854 + }, + { + "epoch": 0.20968001862230737, + "grad_norm": 1.7008081674575806, + "learning_rate": 0.00018354075403161367, + "loss": 1.369, + "step": 5855 + }, + { + "epoch": 0.20971583075187566, + "grad_norm": 2.609595537185669, + "learning_rate": 0.00018353437829252543, + "loss": 1.3887, + "step": 5856 + }, + { + "epoch": 0.20975164288144393, + "grad_norm": 1.640960454940796, + "learning_rate": 0.00018352800142958992, + "loss": 1.4278, + "step": 5857 + }, + { + "epoch": 0.20978745501101223, + "grad_norm": 1.3580068349838257, + "learning_rate": 0.00018352162344289284, + "loss": 1.4415, + "step": 5858 + }, + { + "epoch": 0.20982326714058053, + "grad_norm": 1.7644946575164795, + "learning_rate": 0.0001835152443325201, + "loss": 1.592, + "step": 5859 + }, + { + "epoch": 0.2098590792701488, + "grad_norm": 1.3494611978530884, + "learning_rate": 0.00018350886409855744, + "loss": 1.6329, + "step": 5860 + }, + { + "epoch": 0.2098948913997171, + "grad_norm": 1.8251352310180664, + "learning_rate": 0.00018350248274109077, + "loss": 1.8776, + "step": 5861 + }, + { + "epoch": 0.20993070352928536, + "grad_norm": 1.4748811721801758, + "learning_rate": 0.00018349610026020585, + "loss": 1.4748, + "step": 5862 + }, + { + "epoch": 0.20996651565885366, + "grad_norm": 1.604453206062317, + "learning_rate": 0.00018348971665598865, + "loss": 1.2366, + "step": 5863 + }, + { + "epoch": 0.21000232778842193, + "grad_norm": 1.736939787864685, + "learning_rate": 0.000183483331928525, + "loss": 1.5625, + "step": 5864 + }, + { + "epoch": 0.21003813991799022, + "grad_norm": 1.6238269805908203, + "learning_rate": 0.00018347694607790077, + "loss": 1.5814, + "step": 5865 + }, + { + "epoch": 0.21007395204755852, + "grad_norm": 1.4518622159957886, + "learning_rate": 0.00018347055910420193, + "loss": 1.6499, + "step": 5866 + }, + { + "epoch": 0.2101097641771268, + "grad_norm": 2.65487003326416, + "learning_rate": 0.0001834641710075144, + "loss": 1.417, + "step": 5867 + }, + { + "epoch": 0.2101455763066951, + "grad_norm": 2.1541175842285156, + "learning_rate": 0.0001834577817879241, + "loss": 1.1873, + "step": 5868 + }, + { + "epoch": 0.21018138843626336, + "grad_norm": 1.5349715948104858, + "learning_rate": 0.000183451391445517, + "loss": 1.6339, + "step": 5869 + }, + { + "epoch": 0.21021720056583165, + "grad_norm": 1.4636268615722656, + "learning_rate": 0.00018344499998037907, + "loss": 1.4769, + "step": 5870 + }, + { + "epoch": 0.21025301269539992, + "grad_norm": 1.8863812685012817, + "learning_rate": 0.0001834386073925963, + "loss": 1.6514, + "step": 5871 + }, + { + "epoch": 0.21028882482496822, + "grad_norm": 1.8188995122909546, + "learning_rate": 0.0001834322136822547, + "loss": 1.2138, + "step": 5872 + }, + { + "epoch": 0.21032463695453651, + "grad_norm": 1.6070584058761597, + "learning_rate": 0.00018342581884944027, + "loss": 1.5714, + "step": 5873 + }, + { + "epoch": 0.21036044908410478, + "grad_norm": 1.322187900543213, + "learning_rate": 0.0001834194228942391, + "loss": 1.4738, + "step": 5874 + }, + { + "epoch": 0.21039626121367308, + "grad_norm": 1.4101799726486206, + "learning_rate": 0.00018341302581673715, + "loss": 1.256, + "step": 5875 + }, + { + "epoch": 0.21043207334324135, + "grad_norm": 2.0768353939056396, + "learning_rate": 0.00018340662761702055, + "loss": 1.7106, + "step": 5876 + }, + { + "epoch": 0.21046788547280965, + "grad_norm": 1.6466964483261108, + "learning_rate": 0.00018340022829517537, + "loss": 1.7091, + "step": 5877 + }, + { + "epoch": 0.21050369760237791, + "grad_norm": 2.0485565662384033, + "learning_rate": 0.00018339382785128767, + "loss": 1.7706, + "step": 5878 + }, + { + "epoch": 0.2105395097319462, + "grad_norm": 1.6183522939682007, + "learning_rate": 0.00018338742628544363, + "loss": 1.6943, + "step": 5879 + }, + { + "epoch": 0.2105753218615145, + "grad_norm": 1.507336974143982, + "learning_rate": 0.0001833810235977293, + "loss": 1.6523, + "step": 5880 + }, + { + "epoch": 0.21061113399108278, + "grad_norm": 1.335508942604065, + "learning_rate": 0.00018337461978823084, + "loss": 1.5629, + "step": 5881 + }, + { + "epoch": 0.21064694612065107, + "grad_norm": 1.6823132038116455, + "learning_rate": 0.00018336821485703445, + "loss": 1.3857, + "step": 5882 + }, + { + "epoch": 0.21068275825021934, + "grad_norm": 1.4094301462173462, + "learning_rate": 0.00018336180880422625, + "loss": 1.7403, + "step": 5883 + }, + { + "epoch": 0.21071857037978764, + "grad_norm": 2.0397469997406006, + "learning_rate": 0.00018335540162989244, + "loss": 1.7507, + "step": 5884 + }, + { + "epoch": 0.2107543825093559, + "grad_norm": 1.7900669574737549, + "learning_rate": 0.00018334899333411926, + "loss": 1.6038, + "step": 5885 + }, + { + "epoch": 0.2107901946389242, + "grad_norm": 1.5094026327133179, + "learning_rate": 0.00018334258391699285, + "loss": 1.65, + "step": 5886 + }, + { + "epoch": 0.2108260067684925, + "grad_norm": 1.6741819381713867, + "learning_rate": 0.00018333617337859946, + "loss": 1.6157, + "step": 5887 + }, + { + "epoch": 0.21086181889806077, + "grad_norm": 1.4904005527496338, + "learning_rate": 0.00018332976171902537, + "loss": 1.6095, + "step": 5888 + }, + { + "epoch": 0.21089763102762907, + "grad_norm": 1.66598379611969, + "learning_rate": 0.00018332334893835683, + "loss": 1.3666, + "step": 5889 + }, + { + "epoch": 0.21093344315719734, + "grad_norm": 1.658950686454773, + "learning_rate": 0.00018331693503668013, + "loss": 1.6743, + "step": 5890 + }, + { + "epoch": 0.21096925528676563, + "grad_norm": 1.507813811302185, + "learning_rate": 0.00018331052001408152, + "loss": 1.3427, + "step": 5891 + }, + { + "epoch": 0.2110050674163339, + "grad_norm": 1.7073756456375122, + "learning_rate": 0.0001833041038706473, + "loss": 1.5811, + "step": 5892 + }, + { + "epoch": 0.2110408795459022, + "grad_norm": 1.5660537481307983, + "learning_rate": 0.00018329768660646384, + "loss": 1.5551, + "step": 5893 + }, + { + "epoch": 0.21107669167547047, + "grad_norm": 1.3659303188323975, + "learning_rate": 0.00018329126822161747, + "loss": 1.469, + "step": 5894 + }, + { + "epoch": 0.21111250380503876, + "grad_norm": 1.2605630159378052, + "learning_rate": 0.0001832848487161945, + "loss": 1.5376, + "step": 5895 + }, + { + "epoch": 0.21114831593460706, + "grad_norm": 1.5032516717910767, + "learning_rate": 0.00018327842809028134, + "loss": 1.4326, + "step": 5896 + }, + { + "epoch": 0.21118412806417533, + "grad_norm": 1.8132591247558594, + "learning_rate": 0.00018327200634396434, + "loss": 1.614, + "step": 5897 + }, + { + "epoch": 0.21121994019374363, + "grad_norm": 1.4367175102233887, + "learning_rate": 0.0001832655834773299, + "loss": 1.4983, + "step": 5898 + }, + { + "epoch": 0.2112557523233119, + "grad_norm": 1.4380050897598267, + "learning_rate": 0.00018325915949046444, + "loss": 1.3839, + "step": 5899 + }, + { + "epoch": 0.2112915644528802, + "grad_norm": 1.1733310222625732, + "learning_rate": 0.00018325273438345437, + "loss": 1.582, + "step": 5900 + }, + { + "epoch": 0.21132737658244846, + "grad_norm": 1.7513288259506226, + "learning_rate": 0.0001832463081563862, + "loss": 1.5806, + "step": 5901 + }, + { + "epoch": 0.21136318871201676, + "grad_norm": 1.4104002714157104, + "learning_rate": 0.00018323988080934628, + "loss": 1.6676, + "step": 5902 + }, + { + "epoch": 0.21139900084158506, + "grad_norm": 1.9451051950454712, + "learning_rate": 0.00018323345234242118, + "loss": 1.5106, + "step": 5903 + }, + { + "epoch": 0.21143481297115332, + "grad_norm": 1.9381672143936157, + "learning_rate": 0.0001832270227556973, + "loss": 1.0622, + "step": 5904 + }, + { + "epoch": 0.21147062510072162, + "grad_norm": 1.0977705717086792, + "learning_rate": 0.0001832205920492612, + "loss": 1.4634, + "step": 5905 + }, + { + "epoch": 0.2115064372302899, + "grad_norm": 2.0630342960357666, + "learning_rate": 0.0001832141602231994, + "loss": 1.5579, + "step": 5906 + }, + { + "epoch": 0.2115422493598582, + "grad_norm": 1.915184497833252, + "learning_rate": 0.0001832077272775984, + "loss": 1.8376, + "step": 5907 + }, + { + "epoch": 0.21157806148942646, + "grad_norm": 2.1756672859191895, + "learning_rate": 0.0001832012932125448, + "loss": 1.6073, + "step": 5908 + }, + { + "epoch": 0.21161387361899475, + "grad_norm": 1.440247654914856, + "learning_rate": 0.00018319485802812503, + "loss": 1.5361, + "step": 5909 + }, + { + "epoch": 0.21164968574856305, + "grad_norm": 1.6484793424606323, + "learning_rate": 0.00018318842172442582, + "loss": 1.7188, + "step": 5910 + }, + { + "epoch": 0.21168549787813132, + "grad_norm": 2.225233793258667, + "learning_rate": 0.0001831819843015337, + "loss": 1.5521, + "step": 5911 + }, + { + "epoch": 0.21172131000769961, + "grad_norm": 1.6229103803634644, + "learning_rate": 0.00018317554575953527, + "loss": 1.4822, + "step": 5912 + }, + { + "epoch": 0.21175712213726788, + "grad_norm": 2.2891218662261963, + "learning_rate": 0.00018316910609851713, + "loss": 1.3342, + "step": 5913 + }, + { + "epoch": 0.21179293426683618, + "grad_norm": 1.2021150588989258, + "learning_rate": 0.00018316266531856598, + "loss": 1.6548, + "step": 5914 + }, + { + "epoch": 0.21182874639640445, + "grad_norm": 1.3572343587875366, + "learning_rate": 0.00018315622341976844, + "loss": 1.6756, + "step": 5915 + }, + { + "epoch": 0.21186455852597275, + "grad_norm": 1.7767938375473022, + "learning_rate": 0.0001831497804022112, + "loss": 1.398, + "step": 5916 + }, + { + "epoch": 0.21190037065554104, + "grad_norm": 2.0373289585113525, + "learning_rate": 0.00018314333626598089, + "loss": 1.4118, + "step": 5917 + }, + { + "epoch": 0.2119361827851093, + "grad_norm": 1.7908626794815063, + "learning_rate": 0.0001831368910111642, + "loss": 1.4941, + "step": 5918 + }, + { + "epoch": 0.2119719949146776, + "grad_norm": 1.8968188762664795, + "learning_rate": 0.00018313044463784793, + "loss": 1.4038, + "step": 5919 + }, + { + "epoch": 0.21200780704424588, + "grad_norm": 1.641762137413025, + "learning_rate": 0.00018312399714611876, + "loss": 1.649, + "step": 5920 + }, + { + "epoch": 0.21204361917381417, + "grad_norm": 1.3661926984786987, + "learning_rate": 0.00018311754853606344, + "loss": 1.7253, + "step": 5921 + }, + { + "epoch": 0.21207943130338244, + "grad_norm": 1.5163073539733887, + "learning_rate": 0.00018311109880776868, + "loss": 1.6616, + "step": 5922 + }, + { + "epoch": 0.21211524343295074, + "grad_norm": 1.510380506515503, + "learning_rate": 0.00018310464796132133, + "loss": 1.5898, + "step": 5923 + }, + { + "epoch": 0.21215105556251904, + "grad_norm": 1.931257963180542, + "learning_rate": 0.0001830981959968081, + "loss": 1.5899, + "step": 5924 + }, + { + "epoch": 0.2121868676920873, + "grad_norm": 1.626753330230713, + "learning_rate": 0.00018309174291431587, + "loss": 1.5853, + "step": 5925 + }, + { + "epoch": 0.2122226798216556, + "grad_norm": 1.351036548614502, + "learning_rate": 0.00018308528871393138, + "loss": 1.3402, + "step": 5926 + }, + { + "epoch": 0.21225849195122387, + "grad_norm": 1.6709541082382202, + "learning_rate": 0.00018307883339574153, + "loss": 1.544, + "step": 5927 + }, + { + "epoch": 0.21229430408079217, + "grad_norm": 3.2044754028320312, + "learning_rate": 0.00018307237695983314, + "loss": 1.5532, + "step": 5928 + }, + { + "epoch": 0.21233011621036044, + "grad_norm": 1.6904844045639038, + "learning_rate": 0.00018306591940629307, + "loss": 1.4901, + "step": 5929 + }, + { + "epoch": 0.21236592833992873, + "grad_norm": 1.53884756565094, + "learning_rate": 0.00018305946073520822, + "loss": 1.5167, + "step": 5930 + }, + { + "epoch": 0.21240174046949703, + "grad_norm": 1.9038699865341187, + "learning_rate": 0.00018305300094666543, + "loss": 1.5806, + "step": 5931 + }, + { + "epoch": 0.2124375525990653, + "grad_norm": 1.951897382736206, + "learning_rate": 0.00018304654004075167, + "loss": 1.7285, + "step": 5932 + }, + { + "epoch": 0.2124733647286336, + "grad_norm": 1.620422124862671, + "learning_rate": 0.0001830400780175538, + "loss": 1.2822, + "step": 5933 + }, + { + "epoch": 0.21250917685820186, + "grad_norm": 1.8646143674850464, + "learning_rate": 0.00018303361487715883, + "loss": 1.5526, + "step": 5934 + }, + { + "epoch": 0.21254498898777016, + "grad_norm": 1.271231770515442, + "learning_rate": 0.00018302715061965365, + "loss": 1.9238, + "step": 5935 + }, + { + "epoch": 0.21258080111733843, + "grad_norm": 1.5239760875701904, + "learning_rate": 0.00018302068524512528, + "loss": 1.327, + "step": 5936 + }, + { + "epoch": 0.21261661324690673, + "grad_norm": 1.642659068107605, + "learning_rate": 0.00018301421875366067, + "loss": 1.6107, + "step": 5937 + }, + { + "epoch": 0.21265242537647502, + "grad_norm": 1.5333079099655151, + "learning_rate": 0.00018300775114534683, + "loss": 1.6014, + "step": 5938 + }, + { + "epoch": 0.2126882375060433, + "grad_norm": 1.676763892173767, + "learning_rate": 0.00018300128242027078, + "loss": 1.3634, + "step": 5939 + }, + { + "epoch": 0.2127240496356116, + "grad_norm": 1.5230528116226196, + "learning_rate": 0.00018299481257851952, + "loss": 1.8291, + "step": 5940 + }, + { + "epoch": 0.21275986176517986, + "grad_norm": 1.4611763954162598, + "learning_rate": 0.00018298834162018012, + "loss": 1.6594, + "step": 5941 + }, + { + "epoch": 0.21279567389474816, + "grad_norm": 1.8587379455566406, + "learning_rate": 0.00018298186954533962, + "loss": 1.4631, + "step": 5942 + }, + { + "epoch": 0.21283148602431642, + "grad_norm": 2.7046003341674805, + "learning_rate": 0.00018297539635408512, + "loss": 1.5129, + "step": 5943 + }, + { + "epoch": 0.21286729815388472, + "grad_norm": 2.1789839267730713, + "learning_rate": 0.00018296892204650367, + "loss": 1.6845, + "step": 5944 + }, + { + "epoch": 0.21290311028345302, + "grad_norm": 1.4094992876052856, + "learning_rate": 0.00018296244662268241, + "loss": 1.7135, + "step": 5945 + }, + { + "epoch": 0.2129389224130213, + "grad_norm": 1.2647565603256226, + "learning_rate": 0.00018295597008270847, + "loss": 1.6716, + "step": 5946 + }, + { + "epoch": 0.21297473454258958, + "grad_norm": 1.6723589897155762, + "learning_rate": 0.00018294949242666895, + "loss": 1.7836, + "step": 5947 + }, + { + "epoch": 0.21301054667215785, + "grad_norm": 1.859749674797058, + "learning_rate": 0.00018294301365465095, + "loss": 1.6115, + "step": 5948 + }, + { + "epoch": 0.21304635880172615, + "grad_norm": 1.2979680299758911, + "learning_rate": 0.00018293653376674177, + "loss": 1.3994, + "step": 5949 + }, + { + "epoch": 0.21308217093129442, + "grad_norm": 1.4515433311462402, + "learning_rate": 0.00018293005276302844, + "loss": 1.3845, + "step": 5950 + }, + { + "epoch": 0.21311798306086271, + "grad_norm": 1.7454895973205566, + "learning_rate": 0.00018292357064359828, + "loss": 1.7939, + "step": 5951 + }, + { + "epoch": 0.213153795190431, + "grad_norm": 1.893728494644165, + "learning_rate": 0.0001829170874085384, + "loss": 1.5585, + "step": 5952 + }, + { + "epoch": 0.21318960731999928, + "grad_norm": 1.5000876188278198, + "learning_rate": 0.00018291060305793608, + "loss": 1.524, + "step": 5953 + }, + { + "epoch": 0.21322541944956758, + "grad_norm": 1.9762799739837646, + "learning_rate": 0.00018290411759187855, + "loss": 1.6497, + "step": 5954 + }, + { + "epoch": 0.21326123157913585, + "grad_norm": 2.6302709579467773, + "learning_rate": 0.00018289763101045302, + "loss": 1.6087, + "step": 5955 + }, + { + "epoch": 0.21329704370870414, + "grad_norm": 2.3435487747192383, + "learning_rate": 0.00018289114331374685, + "loss": 1.4392, + "step": 5956 + }, + { + "epoch": 0.2133328558382724, + "grad_norm": 1.2770248651504517, + "learning_rate": 0.00018288465450184722, + "loss": 1.2857, + "step": 5957 + }, + { + "epoch": 0.2133686679678407, + "grad_norm": 1.952136754989624, + "learning_rate": 0.0001828781645748415, + "loss": 1.5773, + "step": 5958 + }, + { + "epoch": 0.213404480097409, + "grad_norm": 1.2606817483901978, + "learning_rate": 0.00018287167353281698, + "loss": 1.3636, + "step": 5959 + }, + { + "epoch": 0.21344029222697727, + "grad_norm": 1.316763162612915, + "learning_rate": 0.000182865181375861, + "loss": 1.2342, + "step": 5960 + }, + { + "epoch": 0.21347610435654557, + "grad_norm": 1.371982216835022, + "learning_rate": 0.0001828586881040609, + "loss": 1.6322, + "step": 5961 + }, + { + "epoch": 0.21351191648611384, + "grad_norm": 1.4569306373596191, + "learning_rate": 0.00018285219371750398, + "loss": 1.389, + "step": 5962 + }, + { + "epoch": 0.21354772861568214, + "grad_norm": 1.64267897605896, + "learning_rate": 0.0001828456982162777, + "loss": 1.734, + "step": 5963 + }, + { + "epoch": 0.2135835407452504, + "grad_norm": 2.057509660720825, + "learning_rate": 0.0001828392016004694, + "loss": 1.8128, + "step": 5964 + }, + { + "epoch": 0.2136193528748187, + "grad_norm": 1.5392036437988281, + "learning_rate": 0.00018283270387016654, + "loss": 1.5686, + "step": 5965 + }, + { + "epoch": 0.213655165004387, + "grad_norm": 2.590017557144165, + "learning_rate": 0.00018282620502545647, + "loss": 1.8044, + "step": 5966 + }, + { + "epoch": 0.21369097713395527, + "grad_norm": 1.446028709411621, + "learning_rate": 0.00018281970506642663, + "loss": 1.3209, + "step": 5967 + }, + { + "epoch": 0.21372678926352356, + "grad_norm": 2.1150851249694824, + "learning_rate": 0.0001828132039931645, + "loss": 1.6727, + "step": 5968 + }, + { + "epoch": 0.21376260139309183, + "grad_norm": 1.4321050643920898, + "learning_rate": 0.00018280670180575754, + "loss": 1.4389, + "step": 5969 + }, + { + "epoch": 0.21379841352266013, + "grad_norm": 1.3660449981689453, + "learning_rate": 0.00018280019850429321, + "loss": 1.4298, + "step": 5970 + }, + { + "epoch": 0.2138342256522284, + "grad_norm": 1.6467535495758057, + "learning_rate": 0.000182793694088859, + "loss": 1.4494, + "step": 5971 + }, + { + "epoch": 0.2138700377817967, + "grad_norm": 2.1352715492248535, + "learning_rate": 0.00018278718855954247, + "loss": 1.7657, + "step": 5972 + }, + { + "epoch": 0.213905849911365, + "grad_norm": 2.0386393070220947, + "learning_rate": 0.00018278068191643107, + "loss": 1.709, + "step": 5973 + }, + { + "epoch": 0.21394166204093326, + "grad_norm": 1.3228604793548584, + "learning_rate": 0.0001827741741596124, + "loss": 1.4663, + "step": 5974 + }, + { + "epoch": 0.21397747417050156, + "grad_norm": 2.2697250843048096, + "learning_rate": 0.00018276766528917398, + "loss": 1.8581, + "step": 5975 + }, + { + "epoch": 0.21401328630006983, + "grad_norm": 1.8088141679763794, + "learning_rate": 0.00018276115530520336, + "loss": 1.7364, + "step": 5976 + }, + { + "epoch": 0.21404909842963812, + "grad_norm": 1.4815630912780762, + "learning_rate": 0.0001827546442077882, + "loss": 1.3892, + "step": 5977 + }, + { + "epoch": 0.2140849105592064, + "grad_norm": 1.7200106382369995, + "learning_rate": 0.000182748131997016, + "loss": 1.4714, + "step": 5978 + }, + { + "epoch": 0.2141207226887747, + "grad_norm": 1.388688087463379, + "learning_rate": 0.00018274161867297447, + "loss": 1.7146, + "step": 5979 + }, + { + "epoch": 0.214156534818343, + "grad_norm": 1.3790088891983032, + "learning_rate": 0.00018273510423575117, + "loss": 1.5785, + "step": 5980 + }, + { + "epoch": 0.21419234694791126, + "grad_norm": 1.8050322532653809, + "learning_rate": 0.00018272858868543374, + "loss": 1.3374, + "step": 5981 + }, + { + "epoch": 0.21422815907747955, + "grad_norm": 1.851999282836914, + "learning_rate": 0.00018272207202210986, + "loss": 1.4933, + "step": 5982 + }, + { + "epoch": 0.21426397120704782, + "grad_norm": 1.7498927116394043, + "learning_rate": 0.00018271555424586723, + "loss": 1.8221, + "step": 5983 + }, + { + "epoch": 0.21429978333661612, + "grad_norm": 1.9699684381484985, + "learning_rate": 0.0001827090353567935, + "loss": 1.563, + "step": 5984 + }, + { + "epoch": 0.2143355954661844, + "grad_norm": 1.985451102256775, + "learning_rate": 0.0001827025153549764, + "loss": 1.8659, + "step": 5985 + }, + { + "epoch": 0.21437140759575268, + "grad_norm": 1.738781213760376, + "learning_rate": 0.00018269599424050362, + "loss": 1.582, + "step": 5986 + }, + { + "epoch": 0.21440721972532098, + "grad_norm": 1.600334644317627, + "learning_rate": 0.00018268947201346291, + "loss": 1.6137, + "step": 5987 + }, + { + "epoch": 0.21444303185488925, + "grad_norm": 1.34315824508667, + "learning_rate": 0.00018268294867394204, + "loss": 1.4509, + "step": 5988 + }, + { + "epoch": 0.21447884398445755, + "grad_norm": 2.184556007385254, + "learning_rate": 0.00018267642422202873, + "loss": 1.7843, + "step": 5989 + }, + { + "epoch": 0.21451465611402581, + "grad_norm": 1.9627654552459717, + "learning_rate": 0.00018266989865781076, + "loss": 1.7417, + "step": 5990 + }, + { + "epoch": 0.2145504682435941, + "grad_norm": 1.7299060821533203, + "learning_rate": 0.00018266337198137594, + "loss": 1.652, + "step": 5991 + }, + { + "epoch": 0.21458628037316238, + "grad_norm": 1.5557527542114258, + "learning_rate": 0.00018265684419281213, + "loss": 1.6432, + "step": 5992 + }, + { + "epoch": 0.21462209250273068, + "grad_norm": 1.7059578895568848, + "learning_rate": 0.00018265031529220705, + "loss": 1.456, + "step": 5993 + }, + { + "epoch": 0.21465790463229895, + "grad_norm": 1.6850507259368896, + "learning_rate": 0.0001826437852796486, + "loss": 1.5577, + "step": 5994 + }, + { + "epoch": 0.21469371676186724, + "grad_norm": 2.0986411571502686, + "learning_rate": 0.00018263725415522462, + "loss": 1.5896, + "step": 5995 + }, + { + "epoch": 0.21472952889143554, + "grad_norm": 2.174347400665283, + "learning_rate": 0.000182630721919023, + "loss": 1.6033, + "step": 5996 + }, + { + "epoch": 0.2147653410210038, + "grad_norm": 1.433610439300537, + "learning_rate": 0.00018262418857113157, + "loss": 1.6622, + "step": 5997 + }, + { + "epoch": 0.2148011531505721, + "grad_norm": 1.3677752017974854, + "learning_rate": 0.00018261765411163827, + "loss": 1.6868, + "step": 5998 + }, + { + "epoch": 0.21483696528014037, + "grad_norm": 1.4604482650756836, + "learning_rate": 0.000182611118540631, + "loss": 1.3887, + "step": 5999 + }, + { + "epoch": 0.21487277740970867, + "grad_norm": 1.462753415107727, + "learning_rate": 0.00018260458185819772, + "loss": 1.5631, + "step": 6000 + }, + { + "epoch": 0.21490858953927694, + "grad_norm": 1.279866099357605, + "learning_rate": 0.00018259804406442633, + "loss": 1.3168, + "step": 6001 + }, + { + "epoch": 0.21494440166884524, + "grad_norm": 2.1085100173950195, + "learning_rate": 0.0001825915051594048, + "loss": 1.8449, + "step": 6002 + }, + { + "epoch": 0.21498021379841353, + "grad_norm": 1.6899045705795288, + "learning_rate": 0.0001825849651432211, + "loss": 1.4615, + "step": 6003 + }, + { + "epoch": 0.2150160259279818, + "grad_norm": 1.6363755464553833, + "learning_rate": 0.0001825784240159632, + "loss": 1.4947, + "step": 6004 + }, + { + "epoch": 0.2150518380575501, + "grad_norm": 1.7822414636611938, + "learning_rate": 0.00018257188177771914, + "loss": 1.579, + "step": 6005 + }, + { + "epoch": 0.21508765018711837, + "grad_norm": 2.806736469268799, + "learning_rate": 0.00018256533842857695, + "loss": 1.4047, + "step": 6006 + }, + { + "epoch": 0.21512346231668666, + "grad_norm": 1.5889925956726074, + "learning_rate": 0.0001825587939686246, + "loss": 1.563, + "step": 6007 + }, + { + "epoch": 0.21515927444625493, + "grad_norm": 1.2758655548095703, + "learning_rate": 0.00018255224839795018, + "loss": 1.6386, + "step": 6008 + }, + { + "epoch": 0.21519508657582323, + "grad_norm": 1.890386939048767, + "learning_rate": 0.00018254570171664174, + "loss": 1.4298, + "step": 6009 + }, + { + "epoch": 0.21523089870539153, + "grad_norm": 1.240126609802246, + "learning_rate": 0.00018253915392478737, + "loss": 1.4986, + "step": 6010 + }, + { + "epoch": 0.2152667108349598, + "grad_norm": 2.025054454803467, + "learning_rate": 0.00018253260502247513, + "loss": 1.7933, + "step": 6011 + }, + { + "epoch": 0.2153025229645281, + "grad_norm": 1.7431395053863525, + "learning_rate": 0.00018252605500979316, + "loss": 1.8384, + "step": 6012 + }, + { + "epoch": 0.21533833509409636, + "grad_norm": 1.3581688404083252, + "learning_rate": 0.00018251950388682958, + "loss": 1.4347, + "step": 6013 + }, + { + "epoch": 0.21537414722366466, + "grad_norm": 1.4468944072723389, + "learning_rate": 0.0001825129516536725, + "loss": 1.5021, + "step": 6014 + }, + { + "epoch": 0.21540995935323293, + "grad_norm": 2.2720768451690674, + "learning_rate": 0.0001825063983104101, + "loss": 1.7324, + "step": 6015 + }, + { + "epoch": 0.21544577148280122, + "grad_norm": 2.472119092941284, + "learning_rate": 0.00018249984385713055, + "loss": 1.905, + "step": 6016 + }, + { + "epoch": 0.21548158361236952, + "grad_norm": 1.8467071056365967, + "learning_rate": 0.000182493288293922, + "loss": 1.4867, + "step": 6017 + }, + { + "epoch": 0.2155173957419378, + "grad_norm": 1.5712488889694214, + "learning_rate": 0.00018248673162087268, + "loss": 1.5598, + "step": 6018 + }, + { + "epoch": 0.2155532078715061, + "grad_norm": 1.773179054260254, + "learning_rate": 0.00018248017383807076, + "loss": 1.539, + "step": 6019 + }, + { + "epoch": 0.21558902000107436, + "grad_norm": 1.6343525648117065, + "learning_rate": 0.0001824736149456045, + "loss": 1.6218, + "step": 6020 + }, + { + "epoch": 0.21562483213064265, + "grad_norm": 1.5364235639572144, + "learning_rate": 0.00018246705494356214, + "loss": 1.3762, + "step": 6021 + }, + { + "epoch": 0.21566064426021092, + "grad_norm": 1.940355896949768, + "learning_rate": 0.00018246049383203192, + "loss": 1.4093, + "step": 6022 + }, + { + "epoch": 0.21569645638977922, + "grad_norm": 1.5513060092926025, + "learning_rate": 0.00018245393161110215, + "loss": 1.0682, + "step": 6023 + }, + { + "epoch": 0.21573226851934751, + "grad_norm": 1.6082549095153809, + "learning_rate": 0.00018244736828086107, + "loss": 1.2319, + "step": 6024 + }, + { + "epoch": 0.21576808064891578, + "grad_norm": 1.8053920269012451, + "learning_rate": 0.00018244080384139698, + "loss": 1.7213, + "step": 6025 + }, + { + "epoch": 0.21580389277848408, + "grad_norm": 1.4318716526031494, + "learning_rate": 0.00018243423829279824, + "loss": 1.5302, + "step": 6026 + }, + { + "epoch": 0.21583970490805235, + "grad_norm": 1.3476557731628418, + "learning_rate": 0.00018242767163515318, + "loss": 1.3816, + "step": 6027 + }, + { + "epoch": 0.21587551703762065, + "grad_norm": 2.339296817779541, + "learning_rate": 0.00018242110386855007, + "loss": 1.2601, + "step": 6028 + }, + { + "epoch": 0.21591132916718891, + "grad_norm": 1.4866342544555664, + "learning_rate": 0.00018241453499307734, + "loss": 1.4542, + "step": 6029 + }, + { + "epoch": 0.2159471412967572, + "grad_norm": 2.2102510929107666, + "learning_rate": 0.00018240796500882338, + "loss": 1.8524, + "step": 6030 + }, + { + "epoch": 0.2159829534263255, + "grad_norm": 1.8730422258377075, + "learning_rate": 0.0001824013939158765, + "loss": 1.7469, + "step": 6031 + }, + { + "epoch": 0.21601876555589378, + "grad_norm": 1.2295218706130981, + "learning_rate": 0.0001823948217143252, + "loss": 1.5667, + "step": 6032 + }, + { + "epoch": 0.21605457768546207, + "grad_norm": 1.4555342197418213, + "learning_rate": 0.00018238824840425785, + "loss": 1.3013, + "step": 6033 + }, + { + "epoch": 0.21609038981503034, + "grad_norm": 2.010502576828003, + "learning_rate": 0.00018238167398576286, + "loss": 1.5193, + "step": 6034 + }, + { + "epoch": 0.21612620194459864, + "grad_norm": 2.1535098552703857, + "learning_rate": 0.00018237509845892873, + "loss": 1.6717, + "step": 6035 + }, + { + "epoch": 0.2161620140741669, + "grad_norm": 1.6601369380950928, + "learning_rate": 0.00018236852182384393, + "loss": 1.5295, + "step": 6036 + }, + { + "epoch": 0.2161978262037352, + "grad_norm": 1.641852617263794, + "learning_rate": 0.00018236194408059685, + "loss": 1.1745, + "step": 6037 + }, + { + "epoch": 0.2162336383333035, + "grad_norm": 1.5132910013198853, + "learning_rate": 0.00018235536522927611, + "loss": 1.6211, + "step": 6038 + }, + { + "epoch": 0.21626945046287177, + "grad_norm": 1.4342702627182007, + "learning_rate": 0.00018234878526997015, + "loss": 1.371, + "step": 6039 + }, + { + "epoch": 0.21630526259244007, + "grad_norm": 1.345091462135315, + "learning_rate": 0.0001823422042027675, + "loss": 1.5545, + "step": 6040 + }, + { + "epoch": 0.21634107472200834, + "grad_norm": 2.147703170776367, + "learning_rate": 0.0001823356220277567, + "loss": 1.5573, + "step": 6041 + }, + { + "epoch": 0.21637688685157663, + "grad_norm": 1.943217396736145, + "learning_rate": 0.00018232903874502632, + "loss": 1.3352, + "step": 6042 + }, + { + "epoch": 0.2164126989811449, + "grad_norm": 1.6899324655532837, + "learning_rate": 0.00018232245435466493, + "loss": 1.4203, + "step": 6043 + }, + { + "epoch": 0.2164485111107132, + "grad_norm": 1.9825857877731323, + "learning_rate": 0.0001823158688567611, + "loss": 1.591, + "step": 6044 + }, + { + "epoch": 0.2164843232402815, + "grad_norm": 1.5396236181259155, + "learning_rate": 0.00018230928225140342, + "loss": 1.8448, + "step": 6045 + }, + { + "epoch": 0.21652013536984976, + "grad_norm": 1.9984638690948486, + "learning_rate": 0.00018230269453868052, + "loss": 1.417, + "step": 6046 + }, + { + "epoch": 0.21655594749941806, + "grad_norm": 1.8226370811462402, + "learning_rate": 0.00018229610571868102, + "loss": 2.0727, + "step": 6047 + }, + { + "epoch": 0.21659175962898633, + "grad_norm": 1.4168521165847778, + "learning_rate": 0.0001822895157914936, + "loss": 1.7747, + "step": 6048 + }, + { + "epoch": 0.21662757175855463, + "grad_norm": 1.5113242864608765, + "learning_rate": 0.00018228292475720687, + "loss": 1.7142, + "step": 6049 + }, + { + "epoch": 0.2166633838881229, + "grad_norm": 1.5562756061553955, + "learning_rate": 0.00018227633261590955, + "loss": 1.6209, + "step": 6050 + }, + { + "epoch": 0.2166991960176912, + "grad_norm": 1.5761768817901611, + "learning_rate": 0.00018226973936769027, + "loss": 1.6547, + "step": 6051 + }, + { + "epoch": 0.2167350081472595, + "grad_norm": 1.78731369972229, + "learning_rate": 0.0001822631450126378, + "loss": 1.3805, + "step": 6052 + }, + { + "epoch": 0.21677082027682776, + "grad_norm": 1.4102073907852173, + "learning_rate": 0.00018225654955084079, + "loss": 1.562, + "step": 6053 + }, + { + "epoch": 0.21680663240639605, + "grad_norm": 1.5794233083724976, + "learning_rate": 0.00018224995298238804, + "loss": 1.6456, + "step": 6054 + }, + { + "epoch": 0.21684244453596432, + "grad_norm": 1.8132075071334839, + "learning_rate": 0.00018224335530736825, + "loss": 1.9961, + "step": 6055 + }, + { + "epoch": 0.21687825666553262, + "grad_norm": 1.6012133359909058, + "learning_rate": 0.0001822367565258702, + "loss": 1.7669, + "step": 6056 + }, + { + "epoch": 0.2169140687951009, + "grad_norm": 1.3363044261932373, + "learning_rate": 0.0001822301566379827, + "loss": 1.6712, + "step": 6057 + }, + { + "epoch": 0.2169498809246692, + "grad_norm": 1.529463291168213, + "learning_rate": 0.00018222355564379448, + "loss": 1.3612, + "step": 6058 + }, + { + "epoch": 0.21698569305423748, + "grad_norm": 1.4082953929901123, + "learning_rate": 0.00018221695354339435, + "loss": 1.4312, + "step": 6059 + }, + { + "epoch": 0.21702150518380575, + "grad_norm": 1.7950927019119263, + "learning_rate": 0.00018221035033687123, + "loss": 1.5232, + "step": 6060 + }, + { + "epoch": 0.21705731731337405, + "grad_norm": 1.8718847036361694, + "learning_rate": 0.00018220374602431386, + "loss": 1.7111, + "step": 6061 + }, + { + "epoch": 0.21709312944294232, + "grad_norm": 1.4784345626831055, + "learning_rate": 0.0001821971406058111, + "loss": 1.6418, + "step": 6062 + }, + { + "epoch": 0.21712894157251061, + "grad_norm": 1.6133242845535278, + "learning_rate": 0.00018219053408145185, + "loss": 1.3773, + "step": 6063 + }, + { + "epoch": 0.21716475370207888, + "grad_norm": 1.6696078777313232, + "learning_rate": 0.000182183926451325, + "loss": 1.6862, + "step": 6064 + }, + { + "epoch": 0.21720056583164718, + "grad_norm": 1.7715810537338257, + "learning_rate": 0.00018217731771551942, + "loss": 1.734, + "step": 6065 + }, + { + "epoch": 0.21723637796121548, + "grad_norm": 1.6767003536224365, + "learning_rate": 0.00018217070787412404, + "loss": 1.4907, + "step": 6066 + }, + { + "epoch": 0.21727219009078375, + "grad_norm": 1.7085485458374023, + "learning_rate": 0.00018216409692722779, + "loss": 1.8883, + "step": 6067 + }, + { + "epoch": 0.21730800222035204, + "grad_norm": 1.3934434652328491, + "learning_rate": 0.00018215748487491958, + "loss": 1.3403, + "step": 6068 + }, + { + "epoch": 0.2173438143499203, + "grad_norm": 2.090029001235962, + "learning_rate": 0.00018215087171728837, + "loss": 1.7042, + "step": 6069 + }, + { + "epoch": 0.2173796264794886, + "grad_norm": 2.287304639816284, + "learning_rate": 0.00018214425745442317, + "loss": 1.7766, + "step": 6070 + }, + { + "epoch": 0.21741543860905688, + "grad_norm": 1.7092816829681396, + "learning_rate": 0.00018213764208641292, + "loss": 1.4644, + "step": 6071 + }, + { + "epoch": 0.21745125073862517, + "grad_norm": 1.7535308599472046, + "learning_rate": 0.00018213102561334668, + "loss": 1.39, + "step": 6072 + }, + { + "epoch": 0.21748706286819347, + "grad_norm": 2.193551778793335, + "learning_rate": 0.00018212440803531342, + "loss": 1.5143, + "step": 6073 + }, + { + "epoch": 0.21752287499776174, + "grad_norm": 1.7536066770553589, + "learning_rate": 0.00018211778935240219, + "loss": 1.617, + "step": 6074 + }, + { + "epoch": 0.21755868712733004, + "grad_norm": 1.5110563039779663, + "learning_rate": 0.00018211116956470203, + "loss": 1.6147, + "step": 6075 + }, + { + "epoch": 0.2175944992568983, + "grad_norm": 1.4865481853485107, + "learning_rate": 0.00018210454867230195, + "loss": 1.6124, + "step": 6076 + }, + { + "epoch": 0.2176303113864666, + "grad_norm": 1.9881222248077393, + "learning_rate": 0.00018209792667529112, + "loss": 1.5487, + "step": 6077 + }, + { + "epoch": 0.21766612351603487, + "grad_norm": 1.733006477355957, + "learning_rate": 0.00018209130357375858, + "loss": 1.7815, + "step": 6078 + }, + { + "epoch": 0.21770193564560317, + "grad_norm": 1.2904987335205078, + "learning_rate": 0.00018208467936779347, + "loss": 1.8821, + "step": 6079 + }, + { + "epoch": 0.21773774777517146, + "grad_norm": 2.602548837661743, + "learning_rate": 0.00018207805405748482, + "loss": 1.4122, + "step": 6080 + }, + { + "epoch": 0.21777355990473973, + "grad_norm": 1.7389355897903442, + "learning_rate": 0.00018207142764292187, + "loss": 1.4373, + "step": 6081 + }, + { + "epoch": 0.21780937203430803, + "grad_norm": 1.316788911819458, + "learning_rate": 0.00018206480012419372, + "loss": 1.6428, + "step": 6082 + }, + { + "epoch": 0.2178451841638763, + "grad_norm": 1.8363261222839355, + "learning_rate": 0.0001820581715013895, + "loss": 1.4394, + "step": 6083 + }, + { + "epoch": 0.2178809962934446, + "grad_norm": 1.57510507106781, + "learning_rate": 0.0001820515417745985, + "loss": 1.4141, + "step": 6084 + }, + { + "epoch": 0.21791680842301286, + "grad_norm": 1.3749953508377075, + "learning_rate": 0.0001820449109439098, + "loss": 1.5511, + "step": 6085 + }, + { + "epoch": 0.21795262055258116, + "grad_norm": 1.5707284212112427, + "learning_rate": 0.00018203827900941264, + "loss": 1.7035, + "step": 6086 + }, + { + "epoch": 0.21798843268214946, + "grad_norm": 1.7157187461853027, + "learning_rate": 0.0001820316459711963, + "loss": 1.599, + "step": 6087 + }, + { + "epoch": 0.21802424481171773, + "grad_norm": 1.5144253969192505, + "learning_rate": 0.0001820250118293499, + "loss": 1.9309, + "step": 6088 + }, + { + "epoch": 0.21806005694128602, + "grad_norm": 1.6175367832183838, + "learning_rate": 0.00018201837658396287, + "loss": 1.6032, + "step": 6089 + }, + { + "epoch": 0.2180958690708543, + "grad_norm": 1.6035107374191284, + "learning_rate": 0.00018201174023512433, + "loss": 1.4334, + "step": 6090 + }, + { + "epoch": 0.2181316812004226, + "grad_norm": 1.8502047061920166, + "learning_rate": 0.0001820051027829236, + "loss": 1.354, + "step": 6091 + }, + { + "epoch": 0.21816749332999086, + "grad_norm": 1.380337119102478, + "learning_rate": 0.00018199846422745002, + "loss": 1.6663, + "step": 6092 + }, + { + "epoch": 0.21820330545955915, + "grad_norm": 1.2796369791030884, + "learning_rate": 0.00018199182456879286, + "loss": 1.5156, + "step": 6093 + }, + { + "epoch": 0.21823911758912742, + "grad_norm": 1.8583347797393799, + "learning_rate": 0.00018198518380704143, + "loss": 1.896, + "step": 6094 + }, + { + "epoch": 0.21827492971869572, + "grad_norm": 1.7820069789886475, + "learning_rate": 0.00018197854194228517, + "loss": 1.8372, + "step": 6095 + }, + { + "epoch": 0.21831074184826402, + "grad_norm": 2.359118938446045, + "learning_rate": 0.00018197189897461332, + "loss": 1.5367, + "step": 6096 + }, + { + "epoch": 0.2183465539778323, + "grad_norm": 1.677788496017456, + "learning_rate": 0.00018196525490411534, + "loss": 1.5914, + "step": 6097 + }, + { + "epoch": 0.21838236610740058, + "grad_norm": 1.7194525003433228, + "learning_rate": 0.00018195860973088058, + "loss": 1.6376, + "step": 6098 + }, + { + "epoch": 0.21841817823696885, + "grad_norm": 1.8285008668899536, + "learning_rate": 0.00018195196345499842, + "loss": 1.8412, + "step": 6099 + }, + { + "epoch": 0.21845399036653715, + "grad_norm": 2.399778366088867, + "learning_rate": 0.00018194531607655833, + "loss": 1.3773, + "step": 6100 + }, + { + "epoch": 0.21848980249610542, + "grad_norm": 1.259798288345337, + "learning_rate": 0.0001819386675956497, + "loss": 1.3212, + "step": 6101 + }, + { + "epoch": 0.21852561462567371, + "grad_norm": 1.758084774017334, + "learning_rate": 0.000181932018012362, + "loss": 1.6877, + "step": 6102 + }, + { + "epoch": 0.218561426755242, + "grad_norm": 1.4051774740219116, + "learning_rate": 0.00018192536732678468, + "loss": 1.4873, + "step": 6103 + }, + { + "epoch": 0.21859723888481028, + "grad_norm": 1.8246309757232666, + "learning_rate": 0.00018191871553900718, + "loss": 1.7631, + "step": 6104 + }, + { + "epoch": 0.21863305101437858, + "grad_norm": 1.6398242712020874, + "learning_rate": 0.00018191206264911908, + "loss": 1.3941, + "step": 6105 + }, + { + "epoch": 0.21866886314394685, + "grad_norm": 2.2544310092926025, + "learning_rate": 0.0001819054086572098, + "loss": 1.7397, + "step": 6106 + }, + { + "epoch": 0.21870467527351514, + "grad_norm": 1.5466108322143555, + "learning_rate": 0.00018189875356336893, + "loss": 1.5326, + "step": 6107 + }, + { + "epoch": 0.2187404874030834, + "grad_norm": 1.4698082208633423, + "learning_rate": 0.00018189209736768595, + "loss": 1.4511, + "step": 6108 + }, + { + "epoch": 0.2187762995326517, + "grad_norm": 1.9529378414154053, + "learning_rate": 0.00018188544007025043, + "loss": 1.8635, + "step": 6109 + }, + { + "epoch": 0.21881211166222, + "grad_norm": 1.4893909692764282, + "learning_rate": 0.00018187878167115197, + "loss": 1.6932, + "step": 6110 + }, + { + "epoch": 0.21884792379178827, + "grad_norm": 1.3069435358047485, + "learning_rate": 0.00018187212217048008, + "loss": 1.5887, + "step": 6111 + }, + { + "epoch": 0.21888373592135657, + "grad_norm": 1.8581095933914185, + "learning_rate": 0.00018186546156832444, + "loss": 1.7178, + "step": 6112 + }, + { + "epoch": 0.21891954805092484, + "grad_norm": 1.3659560680389404, + "learning_rate": 0.00018185879986477456, + "loss": 1.2872, + "step": 6113 + }, + { + "epoch": 0.21895536018049314, + "grad_norm": 1.5570614337921143, + "learning_rate": 0.00018185213705992014, + "loss": 1.5665, + "step": 6114 + }, + { + "epoch": 0.2189911723100614, + "grad_norm": 1.6068415641784668, + "learning_rate": 0.00018184547315385082, + "loss": 1.869, + "step": 6115 + }, + { + "epoch": 0.2190269844396297, + "grad_norm": 1.2946780920028687, + "learning_rate": 0.0001818388081466562, + "loss": 1.4014, + "step": 6116 + }, + { + "epoch": 0.219062796569198, + "grad_norm": 1.327081322669983, + "learning_rate": 0.00018183214203842601, + "loss": 1.8673, + "step": 6117 + }, + { + "epoch": 0.21909860869876627, + "grad_norm": 1.738240122795105, + "learning_rate": 0.00018182547482924988, + "loss": 1.406, + "step": 6118 + }, + { + "epoch": 0.21913442082833456, + "grad_norm": 2.0041754245758057, + "learning_rate": 0.00018181880651921755, + "loss": 1.6267, + "step": 6119 + }, + { + "epoch": 0.21917023295790283, + "grad_norm": 1.6936300992965698, + "learning_rate": 0.0001818121371084187, + "loss": 1.741, + "step": 6120 + }, + { + "epoch": 0.21920604508747113, + "grad_norm": 1.2888492345809937, + "learning_rate": 0.00018180546659694307, + "loss": 1.5933, + "step": 6121 + }, + { + "epoch": 0.2192418572170394, + "grad_norm": 1.3462530374526978, + "learning_rate": 0.0001817987949848804, + "loss": 1.6054, + "step": 6122 + }, + { + "epoch": 0.2192776693466077, + "grad_norm": 2.644855260848999, + "learning_rate": 0.0001817921222723205, + "loss": 1.7208, + "step": 6123 + }, + { + "epoch": 0.219313481476176, + "grad_norm": 1.3023197650909424, + "learning_rate": 0.00018178544845935308, + "loss": 1.6019, + "step": 6124 + }, + { + "epoch": 0.21934929360574426, + "grad_norm": 1.9801610708236694, + "learning_rate": 0.00018177877354606797, + "loss": 1.4413, + "step": 6125 + }, + { + "epoch": 0.21938510573531256, + "grad_norm": 1.6524903774261475, + "learning_rate": 0.00018177209753255492, + "loss": 1.4086, + "step": 6126 + }, + { + "epoch": 0.21942091786488083, + "grad_norm": 1.5688244104385376, + "learning_rate": 0.00018176542041890376, + "loss": 1.5965, + "step": 6127 + }, + { + "epoch": 0.21945672999444912, + "grad_norm": 1.5412373542785645, + "learning_rate": 0.00018175874220520438, + "loss": 1.4583, + "step": 6128 + }, + { + "epoch": 0.2194925421240174, + "grad_norm": 1.9981822967529297, + "learning_rate": 0.00018175206289154655, + "loss": 1.5695, + "step": 6129 + }, + { + "epoch": 0.2195283542535857, + "grad_norm": 1.5729684829711914, + "learning_rate": 0.00018174538247802015, + "loss": 1.1199, + "step": 6130 + }, + { + "epoch": 0.21956416638315399, + "grad_norm": 1.5781065225601196, + "learning_rate": 0.00018173870096471512, + "loss": 1.7226, + "step": 6131 + }, + { + "epoch": 0.21959997851272225, + "grad_norm": 1.6586942672729492, + "learning_rate": 0.00018173201835172128, + "loss": 1.8269, + "step": 6132 + }, + { + "epoch": 0.21963579064229055, + "grad_norm": 1.3786426782608032, + "learning_rate": 0.00018172533463912857, + "loss": 1.4942, + "step": 6133 + }, + { + "epoch": 0.21967160277185882, + "grad_norm": 1.72449791431427, + "learning_rate": 0.00018171864982702692, + "loss": 1.2589, + "step": 6134 + }, + { + "epoch": 0.21970741490142712, + "grad_norm": 1.6442193984985352, + "learning_rate": 0.0001817119639155062, + "loss": 1.3379, + "step": 6135 + }, + { + "epoch": 0.21974322703099539, + "grad_norm": 1.8619656562805176, + "learning_rate": 0.00018170527690465643, + "loss": 1.5929, + "step": 6136 + }, + { + "epoch": 0.21977903916056368, + "grad_norm": 1.7643412351608276, + "learning_rate": 0.00018169858879456757, + "loss": 1.5387, + "step": 6137 + }, + { + "epoch": 0.21981485129013198, + "grad_norm": 1.723496437072754, + "learning_rate": 0.00018169189958532953, + "loss": 1.6536, + "step": 6138 + }, + { + "epoch": 0.21985066341970025, + "grad_norm": 2.1918203830718994, + "learning_rate": 0.0001816852092770324, + "loss": 1.6178, + "step": 6139 + }, + { + "epoch": 0.21988647554926855, + "grad_norm": 1.7967193126678467, + "learning_rate": 0.00018167851786976612, + "loss": 1.4998, + "step": 6140 + }, + { + "epoch": 0.21992228767883681, + "grad_norm": 1.3858860731124878, + "learning_rate": 0.00018167182536362074, + "loss": 1.4789, + "step": 6141 + }, + { + "epoch": 0.2199580998084051, + "grad_norm": 1.7353756427764893, + "learning_rate": 0.00018166513175868633, + "loss": 1.4175, + "step": 6142 + }, + { + "epoch": 0.21999391193797338, + "grad_norm": 1.6364984512329102, + "learning_rate": 0.0001816584370550529, + "loss": 1.2205, + "step": 6143 + }, + { + "epoch": 0.22002972406754168, + "grad_norm": 2.147890567779541, + "learning_rate": 0.00018165174125281053, + "loss": 1.4915, + "step": 6144 + }, + { + "epoch": 0.22006553619710997, + "grad_norm": 1.781660556793213, + "learning_rate": 0.0001816450443520493, + "loss": 1.4868, + "step": 6145 + }, + { + "epoch": 0.22010134832667824, + "grad_norm": 1.9566551446914673, + "learning_rate": 0.00018163834635285931, + "loss": 1.2187, + "step": 6146 + }, + { + "epoch": 0.22013716045624654, + "grad_norm": 1.7707067728042603, + "learning_rate": 0.00018163164725533068, + "loss": 1.7138, + "step": 6147 + }, + { + "epoch": 0.2201729725858148, + "grad_norm": 3.547220468521118, + "learning_rate": 0.0001816249470595535, + "loss": 1.4808, + "step": 6148 + }, + { + "epoch": 0.2202087847153831, + "grad_norm": 2.155532121658325, + "learning_rate": 0.000181618245765618, + "loss": 1.5946, + "step": 6149 + }, + { + "epoch": 0.22024459684495137, + "grad_norm": 2.2041211128234863, + "learning_rate": 0.00018161154337361426, + "loss": 1.5524, + "step": 6150 + }, + { + "epoch": 0.22028040897451967, + "grad_norm": 1.5323847532272339, + "learning_rate": 0.00018160483988363248, + "loss": 1.6958, + "step": 6151 + }, + { + "epoch": 0.22031622110408797, + "grad_norm": 1.5857058763504028, + "learning_rate": 0.00018159813529576284, + "loss": 1.9383, + "step": 6152 + }, + { + "epoch": 0.22035203323365624, + "grad_norm": 1.6870907545089722, + "learning_rate": 0.0001815914296100955, + "loss": 1.7842, + "step": 6153 + }, + { + "epoch": 0.22038784536322453, + "grad_norm": 1.9299302101135254, + "learning_rate": 0.00018158472282672078, + "loss": 1.8147, + "step": 6154 + }, + { + "epoch": 0.2204236574927928, + "grad_norm": 1.6161481142044067, + "learning_rate": 0.00018157801494572885, + "loss": 1.6953, + "step": 6155 + }, + { + "epoch": 0.2204594696223611, + "grad_norm": 1.6391772031784058, + "learning_rate": 0.00018157130596720996, + "loss": 1.6505, + "step": 6156 + }, + { + "epoch": 0.22049528175192937, + "grad_norm": 1.602982759475708, + "learning_rate": 0.0001815645958912543, + "loss": 1.3066, + "step": 6157 + }, + { + "epoch": 0.22053109388149766, + "grad_norm": 2.5656521320343018, + "learning_rate": 0.0001815578847179523, + "loss": 1.6071, + "step": 6158 + }, + { + "epoch": 0.22056690601106596, + "grad_norm": 1.99832284450531, + "learning_rate": 0.0001815511724473941, + "loss": 1.7087, + "step": 6159 + }, + { + "epoch": 0.22060271814063423, + "grad_norm": 2.7880725860595703, + "learning_rate": 0.0001815444590796701, + "loss": 1.4331, + "step": 6160 + }, + { + "epoch": 0.22063853027020253, + "grad_norm": 2.317629337310791, + "learning_rate": 0.00018153774461487058, + "loss": 1.5764, + "step": 6161 + }, + { + "epoch": 0.2206743423997708, + "grad_norm": 1.6050294637680054, + "learning_rate": 0.00018153102905308589, + "loss": 1.8059, + "step": 6162 + }, + { + "epoch": 0.2207101545293391, + "grad_norm": 1.231059193611145, + "learning_rate": 0.00018152431239440637, + "loss": 1.6119, + "step": 6163 + }, + { + "epoch": 0.22074596665890736, + "grad_norm": 1.5513490438461304, + "learning_rate": 0.00018151759463892235, + "loss": 1.6632, + "step": 6164 + }, + { + "epoch": 0.22078177878847566, + "grad_norm": 1.990567684173584, + "learning_rate": 0.00018151087578672427, + "loss": 1.4703, + "step": 6165 + }, + { + "epoch": 0.22081759091804395, + "grad_norm": 1.8540167808532715, + "learning_rate": 0.00018150415583790253, + "loss": 1.3144, + "step": 6166 + }, + { + "epoch": 0.22085340304761222, + "grad_norm": 1.6363404989242554, + "learning_rate": 0.00018149743479254745, + "loss": 1.7437, + "step": 6167 + }, + { + "epoch": 0.22088921517718052, + "grad_norm": 1.5769498348236084, + "learning_rate": 0.00018149071265074955, + "loss": 1.6028, + "step": 6168 + }, + { + "epoch": 0.2209250273067488, + "grad_norm": 1.8275872468948364, + "learning_rate": 0.0001814839894125992, + "loss": 1.1498, + "step": 6169 + }, + { + "epoch": 0.22096083943631709, + "grad_norm": 1.8236889839172363, + "learning_rate": 0.0001814772650781869, + "loss": 1.728, + "step": 6170 + }, + { + "epoch": 0.22099665156588535, + "grad_norm": 2.3265621662139893, + "learning_rate": 0.0001814705396476031, + "loss": 1.3853, + "step": 6171 + }, + { + "epoch": 0.22103246369545365, + "grad_norm": 1.8353328704833984, + "learning_rate": 0.00018146381312093826, + "loss": 1.526, + "step": 6172 + }, + { + "epoch": 0.22106827582502195, + "grad_norm": 1.6437424421310425, + "learning_rate": 0.00018145708549828287, + "loss": 1.8824, + "step": 6173 + }, + { + "epoch": 0.22110408795459022, + "grad_norm": 1.395546317100525, + "learning_rate": 0.00018145035677972753, + "loss": 1.7179, + "step": 6174 + }, + { + "epoch": 0.2211399000841585, + "grad_norm": 1.4064130783081055, + "learning_rate": 0.00018144362696536267, + "loss": 1.5391, + "step": 6175 + }, + { + "epoch": 0.22117571221372678, + "grad_norm": 2.1587369441986084, + "learning_rate": 0.00018143689605527885, + "loss": 1.1614, + "step": 6176 + }, + { + "epoch": 0.22121152434329508, + "grad_norm": 1.8785380125045776, + "learning_rate": 0.00018143016404956669, + "loss": 1.1919, + "step": 6177 + }, + { + "epoch": 0.22124733647286335, + "grad_norm": 1.5250529050827026, + "learning_rate": 0.00018142343094831667, + "loss": 1.7925, + "step": 6178 + }, + { + "epoch": 0.22128314860243165, + "grad_norm": 1.775303840637207, + "learning_rate": 0.0001814166967516194, + "loss": 1.5447, + "step": 6179 + }, + { + "epoch": 0.22131896073199994, + "grad_norm": 1.2775332927703857, + "learning_rate": 0.00018140996145956552, + "loss": 1.1843, + "step": 6180 + }, + { + "epoch": 0.2213547728615682, + "grad_norm": 1.5202330350875854, + "learning_rate": 0.00018140322507224563, + "loss": 1.4731, + "step": 6181 + }, + { + "epoch": 0.2213905849911365, + "grad_norm": 1.5258673429489136, + "learning_rate": 0.00018139648758975032, + "loss": 1.2282, + "step": 6182 + }, + { + "epoch": 0.22142639712070478, + "grad_norm": 2.3224363327026367, + "learning_rate": 0.00018138974901217027, + "loss": 1.9237, + "step": 6183 + }, + { + "epoch": 0.22146220925027307, + "grad_norm": 1.9350823163986206, + "learning_rate": 0.00018138300933959615, + "loss": 1.67, + "step": 6184 + }, + { + "epoch": 0.22149802137984134, + "grad_norm": 1.8837072849273682, + "learning_rate": 0.0001813762685721186, + "loss": 1.4745, + "step": 6185 + }, + { + "epoch": 0.22153383350940964, + "grad_norm": 1.769679307937622, + "learning_rate": 0.00018136952670982833, + "loss": 1.3859, + "step": 6186 + }, + { + "epoch": 0.22156964563897794, + "grad_norm": 1.521981120109558, + "learning_rate": 0.00018136278375281605, + "loss": 1.5587, + "step": 6187 + }, + { + "epoch": 0.2216054577685462, + "grad_norm": 1.510655403137207, + "learning_rate": 0.00018135603970117242, + "loss": 1.5694, + "step": 6188 + }, + { + "epoch": 0.2216412698981145, + "grad_norm": 1.176619529724121, + "learning_rate": 0.00018134929455498828, + "loss": 1.48, + "step": 6189 + }, + { + "epoch": 0.22167708202768277, + "grad_norm": 2.1085398197174072, + "learning_rate": 0.0001813425483143543, + "loss": 1.8069, + "step": 6190 + }, + { + "epoch": 0.22171289415725107, + "grad_norm": 1.5834347009658813, + "learning_rate": 0.00018133580097936123, + "loss": 1.5686, + "step": 6191 + }, + { + "epoch": 0.22174870628681934, + "grad_norm": 2.538158893585205, + "learning_rate": 0.00018132905255009986, + "loss": 1.6837, + "step": 6192 + }, + { + "epoch": 0.22178451841638763, + "grad_norm": 1.5322290658950806, + "learning_rate": 0.00018132230302666104, + "loss": 1.6239, + "step": 6193 + }, + { + "epoch": 0.2218203305459559, + "grad_norm": 2.453597068786621, + "learning_rate": 0.0001813155524091355, + "loss": 1.3409, + "step": 6194 + }, + { + "epoch": 0.2218561426755242, + "grad_norm": 1.7916405200958252, + "learning_rate": 0.00018130880069761412, + "loss": 1.8466, + "step": 6195 + }, + { + "epoch": 0.2218919548050925, + "grad_norm": 1.4588032960891724, + "learning_rate": 0.00018130204789218769, + "loss": 1.7948, + "step": 6196 + }, + { + "epoch": 0.22192776693466076, + "grad_norm": 1.7840462923049927, + "learning_rate": 0.00018129529399294706, + "loss": 1.9397, + "step": 6197 + }, + { + "epoch": 0.22196357906422906, + "grad_norm": 1.4719196557998657, + "learning_rate": 0.00018128853899998312, + "loss": 1.3483, + "step": 6198 + }, + { + "epoch": 0.22199939119379733, + "grad_norm": 1.5863127708435059, + "learning_rate": 0.00018128178291338678, + "loss": 1.7581, + "step": 6199 + }, + { + "epoch": 0.22203520332336563, + "grad_norm": 1.4045363664627075, + "learning_rate": 0.00018127502573324887, + "loss": 1.4722, + "step": 6200 + }, + { + "epoch": 0.2220710154529339, + "grad_norm": 1.8057374954223633, + "learning_rate": 0.00018126826745966032, + "loss": 1.7084, + "step": 6201 + }, + { + "epoch": 0.2221068275825022, + "grad_norm": 1.5089365243911743, + "learning_rate": 0.00018126150809271208, + "loss": 1.678, + "step": 6202 + }, + { + "epoch": 0.2221426397120705, + "grad_norm": 1.7062714099884033, + "learning_rate": 0.00018125474763249505, + "loss": 1.7745, + "step": 6203 + }, + { + "epoch": 0.22217845184163876, + "grad_norm": 2.236283540725708, + "learning_rate": 0.00018124798607910018, + "loss": 1.6298, + "step": 6204 + }, + { + "epoch": 0.22221426397120705, + "grad_norm": 1.514384150505066, + "learning_rate": 0.0001812412234326185, + "loss": 1.5133, + "step": 6205 + }, + { + "epoch": 0.22225007610077532, + "grad_norm": 1.520835041999817, + "learning_rate": 0.00018123445969314095, + "loss": 1.8003, + "step": 6206 + }, + { + "epoch": 0.22228588823034362, + "grad_norm": 2.0588576793670654, + "learning_rate": 0.00018122769486075854, + "loss": 1.6132, + "step": 6207 + }, + { + "epoch": 0.2223217003599119, + "grad_norm": 1.1865562200546265, + "learning_rate": 0.00018122092893556224, + "loss": 1.5413, + "step": 6208 + }, + { + "epoch": 0.22235751248948019, + "grad_norm": 1.4218183755874634, + "learning_rate": 0.0001812141619176431, + "loss": 1.5644, + "step": 6209 + }, + { + "epoch": 0.22239332461904848, + "grad_norm": 1.5924785137176514, + "learning_rate": 0.00018120739380709218, + "loss": 1.6539, + "step": 6210 + }, + { + "epoch": 0.22242913674861675, + "grad_norm": 1.443805456161499, + "learning_rate": 0.00018120062460400056, + "loss": 1.3789, + "step": 6211 + }, + { + "epoch": 0.22246494887818505, + "grad_norm": 1.639870285987854, + "learning_rate": 0.00018119385430845925, + "loss": 1.5284, + "step": 6212 + }, + { + "epoch": 0.22250076100775332, + "grad_norm": 1.666635274887085, + "learning_rate": 0.00018118708292055936, + "loss": 1.8486, + "step": 6213 + }, + { + "epoch": 0.2225365731373216, + "grad_norm": 2.6075167655944824, + "learning_rate": 0.00018118031044039198, + "loss": 1.778, + "step": 6214 + }, + { + "epoch": 0.22257238526688988, + "grad_norm": 1.558547019958496, + "learning_rate": 0.00018117353686804825, + "loss": 1.4225, + "step": 6215 + }, + { + "epoch": 0.22260819739645818, + "grad_norm": 1.4020191431045532, + "learning_rate": 0.00018116676220361933, + "loss": 1.5796, + "step": 6216 + }, + { + "epoch": 0.22264400952602648, + "grad_norm": 2.120662212371826, + "learning_rate": 0.00018115998644719627, + "loss": 1.9943, + "step": 6217 + }, + { + "epoch": 0.22267982165559475, + "grad_norm": 1.3515700101852417, + "learning_rate": 0.0001811532095988703, + "loss": 1.3811, + "step": 6218 + }, + { + "epoch": 0.22271563378516304, + "grad_norm": 1.45658278465271, + "learning_rate": 0.00018114643165873258, + "loss": 1.5035, + "step": 6219 + }, + { + "epoch": 0.2227514459147313, + "grad_norm": 1.6097304821014404, + "learning_rate": 0.00018113965262687426, + "loss": 1.524, + "step": 6220 + }, + { + "epoch": 0.2227872580442996, + "grad_norm": 1.5943652391433716, + "learning_rate": 0.00018113287250338662, + "loss": 1.3009, + "step": 6221 + }, + { + "epoch": 0.22282307017386788, + "grad_norm": 1.5684243440628052, + "learning_rate": 0.0001811260912883608, + "loss": 1.4074, + "step": 6222 + }, + { + "epoch": 0.22285888230343617, + "grad_norm": 1.7942955493927002, + "learning_rate": 0.0001811193089818881, + "loss": 1.5083, + "step": 6223 + }, + { + "epoch": 0.22289469443300447, + "grad_norm": 2.2910614013671875, + "learning_rate": 0.0001811125255840597, + "loss": 1.6178, + "step": 6224 + }, + { + "epoch": 0.22293050656257274, + "grad_norm": 1.332371473312378, + "learning_rate": 0.00018110574109496692, + "loss": 1.4492, + "step": 6225 + }, + { + "epoch": 0.22296631869214104, + "grad_norm": 1.8471901416778564, + "learning_rate": 0.000181098955514701, + "loss": 1.9102, + "step": 6226 + }, + { + "epoch": 0.2230021308217093, + "grad_norm": 1.6000697612762451, + "learning_rate": 0.00018109216884335325, + "loss": 1.6102, + "step": 6227 + }, + { + "epoch": 0.2230379429512776, + "grad_norm": 2.0699243545532227, + "learning_rate": 0.00018108538108101496, + "loss": 1.8961, + "step": 6228 + }, + { + "epoch": 0.22307375508084587, + "grad_norm": 1.5482474565505981, + "learning_rate": 0.00018107859222777747, + "loss": 1.6179, + "step": 6229 + }, + { + "epoch": 0.22310956721041417, + "grad_norm": 1.4630217552185059, + "learning_rate": 0.0001810718022837321, + "loss": 1.3147, + "step": 6230 + }, + { + "epoch": 0.22314537933998246, + "grad_norm": 1.7212188243865967, + "learning_rate": 0.00018106501124897024, + "loss": 1.6115, + "step": 6231 + }, + { + "epoch": 0.22318119146955073, + "grad_norm": 1.2010308504104614, + "learning_rate": 0.00018105821912358318, + "loss": 1.5784, + "step": 6232 + }, + { + "epoch": 0.22321700359911903, + "grad_norm": 2.4285857677459717, + "learning_rate": 0.00018105142590766235, + "loss": 1.3719, + "step": 6233 + }, + { + "epoch": 0.2232528157286873, + "grad_norm": 1.9801949262619019, + "learning_rate": 0.00018104463160129912, + "loss": 1.5422, + "step": 6234 + }, + { + "epoch": 0.2232886278582556, + "grad_norm": 1.3408149480819702, + "learning_rate": 0.00018103783620458495, + "loss": 1.3496, + "step": 6235 + }, + { + "epoch": 0.22332443998782386, + "grad_norm": 1.6955066919326782, + "learning_rate": 0.0001810310397176112, + "loss": 1.6563, + "step": 6236 + }, + { + "epoch": 0.22336025211739216, + "grad_norm": 1.4549084901809692, + "learning_rate": 0.0001810242421404693, + "loss": 1.4026, + "step": 6237 + }, + { + "epoch": 0.22339606424696046, + "grad_norm": 1.6285228729248047, + "learning_rate": 0.00018101744347325078, + "loss": 1.511, + "step": 6238 + }, + { + "epoch": 0.22343187637652873, + "grad_norm": 1.6430824995040894, + "learning_rate": 0.00018101064371604705, + "loss": 1.3705, + "step": 6239 + }, + { + "epoch": 0.22346768850609702, + "grad_norm": 2.0069923400878906, + "learning_rate": 0.0001810038428689496, + "loss": 1.7298, + "step": 6240 + }, + { + "epoch": 0.2235035006356653, + "grad_norm": 1.5702707767486572, + "learning_rate": 0.00018099704093204997, + "loss": 1.7249, + "step": 6241 + }, + { + "epoch": 0.2235393127652336, + "grad_norm": 1.7741975784301758, + "learning_rate": 0.00018099023790543956, + "loss": 1.6276, + "step": 6242 + }, + { + "epoch": 0.22357512489480186, + "grad_norm": 2.8160290718078613, + "learning_rate": 0.00018098343378921002, + "loss": 1.3841, + "step": 6243 + }, + { + "epoch": 0.22361093702437015, + "grad_norm": 1.9434565305709839, + "learning_rate": 0.00018097662858345282, + "loss": 1.2333, + "step": 6244 + }, + { + "epoch": 0.22364674915393845, + "grad_norm": 1.4331356287002563, + "learning_rate": 0.00018096982228825957, + "loss": 1.6064, + "step": 6245 + }, + { + "epoch": 0.22368256128350672, + "grad_norm": 1.8122775554656982, + "learning_rate": 0.00018096301490372175, + "loss": 1.5847, + "step": 6246 + }, + { + "epoch": 0.22371837341307502, + "grad_norm": 1.7992037534713745, + "learning_rate": 0.00018095620642993106, + "loss": 1.5136, + "step": 6247 + }, + { + "epoch": 0.22375418554264329, + "grad_norm": 2.3840503692626953, + "learning_rate": 0.000180949396866979, + "loss": 1.4274, + "step": 6248 + }, + { + "epoch": 0.22378999767221158, + "grad_norm": 1.6450737714767456, + "learning_rate": 0.00018094258621495724, + "loss": 1.5595, + "step": 6249 + }, + { + "epoch": 0.22382580980177985, + "grad_norm": 1.2827435731887817, + "learning_rate": 0.00018093577447395737, + "loss": 1.6185, + "step": 6250 + }, + { + "epoch": 0.22386162193134815, + "grad_norm": 2.183157444000244, + "learning_rate": 0.00018092896164407108, + "loss": 1.3872, + "step": 6251 + }, + { + "epoch": 0.22389743406091644, + "grad_norm": 2.581634521484375, + "learning_rate": 0.00018092214772538994, + "loss": 1.6593, + "step": 6252 + }, + { + "epoch": 0.2239332461904847, + "grad_norm": 1.961072325706482, + "learning_rate": 0.00018091533271800576, + "loss": 1.5356, + "step": 6253 + }, + { + "epoch": 0.223969058320053, + "grad_norm": 2.0827033519744873, + "learning_rate": 0.00018090851662201011, + "loss": 1.5634, + "step": 6254 + }, + { + "epoch": 0.22400487044962128, + "grad_norm": 1.3981750011444092, + "learning_rate": 0.00018090169943749476, + "loss": 1.6067, + "step": 6255 + }, + { + "epoch": 0.22404068257918958, + "grad_norm": 1.7544121742248535, + "learning_rate": 0.00018089488116455137, + "loss": 1.7618, + "step": 6256 + }, + { + "epoch": 0.22407649470875785, + "grad_norm": 1.768568754196167, + "learning_rate": 0.00018088806180327174, + "loss": 1.7251, + "step": 6257 + }, + { + "epoch": 0.22411230683832614, + "grad_norm": 1.7023133039474487, + "learning_rate": 0.00018088124135374754, + "loss": 1.3179, + "step": 6258 + }, + { + "epoch": 0.22414811896789444, + "grad_norm": 1.566927433013916, + "learning_rate": 0.00018087441981607056, + "loss": 1.3815, + "step": 6259 + }, + { + "epoch": 0.2241839310974627, + "grad_norm": 1.7557883262634277, + "learning_rate": 0.00018086759719033261, + "loss": 1.6312, + "step": 6260 + }, + { + "epoch": 0.224219743227031, + "grad_norm": 1.8031694889068604, + "learning_rate": 0.00018086077347662544, + "loss": 1.3539, + "step": 6261 + }, + { + "epoch": 0.22425555535659927, + "grad_norm": 1.501671552658081, + "learning_rate": 0.00018085394867504087, + "loss": 1.7447, + "step": 6262 + }, + { + "epoch": 0.22429136748616757, + "grad_norm": 1.342667579650879, + "learning_rate": 0.00018084712278567072, + "loss": 1.5844, + "step": 6263 + }, + { + "epoch": 0.22432717961573584, + "grad_norm": 2.0589170455932617, + "learning_rate": 0.00018084029580860679, + "loss": 2.0065, + "step": 6264 + }, + { + "epoch": 0.22436299174530414, + "grad_norm": 1.7275969982147217, + "learning_rate": 0.000180833467743941, + "loss": 1.4656, + "step": 6265 + }, + { + "epoch": 0.22439880387487243, + "grad_norm": 1.4489154815673828, + "learning_rate": 0.00018082663859176514, + "loss": 1.4036, + "step": 6266 + }, + { + "epoch": 0.2244346160044407, + "grad_norm": 1.4381591081619263, + "learning_rate": 0.00018081980835217115, + "loss": 1.5086, + "step": 6267 + }, + { + "epoch": 0.224470428134009, + "grad_norm": 1.5848140716552734, + "learning_rate": 0.00018081297702525083, + "loss": 1.88, + "step": 6268 + }, + { + "epoch": 0.22450624026357727, + "grad_norm": 1.8268117904663086, + "learning_rate": 0.0001808061446110962, + "loss": 1.713, + "step": 6269 + }, + { + "epoch": 0.22454205239314556, + "grad_norm": 2.428276538848877, + "learning_rate": 0.0001807993111097991, + "loss": 1.7823, + "step": 6270 + }, + { + "epoch": 0.22457786452271383, + "grad_norm": 1.898605465888977, + "learning_rate": 0.0001807924765214515, + "loss": 1.4447, + "step": 6271 + }, + { + "epoch": 0.22461367665228213, + "grad_norm": 1.346147894859314, + "learning_rate": 0.00018078564084614534, + "loss": 1.4593, + "step": 6272 + }, + { + "epoch": 0.22464948878185043, + "grad_norm": 1.8311480283737183, + "learning_rate": 0.0001807788040839726, + "loss": 1.6737, + "step": 6273 + }, + { + "epoch": 0.2246853009114187, + "grad_norm": 1.3992347717285156, + "learning_rate": 0.0001807719662350252, + "loss": 1.5412, + "step": 6274 + }, + { + "epoch": 0.224721113040987, + "grad_norm": 1.5278624296188354, + "learning_rate": 0.00018076512729939522, + "loss": 1.6334, + "step": 6275 + }, + { + "epoch": 0.22475692517055526, + "grad_norm": 1.8901277780532837, + "learning_rate": 0.00018075828727717464, + "loss": 1.8401, + "step": 6276 + }, + { + "epoch": 0.22479273730012356, + "grad_norm": 2.278604745864868, + "learning_rate": 0.00018075144616845544, + "loss": 1.7158, + "step": 6277 + }, + { + "epoch": 0.22482854942969183, + "grad_norm": 1.5036617517471313, + "learning_rate": 0.00018074460397332973, + "loss": 1.7371, + "step": 6278 + }, + { + "epoch": 0.22486436155926012, + "grad_norm": 1.6316254138946533, + "learning_rate": 0.00018073776069188954, + "loss": 1.6364, + "step": 6279 + }, + { + "epoch": 0.22490017368882842, + "grad_norm": 1.824449062347412, + "learning_rate": 0.0001807309163242269, + "loss": 1.2859, + "step": 6280 + }, + { + "epoch": 0.2249359858183967, + "grad_norm": 1.523816466331482, + "learning_rate": 0.0001807240708704339, + "loss": 1.6968, + "step": 6281 + }, + { + "epoch": 0.22497179794796499, + "grad_norm": 1.8160182237625122, + "learning_rate": 0.0001807172243306027, + "loss": 1.5923, + "step": 6282 + }, + { + "epoch": 0.22500761007753325, + "grad_norm": 1.6295522451400757, + "learning_rate": 0.00018071037670482532, + "loss": 1.5968, + "step": 6283 + }, + { + "epoch": 0.22504342220710155, + "grad_norm": 1.463212251663208, + "learning_rate": 0.00018070352799319395, + "loss": 1.5347, + "step": 6284 + }, + { + "epoch": 0.22507923433666982, + "grad_norm": 2.812549114227295, + "learning_rate": 0.0001806966781958007, + "loss": 1.654, + "step": 6285 + }, + { + "epoch": 0.22511504646623812, + "grad_norm": 2.011024236679077, + "learning_rate": 0.00018068982731273773, + "loss": 1.9352, + "step": 6286 + }, + { + "epoch": 0.2251508585958064, + "grad_norm": 1.4859150648117065, + "learning_rate": 0.00018068297534409725, + "loss": 1.6877, + "step": 6287 + }, + { + "epoch": 0.22518667072537468, + "grad_norm": 1.5548865795135498, + "learning_rate": 0.00018067612228997137, + "loss": 1.53, + "step": 6288 + }, + { + "epoch": 0.22522248285494298, + "grad_norm": 1.7031159400939941, + "learning_rate": 0.00018066926815045236, + "loss": 1.8067, + "step": 6289 + }, + { + "epoch": 0.22525829498451125, + "grad_norm": 1.9277985095977783, + "learning_rate": 0.00018066241292563238, + "loss": 1.3285, + "step": 6290 + }, + { + "epoch": 0.22529410711407954, + "grad_norm": 1.4769515991210938, + "learning_rate": 0.00018065555661560368, + "loss": 1.6458, + "step": 6291 + }, + { + "epoch": 0.2253299192436478, + "grad_norm": 2.014338970184326, + "learning_rate": 0.00018064869922045852, + "loss": 1.4688, + "step": 6292 + }, + { + "epoch": 0.2253657313732161, + "grad_norm": 2.711336851119995, + "learning_rate": 0.00018064184074028915, + "loss": 1.5906, + "step": 6293 + }, + { + "epoch": 0.22540154350278438, + "grad_norm": 1.8387864828109741, + "learning_rate": 0.0001806349811751878, + "loss": 1.6775, + "step": 6294 + }, + { + "epoch": 0.22543735563235268, + "grad_norm": 1.484028935432434, + "learning_rate": 0.00018062812052524683, + "loss": 1.4705, + "step": 6295 + }, + { + "epoch": 0.22547316776192097, + "grad_norm": 1.9155514240264893, + "learning_rate": 0.00018062125879055846, + "loss": 1.6196, + "step": 6296 + }, + { + "epoch": 0.22550897989148924, + "grad_norm": 2.07671856880188, + "learning_rate": 0.00018061439597121508, + "loss": 1.6635, + "step": 6297 + }, + { + "epoch": 0.22554479202105754, + "grad_norm": 1.2981393337249756, + "learning_rate": 0.000180607532067309, + "loss": 1.3212, + "step": 6298 + }, + { + "epoch": 0.2255806041506258, + "grad_norm": 1.5452812910079956, + "learning_rate": 0.0001806006670789325, + "loss": 1.7199, + "step": 6299 + }, + { + "epoch": 0.2256164162801941, + "grad_norm": 2.042917251586914, + "learning_rate": 0.00018059380100617802, + "loss": 1.561, + "step": 6300 + }, + { + "epoch": 0.22565222840976237, + "grad_norm": 2.002063035964966, + "learning_rate": 0.0001805869338491379, + "loss": 1.4581, + "step": 6301 + }, + { + "epoch": 0.22568804053933067, + "grad_norm": 1.6189521551132202, + "learning_rate": 0.00018058006560790453, + "loss": 1.2429, + "step": 6302 + }, + { + "epoch": 0.22572385266889897, + "grad_norm": 1.7002862691879272, + "learning_rate": 0.00018057319628257034, + "loss": 1.6999, + "step": 6303 + }, + { + "epoch": 0.22575966479846724, + "grad_norm": 1.913028597831726, + "learning_rate": 0.0001805663258732277, + "loss": 1.9182, + "step": 6304 + }, + { + "epoch": 0.22579547692803553, + "grad_norm": 1.3990799188613892, + "learning_rate": 0.0001805594543799691, + "loss": 1.6439, + "step": 6305 + }, + { + "epoch": 0.2258312890576038, + "grad_norm": 1.9262644052505493, + "learning_rate": 0.00018055258180288696, + "loss": 1.6578, + "step": 6306 + }, + { + "epoch": 0.2258671011871721, + "grad_norm": 1.1598010063171387, + "learning_rate": 0.0001805457081420737, + "loss": 1.645, + "step": 6307 + }, + { + "epoch": 0.22590291331674037, + "grad_norm": 2.582381248474121, + "learning_rate": 0.00018053883339762183, + "loss": 1.9006, + "step": 6308 + }, + { + "epoch": 0.22593872544630866, + "grad_norm": 1.5848309993743896, + "learning_rate": 0.00018053195756962388, + "loss": 1.4253, + "step": 6309 + }, + { + "epoch": 0.22597453757587696, + "grad_norm": 1.6140457391738892, + "learning_rate": 0.0001805250806581723, + "loss": 1.3751, + "step": 6310 + }, + { + "epoch": 0.22601034970544523, + "grad_norm": 1.7949855327606201, + "learning_rate": 0.00018051820266335963, + "loss": 1.4448, + "step": 6311 + }, + { + "epoch": 0.22604616183501353, + "grad_norm": 1.4919768571853638, + "learning_rate": 0.0001805113235852784, + "loss": 1.4016, + "step": 6312 + }, + { + "epoch": 0.2260819739645818, + "grad_norm": 1.2824842929840088, + "learning_rate": 0.00018050444342402114, + "loss": 1.3802, + "step": 6313 + }, + { + "epoch": 0.2261177860941501, + "grad_norm": 1.514653205871582, + "learning_rate": 0.0001804975621796805, + "loss": 1.4011, + "step": 6314 + }, + { + "epoch": 0.22615359822371836, + "grad_norm": 1.4949651956558228, + "learning_rate": 0.00018049067985234895, + "loss": 1.3139, + "step": 6315 + }, + { + "epoch": 0.22618941035328666, + "grad_norm": 1.9180841445922852, + "learning_rate": 0.00018048379644211915, + "loss": 1.4127, + "step": 6316 + }, + { + "epoch": 0.22622522248285495, + "grad_norm": 1.3751518726348877, + "learning_rate": 0.00018047691194908368, + "loss": 1.4633, + "step": 6317 + }, + { + "epoch": 0.22626103461242322, + "grad_norm": 2.1771159172058105, + "learning_rate": 0.00018047002637333517, + "loss": 1.6793, + "step": 6318 + }, + { + "epoch": 0.22629684674199152, + "grad_norm": 1.4497225284576416, + "learning_rate": 0.00018046313971496622, + "loss": 1.4479, + "step": 6319 + }, + { + "epoch": 0.2263326588715598, + "grad_norm": 1.9885940551757812, + "learning_rate": 0.00018045625197406957, + "loss": 1.4871, + "step": 6320 + }, + { + "epoch": 0.22636847100112809, + "grad_norm": 1.246687412261963, + "learning_rate": 0.00018044936315073779, + "loss": 1.6281, + "step": 6321 + }, + { + "epoch": 0.22640428313069635, + "grad_norm": 3.3700385093688965, + "learning_rate": 0.0001804424732450636, + "loss": 1.9536, + "step": 6322 + }, + { + "epoch": 0.22644009526026465, + "grad_norm": 1.9437425136566162, + "learning_rate": 0.0001804355822571397, + "loss": 1.7087, + "step": 6323 + }, + { + "epoch": 0.22647590738983295, + "grad_norm": 1.4002232551574707, + "learning_rate": 0.00018042869018705882, + "loss": 1.37, + "step": 6324 + }, + { + "epoch": 0.22651171951940122, + "grad_norm": 1.4852545261383057, + "learning_rate": 0.00018042179703491365, + "loss": 1.4754, + "step": 6325 + }, + { + "epoch": 0.2265475316489695, + "grad_norm": 1.689438819885254, + "learning_rate": 0.00018041490280079693, + "loss": 1.6379, + "step": 6326 + }, + { + "epoch": 0.22658334377853778, + "grad_norm": 1.4079216718673706, + "learning_rate": 0.00018040800748480142, + "loss": 1.4282, + "step": 6327 + }, + { + "epoch": 0.22661915590810608, + "grad_norm": 1.5791819095611572, + "learning_rate": 0.00018040111108701988, + "loss": 1.7066, + "step": 6328 + }, + { + "epoch": 0.22665496803767435, + "grad_norm": 1.8294531106948853, + "learning_rate": 0.00018039421360754513, + "loss": 1.7546, + "step": 6329 + }, + { + "epoch": 0.22669078016724264, + "grad_norm": 1.7738605737686157, + "learning_rate": 0.0001803873150464699, + "loss": 1.3616, + "step": 6330 + }, + { + "epoch": 0.22672659229681094, + "grad_norm": 1.9782477617263794, + "learning_rate": 0.00018038041540388705, + "loss": 1.3159, + "step": 6331 + }, + { + "epoch": 0.2267624044263792, + "grad_norm": 1.4635730981826782, + "learning_rate": 0.00018037351467988942, + "loss": 1.4511, + "step": 6332 + }, + { + "epoch": 0.2267982165559475, + "grad_norm": 1.2722338438034058, + "learning_rate": 0.0001803666128745698, + "loss": 1.2824, + "step": 6333 + }, + { + "epoch": 0.22683402868551578, + "grad_norm": 1.7284460067749023, + "learning_rate": 0.00018035970998802106, + "loss": 1.5667, + "step": 6334 + }, + { + "epoch": 0.22686984081508407, + "grad_norm": 2.6332521438598633, + "learning_rate": 0.0001803528060203361, + "loss": 1.4429, + "step": 6335 + }, + { + "epoch": 0.22690565294465234, + "grad_norm": 1.378933310508728, + "learning_rate": 0.00018034590097160778, + "loss": 1.5293, + "step": 6336 + }, + { + "epoch": 0.22694146507422064, + "grad_norm": 1.7138888835906982, + "learning_rate": 0.000180338994841929, + "loss": 1.4708, + "step": 6337 + }, + { + "epoch": 0.22697727720378894, + "grad_norm": 1.4144269227981567, + "learning_rate": 0.00018033208763139266, + "loss": 1.578, + "step": 6338 + }, + { + "epoch": 0.2270130893333572, + "grad_norm": 1.4831655025482178, + "learning_rate": 0.0001803251793400917, + "loss": 1.6398, + "step": 6339 + }, + { + "epoch": 0.2270489014629255, + "grad_norm": 1.3129905462265015, + "learning_rate": 0.0001803182699681191, + "loss": 1.4506, + "step": 6340 + }, + { + "epoch": 0.22708471359249377, + "grad_norm": 1.9662981033325195, + "learning_rate": 0.00018031135951556774, + "loss": 1.3558, + "step": 6341 + }, + { + "epoch": 0.22712052572206207, + "grad_norm": 2.3339595794677734, + "learning_rate": 0.00018030444798253066, + "loss": 1.4351, + "step": 6342 + }, + { + "epoch": 0.22715633785163034, + "grad_norm": 1.4761534929275513, + "learning_rate": 0.0001802975353691008, + "loss": 1.6159, + "step": 6343 + }, + { + "epoch": 0.22719214998119863, + "grad_norm": 1.3196064233779907, + "learning_rate": 0.00018029062167537117, + "loss": 1.6138, + "step": 6344 + }, + { + "epoch": 0.22722796211076693, + "grad_norm": 2.0419766902923584, + "learning_rate": 0.00018028370690143482, + "loss": 1.8339, + "step": 6345 + }, + { + "epoch": 0.2272637742403352, + "grad_norm": 1.5514767169952393, + "learning_rate": 0.00018027679104738473, + "loss": 1.3189, + "step": 6346 + }, + { + "epoch": 0.2272995863699035, + "grad_norm": 2.7501816749572754, + "learning_rate": 0.00018026987411331398, + "loss": 1.8378, + "step": 6347 + }, + { + "epoch": 0.22733539849947176, + "grad_norm": 1.7505801916122437, + "learning_rate": 0.0001802629560993156, + "loss": 1.1969, + "step": 6348 + }, + { + "epoch": 0.22737121062904006, + "grad_norm": 1.4281238317489624, + "learning_rate": 0.0001802560370054827, + "loss": 1.78, + "step": 6349 + }, + { + "epoch": 0.22740702275860833, + "grad_norm": 1.6368708610534668, + "learning_rate": 0.00018024911683190833, + "loss": 1.3031, + "step": 6350 + }, + { + "epoch": 0.22744283488817663, + "grad_norm": 1.8474522829055786, + "learning_rate": 0.0001802421955786856, + "loss": 1.8179, + "step": 6351 + }, + { + "epoch": 0.22747864701774492, + "grad_norm": 1.4447168111801147, + "learning_rate": 0.00018023527324590764, + "loss": 1.4314, + "step": 6352 + }, + { + "epoch": 0.2275144591473132, + "grad_norm": 1.9680529832839966, + "learning_rate": 0.0001802283498336676, + "loss": 1.5552, + "step": 6353 + }, + { + "epoch": 0.2275502712768815, + "grad_norm": 1.547568917274475, + "learning_rate": 0.00018022142534205858, + "loss": 1.5545, + "step": 6354 + }, + { + "epoch": 0.22758608340644976, + "grad_norm": 1.4191595315933228, + "learning_rate": 0.00018021449977117374, + "loss": 1.359, + "step": 6355 + }, + { + "epoch": 0.22762189553601805, + "grad_norm": 1.7385724782943726, + "learning_rate": 0.00018020757312110628, + "loss": 1.6338, + "step": 6356 + }, + { + "epoch": 0.22765770766558632, + "grad_norm": 1.5614619255065918, + "learning_rate": 0.0001802006453919494, + "loss": 1.7259, + "step": 6357 + }, + { + "epoch": 0.22769351979515462, + "grad_norm": 1.4145011901855469, + "learning_rate": 0.0001801937165837963, + "loss": 1.7155, + "step": 6358 + }, + { + "epoch": 0.22772933192472292, + "grad_norm": 1.572713017463684, + "learning_rate": 0.00018018678669674015, + "loss": 1.7646, + "step": 6359 + }, + { + "epoch": 0.22776514405429119, + "grad_norm": 1.4809653759002686, + "learning_rate": 0.00018017985573087425, + "loss": 1.4841, + "step": 6360 + }, + { + "epoch": 0.22780095618385948, + "grad_norm": 2.632725715637207, + "learning_rate": 0.0001801729236862918, + "loss": 1.9187, + "step": 6361 + }, + { + "epoch": 0.22783676831342775, + "grad_norm": 1.5056543350219727, + "learning_rate": 0.0001801659905630861, + "loss": 1.6402, + "step": 6362 + }, + { + "epoch": 0.22787258044299605, + "grad_norm": 1.3668396472930908, + "learning_rate": 0.00018015905636135037, + "loss": 1.4198, + "step": 6363 + }, + { + "epoch": 0.22790839257256432, + "grad_norm": 1.650078296661377, + "learning_rate": 0.00018015212108117793, + "loss": 1.8745, + "step": 6364 + }, + { + "epoch": 0.2279442047021326, + "grad_norm": 2.297715902328491, + "learning_rate": 0.0001801451847226621, + "loss": 1.8726, + "step": 6365 + }, + { + "epoch": 0.2279800168317009, + "grad_norm": 1.448528528213501, + "learning_rate": 0.0001801382472858962, + "loss": 1.5484, + "step": 6366 + }, + { + "epoch": 0.22801582896126918, + "grad_norm": 1.730246901512146, + "learning_rate": 0.00018013130877097357, + "loss": 1.4851, + "step": 6367 + }, + { + "epoch": 0.22805164109083748, + "grad_norm": 1.6730163097381592, + "learning_rate": 0.0001801243691779875, + "loss": 1.522, + "step": 6368 + }, + { + "epoch": 0.22808745322040574, + "grad_norm": 1.6910569667816162, + "learning_rate": 0.00018011742850703146, + "loss": 1.4104, + "step": 6369 + }, + { + "epoch": 0.22812326534997404, + "grad_norm": 1.7938308715820312, + "learning_rate": 0.00018011048675819872, + "loss": 1.4716, + "step": 6370 + }, + { + "epoch": 0.2281590774795423, + "grad_norm": 1.5565799474716187, + "learning_rate": 0.0001801035439315827, + "loss": 1.5583, + "step": 6371 + }, + { + "epoch": 0.2281948896091106, + "grad_norm": 1.8678364753723145, + "learning_rate": 0.00018009660002727684, + "loss": 1.5625, + "step": 6372 + }, + { + "epoch": 0.2282307017386789, + "grad_norm": 1.3503905534744263, + "learning_rate": 0.00018008965504537455, + "loss": 1.5808, + "step": 6373 + }, + { + "epoch": 0.22826651386824717, + "grad_norm": 1.5878171920776367, + "learning_rate": 0.00018008270898596927, + "loss": 1.7811, + "step": 6374 + }, + { + "epoch": 0.22830232599781547, + "grad_norm": 1.4853515625, + "learning_rate": 0.00018007576184915443, + "loss": 1.5756, + "step": 6375 + }, + { + "epoch": 0.22833813812738374, + "grad_norm": 1.5101004838943481, + "learning_rate": 0.00018006881363502348, + "loss": 1.3801, + "step": 6376 + }, + { + "epoch": 0.22837395025695204, + "grad_norm": 1.6938331127166748, + "learning_rate": 0.00018006186434366996, + "loss": 1.7725, + "step": 6377 + }, + { + "epoch": 0.2284097623865203, + "grad_norm": 2.058910846710205, + "learning_rate": 0.0001800549139751873, + "loss": 1.5681, + "step": 6378 + }, + { + "epoch": 0.2284455745160886, + "grad_norm": 1.5402759313583374, + "learning_rate": 0.00018004796252966908, + "loss": 1.4617, + "step": 6379 + }, + { + "epoch": 0.2284813866456569, + "grad_norm": 1.5306357145309448, + "learning_rate": 0.00018004101000720872, + "loss": 1.7913, + "step": 6380 + }, + { + "epoch": 0.22851719877522517, + "grad_norm": 1.2929790019989014, + "learning_rate": 0.00018003405640789987, + "loss": 1.553, + "step": 6381 + }, + { + "epoch": 0.22855301090479346, + "grad_norm": 2.393298387527466, + "learning_rate": 0.00018002710173183596, + "loss": 1.6883, + "step": 6382 + }, + { + "epoch": 0.22858882303436173, + "grad_norm": 2.0982320308685303, + "learning_rate": 0.00018002014597911066, + "loss": 1.6216, + "step": 6383 + }, + { + "epoch": 0.22862463516393003, + "grad_norm": 1.3045873641967773, + "learning_rate": 0.00018001318914981753, + "loss": 1.4899, + "step": 6384 + }, + { + "epoch": 0.2286604472934983, + "grad_norm": 2.3285608291625977, + "learning_rate": 0.00018000623124405014, + "loss": 1.4443, + "step": 6385 + }, + { + "epoch": 0.2286962594230666, + "grad_norm": 1.8748106956481934, + "learning_rate": 0.0001799992722619021, + "loss": 1.2426, + "step": 6386 + }, + { + "epoch": 0.22873207155263486, + "grad_norm": 1.3914124965667725, + "learning_rate": 0.00017999231220346707, + "loss": 1.5566, + "step": 6387 + }, + { + "epoch": 0.22876788368220316, + "grad_norm": 1.7344447374343872, + "learning_rate": 0.00017998535106883862, + "loss": 1.052, + "step": 6388 + }, + { + "epoch": 0.22880369581177146, + "grad_norm": 1.5599722862243652, + "learning_rate": 0.00017997838885811047, + "loss": 1.6529, + "step": 6389 + }, + { + "epoch": 0.22883950794133973, + "grad_norm": 2.0522711277008057, + "learning_rate": 0.00017997142557137625, + "loss": 1.4925, + "step": 6390 + }, + { + "epoch": 0.22887532007090802, + "grad_norm": 1.712795376777649, + "learning_rate": 0.00017996446120872967, + "loss": 1.7085, + "step": 6391 + }, + { + "epoch": 0.2289111322004763, + "grad_norm": 1.9475706815719604, + "learning_rate": 0.00017995749577026443, + "loss": 1.6277, + "step": 6392 + }, + { + "epoch": 0.2289469443300446, + "grad_norm": 1.562954068183899, + "learning_rate": 0.0001799505292560742, + "loss": 1.8237, + "step": 6393 + }, + { + "epoch": 0.22898275645961286, + "grad_norm": 1.4970970153808594, + "learning_rate": 0.00017994356166625271, + "loss": 1.6119, + "step": 6394 + }, + { + "epoch": 0.22901856858918115, + "grad_norm": 1.8240835666656494, + "learning_rate": 0.0001799365930008937, + "loss": 1.7235, + "step": 6395 + }, + { + "epoch": 0.22905438071874945, + "grad_norm": 1.410115361213684, + "learning_rate": 0.000179929623260091, + "loss": 1.429, + "step": 6396 + }, + { + "epoch": 0.22909019284831772, + "grad_norm": 1.732579231262207, + "learning_rate": 0.0001799226524439383, + "loss": 1.6388, + "step": 6397 + }, + { + "epoch": 0.22912600497788602, + "grad_norm": 2.0173988342285156, + "learning_rate": 0.0001799156805525294, + "loss": 1.6149, + "step": 6398 + }, + { + "epoch": 0.22916181710745429, + "grad_norm": 1.4980366230010986, + "learning_rate": 0.00017990870758595811, + "loss": 1.5596, + "step": 6399 + }, + { + "epoch": 0.22919762923702258, + "grad_norm": 2.1325318813323975, + "learning_rate": 0.0001799017335443182, + "loss": 1.5557, + "step": 6400 + }, + { + "epoch": 0.22923344136659085, + "grad_norm": 1.7884522676467896, + "learning_rate": 0.00017989475842770358, + "loss": 1.6517, + "step": 6401 + }, + { + "epoch": 0.22926925349615915, + "grad_norm": 2.133835554122925, + "learning_rate": 0.00017988778223620799, + "loss": 1.7446, + "step": 6402 + }, + { + "epoch": 0.22930506562572744, + "grad_norm": 1.8718260526657104, + "learning_rate": 0.0001798808049699254, + "loss": 1.7916, + "step": 6403 + }, + { + "epoch": 0.2293408777552957, + "grad_norm": 1.6703238487243652, + "learning_rate": 0.00017987382662894955, + "loss": 1.5125, + "step": 6404 + }, + { + "epoch": 0.229376689884864, + "grad_norm": 1.7600868940353394, + "learning_rate": 0.00017986684721337442, + "loss": 1.5224, + "step": 6405 + }, + { + "epoch": 0.22941250201443228, + "grad_norm": 2.184349298477173, + "learning_rate": 0.00017985986672329392, + "loss": 1.5551, + "step": 6406 + }, + { + "epoch": 0.22944831414400058, + "grad_norm": 1.4052538871765137, + "learning_rate": 0.0001798528851588019, + "loss": 1.7645, + "step": 6407 + }, + { + "epoch": 0.22948412627356884, + "grad_norm": 1.7552518844604492, + "learning_rate": 0.0001798459025199923, + "loss": 1.5787, + "step": 6408 + }, + { + "epoch": 0.22951993840313714, + "grad_norm": 1.7903473377227783, + "learning_rate": 0.00017983891880695907, + "loss": 1.6544, + "step": 6409 + }, + { + "epoch": 0.22955575053270544, + "grad_norm": 1.63605797290802, + "learning_rate": 0.00017983193401979616, + "loss": 1.7336, + "step": 6410 + }, + { + "epoch": 0.2295915626622737, + "grad_norm": 1.7833870649337769, + "learning_rate": 0.0001798249481585976, + "loss": 1.425, + "step": 6411 + }, + { + "epoch": 0.229627374791842, + "grad_norm": 1.5450332164764404, + "learning_rate": 0.0001798179612234573, + "loss": 1.3434, + "step": 6412 + }, + { + "epoch": 0.22966318692141027, + "grad_norm": 1.3541749715805054, + "learning_rate": 0.0001798109732144693, + "loss": 1.5277, + "step": 6413 + }, + { + "epoch": 0.22969899905097857, + "grad_norm": 2.7870755195617676, + "learning_rate": 0.0001798039841317276, + "loss": 1.5438, + "step": 6414 + }, + { + "epoch": 0.22973481118054684, + "grad_norm": 1.9439632892608643, + "learning_rate": 0.00017979699397532625, + "loss": 1.6845, + "step": 6415 + }, + { + "epoch": 0.22977062331011514, + "grad_norm": 1.472992181777954, + "learning_rate": 0.00017979000274535926, + "loss": 1.59, + "step": 6416 + }, + { + "epoch": 0.22980643543968343, + "grad_norm": 1.9579699039459229, + "learning_rate": 0.0001797830104419207, + "loss": 1.3821, + "step": 6417 + }, + { + "epoch": 0.2298422475692517, + "grad_norm": 1.2323920726776123, + "learning_rate": 0.00017977601706510465, + "loss": 1.5868, + "step": 6418 + }, + { + "epoch": 0.22987805969882, + "grad_norm": 1.9308135509490967, + "learning_rate": 0.0001797690226150052, + "loss": 1.5732, + "step": 6419 + }, + { + "epoch": 0.22991387182838827, + "grad_norm": 1.283857822418213, + "learning_rate": 0.00017976202709171643, + "loss": 1.3582, + "step": 6420 + }, + { + "epoch": 0.22994968395795656, + "grad_norm": 1.7926145792007446, + "learning_rate": 0.0001797550304953325, + "loss": 1.542, + "step": 6421 + }, + { + "epoch": 0.22998549608752483, + "grad_norm": 2.011465072631836, + "learning_rate": 0.00017974803282594747, + "loss": 1.3821, + "step": 6422 + }, + { + "epoch": 0.23002130821709313, + "grad_norm": 1.5812910795211792, + "learning_rate": 0.00017974103408365557, + "loss": 1.5659, + "step": 6423 + }, + { + "epoch": 0.23005712034666143, + "grad_norm": 1.8776155710220337, + "learning_rate": 0.00017973403426855088, + "loss": 1.7368, + "step": 6424 + }, + { + "epoch": 0.2300929324762297, + "grad_norm": 1.4800854921340942, + "learning_rate": 0.00017972703338072762, + "loss": 1.6797, + "step": 6425 + }, + { + "epoch": 0.230128744605798, + "grad_norm": 1.9689767360687256, + "learning_rate": 0.00017972003142027997, + "loss": 2.1101, + "step": 6426 + }, + { + "epoch": 0.23016455673536626, + "grad_norm": 1.2748472690582275, + "learning_rate": 0.00017971302838730213, + "loss": 1.5971, + "step": 6427 + }, + { + "epoch": 0.23020036886493456, + "grad_norm": 1.3918734788894653, + "learning_rate": 0.00017970602428188834, + "loss": 1.4222, + "step": 6428 + }, + { + "epoch": 0.23023618099450283, + "grad_norm": 1.911766529083252, + "learning_rate": 0.00017969901910413276, + "loss": 1.3167, + "step": 6429 + }, + { + "epoch": 0.23027199312407112, + "grad_norm": 1.62160325050354, + "learning_rate": 0.00017969201285412972, + "loss": 1.6437, + "step": 6430 + }, + { + "epoch": 0.23030780525363942, + "grad_norm": 1.9006664752960205, + "learning_rate": 0.0001796850055319734, + "loss": 1.3403, + "step": 6431 + }, + { + "epoch": 0.2303436173832077, + "grad_norm": 3.353529691696167, + "learning_rate": 0.00017967799713775815, + "loss": 1.7944, + "step": 6432 + }, + { + "epoch": 0.23037942951277599, + "grad_norm": 2.1918132305145264, + "learning_rate": 0.00017967098767157822, + "loss": 1.7478, + "step": 6433 + }, + { + "epoch": 0.23041524164234425, + "grad_norm": 1.9099441766738892, + "learning_rate": 0.00017966397713352792, + "loss": 1.2382, + "step": 6434 + }, + { + "epoch": 0.23045105377191255, + "grad_norm": 1.6920967102050781, + "learning_rate": 0.00017965696552370156, + "loss": 1.4024, + "step": 6435 + }, + { + "epoch": 0.23048686590148082, + "grad_norm": 1.8513035774230957, + "learning_rate": 0.00017964995284219348, + "loss": 1.4771, + "step": 6436 + }, + { + "epoch": 0.23052267803104912, + "grad_norm": 2.202528476715088, + "learning_rate": 0.00017964293908909803, + "loss": 1.9447, + "step": 6437 + }, + { + "epoch": 0.2305584901606174, + "grad_norm": 1.4865727424621582, + "learning_rate": 0.00017963592426450956, + "loss": 1.4472, + "step": 6438 + }, + { + "epoch": 0.23059430229018568, + "grad_norm": 1.3838335275650024, + "learning_rate": 0.0001796289083685225, + "loss": 1.7319, + "step": 6439 + }, + { + "epoch": 0.23063011441975398, + "grad_norm": 1.7137506008148193, + "learning_rate": 0.0001796218914012311, + "loss": 1.6542, + "step": 6440 + }, + { + "epoch": 0.23066592654932225, + "grad_norm": 1.3480849266052246, + "learning_rate": 0.0001796148733627299, + "loss": 1.6549, + "step": 6441 + }, + { + "epoch": 0.23070173867889054, + "grad_norm": 1.6950236558914185, + "learning_rate": 0.00017960785425311332, + "loss": 1.503, + "step": 6442 + }, + { + "epoch": 0.2307375508084588, + "grad_norm": 2.1359996795654297, + "learning_rate": 0.0001796008340724757, + "loss": 1.5654, + "step": 6443 + }, + { + "epoch": 0.2307733629380271, + "grad_norm": 1.839069128036499, + "learning_rate": 0.00017959381282091152, + "loss": 1.5582, + "step": 6444 + }, + { + "epoch": 0.2308091750675954, + "grad_norm": 2.33095383644104, + "learning_rate": 0.0001795867904985153, + "loss": 1.7936, + "step": 6445 + }, + { + "epoch": 0.23084498719716368, + "grad_norm": 1.4011579751968384, + "learning_rate": 0.00017957976710538144, + "loss": 1.7156, + "step": 6446 + }, + { + "epoch": 0.23088079932673197, + "grad_norm": 1.6660329103469849, + "learning_rate": 0.00017957274264160448, + "loss": 1.8214, + "step": 6447 + }, + { + "epoch": 0.23091661145630024, + "grad_norm": 1.4621220827102661, + "learning_rate": 0.0001795657171072789, + "loss": 1.6597, + "step": 6448 + }, + { + "epoch": 0.23095242358586854, + "grad_norm": 1.8586870431900024, + "learning_rate": 0.00017955869050249925, + "loss": 2.0127, + "step": 6449 + }, + { + "epoch": 0.2309882357154368, + "grad_norm": 1.8170366287231445, + "learning_rate": 0.00017955166282736002, + "loss": 1.6028, + "step": 6450 + }, + { + "epoch": 0.2310240478450051, + "grad_norm": 1.4119073152542114, + "learning_rate": 0.0001795446340819558, + "loss": 1.1546, + "step": 6451 + }, + { + "epoch": 0.2310598599745734, + "grad_norm": 1.4222018718719482, + "learning_rate": 0.0001795376042663811, + "loss": 1.3849, + "step": 6452 + }, + { + "epoch": 0.23109567210414167, + "grad_norm": 1.6756181716918945, + "learning_rate": 0.00017953057338073055, + "loss": 1.6725, + "step": 6453 + }, + { + "epoch": 0.23113148423370997, + "grad_norm": 1.8242745399475098, + "learning_rate": 0.00017952354142509872, + "loss": 1.5498, + "step": 6454 + }, + { + "epoch": 0.23116729636327824, + "grad_norm": 1.50627601146698, + "learning_rate": 0.00017951650839958023, + "loss": 1.4107, + "step": 6455 + }, + { + "epoch": 0.23120310849284653, + "grad_norm": 1.5063188076019287, + "learning_rate": 0.0001795094743042697, + "loss": 1.5205, + "step": 6456 + }, + { + "epoch": 0.2312389206224148, + "grad_norm": 1.6867979764938354, + "learning_rate": 0.00017950243913926171, + "loss": 1.4739, + "step": 6457 + }, + { + "epoch": 0.2312747327519831, + "grad_norm": 2.8445029258728027, + "learning_rate": 0.000179495402904651, + "loss": 1.8396, + "step": 6458 + }, + { + "epoch": 0.2313105448815514, + "grad_norm": 1.4332222938537598, + "learning_rate": 0.00017948836560053216, + "loss": 1.3423, + "step": 6459 + }, + { + "epoch": 0.23134635701111966, + "grad_norm": 1.8352477550506592, + "learning_rate": 0.00017948132722699992, + "loss": 1.7802, + "step": 6460 + }, + { + "epoch": 0.23138216914068796, + "grad_norm": 1.4083185195922852, + "learning_rate": 0.0001794742877841489, + "loss": 1.6408, + "step": 6461 + }, + { + "epoch": 0.23141798127025623, + "grad_norm": 1.7609177827835083, + "learning_rate": 0.00017946724727207388, + "loss": 1.3707, + "step": 6462 + }, + { + "epoch": 0.23145379339982453, + "grad_norm": 2.6420741081237793, + "learning_rate": 0.00017946020569086955, + "loss": 1.2318, + "step": 6463 + }, + { + "epoch": 0.2314896055293928, + "grad_norm": 1.4578214883804321, + "learning_rate": 0.00017945316304063066, + "loss": 1.3948, + "step": 6464 + }, + { + "epoch": 0.2315254176589611, + "grad_norm": 1.539219617843628, + "learning_rate": 0.00017944611932145193, + "loss": 1.5818, + "step": 6465 + }, + { + "epoch": 0.2315612297885294, + "grad_norm": 2.507035493850708, + "learning_rate": 0.0001794390745334281, + "loss": 1.1414, + "step": 6466 + }, + { + "epoch": 0.23159704191809766, + "grad_norm": 2.5263805389404297, + "learning_rate": 0.00017943202867665408, + "loss": 1.7751, + "step": 6467 + }, + { + "epoch": 0.23163285404766595, + "grad_norm": 1.9388482570648193, + "learning_rate": 0.00017942498175122453, + "loss": 1.9844, + "step": 6468 + }, + { + "epoch": 0.23166866617723422, + "grad_norm": 1.8652827739715576, + "learning_rate": 0.0001794179337572343, + "loss": 1.4795, + "step": 6469 + }, + { + "epoch": 0.23170447830680252, + "grad_norm": 1.9985748529434204, + "learning_rate": 0.0001794108846947782, + "loss": 1.7811, + "step": 6470 + }, + { + "epoch": 0.2317402904363708, + "grad_norm": 2.3072433471679688, + "learning_rate": 0.00017940383456395109, + "loss": 1.4727, + "step": 6471 + }, + { + "epoch": 0.23177610256593908, + "grad_norm": 1.2530908584594727, + "learning_rate": 0.00017939678336484783, + "loss": 1.459, + "step": 6472 + }, + { + "epoch": 0.23181191469550738, + "grad_norm": 1.7884855270385742, + "learning_rate": 0.00017938973109756323, + "loss": 1.7551, + "step": 6473 + }, + { + "epoch": 0.23184772682507565, + "grad_norm": 2.037320613861084, + "learning_rate": 0.00017938267776219225, + "loss": 1.684, + "step": 6474 + }, + { + "epoch": 0.23188353895464395, + "grad_norm": 1.3687164783477783, + "learning_rate": 0.00017937562335882968, + "loss": 1.7375, + "step": 6475 + }, + { + "epoch": 0.23191935108421222, + "grad_norm": 1.592024564743042, + "learning_rate": 0.00017936856788757055, + "loss": 1.5635, + "step": 6476 + }, + { + "epoch": 0.2319551632137805, + "grad_norm": 1.433759331703186, + "learning_rate": 0.00017936151134850966, + "loss": 1.6694, + "step": 6477 + }, + { + "epoch": 0.23199097534334878, + "grad_norm": 2.6087303161621094, + "learning_rate": 0.00017935445374174202, + "loss": 1.5391, + "step": 6478 + }, + { + "epoch": 0.23202678747291708, + "grad_norm": 1.7297744750976562, + "learning_rate": 0.0001793473950673626, + "loss": 1.6245, + "step": 6479 + }, + { + "epoch": 0.23206259960248538, + "grad_norm": 1.6509273052215576, + "learning_rate": 0.00017934033532546632, + "loss": 1.6673, + "step": 6480 + }, + { + "epoch": 0.23209841173205364, + "grad_norm": 1.5647467374801636, + "learning_rate": 0.00017933327451614812, + "loss": 1.6559, + "step": 6481 + }, + { + "epoch": 0.23213422386162194, + "grad_norm": 1.4652289152145386, + "learning_rate": 0.0001793262126395031, + "loss": 1.5293, + "step": 6482 + }, + { + "epoch": 0.2321700359911902, + "grad_norm": 1.4504691362380981, + "learning_rate": 0.00017931914969562617, + "loss": 1.5716, + "step": 6483 + }, + { + "epoch": 0.2322058481207585, + "grad_norm": 1.9844943284988403, + "learning_rate": 0.0001793120856846124, + "loss": 1.6779, + "step": 6484 + }, + { + "epoch": 0.23224166025032678, + "grad_norm": 1.2851874828338623, + "learning_rate": 0.00017930502060655682, + "loss": 1.4328, + "step": 6485 + }, + { + "epoch": 0.23227747237989507, + "grad_norm": 1.9430853128433228, + "learning_rate": 0.0001792979544615545, + "loss": 1.7059, + "step": 6486 + }, + { + "epoch": 0.23231328450946334, + "grad_norm": 1.595819115638733, + "learning_rate": 0.00017929088724970052, + "loss": 1.567, + "step": 6487 + }, + { + "epoch": 0.23234909663903164, + "grad_norm": 2.0435125827789307, + "learning_rate": 0.0001792838189710899, + "loss": 1.8018, + "step": 6488 + }, + { + "epoch": 0.23238490876859993, + "grad_norm": 1.225368618965149, + "learning_rate": 0.00017927674962581774, + "loss": 1.3603, + "step": 6489 + }, + { + "epoch": 0.2324207208981682, + "grad_norm": 1.9997037649154663, + "learning_rate": 0.0001792696792139792, + "loss": 1.4549, + "step": 6490 + }, + { + "epoch": 0.2324565330277365, + "grad_norm": 1.4556210041046143, + "learning_rate": 0.0001792626077356694, + "loss": 1.5237, + "step": 6491 + }, + { + "epoch": 0.23249234515730477, + "grad_norm": 1.36371648311615, + "learning_rate": 0.0001792555351909834, + "loss": 1.7727, + "step": 6492 + }, + { + "epoch": 0.23252815728687307, + "grad_norm": 1.272359848022461, + "learning_rate": 0.0001792484615800164, + "loss": 1.4476, + "step": 6493 + }, + { + "epoch": 0.23256396941644134, + "grad_norm": 1.8688451051712036, + "learning_rate": 0.00017924138690286366, + "loss": 1.6197, + "step": 6494 + }, + { + "epoch": 0.23259978154600963, + "grad_norm": 1.4307365417480469, + "learning_rate": 0.0001792343111596202, + "loss": 1.5767, + "step": 6495 + }, + { + "epoch": 0.23263559367557793, + "grad_norm": 1.3361865282058716, + "learning_rate": 0.00017922723435038131, + "loss": 1.2674, + "step": 6496 + }, + { + "epoch": 0.2326714058051462, + "grad_norm": 1.858230710029602, + "learning_rate": 0.00017922015647524217, + "loss": 1.6375, + "step": 6497 + }, + { + "epoch": 0.2327072179347145, + "grad_norm": 1.5478800535202026, + "learning_rate": 0.00017921307753429803, + "loss": 1.2073, + "step": 6498 + }, + { + "epoch": 0.23274303006428276, + "grad_norm": 1.2986811399459839, + "learning_rate": 0.00017920599752764408, + "loss": 1.2458, + "step": 6499 + }, + { + "epoch": 0.23277884219385106, + "grad_norm": 1.7942970991134644, + "learning_rate": 0.00017919891645537563, + "loss": 1.5128, + "step": 6500 + }, + { + "epoch": 0.23281465432341933, + "grad_norm": 2.231881618499756, + "learning_rate": 0.0001791918343175879, + "loss": 1.7035, + "step": 6501 + }, + { + "epoch": 0.23285046645298763, + "grad_norm": 1.8615409135818481, + "learning_rate": 0.0001791847511143762, + "loss": 1.269, + "step": 6502 + }, + { + "epoch": 0.23288627858255592, + "grad_norm": 1.8193210363388062, + "learning_rate": 0.0001791776668458358, + "loss": 1.7327, + "step": 6503 + }, + { + "epoch": 0.2329220907121242, + "grad_norm": 1.4053817987442017, + "learning_rate": 0.00017917058151206204, + "loss": 1.6057, + "step": 6504 + }, + { + "epoch": 0.2329579028416925, + "grad_norm": 1.530077576637268, + "learning_rate": 0.00017916349511315022, + "loss": 1.6364, + "step": 6505 + }, + { + "epoch": 0.23299371497126076, + "grad_norm": 1.6833380460739136, + "learning_rate": 0.0001791564076491957, + "loss": 1.5841, + "step": 6506 + }, + { + "epoch": 0.23302952710082905, + "grad_norm": 1.6770578622817993, + "learning_rate": 0.0001791493191202938, + "loss": 1.8875, + "step": 6507 + }, + { + "epoch": 0.23306533923039732, + "grad_norm": 2.6495304107666016, + "learning_rate": 0.00017914222952653992, + "loss": 1.6587, + "step": 6508 + }, + { + "epoch": 0.23310115135996562, + "grad_norm": 1.6221790313720703, + "learning_rate": 0.00017913513886802943, + "loss": 1.5729, + "step": 6509 + }, + { + "epoch": 0.23313696348953392, + "grad_norm": 1.6502245664596558, + "learning_rate": 0.0001791280471448577, + "loss": 1.6398, + "step": 6510 + }, + { + "epoch": 0.23317277561910218, + "grad_norm": 1.5571110248565674, + "learning_rate": 0.00017912095435712017, + "loss": 1.515, + "step": 6511 + }, + { + "epoch": 0.23320858774867048, + "grad_norm": 1.4226124286651611, + "learning_rate": 0.0001791138605049123, + "loss": 1.7522, + "step": 6512 + }, + { + "epoch": 0.23324439987823875, + "grad_norm": 1.4621069431304932, + "learning_rate": 0.00017910676558832944, + "loss": 1.6877, + "step": 6513 + }, + { + "epoch": 0.23328021200780705, + "grad_norm": 2.131011724472046, + "learning_rate": 0.0001790996696074671, + "loss": 1.6325, + "step": 6514 + }, + { + "epoch": 0.23331602413737532, + "grad_norm": 1.4561113119125366, + "learning_rate": 0.00017909257256242076, + "loss": 1.3617, + "step": 6515 + }, + { + "epoch": 0.2333518362669436, + "grad_norm": 1.3390988111495972, + "learning_rate": 0.00017908547445328585, + "loss": 1.6789, + "step": 6516 + }, + { + "epoch": 0.2333876483965119, + "grad_norm": 1.5033286809921265, + "learning_rate": 0.0001790783752801579, + "loss": 1.547, + "step": 6517 + }, + { + "epoch": 0.23342346052608018, + "grad_norm": 1.6072916984558105, + "learning_rate": 0.00017907127504313241, + "loss": 1.3535, + "step": 6518 + }, + { + "epoch": 0.23345927265564848, + "grad_norm": 1.4692038297653198, + "learning_rate": 0.00017906417374230493, + "loss": 1.5247, + "step": 6519 + }, + { + "epoch": 0.23349508478521674, + "grad_norm": 1.9015370607376099, + "learning_rate": 0.00017905707137777098, + "loss": 1.353, + "step": 6520 + }, + { + "epoch": 0.23353089691478504, + "grad_norm": 1.8074142932891846, + "learning_rate": 0.00017904996794962608, + "loss": 1.4386, + "step": 6521 + }, + { + "epoch": 0.2335667090443533, + "grad_norm": 1.8867095708847046, + "learning_rate": 0.00017904286345796582, + "loss": 1.5851, + "step": 6522 + }, + { + "epoch": 0.2336025211739216, + "grad_norm": 1.6819953918457031, + "learning_rate": 0.00017903575790288585, + "loss": 1.5244, + "step": 6523 + }, + { + "epoch": 0.2336383333034899, + "grad_norm": 2.2761800289154053, + "learning_rate": 0.00017902865128448166, + "loss": 1.8374, + "step": 6524 + }, + { + "epoch": 0.23367414543305817, + "grad_norm": 1.5938763618469238, + "learning_rate": 0.00017902154360284893, + "loss": 1.3266, + "step": 6525 + }, + { + "epoch": 0.23370995756262647, + "grad_norm": 1.7620958089828491, + "learning_rate": 0.00017901443485808324, + "loss": 1.6188, + "step": 6526 + }, + { + "epoch": 0.23374576969219474, + "grad_norm": 1.395570158958435, + "learning_rate": 0.00017900732505028025, + "loss": 1.5055, + "step": 6527 + }, + { + "epoch": 0.23378158182176303, + "grad_norm": 1.467537522315979, + "learning_rate": 0.00017900021417953564, + "loss": 1.5082, + "step": 6528 + }, + { + "epoch": 0.2338173939513313, + "grad_norm": 1.5979235172271729, + "learning_rate": 0.000178993102245945, + "loss": 1.6297, + "step": 6529 + }, + { + "epoch": 0.2338532060808996, + "grad_norm": 1.9237210750579834, + "learning_rate": 0.0001789859892496041, + "loss": 1.4545, + "step": 6530 + }, + { + "epoch": 0.2338890182104679, + "grad_norm": 1.3320746421813965, + "learning_rate": 0.00017897887519060862, + "loss": 1.3044, + "step": 6531 + }, + { + "epoch": 0.23392483034003617, + "grad_norm": 1.386905312538147, + "learning_rate": 0.0001789717600690542, + "loss": 1.6688, + "step": 6532 + }, + { + "epoch": 0.23396064246960446, + "grad_norm": 1.8010025024414062, + "learning_rate": 0.00017896464388503664, + "loss": 1.2744, + "step": 6533 + }, + { + "epoch": 0.23399645459917273, + "grad_norm": 1.6420273780822754, + "learning_rate": 0.00017895752663865167, + "loss": 1.6643, + "step": 6534 + }, + { + "epoch": 0.23403226672874103, + "grad_norm": 1.6220277547836304, + "learning_rate": 0.00017895040832999502, + "loss": 1.6328, + "step": 6535 + }, + { + "epoch": 0.2340680788583093, + "grad_norm": 1.4307856559753418, + "learning_rate": 0.00017894328895916244, + "loss": 1.6593, + "step": 6536 + }, + { + "epoch": 0.2341038909878776, + "grad_norm": 1.9479830265045166, + "learning_rate": 0.00017893616852624974, + "loss": 1.4733, + "step": 6537 + }, + { + "epoch": 0.2341397031174459, + "grad_norm": 1.6316691637039185, + "learning_rate": 0.00017892904703135272, + "loss": 1.6507, + "step": 6538 + }, + { + "epoch": 0.23417551524701416, + "grad_norm": 1.9461170434951782, + "learning_rate": 0.0001789219244745672, + "loss": 0.9729, + "step": 6539 + }, + { + "epoch": 0.23421132737658246, + "grad_norm": 1.3413337469100952, + "learning_rate": 0.00017891480085598896, + "loss": 1.6239, + "step": 6540 + }, + { + "epoch": 0.23424713950615073, + "grad_norm": 1.4433578252792358, + "learning_rate": 0.00017890767617571388, + "loss": 1.4859, + "step": 6541 + }, + { + "epoch": 0.23428295163571902, + "grad_norm": 1.8441306352615356, + "learning_rate": 0.00017890055043383782, + "loss": 1.5165, + "step": 6542 + }, + { + "epoch": 0.2343187637652873, + "grad_norm": 1.71013343334198, + "learning_rate": 0.0001788934236304566, + "loss": 1.5745, + "step": 6543 + }, + { + "epoch": 0.2343545758948556, + "grad_norm": 2.440901517868042, + "learning_rate": 0.00017888629576566614, + "loss": 2.0483, + "step": 6544 + }, + { + "epoch": 0.23439038802442388, + "grad_norm": 1.9219796657562256, + "learning_rate": 0.00017887916683956233, + "loss": 1.4539, + "step": 6545 + }, + { + "epoch": 0.23442620015399215, + "grad_norm": 1.5786051750183105, + "learning_rate": 0.0001788720368522411, + "loss": 1.3918, + "step": 6546 + }, + { + "epoch": 0.23446201228356045, + "grad_norm": 1.9648666381835938, + "learning_rate": 0.0001788649058037983, + "loss": 1.5756, + "step": 6547 + }, + { + "epoch": 0.23449782441312872, + "grad_norm": 1.5821706056594849, + "learning_rate": 0.00017885777369432994, + "loss": 1.402, + "step": 6548 + }, + { + "epoch": 0.23453363654269702, + "grad_norm": 1.3339437246322632, + "learning_rate": 0.000178850640523932, + "loss": 1.5828, + "step": 6549 + }, + { + "epoch": 0.23456944867226528, + "grad_norm": 1.375614047050476, + "learning_rate": 0.00017884350629270035, + "loss": 1.6135, + "step": 6550 + }, + { + "epoch": 0.23460526080183358, + "grad_norm": 1.6413074731826782, + "learning_rate": 0.00017883637100073104, + "loss": 1.1794, + "step": 6551 + }, + { + "epoch": 0.23464107293140188, + "grad_norm": 1.5169514417648315, + "learning_rate": 0.00017882923464812006, + "loss": 1.1396, + "step": 6552 + }, + { + "epoch": 0.23467688506097015, + "grad_norm": 1.6196491718292236, + "learning_rate": 0.00017882209723496338, + "loss": 1.7209, + "step": 6553 + }, + { + "epoch": 0.23471269719053844, + "grad_norm": 1.7131088972091675, + "learning_rate": 0.00017881495876135708, + "loss": 1.6069, + "step": 6554 + }, + { + "epoch": 0.2347485093201067, + "grad_norm": 1.6686679124832153, + "learning_rate": 0.00017880781922739717, + "loss": 1.9669, + "step": 6555 + }, + { + "epoch": 0.234784321449675, + "grad_norm": 1.7967392206192017, + "learning_rate": 0.0001788006786331797, + "loss": 1.5043, + "step": 6556 + }, + { + "epoch": 0.23482013357924328, + "grad_norm": 1.6955111026763916, + "learning_rate": 0.00017879353697880073, + "loss": 1.6294, + "step": 6557 + }, + { + "epoch": 0.23485594570881158, + "grad_norm": 1.5779500007629395, + "learning_rate": 0.00017878639426435638, + "loss": 1.3588, + "step": 6558 + }, + { + "epoch": 0.23489175783837987, + "grad_norm": 1.3038164377212524, + "learning_rate": 0.00017877925048994273, + "loss": 1.5094, + "step": 6559 + }, + { + "epoch": 0.23492756996794814, + "grad_norm": 1.5466959476470947, + "learning_rate": 0.00017877210565565586, + "loss": 1.7958, + "step": 6560 + }, + { + "epoch": 0.23496338209751644, + "grad_norm": 1.521091103553772, + "learning_rate": 0.0001787649597615919, + "loss": 1.6854, + "step": 6561 + }, + { + "epoch": 0.2349991942270847, + "grad_norm": 1.5606482028961182, + "learning_rate": 0.00017875781280784705, + "loss": 1.4973, + "step": 6562 + }, + { + "epoch": 0.235035006356653, + "grad_norm": 2.0361833572387695, + "learning_rate": 0.0001787506647945174, + "loss": 1.3534, + "step": 6563 + }, + { + "epoch": 0.23507081848622127, + "grad_norm": 2.3265650272369385, + "learning_rate": 0.00017874351572169913, + "loss": 1.3959, + "step": 6564 + }, + { + "epoch": 0.23510663061578957, + "grad_norm": 1.6580675840377808, + "learning_rate": 0.00017873636558948846, + "loss": 1.4783, + "step": 6565 + }, + { + "epoch": 0.23514244274535787, + "grad_norm": 1.493943691253662, + "learning_rate": 0.00017872921439798152, + "loss": 1.6773, + "step": 6566 + }, + { + "epoch": 0.23517825487492613, + "grad_norm": 1.4382308721542358, + "learning_rate": 0.00017872206214727455, + "loss": 1.4732, + "step": 6567 + }, + { + "epoch": 0.23521406700449443, + "grad_norm": 1.3880226612091064, + "learning_rate": 0.0001787149088374638, + "loss": 1.462, + "step": 6568 + }, + { + "epoch": 0.2352498791340627, + "grad_norm": 1.3058699369430542, + "learning_rate": 0.00017870775446864547, + "loss": 1.5238, + "step": 6569 + }, + { + "epoch": 0.235285691263631, + "grad_norm": 2.446697235107422, + "learning_rate": 0.00017870059904091584, + "loss": 1.4423, + "step": 6570 + }, + { + "epoch": 0.23532150339319927, + "grad_norm": 1.8130626678466797, + "learning_rate": 0.00017869344255437117, + "loss": 1.5468, + "step": 6571 + }, + { + "epoch": 0.23535731552276756, + "grad_norm": 1.4712576866149902, + "learning_rate": 0.00017868628500910773, + "loss": 1.6648, + "step": 6572 + }, + { + "epoch": 0.23539312765233586, + "grad_norm": 1.2141464948654175, + "learning_rate": 0.00017867912640522182, + "loss": 1.7291, + "step": 6573 + }, + { + "epoch": 0.23542893978190413, + "grad_norm": 1.6197760105133057, + "learning_rate": 0.00017867196674280976, + "loss": 1.9246, + "step": 6574 + }, + { + "epoch": 0.23546475191147243, + "grad_norm": 1.6172927618026733, + "learning_rate": 0.00017866480602196787, + "loss": 1.3133, + "step": 6575 + }, + { + "epoch": 0.2355005640410407, + "grad_norm": 1.7807902097702026, + "learning_rate": 0.00017865764424279248, + "loss": 1.4431, + "step": 6576 + }, + { + "epoch": 0.235536376170609, + "grad_norm": 1.82480788230896, + "learning_rate": 0.00017865048140537995, + "loss": 1.7821, + "step": 6577 + }, + { + "epoch": 0.23557218830017726, + "grad_norm": 1.434335708618164, + "learning_rate": 0.00017864331750982665, + "loss": 1.4632, + "step": 6578 + }, + { + "epoch": 0.23560800042974556, + "grad_norm": 1.5445455312728882, + "learning_rate": 0.0001786361525562289, + "loss": 1.634, + "step": 6579 + }, + { + "epoch": 0.23564381255931385, + "grad_norm": 1.4568474292755127, + "learning_rate": 0.0001786289865446832, + "loss": 1.5565, + "step": 6580 + }, + { + "epoch": 0.23567962468888212, + "grad_norm": 1.866904377937317, + "learning_rate": 0.00017862181947528592, + "loss": 1.5269, + "step": 6581 + }, + { + "epoch": 0.23571543681845042, + "grad_norm": 2.703174352645874, + "learning_rate": 0.00017861465134813348, + "loss": 1.5168, + "step": 6582 + }, + { + "epoch": 0.2357512489480187, + "grad_norm": 1.7558021545410156, + "learning_rate": 0.00017860748216332227, + "loss": 1.4795, + "step": 6583 + }, + { + "epoch": 0.23578706107758698, + "grad_norm": 1.4444247484207153, + "learning_rate": 0.00017860031192094882, + "loss": 1.4356, + "step": 6584 + }, + { + "epoch": 0.23582287320715525, + "grad_norm": 1.3739668130874634, + "learning_rate": 0.00017859314062110954, + "loss": 1.4568, + "step": 6585 + }, + { + "epoch": 0.23585868533672355, + "grad_norm": 1.9809393882751465, + "learning_rate": 0.00017858596826390093, + "loss": 1.6483, + "step": 6586 + }, + { + "epoch": 0.23589449746629182, + "grad_norm": 1.643008828163147, + "learning_rate": 0.0001785787948494195, + "loss": 1.7832, + "step": 6587 + }, + { + "epoch": 0.23593030959586012, + "grad_norm": 1.5805420875549316, + "learning_rate": 0.00017857162037776173, + "loss": 1.5009, + "step": 6588 + }, + { + "epoch": 0.2359661217254284, + "grad_norm": 1.6239664554595947, + "learning_rate": 0.0001785644448490242, + "loss": 1.5902, + "step": 6589 + }, + { + "epoch": 0.23600193385499668, + "grad_norm": 1.4822232723236084, + "learning_rate": 0.00017855726826330334, + "loss": 1.5981, + "step": 6590 + }, + { + "epoch": 0.23603774598456498, + "grad_norm": 1.701465129852295, + "learning_rate": 0.00017855009062069582, + "loss": 1.4967, + "step": 6591 + }, + { + "epoch": 0.23607355811413325, + "grad_norm": 2.413892984390259, + "learning_rate": 0.00017854291192129812, + "loss": 1.6526, + "step": 6592 + }, + { + "epoch": 0.23610937024370154, + "grad_norm": 1.274664282798767, + "learning_rate": 0.00017853573216520684, + "loss": 1.6259, + "step": 6593 + }, + { + "epoch": 0.2361451823732698, + "grad_norm": 1.5732338428497314, + "learning_rate": 0.00017852855135251864, + "loss": 1.2351, + "step": 6594 + }, + { + "epoch": 0.2361809945028381, + "grad_norm": 1.7816098928451538, + "learning_rate": 0.00017852136948333006, + "loss": 1.6006, + "step": 6595 + }, + { + "epoch": 0.2362168066324064, + "grad_norm": 1.5788737535476685, + "learning_rate": 0.00017851418655773772, + "loss": 1.5484, + "step": 6596 + }, + { + "epoch": 0.23625261876197468, + "grad_norm": 1.6438406705856323, + "learning_rate": 0.00017850700257583828, + "loss": 1.704, + "step": 6597 + }, + { + "epoch": 0.23628843089154297, + "grad_norm": 1.7204110622406006, + "learning_rate": 0.00017849981753772836, + "loss": 1.6303, + "step": 6598 + }, + { + "epoch": 0.23632424302111124, + "grad_norm": 1.7380869388580322, + "learning_rate": 0.0001784926314435047, + "loss": 1.7262, + "step": 6599 + }, + { + "epoch": 0.23636005515067954, + "grad_norm": 2.4021716117858887, + "learning_rate": 0.00017848544429326392, + "loss": 1.4997, + "step": 6600 + }, + { + "epoch": 0.2363958672802478, + "grad_norm": 2.6693549156188965, + "learning_rate": 0.00017847825608710273, + "loss": 1.4439, + "step": 6601 + }, + { + "epoch": 0.2364316794098161, + "grad_norm": 1.5793838500976562, + "learning_rate": 0.00017847106682511782, + "loss": 1.519, + "step": 6602 + }, + { + "epoch": 0.2364674915393844, + "grad_norm": 2.942883253097534, + "learning_rate": 0.00017846387650740592, + "loss": 1.8072, + "step": 6603 + }, + { + "epoch": 0.23650330366895267, + "grad_norm": 1.5325664281845093, + "learning_rate": 0.00017845668513406378, + "loss": 1.3556, + "step": 6604 + }, + { + "epoch": 0.23653911579852097, + "grad_norm": 1.4211392402648926, + "learning_rate": 0.00017844949270518816, + "loss": 1.2079, + "step": 6605 + }, + { + "epoch": 0.23657492792808923, + "grad_norm": 1.4556447267532349, + "learning_rate": 0.00017844229922087582, + "loss": 1.477, + "step": 6606 + }, + { + "epoch": 0.23661074005765753, + "grad_norm": 2.2137489318847656, + "learning_rate": 0.00017843510468122347, + "loss": 1.6519, + "step": 6607 + }, + { + "epoch": 0.2366465521872258, + "grad_norm": 1.6451637744903564, + "learning_rate": 0.00017842790908632802, + "loss": 1.4015, + "step": 6608 + }, + { + "epoch": 0.2366823643167941, + "grad_norm": 1.9870620965957642, + "learning_rate": 0.00017842071243628617, + "loss": 1.3847, + "step": 6609 + }, + { + "epoch": 0.2367181764463624, + "grad_norm": 1.9691691398620605, + "learning_rate": 0.0001784135147311948, + "loss": 1.5867, + "step": 6610 + }, + { + "epoch": 0.23675398857593066, + "grad_norm": 1.8930854797363281, + "learning_rate": 0.00017840631597115076, + "loss": 1.7622, + "step": 6611 + }, + { + "epoch": 0.23678980070549896, + "grad_norm": 1.7381625175476074, + "learning_rate": 0.00017839911615625086, + "loss": 1.5428, + "step": 6612 + }, + { + "epoch": 0.23682561283506723, + "grad_norm": 1.6592036485671997, + "learning_rate": 0.00017839191528659198, + "loss": 1.6095, + "step": 6613 + }, + { + "epoch": 0.23686142496463553, + "grad_norm": 1.5722460746765137, + "learning_rate": 0.000178384713362271, + "loss": 1.529, + "step": 6614 + }, + { + "epoch": 0.2368972370942038, + "grad_norm": 1.8130706548690796, + "learning_rate": 0.00017837751038338482, + "loss": 1.619, + "step": 6615 + }, + { + "epoch": 0.2369330492237721, + "grad_norm": 1.874606728553772, + "learning_rate": 0.00017837030635003032, + "loss": 1.675, + "step": 6616 + }, + { + "epoch": 0.2369688613533404, + "grad_norm": 1.4484257698059082, + "learning_rate": 0.00017836310126230444, + "loss": 1.4822, + "step": 6617 + }, + { + "epoch": 0.23700467348290866, + "grad_norm": 1.4785351753234863, + "learning_rate": 0.00017835589512030413, + "loss": 1.6452, + "step": 6618 + }, + { + "epoch": 0.23704048561247695, + "grad_norm": 1.5347731113433838, + "learning_rate": 0.00017834868792412632, + "loss": 1.4137, + "step": 6619 + }, + { + "epoch": 0.23707629774204522, + "grad_norm": 1.8363127708435059, + "learning_rate": 0.00017834147967386797, + "loss": 1.3594, + "step": 6620 + }, + { + "epoch": 0.23711210987161352, + "grad_norm": 2.004275321960449, + "learning_rate": 0.00017833427036962604, + "loss": 1.6062, + "step": 6621 + }, + { + "epoch": 0.2371479220011818, + "grad_norm": 1.5115212202072144, + "learning_rate": 0.0001783270600114976, + "loss": 1.4327, + "step": 6622 + }, + { + "epoch": 0.23718373413075008, + "grad_norm": 1.6097291707992554, + "learning_rate": 0.00017831984859957955, + "loss": 1.4477, + "step": 6623 + }, + { + "epoch": 0.23721954626031838, + "grad_norm": 1.896243929862976, + "learning_rate": 0.00017831263613396898, + "loss": 1.7308, + "step": 6624 + }, + { + "epoch": 0.23725535838988665, + "grad_norm": 2.4240522384643555, + "learning_rate": 0.0001783054226147629, + "loss": 1.4856, + "step": 6625 + }, + { + "epoch": 0.23729117051945495, + "grad_norm": 1.8697086572647095, + "learning_rate": 0.0001782982080420584, + "loss": 1.3909, + "step": 6626 + }, + { + "epoch": 0.23732698264902322, + "grad_norm": 1.623146891593933, + "learning_rate": 0.00017829099241595245, + "loss": 1.6915, + "step": 6627 + }, + { + "epoch": 0.2373627947785915, + "grad_norm": 1.2264447212219238, + "learning_rate": 0.00017828377573654225, + "loss": 1.7342, + "step": 6628 + }, + { + "epoch": 0.23739860690815978, + "grad_norm": 1.5856890678405762, + "learning_rate": 0.00017827655800392478, + "loss": 1.7977, + "step": 6629 + }, + { + "epoch": 0.23743441903772808, + "grad_norm": 1.787320852279663, + "learning_rate": 0.00017826933921819723, + "loss": 1.4515, + "step": 6630 + }, + { + "epoch": 0.23747023116729638, + "grad_norm": 1.6283372640609741, + "learning_rate": 0.00017826211937945665, + "loss": 1.9511, + "step": 6631 + }, + { + "epoch": 0.23750604329686464, + "grad_norm": 1.7102324962615967, + "learning_rate": 0.00017825489848780022, + "loss": 1.6524, + "step": 6632 + }, + { + "epoch": 0.23754185542643294, + "grad_norm": 2.1773719787597656, + "learning_rate": 0.00017824767654332505, + "loss": 1.7596, + "step": 6633 + }, + { + "epoch": 0.2375776675560012, + "grad_norm": 1.3473145961761475, + "learning_rate": 0.00017824045354612836, + "loss": 1.2803, + "step": 6634 + }, + { + "epoch": 0.2376134796855695, + "grad_norm": 1.546225666999817, + "learning_rate": 0.00017823322949630727, + "loss": 1.5292, + "step": 6635 + }, + { + "epoch": 0.23764929181513778, + "grad_norm": 1.7746154069900513, + "learning_rate": 0.000178226004393959, + "loss": 1.4978, + "step": 6636 + }, + { + "epoch": 0.23768510394470607, + "grad_norm": 1.3727898597717285, + "learning_rate": 0.0001782187782391807, + "loss": 1.3965, + "step": 6637 + }, + { + "epoch": 0.23772091607427437, + "grad_norm": 2.398136615753174, + "learning_rate": 0.0001782115510320697, + "loss": 1.827, + "step": 6638 + }, + { + "epoch": 0.23775672820384264, + "grad_norm": 1.6898165941238403, + "learning_rate": 0.00017820432277272313, + "loss": 1.5812, + "step": 6639 + }, + { + "epoch": 0.23779254033341093, + "grad_norm": 1.2213449478149414, + "learning_rate": 0.00017819709346123826, + "loss": 1.6483, + "step": 6640 + }, + { + "epoch": 0.2378283524629792, + "grad_norm": 1.784114956855774, + "learning_rate": 0.0001781898630977124, + "loss": 1.5349, + "step": 6641 + }, + { + "epoch": 0.2378641645925475, + "grad_norm": 1.8772135972976685, + "learning_rate": 0.00017818263168224276, + "loss": 1.5394, + "step": 6642 + }, + { + "epoch": 0.23789997672211577, + "grad_norm": 2.150813341140747, + "learning_rate": 0.0001781753992149267, + "loss": 1.554, + "step": 6643 + }, + { + "epoch": 0.23793578885168407, + "grad_norm": 1.4352753162384033, + "learning_rate": 0.00017816816569586144, + "loss": 1.7107, + "step": 6644 + }, + { + "epoch": 0.23797160098125236, + "grad_norm": 1.6923801898956299, + "learning_rate": 0.00017816093112514437, + "loss": 1.5776, + "step": 6645 + }, + { + "epoch": 0.23800741311082063, + "grad_norm": 1.5510084629058838, + "learning_rate": 0.00017815369550287278, + "loss": 1.5203, + "step": 6646 + }, + { + "epoch": 0.23804322524038893, + "grad_norm": 1.1947929859161377, + "learning_rate": 0.00017814645882914402, + "loss": 1.6705, + "step": 6647 + }, + { + "epoch": 0.2380790373699572, + "grad_norm": 1.9524680376052856, + "learning_rate": 0.00017813922110405548, + "loss": 1.7717, + "step": 6648 + }, + { + "epoch": 0.2381148494995255, + "grad_norm": 1.318559169769287, + "learning_rate": 0.00017813198232770447, + "loss": 1.445, + "step": 6649 + }, + { + "epoch": 0.23815066162909376, + "grad_norm": 1.6523274183273315, + "learning_rate": 0.00017812474250018844, + "loss": 1.4655, + "step": 6650 + }, + { + "epoch": 0.23818647375866206, + "grad_norm": 1.3412474393844604, + "learning_rate": 0.00017811750162160478, + "loss": 1.4733, + "step": 6651 + }, + { + "epoch": 0.23822228588823036, + "grad_norm": 1.291676640510559, + "learning_rate": 0.00017811025969205092, + "loss": 1.3368, + "step": 6652 + }, + { + "epoch": 0.23825809801779863, + "grad_norm": 1.5550686120986938, + "learning_rate": 0.00017810301671162426, + "loss": 1.7387, + "step": 6653 + }, + { + "epoch": 0.23829391014736692, + "grad_norm": 1.3425484895706177, + "learning_rate": 0.00017809577268042224, + "loss": 1.5973, + "step": 6654 + }, + { + "epoch": 0.2383297222769352, + "grad_norm": 1.645659327507019, + "learning_rate": 0.00017808852759854235, + "loss": 1.4145, + "step": 6655 + }, + { + "epoch": 0.2383655344065035, + "grad_norm": 1.3277592658996582, + "learning_rate": 0.00017808128146608204, + "loss": 1.4578, + "step": 6656 + }, + { + "epoch": 0.23840134653607176, + "grad_norm": 1.8686926364898682, + "learning_rate": 0.0001780740342831388, + "loss": 1.5942, + "step": 6657 + }, + { + "epoch": 0.23843715866564005, + "grad_norm": 1.322493553161621, + "learning_rate": 0.00017806678604981012, + "loss": 1.2065, + "step": 6658 + }, + { + "epoch": 0.23847297079520835, + "grad_norm": 1.6285942792892456, + "learning_rate": 0.00017805953676619356, + "loss": 1.6118, + "step": 6659 + }, + { + "epoch": 0.23850878292477662, + "grad_norm": 1.799084186553955, + "learning_rate": 0.00017805228643238662, + "loss": 1.5807, + "step": 6660 + }, + { + "epoch": 0.23854459505434492, + "grad_norm": 2.529820680618286, + "learning_rate": 0.00017804503504848684, + "loss": 1.4212, + "step": 6661 + }, + { + "epoch": 0.23858040718391318, + "grad_norm": 2.7669854164123535, + "learning_rate": 0.00017803778261459181, + "loss": 1.5821, + "step": 6662 + }, + { + "epoch": 0.23861621931348148, + "grad_norm": 1.924968957901001, + "learning_rate": 0.00017803052913079905, + "loss": 1.3714, + "step": 6663 + }, + { + "epoch": 0.23865203144304975, + "grad_norm": 1.2607649564743042, + "learning_rate": 0.0001780232745972062, + "loss": 1.3695, + "step": 6664 + }, + { + "epoch": 0.23868784357261805, + "grad_norm": 1.7694482803344727, + "learning_rate": 0.00017801601901391078, + "loss": 1.433, + "step": 6665 + }, + { + "epoch": 0.23872365570218634, + "grad_norm": 1.6368229389190674, + "learning_rate": 0.0001780087623810105, + "loss": 1.8103, + "step": 6666 + }, + { + "epoch": 0.2387594678317546, + "grad_norm": 1.6224684715270996, + "learning_rate": 0.00017800150469860293, + "loss": 1.7134, + "step": 6667 + }, + { + "epoch": 0.2387952799613229, + "grad_norm": 1.4222160577774048, + "learning_rate": 0.00017799424596678573, + "loss": 1.5797, + "step": 6668 + }, + { + "epoch": 0.23883109209089118, + "grad_norm": 1.7973840236663818, + "learning_rate": 0.00017798698618565653, + "loss": 1.552, + "step": 6669 + }, + { + "epoch": 0.23886690422045948, + "grad_norm": 1.7325069904327393, + "learning_rate": 0.0001779797253553131, + "loss": 1.9692, + "step": 6670 + }, + { + "epoch": 0.23890271635002774, + "grad_norm": 1.5968601703643799, + "learning_rate": 0.000177972463475853, + "loss": 1.68, + "step": 6671 + }, + { + "epoch": 0.23893852847959604, + "grad_norm": 1.8423383235931396, + "learning_rate": 0.000177965200547374, + "loss": 1.8313, + "step": 6672 + }, + { + "epoch": 0.23897434060916434, + "grad_norm": 2.4150888919830322, + "learning_rate": 0.00017795793656997377, + "loss": 1.3579, + "step": 6673 + }, + { + "epoch": 0.2390101527387326, + "grad_norm": 1.754241943359375, + "learning_rate": 0.00017795067154375007, + "loss": 1.4763, + "step": 6674 + }, + { + "epoch": 0.2390459648683009, + "grad_norm": 1.7057687044143677, + "learning_rate": 0.00017794340546880064, + "loss": 1.5102, + "step": 6675 + }, + { + "epoch": 0.23908177699786917, + "grad_norm": 1.922208309173584, + "learning_rate": 0.00017793613834522326, + "loss": 1.4778, + "step": 6676 + }, + { + "epoch": 0.23911758912743747, + "grad_norm": 1.225993037223816, + "learning_rate": 0.0001779288701731156, + "loss": 1.5574, + "step": 6677 + }, + { + "epoch": 0.23915340125700574, + "grad_norm": 2.4143288135528564, + "learning_rate": 0.00017792160095257556, + "loss": 1.6166, + "step": 6678 + }, + { + "epoch": 0.23918921338657403, + "grad_norm": 1.4441640377044678, + "learning_rate": 0.00017791433068370087, + "loss": 1.5373, + "step": 6679 + }, + { + "epoch": 0.23922502551614233, + "grad_norm": 1.7802202701568604, + "learning_rate": 0.00017790705936658938, + "loss": 1.1986, + "step": 6680 + }, + { + "epoch": 0.2392608376457106, + "grad_norm": 2.0310826301574707, + "learning_rate": 0.00017789978700133888, + "loss": 1.8544, + "step": 6681 + }, + { + "epoch": 0.2392966497752789, + "grad_norm": 1.6153504848480225, + "learning_rate": 0.00017789251358804725, + "loss": 1.5214, + "step": 6682 + }, + { + "epoch": 0.23933246190484717, + "grad_norm": 1.5683387517929077, + "learning_rate": 0.00017788523912681231, + "loss": 1.5966, + "step": 6683 + }, + { + "epoch": 0.23936827403441546, + "grad_norm": 2.497631072998047, + "learning_rate": 0.00017787796361773197, + "loss": 1.4201, + "step": 6684 + }, + { + "epoch": 0.23940408616398373, + "grad_norm": 1.5542875528335571, + "learning_rate": 0.00017787068706090405, + "loss": 1.6915, + "step": 6685 + }, + { + "epoch": 0.23943989829355203, + "grad_norm": 1.6513056755065918, + "learning_rate": 0.0001778634094564265, + "loss": 1.4809, + "step": 6686 + }, + { + "epoch": 0.2394757104231203, + "grad_norm": 2.319539785385132, + "learning_rate": 0.0001778561308043972, + "loss": 1.4589, + "step": 6687 + }, + { + "epoch": 0.2395115225526886, + "grad_norm": 1.8904774188995361, + "learning_rate": 0.00017784885110491412, + "loss": 1.6562, + "step": 6688 + }, + { + "epoch": 0.2395473346822569, + "grad_norm": 1.582964539527893, + "learning_rate": 0.00017784157035807515, + "loss": 1.6561, + "step": 6689 + }, + { + "epoch": 0.23958314681182516, + "grad_norm": 1.625274896621704, + "learning_rate": 0.00017783428856397825, + "loss": 1.4374, + "step": 6690 + }, + { + "epoch": 0.23961895894139346, + "grad_norm": 1.5694506168365479, + "learning_rate": 0.00017782700572272137, + "loss": 1.6288, + "step": 6691 + }, + { + "epoch": 0.23965477107096173, + "grad_norm": 2.2098121643066406, + "learning_rate": 0.00017781972183440254, + "loss": 1.6151, + "step": 6692 + }, + { + "epoch": 0.23969058320053002, + "grad_norm": 2.150939464569092, + "learning_rate": 0.00017781243689911973, + "loss": 1.6039, + "step": 6693 + }, + { + "epoch": 0.2397263953300983, + "grad_norm": 1.4807977676391602, + "learning_rate": 0.00017780515091697096, + "loss": 1.7675, + "step": 6694 + }, + { + "epoch": 0.2397622074596666, + "grad_norm": 1.5989789962768555, + "learning_rate": 0.00017779786388805424, + "loss": 1.3923, + "step": 6695 + }, + { + "epoch": 0.23979801958923488, + "grad_norm": 1.3950860500335693, + "learning_rate": 0.00017779057581246763, + "loss": 1.4481, + "step": 6696 + }, + { + "epoch": 0.23983383171880315, + "grad_norm": 1.7480555772781372, + "learning_rate": 0.00017778328669030918, + "loss": 2.0431, + "step": 6697 + }, + { + "epoch": 0.23986964384837145, + "grad_norm": 1.7169839143753052, + "learning_rate": 0.0001777759965216769, + "loss": 1.5416, + "step": 6698 + }, + { + "epoch": 0.23990545597793972, + "grad_norm": 1.6872042417526245, + "learning_rate": 0.0001777687053066689, + "loss": 1.726, + "step": 6699 + }, + { + "epoch": 0.23994126810750802, + "grad_norm": 1.3428165912628174, + "learning_rate": 0.00017776141304538332, + "loss": 1.4027, + "step": 6700 + }, + { + "epoch": 0.23997708023707628, + "grad_norm": 1.719918131828308, + "learning_rate": 0.00017775411973791822, + "loss": 1.6605, + "step": 6701 + }, + { + "epoch": 0.24001289236664458, + "grad_norm": 1.745969533920288, + "learning_rate": 0.00017774682538437175, + "loss": 1.4542, + "step": 6702 + }, + { + "epoch": 0.24004870449621288, + "grad_norm": 1.530691146850586, + "learning_rate": 0.00017773952998484204, + "loss": 1.5051, + "step": 6703 + }, + { + "epoch": 0.24008451662578115, + "grad_norm": 1.7143809795379639, + "learning_rate": 0.0001777322335394272, + "loss": 1.7822, + "step": 6704 + }, + { + "epoch": 0.24012032875534944, + "grad_norm": 2.3987367153167725, + "learning_rate": 0.00017772493604822543, + "loss": 1.5643, + "step": 6705 + }, + { + "epoch": 0.2401561408849177, + "grad_norm": 1.9206353425979614, + "learning_rate": 0.00017771763751133488, + "loss": 1.6011, + "step": 6706 + }, + { + "epoch": 0.240191953014486, + "grad_norm": 1.3068255186080933, + "learning_rate": 0.0001777103379288538, + "loss": 1.3736, + "step": 6707 + }, + { + "epoch": 0.24022776514405428, + "grad_norm": 1.405949592590332, + "learning_rate": 0.00017770303730088035, + "loss": 1.5704, + "step": 6708 + }, + { + "epoch": 0.24026357727362257, + "grad_norm": 2.038421392440796, + "learning_rate": 0.00017769573562751275, + "loss": 1.5038, + "step": 6709 + }, + { + "epoch": 0.24029938940319087, + "grad_norm": 1.7716072797775269, + "learning_rate": 0.0001776884329088493, + "loss": 1.7277, + "step": 6710 + }, + { + "epoch": 0.24033520153275914, + "grad_norm": 1.2767550945281982, + "learning_rate": 0.00017768112914498817, + "loss": 1.2572, + "step": 6711 + }, + { + "epoch": 0.24037101366232744, + "grad_norm": 2.466627836227417, + "learning_rate": 0.00017767382433602762, + "loss": 1.4945, + "step": 6712 + }, + { + "epoch": 0.2404068257918957, + "grad_norm": 1.978790521621704, + "learning_rate": 0.00017766651848206597, + "loss": 1.8079, + "step": 6713 + }, + { + "epoch": 0.240442637921464, + "grad_norm": 1.4621291160583496, + "learning_rate": 0.00017765921158320152, + "loss": 1.6702, + "step": 6714 + }, + { + "epoch": 0.24047845005103227, + "grad_norm": 1.7895176410675049, + "learning_rate": 0.00017765190363953253, + "loss": 1.2385, + "step": 6715 + }, + { + "epoch": 0.24051426218060057, + "grad_norm": 1.5651522874832153, + "learning_rate": 0.00017764459465115736, + "loss": 1.6956, + "step": 6716 + }, + { + "epoch": 0.24055007431016887, + "grad_norm": 1.6989222764968872, + "learning_rate": 0.0001776372846181743, + "loss": 1.5337, + "step": 6717 + }, + { + "epoch": 0.24058588643973713, + "grad_norm": 2.1151821613311768, + "learning_rate": 0.00017762997354068172, + "loss": 1.8988, + "step": 6718 + }, + { + "epoch": 0.24062169856930543, + "grad_norm": 2.0804238319396973, + "learning_rate": 0.00017762266141877796, + "loss": 1.4169, + "step": 6719 + }, + { + "epoch": 0.2406575106988737, + "grad_norm": 1.9376277923583984, + "learning_rate": 0.00017761534825256144, + "loss": 1.5997, + "step": 6720 + }, + { + "epoch": 0.240693322828442, + "grad_norm": 1.5434688329696655, + "learning_rate": 0.00017760803404213052, + "loss": 1.0933, + "step": 6721 + }, + { + "epoch": 0.24072913495801027, + "grad_norm": 2.0929014682769775, + "learning_rate": 0.00017760071878758363, + "loss": 1.4231, + "step": 6722 + }, + { + "epoch": 0.24076494708757856, + "grad_norm": 2.1523287296295166, + "learning_rate": 0.00017759340248901917, + "loss": 1.7028, + "step": 6723 + }, + { + "epoch": 0.24080075921714686, + "grad_norm": 1.568102240562439, + "learning_rate": 0.00017758608514653555, + "loss": 1.4582, + "step": 6724 + }, + { + "epoch": 0.24083657134671513, + "grad_norm": 1.5287367105484009, + "learning_rate": 0.00017757876676023125, + "loss": 1.503, + "step": 6725 + }, + { + "epoch": 0.24087238347628342, + "grad_norm": 1.5015430450439453, + "learning_rate": 0.0001775714473302047, + "loss": 1.4269, + "step": 6726 + }, + { + "epoch": 0.2409081956058517, + "grad_norm": 1.2125335931777954, + "learning_rate": 0.0001775641268565544, + "loss": 1.5207, + "step": 6727 + }, + { + "epoch": 0.24094400773542, + "grad_norm": 2.623624086380005, + "learning_rate": 0.0001775568053393788, + "loss": 1.7906, + "step": 6728 + }, + { + "epoch": 0.24097981986498826, + "grad_norm": 1.626744270324707, + "learning_rate": 0.00017754948277877642, + "loss": 1.6469, + "step": 6729 + }, + { + "epoch": 0.24101563199455656, + "grad_norm": 1.9941731691360474, + "learning_rate": 0.0001775421591748458, + "loss": 1.5711, + "step": 6730 + }, + { + "epoch": 0.24105144412412485, + "grad_norm": 1.4977132081985474, + "learning_rate": 0.00017753483452768545, + "loss": 1.5056, + "step": 6731 + }, + { + "epoch": 0.24108725625369312, + "grad_norm": 1.9533582925796509, + "learning_rate": 0.0001775275088373939, + "loss": 1.5436, + "step": 6732 + }, + { + "epoch": 0.24112306838326142, + "grad_norm": 2.0875680446624756, + "learning_rate": 0.00017752018210406972, + "loss": 1.8286, + "step": 6733 + }, + { + "epoch": 0.2411588805128297, + "grad_norm": 1.3487203121185303, + "learning_rate": 0.00017751285432781152, + "loss": 1.483, + "step": 6734 + }, + { + "epoch": 0.24119469264239798, + "grad_norm": 1.8331043720245361, + "learning_rate": 0.00017750552550871782, + "loss": 1.6495, + "step": 6735 + }, + { + "epoch": 0.24123050477196625, + "grad_norm": 1.6515965461730957, + "learning_rate": 0.00017749819564688725, + "loss": 1.3278, + "step": 6736 + }, + { + "epoch": 0.24126631690153455, + "grad_norm": 1.197119116783142, + "learning_rate": 0.00017749086474241844, + "loss": 1.5107, + "step": 6737 + }, + { + "epoch": 0.24130212903110285, + "grad_norm": 2.1350796222686768, + "learning_rate": 0.00017748353279540999, + "loss": 1.705, + "step": 6738 + }, + { + "epoch": 0.24133794116067112, + "grad_norm": 1.517443060874939, + "learning_rate": 0.00017747619980596055, + "loss": 1.6448, + "step": 6739 + }, + { + "epoch": 0.2413737532902394, + "grad_norm": 1.9738852977752686, + "learning_rate": 0.00017746886577416876, + "loss": 1.4471, + "step": 6740 + }, + { + "epoch": 0.24140956541980768, + "grad_norm": 1.6398792266845703, + "learning_rate": 0.00017746153070013335, + "loss": 1.7618, + "step": 6741 + }, + { + "epoch": 0.24144537754937598, + "grad_norm": 1.5546917915344238, + "learning_rate": 0.00017745419458395294, + "loss": 1.7391, + "step": 6742 + }, + { + "epoch": 0.24148118967894425, + "grad_norm": 1.7060317993164062, + "learning_rate": 0.00017744685742572625, + "loss": 1.9142, + "step": 6743 + }, + { + "epoch": 0.24151700180851254, + "grad_norm": 1.6105141639709473, + "learning_rate": 0.000177439519225552, + "loss": 1.5765, + "step": 6744 + }, + { + "epoch": 0.24155281393808084, + "grad_norm": 1.5093984603881836, + "learning_rate": 0.0001774321799835289, + "loss": 1.6196, + "step": 6745 + }, + { + "epoch": 0.2415886260676491, + "grad_norm": 1.6497421264648438, + "learning_rate": 0.00017742483969975572, + "loss": 1.575, + "step": 6746 + }, + { + "epoch": 0.2416244381972174, + "grad_norm": 1.5175936222076416, + "learning_rate": 0.00017741749837433117, + "loss": 1.6146, + "step": 6747 + }, + { + "epoch": 0.24166025032678567, + "grad_norm": 1.6494495868682861, + "learning_rate": 0.00017741015600735403, + "loss": 1.4422, + "step": 6748 + }, + { + "epoch": 0.24169606245635397, + "grad_norm": 1.5587164163589478, + "learning_rate": 0.0001774028125989231, + "loss": 1.7001, + "step": 6749 + }, + { + "epoch": 0.24173187458592224, + "grad_norm": 2.176522970199585, + "learning_rate": 0.00017739546814913722, + "loss": 1.6531, + "step": 6750 + }, + { + "epoch": 0.24176768671549054, + "grad_norm": 1.236741065979004, + "learning_rate": 0.00017738812265809508, + "loss": 1.5294, + "step": 6751 + }, + { + "epoch": 0.24180349884505883, + "grad_norm": 1.8967229127883911, + "learning_rate": 0.0001773807761258956, + "loss": 1.4595, + "step": 6752 + }, + { + "epoch": 0.2418393109746271, + "grad_norm": 2.09911847114563, + "learning_rate": 0.0001773734285526376, + "loss": 1.4782, + "step": 6753 + }, + { + "epoch": 0.2418751231041954, + "grad_norm": 1.3407038450241089, + "learning_rate": 0.0001773660799384199, + "loss": 1.4959, + "step": 6754 + }, + { + "epoch": 0.24191093523376367, + "grad_norm": 1.8099561929702759, + "learning_rate": 0.0001773587302833414, + "loss": 1.2301, + "step": 6755 + }, + { + "epoch": 0.24194674736333197, + "grad_norm": 1.691125750541687, + "learning_rate": 0.000177351379587501, + "loss": 1.3265, + "step": 6756 + }, + { + "epoch": 0.24198255949290023, + "grad_norm": 1.1911656856536865, + "learning_rate": 0.0001773440278509975, + "loss": 1.7656, + "step": 6757 + }, + { + "epoch": 0.24201837162246853, + "grad_norm": 1.8270931243896484, + "learning_rate": 0.00017733667507392991, + "loss": 1.5717, + "step": 6758 + }, + { + "epoch": 0.24205418375203683, + "grad_norm": 2.1644065380096436, + "learning_rate": 0.00017732932125639713, + "loss": 1.6089, + "step": 6759 + }, + { + "epoch": 0.2420899958816051, + "grad_norm": 1.8169310092926025, + "learning_rate": 0.00017732196639849804, + "loss": 1.6283, + "step": 6760 + }, + { + "epoch": 0.2421258080111734, + "grad_norm": 1.8802064657211304, + "learning_rate": 0.0001773146105003317, + "loss": 1.5327, + "step": 6761 + }, + { + "epoch": 0.24216162014074166, + "grad_norm": 1.2530100345611572, + "learning_rate": 0.00017730725356199692, + "loss": 1.553, + "step": 6762 + }, + { + "epoch": 0.24219743227030996, + "grad_norm": 1.9466575384140015, + "learning_rate": 0.0001772998955835928, + "loss": 1.7341, + "step": 6763 + }, + { + "epoch": 0.24223324439987823, + "grad_norm": 1.4589053392410278, + "learning_rate": 0.00017729253656521832, + "loss": 1.4459, + "step": 6764 + }, + { + "epoch": 0.24226905652944652, + "grad_norm": 1.6372430324554443, + "learning_rate": 0.00017728517650697243, + "loss": 1.6402, + "step": 6765 + }, + { + "epoch": 0.24230486865901482, + "grad_norm": 1.8554078340530396, + "learning_rate": 0.0001772778154089542, + "loss": 1.511, + "step": 6766 + }, + { + "epoch": 0.2423406807885831, + "grad_norm": 1.3996329307556152, + "learning_rate": 0.0001772704532712626, + "loss": 1.6167, + "step": 6767 + }, + { + "epoch": 0.2423764929181514, + "grad_norm": 1.9406613111495972, + "learning_rate": 0.00017726309009399676, + "loss": 1.6832, + "step": 6768 + }, + { + "epoch": 0.24241230504771966, + "grad_norm": 2.206716537475586, + "learning_rate": 0.0001772557258772557, + "loss": 1.6634, + "step": 6769 + }, + { + "epoch": 0.24244811717728795, + "grad_norm": 1.4321078062057495, + "learning_rate": 0.0001772483606211385, + "loss": 1.4673, + "step": 6770 + }, + { + "epoch": 0.24248392930685622, + "grad_norm": 1.2396024465560913, + "learning_rate": 0.00017724099432574425, + "loss": 1.5565, + "step": 6771 + }, + { + "epoch": 0.24251974143642452, + "grad_norm": 1.692460060119629, + "learning_rate": 0.00017723362699117206, + "loss": 1.8142, + "step": 6772 + }, + { + "epoch": 0.24255555356599282, + "grad_norm": 1.9822334051132202, + "learning_rate": 0.00017722625861752103, + "loss": 1.6546, + "step": 6773 + }, + { + "epoch": 0.24259136569556108, + "grad_norm": 1.494781255722046, + "learning_rate": 0.0001772188892048903, + "loss": 1.4441, + "step": 6774 + }, + { + "epoch": 0.24262717782512938, + "grad_norm": 2.4597997665405273, + "learning_rate": 0.00017721151875337907, + "loss": 2.0706, + "step": 6775 + }, + { + "epoch": 0.24266298995469765, + "grad_norm": 1.3796907663345337, + "learning_rate": 0.00017720414726308642, + "loss": 1.5801, + "step": 6776 + }, + { + "epoch": 0.24269880208426595, + "grad_norm": 1.4934757947921753, + "learning_rate": 0.00017719677473411154, + "loss": 1.3307, + "step": 6777 + }, + { + "epoch": 0.24273461421383422, + "grad_norm": 1.4481245279312134, + "learning_rate": 0.00017718940116655363, + "loss": 1.7379, + "step": 6778 + }, + { + "epoch": 0.2427704263434025, + "grad_norm": 1.3896671533584595, + "learning_rate": 0.00017718202656051194, + "loss": 1.7022, + "step": 6779 + }, + { + "epoch": 0.2428062384729708, + "grad_norm": 1.709444284439087, + "learning_rate": 0.0001771746509160856, + "loss": 1.4188, + "step": 6780 + }, + { + "epoch": 0.24284205060253908, + "grad_norm": 1.5271943807601929, + "learning_rate": 0.00017716727423337388, + "loss": 1.4017, + "step": 6781 + }, + { + "epoch": 0.24287786273210737, + "grad_norm": 1.7868486642837524, + "learning_rate": 0.00017715989651247602, + "loss": 1.6336, + "step": 6782 + }, + { + "epoch": 0.24291367486167564, + "grad_norm": 2.2249338626861572, + "learning_rate": 0.0001771525177534913, + "loss": 1.4794, + "step": 6783 + }, + { + "epoch": 0.24294948699124394, + "grad_norm": 1.2865608930587769, + "learning_rate": 0.00017714513795651898, + "loss": 1.4339, + "step": 6784 + }, + { + "epoch": 0.2429852991208122, + "grad_norm": 1.402057409286499, + "learning_rate": 0.00017713775712165832, + "loss": 1.705, + "step": 6785 + }, + { + "epoch": 0.2430211112503805, + "grad_norm": 1.5345312356948853, + "learning_rate": 0.00017713037524900863, + "loss": 1.6735, + "step": 6786 + }, + { + "epoch": 0.24305692337994877, + "grad_norm": 1.4884132146835327, + "learning_rate": 0.00017712299233866923, + "loss": 1.5185, + "step": 6787 + }, + { + "epoch": 0.24309273550951707, + "grad_norm": 2.092890977859497, + "learning_rate": 0.0001771156083907395, + "loss": 1.8618, + "step": 6788 + }, + { + "epoch": 0.24312854763908537, + "grad_norm": 1.263377070426941, + "learning_rate": 0.0001771082234053187, + "loss": 1.5741, + "step": 6789 + }, + { + "epoch": 0.24316435976865364, + "grad_norm": 1.8555021286010742, + "learning_rate": 0.0001771008373825062, + "loss": 1.4518, + "step": 6790 + }, + { + "epoch": 0.24320017189822193, + "grad_norm": 1.67750084400177, + "learning_rate": 0.0001770934503224014, + "loss": 1.3955, + "step": 6791 + }, + { + "epoch": 0.2432359840277902, + "grad_norm": 1.6171958446502686, + "learning_rate": 0.00017708606222510367, + "loss": 1.3687, + "step": 6792 + }, + { + "epoch": 0.2432717961573585, + "grad_norm": 2.1286425590515137, + "learning_rate": 0.0001770786730907124, + "loss": 1.5347, + "step": 6793 + }, + { + "epoch": 0.24330760828692677, + "grad_norm": 1.4012260437011719, + "learning_rate": 0.00017707128291932702, + "loss": 1.4511, + "step": 6794 + }, + { + "epoch": 0.24334342041649507, + "grad_norm": 2.556178569793701, + "learning_rate": 0.00017706389171104694, + "loss": 1.8041, + "step": 6795 + }, + { + "epoch": 0.24337923254606336, + "grad_norm": 2.018625259399414, + "learning_rate": 0.00017705649946597157, + "loss": 1.6411, + "step": 6796 + }, + { + "epoch": 0.24341504467563163, + "grad_norm": 1.3456376791000366, + "learning_rate": 0.00017704910618420044, + "loss": 1.537, + "step": 6797 + }, + { + "epoch": 0.24345085680519993, + "grad_norm": 1.720267653465271, + "learning_rate": 0.00017704171186583295, + "loss": 1.6337, + "step": 6798 + }, + { + "epoch": 0.2434866689347682, + "grad_norm": 1.793156623840332, + "learning_rate": 0.00017703431651096862, + "loss": 1.9494, + "step": 6799 + }, + { + "epoch": 0.2435224810643365, + "grad_norm": 2.458571195602417, + "learning_rate": 0.00017702692011970693, + "loss": 1.347, + "step": 6800 + }, + { + "epoch": 0.24355829319390476, + "grad_norm": 2.3884029388427734, + "learning_rate": 0.00017701952269214737, + "loss": 1.5462, + "step": 6801 + }, + { + "epoch": 0.24359410532347306, + "grad_norm": 2.0176198482513428, + "learning_rate": 0.00017701212422838948, + "loss": 1.5606, + "step": 6802 + }, + { + "epoch": 0.24362991745304136, + "grad_norm": 2.216850519180298, + "learning_rate": 0.00017700472472853283, + "loss": 1.7962, + "step": 6803 + }, + { + "epoch": 0.24366572958260962, + "grad_norm": 1.417450189590454, + "learning_rate": 0.00017699732419267688, + "loss": 1.4538, + "step": 6804 + }, + { + "epoch": 0.24370154171217792, + "grad_norm": 2.4908945560455322, + "learning_rate": 0.0001769899226209213, + "loss": 1.3105, + "step": 6805 + }, + { + "epoch": 0.2437373538417462, + "grad_norm": 1.8872681856155396, + "learning_rate": 0.0001769825200133656, + "loss": 1.6064, + "step": 6806 + }, + { + "epoch": 0.2437731659713145, + "grad_norm": 1.579154133796692, + "learning_rate": 0.00017697511637010938, + "loss": 1.2872, + "step": 6807 + }, + { + "epoch": 0.24380897810088276, + "grad_norm": 2.0289571285247803, + "learning_rate": 0.0001769677116912523, + "loss": 1.4756, + "step": 6808 + }, + { + "epoch": 0.24384479023045105, + "grad_norm": 1.60190749168396, + "learning_rate": 0.00017696030597689393, + "loss": 1.501, + "step": 6809 + }, + { + "epoch": 0.24388060236001935, + "grad_norm": 1.8626149892807007, + "learning_rate": 0.00017695289922713389, + "loss": 1.4009, + "step": 6810 + }, + { + "epoch": 0.24391641448958762, + "grad_norm": 1.8006678819656372, + "learning_rate": 0.00017694549144207185, + "loss": 1.6877, + "step": 6811 + }, + { + "epoch": 0.24395222661915592, + "grad_norm": 1.565643310546875, + "learning_rate": 0.0001769380826218075, + "loss": 1.6743, + "step": 6812 + }, + { + "epoch": 0.24398803874872418, + "grad_norm": 1.7469196319580078, + "learning_rate": 0.00017693067276644049, + "loss": 1.6445, + "step": 6813 + }, + { + "epoch": 0.24402385087829248, + "grad_norm": 1.8169236183166504, + "learning_rate": 0.00017692326187607052, + "loss": 1.4561, + "step": 6814 + }, + { + "epoch": 0.24405966300786075, + "grad_norm": 1.2375288009643555, + "learning_rate": 0.00017691584995079725, + "loss": 1.2383, + "step": 6815 + }, + { + "epoch": 0.24409547513742905, + "grad_norm": 1.8225083351135254, + "learning_rate": 0.00017690843699072045, + "loss": 1.6827, + "step": 6816 + }, + { + "epoch": 0.24413128726699734, + "grad_norm": 1.7936789989471436, + "learning_rate": 0.00017690102299593985, + "loss": 1.7272, + "step": 6817 + }, + { + "epoch": 0.2441670993965656, + "grad_norm": 1.4096508026123047, + "learning_rate": 0.00017689360796655515, + "loss": 1.5091, + "step": 6818 + }, + { + "epoch": 0.2442029115261339, + "grad_norm": 1.3659213781356812, + "learning_rate": 0.00017688619190266616, + "loss": 1.6082, + "step": 6819 + }, + { + "epoch": 0.24423872365570218, + "grad_norm": 1.6657557487487793, + "learning_rate": 0.00017687877480437262, + "loss": 1.5673, + "step": 6820 + }, + { + "epoch": 0.24427453578527047, + "grad_norm": 1.459875226020813, + "learning_rate": 0.00017687135667177436, + "loss": 1.6767, + "step": 6821 + }, + { + "epoch": 0.24431034791483874, + "grad_norm": 1.7852752208709717, + "learning_rate": 0.00017686393750497112, + "loss": 1.5721, + "step": 6822 + }, + { + "epoch": 0.24434616004440704, + "grad_norm": 1.7470086812973022, + "learning_rate": 0.0001768565173040628, + "loss": 1.3168, + "step": 6823 + }, + { + "epoch": 0.24438197217397534, + "grad_norm": 2.435948371887207, + "learning_rate": 0.0001768490960691491, + "loss": 1.7345, + "step": 6824 + }, + { + "epoch": 0.2444177843035436, + "grad_norm": 1.1866915225982666, + "learning_rate": 0.00017684167380033002, + "loss": 1.6174, + "step": 6825 + }, + { + "epoch": 0.2444535964331119, + "grad_norm": 1.7603780031204224, + "learning_rate": 0.00017683425049770527, + "loss": 1.6618, + "step": 6826 + }, + { + "epoch": 0.24448940856268017, + "grad_norm": 1.6207976341247559, + "learning_rate": 0.00017682682616137484, + "loss": 1.5455, + "step": 6827 + }, + { + "epoch": 0.24452522069224847, + "grad_norm": 1.5510258674621582, + "learning_rate": 0.00017681940079143855, + "loss": 1.5884, + "step": 6828 + }, + { + "epoch": 0.24456103282181674, + "grad_norm": 1.862894892692566, + "learning_rate": 0.0001768119743879963, + "loss": 1.4781, + "step": 6829 + }, + { + "epoch": 0.24459684495138503, + "grad_norm": 1.6464170217514038, + "learning_rate": 0.00017680454695114802, + "loss": 1.7318, + "step": 6830 + }, + { + "epoch": 0.24463265708095333, + "grad_norm": 1.6220334768295288, + "learning_rate": 0.00017679711848099362, + "loss": 1.4225, + "step": 6831 + }, + { + "epoch": 0.2446684692105216, + "grad_norm": 1.9520400762557983, + "learning_rate": 0.0001767896889776331, + "loss": 1.8062, + "step": 6832 + }, + { + "epoch": 0.2447042813400899, + "grad_norm": 1.7717475891113281, + "learning_rate": 0.00017678225844116628, + "loss": 1.5246, + "step": 6833 + }, + { + "epoch": 0.24474009346965817, + "grad_norm": 1.6127135753631592, + "learning_rate": 0.00017677482687169328, + "loss": 1.6002, + "step": 6834 + }, + { + "epoch": 0.24477590559922646, + "grad_norm": 1.3745635747909546, + "learning_rate": 0.000176767394269314, + "loss": 1.6012, + "step": 6835 + }, + { + "epoch": 0.24481171772879473, + "grad_norm": 1.3935896158218384, + "learning_rate": 0.00017675996063412844, + "loss": 1.5386, + "step": 6836 + }, + { + "epoch": 0.24484752985836303, + "grad_norm": 1.7197144031524658, + "learning_rate": 0.00017675252596623665, + "loss": 1.7675, + "step": 6837 + }, + { + "epoch": 0.24488334198793132, + "grad_norm": 1.8122268915176392, + "learning_rate": 0.00017674509026573864, + "loss": 1.7868, + "step": 6838 + }, + { + "epoch": 0.2449191541174996, + "grad_norm": 1.5085411071777344, + "learning_rate": 0.00017673765353273438, + "loss": 1.6055, + "step": 6839 + }, + { + "epoch": 0.2449549662470679, + "grad_norm": 2.0807392597198486, + "learning_rate": 0.00017673021576732404, + "loss": 1.5603, + "step": 6840 + }, + { + "epoch": 0.24499077837663616, + "grad_norm": 2.034391164779663, + "learning_rate": 0.00017672277696960756, + "loss": 1.2525, + "step": 6841 + }, + { + "epoch": 0.24502659050620446, + "grad_norm": 1.4948173761367798, + "learning_rate": 0.0001767153371396851, + "loss": 1.6105, + "step": 6842 + }, + { + "epoch": 0.24506240263577272, + "grad_norm": 1.3254578113555908, + "learning_rate": 0.00017670789627765676, + "loss": 1.5827, + "step": 6843 + }, + { + "epoch": 0.24509821476534102, + "grad_norm": 1.611053228378296, + "learning_rate": 0.0001767004543836226, + "loss": 1.7118, + "step": 6844 + }, + { + "epoch": 0.24513402689490932, + "grad_norm": 1.7157574892044067, + "learning_rate": 0.00017669301145768277, + "loss": 1.6883, + "step": 6845 + }, + { + "epoch": 0.2451698390244776, + "grad_norm": 1.8786518573760986, + "learning_rate": 0.0001766855674999374, + "loss": 1.5183, + "step": 6846 + }, + { + "epoch": 0.24520565115404588, + "grad_norm": 1.7475110292434692, + "learning_rate": 0.00017667812251048664, + "loss": 1.5028, + "step": 6847 + }, + { + "epoch": 0.24524146328361415, + "grad_norm": 1.989743709564209, + "learning_rate": 0.00017667067648943064, + "loss": 1.8715, + "step": 6848 + }, + { + "epoch": 0.24527727541318245, + "grad_norm": 1.4464224576950073, + "learning_rate": 0.00017666322943686957, + "loss": 1.577, + "step": 6849 + }, + { + "epoch": 0.24531308754275072, + "grad_norm": 2.7265331745147705, + "learning_rate": 0.00017665578135290364, + "loss": 1.4392, + "step": 6850 + }, + { + "epoch": 0.24534889967231902, + "grad_norm": 2.068549156188965, + "learning_rate": 0.00017664833223763306, + "loss": 1.5901, + "step": 6851 + }, + { + "epoch": 0.2453847118018873, + "grad_norm": 1.8748308420181274, + "learning_rate": 0.00017664088209115805, + "loss": 1.581, + "step": 6852 + }, + { + "epoch": 0.24542052393145558, + "grad_norm": 1.8242149353027344, + "learning_rate": 0.00017663343091357881, + "loss": 1.8448, + "step": 6853 + }, + { + "epoch": 0.24545633606102388, + "grad_norm": 1.4797003269195557, + "learning_rate": 0.00017662597870499562, + "loss": 1.4438, + "step": 6854 + }, + { + "epoch": 0.24549214819059215, + "grad_norm": 1.5570054054260254, + "learning_rate": 0.00017661852546550875, + "loss": 1.6526, + "step": 6855 + }, + { + "epoch": 0.24552796032016044, + "grad_norm": 1.3332992792129517, + "learning_rate": 0.00017661107119521842, + "loss": 1.5923, + "step": 6856 + }, + { + "epoch": 0.2455637724497287, + "grad_norm": 2.3711464405059814, + "learning_rate": 0.00017660361589422497, + "loss": 1.4343, + "step": 6857 + }, + { + "epoch": 0.245599584579297, + "grad_norm": 1.5084023475646973, + "learning_rate": 0.00017659615956262865, + "loss": 1.7877, + "step": 6858 + }, + { + "epoch": 0.2456353967088653, + "grad_norm": 1.477497935295105, + "learning_rate": 0.00017658870220052983, + "loss": 1.5231, + "step": 6859 + }, + { + "epoch": 0.24567120883843357, + "grad_norm": 1.6098153591156006, + "learning_rate": 0.00017658124380802882, + "loss": 1.4208, + "step": 6860 + }, + { + "epoch": 0.24570702096800187, + "grad_norm": 1.4886384010314941, + "learning_rate": 0.00017657378438522593, + "loss": 1.6053, + "step": 6861 + }, + { + "epoch": 0.24574283309757014, + "grad_norm": 1.6936547756195068, + "learning_rate": 0.00017656632393222156, + "loss": 1.426, + "step": 6862 + }, + { + "epoch": 0.24577864522713844, + "grad_norm": 1.6345033645629883, + "learning_rate": 0.00017655886244911603, + "loss": 1.5935, + "step": 6863 + }, + { + "epoch": 0.2458144573567067, + "grad_norm": 1.7725507020950317, + "learning_rate": 0.00017655139993600982, + "loss": 1.3166, + "step": 6864 + }, + { + "epoch": 0.245850269486275, + "grad_norm": 1.4159860610961914, + "learning_rate": 0.0001765439363930032, + "loss": 1.5238, + "step": 6865 + }, + { + "epoch": 0.2458860816158433, + "grad_norm": 1.7116299867630005, + "learning_rate": 0.00017653647182019671, + "loss": 1.7032, + "step": 6866 + }, + { + "epoch": 0.24592189374541157, + "grad_norm": 2.9299378395080566, + "learning_rate": 0.0001765290062176907, + "loss": 1.5626, + "step": 6867 + }, + { + "epoch": 0.24595770587497987, + "grad_norm": 1.746596336364746, + "learning_rate": 0.00017652153958558562, + "loss": 1.3949, + "step": 6868 + }, + { + "epoch": 0.24599351800454813, + "grad_norm": 1.607316017150879, + "learning_rate": 0.00017651407192398195, + "loss": 1.4019, + "step": 6869 + }, + { + "epoch": 0.24602933013411643, + "grad_norm": 1.8668955564498901, + "learning_rate": 0.0001765066032329801, + "loss": 1.5584, + "step": 6870 + }, + { + "epoch": 0.2460651422636847, + "grad_norm": 1.5562617778778076, + "learning_rate": 0.0001764991335126806, + "loss": 1.4447, + "step": 6871 + }, + { + "epoch": 0.246100954393253, + "grad_norm": 1.6031650304794312, + "learning_rate": 0.000176491662763184, + "loss": 1.603, + "step": 6872 + }, + { + "epoch": 0.2461367665228213, + "grad_norm": 1.7676963806152344, + "learning_rate": 0.0001764841909845907, + "loss": 1.6027, + "step": 6873 + }, + { + "epoch": 0.24617257865238956, + "grad_norm": 1.4618767499923706, + "learning_rate": 0.00017647671817700122, + "loss": 1.149, + "step": 6874 + }, + { + "epoch": 0.24620839078195786, + "grad_norm": 1.703264832496643, + "learning_rate": 0.00017646924434051617, + "loss": 1.9018, + "step": 6875 + }, + { + "epoch": 0.24624420291152613, + "grad_norm": 1.5716698169708252, + "learning_rate": 0.0001764617694752361, + "loss": 1.3323, + "step": 6876 + }, + { + "epoch": 0.24628001504109442, + "grad_norm": 1.4548003673553467, + "learning_rate": 0.00017645429358126156, + "loss": 1.6873, + "step": 6877 + }, + { + "epoch": 0.2463158271706627, + "grad_norm": 1.5308620929718018, + "learning_rate": 0.0001764468166586931, + "loss": 1.5299, + "step": 6878 + }, + { + "epoch": 0.246351639300231, + "grad_norm": 1.557874083518982, + "learning_rate": 0.00017643933870763133, + "loss": 1.4238, + "step": 6879 + }, + { + "epoch": 0.2463874514297993, + "grad_norm": 2.1845991611480713, + "learning_rate": 0.00017643185972817684, + "loss": 1.5245, + "step": 6880 + }, + { + "epoch": 0.24642326355936756, + "grad_norm": 1.7284934520721436, + "learning_rate": 0.0001764243797204303, + "loss": 1.3871, + "step": 6881 + }, + { + "epoch": 0.24645907568893585, + "grad_norm": 1.6655116081237793, + "learning_rate": 0.0001764168986844923, + "loss": 1.6806, + "step": 6882 + }, + { + "epoch": 0.24649488781850412, + "grad_norm": 2.0219454765319824, + "learning_rate": 0.00017640941662046345, + "loss": 1.6322, + "step": 6883 + }, + { + "epoch": 0.24653069994807242, + "grad_norm": 1.672935962677002, + "learning_rate": 0.00017640193352844454, + "loss": 1.532, + "step": 6884 + }, + { + "epoch": 0.2465665120776407, + "grad_norm": 1.3515428304672241, + "learning_rate": 0.00017639444940853612, + "loss": 1.466, + "step": 6885 + }, + { + "epoch": 0.24660232420720898, + "grad_norm": 1.778982162475586, + "learning_rate": 0.00017638696426083893, + "loss": 1.67, + "step": 6886 + }, + { + "epoch": 0.24663813633677725, + "grad_norm": 1.437366008758545, + "learning_rate": 0.00017637947808545369, + "loss": 1.282, + "step": 6887 + }, + { + "epoch": 0.24667394846634555, + "grad_norm": 1.3975059986114502, + "learning_rate": 0.00017637199088248106, + "loss": 1.7724, + "step": 6888 + }, + { + "epoch": 0.24670976059591385, + "grad_norm": 1.6595691442489624, + "learning_rate": 0.00017636450265202185, + "loss": 1.5012, + "step": 6889 + }, + { + "epoch": 0.24674557272548212, + "grad_norm": 1.3656624555587769, + "learning_rate": 0.00017635701339417672, + "loss": 1.3337, + "step": 6890 + }, + { + "epoch": 0.2467813848550504, + "grad_norm": 1.6880717277526855, + "learning_rate": 0.0001763495231090465, + "loss": 1.8932, + "step": 6891 + }, + { + "epoch": 0.24681719698461868, + "grad_norm": 1.7251242399215698, + "learning_rate": 0.0001763420317967319, + "loss": 1.3787, + "step": 6892 + }, + { + "epoch": 0.24685300911418698, + "grad_norm": 1.4598206281661987, + "learning_rate": 0.00017633453945733373, + "loss": 1.3369, + "step": 6893 + }, + { + "epoch": 0.24688882124375525, + "grad_norm": 2.0298573970794678, + "learning_rate": 0.00017632704609095283, + "loss": 1.7565, + "step": 6894 + }, + { + "epoch": 0.24692463337332354, + "grad_norm": 1.364871859550476, + "learning_rate": 0.00017631955169768998, + "loss": 1.6694, + "step": 6895 + }, + { + "epoch": 0.24696044550289184, + "grad_norm": 1.5108788013458252, + "learning_rate": 0.00017631205627764598, + "loss": 1.6086, + "step": 6896 + }, + { + "epoch": 0.2469962576324601, + "grad_norm": 1.313675045967102, + "learning_rate": 0.0001763045598309217, + "loss": 1.7611, + "step": 6897 + }, + { + "epoch": 0.2470320697620284, + "grad_norm": 1.818166732788086, + "learning_rate": 0.00017629706235761802, + "loss": 1.6188, + "step": 6898 + }, + { + "epoch": 0.24706788189159667, + "grad_norm": 1.8856993913650513, + "learning_rate": 0.00017628956385783577, + "loss": 1.7872, + "step": 6899 + }, + { + "epoch": 0.24710369402116497, + "grad_norm": 1.811854362487793, + "learning_rate": 0.00017628206433167583, + "loss": 1.6524, + "step": 6900 + }, + { + "epoch": 0.24713950615073324, + "grad_norm": 1.3475021123886108, + "learning_rate": 0.00017627456377923911, + "loss": 1.4307, + "step": 6901 + }, + { + "epoch": 0.24717531828030154, + "grad_norm": 1.7281248569488525, + "learning_rate": 0.00017626706220062654, + "loss": 1.6302, + "step": 6902 + }, + { + "epoch": 0.24721113040986983, + "grad_norm": 2.6777141094207764, + "learning_rate": 0.00017625955959593904, + "loss": 1.6385, + "step": 6903 + }, + { + "epoch": 0.2472469425394381, + "grad_norm": 1.415355920791626, + "learning_rate": 0.0001762520559652775, + "loss": 1.5923, + "step": 6904 + }, + { + "epoch": 0.2472827546690064, + "grad_norm": 1.761030912399292, + "learning_rate": 0.00017624455130874292, + "loss": 1.6887, + "step": 6905 + }, + { + "epoch": 0.24731856679857467, + "grad_norm": 1.5250886678695679, + "learning_rate": 0.00017623704562643624, + "loss": 1.7323, + "step": 6906 + }, + { + "epoch": 0.24735437892814297, + "grad_norm": 1.85284423828125, + "learning_rate": 0.00017622953891845847, + "loss": 1.4674, + "step": 6907 + }, + { + "epoch": 0.24739019105771123, + "grad_norm": 2.0614492893218994, + "learning_rate": 0.00017622203118491055, + "loss": 1.7599, + "step": 6908 + }, + { + "epoch": 0.24742600318727953, + "grad_norm": 2.1460821628570557, + "learning_rate": 0.00017621452242589354, + "loss": 1.437, + "step": 6909 + }, + { + "epoch": 0.24746181531684783, + "grad_norm": 1.4736385345458984, + "learning_rate": 0.00017620701264150845, + "loss": 1.7396, + "step": 6910 + }, + { + "epoch": 0.2474976274464161, + "grad_norm": 1.7290641069412231, + "learning_rate": 0.0001761995018318563, + "loss": 1.508, + "step": 6911 + }, + { + "epoch": 0.2475334395759844, + "grad_norm": 1.502830982208252, + "learning_rate": 0.00017619198999703812, + "loss": 1.455, + "step": 6912 + }, + { + "epoch": 0.24756925170555266, + "grad_norm": 1.5334092378616333, + "learning_rate": 0.00017618447713715503, + "loss": 1.6101, + "step": 6913 + }, + { + "epoch": 0.24760506383512096, + "grad_norm": 1.4206371307373047, + "learning_rate": 0.00017617696325230805, + "loss": 1.6806, + "step": 6914 + }, + { + "epoch": 0.24764087596468923, + "grad_norm": 1.6280468702316284, + "learning_rate": 0.0001761694483425983, + "loss": 1.4521, + "step": 6915 + }, + { + "epoch": 0.24767668809425752, + "grad_norm": 1.273987054824829, + "learning_rate": 0.00017616193240812687, + "loss": 1.5572, + "step": 6916 + }, + { + "epoch": 0.24771250022382582, + "grad_norm": 1.521125078201294, + "learning_rate": 0.00017615441544899488, + "loss": 1.6782, + "step": 6917 + }, + { + "epoch": 0.2477483123533941, + "grad_norm": 1.4935173988342285, + "learning_rate": 0.00017614689746530345, + "loss": 1.7162, + "step": 6918 + }, + { + "epoch": 0.2477841244829624, + "grad_norm": 1.9287409782409668, + "learning_rate": 0.00017613937845715376, + "loss": 1.5272, + "step": 6919 + }, + { + "epoch": 0.24781993661253066, + "grad_norm": 1.6758594512939453, + "learning_rate": 0.00017613185842464693, + "loss": 1.7733, + "step": 6920 + }, + { + "epoch": 0.24785574874209895, + "grad_norm": 2.1583969593048096, + "learning_rate": 0.00017612433736788417, + "loss": 1.6371, + "step": 6921 + }, + { + "epoch": 0.24789156087166722, + "grad_norm": 1.667779803276062, + "learning_rate": 0.0001761168152869666, + "loss": 1.5351, + "step": 6922 + }, + { + "epoch": 0.24792737300123552, + "grad_norm": 1.3906711339950562, + "learning_rate": 0.00017610929218199553, + "loss": 1.6684, + "step": 6923 + }, + { + "epoch": 0.24796318513080381, + "grad_norm": 1.3722862005233765, + "learning_rate": 0.00017610176805307206, + "loss": 1.411, + "step": 6924 + }, + { + "epoch": 0.24799899726037208, + "grad_norm": 1.9369726181030273, + "learning_rate": 0.00017609424290029746, + "loss": 1.5838, + "step": 6925 + }, + { + "epoch": 0.24803480938994038, + "grad_norm": 2.327025890350342, + "learning_rate": 0.000176086716723773, + "loss": 1.6616, + "step": 6926 + }, + { + "epoch": 0.24807062151950865, + "grad_norm": 1.8831285238265991, + "learning_rate": 0.0001760791895235999, + "loss": 1.2508, + "step": 6927 + }, + { + "epoch": 0.24810643364907695, + "grad_norm": 1.6104234457015991, + "learning_rate": 0.00017607166129987944, + "loss": 1.5941, + "step": 6928 + }, + { + "epoch": 0.24814224577864522, + "grad_norm": 1.5727404356002808, + "learning_rate": 0.0001760641320527129, + "loss": 1.4856, + "step": 6929 + }, + { + "epoch": 0.2481780579082135, + "grad_norm": 2.3073596954345703, + "learning_rate": 0.00017605660178220158, + "loss": 1.8495, + "step": 6930 + }, + { + "epoch": 0.2482138700377818, + "grad_norm": 1.5391161441802979, + "learning_rate": 0.0001760490704884468, + "loss": 1.7637, + "step": 6931 + }, + { + "epoch": 0.24824968216735008, + "grad_norm": 2.0064144134521484, + "learning_rate": 0.00017604153817154985, + "loss": 1.8597, + "step": 6932 + }, + { + "epoch": 0.24828549429691837, + "grad_norm": 1.7063194513320923, + "learning_rate": 0.00017603400483161212, + "loss": 1.2984, + "step": 6933 + }, + { + "epoch": 0.24832130642648664, + "grad_norm": 1.3737378120422363, + "learning_rate": 0.0001760264704687349, + "loss": 1.5487, + "step": 6934 + }, + { + "epoch": 0.24835711855605494, + "grad_norm": 1.4802964925765991, + "learning_rate": 0.00017601893508301962, + "loss": 1.2851, + "step": 6935 + }, + { + "epoch": 0.2483929306856232, + "grad_norm": 1.6147825717926025, + "learning_rate": 0.0001760113986745676, + "loss": 1.7134, + "step": 6936 + }, + { + "epoch": 0.2484287428151915, + "grad_norm": 2.331864833831787, + "learning_rate": 0.00017600386124348028, + "loss": 1.3899, + "step": 6937 + }, + { + "epoch": 0.2484645549447598, + "grad_norm": 1.2941458225250244, + "learning_rate": 0.00017599632278985904, + "loss": 1.5272, + "step": 6938 + }, + { + "epoch": 0.24850036707432807, + "grad_norm": 1.85355544090271, + "learning_rate": 0.00017598878331380528, + "loss": 1.6488, + "step": 6939 + }, + { + "epoch": 0.24853617920389637, + "grad_norm": 1.6847429275512695, + "learning_rate": 0.0001759812428154205, + "loss": 1.4094, + "step": 6940 + }, + { + "epoch": 0.24857199133346464, + "grad_norm": 2.6703569889068604, + "learning_rate": 0.00017597370129480606, + "loss": 1.2444, + "step": 6941 + }, + { + "epoch": 0.24860780346303293, + "grad_norm": 1.4878178834915161, + "learning_rate": 0.00017596615875206347, + "loss": 1.6358, + "step": 6942 + }, + { + "epoch": 0.2486436155926012, + "grad_norm": 1.5205473899841309, + "learning_rate": 0.00017595861518729424, + "loss": 1.5765, + "step": 6943 + }, + { + "epoch": 0.2486794277221695, + "grad_norm": 1.6328760385513306, + "learning_rate": 0.00017595107060059984, + "loss": 1.5, + "step": 6944 + }, + { + "epoch": 0.2487152398517378, + "grad_norm": 1.4762791395187378, + "learning_rate": 0.0001759435249920817, + "loss": 1.9516, + "step": 6945 + }, + { + "epoch": 0.24875105198130606, + "grad_norm": 1.660315752029419, + "learning_rate": 0.0001759359783618414, + "loss": 1.6335, + "step": 6946 + }, + { + "epoch": 0.24878686411087436, + "grad_norm": 1.6185013055801392, + "learning_rate": 0.00017592843070998049, + "loss": 1.7045, + "step": 6947 + }, + { + "epoch": 0.24882267624044263, + "grad_norm": 1.4820398092269897, + "learning_rate": 0.00017592088203660045, + "loss": 1.4543, + "step": 6948 + }, + { + "epoch": 0.24885848837001093, + "grad_norm": 1.3168247938156128, + "learning_rate": 0.00017591333234180293, + "loss": 1.5607, + "step": 6949 + }, + { + "epoch": 0.2488943004995792, + "grad_norm": 1.3358012437820435, + "learning_rate": 0.0001759057816256894, + "loss": 1.452, + "step": 6950 + }, + { + "epoch": 0.2489301126291475, + "grad_norm": 2.118440866470337, + "learning_rate": 0.00017589822988836148, + "loss": 1.5474, + "step": 6951 + }, + { + "epoch": 0.2489659247587158, + "grad_norm": 2.4116692543029785, + "learning_rate": 0.00017589067712992082, + "loss": 1.3814, + "step": 6952 + }, + { + "epoch": 0.24900173688828406, + "grad_norm": 1.8422210216522217, + "learning_rate": 0.00017588312335046897, + "loss": 1.6097, + "step": 6953 + }, + { + "epoch": 0.24903754901785236, + "grad_norm": 1.4137614965438843, + "learning_rate": 0.00017587556855010755, + "loss": 1.7229, + "step": 6954 + }, + { + "epoch": 0.24907336114742062, + "grad_norm": 1.4702421426773071, + "learning_rate": 0.00017586801272893827, + "loss": 1.7011, + "step": 6955 + }, + { + "epoch": 0.24910917327698892, + "grad_norm": 1.7273403406143188, + "learning_rate": 0.00017586045588706273, + "loss": 1.2071, + "step": 6956 + }, + { + "epoch": 0.2491449854065572, + "grad_norm": 1.629349946975708, + "learning_rate": 0.0001758528980245826, + "loss": 1.6223, + "step": 6957 + }, + { + "epoch": 0.2491807975361255, + "grad_norm": 1.4688459634780884, + "learning_rate": 0.00017584533914159956, + "loss": 1.2623, + "step": 6958 + }, + { + "epoch": 0.24921660966569378, + "grad_norm": 1.8785773515701294, + "learning_rate": 0.00017583777923821533, + "loss": 1.5056, + "step": 6959 + }, + { + "epoch": 0.24925242179526205, + "grad_norm": 1.7873069047927856, + "learning_rate": 0.0001758302183145316, + "loss": 1.5735, + "step": 6960 + }, + { + "epoch": 0.24928823392483035, + "grad_norm": 1.8960673809051514, + "learning_rate": 0.00017582265637065012, + "loss": 1.7953, + "step": 6961 + }, + { + "epoch": 0.24932404605439862, + "grad_norm": 1.3856395483016968, + "learning_rate": 0.00017581509340667257, + "loss": 1.5991, + "step": 6962 + }, + { + "epoch": 0.24935985818396691, + "grad_norm": 1.5916454792022705, + "learning_rate": 0.00017580752942270077, + "loss": 1.7537, + "step": 6963 + }, + { + "epoch": 0.24939567031353518, + "grad_norm": 1.445228099822998, + "learning_rate": 0.0001757999644188364, + "loss": 1.6487, + "step": 6964 + }, + { + "epoch": 0.24943148244310348, + "grad_norm": 1.5743353366851807, + "learning_rate": 0.0001757923983951813, + "loss": 1.5899, + "step": 6965 + }, + { + "epoch": 0.24946729457267178, + "grad_norm": 1.4328770637512207, + "learning_rate": 0.00017578483135183726, + "loss": 1.6986, + "step": 6966 + }, + { + "epoch": 0.24950310670224005, + "grad_norm": 1.5256279706954956, + "learning_rate": 0.00017577726328890604, + "loss": 1.3874, + "step": 6967 + }, + { + "epoch": 0.24953891883180834, + "grad_norm": 1.6930445432662964, + "learning_rate": 0.0001757696942064895, + "loss": 1.5721, + "step": 6968 + }, + { + "epoch": 0.2495747309613766, + "grad_norm": 1.8914119005203247, + "learning_rate": 0.00017576212410468949, + "loss": 1.3895, + "step": 6969 + }, + { + "epoch": 0.2496105430909449, + "grad_norm": 1.2605080604553223, + "learning_rate": 0.00017575455298360782, + "loss": 1.562, + "step": 6970 + }, + { + "epoch": 0.24964635522051318, + "grad_norm": 1.4235647916793823, + "learning_rate": 0.00017574698084334633, + "loss": 1.6511, + "step": 6971 + }, + { + "epoch": 0.24968216735008147, + "grad_norm": 1.5089948177337646, + "learning_rate": 0.00017573940768400692, + "loss": 1.2899, + "step": 6972 + }, + { + "epoch": 0.24971797947964977, + "grad_norm": 1.1215814352035522, + "learning_rate": 0.00017573183350569148, + "loss": 1.2685, + "step": 6973 + }, + { + "epoch": 0.24975379160921804, + "grad_norm": 1.6614004373550415, + "learning_rate": 0.00017572425830850193, + "loss": 1.4867, + "step": 6974 + }, + { + "epoch": 0.24978960373878634, + "grad_norm": 1.3425636291503906, + "learning_rate": 0.00017571668209254013, + "loss": 1.6014, + "step": 6975 + }, + { + "epoch": 0.2498254158683546, + "grad_norm": 1.719012975692749, + "learning_rate": 0.00017570910485790805, + "loss": 1.3348, + "step": 6976 + }, + { + "epoch": 0.2498612279979229, + "grad_norm": 1.7479561567306519, + "learning_rate": 0.00017570152660470765, + "loss": 1.7141, + "step": 6977 + }, + { + "epoch": 0.24989704012749117, + "grad_norm": 1.8728541135787964, + "learning_rate": 0.00017569394733304083, + "loss": 1.7683, + "step": 6978 + }, + { + "epoch": 0.24993285225705947, + "grad_norm": 1.8570350408554077, + "learning_rate": 0.00017568636704300958, + "loss": 1.7784, + "step": 6979 + }, + { + "epoch": 0.24996866438662776, + "grad_norm": 1.8610103130340576, + "learning_rate": 0.0001756787857347159, + "loss": 1.416, + "step": 6980 + }, + { + "epoch": 0.25000447651619606, + "grad_norm": 1.5104469060897827, + "learning_rate": 0.00017567120340826177, + "loss": 1.6459, + "step": 6981 + }, + { + "epoch": 0.25004028864576433, + "grad_norm": 1.4502947330474854, + "learning_rate": 0.0001756636200637492, + "loss": 1.3919, + "step": 6982 + }, + { + "epoch": 0.2500761007753326, + "grad_norm": 1.8482967615127563, + "learning_rate": 0.00017565603570128023, + "loss": 1.5629, + "step": 6983 + }, + { + "epoch": 0.25011191290490087, + "grad_norm": 1.9667346477508545, + "learning_rate": 0.0001756484503209569, + "loss": 1.6214, + "step": 6984 + }, + { + "epoch": 0.2501477250344692, + "grad_norm": 1.8211997747421265, + "learning_rate": 0.00017564086392288125, + "loss": 1.8636, + "step": 6985 + }, + { + "epoch": 0.25018353716403746, + "grad_norm": 1.3446124792099, + "learning_rate": 0.00017563327650715535, + "loss": 1.8394, + "step": 6986 + }, + { + "epoch": 0.25021934929360573, + "grad_norm": 1.7953028678894043, + "learning_rate": 0.00017562568807388126, + "loss": 1.6372, + "step": 6987 + }, + { + "epoch": 0.25025516142317406, + "grad_norm": 1.6460068225860596, + "learning_rate": 0.0001756180986231611, + "loss": 1.5185, + "step": 6988 + }, + { + "epoch": 0.2502909735527423, + "grad_norm": 1.467795491218567, + "learning_rate": 0.00017561050815509695, + "loss": 1.5313, + "step": 6989 + }, + { + "epoch": 0.2503267856823106, + "grad_norm": 1.5542267560958862, + "learning_rate": 0.00017560291666979095, + "loss": 1.1941, + "step": 6990 + }, + { + "epoch": 0.25036259781187886, + "grad_norm": 1.5457571744918823, + "learning_rate": 0.00017559532416734524, + "loss": 1.7399, + "step": 6991 + }, + { + "epoch": 0.2503984099414472, + "grad_norm": 1.6578049659729004, + "learning_rate": 0.00017558773064786193, + "loss": 1.4826, + "step": 6992 + }, + { + "epoch": 0.25043422207101546, + "grad_norm": 2.136821746826172, + "learning_rate": 0.0001755801361114432, + "loss": 1.8302, + "step": 6993 + }, + { + "epoch": 0.2504700342005837, + "grad_norm": 1.3996917009353638, + "learning_rate": 0.00017557254055819126, + "loss": 1.5399, + "step": 6994 + }, + { + "epoch": 0.25050584633015205, + "grad_norm": 1.3739955425262451, + "learning_rate": 0.00017556494398820823, + "loss": 1.3365, + "step": 6995 + }, + { + "epoch": 0.2505416584597203, + "grad_norm": 1.558883786201477, + "learning_rate": 0.0001755573464015964, + "loss": 1.6076, + "step": 6996 + }, + { + "epoch": 0.2505774705892886, + "grad_norm": 1.6406570672988892, + "learning_rate": 0.00017554974779845792, + "loss": 1.8316, + "step": 6997 + }, + { + "epoch": 0.25061328271885686, + "grad_norm": 1.7725262641906738, + "learning_rate": 0.000175542148178895, + "loss": 1.7184, + "step": 6998 + }, + { + "epoch": 0.2506490948484252, + "grad_norm": 1.7781455516815186, + "learning_rate": 0.00017553454754300996, + "loss": 1.2366, + "step": 6999 + }, + { + "epoch": 0.25068490697799345, + "grad_norm": 1.753017783164978, + "learning_rate": 0.000175526945890905, + "loss": 1.6132, + "step": 7000 + }, + { + "epoch": 0.2507207191075617, + "grad_norm": 2.3922290802001953, + "learning_rate": 0.0001755193432226824, + "loss": 1.5879, + "step": 7001 + }, + { + "epoch": 0.25075653123713004, + "grad_norm": 1.8872179985046387, + "learning_rate": 0.00017551173953844445, + "loss": 1.4223, + "step": 7002 + }, + { + "epoch": 0.2507923433666983, + "grad_norm": 1.5126007795333862, + "learning_rate": 0.00017550413483829344, + "loss": 1.7051, + "step": 7003 + }, + { + "epoch": 0.2508281554962666, + "grad_norm": 1.5426768064498901, + "learning_rate": 0.0001754965291223317, + "loss": 1.3291, + "step": 7004 + }, + { + "epoch": 0.25086396762583485, + "grad_norm": 1.5832163095474243, + "learning_rate": 0.00017548892239066156, + "loss": 1.26, + "step": 7005 + }, + { + "epoch": 0.2508997797554032, + "grad_norm": 1.4844779968261719, + "learning_rate": 0.00017548131464338533, + "loss": 1.697, + "step": 7006 + }, + { + "epoch": 0.25093559188497144, + "grad_norm": 1.5662215948104858, + "learning_rate": 0.00017547370588060537, + "loss": 1.3258, + "step": 7007 + }, + { + "epoch": 0.2509714040145397, + "grad_norm": 1.3159148693084717, + "learning_rate": 0.00017546609610242405, + "loss": 1.589, + "step": 7008 + }, + { + "epoch": 0.25100721614410804, + "grad_norm": 1.476625919342041, + "learning_rate": 0.00017545848530894377, + "loss": 1.715, + "step": 7009 + }, + { + "epoch": 0.2510430282736763, + "grad_norm": 1.64826238155365, + "learning_rate": 0.0001754508735002669, + "loss": 1.8173, + "step": 7010 + }, + { + "epoch": 0.2510788404032446, + "grad_norm": 1.6059377193450928, + "learning_rate": 0.00017544326067649583, + "loss": 1.6255, + "step": 7011 + }, + { + "epoch": 0.25111465253281284, + "grad_norm": 1.6135188341140747, + "learning_rate": 0.00017543564683773302, + "loss": 1.6396, + "step": 7012 + }, + { + "epoch": 0.25115046466238117, + "grad_norm": 1.242232322692871, + "learning_rate": 0.00017542803198408087, + "loss": 1.6733, + "step": 7013 + }, + { + "epoch": 0.25118627679194944, + "grad_norm": 2.01194167137146, + "learning_rate": 0.00017542041611564186, + "loss": 1.4317, + "step": 7014 + }, + { + "epoch": 0.2512220889215177, + "grad_norm": 1.904948353767395, + "learning_rate": 0.00017541279923251844, + "loss": 1.6848, + "step": 7015 + }, + { + "epoch": 0.25125790105108603, + "grad_norm": 1.638850212097168, + "learning_rate": 0.00017540518133481308, + "loss": 1.3428, + "step": 7016 + }, + { + "epoch": 0.2512937131806543, + "grad_norm": 2.054460287094116, + "learning_rate": 0.00017539756242262826, + "loss": 1.5478, + "step": 7017 + }, + { + "epoch": 0.25132952531022257, + "grad_norm": 1.5042341947555542, + "learning_rate": 0.0001753899424960665, + "loss": 1.6176, + "step": 7018 + }, + { + "epoch": 0.25136533743979084, + "grad_norm": 1.8969051837921143, + "learning_rate": 0.0001753823215552303, + "loss": 1.3305, + "step": 7019 + }, + { + "epoch": 0.25140114956935916, + "grad_norm": 1.6631965637207031, + "learning_rate": 0.00017537469960022221, + "loss": 1.7607, + "step": 7020 + }, + { + "epoch": 0.25143696169892743, + "grad_norm": 1.4155299663543701, + "learning_rate": 0.00017536707663114477, + "loss": 1.4719, + "step": 7021 + }, + { + "epoch": 0.2514727738284957, + "grad_norm": 1.9903786182403564, + "learning_rate": 0.00017535945264810052, + "loss": 1.5096, + "step": 7022 + }, + { + "epoch": 0.251508585958064, + "grad_norm": 1.4660794734954834, + "learning_rate": 0.00017535182765119204, + "loss": 1.3804, + "step": 7023 + }, + { + "epoch": 0.2515443980876323, + "grad_norm": 1.6268247365951538, + "learning_rate": 0.00017534420164052193, + "loss": 1.4592, + "step": 7024 + }, + { + "epoch": 0.25158021021720056, + "grad_norm": 1.503402590751648, + "learning_rate": 0.00017533657461619274, + "loss": 1.4945, + "step": 7025 + }, + { + "epoch": 0.25161602234676883, + "grad_norm": 1.4199435710906982, + "learning_rate": 0.00017532894657830715, + "loss": 1.7763, + "step": 7026 + }, + { + "epoch": 0.25165183447633716, + "grad_norm": 1.6909997463226318, + "learning_rate": 0.00017532131752696776, + "loss": 1.546, + "step": 7027 + }, + { + "epoch": 0.2516876466059054, + "grad_norm": 1.7875890731811523, + "learning_rate": 0.00017531368746227718, + "loss": 1.501, + "step": 7028 + }, + { + "epoch": 0.2517234587354737, + "grad_norm": 2.4746861457824707, + "learning_rate": 0.00017530605638433805, + "loss": 1.576, + "step": 7029 + }, + { + "epoch": 0.251759270865042, + "grad_norm": 1.6079216003417969, + "learning_rate": 0.00017529842429325312, + "loss": 1.7127, + "step": 7030 + }, + { + "epoch": 0.2517950829946103, + "grad_norm": 1.7867952585220337, + "learning_rate": 0.00017529079118912502, + "loss": 1.1966, + "step": 7031 + }, + { + "epoch": 0.25183089512417856, + "grad_norm": 1.4089163541793823, + "learning_rate": 0.00017528315707205643, + "loss": 1.7839, + "step": 7032 + }, + { + "epoch": 0.2518667072537468, + "grad_norm": 1.644866704940796, + "learning_rate": 0.00017527552194215005, + "loss": 1.4532, + "step": 7033 + }, + { + "epoch": 0.25190251938331515, + "grad_norm": 1.627656102180481, + "learning_rate": 0.00017526788579950864, + "loss": 1.7095, + "step": 7034 + }, + { + "epoch": 0.2519383315128834, + "grad_norm": 1.9081600904464722, + "learning_rate": 0.0001752602486442349, + "loss": 1.6353, + "step": 7035 + }, + { + "epoch": 0.2519741436424517, + "grad_norm": 1.579540729522705, + "learning_rate": 0.0001752526104764316, + "loss": 1.4417, + "step": 7036 + }, + { + "epoch": 0.25200995577201996, + "grad_norm": 1.7195303440093994, + "learning_rate": 0.0001752449712962015, + "loss": 1.3465, + "step": 7037 + }, + { + "epoch": 0.2520457679015883, + "grad_norm": 1.6513937711715698, + "learning_rate": 0.00017523733110364736, + "loss": 1.3911, + "step": 7038 + }, + { + "epoch": 0.25208158003115655, + "grad_norm": 1.918748140335083, + "learning_rate": 0.000175229689898872, + "loss": 1.728, + "step": 7039 + }, + { + "epoch": 0.2521173921607248, + "grad_norm": 1.678743600845337, + "learning_rate": 0.00017522204768197818, + "loss": 1.3992, + "step": 7040 + }, + { + "epoch": 0.25215320429029314, + "grad_norm": 1.795161247253418, + "learning_rate": 0.00017521440445306875, + "loss": 1.6584, + "step": 7041 + }, + { + "epoch": 0.2521890164198614, + "grad_norm": 1.783471941947937, + "learning_rate": 0.00017520676021224652, + "loss": 1.5853, + "step": 7042 + }, + { + "epoch": 0.2522248285494297, + "grad_norm": 1.6953623294830322, + "learning_rate": 0.00017519911495961435, + "loss": 1.6603, + "step": 7043 + }, + { + "epoch": 0.25226064067899795, + "grad_norm": 1.5812227725982666, + "learning_rate": 0.0001751914686952751, + "loss": 1.6276, + "step": 7044 + }, + { + "epoch": 0.2522964528085663, + "grad_norm": 1.6534466743469238, + "learning_rate": 0.0001751838214193316, + "loss": 1.6614, + "step": 7045 + }, + { + "epoch": 0.25233226493813454, + "grad_norm": 1.963584303855896, + "learning_rate": 0.0001751761731318868, + "loss": 1.3825, + "step": 7046 + }, + { + "epoch": 0.2523680770677028, + "grad_norm": 1.7353453636169434, + "learning_rate": 0.00017516852383304353, + "loss": 1.3718, + "step": 7047 + }, + { + "epoch": 0.25240388919727114, + "grad_norm": 1.7376201152801514, + "learning_rate": 0.00017516087352290472, + "loss": 1.6218, + "step": 7048 + }, + { + "epoch": 0.2524397013268394, + "grad_norm": 1.3723000288009644, + "learning_rate": 0.00017515322220157333, + "loss": 1.234, + "step": 7049 + }, + { + "epoch": 0.2524755134564077, + "grad_norm": 1.7133162021636963, + "learning_rate": 0.0001751455698691523, + "loss": 1.1578, + "step": 7050 + }, + { + "epoch": 0.25251132558597594, + "grad_norm": 1.6886383295059204, + "learning_rate": 0.00017513791652574453, + "loss": 1.6405, + "step": 7051 + }, + { + "epoch": 0.25254713771554427, + "grad_norm": 1.5092798471450806, + "learning_rate": 0.00017513026217145302, + "loss": 1.7148, + "step": 7052 + }, + { + "epoch": 0.25258294984511254, + "grad_norm": 1.4195395708084106, + "learning_rate": 0.00017512260680638072, + "loss": 1.4879, + "step": 7053 + }, + { + "epoch": 0.2526187619746808, + "grad_norm": 1.9386016130447388, + "learning_rate": 0.00017511495043063066, + "loss": 1.7046, + "step": 7054 + }, + { + "epoch": 0.25265457410424913, + "grad_norm": 1.3876835107803345, + "learning_rate": 0.00017510729304430584, + "loss": 1.5976, + "step": 7055 + }, + { + "epoch": 0.2526903862338174, + "grad_norm": 1.3243073225021362, + "learning_rate": 0.00017509963464750928, + "loss": 1.4122, + "step": 7056 + }, + { + "epoch": 0.25272619836338567, + "grad_norm": 2.481052875518799, + "learning_rate": 0.000175091975240344, + "loss": 1.6062, + "step": 7057 + }, + { + "epoch": 0.25276201049295394, + "grad_norm": 1.4827232360839844, + "learning_rate": 0.00017508431482291304, + "loss": 1.3834, + "step": 7058 + }, + { + "epoch": 0.25279782262252226, + "grad_norm": 1.5816888809204102, + "learning_rate": 0.0001750766533953195, + "loss": 1.8041, + "step": 7059 + }, + { + "epoch": 0.25283363475209053, + "grad_norm": 1.6099169254302979, + "learning_rate": 0.00017506899095766641, + "loss": 1.5866, + "step": 7060 + }, + { + "epoch": 0.2528694468816588, + "grad_norm": 1.552196979522705, + "learning_rate": 0.0001750613275100569, + "loss": 1.3113, + "step": 7061 + }, + { + "epoch": 0.2529052590112271, + "grad_norm": 1.621914029121399, + "learning_rate": 0.00017505366305259402, + "loss": 1.4599, + "step": 7062 + }, + { + "epoch": 0.2529410711407954, + "grad_norm": 1.5911298990249634, + "learning_rate": 0.00017504599758538095, + "loss": 1.3794, + "step": 7063 + }, + { + "epoch": 0.25297688327036366, + "grad_norm": 1.4210929870605469, + "learning_rate": 0.00017503833110852078, + "loss": 1.3875, + "step": 7064 + }, + { + "epoch": 0.25301269539993193, + "grad_norm": 2.398237705230713, + "learning_rate": 0.00017503066362211663, + "loss": 1.6252, + "step": 7065 + }, + { + "epoch": 0.25304850752950026, + "grad_norm": 1.9180196523666382, + "learning_rate": 0.00017502299512627172, + "loss": 1.6482, + "step": 7066 + }, + { + "epoch": 0.2530843196590685, + "grad_norm": 1.9644347429275513, + "learning_rate": 0.00017501532562108916, + "loss": 1.3261, + "step": 7067 + }, + { + "epoch": 0.2531201317886368, + "grad_norm": 1.9443340301513672, + "learning_rate": 0.00017500765510667217, + "loss": 1.5519, + "step": 7068 + }, + { + "epoch": 0.2531559439182051, + "grad_norm": 2.6250624656677246, + "learning_rate": 0.0001749999835831239, + "loss": 1.7498, + "step": 7069 + }, + { + "epoch": 0.2531917560477734, + "grad_norm": 1.7860009670257568, + "learning_rate": 0.00017499231105054763, + "loss": 1.4288, + "step": 7070 + }, + { + "epoch": 0.25322756817734166, + "grad_norm": 1.8363738059997559, + "learning_rate": 0.00017498463750904652, + "loss": 1.1333, + "step": 7071 + }, + { + "epoch": 0.2532633803069099, + "grad_norm": 1.4646892547607422, + "learning_rate": 0.00017497696295872385, + "loss": 1.7276, + "step": 7072 + }, + { + "epoch": 0.25329919243647825, + "grad_norm": 1.5846657752990723, + "learning_rate": 0.00017496928739968288, + "loss": 1.693, + "step": 7073 + }, + { + "epoch": 0.2533350045660465, + "grad_norm": 1.4653985500335693, + "learning_rate": 0.0001749616108320268, + "loss": 1.4689, + "step": 7074 + }, + { + "epoch": 0.2533708166956148, + "grad_norm": 1.4848623275756836, + "learning_rate": 0.000174953933255859, + "loss": 1.4579, + "step": 7075 + }, + { + "epoch": 0.2534066288251831, + "grad_norm": 1.226401448249817, + "learning_rate": 0.0001749462546712827, + "loss": 1.2865, + "step": 7076 + }, + { + "epoch": 0.2534424409547514, + "grad_norm": 1.9527426958084106, + "learning_rate": 0.00017493857507840116, + "loss": 1.3185, + "step": 7077 + }, + { + "epoch": 0.25347825308431965, + "grad_norm": 2.5343284606933594, + "learning_rate": 0.0001749308944773178, + "loss": 1.7735, + "step": 7078 + }, + { + "epoch": 0.2535140652138879, + "grad_norm": 1.6038436889648438, + "learning_rate": 0.0001749232128681359, + "loss": 1.5201, + "step": 7079 + }, + { + "epoch": 0.25354987734345624, + "grad_norm": 1.7659517526626587, + "learning_rate": 0.00017491553025095882, + "loss": 1.8174, + "step": 7080 + }, + { + "epoch": 0.2535856894730245, + "grad_norm": 1.3074805736541748, + "learning_rate": 0.00017490784662588992, + "loss": 1.4389, + "step": 7081 + }, + { + "epoch": 0.2536215016025928, + "grad_norm": 1.4077818393707275, + "learning_rate": 0.00017490016199303256, + "loss": 1.4818, + "step": 7082 + }, + { + "epoch": 0.2536573137321611, + "grad_norm": 1.844086766242981, + "learning_rate": 0.00017489247635249012, + "loss": 1.7697, + "step": 7083 + }, + { + "epoch": 0.2536931258617294, + "grad_norm": 1.78142249584198, + "learning_rate": 0.00017488478970436604, + "loss": 1.5957, + "step": 7084 + }, + { + "epoch": 0.25372893799129764, + "grad_norm": 1.8254570960998535, + "learning_rate": 0.0001748771020487637, + "loss": 1.4769, + "step": 7085 + }, + { + "epoch": 0.2537647501208659, + "grad_norm": 1.7204885482788086, + "learning_rate": 0.00017486941338578653, + "loss": 1.4355, + "step": 7086 + }, + { + "epoch": 0.25380056225043424, + "grad_norm": 1.4895552396774292, + "learning_rate": 0.000174861723715538, + "loss": 1.407, + "step": 7087 + }, + { + "epoch": 0.2538363743800025, + "grad_norm": 2.2478761672973633, + "learning_rate": 0.0001748540330381215, + "loss": 1.7522, + "step": 7088 + }, + { + "epoch": 0.2538721865095708, + "grad_norm": 1.7592345476150513, + "learning_rate": 0.00017484634135364057, + "loss": 1.3917, + "step": 7089 + }, + { + "epoch": 0.2539079986391391, + "grad_norm": 1.845581293106079, + "learning_rate": 0.00017483864866219868, + "loss": 1.4701, + "step": 7090 + }, + { + "epoch": 0.25394381076870737, + "grad_norm": 1.259600043296814, + "learning_rate": 0.00017483095496389928, + "loss": 1.203, + "step": 7091 + }, + { + "epoch": 0.25397962289827564, + "grad_norm": 1.8078480958938599, + "learning_rate": 0.0001748232602588459, + "loss": 1.5376, + "step": 7092 + }, + { + "epoch": 0.2540154350278439, + "grad_norm": 1.4246379137039185, + "learning_rate": 0.0001748155645471421, + "loss": 1.7493, + "step": 7093 + }, + { + "epoch": 0.25405124715741223, + "grad_norm": 1.7548315525054932, + "learning_rate": 0.00017480786782889137, + "loss": 1.3788, + "step": 7094 + }, + { + "epoch": 0.2540870592869805, + "grad_norm": 1.760630488395691, + "learning_rate": 0.00017480017010419724, + "loss": 1.6053, + "step": 7095 + }, + { + "epoch": 0.25412287141654877, + "grad_norm": 1.9188206195831299, + "learning_rate": 0.00017479247137316335, + "loss": 1.5271, + "step": 7096 + }, + { + "epoch": 0.2541586835461171, + "grad_norm": 1.4333956241607666, + "learning_rate": 0.0001747847716358932, + "loss": 1.3216, + "step": 7097 + }, + { + "epoch": 0.25419449567568536, + "grad_norm": 1.5908639430999756, + "learning_rate": 0.00017477707089249043, + "loss": 1.3639, + "step": 7098 + }, + { + "epoch": 0.25423030780525363, + "grad_norm": 1.311259388923645, + "learning_rate": 0.00017476936914305862, + "loss": 1.6077, + "step": 7099 + }, + { + "epoch": 0.2542661199348219, + "grad_norm": 1.6787432432174683, + "learning_rate": 0.00017476166638770142, + "loss": 1.6371, + "step": 7100 + }, + { + "epoch": 0.2543019320643902, + "grad_norm": 1.8875246047973633, + "learning_rate": 0.0001747539626265224, + "loss": 1.5801, + "step": 7101 + }, + { + "epoch": 0.2543377441939585, + "grad_norm": 2.0725271701812744, + "learning_rate": 0.00017474625785962524, + "loss": 1.6117, + "step": 7102 + }, + { + "epoch": 0.25437355632352676, + "grad_norm": 2.2691195011138916, + "learning_rate": 0.00017473855208711362, + "loss": 1.6867, + "step": 7103 + }, + { + "epoch": 0.2544093684530951, + "grad_norm": 1.4735385179519653, + "learning_rate": 0.00017473084530909117, + "loss": 1.5192, + "step": 7104 + }, + { + "epoch": 0.25444518058266336, + "grad_norm": 1.6299970149993896, + "learning_rate": 0.0001747231375256616, + "loss": 1.74, + "step": 7105 + }, + { + "epoch": 0.2544809927122316, + "grad_norm": 1.8156439065933228, + "learning_rate": 0.00017471542873692862, + "loss": 1.6018, + "step": 7106 + }, + { + "epoch": 0.2545168048417999, + "grad_norm": 1.7628570795059204, + "learning_rate": 0.0001747077189429959, + "loss": 1.3708, + "step": 7107 + }, + { + "epoch": 0.2545526169713682, + "grad_norm": 1.5150426626205444, + "learning_rate": 0.00017470000814396718, + "loss": 1.7994, + "step": 7108 + }, + { + "epoch": 0.2545884291009365, + "grad_norm": 2.202345371246338, + "learning_rate": 0.0001746922963399462, + "loss": 1.3141, + "step": 7109 + }, + { + "epoch": 0.25462424123050476, + "grad_norm": 1.6175556182861328, + "learning_rate": 0.00017468458353103676, + "loss": 1.4706, + "step": 7110 + }, + { + "epoch": 0.2546600533600731, + "grad_norm": 1.6333051919937134, + "learning_rate": 0.00017467686971734257, + "loss": 1.621, + "step": 7111 + }, + { + "epoch": 0.25469586548964135, + "grad_norm": 1.4622938632965088, + "learning_rate": 0.0001746691548989674, + "loss": 1.1874, + "step": 7112 + }, + { + "epoch": 0.2547316776192096, + "grad_norm": 1.556118369102478, + "learning_rate": 0.00017466143907601508, + "loss": 1.5941, + "step": 7113 + }, + { + "epoch": 0.2547674897487779, + "grad_norm": 1.436787724494934, + "learning_rate": 0.00017465372224858937, + "loss": 1.5353, + "step": 7114 + }, + { + "epoch": 0.2548033018783462, + "grad_norm": 1.2040411233901978, + "learning_rate": 0.00017464600441679417, + "loss": 1.5374, + "step": 7115 + }, + { + "epoch": 0.2548391140079145, + "grad_norm": 1.6581120491027832, + "learning_rate": 0.0001746382855807333, + "loss": 1.5132, + "step": 7116 + }, + { + "epoch": 0.25487492613748275, + "grad_norm": 1.7110474109649658, + "learning_rate": 0.0001746305657405105, + "loss": 1.5188, + "step": 7117 + }, + { + "epoch": 0.2549107382670511, + "grad_norm": 1.6720296144485474, + "learning_rate": 0.00017462284489622973, + "loss": 1.4921, + "step": 7118 + }, + { + "epoch": 0.25494655039661934, + "grad_norm": 1.5299689769744873, + "learning_rate": 0.00017461512304799484, + "loss": 1.4806, + "step": 7119 + }, + { + "epoch": 0.2549823625261876, + "grad_norm": 1.5790822505950928, + "learning_rate": 0.0001746074001959097, + "loss": 1.2113, + "step": 7120 + }, + { + "epoch": 0.2550181746557559, + "grad_norm": 2.029690980911255, + "learning_rate": 0.00017459967634007826, + "loss": 1.6939, + "step": 7121 + }, + { + "epoch": 0.2550539867853242, + "grad_norm": 1.6915394067764282, + "learning_rate": 0.00017459195148060438, + "loss": 1.2217, + "step": 7122 + }, + { + "epoch": 0.2550897989148925, + "grad_norm": 2.6420986652374268, + "learning_rate": 0.00017458422561759203, + "loss": 2.0125, + "step": 7123 + }, + { + "epoch": 0.25512561104446074, + "grad_norm": 1.7383835315704346, + "learning_rate": 0.0001745764987511451, + "loss": 1.2076, + "step": 7124 + }, + { + "epoch": 0.25516142317402907, + "grad_norm": 1.5973914861679077, + "learning_rate": 0.0001745687708813676, + "loss": 1.5747, + "step": 7125 + }, + { + "epoch": 0.25519723530359734, + "grad_norm": 1.8514518737792969, + "learning_rate": 0.00017456104200836347, + "loss": 1.3596, + "step": 7126 + }, + { + "epoch": 0.2552330474331656, + "grad_norm": 1.7249946594238281, + "learning_rate": 0.00017455331213223668, + "loss": 1.7315, + "step": 7127 + }, + { + "epoch": 0.2552688595627339, + "grad_norm": 1.592330813407898, + "learning_rate": 0.00017454558125309125, + "loss": 1.4371, + "step": 7128 + }, + { + "epoch": 0.2553046716923022, + "grad_norm": 4.213657379150391, + "learning_rate": 0.00017453784937103122, + "loss": 1.464, + "step": 7129 + }, + { + "epoch": 0.25534048382187047, + "grad_norm": 1.4696651697158813, + "learning_rate": 0.00017453011648616053, + "loss": 1.4912, + "step": 7130 + }, + { + "epoch": 0.25537629595143874, + "grad_norm": 1.5843778848648071, + "learning_rate": 0.00017452238259858327, + "loss": 1.6334, + "step": 7131 + }, + { + "epoch": 0.25541210808100706, + "grad_norm": 1.2764482498168945, + "learning_rate": 0.00017451464770840348, + "loss": 1.5292, + "step": 7132 + }, + { + "epoch": 0.25544792021057533, + "grad_norm": 1.7480367422103882, + "learning_rate": 0.00017450691181572522, + "loss": 1.387, + "step": 7133 + }, + { + "epoch": 0.2554837323401436, + "grad_norm": 2.2001123428344727, + "learning_rate": 0.00017449917492065256, + "loss": 1.4539, + "step": 7134 + }, + { + "epoch": 0.25551954446971187, + "grad_norm": 1.604820728302002, + "learning_rate": 0.0001744914370232896, + "loss": 1.5517, + "step": 7135 + }, + { + "epoch": 0.2555553565992802, + "grad_norm": 1.7131909132003784, + "learning_rate": 0.00017448369812374045, + "loss": 1.4457, + "step": 7136 + }, + { + "epoch": 0.25559116872884846, + "grad_norm": 1.6699351072311401, + "learning_rate": 0.00017447595822210924, + "loss": 1.2495, + "step": 7137 + }, + { + "epoch": 0.25562698085841673, + "grad_norm": 1.6325026750564575, + "learning_rate": 0.00017446821731850008, + "loss": 1.4587, + "step": 7138 + }, + { + "epoch": 0.25566279298798505, + "grad_norm": 1.5442523956298828, + "learning_rate": 0.00017446047541301707, + "loss": 1.4748, + "step": 7139 + }, + { + "epoch": 0.2556986051175533, + "grad_norm": 2.9096853733062744, + "learning_rate": 0.00017445273250576442, + "loss": 1.6572, + "step": 7140 + }, + { + "epoch": 0.2557344172471216, + "grad_norm": 1.796773910522461, + "learning_rate": 0.0001744449885968463, + "loss": 1.4552, + "step": 7141 + }, + { + "epoch": 0.25577022937668986, + "grad_norm": 2.095973253250122, + "learning_rate": 0.00017443724368636693, + "loss": 1.4722, + "step": 7142 + }, + { + "epoch": 0.2558060415062582, + "grad_norm": 1.7989848852157593, + "learning_rate": 0.00017442949777443038, + "loss": 1.7267, + "step": 7143 + }, + { + "epoch": 0.25584185363582646, + "grad_norm": 1.8527088165283203, + "learning_rate": 0.000174421750861141, + "loss": 1.5778, + "step": 7144 + }, + { + "epoch": 0.2558776657653947, + "grad_norm": 1.6256322860717773, + "learning_rate": 0.00017441400294660294, + "loss": 1.6681, + "step": 7145 + }, + { + "epoch": 0.25591347789496305, + "grad_norm": 1.3841743469238281, + "learning_rate": 0.00017440625403092045, + "loss": 1.5331, + "step": 7146 + }, + { + "epoch": 0.2559492900245313, + "grad_norm": 1.9727689027786255, + "learning_rate": 0.00017439850411419782, + "loss": 1.5752, + "step": 7147 + }, + { + "epoch": 0.2559851021540996, + "grad_norm": 1.6270530223846436, + "learning_rate": 0.00017439075319653928, + "loss": 1.7994, + "step": 7148 + }, + { + "epoch": 0.25602091428366786, + "grad_norm": 1.7818320989608765, + "learning_rate": 0.0001743830012780491, + "loss": 1.6047, + "step": 7149 + }, + { + "epoch": 0.2560567264132362, + "grad_norm": 1.760941505432129, + "learning_rate": 0.00017437524835883157, + "loss": 1.6809, + "step": 7150 + }, + { + "epoch": 0.25609253854280445, + "grad_norm": 1.2127779722213745, + "learning_rate": 0.00017436749443899103, + "loss": 1.6395, + "step": 7151 + }, + { + "epoch": 0.2561283506723727, + "grad_norm": 2.064774751663208, + "learning_rate": 0.00017435973951863179, + "loss": 1.5165, + "step": 7152 + }, + { + "epoch": 0.25616416280194104, + "grad_norm": 2.9734225273132324, + "learning_rate": 0.0001743519835978581, + "loss": 1.3607, + "step": 7153 + }, + { + "epoch": 0.2561999749315093, + "grad_norm": 1.5963677167892456, + "learning_rate": 0.00017434422667677446, + "loss": 1.6676, + "step": 7154 + }, + { + "epoch": 0.2562357870610776, + "grad_norm": 1.5437045097351074, + "learning_rate": 0.00017433646875548512, + "loss": 1.583, + "step": 7155 + }, + { + "epoch": 0.25627159919064585, + "grad_norm": 1.6270062923431396, + "learning_rate": 0.0001743287098340945, + "loss": 1.4786, + "step": 7156 + }, + { + "epoch": 0.2563074113202142, + "grad_norm": 1.7923583984375, + "learning_rate": 0.00017432094991270692, + "loss": 1.6218, + "step": 7157 + }, + { + "epoch": 0.25634322344978244, + "grad_norm": 2.1703848838806152, + "learning_rate": 0.00017431318899142686, + "loss": 2.0572, + "step": 7158 + }, + { + "epoch": 0.2563790355793507, + "grad_norm": 1.547512173652649, + "learning_rate": 0.0001743054270703587, + "loss": 1.4179, + "step": 7159 + }, + { + "epoch": 0.25641484770891904, + "grad_norm": 1.7714463472366333, + "learning_rate": 0.00017429766414960685, + "loss": 1.5927, + "step": 7160 + }, + { + "epoch": 0.2564506598384873, + "grad_norm": 2.190034866333008, + "learning_rate": 0.0001742899002292758, + "loss": 1.8858, + "step": 7161 + }, + { + "epoch": 0.2564864719680556, + "grad_norm": 1.3032276630401611, + "learning_rate": 0.00017428213530946995, + "loss": 1.6218, + "step": 7162 + }, + { + "epoch": 0.25652228409762384, + "grad_norm": 1.72031569480896, + "learning_rate": 0.00017427436939029378, + "loss": 1.6231, + "step": 7163 + }, + { + "epoch": 0.25655809622719217, + "grad_norm": 1.9104535579681396, + "learning_rate": 0.00017426660247185177, + "loss": 1.4184, + "step": 7164 + }, + { + "epoch": 0.25659390835676044, + "grad_norm": 1.665347933769226, + "learning_rate": 0.0001742588345542484, + "loss": 1.9099, + "step": 7165 + }, + { + "epoch": 0.2566297204863287, + "grad_norm": 1.2468254566192627, + "learning_rate": 0.00017425106563758824, + "loss": 1.458, + "step": 7166 + }, + { + "epoch": 0.25666553261589703, + "grad_norm": 2.2506954669952393, + "learning_rate": 0.00017424329572197578, + "loss": 1.6085, + "step": 7167 + }, + { + "epoch": 0.2567013447454653, + "grad_norm": 1.5501458644866943, + "learning_rate": 0.0001742355248075155, + "loss": 1.4801, + "step": 7168 + }, + { + "epoch": 0.25673715687503357, + "grad_norm": 1.3393751382827759, + "learning_rate": 0.00017422775289431202, + "loss": 1.4539, + "step": 7169 + }, + { + "epoch": 0.25677296900460184, + "grad_norm": 1.88332200050354, + "learning_rate": 0.00017421997998246985, + "loss": 1.6451, + "step": 7170 + }, + { + "epoch": 0.25680878113417016, + "grad_norm": 2.4323949813842773, + "learning_rate": 0.0001742122060720936, + "loss": 1.7289, + "step": 7171 + }, + { + "epoch": 0.25684459326373843, + "grad_norm": 2.0443127155303955, + "learning_rate": 0.00017420443116328784, + "loss": 1.3284, + "step": 7172 + }, + { + "epoch": 0.2568804053933067, + "grad_norm": 1.3344660997390747, + "learning_rate": 0.0001741966552561572, + "loss": 1.3553, + "step": 7173 + }, + { + "epoch": 0.256916217522875, + "grad_norm": 1.9592489004135132, + "learning_rate": 0.00017418887835080624, + "loss": 1.3423, + "step": 7174 + }, + { + "epoch": 0.2569520296524433, + "grad_norm": 1.8439921140670776, + "learning_rate": 0.0001741811004473396, + "loss": 1.4173, + "step": 7175 + }, + { + "epoch": 0.25698784178201156, + "grad_norm": 2.6069836616516113, + "learning_rate": 0.000174173321545862, + "loss": 1.42, + "step": 7176 + }, + { + "epoch": 0.25702365391157983, + "grad_norm": 1.5937824249267578, + "learning_rate": 0.000174165541646478, + "loss": 1.5705, + "step": 7177 + }, + { + "epoch": 0.25705946604114815, + "grad_norm": 1.4017707109451294, + "learning_rate": 0.0001741577607492923, + "loss": 1.4923, + "step": 7178 + }, + { + "epoch": 0.2570952781707164, + "grad_norm": 1.6453132629394531, + "learning_rate": 0.00017414997885440957, + "loss": 1.6204, + "step": 7179 + }, + { + "epoch": 0.2571310903002847, + "grad_norm": 1.494167685508728, + "learning_rate": 0.00017414219596193455, + "loss": 1.2734, + "step": 7180 + }, + { + "epoch": 0.257166902429853, + "grad_norm": 1.562565565109253, + "learning_rate": 0.0001741344120719719, + "loss": 1.327, + "step": 7181 + }, + { + "epoch": 0.2572027145594213, + "grad_norm": 1.6081397533416748, + "learning_rate": 0.00017412662718462637, + "loss": 1.546, + "step": 7182 + }, + { + "epoch": 0.25723852668898955, + "grad_norm": 1.5741753578186035, + "learning_rate": 0.00017411884130000271, + "loss": 1.6006, + "step": 7183 + }, + { + "epoch": 0.2572743388185578, + "grad_norm": 2.1187093257904053, + "learning_rate": 0.00017411105441820563, + "loss": 1.5594, + "step": 7184 + }, + { + "epoch": 0.25731015094812615, + "grad_norm": 1.4211379289627075, + "learning_rate": 0.0001741032665393399, + "loss": 1.6873, + "step": 7185 + }, + { + "epoch": 0.2573459630776944, + "grad_norm": 1.706182837486267, + "learning_rate": 0.00017409547766351034, + "loss": 1.3723, + "step": 7186 + }, + { + "epoch": 0.2573817752072627, + "grad_norm": 1.8475958108901978, + "learning_rate": 0.00017408768779082165, + "loss": 1.4974, + "step": 7187 + }, + { + "epoch": 0.257417587336831, + "grad_norm": 1.5614365339279175, + "learning_rate": 0.00017407989692137872, + "loss": 1.5942, + "step": 7188 + }, + { + "epoch": 0.2574533994663993, + "grad_norm": 2.806450843811035, + "learning_rate": 0.0001740721050552863, + "loss": 1.6668, + "step": 7189 + }, + { + "epoch": 0.25748921159596755, + "grad_norm": 2.1845903396606445, + "learning_rate": 0.0001740643121926493, + "loss": 1.4023, + "step": 7190 + }, + { + "epoch": 0.2575250237255358, + "grad_norm": 1.9280097484588623, + "learning_rate": 0.0001740565183335725, + "loss": 1.4899, + "step": 7191 + }, + { + "epoch": 0.25756083585510414, + "grad_norm": 1.400450348854065, + "learning_rate": 0.00017404872347816076, + "loss": 1.2537, + "step": 7192 + }, + { + "epoch": 0.2575966479846724, + "grad_norm": 2.4562292098999023, + "learning_rate": 0.00017404092762651898, + "loss": 1.501, + "step": 7193 + }, + { + "epoch": 0.2576324601142407, + "grad_norm": 2.2671611309051514, + "learning_rate": 0.000174033130778752, + "loss": 1.4855, + "step": 7194 + }, + { + "epoch": 0.257668272243809, + "grad_norm": 1.6012718677520752, + "learning_rate": 0.00017402533293496477, + "loss": 1.4119, + "step": 7195 + }, + { + "epoch": 0.2577040843733773, + "grad_norm": 1.4546482563018799, + "learning_rate": 0.00017401753409526216, + "loss": 1.6494, + "step": 7196 + }, + { + "epoch": 0.25773989650294554, + "grad_norm": 2.1474809646606445, + "learning_rate": 0.0001740097342597491, + "loss": 1.6661, + "step": 7197 + }, + { + "epoch": 0.2577757086325138, + "grad_norm": 1.6500005722045898, + "learning_rate": 0.0001740019334285305, + "loss": 1.4682, + "step": 7198 + }, + { + "epoch": 0.25781152076208214, + "grad_norm": 1.741763710975647, + "learning_rate": 0.0001739941316017114, + "loss": 1.4744, + "step": 7199 + }, + { + "epoch": 0.2578473328916504, + "grad_norm": 2.0389153957366943, + "learning_rate": 0.00017398632877939666, + "loss": 1.584, + "step": 7200 + }, + { + "epoch": 0.2578831450212187, + "grad_norm": 1.478593111038208, + "learning_rate": 0.00017397852496169134, + "loss": 1.4449, + "step": 7201 + }, + { + "epoch": 0.257918957150787, + "grad_norm": 2.4067537784576416, + "learning_rate": 0.00017397072014870037, + "loss": 1.3542, + "step": 7202 + }, + { + "epoch": 0.25795476928035527, + "grad_norm": 2.244966506958008, + "learning_rate": 0.00017396291434052877, + "loss": 1.4444, + "step": 7203 + }, + { + "epoch": 0.25799058140992354, + "grad_norm": 1.5638600587844849, + "learning_rate": 0.00017395510753728157, + "loss": 1.441, + "step": 7204 + }, + { + "epoch": 0.2580263935394918, + "grad_norm": 1.8143665790557861, + "learning_rate": 0.0001739472997390638, + "loss": 1.7743, + "step": 7205 + }, + { + "epoch": 0.25806220566906013, + "grad_norm": 1.868913173675537, + "learning_rate": 0.00017393949094598047, + "loss": 1.3007, + "step": 7206 + }, + { + "epoch": 0.2580980177986284, + "grad_norm": 1.5457756519317627, + "learning_rate": 0.00017393168115813673, + "loss": 1.6073, + "step": 7207 + }, + { + "epoch": 0.25813382992819667, + "grad_norm": 1.3162868022918701, + "learning_rate": 0.0001739238703756375, + "loss": 1.5155, + "step": 7208 + }, + { + "epoch": 0.258169642057765, + "grad_norm": 1.955180287361145, + "learning_rate": 0.00017391605859858798, + "loss": 1.5373, + "step": 7209 + }, + { + "epoch": 0.25820545418733326, + "grad_norm": 1.7129416465759277, + "learning_rate": 0.00017390824582709326, + "loss": 1.8984, + "step": 7210 + }, + { + "epoch": 0.25824126631690153, + "grad_norm": 1.3623757362365723, + "learning_rate": 0.0001739004320612584, + "loss": 1.5941, + "step": 7211 + }, + { + "epoch": 0.2582770784464698, + "grad_norm": 1.5919731855392456, + "learning_rate": 0.00017389261730118858, + "loss": 1.6802, + "step": 7212 + }, + { + "epoch": 0.2583128905760381, + "grad_norm": 1.5978248119354248, + "learning_rate": 0.0001738848015469889, + "loss": 1.4566, + "step": 7213 + }, + { + "epoch": 0.2583487027056064, + "grad_norm": 2.2260446548461914, + "learning_rate": 0.0001738769847987645, + "loss": 1.2414, + "step": 7214 + }, + { + "epoch": 0.25838451483517466, + "grad_norm": 1.7300844192504883, + "learning_rate": 0.0001738691670566206, + "loss": 1.3088, + "step": 7215 + }, + { + "epoch": 0.258420326964743, + "grad_norm": 1.403152346611023, + "learning_rate": 0.0001738613483206623, + "loss": 1.607, + "step": 7216 + }, + { + "epoch": 0.25845613909431125, + "grad_norm": 1.4186724424362183, + "learning_rate": 0.00017385352859099483, + "loss": 1.6773, + "step": 7217 + }, + { + "epoch": 0.2584919512238795, + "grad_norm": 1.7511612176895142, + "learning_rate": 0.00017384570786772345, + "loss": 1.8717, + "step": 7218 + }, + { + "epoch": 0.2585277633534478, + "grad_norm": 2.500760793685913, + "learning_rate": 0.00017383788615095327, + "loss": 1.6079, + "step": 7219 + }, + { + "epoch": 0.2585635754830161, + "grad_norm": 1.5331605672836304, + "learning_rate": 0.0001738300634407896, + "loss": 1.4821, + "step": 7220 + }, + { + "epoch": 0.2585993876125844, + "grad_norm": 1.4042764902114868, + "learning_rate": 0.00017382223973733767, + "loss": 1.5336, + "step": 7221 + }, + { + "epoch": 0.25863519974215265, + "grad_norm": 1.2958499193191528, + "learning_rate": 0.0001738144150407027, + "loss": 1.5881, + "step": 7222 + }, + { + "epoch": 0.258671011871721, + "grad_norm": 1.417944073677063, + "learning_rate": 0.00017380658935099, + "loss": 1.6748, + "step": 7223 + }, + { + "epoch": 0.25870682400128925, + "grad_norm": 1.7564271688461304, + "learning_rate": 0.00017379876266830486, + "loss": 1.5369, + "step": 7224 + }, + { + "epoch": 0.2587426361308575, + "grad_norm": 1.4452077150344849, + "learning_rate": 0.00017379093499275258, + "loss": 1.6696, + "step": 7225 + }, + { + "epoch": 0.2587784482604258, + "grad_norm": 1.0980790853500366, + "learning_rate": 0.00017378310632443843, + "loss": 1.2389, + "step": 7226 + }, + { + "epoch": 0.2588142603899941, + "grad_norm": 1.6883370876312256, + "learning_rate": 0.00017377527666346772, + "loss": 1.6746, + "step": 7227 + }, + { + "epoch": 0.2588500725195624, + "grad_norm": 1.3964970111846924, + "learning_rate": 0.00017376744600994587, + "loss": 1.6275, + "step": 7228 + }, + { + "epoch": 0.25888588464913065, + "grad_norm": 2.0311527252197266, + "learning_rate": 0.00017375961436397818, + "loss": 1.3165, + "step": 7229 + }, + { + "epoch": 0.2589216967786989, + "grad_norm": 1.7014087438583374, + "learning_rate": 0.00017375178172567002, + "loss": 1.5707, + "step": 7230 + }, + { + "epoch": 0.25895750890826724, + "grad_norm": 1.977447748184204, + "learning_rate": 0.00017374394809512676, + "loss": 1.5756, + "step": 7231 + }, + { + "epoch": 0.2589933210378355, + "grad_norm": 1.3852441310882568, + "learning_rate": 0.0001737361134724538, + "loss": 1.4648, + "step": 7232 + }, + { + "epoch": 0.2590291331674038, + "grad_norm": 1.8260564804077148, + "learning_rate": 0.00017372827785775655, + "loss": 1.8661, + "step": 7233 + }, + { + "epoch": 0.2590649452969721, + "grad_norm": 1.7901363372802734, + "learning_rate": 0.00017372044125114045, + "loss": 1.3408, + "step": 7234 + }, + { + "epoch": 0.2591007574265404, + "grad_norm": 2.0273873805999756, + "learning_rate": 0.0001737126036527109, + "loss": 1.5693, + "step": 7235 + }, + { + "epoch": 0.25913656955610864, + "grad_norm": 1.6030428409576416, + "learning_rate": 0.00017370476506257333, + "loss": 1.444, + "step": 7236 + }, + { + "epoch": 0.2591723816856769, + "grad_norm": 1.4109402894973755, + "learning_rate": 0.0001736969254808332, + "loss": 1.3457, + "step": 7237 + }, + { + "epoch": 0.25920819381524524, + "grad_norm": 2.0388617515563965, + "learning_rate": 0.00017368908490759605, + "loss": 1.7262, + "step": 7238 + }, + { + "epoch": 0.2592440059448135, + "grad_norm": 2.0132298469543457, + "learning_rate": 0.00017368124334296727, + "loss": 1.3662, + "step": 7239 + }, + { + "epoch": 0.2592798180743818, + "grad_norm": 1.9762897491455078, + "learning_rate": 0.00017367340078705242, + "loss": 1.5578, + "step": 7240 + }, + { + "epoch": 0.2593156302039501, + "grad_norm": 2.113196611404419, + "learning_rate": 0.000173665557239957, + "loss": 1.7907, + "step": 7241 + }, + { + "epoch": 0.25935144233351837, + "grad_norm": 2.036367416381836, + "learning_rate": 0.00017365771270178652, + "loss": 1.6257, + "step": 7242 + }, + { + "epoch": 0.25938725446308664, + "grad_norm": 1.8975567817687988, + "learning_rate": 0.00017364986717264652, + "loss": 1.7479, + "step": 7243 + }, + { + "epoch": 0.2594230665926549, + "grad_norm": 1.5035462379455566, + "learning_rate": 0.00017364202065264258, + "loss": 1.7363, + "step": 7244 + }, + { + "epoch": 0.25945887872222323, + "grad_norm": 2.175387382507324, + "learning_rate": 0.00017363417314188024, + "loss": 1.8906, + "step": 7245 + }, + { + "epoch": 0.2594946908517915, + "grad_norm": 1.3993414640426636, + "learning_rate": 0.00017362632464046506, + "loss": 1.5836, + "step": 7246 + }, + { + "epoch": 0.25953050298135977, + "grad_norm": 1.9026535749435425, + "learning_rate": 0.00017361847514850266, + "loss": 1.5866, + "step": 7247 + }, + { + "epoch": 0.2595663151109281, + "grad_norm": 2.123619318008423, + "learning_rate": 0.00017361062466609867, + "loss": 1.1663, + "step": 7248 + }, + { + "epoch": 0.25960212724049636, + "grad_norm": 1.8966612815856934, + "learning_rate": 0.00017360277319335865, + "loss": 1.5483, + "step": 7249 + }, + { + "epoch": 0.25963793937006463, + "grad_norm": 1.919511079788208, + "learning_rate": 0.00017359492073038826, + "loss": 1.7607, + "step": 7250 + }, + { + "epoch": 0.2596737514996329, + "grad_norm": 1.6821178197860718, + "learning_rate": 0.00017358706727729311, + "loss": 1.3796, + "step": 7251 + }, + { + "epoch": 0.2597095636292012, + "grad_norm": 1.4946900606155396, + "learning_rate": 0.00017357921283417892, + "loss": 1.7299, + "step": 7252 + }, + { + "epoch": 0.2597453757587695, + "grad_norm": 1.5954385995864868, + "learning_rate": 0.00017357135740115137, + "loss": 1.4342, + "step": 7253 + }, + { + "epoch": 0.25978118788833776, + "grad_norm": 1.8522142171859741, + "learning_rate": 0.00017356350097831605, + "loss": 1.4027, + "step": 7254 + }, + { + "epoch": 0.2598170000179061, + "grad_norm": 1.701069712638855, + "learning_rate": 0.00017355564356577873, + "loss": 1.6356, + "step": 7255 + }, + { + "epoch": 0.25985281214747435, + "grad_norm": 2.1512773036956787, + "learning_rate": 0.00017354778516364512, + "loss": 1.4857, + "step": 7256 + }, + { + "epoch": 0.2598886242770426, + "grad_norm": 2.0113370418548584, + "learning_rate": 0.00017353992577202093, + "loss": 1.7957, + "step": 7257 + }, + { + "epoch": 0.2599244364066109, + "grad_norm": 1.7597367763519287, + "learning_rate": 0.00017353206539101186, + "loss": 1.4799, + "step": 7258 + }, + { + "epoch": 0.2599602485361792, + "grad_norm": 1.4501522779464722, + "learning_rate": 0.00017352420402072375, + "loss": 1.4439, + "step": 7259 + }, + { + "epoch": 0.2599960606657475, + "grad_norm": 1.3950390815734863, + "learning_rate": 0.00017351634166126227, + "loss": 1.6883, + "step": 7260 + }, + { + "epoch": 0.26003187279531575, + "grad_norm": 1.6042085886001587, + "learning_rate": 0.00017350847831273329, + "loss": 1.7091, + "step": 7261 + }, + { + "epoch": 0.2600676849248841, + "grad_norm": 1.632921814918518, + "learning_rate": 0.00017350061397524252, + "loss": 1.541, + "step": 7262 + }, + { + "epoch": 0.26010349705445235, + "grad_norm": 1.503475546836853, + "learning_rate": 0.0001734927486488958, + "loss": 1.5553, + "step": 7263 + }, + { + "epoch": 0.2601393091840206, + "grad_norm": 1.7809571027755737, + "learning_rate": 0.00017348488233379897, + "loss": 1.8013, + "step": 7264 + }, + { + "epoch": 0.2601751213135889, + "grad_norm": 1.4528809785842896, + "learning_rate": 0.0001734770150300578, + "loss": 1.5241, + "step": 7265 + }, + { + "epoch": 0.2602109334431572, + "grad_norm": 1.5344305038452148, + "learning_rate": 0.00017346914673777822, + "loss": 1.7202, + "step": 7266 + }, + { + "epoch": 0.2602467455727255, + "grad_norm": 1.6437627077102661, + "learning_rate": 0.000173461277457066, + "loss": 1.4926, + "step": 7267 + }, + { + "epoch": 0.26028255770229375, + "grad_norm": 1.5676155090332031, + "learning_rate": 0.00017345340718802704, + "loss": 1.6236, + "step": 7268 + }, + { + "epoch": 0.2603183698318621, + "grad_norm": 2.03763747215271, + "learning_rate": 0.00017344553593076726, + "loss": 1.3897, + "step": 7269 + }, + { + "epoch": 0.26035418196143034, + "grad_norm": 2.440800666809082, + "learning_rate": 0.00017343766368539253, + "loss": 1.7987, + "step": 7270 + }, + { + "epoch": 0.2603899940909986, + "grad_norm": 1.6920621395111084, + "learning_rate": 0.00017342979045200876, + "loss": 1.8625, + "step": 7271 + }, + { + "epoch": 0.2604258062205669, + "grad_norm": 1.7777220010757446, + "learning_rate": 0.00017342191623072187, + "loss": 1.375, + "step": 7272 + }, + { + "epoch": 0.2604616183501352, + "grad_norm": 1.4054484367370605, + "learning_rate": 0.00017341404102163782, + "loss": 1.64, + "step": 7273 + }, + { + "epoch": 0.2604974304797035, + "grad_norm": 2.3189139366149902, + "learning_rate": 0.00017340616482486253, + "loss": 1.6583, + "step": 7274 + }, + { + "epoch": 0.26053324260927174, + "grad_norm": 1.4085335731506348, + "learning_rate": 0.00017339828764050198, + "loss": 1.3233, + "step": 7275 + }, + { + "epoch": 0.26056905473884007, + "grad_norm": 1.6071890592575073, + "learning_rate": 0.00017339040946866217, + "loss": 1.6617, + "step": 7276 + }, + { + "epoch": 0.26060486686840834, + "grad_norm": 1.9245442152023315, + "learning_rate": 0.00017338253030944905, + "loss": 1.7055, + "step": 7277 + }, + { + "epoch": 0.2606406789979766, + "grad_norm": 2.0860061645507812, + "learning_rate": 0.00017337465016296864, + "loss": 1.5268, + "step": 7278 + }, + { + "epoch": 0.2606764911275449, + "grad_norm": 1.7437604665756226, + "learning_rate": 0.00017336676902932695, + "loss": 1.4931, + "step": 7279 + }, + { + "epoch": 0.2607123032571132, + "grad_norm": 2.2297158241271973, + "learning_rate": 0.00017335888690863, + "loss": 1.4163, + "step": 7280 + }, + { + "epoch": 0.26074811538668147, + "grad_norm": 1.6739795207977295, + "learning_rate": 0.00017335100380098392, + "loss": 1.6585, + "step": 7281 + }, + { + "epoch": 0.26078392751624974, + "grad_norm": 1.6758095026016235, + "learning_rate": 0.00017334311970649465, + "loss": 1.5066, + "step": 7282 + }, + { + "epoch": 0.26081973964581806, + "grad_norm": 1.510539174079895, + "learning_rate": 0.00017333523462526832, + "loss": 1.5937, + "step": 7283 + }, + { + "epoch": 0.26085555177538633, + "grad_norm": 2.0457797050476074, + "learning_rate": 0.000173327348557411, + "loss": 1.6134, + "step": 7284 + }, + { + "epoch": 0.2608913639049546, + "grad_norm": 1.2520917654037476, + "learning_rate": 0.00017331946150302878, + "loss": 1.4669, + "step": 7285 + }, + { + "epoch": 0.26092717603452287, + "grad_norm": 1.4760613441467285, + "learning_rate": 0.00017331157346222779, + "loss": 1.3615, + "step": 7286 + }, + { + "epoch": 0.2609629881640912, + "grad_norm": 1.5453470945358276, + "learning_rate": 0.00017330368443511417, + "loss": 1.6023, + "step": 7287 + }, + { + "epoch": 0.26099880029365946, + "grad_norm": 1.4087209701538086, + "learning_rate": 0.00017329579442179401, + "loss": 1.7286, + "step": 7288 + }, + { + "epoch": 0.26103461242322773, + "grad_norm": 1.5778745412826538, + "learning_rate": 0.00017328790342237347, + "loss": 1.4194, + "step": 7289 + }, + { + "epoch": 0.26107042455279605, + "grad_norm": 1.4336007833480835, + "learning_rate": 0.00017328001143695874, + "loss": 1.2318, + "step": 7290 + }, + { + "epoch": 0.2611062366823643, + "grad_norm": 1.543596625328064, + "learning_rate": 0.00017327211846565596, + "loss": 1.5086, + "step": 7291 + }, + { + "epoch": 0.2611420488119326, + "grad_norm": 1.4977967739105225, + "learning_rate": 0.0001732642245085714, + "loss": 1.5653, + "step": 7292 + }, + { + "epoch": 0.26117786094150086, + "grad_norm": 1.346598744392395, + "learning_rate": 0.00017325632956581113, + "loss": 1.5548, + "step": 7293 + }, + { + "epoch": 0.2612136730710692, + "grad_norm": 2.323775053024292, + "learning_rate": 0.00017324843363748148, + "loss": 1.521, + "step": 7294 + }, + { + "epoch": 0.26124948520063745, + "grad_norm": 1.5547338724136353, + "learning_rate": 0.00017324053672368862, + "loss": 1.5629, + "step": 7295 + }, + { + "epoch": 0.2612852973302057, + "grad_norm": 1.9380877017974854, + "learning_rate": 0.0001732326388245388, + "loss": 1.5277, + "step": 7296 + }, + { + "epoch": 0.26132110945977405, + "grad_norm": 2.077476739883423, + "learning_rate": 0.00017322473994013833, + "loss": 1.4113, + "step": 7297 + }, + { + "epoch": 0.2613569215893423, + "grad_norm": 1.7269058227539062, + "learning_rate": 0.00017321684007059343, + "loss": 1.4078, + "step": 7298 + }, + { + "epoch": 0.2613927337189106, + "grad_norm": 1.5610759258270264, + "learning_rate": 0.00017320893921601036, + "loss": 1.348, + "step": 7299 + }, + { + "epoch": 0.26142854584847885, + "grad_norm": 1.4764721393585205, + "learning_rate": 0.00017320103737649548, + "loss": 1.3946, + "step": 7300 + }, + { + "epoch": 0.2614643579780472, + "grad_norm": 1.443997859954834, + "learning_rate": 0.00017319313455215504, + "loss": 1.4114, + "step": 7301 + }, + { + "epoch": 0.26150017010761545, + "grad_norm": 1.4743808507919312, + "learning_rate": 0.00017318523074309538, + "loss": 1.7602, + "step": 7302 + }, + { + "epoch": 0.2615359822371837, + "grad_norm": 1.5797048807144165, + "learning_rate": 0.00017317732594942286, + "loss": 1.5341, + "step": 7303 + }, + { + "epoch": 0.26157179436675204, + "grad_norm": 2.011399745941162, + "learning_rate": 0.0001731694201712438, + "loss": 1.5281, + "step": 7304 + }, + { + "epoch": 0.2616076064963203, + "grad_norm": 3.038586378097534, + "learning_rate": 0.0001731615134086646, + "loss": 1.4805, + "step": 7305 + }, + { + "epoch": 0.2616434186258886, + "grad_norm": 1.6645811796188354, + "learning_rate": 0.00017315360566179158, + "loss": 1.4753, + "step": 7306 + }, + { + "epoch": 0.26167923075545685, + "grad_norm": 1.668992519378662, + "learning_rate": 0.00017314569693073115, + "loss": 1.545, + "step": 7307 + }, + { + "epoch": 0.2617150428850252, + "grad_norm": 1.6997525691986084, + "learning_rate": 0.00017313778721558975, + "loss": 1.5749, + "step": 7308 + }, + { + "epoch": 0.26175085501459344, + "grad_norm": 1.5419024229049683, + "learning_rate": 0.00017312987651647374, + "loss": 1.457, + "step": 7309 + }, + { + "epoch": 0.2617866671441617, + "grad_norm": 1.712589144706726, + "learning_rate": 0.0001731219648334896, + "loss": 2.0299, + "step": 7310 + }, + { + "epoch": 0.26182247927373004, + "grad_norm": 1.5588375329971313, + "learning_rate": 0.00017311405216674373, + "loss": 1.5899, + "step": 7311 + }, + { + "epoch": 0.2618582914032983, + "grad_norm": 1.906366229057312, + "learning_rate": 0.00017310613851634257, + "loss": 1.698, + "step": 7312 + }, + { + "epoch": 0.2618941035328666, + "grad_norm": 1.9092109203338623, + "learning_rate": 0.00017309822388239266, + "loss": 1.6362, + "step": 7313 + }, + { + "epoch": 0.26192991566243484, + "grad_norm": 1.797682285308838, + "learning_rate": 0.0001730903082650004, + "loss": 1.3929, + "step": 7314 + }, + { + "epoch": 0.26196572779200317, + "grad_norm": 2.1721010208129883, + "learning_rate": 0.00017308239166427232, + "loss": 1.9409, + "step": 7315 + }, + { + "epoch": 0.26200153992157144, + "grad_norm": 2.2614290714263916, + "learning_rate": 0.00017307447408031497, + "loss": 1.5107, + "step": 7316 + }, + { + "epoch": 0.2620373520511397, + "grad_norm": 1.8231735229492188, + "learning_rate": 0.0001730665555132348, + "loss": 1.4254, + "step": 7317 + }, + { + "epoch": 0.26207316418070803, + "grad_norm": 1.7644098997116089, + "learning_rate": 0.00017305863596313837, + "loss": 1.4696, + "step": 7318 + }, + { + "epoch": 0.2621089763102763, + "grad_norm": 1.7434821128845215, + "learning_rate": 0.00017305071543013227, + "loss": 1.522, + "step": 7319 + }, + { + "epoch": 0.26214478843984457, + "grad_norm": 1.7708624601364136, + "learning_rate": 0.000173042793914323, + "loss": 1.4897, + "step": 7320 + }, + { + "epoch": 0.26218060056941284, + "grad_norm": 1.6023200750350952, + "learning_rate": 0.00017303487141581716, + "loss": 1.58, + "step": 7321 + }, + { + "epoch": 0.26221641269898116, + "grad_norm": 2.101994752883911, + "learning_rate": 0.0001730269479347213, + "loss": 1.7245, + "step": 7322 + }, + { + "epoch": 0.26225222482854943, + "grad_norm": 1.9376888275146484, + "learning_rate": 0.00017301902347114208, + "loss": 1.5611, + "step": 7323 + }, + { + "epoch": 0.2622880369581177, + "grad_norm": 2.2252988815307617, + "learning_rate": 0.0001730110980251861, + "loss": 1.7032, + "step": 7324 + }, + { + "epoch": 0.262323849087686, + "grad_norm": 1.6127136945724487, + "learning_rate": 0.00017300317159695995, + "loss": 1.4226, + "step": 7325 + }, + { + "epoch": 0.2623596612172543, + "grad_norm": 2.258042812347412, + "learning_rate": 0.0001729952441865703, + "loss": 1.6989, + "step": 7326 + }, + { + "epoch": 0.26239547334682256, + "grad_norm": 1.4718269109725952, + "learning_rate": 0.0001729873157941238, + "loss": 1.5979, + "step": 7327 + }, + { + "epoch": 0.26243128547639083, + "grad_norm": 1.5999681949615479, + "learning_rate": 0.00017297938641972716, + "loss": 1.4861, + "step": 7328 + }, + { + "epoch": 0.26246709760595915, + "grad_norm": 1.6622743606567383, + "learning_rate": 0.00017297145606348695, + "loss": 1.4849, + "step": 7329 + }, + { + "epoch": 0.2625029097355274, + "grad_norm": 2.1193180084228516, + "learning_rate": 0.00017296352472550994, + "loss": 1.8168, + "step": 7330 + }, + { + "epoch": 0.2625387218650957, + "grad_norm": 1.6978873014450073, + "learning_rate": 0.00017295559240590282, + "loss": 1.9455, + "step": 7331 + }, + { + "epoch": 0.262574533994664, + "grad_norm": 1.5294241905212402, + "learning_rate": 0.00017294765910477234, + "loss": 1.5882, + "step": 7332 + }, + { + "epoch": 0.2626103461242323, + "grad_norm": 1.2453467845916748, + "learning_rate": 0.00017293972482222515, + "loss": 1.4419, + "step": 7333 + }, + { + "epoch": 0.26264615825380055, + "grad_norm": 2.2070870399475098, + "learning_rate": 0.00017293178955836807, + "loss": 1.7747, + "step": 7334 + }, + { + "epoch": 0.2626819703833688, + "grad_norm": 1.5115678310394287, + "learning_rate": 0.00017292385331330786, + "loss": 1.6529, + "step": 7335 + }, + { + "epoch": 0.26271778251293715, + "grad_norm": 1.4143116474151611, + "learning_rate": 0.00017291591608715123, + "loss": 1.5566, + "step": 7336 + }, + { + "epoch": 0.2627535946425054, + "grad_norm": 1.4451357126235962, + "learning_rate": 0.00017290797788000503, + "loss": 1.6816, + "step": 7337 + }, + { + "epoch": 0.2627894067720737, + "grad_norm": 1.6054461002349854, + "learning_rate": 0.00017290003869197603, + "loss": 1.7646, + "step": 7338 + }, + { + "epoch": 0.262825218901642, + "grad_norm": 1.4832358360290527, + "learning_rate": 0.00017289209852317102, + "loss": 1.2788, + "step": 7339 + }, + { + "epoch": 0.2628610310312103, + "grad_norm": 1.7397518157958984, + "learning_rate": 0.00017288415737369689, + "loss": 1.5923, + "step": 7340 + }, + { + "epoch": 0.26289684316077855, + "grad_norm": 2.040109157562256, + "learning_rate": 0.0001728762152436604, + "loss": 1.5015, + "step": 7341 + }, + { + "epoch": 0.2629326552903468, + "grad_norm": 1.6289827823638916, + "learning_rate": 0.00017286827213316844, + "loss": 1.283, + "step": 7342 + }, + { + "epoch": 0.26296846741991514, + "grad_norm": 1.4795184135437012, + "learning_rate": 0.0001728603280423279, + "loss": 1.6067, + "step": 7343 + }, + { + "epoch": 0.2630042795494834, + "grad_norm": 1.4530529975891113, + "learning_rate": 0.00017285238297124562, + "loss": 1.3256, + "step": 7344 + }, + { + "epoch": 0.2630400916790517, + "grad_norm": 2.1068873405456543, + "learning_rate": 0.00017284443692002846, + "loss": 1.8416, + "step": 7345 + }, + { + "epoch": 0.26307590380862, + "grad_norm": 1.2771168947219849, + "learning_rate": 0.00017283648988878343, + "loss": 1.5879, + "step": 7346 + }, + { + "epoch": 0.2631117159381883, + "grad_norm": 1.4236646890640259, + "learning_rate": 0.00017282854187761735, + "loss": 1.8003, + "step": 7347 + }, + { + "epoch": 0.26314752806775654, + "grad_norm": 1.7822551727294922, + "learning_rate": 0.00017282059288663715, + "loss": 1.5744, + "step": 7348 + }, + { + "epoch": 0.2631833401973248, + "grad_norm": 1.6378618478775024, + "learning_rate": 0.00017281264291594983, + "loss": 1.69, + "step": 7349 + }, + { + "epoch": 0.26321915232689314, + "grad_norm": 1.6540489196777344, + "learning_rate": 0.00017280469196566235, + "loss": 1.3977, + "step": 7350 + }, + { + "epoch": 0.2632549644564614, + "grad_norm": 1.8168818950653076, + "learning_rate": 0.0001727967400358816, + "loss": 1.5793, + "step": 7351 + }, + { + "epoch": 0.2632907765860297, + "grad_norm": 2.310772657394409, + "learning_rate": 0.00017278878712671464, + "loss": 1.8026, + "step": 7352 + }, + { + "epoch": 0.263326588715598, + "grad_norm": 1.4462745189666748, + "learning_rate": 0.00017278083323826846, + "loss": 1.6332, + "step": 7353 + }, + { + "epoch": 0.26336240084516627, + "grad_norm": 1.3352108001708984, + "learning_rate": 0.00017277287837065002, + "loss": 1.2135, + "step": 7354 + }, + { + "epoch": 0.26339821297473454, + "grad_norm": 2.1556811332702637, + "learning_rate": 0.0001727649225239664, + "loss": 1.4066, + "step": 7355 + }, + { + "epoch": 0.2634340251043028, + "grad_norm": 1.7638002634048462, + "learning_rate": 0.00017275696569832457, + "loss": 1.623, + "step": 7356 + }, + { + "epoch": 0.26346983723387113, + "grad_norm": 1.8131543397903442, + "learning_rate": 0.00017274900789383165, + "loss": 1.4694, + "step": 7357 + }, + { + "epoch": 0.2635056493634394, + "grad_norm": 1.7827945947647095, + "learning_rate": 0.0001727410491105946, + "loss": 1.3291, + "step": 7358 + }, + { + "epoch": 0.26354146149300767, + "grad_norm": 2.088897943496704, + "learning_rate": 0.00017273308934872064, + "loss": 1.4516, + "step": 7359 + }, + { + "epoch": 0.263577273622576, + "grad_norm": 1.7818443775177002, + "learning_rate": 0.00017272512860831674, + "loss": 1.8525, + "step": 7360 + }, + { + "epoch": 0.26361308575214426, + "grad_norm": 1.5051300525665283, + "learning_rate": 0.00017271716688949007, + "loss": 1.5285, + "step": 7361 + }, + { + "epoch": 0.26364889788171253, + "grad_norm": 2.0163896083831787, + "learning_rate": 0.0001727092041923477, + "loss": 1.5589, + "step": 7362 + }, + { + "epoch": 0.2636847100112808, + "grad_norm": 1.6455193758010864, + "learning_rate": 0.00017270124051699682, + "loss": 1.673, + "step": 7363 + }, + { + "epoch": 0.2637205221408491, + "grad_norm": 1.561238408088684, + "learning_rate": 0.00017269327586354446, + "loss": 1.6594, + "step": 7364 + }, + { + "epoch": 0.2637563342704174, + "grad_norm": 1.765101671218872, + "learning_rate": 0.00017268531023209788, + "loss": 1.5668, + "step": 7365 + }, + { + "epoch": 0.26379214639998566, + "grad_norm": 2.3557276725769043, + "learning_rate": 0.0001726773436227642, + "loss": 1.5615, + "step": 7366 + }, + { + "epoch": 0.263827958529554, + "grad_norm": 1.725612759590149, + "learning_rate": 0.0001726693760356506, + "loss": 1.5138, + "step": 7367 + }, + { + "epoch": 0.26386377065912225, + "grad_norm": 1.6361420154571533, + "learning_rate": 0.0001726614074708643, + "loss": 1.3698, + "step": 7368 + }, + { + "epoch": 0.2638995827886905, + "grad_norm": 1.775638461112976, + "learning_rate": 0.00017265343792851248, + "loss": 1.3577, + "step": 7369 + }, + { + "epoch": 0.2639353949182588, + "grad_norm": 1.387298345565796, + "learning_rate": 0.00017264546740870234, + "loss": 1.5771, + "step": 7370 + }, + { + "epoch": 0.2639712070478271, + "grad_norm": 1.341925859451294, + "learning_rate": 0.0001726374959115412, + "loss": 1.3746, + "step": 7371 + }, + { + "epoch": 0.2640070191773954, + "grad_norm": 1.5934443473815918, + "learning_rate": 0.0001726295234371362, + "loss": 1.5295, + "step": 7372 + }, + { + "epoch": 0.26404283130696365, + "grad_norm": 1.5405175685882568, + "learning_rate": 0.00017262154998559466, + "loss": 1.5476, + "step": 7373 + }, + { + "epoch": 0.264078643436532, + "grad_norm": 1.6285407543182373, + "learning_rate": 0.00017261357555702387, + "loss": 1.565, + "step": 7374 + }, + { + "epoch": 0.26411445556610025, + "grad_norm": 2.251279592514038, + "learning_rate": 0.00017260560015153106, + "loss": 1.7117, + "step": 7375 + }, + { + "epoch": 0.2641502676956685, + "grad_norm": 2.265822410583496, + "learning_rate": 0.00017259762376922356, + "loss": 2.0113, + "step": 7376 + }, + { + "epoch": 0.2641860798252368, + "grad_norm": 2.2183547019958496, + "learning_rate": 0.00017258964641020868, + "loss": 1.5966, + "step": 7377 + }, + { + "epoch": 0.2642218919548051, + "grad_norm": 2.0988476276397705, + "learning_rate": 0.0001725816680745937, + "loss": 1.3639, + "step": 7378 + }, + { + "epoch": 0.2642577040843734, + "grad_norm": 2.5295467376708984, + "learning_rate": 0.00017257368876248604, + "loss": 1.4807, + "step": 7379 + }, + { + "epoch": 0.26429351621394165, + "grad_norm": 1.4437577724456787, + "learning_rate": 0.000172565708473993, + "loss": 1.4974, + "step": 7380 + }, + { + "epoch": 0.26432932834351, + "grad_norm": 1.2854939699172974, + "learning_rate": 0.00017255772720922195, + "loss": 1.6244, + "step": 7381 + }, + { + "epoch": 0.26436514047307824, + "grad_norm": 1.3018090724945068, + "learning_rate": 0.0001725497449682803, + "loss": 1.3882, + "step": 7382 + }, + { + "epoch": 0.2644009526026465, + "grad_norm": 1.5124176740646362, + "learning_rate": 0.00017254176175127538, + "loss": 1.4241, + "step": 7383 + }, + { + "epoch": 0.2644367647322148, + "grad_norm": 2.2638661861419678, + "learning_rate": 0.00017253377755831466, + "loss": 1.7341, + "step": 7384 + }, + { + "epoch": 0.2644725768617831, + "grad_norm": 1.42527174949646, + "learning_rate": 0.00017252579238950552, + "loss": 1.5593, + "step": 7385 + }, + { + "epoch": 0.2645083889913514, + "grad_norm": 1.981040596961975, + "learning_rate": 0.00017251780624495536, + "loss": 1.482, + "step": 7386 + }, + { + "epoch": 0.26454420112091964, + "grad_norm": 2.1076390743255615, + "learning_rate": 0.0001725098191247717, + "loss": 1.5527, + "step": 7387 + }, + { + "epoch": 0.26458001325048797, + "grad_norm": 1.287087321281433, + "learning_rate": 0.00017250183102906195, + "loss": 1.7802, + "step": 7388 + }, + { + "epoch": 0.26461582538005624, + "grad_norm": 1.3365637063980103, + "learning_rate": 0.00017249384195793357, + "loss": 1.4736, + "step": 7389 + }, + { + "epoch": 0.2646516375096245, + "grad_norm": 1.6480307579040527, + "learning_rate": 0.000172485851911494, + "loss": 1.8273, + "step": 7390 + }, + { + "epoch": 0.2646874496391928, + "grad_norm": 2.1490418910980225, + "learning_rate": 0.00017247786088985087, + "loss": 1.751, + "step": 7391 + }, + { + "epoch": 0.2647232617687611, + "grad_norm": 1.840654730796814, + "learning_rate": 0.0001724698688931116, + "loss": 1.772, + "step": 7392 + }, + { + "epoch": 0.26475907389832937, + "grad_norm": 3.2278695106506348, + "learning_rate": 0.0001724618759213837, + "loss": 1.7355, + "step": 7393 + }, + { + "epoch": 0.26479488602789764, + "grad_norm": 2.007066249847412, + "learning_rate": 0.00017245388197477477, + "loss": 1.4662, + "step": 7394 + }, + { + "epoch": 0.26483069815746596, + "grad_norm": 1.8525476455688477, + "learning_rate": 0.0001724458870533923, + "loss": 1.6683, + "step": 7395 + }, + { + "epoch": 0.26486651028703423, + "grad_norm": 1.865721583366394, + "learning_rate": 0.00017243789115734383, + "loss": 1.7475, + "step": 7396 + }, + { + "epoch": 0.2649023224166025, + "grad_norm": 1.5090208053588867, + "learning_rate": 0.00017242989428673701, + "loss": 1.5835, + "step": 7397 + }, + { + "epoch": 0.26493813454617077, + "grad_norm": 2.2835018634796143, + "learning_rate": 0.0001724218964416794, + "loss": 1.4297, + "step": 7398 + }, + { + "epoch": 0.2649739466757391, + "grad_norm": 1.7916940450668335, + "learning_rate": 0.00017241389762227857, + "loss": 1.666, + "step": 7399 + }, + { + "epoch": 0.26500975880530736, + "grad_norm": 1.830520510673523, + "learning_rate": 0.00017240589782864215, + "loss": 1.5415, + "step": 7400 + }, + { + "epoch": 0.26504557093487563, + "grad_norm": 1.5442086458206177, + "learning_rate": 0.00017239789706087778, + "loss": 1.6078, + "step": 7401 + }, + { + "epoch": 0.26508138306444395, + "grad_norm": 1.3873393535614014, + "learning_rate": 0.0001723898953190931, + "loss": 1.6903, + "step": 7402 + }, + { + "epoch": 0.2651171951940122, + "grad_norm": 1.6361714601516724, + "learning_rate": 0.00017238189260339573, + "loss": 1.6274, + "step": 7403 + }, + { + "epoch": 0.2651530073235805, + "grad_norm": 1.1806063652038574, + "learning_rate": 0.00017237388891389336, + "loss": 1.5139, + "step": 7404 + }, + { + "epoch": 0.26518881945314876, + "grad_norm": 1.8152517080307007, + "learning_rate": 0.0001723658842506937, + "loss": 1.3687, + "step": 7405 + }, + { + "epoch": 0.2652246315827171, + "grad_norm": 1.5166256427764893, + "learning_rate": 0.0001723578786139044, + "loss": 1.3072, + "step": 7406 + }, + { + "epoch": 0.26526044371228535, + "grad_norm": 1.715145468711853, + "learning_rate": 0.00017234987200363317, + "loss": 1.4742, + "step": 7407 + }, + { + "epoch": 0.2652962558418536, + "grad_norm": 2.2993531227111816, + "learning_rate": 0.00017234186441998777, + "loss": 1.5068, + "step": 7408 + }, + { + "epoch": 0.26533206797142195, + "grad_norm": 1.8281476497650146, + "learning_rate": 0.00017233385586307588, + "loss": 1.6021, + "step": 7409 + }, + { + "epoch": 0.2653678801009902, + "grad_norm": 1.5883283615112305, + "learning_rate": 0.00017232584633300522, + "loss": 1.6, + "step": 7410 + }, + { + "epoch": 0.2654036922305585, + "grad_norm": 1.610416293144226, + "learning_rate": 0.00017231783582988367, + "loss": 1.6896, + "step": 7411 + }, + { + "epoch": 0.26543950436012675, + "grad_norm": 1.5972161293029785, + "learning_rate": 0.00017230982435381887, + "loss": 1.7697, + "step": 7412 + }, + { + "epoch": 0.2654753164896951, + "grad_norm": 1.538904070854187, + "learning_rate": 0.00017230181190491862, + "loss": 1.5154, + "step": 7413 + }, + { + "epoch": 0.26551112861926335, + "grad_norm": 1.459111213684082, + "learning_rate": 0.0001722937984832908, + "loss": 1.6878, + "step": 7414 + }, + { + "epoch": 0.2655469407488316, + "grad_norm": 1.890040636062622, + "learning_rate": 0.0001722857840890432, + "loss": 1.3518, + "step": 7415 + }, + { + "epoch": 0.26558275287839994, + "grad_norm": 1.6970199346542358, + "learning_rate": 0.00017227776872228359, + "loss": 1.5614, + "step": 7416 + }, + { + "epoch": 0.2656185650079682, + "grad_norm": 1.4791874885559082, + "learning_rate": 0.00017226975238311982, + "loss": 1.4532, + "step": 7417 + }, + { + "epoch": 0.2656543771375365, + "grad_norm": 1.8702563047409058, + "learning_rate": 0.00017226173507165976, + "loss": 1.7689, + "step": 7418 + }, + { + "epoch": 0.26569018926710475, + "grad_norm": 1.5942208766937256, + "learning_rate": 0.0001722537167880113, + "loss": 1.7556, + "step": 7419 + }, + { + "epoch": 0.2657260013966731, + "grad_norm": 1.6000324487686157, + "learning_rate": 0.00017224569753228225, + "loss": 1.5437, + "step": 7420 + }, + { + "epoch": 0.26576181352624134, + "grad_norm": 2.1063296794891357, + "learning_rate": 0.00017223767730458053, + "loss": 1.7174, + "step": 7421 + }, + { + "epoch": 0.2657976256558096, + "grad_norm": 1.3740330934524536, + "learning_rate": 0.00017222965610501405, + "loss": 1.4734, + "step": 7422 + }, + { + "epoch": 0.26583343778537794, + "grad_norm": 1.5399854183197021, + "learning_rate": 0.00017222163393369071, + "loss": 1.5247, + "step": 7423 + }, + { + "epoch": 0.2658692499149462, + "grad_norm": 1.5186257362365723, + "learning_rate": 0.00017221361079071846, + "loss": 1.901, + "step": 7424 + }, + { + "epoch": 0.2659050620445145, + "grad_norm": 1.7701181173324585, + "learning_rate": 0.00017220558667620518, + "loss": 1.4393, + "step": 7425 + }, + { + "epoch": 0.26594087417408274, + "grad_norm": 1.7692437171936035, + "learning_rate": 0.0001721975615902589, + "loss": 1.5675, + "step": 7426 + }, + { + "epoch": 0.26597668630365107, + "grad_norm": 2.3157219886779785, + "learning_rate": 0.00017218953553298759, + "loss": 1.5851, + "step": 7427 + }, + { + "epoch": 0.26601249843321934, + "grad_norm": 1.5588358640670776, + "learning_rate": 0.00017218150850449915, + "loss": 1.2724, + "step": 7428 + }, + { + "epoch": 0.2660483105627876, + "grad_norm": 1.2565510272979736, + "learning_rate": 0.00017217348050490162, + "loss": 1.6188, + "step": 7429 + }, + { + "epoch": 0.2660841226923559, + "grad_norm": 1.3198089599609375, + "learning_rate": 0.00017216545153430303, + "loss": 1.4822, + "step": 7430 + }, + { + "epoch": 0.2661199348219242, + "grad_norm": 1.2001327276229858, + "learning_rate": 0.00017215742159281137, + "loss": 1.4967, + "step": 7431 + }, + { + "epoch": 0.26615574695149247, + "grad_norm": 1.8182188272476196, + "learning_rate": 0.00017214939068053468, + "loss": 1.6206, + "step": 7432 + }, + { + "epoch": 0.26619155908106074, + "grad_norm": 1.916847586631775, + "learning_rate": 0.000172141358797581, + "loss": 1.6809, + "step": 7433 + }, + { + "epoch": 0.26622737121062906, + "grad_norm": 1.1985223293304443, + "learning_rate": 0.0001721333259440584, + "loss": 1.4388, + "step": 7434 + }, + { + "epoch": 0.26626318334019733, + "grad_norm": 1.6115180253982544, + "learning_rate": 0.00017212529212007492, + "loss": 1.6658, + "step": 7435 + }, + { + "epoch": 0.2662989954697656, + "grad_norm": 1.7065064907073975, + "learning_rate": 0.0001721172573257387, + "loss": 1.5012, + "step": 7436 + }, + { + "epoch": 0.26633480759933387, + "grad_norm": 1.26376211643219, + "learning_rate": 0.0001721092215611578, + "loss": 1.0781, + "step": 7437 + }, + { + "epoch": 0.2663706197289022, + "grad_norm": 1.3036880493164062, + "learning_rate": 0.00017210118482644036, + "loss": 1.5433, + "step": 7438 + }, + { + "epoch": 0.26640643185847046, + "grad_norm": 1.7557674646377563, + "learning_rate": 0.00017209314712169445, + "loss": 1.449, + "step": 7439 + }, + { + "epoch": 0.26644224398803873, + "grad_norm": 1.509422779083252, + "learning_rate": 0.00017208510844702823, + "loss": 1.6469, + "step": 7440 + }, + { + "epoch": 0.26647805611760705, + "grad_norm": 1.7672717571258545, + "learning_rate": 0.00017207706880254987, + "loss": 1.6361, + "step": 7441 + }, + { + "epoch": 0.2665138682471753, + "grad_norm": 1.363512396812439, + "learning_rate": 0.00017206902818836756, + "loss": 1.2467, + "step": 7442 + }, + { + "epoch": 0.2665496803767436, + "grad_norm": 2.0854268074035645, + "learning_rate": 0.00017206098660458937, + "loss": 1.596, + "step": 7443 + }, + { + "epoch": 0.26658549250631186, + "grad_norm": 1.7014752626419067, + "learning_rate": 0.00017205294405132362, + "loss": 1.4051, + "step": 7444 + }, + { + "epoch": 0.2666213046358802, + "grad_norm": 1.4514315128326416, + "learning_rate": 0.00017204490052867842, + "loss": 1.3514, + "step": 7445 + }, + { + "epoch": 0.26665711676544845, + "grad_norm": 1.4428623914718628, + "learning_rate": 0.00017203685603676202, + "loss": 1.6256, + "step": 7446 + }, + { + "epoch": 0.2666929288950167, + "grad_norm": 2.695467233657837, + "learning_rate": 0.0001720288105756826, + "loss": 1.8272, + "step": 7447 + }, + { + "epoch": 0.26672874102458505, + "grad_norm": 1.776180386543274, + "learning_rate": 0.0001720207641455485, + "loss": 1.6105, + "step": 7448 + }, + { + "epoch": 0.2667645531541533, + "grad_norm": 1.5230605602264404, + "learning_rate": 0.0001720127167464679, + "loss": 1.6002, + "step": 7449 + }, + { + "epoch": 0.2668003652837216, + "grad_norm": 1.4507273435592651, + "learning_rate": 0.00017200466837854908, + "loss": 1.6159, + "step": 7450 + }, + { + "epoch": 0.26683617741328985, + "grad_norm": 2.1067185401916504, + "learning_rate": 0.00017199661904190037, + "loss": 1.3856, + "step": 7451 + }, + { + "epoch": 0.2668719895428582, + "grad_norm": 1.6292303800582886, + "learning_rate": 0.00017198856873662996, + "loss": 1.5911, + "step": 7452 + }, + { + "epoch": 0.26690780167242645, + "grad_norm": 1.3270217180252075, + "learning_rate": 0.00017198051746284624, + "loss": 1.4468, + "step": 7453 + }, + { + "epoch": 0.2669436138019947, + "grad_norm": 1.3355484008789062, + "learning_rate": 0.00017197246522065752, + "loss": 1.4923, + "step": 7454 + }, + { + "epoch": 0.26697942593156304, + "grad_norm": 1.5357935428619385, + "learning_rate": 0.00017196441201017208, + "loss": 1.2817, + "step": 7455 + }, + { + "epoch": 0.2670152380611313, + "grad_norm": 1.796885371208191, + "learning_rate": 0.00017195635783149834, + "loss": 1.7462, + "step": 7456 + }, + { + "epoch": 0.2670510501906996, + "grad_norm": 2.4231655597686768, + "learning_rate": 0.0001719483026847446, + "loss": 1.9039, + "step": 7457 + }, + { + "epoch": 0.26708686232026785, + "grad_norm": 1.710451602935791, + "learning_rate": 0.00017194024657001927, + "loss": 1.5666, + "step": 7458 + }, + { + "epoch": 0.2671226744498362, + "grad_norm": 1.5234856605529785, + "learning_rate": 0.0001719321894874307, + "loss": 1.6446, + "step": 7459 + }, + { + "epoch": 0.26715848657940444, + "grad_norm": 1.9876848459243774, + "learning_rate": 0.00017192413143708735, + "loss": 1.6329, + "step": 7460 + }, + { + "epoch": 0.2671942987089727, + "grad_norm": 1.7578610181808472, + "learning_rate": 0.00017191607241909753, + "loss": 1.5056, + "step": 7461 + }, + { + "epoch": 0.26723011083854104, + "grad_norm": 1.4033610820770264, + "learning_rate": 0.00017190801243356977, + "loss": 1.4246, + "step": 7462 + }, + { + "epoch": 0.2672659229681093, + "grad_norm": 1.780724048614502, + "learning_rate": 0.0001718999514806124, + "loss": 1.3211, + "step": 7463 + }, + { + "epoch": 0.2673017350976776, + "grad_norm": 2.1150660514831543, + "learning_rate": 0.000171891889560334, + "loss": 1.5886, + "step": 7464 + }, + { + "epoch": 0.26733754722724584, + "grad_norm": 1.8331183195114136, + "learning_rate": 0.0001718838266728429, + "loss": 1.4792, + "step": 7465 + }, + { + "epoch": 0.26737335935681417, + "grad_norm": 1.8556559085845947, + "learning_rate": 0.00017187576281824766, + "loss": 1.5132, + "step": 7466 + }, + { + "epoch": 0.26740917148638244, + "grad_norm": 1.7563835382461548, + "learning_rate": 0.00017186769799665673, + "loss": 1.5768, + "step": 7467 + }, + { + "epoch": 0.2674449836159507, + "grad_norm": 2.372431516647339, + "learning_rate": 0.00017185963220817864, + "loss": 1.5395, + "step": 7468 + }, + { + "epoch": 0.26748079574551903, + "grad_norm": 1.086047649383545, + "learning_rate": 0.0001718515654529219, + "loss": 1.3656, + "step": 7469 + }, + { + "epoch": 0.2675166078750873, + "grad_norm": 1.8703947067260742, + "learning_rate": 0.000171843497730995, + "loss": 1.6605, + "step": 7470 + }, + { + "epoch": 0.26755242000465557, + "grad_norm": 1.627095103263855, + "learning_rate": 0.00017183542904250656, + "loss": 1.6894, + "step": 7471 + }, + { + "epoch": 0.26758823213422384, + "grad_norm": 1.8694849014282227, + "learning_rate": 0.00017182735938756506, + "loss": 1.4633, + "step": 7472 + }, + { + "epoch": 0.26762404426379216, + "grad_norm": 1.7789463996887207, + "learning_rate": 0.00017181928876627907, + "loss": 1.5519, + "step": 7473 + }, + { + "epoch": 0.26765985639336043, + "grad_norm": 2.8911938667297363, + "learning_rate": 0.0001718112171787572, + "loss": 1.5961, + "step": 7474 + }, + { + "epoch": 0.2676956685229287, + "grad_norm": 1.5189895629882812, + "learning_rate": 0.000171803144625108, + "loss": 1.401, + "step": 7475 + }, + { + "epoch": 0.267731480652497, + "grad_norm": 1.9463037252426147, + "learning_rate": 0.00017179507110544014, + "loss": 1.8663, + "step": 7476 + }, + { + "epoch": 0.2677672927820653, + "grad_norm": 1.4575629234313965, + "learning_rate": 0.0001717869966198622, + "loss": 1.5934, + "step": 7477 + }, + { + "epoch": 0.26780310491163356, + "grad_norm": 1.911528468132019, + "learning_rate": 0.00017177892116848284, + "loss": 1.572, + "step": 7478 + }, + { + "epoch": 0.26783891704120183, + "grad_norm": 1.543802261352539, + "learning_rate": 0.00017177084475141069, + "loss": 1.7237, + "step": 7479 + }, + { + "epoch": 0.26787472917077015, + "grad_norm": 1.4773614406585693, + "learning_rate": 0.0001717627673687544, + "loss": 1.6445, + "step": 7480 + }, + { + "epoch": 0.2679105413003384, + "grad_norm": 1.8544567823410034, + "learning_rate": 0.0001717546890206226, + "loss": 1.2569, + "step": 7481 + }, + { + "epoch": 0.2679463534299067, + "grad_norm": 1.9463386535644531, + "learning_rate": 0.00017174660970712403, + "loss": 1.5379, + "step": 7482 + }, + { + "epoch": 0.267982165559475, + "grad_norm": 2.0309228897094727, + "learning_rate": 0.00017173852942836739, + "loss": 1.6161, + "step": 7483 + }, + { + "epoch": 0.2680179776890433, + "grad_norm": 1.7174644470214844, + "learning_rate": 0.00017173044818446137, + "loss": 1.6121, + "step": 7484 + }, + { + "epoch": 0.26805378981861155, + "grad_norm": 1.8857377767562866, + "learning_rate": 0.00017172236597551467, + "loss": 1.6238, + "step": 7485 + }, + { + "epoch": 0.2680896019481798, + "grad_norm": 1.3329565525054932, + "learning_rate": 0.0001717142828016361, + "loss": 1.7221, + "step": 7486 + }, + { + "epoch": 0.26812541407774815, + "grad_norm": 1.682708978652954, + "learning_rate": 0.00017170619866293434, + "loss": 1.1758, + "step": 7487 + }, + { + "epoch": 0.2681612262073164, + "grad_norm": 1.6127010583877563, + "learning_rate": 0.00017169811355951815, + "loss": 1.5131, + "step": 7488 + }, + { + "epoch": 0.2681970383368847, + "grad_norm": 1.7014356851577759, + "learning_rate": 0.0001716900274914963, + "loss": 1.3939, + "step": 7489 + }, + { + "epoch": 0.268232850466453, + "grad_norm": 1.85292387008667, + "learning_rate": 0.00017168194045897767, + "loss": 1.434, + "step": 7490 + }, + { + "epoch": 0.2682686625960213, + "grad_norm": 1.6793440580368042, + "learning_rate": 0.000171673852462071, + "loss": 1.3301, + "step": 7491 + }, + { + "epoch": 0.26830447472558955, + "grad_norm": 1.561973214149475, + "learning_rate": 0.00017166576350088506, + "loss": 1.2769, + "step": 7492 + }, + { + "epoch": 0.2683402868551578, + "grad_norm": 1.6289607286453247, + "learning_rate": 0.0001716576735755287, + "loss": 1.4536, + "step": 7493 + }, + { + "epoch": 0.26837609898472614, + "grad_norm": 1.7525798082351685, + "learning_rate": 0.00017164958268611077, + "loss": 1.3704, + "step": 7494 + }, + { + "epoch": 0.2684119111142944, + "grad_norm": 1.8854219913482666, + "learning_rate": 0.00017164149083274017, + "loss": 1.6748, + "step": 7495 + }, + { + "epoch": 0.2684477232438627, + "grad_norm": 1.3732788562774658, + "learning_rate": 0.0001716333980155257, + "loss": 1.5534, + "step": 7496 + }, + { + "epoch": 0.268483535373431, + "grad_norm": 1.6843669414520264, + "learning_rate": 0.00017162530423457626, + "loss": 1.7485, + "step": 7497 + }, + { + "epoch": 0.2685193475029993, + "grad_norm": 1.746842861175537, + "learning_rate": 0.00017161720949000075, + "loss": 1.7009, + "step": 7498 + }, + { + "epoch": 0.26855515963256754, + "grad_norm": 2.6704607009887695, + "learning_rate": 0.00017160911378190808, + "loss": 1.4903, + "step": 7499 + }, + { + "epoch": 0.2685909717621358, + "grad_norm": 1.9854313135147095, + "learning_rate": 0.00017160101711040713, + "loss": 1.7229, + "step": 7500 + }, + { + "epoch": 0.26862678389170414, + "grad_norm": 2.014387845993042, + "learning_rate": 0.00017159291947560682, + "loss": 1.5854, + "step": 7501 + }, + { + "epoch": 0.2686625960212724, + "grad_norm": 1.4004807472229004, + "learning_rate": 0.00017158482087761617, + "loss": 1.5235, + "step": 7502 + }, + { + "epoch": 0.2686984081508407, + "grad_norm": 1.545235276222229, + "learning_rate": 0.0001715767213165441, + "loss": 1.6951, + "step": 7503 + }, + { + "epoch": 0.268734220280409, + "grad_norm": 1.4803507328033447, + "learning_rate": 0.00017156862079249953, + "loss": 1.7183, + "step": 7504 + }, + { + "epoch": 0.26877003240997727, + "grad_norm": 1.6288284063339233, + "learning_rate": 0.00017156051930559155, + "loss": 1.6609, + "step": 7505 + }, + { + "epoch": 0.26880584453954554, + "grad_norm": 1.784579873085022, + "learning_rate": 0.00017155241685592903, + "loss": 1.6495, + "step": 7506 + }, + { + "epoch": 0.2688416566691138, + "grad_norm": 1.2053595781326294, + "learning_rate": 0.00017154431344362106, + "loss": 1.4678, + "step": 7507 + }, + { + "epoch": 0.26887746879868213, + "grad_norm": 1.9320080280303955, + "learning_rate": 0.00017153620906877666, + "loss": 1.6605, + "step": 7508 + }, + { + "epoch": 0.2689132809282504, + "grad_norm": 1.4452803134918213, + "learning_rate": 0.00017152810373150478, + "loss": 1.6794, + "step": 7509 + }, + { + "epoch": 0.26894909305781867, + "grad_norm": 1.9542020559310913, + "learning_rate": 0.00017151999743191456, + "loss": 1.4616, + "step": 7510 + }, + { + "epoch": 0.268984905187387, + "grad_norm": 2.002528667449951, + "learning_rate": 0.00017151189017011503, + "loss": 1.7598, + "step": 7511 + }, + { + "epoch": 0.26902071731695526, + "grad_norm": 1.4809521436691284, + "learning_rate": 0.00017150378194621529, + "loss": 1.5741, + "step": 7512 + }, + { + "epoch": 0.26905652944652353, + "grad_norm": 2.5902462005615234, + "learning_rate": 0.0001714956727603244, + "loss": 1.5187, + "step": 7513 + }, + { + "epoch": 0.2690923415760918, + "grad_norm": 1.7048165798187256, + "learning_rate": 0.0001714875626125514, + "loss": 1.5129, + "step": 7514 + }, + { + "epoch": 0.2691281537056601, + "grad_norm": 1.714754343032837, + "learning_rate": 0.0001714794515030055, + "loss": 1.9192, + "step": 7515 + }, + { + "epoch": 0.2691639658352284, + "grad_norm": 1.5746515989303589, + "learning_rate": 0.00017147133943179577, + "loss": 1.5068, + "step": 7516 + }, + { + "epoch": 0.26919977796479666, + "grad_norm": 1.5344665050506592, + "learning_rate": 0.00017146322639903137, + "loss": 1.6429, + "step": 7517 + }, + { + "epoch": 0.269235590094365, + "grad_norm": 1.722382664680481, + "learning_rate": 0.00017145511240482142, + "loss": 1.3101, + "step": 7518 + }, + { + "epoch": 0.26927140222393325, + "grad_norm": 1.7240898609161377, + "learning_rate": 0.00017144699744927507, + "loss": 1.657, + "step": 7519 + }, + { + "epoch": 0.2693072143535015, + "grad_norm": 2.000774383544922, + "learning_rate": 0.0001714388815325016, + "loss": 1.4911, + "step": 7520 + }, + { + "epoch": 0.2693430264830698, + "grad_norm": 2.6266844272613525, + "learning_rate": 0.0001714307646546101, + "loss": 1.7797, + "step": 7521 + }, + { + "epoch": 0.2693788386126381, + "grad_norm": 1.6065651178359985, + "learning_rate": 0.00017142264681570978, + "loss": 1.6816, + "step": 7522 + }, + { + "epoch": 0.2694146507422064, + "grad_norm": 1.504185438156128, + "learning_rate": 0.00017141452801590988, + "loss": 1.2781, + "step": 7523 + }, + { + "epoch": 0.26945046287177465, + "grad_norm": 1.5775504112243652, + "learning_rate": 0.00017140640825531967, + "loss": 1.6476, + "step": 7524 + }, + { + "epoch": 0.269486275001343, + "grad_norm": 1.482826828956604, + "learning_rate": 0.0001713982875340483, + "loss": 1.7394, + "step": 7525 + }, + { + "epoch": 0.26952208713091125, + "grad_norm": 2.138429641723633, + "learning_rate": 0.00017139016585220512, + "loss": 1.6288, + "step": 7526 + }, + { + "epoch": 0.2695578992604795, + "grad_norm": 1.5291169881820679, + "learning_rate": 0.0001713820432098993, + "loss": 1.3784, + "step": 7527 + }, + { + "epoch": 0.2695937113900478, + "grad_norm": 1.7900766134262085, + "learning_rate": 0.00017137391960724013, + "loss": 1.4586, + "step": 7528 + }, + { + "epoch": 0.2696295235196161, + "grad_norm": 1.9882597923278809, + "learning_rate": 0.000171365795044337, + "loss": 1.5581, + "step": 7529 + }, + { + "epoch": 0.2696653356491844, + "grad_norm": 1.6502845287322998, + "learning_rate": 0.00017135766952129913, + "loss": 1.4082, + "step": 7530 + }, + { + "epoch": 0.26970114777875265, + "grad_norm": 1.6091289520263672, + "learning_rate": 0.00017134954303823588, + "loss": 1.5386, + "step": 7531 + }, + { + "epoch": 0.269736959908321, + "grad_norm": 1.4166127443313599, + "learning_rate": 0.00017134141559525654, + "loss": 1.4206, + "step": 7532 + }, + { + "epoch": 0.26977277203788924, + "grad_norm": 1.5068105459213257, + "learning_rate": 0.00017133328719247048, + "loss": 1.3298, + "step": 7533 + }, + { + "epoch": 0.2698085841674575, + "grad_norm": 1.52097749710083, + "learning_rate": 0.00017132515782998704, + "loss": 1.3535, + "step": 7534 + }, + { + "epoch": 0.2698443962970258, + "grad_norm": 1.7753351926803589, + "learning_rate": 0.00017131702750791564, + "loss": 1.2874, + "step": 7535 + }, + { + "epoch": 0.2698802084265941, + "grad_norm": 1.4764755964279175, + "learning_rate": 0.0001713088962263656, + "loss": 1.6138, + "step": 7536 + }, + { + "epoch": 0.2699160205561624, + "grad_norm": 1.4922411441802979, + "learning_rate": 0.00017130076398544635, + "loss": 1.7891, + "step": 7537 + }, + { + "epoch": 0.26995183268573064, + "grad_norm": 1.7627573013305664, + "learning_rate": 0.0001712926307852673, + "loss": 1.4984, + "step": 7538 + }, + { + "epoch": 0.26998764481529897, + "grad_norm": 1.7921463251113892, + "learning_rate": 0.00017128449662593786, + "loss": 1.5461, + "step": 7539 + }, + { + "epoch": 0.27002345694486724, + "grad_norm": 1.4542369842529297, + "learning_rate": 0.00017127636150756747, + "loss": 1.7536, + "step": 7540 + }, + { + "epoch": 0.2700592690744355, + "grad_norm": 1.5525248050689697, + "learning_rate": 0.00017126822543026555, + "loss": 1.6489, + "step": 7541 + }, + { + "epoch": 0.2700950812040038, + "grad_norm": 2.08571720123291, + "learning_rate": 0.00017126008839414163, + "loss": 1.8435, + "step": 7542 + }, + { + "epoch": 0.2701308933335721, + "grad_norm": 1.3870586156845093, + "learning_rate": 0.00017125195039930508, + "loss": 1.3148, + "step": 7543 + }, + { + "epoch": 0.27016670546314037, + "grad_norm": 3.187460422515869, + "learning_rate": 0.0001712438114458655, + "loss": 1.6014, + "step": 7544 + }, + { + "epoch": 0.27020251759270864, + "grad_norm": 1.4459199905395508, + "learning_rate": 0.00017123567153393233, + "loss": 1.4049, + "step": 7545 + }, + { + "epoch": 0.27023832972227696, + "grad_norm": 2.3334615230560303, + "learning_rate": 0.00017122753066361508, + "loss": 1.542, + "step": 7546 + }, + { + "epoch": 0.27027414185184523, + "grad_norm": 1.4789403676986694, + "learning_rate": 0.00017121938883502328, + "loss": 1.7571, + "step": 7547 + }, + { + "epoch": 0.2703099539814135, + "grad_norm": 1.5586892366409302, + "learning_rate": 0.00017121124604826645, + "loss": 1.6486, + "step": 7548 + }, + { + "epoch": 0.27034576611098177, + "grad_norm": 1.6130740642547607, + "learning_rate": 0.00017120310230345418, + "loss": 1.3414, + "step": 7549 + }, + { + "epoch": 0.2703815782405501, + "grad_norm": 1.3961517810821533, + "learning_rate": 0.000171194957600696, + "loss": 1.4985, + "step": 7550 + }, + { + "epoch": 0.27041739037011836, + "grad_norm": 1.2499228715896606, + "learning_rate": 0.00017118681194010153, + "loss": 1.2782, + "step": 7551 + }, + { + "epoch": 0.27045320249968663, + "grad_norm": 2.04305362701416, + "learning_rate": 0.0001711786653217803, + "loss": 1.409, + "step": 7552 + }, + { + "epoch": 0.27048901462925495, + "grad_norm": 1.6010410785675049, + "learning_rate": 0.00017117051774584194, + "loss": 1.324, + "step": 7553 + }, + { + "epoch": 0.2705248267588232, + "grad_norm": 1.6431407928466797, + "learning_rate": 0.00017116236921239607, + "loss": 1.4098, + "step": 7554 + }, + { + "epoch": 0.2705606388883915, + "grad_norm": 1.8824855089187622, + "learning_rate": 0.00017115421972155234, + "loss": 1.498, + "step": 7555 + }, + { + "epoch": 0.27059645101795976, + "grad_norm": 1.4907480478286743, + "learning_rate": 0.00017114606927342036, + "loss": 1.4708, + "step": 7556 + }, + { + "epoch": 0.2706322631475281, + "grad_norm": 1.5933283567428589, + "learning_rate": 0.0001711379178681098, + "loss": 1.3108, + "step": 7557 + }, + { + "epoch": 0.27066807527709635, + "grad_norm": 1.7678366899490356, + "learning_rate": 0.00017112976550573026, + "loss": 1.6437, + "step": 7558 + }, + { + "epoch": 0.2707038874066646, + "grad_norm": 1.8513715267181396, + "learning_rate": 0.00017112161218639152, + "loss": 1.6796, + "step": 7559 + }, + { + "epoch": 0.27073969953623295, + "grad_norm": 1.9389971494674683, + "learning_rate": 0.00017111345791020324, + "loss": 1.5358, + "step": 7560 + }, + { + "epoch": 0.2707755116658012, + "grad_norm": 1.4210546016693115, + "learning_rate": 0.0001711053026772751, + "loss": 1.546, + "step": 7561 + }, + { + "epoch": 0.2708113237953695, + "grad_norm": 2.9331257343292236, + "learning_rate": 0.00017109714648771683, + "loss": 1.4633, + "step": 7562 + }, + { + "epoch": 0.27084713592493775, + "grad_norm": 2.1464321613311768, + "learning_rate": 0.00017108898934163814, + "loss": 1.9307, + "step": 7563 + }, + { + "epoch": 0.2708829480545061, + "grad_norm": 1.5199549198150635, + "learning_rate": 0.0001710808312391488, + "loss": 1.1444, + "step": 7564 + }, + { + "epoch": 0.27091876018407435, + "grad_norm": 1.5869702100753784, + "learning_rate": 0.0001710726721803586, + "loss": 1.3033, + "step": 7565 + }, + { + "epoch": 0.2709545723136426, + "grad_norm": 1.7587640285491943, + "learning_rate": 0.00017106451216537723, + "loss": 1.5064, + "step": 7566 + }, + { + "epoch": 0.27099038444321094, + "grad_norm": 1.4721653461456299, + "learning_rate": 0.00017105635119431457, + "loss": 1.4631, + "step": 7567 + }, + { + "epoch": 0.2710261965727792, + "grad_norm": 1.7774546146392822, + "learning_rate": 0.0001710481892672803, + "loss": 1.5663, + "step": 7568 + }, + { + "epoch": 0.2710620087023475, + "grad_norm": 1.6262949705123901, + "learning_rate": 0.00017104002638438433, + "loss": 1.7109, + "step": 7569 + }, + { + "epoch": 0.27109782083191575, + "grad_norm": 1.9079837799072266, + "learning_rate": 0.00017103186254573642, + "loss": 1.5319, + "step": 7570 + }, + { + "epoch": 0.2711336329614841, + "grad_norm": 2.4740493297576904, + "learning_rate": 0.00017102369775144643, + "loss": 1.4753, + "step": 7571 + }, + { + "epoch": 0.27116944509105234, + "grad_norm": 1.6783044338226318, + "learning_rate": 0.0001710155320016242, + "loss": 1.6086, + "step": 7572 + }, + { + "epoch": 0.2712052572206206, + "grad_norm": 1.438635230064392, + "learning_rate": 0.00017100736529637958, + "loss": 1.4598, + "step": 7573 + }, + { + "epoch": 0.27124106935018893, + "grad_norm": 1.3612160682678223, + "learning_rate": 0.0001709991976358225, + "loss": 1.766, + "step": 7574 + }, + { + "epoch": 0.2712768814797572, + "grad_norm": 2.465081214904785, + "learning_rate": 0.00017099102902006275, + "loss": 1.5949, + "step": 7575 + }, + { + "epoch": 0.2713126936093255, + "grad_norm": 2.009199380874634, + "learning_rate": 0.00017098285944921028, + "loss": 1.9, + "step": 7576 + }, + { + "epoch": 0.27134850573889374, + "grad_norm": 1.7520637512207031, + "learning_rate": 0.00017097468892337503, + "loss": 1.6389, + "step": 7577 + }, + { + "epoch": 0.27138431786846207, + "grad_norm": 1.6367089748382568, + "learning_rate": 0.00017096651744266686, + "loss": 1.7878, + "step": 7578 + }, + { + "epoch": 0.27142012999803034, + "grad_norm": 1.7914072275161743, + "learning_rate": 0.00017095834500719574, + "loss": 1.5384, + "step": 7579 + }, + { + "epoch": 0.2714559421275986, + "grad_norm": 1.9473427534103394, + "learning_rate": 0.00017095017161707164, + "loss": 1.5749, + "step": 7580 + }, + { + "epoch": 0.27149175425716693, + "grad_norm": 1.9787449836730957, + "learning_rate": 0.00017094199727240447, + "loss": 1.6192, + "step": 7581 + }, + { + "epoch": 0.2715275663867352, + "grad_norm": 1.2734183073043823, + "learning_rate": 0.00017093382197330427, + "loss": 1.3988, + "step": 7582 + }, + { + "epoch": 0.27156337851630347, + "grad_norm": 1.445749044418335, + "learning_rate": 0.00017092564571988096, + "loss": 1.4681, + "step": 7583 + }, + { + "epoch": 0.27159919064587174, + "grad_norm": 1.8988313674926758, + "learning_rate": 0.0001709174685122446, + "loss": 1.3218, + "step": 7584 + }, + { + "epoch": 0.27163500277544006, + "grad_norm": 1.7073367834091187, + "learning_rate": 0.00017090929035050513, + "loss": 1.6704, + "step": 7585 + }, + { + "epoch": 0.27167081490500833, + "grad_norm": 1.4656105041503906, + "learning_rate": 0.00017090111123477266, + "loss": 1.4475, + "step": 7586 + }, + { + "epoch": 0.2717066270345766, + "grad_norm": 2.0253167152404785, + "learning_rate": 0.0001708929311651572, + "loss": 1.7153, + "step": 7587 + }, + { + "epoch": 0.2717424391641449, + "grad_norm": 1.6637212038040161, + "learning_rate": 0.0001708847501417688, + "loss": 1.4888, + "step": 7588 + }, + { + "epoch": 0.2717782512937132, + "grad_norm": 1.6039817333221436, + "learning_rate": 0.00017087656816471754, + "loss": 1.2485, + "step": 7589 + }, + { + "epoch": 0.27181406342328146, + "grad_norm": 1.811334490776062, + "learning_rate": 0.00017086838523411343, + "loss": 1.7239, + "step": 7590 + }, + { + "epoch": 0.27184987555284973, + "grad_norm": 1.2404311895370483, + "learning_rate": 0.00017086020135006664, + "loss": 1.2436, + "step": 7591 + }, + { + "epoch": 0.27188568768241805, + "grad_norm": 1.3640433549880981, + "learning_rate": 0.00017085201651268722, + "loss": 1.4845, + "step": 7592 + }, + { + "epoch": 0.2719214998119863, + "grad_norm": 2.0693013668060303, + "learning_rate": 0.00017084383072208534, + "loss": 1.7518, + "step": 7593 + }, + { + "epoch": 0.2719573119415546, + "grad_norm": 1.524749994277954, + "learning_rate": 0.00017083564397837108, + "loss": 1.6015, + "step": 7594 + }, + { + "epoch": 0.2719931240711229, + "grad_norm": 1.4143753051757812, + "learning_rate": 0.00017082745628165463, + "loss": 1.6716, + "step": 7595 + }, + { + "epoch": 0.2720289362006912, + "grad_norm": 1.6675052642822266, + "learning_rate": 0.0001708192676320461, + "loss": 1.3527, + "step": 7596 + }, + { + "epoch": 0.27206474833025945, + "grad_norm": 1.9079591035842896, + "learning_rate": 0.00017081107802965564, + "loss": 1.601, + "step": 7597 + }, + { + "epoch": 0.2721005604598277, + "grad_norm": 1.47043776512146, + "learning_rate": 0.0001708028874745935, + "loss": 1.6689, + "step": 7598 + }, + { + "epoch": 0.27213637258939605, + "grad_norm": 1.9490792751312256, + "learning_rate": 0.0001707946959669698, + "loss": 1.3982, + "step": 7599 + }, + { + "epoch": 0.2721721847189643, + "grad_norm": 1.7489452362060547, + "learning_rate": 0.00017078650350689482, + "loss": 1.4841, + "step": 7600 + }, + { + "epoch": 0.2722079968485326, + "grad_norm": 1.3652783632278442, + "learning_rate": 0.00017077831009447878, + "loss": 1.4928, + "step": 7601 + }, + { + "epoch": 0.2722438089781009, + "grad_norm": 1.3833941221237183, + "learning_rate": 0.00017077011572983183, + "loss": 1.2949, + "step": 7602 + }, + { + "epoch": 0.2722796211076692, + "grad_norm": 1.9464653730392456, + "learning_rate": 0.00017076192041306425, + "loss": 1.6098, + "step": 7603 + }, + { + "epoch": 0.27231543323723745, + "grad_norm": 1.9903013706207275, + "learning_rate": 0.00017075372414428633, + "loss": 1.9585, + "step": 7604 + }, + { + "epoch": 0.2723512453668057, + "grad_norm": 1.821960210800171, + "learning_rate": 0.00017074552692360832, + "loss": 1.3332, + "step": 7605 + }, + { + "epoch": 0.27238705749637404, + "grad_norm": 2.120266914367676, + "learning_rate": 0.00017073732875114045, + "loss": 1.465, + "step": 7606 + }, + { + "epoch": 0.2724228696259423, + "grad_norm": 1.6445937156677246, + "learning_rate": 0.0001707291296269931, + "loss": 1.5826, + "step": 7607 + }, + { + "epoch": 0.2724586817555106, + "grad_norm": 1.6000337600708008, + "learning_rate": 0.00017072092955127657, + "loss": 1.4611, + "step": 7608 + }, + { + "epoch": 0.2724944938850789, + "grad_norm": 1.9415369033813477, + "learning_rate": 0.00017071272852410113, + "loss": 1.5284, + "step": 7609 + }, + { + "epoch": 0.2725303060146472, + "grad_norm": 1.6123709678649902, + "learning_rate": 0.00017070452654557717, + "loss": 1.5851, + "step": 7610 + }, + { + "epoch": 0.27256611814421544, + "grad_norm": 1.5228267908096313, + "learning_rate": 0.00017069632361581497, + "loss": 1.8117, + "step": 7611 + }, + { + "epoch": 0.2726019302737837, + "grad_norm": 1.7121150493621826, + "learning_rate": 0.00017068811973492497, + "loss": 1.6039, + "step": 7612 + }, + { + "epoch": 0.27263774240335203, + "grad_norm": 1.8781545162200928, + "learning_rate": 0.00017067991490301744, + "loss": 1.5765, + "step": 7613 + }, + { + "epoch": 0.2726735545329203, + "grad_norm": 2.0894546508789062, + "learning_rate": 0.00017067170912020286, + "loss": 1.5713, + "step": 7614 + }, + { + "epoch": 0.2727093666624886, + "grad_norm": 1.5253492593765259, + "learning_rate": 0.0001706635023865916, + "loss": 1.4626, + "step": 7615 + }, + { + "epoch": 0.2727451787920569, + "grad_norm": 1.6403868198394775, + "learning_rate": 0.00017065529470229403, + "loss": 1.5156, + "step": 7616 + }, + { + "epoch": 0.27278099092162517, + "grad_norm": 1.4445627927780151, + "learning_rate": 0.00017064708606742067, + "loss": 1.7814, + "step": 7617 + }, + { + "epoch": 0.27281680305119343, + "grad_norm": 1.6161423921585083, + "learning_rate": 0.00017063887648208185, + "loss": 1.8426, + "step": 7618 + }, + { + "epoch": 0.2728526151807617, + "grad_norm": 2.159152030944824, + "learning_rate": 0.00017063066594638805, + "loss": 1.6711, + "step": 7619 + }, + { + "epoch": 0.27288842731033003, + "grad_norm": 1.585121989250183, + "learning_rate": 0.0001706224544604498, + "loss": 1.6355, + "step": 7620 + }, + { + "epoch": 0.2729242394398983, + "grad_norm": 2.4911491870880127, + "learning_rate": 0.00017061424202437748, + "loss": 1.8799, + "step": 7621 + }, + { + "epoch": 0.27296005156946657, + "grad_norm": 1.3590527772903442, + "learning_rate": 0.00017060602863828165, + "loss": 1.3842, + "step": 7622 + }, + { + "epoch": 0.2729958636990349, + "grad_norm": 1.6807591915130615, + "learning_rate": 0.00017059781430227275, + "loss": 1.8145, + "step": 7623 + }, + { + "epoch": 0.27303167582860316, + "grad_norm": 1.9049506187438965, + "learning_rate": 0.00017058959901646134, + "loss": 1.5613, + "step": 7624 + }, + { + "epoch": 0.27306748795817143, + "grad_norm": 1.9668833017349243, + "learning_rate": 0.00017058138278095792, + "loss": 1.3905, + "step": 7625 + }, + { + "epoch": 0.2731033000877397, + "grad_norm": 1.8975080251693726, + "learning_rate": 0.00017057316559587307, + "loss": 1.6166, + "step": 7626 + }, + { + "epoch": 0.273139112217308, + "grad_norm": 1.7440699338912964, + "learning_rate": 0.00017056494746131725, + "loss": 1.5235, + "step": 7627 + }, + { + "epoch": 0.2731749243468763, + "grad_norm": 1.5745868682861328, + "learning_rate": 0.00017055672837740113, + "loss": 1.4461, + "step": 7628 + }, + { + "epoch": 0.27321073647644456, + "grad_norm": 1.4910067319869995, + "learning_rate": 0.00017054850834423522, + "loss": 1.6589, + "step": 7629 + }, + { + "epoch": 0.27324654860601283, + "grad_norm": 1.9283604621887207, + "learning_rate": 0.00017054028736193013, + "loss": 1.3877, + "step": 7630 + }, + { + "epoch": 0.27328236073558115, + "grad_norm": 1.5712474584579468, + "learning_rate": 0.00017053206543059647, + "loss": 1.6484, + "step": 7631 + }, + { + "epoch": 0.2733181728651494, + "grad_norm": 1.4814081192016602, + "learning_rate": 0.00017052384255034485, + "loss": 1.58, + "step": 7632 + }, + { + "epoch": 0.2733539849947177, + "grad_norm": 2.332150936126709, + "learning_rate": 0.00017051561872128592, + "loss": 1.4022, + "step": 7633 + }, + { + "epoch": 0.273389797124286, + "grad_norm": 1.6437220573425293, + "learning_rate": 0.00017050739394353028, + "loss": 1.4863, + "step": 7634 + }, + { + "epoch": 0.2734256092538543, + "grad_norm": 2.036505937576294, + "learning_rate": 0.00017049916821718861, + "loss": 1.5958, + "step": 7635 + }, + { + "epoch": 0.27346142138342255, + "grad_norm": 1.2595261335372925, + "learning_rate": 0.00017049094154237155, + "loss": 1.4669, + "step": 7636 + }, + { + "epoch": 0.2734972335129908, + "grad_norm": 1.5755056142807007, + "learning_rate": 0.0001704827139191898, + "loss": 1.6032, + "step": 7637 + }, + { + "epoch": 0.27353304564255915, + "grad_norm": 1.5014159679412842, + "learning_rate": 0.00017047448534775406, + "loss": 1.6173, + "step": 7638 + }, + { + "epoch": 0.2735688577721274, + "grad_norm": 1.7377209663391113, + "learning_rate": 0.00017046625582817503, + "loss": 1.4034, + "step": 7639 + }, + { + "epoch": 0.2736046699016957, + "grad_norm": 1.9546681642532349, + "learning_rate": 0.00017045802536056344, + "loss": 1.7961, + "step": 7640 + }, + { + "epoch": 0.273640482031264, + "grad_norm": 1.5216879844665527, + "learning_rate": 0.00017044979394502995, + "loss": 1.6273, + "step": 7641 + }, + { + "epoch": 0.2736762941608323, + "grad_norm": 2.2344608306884766, + "learning_rate": 0.0001704415615816854, + "loss": 1.5878, + "step": 7642 + }, + { + "epoch": 0.27371210629040055, + "grad_norm": 1.5303434133529663, + "learning_rate": 0.0001704333282706405, + "loss": 1.5562, + "step": 7643 + }, + { + "epoch": 0.2737479184199688, + "grad_norm": 2.065340280532837, + "learning_rate": 0.00017042509401200598, + "loss": 1.6453, + "step": 7644 + }, + { + "epoch": 0.27378373054953714, + "grad_norm": 1.7603098154067993, + "learning_rate": 0.00017041685880589272, + "loss": 1.7976, + "step": 7645 + }, + { + "epoch": 0.2738195426791054, + "grad_norm": 1.29735267162323, + "learning_rate": 0.0001704086226524114, + "loss": 1.4381, + "step": 7646 + }, + { + "epoch": 0.2738553548086737, + "grad_norm": 1.7243160009384155, + "learning_rate": 0.0001704003855516729, + "loss": 1.6664, + "step": 7647 + }, + { + "epoch": 0.273891166938242, + "grad_norm": 2.5377590656280518, + "learning_rate": 0.00017039214750378805, + "loss": 1.7078, + "step": 7648 + }, + { + "epoch": 0.2739269790678103, + "grad_norm": 2.229074478149414, + "learning_rate": 0.00017038390850886766, + "loss": 1.4732, + "step": 7649 + }, + { + "epoch": 0.27396279119737854, + "grad_norm": 1.3973140716552734, + "learning_rate": 0.00017037566856702255, + "loss": 1.3579, + "step": 7650 + }, + { + "epoch": 0.2739986033269468, + "grad_norm": 1.4385688304901123, + "learning_rate": 0.00017036742767836355, + "loss": 1.6784, + "step": 7651 + }, + { + "epoch": 0.27403441545651513, + "grad_norm": 1.4046844244003296, + "learning_rate": 0.00017035918584300163, + "loss": 1.4343, + "step": 7652 + }, + { + "epoch": 0.2740702275860834, + "grad_norm": 1.4777336120605469, + "learning_rate": 0.00017035094306104762, + "loss": 1.752, + "step": 7653 + }, + { + "epoch": 0.2741060397156517, + "grad_norm": 1.6677403450012207, + "learning_rate": 0.0001703426993326124, + "loss": 1.5784, + "step": 7654 + }, + { + "epoch": 0.27414185184522, + "grad_norm": 1.8168667554855347, + "learning_rate": 0.0001703344546578069, + "loss": 1.7299, + "step": 7655 + }, + { + "epoch": 0.27417766397478827, + "grad_norm": 1.6721216440200806, + "learning_rate": 0.00017032620903674207, + "loss": 1.5574, + "step": 7656 + }, + { + "epoch": 0.27421347610435653, + "grad_norm": 1.7680671215057373, + "learning_rate": 0.0001703179624695288, + "loss": 1.3656, + "step": 7657 + }, + { + "epoch": 0.2742492882339248, + "grad_norm": 2.2017982006073, + "learning_rate": 0.00017030971495627802, + "loss": 1.5316, + "step": 7658 + }, + { + "epoch": 0.27428510036349313, + "grad_norm": 1.4911608695983887, + "learning_rate": 0.00017030146649710072, + "loss": 1.6055, + "step": 7659 + }, + { + "epoch": 0.2743209124930614, + "grad_norm": 1.9482650756835938, + "learning_rate": 0.00017029321709210787, + "loss": 1.7288, + "step": 7660 + }, + { + "epoch": 0.27435672462262967, + "grad_norm": 1.816979169845581, + "learning_rate": 0.00017028496674141051, + "loss": 1.1743, + "step": 7661 + }, + { + "epoch": 0.274392536752198, + "grad_norm": 1.9508891105651855, + "learning_rate": 0.0001702767154451195, + "loss": 1.5504, + "step": 7662 + }, + { + "epoch": 0.27442834888176626, + "grad_norm": 1.4764585494995117, + "learning_rate": 0.000170268463203346, + "loss": 1.5337, + "step": 7663 + }, + { + "epoch": 0.27446416101133453, + "grad_norm": 1.4491671323776245, + "learning_rate": 0.00017026021001620095, + "loss": 1.6598, + "step": 7664 + }, + { + "epoch": 0.2744999731409028, + "grad_norm": 1.9225811958312988, + "learning_rate": 0.00017025195588379538, + "loss": 1.4929, + "step": 7665 + }, + { + "epoch": 0.2745357852704711, + "grad_norm": 1.355978012084961, + "learning_rate": 0.0001702437008062404, + "loss": 1.1573, + "step": 7666 + }, + { + "epoch": 0.2745715974000394, + "grad_norm": 1.796846866607666, + "learning_rate": 0.00017023544478364698, + "loss": 1.3795, + "step": 7667 + }, + { + "epoch": 0.27460740952960766, + "grad_norm": 1.581048846244812, + "learning_rate": 0.0001702271878161263, + "loss": 1.365, + "step": 7668 + }, + { + "epoch": 0.274643221659176, + "grad_norm": 1.55429208278656, + "learning_rate": 0.0001702189299037894, + "loss": 1.5851, + "step": 7669 + }, + { + "epoch": 0.27467903378874425, + "grad_norm": 1.660404920578003, + "learning_rate": 0.00017021067104674734, + "loss": 2.0028, + "step": 7670 + }, + { + "epoch": 0.2747148459183125, + "grad_norm": 2.531282663345337, + "learning_rate": 0.00017020241124511128, + "loss": 1.6508, + "step": 7671 + }, + { + "epoch": 0.2747506580478808, + "grad_norm": 1.8213257789611816, + "learning_rate": 0.0001701941504989923, + "loss": 1.4164, + "step": 7672 + }, + { + "epoch": 0.2747864701774491, + "grad_norm": 1.5226930379867554, + "learning_rate": 0.00017018588880850162, + "loss": 1.7048, + "step": 7673 + }, + { + "epoch": 0.2748222823070174, + "grad_norm": 1.236707329750061, + "learning_rate": 0.0001701776261737503, + "loss": 1.4652, + "step": 7674 + }, + { + "epoch": 0.27485809443658565, + "grad_norm": 1.5473086833953857, + "learning_rate": 0.00017016936259484953, + "loss": 1.5635, + "step": 7675 + }, + { + "epoch": 0.274893906566154, + "grad_norm": 1.5289727449417114, + "learning_rate": 0.00017016109807191056, + "loss": 1.5221, + "step": 7676 + }, + { + "epoch": 0.27492971869572225, + "grad_norm": 1.2557204961776733, + "learning_rate": 0.00017015283260504447, + "loss": 1.5362, + "step": 7677 + }, + { + "epoch": 0.2749655308252905, + "grad_norm": 1.6917392015457153, + "learning_rate": 0.00017014456619436253, + "loss": 1.5005, + "step": 7678 + }, + { + "epoch": 0.2750013429548588, + "grad_norm": 1.553214192390442, + "learning_rate": 0.00017013629883997594, + "loss": 1.6225, + "step": 7679 + }, + { + "epoch": 0.2750371550844271, + "grad_norm": 1.3000906705856323, + "learning_rate": 0.00017012803054199587, + "loss": 1.6618, + "step": 7680 + }, + { + "epoch": 0.2750729672139954, + "grad_norm": 1.7456116676330566, + "learning_rate": 0.00017011976130053367, + "loss": 1.4712, + "step": 7681 + }, + { + "epoch": 0.27510877934356365, + "grad_norm": 1.459320068359375, + "learning_rate": 0.00017011149111570051, + "loss": 1.6217, + "step": 7682 + }, + { + "epoch": 0.27514459147313197, + "grad_norm": 2.354222297668457, + "learning_rate": 0.00017010321998760762, + "loss": 1.6253, + "step": 7683 + }, + { + "epoch": 0.27518040360270024, + "grad_norm": 1.62624192237854, + "learning_rate": 0.0001700949479163664, + "loss": 1.4179, + "step": 7684 + }, + { + "epoch": 0.2752162157322685, + "grad_norm": 1.8651334047317505, + "learning_rate": 0.00017008667490208803, + "loss": 1.4862, + "step": 7685 + }, + { + "epoch": 0.2752520278618368, + "grad_norm": 1.6018571853637695, + "learning_rate": 0.00017007840094488387, + "loss": 1.7953, + "step": 7686 + }, + { + "epoch": 0.2752878399914051, + "grad_norm": 1.353165626525879, + "learning_rate": 0.00017007012604486525, + "loss": 1.6099, + "step": 7687 + }, + { + "epoch": 0.2753236521209734, + "grad_norm": 1.3602633476257324, + "learning_rate": 0.0001700618502021434, + "loss": 1.5703, + "step": 7688 + }, + { + "epoch": 0.27535946425054164, + "grad_norm": 1.2508971691131592, + "learning_rate": 0.00017005357341682979, + "loss": 1.6003, + "step": 7689 + }, + { + "epoch": 0.27539527638010997, + "grad_norm": 1.8892033100128174, + "learning_rate": 0.0001700452956890357, + "loss": 1.7102, + "step": 7690 + }, + { + "epoch": 0.27543108850967823, + "grad_norm": 1.4698898792266846, + "learning_rate": 0.0001700370170188725, + "loss": 1.6364, + "step": 7691 + }, + { + "epoch": 0.2754669006392465, + "grad_norm": 1.6167817115783691, + "learning_rate": 0.00017002873740645157, + "loss": 1.7467, + "step": 7692 + }, + { + "epoch": 0.2755027127688148, + "grad_norm": 1.442500352859497, + "learning_rate": 0.00017002045685188431, + "loss": 1.4752, + "step": 7693 + }, + { + "epoch": 0.2755385248983831, + "grad_norm": 1.7867764234542847, + "learning_rate": 0.00017001217535528215, + "loss": 1.5237, + "step": 7694 + }, + { + "epoch": 0.27557433702795137, + "grad_norm": 1.4577064514160156, + "learning_rate": 0.00017000389291675644, + "loss": 1.683, + "step": 7695 + }, + { + "epoch": 0.27561014915751963, + "grad_norm": 1.7021827697753906, + "learning_rate": 0.00016999560953641867, + "loss": 1.7309, + "step": 7696 + }, + { + "epoch": 0.27564596128708796, + "grad_norm": 1.3312740325927734, + "learning_rate": 0.00016998732521438024, + "loss": 1.2283, + "step": 7697 + }, + { + "epoch": 0.27568177341665623, + "grad_norm": 1.972090482711792, + "learning_rate": 0.00016997903995075265, + "loss": 1.3293, + "step": 7698 + }, + { + "epoch": 0.2757175855462245, + "grad_norm": 1.9187233448028564, + "learning_rate": 0.00016997075374564733, + "loss": 1.3078, + "step": 7699 + }, + { + "epoch": 0.27575339767579277, + "grad_norm": 1.3226217031478882, + "learning_rate": 0.00016996246659917578, + "loss": 1.3863, + "step": 7700 + }, + { + "epoch": 0.2757892098053611, + "grad_norm": 2.0401716232299805, + "learning_rate": 0.0001699541785114495, + "loss": 1.4268, + "step": 7701 + }, + { + "epoch": 0.27582502193492936, + "grad_norm": 1.7860567569732666, + "learning_rate": 0.00016994588948257997, + "loss": 1.358, + "step": 7702 + }, + { + "epoch": 0.27586083406449763, + "grad_norm": 1.6787052154541016, + "learning_rate": 0.0001699375995126787, + "loss": 1.6829, + "step": 7703 + }, + { + "epoch": 0.27589664619406595, + "grad_norm": 1.7264974117279053, + "learning_rate": 0.00016992930860185726, + "loss": 1.5136, + "step": 7704 + }, + { + "epoch": 0.2759324583236342, + "grad_norm": 1.6791878938674927, + "learning_rate": 0.0001699210167502272, + "loss": 1.3216, + "step": 7705 + }, + { + "epoch": 0.2759682704532025, + "grad_norm": 1.2904554605484009, + "learning_rate": 0.00016991272395790007, + "loss": 1.3571, + "step": 7706 + }, + { + "epoch": 0.27600408258277076, + "grad_norm": 2.196139335632324, + "learning_rate": 0.00016990443022498735, + "loss": 1.5075, + "step": 7707 + }, + { + "epoch": 0.2760398947123391, + "grad_norm": 1.6233375072479248, + "learning_rate": 0.0001698961355516007, + "loss": 1.5892, + "step": 7708 + }, + { + "epoch": 0.27607570684190735, + "grad_norm": 1.7111704349517822, + "learning_rate": 0.00016988783993785177, + "loss": 1.6294, + "step": 7709 + }, + { + "epoch": 0.2761115189714756, + "grad_norm": 1.633927345275879, + "learning_rate": 0.00016987954338385202, + "loss": 1.3068, + "step": 7710 + }, + { + "epoch": 0.27614733110104395, + "grad_norm": 1.6509286165237427, + "learning_rate": 0.0001698712458897132, + "loss": 1.5484, + "step": 7711 + }, + { + "epoch": 0.2761831432306122, + "grad_norm": 1.8952949047088623, + "learning_rate": 0.0001698629474555469, + "loss": 1.4961, + "step": 7712 + }, + { + "epoch": 0.2762189553601805, + "grad_norm": 2.397242546081543, + "learning_rate": 0.00016985464808146473, + "loss": 1.8338, + "step": 7713 + }, + { + "epoch": 0.27625476748974875, + "grad_norm": 1.9433248043060303, + "learning_rate": 0.0001698463477675784, + "loss": 1.5395, + "step": 7714 + }, + { + "epoch": 0.2762905796193171, + "grad_norm": 2.032348394393921, + "learning_rate": 0.00016983804651399956, + "loss": 1.5128, + "step": 7715 + }, + { + "epoch": 0.27632639174888535, + "grad_norm": 2.3550405502319336, + "learning_rate": 0.00016982974432083986, + "loss": 1.4864, + "step": 7716 + }, + { + "epoch": 0.2763622038784536, + "grad_norm": 1.410760521888733, + "learning_rate": 0.00016982144118821103, + "loss": 1.6236, + "step": 7717 + }, + { + "epoch": 0.27639801600802194, + "grad_norm": 1.9511107206344604, + "learning_rate": 0.0001698131371162248, + "loss": 1.7513, + "step": 7718 + }, + { + "epoch": 0.2764338281375902, + "grad_norm": 1.6737018823623657, + "learning_rate": 0.00016980483210499286, + "loss": 1.5436, + "step": 7719 + }, + { + "epoch": 0.2764696402671585, + "grad_norm": 1.807410478591919, + "learning_rate": 0.00016979652615462692, + "loss": 1.773, + "step": 7720 + }, + { + "epoch": 0.27650545239672675, + "grad_norm": 1.6671122312545776, + "learning_rate": 0.00016978821926523873, + "loss": 1.5451, + "step": 7721 + }, + { + "epoch": 0.27654126452629507, + "grad_norm": 1.9303603172302246, + "learning_rate": 0.00016977991143694014, + "loss": 1.6225, + "step": 7722 + }, + { + "epoch": 0.27657707665586334, + "grad_norm": 1.4524554014205933, + "learning_rate": 0.00016977160266984283, + "loss": 1.4761, + "step": 7723 + }, + { + "epoch": 0.2766128887854316, + "grad_norm": 1.716299295425415, + "learning_rate": 0.00016976329296405855, + "loss": 1.3507, + "step": 7724 + }, + { + "epoch": 0.27664870091499993, + "grad_norm": 1.3396046161651611, + "learning_rate": 0.0001697549823196992, + "loss": 1.7153, + "step": 7725 + }, + { + "epoch": 0.2766845130445682, + "grad_norm": 1.991355299949646, + "learning_rate": 0.00016974667073687655, + "loss": 1.648, + "step": 7726 + }, + { + "epoch": 0.27672032517413647, + "grad_norm": 1.623319149017334, + "learning_rate": 0.00016973835821570236, + "loss": 1.5506, + "step": 7727 + }, + { + "epoch": 0.27675613730370474, + "grad_norm": 1.4190425872802734, + "learning_rate": 0.00016973004475628856, + "loss": 1.4248, + "step": 7728 + }, + { + "epoch": 0.27679194943327307, + "grad_norm": 1.6432379484176636, + "learning_rate": 0.00016972173035874693, + "loss": 1.7124, + "step": 7729 + }, + { + "epoch": 0.27682776156284133, + "grad_norm": 1.4056947231292725, + "learning_rate": 0.00016971341502318936, + "loss": 1.4815, + "step": 7730 + }, + { + "epoch": 0.2768635736924096, + "grad_norm": 1.9628760814666748, + "learning_rate": 0.00016970509874972774, + "loss": 1.7848, + "step": 7731 + }, + { + "epoch": 0.27689938582197793, + "grad_norm": 1.8493858575820923, + "learning_rate": 0.0001696967815384739, + "loss": 1.6528, + "step": 7732 + }, + { + "epoch": 0.2769351979515462, + "grad_norm": 1.7261875867843628, + "learning_rate": 0.0001696884633895398, + "loss": 1.5981, + "step": 7733 + }, + { + "epoch": 0.27697101008111447, + "grad_norm": 1.6887271404266357, + "learning_rate": 0.00016968014430303728, + "loss": 1.6599, + "step": 7734 + }, + { + "epoch": 0.27700682221068273, + "grad_norm": 1.777239203453064, + "learning_rate": 0.0001696718242790783, + "loss": 1.4756, + "step": 7735 + }, + { + "epoch": 0.27704263434025106, + "grad_norm": 1.546141266822815, + "learning_rate": 0.0001696635033177748, + "loss": 1.551, + "step": 7736 + }, + { + "epoch": 0.27707844646981933, + "grad_norm": 1.5476120710372925, + "learning_rate": 0.00016965518141923874, + "loss": 1.6505, + "step": 7737 + }, + { + "epoch": 0.2771142585993876, + "grad_norm": 1.809834361076355, + "learning_rate": 0.00016964685858358202, + "loss": 1.8532, + "step": 7738 + }, + { + "epoch": 0.2771500707289559, + "grad_norm": 2.0858800411224365, + "learning_rate": 0.0001696385348109167, + "loss": 1.7213, + "step": 7739 + }, + { + "epoch": 0.2771858828585242, + "grad_norm": 1.580114722251892, + "learning_rate": 0.0001696302101013547, + "loss": 1.6748, + "step": 7740 + }, + { + "epoch": 0.27722169498809246, + "grad_norm": 1.333909273147583, + "learning_rate": 0.00016962188445500807, + "loss": 1.632, + "step": 7741 + }, + { + "epoch": 0.27725750711766073, + "grad_norm": 1.7078574895858765, + "learning_rate": 0.00016961355787198875, + "loss": 1.7498, + "step": 7742 + }, + { + "epoch": 0.27729331924722905, + "grad_norm": 1.3296380043029785, + "learning_rate": 0.00016960523035240883, + "loss": 1.5679, + "step": 7743 + }, + { + "epoch": 0.2773291313767973, + "grad_norm": 1.2145541906356812, + "learning_rate": 0.0001695969018963803, + "loss": 1.5629, + "step": 7744 + }, + { + "epoch": 0.2773649435063656, + "grad_norm": 1.5315901041030884, + "learning_rate": 0.00016958857250401525, + "loss": 1.7075, + "step": 7745 + }, + { + "epoch": 0.2774007556359339, + "grad_norm": 1.5283602476119995, + "learning_rate": 0.0001695802421754257, + "loss": 1.7226, + "step": 7746 + }, + { + "epoch": 0.2774365677655022, + "grad_norm": 1.7161786556243896, + "learning_rate": 0.00016957191091072376, + "loss": 1.4375, + "step": 7747 + }, + { + "epoch": 0.27747237989507045, + "grad_norm": 1.6843448877334595, + "learning_rate": 0.0001695635787100215, + "loss": 1.3808, + "step": 7748 + }, + { + "epoch": 0.2775081920246387, + "grad_norm": 1.336690068244934, + "learning_rate": 0.000169555245573431, + "loss": 1.3474, + "step": 7749 + }, + { + "epoch": 0.27754400415420705, + "grad_norm": 1.7416538000106812, + "learning_rate": 0.0001695469115010644, + "loss": 1.2874, + "step": 7750 + }, + { + "epoch": 0.2775798162837753, + "grad_norm": 1.2422138452529907, + "learning_rate": 0.00016953857649303381, + "loss": 1.6665, + "step": 7751 + }, + { + "epoch": 0.2776156284133436, + "grad_norm": 1.4898710250854492, + "learning_rate": 0.00016953024054945138, + "loss": 1.3389, + "step": 7752 + }, + { + "epoch": 0.2776514405429119, + "grad_norm": 2.7302865982055664, + "learning_rate": 0.00016952190367042926, + "loss": 1.5453, + "step": 7753 + }, + { + "epoch": 0.2776872526724802, + "grad_norm": 3.756218671798706, + "learning_rate": 0.0001695135658560796, + "loss": 2.0241, + "step": 7754 + }, + { + "epoch": 0.27772306480204845, + "grad_norm": 1.5968868732452393, + "learning_rate": 0.00016950522710651455, + "loss": 1.335, + "step": 7755 + }, + { + "epoch": 0.2777588769316167, + "grad_norm": 1.5735868215560913, + "learning_rate": 0.00016949688742184637, + "loss": 1.5448, + "step": 7756 + }, + { + "epoch": 0.27779468906118504, + "grad_norm": 1.4668728113174438, + "learning_rate": 0.0001694885468021872, + "loss": 1.5775, + "step": 7757 + }, + { + "epoch": 0.2778305011907533, + "grad_norm": 1.89346182346344, + "learning_rate": 0.00016948020524764924, + "loss": 1.3931, + "step": 7758 + }, + { + "epoch": 0.2778663133203216, + "grad_norm": 1.6620376110076904, + "learning_rate": 0.00016947186275834475, + "loss": 1.6433, + "step": 7759 + }, + { + "epoch": 0.2779021254498899, + "grad_norm": 1.8120241165161133, + "learning_rate": 0.00016946351933438595, + "loss": 1.418, + "step": 7760 + }, + { + "epoch": 0.27793793757945817, + "grad_norm": 1.5897480249404907, + "learning_rate": 0.00016945517497588512, + "loss": 1.5138, + "step": 7761 + }, + { + "epoch": 0.27797374970902644, + "grad_norm": 2.4386966228485107, + "learning_rate": 0.00016944682968295452, + "loss": 1.5283, + "step": 7762 + }, + { + "epoch": 0.2780095618385947, + "grad_norm": 1.8611398935317993, + "learning_rate": 0.00016943848345570638, + "loss": 1.6865, + "step": 7763 + }, + { + "epoch": 0.27804537396816303, + "grad_norm": 1.441057562828064, + "learning_rate": 0.00016943013629425302, + "loss": 1.453, + "step": 7764 + }, + { + "epoch": 0.2780811860977313, + "grad_norm": 1.4636635780334473, + "learning_rate": 0.00016942178819870672, + "loss": 1.2875, + "step": 7765 + }, + { + "epoch": 0.27811699822729957, + "grad_norm": 1.454779028892517, + "learning_rate": 0.00016941343916917982, + "loss": 1.6621, + "step": 7766 + }, + { + "epoch": 0.2781528103568679, + "grad_norm": 1.4269800186157227, + "learning_rate": 0.00016940508920578463, + "loss": 1.7387, + "step": 7767 + }, + { + "epoch": 0.27818862248643617, + "grad_norm": 2.06813645362854, + "learning_rate": 0.00016939673830863348, + "loss": 1.4947, + "step": 7768 + }, + { + "epoch": 0.27822443461600443, + "grad_norm": 1.9147825241088867, + "learning_rate": 0.00016938838647783877, + "loss": 1.5197, + "step": 7769 + }, + { + "epoch": 0.2782602467455727, + "grad_norm": 2.2611420154571533, + "learning_rate": 0.00016938003371351278, + "loss": 1.7869, + "step": 7770 + }, + { + "epoch": 0.27829605887514103, + "grad_norm": 2.2714648246765137, + "learning_rate": 0.00016937168001576795, + "loss": 1.4491, + "step": 7771 + }, + { + "epoch": 0.2783318710047093, + "grad_norm": 1.5648062229156494, + "learning_rate": 0.00016936332538471666, + "loss": 1.3759, + "step": 7772 + }, + { + "epoch": 0.27836768313427757, + "grad_norm": 1.8871150016784668, + "learning_rate": 0.00016935496982047128, + "loss": 1.9112, + "step": 7773 + }, + { + "epoch": 0.2784034952638459, + "grad_norm": 1.4186395406723022, + "learning_rate": 0.00016934661332314424, + "loss": 1.7234, + "step": 7774 + }, + { + "epoch": 0.27843930739341416, + "grad_norm": 1.551112174987793, + "learning_rate": 0.000169338255892848, + "loss": 1.5181, + "step": 7775 + }, + { + "epoch": 0.27847511952298243, + "grad_norm": 1.92075514793396, + "learning_rate": 0.00016932989752969495, + "loss": 1.5922, + "step": 7776 + }, + { + "epoch": 0.2785109316525507, + "grad_norm": 1.918872356414795, + "learning_rate": 0.00016932153823379754, + "loss": 1.6461, + "step": 7777 + }, + { + "epoch": 0.278546743782119, + "grad_norm": 1.9076799154281616, + "learning_rate": 0.00016931317800526828, + "loss": 1.9765, + "step": 7778 + }, + { + "epoch": 0.2785825559116873, + "grad_norm": 1.9921611547470093, + "learning_rate": 0.0001693048168442196, + "loss": 1.6124, + "step": 7779 + }, + { + "epoch": 0.27861836804125556, + "grad_norm": 1.2875360250473022, + "learning_rate": 0.000169296454750764, + "loss": 1.4644, + "step": 7780 + }, + { + "epoch": 0.2786541801708239, + "grad_norm": 1.5666717290878296, + "learning_rate": 0.00016928809172501397, + "loss": 1.8456, + "step": 7781 + }, + { + "epoch": 0.27868999230039215, + "grad_norm": 2.1061506271362305, + "learning_rate": 0.00016927972776708208, + "loss": 1.3571, + "step": 7782 + }, + { + "epoch": 0.2787258044299604, + "grad_norm": 1.7013448476791382, + "learning_rate": 0.0001692713628770808, + "loss": 1.1378, + "step": 7783 + }, + { + "epoch": 0.2787616165595287, + "grad_norm": 2.2516982555389404, + "learning_rate": 0.00016926299705512273, + "loss": 1.2538, + "step": 7784 + }, + { + "epoch": 0.278797428689097, + "grad_norm": 1.5621269941329956, + "learning_rate": 0.0001692546303013203, + "loss": 1.8246, + "step": 7785 + }, + { + "epoch": 0.2788332408186653, + "grad_norm": 2.3607735633850098, + "learning_rate": 0.0001692462626157862, + "loss": 1.4497, + "step": 7786 + }, + { + "epoch": 0.27886905294823355, + "grad_norm": 2.1050877571105957, + "learning_rate": 0.00016923789399863294, + "loss": 1.4979, + "step": 7787 + }, + { + "epoch": 0.2789048650778019, + "grad_norm": 1.711255431175232, + "learning_rate": 0.00016922952444997313, + "loss": 1.5194, + "step": 7788 + }, + { + "epoch": 0.27894067720737015, + "grad_norm": 1.6491427421569824, + "learning_rate": 0.00016922115396991939, + "loss": 1.4156, + "step": 7789 + }, + { + "epoch": 0.2789764893369384, + "grad_norm": 1.5196592807769775, + "learning_rate": 0.00016921278255858425, + "loss": 1.4432, + "step": 7790 + }, + { + "epoch": 0.2790123014665067, + "grad_norm": 1.4955500364303589, + "learning_rate": 0.00016920441021608048, + "loss": 1.806, + "step": 7791 + }, + { + "epoch": 0.279048113596075, + "grad_norm": 1.6842684745788574, + "learning_rate": 0.0001691960369425206, + "loss": 1.628, + "step": 7792 + }, + { + "epoch": 0.2790839257256433, + "grad_norm": 1.859779953956604, + "learning_rate": 0.0001691876627380173, + "loss": 1.4083, + "step": 7793 + }, + { + "epoch": 0.27911973785521155, + "grad_norm": 1.8693050146102905, + "learning_rate": 0.00016917928760268325, + "loss": 1.6231, + "step": 7794 + }, + { + "epoch": 0.27915554998477987, + "grad_norm": 1.3143736124038696, + "learning_rate": 0.0001691709115366311, + "loss": 1.5244, + "step": 7795 + }, + { + "epoch": 0.27919136211434814, + "grad_norm": 1.5413780212402344, + "learning_rate": 0.00016916253453997358, + "loss": 1.6463, + "step": 7796 + }, + { + "epoch": 0.2792271742439164, + "grad_norm": 1.8122531175613403, + "learning_rate": 0.00016915415661282335, + "loss": 1.5876, + "step": 7797 + }, + { + "epoch": 0.2792629863734847, + "grad_norm": 2.2389962673187256, + "learning_rate": 0.00016914577775529316, + "loss": 1.3309, + "step": 7798 + }, + { + "epoch": 0.279298798503053, + "grad_norm": 1.547379732131958, + "learning_rate": 0.0001691373979674957, + "loss": 1.9148, + "step": 7799 + }, + { + "epoch": 0.27933461063262127, + "grad_norm": 1.9751660823822021, + "learning_rate": 0.00016912901724954377, + "loss": 1.7473, + "step": 7800 + }, + { + "epoch": 0.27937042276218954, + "grad_norm": 1.5748672485351562, + "learning_rate": 0.00016912063560155005, + "loss": 1.5901, + "step": 7801 + }, + { + "epoch": 0.27940623489175787, + "grad_norm": 2.0997776985168457, + "learning_rate": 0.00016911225302362738, + "loss": 1.5381, + "step": 7802 + }, + { + "epoch": 0.27944204702132613, + "grad_norm": 2.0922765731811523, + "learning_rate": 0.00016910386951588845, + "loss": 1.7502, + "step": 7803 + }, + { + "epoch": 0.2794778591508944, + "grad_norm": 1.5022773742675781, + "learning_rate": 0.0001690954850784461, + "loss": 1.9076, + "step": 7804 + }, + { + "epoch": 0.27951367128046267, + "grad_norm": 1.3663434982299805, + "learning_rate": 0.00016908709971141312, + "loss": 1.5557, + "step": 7805 + }, + { + "epoch": 0.279549483410031, + "grad_norm": 2.1710376739501953, + "learning_rate": 0.00016907871341490235, + "loss": 1.6564, + "step": 7806 + }, + { + "epoch": 0.27958529553959927, + "grad_norm": 2.2670748233795166, + "learning_rate": 0.00016907032618902661, + "loss": 1.3921, + "step": 7807 + }, + { + "epoch": 0.27962110766916753, + "grad_norm": 1.7037529945373535, + "learning_rate": 0.00016906193803389868, + "loss": 1.6198, + "step": 7808 + }, + { + "epoch": 0.27965691979873586, + "grad_norm": 2.3128836154937744, + "learning_rate": 0.00016905354894963147, + "loss": 1.8036, + "step": 7809 + }, + { + "epoch": 0.27969273192830413, + "grad_norm": 1.735877275466919, + "learning_rate": 0.00016904515893633785, + "loss": 1.5839, + "step": 7810 + }, + { + "epoch": 0.2797285440578724, + "grad_norm": 1.5727331638336182, + "learning_rate": 0.0001690367679941307, + "loss": 1.4397, + "step": 7811 + }, + { + "epoch": 0.27976435618744067, + "grad_norm": 1.5470871925354004, + "learning_rate": 0.00016902837612312285, + "loss": 1.8064, + "step": 7812 + }, + { + "epoch": 0.279800168317009, + "grad_norm": 1.5642430782318115, + "learning_rate": 0.00016901998332342726, + "loss": 1.6186, + "step": 7813 + }, + { + "epoch": 0.27983598044657726, + "grad_norm": 1.3714781999588013, + "learning_rate": 0.00016901158959515682, + "loss": 1.5868, + "step": 7814 + }, + { + "epoch": 0.27987179257614553, + "grad_norm": 1.6559131145477295, + "learning_rate": 0.00016900319493842446, + "loss": 1.3897, + "step": 7815 + }, + { + "epoch": 0.27990760470571385, + "grad_norm": 2.2328150272369385, + "learning_rate": 0.00016899479935334307, + "loss": 2.1561, + "step": 7816 + }, + { + "epoch": 0.2799434168352821, + "grad_norm": 1.605811357498169, + "learning_rate": 0.0001689864028400257, + "loss": 1.535, + "step": 7817 + }, + { + "epoch": 0.2799792289648504, + "grad_norm": 1.7465509176254272, + "learning_rate": 0.00016897800539858527, + "loss": 1.5555, + "step": 7818 + }, + { + "epoch": 0.28001504109441866, + "grad_norm": 1.5020278692245483, + "learning_rate": 0.00016896960702913476, + "loss": 1.6375, + "step": 7819 + }, + { + "epoch": 0.280050853223987, + "grad_norm": 1.255519151687622, + "learning_rate": 0.00016896120773178712, + "loss": 1.7075, + "step": 7820 + }, + { + "epoch": 0.28008666535355525, + "grad_norm": 1.8907737731933594, + "learning_rate": 0.00016895280750665542, + "loss": 1.7681, + "step": 7821 + }, + { + "epoch": 0.2801224774831235, + "grad_norm": 1.6590030193328857, + "learning_rate": 0.0001689444063538526, + "loss": 1.5601, + "step": 7822 + }, + { + "epoch": 0.2801582896126918, + "grad_norm": 2.1344289779663086, + "learning_rate": 0.00016893600427349173, + "loss": 1.3434, + "step": 7823 + }, + { + "epoch": 0.2801941017422601, + "grad_norm": 1.558802843093872, + "learning_rate": 0.00016892760126568584, + "loss": 1.4994, + "step": 7824 + }, + { + "epoch": 0.2802299138718284, + "grad_norm": 1.822415828704834, + "learning_rate": 0.00016891919733054802, + "loss": 1.5621, + "step": 7825 + }, + { + "epoch": 0.28026572600139665, + "grad_norm": 1.2544628381729126, + "learning_rate": 0.00016891079246819128, + "loss": 1.4667, + "step": 7826 + }, + { + "epoch": 0.280301538130965, + "grad_norm": 1.3893107175827026, + "learning_rate": 0.0001689023866787287, + "loss": 1.6372, + "step": 7827 + }, + { + "epoch": 0.28033735026053325, + "grad_norm": 1.3773599863052368, + "learning_rate": 0.00016889397996227342, + "loss": 1.5978, + "step": 7828 + }, + { + "epoch": 0.2803731623901015, + "grad_norm": 1.4866869449615479, + "learning_rate": 0.00016888557231893846, + "loss": 1.5474, + "step": 7829 + }, + { + "epoch": 0.2804089745196698, + "grad_norm": 1.25416100025177, + "learning_rate": 0.00016887716374883703, + "loss": 1.5429, + "step": 7830 + }, + { + "epoch": 0.2804447866492381, + "grad_norm": 1.3411985635757446, + "learning_rate": 0.0001688687542520822, + "loss": 1.601, + "step": 7831 + }, + { + "epoch": 0.2804805987788064, + "grad_norm": 1.2994372844696045, + "learning_rate": 0.0001688603438287871, + "loss": 1.4822, + "step": 7832 + }, + { + "epoch": 0.28051641090837465, + "grad_norm": 2.017587900161743, + "learning_rate": 0.00016885193247906488, + "loss": 1.3413, + "step": 7833 + }, + { + "epoch": 0.28055222303794297, + "grad_norm": 1.8447659015655518, + "learning_rate": 0.00016884352020302875, + "loss": 1.7149, + "step": 7834 + }, + { + "epoch": 0.28058803516751124, + "grad_norm": 1.4341516494750977, + "learning_rate": 0.00016883510700079182, + "loss": 1.5375, + "step": 7835 + }, + { + "epoch": 0.2806238472970795, + "grad_norm": 1.4541009664535522, + "learning_rate": 0.00016882669287246734, + "loss": 1.5869, + "step": 7836 + }, + { + "epoch": 0.2806596594266478, + "grad_norm": 1.8895456790924072, + "learning_rate": 0.0001688182778181685, + "loss": 1.647, + "step": 7837 + }, + { + "epoch": 0.2806954715562161, + "grad_norm": 2.3410048484802246, + "learning_rate": 0.0001688098618380085, + "loss": 1.3165, + "step": 7838 + }, + { + "epoch": 0.28073128368578437, + "grad_norm": 1.4924992322921753, + "learning_rate": 0.00016880144493210052, + "loss": 1.551, + "step": 7839 + }, + { + "epoch": 0.28076709581535264, + "grad_norm": 1.6559271812438965, + "learning_rate": 0.00016879302710055792, + "loss": 1.5487, + "step": 7840 + }, + { + "epoch": 0.28080290794492097, + "grad_norm": 1.4658722877502441, + "learning_rate": 0.0001687846083434938, + "loss": 1.7468, + "step": 7841 + }, + { + "epoch": 0.28083872007448923, + "grad_norm": 1.6023316383361816, + "learning_rate": 0.00016877618866102155, + "loss": 1.6474, + "step": 7842 + }, + { + "epoch": 0.2808745322040575, + "grad_norm": 2.7606184482574463, + "learning_rate": 0.0001687677680532544, + "loss": 1.3463, + "step": 7843 + }, + { + "epoch": 0.28091034433362577, + "grad_norm": 2.0281307697296143, + "learning_rate": 0.00016875934652030563, + "loss": 1.5334, + "step": 7844 + }, + { + "epoch": 0.2809461564631941, + "grad_norm": 1.7015283107757568, + "learning_rate": 0.00016875092406228853, + "loss": 1.4474, + "step": 7845 + }, + { + "epoch": 0.28098196859276237, + "grad_norm": 2.321729898452759, + "learning_rate": 0.00016874250067931644, + "loss": 1.7248, + "step": 7846 + }, + { + "epoch": 0.28101778072233063, + "grad_norm": 1.3899301290512085, + "learning_rate": 0.00016873407637150268, + "loss": 1.5622, + "step": 7847 + }, + { + "epoch": 0.28105359285189896, + "grad_norm": 2.0998470783233643, + "learning_rate": 0.00016872565113896056, + "loss": 1.318, + "step": 7848 + }, + { + "epoch": 0.28108940498146723, + "grad_norm": 1.7129113674163818, + "learning_rate": 0.00016871722498180346, + "loss": 1.6792, + "step": 7849 + }, + { + "epoch": 0.2811252171110355, + "grad_norm": 1.6951273679733276, + "learning_rate": 0.00016870879790014474, + "loss": 1.506, + "step": 7850 + }, + { + "epoch": 0.28116102924060377, + "grad_norm": 2.1859934329986572, + "learning_rate": 0.00016870036989409778, + "loss": 1.6351, + "step": 7851 + }, + { + "epoch": 0.2811968413701721, + "grad_norm": 2.049006938934326, + "learning_rate": 0.00016869194096377597, + "loss": 1.5945, + "step": 7852 + }, + { + "epoch": 0.28123265349974036, + "grad_norm": 1.6370213031768799, + "learning_rate": 0.00016868351110929268, + "loss": 1.6024, + "step": 7853 + }, + { + "epoch": 0.28126846562930863, + "grad_norm": 1.693914532661438, + "learning_rate": 0.00016867508033076135, + "loss": 1.2703, + "step": 7854 + }, + { + "epoch": 0.28130427775887695, + "grad_norm": 1.6464945077896118, + "learning_rate": 0.00016866664862829543, + "loss": 1.5552, + "step": 7855 + }, + { + "epoch": 0.2813400898884452, + "grad_norm": 1.5109620094299316, + "learning_rate": 0.00016865821600200827, + "loss": 1.7248, + "step": 7856 + }, + { + "epoch": 0.2813759020180135, + "grad_norm": 1.7957203388214111, + "learning_rate": 0.0001686497824520134, + "loss": 1.8003, + "step": 7857 + }, + { + "epoch": 0.28141171414758176, + "grad_norm": 1.7840375900268555, + "learning_rate": 0.00016864134797842426, + "loss": 1.4401, + "step": 7858 + }, + { + "epoch": 0.2814475262771501, + "grad_norm": 1.9683198928833008, + "learning_rate": 0.00016863291258135434, + "loss": 1.4671, + "step": 7859 + }, + { + "epoch": 0.28148333840671835, + "grad_norm": 1.9993942975997925, + "learning_rate": 0.00016862447626091707, + "loss": 1.7088, + "step": 7860 + }, + { + "epoch": 0.2815191505362866, + "grad_norm": 1.76691472530365, + "learning_rate": 0.00016861603901722601, + "loss": 1.3172, + "step": 7861 + }, + { + "epoch": 0.28155496266585495, + "grad_norm": 1.4511359930038452, + "learning_rate": 0.00016860760085039467, + "loss": 1.5278, + "step": 7862 + }, + { + "epoch": 0.2815907747954232, + "grad_norm": 1.7963536977767944, + "learning_rate": 0.00016859916176053657, + "loss": 1.6062, + "step": 7863 + }, + { + "epoch": 0.2816265869249915, + "grad_norm": 1.900141954421997, + "learning_rate": 0.00016859072174776522, + "loss": 1.3176, + "step": 7864 + }, + { + "epoch": 0.28166239905455975, + "grad_norm": 1.7314437627792358, + "learning_rate": 0.00016858228081219416, + "loss": 1.5749, + "step": 7865 + }, + { + "epoch": 0.2816982111841281, + "grad_norm": 2.220306396484375, + "learning_rate": 0.000168573838953937, + "loss": 1.5796, + "step": 7866 + }, + { + "epoch": 0.28173402331369635, + "grad_norm": 1.7966647148132324, + "learning_rate": 0.00016856539617310728, + "loss": 1.5018, + "step": 7867 + }, + { + "epoch": 0.2817698354432646, + "grad_norm": 1.964325189590454, + "learning_rate": 0.0001685569524698186, + "loss": 1.7829, + "step": 7868 + }, + { + "epoch": 0.28180564757283294, + "grad_norm": 1.6538739204406738, + "learning_rate": 0.00016854850784418457, + "loss": 1.8246, + "step": 7869 + }, + { + "epoch": 0.2818414597024012, + "grad_norm": 1.633487343788147, + "learning_rate": 0.00016854006229631877, + "loss": 1.674, + "step": 7870 + }, + { + "epoch": 0.2818772718319695, + "grad_norm": 2.0294923782348633, + "learning_rate": 0.00016853161582633486, + "loss": 1.8599, + "step": 7871 + }, + { + "epoch": 0.28191308396153775, + "grad_norm": 1.7625004053115845, + "learning_rate": 0.00016852316843434645, + "loss": 1.2676, + "step": 7872 + }, + { + "epoch": 0.28194889609110607, + "grad_norm": 1.4743856191635132, + "learning_rate": 0.0001685147201204672, + "loss": 1.8254, + "step": 7873 + }, + { + "epoch": 0.28198470822067434, + "grad_norm": 1.7030346393585205, + "learning_rate": 0.00016850627088481077, + "loss": 1.4622, + "step": 7874 + }, + { + "epoch": 0.2820205203502426, + "grad_norm": 1.8332545757293701, + "learning_rate": 0.0001684978207274908, + "loss": 1.6707, + "step": 7875 + }, + { + "epoch": 0.28205633247981093, + "grad_norm": 1.432371973991394, + "learning_rate": 0.00016848936964862106, + "loss": 1.4167, + "step": 7876 + }, + { + "epoch": 0.2820921446093792, + "grad_norm": 1.7265770435333252, + "learning_rate": 0.00016848091764831518, + "loss": 1.4469, + "step": 7877 + }, + { + "epoch": 0.28212795673894747, + "grad_norm": 3.417351484298706, + "learning_rate": 0.00016847246472668684, + "loss": 1.5023, + "step": 7878 + }, + { + "epoch": 0.28216376886851574, + "grad_norm": 1.2689472436904907, + "learning_rate": 0.00016846401088384987, + "loss": 1.5067, + "step": 7879 + }, + { + "epoch": 0.28219958099808407, + "grad_norm": 2.011307954788208, + "learning_rate": 0.0001684555561199179, + "loss": 1.605, + "step": 7880 + }, + { + "epoch": 0.28223539312765233, + "grad_norm": 2.8057658672332764, + "learning_rate": 0.00016844710043500478, + "loss": 1.6136, + "step": 7881 + }, + { + "epoch": 0.2822712052572206, + "grad_norm": 1.857756495475769, + "learning_rate": 0.00016843864382922418, + "loss": 1.6513, + "step": 7882 + }, + { + "epoch": 0.2823070173867889, + "grad_norm": 1.6902923583984375, + "learning_rate": 0.0001684301863026899, + "loss": 1.29, + "step": 7883 + }, + { + "epoch": 0.2823428295163572, + "grad_norm": 2.0259835720062256, + "learning_rate": 0.00016842172785551572, + "loss": 1.7171, + "step": 7884 + }, + { + "epoch": 0.28237864164592547, + "grad_norm": 1.9104828834533691, + "learning_rate": 0.00016841326848781546, + "loss": 1.6705, + "step": 7885 + }, + { + "epoch": 0.28241445377549373, + "grad_norm": 2.1479547023773193, + "learning_rate": 0.00016840480819970294, + "loss": 1.5749, + "step": 7886 + }, + { + "epoch": 0.28245026590506206, + "grad_norm": 1.6750024557113647, + "learning_rate": 0.00016839634699129197, + "loss": 1.4534, + "step": 7887 + }, + { + "epoch": 0.28248607803463033, + "grad_norm": 1.583740472793579, + "learning_rate": 0.00016838788486269634, + "loss": 1.3206, + "step": 7888 + }, + { + "epoch": 0.2825218901641986, + "grad_norm": 1.3861204385757446, + "learning_rate": 0.00016837942181402993, + "loss": 1.3911, + "step": 7889 + }, + { + "epoch": 0.2825577022937669, + "grad_norm": 1.6452444791793823, + "learning_rate": 0.00016837095784540663, + "loss": 1.5608, + "step": 7890 + }, + { + "epoch": 0.2825935144233352, + "grad_norm": 1.4533544778823853, + "learning_rate": 0.0001683624929569403, + "loss": 1.7149, + "step": 7891 + }, + { + "epoch": 0.28262932655290346, + "grad_norm": 1.6614084243774414, + "learning_rate": 0.0001683540271487448, + "loss": 1.4509, + "step": 7892 + }, + { + "epoch": 0.28266513868247173, + "grad_norm": 1.9571036100387573, + "learning_rate": 0.000168345560420934, + "loss": 1.6858, + "step": 7893 + }, + { + "epoch": 0.28270095081204005, + "grad_norm": 1.823486328125, + "learning_rate": 0.00016833709277362186, + "loss": 1.788, + "step": 7894 + }, + { + "epoch": 0.2827367629416083, + "grad_norm": 1.6518898010253906, + "learning_rate": 0.0001683286242069223, + "loss": 1.4248, + "step": 7895 + }, + { + "epoch": 0.2827725750711766, + "grad_norm": 1.626325011253357, + "learning_rate": 0.00016832015472094923, + "loss": 1.203, + "step": 7896 + }, + { + "epoch": 0.2828083872007449, + "grad_norm": 1.368032693862915, + "learning_rate": 0.0001683116843158166, + "loss": 1.3924, + "step": 7897 + }, + { + "epoch": 0.2828441993303132, + "grad_norm": 1.738690972328186, + "learning_rate": 0.00016830321299163837, + "loss": 1.7386, + "step": 7898 + }, + { + "epoch": 0.28288001145988145, + "grad_norm": 1.9578567743301392, + "learning_rate": 0.0001682947407485285, + "loss": 1.4225, + "step": 7899 + }, + { + "epoch": 0.2829158235894497, + "grad_norm": 2.169372320175171, + "learning_rate": 0.00016828626758660104, + "loss": 1.7834, + "step": 7900 + }, + { + "epoch": 0.28295163571901805, + "grad_norm": 2.070221185684204, + "learning_rate": 0.00016827779350596988, + "loss": 1.6165, + "step": 7901 + }, + { + "epoch": 0.2829874478485863, + "grad_norm": 1.5898553133010864, + "learning_rate": 0.00016826931850674913, + "loss": 1.7263, + "step": 7902 + }, + { + "epoch": 0.2830232599781546, + "grad_norm": 1.5495221614837646, + "learning_rate": 0.0001682608425890527, + "loss": 1.4515, + "step": 7903 + }, + { + "epoch": 0.2830590721077229, + "grad_norm": 1.3812370300292969, + "learning_rate": 0.00016825236575299473, + "loss": 1.3405, + "step": 7904 + }, + { + "epoch": 0.2830948842372912, + "grad_norm": 1.676615595817566, + "learning_rate": 0.0001682438879986892, + "loss": 1.32, + "step": 7905 + }, + { + "epoch": 0.28313069636685945, + "grad_norm": 1.6965440511703491, + "learning_rate": 0.0001682354093262502, + "loss": 1.5504, + "step": 7906 + }, + { + "epoch": 0.2831665084964277, + "grad_norm": 1.978925108909607, + "learning_rate": 0.00016822692973579177, + "loss": 1.4036, + "step": 7907 + }, + { + "epoch": 0.28320232062599604, + "grad_norm": 2.1448557376861572, + "learning_rate": 0.000168218449227428, + "loss": 1.7598, + "step": 7908 + }, + { + "epoch": 0.2832381327555643, + "grad_norm": 1.9719595909118652, + "learning_rate": 0.00016820996780127302, + "loss": 1.3712, + "step": 7909 + }, + { + "epoch": 0.2832739448851326, + "grad_norm": 1.57223379611969, + "learning_rate": 0.00016820148545744089, + "loss": 1.5271, + "step": 7910 + }, + { + "epoch": 0.2833097570147009, + "grad_norm": 1.6346989870071411, + "learning_rate": 0.00016819300219604572, + "loss": 1.6543, + "step": 7911 + }, + { + "epoch": 0.28334556914426917, + "grad_norm": 1.9346641302108765, + "learning_rate": 0.00016818451801720169, + "loss": 1.643, + "step": 7912 + }, + { + "epoch": 0.28338138127383744, + "grad_norm": 1.87030827999115, + "learning_rate": 0.00016817603292102292, + "loss": 1.7069, + "step": 7913 + }, + { + "epoch": 0.2834171934034057, + "grad_norm": 1.740334391593933, + "learning_rate": 0.00016816754690762356, + "loss": 1.7116, + "step": 7914 + }, + { + "epoch": 0.28345300553297403, + "grad_norm": 2.0844287872314453, + "learning_rate": 0.0001681590599771178, + "loss": 1.5254, + "step": 7915 + }, + { + "epoch": 0.2834888176625423, + "grad_norm": 1.4731221199035645, + "learning_rate": 0.00016815057212961985, + "loss": 1.5977, + "step": 7916 + }, + { + "epoch": 0.28352462979211057, + "grad_norm": 1.638742208480835, + "learning_rate": 0.0001681420833652438, + "loss": 1.5354, + "step": 7917 + }, + { + "epoch": 0.2835604419216789, + "grad_norm": 1.5350745916366577, + "learning_rate": 0.00016813359368410394, + "loss": 1.6585, + "step": 7918 + }, + { + "epoch": 0.28359625405124717, + "grad_norm": 1.2498204708099365, + "learning_rate": 0.00016812510308631445, + "loss": 1.312, + "step": 7919 + }, + { + "epoch": 0.28363206618081543, + "grad_norm": 1.752829670906067, + "learning_rate": 0.00016811661157198956, + "loss": 1.3958, + "step": 7920 + }, + { + "epoch": 0.2836678783103837, + "grad_norm": 1.6889894008636475, + "learning_rate": 0.00016810811914124354, + "loss": 1.5042, + "step": 7921 + }, + { + "epoch": 0.283703690439952, + "grad_norm": 2.0191566944122314, + "learning_rate": 0.00016809962579419064, + "loss": 1.523, + "step": 7922 + }, + { + "epoch": 0.2837395025695203, + "grad_norm": 2.0825445652008057, + "learning_rate": 0.0001680911315309451, + "loss": 1.5253, + "step": 7923 + }, + { + "epoch": 0.28377531469908857, + "grad_norm": 1.8054052591323853, + "learning_rate": 0.00016808263635162123, + "loss": 1.7837, + "step": 7924 + }, + { + "epoch": 0.2838111268286569, + "grad_norm": 1.8791615962982178, + "learning_rate": 0.0001680741402563333, + "loss": 1.2332, + "step": 7925 + }, + { + "epoch": 0.28384693895822516, + "grad_norm": 1.7158640623092651, + "learning_rate": 0.00016806564324519565, + "loss": 1.5696, + "step": 7926 + }, + { + "epoch": 0.28388275108779343, + "grad_norm": 1.4893388748168945, + "learning_rate": 0.00016805714531832253, + "loss": 1.5782, + "step": 7927 + }, + { + "epoch": 0.2839185632173617, + "grad_norm": 2.08494234085083, + "learning_rate": 0.00016804864647582832, + "loss": 1.8085, + "step": 7928 + }, + { + "epoch": 0.28395437534693, + "grad_norm": 1.590214729309082, + "learning_rate": 0.00016804014671782736, + "loss": 1.7791, + "step": 7929 + }, + { + "epoch": 0.2839901874764983, + "grad_norm": 1.7456499338150024, + "learning_rate": 0.00016803164604443395, + "loss": 1.2424, + "step": 7930 + }, + { + "epoch": 0.28402599960606656, + "grad_norm": 2.2353994846343994, + "learning_rate": 0.00016802314445576254, + "loss": 1.2298, + "step": 7931 + }, + { + "epoch": 0.2840618117356349, + "grad_norm": 1.718406319618225, + "learning_rate": 0.00016801464195192746, + "loss": 1.6377, + "step": 7932 + }, + { + "epoch": 0.28409762386520315, + "grad_norm": 1.8352092504501343, + "learning_rate": 0.00016800613853304311, + "loss": 1.3559, + "step": 7933 + }, + { + "epoch": 0.2841334359947714, + "grad_norm": 1.6804618835449219, + "learning_rate": 0.00016799763419922387, + "loss": 1.8663, + "step": 7934 + }, + { + "epoch": 0.2841692481243397, + "grad_norm": 1.5867995023727417, + "learning_rate": 0.00016798912895058416, + "loss": 1.4769, + "step": 7935 + }, + { + "epoch": 0.284205060253908, + "grad_norm": 1.8360810279846191, + "learning_rate": 0.00016798062278723845, + "loss": 1.6387, + "step": 7936 + }, + { + "epoch": 0.2842408723834763, + "grad_norm": 1.573351263999939, + "learning_rate": 0.00016797211570930115, + "loss": 1.6323, + "step": 7937 + }, + { + "epoch": 0.28427668451304455, + "grad_norm": 1.9696694612503052, + "learning_rate": 0.0001679636077168867, + "loss": 1.8065, + "step": 7938 + }, + { + "epoch": 0.2843124966426129, + "grad_norm": 1.245781421661377, + "learning_rate": 0.00016795509881010955, + "loss": 1.4907, + "step": 7939 + }, + { + "epoch": 0.28434830877218115, + "grad_norm": 1.5631426572799683, + "learning_rate": 0.00016794658898908424, + "loss": 1.5878, + "step": 7940 + }, + { + "epoch": 0.2843841209017494, + "grad_norm": 2.6311140060424805, + "learning_rate": 0.00016793807825392517, + "loss": 1.6975, + "step": 7941 + }, + { + "epoch": 0.2844199330313177, + "grad_norm": 1.682794213294983, + "learning_rate": 0.00016792956660474694, + "loss": 1.7496, + "step": 7942 + }, + { + "epoch": 0.284455745160886, + "grad_norm": 1.3867418766021729, + "learning_rate": 0.00016792105404166404, + "loss": 1.6555, + "step": 7943 + }, + { + "epoch": 0.2844915572904543, + "grad_norm": 2.540458917617798, + "learning_rate": 0.00016791254056479092, + "loss": 1.2869, + "step": 7944 + }, + { + "epoch": 0.28452736942002255, + "grad_norm": 1.879603624343872, + "learning_rate": 0.00016790402617424216, + "loss": 1.645, + "step": 7945 + }, + { + "epoch": 0.28456318154959087, + "grad_norm": 1.6672354936599731, + "learning_rate": 0.00016789551087013232, + "loss": 1.6987, + "step": 7946 + }, + { + "epoch": 0.28459899367915914, + "grad_norm": 1.5219227075576782, + "learning_rate": 0.00016788699465257597, + "loss": 1.6638, + "step": 7947 + }, + { + "epoch": 0.2846348058087274, + "grad_norm": 1.9220548868179321, + "learning_rate": 0.00016787847752168769, + "loss": 1.3491, + "step": 7948 + }, + { + "epoch": 0.2846706179382957, + "grad_norm": 1.5979007482528687, + "learning_rate": 0.00016786995947758204, + "loss": 1.5428, + "step": 7949 + }, + { + "epoch": 0.284706430067864, + "grad_norm": 1.6197651624679565, + "learning_rate": 0.00016786144052037365, + "loss": 1.416, + "step": 7950 + }, + { + "epoch": 0.28474224219743227, + "grad_norm": 1.597205400466919, + "learning_rate": 0.00016785292065017707, + "loss": 1.3962, + "step": 7951 + }, + { + "epoch": 0.28477805432700054, + "grad_norm": 2.123845100402832, + "learning_rate": 0.000167844399867107, + "loss": 1.3666, + "step": 7952 + }, + { + "epoch": 0.28481386645656886, + "grad_norm": 1.9097387790679932, + "learning_rate": 0.00016783587817127804, + "loss": 1.3416, + "step": 7953 + }, + { + "epoch": 0.28484967858613713, + "grad_norm": 1.6967304944992065, + "learning_rate": 0.00016782735556280484, + "loss": 1.6285, + "step": 7954 + }, + { + "epoch": 0.2848854907157054, + "grad_norm": 1.3434966802597046, + "learning_rate": 0.00016781883204180207, + "loss": 1.5585, + "step": 7955 + }, + { + "epoch": 0.28492130284527367, + "grad_norm": 2.0665807723999023, + "learning_rate": 0.00016781030760838436, + "loss": 1.9691, + "step": 7956 + }, + { + "epoch": 0.284957114974842, + "grad_norm": 1.8598235845565796, + "learning_rate": 0.00016780178226266646, + "loss": 1.6914, + "step": 7957 + }, + { + "epoch": 0.28499292710441027, + "grad_norm": 1.6443570852279663, + "learning_rate": 0.00016779325600476303, + "loss": 1.7151, + "step": 7958 + }, + { + "epoch": 0.28502873923397853, + "grad_norm": 2.839176893234253, + "learning_rate": 0.00016778472883478878, + "loss": 1.4785, + "step": 7959 + }, + { + "epoch": 0.28506455136354686, + "grad_norm": 1.6859476566314697, + "learning_rate": 0.00016777620075285847, + "loss": 1.3766, + "step": 7960 + }, + { + "epoch": 0.2851003634931151, + "grad_norm": 2.1881790161132812, + "learning_rate": 0.00016776767175908676, + "loss": 1.8795, + "step": 7961 + }, + { + "epoch": 0.2851361756226834, + "grad_norm": 1.856256127357483, + "learning_rate": 0.00016775914185358846, + "loss": 1.7113, + "step": 7962 + }, + { + "epoch": 0.28517198775225167, + "grad_norm": 2.153108596801758, + "learning_rate": 0.00016775061103647834, + "loss": 1.6182, + "step": 7963 + }, + { + "epoch": 0.28520779988182, + "grad_norm": 1.3774833679199219, + "learning_rate": 0.00016774207930787108, + "loss": 1.5698, + "step": 7964 + }, + { + "epoch": 0.28524361201138826, + "grad_norm": 2.905151844024658, + "learning_rate": 0.00016773354666788155, + "loss": 1.2458, + "step": 7965 + }, + { + "epoch": 0.28527942414095653, + "grad_norm": 1.9551284313201904, + "learning_rate": 0.00016772501311662454, + "loss": 1.6533, + "step": 7966 + }, + { + "epoch": 0.28531523627052485, + "grad_norm": 1.6111496686935425, + "learning_rate": 0.00016771647865421483, + "loss": 1.5987, + "step": 7967 + }, + { + "epoch": 0.2853510484000931, + "grad_norm": 1.438314437866211, + "learning_rate": 0.00016770794328076726, + "loss": 1.626, + "step": 7968 + }, + { + "epoch": 0.2853868605296614, + "grad_norm": 1.7360272407531738, + "learning_rate": 0.00016769940699639662, + "loss": 1.5362, + "step": 7969 + }, + { + "epoch": 0.28542267265922966, + "grad_norm": 1.3850613832473755, + "learning_rate": 0.0001676908698012178, + "loss": 1.7958, + "step": 7970 + }, + { + "epoch": 0.285458484788798, + "grad_norm": 1.258893370628357, + "learning_rate": 0.0001676823316953456, + "loss": 1.0779, + "step": 7971 + }, + { + "epoch": 0.28549429691836625, + "grad_norm": 1.8675183057785034, + "learning_rate": 0.00016767379267889498, + "loss": 1.5385, + "step": 7972 + }, + { + "epoch": 0.2855301090479345, + "grad_norm": 1.3767646551132202, + "learning_rate": 0.00016766525275198078, + "loss": 1.6422, + "step": 7973 + }, + { + "epoch": 0.28556592117750285, + "grad_norm": 1.6324293613433838, + "learning_rate": 0.00016765671191471785, + "loss": 1.5077, + "step": 7974 + }, + { + "epoch": 0.2856017333070711, + "grad_norm": 1.361929178237915, + "learning_rate": 0.00016764817016722114, + "loss": 1.6237, + "step": 7975 + }, + { + "epoch": 0.2856375454366394, + "grad_norm": 1.4396480321884155, + "learning_rate": 0.00016763962750960558, + "loss": 1.6242, + "step": 7976 + }, + { + "epoch": 0.28567335756620765, + "grad_norm": 1.8851977586746216, + "learning_rate": 0.00016763108394198605, + "loss": 1.7705, + "step": 7977 + }, + { + "epoch": 0.285709169695776, + "grad_norm": 1.8253602981567383, + "learning_rate": 0.00016762253946447757, + "loss": 1.6693, + "step": 7978 + }, + { + "epoch": 0.28574498182534425, + "grad_norm": 1.959054946899414, + "learning_rate": 0.000167613994077195, + "loss": 1.499, + "step": 7979 + }, + { + "epoch": 0.2857807939549125, + "grad_norm": 1.7831599712371826, + "learning_rate": 0.00016760544778025337, + "loss": 1.8024, + "step": 7980 + }, + { + "epoch": 0.28581660608448084, + "grad_norm": 2.5011425018310547, + "learning_rate": 0.00016759690057376769, + "loss": 1.7597, + "step": 7981 + }, + { + "epoch": 0.2858524182140491, + "grad_norm": 1.647387146949768, + "learning_rate": 0.00016758835245785284, + "loss": 1.6501, + "step": 7982 + }, + { + "epoch": 0.2858882303436174, + "grad_norm": 1.6591894626617432, + "learning_rate": 0.00016757980343262393, + "loss": 1.8044, + "step": 7983 + }, + { + "epoch": 0.28592404247318565, + "grad_norm": 1.6702752113342285, + "learning_rate": 0.00016757125349819592, + "loss": 1.8583, + "step": 7984 + }, + { + "epoch": 0.28595985460275397, + "grad_norm": 1.6072109937667847, + "learning_rate": 0.00016756270265468385, + "loss": 1.392, + "step": 7985 + }, + { + "epoch": 0.28599566673232224, + "grad_norm": 1.9452677965164185, + "learning_rate": 0.00016755415090220278, + "loss": 1.5837, + "step": 7986 + }, + { + "epoch": 0.2860314788618905, + "grad_norm": 1.7112654447555542, + "learning_rate": 0.00016754559824086774, + "loss": 1.3601, + "step": 7987 + }, + { + "epoch": 0.28606729099145883, + "grad_norm": 1.639289140701294, + "learning_rate": 0.00016753704467079383, + "loss": 1.7134, + "step": 7988 + }, + { + "epoch": 0.2861031031210271, + "grad_norm": 1.4900970458984375, + "learning_rate": 0.00016752849019209607, + "loss": 1.4638, + "step": 7989 + }, + { + "epoch": 0.28613891525059537, + "grad_norm": 1.5364689826965332, + "learning_rate": 0.00016751993480488956, + "loss": 1.6713, + "step": 7990 + }, + { + "epoch": 0.28617472738016364, + "grad_norm": 1.6079754829406738, + "learning_rate": 0.0001675113785092895, + "loss": 1.8136, + "step": 7991 + }, + { + "epoch": 0.28621053950973196, + "grad_norm": 1.4302982091903687, + "learning_rate": 0.00016750282130541084, + "loss": 1.4623, + "step": 7992 + }, + { + "epoch": 0.28624635163930023, + "grad_norm": 1.5606993436813354, + "learning_rate": 0.00016749426319336884, + "loss": 1.5872, + "step": 7993 + }, + { + "epoch": 0.2862821637688685, + "grad_norm": 1.3386200666427612, + "learning_rate": 0.00016748570417327857, + "loss": 1.5573, + "step": 7994 + }, + { + "epoch": 0.2863179758984368, + "grad_norm": 2.041780710220337, + "learning_rate": 0.0001674771442452552, + "loss": 1.6269, + "step": 7995 + }, + { + "epoch": 0.2863537880280051, + "grad_norm": 1.6531713008880615, + "learning_rate": 0.0001674685834094139, + "loss": 1.5946, + "step": 7996 + }, + { + "epoch": 0.28638960015757337, + "grad_norm": 1.9872679710388184, + "learning_rate": 0.00016746002166586984, + "loss": 1.5185, + "step": 7997 + }, + { + "epoch": 0.28642541228714163, + "grad_norm": 1.2475894689559937, + "learning_rate": 0.00016745145901473819, + "loss": 1.3904, + "step": 7998 + }, + { + "epoch": 0.28646122441670996, + "grad_norm": 1.5508625507354736, + "learning_rate": 0.0001674428954561342, + "loss": 1.4844, + "step": 7999 + }, + { + "epoch": 0.2864970365462782, + "grad_norm": 2.021813154220581, + "learning_rate": 0.000167434330990173, + "loss": 1.6274, + "step": 8000 + }, + { + "epoch": 0.2865328486758465, + "grad_norm": 1.5985814332962036, + "learning_rate": 0.0001674257656169699, + "loss": 1.837, + "step": 8001 + }, + { + "epoch": 0.2865686608054148, + "grad_norm": 1.310536503791809, + "learning_rate": 0.00016741719933664008, + "loss": 1.5341, + "step": 8002 + }, + { + "epoch": 0.2866044729349831, + "grad_norm": 2.1644248962402344, + "learning_rate": 0.00016740863214929883, + "loss": 1.4358, + "step": 8003 + }, + { + "epoch": 0.28664028506455136, + "grad_norm": 1.359682559967041, + "learning_rate": 0.00016740006405506133, + "loss": 1.4231, + "step": 8004 + }, + { + "epoch": 0.28667609719411963, + "grad_norm": 1.2971854209899902, + "learning_rate": 0.00016739149505404298, + "loss": 1.2328, + "step": 8005 + }, + { + "epoch": 0.28671190932368795, + "grad_norm": 1.5941914319992065, + "learning_rate": 0.00016738292514635893, + "loss": 1.3761, + "step": 8006 + }, + { + "epoch": 0.2867477214532562, + "grad_norm": 1.3879280090332031, + "learning_rate": 0.0001673743543321246, + "loss": 1.5472, + "step": 8007 + }, + { + "epoch": 0.2867835335828245, + "grad_norm": 2.025636911392212, + "learning_rate": 0.00016736578261145518, + "loss": 1.7866, + "step": 8008 + }, + { + "epoch": 0.2868193457123928, + "grad_norm": 1.9905762672424316, + "learning_rate": 0.00016735720998446607, + "loss": 1.451, + "step": 8009 + }, + { + "epoch": 0.2868551578419611, + "grad_norm": 2.0516388416290283, + "learning_rate": 0.0001673486364512726, + "loss": 1.5917, + "step": 8010 + }, + { + "epoch": 0.28689096997152935, + "grad_norm": 1.2939916849136353, + "learning_rate": 0.00016734006201199006, + "loss": 1.2998, + "step": 8011 + }, + { + "epoch": 0.2869267821010976, + "grad_norm": 1.6238223314285278, + "learning_rate": 0.00016733148666673388, + "loss": 1.3898, + "step": 8012 + }, + { + "epoch": 0.28696259423066595, + "grad_norm": 2.0216140747070312, + "learning_rate": 0.0001673229104156194, + "loss": 1.5983, + "step": 8013 + }, + { + "epoch": 0.2869984063602342, + "grad_norm": 1.5885149240493774, + "learning_rate": 0.000167314333258762, + "loss": 1.5381, + "step": 8014 + }, + { + "epoch": 0.2870342184898025, + "grad_norm": 2.034376382827759, + "learning_rate": 0.00016730575519627707, + "loss": 1.442, + "step": 8015 + }, + { + "epoch": 0.2870700306193708, + "grad_norm": 1.7644431591033936, + "learning_rate": 0.00016729717622828002, + "loss": 1.4961, + "step": 8016 + }, + { + "epoch": 0.2871058427489391, + "grad_norm": 1.5184577703475952, + "learning_rate": 0.00016728859635488626, + "loss": 1.536, + "step": 8017 + }, + { + "epoch": 0.28714165487850735, + "grad_norm": 1.647737741470337, + "learning_rate": 0.00016728001557621126, + "loss": 1.5551, + "step": 8018 + }, + { + "epoch": 0.2871774670080756, + "grad_norm": 1.5250535011291504, + "learning_rate": 0.00016727143389237042, + "loss": 1.4027, + "step": 8019 + }, + { + "epoch": 0.28721327913764394, + "grad_norm": 1.6826478242874146, + "learning_rate": 0.0001672628513034792, + "loss": 1.6263, + "step": 8020 + }, + { + "epoch": 0.2872490912672122, + "grad_norm": 1.4399031400680542, + "learning_rate": 0.0001672542678096531, + "loss": 1.6192, + "step": 8021 + }, + { + "epoch": 0.2872849033967805, + "grad_norm": 1.9906154870986938, + "learning_rate": 0.00016724568341100758, + "loss": 1.4139, + "step": 8022 + }, + { + "epoch": 0.28732071552634875, + "grad_norm": 1.7506561279296875, + "learning_rate": 0.0001672370981076581, + "loss": 1.4831, + "step": 8023 + }, + { + "epoch": 0.28735652765591707, + "grad_norm": 1.703379511833191, + "learning_rate": 0.00016722851189972024, + "loss": 1.5373, + "step": 8024 + }, + { + "epoch": 0.28739233978548534, + "grad_norm": 1.5690869092941284, + "learning_rate": 0.00016721992478730942, + "loss": 1.2874, + "step": 8025 + }, + { + "epoch": 0.2874281519150536, + "grad_norm": 1.3549939393997192, + "learning_rate": 0.00016721133677054123, + "loss": 1.3596, + "step": 8026 + }, + { + "epoch": 0.28746396404462193, + "grad_norm": 1.2308684587478638, + "learning_rate": 0.00016720274784953122, + "loss": 1.1399, + "step": 8027 + }, + { + "epoch": 0.2874997761741902, + "grad_norm": 1.5450139045715332, + "learning_rate": 0.00016719415802439493, + "loss": 1.5061, + "step": 8028 + }, + { + "epoch": 0.28753558830375847, + "grad_norm": 1.6292673349380493, + "learning_rate": 0.0001671855672952479, + "loss": 1.4167, + "step": 8029 + }, + { + "epoch": 0.28757140043332674, + "grad_norm": 2.0085721015930176, + "learning_rate": 0.00016717697566220573, + "loss": 1.6847, + "step": 8030 + }, + { + "epoch": 0.28760721256289506, + "grad_norm": 1.688120722770691, + "learning_rate": 0.00016716838312538402, + "loss": 1.5717, + "step": 8031 + }, + { + "epoch": 0.28764302469246333, + "grad_norm": 1.3048220872879028, + "learning_rate": 0.00016715978968489834, + "loss": 1.5839, + "step": 8032 + }, + { + "epoch": 0.2876788368220316, + "grad_norm": 1.4469718933105469, + "learning_rate": 0.0001671511953408643, + "loss": 1.5844, + "step": 8033 + }, + { + "epoch": 0.2877146489515999, + "grad_norm": 1.7788381576538086, + "learning_rate": 0.0001671426000933976, + "loss": 1.5919, + "step": 8034 + }, + { + "epoch": 0.2877504610811682, + "grad_norm": 1.2445861101150513, + "learning_rate": 0.00016713400394261378, + "loss": 1.5703, + "step": 8035 + }, + { + "epoch": 0.28778627321073647, + "grad_norm": 1.5097942352294922, + "learning_rate": 0.00016712540688862854, + "loss": 1.7577, + "step": 8036 + }, + { + "epoch": 0.28782208534030473, + "grad_norm": 1.4741946458816528, + "learning_rate": 0.0001671168089315575, + "loss": 1.7221, + "step": 8037 + }, + { + "epoch": 0.28785789746987306, + "grad_norm": 2.001183271408081, + "learning_rate": 0.00016710821007151646, + "loss": 1.6299, + "step": 8038 + }, + { + "epoch": 0.2878937095994413, + "grad_norm": 2.835008144378662, + "learning_rate": 0.00016709961030862092, + "loss": 1.3872, + "step": 8039 + }, + { + "epoch": 0.2879295217290096, + "grad_norm": 2.880967140197754, + "learning_rate": 0.00016709100964298673, + "loss": 1.097, + "step": 8040 + }, + { + "epoch": 0.2879653338585779, + "grad_norm": 1.6155633926391602, + "learning_rate": 0.00016708240807472956, + "loss": 1.6969, + "step": 8041 + }, + { + "epoch": 0.2880011459881462, + "grad_norm": 1.329376459121704, + "learning_rate": 0.00016707380560396508, + "loss": 1.7448, + "step": 8042 + }, + { + "epoch": 0.28803695811771446, + "grad_norm": 2.0664658546447754, + "learning_rate": 0.0001670652022308091, + "loss": 1.4754, + "step": 8043 + }, + { + "epoch": 0.28807277024728273, + "grad_norm": 1.5695208311080933, + "learning_rate": 0.0001670565979553773, + "loss": 1.6802, + "step": 8044 + }, + { + "epoch": 0.28810858237685105, + "grad_norm": 1.879258155822754, + "learning_rate": 0.0001670479927777855, + "loss": 1.3501, + "step": 8045 + }, + { + "epoch": 0.2881443945064193, + "grad_norm": 2.0454368591308594, + "learning_rate": 0.0001670393866981494, + "loss": 1.6627, + "step": 8046 + }, + { + "epoch": 0.2881802066359876, + "grad_norm": 1.3214797973632812, + "learning_rate": 0.00016703077971658487, + "loss": 1.484, + "step": 8047 + }, + { + "epoch": 0.2882160187655559, + "grad_norm": 1.7491061687469482, + "learning_rate": 0.00016702217183320762, + "loss": 1.7092, + "step": 8048 + }, + { + "epoch": 0.2882518308951242, + "grad_norm": 1.3021421432495117, + "learning_rate": 0.00016701356304813357, + "loss": 1.4552, + "step": 8049 + }, + { + "epoch": 0.28828764302469245, + "grad_norm": 1.3362562656402588, + "learning_rate": 0.00016700495336147841, + "loss": 1.4673, + "step": 8050 + }, + { + "epoch": 0.2883234551542607, + "grad_norm": 1.692622184753418, + "learning_rate": 0.00016699634277335805, + "loss": 1.4147, + "step": 8051 + }, + { + "epoch": 0.28835926728382905, + "grad_norm": 2.1186437606811523, + "learning_rate": 0.00016698773128388832, + "loss": 1.7805, + "step": 8052 + }, + { + "epoch": 0.2883950794133973, + "grad_norm": 1.411199688911438, + "learning_rate": 0.00016697911889318508, + "loss": 1.4397, + "step": 8053 + }, + { + "epoch": 0.2884308915429656, + "grad_norm": 1.3372883796691895, + "learning_rate": 0.00016697050560136417, + "loss": 1.3642, + "step": 8054 + }, + { + "epoch": 0.2884667036725339, + "grad_norm": 1.3349575996398926, + "learning_rate": 0.0001669618914085415, + "loss": 1.6066, + "step": 8055 + }, + { + "epoch": 0.2885025158021022, + "grad_norm": 1.5699543952941895, + "learning_rate": 0.00016695327631483298, + "loss": 1.9668, + "step": 8056 + }, + { + "epoch": 0.28853832793167045, + "grad_norm": 1.399040699005127, + "learning_rate": 0.00016694466032035447, + "loss": 1.5224, + "step": 8057 + }, + { + "epoch": 0.2885741400612387, + "grad_norm": 1.3568496704101562, + "learning_rate": 0.0001669360434252219, + "loss": 1.5744, + "step": 8058 + }, + { + "epoch": 0.28860995219080704, + "grad_norm": 3.9023430347442627, + "learning_rate": 0.00016692742562955123, + "loss": 1.4893, + "step": 8059 + }, + { + "epoch": 0.2886457643203753, + "grad_norm": 1.27892005443573, + "learning_rate": 0.00016691880693345837, + "loss": 1.5439, + "step": 8060 + }, + { + "epoch": 0.2886815764499436, + "grad_norm": 1.1989108324050903, + "learning_rate": 0.00016691018733705926, + "loss": 1.4919, + "step": 8061 + }, + { + "epoch": 0.2887173885795119, + "grad_norm": 1.627945899963379, + "learning_rate": 0.00016690156684046991, + "loss": 1.5184, + "step": 8062 + }, + { + "epoch": 0.28875320070908017, + "grad_norm": 1.6738954782485962, + "learning_rate": 0.00016689294544380628, + "loss": 1.6135, + "step": 8063 + }, + { + "epoch": 0.28878901283864844, + "grad_norm": 1.9493415355682373, + "learning_rate": 0.00016688432314718434, + "loss": 1.5187, + "step": 8064 + }, + { + "epoch": 0.2888248249682167, + "grad_norm": 1.8097580671310425, + "learning_rate": 0.0001668756999507201, + "loss": 1.7367, + "step": 8065 + }, + { + "epoch": 0.28886063709778503, + "grad_norm": 2.6163599491119385, + "learning_rate": 0.00016686707585452962, + "loss": 1.8102, + "step": 8066 + }, + { + "epoch": 0.2888964492273533, + "grad_norm": 1.294272780418396, + "learning_rate": 0.00016685845085872883, + "loss": 1.2674, + "step": 8067 + }, + { + "epoch": 0.28893226135692157, + "grad_norm": 1.7067070007324219, + "learning_rate": 0.00016684982496343386, + "loss": 1.9238, + "step": 8068 + }, + { + "epoch": 0.2889680734864899, + "grad_norm": 1.9256243705749512, + "learning_rate": 0.0001668411981687607, + "loss": 1.2723, + "step": 8069 + }, + { + "epoch": 0.28900388561605816, + "grad_norm": 1.8836833238601685, + "learning_rate": 0.00016683257047482548, + "loss": 1.9504, + "step": 8070 + }, + { + "epoch": 0.28903969774562643, + "grad_norm": 1.2621315717697144, + "learning_rate": 0.0001668239418817442, + "loss": 1.5191, + "step": 8071 + }, + { + "epoch": 0.2890755098751947, + "grad_norm": 1.624903678894043, + "learning_rate": 0.000166815312389633, + "loss": 1.3441, + "step": 8072 + }, + { + "epoch": 0.289111322004763, + "grad_norm": 1.787150502204895, + "learning_rate": 0.00016680668199860793, + "loss": 1.7283, + "step": 8073 + }, + { + "epoch": 0.2891471341343313, + "grad_norm": 1.5423107147216797, + "learning_rate": 0.00016679805070878514, + "loss": 1.5735, + "step": 8074 + }, + { + "epoch": 0.28918294626389957, + "grad_norm": 1.5319451093673706, + "learning_rate": 0.00016678941852028075, + "loss": 1.754, + "step": 8075 + }, + { + "epoch": 0.2892187583934679, + "grad_norm": 1.78415846824646, + "learning_rate": 0.0001667807854332109, + "loss": 1.6037, + "step": 8076 + }, + { + "epoch": 0.28925457052303616, + "grad_norm": 1.5288525819778442, + "learning_rate": 0.0001667721514476917, + "loss": 1.7794, + "step": 8077 + }, + { + "epoch": 0.2892903826526044, + "grad_norm": 1.83223295211792, + "learning_rate": 0.0001667635165638393, + "loss": 1.4731, + "step": 8078 + }, + { + "epoch": 0.2893261947821727, + "grad_norm": 1.4526853561401367, + "learning_rate": 0.00016675488078176994, + "loss": 1.65, + "step": 8079 + }, + { + "epoch": 0.289362006911741, + "grad_norm": 1.5872565507888794, + "learning_rate": 0.00016674624410159978, + "loss": 1.5926, + "step": 8080 + }, + { + "epoch": 0.2893978190413093, + "grad_norm": 1.7032769918441772, + "learning_rate": 0.000166737606523445, + "loss": 1.8127, + "step": 8081 + }, + { + "epoch": 0.28943363117087756, + "grad_norm": 1.5613534450531006, + "learning_rate": 0.00016672896804742178, + "loss": 1.8724, + "step": 8082 + }, + { + "epoch": 0.2894694433004459, + "grad_norm": 1.4352047443389893, + "learning_rate": 0.00016672032867364638, + "loss": 1.6871, + "step": 8083 + }, + { + "epoch": 0.28950525543001415, + "grad_norm": 1.5885288715362549, + "learning_rate": 0.00016671168840223503, + "loss": 1.5456, + "step": 8084 + }, + { + "epoch": 0.2895410675595824, + "grad_norm": 1.3646939992904663, + "learning_rate": 0.00016670304723330397, + "loss": 1.3528, + "step": 8085 + }, + { + "epoch": 0.2895768796891507, + "grad_norm": 2.0885651111602783, + "learning_rate": 0.00016669440516696945, + "loss": 1.3586, + "step": 8086 + }, + { + "epoch": 0.289612691818719, + "grad_norm": 1.4972714185714722, + "learning_rate": 0.0001666857622033477, + "loss": 1.6482, + "step": 8087 + }, + { + "epoch": 0.2896485039482873, + "grad_norm": 1.8946317434310913, + "learning_rate": 0.00016667711834255505, + "loss": 1.672, + "step": 8088 + }, + { + "epoch": 0.28968431607785555, + "grad_norm": 1.687410593032837, + "learning_rate": 0.0001666684735847078, + "loss": 1.1767, + "step": 8089 + }, + { + "epoch": 0.2897201282074239, + "grad_norm": 1.3974437713623047, + "learning_rate": 0.00016665982792992226, + "loss": 1.5356, + "step": 8090 + }, + { + "epoch": 0.28975594033699215, + "grad_norm": 1.5038459300994873, + "learning_rate": 0.00016665118137831468, + "loss": 1.2653, + "step": 8091 + }, + { + "epoch": 0.2897917524665604, + "grad_norm": 1.456130862236023, + "learning_rate": 0.00016664253393000144, + "loss": 1.8139, + "step": 8092 + }, + { + "epoch": 0.2898275645961287, + "grad_norm": 1.6186647415161133, + "learning_rate": 0.00016663388558509887, + "loss": 1.5897, + "step": 8093 + }, + { + "epoch": 0.289863376725697, + "grad_norm": 1.3481247425079346, + "learning_rate": 0.00016662523634372334, + "loss": 1.4714, + "step": 8094 + }, + { + "epoch": 0.2898991888552653, + "grad_norm": 1.415844440460205, + "learning_rate": 0.00016661658620599113, + "loss": 1.6026, + "step": 8095 + }, + { + "epoch": 0.28993500098483355, + "grad_norm": 2.165412187576294, + "learning_rate": 0.00016660793517201875, + "loss": 1.6079, + "step": 8096 + }, + { + "epoch": 0.28997081311440187, + "grad_norm": 1.5296587944030762, + "learning_rate": 0.00016659928324192248, + "loss": 1.7825, + "step": 8097 + }, + { + "epoch": 0.29000662524397014, + "grad_norm": 1.6164709329605103, + "learning_rate": 0.0001665906304158188, + "loss": 1.5188, + "step": 8098 + }, + { + "epoch": 0.2900424373735384, + "grad_norm": 1.5287069082260132, + "learning_rate": 0.00016658197669382405, + "loss": 1.579, + "step": 8099 + }, + { + "epoch": 0.2900782495031067, + "grad_norm": 2.6942615509033203, + "learning_rate": 0.0001665733220760547, + "loss": 1.3303, + "step": 8100 + }, + { + "epoch": 0.290114061632675, + "grad_norm": 2.589672327041626, + "learning_rate": 0.00016656466656262718, + "loss": 1.4543, + "step": 8101 + }, + { + "epoch": 0.29014987376224327, + "grad_norm": 1.4460633993148804, + "learning_rate": 0.00016655601015365794, + "loss": 1.6278, + "step": 8102 + }, + { + "epoch": 0.29018568589181154, + "grad_norm": 1.2138534784317017, + "learning_rate": 0.00016654735284926341, + "loss": 1.57, + "step": 8103 + }, + { + "epoch": 0.29022149802137986, + "grad_norm": 1.3130338191986084, + "learning_rate": 0.00016653869464956008, + "loss": 1.5317, + "step": 8104 + }, + { + "epoch": 0.29025731015094813, + "grad_norm": 2.141824245452881, + "learning_rate": 0.00016653003555466448, + "loss": 1.3303, + "step": 8105 + }, + { + "epoch": 0.2902931222805164, + "grad_norm": 1.5391119718551636, + "learning_rate": 0.00016652137556469305, + "loss": 1.3035, + "step": 8106 + }, + { + "epoch": 0.29032893441008467, + "grad_norm": 2.6114346981048584, + "learning_rate": 0.00016651271467976232, + "loss": 1.6472, + "step": 8107 + }, + { + "epoch": 0.290364746539653, + "grad_norm": 2.207442283630371, + "learning_rate": 0.0001665040528999888, + "loss": 1.5477, + "step": 8108 + }, + { + "epoch": 0.29040055866922126, + "grad_norm": 1.4280242919921875, + "learning_rate": 0.00016649539022548903, + "loss": 1.5803, + "step": 8109 + }, + { + "epoch": 0.29043637079878953, + "grad_norm": 1.6368767023086548, + "learning_rate": 0.00016648672665637958, + "loss": 1.4957, + "step": 8110 + }, + { + "epoch": 0.29047218292835786, + "grad_norm": 1.8318397998809814, + "learning_rate": 0.00016647806219277698, + "loss": 1.4815, + "step": 8111 + }, + { + "epoch": 0.2905079950579261, + "grad_norm": 2.205343723297119, + "learning_rate": 0.0001664693968347978, + "loss": 1.8598, + "step": 8112 + }, + { + "epoch": 0.2905438071874944, + "grad_norm": 2.0023486614227295, + "learning_rate": 0.00016646073058255862, + "loss": 1.3708, + "step": 8113 + }, + { + "epoch": 0.29057961931706267, + "grad_norm": 1.8614946603775024, + "learning_rate": 0.00016645206343617603, + "loss": 1.7015, + "step": 8114 + }, + { + "epoch": 0.290615431446631, + "grad_norm": 1.6110183000564575, + "learning_rate": 0.00016644339539576664, + "loss": 1.5513, + "step": 8115 + }, + { + "epoch": 0.29065124357619926, + "grad_norm": 1.5081207752227783, + "learning_rate": 0.0001664347264614471, + "loss": 1.6817, + "step": 8116 + }, + { + "epoch": 0.2906870557057675, + "grad_norm": 1.4741617441177368, + "learning_rate": 0.000166426056633334, + "loss": 1.7121, + "step": 8117 + }, + { + "epoch": 0.29072286783533585, + "grad_norm": 1.6981256008148193, + "learning_rate": 0.00016641738591154396, + "loss": 1.626, + "step": 8118 + }, + { + "epoch": 0.2907586799649041, + "grad_norm": 1.4795587062835693, + "learning_rate": 0.00016640871429619372, + "loss": 1.5249, + "step": 8119 + }, + { + "epoch": 0.2907944920944724, + "grad_norm": 1.8192358016967773, + "learning_rate": 0.00016640004178739985, + "loss": 1.5308, + "step": 8120 + }, + { + "epoch": 0.29083030422404066, + "grad_norm": 1.5716443061828613, + "learning_rate": 0.0001663913683852791, + "loss": 1.1341, + "step": 8121 + }, + { + "epoch": 0.290866116353609, + "grad_norm": 1.5312237739562988, + "learning_rate": 0.00016638269408994808, + "loss": 1.5524, + "step": 8122 + }, + { + "epoch": 0.29090192848317725, + "grad_norm": 1.443957805633545, + "learning_rate": 0.00016637401890152358, + "loss": 1.4105, + "step": 8123 + }, + { + "epoch": 0.2909377406127455, + "grad_norm": 1.9536917209625244, + "learning_rate": 0.00016636534282012225, + "loss": 1.4532, + "step": 8124 + }, + { + "epoch": 0.29097355274231385, + "grad_norm": 2.228641986846924, + "learning_rate": 0.00016635666584586083, + "loss": 1.5409, + "step": 8125 + }, + { + "epoch": 0.2910093648718821, + "grad_norm": 2.049088716506958, + "learning_rate": 0.00016634798797885607, + "loss": 1.6185, + "step": 8126 + }, + { + "epoch": 0.2910451770014504, + "grad_norm": 1.800985336303711, + "learning_rate": 0.00016633930921922474, + "loss": 1.6347, + "step": 8127 + }, + { + "epoch": 0.29108098913101865, + "grad_norm": 1.634249210357666, + "learning_rate": 0.00016633062956708354, + "loss": 1.4489, + "step": 8128 + }, + { + "epoch": 0.291116801260587, + "grad_norm": 1.9593605995178223, + "learning_rate": 0.0001663219490225493, + "loss": 1.4183, + "step": 8129 + }, + { + "epoch": 0.29115261339015525, + "grad_norm": 1.6916444301605225, + "learning_rate": 0.0001663132675857388, + "loss": 1.8659, + "step": 8130 + }, + { + "epoch": 0.2911884255197235, + "grad_norm": 1.6571805477142334, + "learning_rate": 0.0001663045852567688, + "loss": 1.648, + "step": 8131 + }, + { + "epoch": 0.29122423764929184, + "grad_norm": 1.6843384504318237, + "learning_rate": 0.00016629590203575613, + "loss": 1.2479, + "step": 8132 + }, + { + "epoch": 0.2912600497788601, + "grad_norm": 1.5704199075698853, + "learning_rate": 0.0001662872179228176, + "loss": 1.6044, + "step": 8133 + }, + { + "epoch": 0.2912958619084284, + "grad_norm": 2.2352945804595947, + "learning_rate": 0.0001662785329180701, + "loss": 1.6788, + "step": 8134 + }, + { + "epoch": 0.29133167403799665, + "grad_norm": 1.5743358135223389, + "learning_rate": 0.0001662698470216304, + "loss": 1.4598, + "step": 8135 + }, + { + "epoch": 0.29136748616756497, + "grad_norm": 1.7981449365615845, + "learning_rate": 0.0001662611602336154, + "loss": 1.5804, + "step": 8136 + }, + { + "epoch": 0.29140329829713324, + "grad_norm": 1.3479498624801636, + "learning_rate": 0.00016625247255414198, + "loss": 1.7092, + "step": 8137 + }, + { + "epoch": 0.2914391104267015, + "grad_norm": 1.1672223806381226, + "learning_rate": 0.000166243783983327, + "loss": 1.3365, + "step": 8138 + }, + { + "epoch": 0.29147492255626983, + "grad_norm": 1.7496581077575684, + "learning_rate": 0.00016623509452128732, + "loss": 1.5851, + "step": 8139 + }, + { + "epoch": 0.2915107346858381, + "grad_norm": 1.4864665269851685, + "learning_rate": 0.00016622640416813988, + "loss": 1.586, + "step": 8140 + }, + { + "epoch": 0.29154654681540637, + "grad_norm": 1.4444739818572998, + "learning_rate": 0.00016621771292400162, + "loss": 1.5328, + "step": 8141 + }, + { + "epoch": 0.29158235894497464, + "grad_norm": 1.6213339567184448, + "learning_rate": 0.00016620902078898943, + "loss": 1.2968, + "step": 8142 + }, + { + "epoch": 0.29161817107454296, + "grad_norm": 1.5538610219955444, + "learning_rate": 0.0001662003277632203, + "loss": 1.5661, + "step": 8143 + }, + { + "epoch": 0.29165398320411123, + "grad_norm": 2.134049654006958, + "learning_rate": 0.0001661916338468111, + "loss": 1.399, + "step": 8144 + }, + { + "epoch": 0.2916897953336795, + "grad_norm": 2.0003087520599365, + "learning_rate": 0.00016618293903987888, + "loss": 1.6424, + "step": 8145 + }, + { + "epoch": 0.2917256074632478, + "grad_norm": 2.1988790035247803, + "learning_rate": 0.00016617424334254061, + "loss": 1.5365, + "step": 8146 + }, + { + "epoch": 0.2917614195928161, + "grad_norm": 1.5732612609863281, + "learning_rate": 0.00016616554675491325, + "loss": 1.8022, + "step": 8147 + }, + { + "epoch": 0.29179723172238436, + "grad_norm": 1.4355159997940063, + "learning_rate": 0.00016615684927711376, + "loss": 1.4396, + "step": 8148 + }, + { + "epoch": 0.29183304385195263, + "grad_norm": 1.6861153841018677, + "learning_rate": 0.00016614815090925923, + "loss": 1.5059, + "step": 8149 + }, + { + "epoch": 0.29186885598152096, + "grad_norm": 1.6933960914611816, + "learning_rate": 0.00016613945165146668, + "loss": 1.6366, + "step": 8150 + }, + { + "epoch": 0.2919046681110892, + "grad_norm": 1.724313735961914, + "learning_rate": 0.00016613075150385308, + "loss": 1.3966, + "step": 8151 + }, + { + "epoch": 0.2919404802406575, + "grad_norm": 1.6139729022979736, + "learning_rate": 0.00016612205046653554, + "loss": 1.5069, + "step": 8152 + }, + { + "epoch": 0.2919762923702258, + "grad_norm": 1.470168113708496, + "learning_rate": 0.00016611334853963106, + "loss": 1.2152, + "step": 8153 + }, + { + "epoch": 0.2920121044997941, + "grad_norm": 1.4632880687713623, + "learning_rate": 0.0001661046457232568, + "loss": 1.2306, + "step": 8154 + }, + { + "epoch": 0.29204791662936236, + "grad_norm": 1.752076268196106, + "learning_rate": 0.00016609594201752982, + "loss": 1.6839, + "step": 8155 + }, + { + "epoch": 0.2920837287589306, + "grad_norm": 1.5564721822738647, + "learning_rate": 0.00016608723742256719, + "loss": 1.5811, + "step": 8156 + }, + { + "epoch": 0.29211954088849895, + "grad_norm": 1.4910109043121338, + "learning_rate": 0.00016607853193848597, + "loss": 1.4991, + "step": 8157 + }, + { + "epoch": 0.2921553530180672, + "grad_norm": 1.9942272901535034, + "learning_rate": 0.0001660698255654034, + "loss": 1.6296, + "step": 8158 + }, + { + "epoch": 0.2921911651476355, + "grad_norm": 2.043497323989868, + "learning_rate": 0.0001660611183034365, + "loss": 1.7203, + "step": 8159 + }, + { + "epoch": 0.2922269772772038, + "grad_norm": 1.740036129951477, + "learning_rate": 0.00016605241015270247, + "loss": 1.5992, + "step": 8160 + }, + { + "epoch": 0.2922627894067721, + "grad_norm": 1.7537078857421875, + "learning_rate": 0.0001660437011133185, + "loss": 1.7468, + "step": 8161 + }, + { + "epoch": 0.29229860153634035, + "grad_norm": 1.6161073446273804, + "learning_rate": 0.0001660349911854017, + "loss": 1.6065, + "step": 8162 + }, + { + "epoch": 0.2923344136659086, + "grad_norm": 2.7794902324676514, + "learning_rate": 0.0001660262803690693, + "loss": 1.356, + "step": 8163 + }, + { + "epoch": 0.29237022579547695, + "grad_norm": 1.438673496246338, + "learning_rate": 0.00016601756866443845, + "loss": 1.7374, + "step": 8164 + }, + { + "epoch": 0.2924060379250452, + "grad_norm": 1.6174455881118774, + "learning_rate": 0.00016600885607162636, + "loss": 1.2237, + "step": 8165 + }, + { + "epoch": 0.2924418500546135, + "grad_norm": 1.4131717681884766, + "learning_rate": 0.00016600014259075024, + "loss": 1.6027, + "step": 8166 + }, + { + "epoch": 0.2924776621841818, + "grad_norm": 2.0395007133483887, + "learning_rate": 0.00016599142822192736, + "loss": 1.0541, + "step": 8167 + }, + { + "epoch": 0.2925134743137501, + "grad_norm": 1.4359726905822754, + "learning_rate": 0.00016598271296527494, + "loss": 1.7906, + "step": 8168 + }, + { + "epoch": 0.29254928644331835, + "grad_norm": 1.8879810571670532, + "learning_rate": 0.00016597399682091024, + "loss": 1.2775, + "step": 8169 + }, + { + "epoch": 0.2925850985728866, + "grad_norm": 1.5420466661453247, + "learning_rate": 0.00016596527978895046, + "loss": 1.3398, + "step": 8170 + }, + { + "epoch": 0.29262091070245494, + "grad_norm": 1.2738362550735474, + "learning_rate": 0.00016595656186951297, + "loss": 1.7642, + "step": 8171 + }, + { + "epoch": 0.2926567228320232, + "grad_norm": 1.8173683881759644, + "learning_rate": 0.00016594784306271502, + "loss": 1.6041, + "step": 8172 + }, + { + "epoch": 0.2926925349615915, + "grad_norm": 1.720488429069519, + "learning_rate": 0.00016593912336867393, + "loss": 1.3482, + "step": 8173 + }, + { + "epoch": 0.2927283470911598, + "grad_norm": 2.0132124423980713, + "learning_rate": 0.00016593040278750694, + "loss": 1.5073, + "step": 8174 + }, + { + "epoch": 0.29276415922072807, + "grad_norm": 1.5772103071212769, + "learning_rate": 0.00016592168131933144, + "loss": 1.519, + "step": 8175 + }, + { + "epoch": 0.29279997135029634, + "grad_norm": 2.622490406036377, + "learning_rate": 0.00016591295896426476, + "loss": 1.2288, + "step": 8176 + }, + { + "epoch": 0.2928357834798646, + "grad_norm": 1.7737879753112793, + "learning_rate": 0.00016590423572242422, + "loss": 1.5786, + "step": 8177 + }, + { + "epoch": 0.29287159560943293, + "grad_norm": 1.4796202182769775, + "learning_rate": 0.0001658955115939272, + "loss": 1.3456, + "step": 8178 + }, + { + "epoch": 0.2929074077390012, + "grad_norm": 2.105363368988037, + "learning_rate": 0.00016588678657889112, + "loss": 1.6187, + "step": 8179 + }, + { + "epoch": 0.29294321986856947, + "grad_norm": 1.5410113334655762, + "learning_rate": 0.00016587806067743327, + "loss": 2.0069, + "step": 8180 + }, + { + "epoch": 0.2929790319981378, + "grad_norm": 1.7439554929733276, + "learning_rate": 0.00016586933388967109, + "loss": 1.3726, + "step": 8181 + }, + { + "epoch": 0.29301484412770606, + "grad_norm": 2.0335540771484375, + "learning_rate": 0.000165860606215722, + "loss": 1.5939, + "step": 8182 + }, + { + "epoch": 0.29305065625727433, + "grad_norm": 1.8200868368148804, + "learning_rate": 0.0001658518776557034, + "loss": 1.2275, + "step": 8183 + }, + { + "epoch": 0.2930864683868426, + "grad_norm": 1.4193452596664429, + "learning_rate": 0.00016584314820973273, + "loss": 1.4636, + "step": 8184 + }, + { + "epoch": 0.2931222805164109, + "grad_norm": 1.3906633853912354, + "learning_rate": 0.00016583441787792745, + "loss": 1.3655, + "step": 8185 + }, + { + "epoch": 0.2931580926459792, + "grad_norm": 1.5661295652389526, + "learning_rate": 0.00016582568666040497, + "loss": 1.8314, + "step": 8186 + }, + { + "epoch": 0.29319390477554746, + "grad_norm": 1.2787010669708252, + "learning_rate": 0.0001658169545572828, + "loss": 1.4975, + "step": 8187 + }, + { + "epoch": 0.2932297169051158, + "grad_norm": 1.9857454299926758, + "learning_rate": 0.0001658082215686784, + "loss": 1.3856, + "step": 8188 + }, + { + "epoch": 0.29326552903468406, + "grad_norm": 1.7901902198791504, + "learning_rate": 0.00016579948769470927, + "loss": 1.8455, + "step": 8189 + }, + { + "epoch": 0.2933013411642523, + "grad_norm": 1.7982251644134521, + "learning_rate": 0.00016579075293549292, + "loss": 1.3959, + "step": 8190 + }, + { + "epoch": 0.2933371532938206, + "grad_norm": 1.3840776681900024, + "learning_rate": 0.00016578201729114682, + "loss": 1.5659, + "step": 8191 + }, + { + "epoch": 0.2933729654233889, + "grad_norm": 1.91280996799469, + "learning_rate": 0.00016577328076178855, + "loss": 1.604, + "step": 8192 + }, + { + "epoch": 0.2934087775529572, + "grad_norm": 1.5600473880767822, + "learning_rate": 0.0001657645433475356, + "loss": 1.3662, + "step": 8193 + }, + { + "epoch": 0.29344458968252546, + "grad_norm": 1.603729248046875, + "learning_rate": 0.0001657558050485056, + "loss": 1.3253, + "step": 8194 + }, + { + "epoch": 0.2934804018120938, + "grad_norm": 1.883900761604309, + "learning_rate": 0.00016574706586481607, + "loss": 1.2011, + "step": 8195 + }, + { + "epoch": 0.29351621394166205, + "grad_norm": 2.2101593017578125, + "learning_rate": 0.0001657383257965845, + "loss": 1.7439, + "step": 8196 + }, + { + "epoch": 0.2935520260712303, + "grad_norm": 2.143193006515503, + "learning_rate": 0.0001657295848439286, + "loss": 1.2733, + "step": 8197 + }, + { + "epoch": 0.2935878382007986, + "grad_norm": 1.8261759281158447, + "learning_rate": 0.00016572084300696594, + "loss": 1.6156, + "step": 8198 + }, + { + "epoch": 0.2936236503303669, + "grad_norm": 2.3701932430267334, + "learning_rate": 0.0001657121002858141, + "loss": 1.8107, + "step": 8199 + }, + { + "epoch": 0.2936594624599352, + "grad_norm": 1.553908109664917, + "learning_rate": 0.0001657033566805907, + "loss": 1.2718, + "step": 8200 + }, + { + "epoch": 0.29369527458950345, + "grad_norm": 1.8061621189117432, + "learning_rate": 0.00016569461219141337, + "loss": 1.8474, + "step": 8201 + }, + { + "epoch": 0.2937310867190718, + "grad_norm": 3.017632484436035, + "learning_rate": 0.00016568586681839982, + "loss": 1.5226, + "step": 8202 + }, + { + "epoch": 0.29376689884864005, + "grad_norm": 1.9157863855361938, + "learning_rate": 0.00016567712056166762, + "loss": 1.4055, + "step": 8203 + }, + { + "epoch": 0.2938027109782083, + "grad_norm": 1.4582178592681885, + "learning_rate": 0.0001656683734213345, + "loss": 1.6516, + "step": 8204 + }, + { + "epoch": 0.2938385231077766, + "grad_norm": 1.641868233680725, + "learning_rate": 0.00016565962539751808, + "loss": 1.6904, + "step": 8205 + }, + { + "epoch": 0.2938743352373449, + "grad_norm": 1.7872939109802246, + "learning_rate": 0.00016565087649033614, + "loss": 1.3973, + "step": 8206 + }, + { + "epoch": 0.2939101473669132, + "grad_norm": 1.8729043006896973, + "learning_rate": 0.00016564212669990634, + "loss": 1.4836, + "step": 8207 + }, + { + "epoch": 0.29394595949648145, + "grad_norm": 1.4845998287200928, + "learning_rate": 0.00016563337602634642, + "loss": 1.6219, + "step": 8208 + }, + { + "epoch": 0.29398177162604977, + "grad_norm": 1.3829447031021118, + "learning_rate": 0.00016562462446977403, + "loss": 1.5805, + "step": 8209 + }, + { + "epoch": 0.29401758375561804, + "grad_norm": 1.5662988424301147, + "learning_rate": 0.000165615872030307, + "loss": 1.4469, + "step": 8210 + }, + { + "epoch": 0.2940533958851863, + "grad_norm": 1.8096336126327515, + "learning_rate": 0.00016560711870806303, + "loss": 1.6272, + "step": 8211 + }, + { + "epoch": 0.2940892080147546, + "grad_norm": 1.8783751726150513, + "learning_rate": 0.00016559836450315992, + "loss": 1.5174, + "step": 8212 + }, + { + "epoch": 0.2941250201443229, + "grad_norm": 1.8585585355758667, + "learning_rate": 0.00016558960941571543, + "loss": 1.482, + "step": 8213 + }, + { + "epoch": 0.29416083227389117, + "grad_norm": 1.624119758605957, + "learning_rate": 0.00016558085344584736, + "loss": 1.2782, + "step": 8214 + }, + { + "epoch": 0.29419664440345944, + "grad_norm": 1.2839974164962769, + "learning_rate": 0.00016557209659367347, + "loss": 1.5674, + "step": 8215 + }, + { + "epoch": 0.29423245653302776, + "grad_norm": 2.2805612087249756, + "learning_rate": 0.00016556333885931162, + "loss": 1.6619, + "step": 8216 + }, + { + "epoch": 0.29426826866259603, + "grad_norm": 1.5599788427352905, + "learning_rate": 0.00016555458024287964, + "loss": 1.7332, + "step": 8217 + }, + { + "epoch": 0.2943040807921643, + "grad_norm": 2.071613311767578, + "learning_rate": 0.0001655458207444953, + "loss": 1.7957, + "step": 8218 + }, + { + "epoch": 0.29433989292173257, + "grad_norm": 1.492445945739746, + "learning_rate": 0.0001655370603642765, + "loss": 1.5455, + "step": 8219 + }, + { + "epoch": 0.2943757050513009, + "grad_norm": 1.4632467031478882, + "learning_rate": 0.0001655282991023411, + "loss": 1.5157, + "step": 8220 + }, + { + "epoch": 0.29441151718086916, + "grad_norm": 2.3645126819610596, + "learning_rate": 0.000165519536958807, + "loss": 1.7148, + "step": 8221 + }, + { + "epoch": 0.29444732931043743, + "grad_norm": 1.6779077053070068, + "learning_rate": 0.000165510773933792, + "loss": 1.3695, + "step": 8222 + }, + { + "epoch": 0.2944831414400057, + "grad_norm": 1.7853635549545288, + "learning_rate": 0.00016550201002741403, + "loss": 1.5484, + "step": 8223 + }, + { + "epoch": 0.294518953569574, + "grad_norm": 2.639172077178955, + "learning_rate": 0.00016549324523979102, + "loss": 1.8244, + "step": 8224 + }, + { + "epoch": 0.2945547656991423, + "grad_norm": 1.921919345855713, + "learning_rate": 0.0001654844795710409, + "loss": 1.7747, + "step": 8225 + }, + { + "epoch": 0.29459057782871056, + "grad_norm": 1.9929426908493042, + "learning_rate": 0.00016547571302128153, + "loss": 1.6522, + "step": 8226 + }, + { + "epoch": 0.2946263899582789, + "grad_norm": 1.6712838411331177, + "learning_rate": 0.00016546694559063093, + "loss": 1.5273, + "step": 8227 + }, + { + "epoch": 0.29466220208784716, + "grad_norm": 1.7256261110305786, + "learning_rate": 0.000165458177279207, + "loss": 1.7917, + "step": 8228 + }, + { + "epoch": 0.2946980142174154, + "grad_norm": 1.6912678480148315, + "learning_rate": 0.00016544940808712775, + "loss": 1.5481, + "step": 8229 + }, + { + "epoch": 0.2947338263469837, + "grad_norm": 1.7609024047851562, + "learning_rate": 0.00016544063801451114, + "loss": 1.7015, + "step": 8230 + }, + { + "epoch": 0.294769638476552, + "grad_norm": 2.914182662963867, + "learning_rate": 0.00016543186706147514, + "loss": 1.8822, + "step": 8231 + }, + { + "epoch": 0.2948054506061203, + "grad_norm": 1.4983017444610596, + "learning_rate": 0.00016542309522813779, + "loss": 1.5277, + "step": 8232 + }, + { + "epoch": 0.29484126273568856, + "grad_norm": 1.6739991903305054, + "learning_rate": 0.00016541432251461705, + "loss": 1.5439, + "step": 8233 + }, + { + "epoch": 0.2948770748652569, + "grad_norm": 1.3391531705856323, + "learning_rate": 0.000165405548921031, + "loss": 1.3658, + "step": 8234 + }, + { + "epoch": 0.29491288699482515, + "grad_norm": 1.941439151763916, + "learning_rate": 0.0001653967744474977, + "loss": 1.2919, + "step": 8235 + }, + { + "epoch": 0.2949486991243934, + "grad_norm": 1.8408081531524658, + "learning_rate": 0.00016538799909413508, + "loss": 1.4787, + "step": 8236 + }, + { + "epoch": 0.2949845112539617, + "grad_norm": 1.8048700094223022, + "learning_rate": 0.00016537922286106134, + "loss": 1.5778, + "step": 8237 + }, + { + "epoch": 0.29502032338353, + "grad_norm": 2.9586312770843506, + "learning_rate": 0.00016537044574839444, + "loss": 1.5999, + "step": 8238 + }, + { + "epoch": 0.2950561355130983, + "grad_norm": 1.534470796585083, + "learning_rate": 0.00016536166775625252, + "loss": 1.7927, + "step": 8239 + }, + { + "epoch": 0.29509194764266655, + "grad_norm": 1.389212965965271, + "learning_rate": 0.0001653528888847537, + "loss": 1.4612, + "step": 8240 + }, + { + "epoch": 0.2951277597722349, + "grad_norm": 1.5178078413009644, + "learning_rate": 0.00016534410913401603, + "loss": 1.491, + "step": 8241 + }, + { + "epoch": 0.29516357190180315, + "grad_norm": 2.1112802028656006, + "learning_rate": 0.0001653353285041577, + "loss": 1.4394, + "step": 8242 + }, + { + "epoch": 0.2951993840313714, + "grad_norm": 1.57144296169281, + "learning_rate": 0.00016532654699529678, + "loss": 1.7182, + "step": 8243 + }, + { + "epoch": 0.2952351961609397, + "grad_norm": 1.8326722383499146, + "learning_rate": 0.00016531776460755143, + "loss": 1.4076, + "step": 8244 + }, + { + "epoch": 0.295271008290508, + "grad_norm": 1.9306200742721558, + "learning_rate": 0.0001653089813410398, + "loss": 1.6902, + "step": 8245 + }, + { + "epoch": 0.2953068204200763, + "grad_norm": 1.3576823472976685, + "learning_rate": 0.00016530019719588007, + "loss": 1.4347, + "step": 8246 + }, + { + "epoch": 0.29534263254964455, + "grad_norm": 1.4128303527832031, + "learning_rate": 0.00016529141217219045, + "loss": 1.6012, + "step": 8247 + }, + { + "epoch": 0.29537844467921287, + "grad_norm": 1.5558239221572876, + "learning_rate": 0.00016528262627008906, + "loss": 1.4238, + "step": 8248 + }, + { + "epoch": 0.29541425680878114, + "grad_norm": 1.8013745546340942, + "learning_rate": 0.00016527383948969416, + "loss": 1.7537, + "step": 8249 + }, + { + "epoch": 0.2954500689383494, + "grad_norm": 1.5508323907852173, + "learning_rate": 0.00016526505183112394, + "loss": 1.6048, + "step": 8250 + }, + { + "epoch": 0.2954858810679177, + "grad_norm": 1.7320663928985596, + "learning_rate": 0.00016525626329449668, + "loss": 1.7845, + "step": 8251 + }, + { + "epoch": 0.295521693197486, + "grad_norm": 1.934444785118103, + "learning_rate": 0.0001652474738799305, + "loss": 1.6607, + "step": 8252 + }, + { + "epoch": 0.29555750532705427, + "grad_norm": 1.4806501865386963, + "learning_rate": 0.00016523868358754378, + "loss": 1.5327, + "step": 8253 + }, + { + "epoch": 0.29559331745662254, + "grad_norm": 2.4541170597076416, + "learning_rate": 0.00016522989241745469, + "loss": 1.4625, + "step": 8254 + }, + { + "epoch": 0.29562912958619086, + "grad_norm": 1.7268229722976685, + "learning_rate": 0.00016522110036978153, + "loss": 1.3799, + "step": 8255 + }, + { + "epoch": 0.29566494171575913, + "grad_norm": 2.0476462841033936, + "learning_rate": 0.0001652123074446426, + "loss": 1.7806, + "step": 8256 + }, + { + "epoch": 0.2957007538453274, + "grad_norm": 1.8949542045593262, + "learning_rate": 0.00016520351364215623, + "loss": 1.8599, + "step": 8257 + }, + { + "epoch": 0.29573656597489567, + "grad_norm": 1.6571156978607178, + "learning_rate": 0.00016519471896244063, + "loss": 1.5826, + "step": 8258 + }, + { + "epoch": 0.295772378104464, + "grad_norm": 1.580122947692871, + "learning_rate": 0.00016518592340561422, + "loss": 1.45, + "step": 8259 + }, + { + "epoch": 0.29580819023403226, + "grad_norm": 1.4408222436904907, + "learning_rate": 0.0001651771269717953, + "loss": 1.3734, + "step": 8260 + }, + { + "epoch": 0.29584400236360053, + "grad_norm": 1.6053831577301025, + "learning_rate": 0.0001651683296611022, + "loss": 1.7505, + "step": 8261 + }, + { + "epoch": 0.29587981449316886, + "grad_norm": 1.855286717414856, + "learning_rate": 0.0001651595314736533, + "loss": 1.4902, + "step": 8262 + }, + { + "epoch": 0.2959156266227371, + "grad_norm": 2.0580155849456787, + "learning_rate": 0.00016515073240956692, + "loss": 1.6515, + "step": 8263 + }, + { + "epoch": 0.2959514387523054, + "grad_norm": 1.4917335510253906, + "learning_rate": 0.0001651419324689615, + "loss": 1.5483, + "step": 8264 + }, + { + "epoch": 0.29598725088187366, + "grad_norm": 1.6483070850372314, + "learning_rate": 0.00016513313165195538, + "loss": 1.4299, + "step": 8265 + }, + { + "epoch": 0.296023063011442, + "grad_norm": 1.8228284120559692, + "learning_rate": 0.00016512432995866702, + "loss": 1.4021, + "step": 8266 + }, + { + "epoch": 0.29605887514101026, + "grad_norm": 1.4714165925979614, + "learning_rate": 0.00016511552738921479, + "loss": 1.7201, + "step": 8267 + }, + { + "epoch": 0.2960946872705785, + "grad_norm": 1.59091317653656, + "learning_rate": 0.0001651067239437171, + "loss": 1.686, + "step": 8268 + }, + { + "epoch": 0.29613049940014685, + "grad_norm": 1.409410834312439, + "learning_rate": 0.00016509791962229247, + "loss": 1.5451, + "step": 8269 + }, + { + "epoch": 0.2961663115297151, + "grad_norm": 1.9544563293457031, + "learning_rate": 0.0001650891144250593, + "loss": 1.6566, + "step": 8270 + }, + { + "epoch": 0.2962021236592834, + "grad_norm": 1.4585261344909668, + "learning_rate": 0.00016508030835213605, + "loss": 1.4464, + "step": 8271 + }, + { + "epoch": 0.29623793578885166, + "grad_norm": 1.7657428979873657, + "learning_rate": 0.00016507150140364116, + "loss": 1.4297, + "step": 8272 + }, + { + "epoch": 0.29627374791842, + "grad_norm": 1.381548285484314, + "learning_rate": 0.0001650626935796932, + "loss": 1.5166, + "step": 8273 + }, + { + "epoch": 0.29630956004798825, + "grad_norm": 1.7373437881469727, + "learning_rate": 0.00016505388488041058, + "loss": 1.5385, + "step": 8274 + }, + { + "epoch": 0.2963453721775565, + "grad_norm": 1.6844048500061035, + "learning_rate": 0.0001650450753059119, + "loss": 1.4545, + "step": 8275 + }, + { + "epoch": 0.29638118430712485, + "grad_norm": 1.7241437435150146, + "learning_rate": 0.00016503626485631561, + "loss": 1.5535, + "step": 8276 + }, + { + "epoch": 0.2964169964366931, + "grad_norm": 1.5121371746063232, + "learning_rate": 0.00016502745353174026, + "loss": 1.7588, + "step": 8277 + }, + { + "epoch": 0.2964528085662614, + "grad_norm": 2.0365071296691895, + "learning_rate": 0.0001650186413323044, + "loss": 1.7113, + "step": 8278 + }, + { + "epoch": 0.29648862069582965, + "grad_norm": 1.6844797134399414, + "learning_rate": 0.0001650098282581266, + "loss": 1.3356, + "step": 8279 + }, + { + "epoch": 0.296524432825398, + "grad_norm": 1.9010940790176392, + "learning_rate": 0.00016500101430932541, + "loss": 1.7213, + "step": 8280 + }, + { + "epoch": 0.29656024495496625, + "grad_norm": 1.565537452697754, + "learning_rate": 0.00016499219948601943, + "loss": 1.6433, + "step": 8281 + }, + { + "epoch": 0.2965960570845345, + "grad_norm": 1.7080549001693726, + "learning_rate": 0.00016498338378832724, + "loss": 1.5829, + "step": 8282 + }, + { + "epoch": 0.29663186921410284, + "grad_norm": 1.454037070274353, + "learning_rate": 0.00016497456721636743, + "loss": 1.5696, + "step": 8283 + }, + { + "epoch": 0.2966676813436711, + "grad_norm": 1.5883538722991943, + "learning_rate": 0.00016496574977025862, + "loss": 1.5431, + "step": 8284 + }, + { + "epoch": 0.2967034934732394, + "grad_norm": 1.2771259546279907, + "learning_rate": 0.00016495693145011947, + "loss": 1.2164, + "step": 8285 + }, + { + "epoch": 0.29673930560280765, + "grad_norm": 1.6104718446731567, + "learning_rate": 0.00016494811225606858, + "loss": 1.3776, + "step": 8286 + }, + { + "epoch": 0.29677511773237597, + "grad_norm": 1.967247724533081, + "learning_rate": 0.00016493929218822467, + "loss": 1.2856, + "step": 8287 + }, + { + "epoch": 0.29681092986194424, + "grad_norm": 1.7795873880386353, + "learning_rate": 0.0001649304712467063, + "loss": 1.4519, + "step": 8288 + }, + { + "epoch": 0.2968467419915125, + "grad_norm": 2.0310049057006836, + "learning_rate": 0.00016492164943163217, + "loss": 1.5672, + "step": 8289 + }, + { + "epoch": 0.29688255412108083, + "grad_norm": 1.3638725280761719, + "learning_rate": 0.00016491282674312103, + "loss": 1.5978, + "step": 8290 + }, + { + "epoch": 0.2969183662506491, + "grad_norm": 1.5186011791229248, + "learning_rate": 0.00016490400318129153, + "loss": 1.3685, + "step": 8291 + }, + { + "epoch": 0.29695417838021737, + "grad_norm": 1.4526851177215576, + "learning_rate": 0.0001648951787462624, + "loss": 1.5908, + "step": 8292 + }, + { + "epoch": 0.29698999050978564, + "grad_norm": 2.9446048736572266, + "learning_rate": 0.0001648863534381523, + "loss": 1.8091, + "step": 8293 + }, + { + "epoch": 0.29702580263935396, + "grad_norm": 1.383347511291504, + "learning_rate": 0.00016487752725708005, + "loss": 1.5765, + "step": 8294 + }, + { + "epoch": 0.29706161476892223, + "grad_norm": 1.7374597787857056, + "learning_rate": 0.00016486870020316437, + "loss": 1.4451, + "step": 8295 + }, + { + "epoch": 0.2970974268984905, + "grad_norm": 1.5283315181732178, + "learning_rate": 0.000164859872276524, + "loss": 1.5174, + "step": 8296 + }, + { + "epoch": 0.2971332390280588, + "grad_norm": 1.4629818201065063, + "learning_rate": 0.0001648510434772777, + "loss": 1.5145, + "step": 8297 + }, + { + "epoch": 0.2971690511576271, + "grad_norm": 1.7839181423187256, + "learning_rate": 0.00016484221380554424, + "loss": 1.4526, + "step": 8298 + }, + { + "epoch": 0.29720486328719536, + "grad_norm": 1.8995558023452759, + "learning_rate": 0.00016483338326144244, + "loss": 1.5883, + "step": 8299 + }, + { + "epoch": 0.29724067541676363, + "grad_norm": 1.3919588327407837, + "learning_rate": 0.0001648245518450911, + "loss": 1.2238, + "step": 8300 + }, + { + "epoch": 0.29727648754633196, + "grad_norm": 1.8809860944747925, + "learning_rate": 0.00016481571955660903, + "loss": 1.6602, + "step": 8301 + }, + { + "epoch": 0.2973122996759002, + "grad_norm": 1.5680149793624878, + "learning_rate": 0.0001648068863961151, + "loss": 1.4787, + "step": 8302 + }, + { + "epoch": 0.2973481118054685, + "grad_norm": 1.533979892730713, + "learning_rate": 0.00016479805236372806, + "loss": 1.3545, + "step": 8303 + }, + { + "epoch": 0.2973839239350368, + "grad_norm": 1.733001708984375, + "learning_rate": 0.00016478921745956686, + "loss": 1.7839, + "step": 8304 + }, + { + "epoch": 0.2974197360646051, + "grad_norm": 2.160454750061035, + "learning_rate": 0.00016478038168375028, + "loss": 1.7761, + "step": 8305 + }, + { + "epoch": 0.29745554819417336, + "grad_norm": 1.3565226793289185, + "learning_rate": 0.00016477154503639723, + "loss": 1.5211, + "step": 8306 + }, + { + "epoch": 0.2974913603237416, + "grad_norm": 1.4650872945785522, + "learning_rate": 0.00016476270751762656, + "loss": 1.1181, + "step": 8307 + }, + { + "epoch": 0.29752717245330995, + "grad_norm": 1.9036059379577637, + "learning_rate": 0.00016475386912755724, + "loss": 1.4345, + "step": 8308 + }, + { + "epoch": 0.2975629845828782, + "grad_norm": 2.0247349739074707, + "learning_rate": 0.0001647450298663081, + "loss": 1.7541, + "step": 8309 + }, + { + "epoch": 0.2975987967124465, + "grad_norm": 1.7385376691818237, + "learning_rate": 0.00016473618973399811, + "loss": 1.5686, + "step": 8310 + }, + { + "epoch": 0.2976346088420148, + "grad_norm": 1.8126300573349, + "learning_rate": 0.00016472734873074622, + "loss": 1.8558, + "step": 8311 + }, + { + "epoch": 0.2976704209715831, + "grad_norm": 1.5259616374969482, + "learning_rate": 0.00016471850685667133, + "loss": 1.4215, + "step": 8312 + }, + { + "epoch": 0.29770623310115135, + "grad_norm": 1.6709272861480713, + "learning_rate": 0.0001647096641118924, + "loss": 1.5873, + "step": 8313 + }, + { + "epoch": 0.2977420452307196, + "grad_norm": 1.7461010217666626, + "learning_rate": 0.00016470082049652843, + "loss": 1.579, + "step": 8314 + }, + { + "epoch": 0.29777785736028795, + "grad_norm": 1.4879776239395142, + "learning_rate": 0.00016469197601069838, + "loss": 1.4497, + "step": 8315 + }, + { + "epoch": 0.2978136694898562, + "grad_norm": 1.4053375720977783, + "learning_rate": 0.00016468313065452121, + "loss": 1.5219, + "step": 8316 + }, + { + "epoch": 0.2978494816194245, + "grad_norm": 1.5887116193771362, + "learning_rate": 0.00016467428442811595, + "loss": 1.3375, + "step": 8317 + }, + { + "epoch": 0.2978852937489928, + "grad_norm": 2.0717251300811768, + "learning_rate": 0.00016466543733160163, + "loss": 1.7047, + "step": 8318 + }, + { + "epoch": 0.2979211058785611, + "grad_norm": 1.4569612741470337, + "learning_rate": 0.00016465658936509726, + "loss": 1.4269, + "step": 8319 + }, + { + "epoch": 0.29795691800812935, + "grad_norm": 1.4222359657287598, + "learning_rate": 0.0001646477405287219, + "loss": 1.5088, + "step": 8320 + }, + { + "epoch": 0.2979927301376976, + "grad_norm": 1.5125815868377686, + "learning_rate": 0.00016463889082259456, + "loss": 1.5915, + "step": 8321 + }, + { + "epoch": 0.29802854226726594, + "grad_norm": 1.9704197645187378, + "learning_rate": 0.00016463004024683432, + "loss": 1.6187, + "step": 8322 + }, + { + "epoch": 0.2980643543968342, + "grad_norm": 1.6360664367675781, + "learning_rate": 0.0001646211888015603, + "loss": 1.3247, + "step": 8323 + }, + { + "epoch": 0.2981001665264025, + "grad_norm": 1.4352376461029053, + "learning_rate": 0.0001646123364868915, + "loss": 1.3175, + "step": 8324 + }, + { + "epoch": 0.2981359786559708, + "grad_norm": 1.6311535835266113, + "learning_rate": 0.00016460348330294704, + "loss": 1.2099, + "step": 8325 + }, + { + "epoch": 0.29817179078553907, + "grad_norm": 1.5380768775939941, + "learning_rate": 0.00016459462924984605, + "loss": 1.782, + "step": 8326 + }, + { + "epoch": 0.29820760291510734, + "grad_norm": 1.4307458400726318, + "learning_rate": 0.00016458577432770766, + "loss": 1.5807, + "step": 8327 + }, + { + "epoch": 0.2982434150446756, + "grad_norm": 1.925248384475708, + "learning_rate": 0.000164576918536651, + "loss": 1.4601, + "step": 8328 + }, + { + "epoch": 0.29827922717424393, + "grad_norm": 1.3756663799285889, + "learning_rate": 0.0001645680618767952, + "loss": 1.487, + "step": 8329 + }, + { + "epoch": 0.2983150393038122, + "grad_norm": 1.7213997840881348, + "learning_rate": 0.00016455920434825936, + "loss": 1.681, + "step": 8330 + }, + { + "epoch": 0.29835085143338047, + "grad_norm": 1.3830705881118774, + "learning_rate": 0.00016455034595116278, + "loss": 1.3369, + "step": 8331 + }, + { + "epoch": 0.2983866635629488, + "grad_norm": 1.7282135486602783, + "learning_rate": 0.00016454148668562454, + "loss": 1.1979, + "step": 8332 + }, + { + "epoch": 0.29842247569251706, + "grad_norm": 2.0980710983276367, + "learning_rate": 0.0001645326265517638, + "loss": 1.4047, + "step": 8333 + }, + { + "epoch": 0.29845828782208533, + "grad_norm": 1.5762666463851929, + "learning_rate": 0.00016452376554969983, + "loss": 1.7234, + "step": 8334 + }, + { + "epoch": 0.2984940999516536, + "grad_norm": 1.630379319190979, + "learning_rate": 0.00016451490367955183, + "loss": 1.5907, + "step": 8335 + }, + { + "epoch": 0.2985299120812219, + "grad_norm": 2.5405523777008057, + "learning_rate": 0.00016450604094143904, + "loss": 1.6051, + "step": 8336 + }, + { + "epoch": 0.2985657242107902, + "grad_norm": 1.3858790397644043, + "learning_rate": 0.00016449717733548066, + "loss": 1.5432, + "step": 8337 + }, + { + "epoch": 0.29860153634035846, + "grad_norm": 1.749637246131897, + "learning_rate": 0.00016448831286179595, + "loss": 1.6572, + "step": 8338 + }, + { + "epoch": 0.2986373484699268, + "grad_norm": 1.4837186336517334, + "learning_rate": 0.00016447944752050417, + "loss": 1.552, + "step": 8339 + }, + { + "epoch": 0.29867316059949506, + "grad_norm": 1.4648246765136719, + "learning_rate": 0.00016447058131172462, + "loss": 1.4813, + "step": 8340 + }, + { + "epoch": 0.2987089727290633, + "grad_norm": 1.9088497161865234, + "learning_rate": 0.00016446171423557652, + "loss": 1.6785, + "step": 8341 + }, + { + "epoch": 0.2987447848586316, + "grad_norm": 1.8436561822891235, + "learning_rate": 0.00016445284629217923, + "loss": 1.6028, + "step": 8342 + }, + { + "epoch": 0.2987805969881999, + "grad_norm": 1.7736629247665405, + "learning_rate": 0.00016444397748165205, + "loss": 1.672, + "step": 8343 + }, + { + "epoch": 0.2988164091177682, + "grad_norm": 1.4750014543533325, + "learning_rate": 0.00016443510780411423, + "loss": 1.5656, + "step": 8344 + }, + { + "epoch": 0.29885222124733646, + "grad_norm": 2.1682565212249756, + "learning_rate": 0.0001644262372596852, + "loss": 1.507, + "step": 8345 + }, + { + "epoch": 0.2988880333769048, + "grad_norm": 1.6554548740386963, + "learning_rate": 0.00016441736584848422, + "loss": 1.4692, + "step": 8346 + }, + { + "epoch": 0.29892384550647305, + "grad_norm": 1.3574022054672241, + "learning_rate": 0.0001644084935706307, + "loss": 1.4869, + "step": 8347 + }, + { + "epoch": 0.2989596576360413, + "grad_norm": 1.7374560832977295, + "learning_rate": 0.00016439962042624396, + "loss": 1.7313, + "step": 8348 + }, + { + "epoch": 0.2989954697656096, + "grad_norm": 1.4507265090942383, + "learning_rate": 0.0001643907464154434, + "loss": 1.569, + "step": 8349 + }, + { + "epoch": 0.2990312818951779, + "grad_norm": 1.6869750022888184, + "learning_rate": 0.00016438187153834842, + "loss": 1.5569, + "step": 8350 + }, + { + "epoch": 0.2990670940247462, + "grad_norm": 1.4608795642852783, + "learning_rate": 0.0001643729957950784, + "loss": 1.5137, + "step": 8351 + }, + { + "epoch": 0.29910290615431445, + "grad_norm": 1.5899919271469116, + "learning_rate": 0.00016436411918575275, + "loss": 1.4346, + "step": 8352 + }, + { + "epoch": 0.2991387182838828, + "grad_norm": 1.7465403079986572, + "learning_rate": 0.00016435524171049088, + "loss": 1.8228, + "step": 8353 + }, + { + "epoch": 0.29917453041345105, + "grad_norm": 1.4676436185836792, + "learning_rate": 0.00016434636336941228, + "loss": 1.9146, + "step": 8354 + }, + { + "epoch": 0.2992103425430193, + "grad_norm": 1.4257675409317017, + "learning_rate": 0.00016433748416263633, + "loss": 1.7837, + "step": 8355 + }, + { + "epoch": 0.2992461546725876, + "grad_norm": 2.4013359546661377, + "learning_rate": 0.00016432860409028253, + "loss": 1.7198, + "step": 8356 + }, + { + "epoch": 0.2992819668021559, + "grad_norm": 1.3581948280334473, + "learning_rate": 0.00016431972315247037, + "loss": 1.3561, + "step": 8357 + }, + { + "epoch": 0.2993177789317242, + "grad_norm": 1.4917012453079224, + "learning_rate": 0.00016431084134931927, + "loss": 1.789, + "step": 8358 + }, + { + "epoch": 0.29935359106129245, + "grad_norm": 1.5227102041244507, + "learning_rate": 0.00016430195868094875, + "loss": 1.6315, + "step": 8359 + }, + { + "epoch": 0.29938940319086077, + "grad_norm": 1.2591630220413208, + "learning_rate": 0.00016429307514747834, + "loss": 1.6123, + "step": 8360 + }, + { + "epoch": 0.29942521532042904, + "grad_norm": 1.3538967370986938, + "learning_rate": 0.00016428419074902752, + "loss": 1.4282, + "step": 8361 + }, + { + "epoch": 0.2994610274499973, + "grad_norm": 2.2829110622406006, + "learning_rate": 0.00016427530548571585, + "loss": 1.5408, + "step": 8362 + }, + { + "epoch": 0.2994968395795656, + "grad_norm": 1.5202713012695312, + "learning_rate": 0.00016426641935766284, + "loss": 1.5582, + "step": 8363 + }, + { + "epoch": 0.2995326517091339, + "grad_norm": 1.6621960401535034, + "learning_rate": 0.00016425753236498807, + "loss": 1.3942, + "step": 8364 + }, + { + "epoch": 0.29956846383870217, + "grad_norm": 1.912682294845581, + "learning_rate": 0.00016424864450781108, + "loss": 1.6684, + "step": 8365 + }, + { + "epoch": 0.29960427596827044, + "grad_norm": 1.3282166719436646, + "learning_rate": 0.00016423975578625142, + "loss": 1.5807, + "step": 8366 + }, + { + "epoch": 0.29964008809783876, + "grad_norm": 1.7995824813842773, + "learning_rate": 0.00016423086620042879, + "loss": 1.4495, + "step": 8367 + }, + { + "epoch": 0.29967590022740703, + "grad_norm": 1.258223056793213, + "learning_rate": 0.00016422197575046265, + "loss": 1.7473, + "step": 8368 + }, + { + "epoch": 0.2997117123569753, + "grad_norm": 1.346404790878296, + "learning_rate": 0.00016421308443647265, + "loss": 1.4512, + "step": 8369 + }, + { + "epoch": 0.29974752448654357, + "grad_norm": 1.8389484882354736, + "learning_rate": 0.00016420419225857846, + "loss": 1.5317, + "step": 8370 + }, + { + "epoch": 0.2997833366161119, + "grad_norm": 1.1908183097839355, + "learning_rate": 0.00016419529921689967, + "loss": 1.2523, + "step": 8371 + }, + { + "epoch": 0.29981914874568016, + "grad_norm": 1.4290704727172852, + "learning_rate": 0.00016418640531155597, + "loss": 1.552, + "step": 8372 + }, + { + "epoch": 0.29985496087524843, + "grad_norm": 1.5900216102600098, + "learning_rate": 0.00016417751054266692, + "loss": 1.4178, + "step": 8373 + }, + { + "epoch": 0.29989077300481676, + "grad_norm": 1.5539711713790894, + "learning_rate": 0.00016416861491035228, + "loss": 1.1549, + "step": 8374 + }, + { + "epoch": 0.299926585134385, + "grad_norm": 1.5073728561401367, + "learning_rate": 0.0001641597184147317, + "loss": 1.4495, + "step": 8375 + }, + { + "epoch": 0.2999623972639533, + "grad_norm": 1.420148491859436, + "learning_rate": 0.0001641508210559249, + "loss": 1.381, + "step": 8376 + }, + { + "epoch": 0.29999820939352156, + "grad_norm": 1.7865599393844604, + "learning_rate": 0.00016414192283405147, + "loss": 1.3243, + "step": 8377 + }, + { + "epoch": 0.3000340215230899, + "grad_norm": 2.0387165546417236, + "learning_rate": 0.00016413302374923124, + "loss": 1.6688, + "step": 8378 + }, + { + "epoch": 0.30006983365265816, + "grad_norm": 1.510023832321167, + "learning_rate": 0.00016412412380158392, + "loss": 1.726, + "step": 8379 + }, + { + "epoch": 0.3001056457822264, + "grad_norm": 1.5913726091384888, + "learning_rate": 0.00016411522299122924, + "loss": 1.7165, + "step": 8380 + }, + { + "epoch": 0.30014145791179475, + "grad_norm": 1.7233575582504272, + "learning_rate": 0.0001641063213182869, + "loss": 1.5722, + "step": 8381 + }, + { + "epoch": 0.300177270041363, + "grad_norm": 1.4803311824798584, + "learning_rate": 0.00016409741878287671, + "loss": 1.2891, + "step": 8382 + }, + { + "epoch": 0.3002130821709313, + "grad_norm": 2.3403327465057373, + "learning_rate": 0.00016408851538511846, + "loss": 1.494, + "step": 8383 + }, + { + "epoch": 0.30024889430049956, + "grad_norm": 1.5356950759887695, + "learning_rate": 0.0001640796111251319, + "loss": 1.2206, + "step": 8384 + }, + { + "epoch": 0.3002847064300679, + "grad_norm": 1.619539737701416, + "learning_rate": 0.0001640707060030368, + "loss": 1.5673, + "step": 8385 + }, + { + "epoch": 0.30032051855963615, + "grad_norm": 1.3473477363586426, + "learning_rate": 0.00016406180001895298, + "loss": 1.7308, + "step": 8386 + }, + { + "epoch": 0.3003563306892044, + "grad_norm": 1.37346351146698, + "learning_rate": 0.00016405289317300033, + "loss": 1.307, + "step": 8387 + }, + { + "epoch": 0.30039214281877274, + "grad_norm": 1.9907050132751465, + "learning_rate": 0.00016404398546529859, + "loss": 1.7465, + "step": 8388 + }, + { + "epoch": 0.300427954948341, + "grad_norm": 1.3930773735046387, + "learning_rate": 0.00016403507689596763, + "loss": 1.4954, + "step": 8389 + }, + { + "epoch": 0.3004637670779093, + "grad_norm": 2.656479835510254, + "learning_rate": 0.0001640261674651273, + "loss": 1.4997, + "step": 8390 + }, + { + "epoch": 0.30049957920747755, + "grad_norm": 2.1430234909057617, + "learning_rate": 0.0001640172571728975, + "loss": 1.6448, + "step": 8391 + }, + { + "epoch": 0.3005353913370459, + "grad_norm": 1.5094202756881714, + "learning_rate": 0.0001640083460193981, + "loss": 1.3046, + "step": 8392 + }, + { + "epoch": 0.30057120346661415, + "grad_norm": 1.5384578704833984, + "learning_rate": 0.00016399943400474895, + "loss": 1.3601, + "step": 8393 + }, + { + "epoch": 0.3006070155961824, + "grad_norm": 1.3882454633712769, + "learning_rate": 0.00016399052112906994, + "loss": 1.3568, + "step": 8394 + }, + { + "epoch": 0.30064282772575074, + "grad_norm": 1.2879197597503662, + "learning_rate": 0.00016398160739248104, + "loss": 1.1968, + "step": 8395 + }, + { + "epoch": 0.300678639855319, + "grad_norm": 1.569235920906067, + "learning_rate": 0.00016397269279510215, + "loss": 1.4976, + "step": 8396 + }, + { + "epoch": 0.3007144519848873, + "grad_norm": 1.489635705947876, + "learning_rate": 0.00016396377733705317, + "loss": 1.4407, + "step": 8397 + }, + { + "epoch": 0.30075026411445555, + "grad_norm": 2.0916008949279785, + "learning_rate": 0.00016395486101845408, + "loss": 1.3873, + "step": 8398 + }, + { + "epoch": 0.30078607624402387, + "grad_norm": 1.9567631483078003, + "learning_rate": 0.00016394594383942486, + "loss": 1.2718, + "step": 8399 + }, + { + "epoch": 0.30082188837359214, + "grad_norm": 2.1201071739196777, + "learning_rate": 0.00016393702580008542, + "loss": 1.3483, + "step": 8400 + }, + { + "epoch": 0.3008577005031604, + "grad_norm": 1.7831958532333374, + "learning_rate": 0.00016392810690055577, + "loss": 1.4788, + "step": 8401 + }, + { + "epoch": 0.30089351263272873, + "grad_norm": 1.9678820371627808, + "learning_rate": 0.00016391918714095592, + "loss": 1.6395, + "step": 8402 + }, + { + "epoch": 0.300929324762297, + "grad_norm": 1.531544804573059, + "learning_rate": 0.00016391026652140585, + "loss": 1.4363, + "step": 8403 + }, + { + "epoch": 0.30096513689186527, + "grad_norm": 2.5241010189056396, + "learning_rate": 0.00016390134504202557, + "loss": 1.3736, + "step": 8404 + }, + { + "epoch": 0.30100094902143354, + "grad_norm": 2.2505462169647217, + "learning_rate": 0.00016389242270293514, + "loss": 1.1755, + "step": 8405 + }, + { + "epoch": 0.30103676115100186, + "grad_norm": 1.7323700189590454, + "learning_rate": 0.00016388349950425456, + "loss": 1.5693, + "step": 8406 + }, + { + "epoch": 0.30107257328057013, + "grad_norm": 1.761555552482605, + "learning_rate": 0.0001638745754461039, + "loss": 1.3442, + "step": 8407 + }, + { + "epoch": 0.3011083854101384, + "grad_norm": 1.8615801334381104, + "learning_rate": 0.00016386565052860323, + "loss": 1.6013, + "step": 8408 + }, + { + "epoch": 0.3011441975397067, + "grad_norm": 1.4850836992263794, + "learning_rate": 0.00016385672475187262, + "loss": 1.4907, + "step": 8409 + }, + { + "epoch": 0.301180009669275, + "grad_norm": 1.6146981716156006, + "learning_rate": 0.00016384779811603214, + "loss": 1.5575, + "step": 8410 + }, + { + "epoch": 0.30121582179884326, + "grad_norm": 1.6145588159561157, + "learning_rate": 0.0001638388706212019, + "loss": 1.4367, + "step": 8411 + }, + { + "epoch": 0.30125163392841153, + "grad_norm": 2.3963756561279297, + "learning_rate": 0.000163829942267502, + "loss": 1.5233, + "step": 8412 + }, + { + "epoch": 0.30128744605797986, + "grad_norm": 1.9880651235580444, + "learning_rate": 0.00016382101305505254, + "loss": 1.676, + "step": 8413 + }, + { + "epoch": 0.3013232581875481, + "grad_norm": 1.5233274698257446, + "learning_rate": 0.0001638120829839737, + "loss": 1.5215, + "step": 8414 + }, + { + "epoch": 0.3013590703171164, + "grad_norm": 1.7098640203475952, + "learning_rate": 0.00016380315205438554, + "loss": 1.3578, + "step": 8415 + }, + { + "epoch": 0.3013948824466847, + "grad_norm": 2.10298490524292, + "learning_rate": 0.00016379422026640831, + "loss": 1.5301, + "step": 8416 + }, + { + "epoch": 0.301430694576253, + "grad_norm": 1.466875433921814, + "learning_rate": 0.00016378528762016218, + "loss": 1.2364, + "step": 8417 + }, + { + "epoch": 0.30146650670582126, + "grad_norm": 1.7537953853607178, + "learning_rate": 0.00016377635411576723, + "loss": 1.5604, + "step": 8418 + }, + { + "epoch": 0.3015023188353895, + "grad_norm": 2.448864221572876, + "learning_rate": 0.00016376741975334368, + "loss": 1.6991, + "step": 8419 + }, + { + "epoch": 0.30153813096495785, + "grad_norm": 2.6458969116210938, + "learning_rate": 0.0001637584845330118, + "loss": 1.9327, + "step": 8420 + }, + { + "epoch": 0.3015739430945261, + "grad_norm": 1.5955549478530884, + "learning_rate": 0.00016374954845489175, + "loss": 1.4872, + "step": 8421 + }, + { + "epoch": 0.3016097552240944, + "grad_norm": 1.0321791172027588, + "learning_rate": 0.00016374061151910372, + "loss": 1.2884, + "step": 8422 + }, + { + "epoch": 0.30164556735366266, + "grad_norm": 1.4548122882843018, + "learning_rate": 0.000163731673725768, + "loss": 1.3905, + "step": 8423 + }, + { + "epoch": 0.301681379483231, + "grad_norm": 1.4599908590316772, + "learning_rate": 0.00016372273507500481, + "loss": 1.637, + "step": 8424 + }, + { + "epoch": 0.30171719161279925, + "grad_norm": 1.4581354856491089, + "learning_rate": 0.00016371379556693442, + "loss": 1.4863, + "step": 8425 + }, + { + "epoch": 0.3017530037423675, + "grad_norm": 1.6916433572769165, + "learning_rate": 0.0001637048552016771, + "loss": 1.3605, + "step": 8426 + }, + { + "epoch": 0.30178881587193584, + "grad_norm": 1.710843801498413, + "learning_rate": 0.00016369591397935314, + "loss": 1.6055, + "step": 8427 + }, + { + "epoch": 0.3018246280015041, + "grad_norm": 1.4895509481430054, + "learning_rate": 0.0001636869719000828, + "loss": 1.4946, + "step": 8428 + }, + { + "epoch": 0.3018604401310724, + "grad_norm": 1.3431249856948853, + "learning_rate": 0.0001636780289639864, + "loss": 1.5975, + "step": 8429 + }, + { + "epoch": 0.30189625226064065, + "grad_norm": 1.9175156354904175, + "learning_rate": 0.00016366908517118428, + "loss": 1.2682, + "step": 8430 + }, + { + "epoch": 0.301932064390209, + "grad_norm": 1.7841920852661133, + "learning_rate": 0.00016366014052179674, + "loss": 1.1883, + "step": 8431 + }, + { + "epoch": 0.30196787651977725, + "grad_norm": 2.296452760696411, + "learning_rate": 0.0001636511950159441, + "loss": 1.4858, + "step": 8432 + }, + { + "epoch": 0.3020036886493455, + "grad_norm": 1.7015984058380127, + "learning_rate": 0.00016364224865374677, + "loss": 1.6448, + "step": 8433 + }, + { + "epoch": 0.30203950077891384, + "grad_norm": 1.5796960592269897, + "learning_rate": 0.00016363330143532508, + "loss": 1.5356, + "step": 8434 + }, + { + "epoch": 0.3020753129084821, + "grad_norm": 1.562314510345459, + "learning_rate": 0.00016362435336079938, + "loss": 1.503, + "step": 8435 + }, + { + "epoch": 0.3021111250380504, + "grad_norm": 1.5993971824645996, + "learning_rate": 0.00016361540443029008, + "loss": 1.7656, + "step": 8436 + }, + { + "epoch": 0.30214693716761865, + "grad_norm": 2.1478962898254395, + "learning_rate": 0.00016360645464391754, + "loss": 1.4395, + "step": 8437 + }, + { + "epoch": 0.30218274929718697, + "grad_norm": 2.219712257385254, + "learning_rate": 0.00016359750400180226, + "loss": 1.7623, + "step": 8438 + }, + { + "epoch": 0.30221856142675524, + "grad_norm": 1.7242192029953003, + "learning_rate": 0.00016358855250406455, + "loss": 1.3607, + "step": 8439 + }, + { + "epoch": 0.3022543735563235, + "grad_norm": 1.4670357704162598, + "learning_rate": 0.0001635796001508249, + "loss": 1.6872, + "step": 8440 + }, + { + "epoch": 0.30229018568589183, + "grad_norm": 1.7151837348937988, + "learning_rate": 0.00016357064694220375, + "loss": 1.7587, + "step": 8441 + }, + { + "epoch": 0.3023259978154601, + "grad_norm": 2.2550930976867676, + "learning_rate": 0.00016356169287832156, + "loss": 1.598, + "step": 8442 + }, + { + "epoch": 0.30236180994502837, + "grad_norm": 1.5630356073379517, + "learning_rate": 0.00016355273795929875, + "loss": 1.6793, + "step": 8443 + }, + { + "epoch": 0.30239762207459664, + "grad_norm": 2.2968051433563232, + "learning_rate": 0.00016354378218525584, + "loss": 1.9349, + "step": 8444 + }, + { + "epoch": 0.30243343420416496, + "grad_norm": 1.6883846521377563, + "learning_rate": 0.00016353482555631334, + "loss": 1.5545, + "step": 8445 + }, + { + "epoch": 0.30246924633373323, + "grad_norm": 1.5997358560562134, + "learning_rate": 0.00016352586807259168, + "loss": 1.9512, + "step": 8446 + }, + { + "epoch": 0.3025050584633015, + "grad_norm": 1.1980454921722412, + "learning_rate": 0.00016351690973421138, + "loss": 1.5382, + "step": 8447 + }, + { + "epoch": 0.3025408705928698, + "grad_norm": 1.3863976001739502, + "learning_rate": 0.00016350795054129305, + "loss": 1.4965, + "step": 8448 + }, + { + "epoch": 0.3025766827224381, + "grad_norm": 1.4191679954528809, + "learning_rate": 0.00016349899049395713, + "loss": 1.6027, + "step": 8449 + }, + { + "epoch": 0.30261249485200636, + "grad_norm": 1.6715854406356812, + "learning_rate": 0.0001634900295923242, + "loss": 1.7932, + "step": 8450 + }, + { + "epoch": 0.30264830698157463, + "grad_norm": 1.6888922452926636, + "learning_rate": 0.00016348106783651482, + "loss": 1.107, + "step": 8451 + }, + { + "epoch": 0.30268411911114296, + "grad_norm": 2.2722463607788086, + "learning_rate": 0.00016347210522664956, + "loss": 1.847, + "step": 8452 + }, + { + "epoch": 0.3027199312407112, + "grad_norm": 1.5517665147781372, + "learning_rate": 0.000163463141762849, + "loss": 1.4461, + "step": 8453 + }, + { + "epoch": 0.3027557433702795, + "grad_norm": 1.6976096630096436, + "learning_rate": 0.00016345417744523374, + "loss": 1.5298, + "step": 8454 + }, + { + "epoch": 0.3027915554998478, + "grad_norm": 1.631230354309082, + "learning_rate": 0.00016344521227392437, + "loss": 1.4179, + "step": 8455 + }, + { + "epoch": 0.3028273676294161, + "grad_norm": 1.2763921022415161, + "learning_rate": 0.00016343624624904151, + "loss": 1.7387, + "step": 8456 + }, + { + "epoch": 0.30286317975898436, + "grad_norm": 2.225421667098999, + "learning_rate": 0.00016342727937070577, + "loss": 1.4409, + "step": 8457 + }, + { + "epoch": 0.3028989918885526, + "grad_norm": 2.22019100189209, + "learning_rate": 0.0001634183116390378, + "loss": 1.9976, + "step": 8458 + }, + { + "epoch": 0.30293480401812095, + "grad_norm": 1.8093554973602295, + "learning_rate": 0.00016340934305415823, + "loss": 1.7154, + "step": 8459 + }, + { + "epoch": 0.3029706161476892, + "grad_norm": 1.4893295764923096, + "learning_rate": 0.00016340037361618778, + "loss": 1.3436, + "step": 8460 + }, + { + "epoch": 0.3030064282772575, + "grad_norm": 1.99727201461792, + "learning_rate": 0.00016339140332524707, + "loss": 1.6495, + "step": 8461 + }, + { + "epoch": 0.3030422404068258, + "grad_norm": 1.6516386270523071, + "learning_rate": 0.0001633824321814568, + "loss": 1.7346, + "step": 8462 + }, + { + "epoch": 0.3030780525363941, + "grad_norm": 1.309701919555664, + "learning_rate": 0.00016337346018493768, + "loss": 1.4401, + "step": 8463 + }, + { + "epoch": 0.30311386466596235, + "grad_norm": 1.5844017267227173, + "learning_rate": 0.00016336448733581037, + "loss": 1.7372, + "step": 8464 + }, + { + "epoch": 0.3031496767955306, + "grad_norm": 1.5818967819213867, + "learning_rate": 0.00016335551363419562, + "loss": 1.5704, + "step": 8465 + }, + { + "epoch": 0.30318548892509894, + "grad_norm": 1.4946575164794922, + "learning_rate": 0.00016334653908021415, + "loss": 1.6854, + "step": 8466 + }, + { + "epoch": 0.3032213010546672, + "grad_norm": 2.2354283332824707, + "learning_rate": 0.00016333756367398674, + "loss": 1.9091, + "step": 8467 + }, + { + "epoch": 0.3032571131842355, + "grad_norm": 1.3000431060791016, + "learning_rate": 0.00016332858741563408, + "loss": 1.4807, + "step": 8468 + }, + { + "epoch": 0.3032929253138038, + "grad_norm": 1.4877946376800537, + "learning_rate": 0.00016331961030527698, + "loss": 1.3538, + "step": 8469 + }, + { + "epoch": 0.3033287374433721, + "grad_norm": 1.6280548572540283, + "learning_rate": 0.00016331063234303618, + "loss": 1.6621, + "step": 8470 + }, + { + "epoch": 0.30336454957294035, + "grad_norm": 1.3985967636108398, + "learning_rate": 0.0001633016535290325, + "loss": 1.2751, + "step": 8471 + }, + { + "epoch": 0.3034003617025086, + "grad_norm": 2.0573692321777344, + "learning_rate": 0.00016329267386338674, + "loss": 1.4004, + "step": 8472 + }, + { + "epoch": 0.30343617383207694, + "grad_norm": 1.4988259077072144, + "learning_rate": 0.0001632836933462197, + "loss": 1.2686, + "step": 8473 + }, + { + "epoch": 0.3034719859616452, + "grad_norm": 1.387237548828125, + "learning_rate": 0.00016327471197765216, + "loss": 1.4457, + "step": 8474 + }, + { + "epoch": 0.3035077980912135, + "grad_norm": 2.1457505226135254, + "learning_rate": 0.000163265729757805, + "loss": 1.6394, + "step": 8475 + }, + { + "epoch": 0.3035436102207818, + "grad_norm": 1.7140662670135498, + "learning_rate": 0.00016325674668679906, + "loss": 1.5671, + "step": 8476 + }, + { + "epoch": 0.30357942235035007, + "grad_norm": 1.7790052890777588, + "learning_rate": 0.00016324776276475518, + "loss": 1.6923, + "step": 8477 + }, + { + "epoch": 0.30361523447991834, + "grad_norm": 1.8531421422958374, + "learning_rate": 0.0001632387779917943, + "loss": 1.6007, + "step": 8478 + }, + { + "epoch": 0.3036510466094866, + "grad_norm": 1.4344226121902466, + "learning_rate": 0.00016322979236803713, + "loss": 1.6599, + "step": 8479 + }, + { + "epoch": 0.30368685873905493, + "grad_norm": 1.5738317966461182, + "learning_rate": 0.00016322080589360472, + "loss": 1.7954, + "step": 8480 + }, + { + "epoch": 0.3037226708686232, + "grad_norm": 1.9308953285217285, + "learning_rate": 0.0001632118185686179, + "loss": 1.4939, + "step": 8481 + }, + { + "epoch": 0.30375848299819147, + "grad_norm": 1.450674057006836, + "learning_rate": 0.0001632028303931976, + "loss": 1.4645, + "step": 8482 + }, + { + "epoch": 0.3037942951277598, + "grad_norm": 1.5372213125228882, + "learning_rate": 0.00016319384136746477, + "loss": 1.6913, + "step": 8483 + }, + { + "epoch": 0.30383010725732806, + "grad_norm": 1.1684625148773193, + "learning_rate": 0.0001631848514915403, + "loss": 1.5842, + "step": 8484 + }, + { + "epoch": 0.30386591938689633, + "grad_norm": 1.4962623119354248, + "learning_rate": 0.00016317586076554515, + "loss": 1.6532, + "step": 8485 + }, + { + "epoch": 0.3039017315164646, + "grad_norm": 1.7394448518753052, + "learning_rate": 0.0001631668691896003, + "loss": 1.5523, + "step": 8486 + }, + { + "epoch": 0.3039375436460329, + "grad_norm": 2.025491237640381, + "learning_rate": 0.00016315787676382667, + "loss": 1.6246, + "step": 8487 + }, + { + "epoch": 0.3039733557756012, + "grad_norm": 3.2317962646484375, + "learning_rate": 0.0001631488834883453, + "loss": 1.5146, + "step": 8488 + }, + { + "epoch": 0.30400916790516946, + "grad_norm": 2.3871772289276123, + "learning_rate": 0.00016313988936327717, + "loss": 1.7163, + "step": 8489 + }, + { + "epoch": 0.3040449800347378, + "grad_norm": 1.6114022731781006, + "learning_rate": 0.00016313089438874326, + "loss": 1.8252, + "step": 8490 + }, + { + "epoch": 0.30408079216430606, + "grad_norm": 1.964486002922058, + "learning_rate": 0.00016312189856486462, + "loss": 1.7783, + "step": 8491 + }, + { + "epoch": 0.3041166042938743, + "grad_norm": 1.4887510538101196, + "learning_rate": 0.00016311290189176223, + "loss": 1.5578, + "step": 8492 + }, + { + "epoch": 0.3041524164234426, + "grad_norm": 2.208726167678833, + "learning_rate": 0.00016310390436955716, + "loss": 1.51, + "step": 8493 + }, + { + "epoch": 0.3041882285530109, + "grad_norm": 1.4921742677688599, + "learning_rate": 0.00016309490599837045, + "loss": 1.77, + "step": 8494 + }, + { + "epoch": 0.3042240406825792, + "grad_norm": 1.6540600061416626, + "learning_rate": 0.00016308590677832315, + "loss": 1.4772, + "step": 8495 + }, + { + "epoch": 0.30425985281214746, + "grad_norm": 1.4199429750442505, + "learning_rate": 0.0001630769067095364, + "loss": 1.4281, + "step": 8496 + }, + { + "epoch": 0.3042956649417158, + "grad_norm": 2.591513156890869, + "learning_rate": 0.0001630679057921312, + "loss": 1.4973, + "step": 8497 + }, + { + "epoch": 0.30433147707128405, + "grad_norm": 1.456068515777588, + "learning_rate": 0.0001630589040262287, + "loss": 1.4531, + "step": 8498 + }, + { + "epoch": 0.3043672892008523, + "grad_norm": 2.0415728092193604, + "learning_rate": 0.00016304990141194996, + "loss": 1.4656, + "step": 8499 + }, + { + "epoch": 0.3044031013304206, + "grad_norm": 1.6179795265197754, + "learning_rate": 0.00016304089794941614, + "loss": 1.7035, + "step": 8500 + }, + { + "epoch": 0.3044389134599889, + "grad_norm": 1.568784475326538, + "learning_rate": 0.00016303189363874835, + "loss": 1.3761, + "step": 8501 + }, + { + "epoch": 0.3044747255895572, + "grad_norm": 1.5610696077346802, + "learning_rate": 0.00016302288848006776, + "loss": 1.3034, + "step": 8502 + }, + { + "epoch": 0.30451053771912545, + "grad_norm": 2.025500774383545, + "learning_rate": 0.00016301388247349545, + "loss": 1.4422, + "step": 8503 + }, + { + "epoch": 0.3045463498486938, + "grad_norm": 1.6458475589752197, + "learning_rate": 0.00016300487561915266, + "loss": 1.5523, + "step": 8504 + }, + { + "epoch": 0.30458216197826204, + "grad_norm": 1.8756228685379028, + "learning_rate": 0.00016299586791716054, + "loss": 1.642, + "step": 8505 + }, + { + "epoch": 0.3046179741078303, + "grad_norm": 1.4094475507736206, + "learning_rate": 0.00016298685936764026, + "loss": 1.3843, + "step": 8506 + }, + { + "epoch": 0.3046537862373986, + "grad_norm": 1.7004481554031372, + "learning_rate": 0.00016297784997071308, + "loss": 1.4843, + "step": 8507 + }, + { + "epoch": 0.3046895983669669, + "grad_norm": 1.459661602973938, + "learning_rate": 0.00016296883972650013, + "loss": 1.4667, + "step": 8508 + }, + { + "epoch": 0.3047254104965352, + "grad_norm": 1.3875668048858643, + "learning_rate": 0.00016295982863512266, + "loss": 1.4236, + "step": 8509 + }, + { + "epoch": 0.30476122262610345, + "grad_norm": 2.15937876701355, + "learning_rate": 0.00016295081669670191, + "loss": 1.7024, + "step": 8510 + }, + { + "epoch": 0.30479703475567177, + "grad_norm": 1.7457246780395508, + "learning_rate": 0.00016294180391135914, + "loss": 1.3893, + "step": 8511 + }, + { + "epoch": 0.30483284688524004, + "grad_norm": 1.7229722738265991, + "learning_rate": 0.00016293279027921557, + "loss": 1.4539, + "step": 8512 + }, + { + "epoch": 0.3048686590148083, + "grad_norm": 2.1370930671691895, + "learning_rate": 0.0001629237758003925, + "loss": 1.7631, + "step": 8513 + }, + { + "epoch": 0.3049044711443766, + "grad_norm": 1.49868905544281, + "learning_rate": 0.00016291476047501115, + "loss": 1.6417, + "step": 8514 + }, + { + "epoch": 0.3049402832739449, + "grad_norm": 1.5884264707565308, + "learning_rate": 0.0001629057443031929, + "loss": 1.4924, + "step": 8515 + }, + { + "epoch": 0.30497609540351317, + "grad_norm": 1.7246391773223877, + "learning_rate": 0.000162896727285059, + "loss": 1.4056, + "step": 8516 + }, + { + "epoch": 0.30501190753308144, + "grad_norm": 1.9741127490997314, + "learning_rate": 0.00016288770942073075, + "loss": 1.6657, + "step": 8517 + }, + { + "epoch": 0.30504771966264976, + "grad_norm": 1.5608097314834595, + "learning_rate": 0.00016287869071032952, + "loss": 1.6203, + "step": 8518 + }, + { + "epoch": 0.30508353179221803, + "grad_norm": 1.6867769956588745, + "learning_rate": 0.00016286967115397655, + "loss": 1.4592, + "step": 8519 + }, + { + "epoch": 0.3051193439217863, + "grad_norm": 1.8151174783706665, + "learning_rate": 0.00016286065075179332, + "loss": 1.7879, + "step": 8520 + }, + { + "epoch": 0.30515515605135457, + "grad_norm": 2.2028374671936035, + "learning_rate": 0.00016285162950390104, + "loss": 1.4099, + "step": 8521 + }, + { + "epoch": 0.3051909681809229, + "grad_norm": 2.5838005542755127, + "learning_rate": 0.00016284260741042123, + "loss": 1.4674, + "step": 8522 + }, + { + "epoch": 0.30522678031049116, + "grad_norm": 1.9016218185424805, + "learning_rate": 0.00016283358447147516, + "loss": 1.8284, + "step": 8523 + }, + { + "epoch": 0.30526259244005943, + "grad_norm": 2.4954068660736084, + "learning_rate": 0.0001628245606871843, + "loss": 1.5852, + "step": 8524 + }, + { + "epoch": 0.30529840456962776, + "grad_norm": 2.0137617588043213, + "learning_rate": 0.00016281553605766998, + "loss": 1.7631, + "step": 8525 + }, + { + "epoch": 0.305334216699196, + "grad_norm": 2.0389068126678467, + "learning_rate": 0.00016280651058305363, + "loss": 1.661, + "step": 8526 + }, + { + "epoch": 0.3053700288287643, + "grad_norm": 1.6179884672164917, + "learning_rate": 0.00016279748426345673, + "loss": 1.5391, + "step": 8527 + }, + { + "epoch": 0.30540584095833256, + "grad_norm": 1.4547618627548218, + "learning_rate": 0.0001627884570990007, + "loss": 1.6447, + "step": 8528 + }, + { + "epoch": 0.3054416530879009, + "grad_norm": 1.4951916933059692, + "learning_rate": 0.0001627794290898069, + "loss": 1.714, + "step": 8529 + }, + { + "epoch": 0.30547746521746916, + "grad_norm": 1.8673044443130493, + "learning_rate": 0.00016277040023599692, + "loss": 1.6272, + "step": 8530 + }, + { + "epoch": 0.3055132773470374, + "grad_norm": 1.5993306636810303, + "learning_rate": 0.00016276137053769217, + "loss": 1.3022, + "step": 8531 + }, + { + "epoch": 0.30554908947660575, + "grad_norm": 1.4907748699188232, + "learning_rate": 0.0001627523399950141, + "loss": 1.606, + "step": 8532 + }, + { + "epoch": 0.305584901606174, + "grad_norm": 1.347849726676941, + "learning_rate": 0.00016274330860808426, + "loss": 1.5253, + "step": 8533 + }, + { + "epoch": 0.3056207137357423, + "grad_norm": 1.296706199645996, + "learning_rate": 0.00016273427637702415, + "loss": 1.1995, + "step": 8534 + }, + { + "epoch": 0.30565652586531056, + "grad_norm": 1.5187026262283325, + "learning_rate": 0.00016272524330195525, + "loss": 1.1413, + "step": 8535 + }, + { + "epoch": 0.3056923379948789, + "grad_norm": 2.58811616897583, + "learning_rate": 0.00016271620938299912, + "loss": 1.7734, + "step": 8536 + }, + { + "epoch": 0.30572815012444715, + "grad_norm": 1.5693305730819702, + "learning_rate": 0.0001627071746202773, + "loss": 1.6448, + "step": 8537 + }, + { + "epoch": 0.3057639622540154, + "grad_norm": 1.9932421445846558, + "learning_rate": 0.00016269813901391132, + "loss": 1.6241, + "step": 8538 + }, + { + "epoch": 0.30579977438358374, + "grad_norm": 2.0719470977783203, + "learning_rate": 0.00016268910256402277, + "loss": 1.5669, + "step": 8539 + }, + { + "epoch": 0.305835586513152, + "grad_norm": 1.9695795774459839, + "learning_rate": 0.00016268006527073322, + "loss": 1.5724, + "step": 8540 + }, + { + "epoch": 0.3058713986427203, + "grad_norm": 1.7500296831130981, + "learning_rate": 0.00016267102713416417, + "loss": 1.3741, + "step": 8541 + }, + { + "epoch": 0.30590721077228855, + "grad_norm": 1.8860368728637695, + "learning_rate": 0.00016266198815443738, + "loss": 1.4433, + "step": 8542 + }, + { + "epoch": 0.3059430229018569, + "grad_norm": 1.7154771089553833, + "learning_rate": 0.00016265294833167434, + "loss": 1.5552, + "step": 8543 + }, + { + "epoch": 0.30597883503142514, + "grad_norm": 1.621751070022583, + "learning_rate": 0.0001626439076659967, + "loss": 1.4277, + "step": 8544 + }, + { + "epoch": 0.3060146471609934, + "grad_norm": 1.4160710573196411, + "learning_rate": 0.00016263486615752606, + "loss": 1.2813, + "step": 8545 + }, + { + "epoch": 0.30605045929056174, + "grad_norm": 1.513929009437561, + "learning_rate": 0.00016262582380638407, + "loss": 1.4992, + "step": 8546 + }, + { + "epoch": 0.30608627142013, + "grad_norm": 1.4435629844665527, + "learning_rate": 0.00016261678061269244, + "loss": 1.5759, + "step": 8547 + }, + { + "epoch": 0.3061220835496983, + "grad_norm": 1.9000530242919922, + "learning_rate": 0.0001626077365765728, + "loss": 1.575, + "step": 8548 + }, + { + "epoch": 0.30615789567926655, + "grad_norm": 1.5278146266937256, + "learning_rate": 0.00016259869169814678, + "loss": 1.8735, + "step": 8549 + }, + { + "epoch": 0.30619370780883487, + "grad_norm": 1.8656506538391113, + "learning_rate": 0.00016258964597753615, + "loss": 1.4926, + "step": 8550 + }, + { + "epoch": 0.30622951993840314, + "grad_norm": 2.041372060775757, + "learning_rate": 0.00016258059941486259, + "loss": 1.2913, + "step": 8551 + }, + { + "epoch": 0.3062653320679714, + "grad_norm": 1.8052326440811157, + "learning_rate": 0.00016257155201024776, + "loss": 1.5017, + "step": 8552 + }, + { + "epoch": 0.30630114419753973, + "grad_norm": 2.2449746131896973, + "learning_rate": 0.0001625625037638134, + "loss": 1.531, + "step": 8553 + }, + { + "epoch": 0.306336956327108, + "grad_norm": 1.6481692790985107, + "learning_rate": 0.00016255345467568126, + "loss": 1.5007, + "step": 8554 + }, + { + "epoch": 0.30637276845667627, + "grad_norm": 2.4112162590026855, + "learning_rate": 0.00016254440474597307, + "loss": 1.6164, + "step": 8555 + }, + { + "epoch": 0.30640858058624454, + "grad_norm": 1.6740220785140991, + "learning_rate": 0.0001625353539748106, + "loss": 1.4353, + "step": 8556 + }, + { + "epoch": 0.30644439271581286, + "grad_norm": 2.5266458988189697, + "learning_rate": 0.00016252630236231557, + "loss": 1.3675, + "step": 8557 + }, + { + "epoch": 0.30648020484538113, + "grad_norm": 1.4388402700424194, + "learning_rate": 0.00016251724990860983, + "loss": 1.6966, + "step": 8558 + }, + { + "epoch": 0.3065160169749494, + "grad_norm": 1.701951026916504, + "learning_rate": 0.00016250819661381516, + "loss": 1.5309, + "step": 8559 + }, + { + "epoch": 0.3065518291045177, + "grad_norm": 1.3928033113479614, + "learning_rate": 0.0001624991424780533, + "loss": 1.1926, + "step": 8560 + }, + { + "epoch": 0.306587641234086, + "grad_norm": 1.6623480319976807, + "learning_rate": 0.0001624900875014461, + "loss": 1.5179, + "step": 8561 + }, + { + "epoch": 0.30662345336365426, + "grad_norm": 1.2392915487289429, + "learning_rate": 0.0001624810316841154, + "loss": 1.6586, + "step": 8562 + }, + { + "epoch": 0.30665926549322253, + "grad_norm": 1.4448860883712769, + "learning_rate": 0.000162471975026183, + "loss": 1.3141, + "step": 8563 + }, + { + "epoch": 0.30669507762279086, + "grad_norm": 1.6523834466934204, + "learning_rate": 0.0001624629175277707, + "loss": 1.4945, + "step": 8564 + }, + { + "epoch": 0.3067308897523591, + "grad_norm": 1.7651172876358032, + "learning_rate": 0.0001624538591890005, + "loss": 1.4052, + "step": 8565 + }, + { + "epoch": 0.3067667018819274, + "grad_norm": 1.7851136922836304, + "learning_rate": 0.00016244480000999416, + "loss": 1.2272, + "step": 8566 + }, + { + "epoch": 0.3068025140114957, + "grad_norm": 1.844192385673523, + "learning_rate": 0.0001624357399908736, + "loss": 1.9858, + "step": 8567 + }, + { + "epoch": 0.306838326141064, + "grad_norm": 1.3948429822921753, + "learning_rate": 0.00016242667913176064, + "loss": 1.6287, + "step": 8568 + }, + { + "epoch": 0.30687413827063226, + "grad_norm": 2.166774034500122, + "learning_rate": 0.00016241761743277726, + "loss": 1.682, + "step": 8569 + }, + { + "epoch": 0.3069099504002005, + "grad_norm": 2.1550753116607666, + "learning_rate": 0.00016240855489404535, + "loss": 1.3612, + "step": 8570 + }, + { + "epoch": 0.30694576252976885, + "grad_norm": 1.3740218877792358, + "learning_rate": 0.00016239949151568688, + "loss": 1.5035, + "step": 8571 + }, + { + "epoch": 0.3069815746593371, + "grad_norm": 1.7257821559906006, + "learning_rate": 0.0001623904272978237, + "loss": 1.5221, + "step": 8572 + }, + { + "epoch": 0.3070173867889054, + "grad_norm": 2.2711541652679443, + "learning_rate": 0.00016238136224057777, + "loss": 1.6717, + "step": 8573 + }, + { + "epoch": 0.3070531989184737, + "grad_norm": 1.7106958627700806, + "learning_rate": 0.00016237229634407112, + "loss": 1.8193, + "step": 8574 + }, + { + "epoch": 0.307089011048042, + "grad_norm": 1.976996898651123, + "learning_rate": 0.0001623632296084257, + "loss": 1.6273, + "step": 8575 + }, + { + "epoch": 0.30712482317761025, + "grad_norm": 2.0148656368255615, + "learning_rate": 0.0001623541620337634, + "loss": 1.7886, + "step": 8576 + }, + { + "epoch": 0.3071606353071785, + "grad_norm": 1.4920234680175781, + "learning_rate": 0.00016234509362020633, + "loss": 1.6572, + "step": 8577 + }, + { + "epoch": 0.30719644743674684, + "grad_norm": 2.609459638595581, + "learning_rate": 0.00016233602436787644, + "loss": 1.4282, + "step": 8578 + }, + { + "epoch": 0.3072322595663151, + "grad_norm": 1.3255739212036133, + "learning_rate": 0.00016232695427689575, + "loss": 1.5997, + "step": 8579 + }, + { + "epoch": 0.3072680716958834, + "grad_norm": 2.2817599773406982, + "learning_rate": 0.00016231788334738627, + "loss": 1.4272, + "step": 8580 + }, + { + "epoch": 0.3073038838254517, + "grad_norm": 1.5442206859588623, + "learning_rate": 0.00016230881157947006, + "loss": 1.5387, + "step": 8581 + }, + { + "epoch": 0.30733969595502, + "grad_norm": 1.3098913431167603, + "learning_rate": 0.00016229973897326919, + "loss": 1.5564, + "step": 8582 + }, + { + "epoch": 0.30737550808458824, + "grad_norm": 1.755606770515442, + "learning_rate": 0.00016229066552890563, + "loss": 1.6957, + "step": 8583 + }, + { + "epoch": 0.3074113202141565, + "grad_norm": 1.984049916267395, + "learning_rate": 0.0001622815912465016, + "loss": 1.314, + "step": 8584 + }, + { + "epoch": 0.30744713234372484, + "grad_norm": 1.4831622838974, + "learning_rate": 0.00016227251612617902, + "loss": 1.3996, + "step": 8585 + }, + { + "epoch": 0.3074829444732931, + "grad_norm": 2.159074306488037, + "learning_rate": 0.0001622634401680601, + "loss": 1.7296, + "step": 8586 + }, + { + "epoch": 0.3075187566028614, + "grad_norm": 1.5581837892532349, + "learning_rate": 0.0001622543633722669, + "loss": 1.3766, + "step": 8587 + }, + { + "epoch": 0.3075545687324297, + "grad_norm": 1.5768377780914307, + "learning_rate": 0.00016224528573892153, + "loss": 1.5968, + "step": 8588 + }, + { + "epoch": 0.30759038086199797, + "grad_norm": 1.7396345138549805, + "learning_rate": 0.00016223620726814615, + "loss": 1.461, + "step": 8589 + }, + { + "epoch": 0.30762619299156624, + "grad_norm": 2.5793988704681396, + "learning_rate": 0.00016222712796006285, + "loss": 1.4118, + "step": 8590 + }, + { + "epoch": 0.3076620051211345, + "grad_norm": 1.5253145694732666, + "learning_rate": 0.00016221804781479384, + "loss": 1.4673, + "step": 8591 + }, + { + "epoch": 0.30769781725070283, + "grad_norm": 1.6572800874710083, + "learning_rate": 0.00016220896683246126, + "loss": 1.187, + "step": 8592 + }, + { + "epoch": 0.3077336293802711, + "grad_norm": 2.2144687175750732, + "learning_rate": 0.00016219988501318727, + "loss": 1.6095, + "step": 8593 + }, + { + "epoch": 0.30776944150983937, + "grad_norm": 2.0595297813415527, + "learning_rate": 0.00016219080235709403, + "loss": 1.8326, + "step": 8594 + }, + { + "epoch": 0.3078052536394077, + "grad_norm": 1.579040288925171, + "learning_rate": 0.0001621817188643038, + "loss": 1.5968, + "step": 8595 + }, + { + "epoch": 0.30784106576897596, + "grad_norm": 1.8620355129241943, + "learning_rate": 0.0001621726345349387, + "loss": 1.5707, + "step": 8596 + }, + { + "epoch": 0.30787687789854423, + "grad_norm": 1.648568868637085, + "learning_rate": 0.00016216354936912105, + "loss": 1.6395, + "step": 8597 + }, + { + "epoch": 0.3079126900281125, + "grad_norm": 1.642183542251587, + "learning_rate": 0.000162154463366973, + "loss": 1.7484, + "step": 8598 + }, + { + "epoch": 0.3079485021576808, + "grad_norm": 1.5687962770462036, + "learning_rate": 0.00016214537652861687, + "loss": 1.1456, + "step": 8599 + }, + { + "epoch": 0.3079843142872491, + "grad_norm": 1.3825608491897583, + "learning_rate": 0.00016213628885417483, + "loss": 1.4397, + "step": 8600 + }, + { + "epoch": 0.30802012641681736, + "grad_norm": 1.5594370365142822, + "learning_rate": 0.00016212720034376914, + "loss": 1.6672, + "step": 8601 + }, + { + "epoch": 0.3080559385463857, + "grad_norm": 1.4455797672271729, + "learning_rate": 0.00016211811099752215, + "loss": 1.5133, + "step": 8602 + }, + { + "epoch": 0.30809175067595396, + "grad_norm": 1.868511438369751, + "learning_rate": 0.00016210902081555605, + "loss": 1.5547, + "step": 8603 + }, + { + "epoch": 0.3081275628055222, + "grad_norm": 1.375169277191162, + "learning_rate": 0.00016209992979799326, + "loss": 1.4245, + "step": 8604 + }, + { + "epoch": 0.3081633749350905, + "grad_norm": 2.5195486545562744, + "learning_rate": 0.00016209083794495598, + "loss": 1.76, + "step": 8605 + }, + { + "epoch": 0.3081991870646588, + "grad_norm": 1.5071332454681396, + "learning_rate": 0.00016208174525656656, + "loss": 1.2215, + "step": 8606 + }, + { + "epoch": 0.3082349991942271, + "grad_norm": 1.7664871215820312, + "learning_rate": 0.00016207265173294734, + "loss": 1.6029, + "step": 8607 + }, + { + "epoch": 0.30827081132379536, + "grad_norm": 2.763017416000366, + "learning_rate": 0.00016206355737422067, + "loss": 1.5892, + "step": 8608 + }, + { + "epoch": 0.3083066234533637, + "grad_norm": 1.5861375331878662, + "learning_rate": 0.00016205446218050892, + "loss": 1.0838, + "step": 8609 + }, + { + "epoch": 0.30834243558293195, + "grad_norm": 1.6708229780197144, + "learning_rate": 0.00016204536615193439, + "loss": 1.7046, + "step": 8610 + }, + { + "epoch": 0.3083782477125002, + "grad_norm": 1.6125863790512085, + "learning_rate": 0.00016203626928861948, + "loss": 1.6065, + "step": 8611 + }, + { + "epoch": 0.3084140598420685, + "grad_norm": 2.4662134647369385, + "learning_rate": 0.00016202717159068662, + "loss": 1.631, + "step": 8612 + }, + { + "epoch": 0.3084498719716368, + "grad_norm": 1.8300012350082397, + "learning_rate": 0.00016201807305825817, + "loss": 1.49, + "step": 8613 + }, + { + "epoch": 0.3084856841012051, + "grad_norm": 1.6964879035949707, + "learning_rate": 0.0001620089736914565, + "loss": 1.3984, + "step": 8614 + }, + { + "epoch": 0.30852149623077335, + "grad_norm": 1.575770616531372, + "learning_rate": 0.0001619998734904041, + "loss": 1.6827, + "step": 8615 + }, + { + "epoch": 0.3085573083603416, + "grad_norm": 1.308549165725708, + "learning_rate": 0.00016199077245522341, + "loss": 1.5704, + "step": 8616 + }, + { + "epoch": 0.30859312048990994, + "grad_norm": 2.0342214107513428, + "learning_rate": 0.00016198167058603682, + "loss": 1.6039, + "step": 8617 + }, + { + "epoch": 0.3086289326194782, + "grad_norm": 1.666513442993164, + "learning_rate": 0.0001619725678829668, + "loss": 1.5431, + "step": 8618 + }, + { + "epoch": 0.3086647447490465, + "grad_norm": 1.700683832168579, + "learning_rate": 0.0001619634643461358, + "loss": 1.6518, + "step": 8619 + }, + { + "epoch": 0.3087005568786148, + "grad_norm": 2.255946159362793, + "learning_rate": 0.00016195435997566632, + "loss": 1.7746, + "step": 8620 + }, + { + "epoch": 0.3087363690081831, + "grad_norm": 1.617912769317627, + "learning_rate": 0.00016194525477168087, + "loss": 1.6655, + "step": 8621 + }, + { + "epoch": 0.30877218113775134, + "grad_norm": 2.162280321121216, + "learning_rate": 0.0001619361487343019, + "loss": 1.5503, + "step": 8622 + }, + { + "epoch": 0.3088079932673196, + "grad_norm": 1.8481221199035645, + "learning_rate": 0.00016192704186365195, + "loss": 1.5285, + "step": 8623 + }, + { + "epoch": 0.30884380539688794, + "grad_norm": 1.9384018182754517, + "learning_rate": 0.00016191793415985353, + "loss": 1.9594, + "step": 8624 + }, + { + "epoch": 0.3088796175264562, + "grad_norm": 1.9377477169036865, + "learning_rate": 0.00016190882562302914, + "loss": 1.5948, + "step": 8625 + }, + { + "epoch": 0.3089154296560245, + "grad_norm": 1.4632514715194702, + "learning_rate": 0.0001618997162533014, + "loss": 1.6144, + "step": 8626 + }, + { + "epoch": 0.3089512417855928, + "grad_norm": 1.6265308856964111, + "learning_rate": 0.0001618906060507928, + "loss": 1.6253, + "step": 8627 + }, + { + "epoch": 0.30898705391516107, + "grad_norm": 1.8380078077316284, + "learning_rate": 0.00016188149501562596, + "loss": 1.287, + "step": 8628 + }, + { + "epoch": 0.30902286604472934, + "grad_norm": 1.5310187339782715, + "learning_rate": 0.00016187238314792338, + "loss": 1.3016, + "step": 8629 + }, + { + "epoch": 0.3090586781742976, + "grad_norm": 1.4253685474395752, + "learning_rate": 0.00016186327044780772, + "loss": 1.4775, + "step": 8630 + }, + { + "epoch": 0.30909449030386593, + "grad_norm": 2.2036783695220947, + "learning_rate": 0.00016185415691540156, + "loss": 1.5592, + "step": 8631 + }, + { + "epoch": 0.3091303024334342, + "grad_norm": 2.154386281967163, + "learning_rate": 0.0001618450425508275, + "loss": 1.4297, + "step": 8632 + }, + { + "epoch": 0.30916611456300247, + "grad_norm": 1.7909826040267944, + "learning_rate": 0.00016183592735420817, + "loss": 1.3135, + "step": 8633 + }, + { + "epoch": 0.3092019266925708, + "grad_norm": 1.5986603498458862, + "learning_rate": 0.0001618268113256662, + "loss": 1.6078, + "step": 8634 + }, + { + "epoch": 0.30923773882213906, + "grad_norm": 1.9522628784179688, + "learning_rate": 0.0001618176944653242, + "loss": 1.6146, + "step": 8635 + }, + { + "epoch": 0.30927355095170733, + "grad_norm": 1.3854929208755493, + "learning_rate": 0.0001618085767733049, + "loss": 1.551, + "step": 8636 + }, + { + "epoch": 0.3093093630812756, + "grad_norm": 1.41969895362854, + "learning_rate": 0.0001617994582497309, + "loss": 1.3639, + "step": 8637 + }, + { + "epoch": 0.3093451752108439, + "grad_norm": 2.2371199131011963, + "learning_rate": 0.00016179033889472493, + "loss": 1.5884, + "step": 8638 + }, + { + "epoch": 0.3093809873404122, + "grad_norm": 1.718151330947876, + "learning_rate": 0.00016178121870840965, + "loss": 1.5665, + "step": 8639 + }, + { + "epoch": 0.30941679946998046, + "grad_norm": 1.7268788814544678, + "learning_rate": 0.00016177209769090774, + "loss": 1.6803, + "step": 8640 + }, + { + "epoch": 0.3094526115995488, + "grad_norm": 1.3030462265014648, + "learning_rate": 0.00016176297584234196, + "loss": 1.5642, + "step": 8641 + }, + { + "epoch": 0.30948842372911706, + "grad_norm": 1.4846490621566772, + "learning_rate": 0.00016175385316283502, + "loss": 1.4474, + "step": 8642 + }, + { + "epoch": 0.3095242358586853, + "grad_norm": 1.95109224319458, + "learning_rate": 0.00016174472965250965, + "loss": 1.6802, + "step": 8643 + }, + { + "epoch": 0.3095600479882536, + "grad_norm": 2.3213067054748535, + "learning_rate": 0.00016173560531148855, + "loss": 1.6874, + "step": 8644 + }, + { + "epoch": 0.3095958601178219, + "grad_norm": 1.6281256675720215, + "learning_rate": 0.0001617264801398945, + "loss": 1.6978, + "step": 8645 + }, + { + "epoch": 0.3096316722473902, + "grad_norm": 1.7163335084915161, + "learning_rate": 0.0001617173541378503, + "loss": 1.507, + "step": 8646 + }, + { + "epoch": 0.30966748437695846, + "grad_norm": 2.341641426086426, + "learning_rate": 0.0001617082273054787, + "loss": 1.6396, + "step": 8647 + }, + { + "epoch": 0.3097032965065268, + "grad_norm": 2.265071392059326, + "learning_rate": 0.00016169909964290256, + "loss": 1.383, + "step": 8648 + }, + { + "epoch": 0.30973910863609505, + "grad_norm": 2.135606288909912, + "learning_rate": 0.00016168997115024458, + "loss": 1.6133, + "step": 8649 + }, + { + "epoch": 0.3097749207656633, + "grad_norm": 1.6686729192733765, + "learning_rate": 0.0001616808418276276, + "loss": 1.4704, + "step": 8650 + }, + { + "epoch": 0.3098107328952316, + "grad_norm": 1.7093509435653687, + "learning_rate": 0.00016167171167517447, + "loss": 1.6578, + "step": 8651 + }, + { + "epoch": 0.3098465450247999, + "grad_norm": 1.4961060285568237, + "learning_rate": 0.00016166258069300803, + "loss": 1.5668, + "step": 8652 + }, + { + "epoch": 0.3098823571543682, + "grad_norm": 1.3876949548721313, + "learning_rate": 0.00016165344888125106, + "loss": 1.6351, + "step": 8653 + }, + { + "epoch": 0.30991816928393645, + "grad_norm": 1.5020776987075806, + "learning_rate": 0.00016164431624002647, + "loss": 1.4318, + "step": 8654 + }, + { + "epoch": 0.3099539814135048, + "grad_norm": 1.5066617727279663, + "learning_rate": 0.00016163518276945715, + "loss": 1.6516, + "step": 8655 + }, + { + "epoch": 0.30998979354307304, + "grad_norm": 1.377149224281311, + "learning_rate": 0.00016162604846966594, + "loss": 1.5357, + "step": 8656 + }, + { + "epoch": 0.3100256056726413, + "grad_norm": 1.648619294166565, + "learning_rate": 0.0001616169133407757, + "loss": 1.9226, + "step": 8657 + }, + { + "epoch": 0.3100614178022096, + "grad_norm": 1.3753312826156616, + "learning_rate": 0.00016160777738290945, + "loss": 1.5631, + "step": 8658 + }, + { + "epoch": 0.3100972299317779, + "grad_norm": 1.6483653783798218, + "learning_rate": 0.00016159864059618997, + "loss": 1.762, + "step": 8659 + }, + { + "epoch": 0.3101330420613462, + "grad_norm": 1.7791167497634888, + "learning_rate": 0.00016158950298074022, + "loss": 1.575, + "step": 8660 + }, + { + "epoch": 0.31016885419091444, + "grad_norm": 2.1656835079193115, + "learning_rate": 0.00016158036453668318, + "loss": 1.6359, + "step": 8661 + }, + { + "epoch": 0.31020466632048277, + "grad_norm": 1.743486762046814, + "learning_rate": 0.00016157122526414176, + "loss": 1.8204, + "step": 8662 + }, + { + "epoch": 0.31024047845005104, + "grad_norm": 1.796557903289795, + "learning_rate": 0.00016156208516323895, + "loss": 1.3592, + "step": 8663 + }, + { + "epoch": 0.3102762905796193, + "grad_norm": 1.2790051698684692, + "learning_rate": 0.00016155294423409768, + "loss": 1.476, + "step": 8664 + }, + { + "epoch": 0.3103121027091876, + "grad_norm": 1.7429996728897095, + "learning_rate": 0.00016154380247684094, + "loss": 1.5231, + "step": 8665 + }, + { + "epoch": 0.3103479148387559, + "grad_norm": 2.2776238918304443, + "learning_rate": 0.00016153465989159172, + "loss": 1.5724, + "step": 8666 + }, + { + "epoch": 0.31038372696832417, + "grad_norm": 1.5032392740249634, + "learning_rate": 0.00016152551647847304, + "loss": 1.7274, + "step": 8667 + }, + { + "epoch": 0.31041953909789244, + "grad_norm": 1.6818513870239258, + "learning_rate": 0.00016151637223760785, + "loss": 1.4738, + "step": 8668 + }, + { + "epoch": 0.31045535122746076, + "grad_norm": 1.457304835319519, + "learning_rate": 0.0001615072271691193, + "loss": 1.5367, + "step": 8669 + }, + { + "epoch": 0.31049116335702903, + "grad_norm": 1.749411702156067, + "learning_rate": 0.00016149808127313025, + "loss": 1.6854, + "step": 8670 + }, + { + "epoch": 0.3105269754865973, + "grad_norm": 1.698837399482727, + "learning_rate": 0.00016148893454976393, + "loss": 1.5631, + "step": 8671 + }, + { + "epoch": 0.31056278761616557, + "grad_norm": 1.9084186553955078, + "learning_rate": 0.00016147978699914325, + "loss": 1.3116, + "step": 8672 + }, + { + "epoch": 0.3105985997457339, + "grad_norm": 1.5788426399230957, + "learning_rate": 0.00016147063862139138, + "loss": 1.5894, + "step": 8673 + }, + { + "epoch": 0.31063441187530216, + "grad_norm": 1.705234169960022, + "learning_rate": 0.00016146148941663136, + "loss": 1.4553, + "step": 8674 + }, + { + "epoch": 0.31067022400487043, + "grad_norm": 1.3608901500701904, + "learning_rate": 0.00016145233938498626, + "loss": 1.5954, + "step": 8675 + }, + { + "epoch": 0.31070603613443876, + "grad_norm": 1.920832872390747, + "learning_rate": 0.00016144318852657921, + "loss": 1.6358, + "step": 8676 + }, + { + "epoch": 0.310741848264007, + "grad_norm": 1.759215235710144, + "learning_rate": 0.00016143403684153328, + "loss": 1.6726, + "step": 8677 + }, + { + "epoch": 0.3107776603935753, + "grad_norm": 1.8007652759552002, + "learning_rate": 0.00016142488432997168, + "loss": 1.6163, + "step": 8678 + }, + { + "epoch": 0.31081347252314356, + "grad_norm": 2.198650598526001, + "learning_rate": 0.00016141573099201744, + "loss": 1.4736, + "step": 8679 + }, + { + "epoch": 0.3108492846527119, + "grad_norm": 1.4647966623306274, + "learning_rate": 0.00016140657682779384, + "loss": 1.2301, + "step": 8680 + }, + { + "epoch": 0.31088509678228016, + "grad_norm": 1.752133846282959, + "learning_rate": 0.0001613974218374239, + "loss": 1.558, + "step": 8681 + }, + { + "epoch": 0.3109209089118484, + "grad_norm": 1.8326823711395264, + "learning_rate": 0.00016138826602103085, + "loss": 1.8069, + "step": 8682 + }, + { + "epoch": 0.31095672104141675, + "grad_norm": 1.3511853218078613, + "learning_rate": 0.00016137910937873788, + "loss": 1.5138, + "step": 8683 + }, + { + "epoch": 0.310992533170985, + "grad_norm": 1.8145192861557007, + "learning_rate": 0.00016136995191066818, + "loss": 1.7591, + "step": 8684 + }, + { + "epoch": 0.3110283453005533, + "grad_norm": 1.5319371223449707, + "learning_rate": 0.0001613607936169449, + "loss": 1.8161, + "step": 8685 + }, + { + "epoch": 0.31106415743012156, + "grad_norm": 1.9978666305541992, + "learning_rate": 0.00016135163449769132, + "loss": 1.4615, + "step": 8686 + }, + { + "epoch": 0.3110999695596899, + "grad_norm": 2.378784418106079, + "learning_rate": 0.0001613424745530306, + "loss": 1.5786, + "step": 8687 + }, + { + "epoch": 0.31113578168925815, + "grad_norm": 2.055859327316284, + "learning_rate": 0.00016133331378308604, + "loss": 1.6159, + "step": 8688 + }, + { + "epoch": 0.3111715938188264, + "grad_norm": 1.796878695487976, + "learning_rate": 0.00016132415218798085, + "loss": 1.6153, + "step": 8689 + }, + { + "epoch": 0.31120740594839474, + "grad_norm": 1.5123406648635864, + "learning_rate": 0.0001613149897678383, + "loss": 1.556, + "step": 8690 + }, + { + "epoch": 0.311243218077963, + "grad_norm": 2.314534902572632, + "learning_rate": 0.00016130582652278163, + "loss": 1.4182, + "step": 8691 + }, + { + "epoch": 0.3112790302075313, + "grad_norm": 1.2278296947479248, + "learning_rate": 0.0001612966624529342, + "loss": 1.6972, + "step": 8692 + }, + { + "epoch": 0.31131484233709955, + "grad_norm": 1.6352144479751587, + "learning_rate": 0.0001612874975584192, + "loss": 1.4284, + "step": 8693 + }, + { + "epoch": 0.3113506544666679, + "grad_norm": 1.5539416074752808, + "learning_rate": 0.00016127833183936, + "loss": 1.524, + "step": 8694 + }, + { + "epoch": 0.31138646659623614, + "grad_norm": 2.1352217197418213, + "learning_rate": 0.00016126916529587987, + "loss": 1.5975, + "step": 8695 + }, + { + "epoch": 0.3114222787258044, + "grad_norm": 1.6609954833984375, + "learning_rate": 0.00016125999792810213, + "loss": 1.26, + "step": 8696 + }, + { + "epoch": 0.31145809085537274, + "grad_norm": 1.37396240234375, + "learning_rate": 0.00016125082973615017, + "loss": 1.2051, + "step": 8697 + }, + { + "epoch": 0.311493902984941, + "grad_norm": 2.057098865509033, + "learning_rate": 0.00016124166072014728, + "loss": 1.5141, + "step": 8698 + }, + { + "epoch": 0.3115297151145093, + "grad_norm": 1.9792077541351318, + "learning_rate": 0.00016123249088021688, + "loss": 1.6889, + "step": 8699 + }, + { + "epoch": 0.31156552724407754, + "grad_norm": 1.6447113752365112, + "learning_rate": 0.00016122332021648226, + "loss": 1.4876, + "step": 8700 + }, + { + "epoch": 0.31160133937364587, + "grad_norm": 1.9308607578277588, + "learning_rate": 0.00016121414872906687, + "loss": 1.4819, + "step": 8701 + }, + { + "epoch": 0.31163715150321414, + "grad_norm": 1.500744342803955, + "learning_rate": 0.00016120497641809408, + "loss": 1.5922, + "step": 8702 + }, + { + "epoch": 0.3116729636327824, + "grad_norm": 2.0981481075286865, + "learning_rate": 0.00016119580328368725, + "loss": 1.7535, + "step": 8703 + }, + { + "epoch": 0.31170877576235073, + "grad_norm": 1.7200511693954468, + "learning_rate": 0.0001611866293259698, + "loss": 1.3421, + "step": 8704 + }, + { + "epoch": 0.311744587891919, + "grad_norm": 1.9420268535614014, + "learning_rate": 0.00016117745454506522, + "loss": 1.446, + "step": 8705 + }, + { + "epoch": 0.31178040002148727, + "grad_norm": 2.217729091644287, + "learning_rate": 0.00016116827894109686, + "loss": 1.2806, + "step": 8706 + }, + { + "epoch": 0.31181621215105554, + "grad_norm": 1.7282236814498901, + "learning_rate": 0.00016115910251418827, + "loss": 1.5973, + "step": 8707 + }, + { + "epoch": 0.31185202428062386, + "grad_norm": 1.4423658847808838, + "learning_rate": 0.0001611499252644628, + "loss": 1.5761, + "step": 8708 + }, + { + "epoch": 0.31188783641019213, + "grad_norm": 1.5264742374420166, + "learning_rate": 0.00016114074719204396, + "loss": 1.5954, + "step": 8709 + }, + { + "epoch": 0.3119236485397604, + "grad_norm": 2.1491074562072754, + "learning_rate": 0.00016113156829705526, + "loss": 1.5177, + "step": 8710 + }, + { + "epoch": 0.3119594606693287, + "grad_norm": 1.6613092422485352, + "learning_rate": 0.00016112238857962017, + "loss": 1.6098, + "step": 8711 + }, + { + "epoch": 0.311995272798897, + "grad_norm": 1.7055860757827759, + "learning_rate": 0.00016111320803986217, + "loss": 1.7401, + "step": 8712 + }, + { + "epoch": 0.31203108492846526, + "grad_norm": 1.538976788520813, + "learning_rate": 0.00016110402667790475, + "loss": 1.7231, + "step": 8713 + }, + { + "epoch": 0.31206689705803353, + "grad_norm": 2.240173816680908, + "learning_rate": 0.0001610948444938715, + "loss": 1.8553, + "step": 8714 + }, + { + "epoch": 0.31210270918760186, + "grad_norm": 1.49239182472229, + "learning_rate": 0.00016108566148788594, + "loss": 1.2735, + "step": 8715 + }, + { + "epoch": 0.3121385213171701, + "grad_norm": 2.0355594158172607, + "learning_rate": 0.00016107647766007159, + "loss": 1.5694, + "step": 8716 + }, + { + "epoch": 0.3121743334467384, + "grad_norm": 2.6552181243896484, + "learning_rate": 0.000161067293010552, + "loss": 1.4701, + "step": 8717 + }, + { + "epoch": 0.3122101455763067, + "grad_norm": 1.752543568611145, + "learning_rate": 0.00016105810753945076, + "loss": 1.6328, + "step": 8718 + }, + { + "epoch": 0.312245957705875, + "grad_norm": 1.854783296585083, + "learning_rate": 0.00016104892124689147, + "loss": 1.723, + "step": 8719 + }, + { + "epoch": 0.31228176983544326, + "grad_norm": 1.347758412361145, + "learning_rate": 0.00016103973413299767, + "loss": 1.4929, + "step": 8720 + }, + { + "epoch": 0.3123175819650115, + "grad_norm": 1.979087471961975, + "learning_rate": 0.00016103054619789298, + "loss": 1.639, + "step": 8721 + }, + { + "epoch": 0.31235339409457985, + "grad_norm": 2.2528393268585205, + "learning_rate": 0.00016102135744170098, + "loss": 1.7851, + "step": 8722 + }, + { + "epoch": 0.3123892062241481, + "grad_norm": 2.4195220470428467, + "learning_rate": 0.00016101216786454538, + "loss": 1.6844, + "step": 8723 + }, + { + "epoch": 0.3124250183537164, + "grad_norm": 2.5333340167999268, + "learning_rate": 0.00016100297746654975, + "loss": 1.7123, + "step": 8724 + }, + { + "epoch": 0.3124608304832847, + "grad_norm": 1.6024668216705322, + "learning_rate": 0.00016099378624783773, + "loss": 1.7999, + "step": 8725 + }, + { + "epoch": 0.312496642612853, + "grad_norm": 1.7024332284927368, + "learning_rate": 0.00016098459420853302, + "loss": 1.881, + "step": 8726 + }, + { + "epoch": 0.31253245474242125, + "grad_norm": 2.0940849781036377, + "learning_rate": 0.00016097540134875924, + "loss": 1.5664, + "step": 8727 + }, + { + "epoch": 0.3125682668719895, + "grad_norm": 1.422271728515625, + "learning_rate": 0.00016096620766864011, + "loss": 1.5992, + "step": 8728 + }, + { + "epoch": 0.31260407900155784, + "grad_norm": 2.1082029342651367, + "learning_rate": 0.00016095701316829925, + "loss": 1.2487, + "step": 8729 + }, + { + "epoch": 0.3126398911311261, + "grad_norm": 1.6328003406524658, + "learning_rate": 0.00016094781784786044, + "loss": 1.5948, + "step": 8730 + }, + { + "epoch": 0.3126757032606944, + "grad_norm": 1.901384949684143, + "learning_rate": 0.00016093862170744733, + "loss": 1.5415, + "step": 8731 + }, + { + "epoch": 0.3127115153902627, + "grad_norm": 1.7360332012176514, + "learning_rate": 0.00016092942474718372, + "loss": 1.7264, + "step": 8732 + }, + { + "epoch": 0.312747327519831, + "grad_norm": 1.5709059238433838, + "learning_rate": 0.00016092022696719327, + "loss": 1.798, + "step": 8733 + }, + { + "epoch": 0.31278313964939924, + "grad_norm": 1.9449841976165771, + "learning_rate": 0.00016091102836759974, + "loss": 1.915, + "step": 8734 + }, + { + "epoch": 0.3128189517789675, + "grad_norm": 1.2012020349502563, + "learning_rate": 0.00016090182894852687, + "loss": 1.1363, + "step": 8735 + }, + { + "epoch": 0.31285476390853584, + "grad_norm": 2.4898271560668945, + "learning_rate": 0.0001608926287100985, + "loss": 1.7112, + "step": 8736 + }, + { + "epoch": 0.3128905760381041, + "grad_norm": 2.0908682346343994, + "learning_rate": 0.00016088342765243832, + "loss": 1.8249, + "step": 8737 + }, + { + "epoch": 0.3129263881676724, + "grad_norm": 2.0061733722686768, + "learning_rate": 0.00016087422577567016, + "loss": 1.538, + "step": 8738 + }, + { + "epoch": 0.3129622002972407, + "grad_norm": 1.3970668315887451, + "learning_rate": 0.00016086502307991783, + "loss": 1.4419, + "step": 8739 + }, + { + "epoch": 0.31299801242680897, + "grad_norm": 1.6271003484725952, + "learning_rate": 0.0001608558195653051, + "loss": 1.2525, + "step": 8740 + }, + { + "epoch": 0.31303382455637724, + "grad_norm": 1.6746578216552734, + "learning_rate": 0.00016084661523195585, + "loss": 1.7385, + "step": 8741 + }, + { + "epoch": 0.3130696366859455, + "grad_norm": 2.0202746391296387, + "learning_rate": 0.00016083741007999388, + "loss": 1.5669, + "step": 8742 + }, + { + "epoch": 0.31310544881551383, + "grad_norm": 1.568692684173584, + "learning_rate": 0.00016082820410954297, + "loss": 1.6994, + "step": 8743 + }, + { + "epoch": 0.3131412609450821, + "grad_norm": 1.844053030014038, + "learning_rate": 0.0001608189973207271, + "loss": 1.8135, + "step": 8744 + }, + { + "epoch": 0.31317707307465037, + "grad_norm": 3.1667978763580322, + "learning_rate": 0.00016080978971367004, + "loss": 1.8685, + "step": 8745 + }, + { + "epoch": 0.3132128852042187, + "grad_norm": 2.173363447189331, + "learning_rate": 0.00016080058128849572, + "loss": 1.4853, + "step": 8746 + }, + { + "epoch": 0.31324869733378696, + "grad_norm": 1.3892388343811035, + "learning_rate": 0.000160791372045328, + "loss": 1.3848, + "step": 8747 + }, + { + "epoch": 0.31328450946335523, + "grad_norm": 1.9529330730438232, + "learning_rate": 0.00016078216198429077, + "loss": 1.4344, + "step": 8748 + }, + { + "epoch": 0.3133203215929235, + "grad_norm": 1.5327616930007935, + "learning_rate": 0.00016077295110550796, + "loss": 1.4037, + "step": 8749 + }, + { + "epoch": 0.3133561337224918, + "grad_norm": 1.5518343448638916, + "learning_rate": 0.0001607637394091035, + "loss": 1.4539, + "step": 8750 + }, + { + "epoch": 0.3133919458520601, + "grad_norm": 1.8877317905426025, + "learning_rate": 0.00016075452689520128, + "loss": 1.7284, + "step": 8751 + }, + { + "epoch": 0.31342775798162836, + "grad_norm": 2.0070106983184814, + "learning_rate": 0.0001607453135639253, + "loss": 1.7028, + "step": 8752 + }, + { + "epoch": 0.3134635701111967, + "grad_norm": 1.7277096509933472, + "learning_rate": 0.00016073609941539944, + "loss": 1.3793, + "step": 8753 + }, + { + "epoch": 0.31349938224076496, + "grad_norm": 2.4877145290374756, + "learning_rate": 0.00016072688444974774, + "loss": 1.5569, + "step": 8754 + }, + { + "epoch": 0.3135351943703332, + "grad_norm": 1.5935946702957153, + "learning_rate": 0.00016071766866709413, + "loss": 1.3458, + "step": 8755 + }, + { + "epoch": 0.3135710064999015, + "grad_norm": 1.6100068092346191, + "learning_rate": 0.00016070845206756263, + "loss": 1.7064, + "step": 8756 + }, + { + "epoch": 0.3136068186294698, + "grad_norm": 1.5823991298675537, + "learning_rate": 0.00016069923465127718, + "loss": 1.6565, + "step": 8757 + }, + { + "epoch": 0.3136426307590381, + "grad_norm": 1.5896168947219849, + "learning_rate": 0.00016069001641836182, + "loss": 1.2576, + "step": 8758 + }, + { + "epoch": 0.31367844288860636, + "grad_norm": 1.5499836206436157, + "learning_rate": 0.00016068079736894058, + "loss": 1.642, + "step": 8759 + }, + { + "epoch": 0.3137142550181747, + "grad_norm": 1.5305835008621216, + "learning_rate": 0.00016067157750313752, + "loss": 1.3835, + "step": 8760 + }, + { + "epoch": 0.31375006714774295, + "grad_norm": 1.4311829805374146, + "learning_rate": 0.00016066235682107662, + "loss": 1.6501, + "step": 8761 + }, + { + "epoch": 0.3137858792773112, + "grad_norm": 1.4986733198165894, + "learning_rate": 0.00016065313532288196, + "loss": 1.44, + "step": 8762 + }, + { + "epoch": 0.3138216914068795, + "grad_norm": 1.811113953590393, + "learning_rate": 0.0001606439130086776, + "loss": 1.341, + "step": 8763 + }, + { + "epoch": 0.3138575035364478, + "grad_norm": 1.6800071001052856, + "learning_rate": 0.00016063468987858763, + "loss": 1.6177, + "step": 8764 + }, + { + "epoch": 0.3138933156660161, + "grad_norm": 1.6395264863967896, + "learning_rate": 0.00016062546593273612, + "loss": 1.3695, + "step": 8765 + }, + { + "epoch": 0.31392912779558435, + "grad_norm": 1.7987147569656372, + "learning_rate": 0.00016061624117124715, + "loss": 1.634, + "step": 8766 + }, + { + "epoch": 0.3139649399251527, + "grad_norm": 1.6628080606460571, + "learning_rate": 0.00016060701559424484, + "loss": 1.509, + "step": 8767 + }, + { + "epoch": 0.31400075205472094, + "grad_norm": 1.6090434789657593, + "learning_rate": 0.00016059778920185332, + "loss": 1.3413, + "step": 8768 + }, + { + "epoch": 0.3140365641842892, + "grad_norm": 1.616045355796814, + "learning_rate": 0.0001605885619941967, + "loss": 1.6345, + "step": 8769 + }, + { + "epoch": 0.3140723763138575, + "grad_norm": 2.1155946254730225, + "learning_rate": 0.00016057933397139914, + "loss": 1.4609, + "step": 8770 + }, + { + "epoch": 0.3141081884434258, + "grad_norm": 1.705063819885254, + "learning_rate": 0.0001605701051335848, + "loss": 1.677, + "step": 8771 + }, + { + "epoch": 0.3141440005729941, + "grad_norm": 1.82172691822052, + "learning_rate": 0.00016056087548087784, + "loss": 1.5523, + "step": 8772 + }, + { + "epoch": 0.31417981270256234, + "grad_norm": 1.3792582750320435, + "learning_rate": 0.00016055164501340235, + "loss": 1.5636, + "step": 8773 + }, + { + "epoch": 0.31421562483213067, + "grad_norm": 1.869604468345642, + "learning_rate": 0.00016054241373128264, + "loss": 1.7203, + "step": 8774 + }, + { + "epoch": 0.31425143696169894, + "grad_norm": 1.405261754989624, + "learning_rate": 0.00016053318163464285, + "loss": 1.5212, + "step": 8775 + }, + { + "epoch": 0.3142872490912672, + "grad_norm": 1.8906536102294922, + "learning_rate": 0.00016052394872360717, + "loss": 1.6859, + "step": 8776 + }, + { + "epoch": 0.3143230612208355, + "grad_norm": 1.6787978410720825, + "learning_rate": 0.00016051471499829983, + "loss": 1.5595, + "step": 8777 + }, + { + "epoch": 0.3143588733504038, + "grad_norm": 1.543586254119873, + "learning_rate": 0.00016050548045884501, + "loss": 1.5273, + "step": 8778 + }, + { + "epoch": 0.31439468547997207, + "grad_norm": 1.6794708967208862, + "learning_rate": 0.00016049624510536704, + "loss": 1.7836, + "step": 8779 + }, + { + "epoch": 0.31443049760954034, + "grad_norm": 1.616531252861023, + "learning_rate": 0.00016048700893799014, + "loss": 1.3527, + "step": 8780 + }, + { + "epoch": 0.31446630973910866, + "grad_norm": 1.3275195360183716, + "learning_rate": 0.00016047777195683858, + "loss": 1.3845, + "step": 8781 + }, + { + "epoch": 0.31450212186867693, + "grad_norm": 2.247191905975342, + "learning_rate": 0.00016046853416203655, + "loss": 1.394, + "step": 8782 + }, + { + "epoch": 0.3145379339982452, + "grad_norm": 1.556274175643921, + "learning_rate": 0.0001604592955537084, + "loss": 1.7258, + "step": 8783 + }, + { + "epoch": 0.31457374612781347, + "grad_norm": 1.2704604864120483, + "learning_rate": 0.00016045005613197843, + "loss": 1.2194, + "step": 8784 + }, + { + "epoch": 0.3146095582573818, + "grad_norm": 2.084599494934082, + "learning_rate": 0.00016044081589697092, + "loss": 1.7761, + "step": 8785 + }, + { + "epoch": 0.31464537038695006, + "grad_norm": 1.5749155282974243, + "learning_rate": 0.0001604315748488102, + "loss": 1.4076, + "step": 8786 + }, + { + "epoch": 0.31468118251651833, + "grad_norm": 1.5411909818649292, + "learning_rate": 0.00016042233298762062, + "loss": 1.5305, + "step": 8787 + }, + { + "epoch": 0.31471699464608666, + "grad_norm": 1.7500187158584595, + "learning_rate": 0.00016041309031352644, + "loss": 1.3224, + "step": 8788 + }, + { + "epoch": 0.3147528067756549, + "grad_norm": 1.9515713453292847, + "learning_rate": 0.00016040384682665214, + "loss": 1.5637, + "step": 8789 + }, + { + "epoch": 0.3147886189052232, + "grad_norm": 1.4233375787734985, + "learning_rate": 0.0001603946025271219, + "loss": 1.6988, + "step": 8790 + }, + { + "epoch": 0.31482443103479146, + "grad_norm": 1.8382158279418945, + "learning_rate": 0.00016038535741506025, + "loss": 1.5435, + "step": 8791 + }, + { + "epoch": 0.3148602431643598, + "grad_norm": 1.6217607259750366, + "learning_rate": 0.00016037611149059147, + "loss": 1.4189, + "step": 8792 + }, + { + "epoch": 0.31489605529392806, + "grad_norm": 1.7804478406906128, + "learning_rate": 0.00016036686475384002, + "loss": 1.3143, + "step": 8793 + }, + { + "epoch": 0.3149318674234963, + "grad_norm": 1.3662558794021606, + "learning_rate": 0.00016035761720493023, + "loss": 1.4369, + "step": 8794 + }, + { + "epoch": 0.31496767955306465, + "grad_norm": 1.6197288036346436, + "learning_rate": 0.0001603483688439866, + "loss": 1.4308, + "step": 8795 + }, + { + "epoch": 0.3150034916826329, + "grad_norm": 2.1956467628479004, + "learning_rate": 0.00016033911967113347, + "loss": 1.4458, + "step": 8796 + }, + { + "epoch": 0.3150393038122012, + "grad_norm": 1.8377485275268555, + "learning_rate": 0.00016032986968649536, + "loss": 1.6501, + "step": 8797 + }, + { + "epoch": 0.31507511594176946, + "grad_norm": 2.408724546432495, + "learning_rate": 0.00016032061889019662, + "loss": 1.2958, + "step": 8798 + }, + { + "epoch": 0.3151109280713378, + "grad_norm": 1.6144731044769287, + "learning_rate": 0.00016031136728236184, + "loss": 1.5546, + "step": 8799 + }, + { + "epoch": 0.31514674020090605, + "grad_norm": 1.4665641784667969, + "learning_rate": 0.00016030211486311533, + "loss": 1.4769, + "step": 8800 + }, + { + "epoch": 0.3151825523304743, + "grad_norm": 1.562260389328003, + "learning_rate": 0.0001602928616325817, + "loss": 1.8916, + "step": 8801 + }, + { + "epoch": 0.31521836446004264, + "grad_norm": 1.9326006174087524, + "learning_rate": 0.00016028360759088534, + "loss": 1.6079, + "step": 8802 + }, + { + "epoch": 0.3152541765896109, + "grad_norm": 1.7705698013305664, + "learning_rate": 0.00016027435273815085, + "loss": 1.4926, + "step": 8803 + }, + { + "epoch": 0.3152899887191792, + "grad_norm": 1.473935842514038, + "learning_rate": 0.00016026509707450266, + "loss": 1.379, + "step": 8804 + }, + { + "epoch": 0.31532580084874745, + "grad_norm": 3.5197129249572754, + "learning_rate": 0.00016025584060006532, + "loss": 1.5021, + "step": 8805 + }, + { + "epoch": 0.3153616129783158, + "grad_norm": 1.7519930601119995, + "learning_rate": 0.0001602465833149634, + "loss": 1.5798, + "step": 8806 + }, + { + "epoch": 0.31539742510788404, + "grad_norm": 1.7574677467346191, + "learning_rate": 0.0001602373252193214, + "loss": 1.5769, + "step": 8807 + }, + { + "epoch": 0.3154332372374523, + "grad_norm": 1.7921092510223389, + "learning_rate": 0.0001602280663132639, + "loss": 1.31, + "step": 8808 + }, + { + "epoch": 0.31546904936702064, + "grad_norm": 2.8339967727661133, + "learning_rate": 0.00016021880659691546, + "loss": 1.3994, + "step": 8809 + }, + { + "epoch": 0.3155048614965889, + "grad_norm": 1.3064004182815552, + "learning_rate": 0.00016020954607040065, + "loss": 1.5455, + "step": 8810 + }, + { + "epoch": 0.3155406736261572, + "grad_norm": 1.6051712036132812, + "learning_rate": 0.00016020028473384402, + "loss": 1.7627, + "step": 8811 + }, + { + "epoch": 0.31557648575572544, + "grad_norm": 2.130141019821167, + "learning_rate": 0.00016019102258737027, + "loss": 1.3654, + "step": 8812 + }, + { + "epoch": 0.31561229788529377, + "grad_norm": 1.2903608083724976, + "learning_rate": 0.00016018175963110389, + "loss": 1.4281, + "step": 8813 + }, + { + "epoch": 0.31564811001486204, + "grad_norm": 2.988527536392212, + "learning_rate": 0.00016017249586516963, + "loss": 1.5691, + "step": 8814 + }, + { + "epoch": 0.3156839221444303, + "grad_norm": 1.7782341241836548, + "learning_rate": 0.000160163231289692, + "loss": 1.6, + "step": 8815 + }, + { + "epoch": 0.3157197342739986, + "grad_norm": 2.2343456745147705, + "learning_rate": 0.00016015396590479575, + "loss": 1.5319, + "step": 8816 + }, + { + "epoch": 0.3157555464035669, + "grad_norm": 1.5296396017074585, + "learning_rate": 0.00016014469971060543, + "loss": 1.5057, + "step": 8817 + }, + { + "epoch": 0.31579135853313517, + "grad_norm": 1.5707215070724487, + "learning_rate": 0.0001601354327072458, + "loss": 1.4725, + "step": 8818 + }, + { + "epoch": 0.31582717066270344, + "grad_norm": 1.7088760137557983, + "learning_rate": 0.00016012616489484148, + "loss": 1.4094, + "step": 8819 + }, + { + "epoch": 0.31586298279227176, + "grad_norm": 1.6975724697113037, + "learning_rate": 0.00016011689627351712, + "loss": 1.4261, + "step": 8820 + }, + { + "epoch": 0.31589879492184003, + "grad_norm": 2.539350986480713, + "learning_rate": 0.00016010762684339752, + "loss": 1.4377, + "step": 8821 + }, + { + "epoch": 0.3159346070514083, + "grad_norm": 1.2720330953598022, + "learning_rate": 0.00016009835660460732, + "loss": 1.4547, + "step": 8822 + }, + { + "epoch": 0.31597041918097657, + "grad_norm": 1.6770910024642944, + "learning_rate": 0.00016008908555727123, + "loss": 1.469, + "step": 8823 + }, + { + "epoch": 0.3160062313105449, + "grad_norm": 2.0010297298431396, + "learning_rate": 0.00016007981370151406, + "loss": 1.5004, + "step": 8824 + }, + { + "epoch": 0.31604204344011316, + "grad_norm": 1.965331792831421, + "learning_rate": 0.00016007054103746047, + "loss": 1.59, + "step": 8825 + }, + { + "epoch": 0.31607785556968143, + "grad_norm": 1.8841441869735718, + "learning_rate": 0.00016006126756523524, + "loss": 1.6099, + "step": 8826 + }, + { + "epoch": 0.31611366769924976, + "grad_norm": 1.748795986175537, + "learning_rate": 0.0001600519932849631, + "loss": 1.3553, + "step": 8827 + }, + { + "epoch": 0.316149479828818, + "grad_norm": 1.5516470670700073, + "learning_rate": 0.00016004271819676887, + "loss": 1.6262, + "step": 8828 + }, + { + "epoch": 0.3161852919583863, + "grad_norm": 1.339726209640503, + "learning_rate": 0.0001600334423007773, + "loss": 1.4469, + "step": 8829 + }, + { + "epoch": 0.31622110408795456, + "grad_norm": 1.936703085899353, + "learning_rate": 0.0001600241655971132, + "loss": 1.5799, + "step": 8830 + }, + { + "epoch": 0.3162569162175229, + "grad_norm": 2.148743152618408, + "learning_rate": 0.0001600148880859014, + "loss": 1.5546, + "step": 8831 + }, + { + "epoch": 0.31629272834709116, + "grad_norm": 2.121833562850952, + "learning_rate": 0.0001600056097672667, + "loss": 1.7325, + "step": 8832 + }, + { + "epoch": 0.3163285404766594, + "grad_norm": 1.56114661693573, + "learning_rate": 0.00015999633064133392, + "loss": 1.3962, + "step": 8833 + }, + { + "epoch": 0.31636435260622775, + "grad_norm": 1.717720866203308, + "learning_rate": 0.0001599870507082279, + "loss": 1.6272, + "step": 8834 + }, + { + "epoch": 0.316400164735796, + "grad_norm": 1.254668116569519, + "learning_rate": 0.0001599777699680735, + "loss": 1.547, + "step": 8835 + }, + { + "epoch": 0.3164359768653643, + "grad_norm": 1.9581760168075562, + "learning_rate": 0.0001599684884209955, + "loss": 1.5793, + "step": 8836 + }, + { + "epoch": 0.31647178899493256, + "grad_norm": 1.534761905670166, + "learning_rate": 0.00015995920606711893, + "loss": 1.2339, + "step": 8837 + }, + { + "epoch": 0.3165076011245009, + "grad_norm": 1.87445867061615, + "learning_rate": 0.00015994992290656855, + "loss": 1.6946, + "step": 8838 + }, + { + "epoch": 0.31654341325406915, + "grad_norm": 1.2772846221923828, + "learning_rate": 0.00015994063893946928, + "loss": 1.609, + "step": 8839 + }, + { + "epoch": 0.3165792253836374, + "grad_norm": 1.4138472080230713, + "learning_rate": 0.000159931354165946, + "loss": 1.6902, + "step": 8840 + }, + { + "epoch": 0.31661503751320574, + "grad_norm": 1.8769257068634033, + "learning_rate": 0.0001599220685861237, + "loss": 1.3907, + "step": 8841 + }, + { + "epoch": 0.316650849642774, + "grad_norm": 1.4753496646881104, + "learning_rate": 0.00015991278220012727, + "loss": 1.6795, + "step": 8842 + }, + { + "epoch": 0.3166866617723423, + "grad_norm": 2.22638201713562, + "learning_rate": 0.00015990349500808162, + "loss": 1.538, + "step": 8843 + }, + { + "epoch": 0.31672247390191055, + "grad_norm": 1.6586682796478271, + "learning_rate": 0.00015989420701011171, + "loss": 1.4434, + "step": 8844 + }, + { + "epoch": 0.3167582860314789, + "grad_norm": 1.5222989320755005, + "learning_rate": 0.0001598849182063425, + "loss": 1.4534, + "step": 8845 + }, + { + "epoch": 0.31679409816104714, + "grad_norm": 1.7269152402877808, + "learning_rate": 0.00015987562859689898, + "loss": 1.5415, + "step": 8846 + }, + { + "epoch": 0.3168299102906154, + "grad_norm": 1.076930284500122, + "learning_rate": 0.0001598663381819061, + "loss": 1.4976, + "step": 8847 + }, + { + "epoch": 0.31686572242018374, + "grad_norm": 1.478829026222229, + "learning_rate": 0.00015985704696148885, + "loss": 1.8695, + "step": 8848 + }, + { + "epoch": 0.316901534549752, + "grad_norm": 1.7606927156448364, + "learning_rate": 0.00015984775493577225, + "loss": 1.5552, + "step": 8849 + }, + { + "epoch": 0.3169373466793203, + "grad_norm": 1.9603705406188965, + "learning_rate": 0.0001598384621048813, + "loss": 1.4099, + "step": 8850 + }, + { + "epoch": 0.31697315880888854, + "grad_norm": 1.4589382410049438, + "learning_rate": 0.00015982916846894106, + "loss": 1.4863, + "step": 8851 + }, + { + "epoch": 0.31700897093845687, + "grad_norm": 1.6379142999649048, + "learning_rate": 0.0001598198740280765, + "loss": 1.6302, + "step": 8852 + }, + { + "epoch": 0.31704478306802514, + "grad_norm": 1.3787086009979248, + "learning_rate": 0.00015981057878241273, + "loss": 1.2704, + "step": 8853 + }, + { + "epoch": 0.3170805951975934, + "grad_norm": 1.5794938802719116, + "learning_rate": 0.00015980128273207473, + "loss": 1.6077, + "step": 8854 + }, + { + "epoch": 0.31711640732716173, + "grad_norm": 1.6516194343566895, + "learning_rate": 0.00015979198587718764, + "loss": 1.4949, + "step": 8855 + }, + { + "epoch": 0.31715221945673, + "grad_norm": 1.768387794494629, + "learning_rate": 0.00015978268821787648, + "loss": 2.0093, + "step": 8856 + }, + { + "epoch": 0.31718803158629827, + "grad_norm": 1.8336304426193237, + "learning_rate": 0.0001597733897542664, + "loss": 1.8041, + "step": 8857 + }, + { + "epoch": 0.31722384371586654, + "grad_norm": 1.474213719367981, + "learning_rate": 0.0001597640904864824, + "loss": 1.4373, + "step": 8858 + }, + { + "epoch": 0.31725965584543486, + "grad_norm": 2.2122840881347656, + "learning_rate": 0.00015975479041464974, + "loss": 1.4047, + "step": 8859 + }, + { + "epoch": 0.31729546797500313, + "grad_norm": 1.903743028640747, + "learning_rate": 0.0001597454895388934, + "loss": 1.3211, + "step": 8860 + }, + { + "epoch": 0.3173312801045714, + "grad_norm": 1.590577483177185, + "learning_rate": 0.00015973618785933858, + "loss": 1.6325, + "step": 8861 + }, + { + "epoch": 0.3173670922341397, + "grad_norm": 1.5541341304779053, + "learning_rate": 0.00015972688537611038, + "loss": 1.5876, + "step": 8862 + }, + { + "epoch": 0.317402904363708, + "grad_norm": 1.4802968502044678, + "learning_rate": 0.000159717582089334, + "loss": 1.5479, + "step": 8863 + }, + { + "epoch": 0.31743871649327626, + "grad_norm": 1.7659330368041992, + "learning_rate": 0.0001597082779991346, + "loss": 1.6208, + "step": 8864 + }, + { + "epoch": 0.31747452862284453, + "grad_norm": 1.7611559629440308, + "learning_rate": 0.0001596989731056373, + "loss": 1.5296, + "step": 8865 + }, + { + "epoch": 0.31751034075241286, + "grad_norm": 1.8107753992080688, + "learning_rate": 0.00015968966740896736, + "loss": 1.704, + "step": 8866 + }, + { + "epoch": 0.3175461528819811, + "grad_norm": 1.5399701595306396, + "learning_rate": 0.0001596803609092499, + "loss": 1.4575, + "step": 8867 + }, + { + "epoch": 0.3175819650115494, + "grad_norm": 1.4436416625976562, + "learning_rate": 0.0001596710536066102, + "loss": 1.1832, + "step": 8868 + }, + { + "epoch": 0.3176177771411177, + "grad_norm": 1.613898754119873, + "learning_rate": 0.00015966174550117342, + "loss": 1.6416, + "step": 8869 + }, + { + "epoch": 0.317653589270686, + "grad_norm": 1.4900312423706055, + "learning_rate": 0.00015965243659306482, + "loss": 1.4502, + "step": 8870 + }, + { + "epoch": 0.31768940140025426, + "grad_norm": 1.5705586671829224, + "learning_rate": 0.00015964312688240967, + "loss": 1.8653, + "step": 8871 + }, + { + "epoch": 0.3177252135298225, + "grad_norm": 2.103754997253418, + "learning_rate": 0.00015963381636933312, + "loss": 1.2867, + "step": 8872 + }, + { + "epoch": 0.31776102565939085, + "grad_norm": 1.2797647714614868, + "learning_rate": 0.00015962450505396051, + "loss": 1.2829, + "step": 8873 + }, + { + "epoch": 0.3177968377889591, + "grad_norm": 1.5395480394363403, + "learning_rate": 0.00015961519293641714, + "loss": 1.4187, + "step": 8874 + }, + { + "epoch": 0.3178326499185274, + "grad_norm": 1.9138820171356201, + "learning_rate": 0.0001596058800168282, + "loss": 1.371, + "step": 8875 + }, + { + "epoch": 0.3178684620480957, + "grad_norm": 1.351996898651123, + "learning_rate": 0.00015959656629531904, + "loss": 1.3799, + "step": 8876 + }, + { + "epoch": 0.317904274177664, + "grad_norm": 1.498100757598877, + "learning_rate": 0.00015958725177201495, + "loss": 1.3474, + "step": 8877 + }, + { + "epoch": 0.31794008630723225, + "grad_norm": 1.4798411130905151, + "learning_rate": 0.0001595779364470413, + "loss": 1.548, + "step": 8878 + }, + { + "epoch": 0.3179758984368005, + "grad_norm": 1.4499621391296387, + "learning_rate": 0.0001595686203205233, + "loss": 1.7545, + "step": 8879 + }, + { + "epoch": 0.31801171056636884, + "grad_norm": 1.3401906490325928, + "learning_rate": 0.00015955930339258634, + "loss": 1.5813, + "step": 8880 + }, + { + "epoch": 0.3180475226959371, + "grad_norm": 1.535853624343872, + "learning_rate": 0.00015954998566335583, + "loss": 1.8776, + "step": 8881 + }, + { + "epoch": 0.3180833348255054, + "grad_norm": 1.4423049688339233, + "learning_rate": 0.00015954066713295707, + "loss": 1.468, + "step": 8882 + }, + { + "epoch": 0.3181191469550737, + "grad_norm": 1.5601565837860107, + "learning_rate": 0.00015953134780151543, + "loss": 1.6817, + "step": 8883 + }, + { + "epoch": 0.318154959084642, + "grad_norm": 1.770087480545044, + "learning_rate": 0.00015952202766915627, + "loss": 1.8615, + "step": 8884 + }, + { + "epoch": 0.31819077121421024, + "grad_norm": 1.8110880851745605, + "learning_rate": 0.00015951270673600503, + "loss": 1.1239, + "step": 8885 + }, + { + "epoch": 0.3182265833437785, + "grad_norm": 1.4439191818237305, + "learning_rate": 0.0001595033850021871, + "loss": 1.377, + "step": 8886 + }, + { + "epoch": 0.31826239547334684, + "grad_norm": 2.1991729736328125, + "learning_rate": 0.00015949406246782785, + "loss": 1.3046, + "step": 8887 + }, + { + "epoch": 0.3182982076029151, + "grad_norm": 1.6699777841567993, + "learning_rate": 0.00015948473913305274, + "loss": 1.4169, + "step": 8888 + }, + { + "epoch": 0.3183340197324834, + "grad_norm": 1.6651912927627563, + "learning_rate": 0.00015947541499798721, + "loss": 1.6165, + "step": 8889 + }, + { + "epoch": 0.3183698318620517, + "grad_norm": 1.996559500694275, + "learning_rate": 0.00015946609006275666, + "loss": 1.3417, + "step": 8890 + }, + { + "epoch": 0.31840564399161997, + "grad_norm": 1.8512824773788452, + "learning_rate": 0.0001594567643274866, + "loss": 1.5618, + "step": 8891 + }, + { + "epoch": 0.31844145612118824, + "grad_norm": 2.2073004245758057, + "learning_rate": 0.00015944743779230244, + "loss": 1.4179, + "step": 8892 + }, + { + "epoch": 0.3184772682507565, + "grad_norm": 1.4508863687515259, + "learning_rate": 0.00015943811045732973, + "loss": 1.4505, + "step": 8893 + }, + { + "epoch": 0.31851308038032483, + "grad_norm": 2.824732542037964, + "learning_rate": 0.00015942878232269388, + "loss": 1.6473, + "step": 8894 + }, + { + "epoch": 0.3185488925098931, + "grad_norm": 1.6162723302841187, + "learning_rate": 0.00015941945338852044, + "loss": 1.1406, + "step": 8895 + }, + { + "epoch": 0.31858470463946137, + "grad_norm": 1.8925987482070923, + "learning_rate": 0.0001594101236549349, + "loss": 1.3241, + "step": 8896 + }, + { + "epoch": 0.3186205167690297, + "grad_norm": 1.586099624633789, + "learning_rate": 0.00015940079312206276, + "loss": 1.3499, + "step": 8897 + }, + { + "epoch": 0.31865632889859796, + "grad_norm": 1.7400895357131958, + "learning_rate": 0.00015939146179002957, + "loss": 1.5616, + "step": 8898 + }, + { + "epoch": 0.31869214102816623, + "grad_norm": 1.7760902643203735, + "learning_rate": 0.00015938212965896088, + "loss": 1.4622, + "step": 8899 + }, + { + "epoch": 0.3187279531577345, + "grad_norm": 1.524623990058899, + "learning_rate": 0.00015937279672898223, + "loss": 1.5722, + "step": 8900 + }, + { + "epoch": 0.3187637652873028, + "grad_norm": 1.5912421941757202, + "learning_rate": 0.0001593634630002192, + "loss": 1.4204, + "step": 8901 + }, + { + "epoch": 0.3187995774168711, + "grad_norm": 2.2556264400482178, + "learning_rate": 0.00015935412847279735, + "loss": 1.7675, + "step": 8902 + }, + { + "epoch": 0.31883538954643936, + "grad_norm": 1.4288015365600586, + "learning_rate": 0.00015934479314684224, + "loss": 1.4929, + "step": 8903 + }, + { + "epoch": 0.3188712016760077, + "grad_norm": 1.7608261108398438, + "learning_rate": 0.00015933545702247952, + "loss": 1.6204, + "step": 8904 + }, + { + "epoch": 0.31890701380557596, + "grad_norm": 1.5694899559020996, + "learning_rate": 0.00015932612009983475, + "loss": 1.5306, + "step": 8905 + }, + { + "epoch": 0.3189428259351442, + "grad_norm": 1.9936593770980835, + "learning_rate": 0.00015931678237903353, + "loss": 1.7889, + "step": 8906 + }, + { + "epoch": 0.3189786380647125, + "grad_norm": 1.8415395021438599, + "learning_rate": 0.00015930744386020152, + "loss": 1.3515, + "step": 8907 + }, + { + "epoch": 0.3190144501942808, + "grad_norm": 2.319857597351074, + "learning_rate": 0.0001592981045434644, + "loss": 1.338, + "step": 8908 + }, + { + "epoch": 0.3190502623238491, + "grad_norm": 1.5549952983856201, + "learning_rate": 0.0001592887644289477, + "loss": 1.5115, + "step": 8909 + }, + { + "epoch": 0.31908607445341736, + "grad_norm": 2.5317156314849854, + "learning_rate": 0.0001592794235167772, + "loss": 1.2207, + "step": 8910 + }, + { + "epoch": 0.3191218865829857, + "grad_norm": 1.3313252925872803, + "learning_rate": 0.00015927008180707854, + "loss": 1.5237, + "step": 8911 + }, + { + "epoch": 0.31915769871255395, + "grad_norm": 2.1463143825531006, + "learning_rate": 0.00015926073929997735, + "loss": 1.4248, + "step": 8912 + }, + { + "epoch": 0.3191935108421222, + "grad_norm": 1.8104480504989624, + "learning_rate": 0.00015925139599559939, + "loss": 1.5752, + "step": 8913 + }, + { + "epoch": 0.3192293229716905, + "grad_norm": 2.5905628204345703, + "learning_rate": 0.0001592420518940703, + "loss": 1.5152, + "step": 8914 + }, + { + "epoch": 0.3192651351012588, + "grad_norm": 2.154703140258789, + "learning_rate": 0.0001592327069955158, + "loss": 1.8066, + "step": 8915 + }, + { + "epoch": 0.3193009472308271, + "grad_norm": 1.2889586687088013, + "learning_rate": 0.00015922336130006162, + "loss": 1.2934, + "step": 8916 + }, + { + "epoch": 0.31933675936039535, + "grad_norm": 1.5765522718429565, + "learning_rate": 0.00015921401480783356, + "loss": 1.7927, + "step": 8917 + }, + { + "epoch": 0.3193725714899637, + "grad_norm": 1.3909289836883545, + "learning_rate": 0.0001592046675189573, + "loss": 1.4859, + "step": 8918 + }, + { + "epoch": 0.31940838361953194, + "grad_norm": 1.4396029710769653, + "learning_rate": 0.00015919531943355857, + "loss": 1.3662, + "step": 8919 + }, + { + "epoch": 0.3194441957491002, + "grad_norm": 2.2816967964172363, + "learning_rate": 0.0001591859705517632, + "loss": 1.678, + "step": 8920 + }, + { + "epoch": 0.3194800078786685, + "grad_norm": 2.0098812580108643, + "learning_rate": 0.00015917662087369693, + "loss": 1.8597, + "step": 8921 + }, + { + "epoch": 0.3195158200082368, + "grad_norm": 1.5743976831436157, + "learning_rate": 0.0001591672703994856, + "loss": 1.3576, + "step": 8922 + }, + { + "epoch": 0.3195516321378051, + "grad_norm": 1.5148391723632812, + "learning_rate": 0.00015915791912925493, + "loss": 1.4273, + "step": 8923 + }, + { + "epoch": 0.31958744426737334, + "grad_norm": 1.6980741024017334, + "learning_rate": 0.00015914856706313076, + "loss": 1.4308, + "step": 8924 + }, + { + "epoch": 0.31962325639694167, + "grad_norm": 1.7159067392349243, + "learning_rate": 0.00015913921420123892, + "loss": 1.843, + "step": 8925 + }, + { + "epoch": 0.31965906852650994, + "grad_norm": 1.428824782371521, + "learning_rate": 0.00015912986054370524, + "loss": 1.4445, + "step": 8926 + }, + { + "epoch": 0.3196948806560782, + "grad_norm": 1.6858423948287964, + "learning_rate": 0.00015912050609065556, + "loss": 1.1656, + "step": 8927 + }, + { + "epoch": 0.3197306927856465, + "grad_norm": 1.671278476715088, + "learning_rate": 0.00015911115084221575, + "loss": 1.7137, + "step": 8928 + }, + { + "epoch": 0.3197665049152148, + "grad_norm": 2.173511505126953, + "learning_rate": 0.00015910179479851163, + "loss": 1.6492, + "step": 8929 + }, + { + "epoch": 0.31980231704478307, + "grad_norm": 1.9193826913833618, + "learning_rate": 0.0001590924379596691, + "loss": 1.5464, + "step": 8930 + }, + { + "epoch": 0.31983812917435134, + "grad_norm": 1.7963279485702515, + "learning_rate": 0.00015908308032581406, + "loss": 1.7011, + "step": 8931 + }, + { + "epoch": 0.31987394130391966, + "grad_norm": 1.4274190664291382, + "learning_rate": 0.00015907372189707237, + "loss": 1.4982, + "step": 8932 + }, + { + "epoch": 0.31990975343348793, + "grad_norm": 1.926442265510559, + "learning_rate": 0.00015906436267356993, + "loss": 1.3027, + "step": 8933 + }, + { + "epoch": 0.3199455655630562, + "grad_norm": 1.577248454093933, + "learning_rate": 0.00015905500265543272, + "loss": 1.4747, + "step": 8934 + }, + { + "epoch": 0.31998137769262447, + "grad_norm": 2.0698063373565674, + "learning_rate": 0.0001590456418427866, + "loss": 1.6706, + "step": 8935 + }, + { + "epoch": 0.3200171898221928, + "grad_norm": 1.6844439506530762, + "learning_rate": 0.00015903628023575755, + "loss": 1.2918, + "step": 8936 + }, + { + "epoch": 0.32005300195176106, + "grad_norm": 1.304545283317566, + "learning_rate": 0.00015902691783447142, + "loss": 1.4205, + "step": 8937 + }, + { + "epoch": 0.32008881408132933, + "grad_norm": 1.6640790700912476, + "learning_rate": 0.00015901755463905434, + "loss": 1.5702, + "step": 8938 + }, + { + "epoch": 0.32012462621089766, + "grad_norm": 1.569692611694336, + "learning_rate": 0.00015900819064963218, + "loss": 1.5811, + "step": 8939 + }, + { + "epoch": 0.3201604383404659, + "grad_norm": 1.8186932802200317, + "learning_rate": 0.00015899882586633093, + "loss": 1.2104, + "step": 8940 + }, + { + "epoch": 0.3201962504700342, + "grad_norm": 1.6925221681594849, + "learning_rate": 0.00015898946028927656, + "loss": 1.5124, + "step": 8941 + }, + { + "epoch": 0.32023206259960246, + "grad_norm": 1.6941040754318237, + "learning_rate": 0.0001589800939185951, + "loss": 1.5075, + "step": 8942 + }, + { + "epoch": 0.3202678747291708, + "grad_norm": 1.9452513456344604, + "learning_rate": 0.00015897072675441254, + "loss": 1.5764, + "step": 8943 + }, + { + "epoch": 0.32030368685873906, + "grad_norm": 2.2057533264160156, + "learning_rate": 0.00015896135879685494, + "loss": 1.7398, + "step": 8944 + }, + { + "epoch": 0.3203394989883073, + "grad_norm": 1.8561280965805054, + "learning_rate": 0.0001589519900460483, + "loss": 1.5916, + "step": 8945 + }, + { + "epoch": 0.32037531111787565, + "grad_norm": 1.8078523874282837, + "learning_rate": 0.00015894262050211868, + "loss": 1.4903, + "step": 8946 + }, + { + "epoch": 0.3204111232474439, + "grad_norm": 1.5678856372833252, + "learning_rate": 0.00015893325016519213, + "loss": 1.5585, + "step": 8947 + }, + { + "epoch": 0.3204469353770122, + "grad_norm": 2.330113172531128, + "learning_rate": 0.0001589238790353947, + "loss": 1.6519, + "step": 8948 + }, + { + "epoch": 0.32048274750658046, + "grad_norm": 1.4870229959487915, + "learning_rate": 0.00015891450711285254, + "loss": 1.569, + "step": 8949 + }, + { + "epoch": 0.3205185596361488, + "grad_norm": 2.701037883758545, + "learning_rate": 0.00015890513439769164, + "loss": 1.8232, + "step": 8950 + }, + { + "epoch": 0.32055437176571705, + "grad_norm": 1.4118682146072388, + "learning_rate": 0.00015889576089003814, + "loss": 1.373, + "step": 8951 + }, + { + "epoch": 0.3205901838952853, + "grad_norm": 1.6131876707077026, + "learning_rate": 0.00015888638659001815, + "loss": 1.503, + "step": 8952 + }, + { + "epoch": 0.32062599602485364, + "grad_norm": 2.0327181816101074, + "learning_rate": 0.0001588770114977578, + "loss": 1.5179, + "step": 8953 + }, + { + "epoch": 0.3206618081544219, + "grad_norm": 1.8969197273254395, + "learning_rate": 0.00015886763561338317, + "loss": 1.8301, + "step": 8954 + }, + { + "epoch": 0.3206976202839902, + "grad_norm": 1.4035693407058716, + "learning_rate": 0.00015885825893702048, + "loss": 1.6315, + "step": 8955 + }, + { + "epoch": 0.32073343241355845, + "grad_norm": 1.4035834074020386, + "learning_rate": 0.0001588488814687958, + "loss": 1.5281, + "step": 8956 + }, + { + "epoch": 0.3207692445431268, + "grad_norm": 1.6486170291900635, + "learning_rate": 0.00015883950320883536, + "loss": 1.3552, + "step": 8957 + }, + { + "epoch": 0.32080505667269504, + "grad_norm": 1.6082379817962646, + "learning_rate": 0.0001588301241572653, + "loss": 1.5837, + "step": 8958 + }, + { + "epoch": 0.3208408688022633, + "grad_norm": 1.3907922506332397, + "learning_rate": 0.0001588207443142118, + "loss": 1.6525, + "step": 8959 + }, + { + "epoch": 0.32087668093183164, + "grad_norm": 1.9259198904037476, + "learning_rate": 0.00015881136367980103, + "loss": 1.6522, + "step": 8960 + }, + { + "epoch": 0.3209124930613999, + "grad_norm": 1.92613685131073, + "learning_rate": 0.00015880198225415925, + "loss": 1.2868, + "step": 8961 + }, + { + "epoch": 0.3209483051909682, + "grad_norm": 1.6421856880187988, + "learning_rate": 0.00015879260003741265, + "loss": 1.7828, + "step": 8962 + }, + { + "epoch": 0.32098411732053644, + "grad_norm": 1.4043571949005127, + "learning_rate": 0.00015878321702968745, + "loss": 1.6435, + "step": 8963 + }, + { + "epoch": 0.32101992945010477, + "grad_norm": 2.38551926612854, + "learning_rate": 0.0001587738332311099, + "loss": 2.2681, + "step": 8964 + }, + { + "epoch": 0.32105574157967304, + "grad_norm": 1.5245349407196045, + "learning_rate": 0.0001587644486418062, + "loss": 1.5347, + "step": 8965 + }, + { + "epoch": 0.3210915537092413, + "grad_norm": 1.5355474948883057, + "learning_rate": 0.00015875506326190267, + "loss": 1.2685, + "step": 8966 + }, + { + "epoch": 0.32112736583880963, + "grad_norm": 1.4486896991729736, + "learning_rate": 0.00015874567709152557, + "loss": 1.221, + "step": 8967 + }, + { + "epoch": 0.3211631779683779, + "grad_norm": 1.4446889162063599, + "learning_rate": 0.00015873629013080114, + "loss": 1.5916, + "step": 8968 + }, + { + "epoch": 0.32119899009794617, + "grad_norm": 2.2588870525360107, + "learning_rate": 0.0001587269023798557, + "loss": 1.5266, + "step": 8969 + }, + { + "epoch": 0.32123480222751444, + "grad_norm": 1.638065218925476, + "learning_rate": 0.0001587175138388155, + "loss": 1.5114, + "step": 8970 + }, + { + "epoch": 0.32127061435708276, + "grad_norm": 2.1063241958618164, + "learning_rate": 0.00015870812450780695, + "loss": 1.5352, + "step": 8971 + }, + { + "epoch": 0.32130642648665103, + "grad_norm": 1.6843229532241821, + "learning_rate": 0.00015869873438695628, + "loss": 1.4685, + "step": 8972 + }, + { + "epoch": 0.3213422386162193, + "grad_norm": 1.2848390340805054, + "learning_rate": 0.00015868934347638985, + "loss": 1.3653, + "step": 8973 + }, + { + "epoch": 0.3213780507457876, + "grad_norm": 1.7618560791015625, + "learning_rate": 0.00015867995177623403, + "loss": 1.5555, + "step": 8974 + }, + { + "epoch": 0.3214138628753559, + "grad_norm": 2.0183522701263428, + "learning_rate": 0.00015867055928661517, + "loss": 1.6017, + "step": 8975 + }, + { + "epoch": 0.32144967500492416, + "grad_norm": 1.6978468894958496, + "learning_rate": 0.00015866116600765957, + "loss": 1.493, + "step": 8976 + }, + { + "epoch": 0.32148548713449243, + "grad_norm": 1.5752780437469482, + "learning_rate": 0.00015865177193949366, + "loss": 1.5721, + "step": 8977 + }, + { + "epoch": 0.32152129926406076, + "grad_norm": 1.5918536186218262, + "learning_rate": 0.0001586423770822438, + "loss": 1.4768, + "step": 8978 + }, + { + "epoch": 0.321557111393629, + "grad_norm": 1.8312736749649048, + "learning_rate": 0.0001586329814360364, + "loss": 1.6069, + "step": 8979 + }, + { + "epoch": 0.3215929235231973, + "grad_norm": 1.5193839073181152, + "learning_rate": 0.0001586235850009979, + "loss": 1.386, + "step": 8980 + }, + { + "epoch": 0.3216287356527656, + "grad_norm": 2.7475154399871826, + "learning_rate": 0.00015861418777725467, + "loss": 1.451, + "step": 8981 + }, + { + "epoch": 0.3216645477823339, + "grad_norm": 1.462113618850708, + "learning_rate": 0.00015860478976493313, + "loss": 1.4151, + "step": 8982 + }, + { + "epoch": 0.32170035991190216, + "grad_norm": 1.9317468404769897, + "learning_rate": 0.00015859539096415976, + "loss": 1.6592, + "step": 8983 + }, + { + "epoch": 0.3217361720414704, + "grad_norm": 1.8942067623138428, + "learning_rate": 0.000158585991375061, + "loss": 1.643, + "step": 8984 + }, + { + "epoch": 0.32177198417103875, + "grad_norm": 1.6340413093566895, + "learning_rate": 0.00015857659099776327, + "loss": 1.5538, + "step": 8985 + }, + { + "epoch": 0.321807796300607, + "grad_norm": 1.7912198305130005, + "learning_rate": 0.0001585671898323931, + "loss": 1.7632, + "step": 8986 + }, + { + "epoch": 0.3218436084301753, + "grad_norm": 1.6519590616226196, + "learning_rate": 0.0001585577878790769, + "loss": 1.1778, + "step": 8987 + }, + { + "epoch": 0.3218794205597436, + "grad_norm": 1.6828880310058594, + "learning_rate": 0.00015854838513794118, + "loss": 1.7943, + "step": 8988 + }, + { + "epoch": 0.3219152326893119, + "grad_norm": 1.6437028646469116, + "learning_rate": 0.00015853898160911252, + "loss": 1.8893, + "step": 8989 + }, + { + "epoch": 0.32195104481888015, + "grad_norm": 1.3070831298828125, + "learning_rate": 0.00015852957729271735, + "loss": 1.1287, + "step": 8990 + }, + { + "epoch": 0.3219868569484484, + "grad_norm": 1.915002703666687, + "learning_rate": 0.00015852017218888218, + "loss": 1.3972, + "step": 8991 + }, + { + "epoch": 0.32202266907801674, + "grad_norm": 1.2701020240783691, + "learning_rate": 0.0001585107662977336, + "loss": 1.5031, + "step": 8992 + }, + { + "epoch": 0.322058481207585, + "grad_norm": 1.6998685598373413, + "learning_rate": 0.00015850135961939814, + "loss": 1.2343, + "step": 8993 + }, + { + "epoch": 0.3220942933371533, + "grad_norm": 1.3520300388336182, + "learning_rate": 0.00015849195215400234, + "loss": 1.6848, + "step": 8994 + }, + { + "epoch": 0.3221301054667216, + "grad_norm": 1.4436218738555908, + "learning_rate": 0.0001584825439016728, + "loss": 1.5324, + "step": 8995 + }, + { + "epoch": 0.3221659175962899, + "grad_norm": 2.294487953186035, + "learning_rate": 0.00015847313486253603, + "loss": 1.8497, + "step": 8996 + }, + { + "epoch": 0.32220172972585814, + "grad_norm": 1.5929428339004517, + "learning_rate": 0.0001584637250367187, + "loss": 1.4359, + "step": 8997 + }, + { + "epoch": 0.3222375418554264, + "grad_norm": 2.296165943145752, + "learning_rate": 0.00015845431442434733, + "loss": 1.8707, + "step": 8998 + }, + { + "epoch": 0.32227335398499474, + "grad_norm": 1.5806833505630493, + "learning_rate": 0.00015844490302554856, + "loss": 1.6144, + "step": 8999 + }, + { + "epoch": 0.322309166114563, + "grad_norm": 1.7950143814086914, + "learning_rate": 0.00015843549084044903, + "loss": 1.7005, + "step": 9000 + }, + { + "epoch": 0.3223449782441313, + "grad_norm": 2.0502243041992188, + "learning_rate": 0.0001584260778691753, + "loss": 1.5181, + "step": 9001 + }, + { + "epoch": 0.3223807903736996, + "grad_norm": 1.9354183673858643, + "learning_rate": 0.00015841666411185411, + "loss": 1.3239, + "step": 9002 + }, + { + "epoch": 0.32241660250326787, + "grad_norm": 1.4576351642608643, + "learning_rate": 0.000158407249568612, + "loss": 1.46, + "step": 9003 + }, + { + "epoch": 0.32245241463283614, + "grad_norm": 1.6081417798995972, + "learning_rate": 0.00015839783423957576, + "loss": 1.4387, + "step": 9004 + }, + { + "epoch": 0.3224882267624044, + "grad_norm": 2.0253992080688477, + "learning_rate": 0.00015838841812487194, + "loss": 1.7281, + "step": 9005 + }, + { + "epoch": 0.32252403889197273, + "grad_norm": 1.742729902267456, + "learning_rate": 0.00015837900122462725, + "loss": 1.3755, + "step": 9006 + }, + { + "epoch": 0.322559851021541, + "grad_norm": 2.4512531757354736, + "learning_rate": 0.00015836958353896845, + "loss": 1.3772, + "step": 9007 + }, + { + "epoch": 0.32259566315110927, + "grad_norm": 1.7238280773162842, + "learning_rate": 0.00015836016506802218, + "loss": 1.3526, + "step": 9008 + }, + { + "epoch": 0.3226314752806776, + "grad_norm": 2.1217103004455566, + "learning_rate": 0.00015835074581191516, + "loss": 1.7, + "step": 9009 + }, + { + "epoch": 0.32266728741024586, + "grad_norm": 1.888587474822998, + "learning_rate": 0.00015834132577077412, + "loss": 1.5873, + "step": 9010 + }, + { + "epoch": 0.32270309953981413, + "grad_norm": 1.8031010627746582, + "learning_rate": 0.00015833190494472582, + "loss": 1.8754, + "step": 9011 + }, + { + "epoch": 0.3227389116693824, + "grad_norm": 1.394250750541687, + "learning_rate": 0.00015832248333389693, + "loss": 1.5981, + "step": 9012 + }, + { + "epoch": 0.3227747237989507, + "grad_norm": 1.8603804111480713, + "learning_rate": 0.00015831306093841432, + "loss": 1.4249, + "step": 9013 + }, + { + "epoch": 0.322810535928519, + "grad_norm": 2.721745014190674, + "learning_rate": 0.00015830363775840467, + "loss": 1.3469, + "step": 9014 + }, + { + "epoch": 0.32284634805808726, + "grad_norm": 1.7731702327728271, + "learning_rate": 0.00015829421379399475, + "loss": 1.742, + "step": 9015 + }, + { + "epoch": 0.32288216018765553, + "grad_norm": 1.3356654644012451, + "learning_rate": 0.00015828478904531142, + "loss": 1.4635, + "step": 9016 + }, + { + "epoch": 0.32291797231722386, + "grad_norm": 1.3644803762435913, + "learning_rate": 0.0001582753635124814, + "loss": 1.3199, + "step": 9017 + }, + { + "epoch": 0.3229537844467921, + "grad_norm": 1.669406771659851, + "learning_rate": 0.00015826593719563156, + "loss": 1.3352, + "step": 9018 + }, + { + "epoch": 0.3229895965763604, + "grad_norm": 1.548660397529602, + "learning_rate": 0.0001582565100948887, + "loss": 1.5234, + "step": 9019 + }, + { + "epoch": 0.3230254087059287, + "grad_norm": 1.9580634832382202, + "learning_rate": 0.00015824708221037965, + "loss": 1.5996, + "step": 9020 + }, + { + "epoch": 0.323061220835497, + "grad_norm": 1.9380831718444824, + "learning_rate": 0.0001582376535422312, + "loss": 1.245, + "step": 9021 + }, + { + "epoch": 0.32309703296506526, + "grad_norm": 1.6162350177764893, + "learning_rate": 0.00015822822409057024, + "loss": 1.4507, + "step": 9022 + }, + { + "epoch": 0.3231328450946335, + "grad_norm": 1.2511042356491089, + "learning_rate": 0.00015821879385552367, + "loss": 1.3691, + "step": 9023 + }, + { + "epoch": 0.32316865722420185, + "grad_norm": 1.4707595109939575, + "learning_rate": 0.00015820936283721834, + "loss": 1.6036, + "step": 9024 + }, + { + "epoch": 0.3232044693537701, + "grad_norm": 2.1324875354766846, + "learning_rate": 0.00015819993103578106, + "loss": 1.4427, + "step": 9025 + }, + { + "epoch": 0.3232402814833384, + "grad_norm": 1.6112456321716309, + "learning_rate": 0.0001581904984513388, + "loss": 1.3933, + "step": 9026 + }, + { + "epoch": 0.3232760936129067, + "grad_norm": 2.6338889598846436, + "learning_rate": 0.00015818106508401847, + "loss": 1.508, + "step": 9027 + }, + { + "epoch": 0.323311905742475, + "grad_norm": 2.0561411380767822, + "learning_rate": 0.00015817163093394693, + "loss": 1.3873, + "step": 9028 + }, + { + "epoch": 0.32334771787204325, + "grad_norm": 1.4188092947006226, + "learning_rate": 0.00015816219600125114, + "loss": 1.3726, + "step": 9029 + }, + { + "epoch": 0.3233835300016115, + "grad_norm": 2.1878104209899902, + "learning_rate": 0.00015815276028605807, + "loss": 1.3186, + "step": 9030 + }, + { + "epoch": 0.32341934213117984, + "grad_norm": 1.4584554433822632, + "learning_rate": 0.00015814332378849457, + "loss": 1.5844, + "step": 9031 + }, + { + "epoch": 0.3234551542607481, + "grad_norm": 2.6032674312591553, + "learning_rate": 0.00015813388650868766, + "loss": 1.4215, + "step": 9032 + }, + { + "epoch": 0.3234909663903164, + "grad_norm": 1.6895302534103394, + "learning_rate": 0.00015812444844676428, + "loss": 1.5978, + "step": 9033 + }, + { + "epoch": 0.3235267785198847, + "grad_norm": 1.8651604652404785, + "learning_rate": 0.00015811500960285143, + "loss": 1.3851, + "step": 9034 + }, + { + "epoch": 0.323562590649453, + "grad_norm": 1.7477426528930664, + "learning_rate": 0.00015810556997707608, + "loss": 1.2799, + "step": 9035 + }, + { + "epoch": 0.32359840277902124, + "grad_norm": 1.9270200729370117, + "learning_rate": 0.00015809612956956527, + "loss": 1.335, + "step": 9036 + }, + { + "epoch": 0.3236342149085895, + "grad_norm": 1.5941219329833984, + "learning_rate": 0.00015808668838044595, + "loss": 1.5451, + "step": 9037 + }, + { + "epoch": 0.32367002703815784, + "grad_norm": 2.0007755756378174, + "learning_rate": 0.00015807724640984518, + "loss": 1.4461, + "step": 9038 + }, + { + "epoch": 0.3237058391677261, + "grad_norm": 2.1980857849121094, + "learning_rate": 0.00015806780365788998, + "loss": 1.8456, + "step": 9039 + }, + { + "epoch": 0.3237416512972944, + "grad_norm": 4.429500579833984, + "learning_rate": 0.00015805836012470733, + "loss": 1.8815, + "step": 9040 + }, + { + "epoch": 0.3237774634268627, + "grad_norm": 1.9881982803344727, + "learning_rate": 0.0001580489158104244, + "loss": 1.5833, + "step": 9041 + }, + { + "epoch": 0.32381327555643097, + "grad_norm": 2.5081233978271484, + "learning_rate": 0.00015803947071516813, + "loss": 1.2176, + "step": 9042 + }, + { + "epoch": 0.32384908768599924, + "grad_norm": 1.551257848739624, + "learning_rate": 0.00015803002483906568, + "loss": 1.6953, + "step": 9043 + }, + { + "epoch": 0.3238848998155675, + "grad_norm": 1.6371716260910034, + "learning_rate": 0.0001580205781822441, + "loss": 1.4023, + "step": 9044 + }, + { + "epoch": 0.32392071194513583, + "grad_norm": 1.3484255075454712, + "learning_rate": 0.00015801113074483046, + "loss": 1.6748, + "step": 9045 + }, + { + "epoch": 0.3239565240747041, + "grad_norm": 1.9768213033676147, + "learning_rate": 0.0001580016825269519, + "loss": 1.6158, + "step": 9046 + }, + { + "epoch": 0.32399233620427237, + "grad_norm": 1.9486857652664185, + "learning_rate": 0.00015799223352873555, + "loss": 1.4215, + "step": 9047 + }, + { + "epoch": 0.3240281483338407, + "grad_norm": 1.4396650791168213, + "learning_rate": 0.00015798278375030845, + "loss": 1.1456, + "step": 9048 + }, + { + "epoch": 0.32406396046340896, + "grad_norm": 1.3435920476913452, + "learning_rate": 0.0001579733331917978, + "loss": 1.4729, + "step": 9049 + }, + { + "epoch": 0.32409977259297723, + "grad_norm": 1.549360990524292, + "learning_rate": 0.00015796388185333076, + "loss": 1.4954, + "step": 9050 + }, + { + "epoch": 0.3241355847225455, + "grad_norm": 1.8293956518173218, + "learning_rate": 0.00015795442973503442, + "loss": 1.6314, + "step": 9051 + }, + { + "epoch": 0.3241713968521138, + "grad_norm": 1.7965165376663208, + "learning_rate": 0.00015794497683703601, + "loss": 1.9173, + "step": 9052 + }, + { + "epoch": 0.3242072089816821, + "grad_norm": 1.5780364274978638, + "learning_rate": 0.00015793552315946266, + "loss": 1.5728, + "step": 9053 + }, + { + "epoch": 0.32424302111125036, + "grad_norm": 3.517538547515869, + "learning_rate": 0.00015792606870244162, + "loss": 1.7913, + "step": 9054 + }, + { + "epoch": 0.3242788332408187, + "grad_norm": 1.9409570693969727, + "learning_rate": 0.0001579166134661, + "loss": 1.3348, + "step": 9055 + }, + { + "epoch": 0.32431464537038696, + "grad_norm": 1.7771880626678467, + "learning_rate": 0.00015790715745056506, + "loss": 1.6459, + "step": 9056 + }, + { + "epoch": 0.3243504574999552, + "grad_norm": 1.7616101503372192, + "learning_rate": 0.00015789770065596404, + "loss": 1.7816, + "step": 9057 + }, + { + "epoch": 0.3243862696295235, + "grad_norm": 1.777740478515625, + "learning_rate": 0.00015788824308242408, + "loss": 1.8904, + "step": 9058 + }, + { + "epoch": 0.3244220817590918, + "grad_norm": 2.1632614135742188, + "learning_rate": 0.00015787878473007253, + "loss": 1.6462, + "step": 9059 + }, + { + "epoch": 0.3244578938886601, + "grad_norm": 2.3858914375305176, + "learning_rate": 0.00015786932559903657, + "loss": 1.6606, + "step": 9060 + }, + { + "epoch": 0.32449370601822836, + "grad_norm": 1.9479713439941406, + "learning_rate": 0.00015785986568944352, + "loss": 1.7255, + "step": 9061 + }, + { + "epoch": 0.3245295181477967, + "grad_norm": 1.4364668130874634, + "learning_rate": 0.00015785040500142057, + "loss": 1.5002, + "step": 9062 + }, + { + "epoch": 0.32456533027736495, + "grad_norm": 2.068448066711426, + "learning_rate": 0.00015784094353509507, + "loss": 1.4521, + "step": 9063 + }, + { + "epoch": 0.3246011424069332, + "grad_norm": 2.5338597297668457, + "learning_rate": 0.00015783148129059425, + "loss": 1.5654, + "step": 9064 + }, + { + "epoch": 0.3246369545365015, + "grad_norm": 2.1325583457946777, + "learning_rate": 0.00015782201826804548, + "loss": 1.6559, + "step": 9065 + }, + { + "epoch": 0.3246727666660698, + "grad_norm": 1.665851354598999, + "learning_rate": 0.000157812554467576, + "loss": 1.3783, + "step": 9066 + }, + { + "epoch": 0.3247085787956381, + "grad_norm": 1.4407624006271362, + "learning_rate": 0.0001578030898893132, + "loss": 1.6831, + "step": 9067 + }, + { + "epoch": 0.32474439092520635, + "grad_norm": 1.782218098640442, + "learning_rate": 0.00015779362453338438, + "loss": 1.7928, + "step": 9068 + }, + { + "epoch": 0.3247802030547747, + "grad_norm": 1.5871248245239258, + "learning_rate": 0.0001577841583999169, + "loss": 1.4532, + "step": 9069 + }, + { + "epoch": 0.32481601518434294, + "grad_norm": 1.4326859712600708, + "learning_rate": 0.00015777469148903808, + "loss": 1.5741, + "step": 9070 + }, + { + "epoch": 0.3248518273139112, + "grad_norm": 1.2834270000457764, + "learning_rate": 0.00015776522380087532, + "loss": 1.5228, + "step": 9071 + }, + { + "epoch": 0.3248876394434795, + "grad_norm": 1.9993133544921875, + "learning_rate": 0.00015775575533555602, + "loss": 1.6291, + "step": 9072 + }, + { + "epoch": 0.3249234515730478, + "grad_norm": 1.301542043685913, + "learning_rate": 0.0001577462860932075, + "loss": 1.4781, + "step": 9073 + }, + { + "epoch": 0.3249592637026161, + "grad_norm": 1.7204660177230835, + "learning_rate": 0.00015773681607395717, + "loss": 1.2986, + "step": 9074 + }, + { + "epoch": 0.32499507583218434, + "grad_norm": 2.050814628601074, + "learning_rate": 0.0001577273452779325, + "loss": 1.8979, + "step": 9075 + }, + { + "epoch": 0.32503088796175267, + "grad_norm": 1.3881765604019165, + "learning_rate": 0.00015771787370526084, + "loss": 1.5927, + "step": 9076 + }, + { + "epoch": 0.32506670009132094, + "grad_norm": 1.4091527462005615, + "learning_rate": 0.0001577084013560696, + "loss": 1.3812, + "step": 9077 + }, + { + "epoch": 0.3251025122208892, + "grad_norm": 2.129033088684082, + "learning_rate": 0.0001576989282304863, + "loss": 1.4002, + "step": 9078 + }, + { + "epoch": 0.3251383243504575, + "grad_norm": 1.6239925622940063, + "learning_rate": 0.00015768945432863835, + "loss": 1.4022, + "step": 9079 + }, + { + "epoch": 0.3251741364800258, + "grad_norm": 2.6688060760498047, + "learning_rate": 0.00015767997965065322, + "loss": 1.7299, + "step": 9080 + }, + { + "epoch": 0.32520994860959407, + "grad_norm": 1.6006053686141968, + "learning_rate": 0.00015767050419665836, + "loss": 1.4682, + "step": 9081 + }, + { + "epoch": 0.32524576073916234, + "grad_norm": 1.31721830368042, + "learning_rate": 0.00015766102796678123, + "loss": 1.5401, + "step": 9082 + }, + { + "epoch": 0.32528157286873066, + "grad_norm": 2.0746214389801025, + "learning_rate": 0.00015765155096114934, + "loss": 1.6972, + "step": 9083 + }, + { + "epoch": 0.32531738499829893, + "grad_norm": 1.4532837867736816, + "learning_rate": 0.00015764207317989023, + "loss": 1.1788, + "step": 9084 + }, + { + "epoch": 0.3253531971278672, + "grad_norm": 1.7890650033950806, + "learning_rate": 0.00015763259462313136, + "loss": 1.5181, + "step": 9085 + }, + { + "epoch": 0.32538900925743547, + "grad_norm": 1.5378397703170776, + "learning_rate": 0.00015762311529100024, + "loss": 1.7676, + "step": 9086 + }, + { + "epoch": 0.3254248213870038, + "grad_norm": 1.4087746143341064, + "learning_rate": 0.00015761363518362447, + "loss": 1.5524, + "step": 9087 + }, + { + "epoch": 0.32546063351657206, + "grad_norm": 1.3853182792663574, + "learning_rate": 0.00015760415430113157, + "loss": 1.6659, + "step": 9088 + }, + { + "epoch": 0.32549644564614033, + "grad_norm": 1.8238393068313599, + "learning_rate": 0.00015759467264364905, + "loss": 1.7841, + "step": 9089 + }, + { + "epoch": 0.32553225777570866, + "grad_norm": 1.984938383102417, + "learning_rate": 0.00015758519021130451, + "loss": 1.5044, + "step": 9090 + }, + { + "epoch": 0.3255680699052769, + "grad_norm": 1.9241176843643188, + "learning_rate": 0.0001575757070042255, + "loss": 1.658, + "step": 9091 + }, + { + "epoch": 0.3256038820348452, + "grad_norm": 1.9488638639450073, + "learning_rate": 0.00015756622302253966, + "loss": 1.5738, + "step": 9092 + }, + { + "epoch": 0.32563969416441346, + "grad_norm": 1.1886811256408691, + "learning_rate": 0.0001575567382663745, + "loss": 1.5968, + "step": 9093 + }, + { + "epoch": 0.3256755062939818, + "grad_norm": 1.2945717573165894, + "learning_rate": 0.00015754725273585767, + "loss": 1.3245, + "step": 9094 + }, + { + "epoch": 0.32571131842355006, + "grad_norm": 1.674601674079895, + "learning_rate": 0.0001575377664311168, + "loss": 1.521, + "step": 9095 + }, + { + "epoch": 0.3257471305531183, + "grad_norm": 1.8002949953079224, + "learning_rate": 0.00015752827935227952, + "loss": 1.4273, + "step": 9096 + }, + { + "epoch": 0.32578294268268665, + "grad_norm": 2.0485355854034424, + "learning_rate": 0.00015751879149947343, + "loss": 1.6217, + "step": 9097 + }, + { + "epoch": 0.3258187548122549, + "grad_norm": 1.7260215282440186, + "learning_rate": 0.0001575093028728262, + "loss": 1.1519, + "step": 9098 + }, + { + "epoch": 0.3258545669418232, + "grad_norm": 2.4711947441101074, + "learning_rate": 0.00015749981347246549, + "loss": 1.5758, + "step": 9099 + }, + { + "epoch": 0.32589037907139146, + "grad_norm": 1.7342562675476074, + "learning_rate": 0.00015749032329851894, + "loss": 1.3542, + "step": 9100 + }, + { + "epoch": 0.3259261912009598, + "grad_norm": 1.5560182332992554, + "learning_rate": 0.00015748083235111424, + "loss": 1.4425, + "step": 9101 + }, + { + "epoch": 0.32596200333052805, + "grad_norm": 1.55160653591156, + "learning_rate": 0.00015747134063037908, + "loss": 1.7229, + "step": 9102 + }, + { + "epoch": 0.3259978154600963, + "grad_norm": 1.796183466911316, + "learning_rate": 0.0001574618481364412, + "loss": 1.5933, + "step": 9103 + }, + { + "epoch": 0.32603362758966464, + "grad_norm": 1.8876303434371948, + "learning_rate": 0.00015745235486942826, + "loss": 1.6456, + "step": 9104 + }, + { + "epoch": 0.3260694397192329, + "grad_norm": 1.37518310546875, + "learning_rate": 0.00015744286082946797, + "loss": 1.3399, + "step": 9105 + }, + { + "epoch": 0.3261052518488012, + "grad_norm": 1.4342260360717773, + "learning_rate": 0.0001574333660166881, + "loss": 1.5002, + "step": 9106 + }, + { + "epoch": 0.32614106397836945, + "grad_norm": 1.8933452367782593, + "learning_rate": 0.0001574238704312164, + "loss": 1.2322, + "step": 9107 + }, + { + "epoch": 0.3261768761079378, + "grad_norm": 1.6854076385498047, + "learning_rate": 0.00015741437407318056, + "loss": 1.6315, + "step": 9108 + }, + { + "epoch": 0.32621268823750604, + "grad_norm": 1.8995707035064697, + "learning_rate": 0.00015740487694270838, + "loss": 1.4575, + "step": 9109 + }, + { + "epoch": 0.3262485003670743, + "grad_norm": 2.63657283782959, + "learning_rate": 0.00015739537903992765, + "loss": 1.3891, + "step": 9110 + }, + { + "epoch": 0.32628431249664264, + "grad_norm": 1.7770910263061523, + "learning_rate": 0.0001573858803649661, + "loss": 1.4237, + "step": 9111 + }, + { + "epoch": 0.3263201246262109, + "grad_norm": 2.1773881912231445, + "learning_rate": 0.00015737638091795157, + "loss": 1.6328, + "step": 9112 + }, + { + "epoch": 0.3263559367557792, + "grad_norm": 1.9160898923873901, + "learning_rate": 0.00015736688069901183, + "loss": 1.5335, + "step": 9113 + }, + { + "epoch": 0.32639174888534744, + "grad_norm": 1.9974308013916016, + "learning_rate": 0.00015735737970827473, + "loss": 1.1659, + "step": 9114 + }, + { + "epoch": 0.32642756101491577, + "grad_norm": 1.807265281677246, + "learning_rate": 0.00015734787794586806, + "loss": 1.5504, + "step": 9115 + }, + { + "epoch": 0.32646337314448404, + "grad_norm": 1.8353880643844604, + "learning_rate": 0.00015733837541191968, + "loss": 1.6066, + "step": 9116 + }, + { + "epoch": 0.3264991852740523, + "grad_norm": 1.676720380783081, + "learning_rate": 0.00015732887210655742, + "loss": 1.4618, + "step": 9117 + }, + { + "epoch": 0.32653499740362063, + "grad_norm": 1.6320055723190308, + "learning_rate": 0.00015731936802990912, + "loss": 1.7968, + "step": 9118 + }, + { + "epoch": 0.3265708095331889, + "grad_norm": 2.2749667167663574, + "learning_rate": 0.00015730986318210265, + "loss": 1.4673, + "step": 9119 + }, + { + "epoch": 0.32660662166275717, + "grad_norm": 1.3211688995361328, + "learning_rate": 0.00015730035756326592, + "loss": 1.5625, + "step": 9120 + }, + { + "epoch": 0.32664243379232544, + "grad_norm": 1.8194984197616577, + "learning_rate": 0.00015729085117352674, + "loss": 1.4496, + "step": 9121 + }, + { + "epoch": 0.32667824592189376, + "grad_norm": 2.0746090412139893, + "learning_rate": 0.00015728134401301312, + "loss": 1.2436, + "step": 9122 + }, + { + "epoch": 0.32671405805146203, + "grad_norm": 1.5881251096725464, + "learning_rate": 0.0001572718360818529, + "loss": 1.3511, + "step": 9123 + }, + { + "epoch": 0.3267498701810303, + "grad_norm": 1.9258278608322144, + "learning_rate": 0.00015726232738017397, + "loss": 1.5968, + "step": 9124 + }, + { + "epoch": 0.3267856823105986, + "grad_norm": 1.449415922164917, + "learning_rate": 0.00015725281790810431, + "loss": 1.5124, + "step": 9125 + }, + { + "epoch": 0.3268214944401669, + "grad_norm": 1.274713158607483, + "learning_rate": 0.00015724330766577182, + "loss": 1.6398, + "step": 9126 + }, + { + "epoch": 0.32685730656973516, + "grad_norm": 1.757870078086853, + "learning_rate": 0.0001572337966533045, + "loss": 1.8657, + "step": 9127 + }, + { + "epoch": 0.32689311869930343, + "grad_norm": 1.3837286233901978, + "learning_rate": 0.0001572242848708302, + "loss": 1.5027, + "step": 9128 + }, + { + "epoch": 0.32692893082887176, + "grad_norm": 2.0841174125671387, + "learning_rate": 0.00015721477231847702, + "loss": 1.5041, + "step": 9129 + }, + { + "epoch": 0.32696474295844, + "grad_norm": 3.516465663909912, + "learning_rate": 0.00015720525899637285, + "loss": 1.6544, + "step": 9130 + }, + { + "epoch": 0.3270005550880083, + "grad_norm": 1.4936449527740479, + "learning_rate": 0.00015719574490464573, + "loss": 1.5635, + "step": 9131 + }, + { + "epoch": 0.3270363672175766, + "grad_norm": 2.2506232261657715, + "learning_rate": 0.00015718623004342362, + "loss": 1.719, + "step": 9132 + }, + { + "epoch": 0.3270721793471449, + "grad_norm": 1.8259614706039429, + "learning_rate": 0.00015717671441283458, + "loss": 1.7564, + "step": 9133 + }, + { + "epoch": 0.32710799147671316, + "grad_norm": 1.6399543285369873, + "learning_rate": 0.0001571671980130066, + "loss": 1.8228, + "step": 9134 + }, + { + "epoch": 0.3271438036062814, + "grad_norm": 1.5702282190322876, + "learning_rate": 0.00015715768084406765, + "loss": 1.5844, + "step": 9135 + }, + { + "epoch": 0.32717961573584975, + "grad_norm": 1.4354286193847656, + "learning_rate": 0.0001571481629061459, + "loss": 1.5932, + "step": 9136 + }, + { + "epoch": 0.327215427865418, + "grad_norm": 1.8592087030410767, + "learning_rate": 0.0001571386441993693, + "loss": 1.686, + "step": 9137 + }, + { + "epoch": 0.3272512399949863, + "grad_norm": 1.372339129447937, + "learning_rate": 0.00015712912472386597, + "loss": 1.682, + "step": 9138 + }, + { + "epoch": 0.3272870521245546, + "grad_norm": 1.4004724025726318, + "learning_rate": 0.00015711960447976393, + "loss": 1.834, + "step": 9139 + }, + { + "epoch": 0.3273228642541229, + "grad_norm": 1.7258496284484863, + "learning_rate": 0.0001571100834671913, + "loss": 1.549, + "step": 9140 + }, + { + "epoch": 0.32735867638369115, + "grad_norm": 1.3979535102844238, + "learning_rate": 0.00015710056168627618, + "loss": 1.689, + "step": 9141 + }, + { + "epoch": 0.3273944885132594, + "grad_norm": 2.24281907081604, + "learning_rate": 0.00015709103913714664, + "loss": 1.5228, + "step": 9142 + }, + { + "epoch": 0.32743030064282774, + "grad_norm": 1.3384792804718018, + "learning_rate": 0.0001570815158199308, + "loss": 1.2907, + "step": 9143 + }, + { + "epoch": 0.327466112772396, + "grad_norm": 1.8024048805236816, + "learning_rate": 0.00015707199173475682, + "loss": 1.4062, + "step": 9144 + }, + { + "epoch": 0.3275019249019643, + "grad_norm": 2.6472220420837402, + "learning_rate": 0.00015706246688175282, + "loss": 1.2139, + "step": 9145 + }, + { + "epoch": 0.3275377370315326, + "grad_norm": 1.628482699394226, + "learning_rate": 0.00015705294126104692, + "loss": 1.4098, + "step": 9146 + }, + { + "epoch": 0.3275735491611009, + "grad_norm": 1.4141241312026978, + "learning_rate": 0.00015704341487276726, + "loss": 1.2319, + "step": 9147 + }, + { + "epoch": 0.32760936129066914, + "grad_norm": 1.8727307319641113, + "learning_rate": 0.00015703388771704205, + "loss": 1.6832, + "step": 9148 + }, + { + "epoch": 0.3276451734202374, + "grad_norm": 1.9481236934661865, + "learning_rate": 0.00015702435979399946, + "loss": 1.4832, + "step": 9149 + }, + { + "epoch": 0.32768098554980574, + "grad_norm": 1.4255954027175903, + "learning_rate": 0.00015701483110376762, + "loss": 1.3543, + "step": 9150 + }, + { + "epoch": 0.327716797679374, + "grad_norm": 2.0080718994140625, + "learning_rate": 0.00015700530164647485, + "loss": 1.6513, + "step": 9151 + }, + { + "epoch": 0.3277526098089423, + "grad_norm": 1.8739054203033447, + "learning_rate": 0.00015699577142224924, + "loss": 1.1768, + "step": 9152 + }, + { + "epoch": 0.3277884219385106, + "grad_norm": 1.5352083444595337, + "learning_rate": 0.000156986240431219, + "loss": 1.4779, + "step": 9153 + }, + { + "epoch": 0.32782423406807887, + "grad_norm": 1.6548919677734375, + "learning_rate": 0.00015697670867351247, + "loss": 1.5669, + "step": 9154 + }, + { + "epoch": 0.32786004619764714, + "grad_norm": 1.8532062768936157, + "learning_rate": 0.0001569671761492578, + "loss": 1.713, + "step": 9155 + }, + { + "epoch": 0.3278958583272154, + "grad_norm": 1.4983744621276855, + "learning_rate": 0.00015695764285858323, + "loss": 1.7685, + "step": 9156 + }, + { + "epoch": 0.32793167045678373, + "grad_norm": 1.8667110204696655, + "learning_rate": 0.00015694810880161706, + "loss": 1.3894, + "step": 9157 + }, + { + "epoch": 0.327967482586352, + "grad_norm": 1.7034077644348145, + "learning_rate": 0.00015693857397848756, + "loss": 1.6046, + "step": 9158 + }, + { + "epoch": 0.32800329471592027, + "grad_norm": 2.344395399093628, + "learning_rate": 0.00015692903838932299, + "loss": 1.5522, + "step": 9159 + }, + { + "epoch": 0.3280391068454886, + "grad_norm": 2.167020797729492, + "learning_rate": 0.00015691950203425162, + "loss": 1.3704, + "step": 9160 + }, + { + "epoch": 0.32807491897505686, + "grad_norm": 1.6182739734649658, + "learning_rate": 0.0001569099649134018, + "loss": 1.3054, + "step": 9161 + }, + { + "epoch": 0.32811073110462513, + "grad_norm": 1.4349606037139893, + "learning_rate": 0.0001569004270269018, + "loss": 1.4676, + "step": 9162 + }, + { + "epoch": 0.3281465432341934, + "grad_norm": 1.9794306755065918, + "learning_rate": 0.00015689088837487995, + "loss": 1.8758, + "step": 9163 + }, + { + "epoch": 0.3281823553637617, + "grad_norm": 1.2538909912109375, + "learning_rate": 0.00015688134895746459, + "loss": 1.5755, + "step": 9164 + }, + { + "epoch": 0.32821816749333, + "grad_norm": 2.244776487350464, + "learning_rate": 0.000156871808774784, + "loss": 1.6322, + "step": 9165 + }, + { + "epoch": 0.32825397962289826, + "grad_norm": 1.843839406967163, + "learning_rate": 0.00015686226782696662, + "loss": 1.6366, + "step": 9166 + }, + { + "epoch": 0.3282897917524666, + "grad_norm": 2.0505659580230713, + "learning_rate": 0.0001568527261141408, + "loss": 1.5395, + "step": 9167 + }, + { + "epoch": 0.32832560388203486, + "grad_norm": 1.6145918369293213, + "learning_rate": 0.00015684318363643485, + "loss": 1.4893, + "step": 9168 + }, + { + "epoch": 0.3283614160116031, + "grad_norm": 1.7534884214401245, + "learning_rate": 0.0001568336403939772, + "loss": 1.5143, + "step": 9169 + }, + { + "epoch": 0.3283972281411714, + "grad_norm": 1.496999740600586, + "learning_rate": 0.00015682409638689623, + "loss": 1.6016, + "step": 9170 + }, + { + "epoch": 0.3284330402707397, + "grad_norm": 1.4966206550598145, + "learning_rate": 0.00015681455161532034, + "loss": 1.311, + "step": 9171 + }, + { + "epoch": 0.328468852400308, + "grad_norm": 1.686967372894287, + "learning_rate": 0.00015680500607937793, + "loss": 1.2972, + "step": 9172 + }, + { + "epoch": 0.32850466452987626, + "grad_norm": 1.885436773300171, + "learning_rate": 0.00015679545977919745, + "loss": 1.7629, + "step": 9173 + }, + { + "epoch": 0.3285404766594446, + "grad_norm": 1.4676927328109741, + "learning_rate": 0.0001567859127149073, + "loss": 1.7276, + "step": 9174 + }, + { + "epoch": 0.32857628878901285, + "grad_norm": 1.650899887084961, + "learning_rate": 0.00015677636488663595, + "loss": 1.5655, + "step": 9175 + }, + { + "epoch": 0.3286121009185811, + "grad_norm": 1.4728350639343262, + "learning_rate": 0.00015676681629451185, + "loss": 1.682, + "step": 9176 + }, + { + "epoch": 0.3286479130481494, + "grad_norm": 1.5229905843734741, + "learning_rate": 0.0001567572669386635, + "loss": 1.628, + "step": 9177 + }, + { + "epoch": 0.3286837251777177, + "grad_norm": 1.990729808807373, + "learning_rate": 0.0001567477168192193, + "loss": 1.9226, + "step": 9178 + }, + { + "epoch": 0.328719537307286, + "grad_norm": 1.5891399383544922, + "learning_rate": 0.00015673816593630776, + "loss": 1.3932, + "step": 9179 + }, + { + "epoch": 0.32875534943685425, + "grad_norm": 1.5668306350708008, + "learning_rate": 0.00015672861429005737, + "loss": 1.3301, + "step": 9180 + }, + { + "epoch": 0.3287911615664226, + "grad_norm": 1.5943611860275269, + "learning_rate": 0.00015671906188059672, + "loss": 1.5854, + "step": 9181 + }, + { + "epoch": 0.32882697369599084, + "grad_norm": 1.9587252140045166, + "learning_rate": 0.0001567095087080542, + "loss": 1.8593, + "step": 9182 + }, + { + "epoch": 0.3288627858255591, + "grad_norm": 1.4713166952133179, + "learning_rate": 0.00015669995477255838, + "loss": 1.6878, + "step": 9183 + }, + { + "epoch": 0.3288985979551274, + "grad_norm": 2.106905937194824, + "learning_rate": 0.00015669040007423784, + "loss": 1.5709, + "step": 9184 + }, + { + "epoch": 0.3289344100846957, + "grad_norm": 2.200146198272705, + "learning_rate": 0.00015668084461322108, + "loss": 1.6029, + "step": 9185 + }, + { + "epoch": 0.328970222214264, + "grad_norm": 1.6008678674697876, + "learning_rate": 0.00015667128838963668, + "loss": 1.6772, + "step": 9186 + }, + { + "epoch": 0.32900603434383224, + "grad_norm": 1.828938603401184, + "learning_rate": 0.00015666173140361315, + "loss": 1.7683, + "step": 9187 + }, + { + "epoch": 0.32904184647340057, + "grad_norm": 2.305866241455078, + "learning_rate": 0.00015665217365527917, + "loss": 1.7502, + "step": 9188 + }, + { + "epoch": 0.32907765860296884, + "grad_norm": 2.0801093578338623, + "learning_rate": 0.00015664261514476322, + "loss": 1.3371, + "step": 9189 + }, + { + "epoch": 0.3291134707325371, + "grad_norm": 1.4453667402267456, + "learning_rate": 0.00015663305587219396, + "loss": 1.477, + "step": 9190 + }, + { + "epoch": 0.3291492828621054, + "grad_norm": 1.6667442321777344, + "learning_rate": 0.00015662349583770002, + "loss": 1.8024, + "step": 9191 + }, + { + "epoch": 0.3291850949916737, + "grad_norm": 1.4693574905395508, + "learning_rate": 0.00015661393504140994, + "loss": 1.3105, + "step": 9192 + }, + { + "epoch": 0.32922090712124197, + "grad_norm": 1.4278935194015503, + "learning_rate": 0.0001566043734834524, + "loss": 1.8824, + "step": 9193 + }, + { + "epoch": 0.32925671925081024, + "grad_norm": 2.0216543674468994, + "learning_rate": 0.00015659481116395604, + "loss": 1.6132, + "step": 9194 + }, + { + "epoch": 0.32929253138037856, + "grad_norm": 1.8408523797988892, + "learning_rate": 0.0001565852480830495, + "loss": 1.191, + "step": 9195 + }, + { + "epoch": 0.32932834350994683, + "grad_norm": 1.845346450805664, + "learning_rate": 0.00015657568424086145, + "loss": 1.8479, + "step": 9196 + }, + { + "epoch": 0.3293641556395151, + "grad_norm": 1.3020843267440796, + "learning_rate": 0.0001565661196375205, + "loss": 1.3387, + "step": 9197 + }, + { + "epoch": 0.32939996776908337, + "grad_norm": 1.6613608598709106, + "learning_rate": 0.00015655655427315542, + "loss": 1.4907, + "step": 9198 + }, + { + "epoch": 0.3294357798986517, + "grad_norm": 1.7599493265151978, + "learning_rate": 0.00015654698814789484, + "loss": 1.5625, + "step": 9199 + }, + { + "epoch": 0.32947159202821996, + "grad_norm": 2.014738082885742, + "learning_rate": 0.00015653742126186745, + "loss": 1.6346, + "step": 9200 + }, + { + "epoch": 0.32950740415778823, + "grad_norm": 1.7053759098052979, + "learning_rate": 0.00015652785361520204, + "loss": 1.7228, + "step": 9201 + }, + { + "epoch": 0.32954321628735656, + "grad_norm": 1.6620781421661377, + "learning_rate": 0.00015651828520802722, + "loss": 1.5416, + "step": 9202 + }, + { + "epoch": 0.3295790284169248, + "grad_norm": 1.5674805641174316, + "learning_rate": 0.00015650871604047182, + "loss": 1.6512, + "step": 9203 + }, + { + "epoch": 0.3296148405464931, + "grad_norm": 1.6600102186203003, + "learning_rate": 0.0001564991461126645, + "loss": 1.8433, + "step": 9204 + }, + { + "epoch": 0.32965065267606136, + "grad_norm": 2.119927167892456, + "learning_rate": 0.00015648957542473406, + "loss": 1.6382, + "step": 9205 + }, + { + "epoch": 0.3296864648056297, + "grad_norm": 2.353123426437378, + "learning_rate": 0.00015648000397680924, + "loss": 1.4772, + "step": 9206 + }, + { + "epoch": 0.32972227693519796, + "grad_norm": 2.0134799480438232, + "learning_rate": 0.00015647043176901886, + "loss": 1.6496, + "step": 9207 + }, + { + "epoch": 0.3297580890647662, + "grad_norm": 1.255784034729004, + "learning_rate": 0.00015646085880149162, + "loss": 1.6315, + "step": 9208 + }, + { + "epoch": 0.3297939011943345, + "grad_norm": 1.3342036008834839, + "learning_rate": 0.00015645128507435637, + "loss": 1.3743, + "step": 9209 + }, + { + "epoch": 0.3298297133239028, + "grad_norm": 1.8134342432022095, + "learning_rate": 0.00015644171058774192, + "loss": 1.6786, + "step": 9210 + }, + { + "epoch": 0.3298655254534711, + "grad_norm": 1.3972450494766235, + "learning_rate": 0.000156432135341777, + "loss": 1.6555, + "step": 9211 + }, + { + "epoch": 0.32990133758303936, + "grad_norm": 1.5140109062194824, + "learning_rate": 0.00015642255933659053, + "loss": 1.5552, + "step": 9212 + }, + { + "epoch": 0.3299371497126077, + "grad_norm": 1.5048795938491821, + "learning_rate": 0.0001564129825723113, + "loss": 1.4055, + "step": 9213 + }, + { + "epoch": 0.32997296184217595, + "grad_norm": 2.151780605316162, + "learning_rate": 0.00015640340504906818, + "loss": 1.5771, + "step": 9214 + }, + { + "epoch": 0.3300087739717442, + "grad_norm": 1.7445200681686401, + "learning_rate": 0.00015639382676698997, + "loss": 1.8844, + "step": 9215 + }, + { + "epoch": 0.3300445861013125, + "grad_norm": 1.9518778324127197, + "learning_rate": 0.00015638424772620554, + "loss": 2.0622, + "step": 9216 + }, + { + "epoch": 0.3300803982308808, + "grad_norm": 2.5693469047546387, + "learning_rate": 0.00015637466792684383, + "loss": 1.3957, + "step": 9217 + }, + { + "epoch": 0.3301162103604491, + "grad_norm": 1.5530214309692383, + "learning_rate": 0.00015636508736903366, + "loss": 1.3895, + "step": 9218 + }, + { + "epoch": 0.33015202249001735, + "grad_norm": 1.2154394388198853, + "learning_rate": 0.00015635550605290396, + "loss": 1.561, + "step": 9219 + }, + { + "epoch": 0.3301878346195857, + "grad_norm": 1.5539181232452393, + "learning_rate": 0.00015634592397858362, + "loss": 1.2993, + "step": 9220 + }, + { + "epoch": 0.33022364674915394, + "grad_norm": 2.4144694805145264, + "learning_rate": 0.00015633634114620154, + "loss": 2.0973, + "step": 9221 + }, + { + "epoch": 0.3302594588787222, + "grad_norm": 1.9893107414245605, + "learning_rate": 0.00015632675755588668, + "loss": 2.0019, + "step": 9222 + }, + { + "epoch": 0.3302952710082905, + "grad_norm": 1.4870593547821045, + "learning_rate": 0.00015631717320776795, + "loss": 1.1392, + "step": 9223 + }, + { + "epoch": 0.3303310831378588, + "grad_norm": 1.4275274276733398, + "learning_rate": 0.00015630758810197427, + "loss": 1.6437, + "step": 9224 + }, + { + "epoch": 0.3303668952674271, + "grad_norm": 1.9945303201675415, + "learning_rate": 0.00015629800223863465, + "loss": 1.5218, + "step": 9225 + }, + { + "epoch": 0.33040270739699534, + "grad_norm": 1.4058603048324585, + "learning_rate": 0.000156288415617878, + "loss": 1.4086, + "step": 9226 + }, + { + "epoch": 0.33043851952656367, + "grad_norm": 1.57683265209198, + "learning_rate": 0.00015627882823983336, + "loss": 1.6094, + "step": 9227 + }, + { + "epoch": 0.33047433165613194, + "grad_norm": 3.029585361480713, + "learning_rate": 0.00015626924010462968, + "loss": 1.7547, + "step": 9228 + }, + { + "epoch": 0.3305101437857002, + "grad_norm": 1.5655510425567627, + "learning_rate": 0.00015625965121239592, + "loss": 1.8381, + "step": 9229 + }, + { + "epoch": 0.3305459559152685, + "grad_norm": 2.5353496074676514, + "learning_rate": 0.00015625006156326117, + "loss": 1.8627, + "step": 9230 + }, + { + "epoch": 0.3305817680448368, + "grad_norm": 1.9974932670593262, + "learning_rate": 0.00015624047115735435, + "loss": 1.605, + "step": 9231 + }, + { + "epoch": 0.33061758017440507, + "grad_norm": 2.7682440280914307, + "learning_rate": 0.00015623087999480458, + "loss": 1.9736, + "step": 9232 + }, + { + "epoch": 0.33065339230397334, + "grad_norm": 1.4476059675216675, + "learning_rate": 0.00015622128807574081, + "loss": 1.7133, + "step": 9233 + }, + { + "epoch": 0.33068920443354166, + "grad_norm": 1.8865472078323364, + "learning_rate": 0.00015621169540029216, + "loss": 1.4769, + "step": 9234 + }, + { + "epoch": 0.33072501656310993, + "grad_norm": 2.4696617126464844, + "learning_rate": 0.00015620210196858763, + "loss": 1.5348, + "step": 9235 + }, + { + "epoch": 0.3307608286926782, + "grad_norm": 1.3538154363632202, + "learning_rate": 0.00015619250778075634, + "loss": 1.5639, + "step": 9236 + }, + { + "epoch": 0.33079664082224647, + "grad_norm": 1.380228042602539, + "learning_rate": 0.00015618291283692735, + "loss": 1.4208, + "step": 9237 + }, + { + "epoch": 0.3308324529518148, + "grad_norm": 1.5593613386154175, + "learning_rate": 0.0001561733171372297, + "loss": 1.4575, + "step": 9238 + }, + { + "epoch": 0.33086826508138306, + "grad_norm": 1.0856677293777466, + "learning_rate": 0.00015616372068179255, + "loss": 1.287, + "step": 9239 + }, + { + "epoch": 0.33090407721095133, + "grad_norm": 1.7727956771850586, + "learning_rate": 0.00015615412347074498, + "loss": 1.4814, + "step": 9240 + }, + { + "epoch": 0.33093988934051966, + "grad_norm": 2.7936644554138184, + "learning_rate": 0.0001561445255042161, + "loss": 1.4023, + "step": 9241 + }, + { + "epoch": 0.3309757014700879, + "grad_norm": 1.5259954929351807, + "learning_rate": 0.00015613492678233509, + "loss": 1.4711, + "step": 9242 + }, + { + "epoch": 0.3310115135996562, + "grad_norm": 1.6369067430496216, + "learning_rate": 0.000156125327305231, + "loss": 1.6207, + "step": 9243 + }, + { + "epoch": 0.33104732572922446, + "grad_norm": 1.5903338193893433, + "learning_rate": 0.00015611572707303307, + "loss": 1.5504, + "step": 9244 + }, + { + "epoch": 0.3310831378587928, + "grad_norm": 1.5647993087768555, + "learning_rate": 0.00015610612608587035, + "loss": 1.722, + "step": 9245 + }, + { + "epoch": 0.33111894998836106, + "grad_norm": 1.9212912321090698, + "learning_rate": 0.00015609652434387216, + "loss": 1.2589, + "step": 9246 + }, + { + "epoch": 0.3311547621179293, + "grad_norm": 1.876175045967102, + "learning_rate": 0.00015608692184716753, + "loss": 1.5151, + "step": 9247 + }, + { + "epoch": 0.33119057424749765, + "grad_norm": 2.0057926177978516, + "learning_rate": 0.00015607731859588575, + "loss": 1.4391, + "step": 9248 + }, + { + "epoch": 0.3312263863770659, + "grad_norm": 1.4844675064086914, + "learning_rate": 0.00015606771459015598, + "loss": 1.8257, + "step": 9249 + }, + { + "epoch": 0.3312621985066342, + "grad_norm": 1.5540400743484497, + "learning_rate": 0.00015605810983010743, + "loss": 1.2488, + "step": 9250 + }, + { + "epoch": 0.33129801063620246, + "grad_norm": 2.0735278129577637, + "learning_rate": 0.0001560485043158693, + "loss": 1.4661, + "step": 9251 + }, + { + "epoch": 0.3313338227657708, + "grad_norm": 1.5882608890533447, + "learning_rate": 0.00015603889804757085, + "loss": 1.4818, + "step": 9252 + }, + { + "epoch": 0.33136963489533905, + "grad_norm": 1.3272738456726074, + "learning_rate": 0.00015602929102534132, + "loss": 1.6939, + "step": 9253 + }, + { + "epoch": 0.3314054470249073, + "grad_norm": 1.2422311305999756, + "learning_rate": 0.00015601968324930997, + "loss": 1.3206, + "step": 9254 + }, + { + "epoch": 0.33144125915447564, + "grad_norm": 1.8080556392669678, + "learning_rate": 0.000156010074719606, + "loss": 1.2622, + "step": 9255 + }, + { + "epoch": 0.3314770712840439, + "grad_norm": 1.7431944608688354, + "learning_rate": 0.00015600046543635875, + "loss": 1.4804, + "step": 9256 + }, + { + "epoch": 0.3315128834136122, + "grad_norm": 1.3665114641189575, + "learning_rate": 0.0001559908553996975, + "loss": 1.6605, + "step": 9257 + }, + { + "epoch": 0.33154869554318045, + "grad_norm": 1.7952216863632202, + "learning_rate": 0.00015598124460975148, + "loss": 1.6897, + "step": 9258 + }, + { + "epoch": 0.3315845076727488, + "grad_norm": 1.934351921081543, + "learning_rate": 0.00015597163306665002, + "loss": 1.2434, + "step": 9259 + }, + { + "epoch": 0.33162031980231704, + "grad_norm": 1.282073974609375, + "learning_rate": 0.00015596202077052245, + "loss": 1.1964, + "step": 9260 + }, + { + "epoch": 0.3316561319318853, + "grad_norm": 1.3141311407089233, + "learning_rate": 0.00015595240772149803, + "loss": 1.2027, + "step": 9261 + }, + { + "epoch": 0.33169194406145364, + "grad_norm": 1.557449221611023, + "learning_rate": 0.0001559427939197062, + "loss": 1.7988, + "step": 9262 + }, + { + "epoch": 0.3317277561910219, + "grad_norm": 1.4467054605484009, + "learning_rate": 0.0001559331793652762, + "loss": 1.2495, + "step": 9263 + }, + { + "epoch": 0.3317635683205902, + "grad_norm": 1.3926703929901123, + "learning_rate": 0.00015592356405833745, + "loss": 1.5037, + "step": 9264 + }, + { + "epoch": 0.33179938045015844, + "grad_norm": 2.078423261642456, + "learning_rate": 0.00015591394799901927, + "loss": 1.6851, + "step": 9265 + }, + { + "epoch": 0.33183519257972677, + "grad_norm": 2.431959867477417, + "learning_rate": 0.00015590433118745106, + "loss": 1.8476, + "step": 9266 + }, + { + "epoch": 0.33187100470929504, + "grad_norm": 1.8334934711456299, + "learning_rate": 0.00015589471362376217, + "loss": 1.8419, + "step": 9267 + }, + { + "epoch": 0.3319068168388633, + "grad_norm": 1.5225297212600708, + "learning_rate": 0.00015588509530808199, + "loss": 1.9179, + "step": 9268 + }, + { + "epoch": 0.33194262896843163, + "grad_norm": 1.3365960121154785, + "learning_rate": 0.00015587547624053993, + "loss": 1.1894, + "step": 9269 + }, + { + "epoch": 0.3319784410979999, + "grad_norm": 2.1068549156188965, + "learning_rate": 0.00015586585642126543, + "loss": 1.6613, + "step": 9270 + }, + { + "epoch": 0.33201425322756817, + "grad_norm": 1.5305715799331665, + "learning_rate": 0.00015585623585038792, + "loss": 1.4709, + "step": 9271 + }, + { + "epoch": 0.33205006535713644, + "grad_norm": 1.8630884885787964, + "learning_rate": 0.00015584661452803676, + "loss": 1.6649, + "step": 9272 + }, + { + "epoch": 0.33208587748670476, + "grad_norm": 1.756103754043579, + "learning_rate": 0.00015583699245434146, + "loss": 1.5741, + "step": 9273 + }, + { + "epoch": 0.33212168961627303, + "grad_norm": 1.6057642698287964, + "learning_rate": 0.00015582736962943148, + "loss": 1.6483, + "step": 9274 + }, + { + "epoch": 0.3321575017458413, + "grad_norm": 1.2686374187469482, + "learning_rate": 0.00015581774605343622, + "loss": 1.4546, + "step": 9275 + }, + { + "epoch": 0.3321933138754096, + "grad_norm": 1.5505086183547974, + "learning_rate": 0.0001558081217264852, + "loss": 1.6977, + "step": 9276 + }, + { + "epoch": 0.3322291260049779, + "grad_norm": 1.148769736289978, + "learning_rate": 0.00015579849664870788, + "loss": 1.4565, + "step": 9277 + }, + { + "epoch": 0.33226493813454616, + "grad_norm": 1.3341801166534424, + "learning_rate": 0.00015578887082023373, + "loss": 1.4761, + "step": 9278 + }, + { + "epoch": 0.33230075026411443, + "grad_norm": 1.950057029724121, + "learning_rate": 0.00015577924424119233, + "loss": 1.4574, + "step": 9279 + }, + { + "epoch": 0.33233656239368276, + "grad_norm": 1.5431779623031616, + "learning_rate": 0.00015576961691171314, + "loss": 1.4851, + "step": 9280 + }, + { + "epoch": 0.332372374523251, + "grad_norm": 1.7088158130645752, + "learning_rate": 0.0001557599888319257, + "loss": 1.7068, + "step": 9281 + }, + { + "epoch": 0.3324081866528193, + "grad_norm": 2.0593230724334717, + "learning_rate": 0.00015575036000195952, + "loss": 1.4745, + "step": 9282 + }, + { + "epoch": 0.3324439987823876, + "grad_norm": 1.5324299335479736, + "learning_rate": 0.00015574073042194417, + "loss": 1.8138, + "step": 9283 + }, + { + "epoch": 0.3324798109119559, + "grad_norm": 1.420212984085083, + "learning_rate": 0.0001557311000920092, + "loss": 1.5369, + "step": 9284 + }, + { + "epoch": 0.33251562304152416, + "grad_norm": 2.050830364227295, + "learning_rate": 0.00015572146901228414, + "loss": 1.6046, + "step": 9285 + }, + { + "epoch": 0.3325514351710924, + "grad_norm": 1.4355131387710571, + "learning_rate": 0.0001557118371828986, + "loss": 1.4598, + "step": 9286 + }, + { + "epoch": 0.33258724730066075, + "grad_norm": 2.0514612197875977, + "learning_rate": 0.00015570220460398216, + "loss": 1.3749, + "step": 9287 + }, + { + "epoch": 0.332623059430229, + "grad_norm": 1.370781421661377, + "learning_rate": 0.00015569257127566441, + "loss": 1.6945, + "step": 9288 + }, + { + "epoch": 0.3326588715597973, + "grad_norm": 1.860186219215393, + "learning_rate": 0.00015568293719807493, + "loss": 1.5742, + "step": 9289 + }, + { + "epoch": 0.3326946836893656, + "grad_norm": 1.4769200086593628, + "learning_rate": 0.00015567330237134338, + "loss": 1.6708, + "step": 9290 + }, + { + "epoch": 0.3327304958189339, + "grad_norm": 1.4108153581619263, + "learning_rate": 0.00015566366679559937, + "loss": 1.5285, + "step": 9291 + }, + { + "epoch": 0.33276630794850215, + "grad_norm": 1.5427703857421875, + "learning_rate": 0.0001556540304709725, + "loss": 1.3293, + "step": 9292 + }, + { + "epoch": 0.3328021200780704, + "grad_norm": 1.8373152017593384, + "learning_rate": 0.00015564439339759245, + "loss": 1.6855, + "step": 9293 + }, + { + "epoch": 0.33283793220763874, + "grad_norm": 2.347149133682251, + "learning_rate": 0.00015563475557558887, + "loss": 1.4346, + "step": 9294 + }, + { + "epoch": 0.332873744337207, + "grad_norm": 2.9432506561279297, + "learning_rate": 0.00015562511700509138, + "loss": 1.3162, + "step": 9295 + }, + { + "epoch": 0.3329095564667753, + "grad_norm": 2.5354115962982178, + "learning_rate": 0.00015561547768622974, + "loss": 1.7069, + "step": 9296 + }, + { + "epoch": 0.3329453685963436, + "grad_norm": 1.4742679595947266, + "learning_rate": 0.00015560583761913357, + "loss": 1.2812, + "step": 9297 + }, + { + "epoch": 0.3329811807259119, + "grad_norm": 1.7055310010910034, + "learning_rate": 0.00015559619680393256, + "loss": 1.54, + "step": 9298 + }, + { + "epoch": 0.33301699285548014, + "grad_norm": 1.5904908180236816, + "learning_rate": 0.00015558655524075646, + "loss": 1.4334, + "step": 9299 + }, + { + "epoch": 0.3330528049850484, + "grad_norm": 1.9072233438491821, + "learning_rate": 0.00015557691292973494, + "loss": 1.4061, + "step": 9300 + }, + { + "epoch": 0.33308861711461674, + "grad_norm": 1.6825381517410278, + "learning_rate": 0.0001555672698709978, + "loss": 1.3886, + "step": 9301 + }, + { + "epoch": 0.333124429244185, + "grad_norm": 1.6894252300262451, + "learning_rate": 0.00015555762606467465, + "loss": 1.4306, + "step": 9302 + }, + { + "epoch": 0.3331602413737533, + "grad_norm": 1.9015343189239502, + "learning_rate": 0.00015554798151089534, + "loss": 1.3311, + "step": 9303 + }, + { + "epoch": 0.3331960535033216, + "grad_norm": 2.412245512008667, + "learning_rate": 0.00015553833620978957, + "loss": 1.5272, + "step": 9304 + }, + { + "epoch": 0.33323186563288987, + "grad_norm": 2.0586354732513428, + "learning_rate": 0.00015552869016148714, + "loss": 1.8866, + "step": 9305 + }, + { + "epoch": 0.33326767776245814, + "grad_norm": 1.8574638366699219, + "learning_rate": 0.0001555190433661178, + "loss": 1.4357, + "step": 9306 + }, + { + "epoch": 0.3333034898920264, + "grad_norm": 1.5744131803512573, + "learning_rate": 0.00015550939582381135, + "loss": 1.3504, + "step": 9307 + }, + { + "epoch": 0.33333930202159473, + "grad_norm": 1.7387135028839111, + "learning_rate": 0.00015549974753469763, + "loss": 1.5314, + "step": 9308 + }, + { + "epoch": 0.333375114151163, + "grad_norm": 1.4193501472473145, + "learning_rate": 0.00015549009849890634, + "loss": 1.3785, + "step": 9309 + }, + { + "epoch": 0.33341092628073127, + "grad_norm": 1.4320520162582397, + "learning_rate": 0.0001554804487165674, + "loss": 1.7415, + "step": 9310 + }, + { + "epoch": 0.3334467384102996, + "grad_norm": 2.080427408218384, + "learning_rate": 0.00015547079818781055, + "loss": 1.4669, + "step": 9311 + }, + { + "epoch": 0.33348255053986786, + "grad_norm": 2.3720574378967285, + "learning_rate": 0.00015546114691276567, + "loss": 1.5753, + "step": 9312 + }, + { + "epoch": 0.33351836266943613, + "grad_norm": 1.8366647958755493, + "learning_rate": 0.0001554514948915626, + "loss": 1.5028, + "step": 9313 + }, + { + "epoch": 0.3335541747990044, + "grad_norm": 1.6955592632293701, + "learning_rate": 0.00015544184212433116, + "loss": 1.5171, + "step": 9314 + }, + { + "epoch": 0.3335899869285727, + "grad_norm": 2.041065216064453, + "learning_rate": 0.00015543218861120125, + "loss": 1.2655, + "step": 9315 + }, + { + "epoch": 0.333625799058141, + "grad_norm": 1.9579260349273682, + "learning_rate": 0.00015542253435230278, + "loss": 1.4517, + "step": 9316 + }, + { + "epoch": 0.33366161118770926, + "grad_norm": 1.472667932510376, + "learning_rate": 0.0001554128793477656, + "loss": 1.2546, + "step": 9317 + }, + { + "epoch": 0.3336974233172776, + "grad_norm": 1.5596768856048584, + "learning_rate": 0.0001554032235977196, + "loss": 1.5515, + "step": 9318 + }, + { + "epoch": 0.33373323544684586, + "grad_norm": 1.527860164642334, + "learning_rate": 0.0001553935671022947, + "loss": 1.3611, + "step": 9319 + }, + { + "epoch": 0.3337690475764141, + "grad_norm": 1.6964343786239624, + "learning_rate": 0.00015538390986162082, + "loss": 1.5114, + "step": 9320 + }, + { + "epoch": 0.3338048597059824, + "grad_norm": 1.6963163614273071, + "learning_rate": 0.00015537425187582785, + "loss": 1.7816, + "step": 9321 + }, + { + "epoch": 0.3338406718355507, + "grad_norm": 1.6652700901031494, + "learning_rate": 0.00015536459314504573, + "loss": 1.8709, + "step": 9322 + }, + { + "epoch": 0.333876483965119, + "grad_norm": 1.7200714349746704, + "learning_rate": 0.00015535493366940442, + "loss": 1.7631, + "step": 9323 + }, + { + "epoch": 0.33391229609468726, + "grad_norm": 2.2434933185577393, + "learning_rate": 0.0001553452734490339, + "loss": 1.6587, + "step": 9324 + }, + { + "epoch": 0.3339481082242556, + "grad_norm": 1.4792317152023315, + "learning_rate": 0.00015533561248406413, + "loss": 1.5377, + "step": 9325 + }, + { + "epoch": 0.33398392035382385, + "grad_norm": 1.9175893068313599, + "learning_rate": 0.00015532595077462507, + "loss": 1.6146, + "step": 9326 + }, + { + "epoch": 0.3340197324833921, + "grad_norm": 3.432957649230957, + "learning_rate": 0.0001553162883208467, + "loss": 1.5732, + "step": 9327 + }, + { + "epoch": 0.3340555446129604, + "grad_norm": 1.9333349466323853, + "learning_rate": 0.00015530662512285902, + "loss": 1.211, + "step": 9328 + }, + { + "epoch": 0.3340913567425287, + "grad_norm": 2.2683115005493164, + "learning_rate": 0.00015529696118079205, + "loss": 1.5071, + "step": 9329 + }, + { + "epoch": 0.334127168872097, + "grad_norm": 1.9926609992980957, + "learning_rate": 0.00015528729649477574, + "loss": 1.571, + "step": 9330 + }, + { + "epoch": 0.33416298100166525, + "grad_norm": 1.6616355180740356, + "learning_rate": 0.00015527763106494024, + "loss": 1.4256, + "step": 9331 + }, + { + "epoch": 0.3341987931312336, + "grad_norm": 1.9569220542907715, + "learning_rate": 0.0001552679648914155, + "loss": 1.8484, + "step": 9332 + }, + { + "epoch": 0.33423460526080184, + "grad_norm": 1.5473068952560425, + "learning_rate": 0.00015525829797433157, + "loss": 1.405, + "step": 9333 + }, + { + "epoch": 0.3342704173903701, + "grad_norm": 1.8051518201828003, + "learning_rate": 0.00015524863031381853, + "loss": 1.2997, + "step": 9334 + }, + { + "epoch": 0.3343062295199384, + "grad_norm": 1.7402265071868896, + "learning_rate": 0.00015523896191000643, + "loss": 1.7362, + "step": 9335 + }, + { + "epoch": 0.3343420416495067, + "grad_norm": 1.6411949396133423, + "learning_rate": 0.00015522929276302536, + "loss": 1.3214, + "step": 9336 + }, + { + "epoch": 0.334377853779075, + "grad_norm": 1.3993909358978271, + "learning_rate": 0.0001552196228730054, + "loss": 1.3546, + "step": 9337 + }, + { + "epoch": 0.33441366590864324, + "grad_norm": 2.0956623554229736, + "learning_rate": 0.00015520995224007662, + "loss": 1.5105, + "step": 9338 + }, + { + "epoch": 0.33444947803821157, + "grad_norm": 1.8981349468231201, + "learning_rate": 0.00015520028086436915, + "loss": 1.4384, + "step": 9339 + }, + { + "epoch": 0.33448529016777984, + "grad_norm": 2.153280258178711, + "learning_rate": 0.00015519060874601313, + "loss": 1.3638, + "step": 9340 + }, + { + "epoch": 0.3345211022973481, + "grad_norm": 1.4576733112335205, + "learning_rate": 0.00015518093588513863, + "loss": 1.5844, + "step": 9341 + }, + { + "epoch": 0.3345569144269164, + "grad_norm": 1.9353009462356567, + "learning_rate": 0.0001551712622818758, + "loss": 1.6606, + "step": 9342 + }, + { + "epoch": 0.3345927265564847, + "grad_norm": 1.6368129253387451, + "learning_rate": 0.00015516158793635486, + "loss": 1.2273, + "step": 9343 + }, + { + "epoch": 0.33462853868605297, + "grad_norm": 1.8436633348464966, + "learning_rate": 0.00015515191284870588, + "loss": 1.771, + "step": 9344 + }, + { + "epoch": 0.33466435081562124, + "grad_norm": 1.6525850296020508, + "learning_rate": 0.00015514223701905904, + "loss": 1.375, + "step": 9345 + }, + { + "epoch": 0.33470016294518956, + "grad_norm": 1.5455073118209839, + "learning_rate": 0.00015513256044754457, + "loss": 1.6118, + "step": 9346 + }, + { + "epoch": 0.33473597507475783, + "grad_norm": 1.7292460203170776, + "learning_rate": 0.00015512288313429258, + "loss": 1.6382, + "step": 9347 + }, + { + "epoch": 0.3347717872043261, + "grad_norm": 2.00976824760437, + "learning_rate": 0.0001551132050794333, + "loss": 1.4821, + "step": 9348 + }, + { + "epoch": 0.33480759933389437, + "grad_norm": 2.144354820251465, + "learning_rate": 0.00015510352628309693, + "loss": 1.8286, + "step": 9349 + }, + { + "epoch": 0.3348434114634627, + "grad_norm": 1.8691902160644531, + "learning_rate": 0.00015509384674541372, + "loss": 1.5248, + "step": 9350 + }, + { + "epoch": 0.33487922359303096, + "grad_norm": 1.6684008836746216, + "learning_rate": 0.00015508416646651385, + "loss": 1.5576, + "step": 9351 + }, + { + "epoch": 0.33491503572259923, + "grad_norm": 1.8261315822601318, + "learning_rate": 0.0001550744854465276, + "loss": 1.704, + "step": 9352 + }, + { + "epoch": 0.33495084785216755, + "grad_norm": 2.0527100563049316, + "learning_rate": 0.00015506480368558516, + "loss": 1.7708, + "step": 9353 + }, + { + "epoch": 0.3349866599817358, + "grad_norm": 1.6841418743133545, + "learning_rate": 0.00015505512118381683, + "loss": 1.365, + "step": 9354 + }, + { + "epoch": 0.3350224721113041, + "grad_norm": 1.531112790107727, + "learning_rate": 0.00015504543794135284, + "loss": 1.527, + "step": 9355 + }, + { + "epoch": 0.33505828424087236, + "grad_norm": 1.4720302820205688, + "learning_rate": 0.00015503575395832352, + "loss": 1.4721, + "step": 9356 + }, + { + "epoch": 0.3350940963704407, + "grad_norm": 2.4081616401672363, + "learning_rate": 0.00015502606923485906, + "loss": 1.4112, + "step": 9357 + }, + { + "epoch": 0.33512990850000896, + "grad_norm": 1.4726104736328125, + "learning_rate": 0.00015501638377108987, + "loss": 1.4118, + "step": 9358 + }, + { + "epoch": 0.3351657206295772, + "grad_norm": 1.6351170539855957, + "learning_rate": 0.00015500669756714618, + "loss": 1.341, + "step": 9359 + }, + { + "epoch": 0.33520153275914555, + "grad_norm": 2.084728717803955, + "learning_rate": 0.0001549970106231583, + "loss": 1.457, + "step": 9360 + }, + { + "epoch": 0.3352373448887138, + "grad_norm": 1.7264491319656372, + "learning_rate": 0.00015498732293925667, + "loss": 1.5515, + "step": 9361 + }, + { + "epoch": 0.3352731570182821, + "grad_norm": 1.6600877046585083, + "learning_rate": 0.00015497763451557148, + "loss": 1.5989, + "step": 9362 + }, + { + "epoch": 0.33530896914785036, + "grad_norm": 1.4888222217559814, + "learning_rate": 0.00015496794535223315, + "loss": 1.0686, + "step": 9363 + }, + { + "epoch": 0.3353447812774187, + "grad_norm": 2.209500312805176, + "learning_rate": 0.000154958255449372, + "loss": 1.722, + "step": 9364 + }, + { + "epoch": 0.33538059340698695, + "grad_norm": 1.8588663339614868, + "learning_rate": 0.00015494856480711844, + "loss": 1.3165, + "step": 9365 + }, + { + "epoch": 0.3354164055365552, + "grad_norm": 2.067023754119873, + "learning_rate": 0.0001549388734256028, + "loss": 1.8567, + "step": 9366 + }, + { + "epoch": 0.33545221766612354, + "grad_norm": 1.5753854513168335, + "learning_rate": 0.00015492918130495547, + "loss": 1.2895, + "step": 9367 + }, + { + "epoch": 0.3354880297956918, + "grad_norm": 1.8132989406585693, + "learning_rate": 0.0001549194884453069, + "loss": 1.4248, + "step": 9368 + }, + { + "epoch": 0.3355238419252601, + "grad_norm": 1.5888770818710327, + "learning_rate": 0.00015490979484678743, + "loss": 1.2666, + "step": 9369 + }, + { + "epoch": 0.33555965405482835, + "grad_norm": 2.0815927982330322, + "learning_rate": 0.0001549001005095275, + "loss": 1.6768, + "step": 9370 + }, + { + "epoch": 0.3355954661843967, + "grad_norm": 1.6123459339141846, + "learning_rate": 0.00015489040543365754, + "loss": 1.4473, + "step": 9371 + }, + { + "epoch": 0.33563127831396494, + "grad_norm": 1.461475133895874, + "learning_rate": 0.00015488070961930796, + "loss": 1.4638, + "step": 9372 + }, + { + "epoch": 0.3356670904435332, + "grad_norm": 1.3855684995651245, + "learning_rate": 0.00015487101306660924, + "loss": 1.3953, + "step": 9373 + }, + { + "epoch": 0.33570290257310154, + "grad_norm": 2.608703851699829, + "learning_rate": 0.00015486131577569182, + "loss": 1.5456, + "step": 9374 + }, + { + "epoch": 0.3357387147026698, + "grad_norm": 1.4365531206130981, + "learning_rate": 0.00015485161774668615, + "loss": 1.5599, + "step": 9375 + }, + { + "epoch": 0.3357745268322381, + "grad_norm": 1.4558398723602295, + "learning_rate": 0.00015484191897972274, + "loss": 1.4977, + "step": 9376 + }, + { + "epoch": 0.33581033896180634, + "grad_norm": 1.7082833051681519, + "learning_rate": 0.000154832219474932, + "loss": 1.3407, + "step": 9377 + }, + { + "epoch": 0.33584615109137467, + "grad_norm": 1.5729615688323975, + "learning_rate": 0.00015482251923244452, + "loss": 1.4501, + "step": 9378 + }, + { + "epoch": 0.33588196322094294, + "grad_norm": 1.2978506088256836, + "learning_rate": 0.00015481281825239072, + "loss": 1.5467, + "step": 9379 + }, + { + "epoch": 0.3359177753505112, + "grad_norm": 1.3954700231552124, + "learning_rate": 0.00015480311653490124, + "loss": 1.4613, + "step": 9380 + }, + { + "epoch": 0.33595358748007953, + "grad_norm": 1.4414048194885254, + "learning_rate": 0.00015479341408010643, + "loss": 1.4413, + "step": 9381 + }, + { + "epoch": 0.3359893996096478, + "grad_norm": 1.9647263288497925, + "learning_rate": 0.00015478371088813696, + "loss": 1.8244, + "step": 9382 + }, + { + "epoch": 0.33602521173921607, + "grad_norm": 1.619997262954712, + "learning_rate": 0.0001547740069591233, + "loss": 1.6385, + "step": 9383 + }, + { + "epoch": 0.33606102386878434, + "grad_norm": 1.9023948907852173, + "learning_rate": 0.00015476430229319603, + "loss": 1.5266, + "step": 9384 + }, + { + "epoch": 0.33609683599835266, + "grad_norm": 1.7760009765625, + "learning_rate": 0.00015475459689048572, + "loss": 1.5373, + "step": 9385 + }, + { + "epoch": 0.33613264812792093, + "grad_norm": 1.5633459091186523, + "learning_rate": 0.00015474489075112296, + "loss": 1.4418, + "step": 9386 + }, + { + "epoch": 0.3361684602574892, + "grad_norm": 1.4270317554473877, + "learning_rate": 0.00015473518387523825, + "loss": 1.5547, + "step": 9387 + }, + { + "epoch": 0.3362042723870575, + "grad_norm": 2.286064386367798, + "learning_rate": 0.0001547254762629623, + "loss": 1.5236, + "step": 9388 + }, + { + "epoch": 0.3362400845166258, + "grad_norm": 1.7473195791244507, + "learning_rate": 0.00015471576791442564, + "loss": 1.5732, + "step": 9389 + }, + { + "epoch": 0.33627589664619406, + "grad_norm": 1.81753408908844, + "learning_rate": 0.00015470605882975891, + "loss": 1.4656, + "step": 9390 + }, + { + "epoch": 0.33631170877576233, + "grad_norm": 1.6820625066757202, + "learning_rate": 0.00015469634900909271, + "loss": 1.4767, + "step": 9391 + }, + { + "epoch": 0.33634752090533065, + "grad_norm": 1.315958023071289, + "learning_rate": 0.00015468663845255768, + "loss": 1.5724, + "step": 9392 + }, + { + "epoch": 0.3363833330348989, + "grad_norm": 2.7729685306549072, + "learning_rate": 0.0001546769271602845, + "loss": 1.4956, + "step": 9393 + }, + { + "epoch": 0.3364191451644672, + "grad_norm": 2.246842622756958, + "learning_rate": 0.0001546672151324038, + "loss": 1.6252, + "step": 9394 + }, + { + "epoch": 0.3364549572940355, + "grad_norm": 1.5574842691421509, + "learning_rate": 0.0001546575023690462, + "loss": 1.6173, + "step": 9395 + }, + { + "epoch": 0.3364907694236038, + "grad_norm": 1.6773743629455566, + "learning_rate": 0.00015464778887034242, + "loss": 1.4211, + "step": 9396 + }, + { + "epoch": 0.33652658155317206, + "grad_norm": 1.4379688501358032, + "learning_rate": 0.0001546380746364231, + "loss": 1.5853, + "step": 9397 + }, + { + "epoch": 0.3365623936827403, + "grad_norm": 1.9088571071624756, + "learning_rate": 0.00015462835966741903, + "loss": 0.9463, + "step": 9398 + }, + { + "epoch": 0.33659820581230865, + "grad_norm": 1.3112964630126953, + "learning_rate": 0.0001546186439634608, + "loss": 1.6045, + "step": 9399 + }, + { + "epoch": 0.3366340179418769, + "grad_norm": 1.9358813762664795, + "learning_rate": 0.0001546089275246792, + "loss": 1.6525, + "step": 9400 + }, + { + "epoch": 0.3366698300714452, + "grad_norm": 1.9341341257095337, + "learning_rate": 0.00015459921035120488, + "loss": 1.5339, + "step": 9401 + }, + { + "epoch": 0.3367056422010135, + "grad_norm": 1.2685867547988892, + "learning_rate": 0.00015458949244316866, + "loss": 1.5571, + "step": 9402 + }, + { + "epoch": 0.3367414543305818, + "grad_norm": 1.3416786193847656, + "learning_rate": 0.00015457977380070118, + "loss": 1.4485, + "step": 9403 + }, + { + "epoch": 0.33677726646015005, + "grad_norm": 2.085193157196045, + "learning_rate": 0.00015457005442393327, + "loss": 1.656, + "step": 9404 + }, + { + "epoch": 0.3368130785897183, + "grad_norm": 1.7671416997909546, + "learning_rate": 0.00015456033431299567, + "loss": 1.3032, + "step": 9405 + }, + { + "epoch": 0.33684889071928664, + "grad_norm": 1.6520533561706543, + "learning_rate": 0.00015455061346801916, + "loss": 1.3252, + "step": 9406 + }, + { + "epoch": 0.3368847028488549, + "grad_norm": 2.0870721340179443, + "learning_rate": 0.00015454089188913454, + "loss": 1.3519, + "step": 9407 + }, + { + "epoch": 0.3369205149784232, + "grad_norm": 2.2233269214630127, + "learning_rate": 0.00015453116957647254, + "loss": 1.4443, + "step": 9408 + }, + { + "epoch": 0.33695632710799145, + "grad_norm": 1.454346776008606, + "learning_rate": 0.00015452144653016397, + "loss": 1.5947, + "step": 9409 + }, + { + "epoch": 0.3369921392375598, + "grad_norm": 1.5266408920288086, + "learning_rate": 0.0001545117227503397, + "loss": 1.456, + "step": 9410 + }, + { + "epoch": 0.33702795136712804, + "grad_norm": 1.5770138502120972, + "learning_rate": 0.00015450199823713047, + "loss": 1.4124, + "step": 9411 + }, + { + "epoch": 0.3370637634966963, + "grad_norm": 1.7103294134140015, + "learning_rate": 0.0001544922729906672, + "loss": 1.4488, + "step": 9412 + }, + { + "epoch": 0.33709957562626464, + "grad_norm": 1.5208357572555542, + "learning_rate": 0.00015448254701108067, + "loss": 1.437, + "step": 9413 + }, + { + "epoch": 0.3371353877558329, + "grad_norm": 1.9375684261322021, + "learning_rate": 0.00015447282029850174, + "loss": 1.2997, + "step": 9414 + }, + { + "epoch": 0.3371711998854012, + "grad_norm": 1.5922077894210815, + "learning_rate": 0.00015446309285306131, + "loss": 1.5067, + "step": 9415 + }, + { + "epoch": 0.33720701201496944, + "grad_norm": 1.5397335290908813, + "learning_rate": 0.0001544533646748902, + "loss": 1.3838, + "step": 9416 + }, + { + "epoch": 0.33724282414453777, + "grad_norm": 1.709815263748169, + "learning_rate": 0.00015444363576411929, + "loss": 1.3891, + "step": 9417 + }, + { + "epoch": 0.33727863627410604, + "grad_norm": 1.6586835384368896, + "learning_rate": 0.00015443390612087952, + "loss": 1.3357, + "step": 9418 + }, + { + "epoch": 0.3373144484036743, + "grad_norm": 1.817344069480896, + "learning_rate": 0.00015442417574530173, + "loss": 1.5462, + "step": 9419 + }, + { + "epoch": 0.33735026053324263, + "grad_norm": 1.4759312868118286, + "learning_rate": 0.00015441444463751687, + "loss": 1.6142, + "step": 9420 + }, + { + "epoch": 0.3373860726628109, + "grad_norm": 1.9167733192443848, + "learning_rate": 0.00015440471279765583, + "loss": 1.5566, + "step": 9421 + }, + { + "epoch": 0.33742188479237917, + "grad_norm": 1.3484073877334595, + "learning_rate": 0.00015439498022584957, + "loss": 1.4768, + "step": 9422 + }, + { + "epoch": 0.33745769692194744, + "grad_norm": 1.864970088005066, + "learning_rate": 0.00015438524692222902, + "loss": 1.5269, + "step": 9423 + }, + { + "epoch": 0.33749350905151576, + "grad_norm": 1.4716098308563232, + "learning_rate": 0.00015437551288692512, + "loss": 1.452, + "step": 9424 + }, + { + "epoch": 0.33752932118108403, + "grad_norm": 2.067917823791504, + "learning_rate": 0.00015436577812006884, + "loss": 1.4989, + "step": 9425 + }, + { + "epoch": 0.3375651333106523, + "grad_norm": 2.316983938217163, + "learning_rate": 0.00015435604262179116, + "loss": 1.2659, + "step": 9426 + }, + { + "epoch": 0.3376009454402206, + "grad_norm": 2.3151962757110596, + "learning_rate": 0.000154346306392223, + "loss": 1.5562, + "step": 9427 + }, + { + "epoch": 0.3376367575697889, + "grad_norm": 1.9097533226013184, + "learning_rate": 0.00015433656943149543, + "loss": 1.7194, + "step": 9428 + }, + { + "epoch": 0.33767256969935716, + "grad_norm": 1.6860368251800537, + "learning_rate": 0.00015432683173973935, + "loss": 1.7439, + "step": 9429 + }, + { + "epoch": 0.33770838182892543, + "grad_norm": 1.6204484701156616, + "learning_rate": 0.0001543170933170859, + "loss": 1.4332, + "step": 9430 + }, + { + "epoch": 0.33774419395849375, + "grad_norm": 1.807880163192749, + "learning_rate": 0.00015430735416366596, + "loss": 1.7937, + "step": 9431 + }, + { + "epoch": 0.337780006088062, + "grad_norm": 1.4650651216506958, + "learning_rate": 0.00015429761427961065, + "loss": 1.456, + "step": 9432 + }, + { + "epoch": 0.3378158182176303, + "grad_norm": 1.7693088054656982, + "learning_rate": 0.00015428787366505094, + "loss": 1.6138, + "step": 9433 + }, + { + "epoch": 0.3378516303471986, + "grad_norm": 2.2674636840820312, + "learning_rate": 0.00015427813232011799, + "loss": 1.6677, + "step": 9434 + }, + { + "epoch": 0.3378874424767669, + "grad_norm": 1.9544172286987305, + "learning_rate": 0.00015426839024494272, + "loss": 1.6052, + "step": 9435 + }, + { + "epoch": 0.33792325460633515, + "grad_norm": 2.344970703125, + "learning_rate": 0.0001542586474396563, + "loss": 1.4323, + "step": 9436 + }, + { + "epoch": 0.3379590667359034, + "grad_norm": 2.032620906829834, + "learning_rate": 0.00015424890390438974, + "loss": 1.5969, + "step": 9437 + }, + { + "epoch": 0.33799487886547175, + "grad_norm": 1.735715627670288, + "learning_rate": 0.00015423915963927418, + "loss": 1.4165, + "step": 9438 + }, + { + "epoch": 0.33803069099504, + "grad_norm": 1.2005674839019775, + "learning_rate": 0.00015422941464444064, + "loss": 1.569, + "step": 9439 + }, + { + "epoch": 0.3380665031246083, + "grad_norm": 1.4805797338485718, + "learning_rate": 0.00015421966892002032, + "loss": 1.5567, + "step": 9440 + }, + { + "epoch": 0.3381023152541766, + "grad_norm": 1.743964672088623, + "learning_rate": 0.00015420992246614428, + "loss": 1.5592, + "step": 9441 + }, + { + "epoch": 0.3381381273837449, + "grad_norm": 1.772685170173645, + "learning_rate": 0.00015420017528294368, + "loss": 1.4956, + "step": 9442 + }, + { + "epoch": 0.33817393951331315, + "grad_norm": 1.8625121116638184, + "learning_rate": 0.00015419042737054963, + "loss": 1.8012, + "step": 9443 + }, + { + "epoch": 0.3382097516428814, + "grad_norm": 1.4339003562927246, + "learning_rate": 0.00015418067872909326, + "loss": 1.5661, + "step": 9444 + }, + { + "epoch": 0.33824556377244974, + "grad_norm": 1.6561118364334106, + "learning_rate": 0.00015417092935870574, + "loss": 1.7139, + "step": 9445 + }, + { + "epoch": 0.338281375902018, + "grad_norm": 1.8280638456344604, + "learning_rate": 0.00015416117925951827, + "loss": 1.143, + "step": 9446 + }, + { + "epoch": 0.3383171880315863, + "grad_norm": 1.4793591499328613, + "learning_rate": 0.000154151428431662, + "loss": 1.422, + "step": 9447 + }, + { + "epoch": 0.3383530001611546, + "grad_norm": 1.537424921989441, + "learning_rate": 0.00015414167687526805, + "loss": 1.3919, + "step": 9448 + }, + { + "epoch": 0.3383888122907229, + "grad_norm": 2.0304996967315674, + "learning_rate": 0.00015413192459046772, + "loss": 1.3456, + "step": 9449 + }, + { + "epoch": 0.33842462442029114, + "grad_norm": 1.8802131414413452, + "learning_rate": 0.00015412217157739216, + "loss": 1.2977, + "step": 9450 + }, + { + "epoch": 0.3384604365498594, + "grad_norm": 1.9226833581924438, + "learning_rate": 0.00015411241783617262, + "loss": 1.4789, + "step": 9451 + }, + { + "epoch": 0.33849624867942774, + "grad_norm": 1.7963732481002808, + "learning_rate": 0.0001541026633669403, + "loss": 1.3647, + "step": 9452 + }, + { + "epoch": 0.338532060808996, + "grad_norm": 2.629523515701294, + "learning_rate": 0.0001540929081698264, + "loss": 1.3783, + "step": 9453 + }, + { + "epoch": 0.3385678729385643, + "grad_norm": 1.9142543077468872, + "learning_rate": 0.00015408315224496222, + "loss": 1.5034, + "step": 9454 + }, + { + "epoch": 0.3386036850681326, + "grad_norm": 1.6466937065124512, + "learning_rate": 0.00015407339559247895, + "loss": 1.5608, + "step": 9455 + }, + { + "epoch": 0.33863949719770087, + "grad_norm": 1.5068143606185913, + "learning_rate": 0.00015406363821250793, + "loss": 1.5739, + "step": 9456 + }, + { + "epoch": 0.33867530932726914, + "grad_norm": 1.6713124513626099, + "learning_rate": 0.00015405388010518038, + "loss": 1.9615, + "step": 9457 + }, + { + "epoch": 0.3387111214568374, + "grad_norm": 2.409848928451538, + "learning_rate": 0.00015404412127062762, + "loss": 1.7276, + "step": 9458 + }, + { + "epoch": 0.33874693358640573, + "grad_norm": 1.9540921449661255, + "learning_rate": 0.00015403436170898088, + "loss": 1.6689, + "step": 9459 + }, + { + "epoch": 0.338782745715974, + "grad_norm": 1.4777239561080933, + "learning_rate": 0.00015402460142037154, + "loss": 1.4505, + "step": 9460 + }, + { + "epoch": 0.33881855784554227, + "grad_norm": 2.009676694869995, + "learning_rate": 0.00015401484040493085, + "loss": 1.6446, + "step": 9461 + }, + { + "epoch": 0.3388543699751106, + "grad_norm": 1.903566598892212, + "learning_rate": 0.00015400507866279018, + "loss": 1.5648, + "step": 9462 + }, + { + "epoch": 0.33889018210467886, + "grad_norm": 1.4451779127120972, + "learning_rate": 0.0001539953161940808, + "loss": 1.6952, + "step": 9463 + }, + { + "epoch": 0.33892599423424713, + "grad_norm": 2.141256332397461, + "learning_rate": 0.00015398555299893412, + "loss": 1.4004, + "step": 9464 + }, + { + "epoch": 0.3389618063638154, + "grad_norm": 1.5447112321853638, + "learning_rate": 0.00015397578907748146, + "loss": 1.6946, + "step": 9465 + }, + { + "epoch": 0.3389976184933837, + "grad_norm": 1.8307271003723145, + "learning_rate": 0.00015396602442985417, + "loss": 1.4, + "step": 9466 + }, + { + "epoch": 0.339033430622952, + "grad_norm": 1.5202934741973877, + "learning_rate": 0.00015395625905618364, + "loss": 1.52, + "step": 9467 + }, + { + "epoch": 0.33906924275252026, + "grad_norm": 1.4197298288345337, + "learning_rate": 0.00015394649295660123, + "loss": 1.6863, + "step": 9468 + }, + { + "epoch": 0.3391050548820886, + "grad_norm": 1.4701685905456543, + "learning_rate": 0.00015393672613123836, + "loss": 1.3284, + "step": 9469 + }, + { + "epoch": 0.33914086701165685, + "grad_norm": 1.5090365409851074, + "learning_rate": 0.0001539269585802264, + "loss": 1.5528, + "step": 9470 + }, + { + "epoch": 0.3391766791412251, + "grad_norm": 1.5357013940811157, + "learning_rate": 0.0001539171903036968, + "loss": 1.5706, + "step": 9471 + }, + { + "epoch": 0.3392124912707934, + "grad_norm": 1.3371944427490234, + "learning_rate": 0.0001539074213017809, + "loss": 1.2997, + "step": 9472 + }, + { + "epoch": 0.3392483034003617, + "grad_norm": 1.5634480714797974, + "learning_rate": 0.00015389765157461022, + "loss": 1.2902, + "step": 9473 + }, + { + "epoch": 0.33928411552993, + "grad_norm": 1.4515788555145264, + "learning_rate": 0.00015388788112231615, + "loss": 1.2752, + "step": 9474 + }, + { + "epoch": 0.33931992765949825, + "grad_norm": 1.9081002473831177, + "learning_rate": 0.00015387810994503016, + "loss": 1.4576, + "step": 9475 + }, + { + "epoch": 0.3393557397890666, + "grad_norm": 1.6875176429748535, + "learning_rate": 0.0001538683380428837, + "loss": 1.1983, + "step": 9476 + }, + { + "epoch": 0.33939155191863485, + "grad_norm": 2.4767041206359863, + "learning_rate": 0.00015385856541600825, + "loss": 1.47, + "step": 9477 + }, + { + "epoch": 0.3394273640482031, + "grad_norm": 1.4980638027191162, + "learning_rate": 0.00015384879206453524, + "loss": 1.4585, + "step": 9478 + }, + { + "epoch": 0.3394631761777714, + "grad_norm": 2.4069361686706543, + "learning_rate": 0.00015383901798859622, + "loss": 1.485, + "step": 9479 + }, + { + "epoch": 0.3394989883073397, + "grad_norm": 1.3716156482696533, + "learning_rate": 0.00015382924318832264, + "loss": 1.4037, + "step": 9480 + }, + { + "epoch": 0.339534800436908, + "grad_norm": 1.2617043256759644, + "learning_rate": 0.00015381946766384602, + "loss": 1.4763, + "step": 9481 + }, + { + "epoch": 0.33957061256647625, + "grad_norm": 1.9273022413253784, + "learning_rate": 0.0001538096914152979, + "loss": 1.676, + "step": 9482 + }, + { + "epoch": 0.3396064246960446, + "grad_norm": 1.8018248081207275, + "learning_rate": 0.00015379991444280979, + "loss": 1.6164, + "step": 9483 + }, + { + "epoch": 0.33964223682561284, + "grad_norm": 1.717280387878418, + "learning_rate": 0.00015379013674651323, + "loss": 1.6003, + "step": 9484 + }, + { + "epoch": 0.3396780489551811, + "grad_norm": 1.4812376499176025, + "learning_rate": 0.00015378035832653975, + "loss": 1.7848, + "step": 9485 + }, + { + "epoch": 0.3397138610847494, + "grad_norm": 1.6170680522918701, + "learning_rate": 0.00015377057918302097, + "loss": 1.567, + "step": 9486 + }, + { + "epoch": 0.3397496732143177, + "grad_norm": 1.3598394393920898, + "learning_rate": 0.00015376079931608838, + "loss": 1.5678, + "step": 9487 + }, + { + "epoch": 0.339785485343886, + "grad_norm": 1.3419015407562256, + "learning_rate": 0.00015375101872587357, + "loss": 1.4495, + "step": 9488 + }, + { + "epoch": 0.33982129747345424, + "grad_norm": 1.6549489498138428, + "learning_rate": 0.00015374123741250815, + "loss": 1.4173, + "step": 9489 + }, + { + "epoch": 0.33985710960302257, + "grad_norm": 1.4437153339385986, + "learning_rate": 0.00015373145537612369, + "loss": 1.5415, + "step": 9490 + }, + { + "epoch": 0.33989292173259084, + "grad_norm": 3.1351869106292725, + "learning_rate": 0.00015372167261685178, + "loss": 1.3433, + "step": 9491 + }, + { + "epoch": 0.3399287338621591, + "grad_norm": 2.001826763153076, + "learning_rate": 0.0001537118891348241, + "loss": 1.3978, + "step": 9492 + }, + { + "epoch": 0.3399645459917274, + "grad_norm": 1.6999043226242065, + "learning_rate": 0.00015370210493017222, + "loss": 1.4984, + "step": 9493 + }, + { + "epoch": 0.3400003581212957, + "grad_norm": 2.0367825031280518, + "learning_rate": 0.00015369232000302777, + "loss": 1.7663, + "step": 9494 + }, + { + "epoch": 0.34003617025086397, + "grad_norm": 2.383127212524414, + "learning_rate": 0.00015368253435352246, + "loss": 1.5093, + "step": 9495 + }, + { + "epoch": 0.34007198238043224, + "grad_norm": 1.8723777532577515, + "learning_rate": 0.00015367274798178788, + "loss": 1.5293, + "step": 9496 + }, + { + "epoch": 0.34010779451000056, + "grad_norm": 1.9914052486419678, + "learning_rate": 0.0001536629608879557, + "loss": 1.4791, + "step": 9497 + }, + { + "epoch": 0.34014360663956883, + "grad_norm": 1.547864317893982, + "learning_rate": 0.00015365317307215759, + "loss": 1.7745, + "step": 9498 + }, + { + "epoch": 0.3401794187691371, + "grad_norm": 2.0833046436309814, + "learning_rate": 0.00015364338453452528, + "loss": 1.5529, + "step": 9499 + }, + { + "epoch": 0.34021523089870537, + "grad_norm": 1.4355981349945068, + "learning_rate": 0.00015363359527519036, + "loss": 1.5735, + "step": 9500 + }, + { + "epoch": 0.3402510430282737, + "grad_norm": 2.000267267227173, + "learning_rate": 0.00015362380529428466, + "loss": 1.4826, + "step": 9501 + }, + { + "epoch": 0.34028685515784196, + "grad_norm": 2.343449831008911, + "learning_rate": 0.0001536140145919398, + "loss": 1.479, + "step": 9502 + }, + { + "epoch": 0.34032266728741023, + "grad_norm": 1.5564789772033691, + "learning_rate": 0.00015360422316828754, + "loss": 1.6355, + "step": 9503 + }, + { + "epoch": 0.34035847941697855, + "grad_norm": 1.6426461935043335, + "learning_rate": 0.0001535944310234596, + "loss": 1.7457, + "step": 9504 + }, + { + "epoch": 0.3403942915465468, + "grad_norm": 1.9233627319335938, + "learning_rate": 0.0001535846381575877, + "loss": 1.7404, + "step": 9505 + }, + { + "epoch": 0.3404301036761151, + "grad_norm": 1.6464073657989502, + "learning_rate": 0.00015357484457080366, + "loss": 1.1418, + "step": 9506 + }, + { + "epoch": 0.34046591580568336, + "grad_norm": 1.4640589952468872, + "learning_rate": 0.00015356505026323917, + "loss": 1.5402, + "step": 9507 + }, + { + "epoch": 0.3405017279352517, + "grad_norm": 1.8246670961380005, + "learning_rate": 0.00015355525523502603, + "loss": 1.6663, + "step": 9508 + }, + { + "epoch": 0.34053754006481995, + "grad_norm": 1.9067139625549316, + "learning_rate": 0.00015354545948629598, + "loss": 1.6515, + "step": 9509 + }, + { + "epoch": 0.3405733521943882, + "grad_norm": 1.3062249422073364, + "learning_rate": 0.00015353566301718087, + "loss": 1.3262, + "step": 9510 + }, + { + "epoch": 0.34060916432395655, + "grad_norm": 1.529366374015808, + "learning_rate": 0.00015352586582781247, + "loss": 1.5815, + "step": 9511 + }, + { + "epoch": 0.3406449764535248, + "grad_norm": 1.1446207761764526, + "learning_rate": 0.0001535160679183226, + "loss": 1.3855, + "step": 9512 + }, + { + "epoch": 0.3406807885830931, + "grad_norm": 1.392532467842102, + "learning_rate": 0.00015350626928884307, + "loss": 1.6394, + "step": 9513 + }, + { + "epoch": 0.34071660071266135, + "grad_norm": 1.3918993473052979, + "learning_rate": 0.00015349646993950567, + "loss": 1.3734, + "step": 9514 + }, + { + "epoch": 0.3407524128422297, + "grad_norm": 1.6025148630142212, + "learning_rate": 0.00015348666987044228, + "loss": 1.603, + "step": 9515 + }, + { + "epoch": 0.34078822497179795, + "grad_norm": 1.2502943277359009, + "learning_rate": 0.00015347686908178475, + "loss": 1.2778, + "step": 9516 + }, + { + "epoch": 0.3408240371013662, + "grad_norm": 1.4781215190887451, + "learning_rate": 0.0001534670675736649, + "loss": 1.2568, + "step": 9517 + }, + { + "epoch": 0.34085984923093454, + "grad_norm": 1.4211047887802124, + "learning_rate": 0.00015345726534621466, + "loss": 1.7071, + "step": 9518 + }, + { + "epoch": 0.3408956613605028, + "grad_norm": 1.5720802545547485, + "learning_rate": 0.00015344746239956587, + "loss": 1.6633, + "step": 9519 + }, + { + "epoch": 0.3409314734900711, + "grad_norm": 1.248879075050354, + "learning_rate": 0.00015343765873385037, + "loss": 1.731, + "step": 9520 + }, + { + "epoch": 0.34096728561963935, + "grad_norm": 1.5327391624450684, + "learning_rate": 0.00015342785434920017, + "loss": 1.3957, + "step": 9521 + }, + { + "epoch": 0.3410030977492077, + "grad_norm": 1.6686993837356567, + "learning_rate": 0.00015341804924574707, + "loss": 1.5403, + "step": 9522 + }, + { + "epoch": 0.34103890987877594, + "grad_norm": 1.3809535503387451, + "learning_rate": 0.00015340824342362303, + "loss": 1.6858, + "step": 9523 + }, + { + "epoch": 0.3410747220083442, + "grad_norm": 1.8143247365951538, + "learning_rate": 0.00015339843688295997, + "loss": 1.5465, + "step": 9524 + }, + { + "epoch": 0.34111053413791254, + "grad_norm": 1.9842431545257568, + "learning_rate": 0.00015338862962388977, + "loss": 1.294, + "step": 9525 + }, + { + "epoch": 0.3411463462674808, + "grad_norm": 1.9405596256256104, + "learning_rate": 0.0001533788216465445, + "loss": 1.6388, + "step": 9526 + }, + { + "epoch": 0.3411821583970491, + "grad_norm": 1.7968621253967285, + "learning_rate": 0.00015336901295105596, + "loss": 1.8472, + "step": 9527 + }, + { + "epoch": 0.34121797052661734, + "grad_norm": 2.323481559753418, + "learning_rate": 0.00015335920353755627, + "loss": 1.5243, + "step": 9528 + }, + { + "epoch": 0.34125378265618567, + "grad_norm": 3.580449342727661, + "learning_rate": 0.00015334939340617726, + "loss": 1.6631, + "step": 9529 + }, + { + "epoch": 0.34128959478575394, + "grad_norm": 1.3738518953323364, + "learning_rate": 0.00015333958255705102, + "loss": 1.6034, + "step": 9530 + }, + { + "epoch": 0.3413254069153222, + "grad_norm": 1.4165027141571045, + "learning_rate": 0.00015332977099030953, + "loss": 1.8126, + "step": 9531 + }, + { + "epoch": 0.34136121904489053, + "grad_norm": 1.6622012853622437, + "learning_rate": 0.0001533199587060847, + "loss": 1.3838, + "step": 9532 + }, + { + "epoch": 0.3413970311744588, + "grad_norm": 1.4645581245422363, + "learning_rate": 0.0001533101457045086, + "loss": 1.648, + "step": 9533 + }, + { + "epoch": 0.34143284330402707, + "grad_norm": 1.7480566501617432, + "learning_rate": 0.0001533003319857133, + "loss": 1.3988, + "step": 9534 + }, + { + "epoch": 0.34146865543359534, + "grad_norm": 1.2829252481460571, + "learning_rate": 0.00015329051754983076, + "loss": 1.313, + "step": 9535 + }, + { + "epoch": 0.34150446756316366, + "grad_norm": 1.990755558013916, + "learning_rate": 0.00015328070239699305, + "loss": 1.4727, + "step": 9536 + }, + { + "epoch": 0.34154027969273193, + "grad_norm": 1.4537103176116943, + "learning_rate": 0.0001532708865273322, + "loss": 1.4554, + "step": 9537 + }, + { + "epoch": 0.3415760918223002, + "grad_norm": 1.6201869249343872, + "learning_rate": 0.0001532610699409803, + "loss": 1.5224, + "step": 9538 + }, + { + "epoch": 0.3416119039518685, + "grad_norm": 1.5027778148651123, + "learning_rate": 0.00015325125263806943, + "loss": 1.4322, + "step": 9539 + }, + { + "epoch": 0.3416477160814368, + "grad_norm": 1.5254660844802856, + "learning_rate": 0.0001532414346187316, + "loss": 1.5606, + "step": 9540 + }, + { + "epoch": 0.34168352821100506, + "grad_norm": 1.727812647819519, + "learning_rate": 0.000153231615883099, + "loss": 1.6136, + "step": 9541 + }, + { + "epoch": 0.34171934034057333, + "grad_norm": 2.322026491165161, + "learning_rate": 0.0001532217964313036, + "loss": 1.5319, + "step": 9542 + }, + { + "epoch": 0.34175515247014165, + "grad_norm": 2.0224554538726807, + "learning_rate": 0.00015321197626347766, + "loss": 1.6205, + "step": 9543 + }, + { + "epoch": 0.3417909645997099, + "grad_norm": 1.3443764448165894, + "learning_rate": 0.00015320215537975313, + "loss": 1.6137, + "step": 9544 + }, + { + "epoch": 0.3418267767292782, + "grad_norm": 1.777479648590088, + "learning_rate": 0.0001531923337802623, + "loss": 1.7252, + "step": 9545 + }, + { + "epoch": 0.3418625888588465, + "grad_norm": 1.4354276657104492, + "learning_rate": 0.0001531825114651372, + "loss": 1.5497, + "step": 9546 + }, + { + "epoch": 0.3418984009884148, + "grad_norm": 2.5619523525238037, + "learning_rate": 0.00015317268843451003, + "loss": 1.5586, + "step": 9547 + }, + { + "epoch": 0.34193421311798305, + "grad_norm": 1.8442054986953735, + "learning_rate": 0.0001531628646885129, + "loss": 1.6815, + "step": 9548 + }, + { + "epoch": 0.3419700252475513, + "grad_norm": 1.897230625152588, + "learning_rate": 0.000153153040227278, + "loss": 1.4317, + "step": 9549 + }, + { + "epoch": 0.34200583737711965, + "grad_norm": 1.4326685667037964, + "learning_rate": 0.00015314321505093751, + "loss": 1.6475, + "step": 9550 + }, + { + "epoch": 0.3420416495066879, + "grad_norm": 2.543628692626953, + "learning_rate": 0.00015313338915962362, + "loss": 1.6623, + "step": 9551 + }, + { + "epoch": 0.3420774616362562, + "grad_norm": 1.4500302076339722, + "learning_rate": 0.0001531235625534685, + "loss": 1.3383, + "step": 9552 + }, + { + "epoch": 0.3421132737658245, + "grad_norm": 1.4041082859039307, + "learning_rate": 0.00015311373523260437, + "loss": 1.3654, + "step": 9553 + }, + { + "epoch": 0.3421490858953928, + "grad_norm": 1.5999187231063843, + "learning_rate": 0.00015310390719716348, + "loss": 1.5783, + "step": 9554 + }, + { + "epoch": 0.34218489802496105, + "grad_norm": 1.8230500221252441, + "learning_rate": 0.000153094078447278, + "loss": 1.4435, + "step": 9555 + }, + { + "epoch": 0.3422207101545293, + "grad_norm": 2.0518381595611572, + "learning_rate": 0.00015308424898308017, + "loss": 1.5841, + "step": 9556 + }, + { + "epoch": 0.34225652228409764, + "grad_norm": 1.2910943031311035, + "learning_rate": 0.00015307441880470227, + "loss": 1.5275, + "step": 9557 + }, + { + "epoch": 0.3422923344136659, + "grad_norm": 1.3538620471954346, + "learning_rate": 0.00015306458791227646, + "loss": 1.685, + "step": 9558 + }, + { + "epoch": 0.3423281465432342, + "grad_norm": 2.853557586669922, + "learning_rate": 0.00015305475630593516, + "loss": 1.5067, + "step": 9559 + }, + { + "epoch": 0.3423639586728025, + "grad_norm": 2.141958713531494, + "learning_rate": 0.00015304492398581046, + "loss": 1.4039, + "step": 9560 + }, + { + "epoch": 0.3423997708023708, + "grad_norm": 2.1874539852142334, + "learning_rate": 0.0001530350909520348, + "loss": 1.6577, + "step": 9561 + }, + { + "epoch": 0.34243558293193904, + "grad_norm": 1.843515396118164, + "learning_rate": 0.00015302525720474038, + "loss": 1.6085, + "step": 9562 + }, + { + "epoch": 0.3424713950615073, + "grad_norm": 2.459810733795166, + "learning_rate": 0.00015301542274405948, + "loss": 1.5343, + "step": 9563 + }, + { + "epoch": 0.34250720719107564, + "grad_norm": 2.1196999549865723, + "learning_rate": 0.00015300558757012448, + "loss": 1.2307, + "step": 9564 + }, + { + "epoch": 0.3425430193206439, + "grad_norm": 5.689758777618408, + "learning_rate": 0.00015299575168306774, + "loss": 1.5805, + "step": 9565 + }, + { + "epoch": 0.3425788314502122, + "grad_norm": 1.7519400119781494, + "learning_rate": 0.00015298591508302142, + "loss": 1.3325, + "step": 9566 + }, + { + "epoch": 0.3426146435797805, + "grad_norm": 1.4828169345855713, + "learning_rate": 0.000152976077770118, + "loss": 1.5663, + "step": 9567 + }, + { + "epoch": 0.34265045570934877, + "grad_norm": 1.916289210319519, + "learning_rate": 0.00015296623974448982, + "loss": 1.7129, + "step": 9568 + }, + { + "epoch": 0.34268626783891704, + "grad_norm": 1.342599630355835, + "learning_rate": 0.00015295640100626914, + "loss": 1.3287, + "step": 9569 + }, + { + "epoch": 0.3427220799684853, + "grad_norm": 1.6580992937088013, + "learning_rate": 0.00015294656155558843, + "loss": 1.2133, + "step": 9570 + }, + { + "epoch": 0.34275789209805363, + "grad_norm": 1.2530838251113892, + "learning_rate": 0.00015293672139258003, + "loss": 1.3984, + "step": 9571 + }, + { + "epoch": 0.3427937042276219, + "grad_norm": 1.6452621221542358, + "learning_rate": 0.00015292688051737633, + "loss": 1.5749, + "step": 9572 + }, + { + "epoch": 0.34282951635719017, + "grad_norm": 1.9408454895019531, + "learning_rate": 0.0001529170389301097, + "loss": 1.5734, + "step": 9573 + }, + { + "epoch": 0.3428653284867585, + "grad_norm": 1.6352133750915527, + "learning_rate": 0.00015290719663091262, + "loss": 1.5879, + "step": 9574 + }, + { + "epoch": 0.34290114061632676, + "grad_norm": 2.4092254638671875, + "learning_rate": 0.00015289735361991743, + "loss": 1.4012, + "step": 9575 + }, + { + "epoch": 0.34293695274589503, + "grad_norm": 1.5429397821426392, + "learning_rate": 0.00015288750989725657, + "loss": 1.2422, + "step": 9576 + }, + { + "epoch": 0.3429727648754633, + "grad_norm": 1.6832060813903809, + "learning_rate": 0.00015287766546306247, + "loss": 1.6365, + "step": 9577 + }, + { + "epoch": 0.3430085770050316, + "grad_norm": 1.3857985734939575, + "learning_rate": 0.00015286782031746763, + "loss": 1.4832, + "step": 9578 + }, + { + "epoch": 0.3430443891345999, + "grad_norm": 2.488194227218628, + "learning_rate": 0.00015285797446060442, + "loss": 1.8965, + "step": 9579 + }, + { + "epoch": 0.34308020126416816, + "grad_norm": 1.5207929611206055, + "learning_rate": 0.00015284812789260536, + "loss": 1.5894, + "step": 9580 + }, + { + "epoch": 0.3431160133937365, + "grad_norm": 1.9598724842071533, + "learning_rate": 0.00015283828061360291, + "loss": 1.423, + "step": 9581 + }, + { + "epoch": 0.34315182552330475, + "grad_norm": 2.2789857387542725, + "learning_rate": 0.00015282843262372955, + "loss": 1.7208, + "step": 9582 + }, + { + "epoch": 0.343187637652873, + "grad_norm": 1.4618828296661377, + "learning_rate": 0.0001528185839231178, + "loss": 1.7469, + "step": 9583 + }, + { + "epoch": 0.3432234497824413, + "grad_norm": 1.8113200664520264, + "learning_rate": 0.00015280873451190008, + "loss": 1.5084, + "step": 9584 + }, + { + "epoch": 0.3432592619120096, + "grad_norm": 2.0830557346343994, + "learning_rate": 0.000152798884390209, + "loss": 1.5144, + "step": 9585 + }, + { + "epoch": 0.3432950740415779, + "grad_norm": 2.5192768573760986, + "learning_rate": 0.000152789033558177, + "loss": 1.6695, + "step": 9586 + }, + { + "epoch": 0.34333088617114615, + "grad_norm": 2.1534531116485596, + "learning_rate": 0.0001527791820159367, + "loss": 1.5468, + "step": 9587 + }, + { + "epoch": 0.3433666983007145, + "grad_norm": 1.605986475944519, + "learning_rate": 0.00015276932976362052, + "loss": 1.2892, + "step": 9588 + }, + { + "epoch": 0.34340251043028275, + "grad_norm": 1.4528756141662598, + "learning_rate": 0.00015275947680136112, + "loss": 1.5642, + "step": 9589 + }, + { + "epoch": 0.343438322559851, + "grad_norm": 1.5881379842758179, + "learning_rate": 0.000152749623129291, + "loss": 1.5147, + "step": 9590 + }, + { + "epoch": 0.3434741346894193, + "grad_norm": 1.604173183441162, + "learning_rate": 0.00015273976874754274, + "loss": 1.5219, + "step": 9591 + }, + { + "epoch": 0.3435099468189876, + "grad_norm": 1.7604161500930786, + "learning_rate": 0.00015272991365624896, + "loss": 1.727, + "step": 9592 + }, + { + "epoch": 0.3435457589485559, + "grad_norm": 1.5851354598999023, + "learning_rate": 0.00015272005785554215, + "loss": 1.7405, + "step": 9593 + }, + { + "epoch": 0.34358157107812415, + "grad_norm": 1.7585153579711914, + "learning_rate": 0.000152710201345555, + "loss": 1.5413, + "step": 9594 + }, + { + "epoch": 0.3436173832076925, + "grad_norm": 2.0630173683166504, + "learning_rate": 0.00015270034412642007, + "loss": 1.7264, + "step": 9595 + }, + { + "epoch": 0.34365319533726074, + "grad_norm": 2.172152042388916, + "learning_rate": 0.00015269048619827, + "loss": 1.7205, + "step": 9596 + }, + { + "epoch": 0.343689007466829, + "grad_norm": 2.5975513458251953, + "learning_rate": 0.0001526806275612374, + "loss": 1.9148, + "step": 9597 + }, + { + "epoch": 0.3437248195963973, + "grad_norm": 1.559423804283142, + "learning_rate": 0.00015267076821545489, + "loss": 1.6451, + "step": 9598 + }, + { + "epoch": 0.3437606317259656, + "grad_norm": 2.140960693359375, + "learning_rate": 0.00015266090816105514, + "loss": 1.9031, + "step": 9599 + }, + { + "epoch": 0.3437964438555339, + "grad_norm": 1.62228262424469, + "learning_rate": 0.00015265104739817082, + "loss": 1.8635, + "step": 9600 + }, + { + "epoch": 0.34383225598510214, + "grad_norm": 1.7803810834884644, + "learning_rate": 0.00015264118592693457, + "loss": 1.2803, + "step": 9601 + }, + { + "epoch": 0.34386806811467047, + "grad_norm": 1.7218365669250488, + "learning_rate": 0.00015263132374747907, + "loss": 1.3518, + "step": 9602 + }, + { + "epoch": 0.34390388024423874, + "grad_norm": 1.9356237649917603, + "learning_rate": 0.00015262146085993697, + "loss": 1.7451, + "step": 9603 + }, + { + "epoch": 0.343939692373807, + "grad_norm": 1.4498659372329712, + "learning_rate": 0.00015261159726444098, + "loss": 1.7193, + "step": 9604 + }, + { + "epoch": 0.3439755045033753, + "grad_norm": 1.850898027420044, + "learning_rate": 0.00015260173296112385, + "loss": 1.4767, + "step": 9605 + }, + { + "epoch": 0.3440113166329436, + "grad_norm": 1.6529873609542847, + "learning_rate": 0.00015259186795011823, + "loss": 1.9236, + "step": 9606 + }, + { + "epoch": 0.34404712876251187, + "grad_norm": 2.0878450870513916, + "learning_rate": 0.0001525820022315569, + "loss": 1.684, + "step": 9607 + }, + { + "epoch": 0.34408294089208014, + "grad_norm": 1.974679708480835, + "learning_rate": 0.0001525721358055725, + "loss": 1.4546, + "step": 9608 + }, + { + "epoch": 0.3441187530216484, + "grad_norm": 1.576131820678711, + "learning_rate": 0.0001525622686722979, + "loss": 1.5243, + "step": 9609 + }, + { + "epoch": 0.34415456515121673, + "grad_norm": 2.246279716491699, + "learning_rate": 0.00015255240083186572, + "loss": 1.2719, + "step": 9610 + }, + { + "epoch": 0.344190377280785, + "grad_norm": 1.4744218587875366, + "learning_rate": 0.00015254253228440877, + "loss": 1.569, + "step": 9611 + }, + { + "epoch": 0.34422618941035327, + "grad_norm": 2.0905468463897705, + "learning_rate": 0.00015253266303005987, + "loss": 1.3189, + "step": 9612 + }, + { + "epoch": 0.3442620015399216, + "grad_norm": 1.9109582901000977, + "learning_rate": 0.00015252279306895172, + "loss": 1.3807, + "step": 9613 + }, + { + "epoch": 0.34429781366948986, + "grad_norm": 1.441930890083313, + "learning_rate": 0.00015251292240121714, + "loss": 1.3186, + "step": 9614 + }, + { + "epoch": 0.34433362579905813, + "grad_norm": 1.6397720575332642, + "learning_rate": 0.0001525030510269889, + "loss": 1.477, + "step": 9615 + }, + { + "epoch": 0.3443694379286264, + "grad_norm": 1.3502668142318726, + "learning_rate": 0.00015249317894639987, + "loss": 1.5195, + "step": 9616 + }, + { + "epoch": 0.3444052500581947, + "grad_norm": 1.8765751123428345, + "learning_rate": 0.00015248330615958282, + "loss": 1.4637, + "step": 9617 + }, + { + "epoch": 0.344441062187763, + "grad_norm": 1.7291653156280518, + "learning_rate": 0.00015247343266667061, + "loss": 1.5855, + "step": 9618 + }, + { + "epoch": 0.34447687431733126, + "grad_norm": 1.695115566253662, + "learning_rate": 0.00015246355846779602, + "loss": 1.8087, + "step": 9619 + }, + { + "epoch": 0.3445126864468996, + "grad_norm": 1.4280116558074951, + "learning_rate": 0.00015245368356309194, + "loss": 1.3901, + "step": 9620 + }, + { + "epoch": 0.34454849857646785, + "grad_norm": 1.5115997791290283, + "learning_rate": 0.00015244380795269118, + "loss": 1.7692, + "step": 9621 + }, + { + "epoch": 0.3445843107060361, + "grad_norm": 1.878031849861145, + "learning_rate": 0.00015243393163672664, + "loss": 1.469, + "step": 9622 + }, + { + "epoch": 0.3446201228356044, + "grad_norm": 1.791983723640442, + "learning_rate": 0.00015242405461533118, + "loss": 1.6334, + "step": 9623 + }, + { + "epoch": 0.3446559349651727, + "grad_norm": 1.6626172065734863, + "learning_rate": 0.0001524141768886377, + "loss": 1.527, + "step": 9624 + }, + { + "epoch": 0.344691747094741, + "grad_norm": 2.501840829849243, + "learning_rate": 0.0001524042984567791, + "loss": 1.6678, + "step": 9625 + }, + { + "epoch": 0.34472755922430925, + "grad_norm": 1.8956242799758911, + "learning_rate": 0.0001523944193198882, + "loss": 1.4583, + "step": 9626 + }, + { + "epoch": 0.3447633713538776, + "grad_norm": 2.0565409660339355, + "learning_rate": 0.00015238453947809805, + "loss": 1.697, + "step": 9627 + }, + { + "epoch": 0.34479918348344585, + "grad_norm": 2.0155301094055176, + "learning_rate": 0.00015237465893154143, + "loss": 1.71, + "step": 9628 + }, + { + "epoch": 0.3448349956130141, + "grad_norm": 2.654571056365967, + "learning_rate": 0.00015236477768035137, + "loss": 1.5797, + "step": 9629 + }, + { + "epoch": 0.3448708077425824, + "grad_norm": 1.6048696041107178, + "learning_rate": 0.00015235489572466078, + "loss": 1.7246, + "step": 9630 + }, + { + "epoch": 0.3449066198721507, + "grad_norm": 2.183603525161743, + "learning_rate": 0.00015234501306460256, + "loss": 1.3855, + "step": 9631 + }, + { + "epoch": 0.344942432001719, + "grad_norm": 1.7944676876068115, + "learning_rate": 0.0001523351297003097, + "loss": 1.585, + "step": 9632 + }, + { + "epoch": 0.34497824413128725, + "grad_norm": 1.8598835468292236, + "learning_rate": 0.00015232524563191523, + "loss": 1.3623, + "step": 9633 + }, + { + "epoch": 0.3450140562608556, + "grad_norm": 2.6025619506835938, + "learning_rate": 0.00015231536085955205, + "loss": 1.6624, + "step": 9634 + }, + { + "epoch": 0.34504986839042384, + "grad_norm": 1.3773020505905151, + "learning_rate": 0.00015230547538335317, + "loss": 1.68, + "step": 9635 + }, + { + "epoch": 0.3450856805199921, + "grad_norm": 2.4558515548706055, + "learning_rate": 0.00015229558920345162, + "loss": 1.5141, + "step": 9636 + }, + { + "epoch": 0.3451214926495604, + "grad_norm": 1.6440409421920776, + "learning_rate": 0.00015228570231998033, + "loss": 1.5429, + "step": 9637 + }, + { + "epoch": 0.3451573047791287, + "grad_norm": 1.8336268663406372, + "learning_rate": 0.00015227581473307238, + "loss": 1.559, + "step": 9638 + }, + { + "epoch": 0.345193116908697, + "grad_norm": 1.4943885803222656, + "learning_rate": 0.00015226592644286075, + "loss": 1.3127, + "step": 9639 + }, + { + "epoch": 0.34522892903826524, + "grad_norm": 1.8755306005477905, + "learning_rate": 0.00015225603744947852, + "loss": 1.5688, + "step": 9640 + }, + { + "epoch": 0.34526474116783357, + "grad_norm": 1.2759945392608643, + "learning_rate": 0.0001522461477530587, + "loss": 1.5486, + "step": 9641 + }, + { + "epoch": 0.34530055329740184, + "grad_norm": 1.2741237878799438, + "learning_rate": 0.00015223625735373436, + "loss": 1.5329, + "step": 9642 + }, + { + "epoch": 0.3453363654269701, + "grad_norm": 1.827254056930542, + "learning_rate": 0.00015222636625163854, + "loss": 1.7189, + "step": 9643 + }, + { + "epoch": 0.3453721775565384, + "grad_norm": 1.470656394958496, + "learning_rate": 0.00015221647444690437, + "loss": 1.3983, + "step": 9644 + }, + { + "epoch": 0.3454079896861067, + "grad_norm": 1.6272372007369995, + "learning_rate": 0.00015220658193966489, + "loss": 1.4919, + "step": 9645 + }, + { + "epoch": 0.34544380181567497, + "grad_norm": 1.6300824880599976, + "learning_rate": 0.00015219668873005314, + "loss": 1.567, + "step": 9646 + }, + { + "epoch": 0.34547961394524324, + "grad_norm": 1.6557114124298096, + "learning_rate": 0.0001521867948182023, + "loss": 1.5153, + "step": 9647 + }, + { + "epoch": 0.34551542607481156, + "grad_norm": 1.9078369140625, + "learning_rate": 0.00015217690020424547, + "loss": 1.8161, + "step": 9648 + }, + { + "epoch": 0.34555123820437983, + "grad_norm": 1.5560898780822754, + "learning_rate": 0.00015216700488831573, + "loss": 1.6721, + "step": 9649 + }, + { + "epoch": 0.3455870503339481, + "grad_norm": 1.4022656679153442, + "learning_rate": 0.00015215710887054622, + "loss": 1.5221, + "step": 9650 + }, + { + "epoch": 0.34562286246351637, + "grad_norm": 1.6788058280944824, + "learning_rate": 0.00015214721215107011, + "loss": 1.459, + "step": 9651 + }, + { + "epoch": 0.3456586745930847, + "grad_norm": 1.7111127376556396, + "learning_rate": 0.0001521373147300205, + "loss": 1.3335, + "step": 9652 + }, + { + "epoch": 0.34569448672265296, + "grad_norm": 1.7129048109054565, + "learning_rate": 0.0001521274166075306, + "loss": 1.5987, + "step": 9653 + }, + { + "epoch": 0.34573029885222123, + "grad_norm": 1.3389177322387695, + "learning_rate": 0.00015211751778373357, + "loss": 1.424, + "step": 9654 + }, + { + "epoch": 0.34576611098178955, + "grad_norm": 1.848729133605957, + "learning_rate": 0.0001521076182587625, + "loss": 1.7741, + "step": 9655 + }, + { + "epoch": 0.3458019231113578, + "grad_norm": 1.724698781967163, + "learning_rate": 0.0001520977180327507, + "loss": 1.4231, + "step": 9656 + }, + { + "epoch": 0.3458377352409261, + "grad_norm": 2.336831569671631, + "learning_rate": 0.00015208781710583126, + "loss": 1.2322, + "step": 9657 + }, + { + "epoch": 0.34587354737049436, + "grad_norm": 2.277905225753784, + "learning_rate": 0.00015207791547813744, + "loss": 1.6277, + "step": 9658 + }, + { + "epoch": 0.3459093595000627, + "grad_norm": 1.8792333602905273, + "learning_rate": 0.00015206801314980245, + "loss": 1.7224, + "step": 9659 + }, + { + "epoch": 0.34594517162963095, + "grad_norm": 1.5436029434204102, + "learning_rate": 0.00015205811012095952, + "loss": 1.7911, + "step": 9660 + }, + { + "epoch": 0.3459809837591992, + "grad_norm": 1.6252689361572266, + "learning_rate": 0.00015204820639174184, + "loss": 1.5968, + "step": 9661 + }, + { + "epoch": 0.34601679588876755, + "grad_norm": 1.3083243370056152, + "learning_rate": 0.00015203830196228272, + "loss": 1.6488, + "step": 9662 + }, + { + "epoch": 0.3460526080183358, + "grad_norm": 2.9727373123168945, + "learning_rate": 0.00015202839683271536, + "loss": 1.2147, + "step": 9663 + }, + { + "epoch": 0.3460884201479041, + "grad_norm": 1.409303069114685, + "learning_rate": 0.000152018491003173, + "loss": 1.3595, + "step": 9664 + }, + { + "epoch": 0.34612423227747235, + "grad_norm": 1.4419803619384766, + "learning_rate": 0.00015200858447378897, + "loss": 1.5398, + "step": 9665 + }, + { + "epoch": 0.3461600444070407, + "grad_norm": 1.60395348072052, + "learning_rate": 0.0001519986772446965, + "loss": 1.44, + "step": 9666 + }, + { + "epoch": 0.34619585653660895, + "grad_norm": 1.5209178924560547, + "learning_rate": 0.00015198876931602894, + "loss": 1.6457, + "step": 9667 + }, + { + "epoch": 0.3462316686661772, + "grad_norm": 1.4209908246994019, + "learning_rate": 0.00015197886068791952, + "loss": 1.5196, + "step": 9668 + }, + { + "epoch": 0.34626748079574554, + "grad_norm": 1.624914288520813, + "learning_rate": 0.00015196895136050157, + "loss": 1.3967, + "step": 9669 + }, + { + "epoch": 0.3463032929253138, + "grad_norm": 1.4199450016021729, + "learning_rate": 0.00015195904133390842, + "loss": 1.2958, + "step": 9670 + }, + { + "epoch": 0.3463391050548821, + "grad_norm": 1.8208658695220947, + "learning_rate": 0.00015194913060827343, + "loss": 1.4508, + "step": 9671 + }, + { + "epoch": 0.34637491718445035, + "grad_norm": 1.6885290145874023, + "learning_rate": 0.00015193921918372984, + "loss": 1.4547, + "step": 9672 + }, + { + "epoch": 0.3464107293140187, + "grad_norm": 1.8708484172821045, + "learning_rate": 0.00015192930706041112, + "loss": 1.5467, + "step": 9673 + }, + { + "epoch": 0.34644654144358694, + "grad_norm": 1.600899338722229, + "learning_rate": 0.00015191939423845049, + "loss": 1.4948, + "step": 9674 + }, + { + "epoch": 0.3464823535731552, + "grad_norm": 1.680953025817871, + "learning_rate": 0.0001519094807179814, + "loss": 1.5515, + "step": 9675 + }, + { + "epoch": 0.34651816570272354, + "grad_norm": 2.119535207748413, + "learning_rate": 0.00015189956649913722, + "loss": 1.5226, + "step": 9676 + }, + { + "epoch": 0.3465539778322918, + "grad_norm": 1.7617014646530151, + "learning_rate": 0.0001518896515820513, + "loss": 1.8543, + "step": 9677 + }, + { + "epoch": 0.3465897899618601, + "grad_norm": 1.7265313863754272, + "learning_rate": 0.00015187973596685706, + "loss": 1.5269, + "step": 9678 + }, + { + "epoch": 0.34662560209142834, + "grad_norm": 1.4291623830795288, + "learning_rate": 0.0001518698196536879, + "loss": 1.4145, + "step": 9679 + }, + { + "epoch": 0.34666141422099667, + "grad_norm": 1.4286699295043945, + "learning_rate": 0.00015185990264267725, + "loss": 1.4807, + "step": 9680 + }, + { + "epoch": 0.34669722635056494, + "grad_norm": 1.8250465393066406, + "learning_rate": 0.00015184998493395846, + "loss": 1.841, + "step": 9681 + }, + { + "epoch": 0.3467330384801332, + "grad_norm": 1.9351611137390137, + "learning_rate": 0.00015184006652766503, + "loss": 1.4977, + "step": 9682 + }, + { + "epoch": 0.34676885060970153, + "grad_norm": 2.0955495834350586, + "learning_rate": 0.00015183014742393036, + "loss": 1.6411, + "step": 9683 + }, + { + "epoch": 0.3468046627392698, + "grad_norm": 2.191845178604126, + "learning_rate": 0.0001518202276228879, + "loss": 1.3476, + "step": 9684 + }, + { + "epoch": 0.34684047486883807, + "grad_norm": 1.5463204383850098, + "learning_rate": 0.00015181030712467113, + "loss": 1.4183, + "step": 9685 + }, + { + "epoch": 0.34687628699840634, + "grad_norm": 1.43691086769104, + "learning_rate": 0.0001518003859294135, + "loss": 1.643, + "step": 9686 + }, + { + "epoch": 0.34691209912797466, + "grad_norm": 1.4780082702636719, + "learning_rate": 0.00015179046403724852, + "loss": 1.5861, + "step": 9687 + }, + { + "epoch": 0.34694791125754293, + "grad_norm": 1.961837887763977, + "learning_rate": 0.00015178054144830965, + "loss": 1.7576, + "step": 9688 + }, + { + "epoch": 0.3469837233871112, + "grad_norm": 1.8840779066085815, + "learning_rate": 0.0001517706181627304, + "loss": 1.3534, + "step": 9689 + }, + { + "epoch": 0.3470195355166795, + "grad_norm": 2.285611629486084, + "learning_rate": 0.0001517606941806442, + "loss": 1.8627, + "step": 9690 + }, + { + "epoch": 0.3470553476462478, + "grad_norm": 1.699450135231018, + "learning_rate": 0.00015175076950218468, + "loss": 1.3957, + "step": 9691 + }, + { + "epoch": 0.34709115977581606, + "grad_norm": 1.8687032461166382, + "learning_rate": 0.00015174084412748529, + "loss": 1.4801, + "step": 9692 + }, + { + "epoch": 0.34712697190538433, + "grad_norm": 1.4657593965530396, + "learning_rate": 0.00015173091805667957, + "loss": 1.5096, + "step": 9693 + }, + { + "epoch": 0.34716278403495265, + "grad_norm": 1.5671417713165283, + "learning_rate": 0.0001517209912899011, + "loss": 1.6317, + "step": 9694 + }, + { + "epoch": 0.3471985961645209, + "grad_norm": 1.8080321550369263, + "learning_rate": 0.00015171106382728342, + "loss": 1.2073, + "step": 9695 + }, + { + "epoch": 0.3472344082940892, + "grad_norm": 1.7904155254364014, + "learning_rate": 0.00015170113566896005, + "loss": 1.3794, + "step": 9696 + }, + { + "epoch": 0.3472702204236575, + "grad_norm": 1.9424508810043335, + "learning_rate": 0.0001516912068150646, + "loss": 1.6862, + "step": 9697 + }, + { + "epoch": 0.3473060325532258, + "grad_norm": 1.6134363412857056, + "learning_rate": 0.00015168127726573064, + "loss": 1.5705, + "step": 9698 + }, + { + "epoch": 0.34734184468279405, + "grad_norm": 2.201932430267334, + "learning_rate": 0.00015167134702109177, + "loss": 1.5044, + "step": 9699 + }, + { + "epoch": 0.3473776568123623, + "grad_norm": 1.6935111284255981, + "learning_rate": 0.00015166141608128158, + "loss": 1.7495, + "step": 9700 + }, + { + "epoch": 0.34741346894193065, + "grad_norm": 1.9058157205581665, + "learning_rate": 0.0001516514844464336, + "loss": 1.7792, + "step": 9701 + }, + { + "epoch": 0.3474492810714989, + "grad_norm": 1.739087462425232, + "learning_rate": 0.00015164155211668163, + "loss": 1.1891, + "step": 9702 + }, + { + "epoch": 0.3474850932010672, + "grad_norm": 2.0421783924102783, + "learning_rate": 0.00015163161909215913, + "loss": 1.4558, + "step": 9703 + }, + { + "epoch": 0.3475209053306355, + "grad_norm": 1.9003093242645264, + "learning_rate": 0.00015162168537299979, + "loss": 1.7258, + "step": 9704 + }, + { + "epoch": 0.3475567174602038, + "grad_norm": 1.799619197845459, + "learning_rate": 0.00015161175095933729, + "loss": 1.4118, + "step": 9705 + }, + { + "epoch": 0.34759252958977205, + "grad_norm": 2.3882851600646973, + "learning_rate": 0.00015160181585130523, + "loss": 1.4103, + "step": 9706 + }, + { + "epoch": 0.3476283417193403, + "grad_norm": 1.882601022720337, + "learning_rate": 0.00015159188004903733, + "loss": 1.6516, + "step": 9707 + }, + { + "epoch": 0.34766415384890864, + "grad_norm": 2.743563413619995, + "learning_rate": 0.0001515819435526672, + "loss": 1.6921, + "step": 9708 + }, + { + "epoch": 0.3476999659784769, + "grad_norm": 1.8046648502349854, + "learning_rate": 0.00015157200636232857, + "loss": 1.6462, + "step": 9709 + }, + { + "epoch": 0.3477357781080452, + "grad_norm": 1.5821819305419922, + "learning_rate": 0.0001515620684781551, + "loss": 1.6373, + "step": 9710 + }, + { + "epoch": 0.3477715902376135, + "grad_norm": 1.2368242740631104, + "learning_rate": 0.00015155212990028053, + "loss": 1.4887, + "step": 9711 + }, + { + "epoch": 0.3478074023671818, + "grad_norm": 1.3953620195388794, + "learning_rate": 0.00015154219062883854, + "loss": 1.348, + "step": 9712 + }, + { + "epoch": 0.34784321449675004, + "grad_norm": 1.7120468616485596, + "learning_rate": 0.00015153225066396288, + "loss": 1.4454, + "step": 9713 + }, + { + "epoch": 0.3478790266263183, + "grad_norm": 1.358496904373169, + "learning_rate": 0.00015152231000578723, + "loss": 1.3551, + "step": 9714 + }, + { + "epoch": 0.34791483875588664, + "grad_norm": 1.2302814722061157, + "learning_rate": 0.00015151236865444537, + "loss": 1.5733, + "step": 9715 + }, + { + "epoch": 0.3479506508854549, + "grad_norm": 2.139761447906494, + "learning_rate": 0.00015150242661007103, + "loss": 1.3676, + "step": 9716 + }, + { + "epoch": 0.3479864630150232, + "grad_norm": 2.1644511222839355, + "learning_rate": 0.000151492483872798, + "loss": 1.493, + "step": 9717 + }, + { + "epoch": 0.3480222751445915, + "grad_norm": 2.0593531131744385, + "learning_rate": 0.00015148254044276, + "loss": 1.788, + "step": 9718 + }, + { + "epoch": 0.34805808727415977, + "grad_norm": 1.453326940536499, + "learning_rate": 0.00015147259632009082, + "loss": 1.2916, + "step": 9719 + }, + { + "epoch": 0.34809389940372804, + "grad_norm": 1.3180242776870728, + "learning_rate": 0.00015146265150492428, + "loss": 1.4568, + "step": 9720 + }, + { + "epoch": 0.3481297115332963, + "grad_norm": 1.6377161741256714, + "learning_rate": 0.0001514527059973941, + "loss": 1.5079, + "step": 9721 + }, + { + "epoch": 0.34816552366286463, + "grad_norm": 1.53775155544281, + "learning_rate": 0.00015144275979763416, + "loss": 1.7657, + "step": 9722 + }, + { + "epoch": 0.3482013357924329, + "grad_norm": 1.615124225616455, + "learning_rate": 0.0001514328129057782, + "loss": 1.5548, + "step": 9723 + }, + { + "epoch": 0.34823714792200117, + "grad_norm": 2.9852917194366455, + "learning_rate": 0.00015142286532196018, + "loss": 1.5382, + "step": 9724 + }, + { + "epoch": 0.3482729600515695, + "grad_norm": 1.7540369033813477, + "learning_rate": 0.00015141291704631374, + "loss": 1.539, + "step": 9725 + }, + { + "epoch": 0.34830877218113776, + "grad_norm": 1.644862174987793, + "learning_rate": 0.00015140296807897289, + "loss": 1.495, + "step": 9726 + }, + { + "epoch": 0.34834458431070603, + "grad_norm": 1.7135051488876343, + "learning_rate": 0.00015139301842007137, + "loss": 1.623, + "step": 9727 + }, + { + "epoch": 0.3483803964402743, + "grad_norm": 1.8585213422775269, + "learning_rate": 0.0001513830680697431, + "loss": 1.4913, + "step": 9728 + }, + { + "epoch": 0.3484162085698426, + "grad_norm": 1.7377535104751587, + "learning_rate": 0.0001513731170281219, + "loss": 1.3441, + "step": 9729 + }, + { + "epoch": 0.3484520206994109, + "grad_norm": 1.4166041612625122, + "learning_rate": 0.00015136316529534168, + "loss": 1.6572, + "step": 9730 + }, + { + "epoch": 0.34848783282897916, + "grad_norm": 2.0903475284576416, + "learning_rate": 0.00015135321287153636, + "loss": 1.2165, + "step": 9731 + }, + { + "epoch": 0.3485236449585475, + "grad_norm": 1.809444785118103, + "learning_rate": 0.00015134325975683975, + "loss": 1.5006, + "step": 9732 + }, + { + "epoch": 0.34855945708811575, + "grad_norm": 2.5409252643585205, + "learning_rate": 0.00015133330595138586, + "loss": 1.9359, + "step": 9733 + }, + { + "epoch": 0.348595269217684, + "grad_norm": 1.4193686246871948, + "learning_rate": 0.00015132335145530854, + "loss": 1.4533, + "step": 9734 + }, + { + "epoch": 0.3486310813472523, + "grad_norm": 1.9613677263259888, + "learning_rate": 0.00015131339626874173, + "loss": 1.3445, + "step": 9735 + }, + { + "epoch": 0.3486668934768206, + "grad_norm": 1.5214570760726929, + "learning_rate": 0.00015130344039181935, + "loss": 1.2932, + "step": 9736 + }, + { + "epoch": 0.3487027056063889, + "grad_norm": 1.8563958406448364, + "learning_rate": 0.00015129348382467535, + "loss": 1.876, + "step": 9737 + }, + { + "epoch": 0.34873851773595715, + "grad_norm": 1.4242786169052124, + "learning_rate": 0.00015128352656744372, + "loss": 1.6204, + "step": 9738 + }, + { + "epoch": 0.3487743298655255, + "grad_norm": 2.609499931335449, + "learning_rate": 0.00015127356862025835, + "loss": 1.422, + "step": 9739 + }, + { + "epoch": 0.34881014199509375, + "grad_norm": 1.4955037832260132, + "learning_rate": 0.00015126360998325326, + "loss": 1.4913, + "step": 9740 + }, + { + "epoch": 0.348845954124662, + "grad_norm": 1.7982819080352783, + "learning_rate": 0.00015125365065656244, + "loss": 1.2715, + "step": 9741 + }, + { + "epoch": 0.3488817662542303, + "grad_norm": 2.949068307876587, + "learning_rate": 0.00015124369064031988, + "loss": 1.8439, + "step": 9742 + }, + { + "epoch": 0.3489175783837986, + "grad_norm": 1.6375192403793335, + "learning_rate": 0.00015123372993465953, + "loss": 1.7551, + "step": 9743 + }, + { + "epoch": 0.3489533905133669, + "grad_norm": 1.3728387355804443, + "learning_rate": 0.00015122376853971545, + "loss": 1.5968, + "step": 9744 + }, + { + "epoch": 0.34898920264293515, + "grad_norm": 2.1111888885498047, + "learning_rate": 0.00015121380645562163, + "loss": 1.5258, + "step": 9745 + }, + { + "epoch": 0.3490250147725035, + "grad_norm": 1.3476275205612183, + "learning_rate": 0.0001512038436825121, + "loss": 1.368, + "step": 9746 + }, + { + "epoch": 0.34906082690207174, + "grad_norm": 1.5277924537658691, + "learning_rate": 0.0001511938802205209, + "loss": 1.5331, + "step": 9747 + }, + { + "epoch": 0.34909663903164, + "grad_norm": 1.4983168840408325, + "learning_rate": 0.0001511839160697821, + "loss": 1.3779, + "step": 9748 + }, + { + "epoch": 0.3491324511612083, + "grad_norm": 1.4858800172805786, + "learning_rate": 0.00015117395123042968, + "loss": 1.2519, + "step": 9749 + }, + { + "epoch": 0.3491682632907766, + "grad_norm": 1.6948373317718506, + "learning_rate": 0.0001511639857025978, + "loss": 1.7947, + "step": 9750 + }, + { + "epoch": 0.3492040754203449, + "grad_norm": 1.306716799736023, + "learning_rate": 0.0001511540194864205, + "loss": 1.4975, + "step": 9751 + }, + { + "epoch": 0.34923988754991314, + "grad_norm": 1.2586356401443481, + "learning_rate": 0.0001511440525820318, + "loss": 1.5344, + "step": 9752 + }, + { + "epoch": 0.34927569967948147, + "grad_norm": 1.4947229623794556, + "learning_rate": 0.0001511340849895659, + "loss": 1.4292, + "step": 9753 + }, + { + "epoch": 0.34931151180904974, + "grad_norm": 1.699182391166687, + "learning_rate": 0.0001511241167091568, + "loss": 1.6858, + "step": 9754 + }, + { + "epoch": 0.349347323938618, + "grad_norm": 1.7800447940826416, + "learning_rate": 0.0001511141477409387, + "loss": 1.441, + "step": 9755 + }, + { + "epoch": 0.3493831360681863, + "grad_norm": 1.2749764919281006, + "learning_rate": 0.00015110417808504562, + "loss": 1.682, + "step": 9756 + }, + { + "epoch": 0.3494189481977546, + "grad_norm": 1.828865885734558, + "learning_rate": 0.00015109420774161178, + "loss": 1.5552, + "step": 9757 + }, + { + "epoch": 0.34945476032732287, + "grad_norm": 1.4188942909240723, + "learning_rate": 0.00015108423671077124, + "loss": 1.6542, + "step": 9758 + }, + { + "epoch": 0.34949057245689114, + "grad_norm": 1.8428118228912354, + "learning_rate": 0.00015107426499265825, + "loss": 1.5784, + "step": 9759 + }, + { + "epoch": 0.34952638458645946, + "grad_norm": 1.4908447265625, + "learning_rate": 0.00015106429258740687, + "loss": 1.6085, + "step": 9760 + }, + { + "epoch": 0.34956219671602773, + "grad_norm": 1.5967750549316406, + "learning_rate": 0.00015105431949515133, + "loss": 1.1694, + "step": 9761 + }, + { + "epoch": 0.349598008845596, + "grad_norm": 1.6794275045394897, + "learning_rate": 0.00015104434571602577, + "loss": 1.4624, + "step": 9762 + }, + { + "epoch": 0.34963382097516427, + "grad_norm": 1.981397271156311, + "learning_rate": 0.00015103437125016432, + "loss": 1.3633, + "step": 9763 + }, + { + "epoch": 0.3496696331047326, + "grad_norm": 2.4205143451690674, + "learning_rate": 0.0001510243960977013, + "loss": 1.7959, + "step": 9764 + }, + { + "epoch": 0.34970544523430086, + "grad_norm": 2.2734334468841553, + "learning_rate": 0.00015101442025877083, + "loss": 1.3619, + "step": 9765 + }, + { + "epoch": 0.34974125736386913, + "grad_norm": 1.8504884243011475, + "learning_rate": 0.00015100444373350715, + "loss": 1.6551, + "step": 9766 + }, + { + "epoch": 0.34977706949343745, + "grad_norm": 1.5831323862075806, + "learning_rate": 0.00015099446652204446, + "loss": 1.5193, + "step": 9767 + }, + { + "epoch": 0.3498128816230057, + "grad_norm": 1.8388887643814087, + "learning_rate": 0.00015098448862451702, + "loss": 1.4238, + "step": 9768 + }, + { + "epoch": 0.349848693752574, + "grad_norm": 1.5945453643798828, + "learning_rate": 0.00015097451004105904, + "loss": 1.4274, + "step": 9769 + }, + { + "epoch": 0.34988450588214226, + "grad_norm": 1.5054142475128174, + "learning_rate": 0.00015096453077180475, + "loss": 1.506, + "step": 9770 + }, + { + "epoch": 0.3499203180117106, + "grad_norm": 1.8652490377426147, + "learning_rate": 0.00015095455081688846, + "loss": 1.8354, + "step": 9771 + }, + { + "epoch": 0.34995613014127885, + "grad_norm": 1.8523324728012085, + "learning_rate": 0.0001509445701764444, + "loss": 1.6814, + "step": 9772 + }, + { + "epoch": 0.3499919422708471, + "grad_norm": 1.3865050077438354, + "learning_rate": 0.00015093458885060687, + "loss": 1.2962, + "step": 9773 + }, + { + "epoch": 0.35002775440041545, + "grad_norm": 1.6977640390396118, + "learning_rate": 0.00015092460683951015, + "loss": 1.4121, + "step": 9774 + }, + { + "epoch": 0.3500635665299837, + "grad_norm": 2.1910016536712646, + "learning_rate": 0.00015091462414328855, + "loss": 1.9099, + "step": 9775 + }, + { + "epoch": 0.350099378659552, + "grad_norm": 2.3362877368927, + "learning_rate": 0.00015090464076207634, + "loss": 1.7061, + "step": 9776 + }, + { + "epoch": 0.35013519078912025, + "grad_norm": 1.5573481321334839, + "learning_rate": 0.0001508946566960079, + "loss": 1.6889, + "step": 9777 + }, + { + "epoch": 0.3501710029186886, + "grad_norm": 1.7748955488204956, + "learning_rate": 0.0001508846719452174, + "loss": 1.3411, + "step": 9778 + }, + { + "epoch": 0.35020681504825685, + "grad_norm": 1.7088675498962402, + "learning_rate": 0.00015087468650983935, + "loss": 1.5765, + "step": 9779 + }, + { + "epoch": 0.3502426271778251, + "grad_norm": 2.062631368637085, + "learning_rate": 0.00015086470039000802, + "loss": 1.5179, + "step": 9780 + }, + { + "epoch": 0.35027843930739344, + "grad_norm": 1.7780126333236694, + "learning_rate": 0.00015085471358585774, + "loss": 1.5016, + "step": 9781 + }, + { + "epoch": 0.3503142514369617, + "grad_norm": 1.4579542875289917, + "learning_rate": 0.00015084472609752284, + "loss": 1.5187, + "step": 9782 + }, + { + "epoch": 0.35035006356653, + "grad_norm": 1.4404877424240112, + "learning_rate": 0.0001508347379251378, + "loss": 1.5978, + "step": 9783 + }, + { + "epoch": 0.35038587569609825, + "grad_norm": 1.950384259223938, + "learning_rate": 0.0001508247490688369, + "loss": 1.4719, + "step": 9784 + }, + { + "epoch": 0.3504216878256666, + "grad_norm": 1.659462571144104, + "learning_rate": 0.0001508147595287546, + "loss": 1.6149, + "step": 9785 + }, + { + "epoch": 0.35045749995523484, + "grad_norm": 1.4204747676849365, + "learning_rate": 0.00015080476930502522, + "loss": 1.6546, + "step": 9786 + }, + { + "epoch": 0.3504933120848031, + "grad_norm": 1.5785166025161743, + "learning_rate": 0.0001507947783977832, + "loss": 1.5731, + "step": 9787 + }, + { + "epoch": 0.35052912421437143, + "grad_norm": 1.9538383483886719, + "learning_rate": 0.00015078478680716299, + "loss": 1.473, + "step": 9788 + }, + { + "epoch": 0.3505649363439397, + "grad_norm": 1.3947443962097168, + "learning_rate": 0.00015077479453329894, + "loss": 1.327, + "step": 9789 + }, + { + "epoch": 0.350600748473508, + "grad_norm": 2.454119920730591, + "learning_rate": 0.0001507648015763256, + "loss": 1.497, + "step": 9790 + }, + { + "epoch": 0.35063656060307624, + "grad_norm": 2.188707113265991, + "learning_rate": 0.00015075480793637724, + "loss": 1.8418, + "step": 9791 + }, + { + "epoch": 0.35067237273264457, + "grad_norm": 1.5564440488815308, + "learning_rate": 0.00015074481361358844, + "loss": 1.6697, + "step": 9792 + }, + { + "epoch": 0.35070818486221284, + "grad_norm": 2.128994941711426, + "learning_rate": 0.00015073481860809363, + "loss": 1.3797, + "step": 9793 + }, + { + "epoch": 0.3507439969917811, + "grad_norm": 1.9400798082351685, + "learning_rate": 0.00015072482292002732, + "loss": 1.5212, + "step": 9794 + }, + { + "epoch": 0.35077980912134943, + "grad_norm": 1.5561938285827637, + "learning_rate": 0.0001507148265495239, + "loss": 1.4001, + "step": 9795 + }, + { + "epoch": 0.3508156212509177, + "grad_norm": 1.46834135055542, + "learning_rate": 0.00015070482949671794, + "loss": 1.3285, + "step": 9796 + }, + { + "epoch": 0.35085143338048597, + "grad_norm": 1.8120313882827759, + "learning_rate": 0.0001506948317617439, + "loss": 1.809, + "step": 9797 + }, + { + "epoch": 0.35088724551005424, + "grad_norm": 1.6378916501998901, + "learning_rate": 0.00015068483334473623, + "loss": 1.6557, + "step": 9798 + }, + { + "epoch": 0.35092305763962256, + "grad_norm": 1.4764701128005981, + "learning_rate": 0.00015067483424582956, + "loss": 1.5015, + "step": 9799 + }, + { + "epoch": 0.35095886976919083, + "grad_norm": 1.7734726667404175, + "learning_rate": 0.00015066483446515836, + "loss": 1.2686, + "step": 9800 + }, + { + "epoch": 0.3509946818987591, + "grad_norm": 1.5742462873458862, + "learning_rate": 0.00015065483400285716, + "loss": 1.6867, + "step": 9801 + }, + { + "epoch": 0.3510304940283274, + "grad_norm": 1.916930079460144, + "learning_rate": 0.00015064483285906052, + "loss": 1.4477, + "step": 9802 + }, + { + "epoch": 0.3510663061578957, + "grad_norm": 1.2710362672805786, + "learning_rate": 0.00015063483103390296, + "loss": 1.4712, + "step": 9803 + }, + { + "epoch": 0.35110211828746396, + "grad_norm": 1.4401026964187622, + "learning_rate": 0.00015062482852751908, + "loss": 1.4466, + "step": 9804 + }, + { + "epoch": 0.35113793041703223, + "grad_norm": 1.357136845588684, + "learning_rate": 0.0001506148253400434, + "loss": 1.4442, + "step": 9805 + }, + { + "epoch": 0.35117374254660055, + "grad_norm": 1.5309224128723145, + "learning_rate": 0.00015060482147161058, + "loss": 1.4533, + "step": 9806 + }, + { + "epoch": 0.3512095546761688, + "grad_norm": 1.6512397527694702, + "learning_rate": 0.00015059481692235514, + "loss": 1.596, + "step": 9807 + }, + { + "epoch": 0.3512453668057371, + "grad_norm": 1.5348283052444458, + "learning_rate": 0.0001505848116924117, + "loss": 1.4335, + "step": 9808 + }, + { + "epoch": 0.35128117893530536, + "grad_norm": 1.6342748403549194, + "learning_rate": 0.00015057480578191485, + "loss": 1.1626, + "step": 9809 + }, + { + "epoch": 0.3513169910648737, + "grad_norm": 1.6329182386398315, + "learning_rate": 0.00015056479919099927, + "loss": 1.5846, + "step": 9810 + }, + { + "epoch": 0.35135280319444195, + "grad_norm": 1.609890341758728, + "learning_rate": 0.0001505547919197995, + "loss": 1.5383, + "step": 9811 + }, + { + "epoch": 0.3513886153240102, + "grad_norm": 1.6261221170425415, + "learning_rate": 0.00015054478396845026, + "loss": 1.6088, + "step": 9812 + }, + { + "epoch": 0.35142442745357855, + "grad_norm": 1.727931261062622, + "learning_rate": 0.00015053477533708608, + "loss": 1.6369, + "step": 9813 + }, + { + "epoch": 0.3514602395831468, + "grad_norm": 1.760061264038086, + "learning_rate": 0.00015052476602584177, + "loss": 1.5403, + "step": 9814 + }, + { + "epoch": 0.3514960517127151, + "grad_norm": 1.8632763624191284, + "learning_rate": 0.00015051475603485183, + "loss": 1.1816, + "step": 9815 + }, + { + "epoch": 0.35153186384228335, + "grad_norm": 1.4889694452285767, + "learning_rate": 0.00015050474536425101, + "loss": 1.5174, + "step": 9816 + }, + { + "epoch": 0.3515676759718517, + "grad_norm": 1.9194355010986328, + "learning_rate": 0.00015049473401417403, + "loss": 1.6779, + "step": 9817 + }, + { + "epoch": 0.35160348810141995, + "grad_norm": 1.6100517511367798, + "learning_rate": 0.00015048472198475553, + "loss": 1.5647, + "step": 9818 + }, + { + "epoch": 0.3516393002309882, + "grad_norm": 1.5931763648986816, + "learning_rate": 0.00015047470927613018, + "loss": 1.6965, + "step": 9819 + }, + { + "epoch": 0.35167511236055654, + "grad_norm": 1.833192229270935, + "learning_rate": 0.00015046469588843276, + "loss": 1.5959, + "step": 9820 + }, + { + "epoch": 0.3517109244901248, + "grad_norm": 2.009188175201416, + "learning_rate": 0.00015045468182179795, + "loss": 1.8723, + "step": 9821 + }, + { + "epoch": 0.3517467366196931, + "grad_norm": 1.9709876775741577, + "learning_rate": 0.0001504446670763605, + "loss": 1.677, + "step": 9822 + }, + { + "epoch": 0.35178254874926135, + "grad_norm": 1.6057246923446655, + "learning_rate": 0.0001504346516522551, + "loss": 1.7295, + "step": 9823 + }, + { + "epoch": 0.3518183608788297, + "grad_norm": 1.5857231616973877, + "learning_rate": 0.00015042463554961648, + "loss": 1.4851, + "step": 9824 + }, + { + "epoch": 0.35185417300839794, + "grad_norm": 2.4867377281188965, + "learning_rate": 0.0001504146187685795, + "loss": 1.5757, + "step": 9825 + }, + { + "epoch": 0.3518899851379662, + "grad_norm": 1.5649453401565552, + "learning_rate": 0.0001504046013092788, + "loss": 1.2162, + "step": 9826 + }, + { + "epoch": 0.35192579726753453, + "grad_norm": 1.9253320693969727, + "learning_rate": 0.00015039458317184923, + "loss": 1.6233, + "step": 9827 + }, + { + "epoch": 0.3519616093971028, + "grad_norm": 1.2841295003890991, + "learning_rate": 0.00015038456435642554, + "loss": 1.5658, + "step": 9828 + }, + { + "epoch": 0.3519974215266711, + "grad_norm": 1.5589649677276611, + "learning_rate": 0.0001503745448631425, + "loss": 1.6228, + "step": 9829 + }, + { + "epoch": 0.35203323365623934, + "grad_norm": 2.096522808074951, + "learning_rate": 0.00015036452469213504, + "loss": 1.5682, + "step": 9830 + }, + { + "epoch": 0.35206904578580767, + "grad_norm": 1.6603925228118896, + "learning_rate": 0.00015035450384353775, + "loss": 1.6372, + "step": 9831 + }, + { + "epoch": 0.35210485791537594, + "grad_norm": 1.6871788501739502, + "learning_rate": 0.0001503444823174856, + "loss": 1.7357, + "step": 9832 + }, + { + "epoch": 0.3521406700449442, + "grad_norm": 1.7978812456130981, + "learning_rate": 0.0001503344601141134, + "loss": 1.3659, + "step": 9833 + }, + { + "epoch": 0.35217648217451253, + "grad_norm": 1.4551135301589966, + "learning_rate": 0.00015032443723355597, + "loss": 1.5676, + "step": 9834 + }, + { + "epoch": 0.3522122943040808, + "grad_norm": 2.247584819793701, + "learning_rate": 0.0001503144136759481, + "loss": 1.3566, + "step": 9835 + }, + { + "epoch": 0.35224810643364907, + "grad_norm": 1.6647343635559082, + "learning_rate": 0.00015030438944142475, + "loss": 1.5347, + "step": 9836 + }, + { + "epoch": 0.35228391856321734, + "grad_norm": 1.3498740196228027, + "learning_rate": 0.0001502943645301207, + "loss": 1.241, + "step": 9837 + }, + { + "epoch": 0.35231973069278566, + "grad_norm": 1.8701205253601074, + "learning_rate": 0.00015028433894217087, + "loss": 1.4011, + "step": 9838 + }, + { + "epoch": 0.35235554282235393, + "grad_norm": 1.4306402206420898, + "learning_rate": 0.0001502743126777101, + "loss": 1.493, + "step": 9839 + }, + { + "epoch": 0.3523913549519222, + "grad_norm": 1.9978787899017334, + "learning_rate": 0.0001502642857368733, + "loss": 1.591, + "step": 9840 + }, + { + "epoch": 0.3524271670814905, + "grad_norm": 1.7107425928115845, + "learning_rate": 0.00015025425811979542, + "loss": 1.2661, + "step": 9841 + }, + { + "epoch": 0.3524629792110588, + "grad_norm": 1.2457308769226074, + "learning_rate": 0.00015024422982661125, + "loss": 1.5245, + "step": 9842 + }, + { + "epoch": 0.35249879134062706, + "grad_norm": 1.4251497983932495, + "learning_rate": 0.00015023420085745584, + "loss": 1.5731, + "step": 9843 + }, + { + "epoch": 0.35253460347019533, + "grad_norm": 2.0676839351654053, + "learning_rate": 0.00015022417121246398, + "loss": 1.4724, + "step": 9844 + }, + { + "epoch": 0.35257041559976365, + "grad_norm": 1.7826956510543823, + "learning_rate": 0.00015021414089177077, + "loss": 1.7058, + "step": 9845 + }, + { + "epoch": 0.3526062277293319, + "grad_norm": 1.7209420204162598, + "learning_rate": 0.000150204109895511, + "loss": 1.5886, + "step": 9846 + }, + { + "epoch": 0.3526420398589002, + "grad_norm": 1.6267695426940918, + "learning_rate": 0.00015019407822381973, + "loss": 1.4277, + "step": 9847 + }, + { + "epoch": 0.3526778519884685, + "grad_norm": 1.7787023782730103, + "learning_rate": 0.00015018404587683186, + "loss": 1.7895, + "step": 9848 + }, + { + "epoch": 0.3527136641180368, + "grad_norm": 1.4573239088058472, + "learning_rate": 0.0001501740128546824, + "loss": 1.278, + "step": 9849 + }, + { + "epoch": 0.35274947624760505, + "grad_norm": 1.3814666271209717, + "learning_rate": 0.00015016397915750633, + "loss": 1.2868, + "step": 9850 + }, + { + "epoch": 0.3527852883771733, + "grad_norm": 2.118358850479126, + "learning_rate": 0.0001501539447854386, + "loss": 1.3829, + "step": 9851 + }, + { + "epoch": 0.35282110050674165, + "grad_norm": 1.9069745540618896, + "learning_rate": 0.00015014390973861424, + "loss": 1.7754, + "step": 9852 + }, + { + "epoch": 0.3528569126363099, + "grad_norm": 2.0276107788085938, + "learning_rate": 0.00015013387401716823, + "loss": 1.4873, + "step": 9853 + }, + { + "epoch": 0.3528927247658782, + "grad_norm": 2.06880784034729, + "learning_rate": 0.00015012383762123566, + "loss": 1.729, + "step": 9854 + }, + { + "epoch": 0.3529285368954465, + "grad_norm": 2.1052966117858887, + "learning_rate": 0.0001501138005509515, + "loss": 1.46, + "step": 9855 + }, + { + "epoch": 0.3529643490250148, + "grad_norm": 1.9557901620864868, + "learning_rate": 0.00015010376280645077, + "loss": 1.4189, + "step": 9856 + }, + { + "epoch": 0.35300016115458305, + "grad_norm": 1.9544841051101685, + "learning_rate": 0.00015009372438786858, + "loss": 1.7716, + "step": 9857 + }, + { + "epoch": 0.3530359732841513, + "grad_norm": 1.9688549041748047, + "learning_rate": 0.00015008368529533992, + "loss": 1.7624, + "step": 9858 + }, + { + "epoch": 0.35307178541371964, + "grad_norm": 1.3620836734771729, + "learning_rate": 0.00015007364552899988, + "loss": 1.6062, + "step": 9859 + }, + { + "epoch": 0.3531075975432879, + "grad_norm": 1.8967136144638062, + "learning_rate": 0.0001500636050889835, + "loss": 1.1698, + "step": 9860 + }, + { + "epoch": 0.3531434096728562, + "grad_norm": 1.6941111087799072, + "learning_rate": 0.00015005356397542597, + "loss": 1.7253, + "step": 9861 + }, + { + "epoch": 0.3531792218024245, + "grad_norm": 1.463535189628601, + "learning_rate": 0.00015004352218846222, + "loss": 1.5704, + "step": 9862 + }, + { + "epoch": 0.3532150339319928, + "grad_norm": 2.1995041370391846, + "learning_rate": 0.00015003347972822746, + "loss": 1.7306, + "step": 9863 + }, + { + "epoch": 0.35325084606156104, + "grad_norm": 1.662667155265808, + "learning_rate": 0.00015002343659485678, + "loss": 1.6146, + "step": 9864 + }, + { + "epoch": 0.3532866581911293, + "grad_norm": 1.8545045852661133, + "learning_rate": 0.0001500133927884853, + "loss": 1.7385, + "step": 9865 + }, + { + "epoch": 0.35332247032069763, + "grad_norm": 1.3238962888717651, + "learning_rate": 0.0001500033483092481, + "loss": 1.477, + "step": 9866 + }, + { + "epoch": 0.3533582824502659, + "grad_norm": 2.3159918785095215, + "learning_rate": 0.00014999330315728035, + "loss": 1.4599, + "step": 9867 + }, + { + "epoch": 0.3533940945798342, + "grad_norm": 1.7593953609466553, + "learning_rate": 0.00014998325733271722, + "loss": 1.3809, + "step": 9868 + }, + { + "epoch": 0.3534299067094025, + "grad_norm": 1.9399266242980957, + "learning_rate": 0.0001499732108356938, + "loss": 1.5886, + "step": 9869 + }, + { + "epoch": 0.35346571883897077, + "grad_norm": 1.7394417524337769, + "learning_rate": 0.00014996316366634532, + "loss": 1.4151, + "step": 9870 + }, + { + "epoch": 0.35350153096853904, + "grad_norm": 2.505788564682007, + "learning_rate": 0.0001499531158248069, + "loss": 1.7199, + "step": 9871 + }, + { + "epoch": 0.3535373430981073, + "grad_norm": 1.7400318384170532, + "learning_rate": 0.00014994306731121374, + "loss": 1.7993, + "step": 9872 + }, + { + "epoch": 0.35357315522767563, + "grad_norm": 1.9016368389129639, + "learning_rate": 0.00014993301812570104, + "loss": 1.54, + "step": 9873 + }, + { + "epoch": 0.3536089673572439, + "grad_norm": 1.5940747261047363, + "learning_rate": 0.00014992296826840402, + "loss": 1.4284, + "step": 9874 + }, + { + "epoch": 0.35364477948681217, + "grad_norm": 2.1987946033477783, + "learning_rate": 0.00014991291773945782, + "loss": 1.4855, + "step": 9875 + }, + { + "epoch": 0.3536805916163805, + "grad_norm": 1.4558799266815186, + "learning_rate": 0.0001499028665389977, + "loss": 1.8473, + "step": 9876 + }, + { + "epoch": 0.35371640374594876, + "grad_norm": 1.5671563148498535, + "learning_rate": 0.00014989281466715887, + "loss": 1.5182, + "step": 9877 + }, + { + "epoch": 0.35375221587551703, + "grad_norm": 1.395065188407898, + "learning_rate": 0.0001498827621240766, + "loss": 1.3509, + "step": 9878 + }, + { + "epoch": 0.3537880280050853, + "grad_norm": 1.8056888580322266, + "learning_rate": 0.0001498727089098861, + "loss": 1.779, + "step": 9879 + }, + { + "epoch": 0.3538238401346536, + "grad_norm": 1.9839811325073242, + "learning_rate": 0.00014986265502472262, + "loss": 1.813, + "step": 9880 + }, + { + "epoch": 0.3538596522642219, + "grad_norm": 1.4838645458221436, + "learning_rate": 0.00014985260046872145, + "loss": 1.1724, + "step": 9881 + }, + { + "epoch": 0.35389546439379016, + "grad_norm": 1.5873481035232544, + "learning_rate": 0.00014984254524201784, + "loss": 1.3684, + "step": 9882 + }, + { + "epoch": 0.3539312765233585, + "grad_norm": 1.2479912042617798, + "learning_rate": 0.0001498324893447471, + "loss": 1.1336, + "step": 9883 + }, + { + "epoch": 0.35396708865292675, + "grad_norm": 1.6061489582061768, + "learning_rate": 0.00014982243277704446, + "loss": 1.5485, + "step": 9884 + }, + { + "epoch": 0.354002900782495, + "grad_norm": 2.5165042877197266, + "learning_rate": 0.0001498123755390453, + "loss": 1.4389, + "step": 9885 + }, + { + "epoch": 0.3540387129120633, + "grad_norm": 2.0060083866119385, + "learning_rate": 0.00014980231763088482, + "loss": 1.5283, + "step": 9886 + }, + { + "epoch": 0.3540745250416316, + "grad_norm": 1.2914835214614868, + "learning_rate": 0.00014979225905269842, + "loss": 1.4375, + "step": 9887 + }, + { + "epoch": 0.3541103371711999, + "grad_norm": 1.3552098274230957, + "learning_rate": 0.0001497821998046214, + "loss": 1.4385, + "step": 9888 + }, + { + "epoch": 0.35414614930076815, + "grad_norm": 1.729246735572815, + "learning_rate": 0.0001497721398867891, + "loss": 1.4477, + "step": 9889 + }, + { + "epoch": 0.3541819614303365, + "grad_norm": 1.430737018585205, + "learning_rate": 0.00014976207929933688, + "loss": 1.6311, + "step": 9890 + }, + { + "epoch": 0.35421777355990475, + "grad_norm": 1.8276435136795044, + "learning_rate": 0.00014975201804240005, + "loss": 1.6353, + "step": 9891 + }, + { + "epoch": 0.354253585689473, + "grad_norm": 1.814315915107727, + "learning_rate": 0.00014974195611611402, + "loss": 1.9347, + "step": 9892 + }, + { + "epoch": 0.3542893978190413, + "grad_norm": 1.468151330947876, + "learning_rate": 0.00014973189352061409, + "loss": 1.3659, + "step": 9893 + }, + { + "epoch": 0.3543252099486096, + "grad_norm": 2.0722012519836426, + "learning_rate": 0.0001497218302560357, + "loss": 1.5477, + "step": 9894 + }, + { + "epoch": 0.3543610220781779, + "grad_norm": 1.7138365507125854, + "learning_rate": 0.0001497117663225142, + "loss": 1.6164, + "step": 9895 + }, + { + "epoch": 0.35439683420774615, + "grad_norm": 1.5745422840118408, + "learning_rate": 0.00014970170172018505, + "loss": 1.566, + "step": 9896 + }, + { + "epoch": 0.35443264633731447, + "grad_norm": 2.2554714679718018, + "learning_rate": 0.00014969163644918358, + "loss": 1.4819, + "step": 9897 + }, + { + "epoch": 0.35446845846688274, + "grad_norm": 1.376289963722229, + "learning_rate": 0.00014968157050964526, + "loss": 1.4859, + "step": 9898 + }, + { + "epoch": 0.354504270596451, + "grad_norm": 1.6688337326049805, + "learning_rate": 0.00014967150390170547, + "loss": 1.5748, + "step": 9899 + }, + { + "epoch": 0.3545400827260193, + "grad_norm": 2.0181753635406494, + "learning_rate": 0.0001496614366254997, + "loss": 1.37, + "step": 9900 + }, + { + "epoch": 0.3545758948555876, + "grad_norm": 1.6945894956588745, + "learning_rate": 0.00014965136868116334, + "loss": 1.5793, + "step": 9901 + }, + { + "epoch": 0.3546117069851559, + "grad_norm": 1.9368959665298462, + "learning_rate": 0.00014964130006883187, + "loss": 1.6875, + "step": 9902 + }, + { + "epoch": 0.35464751911472414, + "grad_norm": 1.573068380355835, + "learning_rate": 0.00014963123078864073, + "loss": 1.5334, + "step": 9903 + }, + { + "epoch": 0.35468333124429247, + "grad_norm": 1.3953185081481934, + "learning_rate": 0.0001496211608407254, + "loss": 1.6489, + "step": 9904 + }, + { + "epoch": 0.35471914337386073, + "grad_norm": 2.508678674697876, + "learning_rate": 0.00014961109022522135, + "loss": 1.9514, + "step": 9905 + }, + { + "epoch": 0.354754955503429, + "grad_norm": 2.420624256134033, + "learning_rate": 0.0001496010189422641, + "loss": 1.4283, + "step": 9906 + }, + { + "epoch": 0.3547907676329973, + "grad_norm": 1.3854237794876099, + "learning_rate": 0.00014959094699198907, + "loss": 1.2818, + "step": 9907 + }, + { + "epoch": 0.3548265797625656, + "grad_norm": 1.5231170654296875, + "learning_rate": 0.00014958087437453186, + "loss": 1.4811, + "step": 9908 + }, + { + "epoch": 0.35486239189213387, + "grad_norm": 1.3888393640518188, + "learning_rate": 0.00014957080109002794, + "loss": 1.406, + "step": 9909 + }, + { + "epoch": 0.35489820402170213, + "grad_norm": 2.292565107345581, + "learning_rate": 0.00014956072713861286, + "loss": 1.6349, + "step": 9910 + }, + { + "epoch": 0.35493401615127046, + "grad_norm": 2.0095014572143555, + "learning_rate": 0.00014955065252042206, + "loss": 1.6491, + "step": 9911 + }, + { + "epoch": 0.35496982828083873, + "grad_norm": 2.346771240234375, + "learning_rate": 0.00014954057723559115, + "loss": 1.8228, + "step": 9912 + }, + { + "epoch": 0.355005640410407, + "grad_norm": 1.9204994440078735, + "learning_rate": 0.0001495305012842557, + "loss": 1.6642, + "step": 9913 + }, + { + "epoch": 0.35504145253997527, + "grad_norm": 1.8857648372650146, + "learning_rate": 0.00014952042466655126, + "loss": 1.5507, + "step": 9914 + }, + { + "epoch": 0.3550772646695436, + "grad_norm": 1.6578903198242188, + "learning_rate": 0.00014951034738261337, + "loss": 1.7453, + "step": 9915 + }, + { + "epoch": 0.35511307679911186, + "grad_norm": 1.5523202419281006, + "learning_rate": 0.00014950026943257762, + "loss": 1.6968, + "step": 9916 + }, + { + "epoch": 0.35514888892868013, + "grad_norm": 2.2549266815185547, + "learning_rate": 0.00014949019081657959, + "loss": 1.512, + "step": 9917 + }, + { + "epoch": 0.35518470105824845, + "grad_norm": 2.5637290477752686, + "learning_rate": 0.00014948011153475491, + "loss": 1.7135, + "step": 9918 + }, + { + "epoch": 0.3552205131878167, + "grad_norm": 1.7439630031585693, + "learning_rate": 0.0001494700315872391, + "loss": 1.5241, + "step": 9919 + }, + { + "epoch": 0.355256325317385, + "grad_norm": 2.2923781871795654, + "learning_rate": 0.00014945995097416788, + "loss": 1.5614, + "step": 9920 + }, + { + "epoch": 0.35529213744695326, + "grad_norm": 1.4263309240341187, + "learning_rate": 0.0001494498696956768, + "loss": 1.618, + "step": 9921 + }, + { + "epoch": 0.3553279495765216, + "grad_norm": 1.650952935218811, + "learning_rate": 0.0001494397877519015, + "loss": 1.531, + "step": 9922 + }, + { + "epoch": 0.35536376170608985, + "grad_norm": 1.950554609298706, + "learning_rate": 0.00014942970514297761, + "loss": 1.3641, + "step": 9923 + }, + { + "epoch": 0.3553995738356581, + "grad_norm": 1.464562177658081, + "learning_rate": 0.00014941962186904083, + "loss": 1.508, + "step": 9924 + }, + { + "epoch": 0.35543538596522645, + "grad_norm": 1.9498792886734009, + "learning_rate": 0.00014940953793022676, + "loss": 1.7068, + "step": 9925 + }, + { + "epoch": 0.3554711980947947, + "grad_norm": 2.579939126968384, + "learning_rate": 0.00014939945332667108, + "loss": 1.5399, + "step": 9926 + }, + { + "epoch": 0.355507010224363, + "grad_norm": 1.616997480392456, + "learning_rate": 0.00014938936805850955, + "loss": 1.2796, + "step": 9927 + }, + { + "epoch": 0.35554282235393125, + "grad_norm": 1.5822079181671143, + "learning_rate": 0.0001493792821258777, + "loss": 1.5981, + "step": 9928 + }, + { + "epoch": 0.3555786344834996, + "grad_norm": 1.6461950540542603, + "learning_rate": 0.00014936919552891134, + "loss": 1.2901, + "step": 9929 + }, + { + "epoch": 0.35561444661306785, + "grad_norm": 1.615821123123169, + "learning_rate": 0.00014935910826774612, + "loss": 1.7623, + "step": 9930 + }, + { + "epoch": 0.3556502587426361, + "grad_norm": 2.009953260421753, + "learning_rate": 0.0001493490203425178, + "loss": 1.4007, + "step": 9931 + }, + { + "epoch": 0.35568607087220444, + "grad_norm": 1.6070547103881836, + "learning_rate": 0.00014933893175336202, + "loss": 1.6903, + "step": 9932 + }, + { + "epoch": 0.3557218830017727, + "grad_norm": 1.6759109497070312, + "learning_rate": 0.0001493288425004146, + "loss": 1.5817, + "step": 9933 + }, + { + "epoch": 0.355757695131341, + "grad_norm": 2.2906291484832764, + "learning_rate": 0.00014931875258381117, + "loss": 1.7121, + "step": 9934 + }, + { + "epoch": 0.35579350726090925, + "grad_norm": 1.4188705682754517, + "learning_rate": 0.00014930866200368761, + "loss": 1.81, + "step": 9935 + }, + { + "epoch": 0.35582931939047757, + "grad_norm": 2.0052592754364014, + "learning_rate": 0.0001492985707601796, + "loss": 1.5919, + "step": 9936 + }, + { + "epoch": 0.35586513152004584, + "grad_norm": 1.7482327222824097, + "learning_rate": 0.00014928847885342287, + "loss": 1.4017, + "step": 9937 + }, + { + "epoch": 0.3559009436496141, + "grad_norm": 1.6666542291641235, + "learning_rate": 0.00014927838628355327, + "loss": 1.3715, + "step": 9938 + }, + { + "epoch": 0.35593675577918243, + "grad_norm": 1.207485556602478, + "learning_rate": 0.0001492682930507065, + "loss": 1.3941, + "step": 9939 + }, + { + "epoch": 0.3559725679087507, + "grad_norm": 1.869559645652771, + "learning_rate": 0.00014925819915501847, + "loss": 1.7862, + "step": 9940 + }, + { + "epoch": 0.356008380038319, + "grad_norm": 1.9369616508483887, + "learning_rate": 0.00014924810459662484, + "loss": 1.6843, + "step": 9941 + }, + { + "epoch": 0.35604419216788724, + "grad_norm": 2.2861275672912598, + "learning_rate": 0.0001492380093756615, + "loss": 1.3355, + "step": 9942 + }, + { + "epoch": 0.35608000429745557, + "grad_norm": 1.4812921285629272, + "learning_rate": 0.0001492279134922643, + "loss": 1.4724, + "step": 9943 + }, + { + "epoch": 0.35611581642702383, + "grad_norm": 1.890170931816101, + "learning_rate": 0.000149217816946569, + "loss": 1.403, + "step": 9944 + }, + { + "epoch": 0.3561516285565921, + "grad_norm": 1.7531453371047974, + "learning_rate": 0.00014920771973871147, + "loss": 1.8111, + "step": 9945 + }, + { + "epoch": 0.35618744068616043, + "grad_norm": 2.1455671787261963, + "learning_rate": 0.00014919762186882754, + "loss": 1.6263, + "step": 9946 + }, + { + "epoch": 0.3562232528157287, + "grad_norm": 1.9717743396759033, + "learning_rate": 0.00014918752333705303, + "loss": 1.1631, + "step": 9947 + }, + { + "epoch": 0.35625906494529697, + "grad_norm": 1.4446394443511963, + "learning_rate": 0.00014917742414352386, + "loss": 1.4625, + "step": 9948 + }, + { + "epoch": 0.35629487707486523, + "grad_norm": 1.9484100341796875, + "learning_rate": 0.00014916732428837593, + "loss": 1.8046, + "step": 9949 + }, + { + "epoch": 0.35633068920443356, + "grad_norm": 1.383619785308838, + "learning_rate": 0.00014915722377174503, + "loss": 1.3816, + "step": 9950 + }, + { + "epoch": 0.35636650133400183, + "grad_norm": 1.6241815090179443, + "learning_rate": 0.0001491471225937671, + "loss": 1.5867, + "step": 9951 + }, + { + "epoch": 0.3564023134635701, + "grad_norm": 1.8264425992965698, + "learning_rate": 0.000149137020754578, + "loss": 1.6515, + "step": 9952 + }, + { + "epoch": 0.3564381255931384, + "grad_norm": 1.544845700263977, + "learning_rate": 0.0001491269182543137, + "loss": 1.5767, + "step": 9953 + }, + { + "epoch": 0.3564739377227067, + "grad_norm": 2.761215925216675, + "learning_rate": 0.0001491168150931101, + "loss": 1.3875, + "step": 9954 + }, + { + "epoch": 0.35650974985227496, + "grad_norm": 2.150071859359741, + "learning_rate": 0.00014910671127110308, + "loss": 1.3972, + "step": 9955 + }, + { + "epoch": 0.35654556198184323, + "grad_norm": 1.8057886362075806, + "learning_rate": 0.00014909660678842862, + "loss": 1.4971, + "step": 9956 + }, + { + "epoch": 0.35658137411141155, + "grad_norm": 1.5841161012649536, + "learning_rate": 0.0001490865016452226, + "loss": 1.6912, + "step": 9957 + }, + { + "epoch": 0.3566171862409798, + "grad_norm": 1.4754817485809326, + "learning_rate": 0.00014907639584162109, + "loss": 1.6008, + "step": 9958 + }, + { + "epoch": 0.3566529983705481, + "grad_norm": 1.5032598972320557, + "learning_rate": 0.00014906628937775995, + "loss": 1.6716, + "step": 9959 + }, + { + "epoch": 0.3566888105001164, + "grad_norm": 1.5778634548187256, + "learning_rate": 0.00014905618225377517, + "loss": 1.2682, + "step": 9960 + }, + { + "epoch": 0.3567246226296847, + "grad_norm": 2.124629259109497, + "learning_rate": 0.00014904607446980273, + "loss": 1.4935, + "step": 9961 + }, + { + "epoch": 0.35676043475925295, + "grad_norm": 1.7733240127563477, + "learning_rate": 0.00014903596602597864, + "loss": 1.8838, + "step": 9962 + }, + { + "epoch": 0.3567962468888212, + "grad_norm": 2.439082622528076, + "learning_rate": 0.00014902585692243885, + "loss": 1.5614, + "step": 9963 + }, + { + "epoch": 0.35683205901838955, + "grad_norm": 1.8383203744888306, + "learning_rate": 0.00014901574715931942, + "loss": 1.386, + "step": 9964 + }, + { + "epoch": 0.3568678711479578, + "grad_norm": 1.448671579360962, + "learning_rate": 0.00014900563673675633, + "loss": 1.3934, + "step": 9965 + }, + { + "epoch": 0.3569036832775261, + "grad_norm": 1.9962146282196045, + "learning_rate": 0.00014899552565488563, + "loss": 1.7006, + "step": 9966 + }, + { + "epoch": 0.3569394954070944, + "grad_norm": 1.699221134185791, + "learning_rate": 0.0001489854139138433, + "loss": 1.5871, + "step": 9967 + }, + { + "epoch": 0.3569753075366627, + "grad_norm": 1.5800422430038452, + "learning_rate": 0.00014897530151376545, + "loss": 1.6377, + "step": 9968 + }, + { + "epoch": 0.35701111966623095, + "grad_norm": 1.6919838190078735, + "learning_rate": 0.00014896518845478805, + "loss": 1.424, + "step": 9969 + }, + { + "epoch": 0.3570469317957992, + "grad_norm": 1.6668004989624023, + "learning_rate": 0.00014895507473704718, + "loss": 1.6673, + "step": 9970 + }, + { + "epoch": 0.35708274392536754, + "grad_norm": 1.9433567523956299, + "learning_rate": 0.00014894496036067903, + "loss": 1.3958, + "step": 9971 + }, + { + "epoch": 0.3571185560549358, + "grad_norm": 2.3744451999664307, + "learning_rate": 0.00014893484532581947, + "loss": 1.5432, + "step": 9972 + }, + { + "epoch": 0.3571543681845041, + "grad_norm": 1.429279088973999, + "learning_rate": 0.00014892472963260475, + "loss": 1.9036, + "step": 9973 + }, + { + "epoch": 0.3571901803140724, + "grad_norm": 1.7184338569641113, + "learning_rate": 0.00014891461328117087, + "loss": 1.3598, + "step": 9974 + }, + { + "epoch": 0.35722599244364067, + "grad_norm": 1.5399808883666992, + "learning_rate": 0.00014890449627165398, + "loss": 1.5129, + "step": 9975 + }, + { + "epoch": 0.35726180457320894, + "grad_norm": 1.5570214986801147, + "learning_rate": 0.00014889437860419013, + "loss": 1.6888, + "step": 9976 + }, + { + "epoch": 0.3572976167027772, + "grad_norm": 1.7206803560256958, + "learning_rate": 0.00014888426027891553, + "loss": 1.6131, + "step": 9977 + }, + { + "epoch": 0.35733342883234553, + "grad_norm": 1.5917768478393555, + "learning_rate": 0.00014887414129596623, + "loss": 1.6743, + "step": 9978 + }, + { + "epoch": 0.3573692409619138, + "grad_norm": 1.8771898746490479, + "learning_rate": 0.00014886402165547845, + "loss": 1.4912, + "step": 9979 + }, + { + "epoch": 0.3574050530914821, + "grad_norm": 1.189568281173706, + "learning_rate": 0.00014885390135758826, + "loss": 1.4007, + "step": 9980 + }, + { + "epoch": 0.3574408652210504, + "grad_norm": 1.9327512979507446, + "learning_rate": 0.00014884378040243184, + "loss": 1.6163, + "step": 9981 + }, + { + "epoch": 0.35747667735061867, + "grad_norm": 1.1493306159973145, + "learning_rate": 0.0001488336587901454, + "loss": 1.4209, + "step": 9982 + }, + { + "epoch": 0.35751248948018693, + "grad_norm": 1.4550294876098633, + "learning_rate": 0.000148823536520865, + "loss": 1.3286, + "step": 9983 + }, + { + "epoch": 0.3575483016097552, + "grad_norm": 1.721457600593567, + "learning_rate": 0.00014881341359472696, + "loss": 1.6156, + "step": 9984 + }, + { + "epoch": 0.35758411373932353, + "grad_norm": 1.9905215501785278, + "learning_rate": 0.00014880329001186736, + "loss": 1.5031, + "step": 9985 + }, + { + "epoch": 0.3576199258688918, + "grad_norm": 1.5918910503387451, + "learning_rate": 0.00014879316577242246, + "loss": 1.4491, + "step": 9986 + }, + { + "epoch": 0.35765573799846007, + "grad_norm": 2.3297674655914307, + "learning_rate": 0.00014878304087652847, + "loss": 1.5093, + "step": 9987 + }, + { + "epoch": 0.3576915501280284, + "grad_norm": 1.8075636625289917, + "learning_rate": 0.00014877291532432158, + "loss": 1.4857, + "step": 9988 + }, + { + "epoch": 0.35772736225759666, + "grad_norm": 1.7248746156692505, + "learning_rate": 0.00014876278911593802, + "loss": 1.6853, + "step": 9989 + }, + { + "epoch": 0.35776317438716493, + "grad_norm": 1.792223334312439, + "learning_rate": 0.00014875266225151403, + "loss": 1.2533, + "step": 9990 + }, + { + "epoch": 0.3577989865167332, + "grad_norm": 1.5456364154815674, + "learning_rate": 0.00014874253473118586, + "loss": 1.4788, + "step": 9991 + }, + { + "epoch": 0.3578347986463015, + "grad_norm": 1.5730777978897095, + "learning_rate": 0.00014873240655508975, + "loss": 1.4009, + "step": 9992 + }, + { + "epoch": 0.3578706107758698, + "grad_norm": 1.5028215646743774, + "learning_rate": 0.00014872227772336197, + "loss": 1.5448, + "step": 9993 + }, + { + "epoch": 0.35790642290543806, + "grad_norm": 1.584532618522644, + "learning_rate": 0.0001487121482361388, + "loss": 1.4787, + "step": 9994 + }, + { + "epoch": 0.3579422350350064, + "grad_norm": 1.7555292844772339, + "learning_rate": 0.00014870201809355653, + "loss": 1.3804, + "step": 9995 + }, + { + "epoch": 0.35797804716457465, + "grad_norm": 1.8509167432785034, + "learning_rate": 0.0001486918872957514, + "loss": 1.4473, + "step": 9996 + }, + { + "epoch": 0.3580138592941429, + "grad_norm": 1.639758825302124, + "learning_rate": 0.00014868175584285974, + "loss": 1.4596, + "step": 9997 + }, + { + "epoch": 0.3580496714237112, + "grad_norm": 1.8261754512786865, + "learning_rate": 0.00014867162373501786, + "loss": 1.6244, + "step": 9998 + }, + { + "epoch": 0.3580854835532795, + "grad_norm": 1.5407792329788208, + "learning_rate": 0.00014866149097236204, + "loss": 1.4145, + "step": 9999 + }, + { + "epoch": 0.3581212956828478, + "grad_norm": 1.653843879699707, + "learning_rate": 0.00014865135755502866, + "loss": 1.3846, + "step": 10000 + }, + { + "epoch": 0.35815710781241605, + "grad_norm": 1.4592039585113525, + "learning_rate": 0.000148641223483154, + "loss": 1.3947, + "step": 10001 + }, + { + "epoch": 0.3581929199419843, + "grad_norm": 1.936527132987976, + "learning_rate": 0.00014863108875687444, + "loss": 1.2548, + "step": 10002 + }, + { + "epoch": 0.35822873207155265, + "grad_norm": 1.7076057195663452, + "learning_rate": 0.00014862095337632626, + "loss": 1.3316, + "step": 10003 + }, + { + "epoch": 0.3582645442011209, + "grad_norm": 1.6434259414672852, + "learning_rate": 0.00014861081734164592, + "loss": 1.6868, + "step": 10004 + }, + { + "epoch": 0.3583003563306892, + "grad_norm": 1.6703202724456787, + "learning_rate": 0.0001486006806529697, + "loss": 1.4944, + "step": 10005 + }, + { + "epoch": 0.3583361684602575, + "grad_norm": 1.4541386365890503, + "learning_rate": 0.00014859054331043406, + "loss": 1.6077, + "step": 10006 + }, + { + "epoch": 0.3583719805898258, + "grad_norm": 1.5497562885284424, + "learning_rate": 0.0001485804053141753, + "loss": 1.5403, + "step": 10007 + }, + { + "epoch": 0.35840779271939405, + "grad_norm": 1.54259192943573, + "learning_rate": 0.00014857026666432988, + "loss": 1.5234, + "step": 10008 + }, + { + "epoch": 0.3584436048489623, + "grad_norm": 1.5208549499511719, + "learning_rate": 0.00014856012736103413, + "loss": 1.6961, + "step": 10009 + }, + { + "epoch": 0.35847941697853064, + "grad_norm": 1.6700527667999268, + "learning_rate": 0.00014854998740442454, + "loss": 1.5871, + "step": 10010 + }, + { + "epoch": 0.3585152291080989, + "grad_norm": 1.868094563484192, + "learning_rate": 0.00014853984679463747, + "loss": 1.6691, + "step": 10011 + }, + { + "epoch": 0.3585510412376672, + "grad_norm": 2.0113749504089355, + "learning_rate": 0.00014852970553180938, + "loss": 1.1609, + "step": 10012 + }, + { + "epoch": 0.3585868533672355, + "grad_norm": 1.3860893249511719, + "learning_rate": 0.0001485195636160767, + "loss": 1.4034, + "step": 10013 + }, + { + "epoch": 0.35862266549680377, + "grad_norm": 2.6419312953948975, + "learning_rate": 0.00014850942104757588, + "loss": 1.5761, + "step": 10014 + }, + { + "epoch": 0.35865847762637204, + "grad_norm": 1.6753020286560059, + "learning_rate": 0.0001484992778264434, + "loss": 1.5387, + "step": 10015 + }, + { + "epoch": 0.3586942897559403, + "grad_norm": 1.7237919569015503, + "learning_rate": 0.00014848913395281568, + "loss": 1.4762, + "step": 10016 + }, + { + "epoch": 0.35873010188550863, + "grad_norm": 1.547621488571167, + "learning_rate": 0.00014847898942682922, + "loss": 1.5384, + "step": 10017 + }, + { + "epoch": 0.3587659140150769, + "grad_norm": 1.5596733093261719, + "learning_rate": 0.00014846884424862044, + "loss": 1.4112, + "step": 10018 + }, + { + "epoch": 0.35880172614464517, + "grad_norm": 1.4373083114624023, + "learning_rate": 0.0001484586984183259, + "loss": 1.3855, + "step": 10019 + }, + { + "epoch": 0.3588375382742135, + "grad_norm": 1.8072415590286255, + "learning_rate": 0.0001484485519360821, + "loss": 1.5663, + "step": 10020 + }, + { + "epoch": 0.35887335040378177, + "grad_norm": 1.838571310043335, + "learning_rate": 0.00014843840480202554, + "loss": 1.401, + "step": 10021 + }, + { + "epoch": 0.35890916253335003, + "grad_norm": 1.517883539199829, + "learning_rate": 0.00014842825701629267, + "loss": 1.2617, + "step": 10022 + }, + { + "epoch": 0.3589449746629183, + "grad_norm": 1.988364577293396, + "learning_rate": 0.0001484181085790201, + "loss": 1.478, + "step": 10023 + }, + { + "epoch": 0.35898078679248663, + "grad_norm": 1.5956732034683228, + "learning_rate": 0.00014840795949034439, + "loss": 1.5329, + "step": 10024 + }, + { + "epoch": 0.3590165989220549, + "grad_norm": 1.7661099433898926, + "learning_rate": 0.00014839780975040194, + "loss": 1.465, + "step": 10025 + }, + { + "epoch": 0.35905241105162317, + "grad_norm": 1.3681966066360474, + "learning_rate": 0.00014838765935932944, + "loss": 1.5103, + "step": 10026 + }, + { + "epoch": 0.3590882231811915, + "grad_norm": 2.267117977142334, + "learning_rate": 0.00014837750831726338, + "loss": 1.52, + "step": 10027 + }, + { + "epoch": 0.35912403531075976, + "grad_norm": 1.8167510032653809, + "learning_rate": 0.00014836735662434035, + "loss": 1.6152, + "step": 10028 + }, + { + "epoch": 0.35915984744032803, + "grad_norm": 1.5171904563903809, + "learning_rate": 0.00014835720428069693, + "loss": 1.0415, + "step": 10029 + }, + { + "epoch": 0.3591956595698963, + "grad_norm": 1.476212739944458, + "learning_rate": 0.0001483470512864697, + "loss": 1.345, + "step": 10030 + }, + { + "epoch": 0.3592314716994646, + "grad_norm": 1.4677760601043701, + "learning_rate": 0.00014833689764179523, + "loss": 1.6914, + "step": 10031 + }, + { + "epoch": 0.3592672838290329, + "grad_norm": 1.542527198791504, + "learning_rate": 0.00014832674334681022, + "loss": 1.5375, + "step": 10032 + }, + { + "epoch": 0.35930309595860116, + "grad_norm": 1.7230689525604248, + "learning_rate": 0.00014831658840165117, + "loss": 1.832, + "step": 10033 + }, + { + "epoch": 0.3593389080881695, + "grad_norm": 2.0981647968292236, + "learning_rate": 0.00014830643280645472, + "loss": 1.4325, + "step": 10034 + }, + { + "epoch": 0.35937472021773775, + "grad_norm": 1.4687614440917969, + "learning_rate": 0.00014829627656135757, + "loss": 1.5059, + "step": 10035 + }, + { + "epoch": 0.359410532347306, + "grad_norm": 1.6538559198379517, + "learning_rate": 0.0001482861196664963, + "loss": 1.4741, + "step": 10036 + }, + { + "epoch": 0.3594463444768743, + "grad_norm": 1.6689916849136353, + "learning_rate": 0.00014827596212200762, + "loss": 1.5735, + "step": 10037 + }, + { + "epoch": 0.3594821566064426, + "grad_norm": 1.9871309995651245, + "learning_rate": 0.00014826580392802806, + "loss": 1.8311, + "step": 10038 + }, + { + "epoch": 0.3595179687360109, + "grad_norm": 2.3612310886383057, + "learning_rate": 0.00014825564508469443, + "loss": 1.7536, + "step": 10039 + }, + { + "epoch": 0.35955378086557915, + "grad_norm": 1.9663517475128174, + "learning_rate": 0.00014824548559214332, + "loss": 1.4124, + "step": 10040 + }, + { + "epoch": 0.3595895929951475, + "grad_norm": 1.631361961364746, + "learning_rate": 0.0001482353254505114, + "loss": 1.6325, + "step": 10041 + }, + { + "epoch": 0.35962540512471575, + "grad_norm": 1.6884336471557617, + "learning_rate": 0.00014822516465993546, + "loss": 1.4287, + "step": 10042 + }, + { + "epoch": 0.359661217254284, + "grad_norm": 1.5852062702178955, + "learning_rate": 0.0001482150032205521, + "loss": 1.6681, + "step": 10043 + }, + { + "epoch": 0.3596970293838523, + "grad_norm": 1.4541549682617188, + "learning_rate": 0.00014820484113249805, + "loss": 1.3463, + "step": 10044 + }, + { + "epoch": 0.3597328415134206, + "grad_norm": 1.5388379096984863, + "learning_rate": 0.00014819467839591007, + "loss": 1.5042, + "step": 10045 + }, + { + "epoch": 0.3597686536429889, + "grad_norm": 1.8870503902435303, + "learning_rate": 0.00014818451501092485, + "loss": 1.5489, + "step": 10046 + }, + { + "epoch": 0.35980446577255715, + "grad_norm": 1.8153746128082275, + "learning_rate": 0.00014817435097767912, + "loss": 1.6726, + "step": 10047 + }, + { + "epoch": 0.35984027790212547, + "grad_norm": 1.4867844581604004, + "learning_rate": 0.00014816418629630968, + "loss": 1.3771, + "step": 10048 + }, + { + "epoch": 0.35987609003169374, + "grad_norm": 1.6260762214660645, + "learning_rate": 0.0001481540209669532, + "loss": 1.5194, + "step": 10049 + }, + { + "epoch": 0.359911902161262, + "grad_norm": 1.7411034107208252, + "learning_rate": 0.0001481438549897465, + "loss": 1.3861, + "step": 10050 + }, + { + "epoch": 0.3599477142908303, + "grad_norm": 1.4617067575454712, + "learning_rate": 0.00014813368836482632, + "loss": 1.1129, + "step": 10051 + }, + { + "epoch": 0.3599835264203986, + "grad_norm": 2.216752767562866, + "learning_rate": 0.00014812352109232947, + "loss": 1.3592, + "step": 10052 + }, + { + "epoch": 0.36001933854996687, + "grad_norm": 1.7082031965255737, + "learning_rate": 0.0001481133531723927, + "loss": 1.2952, + "step": 10053 + }, + { + "epoch": 0.36005515067953514, + "grad_norm": 2.7109858989715576, + "learning_rate": 0.00014810318460515282, + "loss": 1.9209, + "step": 10054 + }, + { + "epoch": 0.36009096280910347, + "grad_norm": 1.6055452823638916, + "learning_rate": 0.00014809301539074667, + "loss": 1.3236, + "step": 10055 + }, + { + "epoch": 0.36012677493867173, + "grad_norm": 1.9085619449615479, + "learning_rate": 0.000148082845529311, + "loss": 1.6722, + "step": 10056 + }, + { + "epoch": 0.36016258706824, + "grad_norm": 1.3563839197158813, + "learning_rate": 0.00014807267502098267, + "loss": 1.4078, + "step": 10057 + }, + { + "epoch": 0.36019839919780827, + "grad_norm": 1.5918482542037964, + "learning_rate": 0.00014806250386589851, + "loss": 1.5898, + "step": 10058 + }, + { + "epoch": 0.3602342113273766, + "grad_norm": 1.934302806854248, + "learning_rate": 0.0001480523320641954, + "loss": 1.288, + "step": 10059 + }, + { + "epoch": 0.36027002345694487, + "grad_norm": 2.7092247009277344, + "learning_rate": 0.00014804215961601008, + "loss": 1.4568, + "step": 10060 + }, + { + "epoch": 0.36030583558651313, + "grad_norm": 1.5964667797088623, + "learning_rate": 0.00014803198652147952, + "loss": 1.8112, + "step": 10061 + }, + { + "epoch": 0.36034164771608146, + "grad_norm": 1.7393267154693604, + "learning_rate": 0.00014802181278074052, + "loss": 1.5946, + "step": 10062 + }, + { + "epoch": 0.36037745984564973, + "grad_norm": 1.6845488548278809, + "learning_rate": 0.00014801163839392998, + "loss": 1.544, + "step": 10063 + }, + { + "epoch": 0.360413271975218, + "grad_norm": 1.5113362073898315, + "learning_rate": 0.00014800146336118474, + "loss": 1.5542, + "step": 10064 + }, + { + "epoch": 0.36044908410478627, + "grad_norm": 1.7169773578643799, + "learning_rate": 0.0001479912876826418, + "loss": 1.5439, + "step": 10065 + }, + { + "epoch": 0.3604848962343546, + "grad_norm": 1.4633331298828125, + "learning_rate": 0.0001479811113584379, + "loss": 1.595, + "step": 10066 + }, + { + "epoch": 0.36052070836392286, + "grad_norm": 1.579038381576538, + "learning_rate": 0.00014797093438871008, + "loss": 1.4934, + "step": 10067 + }, + { + "epoch": 0.36055652049349113, + "grad_norm": 1.9892122745513916, + "learning_rate": 0.00014796075677359525, + "loss": 1.5002, + "step": 10068 + }, + { + "epoch": 0.36059233262305945, + "grad_norm": 1.6243116855621338, + "learning_rate": 0.00014795057851323023, + "loss": 1.5465, + "step": 10069 + }, + { + "epoch": 0.3606281447526277, + "grad_norm": 1.6825838088989258, + "learning_rate": 0.0001479403996077521, + "loss": 1.163, + "step": 10070 + }, + { + "epoch": 0.360663956882196, + "grad_norm": 2.1626789569854736, + "learning_rate": 0.0001479302200572977, + "loss": 1.389, + "step": 10071 + }, + { + "epoch": 0.36069976901176426, + "grad_norm": 1.4115447998046875, + "learning_rate": 0.00014792003986200403, + "loss": 1.4016, + "step": 10072 + }, + { + "epoch": 0.3607355811413326, + "grad_norm": 2.0680456161499023, + "learning_rate": 0.000147909859022008, + "loss": 1.4026, + "step": 10073 + }, + { + "epoch": 0.36077139327090085, + "grad_norm": 1.1022719144821167, + "learning_rate": 0.00014789967753744664, + "loss": 1.3813, + "step": 10074 + }, + { + "epoch": 0.3608072054004691, + "grad_norm": 1.9899226427078247, + "learning_rate": 0.00014788949540845689, + "loss": 1.4066, + "step": 10075 + }, + { + "epoch": 0.36084301753003745, + "grad_norm": 1.577979564666748, + "learning_rate": 0.0001478793126351758, + "loss": 1.5398, + "step": 10076 + }, + { + "epoch": 0.3608788296596057, + "grad_norm": 1.9945513010025024, + "learning_rate": 0.00014786912921774028, + "loss": 1.5577, + "step": 10077 + }, + { + "epoch": 0.360914641789174, + "grad_norm": 1.5704258680343628, + "learning_rate": 0.00014785894515628736, + "loss": 1.6951, + "step": 10078 + }, + { + "epoch": 0.36095045391874225, + "grad_norm": 1.760332703590393, + "learning_rate": 0.0001478487604509541, + "loss": 1.4839, + "step": 10079 + }, + { + "epoch": 0.3609862660483106, + "grad_norm": 1.6892656087875366, + "learning_rate": 0.00014783857510187743, + "loss": 1.7012, + "step": 10080 + }, + { + "epoch": 0.36102207817787885, + "grad_norm": 1.9408334493637085, + "learning_rate": 0.00014782838910919449, + "loss": 1.3263, + "step": 10081 + }, + { + "epoch": 0.3610578903074471, + "grad_norm": 1.6504887342453003, + "learning_rate": 0.00014781820247304227, + "loss": 1.5619, + "step": 10082 + }, + { + "epoch": 0.36109370243701544, + "grad_norm": 1.608626127243042, + "learning_rate": 0.00014780801519355782, + "loss": 1.461, + "step": 10083 + }, + { + "epoch": 0.3611295145665837, + "grad_norm": 2.068258285522461, + "learning_rate": 0.00014779782727087815, + "loss": 1.5741, + "step": 10084 + }, + { + "epoch": 0.361165326696152, + "grad_norm": 1.8851020336151123, + "learning_rate": 0.0001477876387051404, + "loss": 1.66, + "step": 10085 + }, + { + "epoch": 0.36120113882572025, + "grad_norm": 1.5188018083572388, + "learning_rate": 0.00014777744949648163, + "loss": 1.4233, + "step": 10086 + }, + { + "epoch": 0.36123695095528857, + "grad_norm": 1.492539405822754, + "learning_rate": 0.00014776725964503888, + "loss": 1.4101, + "step": 10087 + }, + { + "epoch": 0.36127276308485684, + "grad_norm": 2.47213077545166, + "learning_rate": 0.00014775706915094928, + "loss": 1.7695, + "step": 10088 + }, + { + "epoch": 0.3613085752144251, + "grad_norm": 3.236239194869995, + "learning_rate": 0.0001477468780143499, + "loss": 1.3959, + "step": 10089 + }, + { + "epoch": 0.36134438734399343, + "grad_norm": 1.5189967155456543, + "learning_rate": 0.00014773668623537786, + "loss": 1.3773, + "step": 10090 + }, + { + "epoch": 0.3613801994735617, + "grad_norm": 1.4003039598464966, + "learning_rate": 0.0001477264938141703, + "loss": 1.6206, + "step": 10091 + }, + { + "epoch": 0.36141601160312997, + "grad_norm": 1.4691357612609863, + "learning_rate": 0.00014771630075086434, + "loss": 1.5497, + "step": 10092 + }, + { + "epoch": 0.36145182373269824, + "grad_norm": 1.6265677213668823, + "learning_rate": 0.00014770610704559708, + "loss": 1.7204, + "step": 10093 + }, + { + "epoch": 0.36148763586226657, + "grad_norm": 1.6044228076934814, + "learning_rate": 0.0001476959126985057, + "loss": 1.464, + "step": 10094 + }, + { + "epoch": 0.36152344799183483, + "grad_norm": 1.8812122344970703, + "learning_rate": 0.00014768571770972734, + "loss": 1.5653, + "step": 10095 + }, + { + "epoch": 0.3615592601214031, + "grad_norm": 1.898262858390808, + "learning_rate": 0.00014767552207939913, + "loss": 1.2871, + "step": 10096 + }, + { + "epoch": 0.3615950722509714, + "grad_norm": 1.926661729812622, + "learning_rate": 0.0001476653258076583, + "loss": 1.5776, + "step": 10097 + }, + { + "epoch": 0.3616308843805397, + "grad_norm": 1.504490852355957, + "learning_rate": 0.00014765512889464198, + "loss": 1.4083, + "step": 10098 + }, + { + "epoch": 0.36166669651010797, + "grad_norm": 1.455038070678711, + "learning_rate": 0.00014764493134048737, + "loss": 1.553, + "step": 10099 + }, + { + "epoch": 0.36170250863967623, + "grad_norm": 1.8379685878753662, + "learning_rate": 0.00014763473314533166, + "loss": 1.2926, + "step": 10100 + }, + { + "epoch": 0.36173832076924456, + "grad_norm": 1.7743902206420898, + "learning_rate": 0.0001476245343093121, + "loss": 1.4345, + "step": 10101 + }, + { + "epoch": 0.36177413289881283, + "grad_norm": 1.722127914428711, + "learning_rate": 0.00014761433483256582, + "loss": 1.528, + "step": 10102 + }, + { + "epoch": 0.3618099450283811, + "grad_norm": 1.4296984672546387, + "learning_rate": 0.00014760413471523012, + "loss": 1.3692, + "step": 10103 + }, + { + "epoch": 0.3618457571579494, + "grad_norm": 1.6753002405166626, + "learning_rate": 0.00014759393395744215, + "loss": 1.6153, + "step": 10104 + }, + { + "epoch": 0.3618815692875177, + "grad_norm": 3.1370999813079834, + "learning_rate": 0.00014758373255933924, + "loss": 1.8066, + "step": 10105 + }, + { + "epoch": 0.36191738141708596, + "grad_norm": 1.653550624847412, + "learning_rate": 0.00014757353052105853, + "loss": 1.2039, + "step": 10106 + }, + { + "epoch": 0.36195319354665423, + "grad_norm": 2.0767953395843506, + "learning_rate": 0.00014756332784273738, + "loss": 1.678, + "step": 10107 + }, + { + "epoch": 0.36198900567622255, + "grad_norm": 1.60899019241333, + "learning_rate": 0.00014755312452451296, + "loss": 1.8436, + "step": 10108 + }, + { + "epoch": 0.3620248178057908, + "grad_norm": 1.545812726020813, + "learning_rate": 0.0001475429205665226, + "loss": 1.3691, + "step": 10109 + }, + { + "epoch": 0.3620606299353591, + "grad_norm": 1.5063380002975464, + "learning_rate": 0.0001475327159689036, + "loss": 1.7184, + "step": 10110 + }, + { + "epoch": 0.3620964420649274, + "grad_norm": 2.4832561016082764, + "learning_rate": 0.0001475225107317932, + "loss": 1.3295, + "step": 10111 + }, + { + "epoch": 0.3621322541944957, + "grad_norm": 3.1396214962005615, + "learning_rate": 0.00014751230485532873, + "loss": 1.6655, + "step": 10112 + }, + { + "epoch": 0.36216806632406395, + "grad_norm": 1.7851954698562622, + "learning_rate": 0.00014750209833964747, + "loss": 1.5574, + "step": 10113 + }, + { + "epoch": 0.3622038784536322, + "grad_norm": 1.6681740283966064, + "learning_rate": 0.00014749189118488677, + "loss": 1.4129, + "step": 10114 + }, + { + "epoch": 0.36223969058320055, + "grad_norm": 1.871593952178955, + "learning_rate": 0.0001474816833911839, + "loss": 1.536, + "step": 10115 + }, + { + "epoch": 0.3622755027127688, + "grad_norm": 2.309457540512085, + "learning_rate": 0.00014747147495867627, + "loss": 1.3783, + "step": 10116 + }, + { + "epoch": 0.3623113148423371, + "grad_norm": 2.2859158515930176, + "learning_rate": 0.00014746126588750116, + "loss": 1.6957, + "step": 10117 + }, + { + "epoch": 0.3623471269719054, + "grad_norm": 1.851605772972107, + "learning_rate": 0.00014745105617779594, + "loss": 1.4928, + "step": 10118 + }, + { + "epoch": 0.3623829391014737, + "grad_norm": 1.9329193830490112, + "learning_rate": 0.00014744084582969793, + "loss": 1.7967, + "step": 10119 + }, + { + "epoch": 0.36241875123104195, + "grad_norm": 1.6493408679962158, + "learning_rate": 0.00014743063484334455, + "loss": 1.6187, + "step": 10120 + }, + { + "epoch": 0.3624545633606102, + "grad_norm": 1.6800282001495361, + "learning_rate": 0.00014742042321887322, + "loss": 1.6358, + "step": 10121 + }, + { + "epoch": 0.36249037549017854, + "grad_norm": 1.4881089925765991, + "learning_rate": 0.00014741021095642117, + "loss": 1.6944, + "step": 10122 + }, + { + "epoch": 0.3625261876197468, + "grad_norm": 1.2354776859283447, + "learning_rate": 0.00014739999805612596, + "loss": 1.151, + "step": 10123 + }, + { + "epoch": 0.3625619997493151, + "grad_norm": 1.8695483207702637, + "learning_rate": 0.00014738978451812488, + "loss": 1.4648, + "step": 10124 + }, + { + "epoch": 0.3625978118788834, + "grad_norm": 1.4767268896102905, + "learning_rate": 0.00014737957034255538, + "loss": 1.5988, + "step": 10125 + }, + { + "epoch": 0.36263362400845167, + "grad_norm": 1.4204144477844238, + "learning_rate": 0.00014736935552955488, + "loss": 1.4762, + "step": 10126 + }, + { + "epoch": 0.36266943613801994, + "grad_norm": 1.6141799688339233, + "learning_rate": 0.00014735914007926084, + "loss": 1.4833, + "step": 10127 + }, + { + "epoch": 0.3627052482675882, + "grad_norm": 2.05619740486145, + "learning_rate": 0.0001473489239918106, + "loss": 1.391, + "step": 10128 + }, + { + "epoch": 0.36274106039715653, + "grad_norm": 1.3890151977539062, + "learning_rate": 0.0001473387072673417, + "loss": 1.5043, + "step": 10129 + }, + { + "epoch": 0.3627768725267248, + "grad_norm": 1.4893358945846558, + "learning_rate": 0.00014732848990599154, + "loss": 1.4655, + "step": 10130 + }, + { + "epoch": 0.36281268465629307, + "grad_norm": 1.4449459314346313, + "learning_rate": 0.0001473182719078976, + "loss": 1.5565, + "step": 10131 + }, + { + "epoch": 0.3628484967858614, + "grad_norm": 1.4098438024520874, + "learning_rate": 0.00014730805327319737, + "loss": 1.5439, + "step": 10132 + }, + { + "epoch": 0.36288430891542967, + "grad_norm": 1.4516136646270752, + "learning_rate": 0.00014729783400202828, + "loss": 1.5875, + "step": 10133 + }, + { + "epoch": 0.36292012104499793, + "grad_norm": 2.057598114013672, + "learning_rate": 0.00014728761409452785, + "loss": 1.5132, + "step": 10134 + }, + { + "epoch": 0.3629559331745662, + "grad_norm": 2.4684760570526123, + "learning_rate": 0.00014727739355083357, + "loss": 1.6289, + "step": 10135 + }, + { + "epoch": 0.3629917453041345, + "grad_norm": 2.0605359077453613, + "learning_rate": 0.00014726717237108293, + "loss": 1.6129, + "step": 10136 + }, + { + "epoch": 0.3630275574337028, + "grad_norm": 1.469491720199585, + "learning_rate": 0.00014725695055541348, + "loss": 1.4592, + "step": 10137 + }, + { + "epoch": 0.36306336956327107, + "grad_norm": 1.547979474067688, + "learning_rate": 0.00014724672810396272, + "loss": 1.441, + "step": 10138 + }, + { + "epoch": 0.3630991816928394, + "grad_norm": 1.272243618965149, + "learning_rate": 0.00014723650501686817, + "loss": 1.4066, + "step": 10139 + }, + { + "epoch": 0.36313499382240766, + "grad_norm": 1.7786619663238525, + "learning_rate": 0.00014722628129426734, + "loss": 1.4726, + "step": 10140 + }, + { + "epoch": 0.36317080595197593, + "grad_norm": 1.6617714166641235, + "learning_rate": 0.0001472160569362979, + "loss": 1.6447, + "step": 10141 + }, + { + "epoch": 0.3632066180815442, + "grad_norm": 2.5883841514587402, + "learning_rate": 0.0001472058319430972, + "loss": 1.4975, + "step": 10142 + }, + { + "epoch": 0.3632424302111125, + "grad_norm": 1.781213402748108, + "learning_rate": 0.000147195606314803, + "loss": 1.5525, + "step": 10143 + }, + { + "epoch": 0.3632782423406808, + "grad_norm": 4.267208099365234, + "learning_rate": 0.0001471853800515528, + "loss": 1.6374, + "step": 10144 + }, + { + "epoch": 0.36331405447024906, + "grad_norm": 1.460854172706604, + "learning_rate": 0.00014717515315348413, + "loss": 1.7116, + "step": 10145 + }, + { + "epoch": 0.3633498665998174, + "grad_norm": 1.7079209089279175, + "learning_rate": 0.00014716492562073466, + "loss": 1.4572, + "step": 10146 + }, + { + "epoch": 0.36338567872938565, + "grad_norm": 1.5507252216339111, + "learning_rate": 0.00014715469745344196, + "loss": 1.3672, + "step": 10147 + }, + { + "epoch": 0.3634214908589539, + "grad_norm": 2.1776158809661865, + "learning_rate": 0.00014714446865174362, + "loss": 1.7472, + "step": 10148 + }, + { + "epoch": 0.3634573029885222, + "grad_norm": 2.235363483428955, + "learning_rate": 0.00014713423921577725, + "loss": 1.4751, + "step": 10149 + }, + { + "epoch": 0.3634931151180905, + "grad_norm": 2.331434726715088, + "learning_rate": 0.0001471240091456805, + "loss": 1.5584, + "step": 10150 + }, + { + "epoch": 0.3635289272476588, + "grad_norm": 1.8828892707824707, + "learning_rate": 0.00014711377844159099, + "loss": 1.4935, + "step": 10151 + }, + { + "epoch": 0.36356473937722705, + "grad_norm": 1.5459474325180054, + "learning_rate": 0.00014710354710364637, + "loss": 1.1762, + "step": 10152 + }, + { + "epoch": 0.3636005515067954, + "grad_norm": 1.9720122814178467, + "learning_rate": 0.00014709331513198425, + "loss": 1.751, + "step": 10153 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.7527331113815308, + "learning_rate": 0.00014708308252674236, + "loss": 1.3377, + "step": 10154 + }, + { + "epoch": 0.3636721757659319, + "grad_norm": 1.449865698814392, + "learning_rate": 0.0001470728492880583, + "loss": 1.5698, + "step": 10155 + }, + { + "epoch": 0.3637079878955002, + "grad_norm": 1.602096438407898, + "learning_rate": 0.00014706261541606983, + "loss": 1.5431, + "step": 10156 + }, + { + "epoch": 0.3637438000250685, + "grad_norm": 1.5750555992126465, + "learning_rate": 0.00014705238091091455, + "loss": 1.5822, + "step": 10157 + }, + { + "epoch": 0.3637796121546368, + "grad_norm": 1.5211970806121826, + "learning_rate": 0.00014704214577273016, + "loss": 1.4798, + "step": 10158 + }, + { + "epoch": 0.36381542428420505, + "grad_norm": 1.606300950050354, + "learning_rate": 0.00014703191000165438, + "loss": 1.6259, + "step": 10159 + }, + { + "epoch": 0.36385123641377337, + "grad_norm": 2.0974745750427246, + "learning_rate": 0.00014702167359782493, + "loss": 1.5262, + "step": 10160 + }, + { + "epoch": 0.36388704854334164, + "grad_norm": 1.8589930534362793, + "learning_rate": 0.0001470114365613795, + "loss": 1.2566, + "step": 10161 + }, + { + "epoch": 0.3639228606729099, + "grad_norm": 1.8257662057876587, + "learning_rate": 0.00014700119889245582, + "loss": 1.5583, + "step": 10162 + }, + { + "epoch": 0.3639586728024782, + "grad_norm": 1.329789638519287, + "learning_rate": 0.00014699096059119166, + "loss": 1.4588, + "step": 10163 + }, + { + "epoch": 0.3639944849320465, + "grad_norm": 1.6192381381988525, + "learning_rate": 0.0001469807216577247, + "loss": 1.6746, + "step": 10164 + }, + { + "epoch": 0.36403029706161477, + "grad_norm": 3.251436948776245, + "learning_rate": 0.0001469704820921928, + "loss": 1.4111, + "step": 10165 + }, + { + "epoch": 0.36406610919118304, + "grad_norm": 1.9543156623840332, + "learning_rate": 0.00014696024189473362, + "loss": 1.4814, + "step": 10166 + }, + { + "epoch": 0.36410192132075136, + "grad_norm": 1.507798671722412, + "learning_rate": 0.00014695000106548496, + "loss": 1.4728, + "step": 10167 + }, + { + "epoch": 0.36413773345031963, + "grad_norm": 2.0722126960754395, + "learning_rate": 0.0001469397596045846, + "loss": 1.5825, + "step": 10168 + }, + { + "epoch": 0.3641735455798879, + "grad_norm": 2.018157720565796, + "learning_rate": 0.0001469295175121703, + "loss": 1.6858, + "step": 10169 + }, + { + "epoch": 0.36420935770945617, + "grad_norm": 1.5147225856781006, + "learning_rate": 0.00014691927478837987, + "loss": 1.5474, + "step": 10170 + }, + { + "epoch": 0.3642451698390245, + "grad_norm": 1.813011646270752, + "learning_rate": 0.00014690903143335117, + "loss": 1.454, + "step": 10171 + }, + { + "epoch": 0.36428098196859277, + "grad_norm": 1.3621699810028076, + "learning_rate": 0.00014689878744722192, + "loss": 1.4512, + "step": 10172 + }, + { + "epoch": 0.36431679409816103, + "grad_norm": 1.4398069381713867, + "learning_rate": 0.00014688854283013001, + "loss": 1.4105, + "step": 10173 + }, + { + "epoch": 0.36435260622772936, + "grad_norm": 1.4563603401184082, + "learning_rate": 0.0001468782975822132, + "loss": 1.3174, + "step": 10174 + }, + { + "epoch": 0.3643884183572976, + "grad_norm": 1.7026748657226562, + "learning_rate": 0.0001468680517036094, + "loss": 1.642, + "step": 10175 + }, + { + "epoch": 0.3644242304868659, + "grad_norm": 1.738409399986267, + "learning_rate": 0.0001468578051944564, + "loss": 1.4977, + "step": 10176 + }, + { + "epoch": 0.36446004261643417, + "grad_norm": 2.2785589694976807, + "learning_rate": 0.00014684755805489206, + "loss": 1.5793, + "step": 10177 + }, + { + "epoch": 0.3644958547460025, + "grad_norm": 1.4905781745910645, + "learning_rate": 0.0001468373102850543, + "loss": 1.4412, + "step": 10178 + }, + { + "epoch": 0.36453166687557076, + "grad_norm": 1.4216437339782715, + "learning_rate": 0.0001468270618850809, + "loss": 1.7233, + "step": 10179 + }, + { + "epoch": 0.36456747900513903, + "grad_norm": 2.1683859825134277, + "learning_rate": 0.0001468168128551098, + "loss": 1.7547, + "step": 10180 + }, + { + "epoch": 0.36460329113470735, + "grad_norm": 2.0887510776519775, + "learning_rate": 0.00014680656319527886, + "loss": 1.317, + "step": 10181 + }, + { + "epoch": 0.3646391032642756, + "grad_norm": 1.405361533164978, + "learning_rate": 0.00014679631290572602, + "loss": 1.423, + "step": 10182 + }, + { + "epoch": 0.3646749153938439, + "grad_norm": 1.915013313293457, + "learning_rate": 0.00014678606198658916, + "loss": 1.3918, + "step": 10183 + }, + { + "epoch": 0.36471072752341216, + "grad_norm": 1.6929726600646973, + "learning_rate": 0.00014677581043800615, + "loss": 1.5082, + "step": 10184 + }, + { + "epoch": 0.3647465396529805, + "grad_norm": 1.6973090171813965, + "learning_rate": 0.00014676555826011496, + "loss": 1.2838, + "step": 10185 + }, + { + "epoch": 0.36478235178254875, + "grad_norm": 1.729513168334961, + "learning_rate": 0.0001467553054530535, + "loss": 1.9042, + "step": 10186 + }, + { + "epoch": 0.364818163912117, + "grad_norm": 1.4617276191711426, + "learning_rate": 0.00014674505201695971, + "loss": 1.4484, + "step": 10187 + }, + { + "epoch": 0.36485397604168535, + "grad_norm": 1.501288890838623, + "learning_rate": 0.00014673479795197154, + "loss": 1.1133, + "step": 10188 + }, + { + "epoch": 0.3648897881712536, + "grad_norm": 1.6365547180175781, + "learning_rate": 0.00014672454325822696, + "loss": 1.5686, + "step": 10189 + }, + { + "epoch": 0.3649256003008219, + "grad_norm": 1.5337694883346558, + "learning_rate": 0.00014671428793586392, + "loss": 1.3405, + "step": 10190 + }, + { + "epoch": 0.36496141243039015, + "grad_norm": 3.3464503288269043, + "learning_rate": 0.0001467040319850204, + "loss": 1.5726, + "step": 10191 + }, + { + "epoch": 0.3649972245599585, + "grad_norm": 1.6443172693252563, + "learning_rate": 0.0001466937754058344, + "loss": 1.6067, + "step": 10192 + }, + { + "epoch": 0.36503303668952675, + "grad_norm": 1.2222121953964233, + "learning_rate": 0.00014668351819844384, + "loss": 1.3071, + "step": 10193 + }, + { + "epoch": 0.365068848819095, + "grad_norm": 1.9840507507324219, + "learning_rate": 0.00014667326036298675, + "loss": 1.7483, + "step": 10194 + }, + { + "epoch": 0.36510466094866334, + "grad_norm": 1.4311078786849976, + "learning_rate": 0.00014666300189960116, + "loss": 1.1324, + "step": 10195 + }, + { + "epoch": 0.3651404730782316, + "grad_norm": 1.5631734132766724, + "learning_rate": 0.00014665274280842508, + "loss": 1.4826, + "step": 10196 + }, + { + "epoch": 0.3651762852077999, + "grad_norm": 1.5126618146896362, + "learning_rate": 0.0001466424830895965, + "loss": 1.4611, + "step": 10197 + }, + { + "epoch": 0.36521209733736815, + "grad_norm": 1.7029465436935425, + "learning_rate": 0.00014663222274325353, + "loss": 1.2322, + "step": 10198 + }, + { + "epoch": 0.36524790946693647, + "grad_norm": 1.7353816032409668, + "learning_rate": 0.0001466219617695341, + "loss": 1.2907, + "step": 10199 + }, + { + "epoch": 0.36528372159650474, + "grad_norm": 1.881523847579956, + "learning_rate": 0.00014661170016857633, + "loss": 1.6683, + "step": 10200 + }, + { + "epoch": 0.365319533726073, + "grad_norm": 1.5477781295776367, + "learning_rate": 0.00014660143794051827, + "loss": 1.7577, + "step": 10201 + }, + { + "epoch": 0.3653553458556413, + "grad_norm": 2.418309211730957, + "learning_rate": 0.00014659117508549797, + "loss": 1.4513, + "step": 10202 + }, + { + "epoch": 0.3653911579852096, + "grad_norm": 1.7008320093154907, + "learning_rate": 0.0001465809116036535, + "loss": 1.6526, + "step": 10203 + }, + { + "epoch": 0.36542697011477787, + "grad_norm": 1.8753713369369507, + "learning_rate": 0.00014657064749512295, + "loss": 1.8226, + "step": 10204 + }, + { + "epoch": 0.36546278224434614, + "grad_norm": 1.8317325115203857, + "learning_rate": 0.0001465603827600444, + "loss": 1.5395, + "step": 10205 + }, + { + "epoch": 0.36549859437391446, + "grad_norm": 1.8578588962554932, + "learning_rate": 0.00014655011739855595, + "loss": 1.5256, + "step": 10206 + }, + { + "epoch": 0.36553440650348273, + "grad_norm": 1.3821361064910889, + "learning_rate": 0.00014653985141079576, + "loss": 1.4077, + "step": 10207 + }, + { + "epoch": 0.365570218633051, + "grad_norm": 1.7334470748901367, + "learning_rate": 0.00014652958479690185, + "loss": 1.2891, + "step": 10208 + }, + { + "epoch": 0.36560603076261927, + "grad_norm": 1.7649697065353394, + "learning_rate": 0.00014651931755701246, + "loss": 1.4587, + "step": 10209 + }, + { + "epoch": 0.3656418428921876, + "grad_norm": 1.560272455215454, + "learning_rate": 0.0001465090496912656, + "loss": 1.2729, + "step": 10210 + }, + { + "epoch": 0.36567765502175587, + "grad_norm": 2.465453624725342, + "learning_rate": 0.0001464987811997995, + "loss": 1.7935, + "step": 10211 + }, + { + "epoch": 0.36571346715132413, + "grad_norm": 1.8573144674301147, + "learning_rate": 0.00014648851208275224, + "loss": 1.7883, + "step": 10212 + }, + { + "epoch": 0.36574927928089246, + "grad_norm": 1.308641791343689, + "learning_rate": 0.00014647824234026205, + "loss": 1.2575, + "step": 10213 + }, + { + "epoch": 0.3657850914104607, + "grad_norm": 2.2015023231506348, + "learning_rate": 0.00014646797197246706, + "loss": 1.302, + "step": 10214 + }, + { + "epoch": 0.365820903540029, + "grad_norm": 1.6897261142730713, + "learning_rate": 0.00014645770097950544, + "loss": 1.527, + "step": 10215 + }, + { + "epoch": 0.36585671566959727, + "grad_norm": 1.403554916381836, + "learning_rate": 0.00014644742936151538, + "loss": 1.525, + "step": 10216 + }, + { + "epoch": 0.3658925277991656, + "grad_norm": 1.6537847518920898, + "learning_rate": 0.00014643715711863507, + "loss": 1.566, + "step": 10217 + }, + { + "epoch": 0.36592833992873386, + "grad_norm": 1.7466622591018677, + "learning_rate": 0.00014642688425100273, + "loss": 1.8627, + "step": 10218 + }, + { + "epoch": 0.36596415205830213, + "grad_norm": 1.805519700050354, + "learning_rate": 0.00014641661075875652, + "loss": 1.373, + "step": 10219 + }, + { + "epoch": 0.36599996418787045, + "grad_norm": 1.6405415534973145, + "learning_rate": 0.0001464063366420347, + "loss": 1.498, + "step": 10220 + }, + { + "epoch": 0.3660357763174387, + "grad_norm": 1.6992943286895752, + "learning_rate": 0.00014639606190097547, + "loss": 1.4758, + "step": 10221 + }, + { + "epoch": 0.366071588447007, + "grad_norm": 1.6012146472930908, + "learning_rate": 0.00014638578653571708, + "loss": 1.5874, + "step": 10222 + }, + { + "epoch": 0.36610740057657526, + "grad_norm": 1.8396260738372803, + "learning_rate": 0.00014637551054639774, + "loss": 1.5488, + "step": 10223 + }, + { + "epoch": 0.3661432127061436, + "grad_norm": 2.8932039737701416, + "learning_rate": 0.00014636523393315578, + "loss": 1.3023, + "step": 10224 + }, + { + "epoch": 0.36617902483571185, + "grad_norm": 1.3132829666137695, + "learning_rate": 0.00014635495669612934, + "loss": 1.5124, + "step": 10225 + }, + { + "epoch": 0.3662148369652801, + "grad_norm": 1.9502285718917847, + "learning_rate": 0.0001463446788354568, + "loss": 1.6832, + "step": 10226 + }, + { + "epoch": 0.36625064909484845, + "grad_norm": 1.5486946105957031, + "learning_rate": 0.00014633440035127638, + "loss": 1.5737, + "step": 10227 + }, + { + "epoch": 0.3662864612244167, + "grad_norm": 1.6194562911987305, + "learning_rate": 0.00014632412124372635, + "loss": 1.3977, + "step": 10228 + }, + { + "epoch": 0.366322273353985, + "grad_norm": 1.5558431148529053, + "learning_rate": 0.00014631384151294507, + "loss": 1.8146, + "step": 10229 + }, + { + "epoch": 0.36635808548355325, + "grad_norm": 1.5941044092178345, + "learning_rate": 0.00014630356115907073, + "loss": 1.6239, + "step": 10230 + }, + { + "epoch": 0.3663938976131216, + "grad_norm": 1.5189129114151, + "learning_rate": 0.00014629328018224175, + "loss": 1.4244, + "step": 10231 + }, + { + "epoch": 0.36642970974268985, + "grad_norm": 1.5772870779037476, + "learning_rate": 0.00014628299858259638, + "loss": 1.4397, + "step": 10232 + }, + { + "epoch": 0.3664655218722581, + "grad_norm": 1.4384115934371948, + "learning_rate": 0.00014627271636027297, + "loss": 1.5807, + "step": 10233 + }, + { + "epoch": 0.36650133400182644, + "grad_norm": 1.3554954528808594, + "learning_rate": 0.00014626243351540983, + "loss": 1.106, + "step": 10234 + }, + { + "epoch": 0.3665371461313947, + "grad_norm": 1.87827730178833, + "learning_rate": 0.00014625215004814533, + "loss": 1.7206, + "step": 10235 + }, + { + "epoch": 0.366572958260963, + "grad_norm": 2.0337166786193848, + "learning_rate": 0.00014624186595861785, + "loss": 1.4494, + "step": 10236 + }, + { + "epoch": 0.36660877039053125, + "grad_norm": 1.458093523979187, + "learning_rate": 0.00014623158124696565, + "loss": 1.6958, + "step": 10237 + }, + { + "epoch": 0.36664458252009957, + "grad_norm": 1.5604604482650757, + "learning_rate": 0.00014622129591332722, + "loss": 1.6612, + "step": 10238 + }, + { + "epoch": 0.36668039464966784, + "grad_norm": 1.7306214570999146, + "learning_rate": 0.0001462110099578408, + "loss": 1.6922, + "step": 10239 + }, + { + "epoch": 0.3667162067792361, + "grad_norm": 1.6424694061279297, + "learning_rate": 0.0001462007233806449, + "loss": 1.4857, + "step": 10240 + }, + { + "epoch": 0.36675201890880443, + "grad_norm": 1.6787950992584229, + "learning_rate": 0.00014619043618187784, + "loss": 1.0656, + "step": 10241 + }, + { + "epoch": 0.3667878310383727, + "grad_norm": 2.0720412731170654, + "learning_rate": 0.00014618014836167807, + "loss": 1.6523, + "step": 10242 + }, + { + "epoch": 0.36682364316794097, + "grad_norm": 1.870436191558838, + "learning_rate": 0.00014616985992018394, + "loss": 1.7099, + "step": 10243 + }, + { + "epoch": 0.36685945529750924, + "grad_norm": 1.4863344430923462, + "learning_rate": 0.00014615957085753394, + "loss": 1.5977, + "step": 10244 + }, + { + "epoch": 0.36689526742707756, + "grad_norm": 1.7124794721603394, + "learning_rate": 0.00014614928117386643, + "loss": 1.3756, + "step": 10245 + }, + { + "epoch": 0.36693107955664583, + "grad_norm": 4.074779987335205, + "learning_rate": 0.0001461389908693199, + "loss": 1.6485, + "step": 10246 + }, + { + "epoch": 0.3669668916862141, + "grad_norm": 1.4112684726715088, + "learning_rate": 0.0001461286999440327, + "loss": 1.0859, + "step": 10247 + }, + { + "epoch": 0.3670027038157824, + "grad_norm": 1.4489010572433472, + "learning_rate": 0.00014611840839814336, + "loss": 1.3586, + "step": 10248 + }, + { + "epoch": 0.3670385159453507, + "grad_norm": 1.4673880338668823, + "learning_rate": 0.00014610811623179038, + "loss": 1.3926, + "step": 10249 + }, + { + "epoch": 0.36707432807491897, + "grad_norm": 1.5906109809875488, + "learning_rate": 0.00014609782344511213, + "loss": 1.2522, + "step": 10250 + }, + { + "epoch": 0.36711014020448723, + "grad_norm": 1.204659342765808, + "learning_rate": 0.0001460875300382471, + "loss": 1.2822, + "step": 10251 + }, + { + "epoch": 0.36714595233405556, + "grad_norm": 1.5988489389419556, + "learning_rate": 0.00014607723601133384, + "loss": 1.4988, + "step": 10252 + }, + { + "epoch": 0.3671817644636238, + "grad_norm": 1.5991809368133545, + "learning_rate": 0.00014606694136451082, + "loss": 1.7249, + "step": 10253 + }, + { + "epoch": 0.3672175765931921, + "grad_norm": 1.5101706981658936, + "learning_rate": 0.0001460566460979165, + "loss": 1.5289, + "step": 10254 + }, + { + "epoch": 0.3672533887227604, + "grad_norm": 1.951116681098938, + "learning_rate": 0.00014604635021168942, + "loss": 1.5754, + "step": 10255 + }, + { + "epoch": 0.3672892008523287, + "grad_norm": 2.0610105991363525, + "learning_rate": 0.00014603605370596808, + "loss": 1.1101, + "step": 10256 + }, + { + "epoch": 0.36732501298189696, + "grad_norm": 1.4582083225250244, + "learning_rate": 0.00014602575658089105, + "loss": 1.5282, + "step": 10257 + }, + { + "epoch": 0.36736082511146523, + "grad_norm": 1.7049942016601562, + "learning_rate": 0.0001460154588365968, + "loss": 1.4164, + "step": 10258 + }, + { + "epoch": 0.36739663724103355, + "grad_norm": 1.7061821222305298, + "learning_rate": 0.00014600516047322392, + "loss": 1.6465, + "step": 10259 + }, + { + "epoch": 0.3674324493706018, + "grad_norm": 2.0501723289489746, + "learning_rate": 0.00014599486149091096, + "loss": 1.4027, + "step": 10260 + }, + { + "epoch": 0.3674682615001701, + "grad_norm": 1.5676246881484985, + "learning_rate": 0.00014598456188979643, + "loss": 1.7521, + "step": 10261 + }, + { + "epoch": 0.3675040736297384, + "grad_norm": 1.6521248817443848, + "learning_rate": 0.000145974261670019, + "loss": 1.4259, + "step": 10262 + }, + { + "epoch": 0.3675398857593067, + "grad_norm": 1.571914792060852, + "learning_rate": 0.00014596396083171715, + "loss": 1.7523, + "step": 10263 + }, + { + "epoch": 0.36757569788887495, + "grad_norm": 1.5955133438110352, + "learning_rate": 0.0001459536593750295, + "loss": 1.3922, + "step": 10264 + }, + { + "epoch": 0.3676115100184432, + "grad_norm": 1.9849090576171875, + "learning_rate": 0.00014594335730009462, + "loss": 1.5897, + "step": 10265 + }, + { + "epoch": 0.36764732214801155, + "grad_norm": 1.507638692855835, + "learning_rate": 0.00014593305460705114, + "loss": 1.5146, + "step": 10266 + }, + { + "epoch": 0.3676831342775798, + "grad_norm": 1.4899927377700806, + "learning_rate": 0.00014592275129603766, + "loss": 1.5611, + "step": 10267 + }, + { + "epoch": 0.3677189464071481, + "grad_norm": 1.6360082626342773, + "learning_rate": 0.00014591244736719282, + "loss": 1.5844, + "step": 10268 + }, + { + "epoch": 0.3677547585367164, + "grad_norm": 1.3872517347335815, + "learning_rate": 0.00014590214282065518, + "loss": 1.5452, + "step": 10269 + }, + { + "epoch": 0.3677905706662847, + "grad_norm": 1.434220790863037, + "learning_rate": 0.00014589183765656343, + "loss": 1.3571, + "step": 10270 + }, + { + "epoch": 0.36782638279585295, + "grad_norm": 1.7095938920974731, + "learning_rate": 0.00014588153187505625, + "loss": 1.4374, + "step": 10271 + }, + { + "epoch": 0.3678621949254212, + "grad_norm": 1.4860492944717407, + "learning_rate": 0.00014587122547627217, + "loss": 1.5456, + "step": 10272 + }, + { + "epoch": 0.36789800705498954, + "grad_norm": 1.56809663772583, + "learning_rate": 0.00014586091846034997, + "loss": 1.343, + "step": 10273 + }, + { + "epoch": 0.3679338191845578, + "grad_norm": 1.3190221786499023, + "learning_rate": 0.00014585061082742824, + "loss": 1.5654, + "step": 10274 + }, + { + "epoch": 0.3679696313141261, + "grad_norm": 1.5554225444793701, + "learning_rate": 0.0001458403025776457, + "loss": 1.6568, + "step": 10275 + }, + { + "epoch": 0.3680054434436944, + "grad_norm": 1.8499597311019897, + "learning_rate": 0.000145829993711141, + "loss": 1.606, + "step": 10276 + }, + { + "epoch": 0.36804125557326267, + "grad_norm": 1.9961286783218384, + "learning_rate": 0.00014581968422805287, + "loss": 1.6618, + "step": 10277 + }, + { + "epoch": 0.36807706770283094, + "grad_norm": 1.6830213069915771, + "learning_rate": 0.00014580937412852, + "loss": 1.4959, + "step": 10278 + }, + { + "epoch": 0.3681128798323992, + "grad_norm": 1.537229299545288, + "learning_rate": 0.0001457990634126811, + "loss": 1.5156, + "step": 10279 + }, + { + "epoch": 0.36814869196196753, + "grad_norm": 1.7691799402236938, + "learning_rate": 0.00014578875208067483, + "loss": 1.2807, + "step": 10280 + }, + { + "epoch": 0.3681845040915358, + "grad_norm": 1.5051552057266235, + "learning_rate": 0.00014577844013264, + "loss": 1.5256, + "step": 10281 + }, + { + "epoch": 0.36822031622110407, + "grad_norm": 1.39759361743927, + "learning_rate": 0.0001457681275687153, + "loss": 1.6506, + "step": 10282 + }, + { + "epoch": 0.3682561283506724, + "grad_norm": 1.8484997749328613, + "learning_rate": 0.00014575781438903946, + "loss": 1.5992, + "step": 10283 + }, + { + "epoch": 0.36829194048024066, + "grad_norm": 1.4711723327636719, + "learning_rate": 0.0001457475005937513, + "loss": 1.5188, + "step": 10284 + }, + { + "epoch": 0.36832775260980893, + "grad_norm": 2.3037071228027344, + "learning_rate": 0.0001457371861829895, + "loss": 1.5384, + "step": 10285 + }, + { + "epoch": 0.3683635647393772, + "grad_norm": 1.4197289943695068, + "learning_rate": 0.00014572687115689282, + "loss": 1.4293, + "step": 10286 + }, + { + "epoch": 0.3683993768689455, + "grad_norm": 1.5581529140472412, + "learning_rate": 0.0001457165555156001, + "loss": 1.2188, + "step": 10287 + }, + { + "epoch": 0.3684351889985138, + "grad_norm": 1.9231946468353271, + "learning_rate": 0.00014570623925925014, + "loss": 1.1728, + "step": 10288 + }, + { + "epoch": 0.36847100112808207, + "grad_norm": 1.9287388324737549, + "learning_rate": 0.00014569592238798163, + "loss": 1.4546, + "step": 10289 + }, + { + "epoch": 0.3685068132576504, + "grad_norm": 2.1329243183135986, + "learning_rate": 0.00014568560490193345, + "loss": 1.4835, + "step": 10290 + }, + { + "epoch": 0.36854262538721866, + "grad_norm": 1.5576348304748535, + "learning_rate": 0.0001456752868012444, + "loss": 1.4383, + "step": 10291 + }, + { + "epoch": 0.3685784375167869, + "grad_norm": 1.8817024230957031, + "learning_rate": 0.00014566496808605326, + "loss": 1.4858, + "step": 10292 + }, + { + "epoch": 0.3686142496463552, + "grad_norm": 1.5834522247314453, + "learning_rate": 0.00014565464875649888, + "loss": 1.5599, + "step": 10293 + }, + { + "epoch": 0.3686500617759235, + "grad_norm": 1.5673562288284302, + "learning_rate": 0.0001456443288127201, + "loss": 1.4004, + "step": 10294 + }, + { + "epoch": 0.3686858739054918, + "grad_norm": 1.6744060516357422, + "learning_rate": 0.00014563400825485576, + "loss": 1.5878, + "step": 10295 + }, + { + "epoch": 0.36872168603506006, + "grad_norm": 1.609439492225647, + "learning_rate": 0.00014562368708304467, + "loss": 1.4532, + "step": 10296 + }, + { + "epoch": 0.3687574981646284, + "grad_norm": 2.1061041355133057, + "learning_rate": 0.00014561336529742575, + "loss": 1.5246, + "step": 10297 + }, + { + "epoch": 0.36879331029419665, + "grad_norm": 1.7930060625076294, + "learning_rate": 0.00014560304289813785, + "loss": 1.3858, + "step": 10298 + }, + { + "epoch": 0.3688291224237649, + "grad_norm": 1.5626553297042847, + "learning_rate": 0.0001455927198853198, + "loss": 1.5151, + "step": 10299 + }, + { + "epoch": 0.3688649345533332, + "grad_norm": 2.6362035274505615, + "learning_rate": 0.00014558239625911052, + "loss": 1.9309, + "step": 10300 + }, + { + "epoch": 0.3689007466829015, + "grad_norm": 1.53107488155365, + "learning_rate": 0.00014557207201964893, + "loss": 1.4419, + "step": 10301 + }, + { + "epoch": 0.3689365588124698, + "grad_norm": 2.428128957748413, + "learning_rate": 0.00014556174716707384, + "loss": 1.6143, + "step": 10302 + }, + { + "epoch": 0.36897237094203805, + "grad_norm": 1.9132128953933716, + "learning_rate": 0.00014555142170152423, + "loss": 1.6772, + "step": 10303 + }, + { + "epoch": 0.3690081830716064, + "grad_norm": 1.4515597820281982, + "learning_rate": 0.00014554109562313903, + "loss": 1.3292, + "step": 10304 + }, + { + "epoch": 0.36904399520117465, + "grad_norm": 2.2293195724487305, + "learning_rate": 0.00014553076893205708, + "loss": 1.4524, + "step": 10305 + }, + { + "epoch": 0.3690798073307429, + "grad_norm": 1.7687851190567017, + "learning_rate": 0.00014552044162841743, + "loss": 1.2241, + "step": 10306 + }, + { + "epoch": 0.3691156194603112, + "grad_norm": 1.509236454963684, + "learning_rate": 0.0001455101137123589, + "loss": 1.6139, + "step": 10307 + }, + { + "epoch": 0.3691514315898795, + "grad_norm": 1.5203906297683716, + "learning_rate": 0.00014549978518402053, + "loss": 1.646, + "step": 10308 + }, + { + "epoch": 0.3691872437194478, + "grad_norm": 2.03283429145813, + "learning_rate": 0.0001454894560435412, + "loss": 1.3515, + "step": 10309 + }, + { + "epoch": 0.36922305584901605, + "grad_norm": 1.2739007472991943, + "learning_rate": 0.00014547912629105995, + "loss": 1.3494, + "step": 10310 + }, + { + "epoch": 0.36925886797858437, + "grad_norm": 1.7174979448318481, + "learning_rate": 0.00014546879592671573, + "loss": 1.5571, + "step": 10311 + }, + { + "epoch": 0.36929468010815264, + "grad_norm": 1.639888048171997, + "learning_rate": 0.00014545846495064748, + "loss": 1.6965, + "step": 10312 + }, + { + "epoch": 0.3693304922377209, + "grad_norm": 1.811215877532959, + "learning_rate": 0.0001454481333629942, + "loss": 1.5453, + "step": 10313 + }, + { + "epoch": 0.3693663043672892, + "grad_norm": 2.070788621902466, + "learning_rate": 0.00014543780116389496, + "loss": 1.5838, + "step": 10314 + }, + { + "epoch": 0.3694021164968575, + "grad_norm": 1.934891700744629, + "learning_rate": 0.0001454274683534887, + "loss": 1.2206, + "step": 10315 + }, + { + "epoch": 0.36943792862642577, + "grad_norm": 2.129443645477295, + "learning_rate": 0.00014541713493191444, + "loss": 1.5475, + "step": 10316 + }, + { + "epoch": 0.36947374075599404, + "grad_norm": 1.6835366487503052, + "learning_rate": 0.00014540680089931125, + "loss": 1.4861, + "step": 10317 + }, + { + "epoch": 0.36950955288556236, + "grad_norm": 1.4131048917770386, + "learning_rate": 0.00014539646625581805, + "loss": 1.1092, + "step": 10318 + }, + { + "epoch": 0.36954536501513063, + "grad_norm": 2.283271551132202, + "learning_rate": 0.00014538613100157404, + "loss": 1.4867, + "step": 10319 + }, + { + "epoch": 0.3695811771446989, + "grad_norm": 1.9396806955337524, + "learning_rate": 0.0001453757951367181, + "loss": 1.6176, + "step": 10320 + }, + { + "epoch": 0.36961698927426717, + "grad_norm": 1.417442798614502, + "learning_rate": 0.00014536545866138941, + "loss": 1.3439, + "step": 10321 + }, + { + "epoch": 0.3696528014038355, + "grad_norm": 1.7332024574279785, + "learning_rate": 0.000145355121575727, + "loss": 1.4305, + "step": 10322 + }, + { + "epoch": 0.36968861353340376, + "grad_norm": 1.3164316415786743, + "learning_rate": 0.00014534478387986992, + "loss": 1.3904, + "step": 10323 + }, + { + "epoch": 0.36972442566297203, + "grad_norm": 2.070188045501709, + "learning_rate": 0.0001453344455739573, + "loss": 1.4226, + "step": 10324 + }, + { + "epoch": 0.36976023779254036, + "grad_norm": 1.8793950080871582, + "learning_rate": 0.0001453241066581281, + "loss": 1.3511, + "step": 10325 + }, + { + "epoch": 0.3697960499221086, + "grad_norm": 1.7022724151611328, + "learning_rate": 0.0001453137671325216, + "loss": 1.4012, + "step": 10326 + }, + { + "epoch": 0.3698318620516769, + "grad_norm": 1.8162293434143066, + "learning_rate": 0.00014530342699727676, + "loss": 1.2831, + "step": 10327 + }, + { + "epoch": 0.36986767418124517, + "grad_norm": 1.9489846229553223, + "learning_rate": 0.0001452930862525328, + "loss": 1.5896, + "step": 10328 + }, + { + "epoch": 0.3699034863108135, + "grad_norm": 1.9331752061843872, + "learning_rate": 0.00014528274489842872, + "loss": 1.3076, + "step": 10329 + }, + { + "epoch": 0.36993929844038176, + "grad_norm": 1.6232823133468628, + "learning_rate": 0.00014527240293510377, + "loss": 1.3819, + "step": 10330 + }, + { + "epoch": 0.36997511056995, + "grad_norm": 2.704481840133667, + "learning_rate": 0.000145262060362697, + "loss": 1.7733, + "step": 10331 + }, + { + "epoch": 0.37001092269951835, + "grad_norm": 1.7846752405166626, + "learning_rate": 0.00014525171718134762, + "loss": 1.5951, + "step": 10332 + }, + { + "epoch": 0.3700467348290866, + "grad_norm": 1.3831145763397217, + "learning_rate": 0.00014524137339119478, + "loss": 1.5853, + "step": 10333 + }, + { + "epoch": 0.3700825469586549, + "grad_norm": 2.4267773628234863, + "learning_rate": 0.00014523102899237754, + "loss": 1.3878, + "step": 10334 + }, + { + "epoch": 0.37011835908822316, + "grad_norm": 1.7978699207305908, + "learning_rate": 0.00014522068398503522, + "loss": 1.6433, + "step": 10335 + }, + { + "epoch": 0.3701541712177915, + "grad_norm": 1.7147051095962524, + "learning_rate": 0.00014521033836930689, + "loss": 1.0625, + "step": 10336 + }, + { + "epoch": 0.37018998334735975, + "grad_norm": 2.743452310562134, + "learning_rate": 0.0001451999921453318, + "loss": 1.5351, + "step": 10337 + }, + { + "epoch": 0.370225795476928, + "grad_norm": 1.5962437391281128, + "learning_rate": 0.00014518964531324907, + "loss": 1.4559, + "step": 10338 + }, + { + "epoch": 0.37026160760649635, + "grad_norm": 1.8143364191055298, + "learning_rate": 0.000145179297873198, + "loss": 1.9998, + "step": 10339 + }, + { + "epoch": 0.3702974197360646, + "grad_norm": 1.644858956336975, + "learning_rate": 0.00014516894982531775, + "loss": 1.4414, + "step": 10340 + }, + { + "epoch": 0.3703332318656329, + "grad_norm": 1.4432719945907593, + "learning_rate": 0.00014515860116974752, + "loss": 1.5982, + "step": 10341 + }, + { + "epoch": 0.37036904399520115, + "grad_norm": 1.5924537181854248, + "learning_rate": 0.0001451482519066266, + "loss": 1.3127, + "step": 10342 + }, + { + "epoch": 0.3704048561247695, + "grad_norm": 1.6284945011138916, + "learning_rate": 0.00014513790203609416, + "loss": 1.2715, + "step": 10343 + }, + { + "epoch": 0.37044066825433775, + "grad_norm": 1.8258150815963745, + "learning_rate": 0.0001451275515582895, + "loss": 1.7172, + "step": 10344 + }, + { + "epoch": 0.370476480383906, + "grad_norm": 1.3459279537200928, + "learning_rate": 0.0001451172004733518, + "loss": 1.6076, + "step": 10345 + }, + { + "epoch": 0.37051229251347434, + "grad_norm": 1.891798496246338, + "learning_rate": 0.00014510684878142038, + "loss": 1.7049, + "step": 10346 + }, + { + "epoch": 0.3705481046430426, + "grad_norm": 1.2938027381896973, + "learning_rate": 0.00014509649648263449, + "loss": 1.6065, + "step": 10347 + }, + { + "epoch": 0.3705839167726109, + "grad_norm": 1.8871210813522339, + "learning_rate": 0.00014508614357713342, + "loss": 1.4965, + "step": 10348 + }, + { + "epoch": 0.37061972890217915, + "grad_norm": 1.41502046585083, + "learning_rate": 0.00014507579006505642, + "loss": 1.4462, + "step": 10349 + }, + { + "epoch": 0.37065554103174747, + "grad_norm": 1.7434496879577637, + "learning_rate": 0.00014506543594654288, + "loss": 1.2558, + "step": 10350 + }, + { + "epoch": 0.37069135316131574, + "grad_norm": 1.9141662120819092, + "learning_rate": 0.00014505508122173198, + "loss": 1.607, + "step": 10351 + }, + { + "epoch": 0.370727165290884, + "grad_norm": 1.4460514783859253, + "learning_rate": 0.00014504472589076307, + "loss": 1.8445, + "step": 10352 + }, + { + "epoch": 0.37076297742045233, + "grad_norm": 1.3187059164047241, + "learning_rate": 0.00014503436995377548, + "loss": 1.6002, + "step": 10353 + }, + { + "epoch": 0.3707987895500206, + "grad_norm": 1.4016839265823364, + "learning_rate": 0.00014502401341090853, + "loss": 1.3959, + "step": 10354 + }, + { + "epoch": 0.37083460167958887, + "grad_norm": 1.599345088005066, + "learning_rate": 0.00014501365626230157, + "loss": 1.9364, + "step": 10355 + }, + { + "epoch": 0.37087041380915714, + "grad_norm": 2.149724006652832, + "learning_rate": 0.00014500329850809394, + "loss": 1.5702, + "step": 10356 + }, + { + "epoch": 0.37090622593872546, + "grad_norm": 1.2839821577072144, + "learning_rate": 0.00014499294014842494, + "loss": 1.1688, + "step": 10357 + }, + { + "epoch": 0.37094203806829373, + "grad_norm": 1.5366618633270264, + "learning_rate": 0.000144982581183434, + "loss": 1.2508, + "step": 10358 + }, + { + "epoch": 0.370977850197862, + "grad_norm": 1.925222635269165, + "learning_rate": 0.00014497222161326045, + "loss": 1.4266, + "step": 10359 + }, + { + "epoch": 0.3710136623274303, + "grad_norm": 1.6245794296264648, + "learning_rate": 0.00014496186143804366, + "loss": 1.5413, + "step": 10360 + }, + { + "epoch": 0.3710494744569986, + "grad_norm": 1.6511958837509155, + "learning_rate": 0.000144951500657923, + "loss": 1.4162, + "step": 10361 + }, + { + "epoch": 0.37108528658656686, + "grad_norm": 1.8239529132843018, + "learning_rate": 0.00014494113927303792, + "loss": 1.4576, + "step": 10362 + }, + { + "epoch": 0.37112109871613513, + "grad_norm": 2.1355695724487305, + "learning_rate": 0.00014493077728352778, + "loss": 1.6973, + "step": 10363 + }, + { + "epoch": 0.37115691084570346, + "grad_norm": 1.5001815557479858, + "learning_rate": 0.00014492041468953194, + "loss": 1.207, + "step": 10364 + }, + { + "epoch": 0.3711927229752717, + "grad_norm": 1.4016623497009277, + "learning_rate": 0.0001449100514911899, + "loss": 1.3825, + "step": 10365 + }, + { + "epoch": 0.37122853510484, + "grad_norm": 2.0167899131774902, + "learning_rate": 0.00014489968768864107, + "loss": 1.6569, + "step": 10366 + }, + { + "epoch": 0.3712643472344083, + "grad_norm": 1.745682716369629, + "learning_rate": 0.00014488932328202484, + "loss": 1.574, + "step": 10367 + }, + { + "epoch": 0.3713001593639766, + "grad_norm": 1.8677984476089478, + "learning_rate": 0.00014487895827148067, + "loss": 1.8146, + "step": 10368 + }, + { + "epoch": 0.37133597149354486, + "grad_norm": 2.221468687057495, + "learning_rate": 0.00014486859265714798, + "loss": 1.5426, + "step": 10369 + }, + { + "epoch": 0.3713717836231131, + "grad_norm": 1.536044716835022, + "learning_rate": 0.00014485822643916626, + "loss": 1.2835, + "step": 10370 + }, + { + "epoch": 0.37140759575268145, + "grad_norm": 1.6062583923339844, + "learning_rate": 0.00014484785961767498, + "loss": 1.5294, + "step": 10371 + }, + { + "epoch": 0.3714434078822497, + "grad_norm": 1.858888030052185, + "learning_rate": 0.0001448374921928136, + "loss": 1.4026, + "step": 10372 + }, + { + "epoch": 0.371479220011818, + "grad_norm": 1.7808476686477661, + "learning_rate": 0.00014482712416472157, + "loss": 1.829, + "step": 10373 + }, + { + "epoch": 0.3715150321413863, + "grad_norm": 1.4914846420288086, + "learning_rate": 0.00014481675553353843, + "loss": 1.2286, + "step": 10374 + }, + { + "epoch": 0.3715508442709546, + "grad_norm": 1.6184641122817993, + "learning_rate": 0.00014480638629940366, + "loss": 1.7472, + "step": 10375 + }, + { + "epoch": 0.37158665640052285, + "grad_norm": 1.7213737964630127, + "learning_rate": 0.00014479601646245676, + "loss": 1.4159, + "step": 10376 + }, + { + "epoch": 0.3716224685300911, + "grad_norm": 1.4510760307312012, + "learning_rate": 0.00014478564602283725, + "loss": 1.4266, + "step": 10377 + }, + { + "epoch": 0.37165828065965945, + "grad_norm": 1.8478851318359375, + "learning_rate": 0.0001447752749806846, + "loss": 1.5385, + "step": 10378 + }, + { + "epoch": 0.3716940927892277, + "grad_norm": 1.6792529821395874, + "learning_rate": 0.00014476490333613842, + "loss": 1.5736, + "step": 10379 + }, + { + "epoch": 0.371729904918796, + "grad_norm": 1.8026201725006104, + "learning_rate": 0.00014475453108933817, + "loss": 1.7353, + "step": 10380 + }, + { + "epoch": 0.3717657170483643, + "grad_norm": 1.645272135734558, + "learning_rate": 0.00014474415824042346, + "loss": 1.2897, + "step": 10381 + }, + { + "epoch": 0.3718015291779326, + "grad_norm": 1.5104624032974243, + "learning_rate": 0.0001447337847895338, + "loss": 1.7482, + "step": 10382 + }, + { + "epoch": 0.37183734130750085, + "grad_norm": 2.0950675010681152, + "learning_rate": 0.00014472341073680883, + "loss": 2.0707, + "step": 10383 + }, + { + "epoch": 0.3718731534370691, + "grad_norm": 1.8873536586761475, + "learning_rate": 0.00014471303608238798, + "loss": 1.4105, + "step": 10384 + }, + { + "epoch": 0.37190896556663744, + "grad_norm": 2.1229586601257324, + "learning_rate": 0.00014470266082641095, + "loss": 1.9201, + "step": 10385 + }, + { + "epoch": 0.3719447776962057, + "grad_norm": 2.1284492015838623, + "learning_rate": 0.00014469228496901727, + "loss": 1.5545, + "step": 10386 + }, + { + "epoch": 0.371980589825774, + "grad_norm": 1.978696584701538, + "learning_rate": 0.00014468190851034656, + "loss": 1.6813, + "step": 10387 + }, + { + "epoch": 0.3720164019553423, + "grad_norm": 1.5713815689086914, + "learning_rate": 0.0001446715314505384, + "loss": 1.6417, + "step": 10388 + }, + { + "epoch": 0.37205221408491057, + "grad_norm": 1.5084996223449707, + "learning_rate": 0.00014466115378973236, + "loss": 1.4994, + "step": 10389 + }, + { + "epoch": 0.37208802621447884, + "grad_norm": 1.6603682041168213, + "learning_rate": 0.00014465077552806813, + "loss": 1.8155, + "step": 10390 + }, + { + "epoch": 0.3721238383440471, + "grad_norm": 1.6536749601364136, + "learning_rate": 0.00014464039666568532, + "loss": 1.1957, + "step": 10391 + }, + { + "epoch": 0.37215965047361543, + "grad_norm": 1.334580659866333, + "learning_rate": 0.00014463001720272357, + "loss": 1.4941, + "step": 10392 + }, + { + "epoch": 0.3721954626031837, + "grad_norm": 1.4311217069625854, + "learning_rate": 0.00014461963713932247, + "loss": 1.3092, + "step": 10393 + }, + { + "epoch": 0.37223127473275197, + "grad_norm": 1.9430100917816162, + "learning_rate": 0.00014460925647562174, + "loss": 1.2872, + "step": 10394 + }, + { + "epoch": 0.3722670868623203, + "grad_norm": 1.8805584907531738, + "learning_rate": 0.000144598875211761, + "loss": 1.7865, + "step": 10395 + }, + { + "epoch": 0.37230289899188856, + "grad_norm": 1.2648087739944458, + "learning_rate": 0.00014458849334787993, + "loss": 1.4374, + "step": 10396 + }, + { + "epoch": 0.37233871112145683, + "grad_norm": 1.6092779636383057, + "learning_rate": 0.00014457811088411816, + "loss": 1.559, + "step": 10397 + }, + { + "epoch": 0.3723745232510251, + "grad_norm": 1.3519525527954102, + "learning_rate": 0.00014456772782061545, + "loss": 1.2963, + "step": 10398 + }, + { + "epoch": 0.3724103353805934, + "grad_norm": 2.0180888175964355, + "learning_rate": 0.00014455734415751143, + "loss": 1.574, + "step": 10399 + }, + { + "epoch": 0.3724461475101617, + "grad_norm": 1.8862159252166748, + "learning_rate": 0.00014454695989494582, + "loss": 1.651, + "step": 10400 + }, + { + "epoch": 0.37248195963972996, + "grad_norm": 1.7211228609085083, + "learning_rate": 0.00014453657503305832, + "loss": 1.3319, + "step": 10401 + }, + { + "epoch": 0.37251777176929823, + "grad_norm": 2.443127155303955, + "learning_rate": 0.00014452618957198866, + "loss": 1.4359, + "step": 10402 + }, + { + "epoch": 0.37255358389886656, + "grad_norm": 1.24772047996521, + "learning_rate": 0.00014451580351187656, + "loss": 1.6141, + "step": 10403 + }, + { + "epoch": 0.3725893960284348, + "grad_norm": 1.9694045782089233, + "learning_rate": 0.00014450541685286173, + "loss": 1.6621, + "step": 10404 + }, + { + "epoch": 0.3726252081580031, + "grad_norm": 1.6941626071929932, + "learning_rate": 0.00014449502959508394, + "loss": 1.6966, + "step": 10405 + }, + { + "epoch": 0.3726610202875714, + "grad_norm": 1.5425902605056763, + "learning_rate": 0.00014448464173868293, + "loss": 1.4486, + "step": 10406 + }, + { + "epoch": 0.3726968324171397, + "grad_norm": 1.4285595417022705, + "learning_rate": 0.00014447425328379843, + "loss": 1.0123, + "step": 10407 + }, + { + "epoch": 0.37273264454670796, + "grad_norm": 1.7585430145263672, + "learning_rate": 0.00014446386423057022, + "loss": 1.2697, + "step": 10408 + }, + { + "epoch": 0.3727684566762762, + "grad_norm": 2.169542074203491, + "learning_rate": 0.00014445347457913807, + "loss": 1.3233, + "step": 10409 + }, + { + "epoch": 0.37280426880584455, + "grad_norm": 1.826337456703186, + "learning_rate": 0.00014444308432964175, + "loss": 1.2875, + "step": 10410 + }, + { + "epoch": 0.3728400809354128, + "grad_norm": 2.5263783931732178, + "learning_rate": 0.00014443269348222109, + "loss": 1.7066, + "step": 10411 + }, + { + "epoch": 0.3728758930649811, + "grad_norm": 1.9127196073532104, + "learning_rate": 0.00014442230203701582, + "loss": 1.4219, + "step": 10412 + }, + { + "epoch": 0.3729117051945494, + "grad_norm": 1.6260210275650024, + "learning_rate": 0.0001444119099941658, + "loss": 1.5465, + "step": 10413 + }, + { + "epoch": 0.3729475173241177, + "grad_norm": 1.7325187921524048, + "learning_rate": 0.0001444015173538108, + "loss": 1.5095, + "step": 10414 + }, + { + "epoch": 0.37298332945368595, + "grad_norm": 1.852473258972168, + "learning_rate": 0.00014439112411609065, + "loss": 1.4091, + "step": 10415 + }, + { + "epoch": 0.3730191415832542, + "grad_norm": 2.1807992458343506, + "learning_rate": 0.00014438073028114523, + "loss": 1.3722, + "step": 10416 + }, + { + "epoch": 0.37305495371282255, + "grad_norm": 1.6915746927261353, + "learning_rate": 0.00014437033584911428, + "loss": 1.638, + "step": 10417 + }, + { + "epoch": 0.3730907658423908, + "grad_norm": 2.2476296424865723, + "learning_rate": 0.00014435994082013772, + "loss": 1.4248, + "step": 10418 + }, + { + "epoch": 0.3731265779719591, + "grad_norm": 1.4829312562942505, + "learning_rate": 0.00014434954519435537, + "loss": 1.599, + "step": 10419 + }, + { + "epoch": 0.3731623901015274, + "grad_norm": 2.0151619911193848, + "learning_rate": 0.0001443391489719071, + "loss": 1.3809, + "step": 10420 + }, + { + "epoch": 0.3731982022310957, + "grad_norm": 2.2441935539245605, + "learning_rate": 0.0001443287521529328, + "loss": 1.5466, + "step": 10421 + }, + { + "epoch": 0.37323401436066395, + "grad_norm": 1.6749489307403564, + "learning_rate": 0.00014431835473757227, + "loss": 1.1317, + "step": 10422 + }, + { + "epoch": 0.3732698264902322, + "grad_norm": 1.493473768234253, + "learning_rate": 0.0001443079567259655, + "loss": 1.5646, + "step": 10423 + }, + { + "epoch": 0.37330563861980054, + "grad_norm": 2.4875717163085938, + "learning_rate": 0.00014429755811825226, + "loss": 1.505, + "step": 10424 + }, + { + "epoch": 0.3733414507493688, + "grad_norm": 1.3609141111373901, + "learning_rate": 0.00014428715891457255, + "loss": 1.3917, + "step": 10425 + }, + { + "epoch": 0.3733772628789371, + "grad_norm": 1.8194928169250488, + "learning_rate": 0.00014427675911506623, + "loss": 1.3385, + "step": 10426 + }, + { + "epoch": 0.3734130750085054, + "grad_norm": 1.7996721267700195, + "learning_rate": 0.00014426635871987327, + "loss": 1.5479, + "step": 10427 + }, + { + "epoch": 0.37344888713807367, + "grad_norm": 2.608707904815674, + "learning_rate": 0.0001442559577291335, + "loss": 1.3896, + "step": 10428 + }, + { + "epoch": 0.37348469926764194, + "grad_norm": 1.6226502656936646, + "learning_rate": 0.00014424555614298693, + "loss": 1.4873, + "step": 10429 + }, + { + "epoch": 0.3735205113972102, + "grad_norm": 1.5772932767868042, + "learning_rate": 0.0001442351539615735, + "loss": 1.2307, + "step": 10430 + }, + { + "epoch": 0.37355632352677853, + "grad_norm": 1.3217772245407104, + "learning_rate": 0.00014422475118503307, + "loss": 1.756, + "step": 10431 + }, + { + "epoch": 0.3735921356563468, + "grad_norm": 1.4471333026885986, + "learning_rate": 0.0001442143478135057, + "loss": 1.3107, + "step": 10432 + }, + { + "epoch": 0.37362794778591507, + "grad_norm": 1.474479079246521, + "learning_rate": 0.00014420394384713129, + "loss": 1.5484, + "step": 10433 + }, + { + "epoch": 0.3736637599154834, + "grad_norm": 1.8751987218856812, + "learning_rate": 0.00014419353928604988, + "loss": 1.4065, + "step": 10434 + }, + { + "epoch": 0.37369957204505166, + "grad_norm": 2.282353639602661, + "learning_rate": 0.00014418313413040138, + "loss": 1.1441, + "step": 10435 + }, + { + "epoch": 0.37373538417461993, + "grad_norm": 2.0015993118286133, + "learning_rate": 0.00014417272838032578, + "loss": 1.5908, + "step": 10436 + }, + { + "epoch": 0.3737711963041882, + "grad_norm": 2.5349526405334473, + "learning_rate": 0.00014416232203596312, + "loss": 1.5447, + "step": 10437 + }, + { + "epoch": 0.3738070084337565, + "grad_norm": 1.7229876518249512, + "learning_rate": 0.00014415191509745338, + "loss": 1.3676, + "step": 10438 + }, + { + "epoch": 0.3738428205633248, + "grad_norm": 2.226806640625, + "learning_rate": 0.0001441415075649366, + "loss": 1.4815, + "step": 10439 + }, + { + "epoch": 0.37387863269289306, + "grad_norm": 3.008434772491455, + "learning_rate": 0.00014413109943855275, + "loss": 1.2429, + "step": 10440 + }, + { + "epoch": 0.3739144448224614, + "grad_norm": 1.4525420665740967, + "learning_rate": 0.00014412069071844186, + "loss": 1.4308, + "step": 10441 + }, + { + "epoch": 0.37395025695202966, + "grad_norm": 1.8240820169448853, + "learning_rate": 0.00014411028140474402, + "loss": 1.6106, + "step": 10442 + }, + { + "epoch": 0.3739860690815979, + "grad_norm": 1.7192800045013428, + "learning_rate": 0.0001440998714975992, + "loss": 1.6079, + "step": 10443 + }, + { + "epoch": 0.3740218812111662, + "grad_norm": 1.9115632772445679, + "learning_rate": 0.00014408946099714754, + "loss": 1.6781, + "step": 10444 + }, + { + "epoch": 0.3740576933407345, + "grad_norm": 1.8034567832946777, + "learning_rate": 0.00014407904990352904, + "loss": 1.4781, + "step": 10445 + }, + { + "epoch": 0.3740935054703028, + "grad_norm": 1.9719936847686768, + "learning_rate": 0.00014406863821688374, + "loss": 1.7442, + "step": 10446 + }, + { + "epoch": 0.37412931759987106, + "grad_norm": 1.7410471439361572, + "learning_rate": 0.00014405822593735183, + "loss": 1.6857, + "step": 10447 + }, + { + "epoch": 0.3741651297294394, + "grad_norm": 1.7776700258255005, + "learning_rate": 0.0001440478130650733, + "loss": 1.1856, + "step": 10448 + }, + { + "epoch": 0.37420094185900765, + "grad_norm": 1.678805947303772, + "learning_rate": 0.00014403739960018824, + "loss": 1.3772, + "step": 10449 + }, + { + "epoch": 0.3742367539885759, + "grad_norm": 1.3966861963272095, + "learning_rate": 0.00014402698554283675, + "loss": 1.6829, + "step": 10450 + }, + { + "epoch": 0.3742725661181442, + "grad_norm": 2.3683388233184814, + "learning_rate": 0.00014401657089315904, + "loss": 1.6944, + "step": 10451 + }, + { + "epoch": 0.3743083782477125, + "grad_norm": 2.4242324829101562, + "learning_rate": 0.00014400615565129507, + "loss": 1.4101, + "step": 10452 + }, + { + "epoch": 0.3743441903772808, + "grad_norm": 2.579834461212158, + "learning_rate": 0.00014399573981738507, + "loss": 1.3847, + "step": 10453 + }, + { + "epoch": 0.37438000250684905, + "grad_norm": 1.703393578529358, + "learning_rate": 0.00014398532339156912, + "loss": 1.4959, + "step": 10454 + }, + { + "epoch": 0.3744158146364174, + "grad_norm": 2.0507161617279053, + "learning_rate": 0.00014397490637398742, + "loss": 1.2748, + "step": 10455 + }, + { + "epoch": 0.37445162676598565, + "grad_norm": 1.2460108995437622, + "learning_rate": 0.00014396448876478007, + "loss": 1.5361, + "step": 10456 + }, + { + "epoch": 0.3744874388955539, + "grad_norm": 1.6622123718261719, + "learning_rate": 0.00014395407056408722, + "loss": 1.5679, + "step": 10457 + }, + { + "epoch": 0.3745232510251222, + "grad_norm": 2.1253249645233154, + "learning_rate": 0.00014394365177204904, + "loss": 1.7294, + "step": 10458 + }, + { + "epoch": 0.3745590631546905, + "grad_norm": 2.055772542953491, + "learning_rate": 0.00014393323238880571, + "loss": 1.2525, + "step": 10459 + }, + { + "epoch": 0.3745948752842588, + "grad_norm": 1.6590421199798584, + "learning_rate": 0.00014392281241449743, + "loss": 1.4193, + "step": 10460 + }, + { + "epoch": 0.37463068741382705, + "grad_norm": 1.8132797479629517, + "learning_rate": 0.00014391239184926433, + "loss": 1.4351, + "step": 10461 + }, + { + "epoch": 0.37466649954339537, + "grad_norm": 1.3465622663497925, + "learning_rate": 0.00014390197069324667, + "loss": 1.6417, + "step": 10462 + }, + { + "epoch": 0.37470231167296364, + "grad_norm": 1.4198817014694214, + "learning_rate": 0.0001438915489465846, + "loss": 1.3433, + "step": 10463 + }, + { + "epoch": 0.3747381238025319, + "grad_norm": 2.0465641021728516, + "learning_rate": 0.0001438811266094184, + "loss": 1.8104, + "step": 10464 + }, + { + "epoch": 0.3747739359321002, + "grad_norm": 1.4621689319610596, + "learning_rate": 0.0001438707036818882, + "loss": 1.5587, + "step": 10465 + }, + { + "epoch": 0.3748097480616685, + "grad_norm": 1.6371957063674927, + "learning_rate": 0.00014386028016413426, + "loss": 1.4263, + "step": 10466 + }, + { + "epoch": 0.37484556019123677, + "grad_norm": 2.203371286392212, + "learning_rate": 0.00014384985605629685, + "loss": 1.5286, + "step": 10467 + }, + { + "epoch": 0.37488137232080504, + "grad_norm": 1.7377517223358154, + "learning_rate": 0.0001438394313585162, + "loss": 1.3761, + "step": 10468 + }, + { + "epoch": 0.37491718445037336, + "grad_norm": 1.7066103219985962, + "learning_rate": 0.00014382900607093254, + "loss": 1.6471, + "step": 10469 + }, + { + "epoch": 0.37495299657994163, + "grad_norm": 3.365816593170166, + "learning_rate": 0.00014381858019368613, + "loss": 1.805, + "step": 10470 + }, + { + "epoch": 0.3749888087095099, + "grad_norm": 1.8790528774261475, + "learning_rate": 0.00014380815372691728, + "loss": 1.3121, + "step": 10471 + }, + { + "epoch": 0.37502462083907817, + "grad_norm": 1.5260752439498901, + "learning_rate": 0.00014379772667076618, + "loss": 1.7436, + "step": 10472 + }, + { + "epoch": 0.3750604329686465, + "grad_norm": 1.6847316026687622, + "learning_rate": 0.0001437872990253732, + "loss": 1.5917, + "step": 10473 + }, + { + "epoch": 0.37509624509821476, + "grad_norm": 1.5153547525405884, + "learning_rate": 0.0001437768707908786, + "loss": 1.5685, + "step": 10474 + }, + { + "epoch": 0.37513205722778303, + "grad_norm": 1.3384404182434082, + "learning_rate": 0.00014376644196742263, + "loss": 1.3586, + "step": 10475 + }, + { + "epoch": 0.37516786935735136, + "grad_norm": 1.3099168539047241, + "learning_rate": 0.00014375601255514565, + "loss": 1.1538, + "step": 10476 + }, + { + "epoch": 0.3752036814869196, + "grad_norm": 2.159562110900879, + "learning_rate": 0.00014374558255418797, + "loss": 1.3775, + "step": 10477 + }, + { + "epoch": 0.3752394936164879, + "grad_norm": 1.8578730821609497, + "learning_rate": 0.00014373515196468991, + "loss": 1.4176, + "step": 10478 + }, + { + "epoch": 0.37527530574605616, + "grad_norm": 1.4514464139938354, + "learning_rate": 0.00014372472078679177, + "loss": 1.2886, + "step": 10479 + }, + { + "epoch": 0.3753111178756245, + "grad_norm": 2.7915217876434326, + "learning_rate": 0.00014371428902063395, + "loss": 1.6081, + "step": 10480 + }, + { + "epoch": 0.37534693000519276, + "grad_norm": 1.7059495449066162, + "learning_rate": 0.00014370385666635674, + "loss": 1.3397, + "step": 10481 + }, + { + "epoch": 0.375382742134761, + "grad_norm": 1.5151960849761963, + "learning_rate": 0.00014369342372410053, + "loss": 1.4587, + "step": 10482 + }, + { + "epoch": 0.37541855426432935, + "grad_norm": 1.5763031244277954, + "learning_rate": 0.00014368299019400563, + "loss": 1.0639, + "step": 10483 + }, + { + "epoch": 0.3754543663938976, + "grad_norm": 1.905612587928772, + "learning_rate": 0.0001436725560762125, + "loss": 1.7506, + "step": 10484 + }, + { + "epoch": 0.3754901785234659, + "grad_norm": 1.5665966272354126, + "learning_rate": 0.0001436621213708614, + "loss": 1.4628, + "step": 10485 + }, + { + "epoch": 0.37552599065303416, + "grad_norm": 1.8243300914764404, + "learning_rate": 0.0001436516860780928, + "loss": 1.5423, + "step": 10486 + }, + { + "epoch": 0.3755618027826025, + "grad_norm": 1.7763745784759521, + "learning_rate": 0.00014364125019804708, + "loss": 1.8964, + "step": 10487 + }, + { + "epoch": 0.37559761491217075, + "grad_norm": 2.380751848220825, + "learning_rate": 0.00014363081373086462, + "loss": 1.7111, + "step": 10488 + }, + { + "epoch": 0.375633427041739, + "grad_norm": 2.1618824005126953, + "learning_rate": 0.00014362037667668584, + "loss": 1.2592, + "step": 10489 + }, + { + "epoch": 0.37566923917130735, + "grad_norm": 2.174567222595215, + "learning_rate": 0.00014360993903565116, + "loss": 1.8956, + "step": 10490 + }, + { + "epoch": 0.3757050513008756, + "grad_norm": 1.3785721063613892, + "learning_rate": 0.00014359950080790101, + "loss": 1.5306, + "step": 10491 + }, + { + "epoch": 0.3757408634304439, + "grad_norm": 1.9032706022262573, + "learning_rate": 0.0001435890619935758, + "loss": 1.3604, + "step": 10492 + }, + { + "epoch": 0.37577667556001215, + "grad_norm": 1.70711088180542, + "learning_rate": 0.00014357862259281603, + "loss": 1.7252, + "step": 10493 + }, + { + "epoch": 0.3758124876895805, + "grad_norm": 1.943627119064331, + "learning_rate": 0.00014356818260576206, + "loss": 1.4448, + "step": 10494 + }, + { + "epoch": 0.37584829981914875, + "grad_norm": 1.4574229717254639, + "learning_rate": 0.0001435577420325544, + "loss": 1.5815, + "step": 10495 + }, + { + "epoch": 0.375884111948717, + "grad_norm": 1.5903644561767578, + "learning_rate": 0.0001435473008733335, + "loss": 1.5568, + "step": 10496 + }, + { + "epoch": 0.37591992407828534, + "grad_norm": 1.8567113876342773, + "learning_rate": 0.00014353685912823987, + "loss": 1.2216, + "step": 10497 + }, + { + "epoch": 0.3759557362078536, + "grad_norm": 1.382388949394226, + "learning_rate": 0.00014352641679741393, + "loss": 1.6755, + "step": 10498 + }, + { + "epoch": 0.3759915483374219, + "grad_norm": 1.329992651939392, + "learning_rate": 0.0001435159738809962, + "loss": 1.2925, + "step": 10499 + }, + { + "epoch": 0.37602736046699015, + "grad_norm": 2.657209873199463, + "learning_rate": 0.0001435055303791272, + "loss": 1.673, + "step": 10500 + }, + { + "epoch": 0.37606317259655847, + "grad_norm": 1.9933116436004639, + "learning_rate": 0.00014349508629194738, + "loss": 1.4519, + "step": 10501 + }, + { + "epoch": 0.37609898472612674, + "grad_norm": 1.554787516593933, + "learning_rate": 0.00014348464161959728, + "loss": 1.6938, + "step": 10502 + }, + { + "epoch": 0.376134796855695, + "grad_norm": 1.7749775648117065, + "learning_rate": 0.0001434741963622174, + "loss": 1.3281, + "step": 10503 + }, + { + "epoch": 0.37617060898526333, + "grad_norm": 1.529089331626892, + "learning_rate": 0.00014346375051994833, + "loss": 1.3901, + "step": 10504 + }, + { + "epoch": 0.3762064211148316, + "grad_norm": 1.5324797630310059, + "learning_rate": 0.00014345330409293053, + "loss": 1.4895, + "step": 10505 + }, + { + "epoch": 0.37624223324439987, + "grad_norm": 1.7260066270828247, + "learning_rate": 0.0001434428570813046, + "loss": 1.5439, + "step": 10506 + }, + { + "epoch": 0.37627804537396814, + "grad_norm": 1.5831547975540161, + "learning_rate": 0.00014343240948521104, + "loss": 1.4157, + "step": 10507 + }, + { + "epoch": 0.37631385750353646, + "grad_norm": 1.6812795400619507, + "learning_rate": 0.00014342196130479043, + "loss": 1.6985, + "step": 10508 + }, + { + "epoch": 0.37634966963310473, + "grad_norm": 1.7851736545562744, + "learning_rate": 0.0001434115125401834, + "loss": 1.6871, + "step": 10509 + }, + { + "epoch": 0.376385481762673, + "grad_norm": 1.9314453601837158, + "learning_rate": 0.00014340106319153038, + "loss": 1.6787, + "step": 10510 + }, + { + "epoch": 0.3764212938922413, + "grad_norm": 1.592041254043579, + "learning_rate": 0.0001433906132589721, + "loss": 1.3062, + "step": 10511 + }, + { + "epoch": 0.3764571060218096, + "grad_norm": 1.572988510131836, + "learning_rate": 0.00014338016274264905, + "loss": 1.7478, + "step": 10512 + }, + { + "epoch": 0.37649291815137786, + "grad_norm": 2.4121146202087402, + "learning_rate": 0.0001433697116427019, + "loss": 1.6346, + "step": 10513 + }, + { + "epoch": 0.37652873028094613, + "grad_norm": 1.4479660987854004, + "learning_rate": 0.0001433592599592712, + "loss": 1.4558, + "step": 10514 + }, + { + "epoch": 0.37656454241051446, + "grad_norm": 1.7201353311538696, + "learning_rate": 0.00014334880769249758, + "loss": 1.4518, + "step": 10515 + }, + { + "epoch": 0.3766003545400827, + "grad_norm": 1.6340991258621216, + "learning_rate": 0.00014333835484252167, + "loss": 1.6044, + "step": 10516 + }, + { + "epoch": 0.376636166669651, + "grad_norm": 1.3527189493179321, + "learning_rate": 0.00014332790140948414, + "loss": 1.4002, + "step": 10517 + }, + { + "epoch": 0.3766719787992193, + "grad_norm": 2.4372007846832275, + "learning_rate": 0.00014331744739352556, + "loss": 1.747, + "step": 10518 + }, + { + "epoch": 0.3767077909287876, + "grad_norm": 1.1843258142471313, + "learning_rate": 0.0001433069927947866, + "loss": 1.5292, + "step": 10519 + }, + { + "epoch": 0.37674360305835586, + "grad_norm": 2.6395835876464844, + "learning_rate": 0.0001432965376134079, + "loss": 1.8212, + "step": 10520 + }, + { + "epoch": 0.3767794151879241, + "grad_norm": 1.5669384002685547, + "learning_rate": 0.00014328608184953012, + "loss": 1.3947, + "step": 10521 + }, + { + "epoch": 0.37681522731749245, + "grad_norm": 1.5199379920959473, + "learning_rate": 0.000143275625503294, + "loss": 1.4586, + "step": 10522 + }, + { + "epoch": 0.3768510394470607, + "grad_norm": 1.574935793876648, + "learning_rate": 0.0001432651685748401, + "loss": 1.4656, + "step": 10523 + }, + { + "epoch": 0.376886851576629, + "grad_norm": 1.8273627758026123, + "learning_rate": 0.0001432547110643092, + "loss": 1.596, + "step": 10524 + }, + { + "epoch": 0.3769226637061973, + "grad_norm": 1.7247346639633179, + "learning_rate": 0.00014324425297184193, + "loss": 1.555, + "step": 10525 + }, + { + "epoch": 0.3769584758357656, + "grad_norm": 1.646228551864624, + "learning_rate": 0.00014323379429757906, + "loss": 1.6124, + "step": 10526 + }, + { + "epoch": 0.37699428796533385, + "grad_norm": 1.649630069732666, + "learning_rate": 0.00014322333504166124, + "loss": 1.8379, + "step": 10527 + }, + { + "epoch": 0.3770301000949021, + "grad_norm": 2.7177631855010986, + "learning_rate": 0.00014321287520422917, + "loss": 1.3449, + "step": 10528 + }, + { + "epoch": 0.37706591222447045, + "grad_norm": 1.322988748550415, + "learning_rate": 0.00014320241478542363, + "loss": 1.4162, + "step": 10529 + }, + { + "epoch": 0.3771017243540387, + "grad_norm": 2.104003429412842, + "learning_rate": 0.0001431919537853853, + "loss": 1.7102, + "step": 10530 + }, + { + "epoch": 0.377137536483607, + "grad_norm": 1.8676283359527588, + "learning_rate": 0.000143181492204255, + "loss": 1.4098, + "step": 10531 + }, + { + "epoch": 0.3771733486131753, + "grad_norm": 1.5684523582458496, + "learning_rate": 0.0001431710300421734, + "loss": 1.5385, + "step": 10532 + }, + { + "epoch": 0.3772091607427436, + "grad_norm": 1.7146109342575073, + "learning_rate": 0.00014316056729928126, + "loss": 1.7981, + "step": 10533 + }, + { + "epoch": 0.37724497287231185, + "grad_norm": 1.6780128479003906, + "learning_rate": 0.00014315010397571937, + "loss": 1.4944, + "step": 10534 + }, + { + "epoch": 0.3772807850018801, + "grad_norm": 3.0013227462768555, + "learning_rate": 0.0001431396400716285, + "loss": 2.0381, + "step": 10535 + }, + { + "epoch": 0.37731659713144844, + "grad_norm": 1.5567467212677002, + "learning_rate": 0.00014312917558714943, + "loss": 1.4914, + "step": 10536 + }, + { + "epoch": 0.3773524092610167, + "grad_norm": 1.4302515983581543, + "learning_rate": 0.00014311871052242293, + "loss": 1.5742, + "step": 10537 + }, + { + "epoch": 0.377388221390585, + "grad_norm": 2.07220196723938, + "learning_rate": 0.00014310824487758975, + "loss": 1.5267, + "step": 10538 + }, + { + "epoch": 0.3774240335201533, + "grad_norm": 1.5602160692214966, + "learning_rate": 0.00014309777865279078, + "loss": 1.1705, + "step": 10539 + }, + { + "epoch": 0.37745984564972157, + "grad_norm": 1.5262706279754639, + "learning_rate": 0.00014308731184816678, + "loss": 1.6653, + "step": 10540 + }, + { + "epoch": 0.37749565777928984, + "grad_norm": 1.5797264575958252, + "learning_rate": 0.00014307684446385855, + "loss": 1.4342, + "step": 10541 + }, + { + "epoch": 0.3775314699088581, + "grad_norm": 1.70090651512146, + "learning_rate": 0.000143066376500007, + "loss": 1.7305, + "step": 10542 + }, + { + "epoch": 0.37756728203842643, + "grad_norm": 1.5267583131790161, + "learning_rate": 0.00014305590795675286, + "loss": 1.5904, + "step": 10543 + }, + { + "epoch": 0.3776030941679947, + "grad_norm": 1.5385205745697021, + "learning_rate": 0.00014304543883423708, + "loss": 1.7465, + "step": 10544 + }, + { + "epoch": 0.37763890629756297, + "grad_norm": 2.824392795562744, + "learning_rate": 0.0001430349691326004, + "loss": 1.4748, + "step": 10545 + }, + { + "epoch": 0.3776747184271313, + "grad_norm": 1.8800132274627686, + "learning_rate": 0.00014302449885198373, + "loss": 1.4558, + "step": 10546 + }, + { + "epoch": 0.37771053055669956, + "grad_norm": 1.483883023262024, + "learning_rate": 0.00014301402799252793, + "loss": 1.384, + "step": 10547 + }, + { + "epoch": 0.37774634268626783, + "grad_norm": 1.6904969215393066, + "learning_rate": 0.00014300355655437385, + "loss": 1.6016, + "step": 10548 + }, + { + "epoch": 0.3777821548158361, + "grad_norm": 1.5325121879577637, + "learning_rate": 0.00014299308453766238, + "loss": 1.419, + "step": 10549 + }, + { + "epoch": 0.3778179669454044, + "grad_norm": 1.5163652896881104, + "learning_rate": 0.00014298261194253443, + "loss": 1.6518, + "step": 10550 + }, + { + "epoch": 0.3778537790749727, + "grad_norm": 1.606885313987732, + "learning_rate": 0.00014297213876913087, + "loss": 1.6129, + "step": 10551 + }, + { + "epoch": 0.37788959120454096, + "grad_norm": 2.0452921390533447, + "learning_rate": 0.00014296166501759263, + "loss": 1.5371, + "step": 10552 + }, + { + "epoch": 0.3779254033341093, + "grad_norm": 1.4756042957305908, + "learning_rate": 0.00014295119068806063, + "loss": 1.4163, + "step": 10553 + }, + { + "epoch": 0.37796121546367756, + "grad_norm": 1.4416539669036865, + "learning_rate": 0.00014294071578067568, + "loss": 1.5408, + "step": 10554 + }, + { + "epoch": 0.3779970275932458, + "grad_norm": 1.5078434944152832, + "learning_rate": 0.00014293024029557886, + "loss": 1.2568, + "step": 10555 + }, + { + "epoch": 0.3780328397228141, + "grad_norm": 1.6258286237716675, + "learning_rate": 0.000142919764232911, + "loss": 1.8241, + "step": 10556 + }, + { + "epoch": 0.3780686518523824, + "grad_norm": 1.5990259647369385, + "learning_rate": 0.0001429092875928131, + "loss": 1.4475, + "step": 10557 + }, + { + "epoch": 0.3781044639819507, + "grad_norm": 1.7794666290283203, + "learning_rate": 0.00014289881037542605, + "loss": 1.5112, + "step": 10558 + }, + { + "epoch": 0.37814027611151896, + "grad_norm": 1.70427668094635, + "learning_rate": 0.00014288833258089086, + "loss": 1.4345, + "step": 10559 + }, + { + "epoch": 0.3781760882410873, + "grad_norm": 1.3057318925857544, + "learning_rate": 0.00014287785420934846, + "loss": 1.4043, + "step": 10560 + }, + { + "epoch": 0.37821190037065555, + "grad_norm": 1.8462861776351929, + "learning_rate": 0.0001428673752609399, + "loss": 1.131, + "step": 10561 + }, + { + "epoch": 0.3782477125002238, + "grad_norm": 1.4198931455612183, + "learning_rate": 0.00014285689573580607, + "loss": 1.5534, + "step": 10562 + }, + { + "epoch": 0.3782835246297921, + "grad_norm": 1.9457460641860962, + "learning_rate": 0.00014284641563408796, + "loss": 1.711, + "step": 10563 + }, + { + "epoch": 0.3783193367593604, + "grad_norm": 1.85354483127594, + "learning_rate": 0.00014283593495592663, + "loss": 1.6055, + "step": 10564 + }, + { + "epoch": 0.3783551488889287, + "grad_norm": 3.7298476696014404, + "learning_rate": 0.000142825453701463, + "loss": 1.5535, + "step": 10565 + }, + { + "epoch": 0.37839096101849695, + "grad_norm": 1.84163498878479, + "learning_rate": 0.00014281497187083818, + "loss": 1.3187, + "step": 10566 + }, + { + "epoch": 0.3784267731480653, + "grad_norm": 1.7626897096633911, + "learning_rate": 0.00014280448946419312, + "loss": 1.4398, + "step": 10567 + }, + { + "epoch": 0.37846258527763355, + "grad_norm": 1.497876524925232, + "learning_rate": 0.0001427940064816689, + "loss": 1.5085, + "step": 10568 + }, + { + "epoch": 0.3784983974072018, + "grad_norm": 2.1282432079315186, + "learning_rate": 0.00014278352292340646, + "loss": 1.3299, + "step": 10569 + }, + { + "epoch": 0.3785342095367701, + "grad_norm": 1.8996422290802002, + "learning_rate": 0.00014277303878954694, + "loss": 1.7137, + "step": 10570 + }, + { + "epoch": 0.3785700216663384, + "grad_norm": 2.442035675048828, + "learning_rate": 0.00014276255408023138, + "loss": 1.4634, + "step": 10571 + }, + { + "epoch": 0.3786058337959067, + "grad_norm": 1.3051122426986694, + "learning_rate": 0.00014275206879560079, + "loss": 1.4763, + "step": 10572 + }, + { + "epoch": 0.37864164592547495, + "grad_norm": 1.575643539428711, + "learning_rate": 0.00014274158293579628, + "loss": 1.7568, + "step": 10573 + }, + { + "epoch": 0.37867745805504327, + "grad_norm": 1.7613773345947266, + "learning_rate": 0.00014273109650095886, + "loss": 1.4811, + "step": 10574 + }, + { + "epoch": 0.37871327018461154, + "grad_norm": 1.501792073249817, + "learning_rate": 0.0001427206094912297, + "loss": 1.4062, + "step": 10575 + }, + { + "epoch": 0.3787490823141798, + "grad_norm": 1.534451961517334, + "learning_rate": 0.00014271012190674983, + "loss": 1.3444, + "step": 10576 + }, + { + "epoch": 0.3787848944437481, + "grad_norm": 1.2604267597198486, + "learning_rate": 0.00014269963374766034, + "loss": 1.2716, + "step": 10577 + }, + { + "epoch": 0.3788207065733164, + "grad_norm": 2.2848007678985596, + "learning_rate": 0.00014268914501410239, + "loss": 1.5, + "step": 10578 + }, + { + "epoch": 0.37885651870288467, + "grad_norm": 1.4307903051376343, + "learning_rate": 0.00014267865570621706, + "loss": 1.5404, + "step": 10579 + }, + { + "epoch": 0.37889233083245294, + "grad_norm": 2.649573802947998, + "learning_rate": 0.00014266816582414547, + "loss": 1.4351, + "step": 10580 + }, + { + "epoch": 0.37892814296202126, + "grad_norm": 2.3270211219787598, + "learning_rate": 0.00014265767536802873, + "loss": 1.4689, + "step": 10581 + }, + { + "epoch": 0.37896395509158953, + "grad_norm": 1.3619784116744995, + "learning_rate": 0.000142647184338008, + "loss": 1.5399, + "step": 10582 + }, + { + "epoch": 0.3789997672211578, + "grad_norm": 1.9002552032470703, + "learning_rate": 0.0001426366927342244, + "loss": 1.6398, + "step": 10583 + }, + { + "epoch": 0.37903557935072607, + "grad_norm": 1.818941593170166, + "learning_rate": 0.0001426262005568191, + "loss": 1.3925, + "step": 10584 + }, + { + "epoch": 0.3790713914802944, + "grad_norm": 1.4739636182785034, + "learning_rate": 0.00014261570780593327, + "loss": 1.4677, + "step": 10585 + }, + { + "epoch": 0.37910720360986266, + "grad_norm": 1.7951068878173828, + "learning_rate": 0.00014260521448170805, + "loss": 1.5913, + "step": 10586 + }, + { + "epoch": 0.37914301573943093, + "grad_norm": 1.6509017944335938, + "learning_rate": 0.0001425947205842846, + "loss": 1.3659, + "step": 10587 + }, + { + "epoch": 0.37917882786899926, + "grad_norm": 1.6819849014282227, + "learning_rate": 0.00014258422611380418, + "loss": 1.6058, + "step": 10588 + }, + { + "epoch": 0.3792146399985675, + "grad_norm": 2.2275047302246094, + "learning_rate": 0.0001425737310704079, + "loss": 1.1905, + "step": 10589 + }, + { + "epoch": 0.3792504521281358, + "grad_norm": 1.8153003454208374, + "learning_rate": 0.000142563235454237, + "loss": 1.8863, + "step": 10590 + }, + { + "epoch": 0.37928626425770406, + "grad_norm": 1.4210957288742065, + "learning_rate": 0.00014255273926543264, + "loss": 1.4082, + "step": 10591 + }, + { + "epoch": 0.3793220763872724, + "grad_norm": 1.9765852689743042, + "learning_rate": 0.0001425422425041361, + "loss": 1.7252, + "step": 10592 + }, + { + "epoch": 0.37935788851684066, + "grad_norm": 1.3808059692382812, + "learning_rate": 0.00014253174517048854, + "loss": 1.3727, + "step": 10593 + }, + { + "epoch": 0.3793937006464089, + "grad_norm": 1.5124943256378174, + "learning_rate": 0.00014252124726463121, + "loss": 1.456, + "step": 10594 + }, + { + "epoch": 0.37942951277597725, + "grad_norm": 1.4630928039550781, + "learning_rate": 0.00014251074878670537, + "loss": 1.6548, + "step": 10595 + }, + { + "epoch": 0.3794653249055455, + "grad_norm": 2.346756935119629, + "learning_rate": 0.00014250024973685218, + "loss": 1.6495, + "step": 10596 + }, + { + "epoch": 0.3795011370351138, + "grad_norm": 1.8365200757980347, + "learning_rate": 0.000142489750115213, + "loss": 1.4613, + "step": 10597 + }, + { + "epoch": 0.37953694916468206, + "grad_norm": 1.351531982421875, + "learning_rate": 0.00014247924992192906, + "loss": 1.497, + "step": 10598 + }, + { + "epoch": 0.3795727612942504, + "grad_norm": 1.3425579071044922, + "learning_rate": 0.00014246874915714157, + "loss": 1.0996, + "step": 10599 + }, + { + "epoch": 0.37960857342381865, + "grad_norm": 1.2506022453308105, + "learning_rate": 0.00014245824782099185, + "loss": 1.3726, + "step": 10600 + }, + { + "epoch": 0.3796443855533869, + "grad_norm": 1.398263692855835, + "learning_rate": 0.00014244774591362118, + "loss": 1.4526, + "step": 10601 + }, + { + "epoch": 0.3796801976829552, + "grad_norm": 1.5792254209518433, + "learning_rate": 0.00014243724343517082, + "loss": 1.4306, + "step": 10602 + }, + { + "epoch": 0.3797160098125235, + "grad_norm": 1.728993535041809, + "learning_rate": 0.0001424267403857821, + "loss": 1.2494, + "step": 10603 + }, + { + "epoch": 0.3797518219420918, + "grad_norm": 1.8500906229019165, + "learning_rate": 0.00014241623676559633, + "loss": 1.2789, + "step": 10604 + }, + { + "epoch": 0.37978763407166005, + "grad_norm": 2.3870582580566406, + "learning_rate": 0.0001424057325747548, + "loss": 1.5109, + "step": 10605 + }, + { + "epoch": 0.3798234462012284, + "grad_norm": 2.648805856704712, + "learning_rate": 0.00014239522781339884, + "loss": 1.3092, + "step": 10606 + }, + { + "epoch": 0.37985925833079665, + "grad_norm": 1.6133131980895996, + "learning_rate": 0.00014238472248166977, + "loss": 1.5906, + "step": 10607 + }, + { + "epoch": 0.3798950704603649, + "grad_norm": 1.9158686399459839, + "learning_rate": 0.00014237421657970894, + "loss": 1.7831, + "step": 10608 + }, + { + "epoch": 0.3799308825899332, + "grad_norm": 1.773206353187561, + "learning_rate": 0.00014236371010765766, + "loss": 1.4976, + "step": 10609 + }, + { + "epoch": 0.3799666947195015, + "grad_norm": 1.8020765781402588, + "learning_rate": 0.00014235320306565732, + "loss": 1.656, + "step": 10610 + }, + { + "epoch": 0.3800025068490698, + "grad_norm": 1.7115094661712646, + "learning_rate": 0.00014234269545384927, + "loss": 1.7404, + "step": 10611 + }, + { + "epoch": 0.38003831897863805, + "grad_norm": 1.5301717519760132, + "learning_rate": 0.00014233218727237489, + "loss": 1.6062, + "step": 10612 + }, + { + "epoch": 0.38007413110820637, + "grad_norm": 1.6753218173980713, + "learning_rate": 0.00014232167852137547, + "loss": 1.6029, + "step": 10613 + }, + { + "epoch": 0.38010994323777464, + "grad_norm": 1.724470853805542, + "learning_rate": 0.00014231116920099252, + "loss": 1.4289, + "step": 10614 + }, + { + "epoch": 0.3801457553673429, + "grad_norm": 1.236194133758545, + "learning_rate": 0.00014230065931136735, + "loss": 1.4111, + "step": 10615 + }, + { + "epoch": 0.3801815674969112, + "grad_norm": 2.511920213699341, + "learning_rate": 0.00014229014885264136, + "loss": 1.4207, + "step": 10616 + }, + { + "epoch": 0.3802173796264795, + "grad_norm": 1.4968230724334717, + "learning_rate": 0.00014227963782495598, + "loss": 1.5417, + "step": 10617 + }, + { + "epoch": 0.38025319175604777, + "grad_norm": 1.8402019739151, + "learning_rate": 0.0001422691262284526, + "loss": 1.5169, + "step": 10618 + }, + { + "epoch": 0.38028900388561604, + "grad_norm": 1.912214756011963, + "learning_rate": 0.00014225861406327265, + "loss": 1.5186, + "step": 10619 + }, + { + "epoch": 0.38032481601518436, + "grad_norm": 1.7003743648529053, + "learning_rate": 0.00014224810132955755, + "loss": 1.4935, + "step": 10620 + }, + { + "epoch": 0.38036062814475263, + "grad_norm": 1.291312575340271, + "learning_rate": 0.00014223758802744878, + "loss": 1.4836, + "step": 10621 + }, + { + "epoch": 0.3803964402743209, + "grad_norm": 2.281581163406372, + "learning_rate": 0.0001422270741570877, + "loss": 1.3742, + "step": 10622 + }, + { + "epoch": 0.38043225240388917, + "grad_norm": 1.739435076713562, + "learning_rate": 0.00014221655971861582, + "loss": 1.5536, + "step": 10623 + }, + { + "epoch": 0.3804680645334575, + "grad_norm": 2.075655937194824, + "learning_rate": 0.0001422060447121746, + "loss": 1.4077, + "step": 10624 + }, + { + "epoch": 0.38050387666302576, + "grad_norm": 1.8676741123199463, + "learning_rate": 0.0001421955291379055, + "loss": 1.7129, + "step": 10625 + }, + { + "epoch": 0.38053968879259403, + "grad_norm": 1.853179693222046, + "learning_rate": 0.00014218501299594996, + "loss": 1.369, + "step": 10626 + }, + { + "epoch": 0.38057550092216236, + "grad_norm": 1.8642727136611938, + "learning_rate": 0.00014217449628644947, + "loss": 1.3751, + "step": 10627 + }, + { + "epoch": 0.3806113130517306, + "grad_norm": 2.065969467163086, + "learning_rate": 0.00014216397900954558, + "loss": 1.4474, + "step": 10628 + }, + { + "epoch": 0.3806471251812989, + "grad_norm": 2.014706611633301, + "learning_rate": 0.00014215346116537968, + "loss": 1.7028, + "step": 10629 + }, + { + "epoch": 0.38068293731086716, + "grad_norm": 1.5329726934432983, + "learning_rate": 0.0001421429427540934, + "loss": 1.3595, + "step": 10630 + }, + { + "epoch": 0.3807187494404355, + "grad_norm": 1.8910022974014282, + "learning_rate": 0.00014213242377582815, + "loss": 1.3986, + "step": 10631 + }, + { + "epoch": 0.38075456157000376, + "grad_norm": 1.6006379127502441, + "learning_rate": 0.0001421219042307255, + "loss": 1.6447, + "step": 10632 + }, + { + "epoch": 0.380790373699572, + "grad_norm": 1.9453397989273071, + "learning_rate": 0.00014211138411892696, + "loss": 1.5806, + "step": 10633 + }, + { + "epoch": 0.38082618582914035, + "grad_norm": 1.7229535579681396, + "learning_rate": 0.00014210086344057404, + "loss": 1.6743, + "step": 10634 + }, + { + "epoch": 0.3808619979587086, + "grad_norm": 1.395990252494812, + "learning_rate": 0.00014209034219580833, + "loss": 1.4022, + "step": 10635 + }, + { + "epoch": 0.3808978100882769, + "grad_norm": 1.543500304222107, + "learning_rate": 0.00014207982038477135, + "loss": 1.6443, + "step": 10636 + }, + { + "epoch": 0.38093362221784516, + "grad_norm": 1.8862489461898804, + "learning_rate": 0.00014206929800760466, + "loss": 1.8299, + "step": 10637 + }, + { + "epoch": 0.3809694343474135, + "grad_norm": 1.6491189002990723, + "learning_rate": 0.00014205877506444982, + "loss": 1.5355, + "step": 10638 + }, + { + "epoch": 0.38100524647698175, + "grad_norm": 2.0166943073272705, + "learning_rate": 0.00014204825155544846, + "loss": 1.2944, + "step": 10639 + }, + { + "epoch": 0.38104105860655, + "grad_norm": 1.385064959526062, + "learning_rate": 0.00014203772748074206, + "loss": 1.5232, + "step": 10640 + }, + { + "epoch": 0.38107687073611834, + "grad_norm": 1.4159349203109741, + "learning_rate": 0.00014202720284047234, + "loss": 1.3836, + "step": 10641 + }, + { + "epoch": 0.3811126828656866, + "grad_norm": 1.7959661483764648, + "learning_rate": 0.00014201667763478074, + "loss": 1.7063, + "step": 10642 + }, + { + "epoch": 0.3811484949952549, + "grad_norm": 1.8224889039993286, + "learning_rate": 0.00014200615186380899, + "loss": 1.6915, + "step": 10643 + }, + { + "epoch": 0.38118430712482315, + "grad_norm": 2.0088465213775635, + "learning_rate": 0.0001419956255276986, + "loss": 1.6776, + "step": 10644 + }, + { + "epoch": 0.3812201192543915, + "grad_norm": 1.4970310926437378, + "learning_rate": 0.00014198509862659129, + "loss": 1.4258, + "step": 10645 + }, + { + "epoch": 0.38125593138395975, + "grad_norm": 1.4173898696899414, + "learning_rate": 0.00014197457116062857, + "loss": 1.7229, + "step": 10646 + }, + { + "epoch": 0.381291743513528, + "grad_norm": 1.68673837184906, + "learning_rate": 0.0001419640431299522, + "loss": 1.4912, + "step": 10647 + }, + { + "epoch": 0.38132755564309634, + "grad_norm": 2.059001922607422, + "learning_rate": 0.00014195351453470374, + "loss": 1.328, + "step": 10648 + }, + { + "epoch": 0.3813633677726646, + "grad_norm": 1.7006314992904663, + "learning_rate": 0.00014194298537502487, + "loss": 1.4513, + "step": 10649 + }, + { + "epoch": 0.3813991799022329, + "grad_norm": 1.4936386346817017, + "learning_rate": 0.00014193245565105722, + "loss": 1.6796, + "step": 10650 + }, + { + "epoch": 0.38143499203180115, + "grad_norm": 1.7479816675186157, + "learning_rate": 0.00014192192536294245, + "loss": 1.4616, + "step": 10651 + }, + { + "epoch": 0.38147080416136947, + "grad_norm": 1.7160698175430298, + "learning_rate": 0.00014191139451082228, + "loss": 1.6427, + "step": 10652 + }, + { + "epoch": 0.38150661629093774, + "grad_norm": 1.7994788885116577, + "learning_rate": 0.00014190086309483834, + "loss": 1.4788, + "step": 10653 + }, + { + "epoch": 0.381542428420506, + "grad_norm": 1.9248762130737305, + "learning_rate": 0.00014189033111513234, + "loss": 1.6344, + "step": 10654 + }, + { + "epoch": 0.38157824055007433, + "grad_norm": 2.2364611625671387, + "learning_rate": 0.00014187979857184597, + "loss": 1.5578, + "step": 10655 + }, + { + "epoch": 0.3816140526796426, + "grad_norm": 1.8766387701034546, + "learning_rate": 0.00014186926546512095, + "loss": 1.7108, + "step": 10656 + }, + { + "epoch": 0.38164986480921087, + "grad_norm": 2.0187058448791504, + "learning_rate": 0.00014185873179509893, + "loss": 1.3808, + "step": 10657 + }, + { + "epoch": 0.38168567693877914, + "grad_norm": 1.377492070198059, + "learning_rate": 0.00014184819756192168, + "loss": 1.2254, + "step": 10658 + }, + { + "epoch": 0.38172148906834746, + "grad_norm": 1.88951575756073, + "learning_rate": 0.00014183766276573096, + "loss": 1.5138, + "step": 10659 + }, + { + "epoch": 0.38175730119791573, + "grad_norm": 2.606764554977417, + "learning_rate": 0.00014182712740666838, + "loss": 1.9735, + "step": 10660 + }, + { + "epoch": 0.381793113327484, + "grad_norm": 2.150054693222046, + "learning_rate": 0.00014181659148487582, + "loss": 1.38, + "step": 10661 + }, + { + "epoch": 0.3818289254570523, + "grad_norm": 1.516271948814392, + "learning_rate": 0.00014180605500049493, + "loss": 1.7134, + "step": 10662 + }, + { + "epoch": 0.3818647375866206, + "grad_norm": 1.3602635860443115, + "learning_rate": 0.0001417955179536675, + "loss": 1.4687, + "step": 10663 + }, + { + "epoch": 0.38190054971618886, + "grad_norm": 1.6596342325210571, + "learning_rate": 0.00014178498034453528, + "loss": 1.5292, + "step": 10664 + }, + { + "epoch": 0.38193636184575713, + "grad_norm": 1.9334921836853027, + "learning_rate": 0.00014177444217324005, + "loss": 1.3077, + "step": 10665 + }, + { + "epoch": 0.38197217397532546, + "grad_norm": 1.7454739809036255, + "learning_rate": 0.00014176390343992358, + "loss": 1.722, + "step": 10666 + }, + { + "epoch": 0.3820079861048937, + "grad_norm": 1.3706307411193848, + "learning_rate": 0.0001417533641447277, + "loss": 1.488, + "step": 10667 + }, + { + "epoch": 0.382043798234462, + "grad_norm": 1.5092109441757202, + "learning_rate": 0.00014174282428779412, + "loss": 1.7905, + "step": 10668 + }, + { + "epoch": 0.3820796103640303, + "grad_norm": 1.692136526107788, + "learning_rate": 0.0001417322838692647, + "loss": 1.2234, + "step": 10669 + }, + { + "epoch": 0.3821154224935986, + "grad_norm": 1.8451822996139526, + "learning_rate": 0.00014172174288928124, + "loss": 1.5694, + "step": 10670 + }, + { + "epoch": 0.38215123462316686, + "grad_norm": 1.4315675497055054, + "learning_rate": 0.00014171120134798552, + "loss": 1.4778, + "step": 10671 + }, + { + "epoch": 0.3821870467527351, + "grad_norm": 1.8963932991027832, + "learning_rate": 0.00014170065924551942, + "loss": 1.2421, + "step": 10672 + }, + { + "epoch": 0.38222285888230345, + "grad_norm": 1.7373079061508179, + "learning_rate": 0.00014169011658202472, + "loss": 1.2913, + "step": 10673 + }, + { + "epoch": 0.3822586710118717, + "grad_norm": 1.6694564819335938, + "learning_rate": 0.00014167957335764331, + "loss": 1.6226, + "step": 10674 + }, + { + "epoch": 0.38229448314144, + "grad_norm": 2.1219732761383057, + "learning_rate": 0.00014166902957251696, + "loss": 1.6144, + "step": 10675 + }, + { + "epoch": 0.3823302952710083, + "grad_norm": 1.57712721824646, + "learning_rate": 0.00014165848522678756, + "loss": 1.1831, + "step": 10676 + }, + { + "epoch": 0.3823661074005766, + "grad_norm": 1.6538848876953125, + "learning_rate": 0.00014164794032059703, + "loss": 1.7272, + "step": 10677 + }, + { + "epoch": 0.38240191953014485, + "grad_norm": 1.840171217918396, + "learning_rate": 0.00014163739485408716, + "loss": 1.5288, + "step": 10678 + }, + { + "epoch": 0.3824377316597131, + "grad_norm": 1.7260515689849854, + "learning_rate": 0.00014162684882739984, + "loss": 1.6227, + "step": 10679 + }, + { + "epoch": 0.38247354378928144, + "grad_norm": 2.425084352493286, + "learning_rate": 0.00014161630224067694, + "loss": 1.4853, + "step": 10680 + }, + { + "epoch": 0.3825093559188497, + "grad_norm": 1.8197312355041504, + "learning_rate": 0.0001416057550940604, + "loss": 1.797, + "step": 10681 + }, + { + "epoch": 0.382545168048418, + "grad_norm": 1.4861470460891724, + "learning_rate": 0.00014159520738769212, + "loss": 1.5128, + "step": 10682 + }, + { + "epoch": 0.3825809801779863, + "grad_norm": 1.9796706438064575, + "learning_rate": 0.00014158465912171396, + "loss": 1.574, + "step": 10683 + }, + { + "epoch": 0.3826167923075546, + "grad_norm": 1.8207260370254517, + "learning_rate": 0.00014157411029626783, + "loss": 1.3684, + "step": 10684 + }, + { + "epoch": 0.38265260443712285, + "grad_norm": 1.8487191200256348, + "learning_rate": 0.0001415635609114957, + "loss": 1.386, + "step": 10685 + }, + { + "epoch": 0.3826884165666911, + "grad_norm": 1.9048634767532349, + "learning_rate": 0.00014155301096753945, + "loss": 1.496, + "step": 10686 + }, + { + "epoch": 0.38272422869625944, + "grad_norm": 1.796779751777649, + "learning_rate": 0.00014154246046454107, + "loss": 1.4984, + "step": 10687 + }, + { + "epoch": 0.3827600408258277, + "grad_norm": 1.5472294092178345, + "learning_rate": 0.00014153190940264246, + "loss": 1.6013, + "step": 10688 + }, + { + "epoch": 0.382795852955396, + "grad_norm": 2.0484673976898193, + "learning_rate": 0.00014152135778198557, + "loss": 1.7283, + "step": 10689 + }, + { + "epoch": 0.3828316650849643, + "grad_norm": 1.5459712743759155, + "learning_rate": 0.00014151080560271235, + "loss": 1.3491, + "step": 10690 + }, + { + "epoch": 0.38286747721453257, + "grad_norm": 1.379616618156433, + "learning_rate": 0.00014150025286496483, + "loss": 1.3149, + "step": 10691 + }, + { + "epoch": 0.38290328934410084, + "grad_norm": 2.17366886138916, + "learning_rate": 0.0001414896995688849, + "loss": 1.6789, + "step": 10692 + }, + { + "epoch": 0.3829391014736691, + "grad_norm": 1.3768093585968018, + "learning_rate": 0.00014147914571461455, + "loss": 1.5071, + "step": 10693 + }, + { + "epoch": 0.38297491360323743, + "grad_norm": 1.906646728515625, + "learning_rate": 0.0001414685913022959, + "loss": 1.6087, + "step": 10694 + }, + { + "epoch": 0.3830107257328057, + "grad_norm": 2.058373212814331, + "learning_rate": 0.00014145803633207077, + "loss": 1.7719, + "step": 10695 + }, + { + "epoch": 0.38304653786237397, + "grad_norm": 1.584075689315796, + "learning_rate": 0.00014144748080408126, + "loss": 1.4597, + "step": 10696 + }, + { + "epoch": 0.3830823499919423, + "grad_norm": 1.2557836771011353, + "learning_rate": 0.00014143692471846935, + "loss": 1.2409, + "step": 10697 + }, + { + "epoch": 0.38311816212151056, + "grad_norm": 1.7716764211654663, + "learning_rate": 0.0001414263680753771, + "loss": 1.6804, + "step": 10698 + }, + { + "epoch": 0.38315397425107883, + "grad_norm": 1.3837485313415527, + "learning_rate": 0.00014141581087494644, + "loss": 1.5138, + "step": 10699 + }, + { + "epoch": 0.3831897863806471, + "grad_norm": 1.3387659788131714, + "learning_rate": 0.00014140525311731952, + "loss": 1.4457, + "step": 10700 + }, + { + "epoch": 0.3832255985102154, + "grad_norm": 1.4828484058380127, + "learning_rate": 0.00014139469480263828, + "loss": 1.6133, + "step": 10701 + }, + { + "epoch": 0.3832614106397837, + "grad_norm": 2.200072765350342, + "learning_rate": 0.00014138413593104486, + "loss": 1.3238, + "step": 10702 + }, + { + "epoch": 0.38329722276935196, + "grad_norm": 1.9859720468521118, + "learning_rate": 0.0001413735765026813, + "loss": 1.7688, + "step": 10703 + }, + { + "epoch": 0.3833330348989203, + "grad_norm": 1.7335363626480103, + "learning_rate": 0.00014136301651768957, + "loss": 1.6909, + "step": 10704 + }, + { + "epoch": 0.38336884702848856, + "grad_norm": 1.7980806827545166, + "learning_rate": 0.00014135245597621184, + "loss": 1.2562, + "step": 10705 + }, + { + "epoch": 0.3834046591580568, + "grad_norm": 1.8698501586914062, + "learning_rate": 0.00014134189487839013, + "loss": 1.6418, + "step": 10706 + }, + { + "epoch": 0.3834404712876251, + "grad_norm": 1.5134303569793701, + "learning_rate": 0.0001413313332243666, + "loss": 1.3264, + "step": 10707 + }, + { + "epoch": 0.3834762834171934, + "grad_norm": 1.5431501865386963, + "learning_rate": 0.00014132077101428324, + "loss": 1.4467, + "step": 10708 + }, + { + "epoch": 0.3835120955467617, + "grad_norm": 1.60693359375, + "learning_rate": 0.00014131020824828224, + "loss": 1.5158, + "step": 10709 + }, + { + "epoch": 0.38354790767632996, + "grad_norm": 2.905691385269165, + "learning_rate": 0.00014129964492650568, + "loss": 1.7937, + "step": 10710 + }, + { + "epoch": 0.3835837198058983, + "grad_norm": 1.4310888051986694, + "learning_rate": 0.00014128908104909567, + "loss": 1.286, + "step": 10711 + }, + { + "epoch": 0.38361953193546655, + "grad_norm": 1.8025977611541748, + "learning_rate": 0.00014127851661619432, + "loss": 1.27, + "step": 10712 + }, + { + "epoch": 0.3836553440650348, + "grad_norm": 1.9007517099380493, + "learning_rate": 0.00014126795162794378, + "loss": 1.5953, + "step": 10713 + }, + { + "epoch": 0.3836911561946031, + "grad_norm": 2.110842704772949, + "learning_rate": 0.00014125738608448618, + "loss": 1.5767, + "step": 10714 + }, + { + "epoch": 0.3837269683241714, + "grad_norm": 1.5326857566833496, + "learning_rate": 0.00014124681998596366, + "loss": 1.6487, + "step": 10715 + }, + { + "epoch": 0.3837627804537397, + "grad_norm": 1.8157939910888672, + "learning_rate": 0.0001412362533325184, + "loss": 1.588, + "step": 10716 + }, + { + "epoch": 0.38379859258330795, + "grad_norm": 1.4543850421905518, + "learning_rate": 0.0001412256861242925, + "loss": 1.5232, + "step": 10717 + }, + { + "epoch": 0.3838344047128763, + "grad_norm": 1.69603431224823, + "learning_rate": 0.00014121511836142823, + "loss": 1.727, + "step": 10718 + }, + { + "epoch": 0.38387021684244454, + "grad_norm": 1.8203978538513184, + "learning_rate": 0.00014120455004406766, + "loss": 1.6048, + "step": 10719 + }, + { + "epoch": 0.3839060289720128, + "grad_norm": 1.70366370677948, + "learning_rate": 0.00014119398117235304, + "loss": 1.6988, + "step": 10720 + }, + { + "epoch": 0.3839418411015811, + "grad_norm": 1.6187806129455566, + "learning_rate": 0.00014118341174642653, + "loss": 1.534, + "step": 10721 + }, + { + "epoch": 0.3839776532311494, + "grad_norm": 1.8127055168151855, + "learning_rate": 0.00014117284176643033, + "loss": 1.429, + "step": 10722 + }, + { + "epoch": 0.3840134653607177, + "grad_norm": 2.1849961280822754, + "learning_rate": 0.00014116227123250668, + "loss": 1.515, + "step": 10723 + }, + { + "epoch": 0.38404927749028595, + "grad_norm": 1.5022850036621094, + "learning_rate": 0.00014115170014479775, + "loss": 1.6862, + "step": 10724 + }, + { + "epoch": 0.38408508961985427, + "grad_norm": 1.709812879562378, + "learning_rate": 0.0001411411285034458, + "loss": 1.5168, + "step": 10725 + }, + { + "epoch": 0.38412090174942254, + "grad_norm": 1.5337954759597778, + "learning_rate": 0.000141130556308593, + "loss": 1.595, + "step": 10726 + }, + { + "epoch": 0.3841567138789908, + "grad_norm": 1.5661762952804565, + "learning_rate": 0.00014111998356038162, + "loss": 1.325, + "step": 10727 + }, + { + "epoch": 0.3841925260085591, + "grad_norm": 2.0021166801452637, + "learning_rate": 0.00014110941025895392, + "loss": 1.3808, + "step": 10728 + }, + { + "epoch": 0.3842283381381274, + "grad_norm": 1.3688938617706299, + "learning_rate": 0.00014109883640445214, + "loss": 1.366, + "step": 10729 + }, + { + "epoch": 0.38426415026769567, + "grad_norm": 2.30067777633667, + "learning_rate": 0.00014108826199701852, + "loss": 1.3969, + "step": 10730 + }, + { + "epoch": 0.38429996239726394, + "grad_norm": 2.1623682975769043, + "learning_rate": 0.00014107768703679533, + "loss": 1.4078, + "step": 10731 + }, + { + "epoch": 0.38433577452683226, + "grad_norm": 1.4993696212768555, + "learning_rate": 0.00014106711152392484, + "loss": 1.499, + "step": 10732 + }, + { + "epoch": 0.38437158665640053, + "grad_norm": 1.903861403465271, + "learning_rate": 0.00014105653545854935, + "loss": 1.5057, + "step": 10733 + }, + { + "epoch": 0.3844073987859688, + "grad_norm": 1.6415305137634277, + "learning_rate": 0.00014104595884081113, + "loss": 1.5481, + "step": 10734 + }, + { + "epoch": 0.38444321091553707, + "grad_norm": 1.8244068622589111, + "learning_rate": 0.00014103538167085247, + "loss": 1.5552, + "step": 10735 + }, + { + "epoch": 0.3844790230451054, + "grad_norm": 1.8771268129348755, + "learning_rate": 0.0001410248039488157, + "loss": 1.4309, + "step": 10736 + }, + { + "epoch": 0.38451483517467366, + "grad_norm": 1.5982030630111694, + "learning_rate": 0.0001410142256748431, + "loss": 1.4636, + "step": 10737 + }, + { + "epoch": 0.38455064730424193, + "grad_norm": 1.3975290060043335, + "learning_rate": 0.00014100364684907702, + "loss": 1.3491, + "step": 10738 + }, + { + "epoch": 0.38458645943381026, + "grad_norm": 1.9042540788650513, + "learning_rate": 0.00014099306747165975, + "loss": 1.2723, + "step": 10739 + }, + { + "epoch": 0.3846222715633785, + "grad_norm": 1.3731106519699097, + "learning_rate": 0.00014098248754273364, + "loss": 1.757, + "step": 10740 + }, + { + "epoch": 0.3846580836929468, + "grad_norm": 2.2348291873931885, + "learning_rate": 0.000140971907062441, + "loss": 1.2937, + "step": 10741 + }, + { + "epoch": 0.38469389582251506, + "grad_norm": 1.5206588506698608, + "learning_rate": 0.0001409613260309242, + "loss": 1.3901, + "step": 10742 + }, + { + "epoch": 0.3847297079520834, + "grad_norm": 1.705182671546936, + "learning_rate": 0.00014095074444832561, + "loss": 1.5864, + "step": 10743 + }, + { + "epoch": 0.38476552008165166, + "grad_norm": 1.3100666999816895, + "learning_rate": 0.0001409401623147876, + "loss": 1.5685, + "step": 10744 + }, + { + "epoch": 0.3848013322112199, + "grad_norm": 2.2978193759918213, + "learning_rate": 0.00014092957963045245, + "loss": 1.895, + "step": 10745 + }, + { + "epoch": 0.38483714434078825, + "grad_norm": 1.4124058485031128, + "learning_rate": 0.00014091899639546263, + "loss": 1.0143, + "step": 10746 + }, + { + "epoch": 0.3848729564703565, + "grad_norm": 1.691389799118042, + "learning_rate": 0.00014090841260996055, + "loss": 1.596, + "step": 10747 + }, + { + "epoch": 0.3849087685999248, + "grad_norm": 1.6830295324325562, + "learning_rate": 0.0001408978282740885, + "loss": 1.3329, + "step": 10748 + }, + { + "epoch": 0.38494458072949306, + "grad_norm": 1.5190984010696411, + "learning_rate": 0.0001408872433879889, + "loss": 1.4487, + "step": 10749 + }, + { + "epoch": 0.3849803928590614, + "grad_norm": 1.6178358793258667, + "learning_rate": 0.00014087665795180422, + "loss": 1.4554, + "step": 10750 + }, + { + "epoch": 0.38501620498862965, + "grad_norm": 1.344931721687317, + "learning_rate": 0.00014086607196567682, + "loss": 1.5279, + "step": 10751 + }, + { + "epoch": 0.3850520171181979, + "grad_norm": 1.3446136713027954, + "learning_rate": 0.00014085548542974914, + "loss": 1.4228, + "step": 10752 + }, + { + "epoch": 0.38508782924776624, + "grad_norm": 1.3284231424331665, + "learning_rate": 0.0001408448983441636, + "loss": 1.2936, + "step": 10753 + }, + { + "epoch": 0.3851236413773345, + "grad_norm": 1.6570699214935303, + "learning_rate": 0.00014083431070906262, + "loss": 1.6028, + "step": 10754 + }, + { + "epoch": 0.3851594535069028, + "grad_norm": 1.6138721704483032, + "learning_rate": 0.0001408237225245887, + "loss": 1.4631, + "step": 10755 + }, + { + "epoch": 0.38519526563647105, + "grad_norm": 1.7377655506134033, + "learning_rate": 0.00014081313379088424, + "loss": 1.5583, + "step": 10756 + }, + { + "epoch": 0.3852310777660394, + "grad_norm": 1.4762719869613647, + "learning_rate": 0.0001408025445080917, + "loss": 1.4366, + "step": 10757 + }, + { + "epoch": 0.38526688989560764, + "grad_norm": 1.4237333536148071, + "learning_rate": 0.00014079195467635354, + "loss": 1.3627, + "step": 10758 + }, + { + "epoch": 0.3853027020251759, + "grad_norm": 1.8359794616699219, + "learning_rate": 0.00014078136429581227, + "loss": 1.5827, + "step": 10759 + }, + { + "epoch": 0.38533851415474424, + "grad_norm": 1.5892670154571533, + "learning_rate": 0.00014077077336661036, + "loss": 1.2771, + "step": 10760 + }, + { + "epoch": 0.3853743262843125, + "grad_norm": 2.054046392440796, + "learning_rate": 0.00014076018188889026, + "loss": 1.5361, + "step": 10761 + }, + { + "epoch": 0.3854101384138808, + "grad_norm": 1.899831771850586, + "learning_rate": 0.0001407495898627945, + "loss": 1.6762, + "step": 10762 + }, + { + "epoch": 0.38544595054344905, + "grad_norm": 1.9101499319076538, + "learning_rate": 0.00014073899728846555, + "loss": 1.5444, + "step": 10763 + }, + { + "epoch": 0.38548176267301737, + "grad_norm": 1.364566445350647, + "learning_rate": 0.00014072840416604597, + "loss": 1.4441, + "step": 10764 + }, + { + "epoch": 0.38551757480258564, + "grad_norm": 1.467511773109436, + "learning_rate": 0.00014071781049567825, + "loss": 1.5548, + "step": 10765 + }, + { + "epoch": 0.3855533869321539, + "grad_norm": 1.9000247716903687, + "learning_rate": 0.0001407072162775049, + "loss": 1.6866, + "step": 10766 + }, + { + "epoch": 0.38558919906172223, + "grad_norm": 1.3294975757598877, + "learning_rate": 0.00014069662151166846, + "loss": 1.6307, + "step": 10767 + }, + { + "epoch": 0.3856250111912905, + "grad_norm": 1.628848910331726, + "learning_rate": 0.00014068602619831148, + "loss": 1.4484, + "step": 10768 + }, + { + "epoch": 0.38566082332085877, + "grad_norm": 1.6399376392364502, + "learning_rate": 0.0001406754303375765, + "loss": 1.7626, + "step": 10769 + }, + { + "epoch": 0.38569663545042704, + "grad_norm": 1.6069952249526978, + "learning_rate": 0.00014066483392960604, + "loss": 1.5228, + "step": 10770 + }, + { + "epoch": 0.38573244757999536, + "grad_norm": 2.2001421451568604, + "learning_rate": 0.00014065423697454273, + "loss": 1.5175, + "step": 10771 + }, + { + "epoch": 0.38576825970956363, + "grad_norm": 1.3679170608520508, + "learning_rate": 0.0001406436394725291, + "loss": 1.4571, + "step": 10772 + }, + { + "epoch": 0.3858040718391319, + "grad_norm": 2.004768133163452, + "learning_rate": 0.00014063304142370773, + "loss": 1.1387, + "step": 10773 + }, + { + "epoch": 0.3858398839687002, + "grad_norm": 2.107898235321045, + "learning_rate": 0.0001406224428282212, + "loss": 1.7398, + "step": 10774 + }, + { + "epoch": 0.3858756960982685, + "grad_norm": 1.65401029586792, + "learning_rate": 0.0001406118436862121, + "loss": 1.4425, + "step": 10775 + }, + { + "epoch": 0.38591150822783676, + "grad_norm": 2.257986307144165, + "learning_rate": 0.000140601243997823, + "loss": 1.7269, + "step": 10776 + }, + { + "epoch": 0.38594732035740503, + "grad_norm": 1.4186952114105225, + "learning_rate": 0.00014059064376319657, + "loss": 1.647, + "step": 10777 + }, + { + "epoch": 0.38598313248697336, + "grad_norm": 1.5061075687408447, + "learning_rate": 0.00014058004298247537, + "loss": 1.4102, + "step": 10778 + }, + { + "epoch": 0.3860189446165416, + "grad_norm": 2.0114200115203857, + "learning_rate": 0.00014056944165580202, + "loss": 1.763, + "step": 10779 + }, + { + "epoch": 0.3860547567461099, + "grad_norm": 1.347678780555725, + "learning_rate": 0.00014055883978331916, + "loss": 1.5323, + "step": 10780 + }, + { + "epoch": 0.3860905688756782, + "grad_norm": 1.6494736671447754, + "learning_rate": 0.00014054823736516945, + "loss": 1.7314, + "step": 10781 + }, + { + "epoch": 0.3861263810052465, + "grad_norm": 1.7377164363861084, + "learning_rate": 0.00014053763440149552, + "loss": 1.4244, + "step": 10782 + }, + { + "epoch": 0.38616219313481476, + "grad_norm": 2.1563901901245117, + "learning_rate": 0.00014052703089244, + "loss": 1.5343, + "step": 10783 + }, + { + "epoch": 0.386198005264383, + "grad_norm": 1.365647315979004, + "learning_rate": 0.00014051642683814557, + "loss": 1.491, + "step": 10784 + }, + { + "epoch": 0.38623381739395135, + "grad_norm": 1.7293301820755005, + "learning_rate": 0.00014050582223875484, + "loss": 1.5437, + "step": 10785 + }, + { + "epoch": 0.3862696295235196, + "grad_norm": 2.316777229309082, + "learning_rate": 0.00014049521709441057, + "loss": 1.612, + "step": 10786 + }, + { + "epoch": 0.3863054416530879, + "grad_norm": 1.4455130100250244, + "learning_rate": 0.00014048461140525533, + "loss": 1.5892, + "step": 10787 + }, + { + "epoch": 0.3863412537826562, + "grad_norm": 1.716500997543335, + "learning_rate": 0.0001404740051714319, + "loss": 1.5128, + "step": 10788 + }, + { + "epoch": 0.3863770659122245, + "grad_norm": 1.4911115169525146, + "learning_rate": 0.00014046339839308294, + "loss": 1.6088, + "step": 10789 + }, + { + "epoch": 0.38641287804179275, + "grad_norm": 2.029979944229126, + "learning_rate": 0.00014045279107035116, + "loss": 1.488, + "step": 10790 + }, + { + "epoch": 0.386448690171361, + "grad_norm": 1.7694584131240845, + "learning_rate": 0.00014044218320337923, + "loss": 1.5967, + "step": 10791 + }, + { + "epoch": 0.38648450230092934, + "grad_norm": 1.7229918241500854, + "learning_rate": 0.00014043157479230988, + "loss": 1.4873, + "step": 10792 + }, + { + "epoch": 0.3865203144304976, + "grad_norm": 1.647547721862793, + "learning_rate": 0.00014042096583728587, + "loss": 1.496, + "step": 10793 + }, + { + "epoch": 0.3865561265600659, + "grad_norm": 1.3134502172470093, + "learning_rate": 0.0001404103563384499, + "loss": 1.3876, + "step": 10794 + }, + { + "epoch": 0.38659193868963415, + "grad_norm": 1.5994287729263306, + "learning_rate": 0.00014039974629594473, + "loss": 1.7171, + "step": 10795 + }, + { + "epoch": 0.3866277508192025, + "grad_norm": 1.576795220375061, + "learning_rate": 0.00014038913570991302, + "loss": 1.3701, + "step": 10796 + }, + { + "epoch": 0.38666356294877074, + "grad_norm": 1.9416791200637817, + "learning_rate": 0.00014037852458049764, + "loss": 1.4239, + "step": 10797 + }, + { + "epoch": 0.386699375078339, + "grad_norm": 1.8116300106048584, + "learning_rate": 0.0001403679129078413, + "loss": 1.5017, + "step": 10798 + }, + { + "epoch": 0.38673518720790734, + "grad_norm": 1.8027644157409668, + "learning_rate": 0.00014035730069208676, + "loss": 1.4818, + "step": 10799 + }, + { + "epoch": 0.3867709993374756, + "grad_norm": 1.8859972953796387, + "learning_rate": 0.0001403466879333768, + "loss": 1.5649, + "step": 10800 + }, + { + "epoch": 0.3868068114670439, + "grad_norm": 1.517880916595459, + "learning_rate": 0.00014033607463185416, + "loss": 1.2432, + "step": 10801 + }, + { + "epoch": 0.38684262359661215, + "grad_norm": 1.304983377456665, + "learning_rate": 0.0001403254607876617, + "loss": 1.5928, + "step": 10802 + }, + { + "epoch": 0.38687843572618047, + "grad_norm": 2.295943021774292, + "learning_rate": 0.00014031484640094217, + "loss": 1.9763, + "step": 10803 + }, + { + "epoch": 0.38691424785574874, + "grad_norm": 1.5124098062515259, + "learning_rate": 0.0001403042314718384, + "loss": 1.5043, + "step": 10804 + }, + { + "epoch": 0.386950059985317, + "grad_norm": 1.4850661754608154, + "learning_rate": 0.00014029361600049315, + "loss": 1.3102, + "step": 10805 + }, + { + "epoch": 0.38698587211488533, + "grad_norm": 1.8223068714141846, + "learning_rate": 0.0001402829999870493, + "loss": 1.2721, + "step": 10806 + }, + { + "epoch": 0.3870216842444536, + "grad_norm": 1.4435514211654663, + "learning_rate": 0.00014027238343164965, + "loss": 1.4753, + "step": 10807 + }, + { + "epoch": 0.38705749637402187, + "grad_norm": 1.5776134729385376, + "learning_rate": 0.000140261766334437, + "loss": 1.7435, + "step": 10808 + }, + { + "epoch": 0.38709330850359014, + "grad_norm": 1.5881763696670532, + "learning_rate": 0.00014025114869555425, + "loss": 1.7955, + "step": 10809 + }, + { + "epoch": 0.38712912063315846, + "grad_norm": 1.520105242729187, + "learning_rate": 0.00014024053051514418, + "loss": 1.7156, + "step": 10810 + }, + { + "epoch": 0.38716493276272673, + "grad_norm": 2.3292229175567627, + "learning_rate": 0.00014022991179334971, + "loss": 1.5198, + "step": 10811 + }, + { + "epoch": 0.387200744892295, + "grad_norm": 1.6294552087783813, + "learning_rate": 0.00014021929253031366, + "loss": 1.3763, + "step": 10812 + }, + { + "epoch": 0.3872365570218633, + "grad_norm": 1.838181972503662, + "learning_rate": 0.0001402086727261789, + "loss": 1.7643, + "step": 10813 + }, + { + "epoch": 0.3872723691514316, + "grad_norm": 3.064434051513672, + "learning_rate": 0.0001401980523810883, + "loss": 1.6755, + "step": 10814 + }, + { + "epoch": 0.38730818128099986, + "grad_norm": 1.7557802200317383, + "learning_rate": 0.0001401874314951848, + "loss": 1.4726, + "step": 10815 + }, + { + "epoch": 0.38734399341056813, + "grad_norm": 1.710260033607483, + "learning_rate": 0.0001401768100686112, + "loss": 1.6089, + "step": 10816 + }, + { + "epoch": 0.38737980554013646, + "grad_norm": 1.567458152770996, + "learning_rate": 0.00014016618810151047, + "loss": 1.5403, + "step": 10817 + }, + { + "epoch": 0.3874156176697047, + "grad_norm": 1.4473252296447754, + "learning_rate": 0.00014015556559402551, + "loss": 1.2182, + "step": 10818 + }, + { + "epoch": 0.387451429799273, + "grad_norm": 1.6443681716918945, + "learning_rate": 0.0001401449425462992, + "loss": 1.5927, + "step": 10819 + }, + { + "epoch": 0.3874872419288413, + "grad_norm": 1.7293095588684082, + "learning_rate": 0.00014013431895847447, + "loss": 1.4499, + "step": 10820 + }, + { + "epoch": 0.3875230540584096, + "grad_norm": 1.4922728538513184, + "learning_rate": 0.0001401236948306942, + "loss": 1.5064, + "step": 10821 + }, + { + "epoch": 0.38755886618797786, + "grad_norm": 1.9737058877944946, + "learning_rate": 0.00014011307016310144, + "loss": 1.3372, + "step": 10822 + }, + { + "epoch": 0.3875946783175461, + "grad_norm": 3.1489598751068115, + "learning_rate": 0.00014010244495583901, + "loss": 1.6959, + "step": 10823 + }, + { + "epoch": 0.38763049044711445, + "grad_norm": 2.1126914024353027, + "learning_rate": 0.00014009181920904995, + "loss": 1.5703, + "step": 10824 + }, + { + "epoch": 0.3876663025766827, + "grad_norm": 2.1052439212799072, + "learning_rate": 0.00014008119292287715, + "loss": 1.8133, + "step": 10825 + }, + { + "epoch": 0.387702114706251, + "grad_norm": 1.6701992750167847, + "learning_rate": 0.00014007056609746362, + "loss": 1.4596, + "step": 10826 + }, + { + "epoch": 0.3877379268358193, + "grad_norm": 1.8624612092971802, + "learning_rate": 0.00014005993873295234, + "loss": 1.4561, + "step": 10827 + }, + { + "epoch": 0.3877737389653876, + "grad_norm": 1.4672082662582397, + "learning_rate": 0.0001400493108294862, + "loss": 1.5657, + "step": 10828 + }, + { + "epoch": 0.38780955109495585, + "grad_norm": 2.454993486404419, + "learning_rate": 0.00014003868238720828, + "loss": 1.7523, + "step": 10829 + }, + { + "epoch": 0.3878453632245241, + "grad_norm": 2.431044578552246, + "learning_rate": 0.0001400280534062615, + "loss": 1.798, + "step": 10830 + }, + { + "epoch": 0.38788117535409244, + "grad_norm": 1.4329546689987183, + "learning_rate": 0.0001400174238867889, + "loss": 1.3742, + "step": 10831 + }, + { + "epoch": 0.3879169874836607, + "grad_norm": 1.5714000463485718, + "learning_rate": 0.00014000679382893352, + "loss": 1.4247, + "step": 10832 + }, + { + "epoch": 0.387952799613229, + "grad_norm": 1.4519602060317993, + "learning_rate": 0.0001399961632328383, + "loss": 1.6319, + "step": 10833 + }, + { + "epoch": 0.3879886117427973, + "grad_norm": 1.8219505548477173, + "learning_rate": 0.00013998553209864628, + "loss": 1.6325, + "step": 10834 + }, + { + "epoch": 0.3880244238723656, + "grad_norm": 1.3136101961135864, + "learning_rate": 0.00013997490042650054, + "loss": 1.2345, + "step": 10835 + }, + { + "epoch": 0.38806023600193384, + "grad_norm": 2.3637969493865967, + "learning_rate": 0.00013996426821654407, + "loss": 1.588, + "step": 10836 + }, + { + "epoch": 0.3880960481315021, + "grad_norm": 1.519654631614685, + "learning_rate": 0.00013995363546891992, + "loss": 1.6943, + "step": 10837 + }, + { + "epoch": 0.38813186026107044, + "grad_norm": 1.674434781074524, + "learning_rate": 0.00013994300218377113, + "loss": 1.9109, + "step": 10838 + }, + { + "epoch": 0.3881676723906387, + "grad_norm": 1.8324869871139526, + "learning_rate": 0.0001399323683612408, + "loss": 1.5401, + "step": 10839 + }, + { + "epoch": 0.388203484520207, + "grad_norm": 1.7333762645721436, + "learning_rate": 0.00013992173400147193, + "loss": 1.3558, + "step": 10840 + }, + { + "epoch": 0.3882392966497753, + "grad_norm": 1.3128198385238647, + "learning_rate": 0.00013991109910460763, + "loss": 1.4118, + "step": 10841 + }, + { + "epoch": 0.38827510877934357, + "grad_norm": 1.262176752090454, + "learning_rate": 0.00013990046367079098, + "loss": 1.5413, + "step": 10842 + }, + { + "epoch": 0.38831092090891184, + "grad_norm": 1.3987606763839722, + "learning_rate": 0.00013988982770016505, + "loss": 1.4809, + "step": 10843 + }, + { + "epoch": 0.3883467330384801, + "grad_norm": 1.460222601890564, + "learning_rate": 0.00013987919119287296, + "loss": 1.6431, + "step": 10844 + }, + { + "epoch": 0.38838254516804843, + "grad_norm": 1.888704776763916, + "learning_rate": 0.00013986855414905777, + "loss": 1.3711, + "step": 10845 + }, + { + "epoch": 0.3884183572976167, + "grad_norm": 1.4014217853546143, + "learning_rate": 0.00013985791656886262, + "loss": 1.457, + "step": 10846 + }, + { + "epoch": 0.38845416942718497, + "grad_norm": 2.005265951156616, + "learning_rate": 0.00013984727845243062, + "loss": 1.5159, + "step": 10847 + }, + { + "epoch": 0.3884899815567533, + "grad_norm": 1.5637860298156738, + "learning_rate": 0.00013983663979990488, + "loss": 1.4922, + "step": 10848 + }, + { + "epoch": 0.38852579368632156, + "grad_norm": 2.402989387512207, + "learning_rate": 0.00013982600061142854, + "loss": 1.4786, + "step": 10849 + }, + { + "epoch": 0.38856160581588983, + "grad_norm": 1.529707670211792, + "learning_rate": 0.00013981536088714474, + "loss": 1.4036, + "step": 10850 + }, + { + "epoch": 0.3885974179454581, + "grad_norm": 1.7621464729309082, + "learning_rate": 0.0001398047206271966, + "loss": 1.474, + "step": 10851 + }, + { + "epoch": 0.3886332300750264, + "grad_norm": 1.6757811307907104, + "learning_rate": 0.00013979407983172733, + "loss": 1.3865, + "step": 10852 + }, + { + "epoch": 0.3886690422045947, + "grad_norm": 1.592047929763794, + "learning_rate": 0.00013978343850088002, + "loss": 1.3851, + "step": 10853 + }, + { + "epoch": 0.38870485433416296, + "grad_norm": 1.6998521089553833, + "learning_rate": 0.00013977279663479784, + "loss": 1.3954, + "step": 10854 + }, + { + "epoch": 0.3887406664637313, + "grad_norm": 1.5489436388015747, + "learning_rate": 0.000139762154233624, + "loss": 1.4166, + "step": 10855 + }, + { + "epoch": 0.38877647859329956, + "grad_norm": 1.9484127759933472, + "learning_rate": 0.00013975151129750168, + "loss": 1.6642, + "step": 10856 + }, + { + "epoch": 0.3888122907228678, + "grad_norm": 1.5500437021255493, + "learning_rate": 0.00013974086782657404, + "loss": 1.6083, + "step": 10857 + }, + { + "epoch": 0.3888481028524361, + "grad_norm": 1.78920316696167, + "learning_rate": 0.00013973022382098428, + "loss": 1.2267, + "step": 10858 + }, + { + "epoch": 0.3888839149820044, + "grad_norm": 1.2382129430770874, + "learning_rate": 0.0001397195792808756, + "loss": 1.5163, + "step": 10859 + }, + { + "epoch": 0.3889197271115727, + "grad_norm": 1.68257737159729, + "learning_rate": 0.00013970893420639123, + "loss": 1.6837, + "step": 10860 + }, + { + "epoch": 0.38895553924114096, + "grad_norm": 1.7023179531097412, + "learning_rate": 0.00013969828859767438, + "loss": 1.3082, + "step": 10861 + }, + { + "epoch": 0.3889913513707093, + "grad_norm": 1.9222726821899414, + "learning_rate": 0.00013968764245486824, + "loss": 1.2721, + "step": 10862 + }, + { + "epoch": 0.38902716350027755, + "grad_norm": 1.7391574382781982, + "learning_rate": 0.0001396769957781161, + "loss": 1.5351, + "step": 10863 + }, + { + "epoch": 0.3890629756298458, + "grad_norm": 1.9035283327102661, + "learning_rate": 0.00013966634856756114, + "loss": 1.8143, + "step": 10864 + }, + { + "epoch": 0.3890987877594141, + "grad_norm": 1.817996621131897, + "learning_rate": 0.0001396557008233466, + "loss": 1.4709, + "step": 10865 + }, + { + "epoch": 0.3891345998889824, + "grad_norm": 1.6776641607284546, + "learning_rate": 0.0001396450525456158, + "loss": 1.8253, + "step": 10866 + }, + { + "epoch": 0.3891704120185507, + "grad_norm": 1.5271328687667847, + "learning_rate": 0.0001396344037345119, + "loss": 1.4313, + "step": 10867 + }, + { + "epoch": 0.38920622414811895, + "grad_norm": 2.033863067626953, + "learning_rate": 0.0001396237543901783, + "loss": 1.3897, + "step": 10868 + }, + { + "epoch": 0.3892420362776873, + "grad_norm": 2.935681104660034, + "learning_rate": 0.00013961310451275814, + "loss": 1.6515, + "step": 10869 + }, + { + "epoch": 0.38927784840725554, + "grad_norm": 2.0245561599731445, + "learning_rate": 0.00013960245410239478, + "loss": 1.5185, + "step": 10870 + }, + { + "epoch": 0.3893136605368238, + "grad_norm": 1.5656148195266724, + "learning_rate": 0.00013959180315923148, + "loss": 1.2929, + "step": 10871 + }, + { + "epoch": 0.3893494726663921, + "grad_norm": 2.131190538406372, + "learning_rate": 0.00013958115168341155, + "loss": 1.2898, + "step": 10872 + }, + { + "epoch": 0.3893852847959604, + "grad_norm": 1.8734376430511475, + "learning_rate": 0.00013957049967507824, + "loss": 1.292, + "step": 10873 + }, + { + "epoch": 0.3894210969255287, + "grad_norm": 1.5268676280975342, + "learning_rate": 0.00013955984713437492, + "loss": 1.6088, + "step": 10874 + }, + { + "epoch": 0.38945690905509694, + "grad_norm": 2.3420557975769043, + "learning_rate": 0.00013954919406144488, + "loss": 1.5742, + "step": 10875 + }, + { + "epoch": 0.38949272118466527, + "grad_norm": 2.152472496032715, + "learning_rate": 0.00013953854045643146, + "loss": 1.7288, + "step": 10876 + }, + { + "epoch": 0.38952853331423354, + "grad_norm": 1.4512126445770264, + "learning_rate": 0.00013952788631947798, + "loss": 1.5296, + "step": 10877 + }, + { + "epoch": 0.3895643454438018, + "grad_norm": 1.953406810760498, + "learning_rate": 0.00013951723165072776, + "loss": 1.575, + "step": 10878 + }, + { + "epoch": 0.3896001575733701, + "grad_norm": 1.3942762613296509, + "learning_rate": 0.00013950657645032418, + "loss": 1.5451, + "step": 10879 + }, + { + "epoch": 0.3896359697029384, + "grad_norm": 2.523400068283081, + "learning_rate": 0.0001394959207184106, + "loss": 1.6235, + "step": 10880 + }, + { + "epoch": 0.38967178183250667, + "grad_norm": 1.6096271276474, + "learning_rate": 0.00013948526445513033, + "loss": 1.6995, + "step": 10881 + }, + { + "epoch": 0.38970759396207494, + "grad_norm": 2.4701058864593506, + "learning_rate": 0.00013947460766062673, + "loss": 1.5205, + "step": 10882 + }, + { + "epoch": 0.38974340609164326, + "grad_norm": 1.6288639307022095, + "learning_rate": 0.00013946395033504323, + "loss": 1.3126, + "step": 10883 + }, + { + "epoch": 0.38977921822121153, + "grad_norm": 1.5390440225601196, + "learning_rate": 0.00013945329247852317, + "loss": 1.6212, + "step": 10884 + }, + { + "epoch": 0.3898150303507798, + "grad_norm": 2.1103153228759766, + "learning_rate": 0.00013944263409120997, + "loss": 1.2683, + "step": 10885 + }, + { + "epoch": 0.38985084248034807, + "grad_norm": 1.4607430696487427, + "learning_rate": 0.00013943197517324698, + "loss": 1.768, + "step": 10886 + }, + { + "epoch": 0.3898866546099164, + "grad_norm": 1.542883276939392, + "learning_rate": 0.00013942131572477763, + "loss": 1.5286, + "step": 10887 + }, + { + "epoch": 0.38992246673948466, + "grad_norm": 1.6651703119277954, + "learning_rate": 0.00013941065574594536, + "loss": 1.5245, + "step": 10888 + }, + { + "epoch": 0.38995827886905293, + "grad_norm": 2.000406503677368, + "learning_rate": 0.0001393999952368935, + "loss": 1.3287, + "step": 10889 + }, + { + "epoch": 0.38999409099862126, + "grad_norm": 1.859923243522644, + "learning_rate": 0.0001393893341977656, + "loss": 1.5322, + "step": 10890 + }, + { + "epoch": 0.3900299031281895, + "grad_norm": 1.680471658706665, + "learning_rate": 0.00013937867262870494, + "loss": 1.2547, + "step": 10891 + }, + { + "epoch": 0.3900657152577578, + "grad_norm": 1.6399215459823608, + "learning_rate": 0.00013936801052985508, + "loss": 1.71, + "step": 10892 + }, + { + "epoch": 0.39010152738732606, + "grad_norm": 1.9132766723632812, + "learning_rate": 0.0001393573479013594, + "loss": 1.5958, + "step": 10893 + }, + { + "epoch": 0.3901373395168944, + "grad_norm": 1.6897841691970825, + "learning_rate": 0.00013934668474336137, + "loss": 1.7573, + "step": 10894 + }, + { + "epoch": 0.39017315164646266, + "grad_norm": 1.609229564666748, + "learning_rate": 0.00013933602105600446, + "loss": 1.3524, + "step": 10895 + }, + { + "epoch": 0.3902089637760309, + "grad_norm": 1.546349287033081, + "learning_rate": 0.00013932535683943212, + "loss": 1.3228, + "step": 10896 + }, + { + "epoch": 0.39024477590559925, + "grad_norm": 1.713444709777832, + "learning_rate": 0.00013931469209378788, + "loss": 1.5714, + "step": 10897 + }, + { + "epoch": 0.3902805880351675, + "grad_norm": 1.8013064861297607, + "learning_rate": 0.0001393040268192151, + "loss": 1.5539, + "step": 10898 + }, + { + "epoch": 0.3903164001647358, + "grad_norm": 1.501839280128479, + "learning_rate": 0.00013929336101585737, + "loss": 1.5056, + "step": 10899 + }, + { + "epoch": 0.39035221229430406, + "grad_norm": 1.4470911026000977, + "learning_rate": 0.00013928269468385814, + "loss": 1.3623, + "step": 10900 + }, + { + "epoch": 0.3903880244238724, + "grad_norm": 1.782845377922058, + "learning_rate": 0.00013927202782336093, + "loss": 1.5582, + "step": 10901 + }, + { + "epoch": 0.39042383655344065, + "grad_norm": 1.5116702318191528, + "learning_rate": 0.0001392613604345092, + "loss": 1.2386, + "step": 10902 + }, + { + "epoch": 0.3904596486830089, + "grad_norm": 1.541406512260437, + "learning_rate": 0.00013925069251744657, + "loss": 1.5252, + "step": 10903 + }, + { + "epoch": 0.39049546081257724, + "grad_norm": 1.852392315864563, + "learning_rate": 0.0001392400240723165, + "loss": 1.5734, + "step": 10904 + }, + { + "epoch": 0.3905312729421455, + "grad_norm": 1.53135085105896, + "learning_rate": 0.00013922935509926249, + "loss": 1.5326, + "step": 10905 + }, + { + "epoch": 0.3905670850717138, + "grad_norm": 1.9085524082183838, + "learning_rate": 0.00013921868559842813, + "loss": 1.7359, + "step": 10906 + }, + { + "epoch": 0.39060289720128205, + "grad_norm": 1.9585771560668945, + "learning_rate": 0.00013920801556995693, + "loss": 1.4947, + "step": 10907 + }, + { + "epoch": 0.3906387093308504, + "grad_norm": 2.3244411945343018, + "learning_rate": 0.00013919734501399248, + "loss": 1.4937, + "step": 10908 + }, + { + "epoch": 0.39067452146041864, + "grad_norm": 2.101654052734375, + "learning_rate": 0.0001391866739306783, + "loss": 1.5937, + "step": 10909 + }, + { + "epoch": 0.3907103335899869, + "grad_norm": 1.9944405555725098, + "learning_rate": 0.00013917600232015798, + "loss": 1.5416, + "step": 10910 + }, + { + "epoch": 0.39074614571955524, + "grad_norm": 1.5313005447387695, + "learning_rate": 0.00013916533018257506, + "loss": 1.5824, + "step": 10911 + }, + { + "epoch": 0.3907819578491235, + "grad_norm": 1.490833044052124, + "learning_rate": 0.00013915465751807314, + "loss": 1.5661, + "step": 10912 + }, + { + "epoch": 0.3908177699786918, + "grad_norm": 1.355104923248291, + "learning_rate": 0.00013914398432679582, + "loss": 1.498, + "step": 10913 + }, + { + "epoch": 0.39085358210826004, + "grad_norm": 1.4459413290023804, + "learning_rate": 0.00013913331060888667, + "loss": 1.0748, + "step": 10914 + }, + { + "epoch": 0.39088939423782837, + "grad_norm": 1.934935212135315, + "learning_rate": 0.00013912263636448936, + "loss": 1.3628, + "step": 10915 + }, + { + "epoch": 0.39092520636739664, + "grad_norm": 1.31533944606781, + "learning_rate": 0.00013911196159374737, + "loss": 1.476, + "step": 10916 + }, + { + "epoch": 0.3909610184969649, + "grad_norm": 1.765876293182373, + "learning_rate": 0.00013910128629680441, + "loss": 1.4795, + "step": 10917 + }, + { + "epoch": 0.39099683062653323, + "grad_norm": 1.559696912765503, + "learning_rate": 0.0001390906104738041, + "loss": 1.7247, + "step": 10918 + }, + { + "epoch": 0.3910326427561015, + "grad_norm": 1.7019007205963135, + "learning_rate": 0.00013907993412489003, + "loss": 1.3482, + "step": 10919 + }, + { + "epoch": 0.39106845488566977, + "grad_norm": 1.4443830251693726, + "learning_rate": 0.00013906925725020586, + "loss": 1.5142, + "step": 10920 + }, + { + "epoch": 0.39110426701523804, + "grad_norm": 1.9812666177749634, + "learning_rate": 0.00013905857984989524, + "loss": 1.4914, + "step": 10921 + }, + { + "epoch": 0.39114007914480636, + "grad_norm": 1.4861475229263306, + "learning_rate": 0.00013904790192410178, + "loss": 1.4392, + "step": 10922 + }, + { + "epoch": 0.39117589127437463, + "grad_norm": 1.292574167251587, + "learning_rate": 0.0001390372234729692, + "loss": 1.2898, + "step": 10923 + }, + { + "epoch": 0.3912117034039429, + "grad_norm": 1.654436469078064, + "learning_rate": 0.00013902654449664115, + "loss": 1.4982, + "step": 10924 + }, + { + "epoch": 0.3912475155335112, + "grad_norm": 1.3484128713607788, + "learning_rate": 0.00013901586499526125, + "loss": 1.6477, + "step": 10925 + }, + { + "epoch": 0.3912833276630795, + "grad_norm": 1.577406883239746, + "learning_rate": 0.0001390051849689732, + "loss": 1.5074, + "step": 10926 + }, + { + "epoch": 0.39131913979264776, + "grad_norm": 1.7564611434936523, + "learning_rate": 0.00013899450441792074, + "loss": 0.9454, + "step": 10927 + }, + { + "epoch": 0.39135495192221603, + "grad_norm": 1.9713129997253418, + "learning_rate": 0.00013898382334224748, + "loss": 1.3936, + "step": 10928 + }, + { + "epoch": 0.39139076405178436, + "grad_norm": 1.8760312795639038, + "learning_rate": 0.0001389731417420972, + "loss": 1.5234, + "step": 10929 + }, + { + "epoch": 0.3914265761813526, + "grad_norm": 1.71038019657135, + "learning_rate": 0.00013896245961761354, + "loss": 1.4217, + "step": 10930 + }, + { + "epoch": 0.3914623883109209, + "grad_norm": 1.5973619222640991, + "learning_rate": 0.00013895177696894023, + "loss": 1.3573, + "step": 10931 + }, + { + "epoch": 0.3914982004404892, + "grad_norm": 1.6818480491638184, + "learning_rate": 0.00013894109379622104, + "loss": 1.3301, + "step": 10932 + }, + { + "epoch": 0.3915340125700575, + "grad_norm": 1.7403838634490967, + "learning_rate": 0.00013893041009959968, + "loss": 1.5322, + "step": 10933 + }, + { + "epoch": 0.39156982469962576, + "grad_norm": 1.5131548643112183, + "learning_rate": 0.00013891972587921987, + "loss": 1.6662, + "step": 10934 + }, + { + "epoch": 0.391605636829194, + "grad_norm": 1.3520056009292603, + "learning_rate": 0.0001389090411352253, + "loss": 1.5041, + "step": 10935 + }, + { + "epoch": 0.39164144895876235, + "grad_norm": 1.8540492057800293, + "learning_rate": 0.0001388983558677598, + "loss": 1.3033, + "step": 10936 + }, + { + "epoch": 0.3916772610883306, + "grad_norm": 2.1093995571136475, + "learning_rate": 0.00013888767007696709, + "loss": 1.2123, + "step": 10937 + }, + { + "epoch": 0.3917130732178989, + "grad_norm": 1.8228912353515625, + "learning_rate": 0.00013887698376299095, + "loss": 1.5145, + "step": 10938 + }, + { + "epoch": 0.3917488853474672, + "grad_norm": 2.670003652572632, + "learning_rate": 0.00013886629692597512, + "loss": 1.4614, + "step": 10939 + }, + { + "epoch": 0.3917846974770355, + "grad_norm": 1.368762731552124, + "learning_rate": 0.00013885560956606344, + "loss": 1.6102, + "step": 10940 + }, + { + "epoch": 0.39182050960660375, + "grad_norm": 1.714065670967102, + "learning_rate": 0.00013884492168339963, + "loss": 1.7329, + "step": 10941 + }, + { + "epoch": 0.391856321736172, + "grad_norm": 1.4392451047897339, + "learning_rate": 0.00013883423327812748, + "loss": 1.3499, + "step": 10942 + }, + { + "epoch": 0.39189213386574034, + "grad_norm": 1.5470072031021118, + "learning_rate": 0.00013882354435039085, + "loss": 1.1483, + "step": 10943 + }, + { + "epoch": 0.3919279459953086, + "grad_norm": 1.6122442483901978, + "learning_rate": 0.00013881285490033348, + "loss": 1.3297, + "step": 10944 + }, + { + "epoch": 0.3919637581248769, + "grad_norm": 1.8792885541915894, + "learning_rate": 0.00013880216492809924, + "loss": 1.7625, + "step": 10945 + }, + { + "epoch": 0.3919995702544452, + "grad_norm": 1.8728435039520264, + "learning_rate": 0.00013879147443383188, + "loss": 1.2987, + "step": 10946 + }, + { + "epoch": 0.3920353823840135, + "grad_norm": 1.4084367752075195, + "learning_rate": 0.00013878078341767532, + "loss": 1.3605, + "step": 10947 + }, + { + "epoch": 0.39207119451358174, + "grad_norm": 2.4692366123199463, + "learning_rate": 0.00013877009187977332, + "loss": 1.8605, + "step": 10948 + }, + { + "epoch": 0.39210700664315, + "grad_norm": 1.8761765956878662, + "learning_rate": 0.00013875939982026976, + "loss": 1.5695, + "step": 10949 + }, + { + "epoch": 0.39214281877271834, + "grad_norm": 1.9340341091156006, + "learning_rate": 0.00013874870723930847, + "loss": 1.6881, + "step": 10950 + }, + { + "epoch": 0.3921786309022866, + "grad_norm": 1.415968418121338, + "learning_rate": 0.00013873801413703327, + "loss": 1.4739, + "step": 10951 + }, + { + "epoch": 0.3922144430318549, + "grad_norm": 1.7562787532806396, + "learning_rate": 0.00013872732051358808, + "loss": 1.7517, + "step": 10952 + }, + { + "epoch": 0.3922502551614232, + "grad_norm": 1.4888852834701538, + "learning_rate": 0.00013871662636911672, + "loss": 1.5784, + "step": 10953 + }, + { + "epoch": 0.39228606729099147, + "grad_norm": 1.862896203994751, + "learning_rate": 0.00013870593170376317, + "loss": 1.4573, + "step": 10954 + }, + { + "epoch": 0.39232187942055974, + "grad_norm": 1.7131226062774658, + "learning_rate": 0.00013869523651767116, + "loss": 1.8789, + "step": 10955 + }, + { + "epoch": 0.392357691550128, + "grad_norm": 1.4080753326416016, + "learning_rate": 0.0001386845408109847, + "loss": 1.4324, + "step": 10956 + }, + { + "epoch": 0.39239350367969633, + "grad_norm": 1.5288814306259155, + "learning_rate": 0.0001386738445838476, + "loss": 1.4847, + "step": 10957 + }, + { + "epoch": 0.3924293158092646, + "grad_norm": 1.7750880718231201, + "learning_rate": 0.00013866314783640384, + "loss": 1.6698, + "step": 10958 + }, + { + "epoch": 0.39246512793883287, + "grad_norm": 1.8281431198120117, + "learning_rate": 0.0001386524505687973, + "loss": 1.4908, + "step": 10959 + }, + { + "epoch": 0.3925009400684012, + "grad_norm": 1.838854432106018, + "learning_rate": 0.00013864175278117187, + "loss": 1.4854, + "step": 10960 + }, + { + "epoch": 0.39253675219796946, + "grad_norm": 1.9536410570144653, + "learning_rate": 0.00013863105447367154, + "loss": 1.6562, + "step": 10961 + }, + { + "epoch": 0.39257256432753773, + "grad_norm": 2.2680394649505615, + "learning_rate": 0.00013862035564644017, + "loss": 1.5296, + "step": 10962 + }, + { + "epoch": 0.392608376457106, + "grad_norm": 1.7162909507751465, + "learning_rate": 0.00013860965629962176, + "loss": 1.8054, + "step": 10963 + }, + { + "epoch": 0.3926441885866743, + "grad_norm": 1.5176362991333008, + "learning_rate": 0.0001385989564333602, + "loss": 1.6443, + "step": 10964 + }, + { + "epoch": 0.3926800007162426, + "grad_norm": 1.4724088907241821, + "learning_rate": 0.00013858825604779945, + "loss": 1.7563, + "step": 10965 + }, + { + "epoch": 0.39271581284581086, + "grad_norm": 2.1419851779937744, + "learning_rate": 0.00013857755514308352, + "loss": 1.6473, + "step": 10966 + }, + { + "epoch": 0.3927516249753792, + "grad_norm": 1.5697284936904907, + "learning_rate": 0.00013856685371935637, + "loss": 1.694, + "step": 10967 + }, + { + "epoch": 0.39278743710494746, + "grad_norm": 1.7596726417541504, + "learning_rate": 0.00013855615177676191, + "loss": 1.5056, + "step": 10968 + }, + { + "epoch": 0.3928232492345157, + "grad_norm": 1.5144803524017334, + "learning_rate": 0.0001385454493154442, + "loss": 1.589, + "step": 10969 + }, + { + "epoch": 0.392859061364084, + "grad_norm": 2.599090337753296, + "learning_rate": 0.00013853474633554715, + "loss": 1.4556, + "step": 10970 + }, + { + "epoch": 0.3928948734936523, + "grad_norm": 1.5587658882141113, + "learning_rate": 0.00013852404283721482, + "loss": 1.5985, + "step": 10971 + }, + { + "epoch": 0.3929306856232206, + "grad_norm": 1.9133366346359253, + "learning_rate": 0.00013851333882059118, + "loss": 1.2122, + "step": 10972 + }, + { + "epoch": 0.39296649775278886, + "grad_norm": 2.5919742584228516, + "learning_rate": 0.00013850263428582022, + "loss": 1.6127, + "step": 10973 + }, + { + "epoch": 0.3930023098823572, + "grad_norm": 1.6070665121078491, + "learning_rate": 0.000138491929233046, + "loss": 1.5863, + "step": 10974 + }, + { + "epoch": 0.39303812201192545, + "grad_norm": 1.462077021598816, + "learning_rate": 0.00013848122366241254, + "loss": 1.6105, + "step": 10975 + }, + { + "epoch": 0.3930739341414937, + "grad_norm": 2.1995644569396973, + "learning_rate": 0.00013847051757406384, + "loss": 1.6227, + "step": 10976 + }, + { + "epoch": 0.393109746271062, + "grad_norm": 2.4004290103912354, + "learning_rate": 0.00013845981096814397, + "loss": 1.446, + "step": 10977 + }, + { + "epoch": 0.3931455584006303, + "grad_norm": 1.6153998374938965, + "learning_rate": 0.00013844910384479693, + "loss": 1.5331, + "step": 10978 + }, + { + "epoch": 0.3931813705301986, + "grad_norm": 1.8544167280197144, + "learning_rate": 0.00013843839620416678, + "loss": 1.8615, + "step": 10979 + }, + { + "epoch": 0.39321718265976685, + "grad_norm": 1.9386411905288696, + "learning_rate": 0.00013842768804639763, + "loss": 1.3094, + "step": 10980 + }, + { + "epoch": 0.3932529947893352, + "grad_norm": 1.922755479812622, + "learning_rate": 0.00013841697937163344, + "loss": 1.6329, + "step": 10981 + }, + { + "epoch": 0.39328880691890344, + "grad_norm": 1.3943654298782349, + "learning_rate": 0.0001384062701800184, + "loss": 1.542, + "step": 10982 + }, + { + "epoch": 0.3933246190484717, + "grad_norm": 1.7033149003982544, + "learning_rate": 0.00013839556047169654, + "loss": 1.6458, + "step": 10983 + }, + { + "epoch": 0.39336043117804, + "grad_norm": 1.56367826461792, + "learning_rate": 0.00013838485024681192, + "loss": 1.6611, + "step": 10984 + }, + { + "epoch": 0.3933962433076083, + "grad_norm": 1.4666606187820435, + "learning_rate": 0.00013837413950550865, + "loss": 1.7409, + "step": 10985 + }, + { + "epoch": 0.3934320554371766, + "grad_norm": 1.5731333494186401, + "learning_rate": 0.00013836342824793084, + "loss": 0.9751, + "step": 10986 + }, + { + "epoch": 0.39346786756674484, + "grad_norm": 1.75849449634552, + "learning_rate": 0.00013835271647422262, + "loss": 1.4298, + "step": 10987 + }, + { + "epoch": 0.39350367969631317, + "grad_norm": 1.5057103633880615, + "learning_rate": 0.000138342004184528, + "loss": 1.4403, + "step": 10988 + }, + { + "epoch": 0.39353949182588144, + "grad_norm": 1.6212468147277832, + "learning_rate": 0.00013833129137899122, + "loss": 1.2772, + "step": 10989 + }, + { + "epoch": 0.3935753039554497, + "grad_norm": 3.099400758743286, + "learning_rate": 0.00013832057805775636, + "loss": 1.8536, + "step": 10990 + }, + { + "epoch": 0.393611116085018, + "grad_norm": 1.6015698909759521, + "learning_rate": 0.00013830986422096756, + "loss": 1.5765, + "step": 10991 + }, + { + "epoch": 0.3936469282145863, + "grad_norm": 1.789380669593811, + "learning_rate": 0.0001382991498687689, + "loss": 1.57, + "step": 10992 + }, + { + "epoch": 0.39368274034415457, + "grad_norm": 1.9253126382827759, + "learning_rate": 0.00013828843500130462, + "loss": 1.7023, + "step": 10993 + }, + { + "epoch": 0.39371855247372284, + "grad_norm": 1.3724284172058105, + "learning_rate": 0.00013827771961871885, + "loss": 1.3947, + "step": 10994 + }, + { + "epoch": 0.3937543646032911, + "grad_norm": 1.8409966230392456, + "learning_rate": 0.0001382670037211557, + "loss": 1.3981, + "step": 10995 + }, + { + "epoch": 0.39379017673285943, + "grad_norm": 1.4560490846633911, + "learning_rate": 0.0001382562873087594, + "loss": 1.5862, + "step": 10996 + }, + { + "epoch": 0.3938259888624277, + "grad_norm": 2.6296908855438232, + "learning_rate": 0.00013824557038167408, + "loss": 1.4751, + "step": 10997 + }, + { + "epoch": 0.39386180099199597, + "grad_norm": 1.4091401100158691, + "learning_rate": 0.00013823485294004397, + "loss": 1.2807, + "step": 10998 + }, + { + "epoch": 0.3938976131215643, + "grad_norm": 2.486598253250122, + "learning_rate": 0.00013822413498401322, + "loss": 1.4194, + "step": 10999 + }, + { + "epoch": 0.39393342525113256, + "grad_norm": 1.8565677404403687, + "learning_rate": 0.00013821341651372603, + "loss": 1.4072, + "step": 11000 + }, + { + "epoch": 0.39396923738070083, + "grad_norm": 1.5582574605941772, + "learning_rate": 0.00013820269752932662, + "loss": 1.1239, + "step": 11001 + }, + { + "epoch": 0.3940050495102691, + "grad_norm": 1.714348316192627, + "learning_rate": 0.0001381919780309592, + "loss": 1.5673, + "step": 11002 + }, + { + "epoch": 0.3940408616398374, + "grad_norm": 2.184767723083496, + "learning_rate": 0.000138181258018768, + "loss": 1.8622, + "step": 11003 + }, + { + "epoch": 0.3940766737694057, + "grad_norm": 1.768456220626831, + "learning_rate": 0.00013817053749289718, + "loss": 1.511, + "step": 11004 + }, + { + "epoch": 0.39411248589897396, + "grad_norm": 1.9604367017745972, + "learning_rate": 0.00013815981645349105, + "loss": 1.5908, + "step": 11005 + }, + { + "epoch": 0.3941482980285423, + "grad_norm": 1.6243501901626587, + "learning_rate": 0.00013814909490069378, + "loss": 1.4674, + "step": 11006 + }, + { + "epoch": 0.39418411015811056, + "grad_norm": 1.532596468925476, + "learning_rate": 0.00013813837283464968, + "loss": 1.5244, + "step": 11007 + }, + { + "epoch": 0.3942199222876788, + "grad_norm": 2.020131826400757, + "learning_rate": 0.00013812765025550294, + "loss": 1.3743, + "step": 11008 + }, + { + "epoch": 0.3942557344172471, + "grad_norm": 1.766100287437439, + "learning_rate": 0.00013811692716339785, + "loss": 1.5782, + "step": 11009 + }, + { + "epoch": 0.3942915465468154, + "grad_norm": 1.5012962818145752, + "learning_rate": 0.00013810620355847868, + "loss": 1.5708, + "step": 11010 + }, + { + "epoch": 0.3943273586763837, + "grad_norm": 1.4849494695663452, + "learning_rate": 0.00013809547944088968, + "loss": 1.9724, + "step": 11011 + }, + { + "epoch": 0.39436317080595196, + "grad_norm": 2.92488694190979, + "learning_rate": 0.00013808475481077518, + "loss": 1.7187, + "step": 11012 + }, + { + "epoch": 0.3943989829355203, + "grad_norm": 1.5572278499603271, + "learning_rate": 0.00013807402966827944, + "loss": 1.4533, + "step": 11013 + }, + { + "epoch": 0.39443479506508855, + "grad_norm": 1.4926722049713135, + "learning_rate": 0.00013806330401354671, + "loss": 1.3641, + "step": 11014 + }, + { + "epoch": 0.3944706071946568, + "grad_norm": 1.7692962884902954, + "learning_rate": 0.0001380525778467213, + "loss": 1.3569, + "step": 11015 + }, + { + "epoch": 0.3945064193242251, + "grad_norm": 1.5108805894851685, + "learning_rate": 0.00013804185116794755, + "loss": 1.4749, + "step": 11016 + }, + { + "epoch": 0.3945422314537934, + "grad_norm": 2.093026876449585, + "learning_rate": 0.00013803112397736976, + "loss": 1.4546, + "step": 11017 + }, + { + "epoch": 0.3945780435833617, + "grad_norm": 1.4690407514572144, + "learning_rate": 0.0001380203962751323, + "loss": 1.2591, + "step": 11018 + }, + { + "epoch": 0.39461385571292995, + "grad_norm": 1.3096941709518433, + "learning_rate": 0.0001380096680613794, + "loss": 1.4565, + "step": 11019 + }, + { + "epoch": 0.3946496678424983, + "grad_norm": 1.9637916088104248, + "learning_rate": 0.00013799893933625547, + "loss": 1.2479, + "step": 11020 + }, + { + "epoch": 0.39468547997206654, + "grad_norm": 1.3697775602340698, + "learning_rate": 0.00013798821009990486, + "loss": 1.457, + "step": 11021 + }, + { + "epoch": 0.3947212921016348, + "grad_norm": 1.6081949472427368, + "learning_rate": 0.00013797748035247184, + "loss": 1.509, + "step": 11022 + }, + { + "epoch": 0.3947571042312031, + "grad_norm": 2.8257575035095215, + "learning_rate": 0.0001379667500941008, + "loss": 1.5442, + "step": 11023 + }, + { + "epoch": 0.3947929163607714, + "grad_norm": 1.9925357103347778, + "learning_rate": 0.00013795601932493613, + "loss": 1.6229, + "step": 11024 + }, + { + "epoch": 0.3948287284903397, + "grad_norm": 1.7090239524841309, + "learning_rate": 0.0001379452880451222, + "loss": 1.4409, + "step": 11025 + }, + { + "epoch": 0.39486454061990794, + "grad_norm": 1.6960958242416382, + "learning_rate": 0.00013793455625480332, + "loss": 1.4845, + "step": 11026 + }, + { + "epoch": 0.39490035274947627, + "grad_norm": 1.5618412494659424, + "learning_rate": 0.00013792382395412392, + "loss": 1.8159, + "step": 11027 + }, + { + "epoch": 0.39493616487904454, + "grad_norm": 1.413428783416748, + "learning_rate": 0.00013791309114322841, + "loss": 1.2555, + "step": 11028 + }, + { + "epoch": 0.3949719770086128, + "grad_norm": 2.023017644882202, + "learning_rate": 0.00013790235782226118, + "loss": 1.5708, + "step": 11029 + }, + { + "epoch": 0.3950077891381811, + "grad_norm": 1.610162615776062, + "learning_rate": 0.0001378916239913666, + "loss": 1.7002, + "step": 11030 + }, + { + "epoch": 0.3950436012677494, + "grad_norm": 1.5207340717315674, + "learning_rate": 0.0001378808896506891, + "loss": 1.4454, + "step": 11031 + }, + { + "epoch": 0.39507941339731767, + "grad_norm": 1.6515626907348633, + "learning_rate": 0.00013787015480037307, + "loss": 1.5858, + "step": 11032 + }, + { + "epoch": 0.39511522552688594, + "grad_norm": 2.073228359222412, + "learning_rate": 0.00013785941944056298, + "loss": 1.5202, + "step": 11033 + }, + { + "epoch": 0.39515103765645426, + "grad_norm": 1.6093931198120117, + "learning_rate": 0.00013784868357140322, + "loss": 1.6465, + "step": 11034 + }, + { + "epoch": 0.39518684978602253, + "grad_norm": 2.0961008071899414, + "learning_rate": 0.00013783794719303825, + "loss": 1.6145, + "step": 11035 + }, + { + "epoch": 0.3952226619155908, + "grad_norm": 1.5990267992019653, + "learning_rate": 0.0001378272103056125, + "loss": 1.6946, + "step": 11036 + }, + { + "epoch": 0.39525847404515907, + "grad_norm": 1.5354348421096802, + "learning_rate": 0.00013781647290927044, + "loss": 1.4329, + "step": 11037 + }, + { + "epoch": 0.3952942861747274, + "grad_norm": 1.4117317199707031, + "learning_rate": 0.00013780573500415654, + "loss": 1.4226, + "step": 11038 + }, + { + "epoch": 0.39533009830429566, + "grad_norm": 1.3799502849578857, + "learning_rate": 0.0001377949965904152, + "loss": 1.3842, + "step": 11039 + }, + { + "epoch": 0.39536591043386393, + "grad_norm": 1.3154661655426025, + "learning_rate": 0.00013778425766819096, + "loss": 1.3348, + "step": 11040 + }, + { + "epoch": 0.39540172256343226, + "grad_norm": 2.150657892227173, + "learning_rate": 0.00013777351823762826, + "loss": 1.652, + "step": 11041 + }, + { + "epoch": 0.3954375346930005, + "grad_norm": 1.667708396911621, + "learning_rate": 0.0001377627782988716, + "loss": 1.3867, + "step": 11042 + }, + { + "epoch": 0.3954733468225688, + "grad_norm": 1.616040825843811, + "learning_rate": 0.00013775203785206544, + "loss": 1.2286, + "step": 11043 + }, + { + "epoch": 0.39550915895213706, + "grad_norm": 1.6211915016174316, + "learning_rate": 0.00013774129689735437, + "loss": 1.3339, + "step": 11044 + }, + { + "epoch": 0.3955449710817054, + "grad_norm": 2.3818204402923584, + "learning_rate": 0.00013773055543488276, + "loss": 1.5299, + "step": 11045 + }, + { + "epoch": 0.39558078321127366, + "grad_norm": 1.4598685503005981, + "learning_rate": 0.00013771981346479524, + "loss": 1.5545, + "step": 11046 + }, + { + "epoch": 0.3956165953408419, + "grad_norm": 2.1078193187713623, + "learning_rate": 0.0001377090709872363, + "loss": 1.712, + "step": 11047 + }, + { + "epoch": 0.39565240747041025, + "grad_norm": 1.702226161956787, + "learning_rate": 0.00013769832800235041, + "loss": 1.4261, + "step": 11048 + }, + { + "epoch": 0.3956882195999785, + "grad_norm": 1.6927379369735718, + "learning_rate": 0.00013768758451028216, + "loss": 1.5355, + "step": 11049 + }, + { + "epoch": 0.3957240317295468, + "grad_norm": 1.7418303489685059, + "learning_rate": 0.00013767684051117605, + "loss": 1.5623, + "step": 11050 + }, + { + "epoch": 0.39575984385911506, + "grad_norm": 1.6147509813308716, + "learning_rate": 0.0001376660960051767, + "loss": 1.7547, + "step": 11051 + }, + { + "epoch": 0.3957956559886834, + "grad_norm": 1.5473684072494507, + "learning_rate": 0.0001376553509924286, + "loss": 1.4403, + "step": 11052 + }, + { + "epoch": 0.39583146811825165, + "grad_norm": 1.8391984701156616, + "learning_rate": 0.00013764460547307632, + "loss": 1.6435, + "step": 11053 + }, + { + "epoch": 0.3958672802478199, + "grad_norm": 1.5264986753463745, + "learning_rate": 0.0001376338594472644, + "loss": 1.4671, + "step": 11054 + }, + { + "epoch": 0.39590309237738824, + "grad_norm": 2.0358102321624756, + "learning_rate": 0.00013762311291513747, + "loss": 1.5083, + "step": 11055 + }, + { + "epoch": 0.3959389045069565, + "grad_norm": 1.842419147491455, + "learning_rate": 0.0001376123658768401, + "loss": 1.5692, + "step": 11056 + }, + { + "epoch": 0.3959747166365248, + "grad_norm": 1.4179729223251343, + "learning_rate": 0.00013760161833251683, + "loss": 1.4895, + "step": 11057 + }, + { + "epoch": 0.39601052876609305, + "grad_norm": 1.3994336128234863, + "learning_rate": 0.00013759087028231232, + "loss": 1.3776, + "step": 11058 + }, + { + "epoch": 0.3960463408956614, + "grad_norm": 2.0251388549804688, + "learning_rate": 0.0001375801217263711, + "loss": 1.6126, + "step": 11059 + }, + { + "epoch": 0.39608215302522964, + "grad_norm": 1.455640435218811, + "learning_rate": 0.00013756937266483788, + "loss": 1.513, + "step": 11060 + }, + { + "epoch": 0.3961179651547979, + "grad_norm": 1.6252317428588867, + "learning_rate": 0.00013755862309785716, + "loss": 1.6759, + "step": 11061 + }, + { + "epoch": 0.39615377728436624, + "grad_norm": 2.397956609725952, + "learning_rate": 0.00013754787302557364, + "loss": 1.5216, + "step": 11062 + }, + { + "epoch": 0.3961895894139345, + "grad_norm": 1.762626051902771, + "learning_rate": 0.0001375371224481319, + "loss": 1.7711, + "step": 11063 + }, + { + "epoch": 0.3962254015435028, + "grad_norm": 1.7544844150543213, + "learning_rate": 0.0001375263713656766, + "loss": 1.3992, + "step": 11064 + }, + { + "epoch": 0.39626121367307104, + "grad_norm": 1.8529858589172363, + "learning_rate": 0.00013751561977835242, + "loss": 1.9665, + "step": 11065 + }, + { + "epoch": 0.39629702580263937, + "grad_norm": 1.527550220489502, + "learning_rate": 0.00013750486768630393, + "loss": 1.6083, + "step": 11066 + }, + { + "epoch": 0.39633283793220764, + "grad_norm": 1.813584327697754, + "learning_rate": 0.00013749411508967582, + "loss": 1.5276, + "step": 11067 + }, + { + "epoch": 0.3963686500617759, + "grad_norm": 2.098923921585083, + "learning_rate": 0.0001374833619886128, + "loss": 2.1238, + "step": 11068 + }, + { + "epoch": 0.39640446219134423, + "grad_norm": 1.9573220014572144, + "learning_rate": 0.0001374726083832594, + "loss": 1.4054, + "step": 11069 + }, + { + "epoch": 0.3964402743209125, + "grad_norm": 1.5510364770889282, + "learning_rate": 0.00013746185427376047, + "loss": 1.4817, + "step": 11070 + }, + { + "epoch": 0.39647608645048077, + "grad_norm": 1.4360061883926392, + "learning_rate": 0.00013745109966026056, + "loss": 1.533, + "step": 11071 + }, + { + "epoch": 0.39651189858004904, + "grad_norm": 1.4258822202682495, + "learning_rate": 0.00013744034454290447, + "loss": 1.6174, + "step": 11072 + }, + { + "epoch": 0.39654771070961736, + "grad_norm": 1.630110740661621, + "learning_rate": 0.0001374295889218368, + "loss": 1.4781, + "step": 11073 + }, + { + "epoch": 0.39658352283918563, + "grad_norm": 1.3876277208328247, + "learning_rate": 0.0001374188327972023, + "loss": 1.4681, + "step": 11074 + }, + { + "epoch": 0.3966193349687539, + "grad_norm": 1.5342926979064941, + "learning_rate": 0.0001374080761691457, + "loss": 1.5694, + "step": 11075 + }, + { + "epoch": 0.3966551470983222, + "grad_norm": 1.8465675115585327, + "learning_rate": 0.00013739731903781164, + "loss": 1.4775, + "step": 11076 + }, + { + "epoch": 0.3966909592278905, + "grad_norm": 1.8121492862701416, + "learning_rate": 0.00013738656140334493, + "loss": 1.4287, + "step": 11077 + }, + { + "epoch": 0.39672677135745876, + "grad_norm": 1.8412760496139526, + "learning_rate": 0.00013737580326589024, + "loss": 1.7312, + "step": 11078 + }, + { + "epoch": 0.39676258348702703, + "grad_norm": 2.1565401554107666, + "learning_rate": 0.00013736504462559234, + "loss": 1.4282, + "step": 11079 + }, + { + "epoch": 0.39679839561659536, + "grad_norm": 1.9769231081008911, + "learning_rate": 0.00013735428548259594, + "loss": 1.3531, + "step": 11080 + }, + { + "epoch": 0.3968342077461636, + "grad_norm": 2.425816535949707, + "learning_rate": 0.00013734352583704587, + "loss": 1.6955, + "step": 11081 + }, + { + "epoch": 0.3968700198757319, + "grad_norm": 1.4811224937438965, + "learning_rate": 0.00013733276568908678, + "loss": 1.7023, + "step": 11082 + }, + { + "epoch": 0.3969058320053002, + "grad_norm": 1.5459316968917847, + "learning_rate": 0.00013732200503886348, + "loss": 1.3232, + "step": 11083 + }, + { + "epoch": 0.3969416441348685, + "grad_norm": 1.856197476387024, + "learning_rate": 0.00013731124388652077, + "loss": 1.5471, + "step": 11084 + }, + { + "epoch": 0.39697745626443676, + "grad_norm": 1.9269754886627197, + "learning_rate": 0.00013730048223220336, + "loss": 1.4477, + "step": 11085 + }, + { + "epoch": 0.397013268394005, + "grad_norm": 1.5941188335418701, + "learning_rate": 0.00013728972007605612, + "loss": 1.5713, + "step": 11086 + }, + { + "epoch": 0.39704908052357335, + "grad_norm": 1.502655267715454, + "learning_rate": 0.00013727895741822379, + "loss": 1.3498, + "step": 11087 + }, + { + "epoch": 0.3970848926531416, + "grad_norm": 1.3070005178451538, + "learning_rate": 0.00013726819425885115, + "loss": 1.5596, + "step": 11088 + }, + { + "epoch": 0.3971207047827099, + "grad_norm": 1.96981680393219, + "learning_rate": 0.00013725743059808305, + "loss": 1.588, + "step": 11089 + }, + { + "epoch": 0.3971565169122782, + "grad_norm": 2.043673276901245, + "learning_rate": 0.00013724666643606429, + "loss": 1.6474, + "step": 11090 + }, + { + "epoch": 0.3971923290418465, + "grad_norm": 1.5892807245254517, + "learning_rate": 0.00013723590177293967, + "loss": 1.5802, + "step": 11091 + }, + { + "epoch": 0.39722814117141475, + "grad_norm": 2.098597764968872, + "learning_rate": 0.000137225136608854, + "loss": 1.8254, + "step": 11092 + }, + { + "epoch": 0.397263953300983, + "grad_norm": 1.9812650680541992, + "learning_rate": 0.00013721437094395213, + "loss": 1.4145, + "step": 11093 + }, + { + "epoch": 0.39729976543055134, + "grad_norm": 2.5424814224243164, + "learning_rate": 0.00013720360477837893, + "loss": 1.5099, + "step": 11094 + }, + { + "epoch": 0.3973355775601196, + "grad_norm": 1.5709184408187866, + "learning_rate": 0.00013719283811227921, + "loss": 1.6173, + "step": 11095 + }, + { + "epoch": 0.3973713896896879, + "grad_norm": 1.7686853408813477, + "learning_rate": 0.0001371820709457978, + "loss": 1.8625, + "step": 11096 + }, + { + "epoch": 0.3974072018192562, + "grad_norm": 2.2390694618225098, + "learning_rate": 0.00013717130327907964, + "loss": 1.6054, + "step": 11097 + }, + { + "epoch": 0.3974430139488245, + "grad_norm": 2.229501247406006, + "learning_rate": 0.00013716053511226949, + "loss": 1.4254, + "step": 11098 + }, + { + "epoch": 0.39747882607839274, + "grad_norm": 2.011171817779541, + "learning_rate": 0.00013714976644551232, + "loss": 1.4815, + "step": 11099 + }, + { + "epoch": 0.397514638207961, + "grad_norm": 1.347294569015503, + "learning_rate": 0.00013713899727895294, + "loss": 1.3786, + "step": 11100 + }, + { + "epoch": 0.39755045033752934, + "grad_norm": 1.8997390270233154, + "learning_rate": 0.00013712822761273625, + "loss": 1.6241, + "step": 11101 + }, + { + "epoch": 0.3975862624670976, + "grad_norm": 1.4208718538284302, + "learning_rate": 0.00013711745744700714, + "loss": 1.5554, + "step": 11102 + }, + { + "epoch": 0.3976220745966659, + "grad_norm": 1.6987639665603638, + "learning_rate": 0.00013710668678191054, + "loss": 1.5971, + "step": 11103 + }, + { + "epoch": 0.3976578867262342, + "grad_norm": 2.2498080730438232, + "learning_rate": 0.00013709591561759135, + "loss": 1.378, + "step": 11104 + }, + { + "epoch": 0.39769369885580247, + "grad_norm": 1.919611930847168, + "learning_rate": 0.00013708514395419444, + "loss": 1.8187, + "step": 11105 + }, + { + "epoch": 0.39772951098537074, + "grad_norm": 1.7510775327682495, + "learning_rate": 0.00013707437179186476, + "loss": 1.4297, + "step": 11106 + }, + { + "epoch": 0.397765323114939, + "grad_norm": 1.8031326532363892, + "learning_rate": 0.0001370635991307472, + "loss": 1.6006, + "step": 11107 + }, + { + "epoch": 0.39780113524450733, + "grad_norm": 1.6740226745605469, + "learning_rate": 0.00013705282597098676, + "loss": 1.2622, + "step": 11108 + }, + { + "epoch": 0.3978369473740756, + "grad_norm": 1.7174181938171387, + "learning_rate": 0.00013704205231272835, + "loss": 1.7329, + "step": 11109 + }, + { + "epoch": 0.39787275950364387, + "grad_norm": 1.7932454347610474, + "learning_rate": 0.0001370312781561169, + "loss": 1.6485, + "step": 11110 + }, + { + "epoch": 0.3979085716332122, + "grad_norm": 1.698287010192871, + "learning_rate": 0.00013702050350129734, + "loss": 1.6169, + "step": 11111 + }, + { + "epoch": 0.39794438376278046, + "grad_norm": 1.7249375581741333, + "learning_rate": 0.00013700972834841468, + "loss": 1.9336, + "step": 11112 + }, + { + "epoch": 0.39798019589234873, + "grad_norm": 1.4442729949951172, + "learning_rate": 0.00013699895269761385, + "loss": 1.2067, + "step": 11113 + }, + { + "epoch": 0.398016008021917, + "grad_norm": 1.2577317953109741, + "learning_rate": 0.00013698817654903984, + "loss": 1.5325, + "step": 11114 + }, + { + "epoch": 0.3980518201514853, + "grad_norm": 1.5931488275527954, + "learning_rate": 0.00013697739990283764, + "loss": 1.568, + "step": 11115 + }, + { + "epoch": 0.3980876322810536, + "grad_norm": 1.8899900913238525, + "learning_rate": 0.00013696662275915222, + "loss": 1.2821, + "step": 11116 + }, + { + "epoch": 0.39812344441062186, + "grad_norm": 1.9897520542144775, + "learning_rate": 0.00013695584511812857, + "loss": 1.5535, + "step": 11117 + }, + { + "epoch": 0.3981592565401902, + "grad_norm": 2.0698153972625732, + "learning_rate": 0.00013694506697991169, + "loss": 1.5855, + "step": 11118 + }, + { + "epoch": 0.39819506866975846, + "grad_norm": 2.299964427947998, + "learning_rate": 0.00013693428834464662, + "loss": 1.5469, + "step": 11119 + }, + { + "epoch": 0.3982308807993267, + "grad_norm": 1.849847674369812, + "learning_rate": 0.00013692350921247829, + "loss": 1.4537, + "step": 11120 + }, + { + "epoch": 0.398266692928895, + "grad_norm": 1.605886459350586, + "learning_rate": 0.00013691272958355182, + "loss": 1.6433, + "step": 11121 + }, + { + "epoch": 0.3983025050584633, + "grad_norm": 1.4784764051437378, + "learning_rate": 0.00013690194945801214, + "loss": 1.6726, + "step": 11122 + }, + { + "epoch": 0.3983383171880316, + "grad_norm": 2.1290183067321777, + "learning_rate": 0.00013689116883600436, + "loss": 1.6535, + "step": 11123 + }, + { + "epoch": 0.39837412931759986, + "grad_norm": 1.8331060409545898, + "learning_rate": 0.00013688038771767345, + "loss": 1.5419, + "step": 11124 + }, + { + "epoch": 0.3984099414471682, + "grad_norm": 1.70211660861969, + "learning_rate": 0.00013686960610316456, + "loss": 1.2083, + "step": 11125 + }, + { + "epoch": 0.39844575357673645, + "grad_norm": 1.9089553356170654, + "learning_rate": 0.00013685882399262265, + "loss": 1.8192, + "step": 11126 + }, + { + "epoch": 0.3984815657063047, + "grad_norm": 1.5048595666885376, + "learning_rate": 0.0001368480413861928, + "loss": 1.5921, + "step": 11127 + }, + { + "epoch": 0.398517377835873, + "grad_norm": 1.579541802406311, + "learning_rate": 0.0001368372582840201, + "loss": 1.5551, + "step": 11128 + }, + { + "epoch": 0.3985531899654413, + "grad_norm": 1.3262720108032227, + "learning_rate": 0.00013682647468624958, + "loss": 1.4928, + "step": 11129 + }, + { + "epoch": 0.3985890020950096, + "grad_norm": 1.7601323127746582, + "learning_rate": 0.00013681569059302638, + "loss": 1.5003, + "step": 11130 + }, + { + "epoch": 0.39862481422457785, + "grad_norm": 2.1725378036499023, + "learning_rate": 0.00013680490600449552, + "loss": 1.6579, + "step": 11131 + }, + { + "epoch": 0.3986606263541462, + "grad_norm": 1.540235161781311, + "learning_rate": 0.00013679412092080213, + "loss": 1.5752, + "step": 11132 + }, + { + "epoch": 0.39869643848371444, + "grad_norm": 1.5734163522720337, + "learning_rate": 0.0001367833353420913, + "loss": 1.7024, + "step": 11133 + }, + { + "epoch": 0.3987322506132827, + "grad_norm": 1.3746634721755981, + "learning_rate": 0.00013677254926850818, + "loss": 1.53, + "step": 11134 + }, + { + "epoch": 0.398768062742851, + "grad_norm": 1.5181868076324463, + "learning_rate": 0.0001367617627001978, + "loss": 1.4935, + "step": 11135 + }, + { + "epoch": 0.3988038748724193, + "grad_norm": 1.9445048570632935, + "learning_rate": 0.0001367509756373053, + "loss": 1.541, + "step": 11136 + }, + { + "epoch": 0.3988396870019876, + "grad_norm": 1.680619478225708, + "learning_rate": 0.0001367401880799759, + "loss": 1.596, + "step": 11137 + }, + { + "epoch": 0.39887549913155584, + "grad_norm": 1.310683250427246, + "learning_rate": 0.0001367294000283546, + "loss": 1.5233, + "step": 11138 + }, + { + "epoch": 0.39891131126112417, + "grad_norm": 1.5829931497573853, + "learning_rate": 0.00013671861148258665, + "loss": 1.6821, + "step": 11139 + }, + { + "epoch": 0.39894712339069244, + "grad_norm": 1.4462175369262695, + "learning_rate": 0.0001367078224428171, + "loss": 1.5534, + "step": 11140 + }, + { + "epoch": 0.3989829355202607, + "grad_norm": 1.4816615581512451, + "learning_rate": 0.00013669703290919118, + "loss": 1.5118, + "step": 11141 + }, + { + "epoch": 0.399018747649829, + "grad_norm": 1.906777262687683, + "learning_rate": 0.00013668624288185402, + "loss": 1.8655, + "step": 11142 + }, + { + "epoch": 0.3990545597793973, + "grad_norm": 1.6060969829559326, + "learning_rate": 0.00013667545236095076, + "loss": 1.4475, + "step": 11143 + }, + { + "epoch": 0.39909037190896557, + "grad_norm": 1.6323474645614624, + "learning_rate": 0.00013666466134662662, + "loss": 1.6315, + "step": 11144 + }, + { + "epoch": 0.39912618403853384, + "grad_norm": 1.5477476119995117, + "learning_rate": 0.00013665386983902672, + "loss": 1.7228, + "step": 11145 + }, + { + "epoch": 0.39916199616810216, + "grad_norm": 1.9727189540863037, + "learning_rate": 0.00013664307783829634, + "loss": 1.5455, + "step": 11146 + }, + { + "epoch": 0.39919780829767043, + "grad_norm": 1.7289396524429321, + "learning_rate": 0.00013663228534458054, + "loss": 1.5659, + "step": 11147 + }, + { + "epoch": 0.3992336204272387, + "grad_norm": 1.9361799955368042, + "learning_rate": 0.00013662149235802465, + "loss": 1.4145, + "step": 11148 + }, + { + "epoch": 0.39926943255680697, + "grad_norm": 1.8130748271942139, + "learning_rate": 0.0001366106988787738, + "loss": 1.8146, + "step": 11149 + }, + { + "epoch": 0.3993052446863753, + "grad_norm": 1.4281504154205322, + "learning_rate": 0.00013659990490697322, + "loss": 1.4696, + "step": 11150 + }, + { + "epoch": 0.39934105681594356, + "grad_norm": 1.5529637336730957, + "learning_rate": 0.0001365891104427681, + "loss": 1.6064, + "step": 11151 + }, + { + "epoch": 0.39937686894551183, + "grad_norm": 1.484028697013855, + "learning_rate": 0.00013657831548630377, + "loss": 1.5061, + "step": 11152 + }, + { + "epoch": 0.39941268107508016, + "grad_norm": 1.630724310874939, + "learning_rate": 0.00013656752003772535, + "loss": 1.5966, + "step": 11153 + }, + { + "epoch": 0.3994484932046484, + "grad_norm": 1.61417818069458, + "learning_rate": 0.00013655672409717813, + "loss": 1.4266, + "step": 11154 + }, + { + "epoch": 0.3994843053342167, + "grad_norm": 3.1259992122650146, + "learning_rate": 0.0001365459276648073, + "loss": 1.6252, + "step": 11155 + }, + { + "epoch": 0.39952011746378496, + "grad_norm": 1.3071650266647339, + "learning_rate": 0.00013653513074075816, + "loss": 1.5813, + "step": 11156 + }, + { + "epoch": 0.3995559295933533, + "grad_norm": 1.3977751731872559, + "learning_rate": 0.000136524333325176, + "loss": 1.6765, + "step": 11157 + }, + { + "epoch": 0.39959174172292156, + "grad_norm": 1.4183727502822876, + "learning_rate": 0.00013651353541820603, + "loss": 1.299, + "step": 11158 + }, + { + "epoch": 0.3996275538524898, + "grad_norm": 1.6623196601867676, + "learning_rate": 0.00013650273701999353, + "loss": 1.4861, + "step": 11159 + }, + { + "epoch": 0.39966336598205815, + "grad_norm": 1.5375479459762573, + "learning_rate": 0.0001364919381306838, + "loss": 1.7469, + "step": 11160 + }, + { + "epoch": 0.3996991781116264, + "grad_norm": 1.6787738800048828, + "learning_rate": 0.00013648113875042213, + "loss": 1.4347, + "step": 11161 + }, + { + "epoch": 0.3997349902411947, + "grad_norm": 2.2604610919952393, + "learning_rate": 0.00013647033887935378, + "loss": 1.3712, + "step": 11162 + }, + { + "epoch": 0.39977080237076296, + "grad_norm": 2.34169340133667, + "learning_rate": 0.00013645953851762406, + "loss": 1.1879, + "step": 11163 + }, + { + "epoch": 0.3998066145003313, + "grad_norm": 1.8770477771759033, + "learning_rate": 0.00013644873766537828, + "loss": 1.5683, + "step": 11164 + }, + { + "epoch": 0.39984242662989955, + "grad_norm": 1.317833423614502, + "learning_rate": 0.00013643793632276175, + "loss": 1.0902, + "step": 11165 + }, + { + "epoch": 0.3998782387594678, + "grad_norm": 1.86030912399292, + "learning_rate": 0.00013642713448991977, + "loss": 1.0972, + "step": 11166 + }, + { + "epoch": 0.39991405088903614, + "grad_norm": 3.0373973846435547, + "learning_rate": 0.0001364163321669977, + "loss": 1.9666, + "step": 11167 + }, + { + "epoch": 0.3999498630186044, + "grad_norm": 1.878786563873291, + "learning_rate": 0.00013640552935414085, + "loss": 1.4069, + "step": 11168 + }, + { + "epoch": 0.3999856751481727, + "grad_norm": 1.554965615272522, + "learning_rate": 0.00013639472605149456, + "loss": 1.3437, + "step": 11169 + }, + { + "epoch": 0.40002148727774095, + "grad_norm": 1.9674904346466064, + "learning_rate": 0.00013638392225920418, + "loss": 1.7638, + "step": 11170 + }, + { + "epoch": 0.4000572994073093, + "grad_norm": 1.3602914810180664, + "learning_rate": 0.00013637311797741507, + "loss": 1.4951, + "step": 11171 + }, + { + "epoch": 0.40009311153687754, + "grad_norm": 2.014244318008423, + "learning_rate": 0.00013636231320627258, + "loss": 1.5531, + "step": 11172 + }, + { + "epoch": 0.4001289236664458, + "grad_norm": 1.8055243492126465, + "learning_rate": 0.00013635150794592205, + "loss": 1.1671, + "step": 11173 + }, + { + "epoch": 0.40016473579601414, + "grad_norm": 1.5012273788452148, + "learning_rate": 0.00013634070219650888, + "loss": 1.5464, + "step": 11174 + }, + { + "epoch": 0.4002005479255824, + "grad_norm": 1.5045928955078125, + "learning_rate": 0.00013632989595817842, + "loss": 1.1954, + "step": 11175 + }, + { + "epoch": 0.4002363600551507, + "grad_norm": 1.548518180847168, + "learning_rate": 0.0001363190892310761, + "loss": 1.3469, + "step": 11176 + }, + { + "epoch": 0.40027217218471894, + "grad_norm": 1.9656542539596558, + "learning_rate": 0.00013630828201534727, + "loss": 1.4379, + "step": 11177 + }, + { + "epoch": 0.40030798431428727, + "grad_norm": 1.4483617544174194, + "learning_rate": 0.00013629747431113734, + "loss": 1.7661, + "step": 11178 + }, + { + "epoch": 0.40034379644385554, + "grad_norm": 2.1301279067993164, + "learning_rate": 0.0001362866661185917, + "loss": 1.5943, + "step": 11179 + }, + { + "epoch": 0.4003796085734238, + "grad_norm": 1.7540024518966675, + "learning_rate": 0.00013627585743785582, + "loss": 1.6543, + "step": 11180 + }, + { + "epoch": 0.40041542070299213, + "grad_norm": 1.4993693828582764, + "learning_rate": 0.000136265048269075, + "loss": 1.4884, + "step": 11181 + }, + { + "epoch": 0.4004512328325604, + "grad_norm": 1.55181086063385, + "learning_rate": 0.0001362542386123948, + "loss": 1.3658, + "step": 11182 + }, + { + "epoch": 0.40048704496212867, + "grad_norm": 1.8561002016067505, + "learning_rate": 0.00013624342846796058, + "loss": 1.321, + "step": 11183 + }, + { + "epoch": 0.40052285709169694, + "grad_norm": 1.5602530241012573, + "learning_rate": 0.00013623261783591773, + "loss": 1.6882, + "step": 11184 + }, + { + "epoch": 0.40055866922126526, + "grad_norm": 1.8052551746368408, + "learning_rate": 0.00013622180671641178, + "loss": 1.3348, + "step": 11185 + }, + { + "epoch": 0.40059448135083353, + "grad_norm": 1.9859580993652344, + "learning_rate": 0.0001362109951095881, + "loss": 1.6761, + "step": 11186 + }, + { + "epoch": 0.4006302934804018, + "grad_norm": 1.7264128923416138, + "learning_rate": 0.00013620018301559224, + "loss": 1.5664, + "step": 11187 + }, + { + "epoch": 0.4006661056099701, + "grad_norm": 1.941522240638733, + "learning_rate": 0.0001361893704345696, + "loss": 1.5823, + "step": 11188 + }, + { + "epoch": 0.4007019177395384, + "grad_norm": 1.5751817226409912, + "learning_rate": 0.00013617855736666566, + "loss": 1.5329, + "step": 11189 + }, + { + "epoch": 0.40073772986910666, + "grad_norm": 2.8278791904449463, + "learning_rate": 0.00013616774381202591, + "loss": 1.644, + "step": 11190 + }, + { + "epoch": 0.40077354199867493, + "grad_norm": 1.5788226127624512, + "learning_rate": 0.00013615692977079577, + "loss": 1.3641, + "step": 11191 + }, + { + "epoch": 0.40080935412824326, + "grad_norm": 1.462515115737915, + "learning_rate": 0.00013614611524312084, + "loss": 1.6041, + "step": 11192 + }, + { + "epoch": 0.4008451662578115, + "grad_norm": 1.6150527000427246, + "learning_rate": 0.0001361353002291465, + "loss": 1.4713, + "step": 11193 + }, + { + "epoch": 0.4008809783873798, + "grad_norm": 1.730208158493042, + "learning_rate": 0.00013612448472901834, + "loss": 1.5619, + "step": 11194 + }, + { + "epoch": 0.40091679051694806, + "grad_norm": 1.573136329650879, + "learning_rate": 0.00013611366874288186, + "loss": 1.8576, + "step": 11195 + }, + { + "epoch": 0.4009526026465164, + "grad_norm": 1.67069411277771, + "learning_rate": 0.0001361028522708825, + "loss": 1.6783, + "step": 11196 + }, + { + "epoch": 0.40098841477608466, + "grad_norm": 1.7830244302749634, + "learning_rate": 0.00013609203531316587, + "loss": 1.5543, + "step": 11197 + }, + { + "epoch": 0.4010242269056529, + "grad_norm": 1.4327481985092163, + "learning_rate": 0.0001360812178698774, + "loss": 1.1594, + "step": 11198 + }, + { + "epoch": 0.40106003903522125, + "grad_norm": 1.6612839698791504, + "learning_rate": 0.00013607039994116278, + "loss": 1.4161, + "step": 11199 + }, + { + "epoch": 0.4010958511647895, + "grad_norm": 2.0589075088500977, + "learning_rate": 0.00013605958152716738, + "loss": 1.7591, + "step": 11200 + }, + { + "epoch": 0.4011316632943578, + "grad_norm": 1.7634742259979248, + "learning_rate": 0.00013604876262803686, + "loss": 1.3869, + "step": 11201 + }, + { + "epoch": 0.40116747542392606, + "grad_norm": 2.3785300254821777, + "learning_rate": 0.00013603794324391672, + "loss": 1.3486, + "step": 11202 + }, + { + "epoch": 0.4012032875534944, + "grad_norm": 1.635969877243042, + "learning_rate": 0.00013602712337495255, + "loss": 1.3801, + "step": 11203 + }, + { + "epoch": 0.40123909968306265, + "grad_norm": 1.580276370048523, + "learning_rate": 0.0001360163030212899, + "loss": 1.6318, + "step": 11204 + }, + { + "epoch": 0.4012749118126309, + "grad_norm": 1.8174611330032349, + "learning_rate": 0.00013600548218307436, + "loss": 1.3491, + "step": 11205 + }, + { + "epoch": 0.40131072394219924, + "grad_norm": 1.3981926441192627, + "learning_rate": 0.0001359946608604515, + "loss": 1.6881, + "step": 11206 + }, + { + "epoch": 0.4013465360717675, + "grad_norm": 1.802919864654541, + "learning_rate": 0.00013598383905356692, + "loss": 1.2794, + "step": 11207 + }, + { + "epoch": 0.4013823482013358, + "grad_norm": 1.6066579818725586, + "learning_rate": 0.00013597301676256617, + "loss": 1.356, + "step": 11208 + }, + { + "epoch": 0.40141816033090405, + "grad_norm": 1.9527837038040161, + "learning_rate": 0.0001359621939875949, + "loss": 1.5963, + "step": 11209 + }, + { + "epoch": 0.4014539724604724, + "grad_norm": 1.324955701828003, + "learning_rate": 0.00013595137072879867, + "loss": 1.2462, + "step": 11210 + }, + { + "epoch": 0.40148978459004064, + "grad_norm": 1.5357877016067505, + "learning_rate": 0.00013594054698632315, + "loss": 1.4547, + "step": 11211 + }, + { + "epoch": 0.4015255967196089, + "grad_norm": 1.6919299364089966, + "learning_rate": 0.00013592972276031394, + "loss": 1.6643, + "step": 11212 + }, + { + "epoch": 0.40156140884917724, + "grad_norm": 1.8728044033050537, + "learning_rate": 0.00013591889805091663, + "loss": 1.6113, + "step": 11213 + }, + { + "epoch": 0.4015972209787455, + "grad_norm": 2.333636999130249, + "learning_rate": 0.00013590807285827688, + "loss": 1.7807, + "step": 11214 + }, + { + "epoch": 0.4016330331083138, + "grad_norm": 1.5089396238327026, + "learning_rate": 0.00013589724718254036, + "loss": 1.212, + "step": 11215 + }, + { + "epoch": 0.40166884523788204, + "grad_norm": 1.860792636871338, + "learning_rate": 0.00013588642102385266, + "loss": 1.782, + "step": 11216 + }, + { + "epoch": 0.40170465736745037, + "grad_norm": 1.6318405866622925, + "learning_rate": 0.00013587559438235945, + "loss": 1.5339, + "step": 11217 + }, + { + "epoch": 0.40174046949701864, + "grad_norm": 1.822420358657837, + "learning_rate": 0.0001358647672582064, + "loss": 1.453, + "step": 11218 + }, + { + "epoch": 0.4017762816265869, + "grad_norm": 1.4181208610534668, + "learning_rate": 0.00013585393965153916, + "loss": 1.3009, + "step": 11219 + }, + { + "epoch": 0.40181209375615523, + "grad_norm": 1.3538849353790283, + "learning_rate": 0.00013584311156250342, + "loss": 1.6589, + "step": 11220 + }, + { + "epoch": 0.4018479058857235, + "grad_norm": 1.5220893621444702, + "learning_rate": 0.00013583228299124484, + "loss": 1.7311, + "step": 11221 + }, + { + "epoch": 0.40188371801529177, + "grad_norm": 1.5974732637405396, + "learning_rate": 0.00013582145393790913, + "loss": 1.4992, + "step": 11222 + }, + { + "epoch": 0.40191953014486004, + "grad_norm": 2.0485281944274902, + "learning_rate": 0.00013581062440264194, + "loss": 1.4456, + "step": 11223 + }, + { + "epoch": 0.40195534227442836, + "grad_norm": 1.678962230682373, + "learning_rate": 0.000135799794385589, + "loss": 1.5724, + "step": 11224 + }, + { + "epoch": 0.40199115440399663, + "grad_norm": 1.9616618156433105, + "learning_rate": 0.00013578896388689602, + "loss": 1.5544, + "step": 11225 + }, + { + "epoch": 0.4020269665335649, + "grad_norm": 1.4487061500549316, + "learning_rate": 0.00013577813290670867, + "loss": 1.467, + "step": 11226 + }, + { + "epoch": 0.4020627786631332, + "grad_norm": 2.1156294345855713, + "learning_rate": 0.00013576730144517271, + "loss": 1.8479, + "step": 11227 + }, + { + "epoch": 0.4020985907927015, + "grad_norm": 1.5825573205947876, + "learning_rate": 0.00013575646950243384, + "loss": 1.7531, + "step": 11228 + }, + { + "epoch": 0.40213440292226976, + "grad_norm": 2.3100428581237793, + "learning_rate": 0.0001357456370786378, + "loss": 1.467, + "step": 11229 + }, + { + "epoch": 0.40217021505183803, + "grad_norm": 1.4864134788513184, + "learning_rate": 0.0001357348041739303, + "loss": 1.5086, + "step": 11230 + }, + { + "epoch": 0.40220602718140636, + "grad_norm": 1.748326301574707, + "learning_rate": 0.00013572397078845716, + "loss": 1.3973, + "step": 11231 + }, + { + "epoch": 0.4022418393109746, + "grad_norm": 1.4479734897613525, + "learning_rate": 0.00013571313692236405, + "loss": 1.4109, + "step": 11232 + }, + { + "epoch": 0.4022776514405429, + "grad_norm": 1.8514389991760254, + "learning_rate": 0.0001357023025757967, + "loss": 1.5098, + "step": 11233 + }, + { + "epoch": 0.4023134635701112, + "grad_norm": 1.9194141626358032, + "learning_rate": 0.00013569146774890099, + "loss": 1.6107, + "step": 11234 + }, + { + "epoch": 0.4023492756996795, + "grad_norm": 1.9210814237594604, + "learning_rate": 0.00013568063244182257, + "loss": 1.5859, + "step": 11235 + }, + { + "epoch": 0.40238508782924776, + "grad_norm": 1.5344477891921997, + "learning_rate": 0.00013566979665470728, + "loss": 1.4888, + "step": 11236 + }, + { + "epoch": 0.402420899958816, + "grad_norm": 2.001387596130371, + "learning_rate": 0.0001356589603877009, + "loss": 1.5566, + "step": 11237 + }, + { + "epoch": 0.40245671208838435, + "grad_norm": 1.8244560956954956, + "learning_rate": 0.0001356481236409492, + "loss": 1.2914, + "step": 11238 + }, + { + "epoch": 0.4024925242179526, + "grad_norm": 1.440247654914856, + "learning_rate": 0.00013563728641459793, + "loss": 1.5425, + "step": 11239 + }, + { + "epoch": 0.4025283363475209, + "grad_norm": 1.331313133239746, + "learning_rate": 0.000135626448708793, + "loss": 1.5918, + "step": 11240 + }, + { + "epoch": 0.4025641484770892, + "grad_norm": 1.4485142230987549, + "learning_rate": 0.00013561561052368015, + "loss": 1.5563, + "step": 11241 + }, + { + "epoch": 0.4025999606066575, + "grad_norm": 1.3569890260696411, + "learning_rate": 0.0001356047718594052, + "loss": 1.6618, + "step": 11242 + }, + { + "epoch": 0.40263577273622575, + "grad_norm": 1.8978397846221924, + "learning_rate": 0.00013559393271611397, + "loss": 1.7617, + "step": 11243 + }, + { + "epoch": 0.402671584865794, + "grad_norm": 1.6500682830810547, + "learning_rate": 0.00013558309309395224, + "loss": 1.1864, + "step": 11244 + }, + { + "epoch": 0.40270739699536234, + "grad_norm": 1.9103257656097412, + "learning_rate": 0.00013557225299306599, + "loss": 1.4868, + "step": 11245 + }, + { + "epoch": 0.4027432091249306, + "grad_norm": 1.6831821203231812, + "learning_rate": 0.00013556141241360088, + "loss": 1.8324, + "step": 11246 + }, + { + "epoch": 0.4027790212544989, + "grad_norm": 1.6163675785064697, + "learning_rate": 0.00013555057135570286, + "loss": 1.4925, + "step": 11247 + }, + { + "epoch": 0.4028148333840672, + "grad_norm": 1.781957983970642, + "learning_rate": 0.00013553972981951776, + "loss": 1.3588, + "step": 11248 + }, + { + "epoch": 0.4028506455136355, + "grad_norm": 1.2353285551071167, + "learning_rate": 0.00013552888780519144, + "loss": 1.4921, + "step": 11249 + }, + { + "epoch": 0.40288645764320374, + "grad_norm": 1.3386553525924683, + "learning_rate": 0.00013551804531286975, + "loss": 1.6313, + "step": 11250 + }, + { + "epoch": 0.402922269772772, + "grad_norm": 1.669419288635254, + "learning_rate": 0.0001355072023426986, + "loss": 1.432, + "step": 11251 + }, + { + "epoch": 0.40295808190234034, + "grad_norm": 2.4253623485565186, + "learning_rate": 0.00013549635889482383, + "loss": 1.4862, + "step": 11252 + }, + { + "epoch": 0.4029938940319086, + "grad_norm": 2.272901773452759, + "learning_rate": 0.00013548551496939132, + "loss": 1.4508, + "step": 11253 + }, + { + "epoch": 0.4030297061614769, + "grad_norm": 1.535967230796814, + "learning_rate": 0.00013547467056654702, + "loss": 1.576, + "step": 11254 + }, + { + "epoch": 0.4030655182910452, + "grad_norm": 1.4173799753189087, + "learning_rate": 0.00013546382568643676, + "loss": 1.7135, + "step": 11255 + }, + { + "epoch": 0.40310133042061347, + "grad_norm": 1.3089243173599243, + "learning_rate": 0.00013545298032920647, + "loss": 1.4986, + "step": 11256 + }, + { + "epoch": 0.40313714255018174, + "grad_norm": 1.5522119998931885, + "learning_rate": 0.00013544213449500204, + "loss": 1.5136, + "step": 11257 + }, + { + "epoch": 0.40317295467975, + "grad_norm": 1.4867783784866333, + "learning_rate": 0.00013543128818396946, + "loss": 1.4835, + "step": 11258 + }, + { + "epoch": 0.40320876680931833, + "grad_norm": 1.8949105739593506, + "learning_rate": 0.0001354204413962546, + "loss": 1.5772, + "step": 11259 + }, + { + "epoch": 0.4032445789388866, + "grad_norm": 2.0525848865509033, + "learning_rate": 0.00013540959413200335, + "loss": 1.4172, + "step": 11260 + }, + { + "epoch": 0.40328039106845487, + "grad_norm": 1.6664822101593018, + "learning_rate": 0.0001353987463913617, + "loss": 1.6019, + "step": 11261 + }, + { + "epoch": 0.4033162031980232, + "grad_norm": 2.6384687423706055, + "learning_rate": 0.0001353878981744756, + "loss": 1.633, + "step": 11262 + }, + { + "epoch": 0.40335201532759146, + "grad_norm": 1.2509064674377441, + "learning_rate": 0.00013537704948149093, + "loss": 1.5135, + "step": 11263 + }, + { + "epoch": 0.40338782745715973, + "grad_norm": 2.389633893966675, + "learning_rate": 0.00013536620031255373, + "loss": 1.7426, + "step": 11264 + }, + { + "epoch": 0.403423639586728, + "grad_norm": 2.0430221557617188, + "learning_rate": 0.0001353553506678099, + "loss": 1.5984, + "step": 11265 + }, + { + "epoch": 0.4034594517162963, + "grad_norm": 1.7450594902038574, + "learning_rate": 0.00013534450054740544, + "loss": 1.5399, + "step": 11266 + }, + { + "epoch": 0.4034952638458646, + "grad_norm": 1.6870568990707397, + "learning_rate": 0.0001353336499514863, + "loss": 1.4738, + "step": 11267 + }, + { + "epoch": 0.40353107597543286, + "grad_norm": 1.8001635074615479, + "learning_rate": 0.00013532279888019851, + "loss": 1.2668, + "step": 11268 + }, + { + "epoch": 0.4035668881050012, + "grad_norm": 2.56563663482666, + "learning_rate": 0.00013531194733368805, + "loss": 1.8747, + "step": 11269 + }, + { + "epoch": 0.40360270023456946, + "grad_norm": 1.735708236694336, + "learning_rate": 0.00013530109531210082, + "loss": 1.5633, + "step": 11270 + }, + { + "epoch": 0.4036385123641377, + "grad_norm": 1.9577866792678833, + "learning_rate": 0.00013529024281558292, + "loss": 1.4575, + "step": 11271 + }, + { + "epoch": 0.403674324493706, + "grad_norm": 2.2287938594818115, + "learning_rate": 0.00013527938984428031, + "loss": 1.6088, + "step": 11272 + }, + { + "epoch": 0.4037101366232743, + "grad_norm": 1.7325700521469116, + "learning_rate": 0.00013526853639833904, + "loss": 1.7316, + "step": 11273 + }, + { + "epoch": 0.4037459487528426, + "grad_norm": 1.6742218732833862, + "learning_rate": 0.0001352576824779051, + "loss": 1.3724, + "step": 11274 + }, + { + "epoch": 0.40378176088241086, + "grad_norm": 1.6109325885772705, + "learning_rate": 0.0001352468280831245, + "loss": 1.2103, + "step": 11275 + }, + { + "epoch": 0.4038175730119792, + "grad_norm": 1.916214108467102, + "learning_rate": 0.00013523597321414332, + "loss": 1.5114, + "step": 11276 + }, + { + "epoch": 0.40385338514154745, + "grad_norm": 2.142789840698242, + "learning_rate": 0.00013522511787110756, + "loss": 1.6882, + "step": 11277 + }, + { + "epoch": 0.4038891972711157, + "grad_norm": 1.7598340511322021, + "learning_rate": 0.00013521426205416326, + "loss": 1.49, + "step": 11278 + }, + { + "epoch": 0.403925009400684, + "grad_norm": 1.6194044351577759, + "learning_rate": 0.00013520340576345653, + "loss": 1.6978, + "step": 11279 + }, + { + "epoch": 0.4039608215302523, + "grad_norm": 1.4875582456588745, + "learning_rate": 0.00013519254899913333, + "loss": 1.7141, + "step": 11280 + }, + { + "epoch": 0.4039966336598206, + "grad_norm": 1.9088953733444214, + "learning_rate": 0.0001351816917613398, + "loss": 1.4717, + "step": 11281 + }, + { + "epoch": 0.40403244578938885, + "grad_norm": 1.5134812593460083, + "learning_rate": 0.00013517083405022203, + "loss": 1.6138, + "step": 11282 + }, + { + "epoch": 0.4040682579189572, + "grad_norm": 2.6607894897460938, + "learning_rate": 0.000135159975865926, + "loss": 1.6279, + "step": 11283 + }, + { + "epoch": 0.40410407004852544, + "grad_norm": 2.2413384914398193, + "learning_rate": 0.00013514911720859785, + "loss": 1.6775, + "step": 11284 + }, + { + "epoch": 0.4041398821780937, + "grad_norm": 1.3868253231048584, + "learning_rate": 0.00013513825807838373, + "loss": 1.425, + "step": 11285 + }, + { + "epoch": 0.404175694307662, + "grad_norm": 1.4734158515930176, + "learning_rate": 0.0001351273984754296, + "loss": 1.4511, + "step": 11286 + }, + { + "epoch": 0.4042115064372303, + "grad_norm": 1.6310186386108398, + "learning_rate": 0.00013511653839988168, + "loss": 1.5508, + "step": 11287 + }, + { + "epoch": 0.4042473185667986, + "grad_norm": 2.4635744094848633, + "learning_rate": 0.000135105677851886, + "loss": 1.4342, + "step": 11288 + }, + { + "epoch": 0.40428313069636684, + "grad_norm": 1.7325843572616577, + "learning_rate": 0.00013509481683158874, + "loss": 1.5283, + "step": 11289 + }, + { + "epoch": 0.40431894282593517, + "grad_norm": 1.6843584775924683, + "learning_rate": 0.00013508395533913593, + "loss": 1.4873, + "step": 11290 + }, + { + "epoch": 0.40435475495550344, + "grad_norm": 1.6397114992141724, + "learning_rate": 0.0001350730933746738, + "loss": 1.3648, + "step": 11291 + }, + { + "epoch": 0.4043905670850717, + "grad_norm": 1.6449841260910034, + "learning_rate": 0.00013506223093834844, + "loss": 1.5678, + "step": 11292 + }, + { + "epoch": 0.40442637921464, + "grad_norm": 1.9263081550598145, + "learning_rate": 0.000135051368030306, + "loss": 1.6234, + "step": 11293 + }, + { + "epoch": 0.4044621913442083, + "grad_norm": 1.5096728801727295, + "learning_rate": 0.00013504050465069263, + "loss": 1.7048, + "step": 11294 + }, + { + "epoch": 0.40449800347377657, + "grad_norm": 1.5539870262145996, + "learning_rate": 0.0001350296407996544, + "loss": 1.4478, + "step": 11295 + }, + { + "epoch": 0.40453381560334484, + "grad_norm": 1.7265819311141968, + "learning_rate": 0.0001350187764773376, + "loss": 1.5398, + "step": 11296 + }, + { + "epoch": 0.40456962773291316, + "grad_norm": 1.5002291202545166, + "learning_rate": 0.0001350079116838883, + "loss": 1.5066, + "step": 11297 + }, + { + "epoch": 0.40460543986248143, + "grad_norm": 1.3665101528167725, + "learning_rate": 0.0001349970464194527, + "loss": 1.2355, + "step": 11298 + }, + { + "epoch": 0.4046412519920497, + "grad_norm": 1.7081962823867798, + "learning_rate": 0.000134986180684177, + "loss": 1.4824, + "step": 11299 + }, + { + "epoch": 0.40467706412161797, + "grad_norm": 1.3695849180221558, + "learning_rate": 0.0001349753144782074, + "loss": 1.5157, + "step": 11300 + }, + { + "epoch": 0.4047128762511863, + "grad_norm": 2.080824613571167, + "learning_rate": 0.00013496444780169, + "loss": 1.5212, + "step": 11301 + }, + { + "epoch": 0.40474868838075456, + "grad_norm": 1.3417965173721313, + "learning_rate": 0.0001349535806547711, + "loss": 1.2956, + "step": 11302 + }, + { + "epoch": 0.40478450051032283, + "grad_norm": 1.4742838144302368, + "learning_rate": 0.00013494271303759686, + "loss": 1.7153, + "step": 11303 + }, + { + "epoch": 0.40482031263989116, + "grad_norm": 1.4015898704528809, + "learning_rate": 0.0001349318449503135, + "loss": 1.3194, + "step": 11304 + }, + { + "epoch": 0.4048561247694594, + "grad_norm": 1.504159688949585, + "learning_rate": 0.00013492097639306716, + "loss": 1.5072, + "step": 11305 + }, + { + "epoch": 0.4048919368990277, + "grad_norm": 1.7774419784545898, + "learning_rate": 0.00013491010736600418, + "loss": 1.7561, + "step": 11306 + }, + { + "epoch": 0.40492774902859596, + "grad_norm": 1.551797866821289, + "learning_rate": 0.0001348992378692707, + "loss": 1.5001, + "step": 11307 + }, + { + "epoch": 0.4049635611581643, + "grad_norm": 2.107363224029541, + "learning_rate": 0.000134888367903013, + "loss": 1.3526, + "step": 11308 + }, + { + "epoch": 0.40499937328773256, + "grad_norm": 3.087078332901001, + "learning_rate": 0.00013487749746737734, + "loss": 1.4878, + "step": 11309 + }, + { + "epoch": 0.4050351854173008, + "grad_norm": 1.5227560997009277, + "learning_rate": 0.0001348666265625099, + "loss": 1.4858, + "step": 11310 + }, + { + "epoch": 0.40507099754686915, + "grad_norm": 1.4322707653045654, + "learning_rate": 0.00013485575518855703, + "loss": 1.4656, + "step": 11311 + }, + { + "epoch": 0.4051068096764374, + "grad_norm": 2.5870959758758545, + "learning_rate": 0.00013484488334566488, + "loss": 1.466, + "step": 11312 + }, + { + "epoch": 0.4051426218060057, + "grad_norm": 1.6336684226989746, + "learning_rate": 0.00013483401103397982, + "loss": 1.5328, + "step": 11313 + }, + { + "epoch": 0.40517843393557396, + "grad_norm": 1.972943902015686, + "learning_rate": 0.00013482313825364804, + "loss": 1.3578, + "step": 11314 + }, + { + "epoch": 0.4052142460651423, + "grad_norm": 2.440329074859619, + "learning_rate": 0.00013481226500481588, + "loss": 1.7942, + "step": 11315 + }, + { + "epoch": 0.40525005819471055, + "grad_norm": 1.815645694732666, + "learning_rate": 0.00013480139128762956, + "loss": 1.5568, + "step": 11316 + }, + { + "epoch": 0.4052858703242788, + "grad_norm": 1.4573594331741333, + "learning_rate": 0.00013479051710223544, + "loss": 1.5361, + "step": 11317 + }, + { + "epoch": 0.40532168245384714, + "grad_norm": 1.3116105794906616, + "learning_rate": 0.00013477964244877977, + "loss": 1.5229, + "step": 11318 + }, + { + "epoch": 0.4053574945834154, + "grad_norm": 1.4079471826553345, + "learning_rate": 0.0001347687673274089, + "loss": 1.2103, + "step": 11319 + }, + { + "epoch": 0.4053933067129837, + "grad_norm": 1.2982499599456787, + "learning_rate": 0.00013475789173826908, + "loss": 1.5497, + "step": 11320 + }, + { + "epoch": 0.40542911884255195, + "grad_norm": 1.6635154485702515, + "learning_rate": 0.0001347470156815067, + "loss": 1.5242, + "step": 11321 + }, + { + "epoch": 0.4054649309721203, + "grad_norm": 1.7815632820129395, + "learning_rate": 0.000134736139157268, + "loss": 1.6833, + "step": 11322 + }, + { + "epoch": 0.40550074310168854, + "grad_norm": 1.7860198020935059, + "learning_rate": 0.0001347252621656994, + "loss": 1.5752, + "step": 11323 + }, + { + "epoch": 0.4055365552312568, + "grad_norm": 1.5376157760620117, + "learning_rate": 0.00013471438470694715, + "loss": 1.5881, + "step": 11324 + }, + { + "epoch": 0.40557236736082514, + "grad_norm": 2.517604112625122, + "learning_rate": 0.00013470350678115763, + "loss": 1.3554, + "step": 11325 + }, + { + "epoch": 0.4056081794903934, + "grad_norm": 1.7985392808914185, + "learning_rate": 0.00013469262838847724, + "loss": 1.3522, + "step": 11326 + }, + { + "epoch": 0.4056439916199617, + "grad_norm": 1.6951795816421509, + "learning_rate": 0.00013468174952905223, + "loss": 1.3, + "step": 11327 + }, + { + "epoch": 0.40567980374952994, + "grad_norm": 1.8560683727264404, + "learning_rate": 0.00013467087020302906, + "loss": 1.4333, + "step": 11328 + }, + { + "epoch": 0.40571561587909827, + "grad_norm": 1.9412565231323242, + "learning_rate": 0.00013465999041055405, + "loss": 1.4037, + "step": 11329 + }, + { + "epoch": 0.40575142800866654, + "grad_norm": 2.763273000717163, + "learning_rate": 0.00013464911015177356, + "loss": 1.3453, + "step": 11330 + }, + { + "epoch": 0.4057872401382348, + "grad_norm": 1.9450551271438599, + "learning_rate": 0.000134638229426834, + "loss": 1.6067, + "step": 11331 + }, + { + "epoch": 0.40582305226780313, + "grad_norm": 1.551149606704712, + "learning_rate": 0.0001346273482358817, + "loss": 1.5155, + "step": 11332 + }, + { + "epoch": 0.4058588643973714, + "grad_norm": 1.4368470907211304, + "learning_rate": 0.00013461646657906315, + "loss": 1.556, + "step": 11333 + }, + { + "epoch": 0.40589467652693967, + "grad_norm": 2.317528486251831, + "learning_rate": 0.00013460558445652467, + "loss": 1.9397, + "step": 11334 + }, + { + "epoch": 0.40593048865650794, + "grad_norm": 1.6196165084838867, + "learning_rate": 0.0001345947018684127, + "loss": 1.6216, + "step": 11335 + }, + { + "epoch": 0.40596630078607626, + "grad_norm": 1.3833155632019043, + "learning_rate": 0.00013458381881487362, + "loss": 1.4274, + "step": 11336 + }, + { + "epoch": 0.40600211291564453, + "grad_norm": 2.2270424365997314, + "learning_rate": 0.0001345729352960539, + "loss": 1.3553, + "step": 11337 + }, + { + "epoch": 0.4060379250452128, + "grad_norm": 1.5133336782455444, + "learning_rate": 0.00013456205131209988, + "loss": 1.5309, + "step": 11338 + }, + { + "epoch": 0.4060737371747811, + "grad_norm": 1.5618396997451782, + "learning_rate": 0.0001345511668631581, + "loss": 1.411, + "step": 11339 + }, + { + "epoch": 0.4061095493043494, + "grad_norm": 1.4137815237045288, + "learning_rate": 0.0001345402819493749, + "loss": 1.677, + "step": 11340 + }, + { + "epoch": 0.40614536143391766, + "grad_norm": 1.4480739831924438, + "learning_rate": 0.00013452939657089677, + "loss": 1.4027, + "step": 11341 + }, + { + "epoch": 0.40618117356348593, + "grad_norm": 1.5944091081619263, + "learning_rate": 0.00013451851072787013, + "loss": 1.2003, + "step": 11342 + }, + { + "epoch": 0.40621698569305426, + "grad_norm": 2.348677635192871, + "learning_rate": 0.00013450762442044148, + "loss": 1.3259, + "step": 11343 + }, + { + "epoch": 0.4062527978226225, + "grad_norm": 1.4781785011291504, + "learning_rate": 0.00013449673764875724, + "loss": 1.4524, + "step": 11344 + }, + { + "epoch": 0.4062886099521908, + "grad_norm": 1.2914907932281494, + "learning_rate": 0.00013448585041296392, + "loss": 1.4464, + "step": 11345 + }, + { + "epoch": 0.4063244220817591, + "grad_norm": 1.4890108108520508, + "learning_rate": 0.00013447496271320794, + "loss": 1.5749, + "step": 11346 + }, + { + "epoch": 0.4063602342113274, + "grad_norm": 1.6016771793365479, + "learning_rate": 0.00013446407454963582, + "loss": 1.6343, + "step": 11347 + }, + { + "epoch": 0.40639604634089566, + "grad_norm": 1.856371521949768, + "learning_rate": 0.00013445318592239405, + "loss": 1.3749, + "step": 11348 + }, + { + "epoch": 0.4064318584704639, + "grad_norm": 1.780447006225586, + "learning_rate": 0.00013444229683162904, + "loss": 1.5556, + "step": 11349 + }, + { + "epoch": 0.40646767060003225, + "grad_norm": 2.996824026107788, + "learning_rate": 0.00013443140727748738, + "loss": 1.5899, + "step": 11350 + }, + { + "epoch": 0.4065034827296005, + "grad_norm": 1.7526699304580688, + "learning_rate": 0.0001344205172601156, + "loss": 1.501, + "step": 11351 + }, + { + "epoch": 0.4065392948591688, + "grad_norm": 3.1774349212646484, + "learning_rate": 0.00013440962677966012, + "loss": 1.3241, + "step": 11352 + }, + { + "epoch": 0.4065751069887371, + "grad_norm": 1.467477798461914, + "learning_rate": 0.0001343987358362675, + "loss": 1.7437, + "step": 11353 + }, + { + "epoch": 0.4066109191183054, + "grad_norm": 2.0850508213043213, + "learning_rate": 0.00013438784443008426, + "loss": 1.3627, + "step": 11354 + }, + { + "epoch": 0.40664673124787365, + "grad_norm": 1.6988506317138672, + "learning_rate": 0.00013437695256125694, + "loss": 1.7532, + "step": 11355 + }, + { + "epoch": 0.4066825433774419, + "grad_norm": 2.0369551181793213, + "learning_rate": 0.00013436606022993207, + "loss": 1.561, + "step": 11356 + }, + { + "epoch": 0.40671835550701024, + "grad_norm": 1.7097434997558594, + "learning_rate": 0.00013435516743625617, + "loss": 1.8608, + "step": 11357 + }, + { + "epoch": 0.4067541676365785, + "grad_norm": 1.62166166305542, + "learning_rate": 0.0001343442741803758, + "loss": 1.4686, + "step": 11358 + }, + { + "epoch": 0.4067899797661468, + "grad_norm": 1.4330084323883057, + "learning_rate": 0.00013433338046243753, + "loss": 1.5757, + "step": 11359 + }, + { + "epoch": 0.4068257918957151, + "grad_norm": 1.9137028455734253, + "learning_rate": 0.0001343224862825879, + "loss": 1.4864, + "step": 11360 + }, + { + "epoch": 0.4068616040252834, + "grad_norm": 1.4365180730819702, + "learning_rate": 0.00013431159164097354, + "loss": 1.6628, + "step": 11361 + }, + { + "epoch": 0.40689741615485164, + "grad_norm": 1.4980610609054565, + "learning_rate": 0.0001343006965377409, + "loss": 1.1436, + "step": 11362 + }, + { + "epoch": 0.4069332282844199, + "grad_norm": 2.000791549682617, + "learning_rate": 0.00013428980097303668, + "loss": 1.6187, + "step": 11363 + }, + { + "epoch": 0.40696904041398824, + "grad_norm": 1.9143515825271606, + "learning_rate": 0.0001342789049470074, + "loss": 1.6053, + "step": 11364 + }, + { + "epoch": 0.4070048525435565, + "grad_norm": 1.4883040189743042, + "learning_rate": 0.0001342680084597997, + "loss": 1.3271, + "step": 11365 + }, + { + "epoch": 0.4070406646731248, + "grad_norm": 1.6826872825622559, + "learning_rate": 0.00013425711151156014, + "loss": 1.7451, + "step": 11366 + }, + { + "epoch": 0.4070764768026931, + "grad_norm": 1.5650379657745361, + "learning_rate": 0.00013424621410243533, + "loss": 1.5927, + "step": 11367 + }, + { + "epoch": 0.40711228893226137, + "grad_norm": 2.28118634223938, + "learning_rate": 0.00013423531623257189, + "loss": 1.3887, + "step": 11368 + }, + { + "epoch": 0.40714810106182964, + "grad_norm": 1.7174426317214966, + "learning_rate": 0.0001342244179021164, + "loss": 1.7299, + "step": 11369 + }, + { + "epoch": 0.4071839131913979, + "grad_norm": 2.240532875061035, + "learning_rate": 0.00013421351911121554, + "loss": 1.5032, + "step": 11370 + }, + { + "epoch": 0.40721972532096623, + "grad_norm": 1.6691335439682007, + "learning_rate": 0.00013420261986001587, + "loss": 1.6486, + "step": 11371 + }, + { + "epoch": 0.4072555374505345, + "grad_norm": 2.2593131065368652, + "learning_rate": 0.00013419172014866412, + "loss": 1.4365, + "step": 11372 + }, + { + "epoch": 0.40729134958010277, + "grad_norm": 2.0483601093292236, + "learning_rate": 0.00013418081997730686, + "loss": 1.6662, + "step": 11373 + }, + { + "epoch": 0.4073271617096711, + "grad_norm": 1.5782848596572876, + "learning_rate": 0.00013416991934609075, + "loss": 1.5193, + "step": 11374 + }, + { + "epoch": 0.40736297383923936, + "grad_norm": 1.8060847520828247, + "learning_rate": 0.00013415901825516248, + "loss": 1.5458, + "step": 11375 + }, + { + "epoch": 0.40739878596880763, + "grad_norm": 1.808779001235962, + "learning_rate": 0.00013414811670466864, + "loss": 1.6425, + "step": 11376 + }, + { + "epoch": 0.4074345980983759, + "grad_norm": 1.4207754135131836, + "learning_rate": 0.00013413721469475597, + "loss": 1.4269, + "step": 11377 + }, + { + "epoch": 0.4074704102279442, + "grad_norm": 1.7335152626037598, + "learning_rate": 0.00013412631222557112, + "loss": 1.5142, + "step": 11378 + }, + { + "epoch": 0.4075062223575125, + "grad_norm": 1.4673773050308228, + "learning_rate": 0.00013411540929726072, + "loss": 1.406, + "step": 11379 + }, + { + "epoch": 0.40754203448708076, + "grad_norm": 1.2677370309829712, + "learning_rate": 0.0001341045059099715, + "loss": 1.6056, + "step": 11380 + }, + { + "epoch": 0.4075778466166491, + "grad_norm": 2.6489250659942627, + "learning_rate": 0.00013409360206385017, + "loss": 1.6785, + "step": 11381 + }, + { + "epoch": 0.40761365874621736, + "grad_norm": 1.6678003072738647, + "learning_rate": 0.00013408269775904338, + "loss": 1.64, + "step": 11382 + }, + { + "epoch": 0.4076494708757856, + "grad_norm": 1.7335892915725708, + "learning_rate": 0.00013407179299569787, + "loss": 1.405, + "step": 11383 + }, + { + "epoch": 0.4076852830053539, + "grad_norm": 1.8726739883422852, + "learning_rate": 0.00013406088777396033, + "loss": 1.3341, + "step": 11384 + }, + { + "epoch": 0.4077210951349222, + "grad_norm": 1.368068814277649, + "learning_rate": 0.00013404998209397748, + "loss": 1.5909, + "step": 11385 + }, + { + "epoch": 0.4077569072644905, + "grad_norm": 1.7381190061569214, + "learning_rate": 0.00013403907595589605, + "loss": 1.5986, + "step": 11386 + }, + { + "epoch": 0.40779271939405876, + "grad_norm": 1.8664449453353882, + "learning_rate": 0.0001340281693598627, + "loss": 1.5861, + "step": 11387 + }, + { + "epoch": 0.407828531523627, + "grad_norm": 1.7823396921157837, + "learning_rate": 0.0001340172623060243, + "loss": 1.3708, + "step": 11388 + }, + { + "epoch": 0.40786434365319535, + "grad_norm": 2.0772182941436768, + "learning_rate": 0.0001340063547945275, + "loss": 1.5313, + "step": 11389 + }, + { + "epoch": 0.4079001557827636, + "grad_norm": 1.9356260299682617, + "learning_rate": 0.00013399544682551903, + "loss": 1.5213, + "step": 11390 + }, + { + "epoch": 0.4079359679123319, + "grad_norm": 1.6405465602874756, + "learning_rate": 0.00013398453839914574, + "loss": 1.6985, + "step": 11391 + }, + { + "epoch": 0.4079717800419002, + "grad_norm": 2.4301764965057373, + "learning_rate": 0.00013397362951555425, + "loss": 1.3963, + "step": 11392 + }, + { + "epoch": 0.4080075921714685, + "grad_norm": 1.3280025720596313, + "learning_rate": 0.00013396272017489143, + "loss": 1.4303, + "step": 11393 + }, + { + "epoch": 0.40804340430103675, + "grad_norm": 2.0638201236724854, + "learning_rate": 0.000133951810377304, + "loss": 1.6184, + "step": 11394 + }, + { + "epoch": 0.408079216430605, + "grad_norm": 1.1395586729049683, + "learning_rate": 0.00013394090012293879, + "loss": 1.673, + "step": 11395 + }, + { + "epoch": 0.40811502856017334, + "grad_norm": 1.4466770887374878, + "learning_rate": 0.0001339299894119425, + "loss": 1.6637, + "step": 11396 + }, + { + "epoch": 0.4081508406897416, + "grad_norm": 2.1084930896759033, + "learning_rate": 0.00013391907824446202, + "loss": 1.53, + "step": 11397 + }, + { + "epoch": 0.4081866528193099, + "grad_norm": 1.475360631942749, + "learning_rate": 0.00013390816662064406, + "loss": 1.5123, + "step": 11398 + }, + { + "epoch": 0.4082224649488782, + "grad_norm": 1.497314691543579, + "learning_rate": 0.00013389725454063549, + "loss": 1.4362, + "step": 11399 + }, + { + "epoch": 0.4082582770784465, + "grad_norm": 2.5741963386535645, + "learning_rate": 0.00013388634200458305, + "loss": 1.6763, + "step": 11400 + }, + { + "epoch": 0.40829408920801474, + "grad_norm": 2.4950594902038574, + "learning_rate": 0.00013387542901263362, + "loss": 1.5976, + "step": 11401 + }, + { + "epoch": 0.408329901337583, + "grad_norm": 1.6021647453308105, + "learning_rate": 0.00013386451556493396, + "loss": 1.8064, + "step": 11402 + }, + { + "epoch": 0.40836571346715134, + "grad_norm": 1.5093995332717896, + "learning_rate": 0.00013385360166163094, + "loss": 1.6469, + "step": 11403 + }, + { + "epoch": 0.4084015255967196, + "grad_norm": 1.5861656665802002, + "learning_rate": 0.00013384268730287136, + "loss": 1.3995, + "step": 11404 + }, + { + "epoch": 0.4084373377262879, + "grad_norm": 1.5600558519363403, + "learning_rate": 0.0001338317724888021, + "loss": 1.1743, + "step": 11405 + }, + { + "epoch": 0.4084731498558562, + "grad_norm": 1.8158670663833618, + "learning_rate": 0.00013382085721956997, + "loss": 1.6269, + "step": 11406 + }, + { + "epoch": 0.40850896198542447, + "grad_norm": 1.5254065990447998, + "learning_rate": 0.00013380994149532181, + "loss": 1.3899, + "step": 11407 + }, + { + "epoch": 0.40854477411499274, + "grad_norm": 1.4239981174468994, + "learning_rate": 0.00013379902531620455, + "loss": 1.6413, + "step": 11408 + }, + { + "epoch": 0.408580586244561, + "grad_norm": 2.2646539211273193, + "learning_rate": 0.00013378810868236497, + "loss": 1.7264, + "step": 11409 + }, + { + "epoch": 0.40861639837412933, + "grad_norm": 1.6422070264816284, + "learning_rate": 0.00013377719159394998, + "loss": 1.5903, + "step": 11410 + }, + { + "epoch": 0.4086522105036976, + "grad_norm": 1.7455012798309326, + "learning_rate": 0.00013376627405110644, + "loss": 1.4329, + "step": 11411 + }, + { + "epoch": 0.40868802263326587, + "grad_norm": 1.596226453781128, + "learning_rate": 0.00013375535605398127, + "loss": 1.5022, + "step": 11412 + }, + { + "epoch": 0.4087238347628342, + "grad_norm": 1.9514662027359009, + "learning_rate": 0.00013374443760272127, + "loss": 1.3195, + "step": 11413 + }, + { + "epoch": 0.40875964689240246, + "grad_norm": 1.9477596282958984, + "learning_rate": 0.0001337335186974734, + "loss": 1.605, + "step": 11414 + }, + { + "epoch": 0.40879545902197073, + "grad_norm": 1.2788459062576294, + "learning_rate": 0.00013372259933838458, + "loss": 1.4258, + "step": 11415 + }, + { + "epoch": 0.408831271151539, + "grad_norm": 2.610154628753662, + "learning_rate": 0.00013371167952560168, + "loss": 1.2783, + "step": 11416 + }, + { + "epoch": 0.4088670832811073, + "grad_norm": 2.161350727081299, + "learning_rate": 0.00013370075925927158, + "loss": 1.4277, + "step": 11417 + }, + { + "epoch": 0.4089028954106756, + "grad_norm": 1.61553156375885, + "learning_rate": 0.00013368983853954126, + "loss": 1.5061, + "step": 11418 + }, + { + "epoch": 0.40893870754024386, + "grad_norm": 1.382102370262146, + "learning_rate": 0.00013367891736655764, + "loss": 1.4812, + "step": 11419 + }, + { + "epoch": 0.4089745196698122, + "grad_norm": 1.7254366874694824, + "learning_rate": 0.0001336679957404676, + "loss": 1.6011, + "step": 11420 + }, + { + "epoch": 0.40901033179938046, + "grad_norm": 1.3416959047317505, + "learning_rate": 0.00013365707366141814, + "loss": 1.6771, + "step": 11421 + }, + { + "epoch": 0.4090461439289487, + "grad_norm": 1.6707831621170044, + "learning_rate": 0.00013364615112955612, + "loss": 1.5072, + "step": 11422 + }, + { + "epoch": 0.409081956058517, + "grad_norm": 2.198908567428589, + "learning_rate": 0.0001336352281450286, + "loss": 1.7351, + "step": 11423 + }, + { + "epoch": 0.4091177681880853, + "grad_norm": 1.4816632270812988, + "learning_rate": 0.0001336243047079824, + "loss": 1.5977, + "step": 11424 + }, + { + "epoch": 0.4091535803176536, + "grad_norm": 1.7870755195617676, + "learning_rate": 0.00013361338081856457, + "loss": 1.6438, + "step": 11425 + }, + { + "epoch": 0.40918939244722186, + "grad_norm": 1.7458114624023438, + "learning_rate": 0.0001336024564769221, + "loss": 1.4006, + "step": 11426 + }, + { + "epoch": 0.4092252045767902, + "grad_norm": 1.7427453994750977, + "learning_rate": 0.00013359153168320188, + "loss": 1.3585, + "step": 11427 + }, + { + "epoch": 0.40926101670635845, + "grad_norm": 1.8659039735794067, + "learning_rate": 0.00013358060643755098, + "loss": 1.67, + "step": 11428 + }, + { + "epoch": 0.4092968288359267, + "grad_norm": 1.4073402881622314, + "learning_rate": 0.00013356968074011626, + "loss": 1.4902, + "step": 11429 + }, + { + "epoch": 0.409332640965495, + "grad_norm": 1.6928961277008057, + "learning_rate": 0.00013355875459104485, + "loss": 1.6261, + "step": 11430 + }, + { + "epoch": 0.4093684530950633, + "grad_norm": 1.7611867189407349, + "learning_rate": 0.00013354782799048366, + "loss": 1.3424, + "step": 11431 + }, + { + "epoch": 0.4094042652246316, + "grad_norm": 1.9710921049118042, + "learning_rate": 0.00013353690093857972, + "loss": 1.4281, + "step": 11432 + }, + { + "epoch": 0.40944007735419985, + "grad_norm": 1.504773497581482, + "learning_rate": 0.00013352597343548004, + "loss": 1.4062, + "step": 11433 + }, + { + "epoch": 0.4094758894837682, + "grad_norm": 1.5385361909866333, + "learning_rate": 0.00013351504548133166, + "loss": 1.7332, + "step": 11434 + }, + { + "epoch": 0.40951170161333644, + "grad_norm": 2.2584147453308105, + "learning_rate": 0.00013350411707628153, + "loss": 1.75, + "step": 11435 + }, + { + "epoch": 0.4095475137429047, + "grad_norm": 1.3944424390792847, + "learning_rate": 0.00013349318822047674, + "loss": 1.35, + "step": 11436 + }, + { + "epoch": 0.409583325872473, + "grad_norm": 1.3100533485412598, + "learning_rate": 0.00013348225891406432, + "loss": 1.429, + "step": 11437 + }, + { + "epoch": 0.4096191380020413, + "grad_norm": 2.255528688430786, + "learning_rate": 0.00013347132915719127, + "loss": 1.749, + "step": 11438 + }, + { + "epoch": 0.4096549501316096, + "grad_norm": 2.328932523727417, + "learning_rate": 0.0001334603989500047, + "loss": 1.3934, + "step": 11439 + }, + { + "epoch": 0.40969076226117784, + "grad_norm": 1.5605809688568115, + "learning_rate": 0.00013344946829265157, + "loss": 1.5377, + "step": 11440 + }, + { + "epoch": 0.40972657439074617, + "grad_norm": 2.7059600353240967, + "learning_rate": 0.000133438537185279, + "loss": 1.8359, + "step": 11441 + }, + { + "epoch": 0.40976238652031444, + "grad_norm": 1.4177329540252686, + "learning_rate": 0.00013342760562803406, + "loss": 1.5699, + "step": 11442 + }, + { + "epoch": 0.4097981986498827, + "grad_norm": 2.09751296043396, + "learning_rate": 0.0001334166736210638, + "loss": 1.5161, + "step": 11443 + }, + { + "epoch": 0.409834010779451, + "grad_norm": 1.62708580493927, + "learning_rate": 0.00013340574116451533, + "loss": 1.3609, + "step": 11444 + }, + { + "epoch": 0.4098698229090193, + "grad_norm": 1.794075608253479, + "learning_rate": 0.0001333948082585357, + "loss": 1.4662, + "step": 11445 + }, + { + "epoch": 0.40990563503858757, + "grad_norm": 1.4780937433242798, + "learning_rate": 0.00013338387490327195, + "loss": 1.3648, + "step": 11446 + }, + { + "epoch": 0.40994144716815584, + "grad_norm": 1.2816733121871948, + "learning_rate": 0.00013337294109887123, + "loss": 1.3649, + "step": 11447 + }, + { + "epoch": 0.40997725929772416, + "grad_norm": 1.4756011962890625, + "learning_rate": 0.0001333620068454807, + "loss": 1.4781, + "step": 11448 + }, + { + "epoch": 0.41001307142729243, + "grad_norm": 1.638892412185669, + "learning_rate": 0.00013335107214324733, + "loss": 1.7207, + "step": 11449 + }, + { + "epoch": 0.4100488835568607, + "grad_norm": 1.8446863889694214, + "learning_rate": 0.00013334013699231836, + "loss": 1.3898, + "step": 11450 + }, + { + "epoch": 0.41008469568642897, + "grad_norm": 1.9317458868026733, + "learning_rate": 0.0001333292013928408, + "loss": 1.6734, + "step": 11451 + }, + { + "epoch": 0.4101205078159973, + "grad_norm": 1.8174721002578735, + "learning_rate": 0.00013331826534496188, + "loss": 1.6215, + "step": 11452 + }, + { + "epoch": 0.41015631994556556, + "grad_norm": 1.832736849784851, + "learning_rate": 0.00013330732884882866, + "loss": 1.4646, + "step": 11453 + }, + { + "epoch": 0.41019213207513383, + "grad_norm": 2.605854034423828, + "learning_rate": 0.0001332963919045883, + "loss": 1.5914, + "step": 11454 + }, + { + "epoch": 0.41022794420470216, + "grad_norm": 2.3421616554260254, + "learning_rate": 0.00013328545451238793, + "loss": 1.5082, + "step": 11455 + }, + { + "epoch": 0.4102637563342704, + "grad_norm": 1.8743304014205933, + "learning_rate": 0.00013327451667237468, + "loss": 1.3477, + "step": 11456 + }, + { + "epoch": 0.4102995684638387, + "grad_norm": 1.307234764099121, + "learning_rate": 0.00013326357838469574, + "loss": 1.5954, + "step": 11457 + }, + { + "epoch": 0.41033538059340696, + "grad_norm": 1.5379959344863892, + "learning_rate": 0.0001332526396494983, + "loss": 1.4679, + "step": 11458 + }, + { + "epoch": 0.4103711927229753, + "grad_norm": 1.7482596635818481, + "learning_rate": 0.00013324170046692942, + "loss": 1.5744, + "step": 11459 + }, + { + "epoch": 0.41040700485254356, + "grad_norm": 1.8453181982040405, + "learning_rate": 0.00013323076083713637, + "loss": 1.4618, + "step": 11460 + }, + { + "epoch": 0.4104428169821118, + "grad_norm": 2.0559778213500977, + "learning_rate": 0.00013321982076026632, + "loss": 1.632, + "step": 11461 + }, + { + "epoch": 0.41047862911168015, + "grad_norm": 1.8249529600143433, + "learning_rate": 0.0001332088802364664, + "loss": 1.6765, + "step": 11462 + }, + { + "epoch": 0.4105144412412484, + "grad_norm": 1.922978162765503, + "learning_rate": 0.00013319793926588387, + "loss": 1.1515, + "step": 11463 + }, + { + "epoch": 0.4105502533708167, + "grad_norm": 1.7666670083999634, + "learning_rate": 0.00013318699784866585, + "loss": 1.6148, + "step": 11464 + }, + { + "epoch": 0.41058606550038496, + "grad_norm": 1.6551225185394287, + "learning_rate": 0.0001331760559849596, + "loss": 1.2441, + "step": 11465 + }, + { + "epoch": 0.4106218776299533, + "grad_norm": 1.6817725896835327, + "learning_rate": 0.0001331651136749123, + "loss": 1.3188, + "step": 11466 + }, + { + "epoch": 0.41065768975952155, + "grad_norm": 1.4896317720413208, + "learning_rate": 0.0001331541709186712, + "loss": 1.3486, + "step": 11467 + }, + { + "epoch": 0.4106935018890898, + "grad_norm": 1.92238450050354, + "learning_rate": 0.00013314322771638346, + "loss": 1.8901, + "step": 11468 + }, + { + "epoch": 0.41072931401865814, + "grad_norm": 1.4980189800262451, + "learning_rate": 0.00013313228406819637, + "loss": 1.5268, + "step": 11469 + }, + { + "epoch": 0.4107651261482264, + "grad_norm": 1.5852466821670532, + "learning_rate": 0.00013312133997425712, + "loss": 1.697, + "step": 11470 + }, + { + "epoch": 0.4108009382777947, + "grad_norm": 1.6819641590118408, + "learning_rate": 0.00013311039543471297, + "loss": 1.4556, + "step": 11471 + }, + { + "epoch": 0.41083675040736295, + "grad_norm": 1.6359261274337769, + "learning_rate": 0.00013309945044971116, + "loss": 1.2987, + "step": 11472 + }, + { + "epoch": 0.4108725625369313, + "grad_norm": 1.8399405479431152, + "learning_rate": 0.00013308850501939892, + "loss": 1.6601, + "step": 11473 + }, + { + "epoch": 0.41090837466649954, + "grad_norm": 1.4478241205215454, + "learning_rate": 0.00013307755914392357, + "loss": 1.2634, + "step": 11474 + }, + { + "epoch": 0.4109441867960678, + "grad_norm": 1.807931661605835, + "learning_rate": 0.0001330666128234323, + "loss": 1.7326, + "step": 11475 + }, + { + "epoch": 0.41097999892563614, + "grad_norm": 1.5771223306655884, + "learning_rate": 0.0001330556660580724, + "loss": 1.25, + "step": 11476 + }, + { + "epoch": 0.4110158110552044, + "grad_norm": 1.909822702407837, + "learning_rate": 0.00013304471884799116, + "loss": 1.7395, + "step": 11477 + }, + { + "epoch": 0.4110516231847727, + "grad_norm": 1.6032434701919556, + "learning_rate": 0.00013303377119333587, + "loss": 1.6306, + "step": 11478 + }, + { + "epoch": 0.41108743531434094, + "grad_norm": 1.5883413553237915, + "learning_rate": 0.0001330228230942538, + "loss": 1.2959, + "step": 11479 + }, + { + "epoch": 0.41112324744390927, + "grad_norm": 1.9156928062438965, + "learning_rate": 0.00013301187455089223, + "loss": 1.936, + "step": 11480 + }, + { + "epoch": 0.41115905957347754, + "grad_norm": 3.1075665950775146, + "learning_rate": 0.00013300092556339847, + "loss": 1.8545, + "step": 11481 + }, + { + "epoch": 0.4111948717030458, + "grad_norm": 1.515472412109375, + "learning_rate": 0.00013298997613191978, + "loss": 1.5118, + "step": 11482 + }, + { + "epoch": 0.41123068383261413, + "grad_norm": 1.5612744092941284, + "learning_rate": 0.00013297902625660358, + "loss": 1.3228, + "step": 11483 + }, + { + "epoch": 0.4112664959621824, + "grad_norm": 2.1935417652130127, + "learning_rate": 0.00013296807593759708, + "loss": 1.5941, + "step": 11484 + }, + { + "epoch": 0.41130230809175067, + "grad_norm": 1.5191771984100342, + "learning_rate": 0.0001329571251750477, + "loss": 1.5533, + "step": 11485 + }, + { + "epoch": 0.41133812022131894, + "grad_norm": 1.5615873336791992, + "learning_rate": 0.00013294617396910266, + "loss": 1.7188, + "step": 11486 + }, + { + "epoch": 0.41137393235088726, + "grad_norm": 1.4643330574035645, + "learning_rate": 0.00013293522231990935, + "loss": 1.582, + "step": 11487 + }, + { + "epoch": 0.41140974448045553, + "grad_norm": 2.06705379486084, + "learning_rate": 0.00013292427022761514, + "loss": 1.5501, + "step": 11488 + }, + { + "epoch": 0.4114455566100238, + "grad_norm": 1.5073274374008179, + "learning_rate": 0.0001329133176923673, + "loss": 1.5135, + "step": 11489 + }, + { + "epoch": 0.4114813687395921, + "grad_norm": 1.912886381149292, + "learning_rate": 0.00013290236471431326, + "loss": 1.4496, + "step": 11490 + }, + { + "epoch": 0.4115171808691604, + "grad_norm": 1.8361574411392212, + "learning_rate": 0.00013289141129360033, + "loss": 1.6251, + "step": 11491 + }, + { + "epoch": 0.41155299299872866, + "grad_norm": 2.233924388885498, + "learning_rate": 0.0001328804574303759, + "loss": 1.8372, + "step": 11492 + }, + { + "epoch": 0.41158880512829693, + "grad_norm": 1.716523289680481, + "learning_rate": 0.0001328695031247873, + "loss": 1.6532, + "step": 11493 + }, + { + "epoch": 0.41162461725786526, + "grad_norm": 1.9167834520339966, + "learning_rate": 0.00013285854837698195, + "loss": 1.5691, + "step": 11494 + }, + { + "epoch": 0.4116604293874335, + "grad_norm": 1.637646198272705, + "learning_rate": 0.0001328475931871072, + "loss": 1.5658, + "step": 11495 + }, + { + "epoch": 0.4116962415170018, + "grad_norm": 1.4613744020462036, + "learning_rate": 0.0001328366375553105, + "loss": 1.2251, + "step": 11496 + }, + { + "epoch": 0.4117320536465701, + "grad_norm": 2.3519818782806396, + "learning_rate": 0.00013282568148173917, + "loss": 1.4193, + "step": 11497 + }, + { + "epoch": 0.4117678657761384, + "grad_norm": 1.911476731300354, + "learning_rate": 0.00013281472496654064, + "loss": 1.3368, + "step": 11498 + }, + { + "epoch": 0.41180367790570666, + "grad_norm": 1.8679819107055664, + "learning_rate": 0.0001328037680098623, + "loss": 1.4186, + "step": 11499 + }, + { + "epoch": 0.4118394900352749, + "grad_norm": 1.6194400787353516, + "learning_rate": 0.00013279281061185158, + "loss": 1.4609, + "step": 11500 + }, + { + "epoch": 0.41187530216484325, + "grad_norm": 1.498268961906433, + "learning_rate": 0.0001327818527726559, + "loss": 1.4629, + "step": 11501 + }, + { + "epoch": 0.4119111142944115, + "grad_norm": 1.735053300857544, + "learning_rate": 0.00013277089449242267, + "loss": 1.3692, + "step": 11502 + }, + { + "epoch": 0.4119469264239798, + "grad_norm": 1.634087085723877, + "learning_rate": 0.00013275993577129932, + "loss": 1.7921, + "step": 11503 + }, + { + "epoch": 0.4119827385535481, + "grad_norm": 2.1054511070251465, + "learning_rate": 0.0001327489766094333, + "loss": 1.3704, + "step": 11504 + }, + { + "epoch": 0.4120185506831164, + "grad_norm": 1.6996413469314575, + "learning_rate": 0.00013273801700697206, + "loss": 1.7709, + "step": 11505 + }, + { + "epoch": 0.41205436281268465, + "grad_norm": 1.709455966949463, + "learning_rate": 0.00013272705696406302, + "loss": 1.5758, + "step": 11506 + }, + { + "epoch": 0.4120901749422529, + "grad_norm": 1.550470232963562, + "learning_rate": 0.00013271609648085367, + "loss": 1.5027, + "step": 11507 + }, + { + "epoch": 0.41212598707182124, + "grad_norm": 1.6597813367843628, + "learning_rate": 0.0001327051355574914, + "loss": 1.7267, + "step": 11508 + }, + { + "epoch": 0.4121617992013895, + "grad_norm": 1.2792216539382935, + "learning_rate": 0.0001326941741941237, + "loss": 1.3561, + "step": 11509 + }, + { + "epoch": 0.4121976113309578, + "grad_norm": 1.414903163909912, + "learning_rate": 0.00013268321239089809, + "loss": 1.5848, + "step": 11510 + }, + { + "epoch": 0.4122334234605261, + "grad_norm": 1.810530185699463, + "learning_rate": 0.00013267225014796202, + "loss": 1.4441, + "step": 11511 + }, + { + "epoch": 0.4122692355900944, + "grad_norm": 1.5842233896255493, + "learning_rate": 0.00013266128746546296, + "loss": 1.1982, + "step": 11512 + }, + { + "epoch": 0.41230504771966264, + "grad_norm": 1.3252923488616943, + "learning_rate": 0.0001326503243435484, + "loss": 1.4864, + "step": 11513 + }, + { + "epoch": 0.4123408598492309, + "grad_norm": 1.435280203819275, + "learning_rate": 0.00013263936078236586, + "loss": 1.2947, + "step": 11514 + }, + { + "epoch": 0.41237667197879924, + "grad_norm": 2.3701577186584473, + "learning_rate": 0.00013262839678206283, + "loss": 1.7331, + "step": 11515 + }, + { + "epoch": 0.4124124841083675, + "grad_norm": 1.5976884365081787, + "learning_rate": 0.00013261743234278678, + "loss": 1.4953, + "step": 11516 + }, + { + "epoch": 0.4124482962379358, + "grad_norm": 1.6659822463989258, + "learning_rate": 0.00013260646746468527, + "loss": 1.1783, + "step": 11517 + }, + { + "epoch": 0.4124841083675041, + "grad_norm": 1.7905832529067993, + "learning_rate": 0.0001325955021479058, + "loss": 1.5895, + "step": 11518 + }, + { + "epoch": 0.41251992049707237, + "grad_norm": 1.614429235458374, + "learning_rate": 0.00013258453639259586, + "loss": 1.3956, + "step": 11519 + }, + { + "epoch": 0.41255573262664064, + "grad_norm": 1.761165738105774, + "learning_rate": 0.00013257357019890307, + "loss": 1.3419, + "step": 11520 + }, + { + "epoch": 0.4125915447562089, + "grad_norm": 2.056187868118286, + "learning_rate": 0.00013256260356697485, + "loss": 1.4939, + "step": 11521 + }, + { + "epoch": 0.41262735688577723, + "grad_norm": 1.6465550661087036, + "learning_rate": 0.00013255163649695886, + "loss": 1.6814, + "step": 11522 + }, + { + "epoch": 0.4126631690153455, + "grad_norm": 2.2140185832977295, + "learning_rate": 0.00013254066898900257, + "loss": 1.5524, + "step": 11523 + }, + { + "epoch": 0.41269898114491377, + "grad_norm": 1.8904930353164673, + "learning_rate": 0.00013252970104325352, + "loss": 1.4541, + "step": 11524 + }, + { + "epoch": 0.4127347932744821, + "grad_norm": 1.2835383415222168, + "learning_rate": 0.00013251873265985936, + "loss": 1.469, + "step": 11525 + }, + { + "epoch": 0.41277060540405036, + "grad_norm": 1.3030658960342407, + "learning_rate": 0.00013250776383896752, + "loss": 1.3568, + "step": 11526 + }, + { + "epoch": 0.41280641753361863, + "grad_norm": 1.3432520627975464, + "learning_rate": 0.00013249679458072572, + "loss": 1.4884, + "step": 11527 + }, + { + "epoch": 0.4128422296631869, + "grad_norm": 2.3544211387634277, + "learning_rate": 0.00013248582488528142, + "loss": 1.2597, + "step": 11528 + }, + { + "epoch": 0.4128780417927552, + "grad_norm": 1.6606460809707642, + "learning_rate": 0.0001324748547527823, + "loss": 1.4726, + "step": 11529 + }, + { + "epoch": 0.4129138539223235, + "grad_norm": 1.4075069427490234, + "learning_rate": 0.00013246388418337586, + "loss": 1.2732, + "step": 11530 + }, + { + "epoch": 0.41294966605189176, + "grad_norm": 1.6073061227798462, + "learning_rate": 0.00013245291317720974, + "loss": 1.3229, + "step": 11531 + }, + { + "epoch": 0.4129854781814601, + "grad_norm": 1.994727373123169, + "learning_rate": 0.00013244194173443155, + "loss": 1.3324, + "step": 11532 + }, + { + "epoch": 0.41302129031102836, + "grad_norm": 1.824912190437317, + "learning_rate": 0.00013243096985518887, + "loss": 1.3809, + "step": 11533 + }, + { + "epoch": 0.4130571024405966, + "grad_norm": 1.7901192903518677, + "learning_rate": 0.00013241999753962932, + "loss": 1.2446, + "step": 11534 + }, + { + "epoch": 0.4130929145701649, + "grad_norm": 1.5490052700042725, + "learning_rate": 0.00013240902478790052, + "loss": 1.373, + "step": 11535 + }, + { + "epoch": 0.4131287266997332, + "grad_norm": 1.581001877784729, + "learning_rate": 0.0001323980516001501, + "loss": 1.3801, + "step": 11536 + }, + { + "epoch": 0.4131645388293015, + "grad_norm": 2.0129377841949463, + "learning_rate": 0.00013238707797652569, + "loss": 1.6714, + "step": 11537 + }, + { + "epoch": 0.41320035095886976, + "grad_norm": 1.7160298824310303, + "learning_rate": 0.0001323761039171749, + "loss": 1.2577, + "step": 11538 + }, + { + "epoch": 0.4132361630884381, + "grad_norm": 1.5854284763336182, + "learning_rate": 0.00013236512942224545, + "loss": 1.5169, + "step": 11539 + }, + { + "epoch": 0.41327197521800635, + "grad_norm": 2.3109540939331055, + "learning_rate": 0.0001323541544918849, + "loss": 1.8822, + "step": 11540 + }, + { + "epoch": 0.4133077873475746, + "grad_norm": 2.023212194442749, + "learning_rate": 0.00013234317912624093, + "loss": 1.5002, + "step": 11541 + }, + { + "epoch": 0.4133435994771429, + "grad_norm": 2.173719644546509, + "learning_rate": 0.0001323322033254612, + "loss": 1.6894, + "step": 11542 + }, + { + "epoch": 0.4133794116067112, + "grad_norm": 1.548795223236084, + "learning_rate": 0.00013232122708969337, + "loss": 1.3606, + "step": 11543 + }, + { + "epoch": 0.4134152237362795, + "grad_norm": 1.749751329421997, + "learning_rate": 0.00013231025041908514, + "loss": 1.8365, + "step": 11544 + }, + { + "epoch": 0.41345103586584775, + "grad_norm": 1.4382846355438232, + "learning_rate": 0.00013229927331378418, + "loss": 1.3604, + "step": 11545 + }, + { + "epoch": 0.4134868479954161, + "grad_norm": 2.0954902172088623, + "learning_rate": 0.0001322882957739381, + "loss": 1.538, + "step": 11546 + }, + { + "epoch": 0.41352266012498434, + "grad_norm": 1.5251587629318237, + "learning_rate": 0.00013227731779969472, + "loss": 1.2223, + "step": 11547 + }, + { + "epoch": 0.4135584722545526, + "grad_norm": 2.0528721809387207, + "learning_rate": 0.00013226633939120164, + "loss": 1.2464, + "step": 11548 + }, + { + "epoch": 0.4135942843841209, + "grad_norm": 1.6846539974212646, + "learning_rate": 0.00013225536054860658, + "loss": 1.3667, + "step": 11549 + }, + { + "epoch": 0.4136300965136892, + "grad_norm": 2.08156418800354, + "learning_rate": 0.00013224438127205725, + "loss": 1.2521, + "step": 11550 + }, + { + "epoch": 0.4136659086432575, + "grad_norm": 1.3722084760665894, + "learning_rate": 0.0001322334015617014, + "loss": 1.5203, + "step": 11551 + }, + { + "epoch": 0.41370172077282574, + "grad_norm": 1.3709852695465088, + "learning_rate": 0.00013222242141768664, + "loss": 0.9214, + "step": 11552 + }, + { + "epoch": 0.41373753290239407, + "grad_norm": 1.7349940538406372, + "learning_rate": 0.00013221144084016082, + "loss": 1.5187, + "step": 11553 + }, + { + "epoch": 0.41377334503196234, + "grad_norm": 1.7656437158584595, + "learning_rate": 0.00013220045982927157, + "loss": 1.4667, + "step": 11554 + }, + { + "epoch": 0.4138091571615306, + "grad_norm": 1.6982886791229248, + "learning_rate": 0.00013218947838516672, + "loss": 1.5724, + "step": 11555 + }, + { + "epoch": 0.4138449692910989, + "grad_norm": 2.2578091621398926, + "learning_rate": 0.0001321784965079939, + "loss": 1.9695, + "step": 11556 + }, + { + "epoch": 0.4138807814206672, + "grad_norm": 1.5725754499435425, + "learning_rate": 0.00013216751419790096, + "loss": 1.4167, + "step": 11557 + }, + { + "epoch": 0.41391659355023547, + "grad_norm": 1.307800054550171, + "learning_rate": 0.00013215653145503558, + "loss": 1.4289, + "step": 11558 + }, + { + "epoch": 0.41395240567980374, + "grad_norm": 1.5263270139694214, + "learning_rate": 0.00013214554827954556, + "loss": 1.4657, + "step": 11559 + }, + { + "epoch": 0.41398821780937206, + "grad_norm": 1.9524672031402588, + "learning_rate": 0.00013213456467157868, + "loss": 1.4146, + "step": 11560 + }, + { + "epoch": 0.41402402993894033, + "grad_norm": 1.5733195543289185, + "learning_rate": 0.00013212358063128266, + "loss": 1.2555, + "step": 11561 + }, + { + "epoch": 0.4140598420685086, + "grad_norm": 1.4676419496536255, + "learning_rate": 0.0001321125961588053, + "loss": 1.4726, + "step": 11562 + }, + { + "epoch": 0.41409565419807687, + "grad_norm": 2.0057151317596436, + "learning_rate": 0.00013210161125429436, + "loss": 1.6748, + "step": 11563 + }, + { + "epoch": 0.4141314663276452, + "grad_norm": 1.7160898447036743, + "learning_rate": 0.0001320906259178977, + "loss": 1.6463, + "step": 11564 + }, + { + "epoch": 0.41416727845721346, + "grad_norm": 1.602569818496704, + "learning_rate": 0.00013207964014976299, + "loss": 1.6806, + "step": 11565 + }, + { + "epoch": 0.41420309058678173, + "grad_norm": 1.489131212234497, + "learning_rate": 0.00013206865395003816, + "loss": 1.2136, + "step": 11566 + }, + { + "epoch": 0.41423890271635005, + "grad_norm": 1.3555222749710083, + "learning_rate": 0.00013205766731887094, + "loss": 1.4043, + "step": 11567 + }, + { + "epoch": 0.4142747148459183, + "grad_norm": 2.4561402797698975, + "learning_rate": 0.00013204668025640915, + "loss": 1.5604, + "step": 11568 + }, + { + "epoch": 0.4143105269754866, + "grad_norm": 1.891719102859497, + "learning_rate": 0.00013203569276280062, + "loss": 1.7261, + "step": 11569 + }, + { + "epoch": 0.41434633910505486, + "grad_norm": 1.5649393796920776, + "learning_rate": 0.00013202470483819316, + "loss": 1.3593, + "step": 11570 + }, + { + "epoch": 0.4143821512346232, + "grad_norm": 1.5750397443771362, + "learning_rate": 0.00013201371648273463, + "loss": 1.5991, + "step": 11571 + }, + { + "epoch": 0.41441796336419146, + "grad_norm": 2.288243055343628, + "learning_rate": 0.00013200272769657283, + "loss": 1.5515, + "step": 11572 + }, + { + "epoch": 0.4144537754937597, + "grad_norm": 1.4258782863616943, + "learning_rate": 0.00013199173847985559, + "loss": 1.3312, + "step": 11573 + }, + { + "epoch": 0.41448958762332805, + "grad_norm": 1.5753511190414429, + "learning_rate": 0.0001319807488327308, + "loss": 1.4183, + "step": 11574 + }, + { + "epoch": 0.4145253997528963, + "grad_norm": 1.7554752826690674, + "learning_rate": 0.00013196975875534624, + "loss": 1.6035, + "step": 11575 + }, + { + "epoch": 0.4145612118824646, + "grad_norm": 2.1951706409454346, + "learning_rate": 0.00013195876824784988, + "loss": 1.4181, + "step": 11576 + }, + { + "epoch": 0.41459702401203286, + "grad_norm": 1.4915632009506226, + "learning_rate": 0.00013194777731038946, + "loss": 1.5221, + "step": 11577 + }, + { + "epoch": 0.4146328361416012, + "grad_norm": 1.97896409034729, + "learning_rate": 0.00013193678594311295, + "loss": 1.6528, + "step": 11578 + }, + { + "epoch": 0.41466864827116945, + "grad_norm": 1.9763433933258057, + "learning_rate": 0.00013192579414616815, + "loss": 1.324, + "step": 11579 + }, + { + "epoch": 0.4147044604007377, + "grad_norm": 1.8413565158843994, + "learning_rate": 0.000131914801919703, + "loss": 1.4303, + "step": 11580 + }, + { + "epoch": 0.41474027253030604, + "grad_norm": 1.5440016984939575, + "learning_rate": 0.0001319038092638653, + "loss": 1.5037, + "step": 11581 + }, + { + "epoch": 0.4147760846598743, + "grad_norm": 1.447457194328308, + "learning_rate": 0.00013189281617880308, + "loss": 1.3649, + "step": 11582 + }, + { + "epoch": 0.4148118967894426, + "grad_norm": 1.756569743156433, + "learning_rate": 0.0001318818226646641, + "loss": 1.4381, + "step": 11583 + }, + { + "epoch": 0.41484770891901085, + "grad_norm": 2.2443044185638428, + "learning_rate": 0.00013187082872159636, + "loss": 1.5342, + "step": 11584 + }, + { + "epoch": 0.4148835210485792, + "grad_norm": 1.3705558776855469, + "learning_rate": 0.0001318598343497477, + "loss": 1.2355, + "step": 11585 + }, + { + "epoch": 0.41491933317814744, + "grad_norm": 2.452343463897705, + "learning_rate": 0.00013184883954926607, + "loss": 1.2603, + "step": 11586 + }, + { + "epoch": 0.4149551453077157, + "grad_norm": 1.5595102310180664, + "learning_rate": 0.0001318378443202994, + "loss": 1.6226, + "step": 11587 + }, + { + "epoch": 0.414990957437284, + "grad_norm": 2.130498170852661, + "learning_rate": 0.00013182684866299557, + "loss": 1.63, + "step": 11588 + }, + { + "epoch": 0.4150267695668523, + "grad_norm": 1.8103166818618774, + "learning_rate": 0.00013181585257750257, + "loss": 1.4003, + "step": 11589 + }, + { + "epoch": 0.4150625816964206, + "grad_norm": 1.686850905418396, + "learning_rate": 0.0001318048560639683, + "loss": 1.3748, + "step": 11590 + }, + { + "epoch": 0.41509839382598884, + "grad_norm": 1.4466736316680908, + "learning_rate": 0.00013179385912254072, + "loss": 1.5537, + "step": 11591 + }, + { + "epoch": 0.41513420595555717, + "grad_norm": 1.7150828838348389, + "learning_rate": 0.00013178286175336777, + "loss": 1.1753, + "step": 11592 + }, + { + "epoch": 0.41517001808512544, + "grad_norm": 1.8393244743347168, + "learning_rate": 0.00013177186395659743, + "loss": 1.7963, + "step": 11593 + }, + { + "epoch": 0.4152058302146937, + "grad_norm": 1.8172906637191772, + "learning_rate": 0.00013176086573237766, + "loss": 1.5713, + "step": 11594 + }, + { + "epoch": 0.415241642344262, + "grad_norm": 1.7263849973678589, + "learning_rate": 0.0001317498670808564, + "loss": 1.6143, + "step": 11595 + }, + { + "epoch": 0.4152774544738303, + "grad_norm": 1.7366745471954346, + "learning_rate": 0.0001317388680021816, + "loss": 1.6609, + "step": 11596 + }, + { + "epoch": 0.41531326660339857, + "grad_norm": 1.5991458892822266, + "learning_rate": 0.00013172786849650133, + "loss": 1.6754, + "step": 11597 + }, + { + "epoch": 0.41534907873296684, + "grad_norm": 1.716823697090149, + "learning_rate": 0.00013171686856396344, + "loss": 1.608, + "step": 11598 + }, + { + "epoch": 0.41538489086253516, + "grad_norm": 1.7387841939926147, + "learning_rate": 0.00013170586820471605, + "loss": 1.3947, + "step": 11599 + }, + { + "epoch": 0.41542070299210343, + "grad_norm": 1.3563843965530396, + "learning_rate": 0.00013169486741890706, + "loss": 1.4469, + "step": 11600 + }, + { + "epoch": 0.4154565151216717, + "grad_norm": 1.6906555891036987, + "learning_rate": 0.0001316838662066845, + "loss": 1.3223, + "step": 11601 + }, + { + "epoch": 0.41549232725123997, + "grad_norm": 1.6035621166229248, + "learning_rate": 0.00013167286456819646, + "loss": 1.6543, + "step": 11602 + }, + { + "epoch": 0.4155281393808083, + "grad_norm": 1.3639655113220215, + "learning_rate": 0.00013166186250359086, + "loss": 1.6864, + "step": 11603 + }, + { + "epoch": 0.41556395151037656, + "grad_norm": 2.101548910140991, + "learning_rate": 0.00013165086001301575, + "loss": 1.5005, + "step": 11604 + }, + { + "epoch": 0.41559976363994483, + "grad_norm": 1.2370082139968872, + "learning_rate": 0.0001316398570966191, + "loss": 1.4163, + "step": 11605 + }, + { + "epoch": 0.41563557576951315, + "grad_norm": 1.4403676986694336, + "learning_rate": 0.000131628853754549, + "loss": 1.7009, + "step": 11606 + }, + { + "epoch": 0.4156713878990814, + "grad_norm": 1.7065383195877075, + "learning_rate": 0.00013161784998695349, + "loss": 1.6103, + "step": 11607 + }, + { + "epoch": 0.4157072000286497, + "grad_norm": 3.6323750019073486, + "learning_rate": 0.00013160684579398057, + "loss": 1.4806, + "step": 11608 + }, + { + "epoch": 0.41574301215821796, + "grad_norm": 1.4103000164031982, + "learning_rate": 0.00013159584117577831, + "loss": 1.3592, + "step": 11609 + }, + { + "epoch": 0.4157788242877863, + "grad_norm": 1.6261008977890015, + "learning_rate": 0.0001315848361324948, + "loss": 1.591, + "step": 11610 + }, + { + "epoch": 0.41581463641735456, + "grad_norm": 2.0343966484069824, + "learning_rate": 0.000131573830664278, + "loss": 1.6907, + "step": 11611 + }, + { + "epoch": 0.4158504485469228, + "grad_norm": 1.7296793460845947, + "learning_rate": 0.0001315628247712761, + "loss": 1.5331, + "step": 11612 + }, + { + "epoch": 0.41588626067649115, + "grad_norm": 1.7771193981170654, + "learning_rate": 0.0001315518184536371, + "loss": 1.5747, + "step": 11613 + }, + { + "epoch": 0.4159220728060594, + "grad_norm": 1.993929147720337, + "learning_rate": 0.00013154081171150902, + "loss": 1.7395, + "step": 11614 + }, + { + "epoch": 0.4159578849356277, + "grad_norm": 1.6930261850357056, + "learning_rate": 0.00013152980454504007, + "loss": 1.7027, + "step": 11615 + }, + { + "epoch": 0.41599369706519596, + "grad_norm": 2.1392862796783447, + "learning_rate": 0.00013151879695437823, + "loss": 2.0039, + "step": 11616 + }, + { + "epoch": 0.4160295091947643, + "grad_norm": 1.4961529970169067, + "learning_rate": 0.00013150778893967165, + "loss": 1.8718, + "step": 11617 + }, + { + "epoch": 0.41606532132433255, + "grad_norm": 2.090205430984497, + "learning_rate": 0.0001314967805010684, + "loss": 1.7351, + "step": 11618 + }, + { + "epoch": 0.4161011334539008, + "grad_norm": 1.9120638370513916, + "learning_rate": 0.0001314857716387166, + "loss": 1.6389, + "step": 11619 + }, + { + "epoch": 0.41613694558346914, + "grad_norm": 1.6917821168899536, + "learning_rate": 0.00013147476235276438, + "loss": 1.5383, + "step": 11620 + }, + { + "epoch": 0.4161727577130374, + "grad_norm": 1.3282452821731567, + "learning_rate": 0.00013146375264335978, + "loss": 1.4817, + "step": 11621 + }, + { + "epoch": 0.4162085698426057, + "grad_norm": 1.9482810497283936, + "learning_rate": 0.00013145274251065103, + "loss": 1.5597, + "step": 11622 + }, + { + "epoch": 0.41624438197217395, + "grad_norm": 1.5540615320205688, + "learning_rate": 0.00013144173195478616, + "loss": 1.5281, + "step": 11623 + }, + { + "epoch": 0.4162801941017423, + "grad_norm": 1.9348515272140503, + "learning_rate": 0.0001314307209759134, + "loss": 1.4764, + "step": 11624 + }, + { + "epoch": 0.41631600623131054, + "grad_norm": 2.240257978439331, + "learning_rate": 0.00013141970957418074, + "loss": 1.5697, + "step": 11625 + }, + { + "epoch": 0.4163518183608788, + "grad_norm": 1.4661803245544434, + "learning_rate": 0.0001314086977497365, + "loss": 1.2688, + "step": 11626 + }, + { + "epoch": 0.41638763049044714, + "grad_norm": 1.7400275468826294, + "learning_rate": 0.0001313976855027287, + "loss": 1.4783, + "step": 11627 + }, + { + "epoch": 0.4164234426200154, + "grad_norm": 1.930822730064392, + "learning_rate": 0.00013138667283330556, + "loss": 1.6808, + "step": 11628 + }, + { + "epoch": 0.4164592547495837, + "grad_norm": 1.6102182865142822, + "learning_rate": 0.00013137565974161524, + "loss": 1.7887, + "step": 11629 + }, + { + "epoch": 0.41649506687915194, + "grad_norm": 1.7597026824951172, + "learning_rate": 0.00013136464622780583, + "loss": 1.6061, + "step": 11630 + }, + { + "epoch": 0.41653087900872027, + "grad_norm": 2.7631938457489014, + "learning_rate": 0.00013135363229202564, + "loss": 1.6641, + "step": 11631 + }, + { + "epoch": 0.41656669113828854, + "grad_norm": 1.9850269556045532, + "learning_rate": 0.0001313426179344227, + "loss": 1.2111, + "step": 11632 + }, + { + "epoch": 0.4166025032678568, + "grad_norm": 1.4527435302734375, + "learning_rate": 0.0001313316031551453, + "loss": 1.5459, + "step": 11633 + }, + { + "epoch": 0.41663831539742513, + "grad_norm": 1.589457631111145, + "learning_rate": 0.00013132058795434158, + "loss": 1.6198, + "step": 11634 + }, + { + "epoch": 0.4166741275269934, + "grad_norm": 1.4151225090026855, + "learning_rate": 0.0001313095723321598, + "loss": 1.4616, + "step": 11635 + }, + { + "epoch": 0.41670993965656167, + "grad_norm": 1.7117297649383545, + "learning_rate": 0.00013129855628874805, + "loss": 1.6629, + "step": 11636 + }, + { + "epoch": 0.41674575178612994, + "grad_norm": 2.363799571990967, + "learning_rate": 0.0001312875398242546, + "loss": 1.6905, + "step": 11637 + }, + { + "epoch": 0.41678156391569826, + "grad_norm": 1.919259786605835, + "learning_rate": 0.0001312765229388277, + "loss": 1.4872, + "step": 11638 + }, + { + "epoch": 0.41681737604526653, + "grad_norm": 2.0056421756744385, + "learning_rate": 0.00013126550563261551, + "loss": 1.5765, + "step": 11639 + }, + { + "epoch": 0.4168531881748348, + "grad_norm": 1.4317076206207275, + "learning_rate": 0.00013125448790576627, + "loss": 1.3747, + "step": 11640 + }, + { + "epoch": 0.4168890003044031, + "grad_norm": 1.9106783866882324, + "learning_rate": 0.00013124346975842822, + "loss": 1.3107, + "step": 11641 + }, + { + "epoch": 0.4169248124339714, + "grad_norm": 3.9579901695251465, + "learning_rate": 0.00013123245119074956, + "loss": 1.5716, + "step": 11642 + }, + { + "epoch": 0.41696062456353966, + "grad_norm": 2.3893094062805176, + "learning_rate": 0.00013122143220287854, + "loss": 1.5343, + "step": 11643 + }, + { + "epoch": 0.41699643669310793, + "grad_norm": 1.485538363456726, + "learning_rate": 0.00013121041279496348, + "loss": 1.5387, + "step": 11644 + }, + { + "epoch": 0.41703224882267625, + "grad_norm": 1.8348642587661743, + "learning_rate": 0.00013119939296715253, + "loss": 1.653, + "step": 11645 + }, + { + "epoch": 0.4170680609522445, + "grad_norm": 1.5645591020584106, + "learning_rate": 0.00013118837271959403, + "loss": 1.6355, + "step": 11646 + }, + { + "epoch": 0.4171038730818128, + "grad_norm": 1.9574569463729858, + "learning_rate": 0.0001311773520524362, + "loss": 1.6933, + "step": 11647 + }, + { + "epoch": 0.4171396852113811, + "grad_norm": 2.693382740020752, + "learning_rate": 0.00013116633096582728, + "loss": 1.2307, + "step": 11648 + }, + { + "epoch": 0.4171754973409494, + "grad_norm": 2.2191860675811768, + "learning_rate": 0.0001311553094599156, + "loss": 1.5716, + "step": 11649 + }, + { + "epoch": 0.41721130947051766, + "grad_norm": 1.8328752517700195, + "learning_rate": 0.00013114428753484942, + "loss": 1.4303, + "step": 11650 + }, + { + "epoch": 0.4172471216000859, + "grad_norm": 1.64580500125885, + "learning_rate": 0.00013113326519077702, + "loss": 1.8983, + "step": 11651 + }, + { + "epoch": 0.41728293372965425, + "grad_norm": 1.5777573585510254, + "learning_rate": 0.0001311222424278467, + "loss": 1.3932, + "step": 11652 + }, + { + "epoch": 0.4173187458592225, + "grad_norm": 1.5518149137496948, + "learning_rate": 0.00013111121924620672, + "loss": 1.3117, + "step": 11653 + }, + { + "epoch": 0.4173545579887908, + "grad_norm": 1.7711327075958252, + "learning_rate": 0.00013110019564600546, + "loss": 1.4527, + "step": 11654 + }, + { + "epoch": 0.4173903701183591, + "grad_norm": 1.4793881177902222, + "learning_rate": 0.00013108917162739115, + "loss": 1.7026, + "step": 11655 + }, + { + "epoch": 0.4174261822479274, + "grad_norm": 1.7926698923110962, + "learning_rate": 0.00013107814719051216, + "loss": 1.4584, + "step": 11656 + }, + { + "epoch": 0.41746199437749565, + "grad_norm": 1.60378897190094, + "learning_rate": 0.0001310671223355168, + "loss": 1.6985, + "step": 11657 + }, + { + "epoch": 0.4174978065070639, + "grad_norm": 1.862740397453308, + "learning_rate": 0.00013105609706255336, + "loss": 1.6307, + "step": 11658 + }, + { + "epoch": 0.41753361863663224, + "grad_norm": 1.755902647972107, + "learning_rate": 0.00013104507137177022, + "loss": 1.6147, + "step": 11659 + }, + { + "epoch": 0.4175694307662005, + "grad_norm": 1.6584196090698242, + "learning_rate": 0.00013103404526331564, + "loss": 1.4459, + "step": 11660 + }, + { + "epoch": 0.4176052428957688, + "grad_norm": 1.5859105587005615, + "learning_rate": 0.00013102301873733807, + "loss": 1.5394, + "step": 11661 + }, + { + "epoch": 0.4176410550253371, + "grad_norm": 1.9892284870147705, + "learning_rate": 0.00013101199179398572, + "loss": 1.3006, + "step": 11662 + }, + { + "epoch": 0.4176768671549054, + "grad_norm": 1.620436191558838, + "learning_rate": 0.0001310009644334071, + "loss": 1.4348, + "step": 11663 + }, + { + "epoch": 0.41771267928447364, + "grad_norm": 1.9087800979614258, + "learning_rate": 0.00013098993665575047, + "loss": 1.725, + "step": 11664 + }, + { + "epoch": 0.4177484914140419, + "grad_norm": 1.4620959758758545, + "learning_rate": 0.0001309789084611642, + "loss": 1.4238, + "step": 11665 + }, + { + "epoch": 0.41778430354361024, + "grad_norm": 1.6417380571365356, + "learning_rate": 0.00013096787984979673, + "loss": 1.9673, + "step": 11666 + }, + { + "epoch": 0.4178201156731785, + "grad_norm": 1.8227124214172363, + "learning_rate": 0.00013095685082179632, + "loss": 1.5281, + "step": 11667 + }, + { + "epoch": 0.4178559278027468, + "grad_norm": 1.9912152290344238, + "learning_rate": 0.00013094582137731145, + "loss": 1.3199, + "step": 11668 + }, + { + "epoch": 0.4178917399323151, + "grad_norm": 1.9016451835632324, + "learning_rate": 0.00013093479151649043, + "loss": 1.5948, + "step": 11669 + }, + { + "epoch": 0.41792755206188337, + "grad_norm": 1.4120560884475708, + "learning_rate": 0.00013092376123948174, + "loss": 1.3703, + "step": 11670 + }, + { + "epoch": 0.41796336419145164, + "grad_norm": 1.8982212543487549, + "learning_rate": 0.0001309127305464337, + "loss": 1.149, + "step": 11671 + }, + { + "epoch": 0.4179991763210199, + "grad_norm": 1.4889423847198486, + "learning_rate": 0.00013090169943749476, + "loss": 1.5313, + "step": 11672 + }, + { + "epoch": 0.41803498845058823, + "grad_norm": 1.54822838306427, + "learning_rate": 0.00013089066791281332, + "loss": 1.4798, + "step": 11673 + }, + { + "epoch": 0.4180708005801565, + "grad_norm": 2.2774507999420166, + "learning_rate": 0.00013087963597253777, + "loss": 1.3878, + "step": 11674 + }, + { + "epoch": 0.41810661270972477, + "grad_norm": 1.52279794216156, + "learning_rate": 0.00013086860361681657, + "loss": 1.3788, + "step": 11675 + }, + { + "epoch": 0.4181424248392931, + "grad_norm": 1.921139121055603, + "learning_rate": 0.00013085757084579808, + "loss": 1.6913, + "step": 11676 + }, + { + "epoch": 0.41817823696886136, + "grad_norm": 1.6303210258483887, + "learning_rate": 0.00013084653765963085, + "loss": 1.636, + "step": 11677 + }, + { + "epoch": 0.41821404909842963, + "grad_norm": 1.3981053829193115, + "learning_rate": 0.0001308355040584632, + "loss": 1.4341, + "step": 11678 + }, + { + "epoch": 0.4182498612279979, + "grad_norm": 1.5886470079421997, + "learning_rate": 0.0001308244700424436, + "loss": 1.5647, + "step": 11679 + }, + { + "epoch": 0.4182856733575662, + "grad_norm": 1.4517158269882202, + "learning_rate": 0.00013081343561172055, + "loss": 1.4405, + "step": 11680 + }, + { + "epoch": 0.4183214854871345, + "grad_norm": 1.6235915422439575, + "learning_rate": 0.00013080240076644245, + "loss": 1.6848, + "step": 11681 + }, + { + "epoch": 0.41835729761670276, + "grad_norm": 1.8105449676513672, + "learning_rate": 0.0001307913655067578, + "loss": 1.4255, + "step": 11682 + }, + { + "epoch": 0.4183931097462711, + "grad_norm": 1.8867751359939575, + "learning_rate": 0.00013078032983281505, + "loss": 1.8674, + "step": 11683 + }, + { + "epoch": 0.41842892187583935, + "grad_norm": 1.5423702001571655, + "learning_rate": 0.00013076929374476265, + "loss": 1.6115, + "step": 11684 + }, + { + "epoch": 0.4184647340054076, + "grad_norm": 1.234898567199707, + "learning_rate": 0.00013075825724274907, + "loss": 1.3527, + "step": 11685 + }, + { + "epoch": 0.4185005461349759, + "grad_norm": 1.7368626594543457, + "learning_rate": 0.00013074722032692287, + "loss": 1.4389, + "step": 11686 + }, + { + "epoch": 0.4185363582645442, + "grad_norm": 2.3689353466033936, + "learning_rate": 0.00013073618299743242, + "loss": 1.4973, + "step": 11687 + }, + { + "epoch": 0.4185721703941125, + "grad_norm": 1.5590380430221558, + "learning_rate": 0.00013072514525442632, + "loss": 1.482, + "step": 11688 + }, + { + "epoch": 0.41860798252368076, + "grad_norm": 1.8684630393981934, + "learning_rate": 0.000130714107098053, + "loss": 1.4597, + "step": 11689 + }, + { + "epoch": 0.4186437946532491, + "grad_norm": 1.368911623954773, + "learning_rate": 0.000130703068528461, + "loss": 1.4001, + "step": 11690 + }, + { + "epoch": 0.41867960678281735, + "grad_norm": 2.009352922439575, + "learning_rate": 0.00013069202954579882, + "loss": 1.6073, + "step": 11691 + }, + { + "epoch": 0.4187154189123856, + "grad_norm": 1.8053749799728394, + "learning_rate": 0.00013068099015021498, + "loss": 1.2745, + "step": 11692 + }, + { + "epoch": 0.4187512310419539, + "grad_norm": 2.223372459411621, + "learning_rate": 0.00013066995034185798, + "loss": 1.7715, + "step": 11693 + }, + { + "epoch": 0.4187870431715222, + "grad_norm": 1.833401083946228, + "learning_rate": 0.00013065891012087634, + "loss": 1.7072, + "step": 11694 + }, + { + "epoch": 0.4188228553010905, + "grad_norm": 1.3824347257614136, + "learning_rate": 0.00013064786948741863, + "loss": 1.4636, + "step": 11695 + }, + { + "epoch": 0.41885866743065875, + "grad_norm": 1.4842058420181274, + "learning_rate": 0.00013063682844163338, + "loss": 1.2638, + "step": 11696 + }, + { + "epoch": 0.4188944795602271, + "grad_norm": 1.8669413328170776, + "learning_rate": 0.00013062578698366909, + "loss": 1.5242, + "step": 11697 + }, + { + "epoch": 0.41893029168979534, + "grad_norm": 1.6303125619888306, + "learning_rate": 0.00013061474511367435, + "loss": 1.6688, + "step": 11698 + }, + { + "epoch": 0.4189661038193636, + "grad_norm": 1.7029786109924316, + "learning_rate": 0.00013060370283179772, + "loss": 1.7938, + "step": 11699 + }, + { + "epoch": 0.4190019159489319, + "grad_norm": 2.1118316650390625, + "learning_rate": 0.00013059266013818775, + "loss": 1.5609, + "step": 11700 + }, + { + "epoch": 0.4190377280785002, + "grad_norm": 1.7894207239151, + "learning_rate": 0.000130581617032993, + "loss": 1.6328, + "step": 11701 + }, + { + "epoch": 0.4190735402080685, + "grad_norm": 1.6137210130691528, + "learning_rate": 0.00013057057351636205, + "loss": 1.1927, + "step": 11702 + }, + { + "epoch": 0.41910935233763674, + "grad_norm": 1.628922700881958, + "learning_rate": 0.00013055952958844345, + "loss": 1.7027, + "step": 11703 + }, + { + "epoch": 0.41914516446720507, + "grad_norm": 1.7288146018981934, + "learning_rate": 0.00013054848524938576, + "loss": 1.5557, + "step": 11704 + }, + { + "epoch": 0.41918097659677334, + "grad_norm": 1.3177454471588135, + "learning_rate": 0.00013053744049933765, + "loss": 1.5625, + "step": 11705 + }, + { + "epoch": 0.4192167887263416, + "grad_norm": 1.693861722946167, + "learning_rate": 0.00013052639533844766, + "loss": 1.3449, + "step": 11706 + }, + { + "epoch": 0.4192526008559099, + "grad_norm": 1.604946494102478, + "learning_rate": 0.0001305153497668644, + "loss": 1.5387, + "step": 11707 + }, + { + "epoch": 0.4192884129854782, + "grad_norm": 1.9413237571716309, + "learning_rate": 0.00013050430378473647, + "loss": 1.275, + "step": 11708 + }, + { + "epoch": 0.41932422511504647, + "grad_norm": 1.5269191265106201, + "learning_rate": 0.00013049325739221247, + "loss": 1.6002, + "step": 11709 + }, + { + "epoch": 0.41936003724461474, + "grad_norm": 1.6607357263565063, + "learning_rate": 0.00013048221058944103, + "loss": 1.5763, + "step": 11710 + }, + { + "epoch": 0.41939584937418306, + "grad_norm": 1.8496578931808472, + "learning_rate": 0.00013047116337657077, + "loss": 1.734, + "step": 11711 + }, + { + "epoch": 0.41943166150375133, + "grad_norm": 1.3404603004455566, + "learning_rate": 0.00013046011575375033, + "loss": 1.3846, + "step": 11712 + }, + { + "epoch": 0.4194674736333196, + "grad_norm": 1.9869346618652344, + "learning_rate": 0.00013044906772112828, + "loss": 1.4686, + "step": 11713 + }, + { + "epoch": 0.41950328576288787, + "grad_norm": 1.6581941843032837, + "learning_rate": 0.00013043801927885334, + "loss": 1.5109, + "step": 11714 + }, + { + "epoch": 0.4195390978924562, + "grad_norm": 2.1327946186065674, + "learning_rate": 0.00013042697042707407, + "loss": 1.2431, + "step": 11715 + }, + { + "epoch": 0.41957491002202446, + "grad_norm": 1.4978673458099365, + "learning_rate": 0.0001304159211659392, + "loss": 1.5151, + "step": 11716 + }, + { + "epoch": 0.41961072215159273, + "grad_norm": 1.6439182758331299, + "learning_rate": 0.00013040487149559735, + "loss": 1.4472, + "step": 11717 + }, + { + "epoch": 0.41964653428116105, + "grad_norm": 1.6311148405075073, + "learning_rate": 0.00013039382141619713, + "loss": 1.6517, + "step": 11718 + }, + { + "epoch": 0.4196823464107293, + "grad_norm": 1.6701090335845947, + "learning_rate": 0.0001303827709278873, + "loss": 1.8037, + "step": 11719 + }, + { + "epoch": 0.4197181585402976, + "grad_norm": 1.9653955698013306, + "learning_rate": 0.0001303717200308164, + "loss": 1.3443, + "step": 11720 + }, + { + "epoch": 0.41975397066986586, + "grad_norm": 1.8693047761917114, + "learning_rate": 0.00013036066872513326, + "loss": 1.6978, + "step": 11721 + }, + { + "epoch": 0.4197897827994342, + "grad_norm": 1.5519013404846191, + "learning_rate": 0.00013034961701098645, + "loss": 1.355, + "step": 11722 + }, + { + "epoch": 0.41982559492900245, + "grad_norm": 2.0810554027557373, + "learning_rate": 0.0001303385648885247, + "loss": 1.8034, + "step": 11723 + }, + { + "epoch": 0.4198614070585707, + "grad_norm": 2.0059664249420166, + "learning_rate": 0.00013032751235789668, + "loss": 1.53, + "step": 11724 + }, + { + "epoch": 0.41989721918813905, + "grad_norm": 1.6518661975860596, + "learning_rate": 0.00013031645941925115, + "loss": 1.591, + "step": 11725 + }, + { + "epoch": 0.4199330313177073, + "grad_norm": 1.6742372512817383, + "learning_rate": 0.00013030540607273674, + "loss": 1.6334, + "step": 11726 + }, + { + "epoch": 0.4199688434472756, + "grad_norm": 1.625395655632019, + "learning_rate": 0.00013029435231850215, + "loss": 1.4972, + "step": 11727 + }, + { + "epoch": 0.42000465557684385, + "grad_norm": 1.3360016345977783, + "learning_rate": 0.00013028329815669616, + "loss": 1.2917, + "step": 11728 + }, + { + "epoch": 0.4200404677064122, + "grad_norm": 1.6598440408706665, + "learning_rate": 0.00013027224358746743, + "loss": 1.4068, + "step": 11729 + }, + { + "epoch": 0.42007627983598045, + "grad_norm": 1.5892927646636963, + "learning_rate": 0.00013026118861096472, + "loss": 1.239, + "step": 11730 + }, + { + "epoch": 0.4201120919655487, + "grad_norm": 1.4301259517669678, + "learning_rate": 0.00013025013322733674, + "loss": 1.4511, + "step": 11731 + }, + { + "epoch": 0.42014790409511704, + "grad_norm": 1.4316529035568237, + "learning_rate": 0.00013023907743673228, + "loss": 1.5084, + "step": 11732 + }, + { + "epoch": 0.4201837162246853, + "grad_norm": 1.458953857421875, + "learning_rate": 0.0001302280212393, + "loss": 1.3913, + "step": 11733 + }, + { + "epoch": 0.4202195283542536, + "grad_norm": 2.121302366256714, + "learning_rate": 0.0001302169646351887, + "loss": 1.6365, + "step": 11734 + }, + { + "epoch": 0.42025534048382185, + "grad_norm": 1.6328710317611694, + "learning_rate": 0.00013020590762454713, + "loss": 1.5309, + "step": 11735 + }, + { + "epoch": 0.4202911526133902, + "grad_norm": 1.7382535934448242, + "learning_rate": 0.00013019485020752402, + "loss": 1.28, + "step": 11736 + }, + { + "epoch": 0.42032696474295844, + "grad_norm": 1.6773878335952759, + "learning_rate": 0.00013018379238426814, + "loss": 1.7261, + "step": 11737 + }, + { + "epoch": 0.4203627768725267, + "grad_norm": 2.123692750930786, + "learning_rate": 0.0001301727341549283, + "loss": 1.6939, + "step": 11738 + }, + { + "epoch": 0.42039858900209504, + "grad_norm": 1.591450810432434, + "learning_rate": 0.0001301616755196532, + "loss": 1.6366, + "step": 11739 + }, + { + "epoch": 0.4204344011316633, + "grad_norm": 1.7275581359863281, + "learning_rate": 0.0001301506164785917, + "loss": 1.6026, + "step": 11740 + }, + { + "epoch": 0.4204702132612316, + "grad_norm": 1.6297035217285156, + "learning_rate": 0.00013013955703189252, + "loss": 1.683, + "step": 11741 + }, + { + "epoch": 0.42050602539079984, + "grad_norm": 1.9007762670516968, + "learning_rate": 0.00013012849717970447, + "loss": 1.3047, + "step": 11742 + }, + { + "epoch": 0.42054183752036817, + "grad_norm": 1.8373106718063354, + "learning_rate": 0.00013011743692217638, + "loss": 1.6415, + "step": 11743 + }, + { + "epoch": 0.42057764964993644, + "grad_norm": 1.351611852645874, + "learning_rate": 0.000130106376259457, + "loss": 1.3333, + "step": 11744 + }, + { + "epoch": 0.4206134617795047, + "grad_norm": 1.4901500940322876, + "learning_rate": 0.0001300953151916952, + "loss": 1.6745, + "step": 11745 + }, + { + "epoch": 0.42064927390907303, + "grad_norm": 1.6721500158309937, + "learning_rate": 0.0001300842537190397, + "loss": 1.4681, + "step": 11746 + }, + { + "epoch": 0.4206850860386413, + "grad_norm": 2.408515453338623, + "learning_rate": 0.00013007319184163944, + "loss": 1.4235, + "step": 11747 + }, + { + "epoch": 0.42072089816820957, + "grad_norm": 2.0116817951202393, + "learning_rate": 0.00013006212955964311, + "loss": 1.586, + "step": 11748 + }, + { + "epoch": 0.42075671029777784, + "grad_norm": 1.4370110034942627, + "learning_rate": 0.00013005106687319966, + "loss": 1.8267, + "step": 11749 + }, + { + "epoch": 0.42079252242734616, + "grad_norm": 1.985016942024231, + "learning_rate": 0.00013004000378245782, + "loss": 1.8785, + "step": 11750 + }, + { + "epoch": 0.42082833455691443, + "grad_norm": 1.489750623703003, + "learning_rate": 0.00013002894028756653, + "loss": 1.3115, + "step": 11751 + }, + { + "epoch": 0.4208641466864827, + "grad_norm": 1.9699902534484863, + "learning_rate": 0.0001300178763886745, + "loss": 1.4464, + "step": 11752 + }, + { + "epoch": 0.420899958816051, + "grad_norm": 1.4457578659057617, + "learning_rate": 0.00013000681208593073, + "loss": 1.6236, + "step": 11753 + }, + { + "epoch": 0.4209357709456193, + "grad_norm": 1.5516526699066162, + "learning_rate": 0.000129995747379484, + "loss": 1.5156, + "step": 11754 + }, + { + "epoch": 0.42097158307518756, + "grad_norm": 1.7440998554229736, + "learning_rate": 0.00012998468226948316, + "loss": 1.7088, + "step": 11755 + }, + { + "epoch": 0.42100739520475583, + "grad_norm": 1.6341153383255005, + "learning_rate": 0.00012997361675607714, + "loss": 1.3348, + "step": 11756 + }, + { + "epoch": 0.42104320733432415, + "grad_norm": 2.47872257232666, + "learning_rate": 0.0001299625508394147, + "loss": 1.6474, + "step": 11757 + }, + { + "epoch": 0.4210790194638924, + "grad_norm": 1.4526902437210083, + "learning_rate": 0.00012995148451964487, + "loss": 1.5869, + "step": 11758 + }, + { + "epoch": 0.4211148315934607, + "grad_norm": 1.5664465427398682, + "learning_rate": 0.00012994041779691639, + "loss": 1.6571, + "step": 11759 + }, + { + "epoch": 0.421150643723029, + "grad_norm": 1.4599276781082153, + "learning_rate": 0.00012992935067137823, + "loss": 1.357, + "step": 11760 + }, + { + "epoch": 0.4211864558525973, + "grad_norm": 1.712045669555664, + "learning_rate": 0.00012991828314317923, + "loss": 1.531, + "step": 11761 + }, + { + "epoch": 0.42122226798216555, + "grad_norm": 2.1027767658233643, + "learning_rate": 0.00012990721521246839, + "loss": 1.6629, + "step": 11762 + }, + { + "epoch": 0.4212580801117338, + "grad_norm": 1.7195019721984863, + "learning_rate": 0.00012989614687939453, + "loss": 1.5359, + "step": 11763 + }, + { + "epoch": 0.42129389224130215, + "grad_norm": 1.4466419219970703, + "learning_rate": 0.00012988507814410652, + "loss": 1.6942, + "step": 11764 + }, + { + "epoch": 0.4213297043708704, + "grad_norm": 1.2719695568084717, + "learning_rate": 0.0001298740090067534, + "loss": 1.5447, + "step": 11765 + }, + { + "epoch": 0.4213655165004387, + "grad_norm": 2.5164668560028076, + "learning_rate": 0.000129862939467484, + "loss": 1.5569, + "step": 11766 + }, + { + "epoch": 0.421401328630007, + "grad_norm": 1.691938877105713, + "learning_rate": 0.00012985186952644724, + "loss": 1.4605, + "step": 11767 + }, + { + "epoch": 0.4214371407595753, + "grad_norm": 1.8045603036880493, + "learning_rate": 0.0001298407991837921, + "loss": 1.3567, + "step": 11768 + }, + { + "epoch": 0.42147295288914355, + "grad_norm": 1.5395952463150024, + "learning_rate": 0.0001298297284396675, + "loss": 1.6325, + "step": 11769 + }, + { + "epoch": 0.4215087650187118, + "grad_norm": 1.8432800769805908, + "learning_rate": 0.0001298186572942224, + "loss": 1.4202, + "step": 11770 + }, + { + "epoch": 0.42154457714828014, + "grad_norm": 2.240885019302368, + "learning_rate": 0.00012980758574760573, + "loss": 1.9024, + "step": 11771 + }, + { + "epoch": 0.4215803892778484, + "grad_norm": 2.1993038654327393, + "learning_rate": 0.00012979651379996642, + "loss": 1.5855, + "step": 11772 + }, + { + "epoch": 0.4216162014074167, + "grad_norm": 1.9593286514282227, + "learning_rate": 0.00012978544145145343, + "loss": 1.5088, + "step": 11773 + }, + { + "epoch": 0.421652013536985, + "grad_norm": 1.6190402507781982, + "learning_rate": 0.0001297743687022158, + "loss": 1.6438, + "step": 11774 + }, + { + "epoch": 0.4216878256665533, + "grad_norm": 1.6543118953704834, + "learning_rate": 0.0001297632955524024, + "loss": 1.2093, + "step": 11775 + }, + { + "epoch": 0.42172363779612154, + "grad_norm": 1.6316370964050293, + "learning_rate": 0.00012975222200216227, + "loss": 1.3618, + "step": 11776 + }, + { + "epoch": 0.4217594499256898, + "grad_norm": 1.841871738433838, + "learning_rate": 0.00012974114805164438, + "loss": 1.6844, + "step": 11777 + }, + { + "epoch": 0.42179526205525814, + "grad_norm": 1.207383632659912, + "learning_rate": 0.0001297300737009977, + "loss": 1.4037, + "step": 11778 + }, + { + "epoch": 0.4218310741848264, + "grad_norm": 1.9168089628219604, + "learning_rate": 0.00012971899895037123, + "loss": 1.3065, + "step": 11779 + }, + { + "epoch": 0.4218668863143947, + "grad_norm": 1.7665578126907349, + "learning_rate": 0.00012970792379991396, + "loss": 1.4799, + "step": 11780 + }, + { + "epoch": 0.421902698443963, + "grad_norm": 1.5356736183166504, + "learning_rate": 0.00012969684824977492, + "loss": 1.5496, + "step": 11781 + }, + { + "epoch": 0.42193851057353127, + "grad_norm": 1.6036309003829956, + "learning_rate": 0.00012968577230010304, + "loss": 1.6618, + "step": 11782 + }, + { + "epoch": 0.42197432270309954, + "grad_norm": 1.2157410383224487, + "learning_rate": 0.0001296746959510474, + "loss": 1.4071, + "step": 11783 + }, + { + "epoch": 0.4220101348326678, + "grad_norm": 1.6793562173843384, + "learning_rate": 0.00012966361920275702, + "loss": 1.2486, + "step": 11784 + }, + { + "epoch": 0.42204594696223613, + "grad_norm": 1.6444344520568848, + "learning_rate": 0.0001296525420553809, + "loss": 1.5359, + "step": 11785 + }, + { + "epoch": 0.4220817590918044, + "grad_norm": 2.4854736328125, + "learning_rate": 0.00012964146450906807, + "loss": 1.4089, + "step": 11786 + }, + { + "epoch": 0.42211757122137267, + "grad_norm": 2.2101008892059326, + "learning_rate": 0.0001296303865639676, + "loss": 1.6471, + "step": 11787 + }, + { + "epoch": 0.42215338335094094, + "grad_norm": 1.3930305242538452, + "learning_rate": 0.00012961930822022848, + "loss": 1.5859, + "step": 11788 + }, + { + "epoch": 0.42218919548050926, + "grad_norm": 1.6884602308273315, + "learning_rate": 0.00012960822947799978, + "loss": 1.4459, + "step": 11789 + }, + { + "epoch": 0.42222500761007753, + "grad_norm": 1.2813154458999634, + "learning_rate": 0.0001295971503374305, + "loss": 1.5418, + "step": 11790 + }, + { + "epoch": 0.4222608197396458, + "grad_norm": 1.8060952425003052, + "learning_rate": 0.0001295860707986698, + "loss": 1.5921, + "step": 11791 + }, + { + "epoch": 0.4222966318692141, + "grad_norm": 2.59269642829895, + "learning_rate": 0.00012957499086186665, + "loss": 1.3835, + "step": 11792 + }, + { + "epoch": 0.4223324439987824, + "grad_norm": 1.729286789894104, + "learning_rate": 0.00012956391052717017, + "loss": 1.4255, + "step": 11793 + }, + { + "epoch": 0.42236825612835066, + "grad_norm": 1.4950153827667236, + "learning_rate": 0.00012955282979472938, + "loss": 1.1579, + "step": 11794 + }, + { + "epoch": 0.42240406825791893, + "grad_norm": 1.7770169973373413, + "learning_rate": 0.00012954174866469336, + "loss": 1.7252, + "step": 11795 + }, + { + "epoch": 0.42243988038748725, + "grad_norm": 1.7633696794509888, + "learning_rate": 0.0001295306671372113, + "loss": 1.5152, + "step": 11796 + }, + { + "epoch": 0.4224756925170555, + "grad_norm": 2.1306397914886475, + "learning_rate": 0.00012951958521243215, + "loss": 1.6468, + "step": 11797 + }, + { + "epoch": 0.4225115046466238, + "grad_norm": 1.2364559173583984, + "learning_rate": 0.00012950850289050508, + "loss": 1.5101, + "step": 11798 + }, + { + "epoch": 0.4225473167761921, + "grad_norm": 1.563754677772522, + "learning_rate": 0.00012949742017157915, + "loss": 1.3717, + "step": 11799 + }, + { + "epoch": 0.4225831289057604, + "grad_norm": 1.5410443544387817, + "learning_rate": 0.0001294863370558035, + "loss": 1.3353, + "step": 11800 + }, + { + "epoch": 0.42261894103532865, + "grad_norm": 1.564864993095398, + "learning_rate": 0.0001294752535433272, + "loss": 1.3936, + "step": 11801 + }, + { + "epoch": 0.4226547531648969, + "grad_norm": 1.5698978900909424, + "learning_rate": 0.0001294641696342994, + "loss": 1.5412, + "step": 11802 + }, + { + "epoch": 0.42269056529446525, + "grad_norm": 2.2333059310913086, + "learning_rate": 0.00012945308532886918, + "loss": 1.3542, + "step": 11803 + }, + { + "epoch": 0.4227263774240335, + "grad_norm": 1.559870719909668, + "learning_rate": 0.0001294420006271857, + "loss": 1.3988, + "step": 11804 + }, + { + "epoch": 0.4227621895536018, + "grad_norm": 1.3145016431808472, + "learning_rate": 0.00012943091552939807, + "loss": 1.2073, + "step": 11805 + }, + { + "epoch": 0.4227980016831701, + "grad_norm": 1.6928826570510864, + "learning_rate": 0.00012941983003565544, + "loss": 1.376, + "step": 11806 + }, + { + "epoch": 0.4228338138127384, + "grad_norm": 1.3938753604888916, + "learning_rate": 0.00012940874414610698, + "loss": 1.5549, + "step": 11807 + }, + { + "epoch": 0.42286962594230665, + "grad_norm": 1.5950511693954468, + "learning_rate": 0.00012939765786090178, + "loss": 1.3254, + "step": 11808 + }, + { + "epoch": 0.4229054380718749, + "grad_norm": 1.5573126077651978, + "learning_rate": 0.000129386571180189, + "loss": 1.5849, + "step": 11809 + }, + { + "epoch": 0.42294125020144324, + "grad_norm": 2.0934598445892334, + "learning_rate": 0.00012937548410411778, + "loss": 1.4638, + "step": 11810 + }, + { + "epoch": 0.4229770623310115, + "grad_norm": 1.5477838516235352, + "learning_rate": 0.0001293643966328374, + "loss": 1.4116, + "step": 11811 + }, + { + "epoch": 0.4230128744605798, + "grad_norm": 1.7207345962524414, + "learning_rate": 0.00012935330876649687, + "loss": 1.5597, + "step": 11812 + }, + { + "epoch": 0.4230486865901481, + "grad_norm": 1.701752781867981, + "learning_rate": 0.00012934222050524547, + "loss": 1.6269, + "step": 11813 + }, + { + "epoch": 0.4230844987197164, + "grad_norm": 1.5868911743164062, + "learning_rate": 0.00012933113184923232, + "loss": 1.4728, + "step": 11814 + }, + { + "epoch": 0.42312031084928464, + "grad_norm": 1.414086103439331, + "learning_rate": 0.00012932004279860663, + "loss": 1.3683, + "step": 11815 + }, + { + "epoch": 0.4231561229788529, + "grad_norm": 2.1255552768707275, + "learning_rate": 0.0001293089533535176, + "loss": 1.8208, + "step": 11816 + }, + { + "epoch": 0.42319193510842124, + "grad_norm": 1.8227784633636475, + "learning_rate": 0.00012929786351411439, + "loss": 1.3282, + "step": 11817 + }, + { + "epoch": 0.4232277472379895, + "grad_norm": 1.7852251529693604, + "learning_rate": 0.00012928677328054623, + "loss": 1.5736, + "step": 11818 + }, + { + "epoch": 0.4232635593675578, + "grad_norm": 2.8936076164245605, + "learning_rate": 0.00012927568265296227, + "loss": 1.5581, + "step": 11819 + }, + { + "epoch": 0.4232993714971261, + "grad_norm": 1.452358365058899, + "learning_rate": 0.00012926459163151182, + "loss": 1.6559, + "step": 11820 + }, + { + "epoch": 0.42333518362669437, + "grad_norm": 1.7268511056900024, + "learning_rate": 0.000129253500216344, + "loss": 1.604, + "step": 11821 + }, + { + "epoch": 0.42337099575626264, + "grad_norm": 1.3520252704620361, + "learning_rate": 0.00012924240840760811, + "loss": 1.4204, + "step": 11822 + }, + { + "epoch": 0.4234068078858309, + "grad_norm": 1.4777060747146606, + "learning_rate": 0.0001292313162054533, + "loss": 1.3741, + "step": 11823 + }, + { + "epoch": 0.42344262001539923, + "grad_norm": 1.7785598039627075, + "learning_rate": 0.00012922022361002886, + "loss": 1.6714, + "step": 11824 + }, + { + "epoch": 0.4234784321449675, + "grad_norm": 1.92035710811615, + "learning_rate": 0.00012920913062148398, + "loss": 1.8324, + "step": 11825 + }, + { + "epoch": 0.42351424427453577, + "grad_norm": 1.8269778490066528, + "learning_rate": 0.00012919803723996794, + "loss": 1.7714, + "step": 11826 + }, + { + "epoch": 0.4235500564041041, + "grad_norm": 1.4856898784637451, + "learning_rate": 0.00012918694346562997, + "loss": 1.7747, + "step": 11827 + }, + { + "epoch": 0.42358586853367236, + "grad_norm": 1.4505622386932373, + "learning_rate": 0.0001291758492986193, + "loss": 1.6209, + "step": 11828 + }, + { + "epoch": 0.42362168066324063, + "grad_norm": 1.4111675024032593, + "learning_rate": 0.00012916475473908525, + "loss": 1.2532, + "step": 11829 + }, + { + "epoch": 0.4236574927928089, + "grad_norm": 1.8499622344970703, + "learning_rate": 0.00012915365978717703, + "loss": 1.4083, + "step": 11830 + }, + { + "epoch": 0.4236933049223772, + "grad_norm": 1.4217849969863892, + "learning_rate": 0.0001291425644430439, + "loss": 1.5601, + "step": 11831 + }, + { + "epoch": 0.4237291170519455, + "grad_norm": 1.7619093656539917, + "learning_rate": 0.0001291314687068352, + "loss": 1.3126, + "step": 11832 + }, + { + "epoch": 0.42376492918151376, + "grad_norm": 1.2267673015594482, + "learning_rate": 0.00012912037257870016, + "loss": 1.5579, + "step": 11833 + }, + { + "epoch": 0.4238007413110821, + "grad_norm": 1.6975821256637573, + "learning_rate": 0.00012910927605878803, + "loss": 1.4415, + "step": 11834 + }, + { + "epoch": 0.42383655344065035, + "grad_norm": 1.650166392326355, + "learning_rate": 0.0001290981791472482, + "loss": 1.8445, + "step": 11835 + }, + { + "epoch": 0.4238723655702186, + "grad_norm": 1.423077940940857, + "learning_rate": 0.00012908708184422983, + "loss": 1.3797, + "step": 11836 + }, + { + "epoch": 0.4239081776997869, + "grad_norm": 1.5820120573043823, + "learning_rate": 0.0001290759841498823, + "loss": 1.6298, + "step": 11837 + }, + { + "epoch": 0.4239439898293552, + "grad_norm": 1.6595290899276733, + "learning_rate": 0.00012906488606435497, + "loss": 1.5093, + "step": 11838 + }, + { + "epoch": 0.4239798019589235, + "grad_norm": 2.5189099311828613, + "learning_rate": 0.00012905378758779702, + "loss": 1.3822, + "step": 11839 + }, + { + "epoch": 0.42401561408849175, + "grad_norm": 1.7056360244750977, + "learning_rate": 0.00012904268872035787, + "loss": 1.253, + "step": 11840 + }, + { + "epoch": 0.4240514262180601, + "grad_norm": 1.8914939165115356, + "learning_rate": 0.00012903158946218682, + "loss": 1.4382, + "step": 11841 + }, + { + "epoch": 0.42408723834762835, + "grad_norm": 1.5297006368637085, + "learning_rate": 0.0001290204898134332, + "loss": 1.5213, + "step": 11842 + }, + { + "epoch": 0.4241230504771966, + "grad_norm": 1.7533084154129028, + "learning_rate": 0.00012900938977424627, + "loss": 1.4682, + "step": 11843 + }, + { + "epoch": 0.4241588626067649, + "grad_norm": 1.830748200416565, + "learning_rate": 0.00012899828934477545, + "loss": 1.6938, + "step": 11844 + }, + { + "epoch": 0.4241946747363332, + "grad_norm": 1.4886054992675781, + "learning_rate": 0.00012898718852517003, + "loss": 1.4325, + "step": 11845 + }, + { + "epoch": 0.4242304868659015, + "grad_norm": 1.547685146331787, + "learning_rate": 0.0001289760873155794, + "loss": 1.6207, + "step": 11846 + }, + { + "epoch": 0.42426629899546975, + "grad_norm": 1.416277289390564, + "learning_rate": 0.00012896498571615287, + "loss": 1.5239, + "step": 11847 + }, + { + "epoch": 0.4243021111250381, + "grad_norm": 1.5753027200698853, + "learning_rate": 0.00012895388372703985, + "loss": 1.6592, + "step": 11848 + }, + { + "epoch": 0.42433792325460634, + "grad_norm": 1.7635771036148071, + "learning_rate": 0.00012894278134838963, + "loss": 1.6179, + "step": 11849 + }, + { + "epoch": 0.4243737353841746, + "grad_norm": 1.644930362701416, + "learning_rate": 0.00012893167858035168, + "loss": 1.5943, + "step": 11850 + }, + { + "epoch": 0.4244095475137429, + "grad_norm": 1.333198070526123, + "learning_rate": 0.00012892057542307527, + "loss": 1.695, + "step": 11851 + }, + { + "epoch": 0.4244453596433112, + "grad_norm": 2.0017025470733643, + "learning_rate": 0.00012890947187670982, + "loss": 1.617, + "step": 11852 + }, + { + "epoch": 0.4244811717728795, + "grad_norm": 1.3186886310577393, + "learning_rate": 0.00012889836794140473, + "loss": 1.634, + "step": 11853 + }, + { + "epoch": 0.42451698390244774, + "grad_norm": 1.535506248474121, + "learning_rate": 0.00012888726361730935, + "loss": 1.5335, + "step": 11854 + }, + { + "epoch": 0.42455279603201607, + "grad_norm": 1.9215971231460571, + "learning_rate": 0.00012887615890457314, + "loss": 1.5718, + "step": 11855 + }, + { + "epoch": 0.42458860816158434, + "grad_norm": 1.4248054027557373, + "learning_rate": 0.00012886505380334544, + "loss": 1.4283, + "step": 11856 + }, + { + "epoch": 0.4246244202911526, + "grad_norm": 1.514243483543396, + "learning_rate": 0.00012885394831377569, + "loss": 1.6051, + "step": 11857 + }, + { + "epoch": 0.4246602324207209, + "grad_norm": 1.4963092803955078, + "learning_rate": 0.00012884284243601325, + "loss": 1.4867, + "step": 11858 + }, + { + "epoch": 0.4246960445502892, + "grad_norm": 1.4643319845199585, + "learning_rate": 0.0001288317361702076, + "loss": 1.4199, + "step": 11859 + }, + { + "epoch": 0.42473185667985747, + "grad_norm": 1.4797992706298828, + "learning_rate": 0.00012882062951650815, + "loss": 1.4582, + "step": 11860 + }, + { + "epoch": 0.42476766880942574, + "grad_norm": 2.1441917419433594, + "learning_rate": 0.00012880952247506426, + "loss": 1.4834, + "step": 11861 + }, + { + "epoch": 0.42480348093899406, + "grad_norm": 1.7394976615905762, + "learning_rate": 0.00012879841504602544, + "loss": 1.747, + "step": 11862 + }, + { + "epoch": 0.42483929306856233, + "grad_norm": 2.274009943008423, + "learning_rate": 0.00012878730722954104, + "loss": 1.596, + "step": 11863 + }, + { + "epoch": 0.4248751051981306, + "grad_norm": 1.643452525138855, + "learning_rate": 0.00012877619902576062, + "loss": 1.6008, + "step": 11864 + }, + { + "epoch": 0.42491091732769887, + "grad_norm": 1.7636380195617676, + "learning_rate": 0.0001287650904348335, + "loss": 1.037, + "step": 11865 + }, + { + "epoch": 0.4249467294572672, + "grad_norm": 1.6334855556488037, + "learning_rate": 0.00012875398145690924, + "loss": 1.278, + "step": 11866 + }, + { + "epoch": 0.42498254158683546, + "grad_norm": 1.7743029594421387, + "learning_rate": 0.00012874287209213724, + "loss": 1.373, + "step": 11867 + }, + { + "epoch": 0.42501835371640373, + "grad_norm": 1.3626136779785156, + "learning_rate": 0.00012873176234066698, + "loss": 1.5498, + "step": 11868 + }, + { + "epoch": 0.42505416584597205, + "grad_norm": 1.7889161109924316, + "learning_rate": 0.0001287206522026479, + "loss": 1.6937, + "step": 11869 + }, + { + "epoch": 0.4250899779755403, + "grad_norm": 1.5051333904266357, + "learning_rate": 0.0001287095416782295, + "loss": 1.4984, + "step": 11870 + }, + { + "epoch": 0.4251257901051086, + "grad_norm": 1.7558956146240234, + "learning_rate": 0.00012869843076756125, + "loss": 1.4792, + "step": 11871 + }, + { + "epoch": 0.42516160223467686, + "grad_norm": 1.8319754600524902, + "learning_rate": 0.0001286873194707926, + "loss": 1.943, + "step": 11872 + }, + { + "epoch": 0.4251974143642452, + "grad_norm": 1.6281830072402954, + "learning_rate": 0.0001286762077880731, + "loss": 1.3749, + "step": 11873 + }, + { + "epoch": 0.42523322649381345, + "grad_norm": 1.5572781562805176, + "learning_rate": 0.00012866509571955221, + "loss": 1.7843, + "step": 11874 + }, + { + "epoch": 0.4252690386233817, + "grad_norm": 1.4318057298660278, + "learning_rate": 0.00012865398326537944, + "loss": 1.423, + "step": 11875 + }, + { + "epoch": 0.42530485075295005, + "grad_norm": 1.6866233348846436, + "learning_rate": 0.0001286428704257043, + "loss": 1.545, + "step": 11876 + }, + { + "epoch": 0.4253406628825183, + "grad_norm": 1.7870521545410156, + "learning_rate": 0.00012863175720067627, + "loss": 1.7577, + "step": 11877 + }, + { + "epoch": 0.4253764750120866, + "grad_norm": 1.4595621824264526, + "learning_rate": 0.00012862064359044485, + "loss": 1.5902, + "step": 11878 + }, + { + "epoch": 0.42541228714165485, + "grad_norm": 1.636542797088623, + "learning_rate": 0.00012860952959515962, + "loss": 1.4817, + "step": 11879 + }, + { + "epoch": 0.4254480992712232, + "grad_norm": 2.6225593090057373, + "learning_rate": 0.00012859841521497008, + "loss": 1.208, + "step": 11880 + }, + { + "epoch": 0.42548391140079145, + "grad_norm": 1.9722484350204468, + "learning_rate": 0.00012858730045002572, + "loss": 1.7867, + "step": 11881 + }, + { + "epoch": 0.4255197235303597, + "grad_norm": 1.486351490020752, + "learning_rate": 0.00012857618530047615, + "loss": 1.5268, + "step": 11882 + }, + { + "epoch": 0.42555553565992804, + "grad_norm": 1.3822557926177979, + "learning_rate": 0.0001285650697664708, + "loss": 1.5033, + "step": 11883 + }, + { + "epoch": 0.4255913477894963, + "grad_norm": 1.99789559841156, + "learning_rate": 0.00012855395384815937, + "loss": 1.8533, + "step": 11884 + }, + { + "epoch": 0.4256271599190646, + "grad_norm": 1.781887173652649, + "learning_rate": 0.00012854283754569127, + "loss": 1.4333, + "step": 11885 + }, + { + "epoch": 0.42566297204863285, + "grad_norm": 1.5997289419174194, + "learning_rate": 0.00012853172085921613, + "loss": 1.4675, + "step": 11886 + }, + { + "epoch": 0.4256987841782012, + "grad_norm": 1.4423165321350098, + "learning_rate": 0.00012852060378888347, + "loss": 1.6605, + "step": 11887 + }, + { + "epoch": 0.42573459630776944, + "grad_norm": 2.1727519035339355, + "learning_rate": 0.00012850948633484288, + "loss": 1.4342, + "step": 11888 + }, + { + "epoch": 0.4257704084373377, + "grad_norm": 1.8773438930511475, + "learning_rate": 0.00012849836849724392, + "loss": 1.2294, + "step": 11889 + }, + { + "epoch": 0.42580622056690604, + "grad_norm": 1.5362021923065186, + "learning_rate": 0.0001284872502762362, + "loss": 1.6375, + "step": 11890 + }, + { + "epoch": 0.4258420326964743, + "grad_norm": 2.041426658630371, + "learning_rate": 0.00012847613167196923, + "loss": 1.4472, + "step": 11891 + }, + { + "epoch": 0.4258778448260426, + "grad_norm": 1.3776212930679321, + "learning_rate": 0.00012846501268459266, + "loss": 1.2878, + "step": 11892 + }, + { + "epoch": 0.42591365695561084, + "grad_norm": 1.8141348361968994, + "learning_rate": 0.00012845389331425606, + "loss": 1.2643, + "step": 11893 + }, + { + "epoch": 0.42594946908517917, + "grad_norm": 1.4836647510528564, + "learning_rate": 0.00012844277356110906, + "loss": 1.6488, + "step": 11894 + }, + { + "epoch": 0.42598528121474744, + "grad_norm": 1.3631209135055542, + "learning_rate": 0.0001284316534253012, + "loss": 1.3385, + "step": 11895 + }, + { + "epoch": 0.4260210933443157, + "grad_norm": 2.604870319366455, + "learning_rate": 0.0001284205329069821, + "loss": 1.5673, + "step": 11896 + }, + { + "epoch": 0.42605690547388403, + "grad_norm": 1.4967273473739624, + "learning_rate": 0.00012840941200630143, + "loss": 1.5662, + "step": 11897 + }, + { + "epoch": 0.4260927176034523, + "grad_norm": 1.547292709350586, + "learning_rate": 0.00012839829072340875, + "loss": 1.4669, + "step": 11898 + }, + { + "epoch": 0.42612852973302057, + "grad_norm": 1.751133918762207, + "learning_rate": 0.0001283871690584537, + "loss": 1.4935, + "step": 11899 + }, + { + "epoch": 0.42616434186258884, + "grad_norm": 1.6280288696289062, + "learning_rate": 0.00012837604701158587, + "loss": 1.7348, + "step": 11900 + }, + { + "epoch": 0.42620015399215716, + "grad_norm": 1.4411042928695679, + "learning_rate": 0.000128364924582955, + "loss": 1.5411, + "step": 11901 + }, + { + "epoch": 0.42623596612172543, + "grad_norm": 1.841700553894043, + "learning_rate": 0.00012835380177271058, + "loss": 1.4303, + "step": 11902 + }, + { + "epoch": 0.4262717782512937, + "grad_norm": 1.4292374849319458, + "learning_rate": 0.00012834267858100238, + "loss": 1.3253, + "step": 11903 + }, + { + "epoch": 0.426307590380862, + "grad_norm": 1.8277291059494019, + "learning_rate": 0.00012833155500798003, + "loss": 1.5402, + "step": 11904 + }, + { + "epoch": 0.4263434025104303, + "grad_norm": 1.7723889350891113, + "learning_rate": 0.0001283204310537931, + "loss": 1.5183, + "step": 11905 + }, + { + "epoch": 0.42637921463999856, + "grad_norm": 1.554281234741211, + "learning_rate": 0.00012830930671859132, + "loss": 1.3464, + "step": 11906 + }, + { + "epoch": 0.42641502676956683, + "grad_norm": 1.9932013750076294, + "learning_rate": 0.00012829818200252432, + "loss": 1.584, + "step": 11907 + }, + { + "epoch": 0.42645083889913515, + "grad_norm": 1.3307794332504272, + "learning_rate": 0.0001282870569057418, + "loss": 1.5602, + "step": 11908 + }, + { + "epoch": 0.4264866510287034, + "grad_norm": 1.5475578308105469, + "learning_rate": 0.0001282759314283934, + "loss": 1.3701, + "step": 11909 + }, + { + "epoch": 0.4265224631582717, + "grad_norm": 1.49881911277771, + "learning_rate": 0.00012826480557062884, + "loss": 1.339, + "step": 11910 + }, + { + "epoch": 0.42655827528784, + "grad_norm": 2.00813364982605, + "learning_rate": 0.00012825367933259774, + "loss": 1.5767, + "step": 11911 + }, + { + "epoch": 0.4265940874174083, + "grad_norm": 1.6309877634048462, + "learning_rate": 0.00012824255271444987, + "loss": 1.5292, + "step": 11912 + }, + { + "epoch": 0.42662989954697655, + "grad_norm": 1.6364531517028809, + "learning_rate": 0.00012823142571633488, + "loss": 1.6134, + "step": 11913 + }, + { + "epoch": 0.4266657116765448, + "grad_norm": 1.7823848724365234, + "learning_rate": 0.00012822029833840245, + "loss": 1.3748, + "step": 11914 + }, + { + "epoch": 0.42670152380611315, + "grad_norm": 1.6810747385025024, + "learning_rate": 0.00012820917058080234, + "loss": 1.6998, + "step": 11915 + }, + { + "epoch": 0.4267373359356814, + "grad_norm": 1.9501549005508423, + "learning_rate": 0.0001281980424436842, + "loss": 1.5711, + "step": 11916 + }, + { + "epoch": 0.4267731480652497, + "grad_norm": 1.8748064041137695, + "learning_rate": 0.00012818691392719778, + "loss": 1.5153, + "step": 11917 + }, + { + "epoch": 0.426808960194818, + "grad_norm": 1.5978293418884277, + "learning_rate": 0.00012817578503149276, + "loss": 1.6448, + "step": 11918 + }, + { + "epoch": 0.4268447723243863, + "grad_norm": 1.8714993000030518, + "learning_rate": 0.00012816465575671895, + "loss": 1.4063, + "step": 11919 + }, + { + "epoch": 0.42688058445395455, + "grad_norm": 1.4037456512451172, + "learning_rate": 0.000128153526103026, + "loss": 1.5125, + "step": 11920 + }, + { + "epoch": 0.4269163965835228, + "grad_norm": 1.7890756130218506, + "learning_rate": 0.00012814239607056367, + "loss": 1.3303, + "step": 11921 + }, + { + "epoch": 0.42695220871309114, + "grad_norm": 2.235222578048706, + "learning_rate": 0.00012813126565948171, + "loss": 1.1991, + "step": 11922 + }, + { + "epoch": 0.4269880208426594, + "grad_norm": 2.393725633621216, + "learning_rate": 0.00012812013486992985, + "loss": 1.7267, + "step": 11923 + }, + { + "epoch": 0.4270238329722277, + "grad_norm": 1.7477926015853882, + "learning_rate": 0.00012810900370205784, + "loss": 1.7094, + "step": 11924 + }, + { + "epoch": 0.427059645101796, + "grad_norm": 1.6497920751571655, + "learning_rate": 0.00012809787215601546, + "loss": 1.6286, + "step": 11925 + }, + { + "epoch": 0.4270954572313643, + "grad_norm": 1.7119648456573486, + "learning_rate": 0.00012808674023195244, + "loss": 1.5985, + "step": 11926 + }, + { + "epoch": 0.42713126936093254, + "grad_norm": 1.2613149881362915, + "learning_rate": 0.00012807560793001856, + "loss": 1.426, + "step": 11927 + }, + { + "epoch": 0.4271670814905008, + "grad_norm": 1.6869969367980957, + "learning_rate": 0.0001280644752503636, + "loss": 1.4147, + "step": 11928 + }, + { + "epoch": 0.42720289362006914, + "grad_norm": 1.247328758239746, + "learning_rate": 0.00012805334219313734, + "loss": 1.5805, + "step": 11929 + }, + { + "epoch": 0.4272387057496374, + "grad_norm": 2.01601243019104, + "learning_rate": 0.00012804220875848953, + "loss": 1.4124, + "step": 11930 + }, + { + "epoch": 0.4272745178792057, + "grad_norm": 2.005664587020874, + "learning_rate": 0.00012803107494657, + "loss": 1.4524, + "step": 11931 + }, + { + "epoch": 0.427310330008774, + "grad_norm": 2.120532274246216, + "learning_rate": 0.0001280199407575285, + "loss": 1.7616, + "step": 11932 + }, + { + "epoch": 0.42734614213834227, + "grad_norm": 1.7293062210083008, + "learning_rate": 0.0001280088061915148, + "loss": 1.3992, + "step": 11933 + }, + { + "epoch": 0.42738195426791054, + "grad_norm": 2.858889579772949, + "learning_rate": 0.00012799767124867874, + "loss": 1.418, + "step": 11934 + }, + { + "epoch": 0.4274177663974788, + "grad_norm": 3.2688639163970947, + "learning_rate": 0.00012798653592917017, + "loss": 1.47, + "step": 11935 + }, + { + "epoch": 0.42745357852704713, + "grad_norm": 1.553168773651123, + "learning_rate": 0.00012797540023313882, + "loss": 1.6188, + "step": 11936 + }, + { + "epoch": 0.4274893906566154, + "grad_norm": 1.8382031917572021, + "learning_rate": 0.0001279642641607346, + "loss": 1.5762, + "step": 11937 + }, + { + "epoch": 0.42752520278618367, + "grad_norm": 1.7807074785232544, + "learning_rate": 0.00012795312771210726, + "loss": 1.4538, + "step": 11938 + }, + { + "epoch": 0.427561014915752, + "grad_norm": 1.8465766906738281, + "learning_rate": 0.00012794199088740665, + "loss": 1.3205, + "step": 11939 + }, + { + "epoch": 0.42759682704532026, + "grad_norm": 1.5674152374267578, + "learning_rate": 0.00012793085368678254, + "loss": 1.5163, + "step": 11940 + }, + { + "epoch": 0.42763263917488853, + "grad_norm": 1.7111066579818726, + "learning_rate": 0.00012791971611038488, + "loss": 1.2398, + "step": 11941 + }, + { + "epoch": 0.4276684513044568, + "grad_norm": 1.754284381866455, + "learning_rate": 0.00012790857815836342, + "loss": 1.4891, + "step": 11942 + }, + { + "epoch": 0.4277042634340251, + "grad_norm": 1.5504387617111206, + "learning_rate": 0.00012789743983086807, + "loss": 1.4967, + "step": 11943 + }, + { + "epoch": 0.4277400755635934, + "grad_norm": 1.2893853187561035, + "learning_rate": 0.00012788630112804862, + "loss": 1.4526, + "step": 11944 + }, + { + "epoch": 0.42777588769316166, + "grad_norm": 1.9921671152114868, + "learning_rate": 0.000127875162050055, + "loss": 1.4574, + "step": 11945 + }, + { + "epoch": 0.42781169982273, + "grad_norm": 1.7197723388671875, + "learning_rate": 0.00012786402259703697, + "loss": 1.2273, + "step": 11946 + }, + { + "epoch": 0.42784751195229825, + "grad_norm": 2.255772590637207, + "learning_rate": 0.0001278528827691445, + "loss": 1.456, + "step": 11947 + }, + { + "epoch": 0.4278833240818665, + "grad_norm": 2.25209903717041, + "learning_rate": 0.00012784174256652743, + "loss": 1.4299, + "step": 11948 + }, + { + "epoch": 0.4279191362114348, + "grad_norm": 1.667028784751892, + "learning_rate": 0.0001278306019893356, + "loss": 1.021, + "step": 11949 + }, + { + "epoch": 0.4279549483410031, + "grad_norm": 1.792156457901001, + "learning_rate": 0.00012781946103771892, + "loss": 1.347, + "step": 11950 + }, + { + "epoch": 0.4279907604705714, + "grad_norm": 1.6226983070373535, + "learning_rate": 0.0001278083197118273, + "loss": 1.6862, + "step": 11951 + }, + { + "epoch": 0.42802657260013965, + "grad_norm": 1.4472019672393799, + "learning_rate": 0.00012779717801181058, + "loss": 1.6126, + "step": 11952 + }, + { + "epoch": 0.428062384729708, + "grad_norm": 2.137946844100952, + "learning_rate": 0.0001277860359378187, + "loss": 1.5559, + "step": 11953 + }, + { + "epoch": 0.42809819685927625, + "grad_norm": 2.2679343223571777, + "learning_rate": 0.00012777489349000156, + "loss": 1.5858, + "step": 11954 + }, + { + "epoch": 0.4281340089888445, + "grad_norm": 1.3759374618530273, + "learning_rate": 0.00012776375066850902, + "loss": 1.4777, + "step": 11955 + }, + { + "epoch": 0.4281698211184128, + "grad_norm": 1.530863642692566, + "learning_rate": 0.00012775260747349107, + "loss": 1.1968, + "step": 11956 + }, + { + "epoch": 0.4282056332479811, + "grad_norm": 1.4429298639297485, + "learning_rate": 0.0001277414639050976, + "loss": 1.6555, + "step": 11957 + }, + { + "epoch": 0.4282414453775494, + "grad_norm": 2.4273345470428467, + "learning_rate": 0.00012773031996347845, + "loss": 1.3997, + "step": 11958 + }, + { + "epoch": 0.42827725750711765, + "grad_norm": 1.5408649444580078, + "learning_rate": 0.00012771917564878367, + "loss": 1.3479, + "step": 11959 + }, + { + "epoch": 0.428313069636686, + "grad_norm": 1.3045849800109863, + "learning_rate": 0.00012770803096116308, + "loss": 1.3514, + "step": 11960 + }, + { + "epoch": 0.42834888176625424, + "grad_norm": 1.5886048078536987, + "learning_rate": 0.00012769688590076673, + "loss": 1.3276, + "step": 11961 + }, + { + "epoch": 0.4283846938958225, + "grad_norm": 1.6271336078643799, + "learning_rate": 0.00012768574046774446, + "loss": 1.6032, + "step": 11962 + }, + { + "epoch": 0.4284205060253908, + "grad_norm": 1.9809879064559937, + "learning_rate": 0.00012767459466224632, + "loss": 1.6735, + "step": 11963 + }, + { + "epoch": 0.4284563181549591, + "grad_norm": 1.7093204259872437, + "learning_rate": 0.00012766344848442218, + "loss": 1.7656, + "step": 11964 + }, + { + "epoch": 0.4284921302845274, + "grad_norm": 1.7945045232772827, + "learning_rate": 0.00012765230193442198, + "loss": 1.5466, + "step": 11965 + }, + { + "epoch": 0.42852794241409564, + "grad_norm": 2.3185229301452637, + "learning_rate": 0.0001276411550123958, + "loss": 1.4034, + "step": 11966 + }, + { + "epoch": 0.42856375454366397, + "grad_norm": 1.5377254486083984, + "learning_rate": 0.00012763000771849348, + "loss": 1.7145, + "step": 11967 + }, + { + "epoch": 0.42859956667323224, + "grad_norm": 1.9657632112503052, + "learning_rate": 0.00012761886005286508, + "loss": 1.5286, + "step": 11968 + }, + { + "epoch": 0.4286353788028005, + "grad_norm": 1.495625376701355, + "learning_rate": 0.0001276077120156605, + "loss": 1.3016, + "step": 11969 + }, + { + "epoch": 0.4286711909323688, + "grad_norm": 1.7579660415649414, + "learning_rate": 0.0001275965636070298, + "loss": 1.2155, + "step": 11970 + }, + { + "epoch": 0.4287070030619371, + "grad_norm": 2.0565755367279053, + "learning_rate": 0.00012758541482712295, + "loss": 1.3118, + "step": 11971 + }, + { + "epoch": 0.42874281519150537, + "grad_norm": 1.7184526920318604, + "learning_rate": 0.0001275742656760899, + "loss": 1.3649, + "step": 11972 + }, + { + "epoch": 0.42877862732107364, + "grad_norm": 1.7197214365005493, + "learning_rate": 0.00012756311615408068, + "loss": 1.2731, + "step": 11973 + }, + { + "epoch": 0.42881443945064196, + "grad_norm": 1.3079050779342651, + "learning_rate": 0.0001275519662612453, + "loss": 1.3794, + "step": 11974 + }, + { + "epoch": 0.42885025158021023, + "grad_norm": 1.7446621656417847, + "learning_rate": 0.00012754081599773373, + "loss": 1.6582, + "step": 11975 + }, + { + "epoch": 0.4288860637097785, + "grad_norm": 1.5403920412063599, + "learning_rate": 0.00012752966536369603, + "loss": 1.3348, + "step": 11976 + }, + { + "epoch": 0.42892187583934677, + "grad_norm": 1.4818763732910156, + "learning_rate": 0.00012751851435928216, + "loss": 1.5335, + "step": 11977 + }, + { + "epoch": 0.4289576879689151, + "grad_norm": 1.5109602212905884, + "learning_rate": 0.00012750736298464216, + "loss": 1.3644, + "step": 11978 + }, + { + "epoch": 0.42899350009848336, + "grad_norm": 2.0222465991973877, + "learning_rate": 0.00012749621123992613, + "loss": 1.3843, + "step": 11979 + }, + { + "epoch": 0.42902931222805163, + "grad_norm": 1.4966095685958862, + "learning_rate": 0.000127485059125284, + "loss": 1.337, + "step": 11980 + }, + { + "epoch": 0.42906512435761995, + "grad_norm": 1.5781891345977783, + "learning_rate": 0.0001274739066408659, + "loss": 1.6262, + "step": 11981 + }, + { + "epoch": 0.4291009364871882, + "grad_norm": 1.5741047859191895, + "learning_rate": 0.0001274627537868218, + "loss": 1.5799, + "step": 11982 + }, + { + "epoch": 0.4291367486167565, + "grad_norm": 1.477160096168518, + "learning_rate": 0.00012745160056330178, + "loss": 1.658, + "step": 11983 + }, + { + "epoch": 0.42917256074632476, + "grad_norm": 2.298166036605835, + "learning_rate": 0.00012744044697045586, + "loss": 1.9077, + "step": 11984 + }, + { + "epoch": 0.4292083728758931, + "grad_norm": 1.8923615217208862, + "learning_rate": 0.00012742929300843417, + "loss": 1.634, + "step": 11985 + }, + { + "epoch": 0.42924418500546135, + "grad_norm": 2.1138875484466553, + "learning_rate": 0.00012741813867738665, + "loss": 1.8653, + "step": 11986 + }, + { + "epoch": 0.4292799971350296, + "grad_norm": 2.7330331802368164, + "learning_rate": 0.00012740698397746352, + "loss": 1.5775, + "step": 11987 + }, + { + "epoch": 0.4293158092645979, + "grad_norm": 1.834524154663086, + "learning_rate": 0.0001273958289088147, + "loss": 1.6638, + "step": 11988 + }, + { + "epoch": 0.4293516213941662, + "grad_norm": 1.6845818758010864, + "learning_rate": 0.0001273846734715904, + "loss": 1.3616, + "step": 11989 + }, + { + "epoch": 0.4293874335237345, + "grad_norm": 1.5297510623931885, + "learning_rate": 0.0001273735176659406, + "loss": 1.2825, + "step": 11990 + }, + { + "epoch": 0.42942324565330275, + "grad_norm": 1.622412085533142, + "learning_rate": 0.00012736236149201547, + "loss": 1.5248, + "step": 11991 + }, + { + "epoch": 0.4294590577828711, + "grad_norm": 1.8445333242416382, + "learning_rate": 0.000127351204949965, + "loss": 1.6047, + "step": 11992 + }, + { + "epoch": 0.42949486991243935, + "grad_norm": 1.7746164798736572, + "learning_rate": 0.0001273400480399394, + "loss": 1.3141, + "step": 11993 + }, + { + "epoch": 0.4295306820420076, + "grad_norm": 1.7072081565856934, + "learning_rate": 0.00012732889076208872, + "loss": 1.5455, + "step": 11994 + }, + { + "epoch": 0.4295664941715759, + "grad_norm": 1.5068244934082031, + "learning_rate": 0.00012731773311656304, + "loss": 1.6053, + "step": 11995 + }, + { + "epoch": 0.4296023063011442, + "grad_norm": 1.6956735849380493, + "learning_rate": 0.00012730657510351252, + "loss": 1.6389, + "step": 11996 + }, + { + "epoch": 0.4296381184307125, + "grad_norm": 1.3766717910766602, + "learning_rate": 0.00012729541672308722, + "loss": 1.3844, + "step": 11997 + }, + { + "epoch": 0.42967393056028075, + "grad_norm": 1.9688079357147217, + "learning_rate": 0.00012728425797543731, + "loss": 1.5564, + "step": 11998 + }, + { + "epoch": 0.4297097426898491, + "grad_norm": 1.6623727083206177, + "learning_rate": 0.00012727309886071292, + "loss": 1.1367, + "step": 11999 + }, + { + "epoch": 0.42974555481941734, + "grad_norm": 1.2293025255203247, + "learning_rate": 0.00012726193937906416, + "loss": 1.4147, + "step": 12000 + }, + { + "epoch": 0.4297813669489856, + "grad_norm": 2.1265058517456055, + "learning_rate": 0.00012725077953064119, + "loss": 1.5568, + "step": 12001 + }, + { + "epoch": 0.4298171790785539, + "grad_norm": 1.5156117677688599, + "learning_rate": 0.0001272396193155941, + "loss": 1.3789, + "step": 12002 + }, + { + "epoch": 0.4298529912081222, + "grad_norm": 2.3674230575561523, + "learning_rate": 0.0001272284587340731, + "loss": 1.4465, + "step": 12003 + }, + { + "epoch": 0.4298888033376905, + "grad_norm": 1.604242205619812, + "learning_rate": 0.00012721729778622826, + "loss": 1.6652, + "step": 12004 + }, + { + "epoch": 0.42992461546725874, + "grad_norm": 1.3439925909042358, + "learning_rate": 0.00012720613647220985, + "loss": 1.2755, + "step": 12005 + }, + { + "epoch": 0.42996042759682707, + "grad_norm": 3.2509360313415527, + "learning_rate": 0.0001271949747921679, + "loss": 1.6501, + "step": 12006 + }, + { + "epoch": 0.42999623972639534, + "grad_norm": 1.9098608493804932, + "learning_rate": 0.0001271838127462527, + "loss": 1.5589, + "step": 12007 + }, + { + "epoch": 0.4300320518559636, + "grad_norm": 2.2342143058776855, + "learning_rate": 0.0001271726503346143, + "loss": 1.221, + "step": 12008 + }, + { + "epoch": 0.4300678639855319, + "grad_norm": 1.2617571353912354, + "learning_rate": 0.00012716148755740302, + "loss": 1.5833, + "step": 12009 + }, + { + "epoch": 0.4301036761151002, + "grad_norm": 1.4430612325668335, + "learning_rate": 0.00012715032441476892, + "loss": 1.5831, + "step": 12010 + }, + { + "epoch": 0.43013948824466847, + "grad_norm": 1.642476201057434, + "learning_rate": 0.00012713916090686223, + "loss": 1.626, + "step": 12011 + }, + { + "epoch": 0.43017530037423674, + "grad_norm": 1.5486873388290405, + "learning_rate": 0.00012712799703383314, + "loss": 1.4034, + "step": 12012 + }, + { + "epoch": 0.43021111250380506, + "grad_norm": 2.0201334953308105, + "learning_rate": 0.00012711683279583181, + "loss": 1.557, + "step": 12013 + }, + { + "epoch": 0.43024692463337333, + "grad_norm": 1.413825511932373, + "learning_rate": 0.00012710566819300854, + "loss": 1.6422, + "step": 12014 + }, + { + "epoch": 0.4302827367629416, + "grad_norm": 1.2569329738616943, + "learning_rate": 0.00012709450322551338, + "loss": 1.4474, + "step": 12015 + }, + { + "epoch": 0.43031854889250987, + "grad_norm": 1.874431848526001, + "learning_rate": 0.00012708333789349671, + "loss": 1.6731, + "step": 12016 + }, + { + "epoch": 0.4303543610220782, + "grad_norm": 1.6098730564117432, + "learning_rate": 0.00012707217219710864, + "loss": 1.416, + "step": 12017 + }, + { + "epoch": 0.43039017315164646, + "grad_norm": 1.6854571104049683, + "learning_rate": 0.0001270610061364994, + "loss": 1.4469, + "step": 12018 + }, + { + "epoch": 0.43042598528121473, + "grad_norm": 1.5974562168121338, + "learning_rate": 0.00012704983971181924, + "loss": 1.3327, + "step": 12019 + }, + { + "epoch": 0.43046179741078305, + "grad_norm": 1.5737115144729614, + "learning_rate": 0.00012703867292321837, + "loss": 1.779, + "step": 12020 + }, + { + "epoch": 0.4304976095403513, + "grad_norm": 1.6570041179656982, + "learning_rate": 0.00012702750577084704, + "loss": 1.5277, + "step": 12021 + }, + { + "epoch": 0.4305334216699196, + "grad_norm": 1.3756600618362427, + "learning_rate": 0.00012701633825485547, + "loss": 1.616, + "step": 12022 + }, + { + "epoch": 0.43056923379948786, + "grad_norm": 1.5845674276351929, + "learning_rate": 0.00012700517037539394, + "loss": 1.7271, + "step": 12023 + }, + { + "epoch": 0.4306050459290562, + "grad_norm": 1.8880497217178345, + "learning_rate": 0.00012699400213261262, + "loss": 1.512, + "step": 12024 + }, + { + "epoch": 0.43064085805862445, + "grad_norm": 1.240716576576233, + "learning_rate": 0.0001269828335266619, + "loss": 1.258, + "step": 12025 + }, + { + "epoch": 0.4306766701881927, + "grad_norm": 1.8755682706832886, + "learning_rate": 0.00012697166455769192, + "loss": 1.6694, + "step": 12026 + }, + { + "epoch": 0.43071248231776105, + "grad_norm": 1.8921705484390259, + "learning_rate": 0.000126960495225853, + "loss": 1.3913, + "step": 12027 + }, + { + "epoch": 0.4307482944473293, + "grad_norm": 1.7119035720825195, + "learning_rate": 0.00012694932553129537, + "loss": 1.3978, + "step": 12028 + }, + { + "epoch": 0.4307841065768976, + "grad_norm": 1.7253758907318115, + "learning_rate": 0.00012693815547416934, + "loss": 1.5819, + "step": 12029 + }, + { + "epoch": 0.43081991870646585, + "grad_norm": 1.5359183549880981, + "learning_rate": 0.00012692698505462516, + "loss": 1.3002, + "step": 12030 + }, + { + "epoch": 0.4308557308360342, + "grad_norm": 1.8340015411376953, + "learning_rate": 0.00012691581427281317, + "loss": 1.536, + "step": 12031 + }, + { + "epoch": 0.43089154296560245, + "grad_norm": 1.6742093563079834, + "learning_rate": 0.00012690464312888357, + "loss": 1.3717, + "step": 12032 + }, + { + "epoch": 0.4309273550951707, + "grad_norm": 2.0416200160980225, + "learning_rate": 0.0001268934716229867, + "loss": 1.751, + "step": 12033 + }, + { + "epoch": 0.43096316722473904, + "grad_norm": 1.9123685359954834, + "learning_rate": 0.0001268822997552729, + "loss": 1.5788, + "step": 12034 + }, + { + "epoch": 0.4309989793543073, + "grad_norm": 1.4536606073379517, + "learning_rate": 0.00012687112752589243, + "loss": 1.5731, + "step": 12035 + }, + { + "epoch": 0.4310347914838756, + "grad_norm": 1.5964137315750122, + "learning_rate": 0.00012685995493499558, + "loss": 1.2681, + "step": 12036 + }, + { + "epoch": 0.43107060361344385, + "grad_norm": 1.6218794584274292, + "learning_rate": 0.00012684878198273268, + "loss": 1.449, + "step": 12037 + }, + { + "epoch": 0.4311064157430122, + "grad_norm": 1.6466941833496094, + "learning_rate": 0.00012683760866925408, + "loss": 1.5735, + "step": 12038 + }, + { + "epoch": 0.43114222787258044, + "grad_norm": 1.5569286346435547, + "learning_rate": 0.00012682643499471003, + "loss": 1.4015, + "step": 12039 + }, + { + "epoch": 0.4311780400021487, + "grad_norm": 1.7250436544418335, + "learning_rate": 0.00012681526095925094, + "loss": 1.4717, + "step": 12040 + }, + { + "epoch": 0.43121385213171703, + "grad_norm": 1.3829773664474487, + "learning_rate": 0.00012680408656302707, + "loss": 1.5221, + "step": 12041 + }, + { + "epoch": 0.4312496642612853, + "grad_norm": 1.6153755187988281, + "learning_rate": 0.00012679291180618882, + "loss": 1.6274, + "step": 12042 + }, + { + "epoch": 0.4312854763908536, + "grad_norm": 1.467158555984497, + "learning_rate": 0.00012678173668888645, + "loss": 1.4342, + "step": 12043 + }, + { + "epoch": 0.43132128852042184, + "grad_norm": 1.740787148475647, + "learning_rate": 0.00012677056121127042, + "loss": 1.855, + "step": 12044 + }, + { + "epoch": 0.43135710064999017, + "grad_norm": 1.7290964126586914, + "learning_rate": 0.000126759385373491, + "loss": 1.5766, + "step": 12045 + }, + { + "epoch": 0.43139291277955844, + "grad_norm": 1.2313038110733032, + "learning_rate": 0.00012674820917569856, + "loss": 1.0122, + "step": 12046 + }, + { + "epoch": 0.4314287249091267, + "grad_norm": 1.7760591506958008, + "learning_rate": 0.0001267370326180435, + "loss": 1.4813, + "step": 12047 + }, + { + "epoch": 0.43146453703869503, + "grad_norm": 2.26284122467041, + "learning_rate": 0.0001267258557006761, + "loss": 1.6065, + "step": 12048 + }, + { + "epoch": 0.4315003491682633, + "grad_norm": 1.742746114730835, + "learning_rate": 0.00012671467842374683, + "loss": 1.8581, + "step": 12049 + }, + { + "epoch": 0.43153616129783157, + "grad_norm": 2.243811845779419, + "learning_rate": 0.000126703500787406, + "loss": 1.4152, + "step": 12050 + }, + { + "epoch": 0.43157197342739984, + "grad_norm": 1.9817683696746826, + "learning_rate": 0.000126692322791804, + "loss": 1.4592, + "step": 12051 + }, + { + "epoch": 0.43160778555696816, + "grad_norm": 1.667299509048462, + "learning_rate": 0.00012668114443709124, + "loss": 1.8244, + "step": 12052 + }, + { + "epoch": 0.43164359768653643, + "grad_norm": 1.9919838905334473, + "learning_rate": 0.0001266699657234181, + "loss": 1.7562, + "step": 12053 + }, + { + "epoch": 0.4316794098161047, + "grad_norm": 1.6336984634399414, + "learning_rate": 0.000126658786650935, + "loss": 1.5799, + "step": 12054 + }, + { + "epoch": 0.431715221945673, + "grad_norm": 1.2953890562057495, + "learning_rate": 0.00012664760721979227, + "loss": 1.2287, + "step": 12055 + }, + { + "epoch": 0.4317510340752413, + "grad_norm": 1.822380781173706, + "learning_rate": 0.00012663642743014037, + "loss": 1.2884, + "step": 12056 + }, + { + "epoch": 0.43178684620480956, + "grad_norm": 2.05786395072937, + "learning_rate": 0.0001266252472821297, + "loss": 1.4525, + "step": 12057 + }, + { + "epoch": 0.43182265833437783, + "grad_norm": 1.759456992149353, + "learning_rate": 0.00012661406677591067, + "loss": 1.2702, + "step": 12058 + }, + { + "epoch": 0.43185847046394615, + "grad_norm": 1.612596035003662, + "learning_rate": 0.00012660288591163373, + "loss": 1.5832, + "step": 12059 + }, + { + "epoch": 0.4318942825935144, + "grad_norm": 1.3318983316421509, + "learning_rate": 0.00012659170468944924, + "loss": 1.2931, + "step": 12060 + }, + { + "epoch": 0.4319300947230827, + "grad_norm": 1.8278427124023438, + "learning_rate": 0.00012658052310950767, + "loss": 1.6496, + "step": 12061 + }, + { + "epoch": 0.431965906852651, + "grad_norm": 1.4100539684295654, + "learning_rate": 0.00012656934117195946, + "loss": 1.6082, + "step": 12062 + }, + { + "epoch": 0.4320017189822193, + "grad_norm": 1.8377265930175781, + "learning_rate": 0.00012655815887695503, + "loss": 1.5018, + "step": 12063 + }, + { + "epoch": 0.43203753111178755, + "grad_norm": 1.7149466276168823, + "learning_rate": 0.00012654697622464483, + "loss": 1.6423, + "step": 12064 + }, + { + "epoch": 0.4320733432413558, + "grad_norm": 1.707377552986145, + "learning_rate": 0.0001265357932151793, + "loss": 1.5497, + "step": 12065 + }, + { + "epoch": 0.43210915537092415, + "grad_norm": 1.3086154460906982, + "learning_rate": 0.0001265246098487089, + "loss": 1.3625, + "step": 12066 + }, + { + "epoch": 0.4321449675004924, + "grad_norm": 1.9298018217086792, + "learning_rate": 0.0001265134261253841, + "loss": 1.4781, + "step": 12067 + }, + { + "epoch": 0.4321807796300607, + "grad_norm": 1.6140490770339966, + "learning_rate": 0.00012650224204535535, + "loss": 1.5533, + "step": 12068 + }, + { + "epoch": 0.432216591759629, + "grad_norm": 1.5894731283187866, + "learning_rate": 0.00012649105760877312, + "loss": 1.3304, + "step": 12069 + }, + { + "epoch": 0.4322524038891973, + "grad_norm": 1.940897822380066, + "learning_rate": 0.00012647987281578789, + "loss": 1.7295, + "step": 12070 + }, + { + "epoch": 0.43228821601876555, + "grad_norm": 1.6926662921905518, + "learning_rate": 0.0001264686876665501, + "loss": 1.2571, + "step": 12071 + }, + { + "epoch": 0.4323240281483338, + "grad_norm": 1.5556963682174683, + "learning_rate": 0.00012645750216121028, + "loss": 1.3275, + "step": 12072 + }, + { + "epoch": 0.43235984027790214, + "grad_norm": 1.9698765277862549, + "learning_rate": 0.0001264463162999189, + "loss": 1.517, + "step": 12073 + }, + { + "epoch": 0.4323956524074704, + "grad_norm": 1.5589221715927124, + "learning_rate": 0.00012643513008282645, + "loss": 1.5866, + "step": 12074 + }, + { + "epoch": 0.4324314645370387, + "grad_norm": 1.5207163095474243, + "learning_rate": 0.00012642394351008337, + "loss": 1.5868, + "step": 12075 + }, + { + "epoch": 0.432467276666607, + "grad_norm": 1.5345786809921265, + "learning_rate": 0.00012641275658184026, + "loss": 1.5199, + "step": 12076 + }, + { + "epoch": 0.4325030887961753, + "grad_norm": 1.7944084405899048, + "learning_rate": 0.00012640156929824757, + "loss": 1.627, + "step": 12077 + }, + { + "epoch": 0.43253890092574354, + "grad_norm": 1.890235185623169, + "learning_rate": 0.00012639038165945584, + "loss": 1.4801, + "step": 12078 + }, + { + "epoch": 0.4325747130553118, + "grad_norm": 1.4263070821762085, + "learning_rate": 0.00012637919366561556, + "loss": 1.4128, + "step": 12079 + }, + { + "epoch": 0.43261052518488013, + "grad_norm": 1.502324104309082, + "learning_rate": 0.00012636800531687728, + "loss": 1.3671, + "step": 12080 + }, + { + "epoch": 0.4326463373144484, + "grad_norm": 2.5059921741485596, + "learning_rate": 0.00012635681661339146, + "loss": 1.7858, + "step": 12081 + }, + { + "epoch": 0.4326821494440167, + "grad_norm": 1.9203873872756958, + "learning_rate": 0.00012634562755530867, + "loss": 1.7033, + "step": 12082 + }, + { + "epoch": 0.432717961573585, + "grad_norm": 1.613374948501587, + "learning_rate": 0.00012633443814277946, + "loss": 1.1675, + "step": 12083 + }, + { + "epoch": 0.43275377370315327, + "grad_norm": 1.6708298921585083, + "learning_rate": 0.00012632324837595434, + "loss": 1.5178, + "step": 12084 + }, + { + "epoch": 0.43278958583272154, + "grad_norm": 1.9573644399642944, + "learning_rate": 0.00012631205825498388, + "loss": 1.3646, + "step": 12085 + }, + { + "epoch": 0.4328253979622898, + "grad_norm": 1.6646674871444702, + "learning_rate": 0.0001263008677800186, + "loss": 1.2774, + "step": 12086 + }, + { + "epoch": 0.43286121009185813, + "grad_norm": 1.70304536819458, + "learning_rate": 0.0001262896769512091, + "loss": 1.2577, + "step": 12087 + }, + { + "epoch": 0.4328970222214264, + "grad_norm": 2.205632209777832, + "learning_rate": 0.0001262784857687059, + "loss": 1.7386, + "step": 12088 + }, + { + "epoch": 0.43293283435099467, + "grad_norm": 1.4651166200637817, + "learning_rate": 0.00012626729423265956, + "loss": 1.5016, + "step": 12089 + }, + { + "epoch": 0.432968646480563, + "grad_norm": 2.221482992172241, + "learning_rate": 0.00012625610234322064, + "loss": 1.6606, + "step": 12090 + }, + { + "epoch": 0.43300445861013126, + "grad_norm": 1.4816288948059082, + "learning_rate": 0.00012624491010053976, + "loss": 1.5373, + "step": 12091 + }, + { + "epoch": 0.43304027073969953, + "grad_norm": 3.446542978286743, + "learning_rate": 0.00012623371750476747, + "loss": 2.0685, + "step": 12092 + }, + { + "epoch": 0.4330760828692678, + "grad_norm": 1.557287573814392, + "learning_rate": 0.00012622252455605435, + "loss": 1.4696, + "step": 12093 + }, + { + "epoch": 0.4331118949988361, + "grad_norm": 1.9975833892822266, + "learning_rate": 0.00012621133125455093, + "loss": 1.5988, + "step": 12094 + }, + { + "epoch": 0.4331477071284044, + "grad_norm": 1.311399221420288, + "learning_rate": 0.0001262001376004079, + "loss": 1.2518, + "step": 12095 + }, + { + "epoch": 0.43318351925797266, + "grad_norm": 1.9268242120742798, + "learning_rate": 0.00012618894359377585, + "loss": 1.3004, + "step": 12096 + }, + { + "epoch": 0.433219331387541, + "grad_norm": 1.7908586263656616, + "learning_rate": 0.0001261777492348053, + "loss": 1.5952, + "step": 12097 + }, + { + "epoch": 0.43325514351710925, + "grad_norm": 1.4732061624526978, + "learning_rate": 0.00012616655452364693, + "loss": 1.6161, + "step": 12098 + }, + { + "epoch": 0.4332909556466775, + "grad_norm": 1.9595246315002441, + "learning_rate": 0.0001261553594604513, + "loss": 1.5878, + "step": 12099 + }, + { + "epoch": 0.4333267677762458, + "grad_norm": 1.7076550722122192, + "learning_rate": 0.00012614416404536905, + "loss": 1.602, + "step": 12100 + }, + { + "epoch": 0.4333625799058141, + "grad_norm": 1.8889092206954956, + "learning_rate": 0.00012613296827855078, + "loss": 1.4688, + "step": 12101 + }, + { + "epoch": 0.4333983920353824, + "grad_norm": 1.7788153886795044, + "learning_rate": 0.0001261217721601472, + "loss": 1.7163, + "step": 12102 + }, + { + "epoch": 0.43343420416495065, + "grad_norm": 1.702117919921875, + "learning_rate": 0.00012611057569030876, + "loss": 1.3403, + "step": 12103 + }, + { + "epoch": 0.433470016294519, + "grad_norm": 1.5412400960922241, + "learning_rate": 0.0001260993788691863, + "loss": 1.5185, + "step": 12104 + }, + { + "epoch": 0.43350582842408725, + "grad_norm": 1.8668831586837769, + "learning_rate": 0.00012608818169693032, + "loss": 1.6909, + "step": 12105 + }, + { + "epoch": 0.4335416405536555, + "grad_norm": 2.0278031826019287, + "learning_rate": 0.00012607698417369152, + "loss": 1.4961, + "step": 12106 + }, + { + "epoch": 0.4335774526832238, + "grad_norm": 1.955277919769287, + "learning_rate": 0.00012606578629962054, + "loss": 1.8627, + "step": 12107 + }, + { + "epoch": 0.4336132648127921, + "grad_norm": 1.6680927276611328, + "learning_rate": 0.00012605458807486797, + "loss": 1.3742, + "step": 12108 + }, + { + "epoch": 0.4336490769423604, + "grad_norm": 1.52384352684021, + "learning_rate": 0.0001260433894995846, + "loss": 1.182, + "step": 12109 + }, + { + "epoch": 0.43368488907192865, + "grad_norm": 1.390926718711853, + "learning_rate": 0.00012603219057392097, + "loss": 1.4092, + "step": 12110 + }, + { + "epoch": 0.43372070120149697, + "grad_norm": 1.52166748046875, + "learning_rate": 0.0001260209912980278, + "loss": 1.6895, + "step": 12111 + }, + { + "epoch": 0.43375651333106524, + "grad_norm": 1.4530707597732544, + "learning_rate": 0.0001260097916720558, + "loss": 1.0416, + "step": 12112 + }, + { + "epoch": 0.4337923254606335, + "grad_norm": 1.5703519582748413, + "learning_rate": 0.00012599859169615558, + "loss": 2.0043, + "step": 12113 + }, + { + "epoch": 0.4338281375902018, + "grad_norm": 1.6512939929962158, + "learning_rate": 0.00012598739137047784, + "loss": 1.549, + "step": 12114 + }, + { + "epoch": 0.4338639497197701, + "grad_norm": 2.4029769897460938, + "learning_rate": 0.00012597619069517328, + "loss": 1.8357, + "step": 12115 + }, + { + "epoch": 0.4338997618493384, + "grad_norm": 1.5254900455474854, + "learning_rate": 0.00012596498967039257, + "loss": 1.5647, + "step": 12116 + }, + { + "epoch": 0.43393557397890664, + "grad_norm": 2.040191411972046, + "learning_rate": 0.0001259537882962864, + "loss": 1.4003, + "step": 12117 + }, + { + "epoch": 0.43397138610847497, + "grad_norm": 1.724648118019104, + "learning_rate": 0.0001259425865730055, + "loss": 1.3143, + "step": 12118 + }, + { + "epoch": 0.43400719823804323, + "grad_norm": 1.1972649097442627, + "learning_rate": 0.00012593138450070056, + "loss": 1.3465, + "step": 12119 + }, + { + "epoch": 0.4340430103676115, + "grad_norm": 1.4744864702224731, + "learning_rate": 0.0001259201820795223, + "loss": 1.3086, + "step": 12120 + }, + { + "epoch": 0.4340788224971798, + "grad_norm": 1.8176345825195312, + "learning_rate": 0.00012590897930962142, + "loss": 1.3817, + "step": 12121 + }, + { + "epoch": 0.4341146346267481, + "grad_norm": 1.617890477180481, + "learning_rate": 0.00012589777619114863, + "loss": 1.7052, + "step": 12122 + }, + { + "epoch": 0.43415044675631637, + "grad_norm": 2.1249704360961914, + "learning_rate": 0.0001258865727242547, + "loss": 1.4104, + "step": 12123 + }, + { + "epoch": 0.43418625888588464, + "grad_norm": 1.825442910194397, + "learning_rate": 0.00012587536890909033, + "loss": 1.6887, + "step": 12124 + }, + { + "epoch": 0.43422207101545296, + "grad_norm": 1.7071682214736938, + "learning_rate": 0.0001258641647458062, + "loss": 1.5086, + "step": 12125 + }, + { + "epoch": 0.43425788314502123, + "grad_norm": 1.7802238464355469, + "learning_rate": 0.00012585296023455314, + "loss": 1.6946, + "step": 12126 + }, + { + "epoch": 0.4342936952745895, + "grad_norm": 1.907939076423645, + "learning_rate": 0.00012584175537548183, + "loss": 1.3304, + "step": 12127 + }, + { + "epoch": 0.43432950740415777, + "grad_norm": 1.8659141063690186, + "learning_rate": 0.00012583055016874303, + "loss": 1.4764, + "step": 12128 + }, + { + "epoch": 0.4343653195337261, + "grad_norm": 1.5259766578674316, + "learning_rate": 0.00012581934461448747, + "loss": 1.5803, + "step": 12129 + }, + { + "epoch": 0.43440113166329436, + "grad_norm": 2.2665178775787354, + "learning_rate": 0.00012580813871286597, + "loss": 1.5027, + "step": 12130 + }, + { + "epoch": 0.43443694379286263, + "grad_norm": 1.9262688159942627, + "learning_rate": 0.00012579693246402924, + "loss": 1.5477, + "step": 12131 + }, + { + "epoch": 0.43447275592243095, + "grad_norm": 2.344991683959961, + "learning_rate": 0.00012578572586812806, + "loss": 1.3432, + "step": 12132 + }, + { + "epoch": 0.4345085680519992, + "grad_norm": 2.302216053009033, + "learning_rate": 0.00012577451892531322, + "loss": 1.5703, + "step": 12133 + }, + { + "epoch": 0.4345443801815675, + "grad_norm": 1.6896110773086548, + "learning_rate": 0.00012576331163573548, + "loss": 1.3081, + "step": 12134 + }, + { + "epoch": 0.43458019231113576, + "grad_norm": 1.8697706460952759, + "learning_rate": 0.00012575210399954557, + "loss": 1.3996, + "step": 12135 + }, + { + "epoch": 0.4346160044407041, + "grad_norm": 1.9721256494522095, + "learning_rate": 0.00012574089601689433, + "loss": 1.153, + "step": 12136 + }, + { + "epoch": 0.43465181657027235, + "grad_norm": 2.7742340564727783, + "learning_rate": 0.00012572968768793257, + "loss": 1.7565, + "step": 12137 + }, + { + "epoch": 0.4346876286998406, + "grad_norm": 2.0509705543518066, + "learning_rate": 0.00012571847901281103, + "loss": 1.3774, + "step": 12138 + }, + { + "epoch": 0.43472344082940895, + "grad_norm": 1.5328233242034912, + "learning_rate": 0.0001257072699916805, + "loss": 1.3377, + "step": 12139 + }, + { + "epoch": 0.4347592529589772, + "grad_norm": 1.7979052066802979, + "learning_rate": 0.00012569606062469186, + "loss": 1.544, + "step": 12140 + }, + { + "epoch": 0.4347950650885455, + "grad_norm": 1.7817195653915405, + "learning_rate": 0.00012568485091199585, + "loss": 1.3429, + "step": 12141 + }, + { + "epoch": 0.43483087721811375, + "grad_norm": 1.9444987773895264, + "learning_rate": 0.0001256736408537433, + "loss": 1.6757, + "step": 12142 + }, + { + "epoch": 0.4348666893476821, + "grad_norm": 1.5094964504241943, + "learning_rate": 0.00012566243045008504, + "loss": 1.6575, + "step": 12143 + }, + { + "epoch": 0.43490250147725035, + "grad_norm": 2.030996322631836, + "learning_rate": 0.0001256512197011719, + "loss": 1.636, + "step": 12144 + }, + { + "epoch": 0.4349383136068186, + "grad_norm": 1.5814388990402222, + "learning_rate": 0.00012564000860715464, + "loss": 1.5221, + "step": 12145 + }, + { + "epoch": 0.43497412573638694, + "grad_norm": 3.0495545864105225, + "learning_rate": 0.00012562879716818416, + "loss": 1.6251, + "step": 12146 + }, + { + "epoch": 0.4350099378659552, + "grad_norm": 1.359506368637085, + "learning_rate": 0.00012561758538441126, + "loss": 1.3825, + "step": 12147 + }, + { + "epoch": 0.4350457499955235, + "grad_norm": 1.534612774848938, + "learning_rate": 0.0001256063732559868, + "loss": 1.283, + "step": 12148 + }, + { + "epoch": 0.43508156212509175, + "grad_norm": 1.858231544494629, + "learning_rate": 0.0001255951607830616, + "loss": 1.363, + "step": 12149 + }, + { + "epoch": 0.43511737425466007, + "grad_norm": 1.4661773443222046, + "learning_rate": 0.00012558394796578656, + "loss": 1.3254, + "step": 12150 + }, + { + "epoch": 0.43515318638422834, + "grad_norm": 1.5293601751327515, + "learning_rate": 0.0001255727348043125, + "loss": 1.7608, + "step": 12151 + }, + { + "epoch": 0.4351889985137966, + "grad_norm": 1.2543964385986328, + "learning_rate": 0.00012556152129879027, + "loss": 1.5267, + "step": 12152 + }, + { + "epoch": 0.43522481064336493, + "grad_norm": 1.9496577978134155, + "learning_rate": 0.00012555030744937075, + "loss": 1.2079, + "step": 12153 + }, + { + "epoch": 0.4352606227729332, + "grad_norm": 2.417306661605835, + "learning_rate": 0.0001255390932562048, + "loss": 1.3698, + "step": 12154 + }, + { + "epoch": 0.4352964349025015, + "grad_norm": 1.6135632991790771, + "learning_rate": 0.00012552787871944327, + "loss": 1.6963, + "step": 12155 + }, + { + "epoch": 0.43533224703206974, + "grad_norm": 2.258427619934082, + "learning_rate": 0.00012551666383923705, + "loss": 1.4545, + "step": 12156 + }, + { + "epoch": 0.43536805916163807, + "grad_norm": 1.6863501071929932, + "learning_rate": 0.00012550544861573707, + "loss": 1.113, + "step": 12157 + }, + { + "epoch": 0.43540387129120633, + "grad_norm": 1.6821973323822021, + "learning_rate": 0.0001254942330490942, + "loss": 1.6309, + "step": 12158 + }, + { + "epoch": 0.4354396834207746, + "grad_norm": 1.7726070880889893, + "learning_rate": 0.00012548301713945925, + "loss": 1.5211, + "step": 12159 + }, + { + "epoch": 0.43547549555034293, + "grad_norm": 1.206180453300476, + "learning_rate": 0.00012547180088698322, + "loss": 1.4238, + "step": 12160 + }, + { + "epoch": 0.4355113076799112, + "grad_norm": 2.325413227081299, + "learning_rate": 0.00012546058429181692, + "loss": 1.8346, + "step": 12161 + }, + { + "epoch": 0.43554711980947947, + "grad_norm": 1.9589569568634033, + "learning_rate": 0.00012544936735411135, + "loss": 1.4175, + "step": 12162 + }, + { + "epoch": 0.43558293193904773, + "grad_norm": 2.0499401092529297, + "learning_rate": 0.00012543815007401733, + "loss": 1.6312, + "step": 12163 + }, + { + "epoch": 0.43561874406861606, + "grad_norm": 1.615285873413086, + "learning_rate": 0.00012542693245168584, + "loss": 1.5656, + "step": 12164 + }, + { + "epoch": 0.43565455619818433, + "grad_norm": 2.342385768890381, + "learning_rate": 0.00012541571448726775, + "loss": 1.3668, + "step": 12165 + }, + { + "epoch": 0.4356903683277526, + "grad_norm": 2.1468067169189453, + "learning_rate": 0.00012540449618091403, + "loss": 1.5807, + "step": 12166 + }, + { + "epoch": 0.4357261804573209, + "grad_norm": 1.7686083316802979, + "learning_rate": 0.00012539327753277555, + "loss": 1.7541, + "step": 12167 + }, + { + "epoch": 0.4357619925868892, + "grad_norm": 1.9072706699371338, + "learning_rate": 0.00012538205854300334, + "loss": 1.6316, + "step": 12168 + }, + { + "epoch": 0.43579780471645746, + "grad_norm": 1.3254297971725464, + "learning_rate": 0.00012537083921174822, + "loss": 1.516, + "step": 12169 + }, + { + "epoch": 0.43583361684602573, + "grad_norm": 1.3646912574768066, + "learning_rate": 0.0001253596195391612, + "loss": 1.3667, + "step": 12170 + }, + { + "epoch": 0.43586942897559405, + "grad_norm": 1.3135740756988525, + "learning_rate": 0.0001253483995253932, + "loss": 1.669, + "step": 12171 + }, + { + "epoch": 0.4359052411051623, + "grad_norm": 1.6707199811935425, + "learning_rate": 0.00012533717917059516, + "loss": 1.4503, + "step": 12172 + }, + { + "epoch": 0.4359410532347306, + "grad_norm": 1.645582675933838, + "learning_rate": 0.0001253259584749181, + "loss": 1.3948, + "step": 12173 + }, + { + "epoch": 0.4359768653642989, + "grad_norm": 1.7216689586639404, + "learning_rate": 0.0001253147374385129, + "loss": 1.4025, + "step": 12174 + }, + { + "epoch": 0.4360126774938672, + "grad_norm": 1.6870415210723877, + "learning_rate": 0.0001253035160615306, + "loss": 1.7272, + "step": 12175 + }, + { + "epoch": 0.43604848962343545, + "grad_norm": 1.3931629657745361, + "learning_rate": 0.00012529229434412212, + "loss": 1.3757, + "step": 12176 + }, + { + "epoch": 0.4360843017530037, + "grad_norm": 1.6886948347091675, + "learning_rate": 0.00012528107228643843, + "loss": 1.4498, + "step": 12177 + }, + { + "epoch": 0.43612011388257205, + "grad_norm": 1.4105387926101685, + "learning_rate": 0.00012526984988863054, + "loss": 1.5208, + "step": 12178 + }, + { + "epoch": 0.4361559260121403, + "grad_norm": 2.1833765506744385, + "learning_rate": 0.0001252586271508494, + "loss": 1.7194, + "step": 12179 + }, + { + "epoch": 0.4361917381417086, + "grad_norm": 1.9593642950057983, + "learning_rate": 0.000125247404073246, + "loss": 1.5396, + "step": 12180 + }, + { + "epoch": 0.43622755027127685, + "grad_norm": 2.019223690032959, + "learning_rate": 0.0001252361806559714, + "loss": 1.6001, + "step": 12181 + }, + { + "epoch": 0.4362633624008452, + "grad_norm": 2.205815076828003, + "learning_rate": 0.00012522495689917647, + "loss": 1.5243, + "step": 12182 + }, + { + "epoch": 0.43629917453041345, + "grad_norm": 1.9048100709915161, + "learning_rate": 0.00012521373280301233, + "loss": 1.6801, + "step": 12183 + }, + { + "epoch": 0.4363349866599817, + "grad_norm": 1.4723803997039795, + "learning_rate": 0.0001252025083676299, + "loss": 1.1884, + "step": 12184 + }, + { + "epoch": 0.43637079878955004, + "grad_norm": 1.567564845085144, + "learning_rate": 0.00012519128359318027, + "loss": 1.4242, + "step": 12185 + }, + { + "epoch": 0.4364066109191183, + "grad_norm": 1.5921801328659058, + "learning_rate": 0.0001251800584798144, + "loss": 1.4445, + "step": 12186 + }, + { + "epoch": 0.4364424230486866, + "grad_norm": 2.2769577503204346, + "learning_rate": 0.0001251688330276833, + "loss": 1.5012, + "step": 12187 + }, + { + "epoch": 0.43647823517825485, + "grad_norm": 1.4964098930358887, + "learning_rate": 0.00012515760723693807, + "loss": 1.7067, + "step": 12188 + }, + { + "epoch": 0.43651404730782317, + "grad_norm": 1.4439140558242798, + "learning_rate": 0.00012514638110772963, + "loss": 1.1494, + "step": 12189 + }, + { + "epoch": 0.43654985943739144, + "grad_norm": 1.3597850799560547, + "learning_rate": 0.0001251351546402091, + "loss": 1.672, + "step": 12190 + }, + { + "epoch": 0.4365856715669597, + "grad_norm": 1.3259572982788086, + "learning_rate": 0.00012512392783452746, + "loss": 1.47, + "step": 12191 + }, + { + "epoch": 0.43662148369652803, + "grad_norm": 1.7703850269317627, + "learning_rate": 0.00012511270069083582, + "loss": 1.4195, + "step": 12192 + }, + { + "epoch": 0.4366572958260963, + "grad_norm": 1.2541024684906006, + "learning_rate": 0.00012510147320928515, + "loss": 1.3005, + "step": 12193 + }, + { + "epoch": 0.4366931079556646, + "grad_norm": 1.4400556087493896, + "learning_rate": 0.00012509024539002653, + "loss": 1.1587, + "step": 12194 + }, + { + "epoch": 0.43672892008523284, + "grad_norm": 1.3740122318267822, + "learning_rate": 0.00012507901723321106, + "loss": 1.5497, + "step": 12195 + }, + { + "epoch": 0.43676473221480117, + "grad_norm": 1.596411108970642, + "learning_rate": 0.00012506778873898976, + "loss": 1.5982, + "step": 12196 + }, + { + "epoch": 0.43680054434436943, + "grad_norm": 1.8824422359466553, + "learning_rate": 0.00012505655990751368, + "loss": 1.9263, + "step": 12197 + }, + { + "epoch": 0.4368363564739377, + "grad_norm": 1.811933159828186, + "learning_rate": 0.0001250453307389339, + "loss": 1.6699, + "step": 12198 + }, + { + "epoch": 0.43687216860350603, + "grad_norm": 1.7005141973495483, + "learning_rate": 0.0001250341012334015, + "loss": 1.6277, + "step": 12199 + }, + { + "epoch": 0.4369079807330743, + "grad_norm": 1.5186580419540405, + "learning_rate": 0.00012502287139106756, + "loss": 1.174, + "step": 12200 + }, + { + "epoch": 0.43694379286264257, + "grad_norm": 1.7548470497131348, + "learning_rate": 0.0001250116412120832, + "loss": 1.6404, + "step": 12201 + }, + { + "epoch": 0.43697960499221083, + "grad_norm": 1.803139090538025, + "learning_rate": 0.00012500041069659943, + "loss": 1.2575, + "step": 12202 + }, + { + "epoch": 0.43701541712177916, + "grad_norm": 1.5411206483840942, + "learning_rate": 0.00012498917984476738, + "loss": 1.5819, + "step": 12203 + }, + { + "epoch": 0.43705122925134743, + "grad_norm": 1.999428391456604, + "learning_rate": 0.00012497794865673817, + "loss": 1.2803, + "step": 12204 + }, + { + "epoch": 0.4370870413809157, + "grad_norm": 1.7252352237701416, + "learning_rate": 0.0001249667171326629, + "loss": 1.7101, + "step": 12205 + }, + { + "epoch": 0.437122853510484, + "grad_norm": 1.4021079540252686, + "learning_rate": 0.0001249554852726926, + "loss": 1.3464, + "step": 12206 + }, + { + "epoch": 0.4371586656400523, + "grad_norm": 1.7048399448394775, + "learning_rate": 0.00012494425307697847, + "loss": 1.5214, + "step": 12207 + }, + { + "epoch": 0.43719447776962056, + "grad_norm": 1.2630609273910522, + "learning_rate": 0.0001249330205456716, + "loss": 1.5876, + "step": 12208 + }, + { + "epoch": 0.43723028989918883, + "grad_norm": 1.411924958229065, + "learning_rate": 0.00012492178767892307, + "loss": 1.4632, + "step": 12209 + }, + { + "epoch": 0.43726610202875715, + "grad_norm": 1.7386451959609985, + "learning_rate": 0.00012491055447688405, + "loss": 1.4687, + "step": 12210 + }, + { + "epoch": 0.4373019141583254, + "grad_norm": 1.9168484210968018, + "learning_rate": 0.00012489932093970568, + "loss": 1.6312, + "step": 12211 + }, + { + "epoch": 0.4373377262878937, + "grad_norm": 2.0553414821624756, + "learning_rate": 0.00012488808706753902, + "loss": 1.7418, + "step": 12212 + }, + { + "epoch": 0.437373538417462, + "grad_norm": 1.310865879058838, + "learning_rate": 0.00012487685286053526, + "loss": 1.6169, + "step": 12213 + }, + { + "epoch": 0.4374093505470303, + "grad_norm": 1.4720237255096436, + "learning_rate": 0.00012486561831884552, + "loss": 1.11, + "step": 12214 + }, + { + "epoch": 0.43744516267659855, + "grad_norm": 1.5787687301635742, + "learning_rate": 0.000124854383442621, + "loss": 1.6203, + "step": 12215 + }, + { + "epoch": 0.4374809748061668, + "grad_norm": 1.404030203819275, + "learning_rate": 0.00012484314823201276, + "loss": 1.2607, + "step": 12216 + }, + { + "epoch": 0.43751678693573515, + "grad_norm": 1.9852256774902344, + "learning_rate": 0.00012483191268717207, + "loss": 1.6324, + "step": 12217 + }, + { + "epoch": 0.4375525990653034, + "grad_norm": 1.8439569473266602, + "learning_rate": 0.00012482067680824998, + "loss": 1.3118, + "step": 12218 + }, + { + "epoch": 0.4375884111948717, + "grad_norm": 1.3990365266799927, + "learning_rate": 0.0001248094405953977, + "loss": 1.4231, + "step": 12219 + }, + { + "epoch": 0.43762422332444, + "grad_norm": 1.943037748336792, + "learning_rate": 0.00012479820404876643, + "loss": 1.4883, + "step": 12220 + }, + { + "epoch": 0.4376600354540083, + "grad_norm": 1.4832576513290405, + "learning_rate": 0.0001247869671685073, + "loss": 1.3203, + "step": 12221 + }, + { + "epoch": 0.43769584758357655, + "grad_norm": 1.6420166492462158, + "learning_rate": 0.0001247757299547715, + "loss": 1.254, + "step": 12222 + }, + { + "epoch": 0.4377316597131448, + "grad_norm": 2.4458463191986084, + "learning_rate": 0.00012476449240771023, + "loss": 1.5143, + "step": 12223 + }, + { + "epoch": 0.43776747184271314, + "grad_norm": 1.7896454334259033, + "learning_rate": 0.0001247532545274746, + "loss": 1.7974, + "step": 12224 + }, + { + "epoch": 0.4378032839722814, + "grad_norm": 1.4857879877090454, + "learning_rate": 0.00012474201631421588, + "loss": 1.3927, + "step": 12225 + }, + { + "epoch": 0.4378390961018497, + "grad_norm": 2.298076629638672, + "learning_rate": 0.00012473077776808527, + "loss": 1.5126, + "step": 12226 + }, + { + "epoch": 0.437874908231418, + "grad_norm": 1.8825565576553345, + "learning_rate": 0.00012471953888923393, + "loss": 1.4028, + "step": 12227 + }, + { + "epoch": 0.43791072036098627, + "grad_norm": 1.3885728120803833, + "learning_rate": 0.00012470829967781307, + "loss": 1.3174, + "step": 12228 + }, + { + "epoch": 0.43794653249055454, + "grad_norm": 2.4574766159057617, + "learning_rate": 0.00012469706013397395, + "loss": 1.7062, + "step": 12229 + }, + { + "epoch": 0.4379823446201228, + "grad_norm": 1.4123119115829468, + "learning_rate": 0.00012468582025786774, + "loss": 1.656, + "step": 12230 + }, + { + "epoch": 0.43801815674969113, + "grad_norm": 1.421113133430481, + "learning_rate": 0.0001246745800496456, + "loss": 1.444, + "step": 12231 + }, + { + "epoch": 0.4380539688792594, + "grad_norm": 2.3142082691192627, + "learning_rate": 0.00012466333950945889, + "loss": 1.6337, + "step": 12232 + }, + { + "epoch": 0.4380897810088277, + "grad_norm": 2.1670494079589844, + "learning_rate": 0.0001246520986374587, + "loss": 1.3216, + "step": 12233 + }, + { + "epoch": 0.438125593138396, + "grad_norm": 1.628150463104248, + "learning_rate": 0.00012464085743379635, + "loss": 1.4509, + "step": 12234 + }, + { + "epoch": 0.43816140526796427, + "grad_norm": 1.204681634902954, + "learning_rate": 0.000124629615898623, + "loss": 1.4405, + "step": 12235 + }, + { + "epoch": 0.43819721739753253, + "grad_norm": 1.9603196382522583, + "learning_rate": 0.00012461837403209, + "loss": 1.7334, + "step": 12236 + }, + { + "epoch": 0.4382330295271008, + "grad_norm": 1.7064441442489624, + "learning_rate": 0.0001246071318343485, + "loss": 1.3215, + "step": 12237 + }, + { + "epoch": 0.43826884165666913, + "grad_norm": 2.009902238845825, + "learning_rate": 0.0001245958893055498, + "loss": 1.3249, + "step": 12238 + }, + { + "epoch": 0.4383046537862374, + "grad_norm": 1.3667140007019043, + "learning_rate": 0.00012458464644584516, + "loss": 1.254, + "step": 12239 + }, + { + "epoch": 0.43834046591580567, + "grad_norm": 1.28581964969635, + "learning_rate": 0.00012457340325538576, + "loss": 1.4058, + "step": 12240 + }, + { + "epoch": 0.438376278045374, + "grad_norm": 1.8916394710540771, + "learning_rate": 0.00012456215973432295, + "loss": 1.2694, + "step": 12241 + }, + { + "epoch": 0.43841209017494226, + "grad_norm": 1.4369720220565796, + "learning_rate": 0.00012455091588280793, + "loss": 1.7439, + "step": 12242 + }, + { + "epoch": 0.43844790230451053, + "grad_norm": 1.723207712173462, + "learning_rate": 0.00012453967170099204, + "loss": 1.3808, + "step": 12243 + }, + { + "epoch": 0.4384837144340788, + "grad_norm": 2.0032958984375, + "learning_rate": 0.00012452842718902647, + "loss": 1.5483, + "step": 12244 + }, + { + "epoch": 0.4385195265636471, + "grad_norm": 1.5736902952194214, + "learning_rate": 0.00012451718234706262, + "loss": 1.3339, + "step": 12245 + }, + { + "epoch": 0.4385553386932154, + "grad_norm": 1.6149911880493164, + "learning_rate": 0.00012450593717525167, + "loss": 1.3523, + "step": 12246 + }, + { + "epoch": 0.43859115082278366, + "grad_norm": 2.623713254928589, + "learning_rate": 0.00012449469167374498, + "loss": 1.7543, + "step": 12247 + }, + { + "epoch": 0.438626962952352, + "grad_norm": 1.808893084526062, + "learning_rate": 0.00012448344584269379, + "loss": 1.5875, + "step": 12248 + }, + { + "epoch": 0.43866277508192025, + "grad_norm": 1.4351035356521606, + "learning_rate": 0.0001244721996822494, + "loss": 1.4845, + "step": 12249 + }, + { + "epoch": 0.4386985872114885, + "grad_norm": 1.4466192722320557, + "learning_rate": 0.00012446095319256314, + "loss": 1.476, + "step": 12250 + }, + { + "epoch": 0.4387343993410568, + "grad_norm": 1.7350643873214722, + "learning_rate": 0.00012444970637378631, + "loss": 1.4816, + "step": 12251 + }, + { + "epoch": 0.4387702114706251, + "grad_norm": 1.5797699689865112, + "learning_rate": 0.0001244384592260702, + "loss": 1.4879, + "step": 12252 + }, + { + "epoch": 0.4388060236001934, + "grad_norm": 1.4239426851272583, + "learning_rate": 0.00012442721174956616, + "loss": 1.5366, + "step": 12253 + }, + { + "epoch": 0.43884183572976165, + "grad_norm": 1.4414507150650024, + "learning_rate": 0.0001244159639444255, + "loss": 1.6121, + "step": 12254 + }, + { + "epoch": 0.43887764785933, + "grad_norm": 1.5636897087097168, + "learning_rate": 0.00012440471581079952, + "loss": 1.758, + "step": 12255 + }, + { + "epoch": 0.43891345998889825, + "grad_norm": 1.4484236240386963, + "learning_rate": 0.0001243934673488396, + "loss": 1.4626, + "step": 12256 + }, + { + "epoch": 0.4389492721184665, + "grad_norm": 1.4698162078857422, + "learning_rate": 0.00012438221855869702, + "loss": 1.4597, + "step": 12257 + }, + { + "epoch": 0.4389850842480348, + "grad_norm": 1.6307086944580078, + "learning_rate": 0.00012437096944052317, + "loss": 1.5859, + "step": 12258 + }, + { + "epoch": 0.4390208963776031, + "grad_norm": 1.7705332040786743, + "learning_rate": 0.0001243597199944693, + "loss": 1.4369, + "step": 12259 + }, + { + "epoch": 0.4390567085071714, + "grad_norm": 1.8133080005645752, + "learning_rate": 0.0001243484702206869, + "loss": 1.2604, + "step": 12260 + }, + { + "epoch": 0.43909252063673965, + "grad_norm": 1.4457899332046509, + "learning_rate": 0.00012433722011932717, + "loss": 1.3294, + "step": 12261 + }, + { + "epoch": 0.43912833276630797, + "grad_norm": 1.6493401527404785, + "learning_rate": 0.00012432596969054157, + "loss": 1.7194, + "step": 12262 + }, + { + "epoch": 0.43916414489587624, + "grad_norm": 1.8156285285949707, + "learning_rate": 0.0001243147189344814, + "loss": 1.4789, + "step": 12263 + }, + { + "epoch": 0.4391999570254445, + "grad_norm": 1.2668172121047974, + "learning_rate": 0.0001243034678512981, + "loss": 1.5012, + "step": 12264 + }, + { + "epoch": 0.4392357691550128, + "grad_norm": 1.7122809886932373, + "learning_rate": 0.00012429221644114294, + "loss": 1.7531, + "step": 12265 + }, + { + "epoch": 0.4392715812845811, + "grad_norm": 1.553624153137207, + "learning_rate": 0.00012428096470416738, + "loss": 1.2313, + "step": 12266 + }, + { + "epoch": 0.43930739341414937, + "grad_norm": 2.247083902359009, + "learning_rate": 0.00012426971264052275, + "loss": 1.3864, + "step": 12267 + }, + { + "epoch": 0.43934320554371764, + "grad_norm": 1.7876657247543335, + "learning_rate": 0.00012425846025036042, + "loss": 1.4111, + "step": 12268 + }, + { + "epoch": 0.43937901767328597, + "grad_norm": 1.5700833797454834, + "learning_rate": 0.0001242472075338318, + "loss": 1.5301, + "step": 12269 + }, + { + "epoch": 0.43941482980285423, + "grad_norm": 1.5357482433319092, + "learning_rate": 0.0001242359544910883, + "loss": 1.5518, + "step": 12270 + }, + { + "epoch": 0.4394506419324225, + "grad_norm": 1.4696317911148071, + "learning_rate": 0.00012422470112228125, + "loss": 1.6317, + "step": 12271 + }, + { + "epoch": 0.43948645406199077, + "grad_norm": 1.7287973165512085, + "learning_rate": 0.00012421344742756215, + "loss": 1.3712, + "step": 12272 + }, + { + "epoch": 0.4395222661915591, + "grad_norm": 3.024156093597412, + "learning_rate": 0.00012420219340708236, + "loss": 1.6948, + "step": 12273 + }, + { + "epoch": 0.43955807832112737, + "grad_norm": 1.4425593614578247, + "learning_rate": 0.00012419093906099323, + "loss": 1.5203, + "step": 12274 + }, + { + "epoch": 0.43959389045069563, + "grad_norm": 1.4806897640228271, + "learning_rate": 0.00012417968438944622, + "loss": 1.3722, + "step": 12275 + }, + { + "epoch": 0.43962970258026396, + "grad_norm": 2.641904354095459, + "learning_rate": 0.0001241684293925928, + "loss": 1.5145, + "step": 12276 + }, + { + "epoch": 0.43966551470983223, + "grad_norm": 1.801173448562622, + "learning_rate": 0.00012415717407058427, + "loss": 1.5123, + "step": 12277 + }, + { + "epoch": 0.4397013268394005, + "grad_norm": 1.3477219343185425, + "learning_rate": 0.00012414591842357215, + "loss": 1.2466, + "step": 12278 + }, + { + "epoch": 0.43973713896896877, + "grad_norm": 1.4272079467773438, + "learning_rate": 0.00012413466245170783, + "loss": 1.1956, + "step": 12279 + }, + { + "epoch": 0.4397729510985371, + "grad_norm": 1.5467042922973633, + "learning_rate": 0.0001241234061551428, + "loss": 1.3165, + "step": 12280 + }, + { + "epoch": 0.43980876322810536, + "grad_norm": 2.0391323566436768, + "learning_rate": 0.00012411214953402842, + "loss": 1.5702, + "step": 12281 + }, + { + "epoch": 0.43984457535767363, + "grad_norm": 1.9930360317230225, + "learning_rate": 0.00012410089258851618, + "loss": 1.6248, + "step": 12282 + }, + { + "epoch": 0.43988038748724195, + "grad_norm": 1.4755240678787231, + "learning_rate": 0.00012408963531875753, + "loss": 1.6192, + "step": 12283 + }, + { + "epoch": 0.4399161996168102, + "grad_norm": 1.4436578750610352, + "learning_rate": 0.00012407837772490389, + "loss": 1.3747, + "step": 12284 + }, + { + "epoch": 0.4399520117463785, + "grad_norm": 1.4381917715072632, + "learning_rate": 0.00012406711980710676, + "loss": 1.5223, + "step": 12285 + }, + { + "epoch": 0.43998782387594676, + "grad_norm": 1.4474295377731323, + "learning_rate": 0.00012405586156551753, + "loss": 1.4443, + "step": 12286 + }, + { + "epoch": 0.4400236360055151, + "grad_norm": 2.0564377307891846, + "learning_rate": 0.00012404460300028774, + "loss": 1.13, + "step": 12287 + }, + { + "epoch": 0.44005944813508335, + "grad_norm": 1.5260390043258667, + "learning_rate": 0.00012403334411156884, + "loss": 1.5361, + "step": 12288 + }, + { + "epoch": 0.4400952602646516, + "grad_norm": 1.644852638244629, + "learning_rate": 0.0001240220848995123, + "loss": 1.5544, + "step": 12289 + }, + { + "epoch": 0.44013107239421995, + "grad_norm": 1.850511908531189, + "learning_rate": 0.00012401082536426958, + "loss": 1.5099, + "step": 12290 + }, + { + "epoch": 0.4401668845237882, + "grad_norm": 1.6762350797653198, + "learning_rate": 0.00012399956550599218, + "loss": 1.6942, + "step": 12291 + }, + { + "epoch": 0.4402026966533565, + "grad_norm": 1.6215894222259521, + "learning_rate": 0.0001239883053248316, + "loss": 1.5009, + "step": 12292 + }, + { + "epoch": 0.44023850878292475, + "grad_norm": 1.8665066957473755, + "learning_rate": 0.0001239770448209393, + "loss": 1.6581, + "step": 12293 + }, + { + "epoch": 0.4402743209124931, + "grad_norm": 2.1635780334472656, + "learning_rate": 0.00012396578399446678, + "loss": 1.5384, + "step": 12294 + }, + { + "epoch": 0.44031013304206135, + "grad_norm": 1.7075426578521729, + "learning_rate": 0.00012395452284556558, + "loss": 1.7423, + "step": 12295 + }, + { + "epoch": 0.4403459451716296, + "grad_norm": 1.9434218406677246, + "learning_rate": 0.00012394326137438714, + "loss": 1.3288, + "step": 12296 + }, + { + "epoch": 0.44038175730119794, + "grad_norm": 1.5657232999801636, + "learning_rate": 0.000123931999581083, + "loss": 1.2778, + "step": 12297 + }, + { + "epoch": 0.4404175694307662, + "grad_norm": 1.5658713579177856, + "learning_rate": 0.00012392073746580472, + "loss": 1.5705, + "step": 12298 + }, + { + "epoch": 0.4404533815603345, + "grad_norm": 1.3626611232757568, + "learning_rate": 0.00012390947502870375, + "loss": 1.286, + "step": 12299 + }, + { + "epoch": 0.44048919368990275, + "grad_norm": 1.8273500204086304, + "learning_rate": 0.00012389821226993164, + "loss": 1.304, + "step": 12300 + }, + { + "epoch": 0.44052500581947107, + "grad_norm": 1.7179958820343018, + "learning_rate": 0.0001238869491896399, + "loss": 1.674, + "step": 12301 + }, + { + "epoch": 0.44056081794903934, + "grad_norm": 1.5758401155471802, + "learning_rate": 0.00012387568578798005, + "loss": 1.2103, + "step": 12302 + }, + { + "epoch": 0.4405966300786076, + "grad_norm": 1.8869274854660034, + "learning_rate": 0.00012386442206510368, + "loss": 1.684, + "step": 12303 + }, + { + "epoch": 0.44063244220817593, + "grad_norm": 1.4876755475997925, + "learning_rate": 0.00012385315802116226, + "loss": 1.5119, + "step": 12304 + }, + { + "epoch": 0.4406682543377442, + "grad_norm": 2.015285015106201, + "learning_rate": 0.0001238418936563074, + "loss": 1.6014, + "step": 12305 + }, + { + "epoch": 0.44070406646731247, + "grad_norm": 1.7106752395629883, + "learning_rate": 0.0001238306289706906, + "loss": 1.6048, + "step": 12306 + }, + { + "epoch": 0.44073987859688074, + "grad_norm": 1.6270339488983154, + "learning_rate": 0.00012381936396446344, + "loss": 1.2119, + "step": 12307 + }, + { + "epoch": 0.44077569072644907, + "grad_norm": 1.604411005973816, + "learning_rate": 0.00012380809863777746, + "loss": 1.4839, + "step": 12308 + }, + { + "epoch": 0.44081150285601733, + "grad_norm": 3.05351185798645, + "learning_rate": 0.00012379683299078422, + "loss": 1.8318, + "step": 12309 + }, + { + "epoch": 0.4408473149855856, + "grad_norm": 1.8058035373687744, + "learning_rate": 0.00012378556702363527, + "loss": 1.3283, + "step": 12310 + }, + { + "epoch": 0.4408831271151539, + "grad_norm": 1.5172128677368164, + "learning_rate": 0.00012377430073648218, + "loss": 1.4016, + "step": 12311 + }, + { + "epoch": 0.4409189392447222, + "grad_norm": 2.027984142303467, + "learning_rate": 0.0001237630341294766, + "loss": 1.5973, + "step": 12312 + }, + { + "epoch": 0.44095475137429047, + "grad_norm": 1.950453281402588, + "learning_rate": 0.00012375176720277002, + "loss": 1.521, + "step": 12313 + }, + { + "epoch": 0.44099056350385873, + "grad_norm": 2.8115463256835938, + "learning_rate": 0.00012374049995651405, + "loss": 1.5522, + "step": 12314 + }, + { + "epoch": 0.44102637563342706, + "grad_norm": 1.677557110786438, + "learning_rate": 0.00012372923239086024, + "loss": 1.4244, + "step": 12315 + }, + { + "epoch": 0.44106218776299533, + "grad_norm": 1.6209502220153809, + "learning_rate": 0.00012371796450596028, + "loss": 1.2205, + "step": 12316 + }, + { + "epoch": 0.4410979998925636, + "grad_norm": 1.4655624628067017, + "learning_rate": 0.00012370669630196567, + "loss": 1.627, + "step": 12317 + }, + { + "epoch": 0.4411338120221319, + "grad_norm": 2.140920639038086, + "learning_rate": 0.00012369542777902805, + "loss": 1.5118, + "step": 12318 + }, + { + "epoch": 0.4411696241517002, + "grad_norm": 1.743704915046692, + "learning_rate": 0.00012368415893729902, + "loss": 1.6768, + "step": 12319 + }, + { + "epoch": 0.44120543628126846, + "grad_norm": 1.668463945388794, + "learning_rate": 0.00012367288977693016, + "loss": 1.5263, + "step": 12320 + }, + { + "epoch": 0.44124124841083673, + "grad_norm": 1.3567086458206177, + "learning_rate": 0.0001236616202980731, + "loss": 1.3551, + "step": 12321 + }, + { + "epoch": 0.44127706054040505, + "grad_norm": 1.6387543678283691, + "learning_rate": 0.0001236503505008795, + "loss": 1.2526, + "step": 12322 + }, + { + "epoch": 0.4413128726699733, + "grad_norm": 1.8538243770599365, + "learning_rate": 0.0001236390803855009, + "loss": 1.745, + "step": 12323 + }, + { + "epoch": 0.4413486847995416, + "grad_norm": 1.6371301412582397, + "learning_rate": 0.00012362780995208895, + "loss": 1.5907, + "step": 12324 + }, + { + "epoch": 0.4413844969291099, + "grad_norm": 1.6978520154953003, + "learning_rate": 0.00012361653920079534, + "loss": 1.6795, + "step": 12325 + }, + { + "epoch": 0.4414203090586782, + "grad_norm": 1.94414484500885, + "learning_rate": 0.00012360526813177163, + "loss": 1.388, + "step": 12326 + }, + { + "epoch": 0.44145612118824645, + "grad_norm": 1.762611746788025, + "learning_rate": 0.00012359399674516955, + "loss": 1.4216, + "step": 12327 + }, + { + "epoch": 0.4414919333178147, + "grad_norm": 1.6278012990951538, + "learning_rate": 0.00012358272504114058, + "loss": 1.5087, + "step": 12328 + }, + { + "epoch": 0.44152774544738305, + "grad_norm": 1.7553491592407227, + "learning_rate": 0.00012357145301983651, + "loss": 1.6467, + "step": 12329 + }, + { + "epoch": 0.4415635575769513, + "grad_norm": 1.4429503679275513, + "learning_rate": 0.00012356018068140895, + "loss": 1.6824, + "step": 12330 + }, + { + "epoch": 0.4415993697065196, + "grad_norm": 1.9834673404693604, + "learning_rate": 0.00012354890802600957, + "loss": 1.6054, + "step": 12331 + }, + { + "epoch": 0.4416351818360879, + "grad_norm": 1.7594319581985474, + "learning_rate": 0.00012353763505378997, + "loss": 1.431, + "step": 12332 + }, + { + "epoch": 0.4416709939656562, + "grad_norm": 2.2970147132873535, + "learning_rate": 0.00012352636176490186, + "loss": 2.0729, + "step": 12333 + }, + { + "epoch": 0.44170680609522445, + "grad_norm": 1.2946401834487915, + "learning_rate": 0.00012351508815949691, + "loss": 1.4574, + "step": 12334 + }, + { + "epoch": 0.4417426182247927, + "grad_norm": 1.8043230772018433, + "learning_rate": 0.00012350381423772676, + "loss": 1.379, + "step": 12335 + }, + { + "epoch": 0.44177843035436104, + "grad_norm": 1.3034635782241821, + "learning_rate": 0.00012349253999974314, + "loss": 1.0734, + "step": 12336 + }, + { + "epoch": 0.4418142424839293, + "grad_norm": 1.2619051933288574, + "learning_rate": 0.00012348126544569767, + "loss": 1.4902, + "step": 12337 + }, + { + "epoch": 0.4418500546134976, + "grad_norm": 1.3467988967895508, + "learning_rate": 0.00012346999057574209, + "loss": 1.6634, + "step": 12338 + }, + { + "epoch": 0.4418858667430659, + "grad_norm": 1.8174192905426025, + "learning_rate": 0.00012345871539002801, + "loss": 1.5092, + "step": 12339 + }, + { + "epoch": 0.44192167887263417, + "grad_norm": 1.8187414407730103, + "learning_rate": 0.00012344743988870722, + "loss": 1.4353, + "step": 12340 + }, + { + "epoch": 0.44195749100220244, + "grad_norm": 1.7980650663375854, + "learning_rate": 0.00012343616407193135, + "loss": 1.5176, + "step": 12341 + }, + { + "epoch": 0.4419933031317707, + "grad_norm": 1.7426297664642334, + "learning_rate": 0.00012342488793985214, + "loss": 1.47, + "step": 12342 + }, + { + "epoch": 0.44202911526133903, + "grad_norm": 1.9516719579696655, + "learning_rate": 0.00012341361149262125, + "loss": 1.3465, + "step": 12343 + }, + { + "epoch": 0.4420649273909073, + "grad_norm": 2.266122817993164, + "learning_rate": 0.00012340233473039045, + "loss": 1.2277, + "step": 12344 + }, + { + "epoch": 0.44210073952047557, + "grad_norm": 2.047231674194336, + "learning_rate": 0.00012339105765331142, + "loss": 1.1451, + "step": 12345 + }, + { + "epoch": 0.4421365516500439, + "grad_norm": 2.4801621437072754, + "learning_rate": 0.00012337978026153587, + "loss": 1.3022, + "step": 12346 + }, + { + "epoch": 0.44217236377961217, + "grad_norm": 1.6295838356018066, + "learning_rate": 0.00012336850255521554, + "loss": 1.4763, + "step": 12347 + }, + { + "epoch": 0.44220817590918043, + "grad_norm": 1.4583467245101929, + "learning_rate": 0.00012335722453450215, + "loss": 1.5955, + "step": 12348 + }, + { + "epoch": 0.4422439880387487, + "grad_norm": 1.6887112855911255, + "learning_rate": 0.00012334594619954742, + "loss": 1.3007, + "step": 12349 + }, + { + "epoch": 0.442279800168317, + "grad_norm": 2.0861968994140625, + "learning_rate": 0.0001233346675505031, + "loss": 1.613, + "step": 12350 + }, + { + "epoch": 0.4423156122978853, + "grad_norm": 2.0231857299804688, + "learning_rate": 0.00012332338858752094, + "loss": 1.5615, + "step": 12351 + }, + { + "epoch": 0.44235142442745357, + "grad_norm": 2.4194977283477783, + "learning_rate": 0.0001233121093107527, + "loss": 1.5146, + "step": 12352 + }, + { + "epoch": 0.4423872365570219, + "grad_norm": 2.190960645675659, + "learning_rate": 0.00012330082972035006, + "loss": 1.3055, + "step": 12353 + }, + { + "epoch": 0.44242304868659016, + "grad_norm": 2.3201444149017334, + "learning_rate": 0.00012328954981646482, + "loss": 1.7702, + "step": 12354 + }, + { + "epoch": 0.44245886081615843, + "grad_norm": 1.6395111083984375, + "learning_rate": 0.0001232782695992487, + "loss": 1.3856, + "step": 12355 + }, + { + "epoch": 0.4424946729457267, + "grad_norm": 2.251711845397949, + "learning_rate": 0.00012326698906885353, + "loss": 1.8577, + "step": 12356 + }, + { + "epoch": 0.442530485075295, + "grad_norm": 1.6219013929367065, + "learning_rate": 0.00012325570822543103, + "loss": 1.3516, + "step": 12357 + }, + { + "epoch": 0.4425662972048633, + "grad_norm": 1.8324930667877197, + "learning_rate": 0.00012324442706913296, + "loss": 1.3791, + "step": 12358 + }, + { + "epoch": 0.44260210933443156, + "grad_norm": 2.259781837463379, + "learning_rate": 0.0001232331456001111, + "loss": 1.6316, + "step": 12359 + }, + { + "epoch": 0.4426379214639999, + "grad_norm": 2.230318307876587, + "learning_rate": 0.00012322186381851725, + "loss": 1.7068, + "step": 12360 + }, + { + "epoch": 0.44267373359356815, + "grad_norm": 1.9674252271652222, + "learning_rate": 0.00012321058172450318, + "loss": 1.7323, + "step": 12361 + }, + { + "epoch": 0.4427095457231364, + "grad_norm": 1.609484314918518, + "learning_rate": 0.0001231992993182207, + "loss": 1.5709, + "step": 12362 + }, + { + "epoch": 0.4427453578527047, + "grad_norm": 1.5630927085876465, + "learning_rate": 0.00012318801659982152, + "loss": 1.605, + "step": 12363 + }, + { + "epoch": 0.442781169982273, + "grad_norm": 1.5356788635253906, + "learning_rate": 0.00012317673356945753, + "loss": 1.161, + "step": 12364 + }, + { + "epoch": 0.4428169821118413, + "grad_norm": 2.1178667545318604, + "learning_rate": 0.00012316545022728043, + "loss": 1.4586, + "step": 12365 + }, + { + "epoch": 0.44285279424140955, + "grad_norm": 3.0236313343048096, + "learning_rate": 0.00012315416657344213, + "loss": 1.5712, + "step": 12366 + }, + { + "epoch": 0.4428886063709779, + "grad_norm": 2.9537036418914795, + "learning_rate": 0.00012314288260809435, + "loss": 1.5299, + "step": 12367 + }, + { + "epoch": 0.44292441850054615, + "grad_norm": 1.7508946657180786, + "learning_rate": 0.00012313159833138892, + "loss": 1.7762, + "step": 12368 + }, + { + "epoch": 0.4429602306301144, + "grad_norm": 1.5886273384094238, + "learning_rate": 0.00012312031374347773, + "loss": 1.4263, + "step": 12369 + }, + { + "epoch": 0.4429960427596827, + "grad_norm": 1.5932828187942505, + "learning_rate": 0.00012310902884451252, + "loss": 1.6544, + "step": 12370 + }, + { + "epoch": 0.443031854889251, + "grad_norm": 1.7331477403640747, + "learning_rate": 0.00012309774363464514, + "loss": 1.6996, + "step": 12371 + }, + { + "epoch": 0.4430676670188193, + "grad_norm": 1.5816339254379272, + "learning_rate": 0.00012308645811402738, + "loss": 1.469, + "step": 12372 + }, + { + "epoch": 0.44310347914838755, + "grad_norm": 2.0275886058807373, + "learning_rate": 0.00012307517228281117, + "loss": 1.8279, + "step": 12373 + }, + { + "epoch": 0.44313929127795587, + "grad_norm": 1.7863155603408813, + "learning_rate": 0.00012306388614114822, + "loss": 1.543, + "step": 12374 + }, + { + "epoch": 0.44317510340752414, + "grad_norm": 1.4801067113876343, + "learning_rate": 0.00012305259968919046, + "loss": 1.5689, + "step": 12375 + }, + { + "epoch": 0.4432109155370924, + "grad_norm": 1.4803080558776855, + "learning_rate": 0.00012304131292708968, + "loss": 1.3355, + "step": 12376 + }, + { + "epoch": 0.4432467276666607, + "grad_norm": 1.7210575342178345, + "learning_rate": 0.0001230300258549978, + "loss": 1.602, + "step": 12377 + }, + { + "epoch": 0.443282539796229, + "grad_norm": 1.4873367547988892, + "learning_rate": 0.00012301873847306657, + "loss": 1.3953, + "step": 12378 + }, + { + "epoch": 0.44331835192579727, + "grad_norm": 1.6360687017440796, + "learning_rate": 0.00012300745078144796, + "loss": 1.4753, + "step": 12379 + }, + { + "epoch": 0.44335416405536554, + "grad_norm": 1.5548124313354492, + "learning_rate": 0.00012299616278029375, + "loss": 1.3875, + "step": 12380 + }, + { + "epoch": 0.4433899761849338, + "grad_norm": 1.4438000917434692, + "learning_rate": 0.00012298487446975583, + "loss": 1.5891, + "step": 12381 + }, + { + "epoch": 0.44342578831450213, + "grad_norm": 2.041623830795288, + "learning_rate": 0.0001229735858499861, + "loss": 1.3645, + "step": 12382 + }, + { + "epoch": 0.4434616004440704, + "grad_norm": 1.9327969551086426, + "learning_rate": 0.0001229622969211364, + "loss": 1.5932, + "step": 12383 + }, + { + "epoch": 0.44349741257363867, + "grad_norm": 1.3270928859710693, + "learning_rate": 0.00012295100768335858, + "loss": 1.7006, + "step": 12384 + }, + { + "epoch": 0.443533224703207, + "grad_norm": 1.8910813331604004, + "learning_rate": 0.00012293971813680458, + "loss": 1.6462, + "step": 12385 + }, + { + "epoch": 0.44356903683277527, + "grad_norm": 2.8436388969421387, + "learning_rate": 0.00012292842828162627, + "loss": 1.5743, + "step": 12386 + }, + { + "epoch": 0.44360484896234353, + "grad_norm": 1.7232835292816162, + "learning_rate": 0.00012291713811797553, + "loss": 1.4634, + "step": 12387 + }, + { + "epoch": 0.4436406610919118, + "grad_norm": 1.5384464263916016, + "learning_rate": 0.00012290584764600425, + "loss": 1.4929, + "step": 12388 + }, + { + "epoch": 0.4436764732214801, + "grad_norm": 1.418431043624878, + "learning_rate": 0.00012289455686586434, + "loss": 1.4444, + "step": 12389 + }, + { + "epoch": 0.4437122853510484, + "grad_norm": 1.4715633392333984, + "learning_rate": 0.0001228832657777077, + "loss": 1.5561, + "step": 12390 + }, + { + "epoch": 0.44374809748061667, + "grad_norm": 1.7735953330993652, + "learning_rate": 0.00012287197438168624, + "loss": 1.9554, + "step": 12391 + }, + { + "epoch": 0.443783909610185, + "grad_norm": 1.6121642589569092, + "learning_rate": 0.00012286068267795185, + "loss": 1.3333, + "step": 12392 + }, + { + "epoch": 0.44381972173975326, + "grad_norm": 1.414981722831726, + "learning_rate": 0.00012284939066665648, + "loss": 1.5743, + "step": 12393 + }, + { + "epoch": 0.44385553386932153, + "grad_norm": 1.645015001296997, + "learning_rate": 0.00012283809834795202, + "loss": 1.6721, + "step": 12394 + }, + { + "epoch": 0.4438913459988898, + "grad_norm": 1.310796856880188, + "learning_rate": 0.00012282680572199043, + "loss": 1.4809, + "step": 12395 + }, + { + "epoch": 0.4439271581284581, + "grad_norm": 1.5144050121307373, + "learning_rate": 0.00012281551278892357, + "loss": 1.1173, + "step": 12396 + }, + { + "epoch": 0.4439629702580264, + "grad_norm": 1.5878082513809204, + "learning_rate": 0.00012280421954890346, + "loss": 1.4621, + "step": 12397 + }, + { + "epoch": 0.44399878238759466, + "grad_norm": 2.0923068523406982, + "learning_rate": 0.000122792926002082, + "loss": 1.4257, + "step": 12398 + }, + { + "epoch": 0.444034594517163, + "grad_norm": 1.6810699701309204, + "learning_rate": 0.00012278163214861107, + "loss": 1.3412, + "step": 12399 + }, + { + "epoch": 0.44407040664673125, + "grad_norm": 1.7527968883514404, + "learning_rate": 0.00012277033798864268, + "loss": 1.673, + "step": 12400 + }, + { + "epoch": 0.4441062187762995, + "grad_norm": 2.4101831912994385, + "learning_rate": 0.00012275904352232876, + "loss": 1.4516, + "step": 12401 + }, + { + "epoch": 0.4441420309058678, + "grad_norm": 1.7955946922302246, + "learning_rate": 0.00012274774874982132, + "loss": 1.8107, + "step": 12402 + }, + { + "epoch": 0.4441778430354361, + "grad_norm": 1.6984844207763672, + "learning_rate": 0.0001227364536712722, + "loss": 1.5125, + "step": 12403 + }, + { + "epoch": 0.4442136551650044, + "grad_norm": 1.418100357055664, + "learning_rate": 0.00012272515828683344, + "loss": 1.487, + "step": 12404 + }, + { + "epoch": 0.44424946729457265, + "grad_norm": 1.629875898361206, + "learning_rate": 0.00012271386259665701, + "loss": 1.4305, + "step": 12405 + }, + { + "epoch": 0.444285279424141, + "grad_norm": 2.2954277992248535, + "learning_rate": 0.00012270256660089484, + "loss": 1.7927, + "step": 12406 + }, + { + "epoch": 0.44432109155370925, + "grad_norm": 1.8321059942245483, + "learning_rate": 0.00012269127029969893, + "loss": 1.5456, + "step": 12407 + }, + { + "epoch": 0.4443569036832775, + "grad_norm": 1.7216614484786987, + "learning_rate": 0.00012267997369322126, + "loss": 1.5889, + "step": 12408 + }, + { + "epoch": 0.4443927158128458, + "grad_norm": 1.7237794399261475, + "learning_rate": 0.00012266867678161375, + "loss": 1.3024, + "step": 12409 + }, + { + "epoch": 0.4444285279424141, + "grad_norm": 1.995652437210083, + "learning_rate": 0.00012265737956502847, + "loss": 1.3176, + "step": 12410 + }, + { + "epoch": 0.4444643400719824, + "grad_norm": 1.5316146612167358, + "learning_rate": 0.0001226460820436174, + "loss": 1.627, + "step": 12411 + }, + { + "epoch": 0.44450015220155065, + "grad_norm": 1.9641482830047607, + "learning_rate": 0.00012263478421753243, + "loss": 1.3494, + "step": 12412 + }, + { + "epoch": 0.44453596433111897, + "grad_norm": 1.9962435960769653, + "learning_rate": 0.0001226234860869257, + "loss": 1.3349, + "step": 12413 + }, + { + "epoch": 0.44457177646068724, + "grad_norm": 1.4365565776824951, + "learning_rate": 0.00012261218765194913, + "loss": 1.4867, + "step": 12414 + }, + { + "epoch": 0.4446075885902555, + "grad_norm": 1.7713817358016968, + "learning_rate": 0.00012260088891275476, + "loss": 1.5989, + "step": 12415 + }, + { + "epoch": 0.4446434007198238, + "grad_norm": 1.790130376815796, + "learning_rate": 0.00012258958986949455, + "loss": 1.5266, + "step": 12416 + }, + { + "epoch": 0.4446792128493921, + "grad_norm": 1.350547432899475, + "learning_rate": 0.00012257829052232056, + "loss": 1.8426, + "step": 12417 + }, + { + "epoch": 0.44471502497896037, + "grad_norm": 1.452842116355896, + "learning_rate": 0.00012256699087138479, + "loss": 1.4189, + "step": 12418 + }, + { + "epoch": 0.44475083710852864, + "grad_norm": 2.3444418907165527, + "learning_rate": 0.0001225556909168393, + "loss": 1.565, + "step": 12419 + }, + { + "epoch": 0.44478664923809696, + "grad_norm": 2.152139902114868, + "learning_rate": 0.00012254439065883602, + "loss": 1.4686, + "step": 12420 + }, + { + "epoch": 0.44482246136766523, + "grad_norm": 1.6714543104171753, + "learning_rate": 0.0001225330900975271, + "loss": 1.4201, + "step": 12421 + }, + { + "epoch": 0.4448582734972335, + "grad_norm": 1.691991925239563, + "learning_rate": 0.00012252178923306448, + "loss": 1.2425, + "step": 12422 + }, + { + "epoch": 0.44489408562680177, + "grad_norm": 1.3791453838348389, + "learning_rate": 0.00012251048806560027, + "loss": 1.2884, + "step": 12423 + }, + { + "epoch": 0.4449298977563701, + "grad_norm": 1.6652225255966187, + "learning_rate": 0.00012249918659528648, + "loss": 1.4209, + "step": 12424 + }, + { + "epoch": 0.44496570988593837, + "grad_norm": 2.57869291305542, + "learning_rate": 0.0001224878848222751, + "loss": 1.6552, + "step": 12425 + }, + { + "epoch": 0.44500152201550663, + "grad_norm": 1.6043140888214111, + "learning_rate": 0.0001224765827467183, + "loss": 1.2117, + "step": 12426 + }, + { + "epoch": 0.44503733414507496, + "grad_norm": 1.4723271131515503, + "learning_rate": 0.00012246528036876807, + "loss": 1.5539, + "step": 12427 + }, + { + "epoch": 0.4450731462746432, + "grad_norm": 2.1970090866088867, + "learning_rate": 0.00012245397768857646, + "loss": 1.4757, + "step": 12428 + }, + { + "epoch": 0.4451089584042115, + "grad_norm": 1.648956537246704, + "learning_rate": 0.0001224426747062955, + "loss": 1.4227, + "step": 12429 + }, + { + "epoch": 0.44514477053377977, + "grad_norm": 1.9277909994125366, + "learning_rate": 0.00012243137142207733, + "loss": 1.6855, + "step": 12430 + }, + { + "epoch": 0.4451805826633481, + "grad_norm": 1.7373301982879639, + "learning_rate": 0.000122420067836074, + "loss": 1.3754, + "step": 12431 + }, + { + "epoch": 0.44521639479291636, + "grad_norm": 1.5616296529769897, + "learning_rate": 0.0001224087639484376, + "loss": 1.539, + "step": 12432 + }, + { + "epoch": 0.44525220692248463, + "grad_norm": 2.611417055130005, + "learning_rate": 0.00012239745975932016, + "loss": 1.4478, + "step": 12433 + }, + { + "epoch": 0.44528801905205295, + "grad_norm": 3.34085750579834, + "learning_rate": 0.00012238615526887378, + "loss": 2.1207, + "step": 12434 + }, + { + "epoch": 0.4453238311816212, + "grad_norm": 1.9090518951416016, + "learning_rate": 0.00012237485047725057, + "loss": 1.6396, + "step": 12435 + }, + { + "epoch": 0.4453596433111895, + "grad_norm": 1.521642804145813, + "learning_rate": 0.00012236354538460259, + "loss": 1.5772, + "step": 12436 + }, + { + "epoch": 0.44539545544075776, + "grad_norm": 2.2825441360473633, + "learning_rate": 0.000122352239991082, + "loss": 1.5948, + "step": 12437 + }, + { + "epoch": 0.4454312675703261, + "grad_norm": 1.408004641532898, + "learning_rate": 0.0001223409342968408, + "loss": 1.244, + "step": 12438 + }, + { + "epoch": 0.44546707969989435, + "grad_norm": 1.4252254962921143, + "learning_rate": 0.00012232962830203116, + "loss": 1.5186, + "step": 12439 + }, + { + "epoch": 0.4455028918294626, + "grad_norm": 2.0045323371887207, + "learning_rate": 0.00012231832200680518, + "loss": 1.4156, + "step": 12440 + }, + { + "epoch": 0.44553870395903095, + "grad_norm": 1.9673701524734497, + "learning_rate": 0.00012230701541131499, + "loss": 1.2987, + "step": 12441 + }, + { + "epoch": 0.4455745160885992, + "grad_norm": 1.8658106327056885, + "learning_rate": 0.00012229570851571265, + "loss": 1.5869, + "step": 12442 + }, + { + "epoch": 0.4456103282181675, + "grad_norm": 2.748161554336548, + "learning_rate": 0.00012228440132015033, + "loss": 1.375, + "step": 12443 + }, + { + "epoch": 0.44564614034773575, + "grad_norm": 2.1242969036102295, + "learning_rate": 0.0001222730938247801, + "loss": 1.8089, + "step": 12444 + }, + { + "epoch": 0.4456819524773041, + "grad_norm": 1.5516866445541382, + "learning_rate": 0.00012226178602975417, + "loss": 1.4706, + "step": 12445 + }, + { + "epoch": 0.44571776460687235, + "grad_norm": 2.02280855178833, + "learning_rate": 0.00012225047793522462, + "loss": 1.583, + "step": 12446 + }, + { + "epoch": 0.4457535767364406, + "grad_norm": 1.7258055210113525, + "learning_rate": 0.00012223916954134356, + "loss": 1.6239, + "step": 12447 + }, + { + "epoch": 0.44578938886600894, + "grad_norm": 1.9967955350875854, + "learning_rate": 0.00012222786084826318, + "loss": 1.5723, + "step": 12448 + }, + { + "epoch": 0.4458252009955772, + "grad_norm": 1.7219719886779785, + "learning_rate": 0.00012221655185613557, + "loss": 1.376, + "step": 12449 + }, + { + "epoch": 0.4458610131251455, + "grad_norm": 1.6591328382492065, + "learning_rate": 0.00012220524256511297, + "loss": 1.5033, + "step": 12450 + }, + { + "epoch": 0.44589682525471375, + "grad_norm": 1.6007272005081177, + "learning_rate": 0.00012219393297534744, + "loss": 1.0919, + "step": 12451 + }, + { + "epoch": 0.44593263738428207, + "grad_norm": 1.706794261932373, + "learning_rate": 0.00012218262308699119, + "loss": 1.319, + "step": 12452 + }, + { + "epoch": 0.44596844951385034, + "grad_norm": 1.502685785293579, + "learning_rate": 0.00012217131290019633, + "loss": 1.5424, + "step": 12453 + }, + { + "epoch": 0.4460042616434186, + "grad_norm": 1.5585651397705078, + "learning_rate": 0.00012216000241511507, + "loss": 1.5909, + "step": 12454 + }, + { + "epoch": 0.44604007377298693, + "grad_norm": 3.2422244548797607, + "learning_rate": 0.00012214869163189958, + "loss": 1.6386, + "step": 12455 + }, + { + "epoch": 0.4460758859025552, + "grad_norm": 1.7532734870910645, + "learning_rate": 0.00012213738055070195, + "loss": 1.59, + "step": 12456 + }, + { + "epoch": 0.44611169803212347, + "grad_norm": 1.761208176612854, + "learning_rate": 0.0001221260691716745, + "loss": 1.3127, + "step": 12457 + }, + { + "epoch": 0.44614751016169174, + "grad_norm": 1.8905987739562988, + "learning_rate": 0.0001221147574949693, + "loss": 1.4267, + "step": 12458 + }, + { + "epoch": 0.44618332229126006, + "grad_norm": 1.5984662771224976, + "learning_rate": 0.00012210344552073855, + "loss": 1.8281, + "step": 12459 + }, + { + "epoch": 0.44621913442082833, + "grad_norm": 1.3725858926773071, + "learning_rate": 0.00012209213324913446, + "loss": 1.4297, + "step": 12460 + }, + { + "epoch": 0.4462549465503966, + "grad_norm": 2.176337242126465, + "learning_rate": 0.00012208082068030924, + "loss": 1.7024, + "step": 12461 + }, + { + "epoch": 0.4462907586799649, + "grad_norm": 1.6479774713516235, + "learning_rate": 0.00012206950781441502, + "loss": 1.2077, + "step": 12462 + }, + { + "epoch": 0.4463265708095332, + "grad_norm": 1.9320515394210815, + "learning_rate": 0.00012205819465160407, + "loss": 1.8261, + "step": 12463 + }, + { + "epoch": 0.44636238293910147, + "grad_norm": 1.9457646608352661, + "learning_rate": 0.00012204688119202852, + "loss": 1.6547, + "step": 12464 + }, + { + "epoch": 0.44639819506866973, + "grad_norm": 1.4203673601150513, + "learning_rate": 0.00012203556743584063, + "loss": 1.449, + "step": 12465 + }, + { + "epoch": 0.44643400719823806, + "grad_norm": 1.9975597858428955, + "learning_rate": 0.00012202425338319265, + "loss": 1.7199, + "step": 12466 + }, + { + "epoch": 0.4464698193278063, + "grad_norm": 1.3389594554901123, + "learning_rate": 0.00012201293903423675, + "loss": 1.3139, + "step": 12467 + }, + { + "epoch": 0.4465056314573746, + "grad_norm": 1.865659475326538, + "learning_rate": 0.00012200162438912512, + "loss": 1.7206, + "step": 12468 + }, + { + "epoch": 0.4465414435869429, + "grad_norm": 2.4966471195220947, + "learning_rate": 0.00012199030944801, + "loss": 1.8555, + "step": 12469 + }, + { + "epoch": 0.4465772557165112, + "grad_norm": 1.775084137916565, + "learning_rate": 0.00012197899421104367, + "loss": 1.52, + "step": 12470 + }, + { + "epoch": 0.44661306784607946, + "grad_norm": 1.4114207029342651, + "learning_rate": 0.00012196767867837829, + "loss": 1.531, + "step": 12471 + }, + { + "epoch": 0.44664887997564773, + "grad_norm": 1.6351429224014282, + "learning_rate": 0.00012195636285016614, + "loss": 1.5006, + "step": 12472 + }, + { + "epoch": 0.44668469210521605, + "grad_norm": 1.3841909170150757, + "learning_rate": 0.00012194504672655944, + "loss": 1.4175, + "step": 12473 + }, + { + "epoch": 0.4467205042347843, + "grad_norm": 1.8590139150619507, + "learning_rate": 0.00012193373030771046, + "loss": 1.4761, + "step": 12474 + }, + { + "epoch": 0.4467563163643526, + "grad_norm": 1.685275673866272, + "learning_rate": 0.00012192241359377143, + "loss": 1.5694, + "step": 12475 + }, + { + "epoch": 0.4467921284939209, + "grad_norm": 1.7317240238189697, + "learning_rate": 0.00012191109658489462, + "loss": 1.3867, + "step": 12476 + }, + { + "epoch": 0.4468279406234892, + "grad_norm": 2.1701266765594482, + "learning_rate": 0.00012189977928123224, + "loss": 1.6338, + "step": 12477 + }, + { + "epoch": 0.44686375275305745, + "grad_norm": 1.709526538848877, + "learning_rate": 0.0001218884616829366, + "loss": 1.4318, + "step": 12478 + }, + { + "epoch": 0.4468995648826257, + "grad_norm": 2.59258770942688, + "learning_rate": 0.00012187714379015993, + "loss": 1.8929, + "step": 12479 + }, + { + "epoch": 0.44693537701219405, + "grad_norm": 1.6843186616897583, + "learning_rate": 0.00012186582560305448, + "loss": 1.4508, + "step": 12480 + }, + { + "epoch": 0.4469711891417623, + "grad_norm": 1.3819479942321777, + "learning_rate": 0.0001218545071217726, + "loss": 1.5629, + "step": 12481 + }, + { + "epoch": 0.4470070012713306, + "grad_norm": 2.069230079650879, + "learning_rate": 0.00012184318834646648, + "loss": 1.745, + "step": 12482 + }, + { + "epoch": 0.4470428134008989, + "grad_norm": 1.6494131088256836, + "learning_rate": 0.00012183186927728846, + "loss": 1.4686, + "step": 12483 + }, + { + "epoch": 0.4470786255304672, + "grad_norm": 1.2454302310943604, + "learning_rate": 0.00012182054991439078, + "loss": 1.2643, + "step": 12484 + }, + { + "epoch": 0.44711443766003545, + "grad_norm": 1.628246784210205, + "learning_rate": 0.00012180923025792579, + "loss": 1.6646, + "step": 12485 + }, + { + "epoch": 0.4471502497896037, + "grad_norm": 1.3497425317764282, + "learning_rate": 0.00012179791030804573, + "loss": 1.4701, + "step": 12486 + }, + { + "epoch": 0.44718606191917204, + "grad_norm": 1.5856043100357056, + "learning_rate": 0.00012178659006490285, + "loss": 1.625, + "step": 12487 + }, + { + "epoch": 0.4472218740487403, + "grad_norm": 2.0601372718811035, + "learning_rate": 0.00012177526952864955, + "loss": 1.6557, + "step": 12488 + }, + { + "epoch": 0.4472576861783086, + "grad_norm": 1.5500800609588623, + "learning_rate": 0.00012176394869943805, + "loss": 1.7003, + "step": 12489 + }, + { + "epoch": 0.4472934983078769, + "grad_norm": 1.6595675945281982, + "learning_rate": 0.00012175262757742074, + "loss": 1.5211, + "step": 12490 + }, + { + "epoch": 0.44732931043744517, + "grad_norm": 1.8388330936431885, + "learning_rate": 0.00012174130616274985, + "loss": 1.5596, + "step": 12491 + }, + { + "epoch": 0.44736512256701344, + "grad_norm": 1.7560348510742188, + "learning_rate": 0.00012172998445557775, + "loss": 1.4747, + "step": 12492 + }, + { + "epoch": 0.4474009346965817, + "grad_norm": 2.180821180343628, + "learning_rate": 0.00012171866245605671, + "loss": 1.6482, + "step": 12493 + }, + { + "epoch": 0.44743674682615003, + "grad_norm": 1.6616812944412231, + "learning_rate": 0.00012170734016433914, + "loss": 1.4381, + "step": 12494 + }, + { + "epoch": 0.4474725589557183, + "grad_norm": 1.351104736328125, + "learning_rate": 0.00012169601758057727, + "loss": 1.6061, + "step": 12495 + }, + { + "epoch": 0.44750837108528657, + "grad_norm": 1.8594951629638672, + "learning_rate": 0.00012168469470492345, + "loss": 1.5047, + "step": 12496 + }, + { + "epoch": 0.4475441832148549, + "grad_norm": 1.516777515411377, + "learning_rate": 0.00012167337153753007, + "loss": 1.9187, + "step": 12497 + }, + { + "epoch": 0.44757999534442316, + "grad_norm": 1.9803366661071777, + "learning_rate": 0.00012166204807854942, + "loss": 1.584, + "step": 12498 + }, + { + "epoch": 0.44761580747399143, + "grad_norm": 1.7764346599578857, + "learning_rate": 0.00012165072432813385, + "loss": 1.388, + "step": 12499 + }, + { + "epoch": 0.4476516196035597, + "grad_norm": 1.7678972482681274, + "learning_rate": 0.0001216394002864357, + "loss": 1.3775, + "step": 12500 + }, + { + "epoch": 0.447687431733128, + "grad_norm": 1.4079352617263794, + "learning_rate": 0.00012162807595360737, + "loss": 1.5185, + "step": 12501 + }, + { + "epoch": 0.4477232438626963, + "grad_norm": 1.933819055557251, + "learning_rate": 0.00012161675132980114, + "loss": 1.5765, + "step": 12502 + }, + { + "epoch": 0.44775905599226457, + "grad_norm": 2.9456725120544434, + "learning_rate": 0.00012160542641516945, + "loss": 1.8331, + "step": 12503 + }, + { + "epoch": 0.4477948681218329, + "grad_norm": 1.9133472442626953, + "learning_rate": 0.00012159410120986456, + "loss": 1.6421, + "step": 12504 + }, + { + "epoch": 0.44783068025140116, + "grad_norm": 1.5416887998580933, + "learning_rate": 0.00012158277571403893, + "loss": 1.3045, + "step": 12505 + }, + { + "epoch": 0.4478664923809694, + "grad_norm": 2.035646677017212, + "learning_rate": 0.00012157144992784486, + "loss": 1.4106, + "step": 12506 + }, + { + "epoch": 0.4479023045105377, + "grad_norm": 1.2931156158447266, + "learning_rate": 0.00012156012385143479, + "loss": 1.7832, + "step": 12507 + }, + { + "epoch": 0.447938116640106, + "grad_norm": 1.6115169525146484, + "learning_rate": 0.00012154879748496104, + "loss": 1.4598, + "step": 12508 + }, + { + "epoch": 0.4479739287696743, + "grad_norm": 2.0151591300964355, + "learning_rate": 0.00012153747082857601, + "loss": 1.7758, + "step": 12509 + }, + { + "epoch": 0.44800974089924256, + "grad_norm": 1.969767451286316, + "learning_rate": 0.00012152614388243213, + "loss": 1.692, + "step": 12510 + }, + { + "epoch": 0.4480455530288109, + "grad_norm": 2.38447904586792, + "learning_rate": 0.00012151481664668175, + "loss": 1.5807, + "step": 12511 + }, + { + "epoch": 0.44808136515837915, + "grad_norm": 1.623461127281189, + "learning_rate": 0.00012150348912147723, + "loss": 1.3877, + "step": 12512 + }, + { + "epoch": 0.4481171772879474, + "grad_norm": 1.4375425577163696, + "learning_rate": 0.000121492161306971, + "loss": 1.5076, + "step": 12513 + }, + { + "epoch": 0.4481529894175157, + "grad_norm": 1.580809473991394, + "learning_rate": 0.00012148083320331549, + "loss": 1.5652, + "step": 12514 + }, + { + "epoch": 0.448188801547084, + "grad_norm": 2.1061019897460938, + "learning_rate": 0.00012146950481066304, + "loss": 1.6577, + "step": 12515 + }, + { + "epoch": 0.4482246136766523, + "grad_norm": 1.5830954313278198, + "learning_rate": 0.00012145817612916612, + "loss": 1.5543, + "step": 12516 + }, + { + "epoch": 0.44826042580622055, + "grad_norm": 1.8224624395370483, + "learning_rate": 0.00012144684715897711, + "loss": 1.2341, + "step": 12517 + }, + { + "epoch": 0.4482962379357889, + "grad_norm": 1.5227636098861694, + "learning_rate": 0.00012143551790024848, + "loss": 1.641, + "step": 12518 + }, + { + "epoch": 0.44833205006535715, + "grad_norm": 1.8081822395324707, + "learning_rate": 0.00012142418835313254, + "loss": 1.3633, + "step": 12519 + }, + { + "epoch": 0.4483678621949254, + "grad_norm": 1.5999212265014648, + "learning_rate": 0.00012141285851778183, + "loss": 1.444, + "step": 12520 + }, + { + "epoch": 0.4484036743244937, + "grad_norm": 2.473132371902466, + "learning_rate": 0.00012140152839434873, + "loss": 1.7155, + "step": 12521 + }, + { + "epoch": 0.448439486454062, + "grad_norm": 2.2659380435943604, + "learning_rate": 0.00012139019798298563, + "loss": 1.7137, + "step": 12522 + }, + { + "epoch": 0.4484752985836303, + "grad_norm": 2.1838572025299072, + "learning_rate": 0.00012137886728384504, + "loss": 1.5565, + "step": 12523 + }, + { + "epoch": 0.44851111071319855, + "grad_norm": 1.3856747150421143, + "learning_rate": 0.00012136753629707936, + "loss": 1.5583, + "step": 12524 + }, + { + "epoch": 0.44854692284276687, + "grad_norm": 1.9057285785675049, + "learning_rate": 0.00012135620502284104, + "loss": 1.5314, + "step": 12525 + }, + { + "epoch": 0.44858273497233514, + "grad_norm": 1.7440130710601807, + "learning_rate": 0.00012134487346128252, + "loss": 1.6575, + "step": 12526 + }, + { + "epoch": 0.4486185471019034, + "grad_norm": 2.055412769317627, + "learning_rate": 0.00012133354161255628, + "loss": 1.4412, + "step": 12527 + }, + { + "epoch": 0.4486543592314717, + "grad_norm": 1.5365710258483887, + "learning_rate": 0.00012132220947681472, + "loss": 1.2439, + "step": 12528 + }, + { + "epoch": 0.44869017136104, + "grad_norm": 1.7394073009490967, + "learning_rate": 0.00012131087705421036, + "loss": 1.5784, + "step": 12529 + }, + { + "epoch": 0.44872598349060827, + "grad_norm": 1.6387853622436523, + "learning_rate": 0.00012129954434489566, + "loss": 1.4735, + "step": 12530 + }, + { + "epoch": 0.44876179562017654, + "grad_norm": 1.3802781105041504, + "learning_rate": 0.00012128821134902302, + "loss": 1.4855, + "step": 12531 + }, + { + "epoch": 0.44879760774974486, + "grad_norm": 2.449227809906006, + "learning_rate": 0.00012127687806674499, + "loss": 1.7276, + "step": 12532 + }, + { + "epoch": 0.44883341987931313, + "grad_norm": 1.6007570028305054, + "learning_rate": 0.00012126554449821399, + "loss": 1.549, + "step": 12533 + }, + { + "epoch": 0.4488692320088814, + "grad_norm": 1.3325066566467285, + "learning_rate": 0.00012125421064358253, + "loss": 1.3366, + "step": 12534 + }, + { + "epoch": 0.44890504413844967, + "grad_norm": 1.4373480081558228, + "learning_rate": 0.00012124287650300307, + "loss": 1.6836, + "step": 12535 + }, + { + "epoch": 0.448940856268018, + "grad_norm": 2.2037734985351562, + "learning_rate": 0.00012123154207662815, + "loss": 1.4235, + "step": 12536 + }, + { + "epoch": 0.44897666839758626, + "grad_norm": 1.5827618837356567, + "learning_rate": 0.00012122020736461018, + "loss": 1.5252, + "step": 12537 + }, + { + "epoch": 0.44901248052715453, + "grad_norm": 1.817306399345398, + "learning_rate": 0.0001212088723671017, + "loss": 1.6918, + "step": 12538 + }, + { + "epoch": 0.44904829265672286, + "grad_norm": 1.913685917854309, + "learning_rate": 0.0001211975370842552, + "loss": 1.6136, + "step": 12539 + }, + { + "epoch": 0.4490841047862911, + "grad_norm": 2.0694994926452637, + "learning_rate": 0.00012118620151622317, + "loss": 1.3688, + "step": 12540 + }, + { + "epoch": 0.4491199169158594, + "grad_norm": 1.696552038192749, + "learning_rate": 0.00012117486566315814, + "loss": 1.4033, + "step": 12541 + }, + { + "epoch": 0.44915572904542767, + "grad_norm": 1.6482453346252441, + "learning_rate": 0.0001211635295252126, + "loss": 1.4401, + "step": 12542 + }, + { + "epoch": 0.449191541174996, + "grad_norm": 1.2823840379714966, + "learning_rate": 0.0001211521931025391, + "loss": 1.4209, + "step": 12543 + }, + { + "epoch": 0.44922735330456426, + "grad_norm": 1.704340934753418, + "learning_rate": 0.00012114085639529007, + "loss": 1.5546, + "step": 12544 + }, + { + "epoch": 0.4492631654341325, + "grad_norm": 1.6278151273727417, + "learning_rate": 0.00012112951940361812, + "loss": 1.6255, + "step": 12545 + }, + { + "epoch": 0.44929897756370085, + "grad_norm": 2.9371583461761475, + "learning_rate": 0.00012111818212767572, + "loss": 1.8694, + "step": 12546 + }, + { + "epoch": 0.4493347896932691, + "grad_norm": 1.6199440956115723, + "learning_rate": 0.00012110684456761547, + "loss": 1.1853, + "step": 12547 + }, + { + "epoch": 0.4493706018228374, + "grad_norm": 1.508470892906189, + "learning_rate": 0.0001210955067235898, + "loss": 1.4939, + "step": 12548 + }, + { + "epoch": 0.44940641395240566, + "grad_norm": 1.4398698806762695, + "learning_rate": 0.00012108416859575131, + "loss": 1.5191, + "step": 12549 + }, + { + "epoch": 0.449442226081974, + "grad_norm": 1.856343150138855, + "learning_rate": 0.00012107283018425256, + "loss": 1.8702, + "step": 12550 + }, + { + "epoch": 0.44947803821154225, + "grad_norm": 1.3848611116409302, + "learning_rate": 0.00012106149148924602, + "loss": 1.6158, + "step": 12551 + }, + { + "epoch": 0.4495138503411105, + "grad_norm": 1.683535099029541, + "learning_rate": 0.0001210501525108843, + "loss": 1.8212, + "step": 12552 + }, + { + "epoch": 0.44954966247067885, + "grad_norm": 1.8491222858428955, + "learning_rate": 0.00012103881324931991, + "loss": 1.3237, + "step": 12553 + }, + { + "epoch": 0.4495854746002471, + "grad_norm": 1.5038716793060303, + "learning_rate": 0.00012102747370470546, + "loss": 1.2653, + "step": 12554 + }, + { + "epoch": 0.4496212867298154, + "grad_norm": 1.5144926309585571, + "learning_rate": 0.00012101613387719348, + "loss": 1.6953, + "step": 12555 + }, + { + "epoch": 0.44965709885938365, + "grad_norm": 2.2025644779205322, + "learning_rate": 0.00012100479376693652, + "loss": 1.5872, + "step": 12556 + }, + { + "epoch": 0.449692910988952, + "grad_norm": 1.7867844104766846, + "learning_rate": 0.00012099345337408712, + "loss": 1.371, + "step": 12557 + }, + { + "epoch": 0.44972872311852025, + "grad_norm": 1.6688547134399414, + "learning_rate": 0.00012098211269879791, + "loss": 1.3092, + "step": 12558 + }, + { + "epoch": 0.4497645352480885, + "grad_norm": 1.5913275480270386, + "learning_rate": 0.00012097077174122143, + "loss": 1.6201, + "step": 12559 + }, + { + "epoch": 0.44980034737765684, + "grad_norm": 1.4191871881484985, + "learning_rate": 0.00012095943050151026, + "loss": 1.5449, + "step": 12560 + }, + { + "epoch": 0.4498361595072251, + "grad_norm": 1.5668357610702515, + "learning_rate": 0.000120948088979817, + "loss": 1.2965, + "step": 12561 + }, + { + "epoch": 0.4498719716367934, + "grad_norm": 1.4503635168075562, + "learning_rate": 0.00012093674717629419, + "loss": 1.5289, + "step": 12562 + }, + { + "epoch": 0.44990778376636165, + "grad_norm": 1.6835050582885742, + "learning_rate": 0.00012092540509109451, + "loss": 1.5644, + "step": 12563 + }, + { + "epoch": 0.44994359589592997, + "grad_norm": 1.895340919494629, + "learning_rate": 0.00012091406272437049, + "loss": 1.3566, + "step": 12564 + }, + { + "epoch": 0.44997940802549824, + "grad_norm": 1.6163996458053589, + "learning_rate": 0.00012090272007627472, + "loss": 1.5901, + "step": 12565 + }, + { + "epoch": 0.4500152201550665, + "grad_norm": 1.5837438106536865, + "learning_rate": 0.0001208913771469598, + "loss": 1.3175, + "step": 12566 + }, + { + "epoch": 0.45005103228463483, + "grad_norm": 1.9061717987060547, + "learning_rate": 0.00012088003393657837, + "loss": 1.4182, + "step": 12567 + }, + { + "epoch": 0.4500868444142031, + "grad_norm": 1.5280214548110962, + "learning_rate": 0.00012086869044528297, + "loss": 1.5659, + "step": 12568 + }, + { + "epoch": 0.45012265654377137, + "grad_norm": 2.357043743133545, + "learning_rate": 0.0001208573466732263, + "loss": 1.5281, + "step": 12569 + }, + { + "epoch": 0.45015846867333964, + "grad_norm": 1.4980483055114746, + "learning_rate": 0.00012084600262056094, + "loss": 1.6226, + "step": 12570 + }, + { + "epoch": 0.45019428080290796, + "grad_norm": 1.7290112972259521, + "learning_rate": 0.0001208346582874395, + "loss": 1.5311, + "step": 12571 + }, + { + "epoch": 0.45023009293247623, + "grad_norm": 1.7056915760040283, + "learning_rate": 0.00012082331367401458, + "loss": 1.532, + "step": 12572 + }, + { + "epoch": 0.4502659050620445, + "grad_norm": 2.0714375972747803, + "learning_rate": 0.00012081196878043885, + "loss": 1.2363, + "step": 12573 + }, + { + "epoch": 0.4503017171916128, + "grad_norm": 1.90829598903656, + "learning_rate": 0.00012080062360686495, + "loss": 1.5991, + "step": 12574 + }, + { + "epoch": 0.4503375293211811, + "grad_norm": 1.3349385261535645, + "learning_rate": 0.00012078927815344545, + "loss": 1.2521, + "step": 12575 + }, + { + "epoch": 0.45037334145074936, + "grad_norm": 1.827406883239746, + "learning_rate": 0.00012077793242033307, + "loss": 1.6133, + "step": 12576 + }, + { + "epoch": 0.45040915358031763, + "grad_norm": 1.918264627456665, + "learning_rate": 0.00012076658640768036, + "loss": 1.5396, + "step": 12577 + }, + { + "epoch": 0.45044496570988596, + "grad_norm": 1.7360564470291138, + "learning_rate": 0.00012075524011564005, + "loss": 1.3884, + "step": 12578 + }, + { + "epoch": 0.4504807778394542, + "grad_norm": 1.4378196001052856, + "learning_rate": 0.00012074389354436475, + "loss": 1.5432, + "step": 12579 + }, + { + "epoch": 0.4505165899690225, + "grad_norm": 1.7006964683532715, + "learning_rate": 0.00012073254669400713, + "loss": 1.5753, + "step": 12580 + }, + { + "epoch": 0.45055240209859077, + "grad_norm": 1.619866132736206, + "learning_rate": 0.00012072119956471981, + "loss": 1.5881, + "step": 12581 + }, + { + "epoch": 0.4505882142281591, + "grad_norm": 1.2958283424377441, + "learning_rate": 0.00012070985215665551, + "loss": 1.6152, + "step": 12582 + }, + { + "epoch": 0.45062402635772736, + "grad_norm": 2.0520436763763428, + "learning_rate": 0.00012069850446996686, + "loss": 1.7036, + "step": 12583 + }, + { + "epoch": 0.4506598384872956, + "grad_norm": 1.5955454111099243, + "learning_rate": 0.00012068715650480653, + "loss": 1.4784, + "step": 12584 + }, + { + "epoch": 0.45069565061686395, + "grad_norm": 2.244309425354004, + "learning_rate": 0.00012067580826132718, + "loss": 2.0482, + "step": 12585 + }, + { + "epoch": 0.4507314627464322, + "grad_norm": 1.449869155883789, + "learning_rate": 0.0001206644597396815, + "loss": 1.4782, + "step": 12586 + }, + { + "epoch": 0.4507672748760005, + "grad_norm": 1.647153615951538, + "learning_rate": 0.00012065311094002218, + "loss": 1.2063, + "step": 12587 + }, + { + "epoch": 0.45080308700556876, + "grad_norm": 1.6689027547836304, + "learning_rate": 0.00012064176186250189, + "loss": 1.7451, + "step": 12588 + }, + { + "epoch": 0.4508388991351371, + "grad_norm": 1.4815447330474854, + "learning_rate": 0.00012063041250727331, + "loss": 1.3996, + "step": 12589 + }, + { + "epoch": 0.45087471126470535, + "grad_norm": 1.4417043924331665, + "learning_rate": 0.00012061906287448914, + "loss": 1.5646, + "step": 12590 + }, + { + "epoch": 0.4509105233942736, + "grad_norm": 2.2002885341644287, + "learning_rate": 0.00012060771296430209, + "loss": 1.3061, + "step": 12591 + }, + { + "epoch": 0.45094633552384195, + "grad_norm": 1.4272916316986084, + "learning_rate": 0.00012059636277686486, + "loss": 1.0158, + "step": 12592 + }, + { + "epoch": 0.4509821476534102, + "grad_norm": 1.6139081716537476, + "learning_rate": 0.00012058501231233011, + "loss": 1.4949, + "step": 12593 + }, + { + "epoch": 0.4510179597829785, + "grad_norm": 1.6155054569244385, + "learning_rate": 0.00012057366157085058, + "loss": 1.3464, + "step": 12594 + }, + { + "epoch": 0.45105377191254675, + "grad_norm": 1.561390995979309, + "learning_rate": 0.00012056231055257896, + "loss": 1.3473, + "step": 12595 + }, + { + "epoch": 0.4510895840421151, + "grad_norm": 1.356062650680542, + "learning_rate": 0.000120550959257668, + "loss": 1.3973, + "step": 12596 + }, + { + "epoch": 0.45112539617168335, + "grad_norm": 1.578811526298523, + "learning_rate": 0.00012053960768627036, + "loss": 1.5595, + "step": 12597 + }, + { + "epoch": 0.4511612083012516, + "grad_norm": 2.086986780166626, + "learning_rate": 0.00012052825583853881, + "loss": 1.3671, + "step": 12598 + }, + { + "epoch": 0.45119702043081994, + "grad_norm": 2.0918164253234863, + "learning_rate": 0.00012051690371462608, + "loss": 1.1172, + "step": 12599 + }, + { + "epoch": 0.4512328325603882, + "grad_norm": 1.4624220132827759, + "learning_rate": 0.00012050555131468484, + "loss": 1.4732, + "step": 12600 + }, + { + "epoch": 0.4512686446899565, + "grad_norm": 1.6903061866760254, + "learning_rate": 0.00012049419863886786, + "loss": 1.6673, + "step": 12601 + }, + { + "epoch": 0.45130445681952475, + "grad_norm": 1.5074726343154907, + "learning_rate": 0.00012048284568732791, + "loss": 2.0319, + "step": 12602 + }, + { + "epoch": 0.45134026894909307, + "grad_norm": 2.3268983364105225, + "learning_rate": 0.00012047149246021763, + "loss": 1.5876, + "step": 12603 + }, + { + "epoch": 0.45137608107866134, + "grad_norm": 1.8839945793151855, + "learning_rate": 0.00012046013895768986, + "loss": 1.5132, + "step": 12604 + }, + { + "epoch": 0.4514118932082296, + "grad_norm": 1.4591853618621826, + "learning_rate": 0.0001204487851798973, + "loss": 1.6994, + "step": 12605 + }, + { + "epoch": 0.45144770533779793, + "grad_norm": 1.9590816497802734, + "learning_rate": 0.00012043743112699273, + "loss": 1.2946, + "step": 12606 + }, + { + "epoch": 0.4514835174673662, + "grad_norm": 1.7824480533599854, + "learning_rate": 0.0001204260767991289, + "loss": 1.4297, + "step": 12607 + }, + { + "epoch": 0.45151932959693447, + "grad_norm": 2.672687292098999, + "learning_rate": 0.00012041472219645854, + "loss": 1.4044, + "step": 12608 + }, + { + "epoch": 0.45155514172650274, + "grad_norm": 2.600595474243164, + "learning_rate": 0.00012040336731913442, + "loss": 1.8836, + "step": 12609 + }, + { + "epoch": 0.45159095385607106, + "grad_norm": 1.701601266860962, + "learning_rate": 0.00012039201216730931, + "loss": 1.4913, + "step": 12610 + }, + { + "epoch": 0.45162676598563933, + "grad_norm": 2.1332130432128906, + "learning_rate": 0.00012038065674113598, + "loss": 1.8227, + "step": 12611 + }, + { + "epoch": 0.4516625781152076, + "grad_norm": 1.9110596179962158, + "learning_rate": 0.0001203693010407672, + "loss": 1.6782, + "step": 12612 + }, + { + "epoch": 0.4516983902447759, + "grad_norm": 1.5340908765792847, + "learning_rate": 0.00012035794506635575, + "loss": 1.5475, + "step": 12613 + }, + { + "epoch": 0.4517342023743442, + "grad_norm": 1.7282644510269165, + "learning_rate": 0.0001203465888180544, + "loss": 1.8253, + "step": 12614 + }, + { + "epoch": 0.45177001450391246, + "grad_norm": 2.7363064289093018, + "learning_rate": 0.00012033523229601598, + "loss": 1.4577, + "step": 12615 + }, + { + "epoch": 0.45180582663348073, + "grad_norm": 1.7300324440002441, + "learning_rate": 0.00012032387550039319, + "loss": 1.6123, + "step": 12616 + }, + { + "epoch": 0.45184163876304906, + "grad_norm": 1.5794569253921509, + "learning_rate": 0.00012031251843133891, + "loss": 1.5446, + "step": 12617 + }, + { + "epoch": 0.4518774508926173, + "grad_norm": 2.11045241355896, + "learning_rate": 0.00012030116108900589, + "loss": 1.5784, + "step": 12618 + }, + { + "epoch": 0.4519132630221856, + "grad_norm": 1.8082817792892456, + "learning_rate": 0.00012028980347354692, + "loss": 1.3738, + "step": 12619 + }, + { + "epoch": 0.4519490751517539, + "grad_norm": 1.3321770429611206, + "learning_rate": 0.00012027844558511483, + "loss": 1.6105, + "step": 12620 + }, + { + "epoch": 0.4519848872813222, + "grad_norm": 1.5995198488235474, + "learning_rate": 0.00012026708742386239, + "loss": 1.3222, + "step": 12621 + }, + { + "epoch": 0.45202069941089046, + "grad_norm": 2.029466390609741, + "learning_rate": 0.00012025572898994246, + "loss": 1.4356, + "step": 12622 + }, + { + "epoch": 0.4520565115404587, + "grad_norm": 1.4476046562194824, + "learning_rate": 0.00012024437028350779, + "loss": 1.5459, + "step": 12623 + }, + { + "epoch": 0.45209232367002705, + "grad_norm": 1.6885541677474976, + "learning_rate": 0.00012023301130471128, + "loss": 1.4401, + "step": 12624 + }, + { + "epoch": 0.4521281357995953, + "grad_norm": 1.6791876554489136, + "learning_rate": 0.00012022165205370565, + "loss": 1.6886, + "step": 12625 + }, + { + "epoch": 0.4521639479291636, + "grad_norm": 1.5566707849502563, + "learning_rate": 0.00012021029253064382, + "loss": 1.6669, + "step": 12626 + }, + { + "epoch": 0.4521997600587319, + "grad_norm": 1.7910056114196777, + "learning_rate": 0.00012019893273567855, + "loss": 1.3655, + "step": 12627 + }, + { + "epoch": 0.4522355721883002, + "grad_norm": 1.7929489612579346, + "learning_rate": 0.00012018757266896267, + "loss": 1.563, + "step": 12628 + }, + { + "epoch": 0.45227138431786845, + "grad_norm": 2.385453701019287, + "learning_rate": 0.00012017621233064908, + "loss": 2.0782, + "step": 12629 + }, + { + "epoch": 0.4523071964474367, + "grad_norm": 1.5448579788208008, + "learning_rate": 0.00012016485172089056, + "loss": 1.3892, + "step": 12630 + }, + { + "epoch": 0.45234300857700505, + "grad_norm": 2.077671527862549, + "learning_rate": 0.00012015349083983998, + "loss": 1.3285, + "step": 12631 + }, + { + "epoch": 0.4523788207065733, + "grad_norm": 1.3240525722503662, + "learning_rate": 0.00012014212968765018, + "loss": 1.5076, + "step": 12632 + }, + { + "epoch": 0.4524146328361416, + "grad_norm": 2.3672759532928467, + "learning_rate": 0.000120130768264474, + "loss": 1.4313, + "step": 12633 + }, + { + "epoch": 0.4524504449657099, + "grad_norm": 1.4926321506500244, + "learning_rate": 0.00012011940657046427, + "loss": 1.6316, + "step": 12634 + }, + { + "epoch": 0.4524862570952782, + "grad_norm": 1.6162370443344116, + "learning_rate": 0.00012010804460577395, + "loss": 1.5586, + "step": 12635 + }, + { + "epoch": 0.45252206922484645, + "grad_norm": 1.8206548690795898, + "learning_rate": 0.00012009668237055578, + "loss": 1.8584, + "step": 12636 + }, + { + "epoch": 0.4525578813544147, + "grad_norm": 1.6547601222991943, + "learning_rate": 0.00012008531986496266, + "loss": 1.4219, + "step": 12637 + }, + { + "epoch": 0.45259369348398304, + "grad_norm": 1.5482653379440308, + "learning_rate": 0.0001200739570891475, + "loss": 1.4394, + "step": 12638 + }, + { + "epoch": 0.4526295056135513, + "grad_norm": 1.3440611362457275, + "learning_rate": 0.0001200625940432631, + "loss": 1.211, + "step": 12639 + }, + { + "epoch": 0.4526653177431196, + "grad_norm": 1.2386219501495361, + "learning_rate": 0.00012005123072746242, + "loss": 1.4767, + "step": 12640 + }, + { + "epoch": 0.4527011298726879, + "grad_norm": 2.0508835315704346, + "learning_rate": 0.00012003986714189825, + "loss": 1.305, + "step": 12641 + }, + { + "epoch": 0.45273694200225617, + "grad_norm": 1.6780390739440918, + "learning_rate": 0.00012002850328672357, + "loss": 1.7344, + "step": 12642 + }, + { + "epoch": 0.45277275413182444, + "grad_norm": 1.7541413307189941, + "learning_rate": 0.00012001713916209117, + "loss": 1.2988, + "step": 12643 + }, + { + "epoch": 0.4528085662613927, + "grad_norm": 4.736975193023682, + "learning_rate": 0.00012000577476815402, + "loss": 1.3954, + "step": 12644 + }, + { + "epoch": 0.45284437839096103, + "grad_norm": 1.9038360118865967, + "learning_rate": 0.00011999441010506496, + "loss": 1.5603, + "step": 12645 + }, + { + "epoch": 0.4528801905205293, + "grad_norm": 1.3158032894134521, + "learning_rate": 0.00011998304517297687, + "loss": 1.3335, + "step": 12646 + }, + { + "epoch": 0.45291600265009757, + "grad_norm": 1.6054844856262207, + "learning_rate": 0.00011997167997204272, + "loss": 1.5157, + "step": 12647 + }, + { + "epoch": 0.4529518147796659, + "grad_norm": 2.2230138778686523, + "learning_rate": 0.00011996031450241536, + "loss": 1.4866, + "step": 12648 + }, + { + "epoch": 0.45298762690923416, + "grad_norm": 2.4321999549865723, + "learning_rate": 0.00011994894876424773, + "loss": 1.7621, + "step": 12649 + }, + { + "epoch": 0.45302343903880243, + "grad_norm": 1.6048284769058228, + "learning_rate": 0.00011993758275769273, + "loss": 1.4613, + "step": 12650 + }, + { + "epoch": 0.4530592511683707, + "grad_norm": 1.7765322923660278, + "learning_rate": 0.00011992621648290328, + "loss": 1.4474, + "step": 12651 + }, + { + "epoch": 0.453095063297939, + "grad_norm": 1.6691166162490845, + "learning_rate": 0.00011991484994003226, + "loss": 1.5177, + "step": 12652 + }, + { + "epoch": 0.4531308754275073, + "grad_norm": 2.022249698638916, + "learning_rate": 0.00011990348312923266, + "loss": 1.697, + "step": 12653 + }, + { + "epoch": 0.45316668755707556, + "grad_norm": 2.6226868629455566, + "learning_rate": 0.00011989211605065733, + "loss": 1.6507, + "step": 12654 + }, + { + "epoch": 0.4532024996866439, + "grad_norm": 2.3299243450164795, + "learning_rate": 0.00011988074870445927, + "loss": 1.8215, + "step": 12655 + }, + { + "epoch": 0.45323831181621216, + "grad_norm": 1.7379266023635864, + "learning_rate": 0.00011986938109079133, + "loss": 1.4798, + "step": 12656 + }, + { + "epoch": 0.4532741239457804, + "grad_norm": 1.6245919466018677, + "learning_rate": 0.00011985801320980654, + "loss": 1.6791, + "step": 12657 + }, + { + "epoch": 0.4533099360753487, + "grad_norm": 1.2778525352478027, + "learning_rate": 0.00011984664506165777, + "loss": 1.4197, + "step": 12658 + }, + { + "epoch": 0.453345748204917, + "grad_norm": 1.747637152671814, + "learning_rate": 0.00011983527664649801, + "loss": 1.6668, + "step": 12659 + }, + { + "epoch": 0.4533815603344853, + "grad_norm": 2.747385025024414, + "learning_rate": 0.0001198239079644802, + "loss": 1.4045, + "step": 12660 + }, + { + "epoch": 0.45341737246405356, + "grad_norm": 2.079819917678833, + "learning_rate": 0.00011981253901575726, + "loss": 1.73, + "step": 12661 + }, + { + "epoch": 0.4534531845936219, + "grad_norm": 1.518641471862793, + "learning_rate": 0.00011980116980048217, + "loss": 1.6875, + "step": 12662 + }, + { + "epoch": 0.45348899672319015, + "grad_norm": 1.9608591794967651, + "learning_rate": 0.00011978980031880789, + "loss": 1.85, + "step": 12663 + }, + { + "epoch": 0.4535248088527584, + "grad_norm": 1.6720517873764038, + "learning_rate": 0.00011977843057088735, + "loss": 1.7237, + "step": 12664 + }, + { + "epoch": 0.4535606209823267, + "grad_norm": 2.0230205059051514, + "learning_rate": 0.00011976706055687357, + "loss": 1.5775, + "step": 12665 + }, + { + "epoch": 0.453596433111895, + "grad_norm": 1.5287965536117554, + "learning_rate": 0.00011975569027691947, + "loss": 1.6407, + "step": 12666 + }, + { + "epoch": 0.4536322452414633, + "grad_norm": 1.822375774383545, + "learning_rate": 0.00011974431973117804, + "loss": 1.761, + "step": 12667 + }, + { + "epoch": 0.45366805737103155, + "grad_norm": 1.4832029342651367, + "learning_rate": 0.00011973294891980224, + "loss": 1.5349, + "step": 12668 + }, + { + "epoch": 0.4537038695005999, + "grad_norm": 1.5550483465194702, + "learning_rate": 0.00011972157784294508, + "loss": 1.4935, + "step": 12669 + }, + { + "epoch": 0.45373968163016815, + "grad_norm": 1.5525519847869873, + "learning_rate": 0.00011971020650075954, + "loss": 1.2612, + "step": 12670 + }, + { + "epoch": 0.4537754937597364, + "grad_norm": 1.824698805809021, + "learning_rate": 0.00011969883489339862, + "loss": 1.3504, + "step": 12671 + }, + { + "epoch": 0.4538113058893047, + "grad_norm": 1.8953741788864136, + "learning_rate": 0.00011968746302101523, + "loss": 1.4381, + "step": 12672 + }, + { + "epoch": 0.453847118018873, + "grad_norm": 1.4722163677215576, + "learning_rate": 0.00011967609088376245, + "loss": 1.3717, + "step": 12673 + }, + { + "epoch": 0.4538829301484413, + "grad_norm": 1.3879319429397583, + "learning_rate": 0.00011966471848179324, + "loss": 1.4582, + "step": 12674 + }, + { + "epoch": 0.45391874227800955, + "grad_norm": 2.1239492893218994, + "learning_rate": 0.00011965334581526062, + "loss": 1.3412, + "step": 12675 + }, + { + "epoch": 0.45395455440757787, + "grad_norm": 1.5013140439987183, + "learning_rate": 0.00011964197288431756, + "loss": 1.3925, + "step": 12676 + }, + { + "epoch": 0.45399036653714614, + "grad_norm": 1.629763126373291, + "learning_rate": 0.00011963059968911712, + "loss": 1.7531, + "step": 12677 + }, + { + "epoch": 0.4540261786667144, + "grad_norm": 2.189053773880005, + "learning_rate": 0.00011961922622981225, + "loss": 1.5956, + "step": 12678 + }, + { + "epoch": 0.4540619907962827, + "grad_norm": 1.6427063941955566, + "learning_rate": 0.00011960785250655604, + "loss": 1.6511, + "step": 12679 + }, + { + "epoch": 0.454097802925851, + "grad_norm": 1.7626343965530396, + "learning_rate": 0.00011959647851950145, + "loss": 1.4323, + "step": 12680 + }, + { + "epoch": 0.45413361505541927, + "grad_norm": 1.5623730421066284, + "learning_rate": 0.0001195851042688015, + "loss": 1.7968, + "step": 12681 + }, + { + "epoch": 0.45416942718498754, + "grad_norm": 1.481370210647583, + "learning_rate": 0.00011957372975460925, + "loss": 1.5173, + "step": 12682 + }, + { + "epoch": 0.45420523931455586, + "grad_norm": 1.105037808418274, + "learning_rate": 0.00011956235497707771, + "loss": 1.403, + "step": 12683 + }, + { + "epoch": 0.45424105144412413, + "grad_norm": 1.2726575136184692, + "learning_rate": 0.00011955097993635991, + "loss": 1.0945, + "step": 12684 + }, + { + "epoch": 0.4542768635736924, + "grad_norm": 1.2600315809249878, + "learning_rate": 0.0001195396046326089, + "loss": 1.557, + "step": 12685 + }, + { + "epoch": 0.45431267570326067, + "grad_norm": 1.5838819742202759, + "learning_rate": 0.00011952822906597773, + "loss": 1.4059, + "step": 12686 + }, + { + "epoch": 0.454348487832829, + "grad_norm": 1.760190725326538, + "learning_rate": 0.0001195168532366194, + "loss": 1.3405, + "step": 12687 + }, + { + "epoch": 0.45438429996239726, + "grad_norm": 2.3661205768585205, + "learning_rate": 0.000119505477144687, + "loss": 1.6014, + "step": 12688 + }, + { + "epoch": 0.45442011209196553, + "grad_norm": 1.6117066144943237, + "learning_rate": 0.00011949410079033359, + "loss": 1.4842, + "step": 12689 + }, + { + "epoch": 0.45445592422153386, + "grad_norm": 1.467039942741394, + "learning_rate": 0.00011948272417371216, + "loss": 1.5621, + "step": 12690 + }, + { + "epoch": 0.4544917363511021, + "grad_norm": 1.4559403657913208, + "learning_rate": 0.00011947134729497583, + "loss": 1.6804, + "step": 12691 + }, + { + "epoch": 0.4545275484806704, + "grad_norm": 1.5294259786605835, + "learning_rate": 0.00011945997015427761, + "loss": 1.5, + "step": 12692 + }, + { + "epoch": 0.45456336061023866, + "grad_norm": 1.8137285709381104, + "learning_rate": 0.00011944859275177063, + "loss": 1.5028, + "step": 12693 + }, + { + "epoch": 0.454599172739807, + "grad_norm": 1.7047687768936157, + "learning_rate": 0.00011943721508760788, + "loss": 1.5601, + "step": 12694 + }, + { + "epoch": 0.45463498486937526, + "grad_norm": 1.5781142711639404, + "learning_rate": 0.00011942583716194251, + "loss": 1.5172, + "step": 12695 + }, + { + "epoch": 0.4546707969989435, + "grad_norm": 1.6754029989242554, + "learning_rate": 0.00011941445897492755, + "loss": 1.654, + "step": 12696 + }, + { + "epoch": 0.45470660912851185, + "grad_norm": 1.5918335914611816, + "learning_rate": 0.0001194030805267161, + "loss": 1.4808, + "step": 12697 + }, + { + "epoch": 0.4547424212580801, + "grad_norm": 1.8302372694015503, + "learning_rate": 0.0001193917018174612, + "loss": 1.2755, + "step": 12698 + }, + { + "epoch": 0.4547782333876484, + "grad_norm": 2.5428664684295654, + "learning_rate": 0.00011938032284731599, + "loss": 1.5935, + "step": 12699 + }, + { + "epoch": 0.45481404551721666, + "grad_norm": 1.475474238395691, + "learning_rate": 0.00011936894361643351, + "loss": 1.2114, + "step": 12700 + }, + { + "epoch": 0.454849857646785, + "grad_norm": 1.447190284729004, + "learning_rate": 0.00011935756412496688, + "loss": 1.4698, + "step": 12701 + }, + { + "epoch": 0.45488566977635325, + "grad_norm": 1.8733444213867188, + "learning_rate": 0.00011934618437306921, + "loss": 1.5448, + "step": 12702 + }, + { + "epoch": 0.4549214819059215, + "grad_norm": 1.7487825155258179, + "learning_rate": 0.00011933480436089357, + "loss": 1.3573, + "step": 12703 + }, + { + "epoch": 0.45495729403548985, + "grad_norm": 1.2866499423980713, + "learning_rate": 0.0001193234240885931, + "loss": 1.2512, + "step": 12704 + }, + { + "epoch": 0.4549931061650581, + "grad_norm": 2.3530335426330566, + "learning_rate": 0.00011931204355632089, + "loss": 1.5614, + "step": 12705 + }, + { + "epoch": 0.4550289182946264, + "grad_norm": 2.5091681480407715, + "learning_rate": 0.00011930066276423003, + "loss": 1.6465, + "step": 12706 + }, + { + "epoch": 0.45506473042419465, + "grad_norm": 1.8246722221374512, + "learning_rate": 0.00011928928171247362, + "loss": 1.2457, + "step": 12707 + }, + { + "epoch": 0.455100542553763, + "grad_norm": 1.3693405389785767, + "learning_rate": 0.00011927790040120484, + "loss": 1.4335, + "step": 12708 + }, + { + "epoch": 0.45513635468333125, + "grad_norm": 1.6477317810058594, + "learning_rate": 0.00011926651883057676, + "loss": 1.3098, + "step": 12709 + }, + { + "epoch": 0.4551721668128995, + "grad_norm": 1.7943549156188965, + "learning_rate": 0.00011925513700074253, + "loss": 1.4057, + "step": 12710 + }, + { + "epoch": 0.45520797894246784, + "grad_norm": 1.7328225374221802, + "learning_rate": 0.00011924375491185526, + "loss": 1.3484, + "step": 12711 + }, + { + "epoch": 0.4552437910720361, + "grad_norm": 1.3542251586914062, + "learning_rate": 0.00011923237256406812, + "loss": 1.549, + "step": 12712 + }, + { + "epoch": 0.4552796032016044, + "grad_norm": 1.7922602891921997, + "learning_rate": 0.00011922098995753417, + "loss": 1.9181, + "step": 12713 + }, + { + "epoch": 0.45531541533117265, + "grad_norm": 1.549342155456543, + "learning_rate": 0.00011920960709240662, + "loss": 1.543, + "step": 12714 + }, + { + "epoch": 0.45535122746074097, + "grad_norm": 1.754526972770691, + "learning_rate": 0.0001191982239688386, + "loss": 1.2378, + "step": 12715 + }, + { + "epoch": 0.45538703959030924, + "grad_norm": 1.7519073486328125, + "learning_rate": 0.00011918684058698319, + "loss": 1.4704, + "step": 12716 + }, + { + "epoch": 0.4554228517198775, + "grad_norm": 1.3553789854049683, + "learning_rate": 0.00011917545694699365, + "loss": 1.3812, + "step": 12717 + }, + { + "epoch": 0.45545866384944583, + "grad_norm": 1.7538580894470215, + "learning_rate": 0.00011916407304902302, + "loss": 1.7192, + "step": 12718 + }, + { + "epoch": 0.4554944759790141, + "grad_norm": 1.9250893592834473, + "learning_rate": 0.00011915268889322456, + "loss": 1.4597, + "step": 12719 + }, + { + "epoch": 0.45553028810858237, + "grad_norm": 1.8414658308029175, + "learning_rate": 0.00011914130447975131, + "loss": 1.8114, + "step": 12720 + }, + { + "epoch": 0.45556610023815064, + "grad_norm": 1.3821085691452026, + "learning_rate": 0.00011912991980875654, + "loss": 1.4411, + "step": 12721 + }, + { + "epoch": 0.45560191236771896, + "grad_norm": 1.5049561262130737, + "learning_rate": 0.00011911853488039337, + "loss": 1.4983, + "step": 12722 + }, + { + "epoch": 0.45563772449728723, + "grad_norm": 1.8283472061157227, + "learning_rate": 0.00011910714969481498, + "loss": 1.5525, + "step": 12723 + }, + { + "epoch": 0.4556735366268555, + "grad_norm": 1.6375936269760132, + "learning_rate": 0.00011909576425217455, + "loss": 1.8264, + "step": 12724 + }, + { + "epoch": 0.4557093487564238, + "grad_norm": 1.530905842781067, + "learning_rate": 0.0001190843785526252, + "loss": 1.5206, + "step": 12725 + }, + { + "epoch": 0.4557451608859921, + "grad_norm": 1.5139286518096924, + "learning_rate": 0.00011907299259632019, + "loss": 1.4437, + "step": 12726 + }, + { + "epoch": 0.45578097301556036, + "grad_norm": 1.5392321348190308, + "learning_rate": 0.00011906160638341264, + "loss": 1.43, + "step": 12727 + }, + { + "epoch": 0.45581678514512863, + "grad_norm": 2.803849458694458, + "learning_rate": 0.00011905021991405578, + "loss": 1.8596, + "step": 12728 + }, + { + "epoch": 0.45585259727469696, + "grad_norm": 1.545616865158081, + "learning_rate": 0.00011903883318840279, + "loss": 1.6796, + "step": 12729 + }, + { + "epoch": 0.4558884094042652, + "grad_norm": 1.9524319171905518, + "learning_rate": 0.00011902744620660686, + "loss": 1.559, + "step": 12730 + }, + { + "epoch": 0.4559242215338335, + "grad_norm": 2.1568334102630615, + "learning_rate": 0.00011901605896882116, + "loss": 1.5606, + "step": 12731 + }, + { + "epoch": 0.4559600336634018, + "grad_norm": 2.381507635116577, + "learning_rate": 0.00011900467147519893, + "loss": 1.2422, + "step": 12732 + }, + { + "epoch": 0.4559958457929701, + "grad_norm": 1.6769733428955078, + "learning_rate": 0.00011899328372589338, + "loss": 1.4349, + "step": 12733 + }, + { + "epoch": 0.45603165792253836, + "grad_norm": 1.7291309833526611, + "learning_rate": 0.00011898189572105767, + "loss": 1.5192, + "step": 12734 + }, + { + "epoch": 0.4560674700521066, + "grad_norm": 2.131802797317505, + "learning_rate": 0.00011897050746084504, + "loss": 1.3411, + "step": 12735 + }, + { + "epoch": 0.45610328218167495, + "grad_norm": 1.5254218578338623, + "learning_rate": 0.0001189591189454087, + "loss": 1.3057, + "step": 12736 + }, + { + "epoch": 0.4561390943112432, + "grad_norm": 1.46796715259552, + "learning_rate": 0.00011894773017490189, + "loss": 1.2683, + "step": 12737 + }, + { + "epoch": 0.4561749064408115, + "grad_norm": 1.5670194625854492, + "learning_rate": 0.00011893634114947778, + "loss": 1.3185, + "step": 12738 + }, + { + "epoch": 0.4562107185703798, + "grad_norm": 1.6794108152389526, + "learning_rate": 0.00011892495186928966, + "loss": 1.5493, + "step": 12739 + }, + { + "epoch": 0.4562465306999481, + "grad_norm": 1.5749014616012573, + "learning_rate": 0.00011891356233449069, + "loss": 1.4679, + "step": 12740 + }, + { + "epoch": 0.45628234282951635, + "grad_norm": 1.948190450668335, + "learning_rate": 0.0001189021725452342, + "loss": 1.5716, + "step": 12741 + }, + { + "epoch": 0.4563181549590846, + "grad_norm": 1.7379204034805298, + "learning_rate": 0.00011889078250167329, + "loss": 1.4168, + "step": 12742 + }, + { + "epoch": 0.45635396708865295, + "grad_norm": 1.6801133155822754, + "learning_rate": 0.0001188793922039613, + "loss": 1.1718, + "step": 12743 + }, + { + "epoch": 0.4563897792182212, + "grad_norm": 2.2052829265594482, + "learning_rate": 0.00011886800165225143, + "loss": 1.5625, + "step": 12744 + }, + { + "epoch": 0.4564255913477895, + "grad_norm": 1.8127167224884033, + "learning_rate": 0.00011885661084669693, + "loss": 1.4002, + "step": 12745 + }, + { + "epoch": 0.4564614034773578, + "grad_norm": 1.4773932695388794, + "learning_rate": 0.00011884521978745106, + "loss": 1.6552, + "step": 12746 + }, + { + "epoch": 0.4564972156069261, + "grad_norm": 1.6315429210662842, + "learning_rate": 0.00011883382847466706, + "loss": 1.6935, + "step": 12747 + }, + { + "epoch": 0.45653302773649435, + "grad_norm": 2.3406314849853516, + "learning_rate": 0.00011882243690849824, + "loss": 1.343, + "step": 12748 + }, + { + "epoch": 0.4565688398660626, + "grad_norm": 1.2826876640319824, + "learning_rate": 0.00011881104508909778, + "loss": 1.4219, + "step": 12749 + }, + { + "epoch": 0.45660465199563094, + "grad_norm": 1.8661410808563232, + "learning_rate": 0.00011879965301661897, + "loss": 1.7659, + "step": 12750 + }, + { + "epoch": 0.4566404641251992, + "grad_norm": 1.9449056386947632, + "learning_rate": 0.00011878826069121505, + "loss": 1.4855, + "step": 12751 + }, + { + "epoch": 0.4566762762547675, + "grad_norm": 1.3876155614852905, + "learning_rate": 0.00011877686811303937, + "loss": 1.3685, + "step": 12752 + }, + { + "epoch": 0.4567120883843358, + "grad_norm": 1.7499020099639893, + "learning_rate": 0.00011876547528224511, + "loss": 1.6676, + "step": 12753 + }, + { + "epoch": 0.45674790051390407, + "grad_norm": 1.5707976818084717, + "learning_rate": 0.00011875408219898561, + "loss": 1.6834, + "step": 12754 + }, + { + "epoch": 0.45678371264347234, + "grad_norm": 1.6355860233306885, + "learning_rate": 0.00011874268886341409, + "loss": 1.6243, + "step": 12755 + }, + { + "epoch": 0.4568195247730406, + "grad_norm": 1.9601993560791016, + "learning_rate": 0.00011873129527568388, + "loss": 1.6477, + "step": 12756 + }, + { + "epoch": 0.45685533690260893, + "grad_norm": 1.5781079530715942, + "learning_rate": 0.00011871990143594827, + "loss": 1.543, + "step": 12757 + }, + { + "epoch": 0.4568911490321772, + "grad_norm": 2.1427834033966064, + "learning_rate": 0.00011870850734436054, + "loss": 1.5517, + "step": 12758 + }, + { + "epoch": 0.45692696116174547, + "grad_norm": 1.4001954793930054, + "learning_rate": 0.00011869711300107398, + "loss": 1.3041, + "step": 12759 + }, + { + "epoch": 0.4569627732913138, + "grad_norm": 1.4365400075912476, + "learning_rate": 0.00011868571840624185, + "loss": 1.5515, + "step": 12760 + }, + { + "epoch": 0.45699858542088206, + "grad_norm": 1.820007562637329, + "learning_rate": 0.0001186743235600175, + "loss": 1.6383, + "step": 12761 + }, + { + "epoch": 0.45703439755045033, + "grad_norm": 2.0829074382781982, + "learning_rate": 0.0001186629284625542, + "loss": 1.684, + "step": 12762 + }, + { + "epoch": 0.4570702096800186, + "grad_norm": 2.3919122219085693, + "learning_rate": 0.00011865153311400529, + "loss": 1.4495, + "step": 12763 + }, + { + "epoch": 0.4571060218095869, + "grad_norm": 1.607649564743042, + "learning_rate": 0.00011864013751452405, + "loss": 1.6239, + "step": 12764 + }, + { + "epoch": 0.4571418339391552, + "grad_norm": 1.874072551727295, + "learning_rate": 0.00011862874166426381, + "loss": 1.3917, + "step": 12765 + }, + { + "epoch": 0.45717764606872346, + "grad_norm": 1.8643479347229004, + "learning_rate": 0.00011861734556337787, + "loss": 1.5116, + "step": 12766 + }, + { + "epoch": 0.4572134581982918, + "grad_norm": 1.378171682357788, + "learning_rate": 0.00011860594921201958, + "loss": 1.5747, + "step": 12767 + }, + { + "epoch": 0.45724927032786006, + "grad_norm": 2.0554890632629395, + "learning_rate": 0.00011859455261034225, + "loss": 1.5593, + "step": 12768 + }, + { + "epoch": 0.4572850824574283, + "grad_norm": 1.3280192613601685, + "learning_rate": 0.00011858315575849914, + "loss": 1.3724, + "step": 12769 + }, + { + "epoch": 0.4573208945869966, + "grad_norm": 1.340054988861084, + "learning_rate": 0.00011857175865664372, + "loss": 1.5089, + "step": 12770 + }, + { + "epoch": 0.4573567067165649, + "grad_norm": 2.0556211471557617, + "learning_rate": 0.00011856036130492917, + "loss": 1.8506, + "step": 12771 + }, + { + "epoch": 0.4573925188461332, + "grad_norm": 1.7079956531524658, + "learning_rate": 0.00011854896370350894, + "loss": 1.5426, + "step": 12772 + }, + { + "epoch": 0.45742833097570146, + "grad_norm": 1.4690487384796143, + "learning_rate": 0.00011853756585253633, + "loss": 1.3461, + "step": 12773 + }, + { + "epoch": 0.4574641431052697, + "grad_norm": 1.5755233764648438, + "learning_rate": 0.00011852616775216467, + "loss": 1.6978, + "step": 12774 + }, + { + "epoch": 0.45749995523483805, + "grad_norm": 1.4344228506088257, + "learning_rate": 0.00011851476940254733, + "loss": 1.5199, + "step": 12775 + }, + { + "epoch": 0.4575357673644063, + "grad_norm": 1.373246431350708, + "learning_rate": 0.00011850337080383764, + "loss": 1.5406, + "step": 12776 + }, + { + "epoch": 0.4575715794939746, + "grad_norm": 1.866769790649414, + "learning_rate": 0.000118491971956189, + "loss": 1.3924, + "step": 12777 + }, + { + "epoch": 0.4576073916235429, + "grad_norm": 1.9502559900283813, + "learning_rate": 0.00011848057285975467, + "loss": 1.4118, + "step": 12778 + }, + { + "epoch": 0.4576432037531112, + "grad_norm": 1.2907614707946777, + "learning_rate": 0.00011846917351468811, + "loss": 1.06, + "step": 12779 + }, + { + "epoch": 0.45767901588267945, + "grad_norm": 1.657185673713684, + "learning_rate": 0.00011845777392114263, + "loss": 1.4635, + "step": 12780 + }, + { + "epoch": 0.4577148280122477, + "grad_norm": 2.1318721771240234, + "learning_rate": 0.00011844637407927161, + "loss": 1.3132, + "step": 12781 + }, + { + "epoch": 0.45775064014181605, + "grad_norm": 1.5741397142410278, + "learning_rate": 0.00011843497398922842, + "loss": 1.3551, + "step": 12782 + }, + { + "epoch": 0.4577864522713843, + "grad_norm": 1.3207534551620483, + "learning_rate": 0.00011842357365116645, + "loss": 1.7398, + "step": 12783 + }, + { + "epoch": 0.4578222644009526, + "grad_norm": 1.8568518161773682, + "learning_rate": 0.00011841217306523904, + "loss": 1.7068, + "step": 12784 + }, + { + "epoch": 0.4578580765305209, + "grad_norm": 1.3285858631134033, + "learning_rate": 0.00011840077223159965, + "loss": 1.676, + "step": 12785 + }, + { + "epoch": 0.4578938886600892, + "grad_norm": 2.2445781230926514, + "learning_rate": 0.00011838937115040154, + "loss": 1.5313, + "step": 12786 + }, + { + "epoch": 0.45792970078965745, + "grad_norm": 1.4467315673828125, + "learning_rate": 0.00011837796982179817, + "loss": 1.3382, + "step": 12787 + }, + { + "epoch": 0.4579655129192257, + "grad_norm": 1.7218992710113525, + "learning_rate": 0.00011836656824594295, + "loss": 1.685, + "step": 12788 + }, + { + "epoch": 0.45800132504879404, + "grad_norm": 1.835891604423523, + "learning_rate": 0.0001183551664229892, + "loss": 1.7658, + "step": 12789 + }, + { + "epoch": 0.4580371371783623, + "grad_norm": 1.723070740699768, + "learning_rate": 0.0001183437643530904, + "loss": 1.7154, + "step": 12790 + }, + { + "epoch": 0.4580729493079306, + "grad_norm": 2.18641996383667, + "learning_rate": 0.00011833236203639987, + "loss": 1.5002, + "step": 12791 + }, + { + "epoch": 0.4581087614374989, + "grad_norm": 1.8997712135314941, + "learning_rate": 0.00011832095947307111, + "loss": 1.8594, + "step": 12792 + }, + { + "epoch": 0.45814457356706717, + "grad_norm": 2.286189317703247, + "learning_rate": 0.00011830955666325748, + "loss": 1.2922, + "step": 12793 + }, + { + "epoch": 0.45818038569663544, + "grad_norm": 1.3605753183364868, + "learning_rate": 0.00011829815360711234, + "loss": 1.4984, + "step": 12794 + }, + { + "epoch": 0.4582161978262037, + "grad_norm": 1.8744243383407593, + "learning_rate": 0.00011828675030478915, + "loss": 1.5561, + "step": 12795 + }, + { + "epoch": 0.45825200995577203, + "grad_norm": 1.505820870399475, + "learning_rate": 0.00011827534675644134, + "loss": 1.3156, + "step": 12796 + }, + { + "epoch": 0.4582878220853403, + "grad_norm": 1.856209397315979, + "learning_rate": 0.00011826394296222229, + "loss": 1.4686, + "step": 12797 + }, + { + "epoch": 0.45832363421490857, + "grad_norm": 1.7676700353622437, + "learning_rate": 0.00011825253892228547, + "loss": 1.4413, + "step": 12798 + }, + { + "epoch": 0.4583594463444769, + "grad_norm": 2.2490038871765137, + "learning_rate": 0.00011824113463678427, + "loss": 1.6963, + "step": 12799 + }, + { + "epoch": 0.45839525847404516, + "grad_norm": 1.618058204650879, + "learning_rate": 0.00011822973010587213, + "loss": 1.6653, + "step": 12800 + }, + { + "epoch": 0.45843107060361343, + "grad_norm": 1.5924299955368042, + "learning_rate": 0.0001182183253297025, + "loss": 1.3864, + "step": 12801 + }, + { + "epoch": 0.4584668827331817, + "grad_norm": 2.89251971244812, + "learning_rate": 0.00011820692030842879, + "loss": 1.3747, + "step": 12802 + }, + { + "epoch": 0.45850269486275, + "grad_norm": 1.7772977352142334, + "learning_rate": 0.00011819551504220447, + "loss": 1.5318, + "step": 12803 + }, + { + "epoch": 0.4585385069923183, + "grad_norm": 1.6167027950286865, + "learning_rate": 0.00011818410953118296, + "loss": 1.3861, + "step": 12804 + }, + { + "epoch": 0.45857431912188656, + "grad_norm": 1.2884777784347534, + "learning_rate": 0.0001181727037755177, + "loss": 0.8727, + "step": 12805 + }, + { + "epoch": 0.4586101312514549, + "grad_norm": 1.5280265808105469, + "learning_rate": 0.00011816129777536216, + "loss": 1.255, + "step": 12806 + }, + { + "epoch": 0.45864594338102316, + "grad_norm": 1.8953804969787598, + "learning_rate": 0.00011814989153086977, + "loss": 1.5201, + "step": 12807 + }, + { + "epoch": 0.4586817555105914, + "grad_norm": 1.4978008270263672, + "learning_rate": 0.00011813848504219403, + "loss": 1.3398, + "step": 12808 + }, + { + "epoch": 0.4587175676401597, + "grad_norm": 1.7780437469482422, + "learning_rate": 0.00011812707830948835, + "loss": 1.6133, + "step": 12809 + }, + { + "epoch": 0.458753379769728, + "grad_norm": 1.5132312774658203, + "learning_rate": 0.0001181156713329062, + "loss": 1.4327, + "step": 12810 + }, + { + "epoch": 0.4587891918992963, + "grad_norm": 1.6807321310043335, + "learning_rate": 0.0001181042641126011, + "loss": 1.4845, + "step": 12811 + }, + { + "epoch": 0.45882500402886456, + "grad_norm": 1.6463050842285156, + "learning_rate": 0.00011809285664872645, + "loss": 1.3477, + "step": 12812 + }, + { + "epoch": 0.4588608161584329, + "grad_norm": 1.670136570930481, + "learning_rate": 0.00011808144894143575, + "loss": 1.5761, + "step": 12813 + }, + { + "epoch": 0.45889662828800115, + "grad_norm": 1.6936041116714478, + "learning_rate": 0.00011807004099088251, + "loss": 1.4741, + "step": 12814 + }, + { + "epoch": 0.4589324404175694, + "grad_norm": 1.5984362363815308, + "learning_rate": 0.00011805863279722014, + "loss": 1.6037, + "step": 12815 + }, + { + "epoch": 0.4589682525471377, + "grad_norm": 1.9859976768493652, + "learning_rate": 0.00011804722436060218, + "loss": 1.6921, + "step": 12816 + }, + { + "epoch": 0.459004064676706, + "grad_norm": 1.768145203590393, + "learning_rate": 0.00011803581568118207, + "loss": 1.322, + "step": 12817 + }, + { + "epoch": 0.4590398768062743, + "grad_norm": 1.2715734243392944, + "learning_rate": 0.00011802440675911335, + "loss": 1.4228, + "step": 12818 + }, + { + "epoch": 0.45907568893584255, + "grad_norm": 1.2998805046081543, + "learning_rate": 0.00011801299759454947, + "loss": 1.2947, + "step": 12819 + }, + { + "epoch": 0.4591115010654109, + "grad_norm": 2.153787612915039, + "learning_rate": 0.00011800158818764395, + "loss": 1.8239, + "step": 12820 + }, + { + "epoch": 0.45914731319497915, + "grad_norm": 1.4369038343429565, + "learning_rate": 0.0001179901785385503, + "loss": 1.4372, + "step": 12821 + }, + { + "epoch": 0.4591831253245474, + "grad_norm": 2.008143901824951, + "learning_rate": 0.00011797876864742198, + "loss": 1.6738, + "step": 12822 + }, + { + "epoch": 0.4592189374541157, + "grad_norm": 1.253076434135437, + "learning_rate": 0.00011796735851441254, + "loss": 1.2106, + "step": 12823 + }, + { + "epoch": 0.459254749583684, + "grad_norm": 1.9890899658203125, + "learning_rate": 0.00011795594813967543, + "loss": 1.865, + "step": 12824 + }, + { + "epoch": 0.4592905617132523, + "grad_norm": 1.4723730087280273, + "learning_rate": 0.00011794453752336425, + "loss": 1.3423, + "step": 12825 + }, + { + "epoch": 0.45932637384282055, + "grad_norm": 3.3311033248901367, + "learning_rate": 0.00011793312666563241, + "loss": 1.5936, + "step": 12826 + }, + { + "epoch": 0.45936218597238887, + "grad_norm": 1.470004916191101, + "learning_rate": 0.00011792171556663353, + "loss": 1.1274, + "step": 12827 + }, + { + "epoch": 0.45939799810195714, + "grad_norm": 1.7341986894607544, + "learning_rate": 0.00011791030422652105, + "loss": 1.1953, + "step": 12828 + }, + { + "epoch": 0.4594338102315254, + "grad_norm": 1.5090793371200562, + "learning_rate": 0.00011789889264544855, + "loss": 1.3247, + "step": 12829 + }, + { + "epoch": 0.4594696223610937, + "grad_norm": 1.5003623962402344, + "learning_rate": 0.00011788748082356955, + "loss": 1.3318, + "step": 12830 + }, + { + "epoch": 0.459505434490662, + "grad_norm": 2.2069389820098877, + "learning_rate": 0.00011787606876103753, + "loss": 1.6227, + "step": 12831 + }, + { + "epoch": 0.45954124662023027, + "grad_norm": 1.5146262645721436, + "learning_rate": 0.00011786465645800609, + "loss": 1.5597, + "step": 12832 + }, + { + "epoch": 0.45957705874979854, + "grad_norm": 1.5742777585983276, + "learning_rate": 0.00011785324391462873, + "loss": 1.4421, + "step": 12833 + }, + { + "epoch": 0.45961287087936686, + "grad_norm": 1.3737620115280151, + "learning_rate": 0.000117841831131059, + "loss": 1.6641, + "step": 12834 + }, + { + "epoch": 0.45964868300893513, + "grad_norm": 1.7888473272323608, + "learning_rate": 0.00011783041810745045, + "loss": 1.2693, + "step": 12835 + }, + { + "epoch": 0.4596844951385034, + "grad_norm": 1.42727530002594, + "learning_rate": 0.00011781900484395665, + "loss": 1.5367, + "step": 12836 + }, + { + "epoch": 0.45972030726807167, + "grad_norm": 1.4414780139923096, + "learning_rate": 0.00011780759134073107, + "loss": 2.0235, + "step": 12837 + }, + { + "epoch": 0.45975611939764, + "grad_norm": 1.4907771348953247, + "learning_rate": 0.00011779617759792738, + "loss": 1.4951, + "step": 12838 + }, + { + "epoch": 0.45979193152720826, + "grad_norm": 1.888153314590454, + "learning_rate": 0.00011778476361569903, + "loss": 1.3646, + "step": 12839 + }, + { + "epoch": 0.45982774365677653, + "grad_norm": 1.6788287162780762, + "learning_rate": 0.00011777334939419966, + "loss": 1.3867, + "step": 12840 + }, + { + "epoch": 0.45986355578634486, + "grad_norm": 1.3962904214859009, + "learning_rate": 0.00011776193493358278, + "loss": 1.6541, + "step": 12841 + }, + { + "epoch": 0.4598993679159131, + "grad_norm": 1.7222188711166382, + "learning_rate": 0.00011775052023400197, + "loss": 1.5073, + "step": 12842 + }, + { + "epoch": 0.4599351800454814, + "grad_norm": 1.299524188041687, + "learning_rate": 0.0001177391052956108, + "loss": 1.1971, + "step": 12843 + }, + { + "epoch": 0.45997099217504966, + "grad_norm": 1.7616795301437378, + "learning_rate": 0.00011772769011856286, + "loss": 1.499, + "step": 12844 + }, + { + "epoch": 0.460006804304618, + "grad_norm": 1.8444628715515137, + "learning_rate": 0.00011771627470301174, + "loss": 1.5539, + "step": 12845 + }, + { + "epoch": 0.46004261643418626, + "grad_norm": 1.4378998279571533, + "learning_rate": 0.00011770485904911099, + "loss": 1.5755, + "step": 12846 + }, + { + "epoch": 0.4600784285637545, + "grad_norm": 1.6388529539108276, + "learning_rate": 0.0001176934431570142, + "loss": 1.4707, + "step": 12847 + }, + { + "epoch": 0.46011424069332285, + "grad_norm": 1.789455771446228, + "learning_rate": 0.00011768202702687492, + "loss": 1.3956, + "step": 12848 + }, + { + "epoch": 0.4601500528228911, + "grad_norm": 1.9952750205993652, + "learning_rate": 0.00011767061065884682, + "loss": 1.7959, + "step": 12849 + }, + { + "epoch": 0.4601858649524594, + "grad_norm": 2.108474016189575, + "learning_rate": 0.00011765919405308341, + "loss": 1.4997, + "step": 12850 + }, + { + "epoch": 0.46022167708202766, + "grad_norm": 1.9382110834121704, + "learning_rate": 0.00011764777720973835, + "loss": 1.4933, + "step": 12851 + }, + { + "epoch": 0.460257489211596, + "grad_norm": 1.7366992235183716, + "learning_rate": 0.00011763636012896518, + "loss": 1.6449, + "step": 12852 + }, + { + "epoch": 0.46029330134116425, + "grad_norm": 1.8171309232711792, + "learning_rate": 0.00011762494281091756, + "loss": 1.6787, + "step": 12853 + }, + { + "epoch": 0.4603291134707325, + "grad_norm": 1.760102391242981, + "learning_rate": 0.00011761352525574905, + "loss": 1.2648, + "step": 12854 + }, + { + "epoch": 0.46036492560030084, + "grad_norm": 2.294532537460327, + "learning_rate": 0.00011760210746361329, + "loss": 1.6493, + "step": 12855 + }, + { + "epoch": 0.4604007377298691, + "grad_norm": 1.5895004272460938, + "learning_rate": 0.00011759068943466389, + "loss": 1.3155, + "step": 12856 + }, + { + "epoch": 0.4604365498594374, + "grad_norm": 1.6277079582214355, + "learning_rate": 0.00011757927116905442, + "loss": 1.4816, + "step": 12857 + }, + { + "epoch": 0.46047236198900565, + "grad_norm": 1.8704755306243896, + "learning_rate": 0.00011756785266693857, + "loss": 1.4445, + "step": 12858 + }, + { + "epoch": 0.460508174118574, + "grad_norm": 1.737906813621521, + "learning_rate": 0.00011755643392846991, + "loss": 1.7793, + "step": 12859 + }, + { + "epoch": 0.46054398624814225, + "grad_norm": 1.859326720237732, + "learning_rate": 0.00011754501495380209, + "loss": 1.6184, + "step": 12860 + }, + { + "epoch": 0.4605797983777105, + "grad_norm": 1.8571624755859375, + "learning_rate": 0.00011753359574308869, + "loss": 1.5855, + "step": 12861 + }, + { + "epoch": 0.46061561050727884, + "grad_norm": 1.6223071813583374, + "learning_rate": 0.00011752217629648341, + "loss": 1.3289, + "step": 12862 + }, + { + "epoch": 0.4606514226368471, + "grad_norm": 1.7638866901397705, + "learning_rate": 0.00011751075661413982, + "loss": 1.5636, + "step": 12863 + }, + { + "epoch": 0.4606872347664154, + "grad_norm": 1.5971314907073975, + "learning_rate": 0.00011749933669621161, + "loss": 1.6486, + "step": 12864 + }, + { + "epoch": 0.46072304689598365, + "grad_norm": 1.6933928728103638, + "learning_rate": 0.0001174879165428524, + "loss": 1.8318, + "step": 12865 + }, + { + "epoch": 0.46075885902555197, + "grad_norm": 1.8240853548049927, + "learning_rate": 0.00011747649615421581, + "loss": 1.5988, + "step": 12866 + }, + { + "epoch": 0.46079467115512024, + "grad_norm": 1.140206217765808, + "learning_rate": 0.00011746507553045552, + "loss": 1.3376, + "step": 12867 + }, + { + "epoch": 0.4608304832846885, + "grad_norm": 1.5556854009628296, + "learning_rate": 0.00011745365467172516, + "loss": 1.4646, + "step": 12868 + }, + { + "epoch": 0.46086629541425683, + "grad_norm": 2.0136656761169434, + "learning_rate": 0.00011744223357817841, + "loss": 1.8497, + "step": 12869 + }, + { + "epoch": 0.4609021075438251, + "grad_norm": 2.2033441066741943, + "learning_rate": 0.00011743081224996888, + "loss": 1.5519, + "step": 12870 + }, + { + "epoch": 0.46093791967339337, + "grad_norm": 1.670477032661438, + "learning_rate": 0.00011741939068725027, + "loss": 1.5283, + "step": 12871 + }, + { + "epoch": 0.46097373180296164, + "grad_norm": 2.0751805305480957, + "learning_rate": 0.00011740796889017623, + "loss": 1.4366, + "step": 12872 + }, + { + "epoch": 0.46100954393252996, + "grad_norm": 1.6487994194030762, + "learning_rate": 0.0001173965468589004, + "loss": 1.3776, + "step": 12873 + }, + { + "epoch": 0.46104535606209823, + "grad_norm": 1.5400201082229614, + "learning_rate": 0.0001173851245935765, + "loss": 1.4772, + "step": 12874 + }, + { + "epoch": 0.4610811681916665, + "grad_norm": 1.987924337387085, + "learning_rate": 0.00011737370209435816, + "loss": 1.3146, + "step": 12875 + }, + { + "epoch": 0.4611169803212348, + "grad_norm": 2.468710422515869, + "learning_rate": 0.00011736227936139908, + "loss": 1.2949, + "step": 12876 + }, + { + "epoch": 0.4611527924508031, + "grad_norm": 1.6486579179763794, + "learning_rate": 0.00011735085639485291, + "loss": 1.462, + "step": 12877 + }, + { + "epoch": 0.46118860458037136, + "grad_norm": 1.8904602527618408, + "learning_rate": 0.00011733943319487337, + "loss": 1.6574, + "step": 12878 + }, + { + "epoch": 0.46122441670993963, + "grad_norm": 1.5434259176254272, + "learning_rate": 0.00011732800976161408, + "loss": 1.4993, + "step": 12879 + }, + { + "epoch": 0.46126022883950796, + "grad_norm": 1.8802088499069214, + "learning_rate": 0.00011731658609522881, + "loss": 1.2326, + "step": 12880 + }, + { + "epoch": 0.4612960409690762, + "grad_norm": 1.7803751230239868, + "learning_rate": 0.0001173051621958712, + "loss": 2.1058, + "step": 12881 + }, + { + "epoch": 0.4613318530986445, + "grad_norm": 1.6606054306030273, + "learning_rate": 0.00011729373806369499, + "loss": 1.6868, + "step": 12882 + }, + { + "epoch": 0.4613676652282128, + "grad_norm": 1.609900951385498, + "learning_rate": 0.0001172823136988538, + "loss": 1.2621, + "step": 12883 + }, + { + "epoch": 0.4614034773577811, + "grad_norm": 2.8169360160827637, + "learning_rate": 0.00011727088910150137, + "loss": 1.2055, + "step": 12884 + }, + { + "epoch": 0.46143928948734936, + "grad_norm": 1.9346868991851807, + "learning_rate": 0.00011725946427179142, + "loss": 1.8939, + "step": 12885 + }, + { + "epoch": 0.4614751016169176, + "grad_norm": 1.8201242685317993, + "learning_rate": 0.00011724803920987761, + "loss": 1.7242, + "step": 12886 + }, + { + "epoch": 0.46151091374648595, + "grad_norm": 1.8943721055984497, + "learning_rate": 0.00011723661391591371, + "loss": 1.7006, + "step": 12887 + }, + { + "epoch": 0.4615467258760542, + "grad_norm": 1.6095993518829346, + "learning_rate": 0.00011722518839005341, + "loss": 1.4559, + "step": 12888 + }, + { + "epoch": 0.4615825380056225, + "grad_norm": 1.338904619216919, + "learning_rate": 0.00011721376263245041, + "loss": 1.6563, + "step": 12889 + }, + { + "epoch": 0.4616183501351908, + "grad_norm": 1.563049554824829, + "learning_rate": 0.00011720233664325846, + "loss": 1.2904, + "step": 12890 + }, + { + "epoch": 0.4616541622647591, + "grad_norm": 1.619492530822754, + "learning_rate": 0.00011719091042263124, + "loss": 1.5699, + "step": 12891 + }, + { + "epoch": 0.46168997439432735, + "grad_norm": 1.8848820924758911, + "learning_rate": 0.00011717948397072246, + "loss": 1.7522, + "step": 12892 + }, + { + "epoch": 0.4617257865238956, + "grad_norm": 1.626729130744934, + "learning_rate": 0.00011716805728768593, + "loss": 1.4691, + "step": 12893 + }, + { + "epoch": 0.46176159865346394, + "grad_norm": 2.1712372303009033, + "learning_rate": 0.00011715663037367532, + "loss": 1.5208, + "step": 12894 + }, + { + "epoch": 0.4617974107830322, + "grad_norm": 1.8824517726898193, + "learning_rate": 0.00011714520322884439, + "loss": 1.3719, + "step": 12895 + }, + { + "epoch": 0.4618332229126005, + "grad_norm": 2.1174161434173584, + "learning_rate": 0.00011713377585334684, + "loss": 1.4144, + "step": 12896 + }, + { + "epoch": 0.4618690350421688, + "grad_norm": 1.8503297567367554, + "learning_rate": 0.00011712234824733644, + "loss": 1.5521, + "step": 12897 + }, + { + "epoch": 0.4619048471717371, + "grad_norm": 1.4610905647277832, + "learning_rate": 0.00011711092041096693, + "loss": 1.3756, + "step": 12898 + }, + { + "epoch": 0.46194065930130535, + "grad_norm": 2.196420907974243, + "learning_rate": 0.0001170994923443921, + "loss": 1.6765, + "step": 12899 + }, + { + "epoch": 0.4619764714308736, + "grad_norm": 1.5925959348678589, + "learning_rate": 0.00011708806404776563, + "loss": 1.5467, + "step": 12900 + }, + { + "epoch": 0.46201228356044194, + "grad_norm": 1.4882124662399292, + "learning_rate": 0.00011707663552124128, + "loss": 1.7759, + "step": 12901 + }, + { + "epoch": 0.4620480956900102, + "grad_norm": 1.9978431463241577, + "learning_rate": 0.00011706520676497285, + "loss": 1.7564, + "step": 12902 + }, + { + "epoch": 0.4620839078195785, + "grad_norm": 1.8712289333343506, + "learning_rate": 0.00011705377777911406, + "loss": 1.5388, + "step": 12903 + }, + { + "epoch": 0.4621197199491468, + "grad_norm": 3.0363831520080566, + "learning_rate": 0.0001170423485638187, + "loss": 1.5825, + "step": 12904 + }, + { + "epoch": 0.46215553207871507, + "grad_norm": 1.3153165578842163, + "learning_rate": 0.00011703091911924051, + "loss": 1.306, + "step": 12905 + }, + { + "epoch": 0.46219134420828334, + "grad_norm": 1.8656954765319824, + "learning_rate": 0.0001170194894455333, + "loss": 1.3723, + "step": 12906 + }, + { + "epoch": 0.4622271563378516, + "grad_norm": 1.8745927810668945, + "learning_rate": 0.0001170080595428508, + "loss": 1.4533, + "step": 12907 + }, + { + "epoch": 0.46226296846741993, + "grad_norm": 1.440604329109192, + "learning_rate": 0.00011699662941134679, + "loss": 1.3826, + "step": 12908 + }, + { + "epoch": 0.4622987805969882, + "grad_norm": 1.6274751424789429, + "learning_rate": 0.00011698519905117507, + "loss": 1.3327, + "step": 12909 + }, + { + "epoch": 0.46233459272655647, + "grad_norm": 1.5062453746795654, + "learning_rate": 0.00011697376846248937, + "loss": 1.5934, + "step": 12910 + }, + { + "epoch": 0.4623704048561248, + "grad_norm": 1.5800942182540894, + "learning_rate": 0.00011696233764544353, + "loss": 1.3237, + "step": 12911 + }, + { + "epoch": 0.46240621698569306, + "grad_norm": 1.7886601686477661, + "learning_rate": 0.00011695090660019132, + "loss": 1.512, + "step": 12912 + }, + { + "epoch": 0.46244202911526133, + "grad_norm": 1.6986116170883179, + "learning_rate": 0.00011693947532688653, + "loss": 1.5372, + "step": 12913 + }, + { + "epoch": 0.4624778412448296, + "grad_norm": 1.8401761054992676, + "learning_rate": 0.00011692804382568294, + "loss": 1.5287, + "step": 12914 + }, + { + "epoch": 0.4625136533743979, + "grad_norm": 1.7199172973632812, + "learning_rate": 0.00011691661209673437, + "loss": 1.5185, + "step": 12915 + }, + { + "epoch": 0.4625494655039662, + "grad_norm": 1.6660505533218384, + "learning_rate": 0.00011690518014019458, + "loss": 1.3418, + "step": 12916 + }, + { + "epoch": 0.46258527763353446, + "grad_norm": 2.050318956375122, + "learning_rate": 0.00011689374795621744, + "loss": 1.4391, + "step": 12917 + }, + { + "epoch": 0.4626210897631028, + "grad_norm": 1.6393921375274658, + "learning_rate": 0.00011688231554495668, + "loss": 1.4133, + "step": 12918 + }, + { + "epoch": 0.46265690189267106, + "grad_norm": 1.5814021825790405, + "learning_rate": 0.00011687088290656613, + "loss": 1.2826, + "step": 12919 + }, + { + "epoch": 0.4626927140222393, + "grad_norm": 1.5501033067703247, + "learning_rate": 0.00011685945004119965, + "loss": 1.6777, + "step": 12920 + }, + { + "epoch": 0.4627285261518076, + "grad_norm": 1.8156019449234009, + "learning_rate": 0.00011684801694901099, + "loss": 1.4462, + "step": 12921 + }, + { + "epoch": 0.4627643382813759, + "grad_norm": 1.7225736379623413, + "learning_rate": 0.00011683658363015402, + "loss": 1.2303, + "step": 12922 + }, + { + "epoch": 0.4628001504109442, + "grad_norm": 1.728131890296936, + "learning_rate": 0.0001168251500847825, + "loss": 1.4446, + "step": 12923 + }, + { + "epoch": 0.46283596254051246, + "grad_norm": 2.56362247467041, + "learning_rate": 0.00011681371631305032, + "loss": 1.6334, + "step": 12924 + }, + { + "epoch": 0.4628717746700808, + "grad_norm": 2.016491413116455, + "learning_rate": 0.00011680228231511123, + "loss": 1.3181, + "step": 12925 + }, + { + "epoch": 0.46290758679964905, + "grad_norm": 1.6673835515975952, + "learning_rate": 0.00011679084809111915, + "loss": 1.51, + "step": 12926 + }, + { + "epoch": 0.4629433989292173, + "grad_norm": 1.5395658016204834, + "learning_rate": 0.00011677941364122787, + "loss": 1.3948, + "step": 12927 + }, + { + "epoch": 0.4629792110587856, + "grad_norm": 1.4805071353912354, + "learning_rate": 0.0001167679789655912, + "loss": 1.3565, + "step": 12928 + }, + { + "epoch": 0.4630150231883539, + "grad_norm": 1.8518954515457153, + "learning_rate": 0.00011675654406436301, + "loss": 1.2927, + "step": 12929 + }, + { + "epoch": 0.4630508353179222, + "grad_norm": 2.3132810592651367, + "learning_rate": 0.00011674510893769713, + "loss": 1.5319, + "step": 12930 + }, + { + "epoch": 0.46308664744749045, + "grad_norm": 1.4110107421875, + "learning_rate": 0.00011673367358574741, + "loss": 0.9963, + "step": 12931 + }, + { + "epoch": 0.4631224595770588, + "grad_norm": 1.7076191902160645, + "learning_rate": 0.00011672223800866768, + "loss": 1.7017, + "step": 12932 + }, + { + "epoch": 0.46315827170662704, + "grad_norm": 2.103776693344116, + "learning_rate": 0.00011671080220661183, + "loss": 1.4991, + "step": 12933 + }, + { + "epoch": 0.4631940838361953, + "grad_norm": 1.6993529796600342, + "learning_rate": 0.00011669936617973367, + "loss": 1.4374, + "step": 12934 + }, + { + "epoch": 0.4632298959657636, + "grad_norm": 1.5589717626571655, + "learning_rate": 0.00011668792992818714, + "loss": 1.3671, + "step": 12935 + }, + { + "epoch": 0.4632657080953319, + "grad_norm": 1.5046939849853516, + "learning_rate": 0.000116676493452126, + "loss": 1.6337, + "step": 12936 + }, + { + "epoch": 0.4633015202249002, + "grad_norm": 1.6186277866363525, + "learning_rate": 0.00011666505675170413, + "loss": 1.6858, + "step": 12937 + }, + { + "epoch": 0.46333733235446845, + "grad_norm": 2.007828950881958, + "learning_rate": 0.00011665361982707543, + "loss": 1.6257, + "step": 12938 + }, + { + "epoch": 0.46337314448403677, + "grad_norm": 1.3552885055541992, + "learning_rate": 0.00011664218267839375, + "loss": 1.3457, + "step": 12939 + }, + { + "epoch": 0.46340895661360504, + "grad_norm": 2.314358949661255, + "learning_rate": 0.000116630745305813, + "loss": 1.3535, + "step": 12940 + }, + { + "epoch": 0.4634447687431733, + "grad_norm": 1.718798279762268, + "learning_rate": 0.00011661930770948699, + "loss": 1.4442, + "step": 12941 + }, + { + "epoch": 0.4634805808727416, + "grad_norm": 1.6392550468444824, + "learning_rate": 0.00011660786988956964, + "loss": 1.3206, + "step": 12942 + }, + { + "epoch": 0.4635163930023099, + "grad_norm": 1.5570787191390991, + "learning_rate": 0.00011659643184621485, + "loss": 1.3863, + "step": 12943 + }, + { + "epoch": 0.46355220513187817, + "grad_norm": 2.412844181060791, + "learning_rate": 0.00011658499357957646, + "loss": 1.6282, + "step": 12944 + }, + { + "epoch": 0.46358801726144644, + "grad_norm": 1.7498209476470947, + "learning_rate": 0.00011657355508980836, + "loss": 1.4421, + "step": 12945 + }, + { + "epoch": 0.46362382939101476, + "grad_norm": 1.9625868797302246, + "learning_rate": 0.00011656211637706449, + "loss": 1.8111, + "step": 12946 + }, + { + "epoch": 0.46365964152058303, + "grad_norm": 2.1074130535125732, + "learning_rate": 0.00011655067744149865, + "loss": 1.4252, + "step": 12947 + }, + { + "epoch": 0.4636954536501513, + "grad_norm": 1.5764514207839966, + "learning_rate": 0.00011653923828326485, + "loss": 1.2029, + "step": 12948 + }, + { + "epoch": 0.46373126577971957, + "grad_norm": 1.5708736181259155, + "learning_rate": 0.0001165277989025169, + "loss": 1.3832, + "step": 12949 + }, + { + "epoch": 0.4637670779092879, + "grad_norm": 1.6243404150009155, + "learning_rate": 0.00011651635929940874, + "loss": 1.2595, + "step": 12950 + }, + { + "epoch": 0.46380289003885616, + "grad_norm": 2.1377639770507812, + "learning_rate": 0.00011650491947409427, + "loss": 1.7661, + "step": 12951 + }, + { + "epoch": 0.46383870216842443, + "grad_norm": 1.541272759437561, + "learning_rate": 0.00011649347942672741, + "loss": 1.5135, + "step": 12952 + }, + { + "epoch": 0.46387451429799276, + "grad_norm": 2.0266079902648926, + "learning_rate": 0.00011648203915746208, + "loss": 1.7173, + "step": 12953 + }, + { + "epoch": 0.463910326427561, + "grad_norm": 1.9679988622665405, + "learning_rate": 0.00011647059866645213, + "loss": 1.5395, + "step": 12954 + }, + { + "epoch": 0.4639461385571293, + "grad_norm": 1.322680950164795, + "learning_rate": 0.00011645915795385154, + "loss": 1.603, + "step": 12955 + }, + { + "epoch": 0.46398195068669756, + "grad_norm": 1.8947259187698364, + "learning_rate": 0.0001164477170198142, + "loss": 1.292, + "step": 12956 + }, + { + "epoch": 0.4640177628162659, + "grad_norm": 1.686653733253479, + "learning_rate": 0.00011643627586449406, + "loss": 1.6056, + "step": 12957 + }, + { + "epoch": 0.46405357494583416, + "grad_norm": 2.2820146083831787, + "learning_rate": 0.000116424834488045, + "loss": 1.4883, + "step": 12958 + }, + { + "epoch": 0.4640893870754024, + "grad_norm": 2.1820619106292725, + "learning_rate": 0.00011641339289062101, + "loss": 1.6662, + "step": 12959 + }, + { + "epoch": 0.46412519920497075, + "grad_norm": 2.1300809383392334, + "learning_rate": 0.00011640195107237596, + "loss": 1.3605, + "step": 12960 + }, + { + "epoch": 0.464161011334539, + "grad_norm": 1.3338943719863892, + "learning_rate": 0.00011639050903346387, + "loss": 1.4019, + "step": 12961 + }, + { + "epoch": 0.4641968234641073, + "grad_norm": 1.9710988998413086, + "learning_rate": 0.00011637906677403859, + "loss": 1.3826, + "step": 12962 + }, + { + "epoch": 0.46423263559367556, + "grad_norm": 1.3079078197479248, + "learning_rate": 0.00011636762429425407, + "loss": 1.1247, + "step": 12963 + }, + { + "epoch": 0.4642684477232439, + "grad_norm": 1.8297098875045776, + "learning_rate": 0.0001163561815942643, + "loss": 1.5155, + "step": 12964 + }, + { + "epoch": 0.46430425985281215, + "grad_norm": 1.8603323698043823, + "learning_rate": 0.00011634473867422322, + "loss": 1.4551, + "step": 12965 + }, + { + "epoch": 0.4643400719823804, + "grad_norm": 1.6557583808898926, + "learning_rate": 0.00011633329553428476, + "loss": 1.29, + "step": 12966 + }, + { + "epoch": 0.46437588411194874, + "grad_norm": 1.406653881072998, + "learning_rate": 0.00011632185217460283, + "loss": 1.5551, + "step": 12967 + }, + { + "epoch": 0.464411696241517, + "grad_norm": 1.6777414083480835, + "learning_rate": 0.00011631040859533148, + "loss": 1.3424, + "step": 12968 + }, + { + "epoch": 0.4644475083710853, + "grad_norm": 2.046297311782837, + "learning_rate": 0.00011629896479662461, + "loss": 1.5278, + "step": 12969 + }, + { + "epoch": 0.46448332050065355, + "grad_norm": 1.4424058198928833, + "learning_rate": 0.0001162875207786362, + "loss": 1.3149, + "step": 12970 + }, + { + "epoch": 0.4645191326302219, + "grad_norm": 1.1536939144134521, + "learning_rate": 0.00011627607654152022, + "loss": 1.4943, + "step": 12971 + }, + { + "epoch": 0.46455494475979014, + "grad_norm": 1.738486409187317, + "learning_rate": 0.0001162646320854306, + "loss": 1.3925, + "step": 12972 + }, + { + "epoch": 0.4645907568893584, + "grad_norm": 2.092022657394409, + "learning_rate": 0.00011625318741052133, + "loss": 1.7829, + "step": 12973 + }, + { + "epoch": 0.4646265690189267, + "grad_norm": 3.35194993019104, + "learning_rate": 0.0001162417425169464, + "loss": 1.3859, + "step": 12974 + }, + { + "epoch": 0.464662381148495, + "grad_norm": 1.7595601081848145, + "learning_rate": 0.00011623029740485978, + "loss": 1.4044, + "step": 12975 + }, + { + "epoch": 0.4646981932780633, + "grad_norm": 1.210787057876587, + "learning_rate": 0.00011621885207441541, + "loss": 1.5113, + "step": 12976 + }, + { + "epoch": 0.46473400540763155, + "grad_norm": 1.481057047843933, + "learning_rate": 0.00011620740652576736, + "loss": 1.1624, + "step": 12977 + }, + { + "epoch": 0.46476981753719987, + "grad_norm": 2.3686330318450928, + "learning_rate": 0.0001161959607590695, + "loss": 1.4887, + "step": 12978 + }, + { + "epoch": 0.46480562966676814, + "grad_norm": 2.417525053024292, + "learning_rate": 0.00011618451477447596, + "loss": 1.5583, + "step": 12979 + }, + { + "epoch": 0.4648414417963364, + "grad_norm": 2.8242523670196533, + "learning_rate": 0.00011617306857214059, + "loss": 1.6658, + "step": 12980 + }, + { + "epoch": 0.4648772539259047, + "grad_norm": 2.044356107711792, + "learning_rate": 0.00011616162215221744, + "loss": 1.3755, + "step": 12981 + }, + { + "epoch": 0.464913066055473, + "grad_norm": 1.7669847011566162, + "learning_rate": 0.00011615017551486054, + "loss": 1.6859, + "step": 12982 + }, + { + "epoch": 0.46494887818504127, + "grad_norm": 1.8860310316085815, + "learning_rate": 0.00011613872866022384, + "loss": 1.4955, + "step": 12983 + }, + { + "epoch": 0.46498469031460954, + "grad_norm": 1.8935774564743042, + "learning_rate": 0.00011612728158846138, + "loss": 1.7469, + "step": 12984 + }, + { + "epoch": 0.46502050244417786, + "grad_norm": 1.7466204166412354, + "learning_rate": 0.00011611583429972715, + "loss": 1.3623, + "step": 12985 + }, + { + "epoch": 0.46505631457374613, + "grad_norm": 2.1702635288238525, + "learning_rate": 0.00011610438679417515, + "loss": 1.4292, + "step": 12986 + }, + { + "epoch": 0.4650921267033144, + "grad_norm": 1.6326416730880737, + "learning_rate": 0.0001160929390719594, + "loss": 1.5971, + "step": 12987 + }, + { + "epoch": 0.46512793883288267, + "grad_norm": 3.18381667137146, + "learning_rate": 0.00011608149113323392, + "loss": 1.5523, + "step": 12988 + }, + { + "epoch": 0.465163750962451, + "grad_norm": 2.2429869174957275, + "learning_rate": 0.00011607004297815271, + "loss": 1.2013, + "step": 12989 + }, + { + "epoch": 0.46519956309201926, + "grad_norm": 1.3546289205551147, + "learning_rate": 0.00011605859460686981, + "loss": 1.4236, + "step": 12990 + }, + { + "epoch": 0.46523537522158753, + "grad_norm": 1.5490727424621582, + "learning_rate": 0.00011604714601953922, + "loss": 1.567, + "step": 12991 + }, + { + "epoch": 0.46527118735115586, + "grad_norm": 1.2240389585494995, + "learning_rate": 0.00011603569721631499, + "loss": 1.0347, + "step": 12992 + }, + { + "epoch": 0.4653069994807241, + "grad_norm": 1.6459600925445557, + "learning_rate": 0.00011602424819735111, + "loss": 1.7073, + "step": 12993 + }, + { + "epoch": 0.4653428116102924, + "grad_norm": 1.6901960372924805, + "learning_rate": 0.00011601279896280167, + "loss": 1.72, + "step": 12994 + }, + { + "epoch": 0.46537862373986066, + "grad_norm": 1.3724249601364136, + "learning_rate": 0.00011600134951282067, + "loss": 1.6381, + "step": 12995 + }, + { + "epoch": 0.465414435869429, + "grad_norm": 2.0271923542022705, + "learning_rate": 0.00011598989984756216, + "loss": 1.5456, + "step": 12996 + }, + { + "epoch": 0.46545024799899726, + "grad_norm": 1.7223308086395264, + "learning_rate": 0.0001159784499671802, + "loss": 1.9994, + "step": 12997 + }, + { + "epoch": 0.4654860601285655, + "grad_norm": 1.5626134872436523, + "learning_rate": 0.00011596699987182873, + "loss": 1.4624, + "step": 12998 + }, + { + "epoch": 0.46552187225813385, + "grad_norm": 1.7725961208343506, + "learning_rate": 0.00011595554956166195, + "loss": 1.4648, + "step": 12999 + }, + { + "epoch": 0.4655576843877021, + "grad_norm": 1.5563966035842896, + "learning_rate": 0.00011594409903683376, + "loss": 1.4602, + "step": 13000 + }, + { + "epoch": 0.4655934965172704, + "grad_norm": 1.607021450996399, + "learning_rate": 0.00011593264829749835, + "loss": 1.4378, + "step": 13001 + }, + { + "epoch": 0.46562930864683866, + "grad_norm": 1.4815516471862793, + "learning_rate": 0.00011592119734380966, + "loss": 1.705, + "step": 13002 + }, + { + "epoch": 0.465665120776407, + "grad_norm": 1.9756717681884766, + "learning_rate": 0.00011590974617592182, + "loss": 1.6251, + "step": 13003 + }, + { + "epoch": 0.46570093290597525, + "grad_norm": 1.837310791015625, + "learning_rate": 0.00011589829479398886, + "loss": 1.8303, + "step": 13004 + }, + { + "epoch": 0.4657367450355435, + "grad_norm": 2.223160982131958, + "learning_rate": 0.00011588684319816485, + "loss": 1.3854, + "step": 13005 + }, + { + "epoch": 0.46577255716511184, + "grad_norm": 1.3153750896453857, + "learning_rate": 0.00011587539138860388, + "loss": 1.4643, + "step": 13006 + }, + { + "epoch": 0.4658083692946801, + "grad_norm": 1.9978464841842651, + "learning_rate": 0.00011586393936545995, + "loss": 1.6035, + "step": 13007 + }, + { + "epoch": 0.4658441814242484, + "grad_norm": 1.5772230625152588, + "learning_rate": 0.00011585248712888724, + "loss": 1.7976, + "step": 13008 + }, + { + "epoch": 0.46587999355381665, + "grad_norm": 1.839242935180664, + "learning_rate": 0.0001158410346790397, + "loss": 1.4875, + "step": 13009 + }, + { + "epoch": 0.465915805683385, + "grad_norm": 1.7222756147384644, + "learning_rate": 0.00011582958201607152, + "loss": 1.562, + "step": 13010 + }, + { + "epoch": 0.46595161781295324, + "grad_norm": 1.490675687789917, + "learning_rate": 0.0001158181291401367, + "loss": 1.5901, + "step": 13011 + }, + { + "epoch": 0.4659874299425215, + "grad_norm": 1.3239779472351074, + "learning_rate": 0.00011580667605138937, + "loss": 1.3461, + "step": 13012 + }, + { + "epoch": 0.46602324207208984, + "grad_norm": 1.3944867849349976, + "learning_rate": 0.0001157952227499836, + "loss": 1.8864, + "step": 13013 + }, + { + "epoch": 0.4660590542016581, + "grad_norm": 1.853308916091919, + "learning_rate": 0.0001157837692360735, + "loss": 1.388, + "step": 13014 + }, + { + "epoch": 0.4660948663312264, + "grad_norm": 1.9267886877059937, + "learning_rate": 0.00011577231550981313, + "loss": 1.7276, + "step": 13015 + }, + { + "epoch": 0.46613067846079465, + "grad_norm": 1.8433860540390015, + "learning_rate": 0.00011576086157135659, + "loss": 1.4372, + "step": 13016 + }, + { + "epoch": 0.46616649059036297, + "grad_norm": 1.6603800058364868, + "learning_rate": 0.00011574940742085803, + "loss": 1.5427, + "step": 13017 + }, + { + "epoch": 0.46620230271993124, + "grad_norm": 1.883219838142395, + "learning_rate": 0.00011573795305847146, + "loss": 1.6654, + "step": 13018 + }, + { + "epoch": 0.4662381148494995, + "grad_norm": 1.6327214241027832, + "learning_rate": 0.00011572649848435104, + "loss": 1.7289, + "step": 13019 + }, + { + "epoch": 0.46627392697906783, + "grad_norm": 1.6758432388305664, + "learning_rate": 0.00011571504369865087, + "loss": 1.3519, + "step": 13020 + }, + { + "epoch": 0.4663097391086361, + "grad_norm": 2.6810929775238037, + "learning_rate": 0.0001157035887015251, + "loss": 1.2821, + "step": 13021 + }, + { + "epoch": 0.46634555123820437, + "grad_norm": 1.7383793592453003, + "learning_rate": 0.00011569213349312773, + "loss": 2.0079, + "step": 13022 + }, + { + "epoch": 0.46638136336777264, + "grad_norm": 2.8930468559265137, + "learning_rate": 0.000115680678073613, + "loss": 1.5185, + "step": 13023 + }, + { + "epoch": 0.46641717549734096, + "grad_norm": 1.9949698448181152, + "learning_rate": 0.00011566922244313496, + "loss": 1.3717, + "step": 13024 + }, + { + "epoch": 0.46645298762690923, + "grad_norm": 1.4897794723510742, + "learning_rate": 0.00011565776660184772, + "loss": 1.3828, + "step": 13025 + }, + { + "epoch": 0.4664887997564775, + "grad_norm": 1.995154619216919, + "learning_rate": 0.00011564631054990546, + "loss": 1.618, + "step": 13026 + }, + { + "epoch": 0.4665246118860458, + "grad_norm": 1.3552685976028442, + "learning_rate": 0.00011563485428746226, + "loss": 1.4857, + "step": 13027 + }, + { + "epoch": 0.4665604240156141, + "grad_norm": 1.694174885749817, + "learning_rate": 0.00011562339781467226, + "loss": 1.5152, + "step": 13028 + }, + { + "epoch": 0.46659623614518236, + "grad_norm": 2.1866040229797363, + "learning_rate": 0.00011561194113168958, + "loss": 1.5322, + "step": 13029 + }, + { + "epoch": 0.46663204827475063, + "grad_norm": 1.6240884065628052, + "learning_rate": 0.0001156004842386684, + "loss": 1.6096, + "step": 13030 + }, + { + "epoch": 0.46666786040431896, + "grad_norm": 1.6429814100265503, + "learning_rate": 0.0001155890271357628, + "loss": 1.136, + "step": 13031 + }, + { + "epoch": 0.4667036725338872, + "grad_norm": 1.7728595733642578, + "learning_rate": 0.00011557756982312699, + "loss": 1.627, + "step": 13032 + }, + { + "epoch": 0.4667394846634555, + "grad_norm": 1.5901148319244385, + "learning_rate": 0.00011556611230091502, + "loss": 1.7406, + "step": 13033 + }, + { + "epoch": 0.4667752967930238, + "grad_norm": 1.8100167512893677, + "learning_rate": 0.00011555465456928114, + "loss": 1.3819, + "step": 13034 + }, + { + "epoch": 0.4668111089225921, + "grad_norm": 1.715928316116333, + "learning_rate": 0.0001155431966283794, + "loss": 1.0788, + "step": 13035 + }, + { + "epoch": 0.46684692105216036, + "grad_norm": 1.3283747434616089, + "learning_rate": 0.00011553173847836403, + "loss": 1.3998, + "step": 13036 + }, + { + "epoch": 0.4668827331817286, + "grad_norm": 2.5385360717773438, + "learning_rate": 0.00011552028011938913, + "loss": 1.2955, + "step": 13037 + }, + { + "epoch": 0.46691854531129695, + "grad_norm": 1.693988561630249, + "learning_rate": 0.0001155088215516089, + "loss": 1.2398, + "step": 13038 + }, + { + "epoch": 0.4669543574408652, + "grad_norm": 1.468666911125183, + "learning_rate": 0.00011549736277517746, + "loss": 1.5346, + "step": 13039 + }, + { + "epoch": 0.4669901695704335, + "grad_norm": 1.6999561786651611, + "learning_rate": 0.00011548590379024904, + "loss": 1.4972, + "step": 13040 + }, + { + "epoch": 0.4670259817000018, + "grad_norm": 2.801893711090088, + "learning_rate": 0.00011547444459697772, + "loss": 1.5316, + "step": 13041 + }, + { + "epoch": 0.4670617938295701, + "grad_norm": 1.450239658355713, + "learning_rate": 0.00011546298519551771, + "loss": 1.4053, + "step": 13042 + }, + { + "epoch": 0.46709760595913835, + "grad_norm": 1.53555428981781, + "learning_rate": 0.00011545152558602319, + "loss": 1.6854, + "step": 13043 + }, + { + "epoch": 0.4671334180887066, + "grad_norm": 1.7407196760177612, + "learning_rate": 0.00011544006576864832, + "loss": 1.4238, + "step": 13044 + }, + { + "epoch": 0.46716923021827494, + "grad_norm": 1.4221725463867188, + "learning_rate": 0.00011542860574354727, + "loss": 1.6033, + "step": 13045 + }, + { + "epoch": 0.4672050423478432, + "grad_norm": 1.597596526145935, + "learning_rate": 0.00011541714551087423, + "loss": 1.5698, + "step": 13046 + }, + { + "epoch": 0.4672408544774115, + "grad_norm": 1.6821784973144531, + "learning_rate": 0.00011540568507078342, + "loss": 1.4342, + "step": 13047 + }, + { + "epoch": 0.4672766666069798, + "grad_norm": 1.4949378967285156, + "learning_rate": 0.00011539422442342895, + "loss": 1.3515, + "step": 13048 + }, + { + "epoch": 0.4673124787365481, + "grad_norm": 1.5018175840377808, + "learning_rate": 0.00011538276356896507, + "loss": 1.4951, + "step": 13049 + }, + { + "epoch": 0.46734829086611634, + "grad_norm": 1.6001633405685425, + "learning_rate": 0.00011537130250754595, + "loss": 1.5004, + "step": 13050 + }, + { + "epoch": 0.4673841029956846, + "grad_norm": 1.7056968212127686, + "learning_rate": 0.00011535984123932578, + "loss": 1.2113, + "step": 13051 + }, + { + "epoch": 0.46741991512525294, + "grad_norm": 1.4191452264785767, + "learning_rate": 0.00011534837976445875, + "loss": 1.1832, + "step": 13052 + }, + { + "epoch": 0.4674557272548212, + "grad_norm": 1.751197338104248, + "learning_rate": 0.00011533691808309905, + "loss": 1.8554, + "step": 13053 + }, + { + "epoch": 0.4674915393843895, + "grad_norm": 1.9440951347351074, + "learning_rate": 0.00011532545619540094, + "loss": 1.2174, + "step": 13054 + }, + { + "epoch": 0.4675273515139578, + "grad_norm": 1.4406771659851074, + "learning_rate": 0.00011531399410151855, + "loss": 1.505, + "step": 13055 + }, + { + "epoch": 0.46756316364352607, + "grad_norm": 1.4604896306991577, + "learning_rate": 0.00011530253180160614, + "loss": 1.5097, + "step": 13056 + }, + { + "epoch": 0.46759897577309434, + "grad_norm": 1.3812698125839233, + "learning_rate": 0.00011529106929581792, + "loss": 0.9882, + "step": 13057 + }, + { + "epoch": 0.4676347879026626, + "grad_norm": 1.4525638818740845, + "learning_rate": 0.00011527960658430807, + "loss": 1.4344, + "step": 13058 + }, + { + "epoch": 0.46767060003223093, + "grad_norm": 1.5971357822418213, + "learning_rate": 0.00011526814366723084, + "loss": 1.4928, + "step": 13059 + }, + { + "epoch": 0.4677064121617992, + "grad_norm": 2.0160109996795654, + "learning_rate": 0.00011525668054474039, + "loss": 1.2046, + "step": 13060 + }, + { + "epoch": 0.46774222429136747, + "grad_norm": 1.648202657699585, + "learning_rate": 0.00011524521721699102, + "loss": 1.3332, + "step": 13061 + }, + { + "epoch": 0.4677780364209358, + "grad_norm": 1.7077761888504028, + "learning_rate": 0.0001152337536841369, + "loss": 1.5201, + "step": 13062 + }, + { + "epoch": 0.46781384855050406, + "grad_norm": 2.0169153213500977, + "learning_rate": 0.00011522228994633229, + "loss": 1.6287, + "step": 13063 + }, + { + "epoch": 0.46784966068007233, + "grad_norm": 1.86139976978302, + "learning_rate": 0.00011521082600373136, + "loss": 1.4691, + "step": 13064 + }, + { + "epoch": 0.4678854728096406, + "grad_norm": 1.6686205863952637, + "learning_rate": 0.00011519936185648842, + "loss": 1.3001, + "step": 13065 + }, + { + "epoch": 0.4679212849392089, + "grad_norm": 1.5188180208206177, + "learning_rate": 0.0001151878975047577, + "loss": 1.4057, + "step": 13066 + }, + { + "epoch": 0.4679570970687772, + "grad_norm": 1.5588150024414062, + "learning_rate": 0.00011517643294869339, + "loss": 1.4067, + "step": 13067 + }, + { + "epoch": 0.46799290919834546, + "grad_norm": 1.6581311225891113, + "learning_rate": 0.00011516496818844972, + "loss": 1.6517, + "step": 13068 + }, + { + "epoch": 0.4680287213279138, + "grad_norm": 1.5204647779464722, + "learning_rate": 0.000115153503224181, + "loss": 1.6362, + "step": 13069 + }, + { + "epoch": 0.46806453345748206, + "grad_norm": 1.700719952583313, + "learning_rate": 0.00011514203805604142, + "loss": 1.2189, + "step": 13070 + }, + { + "epoch": 0.4681003455870503, + "grad_norm": 2.6246724128723145, + "learning_rate": 0.00011513057268418526, + "loss": 1.6423, + "step": 13071 + }, + { + "epoch": 0.4681361577166186, + "grad_norm": 1.727502465248108, + "learning_rate": 0.00011511910710876677, + "loss": 1.7205, + "step": 13072 + }, + { + "epoch": 0.4681719698461869, + "grad_norm": 1.5376635789871216, + "learning_rate": 0.00011510764132994016, + "loss": 1.5014, + "step": 13073 + }, + { + "epoch": 0.4682077819757552, + "grad_norm": 1.3821709156036377, + "learning_rate": 0.00011509617534785976, + "loss": 1.5177, + "step": 13074 + }, + { + "epoch": 0.46824359410532346, + "grad_norm": 1.698994755744934, + "learning_rate": 0.00011508470916267978, + "loss": 1.4925, + "step": 13075 + }, + { + "epoch": 0.4682794062348918, + "grad_norm": 1.7280097007751465, + "learning_rate": 0.00011507324277455452, + "loss": 1.3431, + "step": 13076 + }, + { + "epoch": 0.46831521836446005, + "grad_norm": 1.7975046634674072, + "learning_rate": 0.00011506177618363818, + "loss": 1.5999, + "step": 13077 + }, + { + "epoch": 0.4683510304940283, + "grad_norm": 1.7372099161148071, + "learning_rate": 0.00011505030939008508, + "loss": 1.3853, + "step": 13078 + }, + { + "epoch": 0.4683868426235966, + "grad_norm": 1.693571925163269, + "learning_rate": 0.0001150388423940495, + "loss": 1.3432, + "step": 13079 + }, + { + "epoch": 0.4684226547531649, + "grad_norm": 1.4951120615005493, + "learning_rate": 0.00011502737519568567, + "loss": 1.7497, + "step": 13080 + }, + { + "epoch": 0.4684584668827332, + "grad_norm": 1.509081482887268, + "learning_rate": 0.00011501590779514793, + "loss": 1.5924, + "step": 13081 + }, + { + "epoch": 0.46849427901230145, + "grad_norm": 2.52064847946167, + "learning_rate": 0.00011500444019259047, + "loss": 1.4573, + "step": 13082 + }, + { + "epoch": 0.4685300911418698, + "grad_norm": 1.4621198177337646, + "learning_rate": 0.00011499297238816767, + "loss": 1.5936, + "step": 13083 + }, + { + "epoch": 0.46856590327143804, + "grad_norm": 1.402459979057312, + "learning_rate": 0.00011498150438203373, + "loss": 1.7086, + "step": 13084 + }, + { + "epoch": 0.4686017154010063, + "grad_norm": 1.5725468397140503, + "learning_rate": 0.00011497003617434301, + "loss": 1.4823, + "step": 13085 + }, + { + "epoch": 0.4686375275305746, + "grad_norm": 2.153813123703003, + "learning_rate": 0.00011495856776524971, + "loss": 1.2271, + "step": 13086 + }, + { + "epoch": 0.4686733396601429, + "grad_norm": 1.4646168947219849, + "learning_rate": 0.00011494709915490822, + "loss": 1.4015, + "step": 13087 + }, + { + "epoch": 0.4687091517897112, + "grad_norm": 1.6754770278930664, + "learning_rate": 0.00011493563034347277, + "loss": 1.5845, + "step": 13088 + }, + { + "epoch": 0.46874496391927944, + "grad_norm": 1.6596485376358032, + "learning_rate": 0.00011492416133109769, + "loss": 1.3597, + "step": 13089 + }, + { + "epoch": 0.46878077604884777, + "grad_norm": 1.3753113746643066, + "learning_rate": 0.00011491269211793725, + "loss": 1.4336, + "step": 13090 + }, + { + "epoch": 0.46881658817841604, + "grad_norm": 1.4884370565414429, + "learning_rate": 0.00011490122270414578, + "loss": 1.3863, + "step": 13091 + }, + { + "epoch": 0.4688524003079843, + "grad_norm": 2.0943214893341064, + "learning_rate": 0.0001148897530898776, + "loss": 1.1964, + "step": 13092 + }, + { + "epoch": 0.4688882124375526, + "grad_norm": 1.4185950756072998, + "learning_rate": 0.000114878283275287, + "loss": 1.3801, + "step": 13093 + }, + { + "epoch": 0.4689240245671209, + "grad_norm": 1.359785795211792, + "learning_rate": 0.00011486681326052828, + "loss": 1.501, + "step": 13094 + }, + { + "epoch": 0.46895983669668917, + "grad_norm": 1.7992392778396606, + "learning_rate": 0.00011485534304575575, + "loss": 1.6647, + "step": 13095 + }, + { + "epoch": 0.46899564882625744, + "grad_norm": 1.468299150466919, + "learning_rate": 0.00011484387263112377, + "loss": 1.3893, + "step": 13096 + }, + { + "epoch": 0.46903146095582576, + "grad_norm": 1.3851277828216553, + "learning_rate": 0.0001148324020167866, + "loss": 1.0484, + "step": 13097 + }, + { + "epoch": 0.46906727308539403, + "grad_norm": 2.560601234436035, + "learning_rate": 0.0001148209312028986, + "loss": 1.1958, + "step": 13098 + }, + { + "epoch": 0.4691030852149623, + "grad_norm": 1.7320241928100586, + "learning_rate": 0.0001148094601896141, + "loss": 1.3779, + "step": 13099 + }, + { + "epoch": 0.46913889734453057, + "grad_norm": 2.7635598182678223, + "learning_rate": 0.00011479798897708742, + "loss": 1.9673, + "step": 13100 + }, + { + "epoch": 0.4691747094740989, + "grad_norm": 2.2257347106933594, + "learning_rate": 0.00011478651756547287, + "loss": 1.7453, + "step": 13101 + }, + { + "epoch": 0.46921052160366716, + "grad_norm": 1.73111891746521, + "learning_rate": 0.00011477504595492481, + "loss": 1.1503, + "step": 13102 + }, + { + "epoch": 0.46924633373323543, + "grad_norm": 1.4754983186721802, + "learning_rate": 0.00011476357414559757, + "loss": 1.5525, + "step": 13103 + }, + { + "epoch": 0.46928214586280376, + "grad_norm": 3.5211539268493652, + "learning_rate": 0.00011475210213764547, + "loss": 1.7384, + "step": 13104 + }, + { + "epoch": 0.469317957992372, + "grad_norm": 1.7858822345733643, + "learning_rate": 0.00011474062993122288, + "loss": 1.4745, + "step": 13105 + }, + { + "epoch": 0.4693537701219403, + "grad_norm": 1.6460684537887573, + "learning_rate": 0.0001147291575264841, + "loss": 1.7301, + "step": 13106 + }, + { + "epoch": 0.46938958225150856, + "grad_norm": 1.3619961738586426, + "learning_rate": 0.00011471768492358354, + "loss": 1.2949, + "step": 13107 + }, + { + "epoch": 0.4694253943810769, + "grad_norm": 1.6338173151016235, + "learning_rate": 0.00011470621212267547, + "loss": 1.3691, + "step": 13108 + }, + { + "epoch": 0.46946120651064516, + "grad_norm": 1.5736229419708252, + "learning_rate": 0.00011469473912391433, + "loss": 1.38, + "step": 13109 + }, + { + "epoch": 0.4694970186402134, + "grad_norm": 1.666130542755127, + "learning_rate": 0.0001146832659274544, + "loss": 1.5988, + "step": 13110 + }, + { + "epoch": 0.46953283076978175, + "grad_norm": 2.119744062423706, + "learning_rate": 0.00011467179253345008, + "loss": 1.6468, + "step": 13111 + }, + { + "epoch": 0.46956864289935, + "grad_norm": 2.0681087970733643, + "learning_rate": 0.00011466031894205574, + "loss": 1.404, + "step": 13112 + }, + { + "epoch": 0.4696044550289183, + "grad_norm": 1.5972648859024048, + "learning_rate": 0.00011464884515342568, + "loss": 1.4593, + "step": 13113 + }, + { + "epoch": 0.46964026715848656, + "grad_norm": 1.4360387325286865, + "learning_rate": 0.00011463737116771434, + "loss": 1.2351, + "step": 13114 + }, + { + "epoch": 0.4696760792880549, + "grad_norm": 1.9295847415924072, + "learning_rate": 0.00011462589698507603, + "loss": 1.6692, + "step": 13115 + }, + { + "epoch": 0.46971189141762315, + "grad_norm": 1.6461082696914673, + "learning_rate": 0.00011461442260566513, + "loss": 1.2059, + "step": 13116 + }, + { + "epoch": 0.4697477035471914, + "grad_norm": 2.095632791519165, + "learning_rate": 0.00011460294802963602, + "loss": 1.2712, + "step": 13117 + }, + { + "epoch": 0.46978351567675974, + "grad_norm": 2.0552680492401123, + "learning_rate": 0.00011459147325714312, + "loss": 1.4689, + "step": 13118 + }, + { + "epoch": 0.469819327806328, + "grad_norm": 1.3072434663772583, + "learning_rate": 0.00011457999828834073, + "loss": 1.5075, + "step": 13119 + }, + { + "epoch": 0.4698551399358963, + "grad_norm": 2.8187243938446045, + "learning_rate": 0.00011456852312338331, + "loss": 1.3692, + "step": 13120 + }, + { + "epoch": 0.46989095206546455, + "grad_norm": 1.7331218719482422, + "learning_rate": 0.00011455704776242517, + "loss": 1.6142, + "step": 13121 + }, + { + "epoch": 0.4699267641950329, + "grad_norm": 1.6014031171798706, + "learning_rate": 0.00011454557220562074, + "loss": 1.3266, + "step": 13122 + }, + { + "epoch": 0.46996257632460114, + "grad_norm": 1.5360310077667236, + "learning_rate": 0.0001145340964531244, + "loss": 1.4116, + "step": 13123 + }, + { + "epoch": 0.4699983884541694, + "grad_norm": 1.7113350629806519, + "learning_rate": 0.00011452262050509053, + "loss": 1.3728, + "step": 13124 + }, + { + "epoch": 0.47003420058373774, + "grad_norm": 2.2795071601867676, + "learning_rate": 0.00011451114436167356, + "loss": 1.5459, + "step": 13125 + }, + { + "epoch": 0.470070012713306, + "grad_norm": 1.575621247291565, + "learning_rate": 0.00011449966802302783, + "loss": 1.7569, + "step": 13126 + }, + { + "epoch": 0.4701058248428743, + "grad_norm": 2.293635129928589, + "learning_rate": 0.0001144881914893078, + "loss": 1.1277, + "step": 13127 + }, + { + "epoch": 0.47014163697244254, + "grad_norm": 1.592656135559082, + "learning_rate": 0.00011447671476066781, + "loss": 1.4337, + "step": 13128 + }, + { + "epoch": 0.47017744910201087, + "grad_norm": 1.407986044883728, + "learning_rate": 0.00011446523783726235, + "loss": 1.4221, + "step": 13129 + }, + { + "epoch": 0.47021326123157914, + "grad_norm": 1.9266597032546997, + "learning_rate": 0.00011445376071924572, + "loss": 1.4518, + "step": 13130 + }, + { + "epoch": 0.4702490733611474, + "grad_norm": 1.3258838653564453, + "learning_rate": 0.00011444228340677241, + "loss": 1.705, + "step": 13131 + }, + { + "epoch": 0.47028488549071573, + "grad_norm": 1.457871675491333, + "learning_rate": 0.00011443080589999677, + "loss": 1.695, + "step": 13132 + }, + { + "epoch": 0.470320697620284, + "grad_norm": 1.8480134010314941, + "learning_rate": 0.00011441932819907328, + "loss": 1.5101, + "step": 13133 + }, + { + "epoch": 0.47035650974985227, + "grad_norm": 2.691573143005371, + "learning_rate": 0.00011440785030415633, + "loss": 1.2265, + "step": 13134 + }, + { + "epoch": 0.47039232187942054, + "grad_norm": 1.8243192434310913, + "learning_rate": 0.00011439637221540031, + "loss": 1.8265, + "step": 13135 + }, + { + "epoch": 0.47042813400898886, + "grad_norm": 1.973116397857666, + "learning_rate": 0.00011438489393295973, + "loss": 1.5911, + "step": 13136 + }, + { + "epoch": 0.47046394613855713, + "grad_norm": 1.8532606363296509, + "learning_rate": 0.00011437341545698892, + "loss": 1.4446, + "step": 13137 + }, + { + "epoch": 0.4704997582681254, + "grad_norm": 1.6671091318130493, + "learning_rate": 0.00011436193678764236, + "loss": 1.3737, + "step": 13138 + }, + { + "epoch": 0.4705355703976937, + "grad_norm": 1.4185718297958374, + "learning_rate": 0.00011435045792507443, + "loss": 1.4509, + "step": 13139 + }, + { + "epoch": 0.470571382527262, + "grad_norm": 1.73945152759552, + "learning_rate": 0.00011433897886943965, + "loss": 1.8183, + "step": 13140 + }, + { + "epoch": 0.47060719465683026, + "grad_norm": 2.0083441734313965, + "learning_rate": 0.00011432749962089235, + "loss": 1.1479, + "step": 13141 + }, + { + "epoch": 0.47064300678639853, + "grad_norm": 2.459001064300537, + "learning_rate": 0.00011431602017958707, + "loss": 1.3052, + "step": 13142 + }, + { + "epoch": 0.47067881891596686, + "grad_norm": 2.419342041015625, + "learning_rate": 0.00011430454054567819, + "loss": 1.6516, + "step": 13143 + }, + { + "epoch": 0.4707146310455351, + "grad_norm": 1.7359721660614014, + "learning_rate": 0.00011429306071932018, + "loss": 1.2583, + "step": 13144 + }, + { + "epoch": 0.4707504431751034, + "grad_norm": 1.5329176187515259, + "learning_rate": 0.00011428158070066743, + "loss": 1.6391, + "step": 13145 + }, + { + "epoch": 0.4707862553046717, + "grad_norm": 2.4268431663513184, + "learning_rate": 0.00011427010048987448, + "loss": 1.8331, + "step": 13146 + }, + { + "epoch": 0.47082206743424, + "grad_norm": 2.1451423168182373, + "learning_rate": 0.00011425862008709574, + "loss": 1.4687, + "step": 13147 + }, + { + "epoch": 0.47085787956380826, + "grad_norm": 1.776679515838623, + "learning_rate": 0.00011424713949248562, + "loss": 1.4487, + "step": 13148 + }, + { + "epoch": 0.4708936916933765, + "grad_norm": 1.735393762588501, + "learning_rate": 0.00011423565870619863, + "loss": 1.439, + "step": 13149 + }, + { + "epoch": 0.47092950382294485, + "grad_norm": 1.2827190160751343, + "learning_rate": 0.00011422417772838923, + "loss": 1.2839, + "step": 13150 + }, + { + "epoch": 0.4709653159525131, + "grad_norm": 1.7964860200881958, + "learning_rate": 0.00011421269655921185, + "loss": 1.345, + "step": 13151 + }, + { + "epoch": 0.4710011280820814, + "grad_norm": 1.468343734741211, + "learning_rate": 0.00011420121519882096, + "loss": 1.4463, + "step": 13152 + }, + { + "epoch": 0.4710369402116497, + "grad_norm": 1.5348808765411377, + "learning_rate": 0.00011418973364737107, + "loss": 1.5609, + "step": 13153 + }, + { + "epoch": 0.471072752341218, + "grad_norm": 1.6379225254058838, + "learning_rate": 0.00011417825190501658, + "loss": 1.3981, + "step": 13154 + }, + { + "epoch": 0.47110856447078625, + "grad_norm": 1.5981770753860474, + "learning_rate": 0.00011416676997191205, + "loss": 1.5883, + "step": 13155 + }, + { + "epoch": 0.4711443766003545, + "grad_norm": 1.8267465829849243, + "learning_rate": 0.00011415528784821188, + "loss": 1.6164, + "step": 13156 + }, + { + "epoch": 0.47118018872992284, + "grad_norm": 1.5526596307754517, + "learning_rate": 0.00011414380553407055, + "loss": 1.382, + "step": 13157 + }, + { + "epoch": 0.4712160008594911, + "grad_norm": 2.0587666034698486, + "learning_rate": 0.00011413232302964258, + "loss": 1.7454, + "step": 13158 + }, + { + "epoch": 0.4712518129890594, + "grad_norm": 1.841937780380249, + "learning_rate": 0.00011412084033508242, + "loss": 1.4733, + "step": 13159 + }, + { + "epoch": 0.4712876251186277, + "grad_norm": 2.0795516967773438, + "learning_rate": 0.00011410935745054459, + "loss": 1.391, + "step": 13160 + }, + { + "epoch": 0.471323437248196, + "grad_norm": 1.4707566499710083, + "learning_rate": 0.00011409787437618353, + "loss": 1.6325, + "step": 13161 + }, + { + "epoch": 0.47135924937776424, + "grad_norm": 2.5612967014312744, + "learning_rate": 0.00011408639111215378, + "loss": 1.6126, + "step": 13162 + }, + { + "epoch": 0.4713950615073325, + "grad_norm": 1.5790444612503052, + "learning_rate": 0.00011407490765860978, + "loss": 1.395, + "step": 13163 + }, + { + "epoch": 0.47143087363690084, + "grad_norm": 1.65281081199646, + "learning_rate": 0.00011406342401570609, + "loss": 1.5043, + "step": 13164 + }, + { + "epoch": 0.4714666857664691, + "grad_norm": 1.3939710855484009, + "learning_rate": 0.00011405194018359715, + "loss": 1.4143, + "step": 13165 + }, + { + "epoch": 0.4715024978960374, + "grad_norm": 1.7264175415039062, + "learning_rate": 0.00011404045616243745, + "loss": 1.6513, + "step": 13166 + }, + { + "epoch": 0.4715383100256057, + "grad_norm": 1.5612338781356812, + "learning_rate": 0.00011402897195238158, + "loss": 1.8289, + "step": 13167 + }, + { + "epoch": 0.47157412215517397, + "grad_norm": 1.8469725847244263, + "learning_rate": 0.00011401748755358395, + "loss": 1.3954, + "step": 13168 + }, + { + "epoch": 0.47160993428474224, + "grad_norm": 1.5684376955032349, + "learning_rate": 0.00011400600296619912, + "loss": 1.5721, + "step": 13169 + }, + { + "epoch": 0.4716457464143105, + "grad_norm": 1.6736838817596436, + "learning_rate": 0.00011399451819038159, + "loss": 1.3776, + "step": 13170 + }, + { + "epoch": 0.47168155854387883, + "grad_norm": 1.5765982866287231, + "learning_rate": 0.00011398303322628585, + "loss": 1.5997, + "step": 13171 + }, + { + "epoch": 0.4717173706734471, + "grad_norm": 1.5754213333129883, + "learning_rate": 0.00011397154807406645, + "loss": 1.3318, + "step": 13172 + }, + { + "epoch": 0.47175318280301537, + "grad_norm": 1.5405632257461548, + "learning_rate": 0.00011396006273387792, + "loss": 1.2578, + "step": 13173 + }, + { + "epoch": 0.47178899493258364, + "grad_norm": 1.6634725332260132, + "learning_rate": 0.0001139485772058747, + "loss": 1.7118, + "step": 13174 + }, + { + "epoch": 0.47182480706215196, + "grad_norm": 1.8009767532348633, + "learning_rate": 0.0001139370914902114, + "loss": 1.5235, + "step": 13175 + }, + { + "epoch": 0.47186061919172023, + "grad_norm": 2.3448922634124756, + "learning_rate": 0.00011392560558704249, + "loss": 1.4457, + "step": 13176 + }, + { + "epoch": 0.4718964313212885, + "grad_norm": 1.6356074810028076, + "learning_rate": 0.00011391411949652253, + "loss": 1.4746, + "step": 13177 + }, + { + "epoch": 0.4719322434508568, + "grad_norm": 2.3725569248199463, + "learning_rate": 0.00011390263321880605, + "loss": 1.4069, + "step": 13178 + }, + { + "epoch": 0.4719680555804251, + "grad_norm": 1.9027031660079956, + "learning_rate": 0.00011389114675404755, + "loss": 1.6479, + "step": 13179 + }, + { + "epoch": 0.47200386770999336, + "grad_norm": 1.7592540979385376, + "learning_rate": 0.00011387966010240161, + "loss": 1.5657, + "step": 13180 + }, + { + "epoch": 0.47203967983956163, + "grad_norm": 3.3529903888702393, + "learning_rate": 0.00011386817326402273, + "loss": 1.5566, + "step": 13181 + }, + { + "epoch": 0.47207549196912996, + "grad_norm": 2.228374481201172, + "learning_rate": 0.00011385668623906551, + "loss": 1.553, + "step": 13182 + }, + { + "epoch": 0.4721113040986982, + "grad_norm": 1.72072434425354, + "learning_rate": 0.00011384519902768441, + "loss": 1.503, + "step": 13183 + }, + { + "epoch": 0.4721471162282665, + "grad_norm": 1.6252473592758179, + "learning_rate": 0.00011383371163003403, + "loss": 1.2247, + "step": 13184 + }, + { + "epoch": 0.4721829283578348, + "grad_norm": 2.5225577354431152, + "learning_rate": 0.00011382222404626888, + "loss": 1.4804, + "step": 13185 + }, + { + "epoch": 0.4722187404874031, + "grad_norm": 1.921576738357544, + "learning_rate": 0.00011381073627654357, + "loss": 1.7643, + "step": 13186 + }, + { + "epoch": 0.47225455261697136, + "grad_norm": 1.2695116996765137, + "learning_rate": 0.00011379924832101258, + "loss": 1.6016, + "step": 13187 + }, + { + "epoch": 0.4722903647465396, + "grad_norm": 1.5697298049926758, + "learning_rate": 0.00011378776017983053, + "loss": 1.5873, + "step": 13188 + }, + { + "epoch": 0.47232617687610795, + "grad_norm": 1.467513084411621, + "learning_rate": 0.00011377627185315194, + "loss": 1.318, + "step": 13189 + }, + { + "epoch": 0.4723619890056762, + "grad_norm": 1.839012861251831, + "learning_rate": 0.00011376478334113139, + "loss": 1.3583, + "step": 13190 + }, + { + "epoch": 0.4723978011352445, + "grad_norm": 1.906954050064087, + "learning_rate": 0.00011375329464392343, + "loss": 1.7615, + "step": 13191 + }, + { + "epoch": 0.4724336132648128, + "grad_norm": 2.299314022064209, + "learning_rate": 0.00011374180576168263, + "loss": 1.3546, + "step": 13192 + }, + { + "epoch": 0.4724694253943811, + "grad_norm": 1.6634674072265625, + "learning_rate": 0.00011373031669456358, + "loss": 1.3662, + "step": 13193 + }, + { + "epoch": 0.47250523752394935, + "grad_norm": 1.2625876665115356, + "learning_rate": 0.0001137188274427208, + "loss": 1.4021, + "step": 13194 + }, + { + "epoch": 0.4725410496535176, + "grad_norm": 1.9035825729370117, + "learning_rate": 0.00011370733800630892, + "loss": 1.4559, + "step": 13195 + }, + { + "epoch": 0.47257686178308594, + "grad_norm": 1.5406285524368286, + "learning_rate": 0.00011369584838548246, + "loss": 1.3728, + "step": 13196 + }, + { + "epoch": 0.4726126739126542, + "grad_norm": 1.461430311203003, + "learning_rate": 0.00011368435858039605, + "loss": 1.3152, + "step": 13197 + }, + { + "epoch": 0.4726484860422225, + "grad_norm": 1.4157794713974, + "learning_rate": 0.00011367286859120423, + "loss": 1.328, + "step": 13198 + }, + { + "epoch": 0.4726842981717908, + "grad_norm": 1.5689760446548462, + "learning_rate": 0.00011366137841806161, + "loss": 1.5654, + "step": 13199 + }, + { + "epoch": 0.4727201103013591, + "grad_norm": 1.5379902124404907, + "learning_rate": 0.00011364988806112278, + "loss": 1.5379, + "step": 13200 + }, + { + "epoch": 0.47275592243092734, + "grad_norm": 2.011894941329956, + "learning_rate": 0.00011363839752054228, + "loss": 1.8056, + "step": 13201 + }, + { + "epoch": 0.4727917345604956, + "grad_norm": 1.3880537748336792, + "learning_rate": 0.00011362690679647477, + "loss": 1.5678, + "step": 13202 + }, + { + "epoch": 0.47282754669006394, + "grad_norm": 1.6055556535720825, + "learning_rate": 0.00011361541588907477, + "loss": 1.6305, + "step": 13203 + }, + { + "epoch": 0.4728633588196322, + "grad_norm": 1.4604688882827759, + "learning_rate": 0.00011360392479849693, + "loss": 1.4214, + "step": 13204 + }, + { + "epoch": 0.4728991709492005, + "grad_norm": 1.9168239831924438, + "learning_rate": 0.00011359243352489581, + "loss": 1.7095, + "step": 13205 + }, + { + "epoch": 0.4729349830787688, + "grad_norm": 1.8139801025390625, + "learning_rate": 0.00011358094206842607, + "loss": 1.5206, + "step": 13206 + }, + { + "epoch": 0.47297079520833707, + "grad_norm": 1.9469952583312988, + "learning_rate": 0.00011356945042924223, + "loss": 1.613, + "step": 13207 + }, + { + "epoch": 0.47300660733790534, + "grad_norm": 1.7959978580474854, + "learning_rate": 0.00011355795860749899, + "loss": 1.53, + "step": 13208 + }, + { + "epoch": 0.4730424194674736, + "grad_norm": 1.6030470132827759, + "learning_rate": 0.00011354646660335086, + "loss": 1.4548, + "step": 13209 + }, + { + "epoch": 0.47307823159704193, + "grad_norm": 2.472611904144287, + "learning_rate": 0.00011353497441695251, + "loss": 1.3722, + "step": 13210 + }, + { + "epoch": 0.4731140437266102, + "grad_norm": 1.9840824604034424, + "learning_rate": 0.00011352348204845853, + "loss": 1.2221, + "step": 13211 + }, + { + "epoch": 0.47314985585617847, + "grad_norm": 1.8577934503555298, + "learning_rate": 0.00011351198949802355, + "loss": 1.0622, + "step": 13212 + }, + { + "epoch": 0.4731856679857468, + "grad_norm": 1.9593387842178345, + "learning_rate": 0.0001135004967658022, + "loss": 1.6045, + "step": 13213 + }, + { + "epoch": 0.47322148011531506, + "grad_norm": 1.7175911664962769, + "learning_rate": 0.00011348900385194903, + "loss": 1.5449, + "step": 13214 + }, + { + "epoch": 0.47325729224488333, + "grad_norm": 1.7575346231460571, + "learning_rate": 0.00011347751075661876, + "loss": 1.5037, + "step": 13215 + }, + { + "epoch": 0.4732931043744516, + "grad_norm": 2.017416477203369, + "learning_rate": 0.00011346601747996595, + "loss": 1.57, + "step": 13216 + }, + { + "epoch": 0.4733289165040199, + "grad_norm": 1.4869771003723145, + "learning_rate": 0.00011345452402214527, + "loss": 1.5248, + "step": 13217 + }, + { + "epoch": 0.4733647286335882, + "grad_norm": 1.63876473903656, + "learning_rate": 0.0001134430303833113, + "loss": 1.5725, + "step": 13218 + }, + { + "epoch": 0.47340054076315646, + "grad_norm": 1.4624106884002686, + "learning_rate": 0.00011343153656361867, + "loss": 1.3269, + "step": 13219 + }, + { + "epoch": 0.4734363528927248, + "grad_norm": 2.3383195400238037, + "learning_rate": 0.00011342004256322208, + "loss": 1.415, + "step": 13220 + }, + { + "epoch": 0.47347216502229306, + "grad_norm": 1.4437799453735352, + "learning_rate": 0.00011340854838227611, + "loss": 1.6656, + "step": 13221 + }, + { + "epoch": 0.4735079771518613, + "grad_norm": 1.8068561553955078, + "learning_rate": 0.00011339705402093543, + "loss": 1.8916, + "step": 13222 + }, + { + "epoch": 0.4735437892814296, + "grad_norm": 2.249796152114868, + "learning_rate": 0.00011338555947935465, + "loss": 1.6621, + "step": 13223 + }, + { + "epoch": 0.4735796014109979, + "grad_norm": 1.4828672409057617, + "learning_rate": 0.00011337406475768846, + "loss": 1.7506, + "step": 13224 + }, + { + "epoch": 0.4736154135405662, + "grad_norm": 1.364309310913086, + "learning_rate": 0.00011336256985609144, + "loss": 1.5618, + "step": 13225 + }, + { + "epoch": 0.47365122567013446, + "grad_norm": 1.287990689277649, + "learning_rate": 0.00011335107477471834, + "loss": 1.3136, + "step": 13226 + }, + { + "epoch": 0.4736870377997028, + "grad_norm": 2.1887640953063965, + "learning_rate": 0.00011333957951372372, + "loss": 1.2726, + "step": 13227 + }, + { + "epoch": 0.47372284992927105, + "grad_norm": 1.8509095907211304, + "learning_rate": 0.00011332808407326225, + "loss": 1.3898, + "step": 13228 + }, + { + "epoch": 0.4737586620588393, + "grad_norm": 1.4751075506210327, + "learning_rate": 0.0001133165884534886, + "loss": 1.7061, + "step": 13229 + }, + { + "epoch": 0.4737944741884076, + "grad_norm": 1.6264978647232056, + "learning_rate": 0.00011330509265455745, + "loss": 1.5171, + "step": 13230 + }, + { + "epoch": 0.4738302863179759, + "grad_norm": 1.8143638372421265, + "learning_rate": 0.00011329359667662342, + "loss": 1.2657, + "step": 13231 + }, + { + "epoch": 0.4738660984475442, + "grad_norm": 1.415449619293213, + "learning_rate": 0.00011328210051984118, + "loss": 1.6253, + "step": 13232 + }, + { + "epoch": 0.47390191057711245, + "grad_norm": 1.2469733953475952, + "learning_rate": 0.00011327060418436545, + "loss": 1.4911, + "step": 13233 + }, + { + "epoch": 0.4739377227066808, + "grad_norm": 1.2812349796295166, + "learning_rate": 0.00011325910767035086, + "loss": 1.2655, + "step": 13234 + }, + { + "epoch": 0.47397353483624904, + "grad_norm": 1.9594454765319824, + "learning_rate": 0.00011324761097795206, + "loss": 1.7503, + "step": 13235 + }, + { + "epoch": 0.4740093469658173, + "grad_norm": 1.5524890422821045, + "learning_rate": 0.00011323611410732375, + "loss": 1.5084, + "step": 13236 + }, + { + "epoch": 0.4740451590953856, + "grad_norm": 1.7627413272857666, + "learning_rate": 0.0001132246170586206, + "loss": 1.5972, + "step": 13237 + }, + { + "epoch": 0.4740809712249539, + "grad_norm": 1.644841194152832, + "learning_rate": 0.00011321311983199727, + "loss": 1.7392, + "step": 13238 + }, + { + "epoch": 0.4741167833545222, + "grad_norm": 1.3779560327529907, + "learning_rate": 0.00011320162242760848, + "loss": 1.4715, + "step": 13239 + }, + { + "epoch": 0.47415259548409044, + "grad_norm": 1.7711284160614014, + "learning_rate": 0.00011319012484560885, + "loss": 1.5618, + "step": 13240 + }, + { + "epoch": 0.47418840761365877, + "grad_norm": 1.8239142894744873, + "learning_rate": 0.00011317862708615314, + "loss": 1.6008, + "step": 13241 + }, + { + "epoch": 0.47422421974322704, + "grad_norm": 1.5546090602874756, + "learning_rate": 0.00011316712914939598, + "loss": 1.5524, + "step": 13242 + }, + { + "epoch": 0.4742600318727953, + "grad_norm": 1.3858740329742432, + "learning_rate": 0.00011315563103549211, + "loss": 1.0793, + "step": 13243 + }, + { + "epoch": 0.4742958440023636, + "grad_norm": 2.5385050773620605, + "learning_rate": 0.00011314413274459618, + "loss": 1.5313, + "step": 13244 + }, + { + "epoch": 0.4743316561319319, + "grad_norm": 2.2116689682006836, + "learning_rate": 0.0001131326342768629, + "loss": 1.6439, + "step": 13245 + }, + { + "epoch": 0.47436746826150017, + "grad_norm": 1.4536186456680298, + "learning_rate": 0.00011312113563244695, + "loss": 1.6257, + "step": 13246 + }, + { + "epoch": 0.47440328039106844, + "grad_norm": 2.8695340156555176, + "learning_rate": 0.00011310963681150304, + "loss": 1.7846, + "step": 13247 + }, + { + "epoch": 0.47443909252063676, + "grad_norm": 1.562320590019226, + "learning_rate": 0.0001130981378141859, + "loss": 1.8618, + "step": 13248 + }, + { + "epoch": 0.47447490465020503, + "grad_norm": 1.378989815711975, + "learning_rate": 0.0001130866386406502, + "loss": 1.4203, + "step": 13249 + }, + { + "epoch": 0.4745107167797733, + "grad_norm": 1.7417534589767456, + "learning_rate": 0.00011307513929105067, + "loss": 1.4547, + "step": 13250 + }, + { + "epoch": 0.47454652890934157, + "grad_norm": 2.0014612674713135, + "learning_rate": 0.000113063639765542, + "loss": 1.3752, + "step": 13251 + }, + { + "epoch": 0.4745823410389099, + "grad_norm": 1.651104211807251, + "learning_rate": 0.00011305214006427892, + "loss": 1.3597, + "step": 13252 + }, + { + "epoch": 0.47461815316847816, + "grad_norm": 2.426689624786377, + "learning_rate": 0.00011304064018741612, + "loss": 1.6382, + "step": 13253 + }, + { + "epoch": 0.47465396529804643, + "grad_norm": 1.799432635307312, + "learning_rate": 0.0001130291401351083, + "loss": 1.5994, + "step": 13254 + }, + { + "epoch": 0.47468977742761476, + "grad_norm": 1.2933775186538696, + "learning_rate": 0.00011301763990751025, + "loss": 1.5075, + "step": 13255 + }, + { + "epoch": 0.474725589557183, + "grad_norm": 1.687794804573059, + "learning_rate": 0.00011300613950477661, + "loss": 1.7965, + "step": 13256 + }, + { + "epoch": 0.4747614016867513, + "grad_norm": 1.6241377592086792, + "learning_rate": 0.00011299463892706217, + "loss": 1.7507, + "step": 13257 + }, + { + "epoch": 0.47479721381631956, + "grad_norm": 1.474320650100708, + "learning_rate": 0.0001129831381745216, + "loss": 1.4066, + "step": 13258 + }, + { + "epoch": 0.4748330259458879, + "grad_norm": 1.6746569871902466, + "learning_rate": 0.00011297163724730968, + "loss": 1.5502, + "step": 13259 + }, + { + "epoch": 0.47486883807545616, + "grad_norm": 1.6739968061447144, + "learning_rate": 0.00011296013614558107, + "loss": 0.9947, + "step": 13260 + }, + { + "epoch": 0.4749046502050244, + "grad_norm": 1.6017934083938599, + "learning_rate": 0.00011294863486949059, + "loss": 1.1272, + "step": 13261 + }, + { + "epoch": 0.47494046233459275, + "grad_norm": 1.7970075607299805, + "learning_rate": 0.00011293713341919292, + "loss": 1.5517, + "step": 13262 + }, + { + "epoch": 0.474976274464161, + "grad_norm": 2.006655216217041, + "learning_rate": 0.0001129256317948428, + "loss": 1.7006, + "step": 13263 + }, + { + "epoch": 0.4750120865937293, + "grad_norm": 1.580830693244934, + "learning_rate": 0.00011291412999659499, + "loss": 1.593, + "step": 13264 + }, + { + "epoch": 0.47504789872329756, + "grad_norm": 1.7830466032028198, + "learning_rate": 0.00011290262802460419, + "loss": 1.3509, + "step": 13265 + }, + { + "epoch": 0.4750837108528659, + "grad_norm": 1.5933284759521484, + "learning_rate": 0.0001128911258790252, + "loss": 1.3845, + "step": 13266 + }, + { + "epoch": 0.47511952298243415, + "grad_norm": 1.4569121599197388, + "learning_rate": 0.00011287962356001272, + "loss": 1.4742, + "step": 13267 + }, + { + "epoch": 0.4751553351120024, + "grad_norm": 1.6202170848846436, + "learning_rate": 0.00011286812106772153, + "loss": 1.5486, + "step": 13268 + }, + { + "epoch": 0.47519114724157074, + "grad_norm": 1.7830826044082642, + "learning_rate": 0.00011285661840230636, + "loss": 1.2932, + "step": 13269 + }, + { + "epoch": 0.475226959371139, + "grad_norm": 1.6476013660430908, + "learning_rate": 0.000112845115563922, + "loss": 1.6752, + "step": 13270 + }, + { + "epoch": 0.4752627715007073, + "grad_norm": 1.4701865911483765, + "learning_rate": 0.00011283361255272315, + "loss": 1.2875, + "step": 13271 + }, + { + "epoch": 0.47529858363027555, + "grad_norm": 1.7325506210327148, + "learning_rate": 0.00011282210936886463, + "loss": 1.4609, + "step": 13272 + }, + { + "epoch": 0.4753343957598439, + "grad_norm": 1.6176021099090576, + "learning_rate": 0.00011281060601250113, + "loss": 1.2128, + "step": 13273 + }, + { + "epoch": 0.47537020788941214, + "grad_norm": 1.9186975955963135, + "learning_rate": 0.00011279910248378746, + "loss": 1.6815, + "step": 13274 + }, + { + "epoch": 0.4754060200189804, + "grad_norm": 1.3656400442123413, + "learning_rate": 0.00011278759878287839, + "loss": 1.6489, + "step": 13275 + }, + { + "epoch": 0.47544183214854874, + "grad_norm": 1.6302509307861328, + "learning_rate": 0.00011277609490992866, + "loss": 1.5046, + "step": 13276 + }, + { + "epoch": 0.475477644278117, + "grad_norm": 1.7330888509750366, + "learning_rate": 0.00011276459086509305, + "loss": 1.3758, + "step": 13277 + }, + { + "epoch": 0.4755134564076853, + "grad_norm": 1.6934552192687988, + "learning_rate": 0.00011275308664852635, + "loss": 1.5643, + "step": 13278 + }, + { + "epoch": 0.47554926853725354, + "grad_norm": 1.8864017724990845, + "learning_rate": 0.00011274158226038334, + "loss": 1.5, + "step": 13279 + }, + { + "epoch": 0.47558508066682187, + "grad_norm": 1.9266653060913086, + "learning_rate": 0.00011273007770081873, + "loss": 1.5524, + "step": 13280 + }, + { + "epoch": 0.47562089279639014, + "grad_norm": 1.5648225545883179, + "learning_rate": 0.00011271857296998737, + "loss": 1.3658, + "step": 13281 + }, + { + "epoch": 0.4756567049259584, + "grad_norm": 1.9169940948486328, + "learning_rate": 0.000112707068068044, + "loss": 1.7119, + "step": 13282 + }, + { + "epoch": 0.47569251705552673, + "grad_norm": 1.5786782503128052, + "learning_rate": 0.00011269556299514346, + "loss": 1.2967, + "step": 13283 + }, + { + "epoch": 0.475728329185095, + "grad_norm": 1.858866810798645, + "learning_rate": 0.00011268405775144044, + "loss": 1.4823, + "step": 13284 + }, + { + "epoch": 0.47576414131466327, + "grad_norm": 1.7044215202331543, + "learning_rate": 0.00011267255233708982, + "loss": 1.3542, + "step": 13285 + }, + { + "epoch": 0.47579995344423154, + "grad_norm": 1.5558875799179077, + "learning_rate": 0.00011266104675224633, + "loss": 1.3713, + "step": 13286 + }, + { + "epoch": 0.47583576557379986, + "grad_norm": 1.7449630498886108, + "learning_rate": 0.00011264954099706481, + "loss": 1.3809, + "step": 13287 + }, + { + "epoch": 0.47587157770336813, + "grad_norm": 1.7783395051956177, + "learning_rate": 0.00011263803507170005, + "loss": 1.4655, + "step": 13288 + }, + { + "epoch": 0.4759073898329364, + "grad_norm": 2.3066070079803467, + "learning_rate": 0.00011262652897630678, + "loss": 1.5637, + "step": 13289 + }, + { + "epoch": 0.4759432019625047, + "grad_norm": 1.9367107152938843, + "learning_rate": 0.0001126150227110399, + "loss": 1.4018, + "step": 13290 + }, + { + "epoch": 0.475979014092073, + "grad_norm": 1.6617939472198486, + "learning_rate": 0.00011260351627605413, + "loss": 1.2903, + "step": 13291 + }, + { + "epoch": 0.47601482622164126, + "grad_norm": 1.5744348764419556, + "learning_rate": 0.00011259200967150432, + "loss": 1.2362, + "step": 13292 + }, + { + "epoch": 0.47605063835120953, + "grad_norm": 2.4343421459198, + "learning_rate": 0.00011258050289754524, + "loss": 1.8115, + "step": 13293 + }, + { + "epoch": 0.47608645048077786, + "grad_norm": 1.7691670656204224, + "learning_rate": 0.00011256899595433175, + "loss": 1.4662, + "step": 13294 + }, + { + "epoch": 0.4761222626103461, + "grad_norm": 1.9748578071594238, + "learning_rate": 0.0001125574888420186, + "loss": 1.4279, + "step": 13295 + }, + { + "epoch": 0.4761580747399144, + "grad_norm": 1.6512370109558105, + "learning_rate": 0.00011254598156076066, + "loss": 1.7022, + "step": 13296 + }, + { + "epoch": 0.4761938868694827, + "grad_norm": 2.0833606719970703, + "learning_rate": 0.00011253447411071274, + "loss": 1.7322, + "step": 13297 + }, + { + "epoch": 0.476229698999051, + "grad_norm": 1.4301164150238037, + "learning_rate": 0.00011252296649202957, + "loss": 1.4129, + "step": 13298 + }, + { + "epoch": 0.47626551112861926, + "grad_norm": 1.6318367719650269, + "learning_rate": 0.00011251145870486612, + "loss": 1.3278, + "step": 13299 + }, + { + "epoch": 0.4763013232581875, + "grad_norm": 1.5819051265716553, + "learning_rate": 0.00011249995074937708, + "loss": 1.6432, + "step": 13300 + }, + { + "epoch": 0.47633713538775585, + "grad_norm": 1.7689194679260254, + "learning_rate": 0.00011248844262571737, + "loss": 1.4826, + "step": 13301 + }, + { + "epoch": 0.4763729475173241, + "grad_norm": 1.4098398685455322, + "learning_rate": 0.00011247693433404172, + "loss": 1.0356, + "step": 13302 + }, + { + "epoch": 0.4764087596468924, + "grad_norm": 1.6499768495559692, + "learning_rate": 0.00011246542587450504, + "loss": 1.6287, + "step": 13303 + }, + { + "epoch": 0.4764445717764607, + "grad_norm": 1.453571081161499, + "learning_rate": 0.00011245391724726213, + "loss": 1.3817, + "step": 13304 + }, + { + "epoch": 0.476480383906029, + "grad_norm": 3.2583165168762207, + "learning_rate": 0.00011244240845246783, + "loss": 1.4521, + "step": 13305 + }, + { + "epoch": 0.47651619603559725, + "grad_norm": 1.647336721420288, + "learning_rate": 0.00011243089949027699, + "loss": 1.613, + "step": 13306 + }, + { + "epoch": 0.4765520081651655, + "grad_norm": 1.7628077268600464, + "learning_rate": 0.0001124193903608444, + "loss": 1.5009, + "step": 13307 + }, + { + "epoch": 0.47658782029473384, + "grad_norm": 1.7588131427764893, + "learning_rate": 0.00011240788106432496, + "loss": 1.691, + "step": 13308 + }, + { + "epoch": 0.4766236324243021, + "grad_norm": 2.2901549339294434, + "learning_rate": 0.00011239637160087346, + "loss": 1.419, + "step": 13309 + }, + { + "epoch": 0.4766594445538704, + "grad_norm": 1.769014596939087, + "learning_rate": 0.00011238486197064479, + "loss": 1.4347, + "step": 13310 + }, + { + "epoch": 0.4766952566834387, + "grad_norm": 1.45048987865448, + "learning_rate": 0.00011237335217379377, + "loss": 1.4141, + "step": 13311 + }, + { + "epoch": 0.476731068813007, + "grad_norm": 2.432612895965576, + "learning_rate": 0.00011236184221047526, + "loss": 1.8489, + "step": 13312 + }, + { + "epoch": 0.47676688094257524, + "grad_norm": 1.4043242931365967, + "learning_rate": 0.00011235033208084411, + "loss": 1.5789, + "step": 13313 + }, + { + "epoch": 0.4768026930721435, + "grad_norm": 1.6627109050750732, + "learning_rate": 0.00011233882178505519, + "loss": 1.4942, + "step": 13314 + }, + { + "epoch": 0.47683850520171184, + "grad_norm": 2.114083766937256, + "learning_rate": 0.00011232731132326331, + "loss": 1.534, + "step": 13315 + }, + { + "epoch": 0.4768743173312801, + "grad_norm": 1.5076711177825928, + "learning_rate": 0.00011231580069562335, + "loss": 1.359, + "step": 13316 + }, + { + "epoch": 0.4769101294608484, + "grad_norm": 1.8410205841064453, + "learning_rate": 0.0001123042899022902, + "loss": 1.624, + "step": 13317 + }, + { + "epoch": 0.4769459415904167, + "grad_norm": 1.6031441688537598, + "learning_rate": 0.00011229277894341869, + "loss": 1.6562, + "step": 13318 + }, + { + "epoch": 0.47698175371998497, + "grad_norm": 1.749197006225586, + "learning_rate": 0.0001122812678191637, + "loss": 1.6577, + "step": 13319 + }, + { + "epoch": 0.47701756584955324, + "grad_norm": 2.062668800354004, + "learning_rate": 0.00011226975652968011, + "loss": 1.5842, + "step": 13320 + }, + { + "epoch": 0.4770533779791215, + "grad_norm": 2.1983795166015625, + "learning_rate": 0.00011225824507512275, + "loss": 1.3001, + "step": 13321 + }, + { + "epoch": 0.47708919010868983, + "grad_norm": 1.686004400253296, + "learning_rate": 0.00011224673345564651, + "loss": 1.5864, + "step": 13322 + }, + { + "epoch": 0.4771250022382581, + "grad_norm": 1.8390787839889526, + "learning_rate": 0.0001122352216714063, + "loss": 1.6522, + "step": 13323 + }, + { + "epoch": 0.47716081436782637, + "grad_norm": 1.6921519041061401, + "learning_rate": 0.00011222370972255694, + "loss": 1.4168, + "step": 13324 + }, + { + "epoch": 0.4771966264973947, + "grad_norm": 1.567881464958191, + "learning_rate": 0.00011221219760925334, + "loss": 1.4169, + "step": 13325 + }, + { + "epoch": 0.47723243862696296, + "grad_norm": 1.9966059923171997, + "learning_rate": 0.00011220068533165036, + "loss": 1.4439, + "step": 13326 + }, + { + "epoch": 0.47726825075653123, + "grad_norm": 1.6342418193817139, + "learning_rate": 0.00011218917288990292, + "loss": 1.4409, + "step": 13327 + }, + { + "epoch": 0.4773040628860995, + "grad_norm": 1.9518680572509766, + "learning_rate": 0.00011217766028416585, + "loss": 1.5651, + "step": 13328 + }, + { + "epoch": 0.4773398750156678, + "grad_norm": 1.5526176691055298, + "learning_rate": 0.00011216614751459408, + "loss": 1.6035, + "step": 13329 + }, + { + "epoch": 0.4773756871452361, + "grad_norm": 1.5337470769882202, + "learning_rate": 0.00011215463458134252, + "loss": 1.6495, + "step": 13330 + }, + { + "epoch": 0.47741149927480436, + "grad_norm": 2.321065902709961, + "learning_rate": 0.000112143121484566, + "loss": 1.6215, + "step": 13331 + }, + { + "epoch": 0.4774473114043727, + "grad_norm": 1.355303168296814, + "learning_rate": 0.00011213160822441948, + "loss": 1.6907, + "step": 13332 + }, + { + "epoch": 0.47748312353394096, + "grad_norm": 1.7356313467025757, + "learning_rate": 0.00011212009480105777, + "loss": 1.5611, + "step": 13333 + }, + { + "epoch": 0.4775189356635092, + "grad_norm": 2.1936237812042236, + "learning_rate": 0.00011210858121463586, + "loss": 1.4047, + "step": 13334 + }, + { + "epoch": 0.4775547477930775, + "grad_norm": 1.9643810987472534, + "learning_rate": 0.00011209706746530858, + "loss": 1.6057, + "step": 13335 + }, + { + "epoch": 0.4775905599226458, + "grad_norm": 1.537932276725769, + "learning_rate": 0.00011208555355323088, + "loss": 1.3793, + "step": 13336 + }, + { + "epoch": 0.4776263720522141, + "grad_norm": 1.4360474348068237, + "learning_rate": 0.00011207403947855761, + "loss": 1.2832, + "step": 13337 + }, + { + "epoch": 0.47766218418178236, + "grad_norm": 1.8667532205581665, + "learning_rate": 0.00011206252524144373, + "loss": 1.3285, + "step": 13338 + }, + { + "epoch": 0.4776979963113507, + "grad_norm": 1.374731183052063, + "learning_rate": 0.00011205101084204414, + "loss": 1.4788, + "step": 13339 + }, + { + "epoch": 0.47773380844091895, + "grad_norm": 1.8810253143310547, + "learning_rate": 0.00011203949628051376, + "loss": 1.4788, + "step": 13340 + }, + { + "epoch": 0.4777696205704872, + "grad_norm": 1.5324774980545044, + "learning_rate": 0.00011202798155700748, + "loss": 1.472, + "step": 13341 + }, + { + "epoch": 0.4778054327000555, + "grad_norm": 1.8011895418167114, + "learning_rate": 0.0001120164666716802, + "loss": 1.3606, + "step": 13342 + }, + { + "epoch": 0.4778412448296238, + "grad_norm": 1.6887016296386719, + "learning_rate": 0.0001120049516246869, + "loss": 1.562, + "step": 13343 + }, + { + "epoch": 0.4778770569591921, + "grad_norm": 1.4731894731521606, + "learning_rate": 0.0001119934364161824, + "loss": 1.3769, + "step": 13344 + }, + { + "epoch": 0.47791286908876035, + "grad_norm": 1.3754960298538208, + "learning_rate": 0.00011198192104632174, + "loss": 1.2356, + "step": 13345 + }, + { + "epoch": 0.4779486812183287, + "grad_norm": 1.9692893028259277, + "learning_rate": 0.00011197040551525977, + "loss": 1.4277, + "step": 13346 + }, + { + "epoch": 0.47798449334789694, + "grad_norm": 1.330018401145935, + "learning_rate": 0.00011195888982315144, + "loss": 1.4329, + "step": 13347 + }, + { + "epoch": 0.4780203054774652, + "grad_norm": 2.0799808502197266, + "learning_rate": 0.00011194737397015164, + "loss": 1.5684, + "step": 13348 + }, + { + "epoch": 0.4780561176070335, + "grad_norm": 1.6616966724395752, + "learning_rate": 0.00011193585795641539, + "loss": 1.2649, + "step": 13349 + }, + { + "epoch": 0.4780919297366018, + "grad_norm": 1.8805179595947266, + "learning_rate": 0.00011192434178209755, + "loss": 1.3895, + "step": 13350 + }, + { + "epoch": 0.4781277418661701, + "grad_norm": 1.4745457172393799, + "learning_rate": 0.00011191282544735304, + "loss": 1.6853, + "step": 13351 + }, + { + "epoch": 0.47816355399573834, + "grad_norm": 1.5154930353164673, + "learning_rate": 0.00011190130895233686, + "loss": 1.5363, + "step": 13352 + }, + { + "epoch": 0.47819936612530667, + "grad_norm": 1.273663878440857, + "learning_rate": 0.00011188979229720389, + "loss": 1.5082, + "step": 13353 + }, + { + "epoch": 0.47823517825487494, + "grad_norm": 1.5829232931137085, + "learning_rate": 0.00011187827548210915, + "loss": 1.4881, + "step": 13354 + }, + { + "epoch": 0.4782709903844432, + "grad_norm": 2.338301658630371, + "learning_rate": 0.00011186675850720749, + "loss": 1.3946, + "step": 13355 + }, + { + "epoch": 0.4783068025140115, + "grad_norm": 1.3622664213180542, + "learning_rate": 0.00011185524137265393, + "loss": 1.5696, + "step": 13356 + }, + { + "epoch": 0.4783426146435798, + "grad_norm": 2.10280179977417, + "learning_rate": 0.00011184372407860336, + "loss": 1.7405, + "step": 13357 + }, + { + "epoch": 0.47837842677314807, + "grad_norm": 1.4048125743865967, + "learning_rate": 0.00011183220662521079, + "loss": 1.6725, + "step": 13358 + }, + { + "epoch": 0.47841423890271634, + "grad_norm": 1.3935976028442383, + "learning_rate": 0.00011182068901263114, + "loss": 1.3918, + "step": 13359 + }, + { + "epoch": 0.47845005103228466, + "grad_norm": 2.3466100692749023, + "learning_rate": 0.00011180917124101936, + "loss": 1.7777, + "step": 13360 + }, + { + "epoch": 0.47848586316185293, + "grad_norm": 1.657720923423767, + "learning_rate": 0.0001117976533105304, + "loss": 1.563, + "step": 13361 + }, + { + "epoch": 0.4785216752914212, + "grad_norm": 1.7979260683059692, + "learning_rate": 0.00011178613522131924, + "loss": 1.2531, + "step": 13362 + }, + { + "epoch": 0.47855748742098947, + "grad_norm": 1.7003462314605713, + "learning_rate": 0.00011177461697354084, + "loss": 1.6431, + "step": 13363 + }, + { + "epoch": 0.4785932995505578, + "grad_norm": 1.7036478519439697, + "learning_rate": 0.00011176309856735014, + "loss": 1.3807, + "step": 13364 + }, + { + "epoch": 0.47862911168012606, + "grad_norm": 1.8643072843551636, + "learning_rate": 0.00011175158000290216, + "loss": 1.6466, + "step": 13365 + }, + { + "epoch": 0.47866492380969433, + "grad_norm": 1.696761131286621, + "learning_rate": 0.00011174006128035178, + "loss": 1.4672, + "step": 13366 + }, + { + "epoch": 0.47870073593926266, + "grad_norm": 1.6161099672317505, + "learning_rate": 0.00011172854239985409, + "loss": 1.4775, + "step": 13367 + }, + { + "epoch": 0.4787365480688309, + "grad_norm": 1.5913996696472168, + "learning_rate": 0.0001117170233615639, + "loss": 1.2976, + "step": 13368 + }, + { + "epoch": 0.4787723601983992, + "grad_norm": 1.3425949811935425, + "learning_rate": 0.00011170550416563634, + "loss": 1.2461, + "step": 13369 + }, + { + "epoch": 0.47880817232796746, + "grad_norm": 1.8969595432281494, + "learning_rate": 0.0001116939848122263, + "loss": 1.4931, + "step": 13370 + }, + { + "epoch": 0.4788439844575358, + "grad_norm": 1.8349109888076782, + "learning_rate": 0.00011168246530148876, + "loss": 1.6161, + "step": 13371 + }, + { + "epoch": 0.47887979658710406, + "grad_norm": 2.742892265319824, + "learning_rate": 0.00011167094563357876, + "loss": 1.6424, + "step": 13372 + }, + { + "epoch": 0.4789156087166723, + "grad_norm": 3.602156162261963, + "learning_rate": 0.00011165942580865118, + "loss": 1.5828, + "step": 13373 + }, + { + "epoch": 0.4789514208462406, + "grad_norm": 1.6566842794418335, + "learning_rate": 0.00011164790582686113, + "loss": 1.4616, + "step": 13374 + }, + { + "epoch": 0.4789872329758089, + "grad_norm": 1.709389567375183, + "learning_rate": 0.0001116363856883635, + "loss": 1.5164, + "step": 13375 + }, + { + "epoch": 0.4790230451053772, + "grad_norm": 2.565922498703003, + "learning_rate": 0.00011162486539331334, + "loss": 1.4494, + "step": 13376 + }, + { + "epoch": 0.47905885723494546, + "grad_norm": 1.883772373199463, + "learning_rate": 0.00011161334494186557, + "loss": 1.4987, + "step": 13377 + }, + { + "epoch": 0.4790946693645138, + "grad_norm": 1.9089127779006958, + "learning_rate": 0.00011160182433417525, + "loss": 1.4315, + "step": 13378 + }, + { + "epoch": 0.47913048149408205, + "grad_norm": 1.619033932685852, + "learning_rate": 0.00011159030357039733, + "loss": 1.2727, + "step": 13379 + }, + { + "epoch": 0.4791662936236503, + "grad_norm": 1.3101295232772827, + "learning_rate": 0.00011157878265068685, + "loss": 1.4185, + "step": 13380 + }, + { + "epoch": 0.4792021057532186, + "grad_norm": 2.5190060138702393, + "learning_rate": 0.00011156726157519877, + "loss": 1.8318, + "step": 13381 + }, + { + "epoch": 0.4792379178827869, + "grad_norm": 1.801878809928894, + "learning_rate": 0.00011155574034408812, + "loss": 1.5164, + "step": 13382 + }, + { + "epoch": 0.4792737300123552, + "grad_norm": 1.659848928451538, + "learning_rate": 0.00011154421895750984, + "loss": 1.3737, + "step": 13383 + }, + { + "epoch": 0.47930954214192345, + "grad_norm": 1.8164340257644653, + "learning_rate": 0.00011153269741561905, + "loss": 1.5673, + "step": 13384 + }, + { + "epoch": 0.4793453542714918, + "grad_norm": 1.5826667547225952, + "learning_rate": 0.00011152117571857065, + "loss": 1.0404, + "step": 13385 + }, + { + "epoch": 0.47938116640106004, + "grad_norm": 1.6665927171707153, + "learning_rate": 0.0001115096538665197, + "loss": 1.4599, + "step": 13386 + }, + { + "epoch": 0.4794169785306283, + "grad_norm": 2.6693007946014404, + "learning_rate": 0.00011149813185962124, + "loss": 1.4425, + "step": 13387 + }, + { + "epoch": 0.4794527906601966, + "grad_norm": 1.5640629529953003, + "learning_rate": 0.00011148660969803019, + "loss": 2.0646, + "step": 13388 + }, + { + "epoch": 0.4794886027897649, + "grad_norm": 1.4011385440826416, + "learning_rate": 0.00011147508738190167, + "loss": 1.5649, + "step": 13389 + }, + { + "epoch": 0.4795244149193332, + "grad_norm": 1.667718529701233, + "learning_rate": 0.0001114635649113906, + "loss": 1.4943, + "step": 13390 + }, + { + "epoch": 0.47956022704890144, + "grad_norm": 1.893916368484497, + "learning_rate": 0.00011145204228665209, + "loss": 1.2751, + "step": 13391 + }, + { + "epoch": 0.47959603917846977, + "grad_norm": 1.5027039051055908, + "learning_rate": 0.00011144051950784111, + "loss": 1.395, + "step": 13392 + }, + { + "epoch": 0.47963185130803804, + "grad_norm": 1.4640353918075562, + "learning_rate": 0.00011142899657511272, + "loss": 1.5298, + "step": 13393 + }, + { + "epoch": 0.4796676634376063, + "grad_norm": 1.6055121421813965, + "learning_rate": 0.00011141747348862191, + "loss": 1.5649, + "step": 13394 + }, + { + "epoch": 0.4797034755671746, + "grad_norm": 1.6210955381393433, + "learning_rate": 0.00011140595024852369, + "loss": 1.6393, + "step": 13395 + }, + { + "epoch": 0.4797392876967429, + "grad_norm": 1.5539766550064087, + "learning_rate": 0.00011139442685497317, + "loss": 1.6614, + "step": 13396 + }, + { + "epoch": 0.47977509982631117, + "grad_norm": 2.0545995235443115, + "learning_rate": 0.0001113829033081253, + "loss": 1.3625, + "step": 13397 + }, + { + "epoch": 0.47981091195587944, + "grad_norm": 1.3574188947677612, + "learning_rate": 0.00011137137960813517, + "loss": 1.34, + "step": 13398 + }, + { + "epoch": 0.47984672408544776, + "grad_norm": 1.6591427326202393, + "learning_rate": 0.00011135985575515778, + "loss": 1.6447, + "step": 13399 + }, + { + "epoch": 0.47988253621501603, + "grad_norm": 1.2985018491744995, + "learning_rate": 0.0001113483317493482, + "loss": 1.4672, + "step": 13400 + }, + { + "epoch": 0.4799183483445843, + "grad_norm": 1.8751863241195679, + "learning_rate": 0.00011133680759086145, + "loss": 1.725, + "step": 13401 + }, + { + "epoch": 0.47995416047415257, + "grad_norm": 1.4524515867233276, + "learning_rate": 0.00011132528327985256, + "loss": 1.6149, + "step": 13402 + }, + { + "epoch": 0.4799899726037209, + "grad_norm": 1.3275116682052612, + "learning_rate": 0.00011131375881647664, + "loss": 1.1616, + "step": 13403 + }, + { + "epoch": 0.48002578473328916, + "grad_norm": 1.6403449773788452, + "learning_rate": 0.00011130223420088864, + "loss": 1.2983, + "step": 13404 + }, + { + "epoch": 0.48006159686285743, + "grad_norm": 1.282675862312317, + "learning_rate": 0.00011129070943324366, + "loss": 1.3949, + "step": 13405 + }, + { + "epoch": 0.48009740899242576, + "grad_norm": 1.388199806213379, + "learning_rate": 0.00011127918451369676, + "loss": 1.4264, + "step": 13406 + }, + { + "epoch": 0.480133221121994, + "grad_norm": 1.4312509298324585, + "learning_rate": 0.00011126765944240298, + "loss": 1.4627, + "step": 13407 + }, + { + "epoch": 0.4801690332515623, + "grad_norm": 1.5060590505599976, + "learning_rate": 0.00011125613421951737, + "loss": 1.3497, + "step": 13408 + }, + { + "epoch": 0.48020484538113056, + "grad_norm": 1.3689676523208618, + "learning_rate": 0.00011124460884519503, + "loss": 1.4491, + "step": 13409 + }, + { + "epoch": 0.4802406575106989, + "grad_norm": 1.402147650718689, + "learning_rate": 0.00011123308331959093, + "loss": 1.5145, + "step": 13410 + }, + { + "epoch": 0.48027646964026716, + "grad_norm": 1.3701300621032715, + "learning_rate": 0.00011122155764286024, + "loss": 1.3315, + "step": 13411 + }, + { + "epoch": 0.4803122817698354, + "grad_norm": 1.9080400466918945, + "learning_rate": 0.00011121003181515792, + "loss": 1.461, + "step": 13412 + }, + { + "epoch": 0.48034809389940375, + "grad_norm": 2.7309305667877197, + "learning_rate": 0.00011119850583663908, + "loss": 1.8333, + "step": 13413 + }, + { + "epoch": 0.480383906028972, + "grad_norm": 2.27876615524292, + "learning_rate": 0.00011118697970745881, + "loss": 1.5516, + "step": 13414 + }, + { + "epoch": 0.4804197181585403, + "grad_norm": 1.6473275423049927, + "learning_rate": 0.00011117545342777215, + "loss": 1.3657, + "step": 13415 + }, + { + "epoch": 0.48045553028810856, + "grad_norm": 1.6574909687042236, + "learning_rate": 0.0001111639269977342, + "loss": 1.6839, + "step": 13416 + }, + { + "epoch": 0.4804913424176769, + "grad_norm": 1.214024305343628, + "learning_rate": 0.00011115240041749999, + "loss": 1.4461, + "step": 13417 + }, + { + "epoch": 0.48052715454724515, + "grad_norm": 2.024338960647583, + "learning_rate": 0.00011114087368722463, + "loss": 1.2144, + "step": 13418 + }, + { + "epoch": 0.4805629666768134, + "grad_norm": 1.9191267490386963, + "learning_rate": 0.00011112934680706317, + "loss": 1.5104, + "step": 13419 + }, + { + "epoch": 0.48059877880638174, + "grad_norm": 1.7122324705123901, + "learning_rate": 0.00011111781977717075, + "loss": 1.2507, + "step": 13420 + }, + { + "epoch": 0.48063459093595, + "grad_norm": 1.6362801790237427, + "learning_rate": 0.00011110629259770235, + "loss": 1.6835, + "step": 13421 + }, + { + "epoch": 0.4806704030655183, + "grad_norm": 1.6250897645950317, + "learning_rate": 0.00011109476526881313, + "loss": 1.4953, + "step": 13422 + }, + { + "epoch": 0.48070621519508655, + "grad_norm": 1.9805313348770142, + "learning_rate": 0.00011108323779065814, + "loss": 1.5814, + "step": 13423 + }, + { + "epoch": 0.4807420273246549, + "grad_norm": 1.6796422004699707, + "learning_rate": 0.00011107171016339251, + "loss": 1.6619, + "step": 13424 + }, + { + "epoch": 0.48077783945422314, + "grad_norm": 1.6529179811477661, + "learning_rate": 0.00011106018238717128, + "loss": 1.4423, + "step": 13425 + }, + { + "epoch": 0.4808136515837914, + "grad_norm": 1.6704829931259155, + "learning_rate": 0.00011104865446214957, + "loss": 1.3773, + "step": 13426 + }, + { + "epoch": 0.48084946371335974, + "grad_norm": 1.3127925395965576, + "learning_rate": 0.00011103712638848244, + "loss": 1.5127, + "step": 13427 + }, + { + "epoch": 0.480885275842928, + "grad_norm": 1.6566414833068848, + "learning_rate": 0.00011102559816632507, + "loss": 1.6901, + "step": 13428 + }, + { + "epoch": 0.4809210879724963, + "grad_norm": 1.6600404977798462, + "learning_rate": 0.00011101406979583246, + "loss": 1.3477, + "step": 13429 + }, + { + "epoch": 0.48095690010206454, + "grad_norm": 2.285459518432617, + "learning_rate": 0.00011100254127715975, + "loss": 1.3787, + "step": 13430 + }, + { + "epoch": 0.48099271223163287, + "grad_norm": 1.7669966220855713, + "learning_rate": 0.00011099101261046205, + "loss": 1.4803, + "step": 13431 + }, + { + "epoch": 0.48102852436120114, + "grad_norm": 1.491285800933838, + "learning_rate": 0.00011097948379589444, + "loss": 1.4013, + "step": 13432 + }, + { + "epoch": 0.4810643364907694, + "grad_norm": 1.9096039533615112, + "learning_rate": 0.00011096795483361205, + "loss": 1.3939, + "step": 13433 + }, + { + "epoch": 0.48110014862033773, + "grad_norm": 1.4066028594970703, + "learning_rate": 0.00011095642572376996, + "loss": 1.4652, + "step": 13434 + }, + { + "epoch": 0.481135960749906, + "grad_norm": 1.5006831884384155, + "learning_rate": 0.0001109448964665233, + "loss": 1.3876, + "step": 13435 + }, + { + "epoch": 0.48117177287947427, + "grad_norm": 1.5165512561798096, + "learning_rate": 0.00011093336706202717, + "loss": 1.696, + "step": 13436 + }, + { + "epoch": 0.48120758500904254, + "grad_norm": 1.273987889289856, + "learning_rate": 0.00011092183751043672, + "loss": 1.608, + "step": 13437 + }, + { + "epoch": 0.48124339713861086, + "grad_norm": 1.6874688863754272, + "learning_rate": 0.000110910307811907, + "loss": 1.301, + "step": 13438 + }, + { + "epoch": 0.48127920926817913, + "grad_norm": 1.5528160333633423, + "learning_rate": 0.00011089877796659319, + "loss": 1.3853, + "step": 13439 + }, + { + "epoch": 0.4813150213977474, + "grad_norm": 1.5378245115280151, + "learning_rate": 0.00011088724797465036, + "loss": 1.5139, + "step": 13440 + }, + { + "epoch": 0.4813508335273157, + "grad_norm": 2.0010392665863037, + "learning_rate": 0.00011087571783623365, + "loss": 1.5756, + "step": 13441 + }, + { + "epoch": 0.481386645656884, + "grad_norm": 1.7677205801010132, + "learning_rate": 0.0001108641875514982, + "loss": 1.336, + "step": 13442 + }, + { + "epoch": 0.48142245778645226, + "grad_norm": 2.936492443084717, + "learning_rate": 0.00011085265712059909, + "loss": 1.787, + "step": 13443 + }, + { + "epoch": 0.48145826991602053, + "grad_norm": 1.785649061203003, + "learning_rate": 0.00011084112654369152, + "loss": 1.2747, + "step": 13444 + }, + { + "epoch": 0.48149408204558886, + "grad_norm": 1.6284798383712769, + "learning_rate": 0.00011082959582093055, + "loss": 1.3396, + "step": 13445 + }, + { + "epoch": 0.4815298941751571, + "grad_norm": 1.441011667251587, + "learning_rate": 0.00011081806495247136, + "loss": 1.6317, + "step": 13446 + }, + { + "epoch": 0.4815657063047254, + "grad_norm": 1.789101004600525, + "learning_rate": 0.00011080653393846905, + "loss": 1.6096, + "step": 13447 + }, + { + "epoch": 0.4816015184342937, + "grad_norm": 2.5847105979919434, + "learning_rate": 0.00011079500277907875, + "loss": 1.321, + "step": 13448 + }, + { + "epoch": 0.481637330563862, + "grad_norm": 1.5998847484588623, + "learning_rate": 0.00011078347147445563, + "loss": 1.5482, + "step": 13449 + }, + { + "epoch": 0.48167314269343026, + "grad_norm": 1.8561943769454956, + "learning_rate": 0.0001107719400247548, + "loss": 1.3777, + "step": 13450 + }, + { + "epoch": 0.4817089548229985, + "grad_norm": 2.0072786808013916, + "learning_rate": 0.00011076040843013141, + "loss": 1.149, + "step": 13451 + }, + { + "epoch": 0.48174476695256685, + "grad_norm": 1.9042658805847168, + "learning_rate": 0.00011074887669074058, + "loss": 1.3163, + "step": 13452 + }, + { + "epoch": 0.4817805790821351, + "grad_norm": 1.6006115674972534, + "learning_rate": 0.00011073734480673754, + "loss": 1.5273, + "step": 13453 + }, + { + "epoch": 0.4818163912117034, + "grad_norm": 1.7646973133087158, + "learning_rate": 0.00011072581277827732, + "loss": 1.5288, + "step": 13454 + }, + { + "epoch": 0.4818522033412717, + "grad_norm": 2.283583641052246, + "learning_rate": 0.00011071428060551517, + "loss": 1.7976, + "step": 13455 + }, + { + "epoch": 0.48188801547084, + "grad_norm": 1.4994382858276367, + "learning_rate": 0.00011070274828860618, + "loss": 1.3215, + "step": 13456 + }, + { + "epoch": 0.48192382760040825, + "grad_norm": 1.39043128490448, + "learning_rate": 0.0001106912158277055, + "loss": 1.3991, + "step": 13457 + }, + { + "epoch": 0.4819596397299765, + "grad_norm": 1.6843230724334717, + "learning_rate": 0.00011067968322296831, + "loss": 1.6234, + "step": 13458 + }, + { + "epoch": 0.48199545185954484, + "grad_norm": 1.7703015804290771, + "learning_rate": 0.00011066815047454975, + "loss": 1.517, + "step": 13459 + }, + { + "epoch": 0.4820312639891131, + "grad_norm": 1.695458173751831, + "learning_rate": 0.00011065661758260502, + "loss": 1.6138, + "step": 13460 + }, + { + "epoch": 0.4820670761186814, + "grad_norm": 1.6094825267791748, + "learning_rate": 0.00011064508454728921, + "loss": 1.3752, + "step": 13461 + }, + { + "epoch": 0.4821028882482497, + "grad_norm": 1.7632977962493896, + "learning_rate": 0.00011063355136875753, + "loss": 1.541, + "step": 13462 + }, + { + "epoch": 0.482138700377818, + "grad_norm": 1.7887697219848633, + "learning_rate": 0.00011062201804716512, + "loss": 1.5856, + "step": 13463 + }, + { + "epoch": 0.48217451250738624, + "grad_norm": 1.7318227291107178, + "learning_rate": 0.0001106104845826672, + "loss": 1.2289, + "step": 13464 + }, + { + "epoch": 0.4822103246369545, + "grad_norm": 1.6601182222366333, + "learning_rate": 0.00011059895097541888, + "loss": 1.875, + "step": 13465 + }, + { + "epoch": 0.48224613676652284, + "grad_norm": 1.9784289598464966, + "learning_rate": 0.00011058741722557533, + "loss": 1.3676, + "step": 13466 + }, + { + "epoch": 0.4822819488960911, + "grad_norm": 1.8831995725631714, + "learning_rate": 0.00011057588333329174, + "loss": 1.6114, + "step": 13467 + }, + { + "epoch": 0.4823177610256594, + "grad_norm": 1.6999905109405518, + "learning_rate": 0.00011056434929872325, + "loss": 1.6761, + "step": 13468 + }, + { + "epoch": 0.4823535731552277, + "grad_norm": 1.5982940196990967, + "learning_rate": 0.00011055281512202513, + "loss": 1.4517, + "step": 13469 + }, + { + "epoch": 0.48238938528479597, + "grad_norm": 2.8717644214630127, + "learning_rate": 0.00011054128080335246, + "loss": 1.6728, + "step": 13470 + }, + { + "epoch": 0.48242519741436424, + "grad_norm": 1.9627916812896729, + "learning_rate": 0.00011052974634286046, + "loss": 1.7458, + "step": 13471 + }, + { + "epoch": 0.4824610095439325, + "grad_norm": 1.698922872543335, + "learning_rate": 0.00011051821174070429, + "loss": 1.8436, + "step": 13472 + }, + { + "epoch": 0.48249682167350083, + "grad_norm": 1.2740743160247803, + "learning_rate": 0.0001105066769970392, + "loss": 1.5197, + "step": 13473 + }, + { + "epoch": 0.4825326338030691, + "grad_norm": 1.7765341997146606, + "learning_rate": 0.00011049514211202028, + "loss": 1.8556, + "step": 13474 + }, + { + "epoch": 0.48256844593263737, + "grad_norm": 1.9646724462509155, + "learning_rate": 0.00011048360708580279, + "loss": 1.5785, + "step": 13475 + }, + { + "epoch": 0.4826042580622057, + "grad_norm": 2.2040209770202637, + "learning_rate": 0.00011047207191854185, + "loss": 1.5735, + "step": 13476 + }, + { + "epoch": 0.48264007019177396, + "grad_norm": 1.3888829946517944, + "learning_rate": 0.00011046053661039273, + "loss": 1.4665, + "step": 13477 + }, + { + "epoch": 0.48267588232134223, + "grad_norm": 1.831881046295166, + "learning_rate": 0.00011044900116151053, + "loss": 1.6623, + "step": 13478 + }, + { + "epoch": 0.4827116944509105, + "grad_norm": 1.4265269041061401, + "learning_rate": 0.00011043746557205054, + "loss": 1.5404, + "step": 13479 + }, + { + "epoch": 0.4827475065804788, + "grad_norm": 1.828692078590393, + "learning_rate": 0.0001104259298421679, + "loss": 1.5943, + "step": 13480 + }, + { + "epoch": 0.4827833187100471, + "grad_norm": 1.9613791704177856, + "learning_rate": 0.00011041439397201785, + "loss": 1.6634, + "step": 13481 + }, + { + "epoch": 0.48281913083961536, + "grad_norm": 1.9883949756622314, + "learning_rate": 0.00011040285796175553, + "loss": 1.19, + "step": 13482 + }, + { + "epoch": 0.4828549429691837, + "grad_norm": 1.7061982154846191, + "learning_rate": 0.00011039132181153618, + "loss": 1.5244, + "step": 13483 + }, + { + "epoch": 0.48289075509875196, + "grad_norm": 1.7107230424880981, + "learning_rate": 0.00011037978552151502, + "loss": 1.4301, + "step": 13484 + }, + { + "epoch": 0.4829265672283202, + "grad_norm": 2.234708070755005, + "learning_rate": 0.0001103682490918472, + "loss": 1.8736, + "step": 13485 + }, + { + "epoch": 0.4829623793578885, + "grad_norm": 1.9554941654205322, + "learning_rate": 0.000110356712522688, + "loss": 1.6488, + "step": 13486 + }, + { + "epoch": 0.4829981914874568, + "grad_norm": 1.7168772220611572, + "learning_rate": 0.00011034517581419255, + "loss": 1.7765, + "step": 13487 + }, + { + "epoch": 0.4830340036170251, + "grad_norm": 1.7673360109329224, + "learning_rate": 0.00011033363896651613, + "loss": 1.3514, + "step": 13488 + }, + { + "epoch": 0.48306981574659336, + "grad_norm": 1.7775872945785522, + "learning_rate": 0.00011032210197981392, + "loss": 1.2557, + "step": 13489 + }, + { + "epoch": 0.4831056278761617, + "grad_norm": 1.549501895904541, + "learning_rate": 0.00011031056485424116, + "loss": 1.4594, + "step": 13490 + }, + { + "epoch": 0.48314144000572995, + "grad_norm": 1.6182304620742798, + "learning_rate": 0.00011029902758995304, + "loss": 1.4751, + "step": 13491 + }, + { + "epoch": 0.4831772521352982, + "grad_norm": 1.4240484237670898, + "learning_rate": 0.00011028749018710478, + "loss": 1.3701, + "step": 13492 + }, + { + "epoch": 0.4832130642648665, + "grad_norm": 1.6473289728164673, + "learning_rate": 0.00011027595264585162, + "loss": 1.2206, + "step": 13493 + }, + { + "epoch": 0.4832488763944348, + "grad_norm": 2.2116892337799072, + "learning_rate": 0.00011026441496634874, + "loss": 1.6348, + "step": 13494 + }, + { + "epoch": 0.4832846885240031, + "grad_norm": 1.27376127243042, + "learning_rate": 0.00011025287714875143, + "loss": 1.3734, + "step": 13495 + }, + { + "epoch": 0.48332050065357135, + "grad_norm": 1.800029993057251, + "learning_rate": 0.00011024133919321486, + "loss": 1.5834, + "step": 13496 + }, + { + "epoch": 0.4833563127831397, + "grad_norm": 1.679153561592102, + "learning_rate": 0.00011022980109989431, + "loss": 1.4844, + "step": 13497 + }, + { + "epoch": 0.48339212491270794, + "grad_norm": 2.0695579051971436, + "learning_rate": 0.00011021826286894496, + "loss": 1.7796, + "step": 13498 + }, + { + "epoch": 0.4834279370422762, + "grad_norm": 1.740222692489624, + "learning_rate": 0.0001102067245005221, + "loss": 1.4109, + "step": 13499 + }, + { + "epoch": 0.4834637491718445, + "grad_norm": 1.6901319026947021, + "learning_rate": 0.0001101951859947809, + "loss": 1.8769, + "step": 13500 + }, + { + "epoch": 0.4834995613014128, + "grad_norm": 1.6777935028076172, + "learning_rate": 0.00011018364735187661, + "loss": 1.4192, + "step": 13501 + }, + { + "epoch": 0.4835353734309811, + "grad_norm": 1.7092137336730957, + "learning_rate": 0.00011017210857196449, + "loss": 1.6799, + "step": 13502 + }, + { + "epoch": 0.48357118556054934, + "grad_norm": 1.8037439584732056, + "learning_rate": 0.00011016056965519979, + "loss": 1.5957, + "step": 13503 + }, + { + "epoch": 0.48360699769011767, + "grad_norm": 1.3701008558273315, + "learning_rate": 0.00011014903060173772, + "loss": 1.6095, + "step": 13504 + }, + { + "epoch": 0.48364280981968594, + "grad_norm": 1.4158246517181396, + "learning_rate": 0.00011013749141173351, + "loss": 1.3914, + "step": 13505 + }, + { + "epoch": 0.4836786219492542, + "grad_norm": 1.9191498756408691, + "learning_rate": 0.00011012595208534247, + "loss": 1.9305, + "step": 13506 + }, + { + "epoch": 0.4837144340788225, + "grad_norm": 2.0347769260406494, + "learning_rate": 0.00011011441262271975, + "loss": 1.3077, + "step": 13507 + }, + { + "epoch": 0.4837502462083908, + "grad_norm": 1.7553139925003052, + "learning_rate": 0.00011010287302402073, + "loss": 1.2249, + "step": 13508 + }, + { + "epoch": 0.48378605833795907, + "grad_norm": 1.331820011138916, + "learning_rate": 0.00011009133328940053, + "loss": 1.5748, + "step": 13509 + }, + { + "epoch": 0.48382187046752734, + "grad_norm": 1.791427731513977, + "learning_rate": 0.00011007979341901446, + "loss": 1.4737, + "step": 13510 + }, + { + "epoch": 0.48385768259709566, + "grad_norm": 1.544029951095581, + "learning_rate": 0.0001100682534130178, + "loss": 1.4346, + "step": 13511 + }, + { + "epoch": 0.48389349472666393, + "grad_norm": 1.728143334388733, + "learning_rate": 0.00011005671327156574, + "loss": 1.5266, + "step": 13512 + }, + { + "epoch": 0.4839293068562322, + "grad_norm": 1.5404763221740723, + "learning_rate": 0.00011004517299481363, + "loss": 1.4122, + "step": 13513 + }, + { + "epoch": 0.48396511898580047, + "grad_norm": 1.9863710403442383, + "learning_rate": 0.00011003363258291664, + "loss": 1.6186, + "step": 13514 + }, + { + "epoch": 0.4840009311153688, + "grad_norm": 2.0675201416015625, + "learning_rate": 0.00011002209203603007, + "loss": 1.4077, + "step": 13515 + }, + { + "epoch": 0.48403674324493706, + "grad_norm": 1.5609140396118164, + "learning_rate": 0.00011001055135430916, + "loss": 1.3322, + "step": 13516 + }, + { + "epoch": 0.48407255537450533, + "grad_norm": 1.3592745065689087, + "learning_rate": 0.00010999901053790924, + "loss": 1.5969, + "step": 13517 + }, + { + "epoch": 0.48410836750407366, + "grad_norm": 2.3698625564575195, + "learning_rate": 0.0001099874695869855, + "loss": 1.474, + "step": 13518 + }, + { + "epoch": 0.4841441796336419, + "grad_norm": 2.192758798599243, + "learning_rate": 0.00010997592850169325, + "loss": 1.759, + "step": 13519 + }, + { + "epoch": 0.4841799917632102, + "grad_norm": 1.3578786849975586, + "learning_rate": 0.00010996438728218772, + "loss": 1.4524, + "step": 13520 + }, + { + "epoch": 0.48421580389277846, + "grad_norm": 1.4247230291366577, + "learning_rate": 0.00010995284592862425, + "loss": 1.782, + "step": 13521 + }, + { + "epoch": 0.4842516160223468, + "grad_norm": 2.091456890106201, + "learning_rate": 0.00010994130444115804, + "loss": 1.5596, + "step": 13522 + }, + { + "epoch": 0.48428742815191506, + "grad_norm": 2.0627150535583496, + "learning_rate": 0.00010992976281994443, + "loss": 1.3551, + "step": 13523 + }, + { + "epoch": 0.4843232402814833, + "grad_norm": 1.4531793594360352, + "learning_rate": 0.00010991822106513867, + "loss": 1.4569, + "step": 13524 + }, + { + "epoch": 0.48435905241105165, + "grad_norm": 2.3870937824249268, + "learning_rate": 0.00010990667917689603, + "loss": 1.7541, + "step": 13525 + }, + { + "epoch": 0.4843948645406199, + "grad_norm": 1.8330572843551636, + "learning_rate": 0.00010989513715537184, + "loss": 1.6933, + "step": 13526 + }, + { + "epoch": 0.4844306766701882, + "grad_norm": 1.6797586679458618, + "learning_rate": 0.00010988359500072128, + "loss": 1.3094, + "step": 13527 + }, + { + "epoch": 0.48446648879975646, + "grad_norm": 1.7150591611862183, + "learning_rate": 0.00010987205271309972, + "loss": 1.5811, + "step": 13528 + }, + { + "epoch": 0.4845023009293248, + "grad_norm": 1.74473237991333, + "learning_rate": 0.00010986051029266242, + "loss": 1.6228, + "step": 13529 + }, + { + "epoch": 0.48453811305889305, + "grad_norm": 2.2052595615386963, + "learning_rate": 0.0001098489677395647, + "loss": 1.8501, + "step": 13530 + }, + { + "epoch": 0.4845739251884613, + "grad_norm": 1.872136116027832, + "learning_rate": 0.0001098374250539618, + "loss": 1.5437, + "step": 13531 + }, + { + "epoch": 0.48460973731802964, + "grad_norm": 2.1946043968200684, + "learning_rate": 0.00010982588223600905, + "loss": 1.2338, + "step": 13532 + }, + { + "epoch": 0.4846455494475979, + "grad_norm": 1.7079534530639648, + "learning_rate": 0.00010981433928586168, + "loss": 1.8226, + "step": 13533 + }, + { + "epoch": 0.4846813615771662, + "grad_norm": 1.761330008506775, + "learning_rate": 0.00010980279620367511, + "loss": 1.4908, + "step": 13534 + }, + { + "epoch": 0.48471717370673445, + "grad_norm": 1.1783751249313354, + "learning_rate": 0.00010979125298960453, + "loss": 1.5007, + "step": 13535 + }, + { + "epoch": 0.4847529858363028, + "grad_norm": 1.9154187440872192, + "learning_rate": 0.00010977970964380526, + "loss": 1.528, + "step": 13536 + }, + { + "epoch": 0.48478879796587104, + "grad_norm": 1.502217411994934, + "learning_rate": 0.00010976816616643262, + "loss": 1.37, + "step": 13537 + }, + { + "epoch": 0.4848246100954393, + "grad_norm": 1.479232907295227, + "learning_rate": 0.0001097566225576419, + "loss": 1.4488, + "step": 13538 + }, + { + "epoch": 0.48486042222500764, + "grad_norm": 1.262938380241394, + "learning_rate": 0.00010974507881758842, + "loss": 1.6514, + "step": 13539 + }, + { + "epoch": 0.4848962343545759, + "grad_norm": 1.8512773513793945, + "learning_rate": 0.00010973353494642745, + "loss": 1.5816, + "step": 13540 + }, + { + "epoch": 0.4849320464841442, + "grad_norm": 1.9465118646621704, + "learning_rate": 0.00010972199094431435, + "loss": 1.7751, + "step": 13541 + }, + { + "epoch": 0.48496785861371244, + "grad_norm": 1.3644713163375854, + "learning_rate": 0.00010971044681140437, + "loss": 1.7066, + "step": 13542 + }, + { + "epoch": 0.48500367074328077, + "grad_norm": 1.4403965473175049, + "learning_rate": 0.0001096989025478529, + "loss": 1.6519, + "step": 13543 + }, + { + "epoch": 0.48503948287284904, + "grad_norm": 1.8270875215530396, + "learning_rate": 0.0001096873581538152, + "loss": 1.712, + "step": 13544 + }, + { + "epoch": 0.4850752950024173, + "grad_norm": 1.5152555704116821, + "learning_rate": 0.00010967581362944654, + "loss": 1.6795, + "step": 13545 + }, + { + "epoch": 0.48511110713198563, + "grad_norm": 1.2786191701889038, + "learning_rate": 0.00010966426897490234, + "loss": 1.5847, + "step": 13546 + }, + { + "epoch": 0.4851469192615539, + "grad_norm": 1.5491951704025269, + "learning_rate": 0.00010965272419033782, + "loss": 1.3634, + "step": 13547 + }, + { + "epoch": 0.48518273139112217, + "grad_norm": 2.059805393218994, + "learning_rate": 0.0001096411792759084, + "loss": 1.2069, + "step": 13548 + }, + { + "epoch": 0.48521854352069044, + "grad_norm": 1.5590767860412598, + "learning_rate": 0.0001096296342317693, + "loss": 1.3519, + "step": 13549 + }, + { + "epoch": 0.48525435565025876, + "grad_norm": 1.7230417728424072, + "learning_rate": 0.00010961808905807593, + "loss": 1.4459, + "step": 13550 + }, + { + "epoch": 0.48529016777982703, + "grad_norm": 1.9240474700927734, + "learning_rate": 0.00010960654375498357, + "loss": 1.6497, + "step": 13551 + }, + { + "epoch": 0.4853259799093953, + "grad_norm": 1.6086010932922363, + "learning_rate": 0.00010959499832264754, + "loss": 1.5178, + "step": 13552 + }, + { + "epoch": 0.4853617920389636, + "grad_norm": 1.5947424173355103, + "learning_rate": 0.00010958345276122322, + "loss": 1.203, + "step": 13553 + }, + { + "epoch": 0.4853976041685319, + "grad_norm": 1.5040249824523926, + "learning_rate": 0.00010957190707086586, + "loss": 1.3151, + "step": 13554 + }, + { + "epoch": 0.48543341629810016, + "grad_norm": 1.2605984210968018, + "learning_rate": 0.00010956036125173088, + "loss": 1.5936, + "step": 13555 + }, + { + "epoch": 0.48546922842766843, + "grad_norm": 1.7355071306228638, + "learning_rate": 0.00010954881530397352, + "loss": 1.5246, + "step": 13556 + }, + { + "epoch": 0.48550504055723676, + "grad_norm": 1.455668568611145, + "learning_rate": 0.0001095372692277492, + "loss": 1.3897, + "step": 13557 + }, + { + "epoch": 0.485540852686805, + "grad_norm": 1.6682887077331543, + "learning_rate": 0.00010952572302321322, + "loss": 1.4213, + "step": 13558 + }, + { + "epoch": 0.4855766648163733, + "grad_norm": 1.7977643013000488, + "learning_rate": 0.00010951417669052093, + "loss": 1.573, + "step": 13559 + }, + { + "epoch": 0.4856124769459416, + "grad_norm": 1.5340325832366943, + "learning_rate": 0.00010950263022982766, + "loss": 1.6457, + "step": 13560 + }, + { + "epoch": 0.4856482890755099, + "grad_norm": 2.6738224029541016, + "learning_rate": 0.0001094910836412888, + "loss": 1.4636, + "step": 13561 + }, + { + "epoch": 0.48568410120507816, + "grad_norm": 1.7948660850524902, + "learning_rate": 0.00010947953692505959, + "loss": 1.7762, + "step": 13562 + }, + { + "epoch": 0.4857199133346464, + "grad_norm": 1.6173733472824097, + "learning_rate": 0.00010946799008129547, + "loss": 1.169, + "step": 13563 + }, + { + "epoch": 0.48575572546421475, + "grad_norm": 2.0082993507385254, + "learning_rate": 0.00010945644311015172, + "loss": 1.8853, + "step": 13564 + }, + { + "epoch": 0.485791537593783, + "grad_norm": 2.996189832687378, + "learning_rate": 0.00010944489601178373, + "loss": 1.7372, + "step": 13565 + }, + { + "epoch": 0.4858273497233513, + "grad_norm": 1.859113097190857, + "learning_rate": 0.0001094333487863469, + "loss": 1.6007, + "step": 13566 + }, + { + "epoch": 0.48586316185291956, + "grad_norm": 1.7060550451278687, + "learning_rate": 0.00010942180143399647, + "loss": 1.445, + "step": 13567 + }, + { + "epoch": 0.4858989739824879, + "grad_norm": 2.118669271469116, + "learning_rate": 0.0001094102539548879, + "loss": 1.4383, + "step": 13568 + }, + { + "epoch": 0.48593478611205615, + "grad_norm": 2.2744529247283936, + "learning_rate": 0.00010939870634917647, + "loss": 1.8303, + "step": 13569 + }, + { + "epoch": 0.4859705982416244, + "grad_norm": 1.9283888339996338, + "learning_rate": 0.00010938715861701762, + "loss": 1.7504, + "step": 13570 + }, + { + "epoch": 0.48600641037119274, + "grad_norm": 1.3578182458877563, + "learning_rate": 0.00010937561075856662, + "loss": 1.5591, + "step": 13571 + }, + { + "epoch": 0.486042222500761, + "grad_norm": 1.8386645317077637, + "learning_rate": 0.00010936406277397888, + "loss": 1.5983, + "step": 13572 + }, + { + "epoch": 0.4860780346303293, + "grad_norm": 1.6330554485321045, + "learning_rate": 0.00010935251466340973, + "loss": 1.4696, + "step": 13573 + }, + { + "epoch": 0.48611384675989755, + "grad_norm": 1.2304561138153076, + "learning_rate": 0.0001093409664270146, + "loss": 1.3601, + "step": 13574 + }, + { + "epoch": 0.4861496588894659, + "grad_norm": 1.8582013845443726, + "learning_rate": 0.00010932941806494876, + "loss": 1.6357, + "step": 13575 + }, + { + "epoch": 0.48618547101903414, + "grad_norm": 1.4908101558685303, + "learning_rate": 0.0001093178695773677, + "loss": 1.2766, + "step": 13576 + }, + { + "epoch": 0.4862212831486024, + "grad_norm": 1.5224658250808716, + "learning_rate": 0.00010930632096442665, + "loss": 1.6613, + "step": 13577 + }, + { + "epoch": 0.48625709527817074, + "grad_norm": 1.5885058641433716, + "learning_rate": 0.00010929477222628113, + "loss": 1.4425, + "step": 13578 + }, + { + "epoch": 0.486292907407739, + "grad_norm": 1.9274603128433228, + "learning_rate": 0.00010928322336308641, + "loss": 1.5363, + "step": 13579 + }, + { + "epoch": 0.4863287195373073, + "grad_norm": 1.4560315608978271, + "learning_rate": 0.00010927167437499788, + "loss": 1.6627, + "step": 13580 + }, + { + "epoch": 0.48636453166687554, + "grad_norm": 1.4085999727249146, + "learning_rate": 0.00010926012526217095, + "loss": 1.3984, + "step": 13581 + }, + { + "epoch": 0.48640034379644387, + "grad_norm": 1.5395698547363281, + "learning_rate": 0.00010924857602476095, + "loss": 1.3167, + "step": 13582 + }, + { + "epoch": 0.48643615592601214, + "grad_norm": 1.8653262853622437, + "learning_rate": 0.00010923702666292333, + "loss": 1.6573, + "step": 13583 + }, + { + "epoch": 0.4864719680555804, + "grad_norm": 2.26827073097229, + "learning_rate": 0.0001092254771768134, + "loss": 1.6199, + "step": 13584 + }, + { + "epoch": 0.48650778018514873, + "grad_norm": 1.4036325216293335, + "learning_rate": 0.0001092139275665866, + "loss": 1.6415, + "step": 13585 + }, + { + "epoch": 0.486543592314717, + "grad_norm": 1.6470149755477905, + "learning_rate": 0.00010920237783239824, + "loss": 1.3443, + "step": 13586 + }, + { + "epoch": 0.48657940444428527, + "grad_norm": 1.6262797117233276, + "learning_rate": 0.0001091908279744038, + "loss": 1.4092, + "step": 13587 + }, + { + "epoch": 0.48661521657385354, + "grad_norm": 1.7407779693603516, + "learning_rate": 0.00010917927799275865, + "loss": 1.4334, + "step": 13588 + }, + { + "epoch": 0.48665102870342186, + "grad_norm": 2.5387730598449707, + "learning_rate": 0.00010916772788761809, + "loss": 1.5939, + "step": 13589 + }, + { + "epoch": 0.48668684083299013, + "grad_norm": 2.3920860290527344, + "learning_rate": 0.00010915617765913761, + "loss": 1.4822, + "step": 13590 + }, + { + "epoch": 0.4867226529625584, + "grad_norm": 2.5543646812438965, + "learning_rate": 0.00010914462730747257, + "loss": 1.477, + "step": 13591 + }, + { + "epoch": 0.4867584650921267, + "grad_norm": 1.4672547578811646, + "learning_rate": 0.00010913307683277838, + "loss": 1.4579, + "step": 13592 + }, + { + "epoch": 0.486794277221695, + "grad_norm": 2.104875326156616, + "learning_rate": 0.0001091215262352104, + "loss": 1.5106, + "step": 13593 + }, + { + "epoch": 0.48683008935126326, + "grad_norm": 1.5510029792785645, + "learning_rate": 0.00010910997551492405, + "loss": 1.319, + "step": 13594 + }, + { + "epoch": 0.48686590148083153, + "grad_norm": 1.9528536796569824, + "learning_rate": 0.00010909842467207472, + "loss": 1.5796, + "step": 13595 + }, + { + "epoch": 0.48690171361039986, + "grad_norm": 3.5056748390197754, + "learning_rate": 0.00010908687370681785, + "loss": 1.6121, + "step": 13596 + }, + { + "epoch": 0.4869375257399681, + "grad_norm": 2.442275047302246, + "learning_rate": 0.00010907532261930881, + "loss": 1.5149, + "step": 13597 + }, + { + "epoch": 0.4869733378695364, + "grad_norm": 2.3697030544281006, + "learning_rate": 0.00010906377140970301, + "loss": 1.6489, + "step": 13598 + }, + { + "epoch": 0.4870091499991047, + "grad_norm": 2.148909568786621, + "learning_rate": 0.00010905222007815585, + "loss": 1.3923, + "step": 13599 + }, + { + "epoch": 0.487044962128673, + "grad_norm": 2.649132490158081, + "learning_rate": 0.00010904066862482274, + "loss": 1.5161, + "step": 13600 + }, + { + "epoch": 0.48708077425824126, + "grad_norm": 1.370690941810608, + "learning_rate": 0.00010902911704985912, + "loss": 1.5841, + "step": 13601 + }, + { + "epoch": 0.4871165863878095, + "grad_norm": 2.150310516357422, + "learning_rate": 0.00010901756535342033, + "loss": 1.7005, + "step": 13602 + }, + { + "epoch": 0.48715239851737785, + "grad_norm": 1.4429428577423096, + "learning_rate": 0.00010900601353566188, + "loss": 1.5266, + "step": 13603 + }, + { + "epoch": 0.4871882106469461, + "grad_norm": 1.5756672620773315, + "learning_rate": 0.0001089944615967391, + "loss": 1.3184, + "step": 13604 + }, + { + "epoch": 0.4872240227765144, + "grad_norm": 1.7169619798660278, + "learning_rate": 0.0001089829095368075, + "loss": 1.4519, + "step": 13605 + }, + { + "epoch": 0.4872598349060827, + "grad_norm": 1.8188672065734863, + "learning_rate": 0.00010897135735602238, + "loss": 1.3889, + "step": 13606 + }, + { + "epoch": 0.487295647035651, + "grad_norm": 1.3726422786712646, + "learning_rate": 0.00010895980505453924, + "loss": 1.2956, + "step": 13607 + }, + { + "epoch": 0.48733145916521925, + "grad_norm": 1.986498236656189, + "learning_rate": 0.00010894825263251345, + "loss": 1.7822, + "step": 13608 + }, + { + "epoch": 0.4873672712947875, + "grad_norm": 1.6833266019821167, + "learning_rate": 0.00010893670009010049, + "loss": 1.4204, + "step": 13609 + }, + { + "epoch": 0.48740308342435584, + "grad_norm": 1.488139271736145, + "learning_rate": 0.00010892514742745576, + "loss": 1.1151, + "step": 13610 + }, + { + "epoch": 0.4874388955539241, + "grad_norm": 1.7593934535980225, + "learning_rate": 0.00010891359464473468, + "loss": 1.5433, + "step": 13611 + }, + { + "epoch": 0.4874747076834924, + "grad_norm": 1.4429570436477661, + "learning_rate": 0.00010890204174209269, + "loss": 1.4472, + "step": 13612 + }, + { + "epoch": 0.4875105198130607, + "grad_norm": 1.4732005596160889, + "learning_rate": 0.00010889048871968517, + "loss": 1.4272, + "step": 13613 + }, + { + "epoch": 0.487546331942629, + "grad_norm": 2.193288564682007, + "learning_rate": 0.00010887893557766766, + "loss": 1.7031, + "step": 13614 + }, + { + "epoch": 0.48758214407219724, + "grad_norm": 1.877280592918396, + "learning_rate": 0.00010886738231619549, + "loss": 1.6894, + "step": 13615 + }, + { + "epoch": 0.4876179562017655, + "grad_norm": 1.6814889907836914, + "learning_rate": 0.00010885582893542411, + "loss": 1.5371, + "step": 13616 + }, + { + "epoch": 0.48765376833133384, + "grad_norm": 1.7409570217132568, + "learning_rate": 0.00010884427543550899, + "loss": 1.8072, + "step": 13617 + }, + { + "epoch": 0.4876895804609021, + "grad_norm": 1.5650906562805176, + "learning_rate": 0.00010883272181660558, + "loss": 1.2065, + "step": 13618 + }, + { + "epoch": 0.4877253925904704, + "grad_norm": 1.6816296577453613, + "learning_rate": 0.00010882116807886924, + "loss": 1.491, + "step": 13619 + }, + { + "epoch": 0.4877612047200387, + "grad_norm": 1.757469892501831, + "learning_rate": 0.0001088096142224555, + "loss": 1.5318, + "step": 13620 + }, + { + "epoch": 0.48779701684960697, + "grad_norm": 1.3962442874908447, + "learning_rate": 0.00010879806024751975, + "loss": 1.3877, + "step": 13621 + }, + { + "epoch": 0.48783282897917524, + "grad_norm": 2.594102144241333, + "learning_rate": 0.00010878650615421744, + "loss": 1.4608, + "step": 13622 + }, + { + "epoch": 0.4878686411087435, + "grad_norm": 1.906245231628418, + "learning_rate": 0.00010877495194270407, + "loss": 1.5072, + "step": 13623 + }, + { + "epoch": 0.48790445323831183, + "grad_norm": 1.6580541133880615, + "learning_rate": 0.00010876339761313499, + "loss": 1.4024, + "step": 13624 + }, + { + "epoch": 0.4879402653678801, + "grad_norm": 1.6919373273849487, + "learning_rate": 0.00010875184316566571, + "loss": 1.5576, + "step": 13625 + }, + { + "epoch": 0.48797607749744837, + "grad_norm": 1.9276906251907349, + "learning_rate": 0.00010874028860045166, + "loss": 1.4877, + "step": 13626 + }, + { + "epoch": 0.4880118896270167, + "grad_norm": 1.4505412578582764, + "learning_rate": 0.00010872873391764833, + "loss": 1.4795, + "step": 13627 + }, + { + "epoch": 0.48804770175658496, + "grad_norm": 1.4646049737930298, + "learning_rate": 0.00010871717911741113, + "loss": 1.3365, + "step": 13628 + }, + { + "epoch": 0.48808351388615323, + "grad_norm": 1.593316674232483, + "learning_rate": 0.00010870562419989552, + "loss": 1.4228, + "step": 13629 + }, + { + "epoch": 0.4881193260157215, + "grad_norm": 1.555849552154541, + "learning_rate": 0.00010869406916525698, + "loss": 1.5343, + "step": 13630 + }, + { + "epoch": 0.4881551381452898, + "grad_norm": 2.0024313926696777, + "learning_rate": 0.00010868251401365095, + "loss": 1.4696, + "step": 13631 + }, + { + "epoch": 0.4881909502748581, + "grad_norm": 1.810181736946106, + "learning_rate": 0.0001086709587452329, + "loss": 1.3037, + "step": 13632 + }, + { + "epoch": 0.48822676240442636, + "grad_norm": 2.0608158111572266, + "learning_rate": 0.00010865940336015828, + "loss": 1.5016, + "step": 13633 + }, + { + "epoch": 0.4882625745339947, + "grad_norm": 1.37624192237854, + "learning_rate": 0.00010864784785858256, + "loss": 1.296, + "step": 13634 + }, + { + "epoch": 0.48829838666356296, + "grad_norm": 1.3577290773391724, + "learning_rate": 0.00010863629224066116, + "loss": 1.3755, + "step": 13635 + }, + { + "epoch": 0.4883341987931312, + "grad_norm": 1.884279727935791, + "learning_rate": 0.00010862473650654965, + "loss": 1.6172, + "step": 13636 + }, + { + "epoch": 0.4883700109226995, + "grad_norm": 1.4926834106445312, + "learning_rate": 0.00010861318065640338, + "loss": 1.3022, + "step": 13637 + }, + { + "epoch": 0.4884058230522678, + "grad_norm": 3.462437629699707, + "learning_rate": 0.00010860162469037792, + "loss": 1.6107, + "step": 13638 + }, + { + "epoch": 0.4884416351818361, + "grad_norm": 2.0196216106414795, + "learning_rate": 0.00010859006860862865, + "loss": 1.5062, + "step": 13639 + }, + { + "epoch": 0.48847744731140436, + "grad_norm": 1.4165526628494263, + "learning_rate": 0.00010857851241131114, + "loss": 1.389, + "step": 13640 + }, + { + "epoch": 0.4885132594409727, + "grad_norm": 1.861929178237915, + "learning_rate": 0.0001085669560985808, + "loss": 1.6056, + "step": 13641 + }, + { + "epoch": 0.48854907157054095, + "grad_norm": 2.1622304916381836, + "learning_rate": 0.0001085553996705931, + "loss": 1.9087, + "step": 13642 + }, + { + "epoch": 0.4885848837001092, + "grad_norm": 1.8672003746032715, + "learning_rate": 0.00010854384312750354, + "loss": 1.3609, + "step": 13643 + }, + { + "epoch": 0.4886206958296775, + "grad_norm": 2.506395101547241, + "learning_rate": 0.00010853228646946758, + "loss": 1.2844, + "step": 13644 + }, + { + "epoch": 0.4886565079592458, + "grad_norm": 1.754264235496521, + "learning_rate": 0.00010852072969664073, + "loss": 1.551, + "step": 13645 + }, + { + "epoch": 0.4886923200888141, + "grad_norm": 1.666245698928833, + "learning_rate": 0.00010850917280917843, + "loss": 1.0954, + "step": 13646 + }, + { + "epoch": 0.48872813221838235, + "grad_norm": 1.6731163263320923, + "learning_rate": 0.0001084976158072362, + "loss": 1.7465, + "step": 13647 + }, + { + "epoch": 0.4887639443479507, + "grad_norm": 1.6194301843643188, + "learning_rate": 0.0001084860586909695, + "loss": 1.1302, + "step": 13648 + }, + { + "epoch": 0.48879975647751894, + "grad_norm": 1.5961239337921143, + "learning_rate": 0.00010847450146053386, + "loss": 1.3433, + "step": 13649 + }, + { + "epoch": 0.4888355686070872, + "grad_norm": 1.8925201892852783, + "learning_rate": 0.0001084629441160847, + "loss": 1.5822, + "step": 13650 + }, + { + "epoch": 0.4888713807366555, + "grad_norm": 1.876792550086975, + "learning_rate": 0.00010845138665777754, + "loss": 1.7623, + "step": 13651 + }, + { + "epoch": 0.4889071928662238, + "grad_norm": 1.422800898551941, + "learning_rate": 0.0001084398290857679, + "loss": 1.3395, + "step": 13652 + }, + { + "epoch": 0.4889430049957921, + "grad_norm": 2.462639808654785, + "learning_rate": 0.00010842827140021121, + "loss": 1.5845, + "step": 13653 + }, + { + "epoch": 0.48897881712536034, + "grad_norm": 1.4146637916564941, + "learning_rate": 0.00010841671360126304, + "loss": 1.4434, + "step": 13654 + }, + { + "epoch": 0.48901462925492867, + "grad_norm": 1.4680838584899902, + "learning_rate": 0.0001084051556890788, + "loss": 1.5377, + "step": 13655 + }, + { + "epoch": 0.48905044138449694, + "grad_norm": 1.9159295558929443, + "learning_rate": 0.0001083935976638141, + "loss": 1.5379, + "step": 13656 + }, + { + "epoch": 0.4890862535140652, + "grad_norm": 1.6047742366790771, + "learning_rate": 0.00010838203952562432, + "loss": 1.4923, + "step": 13657 + }, + { + "epoch": 0.4891220656436335, + "grad_norm": 1.7296887636184692, + "learning_rate": 0.00010837048127466505, + "loss": 1.2314, + "step": 13658 + }, + { + "epoch": 0.4891578777732018, + "grad_norm": 1.9508219957351685, + "learning_rate": 0.00010835892291109169, + "loss": 1.4367, + "step": 13659 + }, + { + "epoch": 0.48919368990277007, + "grad_norm": 1.9842190742492676, + "learning_rate": 0.00010834736443505986, + "loss": 1.613, + "step": 13660 + }, + { + "epoch": 0.48922950203233834, + "grad_norm": 1.6061060428619385, + "learning_rate": 0.00010833580584672496, + "loss": 1.2524, + "step": 13661 + }, + { + "epoch": 0.48926531416190666, + "grad_norm": 1.3053823709487915, + "learning_rate": 0.00010832424714624259, + "loss": 1.4031, + "step": 13662 + }, + { + "epoch": 0.48930112629147493, + "grad_norm": 1.4019272327423096, + "learning_rate": 0.00010831268833376817, + "loss": 1.3219, + "step": 13663 + }, + { + "epoch": 0.4893369384210432, + "grad_norm": 1.8044565916061401, + "learning_rate": 0.00010830112940945726, + "loss": 1.5856, + "step": 13664 + }, + { + "epoch": 0.48937275055061147, + "grad_norm": 2.211778163909912, + "learning_rate": 0.00010828957037346538, + "loss": 1.3458, + "step": 13665 + }, + { + "epoch": 0.4894085626801798, + "grad_norm": 1.8323140144348145, + "learning_rate": 0.00010827801122594802, + "loss": 1.2703, + "step": 13666 + }, + { + "epoch": 0.48944437480974806, + "grad_norm": 1.6370885372161865, + "learning_rate": 0.00010826645196706074, + "loss": 1.4509, + "step": 13667 + }, + { + "epoch": 0.48948018693931633, + "grad_norm": 1.6698061227798462, + "learning_rate": 0.00010825489259695894, + "loss": 1.5467, + "step": 13668 + }, + { + "epoch": 0.48951599906888466, + "grad_norm": 1.5696372985839844, + "learning_rate": 0.00010824333311579824, + "loss": 1.6898, + "step": 13669 + }, + { + "epoch": 0.4895518111984529, + "grad_norm": 2.0624501705169678, + "learning_rate": 0.00010823177352373412, + "loss": 1.5877, + "step": 13670 + }, + { + "epoch": 0.4895876233280212, + "grad_norm": 1.8093184232711792, + "learning_rate": 0.00010822021382092211, + "loss": 1.575, + "step": 13671 + }, + { + "epoch": 0.48962343545758946, + "grad_norm": 1.586574912071228, + "learning_rate": 0.00010820865400751772, + "loss": 1.5977, + "step": 13672 + }, + { + "epoch": 0.4896592475871578, + "grad_norm": 2.6463475227355957, + "learning_rate": 0.00010819709408367649, + "loss": 1.5361, + "step": 13673 + }, + { + "epoch": 0.48969505971672606, + "grad_norm": 1.738493800163269, + "learning_rate": 0.00010818553404955391, + "loss": 1.6058, + "step": 13674 + }, + { + "epoch": 0.4897308718462943, + "grad_norm": 1.6610356569290161, + "learning_rate": 0.00010817397390530555, + "loss": 1.608, + "step": 13675 + }, + { + "epoch": 0.48976668397586265, + "grad_norm": 1.5615880489349365, + "learning_rate": 0.00010816241365108692, + "loss": 1.6981, + "step": 13676 + }, + { + "epoch": 0.4898024961054309, + "grad_norm": 1.5565402507781982, + "learning_rate": 0.00010815085328705352, + "loss": 1.3005, + "step": 13677 + }, + { + "epoch": 0.4898383082349992, + "grad_norm": 1.6391171216964722, + "learning_rate": 0.00010813929281336092, + "loss": 1.4786, + "step": 13678 + }, + { + "epoch": 0.48987412036456746, + "grad_norm": 1.5482856035232544, + "learning_rate": 0.00010812773223016461, + "loss": 1.4095, + "step": 13679 + }, + { + "epoch": 0.4899099324941358, + "grad_norm": 1.6968822479248047, + "learning_rate": 0.00010811617153762017, + "loss": 1.5978, + "step": 13680 + }, + { + "epoch": 0.48994574462370405, + "grad_norm": 1.4903837442398071, + "learning_rate": 0.0001081046107358831, + "loss": 1.3526, + "step": 13681 + }, + { + "epoch": 0.4899815567532723, + "grad_norm": 1.5468765497207642, + "learning_rate": 0.00010809304982510897, + "loss": 1.487, + "step": 13682 + }, + { + "epoch": 0.49001736888284064, + "grad_norm": 2.329375743865967, + "learning_rate": 0.00010808148880545325, + "loss": 1.346, + "step": 13683 + }, + { + "epoch": 0.4900531810124089, + "grad_norm": 1.809410810470581, + "learning_rate": 0.00010806992767707155, + "loss": 1.8127, + "step": 13684 + }, + { + "epoch": 0.4900889931419772, + "grad_norm": 2.0929079055786133, + "learning_rate": 0.00010805836644011939, + "loss": 1.697, + "step": 13685 + }, + { + "epoch": 0.49012480527154545, + "grad_norm": 1.9662256240844727, + "learning_rate": 0.00010804680509475229, + "loss": 1.3781, + "step": 13686 + }, + { + "epoch": 0.4901606174011138, + "grad_norm": 1.429788589477539, + "learning_rate": 0.00010803524364112583, + "loss": 1.3815, + "step": 13687 + }, + { + "epoch": 0.49019642953068204, + "grad_norm": 1.752447247505188, + "learning_rate": 0.0001080236820793955, + "loss": 1.3346, + "step": 13688 + }, + { + "epoch": 0.4902322416602503, + "grad_norm": 1.527882695198059, + "learning_rate": 0.00010801212040971691, + "loss": 1.5413, + "step": 13689 + }, + { + "epoch": 0.49026805378981864, + "grad_norm": 1.3394904136657715, + "learning_rate": 0.00010800055863224555, + "loss": 1.4344, + "step": 13690 + }, + { + "epoch": 0.4903038659193869, + "grad_norm": 1.955166220664978, + "learning_rate": 0.00010798899674713699, + "loss": 1.4497, + "step": 13691 + }, + { + "epoch": 0.4903396780489552, + "grad_norm": 1.8119986057281494, + "learning_rate": 0.00010797743475454678, + "loss": 1.7094, + "step": 13692 + }, + { + "epoch": 0.49037549017852344, + "grad_norm": 1.3964793682098389, + "learning_rate": 0.0001079658726546305, + "loss": 1.4808, + "step": 13693 + }, + { + "epoch": 0.49041130230809177, + "grad_norm": 2.0613391399383545, + "learning_rate": 0.00010795431044754367, + "loss": 1.2227, + "step": 13694 + }, + { + "epoch": 0.49044711443766004, + "grad_norm": 1.5445024967193604, + "learning_rate": 0.00010794274813344185, + "loss": 1.5322, + "step": 13695 + }, + { + "epoch": 0.4904829265672283, + "grad_norm": 1.8224143981933594, + "learning_rate": 0.0001079311857124806, + "loss": 1.3995, + "step": 13696 + }, + { + "epoch": 0.49051873869679663, + "grad_norm": 2.142685890197754, + "learning_rate": 0.00010791962318481547, + "loss": 1.7053, + "step": 13697 + }, + { + "epoch": 0.4905545508263649, + "grad_norm": 1.9301700592041016, + "learning_rate": 0.00010790806055060205, + "loss": 1.7767, + "step": 13698 + }, + { + "epoch": 0.49059036295593317, + "grad_norm": 2.0079996585845947, + "learning_rate": 0.00010789649780999585, + "loss": 1.6479, + "step": 13699 + }, + { + "epoch": 0.49062617508550144, + "grad_norm": 1.3682293891906738, + "learning_rate": 0.00010788493496315246, + "loss": 1.2197, + "step": 13700 + }, + { + "epoch": 0.49066198721506976, + "grad_norm": 1.5623469352722168, + "learning_rate": 0.00010787337201022745, + "loss": 1.6502, + "step": 13701 + }, + { + "epoch": 0.49069779934463803, + "grad_norm": 1.6920688152313232, + "learning_rate": 0.00010786180895137639, + "loss": 1.2194, + "step": 13702 + }, + { + "epoch": 0.4907336114742063, + "grad_norm": 1.5415152311325073, + "learning_rate": 0.0001078502457867548, + "loss": 1.4212, + "step": 13703 + }, + { + "epoch": 0.4907694236037746, + "grad_norm": 1.6158078908920288, + "learning_rate": 0.00010783868251651833, + "loss": 1.3743, + "step": 13704 + }, + { + "epoch": 0.4908052357333429, + "grad_norm": 1.6433581113815308, + "learning_rate": 0.00010782711914082242, + "loss": 1.4155, + "step": 13705 + }, + { + "epoch": 0.49084104786291116, + "grad_norm": 1.4370219707489014, + "learning_rate": 0.00010781555565982276, + "loss": 1.429, + "step": 13706 + }, + { + "epoch": 0.49087685999247943, + "grad_norm": 1.5507996082305908, + "learning_rate": 0.00010780399207367489, + "loss": 1.1886, + "step": 13707 + }, + { + "epoch": 0.49091267212204776, + "grad_norm": 1.837388515472412, + "learning_rate": 0.00010779242838253433, + "loss": 1.5502, + "step": 13708 + }, + { + "epoch": 0.490948484251616, + "grad_norm": 1.5387765169143677, + "learning_rate": 0.00010778086458655677, + "loss": 1.5225, + "step": 13709 + }, + { + "epoch": 0.4909842963811843, + "grad_norm": 2.5038530826568604, + "learning_rate": 0.00010776930068589764, + "loss": 1.6695, + "step": 13710 + }, + { + "epoch": 0.4910201085107526, + "grad_norm": 2.6884608268737793, + "learning_rate": 0.00010775773668071265, + "loss": 1.4632, + "step": 13711 + }, + { + "epoch": 0.4910559206403209, + "grad_norm": 1.9262381792068481, + "learning_rate": 0.00010774617257115728, + "loss": 1.0545, + "step": 13712 + }, + { + "epoch": 0.49109173276988916, + "grad_norm": 1.8971209526062012, + "learning_rate": 0.00010773460835738718, + "loss": 1.4246, + "step": 13713 + }, + { + "epoch": 0.4911275448994574, + "grad_norm": 1.7927980422973633, + "learning_rate": 0.00010772304403955789, + "loss": 1.4387, + "step": 13714 + }, + { + "epoch": 0.49116335702902575, + "grad_norm": 1.5163729190826416, + "learning_rate": 0.000107711479617825, + "loss": 1.3497, + "step": 13715 + }, + { + "epoch": 0.491199169158594, + "grad_norm": 1.4729920625686646, + "learning_rate": 0.00010769991509234408, + "loss": 1.6522, + "step": 13716 + }, + { + "epoch": 0.4912349812881623, + "grad_norm": 1.7279658317565918, + "learning_rate": 0.00010768835046327077, + "loss": 1.1325, + "step": 13717 + }, + { + "epoch": 0.4912707934177306, + "grad_norm": 1.7967742681503296, + "learning_rate": 0.00010767678573076058, + "loss": 1.6004, + "step": 13718 + }, + { + "epoch": 0.4913066055472989, + "grad_norm": 2.122061252593994, + "learning_rate": 0.00010766522089496915, + "loss": 1.5491, + "step": 13719 + }, + { + "epoch": 0.49134241767686715, + "grad_norm": 1.501920461654663, + "learning_rate": 0.00010765365595605212, + "loss": 1.5364, + "step": 13720 + }, + { + "epoch": 0.4913782298064354, + "grad_norm": 1.8357949256896973, + "learning_rate": 0.00010764209091416497, + "loss": 1.5825, + "step": 13721 + }, + { + "epoch": 0.49141404193600374, + "grad_norm": 1.3038849830627441, + "learning_rate": 0.00010763052576946335, + "loss": 1.5351, + "step": 13722 + }, + { + "epoch": 0.491449854065572, + "grad_norm": 2.2974419593811035, + "learning_rate": 0.00010761896052210285, + "loss": 1.3127, + "step": 13723 + }, + { + "epoch": 0.4914856661951403, + "grad_norm": 1.4514243602752686, + "learning_rate": 0.00010760739517223908, + "loss": 1.3836, + "step": 13724 + }, + { + "epoch": 0.4915214783247086, + "grad_norm": 2.4426000118255615, + "learning_rate": 0.00010759582972002758, + "loss": 1.7502, + "step": 13725 + }, + { + "epoch": 0.4915572904542769, + "grad_norm": 1.43705153465271, + "learning_rate": 0.00010758426416562402, + "loss": 1.6914, + "step": 13726 + }, + { + "epoch": 0.49159310258384514, + "grad_norm": 2.030698776245117, + "learning_rate": 0.00010757269850918394, + "loss": 1.3523, + "step": 13727 + }, + { + "epoch": 0.4916289147134134, + "grad_norm": 2.493643045425415, + "learning_rate": 0.00010756113275086302, + "loss": 1.5557, + "step": 13728 + }, + { + "epoch": 0.49166472684298174, + "grad_norm": 2.235466241836548, + "learning_rate": 0.00010754956689081678, + "loss": 1.7038, + "step": 13729 + }, + { + "epoch": 0.49170053897255, + "grad_norm": 1.6229029893875122, + "learning_rate": 0.00010753800092920086, + "loss": 1.5441, + "step": 13730 + }, + { + "epoch": 0.4917363511021183, + "grad_norm": 1.7201504707336426, + "learning_rate": 0.00010752643486617086, + "loss": 1.6628, + "step": 13731 + }, + { + "epoch": 0.4917721632316866, + "grad_norm": 2.171365737915039, + "learning_rate": 0.00010751486870188239, + "loss": 1.4995, + "step": 13732 + }, + { + "epoch": 0.49180797536125487, + "grad_norm": 2.607131004333496, + "learning_rate": 0.00010750330243649104, + "loss": 1.6046, + "step": 13733 + }, + { + "epoch": 0.49184378749082314, + "grad_norm": 2.4226737022399902, + "learning_rate": 0.00010749173607015247, + "loss": 1.4492, + "step": 13734 + }, + { + "epoch": 0.4918795996203914, + "grad_norm": 1.5893970727920532, + "learning_rate": 0.00010748016960302223, + "loss": 1.403, + "step": 13735 + }, + { + "epoch": 0.49191541174995973, + "grad_norm": 1.5359864234924316, + "learning_rate": 0.00010746860303525595, + "loss": 1.6285, + "step": 13736 + }, + { + "epoch": 0.491951223879528, + "grad_norm": 2.033862829208374, + "learning_rate": 0.00010745703636700926, + "loss": 1.6867, + "step": 13737 + }, + { + "epoch": 0.49198703600909627, + "grad_norm": 1.6129465103149414, + "learning_rate": 0.00010744546959843777, + "loss": 1.6712, + "step": 13738 + }, + { + "epoch": 0.4920228481386646, + "grad_norm": 1.5458647012710571, + "learning_rate": 0.00010743390272969706, + "loss": 1.4477, + "step": 13739 + }, + { + "epoch": 0.49205866026823286, + "grad_norm": 1.4577420949935913, + "learning_rate": 0.00010742233576094283, + "loss": 1.3451, + "step": 13740 + }, + { + "epoch": 0.49209447239780113, + "grad_norm": 2.2439849376678467, + "learning_rate": 0.0001074107686923306, + "loss": 1.5242, + "step": 13741 + }, + { + "epoch": 0.4921302845273694, + "grad_norm": 1.4966247081756592, + "learning_rate": 0.00010739920152401605, + "loss": 1.8109, + "step": 13742 + }, + { + "epoch": 0.4921660966569377, + "grad_norm": 1.5827927589416504, + "learning_rate": 0.00010738763425615479, + "loss": 1.2532, + "step": 13743 + }, + { + "epoch": 0.492201908786506, + "grad_norm": 1.451015591621399, + "learning_rate": 0.00010737606688890245, + "loss": 1.5968, + "step": 13744 + }, + { + "epoch": 0.49223772091607426, + "grad_norm": 1.5125499963760376, + "learning_rate": 0.00010736449942241465, + "loss": 1.3855, + "step": 13745 + }, + { + "epoch": 0.4922735330456426, + "grad_norm": 1.5985307693481445, + "learning_rate": 0.000107352931856847, + "loss": 1.4018, + "step": 13746 + }, + { + "epoch": 0.49230934517521086, + "grad_norm": 1.6459596157073975, + "learning_rate": 0.00010734136419235512, + "loss": 1.4337, + "step": 13747 + }, + { + "epoch": 0.4923451573047791, + "grad_norm": 2.634399652481079, + "learning_rate": 0.00010732979642909466, + "loss": 1.4944, + "step": 13748 + }, + { + "epoch": 0.4923809694343474, + "grad_norm": 1.453285574913025, + "learning_rate": 0.00010731822856722127, + "loss": 1.595, + "step": 13749 + }, + { + "epoch": 0.4924167815639157, + "grad_norm": 2.0285205841064453, + "learning_rate": 0.00010730666060689053, + "loss": 1.8837, + "step": 13750 + }, + { + "epoch": 0.492452593693484, + "grad_norm": 1.861598253250122, + "learning_rate": 0.00010729509254825811, + "loss": 1.6692, + "step": 13751 + }, + { + "epoch": 0.49248840582305226, + "grad_norm": 2.062983751296997, + "learning_rate": 0.00010728352439147959, + "loss": 1.6496, + "step": 13752 + }, + { + "epoch": 0.4925242179526206, + "grad_norm": 2.475240707397461, + "learning_rate": 0.00010727195613671071, + "loss": 1.6116, + "step": 13753 + }, + { + "epoch": 0.49256003008218885, + "grad_norm": 1.5981770753860474, + "learning_rate": 0.00010726038778410699, + "loss": 1.3383, + "step": 13754 + }, + { + "epoch": 0.4925958422117571, + "grad_norm": 1.3601890802383423, + "learning_rate": 0.00010724881933382416, + "loss": 1.55, + "step": 13755 + }, + { + "epoch": 0.4926316543413254, + "grad_norm": 1.8555890321731567, + "learning_rate": 0.00010723725078601778, + "loss": 1.701, + "step": 13756 + }, + { + "epoch": 0.4926674664708937, + "grad_norm": 1.4763880968093872, + "learning_rate": 0.00010722568214084354, + "loss": 1.7419, + "step": 13757 + }, + { + "epoch": 0.492703278600462, + "grad_norm": 1.3820867538452148, + "learning_rate": 0.00010721411339845707, + "loss": 1.4405, + "step": 13758 + }, + { + "epoch": 0.49273909073003025, + "grad_norm": 1.5505809783935547, + "learning_rate": 0.00010720254455901399, + "loss": 1.3917, + "step": 13759 + }, + { + "epoch": 0.4927749028595986, + "grad_norm": 3.055396556854248, + "learning_rate": 0.00010719097562266998, + "loss": 1.2911, + "step": 13760 + }, + { + "epoch": 0.49281071498916684, + "grad_norm": 1.4654674530029297, + "learning_rate": 0.00010717940658958066, + "loss": 1.5803, + "step": 13761 + }, + { + "epoch": 0.4928465271187351, + "grad_norm": 1.8840539455413818, + "learning_rate": 0.00010716783745990169, + "loss": 1.2817, + "step": 13762 + }, + { + "epoch": 0.4928823392483034, + "grad_norm": 1.674445390701294, + "learning_rate": 0.0001071562682337887, + "loss": 1.5135, + "step": 13763 + }, + { + "epoch": 0.4929181513778717, + "grad_norm": 1.557632327079773, + "learning_rate": 0.0001071446989113974, + "loss": 1.4121, + "step": 13764 + }, + { + "epoch": 0.49295396350744, + "grad_norm": 1.402957558631897, + "learning_rate": 0.00010713312949288334, + "loss": 1.3646, + "step": 13765 + }, + { + "epoch": 0.49298977563700824, + "grad_norm": 1.951153039932251, + "learning_rate": 0.00010712155997840225, + "loss": 1.468, + "step": 13766 + }, + { + "epoch": 0.4930255877665765, + "grad_norm": 1.7936666011810303, + "learning_rate": 0.00010710999036810975, + "loss": 1.123, + "step": 13767 + }, + { + "epoch": 0.49306139989614484, + "grad_norm": 1.553545594215393, + "learning_rate": 0.00010709842066216151, + "loss": 1.1884, + "step": 13768 + }, + { + "epoch": 0.4930972120257131, + "grad_norm": 1.3447620868682861, + "learning_rate": 0.00010708685086071316, + "loss": 1.2112, + "step": 13769 + }, + { + "epoch": 0.4931330241552814, + "grad_norm": 2.356231212615967, + "learning_rate": 0.00010707528096392038, + "loss": 1.3566, + "step": 13770 + }, + { + "epoch": 0.4931688362848497, + "grad_norm": 1.819908618927002, + "learning_rate": 0.00010706371097193881, + "loss": 1.7402, + "step": 13771 + }, + { + "epoch": 0.49320464841441797, + "grad_norm": 2.1119303703308105, + "learning_rate": 0.00010705214088492415, + "loss": 1.3566, + "step": 13772 + }, + { + "epoch": 0.49324046054398624, + "grad_norm": 1.6259195804595947, + "learning_rate": 0.00010704057070303201, + "loss": 1.4016, + "step": 13773 + }, + { + "epoch": 0.4932762726735545, + "grad_norm": 1.7888505458831787, + "learning_rate": 0.00010702900042641806, + "loss": 1.5483, + "step": 13774 + }, + { + "epoch": 0.49331208480312283, + "grad_norm": 1.7072391510009766, + "learning_rate": 0.00010701743005523801, + "loss": 1.4431, + "step": 13775 + }, + { + "epoch": 0.4933478969326911, + "grad_norm": 2.1504554748535156, + "learning_rate": 0.00010700585958964744, + "loss": 1.7045, + "step": 13776 + }, + { + "epoch": 0.49338370906225937, + "grad_norm": 2.100933790206909, + "learning_rate": 0.00010699428902980211, + "loss": 1.7934, + "step": 13777 + }, + { + "epoch": 0.4934195211918277, + "grad_norm": 1.4316816329956055, + "learning_rate": 0.00010698271837585762, + "loss": 1.3737, + "step": 13778 + }, + { + "epoch": 0.49345533332139596, + "grad_norm": 1.4523588418960571, + "learning_rate": 0.0001069711476279697, + "loss": 1.533, + "step": 13779 + }, + { + "epoch": 0.49349114545096423, + "grad_norm": 1.4418319463729858, + "learning_rate": 0.00010695957678629391, + "loss": 1.3883, + "step": 13780 + }, + { + "epoch": 0.4935269575805325, + "grad_norm": 1.8043620586395264, + "learning_rate": 0.00010694800585098606, + "loss": 1.4568, + "step": 13781 + }, + { + "epoch": 0.4935627697101008, + "grad_norm": 1.7246334552764893, + "learning_rate": 0.00010693643482220173, + "loss": 1.6841, + "step": 13782 + }, + { + "epoch": 0.4935985818396691, + "grad_norm": 1.7109289169311523, + "learning_rate": 0.0001069248637000966, + "loss": 1.3743, + "step": 13783 + }, + { + "epoch": 0.49363439396923736, + "grad_norm": 1.5216609239578247, + "learning_rate": 0.0001069132924848264, + "loss": 1.3224, + "step": 13784 + }, + { + "epoch": 0.4936702060988057, + "grad_norm": 1.9945144653320312, + "learning_rate": 0.00010690172117654672, + "loss": 1.5012, + "step": 13785 + }, + { + "epoch": 0.49370601822837396, + "grad_norm": 1.4590290784835815, + "learning_rate": 0.00010689014977541332, + "loss": 1.408, + "step": 13786 + }, + { + "epoch": 0.4937418303579422, + "grad_norm": 1.3395782709121704, + "learning_rate": 0.00010687857828158182, + "loss": 1.5821, + "step": 13787 + }, + { + "epoch": 0.4937776424875105, + "grad_norm": 1.3145579099655151, + "learning_rate": 0.00010686700669520792, + "loss": 1.5071, + "step": 13788 + }, + { + "epoch": 0.4938134546170788, + "grad_norm": 1.4483972787857056, + "learning_rate": 0.00010685543501644732, + "loss": 1.36, + "step": 13789 + }, + { + "epoch": 0.4938492667466471, + "grad_norm": 5.216784477233887, + "learning_rate": 0.00010684386324545567, + "loss": 1.8394, + "step": 13790 + }, + { + "epoch": 0.49388507887621536, + "grad_norm": 1.2942969799041748, + "learning_rate": 0.0001068322913823887, + "loss": 1.1399, + "step": 13791 + }, + { + "epoch": 0.4939208910057837, + "grad_norm": 1.4806745052337646, + "learning_rate": 0.00010682071942740202, + "loss": 1.2029, + "step": 13792 + }, + { + "epoch": 0.49395670313535195, + "grad_norm": 1.8519744873046875, + "learning_rate": 0.0001068091473806514, + "loss": 1.3285, + "step": 13793 + }, + { + "epoch": 0.4939925152649202, + "grad_norm": 1.7038651704788208, + "learning_rate": 0.00010679757524229244, + "loss": 1.3596, + "step": 13794 + }, + { + "epoch": 0.4940283273944885, + "grad_norm": 1.638228178024292, + "learning_rate": 0.0001067860030124809, + "loss": 1.328, + "step": 13795 + }, + { + "epoch": 0.4940641395240568, + "grad_norm": 1.7471988201141357, + "learning_rate": 0.00010677443069137242, + "loss": 1.2233, + "step": 13796 + }, + { + "epoch": 0.4940999516536251, + "grad_norm": 2.5956523418426514, + "learning_rate": 0.00010676285827912276, + "loss": 1.2647, + "step": 13797 + }, + { + "epoch": 0.49413576378319335, + "grad_norm": 1.6266028881072998, + "learning_rate": 0.00010675128577588751, + "loss": 1.6781, + "step": 13798 + }, + { + "epoch": 0.4941715759127617, + "grad_norm": 1.3634744882583618, + "learning_rate": 0.00010673971318182247, + "loss": 1.6182, + "step": 13799 + }, + { + "epoch": 0.49420738804232994, + "grad_norm": 1.7638877630233765, + "learning_rate": 0.00010672814049708326, + "loss": 1.5701, + "step": 13800 + }, + { + "epoch": 0.4942432001718982, + "grad_norm": 1.6434555053710938, + "learning_rate": 0.0001067165677218256, + "loss": 1.2647, + "step": 13801 + }, + { + "epoch": 0.4942790123014665, + "grad_norm": 1.9135644435882568, + "learning_rate": 0.00010670499485620517, + "loss": 1.5567, + "step": 13802 + }, + { + "epoch": 0.4943148244310348, + "grad_norm": 1.8451272249221802, + "learning_rate": 0.0001066934219003777, + "loss": 1.2508, + "step": 13803 + }, + { + "epoch": 0.4943506365606031, + "grad_norm": 1.3484454154968262, + "learning_rate": 0.00010668184885449886, + "loss": 1.4343, + "step": 13804 + }, + { + "epoch": 0.49438644869017134, + "grad_norm": 1.644727110862732, + "learning_rate": 0.00010667027571872436, + "loss": 1.2379, + "step": 13805 + }, + { + "epoch": 0.49442226081973967, + "grad_norm": 1.5474905967712402, + "learning_rate": 0.00010665870249320993, + "loss": 1.5323, + "step": 13806 + }, + { + "epoch": 0.49445807294930794, + "grad_norm": 1.9647862911224365, + "learning_rate": 0.00010664712917811121, + "loss": 1.7564, + "step": 13807 + }, + { + "epoch": 0.4944938850788762, + "grad_norm": 1.648308515548706, + "learning_rate": 0.000106635555773584, + "loss": 1.36, + "step": 13808 + }, + { + "epoch": 0.4945296972084445, + "grad_norm": 1.834714651107788, + "learning_rate": 0.00010662398227978389, + "loss": 1.223, + "step": 13809 + }, + { + "epoch": 0.4945655093380128, + "grad_norm": 1.4639192819595337, + "learning_rate": 0.00010661240869686669, + "loss": 1.4133, + "step": 13810 + }, + { + "epoch": 0.49460132146758107, + "grad_norm": 1.9548790454864502, + "learning_rate": 0.00010660083502498801, + "loss": 1.2963, + "step": 13811 + }, + { + "epoch": 0.49463713359714934, + "grad_norm": 1.7296000719070435, + "learning_rate": 0.00010658926126430364, + "loss": 1.6591, + "step": 13812 + }, + { + "epoch": 0.49467294572671766, + "grad_norm": 1.7165069580078125, + "learning_rate": 0.00010657768741496923, + "loss": 1.3809, + "step": 13813 + }, + { + "epoch": 0.49470875785628593, + "grad_norm": 1.616808295249939, + "learning_rate": 0.00010656611347714056, + "loss": 1.647, + "step": 13814 + }, + { + "epoch": 0.4947445699858542, + "grad_norm": 2.2919797897338867, + "learning_rate": 0.00010655453945097327, + "loss": 1.8562, + "step": 13815 + }, + { + "epoch": 0.49478038211542247, + "grad_norm": 2.0228257179260254, + "learning_rate": 0.0001065429653366231, + "loss": 1.5348, + "step": 13816 + }, + { + "epoch": 0.4948161942449908, + "grad_norm": 2.164900064468384, + "learning_rate": 0.00010653139113424581, + "loss": 1.5874, + "step": 13817 + }, + { + "epoch": 0.49485200637455906, + "grad_norm": 1.9388035535812378, + "learning_rate": 0.00010651981684399705, + "loss": 1.3361, + "step": 13818 + }, + { + "epoch": 0.49488781850412733, + "grad_norm": 2.0324113368988037, + "learning_rate": 0.0001065082424660326, + "loss": 1.2808, + "step": 13819 + }, + { + "epoch": 0.49492363063369565, + "grad_norm": 1.7220652103424072, + "learning_rate": 0.00010649666800050808, + "loss": 1.3884, + "step": 13820 + }, + { + "epoch": 0.4949594427632639, + "grad_norm": 2.1359825134277344, + "learning_rate": 0.00010648509344757933, + "loss": 1.4864, + "step": 13821 + }, + { + "epoch": 0.4949952548928322, + "grad_norm": 1.7690446376800537, + "learning_rate": 0.00010647351880740197, + "loss": 1.5202, + "step": 13822 + }, + { + "epoch": 0.49503106702240046, + "grad_norm": 1.6136664152145386, + "learning_rate": 0.00010646194408013179, + "loss": 1.4969, + "step": 13823 + }, + { + "epoch": 0.4950668791519688, + "grad_norm": 1.6835991144180298, + "learning_rate": 0.00010645036926592449, + "loss": 1.3849, + "step": 13824 + }, + { + "epoch": 0.49510269128153706, + "grad_norm": 2.4620182514190674, + "learning_rate": 0.00010643879436493578, + "loss": 1.602, + "step": 13825 + }, + { + "epoch": 0.4951385034111053, + "grad_norm": 2.525754928588867, + "learning_rate": 0.0001064272193773214, + "loss": 1.4661, + "step": 13826 + }, + { + "epoch": 0.49517431554067365, + "grad_norm": 2.1498541831970215, + "learning_rate": 0.00010641564430323707, + "loss": 1.4336, + "step": 13827 + }, + { + "epoch": 0.4952101276702419, + "grad_norm": 2.0926995277404785, + "learning_rate": 0.00010640406914283854, + "loss": 1.2882, + "step": 13828 + }, + { + "epoch": 0.4952459397998102, + "grad_norm": 2.03147554397583, + "learning_rate": 0.00010639249389628149, + "loss": 1.7856, + "step": 13829 + }, + { + "epoch": 0.49528175192937846, + "grad_norm": 1.6365413665771484, + "learning_rate": 0.00010638091856372172, + "loss": 1.3358, + "step": 13830 + }, + { + "epoch": 0.4953175640589468, + "grad_norm": 1.5908212661743164, + "learning_rate": 0.00010636934314531488, + "loss": 1.7253, + "step": 13831 + }, + { + "epoch": 0.49535337618851505, + "grad_norm": 1.616167664527893, + "learning_rate": 0.00010635776764121677, + "loss": 1.6732, + "step": 13832 + }, + { + "epoch": 0.4953891883180833, + "grad_norm": 1.6805557012557983, + "learning_rate": 0.00010634619205158307, + "loss": 1.7188, + "step": 13833 + }, + { + "epoch": 0.49542500044765164, + "grad_norm": 1.613982081413269, + "learning_rate": 0.00010633461637656958, + "loss": 1.4378, + "step": 13834 + }, + { + "epoch": 0.4954608125772199, + "grad_norm": 1.827687382698059, + "learning_rate": 0.00010632304061633199, + "loss": 1.4883, + "step": 13835 + }, + { + "epoch": 0.4954966247067882, + "grad_norm": 1.3360340595245361, + "learning_rate": 0.00010631146477102602, + "loss": 1.2353, + "step": 13836 + }, + { + "epoch": 0.49553243683635645, + "grad_norm": 1.3688260316848755, + "learning_rate": 0.00010629988884080745, + "loss": 1.4722, + "step": 13837 + }, + { + "epoch": 0.4955682489659248, + "grad_norm": 2.0016164779663086, + "learning_rate": 0.00010628831282583201, + "loss": 1.2342, + "step": 13838 + }, + { + "epoch": 0.49560406109549304, + "grad_norm": 2.039696216583252, + "learning_rate": 0.00010627673672625542, + "loss": 1.5256, + "step": 13839 + }, + { + "epoch": 0.4956398732250613, + "grad_norm": 1.414087176322937, + "learning_rate": 0.00010626516054223341, + "loss": 1.5765, + "step": 13840 + }, + { + "epoch": 0.49567568535462964, + "grad_norm": 1.4707834720611572, + "learning_rate": 0.0001062535842739218, + "loss": 1.3901, + "step": 13841 + }, + { + "epoch": 0.4957114974841979, + "grad_norm": 1.269789695739746, + "learning_rate": 0.00010624200792147622, + "loss": 1.707, + "step": 13842 + }, + { + "epoch": 0.4957473096137662, + "grad_norm": 1.7977982759475708, + "learning_rate": 0.00010623043148505254, + "loss": 1.3614, + "step": 13843 + }, + { + "epoch": 0.49578312174333444, + "grad_norm": 2.498763084411621, + "learning_rate": 0.00010621885496480641, + "loss": 1.4808, + "step": 13844 + }, + { + "epoch": 0.49581893387290277, + "grad_norm": 1.9062105417251587, + "learning_rate": 0.00010620727836089359, + "loss": 1.6724, + "step": 13845 + }, + { + "epoch": 0.49585474600247104, + "grad_norm": 1.6607482433319092, + "learning_rate": 0.00010619570167346987, + "loss": 1.3214, + "step": 13846 + }, + { + "epoch": 0.4958905581320393, + "grad_norm": 1.4918594360351562, + "learning_rate": 0.00010618412490269096, + "loss": 1.4561, + "step": 13847 + }, + { + "epoch": 0.49592637026160763, + "grad_norm": 1.8576520681381226, + "learning_rate": 0.00010617254804871264, + "loss": 1.4214, + "step": 13848 + }, + { + "epoch": 0.4959621823911759, + "grad_norm": 1.425275444984436, + "learning_rate": 0.00010616097111169063, + "loss": 1.4187, + "step": 13849 + }, + { + "epoch": 0.49599799452074417, + "grad_norm": 1.8478344678878784, + "learning_rate": 0.00010614939409178072, + "loss": 1.4093, + "step": 13850 + }, + { + "epoch": 0.49603380665031244, + "grad_norm": 2.1850032806396484, + "learning_rate": 0.00010613781698913863, + "loss": 1.2745, + "step": 13851 + }, + { + "epoch": 0.49606961877988076, + "grad_norm": 1.5047380924224854, + "learning_rate": 0.00010612623980392016, + "loss": 1.1031, + "step": 13852 + }, + { + "epoch": 0.49610543090944903, + "grad_norm": 1.6822580099105835, + "learning_rate": 0.00010611466253628101, + "loss": 1.4585, + "step": 13853 + }, + { + "epoch": 0.4961412430390173, + "grad_norm": 1.5494749546051025, + "learning_rate": 0.00010610308518637697, + "loss": 1.3333, + "step": 13854 + }, + { + "epoch": 0.4961770551685856, + "grad_norm": 1.5541067123413086, + "learning_rate": 0.00010609150775436378, + "loss": 1.5149, + "step": 13855 + }, + { + "epoch": 0.4962128672981539, + "grad_norm": 1.8944567441940308, + "learning_rate": 0.00010607993024039722, + "loss": 1.2425, + "step": 13856 + }, + { + "epoch": 0.49624867942772216, + "grad_norm": 1.1840757131576538, + "learning_rate": 0.00010606835264463305, + "loss": 1.4237, + "step": 13857 + }, + { + "epoch": 0.49628449155729043, + "grad_norm": 2.1415929794311523, + "learning_rate": 0.00010605677496722699, + "loss": 1.7492, + "step": 13858 + }, + { + "epoch": 0.49632030368685875, + "grad_norm": 1.6626441478729248, + "learning_rate": 0.00010604519720833486, + "loss": 1.6241, + "step": 13859 + }, + { + "epoch": 0.496356115816427, + "grad_norm": 1.551418423652649, + "learning_rate": 0.00010603361936811239, + "loss": 1.5366, + "step": 13860 + }, + { + "epoch": 0.4963919279459953, + "grad_norm": 2.124976396560669, + "learning_rate": 0.00010602204144671539, + "loss": 1.6503, + "step": 13861 + }, + { + "epoch": 0.4964277400755636, + "grad_norm": 2.3942666053771973, + "learning_rate": 0.00010601046344429955, + "loss": 1.5967, + "step": 13862 + }, + { + "epoch": 0.4964635522051319, + "grad_norm": 2.203838586807251, + "learning_rate": 0.0001059988853610207, + "loss": 1.3865, + "step": 13863 + }, + { + "epoch": 0.49649936433470016, + "grad_norm": 1.3743383884429932, + "learning_rate": 0.00010598730719703456, + "loss": 1.78, + "step": 13864 + }, + { + "epoch": 0.4965351764642684, + "grad_norm": 1.5764050483703613, + "learning_rate": 0.00010597572895249694, + "loss": 1.1433, + "step": 13865 + }, + { + "epoch": 0.49657098859383675, + "grad_norm": 2.720665454864502, + "learning_rate": 0.00010596415062756358, + "loss": 1.4423, + "step": 13866 + }, + { + "epoch": 0.496606800723405, + "grad_norm": 1.9178234338760376, + "learning_rate": 0.0001059525722223903, + "loss": 1.5331, + "step": 13867 + }, + { + "epoch": 0.4966426128529733, + "grad_norm": 1.4973068237304688, + "learning_rate": 0.0001059409937371328, + "loss": 1.5225, + "step": 13868 + }, + { + "epoch": 0.4966784249825416, + "grad_norm": 1.7021870613098145, + "learning_rate": 0.00010592941517194692, + "loss": 1.3905, + "step": 13869 + }, + { + "epoch": 0.4967142371121099, + "grad_norm": 1.600563645362854, + "learning_rate": 0.00010591783652698841, + "loss": 1.3413, + "step": 13870 + }, + { + "epoch": 0.49675004924167815, + "grad_norm": 1.8694027662277222, + "learning_rate": 0.00010590625780241302, + "loss": 1.4979, + "step": 13871 + }, + { + "epoch": 0.4967858613712464, + "grad_norm": 1.3049858808517456, + "learning_rate": 0.00010589467899837657, + "loss": 1.2021, + "step": 13872 + }, + { + "epoch": 0.49682167350081474, + "grad_norm": 1.7500213384628296, + "learning_rate": 0.0001058831001150348, + "loss": 1.4798, + "step": 13873 + }, + { + "epoch": 0.496857485630383, + "grad_norm": 1.521600365638733, + "learning_rate": 0.00010587152115254353, + "loss": 1.5667, + "step": 13874 + }, + { + "epoch": 0.4968932977599513, + "grad_norm": 1.659149408340454, + "learning_rate": 0.0001058599421110585, + "loss": 1.3085, + "step": 13875 + }, + { + "epoch": 0.4969291098895196, + "grad_norm": 1.9441237449645996, + "learning_rate": 0.0001058483629907355, + "loss": 1.5502, + "step": 13876 + }, + { + "epoch": 0.4969649220190879, + "grad_norm": 1.8426682949066162, + "learning_rate": 0.00010583678379173032, + "loss": 1.7238, + "step": 13877 + }, + { + "epoch": 0.49700073414865614, + "grad_norm": 2.1774513721466064, + "learning_rate": 0.00010582520451419877, + "loss": 1.4809, + "step": 13878 + }, + { + "epoch": 0.4970365462782244, + "grad_norm": 1.3162360191345215, + "learning_rate": 0.0001058136251582966, + "loss": 1.573, + "step": 13879 + }, + { + "epoch": 0.49707235840779274, + "grad_norm": 1.2653931379318237, + "learning_rate": 0.00010580204572417957, + "loss": 1.6874, + "step": 13880 + }, + { + "epoch": 0.497108170537361, + "grad_norm": 2.2073793411254883, + "learning_rate": 0.00010579046621200355, + "loss": 1.8626, + "step": 13881 + }, + { + "epoch": 0.4971439826669293, + "grad_norm": 1.9331858158111572, + "learning_rate": 0.00010577888662192424, + "loss": 1.6169, + "step": 13882 + }, + { + "epoch": 0.4971797947964976, + "grad_norm": 1.5831600427627563, + "learning_rate": 0.00010576730695409747, + "loss": 1.5547, + "step": 13883 + }, + { + "epoch": 0.49721560692606587, + "grad_norm": 1.8316388130187988, + "learning_rate": 0.00010575572720867901, + "loss": 1.7315, + "step": 13884 + }, + { + "epoch": 0.49725141905563414, + "grad_norm": 1.7903798818588257, + "learning_rate": 0.0001057441473858247, + "loss": 1.4935, + "step": 13885 + }, + { + "epoch": 0.4972872311852024, + "grad_norm": 1.3750848770141602, + "learning_rate": 0.00010573256748569027, + "loss": 1.3742, + "step": 13886 + }, + { + "epoch": 0.49732304331477073, + "grad_norm": 2.335555076599121, + "learning_rate": 0.00010572098750843155, + "loss": 1.5615, + "step": 13887 + }, + { + "epoch": 0.497358855444339, + "grad_norm": 1.5992259979248047, + "learning_rate": 0.00010570940745420433, + "loss": 1.6146, + "step": 13888 + }, + { + "epoch": 0.49739466757390727, + "grad_norm": 1.6702439785003662, + "learning_rate": 0.00010569782732316438, + "loss": 1.443, + "step": 13889 + }, + { + "epoch": 0.4974304797034756, + "grad_norm": 1.9406352043151855, + "learning_rate": 0.00010568624711546752, + "loss": 1.4274, + "step": 13890 + }, + { + "epoch": 0.49746629183304386, + "grad_norm": 1.4451533555984497, + "learning_rate": 0.00010567466683126952, + "loss": 1.5557, + "step": 13891 + }, + { + "epoch": 0.49750210396261213, + "grad_norm": 2.7517757415771484, + "learning_rate": 0.00010566308647072624, + "loss": 1.5878, + "step": 13892 + }, + { + "epoch": 0.4975379160921804, + "grad_norm": 1.9673051834106445, + "learning_rate": 0.0001056515060339934, + "loss": 1.7578, + "step": 13893 + }, + { + "epoch": 0.4975737282217487, + "grad_norm": 1.653322458267212, + "learning_rate": 0.00010563992552122686, + "loss": 1.2221, + "step": 13894 + }, + { + "epoch": 0.497609540351317, + "grad_norm": 1.8169872760772705, + "learning_rate": 0.00010562834493258237, + "loss": 1.4192, + "step": 13895 + }, + { + "epoch": 0.49764535248088526, + "grad_norm": 1.7515963315963745, + "learning_rate": 0.00010561676426821581, + "loss": 1.4524, + "step": 13896 + }, + { + "epoch": 0.4976811646104536, + "grad_norm": 1.6058905124664307, + "learning_rate": 0.00010560518352828288, + "loss": 1.5834, + "step": 13897 + }, + { + "epoch": 0.49771697674002185, + "grad_norm": 2.050147294998169, + "learning_rate": 0.00010559360271293947, + "loss": 1.3106, + "step": 13898 + }, + { + "epoch": 0.4977527888695901, + "grad_norm": 1.8379297256469727, + "learning_rate": 0.00010558202182234132, + "loss": 1.6386, + "step": 13899 + }, + { + "epoch": 0.4977886009991584, + "grad_norm": 1.9180848598480225, + "learning_rate": 0.00010557044085664428, + "loss": 1.4842, + "step": 13900 + }, + { + "epoch": 0.4978244131287267, + "grad_norm": 1.9606422185897827, + "learning_rate": 0.00010555885981600416, + "loss": 1.225, + "step": 13901 + }, + { + "epoch": 0.497860225258295, + "grad_norm": 1.8793365955352783, + "learning_rate": 0.00010554727870057671, + "loss": 1.6759, + "step": 13902 + }, + { + "epoch": 0.49789603738786326, + "grad_norm": 2.0016098022460938, + "learning_rate": 0.00010553569751051782, + "loss": 1.4724, + "step": 13903 + }, + { + "epoch": 0.4979318495174316, + "grad_norm": 2.300502061843872, + "learning_rate": 0.00010552411624598325, + "loss": 1.3582, + "step": 13904 + }, + { + "epoch": 0.49796766164699985, + "grad_norm": 2.5776402950286865, + "learning_rate": 0.00010551253490712882, + "loss": 1.529, + "step": 13905 + }, + { + "epoch": 0.4980034737765681, + "grad_norm": 1.8135322332382202, + "learning_rate": 0.00010550095349411033, + "loss": 1.3956, + "step": 13906 + }, + { + "epoch": 0.4980392859061364, + "grad_norm": 1.6198707818984985, + "learning_rate": 0.00010548937200708365, + "loss": 1.348, + "step": 13907 + }, + { + "epoch": 0.4980750980357047, + "grad_norm": 1.9541133642196655, + "learning_rate": 0.0001054777904462045, + "loss": 1.4611, + "step": 13908 + }, + { + "epoch": 0.498110910165273, + "grad_norm": 1.7286028861999512, + "learning_rate": 0.00010546620881162876, + "loss": 1.1831, + "step": 13909 + }, + { + "epoch": 0.49814672229484125, + "grad_norm": 1.5478944778442383, + "learning_rate": 0.00010545462710351224, + "loss": 1.5389, + "step": 13910 + }, + { + "epoch": 0.4981825344244096, + "grad_norm": 1.9012272357940674, + "learning_rate": 0.00010544304532201075, + "loss": 1.2058, + "step": 13911 + }, + { + "epoch": 0.49821834655397784, + "grad_norm": 1.586827039718628, + "learning_rate": 0.0001054314634672801, + "loss": 1.8115, + "step": 13912 + }, + { + "epoch": 0.4982541586835461, + "grad_norm": 1.4416097402572632, + "learning_rate": 0.00010541988153947609, + "loss": 1.4744, + "step": 13913 + }, + { + "epoch": 0.4982899708131144, + "grad_norm": 1.427348256111145, + "learning_rate": 0.00010540829953875462, + "loss": 1.3261, + "step": 13914 + }, + { + "epoch": 0.4983257829426827, + "grad_norm": 1.6142998933792114, + "learning_rate": 0.00010539671746527142, + "loss": 1.4967, + "step": 13915 + }, + { + "epoch": 0.498361595072251, + "grad_norm": 1.6555075645446777, + "learning_rate": 0.00010538513531918237, + "loss": 1.2144, + "step": 13916 + }, + { + "epoch": 0.49839740720181924, + "grad_norm": 1.8272299766540527, + "learning_rate": 0.00010537355310064323, + "loss": 1.3989, + "step": 13917 + }, + { + "epoch": 0.49843321933138757, + "grad_norm": 1.2751270532608032, + "learning_rate": 0.00010536197080980991, + "loss": 1.6073, + "step": 13918 + }, + { + "epoch": 0.49846903146095584, + "grad_norm": 1.965511679649353, + "learning_rate": 0.00010535038844683816, + "loss": 1.6124, + "step": 13919 + }, + { + "epoch": 0.4985048435905241, + "grad_norm": 1.3621838092803955, + "learning_rate": 0.00010533880601188384, + "loss": 1.436, + "step": 13920 + }, + { + "epoch": 0.4985406557200924, + "grad_norm": 2.440661668777466, + "learning_rate": 0.00010532722350510277, + "loss": 1.3973, + "step": 13921 + }, + { + "epoch": 0.4985764678496607, + "grad_norm": 1.5295976400375366, + "learning_rate": 0.00010531564092665079, + "loss": 1.5994, + "step": 13922 + }, + { + "epoch": 0.49861227997922897, + "grad_norm": 2.2189676761627197, + "learning_rate": 0.00010530405827668372, + "loss": 1.3265, + "step": 13923 + }, + { + "epoch": 0.49864809210879724, + "grad_norm": 1.4622187614440918, + "learning_rate": 0.00010529247555535738, + "loss": 1.7136, + "step": 13924 + }, + { + "epoch": 0.49868390423836556, + "grad_norm": 1.473183274269104, + "learning_rate": 0.00010528089276282762, + "loss": 1.6749, + "step": 13925 + }, + { + "epoch": 0.49871971636793383, + "grad_norm": 1.5246021747589111, + "learning_rate": 0.00010526930989925023, + "loss": 1.5913, + "step": 13926 + }, + { + "epoch": 0.4987555284975021, + "grad_norm": 1.6022064685821533, + "learning_rate": 0.0001052577269647811, + "loss": 1.4599, + "step": 13927 + }, + { + "epoch": 0.49879134062707037, + "grad_norm": 1.3866850137710571, + "learning_rate": 0.00010524614395957602, + "loss": 1.3702, + "step": 13928 + }, + { + "epoch": 0.4988271527566387, + "grad_norm": 2.0144548416137695, + "learning_rate": 0.00010523456088379084, + "loss": 1.582, + "step": 13929 + }, + { + "epoch": 0.49886296488620696, + "grad_norm": 2.1504111289978027, + "learning_rate": 0.00010522297773758141, + "loss": 1.6746, + "step": 13930 + }, + { + "epoch": 0.49889877701577523, + "grad_norm": 1.4238390922546387, + "learning_rate": 0.00010521139452110354, + "loss": 1.6415, + "step": 13931 + }, + { + "epoch": 0.49893458914534355, + "grad_norm": 2.0092551708221436, + "learning_rate": 0.0001051998112345131, + "loss": 1.1434, + "step": 13932 + }, + { + "epoch": 0.4989704012749118, + "grad_norm": 1.7443785667419434, + "learning_rate": 0.00010518822787796587, + "loss": 1.6474, + "step": 13933 + }, + { + "epoch": 0.4990062134044801, + "grad_norm": 1.9198230504989624, + "learning_rate": 0.00010517664445161775, + "loss": 1.291, + "step": 13934 + }, + { + "epoch": 0.49904202553404836, + "grad_norm": 2.1047768592834473, + "learning_rate": 0.00010516506095562455, + "loss": 1.4204, + "step": 13935 + }, + { + "epoch": 0.4990778376636167, + "grad_norm": 1.772157907485962, + "learning_rate": 0.00010515347739014212, + "loss": 1.4815, + "step": 13936 + }, + { + "epoch": 0.49911364979318495, + "grad_norm": 1.4584022760391235, + "learning_rate": 0.00010514189375532629, + "loss": 1.1986, + "step": 13937 + }, + { + "epoch": 0.4991494619227532, + "grad_norm": 1.6682506799697876, + "learning_rate": 0.00010513031005133293, + "loss": 1.1585, + "step": 13938 + }, + { + "epoch": 0.49918527405232155, + "grad_norm": 2.0032548904418945, + "learning_rate": 0.00010511872627831785, + "loss": 1.8689, + "step": 13939 + }, + { + "epoch": 0.4992210861818898, + "grad_norm": 1.799172043800354, + "learning_rate": 0.00010510714243643693, + "loss": 1.2141, + "step": 13940 + }, + { + "epoch": 0.4992568983114581, + "grad_norm": 1.4442138671875, + "learning_rate": 0.00010509555852584598, + "loss": 1.3636, + "step": 13941 + }, + { + "epoch": 0.49929271044102636, + "grad_norm": 1.4175442457199097, + "learning_rate": 0.00010508397454670085, + "loss": 1.4478, + "step": 13942 + }, + { + "epoch": 0.4993285225705947, + "grad_norm": 2.0027573108673096, + "learning_rate": 0.00010507239049915742, + "loss": 1.8723, + "step": 13943 + }, + { + "epoch": 0.49936433470016295, + "grad_norm": 1.363532543182373, + "learning_rate": 0.00010506080638337152, + "loss": 1.5395, + "step": 13944 + }, + { + "epoch": 0.4994001468297312, + "grad_norm": 2.1520915031433105, + "learning_rate": 0.000105049222199499, + "loss": 1.5888, + "step": 13945 + }, + { + "epoch": 0.49943595895929954, + "grad_norm": 1.5983831882476807, + "learning_rate": 0.0001050376379476957, + "loss": 1.2995, + "step": 13946 + }, + { + "epoch": 0.4994717710888678, + "grad_norm": 1.9708709716796875, + "learning_rate": 0.00010502605362811748, + "loss": 1.3502, + "step": 13947 + }, + { + "epoch": 0.4995075832184361, + "grad_norm": 1.6863350868225098, + "learning_rate": 0.00010501446924092018, + "loss": 1.3486, + "step": 13948 + }, + { + "epoch": 0.49954339534800435, + "grad_norm": 1.7589701414108276, + "learning_rate": 0.0001050028847862597, + "loss": 1.4596, + "step": 13949 + }, + { + "epoch": 0.4995792074775727, + "grad_norm": 1.4610109329223633, + "learning_rate": 0.00010499130026429182, + "loss": 1.5116, + "step": 13950 + }, + { + "epoch": 0.49961501960714094, + "grad_norm": 1.776166558265686, + "learning_rate": 0.00010497971567517246, + "loss": 1.3748, + "step": 13951 + }, + { + "epoch": 0.4996508317367092, + "grad_norm": 2.0865702629089355, + "learning_rate": 0.00010496813101905745, + "loss": 1.4648, + "step": 13952 + }, + { + "epoch": 0.49968664386627754, + "grad_norm": 1.7952061891555786, + "learning_rate": 0.00010495654629610264, + "loss": 1.233, + "step": 13953 + }, + { + "epoch": 0.4997224559958458, + "grad_norm": 2.238560914993286, + "learning_rate": 0.00010494496150646387, + "loss": 1.596, + "step": 13954 + }, + { + "epoch": 0.4997582681254141, + "grad_norm": 1.680570125579834, + "learning_rate": 0.000104933376650297, + "loss": 1.4756, + "step": 13955 + }, + { + "epoch": 0.49979408025498234, + "grad_norm": 2.210883140563965, + "learning_rate": 0.00010492179172775797, + "loss": 1.3497, + "step": 13956 + }, + { + "epoch": 0.49982989238455067, + "grad_norm": 1.6548429727554321, + "learning_rate": 0.00010491020673900256, + "loss": 1.4613, + "step": 13957 + }, + { + "epoch": 0.49986570451411894, + "grad_norm": 1.5173407793045044, + "learning_rate": 0.00010489862168418667, + "loss": 1.237, + "step": 13958 + }, + { + "epoch": 0.4999015166436872, + "grad_norm": 1.6748205423355103, + "learning_rate": 0.00010488703656346612, + "loss": 1.6796, + "step": 13959 + }, + { + "epoch": 0.49993732877325553, + "grad_norm": 2.2903249263763428, + "learning_rate": 0.00010487545137699682, + "loss": 1.3784, + "step": 13960 + }, + { + "epoch": 0.4999731409028238, + "grad_norm": 1.832918643951416, + "learning_rate": 0.00010486386612493458, + "loss": 1.4155, + "step": 13961 + }, + { + "epoch": 0.5000089530323921, + "grad_norm": 1.8992984294891357, + "learning_rate": 0.00010485228080743532, + "loss": 1.3437, + "step": 13962 + }, + { + "epoch": 0.5000447651619604, + "grad_norm": 1.5817441940307617, + "learning_rate": 0.00010484069542465484, + "loss": 1.2825, + "step": 13963 + }, + { + "epoch": 0.5000805772915287, + "grad_norm": 1.7901121377944946, + "learning_rate": 0.00010482910997674911, + "loss": 1.4525, + "step": 13964 + }, + { + "epoch": 0.5001163894210969, + "grad_norm": 1.9378963708877563, + "learning_rate": 0.00010481752446387387, + "loss": 1.5827, + "step": 13965 + }, + { + "epoch": 0.5001522015506652, + "grad_norm": 1.3163588047027588, + "learning_rate": 0.0001048059388861851, + "loss": 1.3767, + "step": 13966 + }, + { + "epoch": 0.5001880136802335, + "grad_norm": 2.0047686100006104, + "learning_rate": 0.00010479435324383861, + "loss": 1.4476, + "step": 13967 + }, + { + "epoch": 0.5002238258098017, + "grad_norm": 1.7671470642089844, + "learning_rate": 0.00010478276753699028, + "loss": 1.523, + "step": 13968 + }, + { + "epoch": 0.5002596379393701, + "grad_norm": 1.4821014404296875, + "learning_rate": 0.00010477118176579597, + "loss": 1.4437, + "step": 13969 + }, + { + "epoch": 0.5002954500689384, + "grad_norm": 2.142829656600952, + "learning_rate": 0.00010475959593041156, + "loss": 1.5739, + "step": 13970 + }, + { + "epoch": 0.5003312621985067, + "grad_norm": 2.060955047607422, + "learning_rate": 0.00010474801003099294, + "loss": 1.5802, + "step": 13971 + }, + { + "epoch": 0.5003670743280749, + "grad_norm": 1.8736474514007568, + "learning_rate": 0.00010473642406769597, + "loss": 1.2844, + "step": 13972 + }, + { + "epoch": 0.5004028864576432, + "grad_norm": 1.4875705242156982, + "learning_rate": 0.00010472483804067652, + "loss": 1.0674, + "step": 13973 + }, + { + "epoch": 0.5004386985872115, + "grad_norm": 1.404869556427002, + "learning_rate": 0.00010471325195009047, + "loss": 1.6174, + "step": 13974 + }, + { + "epoch": 0.5004745107167797, + "grad_norm": 1.461624264717102, + "learning_rate": 0.00010470166579609371, + "loss": 1.5879, + "step": 13975 + }, + { + "epoch": 0.5005103228463481, + "grad_norm": 1.3409490585327148, + "learning_rate": 0.0001046900795788421, + "loss": 1.4595, + "step": 13976 + }, + { + "epoch": 0.5005461349759164, + "grad_norm": 1.7915490865707397, + "learning_rate": 0.00010467849329849148, + "loss": 1.5817, + "step": 13977 + }, + { + "epoch": 0.5005819471054846, + "grad_norm": 1.4532082080841064, + "learning_rate": 0.00010466690695519781, + "loss": 1.4423, + "step": 13978 + }, + { + "epoch": 0.5006177592350529, + "grad_norm": 1.5998505353927612, + "learning_rate": 0.00010465532054911689, + "loss": 1.564, + "step": 13979 + }, + { + "epoch": 0.5006535713646212, + "grad_norm": 1.4868236780166626, + "learning_rate": 0.00010464373408040467, + "loss": 1.6871, + "step": 13980 + }, + { + "epoch": 0.5006893834941895, + "grad_norm": 1.7556103467941284, + "learning_rate": 0.00010463214754921697, + "loss": 1.4825, + "step": 13981 + }, + { + "epoch": 0.5007251956237577, + "grad_norm": 2.2433080673217773, + "learning_rate": 0.00010462056095570974, + "loss": 1.5892, + "step": 13982 + }, + { + "epoch": 0.5007610077533261, + "grad_norm": 1.4293811321258545, + "learning_rate": 0.00010460897430003877, + "loss": 1.4694, + "step": 13983 + }, + { + "epoch": 0.5007968198828944, + "grad_norm": 2.3637685775756836, + "learning_rate": 0.00010459738758236006, + "loss": 2.1556, + "step": 13984 + }, + { + "epoch": 0.5008326320124626, + "grad_norm": 1.4137765169143677, + "learning_rate": 0.00010458580080282938, + "loss": 1.5547, + "step": 13985 + }, + { + "epoch": 0.5008684441420309, + "grad_norm": 1.7568750381469727, + "learning_rate": 0.00010457421396160265, + "loss": 1.4713, + "step": 13986 + }, + { + "epoch": 0.5009042562715992, + "grad_norm": 1.891427755355835, + "learning_rate": 0.00010456262705883581, + "loss": 1.0563, + "step": 13987 + }, + { + "epoch": 0.5009400684011674, + "grad_norm": 1.6349992752075195, + "learning_rate": 0.0001045510400946847, + "loss": 1.4901, + "step": 13988 + }, + { + "epoch": 0.5009758805307357, + "grad_norm": 1.4608160257339478, + "learning_rate": 0.00010453945306930521, + "loss": 1.507, + "step": 13989 + }, + { + "epoch": 0.5010116926603041, + "grad_norm": 1.6028715372085571, + "learning_rate": 0.00010452786598285323, + "loss": 1.6952, + "step": 13990 + }, + { + "epoch": 0.5010475047898724, + "grad_norm": 1.4232906103134155, + "learning_rate": 0.00010451627883548468, + "loss": 1.4949, + "step": 13991 + }, + { + "epoch": 0.5010833169194406, + "grad_norm": 1.2715191841125488, + "learning_rate": 0.00010450469162735539, + "loss": 1.3847, + "step": 13992 + }, + { + "epoch": 0.5011191290490089, + "grad_norm": 1.617362380027771, + "learning_rate": 0.00010449310435862134, + "loss": 1.508, + "step": 13993 + }, + { + "epoch": 0.5011549411785772, + "grad_norm": 1.5455505847930908, + "learning_rate": 0.00010448151702943831, + "loss": 1.557, + "step": 13994 + }, + { + "epoch": 0.5011907533081454, + "grad_norm": 1.6332513093948364, + "learning_rate": 0.00010446992963996227, + "loss": 1.5056, + "step": 13995 + }, + { + "epoch": 0.5012265654377137, + "grad_norm": 1.7719979286193848, + "learning_rate": 0.00010445834219034909, + "loss": 1.6498, + "step": 13996 + }, + { + "epoch": 0.5012623775672821, + "grad_norm": 1.3904492855072021, + "learning_rate": 0.00010444675468075467, + "loss": 1.4026, + "step": 13997 + }, + { + "epoch": 0.5012981896968504, + "grad_norm": 1.3235598802566528, + "learning_rate": 0.00010443516711133487, + "loss": 1.4448, + "step": 13998 + }, + { + "epoch": 0.5013340018264186, + "grad_norm": 1.7896219491958618, + "learning_rate": 0.00010442357948224564, + "loss": 1.6782, + "step": 13999 + }, + { + "epoch": 0.5013698139559869, + "grad_norm": 1.3523547649383545, + "learning_rate": 0.00010441199179364287, + "loss": 1.0761, + "step": 14000 + }, + { + "epoch": 0.5014056260855552, + "grad_norm": 1.9005093574523926, + "learning_rate": 0.00010440040404568241, + "loss": 1.5543, + "step": 14001 + }, + { + "epoch": 0.5014414382151234, + "grad_norm": 1.3568506240844727, + "learning_rate": 0.00010438881623852026, + "loss": 1.6062, + "step": 14002 + }, + { + "epoch": 0.5014772503446917, + "grad_norm": 1.6488691568374634, + "learning_rate": 0.00010437722837231218, + "loss": 1.7009, + "step": 14003 + }, + { + "epoch": 0.5015130624742601, + "grad_norm": 1.5517204999923706, + "learning_rate": 0.00010436564044721415, + "loss": 1.7509, + "step": 14004 + }, + { + "epoch": 0.5015488746038284, + "grad_norm": 1.2834587097167969, + "learning_rate": 0.00010435405246338205, + "loss": 1.4208, + "step": 14005 + }, + { + "epoch": 0.5015846867333966, + "grad_norm": 1.3293063640594482, + "learning_rate": 0.00010434246442097184, + "loss": 1.4019, + "step": 14006 + }, + { + "epoch": 0.5016204988629649, + "grad_norm": 2.0238394737243652, + "learning_rate": 0.00010433087632013931, + "loss": 1.3617, + "step": 14007 + }, + { + "epoch": 0.5016563109925332, + "grad_norm": 1.7734540700912476, + "learning_rate": 0.00010431928816104048, + "loss": 1.3808, + "step": 14008 + }, + { + "epoch": 0.5016921231221014, + "grad_norm": 1.458070158958435, + "learning_rate": 0.00010430769994383116, + "loss": 1.4744, + "step": 14009 + }, + { + "epoch": 0.5017279352516697, + "grad_norm": 1.3446053266525269, + "learning_rate": 0.0001042961116686673, + "loss": 1.4234, + "step": 14010 + }, + { + "epoch": 0.5017637473812381, + "grad_norm": 2.673496723175049, + "learning_rate": 0.00010428452333570482, + "loss": 1.9296, + "step": 14011 + }, + { + "epoch": 0.5017995595108063, + "grad_norm": 2.1430649757385254, + "learning_rate": 0.0001042729349450996, + "loss": 1.4881, + "step": 14012 + }, + { + "epoch": 0.5018353716403746, + "grad_norm": 1.6296019554138184, + "learning_rate": 0.00010426134649700754, + "loss": 1.1715, + "step": 14013 + }, + { + "epoch": 0.5018711837699429, + "grad_norm": 1.853363275527954, + "learning_rate": 0.00010424975799158456, + "loss": 1.3376, + "step": 14014 + }, + { + "epoch": 0.5019069958995112, + "grad_norm": 1.6113402843475342, + "learning_rate": 0.00010423816942898659, + "loss": 1.6665, + "step": 14015 + }, + { + "epoch": 0.5019428080290794, + "grad_norm": 1.607465147972107, + "learning_rate": 0.00010422658080936947, + "loss": 1.5115, + "step": 14016 + }, + { + "epoch": 0.5019786201586477, + "grad_norm": 1.796221375465393, + "learning_rate": 0.00010421499213288919, + "loss": 1.5607, + "step": 14017 + }, + { + "epoch": 0.5020144322882161, + "grad_norm": 2.2916972637176514, + "learning_rate": 0.00010420340339970163, + "loss": 1.6604, + "step": 14018 + }, + { + "epoch": 0.5020502444177843, + "grad_norm": 1.6994593143463135, + "learning_rate": 0.0001041918146099627, + "loss": 1.5249, + "step": 14019 + }, + { + "epoch": 0.5020860565473526, + "grad_norm": 1.3633710145950317, + "learning_rate": 0.00010418022576382831, + "loss": 1.2622, + "step": 14020 + }, + { + "epoch": 0.5021218686769209, + "grad_norm": 2.406529426574707, + "learning_rate": 0.00010416863686145434, + "loss": 1.3539, + "step": 14021 + }, + { + "epoch": 0.5021576808064891, + "grad_norm": 1.4602991342544556, + "learning_rate": 0.00010415704790299678, + "loss": 1.3125, + "step": 14022 + }, + { + "epoch": 0.5021934929360574, + "grad_norm": 1.564888596534729, + "learning_rate": 0.00010414545888861149, + "loss": 1.4893, + "step": 14023 + }, + { + "epoch": 0.5022293050656257, + "grad_norm": 1.9032400846481323, + "learning_rate": 0.0001041338698184544, + "loss": 1.425, + "step": 14024 + }, + { + "epoch": 0.5022651171951941, + "grad_norm": 2.332868814468384, + "learning_rate": 0.00010412228069268142, + "loss": 1.4777, + "step": 14025 + }, + { + "epoch": 0.5023009293247623, + "grad_norm": 1.6929960250854492, + "learning_rate": 0.00010411069151144848, + "loss": 1.729, + "step": 14026 + }, + { + "epoch": 0.5023367414543306, + "grad_norm": 2.2395436763763428, + "learning_rate": 0.00010409910227491146, + "loss": 1.6846, + "step": 14027 + }, + { + "epoch": 0.5023725535838989, + "grad_norm": 1.671336054801941, + "learning_rate": 0.00010408751298322634, + "loss": 1.2541, + "step": 14028 + }, + { + "epoch": 0.5024083657134671, + "grad_norm": 1.7518086433410645, + "learning_rate": 0.00010407592363654901, + "loss": 1.3721, + "step": 14029 + }, + { + "epoch": 0.5024441778430354, + "grad_norm": 2.0253491401672363, + "learning_rate": 0.00010406433423503534, + "loss": 1.5186, + "step": 14030 + }, + { + "epoch": 0.5024799899726037, + "grad_norm": 1.4864667654037476, + "learning_rate": 0.00010405274477884135, + "loss": 1.1936, + "step": 14031 + }, + { + "epoch": 0.5025158021021721, + "grad_norm": 1.1731634140014648, + "learning_rate": 0.00010404115526812286, + "loss": 1.4125, + "step": 14032 + }, + { + "epoch": 0.5025516142317403, + "grad_norm": 1.3496315479278564, + "learning_rate": 0.00010402956570303586, + "loss": 1.3462, + "step": 14033 + }, + { + "epoch": 0.5025874263613086, + "grad_norm": 1.6473044157028198, + "learning_rate": 0.00010401797608373625, + "loss": 1.7143, + "step": 14034 + }, + { + "epoch": 0.5026232384908769, + "grad_norm": 2.9563591480255127, + "learning_rate": 0.00010400638641037996, + "loss": 1.505, + "step": 14035 + }, + { + "epoch": 0.5026590506204451, + "grad_norm": 1.4683979749679565, + "learning_rate": 0.00010399479668312288, + "loss": 1.5082, + "step": 14036 + }, + { + "epoch": 0.5026948627500134, + "grad_norm": 1.4357550144195557, + "learning_rate": 0.00010398320690212102, + "loss": 1.4316, + "step": 14037 + }, + { + "epoch": 0.5027306748795817, + "grad_norm": 1.8764081001281738, + "learning_rate": 0.00010397161706753021, + "loss": 1.4092, + "step": 14038 + }, + { + "epoch": 0.50276648700915, + "grad_norm": 1.5990819931030273, + "learning_rate": 0.00010396002717950644, + "loss": 1.4917, + "step": 14039 + }, + { + "epoch": 0.5028022991387183, + "grad_norm": 1.571290373802185, + "learning_rate": 0.00010394843723820558, + "loss": 1.3678, + "step": 14040 + }, + { + "epoch": 0.5028381112682866, + "grad_norm": 2.3405323028564453, + "learning_rate": 0.00010393684724378358, + "loss": 1.4077, + "step": 14041 + }, + { + "epoch": 0.5028739233978549, + "grad_norm": 1.5465885400772095, + "learning_rate": 0.00010392525719639642, + "loss": 0.9557, + "step": 14042 + }, + { + "epoch": 0.5029097355274231, + "grad_norm": 1.379290223121643, + "learning_rate": 0.00010391366709619994, + "loss": 1.3364, + "step": 14043 + }, + { + "epoch": 0.5029455476569914, + "grad_norm": 1.6196434497833252, + "learning_rate": 0.00010390207694335017, + "loss": 1.4637, + "step": 14044 + }, + { + "epoch": 0.5029813597865597, + "grad_norm": 1.7534099817276, + "learning_rate": 0.00010389048673800294, + "loss": 1.4994, + "step": 14045 + }, + { + "epoch": 0.503017171916128, + "grad_norm": 2.0215682983398438, + "learning_rate": 0.00010387889648031428, + "loss": 1.5993, + "step": 14046 + }, + { + "epoch": 0.5030529840456963, + "grad_norm": 1.6501681804656982, + "learning_rate": 0.00010386730617044005, + "loss": 1.3859, + "step": 14047 + }, + { + "epoch": 0.5030887961752646, + "grad_norm": 1.9652694463729858, + "learning_rate": 0.0001038557158085362, + "loss": 1.6148, + "step": 14048 + }, + { + "epoch": 0.5031246083048329, + "grad_norm": 1.6333247423171997, + "learning_rate": 0.00010384412539475865, + "loss": 1.1812, + "step": 14049 + }, + { + "epoch": 0.5031604204344011, + "grad_norm": 1.772522211074829, + "learning_rate": 0.00010383253492926339, + "loss": 1.6932, + "step": 14050 + }, + { + "epoch": 0.5031962325639694, + "grad_norm": 1.6307519674301147, + "learning_rate": 0.00010382094441220627, + "loss": 1.4595, + "step": 14051 + }, + { + "epoch": 0.5032320446935377, + "grad_norm": 1.7731724977493286, + "learning_rate": 0.00010380935384374331, + "loss": 1.5997, + "step": 14052 + }, + { + "epoch": 0.503267856823106, + "grad_norm": 1.5439388751983643, + "learning_rate": 0.00010379776322403039, + "loss": 1.2718, + "step": 14053 + }, + { + "epoch": 0.5033036689526743, + "grad_norm": 2.08516001701355, + "learning_rate": 0.00010378617255322344, + "loss": 1.1215, + "step": 14054 + }, + { + "epoch": 0.5033394810822426, + "grad_norm": 1.9739441871643066, + "learning_rate": 0.00010377458183147848, + "loss": 1.5623, + "step": 14055 + }, + { + "epoch": 0.5033752932118108, + "grad_norm": 1.473125696182251, + "learning_rate": 0.00010376299105895135, + "loss": 1.4087, + "step": 14056 + }, + { + "epoch": 0.5034111053413791, + "grad_norm": 1.925338625907898, + "learning_rate": 0.00010375140023579805, + "loss": 1.5258, + "step": 14057 + }, + { + "epoch": 0.5034469174709474, + "grad_norm": 1.9338470697402954, + "learning_rate": 0.0001037398093621745, + "loss": 1.7284, + "step": 14058 + }, + { + "epoch": 0.5034827296005157, + "grad_norm": 1.9991499185562134, + "learning_rate": 0.00010372821843823661, + "loss": 1.5905, + "step": 14059 + }, + { + "epoch": 0.503518541730084, + "grad_norm": 1.4696353673934937, + "learning_rate": 0.00010371662746414037, + "loss": 1.4171, + "step": 14060 + }, + { + "epoch": 0.5035543538596523, + "grad_norm": 1.5892161130905151, + "learning_rate": 0.00010370503644004171, + "loss": 1.5635, + "step": 14061 + }, + { + "epoch": 0.5035901659892206, + "grad_norm": 1.718597650527954, + "learning_rate": 0.00010369344536609653, + "loss": 1.7008, + "step": 14062 + }, + { + "epoch": 0.5036259781187888, + "grad_norm": 1.6046531200408936, + "learning_rate": 0.00010368185424246084, + "loss": 1.7398, + "step": 14063 + }, + { + "epoch": 0.5036617902483571, + "grad_norm": 1.702903389930725, + "learning_rate": 0.00010367026306929056, + "loss": 1.6981, + "step": 14064 + }, + { + "epoch": 0.5036976023779254, + "grad_norm": 2.5990512371063232, + "learning_rate": 0.00010365867184674159, + "loss": 1.39, + "step": 14065 + }, + { + "epoch": 0.5037334145074936, + "grad_norm": 1.5704472064971924, + "learning_rate": 0.00010364708057496992, + "loss": 1.6272, + "step": 14066 + }, + { + "epoch": 0.5037692266370619, + "grad_norm": 1.577022671699524, + "learning_rate": 0.00010363548925413149, + "loss": 1.2842, + "step": 14067 + }, + { + "epoch": 0.5038050387666303, + "grad_norm": 2.005908250808716, + "learning_rate": 0.00010362389788438225, + "loss": 1.2809, + "step": 14068 + }, + { + "epoch": 0.5038408508961986, + "grad_norm": 2.519007444381714, + "learning_rate": 0.00010361230646587812, + "loss": 1.509, + "step": 14069 + }, + { + "epoch": 0.5038766630257668, + "grad_norm": 2.2054390907287598, + "learning_rate": 0.00010360071499877508, + "loss": 1.4182, + "step": 14070 + }, + { + "epoch": 0.5039124751553351, + "grad_norm": 1.6459708213806152, + "learning_rate": 0.00010358912348322904, + "loss": 1.1879, + "step": 14071 + }, + { + "epoch": 0.5039482872849034, + "grad_norm": 2.3594183921813965, + "learning_rate": 0.00010357753191939601, + "loss": 1.7322, + "step": 14072 + }, + { + "epoch": 0.5039840994144716, + "grad_norm": 1.8860596418380737, + "learning_rate": 0.0001035659403074319, + "loss": 1.8158, + "step": 14073 + }, + { + "epoch": 0.5040199115440399, + "grad_norm": 1.5959560871124268, + "learning_rate": 0.00010355434864749262, + "loss": 1.337, + "step": 14074 + }, + { + "epoch": 0.5040557236736083, + "grad_norm": 1.7443625926971436, + "learning_rate": 0.0001035427569397342, + "loss": 1.4158, + "step": 14075 + }, + { + "epoch": 0.5040915358031766, + "grad_norm": 1.536818027496338, + "learning_rate": 0.00010353116518431254, + "loss": 1.351, + "step": 14076 + }, + { + "epoch": 0.5041273479327448, + "grad_norm": 1.3087923526763916, + "learning_rate": 0.00010351957338138363, + "loss": 1.7281, + "step": 14077 + }, + { + "epoch": 0.5041631600623131, + "grad_norm": 1.8406116962432861, + "learning_rate": 0.00010350798153110337, + "loss": 1.114, + "step": 14078 + }, + { + "epoch": 0.5041989721918814, + "grad_norm": 1.7270954847335815, + "learning_rate": 0.00010349638963362777, + "loss": 1.6466, + "step": 14079 + }, + { + "epoch": 0.5042347843214496, + "grad_norm": 2.479389190673828, + "learning_rate": 0.00010348479768911272, + "loss": 1.2647, + "step": 14080 + }, + { + "epoch": 0.5042705964510179, + "grad_norm": 1.7443976402282715, + "learning_rate": 0.00010347320569771428, + "loss": 1.6703, + "step": 14081 + }, + { + "epoch": 0.5043064085805863, + "grad_norm": 1.6824451684951782, + "learning_rate": 0.00010346161365958829, + "loss": 1.1959, + "step": 14082 + }, + { + "epoch": 0.5043422207101546, + "grad_norm": 1.8849799633026123, + "learning_rate": 0.00010345002157489074, + "loss": 1.3789, + "step": 14083 + }, + { + "epoch": 0.5043780328397228, + "grad_norm": 1.5173338651657104, + "learning_rate": 0.00010343842944377764, + "loss": 1.7663, + "step": 14084 + }, + { + "epoch": 0.5044138449692911, + "grad_norm": 1.5596895217895508, + "learning_rate": 0.00010342683726640487, + "loss": 1.4884, + "step": 14085 + }, + { + "epoch": 0.5044496570988594, + "grad_norm": 1.4701156616210938, + "learning_rate": 0.00010341524504292845, + "loss": 1.5526, + "step": 14086 + }, + { + "epoch": 0.5044854692284276, + "grad_norm": 1.7516635656356812, + "learning_rate": 0.00010340365277350428, + "loss": 1.4059, + "step": 14087 + }, + { + "epoch": 0.5045212813579959, + "grad_norm": 1.8188012838363647, + "learning_rate": 0.0001033920604582884, + "loss": 1.4001, + "step": 14088 + }, + { + "epoch": 0.5045570934875643, + "grad_norm": 2.2817301750183105, + "learning_rate": 0.00010338046809743668, + "loss": 1.162, + "step": 14089 + }, + { + "epoch": 0.5045929056171325, + "grad_norm": 1.7542790174484253, + "learning_rate": 0.00010336887569110518, + "loss": 1.505, + "step": 14090 + }, + { + "epoch": 0.5046287177467008, + "grad_norm": 1.8242846727371216, + "learning_rate": 0.00010335728323944974, + "loss": 1.319, + "step": 14091 + }, + { + "epoch": 0.5046645298762691, + "grad_norm": 1.5669910907745361, + "learning_rate": 0.00010334569074262641, + "loss": 1.4626, + "step": 14092 + }, + { + "epoch": 0.5047003420058374, + "grad_norm": 1.8271517753601074, + "learning_rate": 0.00010333409820079112, + "loss": 1.4747, + "step": 14093 + }, + { + "epoch": 0.5047361541354056, + "grad_norm": 1.7885736227035522, + "learning_rate": 0.00010332250561409986, + "loss": 1.4699, + "step": 14094 + }, + { + "epoch": 0.5047719662649739, + "grad_norm": 1.3920366764068604, + "learning_rate": 0.00010331091298270854, + "loss": 1.4448, + "step": 14095 + }, + { + "epoch": 0.5048077783945423, + "grad_norm": 2.2439980506896973, + "learning_rate": 0.00010329932030677316, + "loss": 1.6206, + "step": 14096 + }, + { + "epoch": 0.5048435905241105, + "grad_norm": 1.532996654510498, + "learning_rate": 0.00010328772758644971, + "loss": 1.4726, + "step": 14097 + }, + { + "epoch": 0.5048794026536788, + "grad_norm": 1.5202641487121582, + "learning_rate": 0.00010327613482189409, + "loss": 1.3134, + "step": 14098 + }, + { + "epoch": 0.5049152147832471, + "grad_norm": 2.1792054176330566, + "learning_rate": 0.00010326454201326236, + "loss": 1.3475, + "step": 14099 + }, + { + "epoch": 0.5049510269128153, + "grad_norm": 1.6390247344970703, + "learning_rate": 0.00010325294916071038, + "loss": 1.262, + "step": 14100 + }, + { + "epoch": 0.5049868390423836, + "grad_norm": 1.5163406133651733, + "learning_rate": 0.00010324135626439419, + "loss": 1.4472, + "step": 14101 + }, + { + "epoch": 0.5050226511719519, + "grad_norm": 1.6749697923660278, + "learning_rate": 0.0001032297633244697, + "loss": 1.4647, + "step": 14102 + }, + { + "epoch": 0.5050584633015203, + "grad_norm": 1.6296559572219849, + "learning_rate": 0.00010321817034109293, + "loss": 1.1982, + "step": 14103 + }, + { + "epoch": 0.5050942754310885, + "grad_norm": 1.5373493432998657, + "learning_rate": 0.00010320657731441982, + "loss": 1.4018, + "step": 14104 + }, + { + "epoch": 0.5051300875606568, + "grad_norm": 1.7386698722839355, + "learning_rate": 0.00010319498424460636, + "loss": 1.302, + "step": 14105 + }, + { + "epoch": 0.5051658996902251, + "grad_norm": 2.1452550888061523, + "learning_rate": 0.0001031833911318085, + "loss": 1.5013, + "step": 14106 + }, + { + "epoch": 0.5052017118197933, + "grad_norm": 1.7856847047805786, + "learning_rate": 0.00010317179797618223, + "loss": 1.5817, + "step": 14107 + }, + { + "epoch": 0.5052375239493616, + "grad_norm": 1.160574197769165, + "learning_rate": 0.00010316020477788353, + "loss": 1.6855, + "step": 14108 + }, + { + "epoch": 0.5052733360789299, + "grad_norm": 1.5507274866104126, + "learning_rate": 0.0001031486115370683, + "loss": 1.6322, + "step": 14109 + }, + { + "epoch": 0.5053091482084983, + "grad_norm": 1.567002296447754, + "learning_rate": 0.00010313701825389259, + "loss": 1.4388, + "step": 14110 + }, + { + "epoch": 0.5053449603380665, + "grad_norm": 1.6256871223449707, + "learning_rate": 0.00010312542492851234, + "loss": 1.1, + "step": 14111 + }, + { + "epoch": 0.5053807724676348, + "grad_norm": 1.9150153398513794, + "learning_rate": 0.00010311383156108354, + "loss": 1.8181, + "step": 14112 + }, + { + "epoch": 0.5054165845972031, + "grad_norm": 1.7468947172164917, + "learning_rate": 0.00010310223815176215, + "loss": 1.3511, + "step": 14113 + }, + { + "epoch": 0.5054523967267713, + "grad_norm": 1.5880811214447021, + "learning_rate": 0.00010309064470070414, + "loss": 1.5384, + "step": 14114 + }, + { + "epoch": 0.5054882088563396, + "grad_norm": 1.6013243198394775, + "learning_rate": 0.00010307905120806549, + "loss": 1.4457, + "step": 14115 + }, + { + "epoch": 0.5055240209859079, + "grad_norm": 1.7990401983261108, + "learning_rate": 0.00010306745767400219, + "loss": 1.4054, + "step": 14116 + }, + { + "epoch": 0.5055598331154763, + "grad_norm": 2.4490861892700195, + "learning_rate": 0.0001030558640986702, + "loss": 1.6798, + "step": 14117 + }, + { + "epoch": 0.5055956452450445, + "grad_norm": 1.4762895107269287, + "learning_rate": 0.0001030442704822255, + "loss": 1.1657, + "step": 14118 + }, + { + "epoch": 0.5056314573746128, + "grad_norm": 1.4917511940002441, + "learning_rate": 0.00010303267682482405, + "loss": 1.4921, + "step": 14119 + }, + { + "epoch": 0.5056672695041811, + "grad_norm": 1.858914852142334, + "learning_rate": 0.00010302108312662184, + "loss": 1.2912, + "step": 14120 + }, + { + "epoch": 0.5057030816337493, + "grad_norm": 2.0417349338531494, + "learning_rate": 0.00010300948938777491, + "loss": 1.7917, + "step": 14121 + }, + { + "epoch": 0.5057388937633176, + "grad_norm": 1.6435681581497192, + "learning_rate": 0.00010299789560843911, + "loss": 1.3145, + "step": 14122 + }, + { + "epoch": 0.5057747058928859, + "grad_norm": 1.7804651260375977, + "learning_rate": 0.00010298630178877053, + "loss": 1.4567, + "step": 14123 + }, + { + "epoch": 0.5058105180224542, + "grad_norm": 1.9142791032791138, + "learning_rate": 0.00010297470792892512, + "loss": 1.1872, + "step": 14124 + }, + { + "epoch": 0.5058463301520225, + "grad_norm": 2.6280245780944824, + "learning_rate": 0.00010296311402905884, + "loss": 1.3906, + "step": 14125 + }, + { + "epoch": 0.5058821422815908, + "grad_norm": 1.2249655723571777, + "learning_rate": 0.0001029515200893277, + "loss": 1.4255, + "step": 14126 + }, + { + "epoch": 0.505917954411159, + "grad_norm": 1.620898962020874, + "learning_rate": 0.00010293992610988763, + "loss": 1.3322, + "step": 14127 + }, + { + "epoch": 0.5059537665407273, + "grad_norm": 1.8030281066894531, + "learning_rate": 0.00010292833209089467, + "loss": 1.2237, + "step": 14128 + }, + { + "epoch": 0.5059895786702956, + "grad_norm": 1.8104889392852783, + "learning_rate": 0.00010291673803250477, + "loss": 1.4957, + "step": 14129 + }, + { + "epoch": 0.5060253907998639, + "grad_norm": 1.5003832578659058, + "learning_rate": 0.00010290514393487391, + "loss": 1.5665, + "step": 14130 + }, + { + "epoch": 0.5060612029294322, + "grad_norm": 1.9296363592147827, + "learning_rate": 0.00010289354979815811, + "loss": 1.5859, + "step": 14131 + }, + { + "epoch": 0.5060970150590005, + "grad_norm": 1.9950153827667236, + "learning_rate": 0.00010288195562251332, + "loss": 1.5064, + "step": 14132 + }, + { + "epoch": 0.5061328271885688, + "grad_norm": 2.030654191970825, + "learning_rate": 0.00010287036140809552, + "loss": 1.5293, + "step": 14133 + }, + { + "epoch": 0.506168639318137, + "grad_norm": 1.89315664768219, + "learning_rate": 0.00010285876715506076, + "loss": 1.3902, + "step": 14134 + }, + { + "epoch": 0.5062044514477053, + "grad_norm": 1.6932226419448853, + "learning_rate": 0.00010284717286356493, + "loss": 1.3832, + "step": 14135 + }, + { + "epoch": 0.5062402635772736, + "grad_norm": 1.2903110980987549, + "learning_rate": 0.00010283557853376408, + "loss": 1.3911, + "step": 14136 + }, + { + "epoch": 0.5062760757068419, + "grad_norm": 1.9288479089736938, + "learning_rate": 0.00010282398416581415, + "loss": 1.7958, + "step": 14137 + }, + { + "epoch": 0.5063118878364102, + "grad_norm": 1.6040328741073608, + "learning_rate": 0.00010281238975987118, + "loss": 1.5238, + "step": 14138 + }, + { + "epoch": 0.5063476999659785, + "grad_norm": 2.265596866607666, + "learning_rate": 0.00010280079531609112, + "loss": 1.4235, + "step": 14139 + }, + { + "epoch": 0.5063835120955468, + "grad_norm": 1.681878685951233, + "learning_rate": 0.00010278920083462997, + "loss": 1.6886, + "step": 14140 + }, + { + "epoch": 0.506419324225115, + "grad_norm": 2.133225917816162, + "learning_rate": 0.00010277760631564375, + "loss": 1.2276, + "step": 14141 + }, + { + "epoch": 0.5064551363546833, + "grad_norm": 1.6065014600753784, + "learning_rate": 0.00010276601175928839, + "loss": 1.4719, + "step": 14142 + }, + { + "epoch": 0.5064909484842516, + "grad_norm": 1.765504240989685, + "learning_rate": 0.00010275441716571996, + "loss": 1.3059, + "step": 14143 + }, + { + "epoch": 0.5065267606138198, + "grad_norm": 1.4708679914474487, + "learning_rate": 0.00010274282253509436, + "loss": 1.4785, + "step": 14144 + }, + { + "epoch": 0.5065625727433882, + "grad_norm": 1.4955041408538818, + "learning_rate": 0.00010273122786756762, + "loss": 1.5128, + "step": 14145 + }, + { + "epoch": 0.5065983848729565, + "grad_norm": 1.961424469947815, + "learning_rate": 0.00010271963316329571, + "loss": 1.4041, + "step": 14146 + }, + { + "epoch": 0.5066341970025248, + "grad_norm": 1.4497519731521606, + "learning_rate": 0.00010270803842243469, + "loss": 1.7535, + "step": 14147 + }, + { + "epoch": 0.506670009132093, + "grad_norm": 1.6153457164764404, + "learning_rate": 0.00010269644364514046, + "loss": 1.5254, + "step": 14148 + }, + { + "epoch": 0.5067058212616613, + "grad_norm": 1.7884387969970703, + "learning_rate": 0.0001026848488315691, + "loss": 1.5087, + "step": 14149 + }, + { + "epoch": 0.5067416333912296, + "grad_norm": 1.7952286005020142, + "learning_rate": 0.00010267325398187653, + "loss": 1.4182, + "step": 14150 + }, + { + "epoch": 0.5067774455207978, + "grad_norm": 2.6730382442474365, + "learning_rate": 0.00010266165909621879, + "loss": 1.722, + "step": 14151 + }, + { + "epoch": 0.5068132576503662, + "grad_norm": 1.7425639629364014, + "learning_rate": 0.00010265006417475189, + "loss": 1.6575, + "step": 14152 + }, + { + "epoch": 0.5068490697799345, + "grad_norm": 1.971174955368042, + "learning_rate": 0.00010263846921763174, + "loss": 1.3717, + "step": 14153 + }, + { + "epoch": 0.5068848819095028, + "grad_norm": 1.8181272745132446, + "learning_rate": 0.00010262687422501442, + "loss": 1.2995, + "step": 14154 + }, + { + "epoch": 0.506920694039071, + "grad_norm": 1.9725772142410278, + "learning_rate": 0.00010261527919705589, + "loss": 1.7611, + "step": 14155 + }, + { + "epoch": 0.5069565061686393, + "grad_norm": 2.269573926925659, + "learning_rate": 0.00010260368413391217, + "loss": 1.5659, + "step": 14156 + }, + { + "epoch": 0.5069923182982076, + "grad_norm": 1.5113763809204102, + "learning_rate": 0.0001025920890357392, + "loss": 1.4647, + "step": 14157 + }, + { + "epoch": 0.5070281304277758, + "grad_norm": 1.9952131509780884, + "learning_rate": 0.00010258049390269305, + "loss": 1.5054, + "step": 14158 + }, + { + "epoch": 0.5070639425573442, + "grad_norm": 1.6506963968276978, + "learning_rate": 0.00010256889873492966, + "loss": 1.5954, + "step": 14159 + }, + { + "epoch": 0.5070997546869125, + "grad_norm": 1.6681649684906006, + "learning_rate": 0.00010255730353260507, + "loss": 1.2699, + "step": 14160 + }, + { + "epoch": 0.5071355668164808, + "grad_norm": 1.7544012069702148, + "learning_rate": 0.00010254570829587527, + "loss": 1.2456, + "step": 14161 + }, + { + "epoch": 0.507171378946049, + "grad_norm": 1.9082049131393433, + "learning_rate": 0.00010253411302489622, + "loss": 1.3864, + "step": 14162 + }, + { + "epoch": 0.5072071910756173, + "grad_norm": 5.814779758453369, + "learning_rate": 0.00010252251771982395, + "loss": 1.5455, + "step": 14163 + }, + { + "epoch": 0.5072430032051856, + "grad_norm": 1.5114103555679321, + "learning_rate": 0.00010251092238081446, + "loss": 1.7425, + "step": 14164 + }, + { + "epoch": 0.5072788153347538, + "grad_norm": 1.5640950202941895, + "learning_rate": 0.00010249932700802376, + "loss": 1.5903, + "step": 14165 + }, + { + "epoch": 0.5073146274643222, + "grad_norm": 2.070931911468506, + "learning_rate": 0.00010248773160160782, + "loss": 1.5774, + "step": 14166 + }, + { + "epoch": 0.5073504395938905, + "grad_norm": 1.4230797290802002, + "learning_rate": 0.0001024761361617227, + "loss": 1.6921, + "step": 14167 + }, + { + "epoch": 0.5073862517234587, + "grad_norm": 1.4877146482467651, + "learning_rate": 0.00010246454068852431, + "loss": 1.3322, + "step": 14168 + }, + { + "epoch": 0.507422063853027, + "grad_norm": 2.0262210369110107, + "learning_rate": 0.00010245294518216875, + "loss": 1.4121, + "step": 14169 + }, + { + "epoch": 0.5074578759825953, + "grad_norm": 1.7050576210021973, + "learning_rate": 0.00010244134964281195, + "loss": 1.336, + "step": 14170 + }, + { + "epoch": 0.5074936881121636, + "grad_norm": 2.0105693340301514, + "learning_rate": 0.00010242975407060995, + "loss": 1.2359, + "step": 14171 + }, + { + "epoch": 0.5075295002417318, + "grad_norm": 1.4477938413619995, + "learning_rate": 0.00010241815846571874, + "loss": 1.7411, + "step": 14172 + }, + { + "epoch": 0.5075653123713002, + "grad_norm": 1.3273900747299194, + "learning_rate": 0.00010240656282829433, + "loss": 1.434, + "step": 14173 + }, + { + "epoch": 0.5076011245008685, + "grad_norm": 1.914575457572937, + "learning_rate": 0.00010239496715849273, + "loss": 1.2457, + "step": 14174 + }, + { + "epoch": 0.5076369366304367, + "grad_norm": 1.6995282173156738, + "learning_rate": 0.0001023833714564699, + "loss": 1.5573, + "step": 14175 + }, + { + "epoch": 0.507672748760005, + "grad_norm": 1.8010412454605103, + "learning_rate": 0.00010237177572238192, + "loss": 1.5153, + "step": 14176 + }, + { + "epoch": 0.5077085608895733, + "grad_norm": 1.5251318216323853, + "learning_rate": 0.00010236017995638472, + "loss": 1.5024, + "step": 14177 + }, + { + "epoch": 0.5077443730191415, + "grad_norm": 3.0716559886932373, + "learning_rate": 0.00010234858415863439, + "loss": 1.5426, + "step": 14178 + }, + { + "epoch": 0.5077801851487098, + "grad_norm": 2.010403633117676, + "learning_rate": 0.00010233698832928686, + "loss": 1.3712, + "step": 14179 + }, + { + "epoch": 0.5078159972782782, + "grad_norm": 1.76226806640625, + "learning_rate": 0.00010232539246849818, + "loss": 1.4153, + "step": 14180 + }, + { + "epoch": 0.5078518094078465, + "grad_norm": 1.8289686441421509, + "learning_rate": 0.00010231379657642432, + "loss": 1.4113, + "step": 14181 + }, + { + "epoch": 0.5078876215374147, + "grad_norm": 1.5149344205856323, + "learning_rate": 0.00010230220065322132, + "loss": 1.4344, + "step": 14182 + }, + { + "epoch": 0.507923433666983, + "grad_norm": 1.530060052871704, + "learning_rate": 0.00010229060469904519, + "loss": 1.0206, + "step": 14183 + }, + { + "epoch": 0.5079592457965513, + "grad_norm": 2.5815203189849854, + "learning_rate": 0.00010227900871405191, + "loss": 1.6194, + "step": 14184 + }, + { + "epoch": 0.5079950579261195, + "grad_norm": 1.5144827365875244, + "learning_rate": 0.00010226741269839755, + "loss": 1.377, + "step": 14185 + }, + { + "epoch": 0.5080308700556878, + "grad_norm": 1.7543262243270874, + "learning_rate": 0.00010225581665223802, + "loss": 1.4925, + "step": 14186 + }, + { + "epoch": 0.5080666821852562, + "grad_norm": 1.8062946796417236, + "learning_rate": 0.00010224422057572947, + "loss": 1.6961, + "step": 14187 + }, + { + "epoch": 0.5081024943148245, + "grad_norm": 1.5800213813781738, + "learning_rate": 0.00010223262446902775, + "loss": 1.6731, + "step": 14188 + }, + { + "epoch": 0.5081383064443927, + "grad_norm": 1.6881344318389893, + "learning_rate": 0.00010222102833228897, + "loss": 1.488, + "step": 14189 + }, + { + "epoch": 0.508174118573961, + "grad_norm": 1.4703749418258667, + "learning_rate": 0.00010220943216566912, + "loss": 1.2877, + "step": 14190 + }, + { + "epoch": 0.5082099307035293, + "grad_norm": 2.940575361251831, + "learning_rate": 0.00010219783596932421, + "loss": 1.3642, + "step": 14191 + }, + { + "epoch": 0.5082457428330975, + "grad_norm": 1.343244194984436, + "learning_rate": 0.00010218623974341024, + "loss": 1.5526, + "step": 14192 + }, + { + "epoch": 0.5082815549626658, + "grad_norm": 1.7862319946289062, + "learning_rate": 0.00010217464348808323, + "loss": 1.5906, + "step": 14193 + }, + { + "epoch": 0.5083173670922342, + "grad_norm": 2.7882981300354004, + "learning_rate": 0.00010216304720349922, + "loss": 1.6598, + "step": 14194 + }, + { + "epoch": 0.5083531792218025, + "grad_norm": 1.9673621654510498, + "learning_rate": 0.00010215145088981419, + "loss": 1.6815, + "step": 14195 + }, + { + "epoch": 0.5083889913513707, + "grad_norm": 1.8103598356246948, + "learning_rate": 0.0001021398545471842, + "loss": 1.2642, + "step": 14196 + }, + { + "epoch": 0.508424803480939, + "grad_norm": 1.8523528575897217, + "learning_rate": 0.00010212825817576519, + "loss": 1.7025, + "step": 14197 + }, + { + "epoch": 0.5084606156105073, + "grad_norm": 1.207170009613037, + "learning_rate": 0.00010211666177571322, + "loss": 1.4524, + "step": 14198 + }, + { + "epoch": 0.5084964277400755, + "grad_norm": 1.7149916887283325, + "learning_rate": 0.00010210506534718427, + "loss": 1.4284, + "step": 14199 + }, + { + "epoch": 0.5085322398696438, + "grad_norm": 1.2960801124572754, + "learning_rate": 0.00010209346889033442, + "loss": 1.4645, + "step": 14200 + }, + { + "epoch": 0.5085680519992122, + "grad_norm": 1.768568754196167, + "learning_rate": 0.00010208187240531962, + "loss": 1.2931, + "step": 14201 + }, + { + "epoch": 0.5086038641287804, + "grad_norm": 1.9338597059249878, + "learning_rate": 0.00010207027589229594, + "loss": 1.6961, + "step": 14202 + }, + { + "epoch": 0.5086396762583487, + "grad_norm": 1.3252862691879272, + "learning_rate": 0.00010205867935141933, + "loss": 1.5093, + "step": 14203 + }, + { + "epoch": 0.508675488387917, + "grad_norm": 1.6431336402893066, + "learning_rate": 0.00010204708278284587, + "loss": 1.4769, + "step": 14204 + }, + { + "epoch": 0.5087113005174853, + "grad_norm": 1.6076064109802246, + "learning_rate": 0.00010203548618673155, + "loss": 1.5178, + "step": 14205 + }, + { + "epoch": 0.5087471126470535, + "grad_norm": 1.594680905342102, + "learning_rate": 0.00010202388956323238, + "loss": 1.4088, + "step": 14206 + }, + { + "epoch": 0.5087829247766218, + "grad_norm": 1.4047375917434692, + "learning_rate": 0.0001020122929125044, + "loss": 1.4558, + "step": 14207 + }, + { + "epoch": 0.5088187369061902, + "grad_norm": 1.6259403228759766, + "learning_rate": 0.00010200069623470358, + "loss": 1.2941, + "step": 14208 + }, + { + "epoch": 0.5088545490357584, + "grad_norm": 1.942922830581665, + "learning_rate": 0.00010198909952998603, + "loss": 1.6474, + "step": 14209 + }, + { + "epoch": 0.5088903611653267, + "grad_norm": 1.5637397766113281, + "learning_rate": 0.00010197750279850767, + "loss": 1.4758, + "step": 14210 + }, + { + "epoch": 0.508926173294895, + "grad_norm": 1.5786371231079102, + "learning_rate": 0.00010196590604042457, + "loss": 1.3429, + "step": 14211 + }, + { + "epoch": 0.5089619854244632, + "grad_norm": 1.7941700220108032, + "learning_rate": 0.00010195430925589274, + "loss": 1.4954, + "step": 14212 + }, + { + "epoch": 0.5089977975540315, + "grad_norm": 1.7135039567947388, + "learning_rate": 0.00010194271244506821, + "loss": 1.5166, + "step": 14213 + }, + { + "epoch": 0.5090336096835998, + "grad_norm": 1.4030609130859375, + "learning_rate": 0.00010193111560810697, + "loss": 1.5622, + "step": 14214 + }, + { + "epoch": 0.5090694218131682, + "grad_norm": 2.290910005569458, + "learning_rate": 0.00010191951874516508, + "loss": 1.1519, + "step": 14215 + }, + { + "epoch": 0.5091052339427364, + "grad_norm": 2.0241501331329346, + "learning_rate": 0.00010190792185639855, + "loss": 1.575, + "step": 14216 + }, + { + "epoch": 0.5091410460723047, + "grad_norm": 1.8566899299621582, + "learning_rate": 0.00010189632494196335, + "loss": 1.5069, + "step": 14217 + }, + { + "epoch": 0.509176858201873, + "grad_norm": 1.780731201171875, + "learning_rate": 0.00010188472800201558, + "loss": 1.4822, + "step": 14218 + }, + { + "epoch": 0.5092126703314412, + "grad_norm": 1.620386004447937, + "learning_rate": 0.00010187313103671122, + "loss": 1.4713, + "step": 14219 + }, + { + "epoch": 0.5092484824610095, + "grad_norm": 1.4523818492889404, + "learning_rate": 0.00010186153404620628, + "loss": 1.557, + "step": 14220 + }, + { + "epoch": 0.5092842945905778, + "grad_norm": 1.6923600435256958, + "learning_rate": 0.00010184993703065682, + "loss": 1.4402, + "step": 14221 + }, + { + "epoch": 0.5093201067201462, + "grad_norm": 1.507537841796875, + "learning_rate": 0.00010183833999021884, + "loss": 1.3889, + "step": 14222 + }, + { + "epoch": 0.5093559188497144, + "grad_norm": 1.9163919687271118, + "learning_rate": 0.00010182674292504837, + "loss": 1.5374, + "step": 14223 + }, + { + "epoch": 0.5093917309792827, + "grad_norm": 2.0340425968170166, + "learning_rate": 0.00010181514583530141, + "loss": 1.4581, + "step": 14224 + }, + { + "epoch": 0.509427543108851, + "grad_norm": 2.15578293800354, + "learning_rate": 0.00010180354872113403, + "loss": 1.6459, + "step": 14225 + }, + { + "epoch": 0.5094633552384192, + "grad_norm": 1.6318968534469604, + "learning_rate": 0.0001017919515827022, + "loss": 1.5743, + "step": 14226 + }, + { + "epoch": 0.5094991673679875, + "grad_norm": 1.603484034538269, + "learning_rate": 0.000101780354420162, + "loss": 1.4433, + "step": 14227 + }, + { + "epoch": 0.5095349794975558, + "grad_norm": 1.2180871963500977, + "learning_rate": 0.00010176875723366941, + "loss": 1.3916, + "step": 14228 + }, + { + "epoch": 0.5095707916271242, + "grad_norm": 1.6632503271102905, + "learning_rate": 0.00010175716002338049, + "loss": 1.3513, + "step": 14229 + }, + { + "epoch": 0.5096066037566924, + "grad_norm": 1.5550673007965088, + "learning_rate": 0.00010174556278945123, + "loss": 1.3926, + "step": 14230 + }, + { + "epoch": 0.5096424158862607, + "grad_norm": 1.7034988403320312, + "learning_rate": 0.00010173396553203771, + "loss": 1.4134, + "step": 14231 + }, + { + "epoch": 0.509678228015829, + "grad_norm": 2.4369795322418213, + "learning_rate": 0.00010172236825129588, + "loss": 1.4433, + "step": 14232 + }, + { + "epoch": 0.5097140401453972, + "grad_norm": 1.695494532585144, + "learning_rate": 0.00010171077094738183, + "loss": 1.6308, + "step": 14233 + }, + { + "epoch": 0.5097498522749655, + "grad_norm": 1.9752106666564941, + "learning_rate": 0.00010169917362045154, + "loss": 1.4111, + "step": 14234 + }, + { + "epoch": 0.5097856644045338, + "grad_norm": 1.5427497625350952, + "learning_rate": 0.00010168757627066105, + "loss": 1.403, + "step": 14235 + }, + { + "epoch": 0.5098214765341021, + "grad_norm": 1.7571707963943481, + "learning_rate": 0.00010167597889816644, + "loss": 1.5686, + "step": 14236 + }, + { + "epoch": 0.5098572886636704, + "grad_norm": 1.598278522491455, + "learning_rate": 0.00010166438150312367, + "loss": 1.6431, + "step": 14237 + }, + { + "epoch": 0.5098931007932387, + "grad_norm": 1.8703125715255737, + "learning_rate": 0.00010165278408568881, + "loss": 1.3825, + "step": 14238 + }, + { + "epoch": 0.509928912922807, + "grad_norm": 1.4079030752182007, + "learning_rate": 0.00010164118664601785, + "loss": 1.5527, + "step": 14239 + }, + { + "epoch": 0.5099647250523752, + "grad_norm": 1.8269530534744263, + "learning_rate": 0.0001016295891842669, + "loss": 1.6979, + "step": 14240 + }, + { + "epoch": 0.5100005371819435, + "grad_norm": 1.5141544342041016, + "learning_rate": 0.00010161799170059187, + "loss": 1.4384, + "step": 14241 + }, + { + "epoch": 0.5100363493115118, + "grad_norm": 1.7513827085494995, + "learning_rate": 0.00010160639419514888, + "loss": 1.094, + "step": 14242 + }, + { + "epoch": 0.5100721614410801, + "grad_norm": 1.3838261365890503, + "learning_rate": 0.00010159479666809388, + "loss": 1.3501, + "step": 14243 + }, + { + "epoch": 0.5101079735706484, + "grad_norm": 1.4064701795578003, + "learning_rate": 0.00010158319911958301, + "loss": 1.5364, + "step": 14244 + }, + { + "epoch": 0.5101437857002167, + "grad_norm": 1.7723150253295898, + "learning_rate": 0.00010157160154977219, + "loss": 1.2483, + "step": 14245 + }, + { + "epoch": 0.510179597829785, + "grad_norm": 1.6301299333572388, + "learning_rate": 0.00010156000395881752, + "loss": 1.6533, + "step": 14246 + }, + { + "epoch": 0.5102154099593532, + "grad_norm": 1.759650707244873, + "learning_rate": 0.000101548406346875, + "loss": 1.5664, + "step": 14247 + }, + { + "epoch": 0.5102512220889215, + "grad_norm": 1.7729034423828125, + "learning_rate": 0.00010153680871410065, + "loss": 1.5183, + "step": 14248 + }, + { + "epoch": 0.5102870342184898, + "grad_norm": 1.9804837703704834, + "learning_rate": 0.00010152521106065058, + "loss": 1.4567, + "step": 14249 + }, + { + "epoch": 0.5103228463480581, + "grad_norm": 2.063938856124878, + "learning_rate": 0.00010151361338668072, + "loss": 1.6644, + "step": 14250 + }, + { + "epoch": 0.5103586584776264, + "grad_norm": 1.628487467765808, + "learning_rate": 0.00010150201569234717, + "loss": 1.1544, + "step": 14251 + }, + { + "epoch": 0.5103944706071947, + "grad_norm": 2.041860580444336, + "learning_rate": 0.0001014904179778059, + "loss": 1.6834, + "step": 14252 + }, + { + "epoch": 0.5104302827367629, + "grad_norm": 1.646464467048645, + "learning_rate": 0.000101478820243213, + "loss": 1.7831, + "step": 14253 + }, + { + "epoch": 0.5104660948663312, + "grad_norm": 1.8969465494155884, + "learning_rate": 0.00010146722248872446, + "loss": 1.4377, + "step": 14254 + }, + { + "epoch": 0.5105019069958995, + "grad_norm": 1.9447615146636963, + "learning_rate": 0.00010145562471449638, + "loss": 1.4619, + "step": 14255 + }, + { + "epoch": 0.5105377191254677, + "grad_norm": 1.6894398927688599, + "learning_rate": 0.00010144402692068472, + "loss": 1.6591, + "step": 14256 + }, + { + "epoch": 0.5105735312550361, + "grad_norm": 2.469947576522827, + "learning_rate": 0.00010143242910744555, + "loss": 1.587, + "step": 14257 + }, + { + "epoch": 0.5106093433846044, + "grad_norm": 1.7453415393829346, + "learning_rate": 0.00010142083127493489, + "loss": 1.5956, + "step": 14258 + }, + { + "epoch": 0.5106451555141727, + "grad_norm": 1.6180731058120728, + "learning_rate": 0.00010140923342330875, + "loss": 1.5102, + "step": 14259 + }, + { + "epoch": 0.5106809676437409, + "grad_norm": 2.149956226348877, + "learning_rate": 0.00010139763555272323, + "loss": 1.8065, + "step": 14260 + }, + { + "epoch": 0.5107167797733092, + "grad_norm": 1.7655017375946045, + "learning_rate": 0.0001013860376633343, + "loss": 1.1825, + "step": 14261 + }, + { + "epoch": 0.5107525919028775, + "grad_norm": 1.5514721870422363, + "learning_rate": 0.00010137443975529804, + "loss": 1.3279, + "step": 14262 + }, + { + "epoch": 0.5107884040324457, + "grad_norm": 1.7187747955322266, + "learning_rate": 0.00010136284182877045, + "loss": 1.4676, + "step": 14263 + }, + { + "epoch": 0.5108242161620141, + "grad_norm": 1.7656067609786987, + "learning_rate": 0.0001013512438839076, + "loss": 1.5519, + "step": 14264 + }, + { + "epoch": 0.5108600282915824, + "grad_norm": 1.3366622924804688, + "learning_rate": 0.00010133964592086547, + "loss": 1.3903, + "step": 14265 + }, + { + "epoch": 0.5108958404211507, + "grad_norm": 2.655071973800659, + "learning_rate": 0.00010132804793980018, + "loss": 1.3475, + "step": 14266 + }, + { + "epoch": 0.5109316525507189, + "grad_norm": 1.4642434120178223, + "learning_rate": 0.0001013164499408677, + "loss": 1.736, + "step": 14267 + }, + { + "epoch": 0.5109674646802872, + "grad_norm": 1.510628342628479, + "learning_rate": 0.00010130485192422408, + "loss": 1.3315, + "step": 14268 + }, + { + "epoch": 0.5110032768098555, + "grad_norm": 1.7870886325836182, + "learning_rate": 0.00010129325389002536, + "loss": 1.4371, + "step": 14269 + }, + { + "epoch": 0.5110390889394237, + "grad_norm": 1.452154517173767, + "learning_rate": 0.00010128165583842757, + "loss": 1.5644, + "step": 14270 + }, + { + "epoch": 0.5110749010689921, + "grad_norm": 1.9587507247924805, + "learning_rate": 0.00010127005776958676, + "loss": 1.5341, + "step": 14271 + }, + { + "epoch": 0.5111107131985604, + "grad_norm": 1.3689603805541992, + "learning_rate": 0.00010125845968365895, + "loss": 1.3541, + "step": 14272 + }, + { + "epoch": 0.5111465253281287, + "grad_norm": 2.153557300567627, + "learning_rate": 0.00010124686158080021, + "loss": 1.5126, + "step": 14273 + }, + { + "epoch": 0.5111823374576969, + "grad_norm": 1.3762892484664917, + "learning_rate": 0.00010123526346116654, + "loss": 1.5685, + "step": 14274 + }, + { + "epoch": 0.5112181495872652, + "grad_norm": 1.7966989278793335, + "learning_rate": 0.00010122366532491403, + "loss": 1.2602, + "step": 14275 + }, + { + "epoch": 0.5112539617168335, + "grad_norm": 1.1151483058929443, + "learning_rate": 0.00010121206717219865, + "loss": 1.4299, + "step": 14276 + }, + { + "epoch": 0.5112897738464017, + "grad_norm": 1.83100163936615, + "learning_rate": 0.00010120046900317646, + "loss": 1.7361, + "step": 14277 + }, + { + "epoch": 0.5113255859759701, + "grad_norm": 1.368430495262146, + "learning_rate": 0.00010118887081800352, + "loss": 1.5495, + "step": 14278 + }, + { + "epoch": 0.5113613981055384, + "grad_norm": 1.5631763935089111, + "learning_rate": 0.00010117727261683585, + "loss": 1.3579, + "step": 14279 + }, + { + "epoch": 0.5113972102351066, + "grad_norm": 3.0901801586151123, + "learning_rate": 0.00010116567439982952, + "loss": 1.7898, + "step": 14280 + }, + { + "epoch": 0.5114330223646749, + "grad_norm": 1.6391278505325317, + "learning_rate": 0.0001011540761671405, + "loss": 1.1463, + "step": 14281 + }, + { + "epoch": 0.5114688344942432, + "grad_norm": 1.5354077816009521, + "learning_rate": 0.00010114247791892491, + "loss": 1.4781, + "step": 14282 + }, + { + "epoch": 0.5115046466238115, + "grad_norm": 2.2333121299743652, + "learning_rate": 0.00010113087965533874, + "loss": 1.5416, + "step": 14283 + }, + { + "epoch": 0.5115404587533797, + "grad_norm": 1.9875568151474, + "learning_rate": 0.00010111928137653808, + "loss": 1.677, + "step": 14284 + }, + { + "epoch": 0.5115762708829481, + "grad_norm": 1.5071085691452026, + "learning_rate": 0.00010110768308267889, + "loss": 1.2818, + "step": 14285 + }, + { + "epoch": 0.5116120830125164, + "grad_norm": 1.5551725625991821, + "learning_rate": 0.00010109608477391725, + "loss": 1.5007, + "step": 14286 + }, + { + "epoch": 0.5116478951420846, + "grad_norm": 2.123044967651367, + "learning_rate": 0.00010108448645040919, + "loss": 1.5999, + "step": 14287 + }, + { + "epoch": 0.5116837072716529, + "grad_norm": 1.681573748588562, + "learning_rate": 0.00010107288811231081, + "loss": 1.2607, + "step": 14288 + }, + { + "epoch": 0.5117195194012212, + "grad_norm": 1.9159537553787231, + "learning_rate": 0.00010106128975977809, + "loss": 1.5472, + "step": 14289 + }, + { + "epoch": 0.5117553315307894, + "grad_norm": 1.6600178480148315, + "learning_rate": 0.00010104969139296705, + "loss": 1.6619, + "step": 14290 + }, + { + "epoch": 0.5117911436603577, + "grad_norm": 1.5200772285461426, + "learning_rate": 0.00010103809301203382, + "loss": 1.4235, + "step": 14291 + }, + { + "epoch": 0.5118269557899261, + "grad_norm": 1.593790054321289, + "learning_rate": 0.00010102649461713434, + "loss": 1.496, + "step": 14292 + }, + { + "epoch": 0.5118627679194944, + "grad_norm": 1.5583328008651733, + "learning_rate": 0.00010101489620842475, + "loss": 1.5407, + "step": 14293 + }, + { + "epoch": 0.5118985800490626, + "grad_norm": 1.877834677696228, + "learning_rate": 0.00010100329778606101, + "loss": 1.7199, + "step": 14294 + }, + { + "epoch": 0.5119343921786309, + "grad_norm": 1.555271029472351, + "learning_rate": 0.0001009916993501992, + "loss": 1.4016, + "step": 14295 + }, + { + "epoch": 0.5119702043081992, + "grad_norm": 1.6726455688476562, + "learning_rate": 0.00010098010090099532, + "loss": 1.306, + "step": 14296 + }, + { + "epoch": 0.5120060164377674, + "grad_norm": 1.5355541706085205, + "learning_rate": 0.00010096850243860549, + "loss": 1.3042, + "step": 14297 + }, + { + "epoch": 0.5120418285673357, + "grad_norm": 1.3412790298461914, + "learning_rate": 0.00010095690396318569, + "loss": 1.529, + "step": 14298 + }, + { + "epoch": 0.5120776406969041, + "grad_norm": 2.1176607608795166, + "learning_rate": 0.00010094530547489201, + "loss": 1.731, + "step": 14299 + }, + { + "epoch": 0.5121134528264724, + "grad_norm": 1.660214900970459, + "learning_rate": 0.0001009337069738804, + "loss": 1.5807, + "step": 14300 + }, + { + "epoch": 0.5121492649560406, + "grad_norm": 1.5404260158538818, + "learning_rate": 0.00010092210846030703, + "loss": 1.3894, + "step": 14301 + }, + { + "epoch": 0.5121850770856089, + "grad_norm": 1.514736294746399, + "learning_rate": 0.00010091050993432787, + "loss": 1.4389, + "step": 14302 + }, + { + "epoch": 0.5122208892151772, + "grad_norm": 2.2876694202423096, + "learning_rate": 0.00010089891139609895, + "loss": 1.441, + "step": 14303 + }, + { + "epoch": 0.5122567013447454, + "grad_norm": 1.7587552070617676, + "learning_rate": 0.00010088731284577636, + "loss": 1.4818, + "step": 14304 + }, + { + "epoch": 0.5122925134743137, + "grad_norm": 1.566940426826477, + "learning_rate": 0.0001008757142835161, + "loss": 1.3247, + "step": 14305 + }, + { + "epoch": 0.5123283256038821, + "grad_norm": 1.3367424011230469, + "learning_rate": 0.00010086411570947424, + "loss": 1.6241, + "step": 14306 + }, + { + "epoch": 0.5123641377334504, + "grad_norm": 1.3539397716522217, + "learning_rate": 0.0001008525171238068, + "loss": 1.5768, + "step": 14307 + }, + { + "epoch": 0.5123999498630186, + "grad_norm": 1.3885242938995361, + "learning_rate": 0.00010084091852666988, + "loss": 1.1827, + "step": 14308 + }, + { + "epoch": 0.5124357619925869, + "grad_norm": 2.392909288406372, + "learning_rate": 0.00010082931991821945, + "loss": 1.4581, + "step": 14309 + }, + { + "epoch": 0.5124715741221552, + "grad_norm": 1.3797001838684082, + "learning_rate": 0.00010081772129861163, + "loss": 1.6449, + "step": 14310 + }, + { + "epoch": 0.5125073862517234, + "grad_norm": 1.8504879474639893, + "learning_rate": 0.00010080612266800241, + "loss": 1.502, + "step": 14311 + }, + { + "epoch": 0.5125431983812917, + "grad_norm": 1.6124776601791382, + "learning_rate": 0.0001007945240265478, + "loss": 1.6722, + "step": 14312 + }, + { + "epoch": 0.5125790105108601, + "grad_norm": 1.4799977540969849, + "learning_rate": 0.00010078292537440397, + "loss": 2.0444, + "step": 14313 + }, + { + "epoch": 0.5126148226404283, + "grad_norm": 1.6208722591400146, + "learning_rate": 0.00010077132671172685, + "loss": 1.4614, + "step": 14314 + }, + { + "epoch": 0.5126506347699966, + "grad_norm": 1.8112127780914307, + "learning_rate": 0.00010075972803867254, + "loss": 1.4982, + "step": 14315 + }, + { + "epoch": 0.5126864468995649, + "grad_norm": 1.5253667831420898, + "learning_rate": 0.00010074812935539703, + "loss": 1.5199, + "step": 14316 + }, + { + "epoch": 0.5127222590291332, + "grad_norm": 1.505617618560791, + "learning_rate": 0.00010073653066205644, + "loss": 1.5824, + "step": 14317 + }, + { + "epoch": 0.5127580711587014, + "grad_norm": 1.8238226175308228, + "learning_rate": 0.00010072493195880676, + "loss": 1.239, + "step": 14318 + }, + { + "epoch": 0.5127938832882697, + "grad_norm": 1.3497518301010132, + "learning_rate": 0.00010071333324580408, + "loss": 1.4374, + "step": 14319 + }, + { + "epoch": 0.5128296954178381, + "grad_norm": 1.3062442541122437, + "learning_rate": 0.00010070173452320442, + "loss": 1.2157, + "step": 14320 + }, + { + "epoch": 0.5128655075474063, + "grad_norm": 1.9949729442596436, + "learning_rate": 0.0001006901357911638, + "loss": 1.4122, + "step": 14321 + }, + { + "epoch": 0.5129013196769746, + "grad_norm": 1.8121817111968994, + "learning_rate": 0.00010067853704983832, + "loss": 1.277, + "step": 14322 + }, + { + "epoch": 0.5129371318065429, + "grad_norm": 1.786101222038269, + "learning_rate": 0.00010066693829938398, + "loss": 1.7341, + "step": 14323 + }, + { + "epoch": 0.5129729439361111, + "grad_norm": 1.6490147113800049, + "learning_rate": 0.00010065533953995688, + "loss": 1.5394, + "step": 14324 + }, + { + "epoch": 0.5130087560656794, + "grad_norm": 2.683562755584717, + "learning_rate": 0.00010064374077171296, + "loss": 1.633, + "step": 14325 + }, + { + "epoch": 0.5130445681952477, + "grad_norm": 1.9112385511398315, + "learning_rate": 0.00010063214199480842, + "loss": 1.6243, + "step": 14326 + }, + { + "epoch": 0.5130803803248161, + "grad_norm": 1.7935097217559814, + "learning_rate": 0.00010062054320939916, + "loss": 1.1181, + "step": 14327 + }, + { + "epoch": 0.5131161924543843, + "grad_norm": 1.3657851219177246, + "learning_rate": 0.00010060894441564135, + "loss": 1.6137, + "step": 14328 + }, + { + "epoch": 0.5131520045839526, + "grad_norm": 1.3997024297714233, + "learning_rate": 0.00010059734561369095, + "loss": 1.4953, + "step": 14329 + }, + { + "epoch": 0.5131878167135209, + "grad_norm": 1.8069899082183838, + "learning_rate": 0.00010058574680370403, + "loss": 1.5614, + "step": 14330 + }, + { + "epoch": 0.5132236288430891, + "grad_norm": 1.4796655178070068, + "learning_rate": 0.00010057414798583664, + "loss": 1.5437, + "step": 14331 + }, + { + "epoch": 0.5132594409726574, + "grad_norm": 1.5718828439712524, + "learning_rate": 0.00010056254916024483, + "loss": 1.4086, + "step": 14332 + }, + { + "epoch": 0.5132952531022257, + "grad_norm": 2.2739932537078857, + "learning_rate": 0.00010055095032708466, + "loss": 1.4159, + "step": 14333 + }, + { + "epoch": 0.5133310652317941, + "grad_norm": 1.714881181716919, + "learning_rate": 0.00010053935148651214, + "loss": 1.4963, + "step": 14334 + }, + { + "epoch": 0.5133668773613623, + "grad_norm": 1.6868878602981567, + "learning_rate": 0.00010052775263868337, + "loss": 1.2638, + "step": 14335 + }, + { + "epoch": 0.5134026894909306, + "grad_norm": 1.6076358556747437, + "learning_rate": 0.00010051615378375434, + "loss": 1.7306, + "step": 14336 + }, + { + "epoch": 0.5134385016204989, + "grad_norm": 1.4571819305419922, + "learning_rate": 0.00010050455492188118, + "loss": 1.7523, + "step": 14337 + }, + { + "epoch": 0.5134743137500671, + "grad_norm": 1.9559829235076904, + "learning_rate": 0.00010049295605321984, + "loss": 1.1767, + "step": 14338 + }, + { + "epoch": 0.5135101258796354, + "grad_norm": 1.8952054977416992, + "learning_rate": 0.00010048135717792641, + "loss": 1.3615, + "step": 14339 + }, + { + "epoch": 0.5135459380092037, + "grad_norm": 3.2473466396331787, + "learning_rate": 0.00010046975829615695, + "loss": 1.7416, + "step": 14340 + }, + { + "epoch": 0.513581750138772, + "grad_norm": 1.5299546718597412, + "learning_rate": 0.00010045815940806751, + "loss": 1.5168, + "step": 14341 + }, + { + "epoch": 0.5136175622683403, + "grad_norm": 1.6583263874053955, + "learning_rate": 0.00010044656051381411, + "loss": 1.4021, + "step": 14342 + }, + { + "epoch": 0.5136533743979086, + "grad_norm": 1.6013065576553345, + "learning_rate": 0.00010043496161355282, + "loss": 1.6424, + "step": 14343 + }, + { + "epoch": 0.5136891865274769, + "grad_norm": 1.4253227710723877, + "learning_rate": 0.00010042336270743968, + "loss": 1.6771, + "step": 14344 + }, + { + "epoch": 0.5137249986570451, + "grad_norm": 2.0314788818359375, + "learning_rate": 0.00010041176379563073, + "loss": 1.6002, + "step": 14345 + }, + { + "epoch": 0.5137608107866134, + "grad_norm": 1.651528239250183, + "learning_rate": 0.00010040016487828208, + "loss": 1.6224, + "step": 14346 + }, + { + "epoch": 0.5137966229161817, + "grad_norm": 1.8376193046569824, + "learning_rate": 0.00010038856595554967, + "loss": 1.4108, + "step": 14347 + }, + { + "epoch": 0.51383243504575, + "grad_norm": 2.1696889400482178, + "learning_rate": 0.00010037696702758963, + "loss": 1.3069, + "step": 14348 + }, + { + "epoch": 0.5138682471753183, + "grad_norm": 1.4069865942001343, + "learning_rate": 0.00010036536809455796, + "loss": 1.3905, + "step": 14349 + }, + { + "epoch": 0.5139040593048866, + "grad_norm": 2.0680787563323975, + "learning_rate": 0.00010035376915661076, + "loss": 1.7688, + "step": 14350 + }, + { + "epoch": 0.5139398714344549, + "grad_norm": 2.34431529045105, + "learning_rate": 0.00010034217021390404, + "loss": 1.3928, + "step": 14351 + }, + { + "epoch": 0.5139756835640231, + "grad_norm": 1.8667163848876953, + "learning_rate": 0.00010033057126659388, + "loss": 1.6729, + "step": 14352 + }, + { + "epoch": 0.5140114956935914, + "grad_norm": 1.555355429649353, + "learning_rate": 0.0001003189723148363, + "loss": 1.4538, + "step": 14353 + }, + { + "epoch": 0.5140473078231597, + "grad_norm": 1.3511971235275269, + "learning_rate": 0.00010030737335878735, + "loss": 1.7526, + "step": 14354 + }, + { + "epoch": 0.514083119952728, + "grad_norm": 1.7594633102416992, + "learning_rate": 0.00010029577439860312, + "loss": 1.3122, + "step": 14355 + }, + { + "epoch": 0.5141189320822963, + "grad_norm": 1.5887043476104736, + "learning_rate": 0.00010028417543443958, + "loss": 1.536, + "step": 14356 + }, + { + "epoch": 0.5141547442118646, + "grad_norm": 1.7294690608978271, + "learning_rate": 0.00010027257646645285, + "loss": 1.6294, + "step": 14357 + }, + { + "epoch": 0.5141905563414328, + "grad_norm": 1.778171181678772, + "learning_rate": 0.00010026097749479895, + "loss": 1.3429, + "step": 14358 + }, + { + "epoch": 0.5142263684710011, + "grad_norm": 2.0442750453948975, + "learning_rate": 0.00010024937851963394, + "loss": 1.1833, + "step": 14359 + }, + { + "epoch": 0.5142621806005694, + "grad_norm": 1.6226485967636108, + "learning_rate": 0.00010023777954111384, + "loss": 1.1286, + "step": 14360 + }, + { + "epoch": 0.5142979927301377, + "grad_norm": 1.7150418758392334, + "learning_rate": 0.00010022618055939477, + "loss": 1.7438, + "step": 14361 + }, + { + "epoch": 0.514333804859706, + "grad_norm": 1.712880253791809, + "learning_rate": 0.00010021458157463268, + "loss": 1.5001, + "step": 14362 + }, + { + "epoch": 0.5143696169892743, + "grad_norm": 2.2642722129821777, + "learning_rate": 0.0001002029825869837, + "loss": 1.6687, + "step": 14363 + }, + { + "epoch": 0.5144054291188426, + "grad_norm": 1.4798762798309326, + "learning_rate": 0.00010019138359660387, + "loss": 1.6926, + "step": 14364 + }, + { + "epoch": 0.5144412412484108, + "grad_norm": 2.6592345237731934, + "learning_rate": 0.00010017978460364919, + "loss": 1.6522, + "step": 14365 + }, + { + "epoch": 0.5144770533779791, + "grad_norm": 1.899743914604187, + "learning_rate": 0.00010016818560827577, + "loss": 1.4547, + "step": 14366 + }, + { + "epoch": 0.5145128655075474, + "grad_norm": 1.760870099067688, + "learning_rate": 0.00010015658661063957, + "loss": 1.5189, + "step": 14367 + }, + { + "epoch": 0.5145486776371156, + "grad_norm": 1.7101985216140747, + "learning_rate": 0.00010014498761089677, + "loss": 1.691, + "step": 14368 + }, + { + "epoch": 0.514584489766684, + "grad_norm": 1.8395957946777344, + "learning_rate": 0.0001001333886092033, + "loss": 1.4385, + "step": 14369 + }, + { + "epoch": 0.5146203018962523, + "grad_norm": 1.5016250610351562, + "learning_rate": 0.00010012178960571527, + "loss": 1.7028, + "step": 14370 + }, + { + "epoch": 0.5146561140258206, + "grad_norm": 1.3344529867172241, + "learning_rate": 0.00010011019060058873, + "loss": 1.5635, + "step": 14371 + }, + { + "epoch": 0.5146919261553888, + "grad_norm": 1.4595482349395752, + "learning_rate": 0.00010009859159397974, + "loss": 1.5788, + "step": 14372 + }, + { + "epoch": 0.5147277382849571, + "grad_norm": 1.6308788061141968, + "learning_rate": 0.00010008699258604429, + "loss": 1.2402, + "step": 14373 + }, + { + "epoch": 0.5147635504145254, + "grad_norm": 1.6189227104187012, + "learning_rate": 0.00010007539357693845, + "loss": 1.4277, + "step": 14374 + }, + { + "epoch": 0.5147993625440936, + "grad_norm": 1.546007513999939, + "learning_rate": 0.00010006379456681834, + "loss": 1.3149, + "step": 14375 + }, + { + "epoch": 0.514835174673662, + "grad_norm": 1.400831699371338, + "learning_rate": 0.00010005219555583991, + "loss": 1.7555, + "step": 14376 + }, + { + "epoch": 0.5148709868032303, + "grad_norm": 1.3352601528167725, + "learning_rate": 0.00010004059654415927, + "loss": 1.6648, + "step": 14377 + }, + { + "epoch": 0.5149067989327986, + "grad_norm": 1.8999031782150269, + "learning_rate": 0.00010002899753193246, + "loss": 1.5502, + "step": 14378 + }, + { + "epoch": 0.5149426110623668, + "grad_norm": 2.032066822052002, + "learning_rate": 0.00010001739851931553, + "loss": 1.6505, + "step": 14379 + }, + { + "epoch": 0.5149784231919351, + "grad_norm": 1.6035372018814087, + "learning_rate": 0.00010000579950646452, + "loss": 1.7394, + "step": 14380 + }, + { + "epoch": 0.5150142353215034, + "grad_norm": 1.6789970397949219, + "learning_rate": 9.999420049353549e-05, + "loss": 1.5343, + "step": 14381 + }, + { + "epoch": 0.5150500474510716, + "grad_norm": 2.004620313644409, + "learning_rate": 9.998260148068449e-05, + "loss": 1.3025, + "step": 14382 + }, + { + "epoch": 0.51508585958064, + "grad_norm": 2.159876823425293, + "learning_rate": 9.997100246806755e-05, + "loss": 1.7772, + "step": 14383 + }, + { + "epoch": 0.5151216717102083, + "grad_norm": 1.748475193977356, + "learning_rate": 9.995940345584074e-05, + "loss": 1.7536, + "step": 14384 + }, + { + "epoch": 0.5151574838397766, + "grad_norm": 1.7124178409576416, + "learning_rate": 9.994780444416013e-05, + "loss": 1.5581, + "step": 14385 + }, + { + "epoch": 0.5151932959693448, + "grad_norm": 1.6359690427780151, + "learning_rate": 9.99362054331817e-05, + "loss": 1.7249, + "step": 14386 + }, + { + "epoch": 0.5152291080989131, + "grad_norm": 1.629350185394287, + "learning_rate": 9.992460642306156e-05, + "loss": 1.2294, + "step": 14387 + }, + { + "epoch": 0.5152649202284814, + "grad_norm": 1.9517797231674194, + "learning_rate": 9.991300741395574e-05, + "loss": 1.2735, + "step": 14388 + }, + { + "epoch": 0.5153007323580496, + "grad_norm": 1.8916865587234497, + "learning_rate": 9.99014084060203e-05, + "loss": 1.5717, + "step": 14389 + }, + { + "epoch": 0.515336544487618, + "grad_norm": 1.816402554512024, + "learning_rate": 9.988980939941127e-05, + "loss": 1.5327, + "step": 14390 + }, + { + "epoch": 0.5153723566171863, + "grad_norm": 1.7927991151809692, + "learning_rate": 9.987821039428474e-05, + "loss": 1.1213, + "step": 14391 + }, + { + "epoch": 0.5154081687467545, + "grad_norm": 1.3315367698669434, + "learning_rate": 9.986661139079671e-05, + "loss": 1.1219, + "step": 14392 + }, + { + "epoch": 0.5154439808763228, + "grad_norm": 1.9263297319412231, + "learning_rate": 9.985501238910325e-05, + "loss": 1.585, + "step": 14393 + }, + { + "epoch": 0.5154797930058911, + "grad_norm": 1.5922685861587524, + "learning_rate": 9.984341338936043e-05, + "loss": 1.5042, + "step": 14394 + }, + { + "epoch": 0.5155156051354594, + "grad_norm": 2.0485384464263916, + "learning_rate": 9.983181439172426e-05, + "loss": 1.5463, + "step": 14395 + }, + { + "epoch": 0.5155514172650276, + "grad_norm": 1.7884843349456787, + "learning_rate": 9.982021539635084e-05, + "loss": 1.5589, + "step": 14396 + }, + { + "epoch": 0.515587229394596, + "grad_norm": 1.4187160730361938, + "learning_rate": 9.980861640339614e-05, + "loss": 1.1647, + "step": 14397 + }, + { + "epoch": 0.5156230415241643, + "grad_norm": 1.627793312072754, + "learning_rate": 9.979701741301631e-05, + "loss": 1.2982, + "step": 14398 + }, + { + "epoch": 0.5156588536537325, + "grad_norm": 1.3230700492858887, + "learning_rate": 9.978541842536732e-05, + "loss": 1.4045, + "step": 14399 + }, + { + "epoch": 0.5156946657833008, + "grad_norm": 1.5387710332870483, + "learning_rate": 9.977381944060525e-05, + "loss": 1.4987, + "step": 14400 + }, + { + "epoch": 0.5157304779128691, + "grad_norm": 1.577405333518982, + "learning_rate": 9.976222045888614e-05, + "loss": 1.5148, + "step": 14401 + }, + { + "epoch": 0.5157662900424373, + "grad_norm": 1.404995322227478, + "learning_rate": 9.975062148036608e-05, + "loss": 1.537, + "step": 14402 + }, + { + "epoch": 0.5158021021720056, + "grad_norm": 1.305178165435791, + "learning_rate": 9.97390225052011e-05, + "loss": 1.6459, + "step": 14403 + }, + { + "epoch": 0.515837914301574, + "grad_norm": 2.065877676010132, + "learning_rate": 9.972742353354717e-05, + "loss": 1.6797, + "step": 14404 + }, + { + "epoch": 0.5158737264311423, + "grad_norm": 1.7629402875900269, + "learning_rate": 9.971582456556045e-05, + "loss": 1.4101, + "step": 14405 + }, + { + "epoch": 0.5159095385607105, + "grad_norm": 2.007244348526001, + "learning_rate": 9.970422560139692e-05, + "loss": 1.5678, + "step": 14406 + }, + { + "epoch": 0.5159453506902788, + "grad_norm": 1.5562615394592285, + "learning_rate": 9.969262664121267e-05, + "loss": 1.6061, + "step": 14407 + }, + { + "epoch": 0.5159811628198471, + "grad_norm": 1.781718134880066, + "learning_rate": 9.968102768516371e-05, + "loss": 1.6981, + "step": 14408 + }, + { + "epoch": 0.5160169749494153, + "grad_norm": 1.2723973989486694, + "learning_rate": 9.966942873340614e-05, + "loss": 1.3363, + "step": 14409 + }, + { + "epoch": 0.5160527870789836, + "grad_norm": 1.9770539999008179, + "learning_rate": 9.965782978609595e-05, + "loss": 1.2804, + "step": 14410 + }, + { + "epoch": 0.516088599208552, + "grad_norm": 1.5593479871749878, + "learning_rate": 9.964623084338926e-05, + "loss": 1.5882, + "step": 14411 + }, + { + "epoch": 0.5161244113381203, + "grad_norm": 1.7799978256225586, + "learning_rate": 9.963463190544208e-05, + "loss": 1.4457, + "step": 14412 + }, + { + "epoch": 0.5161602234676885, + "grad_norm": 1.773402214050293, + "learning_rate": 9.96230329724104e-05, + "loss": 1.6663, + "step": 14413 + }, + { + "epoch": 0.5161960355972568, + "grad_norm": 1.4051496982574463, + "learning_rate": 9.961143404445038e-05, + "loss": 1.439, + "step": 14414 + }, + { + "epoch": 0.5162318477268251, + "grad_norm": 1.4480347633361816, + "learning_rate": 9.959983512171796e-05, + "loss": 1.4239, + "step": 14415 + }, + { + "epoch": 0.5162676598563933, + "grad_norm": 1.744836449623108, + "learning_rate": 9.95882362043693e-05, + "loss": 1.5925, + "step": 14416 + }, + { + "epoch": 0.5163034719859616, + "grad_norm": 2.2484891414642334, + "learning_rate": 9.957663729256033e-05, + "loss": 1.4217, + "step": 14417 + }, + { + "epoch": 0.51633928411553, + "grad_norm": 1.41450834274292, + "learning_rate": 9.956503838644719e-05, + "loss": 1.4532, + "step": 14418 + }, + { + "epoch": 0.5163750962450983, + "grad_norm": 1.5045841932296753, + "learning_rate": 9.95534394861859e-05, + "loss": 1.5254, + "step": 14419 + }, + { + "epoch": 0.5164109083746665, + "grad_norm": 1.430303931236267, + "learning_rate": 9.954184059193251e-05, + "loss": 1.5091, + "step": 14420 + }, + { + "epoch": 0.5164467205042348, + "grad_norm": 2.064718008041382, + "learning_rate": 9.953024170384309e-05, + "loss": 1.3218, + "step": 14421 + }, + { + "epoch": 0.5164825326338031, + "grad_norm": 1.8702689409255981, + "learning_rate": 9.95186428220736e-05, + "loss": 1.3991, + "step": 14422 + }, + { + "epoch": 0.5165183447633713, + "grad_norm": 1.897904634475708, + "learning_rate": 9.950704394678021e-05, + "loss": 1.3649, + "step": 14423 + }, + { + "epoch": 0.5165541568929396, + "grad_norm": 1.8715026378631592, + "learning_rate": 9.949544507811885e-05, + "loss": 1.4766, + "step": 14424 + }, + { + "epoch": 0.516589969022508, + "grad_norm": 1.6160260438919067, + "learning_rate": 9.948384621624569e-05, + "loss": 1.7052, + "step": 14425 + }, + { + "epoch": 0.5166257811520762, + "grad_norm": 1.8741852045059204, + "learning_rate": 9.947224736131662e-05, + "loss": 1.4936, + "step": 14426 + }, + { + "epoch": 0.5166615932816445, + "grad_norm": 2.4261343479156494, + "learning_rate": 9.946064851348788e-05, + "loss": 1.4065, + "step": 14427 + }, + { + "epoch": 0.5166974054112128, + "grad_norm": 1.7130334377288818, + "learning_rate": 9.944904967291533e-05, + "loss": 1.1564, + "step": 14428 + }, + { + "epoch": 0.516733217540781, + "grad_norm": 1.6435831785202026, + "learning_rate": 9.94374508397552e-05, + "loss": 1.2979, + "step": 14429 + }, + { + "epoch": 0.5167690296703493, + "grad_norm": 1.4821659326553345, + "learning_rate": 9.94258520141634e-05, + "loss": 1.3349, + "step": 14430 + }, + { + "epoch": 0.5168048417999176, + "grad_norm": 1.582342267036438, + "learning_rate": 9.941425319629598e-05, + "loss": 1.6496, + "step": 14431 + }, + { + "epoch": 0.516840653929486, + "grad_norm": 1.3501367568969727, + "learning_rate": 9.94026543863091e-05, + "loss": 1.2989, + "step": 14432 + }, + { + "epoch": 0.5168764660590542, + "grad_norm": 1.5915173292160034, + "learning_rate": 9.939105558435866e-05, + "loss": 1.7494, + "step": 14433 + }, + { + "epoch": 0.5169122781886225, + "grad_norm": 1.774556040763855, + "learning_rate": 9.937945679060085e-05, + "loss": 1.3733, + "step": 14434 + }, + { + "epoch": 0.5169480903181908, + "grad_norm": 2.2483110427856445, + "learning_rate": 9.93678580051916e-05, + "loss": 1.4633, + "step": 14435 + }, + { + "epoch": 0.516983902447759, + "grad_norm": 1.5181875228881836, + "learning_rate": 9.935625922828705e-05, + "loss": 1.6502, + "step": 14436 + }, + { + "epoch": 0.5170197145773273, + "grad_norm": 1.6699066162109375, + "learning_rate": 9.934466046004313e-05, + "loss": 1.4195, + "step": 14437 + }, + { + "epoch": 0.5170555267068956, + "grad_norm": 1.4470813274383545, + "learning_rate": 9.933306170061604e-05, + "loss": 1.4196, + "step": 14438 + }, + { + "epoch": 0.517091338836464, + "grad_norm": 1.7814797163009644, + "learning_rate": 9.932146295016172e-05, + "loss": 1.5409, + "step": 14439 + }, + { + "epoch": 0.5171271509660322, + "grad_norm": 1.530346155166626, + "learning_rate": 9.930986420883623e-05, + "loss": 1.4033, + "step": 14440 + }, + { + "epoch": 0.5171629630956005, + "grad_norm": 3.921464443206787, + "learning_rate": 9.929826547679563e-05, + "loss": 1.6177, + "step": 14441 + }, + { + "epoch": 0.5171987752251688, + "grad_norm": 1.572906494140625, + "learning_rate": 9.928666675419595e-05, + "loss": 1.6209, + "step": 14442 + }, + { + "epoch": 0.517234587354737, + "grad_norm": 2.0591559410095215, + "learning_rate": 9.927506804119326e-05, + "loss": 1.1671, + "step": 14443 + }, + { + "epoch": 0.5172703994843053, + "grad_norm": 1.4399479627609253, + "learning_rate": 9.926346933794357e-05, + "loss": 1.4316, + "step": 14444 + }, + { + "epoch": 0.5173062116138736, + "grad_norm": 1.3752440214157104, + "learning_rate": 9.925187064460299e-05, + "loss": 1.4705, + "step": 14445 + }, + { + "epoch": 0.517342023743442, + "grad_norm": 1.7031986713409424, + "learning_rate": 9.924027196132747e-05, + "loss": 1.3291, + "step": 14446 + }, + { + "epoch": 0.5173778358730102, + "grad_norm": 1.779016137123108, + "learning_rate": 9.922867328827319e-05, + "loss": 1.2053, + "step": 14447 + }, + { + "epoch": 0.5174136480025785, + "grad_norm": 1.4940849542617798, + "learning_rate": 9.921707462559608e-05, + "loss": 1.6245, + "step": 14448 + }, + { + "epoch": 0.5174494601321468, + "grad_norm": 1.6371047496795654, + "learning_rate": 9.92054759734522e-05, + "loss": 1.2594, + "step": 14449 + }, + { + "epoch": 0.517485272261715, + "grad_norm": 2.4114179611206055, + "learning_rate": 9.919387733199764e-05, + "loss": 1.4802, + "step": 14450 + }, + { + "epoch": 0.5175210843912833, + "grad_norm": 1.8427387475967407, + "learning_rate": 9.91822787013884e-05, + "loss": 1.7216, + "step": 14451 + }, + { + "epoch": 0.5175568965208516, + "grad_norm": 1.5938351154327393, + "learning_rate": 9.917068008178056e-05, + "loss": 1.3973, + "step": 14452 + }, + { + "epoch": 0.51759270865042, + "grad_norm": 1.515212059020996, + "learning_rate": 9.915908147333013e-05, + "loss": 1.3184, + "step": 14453 + }, + { + "epoch": 0.5176285207799882, + "grad_norm": 2.3098485469818115, + "learning_rate": 9.91474828761932e-05, + "loss": 1.3732, + "step": 14454 + }, + { + "epoch": 0.5176643329095565, + "grad_norm": 1.29752779006958, + "learning_rate": 9.913588429052578e-05, + "loss": 1.3128, + "step": 14455 + }, + { + "epoch": 0.5177001450391248, + "grad_norm": 1.9097155332565308, + "learning_rate": 9.912428571648393e-05, + "loss": 1.5393, + "step": 14456 + }, + { + "epoch": 0.517735957168693, + "grad_norm": 1.6922508478164673, + "learning_rate": 9.91126871542237e-05, + "loss": 1.4879, + "step": 14457 + }, + { + "epoch": 0.5177717692982613, + "grad_norm": 2.132388114929199, + "learning_rate": 9.910108860390107e-05, + "loss": 1.5717, + "step": 14458 + }, + { + "epoch": 0.5178075814278296, + "grad_norm": 2.9997427463531494, + "learning_rate": 9.908949006567218e-05, + "loss": 1.6868, + "step": 14459 + }, + { + "epoch": 0.5178433935573978, + "grad_norm": 1.777268409729004, + "learning_rate": 9.9077891539693e-05, + "loss": 1.6998, + "step": 14460 + }, + { + "epoch": 0.5178792056869662, + "grad_norm": 2.5405521392822266, + "learning_rate": 9.906629302611961e-05, + "loss": 1.8193, + "step": 14461 + }, + { + "epoch": 0.5179150178165345, + "grad_norm": 2.0599591732025146, + "learning_rate": 9.905469452510803e-05, + "loss": 1.7768, + "step": 14462 + }, + { + "epoch": 0.5179508299461028, + "grad_norm": 1.894958734512329, + "learning_rate": 9.904309603681433e-05, + "loss": 1.4591, + "step": 14463 + }, + { + "epoch": 0.517986642075671, + "grad_norm": 1.3576775789260864, + "learning_rate": 9.903149756139453e-05, + "loss": 1.4085, + "step": 14464 + }, + { + "epoch": 0.5180224542052393, + "grad_norm": 1.7294950485229492, + "learning_rate": 9.901989909900469e-05, + "loss": 1.7503, + "step": 14465 + }, + { + "epoch": 0.5180582663348076, + "grad_norm": 1.5523263216018677, + "learning_rate": 9.900830064980084e-05, + "loss": 1.5677, + "step": 14466 + }, + { + "epoch": 0.5180940784643758, + "grad_norm": 1.6701172590255737, + "learning_rate": 9.899670221393901e-05, + "loss": 1.7297, + "step": 14467 + }, + { + "epoch": 0.5181298905939442, + "grad_norm": 1.5365049839019775, + "learning_rate": 9.89851037915753e-05, + "loss": 1.4214, + "step": 14468 + }, + { + "epoch": 0.5181657027235125, + "grad_norm": 1.5968941450119019, + "learning_rate": 9.897350538286566e-05, + "loss": 1.4887, + "step": 14469 + }, + { + "epoch": 0.5182015148530807, + "grad_norm": 2.073341131210327, + "learning_rate": 9.896190698796621e-05, + "loss": 1.4142, + "step": 14470 + }, + { + "epoch": 0.518237326982649, + "grad_norm": 2.2669150829315186, + "learning_rate": 9.895030860703295e-05, + "loss": 1.6122, + "step": 14471 + }, + { + "epoch": 0.5182731391122173, + "grad_norm": 1.5452347993850708, + "learning_rate": 9.893871024022195e-05, + "loss": 1.2547, + "step": 14472 + }, + { + "epoch": 0.5183089512417856, + "grad_norm": 1.556300401687622, + "learning_rate": 9.89271118876892e-05, + "loss": 1.587, + "step": 14473 + }, + { + "epoch": 0.5183447633713538, + "grad_norm": 1.6784199476242065, + "learning_rate": 9.891551354959082e-05, + "loss": 1.5504, + "step": 14474 + }, + { + "epoch": 0.5183805755009222, + "grad_norm": 2.0387167930603027, + "learning_rate": 9.890391522608278e-05, + "loss": 1.5993, + "step": 14475 + }, + { + "epoch": 0.5184163876304905, + "grad_norm": 1.639019250869751, + "learning_rate": 9.889231691732115e-05, + "loss": 1.5219, + "step": 14476 + }, + { + "epoch": 0.5184521997600587, + "grad_norm": 1.59255051612854, + "learning_rate": 9.888071862346198e-05, + "loss": 1.5876, + "step": 14477 + }, + { + "epoch": 0.518488011889627, + "grad_norm": 1.663064956665039, + "learning_rate": 9.886912034466127e-05, + "loss": 1.0076, + "step": 14478 + }, + { + "epoch": 0.5185238240191953, + "grad_norm": 1.6870861053466797, + "learning_rate": 9.885752208107511e-05, + "loss": 1.5689, + "step": 14479 + }, + { + "epoch": 0.5185596361487635, + "grad_norm": 1.2715725898742676, + "learning_rate": 9.88459238328595e-05, + "loss": 1.3663, + "step": 14480 + }, + { + "epoch": 0.5185954482783318, + "grad_norm": 2.3270177841186523, + "learning_rate": 9.883432560017052e-05, + "loss": 1.6806, + "step": 14481 + }, + { + "epoch": 0.5186312604079002, + "grad_norm": 2.031411647796631, + "learning_rate": 9.882272738316418e-05, + "loss": 1.6844, + "step": 14482 + }, + { + "epoch": 0.5186670725374685, + "grad_norm": 1.4794574975967407, + "learning_rate": 9.88111291819965e-05, + "loss": 1.5449, + "step": 14483 + }, + { + "epoch": 0.5187028846670367, + "grad_norm": 1.525395154953003, + "learning_rate": 9.879953099682358e-05, + "loss": 1.4823, + "step": 14484 + }, + { + "epoch": 0.518738696796605, + "grad_norm": 2.1387875080108643, + "learning_rate": 9.878793282780137e-05, + "loss": 1.4335, + "step": 14485 + }, + { + "epoch": 0.5187745089261733, + "grad_norm": 1.8972395658493042, + "learning_rate": 9.877633467508602e-05, + "loss": 1.3409, + "step": 14486 + }, + { + "epoch": 0.5188103210557415, + "grad_norm": 1.8318805694580078, + "learning_rate": 9.876473653883346e-05, + "loss": 1.3479, + "step": 14487 + }, + { + "epoch": 0.5188461331853098, + "grad_norm": 1.8570977449417114, + "learning_rate": 9.87531384191998e-05, + "loss": 1.4737, + "step": 14488 + }, + { + "epoch": 0.5188819453148782, + "grad_norm": 1.4921190738677979, + "learning_rate": 9.874154031634103e-05, + "loss": 1.5501, + "step": 14489 + }, + { + "epoch": 0.5189177574444465, + "grad_norm": 2.1454734802246094, + "learning_rate": 9.872994223041325e-05, + "loss": 1.7193, + "step": 14490 + }, + { + "epoch": 0.5189535695740147, + "grad_norm": 1.6640002727508545, + "learning_rate": 9.871834416157246e-05, + "loss": 1.1278, + "step": 14491 + }, + { + "epoch": 0.518989381703583, + "grad_norm": 1.7006652355194092, + "learning_rate": 9.870674610997467e-05, + "loss": 1.6311, + "step": 14492 + }, + { + "epoch": 0.5190251938331513, + "grad_norm": 1.4623268842697144, + "learning_rate": 9.869514807577595e-05, + "loss": 1.0958, + "step": 14493 + }, + { + "epoch": 0.5190610059627195, + "grad_norm": 1.8661326169967651, + "learning_rate": 9.868355005913232e-05, + "loss": 1.2537, + "step": 14494 + }, + { + "epoch": 0.5190968180922878, + "grad_norm": 1.5580703020095825, + "learning_rate": 9.867195206019985e-05, + "loss": 1.6413, + "step": 14495 + }, + { + "epoch": 0.5191326302218562, + "grad_norm": 2.2893712520599365, + "learning_rate": 9.866035407913452e-05, + "loss": 1.5394, + "step": 14496 + }, + { + "epoch": 0.5191684423514245, + "grad_norm": 1.4904677867889404, + "learning_rate": 9.864875611609243e-05, + "loss": 1.5643, + "step": 14497 + }, + { + "epoch": 0.5192042544809927, + "grad_norm": 1.5410066843032837, + "learning_rate": 9.863715817122956e-05, + "loss": 1.3726, + "step": 14498 + }, + { + "epoch": 0.519240066610561, + "grad_norm": 1.7164530754089355, + "learning_rate": 9.862556024470199e-05, + "loss": 1.6264, + "step": 14499 + }, + { + "epoch": 0.5192758787401293, + "grad_norm": 1.4902276992797852, + "learning_rate": 9.861396233666574e-05, + "loss": 1.5273, + "step": 14500 + }, + { + "epoch": 0.5193116908696975, + "grad_norm": 1.731277346611023, + "learning_rate": 9.860236444727679e-05, + "loss": 1.5882, + "step": 14501 + }, + { + "epoch": 0.5193475029992658, + "grad_norm": 1.6587519645690918, + "learning_rate": 9.859076657669127e-05, + "loss": 1.3716, + "step": 14502 + }, + { + "epoch": 0.5193833151288342, + "grad_norm": 2.1876940727233887, + "learning_rate": 9.857916872506513e-05, + "loss": 1.5538, + "step": 14503 + }, + { + "epoch": 0.5194191272584024, + "grad_norm": 1.7598111629486084, + "learning_rate": 9.856757089255448e-05, + "loss": 1.4646, + "step": 14504 + }, + { + "epoch": 0.5194549393879707, + "grad_norm": 1.1007812023162842, + "learning_rate": 9.85559730793153e-05, + "loss": 1.2949, + "step": 14505 + }, + { + "epoch": 0.519490751517539, + "grad_norm": 1.786514163017273, + "learning_rate": 9.854437528550364e-05, + "loss": 1.4212, + "step": 14506 + }, + { + "epoch": 0.5195265636471073, + "grad_norm": 1.267168641090393, + "learning_rate": 9.853277751127552e-05, + "loss": 1.5191, + "step": 14507 + }, + { + "epoch": 0.5195623757766755, + "grad_norm": 1.632846713066101, + "learning_rate": 9.852117975678701e-05, + "loss": 1.3769, + "step": 14508 + }, + { + "epoch": 0.5195981879062438, + "grad_norm": 1.708034873008728, + "learning_rate": 9.850958202219414e-05, + "loss": 1.4926, + "step": 14509 + }, + { + "epoch": 0.5196340000358122, + "grad_norm": 1.7593141794204712, + "learning_rate": 9.849798430765286e-05, + "loss": 1.3824, + "step": 14510 + }, + { + "epoch": 0.5196698121653804, + "grad_norm": 2.2882802486419678, + "learning_rate": 9.848638661331933e-05, + "loss": 1.4505, + "step": 14511 + }, + { + "epoch": 0.5197056242949487, + "grad_norm": 1.923661231994629, + "learning_rate": 9.847478893934944e-05, + "loss": 1.5545, + "step": 14512 + }, + { + "epoch": 0.519741436424517, + "grad_norm": 2.511514186859131, + "learning_rate": 9.846319128589936e-05, + "loss": 1.2665, + "step": 14513 + }, + { + "epoch": 0.5197772485540852, + "grad_norm": 1.4388993978500366, + "learning_rate": 9.845159365312501e-05, + "loss": 1.2587, + "step": 14514 + }, + { + "epoch": 0.5198130606836535, + "grad_norm": 1.2576457262039185, + "learning_rate": 9.84399960411825e-05, + "loss": 1.1563, + "step": 14515 + }, + { + "epoch": 0.5198488728132218, + "grad_norm": 2.3293657302856445, + "learning_rate": 9.842839845022781e-05, + "loss": 1.6719, + "step": 14516 + }, + { + "epoch": 0.5198846849427902, + "grad_norm": 1.6276086568832397, + "learning_rate": 9.841680088041701e-05, + "loss": 1.5341, + "step": 14517 + }, + { + "epoch": 0.5199204970723584, + "grad_norm": 1.4863203763961792, + "learning_rate": 9.840520333190615e-05, + "loss": 1.3341, + "step": 14518 + }, + { + "epoch": 0.5199563092019267, + "grad_norm": 1.4681768417358398, + "learning_rate": 9.839360580485115e-05, + "loss": 1.2057, + "step": 14519 + }, + { + "epoch": 0.519992121331495, + "grad_norm": 2.524519443511963, + "learning_rate": 9.838200829940818e-05, + "loss": 1.4854, + "step": 14520 + }, + { + "epoch": 0.5200279334610632, + "grad_norm": 1.424342155456543, + "learning_rate": 9.837041081573312e-05, + "loss": 1.1943, + "step": 14521 + }, + { + "epoch": 0.5200637455906315, + "grad_norm": 1.5036619901657104, + "learning_rate": 9.835881335398216e-05, + "loss": 1.5235, + "step": 14522 + }, + { + "epoch": 0.5200995577201998, + "grad_norm": 1.508616328239441, + "learning_rate": 9.834721591431118e-05, + "loss": 1.6061, + "step": 14523 + }, + { + "epoch": 0.5201353698497682, + "grad_norm": 1.989552617073059, + "learning_rate": 9.833561849687634e-05, + "loss": 1.6041, + "step": 14524 + }, + { + "epoch": 0.5201711819793364, + "grad_norm": 1.4946794509887695, + "learning_rate": 9.832402110183355e-05, + "loss": 1.4669, + "step": 14525 + }, + { + "epoch": 0.5202069941089047, + "grad_norm": 1.4868971109390259, + "learning_rate": 9.831242372933896e-05, + "loss": 1.4832, + "step": 14526 + }, + { + "epoch": 0.520242806238473, + "grad_norm": 1.2756190299987793, + "learning_rate": 9.830082637954851e-05, + "loss": 1.2785, + "step": 14527 + }, + { + "epoch": 0.5202786183680412, + "grad_norm": 1.9612704515457153, + "learning_rate": 9.828922905261819e-05, + "loss": 1.355, + "step": 14528 + }, + { + "epoch": 0.5203144304976095, + "grad_norm": 2.44775652885437, + "learning_rate": 9.827763174870417e-05, + "loss": 1.4828, + "step": 14529 + }, + { + "epoch": 0.5203502426271778, + "grad_norm": 1.8801780939102173, + "learning_rate": 9.826603446796231e-05, + "loss": 1.1552, + "step": 14530 + }, + { + "epoch": 0.5203860547567462, + "grad_norm": 2.448315143585205, + "learning_rate": 9.82544372105488e-05, + "loss": 1.5563, + "step": 14531 + }, + { + "epoch": 0.5204218668863144, + "grad_norm": 2.337183713912964, + "learning_rate": 9.824283997661952e-05, + "loss": 1.6272, + "step": 14532 + }, + { + "epoch": 0.5204576790158827, + "grad_norm": 1.5295284986495972, + "learning_rate": 9.823124276633061e-05, + "loss": 1.7039, + "step": 14533 + }, + { + "epoch": 0.520493491145451, + "grad_norm": 2.0706472396850586, + "learning_rate": 9.821964557983799e-05, + "loss": 1.4274, + "step": 14534 + }, + { + "epoch": 0.5205293032750192, + "grad_norm": 1.6976041793823242, + "learning_rate": 9.820804841729782e-05, + "loss": 1.3637, + "step": 14535 + }, + { + "epoch": 0.5205651154045875, + "grad_norm": 1.4130737781524658, + "learning_rate": 9.819645127886602e-05, + "loss": 1.3593, + "step": 14536 + }, + { + "epoch": 0.5206009275341558, + "grad_norm": 1.3637433052062988, + "learning_rate": 9.818485416469861e-05, + "loss": 1.4258, + "step": 14537 + }, + { + "epoch": 0.5206367396637241, + "grad_norm": 1.7225490808486938, + "learning_rate": 9.817325707495167e-05, + "loss": 1.5675, + "step": 14538 + }, + { + "epoch": 0.5206725517932924, + "grad_norm": 2.0762996673583984, + "learning_rate": 9.816166000978119e-05, + "loss": 1.2102, + "step": 14539 + }, + { + "epoch": 0.5207083639228607, + "grad_norm": 1.6375013589859009, + "learning_rate": 9.815006296934321e-05, + "loss": 1.5834, + "step": 14540 + }, + { + "epoch": 0.520744176052429, + "grad_norm": 1.525462031364441, + "learning_rate": 9.813846595379371e-05, + "loss": 1.6418, + "step": 14541 + }, + { + "epoch": 0.5207799881819972, + "grad_norm": 1.9003756046295166, + "learning_rate": 9.812686896328882e-05, + "loss": 1.9051, + "step": 14542 + }, + { + "epoch": 0.5208158003115655, + "grad_norm": 1.4063869714736938, + "learning_rate": 9.811527199798443e-05, + "loss": 1.2691, + "step": 14543 + }, + { + "epoch": 0.5208516124411338, + "grad_norm": 2.0352799892425537, + "learning_rate": 9.810367505803667e-05, + "loss": 1.5649, + "step": 14544 + }, + { + "epoch": 0.5208874245707021, + "grad_norm": 2.369400978088379, + "learning_rate": 9.80920781436015e-05, + "loss": 1.5076, + "step": 14545 + }, + { + "epoch": 0.5209232367002704, + "grad_norm": 1.6463871002197266, + "learning_rate": 9.808048125483494e-05, + "loss": 1.2952, + "step": 14546 + }, + { + "epoch": 0.5209590488298387, + "grad_norm": 1.4209014177322388, + "learning_rate": 9.806888439189306e-05, + "loss": 1.6418, + "step": 14547 + }, + { + "epoch": 0.520994860959407, + "grad_norm": 1.5328431129455566, + "learning_rate": 9.805728755493182e-05, + "loss": 1.1911, + "step": 14548 + }, + { + "epoch": 0.5210306730889752, + "grad_norm": 1.4673058986663818, + "learning_rate": 9.804569074410729e-05, + "loss": 1.3006, + "step": 14549 + }, + { + "epoch": 0.5210664852185435, + "grad_norm": 1.8996427059173584, + "learning_rate": 9.803409395957545e-05, + "loss": 1.54, + "step": 14550 + }, + { + "epoch": 0.5211022973481118, + "grad_norm": 1.5183322429656982, + "learning_rate": 9.802249720149236e-05, + "loss": 1.444, + "step": 14551 + }, + { + "epoch": 0.5211381094776801, + "grad_norm": 2.154158353805542, + "learning_rate": 9.8010900470014e-05, + "loss": 1.3751, + "step": 14552 + }, + { + "epoch": 0.5211739216072484, + "grad_norm": 1.3777554035186768, + "learning_rate": 9.799930376529643e-05, + "loss": 1.5448, + "step": 14553 + }, + { + "epoch": 0.5212097337368167, + "grad_norm": 1.716951847076416, + "learning_rate": 9.798770708749563e-05, + "loss": 1.4537, + "step": 14554 + }, + { + "epoch": 0.5212455458663849, + "grad_norm": 1.4311693906784058, + "learning_rate": 9.797611043676764e-05, + "loss": 1.6834, + "step": 14555 + }, + { + "epoch": 0.5212813579959532, + "grad_norm": 1.699777603149414, + "learning_rate": 9.796451381326849e-05, + "loss": 1.7289, + "step": 14556 + }, + { + "epoch": 0.5213171701255215, + "grad_norm": 1.704720139503479, + "learning_rate": 9.795291721715414e-05, + "loss": 1.7788, + "step": 14557 + }, + { + "epoch": 0.5213529822550897, + "grad_norm": 1.8193825483322144, + "learning_rate": 9.794132064858069e-05, + "loss": 1.7059, + "step": 14558 + }, + { + "epoch": 0.5213887943846581, + "grad_norm": 1.9302318096160889, + "learning_rate": 9.792972410770409e-05, + "loss": 1.6084, + "step": 14559 + }, + { + "epoch": 0.5214246065142264, + "grad_norm": 1.3157739639282227, + "learning_rate": 9.791812759468039e-05, + "loss": 1.2628, + "step": 14560 + }, + { + "epoch": 0.5214604186437947, + "grad_norm": 1.3691009283065796, + "learning_rate": 9.79065311096656e-05, + "loss": 1.2926, + "step": 14561 + }, + { + "epoch": 0.5214962307733629, + "grad_norm": 1.2893990278244019, + "learning_rate": 9.789493465281574e-05, + "loss": 1.6619, + "step": 14562 + }, + { + "epoch": 0.5215320429029312, + "grad_norm": 1.7624584436416626, + "learning_rate": 9.788333822428682e-05, + "loss": 1.5958, + "step": 14563 + }, + { + "epoch": 0.5215678550324995, + "grad_norm": 1.5968478918075562, + "learning_rate": 9.787174182423484e-05, + "loss": 1.5239, + "step": 14564 + }, + { + "epoch": 0.5216036671620677, + "grad_norm": 2.377908945083618, + "learning_rate": 9.786014545281585e-05, + "loss": 1.5145, + "step": 14565 + }, + { + "epoch": 0.5216394792916361, + "grad_norm": 1.5355181694030762, + "learning_rate": 9.78485491101858e-05, + "loss": 1.5049, + "step": 14566 + }, + { + "epoch": 0.5216752914212044, + "grad_norm": 1.638087272644043, + "learning_rate": 9.783695279650079e-05, + "loss": 1.3355, + "step": 14567 + }, + { + "epoch": 0.5217111035507727, + "grad_norm": 2.3670177459716797, + "learning_rate": 9.782535651191676e-05, + "loss": 1.2759, + "step": 14568 + }, + { + "epoch": 0.5217469156803409, + "grad_norm": 2.8764686584472656, + "learning_rate": 9.781376025658977e-05, + "loss": 1.5412, + "step": 14569 + }, + { + "epoch": 0.5217827278099092, + "grad_norm": 1.5233416557312012, + "learning_rate": 9.78021640306758e-05, + "loss": 1.1315, + "step": 14570 + }, + { + "epoch": 0.5218185399394775, + "grad_norm": 1.8155995607376099, + "learning_rate": 9.77905678343309e-05, + "loss": 1.769, + "step": 14571 + }, + { + "epoch": 0.5218543520690457, + "grad_norm": 1.6024277210235596, + "learning_rate": 9.777897166771107e-05, + "loss": 1.2893, + "step": 14572 + }, + { + "epoch": 0.5218901641986141, + "grad_norm": 1.7057386636734009, + "learning_rate": 9.776737553097227e-05, + "loss": 1.6424, + "step": 14573 + }, + { + "epoch": 0.5219259763281824, + "grad_norm": 2.125110149383545, + "learning_rate": 9.775577942427058e-05, + "loss": 1.3813, + "step": 14574 + }, + { + "epoch": 0.5219617884577507, + "grad_norm": 1.6502152681350708, + "learning_rate": 9.774418334776196e-05, + "loss": 1.4526, + "step": 14575 + }, + { + "epoch": 0.5219976005873189, + "grad_norm": 1.7954357862472534, + "learning_rate": 9.773258730160247e-05, + "loss": 1.5242, + "step": 14576 + }, + { + "epoch": 0.5220334127168872, + "grad_norm": 1.6591635942459106, + "learning_rate": 9.772099128594808e-05, + "loss": 1.3148, + "step": 14577 + }, + { + "epoch": 0.5220692248464555, + "grad_norm": 1.6281665563583374, + "learning_rate": 9.770939530095482e-05, + "loss": 1.6924, + "step": 14578 + }, + { + "epoch": 0.5221050369760237, + "grad_norm": 1.7232989072799683, + "learning_rate": 9.769779934677869e-05, + "loss": 1.7348, + "step": 14579 + }, + { + "epoch": 0.5221408491055921, + "grad_norm": 2.068788766860962, + "learning_rate": 9.76862034235757e-05, + "loss": 1.4131, + "step": 14580 + }, + { + "epoch": 0.5221766612351604, + "grad_norm": 2.2537050247192383, + "learning_rate": 9.767460753150186e-05, + "loss": 1.3277, + "step": 14581 + }, + { + "epoch": 0.5222124733647286, + "grad_norm": 2.0896992683410645, + "learning_rate": 9.766301167071316e-05, + "loss": 1.5849, + "step": 14582 + }, + { + "epoch": 0.5222482854942969, + "grad_norm": 1.4738152027130127, + "learning_rate": 9.765141584136565e-05, + "loss": 1.4946, + "step": 14583 + }, + { + "epoch": 0.5222840976238652, + "grad_norm": 1.8603551387786865, + "learning_rate": 9.763982004361527e-05, + "loss": 1.6751, + "step": 14584 + }, + { + "epoch": 0.5223199097534335, + "grad_norm": 1.5763189792633057, + "learning_rate": 9.76282242776181e-05, + "loss": 1.2796, + "step": 14585 + }, + { + "epoch": 0.5223557218830017, + "grad_norm": 1.772830843925476, + "learning_rate": 9.76166285435301e-05, + "loss": 1.6905, + "step": 14586 + }, + { + "epoch": 0.5223915340125701, + "grad_norm": 1.5136207342147827, + "learning_rate": 9.76050328415073e-05, + "loss": 1.3872, + "step": 14587 + }, + { + "epoch": 0.5224273461421384, + "grad_norm": 1.3309680223464966, + "learning_rate": 9.759343717170571e-05, + "loss": 1.2488, + "step": 14588 + }, + { + "epoch": 0.5224631582717066, + "grad_norm": 2.110062837600708, + "learning_rate": 9.758184153428126e-05, + "loss": 1.5052, + "step": 14589 + }, + { + "epoch": 0.5224989704012749, + "grad_norm": 1.3645905256271362, + "learning_rate": 9.757024592939008e-05, + "loss": 1.4249, + "step": 14590 + }, + { + "epoch": 0.5225347825308432, + "grad_norm": 1.6391838788986206, + "learning_rate": 9.755865035718807e-05, + "loss": 1.5082, + "step": 14591 + }, + { + "epoch": 0.5225705946604114, + "grad_norm": 1.9833340644836426, + "learning_rate": 9.754705481783127e-05, + "loss": 1.6181, + "step": 14592 + }, + { + "epoch": 0.5226064067899797, + "grad_norm": 1.6536145210266113, + "learning_rate": 9.753545931147569e-05, + "loss": 1.7407, + "step": 14593 + }, + { + "epoch": 0.5226422189195481, + "grad_norm": 1.3998302221298218, + "learning_rate": 9.752386383827733e-05, + "loss": 1.5109, + "step": 14594 + }, + { + "epoch": 0.5226780310491164, + "grad_norm": 2.0293946266174316, + "learning_rate": 9.751226839839217e-05, + "loss": 1.5249, + "step": 14595 + }, + { + "epoch": 0.5227138431786846, + "grad_norm": 1.617629885673523, + "learning_rate": 9.750067299197625e-05, + "loss": 1.64, + "step": 14596 + }, + { + "epoch": 0.5227496553082529, + "grad_norm": 1.4700734615325928, + "learning_rate": 9.748907761918558e-05, + "loss": 1.4105, + "step": 14597 + }, + { + "epoch": 0.5227854674378212, + "grad_norm": 2.149026870727539, + "learning_rate": 9.747748228017606e-05, + "loss": 1.4068, + "step": 14598 + }, + { + "epoch": 0.5228212795673894, + "grad_norm": 1.604383111000061, + "learning_rate": 9.746588697510381e-05, + "loss": 1.4969, + "step": 14599 + }, + { + "epoch": 0.5228570916969577, + "grad_norm": 2.5129008293151855, + "learning_rate": 9.745429170412476e-05, + "loss": 1.5898, + "step": 14600 + }, + { + "epoch": 0.5228929038265261, + "grad_norm": 1.2759195566177368, + "learning_rate": 9.744269646739494e-05, + "loss": 1.2335, + "step": 14601 + }, + { + "epoch": 0.5229287159560944, + "grad_norm": 1.4611231088638306, + "learning_rate": 9.743110126507034e-05, + "loss": 1.2164, + "step": 14602 + }, + { + "epoch": 0.5229645280856626, + "grad_norm": 1.63002610206604, + "learning_rate": 9.741950609730696e-05, + "loss": 1.5584, + "step": 14603 + }, + { + "epoch": 0.5230003402152309, + "grad_norm": 1.3063840866088867, + "learning_rate": 9.74079109642608e-05, + "loss": 1.0043, + "step": 14604 + }, + { + "epoch": 0.5230361523447992, + "grad_norm": 1.531385064125061, + "learning_rate": 9.739631586608786e-05, + "loss": 1.3484, + "step": 14605 + }, + { + "epoch": 0.5230719644743674, + "grad_norm": 1.7938756942749023, + "learning_rate": 9.738472080294415e-05, + "loss": 1.5975, + "step": 14606 + }, + { + "epoch": 0.5231077766039357, + "grad_norm": 1.6742467880249023, + "learning_rate": 9.737312577498559e-05, + "loss": 1.5811, + "step": 14607 + }, + { + "epoch": 0.5231435887335041, + "grad_norm": 1.9026448726654053, + "learning_rate": 9.73615307823683e-05, + "loss": 1.3136, + "step": 14608 + }, + { + "epoch": 0.5231794008630724, + "grad_norm": 1.5856115818023682, + "learning_rate": 9.734993582524814e-05, + "loss": 1.4487, + "step": 14609 + }, + { + "epoch": 0.5232152129926406, + "grad_norm": 1.6417968273162842, + "learning_rate": 9.733834090378125e-05, + "loss": 1.3294, + "step": 14610 + }, + { + "epoch": 0.5232510251222089, + "grad_norm": 1.5064736604690552, + "learning_rate": 9.732674601812347e-05, + "loss": 1.7278, + "step": 14611 + }, + { + "epoch": 0.5232868372517772, + "grad_norm": 1.5335125923156738, + "learning_rate": 9.731515116843094e-05, + "loss": 1.5331, + "step": 14612 + }, + { + "epoch": 0.5233226493813454, + "grad_norm": 1.4457260370254517, + "learning_rate": 9.730355635485953e-05, + "loss": 1.3542, + "step": 14613 + }, + { + "epoch": 0.5233584615109137, + "grad_norm": 1.6329476833343506, + "learning_rate": 9.729196157756534e-05, + "loss": 1.429, + "step": 14614 + }, + { + "epoch": 0.5233942736404821, + "grad_norm": 2.010593891143799, + "learning_rate": 9.728036683670433e-05, + "loss": 1.5153, + "step": 14615 + }, + { + "epoch": 0.5234300857700503, + "grad_norm": 1.8740265369415283, + "learning_rate": 9.72687721324324e-05, + "loss": 1.582, + "step": 14616 + }, + { + "epoch": 0.5234658978996186, + "grad_norm": 2.2544100284576416, + "learning_rate": 9.725717746490571e-05, + "loss": 1.6876, + "step": 14617 + }, + { + "epoch": 0.5235017100291869, + "grad_norm": 1.692060112953186, + "learning_rate": 9.724558283428007e-05, + "loss": 1.5436, + "step": 14618 + }, + { + "epoch": 0.5235375221587552, + "grad_norm": 1.4941781759262085, + "learning_rate": 9.723398824071164e-05, + "loss": 1.4658, + "step": 14619 + }, + { + "epoch": 0.5235733342883234, + "grad_norm": 1.7871164083480835, + "learning_rate": 9.722239368435624e-05, + "loss": 1.5842, + "step": 14620 + }, + { + "epoch": 0.5236091464178917, + "grad_norm": 2.2636523246765137, + "learning_rate": 9.721079916537004e-05, + "loss": 1.709, + "step": 14621 + }, + { + "epoch": 0.5236449585474601, + "grad_norm": 1.7238781452178955, + "learning_rate": 9.719920468390888e-05, + "loss": 1.6237, + "step": 14622 + }, + { + "epoch": 0.5236807706770283, + "grad_norm": 2.144887924194336, + "learning_rate": 9.718761024012886e-05, + "loss": 1.3904, + "step": 14623 + }, + { + "epoch": 0.5237165828065966, + "grad_norm": 1.6877436637878418, + "learning_rate": 9.717601583418588e-05, + "loss": 1.5621, + "step": 14624 + }, + { + "epoch": 0.5237523949361649, + "grad_norm": 1.2671278715133667, + "learning_rate": 9.716442146623594e-05, + "loss": 1.2752, + "step": 14625 + }, + { + "epoch": 0.5237882070657331, + "grad_norm": 2.0848145484924316, + "learning_rate": 9.715282713643512e-05, + "loss": 1.378, + "step": 14626 + }, + { + "epoch": 0.5238240191953014, + "grad_norm": 2.0249481201171875, + "learning_rate": 9.714123284493925e-05, + "loss": 1.227, + "step": 14627 + }, + { + "epoch": 0.5238598313248697, + "grad_norm": 1.9421577453613281, + "learning_rate": 9.712963859190449e-05, + "loss": 1.6061, + "step": 14628 + }, + { + "epoch": 0.5238956434544381, + "grad_norm": 1.442251443862915, + "learning_rate": 9.711804437748669e-05, + "loss": 1.4937, + "step": 14629 + }, + { + "epoch": 0.5239314555840063, + "grad_norm": 1.8193825483322144, + "learning_rate": 9.710645020184193e-05, + "loss": 1.5649, + "step": 14630 + }, + { + "epoch": 0.5239672677135746, + "grad_norm": 1.8133513927459717, + "learning_rate": 9.709485606512607e-05, + "loss": 1.6271, + "step": 14631 + }, + { + "epoch": 0.5240030798431429, + "grad_norm": 1.5228266716003418, + "learning_rate": 9.708326196749527e-05, + "loss": 1.9068, + "step": 14632 + }, + { + "epoch": 0.5240388919727111, + "grad_norm": 2.925337314605713, + "learning_rate": 9.707166790910538e-05, + "loss": 1.6324, + "step": 14633 + }, + { + "epoch": 0.5240747041022794, + "grad_norm": 2.007188081741333, + "learning_rate": 9.70600738901124e-05, + "loss": 1.4698, + "step": 14634 + }, + { + "epoch": 0.5241105162318477, + "grad_norm": 1.4450551271438599, + "learning_rate": 9.704847991067236e-05, + "loss": 1.4539, + "step": 14635 + }, + { + "epoch": 0.5241463283614161, + "grad_norm": 1.247611165046692, + "learning_rate": 9.703688597094118e-05, + "loss": 1.394, + "step": 14636 + }, + { + "epoch": 0.5241821404909843, + "grad_norm": 1.4059821367263794, + "learning_rate": 9.702529207107491e-05, + "loss": 1.4542, + "step": 14637 + }, + { + "epoch": 0.5242179526205526, + "grad_norm": 1.3357560634613037, + "learning_rate": 9.701369821122945e-05, + "loss": 1.4459, + "step": 14638 + }, + { + "epoch": 0.5242537647501209, + "grad_norm": 1.6569325923919678, + "learning_rate": 9.70021043915609e-05, + "loss": 1.3154, + "step": 14639 + }, + { + "epoch": 0.5242895768796891, + "grad_norm": 1.5145827531814575, + "learning_rate": 9.69905106122251e-05, + "loss": 1.4301, + "step": 14640 + }, + { + "epoch": 0.5243253890092574, + "grad_norm": 1.7709879875183105, + "learning_rate": 9.697891687337817e-05, + "loss": 1.4174, + "step": 14641 + }, + { + "epoch": 0.5243612011388257, + "grad_norm": 1.6160484552383423, + "learning_rate": 9.696732317517599e-05, + "loss": 1.5869, + "step": 14642 + }, + { + "epoch": 0.524397013268394, + "grad_norm": 1.8500139713287354, + "learning_rate": 9.695572951777454e-05, + "loss": 1.5376, + "step": 14643 + }, + { + "epoch": 0.5244328253979623, + "grad_norm": 2.073073148727417, + "learning_rate": 9.694413590132985e-05, + "loss": 1.3357, + "step": 14644 + }, + { + "epoch": 0.5244686375275306, + "grad_norm": 1.8784855604171753, + "learning_rate": 9.693254232599784e-05, + "loss": 1.4267, + "step": 14645 + }, + { + "epoch": 0.5245044496570989, + "grad_norm": 1.3910045623779297, + "learning_rate": 9.692094879193455e-05, + "loss": 1.5124, + "step": 14646 + }, + { + "epoch": 0.5245402617866671, + "grad_norm": 2.0571978092193604, + "learning_rate": 9.690935529929587e-05, + "loss": 1.9757, + "step": 14647 + }, + { + "epoch": 0.5245760739162354, + "grad_norm": 1.7781730890274048, + "learning_rate": 9.689776184823789e-05, + "loss": 1.3493, + "step": 14648 + }, + { + "epoch": 0.5246118860458037, + "grad_norm": 1.7598072290420532, + "learning_rate": 9.688616843891648e-05, + "loss": 1.1725, + "step": 14649 + }, + { + "epoch": 0.524647698175372, + "grad_norm": 2.3508193492889404, + "learning_rate": 9.687457507148768e-05, + "loss": 1.685, + "step": 14650 + }, + { + "epoch": 0.5246835103049403, + "grad_norm": 2.0934391021728516, + "learning_rate": 9.686298174610745e-05, + "loss": 1.3282, + "step": 14651 + }, + { + "epoch": 0.5247193224345086, + "grad_norm": 1.3602052927017212, + "learning_rate": 9.685138846293171e-05, + "loss": 1.3148, + "step": 14652 + }, + { + "epoch": 0.5247551345640769, + "grad_norm": 1.5019041299819946, + "learning_rate": 9.683979522211652e-05, + "loss": 1.4427, + "step": 14653 + }, + { + "epoch": 0.5247909466936451, + "grad_norm": 1.4769161939620972, + "learning_rate": 9.682820202381779e-05, + "loss": 1.4798, + "step": 14654 + }, + { + "epoch": 0.5248267588232134, + "grad_norm": 1.3109664916992188, + "learning_rate": 9.681660886819152e-05, + "loss": 1.3262, + "step": 14655 + }, + { + "epoch": 0.5248625709527817, + "grad_norm": 1.9010754823684692, + "learning_rate": 9.680501575539365e-05, + "loss": 1.4391, + "step": 14656 + }, + { + "epoch": 0.52489838308235, + "grad_norm": 1.923020839691162, + "learning_rate": 9.679342268558019e-05, + "loss": 1.4267, + "step": 14657 + }, + { + "epoch": 0.5249341952119183, + "grad_norm": 1.572023630142212, + "learning_rate": 9.678182965890708e-05, + "loss": 1.4757, + "step": 14658 + }, + { + "epoch": 0.5249700073414866, + "grad_norm": 2.1957943439483643, + "learning_rate": 9.677023667553033e-05, + "loss": 1.4175, + "step": 14659 + }, + { + "epoch": 0.5250058194710548, + "grad_norm": 1.832504153251648, + "learning_rate": 9.675864373560586e-05, + "loss": 1.2453, + "step": 14660 + }, + { + "epoch": 0.5250416316006231, + "grad_norm": 2.1098172664642334, + "learning_rate": 9.674705083928965e-05, + "loss": 1.6742, + "step": 14661 + }, + { + "epoch": 0.5250774437301914, + "grad_norm": 1.6381540298461914, + "learning_rate": 9.673545798673769e-05, + "loss": 1.4837, + "step": 14662 + }, + { + "epoch": 0.5251132558597597, + "grad_norm": 1.4275070428848267, + "learning_rate": 9.67238651781059e-05, + "loss": 1.3579, + "step": 14663 + }, + { + "epoch": 0.525149067989328, + "grad_norm": 1.609323501586914, + "learning_rate": 9.671227241355031e-05, + "loss": 1.3936, + "step": 14664 + }, + { + "epoch": 0.5251848801188963, + "grad_norm": 1.49817955493927, + "learning_rate": 9.670067969322684e-05, + "loss": 1.5228, + "step": 14665 + }, + { + "epoch": 0.5252206922484646, + "grad_norm": 1.4510881900787354, + "learning_rate": 9.668908701729148e-05, + "loss": 1.3275, + "step": 14666 + }, + { + "epoch": 0.5252565043780328, + "grad_norm": 1.4491652250289917, + "learning_rate": 9.667749438590017e-05, + "loss": 1.4348, + "step": 14667 + }, + { + "epoch": 0.5252923165076011, + "grad_norm": 1.9161545038223267, + "learning_rate": 9.66659017992089e-05, + "loss": 1.5006, + "step": 14668 + }, + { + "epoch": 0.5253281286371694, + "grad_norm": 1.586111307144165, + "learning_rate": 9.665430925737362e-05, + "loss": 1.5004, + "step": 14669 + }, + { + "epoch": 0.5253639407667376, + "grad_norm": 1.7797801494598389, + "learning_rate": 9.664271676055027e-05, + "loss": 1.3378, + "step": 14670 + }, + { + "epoch": 0.525399752896306, + "grad_norm": 1.4467164278030396, + "learning_rate": 9.663112430889487e-05, + "loss": 1.249, + "step": 14671 + }, + { + "epoch": 0.5254355650258743, + "grad_norm": 1.5761590003967285, + "learning_rate": 9.661953190256333e-05, + "loss": 1.6898, + "step": 14672 + }, + { + "epoch": 0.5254713771554426, + "grad_norm": 1.3600083589553833, + "learning_rate": 9.660793954171163e-05, + "loss": 1.6802, + "step": 14673 + }, + { + "epoch": 0.5255071892850108, + "grad_norm": 1.4301543235778809, + "learning_rate": 9.65963472264957e-05, + "loss": 1.5224, + "step": 14674 + }, + { + "epoch": 0.5255430014145791, + "grad_norm": 1.6215018033981323, + "learning_rate": 9.658475495707157e-05, + "loss": 1.6439, + "step": 14675 + }, + { + "epoch": 0.5255788135441474, + "grad_norm": 1.8243167400360107, + "learning_rate": 9.657316273359515e-05, + "loss": 1.2978, + "step": 14676 + }, + { + "epoch": 0.5256146256737156, + "grad_norm": 1.8288040161132812, + "learning_rate": 9.65615705562224e-05, + "loss": 1.7493, + "step": 14677 + }, + { + "epoch": 0.525650437803284, + "grad_norm": 1.4830673933029175, + "learning_rate": 9.654997842510928e-05, + "loss": 1.687, + "step": 14678 + }, + { + "epoch": 0.5256862499328523, + "grad_norm": 1.6650246381759644, + "learning_rate": 9.653838634041173e-05, + "loss": 1.5981, + "step": 14679 + }, + { + "epoch": 0.5257220620624206, + "grad_norm": 1.6408103704452515, + "learning_rate": 9.652679430228576e-05, + "loss": 1.7101, + "step": 14680 + }, + { + "epoch": 0.5257578741919888, + "grad_norm": 1.654687523841858, + "learning_rate": 9.651520231088726e-05, + "loss": 1.4703, + "step": 14681 + }, + { + "epoch": 0.5257936863215571, + "grad_norm": 1.589911699295044, + "learning_rate": 9.650361036637225e-05, + "loss": 1.5057, + "step": 14682 + }, + { + "epoch": 0.5258294984511254, + "grad_norm": 1.8815149068832397, + "learning_rate": 9.649201846889663e-05, + "loss": 1.6379, + "step": 14683 + }, + { + "epoch": 0.5258653105806936, + "grad_norm": 1.7435766458511353, + "learning_rate": 9.64804266186164e-05, + "loss": 1.4799, + "step": 14684 + }, + { + "epoch": 0.525901122710262, + "grad_norm": 1.667809247970581, + "learning_rate": 9.646883481568748e-05, + "loss": 1.5879, + "step": 14685 + }, + { + "epoch": 0.5259369348398303, + "grad_norm": 1.662645936012268, + "learning_rate": 9.645724306026582e-05, + "loss": 1.5564, + "step": 14686 + }, + { + "epoch": 0.5259727469693986, + "grad_norm": 1.785279631614685, + "learning_rate": 9.644565135250739e-05, + "loss": 1.2906, + "step": 14687 + }, + { + "epoch": 0.5260085590989668, + "grad_norm": 2.513857364654541, + "learning_rate": 9.643405969256814e-05, + "loss": 1.3403, + "step": 14688 + }, + { + "epoch": 0.5260443712285351, + "grad_norm": 2.5112111568450928, + "learning_rate": 9.642246808060401e-05, + "loss": 1.6178, + "step": 14689 + }, + { + "epoch": 0.5260801833581034, + "grad_norm": 1.8668169975280762, + "learning_rate": 9.641087651677096e-05, + "loss": 1.6152, + "step": 14690 + }, + { + "epoch": 0.5261159954876716, + "grad_norm": 1.6442675590515137, + "learning_rate": 9.639928500122495e-05, + "loss": 1.8965, + "step": 14691 + }, + { + "epoch": 0.52615180761724, + "grad_norm": 1.996891975402832, + "learning_rate": 9.638769353412189e-05, + "loss": 1.8066, + "step": 14692 + }, + { + "epoch": 0.5261876197468083, + "grad_norm": 1.669157862663269, + "learning_rate": 9.637610211561779e-05, + "loss": 1.584, + "step": 14693 + }, + { + "epoch": 0.5262234318763765, + "grad_norm": 1.7696603536605835, + "learning_rate": 9.636451074586856e-05, + "loss": 1.5034, + "step": 14694 + }, + { + "epoch": 0.5262592440059448, + "grad_norm": 1.5234453678131104, + "learning_rate": 9.63529194250301e-05, + "loss": 1.4586, + "step": 14695 + }, + { + "epoch": 0.5262950561355131, + "grad_norm": 1.9366183280944824, + "learning_rate": 9.634132815325844e-05, + "loss": 1.4498, + "step": 14696 + }, + { + "epoch": 0.5263308682650814, + "grad_norm": 1.821183443069458, + "learning_rate": 9.632973693070947e-05, + "loss": 1.1892, + "step": 14697 + }, + { + "epoch": 0.5263666803946496, + "grad_norm": 1.601720929145813, + "learning_rate": 9.631814575753918e-05, + "loss": 1.59, + "step": 14698 + }, + { + "epoch": 0.526402492524218, + "grad_norm": 1.829098105430603, + "learning_rate": 9.630655463390347e-05, + "loss": 1.2141, + "step": 14699 + }, + { + "epoch": 0.5264383046537863, + "grad_norm": 1.2035573720932007, + "learning_rate": 9.629496355995831e-05, + "loss": 1.3387, + "step": 14700 + }, + { + "epoch": 0.5264741167833545, + "grad_norm": 1.415198802947998, + "learning_rate": 9.628337253585964e-05, + "loss": 1.3656, + "step": 14701 + }, + { + "epoch": 0.5265099289129228, + "grad_norm": 1.8165593147277832, + "learning_rate": 9.62717815617634e-05, + "loss": 1.5449, + "step": 14702 + }, + { + "epoch": 0.5265457410424911, + "grad_norm": 1.64618718624115, + "learning_rate": 9.626019063782557e-05, + "loss": 1.4761, + "step": 14703 + }, + { + "epoch": 0.5265815531720593, + "grad_norm": 1.2743486166000366, + "learning_rate": 9.624859976420196e-05, + "loss": 1.4644, + "step": 14704 + }, + { + "epoch": 0.5266173653016276, + "grad_norm": 1.5379201173782349, + "learning_rate": 9.623700894104869e-05, + "loss": 1.2605, + "step": 14705 + }, + { + "epoch": 0.526653177431196, + "grad_norm": 1.9712167978286743, + "learning_rate": 9.622541816852153e-05, + "loss": 1.3814, + "step": 14706 + }, + { + "epoch": 0.5266889895607643, + "grad_norm": 2.3892998695373535, + "learning_rate": 9.621382744677658e-05, + "loss": 1.8143, + "step": 14707 + }, + { + "epoch": 0.5267248016903325, + "grad_norm": 1.2501753568649292, + "learning_rate": 9.620223677596962e-05, + "loss": 1.4968, + "step": 14708 + }, + { + "epoch": 0.5267606138199008, + "grad_norm": 1.9475300312042236, + "learning_rate": 9.619064615625671e-05, + "loss": 1.5235, + "step": 14709 + }, + { + "epoch": 0.5267964259494691, + "grad_norm": 2.2302019596099854, + "learning_rate": 9.617905558779373e-05, + "loss": 1.6564, + "step": 14710 + }, + { + "epoch": 0.5268322380790373, + "grad_norm": 1.5667731761932373, + "learning_rate": 9.616746507073664e-05, + "loss": 1.4212, + "step": 14711 + }, + { + "epoch": 0.5268680502086056, + "grad_norm": 2.0810258388519287, + "learning_rate": 9.61558746052414e-05, + "loss": 1.8592, + "step": 14712 + }, + { + "epoch": 0.526903862338174, + "grad_norm": 1.1688623428344727, + "learning_rate": 9.614428419146381e-05, + "loss": 1.5327, + "step": 14713 + }, + { + "epoch": 0.5269396744677423, + "grad_norm": 2.2579002380371094, + "learning_rate": 9.613269382956e-05, + "loss": 1.5626, + "step": 14714 + }, + { + "epoch": 0.5269754865973105, + "grad_norm": 1.4683496952056885, + "learning_rate": 9.612110351968573e-05, + "loss": 1.3157, + "step": 14715 + }, + { + "epoch": 0.5270112987268788, + "grad_norm": 1.8713817596435547, + "learning_rate": 9.610951326199707e-05, + "loss": 1.4675, + "step": 14716 + }, + { + "epoch": 0.5270471108564471, + "grad_norm": 2.5871567726135254, + "learning_rate": 9.609792305664984e-05, + "loss": 1.8185, + "step": 14717 + }, + { + "epoch": 0.5270829229860153, + "grad_norm": 1.4792441129684448, + "learning_rate": 9.608633290380008e-05, + "loss": 1.4989, + "step": 14718 + }, + { + "epoch": 0.5271187351155836, + "grad_norm": 1.3156225681304932, + "learning_rate": 9.60747428036036e-05, + "loss": 1.3848, + "step": 14719 + }, + { + "epoch": 0.527154547245152, + "grad_norm": 1.3840765953063965, + "learning_rate": 9.606315275621644e-05, + "loss": 1.5157, + "step": 14720 + }, + { + "epoch": 0.5271903593747203, + "grad_norm": 1.6863317489624023, + "learning_rate": 9.605156276179447e-05, + "loss": 1.5742, + "step": 14721 + }, + { + "epoch": 0.5272261715042885, + "grad_norm": 1.9276484251022339, + "learning_rate": 9.60399728204936e-05, + "loss": 1.7787, + "step": 14722 + }, + { + "epoch": 0.5272619836338568, + "grad_norm": 1.3478091955184937, + "learning_rate": 9.602838293246984e-05, + "loss": 1.5074, + "step": 14723 + }, + { + "epoch": 0.5272977957634251, + "grad_norm": 1.4894832372665405, + "learning_rate": 9.6016793097879e-05, + "loss": 1.3248, + "step": 14724 + }, + { + "epoch": 0.5273336078929933, + "grad_norm": 1.655637502670288, + "learning_rate": 9.600520331687713e-05, + "loss": 1.6261, + "step": 14725 + }, + { + "epoch": 0.5273694200225616, + "grad_norm": 1.4634572267532349, + "learning_rate": 9.599361358962005e-05, + "loss": 1.3915, + "step": 14726 + }, + { + "epoch": 0.52740523215213, + "grad_norm": 1.8374460935592651, + "learning_rate": 9.598202391626379e-05, + "loss": 1.5723, + "step": 14727 + }, + { + "epoch": 0.5274410442816982, + "grad_norm": 1.6577955484390259, + "learning_rate": 9.597043429696413e-05, + "loss": 1.5211, + "step": 14728 + }, + { + "epoch": 0.5274768564112665, + "grad_norm": 1.743890404701233, + "learning_rate": 9.595884473187716e-05, + "loss": 1.6837, + "step": 14729 + }, + { + "epoch": 0.5275126685408348, + "grad_norm": 1.3714507818222046, + "learning_rate": 9.594725522115871e-05, + "loss": 1.5259, + "step": 14730 + }, + { + "epoch": 0.527548480670403, + "grad_norm": 1.3609662055969238, + "learning_rate": 9.593566576496468e-05, + "loss": 1.5633, + "step": 14731 + }, + { + "epoch": 0.5275842927999713, + "grad_norm": 1.580083966255188, + "learning_rate": 9.592407636345104e-05, + "loss": 1.3267, + "step": 14732 + }, + { + "epoch": 0.5276201049295396, + "grad_norm": 1.6475521326065063, + "learning_rate": 9.591248701677368e-05, + "loss": 1.6335, + "step": 14733 + }, + { + "epoch": 0.527655917059108, + "grad_norm": 1.232351541519165, + "learning_rate": 9.590089772508856e-05, + "loss": 1.3328, + "step": 14734 + }, + { + "epoch": 0.5276917291886762, + "grad_norm": 1.7131110429763794, + "learning_rate": 9.588930848855152e-05, + "loss": 1.8763, + "step": 14735 + }, + { + "epoch": 0.5277275413182445, + "grad_norm": 1.3705060482025146, + "learning_rate": 9.58777193073186e-05, + "loss": 1.5272, + "step": 14736 + }, + { + "epoch": 0.5277633534478128, + "grad_norm": 2.4891092777252197, + "learning_rate": 9.58661301815456e-05, + "loss": 1.6995, + "step": 14737 + }, + { + "epoch": 0.527799165577381, + "grad_norm": 1.6959772109985352, + "learning_rate": 9.585454111138853e-05, + "loss": 1.2758, + "step": 14738 + }, + { + "epoch": 0.5278349777069493, + "grad_norm": 1.470699667930603, + "learning_rate": 9.584295209700326e-05, + "loss": 1.6098, + "step": 14739 + }, + { + "epoch": 0.5278707898365176, + "grad_norm": 2.051999092102051, + "learning_rate": 9.583136313854567e-05, + "loss": 1.627, + "step": 14740 + }, + { + "epoch": 0.527906601966086, + "grad_norm": 1.461935043334961, + "learning_rate": 9.581977423617173e-05, + "loss": 1.0868, + "step": 14741 + }, + { + "epoch": 0.5279424140956542, + "grad_norm": 1.76284921169281, + "learning_rate": 9.580818539003733e-05, + "loss": 1.3957, + "step": 14742 + }, + { + "epoch": 0.5279782262252225, + "grad_norm": 1.7833982706069946, + "learning_rate": 9.579659660029841e-05, + "loss": 1.5588, + "step": 14743 + }, + { + "epoch": 0.5280140383547908, + "grad_norm": 1.697553038597107, + "learning_rate": 9.578500786711082e-05, + "loss": 1.6178, + "step": 14744 + }, + { + "epoch": 0.528049850484359, + "grad_norm": 2.319493055343628, + "learning_rate": 9.577341919063055e-05, + "loss": 1.2546, + "step": 14745 + }, + { + "epoch": 0.5280856626139273, + "grad_norm": 1.524808406829834, + "learning_rate": 9.576183057101345e-05, + "loss": 1.2757, + "step": 14746 + }, + { + "epoch": 0.5281214747434956, + "grad_norm": 1.9178670644760132, + "learning_rate": 9.575024200841547e-05, + "loss": 1.4959, + "step": 14747 + }, + { + "epoch": 0.528157286873064, + "grad_norm": 2.4134249687194824, + "learning_rate": 9.573865350299251e-05, + "loss": 1.5957, + "step": 14748 + }, + { + "epoch": 0.5281930990026322, + "grad_norm": 1.6927388906478882, + "learning_rate": 9.572706505490043e-05, + "loss": 1.6095, + "step": 14749 + }, + { + "epoch": 0.5282289111322005, + "grad_norm": 1.5404099225997925, + "learning_rate": 9.571547666429521e-05, + "loss": 1.338, + "step": 14750 + }, + { + "epoch": 0.5282647232617688, + "grad_norm": 1.3156019449234009, + "learning_rate": 9.57038883313327e-05, + "loss": 1.345, + "step": 14751 + }, + { + "epoch": 0.528300535391337, + "grad_norm": 1.3669182062149048, + "learning_rate": 9.569230005616887e-05, + "loss": 1.6818, + "step": 14752 + }, + { + "epoch": 0.5283363475209053, + "grad_norm": 1.1214356422424316, + "learning_rate": 9.568071183895954e-05, + "loss": 1.3239, + "step": 14753 + }, + { + "epoch": 0.5283721596504736, + "grad_norm": 2.038661479949951, + "learning_rate": 9.56691236798607e-05, + "loss": 1.3369, + "step": 14754 + }, + { + "epoch": 0.528407971780042, + "grad_norm": 2.319150686264038, + "learning_rate": 9.565753557902818e-05, + "loss": 1.7293, + "step": 14755 + }, + { + "epoch": 0.5284437839096102, + "grad_norm": 1.3403654098510742, + "learning_rate": 9.564594753661796e-05, + "loss": 1.5201, + "step": 14756 + }, + { + "epoch": 0.5284795960391785, + "grad_norm": 1.679591178894043, + "learning_rate": 9.563435955278587e-05, + "loss": 1.3043, + "step": 14757 + }, + { + "epoch": 0.5285154081687468, + "grad_norm": 1.7903519868850708, + "learning_rate": 9.562277162768785e-05, + "loss": 1.5777, + "step": 14758 + }, + { + "epoch": 0.528551220298315, + "grad_norm": 1.7412021160125732, + "learning_rate": 9.561118376147979e-05, + "loss": 1.7425, + "step": 14759 + }, + { + "epoch": 0.5285870324278833, + "grad_norm": 1.7483241558074951, + "learning_rate": 9.559959595431758e-05, + "loss": 1.6111, + "step": 14760 + }, + { + "epoch": 0.5286228445574516, + "grad_norm": 1.5660126209259033, + "learning_rate": 9.558800820635715e-05, + "loss": 1.5506, + "step": 14761 + }, + { + "epoch": 0.52865865668702, + "grad_norm": 1.5264723300933838, + "learning_rate": 9.557642051775436e-05, + "loss": 1.4398, + "step": 14762 + }, + { + "epoch": 0.5286944688165882, + "grad_norm": 1.40674889087677, + "learning_rate": 9.556483288866515e-05, + "loss": 1.2538, + "step": 14763 + }, + { + "epoch": 0.5287302809461565, + "grad_norm": 1.4413470029830933, + "learning_rate": 9.555324531924536e-05, + "loss": 1.5525, + "step": 14764 + }, + { + "epoch": 0.5287660930757248, + "grad_norm": 1.8058099746704102, + "learning_rate": 9.554165780965095e-05, + "loss": 1.6483, + "step": 14765 + }, + { + "epoch": 0.528801905205293, + "grad_norm": 1.4641554355621338, + "learning_rate": 9.553007036003777e-05, + "loss": 1.4808, + "step": 14766 + }, + { + "epoch": 0.5288377173348613, + "grad_norm": 1.9598422050476074, + "learning_rate": 9.551848297056171e-05, + "loss": 1.3693, + "step": 14767 + }, + { + "epoch": 0.5288735294644296, + "grad_norm": 1.3409147262573242, + "learning_rate": 9.550689564137872e-05, + "loss": 1.4255, + "step": 14768 + }, + { + "epoch": 0.5289093415939979, + "grad_norm": 1.9200905561447144, + "learning_rate": 9.54953083726446e-05, + "loss": 1.692, + "step": 14769 + }, + { + "epoch": 0.5289451537235662, + "grad_norm": 2.1611762046813965, + "learning_rate": 9.548372116451535e-05, + "loss": 1.7349, + "step": 14770 + }, + { + "epoch": 0.5289809658531345, + "grad_norm": 1.589970350265503, + "learning_rate": 9.547213401714677e-05, + "loss": 1.4199, + "step": 14771 + }, + { + "epoch": 0.5290167779827027, + "grad_norm": 1.5894039869308472, + "learning_rate": 9.546054693069481e-05, + "loss": 1.5671, + "step": 14772 + }, + { + "epoch": 0.529052590112271, + "grad_norm": 1.421728253364563, + "learning_rate": 9.544895990531532e-05, + "loss": 1.4939, + "step": 14773 + }, + { + "epoch": 0.5290884022418393, + "grad_norm": 1.5457122325897217, + "learning_rate": 9.54373729411642e-05, + "loss": 1.363, + "step": 14774 + }, + { + "epoch": 0.5291242143714076, + "grad_norm": 2.2558932304382324, + "learning_rate": 9.542578603839736e-05, + "loss": 1.6362, + "step": 14775 + }, + { + "epoch": 0.5291600265009759, + "grad_norm": 2.0703535079956055, + "learning_rate": 9.541419919717064e-05, + "loss": 1.2543, + "step": 14776 + }, + { + "epoch": 0.5291958386305442, + "grad_norm": 1.567732810974121, + "learning_rate": 9.540261241763999e-05, + "loss": 1.497, + "step": 14777 + }, + { + "epoch": 0.5292316507601125, + "grad_norm": 1.5150243043899536, + "learning_rate": 9.539102569996124e-05, + "loss": 1.4928, + "step": 14778 + }, + { + "epoch": 0.5292674628896807, + "grad_norm": 1.4762626886367798, + "learning_rate": 9.53794390442903e-05, + "loss": 1.4142, + "step": 14779 + }, + { + "epoch": 0.529303275019249, + "grad_norm": 1.325579285621643, + "learning_rate": 9.536785245078304e-05, + "loss": 1.1916, + "step": 14780 + }, + { + "epoch": 0.5293390871488173, + "grad_norm": 1.704073429107666, + "learning_rate": 9.535626591959536e-05, + "loss": 1.7114, + "step": 14781 + }, + { + "epoch": 0.5293748992783855, + "grad_norm": 1.6421252489089966, + "learning_rate": 9.534467945088313e-05, + "loss": 1.4885, + "step": 14782 + }, + { + "epoch": 0.5294107114079539, + "grad_norm": 1.4458218812942505, + "learning_rate": 9.533309304480221e-05, + "loss": 1.5306, + "step": 14783 + }, + { + "epoch": 0.5294465235375222, + "grad_norm": 1.281543254852295, + "learning_rate": 9.532150670150854e-05, + "loss": 1.3572, + "step": 14784 + }, + { + "epoch": 0.5294823356670905, + "grad_norm": 2.6501762866973877, + "learning_rate": 9.530992042115794e-05, + "loss": 1.604, + "step": 14785 + }, + { + "epoch": 0.5295181477966587, + "grad_norm": 1.6431045532226562, + "learning_rate": 9.529833420390631e-05, + "loss": 1.6053, + "step": 14786 + }, + { + "epoch": 0.529553959926227, + "grad_norm": 1.435895323753357, + "learning_rate": 9.528674804990954e-05, + "loss": 1.4593, + "step": 14787 + }, + { + "epoch": 0.5295897720557953, + "grad_norm": 1.460228443145752, + "learning_rate": 9.527516195932349e-05, + "loss": 1.246, + "step": 14788 + }, + { + "epoch": 0.5296255841853635, + "grad_norm": 1.7602965831756592, + "learning_rate": 9.526357593230403e-05, + "loss": 1.6471, + "step": 14789 + }, + { + "epoch": 0.5296613963149319, + "grad_norm": 2.1996421813964844, + "learning_rate": 9.525198996900707e-05, + "loss": 1.6656, + "step": 14790 + }, + { + "epoch": 0.5296972084445002, + "grad_norm": 3.2415080070495605, + "learning_rate": 9.524040406958847e-05, + "loss": 1.6736, + "step": 14791 + }, + { + "epoch": 0.5297330205740685, + "grad_norm": 1.3910014629364014, + "learning_rate": 9.522881823420404e-05, + "loss": 1.4791, + "step": 14792 + }, + { + "epoch": 0.5297688327036367, + "grad_norm": 1.5591599941253662, + "learning_rate": 9.521723246300977e-05, + "loss": 1.5115, + "step": 14793 + }, + { + "epoch": 0.529804644833205, + "grad_norm": 1.4825100898742676, + "learning_rate": 9.520564675616141e-05, + "loss": 1.675, + "step": 14794 + }, + { + "epoch": 0.5298404569627733, + "grad_norm": 1.701743245124817, + "learning_rate": 9.519406111381492e-05, + "loss": 1.409, + "step": 14795 + }, + { + "epoch": 0.5298762690923415, + "grad_norm": 1.5088096857070923, + "learning_rate": 9.518247553612613e-05, + "loss": 1.2716, + "step": 14796 + }, + { + "epoch": 0.5299120812219099, + "grad_norm": 1.9061812162399292, + "learning_rate": 9.517089002325093e-05, + "loss": 1.8451, + "step": 14797 + }, + { + "epoch": 0.5299478933514782, + "grad_norm": 1.4009358882904053, + "learning_rate": 9.515930457534514e-05, + "loss": 1.1994, + "step": 14798 + }, + { + "epoch": 0.5299837054810465, + "grad_norm": 1.6844242811203003, + "learning_rate": 9.514771919256472e-05, + "loss": 1.4261, + "step": 14799 + }, + { + "epoch": 0.5300195176106147, + "grad_norm": 1.8280789852142334, + "learning_rate": 9.513613387506547e-05, + "loss": 1.5539, + "step": 14800 + }, + { + "epoch": 0.530055329740183, + "grad_norm": 1.3354803323745728, + "learning_rate": 9.512454862300321e-05, + "loss": 1.4928, + "step": 14801 + }, + { + "epoch": 0.5300911418697513, + "grad_norm": 1.5250039100646973, + "learning_rate": 9.511296343653391e-05, + "loss": 1.094, + "step": 14802 + }, + { + "epoch": 0.5301269539993195, + "grad_norm": 2.810237169265747, + "learning_rate": 9.510137831581334e-05, + "loss": 1.7582, + "step": 14803 + }, + { + "epoch": 0.5301627661288879, + "grad_norm": 1.547545075416565, + "learning_rate": 9.508979326099747e-05, + "loss": 1.0475, + "step": 14804 + }, + { + "epoch": 0.5301985782584562, + "grad_norm": 2.176013469696045, + "learning_rate": 9.507820827224202e-05, + "loss": 1.4733, + "step": 14805 + }, + { + "epoch": 0.5302343903880244, + "grad_norm": 1.2221146821975708, + "learning_rate": 9.5066623349703e-05, + "loss": 1.1236, + "step": 14806 + }, + { + "epoch": 0.5302702025175927, + "grad_norm": 1.900315523147583, + "learning_rate": 9.505503849353613e-05, + "loss": 1.6962, + "step": 14807 + }, + { + "epoch": 0.530306014647161, + "grad_norm": 1.854783296585083, + "learning_rate": 9.504345370389739e-05, + "loss": 1.5795, + "step": 14808 + }, + { + "epoch": 0.5303418267767293, + "grad_norm": 1.4548566341400146, + "learning_rate": 9.50318689809426e-05, + "loss": 1.5435, + "step": 14809 + }, + { + "epoch": 0.5303776389062975, + "grad_norm": 1.689694881439209, + "learning_rate": 9.502028432482755e-05, + "loss": 1.5173, + "step": 14810 + }, + { + "epoch": 0.5304134510358659, + "grad_norm": 1.620489478111267, + "learning_rate": 9.50086997357082e-05, + "loss": 1.2642, + "step": 14811 + }, + { + "epoch": 0.5304492631654342, + "grad_norm": 2.3089258670806885, + "learning_rate": 9.499711521374031e-05, + "loss": 1.5737, + "step": 14812 + }, + { + "epoch": 0.5304850752950024, + "grad_norm": 2.3097901344299316, + "learning_rate": 9.498553075907985e-05, + "loss": 1.688, + "step": 14813 + }, + { + "epoch": 0.5305208874245707, + "grad_norm": 1.4224505424499512, + "learning_rate": 9.497394637188251e-05, + "loss": 1.4562, + "step": 14814 + }, + { + "epoch": 0.530556699554139, + "grad_norm": 1.8305726051330566, + "learning_rate": 9.496236205230433e-05, + "loss": 1.4744, + "step": 14815 + }, + { + "epoch": 0.5305925116837072, + "grad_norm": 1.4154525995254517, + "learning_rate": 9.4950777800501e-05, + "loss": 1.4215, + "step": 14816 + }, + { + "epoch": 0.5306283238132755, + "grad_norm": 1.8109185695648193, + "learning_rate": 9.49391936166285e-05, + "loss": 1.3802, + "step": 14817 + }, + { + "epoch": 0.5306641359428439, + "grad_norm": 1.5938786268234253, + "learning_rate": 9.492760950084261e-05, + "loss": 1.4896, + "step": 14818 + }, + { + "epoch": 0.5306999480724122, + "grad_norm": 1.4196759462356567, + "learning_rate": 9.491602545329916e-05, + "loss": 1.4166, + "step": 14819 + }, + { + "epoch": 0.5307357602019804, + "grad_norm": 1.5144906044006348, + "learning_rate": 9.490444147415407e-05, + "loss": 1.3838, + "step": 14820 + }, + { + "epoch": 0.5307715723315487, + "grad_norm": 2.2972259521484375, + "learning_rate": 9.489285756356307e-05, + "loss": 1.5379, + "step": 14821 + }, + { + "epoch": 0.530807384461117, + "grad_norm": 1.4997377395629883, + "learning_rate": 9.488127372168218e-05, + "loss": 1.4986, + "step": 14822 + }, + { + "epoch": 0.5308431965906852, + "grad_norm": 2.4222404956817627, + "learning_rate": 9.486968994866708e-05, + "loss": 1.652, + "step": 14823 + }, + { + "epoch": 0.5308790087202535, + "grad_norm": 1.8816639184951782, + "learning_rate": 9.485810624467372e-05, + "loss": 1.3654, + "step": 14824 + }, + { + "epoch": 0.5309148208498219, + "grad_norm": 1.8047064542770386, + "learning_rate": 9.484652260985787e-05, + "loss": 1.4498, + "step": 14825 + }, + { + "epoch": 0.5309506329793902, + "grad_norm": 1.329789638519287, + "learning_rate": 9.483493904437548e-05, + "loss": 1.3135, + "step": 14826 + }, + { + "epoch": 0.5309864451089584, + "grad_norm": 3.3496294021606445, + "learning_rate": 9.482335554838229e-05, + "loss": 1.1479, + "step": 14827 + }, + { + "epoch": 0.5310222572385267, + "grad_norm": 1.7045812606811523, + "learning_rate": 9.481177212203415e-05, + "loss": 1.8932, + "step": 14828 + }, + { + "epoch": 0.531058069368095, + "grad_norm": 1.3624588251113892, + "learning_rate": 9.480018876548695e-05, + "loss": 1.4705, + "step": 14829 + }, + { + "epoch": 0.5310938814976632, + "grad_norm": 1.6676069498062134, + "learning_rate": 9.478860547889647e-05, + "loss": 1.5551, + "step": 14830 + }, + { + "epoch": 0.5311296936272315, + "grad_norm": 1.7879300117492676, + "learning_rate": 9.477702226241862e-05, + "loss": 1.5219, + "step": 14831 + }, + { + "epoch": 0.5311655057567999, + "grad_norm": 1.5257636308670044, + "learning_rate": 9.476543911620918e-05, + "loss": 1.5649, + "step": 14832 + }, + { + "epoch": 0.5312013178863682, + "grad_norm": 2.001162052154541, + "learning_rate": 9.4753856040424e-05, + "loss": 1.549, + "step": 14833 + }, + { + "epoch": 0.5312371300159364, + "grad_norm": 2.1974189281463623, + "learning_rate": 9.47422730352189e-05, + "loss": 1.283, + "step": 14834 + }, + { + "epoch": 0.5312729421455047, + "grad_norm": 1.6734133958816528, + "learning_rate": 9.47306901007498e-05, + "loss": 1.4485, + "step": 14835 + }, + { + "epoch": 0.531308754275073, + "grad_norm": 1.9478635787963867, + "learning_rate": 9.471910723717243e-05, + "loss": 1.4408, + "step": 14836 + }, + { + "epoch": 0.5313445664046412, + "grad_norm": 2.0421502590179443, + "learning_rate": 9.470752444464265e-05, + "loss": 1.6513, + "step": 14837 + }, + { + "epoch": 0.5313803785342095, + "grad_norm": 1.3936954736709595, + "learning_rate": 9.469594172331631e-05, + "loss": 1.3181, + "step": 14838 + }, + { + "epoch": 0.5314161906637779, + "grad_norm": 1.9355803728103638, + "learning_rate": 9.468435907334922e-05, + "loss": 1.3821, + "step": 14839 + }, + { + "epoch": 0.5314520027933461, + "grad_norm": 1.3044540882110596, + "learning_rate": 9.467277649489725e-05, + "loss": 1.381, + "step": 14840 + }, + { + "epoch": 0.5314878149229144, + "grad_norm": 1.856377124786377, + "learning_rate": 9.466119398811617e-05, + "loss": 1.5137, + "step": 14841 + }, + { + "epoch": 0.5315236270524827, + "grad_norm": 1.3732273578643799, + "learning_rate": 9.464961155316187e-05, + "loss": 1.4637, + "step": 14842 + }, + { + "epoch": 0.531559439182051, + "grad_norm": 1.383651852607727, + "learning_rate": 9.463802919019011e-05, + "loss": 1.2581, + "step": 14843 + }, + { + "epoch": 0.5315952513116192, + "grad_norm": 2.1127378940582275, + "learning_rate": 9.462644689935678e-05, + "loss": 1.6943, + "step": 14844 + }, + { + "epoch": 0.5316310634411875, + "grad_norm": 1.6620867252349854, + "learning_rate": 9.461486468081768e-05, + "loss": 1.5948, + "step": 14845 + }, + { + "epoch": 0.5316668755707559, + "grad_norm": 1.6653188467025757, + "learning_rate": 9.460328253472859e-05, + "loss": 1.7102, + "step": 14846 + }, + { + "epoch": 0.5317026877003241, + "grad_norm": 1.5845062732696533, + "learning_rate": 9.459170046124542e-05, + "loss": 1.6149, + "step": 14847 + }, + { + "epoch": 0.5317384998298924, + "grad_norm": 1.3799561262130737, + "learning_rate": 9.458011846052391e-05, + "loss": 1.5361, + "step": 14848 + }, + { + "epoch": 0.5317743119594607, + "grad_norm": 2.455580949783325, + "learning_rate": 9.456853653271992e-05, + "loss": 1.7296, + "step": 14849 + }, + { + "epoch": 0.531810124089029, + "grad_norm": 1.4650472402572632, + "learning_rate": 9.455695467798927e-05, + "loss": 1.4852, + "step": 14850 + }, + { + "epoch": 0.5318459362185972, + "grad_norm": 1.7289412021636963, + "learning_rate": 9.454537289648779e-05, + "loss": 1.44, + "step": 14851 + }, + { + "epoch": 0.5318817483481655, + "grad_norm": 1.5133506059646606, + "learning_rate": 9.453379118837125e-05, + "loss": 1.3471, + "step": 14852 + }, + { + "epoch": 0.5319175604777338, + "grad_norm": 1.7192788124084473, + "learning_rate": 9.452220955379553e-05, + "loss": 1.6764, + "step": 14853 + }, + { + "epoch": 0.5319533726073021, + "grad_norm": 2.162766456604004, + "learning_rate": 9.45106279929164e-05, + "loss": 1.3744, + "step": 14854 + }, + { + "epoch": 0.5319891847368704, + "grad_norm": 1.7734508514404297, + "learning_rate": 9.449904650588968e-05, + "loss": 1.3988, + "step": 14855 + }, + { + "epoch": 0.5320249968664387, + "grad_norm": 1.7432608604431152, + "learning_rate": 9.44874650928712e-05, + "loss": 1.2843, + "step": 14856 + }, + { + "epoch": 0.5320608089960069, + "grad_norm": 1.3881850242614746, + "learning_rate": 9.447588375401676e-05, + "loss": 1.3026, + "step": 14857 + }, + { + "epoch": 0.5320966211255752, + "grad_norm": 1.451550006866455, + "learning_rate": 9.44643024894822e-05, + "loss": 1.2257, + "step": 14858 + }, + { + "epoch": 0.5321324332551435, + "grad_norm": 1.406640648841858, + "learning_rate": 9.445272129942329e-05, + "loss": 1.3289, + "step": 14859 + }, + { + "epoch": 0.5321682453847117, + "grad_norm": 1.8054431676864624, + "learning_rate": 9.444114018399588e-05, + "loss": 1.4196, + "step": 14860 + }, + { + "epoch": 0.5322040575142801, + "grad_norm": 1.7391663789749146, + "learning_rate": 9.442955914335573e-05, + "loss": 1.5092, + "step": 14861 + }, + { + "epoch": 0.5322398696438484, + "grad_norm": 1.8281937837600708, + "learning_rate": 9.441797817765869e-05, + "loss": 1.2757, + "step": 14862 + }, + { + "epoch": 0.5322756817734167, + "grad_norm": 1.541002869606018, + "learning_rate": 9.440639728706058e-05, + "loss": 1.2199, + "step": 14863 + }, + { + "epoch": 0.5323114939029849, + "grad_norm": 1.8189629316329956, + "learning_rate": 9.439481647171714e-05, + "loss": 1.6036, + "step": 14864 + }, + { + "epoch": 0.5323473060325532, + "grad_norm": 1.3375712633132935, + "learning_rate": 9.438323573178424e-05, + "loss": 1.4157, + "step": 14865 + }, + { + "epoch": 0.5323831181621215, + "grad_norm": 2.0243418216705322, + "learning_rate": 9.437165506741764e-05, + "loss": 1.5781, + "step": 14866 + }, + { + "epoch": 0.5324189302916897, + "grad_norm": 1.824872612953186, + "learning_rate": 9.436007447877316e-05, + "loss": 1.1952, + "step": 14867 + }, + { + "epoch": 0.5324547424212581, + "grad_norm": 1.589608073234558, + "learning_rate": 9.43484939660066e-05, + "loss": 1.1883, + "step": 14868 + }, + { + "epoch": 0.5324905545508264, + "grad_norm": 1.530319333076477, + "learning_rate": 9.433691352927378e-05, + "loss": 1.3574, + "step": 14869 + }, + { + "epoch": 0.5325263666803947, + "grad_norm": 1.7386776208877563, + "learning_rate": 9.43253331687305e-05, + "loss": 1.5034, + "step": 14870 + }, + { + "epoch": 0.5325621788099629, + "grad_norm": 1.9396528005599976, + "learning_rate": 9.43137528845325e-05, + "loss": 1.5202, + "step": 14871 + }, + { + "epoch": 0.5325979909395312, + "grad_norm": 1.8094663619995117, + "learning_rate": 9.430217267683566e-05, + "loss": 1.7822, + "step": 14872 + }, + { + "epoch": 0.5326338030690995, + "grad_norm": 1.1985933780670166, + "learning_rate": 9.42905925457957e-05, + "loss": 1.4856, + "step": 14873 + }, + { + "epoch": 0.5326696151986677, + "grad_norm": 1.9986737966537476, + "learning_rate": 9.427901249156847e-05, + "loss": 1.5435, + "step": 14874 + }, + { + "epoch": 0.5327054273282361, + "grad_norm": 2.1009483337402344, + "learning_rate": 9.426743251430974e-05, + "loss": 1.5721, + "step": 14875 + }, + { + "epoch": 0.5327412394578044, + "grad_norm": 1.6523008346557617, + "learning_rate": 9.425585261417533e-05, + "loss": 1.4228, + "step": 14876 + }, + { + "epoch": 0.5327770515873727, + "grad_norm": 1.8854312896728516, + "learning_rate": 9.424427279132099e-05, + "loss": 1.4939, + "step": 14877 + }, + { + "epoch": 0.5328128637169409, + "grad_norm": 2.156968355178833, + "learning_rate": 9.423269304590256e-05, + "loss": 0.997, + "step": 14878 + }, + { + "epoch": 0.5328486758465092, + "grad_norm": 1.7135142087936401, + "learning_rate": 9.42211133780758e-05, + "loss": 1.5855, + "step": 14879 + }, + { + "epoch": 0.5328844879760775, + "grad_norm": 1.341581106185913, + "learning_rate": 9.420953378799649e-05, + "loss": 1.3719, + "step": 14880 + }, + { + "epoch": 0.5329203001056457, + "grad_norm": 1.8618860244750977, + "learning_rate": 9.419795427582044e-05, + "loss": 1.3623, + "step": 14881 + }, + { + "epoch": 0.5329561122352141, + "grad_norm": 2.2615368366241455, + "learning_rate": 9.418637484170344e-05, + "loss": 1.4273, + "step": 14882 + }, + { + "epoch": 0.5329919243647824, + "grad_norm": 1.9147205352783203, + "learning_rate": 9.417479548580126e-05, + "loss": 1.5137, + "step": 14883 + }, + { + "epoch": 0.5330277364943506, + "grad_norm": 1.4778187274932861, + "learning_rate": 9.416321620826968e-05, + "loss": 1.8574, + "step": 14884 + }, + { + "epoch": 0.5330635486239189, + "grad_norm": 1.726284384727478, + "learning_rate": 9.415163700926451e-05, + "loss": 1.362, + "step": 14885 + }, + { + "epoch": 0.5330993607534872, + "grad_norm": 2.3607828617095947, + "learning_rate": 9.414005788894151e-05, + "loss": 1.7774, + "step": 14886 + }, + { + "epoch": 0.5331351728830555, + "grad_norm": 1.7209479808807373, + "learning_rate": 9.41284788474565e-05, + "loss": 1.3326, + "step": 14887 + }, + { + "epoch": 0.5331709850126237, + "grad_norm": 1.455073356628418, + "learning_rate": 9.411689988496526e-05, + "loss": 1.7173, + "step": 14888 + }, + { + "epoch": 0.5332067971421921, + "grad_norm": 1.9332654476165771, + "learning_rate": 9.410532100162344e-05, + "loss": 1.4577, + "step": 14889 + }, + { + "epoch": 0.5332426092717604, + "grad_norm": 1.50914466381073, + "learning_rate": 9.409374219758702e-05, + "loss": 1.2237, + "step": 14890 + }, + { + "epoch": 0.5332784214013286, + "grad_norm": 1.5229016542434692, + "learning_rate": 9.408216347301161e-05, + "loss": 1.5624, + "step": 14891 + }, + { + "epoch": 0.5333142335308969, + "grad_norm": 2.082638740539551, + "learning_rate": 9.40705848280531e-05, + "loss": 1.3889, + "step": 14892 + }, + { + "epoch": 0.5333500456604652, + "grad_norm": 1.3961759805679321, + "learning_rate": 9.40590062628672e-05, + "loss": 1.5499, + "step": 14893 + }, + { + "epoch": 0.5333858577900334, + "grad_norm": 1.6336990594863892, + "learning_rate": 9.404742777760974e-05, + "loss": 1.3981, + "step": 14894 + }, + { + "epoch": 0.5334216699196017, + "grad_norm": 1.8657147884368896, + "learning_rate": 9.403584937243642e-05, + "loss": 1.5618, + "step": 14895 + }, + { + "epoch": 0.5334574820491701, + "grad_norm": 2.1878414154052734, + "learning_rate": 9.402427104750308e-05, + "loss": 1.4487, + "step": 14896 + }, + { + "epoch": 0.5334932941787384, + "grad_norm": 1.487822413444519, + "learning_rate": 9.401269280296549e-05, + "loss": 1.2005, + "step": 14897 + }, + { + "epoch": 0.5335291063083066, + "grad_norm": 1.6004787683486938, + "learning_rate": 9.400111463897932e-05, + "loss": 1.5498, + "step": 14898 + }, + { + "epoch": 0.5335649184378749, + "grad_norm": 1.8500304222106934, + "learning_rate": 9.39895365557005e-05, + "loss": 1.4665, + "step": 14899 + }, + { + "epoch": 0.5336007305674432, + "grad_norm": 1.6620436906814575, + "learning_rate": 9.397795855328464e-05, + "loss": 1.6153, + "step": 14900 + }, + { + "epoch": 0.5336365426970114, + "grad_norm": 1.6935945749282837, + "learning_rate": 9.396638063188764e-05, + "loss": 1.6425, + "step": 14901 + }, + { + "epoch": 0.5336723548265797, + "grad_norm": 1.664432168006897, + "learning_rate": 9.395480279166514e-05, + "loss": 1.6335, + "step": 14902 + }, + { + "epoch": 0.5337081669561481, + "grad_norm": 2.122021436691284, + "learning_rate": 9.394322503277305e-05, + "loss": 1.3633, + "step": 14903 + }, + { + "epoch": 0.5337439790857164, + "grad_norm": 1.5068094730377197, + "learning_rate": 9.393164735536696e-05, + "loss": 1.3178, + "step": 14904 + }, + { + "epoch": 0.5337797912152846, + "grad_norm": 1.499644160270691, + "learning_rate": 9.39200697596028e-05, + "loss": 1.3545, + "step": 14905 + }, + { + "epoch": 0.5338156033448529, + "grad_norm": 1.8724576234817505, + "learning_rate": 9.390849224563627e-05, + "loss": 1.5587, + "step": 14906 + }, + { + "epoch": 0.5338514154744212, + "grad_norm": 2.1973133087158203, + "learning_rate": 9.389691481362304e-05, + "loss": 1.4083, + "step": 14907 + }, + { + "epoch": 0.5338872276039894, + "grad_norm": 2.883030414581299, + "learning_rate": 9.388533746371904e-05, + "loss": 1.3479, + "step": 14908 + }, + { + "epoch": 0.5339230397335577, + "grad_norm": 2.0845448970794678, + "learning_rate": 9.387376019607985e-05, + "loss": 1.248, + "step": 14909 + }, + { + "epoch": 0.5339588518631261, + "grad_norm": 1.4338700771331787, + "learning_rate": 9.386218301086139e-05, + "loss": 1.3079, + "step": 14910 + }, + { + "epoch": 0.5339946639926944, + "grad_norm": 2.109936475753784, + "learning_rate": 9.385060590821929e-05, + "loss": 1.4467, + "step": 14911 + }, + { + "epoch": 0.5340304761222626, + "grad_norm": 1.9435588121414185, + "learning_rate": 9.38390288883094e-05, + "loss": 1.5993, + "step": 14912 + }, + { + "epoch": 0.5340662882518309, + "grad_norm": 1.5473748445510864, + "learning_rate": 9.382745195128736e-05, + "loss": 1.3432, + "step": 14913 + }, + { + "epoch": 0.5341021003813992, + "grad_norm": 1.929419994354248, + "learning_rate": 9.381587509730907e-05, + "loss": 1.627, + "step": 14914 + }, + { + "epoch": 0.5341379125109674, + "grad_norm": 1.4407709836959839, + "learning_rate": 9.380429832653017e-05, + "loss": 1.2589, + "step": 14915 + }, + { + "epoch": 0.5341737246405357, + "grad_norm": 1.594947099685669, + "learning_rate": 9.379272163910643e-05, + "loss": 1.3406, + "step": 14916 + }, + { + "epoch": 0.5342095367701041, + "grad_norm": 1.8034594058990479, + "learning_rate": 9.378114503519364e-05, + "loss": 1.2251, + "step": 14917 + }, + { + "epoch": 0.5342453488996723, + "grad_norm": 1.5109189748764038, + "learning_rate": 9.376956851494747e-05, + "loss": 1.5412, + "step": 14918 + }, + { + "epoch": 0.5342811610292406, + "grad_norm": 1.4823503494262695, + "learning_rate": 9.375799207852379e-05, + "loss": 1.5642, + "step": 14919 + }, + { + "epoch": 0.5343169731588089, + "grad_norm": 1.2817720174789429, + "learning_rate": 9.37464157260782e-05, + "loss": 1.4638, + "step": 14920 + }, + { + "epoch": 0.5343527852883772, + "grad_norm": 1.9059113264083862, + "learning_rate": 9.37348394577666e-05, + "loss": 1.6068, + "step": 14921 + }, + { + "epoch": 0.5343885974179454, + "grad_norm": 1.876328945159912, + "learning_rate": 9.372326327374459e-05, + "loss": 1.377, + "step": 14922 + }, + { + "epoch": 0.5344244095475137, + "grad_norm": 2.0040924549102783, + "learning_rate": 9.371168717416803e-05, + "loss": 1.3396, + "step": 14923 + }, + { + "epoch": 0.5344602216770821, + "grad_norm": 2.007840156555176, + "learning_rate": 9.370011115919258e-05, + "loss": 1.6018, + "step": 14924 + }, + { + "epoch": 0.5344960338066503, + "grad_norm": 1.4200763702392578, + "learning_rate": 9.368853522897399e-05, + "loss": 1.435, + "step": 14925 + }, + { + "epoch": 0.5345318459362186, + "grad_norm": 1.4511983394622803, + "learning_rate": 9.367695938366805e-05, + "loss": 1.2733, + "step": 14926 + }, + { + "epoch": 0.5345676580657869, + "grad_norm": 1.3073828220367432, + "learning_rate": 9.366538362343043e-05, + "loss": 1.504, + "step": 14927 + }, + { + "epoch": 0.5346034701953551, + "grad_norm": 1.3790490627288818, + "learning_rate": 9.365380794841694e-05, + "loss": 1.4996, + "step": 14928 + }, + { + "epoch": 0.5346392823249234, + "grad_norm": 2.486955165863037, + "learning_rate": 9.364223235878324e-05, + "loss": 1.5752, + "step": 14929 + }, + { + "epoch": 0.5346750944544917, + "grad_norm": 2.4846272468566895, + "learning_rate": 9.363065685468514e-05, + "loss": 1.585, + "step": 14930 + }, + { + "epoch": 0.5347109065840601, + "grad_norm": 1.5754536390304565, + "learning_rate": 9.361908143627829e-05, + "loss": 1.3481, + "step": 14931 + }, + { + "epoch": 0.5347467187136283, + "grad_norm": 1.701556921005249, + "learning_rate": 9.360750610371852e-05, + "loss": 2.0244, + "step": 14932 + }, + { + "epoch": 0.5347825308431966, + "grad_norm": 1.8549612760543823, + "learning_rate": 9.35959308571615e-05, + "loss": 1.4935, + "step": 14933 + }, + { + "epoch": 0.5348183429727649, + "grad_norm": 1.695146918296814, + "learning_rate": 9.358435569676295e-05, + "loss": 1.3858, + "step": 14934 + }, + { + "epoch": 0.5348541551023331, + "grad_norm": 1.631511926651001, + "learning_rate": 9.357278062267863e-05, + "loss": 1.59, + "step": 14935 + }, + { + "epoch": 0.5348899672319014, + "grad_norm": 2.117126703262329, + "learning_rate": 9.356120563506424e-05, + "loss": 1.409, + "step": 14936 + }, + { + "epoch": 0.5349257793614697, + "grad_norm": 1.4399936199188232, + "learning_rate": 9.354963073407555e-05, + "loss": 1.6277, + "step": 14937 + }, + { + "epoch": 0.5349615914910381, + "grad_norm": 1.9248534440994263, + "learning_rate": 9.353805591986822e-05, + "loss": 1.2877, + "step": 14938 + }, + { + "epoch": 0.5349974036206063, + "grad_norm": 1.6802445650100708, + "learning_rate": 9.352648119259804e-05, + "loss": 1.3423, + "step": 14939 + }, + { + "epoch": 0.5350332157501746, + "grad_norm": 1.4071083068847656, + "learning_rate": 9.35149065524207e-05, + "loss": 1.1726, + "step": 14940 + }, + { + "epoch": 0.5350690278797429, + "grad_norm": 2.1739704608917236, + "learning_rate": 9.350333199949193e-05, + "loss": 1.7501, + "step": 14941 + }, + { + "epoch": 0.5351048400093111, + "grad_norm": 2.068229913711548, + "learning_rate": 9.349175753396746e-05, + "loss": 1.7553, + "step": 14942 + }, + { + "epoch": 0.5351406521388794, + "grad_norm": 2.328054666519165, + "learning_rate": 9.348018315600297e-05, + "loss": 1.5384, + "step": 14943 + }, + { + "epoch": 0.5351764642684477, + "grad_norm": 1.6056578159332275, + "learning_rate": 9.346860886575422e-05, + "loss": 1.6585, + "step": 14944 + }, + { + "epoch": 0.535212276398016, + "grad_norm": 1.6351563930511475, + "learning_rate": 9.34570346633769e-05, + "loss": 1.5569, + "step": 14945 + }, + { + "epoch": 0.5352480885275843, + "grad_norm": 1.8765473365783691, + "learning_rate": 9.344546054902677e-05, + "loss": 1.6506, + "step": 14946 + }, + { + "epoch": 0.5352839006571526, + "grad_norm": 2.3817811012268066, + "learning_rate": 9.343388652285947e-05, + "loss": 1.5692, + "step": 14947 + }, + { + "epoch": 0.5353197127867209, + "grad_norm": 2.1171789169311523, + "learning_rate": 9.342231258503079e-05, + "loss": 1.9199, + "step": 14948 + }, + { + "epoch": 0.5353555249162891, + "grad_norm": 1.4918116331100464, + "learning_rate": 9.34107387356964e-05, + "loss": 1.3828, + "step": 14949 + }, + { + "epoch": 0.5353913370458574, + "grad_norm": 1.5187290906906128, + "learning_rate": 9.339916497501202e-05, + "loss": 1.5775, + "step": 14950 + }, + { + "epoch": 0.5354271491754257, + "grad_norm": 1.756986379623413, + "learning_rate": 9.338759130313338e-05, + "loss": 1.7123, + "step": 14951 + }, + { + "epoch": 0.535462961304994, + "grad_norm": 1.6845223903656006, + "learning_rate": 9.337601772021612e-05, + "loss": 1.1958, + "step": 14952 + }, + { + "epoch": 0.5354987734345623, + "grad_norm": 1.5527392625808716, + "learning_rate": 9.336444422641605e-05, + "loss": 1.4844, + "step": 14953 + }, + { + "epoch": 0.5355345855641306, + "grad_norm": 1.50359046459198, + "learning_rate": 9.335287082188878e-05, + "loss": 1.5718, + "step": 14954 + }, + { + "epoch": 0.5355703976936989, + "grad_norm": 1.854921579360962, + "learning_rate": 9.334129750679009e-05, + "loss": 1.6565, + "step": 14955 + }, + { + "epoch": 0.5356062098232671, + "grad_norm": 2.034636974334717, + "learning_rate": 9.332972428127563e-05, + "loss": 1.3642, + "step": 14956 + }, + { + "epoch": 0.5356420219528354, + "grad_norm": 1.7532157897949219, + "learning_rate": 9.331815114550115e-05, + "loss": 1.5281, + "step": 14957 + }, + { + "epoch": 0.5356778340824037, + "grad_norm": 1.6303654909133911, + "learning_rate": 9.330657809962231e-05, + "loss": 1.2697, + "step": 14958 + }, + { + "epoch": 0.535713646211972, + "grad_norm": 1.6121984720230103, + "learning_rate": 9.329500514379485e-05, + "loss": 1.4018, + "step": 14959 + }, + { + "epoch": 0.5357494583415403, + "grad_norm": 1.9481133222579956, + "learning_rate": 9.328343227817443e-05, + "loss": 1.8712, + "step": 14960 + }, + { + "epoch": 0.5357852704711086, + "grad_norm": 2.3141913414001465, + "learning_rate": 9.327185950291676e-05, + "loss": 1.7397, + "step": 14961 + }, + { + "epoch": 0.5358210826006768, + "grad_norm": 2.181881904602051, + "learning_rate": 9.326028681817755e-05, + "loss": 1.4634, + "step": 14962 + }, + { + "epoch": 0.5358568947302451, + "grad_norm": 1.6960126161575317, + "learning_rate": 9.324871422411248e-05, + "loss": 1.4148, + "step": 14963 + }, + { + "epoch": 0.5358927068598134, + "grad_norm": 1.1461007595062256, + "learning_rate": 9.323714172087726e-05, + "loss": 1.4833, + "step": 14964 + }, + { + "epoch": 0.5359285189893817, + "grad_norm": 1.4815764427185059, + "learning_rate": 9.322556930862757e-05, + "loss": 1.3105, + "step": 14965 + }, + { + "epoch": 0.53596433111895, + "grad_norm": 1.6154530048370361, + "learning_rate": 9.321399698751912e-05, + "loss": 1.2409, + "step": 14966 + }, + { + "epoch": 0.5360001432485183, + "grad_norm": 1.841649055480957, + "learning_rate": 9.320242475770756e-05, + "loss": 1.3561, + "step": 14967 + }, + { + "epoch": 0.5360359553780866, + "grad_norm": 2.050635814666748, + "learning_rate": 9.319085261934864e-05, + "loss": 1.5387, + "step": 14968 + }, + { + "epoch": 0.5360717675076548, + "grad_norm": 1.3458319902420044, + "learning_rate": 9.317928057259799e-05, + "loss": 1.5461, + "step": 14969 + }, + { + "epoch": 0.5361075796372231, + "grad_norm": 1.581529974937439, + "learning_rate": 9.316770861761132e-05, + "loss": 1.6266, + "step": 14970 + }, + { + "epoch": 0.5361433917667914, + "grad_norm": 1.65879487991333, + "learning_rate": 9.315613675454435e-05, + "loss": 1.7114, + "step": 14971 + }, + { + "epoch": 0.5361792038963596, + "grad_norm": 1.7967455387115479, + "learning_rate": 9.314456498355269e-05, + "loss": 1.2497, + "step": 14972 + }, + { + "epoch": 0.536215016025928, + "grad_norm": 1.6957311630249023, + "learning_rate": 9.313299330479209e-05, + "loss": 1.3297, + "step": 14973 + }, + { + "epoch": 0.5362508281554963, + "grad_norm": 1.4996039867401123, + "learning_rate": 9.31214217184182e-05, + "loss": 1.5683, + "step": 14974 + }, + { + "epoch": 0.5362866402850646, + "grad_norm": 1.9086555242538452, + "learning_rate": 9.31098502245867e-05, + "loss": 1.507, + "step": 14975 + }, + { + "epoch": 0.5363224524146328, + "grad_norm": 1.6246367692947388, + "learning_rate": 9.30982788234533e-05, + "loss": 1.4698, + "step": 14976 + }, + { + "epoch": 0.5363582645442011, + "grad_norm": 1.8579511642456055, + "learning_rate": 9.308670751517363e-05, + "loss": 1.4766, + "step": 14977 + }, + { + "epoch": 0.5363940766737694, + "grad_norm": 2.250375270843506, + "learning_rate": 9.307513629990342e-05, + "loss": 1.3034, + "step": 14978 + }, + { + "epoch": 0.5364298888033376, + "grad_norm": 1.5072667598724365, + "learning_rate": 9.306356517779828e-05, + "loss": 1.5313, + "step": 14979 + }, + { + "epoch": 0.536465700932906, + "grad_norm": 3.7206661701202393, + "learning_rate": 9.305199414901397e-05, + "loss": 1.8558, + "step": 14980 + }, + { + "epoch": 0.5365015130624743, + "grad_norm": 1.4382824897766113, + "learning_rate": 9.304042321370607e-05, + "loss": 1.4454, + "step": 14981 + }, + { + "epoch": 0.5365373251920426, + "grad_norm": 2.4771111011505127, + "learning_rate": 9.302885237203034e-05, + "loss": 1.402, + "step": 14982 + }, + { + "epoch": 0.5365731373216108, + "grad_norm": 1.8501386642456055, + "learning_rate": 9.301728162414238e-05, + "loss": 1.2341, + "step": 14983 + }, + { + "epoch": 0.5366089494511791, + "grad_norm": 1.4732383489608765, + "learning_rate": 9.30057109701979e-05, + "loss": 1.5761, + "step": 14984 + }, + { + "epoch": 0.5366447615807474, + "grad_norm": 2.0961062908172607, + "learning_rate": 9.299414041035259e-05, + "loss": 1.7277, + "step": 14985 + }, + { + "epoch": 0.5366805737103156, + "grad_norm": 1.619267463684082, + "learning_rate": 9.298256994476202e-05, + "loss": 1.5175, + "step": 14986 + }, + { + "epoch": 0.536716385839884, + "grad_norm": 1.4270780086517334, + "learning_rate": 9.297099957358199e-05, + "loss": 1.695, + "step": 14987 + }, + { + "epoch": 0.5367521979694523, + "grad_norm": 1.7265422344207764, + "learning_rate": 9.295942929696801e-05, + "loss": 1.2141, + "step": 14988 + }, + { + "epoch": 0.5367880100990206, + "grad_norm": 2.4905295372009277, + "learning_rate": 9.294785911507589e-05, + "loss": 1.3067, + "step": 14989 + }, + { + "epoch": 0.5368238222285888, + "grad_norm": 1.9801867008209229, + "learning_rate": 9.29362890280612e-05, + "loss": 1.8363, + "step": 14990 + }, + { + "epoch": 0.5368596343581571, + "grad_norm": 1.467184066772461, + "learning_rate": 9.292471903607964e-05, + "loss": 1.6563, + "step": 14991 + }, + { + "epoch": 0.5368954464877254, + "grad_norm": 1.6070553064346313, + "learning_rate": 9.291314913928685e-05, + "loss": 1.35, + "step": 14992 + }, + { + "epoch": 0.5369312586172936, + "grad_norm": 1.391000747680664, + "learning_rate": 9.290157933783852e-05, + "loss": 1.2004, + "step": 14993 + }, + { + "epoch": 0.536967070746862, + "grad_norm": 1.8240994215011597, + "learning_rate": 9.28900096318903e-05, + "loss": 1.3666, + "step": 14994 + }, + { + "epoch": 0.5370028828764303, + "grad_norm": 1.497835636138916, + "learning_rate": 9.287844002159776e-05, + "loss": 1.73, + "step": 14995 + }, + { + "epoch": 0.5370386950059985, + "grad_norm": 1.8223732709884644, + "learning_rate": 9.286687050711668e-05, + "loss": 1.3505, + "step": 14996 + }, + { + "epoch": 0.5370745071355668, + "grad_norm": 1.5214042663574219, + "learning_rate": 9.285530108860262e-05, + "loss": 1.5422, + "step": 14997 + }, + { + "epoch": 0.5371103192651351, + "grad_norm": 1.55804443359375, + "learning_rate": 9.284373176621131e-05, + "loss": 1.5534, + "step": 14998 + }, + { + "epoch": 0.5371461313947034, + "grad_norm": 2.063025712966919, + "learning_rate": 9.28321625400983e-05, + "loss": 1.4235, + "step": 14999 + }, + { + "epoch": 0.5371819435242716, + "grad_norm": 1.5878913402557373, + "learning_rate": 9.282059341041936e-05, + "loss": 1.5036, + "step": 15000 + }, + { + "epoch": 0.53721775565384, + "grad_norm": 1.506630778312683, + "learning_rate": 9.280902437733003e-05, + "loss": 1.208, + "step": 15001 + }, + { + "epoch": 0.5372535677834083, + "grad_norm": 1.781821846961975, + "learning_rate": 9.279745544098602e-05, + "loss": 1.4643, + "step": 15002 + }, + { + "epoch": 0.5372893799129765, + "grad_norm": 1.9320119619369507, + "learning_rate": 9.278588660154298e-05, + "loss": 1.6491, + "step": 15003 + }, + { + "epoch": 0.5373251920425448, + "grad_norm": 1.846867322921753, + "learning_rate": 9.277431785915647e-05, + "loss": 1.3241, + "step": 15004 + }, + { + "epoch": 0.5373610041721131, + "grad_norm": 1.7389922142028809, + "learning_rate": 9.276274921398225e-05, + "loss": 1.5716, + "step": 15005 + }, + { + "epoch": 0.5373968163016813, + "grad_norm": 1.6828988790512085, + "learning_rate": 9.275118066617585e-05, + "loss": 1.5801, + "step": 15006 + }, + { + "epoch": 0.5374326284312496, + "grad_norm": 1.8095980882644653, + "learning_rate": 9.273961221589303e-05, + "loss": 1.3198, + "step": 15007 + }, + { + "epoch": 0.537468440560818, + "grad_norm": 1.7171796560287476, + "learning_rate": 9.27280438632893e-05, + "loss": 1.4818, + "step": 15008 + }, + { + "epoch": 0.5375042526903863, + "grad_norm": 1.8513648509979248, + "learning_rate": 9.271647560852042e-05, + "loss": 1.6135, + "step": 15009 + }, + { + "epoch": 0.5375400648199545, + "grad_norm": 1.5115734338760376, + "learning_rate": 9.27049074517419e-05, + "loss": 1.6142, + "step": 15010 + }, + { + "epoch": 0.5375758769495228, + "grad_norm": 2.058396100997925, + "learning_rate": 9.26933393931095e-05, + "loss": 1.4059, + "step": 15011 + }, + { + "epoch": 0.5376116890790911, + "grad_norm": 1.8577311038970947, + "learning_rate": 9.268177143277877e-05, + "loss": 1.5906, + "step": 15012 + }, + { + "epoch": 0.5376475012086593, + "grad_norm": 1.5660582780838013, + "learning_rate": 9.267020357090535e-05, + "loss": 1.4381, + "step": 15013 + }, + { + "epoch": 0.5376833133382276, + "grad_norm": 1.3787236213684082, + "learning_rate": 9.265863580764492e-05, + "loss": 1.4798, + "step": 15014 + }, + { + "epoch": 0.537719125467796, + "grad_norm": 2.0579323768615723, + "learning_rate": 9.264706814315302e-05, + "loss": 1.571, + "step": 15015 + }, + { + "epoch": 0.5377549375973643, + "grad_norm": 1.3962444067001343, + "learning_rate": 9.263550057758539e-05, + "loss": 1.59, + "step": 15016 + }, + { + "epoch": 0.5377907497269325, + "grad_norm": 1.7206778526306152, + "learning_rate": 9.262393311109754e-05, + "loss": 1.3043, + "step": 15017 + }, + { + "epoch": 0.5378265618565008, + "grad_norm": 1.2327122688293457, + "learning_rate": 9.261236574384523e-05, + "loss": 1.4594, + "step": 15018 + }, + { + "epoch": 0.5378623739860691, + "grad_norm": 2.1515252590179443, + "learning_rate": 9.260079847598393e-05, + "loss": 1.4466, + "step": 15019 + }, + { + "epoch": 0.5378981861156373, + "grad_norm": 1.5963287353515625, + "learning_rate": 9.258923130766942e-05, + "loss": 1.4193, + "step": 15020 + }, + { + "epoch": 0.5379339982452056, + "grad_norm": 1.5895057916641235, + "learning_rate": 9.257766423905722e-05, + "loss": 1.724, + "step": 15021 + }, + { + "epoch": 0.537969810374774, + "grad_norm": 1.5199759006500244, + "learning_rate": 9.256609727030294e-05, + "loss": 1.645, + "step": 15022 + }, + { + "epoch": 0.5380056225043423, + "grad_norm": 1.6282600164413452, + "learning_rate": 9.255453040156228e-05, + "loss": 1.43, + "step": 15023 + }, + { + "epoch": 0.5380414346339105, + "grad_norm": 1.6002341508865356, + "learning_rate": 9.254296363299077e-05, + "loss": 1.4412, + "step": 15024 + }, + { + "epoch": 0.5380772467634788, + "grad_norm": 2.265721559524536, + "learning_rate": 9.253139696474409e-05, + "loss": 1.5447, + "step": 15025 + }, + { + "epoch": 0.5381130588930471, + "grad_norm": 3.0661771297454834, + "learning_rate": 9.25198303969778e-05, + "loss": 1.6547, + "step": 15026 + }, + { + "epoch": 0.5381488710226153, + "grad_norm": 2.4014828205108643, + "learning_rate": 9.250826392984757e-05, + "loss": 1.3629, + "step": 15027 + }, + { + "epoch": 0.5381846831521836, + "grad_norm": 2.069535732269287, + "learning_rate": 9.249669756350894e-05, + "loss": 1.6487, + "step": 15028 + }, + { + "epoch": 0.538220495281752, + "grad_norm": 1.3042799234390259, + "learning_rate": 9.248513129811765e-05, + "loss": 1.197, + "step": 15029 + }, + { + "epoch": 0.5382563074113202, + "grad_norm": 1.619163990020752, + "learning_rate": 9.247356513382917e-05, + "loss": 1.5524, + "step": 15030 + }, + { + "epoch": 0.5382921195408885, + "grad_norm": 1.8219351768493652, + "learning_rate": 9.246199907079916e-05, + "loss": 1.4046, + "step": 15031 + }, + { + "epoch": 0.5383279316704568, + "grad_norm": 1.6462182998657227, + "learning_rate": 9.245043310918325e-05, + "loss": 1.5918, + "step": 15032 + }, + { + "epoch": 0.538363743800025, + "grad_norm": 2.164386034011841, + "learning_rate": 9.2438867249137e-05, + "loss": 1.681, + "step": 15033 + }, + { + "epoch": 0.5383995559295933, + "grad_norm": 1.3424277305603027, + "learning_rate": 9.242730149081606e-05, + "loss": 1.6123, + "step": 15034 + }, + { + "epoch": 0.5384353680591616, + "grad_norm": 1.6956967115402222, + "learning_rate": 9.241573583437599e-05, + "loss": 1.6094, + "step": 15035 + }, + { + "epoch": 0.53847118018873, + "grad_norm": 1.7979881763458252, + "learning_rate": 9.240417027997243e-05, + "loss": 1.4714, + "step": 15036 + }, + { + "epoch": 0.5385069923182982, + "grad_norm": 1.8987326622009277, + "learning_rate": 9.239260482776096e-05, + "loss": 1.5237, + "step": 15037 + }, + { + "epoch": 0.5385428044478665, + "grad_norm": 1.9217215776443481, + "learning_rate": 9.238103947789718e-05, + "loss": 1.458, + "step": 15038 + }, + { + "epoch": 0.5385786165774348, + "grad_norm": 2.3088903427124023, + "learning_rate": 9.236947423053669e-05, + "loss": 1.6509, + "step": 15039 + }, + { + "epoch": 0.538614428707003, + "grad_norm": 1.4139968156814575, + "learning_rate": 9.235790908583506e-05, + "loss": 1.3816, + "step": 15040 + }, + { + "epoch": 0.5386502408365713, + "grad_norm": 2.1365206241607666, + "learning_rate": 9.234634404394793e-05, + "loss": 1.4619, + "step": 15041 + }, + { + "epoch": 0.5386860529661396, + "grad_norm": 2.3481082916259766, + "learning_rate": 9.233477910503083e-05, + "loss": 1.5677, + "step": 15042 + }, + { + "epoch": 0.538721865095708, + "grad_norm": 2.0316877365112305, + "learning_rate": 9.232321426923943e-05, + "loss": 1.3205, + "step": 15043 + }, + { + "epoch": 0.5387576772252762, + "grad_norm": 1.9657565355300903, + "learning_rate": 9.231164953672926e-05, + "loss": 1.6539, + "step": 15044 + }, + { + "epoch": 0.5387934893548445, + "grad_norm": 1.8171610832214355, + "learning_rate": 9.230008490765593e-05, + "loss": 1.1466, + "step": 15045 + }, + { + "epoch": 0.5388293014844128, + "grad_norm": 1.1965806484222412, + "learning_rate": 9.228852038217502e-05, + "loss": 1.2931, + "step": 15046 + }, + { + "epoch": 0.538865113613981, + "grad_norm": 1.6096956729888916, + "learning_rate": 9.227695596044215e-05, + "loss": 1.4272, + "step": 15047 + }, + { + "epoch": 0.5389009257435493, + "grad_norm": 2.070911407470703, + "learning_rate": 9.226539164261286e-05, + "loss": 1.3034, + "step": 15048 + }, + { + "epoch": 0.5389367378731176, + "grad_norm": 2.4568874835968018, + "learning_rate": 9.225382742884273e-05, + "loss": 1.4975, + "step": 15049 + }, + { + "epoch": 0.538972550002686, + "grad_norm": 1.6348614692687988, + "learning_rate": 9.224226331928738e-05, + "loss": 1.5438, + "step": 15050 + }, + { + "epoch": 0.5390083621322542, + "grad_norm": 1.6172714233398438, + "learning_rate": 9.223069931410236e-05, + "loss": 1.3941, + "step": 15051 + }, + { + "epoch": 0.5390441742618225, + "grad_norm": 1.6091904640197754, + "learning_rate": 9.221913541344327e-05, + "loss": 1.4381, + "step": 15052 + }, + { + "epoch": 0.5390799863913908, + "grad_norm": 1.614936351776123, + "learning_rate": 9.220757161746566e-05, + "loss": 1.5637, + "step": 15053 + }, + { + "epoch": 0.539115798520959, + "grad_norm": 2.01474928855896, + "learning_rate": 9.219600792632513e-05, + "loss": 1.4325, + "step": 15054 + }, + { + "epoch": 0.5391516106505273, + "grad_norm": 1.680156946182251, + "learning_rate": 9.218444434017724e-05, + "loss": 1.6222, + "step": 15055 + }, + { + "epoch": 0.5391874227800956, + "grad_norm": 1.7810890674591064, + "learning_rate": 9.217288085917759e-05, + "loss": 1.4141, + "step": 15056 + }, + { + "epoch": 0.539223234909664, + "grad_norm": 1.8147783279418945, + "learning_rate": 9.216131748348174e-05, + "loss": 1.6173, + "step": 15057 + }, + { + "epoch": 0.5392590470392322, + "grad_norm": 1.4602004289627075, + "learning_rate": 9.21497542132452e-05, + "loss": 1.4772, + "step": 15058 + }, + { + "epoch": 0.5392948591688005, + "grad_norm": 1.867448329925537, + "learning_rate": 9.213819104862365e-05, + "loss": 1.3885, + "step": 15059 + }, + { + "epoch": 0.5393306712983688, + "grad_norm": 1.4362049102783203, + "learning_rate": 9.212662798977256e-05, + "loss": 1.5253, + "step": 15060 + }, + { + "epoch": 0.539366483427937, + "grad_norm": 1.3346989154815674, + "learning_rate": 9.211506503684755e-05, + "loss": 1.2639, + "step": 15061 + }, + { + "epoch": 0.5394022955575053, + "grad_norm": 1.570544958114624, + "learning_rate": 9.210350219000416e-05, + "loss": 1.6639, + "step": 15062 + }, + { + "epoch": 0.5394381076870736, + "grad_norm": 1.4797194004058838, + "learning_rate": 9.209193944939798e-05, + "loss": 1.4074, + "step": 15063 + }, + { + "epoch": 0.539473919816642, + "grad_norm": 1.966138243675232, + "learning_rate": 9.208037681518454e-05, + "loss": 1.7929, + "step": 15064 + }, + { + "epoch": 0.5395097319462102, + "grad_norm": 1.6170921325683594, + "learning_rate": 9.206881428751941e-05, + "loss": 1.2353, + "step": 15065 + }, + { + "epoch": 0.5395455440757785, + "grad_norm": 1.8537172079086304, + "learning_rate": 9.205725186655817e-05, + "loss": 1.466, + "step": 15066 + }, + { + "epoch": 0.5395813562053468, + "grad_norm": 1.2693291902542114, + "learning_rate": 9.204568955245634e-05, + "loss": 1.4357, + "step": 15067 + }, + { + "epoch": 0.539617168334915, + "grad_norm": 1.3635295629501343, + "learning_rate": 9.203412734536951e-05, + "loss": 1.4153, + "step": 15068 + }, + { + "epoch": 0.5396529804644833, + "grad_norm": 1.594772219657898, + "learning_rate": 9.202256524545322e-05, + "loss": 1.3883, + "step": 15069 + }, + { + "epoch": 0.5396887925940516, + "grad_norm": 1.3356691598892212, + "learning_rate": 9.201100325286302e-05, + "loss": 1.4835, + "step": 15070 + }, + { + "epoch": 0.5397246047236199, + "grad_norm": 2.0734238624572754, + "learning_rate": 9.199944136775446e-05, + "loss": 1.5278, + "step": 15071 + }, + { + "epoch": 0.5397604168531882, + "grad_norm": 1.533120036125183, + "learning_rate": 9.198787959028312e-05, + "loss": 1.536, + "step": 15072 + }, + { + "epoch": 0.5397962289827565, + "grad_norm": 1.554632544517517, + "learning_rate": 9.197631792060453e-05, + "loss": 1.365, + "step": 15073 + }, + { + "epoch": 0.5398320411123247, + "grad_norm": 1.5368367433547974, + "learning_rate": 9.196475635887419e-05, + "loss": 1.6686, + "step": 15074 + }, + { + "epoch": 0.539867853241893, + "grad_norm": 1.6258572340011597, + "learning_rate": 9.195319490524772e-05, + "loss": 1.3955, + "step": 15075 + }, + { + "epoch": 0.5399036653714613, + "grad_norm": 1.443794846534729, + "learning_rate": 9.194163355988062e-05, + "loss": 1.3147, + "step": 15076 + }, + { + "epoch": 0.5399394775010296, + "grad_norm": 1.916318655014038, + "learning_rate": 9.193007232292846e-05, + "loss": 1.7055, + "step": 15077 + }, + { + "epoch": 0.5399752896305979, + "grad_norm": 1.584694504737854, + "learning_rate": 9.191851119454675e-05, + "loss": 1.1801, + "step": 15078 + }, + { + "epoch": 0.5400111017601662, + "grad_norm": 1.3532837629318237, + "learning_rate": 9.190695017489106e-05, + "loss": 1.3885, + "step": 15079 + }, + { + "epoch": 0.5400469138897345, + "grad_norm": 2.9349710941314697, + "learning_rate": 9.18953892641169e-05, + "loss": 1.4769, + "step": 15080 + }, + { + "epoch": 0.5400827260193027, + "grad_norm": 2.0163779258728027, + "learning_rate": 9.188382846237984e-05, + "loss": 1.4724, + "step": 15081 + }, + { + "epoch": 0.540118538148871, + "grad_norm": 1.6981626749038696, + "learning_rate": 9.187226776983543e-05, + "loss": 1.676, + "step": 15082 + }, + { + "epoch": 0.5401543502784393, + "grad_norm": 1.884413719177246, + "learning_rate": 9.18607071866391e-05, + "loss": 1.7854, + "step": 15083 + }, + { + "epoch": 0.5401901624080075, + "grad_norm": 1.4751265048980713, + "learning_rate": 9.184914671294653e-05, + "loss": 1.3642, + "step": 15084 + }, + { + "epoch": 0.5402259745375759, + "grad_norm": 1.6983938217163086, + "learning_rate": 9.18375863489131e-05, + "loss": 1.3347, + "step": 15085 + }, + { + "epoch": 0.5402617866671442, + "grad_norm": 1.627390742301941, + "learning_rate": 9.182602609469448e-05, + "loss": 1.4281, + "step": 15086 + }, + { + "epoch": 0.5402975987967125, + "grad_norm": 1.8508156538009644, + "learning_rate": 9.18144659504461e-05, + "loss": 1.2141, + "step": 15087 + }, + { + "epoch": 0.5403334109262807, + "grad_norm": 1.926315188407898, + "learning_rate": 9.180290591632354e-05, + "loss": 1.5036, + "step": 15088 + }, + { + "epoch": 0.540369223055849, + "grad_norm": 1.5384167432785034, + "learning_rate": 9.179134599248228e-05, + "loss": 1.3299, + "step": 15089 + }, + { + "epoch": 0.5404050351854173, + "grad_norm": 1.4410181045532227, + "learning_rate": 9.177978617907791e-05, + "loss": 1.4095, + "step": 15090 + }, + { + "epoch": 0.5404408473149855, + "grad_norm": 1.6018924713134766, + "learning_rate": 9.176822647626593e-05, + "loss": 1.627, + "step": 15091 + }, + { + "epoch": 0.5404766594445539, + "grad_norm": 1.6720951795578003, + "learning_rate": 9.175666688420177e-05, + "loss": 1.5403, + "step": 15092 + }, + { + "epoch": 0.5405124715741222, + "grad_norm": 1.8459980487823486, + "learning_rate": 9.17451074030411e-05, + "loss": 1.4794, + "step": 15093 + }, + { + "epoch": 0.5405482837036905, + "grad_norm": 1.8089991807937622, + "learning_rate": 9.17335480329393e-05, + "loss": 1.6141, + "step": 15094 + }, + { + "epoch": 0.5405840958332587, + "grad_norm": 1.9303104877471924, + "learning_rate": 9.1721988774052e-05, + "loss": 1.248, + "step": 15095 + }, + { + "epoch": 0.540619907962827, + "grad_norm": 1.3829426765441895, + "learning_rate": 9.17104296265346e-05, + "loss": 1.2953, + "step": 15096 + }, + { + "epoch": 0.5406557200923953, + "grad_norm": 1.5334969758987427, + "learning_rate": 9.169887059054275e-05, + "loss": 1.4165, + "step": 15097 + }, + { + "epoch": 0.5406915322219635, + "grad_norm": 2.001737117767334, + "learning_rate": 9.168731166623182e-05, + "loss": 1.294, + "step": 15098 + }, + { + "epoch": 0.5407273443515319, + "grad_norm": 3.4213063716888428, + "learning_rate": 9.167575285375744e-05, + "loss": 1.5401, + "step": 15099 + }, + { + "epoch": 0.5407631564811002, + "grad_norm": 1.7544281482696533, + "learning_rate": 9.166419415327508e-05, + "loss": 1.4561, + "step": 15100 + }, + { + "epoch": 0.5407989686106685, + "grad_norm": 1.7456077337265015, + "learning_rate": 9.165263556494016e-05, + "loss": 1.1195, + "step": 15101 + }, + { + "epoch": 0.5408347807402367, + "grad_norm": 1.4219002723693848, + "learning_rate": 9.164107708890835e-05, + "loss": 1.1713, + "step": 15102 + }, + { + "epoch": 0.540870592869805, + "grad_norm": 1.255926489830017, + "learning_rate": 9.162951872533498e-05, + "loss": 1.3403, + "step": 15103 + }, + { + "epoch": 0.5409064049993733, + "grad_norm": 2.029594898223877, + "learning_rate": 9.161796047437572e-05, + "loss": 1.5172, + "step": 15104 + }, + { + "epoch": 0.5409422171289415, + "grad_norm": 1.3873099088668823, + "learning_rate": 9.160640233618591e-05, + "loss": 1.314, + "step": 15105 + }, + { + "epoch": 0.5409780292585099, + "grad_norm": 1.565388798713684, + "learning_rate": 9.15948443109212e-05, + "loss": 1.3721, + "step": 15106 + }, + { + "epoch": 0.5410138413880782, + "grad_norm": 1.4900970458984375, + "learning_rate": 9.158328639873695e-05, + "loss": 1.3958, + "step": 15107 + }, + { + "epoch": 0.5410496535176464, + "grad_norm": 1.506885051727295, + "learning_rate": 9.15717285997888e-05, + "loss": 1.3126, + "step": 15108 + }, + { + "epoch": 0.5410854656472147, + "grad_norm": 1.5706779956817627, + "learning_rate": 9.156017091423215e-05, + "loss": 1.4397, + "step": 15109 + }, + { + "epoch": 0.541121277776783, + "grad_norm": 1.3363198041915894, + "learning_rate": 9.154861334222248e-05, + "loss": 1.3809, + "step": 15110 + }, + { + "epoch": 0.5411570899063513, + "grad_norm": 1.466255784034729, + "learning_rate": 9.153705588391535e-05, + "loss": 1.0968, + "step": 15111 + }, + { + "epoch": 0.5411929020359195, + "grad_norm": 1.7009021043777466, + "learning_rate": 9.152549853946615e-05, + "loss": 1.1897, + "step": 15112 + }, + { + "epoch": 0.5412287141654879, + "grad_norm": 1.6660683155059814, + "learning_rate": 9.151394130903052e-05, + "loss": 1.5026, + "step": 15113 + }, + { + "epoch": 0.5412645262950562, + "grad_norm": 1.4293491840362549, + "learning_rate": 9.15023841927638e-05, + "loss": 1.7752, + "step": 15114 + }, + { + "epoch": 0.5413003384246244, + "grad_norm": 1.6292088031768799, + "learning_rate": 9.14908271908216e-05, + "loss": 1.4279, + "step": 15115 + }, + { + "epoch": 0.5413361505541927, + "grad_norm": 2.2113747596740723, + "learning_rate": 9.147927030335928e-05, + "loss": 1.5957, + "step": 15116 + }, + { + "epoch": 0.541371962683761, + "grad_norm": 2.821272611618042, + "learning_rate": 9.146771353053245e-05, + "loss": 1.2521, + "step": 15117 + }, + { + "epoch": 0.5414077748133292, + "grad_norm": 1.4644893407821655, + "learning_rate": 9.14561568724965e-05, + "loss": 1.1884, + "step": 15118 + }, + { + "epoch": 0.5414435869428975, + "grad_norm": 1.7300174236297607, + "learning_rate": 9.144460032940693e-05, + "loss": 1.5423, + "step": 15119 + }, + { + "epoch": 0.5414793990724659, + "grad_norm": 1.2341820001602173, + "learning_rate": 9.143304390141925e-05, + "loss": 1.0844, + "step": 15120 + }, + { + "epoch": 0.5415152112020342, + "grad_norm": 1.7158368825912476, + "learning_rate": 9.142148758868887e-05, + "loss": 1.6006, + "step": 15121 + }, + { + "epoch": 0.5415510233316024, + "grad_norm": 2.46889328956604, + "learning_rate": 9.140993139137135e-05, + "loss": 1.613, + "step": 15122 + }, + { + "epoch": 0.5415868354611707, + "grad_norm": 1.745641827583313, + "learning_rate": 9.139837530962209e-05, + "loss": 1.3685, + "step": 15123 + }, + { + "epoch": 0.541622647590739, + "grad_norm": 1.7009758949279785, + "learning_rate": 9.138681934359663e-05, + "loss": 1.5774, + "step": 15124 + }, + { + "epoch": 0.5416584597203072, + "grad_norm": 2.010895252227783, + "learning_rate": 9.137526349345036e-05, + "loss": 1.421, + "step": 15125 + }, + { + "epoch": 0.5416942718498755, + "grad_norm": 1.514216423034668, + "learning_rate": 9.136370775933885e-05, + "loss": 1.4046, + "step": 15126 + }, + { + "epoch": 0.5417300839794439, + "grad_norm": 1.5812219381332397, + "learning_rate": 9.135215214141751e-05, + "loss": 1.4894, + "step": 15127 + }, + { + "epoch": 0.5417658961090122, + "grad_norm": 1.7049273252487183, + "learning_rate": 9.134059663984176e-05, + "loss": 1.7633, + "step": 15128 + }, + { + "epoch": 0.5418017082385804, + "grad_norm": 1.498794674873352, + "learning_rate": 9.132904125476715e-05, + "loss": 1.3923, + "step": 15129 + }, + { + "epoch": 0.5418375203681487, + "grad_norm": 2.9248688220977783, + "learning_rate": 9.131748598634907e-05, + "loss": 1.9405, + "step": 15130 + }, + { + "epoch": 0.541873332497717, + "grad_norm": 1.411406397819519, + "learning_rate": 9.130593083474305e-05, + "loss": 1.2329, + "step": 15131 + }, + { + "epoch": 0.5419091446272852, + "grad_norm": 1.7705422639846802, + "learning_rate": 9.129437580010449e-05, + "loss": 1.3942, + "step": 15132 + }, + { + "epoch": 0.5419449567568535, + "grad_norm": 2.8008735179901123, + "learning_rate": 9.12828208825889e-05, + "loss": 1.4574, + "step": 15133 + }, + { + "epoch": 0.5419807688864219, + "grad_norm": 1.664994478225708, + "learning_rate": 9.12712660823517e-05, + "loss": 1.5583, + "step": 15134 + }, + { + "epoch": 0.5420165810159902, + "grad_norm": 2.5544071197509766, + "learning_rate": 9.125971139954835e-05, + "loss": 1.6255, + "step": 15135 + }, + { + "epoch": 0.5420523931455584, + "grad_norm": 1.645238995552063, + "learning_rate": 9.124815683433432e-05, + "loss": 1.2817, + "step": 15136 + }, + { + "epoch": 0.5420882052751267, + "grad_norm": 1.7806026935577393, + "learning_rate": 9.123660238686503e-05, + "loss": 1.6258, + "step": 15137 + }, + { + "epoch": 0.542124017404695, + "grad_norm": 1.9285032749176025, + "learning_rate": 9.122504805729598e-05, + "loss": 1.6613, + "step": 15138 + }, + { + "epoch": 0.5421598295342632, + "grad_norm": 1.6295703649520874, + "learning_rate": 9.121349384578255e-05, + "loss": 1.3031, + "step": 15139 + }, + { + "epoch": 0.5421956416638315, + "grad_norm": 1.7545650005340576, + "learning_rate": 9.120193975248027e-05, + "loss": 1.6079, + "step": 15140 + }, + { + "epoch": 0.5422314537933999, + "grad_norm": 1.3660227060317993, + "learning_rate": 9.119038577754451e-05, + "loss": 1.2539, + "step": 15141 + }, + { + "epoch": 0.5422672659229681, + "grad_norm": 1.3754740953445435, + "learning_rate": 9.117883192113077e-05, + "loss": 1.3293, + "step": 15142 + }, + { + "epoch": 0.5423030780525364, + "grad_norm": 1.3937410116195679, + "learning_rate": 9.116727818339444e-05, + "loss": 1.6224, + "step": 15143 + }, + { + "epoch": 0.5423388901821047, + "grad_norm": 2.4949188232421875, + "learning_rate": 9.115572456449102e-05, + "loss": 1.6398, + "step": 15144 + }, + { + "epoch": 0.542374702311673, + "grad_norm": 1.8907426595687866, + "learning_rate": 9.114417106457591e-05, + "loss": 1.6803, + "step": 15145 + }, + { + "epoch": 0.5424105144412412, + "grad_norm": 1.436632752418518, + "learning_rate": 9.113261768380454e-05, + "loss": 1.4615, + "step": 15146 + }, + { + "epoch": 0.5424463265708095, + "grad_norm": 1.9328235387802124, + "learning_rate": 9.112106442233237e-05, + "loss": 1.47, + "step": 15147 + }, + { + "epoch": 0.5424821387003779, + "grad_norm": 1.6240284442901611, + "learning_rate": 9.110951128031482e-05, + "loss": 1.2528, + "step": 15148 + }, + { + "epoch": 0.5425179508299461, + "grad_norm": 1.9730523824691772, + "learning_rate": 9.109795825790735e-05, + "loss": 1.3464, + "step": 15149 + }, + { + "epoch": 0.5425537629595144, + "grad_norm": 1.679447054862976, + "learning_rate": 9.108640535526533e-05, + "loss": 1.6473, + "step": 15150 + }, + { + "epoch": 0.5425895750890827, + "grad_norm": 1.7408350706100464, + "learning_rate": 9.107485257254426e-05, + "loss": 1.5971, + "step": 15151 + }, + { + "epoch": 0.542625387218651, + "grad_norm": 1.811719536781311, + "learning_rate": 9.106329990989952e-05, + "loss": 1.3407, + "step": 15152 + }, + { + "epoch": 0.5426611993482192, + "grad_norm": 2.72822904586792, + "learning_rate": 9.105174736748656e-05, + "loss": 1.5943, + "step": 15153 + }, + { + "epoch": 0.5426970114777875, + "grad_norm": 1.4611486196517944, + "learning_rate": 9.104019494546081e-05, + "loss": 1.3498, + "step": 15154 + }, + { + "epoch": 0.5427328236073559, + "grad_norm": 1.499182105064392, + "learning_rate": 9.102864264397765e-05, + "loss": 1.4977, + "step": 15155 + }, + { + "epoch": 0.5427686357369241, + "grad_norm": 1.542519211769104, + "learning_rate": 9.101709046319256e-05, + "loss": 1.1291, + "step": 15156 + }, + { + "epoch": 0.5428044478664924, + "grad_norm": 1.6112427711486816, + "learning_rate": 9.10055384032609e-05, + "loss": 1.3433, + "step": 15157 + }, + { + "epoch": 0.5428402599960607, + "grad_norm": 1.7602654695510864, + "learning_rate": 9.099398646433814e-05, + "loss": 1.6902, + "step": 15158 + }, + { + "epoch": 0.5428760721256289, + "grad_norm": 1.9010089635849, + "learning_rate": 9.098243464657966e-05, + "loss": 1.7847, + "step": 15159 + }, + { + "epoch": 0.5429118842551972, + "grad_norm": 2.192765235900879, + "learning_rate": 9.097088295014092e-05, + "loss": 1.4833, + "step": 15160 + }, + { + "epoch": 0.5429476963847655, + "grad_norm": 2.360835313796997, + "learning_rate": 9.095933137517727e-05, + "loss": 1.9125, + "step": 15161 + }, + { + "epoch": 0.5429835085143339, + "grad_norm": 1.5438072681427002, + "learning_rate": 9.094777992184417e-05, + "loss": 1.1242, + "step": 15162 + }, + { + "epoch": 0.5430193206439021, + "grad_norm": 1.586403727531433, + "learning_rate": 9.093622859029701e-05, + "loss": 1.7543, + "step": 15163 + }, + { + "epoch": 0.5430551327734704, + "grad_norm": 1.6280819177627563, + "learning_rate": 9.09246773806912e-05, + "loss": 1.2004, + "step": 15164 + }, + { + "epoch": 0.5430909449030387, + "grad_norm": 2.1167192459106445, + "learning_rate": 9.091312629318216e-05, + "loss": 1.7255, + "step": 15165 + }, + { + "epoch": 0.5431267570326069, + "grad_norm": 1.3767898082733154, + "learning_rate": 9.090157532792526e-05, + "loss": 1.4296, + "step": 15166 + }, + { + "epoch": 0.5431625691621752, + "grad_norm": 1.5592973232269287, + "learning_rate": 9.089002448507596e-05, + "loss": 1.3892, + "step": 15167 + }, + { + "epoch": 0.5431983812917435, + "grad_norm": 2.22155499458313, + "learning_rate": 9.087847376478961e-05, + "loss": 1.269, + "step": 15168 + }, + { + "epoch": 0.5432341934213119, + "grad_norm": 1.561850905418396, + "learning_rate": 9.086692316722166e-05, + "loss": 1.3457, + "step": 15169 + }, + { + "epoch": 0.5432700055508801, + "grad_norm": 1.6075046062469482, + "learning_rate": 9.085537269252747e-05, + "loss": 1.3777, + "step": 15170 + }, + { + "epoch": 0.5433058176804484, + "grad_norm": 1.8987433910369873, + "learning_rate": 9.08438223408624e-05, + "loss": 1.2371, + "step": 15171 + }, + { + "epoch": 0.5433416298100167, + "grad_norm": 1.4926401376724243, + "learning_rate": 9.083227211238192e-05, + "loss": 1.4493, + "step": 15172 + }, + { + "epoch": 0.5433774419395849, + "grad_norm": 2.8346004486083984, + "learning_rate": 9.082072200724139e-05, + "loss": 1.5074, + "step": 15173 + }, + { + "epoch": 0.5434132540691532, + "grad_norm": 1.5992660522460938, + "learning_rate": 9.08091720255962e-05, + "loss": 1.1084, + "step": 15174 + }, + { + "epoch": 0.5434490661987215, + "grad_norm": 1.871173620223999, + "learning_rate": 9.079762216760174e-05, + "loss": 1.3968, + "step": 15175 + }, + { + "epoch": 0.5434848783282898, + "grad_norm": 2.4197909832000732, + "learning_rate": 9.078607243341344e-05, + "loss": 1.8805, + "step": 15176 + }, + { + "epoch": 0.5435206904578581, + "grad_norm": 1.300917387008667, + "learning_rate": 9.077452282318661e-05, + "loss": 1.3799, + "step": 15177 + }, + { + "epoch": 0.5435565025874264, + "grad_norm": 1.500741720199585, + "learning_rate": 9.076297333707669e-05, + "loss": 1.3054, + "step": 15178 + }, + { + "epoch": 0.5435923147169947, + "grad_norm": 1.7115274667739868, + "learning_rate": 9.07514239752391e-05, + "loss": 1.2259, + "step": 15179 + }, + { + "epoch": 0.5436281268465629, + "grad_norm": 1.6599082946777344, + "learning_rate": 9.073987473782907e-05, + "loss": 1.7227, + "step": 15180 + }, + { + "epoch": 0.5436639389761312, + "grad_norm": 1.701534390449524, + "learning_rate": 9.072832562500217e-05, + "loss": 1.5188, + "step": 15181 + }, + { + "epoch": 0.5436997511056995, + "grad_norm": 1.621097445487976, + "learning_rate": 9.071677663691361e-05, + "loss": 1.3214, + "step": 15182 + }, + { + "epoch": 0.5437355632352678, + "grad_norm": 1.8554396629333496, + "learning_rate": 9.070522777371892e-05, + "loss": 2.0134, + "step": 15183 + }, + { + "epoch": 0.5437713753648361, + "grad_norm": 1.690596342086792, + "learning_rate": 9.069367903557333e-05, + "loss": 1.5976, + "step": 15184 + }, + { + "epoch": 0.5438071874944044, + "grad_norm": 1.5468312501907349, + "learning_rate": 9.068213042263234e-05, + "loss": 1.3421, + "step": 15185 + }, + { + "epoch": 0.5438429996239726, + "grad_norm": 1.7078195810317993, + "learning_rate": 9.067058193505124e-05, + "loss": 1.5431, + "step": 15186 + }, + { + "epoch": 0.5438788117535409, + "grad_norm": 1.6992605924606323, + "learning_rate": 9.065903357298544e-05, + "loss": 1.4592, + "step": 15187 + }, + { + "epoch": 0.5439146238831092, + "grad_norm": 1.696571707725525, + "learning_rate": 9.064748533659031e-05, + "loss": 1.644, + "step": 15188 + }, + { + "epoch": 0.5439504360126775, + "grad_norm": 1.737825632095337, + "learning_rate": 9.063593722602115e-05, + "loss": 1.2788, + "step": 15189 + }, + { + "epoch": 0.5439862481422458, + "grad_norm": 2.366448402404785, + "learning_rate": 9.062438924143344e-05, + "loss": 1.4315, + "step": 15190 + }, + { + "epoch": 0.5440220602718141, + "grad_norm": 2.0418972969055176, + "learning_rate": 9.06128413829824e-05, + "loss": 1.4927, + "step": 15191 + }, + { + "epoch": 0.5440578724013824, + "grad_norm": 2.0547597408294678, + "learning_rate": 9.060129365082354e-05, + "loss": 1.5544, + "step": 15192 + }, + { + "epoch": 0.5440936845309506, + "grad_norm": 2.0122506618499756, + "learning_rate": 9.05897460451121e-05, + "loss": 1.5898, + "step": 15193 + }, + { + "epoch": 0.5441294966605189, + "grad_norm": 2.682504177093506, + "learning_rate": 9.057819856600355e-05, + "loss": 1.5509, + "step": 15194 + }, + { + "epoch": 0.5441653087900872, + "grad_norm": 2.28589129447937, + "learning_rate": 9.056665121365311e-05, + "loss": 1.5996, + "step": 15195 + }, + { + "epoch": 0.5442011209196554, + "grad_norm": 1.6263935565948486, + "learning_rate": 9.055510398821627e-05, + "loss": 1.2338, + "step": 15196 + }, + { + "epoch": 0.5442369330492238, + "grad_norm": 1.4128479957580566, + "learning_rate": 9.054355688984833e-05, + "loss": 1.6509, + "step": 15197 + }, + { + "epoch": 0.5442727451787921, + "grad_norm": 1.4908677339553833, + "learning_rate": 9.053200991870456e-05, + "loss": 1.0973, + "step": 15198 + }, + { + "epoch": 0.5443085573083604, + "grad_norm": 2.3797690868377686, + "learning_rate": 9.052046307494046e-05, + "loss": 1.4909, + "step": 15199 + }, + { + "epoch": 0.5443443694379286, + "grad_norm": 1.5624659061431885, + "learning_rate": 9.050891635871124e-05, + "loss": 1.1965, + "step": 15200 + }, + { + "epoch": 0.5443801815674969, + "grad_norm": 1.94329833984375, + "learning_rate": 9.049736977017236e-05, + "loss": 1.661, + "step": 15201 + }, + { + "epoch": 0.5444159936970652, + "grad_norm": 1.6274837255477905, + "learning_rate": 9.048582330947906e-05, + "loss": 1.4358, + "step": 15202 + }, + { + "epoch": 0.5444518058266334, + "grad_norm": 2.1027121543884277, + "learning_rate": 9.04742769767868e-05, + "loss": 1.8225, + "step": 15203 + }, + { + "epoch": 0.5444876179562018, + "grad_norm": 1.6133577823638916, + "learning_rate": 9.046273077225078e-05, + "loss": 1.5472, + "step": 15204 + }, + { + "epoch": 0.5445234300857701, + "grad_norm": 1.9349349737167358, + "learning_rate": 9.045118469602649e-05, + "loss": 1.4636, + "step": 15205 + }, + { + "epoch": 0.5445592422153384, + "grad_norm": 1.7289633750915527, + "learning_rate": 9.043963874826917e-05, + "loss": 1.6001, + "step": 15206 + }, + { + "epoch": 0.5445950543449066, + "grad_norm": 1.6920100450515747, + "learning_rate": 9.042809292913415e-05, + "loss": 1.3169, + "step": 15207 + }, + { + "epoch": 0.5446308664744749, + "grad_norm": 1.8096444606781006, + "learning_rate": 9.041654723877683e-05, + "loss": 1.2999, + "step": 15208 + }, + { + "epoch": 0.5446666786040432, + "grad_norm": 1.588729977607727, + "learning_rate": 9.040500167735247e-05, + "loss": 1.7168, + "step": 15209 + }, + { + "epoch": 0.5447024907336114, + "grad_norm": 1.802148699760437, + "learning_rate": 9.039345624501646e-05, + "loss": 1.3229, + "step": 15210 + }, + { + "epoch": 0.5447383028631798, + "grad_norm": 1.531667947769165, + "learning_rate": 9.038191094192407e-05, + "loss": 1.3616, + "step": 15211 + }, + { + "epoch": 0.5447741149927481, + "grad_norm": 1.4396610260009766, + "learning_rate": 9.037036576823072e-05, + "loss": 0.9985, + "step": 15212 + }, + { + "epoch": 0.5448099271223164, + "grad_norm": 1.5046268701553345, + "learning_rate": 9.035882072409161e-05, + "loss": 1.5455, + "step": 15213 + }, + { + "epoch": 0.5448457392518846, + "grad_norm": 2.3657383918762207, + "learning_rate": 9.034727580966219e-05, + "loss": 1.8284, + "step": 15214 + }, + { + "epoch": 0.5448815513814529, + "grad_norm": 1.6111987829208374, + "learning_rate": 9.033573102509771e-05, + "loss": 1.4184, + "step": 15215 + }, + { + "epoch": 0.5449173635110212, + "grad_norm": 1.466270923614502, + "learning_rate": 9.032418637055348e-05, + "loss": 1.4768, + "step": 15216 + }, + { + "epoch": 0.5449531756405894, + "grad_norm": 1.4635989665985107, + "learning_rate": 9.031264184618487e-05, + "loss": 1.4865, + "step": 15217 + }, + { + "epoch": 0.5449889877701578, + "grad_norm": 2.0012905597686768, + "learning_rate": 9.030109745214713e-05, + "loss": 1.5261, + "step": 15218 + }, + { + "epoch": 0.5450247998997261, + "grad_norm": 1.6931449174880981, + "learning_rate": 9.028955318859564e-05, + "loss": 1.3085, + "step": 15219 + }, + { + "epoch": 0.5450606120292943, + "grad_norm": 1.5689709186553955, + "learning_rate": 9.027800905568568e-05, + "loss": 1.5905, + "step": 15220 + }, + { + "epoch": 0.5450964241588626, + "grad_norm": 1.997705101966858, + "learning_rate": 9.026646505357258e-05, + "loss": 1.835, + "step": 15221 + }, + { + "epoch": 0.5451322362884309, + "grad_norm": 1.988075852394104, + "learning_rate": 9.025492118241161e-05, + "loss": 1.92, + "step": 15222 + }, + { + "epoch": 0.5451680484179992, + "grad_norm": 1.7597702741622925, + "learning_rate": 9.024337744235814e-05, + "loss": 1.6645, + "step": 15223 + }, + { + "epoch": 0.5452038605475674, + "grad_norm": 1.6029980182647705, + "learning_rate": 9.023183383356743e-05, + "loss": 1.5673, + "step": 15224 + }, + { + "epoch": 0.5452396726771358, + "grad_norm": 1.7730108499526978, + "learning_rate": 9.022029035619478e-05, + "loss": 1.4413, + "step": 15225 + }, + { + "epoch": 0.5452754848067041, + "grad_norm": 1.6495401859283447, + "learning_rate": 9.020874701039552e-05, + "loss": 1.5705, + "step": 15226 + }, + { + "epoch": 0.5453112969362723, + "grad_norm": 2.009526252746582, + "learning_rate": 9.019720379632493e-05, + "loss": 1.4947, + "step": 15227 + }, + { + "epoch": 0.5453471090658406, + "grad_norm": 1.3570290803909302, + "learning_rate": 9.018566071413833e-05, + "loss": 1.4492, + "step": 15228 + }, + { + "epoch": 0.5453829211954089, + "grad_norm": 1.9762985706329346, + "learning_rate": 9.017411776399099e-05, + "loss": 1.8505, + "step": 15229 + }, + { + "epoch": 0.5454187333249771, + "grad_norm": 1.482008695602417, + "learning_rate": 9.016257494603824e-05, + "loss": 1.0744, + "step": 15230 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.898165225982666, + "learning_rate": 9.015103226043533e-05, + "loss": 1.5189, + "step": 15231 + }, + { + "epoch": 0.5454903575841138, + "grad_norm": 1.4939240217208862, + "learning_rate": 9.01394897073376e-05, + "loss": 1.5322, + "step": 15232 + }, + { + "epoch": 0.5455261697136821, + "grad_norm": 1.3671813011169434, + "learning_rate": 9.012794728690032e-05, + "loss": 1.5075, + "step": 15233 + }, + { + "epoch": 0.5455619818432503, + "grad_norm": 1.485896348953247, + "learning_rate": 9.011640499927875e-05, + "loss": 1.589, + "step": 15234 + }, + { + "epoch": 0.5455977939728186, + "grad_norm": 1.7984567880630493, + "learning_rate": 9.010486284462823e-05, + "loss": 1.4442, + "step": 15235 + }, + { + "epoch": 0.5456336061023869, + "grad_norm": 1.7843817472457886, + "learning_rate": 9.009332082310398e-05, + "loss": 1.3122, + "step": 15236 + }, + { + "epoch": 0.5456694182319551, + "grad_norm": 1.760123610496521, + "learning_rate": 9.008177893486136e-05, + "loss": 1.5041, + "step": 15237 + }, + { + "epoch": 0.5457052303615234, + "grad_norm": 1.4291465282440186, + "learning_rate": 9.007023718005558e-05, + "loss": 1.6004, + "step": 15238 + }, + { + "epoch": 0.5457410424910918, + "grad_norm": 2.074390172958374, + "learning_rate": 9.005869555884197e-05, + "loss": 1.4731, + "step": 15239 + }, + { + "epoch": 0.5457768546206601, + "grad_norm": 1.5591212511062622, + "learning_rate": 9.004715407137577e-05, + "loss": 1.4422, + "step": 15240 + }, + { + "epoch": 0.5458126667502283, + "grad_norm": 1.5369257926940918, + "learning_rate": 9.003561271781229e-05, + "loss": 1.4392, + "step": 15241 + }, + { + "epoch": 0.5458484788797966, + "grad_norm": 1.3781731128692627, + "learning_rate": 9.002407149830679e-05, + "loss": 1.5801, + "step": 15242 + }, + { + "epoch": 0.5458842910093649, + "grad_norm": 1.7291080951690674, + "learning_rate": 9.001253041301453e-05, + "loss": 1.6995, + "step": 15243 + }, + { + "epoch": 0.5459201031389331, + "grad_norm": 1.5849418640136719, + "learning_rate": 9.00009894620908e-05, + "loss": 1.5973, + "step": 15244 + }, + { + "epoch": 0.5459559152685014, + "grad_norm": 1.6605151891708374, + "learning_rate": 8.998944864569084e-05, + "loss": 1.6434, + "step": 15245 + }, + { + "epoch": 0.5459917273980698, + "grad_norm": 1.9970816373825073, + "learning_rate": 8.997790796396996e-05, + "loss": 1.8821, + "step": 15246 + }, + { + "epoch": 0.546027539527638, + "grad_norm": 1.8191704750061035, + "learning_rate": 8.996636741708337e-05, + "loss": 1.7131, + "step": 15247 + }, + { + "epoch": 0.5460633516572063, + "grad_norm": 1.318717122077942, + "learning_rate": 8.995482700518639e-05, + "loss": 1.3839, + "step": 15248 + }, + { + "epoch": 0.5460991637867746, + "grad_norm": 1.5386637449264526, + "learning_rate": 8.994328672843424e-05, + "loss": 1.74, + "step": 15249 + }, + { + "epoch": 0.5461349759163429, + "grad_norm": 1.6829789876937866, + "learning_rate": 8.993174658698221e-05, + "loss": 1.3832, + "step": 15250 + }, + { + "epoch": 0.5461707880459111, + "grad_norm": 2.5032663345336914, + "learning_rate": 8.992020658098555e-05, + "loss": 1.4875, + "step": 15251 + }, + { + "epoch": 0.5462066001754794, + "grad_norm": 2.0662424564361572, + "learning_rate": 8.990866671059948e-05, + "loss": 1.5559, + "step": 15252 + }, + { + "epoch": 0.5462424123050477, + "grad_norm": 1.9819056987762451, + "learning_rate": 8.98971269759793e-05, + "loss": 1.7462, + "step": 15253 + }, + { + "epoch": 0.546278224434616, + "grad_norm": 1.755515456199646, + "learning_rate": 8.988558737728023e-05, + "loss": 1.4854, + "step": 15254 + }, + { + "epoch": 0.5463140365641843, + "grad_norm": 1.8276987075805664, + "learning_rate": 8.987404791465757e-05, + "loss": 1.5511, + "step": 15255 + }, + { + "epoch": 0.5463498486937526, + "grad_norm": 1.7192363739013672, + "learning_rate": 8.986250858826649e-05, + "loss": 1.7131, + "step": 15256 + }, + { + "epoch": 0.5463856608233209, + "grad_norm": 1.8038772344589233, + "learning_rate": 8.985096939826231e-05, + "loss": 1.4206, + "step": 15257 + }, + { + "epoch": 0.5464214729528891, + "grad_norm": 2.844478130340576, + "learning_rate": 8.983943034480022e-05, + "loss": 1.8703, + "step": 15258 + }, + { + "epoch": 0.5464572850824574, + "grad_norm": 1.6079673767089844, + "learning_rate": 8.982789142803552e-05, + "loss": 1.0447, + "step": 15259 + }, + { + "epoch": 0.5464930972120257, + "grad_norm": 1.8626993894577026, + "learning_rate": 8.981635264812341e-05, + "loss": 1.5425, + "step": 15260 + }, + { + "epoch": 0.546528909341594, + "grad_norm": 1.9732133150100708, + "learning_rate": 8.980481400521911e-05, + "loss": 1.7118, + "step": 15261 + }, + { + "epoch": 0.5465647214711623, + "grad_norm": 1.683758020401001, + "learning_rate": 8.979327549947794e-05, + "loss": 1.7804, + "step": 15262 + }, + { + "epoch": 0.5466005336007306, + "grad_norm": 2.286837339401245, + "learning_rate": 8.978173713105503e-05, + "loss": 1.6872, + "step": 15263 + }, + { + "epoch": 0.5466363457302988, + "grad_norm": 2.094520330429077, + "learning_rate": 8.977019890010571e-05, + "loss": 1.5192, + "step": 15264 + }, + { + "epoch": 0.5466721578598671, + "grad_norm": 1.6780959367752075, + "learning_rate": 8.975866080678512e-05, + "loss": 1.451, + "step": 15265 + }, + { + "epoch": 0.5467079699894354, + "grad_norm": 1.653809905052185, + "learning_rate": 8.974712285124858e-05, + "loss": 1.4063, + "step": 15266 + }, + { + "epoch": 0.5467437821190037, + "grad_norm": 1.4220283031463623, + "learning_rate": 8.973558503365129e-05, + "loss": 1.5108, + "step": 15267 + }, + { + "epoch": 0.546779594248572, + "grad_norm": 2.0603954792022705, + "learning_rate": 8.97240473541484e-05, + "loss": 1.6627, + "step": 15268 + }, + { + "epoch": 0.5468154063781403, + "grad_norm": 1.338783621788025, + "learning_rate": 8.971250981289525e-05, + "loss": 1.4841, + "step": 15269 + }, + { + "epoch": 0.5468512185077086, + "grad_norm": 2.398456573486328, + "learning_rate": 8.970097241004697e-05, + "loss": 1.5663, + "step": 15270 + }, + { + "epoch": 0.5468870306372768, + "grad_norm": 1.6194860935211182, + "learning_rate": 8.968943514575888e-05, + "loss": 1.6908, + "step": 15271 + }, + { + "epoch": 0.5469228427668451, + "grad_norm": 1.4247815608978271, + "learning_rate": 8.967789802018607e-05, + "loss": 1.4542, + "step": 15272 + }, + { + "epoch": 0.5469586548964134, + "grad_norm": 1.889357566833496, + "learning_rate": 8.966636103348388e-05, + "loss": 1.7865, + "step": 15273 + }, + { + "epoch": 0.5469944670259816, + "grad_norm": 1.6137316226959229, + "learning_rate": 8.965482418580746e-05, + "loss": 1.8113, + "step": 15274 + }, + { + "epoch": 0.54703027915555, + "grad_norm": 1.8220938444137573, + "learning_rate": 8.964328747731204e-05, + "loss": 1.4244, + "step": 15275 + }, + { + "epoch": 0.5470660912851183, + "grad_norm": 1.8542633056640625, + "learning_rate": 8.963175090815285e-05, + "loss": 1.32, + "step": 15276 + }, + { + "epoch": 0.5471019034146866, + "grad_norm": 1.4733332395553589, + "learning_rate": 8.9620214478485e-05, + "loss": 1.2989, + "step": 15277 + }, + { + "epoch": 0.5471377155442548, + "grad_norm": 1.4526832103729248, + "learning_rate": 8.960867818846386e-05, + "loss": 1.1323, + "step": 15278 + }, + { + "epoch": 0.5471735276738231, + "grad_norm": 1.3265506029129028, + "learning_rate": 8.959714203824449e-05, + "loss": 1.397, + "step": 15279 + }, + { + "epoch": 0.5472093398033914, + "grad_norm": 2.6252152919769287, + "learning_rate": 8.958560602798221e-05, + "loss": 1.7199, + "step": 15280 + }, + { + "epoch": 0.5472451519329596, + "grad_norm": 1.9414409399032593, + "learning_rate": 8.95740701578321e-05, + "loss": 1.5143, + "step": 15281 + }, + { + "epoch": 0.547280964062528, + "grad_norm": 1.725482702255249, + "learning_rate": 8.956253442794948e-05, + "loss": 1.1287, + "step": 15282 + }, + { + "epoch": 0.5473167761920963, + "grad_norm": 1.5736626386642456, + "learning_rate": 8.955099883848945e-05, + "loss": 1.4276, + "step": 15283 + }, + { + "epoch": 0.5473525883216646, + "grad_norm": 1.4038923978805542, + "learning_rate": 8.953946338960731e-05, + "loss": 1.2686, + "step": 15284 + }, + { + "epoch": 0.5473884004512328, + "grad_norm": 1.9812275171279907, + "learning_rate": 8.952792808145819e-05, + "loss": 1.5779, + "step": 15285 + }, + { + "epoch": 0.5474242125808011, + "grad_norm": 1.4937734603881836, + "learning_rate": 8.951639291419723e-05, + "loss": 1.4877, + "step": 15286 + }, + { + "epoch": 0.5474600247103694, + "grad_norm": 1.4445714950561523, + "learning_rate": 8.950485788797976e-05, + "loss": 1.6559, + "step": 15287 + }, + { + "epoch": 0.5474958368399376, + "grad_norm": 1.952468752861023, + "learning_rate": 8.949332300296082e-05, + "loss": 1.5121, + "step": 15288 + }, + { + "epoch": 0.547531648969506, + "grad_norm": 1.7824208736419678, + "learning_rate": 8.948178825929572e-05, + "loss": 1.3028, + "step": 15289 + }, + { + "epoch": 0.5475674610990743, + "grad_norm": 1.73703134059906, + "learning_rate": 8.947025365713953e-05, + "loss": 1.5888, + "step": 15290 + }, + { + "epoch": 0.5476032732286426, + "grad_norm": 1.6486480236053467, + "learning_rate": 8.945871919664757e-05, + "loss": 1.1964, + "step": 15291 + }, + { + "epoch": 0.5476390853582108, + "grad_norm": 1.4871211051940918, + "learning_rate": 8.944718487797487e-05, + "loss": 1.2785, + "step": 15292 + }, + { + "epoch": 0.5476748974877791, + "grad_norm": 1.483238697052002, + "learning_rate": 8.943565070127676e-05, + "loss": 1.6927, + "step": 15293 + }, + { + "epoch": 0.5477107096173474, + "grad_norm": 1.6315890550613403, + "learning_rate": 8.94241166667083e-05, + "loss": 1.4549, + "step": 15294 + }, + { + "epoch": 0.5477465217469156, + "grad_norm": 2.340033769607544, + "learning_rate": 8.94125827744247e-05, + "loss": 1.6152, + "step": 15295 + }, + { + "epoch": 0.547782333876484, + "grad_norm": 1.4662576913833618, + "learning_rate": 8.940104902458117e-05, + "loss": 1.6437, + "step": 15296 + }, + { + "epoch": 0.5478181460060523, + "grad_norm": 1.7731101512908936, + "learning_rate": 8.938951541733282e-05, + "loss": 1.377, + "step": 15297 + }, + { + "epoch": 0.5478539581356205, + "grad_norm": 1.815285563468933, + "learning_rate": 8.93779819528349e-05, + "loss": 1.4197, + "step": 15298 + }, + { + "epoch": 0.5478897702651888, + "grad_norm": 2.2570080757141113, + "learning_rate": 8.936644863124246e-05, + "loss": 1.6707, + "step": 15299 + }, + { + "epoch": 0.5479255823947571, + "grad_norm": 1.9335801601409912, + "learning_rate": 8.935491545271081e-05, + "loss": 1.5724, + "step": 15300 + }, + { + "epoch": 0.5479613945243254, + "grad_norm": 1.3450572490692139, + "learning_rate": 8.934338241739498e-05, + "loss": 0.9793, + "step": 15301 + }, + { + "epoch": 0.5479972066538936, + "grad_norm": 2.0288915634155273, + "learning_rate": 8.933184952545026e-05, + "loss": 1.5523, + "step": 15302 + }, + { + "epoch": 0.548033018783462, + "grad_norm": 1.486048936843872, + "learning_rate": 8.932031677703172e-05, + "loss": 1.1013, + "step": 15303 + }, + { + "epoch": 0.5480688309130303, + "grad_norm": 1.9262478351593018, + "learning_rate": 8.930878417229453e-05, + "loss": 1.5825, + "step": 15304 + }, + { + "epoch": 0.5481046430425985, + "grad_norm": 2.0612335205078125, + "learning_rate": 8.929725171139387e-05, + "loss": 1.3004, + "step": 15305 + }, + { + "epoch": 0.5481404551721668, + "grad_norm": 2.523592948913574, + "learning_rate": 8.928571939448486e-05, + "loss": 1.8698, + "step": 15306 + }, + { + "epoch": 0.5481762673017351, + "grad_norm": 1.6707018613815308, + "learning_rate": 8.927418722172269e-05, + "loss": 1.5277, + "step": 15307 + }, + { + "epoch": 0.5482120794313033, + "grad_norm": 1.4799731969833374, + "learning_rate": 8.926265519326246e-05, + "loss": 1.1901, + "step": 15308 + }, + { + "epoch": 0.5482478915608716, + "grad_norm": 1.638659119606018, + "learning_rate": 8.925112330925943e-05, + "loss": 1.4529, + "step": 15309 + }, + { + "epoch": 0.54828370369044, + "grad_norm": 1.3840322494506836, + "learning_rate": 8.923959156986859e-05, + "loss": 1.625, + "step": 15310 + }, + { + "epoch": 0.5483195158200083, + "grad_norm": 1.525613784790039, + "learning_rate": 8.922805997524524e-05, + "loss": 1.5994, + "step": 15311 + }, + { + "epoch": 0.5483553279495765, + "grad_norm": 2.075780153274536, + "learning_rate": 8.921652852554442e-05, + "loss": 1.9281, + "step": 15312 + }, + { + "epoch": 0.5483911400791448, + "grad_norm": 1.4483722448349, + "learning_rate": 8.920499722092129e-05, + "loss": 1.3819, + "step": 15313 + }, + { + "epoch": 0.5484269522087131, + "grad_norm": 2.663997173309326, + "learning_rate": 8.9193466061531e-05, + "loss": 1.02, + "step": 15314 + }, + { + "epoch": 0.5484627643382813, + "grad_norm": 1.449901819229126, + "learning_rate": 8.918193504752868e-05, + "loss": 1.4184, + "step": 15315 + }, + { + "epoch": 0.5484985764678496, + "grad_norm": 1.5887750387191772, + "learning_rate": 8.917040417906947e-05, + "loss": 1.4567, + "step": 15316 + }, + { + "epoch": 0.548534388597418, + "grad_norm": 1.6006042957305908, + "learning_rate": 8.91588734563085e-05, + "loss": 1.5301, + "step": 15317 + }, + { + "epoch": 0.5485702007269863, + "grad_norm": 1.9737050533294678, + "learning_rate": 8.914734287940092e-05, + "loss": 1.5652, + "step": 15318 + }, + { + "epoch": 0.5486060128565545, + "grad_norm": 1.3489166498184204, + "learning_rate": 8.913581244850182e-05, + "loss": 1.2951, + "step": 15319 + }, + { + "epoch": 0.5486418249861228, + "grad_norm": 1.744086503982544, + "learning_rate": 8.912428216376637e-05, + "loss": 1.4222, + "step": 15320 + }, + { + "epoch": 0.5486776371156911, + "grad_norm": 1.5901646614074707, + "learning_rate": 8.911275202534968e-05, + "loss": 1.3965, + "step": 15321 + }, + { + "epoch": 0.5487134492452593, + "grad_norm": 2.381009578704834, + "learning_rate": 8.910122203340684e-05, + "loss": 1.7014, + "step": 15322 + }, + { + "epoch": 0.5487492613748276, + "grad_norm": 1.773105263710022, + "learning_rate": 8.908969218809302e-05, + "loss": 1.2225, + "step": 15323 + }, + { + "epoch": 0.548785073504396, + "grad_norm": 1.7156941890716553, + "learning_rate": 8.907816248956331e-05, + "loss": 1.5767, + "step": 15324 + }, + { + "epoch": 0.5488208856339643, + "grad_norm": 2.0456743240356445, + "learning_rate": 8.906663293797284e-05, + "loss": 1.7365, + "step": 15325 + }, + { + "epoch": 0.5488566977635325, + "grad_norm": 1.5815179347991943, + "learning_rate": 8.905510353347671e-05, + "loss": 1.5657, + "step": 15326 + }, + { + "epoch": 0.5488925098931008, + "grad_norm": 1.702483892440796, + "learning_rate": 8.904357427623007e-05, + "loss": 1.6139, + "step": 15327 + }, + { + "epoch": 0.5489283220226691, + "grad_norm": 1.5035403966903687, + "learning_rate": 8.903204516638796e-05, + "loss": 1.515, + "step": 15328 + }, + { + "epoch": 0.5489641341522373, + "grad_norm": 1.8640391826629639, + "learning_rate": 8.902051620410558e-05, + "loss": 1.5362, + "step": 15329 + }, + { + "epoch": 0.5489999462818056, + "grad_norm": 1.583310842514038, + "learning_rate": 8.9008987389538e-05, + "loss": 1.5499, + "step": 15330 + }, + { + "epoch": 0.549035758411374, + "grad_norm": 1.6002247333526611, + "learning_rate": 8.899745872284026e-05, + "loss": 1.1217, + "step": 15331 + }, + { + "epoch": 0.5490715705409422, + "grad_norm": 1.4390969276428223, + "learning_rate": 8.898593020416756e-05, + "loss": 1.6414, + "step": 15332 + }, + { + "epoch": 0.5491073826705105, + "grad_norm": 1.7034553289413452, + "learning_rate": 8.897440183367496e-05, + "loss": 1.549, + "step": 15333 + }, + { + "epoch": 0.5491431948000788, + "grad_norm": 1.4598033428192139, + "learning_rate": 8.896287361151757e-05, + "loss": 1.4662, + "step": 15334 + }, + { + "epoch": 0.549179006929647, + "grad_norm": 2.3608148097991943, + "learning_rate": 8.895134553785044e-05, + "loss": 1.5284, + "step": 15335 + }, + { + "epoch": 0.5492148190592153, + "grad_norm": 1.7964868545532227, + "learning_rate": 8.893981761282874e-05, + "loss": 1.5421, + "step": 15336 + }, + { + "epoch": 0.5492506311887836, + "grad_norm": 1.50372314453125, + "learning_rate": 8.89282898366075e-05, + "loss": 1.2766, + "step": 15337 + }, + { + "epoch": 0.549286443318352, + "grad_norm": 1.470895767211914, + "learning_rate": 8.891676220934188e-05, + "loss": 1.2957, + "step": 15338 + }, + { + "epoch": 0.5493222554479202, + "grad_norm": 1.9627442359924316, + "learning_rate": 8.89052347311869e-05, + "loss": 1.5351, + "step": 15339 + }, + { + "epoch": 0.5493580675774885, + "grad_norm": 2.062045097351074, + "learning_rate": 8.889370740229767e-05, + "loss": 1.4394, + "step": 15340 + }, + { + "epoch": 0.5493938797070568, + "grad_norm": 1.9214800596237183, + "learning_rate": 8.88821802228293e-05, + "loss": 1.3882, + "step": 15341 + }, + { + "epoch": 0.549429691836625, + "grad_norm": 1.5839799642562866, + "learning_rate": 8.887065319293684e-05, + "loss": 1.427, + "step": 15342 + }, + { + "epoch": 0.5494655039661933, + "grad_norm": 1.8803417682647705, + "learning_rate": 8.88591263127754e-05, + "loss": 1.4825, + "step": 15343 + }, + { + "epoch": 0.5495013160957616, + "grad_norm": 1.5140496492385864, + "learning_rate": 8.884759958250002e-05, + "loss": 1.4992, + "step": 15344 + }, + { + "epoch": 0.54953712822533, + "grad_norm": 1.6109691858291626, + "learning_rate": 8.883607300226581e-05, + "loss": 1.3751, + "step": 15345 + }, + { + "epoch": 0.5495729403548982, + "grad_norm": 2.227198600769043, + "learning_rate": 8.882454657222784e-05, + "loss": 1.6037, + "step": 15346 + }, + { + "epoch": 0.5496087524844665, + "grad_norm": 1.5396227836608887, + "learning_rate": 8.88130202925412e-05, + "loss": 1.4925, + "step": 15347 + }, + { + "epoch": 0.5496445646140348, + "grad_norm": 2.18083119392395, + "learning_rate": 8.880149416336093e-05, + "loss": 1.7599, + "step": 15348 + }, + { + "epoch": 0.549680376743603, + "grad_norm": 1.8912416696548462, + "learning_rate": 8.878996818484209e-05, + "loss": 1.5699, + "step": 15349 + }, + { + "epoch": 0.5497161888731713, + "grad_norm": 1.3284300565719604, + "learning_rate": 8.87784423571398e-05, + "loss": 1.4352, + "step": 15350 + }, + { + "epoch": 0.5497520010027396, + "grad_norm": 1.830657720565796, + "learning_rate": 8.876691668040907e-05, + "loss": 1.5481, + "step": 15351 + }, + { + "epoch": 0.549787813132308, + "grad_norm": 2.0324718952178955, + "learning_rate": 8.8755391154805e-05, + "loss": 1.5041, + "step": 15352 + }, + { + "epoch": 0.5498236252618762, + "grad_norm": 1.8235435485839844, + "learning_rate": 8.874386578048261e-05, + "loss": 1.2626, + "step": 15353 + }, + { + "epoch": 0.5498594373914445, + "grad_norm": 1.490576148033142, + "learning_rate": 8.873234055759703e-05, + "loss": 1.5272, + "step": 15354 + }, + { + "epoch": 0.5498952495210128, + "grad_norm": 1.8105982542037964, + "learning_rate": 8.872081548630325e-05, + "loss": 1.4809, + "step": 15355 + }, + { + "epoch": 0.549931061650581, + "grad_norm": 1.976482629776001, + "learning_rate": 8.870929056675636e-05, + "loss": 1.597, + "step": 15356 + }, + { + "epoch": 0.5499668737801493, + "grad_norm": 1.8228706121444702, + "learning_rate": 8.86977657991114e-05, + "loss": 1.7323, + "step": 15357 + }, + { + "epoch": 0.5500026859097176, + "grad_norm": 1.821420669555664, + "learning_rate": 8.86862411835234e-05, + "loss": 1.3638, + "step": 15358 + }, + { + "epoch": 0.550038498039286, + "grad_norm": 1.6070480346679688, + "learning_rate": 8.867471672014745e-05, + "loss": 1.3667, + "step": 15359 + }, + { + "epoch": 0.5500743101688542, + "grad_norm": 1.3694812059402466, + "learning_rate": 8.866319240913856e-05, + "loss": 1.2689, + "step": 15360 + }, + { + "epoch": 0.5501101222984225, + "grad_norm": 1.4904698133468628, + "learning_rate": 8.865166825065182e-05, + "loss": 1.3611, + "step": 15361 + }, + { + "epoch": 0.5501459344279908, + "grad_norm": 1.2036080360412598, + "learning_rate": 8.864014424484222e-05, + "loss": 1.2641, + "step": 15362 + }, + { + "epoch": 0.550181746557559, + "grad_norm": 1.5762604475021362, + "learning_rate": 8.862862039186485e-05, + "loss": 1.6896, + "step": 15363 + }, + { + "epoch": 0.5502175586871273, + "grad_norm": 2.1189725399017334, + "learning_rate": 8.861709669187474e-05, + "loss": 1.4898, + "step": 15364 + }, + { + "epoch": 0.5502533708166956, + "grad_norm": 1.6944526433944702, + "learning_rate": 8.860557314502685e-05, + "loss": 1.5987, + "step": 15365 + }, + { + "epoch": 0.5502891829462639, + "grad_norm": 1.749204158782959, + "learning_rate": 8.859404975147632e-05, + "loss": 1.3471, + "step": 15366 + }, + { + "epoch": 0.5503249950758322, + "grad_norm": 1.700831651687622, + "learning_rate": 8.858252651137812e-05, + "loss": 1.526, + "step": 15367 + }, + { + "epoch": 0.5503608072054005, + "grad_norm": 1.3145804405212402, + "learning_rate": 8.857100342488732e-05, + "loss": 1.4017, + "step": 15368 + }, + { + "epoch": 0.5503966193349688, + "grad_norm": 1.3753552436828613, + "learning_rate": 8.855948049215888e-05, + "loss": 1.5874, + "step": 15369 + }, + { + "epoch": 0.550432431464537, + "grad_norm": 1.5876967906951904, + "learning_rate": 8.854795771334794e-05, + "loss": 1.537, + "step": 15370 + }, + { + "epoch": 0.5504682435941053, + "grad_norm": 1.6385436058044434, + "learning_rate": 8.85364350886094e-05, + "loss": 1.3111, + "step": 15371 + }, + { + "epoch": 0.5505040557236736, + "grad_norm": 2.4700660705566406, + "learning_rate": 8.852491261809837e-05, + "loss": 1.7414, + "step": 15372 + }, + { + "epoch": 0.5505398678532419, + "grad_norm": 1.4579800367355347, + "learning_rate": 8.851339030196986e-05, + "loss": 1.4192, + "step": 15373 + }, + { + "epoch": 0.5505756799828102, + "grad_norm": 1.7124450206756592, + "learning_rate": 8.85018681403788e-05, + "loss": 1.2621, + "step": 15374 + }, + { + "epoch": 0.5506114921123785, + "grad_norm": 1.5826530456542969, + "learning_rate": 8.849034613348035e-05, + "loss": 1.3615, + "step": 15375 + }, + { + "epoch": 0.5506473042419467, + "grad_norm": 1.3330745697021484, + "learning_rate": 8.847882428142936e-05, + "loss": 1.3821, + "step": 15376 + }, + { + "epoch": 0.550683116371515, + "grad_norm": 1.582822322845459, + "learning_rate": 8.8467302584381e-05, + "loss": 1.4889, + "step": 15377 + }, + { + "epoch": 0.5507189285010833, + "grad_norm": 1.2767612934112549, + "learning_rate": 8.845578104249014e-05, + "loss": 1.3505, + "step": 15378 + }, + { + "epoch": 0.5507547406306516, + "grad_norm": 1.4788082838058472, + "learning_rate": 8.844425965591192e-05, + "loss": 1.5282, + "step": 15379 + }, + { + "epoch": 0.5507905527602199, + "grad_norm": 1.8105095624923706, + "learning_rate": 8.843273842480124e-05, + "loss": 1.425, + "step": 15380 + }, + { + "epoch": 0.5508263648897882, + "grad_norm": 1.4543743133544922, + "learning_rate": 8.842121734931316e-05, + "loss": 1.6334, + "step": 15381 + }, + { + "epoch": 0.5508621770193565, + "grad_norm": 1.4791871309280396, + "learning_rate": 8.840969642960271e-05, + "loss": 1.3785, + "step": 15382 + }, + { + "epoch": 0.5508979891489247, + "grad_norm": 1.55195152759552, + "learning_rate": 8.839817566582477e-05, + "loss": 1.3623, + "step": 15383 + }, + { + "epoch": 0.550933801278493, + "grad_norm": 1.8789154291152954, + "learning_rate": 8.838665505813448e-05, + "loss": 1.3948, + "step": 15384 + }, + { + "epoch": 0.5509696134080613, + "grad_norm": 1.813826560974121, + "learning_rate": 8.837513460668668e-05, + "loss": 1.4754, + "step": 15385 + }, + { + "epoch": 0.5510054255376295, + "grad_norm": 1.9964311122894287, + "learning_rate": 8.836361431163653e-05, + "loss": 1.4923, + "step": 15386 + }, + { + "epoch": 0.5510412376671979, + "grad_norm": 1.5974700450897217, + "learning_rate": 8.835209417313886e-05, + "loss": 1.6589, + "step": 15387 + }, + { + "epoch": 0.5510770497967662, + "grad_norm": 1.9215779304504395, + "learning_rate": 8.834057419134883e-05, + "loss": 1.7583, + "step": 15388 + }, + { + "epoch": 0.5511128619263345, + "grad_norm": 1.61617910861969, + "learning_rate": 8.832905436642125e-05, + "loss": 1.4681, + "step": 15389 + }, + { + "epoch": 0.5511486740559027, + "grad_norm": 1.5911004543304443, + "learning_rate": 8.831753469851126e-05, + "loss": 1.537, + "step": 15390 + }, + { + "epoch": 0.551184486185471, + "grad_norm": 1.8469878435134888, + "learning_rate": 8.830601518777375e-05, + "loss": 1.4466, + "step": 15391 + }, + { + "epoch": 0.5512202983150393, + "grad_norm": 1.4108341932296753, + "learning_rate": 8.829449583436367e-05, + "loss": 1.4935, + "step": 15392 + }, + { + "epoch": 0.5512561104446075, + "grad_norm": 1.4014348983764648, + "learning_rate": 8.828297663843612e-05, + "loss": 1.5126, + "step": 15393 + }, + { + "epoch": 0.5512919225741759, + "grad_norm": 2.2653427124023438, + "learning_rate": 8.827145760014595e-05, + "loss": 1.7555, + "step": 15394 + }, + { + "epoch": 0.5513277347037442, + "grad_norm": 1.639029622077942, + "learning_rate": 8.825993871964823e-05, + "loss": 1.6526, + "step": 15395 + }, + { + "epoch": 0.5513635468333125, + "grad_norm": 1.7813209295272827, + "learning_rate": 8.824841999709785e-05, + "loss": 1.1952, + "step": 15396 + }, + { + "epoch": 0.5513993589628807, + "grad_norm": 1.8362061977386475, + "learning_rate": 8.823690143264988e-05, + "loss": 1.6557, + "step": 15397 + }, + { + "epoch": 0.551435171092449, + "grad_norm": 1.6683216094970703, + "learning_rate": 8.822538302645916e-05, + "loss": 1.7202, + "step": 15398 + }, + { + "epoch": 0.5514709832220173, + "grad_norm": 1.9283270835876465, + "learning_rate": 8.821386477868078e-05, + "loss": 1.1112, + "step": 15399 + }, + { + "epoch": 0.5515067953515855, + "grad_norm": 1.510189175605774, + "learning_rate": 8.820234668946963e-05, + "loss": 1.5949, + "step": 15400 + }, + { + "epoch": 0.5515426074811539, + "grad_norm": 1.425412893295288, + "learning_rate": 8.819082875898068e-05, + "loss": 1.4958, + "step": 15401 + }, + { + "epoch": 0.5515784196107222, + "grad_norm": 1.7452600002288818, + "learning_rate": 8.817931098736891e-05, + "loss": 1.4762, + "step": 15402 + }, + { + "epoch": 0.5516142317402905, + "grad_norm": 1.7585675716400146, + "learning_rate": 8.816779337478923e-05, + "loss": 1.4523, + "step": 15403 + }, + { + "epoch": 0.5516500438698587, + "grad_norm": 2.050502300262451, + "learning_rate": 8.815627592139665e-05, + "loss": 1.1354, + "step": 15404 + }, + { + "epoch": 0.551685855999427, + "grad_norm": 1.3366676568984985, + "learning_rate": 8.814475862734608e-05, + "loss": 1.568, + "step": 15405 + }, + { + "epoch": 0.5517216681289953, + "grad_norm": 1.8632868528366089, + "learning_rate": 8.813324149279254e-05, + "loss": 1.3182, + "step": 15406 + }, + { + "epoch": 0.5517574802585635, + "grad_norm": 1.6237183809280396, + "learning_rate": 8.812172451789086e-05, + "loss": 1.2567, + "step": 15407 + }, + { + "epoch": 0.5517932923881319, + "grad_norm": 1.8499741554260254, + "learning_rate": 8.811020770279612e-05, + "loss": 1.4372, + "step": 15408 + }, + { + "epoch": 0.5518291045177002, + "grad_norm": 1.5876901149749756, + "learning_rate": 8.809869104766318e-05, + "loss": 1.6619, + "step": 15409 + }, + { + "epoch": 0.5518649166472684, + "grad_norm": 1.637397289276123, + "learning_rate": 8.808717455264698e-05, + "loss": 1.7995, + "step": 15410 + }, + { + "epoch": 0.5519007287768367, + "grad_norm": 1.551072120666504, + "learning_rate": 8.80756582179025e-05, + "loss": 1.5143, + "step": 15411 + }, + { + "epoch": 0.551936540906405, + "grad_norm": 1.4706289768218994, + "learning_rate": 8.806414204358465e-05, + "loss": 1.3013, + "step": 15412 + }, + { + "epoch": 0.5519723530359733, + "grad_norm": 1.3856925964355469, + "learning_rate": 8.805262602984838e-05, + "loss": 1.4325, + "step": 15413 + }, + { + "epoch": 0.5520081651655415, + "grad_norm": 1.3337805271148682, + "learning_rate": 8.804111017684858e-05, + "loss": 1.4605, + "step": 15414 + }, + { + "epoch": 0.5520439772951099, + "grad_norm": 1.699302077293396, + "learning_rate": 8.802959448474025e-05, + "loss": 1.3212, + "step": 15415 + }, + { + "epoch": 0.5520797894246782, + "grad_norm": 1.4418692588806152, + "learning_rate": 8.801807895367827e-05, + "loss": 1.4426, + "step": 15416 + }, + { + "epoch": 0.5521156015542464, + "grad_norm": 1.6650549173355103, + "learning_rate": 8.80065635838176e-05, + "loss": 1.3448, + "step": 15417 + }, + { + "epoch": 0.5521514136838147, + "grad_norm": 1.9519139528274536, + "learning_rate": 8.799504837531315e-05, + "loss": 1.7081, + "step": 15418 + }, + { + "epoch": 0.552187225813383, + "grad_norm": 1.3951752185821533, + "learning_rate": 8.798353332831981e-05, + "loss": 1.3193, + "step": 15419 + }, + { + "epoch": 0.5522230379429512, + "grad_norm": 1.665259599685669, + "learning_rate": 8.797201844299257e-05, + "loss": 1.5382, + "step": 15420 + }, + { + "epoch": 0.5522588500725195, + "grad_norm": 1.426015019416809, + "learning_rate": 8.796050371948627e-05, + "loss": 1.5341, + "step": 15421 + }, + { + "epoch": 0.5522946622020879, + "grad_norm": 1.8713806867599487, + "learning_rate": 8.794898915795588e-05, + "loss": 1.6166, + "step": 15422 + }, + { + "epoch": 0.5523304743316562, + "grad_norm": 1.9937385320663452, + "learning_rate": 8.793747475855628e-05, + "loss": 1.2537, + "step": 15423 + }, + { + "epoch": 0.5523662864612244, + "grad_norm": 1.5066399574279785, + "learning_rate": 8.792596052144242e-05, + "loss": 1.7567, + "step": 15424 + }, + { + "epoch": 0.5524020985907927, + "grad_norm": 1.2568379640579224, + "learning_rate": 8.791444644676916e-05, + "loss": 1.3674, + "step": 15425 + }, + { + "epoch": 0.552437910720361, + "grad_norm": 1.4235587120056152, + "learning_rate": 8.790293253469145e-05, + "loss": 1.661, + "step": 15426 + }, + { + "epoch": 0.5524737228499292, + "grad_norm": 2.1775028705596924, + "learning_rate": 8.789141878536419e-05, + "loss": 1.3307, + "step": 15427 + }, + { + "epoch": 0.5525095349794975, + "grad_norm": 1.7875803709030151, + "learning_rate": 8.787990519894224e-05, + "loss": 1.4441, + "step": 15428 + }, + { + "epoch": 0.5525453471090659, + "grad_norm": 1.3177114725112915, + "learning_rate": 8.786839177558057e-05, + "loss": 1.7458, + "step": 15429 + }, + { + "epoch": 0.5525811592386342, + "grad_norm": 1.7923847436904907, + "learning_rate": 8.7856878515434e-05, + "loss": 1.6531, + "step": 15430 + }, + { + "epoch": 0.5526169713682024, + "grad_norm": 1.7845262289047241, + "learning_rate": 8.784536541865752e-05, + "loss": 1.382, + "step": 15431 + }, + { + "epoch": 0.5526527834977707, + "grad_norm": 1.7725820541381836, + "learning_rate": 8.783385248540591e-05, + "loss": 1.2906, + "step": 15432 + }, + { + "epoch": 0.552688595627339, + "grad_norm": 1.6670875549316406, + "learning_rate": 8.782233971583416e-05, + "loss": 1.7273, + "step": 15433 + }, + { + "epoch": 0.5527244077569072, + "grad_norm": 2.1010894775390625, + "learning_rate": 8.781082711009709e-05, + "loss": 1.7307, + "step": 15434 + }, + { + "epoch": 0.5527602198864755, + "grad_norm": 1.56618070602417, + "learning_rate": 8.779931466834965e-05, + "loss": 1.2643, + "step": 15435 + }, + { + "epoch": 0.5527960320160439, + "grad_norm": 1.57991623878479, + "learning_rate": 8.778780239074669e-05, + "loss": 1.3456, + "step": 15436 + }, + { + "epoch": 0.5528318441456122, + "grad_norm": 2.865915060043335, + "learning_rate": 8.777629027744307e-05, + "loss": 1.5232, + "step": 15437 + }, + { + "epoch": 0.5528676562751804, + "grad_norm": 1.6558449268341064, + "learning_rate": 8.776477832859374e-05, + "loss": 1.4121, + "step": 15438 + }, + { + "epoch": 0.5529034684047487, + "grad_norm": 2.0313730239868164, + "learning_rate": 8.77532665443535e-05, + "loss": 1.3529, + "step": 15439 + }, + { + "epoch": 0.552939280534317, + "grad_norm": 1.625325322151184, + "learning_rate": 8.774175492487728e-05, + "loss": 1.4956, + "step": 15440 + }, + { + "epoch": 0.5529750926638852, + "grad_norm": 1.4914642572402954, + "learning_rate": 8.77302434703199e-05, + "loss": 1.3326, + "step": 15441 + }, + { + "epoch": 0.5530109047934535, + "grad_norm": 2.1764352321624756, + "learning_rate": 8.771873218083631e-05, + "loss": 1.5886, + "step": 15442 + }, + { + "epoch": 0.5530467169230219, + "grad_norm": 1.5987873077392578, + "learning_rate": 8.770722105658132e-05, + "loss": 1.3776, + "step": 15443 + }, + { + "epoch": 0.5530825290525901, + "grad_norm": 1.513716459274292, + "learning_rate": 8.769571009770982e-05, + "loss": 1.3559, + "step": 15444 + }, + { + "epoch": 0.5531183411821584, + "grad_norm": 1.3887488842010498, + "learning_rate": 8.768419930437667e-05, + "loss": 1.6609, + "step": 15445 + }, + { + "epoch": 0.5531541533117267, + "grad_norm": 1.3617610931396484, + "learning_rate": 8.767268867673671e-05, + "loss": 1.6054, + "step": 15446 + }, + { + "epoch": 0.553189965441295, + "grad_norm": 1.9251405000686646, + "learning_rate": 8.766117821494485e-05, + "loss": 1.323, + "step": 15447 + }, + { + "epoch": 0.5532257775708632, + "grad_norm": 2.4319498538970947, + "learning_rate": 8.76496679191559e-05, + "loss": 1.7116, + "step": 15448 + }, + { + "epoch": 0.5532615897004315, + "grad_norm": 2.2947161197662354, + "learning_rate": 8.763815778952475e-05, + "loss": 1.504, + "step": 15449 + }, + { + "epoch": 0.5532974018299999, + "grad_norm": 1.3940249681472778, + "learning_rate": 8.762664782620623e-05, + "loss": 1.6442, + "step": 15450 + }, + { + "epoch": 0.5533332139595681, + "grad_norm": 1.715877890586853, + "learning_rate": 8.761513802935523e-05, + "loss": 1.6743, + "step": 15451 + }, + { + "epoch": 0.5533690260891364, + "grad_norm": 1.2846055030822754, + "learning_rate": 8.760362839912654e-05, + "loss": 1.4043, + "step": 15452 + }, + { + "epoch": 0.5534048382187047, + "grad_norm": 1.3711576461791992, + "learning_rate": 8.759211893567505e-05, + "loss": 1.5188, + "step": 15453 + }, + { + "epoch": 0.5534406503482729, + "grad_norm": 1.988338589668274, + "learning_rate": 8.758060963915562e-05, + "loss": 1.5419, + "step": 15454 + }, + { + "epoch": 0.5534764624778412, + "grad_norm": 2.278092384338379, + "learning_rate": 8.756910050972304e-05, + "loss": 1.5371, + "step": 15455 + }, + { + "epoch": 0.5535122746074095, + "grad_norm": 1.3961970806121826, + "learning_rate": 8.755759154753219e-05, + "loss": 1.3566, + "step": 15456 + }, + { + "epoch": 0.5535480867369779, + "grad_norm": 1.6591577529907227, + "learning_rate": 8.754608275273788e-05, + "loss": 1.2977, + "step": 15457 + }, + { + "epoch": 0.5535838988665461, + "grad_norm": 1.8924493789672852, + "learning_rate": 8.753457412549497e-05, + "loss": 1.4803, + "step": 15458 + }, + { + "epoch": 0.5536197109961144, + "grad_norm": 1.7182888984680176, + "learning_rate": 8.752306566595828e-05, + "loss": 1.3179, + "step": 15459 + }, + { + "epoch": 0.5536555231256827, + "grad_norm": 1.6327793598175049, + "learning_rate": 8.751155737428267e-05, + "loss": 1.6394, + "step": 15460 + }, + { + "epoch": 0.5536913352552509, + "grad_norm": 1.3792765140533447, + "learning_rate": 8.750004925062296e-05, + "loss": 1.649, + "step": 15461 + }, + { + "epoch": 0.5537271473848192, + "grad_norm": 1.5709487199783325, + "learning_rate": 8.74885412951339e-05, + "loss": 1.5722, + "step": 15462 + }, + { + "epoch": 0.5537629595143875, + "grad_norm": 2.683051347732544, + "learning_rate": 8.747703350797044e-05, + "loss": 1.532, + "step": 15463 + }, + { + "epoch": 0.5537987716439559, + "grad_norm": 1.412434458732605, + "learning_rate": 8.74655258892873e-05, + "loss": 1.4681, + "step": 15464 + }, + { + "epoch": 0.5538345837735241, + "grad_norm": 1.9447873830795288, + "learning_rate": 8.745401843923936e-05, + "loss": 1.7162, + "step": 15465 + }, + { + "epoch": 0.5538703959030924, + "grad_norm": 1.498840093612671, + "learning_rate": 8.74425111579814e-05, + "loss": 1.3753, + "step": 15466 + }, + { + "epoch": 0.5539062080326607, + "grad_norm": 1.703001618385315, + "learning_rate": 8.743100404566828e-05, + "loss": 1.6041, + "step": 15467 + }, + { + "epoch": 0.5539420201622289, + "grad_norm": 1.341660737991333, + "learning_rate": 8.741949710245476e-05, + "loss": 1.349, + "step": 15468 + }, + { + "epoch": 0.5539778322917972, + "grad_norm": 2.8604416847229004, + "learning_rate": 8.740799032849572e-05, + "loss": 1.397, + "step": 15469 + }, + { + "epoch": 0.5540136444213655, + "grad_norm": 1.7713994979858398, + "learning_rate": 8.739648372394592e-05, + "loss": 1.7506, + "step": 15470 + }, + { + "epoch": 0.5540494565509338, + "grad_norm": 1.9339109659194946, + "learning_rate": 8.738497728896013e-05, + "loss": 1.3016, + "step": 15471 + }, + { + "epoch": 0.5540852686805021, + "grad_norm": 1.7749813795089722, + "learning_rate": 8.737347102369325e-05, + "loss": 1.0844, + "step": 15472 + }, + { + "epoch": 0.5541210808100704, + "grad_norm": 2.379054307937622, + "learning_rate": 8.736196492829997e-05, + "loss": 1.71, + "step": 15473 + }, + { + "epoch": 0.5541568929396387, + "grad_norm": 2.4072840213775635, + "learning_rate": 8.735045900293522e-05, + "loss": 1.4868, + "step": 15474 + }, + { + "epoch": 0.5541927050692069, + "grad_norm": 1.7160054445266724, + "learning_rate": 8.733895324775366e-05, + "loss": 1.4881, + "step": 15475 + }, + { + "epoch": 0.5542285171987752, + "grad_norm": 2.1116483211517334, + "learning_rate": 8.73274476629102e-05, + "loss": 1.5777, + "step": 15476 + }, + { + "epoch": 0.5542643293283435, + "grad_norm": 1.7254281044006348, + "learning_rate": 8.731594224855956e-05, + "loss": 1.5687, + "step": 15477 + }, + { + "epoch": 0.5543001414579118, + "grad_norm": 1.8794066905975342, + "learning_rate": 8.730443700485658e-05, + "loss": 1.4314, + "step": 15478 + }, + { + "epoch": 0.5543359535874801, + "grad_norm": 1.509204387664795, + "learning_rate": 8.729293193195603e-05, + "loss": 1.3987, + "step": 15479 + }, + { + "epoch": 0.5543717657170484, + "grad_norm": 1.610112190246582, + "learning_rate": 8.728142703001264e-05, + "loss": 1.4781, + "step": 15480 + }, + { + "epoch": 0.5544075778466167, + "grad_norm": 2.039473295211792, + "learning_rate": 8.72699222991813e-05, + "loss": 1.499, + "step": 15481 + }, + { + "epoch": 0.5544433899761849, + "grad_norm": 3.141580581665039, + "learning_rate": 8.725841773961669e-05, + "loss": 1.7102, + "step": 15482 + }, + { + "epoch": 0.5544792021057532, + "grad_norm": 2.184549570083618, + "learning_rate": 8.724691335147367e-05, + "loss": 1.6045, + "step": 15483 + }, + { + "epoch": 0.5545150142353215, + "grad_norm": 1.9326050281524658, + "learning_rate": 8.723540913490693e-05, + "loss": 1.5926, + "step": 15484 + }, + { + "epoch": 0.5545508263648898, + "grad_norm": 1.7067333459854126, + "learning_rate": 8.722390509007137e-05, + "loss": 1.3034, + "step": 15485 + }, + { + "epoch": 0.5545866384944581, + "grad_norm": 1.8227620124816895, + "learning_rate": 8.721240121712161e-05, + "loss": 1.5729, + "step": 15486 + }, + { + "epoch": 0.5546224506240264, + "grad_norm": 1.3171581029891968, + "learning_rate": 8.720089751621256e-05, + "loss": 1.4384, + "step": 15487 + }, + { + "epoch": 0.5546582627535946, + "grad_norm": 1.4920912981033325, + "learning_rate": 8.71893939874989e-05, + "loss": 1.294, + "step": 15488 + }, + { + "epoch": 0.5546940748831629, + "grad_norm": 1.4924278259277344, + "learning_rate": 8.717789063113539e-05, + "loss": 1.5305, + "step": 15489 + }, + { + "epoch": 0.5547298870127312, + "grad_norm": 1.9032200574874878, + "learning_rate": 8.716638744727687e-05, + "loss": 1.2737, + "step": 15490 + }, + { + "epoch": 0.5547656991422995, + "grad_norm": 1.732216477394104, + "learning_rate": 8.7154884436078e-05, + "loss": 1.2741, + "step": 15491 + }, + { + "epoch": 0.5548015112718678, + "grad_norm": 1.5945426225662231, + "learning_rate": 8.714338159769366e-05, + "loss": 1.5329, + "step": 15492 + }, + { + "epoch": 0.5548373234014361, + "grad_norm": 1.8944737911224365, + "learning_rate": 8.713187893227847e-05, + "loss": 1.2802, + "step": 15493 + }, + { + "epoch": 0.5548731355310044, + "grad_norm": 1.787889003753662, + "learning_rate": 8.71203764399873e-05, + "loss": 1.5427, + "step": 15494 + }, + { + "epoch": 0.5549089476605726, + "grad_norm": 1.3428040742874146, + "learning_rate": 8.71088741209748e-05, + "loss": 1.4309, + "step": 15495 + }, + { + "epoch": 0.5549447597901409, + "grad_norm": 1.8700381517410278, + "learning_rate": 8.709737197539583e-05, + "loss": 1.6271, + "step": 15496 + }, + { + "epoch": 0.5549805719197092, + "grad_norm": 1.4687343835830688, + "learning_rate": 8.708587000340506e-05, + "loss": 1.4383, + "step": 15497 + }, + { + "epoch": 0.5550163840492774, + "grad_norm": 1.5101184844970703, + "learning_rate": 8.707436820515723e-05, + "loss": 1.4074, + "step": 15498 + }, + { + "epoch": 0.5550521961788458, + "grad_norm": 1.7550967931747437, + "learning_rate": 8.706286658080711e-05, + "loss": 1.6502, + "step": 15499 + }, + { + "epoch": 0.5550880083084141, + "grad_norm": 1.6945569515228271, + "learning_rate": 8.705136513050944e-05, + "loss": 1.2085, + "step": 15500 + }, + { + "epoch": 0.5551238204379824, + "grad_norm": 1.832401990890503, + "learning_rate": 8.703986385441895e-05, + "loss": 1.4714, + "step": 15501 + }, + { + "epoch": 0.5551596325675506, + "grad_norm": 1.4257477521896362, + "learning_rate": 8.702836275269033e-05, + "loss": 1.3904, + "step": 15502 + }, + { + "epoch": 0.5551954446971189, + "grad_norm": 1.3720061779022217, + "learning_rate": 8.701686182547842e-05, + "loss": 1.4555, + "step": 15503 + }, + { + "epoch": 0.5552312568266872, + "grad_norm": 1.8269095420837402, + "learning_rate": 8.700536107293784e-05, + "loss": 1.4776, + "step": 15504 + }, + { + "epoch": 0.5552670689562554, + "grad_norm": 1.655152440071106, + "learning_rate": 8.699386049522341e-05, + "loss": 1.6387, + "step": 15505 + }, + { + "epoch": 0.5553028810858238, + "grad_norm": 1.5963819026947021, + "learning_rate": 8.69823600924898e-05, + "loss": 1.5153, + "step": 15506 + }, + { + "epoch": 0.5553386932153921, + "grad_norm": 1.3038275241851807, + "learning_rate": 8.697085986489172e-05, + "loss": 1.6874, + "step": 15507 + }, + { + "epoch": 0.5553745053449604, + "grad_norm": 1.676790714263916, + "learning_rate": 8.695935981258394e-05, + "loss": 1.2814, + "step": 15508 + }, + { + "epoch": 0.5554103174745286, + "grad_norm": 1.8038157224655151, + "learning_rate": 8.694785993572112e-05, + "loss": 1.3005, + "step": 15509 + }, + { + "epoch": 0.5554461296040969, + "grad_norm": 1.5027525424957275, + "learning_rate": 8.693636023445804e-05, + "loss": 1.4746, + "step": 15510 + }, + { + "epoch": 0.5554819417336652, + "grad_norm": 1.7654838562011719, + "learning_rate": 8.692486070894935e-05, + "loss": 1.6604, + "step": 15511 + }, + { + "epoch": 0.5555177538632334, + "grad_norm": 1.8913116455078125, + "learning_rate": 8.691336135934982e-05, + "loss": 1.6467, + "step": 15512 + }, + { + "epoch": 0.5555535659928018, + "grad_norm": 1.5639491081237793, + "learning_rate": 8.69018621858141e-05, + "loss": 1.033, + "step": 15513 + }, + { + "epoch": 0.5555893781223701, + "grad_norm": 1.46904718875885, + "learning_rate": 8.689036318849697e-05, + "loss": 1.4832, + "step": 15514 + }, + { + "epoch": 0.5556251902519384, + "grad_norm": 1.6029126644134521, + "learning_rate": 8.68788643675531e-05, + "loss": 1.5136, + "step": 15515 + }, + { + "epoch": 0.5556610023815066, + "grad_norm": 1.6579877138137817, + "learning_rate": 8.686736572313714e-05, + "loss": 1.373, + "step": 15516 + }, + { + "epoch": 0.5556968145110749, + "grad_norm": 1.6767444610595703, + "learning_rate": 8.685586725540387e-05, + "loss": 1.3224, + "step": 15517 + }, + { + "epoch": 0.5557326266406432, + "grad_norm": 1.8820594549179077, + "learning_rate": 8.684436896450791e-05, + "loss": 1.6569, + "step": 15518 + }, + { + "epoch": 0.5557684387702114, + "grad_norm": 1.8585944175720215, + "learning_rate": 8.683287085060404e-05, + "loss": 1.594, + "step": 15519 + }, + { + "epoch": 0.5558042508997798, + "grad_norm": 1.6034525632858276, + "learning_rate": 8.682137291384687e-05, + "loss": 1.4458, + "step": 15520 + }, + { + "epoch": 0.5558400630293481, + "grad_norm": 1.3279809951782227, + "learning_rate": 8.680987515439116e-05, + "loss": 1.4297, + "step": 15521 + }, + { + "epoch": 0.5558758751589163, + "grad_norm": 1.51862370967865, + "learning_rate": 8.679837757239156e-05, + "loss": 1.6286, + "step": 15522 + }, + { + "epoch": 0.5559116872884846, + "grad_norm": 1.6439845561981201, + "learning_rate": 8.678688016800276e-05, + "loss": 1.2965, + "step": 15523 + }, + { + "epoch": 0.5559474994180529, + "grad_norm": 1.7435225248336792, + "learning_rate": 8.677538294137945e-05, + "loss": 1.6844, + "step": 15524 + }, + { + "epoch": 0.5559833115476212, + "grad_norm": 1.3427046537399292, + "learning_rate": 8.676388589267628e-05, + "loss": 1.3433, + "step": 15525 + }, + { + "epoch": 0.5560191236771894, + "grad_norm": 1.8325779438018799, + "learning_rate": 8.675238902204797e-05, + "loss": 1.5351, + "step": 15526 + }, + { + "epoch": 0.5560549358067578, + "grad_norm": 1.5658588409423828, + "learning_rate": 8.674089232964916e-05, + "loss": 1.7484, + "step": 15527 + }, + { + "epoch": 0.5560907479363261, + "grad_norm": 1.382843017578125, + "learning_rate": 8.672939581563456e-05, + "loss": 1.4252, + "step": 15528 + }, + { + "epoch": 0.5561265600658943, + "grad_norm": 2.3734495639801025, + "learning_rate": 8.67178994801588e-05, + "loss": 1.3529, + "step": 15529 + }, + { + "epoch": 0.5561623721954626, + "grad_norm": 1.3029054403305054, + "learning_rate": 8.67064033233766e-05, + "loss": 1.331, + "step": 15530 + }, + { + "epoch": 0.5561981843250309, + "grad_norm": 1.900862455368042, + "learning_rate": 8.669490734544256e-05, + "loss": 1.6135, + "step": 15531 + }, + { + "epoch": 0.5562339964545991, + "grad_norm": 1.9537557363510132, + "learning_rate": 8.668341154651141e-05, + "loss": 1.5744, + "step": 15532 + }, + { + "epoch": 0.5562698085841674, + "grad_norm": 1.6260302066802979, + "learning_rate": 8.667191592673779e-05, + "loss": 1.5679, + "step": 15533 + }, + { + "epoch": 0.5563056207137358, + "grad_norm": 2.043226957321167, + "learning_rate": 8.666042048627632e-05, + "loss": 1.6189, + "step": 15534 + }, + { + "epoch": 0.5563414328433041, + "grad_norm": 1.6133548021316528, + "learning_rate": 8.66489252252817e-05, + "loss": 1.6424, + "step": 15535 + }, + { + "epoch": 0.5563772449728723, + "grad_norm": 1.5509310960769653, + "learning_rate": 8.663743014390855e-05, + "loss": 1.4819, + "step": 15536 + }, + { + "epoch": 0.5564130571024406, + "grad_norm": 1.5829284191131592, + "learning_rate": 8.662593524231158e-05, + "loss": 1.2861, + "step": 15537 + }, + { + "epoch": 0.5564488692320089, + "grad_norm": 1.2532970905303955, + "learning_rate": 8.661444052064536e-05, + "loss": 1.128, + "step": 15538 + }, + { + "epoch": 0.5564846813615771, + "grad_norm": 1.8142882585525513, + "learning_rate": 8.66029459790646e-05, + "loss": 1.4828, + "step": 15539 + }, + { + "epoch": 0.5565204934911454, + "grad_norm": 1.6689441204071045, + "learning_rate": 8.65914516177239e-05, + "loss": 1.5474, + "step": 15540 + }, + { + "epoch": 0.5565563056207138, + "grad_norm": 1.3482906818389893, + "learning_rate": 8.657995743677793e-05, + "loss": 1.5433, + "step": 15541 + }, + { + "epoch": 0.5565921177502821, + "grad_norm": 2.2926156520843506, + "learning_rate": 8.656846343638135e-05, + "loss": 1.6321, + "step": 15542 + }, + { + "epoch": 0.5566279298798503, + "grad_norm": 1.465704083442688, + "learning_rate": 8.655696961668873e-05, + "loss": 1.5127, + "step": 15543 + }, + { + "epoch": 0.5566637420094186, + "grad_norm": 2.068370819091797, + "learning_rate": 8.654547597785478e-05, + "loss": 1.6719, + "step": 15544 + }, + { + "epoch": 0.5566995541389869, + "grad_norm": 1.6043318510055542, + "learning_rate": 8.653398252003406e-05, + "loss": 1.426, + "step": 15545 + }, + { + "epoch": 0.5567353662685551, + "grad_norm": 1.5261003971099854, + "learning_rate": 8.652248924338126e-05, + "loss": 1.507, + "step": 15546 + }, + { + "epoch": 0.5567711783981234, + "grad_norm": 1.734169363975525, + "learning_rate": 8.651099614805097e-05, + "loss": 1.516, + "step": 15547 + }, + { + "epoch": 0.5568069905276918, + "grad_norm": 1.581032633781433, + "learning_rate": 8.649950323419783e-05, + "loss": 1.2642, + "step": 15548 + }, + { + "epoch": 0.55684280265726, + "grad_norm": 1.9459565877914429, + "learning_rate": 8.648801050197646e-05, + "loss": 1.6195, + "step": 15549 + }, + { + "epoch": 0.5568786147868283, + "grad_norm": 1.9734375476837158, + "learning_rate": 8.647651795154148e-05, + "loss": 1.366, + "step": 15550 + }, + { + "epoch": 0.5569144269163966, + "grad_norm": 1.4467110633850098, + "learning_rate": 8.646502558304751e-05, + "loss": 1.4128, + "step": 15551 + }, + { + "epoch": 0.5569502390459649, + "grad_norm": 2.0393340587615967, + "learning_rate": 8.645353339664915e-05, + "loss": 1.4668, + "step": 15552 + }, + { + "epoch": 0.5569860511755331, + "grad_norm": 1.865412950515747, + "learning_rate": 8.644204139250105e-05, + "loss": 1.4342, + "step": 15553 + }, + { + "epoch": 0.5570218633051014, + "grad_norm": 1.486951470375061, + "learning_rate": 8.643054957075776e-05, + "loss": 1.3204, + "step": 15554 + }, + { + "epoch": 0.5570576754346698, + "grad_norm": 1.633739709854126, + "learning_rate": 8.641905793157395e-05, + "loss": 1.3796, + "step": 15555 + }, + { + "epoch": 0.557093487564238, + "grad_norm": 1.7872612476348877, + "learning_rate": 8.640756647510417e-05, + "loss": 1.6145, + "step": 15556 + }, + { + "epoch": 0.5571292996938063, + "grad_norm": 1.5471066236495972, + "learning_rate": 8.639607520150308e-05, + "loss": 1.4271, + "step": 15557 + }, + { + "epoch": 0.5571651118233746, + "grad_norm": 1.6009304523468018, + "learning_rate": 8.638458411092527e-05, + "loss": 1.2723, + "step": 15558 + }, + { + "epoch": 0.5572009239529429, + "grad_norm": 1.389464259147644, + "learning_rate": 8.637309320352526e-05, + "loss": 1.6569, + "step": 15559 + }, + { + "epoch": 0.5572367360825111, + "grad_norm": 1.8739726543426514, + "learning_rate": 8.636160247945774e-05, + "loss": 1.6658, + "step": 15560 + }, + { + "epoch": 0.5572725482120794, + "grad_norm": 1.4336637258529663, + "learning_rate": 8.635011193887725e-05, + "loss": 1.4241, + "step": 15561 + }, + { + "epoch": 0.5573083603416478, + "grad_norm": 1.8983516693115234, + "learning_rate": 8.633862158193841e-05, + "loss": 1.4288, + "step": 15562 + }, + { + "epoch": 0.557344172471216, + "grad_norm": 1.7363568544387817, + "learning_rate": 8.632713140879577e-05, + "loss": 1.6088, + "step": 15563 + }, + { + "epoch": 0.5573799846007843, + "grad_norm": 1.7875018119812012, + "learning_rate": 8.631564141960397e-05, + "loss": 1.4726, + "step": 15564 + }, + { + "epoch": 0.5574157967303526, + "grad_norm": 1.811866044998169, + "learning_rate": 8.630415161451754e-05, + "loss": 1.6785, + "step": 15565 + }, + { + "epoch": 0.5574516088599208, + "grad_norm": 1.505098581314087, + "learning_rate": 8.62926619936911e-05, + "loss": 1.4012, + "step": 15566 + }, + { + "epoch": 0.5574874209894891, + "grad_norm": 1.4308191537857056, + "learning_rate": 8.628117255727924e-05, + "loss": 1.4559, + "step": 15567 + }, + { + "epoch": 0.5575232331190574, + "grad_norm": 1.7727469205856323, + "learning_rate": 8.626968330543643e-05, + "loss": 1.4809, + "step": 15568 + }, + { + "epoch": 0.5575590452486258, + "grad_norm": 1.7335854768753052, + "learning_rate": 8.62581942383174e-05, + "loss": 1.5017, + "step": 15569 + }, + { + "epoch": 0.557594857378194, + "grad_norm": 1.486517071723938, + "learning_rate": 8.624670535607658e-05, + "loss": 1.3723, + "step": 15570 + }, + { + "epoch": 0.5576306695077623, + "grad_norm": 2.2205562591552734, + "learning_rate": 8.623521665886865e-05, + "loss": 1.7048, + "step": 15571 + }, + { + "epoch": 0.5576664816373306, + "grad_norm": 1.4245879650115967, + "learning_rate": 8.622372814684806e-05, + "loss": 1.4024, + "step": 15572 + }, + { + "epoch": 0.5577022937668988, + "grad_norm": 1.3684970140457153, + "learning_rate": 8.621223982016948e-05, + "loss": 1.4418, + "step": 15573 + }, + { + "epoch": 0.5577381058964671, + "grad_norm": 1.8130236864089966, + "learning_rate": 8.620075167898743e-05, + "loss": 1.4848, + "step": 15574 + }, + { + "epoch": 0.5577739180260354, + "grad_norm": 1.3739123344421387, + "learning_rate": 8.618926372345645e-05, + "loss": 1.583, + "step": 15575 + }, + { + "epoch": 0.5578097301556038, + "grad_norm": 1.366071105003357, + "learning_rate": 8.617777595373117e-05, + "loss": 1.4026, + "step": 15576 + }, + { + "epoch": 0.557845542285172, + "grad_norm": 1.5953060388565063, + "learning_rate": 8.6166288369966e-05, + "loss": 1.2789, + "step": 15577 + }, + { + "epoch": 0.5578813544147403, + "grad_norm": 1.4939026832580566, + "learning_rate": 8.615480097231564e-05, + "loss": 1.1593, + "step": 15578 + }, + { + "epoch": 0.5579171665443086, + "grad_norm": 1.7710267305374146, + "learning_rate": 8.614331376093452e-05, + "loss": 1.7925, + "step": 15579 + }, + { + "epoch": 0.5579529786738768, + "grad_norm": 1.9933295249938965, + "learning_rate": 8.613182673597729e-05, + "loss": 1.5553, + "step": 15580 + }, + { + "epoch": 0.5579887908034451, + "grad_norm": 2.1700258255004883, + "learning_rate": 8.612033989759838e-05, + "loss": 1.4692, + "step": 15581 + }, + { + "epoch": 0.5580246029330134, + "grad_norm": 1.4422943592071533, + "learning_rate": 8.610885324595249e-05, + "loss": 1.5315, + "step": 15582 + }, + { + "epoch": 0.5580604150625817, + "grad_norm": 1.459005355834961, + "learning_rate": 8.609736678119396e-05, + "loss": 1.4583, + "step": 15583 + }, + { + "epoch": 0.55809622719215, + "grad_norm": 1.8012620210647583, + "learning_rate": 8.60858805034775e-05, + "loss": 1.6828, + "step": 15584 + }, + { + "epoch": 0.5581320393217183, + "grad_norm": 1.4135667085647583, + "learning_rate": 8.607439441295755e-05, + "loss": 1.4163, + "step": 15585 + }, + { + "epoch": 0.5581678514512866, + "grad_norm": 1.6235884428024292, + "learning_rate": 8.606290850978862e-05, + "loss": 1.4135, + "step": 15586 + }, + { + "epoch": 0.5582036635808548, + "grad_norm": 2.004629611968994, + "learning_rate": 8.605142279412533e-05, + "loss": 1.7444, + "step": 15587 + }, + { + "epoch": 0.5582394757104231, + "grad_norm": 1.713678002357483, + "learning_rate": 8.60399372661221e-05, + "loss": 1.7045, + "step": 15588 + }, + { + "epoch": 0.5582752878399914, + "grad_norm": 1.9293084144592285, + "learning_rate": 8.602845192593359e-05, + "loss": 1.1365, + "step": 15589 + }, + { + "epoch": 0.5583110999695597, + "grad_norm": 1.6164830923080444, + "learning_rate": 8.601696677371414e-05, + "loss": 1.6059, + "step": 15590 + }, + { + "epoch": 0.558346912099128, + "grad_norm": 2.2314774990081787, + "learning_rate": 8.600548180961845e-05, + "loss": 1.342, + "step": 15591 + }, + { + "epoch": 0.5583827242286963, + "grad_norm": 1.5393816232681274, + "learning_rate": 8.599399703380087e-05, + "loss": 1.3335, + "step": 15592 + }, + { + "epoch": 0.5584185363582646, + "grad_norm": 1.6473439931869507, + "learning_rate": 8.598251244641608e-05, + "loss": 1.4625, + "step": 15593 + }, + { + "epoch": 0.5584543484878328, + "grad_norm": 1.7329156398773193, + "learning_rate": 8.597102804761846e-05, + "loss": 1.6533, + "step": 15594 + }, + { + "epoch": 0.5584901606174011, + "grad_norm": 1.7898093461990356, + "learning_rate": 8.595954383756256e-05, + "loss": 1.3945, + "step": 15595 + }, + { + "epoch": 0.5585259727469694, + "grad_norm": 1.6420942544937134, + "learning_rate": 8.594805981640289e-05, + "loss": 1.5115, + "step": 15596 + }, + { + "epoch": 0.5585617848765377, + "grad_norm": 1.4303438663482666, + "learning_rate": 8.593657598429395e-05, + "loss": 1.4199, + "step": 15597 + }, + { + "epoch": 0.558597597006106, + "grad_norm": 1.711222767829895, + "learning_rate": 8.592509234139023e-05, + "loss": 1.3911, + "step": 15598 + }, + { + "epoch": 0.5586334091356743, + "grad_norm": 1.6813573837280273, + "learning_rate": 8.591360888784622e-05, + "loss": 1.3908, + "step": 15599 + }, + { + "epoch": 0.5586692212652425, + "grad_norm": 1.3513386249542236, + "learning_rate": 8.590212562381649e-05, + "loss": 1.5484, + "step": 15600 + }, + { + "epoch": 0.5587050333948108, + "grad_norm": 2.3890459537506104, + "learning_rate": 8.589064254945541e-05, + "loss": 1.5111, + "step": 15601 + }, + { + "epoch": 0.5587408455243791, + "grad_norm": 1.4889732599258423, + "learning_rate": 8.58791596649176e-05, + "loss": 1.4165, + "step": 15602 + }, + { + "epoch": 0.5587766576539474, + "grad_norm": 2.1360843181610107, + "learning_rate": 8.586767697035745e-05, + "loss": 1.5494, + "step": 15603 + }, + { + "epoch": 0.5588124697835157, + "grad_norm": 1.2866389751434326, + "learning_rate": 8.585619446592947e-05, + "loss": 1.4871, + "step": 15604 + }, + { + "epoch": 0.558848281913084, + "grad_norm": 1.881407380104065, + "learning_rate": 8.584471215178817e-05, + "loss": 1.5327, + "step": 15605 + }, + { + "epoch": 0.5588840940426523, + "grad_norm": 2.2813119888305664, + "learning_rate": 8.583323002808798e-05, + "loss": 1.8198, + "step": 15606 + }, + { + "epoch": 0.5589199061722205, + "grad_norm": 1.8343212604522705, + "learning_rate": 8.582174809498343e-05, + "loss": 1.5162, + "step": 15607 + }, + { + "epoch": 0.5589557183017888, + "grad_norm": 1.4812381267547607, + "learning_rate": 8.581026635262894e-05, + "loss": 1.6672, + "step": 15608 + }, + { + "epoch": 0.5589915304313571, + "grad_norm": 1.795681357383728, + "learning_rate": 8.579878480117906e-05, + "loss": 1.361, + "step": 15609 + }, + { + "epoch": 0.5590273425609253, + "grad_norm": 1.547515630722046, + "learning_rate": 8.578730344078817e-05, + "loss": 1.1694, + "step": 15610 + }, + { + "epoch": 0.5590631546904937, + "grad_norm": 2.669339656829834, + "learning_rate": 8.577582227161081e-05, + "loss": 1.6002, + "step": 15611 + }, + { + "epoch": 0.559098966820062, + "grad_norm": 1.5305811166763306, + "learning_rate": 8.57643412938014e-05, + "loss": 1.2915, + "step": 15612 + }, + { + "epoch": 0.5591347789496303, + "grad_norm": 2.1755855083465576, + "learning_rate": 8.575286050751441e-05, + "loss": 1.5044, + "step": 15613 + }, + { + "epoch": 0.5591705910791985, + "grad_norm": 1.5070805549621582, + "learning_rate": 8.574137991290432e-05, + "loss": 1.6467, + "step": 15614 + }, + { + "epoch": 0.5592064032087668, + "grad_norm": 1.981643795967102, + "learning_rate": 8.572989951012554e-05, + "loss": 1.484, + "step": 15615 + }, + { + "epoch": 0.5592422153383351, + "grad_norm": 1.5245190858840942, + "learning_rate": 8.571841929933258e-05, + "loss": 1.5356, + "step": 15616 + }, + { + "epoch": 0.5592780274679033, + "grad_norm": 1.464753270149231, + "learning_rate": 8.570693928067986e-05, + "loss": 1.2764, + "step": 15617 + }, + { + "epoch": 0.5593138395974717, + "grad_norm": 1.790344476699829, + "learning_rate": 8.569545945432185e-05, + "loss": 1.3695, + "step": 15618 + }, + { + "epoch": 0.55934965172704, + "grad_norm": 2.0477633476257324, + "learning_rate": 8.568397982041294e-05, + "loss": 1.5747, + "step": 15619 + }, + { + "epoch": 0.5593854638566083, + "grad_norm": 1.3731274604797363, + "learning_rate": 8.567250037910766e-05, + "loss": 1.5434, + "step": 15620 + }, + { + "epoch": 0.5594212759861765, + "grad_norm": 2.218919277191162, + "learning_rate": 8.56610211305604e-05, + "loss": 1.7308, + "step": 15621 + }, + { + "epoch": 0.5594570881157448, + "grad_norm": 1.6405763626098633, + "learning_rate": 8.564954207492558e-05, + "loss": 1.5039, + "step": 15622 + }, + { + "epoch": 0.5594929002453131, + "grad_norm": 1.4550385475158691, + "learning_rate": 8.56380632123577e-05, + "loss": 1.303, + "step": 15623 + }, + { + "epoch": 0.5595287123748813, + "grad_norm": 2.2137668132781982, + "learning_rate": 8.56265845430111e-05, + "loss": 1.7529, + "step": 15624 + }, + { + "epoch": 0.5595645245044497, + "grad_norm": 1.7952656745910645, + "learning_rate": 8.561510606704031e-05, + "loss": 1.4931, + "step": 15625 + }, + { + "epoch": 0.559600336634018, + "grad_norm": 2.030855417251587, + "learning_rate": 8.560362778459968e-05, + "loss": 1.302, + "step": 15626 + }, + { + "epoch": 0.5596361487635862, + "grad_norm": 1.3937506675720215, + "learning_rate": 8.55921496958437e-05, + "loss": 1.5187, + "step": 15627 + }, + { + "epoch": 0.5596719608931545, + "grad_norm": 2.013484477996826, + "learning_rate": 8.558067180092673e-05, + "loss": 1.5022, + "step": 15628 + }, + { + "epoch": 0.5597077730227228, + "grad_norm": 1.5687587261199951, + "learning_rate": 8.556919410000323e-05, + "loss": 1.4391, + "step": 15629 + }, + { + "epoch": 0.5597435851522911, + "grad_norm": 1.5793520212173462, + "learning_rate": 8.555771659322765e-05, + "loss": 1.5087, + "step": 15630 + }, + { + "epoch": 0.5597793972818593, + "grad_norm": 1.5034009218215942, + "learning_rate": 8.55462392807543e-05, + "loss": 1.4148, + "step": 15631 + }, + { + "epoch": 0.5598152094114277, + "grad_norm": 1.5135999917984009, + "learning_rate": 8.55347621627377e-05, + "loss": 1.3035, + "step": 15632 + }, + { + "epoch": 0.559851021540996, + "grad_norm": 1.4237425327301025, + "learning_rate": 8.55232852393322e-05, + "loss": 1.4377, + "step": 15633 + }, + { + "epoch": 0.5598868336705642, + "grad_norm": 1.6626579761505127, + "learning_rate": 8.551180851069222e-05, + "loss": 1.3463, + "step": 15634 + }, + { + "epoch": 0.5599226458001325, + "grad_norm": 1.5885761976242065, + "learning_rate": 8.550033197697218e-05, + "loss": 1.3632, + "step": 15635 + }, + { + "epoch": 0.5599584579297008, + "grad_norm": 1.4434884786605835, + "learning_rate": 8.548885563832646e-05, + "loss": 1.2362, + "step": 15636 + }, + { + "epoch": 0.559994270059269, + "grad_norm": 1.8405886888504028, + "learning_rate": 8.547737949490946e-05, + "loss": 1.6275, + "step": 15637 + }, + { + "epoch": 0.5600300821888373, + "grad_norm": 1.6832880973815918, + "learning_rate": 8.546590354687562e-05, + "loss": 1.7815, + "step": 15638 + }, + { + "epoch": 0.5600658943184057, + "grad_norm": 1.6571459770202637, + "learning_rate": 8.545442779437928e-05, + "loss": 1.4971, + "step": 15639 + }, + { + "epoch": 0.560101706447974, + "grad_norm": 1.7768093347549438, + "learning_rate": 8.544295223757484e-05, + "loss": 1.6878, + "step": 15640 + }, + { + "epoch": 0.5601375185775422, + "grad_norm": 1.4609851837158203, + "learning_rate": 8.543147687661673e-05, + "loss": 1.3395, + "step": 15641 + }, + { + "epoch": 0.5601733307071105, + "grad_norm": 1.545523762702942, + "learning_rate": 8.542000171165928e-05, + "loss": 1.5403, + "step": 15642 + }, + { + "epoch": 0.5602091428366788, + "grad_norm": 1.7643197774887085, + "learning_rate": 8.540852674285691e-05, + "loss": 1.7647, + "step": 15643 + }, + { + "epoch": 0.560244954966247, + "grad_norm": 1.7095211744308472, + "learning_rate": 8.539705197036398e-05, + "loss": 1.1087, + "step": 15644 + }, + { + "epoch": 0.5602807670958153, + "grad_norm": 1.9334076642990112, + "learning_rate": 8.53855773943349e-05, + "loss": 1.6575, + "step": 15645 + }, + { + "epoch": 0.5603165792253836, + "grad_norm": 1.9181638956069946, + "learning_rate": 8.537410301492398e-05, + "loss": 1.4329, + "step": 15646 + }, + { + "epoch": 0.560352391354952, + "grad_norm": 1.4931620359420776, + "learning_rate": 8.53626288322857e-05, + "loss": 1.3542, + "step": 15647 + }, + { + "epoch": 0.5603882034845202, + "grad_norm": 1.7490475177764893, + "learning_rate": 8.535115484657434e-05, + "loss": 1.2913, + "step": 15648 + }, + { + "epoch": 0.5604240156140885, + "grad_norm": 1.412573218345642, + "learning_rate": 8.533968105794428e-05, + "loss": 1.453, + "step": 15649 + }, + { + "epoch": 0.5604598277436568, + "grad_norm": 1.6083836555480957, + "learning_rate": 8.532820746654993e-05, + "loss": 1.6949, + "step": 15650 + }, + { + "epoch": 0.560495639873225, + "grad_norm": 1.9700466394424438, + "learning_rate": 8.53167340725456e-05, + "loss": 1.4605, + "step": 15651 + }, + { + "epoch": 0.5605314520027933, + "grad_norm": 1.7195794582366943, + "learning_rate": 8.530526087608569e-05, + "loss": 1.54, + "step": 15652 + }, + { + "epoch": 0.5605672641323616, + "grad_norm": 1.3336139917373657, + "learning_rate": 8.529378787732451e-05, + "loss": 1.5449, + "step": 15653 + }, + { + "epoch": 0.56060307626193, + "grad_norm": 1.7417322397232056, + "learning_rate": 8.528231507641648e-05, + "loss": 1.2722, + "step": 15654 + }, + { + "epoch": 0.5606388883914982, + "grad_norm": 1.8747280836105347, + "learning_rate": 8.527084247351595e-05, + "loss": 1.2224, + "step": 15655 + }, + { + "epoch": 0.5606747005210665, + "grad_norm": 1.4631984233856201, + "learning_rate": 8.525937006877714e-05, + "loss": 1.2657, + "step": 15656 + }, + { + "epoch": 0.5607105126506348, + "grad_norm": 2.2184841632843018, + "learning_rate": 8.524789786235458e-05, + "loss": 1.437, + "step": 15657 + }, + { + "epoch": 0.560746324780203, + "grad_norm": 1.5433226823806763, + "learning_rate": 8.523642585440245e-05, + "loss": 1.5117, + "step": 15658 + }, + { + "epoch": 0.5607821369097713, + "grad_norm": 1.7362242937088013, + "learning_rate": 8.522495404507521e-05, + "loss": 1.7659, + "step": 15659 + }, + { + "epoch": 0.5608179490393396, + "grad_norm": 1.6187702417373657, + "learning_rate": 8.521348243452714e-05, + "loss": 1.3126, + "step": 15660 + }, + { + "epoch": 0.560853761168908, + "grad_norm": 1.7105422019958496, + "learning_rate": 8.52020110229126e-05, + "loss": 1.468, + "step": 15661 + }, + { + "epoch": 0.5608895732984762, + "grad_norm": 1.5425986051559448, + "learning_rate": 8.51905398103859e-05, + "loss": 1.219, + "step": 15662 + }, + { + "epoch": 0.5609253854280445, + "grad_norm": 1.5455344915390015, + "learning_rate": 8.51790687971014e-05, + "loss": 1.4256, + "step": 15663 + }, + { + "epoch": 0.5609611975576128, + "grad_norm": 1.4394350051879883, + "learning_rate": 8.516759798321345e-05, + "loss": 1.3927, + "step": 15664 + }, + { + "epoch": 0.560997009687181, + "grad_norm": 1.6390577554702759, + "learning_rate": 8.515612736887627e-05, + "loss": 1.3861, + "step": 15665 + }, + { + "epoch": 0.5610328218167493, + "grad_norm": 1.270395040512085, + "learning_rate": 8.51446569542443e-05, + "loss": 1.0775, + "step": 15666 + }, + { + "epoch": 0.5610686339463176, + "grad_norm": 2.1888267993927, + "learning_rate": 8.513318673947173e-05, + "loss": 1.8735, + "step": 15667 + }, + { + "epoch": 0.5611044460758859, + "grad_norm": 1.4443387985229492, + "learning_rate": 8.512171672471305e-05, + "loss": 1.3064, + "step": 15668 + }, + { + "epoch": 0.5611402582054542, + "grad_norm": 1.3692225217819214, + "learning_rate": 8.51102469101224e-05, + "loss": 1.2717, + "step": 15669 + }, + { + "epoch": 0.5611760703350225, + "grad_norm": 1.8987263441085815, + "learning_rate": 8.509877729585423e-05, + "loss": 1.5354, + "step": 15670 + }, + { + "epoch": 0.5612118824645907, + "grad_norm": 1.4163073301315308, + "learning_rate": 8.508730788206273e-05, + "loss": 1.1945, + "step": 15671 + }, + { + "epoch": 0.561247694594159, + "grad_norm": 1.583075761795044, + "learning_rate": 8.507583866890233e-05, + "loss": 1.669, + "step": 15672 + }, + { + "epoch": 0.5612835067237273, + "grad_norm": 1.6579574346542358, + "learning_rate": 8.506436965652728e-05, + "loss": 1.2003, + "step": 15673 + }, + { + "epoch": 0.5613193188532956, + "grad_norm": 3.4547436237335205, + "learning_rate": 8.50529008450918e-05, + "loss": 1.6151, + "step": 15674 + }, + { + "epoch": 0.5613551309828639, + "grad_norm": 1.7894209623336792, + "learning_rate": 8.504143223475031e-05, + "loss": 1.4897, + "step": 15675 + }, + { + "epoch": 0.5613909431124322, + "grad_norm": 1.423621416091919, + "learning_rate": 8.502996382565702e-05, + "loss": 1.6746, + "step": 15676 + }, + { + "epoch": 0.5614267552420005, + "grad_norm": 1.8405758142471313, + "learning_rate": 8.501849561796631e-05, + "loss": 1.2419, + "step": 15677 + }, + { + "epoch": 0.5614625673715687, + "grad_norm": 1.9967997074127197, + "learning_rate": 8.500702761183234e-05, + "loss": 1.4297, + "step": 15678 + }, + { + "epoch": 0.561498379501137, + "grad_norm": 2.1513876914978027, + "learning_rate": 8.499555980740956e-05, + "loss": 1.4807, + "step": 15679 + }, + { + "epoch": 0.5615341916307053, + "grad_norm": 1.7307462692260742, + "learning_rate": 8.498409220485208e-05, + "loss": 1.5421, + "step": 15680 + }, + { + "epoch": 0.5615700037602736, + "grad_norm": 1.9727336168289185, + "learning_rate": 8.497262480431435e-05, + "loss": 1.393, + "step": 15681 + }, + { + "epoch": 0.5616058158898419, + "grad_norm": 1.4967458248138428, + "learning_rate": 8.496115760595054e-05, + "loss": 1.4945, + "step": 15682 + }, + { + "epoch": 0.5616416280194102, + "grad_norm": 1.8036267757415771, + "learning_rate": 8.494969060991493e-05, + "loss": 1.5742, + "step": 15683 + }, + { + "epoch": 0.5616774401489785, + "grad_norm": 1.3200416564941406, + "learning_rate": 8.493822381636185e-05, + "loss": 1.4853, + "step": 15684 + }, + { + "epoch": 0.5617132522785467, + "grad_norm": 1.4074757099151611, + "learning_rate": 8.49267572254455e-05, + "loss": 1.5135, + "step": 15685 + }, + { + "epoch": 0.561749064408115, + "grad_norm": 1.7859296798706055, + "learning_rate": 8.491529083732025e-05, + "loss": 1.4796, + "step": 15686 + }, + { + "epoch": 0.5617848765376833, + "grad_norm": 1.585807204246521, + "learning_rate": 8.490382465214025e-05, + "loss": 1.5701, + "step": 15687 + }, + { + "epoch": 0.5618206886672515, + "grad_norm": 2.1410164833068848, + "learning_rate": 8.489235867005985e-05, + "loss": 1.5223, + "step": 15688 + }, + { + "epoch": 0.5618565007968199, + "grad_norm": 1.5662782192230225, + "learning_rate": 8.488089289123324e-05, + "loss": 1.3799, + "step": 15689 + }, + { + "epoch": 0.5618923129263882, + "grad_norm": 1.8063139915466309, + "learning_rate": 8.486942731581478e-05, + "loss": 1.5595, + "step": 15690 + }, + { + "epoch": 0.5619281250559565, + "grad_norm": 2.0988364219665527, + "learning_rate": 8.485796194395862e-05, + "loss": 1.4958, + "step": 15691 + }, + { + "epoch": 0.5619639371855247, + "grad_norm": 1.6503610610961914, + "learning_rate": 8.484649677581904e-05, + "loss": 1.4374, + "step": 15692 + }, + { + "epoch": 0.561999749315093, + "grad_norm": 2.550079584121704, + "learning_rate": 8.483503181155031e-05, + "loss": 1.3537, + "step": 15693 + }, + { + "epoch": 0.5620355614446613, + "grad_norm": 1.5846173763275146, + "learning_rate": 8.482356705130665e-05, + "loss": 1.4495, + "step": 15694 + }, + { + "epoch": 0.5620713735742295, + "grad_norm": 1.265183687210083, + "learning_rate": 8.481210249524234e-05, + "loss": 1.4833, + "step": 15695 + }, + { + "epoch": 0.5621071857037979, + "grad_norm": 1.5761173963546753, + "learning_rate": 8.480063814351159e-05, + "loss": 1.3988, + "step": 15696 + }, + { + "epoch": 0.5621429978333662, + "grad_norm": 1.5768393278121948, + "learning_rate": 8.478917399626865e-05, + "loss": 1.3578, + "step": 15697 + }, + { + "epoch": 0.5621788099629345, + "grad_norm": 2.0033600330352783, + "learning_rate": 8.477771005366772e-05, + "loss": 1.3293, + "step": 15698 + }, + { + "epoch": 0.5622146220925027, + "grad_norm": 2.234950065612793, + "learning_rate": 8.476624631586313e-05, + "loss": 1.5952, + "step": 15699 + }, + { + "epoch": 0.562250434222071, + "grad_norm": 1.814936637878418, + "learning_rate": 8.475478278300902e-05, + "loss": 1.1814, + "step": 15700 + }, + { + "epoch": 0.5622862463516393, + "grad_norm": 1.5817487239837646, + "learning_rate": 8.474331945525963e-05, + "loss": 1.5717, + "step": 15701 + }, + { + "epoch": 0.5623220584812075, + "grad_norm": 1.440826654434204, + "learning_rate": 8.47318563327692e-05, + "loss": 1.4432, + "step": 15702 + }, + { + "epoch": 0.5623578706107759, + "grad_norm": 1.247573733329773, + "learning_rate": 8.472039341569195e-05, + "loss": 1.5174, + "step": 15703 + }, + { + "epoch": 0.5623936827403442, + "grad_norm": 1.999563217163086, + "learning_rate": 8.470893070418211e-05, + "loss": 1.622, + "step": 15704 + }, + { + "epoch": 0.5624294948699124, + "grad_norm": 2.3003087043762207, + "learning_rate": 8.469746819839387e-05, + "loss": 1.4743, + "step": 15705 + }, + { + "epoch": 0.5624653069994807, + "grad_norm": 1.6141663789749146, + "learning_rate": 8.468600589848146e-05, + "loss": 1.0057, + "step": 15706 + }, + { + "epoch": 0.562501119129049, + "grad_norm": 1.4969419240951538, + "learning_rate": 8.467454380459907e-05, + "loss": 1.4611, + "step": 15707 + }, + { + "epoch": 0.5625369312586173, + "grad_norm": 1.8018219470977783, + "learning_rate": 8.466308191690096e-05, + "loss": 1.7724, + "step": 15708 + }, + { + "epoch": 0.5625727433881855, + "grad_norm": 1.618523359298706, + "learning_rate": 8.46516202355413e-05, + "loss": 1.318, + "step": 15709 + }, + { + "epoch": 0.5626085555177539, + "grad_norm": 1.5267356634140015, + "learning_rate": 8.464015876067425e-05, + "loss": 1.5282, + "step": 15710 + }, + { + "epoch": 0.5626443676473222, + "grad_norm": 2.576260566711426, + "learning_rate": 8.462869749245408e-05, + "loss": 1.381, + "step": 15711 + }, + { + "epoch": 0.5626801797768904, + "grad_norm": 1.6371855735778809, + "learning_rate": 8.461723643103494e-05, + "loss": 1.516, + "step": 15712 + }, + { + "epoch": 0.5627159919064587, + "grad_norm": 2.0176661014556885, + "learning_rate": 8.460577557657107e-05, + "loss": 1.5839, + "step": 15713 + }, + { + "epoch": 0.562751804036027, + "grad_norm": 1.6947283744812012, + "learning_rate": 8.45943149292166e-05, + "loss": 1.2497, + "step": 15714 + }, + { + "epoch": 0.5627876161655953, + "grad_norm": 1.763598918914795, + "learning_rate": 8.458285448912578e-05, + "loss": 1.5134, + "step": 15715 + }, + { + "epoch": 0.5628234282951635, + "grad_norm": 1.442230463027954, + "learning_rate": 8.457139425645273e-05, + "loss": 1.3091, + "step": 15716 + }, + { + "epoch": 0.5628592404247319, + "grad_norm": 2.001328706741333, + "learning_rate": 8.455993423135172e-05, + "loss": 1.4218, + "step": 15717 + }, + { + "epoch": 0.5628950525543002, + "grad_norm": 1.904796838760376, + "learning_rate": 8.454847441397684e-05, + "loss": 1.7912, + "step": 15718 + }, + { + "epoch": 0.5629308646838684, + "grad_norm": 2.0739176273345947, + "learning_rate": 8.45370148044823e-05, + "loss": 1.5481, + "step": 15719 + }, + { + "epoch": 0.5629666768134367, + "grad_norm": 2.203639030456543, + "learning_rate": 8.452555540302231e-05, + "loss": 1.6342, + "step": 15720 + }, + { + "epoch": 0.563002488943005, + "grad_norm": 1.9044723510742188, + "learning_rate": 8.451409620975099e-05, + "loss": 1.491, + "step": 15721 + }, + { + "epoch": 0.5630383010725732, + "grad_norm": 1.4646804332733154, + "learning_rate": 8.450263722482255e-05, + "loss": 1.2206, + "step": 15722 + }, + { + "epoch": 0.5630741132021415, + "grad_norm": 2.891911268234253, + "learning_rate": 8.44911784483911e-05, + "loss": 1.8076, + "step": 15723 + }, + { + "epoch": 0.5631099253317099, + "grad_norm": 1.5828810930252075, + "learning_rate": 8.447971988061088e-05, + "loss": 1.6057, + "step": 15724 + }, + { + "epoch": 0.5631457374612782, + "grad_norm": 1.6931911706924438, + "learning_rate": 8.446826152163598e-05, + "loss": 1.6362, + "step": 15725 + }, + { + "epoch": 0.5631815495908464, + "grad_norm": 1.713527798652649, + "learning_rate": 8.44568033716206e-05, + "loss": 1.251, + "step": 15726 + }, + { + "epoch": 0.5632173617204147, + "grad_norm": 1.8473023176193237, + "learning_rate": 8.444534543071891e-05, + "loss": 1.661, + "step": 15727 + }, + { + "epoch": 0.563253173849983, + "grad_norm": 1.4170857667922974, + "learning_rate": 8.443388769908498e-05, + "loss": 1.5997, + "step": 15728 + }, + { + "epoch": 0.5632889859795512, + "grad_norm": 2.1845030784606934, + "learning_rate": 8.442243017687304e-05, + "loss": 1.2984, + "step": 15729 + }, + { + "epoch": 0.5633247981091195, + "grad_norm": 2.019676446914673, + "learning_rate": 8.44109728642372e-05, + "loss": 1.5803, + "step": 15730 + }, + { + "epoch": 0.5633606102386879, + "grad_norm": 2.300466537475586, + "learning_rate": 8.439951576133162e-05, + "loss": 1.8922, + "step": 15731 + }, + { + "epoch": 0.5633964223682562, + "grad_norm": 1.6747887134552002, + "learning_rate": 8.438805886831042e-05, + "loss": 1.4242, + "step": 15732 + }, + { + "epoch": 0.5634322344978244, + "grad_norm": 1.5151009559631348, + "learning_rate": 8.437660218532777e-05, + "loss": 1.2717, + "step": 15733 + }, + { + "epoch": 0.5634680466273927, + "grad_norm": 1.482073187828064, + "learning_rate": 8.436514571253775e-05, + "loss": 1.4709, + "step": 15734 + }, + { + "epoch": 0.563503858756961, + "grad_norm": 1.9614146947860718, + "learning_rate": 8.435368945009456e-05, + "loss": 1.7183, + "step": 15735 + }, + { + "epoch": 0.5635396708865292, + "grad_norm": 1.8879051208496094, + "learning_rate": 8.434223339815229e-05, + "loss": 1.5138, + "step": 15736 + }, + { + "epoch": 0.5635754830160975, + "grad_norm": 1.917839527130127, + "learning_rate": 8.433077755686506e-05, + "loss": 1.5215, + "step": 15737 + }, + { + "epoch": 0.5636112951456659, + "grad_norm": 1.6038079261779785, + "learning_rate": 8.431932192638703e-05, + "loss": 1.4635, + "step": 15738 + }, + { + "epoch": 0.5636471072752341, + "grad_norm": 1.5450505018234253, + "learning_rate": 8.430786650687227e-05, + "loss": 1.6093, + "step": 15739 + }, + { + "epoch": 0.5636829194048024, + "grad_norm": 1.6022473573684692, + "learning_rate": 8.429641129847494e-05, + "loss": 1.2194, + "step": 15740 + }, + { + "epoch": 0.5637187315343707, + "grad_norm": 2.185281991958618, + "learning_rate": 8.428495630134912e-05, + "loss": 1.5651, + "step": 15741 + }, + { + "epoch": 0.563754543663939, + "grad_norm": 1.5093828439712524, + "learning_rate": 8.427350151564897e-05, + "loss": 1.4803, + "step": 15742 + }, + { + "epoch": 0.5637903557935072, + "grad_norm": 1.753310203552246, + "learning_rate": 8.426204694152855e-05, + "loss": 1.3101, + "step": 15743 + }, + { + "epoch": 0.5638261679230755, + "grad_norm": 1.8272547721862793, + "learning_rate": 8.425059257914201e-05, + "loss": 1.466, + "step": 15744 + }, + { + "epoch": 0.5638619800526439, + "grad_norm": 1.5649486780166626, + "learning_rate": 8.423913842864342e-05, + "loss": 1.433, + "step": 15745 + }, + { + "epoch": 0.5638977921822121, + "grad_norm": 1.796422004699707, + "learning_rate": 8.422768449018688e-05, + "loss": 1.2773, + "step": 15746 + }, + { + "epoch": 0.5639336043117804, + "grad_norm": 1.6080162525177002, + "learning_rate": 8.421623076392652e-05, + "loss": 1.522, + "step": 15747 + }, + { + "epoch": 0.5639694164413487, + "grad_norm": 1.624840259552002, + "learning_rate": 8.420477725001639e-05, + "loss": 1.7583, + "step": 15748 + }, + { + "epoch": 0.564005228570917, + "grad_norm": 1.7605829238891602, + "learning_rate": 8.419332394861064e-05, + "loss": 1.283, + "step": 15749 + }, + { + "epoch": 0.5640410407004852, + "grad_norm": 1.488240361213684, + "learning_rate": 8.418187085986329e-05, + "loss": 1.3897, + "step": 15750 + }, + { + "epoch": 0.5640768528300535, + "grad_norm": 1.4900915622711182, + "learning_rate": 8.41704179839285e-05, + "loss": 1.5596, + "step": 15751 + }, + { + "epoch": 0.5641126649596219, + "grad_norm": 1.8292967081069946, + "learning_rate": 8.415896532096034e-05, + "loss": 1.4501, + "step": 15752 + }, + { + "epoch": 0.5641484770891901, + "grad_norm": 1.3246054649353027, + "learning_rate": 8.41475128711128e-05, + "loss": 1.0903, + "step": 15753 + }, + { + "epoch": 0.5641842892187584, + "grad_norm": 1.4729738235473633, + "learning_rate": 8.413606063454008e-05, + "loss": 1.3507, + "step": 15754 + }, + { + "epoch": 0.5642201013483267, + "grad_norm": 1.7757606506347656, + "learning_rate": 8.412460861139615e-05, + "loss": 1.708, + "step": 15755 + }, + { + "epoch": 0.5642559134778949, + "grad_norm": 3.3102619647979736, + "learning_rate": 8.411315680183517e-05, + "loss": 1.4701, + "step": 15756 + }, + { + "epoch": 0.5642917256074632, + "grad_norm": 1.4510157108306885, + "learning_rate": 8.410170520601115e-05, + "loss": 1.3559, + "step": 15757 + }, + { + "epoch": 0.5643275377370315, + "grad_norm": 1.7538570165634155, + "learning_rate": 8.40902538240782e-05, + "loss": 1.2939, + "step": 15758 + }, + { + "epoch": 0.5643633498665999, + "grad_norm": 1.3784433603286743, + "learning_rate": 8.407880265619035e-05, + "loss": 1.5924, + "step": 15759 + }, + { + "epoch": 0.5643991619961681, + "grad_norm": 2.172971248626709, + "learning_rate": 8.406735170250168e-05, + "loss": 1.3409, + "step": 15760 + }, + { + "epoch": 0.5644349741257364, + "grad_norm": 2.134845733642578, + "learning_rate": 8.405590096316626e-05, + "loss": 1.2721, + "step": 15761 + }, + { + "epoch": 0.5644707862553047, + "grad_norm": 1.4905674457550049, + "learning_rate": 8.404445043833809e-05, + "loss": 1.4646, + "step": 15762 + }, + { + "epoch": 0.5645065983848729, + "grad_norm": 1.4775800704956055, + "learning_rate": 8.40330001281713e-05, + "loss": 1.5261, + "step": 15763 + }, + { + "epoch": 0.5645424105144412, + "grad_norm": 1.478419303894043, + "learning_rate": 8.402155003281984e-05, + "loss": 1.569, + "step": 15764 + }, + { + "epoch": 0.5645782226440095, + "grad_norm": 1.6768771409988403, + "learning_rate": 8.401010015243787e-05, + "loss": 1.6972, + "step": 15765 + }, + { + "epoch": 0.5646140347735779, + "grad_norm": 1.3239349126815796, + "learning_rate": 8.399865048717932e-05, + "loss": 1.4826, + "step": 15766 + }, + { + "epoch": 0.5646498469031461, + "grad_norm": 2.080510377883911, + "learning_rate": 8.398720103719836e-05, + "loss": 1.6596, + "step": 15767 + }, + { + "epoch": 0.5646856590327144, + "grad_norm": 1.5443332195281982, + "learning_rate": 8.397575180264887e-05, + "loss": 1.2538, + "step": 15768 + }, + { + "epoch": 0.5647214711622827, + "grad_norm": 1.3316525220870972, + "learning_rate": 8.396430278368503e-05, + "loss": 1.3754, + "step": 15769 + }, + { + "epoch": 0.5647572832918509, + "grad_norm": 2.1287786960601807, + "learning_rate": 8.395285398046084e-05, + "loss": 1.6287, + "step": 15770 + }, + { + "epoch": 0.5647930954214192, + "grad_norm": 1.7575713396072388, + "learning_rate": 8.394140539313021e-05, + "loss": 1.4228, + "step": 15771 + }, + { + "epoch": 0.5648289075509875, + "grad_norm": 1.9802348613739014, + "learning_rate": 8.392995702184734e-05, + "loss": 1.5655, + "step": 15772 + }, + { + "epoch": 0.5648647196805558, + "grad_norm": 1.6376569271087646, + "learning_rate": 8.391850886676609e-05, + "loss": 0.9234, + "step": 15773 + }, + { + "epoch": 0.5649005318101241, + "grad_norm": 1.4419960975646973, + "learning_rate": 8.390706092804064e-05, + "loss": 1.4967, + "step": 15774 + }, + { + "epoch": 0.5649363439396924, + "grad_norm": 1.8754823207855225, + "learning_rate": 8.389561320582486e-05, + "loss": 1.3997, + "step": 15775 + }, + { + "epoch": 0.5649721560692607, + "grad_norm": 1.3436617851257324, + "learning_rate": 8.388416570027289e-05, + "loss": 1.5209, + "step": 15776 + }, + { + "epoch": 0.5650079681988289, + "grad_norm": 1.7056835889816284, + "learning_rate": 8.38727184115386e-05, + "loss": 1.1724, + "step": 15777 + }, + { + "epoch": 0.5650437803283972, + "grad_norm": 1.2598458528518677, + "learning_rate": 8.386127133977617e-05, + "loss": 1.352, + "step": 15778 + }, + { + "epoch": 0.5650795924579655, + "grad_norm": 1.2932641506195068, + "learning_rate": 8.384982448513949e-05, + "loss": 1.3726, + "step": 15779 + }, + { + "epoch": 0.5651154045875338, + "grad_norm": 1.6122196912765503, + "learning_rate": 8.383837784778257e-05, + "loss": 1.218, + "step": 15780 + }, + { + "epoch": 0.5651512167171021, + "grad_norm": 1.5172953605651855, + "learning_rate": 8.382693142785945e-05, + "loss": 1.5963, + "step": 15781 + }, + { + "epoch": 0.5651870288466704, + "grad_norm": 1.681890606880188, + "learning_rate": 8.381548522552406e-05, + "loss": 1.5023, + "step": 15782 + }, + { + "epoch": 0.5652228409762386, + "grad_norm": 2.0911543369293213, + "learning_rate": 8.38040392409305e-05, + "loss": 1.6502, + "step": 15783 + }, + { + "epoch": 0.5652586531058069, + "grad_norm": 2.043477773666382, + "learning_rate": 8.379259347423265e-05, + "loss": 1.4353, + "step": 15784 + }, + { + "epoch": 0.5652944652353752, + "grad_norm": 1.461133360862732, + "learning_rate": 8.37811479255846e-05, + "loss": 1.5957, + "step": 15785 + }, + { + "epoch": 0.5653302773649435, + "grad_norm": 1.580897331237793, + "learning_rate": 8.376970259514023e-05, + "loss": 1.1663, + "step": 15786 + }, + { + "epoch": 0.5653660894945118, + "grad_norm": 1.5734902620315552, + "learning_rate": 8.375825748305364e-05, + "loss": 1.6092, + "step": 15787 + }, + { + "epoch": 0.5654019016240801, + "grad_norm": 1.7830482721328735, + "learning_rate": 8.37468125894787e-05, + "loss": 1.5706, + "step": 15788 + }, + { + "epoch": 0.5654377137536484, + "grad_norm": 2.396804094314575, + "learning_rate": 8.373536791456944e-05, + "loss": 1.6365, + "step": 15789 + }, + { + "epoch": 0.5654735258832166, + "grad_norm": 1.630237340927124, + "learning_rate": 8.372392345847983e-05, + "loss": 1.693, + "step": 15790 + }, + { + "epoch": 0.5655093380127849, + "grad_norm": 2.5660781860351562, + "learning_rate": 8.371247922136383e-05, + "loss": 1.2867, + "step": 15791 + }, + { + "epoch": 0.5655451501423532, + "grad_norm": 1.5164788961410522, + "learning_rate": 8.370103520337542e-05, + "loss": 1.3451, + "step": 15792 + }, + { + "epoch": 0.5655809622719215, + "grad_norm": 1.45986807346344, + "learning_rate": 8.368959140466853e-05, + "loss": 1.4041, + "step": 15793 + }, + { + "epoch": 0.5656167744014898, + "grad_norm": 1.8049870729446411, + "learning_rate": 8.367814782539718e-05, + "loss": 1.7531, + "step": 15794 + }, + { + "epoch": 0.5656525865310581, + "grad_norm": 2.0593514442443848, + "learning_rate": 8.366670446571525e-05, + "loss": 1.1339, + "step": 15795 + }, + { + "epoch": 0.5656883986606264, + "grad_norm": 1.956852912902832, + "learning_rate": 8.365526132577681e-05, + "loss": 1.4611, + "step": 15796 + }, + { + "epoch": 0.5657242107901946, + "grad_norm": 1.8688738346099854, + "learning_rate": 8.364381840573573e-05, + "loss": 1.3412, + "step": 15797 + }, + { + "epoch": 0.5657600229197629, + "grad_norm": 1.790224313735962, + "learning_rate": 8.363237570574595e-05, + "loss": 1.1733, + "step": 15798 + }, + { + "epoch": 0.5657958350493312, + "grad_norm": 1.8264939785003662, + "learning_rate": 8.362093322596145e-05, + "loss": 1.4683, + "step": 15799 + }, + { + "epoch": 0.5658316471788994, + "grad_norm": 1.8965346813201904, + "learning_rate": 8.360949096653616e-05, + "loss": 1.797, + "step": 15800 + }, + { + "epoch": 0.5658674593084678, + "grad_norm": 2.144782066345215, + "learning_rate": 8.359804892762405e-05, + "loss": 1.7707, + "step": 15801 + }, + { + "epoch": 0.5659032714380361, + "grad_norm": 3.100724697113037, + "learning_rate": 8.3586607109379e-05, + "loss": 1.3111, + "step": 15802 + }, + { + "epoch": 0.5659390835676044, + "grad_norm": 2.0397491455078125, + "learning_rate": 8.357516551195501e-05, + "loss": 1.4468, + "step": 15803 + }, + { + "epoch": 0.5659748956971726, + "grad_norm": 1.3980594873428345, + "learning_rate": 8.356372413550597e-05, + "loss": 1.3259, + "step": 15804 + }, + { + "epoch": 0.5660107078267409, + "grad_norm": 1.8807727098464966, + "learning_rate": 8.355228298018582e-05, + "loss": 1.7511, + "step": 15805 + }, + { + "epoch": 0.5660465199563092, + "grad_norm": 2.131071090698242, + "learning_rate": 8.354084204614851e-05, + "loss": 1.2344, + "step": 15806 + }, + { + "epoch": 0.5660823320858774, + "grad_norm": 1.6939911842346191, + "learning_rate": 8.35294013335479e-05, + "loss": 1.4542, + "step": 15807 + }, + { + "epoch": 0.5661181442154458, + "grad_norm": 1.5454076528549194, + "learning_rate": 8.351796084253797e-05, + "loss": 1.4836, + "step": 15808 + }, + { + "epoch": 0.5661539563450141, + "grad_norm": 1.7581214904785156, + "learning_rate": 8.350652057327261e-05, + "loss": 1.3919, + "step": 15809 + }, + { + "epoch": 0.5661897684745824, + "grad_norm": 1.616428017616272, + "learning_rate": 8.349508052590574e-05, + "loss": 1.4052, + "step": 15810 + }, + { + "epoch": 0.5662255806041506, + "grad_norm": 1.2832040786743164, + "learning_rate": 8.348364070059127e-05, + "loss": 1.1419, + "step": 15811 + }, + { + "epoch": 0.5662613927337189, + "grad_norm": 1.918565034866333, + "learning_rate": 8.347220109748312e-05, + "loss": 1.5448, + "step": 15812 + }, + { + "epoch": 0.5662972048632872, + "grad_norm": 2.462134838104248, + "learning_rate": 8.346076171673518e-05, + "loss": 1.3867, + "step": 15813 + }, + { + "epoch": 0.5663330169928554, + "grad_norm": 1.435020923614502, + "learning_rate": 8.344932255850136e-05, + "loss": 1.2917, + "step": 15814 + }, + { + "epoch": 0.5663688291224238, + "grad_norm": 1.4067466259002686, + "learning_rate": 8.343788362293556e-05, + "loss": 1.4601, + "step": 15815 + }, + { + "epoch": 0.5664046412519921, + "grad_norm": 1.6416200399398804, + "learning_rate": 8.342644491019165e-05, + "loss": 1.3699, + "step": 15816 + }, + { + "epoch": 0.5664404533815603, + "grad_norm": 1.66130793094635, + "learning_rate": 8.341500642042359e-05, + "loss": 1.3382, + "step": 15817 + }, + { + "epoch": 0.5664762655111286, + "grad_norm": 1.7099733352661133, + "learning_rate": 8.340356815378517e-05, + "loss": 1.6387, + "step": 15818 + }, + { + "epoch": 0.5665120776406969, + "grad_norm": 1.7127584218978882, + "learning_rate": 8.339213011043038e-05, + "loss": 1.3284, + "step": 15819 + }, + { + "epoch": 0.5665478897702652, + "grad_norm": 1.6124918460845947, + "learning_rate": 8.338069229051302e-05, + "loss": 1.4032, + "step": 15820 + }, + { + "epoch": 0.5665837018998334, + "grad_norm": 2.4808313846588135, + "learning_rate": 8.336925469418704e-05, + "loss": 1.5178, + "step": 15821 + }, + { + "epoch": 0.5666195140294018, + "grad_norm": 1.6432664394378662, + "learning_rate": 8.335781732160625e-05, + "loss": 1.5208, + "step": 15822 + }, + { + "epoch": 0.5666553261589701, + "grad_norm": 1.5675996541976929, + "learning_rate": 8.334638017292459e-05, + "loss": 1.5968, + "step": 15823 + }, + { + "epoch": 0.5666911382885383, + "grad_norm": 2.1726925373077393, + "learning_rate": 8.33349432482959e-05, + "loss": 1.3455, + "step": 15824 + }, + { + "epoch": 0.5667269504181066, + "grad_norm": 1.6445999145507812, + "learning_rate": 8.332350654787404e-05, + "loss": 1.6555, + "step": 15825 + }, + { + "epoch": 0.5667627625476749, + "grad_norm": 1.3497015237808228, + "learning_rate": 8.33120700718129e-05, + "loss": 1.2937, + "step": 15826 + }, + { + "epoch": 0.5667985746772431, + "grad_norm": 1.9332237243652344, + "learning_rate": 8.330063382026631e-05, + "loss": 1.7678, + "step": 15827 + }, + { + "epoch": 0.5668343868068114, + "grad_norm": 2.0545902252197266, + "learning_rate": 8.328919779338819e-05, + "loss": 1.0707, + "step": 15828 + }, + { + "epoch": 0.5668701989363798, + "grad_norm": 1.9223384857177734, + "learning_rate": 8.327776199133232e-05, + "loss": 1.0993, + "step": 15829 + }, + { + "epoch": 0.5669060110659481, + "grad_norm": 1.645604133605957, + "learning_rate": 8.326632641425261e-05, + "loss": 1.661, + "step": 15830 + }, + { + "epoch": 0.5669418231955163, + "grad_norm": 1.6659955978393555, + "learning_rate": 8.325489106230288e-05, + "loss": 1.2141, + "step": 15831 + }, + { + "epoch": 0.5669776353250846, + "grad_norm": 1.6164517402648926, + "learning_rate": 8.324345593563701e-05, + "loss": 1.312, + "step": 15832 + }, + { + "epoch": 0.5670134474546529, + "grad_norm": 2.98215913772583, + "learning_rate": 8.323202103440884e-05, + "loss": 1.5099, + "step": 15833 + }, + { + "epoch": 0.5670492595842211, + "grad_norm": 1.6239739656448364, + "learning_rate": 8.322058635877216e-05, + "loss": 1.3561, + "step": 15834 + }, + { + "epoch": 0.5670850717137894, + "grad_norm": 2.104954957962036, + "learning_rate": 8.320915190888087e-05, + "loss": 1.7427, + "step": 15835 + }, + { + "epoch": 0.5671208838433578, + "grad_norm": 1.366011619567871, + "learning_rate": 8.319771768488877e-05, + "loss": 1.4357, + "step": 15836 + }, + { + "epoch": 0.5671566959729261, + "grad_norm": 1.9174448251724243, + "learning_rate": 8.318628368694972e-05, + "loss": 1.3326, + "step": 15837 + }, + { + "epoch": 0.5671925081024943, + "grad_norm": 1.4196544885635376, + "learning_rate": 8.317484991521751e-05, + "loss": 1.4263, + "step": 15838 + }, + { + "epoch": 0.5672283202320626, + "grad_norm": 1.5104621648788452, + "learning_rate": 8.316341636984602e-05, + "loss": 1.535, + "step": 15839 + }, + { + "epoch": 0.5672641323616309, + "grad_norm": 1.5836697816848755, + "learning_rate": 8.315198305098902e-05, + "loss": 1.2274, + "step": 15840 + }, + { + "epoch": 0.5672999444911991, + "grad_norm": 2.0471384525299072, + "learning_rate": 8.314054995880036e-05, + "loss": 1.5095, + "step": 15841 + }, + { + "epoch": 0.5673357566207674, + "grad_norm": 1.560638427734375, + "learning_rate": 8.312911709343388e-05, + "loss": 1.417, + "step": 15842 + }, + { + "epoch": 0.5673715687503358, + "grad_norm": 1.8646968603134155, + "learning_rate": 8.311768445504333e-05, + "loss": 1.5827, + "step": 15843 + }, + { + "epoch": 0.567407380879904, + "grad_norm": 1.4655557870864868, + "learning_rate": 8.31062520437826e-05, + "loss": 1.2498, + "step": 15844 + }, + { + "epoch": 0.5674431930094723, + "grad_norm": 2.248730421066284, + "learning_rate": 8.309481985980541e-05, + "loss": 1.4068, + "step": 15845 + }, + { + "epoch": 0.5674790051390406, + "grad_norm": 2.3204147815704346, + "learning_rate": 8.308338790326565e-05, + "loss": 1.7721, + "step": 15846 + }, + { + "epoch": 0.5675148172686089, + "grad_norm": 1.9696449041366577, + "learning_rate": 8.307195617431707e-05, + "loss": 1.5491, + "step": 15847 + }, + { + "epoch": 0.5675506293981771, + "grad_norm": 1.883335828781128, + "learning_rate": 8.306052467311349e-05, + "loss": 1.4826, + "step": 15848 + }, + { + "epoch": 0.5675864415277454, + "grad_norm": 2.11018705368042, + "learning_rate": 8.304909339980873e-05, + "loss": 1.7215, + "step": 15849 + }, + { + "epoch": 0.5676222536573138, + "grad_norm": 1.548668384552002, + "learning_rate": 8.303766235455648e-05, + "loss": 1.3243, + "step": 15850 + }, + { + "epoch": 0.567658065786882, + "grad_norm": 1.4991328716278076, + "learning_rate": 8.302623153751068e-05, + "loss": 1.4878, + "step": 15851 + }, + { + "epoch": 0.5676938779164503, + "grad_norm": 1.4898539781570435, + "learning_rate": 8.301480094882497e-05, + "loss": 1.2673, + "step": 15852 + }, + { + "epoch": 0.5677296900460186, + "grad_norm": 1.7317248582839966, + "learning_rate": 8.300337058865323e-05, + "loss": 1.4307, + "step": 15853 + }, + { + "epoch": 0.5677655021755869, + "grad_norm": 1.6654188632965088, + "learning_rate": 8.299194045714921e-05, + "loss": 1.5343, + "step": 15854 + }, + { + "epoch": 0.5678013143051551, + "grad_norm": 1.7130259275436401, + "learning_rate": 8.298051055446673e-05, + "loss": 1.544, + "step": 15855 + }, + { + "epoch": 0.5678371264347234, + "grad_norm": 3.522223472595215, + "learning_rate": 8.296908088075949e-05, + "loss": 1.9023, + "step": 15856 + }, + { + "epoch": 0.5678729385642918, + "grad_norm": 2.1206347942352295, + "learning_rate": 8.295765143618131e-05, + "loss": 1.6537, + "step": 15857 + }, + { + "epoch": 0.56790875069386, + "grad_norm": 2.9285621643066406, + "learning_rate": 8.294622222088598e-05, + "loss": 1.6514, + "step": 15858 + }, + { + "epoch": 0.5679445628234283, + "grad_norm": 2.09845232963562, + "learning_rate": 8.293479323502716e-05, + "loss": 1.3896, + "step": 15859 + }, + { + "epoch": 0.5679803749529966, + "grad_norm": 1.8847618103027344, + "learning_rate": 8.292336447875876e-05, + "loss": 1.6824, + "step": 15860 + }, + { + "epoch": 0.5680161870825648, + "grad_norm": 3.045405149459839, + "learning_rate": 8.291193595223438e-05, + "loss": 1.5033, + "step": 15861 + }, + { + "epoch": 0.5680519992121331, + "grad_norm": 1.4553380012512207, + "learning_rate": 8.290050765560795e-05, + "loss": 1.4361, + "step": 15862 + }, + { + "epoch": 0.5680878113417014, + "grad_norm": 1.3813594579696655, + "learning_rate": 8.288907958903305e-05, + "loss": 1.7692, + "step": 15863 + }, + { + "epoch": 0.5681236234712698, + "grad_norm": 1.9366446733474731, + "learning_rate": 8.287765175266358e-05, + "loss": 1.3047, + "step": 15864 + }, + { + "epoch": 0.568159435600838, + "grad_norm": 2.792715549468994, + "learning_rate": 8.286622414665317e-05, + "loss": 2.0675, + "step": 15865 + }, + { + "epoch": 0.5681952477304063, + "grad_norm": 1.838062047958374, + "learning_rate": 8.285479677115563e-05, + "loss": 1.4573, + "step": 15866 + }, + { + "epoch": 0.5682310598599746, + "grad_norm": 1.3714712858200073, + "learning_rate": 8.284336962632473e-05, + "loss": 1.4702, + "step": 15867 + }, + { + "epoch": 0.5682668719895428, + "grad_norm": 1.2245217561721802, + "learning_rate": 8.283194271231408e-05, + "loss": 1.5559, + "step": 15868 + }, + { + "epoch": 0.5683026841191111, + "grad_norm": 1.866948127746582, + "learning_rate": 8.282051602927757e-05, + "loss": 1.5925, + "step": 15869 + }, + { + "epoch": 0.5683384962486794, + "grad_norm": 1.918073058128357, + "learning_rate": 8.28090895773688e-05, + "loss": 1.2218, + "step": 15870 + }, + { + "epoch": 0.5683743083782478, + "grad_norm": 2.7581140995025635, + "learning_rate": 8.27976633567416e-05, + "loss": 1.1811, + "step": 15871 + }, + { + "epoch": 0.568410120507816, + "grad_norm": 1.6794708967208862, + "learning_rate": 8.27862373675496e-05, + "loss": 1.3733, + "step": 15872 + }, + { + "epoch": 0.5684459326373843, + "grad_norm": 2.179267644882202, + "learning_rate": 8.277481160994663e-05, + "loss": 1.3167, + "step": 15873 + }, + { + "epoch": 0.5684817447669526, + "grad_norm": 1.5156314373016357, + "learning_rate": 8.276338608408627e-05, + "loss": 1.2546, + "step": 15874 + }, + { + "epoch": 0.5685175568965208, + "grad_norm": 1.3936090469360352, + "learning_rate": 8.27519607901224e-05, + "loss": 1.2427, + "step": 15875 + }, + { + "epoch": 0.5685533690260891, + "grad_norm": 2.1718878746032715, + "learning_rate": 8.274053572820862e-05, + "loss": 1.238, + "step": 15876 + }, + { + "epoch": 0.5685891811556574, + "grad_norm": 1.6128958463668823, + "learning_rate": 8.272911089849866e-05, + "loss": 1.6238, + "step": 15877 + }, + { + "epoch": 0.5686249932852258, + "grad_norm": 1.745404601097107, + "learning_rate": 8.271768630114624e-05, + "loss": 1.3826, + "step": 15878 + }, + { + "epoch": 0.568660805414794, + "grad_norm": 1.595452070236206, + "learning_rate": 8.270626193630503e-05, + "loss": 1.4595, + "step": 15879 + }, + { + "epoch": 0.5686966175443623, + "grad_norm": 1.5459754467010498, + "learning_rate": 8.269483780412883e-05, + "loss": 1.2893, + "step": 15880 + }, + { + "epoch": 0.5687324296739306, + "grad_norm": 1.7879676818847656, + "learning_rate": 8.268341390477118e-05, + "loss": 1.8261, + "step": 15881 + }, + { + "epoch": 0.5687682418034988, + "grad_norm": 1.4515905380249023, + "learning_rate": 8.267199023838593e-05, + "loss": 1.7552, + "step": 15882 + }, + { + "epoch": 0.5688040539330671, + "grad_norm": 1.9007455110549927, + "learning_rate": 8.266056680512664e-05, + "loss": 1.4626, + "step": 15883 + }, + { + "epoch": 0.5688398660626354, + "grad_norm": 1.7271119356155396, + "learning_rate": 8.26491436051471e-05, + "loss": 1.5642, + "step": 15884 + }, + { + "epoch": 0.5688756781922037, + "grad_norm": 1.5597244501113892, + "learning_rate": 8.263772063860096e-05, + "loss": 1.3979, + "step": 15885 + }, + { + "epoch": 0.568911490321772, + "grad_norm": 1.8739885091781616, + "learning_rate": 8.262629790564186e-05, + "loss": 1.6296, + "step": 15886 + }, + { + "epoch": 0.5689473024513403, + "grad_norm": 1.602601408958435, + "learning_rate": 8.261487540642353e-05, + "loss": 1.619, + "step": 15887 + }, + { + "epoch": 0.5689831145809086, + "grad_norm": 1.4145638942718506, + "learning_rate": 8.26034531410996e-05, + "loss": 1.5039, + "step": 15888 + }, + { + "epoch": 0.5690189267104768, + "grad_norm": 1.6475313901901245, + "learning_rate": 8.259203110982381e-05, + "loss": 1.2161, + "step": 15889 + }, + { + "epoch": 0.5690547388400451, + "grad_norm": 1.7352582216262817, + "learning_rate": 8.258060931274976e-05, + "loss": 1.41, + "step": 15890 + }, + { + "epoch": 0.5690905509696134, + "grad_norm": 1.9929988384246826, + "learning_rate": 8.256918775003115e-05, + "loss": 1.3584, + "step": 15891 + }, + { + "epoch": 0.5691263630991817, + "grad_norm": 1.258603811264038, + "learning_rate": 8.255776642182159e-05, + "loss": 1.276, + "step": 15892 + }, + { + "epoch": 0.56916217522875, + "grad_norm": 1.3469874858856201, + "learning_rate": 8.254634532827487e-05, + "loss": 1.661, + "step": 15893 + }, + { + "epoch": 0.5691979873583183, + "grad_norm": 1.4749658107757568, + "learning_rate": 8.253492446954452e-05, + "loss": 1.7263, + "step": 15894 + }, + { + "epoch": 0.5692337994878865, + "grad_norm": 1.9066383838653564, + "learning_rate": 8.252350384578421e-05, + "loss": 1.4895, + "step": 15895 + }, + { + "epoch": 0.5692696116174548, + "grad_norm": 2.15698504447937, + "learning_rate": 8.251208345714764e-05, + "loss": 1.444, + "step": 15896 + }, + { + "epoch": 0.5693054237470231, + "grad_norm": 1.5320926904678345, + "learning_rate": 8.25006633037884e-05, + "loss": 1.5376, + "step": 15897 + }, + { + "epoch": 0.5693412358765914, + "grad_norm": 1.3792502880096436, + "learning_rate": 8.24892433858602e-05, + "loss": 1.6099, + "step": 15898 + }, + { + "epoch": 0.5693770480061597, + "grad_norm": 1.6922613382339478, + "learning_rate": 8.247782370351663e-05, + "loss": 1.3959, + "step": 15899 + }, + { + "epoch": 0.569412860135728, + "grad_norm": 2.027216911315918, + "learning_rate": 8.246640425691133e-05, + "loss": 1.516, + "step": 15900 + }, + { + "epoch": 0.5694486722652963, + "grad_norm": 1.6750237941741943, + "learning_rate": 8.245498504619794e-05, + "loss": 1.1921, + "step": 15901 + }, + { + "epoch": 0.5694844843948645, + "grad_norm": 1.6753225326538086, + "learning_rate": 8.244356607153011e-05, + "loss": 1.5097, + "step": 15902 + }, + { + "epoch": 0.5695202965244328, + "grad_norm": 1.3187848329544067, + "learning_rate": 8.243214733306145e-05, + "loss": 1.4227, + "step": 15903 + }, + { + "epoch": 0.5695561086540011, + "grad_norm": 1.5426853895187378, + "learning_rate": 8.242072883094559e-05, + "loss": 1.5042, + "step": 15904 + }, + { + "epoch": 0.5695919207835693, + "grad_norm": 1.53534996509552, + "learning_rate": 8.240931056533615e-05, + "loss": 1.2341, + "step": 15905 + }, + { + "epoch": 0.5696277329131377, + "grad_norm": 1.2395790815353394, + "learning_rate": 8.239789253638672e-05, + "loss": 1.2779, + "step": 15906 + }, + { + "epoch": 0.569663545042706, + "grad_norm": 1.5523747205734253, + "learning_rate": 8.238647474425097e-05, + "loss": 1.6073, + "step": 15907 + }, + { + "epoch": 0.5696993571722743, + "grad_norm": 1.5684142112731934, + "learning_rate": 8.237505718908246e-05, + "loss": 1.5768, + "step": 15908 + }, + { + "epoch": 0.5697351693018425, + "grad_norm": 1.7088567018508911, + "learning_rate": 8.236363987103483e-05, + "loss": 1.4598, + "step": 15909 + }, + { + "epoch": 0.5697709814314108, + "grad_norm": 1.5773688554763794, + "learning_rate": 8.235222279026168e-05, + "loss": 1.2386, + "step": 15910 + }, + { + "epoch": 0.5698067935609791, + "grad_norm": 1.7507705688476562, + "learning_rate": 8.234080594691663e-05, + "loss": 1.5207, + "step": 15911 + }, + { + "epoch": 0.5698426056905473, + "grad_norm": 1.7203186750411987, + "learning_rate": 8.232938934115323e-05, + "loss": 1.472, + "step": 15912 + }, + { + "epoch": 0.5698784178201157, + "grad_norm": 1.6175166368484497, + "learning_rate": 8.231797297312509e-05, + "loss": 1.6451, + "step": 15913 + }, + { + "epoch": 0.569914229949684, + "grad_norm": 1.395760416984558, + "learning_rate": 8.230655684298585e-05, + "loss": 1.4501, + "step": 15914 + }, + { + "epoch": 0.5699500420792523, + "grad_norm": 1.8864084482192993, + "learning_rate": 8.229514095088903e-05, + "loss": 1.4636, + "step": 15915 + }, + { + "epoch": 0.5699858542088205, + "grad_norm": 1.705157995223999, + "learning_rate": 8.228372529698828e-05, + "loss": 1.5955, + "step": 15916 + }, + { + "epoch": 0.5700216663383888, + "grad_norm": 2.0784878730773926, + "learning_rate": 8.227230988143712e-05, + "loss": 1.3895, + "step": 15917 + }, + { + "epoch": 0.5700574784679571, + "grad_norm": 1.4656957387924194, + "learning_rate": 8.22608947043892e-05, + "loss": 1.4649, + "step": 15918 + }, + { + "epoch": 0.5700932905975253, + "grad_norm": 1.8664727210998535, + "learning_rate": 8.224947976599804e-05, + "loss": 1.6218, + "step": 15919 + }, + { + "epoch": 0.5701291027270937, + "grad_norm": 1.4809681177139282, + "learning_rate": 8.223806506641724e-05, + "loss": 1.4457, + "step": 15920 + }, + { + "epoch": 0.570164914856662, + "grad_norm": 1.9111181497573853, + "learning_rate": 8.222665060580038e-05, + "loss": 1.413, + "step": 15921 + }, + { + "epoch": 0.5702007269862303, + "grad_norm": 1.56305992603302, + "learning_rate": 8.221523638430098e-05, + "loss": 1.2293, + "step": 15922 + }, + { + "epoch": 0.5702365391157985, + "grad_norm": 1.4683891534805298, + "learning_rate": 8.220382240207266e-05, + "loss": 1.5771, + "step": 15923 + }, + { + "epoch": 0.5702723512453668, + "grad_norm": 1.7615280151367188, + "learning_rate": 8.219240865926892e-05, + "loss": 1.4823, + "step": 15924 + }, + { + "epoch": 0.5703081633749351, + "grad_norm": 2.0023019313812256, + "learning_rate": 8.218099515604339e-05, + "loss": 1.4878, + "step": 15925 + }, + { + "epoch": 0.5703439755045033, + "grad_norm": 1.7098430395126343, + "learning_rate": 8.216958189254956e-05, + "loss": 1.6951, + "step": 15926 + }, + { + "epoch": 0.5703797876340717, + "grad_norm": 1.6771968603134155, + "learning_rate": 8.215816886894102e-05, + "loss": 1.3032, + "step": 15927 + }, + { + "epoch": 0.57041559976364, + "grad_norm": 1.426419973373413, + "learning_rate": 8.214675608537128e-05, + "loss": 1.1594, + "step": 15928 + }, + { + "epoch": 0.5704514118932082, + "grad_norm": 2.378345012664795, + "learning_rate": 8.213534354199392e-05, + "loss": 1.9314, + "step": 15929 + }, + { + "epoch": 0.5704872240227765, + "grad_norm": 1.3041222095489502, + "learning_rate": 8.212393123896249e-05, + "loss": 1.5663, + "step": 15930 + }, + { + "epoch": 0.5705230361523448, + "grad_norm": 1.2427996397018433, + "learning_rate": 8.211251917643047e-05, + "loss": 1.4987, + "step": 15931 + }, + { + "epoch": 0.5705588482819131, + "grad_norm": 1.7306779623031616, + "learning_rate": 8.210110735455147e-05, + "loss": 1.4224, + "step": 15932 + }, + { + "epoch": 0.5705946604114813, + "grad_norm": 1.6303404569625854, + "learning_rate": 8.208969577347894e-05, + "loss": 1.443, + "step": 15933 + }, + { + "epoch": 0.5706304725410497, + "grad_norm": 1.9509263038635254, + "learning_rate": 8.207828443336649e-05, + "loss": 1.3303, + "step": 15934 + }, + { + "epoch": 0.570666284670618, + "grad_norm": 2.3029608726501465, + "learning_rate": 8.206687333436758e-05, + "loss": 1.4231, + "step": 15935 + }, + { + "epoch": 0.5707020968001862, + "grad_norm": 1.5229395627975464, + "learning_rate": 8.205546247663578e-05, + "loss": 1.52, + "step": 15936 + }, + { + "epoch": 0.5707379089297545, + "grad_norm": 2.0228278636932373, + "learning_rate": 8.204405186032455e-05, + "loss": 1.7603, + "step": 15937 + }, + { + "epoch": 0.5707737210593228, + "grad_norm": 1.6591273546218872, + "learning_rate": 8.203264148558749e-05, + "loss": 1.3386, + "step": 15938 + }, + { + "epoch": 0.570809533188891, + "grad_norm": 1.5935134887695312, + "learning_rate": 8.202123135257804e-05, + "loss": 1.514, + "step": 15939 + }, + { + "epoch": 0.5708453453184593, + "grad_norm": 1.514012098312378, + "learning_rate": 8.20098214614497e-05, + "loss": 1.6337, + "step": 15940 + }, + { + "epoch": 0.5708811574480277, + "grad_norm": 1.718827724456787, + "learning_rate": 8.199841181235606e-05, + "loss": 1.4344, + "step": 15941 + }, + { + "epoch": 0.570916969577596, + "grad_norm": 2.1700141429901123, + "learning_rate": 8.198700240545053e-05, + "loss": 1.3989, + "step": 15942 + }, + { + "epoch": 0.5709527817071642, + "grad_norm": 1.856092929840088, + "learning_rate": 8.197559324088666e-05, + "loss": 1.6977, + "step": 15943 + }, + { + "epoch": 0.5709885938367325, + "grad_norm": 1.462905764579773, + "learning_rate": 8.196418431881793e-05, + "loss": 1.6105, + "step": 15944 + }, + { + "epoch": 0.5710244059663008, + "grad_norm": 2.0627965927124023, + "learning_rate": 8.195277563939785e-05, + "loss": 1.6915, + "step": 15945 + }, + { + "epoch": 0.571060218095869, + "grad_norm": 1.770424246788025, + "learning_rate": 8.194136720277992e-05, + "loss": 1.4564, + "step": 15946 + }, + { + "epoch": 0.5710960302254373, + "grad_norm": 3.049556255340576, + "learning_rate": 8.192995900911751e-05, + "loss": 1.3256, + "step": 15947 + }, + { + "epoch": 0.5711318423550057, + "grad_norm": 1.7785724401474, + "learning_rate": 8.191855105856428e-05, + "loss": 1.4055, + "step": 15948 + }, + { + "epoch": 0.571167654484574, + "grad_norm": 1.4011263847351074, + "learning_rate": 8.190714335127356e-05, + "loss": 1.401, + "step": 15949 + }, + { + "epoch": 0.5712034666141422, + "grad_norm": 1.9322203397750854, + "learning_rate": 8.189573588739892e-05, + "loss": 1.5981, + "step": 15950 + }, + { + "epoch": 0.5712392787437105, + "grad_norm": 2.2392494678497314, + "learning_rate": 8.188432866709379e-05, + "loss": 1.5755, + "step": 15951 + }, + { + "epoch": 0.5712750908732788, + "grad_norm": 1.8817596435546875, + "learning_rate": 8.187292169051168e-05, + "loss": 1.4319, + "step": 15952 + }, + { + "epoch": 0.571310903002847, + "grad_norm": 1.4532921314239502, + "learning_rate": 8.186151495780598e-05, + "loss": 1.2006, + "step": 15953 + }, + { + "epoch": 0.5713467151324153, + "grad_norm": 2.218801975250244, + "learning_rate": 8.185010846913024e-05, + "loss": 1.4041, + "step": 15954 + }, + { + "epoch": 0.5713825272619837, + "grad_norm": 1.6853936910629272, + "learning_rate": 8.183870222463789e-05, + "loss": 1.5148, + "step": 15955 + }, + { + "epoch": 0.571418339391552, + "grad_norm": 1.7166179418563843, + "learning_rate": 8.182729622448231e-05, + "loss": 1.6018, + "step": 15956 + }, + { + "epoch": 0.5714541515211202, + "grad_norm": 1.7702536582946777, + "learning_rate": 8.181589046881709e-05, + "loss": 1.4854, + "step": 15957 + }, + { + "epoch": 0.5714899636506885, + "grad_norm": 1.2718796730041504, + "learning_rate": 8.180448495779554e-05, + "loss": 1.5774, + "step": 15958 + }, + { + "epoch": 0.5715257757802568, + "grad_norm": 1.6079905033111572, + "learning_rate": 8.179307969157123e-05, + "loss": 1.2911, + "step": 15959 + }, + { + "epoch": 0.571561587909825, + "grad_norm": 1.5168037414550781, + "learning_rate": 8.17816746702975e-05, + "loss": 1.6445, + "step": 15960 + }, + { + "epoch": 0.5715974000393933, + "grad_norm": 1.877971887588501, + "learning_rate": 8.177026989412789e-05, + "loss": 1.2533, + "step": 15961 + }, + { + "epoch": 0.5716332121689617, + "grad_norm": 1.4898743629455566, + "learning_rate": 8.175886536321574e-05, + "loss": 1.2705, + "step": 15962 + }, + { + "epoch": 0.57166902429853, + "grad_norm": 1.495103359222412, + "learning_rate": 8.174746107771454e-05, + "loss": 1.6275, + "step": 15963 + }, + { + "epoch": 0.5717048364280982, + "grad_norm": 1.7005457878112793, + "learning_rate": 8.173605703777774e-05, + "loss": 1.5007, + "step": 15964 + }, + { + "epoch": 0.5717406485576665, + "grad_norm": 1.619920253753662, + "learning_rate": 8.172465324355868e-05, + "loss": 1.3175, + "step": 15965 + }, + { + "epoch": 0.5717764606872348, + "grad_norm": 1.668201208114624, + "learning_rate": 8.171324969521089e-05, + "loss": 1.1773, + "step": 15966 + }, + { + "epoch": 0.571812272816803, + "grad_norm": 1.5060385465621948, + "learning_rate": 8.170184639288767e-05, + "loss": 1.5491, + "step": 15967 + }, + { + "epoch": 0.5718480849463713, + "grad_norm": 1.9399768114089966, + "learning_rate": 8.169044333674259e-05, + "loss": 1.7142, + "step": 15968 + }, + { + "epoch": 0.5718838970759397, + "grad_norm": 1.6110990047454834, + "learning_rate": 8.167904052692889e-05, + "loss": 1.3316, + "step": 15969 + }, + { + "epoch": 0.5719197092055079, + "grad_norm": 1.6544129848480225, + "learning_rate": 8.166763796360014e-05, + "loss": 1.2218, + "step": 15970 + }, + { + "epoch": 0.5719555213350762, + "grad_norm": 1.499854564666748, + "learning_rate": 8.165623564690961e-05, + "loss": 1.5932, + "step": 15971 + }, + { + "epoch": 0.5719913334646445, + "grad_norm": 1.706298589706421, + "learning_rate": 8.164483357701082e-05, + "loss": 1.4479, + "step": 15972 + }, + { + "epoch": 0.5720271455942127, + "grad_norm": 1.6135319471359253, + "learning_rate": 8.163343175405712e-05, + "loss": 1.3372, + "step": 15973 + }, + { + "epoch": 0.572062957723781, + "grad_norm": 1.8496217727661133, + "learning_rate": 8.162203017820186e-05, + "loss": 1.4587, + "step": 15974 + }, + { + "epoch": 0.5720987698533493, + "grad_norm": 1.8537300825119019, + "learning_rate": 8.161062884959852e-05, + "loss": 1.5282, + "step": 15975 + }, + { + "epoch": 0.5721345819829177, + "grad_norm": 1.6979246139526367, + "learning_rate": 8.159922776840039e-05, + "loss": 1.4608, + "step": 15976 + }, + { + "epoch": 0.5721703941124859, + "grad_norm": 1.7887334823608398, + "learning_rate": 8.158782693476099e-05, + "loss": 1.6018, + "step": 15977 + }, + { + "epoch": 0.5722062062420542, + "grad_norm": 1.8376210927963257, + "learning_rate": 8.157642634883355e-05, + "loss": 1.3104, + "step": 15978 + }, + { + "epoch": 0.5722420183716225, + "grad_norm": 1.831979751586914, + "learning_rate": 8.156502601077159e-05, + "loss": 1.363, + "step": 15979 + }, + { + "epoch": 0.5722778305011907, + "grad_norm": 1.3710800409317017, + "learning_rate": 8.155362592072837e-05, + "loss": 1.4114, + "step": 15980 + }, + { + "epoch": 0.572313642630759, + "grad_norm": 1.5481157302856445, + "learning_rate": 8.15422260788574e-05, + "loss": 1.4933, + "step": 15981 + }, + { + "epoch": 0.5723494547603273, + "grad_norm": 1.3802767992019653, + "learning_rate": 8.153082648531192e-05, + "loss": 1.6345, + "step": 15982 + }, + { + "epoch": 0.5723852668898957, + "grad_norm": 1.3365625143051147, + "learning_rate": 8.151942714024534e-05, + "loss": 1.3163, + "step": 15983 + }, + { + "epoch": 0.5724210790194639, + "grad_norm": 1.9832862615585327, + "learning_rate": 8.150802804381105e-05, + "loss": 1.4807, + "step": 15984 + }, + { + "epoch": 0.5724568911490322, + "grad_norm": 2.2987887859344482, + "learning_rate": 8.149662919616238e-05, + "loss": 1.6213, + "step": 15985 + }, + { + "epoch": 0.5724927032786005, + "grad_norm": 1.9295026063919067, + "learning_rate": 8.14852305974527e-05, + "loss": 1.5155, + "step": 15986 + }, + { + "epoch": 0.5725285154081687, + "grad_norm": 2.3797919750213623, + "learning_rate": 8.147383224783534e-05, + "loss": 1.4002, + "step": 15987 + }, + { + "epoch": 0.572564327537737, + "grad_norm": 1.760378360748291, + "learning_rate": 8.146243414746371e-05, + "loss": 1.4426, + "step": 15988 + }, + { + "epoch": 0.5726001396673053, + "grad_norm": 1.804693579673767, + "learning_rate": 8.145103629649104e-05, + "loss": 1.3964, + "step": 15989 + }, + { + "epoch": 0.5726359517968737, + "grad_norm": 2.0817153453826904, + "learning_rate": 8.143963869507085e-05, + "loss": 1.2892, + "step": 15990 + }, + { + "epoch": 0.5726717639264419, + "grad_norm": 2.2405829429626465, + "learning_rate": 8.142824134335633e-05, + "loss": 1.5307, + "step": 15991 + }, + { + "epoch": 0.5727075760560102, + "grad_norm": 1.2104196548461914, + "learning_rate": 8.141684424150087e-05, + "loss": 1.6489, + "step": 15992 + }, + { + "epoch": 0.5727433881855785, + "grad_norm": 1.529676079750061, + "learning_rate": 8.14054473896578e-05, + "loss": 1.5803, + "step": 15993 + }, + { + "epoch": 0.5727792003151467, + "grad_norm": 1.550414800643921, + "learning_rate": 8.139405078798044e-05, + "loss": 1.6136, + "step": 15994 + }, + { + "epoch": 0.572815012444715, + "grad_norm": 1.7423759698867798, + "learning_rate": 8.138265443662215e-05, + "loss": 1.6426, + "step": 15995 + }, + { + "epoch": 0.5728508245742833, + "grad_norm": 1.7016830444335938, + "learning_rate": 8.137125833573622e-05, + "loss": 1.4305, + "step": 15996 + }, + { + "epoch": 0.5728866367038516, + "grad_norm": 2.0163040161132812, + "learning_rate": 8.135986248547597e-05, + "loss": 1.7001, + "step": 15997 + }, + { + "epoch": 0.5729224488334199, + "grad_norm": 1.8785911798477173, + "learning_rate": 8.134846688599473e-05, + "loss": 1.4775, + "step": 15998 + }, + { + "epoch": 0.5729582609629882, + "grad_norm": 1.5626881122589111, + "learning_rate": 8.133707153744582e-05, + "loss": 1.2578, + "step": 15999 + }, + { + "epoch": 0.5729940730925565, + "grad_norm": 1.6098331212997437, + "learning_rate": 8.132567643998254e-05, + "loss": 1.4393, + "step": 16000 + }, + { + "epoch": 0.5730298852221247, + "grad_norm": 1.5904412269592285, + "learning_rate": 8.131428159375817e-05, + "loss": 1.4826, + "step": 16001 + }, + { + "epoch": 0.573065697351693, + "grad_norm": 2.667109489440918, + "learning_rate": 8.130288699892608e-05, + "loss": 1.6257, + "step": 16002 + }, + { + "epoch": 0.5731015094812613, + "grad_norm": 1.7234574556350708, + "learning_rate": 8.129149265563947e-05, + "loss": 1.4741, + "step": 16003 + }, + { + "epoch": 0.5731373216108296, + "grad_norm": 2.2013614177703857, + "learning_rate": 8.128009856405174e-05, + "loss": 1.6424, + "step": 16004 + }, + { + "epoch": 0.5731731337403979, + "grad_norm": 1.7291687726974487, + "learning_rate": 8.126870472431613e-05, + "loss": 1.4264, + "step": 16005 + }, + { + "epoch": 0.5732089458699662, + "grad_norm": 1.8733566999435425, + "learning_rate": 8.125731113658594e-05, + "loss": 1.3412, + "step": 16006 + }, + { + "epoch": 0.5732447579995344, + "grad_norm": 1.6369208097457886, + "learning_rate": 8.124591780101443e-05, + "loss": 1.4097, + "step": 16007 + }, + { + "epoch": 0.5732805701291027, + "grad_norm": 1.460530400276184, + "learning_rate": 8.123452471775493e-05, + "loss": 1.4468, + "step": 16008 + }, + { + "epoch": 0.573316382258671, + "grad_norm": 2.263429880142212, + "learning_rate": 8.122313188696068e-05, + "loss": 1.2833, + "step": 16009 + }, + { + "epoch": 0.5733521943882393, + "grad_norm": 1.4812617301940918, + "learning_rate": 8.121173930878496e-05, + "loss": 1.3627, + "step": 16010 + }, + { + "epoch": 0.5733880065178076, + "grad_norm": 1.905060052871704, + "learning_rate": 8.120034698338108e-05, + "loss": 1.6278, + "step": 16011 + }, + { + "epoch": 0.5734238186473759, + "grad_norm": 1.6872472763061523, + "learning_rate": 8.118895491090225e-05, + "loss": 1.425, + "step": 16012 + }, + { + "epoch": 0.5734596307769442, + "grad_norm": 1.7046935558319092, + "learning_rate": 8.11775630915018e-05, + "loss": 1.628, + "step": 16013 + }, + { + "epoch": 0.5734954429065124, + "grad_norm": 1.5307683944702148, + "learning_rate": 8.116617152533292e-05, + "loss": 1.6749, + "step": 16014 + }, + { + "epoch": 0.5735312550360807, + "grad_norm": 1.8192733526229858, + "learning_rate": 8.115478021254895e-05, + "loss": 1.5034, + "step": 16015 + }, + { + "epoch": 0.573567067165649, + "grad_norm": 1.5006341934204102, + "learning_rate": 8.114338915330307e-05, + "loss": 1.0562, + "step": 16016 + }, + { + "epoch": 0.5736028792952172, + "grad_norm": 1.6144181489944458, + "learning_rate": 8.113199834774858e-05, + "loss": 1.4468, + "step": 16017 + }, + { + "epoch": 0.5736386914247856, + "grad_norm": 1.690004825592041, + "learning_rate": 8.112060779603873e-05, + "loss": 1.3783, + "step": 16018 + }, + { + "epoch": 0.5736745035543539, + "grad_norm": 1.5593258142471313, + "learning_rate": 8.110921749832672e-05, + "loss": 1.6404, + "step": 16019 + }, + { + "epoch": 0.5737103156839222, + "grad_norm": 2.3472657203674316, + "learning_rate": 8.109782745476585e-05, + "loss": 1.8699, + "step": 16020 + }, + { + "epoch": 0.5737461278134904, + "grad_norm": 1.9007211923599243, + "learning_rate": 8.108643766550929e-05, + "loss": 1.3917, + "step": 16021 + }, + { + "epoch": 0.5737819399430587, + "grad_norm": 1.5271756649017334, + "learning_rate": 8.107504813071036e-05, + "loss": 1.498, + "step": 16022 + }, + { + "epoch": 0.573817752072627, + "grad_norm": 1.8045490980148315, + "learning_rate": 8.10636588505222e-05, + "loss": 1.4419, + "step": 16023 + }, + { + "epoch": 0.5738535642021952, + "grad_norm": 1.7356674671173096, + "learning_rate": 8.105226982509812e-05, + "loss": 1.7494, + "step": 16024 + }, + { + "epoch": 0.5738893763317636, + "grad_norm": 1.2682982683181763, + "learning_rate": 8.10408810545913e-05, + "loss": 1.3909, + "step": 16025 + }, + { + "epoch": 0.5739251884613319, + "grad_norm": 1.9603110551834106, + "learning_rate": 8.102949253915497e-05, + "loss": 1.4457, + "step": 16026 + }, + { + "epoch": 0.5739610005909002, + "grad_norm": 1.7176648378372192, + "learning_rate": 8.101810427894236e-05, + "loss": 1.6139, + "step": 16027 + }, + { + "epoch": 0.5739968127204684, + "grad_norm": 1.5585652589797974, + "learning_rate": 8.100671627410664e-05, + "loss": 1.4038, + "step": 16028 + }, + { + "epoch": 0.5740326248500367, + "grad_norm": 1.3921308517456055, + "learning_rate": 8.099532852480108e-05, + "loss": 1.4396, + "step": 16029 + }, + { + "epoch": 0.574068436979605, + "grad_norm": 1.6771459579467773, + "learning_rate": 8.098394103117885e-05, + "loss": 1.4582, + "step": 16030 + }, + { + "epoch": 0.5741042491091732, + "grad_norm": 1.6584618091583252, + "learning_rate": 8.097255379339317e-05, + "loss": 1.3684, + "step": 16031 + }, + { + "epoch": 0.5741400612387416, + "grad_norm": 1.7140179872512817, + "learning_rate": 8.096116681159722e-05, + "loss": 1.2808, + "step": 16032 + }, + { + "epoch": 0.5741758733683099, + "grad_norm": 1.8170238733291626, + "learning_rate": 8.094978008594423e-05, + "loss": 1.5748, + "step": 16033 + }, + { + "epoch": 0.5742116854978782, + "grad_norm": 1.9986485242843628, + "learning_rate": 8.093839361658735e-05, + "loss": 1.4221, + "step": 16034 + }, + { + "epoch": 0.5742474976274464, + "grad_norm": 1.4016669988632202, + "learning_rate": 8.092700740367983e-05, + "loss": 1.4064, + "step": 16035 + }, + { + "epoch": 0.5742833097570147, + "grad_norm": 1.619900107383728, + "learning_rate": 8.091562144737481e-05, + "loss": 1.6672, + "step": 16036 + }, + { + "epoch": 0.574319121886583, + "grad_norm": 1.6384003162384033, + "learning_rate": 8.090423574782549e-05, + "loss": 1.5615, + "step": 16037 + }, + { + "epoch": 0.5743549340161512, + "grad_norm": 1.4524556398391724, + "learning_rate": 8.089285030518504e-05, + "loss": 1.5234, + "step": 16038 + }, + { + "epoch": 0.5743907461457196, + "grad_norm": 1.6690996885299683, + "learning_rate": 8.088146511960663e-05, + "loss": 1.8467, + "step": 16039 + }, + { + "epoch": 0.5744265582752879, + "grad_norm": 1.6018950939178467, + "learning_rate": 8.087008019124347e-05, + "loss": 1.1323, + "step": 16040 + }, + { + "epoch": 0.5744623704048561, + "grad_norm": 1.966084599494934, + "learning_rate": 8.085869552024869e-05, + "loss": 1.4668, + "step": 16041 + }, + { + "epoch": 0.5744981825344244, + "grad_norm": 1.7654863595962524, + "learning_rate": 8.084731110677548e-05, + "loss": 1.4355, + "step": 16042 + }, + { + "epoch": 0.5745339946639927, + "grad_norm": 1.6190152168273926, + "learning_rate": 8.083592695097702e-05, + "loss": 1.531, + "step": 16043 + }, + { + "epoch": 0.574569806793561, + "grad_norm": 1.6661125421524048, + "learning_rate": 8.082454305300637e-05, + "loss": 1.3279, + "step": 16044 + }, + { + "epoch": 0.5746056189231292, + "grad_norm": 1.7820805311203003, + "learning_rate": 8.081315941301683e-05, + "loss": 1.6347, + "step": 16045 + }, + { + "epoch": 0.5746414310526975, + "grad_norm": 1.4383074045181274, + "learning_rate": 8.080177603116142e-05, + "loss": 1.7054, + "step": 16046 + }, + { + "epoch": 0.5746772431822659, + "grad_norm": 1.4362151622772217, + "learning_rate": 8.079039290759341e-05, + "loss": 1.6129, + "step": 16047 + }, + { + "epoch": 0.5747130553118341, + "grad_norm": 1.533974051475525, + "learning_rate": 8.077901004246584e-05, + "loss": 1.7049, + "step": 16048 + }, + { + "epoch": 0.5747488674414024, + "grad_norm": 1.90034019947052, + "learning_rate": 8.076762743593191e-05, + "loss": 1.4234, + "step": 16049 + }, + { + "epoch": 0.5747846795709707, + "grad_norm": 1.5940901041030884, + "learning_rate": 8.075624508814474e-05, + "loss": 1.4504, + "step": 16050 + }, + { + "epoch": 0.574820491700539, + "grad_norm": 1.7032641172409058, + "learning_rate": 8.074486299925749e-05, + "loss": 1.4603, + "step": 16051 + }, + { + "epoch": 0.5748563038301072, + "grad_norm": 1.4532485008239746, + "learning_rate": 8.073348116942329e-05, + "loss": 1.7031, + "step": 16052 + }, + { + "epoch": 0.5748921159596755, + "grad_norm": 2.4565727710723877, + "learning_rate": 8.072209959879517e-05, + "loss": 1.1419, + "step": 16053 + }, + { + "epoch": 0.5749279280892439, + "grad_norm": 1.4893320798873901, + "learning_rate": 8.071071828752643e-05, + "loss": 1.2288, + "step": 16054 + }, + { + "epoch": 0.5749637402188121, + "grad_norm": 1.7563259601593018, + "learning_rate": 8.069933723577e-05, + "loss": 1.6753, + "step": 16055 + }, + { + "epoch": 0.5749995523483804, + "grad_norm": 2.3505520820617676, + "learning_rate": 8.068795644367918e-05, + "loss": 1.5375, + "step": 16056 + }, + { + "epoch": 0.5750353644779487, + "grad_norm": 1.5410603284835815, + "learning_rate": 8.06765759114069e-05, + "loss": 1.742, + "step": 16057 + }, + { + "epoch": 0.5750711766075169, + "grad_norm": 1.543412446975708, + "learning_rate": 8.066519563910645e-05, + "loss": 1.3408, + "step": 16058 + }, + { + "epoch": 0.5751069887370852, + "grad_norm": 2.5051183700561523, + "learning_rate": 8.065381562693078e-05, + "loss": 1.6021, + "step": 16059 + }, + { + "epoch": 0.5751428008666535, + "grad_norm": 2.775167942047119, + "learning_rate": 8.064243587503313e-05, + "loss": 1.4534, + "step": 16060 + }, + { + "epoch": 0.5751786129962219, + "grad_norm": 1.8102688789367676, + "learning_rate": 8.063105638356654e-05, + "loss": 1.2111, + "step": 16061 + }, + { + "epoch": 0.5752144251257901, + "grad_norm": 2.6372623443603516, + "learning_rate": 8.061967715268403e-05, + "loss": 1.7916, + "step": 16062 + }, + { + "epoch": 0.5752502372553584, + "grad_norm": 1.2923922538757324, + "learning_rate": 8.060829818253884e-05, + "loss": 1.3766, + "step": 16063 + }, + { + "epoch": 0.5752860493849267, + "grad_norm": 1.690511703491211, + "learning_rate": 8.059691947328391e-05, + "loss": 1.5836, + "step": 16064 + }, + { + "epoch": 0.5753218615144949, + "grad_norm": 1.6254609823226929, + "learning_rate": 8.058554102507248e-05, + "loss": 1.6672, + "step": 16065 + }, + { + "epoch": 0.5753576736440632, + "grad_norm": 1.411546230316162, + "learning_rate": 8.057416283805748e-05, + "loss": 1.4695, + "step": 16066 + }, + { + "epoch": 0.5753934857736315, + "grad_norm": 1.4763381481170654, + "learning_rate": 8.056278491239213e-05, + "loss": 1.1461, + "step": 16067 + }, + { + "epoch": 0.5754292979031999, + "grad_norm": 2.3086869716644287, + "learning_rate": 8.055140724822938e-05, + "loss": 1.614, + "step": 16068 + }, + { + "epoch": 0.5754651100327681, + "grad_norm": 1.1650549173355103, + "learning_rate": 8.054002984572241e-05, + "loss": 1.2589, + "step": 16069 + }, + { + "epoch": 0.5755009221623364, + "grad_norm": 1.5132914781570435, + "learning_rate": 8.052865270502422e-05, + "loss": 1.4143, + "step": 16070 + }, + { + "epoch": 0.5755367342919047, + "grad_norm": 1.7870749235153198, + "learning_rate": 8.051727582628788e-05, + "loss": 1.3014, + "step": 16071 + }, + { + "epoch": 0.5755725464214729, + "grad_norm": 1.9012000560760498, + "learning_rate": 8.050589920966647e-05, + "loss": 1.5146, + "step": 16072 + }, + { + "epoch": 0.5756083585510412, + "grad_norm": 1.918483853340149, + "learning_rate": 8.049452285531302e-05, + "loss": 1.6011, + "step": 16073 + }, + { + "epoch": 0.5756441706806095, + "grad_norm": 1.824968695640564, + "learning_rate": 8.048314676338062e-05, + "loss": 1.629, + "step": 16074 + }, + { + "epoch": 0.5756799828101778, + "grad_norm": 1.566021203994751, + "learning_rate": 8.047177093402228e-05, + "loss": 1.4753, + "step": 16075 + }, + { + "epoch": 0.5757157949397461, + "grad_norm": 1.6824095249176025, + "learning_rate": 8.046039536739111e-05, + "loss": 1.3666, + "step": 16076 + }, + { + "epoch": 0.5757516070693144, + "grad_norm": 2.086169958114624, + "learning_rate": 8.044902006364008e-05, + "loss": 1.6941, + "step": 16077 + }, + { + "epoch": 0.5757874191988827, + "grad_norm": 1.4665168523788452, + "learning_rate": 8.043764502292232e-05, + "loss": 1.5978, + "step": 16078 + }, + { + "epoch": 0.5758232313284509, + "grad_norm": 1.646644949913025, + "learning_rate": 8.04262702453908e-05, + "loss": 1.5653, + "step": 16079 + }, + { + "epoch": 0.5758590434580192, + "grad_norm": 1.9247905015945435, + "learning_rate": 8.041489573119853e-05, + "loss": 1.3912, + "step": 16080 + }, + { + "epoch": 0.5758948555875875, + "grad_norm": 1.658804178237915, + "learning_rate": 8.04035214804986e-05, + "loss": 1.3743, + "step": 16081 + }, + { + "epoch": 0.5759306677171558, + "grad_norm": 1.8746556043624878, + "learning_rate": 8.0392147493444e-05, + "loss": 1.3236, + "step": 16082 + }, + { + "epoch": 0.5759664798467241, + "grad_norm": 1.518371343612671, + "learning_rate": 8.038077377018776e-05, + "loss": 1.2282, + "step": 16083 + }, + { + "epoch": 0.5760022919762924, + "grad_norm": 2.3258750438690186, + "learning_rate": 8.03694003108829e-05, + "loss": 1.6595, + "step": 16084 + }, + { + "epoch": 0.5760381041058606, + "grad_norm": 2.2161455154418945, + "learning_rate": 8.035802711568245e-05, + "loss": 1.6238, + "step": 16085 + }, + { + "epoch": 0.5760739162354289, + "grad_norm": 1.8260993957519531, + "learning_rate": 8.03466541847394e-05, + "loss": 1.3711, + "step": 16086 + }, + { + "epoch": 0.5761097283649972, + "grad_norm": 1.9408621788024902, + "learning_rate": 8.033528151820679e-05, + "loss": 1.7465, + "step": 16087 + }, + { + "epoch": 0.5761455404945655, + "grad_norm": 1.375427007675171, + "learning_rate": 8.032390911623758e-05, + "loss": 1.6076, + "step": 16088 + }, + { + "epoch": 0.5761813526241338, + "grad_norm": 1.5959179401397705, + "learning_rate": 8.031253697898478e-05, + "loss": 1.5481, + "step": 16089 + }, + { + "epoch": 0.5762171647537021, + "grad_norm": 1.694019079208374, + "learning_rate": 8.030116510660143e-05, + "loss": 1.0399, + "step": 16090 + }, + { + "epoch": 0.5762529768832704, + "grad_norm": 1.893786907196045, + "learning_rate": 8.028979349924048e-05, + "loss": 1.616, + "step": 16091 + }, + { + "epoch": 0.5762887890128386, + "grad_norm": 1.5224804878234863, + "learning_rate": 8.027842215705494e-05, + "loss": 1.2418, + "step": 16092 + }, + { + "epoch": 0.5763246011424069, + "grad_norm": 1.8478490114212036, + "learning_rate": 8.026705108019777e-05, + "loss": 1.4941, + "step": 16093 + }, + { + "epoch": 0.5763604132719752, + "grad_norm": 1.8832570314407349, + "learning_rate": 8.0255680268822e-05, + "loss": 1.2081, + "step": 16094 + }, + { + "epoch": 0.5763962254015434, + "grad_norm": 1.4357125759124756, + "learning_rate": 8.024430972308056e-05, + "loss": 1.2784, + "step": 16095 + }, + { + "epoch": 0.5764320375311118, + "grad_norm": 1.4434853792190552, + "learning_rate": 8.023293944312647e-05, + "loss": 1.605, + "step": 16096 + }, + { + "epoch": 0.5764678496606801, + "grad_norm": 1.8244801759719849, + "learning_rate": 8.022156942911267e-05, + "loss": 1.3657, + "step": 16097 + }, + { + "epoch": 0.5765036617902484, + "grad_norm": 1.4665894508361816, + "learning_rate": 8.021019968119215e-05, + "loss": 1.6977, + "step": 16098 + }, + { + "epoch": 0.5765394739198166, + "grad_norm": 1.9285708665847778, + "learning_rate": 8.019883019951786e-05, + "loss": 1.2284, + "step": 16099 + }, + { + "epoch": 0.5765752860493849, + "grad_norm": 2.212251663208008, + "learning_rate": 8.018746098424276e-05, + "loss": 1.763, + "step": 16100 + }, + { + "epoch": 0.5766110981789532, + "grad_norm": 1.7550066709518433, + "learning_rate": 8.017609203551983e-05, + "loss": 1.4755, + "step": 16101 + }, + { + "epoch": 0.5766469103085214, + "grad_norm": 1.8005722761154175, + "learning_rate": 8.0164723353502e-05, + "loss": 1.434, + "step": 16102 + }, + { + "epoch": 0.5766827224380898, + "grad_norm": 1.9695098400115967, + "learning_rate": 8.015335493834224e-05, + "loss": 1.6995, + "step": 16103 + }, + { + "epoch": 0.5767185345676581, + "grad_norm": 1.847440481185913, + "learning_rate": 8.014198679019348e-05, + "loss": 1.7125, + "step": 16104 + }, + { + "epoch": 0.5767543466972264, + "grad_norm": 1.2874605655670166, + "learning_rate": 8.01306189092087e-05, + "loss": 1.4815, + "step": 16105 + }, + { + "epoch": 0.5767901588267946, + "grad_norm": 1.833154559135437, + "learning_rate": 8.011925129554078e-05, + "loss": 1.2951, + "step": 16106 + }, + { + "epoch": 0.5768259709563629, + "grad_norm": 1.9240434169769287, + "learning_rate": 8.01078839493427e-05, + "loss": 1.1895, + "step": 16107 + }, + { + "epoch": 0.5768617830859312, + "grad_norm": 1.921027421951294, + "learning_rate": 8.009651687076739e-05, + "loss": 1.4075, + "step": 16108 + }, + { + "epoch": 0.5768975952154994, + "grad_norm": 1.5504812002182007, + "learning_rate": 8.008515005996775e-05, + "loss": 1.2061, + "step": 16109 + }, + { + "epoch": 0.5769334073450678, + "grad_norm": 1.6226364374160767, + "learning_rate": 8.007378351709676e-05, + "loss": 1.6403, + "step": 16110 + }, + { + "epoch": 0.5769692194746361, + "grad_norm": 1.543958067893982, + "learning_rate": 8.006241724230728e-05, + "loss": 1.6479, + "step": 16111 + }, + { + "epoch": 0.5770050316042044, + "grad_norm": 1.6312440633773804, + "learning_rate": 8.005105123575228e-05, + "loss": 1.4381, + "step": 16112 + }, + { + "epoch": 0.5770408437337726, + "grad_norm": 1.547906517982483, + "learning_rate": 8.003968549758462e-05, + "loss": 1.6837, + "step": 16113 + }, + { + "epoch": 0.5770766558633409, + "grad_norm": 2.2383906841278076, + "learning_rate": 8.002832002795729e-05, + "loss": 0.9843, + "step": 16114 + }, + { + "epoch": 0.5771124679929092, + "grad_norm": 1.4814233779907227, + "learning_rate": 8.001695482702314e-05, + "loss": 1.3434, + "step": 16115 + }, + { + "epoch": 0.5771482801224774, + "grad_norm": 1.837546706199646, + "learning_rate": 8.000558989493507e-05, + "loss": 1.544, + "step": 16116 + }, + { + "epoch": 0.5771840922520458, + "grad_norm": 1.2523295879364014, + "learning_rate": 7.999422523184601e-05, + "loss": 1.287, + "step": 16117 + }, + { + "epoch": 0.5772199043816141, + "grad_norm": 1.5280488729476929, + "learning_rate": 7.998286083790883e-05, + "loss": 1.323, + "step": 16118 + }, + { + "epoch": 0.5772557165111823, + "grad_norm": 1.6794685125350952, + "learning_rate": 7.997149671327646e-05, + "loss": 1.4495, + "step": 16119 + }, + { + "epoch": 0.5772915286407506, + "grad_norm": 2.4332382678985596, + "learning_rate": 7.996013285810173e-05, + "loss": 1.5304, + "step": 16120 + }, + { + "epoch": 0.5773273407703189, + "grad_norm": 1.4107089042663574, + "learning_rate": 7.99487692725376e-05, + "loss": 1.2941, + "step": 16121 + }, + { + "epoch": 0.5773631528998872, + "grad_norm": 1.6227834224700928, + "learning_rate": 7.993740595673689e-05, + "loss": 1.3497, + "step": 16122 + }, + { + "epoch": 0.5773989650294554, + "grad_norm": 1.5155694484710693, + "learning_rate": 7.992604291085253e-05, + "loss": 1.6335, + "step": 16123 + }, + { + "epoch": 0.5774347771590238, + "grad_norm": 2.1160717010498047, + "learning_rate": 7.991468013503735e-05, + "loss": 1.6124, + "step": 16124 + }, + { + "epoch": 0.5774705892885921, + "grad_norm": 1.6966854333877563, + "learning_rate": 7.990331762944426e-05, + "loss": 1.4299, + "step": 16125 + }, + { + "epoch": 0.5775064014181603, + "grad_norm": 1.8344290256500244, + "learning_rate": 7.989195539422609e-05, + "loss": 1.5318, + "step": 16126 + }, + { + "epoch": 0.5775422135477286, + "grad_norm": 2.1543333530426025, + "learning_rate": 7.988059342953571e-05, + "loss": 1.7969, + "step": 16127 + }, + { + "epoch": 0.5775780256772969, + "grad_norm": 1.9290580749511719, + "learning_rate": 7.986923173552602e-05, + "loss": 1.7385, + "step": 16128 + }, + { + "epoch": 0.5776138378068651, + "grad_norm": 1.6262422800064087, + "learning_rate": 7.985787031234983e-05, + "loss": 1.4024, + "step": 16129 + }, + { + "epoch": 0.5776496499364334, + "grad_norm": 1.9909801483154297, + "learning_rate": 7.984650916016003e-05, + "loss": 1.3964, + "step": 16130 + }, + { + "epoch": 0.5776854620660018, + "grad_norm": 1.3821978569030762, + "learning_rate": 7.983514827910943e-05, + "loss": 1.3624, + "step": 16131 + }, + { + "epoch": 0.5777212741955701, + "grad_norm": 1.4430720806121826, + "learning_rate": 7.982378766935092e-05, + "loss": 1.6197, + "step": 16132 + }, + { + "epoch": 0.5777570863251383, + "grad_norm": 1.3576154708862305, + "learning_rate": 7.981242733103734e-05, + "loss": 1.4386, + "step": 16133 + }, + { + "epoch": 0.5777928984547066, + "grad_norm": 2.1250572204589844, + "learning_rate": 7.980106726432148e-05, + "loss": 1.2357, + "step": 16134 + }, + { + "epoch": 0.5778287105842749, + "grad_norm": 1.6728894710540771, + "learning_rate": 7.978970746935621e-05, + "loss": 1.5269, + "step": 16135 + }, + { + "epoch": 0.5778645227138431, + "grad_norm": 2.4180312156677246, + "learning_rate": 7.977834794629436e-05, + "loss": 1.5775, + "step": 16136 + }, + { + "epoch": 0.5779003348434114, + "grad_norm": 1.6585233211517334, + "learning_rate": 7.976698869528876e-05, + "loss": 1.5318, + "step": 16137 + }, + { + "epoch": 0.5779361469729798, + "grad_norm": 1.3715888261795044, + "learning_rate": 7.975562971649219e-05, + "loss": 1.5977, + "step": 16138 + }, + { + "epoch": 0.5779719591025481, + "grad_norm": 1.8172003030776978, + "learning_rate": 7.974427101005756e-05, + "loss": 1.3096, + "step": 16139 + }, + { + "epoch": 0.5780077712321163, + "grad_norm": 1.5655006170272827, + "learning_rate": 7.973291257613761e-05, + "loss": 1.4269, + "step": 16140 + }, + { + "epoch": 0.5780435833616846, + "grad_norm": 1.313693881034851, + "learning_rate": 7.97215544148852e-05, + "loss": 1.3408, + "step": 16141 + }, + { + "epoch": 0.5780793954912529, + "grad_norm": 1.644073486328125, + "learning_rate": 7.971019652645313e-05, + "loss": 1.4267, + "step": 16142 + }, + { + "epoch": 0.5781152076208211, + "grad_norm": 2.4095518589019775, + "learning_rate": 7.969883891099412e-05, + "loss": 1.4167, + "step": 16143 + }, + { + "epoch": 0.5781510197503894, + "grad_norm": 2.725630044937134, + "learning_rate": 7.968748156866113e-05, + "loss": 1.35, + "step": 16144 + }, + { + "epoch": 0.5781868318799578, + "grad_norm": 2.1629457473754883, + "learning_rate": 7.967612449960679e-05, + "loss": 1.6466, + "step": 16145 + }, + { + "epoch": 0.578222644009526, + "grad_norm": 1.4881287813186646, + "learning_rate": 7.966476770398404e-05, + "loss": 1.5601, + "step": 16146 + }, + { + "epoch": 0.5782584561390943, + "grad_norm": 1.5154433250427246, + "learning_rate": 7.965341118194559e-05, + "loss": 1.3574, + "step": 16147 + }, + { + "epoch": 0.5782942682686626, + "grad_norm": 1.5236823558807373, + "learning_rate": 7.964205493364426e-05, + "loss": 1.5902, + "step": 16148 + }, + { + "epoch": 0.5783300803982309, + "grad_norm": 1.974445104598999, + "learning_rate": 7.963069895923285e-05, + "loss": 1.5286, + "step": 16149 + }, + { + "epoch": 0.5783658925277991, + "grad_norm": 2.1835594177246094, + "learning_rate": 7.961934325886404e-05, + "loss": 1.2116, + "step": 16150 + }, + { + "epoch": 0.5784017046573674, + "grad_norm": 1.4302401542663574, + "learning_rate": 7.960798783269074e-05, + "loss": 1.565, + "step": 16151 + }, + { + "epoch": 0.5784375167869358, + "grad_norm": 2.1140241622924805, + "learning_rate": 7.95966326808656e-05, + "loss": 1.4891, + "step": 16152 + }, + { + "epoch": 0.578473328916504, + "grad_norm": 1.4253383874893188, + "learning_rate": 7.958527780354151e-05, + "loss": 1.386, + "step": 16153 + }, + { + "epoch": 0.5785091410460723, + "grad_norm": 1.2918790578842163, + "learning_rate": 7.957392320087112e-05, + "loss": 1.9235, + "step": 16154 + }, + { + "epoch": 0.5785449531756406, + "grad_norm": 1.124372959136963, + "learning_rate": 7.956256887300729e-05, + "loss": 1.4101, + "step": 16155 + }, + { + "epoch": 0.5785807653052089, + "grad_norm": 1.958508014678955, + "learning_rate": 7.955121482010268e-05, + "loss": 1.273, + "step": 16156 + }, + { + "epoch": 0.5786165774347771, + "grad_norm": 2.251087188720703, + "learning_rate": 7.953986104231018e-05, + "loss": 1.4734, + "step": 16157 + }, + { + "epoch": 0.5786523895643454, + "grad_norm": 2.016096353530884, + "learning_rate": 7.95285075397824e-05, + "loss": 1.6336, + "step": 16158 + }, + { + "epoch": 0.5786882016939138, + "grad_norm": 1.6927529573440552, + "learning_rate": 7.951715431267213e-05, + "loss": 1.6828, + "step": 16159 + }, + { + "epoch": 0.578724013823482, + "grad_norm": 1.4586488008499146, + "learning_rate": 7.950580136113219e-05, + "loss": 1.4452, + "step": 16160 + }, + { + "epoch": 0.5787598259530503, + "grad_norm": 1.6537890434265137, + "learning_rate": 7.949444868531517e-05, + "loss": 1.5215, + "step": 16161 + }, + { + "epoch": 0.5787956380826186, + "grad_norm": 1.5151437520980835, + "learning_rate": 7.948309628537399e-05, + "loss": 1.3912, + "step": 16162 + }, + { + "epoch": 0.5788314502121868, + "grad_norm": 1.7033724784851074, + "learning_rate": 7.94717441614612e-05, + "loss": 1.5666, + "step": 16163 + }, + { + "epoch": 0.5788672623417551, + "grad_norm": 1.2881311178207397, + "learning_rate": 7.946039231372967e-05, + "loss": 1.5314, + "step": 16164 + }, + { + "epoch": 0.5789030744713234, + "grad_norm": 1.7593480348587036, + "learning_rate": 7.944904074233201e-05, + "loss": 1.5596, + "step": 16165 + }, + { + "epoch": 0.5789388866008918, + "grad_norm": 1.4670255184173584, + "learning_rate": 7.943768944742107e-05, + "loss": 1.5353, + "step": 16166 + }, + { + "epoch": 0.57897469873046, + "grad_norm": 1.4414554834365845, + "learning_rate": 7.942633842914946e-05, + "loss": 1.538, + "step": 16167 + }, + { + "epoch": 0.5790105108600283, + "grad_norm": 1.5942708253860474, + "learning_rate": 7.941498768766991e-05, + "loss": 1.3996, + "step": 16168 + }, + { + "epoch": 0.5790463229895966, + "grad_norm": 2.574983596801758, + "learning_rate": 7.940363722313519e-05, + "loss": 1.3455, + "step": 16169 + }, + { + "epoch": 0.5790821351191648, + "grad_norm": 1.7376738786697388, + "learning_rate": 7.939228703569792e-05, + "loss": 1.5424, + "step": 16170 + }, + { + "epoch": 0.5791179472487331, + "grad_norm": 1.8088690042495728, + "learning_rate": 7.938093712551087e-05, + "loss": 1.7215, + "step": 16171 + }, + { + "epoch": 0.5791537593783014, + "grad_norm": 1.8485448360443115, + "learning_rate": 7.936958749272669e-05, + "loss": 1.6542, + "step": 16172 + }, + { + "epoch": 0.5791895715078698, + "grad_norm": 1.5748894214630127, + "learning_rate": 7.935823813749815e-05, + "loss": 1.5086, + "step": 16173 + }, + { + "epoch": 0.579225383637438, + "grad_norm": 2.4885716438293457, + "learning_rate": 7.934688905997781e-05, + "loss": 1.6712, + "step": 16174 + }, + { + "epoch": 0.5792611957670063, + "grad_norm": 1.5027422904968262, + "learning_rate": 7.933554026031852e-05, + "loss": 1.2673, + "step": 16175 + }, + { + "epoch": 0.5792970078965746, + "grad_norm": 1.5155824422836304, + "learning_rate": 7.932419173867286e-05, + "loss": 1.5036, + "step": 16176 + }, + { + "epoch": 0.5793328200261428, + "grad_norm": 1.7846710681915283, + "learning_rate": 7.93128434951935e-05, + "loss": 1.4373, + "step": 16177 + }, + { + "epoch": 0.5793686321557111, + "grad_norm": 1.8113620281219482, + "learning_rate": 7.930149553003318e-05, + "loss": 1.6359, + "step": 16178 + }, + { + "epoch": 0.5794044442852794, + "grad_norm": 1.718705654144287, + "learning_rate": 7.92901478433445e-05, + "loss": 1.4123, + "step": 16179 + }, + { + "epoch": 0.5794402564148478, + "grad_norm": 1.435090184211731, + "learning_rate": 7.92788004352802e-05, + "loss": 1.567, + "step": 16180 + }, + { + "epoch": 0.579476068544416, + "grad_norm": 1.6663739681243896, + "learning_rate": 7.926745330599289e-05, + "loss": 1.545, + "step": 16181 + }, + { + "epoch": 0.5795118806739843, + "grad_norm": 1.994544506072998, + "learning_rate": 7.925610645563527e-05, + "loss": 1.5623, + "step": 16182 + }, + { + "epoch": 0.5795476928035526, + "grad_norm": 2.0755226612091064, + "learning_rate": 7.924475988435996e-05, + "loss": 1.7355, + "step": 16183 + }, + { + "epoch": 0.5795835049331208, + "grad_norm": 1.354204773902893, + "learning_rate": 7.923341359231965e-05, + "loss": 1.4404, + "step": 16184 + }, + { + "epoch": 0.5796193170626891, + "grad_norm": 1.4060965776443481, + "learning_rate": 7.922206757966698e-05, + "loss": 1.1692, + "step": 16185 + }, + { + "epoch": 0.5796551291922574, + "grad_norm": 1.943017601966858, + "learning_rate": 7.921072184655457e-05, + "loss": 1.2975, + "step": 16186 + }, + { + "epoch": 0.5796909413218257, + "grad_norm": 1.4191889762878418, + "learning_rate": 7.91993763931351e-05, + "loss": 1.4388, + "step": 16187 + }, + { + "epoch": 0.579726753451394, + "grad_norm": 1.3876757621765137, + "learning_rate": 7.918803121956117e-05, + "loss": 1.4685, + "step": 16188 + }, + { + "epoch": 0.5797625655809623, + "grad_norm": 1.7913190126419067, + "learning_rate": 7.917668632598545e-05, + "loss": 1.269, + "step": 16189 + }, + { + "epoch": 0.5797983777105306, + "grad_norm": 2.235036611557007, + "learning_rate": 7.916534171256054e-05, + "loss": 1.1393, + "step": 16190 + }, + { + "epoch": 0.5798341898400988, + "grad_norm": 1.541192889213562, + "learning_rate": 7.91539973794391e-05, + "loss": 1.7477, + "step": 16191 + }, + { + "epoch": 0.5798700019696671, + "grad_norm": 1.6059012413024902, + "learning_rate": 7.914265332677371e-05, + "loss": 1.4964, + "step": 16192 + }, + { + "epoch": 0.5799058140992354, + "grad_norm": 1.5511407852172852, + "learning_rate": 7.913130955471704e-05, + "loss": 1.7551, + "step": 16193 + }, + { + "epoch": 0.5799416262288037, + "grad_norm": 1.5841376781463623, + "learning_rate": 7.911996606342168e-05, + "loss": 1.3235, + "step": 16194 + }, + { + "epoch": 0.579977438358372, + "grad_norm": 1.651566982269287, + "learning_rate": 7.910862285304022e-05, + "loss": 1.4021, + "step": 16195 + }, + { + "epoch": 0.5800132504879403, + "grad_norm": 1.7354012727737427, + "learning_rate": 7.909727992372533e-05, + "loss": 1.5678, + "step": 16196 + }, + { + "epoch": 0.5800490626175085, + "grad_norm": 1.7143256664276123, + "learning_rate": 7.908593727562954e-05, + "loss": 1.3513, + "step": 16197 + }, + { + "epoch": 0.5800848747470768, + "grad_norm": 1.5278571844100952, + "learning_rate": 7.907459490890551e-05, + "loss": 1.2315, + "step": 16198 + }, + { + "epoch": 0.5801206868766451, + "grad_norm": 1.3946882486343384, + "learning_rate": 7.906325282370579e-05, + "loss": 1.7144, + "step": 16199 + }, + { + "epoch": 0.5801564990062134, + "grad_norm": 1.6457384824752808, + "learning_rate": 7.905191102018302e-05, + "loss": 1.3999, + "step": 16200 + }, + { + "epoch": 0.5801923111357817, + "grad_norm": 1.8805018663406372, + "learning_rate": 7.904056949848975e-05, + "loss": 1.5579, + "step": 16201 + }, + { + "epoch": 0.58022812326535, + "grad_norm": 2.192815065383911, + "learning_rate": 7.90292282587786e-05, + "loss": 1.4462, + "step": 16202 + }, + { + "epoch": 0.5802639353949183, + "grad_norm": 1.707252860069275, + "learning_rate": 7.901788730120214e-05, + "loss": 1.1617, + "step": 16203 + }, + { + "epoch": 0.5802997475244865, + "grad_norm": 2.2788679599761963, + "learning_rate": 7.90065466259129e-05, + "loss": 1.3855, + "step": 16204 + }, + { + "epoch": 0.5803355596540548, + "grad_norm": 1.7162657976150513, + "learning_rate": 7.899520623306353e-05, + "loss": 1.2427, + "step": 16205 + }, + { + "epoch": 0.5803713717836231, + "grad_norm": 2.0036752223968506, + "learning_rate": 7.898386612280654e-05, + "loss": 1.4296, + "step": 16206 + }, + { + "epoch": 0.5804071839131913, + "grad_norm": 1.5575006008148193, + "learning_rate": 7.897252629529455e-05, + "loss": 1.6481, + "step": 16207 + }, + { + "epoch": 0.5804429960427597, + "grad_norm": 1.8689552545547485, + "learning_rate": 7.896118675068007e-05, + "loss": 1.5716, + "step": 16208 + }, + { + "epoch": 0.580478808172328, + "grad_norm": 1.734002947807312, + "learning_rate": 7.894984748911572e-05, + "loss": 1.4126, + "step": 16209 + }, + { + "epoch": 0.5805146203018963, + "grad_norm": 1.9795552492141724, + "learning_rate": 7.893850851075398e-05, + "loss": 1.3466, + "step": 16210 + }, + { + "epoch": 0.5805504324314645, + "grad_norm": 1.9898791313171387, + "learning_rate": 7.892716981574747e-05, + "loss": 1.5883, + "step": 16211 + }, + { + "epoch": 0.5805862445610328, + "grad_norm": 1.4967081546783447, + "learning_rate": 7.89158314042487e-05, + "loss": 1.5994, + "step": 16212 + }, + { + "epoch": 0.5806220566906011, + "grad_norm": 1.672903299331665, + "learning_rate": 7.890449327641021e-05, + "loss": 1.7382, + "step": 16213 + }, + { + "epoch": 0.5806578688201693, + "grad_norm": 1.7039663791656494, + "learning_rate": 7.889315543238457e-05, + "loss": 1.6315, + "step": 16214 + }, + { + "epoch": 0.5806936809497377, + "grad_norm": 1.6308118104934692, + "learning_rate": 7.888181787232427e-05, + "loss": 1.5503, + "step": 16215 + }, + { + "epoch": 0.580729493079306, + "grad_norm": 1.8817521333694458, + "learning_rate": 7.88704805963819e-05, + "loss": 1.5972, + "step": 16216 + }, + { + "epoch": 0.5807653052088743, + "grad_norm": 1.8613077402114868, + "learning_rate": 7.885914360470992e-05, + "loss": 1.6216, + "step": 16217 + }, + { + "epoch": 0.5808011173384425, + "grad_norm": 1.5126614570617676, + "learning_rate": 7.884780689746094e-05, + "loss": 1.4662, + "step": 16218 + }, + { + "epoch": 0.5808369294680108, + "grad_norm": 1.5945414304733276, + "learning_rate": 7.88364704747874e-05, + "loss": 1.5989, + "step": 16219 + }, + { + "epoch": 0.5808727415975791, + "grad_norm": 1.5594874620437622, + "learning_rate": 7.882513433684188e-05, + "loss": 1.5519, + "step": 16220 + }, + { + "epoch": 0.5809085537271473, + "grad_norm": 1.834822177886963, + "learning_rate": 7.881379848377685e-05, + "loss": 1.6284, + "step": 16221 + }, + { + "epoch": 0.5809443658567157, + "grad_norm": 2.2757322788238525, + "learning_rate": 7.880246291574482e-05, + "loss": 1.3027, + "step": 16222 + }, + { + "epoch": 0.580980177986284, + "grad_norm": 1.6082566976547241, + "learning_rate": 7.879112763289833e-05, + "loss": 1.4609, + "step": 16223 + }, + { + "epoch": 0.5810159901158523, + "grad_norm": 1.623914361000061, + "learning_rate": 7.877979263538983e-05, + "loss": 1.5868, + "step": 16224 + }, + { + "epoch": 0.5810518022454205, + "grad_norm": 1.4521030187606812, + "learning_rate": 7.876845792337189e-05, + "loss": 1.3755, + "step": 16225 + }, + { + "epoch": 0.5810876143749888, + "grad_norm": 1.9787819385528564, + "learning_rate": 7.875712349699692e-05, + "loss": 1.0918, + "step": 16226 + }, + { + "epoch": 0.5811234265045571, + "grad_norm": 2.0238037109375, + "learning_rate": 7.874578935641748e-05, + "loss": 1.3673, + "step": 16227 + }, + { + "epoch": 0.5811592386341253, + "grad_norm": 1.6591154336929321, + "learning_rate": 7.873445550178601e-05, + "loss": 1.4138, + "step": 16228 + }, + { + "epoch": 0.5811950507636937, + "grad_norm": 1.5953693389892578, + "learning_rate": 7.872312193325502e-05, + "loss": 1.095, + "step": 16229 + }, + { + "epoch": 0.581230862893262, + "grad_norm": 1.9868221282958984, + "learning_rate": 7.871178865097699e-05, + "loss": 1.5134, + "step": 16230 + }, + { + "epoch": 0.5812666750228302, + "grad_norm": 1.6181082725524902, + "learning_rate": 7.870045565510436e-05, + "loss": 1.344, + "step": 16231 + }, + { + "epoch": 0.5813024871523985, + "grad_norm": 1.8344509601593018, + "learning_rate": 7.868912294578965e-05, + "loss": 1.2748, + "step": 16232 + }, + { + "epoch": 0.5813382992819668, + "grad_norm": 1.7959150075912476, + "learning_rate": 7.867779052318528e-05, + "loss": 1.7419, + "step": 16233 + }, + { + "epoch": 0.581374111411535, + "grad_norm": 1.6314499378204346, + "learning_rate": 7.866645838744375e-05, + "loss": 1.7027, + "step": 16234 + }, + { + "epoch": 0.5814099235411033, + "grad_norm": 2.4061760902404785, + "learning_rate": 7.865512653871749e-05, + "loss": 1.3808, + "step": 16235 + }, + { + "epoch": 0.5814457356706717, + "grad_norm": 1.7618334293365479, + "learning_rate": 7.864379497715898e-05, + "loss": 1.5516, + "step": 16236 + }, + { + "epoch": 0.58148154780024, + "grad_norm": 1.2001464366912842, + "learning_rate": 7.863246370292065e-05, + "loss": 1.2329, + "step": 16237 + }, + { + "epoch": 0.5815173599298082, + "grad_norm": 1.4679129123687744, + "learning_rate": 7.862113271615499e-05, + "loss": 1.3114, + "step": 16238 + }, + { + "epoch": 0.5815531720593765, + "grad_norm": 1.4955120086669922, + "learning_rate": 7.860980201701441e-05, + "loss": 1.4209, + "step": 16239 + }, + { + "epoch": 0.5815889841889448, + "grad_norm": 1.7343052625656128, + "learning_rate": 7.859847160565131e-05, + "loss": 1.4165, + "step": 16240 + }, + { + "epoch": 0.581624796318513, + "grad_norm": 2.181267499923706, + "learning_rate": 7.858714148221822e-05, + "loss": 1.6077, + "step": 16241 + }, + { + "epoch": 0.5816606084480813, + "grad_norm": 1.4039392471313477, + "learning_rate": 7.857581164686744e-05, + "loss": 1.7029, + "step": 16242 + }, + { + "epoch": 0.5816964205776497, + "grad_norm": 2.262587070465088, + "learning_rate": 7.856448209975156e-05, + "loss": 1.7322, + "step": 16243 + }, + { + "epoch": 0.581732232707218, + "grad_norm": 1.8444851636886597, + "learning_rate": 7.855315284102288e-05, + "loss": 1.194, + "step": 16244 + }, + { + "epoch": 0.5817680448367862, + "grad_norm": 1.571876049041748, + "learning_rate": 7.854182387083389e-05, + "loss": 1.4038, + "step": 16245 + }, + { + "epoch": 0.5818038569663545, + "grad_norm": 2.275881767272949, + "learning_rate": 7.8530495189337e-05, + "loss": 1.6678, + "step": 16246 + }, + { + "epoch": 0.5818396690959228, + "grad_norm": 1.6172714233398438, + "learning_rate": 7.851916679668454e-05, + "loss": 1.4166, + "step": 16247 + }, + { + "epoch": 0.581875481225491, + "grad_norm": 2.5062615871429443, + "learning_rate": 7.850783869302905e-05, + "loss": 1.5164, + "step": 16248 + }, + { + "epoch": 0.5819112933550593, + "grad_norm": 2.113309144973755, + "learning_rate": 7.849651087852278e-05, + "loss": 1.3307, + "step": 16249 + }, + { + "epoch": 0.5819471054846277, + "grad_norm": 1.644601821899414, + "learning_rate": 7.848518335331832e-05, + "loss": 1.6786, + "step": 16250 + }, + { + "epoch": 0.581982917614196, + "grad_norm": 1.3407495021820068, + "learning_rate": 7.847385611756788e-05, + "loss": 1.343, + "step": 16251 + }, + { + "epoch": 0.5820187297437642, + "grad_norm": 1.6835163831710815, + "learning_rate": 7.8462529171424e-05, + "loss": 1.1167, + "step": 16252 + }, + { + "epoch": 0.5820545418733325, + "grad_norm": 1.790061593055725, + "learning_rate": 7.845120251503896e-05, + "loss": 1.8415, + "step": 16253 + }, + { + "epoch": 0.5820903540029008, + "grad_norm": 1.642216444015503, + "learning_rate": 7.843987614856525e-05, + "loss": 1.3947, + "step": 16254 + }, + { + "epoch": 0.582126166132469, + "grad_norm": 1.661774754524231, + "learning_rate": 7.842855007215517e-05, + "loss": 1.1908, + "step": 16255 + }, + { + "epoch": 0.5821619782620373, + "grad_norm": 1.867827296257019, + "learning_rate": 7.841722428596109e-05, + "loss": 1.5475, + "step": 16256 + }, + { + "epoch": 0.5821977903916057, + "grad_norm": 1.7840546369552612, + "learning_rate": 7.840589879013548e-05, + "loss": 1.6112, + "step": 16257 + }, + { + "epoch": 0.582233602521174, + "grad_norm": 1.458702802658081, + "learning_rate": 7.839457358483057e-05, + "loss": 1.3471, + "step": 16258 + }, + { + "epoch": 0.5822694146507422, + "grad_norm": 1.494171142578125, + "learning_rate": 7.838324867019888e-05, + "loss": 1.2515, + "step": 16259 + }, + { + "epoch": 0.5823052267803105, + "grad_norm": 1.5532360076904297, + "learning_rate": 7.837192404639264e-05, + "loss": 1.4255, + "step": 16260 + }, + { + "epoch": 0.5823410389098788, + "grad_norm": 1.6035205125808716, + "learning_rate": 7.83605997135643e-05, + "loss": 1.502, + "step": 16261 + }, + { + "epoch": 0.582376851039447, + "grad_norm": 1.7551525831222534, + "learning_rate": 7.834927567186614e-05, + "loss": 1.3939, + "step": 16262 + }, + { + "epoch": 0.5824126631690153, + "grad_norm": 1.5942054986953735, + "learning_rate": 7.833795192145062e-05, + "loss": 1.4372, + "step": 16263 + }, + { + "epoch": 0.5824484752985837, + "grad_norm": 1.804587960243225, + "learning_rate": 7.832662846246997e-05, + "loss": 1.4715, + "step": 16264 + }, + { + "epoch": 0.582484287428152, + "grad_norm": 1.6673866510391235, + "learning_rate": 7.831530529507656e-05, + "loss": 1.4346, + "step": 16265 + }, + { + "epoch": 0.5825200995577202, + "grad_norm": 1.700072169303894, + "learning_rate": 7.830398241942278e-05, + "loss": 1.438, + "step": 16266 + }, + { + "epoch": 0.5825559116872885, + "grad_norm": 1.6875628232955933, + "learning_rate": 7.829265983566088e-05, + "loss": 1.4232, + "step": 16267 + }, + { + "epoch": 0.5825917238168568, + "grad_norm": 1.3997061252593994, + "learning_rate": 7.82813375439433e-05, + "loss": 1.5627, + "step": 16268 + }, + { + "epoch": 0.582627535946425, + "grad_norm": 2.5741705894470215, + "learning_rate": 7.827001554442224e-05, + "loss": 1.5578, + "step": 16269 + }, + { + "epoch": 0.5826633480759933, + "grad_norm": 1.8426802158355713, + "learning_rate": 7.825869383725017e-05, + "loss": 1.4258, + "step": 16270 + }, + { + "epoch": 0.5826991602055617, + "grad_norm": 1.2935112714767456, + "learning_rate": 7.824737242257925e-05, + "loss": 1.4715, + "step": 16271 + }, + { + "epoch": 0.5827349723351299, + "grad_norm": 1.767466425895691, + "learning_rate": 7.823605130056196e-05, + "loss": 1.5631, + "step": 16272 + }, + { + "epoch": 0.5827707844646982, + "grad_norm": 2.372025489807129, + "learning_rate": 7.822473047135048e-05, + "loss": 1.4195, + "step": 16273 + }, + { + "epoch": 0.5828065965942665, + "grad_norm": 1.9583728313446045, + "learning_rate": 7.821340993509716e-05, + "loss": 1.5143, + "step": 16274 + }, + { + "epoch": 0.5828424087238347, + "grad_norm": 1.539113163948059, + "learning_rate": 7.820208969195432e-05, + "loss": 1.4506, + "step": 16275 + }, + { + "epoch": 0.582878220853403, + "grad_norm": 1.511946201324463, + "learning_rate": 7.819076974207425e-05, + "loss": 1.7248, + "step": 16276 + }, + { + "epoch": 0.5829140329829713, + "grad_norm": 1.7202016115188599, + "learning_rate": 7.817945008560923e-05, + "loss": 1.5891, + "step": 16277 + }, + { + "epoch": 0.5829498451125397, + "grad_norm": 1.5539473295211792, + "learning_rate": 7.816813072271155e-05, + "loss": 1.2524, + "step": 16278 + }, + { + "epoch": 0.5829856572421079, + "grad_norm": 1.6296128034591675, + "learning_rate": 7.815681165353353e-05, + "loss": 1.3194, + "step": 16279 + }, + { + "epoch": 0.5830214693716762, + "grad_norm": 1.5075005292892456, + "learning_rate": 7.814549287822743e-05, + "loss": 1.8098, + "step": 16280 + }, + { + "epoch": 0.5830572815012445, + "grad_norm": 2.4536232948303223, + "learning_rate": 7.813417439694553e-05, + "loss": 1.7013, + "step": 16281 + }, + { + "epoch": 0.5830930936308127, + "grad_norm": 1.7020354270935059, + "learning_rate": 7.812285620984012e-05, + "loss": 1.2331, + "step": 16282 + }, + { + "epoch": 0.583128905760381, + "grad_norm": 1.890274167060852, + "learning_rate": 7.811153831706344e-05, + "loss": 1.5311, + "step": 16283 + }, + { + "epoch": 0.5831647178899493, + "grad_norm": 1.4711588621139526, + "learning_rate": 7.81002207187678e-05, + "loss": 1.5288, + "step": 16284 + }, + { + "epoch": 0.5832005300195177, + "grad_norm": 1.563541293144226, + "learning_rate": 7.808890341510542e-05, + "loss": 1.3513, + "step": 16285 + }, + { + "epoch": 0.5832363421490859, + "grad_norm": 2.1257882118225098, + "learning_rate": 7.80775864062286e-05, + "loss": 1.2747, + "step": 16286 + }, + { + "epoch": 0.5832721542786542, + "grad_norm": 1.6666767597198486, + "learning_rate": 7.806626969228955e-05, + "loss": 1.256, + "step": 16287 + }, + { + "epoch": 0.5833079664082225, + "grad_norm": 1.7806174755096436, + "learning_rate": 7.805495327344058e-05, + "loss": 1.6864, + "step": 16288 + }, + { + "epoch": 0.5833437785377907, + "grad_norm": 1.5467238426208496, + "learning_rate": 7.804363714983387e-05, + "loss": 1.2907, + "step": 16289 + }, + { + "epoch": 0.583379590667359, + "grad_norm": 1.4566665887832642, + "learning_rate": 7.803232132162174e-05, + "loss": 1.5714, + "step": 16290 + }, + { + "epoch": 0.5834154027969273, + "grad_norm": 1.4355965852737427, + "learning_rate": 7.802100578895638e-05, + "loss": 1.507, + "step": 16291 + }, + { + "epoch": 0.5834512149264957, + "grad_norm": 1.7385189533233643, + "learning_rate": 7.800969055199003e-05, + "loss": 1.5291, + "step": 16292 + }, + { + "epoch": 0.5834870270560639, + "grad_norm": 1.3698575496673584, + "learning_rate": 7.799837561087493e-05, + "loss": 1.3751, + "step": 16293 + }, + { + "epoch": 0.5835228391856322, + "grad_norm": 1.8529399633407593, + "learning_rate": 7.798706096576329e-05, + "loss": 1.5273, + "step": 16294 + }, + { + "epoch": 0.5835586513152005, + "grad_norm": 1.3171405792236328, + "learning_rate": 7.797574661680737e-05, + "loss": 1.0538, + "step": 16295 + }, + { + "epoch": 0.5835944634447687, + "grad_norm": 1.4912500381469727, + "learning_rate": 7.796443256415935e-05, + "loss": 1.6792, + "step": 16296 + }, + { + "epoch": 0.583630275574337, + "grad_norm": 1.7634475231170654, + "learning_rate": 7.79531188079715e-05, + "loss": 1.4089, + "step": 16297 + }, + { + "epoch": 0.5836660877039053, + "grad_norm": 2.007218599319458, + "learning_rate": 7.794180534839597e-05, + "loss": 1.2288, + "step": 16298 + }, + { + "epoch": 0.5837018998334736, + "grad_norm": 2.622135877609253, + "learning_rate": 7.793049218558501e-05, + "loss": 1.4607, + "step": 16299 + }, + { + "epoch": 0.5837377119630419, + "grad_norm": 1.8438994884490967, + "learning_rate": 7.791917931969082e-05, + "loss": 1.6507, + "step": 16300 + }, + { + "epoch": 0.5837735240926102, + "grad_norm": 1.5510149002075195, + "learning_rate": 7.790786675086555e-05, + "loss": 1.5637, + "step": 16301 + }, + { + "epoch": 0.5838093362221785, + "grad_norm": 1.6385968923568726, + "learning_rate": 7.789655447926147e-05, + "loss": 1.5741, + "step": 16302 + }, + { + "epoch": 0.5838451483517467, + "grad_norm": 1.8733488321304321, + "learning_rate": 7.788524250503072e-05, + "loss": 1.6931, + "step": 16303 + }, + { + "epoch": 0.583880960481315, + "grad_norm": 1.5182271003723145, + "learning_rate": 7.787393082832553e-05, + "loss": 1.5635, + "step": 16304 + }, + { + "epoch": 0.5839167726108833, + "grad_norm": 1.9357662200927734, + "learning_rate": 7.786261944929803e-05, + "loss": 1.447, + "step": 16305 + }, + { + "epoch": 0.5839525847404516, + "grad_norm": 3.7043070793151855, + "learning_rate": 7.785130836810045e-05, + "loss": 1.4161, + "step": 16306 + }, + { + "epoch": 0.5839883968700199, + "grad_norm": 1.9308794736862183, + "learning_rate": 7.783999758488492e-05, + "loss": 1.3324, + "step": 16307 + }, + { + "epoch": 0.5840242089995882, + "grad_norm": 1.5649685859680176, + "learning_rate": 7.782868709980368e-05, + "loss": 1.5649, + "step": 16308 + }, + { + "epoch": 0.5840600211291564, + "grad_norm": 1.5657800436019897, + "learning_rate": 7.781737691300884e-05, + "loss": 1.4936, + "step": 16309 + }, + { + "epoch": 0.5840958332587247, + "grad_norm": 1.9875231981277466, + "learning_rate": 7.780606702465256e-05, + "loss": 1.534, + "step": 16310 + }, + { + "epoch": 0.584131645388293, + "grad_norm": 1.580452799797058, + "learning_rate": 7.779475743488705e-05, + "loss": 1.5182, + "step": 16311 + }, + { + "epoch": 0.5841674575178613, + "grad_norm": 1.5643384456634521, + "learning_rate": 7.778344814386441e-05, + "loss": 1.3858, + "step": 16312 + }, + { + "epoch": 0.5842032696474296, + "grad_norm": 1.5773082971572876, + "learning_rate": 7.777213915173685e-05, + "loss": 1.5475, + "step": 16313 + }, + { + "epoch": 0.5842390817769979, + "grad_norm": 1.6503983736038208, + "learning_rate": 7.776083045865645e-05, + "loss": 1.3458, + "step": 16314 + }, + { + "epoch": 0.5842748939065662, + "grad_norm": 2.040325403213501, + "learning_rate": 7.774952206477542e-05, + "loss": 1.533, + "step": 16315 + }, + { + "epoch": 0.5843107060361344, + "grad_norm": 1.8997361660003662, + "learning_rate": 7.773821397024584e-05, + "loss": 1.6031, + "step": 16316 + }, + { + "epoch": 0.5843465181657027, + "grad_norm": 1.708116054534912, + "learning_rate": 7.77269061752199e-05, + "loss": 1.4829, + "step": 16317 + }, + { + "epoch": 0.584382330295271, + "grad_norm": 1.731094241142273, + "learning_rate": 7.77155986798497e-05, + "loss": 1.5349, + "step": 16318 + }, + { + "epoch": 0.5844181424248392, + "grad_norm": 1.470812439918518, + "learning_rate": 7.770429148428736e-05, + "loss": 1.3997, + "step": 16319 + }, + { + "epoch": 0.5844539545544076, + "grad_norm": 1.660354495048523, + "learning_rate": 7.769298458868504e-05, + "loss": 1.2777, + "step": 16320 + }, + { + "epoch": 0.5844897666839759, + "grad_norm": 1.6110458374023438, + "learning_rate": 7.768167799319481e-05, + "loss": 1.3919, + "step": 16321 + }, + { + "epoch": 0.5845255788135442, + "grad_norm": 1.8067439794540405, + "learning_rate": 7.767037169796885e-05, + "loss": 1.4326, + "step": 16322 + }, + { + "epoch": 0.5845613909431124, + "grad_norm": 1.5376381874084473, + "learning_rate": 7.76590657031592e-05, + "loss": 1.5476, + "step": 16323 + }, + { + "epoch": 0.5845972030726807, + "grad_norm": 2.144806385040283, + "learning_rate": 7.764776000891805e-05, + "loss": 1.2215, + "step": 16324 + }, + { + "epoch": 0.584633015202249, + "grad_norm": 1.5610839128494263, + "learning_rate": 7.763645461539741e-05, + "loss": 1.0606, + "step": 16325 + }, + { + "epoch": 0.5846688273318172, + "grad_norm": 1.6355888843536377, + "learning_rate": 7.762514952274945e-05, + "loss": 1.3254, + "step": 16326 + }, + { + "epoch": 0.5847046394613856, + "grad_norm": 2.3415260314941406, + "learning_rate": 7.761384473112625e-05, + "loss": 1.5028, + "step": 16327 + }, + { + "epoch": 0.5847404515909539, + "grad_norm": 1.3841601610183716, + "learning_rate": 7.760254024067986e-05, + "loss": 1.3328, + "step": 16328 + }, + { + "epoch": 0.5847762637205222, + "grad_norm": 2.1620593070983887, + "learning_rate": 7.759123605156243e-05, + "loss": 1.6989, + "step": 16329 + }, + { + "epoch": 0.5848120758500904, + "grad_norm": 1.456923007965088, + "learning_rate": 7.757993216392599e-05, + "loss": 1.5152, + "step": 16330 + }, + { + "epoch": 0.5848478879796587, + "grad_norm": 1.7215697765350342, + "learning_rate": 7.756862857792268e-05, + "loss": 1.7743, + "step": 16331 + }, + { + "epoch": 0.584883700109227, + "grad_norm": 1.5496634244918823, + "learning_rate": 7.755732529370449e-05, + "loss": 1.5552, + "step": 16332 + }, + { + "epoch": 0.5849195122387952, + "grad_norm": 1.8300232887268066, + "learning_rate": 7.754602231142359e-05, + "loss": 1.7157, + "step": 16333 + }, + { + "epoch": 0.5849553243683636, + "grad_norm": 1.5172995328903198, + "learning_rate": 7.753471963123196e-05, + "loss": 1.148, + "step": 16334 + }, + { + "epoch": 0.5849911364979319, + "grad_norm": 1.6155294179916382, + "learning_rate": 7.752341725328171e-05, + "loss": 1.2817, + "step": 16335 + }, + { + "epoch": 0.5850269486275002, + "grad_norm": 1.5804482698440552, + "learning_rate": 7.751211517772491e-05, + "loss": 1.3887, + "step": 16336 + }, + { + "epoch": 0.5850627607570684, + "grad_norm": 1.509569525718689, + "learning_rate": 7.750081340471355e-05, + "loss": 1.3041, + "step": 16337 + }, + { + "epoch": 0.5850985728866367, + "grad_norm": 1.4579132795333862, + "learning_rate": 7.748951193439977e-05, + "loss": 1.5923, + "step": 16338 + }, + { + "epoch": 0.585134385016205, + "grad_norm": 1.461970567703247, + "learning_rate": 7.747821076693551e-05, + "loss": 1.3511, + "step": 16339 + }, + { + "epoch": 0.5851701971457732, + "grad_norm": 1.2731819152832031, + "learning_rate": 7.746690990247291e-05, + "loss": 1.4716, + "step": 16340 + }, + { + "epoch": 0.5852060092753416, + "grad_norm": 1.532819151878357, + "learning_rate": 7.745560934116398e-05, + "loss": 1.3444, + "step": 16341 + }, + { + "epoch": 0.5852418214049099, + "grad_norm": 1.585973858833313, + "learning_rate": 7.744430908316074e-05, + "loss": 1.5642, + "step": 16342 + }, + { + "epoch": 0.5852776335344781, + "grad_norm": 1.9086010456085205, + "learning_rate": 7.743300912861525e-05, + "loss": 1.3634, + "step": 16343 + }, + { + "epoch": 0.5853134456640464, + "grad_norm": 2.089420795440674, + "learning_rate": 7.742170947767945e-05, + "loss": 1.5245, + "step": 16344 + }, + { + "epoch": 0.5853492577936147, + "grad_norm": 1.5326855182647705, + "learning_rate": 7.741041013050549e-05, + "loss": 1.4089, + "step": 16345 + }, + { + "epoch": 0.585385069923183, + "grad_norm": 1.6081247329711914, + "learning_rate": 7.739911108724527e-05, + "loss": 1.3786, + "step": 16346 + }, + { + "epoch": 0.5854208820527512, + "grad_norm": 1.4763007164001465, + "learning_rate": 7.73878123480509e-05, + "loss": 1.3, + "step": 16347 + }, + { + "epoch": 0.5854566941823196, + "grad_norm": 1.4239866733551025, + "learning_rate": 7.73765139130743e-05, + "loss": 1.4136, + "step": 16348 + }, + { + "epoch": 0.5854925063118879, + "grad_norm": 1.4767128229141235, + "learning_rate": 7.736521578246758e-05, + "loss": 1.6589, + "step": 16349 + }, + { + "epoch": 0.5855283184414561, + "grad_norm": 2.082230806350708, + "learning_rate": 7.735391795638262e-05, + "loss": 1.2585, + "step": 16350 + }, + { + "epoch": 0.5855641305710244, + "grad_norm": 1.4269074201583862, + "learning_rate": 7.734262043497155e-05, + "loss": 1.7076, + "step": 16351 + }, + { + "epoch": 0.5855999427005927, + "grad_norm": 1.9690130949020386, + "learning_rate": 7.733132321838628e-05, + "loss": 1.4339, + "step": 16352 + }, + { + "epoch": 0.585635754830161, + "grad_norm": 1.7755242586135864, + "learning_rate": 7.732002630677878e-05, + "loss": 1.3142, + "step": 16353 + }, + { + "epoch": 0.5856715669597292, + "grad_norm": 1.3469429016113281, + "learning_rate": 7.73087297003011e-05, + "loss": 1.725, + "step": 16354 + }, + { + "epoch": 0.5857073790892976, + "grad_norm": 1.6812926530838013, + "learning_rate": 7.729743339910515e-05, + "loss": 1.1344, + "step": 16355 + }, + { + "epoch": 0.5857431912188659, + "grad_norm": 1.4505376815795898, + "learning_rate": 7.728613740334304e-05, + "loss": 1.3823, + "step": 16356 + }, + { + "epoch": 0.5857790033484341, + "grad_norm": 1.8146635293960571, + "learning_rate": 7.727484171316655e-05, + "loss": 1.3706, + "step": 16357 + }, + { + "epoch": 0.5858148154780024, + "grad_norm": 1.5328073501586914, + "learning_rate": 7.726354632872783e-05, + "loss": 1.5141, + "step": 16358 + }, + { + "epoch": 0.5858506276075707, + "grad_norm": 1.5390815734863281, + "learning_rate": 7.72522512501787e-05, + "loss": 1.3987, + "step": 16359 + }, + { + "epoch": 0.5858864397371389, + "grad_norm": 1.7571913003921509, + "learning_rate": 7.724095647767125e-05, + "loss": 1.7021, + "step": 16360 + }, + { + "epoch": 0.5859222518667072, + "grad_norm": 1.810835361480713, + "learning_rate": 7.722966201135736e-05, + "loss": 1.5146, + "step": 16361 + }, + { + "epoch": 0.5859580639962756, + "grad_norm": 1.467986822128296, + "learning_rate": 7.721836785138896e-05, + "loss": 1.0774, + "step": 16362 + }, + { + "epoch": 0.5859938761258439, + "grad_norm": 1.3759533166885376, + "learning_rate": 7.720707399791807e-05, + "loss": 1.5041, + "step": 16363 + }, + { + "epoch": 0.5860296882554121, + "grad_norm": 1.5349520444869995, + "learning_rate": 7.719578045109657e-05, + "loss": 1.4431, + "step": 16364 + }, + { + "epoch": 0.5860655003849804, + "grad_norm": 1.4117509126663208, + "learning_rate": 7.718448721107645e-05, + "loss": 1.1674, + "step": 16365 + }, + { + "epoch": 0.5861013125145487, + "grad_norm": 1.8245066404342651, + "learning_rate": 7.717319427800957e-05, + "loss": 1.4755, + "step": 16366 + }, + { + "epoch": 0.5861371246441169, + "grad_norm": 1.6505935192108154, + "learning_rate": 7.7161901652048e-05, + "loss": 1.2912, + "step": 16367 + }, + { + "epoch": 0.5861729367736852, + "grad_norm": 1.3770312070846558, + "learning_rate": 7.715060933334351e-05, + "loss": 1.3058, + "step": 16368 + }, + { + "epoch": 0.5862087489032536, + "grad_norm": 2.2501673698425293, + "learning_rate": 7.713931732204816e-05, + "loss": 1.6156, + "step": 16369 + }, + { + "epoch": 0.5862445610328219, + "grad_norm": 1.9228209257125854, + "learning_rate": 7.712802561831381e-05, + "loss": 1.5591, + "step": 16370 + }, + { + "epoch": 0.5862803731623901, + "grad_norm": 1.1532500982284546, + "learning_rate": 7.711673422229232e-05, + "loss": 1.4264, + "step": 16371 + }, + { + "epoch": 0.5863161852919584, + "grad_norm": 1.937796711921692, + "learning_rate": 7.71054431341357e-05, + "loss": 1.4744, + "step": 16372 + }, + { + "epoch": 0.5863519974215267, + "grad_norm": 1.4498701095581055, + "learning_rate": 7.709415235399577e-05, + "loss": 1.4989, + "step": 16373 + }, + { + "epoch": 0.5863878095510949, + "grad_norm": 1.9454270601272583, + "learning_rate": 7.708286188202451e-05, + "loss": 1.521, + "step": 16374 + }, + { + "epoch": 0.5864236216806632, + "grad_norm": 1.8580267429351807, + "learning_rate": 7.707157171837374e-05, + "loss": 1.3413, + "step": 16375 + }, + { + "epoch": 0.5864594338102316, + "grad_norm": 2.0274364948272705, + "learning_rate": 7.706028186319543e-05, + "loss": 1.4014, + "step": 16376 + }, + { + "epoch": 0.5864952459397998, + "grad_norm": 1.5042753219604492, + "learning_rate": 7.704899231664143e-05, + "loss": 1.5814, + "step": 16377 + }, + { + "epoch": 0.5865310580693681, + "grad_norm": 1.5244011878967285, + "learning_rate": 7.703770307886364e-05, + "loss": 1.1341, + "step": 16378 + }, + { + "epoch": 0.5865668701989364, + "grad_norm": 2.079533576965332, + "learning_rate": 7.702641415001394e-05, + "loss": 1.9471, + "step": 16379 + }, + { + "epoch": 0.5866026823285047, + "grad_norm": 1.7344539165496826, + "learning_rate": 7.701512553024418e-05, + "loss": 1.3302, + "step": 16380 + }, + { + "epoch": 0.5866384944580729, + "grad_norm": 1.6465486288070679, + "learning_rate": 7.700383721970628e-05, + "loss": 1.3835, + "step": 16381 + }, + { + "epoch": 0.5866743065876412, + "grad_norm": 2.1004645824432373, + "learning_rate": 7.699254921855206e-05, + "loss": 1.6895, + "step": 16382 + }, + { + "epoch": 0.5867101187172096, + "grad_norm": 1.8558729887008667, + "learning_rate": 7.698126152693345e-05, + "loss": 1.4859, + "step": 16383 + }, + { + "epoch": 0.5867459308467778, + "grad_norm": 1.7315419912338257, + "learning_rate": 7.696997414500223e-05, + "loss": 1.3328, + "step": 16384 + }, + { + "epoch": 0.5867817429763461, + "grad_norm": 2.36985182762146, + "learning_rate": 7.695868707291034e-05, + "loss": 1.4141, + "step": 16385 + }, + { + "epoch": 0.5868175551059144, + "grad_norm": 1.4058644771575928, + "learning_rate": 7.694740031080957e-05, + "loss": 1.2073, + "step": 16386 + }, + { + "epoch": 0.5868533672354826, + "grad_norm": 1.6645869016647339, + "learning_rate": 7.693611385885181e-05, + "loss": 1.488, + "step": 16387 + }, + { + "epoch": 0.5868891793650509, + "grad_norm": 1.4396991729736328, + "learning_rate": 7.69248277171889e-05, + "loss": 1.5952, + "step": 16388 + }, + { + "epoch": 0.5869249914946192, + "grad_norm": 2.9401330947875977, + "learning_rate": 7.691354188597263e-05, + "loss": 1.8068, + "step": 16389 + }, + { + "epoch": 0.5869608036241876, + "grad_norm": 1.3495395183563232, + "learning_rate": 7.69022563653549e-05, + "loss": 1.2889, + "step": 16390 + }, + { + "epoch": 0.5869966157537558, + "grad_norm": 1.5951523780822754, + "learning_rate": 7.689097115548751e-05, + "loss": 1.5938, + "step": 16391 + }, + { + "epoch": 0.5870324278833241, + "grad_norm": 1.477667212486267, + "learning_rate": 7.68796862565223e-05, + "loss": 1.5373, + "step": 16392 + }, + { + "epoch": 0.5870682400128924, + "grad_norm": 1.5856584310531616, + "learning_rate": 7.686840166861106e-05, + "loss": 1.3693, + "step": 16393 + }, + { + "epoch": 0.5871040521424606, + "grad_norm": 1.5418838262557983, + "learning_rate": 7.685711739190568e-05, + "loss": 1.4448, + "step": 16394 + }, + { + "epoch": 0.5871398642720289, + "grad_norm": 1.6027543544769287, + "learning_rate": 7.684583342655791e-05, + "loss": 1.5726, + "step": 16395 + }, + { + "epoch": 0.5871756764015972, + "grad_norm": 1.5729893445968628, + "learning_rate": 7.68345497727196e-05, + "loss": 1.5488, + "step": 16396 + }, + { + "epoch": 0.5872114885311656, + "grad_norm": 1.2920430898666382, + "learning_rate": 7.682326643054254e-05, + "loss": 1.663, + "step": 16397 + }, + { + "epoch": 0.5872473006607338, + "grad_norm": 1.6387059688568115, + "learning_rate": 7.681198340017852e-05, + "loss": 1.3222, + "step": 16398 + }, + { + "epoch": 0.5872831127903021, + "grad_norm": 1.7342307567596436, + "learning_rate": 7.680070068177936e-05, + "loss": 1.3333, + "step": 16399 + }, + { + "epoch": 0.5873189249198704, + "grad_norm": 1.7177081108093262, + "learning_rate": 7.678941827549683e-05, + "loss": 1.5123, + "step": 16400 + }, + { + "epoch": 0.5873547370494386, + "grad_norm": 1.5641347169876099, + "learning_rate": 7.677813618148276e-05, + "loss": 1.479, + "step": 16401 + }, + { + "epoch": 0.5873905491790069, + "grad_norm": 1.8655952215194702, + "learning_rate": 7.67668543998889e-05, + "loss": 1.6996, + "step": 16402 + }, + { + "epoch": 0.5874263613085752, + "grad_norm": 1.3110804557800293, + "learning_rate": 7.675557293086706e-05, + "loss": 1.4526, + "step": 16403 + }, + { + "epoch": 0.5874621734381436, + "grad_norm": 1.8281629085540771, + "learning_rate": 7.674429177456899e-05, + "loss": 1.6103, + "step": 16404 + }, + { + "epoch": 0.5874979855677118, + "grad_norm": 1.439719557762146, + "learning_rate": 7.673301093114649e-05, + "loss": 1.2931, + "step": 16405 + }, + { + "epoch": 0.5875337976972801, + "grad_norm": 1.3842743635177612, + "learning_rate": 7.672173040075131e-05, + "loss": 1.3253, + "step": 16406 + }, + { + "epoch": 0.5875696098268484, + "grad_norm": 1.4752776622772217, + "learning_rate": 7.671045018353521e-05, + "loss": 1.2556, + "step": 16407 + }, + { + "epoch": 0.5876054219564166, + "grad_norm": 1.8519983291625977, + "learning_rate": 7.669917027964998e-05, + "loss": 1.3942, + "step": 16408 + }, + { + "epoch": 0.5876412340859849, + "grad_norm": 1.96647047996521, + "learning_rate": 7.668789068924734e-05, + "loss": 1.4563, + "step": 16409 + }, + { + "epoch": 0.5876770462155532, + "grad_norm": 1.5094881057739258, + "learning_rate": 7.667661141247907e-05, + "loss": 1.2547, + "step": 16410 + }, + { + "epoch": 0.5877128583451215, + "grad_norm": 1.5891088247299194, + "learning_rate": 7.66653324494969e-05, + "loss": 1.5648, + "step": 16411 + }, + { + "epoch": 0.5877486704746898, + "grad_norm": 1.6747032403945923, + "learning_rate": 7.665405380045258e-05, + "loss": 1.2975, + "step": 16412 + }, + { + "epoch": 0.5877844826042581, + "grad_norm": 1.7869306802749634, + "learning_rate": 7.664277546549786e-05, + "loss": 1.7527, + "step": 16413 + }, + { + "epoch": 0.5878202947338264, + "grad_norm": 2.474652051925659, + "learning_rate": 7.663149744478448e-05, + "loss": 1.7304, + "step": 16414 + }, + { + "epoch": 0.5878561068633946, + "grad_norm": 1.5477724075317383, + "learning_rate": 7.662021973846415e-05, + "loss": 1.2809, + "step": 16415 + }, + { + "epoch": 0.5878919189929629, + "grad_norm": 1.887107014656067, + "learning_rate": 7.660894234668859e-05, + "loss": 1.5836, + "step": 16416 + }, + { + "epoch": 0.5879277311225312, + "grad_norm": 1.466249942779541, + "learning_rate": 7.659766526960957e-05, + "loss": 1.607, + "step": 16417 + }, + { + "epoch": 0.5879635432520995, + "grad_norm": 1.4023653268814087, + "learning_rate": 7.658638850737874e-05, + "loss": 1.0977, + "step": 16418 + }, + { + "epoch": 0.5879993553816678, + "grad_norm": 1.5768914222717285, + "learning_rate": 7.657511206014788e-05, + "loss": 1.587, + "step": 16419 + }, + { + "epoch": 0.5880351675112361, + "grad_norm": 1.5604252815246582, + "learning_rate": 7.656383592806865e-05, + "loss": 1.4809, + "step": 16420 + }, + { + "epoch": 0.5880709796408043, + "grad_norm": 1.8900164365768433, + "learning_rate": 7.655256011129279e-05, + "loss": 1.368, + "step": 16421 + }, + { + "epoch": 0.5881067917703726, + "grad_norm": 1.759080171585083, + "learning_rate": 7.654128460997198e-05, + "loss": 1.4159, + "step": 16422 + }, + { + "epoch": 0.5881426038999409, + "grad_norm": 2.4119346141815186, + "learning_rate": 7.653000942425794e-05, + "loss": 1.3701, + "step": 16423 + }, + { + "epoch": 0.5881784160295092, + "grad_norm": 1.836087703704834, + "learning_rate": 7.651873455430237e-05, + "loss": 1.5565, + "step": 16424 + }, + { + "epoch": 0.5882142281590775, + "grad_norm": 1.5751034021377563, + "learning_rate": 7.650746000025688e-05, + "loss": 1.6149, + "step": 16425 + }, + { + "epoch": 0.5882500402886458, + "grad_norm": 2.3002188205718994, + "learning_rate": 7.649618576227325e-05, + "loss": 1.5969, + "step": 16426 + }, + { + "epoch": 0.5882858524182141, + "grad_norm": 1.5687283277511597, + "learning_rate": 7.648491184050311e-05, + "loss": 1.6116, + "step": 16427 + }, + { + "epoch": 0.5883216645477823, + "grad_norm": 1.529663324356079, + "learning_rate": 7.647363823509815e-05, + "loss": 1.1357, + "step": 16428 + }, + { + "epoch": 0.5883574766773506, + "grad_norm": 1.6907438039779663, + "learning_rate": 7.646236494621004e-05, + "loss": 1.4858, + "step": 16429 + }, + { + "epoch": 0.5883932888069189, + "grad_norm": 1.6002248525619507, + "learning_rate": 7.645109197399047e-05, + "loss": 1.3588, + "step": 16430 + }, + { + "epoch": 0.5884291009364871, + "grad_norm": 1.6536250114440918, + "learning_rate": 7.643981931859104e-05, + "loss": 1.396, + "step": 16431 + }, + { + "epoch": 0.5884649130660555, + "grad_norm": 1.6143574714660645, + "learning_rate": 7.642854698016348e-05, + "loss": 1.3903, + "step": 16432 + }, + { + "epoch": 0.5885007251956238, + "grad_norm": 1.4766873121261597, + "learning_rate": 7.641727495885944e-05, + "loss": 1.4478, + "step": 16433 + }, + { + "epoch": 0.5885365373251921, + "grad_norm": 1.8612972497940063, + "learning_rate": 7.640600325483049e-05, + "loss": 1.4193, + "step": 16434 + }, + { + "epoch": 0.5885723494547603, + "grad_norm": 1.6533550024032593, + "learning_rate": 7.639473186822839e-05, + "loss": 1.6117, + "step": 16435 + }, + { + "epoch": 0.5886081615843286, + "grad_norm": 1.6703141927719116, + "learning_rate": 7.638346079920466e-05, + "loss": 1.782, + "step": 16436 + }, + { + "epoch": 0.5886439737138969, + "grad_norm": 1.5920336246490479, + "learning_rate": 7.637219004791106e-05, + "loss": 1.3684, + "step": 16437 + }, + { + "epoch": 0.5886797858434651, + "grad_norm": 1.2391998767852783, + "learning_rate": 7.636091961449911e-05, + "loss": 1.4898, + "step": 16438 + }, + { + "epoch": 0.5887155979730334, + "grad_norm": 1.9100953340530396, + "learning_rate": 7.634964949912054e-05, + "loss": 1.7013, + "step": 16439 + }, + { + "epoch": 0.5887514101026018, + "grad_norm": 2.002978563308716, + "learning_rate": 7.633837970192694e-05, + "loss": 1.6605, + "step": 16440 + }, + { + "epoch": 0.5887872222321701, + "grad_norm": 2.9292209148406982, + "learning_rate": 7.632711022306985e-05, + "loss": 1.3659, + "step": 16441 + }, + { + "epoch": 0.5888230343617383, + "grad_norm": 2.129926919937134, + "learning_rate": 7.631584106270103e-05, + "loss": 1.3669, + "step": 16442 + }, + { + "epoch": 0.5888588464913066, + "grad_norm": 1.524235486984253, + "learning_rate": 7.630457222097196e-05, + "loss": 1.3326, + "step": 16443 + }, + { + "epoch": 0.5888946586208749, + "grad_norm": 1.5463957786560059, + "learning_rate": 7.629330369803435e-05, + "loss": 1.3502, + "step": 16444 + }, + { + "epoch": 0.5889304707504431, + "grad_norm": 1.6512547731399536, + "learning_rate": 7.628203549403971e-05, + "loss": 1.7101, + "step": 16445 + }, + { + "epoch": 0.5889662828800114, + "grad_norm": 2.546053171157837, + "learning_rate": 7.627076760913976e-05, + "loss": 1.4441, + "step": 16446 + }, + { + "epoch": 0.5890020950095798, + "grad_norm": 1.6109521389007568, + "learning_rate": 7.625950004348595e-05, + "loss": 1.4997, + "step": 16447 + }, + { + "epoch": 0.589037907139148, + "grad_norm": 1.9896047115325928, + "learning_rate": 7.624823279723001e-05, + "loss": 1.3661, + "step": 16448 + }, + { + "epoch": 0.5890737192687163, + "grad_norm": 1.843091368675232, + "learning_rate": 7.623696587052343e-05, + "loss": 1.5451, + "step": 16449 + }, + { + "epoch": 0.5891095313982846, + "grad_norm": 2.0600764751434326, + "learning_rate": 7.622569926351781e-05, + "loss": 1.4128, + "step": 16450 + }, + { + "epoch": 0.5891453435278529, + "grad_norm": 1.6426974534988403, + "learning_rate": 7.621443297636478e-05, + "loss": 1.5547, + "step": 16451 + }, + { + "epoch": 0.5891811556574211, + "grad_norm": 1.6177315711975098, + "learning_rate": 7.62031670092158e-05, + "loss": 1.3688, + "step": 16452 + }, + { + "epoch": 0.5892169677869894, + "grad_norm": 1.4145938158035278, + "learning_rate": 7.619190136222259e-05, + "loss": 1.3171, + "step": 16453 + }, + { + "epoch": 0.5892527799165578, + "grad_norm": 2.2485506534576416, + "learning_rate": 7.618063603553655e-05, + "loss": 1.3094, + "step": 16454 + }, + { + "epoch": 0.589288592046126, + "grad_norm": 1.2966976165771484, + "learning_rate": 7.616937102930942e-05, + "loss": 1.4065, + "step": 16455 + }, + { + "epoch": 0.5893244041756943, + "grad_norm": 1.8887548446655273, + "learning_rate": 7.61581063436926e-05, + "loss": 1.3098, + "step": 16456 + }, + { + "epoch": 0.5893602163052626, + "grad_norm": 1.525266408920288, + "learning_rate": 7.614684197883775e-05, + "loss": 1.3332, + "step": 16457 + }, + { + "epoch": 0.5893960284348309, + "grad_norm": 2.2836077213287354, + "learning_rate": 7.613557793489637e-05, + "loss": 1.5326, + "step": 16458 + }, + { + "epoch": 0.5894318405643991, + "grad_norm": 1.719794511795044, + "learning_rate": 7.612431421201996e-05, + "loss": 1.4174, + "step": 16459 + }, + { + "epoch": 0.5894676526939674, + "grad_norm": 1.5337207317352295, + "learning_rate": 7.611305081036015e-05, + "loss": 1.1962, + "step": 16460 + }, + { + "epoch": 0.5895034648235358, + "grad_norm": 1.6725982427597046, + "learning_rate": 7.61017877300684e-05, + "loss": 1.4057, + "step": 16461 + }, + { + "epoch": 0.589539276953104, + "grad_norm": 1.944868564605713, + "learning_rate": 7.609052497129629e-05, + "loss": 1.4168, + "step": 16462 + }, + { + "epoch": 0.5895750890826723, + "grad_norm": 1.736919641494751, + "learning_rate": 7.607926253419531e-05, + "loss": 1.619, + "step": 16463 + }, + { + "epoch": 0.5896109012122406, + "grad_norm": 1.4023191928863525, + "learning_rate": 7.606800041891701e-05, + "loss": 1.5406, + "step": 16464 + }, + { + "epoch": 0.5896467133418088, + "grad_norm": 2.086000919342041, + "learning_rate": 7.605673862561284e-05, + "loss": 1.5427, + "step": 16465 + }, + { + "epoch": 0.5896825254713771, + "grad_norm": 1.7645149230957031, + "learning_rate": 7.604547715443445e-05, + "loss": 1.5254, + "step": 16466 + }, + { + "epoch": 0.5897183376009454, + "grad_norm": 1.2208935022354126, + "learning_rate": 7.603421600553324e-05, + "loss": 1.5195, + "step": 16467 + }, + { + "epoch": 0.5897541497305138, + "grad_norm": 1.5771421194076538, + "learning_rate": 7.602295517906072e-05, + "loss": 1.5837, + "step": 16468 + }, + { + "epoch": 0.589789961860082, + "grad_norm": 1.4784822463989258, + "learning_rate": 7.601169467516844e-05, + "loss": 1.4173, + "step": 16469 + }, + { + "epoch": 0.5898257739896503, + "grad_norm": 1.6251667737960815, + "learning_rate": 7.600043449400782e-05, + "loss": 1.6069, + "step": 16470 + }, + { + "epoch": 0.5898615861192186, + "grad_norm": 1.9627031087875366, + "learning_rate": 7.598917463573044e-05, + "loss": 1.801, + "step": 16471 + }, + { + "epoch": 0.5898973982487868, + "grad_norm": 2.1740000247955322, + "learning_rate": 7.59779151004877e-05, + "loss": 1.352, + "step": 16472 + }, + { + "epoch": 0.5899332103783551, + "grad_norm": 1.7333546876907349, + "learning_rate": 7.596665588843117e-05, + "loss": 1.5806, + "step": 16473 + }, + { + "epoch": 0.5899690225079234, + "grad_norm": 1.9666831493377686, + "learning_rate": 7.595539699971225e-05, + "loss": 1.4948, + "step": 16474 + }, + { + "epoch": 0.5900048346374918, + "grad_norm": 1.692284345626831, + "learning_rate": 7.594413843448248e-05, + "loss": 1.4834, + "step": 16475 + }, + { + "epoch": 0.59004064676706, + "grad_norm": 1.3781365156173706, + "learning_rate": 7.593288019289329e-05, + "loss": 1.3872, + "step": 16476 + }, + { + "epoch": 0.5900764588966283, + "grad_norm": 1.7096667289733887, + "learning_rate": 7.592162227509614e-05, + "loss": 1.4666, + "step": 16477 + }, + { + "epoch": 0.5901122710261966, + "grad_norm": 1.9078587293624878, + "learning_rate": 7.591036468124252e-05, + "loss": 1.5795, + "step": 16478 + }, + { + "epoch": 0.5901480831557648, + "grad_norm": 1.6421924829483032, + "learning_rate": 7.589910741148384e-05, + "loss": 1.2502, + "step": 16479 + }, + { + "epoch": 0.5901838952853331, + "grad_norm": 1.5889708995819092, + "learning_rate": 7.588785046597161e-05, + "loss": 1.4928, + "step": 16480 + }, + { + "epoch": 0.5902197074149014, + "grad_norm": 2.91912841796875, + "learning_rate": 7.587659384485723e-05, + "loss": 1.848, + "step": 16481 + }, + { + "epoch": 0.5902555195444698, + "grad_norm": 1.248953938484192, + "learning_rate": 7.586533754829218e-05, + "loss": 1.4012, + "step": 16482 + }, + { + "epoch": 0.590291331674038, + "grad_norm": 1.8123250007629395, + "learning_rate": 7.585408157642786e-05, + "loss": 1.2366, + "step": 16483 + }, + { + "epoch": 0.5903271438036063, + "grad_norm": 1.7613189220428467, + "learning_rate": 7.584282592941574e-05, + "loss": 1.1722, + "step": 16484 + }, + { + "epoch": 0.5903629559331746, + "grad_norm": 1.6711210012435913, + "learning_rate": 7.583157060740727e-05, + "loss": 1.635, + "step": 16485 + }, + { + "epoch": 0.5903987680627428, + "grad_norm": 1.593625783920288, + "learning_rate": 7.582031561055378e-05, + "loss": 1.5147, + "step": 16486 + }, + { + "epoch": 0.5904345801923111, + "grad_norm": 1.475412130355835, + "learning_rate": 7.58090609390068e-05, + "loss": 1.5504, + "step": 16487 + }, + { + "epoch": 0.5904703923218794, + "grad_norm": 2.0667693614959717, + "learning_rate": 7.579780659291768e-05, + "loss": 1.4428, + "step": 16488 + }, + { + "epoch": 0.5905062044514477, + "grad_norm": 1.4998021125793457, + "learning_rate": 7.578655257243786e-05, + "loss": 1.5839, + "step": 16489 + }, + { + "epoch": 0.590542016581016, + "grad_norm": 1.578344702720642, + "learning_rate": 7.577529887771873e-05, + "loss": 1.4212, + "step": 16490 + }, + { + "epoch": 0.5905778287105843, + "grad_norm": 1.7631887197494507, + "learning_rate": 7.576404550891172e-05, + "loss": 1.5553, + "step": 16491 + }, + { + "epoch": 0.5906136408401526, + "grad_norm": 1.7021032571792603, + "learning_rate": 7.57527924661682e-05, + "loss": 1.458, + "step": 16492 + }, + { + "epoch": 0.5906494529697208, + "grad_norm": 1.9911140203475952, + "learning_rate": 7.57415397496396e-05, + "loss": 1.699, + "step": 16493 + }, + { + "epoch": 0.5906852650992891, + "grad_norm": 1.4853942394256592, + "learning_rate": 7.57302873594773e-05, + "loss": 1.3604, + "step": 16494 + }, + { + "epoch": 0.5907210772288574, + "grad_norm": 1.4939459562301636, + "learning_rate": 7.571903529583265e-05, + "loss": 1.4661, + "step": 16495 + }, + { + "epoch": 0.5907568893584257, + "grad_norm": 1.3259398937225342, + "learning_rate": 7.570778355885708e-05, + "loss": 1.5328, + "step": 16496 + }, + { + "epoch": 0.590792701487994, + "grad_norm": 1.5033601522445679, + "learning_rate": 7.569653214870192e-05, + "loss": 1.6475, + "step": 16497 + }, + { + "epoch": 0.5908285136175623, + "grad_norm": 1.707831621170044, + "learning_rate": 7.568528106551862e-05, + "loss": 1.1923, + "step": 16498 + }, + { + "epoch": 0.5908643257471305, + "grad_norm": 2.281423807144165, + "learning_rate": 7.567403030945844e-05, + "loss": 1.4786, + "step": 16499 + }, + { + "epoch": 0.5909001378766988, + "grad_norm": 1.7174972295761108, + "learning_rate": 7.566277988067285e-05, + "loss": 1.4596, + "step": 16500 + }, + { + "epoch": 0.5909359500062671, + "grad_norm": 1.3764506578445435, + "learning_rate": 7.565152977931314e-05, + "loss": 1.4684, + "step": 16501 + }, + { + "epoch": 0.5909717621358354, + "grad_norm": 2.216895818710327, + "learning_rate": 7.56402800055307e-05, + "loss": 1.5763, + "step": 16502 + }, + { + "epoch": 0.5910075742654037, + "grad_norm": 1.5239827632904053, + "learning_rate": 7.562903055947688e-05, + "loss": 1.2643, + "step": 16503 + }, + { + "epoch": 0.591043386394972, + "grad_norm": 1.4618558883666992, + "learning_rate": 7.561778144130299e-05, + "loss": 1.4947, + "step": 16504 + }, + { + "epoch": 0.5910791985245403, + "grad_norm": 1.6704903841018677, + "learning_rate": 7.560653265116042e-05, + "loss": 1.4775, + "step": 16505 + }, + { + "epoch": 0.5911150106541085, + "grad_norm": 1.5482014417648315, + "learning_rate": 7.559528418920048e-05, + "loss": 1.3616, + "step": 16506 + }, + { + "epoch": 0.5911508227836768, + "grad_norm": 1.870289921760559, + "learning_rate": 7.558403605557453e-05, + "loss": 1.4348, + "step": 16507 + }, + { + "epoch": 0.5911866349132451, + "grad_norm": 1.4968332052230835, + "learning_rate": 7.557278825043385e-05, + "loss": 1.3695, + "step": 16508 + }, + { + "epoch": 0.5912224470428133, + "grad_norm": 1.5539411306381226, + "learning_rate": 7.556154077392982e-05, + "loss": 1.3545, + "step": 16509 + }, + { + "epoch": 0.5912582591723817, + "grad_norm": 1.5246566534042358, + "learning_rate": 7.555029362621371e-05, + "loss": 1.4596, + "step": 16510 + }, + { + "epoch": 0.59129407130195, + "grad_norm": 1.3691539764404297, + "learning_rate": 7.553904680743688e-05, + "loss": 1.4245, + "step": 16511 + }, + { + "epoch": 0.5913298834315183, + "grad_norm": 1.5674803256988525, + "learning_rate": 7.552780031775064e-05, + "loss": 1.3345, + "step": 16512 + }, + { + "epoch": 0.5913656955610865, + "grad_norm": 1.5378525257110596, + "learning_rate": 7.551655415730624e-05, + "loss": 1.5885, + "step": 16513 + }, + { + "epoch": 0.5914015076906548, + "grad_norm": 1.648198127746582, + "learning_rate": 7.550530832625505e-05, + "loss": 1.5619, + "step": 16514 + }, + { + "epoch": 0.5914373198202231, + "grad_norm": 2.732581615447998, + "learning_rate": 7.549406282474833e-05, + "loss": 2.0635, + "step": 16515 + }, + { + "epoch": 0.5914731319497913, + "grad_norm": 1.5190062522888184, + "learning_rate": 7.548281765293739e-05, + "loss": 1.583, + "step": 16516 + }, + { + "epoch": 0.5915089440793597, + "grad_norm": 1.6483665704727173, + "learning_rate": 7.54715728109735e-05, + "loss": 1.2961, + "step": 16517 + }, + { + "epoch": 0.591544756208928, + "grad_norm": 1.692216396331787, + "learning_rate": 7.546032829900797e-05, + "loss": 1.451, + "step": 16518 + }, + { + "epoch": 0.5915805683384963, + "grad_norm": 1.4392499923706055, + "learning_rate": 7.544908411719207e-05, + "loss": 1.3544, + "step": 16519 + }, + { + "epoch": 0.5916163804680645, + "grad_norm": 1.5235916376113892, + "learning_rate": 7.543784026567708e-05, + "loss": 1.4944, + "step": 16520 + }, + { + "epoch": 0.5916521925976328, + "grad_norm": 1.9779636859893799, + "learning_rate": 7.542659674461429e-05, + "loss": 1.3904, + "step": 16521 + }, + { + "epoch": 0.5916880047272011, + "grad_norm": 1.3239980936050415, + "learning_rate": 7.541535355415487e-05, + "loss": 1.524, + "step": 16522 + }, + { + "epoch": 0.5917238168567693, + "grad_norm": 1.3218424320220947, + "learning_rate": 7.540411069445021e-05, + "loss": 1.4584, + "step": 16523 + }, + { + "epoch": 0.5917596289863377, + "grad_norm": 1.6652659177780151, + "learning_rate": 7.53928681656515e-05, + "loss": 1.1819, + "step": 16524 + }, + { + "epoch": 0.591795441115906, + "grad_norm": 1.502694010734558, + "learning_rate": 7.538162596791002e-05, + "loss": 1.8572, + "step": 16525 + }, + { + "epoch": 0.5918312532454743, + "grad_norm": 1.5385327339172363, + "learning_rate": 7.537038410137698e-05, + "loss": 1.2326, + "step": 16526 + }, + { + "epoch": 0.5918670653750425, + "grad_norm": 1.5926456451416016, + "learning_rate": 7.535914256620368e-05, + "loss": 1.6148, + "step": 16527 + }, + { + "epoch": 0.5919028775046108, + "grad_norm": 1.8524401187896729, + "learning_rate": 7.534790136254132e-05, + "loss": 1.1339, + "step": 16528 + }, + { + "epoch": 0.5919386896341791, + "grad_norm": 1.430527687072754, + "learning_rate": 7.533666049054115e-05, + "loss": 1.4224, + "step": 16529 + }, + { + "epoch": 0.5919745017637473, + "grad_norm": 1.5670427083969116, + "learning_rate": 7.532541995035444e-05, + "loss": 1.6936, + "step": 16530 + }, + { + "epoch": 0.5920103138933157, + "grad_norm": 1.8079187870025635, + "learning_rate": 7.53141797421323e-05, + "loss": 1.2269, + "step": 16531 + }, + { + "epoch": 0.592046126022884, + "grad_norm": 1.840510368347168, + "learning_rate": 7.53029398660261e-05, + "loss": 1.5701, + "step": 16532 + }, + { + "epoch": 0.5920819381524522, + "grad_norm": 1.3353474140167236, + "learning_rate": 7.529170032218691e-05, + "loss": 1.6003, + "step": 16533 + }, + { + "epoch": 0.5921177502820205, + "grad_norm": 2.0196423530578613, + "learning_rate": 7.52804611107661e-05, + "loss": 1.605, + "step": 16534 + }, + { + "epoch": 0.5921535624115888, + "grad_norm": 1.7749592065811157, + "learning_rate": 7.526922223191473e-05, + "loss": 1.8357, + "step": 16535 + }, + { + "epoch": 0.592189374541157, + "grad_norm": 1.9717570543289185, + "learning_rate": 7.525798368578412e-05, + "loss": 1.4984, + "step": 16536 + }, + { + "epoch": 0.5922251866707253, + "grad_norm": 1.611171841621399, + "learning_rate": 7.524674547252544e-05, + "loss": 1.3366, + "step": 16537 + }, + { + "epoch": 0.5922609988002937, + "grad_norm": 1.4326066970825195, + "learning_rate": 7.523550759228981e-05, + "loss": 1.5233, + "step": 16538 + }, + { + "epoch": 0.592296810929862, + "grad_norm": 1.420731782913208, + "learning_rate": 7.522427004522855e-05, + "loss": 1.4304, + "step": 16539 + }, + { + "epoch": 0.5923326230594302, + "grad_norm": 2.4821457862854004, + "learning_rate": 7.52130328314927e-05, + "loss": 1.534, + "step": 16540 + }, + { + "epoch": 0.5923684351889985, + "grad_norm": 1.7809147834777832, + "learning_rate": 7.52017959512336e-05, + "loss": 1.3436, + "step": 16541 + }, + { + "epoch": 0.5924042473185668, + "grad_norm": 1.487674593925476, + "learning_rate": 7.519055940460227e-05, + "loss": 1.3221, + "step": 16542 + }, + { + "epoch": 0.592440059448135, + "grad_norm": 1.630618929862976, + "learning_rate": 7.517932319175003e-05, + "loss": 1.391, + "step": 16543 + }, + { + "epoch": 0.5924758715777033, + "grad_norm": 1.7148510217666626, + "learning_rate": 7.516808731282793e-05, + "loss": 1.6317, + "step": 16544 + }, + { + "epoch": 0.5925116837072717, + "grad_norm": 1.6634539365768433, + "learning_rate": 7.515685176798723e-05, + "loss": 1.8021, + "step": 16545 + }, + { + "epoch": 0.59254749583684, + "grad_norm": 1.7055913209915161, + "learning_rate": 7.514561655737904e-05, + "loss": 1.7112, + "step": 16546 + }, + { + "epoch": 0.5925833079664082, + "grad_norm": 1.6069657802581787, + "learning_rate": 7.513438168115449e-05, + "loss": 1.4381, + "step": 16547 + }, + { + "epoch": 0.5926191200959765, + "grad_norm": 1.99888277053833, + "learning_rate": 7.512314713946478e-05, + "loss": 1.677, + "step": 16548 + }, + { + "epoch": 0.5926549322255448, + "grad_norm": 1.9482409954071045, + "learning_rate": 7.5111912932461e-05, + "loss": 1.0121, + "step": 16549 + }, + { + "epoch": 0.592690744355113, + "grad_norm": 1.8642888069152832, + "learning_rate": 7.510067906029437e-05, + "loss": 1.5993, + "step": 16550 + }, + { + "epoch": 0.5927265564846813, + "grad_norm": 1.965591311454773, + "learning_rate": 7.508944552311594e-05, + "loss": 1.4873, + "step": 16551 + }, + { + "epoch": 0.5927623686142497, + "grad_norm": 1.6098365783691406, + "learning_rate": 7.507821232107695e-05, + "loss": 1.433, + "step": 16552 + }, + { + "epoch": 0.592798180743818, + "grad_norm": 1.4152309894561768, + "learning_rate": 7.506697945432841e-05, + "loss": 1.7259, + "step": 16553 + }, + { + "epoch": 0.5928339928733862, + "grad_norm": 1.5540013313293457, + "learning_rate": 7.505574692302155e-05, + "loss": 1.408, + "step": 16554 + }, + { + "epoch": 0.5928698050029545, + "grad_norm": 1.8713033199310303, + "learning_rate": 7.504451472730743e-05, + "loss": 1.4247, + "step": 16555 + }, + { + "epoch": 0.5929056171325228, + "grad_norm": 1.4902887344360352, + "learning_rate": 7.503328286733715e-05, + "loss": 1.4305, + "step": 16556 + }, + { + "epoch": 0.592941429262091, + "grad_norm": 2.143108367919922, + "learning_rate": 7.502205134326185e-05, + "loss": 1.4066, + "step": 16557 + }, + { + "epoch": 0.5929772413916593, + "grad_norm": 2.025181770324707, + "learning_rate": 7.501082015523263e-05, + "loss": 1.6475, + "step": 16558 + }, + { + "epoch": 0.5930130535212277, + "grad_norm": 2.7611522674560547, + "learning_rate": 7.499958930340061e-05, + "loss": 1.6847, + "step": 16559 + }, + { + "epoch": 0.593048865650796, + "grad_norm": 1.5839300155639648, + "learning_rate": 7.498835878791684e-05, + "loss": 1.6008, + "step": 16560 + }, + { + "epoch": 0.5930846777803642, + "grad_norm": 1.7940725088119507, + "learning_rate": 7.497712860893245e-05, + "loss": 1.2224, + "step": 16561 + }, + { + "epoch": 0.5931204899099325, + "grad_norm": 1.4097790718078613, + "learning_rate": 7.49658987665985e-05, + "loss": 1.5644, + "step": 16562 + }, + { + "epoch": 0.5931563020395008, + "grad_norm": 1.7357324361801147, + "learning_rate": 7.495466926106614e-05, + "loss": 1.3663, + "step": 16563 + }, + { + "epoch": 0.593192114169069, + "grad_norm": 2.125781774520874, + "learning_rate": 7.494344009248637e-05, + "loss": 1.6129, + "step": 16564 + }, + { + "epoch": 0.5932279262986373, + "grad_norm": 1.732550024986267, + "learning_rate": 7.493221126101028e-05, + "loss": 1.6356, + "step": 16565 + }, + { + "epoch": 0.5932637384282057, + "grad_norm": 1.9407490491867065, + "learning_rate": 7.492098276678898e-05, + "loss": 1.3264, + "step": 16566 + }, + { + "epoch": 0.593299550557774, + "grad_norm": 1.5045716762542725, + "learning_rate": 7.490975460997348e-05, + "loss": 1.1289, + "step": 16567 + }, + { + "epoch": 0.5933353626873422, + "grad_norm": 1.425619125366211, + "learning_rate": 7.489852679071488e-05, + "loss": 1.4394, + "step": 16568 + }, + { + "epoch": 0.5933711748169105, + "grad_norm": 2.01013445854187, + "learning_rate": 7.488729930916421e-05, + "loss": 1.4089, + "step": 16569 + }, + { + "epoch": 0.5934069869464788, + "grad_norm": 1.647908329963684, + "learning_rate": 7.487607216547255e-05, + "loss": 1.6508, + "step": 16570 + }, + { + "epoch": 0.593442799076047, + "grad_norm": 1.4094040393829346, + "learning_rate": 7.486484535979092e-05, + "loss": 1.5401, + "step": 16571 + }, + { + "epoch": 0.5934786112056153, + "grad_norm": 1.2045484781265259, + "learning_rate": 7.485361889227038e-05, + "loss": 1.2202, + "step": 16572 + }, + { + "epoch": 0.5935144233351837, + "grad_norm": 1.9746378660202026, + "learning_rate": 7.484239276306198e-05, + "loss": 1.4754, + "step": 16573 + }, + { + "epoch": 0.5935502354647519, + "grad_norm": 1.6177870035171509, + "learning_rate": 7.483116697231671e-05, + "loss": 1.2873, + "step": 16574 + }, + { + "epoch": 0.5935860475943202, + "grad_norm": 2.0700132846832275, + "learning_rate": 7.481994152018563e-05, + "loss": 1.1141, + "step": 16575 + }, + { + "epoch": 0.5936218597238885, + "grad_norm": 1.6111782789230347, + "learning_rate": 7.480871640681975e-05, + "loss": 1.4394, + "step": 16576 + }, + { + "epoch": 0.5936576718534567, + "grad_norm": 1.553205966949463, + "learning_rate": 7.479749163237012e-05, + "loss": 1.3981, + "step": 16577 + }, + { + "epoch": 0.593693483983025, + "grad_norm": 1.7631208896636963, + "learning_rate": 7.47862671969877e-05, + "loss": 1.7236, + "step": 16578 + }, + { + "epoch": 0.5937292961125933, + "grad_norm": 1.4806108474731445, + "learning_rate": 7.477504310082354e-05, + "loss": 1.5397, + "step": 16579 + }, + { + "epoch": 0.5937651082421617, + "grad_norm": 1.9398987293243408, + "learning_rate": 7.476381934402865e-05, + "loss": 1.2193, + "step": 16580 + }, + { + "epoch": 0.5938009203717299, + "grad_norm": 1.654647946357727, + "learning_rate": 7.475259592675402e-05, + "loss": 1.4312, + "step": 16581 + }, + { + "epoch": 0.5938367325012982, + "grad_norm": 1.9441183805465698, + "learning_rate": 7.474137284915065e-05, + "loss": 1.4922, + "step": 16582 + }, + { + "epoch": 0.5938725446308665, + "grad_norm": 2.232104539871216, + "learning_rate": 7.47301501113695e-05, + "loss": 1.2633, + "step": 16583 + }, + { + "epoch": 0.5939083567604347, + "grad_norm": 1.7381802797317505, + "learning_rate": 7.471892771356161e-05, + "loss": 1.4915, + "step": 16584 + }, + { + "epoch": 0.593944168890003, + "grad_norm": 1.7541508674621582, + "learning_rate": 7.470770565587792e-05, + "loss": 1.4452, + "step": 16585 + }, + { + "epoch": 0.5939799810195713, + "grad_norm": 1.4216136932373047, + "learning_rate": 7.469648393846943e-05, + "loss": 1.3694, + "step": 16586 + }, + { + "epoch": 0.5940157931491397, + "grad_norm": 1.4462414979934692, + "learning_rate": 7.46852625614871e-05, + "loss": 1.5004, + "step": 16587 + }, + { + "epoch": 0.5940516052787079, + "grad_norm": 1.6865832805633545, + "learning_rate": 7.467404152508193e-05, + "loss": 1.8021, + "step": 16588 + }, + { + "epoch": 0.5940874174082762, + "grad_norm": 2.2418625354766846, + "learning_rate": 7.466282082940484e-05, + "loss": 1.493, + "step": 16589 + }, + { + "epoch": 0.5941232295378445, + "grad_norm": 2.0363099575042725, + "learning_rate": 7.465160047460685e-05, + "loss": 1.7929, + "step": 16590 + }, + { + "epoch": 0.5941590416674127, + "grad_norm": 1.8767224550247192, + "learning_rate": 7.464038046083885e-05, + "loss": 1.5985, + "step": 16591 + }, + { + "epoch": 0.594194853796981, + "grad_norm": 1.5032994747161865, + "learning_rate": 7.462916078825182e-05, + "loss": 1.4272, + "step": 16592 + }, + { + "epoch": 0.5942306659265493, + "grad_norm": 1.4486925601959229, + "learning_rate": 7.46179414569967e-05, + "loss": 1.4855, + "step": 16593 + }, + { + "epoch": 0.5942664780561177, + "grad_norm": 1.592706561088562, + "learning_rate": 7.460672246722444e-05, + "loss": 1.7044, + "step": 16594 + }, + { + "epoch": 0.5943022901856859, + "grad_norm": 1.9221786260604858, + "learning_rate": 7.4595503819086e-05, + "loss": 1.2772, + "step": 16595 + }, + { + "epoch": 0.5943381023152542, + "grad_norm": 1.6944142580032349, + "learning_rate": 7.458428551273226e-05, + "loss": 1.4654, + "step": 16596 + }, + { + "epoch": 0.5943739144448225, + "grad_norm": 2.0230796337127686, + "learning_rate": 7.45730675483142e-05, + "loss": 1.3047, + "step": 16597 + }, + { + "epoch": 0.5944097265743907, + "grad_norm": 1.6869089603424072, + "learning_rate": 7.456184992598267e-05, + "loss": 1.5513, + "step": 16598 + }, + { + "epoch": 0.594445538703959, + "grad_norm": 1.6149792671203613, + "learning_rate": 7.455063264588869e-05, + "loss": 1.4627, + "step": 16599 + }, + { + "epoch": 0.5944813508335273, + "grad_norm": 1.7246642112731934, + "learning_rate": 7.453941570818309e-05, + "loss": 1.502, + "step": 16600 + }, + { + "epoch": 0.5945171629630956, + "grad_norm": 1.4698948860168457, + "learning_rate": 7.452819911301681e-05, + "loss": 1.1747, + "step": 16601 + }, + { + "epoch": 0.5945529750926639, + "grad_norm": 2.224147081375122, + "learning_rate": 7.451698286054076e-05, + "loss": 1.7846, + "step": 16602 + }, + { + "epoch": 0.5945887872222322, + "grad_norm": 1.9010647535324097, + "learning_rate": 7.450576695090583e-05, + "loss": 1.1994, + "step": 16603 + }, + { + "epoch": 0.5946245993518005, + "grad_norm": 1.8986421823501587, + "learning_rate": 7.449455138426294e-05, + "loss": 1.392, + "step": 16604 + }, + { + "epoch": 0.5946604114813687, + "grad_norm": 2.1596598625183105, + "learning_rate": 7.448333616076293e-05, + "loss": 1.5052, + "step": 16605 + }, + { + "epoch": 0.594696223610937, + "grad_norm": 2.1113686561584473, + "learning_rate": 7.447212128055675e-05, + "loss": 1.2468, + "step": 16606 + }, + { + "epoch": 0.5947320357405053, + "grad_norm": 1.4451698064804077, + "learning_rate": 7.446090674379522e-05, + "loss": 1.6236, + "step": 16607 + }, + { + "epoch": 0.5947678478700736, + "grad_norm": 2.0924429893493652, + "learning_rate": 7.444969255062928e-05, + "loss": 1.4995, + "step": 16608 + }, + { + "epoch": 0.5948036599996419, + "grad_norm": 1.6229060888290405, + "learning_rate": 7.443847870120976e-05, + "loss": 1.452, + "step": 16609 + }, + { + "epoch": 0.5948394721292102, + "grad_norm": 1.5565780401229858, + "learning_rate": 7.442726519568751e-05, + "loss": 1.4328, + "step": 16610 + }, + { + "epoch": 0.5948752842587784, + "grad_norm": 1.7966111898422241, + "learning_rate": 7.441605203421345e-05, + "loss": 1.5491, + "step": 16611 + }, + { + "epoch": 0.5949110963883467, + "grad_norm": 1.5467952489852905, + "learning_rate": 7.440483921693839e-05, + "loss": 1.5181, + "step": 16612 + }, + { + "epoch": 0.594946908517915, + "grad_norm": 1.3843365907669067, + "learning_rate": 7.439362674401322e-05, + "loss": 1.3912, + "step": 16613 + }, + { + "epoch": 0.5949827206474833, + "grad_norm": 1.6058661937713623, + "learning_rate": 7.438241461558875e-05, + "loss": 1.5003, + "step": 16614 + }, + { + "epoch": 0.5950185327770516, + "grad_norm": 1.2036961317062378, + "learning_rate": 7.437120283181586e-05, + "loss": 1.3581, + "step": 16615 + }, + { + "epoch": 0.5950543449066199, + "grad_norm": 1.3170970678329468, + "learning_rate": 7.435999139284538e-05, + "loss": 1.4601, + "step": 16616 + }, + { + "epoch": 0.5950901570361882, + "grad_norm": 2.2052175998687744, + "learning_rate": 7.434878029882814e-05, + "loss": 1.5695, + "step": 16617 + }, + { + "epoch": 0.5951259691657564, + "grad_norm": 1.6902546882629395, + "learning_rate": 7.433756954991499e-05, + "loss": 1.9375, + "step": 16618 + }, + { + "epoch": 0.5951617812953247, + "grad_norm": 1.6723322868347168, + "learning_rate": 7.43263591462567e-05, + "loss": 1.5111, + "step": 16619 + }, + { + "epoch": 0.595197593424893, + "grad_norm": 1.820304274559021, + "learning_rate": 7.431514908800417e-05, + "loss": 1.4817, + "step": 16620 + }, + { + "epoch": 0.5952334055544612, + "grad_norm": 2.129159927368164, + "learning_rate": 7.430393937530815e-05, + "loss": 1.5987, + "step": 16621 + }, + { + "epoch": 0.5952692176840296, + "grad_norm": 1.5790053606033325, + "learning_rate": 7.429273000831949e-05, + "loss": 1.5129, + "step": 16622 + }, + { + "epoch": 0.5953050298135979, + "grad_norm": 1.2789185047149658, + "learning_rate": 7.4281520987189e-05, + "loss": 1.4375, + "step": 16623 + }, + { + "epoch": 0.5953408419431662, + "grad_norm": 1.4424225091934204, + "learning_rate": 7.427031231206745e-05, + "loss": 1.2032, + "step": 16624 + }, + { + "epoch": 0.5953766540727344, + "grad_norm": 2.2218496799468994, + "learning_rate": 7.425910398310566e-05, + "loss": 1.3099, + "step": 16625 + }, + { + "epoch": 0.5954124662023027, + "grad_norm": 2.0501346588134766, + "learning_rate": 7.424789600045444e-05, + "loss": 1.5333, + "step": 16626 + }, + { + "epoch": 0.595448278331871, + "grad_norm": 1.7061489820480347, + "learning_rate": 7.423668836426458e-05, + "loss": 1.6931, + "step": 16627 + }, + { + "epoch": 0.5954840904614392, + "grad_norm": 2.1097772121429443, + "learning_rate": 7.422548107468679e-05, + "loss": 1.2726, + "step": 16628 + }, + { + "epoch": 0.5955199025910076, + "grad_norm": 1.6923251152038574, + "learning_rate": 7.421427413187197e-05, + "loss": 1.7505, + "step": 16629 + }, + { + "epoch": 0.5955557147205759, + "grad_norm": 1.5730112791061401, + "learning_rate": 7.420306753597076e-05, + "loss": 1.4845, + "step": 16630 + }, + { + "epoch": 0.5955915268501442, + "grad_norm": 1.3994970321655273, + "learning_rate": 7.419186128713407e-05, + "loss": 1.2497, + "step": 16631 + }, + { + "epoch": 0.5956273389797124, + "grad_norm": 1.8320046663284302, + "learning_rate": 7.418065538551253e-05, + "loss": 1.3596, + "step": 16632 + }, + { + "epoch": 0.5956631511092807, + "grad_norm": 3.0585572719573975, + "learning_rate": 7.4169449831257e-05, + "loss": 1.7986, + "step": 16633 + }, + { + "epoch": 0.595698963238849, + "grad_norm": 1.4562921524047852, + "learning_rate": 7.415824462451824e-05, + "loss": 1.3, + "step": 16634 + }, + { + "epoch": 0.5957347753684172, + "grad_norm": 1.64504075050354, + "learning_rate": 7.41470397654469e-05, + "loss": 1.5275, + "step": 16635 + }, + { + "epoch": 0.5957705874979856, + "grad_norm": 1.7294472455978394, + "learning_rate": 7.413583525419385e-05, + "loss": 1.3219, + "step": 16636 + }, + { + "epoch": 0.5958063996275539, + "grad_norm": 1.6882243156433105, + "learning_rate": 7.41246310909097e-05, + "loss": 1.5301, + "step": 16637 + }, + { + "epoch": 0.5958422117571222, + "grad_norm": 1.7502877712249756, + "learning_rate": 7.411342727574533e-05, + "loss": 1.6065, + "step": 16638 + }, + { + "epoch": 0.5958780238866904, + "grad_norm": 1.149178385734558, + "learning_rate": 7.410222380885135e-05, + "loss": 1.2456, + "step": 16639 + }, + { + "epoch": 0.5959138360162587, + "grad_norm": 1.7622125148773193, + "learning_rate": 7.409102069037862e-05, + "loss": 1.017, + "step": 16640 + }, + { + "epoch": 0.595949648145827, + "grad_norm": 2.7804136276245117, + "learning_rate": 7.407981792047769e-05, + "loss": 1.479, + "step": 16641 + }, + { + "epoch": 0.5959854602753952, + "grad_norm": 1.9207881689071655, + "learning_rate": 7.406861549929946e-05, + "loss": 1.4618, + "step": 16642 + }, + { + "epoch": 0.5960212724049636, + "grad_norm": 1.6801304817199707, + "learning_rate": 7.405741342699453e-05, + "loss": 1.2735, + "step": 16643 + }, + { + "epoch": 0.5960570845345319, + "grad_norm": 2.0418813228607178, + "learning_rate": 7.404621170371362e-05, + "loss": 1.4934, + "step": 16644 + }, + { + "epoch": 0.5960928966641001, + "grad_norm": 1.2861592769622803, + "learning_rate": 7.403501032960748e-05, + "loss": 1.4293, + "step": 16645 + }, + { + "epoch": 0.5961287087936684, + "grad_norm": 1.7379697561264038, + "learning_rate": 7.402380930482673e-05, + "loss": 1.3013, + "step": 16646 + }, + { + "epoch": 0.5961645209232367, + "grad_norm": 1.4268943071365356, + "learning_rate": 7.40126086295222e-05, + "loss": 1.5411, + "step": 16647 + }, + { + "epoch": 0.596200333052805, + "grad_norm": 1.8618836402893066, + "learning_rate": 7.400140830384443e-05, + "loss": 1.5819, + "step": 16648 + }, + { + "epoch": 0.5962361451823732, + "grad_norm": 2.0353612899780273, + "learning_rate": 7.399020832794424e-05, + "loss": 1.4602, + "step": 16649 + }, + { + "epoch": 0.5962719573119416, + "grad_norm": 1.4291691780090332, + "learning_rate": 7.397900870197216e-05, + "loss": 1.5755, + "step": 16650 + }, + { + "epoch": 0.5963077694415099, + "grad_norm": 1.5784722566604614, + "learning_rate": 7.396780942607904e-05, + "loss": 1.2951, + "step": 16651 + }, + { + "epoch": 0.5963435815710781, + "grad_norm": 2.4634132385253906, + "learning_rate": 7.395661050041545e-05, + "loss": 1.3903, + "step": 16652 + }, + { + "epoch": 0.5963793937006464, + "grad_norm": 1.3411083221435547, + "learning_rate": 7.394541192513202e-05, + "loss": 1.5628, + "step": 16653 + }, + { + "epoch": 0.5964152058302147, + "grad_norm": 1.625428557395935, + "learning_rate": 7.393421370037952e-05, + "loss": 1.4502, + "step": 16654 + }, + { + "epoch": 0.596451017959783, + "grad_norm": 1.308756947517395, + "learning_rate": 7.392301582630852e-05, + "loss": 0.9473, + "step": 16655 + }, + { + "epoch": 0.5964868300893512, + "grad_norm": 1.509726881980896, + "learning_rate": 7.391181830306972e-05, + "loss": 1.2107, + "step": 16656 + }, + { + "epoch": 0.5965226422189196, + "grad_norm": 1.3004424571990967, + "learning_rate": 7.390062113081373e-05, + "loss": 1.6063, + "step": 16657 + }, + { + "epoch": 0.5965584543484879, + "grad_norm": 1.2375189065933228, + "learning_rate": 7.388942430969123e-05, + "loss": 1.6043, + "step": 16658 + }, + { + "epoch": 0.5965942664780561, + "grad_norm": 1.9687907695770264, + "learning_rate": 7.387822783985283e-05, + "loss": 1.3229, + "step": 16659 + }, + { + "epoch": 0.5966300786076244, + "grad_norm": 2.021730422973633, + "learning_rate": 7.386703172144921e-05, + "loss": 1.2441, + "step": 16660 + }, + { + "epoch": 0.5966658907371927, + "grad_norm": 1.6941550970077515, + "learning_rate": 7.385583595463099e-05, + "loss": 1.7031, + "step": 16661 + }, + { + "epoch": 0.5967017028667609, + "grad_norm": 1.5119400024414062, + "learning_rate": 7.384464053954872e-05, + "loss": 1.5488, + "step": 16662 + }, + { + "epoch": 0.5967375149963292, + "grad_norm": 1.5219899415969849, + "learning_rate": 7.383344547635311e-05, + "loss": 1.378, + "step": 16663 + }, + { + "epoch": 0.5967733271258976, + "grad_norm": 1.6702772378921509, + "learning_rate": 7.382225076519471e-05, + "loss": 1.1648, + "step": 16664 + }, + { + "epoch": 0.5968091392554659, + "grad_norm": 1.8304907083511353, + "learning_rate": 7.381105640622419e-05, + "loss": 1.3808, + "step": 16665 + }, + { + "epoch": 0.5968449513850341, + "grad_norm": 1.6182093620300293, + "learning_rate": 7.379986239959209e-05, + "loss": 1.4653, + "step": 16666 + }, + { + "epoch": 0.5968807635146024, + "grad_norm": 1.697573184967041, + "learning_rate": 7.378866874544908e-05, + "loss": 1.0452, + "step": 16667 + }, + { + "epoch": 0.5969165756441707, + "grad_norm": 1.5860910415649414, + "learning_rate": 7.377747544394568e-05, + "loss": 1.2655, + "step": 16668 + }, + { + "epoch": 0.5969523877737389, + "grad_norm": 1.4699441194534302, + "learning_rate": 7.376628249523257e-05, + "loss": 1.4476, + "step": 16669 + }, + { + "epoch": 0.5969881999033072, + "grad_norm": 1.399438738822937, + "learning_rate": 7.375508989946027e-05, + "loss": 1.5486, + "step": 16670 + }, + { + "epoch": 0.5970240120328756, + "grad_norm": 1.9118127822875977, + "learning_rate": 7.374389765677938e-05, + "loss": 1.3724, + "step": 16671 + }, + { + "epoch": 0.5970598241624439, + "grad_norm": 1.6800737380981445, + "learning_rate": 7.373270576734048e-05, + "loss": 1.6724, + "step": 16672 + }, + { + "epoch": 0.5970956362920121, + "grad_norm": 1.4176125526428223, + "learning_rate": 7.372151423129414e-05, + "loss": 1.4472, + "step": 16673 + }, + { + "epoch": 0.5971314484215804, + "grad_norm": 1.7875847816467285, + "learning_rate": 7.371032304879094e-05, + "loss": 1.4848, + "step": 16674 + }, + { + "epoch": 0.5971672605511487, + "grad_norm": 1.5433893203735352, + "learning_rate": 7.369913221998141e-05, + "loss": 1.4387, + "step": 16675 + }, + { + "epoch": 0.5972030726807169, + "grad_norm": 2.274228572845459, + "learning_rate": 7.368794174501615e-05, + "loss": 1.5471, + "step": 16676 + }, + { + "epoch": 0.5972388848102852, + "grad_norm": 2.2298521995544434, + "learning_rate": 7.367675162404567e-05, + "loss": 1.6403, + "step": 16677 + }, + { + "epoch": 0.5972746969398536, + "grad_norm": 1.9916534423828125, + "learning_rate": 7.366556185722056e-05, + "loss": 1.4881, + "step": 16678 + }, + { + "epoch": 0.5973105090694218, + "grad_norm": 2.0193593502044678, + "learning_rate": 7.365437244469135e-05, + "loss": 1.4196, + "step": 16679 + }, + { + "epoch": 0.5973463211989901, + "grad_norm": 1.4831794500350952, + "learning_rate": 7.364318338660858e-05, + "loss": 1.7932, + "step": 16680 + }, + { + "epoch": 0.5973821333285584, + "grad_norm": 1.533564567565918, + "learning_rate": 7.363199468312277e-05, + "loss": 1.289, + "step": 16681 + }, + { + "epoch": 0.5974179454581267, + "grad_norm": 2.7487010955810547, + "learning_rate": 7.362080633438445e-05, + "loss": 1.6209, + "step": 16682 + }, + { + "epoch": 0.5974537575876949, + "grad_norm": 1.5415210723876953, + "learning_rate": 7.360961834054418e-05, + "loss": 1.7845, + "step": 16683 + }, + { + "epoch": 0.5974895697172632, + "grad_norm": 1.641719937324524, + "learning_rate": 7.359843070175242e-05, + "loss": 1.4157, + "step": 16684 + }, + { + "epoch": 0.5975253818468316, + "grad_norm": 1.4241138696670532, + "learning_rate": 7.358724341815975e-05, + "loss": 1.3276, + "step": 16685 + }, + { + "epoch": 0.5975611939763998, + "grad_norm": 2.071657419204712, + "learning_rate": 7.357605648991661e-05, + "loss": 1.293, + "step": 16686 + }, + { + "epoch": 0.5975970061059681, + "grad_norm": 1.8686021566390991, + "learning_rate": 7.356486991717359e-05, + "loss": 1.3658, + "step": 16687 + }, + { + "epoch": 0.5976328182355364, + "grad_norm": 2.0502803325653076, + "learning_rate": 7.355368370008113e-05, + "loss": 1.5443, + "step": 16688 + }, + { + "epoch": 0.5976686303651046, + "grad_norm": 1.549391508102417, + "learning_rate": 7.354249783878973e-05, + "loss": 1.4303, + "step": 16689 + }, + { + "epoch": 0.5977044424946729, + "grad_norm": 1.3476611375808716, + "learning_rate": 7.353131233344991e-05, + "loss": 1.569, + "step": 16690 + }, + { + "epoch": 0.5977402546242412, + "grad_norm": 1.3979164361953735, + "learning_rate": 7.352012718421212e-05, + "loss": 1.2161, + "step": 16691 + }, + { + "epoch": 0.5977760667538096, + "grad_norm": 2.414729595184326, + "learning_rate": 7.350894239122689e-05, + "loss": 1.8839, + "step": 16692 + }, + { + "epoch": 0.5978118788833778, + "grad_norm": 1.8490568399429321, + "learning_rate": 7.349775795464466e-05, + "loss": 1.3075, + "step": 16693 + }, + { + "epoch": 0.5978476910129461, + "grad_norm": 1.7273000478744507, + "learning_rate": 7.348657387461591e-05, + "loss": 1.4859, + "step": 16694 + }, + { + "epoch": 0.5978835031425144, + "grad_norm": 1.6168313026428223, + "learning_rate": 7.34753901512911e-05, + "loss": 1.6608, + "step": 16695 + }, + { + "epoch": 0.5979193152720826, + "grad_norm": 1.6971771717071533, + "learning_rate": 7.346420678482071e-05, + "loss": 1.2613, + "step": 16696 + }, + { + "epoch": 0.5979551274016509, + "grad_norm": 1.7362477779388428, + "learning_rate": 7.345302377535521e-05, + "loss": 1.3894, + "step": 16697 + }, + { + "epoch": 0.5979909395312192, + "grad_norm": 2.0205066204071045, + "learning_rate": 7.3441841123045e-05, + "loss": 1.6138, + "step": 16698 + }, + { + "epoch": 0.5980267516607876, + "grad_norm": 2.0709989070892334, + "learning_rate": 7.343065882804056e-05, + "loss": 1.0796, + "step": 16699 + }, + { + "epoch": 0.5980625637903558, + "grad_norm": 1.900892734527588, + "learning_rate": 7.341947689049233e-05, + "loss": 1.3942, + "step": 16700 + }, + { + "epoch": 0.5980983759199241, + "grad_norm": 1.6259151697158813, + "learning_rate": 7.340829531055078e-05, + "loss": 1.0872, + "step": 16701 + }, + { + "epoch": 0.5981341880494924, + "grad_norm": 2.2745935916900635, + "learning_rate": 7.339711408836629e-05, + "loss": 1.7529, + "step": 16702 + }, + { + "epoch": 0.5981700001790606, + "grad_norm": 1.6044317483901978, + "learning_rate": 7.338593322408933e-05, + "loss": 1.6281, + "step": 16703 + }, + { + "epoch": 0.5982058123086289, + "grad_norm": 2.226635217666626, + "learning_rate": 7.33747527178703e-05, + "loss": 1.595, + "step": 16704 + }, + { + "epoch": 0.5982416244381972, + "grad_norm": 2.034383535385132, + "learning_rate": 7.336357256985964e-05, + "loss": 1.4936, + "step": 16705 + }, + { + "epoch": 0.5982774365677656, + "grad_norm": 1.4976028203964233, + "learning_rate": 7.335239278020776e-05, + "loss": 1.532, + "step": 16706 + }, + { + "epoch": 0.5983132486973338, + "grad_norm": 1.4269556999206543, + "learning_rate": 7.334121334906503e-05, + "loss": 1.4254, + "step": 16707 + }, + { + "epoch": 0.5983490608269021, + "grad_norm": 1.4683499336242676, + "learning_rate": 7.333003427658192e-05, + "loss": 1.6565, + "step": 16708 + }, + { + "epoch": 0.5983848729564704, + "grad_norm": 2.059978485107422, + "learning_rate": 7.331885556290876e-05, + "loss": 1.6096, + "step": 16709 + }, + { + "epoch": 0.5984206850860386, + "grad_norm": 1.7053067684173584, + "learning_rate": 7.330767720819601e-05, + "loss": 1.6937, + "step": 16710 + }, + { + "epoch": 0.5984564972156069, + "grad_norm": 2.052213191986084, + "learning_rate": 7.329649921259402e-05, + "loss": 1.4403, + "step": 16711 + }, + { + "epoch": 0.5984923093451752, + "grad_norm": 1.58275306224823, + "learning_rate": 7.32853215762532e-05, + "loss": 1.3232, + "step": 16712 + }, + { + "epoch": 0.5985281214747435, + "grad_norm": 1.7357984781265259, + "learning_rate": 7.32741442993239e-05, + "loss": 1.4713, + "step": 16713 + }, + { + "epoch": 0.5985639336043118, + "grad_norm": 1.503201961517334, + "learning_rate": 7.326296738195654e-05, + "loss": 1.5623, + "step": 16714 + }, + { + "epoch": 0.5985997457338801, + "grad_norm": 1.5212639570236206, + "learning_rate": 7.325179082430148e-05, + "loss": 1.3255, + "step": 16715 + }, + { + "epoch": 0.5986355578634484, + "grad_norm": 1.991188645362854, + "learning_rate": 7.324061462650901e-05, + "loss": 1.4, + "step": 16716 + }, + { + "epoch": 0.5986713699930166, + "grad_norm": 2.1019017696380615, + "learning_rate": 7.32294387887296e-05, + "loss": 1.3448, + "step": 16717 + }, + { + "epoch": 0.5987071821225849, + "grad_norm": 1.5686511993408203, + "learning_rate": 7.321826331111353e-05, + "loss": 1.709, + "step": 16718 + }, + { + "epoch": 0.5987429942521532, + "grad_norm": 1.8359214067459106, + "learning_rate": 7.320708819381121e-05, + "loss": 1.534, + "step": 16719 + }, + { + "epoch": 0.5987788063817215, + "grad_norm": 1.9091947078704834, + "learning_rate": 7.319591343697293e-05, + "loss": 1.4201, + "step": 16720 + }, + { + "epoch": 0.5988146185112898, + "grad_norm": 1.5145343542099, + "learning_rate": 7.31847390407491e-05, + "loss": 1.4507, + "step": 16721 + }, + { + "epoch": 0.5988504306408581, + "grad_norm": 1.864920973777771, + "learning_rate": 7.317356500528996e-05, + "loss": 1.43, + "step": 16722 + }, + { + "epoch": 0.5988862427704263, + "grad_norm": 1.6559187173843384, + "learning_rate": 7.316239133074595e-05, + "loss": 1.3573, + "step": 16723 + }, + { + "epoch": 0.5989220548999946, + "grad_norm": 1.7337490320205688, + "learning_rate": 7.315121801726737e-05, + "loss": 1.2266, + "step": 16724 + }, + { + "epoch": 0.5989578670295629, + "grad_norm": 1.527982473373413, + "learning_rate": 7.314004506500443e-05, + "loss": 1.6621, + "step": 16725 + }, + { + "epoch": 0.5989936791591312, + "grad_norm": 1.4964022636413574, + "learning_rate": 7.312887247410762e-05, + "loss": 1.3453, + "step": 16726 + }, + { + "epoch": 0.5990294912886995, + "grad_norm": 1.426389455795288, + "learning_rate": 7.311770024472711e-05, + "loss": 1.503, + "step": 16727 + }, + { + "epoch": 0.5990653034182678, + "grad_norm": 1.8016306161880493, + "learning_rate": 7.31065283770133e-05, + "loss": 1.5566, + "step": 16728 + }, + { + "epoch": 0.5991011155478361, + "grad_norm": 1.6403863430023193, + "learning_rate": 7.309535687111644e-05, + "loss": 1.613, + "step": 16729 + }, + { + "epoch": 0.5991369276774043, + "grad_norm": 1.8085709810256958, + "learning_rate": 7.308418572718687e-05, + "loss": 1.7136, + "step": 16730 + }, + { + "epoch": 0.5991727398069726, + "grad_norm": 1.6913074254989624, + "learning_rate": 7.307301494537489e-05, + "loss": 1.2859, + "step": 16731 + }, + { + "epoch": 0.5992085519365409, + "grad_norm": 1.4169158935546875, + "learning_rate": 7.306184452583067e-05, + "loss": 1.6721, + "step": 16732 + }, + { + "epoch": 0.5992443640661091, + "grad_norm": 1.5620194673538208, + "learning_rate": 7.305067446870468e-05, + "loss": 1.4362, + "step": 16733 + }, + { + "epoch": 0.5992801761956775, + "grad_norm": 1.7782098054885864, + "learning_rate": 7.303950477414703e-05, + "loss": 1.5194, + "step": 16734 + }, + { + "epoch": 0.5993159883252458, + "grad_norm": 1.9278711080551147, + "learning_rate": 7.302833544230812e-05, + "loss": 1.2504, + "step": 16735 + }, + { + "epoch": 0.5993518004548141, + "grad_norm": 1.8028457164764404, + "learning_rate": 7.301716647333812e-05, + "loss": 1.0734, + "step": 16736 + }, + { + "epoch": 0.5993876125843823, + "grad_norm": 1.6356738805770874, + "learning_rate": 7.300599786738739e-05, + "loss": 1.6322, + "step": 16737 + }, + { + "epoch": 0.5994234247139506, + "grad_norm": 1.7443691492080688, + "learning_rate": 7.299482962460607e-05, + "loss": 1.4234, + "step": 16738 + }, + { + "epoch": 0.5994592368435189, + "grad_norm": 1.5225340127944946, + "learning_rate": 7.298366174514456e-05, + "loss": 1.5641, + "step": 16739 + }, + { + "epoch": 0.5994950489730871, + "grad_norm": 1.35145103931427, + "learning_rate": 7.297249422915301e-05, + "loss": 1.4949, + "step": 16740 + }, + { + "epoch": 0.5995308611026555, + "grad_norm": 2.0662713050842285, + "learning_rate": 7.296132707678166e-05, + "loss": 1.4635, + "step": 16741 + }, + { + "epoch": 0.5995666732322238, + "grad_norm": 1.6183621883392334, + "learning_rate": 7.29501602881808e-05, + "loss": 1.6438, + "step": 16742 + }, + { + "epoch": 0.5996024853617921, + "grad_norm": 2.1140263080596924, + "learning_rate": 7.29389938635006e-05, + "loss": 1.778, + "step": 16743 + }, + { + "epoch": 0.5996382974913603, + "grad_norm": 1.8030223846435547, + "learning_rate": 7.292782780289141e-05, + "loss": 1.6374, + "step": 16744 + }, + { + "epoch": 0.5996741096209286, + "grad_norm": 1.7146693468093872, + "learning_rate": 7.291666210650328e-05, + "loss": 1.3417, + "step": 16745 + }, + { + "epoch": 0.5997099217504969, + "grad_norm": 1.5830776691436768, + "learning_rate": 7.290549677448661e-05, + "loss": 1.4825, + "step": 16746 + }, + { + "epoch": 0.5997457338800651, + "grad_norm": 2.0402321815490723, + "learning_rate": 7.289433180699148e-05, + "loss": 1.2888, + "step": 16747 + }, + { + "epoch": 0.5997815460096335, + "grad_norm": 1.7229087352752686, + "learning_rate": 7.28831672041682e-05, + "loss": 1.4803, + "step": 16748 + }, + { + "epoch": 0.5998173581392018, + "grad_norm": 1.6451315879821777, + "learning_rate": 7.287200296616689e-05, + "loss": 1.5543, + "step": 16749 + }, + { + "epoch": 0.59985317026877, + "grad_norm": 2.058530569076538, + "learning_rate": 7.286083909313779e-05, + "loss": 1.4938, + "step": 16750 + }, + { + "epoch": 0.5998889823983383, + "grad_norm": 1.7364871501922607, + "learning_rate": 7.284967558523112e-05, + "loss": 1.5158, + "step": 16751 + }, + { + "epoch": 0.5999247945279066, + "grad_norm": 1.551857829093933, + "learning_rate": 7.2838512442597e-05, + "loss": 1.2004, + "step": 16752 + }, + { + "epoch": 0.5999606066574749, + "grad_norm": 1.3351510763168335, + "learning_rate": 7.282734966538569e-05, + "loss": 1.6516, + "step": 16753 + }, + { + "epoch": 0.5999964187870431, + "grad_norm": 1.86201012134552, + "learning_rate": 7.281618725374733e-05, + "loss": 1.3853, + "step": 16754 + }, + { + "epoch": 0.6000322309166115, + "grad_norm": 1.8424255847930908, + "learning_rate": 7.28050252078321e-05, + "loss": 1.5038, + "step": 16755 + }, + { + "epoch": 0.6000680430461798, + "grad_norm": 1.396933913230896, + "learning_rate": 7.279386352779016e-05, + "loss": 1.3897, + "step": 16756 + }, + { + "epoch": 0.600103855175748, + "grad_norm": 1.5650379657745361, + "learning_rate": 7.278270221377174e-05, + "loss": 1.2216, + "step": 16757 + }, + { + "epoch": 0.6001396673053163, + "grad_norm": 1.6911360025405884, + "learning_rate": 7.277154126592695e-05, + "loss": 1.413, + "step": 16758 + }, + { + "epoch": 0.6001754794348846, + "grad_norm": 1.6357327699661255, + "learning_rate": 7.276038068440592e-05, + "loss": 1.1571, + "step": 16759 + }, + { + "epoch": 0.6002112915644529, + "grad_norm": 1.6942464113235474, + "learning_rate": 7.274922046935885e-05, + "loss": 1.5146, + "step": 16760 + }, + { + "epoch": 0.6002471036940211, + "grad_norm": 1.8150880336761475, + "learning_rate": 7.273806062093585e-05, + "loss": 1.4704, + "step": 16761 + }, + { + "epoch": 0.6002829158235895, + "grad_norm": 1.3599683046340942, + "learning_rate": 7.27269011392871e-05, + "loss": 1.3613, + "step": 16762 + }, + { + "epoch": 0.6003187279531578, + "grad_norm": 1.4246058464050293, + "learning_rate": 7.271574202456268e-05, + "loss": 1.6298, + "step": 16763 + }, + { + "epoch": 0.600354540082726, + "grad_norm": 1.431622862815857, + "learning_rate": 7.27045832769128e-05, + "loss": 1.4792, + "step": 16764 + }, + { + "epoch": 0.6003903522122943, + "grad_norm": 2.5366933345794678, + "learning_rate": 7.269342489648752e-05, + "loss": 1.4837, + "step": 16765 + }, + { + "epoch": 0.6004261643418626, + "grad_norm": 1.9744517803192139, + "learning_rate": 7.268226688343699e-05, + "loss": 1.2326, + "step": 16766 + }, + { + "epoch": 0.6004619764714308, + "grad_norm": 1.7462183237075806, + "learning_rate": 7.267110923791133e-05, + "loss": 1.5414, + "step": 16767 + }, + { + "epoch": 0.6004977886009991, + "grad_norm": 1.5139094591140747, + "learning_rate": 7.265995196006062e-05, + "loss": 1.6375, + "step": 16768 + }, + { + "epoch": 0.6005336007305675, + "grad_norm": 1.9999083280563354, + "learning_rate": 7.264879505003502e-05, + "loss": 1.3057, + "step": 16769 + }, + { + "epoch": 0.6005694128601358, + "grad_norm": 1.6535240411758423, + "learning_rate": 7.263763850798458e-05, + "loss": 1.7807, + "step": 16770 + }, + { + "epoch": 0.600605224989704, + "grad_norm": 1.6899582147598267, + "learning_rate": 7.262648233405942e-05, + "loss": 1.2351, + "step": 16771 + }, + { + "epoch": 0.6006410371192723, + "grad_norm": 1.7519525289535522, + "learning_rate": 7.261532652840964e-05, + "loss": 1.4308, + "step": 16772 + }, + { + "epoch": 0.6006768492488406, + "grad_norm": 1.5700947046279907, + "learning_rate": 7.260417109118531e-05, + "loss": 1.3495, + "step": 16773 + }, + { + "epoch": 0.6007126613784088, + "grad_norm": 1.5926108360290527, + "learning_rate": 7.259301602253652e-05, + "loss": 1.6475, + "step": 16774 + }, + { + "epoch": 0.6007484735079771, + "grad_norm": 1.5498440265655518, + "learning_rate": 7.258186132261336e-05, + "loss": 1.6338, + "step": 16775 + }, + { + "epoch": 0.6007842856375455, + "grad_norm": 1.9977786540985107, + "learning_rate": 7.25707069915659e-05, + "loss": 1.3661, + "step": 16776 + }, + { + "epoch": 0.6008200977671138, + "grad_norm": 1.361795425415039, + "learning_rate": 7.255955302954416e-05, + "loss": 1.2156, + "step": 16777 + }, + { + "epoch": 0.600855909896682, + "grad_norm": 1.5042104721069336, + "learning_rate": 7.254839943669826e-05, + "loss": 1.3412, + "step": 16778 + }, + { + "epoch": 0.6008917220262503, + "grad_norm": 1.656468391418457, + "learning_rate": 7.253724621317822e-05, + "loss": 1.5451, + "step": 16779 + }, + { + "epoch": 0.6009275341558186, + "grad_norm": 1.2738529443740845, + "learning_rate": 7.252609335913413e-05, + "loss": 1.284, + "step": 16780 + }, + { + "epoch": 0.6009633462853868, + "grad_norm": 1.751894235610962, + "learning_rate": 7.251494087471599e-05, + "loss": 1.5876, + "step": 16781 + }, + { + "epoch": 0.6009991584149551, + "grad_norm": 1.967529058456421, + "learning_rate": 7.250378876007389e-05, + "loss": 1.2286, + "step": 16782 + }, + { + "epoch": 0.6010349705445235, + "grad_norm": 1.7119519710540771, + "learning_rate": 7.249263701535782e-05, + "loss": 1.6704, + "step": 16783 + }, + { + "epoch": 0.6010707826740918, + "grad_norm": 2.0444655418395996, + "learning_rate": 7.248148564071787e-05, + "loss": 1.3567, + "step": 16784 + }, + { + "epoch": 0.60110659480366, + "grad_norm": 2.1970937252044678, + "learning_rate": 7.247033463630402e-05, + "loss": 1.665, + "step": 16785 + }, + { + "epoch": 0.6011424069332283, + "grad_norm": 1.688825249671936, + "learning_rate": 7.24591840022663e-05, + "loss": 1.3525, + "step": 16786 + }, + { + "epoch": 0.6011782190627966, + "grad_norm": 1.3591644763946533, + "learning_rate": 7.244803373875475e-05, + "loss": 1.4548, + "step": 16787 + }, + { + "epoch": 0.6012140311923648, + "grad_norm": 2.0499813556671143, + "learning_rate": 7.243688384591934e-05, + "loss": 1.4286, + "step": 16788 + }, + { + "epoch": 0.6012498433219331, + "grad_norm": 1.7680226564407349, + "learning_rate": 7.242573432391012e-05, + "loss": 1.5824, + "step": 16789 + }, + { + "epoch": 0.6012856554515015, + "grad_norm": 1.749014139175415, + "learning_rate": 7.241458517287708e-05, + "loss": 1.6337, + "step": 16790 + }, + { + "epoch": 0.6013214675810697, + "grad_norm": 1.4293020963668823, + "learning_rate": 7.24034363929702e-05, + "loss": 1.6886, + "step": 16791 + }, + { + "epoch": 0.601357279710638, + "grad_norm": 1.5241018533706665, + "learning_rate": 7.23922879843395e-05, + "loss": 1.5131, + "step": 16792 + }, + { + "epoch": 0.6013930918402063, + "grad_norm": 2.091395854949951, + "learning_rate": 7.238113994713495e-05, + "loss": 1.5334, + "step": 16793 + }, + { + "epoch": 0.6014289039697746, + "grad_norm": 1.7409805059432983, + "learning_rate": 7.236999228150654e-05, + "loss": 1.5341, + "step": 16794 + }, + { + "epoch": 0.6014647160993428, + "grad_norm": 1.6296451091766357, + "learning_rate": 7.235884498760423e-05, + "loss": 1.4788, + "step": 16795 + }, + { + "epoch": 0.6015005282289111, + "grad_norm": 1.537999153137207, + "learning_rate": 7.234769806557802e-05, + "loss": 1.5781, + "step": 16796 + }, + { + "epoch": 0.6015363403584795, + "grad_norm": 1.5315535068511963, + "learning_rate": 7.233655151557786e-05, + "loss": 1.5663, + "step": 16797 + }, + { + "epoch": 0.6015721524880477, + "grad_norm": 1.7069329023361206, + "learning_rate": 7.232540533775371e-05, + "loss": 1.4498, + "step": 16798 + }, + { + "epoch": 0.601607964617616, + "grad_norm": 1.808366298675537, + "learning_rate": 7.231425953225552e-05, + "loss": 1.7831, + "step": 16799 + }, + { + "epoch": 0.6016437767471843, + "grad_norm": 1.2750335931777954, + "learning_rate": 7.230311409923329e-05, + "loss": 1.6704, + "step": 16800 + }, + { + "epoch": 0.6016795888767525, + "grad_norm": 2.135979652404785, + "learning_rate": 7.22919690388369e-05, + "loss": 1.5089, + "step": 16801 + }, + { + "epoch": 0.6017154010063208, + "grad_norm": 1.8281000852584839, + "learning_rate": 7.228082435121636e-05, + "loss": 1.7816, + "step": 16802 + }, + { + "epoch": 0.6017512131358891, + "grad_norm": 1.9501540660858154, + "learning_rate": 7.226968003652157e-05, + "loss": 1.4132, + "step": 16803 + }, + { + "epoch": 0.6017870252654575, + "grad_norm": 2.566105604171753, + "learning_rate": 7.225853609490244e-05, + "loss": 1.3759, + "step": 16804 + }, + { + "epoch": 0.6018228373950257, + "grad_norm": 1.4011183977127075, + "learning_rate": 7.224739252650894e-05, + "loss": 1.3426, + "step": 16805 + }, + { + "epoch": 0.601858649524594, + "grad_norm": 1.7511323690414429, + "learning_rate": 7.223624933149095e-05, + "loss": 1.3261, + "step": 16806 + }, + { + "epoch": 0.6018944616541623, + "grad_norm": 2.664832830429077, + "learning_rate": 7.222510650999845e-05, + "loss": 1.573, + "step": 16807 + }, + { + "epoch": 0.6019302737837305, + "grad_norm": 1.4400556087493896, + "learning_rate": 7.221396406218129e-05, + "loss": 1.2956, + "step": 16808 + }, + { + "epoch": 0.6019660859132988, + "grad_norm": 1.4470281600952148, + "learning_rate": 7.220282198818941e-05, + "loss": 1.5883, + "step": 16809 + }, + { + "epoch": 0.6020018980428671, + "grad_norm": 1.4601818323135376, + "learning_rate": 7.21916802881727e-05, + "loss": 1.4075, + "step": 16810 + }, + { + "epoch": 0.6020377101724355, + "grad_norm": 1.3011351823806763, + "learning_rate": 7.218053896228107e-05, + "loss": 1.2083, + "step": 16811 + }, + { + "epoch": 0.6020735223020037, + "grad_norm": 2.1279044151306152, + "learning_rate": 7.216939801066444e-05, + "loss": 1.5229, + "step": 16812 + }, + { + "epoch": 0.602109334431572, + "grad_norm": 1.5820890665054321, + "learning_rate": 7.215825743347259e-05, + "loss": 1.2355, + "step": 16813 + }, + { + "epoch": 0.6021451465611403, + "grad_norm": 1.6486124992370605, + "learning_rate": 7.214711723085553e-05, + "loss": 1.3337, + "step": 16814 + }, + { + "epoch": 0.6021809586907085, + "grad_norm": 1.8116123676300049, + "learning_rate": 7.213597740296304e-05, + "loss": 1.3902, + "step": 16815 + }, + { + "epoch": 0.6022167708202768, + "grad_norm": 1.5074397325515747, + "learning_rate": 7.212483794994503e-05, + "loss": 1.1192, + "step": 16816 + }, + { + "epoch": 0.6022525829498451, + "grad_norm": 1.316300868988037, + "learning_rate": 7.211369887195139e-05, + "loss": 1.3241, + "step": 16817 + }, + { + "epoch": 0.6022883950794135, + "grad_norm": 2.0315091609954834, + "learning_rate": 7.210256016913195e-05, + "loss": 1.5004, + "step": 16818 + }, + { + "epoch": 0.6023242072089817, + "grad_norm": 1.8143523931503296, + "learning_rate": 7.209142184163657e-05, + "loss": 1.6522, + "step": 16819 + }, + { + "epoch": 0.60236001933855, + "grad_norm": 1.3392434120178223, + "learning_rate": 7.208028388961515e-05, + "loss": 1.2916, + "step": 16820 + }, + { + "epoch": 0.6023958314681183, + "grad_norm": 1.4817171096801758, + "learning_rate": 7.206914631321749e-05, + "loss": 1.3744, + "step": 16821 + }, + { + "epoch": 0.6024316435976865, + "grad_norm": 1.1809767484664917, + "learning_rate": 7.205800911259338e-05, + "loss": 1.3433, + "step": 16822 + }, + { + "epoch": 0.6024674557272548, + "grad_norm": 1.8400154113769531, + "learning_rate": 7.204687228789279e-05, + "loss": 1.5053, + "step": 16823 + }, + { + "epoch": 0.6025032678568231, + "grad_norm": 1.5287377834320068, + "learning_rate": 7.20357358392654e-05, + "loss": 1.6242, + "step": 16824 + }, + { + "epoch": 0.6025390799863914, + "grad_norm": 1.6333818435668945, + "learning_rate": 7.202459976686118e-05, + "loss": 1.249, + "step": 16825 + }, + { + "epoch": 0.6025748921159597, + "grad_norm": 1.624626636505127, + "learning_rate": 7.201346407082982e-05, + "loss": 1.3627, + "step": 16826 + }, + { + "epoch": 0.602610704245528, + "grad_norm": 1.5464283227920532, + "learning_rate": 7.200232875132127e-05, + "loss": 1.652, + "step": 16827 + }, + { + "epoch": 0.6026465163750963, + "grad_norm": 1.678821325302124, + "learning_rate": 7.199119380848525e-05, + "loss": 1.4797, + "step": 16828 + }, + { + "epoch": 0.6026823285046645, + "grad_norm": 1.8502992391586304, + "learning_rate": 7.198005924247155e-05, + "loss": 1.7994, + "step": 16829 + }, + { + "epoch": 0.6027181406342328, + "grad_norm": 1.5439445972442627, + "learning_rate": 7.196892505343007e-05, + "loss": 1.1767, + "step": 16830 + }, + { + "epoch": 0.6027539527638011, + "grad_norm": 1.6171842813491821, + "learning_rate": 7.195779124151048e-05, + "loss": 1.3557, + "step": 16831 + }, + { + "epoch": 0.6027897648933694, + "grad_norm": 2.662693500518799, + "learning_rate": 7.19466578068627e-05, + "loss": 1.3941, + "step": 16832 + }, + { + "epoch": 0.6028255770229377, + "grad_norm": 1.5511195659637451, + "learning_rate": 7.193552474963638e-05, + "loss": 1.5937, + "step": 16833 + }, + { + "epoch": 0.602861389152506, + "grad_norm": 1.55390202999115, + "learning_rate": 7.192439206998146e-05, + "loss": 1.432, + "step": 16834 + }, + { + "epoch": 0.6028972012820742, + "grad_norm": 1.8486980199813843, + "learning_rate": 7.191325976804754e-05, + "loss": 1.3208, + "step": 16835 + }, + { + "epoch": 0.6029330134116425, + "grad_norm": 2.2025701999664307, + "learning_rate": 7.190212784398458e-05, + "loss": 1.6723, + "step": 16836 + }, + { + "epoch": 0.6029688255412108, + "grad_norm": 1.3402276039123535, + "learning_rate": 7.18909962979422e-05, + "loss": 1.279, + "step": 16837 + }, + { + "epoch": 0.603004637670779, + "grad_norm": 1.858518362045288, + "learning_rate": 7.187986513007018e-05, + "loss": 1.645, + "step": 16838 + }, + { + "epoch": 0.6030404498003473, + "grad_norm": 1.7115235328674316, + "learning_rate": 7.186873434051832e-05, + "loss": 1.5733, + "step": 16839 + }, + { + "epoch": 0.6030762619299157, + "grad_norm": 1.9213659763336182, + "learning_rate": 7.185760392943637e-05, + "loss": 1.2916, + "step": 16840 + }, + { + "epoch": 0.603112074059484, + "grad_norm": 1.2515188455581665, + "learning_rate": 7.184647389697405e-05, + "loss": 1.5181, + "step": 16841 + }, + { + "epoch": 0.6031478861890522, + "grad_norm": 1.9560471773147583, + "learning_rate": 7.183534424328106e-05, + "loss": 1.5256, + "step": 16842 + }, + { + "epoch": 0.6031836983186205, + "grad_norm": 1.352102518081665, + "learning_rate": 7.182421496850726e-05, + "loss": 1.4133, + "step": 16843 + }, + { + "epoch": 0.6032195104481888, + "grad_norm": 1.5485777854919434, + "learning_rate": 7.181308607280223e-05, + "loss": 1.5151, + "step": 16844 + }, + { + "epoch": 0.603255322577757, + "grad_norm": 1.3369213342666626, + "learning_rate": 7.180195755631584e-05, + "loss": 1.2579, + "step": 16845 + }, + { + "epoch": 0.6032911347073253, + "grad_norm": 2.329124689102173, + "learning_rate": 7.179082941919773e-05, + "loss": 1.3283, + "step": 16846 + }, + { + "epoch": 0.6033269468368937, + "grad_norm": 1.8115071058273315, + "learning_rate": 7.177970166159758e-05, + "loss": 1.5473, + "step": 16847 + }, + { + "epoch": 0.603362758966462, + "grad_norm": 1.4668792486190796, + "learning_rate": 7.176857428366517e-05, + "loss": 1.108, + "step": 16848 + }, + { + "epoch": 0.6033985710960302, + "grad_norm": 1.9151508808135986, + "learning_rate": 7.175744728555016e-05, + "loss": 1.3883, + "step": 16849 + }, + { + "epoch": 0.6034343832255985, + "grad_norm": 1.7193862199783325, + "learning_rate": 7.174632066740227e-05, + "loss": 1.9886, + "step": 16850 + }, + { + "epoch": 0.6034701953551668, + "grad_norm": 1.6349965333938599, + "learning_rate": 7.17351944293712e-05, + "loss": 1.7273, + "step": 16851 + }, + { + "epoch": 0.603506007484735, + "grad_norm": 1.7464351654052734, + "learning_rate": 7.172406857160662e-05, + "loss": 1.6175, + "step": 16852 + }, + { + "epoch": 0.6035418196143033, + "grad_norm": 1.512633204460144, + "learning_rate": 7.171294309425823e-05, + "loss": 1.5267, + "step": 16853 + }, + { + "epoch": 0.6035776317438717, + "grad_norm": 1.607930064201355, + "learning_rate": 7.17018179974757e-05, + "loss": 1.4871, + "step": 16854 + }, + { + "epoch": 0.60361344387344, + "grad_norm": 1.959700345993042, + "learning_rate": 7.169069328140872e-05, + "loss": 1.3557, + "step": 16855 + }, + { + "epoch": 0.6036492560030082, + "grad_norm": 1.5639288425445557, + "learning_rate": 7.167956894620694e-05, + "loss": 1.1726, + "step": 16856 + }, + { + "epoch": 0.6036850681325765, + "grad_norm": 1.6049681901931763, + "learning_rate": 7.166844499202002e-05, + "loss": 1.658, + "step": 16857 + }, + { + "epoch": 0.6037208802621448, + "grad_norm": 1.4732575416564941, + "learning_rate": 7.165732141899761e-05, + "loss": 1.4717, + "step": 16858 + }, + { + "epoch": 0.603756692391713, + "grad_norm": 2.0398313999176025, + "learning_rate": 7.164619822728941e-05, + "loss": 1.4927, + "step": 16859 + }, + { + "epoch": 0.6037925045212813, + "grad_norm": 1.3296444416046143, + "learning_rate": 7.163507541704503e-05, + "loss": 1.5785, + "step": 16860 + }, + { + "epoch": 0.6038283166508497, + "grad_norm": 1.3880574703216553, + "learning_rate": 7.162395298841414e-05, + "loss": 1.5667, + "step": 16861 + }, + { + "epoch": 0.603864128780418, + "grad_norm": 1.4414122104644775, + "learning_rate": 7.161283094154633e-05, + "loss": 1.5618, + "step": 16862 + }, + { + "epoch": 0.6038999409099862, + "grad_norm": 2.2014966011047363, + "learning_rate": 7.160170927659128e-05, + "loss": 1.4817, + "step": 16863 + }, + { + "epoch": 0.6039357530395545, + "grad_norm": 2.323162078857422, + "learning_rate": 7.159058799369861e-05, + "loss": 1.346, + "step": 16864 + }, + { + "epoch": 0.6039715651691228, + "grad_norm": 1.2874650955200195, + "learning_rate": 7.157946709301791e-05, + "loss": 0.9955, + "step": 16865 + }, + { + "epoch": 0.604007377298691, + "grad_norm": 1.3906731605529785, + "learning_rate": 7.156834657469885e-05, + "loss": 1.5495, + "step": 16866 + }, + { + "epoch": 0.6040431894282593, + "grad_norm": 1.6526511907577515, + "learning_rate": 7.155722643889097e-05, + "loss": 1.6115, + "step": 16867 + }, + { + "epoch": 0.6040790015578277, + "grad_norm": 1.9570540189743042, + "learning_rate": 7.154610668574395e-05, + "loss": 1.0826, + "step": 16868 + }, + { + "epoch": 0.604114813687396, + "grad_norm": 1.2908234596252441, + "learning_rate": 7.153498731540735e-05, + "loss": 1.1568, + "step": 16869 + }, + { + "epoch": 0.6041506258169642, + "grad_norm": 2.093600273132324, + "learning_rate": 7.15238683280308e-05, + "loss": 1.6383, + "step": 16870 + }, + { + "epoch": 0.6041864379465325, + "grad_norm": 1.7811310291290283, + "learning_rate": 7.151274972376383e-05, + "loss": 1.1835, + "step": 16871 + }, + { + "epoch": 0.6042222500761008, + "grad_norm": 1.4339679479599, + "learning_rate": 7.15016315027561e-05, + "loss": 1.1261, + "step": 16872 + }, + { + "epoch": 0.604258062205669, + "grad_norm": 1.8503432273864746, + "learning_rate": 7.149051366515716e-05, + "loss": 1.7787, + "step": 16873 + }, + { + "epoch": 0.6042938743352373, + "grad_norm": 1.6346187591552734, + "learning_rate": 7.147939621111655e-05, + "loss": 1.658, + "step": 16874 + }, + { + "epoch": 0.6043296864648057, + "grad_norm": 1.4566487073898315, + "learning_rate": 7.146827914078391e-05, + "loss": 1.5136, + "step": 16875 + }, + { + "epoch": 0.6043654985943739, + "grad_norm": 1.6601160764694214, + "learning_rate": 7.145716245430876e-05, + "loss": 1.5001, + "step": 16876 + }, + { + "epoch": 0.6044013107239422, + "grad_norm": 1.377268671989441, + "learning_rate": 7.144604615184067e-05, + "loss": 1.6418, + "step": 16877 + }, + { + "epoch": 0.6044371228535105, + "grad_norm": 1.7275559902191162, + "learning_rate": 7.143493023352918e-05, + "loss": 1.5162, + "step": 16878 + }, + { + "epoch": 0.6044729349830787, + "grad_norm": 1.9887871742248535, + "learning_rate": 7.142381469952388e-05, + "loss": 1.7494, + "step": 16879 + }, + { + "epoch": 0.604508747112647, + "grad_norm": 1.5621000528335571, + "learning_rate": 7.141269954997428e-05, + "loss": 1.5148, + "step": 16880 + }, + { + "epoch": 0.6045445592422153, + "grad_norm": 1.5300337076187134, + "learning_rate": 7.140158478502995e-05, + "loss": 1.4512, + "step": 16881 + }, + { + "epoch": 0.6045803713717837, + "grad_norm": 1.5173213481903076, + "learning_rate": 7.13904704048404e-05, + "loss": 1.2817, + "step": 16882 + }, + { + "epoch": 0.6046161835013519, + "grad_norm": 1.6534255743026733, + "learning_rate": 7.137935640955516e-05, + "loss": 1.5485, + "step": 16883 + }, + { + "epoch": 0.6046519956309202, + "grad_norm": 2.1657660007476807, + "learning_rate": 7.136824279932378e-05, + "loss": 1.3278, + "step": 16884 + }, + { + "epoch": 0.6046878077604885, + "grad_norm": 1.8624151945114136, + "learning_rate": 7.135712957429573e-05, + "loss": 1.2911, + "step": 16885 + }, + { + "epoch": 0.6047236198900567, + "grad_norm": 2.1512229442596436, + "learning_rate": 7.134601673462058e-05, + "loss": 1.5735, + "step": 16886 + }, + { + "epoch": 0.604759432019625, + "grad_norm": 1.6311861276626587, + "learning_rate": 7.133490428044778e-05, + "loss": 1.2567, + "step": 16887 + }, + { + "epoch": 0.6047952441491933, + "grad_norm": 1.3144361972808838, + "learning_rate": 7.132379221192691e-05, + "loss": 1.1432, + "step": 16888 + }, + { + "epoch": 0.6048310562787617, + "grad_norm": 2.04892635345459, + "learning_rate": 7.131268052920739e-05, + "loss": 1.3096, + "step": 16889 + }, + { + "epoch": 0.6048668684083299, + "grad_norm": 1.3833351135253906, + "learning_rate": 7.130156923243879e-05, + "loss": 1.5811, + "step": 16890 + }, + { + "epoch": 0.6049026805378982, + "grad_norm": 1.4970839023590088, + "learning_rate": 7.129045832177054e-05, + "loss": 1.2673, + "step": 16891 + }, + { + "epoch": 0.6049384926674665, + "grad_norm": 1.5738197565078735, + "learning_rate": 7.127934779735212e-05, + "loss": 1.552, + "step": 16892 + }, + { + "epoch": 0.6049743047970347, + "grad_norm": 1.8134737014770508, + "learning_rate": 7.126823765933306e-05, + "loss": 1.4541, + "step": 16893 + }, + { + "epoch": 0.605010116926603, + "grad_norm": 1.7841168642044067, + "learning_rate": 7.125712790786277e-05, + "loss": 1.3122, + "step": 16894 + }, + { + "epoch": 0.6050459290561713, + "grad_norm": 1.7300609350204468, + "learning_rate": 7.124601854309077e-05, + "loss": 1.6758, + "step": 16895 + }, + { + "epoch": 0.6050817411857397, + "grad_norm": 1.904471516609192, + "learning_rate": 7.123490956516649e-05, + "loss": 1.5818, + "step": 16896 + }, + { + "epoch": 0.6051175533153079, + "grad_norm": 1.6749107837677002, + "learning_rate": 7.122380097423941e-05, + "loss": 1.8399, + "step": 16897 + }, + { + "epoch": 0.6051533654448762, + "grad_norm": 1.8210564851760864, + "learning_rate": 7.121269277045894e-05, + "loss": 1.3766, + "step": 16898 + }, + { + "epoch": 0.6051891775744445, + "grad_norm": 1.5455830097198486, + "learning_rate": 7.120158495397459e-05, + "loss": 1.7764, + "step": 16899 + }, + { + "epoch": 0.6052249897040127, + "grad_norm": 1.4964015483856201, + "learning_rate": 7.119047752493576e-05, + "loss": 1.3216, + "step": 16900 + }, + { + "epoch": 0.605260801833581, + "grad_norm": 1.3908185958862305, + "learning_rate": 7.117937048349188e-05, + "loss": 1.5999, + "step": 16901 + }, + { + "epoch": 0.6052966139631493, + "grad_norm": 1.8004734516143799, + "learning_rate": 7.11682638297924e-05, + "loss": 1.818, + "step": 16902 + }, + { + "epoch": 0.6053324260927176, + "grad_norm": 1.4848984479904175, + "learning_rate": 7.115715756398674e-05, + "loss": 1.5299, + "step": 16903 + }, + { + "epoch": 0.6053682382222859, + "grad_norm": 1.5885624885559082, + "learning_rate": 7.114605168622432e-05, + "loss": 1.8183, + "step": 16904 + }, + { + "epoch": 0.6054040503518542, + "grad_norm": 1.8533222675323486, + "learning_rate": 7.113494619665456e-05, + "loss": 1.5371, + "step": 16905 + }, + { + "epoch": 0.6054398624814225, + "grad_norm": 2.0348994731903076, + "learning_rate": 7.112384109542687e-05, + "loss": 1.1959, + "step": 16906 + }, + { + "epoch": 0.6054756746109907, + "grad_norm": 1.8801383972167969, + "learning_rate": 7.111273638269063e-05, + "loss": 1.6672, + "step": 16907 + }, + { + "epoch": 0.605511486740559, + "grad_norm": 2.084892749786377, + "learning_rate": 7.110163205859528e-05, + "loss": 1.6724, + "step": 16908 + }, + { + "epoch": 0.6055472988701273, + "grad_norm": 1.7891219854354858, + "learning_rate": 7.109052812329023e-05, + "loss": 1.126, + "step": 16909 + }, + { + "epoch": 0.6055831109996956, + "grad_norm": 2.057370901107788, + "learning_rate": 7.107942457692475e-05, + "loss": 1.6619, + "step": 16910 + }, + { + "epoch": 0.6056189231292639, + "grad_norm": 1.486271858215332, + "learning_rate": 7.106832141964839e-05, + "loss": 1.3337, + "step": 16911 + }, + { + "epoch": 0.6056547352588322, + "grad_norm": 1.9159584045410156, + "learning_rate": 7.105721865161037e-05, + "loss": 1.5583, + "step": 16912 + }, + { + "epoch": 0.6056905473884004, + "grad_norm": 1.8747551441192627, + "learning_rate": 7.104611627296018e-05, + "loss": 1.3585, + "step": 16913 + }, + { + "epoch": 0.6057263595179687, + "grad_norm": 1.439809799194336, + "learning_rate": 7.103501428384714e-05, + "loss": 1.4312, + "step": 16914 + }, + { + "epoch": 0.605762171647537, + "grad_norm": 1.6186916828155518, + "learning_rate": 7.102391268442062e-05, + "loss": 1.4321, + "step": 16915 + }, + { + "epoch": 0.6057979837771053, + "grad_norm": 1.4521013498306274, + "learning_rate": 7.101281147482996e-05, + "loss": 1.4515, + "step": 16916 + }, + { + "epoch": 0.6058337959066736, + "grad_norm": 1.5437028408050537, + "learning_rate": 7.100171065522457e-05, + "loss": 1.3871, + "step": 16917 + }, + { + "epoch": 0.6058696080362419, + "grad_norm": 1.9497326612472534, + "learning_rate": 7.099061022575377e-05, + "loss": 1.6164, + "step": 16918 + }, + { + "epoch": 0.6059054201658102, + "grad_norm": 1.3493622541427612, + "learning_rate": 7.097951018656683e-05, + "loss": 1.4338, + "step": 16919 + }, + { + "epoch": 0.6059412322953784, + "grad_norm": 1.541733741760254, + "learning_rate": 7.09684105378132e-05, + "loss": 1.3511, + "step": 16920 + }, + { + "epoch": 0.6059770444249467, + "grad_norm": 1.4317058324813843, + "learning_rate": 7.095731127964211e-05, + "loss": 1.3761, + "step": 16921 + }, + { + "epoch": 0.606012856554515, + "grad_norm": 1.4599525928497314, + "learning_rate": 7.0946212412203e-05, + "loss": 1.8323, + "step": 16922 + }, + { + "epoch": 0.6060486686840832, + "grad_norm": 1.6733118295669556, + "learning_rate": 7.093511393564504e-05, + "loss": 1.6659, + "step": 16923 + }, + { + "epoch": 0.6060844808136516, + "grad_norm": 1.553957462310791, + "learning_rate": 7.092401585011771e-05, + "loss": 1.3088, + "step": 16924 + }, + { + "epoch": 0.6061202929432199, + "grad_norm": 1.6866424083709717, + "learning_rate": 7.091291815577022e-05, + "loss": 1.6451, + "step": 16925 + }, + { + "epoch": 0.6061561050727882, + "grad_norm": 1.5042775869369507, + "learning_rate": 7.090182085275185e-05, + "loss": 1.5038, + "step": 16926 + }, + { + "epoch": 0.6061919172023564, + "grad_norm": 1.7671167850494385, + "learning_rate": 7.089072394121201e-05, + "loss": 1.4282, + "step": 16927 + }, + { + "epoch": 0.6062277293319247, + "grad_norm": 1.7589083909988403, + "learning_rate": 7.087962742129988e-05, + "loss": 1.4675, + "step": 16928 + }, + { + "epoch": 0.606263541461493, + "grad_norm": 1.4512827396392822, + "learning_rate": 7.086853129316484e-05, + "loss": 1.5013, + "step": 16929 + }, + { + "epoch": 0.6062993535910612, + "grad_norm": 1.6962764263153076, + "learning_rate": 7.085743555695609e-05, + "loss": 1.1876, + "step": 16930 + }, + { + "epoch": 0.6063351657206296, + "grad_norm": 2.0834460258483887, + "learning_rate": 7.084634021282301e-05, + "loss": 1.6448, + "step": 16931 + }, + { + "epoch": 0.6063709778501979, + "grad_norm": 1.53248131275177, + "learning_rate": 7.083524526091475e-05, + "loss": 1.4959, + "step": 16932 + }, + { + "epoch": 0.6064067899797662, + "grad_norm": 1.4516412019729614, + "learning_rate": 7.082415070138071e-05, + "loss": 1.6379, + "step": 16933 + }, + { + "epoch": 0.6064426021093344, + "grad_norm": 1.47740638256073, + "learning_rate": 7.081305653437007e-05, + "loss": 1.4495, + "step": 16934 + }, + { + "epoch": 0.6064784142389027, + "grad_norm": 1.6300405263900757, + "learning_rate": 7.080196276003209e-05, + "loss": 1.4695, + "step": 16935 + }, + { + "epoch": 0.606514226368471, + "grad_norm": 2.4109437465667725, + "learning_rate": 7.079086937851604e-05, + "loss": 1.3669, + "step": 16936 + }, + { + "epoch": 0.6065500384980392, + "grad_norm": 1.4852944612503052, + "learning_rate": 7.077977638997117e-05, + "loss": 1.5768, + "step": 16937 + }, + { + "epoch": 0.6065858506276076, + "grad_norm": 2.6029889583587646, + "learning_rate": 7.076868379454673e-05, + "loss": 1.6087, + "step": 16938 + }, + { + "epoch": 0.6066216627571759, + "grad_norm": 1.5269404649734497, + "learning_rate": 7.07575915923919e-05, + "loss": 1.405, + "step": 16939 + }, + { + "epoch": 0.6066574748867442, + "grad_norm": 1.8272013664245605, + "learning_rate": 7.074649978365602e-05, + "loss": 1.2481, + "step": 16940 + }, + { + "epoch": 0.6066932870163124, + "grad_norm": 1.7521268129348755, + "learning_rate": 7.073540836848817e-05, + "loss": 1.5097, + "step": 16941 + }, + { + "epoch": 0.6067290991458807, + "grad_norm": 2.199578046798706, + "learning_rate": 7.072431734703772e-05, + "loss": 1.7481, + "step": 16942 + }, + { + "epoch": 0.606764911275449, + "grad_norm": 1.9226924180984497, + "learning_rate": 7.071322671945382e-05, + "loss": 1.5111, + "step": 16943 + }, + { + "epoch": 0.6068007234050172, + "grad_norm": 1.5912048816680908, + "learning_rate": 7.070213648588564e-05, + "loss": 1.5916, + "step": 16944 + }, + { + "epoch": 0.6068365355345856, + "grad_norm": 1.532690167427063, + "learning_rate": 7.069104664648244e-05, + "loss": 1.4422, + "step": 16945 + }, + { + "epoch": 0.6068723476641539, + "grad_norm": 2.0913636684417725, + "learning_rate": 7.06799572013934e-05, + "loss": 1.6614, + "step": 16946 + }, + { + "epoch": 0.6069081597937221, + "grad_norm": 1.6115814447402954, + "learning_rate": 7.066886815076771e-05, + "loss": 1.3298, + "step": 16947 + }, + { + "epoch": 0.6069439719232904, + "grad_norm": 1.4363234043121338, + "learning_rate": 7.065777949475456e-05, + "loss": 1.3889, + "step": 16948 + }, + { + "epoch": 0.6069797840528587, + "grad_norm": 1.3749061822891235, + "learning_rate": 7.064669123350316e-05, + "loss": 1.3767, + "step": 16949 + }, + { + "epoch": 0.607015596182427, + "grad_norm": 1.6005500555038452, + "learning_rate": 7.063560336716263e-05, + "loss": 1.4588, + "step": 16950 + }, + { + "epoch": 0.6070514083119952, + "grad_norm": 1.9682674407958984, + "learning_rate": 7.062451589588221e-05, + "loss": 1.2503, + "step": 16951 + }, + { + "epoch": 0.6070872204415636, + "grad_norm": 1.6852772235870361, + "learning_rate": 7.061342881981105e-05, + "loss": 1.4176, + "step": 16952 + }, + { + "epoch": 0.6071230325711319, + "grad_norm": 1.2630378007888794, + "learning_rate": 7.060234213909826e-05, + "loss": 1.5105, + "step": 16953 + }, + { + "epoch": 0.6071588447007001, + "grad_norm": 1.6143516302108765, + "learning_rate": 7.059125585389306e-05, + "loss": 1.5111, + "step": 16954 + }, + { + "epoch": 0.6071946568302684, + "grad_norm": 1.6906598806381226, + "learning_rate": 7.058016996434455e-05, + "loss": 1.713, + "step": 16955 + }, + { + "epoch": 0.6072304689598367, + "grad_norm": 1.9878149032592773, + "learning_rate": 7.056908447060195e-05, + "loss": 1.4993, + "step": 16956 + }, + { + "epoch": 0.607266281089405, + "grad_norm": 1.3433419466018677, + "learning_rate": 7.055799937281432e-05, + "loss": 1.6973, + "step": 16957 + }, + { + "epoch": 0.6073020932189732, + "grad_norm": 1.7787431478500366, + "learning_rate": 7.054691467113085e-05, + "loss": 1.4692, + "step": 16958 + }, + { + "epoch": 0.6073379053485416, + "grad_norm": 1.5732979774475098, + "learning_rate": 7.053583036570064e-05, + "loss": 1.3024, + "step": 16959 + }, + { + "epoch": 0.6073737174781099, + "grad_norm": 2.0106754302978516, + "learning_rate": 7.052474645667283e-05, + "loss": 1.637, + "step": 16960 + }, + { + "epoch": 0.6074095296076781, + "grad_norm": 1.5223397016525269, + "learning_rate": 7.051366294419655e-05, + "loss": 1.3299, + "step": 16961 + }, + { + "epoch": 0.6074453417372464, + "grad_norm": 1.6376421451568604, + "learning_rate": 7.050257982842088e-05, + "loss": 1.5557, + "step": 16962 + }, + { + "epoch": 0.6074811538668147, + "grad_norm": 1.5301560163497925, + "learning_rate": 7.049149710949497e-05, + "loss": 1.5164, + "step": 16963 + }, + { + "epoch": 0.6075169659963829, + "grad_norm": 1.3848516941070557, + "learning_rate": 7.048041478756786e-05, + "loss": 1.5645, + "step": 16964 + }, + { + "epoch": 0.6075527781259512, + "grad_norm": 2.091106653213501, + "learning_rate": 7.046933286278874e-05, + "loss": 1.2488, + "step": 16965 + }, + { + "epoch": 0.6075885902555196, + "grad_norm": 1.3417564630508423, + "learning_rate": 7.04582513353066e-05, + "loss": 1.3523, + "step": 16966 + }, + { + "epoch": 0.6076244023850879, + "grad_norm": 1.6278449296951294, + "learning_rate": 7.044717020527065e-05, + "loss": 1.6019, + "step": 16967 + }, + { + "epoch": 0.6076602145146561, + "grad_norm": 1.618781566619873, + "learning_rate": 7.043608947282985e-05, + "loss": 1.5129, + "step": 16968 + }, + { + "epoch": 0.6076960266442244, + "grad_norm": 1.8264074325561523, + "learning_rate": 7.042500913813337e-05, + "loss": 1.6284, + "step": 16969 + }, + { + "epoch": 0.6077318387737927, + "grad_norm": 1.528173565864563, + "learning_rate": 7.041392920133024e-05, + "loss": 1.384, + "step": 16970 + }, + { + "epoch": 0.6077676509033609, + "grad_norm": 1.6601237058639526, + "learning_rate": 7.040284966256949e-05, + "loss": 1.5068, + "step": 16971 + }, + { + "epoch": 0.6078034630329292, + "grad_norm": 1.6839256286621094, + "learning_rate": 7.039177052200026e-05, + "loss": 1.4348, + "step": 16972 + }, + { + "epoch": 0.6078392751624976, + "grad_norm": 1.93649160861969, + "learning_rate": 7.038069177977153e-05, + "loss": 1.2951, + "step": 16973 + }, + { + "epoch": 0.6078750872920659, + "grad_norm": 2.453674077987671, + "learning_rate": 7.036961343603243e-05, + "loss": 1.6723, + "step": 16974 + }, + { + "epoch": 0.6079108994216341, + "grad_norm": 1.814399242401123, + "learning_rate": 7.035853549093192e-05, + "loss": 1.5456, + "step": 16975 + }, + { + "epoch": 0.6079467115512024, + "grad_norm": 1.8552604913711548, + "learning_rate": 7.034745794461912e-05, + "loss": 1.6286, + "step": 16976 + }, + { + "epoch": 0.6079825236807707, + "grad_norm": 1.676199197769165, + "learning_rate": 7.033638079724298e-05, + "loss": 1.4652, + "step": 16977 + }, + { + "epoch": 0.6080183358103389, + "grad_norm": 2.279174566268921, + "learning_rate": 7.032530404895262e-05, + "loss": 1.6084, + "step": 16978 + }, + { + "epoch": 0.6080541479399072, + "grad_norm": 1.4861416816711426, + "learning_rate": 7.0314227699897e-05, + "loss": 1.3948, + "step": 16979 + }, + { + "epoch": 0.6080899600694756, + "grad_norm": 1.480503797531128, + "learning_rate": 7.030315175022513e-05, + "loss": 1.322, + "step": 16980 + }, + { + "epoch": 0.6081257721990438, + "grad_norm": 1.4660117626190186, + "learning_rate": 7.029207620008606e-05, + "loss": 1.6761, + "step": 16981 + }, + { + "epoch": 0.6081615843286121, + "grad_norm": 1.395601749420166, + "learning_rate": 7.028100104962878e-05, + "loss": 1.3519, + "step": 16982 + }, + { + "epoch": 0.6081973964581804, + "grad_norm": 1.7928545475006104, + "learning_rate": 7.026992629900232e-05, + "loss": 1.4069, + "step": 16983 + }, + { + "epoch": 0.6082332085877487, + "grad_norm": 1.6177374124526978, + "learning_rate": 7.025885194835562e-05, + "loss": 1.3573, + "step": 16984 + }, + { + "epoch": 0.6082690207173169, + "grad_norm": 2.3778645992279053, + "learning_rate": 7.024777799783774e-05, + "loss": 1.6212, + "step": 16985 + }, + { + "epoch": 0.6083048328468852, + "grad_norm": 1.5643190145492554, + "learning_rate": 7.02367044475976e-05, + "loss": 1.2325, + "step": 16986 + }, + { + "epoch": 0.6083406449764536, + "grad_norm": 1.7111154794692993, + "learning_rate": 7.022563129778422e-05, + "loss": 1.6208, + "step": 16987 + }, + { + "epoch": 0.6083764571060218, + "grad_norm": 2.059918165206909, + "learning_rate": 7.021455854854657e-05, + "loss": 1.5891, + "step": 16988 + }, + { + "epoch": 0.6084122692355901, + "grad_norm": 1.2656056880950928, + "learning_rate": 7.020348620003361e-05, + "loss": 1.5974, + "step": 16989 + }, + { + "epoch": 0.6084480813651584, + "grad_norm": 1.4218404293060303, + "learning_rate": 7.019241425239432e-05, + "loss": 1.4523, + "step": 16990 + }, + { + "epoch": 0.6084838934947266, + "grad_norm": 1.8264952898025513, + "learning_rate": 7.018134270577761e-05, + "loss": 1.4599, + "step": 16991 + }, + { + "epoch": 0.6085197056242949, + "grad_norm": 1.4522677659988403, + "learning_rate": 7.017027156033252e-05, + "loss": 1.7525, + "step": 16992 + }, + { + "epoch": 0.6085555177538632, + "grad_norm": 1.762906789779663, + "learning_rate": 7.01592008162079e-05, + "loss": 1.3414, + "step": 16993 + }, + { + "epoch": 0.6085913298834316, + "grad_norm": 1.9561679363250732, + "learning_rate": 7.014813047355277e-05, + "loss": 1.4562, + "step": 16994 + }, + { + "epoch": 0.6086271420129998, + "grad_norm": 2.302292823791504, + "learning_rate": 7.013706053251603e-05, + "loss": 1.5998, + "step": 16995 + }, + { + "epoch": 0.6086629541425681, + "grad_norm": 1.6201292276382446, + "learning_rate": 7.012599099324662e-05, + "loss": 1.3422, + "step": 16996 + }, + { + "epoch": 0.6086987662721364, + "grad_norm": 1.6970531940460205, + "learning_rate": 7.011492185589349e-05, + "loss": 1.5323, + "step": 16997 + }, + { + "epoch": 0.6087345784017046, + "grad_norm": 1.6018362045288086, + "learning_rate": 7.01038531206055e-05, + "loss": 1.5924, + "step": 16998 + }, + { + "epoch": 0.6087703905312729, + "grad_norm": 1.6218585968017578, + "learning_rate": 7.009278478753162e-05, + "loss": 1.3095, + "step": 16999 + }, + { + "epoch": 0.6088062026608412, + "grad_norm": 1.3571292161941528, + "learning_rate": 7.008171685682074e-05, + "loss": 1.1314, + "step": 17000 + }, + { + "epoch": 0.6088420147904096, + "grad_norm": 2.037933588027954, + "learning_rate": 7.007064932862178e-05, + "loss": 1.467, + "step": 17001 + }, + { + "epoch": 0.6088778269199778, + "grad_norm": 1.4938690662384033, + "learning_rate": 7.005958220308362e-05, + "loss": 1.5368, + "step": 17002 + }, + { + "epoch": 0.6089136390495461, + "grad_norm": 2.2486536502838135, + "learning_rate": 7.004851548035516e-05, + "loss": 1.234, + "step": 17003 + }, + { + "epoch": 0.6089494511791144, + "grad_norm": 1.8166847229003906, + "learning_rate": 7.003744916058528e-05, + "loss": 1.2638, + "step": 17004 + }, + { + "epoch": 0.6089852633086826, + "grad_norm": 1.850803017616272, + "learning_rate": 7.00263832439229e-05, + "loss": 1.3371, + "step": 17005 + }, + { + "epoch": 0.6090210754382509, + "grad_norm": 2.1468968391418457, + "learning_rate": 7.001531773051688e-05, + "loss": 1.1834, + "step": 17006 + }, + { + "epoch": 0.6090568875678192, + "grad_norm": 1.7560291290283203, + "learning_rate": 7.000425262051602e-05, + "loss": 1.7774, + "step": 17007 + }, + { + "epoch": 0.6090926996973876, + "grad_norm": 1.929193139076233, + "learning_rate": 6.999318791406931e-05, + "loss": 1.3495, + "step": 17008 + }, + { + "epoch": 0.6091285118269558, + "grad_norm": 1.4060529470443726, + "learning_rate": 6.998212361132549e-05, + "loss": 1.3532, + "step": 17009 + }, + { + "epoch": 0.6091643239565241, + "grad_norm": 1.2726495265960693, + "learning_rate": 6.997105971243352e-05, + "loss": 1.5055, + "step": 17010 + }, + { + "epoch": 0.6092001360860924, + "grad_norm": 1.7176103591918945, + "learning_rate": 6.995999621754219e-05, + "loss": 1.3575, + "step": 17011 + }, + { + "epoch": 0.6092359482156606, + "grad_norm": 1.3885085582733154, + "learning_rate": 6.994893312680037e-05, + "loss": 1.5756, + "step": 17012 + }, + { + "epoch": 0.6092717603452289, + "grad_norm": 1.7695868015289307, + "learning_rate": 6.99378704403569e-05, + "loss": 1.5509, + "step": 17013 + }, + { + "epoch": 0.6093075724747972, + "grad_norm": 1.2675243616104126, + "learning_rate": 6.99268081583606e-05, + "loss": 1.6999, + "step": 17014 + }, + { + "epoch": 0.6093433846043655, + "grad_norm": 1.3948224782943726, + "learning_rate": 6.991574628096033e-05, + "loss": 1.4275, + "step": 17015 + }, + { + "epoch": 0.6093791967339338, + "grad_norm": 2.945615530014038, + "learning_rate": 6.990468480830482e-05, + "loss": 1.319, + "step": 17016 + }, + { + "epoch": 0.6094150088635021, + "grad_norm": 2.0076494216918945, + "learning_rate": 6.989362374054302e-05, + "loss": 1.4243, + "step": 17017 + }, + { + "epoch": 0.6094508209930704, + "grad_norm": 2.0113306045532227, + "learning_rate": 6.988256307782363e-05, + "loss": 1.3713, + "step": 17018 + }, + { + "epoch": 0.6094866331226386, + "grad_norm": 1.3602279424667358, + "learning_rate": 6.987150282029555e-05, + "loss": 1.3158, + "step": 17019 + }, + { + "epoch": 0.6095224452522069, + "grad_norm": 2.0649795532226562, + "learning_rate": 6.986044296810749e-05, + "loss": 1.5799, + "step": 17020 + }, + { + "epoch": 0.6095582573817752, + "grad_norm": 1.709782600402832, + "learning_rate": 6.984938352140835e-05, + "loss": 1.7156, + "step": 17021 + }, + { + "epoch": 0.6095940695113435, + "grad_norm": 1.9718817472457886, + "learning_rate": 6.983832448034684e-05, + "loss": 1.3973, + "step": 17022 + }, + { + "epoch": 0.6096298816409118, + "grad_norm": 1.8555179834365845, + "learning_rate": 6.982726584507173e-05, + "loss": 1.3785, + "step": 17023 + }, + { + "epoch": 0.6096656937704801, + "grad_norm": 1.518578052520752, + "learning_rate": 6.981620761573188e-05, + "loss": 1.5094, + "step": 17024 + }, + { + "epoch": 0.6097015059000483, + "grad_norm": 1.679343342781067, + "learning_rate": 6.980514979247599e-05, + "loss": 1.6682, + "step": 17025 + }, + { + "epoch": 0.6097373180296166, + "grad_norm": 1.5894368886947632, + "learning_rate": 6.979409237545291e-05, + "loss": 1.3674, + "step": 17026 + }, + { + "epoch": 0.6097731301591849, + "grad_norm": 1.8799184560775757, + "learning_rate": 6.97830353648113e-05, + "loss": 1.6178, + "step": 17027 + }, + { + "epoch": 0.6098089422887532, + "grad_norm": 2.117532968521118, + "learning_rate": 6.977197876070003e-05, + "loss": 1.3589, + "step": 17028 + }, + { + "epoch": 0.6098447544183215, + "grad_norm": 1.57524573802948, + "learning_rate": 6.976092256326772e-05, + "loss": 1.3024, + "step": 17029 + }, + { + "epoch": 0.6098805665478898, + "grad_norm": 1.9387460947036743, + "learning_rate": 6.974986677266326e-05, + "loss": 1.4434, + "step": 17030 + }, + { + "epoch": 0.6099163786774581, + "grad_norm": 2.020622730255127, + "learning_rate": 6.973881138903531e-05, + "loss": 1.5545, + "step": 17031 + }, + { + "epoch": 0.6099521908070263, + "grad_norm": 1.553113341331482, + "learning_rate": 6.972775641253259e-05, + "loss": 1.2865, + "step": 17032 + }, + { + "epoch": 0.6099880029365946, + "grad_norm": 1.6942930221557617, + "learning_rate": 6.971670184330389e-05, + "loss": 1.4666, + "step": 17033 + }, + { + "epoch": 0.6100238150661629, + "grad_norm": 1.5093879699707031, + "learning_rate": 6.970564768149788e-05, + "loss": 1.3654, + "step": 17034 + }, + { + "epoch": 0.6100596271957311, + "grad_norm": 1.5208278894424438, + "learning_rate": 6.969459392726331e-05, + "loss": 1.4862, + "step": 17035 + }, + { + "epoch": 0.6100954393252995, + "grad_norm": 2.0230493545532227, + "learning_rate": 6.968354058074887e-05, + "loss": 1.2142, + "step": 17036 + }, + { + "epoch": 0.6101312514548678, + "grad_norm": 1.8557683229446411, + "learning_rate": 6.967248764210333e-05, + "loss": 1.2707, + "step": 17037 + }, + { + "epoch": 0.6101670635844361, + "grad_norm": 2.024885654449463, + "learning_rate": 6.966143511147529e-05, + "loss": 1.4075, + "step": 17038 + }, + { + "epoch": 0.6102028757140043, + "grad_norm": 1.9817204475402832, + "learning_rate": 6.965038298901356e-05, + "loss": 1.39, + "step": 17039 + }, + { + "epoch": 0.6102386878435726, + "grad_norm": 3.7637715339660645, + "learning_rate": 6.963933127486677e-05, + "loss": 1.4094, + "step": 17040 + }, + { + "epoch": 0.6102744999731409, + "grad_norm": 1.5731815099716187, + "learning_rate": 6.96282799691836e-05, + "loss": 1.4704, + "step": 17041 + }, + { + "epoch": 0.6103103121027091, + "grad_norm": 1.9939146041870117, + "learning_rate": 6.961722907211277e-05, + "loss": 1.4589, + "step": 17042 + }, + { + "epoch": 0.6103461242322775, + "grad_norm": 1.8387165069580078, + "learning_rate": 6.96061785838029e-05, + "loss": 1.3256, + "step": 17043 + }, + { + "epoch": 0.6103819363618458, + "grad_norm": 1.8265881538391113, + "learning_rate": 6.95951285044027e-05, + "loss": 1.5793, + "step": 17044 + }, + { + "epoch": 0.6104177484914141, + "grad_norm": 1.364792823791504, + "learning_rate": 6.958407883406082e-05, + "loss": 1.5501, + "step": 17045 + }, + { + "epoch": 0.6104535606209823, + "grad_norm": 1.4304333925247192, + "learning_rate": 6.957302957292596e-05, + "loss": 1.3732, + "step": 17046 + }, + { + "epoch": 0.6104893727505506, + "grad_norm": 1.5327463150024414, + "learning_rate": 6.956198072114669e-05, + "loss": 1.3424, + "step": 17047 + }, + { + "epoch": 0.6105251848801189, + "grad_norm": 1.5442783832550049, + "learning_rate": 6.955093227887175e-05, + "loss": 1.3564, + "step": 17048 + }, + { + "epoch": 0.6105609970096871, + "grad_norm": 1.7592949867248535, + "learning_rate": 6.953988424624973e-05, + "loss": 1.5495, + "step": 17049 + }, + { + "epoch": 0.6105968091392555, + "grad_norm": 1.6681785583496094, + "learning_rate": 6.952883662342926e-05, + "loss": 1.3294, + "step": 17050 + }, + { + "epoch": 0.6106326212688238, + "grad_norm": 2.178701639175415, + "learning_rate": 6.9517789410559e-05, + "loss": 1.6474, + "step": 17051 + }, + { + "epoch": 0.610668433398392, + "grad_norm": 1.7608951330184937, + "learning_rate": 6.950674260778755e-05, + "loss": 1.4254, + "step": 17052 + }, + { + "epoch": 0.6107042455279603, + "grad_norm": 1.4168155193328857, + "learning_rate": 6.949569621526357e-05, + "loss": 1.4225, + "step": 17053 + }, + { + "epoch": 0.6107400576575286, + "grad_norm": 1.6277146339416504, + "learning_rate": 6.948465023313562e-05, + "loss": 1.4284, + "step": 17054 + }, + { + "epoch": 0.6107758697870969, + "grad_norm": 1.4581594467163086, + "learning_rate": 6.947360466155237e-05, + "loss": 1.4763, + "step": 17055 + }, + { + "epoch": 0.6108116819166651, + "grad_norm": 2.682927370071411, + "learning_rate": 6.946255950066236e-05, + "loss": 1.6454, + "step": 17056 + }, + { + "epoch": 0.6108474940462335, + "grad_norm": 1.8958370685577393, + "learning_rate": 6.945151475061425e-05, + "loss": 1.5278, + "step": 17057 + }, + { + "epoch": 0.6108833061758018, + "grad_norm": 2.073317289352417, + "learning_rate": 6.944047041155662e-05, + "loss": 1.7897, + "step": 17058 + }, + { + "epoch": 0.61091911830537, + "grad_norm": 1.4627833366394043, + "learning_rate": 6.9429426483638e-05, + "loss": 1.3666, + "step": 17059 + }, + { + "epoch": 0.6109549304349383, + "grad_norm": 1.4396252632141113, + "learning_rate": 6.941838296700703e-05, + "loss": 1.3275, + "step": 17060 + }, + { + "epoch": 0.6109907425645066, + "grad_norm": 1.7097407579421997, + "learning_rate": 6.940733986181226e-05, + "loss": 1.4636, + "step": 17061 + }, + { + "epoch": 0.6110265546940749, + "grad_norm": 2.193366765975952, + "learning_rate": 6.939629716820229e-05, + "loss": 1.3695, + "step": 17062 + }, + { + "epoch": 0.6110623668236431, + "grad_norm": 1.5271308422088623, + "learning_rate": 6.938525488632563e-05, + "loss": 1.4105, + "step": 17063 + }, + { + "epoch": 0.6110981789532115, + "grad_norm": 1.4365044832229614, + "learning_rate": 6.937421301633091e-05, + "loss": 1.2441, + "step": 17064 + }, + { + "epoch": 0.6111339910827798, + "grad_norm": 2.1028079986572266, + "learning_rate": 6.936317155836664e-05, + "loss": 1.3424, + "step": 17065 + }, + { + "epoch": 0.611169803212348, + "grad_norm": 1.867521047592163, + "learning_rate": 6.935213051258138e-05, + "loss": 1.3437, + "step": 17066 + }, + { + "epoch": 0.6112056153419163, + "grad_norm": 1.709039330482483, + "learning_rate": 6.934108987912369e-05, + "loss": 1.5545, + "step": 17067 + }, + { + "epoch": 0.6112414274714846, + "grad_norm": 1.8121676445007324, + "learning_rate": 6.933004965814205e-05, + "loss": 1.7838, + "step": 17068 + }, + { + "epoch": 0.6112772396010528, + "grad_norm": 1.734595537185669, + "learning_rate": 6.931900984978506e-05, + "loss": 1.3001, + "step": 17069 + }, + { + "epoch": 0.6113130517306211, + "grad_norm": 1.3379672765731812, + "learning_rate": 6.930797045420119e-05, + "loss": 1.4297, + "step": 17070 + }, + { + "epoch": 0.6113488638601895, + "grad_norm": 1.8500645160675049, + "learning_rate": 6.929693147153902e-05, + "loss": 1.3034, + "step": 17071 + }, + { + "epoch": 0.6113846759897578, + "grad_norm": 1.3984006643295288, + "learning_rate": 6.9285892901947e-05, + "loss": 1.4553, + "step": 17072 + }, + { + "epoch": 0.611420488119326, + "grad_norm": 1.7027636766433716, + "learning_rate": 6.927485474557369e-05, + "loss": 1.6593, + "step": 17073 + }, + { + "epoch": 0.6114563002488943, + "grad_norm": 1.6550413370132446, + "learning_rate": 6.926381700256757e-05, + "loss": 1.5719, + "step": 17074 + }, + { + "epoch": 0.6114921123784626, + "grad_norm": 1.6176135540008545, + "learning_rate": 6.925277967307717e-05, + "loss": 1.7674, + "step": 17075 + }, + { + "epoch": 0.6115279245080308, + "grad_norm": 1.5670104026794434, + "learning_rate": 6.924174275725094e-05, + "loss": 1.5544, + "step": 17076 + }, + { + "epoch": 0.6115637366375991, + "grad_norm": 1.4483779668807983, + "learning_rate": 6.923070625523737e-05, + "loss": 1.6666, + "step": 17077 + }, + { + "epoch": 0.6115995487671675, + "grad_norm": 1.5518686771392822, + "learning_rate": 6.921967016718499e-05, + "loss": 1.15, + "step": 17078 + }, + { + "epoch": 0.6116353608967358, + "grad_norm": 1.4891283512115479, + "learning_rate": 6.920863449324221e-05, + "loss": 1.4193, + "step": 17079 + }, + { + "epoch": 0.611671173026304, + "grad_norm": 1.8726311922073364, + "learning_rate": 6.919759923355756e-05, + "loss": 1.4929, + "step": 17080 + }, + { + "epoch": 0.6117069851558723, + "grad_norm": 2.429152488708496, + "learning_rate": 6.918656438827946e-05, + "loss": 1.3696, + "step": 17081 + }, + { + "epoch": 0.6117427972854406, + "grad_norm": 1.599990725517273, + "learning_rate": 6.917552995755641e-05, + "loss": 1.4851, + "step": 17082 + }, + { + "epoch": 0.6117786094150088, + "grad_norm": 1.8051741123199463, + "learning_rate": 6.916449594153682e-05, + "loss": 1.43, + "step": 17083 + }, + { + "epoch": 0.6118144215445771, + "grad_norm": 1.5804451704025269, + "learning_rate": 6.915346234036919e-05, + "loss": 0.9616, + "step": 17084 + }, + { + "epoch": 0.6118502336741455, + "grad_norm": 1.3752074241638184, + "learning_rate": 6.914242915420193e-05, + "loss": 1.2154, + "step": 17085 + }, + { + "epoch": 0.6118860458037138, + "grad_norm": 1.573339581489563, + "learning_rate": 6.913139638318346e-05, + "loss": 1.3455, + "step": 17086 + }, + { + "epoch": 0.611921857933282, + "grad_norm": 1.6640084981918335, + "learning_rate": 6.912036402746227e-05, + "loss": 1.3296, + "step": 17087 + }, + { + "epoch": 0.6119576700628503, + "grad_norm": 1.7510104179382324, + "learning_rate": 6.910933208718671e-05, + "loss": 1.3694, + "step": 17088 + }, + { + "epoch": 0.6119934821924186, + "grad_norm": 1.2308443784713745, + "learning_rate": 6.909830056250527e-05, + "loss": 1.4283, + "step": 17089 + }, + { + "epoch": 0.6120292943219868, + "grad_norm": 1.7887136936187744, + "learning_rate": 6.908726945356632e-05, + "loss": 1.4801, + "step": 17090 + }, + { + "epoch": 0.6120651064515551, + "grad_norm": 1.6139098405838013, + "learning_rate": 6.90762387605183e-05, + "loss": 1.6002, + "step": 17091 + }, + { + "epoch": 0.6121009185811235, + "grad_norm": 1.8948484659194946, + "learning_rate": 6.906520848350957e-05, + "loss": 1.4782, + "step": 17092 + }, + { + "epoch": 0.6121367307106917, + "grad_norm": 1.8423246145248413, + "learning_rate": 6.905417862268859e-05, + "loss": 1.4955, + "step": 17093 + }, + { + "epoch": 0.61217254284026, + "grad_norm": 1.6913235187530518, + "learning_rate": 6.904314917820371e-05, + "loss": 1.3755, + "step": 17094 + }, + { + "epoch": 0.6122083549698283, + "grad_norm": 1.9365330934524536, + "learning_rate": 6.90321201502033e-05, + "loss": 1.5667, + "step": 17095 + }, + { + "epoch": 0.6122441670993966, + "grad_norm": 2.045860767364502, + "learning_rate": 6.90210915388358e-05, + "loss": 1.4949, + "step": 17096 + }, + { + "epoch": 0.6122799792289648, + "grad_norm": 1.4850900173187256, + "learning_rate": 6.901006334424953e-05, + "loss": 1.2754, + "step": 17097 + }, + { + "epoch": 0.6123157913585331, + "grad_norm": 1.5501368045806885, + "learning_rate": 6.89990355665929e-05, + "loss": 1.3983, + "step": 17098 + }, + { + "epoch": 0.6123516034881015, + "grad_norm": 1.471760630607605, + "learning_rate": 6.898800820601425e-05, + "loss": 1.4922, + "step": 17099 + }, + { + "epoch": 0.6123874156176697, + "grad_norm": 1.7545149326324463, + "learning_rate": 6.897698126266197e-05, + "loss": 1.83, + "step": 17100 + }, + { + "epoch": 0.612423227747238, + "grad_norm": 1.2826873064041138, + "learning_rate": 6.896595473668435e-05, + "loss": 1.589, + "step": 17101 + }, + { + "epoch": 0.6124590398768063, + "grad_norm": 1.2966712713241577, + "learning_rate": 6.89549286282298e-05, + "loss": 1.1177, + "step": 17102 + }, + { + "epoch": 0.6124948520063745, + "grad_norm": 1.3103042840957642, + "learning_rate": 6.894390293744668e-05, + "loss": 1.4203, + "step": 17103 + }, + { + "epoch": 0.6125306641359428, + "grad_norm": 1.613857388496399, + "learning_rate": 6.893287766448321e-05, + "loss": 1.4521, + "step": 17104 + }, + { + "epoch": 0.6125664762655111, + "grad_norm": 1.5288715362548828, + "learning_rate": 6.892185280948786e-05, + "loss": 1.3655, + "step": 17105 + }, + { + "epoch": 0.6126022883950795, + "grad_norm": 1.2962150573730469, + "learning_rate": 6.891082837260885e-05, + "loss": 1.3534, + "step": 17106 + }, + { + "epoch": 0.6126381005246477, + "grad_norm": 1.786429524421692, + "learning_rate": 6.889980435399456e-05, + "loss": 1.8125, + "step": 17107 + }, + { + "epoch": 0.612673912654216, + "grad_norm": 1.44051194190979, + "learning_rate": 6.888878075379326e-05, + "loss": 1.4555, + "step": 17108 + }, + { + "epoch": 0.6127097247837843, + "grad_norm": 1.836295485496521, + "learning_rate": 6.887775757215334e-05, + "loss": 1.7671, + "step": 17109 + }, + { + "epoch": 0.6127455369133525, + "grad_norm": 1.9834587574005127, + "learning_rate": 6.886673480922299e-05, + "loss": 1.5618, + "step": 17110 + }, + { + "epoch": 0.6127813490429208, + "grad_norm": 1.558998942375183, + "learning_rate": 6.88557124651506e-05, + "loss": 1.4063, + "step": 17111 + }, + { + "epoch": 0.6128171611724891, + "grad_norm": 1.5703425407409668, + "learning_rate": 6.884469054008444e-05, + "loss": 1.3246, + "step": 17112 + }, + { + "epoch": 0.6128529733020575, + "grad_norm": 1.665521264076233, + "learning_rate": 6.883366903417273e-05, + "loss": 1.3784, + "step": 17113 + }, + { + "epoch": 0.6128887854316257, + "grad_norm": 1.5847508907318115, + "learning_rate": 6.882264794756386e-05, + "loss": 1.3142, + "step": 17114 + }, + { + "epoch": 0.612924597561194, + "grad_norm": 1.8532623052597046, + "learning_rate": 6.881162728040598e-05, + "loss": 1.4714, + "step": 17115 + }, + { + "epoch": 0.6129604096907623, + "grad_norm": 1.7318142652511597, + "learning_rate": 6.880060703284748e-05, + "loss": 1.6911, + "step": 17116 + }, + { + "epoch": 0.6129962218203305, + "grad_norm": 1.7050365209579468, + "learning_rate": 6.878958720503652e-05, + "loss": 1.7391, + "step": 17117 + }, + { + "epoch": 0.6130320339498988, + "grad_norm": 1.766896367073059, + "learning_rate": 6.877856779712147e-05, + "loss": 1.4107, + "step": 17118 + }, + { + "epoch": 0.6130678460794671, + "grad_norm": 1.5562337636947632, + "learning_rate": 6.876754880925049e-05, + "loss": 1.4708, + "step": 17119 + }, + { + "epoch": 0.6131036582090355, + "grad_norm": 1.6931556463241577, + "learning_rate": 6.87565302415718e-05, + "loss": 1.5617, + "step": 17120 + }, + { + "epoch": 0.6131394703386037, + "grad_norm": 1.5268492698669434, + "learning_rate": 6.874551209423376e-05, + "loss": 1.5087, + "step": 17121 + }, + { + "epoch": 0.613175282468172, + "grad_norm": 1.6822506189346313, + "learning_rate": 6.873449436738451e-05, + "loss": 1.4391, + "step": 17122 + }, + { + "epoch": 0.6132110945977403, + "grad_norm": 1.6287306547164917, + "learning_rate": 6.872347706117233e-05, + "loss": 1.4453, + "step": 17123 + }, + { + "epoch": 0.6132469067273085, + "grad_norm": 1.3523920774459839, + "learning_rate": 6.871246017574537e-05, + "loss": 1.4051, + "step": 17124 + }, + { + "epoch": 0.6132827188568768, + "grad_norm": 2.1489264965057373, + "learning_rate": 6.870144371125198e-05, + "loss": 1.2533, + "step": 17125 + }, + { + "epoch": 0.6133185309864451, + "grad_norm": 1.7597583532333374, + "learning_rate": 6.869042766784022e-05, + "loss": 1.439, + "step": 17126 + }, + { + "epoch": 0.6133543431160134, + "grad_norm": 1.3666061162948608, + "learning_rate": 6.867941204565843e-05, + "loss": 1.1421, + "step": 17127 + }, + { + "epoch": 0.6133901552455817, + "grad_norm": 1.6297531127929688, + "learning_rate": 6.866839684485473e-05, + "loss": 1.4425, + "step": 17128 + }, + { + "epoch": 0.61342596737515, + "grad_norm": 1.778544545173645, + "learning_rate": 6.865738206557731e-05, + "loss": 1.3258, + "step": 17129 + }, + { + "epoch": 0.6134617795047183, + "grad_norm": 1.8034090995788574, + "learning_rate": 6.864636770797441e-05, + "loss": 1.5363, + "step": 17130 + }, + { + "epoch": 0.6134975916342865, + "grad_norm": 1.9214576482772827, + "learning_rate": 6.863535377219417e-05, + "loss": 1.5098, + "step": 17131 + }, + { + "epoch": 0.6135334037638548, + "grad_norm": 1.9046701192855835, + "learning_rate": 6.862434025838481e-05, + "loss": 1.6811, + "step": 17132 + }, + { + "epoch": 0.6135692158934231, + "grad_norm": 2.123274087905884, + "learning_rate": 6.861332716669444e-05, + "loss": 1.6217, + "step": 17133 + }, + { + "epoch": 0.6136050280229914, + "grad_norm": 1.7774008512496948, + "learning_rate": 6.860231449727133e-05, + "loss": 1.5144, + "step": 17134 + }, + { + "epoch": 0.6136408401525597, + "grad_norm": 1.6106023788452148, + "learning_rate": 6.859130225026351e-05, + "loss": 1.425, + "step": 17135 + }, + { + "epoch": 0.613676652282128, + "grad_norm": 1.860154628753662, + "learning_rate": 6.858029042581926e-05, + "loss": 1.1347, + "step": 17136 + }, + { + "epoch": 0.6137124644116962, + "grad_norm": 1.433030605316162, + "learning_rate": 6.856927902408666e-05, + "loss": 1.6064, + "step": 17137 + }, + { + "epoch": 0.6137482765412645, + "grad_norm": 1.4676074981689453, + "learning_rate": 6.855826804521386e-05, + "loss": 1.3863, + "step": 17138 + }, + { + "epoch": 0.6137840886708328, + "grad_norm": 1.465326189994812, + "learning_rate": 6.854725748934901e-05, + "loss": 1.6177, + "step": 17139 + }, + { + "epoch": 0.613819900800401, + "grad_norm": 2.1251637935638428, + "learning_rate": 6.853624735664021e-05, + "loss": 1.0243, + "step": 17140 + }, + { + "epoch": 0.6138557129299694, + "grad_norm": 1.8610551357269287, + "learning_rate": 6.852523764723566e-05, + "loss": 1.7238, + "step": 17141 + }, + { + "epoch": 0.6138915250595377, + "grad_norm": 1.2986605167388916, + "learning_rate": 6.85142283612834e-05, + "loss": 1.4475, + "step": 17142 + }, + { + "epoch": 0.613927337189106, + "grad_norm": 2.99711537361145, + "learning_rate": 6.850321949893162e-05, + "loss": 1.3241, + "step": 17143 + }, + { + "epoch": 0.6139631493186742, + "grad_norm": 1.5854588747024536, + "learning_rate": 6.849221106032837e-05, + "loss": 1.308, + "step": 17144 + }, + { + "epoch": 0.6139989614482425, + "grad_norm": 1.4505645036697388, + "learning_rate": 6.84812030456218e-05, + "loss": 1.5152, + "step": 17145 + }, + { + "epoch": 0.6140347735778108, + "grad_norm": 1.736111044883728, + "learning_rate": 6.847019545495998e-05, + "loss": 1.6171, + "step": 17146 + }, + { + "epoch": 0.614070585707379, + "grad_norm": 1.6995787620544434, + "learning_rate": 6.845918828849099e-05, + "loss": 1.0826, + "step": 17147 + }, + { + "epoch": 0.6141063978369474, + "grad_norm": 2.0281624794006348, + "learning_rate": 6.844818154636295e-05, + "loss": 1.5205, + "step": 17148 + }, + { + "epoch": 0.6141422099665157, + "grad_norm": 1.5841419696807861, + "learning_rate": 6.843717522872393e-05, + "loss": 1.4293, + "step": 17149 + }, + { + "epoch": 0.614178022096084, + "grad_norm": 1.547128677368164, + "learning_rate": 6.8426169335722e-05, + "loss": 1.4819, + "step": 17150 + }, + { + "epoch": 0.6142138342256522, + "grad_norm": 1.5373615026474, + "learning_rate": 6.841516386750523e-05, + "loss": 1.326, + "step": 17151 + }, + { + "epoch": 0.6142496463552205, + "grad_norm": 1.5892366170883179, + "learning_rate": 6.84041588242217e-05, + "loss": 1.772, + "step": 17152 + }, + { + "epoch": 0.6142854584847888, + "grad_norm": 1.3406174182891846, + "learning_rate": 6.839315420601943e-05, + "loss": 1.4543, + "step": 17153 + }, + { + "epoch": 0.614321270614357, + "grad_norm": 2.1202826499938965, + "learning_rate": 6.838215001304654e-05, + "loss": 1.5323, + "step": 17154 + }, + { + "epoch": 0.6143570827439254, + "grad_norm": 2.2001073360443115, + "learning_rate": 6.837114624545102e-05, + "loss": 1.8068, + "step": 17155 + }, + { + "epoch": 0.6143928948734937, + "grad_norm": 1.5223966836929321, + "learning_rate": 6.836014290338093e-05, + "loss": 1.5163, + "step": 17156 + }, + { + "epoch": 0.614428707003062, + "grad_norm": 2.0496227741241455, + "learning_rate": 6.834913998698432e-05, + "loss": 1.7119, + "step": 17157 + }, + { + "epoch": 0.6144645191326302, + "grad_norm": 1.7774089574813843, + "learning_rate": 6.833813749640916e-05, + "loss": 1.1044, + "step": 17158 + }, + { + "epoch": 0.6145003312621985, + "grad_norm": 1.9635884761810303, + "learning_rate": 6.832713543180356e-05, + "loss": 1.1691, + "step": 17159 + }, + { + "epoch": 0.6145361433917668, + "grad_norm": 1.539371371269226, + "learning_rate": 6.831613379331547e-05, + "loss": 1.3793, + "step": 17160 + }, + { + "epoch": 0.614571955521335, + "grad_norm": 1.989688515663147, + "learning_rate": 6.830513258109296e-05, + "loss": 1.8035, + "step": 17161 + }, + { + "epoch": 0.6146077676509034, + "grad_norm": 1.5166738033294678, + "learning_rate": 6.829413179528398e-05, + "loss": 1.2237, + "step": 17162 + }, + { + "epoch": 0.6146435797804717, + "grad_norm": 1.495975375175476, + "learning_rate": 6.828313143603657e-05, + "loss": 1.1678, + "step": 17163 + }, + { + "epoch": 0.61467939191004, + "grad_norm": 1.3702117204666138, + "learning_rate": 6.827213150349874e-05, + "loss": 1.5745, + "step": 17164 + }, + { + "epoch": 0.6147152040396082, + "grad_norm": 1.3687939643859863, + "learning_rate": 6.826113199781841e-05, + "loss": 1.2165, + "step": 17165 + }, + { + "epoch": 0.6147510161691765, + "grad_norm": 1.5471529960632324, + "learning_rate": 6.825013291914363e-05, + "loss": 1.4653, + "step": 17166 + }, + { + "epoch": 0.6147868282987448, + "grad_norm": 1.3832335472106934, + "learning_rate": 6.823913426762237e-05, + "loss": 1.4773, + "step": 17167 + }, + { + "epoch": 0.614822640428313, + "grad_norm": 1.4992239475250244, + "learning_rate": 6.822813604340257e-05, + "loss": 1.4386, + "step": 17168 + }, + { + "epoch": 0.6148584525578814, + "grad_norm": 1.5408648252487183, + "learning_rate": 6.821713824663221e-05, + "loss": 1.4708, + "step": 17169 + }, + { + "epoch": 0.6148942646874497, + "grad_norm": 1.663112998008728, + "learning_rate": 6.820614087745929e-05, + "loss": 1.4123, + "step": 17170 + }, + { + "epoch": 0.614930076817018, + "grad_norm": 1.6703062057495117, + "learning_rate": 6.81951439360317e-05, + "loss": 1.288, + "step": 17171 + }, + { + "epoch": 0.6149658889465862, + "grad_norm": 1.5483388900756836, + "learning_rate": 6.818414742249745e-05, + "loss": 1.3075, + "step": 17172 + }, + { + "epoch": 0.6150017010761545, + "grad_norm": 1.4388359785079956, + "learning_rate": 6.817315133700446e-05, + "loss": 1.5018, + "step": 17173 + }, + { + "epoch": 0.6150375132057228, + "grad_norm": 1.47565758228302, + "learning_rate": 6.816215567970063e-05, + "loss": 1.5788, + "step": 17174 + }, + { + "epoch": 0.615073325335291, + "grad_norm": 1.9998646974563599, + "learning_rate": 6.815116045073396e-05, + "loss": 1.4184, + "step": 17175 + }, + { + "epoch": 0.6151091374648594, + "grad_norm": 2.213521957397461, + "learning_rate": 6.814016565025231e-05, + "loss": 1.6169, + "step": 17176 + }, + { + "epoch": 0.6151449495944277, + "grad_norm": 1.4681707620620728, + "learning_rate": 6.812917127840368e-05, + "loss": 1.568, + "step": 17177 + }, + { + "epoch": 0.6151807617239959, + "grad_norm": 1.8381984233856201, + "learning_rate": 6.81181773353359e-05, + "loss": 1.657, + "step": 17178 + }, + { + "epoch": 0.6152165738535642, + "grad_norm": 1.8551054000854492, + "learning_rate": 6.810718382119694e-05, + "loss": 1.4701, + "step": 17179 + }, + { + "epoch": 0.6152523859831325, + "grad_norm": 1.313694953918457, + "learning_rate": 6.809619073613467e-05, + "loss": 1.3976, + "step": 17180 + }, + { + "epoch": 0.6152881981127007, + "grad_norm": 2.456167697906494, + "learning_rate": 6.808519808029703e-05, + "loss": 1.3742, + "step": 17181 + }, + { + "epoch": 0.615324010242269, + "grad_norm": 1.4624918699264526, + "learning_rate": 6.807420585383186e-05, + "loss": 1.5296, + "step": 17182 + }, + { + "epoch": 0.6153598223718374, + "grad_norm": 1.544368863105774, + "learning_rate": 6.806321405688707e-05, + "loss": 1.6921, + "step": 17183 + }, + { + "epoch": 0.6153956345014057, + "grad_norm": 1.5174061059951782, + "learning_rate": 6.805222268961054e-05, + "loss": 1.3493, + "step": 17184 + }, + { + "epoch": 0.6154314466309739, + "grad_norm": 1.750654935836792, + "learning_rate": 6.804123175215014e-05, + "loss": 1.5409, + "step": 17185 + }, + { + "epoch": 0.6154672587605422, + "grad_norm": 1.7481032609939575, + "learning_rate": 6.803024124465375e-05, + "loss": 1.4192, + "step": 17186 + }, + { + "epoch": 0.6155030708901105, + "grad_norm": 1.7678956985473633, + "learning_rate": 6.801925116726922e-05, + "loss": 1.3979, + "step": 17187 + }, + { + "epoch": 0.6155388830196787, + "grad_norm": 1.6800671815872192, + "learning_rate": 6.800826152014442e-05, + "loss": 1.277, + "step": 17188 + }, + { + "epoch": 0.615574695149247, + "grad_norm": 2.4462714195251465, + "learning_rate": 6.799727230342718e-05, + "loss": 1.5407, + "step": 17189 + }, + { + "epoch": 0.6156105072788154, + "grad_norm": 1.6587086915969849, + "learning_rate": 6.798628351726539e-05, + "loss": 1.3014, + "step": 17190 + }, + { + "epoch": 0.6156463194083837, + "grad_norm": 1.3719878196716309, + "learning_rate": 6.797529516180687e-05, + "loss": 1.2125, + "step": 17191 + }, + { + "epoch": 0.6156821315379519, + "grad_norm": 1.9094762802124023, + "learning_rate": 6.796430723719939e-05, + "loss": 1.2202, + "step": 17192 + }, + { + "epoch": 0.6157179436675202, + "grad_norm": 1.774322271347046, + "learning_rate": 6.795331974359088e-05, + "loss": 1.7452, + "step": 17193 + }, + { + "epoch": 0.6157537557970885, + "grad_norm": 2.135695695877075, + "learning_rate": 6.794233268112907e-05, + "loss": 1.7369, + "step": 17194 + }, + { + "epoch": 0.6157895679266567, + "grad_norm": 1.5609972476959229, + "learning_rate": 6.793134604996185e-05, + "loss": 1.347, + "step": 17195 + }, + { + "epoch": 0.615825380056225, + "grad_norm": 1.5753405094146729, + "learning_rate": 6.7920359850237e-05, + "loss": 1.3896, + "step": 17196 + }, + { + "epoch": 0.6158611921857934, + "grad_norm": 2.917074203491211, + "learning_rate": 6.790937408210233e-05, + "loss": 1.528, + "step": 17197 + }, + { + "epoch": 0.6158970043153617, + "grad_norm": 1.5047675371170044, + "learning_rate": 6.789838874570565e-05, + "loss": 1.5584, + "step": 17198 + }, + { + "epoch": 0.6159328164449299, + "grad_norm": 1.545350193977356, + "learning_rate": 6.788740384119472e-05, + "loss": 1.5371, + "step": 17199 + }, + { + "epoch": 0.6159686285744982, + "grad_norm": 1.9652788639068604, + "learning_rate": 6.787641936871739e-05, + "loss": 1.5017, + "step": 17200 + }, + { + "epoch": 0.6160044407040665, + "grad_norm": 1.3209742307662964, + "learning_rate": 6.786543532842133e-05, + "loss": 1.1845, + "step": 17201 + }, + { + "epoch": 0.6160402528336347, + "grad_norm": 2.2090561389923096, + "learning_rate": 6.785445172045448e-05, + "loss": 1.6238, + "step": 17202 + }, + { + "epoch": 0.616076064963203, + "grad_norm": 1.6406116485595703, + "learning_rate": 6.784346854496442e-05, + "loss": 1.2969, + "step": 17203 + }, + { + "epoch": 0.6161118770927714, + "grad_norm": 1.7043877840042114, + "learning_rate": 6.78324858020991e-05, + "loss": 1.5182, + "step": 17204 + }, + { + "epoch": 0.6161476892223396, + "grad_norm": 1.4550445079803467, + "learning_rate": 6.78215034920061e-05, + "loss": 1.5474, + "step": 17205 + }, + { + "epoch": 0.6161835013519079, + "grad_norm": 1.4933606386184692, + "learning_rate": 6.781052161483332e-05, + "loss": 1.3881, + "step": 17206 + }, + { + "epoch": 0.6162193134814762, + "grad_norm": 1.5615988969802856, + "learning_rate": 6.779954017072842e-05, + "loss": 1.5827, + "step": 17207 + }, + { + "epoch": 0.6162551256110445, + "grad_norm": 1.4446730613708496, + "learning_rate": 6.778855915983921e-05, + "loss": 1.1332, + "step": 17208 + }, + { + "epoch": 0.6162909377406127, + "grad_norm": 1.8601797819137573, + "learning_rate": 6.777757858231339e-05, + "loss": 1.252, + "step": 17209 + }, + { + "epoch": 0.616326749870181, + "grad_norm": 1.5915179252624512, + "learning_rate": 6.776659843829863e-05, + "loss": 1.4509, + "step": 17210 + }, + { + "epoch": 0.6163625619997494, + "grad_norm": 1.6909068822860718, + "learning_rate": 6.775561872794279e-05, + "loss": 1.477, + "step": 17211 + }, + { + "epoch": 0.6163983741293176, + "grad_norm": 1.5252338647842407, + "learning_rate": 6.774463945139343e-05, + "loss": 1.6273, + "step": 17212 + }, + { + "epoch": 0.6164341862588859, + "grad_norm": 1.677097201347351, + "learning_rate": 6.77336606087984e-05, + "loss": 1.4723, + "step": 17213 + }, + { + "epoch": 0.6164699983884542, + "grad_norm": 1.8449418544769287, + "learning_rate": 6.772268220030528e-05, + "loss": 1.201, + "step": 17214 + }, + { + "epoch": 0.6165058105180224, + "grad_norm": 1.9632333517074585, + "learning_rate": 6.77117042260619e-05, + "loss": 1.6116, + "step": 17215 + }, + { + "epoch": 0.6165416226475907, + "grad_norm": 1.7720052003860474, + "learning_rate": 6.770072668621583e-05, + "loss": 1.6009, + "step": 17216 + }, + { + "epoch": 0.616577434777159, + "grad_norm": 2.2499513626098633, + "learning_rate": 6.768974958091488e-05, + "loss": 1.2514, + "step": 17217 + }, + { + "epoch": 0.6166132469067274, + "grad_norm": 1.5738240480422974, + "learning_rate": 6.767877291030666e-05, + "loss": 1.6878, + "step": 17218 + }, + { + "epoch": 0.6166490590362956, + "grad_norm": 1.6386752128601074, + "learning_rate": 6.766779667453881e-05, + "loss": 1.4406, + "step": 17219 + }, + { + "epoch": 0.6166848711658639, + "grad_norm": 1.473636507987976, + "learning_rate": 6.765682087375912e-05, + "loss": 1.1257, + "step": 17220 + }, + { + "epoch": 0.6167206832954322, + "grad_norm": 1.5793477296829224, + "learning_rate": 6.764584550811512e-05, + "loss": 1.3315, + "step": 17221 + }, + { + "epoch": 0.6167564954250004, + "grad_norm": 2.4050142765045166, + "learning_rate": 6.763487057775459e-05, + "loss": 1.4098, + "step": 17222 + }, + { + "epoch": 0.6167923075545687, + "grad_norm": 1.66704261302948, + "learning_rate": 6.762389608282507e-05, + "loss": 1.2545, + "step": 17223 + }, + { + "epoch": 0.616828119684137, + "grad_norm": 1.397594928741455, + "learning_rate": 6.761292202347434e-05, + "loss": 1.5063, + "step": 17224 + }, + { + "epoch": 0.6168639318137054, + "grad_norm": 1.3447819948196411, + "learning_rate": 6.760194839984994e-05, + "loss": 1.281, + "step": 17225 + }, + { + "epoch": 0.6168997439432736, + "grad_norm": 1.6767772436141968, + "learning_rate": 6.75909752120995e-05, + "loss": 1.8086, + "step": 17226 + }, + { + "epoch": 0.6169355560728419, + "grad_norm": 1.7301512956619263, + "learning_rate": 6.758000246037072e-05, + "loss": 1.8668, + "step": 17227 + }, + { + "epoch": 0.6169713682024102, + "grad_norm": 1.7940595149993896, + "learning_rate": 6.756903014481116e-05, + "loss": 1.3394, + "step": 17228 + }, + { + "epoch": 0.6170071803319784, + "grad_norm": 1.7685900926589966, + "learning_rate": 6.75580582655685e-05, + "loss": 1.7772, + "step": 17229 + }, + { + "epoch": 0.6170429924615467, + "grad_norm": 1.6074085235595703, + "learning_rate": 6.754708682279027e-05, + "loss": 1.2769, + "step": 17230 + }, + { + "epoch": 0.617078804591115, + "grad_norm": 2.404395580291748, + "learning_rate": 6.753611581662418e-05, + "loss": 1.546, + "step": 17231 + }, + { + "epoch": 0.6171146167206832, + "grad_norm": 1.7004632949829102, + "learning_rate": 6.752514524721771e-05, + "loss": 1.705, + "step": 17232 + }, + { + "epoch": 0.6171504288502516, + "grad_norm": 1.2684026956558228, + "learning_rate": 6.751417511471859e-05, + "loss": 1.5119, + "step": 17233 + }, + { + "epoch": 0.6171862409798199, + "grad_norm": 1.4887030124664307, + "learning_rate": 6.750320541927433e-05, + "loss": 1.4895, + "step": 17234 + }, + { + "epoch": 0.6172220531093882, + "grad_norm": 1.5183050632476807, + "learning_rate": 6.749223616103249e-05, + "loss": 1.4796, + "step": 17235 + }, + { + "epoch": 0.6172578652389564, + "grad_norm": 1.4999533891677856, + "learning_rate": 6.74812673401407e-05, + "loss": 1.0856, + "step": 17236 + }, + { + "epoch": 0.6172936773685247, + "grad_norm": 1.952275276184082, + "learning_rate": 6.74702989567465e-05, + "loss": 1.5083, + "step": 17237 + }, + { + "epoch": 0.617329489498093, + "grad_norm": 1.649373173713684, + "learning_rate": 6.745933101099748e-05, + "loss": 1.5587, + "step": 17238 + }, + { + "epoch": 0.6173653016276612, + "grad_norm": 1.4925177097320557, + "learning_rate": 6.744836350304118e-05, + "loss": 1.1892, + "step": 17239 + }, + { + "epoch": 0.6174011137572296, + "grad_norm": 1.5347260236740112, + "learning_rate": 6.743739643302516e-05, + "loss": 1.4524, + "step": 17240 + }, + { + "epoch": 0.6174369258867979, + "grad_norm": 1.3831895589828491, + "learning_rate": 6.742642980109696e-05, + "loss": 1.421, + "step": 17241 + }, + { + "epoch": 0.6174727380163662, + "grad_norm": 2.0352489948272705, + "learning_rate": 6.741546360740415e-05, + "loss": 1.8222, + "step": 17242 + }, + { + "epoch": 0.6175085501459344, + "grad_norm": 1.7592264413833618, + "learning_rate": 6.740449785209425e-05, + "loss": 1.3654, + "step": 17243 + }, + { + "epoch": 0.6175443622755027, + "grad_norm": 1.4026323556900024, + "learning_rate": 6.739353253531475e-05, + "loss": 1.6612, + "step": 17244 + }, + { + "epoch": 0.617580174405071, + "grad_norm": 2.095674991607666, + "learning_rate": 6.738256765721324e-05, + "loss": 1.3747, + "step": 17245 + }, + { + "epoch": 0.6176159865346392, + "grad_norm": 1.5699025392532349, + "learning_rate": 6.73716032179372e-05, + "loss": 1.3018, + "step": 17246 + }, + { + "epoch": 0.6176517986642076, + "grad_norm": 2.3074796199798584, + "learning_rate": 6.736063921763415e-05, + "loss": 1.4714, + "step": 17247 + }, + { + "epoch": 0.6176876107937759, + "grad_norm": 1.499690294265747, + "learning_rate": 6.73496756564516e-05, + "loss": 1.3134, + "step": 17248 + }, + { + "epoch": 0.6177234229233441, + "grad_norm": 1.9261491298675537, + "learning_rate": 6.733871253453707e-05, + "loss": 1.6609, + "step": 17249 + }, + { + "epoch": 0.6177592350529124, + "grad_norm": 1.6872670650482178, + "learning_rate": 6.7327749852038e-05, + "loss": 1.2668, + "step": 17250 + }, + { + "epoch": 0.6177950471824807, + "grad_norm": 1.8014341592788696, + "learning_rate": 6.731678760910192e-05, + "loss": 1.5728, + "step": 17251 + }, + { + "epoch": 0.617830859312049, + "grad_norm": 1.621647596359253, + "learning_rate": 6.730582580587632e-05, + "loss": 1.3942, + "step": 17252 + }, + { + "epoch": 0.6178666714416172, + "grad_norm": 1.6194264888763428, + "learning_rate": 6.729486444250863e-05, + "loss": 1.5561, + "step": 17253 + }, + { + "epoch": 0.6179024835711856, + "grad_norm": 1.5640456676483154, + "learning_rate": 6.72839035191464e-05, + "loss": 1.169, + "step": 17254 + }, + { + "epoch": 0.6179382957007539, + "grad_norm": 1.567991018295288, + "learning_rate": 6.7272943035937e-05, + "loss": 1.6388, + "step": 17255 + }, + { + "epoch": 0.6179741078303221, + "grad_norm": 1.513309121131897, + "learning_rate": 6.726198299302796e-05, + "loss": 1.5298, + "step": 17256 + }, + { + "epoch": 0.6180099199598904, + "grad_norm": 1.6666152477264404, + "learning_rate": 6.72510233905667e-05, + "loss": 1.3742, + "step": 17257 + }, + { + "epoch": 0.6180457320894587, + "grad_norm": 1.8331990242004395, + "learning_rate": 6.724006422870069e-05, + "loss": 1.3625, + "step": 17258 + }, + { + "epoch": 0.618081544219027, + "grad_norm": 1.3519563674926758, + "learning_rate": 6.722910550757734e-05, + "loss": 1.5429, + "step": 17259 + }, + { + "epoch": 0.6181173563485952, + "grad_norm": 1.70087730884552, + "learning_rate": 6.721814722734412e-05, + "loss": 1.2871, + "step": 17260 + }, + { + "epoch": 0.6181531684781636, + "grad_norm": 1.6571663618087769, + "learning_rate": 6.720718938814846e-05, + "loss": 1.4969, + "step": 17261 + }, + { + "epoch": 0.6181889806077319, + "grad_norm": 1.8243577480316162, + "learning_rate": 6.719623199013771e-05, + "loss": 1.343, + "step": 17262 + }, + { + "epoch": 0.6182247927373001, + "grad_norm": 1.3904130458831787, + "learning_rate": 6.718527503345939e-05, + "loss": 1.3714, + "step": 17263 + }, + { + "epoch": 0.6182606048668684, + "grad_norm": 1.2243188619613647, + "learning_rate": 6.717431851826086e-05, + "loss": 1.0458, + "step": 17264 + }, + { + "epoch": 0.6182964169964367, + "grad_norm": 1.2935394048690796, + "learning_rate": 6.716336244468954e-05, + "loss": 1.5686, + "step": 17265 + }, + { + "epoch": 0.6183322291260049, + "grad_norm": 1.9957224130630493, + "learning_rate": 6.715240681289279e-05, + "loss": 1.585, + "step": 17266 + }, + { + "epoch": 0.6183680412555732, + "grad_norm": 1.5544694662094116, + "learning_rate": 6.714145162301808e-05, + "loss": 1.2735, + "step": 17267 + }, + { + "epoch": 0.6184038533851416, + "grad_norm": 1.6578425168991089, + "learning_rate": 6.713049687521272e-05, + "loss": 1.4704, + "step": 17268 + }, + { + "epoch": 0.6184396655147099, + "grad_norm": 1.665738821029663, + "learning_rate": 6.711954256962414e-05, + "loss": 1.5401, + "step": 17269 + }, + { + "epoch": 0.6184754776442781, + "grad_norm": 1.3819257020950317, + "learning_rate": 6.71085887063997e-05, + "loss": 1.7921, + "step": 17270 + }, + { + "epoch": 0.6185112897738464, + "grad_norm": 1.3936008214950562, + "learning_rate": 6.709763528568677e-05, + "loss": 1.2616, + "step": 17271 + }, + { + "epoch": 0.6185471019034147, + "grad_norm": 1.3957215547561646, + "learning_rate": 6.708668230763272e-05, + "loss": 1.3208, + "step": 17272 + }, + { + "epoch": 0.6185829140329829, + "grad_norm": 1.659361481666565, + "learning_rate": 6.707572977238489e-05, + "loss": 1.3095, + "step": 17273 + }, + { + "epoch": 0.6186187261625512, + "grad_norm": 1.7726492881774902, + "learning_rate": 6.706477768009067e-05, + "loss": 1.7466, + "step": 17274 + }, + { + "epoch": 0.6186545382921196, + "grad_norm": 1.3715457916259766, + "learning_rate": 6.705382603089737e-05, + "loss": 1.6041, + "step": 17275 + }, + { + "epoch": 0.6186903504216879, + "grad_norm": 1.717397928237915, + "learning_rate": 6.704287482495233e-05, + "loss": 1.5324, + "step": 17276 + }, + { + "epoch": 0.6187261625512561, + "grad_norm": 1.2430498600006104, + "learning_rate": 6.70319240624029e-05, + "loss": 1.3185, + "step": 17277 + }, + { + "epoch": 0.6187619746808244, + "grad_norm": 1.5718120336532593, + "learning_rate": 6.702097374339644e-05, + "loss": 1.1736, + "step": 17278 + }, + { + "epoch": 0.6187977868103927, + "grad_norm": 1.5527795553207397, + "learning_rate": 6.701002386808021e-05, + "loss": 1.3052, + "step": 17279 + }, + { + "epoch": 0.6188335989399609, + "grad_norm": 1.8009364604949951, + "learning_rate": 6.699907443660156e-05, + "loss": 1.3913, + "step": 17280 + }, + { + "epoch": 0.6188694110695292, + "grad_norm": 3.0369083881378174, + "learning_rate": 6.698812544910781e-05, + "loss": 1.4698, + "step": 17281 + }, + { + "epoch": 0.6189052231990976, + "grad_norm": 1.604009747505188, + "learning_rate": 6.697717690574623e-05, + "loss": 1.4513, + "step": 17282 + }, + { + "epoch": 0.6189410353286658, + "grad_norm": 1.6419212818145752, + "learning_rate": 6.696622880666415e-05, + "loss": 1.2407, + "step": 17283 + }, + { + "epoch": 0.6189768474582341, + "grad_norm": 2.8187286853790283, + "learning_rate": 6.695528115200883e-05, + "loss": 1.8745, + "step": 17284 + }, + { + "epoch": 0.6190126595878024, + "grad_norm": 1.8177802562713623, + "learning_rate": 6.69443339419276e-05, + "loss": 1.3942, + "step": 17285 + }, + { + "epoch": 0.6190484717173707, + "grad_norm": 1.7879760265350342, + "learning_rate": 6.69333871765677e-05, + "loss": 1.7262, + "step": 17286 + }, + { + "epoch": 0.6190842838469389, + "grad_norm": 1.4644464254379272, + "learning_rate": 6.692244085607644e-05, + "loss": 1.3389, + "step": 17287 + }, + { + "epoch": 0.6191200959765072, + "grad_norm": 1.9216207265853882, + "learning_rate": 6.69114949806011e-05, + "loss": 1.7144, + "step": 17288 + }, + { + "epoch": 0.6191559081060756, + "grad_norm": 1.678239345550537, + "learning_rate": 6.690054955028885e-05, + "loss": 1.4053, + "step": 17289 + }, + { + "epoch": 0.6191917202356438, + "grad_norm": 2.289401054382324, + "learning_rate": 6.688960456528705e-05, + "loss": 1.5856, + "step": 17290 + }, + { + "epoch": 0.6192275323652121, + "grad_norm": 1.9415135383605957, + "learning_rate": 6.687866002574289e-05, + "loss": 1.5077, + "step": 17291 + }, + { + "epoch": 0.6192633444947804, + "grad_norm": 1.520308256149292, + "learning_rate": 6.686771593180365e-05, + "loss": 1.2985, + "step": 17292 + }, + { + "epoch": 0.6192991566243486, + "grad_norm": 1.429878830909729, + "learning_rate": 6.685677228361654e-05, + "loss": 1.5415, + "step": 17293 + }, + { + "epoch": 0.6193349687539169, + "grad_norm": 1.9562313556671143, + "learning_rate": 6.684582908132883e-05, + "loss": 1.6137, + "step": 17294 + }, + { + "epoch": 0.6193707808834852, + "grad_norm": 2.758021831512451, + "learning_rate": 6.68348863250877e-05, + "loss": 1.4741, + "step": 17295 + }, + { + "epoch": 0.6194065930130536, + "grad_norm": 1.5317972898483276, + "learning_rate": 6.682394401504042e-05, + "loss": 1.7757, + "step": 17296 + }, + { + "epoch": 0.6194424051426218, + "grad_norm": 1.6802877187728882, + "learning_rate": 6.681300215133419e-05, + "loss": 1.4877, + "step": 17297 + }, + { + "epoch": 0.6194782172721901, + "grad_norm": 1.4757306575775146, + "learning_rate": 6.680206073411616e-05, + "loss": 1.478, + "step": 17298 + }, + { + "epoch": 0.6195140294017584, + "grad_norm": 1.6011658906936646, + "learning_rate": 6.679111976353362e-05, + "loss": 1.6643, + "step": 17299 + }, + { + "epoch": 0.6195498415313266, + "grad_norm": 1.3765159845352173, + "learning_rate": 6.67801792397337e-05, + "loss": 1.3161, + "step": 17300 + }, + { + "epoch": 0.6195856536608949, + "grad_norm": 1.9646791219711304, + "learning_rate": 6.676923916286365e-05, + "loss": 1.4321, + "step": 17301 + }, + { + "epoch": 0.6196214657904632, + "grad_norm": 1.6221250295639038, + "learning_rate": 6.675829953307057e-05, + "loss": 1.4556, + "step": 17302 + }, + { + "epoch": 0.6196572779200316, + "grad_norm": 1.8408706188201904, + "learning_rate": 6.674736035050173e-05, + "loss": 1.3377, + "step": 17303 + }, + { + "epoch": 0.6196930900495998, + "grad_norm": 1.496396541595459, + "learning_rate": 6.673642161530424e-05, + "loss": 1.1605, + "step": 17304 + }, + { + "epoch": 0.6197289021791681, + "grad_norm": 1.4522924423217773, + "learning_rate": 6.672548332762533e-05, + "loss": 1.586, + "step": 17305 + }, + { + "epoch": 0.6197647143087364, + "grad_norm": 1.8669017553329468, + "learning_rate": 6.671454548761212e-05, + "loss": 1.3713, + "step": 17306 + }, + { + "epoch": 0.6198005264383046, + "grad_norm": 1.4297741651535034, + "learning_rate": 6.670360809541171e-05, + "loss": 1.4443, + "step": 17307 + }, + { + "epoch": 0.6198363385678729, + "grad_norm": 1.9161500930786133, + "learning_rate": 6.669267115117137e-05, + "loss": 1.2736, + "step": 17308 + }, + { + "epoch": 0.6198721506974412, + "grad_norm": 1.817219614982605, + "learning_rate": 6.66817346550381e-05, + "loss": 1.4487, + "step": 17309 + }, + { + "epoch": 0.6199079628270096, + "grad_norm": 1.851157307624817, + "learning_rate": 6.66707986071592e-05, + "loss": 1.5179, + "step": 17310 + }, + { + "epoch": 0.6199437749565778, + "grad_norm": 1.622918725013733, + "learning_rate": 6.665986300768163e-05, + "loss": 1.6099, + "step": 17311 + }, + { + "epoch": 0.6199795870861461, + "grad_norm": 1.516659140586853, + "learning_rate": 6.664892785675267e-05, + "loss": 1.4413, + "step": 17312 + }, + { + "epoch": 0.6200153992157144, + "grad_norm": 1.7160168886184692, + "learning_rate": 6.663799315451931e-05, + "loss": 1.743, + "step": 17313 + }, + { + "epoch": 0.6200512113452826, + "grad_norm": 1.5063496828079224, + "learning_rate": 6.662705890112876e-05, + "loss": 1.5858, + "step": 17314 + }, + { + "epoch": 0.6200870234748509, + "grad_norm": 1.5053106546401978, + "learning_rate": 6.661612509672808e-05, + "loss": 1.6044, + "step": 17315 + }, + { + "epoch": 0.6201228356044192, + "grad_norm": 1.300164818763733, + "learning_rate": 6.660519174146433e-05, + "loss": 1.6188, + "step": 17316 + }, + { + "epoch": 0.6201586477339875, + "grad_norm": 3.0625481605529785, + "learning_rate": 6.659425883548471e-05, + "loss": 1.6178, + "step": 17317 + }, + { + "epoch": 0.6201944598635558, + "grad_norm": 1.4336274862289429, + "learning_rate": 6.658332637893619e-05, + "loss": 1.656, + "step": 17318 + }, + { + "epoch": 0.6202302719931241, + "grad_norm": 2.179532051086426, + "learning_rate": 6.657239437196596e-05, + "loss": 1.6422, + "step": 17319 + }, + { + "epoch": 0.6202660841226924, + "grad_norm": 1.5122623443603516, + "learning_rate": 6.656146281472098e-05, + "loss": 1.5129, + "step": 17320 + }, + { + "epoch": 0.6203018962522606, + "grad_norm": 1.994327425956726, + "learning_rate": 6.655053170734846e-05, + "loss": 1.3558, + "step": 17321 + }, + { + "epoch": 0.6203377083818289, + "grad_norm": 2.061368227005005, + "learning_rate": 6.653960104999537e-05, + "loss": 1.2273, + "step": 17322 + }, + { + "epoch": 0.6203735205113972, + "grad_norm": 2.0309760570526123, + "learning_rate": 6.652867084280876e-05, + "loss": 1.288, + "step": 17323 + }, + { + "epoch": 0.6204093326409655, + "grad_norm": 1.7593995332717896, + "learning_rate": 6.651774108593574e-05, + "loss": 1.3319, + "step": 17324 + }, + { + "epoch": 0.6204451447705338, + "grad_norm": 1.9512978792190552, + "learning_rate": 6.650681177952328e-05, + "loss": 1.2813, + "step": 17325 + }, + { + "epoch": 0.6204809569001021, + "grad_norm": 1.8783950805664062, + "learning_rate": 6.64958829237185e-05, + "loss": 1.346, + "step": 17326 + }, + { + "epoch": 0.6205167690296703, + "grad_norm": 1.5158321857452393, + "learning_rate": 6.648495451866838e-05, + "loss": 1.526, + "step": 17327 + }, + { + "epoch": 0.6205525811592386, + "grad_norm": 1.927087426185608, + "learning_rate": 6.647402656451998e-05, + "loss": 1.628, + "step": 17328 + }, + { + "epoch": 0.6205883932888069, + "grad_norm": 1.507460117340088, + "learning_rate": 6.646309906142027e-05, + "loss": 1.6315, + "step": 17329 + }, + { + "epoch": 0.6206242054183752, + "grad_norm": 1.6917349100112915, + "learning_rate": 6.645217200951636e-05, + "loss": 1.7155, + "step": 17330 + }, + { + "epoch": 0.6206600175479435, + "grad_norm": 1.8586498498916626, + "learning_rate": 6.644124540895518e-05, + "loss": 1.39, + "step": 17331 + }, + { + "epoch": 0.6206958296775118, + "grad_norm": 2.0919973850250244, + "learning_rate": 6.643031925988375e-05, + "loss": 1.6374, + "step": 17332 + }, + { + "epoch": 0.6207316418070801, + "grad_norm": 1.7580621242523193, + "learning_rate": 6.641939356244908e-05, + "loss": 1.2822, + "step": 17333 + }, + { + "epoch": 0.6207674539366483, + "grad_norm": 1.9603501558303833, + "learning_rate": 6.640846831679815e-05, + "loss": 1.1557, + "step": 17334 + }, + { + "epoch": 0.6208032660662166, + "grad_norm": 1.6108684539794922, + "learning_rate": 6.639754352307794e-05, + "loss": 1.203, + "step": 17335 + }, + { + "epoch": 0.6208390781957849, + "grad_norm": 1.8438420295715332, + "learning_rate": 6.638661918143542e-05, + "loss": 1.335, + "step": 17336 + }, + { + "epoch": 0.6208748903253531, + "grad_norm": 1.7761483192443848, + "learning_rate": 6.637569529201763e-05, + "loss": 1.3621, + "step": 17337 + }, + { + "epoch": 0.6209107024549215, + "grad_norm": 1.4380193948745728, + "learning_rate": 6.636477185497145e-05, + "loss": 1.5014, + "step": 17338 + }, + { + "epoch": 0.6209465145844898, + "grad_norm": 2.0738120079040527, + "learning_rate": 6.63538488704439e-05, + "loss": 1.3528, + "step": 17339 + }, + { + "epoch": 0.6209823267140581, + "grad_norm": 1.7343157529830933, + "learning_rate": 6.634292633858191e-05, + "loss": 1.5682, + "step": 17340 + }, + { + "epoch": 0.6210181388436263, + "grad_norm": 1.6832574605941772, + "learning_rate": 6.633200425953241e-05, + "loss": 1.6061, + "step": 17341 + }, + { + "epoch": 0.6210539509731946, + "grad_norm": 1.3964799642562866, + "learning_rate": 6.632108263344238e-05, + "loss": 1.2056, + "step": 17342 + }, + { + "epoch": 0.6210897631027629, + "grad_norm": 1.7358297109603882, + "learning_rate": 6.631016146045874e-05, + "loss": 1.2508, + "step": 17343 + }, + { + "epoch": 0.6211255752323311, + "grad_norm": 1.7798261642456055, + "learning_rate": 6.629924074072844e-05, + "loss": 1.5458, + "step": 17344 + }, + { + "epoch": 0.6211613873618995, + "grad_norm": 1.7764931917190552, + "learning_rate": 6.628832047439835e-05, + "loss": 1.4655, + "step": 17345 + }, + { + "epoch": 0.6211971994914678, + "grad_norm": 2.0410125255584717, + "learning_rate": 6.627740066161545e-05, + "loss": 1.3842, + "step": 17346 + }, + { + "epoch": 0.6212330116210361, + "grad_norm": 1.4399596452713013, + "learning_rate": 6.62664813025266e-05, + "loss": 1.3467, + "step": 17347 + }, + { + "epoch": 0.6212688237506043, + "grad_norm": 1.5043425559997559, + "learning_rate": 6.625556239727875e-05, + "loss": 1.5523, + "step": 17348 + }, + { + "epoch": 0.6213046358801726, + "grad_norm": 1.7739176750183105, + "learning_rate": 6.624464394601879e-05, + "loss": 1.1175, + "step": 17349 + }, + { + "epoch": 0.6213404480097409, + "grad_norm": 1.4829490184783936, + "learning_rate": 6.623372594889358e-05, + "loss": 1.5191, + "step": 17350 + }, + { + "epoch": 0.6213762601393091, + "grad_norm": 1.4071301221847534, + "learning_rate": 6.622280840605005e-05, + "loss": 1.3898, + "step": 17351 + }, + { + "epoch": 0.6214120722688775, + "grad_norm": 3.8605194091796875, + "learning_rate": 6.621189131763505e-05, + "loss": 1.3449, + "step": 17352 + }, + { + "epoch": 0.6214478843984458, + "grad_norm": 1.5796082019805908, + "learning_rate": 6.620097468379548e-05, + "loss": 1.176, + "step": 17353 + }, + { + "epoch": 0.621483696528014, + "grad_norm": 1.4441759586334229, + "learning_rate": 6.619005850467818e-05, + "loss": 1.422, + "step": 17354 + }, + { + "epoch": 0.6215195086575823, + "grad_norm": 1.868899941444397, + "learning_rate": 6.617914278043005e-05, + "loss": 1.5502, + "step": 17355 + }, + { + "epoch": 0.6215553207871506, + "grad_norm": 1.4756184816360474, + "learning_rate": 6.616822751119792e-05, + "loss": 1.4594, + "step": 17356 + }, + { + "epoch": 0.6215911329167189, + "grad_norm": 1.7463960647583008, + "learning_rate": 6.615731269712864e-05, + "loss": 1.4894, + "step": 17357 + }, + { + "epoch": 0.6216269450462871, + "grad_norm": 1.8824423551559448, + "learning_rate": 6.614639833836908e-05, + "loss": 1.4013, + "step": 17358 + }, + { + "epoch": 0.6216627571758555, + "grad_norm": 1.6457586288452148, + "learning_rate": 6.613548443506605e-05, + "loss": 1.6566, + "step": 17359 + }, + { + "epoch": 0.6216985693054238, + "grad_norm": 2.025904893875122, + "learning_rate": 6.612457098736642e-05, + "loss": 1.5478, + "step": 17360 + }, + { + "epoch": 0.621734381434992, + "grad_norm": 1.5645883083343506, + "learning_rate": 6.611365799541695e-05, + "loss": 1.5564, + "step": 17361 + }, + { + "epoch": 0.6217701935645603, + "grad_norm": 1.3645058870315552, + "learning_rate": 6.610274545936455e-05, + "loss": 1.4919, + "step": 17362 + }, + { + "epoch": 0.6218060056941286, + "grad_norm": 1.9030429124832153, + "learning_rate": 6.609183337935594e-05, + "loss": 1.7562, + "step": 17363 + }, + { + "epoch": 0.6218418178236969, + "grad_norm": 1.3259434700012207, + "learning_rate": 6.6080921755538e-05, + "loss": 1.4657, + "step": 17364 + }, + { + "epoch": 0.6218776299532651, + "grad_norm": 1.6076582670211792, + "learning_rate": 6.607001058805749e-05, + "loss": 1.5636, + "step": 17365 + }, + { + "epoch": 0.6219134420828335, + "grad_norm": 1.4271641969680786, + "learning_rate": 6.605909987706125e-05, + "loss": 1.3398, + "step": 17366 + }, + { + "epoch": 0.6219492542124018, + "grad_norm": 1.5322728157043457, + "learning_rate": 6.604818962269602e-05, + "loss": 1.1012, + "step": 17367 + }, + { + "epoch": 0.62198506634197, + "grad_norm": 1.7007532119750977, + "learning_rate": 6.603727982510859e-05, + "loss": 1.3559, + "step": 17368 + }, + { + "epoch": 0.6220208784715383, + "grad_norm": 1.661720633506775, + "learning_rate": 6.602637048444578e-05, + "loss": 1.6362, + "step": 17369 + }, + { + "epoch": 0.6220566906011066, + "grad_norm": 1.6217849254608154, + "learning_rate": 6.60154616008543e-05, + "loss": 1.1952, + "step": 17370 + }, + { + "epoch": 0.6220925027306748, + "grad_norm": 1.706960678100586, + "learning_rate": 6.600455317448098e-05, + "loss": 1.4172, + "step": 17371 + }, + { + "epoch": 0.6221283148602431, + "grad_norm": 2.1763417720794678, + "learning_rate": 6.599364520547251e-05, + "loss": 1.653, + "step": 17372 + }, + { + "epoch": 0.6221641269898115, + "grad_norm": 1.2412214279174805, + "learning_rate": 6.598273769397572e-05, + "loss": 1.522, + "step": 17373 + }, + { + "epoch": 0.6221999391193798, + "grad_norm": 2.1275556087493896, + "learning_rate": 6.597183064013728e-05, + "loss": 1.3012, + "step": 17374 + }, + { + "epoch": 0.622235751248948, + "grad_norm": 1.7945011854171753, + "learning_rate": 6.5960924044104e-05, + "loss": 1.3099, + "step": 17375 + }, + { + "epoch": 0.6222715633785163, + "grad_norm": 1.436598300933838, + "learning_rate": 6.595001790602255e-05, + "loss": 1.347, + "step": 17376 + }, + { + "epoch": 0.6223073755080846, + "grad_norm": 1.3291562795639038, + "learning_rate": 6.593911222603969e-05, + "loss": 1.2779, + "step": 17377 + }, + { + "epoch": 0.6223431876376528, + "grad_norm": 1.5897414684295654, + "learning_rate": 6.592820700430215e-05, + "loss": 1.7034, + "step": 17378 + }, + { + "epoch": 0.6223789997672211, + "grad_norm": 1.8603501319885254, + "learning_rate": 6.591730224095663e-05, + "loss": 1.4712, + "step": 17379 + }, + { + "epoch": 0.6224148118967895, + "grad_norm": 2.969459056854248, + "learning_rate": 6.590639793614985e-05, + "loss": 1.3636, + "step": 17380 + }, + { + "epoch": 0.6224506240263578, + "grad_norm": 2.0669147968292236, + "learning_rate": 6.589549409002851e-05, + "loss": 1.5975, + "step": 17381 + }, + { + "epoch": 0.622486436155926, + "grad_norm": 2.064814329147339, + "learning_rate": 6.588459070273931e-05, + "loss": 1.3923, + "step": 17382 + }, + { + "epoch": 0.6225222482854943, + "grad_norm": 1.8036562204360962, + "learning_rate": 6.58736877744289e-05, + "loss": 1.8272, + "step": 17383 + }, + { + "epoch": 0.6225580604150626, + "grad_norm": 1.964530348777771, + "learning_rate": 6.586278530524405e-05, + "loss": 1.5506, + "step": 17384 + }, + { + "epoch": 0.6225938725446308, + "grad_norm": 1.5007630586624146, + "learning_rate": 6.58518832953314e-05, + "loss": 1.519, + "step": 17385 + }, + { + "epoch": 0.6226296846741991, + "grad_norm": 1.749772310256958, + "learning_rate": 6.584098174483754e-05, + "loss": 1.2044, + "step": 17386 + }, + { + "epoch": 0.6226654968037675, + "grad_norm": 1.545983910560608, + "learning_rate": 6.583008065390925e-05, + "loss": 1.4835, + "step": 17387 + }, + { + "epoch": 0.6227013089333358, + "grad_norm": 1.9022502899169922, + "learning_rate": 6.581918002269315e-05, + "loss": 1.415, + "step": 17388 + }, + { + "epoch": 0.622737121062904, + "grad_norm": 1.8049886226654053, + "learning_rate": 6.58082798513359e-05, + "loss": 1.5487, + "step": 17389 + }, + { + "epoch": 0.6227729331924723, + "grad_norm": 1.7206758260726929, + "learning_rate": 6.579738013998411e-05, + "loss": 1.6095, + "step": 17390 + }, + { + "epoch": 0.6228087453220406, + "grad_norm": 1.4274195432662964, + "learning_rate": 6.578648088878449e-05, + "loss": 1.5214, + "step": 17391 + }, + { + "epoch": 0.6228445574516088, + "grad_norm": 2.0439743995666504, + "learning_rate": 6.577558209788362e-05, + "loss": 1.6615, + "step": 17392 + }, + { + "epoch": 0.6228803695811771, + "grad_norm": 1.6557294130325317, + "learning_rate": 6.576468376742815e-05, + "loss": 1.5092, + "step": 17393 + }, + { + "epoch": 0.6229161817107455, + "grad_norm": 1.3613946437835693, + "learning_rate": 6.575378589756472e-05, + "loss": 1.4136, + "step": 17394 + }, + { + "epoch": 0.6229519938403137, + "grad_norm": 1.593880295753479, + "learning_rate": 6.574288848843988e-05, + "loss": 1.2501, + "step": 17395 + }, + { + "epoch": 0.622987805969882, + "grad_norm": 2.1387696266174316, + "learning_rate": 6.573199154020033e-05, + "loss": 1.563, + "step": 17396 + }, + { + "epoch": 0.6230236180994503, + "grad_norm": 1.517091989517212, + "learning_rate": 6.57210950529926e-05, + "loss": 1.4276, + "step": 17397 + }, + { + "epoch": 0.6230594302290186, + "grad_norm": 1.2697229385375977, + "learning_rate": 6.571019902696335e-05, + "loss": 1.281, + "step": 17398 + }, + { + "epoch": 0.6230952423585868, + "grad_norm": 1.6039862632751465, + "learning_rate": 6.569930346225909e-05, + "loss": 1.5502, + "step": 17399 + }, + { + "epoch": 0.6231310544881551, + "grad_norm": 1.4640159606933594, + "learning_rate": 6.56884083590265e-05, + "loss": 1.2747, + "step": 17400 + }, + { + "epoch": 0.6231668666177235, + "grad_norm": 1.5991400480270386, + "learning_rate": 6.567751371741209e-05, + "loss": 1.6825, + "step": 17401 + }, + { + "epoch": 0.6232026787472917, + "grad_norm": 1.5831888914108276, + "learning_rate": 6.566661953756248e-05, + "loss": 1.3705, + "step": 17402 + }, + { + "epoch": 0.62323849087686, + "grad_norm": 1.7924561500549316, + "learning_rate": 6.565572581962425e-05, + "loss": 1.5186, + "step": 17403 + }, + { + "epoch": 0.6232743030064283, + "grad_norm": 2.017235040664673, + "learning_rate": 6.564483256374386e-05, + "loss": 1.1964, + "step": 17404 + }, + { + "epoch": 0.6233101151359965, + "grad_norm": 1.7464269399642944, + "learning_rate": 6.5633939770068e-05, + "loss": 1.7076, + "step": 17405 + }, + { + "epoch": 0.6233459272655648, + "grad_norm": 1.9727259874343872, + "learning_rate": 6.562304743874308e-05, + "loss": 1.4771, + "step": 17406 + }, + { + "epoch": 0.6233817393951331, + "grad_norm": 1.5742576122283936, + "learning_rate": 6.561215556991578e-05, + "loss": 1.3384, + "step": 17407 + }, + { + "epoch": 0.6234175515247015, + "grad_norm": 1.320135235786438, + "learning_rate": 6.56012641637325e-05, + "loss": 1.4939, + "step": 17408 + }, + { + "epoch": 0.6234533636542697, + "grad_norm": 1.445614218711853, + "learning_rate": 6.559037322033991e-05, + "loss": 1.5254, + "step": 17409 + }, + { + "epoch": 0.623489175783838, + "grad_norm": 1.6739253997802734, + "learning_rate": 6.55794827398844e-05, + "loss": 1.1231, + "step": 17410 + }, + { + "epoch": 0.6235249879134063, + "grad_norm": 1.5012933015823364, + "learning_rate": 6.556859272251261e-05, + "loss": 1.7445, + "step": 17411 + }, + { + "epoch": 0.6235608000429745, + "grad_norm": 1.6745601892471313, + "learning_rate": 6.555770316837098e-05, + "loss": 1.4139, + "step": 17412 + }, + { + "epoch": 0.6235966121725428, + "grad_norm": 1.7321993112564087, + "learning_rate": 6.554681407760598e-05, + "loss": 1.4242, + "step": 17413 + }, + { + "epoch": 0.6236324243021111, + "grad_norm": 1.5765000581741333, + "learning_rate": 6.553592545036421e-05, + "loss": 1.5936, + "step": 17414 + }, + { + "epoch": 0.6236682364316795, + "grad_norm": 2.3662610054016113, + "learning_rate": 6.552503728679204e-05, + "loss": 1.3233, + "step": 17415 + }, + { + "epoch": 0.6237040485612477, + "grad_norm": 1.363168478012085, + "learning_rate": 6.551414958703611e-05, + "loss": 1.3414, + "step": 17416 + }, + { + "epoch": 0.623739860690816, + "grad_norm": 1.336014986038208, + "learning_rate": 6.550326235124274e-05, + "loss": 1.4875, + "step": 17417 + }, + { + "epoch": 0.6237756728203843, + "grad_norm": 1.4121977090835571, + "learning_rate": 6.549237557955854e-05, + "loss": 1.8272, + "step": 17418 + }, + { + "epoch": 0.6238114849499525, + "grad_norm": 1.8297215700149536, + "learning_rate": 6.54814892721299e-05, + "loss": 1.1333, + "step": 17419 + }, + { + "epoch": 0.6238472970795208, + "grad_norm": 1.8196252584457397, + "learning_rate": 6.547060342910324e-05, + "loss": 1.7198, + "step": 17420 + }, + { + "epoch": 0.6238831092090891, + "grad_norm": 2.3765392303466797, + "learning_rate": 6.545971805062514e-05, + "loss": 1.7214, + "step": 17421 + }, + { + "epoch": 0.6239189213386575, + "grad_norm": 1.7555404901504517, + "learning_rate": 6.544883313684193e-05, + "loss": 1.4639, + "step": 17422 + }, + { + "epoch": 0.6239547334682257, + "grad_norm": 1.5131053924560547, + "learning_rate": 6.543794868790015e-05, + "loss": 1.2069, + "step": 17423 + }, + { + "epoch": 0.623990545597794, + "grad_norm": 1.5095545053482056, + "learning_rate": 6.542706470394614e-05, + "loss": 1.6183, + "step": 17424 + }, + { + "epoch": 0.6240263577273623, + "grad_norm": 2.3657705783843994, + "learning_rate": 6.54161811851264e-05, + "loss": 1.8115, + "step": 17425 + }, + { + "epoch": 0.6240621698569305, + "grad_norm": 1.826514482498169, + "learning_rate": 6.540529813158732e-05, + "loss": 1.5298, + "step": 17426 + }, + { + "epoch": 0.6240979819864988, + "grad_norm": 1.7273763418197632, + "learning_rate": 6.539441554347537e-05, + "loss": 1.4563, + "step": 17427 + }, + { + "epoch": 0.6241337941160671, + "grad_norm": 2.3398561477661133, + "learning_rate": 6.538353342093689e-05, + "loss": 1.6562, + "step": 17428 + }, + { + "epoch": 0.6241696062456354, + "grad_norm": 1.5986614227294922, + "learning_rate": 6.537265176411831e-05, + "loss": 1.5047, + "step": 17429 + }, + { + "epoch": 0.6242054183752037, + "grad_norm": 1.5671006441116333, + "learning_rate": 6.536177057316605e-05, + "loss": 1.452, + "step": 17430 + }, + { + "epoch": 0.624241230504772, + "grad_norm": 1.5327402353286743, + "learning_rate": 6.535088984822647e-05, + "loss": 1.5385, + "step": 17431 + }, + { + "epoch": 0.6242770426343403, + "grad_norm": 1.9305261373519897, + "learning_rate": 6.5340009589446e-05, + "loss": 1.5302, + "step": 17432 + }, + { + "epoch": 0.6243128547639085, + "grad_norm": 1.7964725494384766, + "learning_rate": 6.532912979697095e-05, + "loss": 1.6203, + "step": 17433 + }, + { + "epoch": 0.6243486668934768, + "grad_norm": 1.4314013719558716, + "learning_rate": 6.531825047094778e-05, + "loss": 1.6102, + "step": 17434 + }, + { + "epoch": 0.6243844790230451, + "grad_norm": 1.3836110830307007, + "learning_rate": 6.530737161152278e-05, + "loss": 1.6876, + "step": 17435 + }, + { + "epoch": 0.6244202911526134, + "grad_norm": 1.6772618293762207, + "learning_rate": 6.529649321884237e-05, + "loss": 1.4962, + "step": 17436 + }, + { + "epoch": 0.6244561032821817, + "grad_norm": 1.6304844617843628, + "learning_rate": 6.528561529305289e-05, + "loss": 1.2627, + "step": 17437 + }, + { + "epoch": 0.62449191541175, + "grad_norm": 1.0868332386016846, + "learning_rate": 6.527473783430064e-05, + "loss": 1.2082, + "step": 17438 + }, + { + "epoch": 0.6245277275413182, + "grad_norm": 1.9123562574386597, + "learning_rate": 6.526386084273202e-05, + "loss": 1.535, + "step": 17439 + }, + { + "epoch": 0.6245635396708865, + "grad_norm": 1.7259448766708374, + "learning_rate": 6.525298431849334e-05, + "loss": 1.6374, + "step": 17440 + }, + { + "epoch": 0.6245993518004548, + "grad_norm": 1.6573362350463867, + "learning_rate": 6.524210826173094e-05, + "loss": 1.4254, + "step": 17441 + }, + { + "epoch": 0.624635163930023, + "grad_norm": 1.7752695083618164, + "learning_rate": 6.523123267259113e-05, + "loss": 1.3835, + "step": 17442 + }, + { + "epoch": 0.6246709760595914, + "grad_norm": 2.4315598011016846, + "learning_rate": 6.522035755122024e-05, + "loss": 1.5414, + "step": 17443 + }, + { + "epoch": 0.6247067881891597, + "grad_norm": 1.4300681352615356, + "learning_rate": 6.520948289776459e-05, + "loss": 1.7243, + "step": 17444 + }, + { + "epoch": 0.624742600318728, + "grad_norm": 1.8628979921340942, + "learning_rate": 6.519860871237046e-05, + "loss": 1.4246, + "step": 17445 + }, + { + "epoch": 0.6247784124482962, + "grad_norm": 2.1132607460021973, + "learning_rate": 6.518773499518418e-05, + "loss": 1.5807, + "step": 17446 + }, + { + "epoch": 0.6248142245778645, + "grad_norm": 1.2737668752670288, + "learning_rate": 6.517686174635198e-05, + "loss": 1.4673, + "step": 17447 + }, + { + "epoch": 0.6248500367074328, + "grad_norm": 1.5888752937316895, + "learning_rate": 6.516598896602022e-05, + "loss": 1.4605, + "step": 17448 + }, + { + "epoch": 0.624885848837001, + "grad_norm": 1.590665578842163, + "learning_rate": 6.515511665433513e-05, + "loss": 1.5737, + "step": 17449 + }, + { + "epoch": 0.6249216609665694, + "grad_norm": 2.3191263675689697, + "learning_rate": 6.514424481144301e-05, + "loss": 1.3844, + "step": 17450 + }, + { + "epoch": 0.6249574730961377, + "grad_norm": 1.6052703857421875, + "learning_rate": 6.513337343749008e-05, + "loss": 1.6712, + "step": 17451 + }, + { + "epoch": 0.624993285225706, + "grad_norm": 1.7228527069091797, + "learning_rate": 6.512250253262268e-05, + "loss": 1.4053, + "step": 17452 + }, + { + "epoch": 0.6250290973552742, + "grad_norm": 2.069012403488159, + "learning_rate": 6.511163209698701e-05, + "loss": 1.5595, + "step": 17453 + }, + { + "epoch": 0.6250649094848425, + "grad_norm": 1.8847413063049316, + "learning_rate": 6.510076213072932e-05, + "loss": 1.4639, + "step": 17454 + }, + { + "epoch": 0.6251007216144108, + "grad_norm": 1.9726908206939697, + "learning_rate": 6.508989263399588e-05, + "loss": 1.2424, + "step": 17455 + }, + { + "epoch": 0.625136533743979, + "grad_norm": 1.4060355424880981, + "learning_rate": 6.507902360693286e-05, + "loss": 1.4809, + "step": 17456 + }, + { + "epoch": 0.6251723458735474, + "grad_norm": 1.643329381942749, + "learning_rate": 6.506815504968657e-05, + "loss": 1.4356, + "step": 17457 + }, + { + "epoch": 0.6252081580031157, + "grad_norm": 1.3762401342391968, + "learning_rate": 6.505728696240316e-05, + "loss": 1.3249, + "step": 17458 + }, + { + "epoch": 0.625243970132684, + "grad_norm": 1.967424988746643, + "learning_rate": 6.504641934522892e-05, + "loss": 1.3091, + "step": 17459 + }, + { + "epoch": 0.6252797822622522, + "grad_norm": 1.5457974672317505, + "learning_rate": 6.503555219830999e-05, + "loss": 1.1949, + "step": 17460 + }, + { + "epoch": 0.6253155943918205, + "grad_norm": 2.106031656265259, + "learning_rate": 6.502468552179263e-05, + "loss": 1.6295, + "step": 17461 + }, + { + "epoch": 0.6253514065213888, + "grad_norm": 1.3233288526535034, + "learning_rate": 6.501381931582297e-05, + "loss": 1.4302, + "step": 17462 + }, + { + "epoch": 0.625387218650957, + "grad_norm": 2.170650005340576, + "learning_rate": 6.500295358054729e-05, + "loss": 1.4235, + "step": 17463 + }, + { + "epoch": 0.6254230307805254, + "grad_norm": 1.5497565269470215, + "learning_rate": 6.499208831611172e-05, + "loss": 1.2203, + "step": 17464 + }, + { + "epoch": 0.6254588429100937, + "grad_norm": 2.3494434356689453, + "learning_rate": 6.498122352266242e-05, + "loss": 1.5483, + "step": 17465 + }, + { + "epoch": 0.625494655039662, + "grad_norm": 1.4786423444747925, + "learning_rate": 6.497035920034561e-05, + "loss": 1.4961, + "step": 17466 + }, + { + "epoch": 0.6255304671692302, + "grad_norm": 1.964049220085144, + "learning_rate": 6.49594953493074e-05, + "loss": 1.5353, + "step": 17467 + }, + { + "epoch": 0.6255662792987985, + "grad_norm": 1.518816351890564, + "learning_rate": 6.494863196969403e-05, + "loss": 1.7526, + "step": 17468 + }, + { + "epoch": 0.6256020914283668, + "grad_norm": 1.3744655847549438, + "learning_rate": 6.493776906165155e-05, + "loss": 1.5259, + "step": 17469 + }, + { + "epoch": 0.625637903557935, + "grad_norm": 1.8796976804733276, + "learning_rate": 6.49269066253262e-05, + "loss": 1.6298, + "step": 17470 + }, + { + "epoch": 0.6256737156875034, + "grad_norm": 1.5563455820083618, + "learning_rate": 6.491604466086405e-05, + "loss": 1.4925, + "step": 17471 + }, + { + "epoch": 0.6257095278170717, + "grad_norm": 1.5489403009414673, + "learning_rate": 6.49051831684113e-05, + "loss": 1.2473, + "step": 17472 + }, + { + "epoch": 0.62574533994664, + "grad_norm": 1.5099996328353882, + "learning_rate": 6.489432214811403e-05, + "loss": 1.2018, + "step": 17473 + }, + { + "epoch": 0.6257811520762082, + "grad_norm": 1.9059803485870361, + "learning_rate": 6.488346160011835e-05, + "loss": 1.2401, + "step": 17474 + }, + { + "epoch": 0.6258169642057765, + "grad_norm": 1.7727347612380981, + "learning_rate": 6.487260152457041e-05, + "loss": 1.5937, + "step": 17475 + }, + { + "epoch": 0.6258527763353448, + "grad_norm": 1.7974789142608643, + "learning_rate": 6.486174192161632e-05, + "loss": 1.5348, + "step": 17476 + }, + { + "epoch": 0.625888588464913, + "grad_norm": 1.85459566116333, + "learning_rate": 6.485088279140214e-05, + "loss": 1.7568, + "step": 17477 + }, + { + "epoch": 0.6259244005944814, + "grad_norm": 1.5206538438796997, + "learning_rate": 6.484002413407401e-05, + "loss": 1.3962, + "step": 17478 + }, + { + "epoch": 0.6259602127240497, + "grad_norm": 1.4857813119888306, + "learning_rate": 6.4829165949778e-05, + "loss": 1.5086, + "step": 17479 + }, + { + "epoch": 0.6259960248536179, + "grad_norm": 1.4960447549819946, + "learning_rate": 6.481830823866018e-05, + "loss": 1.3399, + "step": 17480 + }, + { + "epoch": 0.6260318369831862, + "grad_norm": 1.2727571725845337, + "learning_rate": 6.480745100086668e-05, + "loss": 1.4466, + "step": 17481 + }, + { + "epoch": 0.6260676491127545, + "grad_norm": 1.508490800857544, + "learning_rate": 6.479659423654352e-05, + "loss": 1.4005, + "step": 17482 + }, + { + "epoch": 0.6261034612423227, + "grad_norm": 1.6986658573150635, + "learning_rate": 6.478573794583673e-05, + "loss": 1.4465, + "step": 17483 + }, + { + "epoch": 0.626139273371891, + "grad_norm": 1.3787263631820679, + "learning_rate": 6.477488212889246e-05, + "loss": 1.4511, + "step": 17484 + }, + { + "epoch": 0.6261750855014594, + "grad_norm": 1.7577788829803467, + "learning_rate": 6.476402678585669e-05, + "loss": 1.1861, + "step": 17485 + }, + { + "epoch": 0.6262108976310277, + "grad_norm": 1.4000847339630127, + "learning_rate": 6.47531719168755e-05, + "loss": 1.5713, + "step": 17486 + }, + { + "epoch": 0.6262467097605959, + "grad_norm": 2.0742688179016113, + "learning_rate": 6.474231752209492e-05, + "loss": 1.5913, + "step": 17487 + }, + { + "epoch": 0.6262825218901642, + "grad_norm": 1.7926561832427979, + "learning_rate": 6.473146360166098e-05, + "loss": 1.533, + "step": 17488 + }, + { + "epoch": 0.6263183340197325, + "grad_norm": 1.5476453304290771, + "learning_rate": 6.472061015571968e-05, + "loss": 1.4394, + "step": 17489 + }, + { + "epoch": 0.6263541461493007, + "grad_norm": 1.240106225013733, + "learning_rate": 6.47097571844171e-05, + "loss": 1.4792, + "step": 17490 + }, + { + "epoch": 0.626389958278869, + "grad_norm": 1.3444880247116089, + "learning_rate": 6.469890468789922e-05, + "loss": 1.5799, + "step": 17491 + }, + { + "epoch": 0.6264257704084374, + "grad_norm": 1.7917805910110474, + "learning_rate": 6.468805266631199e-05, + "loss": 1.5301, + "step": 17492 + }, + { + "epoch": 0.6264615825380057, + "grad_norm": 2.6330838203430176, + "learning_rate": 6.467720111980151e-05, + "loss": 1.6501, + "step": 17493 + }, + { + "epoch": 0.6264973946675739, + "grad_norm": 1.631439208984375, + "learning_rate": 6.466635004851367e-05, + "loss": 1.368, + "step": 17494 + }, + { + "epoch": 0.6265332067971422, + "grad_norm": 1.8269366025924683, + "learning_rate": 6.46554994525946e-05, + "loss": 1.6264, + "step": 17495 + }, + { + "epoch": 0.6265690189267105, + "grad_norm": 1.577916145324707, + "learning_rate": 6.46446493321901e-05, + "loss": 1.3685, + "step": 17496 + }, + { + "epoch": 0.6266048310562787, + "grad_norm": 1.450053095817566, + "learning_rate": 6.46337996874463e-05, + "loss": 1.572, + "step": 17497 + }, + { + "epoch": 0.626640643185847, + "grad_norm": 1.6219733953475952, + "learning_rate": 6.462295051850907e-05, + "loss": 1.4317, + "step": 17498 + }, + { + "epoch": 0.6266764553154154, + "grad_norm": 1.7506242990493774, + "learning_rate": 6.461210182552444e-05, + "loss": 1.5448, + "step": 17499 + }, + { + "epoch": 0.6267122674449837, + "grad_norm": 1.4958577156066895, + "learning_rate": 6.460125360863835e-05, + "loss": 1.1158, + "step": 17500 + }, + { + "epoch": 0.6267480795745519, + "grad_norm": 1.796212077140808, + "learning_rate": 6.459040586799666e-05, + "loss": 1.5875, + "step": 17501 + }, + { + "epoch": 0.6267838917041202, + "grad_norm": 1.9976341724395752, + "learning_rate": 6.457955860374545e-05, + "loss": 1.4557, + "step": 17502 + }, + { + "epoch": 0.6268197038336885, + "grad_norm": 1.7986866235733032, + "learning_rate": 6.456871181603054e-05, + "loss": 1.7378, + "step": 17503 + }, + { + "epoch": 0.6268555159632567, + "grad_norm": 1.322219967842102, + "learning_rate": 6.455786550499796e-05, + "loss": 1.3182, + "step": 17504 + }, + { + "epoch": 0.626891328092825, + "grad_norm": 1.5401942729949951, + "learning_rate": 6.454701967079354e-05, + "loss": 1.5181, + "step": 17505 + }, + { + "epoch": 0.6269271402223934, + "grad_norm": 2.3169422149658203, + "learning_rate": 6.453617431356327e-05, + "loss": 1.8039, + "step": 17506 + }, + { + "epoch": 0.6269629523519616, + "grad_norm": 1.2413296699523926, + "learning_rate": 6.452532943345298e-05, + "loss": 1.6122, + "step": 17507 + }, + { + "epoch": 0.6269987644815299, + "grad_norm": 1.2607512474060059, + "learning_rate": 6.451448503060868e-05, + "loss": 1.3618, + "step": 17508 + }, + { + "epoch": 0.6270345766110982, + "grad_norm": 1.9498603343963623, + "learning_rate": 6.45036411051762e-05, + "loss": 1.2638, + "step": 17509 + }, + { + "epoch": 0.6270703887406665, + "grad_norm": 1.3453820943832397, + "learning_rate": 6.449279765730141e-05, + "loss": 1.6058, + "step": 17510 + }, + { + "epoch": 0.6271062008702347, + "grad_norm": 1.8720459938049316, + "learning_rate": 6.448195468713028e-05, + "loss": 1.231, + "step": 17511 + }, + { + "epoch": 0.627142012999803, + "grad_norm": 1.402961254119873, + "learning_rate": 6.447111219480857e-05, + "loss": 1.5311, + "step": 17512 + }, + { + "epoch": 0.6271778251293714, + "grad_norm": 1.858162760734558, + "learning_rate": 6.446027018048228e-05, + "loss": 1.5916, + "step": 17513 + }, + { + "epoch": 0.6272136372589396, + "grad_norm": 2.5841658115386963, + "learning_rate": 6.444942864429713e-05, + "loss": 1.3853, + "step": 17514 + }, + { + "epoch": 0.6272494493885079, + "grad_norm": 1.7156753540039062, + "learning_rate": 6.443858758639916e-05, + "loss": 1.5192, + "step": 17515 + }, + { + "epoch": 0.6272852615180762, + "grad_norm": 1.7334517240524292, + "learning_rate": 6.442774700693408e-05, + "loss": 1.2638, + "step": 17516 + }, + { + "epoch": 0.6273210736476444, + "grad_norm": 1.4472137689590454, + "learning_rate": 6.441690690604775e-05, + "loss": 1.9229, + "step": 17517 + }, + { + "epoch": 0.6273568857772127, + "grad_norm": 1.426712989807129, + "learning_rate": 6.440606728388607e-05, + "loss": 1.4067, + "step": 17518 + }, + { + "epoch": 0.627392697906781, + "grad_norm": 1.6888214349746704, + "learning_rate": 6.439522814059483e-05, + "loss": 1.5726, + "step": 17519 + }, + { + "epoch": 0.6274285100363494, + "grad_norm": 1.7400039434432983, + "learning_rate": 6.438438947631989e-05, + "loss": 1.3974, + "step": 17520 + }, + { + "epoch": 0.6274643221659176, + "grad_norm": 1.7932146787643433, + "learning_rate": 6.437355129120701e-05, + "loss": 1.3313, + "step": 17521 + }, + { + "epoch": 0.6275001342954859, + "grad_norm": 1.576096773147583, + "learning_rate": 6.436271358540206e-05, + "loss": 1.0929, + "step": 17522 + }, + { + "epoch": 0.6275359464250542, + "grad_norm": 1.7114145755767822, + "learning_rate": 6.435187635905082e-05, + "loss": 1.0729, + "step": 17523 + }, + { + "epoch": 0.6275717585546224, + "grad_norm": 1.7404381036758423, + "learning_rate": 6.434103961229913e-05, + "loss": 1.6533, + "step": 17524 + }, + { + "epoch": 0.6276075706841907, + "grad_norm": 1.227808952331543, + "learning_rate": 6.433020334529275e-05, + "loss": 1.4551, + "step": 17525 + }, + { + "epoch": 0.627643382813759, + "grad_norm": 1.4737696647644043, + "learning_rate": 6.431936755817746e-05, + "loss": 1.5414, + "step": 17526 + }, + { + "epoch": 0.6276791949433274, + "grad_norm": 1.8455876111984253, + "learning_rate": 6.430853225109908e-05, + "loss": 1.9524, + "step": 17527 + }, + { + "epoch": 0.6277150070728956, + "grad_norm": 1.4521657228469849, + "learning_rate": 6.42976974242033e-05, + "loss": 1.5804, + "step": 17528 + }, + { + "epoch": 0.6277508192024639, + "grad_norm": 1.937605381011963, + "learning_rate": 6.428686307763601e-05, + "loss": 1.1705, + "step": 17529 + }, + { + "epoch": 0.6277866313320322, + "grad_norm": 1.5884835720062256, + "learning_rate": 6.427602921154287e-05, + "loss": 1.5303, + "step": 17530 + }, + { + "epoch": 0.6278224434616004, + "grad_norm": 1.7557952404022217, + "learning_rate": 6.426519582606971e-05, + "loss": 1.269, + "step": 17531 + }, + { + "epoch": 0.6278582555911687, + "grad_norm": 1.5802031755447388, + "learning_rate": 6.42543629213622e-05, + "loss": 1.5768, + "step": 17532 + }, + { + "epoch": 0.627894067720737, + "grad_norm": 1.7923305034637451, + "learning_rate": 6.424353049756618e-05, + "loss": 1.411, + "step": 17533 + }, + { + "epoch": 0.6279298798503054, + "grad_norm": 1.7670484781265259, + "learning_rate": 6.423269855482732e-05, + "loss": 1.6283, + "step": 17534 + }, + { + "epoch": 0.6279656919798736, + "grad_norm": 1.5500022172927856, + "learning_rate": 6.422186709329134e-05, + "loss": 1.5638, + "step": 17535 + }, + { + "epoch": 0.6280015041094419, + "grad_norm": 1.745823860168457, + "learning_rate": 6.421103611310402e-05, + "loss": 1.4258, + "step": 17536 + }, + { + "epoch": 0.6280373162390102, + "grad_norm": 1.6846976280212402, + "learning_rate": 6.420020561441101e-05, + "loss": 1.3898, + "step": 17537 + }, + { + "epoch": 0.6280731283685784, + "grad_norm": 1.477970004081726, + "learning_rate": 6.41893755973581e-05, + "loss": 1.4848, + "step": 17538 + }, + { + "epoch": 0.6281089404981467, + "grad_norm": 1.6424835920333862, + "learning_rate": 6.417854606209091e-05, + "loss": 1.3707, + "step": 17539 + }, + { + "epoch": 0.628144752627715, + "grad_norm": 1.994249939918518, + "learning_rate": 6.41677170087552e-05, + "loss": 1.7501, + "step": 17540 + }, + { + "epoch": 0.6281805647572833, + "grad_norm": 2.2836825847625732, + "learning_rate": 6.41568884374966e-05, + "loss": 1.3445, + "step": 17541 + }, + { + "epoch": 0.6282163768868516, + "grad_norm": 1.7882944345474243, + "learning_rate": 6.414606034846087e-05, + "loss": 1.2719, + "step": 17542 + }, + { + "epoch": 0.6282521890164199, + "grad_norm": 1.532196044921875, + "learning_rate": 6.413523274179365e-05, + "loss": 1.3419, + "step": 17543 + }, + { + "epoch": 0.6282880011459882, + "grad_norm": 1.7860722541809082, + "learning_rate": 6.412440561764059e-05, + "loss": 1.613, + "step": 17544 + }, + { + "epoch": 0.6283238132755564, + "grad_norm": 1.4192113876342773, + "learning_rate": 6.411357897614738e-05, + "loss": 1.2385, + "step": 17545 + }, + { + "epoch": 0.6283596254051247, + "grad_norm": 1.6506589651107788, + "learning_rate": 6.410275281745967e-05, + "loss": 1.4374, + "step": 17546 + }, + { + "epoch": 0.628395437534693, + "grad_norm": 1.6427680253982544, + "learning_rate": 6.409192714172314e-05, + "loss": 1.4561, + "step": 17547 + }, + { + "epoch": 0.6284312496642613, + "grad_norm": 1.5878175497055054, + "learning_rate": 6.408110194908338e-05, + "loss": 1.3458, + "step": 17548 + }, + { + "epoch": 0.6284670617938296, + "grad_norm": 1.747763752937317, + "learning_rate": 6.407027723968611e-05, + "loss": 1.3013, + "step": 17549 + }, + { + "epoch": 0.6285028739233979, + "grad_norm": 1.7274367809295654, + "learning_rate": 6.405945301367687e-05, + "loss": 1.7029, + "step": 17550 + }, + { + "epoch": 0.6285386860529661, + "grad_norm": 1.8182168006896973, + "learning_rate": 6.404862927120134e-05, + "loss": 1.3585, + "step": 17551 + }, + { + "epoch": 0.6285744981825344, + "grad_norm": 1.4165538549423218, + "learning_rate": 6.403780601240514e-05, + "loss": 1.2135, + "step": 17552 + }, + { + "epoch": 0.6286103103121027, + "grad_norm": 1.433510661125183, + "learning_rate": 6.402698323743385e-05, + "loss": 1.313, + "step": 17553 + }, + { + "epoch": 0.628646122441671, + "grad_norm": 1.786746621131897, + "learning_rate": 6.401616094643312e-05, + "loss": 1.6242, + "step": 17554 + }, + { + "epoch": 0.6286819345712393, + "grad_norm": 1.820902705192566, + "learning_rate": 6.400533913954851e-05, + "loss": 1.5524, + "step": 17555 + }, + { + "epoch": 0.6287177467008076, + "grad_norm": 1.55777907371521, + "learning_rate": 6.399451781692567e-05, + "loss": 1.2882, + "step": 17556 + }, + { + "epoch": 0.6287535588303759, + "grad_norm": 1.7453067302703857, + "learning_rate": 6.398369697871011e-05, + "loss": 1.6019, + "step": 17557 + }, + { + "epoch": 0.6287893709599441, + "grad_norm": 1.8225655555725098, + "learning_rate": 6.397287662504747e-05, + "loss": 1.365, + "step": 17558 + }, + { + "epoch": 0.6288251830895124, + "grad_norm": 1.931082010269165, + "learning_rate": 6.39620567560833e-05, + "loss": 1.4394, + "step": 17559 + }, + { + "epoch": 0.6288609952190807, + "grad_norm": 2.121443271636963, + "learning_rate": 6.395123737196316e-05, + "loss": 1.3972, + "step": 17560 + }, + { + "epoch": 0.628896807348649, + "grad_norm": 1.686209797859192, + "learning_rate": 6.394041847283263e-05, + "loss": 1.4342, + "step": 17561 + }, + { + "epoch": 0.6289326194782173, + "grad_norm": 1.546787977218628, + "learning_rate": 6.392960005883726e-05, + "loss": 1.4053, + "step": 17562 + }, + { + "epoch": 0.6289684316077856, + "grad_norm": 1.6761842966079712, + "learning_rate": 6.391878213012258e-05, + "loss": 1.4526, + "step": 17563 + }, + { + "epoch": 0.6290042437373539, + "grad_norm": 2.2791762351989746, + "learning_rate": 6.390796468683416e-05, + "loss": 1.5521, + "step": 17564 + }, + { + "epoch": 0.6290400558669221, + "grad_norm": 1.24517822265625, + "learning_rate": 6.389714772911751e-05, + "loss": 1.4677, + "step": 17565 + }, + { + "epoch": 0.6290758679964904, + "grad_norm": 1.4214081764221191, + "learning_rate": 6.388633125711816e-05, + "loss": 1.5366, + "step": 17566 + }, + { + "epoch": 0.6291116801260587, + "grad_norm": 1.4436924457550049, + "learning_rate": 6.387551527098165e-05, + "loss": 1.5554, + "step": 17567 + }, + { + "epoch": 0.6291474922556269, + "grad_norm": 1.6681147813796997, + "learning_rate": 6.386469977085348e-05, + "loss": 1.1127, + "step": 17568 + }, + { + "epoch": 0.6291833043851953, + "grad_norm": 2.224066734313965, + "learning_rate": 6.385388475687918e-05, + "loss": 1.6578, + "step": 17569 + }, + { + "epoch": 0.6292191165147636, + "grad_norm": 1.7209625244140625, + "learning_rate": 6.384307022920424e-05, + "loss": 1.6013, + "step": 17570 + }, + { + "epoch": 0.6292549286443319, + "grad_norm": 1.4986475706100464, + "learning_rate": 6.383225618797412e-05, + "loss": 1.4255, + "step": 17571 + }, + { + "epoch": 0.6292907407739001, + "grad_norm": 1.6216480731964111, + "learning_rate": 6.382144263333436e-05, + "loss": 1.6899, + "step": 17572 + }, + { + "epoch": 0.6293265529034684, + "grad_norm": 2.151085138320923, + "learning_rate": 6.381062956543041e-05, + "loss": 1.7799, + "step": 17573 + }, + { + "epoch": 0.6293623650330367, + "grad_norm": 1.4433550834655762, + "learning_rate": 6.379981698440778e-05, + "loss": 1.4163, + "step": 17574 + }, + { + "epoch": 0.6293981771626049, + "grad_norm": 1.6672604084014893, + "learning_rate": 6.378900489041188e-05, + "loss": 1.4949, + "step": 17575 + }, + { + "epoch": 0.6294339892921733, + "grad_norm": 2.602228879928589, + "learning_rate": 6.377819328358826e-05, + "loss": 1.5507, + "step": 17576 + }, + { + "epoch": 0.6294698014217416, + "grad_norm": 1.5993750095367432, + "learning_rate": 6.37673821640823e-05, + "loss": 1.6015, + "step": 17577 + }, + { + "epoch": 0.6295056135513099, + "grad_norm": 1.3872603178024292, + "learning_rate": 6.375657153203947e-05, + "loss": 1.6021, + "step": 17578 + }, + { + "epoch": 0.6295414256808781, + "grad_norm": 1.776228904724121, + "learning_rate": 6.374576138760525e-05, + "loss": 1.4317, + "step": 17579 + }, + { + "epoch": 0.6295772378104464, + "grad_norm": 1.5714781284332275, + "learning_rate": 6.3734951730925e-05, + "loss": 1.1074, + "step": 17580 + }, + { + "epoch": 0.6296130499400147, + "grad_norm": 1.8212718963623047, + "learning_rate": 6.372414256214423e-05, + "loss": 1.4209, + "step": 17581 + }, + { + "epoch": 0.6296488620695829, + "grad_norm": 1.9626024961471558, + "learning_rate": 6.37133338814083e-05, + "loss": 1.3858, + "step": 17582 + }, + { + "epoch": 0.6296846741991513, + "grad_norm": 1.6976439952850342, + "learning_rate": 6.370252568886267e-05, + "loss": 1.4777, + "step": 17583 + }, + { + "epoch": 0.6297204863287196, + "grad_norm": 3.100534677505493, + "learning_rate": 6.369171798465274e-05, + "loss": 1.5667, + "step": 17584 + }, + { + "epoch": 0.6297562984582878, + "grad_norm": 1.5115562677383423, + "learning_rate": 6.368091076892392e-05, + "loss": 1.5159, + "step": 17585 + }, + { + "epoch": 0.6297921105878561, + "grad_norm": 1.9113214015960693, + "learning_rate": 6.367010404182158e-05, + "loss": 1.447, + "step": 17586 + }, + { + "epoch": 0.6298279227174244, + "grad_norm": 1.8984917402267456, + "learning_rate": 6.365929780349113e-05, + "loss": 1.4439, + "step": 17587 + }, + { + "epoch": 0.6298637348469927, + "grad_norm": 1.6823961734771729, + "learning_rate": 6.3648492054078e-05, + "loss": 1.5414, + "step": 17588 + }, + { + "epoch": 0.6298995469765609, + "grad_norm": 1.8366305828094482, + "learning_rate": 6.363768679372744e-05, + "loss": 1.303, + "step": 17589 + }, + { + "epoch": 0.6299353591061293, + "grad_norm": 1.6372846364974976, + "learning_rate": 6.362688202258496e-05, + "loss": 1.4375, + "step": 17590 + }, + { + "epoch": 0.6299711712356976, + "grad_norm": 2.158540964126587, + "learning_rate": 6.361607774079581e-05, + "loss": 1.5602, + "step": 17591 + }, + { + "epoch": 0.6300069833652658, + "grad_norm": 1.4774373769760132, + "learning_rate": 6.360527394850547e-05, + "loss": 1.3032, + "step": 17592 + }, + { + "epoch": 0.6300427954948341, + "grad_norm": 1.7481532096862793, + "learning_rate": 6.359447064585915e-05, + "loss": 1.3593, + "step": 17593 + }, + { + "epoch": 0.6300786076244024, + "grad_norm": 1.8624064922332764, + "learning_rate": 6.358366783300231e-05, + "loss": 1.2172, + "step": 17594 + }, + { + "epoch": 0.6301144197539706, + "grad_norm": 1.4835082292556763, + "learning_rate": 6.357286551008024e-05, + "loss": 1.4949, + "step": 17595 + }, + { + "epoch": 0.6301502318835389, + "grad_norm": 2.123321294784546, + "learning_rate": 6.356206367723829e-05, + "loss": 1.5935, + "step": 17596 + }, + { + "epoch": 0.6301860440131073, + "grad_norm": 1.4491807222366333, + "learning_rate": 6.355126233462179e-05, + "loss": 1.6078, + "step": 17597 + }, + { + "epoch": 0.6302218561426756, + "grad_norm": 1.65678870677948, + "learning_rate": 6.354046148237597e-05, + "loss": 1.5985, + "step": 17598 + }, + { + "epoch": 0.6302576682722438, + "grad_norm": 1.9164150953292847, + "learning_rate": 6.352966112064627e-05, + "loss": 1.6386, + "step": 17599 + }, + { + "epoch": 0.6302934804018121, + "grad_norm": 1.5138894319534302, + "learning_rate": 6.351886124957789e-05, + "loss": 1.5511, + "step": 17600 + }, + { + "epoch": 0.6303292925313804, + "grad_norm": 1.7643005847930908, + "learning_rate": 6.350806186931623e-05, + "loss": 1.5784, + "step": 17601 + }, + { + "epoch": 0.6303651046609486, + "grad_norm": 1.719307780265808, + "learning_rate": 6.349726298000647e-05, + "loss": 1.5924, + "step": 17602 + }, + { + "epoch": 0.6304009167905169, + "grad_norm": 1.3820748329162598, + "learning_rate": 6.3486464581794e-05, + "loss": 1.4025, + "step": 17603 + }, + { + "epoch": 0.6304367289200853, + "grad_norm": 1.675239086151123, + "learning_rate": 6.347566667482401e-05, + "loss": 1.4112, + "step": 17604 + }, + { + "epoch": 0.6304725410496536, + "grad_norm": 1.8763279914855957, + "learning_rate": 6.346486925924184e-05, + "loss": 1.6343, + "step": 17605 + }, + { + "epoch": 0.6305083531792218, + "grad_norm": 2.052152633666992, + "learning_rate": 6.345407233519273e-05, + "loss": 1.4135, + "step": 17606 + }, + { + "epoch": 0.6305441653087901, + "grad_norm": 1.8708726167678833, + "learning_rate": 6.344327590282189e-05, + "loss": 1.5575, + "step": 17607 + }, + { + "epoch": 0.6305799774383584, + "grad_norm": 2.31463360786438, + "learning_rate": 6.343247996227469e-05, + "loss": 1.5766, + "step": 17608 + }, + { + "epoch": 0.6306157895679266, + "grad_norm": 1.8864593505859375, + "learning_rate": 6.342168451369623e-05, + "loss": 1.2702, + "step": 17609 + }, + { + "epoch": 0.6306516016974949, + "grad_norm": 1.3110343217849731, + "learning_rate": 6.341088955723189e-05, + "loss": 1.0155, + "step": 17610 + }, + { + "epoch": 0.6306874138270633, + "grad_norm": 1.729931116104126, + "learning_rate": 6.340009509302676e-05, + "loss": 1.5527, + "step": 17611 + }, + { + "epoch": 0.6307232259566316, + "grad_norm": 1.4861717224121094, + "learning_rate": 6.338930112122622e-05, + "loss": 1.3048, + "step": 17612 + }, + { + "epoch": 0.6307590380861998, + "grad_norm": 1.8507078886032104, + "learning_rate": 6.337850764197539e-05, + "loss": 1.3849, + "step": 17613 + }, + { + "epoch": 0.6307948502157681, + "grad_norm": 1.3783198595046997, + "learning_rate": 6.336771465541947e-05, + "loss": 1.2361, + "step": 17614 + }, + { + "epoch": 0.6308306623453364, + "grad_norm": 1.9116185903549194, + "learning_rate": 6.33569221617037e-05, + "loss": 1.5638, + "step": 17615 + }, + { + "epoch": 0.6308664744749046, + "grad_norm": 1.7067476511001587, + "learning_rate": 6.334613016097328e-05, + "loss": 1.3243, + "step": 17616 + }, + { + "epoch": 0.6309022866044729, + "grad_norm": 1.42518150806427, + "learning_rate": 6.333533865337343e-05, + "loss": 1.5246, + "step": 17617 + }, + { + "epoch": 0.6309380987340413, + "grad_norm": 1.7045762538909912, + "learning_rate": 6.332454763904925e-05, + "loss": 1.5706, + "step": 17618 + }, + { + "epoch": 0.6309739108636095, + "grad_norm": 1.6341084241867065, + "learning_rate": 6.3313757118146e-05, + "loss": 1.2484, + "step": 17619 + }, + { + "epoch": 0.6310097229931778, + "grad_norm": 2.272749423980713, + "learning_rate": 6.330296709080881e-05, + "loss": 1.3682, + "step": 17620 + }, + { + "epoch": 0.6310455351227461, + "grad_norm": 2.296658992767334, + "learning_rate": 6.329217755718291e-05, + "loss": 1.847, + "step": 17621 + }, + { + "epoch": 0.6310813472523144, + "grad_norm": 1.2976129055023193, + "learning_rate": 6.328138851741338e-05, + "loss": 1.4858, + "step": 17622 + }, + { + "epoch": 0.6311171593818826, + "grad_norm": 1.4732706546783447, + "learning_rate": 6.32705999716454e-05, + "loss": 1.0149, + "step": 17623 + }, + { + "epoch": 0.6311529715114509, + "grad_norm": 2.097350597381592, + "learning_rate": 6.325981192002413e-05, + "loss": 1.7887, + "step": 17624 + }, + { + "epoch": 0.6311887836410193, + "grad_norm": 1.4998894929885864, + "learning_rate": 6.324902436269469e-05, + "loss": 1.47, + "step": 17625 + }, + { + "epoch": 0.6312245957705875, + "grad_norm": 2.4181065559387207, + "learning_rate": 6.323823729980222e-05, + "loss": 1.8616, + "step": 17626 + }, + { + "epoch": 0.6312604079001558, + "grad_norm": 2.439805507659912, + "learning_rate": 6.322745073149185e-05, + "loss": 1.6384, + "step": 17627 + }, + { + "epoch": 0.6312962200297241, + "grad_norm": 1.9213478565216064, + "learning_rate": 6.321666465790872e-05, + "loss": 1.6752, + "step": 17628 + }, + { + "epoch": 0.6313320321592923, + "grad_norm": 1.7492610216140747, + "learning_rate": 6.320587907919788e-05, + "loss": 1.6131, + "step": 17629 + }, + { + "epoch": 0.6313678442888606, + "grad_norm": 1.5439682006835938, + "learning_rate": 6.319509399550452e-05, + "loss": 1.3714, + "step": 17630 + }, + { + "epoch": 0.6314036564184289, + "grad_norm": 1.481459617614746, + "learning_rate": 6.318430940697367e-05, + "loss": 1.449, + "step": 17631 + }, + { + "epoch": 0.6314394685479972, + "grad_norm": 1.4452996253967285, + "learning_rate": 6.317352531375045e-05, + "loss": 1.2496, + "step": 17632 + }, + { + "epoch": 0.6314752806775655, + "grad_norm": 1.4040240049362183, + "learning_rate": 6.316274171597995e-05, + "loss": 1.4423, + "step": 17633 + }, + { + "epoch": 0.6315110928071338, + "grad_norm": 1.6512972116470337, + "learning_rate": 6.315195861380722e-05, + "loss": 1.3448, + "step": 17634 + }, + { + "epoch": 0.6315469049367021, + "grad_norm": 1.6966760158538818, + "learning_rate": 6.314117600737738e-05, + "loss": 1.5408, + "step": 17635 + }, + { + "epoch": 0.6315827170662703, + "grad_norm": 1.709964394569397, + "learning_rate": 6.313039389683546e-05, + "loss": 1.3916, + "step": 17636 + }, + { + "epoch": 0.6316185291958386, + "grad_norm": 1.6564950942993164, + "learning_rate": 6.311961228232654e-05, + "loss": 1.6976, + "step": 17637 + }, + { + "epoch": 0.6316543413254069, + "grad_norm": 1.531118631362915, + "learning_rate": 6.310883116399567e-05, + "loss": 1.3207, + "step": 17638 + }, + { + "epoch": 0.6316901534549751, + "grad_norm": 2.0177090167999268, + "learning_rate": 6.309805054198787e-05, + "loss": 1.6174, + "step": 17639 + }, + { + "epoch": 0.6317259655845435, + "grad_norm": 1.2285786867141724, + "learning_rate": 6.308727041644824e-05, + "loss": 1.6658, + "step": 17640 + }, + { + "epoch": 0.6317617777141118, + "grad_norm": 1.6047492027282715, + "learning_rate": 6.307649078752174e-05, + "loss": 1.6755, + "step": 17641 + }, + { + "epoch": 0.6317975898436801, + "grad_norm": 2.411289691925049, + "learning_rate": 6.306571165535343e-05, + "loss": 1.4401, + "step": 17642 + }, + { + "epoch": 0.6318334019732483, + "grad_norm": 1.399155855178833, + "learning_rate": 6.305493302008832e-05, + "loss": 1.7149, + "step": 17643 + }, + { + "epoch": 0.6318692141028166, + "grad_norm": 1.4516044855117798, + "learning_rate": 6.304415488187145e-05, + "loss": 1.4904, + "step": 17644 + }, + { + "epoch": 0.6319050262323849, + "grad_norm": 1.6470979452133179, + "learning_rate": 6.303337724084779e-05, + "loss": 1.6618, + "step": 17645 + }, + { + "epoch": 0.6319408383619531, + "grad_norm": 1.3379909992218018, + "learning_rate": 6.302260009716237e-05, + "loss": 1.3781, + "step": 17646 + }, + { + "epoch": 0.6319766504915215, + "grad_norm": 1.534791111946106, + "learning_rate": 6.301182345096017e-05, + "loss": 1.34, + "step": 17647 + }, + { + "epoch": 0.6320124626210898, + "grad_norm": 2.221349000930786, + "learning_rate": 6.300104730238616e-05, + "loss": 1.535, + "step": 17648 + }, + { + "epoch": 0.6320482747506581, + "grad_norm": 1.749962329864502, + "learning_rate": 6.299027165158536e-05, + "loss": 1.4936, + "step": 17649 + }, + { + "epoch": 0.6320840868802263, + "grad_norm": 1.2860403060913086, + "learning_rate": 6.297949649870267e-05, + "loss": 1.0702, + "step": 17650 + }, + { + "epoch": 0.6321198990097946, + "grad_norm": 1.5968939065933228, + "learning_rate": 6.296872184388315e-05, + "loss": 1.4381, + "step": 17651 + }, + { + "epoch": 0.6321557111393629, + "grad_norm": 1.518203854560852, + "learning_rate": 6.295794768727168e-05, + "loss": 1.2938, + "step": 17652 + }, + { + "epoch": 0.6321915232689311, + "grad_norm": 1.8395830392837524, + "learning_rate": 6.294717402901325e-05, + "loss": 1.5632, + "step": 17653 + }, + { + "epoch": 0.6322273353984995, + "grad_norm": 1.8332093954086304, + "learning_rate": 6.293640086925279e-05, + "loss": 1.6127, + "step": 17654 + }, + { + "epoch": 0.6322631475280678, + "grad_norm": 1.5718899965286255, + "learning_rate": 6.292562820813528e-05, + "loss": 1.4328, + "step": 17655 + }, + { + "epoch": 0.632298959657636, + "grad_norm": 2.0594699382781982, + "learning_rate": 6.291485604580559e-05, + "loss": 1.6376, + "step": 17656 + }, + { + "epoch": 0.6323347717872043, + "grad_norm": 1.631384253501892, + "learning_rate": 6.290408438240869e-05, + "loss": 1.513, + "step": 17657 + }, + { + "epoch": 0.6323705839167726, + "grad_norm": 1.8818942308425903, + "learning_rate": 6.289331321808948e-05, + "loss": 1.2976, + "step": 17658 + }, + { + "epoch": 0.6324063960463409, + "grad_norm": 2.297025680541992, + "learning_rate": 6.288254255299286e-05, + "loss": 1.5297, + "step": 17659 + }, + { + "epoch": 0.6324422081759091, + "grad_norm": 1.3660603761672974, + "learning_rate": 6.287177238726378e-05, + "loss": 1.504, + "step": 17660 + }, + { + "epoch": 0.6324780203054775, + "grad_norm": 1.9709035158157349, + "learning_rate": 6.28610027210471e-05, + "loss": 1.7812, + "step": 17661 + }, + { + "epoch": 0.6325138324350458, + "grad_norm": 1.61428964138031, + "learning_rate": 6.285023355448772e-05, + "loss": 1.4382, + "step": 17662 + }, + { + "epoch": 0.632549644564614, + "grad_norm": 2.234609365463257, + "learning_rate": 6.283946488773051e-05, + "loss": 1.5856, + "step": 17663 + }, + { + "epoch": 0.6325854566941823, + "grad_norm": 1.6669365167617798, + "learning_rate": 6.282869672092039e-05, + "loss": 1.4216, + "step": 17664 + }, + { + "epoch": 0.6326212688237506, + "grad_norm": 1.5173592567443848, + "learning_rate": 6.281792905420219e-05, + "loss": 1.4237, + "step": 17665 + }, + { + "epoch": 0.6326570809533189, + "grad_norm": 1.7789220809936523, + "learning_rate": 6.280716188772082e-05, + "loss": 1.406, + "step": 17666 + }, + { + "epoch": 0.6326928930828871, + "grad_norm": 2.917492389678955, + "learning_rate": 6.279639522162111e-05, + "loss": 1.6137, + "step": 17667 + }, + { + "epoch": 0.6327287052124555, + "grad_norm": 1.5990511178970337, + "learning_rate": 6.278562905604788e-05, + "loss": 1.3547, + "step": 17668 + }, + { + "epoch": 0.6327645173420238, + "grad_norm": 2.128406047821045, + "learning_rate": 6.277486339114605e-05, + "loss": 1.417, + "step": 17669 + }, + { + "epoch": 0.632800329471592, + "grad_norm": 2.0638668537139893, + "learning_rate": 6.276409822706038e-05, + "loss": 1.6075, + "step": 17670 + }, + { + "epoch": 0.6328361416011603, + "grad_norm": 2.864104747772217, + "learning_rate": 6.275333356393575e-05, + "loss": 1.4648, + "step": 17671 + }, + { + "epoch": 0.6328719537307286, + "grad_norm": 1.8883485794067383, + "learning_rate": 6.274256940191696e-05, + "loss": 1.5711, + "step": 17672 + }, + { + "epoch": 0.6329077658602968, + "grad_norm": 1.6292932033538818, + "learning_rate": 6.273180574114887e-05, + "loss": 1.7285, + "step": 17673 + }, + { + "epoch": 0.6329435779898651, + "grad_norm": 1.8495104312896729, + "learning_rate": 6.272104258177622e-05, + "loss": 1.1279, + "step": 17674 + }, + { + "epoch": 0.6329793901194335, + "grad_norm": 1.819200038909912, + "learning_rate": 6.271027992394389e-05, + "loss": 1.9726, + "step": 17675 + }, + { + "epoch": 0.6330152022490018, + "grad_norm": 1.7758945226669312, + "learning_rate": 6.269951776779667e-05, + "loss": 1.2936, + "step": 17676 + }, + { + "epoch": 0.63305101437857, + "grad_norm": 1.482681155204773, + "learning_rate": 6.268875611347925e-05, + "loss": 1.4099, + "step": 17677 + }, + { + "epoch": 0.6330868265081383, + "grad_norm": 1.7096611261367798, + "learning_rate": 6.267799496113656e-05, + "loss": 1.5464, + "step": 17678 + }, + { + "epoch": 0.6331226386377066, + "grad_norm": 1.5019376277923584, + "learning_rate": 6.266723431091323e-05, + "loss": 1.4345, + "step": 17679 + }, + { + "epoch": 0.6331584507672748, + "grad_norm": 2.354271173477173, + "learning_rate": 6.265647416295417e-05, + "loss": 1.5104, + "step": 17680 + }, + { + "epoch": 0.6331942628968431, + "grad_norm": 1.3241621255874634, + "learning_rate": 6.264571451740405e-05, + "loss": 1.5164, + "step": 17681 + }, + { + "epoch": 0.6332300750264115, + "grad_norm": 1.2999305725097656, + "learning_rate": 6.263495537440766e-05, + "loss": 1.5146, + "step": 17682 + }, + { + "epoch": 0.6332658871559798, + "grad_norm": 1.9902713298797607, + "learning_rate": 6.262419673410976e-05, + "loss": 1.5923, + "step": 17683 + }, + { + "epoch": 0.633301699285548, + "grad_norm": 1.6341190338134766, + "learning_rate": 6.261343859665507e-05, + "loss": 1.5331, + "step": 17684 + }, + { + "epoch": 0.6333375114151163, + "grad_norm": 1.755398154258728, + "learning_rate": 6.260268096218838e-05, + "loss": 1.4803, + "step": 17685 + }, + { + "epoch": 0.6333733235446846, + "grad_norm": 1.864999532699585, + "learning_rate": 6.259192383085432e-05, + "loss": 1.4069, + "step": 17686 + }, + { + "epoch": 0.6334091356742528, + "grad_norm": 1.3991942405700684, + "learning_rate": 6.258116720279773e-05, + "loss": 1.1271, + "step": 17687 + }, + { + "epoch": 0.6334449478038211, + "grad_norm": 1.9088125228881836, + "learning_rate": 6.257041107816319e-05, + "loss": 1.548, + "step": 17688 + }, + { + "epoch": 0.6334807599333895, + "grad_norm": 2.0545427799224854, + "learning_rate": 6.255965545709556e-05, + "loss": 1.441, + "step": 17689 + }, + { + "epoch": 0.6335165720629578, + "grad_norm": 1.577840805053711, + "learning_rate": 6.254890033973942e-05, + "loss": 1.6864, + "step": 17690 + }, + { + "epoch": 0.633552384192526, + "grad_norm": 1.5293114185333252, + "learning_rate": 6.253814572623958e-05, + "loss": 1.6347, + "step": 17691 + }, + { + "epoch": 0.6335881963220943, + "grad_norm": 1.3366191387176514, + "learning_rate": 6.252739161674059e-05, + "loss": 1.344, + "step": 17692 + }, + { + "epoch": 0.6336240084516626, + "grad_norm": 1.6025129556655884, + "learning_rate": 6.251663801138725e-05, + "loss": 1.5063, + "step": 17693 + }, + { + "epoch": 0.6336598205812308, + "grad_norm": 1.4043288230895996, + "learning_rate": 6.250588491032421e-05, + "loss": 1.5331, + "step": 17694 + }, + { + "epoch": 0.6336956327107991, + "grad_norm": 1.5216609239578247, + "learning_rate": 6.249513231369608e-05, + "loss": 1.4133, + "step": 17695 + }, + { + "epoch": 0.6337314448403675, + "grad_norm": 1.466712236404419, + "learning_rate": 6.248438022164763e-05, + "loss": 1.3336, + "step": 17696 + }, + { + "epoch": 0.6337672569699357, + "grad_norm": 2.017925262451172, + "learning_rate": 6.247362863432337e-05, + "loss": 1.5305, + "step": 17697 + }, + { + "epoch": 0.633803069099504, + "grad_norm": 1.440697431564331, + "learning_rate": 6.246287755186813e-05, + "loss": 1.379, + "step": 17698 + }, + { + "epoch": 0.6338388812290723, + "grad_norm": 1.7415283918380737, + "learning_rate": 6.245212697442637e-05, + "loss": 1.4176, + "step": 17699 + }, + { + "epoch": 0.6338746933586406, + "grad_norm": 1.6304659843444824, + "learning_rate": 6.244137690214287e-05, + "loss": 1.2807, + "step": 17700 + }, + { + "epoch": 0.6339105054882088, + "grad_norm": 2.0391323566436768, + "learning_rate": 6.243062733516211e-05, + "loss": 1.5144, + "step": 17701 + }, + { + "epoch": 0.6339463176177771, + "grad_norm": 1.8402481079101562, + "learning_rate": 6.24198782736289e-05, + "loss": 1.3738, + "step": 17702 + }, + { + "epoch": 0.6339821297473455, + "grad_norm": 1.4574590921401978, + "learning_rate": 6.240912971768771e-05, + "loss": 1.1814, + "step": 17703 + }, + { + "epoch": 0.6340179418769137, + "grad_norm": 2.0565900802612305, + "learning_rate": 6.239838166748318e-05, + "loss": 1.086, + "step": 17704 + }, + { + "epoch": 0.634053754006482, + "grad_norm": 1.5707656145095825, + "learning_rate": 6.238763412315993e-05, + "loss": 1.2242, + "step": 17705 + }, + { + "epoch": 0.6340895661360503, + "grad_norm": 2.836914539337158, + "learning_rate": 6.237688708486252e-05, + "loss": 1.5508, + "step": 17706 + }, + { + "epoch": 0.6341253782656185, + "grad_norm": 1.3280764818191528, + "learning_rate": 6.236614055273562e-05, + "loss": 1.2026, + "step": 17707 + }, + { + "epoch": 0.6341611903951868, + "grad_norm": 2.0606489181518555, + "learning_rate": 6.23553945269237e-05, + "loss": 1.6827, + "step": 17708 + }, + { + "epoch": 0.6341970025247551, + "grad_norm": 1.3689779043197632, + "learning_rate": 6.234464900757144e-05, + "loss": 1.4438, + "step": 17709 + }, + { + "epoch": 0.6342328146543235, + "grad_norm": 1.415412425994873, + "learning_rate": 6.233390399482334e-05, + "loss": 1.3191, + "step": 17710 + }, + { + "epoch": 0.6342686267838917, + "grad_norm": 1.9321517944335938, + "learning_rate": 6.232315948882394e-05, + "loss": 1.485, + "step": 17711 + }, + { + "epoch": 0.63430443891346, + "grad_norm": 1.5414435863494873, + "learning_rate": 6.231241548971788e-05, + "loss": 1.3746, + "step": 17712 + }, + { + "epoch": 0.6343402510430283, + "grad_norm": 1.7745535373687744, + "learning_rate": 6.230167199764962e-05, + "loss": 1.5908, + "step": 17713 + }, + { + "epoch": 0.6343760631725965, + "grad_norm": 1.9899230003356934, + "learning_rate": 6.229092901276376e-05, + "loss": 1.6602, + "step": 17714 + }, + { + "epoch": 0.6344118753021648, + "grad_norm": 2.1202757358551025, + "learning_rate": 6.228018653520477e-05, + "loss": 1.376, + "step": 17715 + }, + { + "epoch": 0.6344476874317331, + "grad_norm": 1.5492504835128784, + "learning_rate": 6.226944456511725e-05, + "loss": 1.4702, + "step": 17716 + }, + { + "epoch": 0.6344834995613015, + "grad_norm": 1.687817931175232, + "learning_rate": 6.225870310264567e-05, + "loss": 1.3696, + "step": 17717 + }, + { + "epoch": 0.6345193116908697, + "grad_norm": 2.061065435409546, + "learning_rate": 6.224796214793458e-05, + "loss": 1.7091, + "step": 17718 + }, + { + "epoch": 0.634555123820438, + "grad_norm": 1.6443705558776855, + "learning_rate": 6.223722170112845e-05, + "loss": 1.4229, + "step": 17719 + }, + { + "epoch": 0.6345909359500063, + "grad_norm": 1.9211294651031494, + "learning_rate": 6.222648176237179e-05, + "loss": 1.6184, + "step": 17720 + }, + { + "epoch": 0.6346267480795745, + "grad_norm": 1.7323452234268188, + "learning_rate": 6.221574233180907e-05, + "loss": 1.5455, + "step": 17721 + }, + { + "epoch": 0.6346625602091428, + "grad_norm": 1.5966200828552246, + "learning_rate": 6.220500340958482e-05, + "loss": 1.3132, + "step": 17722 + }, + { + "epoch": 0.6346983723387111, + "grad_norm": 1.7405643463134766, + "learning_rate": 6.219426499584351e-05, + "loss": 1.5249, + "step": 17723 + }, + { + "epoch": 0.6347341844682794, + "grad_norm": 1.7434638738632202, + "learning_rate": 6.218352709072957e-05, + "loss": 1.6002, + "step": 17724 + }, + { + "epoch": 0.6347699965978477, + "grad_norm": 1.4588823318481445, + "learning_rate": 6.21727896943875e-05, + "loss": 1.2939, + "step": 17725 + }, + { + "epoch": 0.634805808727416, + "grad_norm": 1.711930751800537, + "learning_rate": 6.216205280696177e-05, + "loss": 1.2147, + "step": 17726 + }, + { + "epoch": 0.6348416208569843, + "grad_norm": 1.7398806810379028, + "learning_rate": 6.21513164285968e-05, + "loss": 1.5812, + "step": 17727 + }, + { + "epoch": 0.6348774329865525, + "grad_norm": 1.6372019052505493, + "learning_rate": 6.214058055943706e-05, + "loss": 1.6223, + "step": 17728 + }, + { + "epoch": 0.6349132451161208, + "grad_norm": 1.3632045984268188, + "learning_rate": 6.212984519962695e-05, + "loss": 1.4353, + "step": 17729 + }, + { + "epoch": 0.6349490572456891, + "grad_norm": 1.4072738885879517, + "learning_rate": 6.211911034931094e-05, + "loss": 1.4157, + "step": 17730 + }, + { + "epoch": 0.6349848693752574, + "grad_norm": 1.8789604902267456, + "learning_rate": 6.210837600863342e-05, + "loss": 1.3836, + "step": 17731 + }, + { + "epoch": 0.6350206815048257, + "grad_norm": 1.5280240774154663, + "learning_rate": 6.209764217773884e-05, + "loss": 1.4962, + "step": 17732 + }, + { + "epoch": 0.635056493634394, + "grad_norm": 1.6792149543762207, + "learning_rate": 6.208690885677158e-05, + "loss": 1.3537, + "step": 17733 + }, + { + "epoch": 0.6350923057639623, + "grad_norm": 2.1413023471832275, + "learning_rate": 6.207617604587607e-05, + "loss": 1.4218, + "step": 17734 + }, + { + "epoch": 0.6351281178935305, + "grad_norm": 1.7607442140579224, + "learning_rate": 6.20654437451967e-05, + "loss": 1.6476, + "step": 17735 + }, + { + "epoch": 0.6351639300230988, + "grad_norm": 1.53827965259552, + "learning_rate": 6.205471195487784e-05, + "loss": 1.5853, + "step": 17736 + }, + { + "epoch": 0.6351997421526671, + "grad_norm": 1.5775997638702393, + "learning_rate": 6.204398067506389e-05, + "loss": 1.5806, + "step": 17737 + }, + { + "epoch": 0.6352355542822354, + "grad_norm": 1.4330687522888184, + "learning_rate": 6.203324990589922e-05, + "loss": 1.4459, + "step": 17738 + }, + { + "epoch": 0.6352713664118037, + "grad_norm": 1.8977481126785278, + "learning_rate": 6.20225196475282e-05, + "loss": 1.4269, + "step": 17739 + }, + { + "epoch": 0.635307178541372, + "grad_norm": 1.7836785316467285, + "learning_rate": 6.201178990009518e-05, + "loss": 1.547, + "step": 17740 + }, + { + "epoch": 0.6353429906709402, + "grad_norm": 1.7709898948669434, + "learning_rate": 6.200106066374454e-05, + "loss": 1.4268, + "step": 17741 + }, + { + "epoch": 0.6353788028005085, + "grad_norm": 2.1804091930389404, + "learning_rate": 6.199033193862059e-05, + "loss": 1.4339, + "step": 17742 + }, + { + "epoch": 0.6354146149300768, + "grad_norm": 1.4796829223632812, + "learning_rate": 6.197960372486772e-05, + "loss": 1.3443, + "step": 17743 + }, + { + "epoch": 0.635450427059645, + "grad_norm": 1.9017022848129272, + "learning_rate": 6.196887602263022e-05, + "loss": 1.4565, + "step": 17744 + }, + { + "epoch": 0.6354862391892134, + "grad_norm": 1.6325371265411377, + "learning_rate": 6.195814883205245e-05, + "loss": 1.6163, + "step": 17745 + }, + { + "epoch": 0.6355220513187817, + "grad_norm": 2.607729196548462, + "learning_rate": 6.194742215327873e-05, + "loss": 1.211, + "step": 17746 + }, + { + "epoch": 0.63555786344835, + "grad_norm": 1.6463838815689087, + "learning_rate": 6.193669598645334e-05, + "loss": 1.5084, + "step": 17747 + }, + { + "epoch": 0.6355936755779182, + "grad_norm": 1.6740944385528564, + "learning_rate": 6.19259703317206e-05, + "loss": 1.3549, + "step": 17748 + }, + { + "epoch": 0.6356294877074865, + "grad_norm": 2.299992561340332, + "learning_rate": 6.191524518922482e-05, + "loss": 1.4458, + "step": 17749 + }, + { + "epoch": 0.6356652998370548, + "grad_norm": 1.5784881114959717, + "learning_rate": 6.190452055911031e-05, + "loss": 1.5335, + "step": 17750 + }, + { + "epoch": 0.635701111966623, + "grad_norm": 1.283570647239685, + "learning_rate": 6.189379644152132e-05, + "loss": 1.3831, + "step": 17751 + }, + { + "epoch": 0.6357369240961914, + "grad_norm": 1.497164249420166, + "learning_rate": 6.188307283660216e-05, + "loss": 1.491, + "step": 17752 + }, + { + "epoch": 0.6357727362257597, + "grad_norm": 1.6668617725372314, + "learning_rate": 6.187234974449707e-05, + "loss": 1.4718, + "step": 17753 + }, + { + "epoch": 0.635808548355328, + "grad_norm": 1.4830564260482788, + "learning_rate": 6.186162716535036e-05, + "loss": 1.3756, + "step": 17754 + }, + { + "epoch": 0.6358443604848962, + "grad_norm": 1.7476052045822144, + "learning_rate": 6.185090509930624e-05, + "loss": 1.497, + "step": 17755 + }, + { + "epoch": 0.6358801726144645, + "grad_norm": 1.7061628103256226, + "learning_rate": 6.184018354650898e-05, + "loss": 1.3756, + "step": 17756 + }, + { + "epoch": 0.6359159847440328, + "grad_norm": 1.9672883749008179, + "learning_rate": 6.182946250710284e-05, + "loss": 1.6373, + "step": 17757 + }, + { + "epoch": 0.635951796873601, + "grad_norm": 1.8433897495269775, + "learning_rate": 6.181874198123203e-05, + "loss": 1.8807, + "step": 17758 + }, + { + "epoch": 0.6359876090031694, + "grad_norm": 1.7661159038543701, + "learning_rate": 6.18080219690408e-05, + "loss": 1.4735, + "step": 17759 + }, + { + "epoch": 0.6360234211327377, + "grad_norm": 2.0995090007781982, + "learning_rate": 6.179730247067336e-05, + "loss": 1.3749, + "step": 17760 + }, + { + "epoch": 0.636059233262306, + "grad_norm": 1.3635008335113525, + "learning_rate": 6.178658348627398e-05, + "loss": 1.3731, + "step": 17761 + }, + { + "epoch": 0.6360950453918742, + "grad_norm": 2.132258892059326, + "learning_rate": 6.177586501598679e-05, + "loss": 1.3157, + "step": 17762 + }, + { + "epoch": 0.6361308575214425, + "grad_norm": 1.4567443132400513, + "learning_rate": 6.176514705995604e-05, + "loss": 1.7088, + "step": 17763 + }, + { + "epoch": 0.6361666696510108, + "grad_norm": 1.368294358253479, + "learning_rate": 6.175442961832593e-05, + "loss": 1.4436, + "step": 17764 + }, + { + "epoch": 0.636202481780579, + "grad_norm": 1.7212412357330322, + "learning_rate": 6.174371269124061e-05, + "loss": 1.5667, + "step": 17765 + }, + { + "epoch": 0.6362382939101474, + "grad_norm": 1.3152505159378052, + "learning_rate": 6.173299627884432e-05, + "loss": 1.3987, + "step": 17766 + }, + { + "epoch": 0.6362741060397157, + "grad_norm": 1.347125768661499, + "learning_rate": 6.172228038128118e-05, + "loss": 1.3405, + "step": 17767 + }, + { + "epoch": 0.636309918169284, + "grad_norm": 1.3054901361465454, + "learning_rate": 6.171156499869539e-05, + "loss": 1.4982, + "step": 17768 + }, + { + "epoch": 0.6363457302988522, + "grad_norm": 1.41067636013031, + "learning_rate": 6.17008501312311e-05, + "loss": 1.1198, + "step": 17769 + }, + { + "epoch": 0.6363815424284205, + "grad_norm": 1.8490687608718872, + "learning_rate": 6.169013577903248e-05, + "loss": 1.6673, + "step": 17770 + }, + { + "epoch": 0.6364173545579888, + "grad_norm": 2.312915325164795, + "learning_rate": 6.167942194224365e-05, + "loss": 1.4044, + "step": 17771 + }, + { + "epoch": 0.636453166687557, + "grad_norm": 1.7895601987838745, + "learning_rate": 6.16687086210088e-05, + "loss": 1.634, + "step": 17772 + }, + { + "epoch": 0.6364889788171254, + "grad_norm": 1.7349082231521606, + "learning_rate": 6.165799581547203e-05, + "loss": 1.4799, + "step": 17773 + }, + { + "epoch": 0.6365247909466937, + "grad_norm": 1.9803053140640259, + "learning_rate": 6.164728352577743e-05, + "loss": 1.4382, + "step": 17774 + }, + { + "epoch": 0.6365606030762619, + "grad_norm": 2.0456080436706543, + "learning_rate": 6.16365717520692e-05, + "loss": 1.2846, + "step": 17775 + }, + { + "epoch": 0.6365964152058302, + "grad_norm": 2.208461046218872, + "learning_rate": 6.162586049449136e-05, + "loss": 1.3596, + "step": 17776 + }, + { + "epoch": 0.6366322273353985, + "grad_norm": 1.5197937488555908, + "learning_rate": 6.161514975318809e-05, + "loss": 1.5683, + "step": 17777 + }, + { + "epoch": 0.6366680394649668, + "grad_norm": 1.7372349500656128, + "learning_rate": 6.160443952830347e-05, + "loss": 1.4085, + "step": 17778 + }, + { + "epoch": 0.636703851594535, + "grad_norm": 1.7026249170303345, + "learning_rate": 6.159372981998161e-05, + "loss": 1.5112, + "step": 17779 + }, + { + "epoch": 0.6367396637241034, + "grad_norm": 1.795798897743225, + "learning_rate": 6.158302062836654e-05, + "loss": 1.5585, + "step": 17780 + }, + { + "epoch": 0.6367754758536717, + "grad_norm": 1.8992228507995605, + "learning_rate": 6.157231195360241e-05, + "loss": 1.6058, + "step": 17781 + }, + { + "epoch": 0.6368112879832399, + "grad_norm": 1.8138514757156372, + "learning_rate": 6.156160379583325e-05, + "loss": 1.3325, + "step": 17782 + }, + { + "epoch": 0.6368471001128082, + "grad_norm": 1.941710114479065, + "learning_rate": 6.155089615520308e-05, + "loss": 1.449, + "step": 17783 + }, + { + "epoch": 0.6368829122423765, + "grad_norm": 1.922827124595642, + "learning_rate": 6.154018903185608e-05, + "loss": 1.1852, + "step": 17784 + }, + { + "epoch": 0.6369187243719447, + "grad_norm": 1.9365390539169312, + "learning_rate": 6.152948242593615e-05, + "loss": 1.6456, + "step": 17785 + }, + { + "epoch": 0.636954536501513, + "grad_norm": 1.8548115491867065, + "learning_rate": 6.15187763375875e-05, + "loss": 1.6585, + "step": 17786 + }, + { + "epoch": 0.6369903486310814, + "grad_norm": 1.6139572858810425, + "learning_rate": 6.150807076695399e-05, + "loss": 1.3493, + "step": 17787 + }, + { + "epoch": 0.6370261607606497, + "grad_norm": 1.368597149848938, + "learning_rate": 6.149736571417979e-05, + "loss": 1.3896, + "step": 17788 + }, + { + "epoch": 0.6370619728902179, + "grad_norm": 1.8540292978286743, + "learning_rate": 6.148666117940882e-05, + "loss": 1.3981, + "step": 17789 + }, + { + "epoch": 0.6370977850197862, + "grad_norm": 1.3378417491912842, + "learning_rate": 6.147595716278519e-05, + "loss": 1.302, + "step": 17790 + }, + { + "epoch": 0.6371335971493545, + "grad_norm": 1.6063618659973145, + "learning_rate": 6.146525366445288e-05, + "loss": 1.5243, + "step": 17791 + }, + { + "epoch": 0.6371694092789227, + "grad_norm": 2.179349660873413, + "learning_rate": 6.145455068455583e-05, + "loss": 1.31, + "step": 17792 + }, + { + "epoch": 0.637205221408491, + "grad_norm": 1.9033253192901611, + "learning_rate": 6.144384822323812e-05, + "loss": 1.2234, + "step": 17793 + }, + { + "epoch": 0.6372410335380594, + "grad_norm": 1.3805716037750244, + "learning_rate": 6.143314628064365e-05, + "loss": 1.5376, + "step": 17794 + }, + { + "epoch": 0.6372768456676277, + "grad_norm": 1.3206768035888672, + "learning_rate": 6.14224448569165e-05, + "loss": 1.4238, + "step": 17795 + }, + { + "epoch": 0.6373126577971959, + "grad_norm": 1.6055296659469604, + "learning_rate": 6.141174395220053e-05, + "loss": 1.451, + "step": 17796 + }, + { + "epoch": 0.6373484699267642, + "grad_norm": 1.5268259048461914, + "learning_rate": 6.140104356663984e-05, + "loss": 1.3718, + "step": 17797 + }, + { + "epoch": 0.6373842820563325, + "grad_norm": 1.8566981554031372, + "learning_rate": 6.139034370037826e-05, + "loss": 1.4693, + "step": 17798 + }, + { + "epoch": 0.6374200941859007, + "grad_norm": 1.3930617570877075, + "learning_rate": 6.137964435355984e-05, + "loss": 1.7598, + "step": 17799 + }, + { + "epoch": 0.637455906315469, + "grad_norm": 2.081630229949951, + "learning_rate": 6.13689455263285e-05, + "loss": 1.3816, + "step": 17800 + }, + { + "epoch": 0.6374917184450374, + "grad_norm": 1.8918607234954834, + "learning_rate": 6.135824721882815e-05, + "loss": 1.487, + "step": 17801 + }, + { + "epoch": 0.6375275305746056, + "grad_norm": 1.4612147808074951, + "learning_rate": 6.134754943120273e-05, + "loss": 1.7173, + "step": 17802 + }, + { + "epoch": 0.6375633427041739, + "grad_norm": 1.6461340188980103, + "learning_rate": 6.133685216359615e-05, + "loss": 1.3156, + "step": 17803 + }, + { + "epoch": 0.6375991548337422, + "grad_norm": 2.132502317428589, + "learning_rate": 6.132615541615242e-05, + "loss": 1.617, + "step": 17804 + }, + { + "epoch": 0.6376349669633105, + "grad_norm": 1.3095436096191406, + "learning_rate": 6.131545918901531e-05, + "loss": 1.4064, + "step": 17805 + }, + { + "epoch": 0.6376707790928787, + "grad_norm": 1.5461561679840088, + "learning_rate": 6.130476348232887e-05, + "loss": 1.291, + "step": 17806 + }, + { + "epoch": 0.637706591222447, + "grad_norm": 1.4045078754425049, + "learning_rate": 6.12940682962369e-05, + "loss": 1.2808, + "step": 17807 + }, + { + "epoch": 0.6377424033520154, + "grad_norm": 1.6871155500411987, + "learning_rate": 6.128337363088327e-05, + "loss": 1.5045, + "step": 17808 + }, + { + "epoch": 0.6377782154815836, + "grad_norm": 1.3531205654144287, + "learning_rate": 6.127267948641195e-05, + "loss": 1.3646, + "step": 17809 + }, + { + "epoch": 0.6378140276111519, + "grad_norm": 1.4474282264709473, + "learning_rate": 6.126198586296676e-05, + "loss": 1.2597, + "step": 17810 + }, + { + "epoch": 0.6378498397407202, + "grad_norm": 1.5817090272903442, + "learning_rate": 6.12512927606916e-05, + "loss": 1.4625, + "step": 17811 + }, + { + "epoch": 0.6378856518702885, + "grad_norm": 1.6557588577270508, + "learning_rate": 6.124060017973027e-05, + "loss": 1.8598, + "step": 17812 + }, + { + "epoch": 0.6379214639998567, + "grad_norm": 1.735060453414917, + "learning_rate": 6.122990812022671e-05, + "loss": 1.4263, + "step": 17813 + }, + { + "epoch": 0.637957276129425, + "grad_norm": 1.6962659358978271, + "learning_rate": 6.12192165823247e-05, + "loss": 1.4953, + "step": 17814 + }, + { + "epoch": 0.6379930882589934, + "grad_norm": 2.5950944423675537, + "learning_rate": 6.120852556616811e-05, + "loss": 1.6169, + "step": 17815 + }, + { + "epoch": 0.6380289003885616, + "grad_norm": 1.4274699687957764, + "learning_rate": 6.11978350719008e-05, + "loss": 1.2505, + "step": 17816 + }, + { + "epoch": 0.6380647125181299, + "grad_norm": 2.5493216514587402, + "learning_rate": 6.118714509966654e-05, + "loss": 1.3164, + "step": 17817 + }, + { + "epoch": 0.6381005246476982, + "grad_norm": 2.081235647201538, + "learning_rate": 6.117645564960919e-05, + "loss": 1.1562, + "step": 17818 + }, + { + "epoch": 0.6381363367772664, + "grad_norm": 1.0999113321304321, + "learning_rate": 6.116576672187254e-05, + "loss": 1.5064, + "step": 17819 + }, + { + "epoch": 0.6381721489068347, + "grad_norm": 1.29554283618927, + "learning_rate": 6.115507831660042e-05, + "loss": 1.4769, + "step": 17820 + }, + { + "epoch": 0.638207961036403, + "grad_norm": 1.3888424634933472, + "learning_rate": 6.11443904339366e-05, + "loss": 1.8104, + "step": 17821 + }, + { + "epoch": 0.6382437731659714, + "grad_norm": 1.9146857261657715, + "learning_rate": 6.11337030740249e-05, + "loss": 1.4991, + "step": 17822 + }, + { + "epoch": 0.6382795852955396, + "grad_norm": 1.6722118854522705, + "learning_rate": 6.112301623700907e-05, + "loss": 1.4583, + "step": 17823 + }, + { + "epoch": 0.6383153974251079, + "grad_norm": 1.50333833694458, + "learning_rate": 6.111232992303292e-05, + "loss": 1.4124, + "step": 17824 + }, + { + "epoch": 0.6383512095546762, + "grad_norm": 1.567192792892456, + "learning_rate": 6.110164413224025e-05, + "loss": 1.3424, + "step": 17825 + }, + { + "epoch": 0.6383870216842444, + "grad_norm": 2.029167652130127, + "learning_rate": 6.109095886477472e-05, + "loss": 1.6334, + "step": 17826 + }, + { + "epoch": 0.6384228338138127, + "grad_norm": 1.5533322095870972, + "learning_rate": 6.108027412078018e-05, + "loss": 1.5, + "step": 17827 + }, + { + "epoch": 0.638458645943381, + "grad_norm": 1.8610965013504028, + "learning_rate": 6.106958990040033e-05, + "loss": 1.6345, + "step": 17828 + }, + { + "epoch": 0.6384944580729494, + "grad_norm": 1.191743016242981, + "learning_rate": 6.105890620377897e-05, + "loss": 1.3893, + "step": 17829 + }, + { + "epoch": 0.6385302702025176, + "grad_norm": 1.9871865510940552, + "learning_rate": 6.104822303105974e-05, + "loss": 1.4636, + "step": 17830 + }, + { + "epoch": 0.6385660823320859, + "grad_norm": 1.702633023262024, + "learning_rate": 6.103754038238648e-05, + "loss": 1.4101, + "step": 17831 + }, + { + "epoch": 0.6386018944616542, + "grad_norm": 2.4105355739593506, + "learning_rate": 6.102685825790282e-05, + "loss": 1.3532, + "step": 17832 + }, + { + "epoch": 0.6386377065912224, + "grad_norm": 1.8922439813613892, + "learning_rate": 6.1016176657752534e-05, + "loss": 1.4376, + "step": 17833 + }, + { + "epoch": 0.6386735187207907, + "grad_norm": 2.383431911468506, + "learning_rate": 6.100549558207931e-05, + "loss": 1.3371, + "step": 17834 + }, + { + "epoch": 0.638709330850359, + "grad_norm": 2.351727247238159, + "learning_rate": 6.099481503102682e-05, + "loss": 1.5323, + "step": 17835 + }, + { + "epoch": 0.6387451429799273, + "grad_norm": 2.596125364303589, + "learning_rate": 6.0984135004738784e-05, + "loss": 1.7484, + "step": 17836 + }, + { + "epoch": 0.6387809551094956, + "grad_norm": 1.5746511220932007, + "learning_rate": 6.097345550335889e-05, + "loss": 1.2177, + "step": 17837 + }, + { + "epoch": 0.6388167672390639, + "grad_norm": 1.227081298828125, + "learning_rate": 6.096277652703082e-05, + "loss": 1.6711, + "step": 17838 + }, + { + "epoch": 0.6388525793686322, + "grad_norm": 2.1082305908203125, + "learning_rate": 6.0952098075898214e-05, + "loss": 1.598, + "step": 17839 + }, + { + "epoch": 0.6388883914982004, + "grad_norm": 1.3371353149414062, + "learning_rate": 6.0941420150104776e-05, + "loss": 1.3421, + "step": 17840 + }, + { + "epoch": 0.6389242036277687, + "grad_norm": 1.6144399642944336, + "learning_rate": 6.0930742749794145e-05, + "loss": 1.352, + "step": 17841 + }, + { + "epoch": 0.638960015757337, + "grad_norm": 1.4856688976287842, + "learning_rate": 6.0920065875109986e-05, + "loss": 1.5137, + "step": 17842 + }, + { + "epoch": 0.6389958278869053, + "grad_norm": 2.919574737548828, + "learning_rate": 6.0909389526195935e-05, + "loss": 1.4141, + "step": 17843 + }, + { + "epoch": 0.6390316400164736, + "grad_norm": 1.6059921979904175, + "learning_rate": 6.0898713703195595e-05, + "loss": 1.6238, + "step": 17844 + }, + { + "epoch": 0.6390674521460419, + "grad_norm": 2.180042028427124, + "learning_rate": 6.0888038406252656e-05, + "loss": 1.2913, + "step": 17845 + }, + { + "epoch": 0.6391032642756102, + "grad_norm": 1.7399688959121704, + "learning_rate": 6.087736363551069e-05, + "loss": 1.5325, + "step": 17846 + }, + { + "epoch": 0.6391390764051784, + "grad_norm": 2.026592254638672, + "learning_rate": 6.086668939111333e-05, + "loss": 1.5609, + "step": 17847 + }, + { + "epoch": 0.6391748885347467, + "grad_norm": 1.4536274671554565, + "learning_rate": 6.085601567320418e-05, + "loss": 1.2754, + "step": 17848 + }, + { + "epoch": 0.639210700664315, + "grad_norm": 1.4030091762542725, + "learning_rate": 6.084534248192688e-05, + "loss": 1.3692, + "step": 17849 + }, + { + "epoch": 0.6392465127938833, + "grad_norm": 1.6104986667633057, + "learning_rate": 6.083466981742496e-05, + "loss": 1.6165, + "step": 17850 + }, + { + "epoch": 0.6392823249234516, + "grad_norm": 1.5701024532318115, + "learning_rate": 6.082399767984206e-05, + "loss": 1.5008, + "step": 17851 + }, + { + "epoch": 0.6393181370530199, + "grad_norm": 2.5654051303863525, + "learning_rate": 6.081332606932173e-05, + "loss": 1.4486, + "step": 17852 + }, + { + "epoch": 0.6393539491825881, + "grad_norm": 1.540667176246643, + "learning_rate": 6.0802654986007534e-05, + "loss": 1.5323, + "step": 17853 + }, + { + "epoch": 0.6393897613121564, + "grad_norm": 1.600110411643982, + "learning_rate": 6.079198443004308e-05, + "loss": 1.4927, + "step": 17854 + }, + { + "epoch": 0.6394255734417247, + "grad_norm": 1.3567900657653809, + "learning_rate": 6.0781314401571875e-05, + "loss": 1.5003, + "step": 17855 + }, + { + "epoch": 0.639461385571293, + "grad_norm": 3.0247714519500732, + "learning_rate": 6.077064490073752e-05, + "loss": 1.5352, + "step": 17856 + }, + { + "epoch": 0.6394971977008613, + "grad_norm": 2.202338695526123, + "learning_rate": 6.075997592768352e-05, + "loss": 1.8075, + "step": 17857 + }, + { + "epoch": 0.6395330098304296, + "grad_norm": 1.6735132932662964, + "learning_rate": 6.074930748255343e-05, + "loss": 1.3196, + "step": 17858 + }, + { + "epoch": 0.6395688219599979, + "grad_norm": 1.94517982006073, + "learning_rate": 6.073863956549077e-05, + "loss": 1.506, + "step": 17859 + }, + { + "epoch": 0.6396046340895661, + "grad_norm": 2.440854787826538, + "learning_rate": 6.07279721766391e-05, + "loss": 1.6454, + "step": 17860 + }, + { + "epoch": 0.6396404462191344, + "grad_norm": 1.7555967569351196, + "learning_rate": 6.071730531614189e-05, + "loss": 1.4305, + "step": 17861 + }, + { + "epoch": 0.6396762583487027, + "grad_norm": 1.5704050064086914, + "learning_rate": 6.070663898414266e-05, + "loss": 1.373, + "step": 17862 + }, + { + "epoch": 0.6397120704782709, + "grad_norm": 2.053732395172119, + "learning_rate": 6.069597318078493e-05, + "loss": 1.4501, + "step": 17863 + }, + { + "epoch": 0.6397478826078393, + "grad_norm": 1.6441607475280762, + "learning_rate": 6.0685307906212163e-05, + "loss": 1.1916, + "step": 17864 + }, + { + "epoch": 0.6397836947374076, + "grad_norm": 1.6410545110702515, + "learning_rate": 6.067464316056789e-05, + "loss": 1.2819, + "step": 17865 + }, + { + "epoch": 0.6398195068669759, + "grad_norm": 1.6242485046386719, + "learning_rate": 6.066397894399553e-05, + "loss": 1.5297, + "step": 17866 + }, + { + "epoch": 0.6398553189965441, + "grad_norm": 1.47056245803833, + "learning_rate": 6.065331525663864e-05, + "loss": 1.6764, + "step": 17867 + }, + { + "epoch": 0.6398911311261124, + "grad_norm": 1.3741710186004639, + "learning_rate": 6.064265209864061e-05, + "loss": 1.3427, + "step": 17868 + }, + { + "epoch": 0.6399269432556807, + "grad_norm": 1.7579123973846436, + "learning_rate": 6.063198947014495e-05, + "loss": 1.9434, + "step": 17869 + }, + { + "epoch": 0.6399627553852489, + "grad_norm": 1.678293228149414, + "learning_rate": 6.06213273712951e-05, + "loss": 1.2601, + "step": 17870 + }, + { + "epoch": 0.6399985675148173, + "grad_norm": 1.5967705249786377, + "learning_rate": 6.061066580223445e-05, + "loss": 1.5347, + "step": 17871 + }, + { + "epoch": 0.6400343796443856, + "grad_norm": 1.7844775915145874, + "learning_rate": 6.0600004763106524e-05, + "loss": 1.3676, + "step": 17872 + }, + { + "epoch": 0.6400701917739539, + "grad_norm": 1.7952393293380737, + "learning_rate": 6.058934425405467e-05, + "loss": 1.6115, + "step": 17873 + }, + { + "epoch": 0.6401060039035221, + "grad_norm": 1.9247483015060425, + "learning_rate": 6.0578684275222376e-05, + "loss": 1.2821, + "step": 17874 + }, + { + "epoch": 0.6401418160330904, + "grad_norm": 1.4412040710449219, + "learning_rate": 6.056802482675303e-05, + "loss": 1.4646, + "step": 17875 + }, + { + "epoch": 0.6401776281626587, + "grad_norm": 1.828956127166748, + "learning_rate": 6.055736590879007e-05, + "loss": 1.856, + "step": 17876 + }, + { + "epoch": 0.6402134402922269, + "grad_norm": 1.5418649911880493, + "learning_rate": 6.0546707521476844e-05, + "loss": 1.4854, + "step": 17877 + }, + { + "epoch": 0.6402492524217953, + "grad_norm": 1.4142812490463257, + "learning_rate": 6.0536049664956797e-05, + "loss": 1.3271, + "step": 17878 + }, + { + "epoch": 0.6402850645513636, + "grad_norm": 2.0951547622680664, + "learning_rate": 6.052539233937331e-05, + "loss": 1.2959, + "step": 17879 + }, + { + "epoch": 0.6403208766809318, + "grad_norm": 1.931695580482483, + "learning_rate": 6.0514735544869706e-05, + "loss": 1.6754, + "step": 17880 + }, + { + "epoch": 0.6403566888105001, + "grad_norm": 2.0580785274505615, + "learning_rate": 6.0504079281589454e-05, + "loss": 1.4303, + "step": 17881 + }, + { + "epoch": 0.6403925009400684, + "grad_norm": 1.1910802125930786, + "learning_rate": 6.049342354967581e-05, + "loss": 1.6201, + "step": 17882 + }, + { + "epoch": 0.6404283130696367, + "grad_norm": 1.6602959632873535, + "learning_rate": 6.0482768349272256e-05, + "loss": 1.3145, + "step": 17883 + }, + { + "epoch": 0.6404641251992049, + "grad_norm": 1.779673457145691, + "learning_rate": 6.047211368052201e-05, + "loss": 1.4677, + "step": 17884 + }, + { + "epoch": 0.6404999373287733, + "grad_norm": 1.6596450805664062, + "learning_rate": 6.0461459543568566e-05, + "loss": 1.5077, + "step": 17885 + }, + { + "epoch": 0.6405357494583416, + "grad_norm": 3.11470103263855, + "learning_rate": 6.04508059385551e-05, + "loss": 1.6707, + "step": 17886 + }, + { + "epoch": 0.6405715615879098, + "grad_norm": 1.251984715461731, + "learning_rate": 6.0440152865625076e-05, + "loss": 1.5564, + "step": 17887 + }, + { + "epoch": 0.6406073737174781, + "grad_norm": 2.3035483360290527, + "learning_rate": 6.042950032492179e-05, + "loss": 1.3278, + "step": 17888 + }, + { + "epoch": 0.6406431858470464, + "grad_norm": 1.6876386404037476, + "learning_rate": 6.041884831658848e-05, + "loss": 1.4541, + "step": 17889 + }, + { + "epoch": 0.6406789979766147, + "grad_norm": 1.6969448328018188, + "learning_rate": 6.040819684076856e-05, + "loss": 1.7653, + "step": 17890 + }, + { + "epoch": 0.6407148101061829, + "grad_norm": 1.8200666904449463, + "learning_rate": 6.039754589760522e-05, + "loss": 1.4405, + "step": 17891 + }, + { + "epoch": 0.6407506222357513, + "grad_norm": 1.7832893133163452, + "learning_rate": 6.038689548724189e-05, + "loss": 1.5234, + "step": 17892 + }, + { + "epoch": 0.6407864343653196, + "grad_norm": 1.9729869365692139, + "learning_rate": 6.037624560982171e-05, + "loss": 1.4516, + "step": 17893 + }, + { + "epoch": 0.6408222464948878, + "grad_norm": 1.648978590965271, + "learning_rate": 6.03655962654881e-05, + "loss": 1.7351, + "step": 17894 + }, + { + "epoch": 0.6408580586244561, + "grad_norm": 1.5560195446014404, + "learning_rate": 6.035494745438421e-05, + "loss": 1.4883, + "step": 17895 + }, + { + "epoch": 0.6408938707540244, + "grad_norm": 1.6403807401657104, + "learning_rate": 6.034429917665342e-05, + "loss": 1.7517, + "step": 17896 + }, + { + "epoch": 0.6409296828835926, + "grad_norm": 2.9381628036499023, + "learning_rate": 6.033365143243891e-05, + "loss": 1.672, + "step": 17897 + }, + { + "epoch": 0.6409654950131609, + "grad_norm": 1.4989262819290161, + "learning_rate": 6.0323004221883936e-05, + "loss": 1.1939, + "step": 17898 + }, + { + "epoch": 0.6410013071427293, + "grad_norm": 1.9374892711639404, + "learning_rate": 6.031235754513178e-05, + "loss": 1.4359, + "step": 17899 + }, + { + "epoch": 0.6410371192722976, + "grad_norm": 1.6095060110092163, + "learning_rate": 6.030171140232562e-05, + "loss": 1.5939, + "step": 17900 + }, + { + "epoch": 0.6410729314018658, + "grad_norm": 1.5997309684753418, + "learning_rate": 6.029106579360879e-05, + "loss": 1.1778, + "step": 17901 + }, + { + "epoch": 0.6411087435314341, + "grad_norm": 2.4153220653533936, + "learning_rate": 6.028042071912439e-05, + "loss": 1.6604, + "step": 17902 + }, + { + "epoch": 0.6411445556610024, + "grad_norm": 1.8344680070877075, + "learning_rate": 6.026977617901575e-05, + "loss": 1.4529, + "step": 17903 + }, + { + "epoch": 0.6411803677905706, + "grad_norm": 1.8805102109909058, + "learning_rate": 6.0259132173426006e-05, + "loss": 1.5929, + "step": 17904 + }, + { + "epoch": 0.6412161799201389, + "grad_norm": 1.684512972831726, + "learning_rate": 6.0248488702498353e-05, + "loss": 1.1998, + "step": 17905 + }, + { + "epoch": 0.6412519920497073, + "grad_norm": 2.2713093757629395, + "learning_rate": 6.0237845766376035e-05, + "loss": 1.6522, + "step": 17906 + }, + { + "epoch": 0.6412878041792756, + "grad_norm": 1.4675745964050293, + "learning_rate": 6.022720336520218e-05, + "loss": 1.632, + "step": 17907 + }, + { + "epoch": 0.6413236163088438, + "grad_norm": 1.731246829032898, + "learning_rate": 6.021656149912003e-05, + "loss": 1.7003, + "step": 17908 + }, + { + "epoch": 0.6413594284384121, + "grad_norm": 1.5518698692321777, + "learning_rate": 6.020592016827271e-05, + "loss": 1.4585, + "step": 17909 + }, + { + "epoch": 0.6413952405679804, + "grad_norm": 1.5268816947937012, + "learning_rate": 6.019527937280342e-05, + "loss": 1.244, + "step": 17910 + }, + { + "epoch": 0.6414310526975486, + "grad_norm": 2.2172024250030518, + "learning_rate": 6.018463911285528e-05, + "loss": 1.345, + "step": 17911 + }, + { + "epoch": 0.6414668648271169, + "grad_norm": 2.417259693145752, + "learning_rate": 6.0173999388571486e-05, + "loss": 1.4226, + "step": 17912 + }, + { + "epoch": 0.6415026769566853, + "grad_norm": 1.5019891262054443, + "learning_rate": 6.0163360200095153e-05, + "loss": 1.4445, + "step": 17913 + }, + { + "epoch": 0.6415384890862535, + "grad_norm": 1.449415683746338, + "learning_rate": 6.015272154756941e-05, + "loss": 1.5976, + "step": 17914 + }, + { + "epoch": 0.6415743012158218, + "grad_norm": 1.4184887409210205, + "learning_rate": 6.014208343113741e-05, + "loss": 1.3103, + "step": 17915 + }, + { + "epoch": 0.6416101133453901, + "grad_norm": 1.948178768157959, + "learning_rate": 6.0131445850942256e-05, + "loss": 1.5137, + "step": 17916 + }, + { + "epoch": 0.6416459254749584, + "grad_norm": 2.6223580837249756, + "learning_rate": 6.012080880712708e-05, + "loss": 1.2029, + "step": 17917 + }, + { + "epoch": 0.6416817376045266, + "grad_norm": 1.2477376461029053, + "learning_rate": 6.011017229983497e-05, + "loss": 1.3365, + "step": 17918 + }, + { + "epoch": 0.6417175497340949, + "grad_norm": 1.6361243724822998, + "learning_rate": 6.0099536329209046e-05, + "loss": 1.5658, + "step": 17919 + }, + { + "epoch": 0.6417533618636633, + "grad_norm": 1.3723866939544678, + "learning_rate": 6.008890089539239e-05, + "loss": 1.501, + "step": 17920 + }, + { + "epoch": 0.6417891739932315, + "grad_norm": 1.4834378957748413, + "learning_rate": 6.0078265998528105e-05, + "loss": 1.2211, + "step": 17921 + }, + { + "epoch": 0.6418249861227998, + "grad_norm": 1.4249910116195679, + "learning_rate": 6.006763163875925e-05, + "loss": 1.2466, + "step": 17922 + }, + { + "epoch": 0.6418607982523681, + "grad_norm": 2.4966237545013428, + "learning_rate": 6.005699781622889e-05, + "loss": 1.5329, + "step": 17923 + }, + { + "epoch": 0.6418966103819364, + "grad_norm": 1.3982599973678589, + "learning_rate": 6.00463645310801e-05, + "loss": 1.3409, + "step": 17924 + }, + { + "epoch": 0.6419324225115046, + "grad_norm": 1.6006054878234863, + "learning_rate": 6.003573178345594e-05, + "loss": 1.4297, + "step": 17925 + }, + { + "epoch": 0.6419682346410729, + "grad_norm": 1.729066014289856, + "learning_rate": 6.002509957349948e-05, + "loss": 1.3919, + "step": 17926 + }, + { + "epoch": 0.6420040467706413, + "grad_norm": 1.4634580612182617, + "learning_rate": 6.001446790135371e-05, + "loss": 1.2687, + "step": 17927 + }, + { + "epoch": 0.6420398589002095, + "grad_norm": 1.6232527494430542, + "learning_rate": 6.0003836767161726e-05, + "loss": 1.2855, + "step": 17928 + }, + { + "epoch": 0.6420756710297778, + "grad_norm": 1.6562719345092773, + "learning_rate": 5.999320617106649e-05, + "loss": 1.5466, + "step": 17929 + }, + { + "epoch": 0.6421114831593461, + "grad_norm": 1.4762431383132935, + "learning_rate": 5.9982576113211095e-05, + "loss": 1.2822, + "step": 17930 + }, + { + "epoch": 0.6421472952889143, + "grad_norm": 1.5727423429489136, + "learning_rate": 5.9971946593738525e-05, + "loss": 1.317, + "step": 17931 + }, + { + "epoch": 0.6421831074184826, + "grad_norm": 1.6262317895889282, + "learning_rate": 5.996131761279176e-05, + "loss": 1.701, + "step": 17932 + }, + { + "epoch": 0.6422189195480509, + "grad_norm": 1.5175750255584717, + "learning_rate": 5.995068917051383e-05, + "loss": 1.3711, + "step": 17933 + }, + { + "epoch": 0.6422547316776193, + "grad_norm": 1.4356316328048706, + "learning_rate": 5.9940061267047695e-05, + "loss": 1.4751, + "step": 17934 + }, + { + "epoch": 0.6422905438071875, + "grad_norm": 2.049543619155884, + "learning_rate": 5.992943390253639e-05, + "loss": 1.6956, + "step": 17935 + }, + { + "epoch": 0.6423263559367558, + "grad_norm": 1.311546802520752, + "learning_rate": 5.991880707712284e-05, + "loss": 1.445, + "step": 17936 + }, + { + "epoch": 0.6423621680663241, + "grad_norm": 2.0570836067199707, + "learning_rate": 5.9908180790950064e-05, + "loss": 1.3155, + "step": 17937 + }, + { + "epoch": 0.6423979801958923, + "grad_norm": 1.6217472553253174, + "learning_rate": 5.989755504416098e-05, + "loss": 1.4565, + "step": 17938 + }, + { + "epoch": 0.6424337923254606, + "grad_norm": 1.4959290027618408, + "learning_rate": 5.988692983689859e-05, + "loss": 1.7333, + "step": 17939 + }, + { + "epoch": 0.6424696044550289, + "grad_norm": 1.4493623971939087, + "learning_rate": 5.98763051693058e-05, + "loss": 1.2832, + "step": 17940 + }, + { + "epoch": 0.6425054165845973, + "grad_norm": 1.8293143510818481, + "learning_rate": 5.9865681041525566e-05, + "loss": 1.7435, + "step": 17941 + }, + { + "epoch": 0.6425412287141655, + "grad_norm": 1.8717924356460571, + "learning_rate": 5.9855057453700836e-05, + "loss": 1.7222, + "step": 17942 + }, + { + "epoch": 0.6425770408437338, + "grad_norm": 2.5850019454956055, + "learning_rate": 5.98444344059745e-05, + "loss": 1.4265, + "step": 17943 + }, + { + "epoch": 0.6426128529733021, + "grad_norm": 1.774579644203186, + "learning_rate": 5.9833811898489534e-05, + "loss": 1.478, + "step": 17944 + }, + { + "epoch": 0.6426486651028703, + "grad_norm": 1.7594568729400635, + "learning_rate": 5.982318993138879e-05, + "loss": 1.6105, + "step": 17945 + }, + { + "epoch": 0.6426844772324386, + "grad_norm": 1.3756812810897827, + "learning_rate": 5.981256850481523e-05, + "loss": 1.362, + "step": 17946 + }, + { + "epoch": 0.6427202893620069, + "grad_norm": 1.6097584962844849, + "learning_rate": 5.980194761891169e-05, + "loss": 1.5736, + "step": 17947 + }, + { + "epoch": 0.6427561014915752, + "grad_norm": 1.5040655136108398, + "learning_rate": 5.9791327273821105e-05, + "loss": 1.577, + "step": 17948 + }, + { + "epoch": 0.6427919136211435, + "grad_norm": 2.5497679710388184, + "learning_rate": 5.978070746968637e-05, + "loss": 1.4751, + "step": 17949 + }, + { + "epoch": 0.6428277257507118, + "grad_norm": 1.8910397291183472, + "learning_rate": 5.977008820665031e-05, + "loss": 1.6845, + "step": 17950 + }, + { + "epoch": 0.6428635378802801, + "grad_norm": 1.77099609375, + "learning_rate": 5.975946948485583e-05, + "loss": 1.5673, + "step": 17951 + }, + { + "epoch": 0.6428993500098483, + "grad_norm": 2.0042824745178223, + "learning_rate": 5.974885130444577e-05, + "loss": 1.5555, + "step": 17952 + }, + { + "epoch": 0.6429351621394166, + "grad_norm": 1.585511565208435, + "learning_rate": 5.9738233665563017e-05, + "loss": 1.3588, + "step": 17953 + }, + { + "epoch": 0.6429709742689849, + "grad_norm": 1.3501858711242676, + "learning_rate": 5.972761656835038e-05, + "loss": 1.2952, + "step": 17954 + }, + { + "epoch": 0.6430067863985532, + "grad_norm": 1.8139315843582153, + "learning_rate": 5.971700001295072e-05, + "loss": 1.4315, + "step": 17955 + }, + { + "epoch": 0.6430425985281215, + "grad_norm": 1.828161597251892, + "learning_rate": 5.9706383999506855e-05, + "loss": 1.4066, + "step": 17956 + }, + { + "epoch": 0.6430784106576898, + "grad_norm": 1.7610077857971191, + "learning_rate": 5.969576852816163e-05, + "loss": 1.7405, + "step": 17957 + }, + { + "epoch": 0.643114222787258, + "grad_norm": 2.1214122772216797, + "learning_rate": 5.968515359905785e-05, + "loss": 1.4642, + "step": 17958 + }, + { + "epoch": 0.6431500349168263, + "grad_norm": 2.081220865249634, + "learning_rate": 5.967453921233832e-05, + "loss": 1.465, + "step": 17959 + }, + { + "epoch": 0.6431858470463946, + "grad_norm": 2.047666549682617, + "learning_rate": 5.966392536814585e-05, + "loss": 1.4785, + "step": 17960 + }, + { + "epoch": 0.6432216591759629, + "grad_norm": 2.4153099060058594, + "learning_rate": 5.9653312066623234e-05, + "loss": 1.7967, + "step": 17961 + }, + { + "epoch": 0.6432574713055312, + "grad_norm": 1.863404393196106, + "learning_rate": 5.964269930791326e-05, + "loss": 1.5028, + "step": 17962 + }, + { + "epoch": 0.6432932834350995, + "grad_norm": 1.4706677198410034, + "learning_rate": 5.963208709215871e-05, + "loss": 1.3202, + "step": 17963 + }, + { + "epoch": 0.6433290955646678, + "grad_norm": 1.6551233530044556, + "learning_rate": 5.962147541950236e-05, + "loss": 1.4942, + "step": 17964 + }, + { + "epoch": 0.643364907694236, + "grad_norm": 2.2377161979675293, + "learning_rate": 5.961086429008696e-05, + "loss": 1.3441, + "step": 17965 + }, + { + "epoch": 0.6434007198238043, + "grad_norm": 1.5724660158157349, + "learning_rate": 5.960025370405531e-05, + "loss": 1.3756, + "step": 17966 + }, + { + "epoch": 0.6434365319533726, + "grad_norm": 1.5656579732894897, + "learning_rate": 5.958964366155014e-05, + "loss": 1.4846, + "step": 17967 + }, + { + "epoch": 0.6434723440829409, + "grad_norm": 2.0878260135650635, + "learning_rate": 5.957903416271414e-05, + "loss": 1.6058, + "step": 17968 + }, + { + "epoch": 0.6435081562125092, + "grad_norm": 1.9619213342666626, + "learning_rate": 5.9568425207690146e-05, + "loss": 1.5998, + "step": 17969 + }, + { + "epoch": 0.6435439683420775, + "grad_norm": 1.3761613368988037, + "learning_rate": 5.9557816796620804e-05, + "loss": 1.6619, + "step": 17970 + }, + { + "epoch": 0.6435797804716458, + "grad_norm": 1.948231816291809, + "learning_rate": 5.954720892964889e-05, + "loss": 1.3027, + "step": 17971 + }, + { + "epoch": 0.643615592601214, + "grad_norm": 1.6816844940185547, + "learning_rate": 5.9536601606917075e-05, + "loss": 1.4994, + "step": 17972 + }, + { + "epoch": 0.6436514047307823, + "grad_norm": 1.75279700756073, + "learning_rate": 5.952599482856811e-05, + "loss": 1.5466, + "step": 17973 + }, + { + "epoch": 0.6436872168603506, + "grad_norm": 1.7188254594802856, + "learning_rate": 5.951538859474467e-05, + "loss": 1.5254, + "step": 17974 + }, + { + "epoch": 0.6437230289899188, + "grad_norm": 1.8281207084655762, + "learning_rate": 5.950478290558947e-05, + "loss": 1.5956, + "step": 17975 + }, + { + "epoch": 0.6437588411194872, + "grad_norm": 1.9698082208633423, + "learning_rate": 5.9494177761245194e-05, + "loss": 1.5074, + "step": 17976 + }, + { + "epoch": 0.6437946532490555, + "grad_norm": 1.3871486186981201, + "learning_rate": 5.9483573161854464e-05, + "loss": 1.5655, + "step": 17977 + }, + { + "epoch": 0.6438304653786238, + "grad_norm": 1.6179381608963013, + "learning_rate": 5.947296910756004e-05, + "loss": 1.2935, + "step": 17978 + }, + { + "epoch": 0.643866277508192, + "grad_norm": 1.5491410493850708, + "learning_rate": 5.946236559850449e-05, + "loss": 1.4044, + "step": 17979 + }, + { + "epoch": 0.6439020896377603, + "grad_norm": 2.000506639480591, + "learning_rate": 5.945176263483057e-05, + "loss": 1.536, + "step": 17980 + }, + { + "epoch": 0.6439379017673286, + "grad_norm": 1.5044797658920288, + "learning_rate": 5.9441160216680826e-05, + "loss": 1.6398, + "step": 17981 + }, + { + "epoch": 0.6439737138968968, + "grad_norm": 2.1576712131500244, + "learning_rate": 5.9430558344198016e-05, + "loss": 1.4296, + "step": 17982 + }, + { + "epoch": 0.6440095260264652, + "grad_norm": 1.4655178785324097, + "learning_rate": 5.941995701752465e-05, + "loss": 1.4955, + "step": 17983 + }, + { + "epoch": 0.6440453381560335, + "grad_norm": 1.7146788835525513, + "learning_rate": 5.9409356236803456e-05, + "loss": 1.6319, + "step": 17984 + }, + { + "epoch": 0.6440811502856018, + "grad_norm": 2.0093584060668945, + "learning_rate": 5.9398756002177035e-05, + "loss": 1.4727, + "step": 17985 + }, + { + "epoch": 0.64411696241517, + "grad_norm": 1.5980204343795776, + "learning_rate": 5.938815631378794e-05, + "loss": 1.4705, + "step": 17986 + }, + { + "epoch": 0.6441527745447383, + "grad_norm": 1.7558846473693848, + "learning_rate": 5.937755717177885e-05, + "loss": 1.4372, + "step": 17987 + }, + { + "epoch": 0.6441885866743066, + "grad_norm": 2.151200294494629, + "learning_rate": 5.9366958576292284e-05, + "loss": 1.5712, + "step": 17988 + }, + { + "epoch": 0.6442243988038748, + "grad_norm": 1.6231688261032104, + "learning_rate": 5.9356360527470934e-05, + "loss": 1.5356, + "step": 17989 + }, + { + "epoch": 0.6442602109334432, + "grad_norm": 2.068516969680786, + "learning_rate": 5.9345763025457266e-05, + "loss": 1.7413, + "step": 17990 + }, + { + "epoch": 0.6442960230630115, + "grad_norm": 1.3470734357833862, + "learning_rate": 5.9335166070393975e-05, + "loss": 1.6185, + "step": 17991 + }, + { + "epoch": 0.6443318351925797, + "grad_norm": 1.7504842281341553, + "learning_rate": 5.93245696624235e-05, + "loss": 1.5977, + "step": 17992 + }, + { + "epoch": 0.644367647322148, + "grad_norm": 1.6177539825439453, + "learning_rate": 5.931397380168855e-05, + "loss": 1.3695, + "step": 17993 + }, + { + "epoch": 0.6444034594517163, + "grad_norm": 2.0384392738342285, + "learning_rate": 5.9303378488331576e-05, + "loss": 1.5366, + "step": 17994 + }, + { + "epoch": 0.6444392715812846, + "grad_norm": 2.010671377182007, + "learning_rate": 5.9292783722495126e-05, + "loss": 1.5376, + "step": 17995 + }, + { + "epoch": 0.6444750837108528, + "grad_norm": 1.5663658380508423, + "learning_rate": 5.928218950432179e-05, + "loss": 1.3194, + "step": 17996 + }, + { + "epoch": 0.6445108958404212, + "grad_norm": 1.3770331144332886, + "learning_rate": 5.927159583395403e-05, + "loss": 1.4241, + "step": 17997 + }, + { + "epoch": 0.6445467079699895, + "grad_norm": 1.6570316553115845, + "learning_rate": 5.926100271153446e-05, + "loss": 1.4073, + "step": 17998 + }, + { + "epoch": 0.6445825200995577, + "grad_norm": 1.6265000104904175, + "learning_rate": 5.9250410137205506e-05, + "loss": 1.485, + "step": 17999 + }, + { + "epoch": 0.644618332229126, + "grad_norm": 1.9524234533309937, + "learning_rate": 5.923981811110977e-05, + "loss": 1.7584, + "step": 18000 + }, + { + "epoch": 0.6446541443586943, + "grad_norm": 2.180847406387329, + "learning_rate": 5.922922663338969e-05, + "loss": 1.6183, + "step": 18001 + }, + { + "epoch": 0.6446899564882625, + "grad_norm": 1.5719871520996094, + "learning_rate": 5.921863570418775e-05, + "loss": 1.5754, + "step": 18002 + }, + { + "epoch": 0.6447257686178308, + "grad_norm": 1.1843252182006836, + "learning_rate": 5.9208045323646474e-05, + "loss": 1.3016, + "step": 18003 + }, + { + "epoch": 0.6447615807473992, + "grad_norm": 2.256765604019165, + "learning_rate": 5.919745549190834e-05, + "loss": 1.483, + "step": 18004 + }, + { + "epoch": 0.6447973928769675, + "grad_norm": 1.5203441381454468, + "learning_rate": 5.91868662091158e-05, + "loss": 1.4898, + "step": 18005 + }, + { + "epoch": 0.6448332050065357, + "grad_norm": 1.4746381044387817, + "learning_rate": 5.9176277475411324e-05, + "loss": 1.4107, + "step": 18006 + }, + { + "epoch": 0.644869017136104, + "grad_norm": 1.7098374366760254, + "learning_rate": 5.91656892909374e-05, + "loss": 1.1795, + "step": 18007 + }, + { + "epoch": 0.6449048292656723, + "grad_norm": 1.5304396152496338, + "learning_rate": 5.915510165583642e-05, + "loss": 1.2104, + "step": 18008 + }, + { + "epoch": 0.6449406413952405, + "grad_norm": 1.4094974994659424, + "learning_rate": 5.91445145702509e-05, + "loss": 1.361, + "step": 18009 + }, + { + "epoch": 0.6449764535248088, + "grad_norm": 1.5532375574111938, + "learning_rate": 5.9133928034323215e-05, + "loss": 1.5194, + "step": 18010 + }, + { + "epoch": 0.6450122656543772, + "grad_norm": 2.0995876789093018, + "learning_rate": 5.912334204819581e-05, + "loss": 1.6518, + "step": 18011 + }, + { + "epoch": 0.6450480777839455, + "grad_norm": 1.8141247034072876, + "learning_rate": 5.911275661201112e-05, + "loss": 1.5584, + "step": 18012 + }, + { + "epoch": 0.6450838899135137, + "grad_norm": 1.501083254814148, + "learning_rate": 5.910217172591155e-05, + "loss": 1.437, + "step": 18013 + }, + { + "epoch": 0.645119702043082, + "grad_norm": 1.8988889455795288, + "learning_rate": 5.90915873900395e-05, + "loss": 1.2275, + "step": 18014 + }, + { + "epoch": 0.6451555141726503, + "grad_norm": 1.5784447193145752, + "learning_rate": 5.908100360453737e-05, + "loss": 1.5859, + "step": 18015 + }, + { + "epoch": 0.6451913263022185, + "grad_norm": 1.3819835186004639, + "learning_rate": 5.9070420369547564e-05, + "loss": 1.3364, + "step": 18016 + }, + { + "epoch": 0.6452271384317868, + "grad_norm": 1.9142459630966187, + "learning_rate": 5.905983768521244e-05, + "loss": 1.2799, + "step": 18017 + }, + { + "epoch": 0.6452629505613552, + "grad_norm": 1.3652900457382202, + "learning_rate": 5.904925555167442e-05, + "loss": 1.206, + "step": 18018 + }, + { + "epoch": 0.6452987626909235, + "grad_norm": 1.5359416007995605, + "learning_rate": 5.903867396907583e-05, + "loss": 1.1311, + "step": 18019 + }, + { + "epoch": 0.6453345748204917, + "grad_norm": 1.9005250930786133, + "learning_rate": 5.9028092937559034e-05, + "loss": 1.5002, + "step": 18020 + }, + { + "epoch": 0.64537038695006, + "grad_norm": 1.9126530885696411, + "learning_rate": 5.901751245726641e-05, + "loss": 1.7099, + "step": 18021 + }, + { + "epoch": 0.6454061990796283, + "grad_norm": 1.833815574645996, + "learning_rate": 5.9006932528340284e-05, + "loss": 1.4293, + "step": 18022 + }, + { + "epoch": 0.6454420112091965, + "grad_norm": 3.0067315101623535, + "learning_rate": 5.899635315092301e-05, + "loss": 1.5751, + "step": 18023 + }, + { + "epoch": 0.6454778233387648, + "grad_norm": 1.957001805305481, + "learning_rate": 5.89857743251569e-05, + "loss": 1.3815, + "step": 18024 + }, + { + "epoch": 0.6455136354683331, + "grad_norm": 1.573081135749817, + "learning_rate": 5.897519605118431e-05, + "loss": 1.3973, + "step": 18025 + }, + { + "epoch": 0.6455494475979014, + "grad_norm": 1.9092211723327637, + "learning_rate": 5.896461832914753e-05, + "loss": 1.6152, + "step": 18026 + }, + { + "epoch": 0.6455852597274697, + "grad_norm": 1.6830337047576904, + "learning_rate": 5.8954041159188876e-05, + "loss": 1.8081, + "step": 18027 + }, + { + "epoch": 0.645621071857038, + "grad_norm": 1.7186933755874634, + "learning_rate": 5.894346454145068e-05, + "loss": 1.4044, + "step": 18028 + }, + { + "epoch": 0.6456568839866063, + "grad_norm": 1.6428803205490112, + "learning_rate": 5.8932888476075166e-05, + "loss": 1.5604, + "step": 18029 + }, + { + "epoch": 0.6456926961161745, + "grad_norm": 2.214829683303833, + "learning_rate": 5.89223129632047e-05, + "loss": 1.3191, + "step": 18030 + }, + { + "epoch": 0.6457285082457428, + "grad_norm": 1.8516603708267212, + "learning_rate": 5.8911738002981506e-05, + "loss": 1.5621, + "step": 18031 + }, + { + "epoch": 0.6457643203753111, + "grad_norm": 2.198071002960205, + "learning_rate": 5.890116359554789e-05, + "loss": 1.398, + "step": 18032 + }, + { + "epoch": 0.6458001325048794, + "grad_norm": 1.7119433879852295, + "learning_rate": 5.8890589741046084e-05, + "loss": 1.6987, + "step": 18033 + }, + { + "epoch": 0.6458359446344477, + "grad_norm": 1.5865453481674194, + "learning_rate": 5.888001643961839e-05, + "loss": 1.4178, + "step": 18034 + }, + { + "epoch": 0.645871756764016, + "grad_norm": 1.7404098510742188, + "learning_rate": 5.886944369140701e-05, + "loss": 1.8102, + "step": 18035 + }, + { + "epoch": 0.6459075688935842, + "grad_norm": 1.6258031129837036, + "learning_rate": 5.8858871496554235e-05, + "loss": 1.4598, + "step": 18036 + }, + { + "epoch": 0.6459433810231525, + "grad_norm": 2.006159543991089, + "learning_rate": 5.884829985520227e-05, + "loss": 1.6035, + "step": 18037 + }, + { + "epoch": 0.6459791931527208, + "grad_norm": 1.9466265439987183, + "learning_rate": 5.883772876749334e-05, + "loss": 1.3743, + "step": 18038 + }, + { + "epoch": 0.6460150052822891, + "grad_norm": 1.9104527235031128, + "learning_rate": 5.882715823356968e-05, + "loss": 1.4299, + "step": 18039 + }, + { + "epoch": 0.6460508174118574, + "grad_norm": 2.058929920196533, + "learning_rate": 5.881658825357348e-05, + "loss": 1.7316, + "step": 18040 + }, + { + "epoch": 0.6460866295414257, + "grad_norm": 1.6255000829696655, + "learning_rate": 5.8806018827646994e-05, + "loss": 1.4241, + "step": 18041 + }, + { + "epoch": 0.646122441670994, + "grad_norm": 1.745592713356018, + "learning_rate": 5.879544995593236e-05, + "loss": 1.6278, + "step": 18042 + }, + { + "epoch": 0.6461582538005622, + "grad_norm": 1.3595131635665894, + "learning_rate": 5.878488163857181e-05, + "loss": 1.262, + "step": 18043 + }, + { + "epoch": 0.6461940659301305, + "grad_norm": 1.6638784408569336, + "learning_rate": 5.87743138757075e-05, + "loss": 1.5087, + "step": 18044 + }, + { + "epoch": 0.6462298780596988, + "grad_norm": 1.6791967153549194, + "learning_rate": 5.8763746667481634e-05, + "loss": 1.3979, + "step": 18045 + }, + { + "epoch": 0.646265690189267, + "grad_norm": 1.6635165214538574, + "learning_rate": 5.8753180014036377e-05, + "loss": 1.4787, + "step": 18046 + }, + { + "epoch": 0.6463015023188354, + "grad_norm": 1.7006889581680298, + "learning_rate": 5.874261391551386e-05, + "loss": 1.5113, + "step": 18047 + }, + { + "epoch": 0.6463373144484037, + "grad_norm": 1.8248895406723022, + "learning_rate": 5.873204837205626e-05, + "loss": 1.518, + "step": 18048 + }, + { + "epoch": 0.646373126577972, + "grad_norm": 1.6426078081130981, + "learning_rate": 5.8721483383805696e-05, + "loss": 1.4435, + "step": 18049 + }, + { + "epoch": 0.6464089387075402, + "grad_norm": 1.9970711469650269, + "learning_rate": 5.871091895090437e-05, + "loss": 1.2891, + "step": 18050 + }, + { + "epoch": 0.6464447508371085, + "grad_norm": 2.244947910308838, + "learning_rate": 5.870035507349434e-05, + "loss": 1.8148, + "step": 18051 + }, + { + "epoch": 0.6464805629666768, + "grad_norm": 1.321719765663147, + "learning_rate": 5.8689791751717757e-05, + "loss": 1.286, + "step": 18052 + }, + { + "epoch": 0.646516375096245, + "grad_norm": 1.4628890752792358, + "learning_rate": 5.867922898571675e-05, + "loss": 1.6739, + "step": 18053 + }, + { + "epoch": 0.6465521872258134, + "grad_norm": 2.1549174785614014, + "learning_rate": 5.8668666775633426e-05, + "loss": 1.8747, + "step": 18054 + }, + { + "epoch": 0.6465879993553817, + "grad_norm": 1.6913148164749146, + "learning_rate": 5.8658105121609896e-05, + "loss": 1.4313, + "step": 18055 + }, + { + "epoch": 0.64662381148495, + "grad_norm": 1.9767898321151733, + "learning_rate": 5.864754402378818e-05, + "loss": 1.5655, + "step": 18056 + }, + { + "epoch": 0.6466596236145182, + "grad_norm": 1.8027931451797485, + "learning_rate": 5.863698348231045e-05, + "loss": 1.4809, + "step": 18057 + }, + { + "epoch": 0.6466954357440865, + "grad_norm": 1.5340521335601807, + "learning_rate": 5.862642349731874e-05, + "loss": 1.4937, + "step": 18058 + }, + { + "epoch": 0.6467312478736548, + "grad_norm": 1.65854811668396, + "learning_rate": 5.861586406895514e-05, + "loss": 1.2827, + "step": 18059 + }, + { + "epoch": 0.646767060003223, + "grad_norm": 1.2305033206939697, + "learning_rate": 5.8605305197361705e-05, + "loss": 1.4615, + "step": 18060 + }, + { + "epoch": 0.6468028721327914, + "grad_norm": 1.3957847356796265, + "learning_rate": 5.859474688268051e-05, + "loss": 1.4643, + "step": 18061 + }, + { + "epoch": 0.6468386842623597, + "grad_norm": 1.881250262260437, + "learning_rate": 5.8584189125053556e-05, + "loss": 1.2637, + "step": 18062 + }, + { + "epoch": 0.646874496391928, + "grad_norm": 1.5098015069961548, + "learning_rate": 5.857363192462294e-05, + "loss": 1.5307, + "step": 18063 + }, + { + "epoch": 0.6469103085214962, + "grad_norm": 1.6502503156661987, + "learning_rate": 5.8563075281530685e-05, + "loss": 1.1335, + "step": 18064 + }, + { + "epoch": 0.6469461206510645, + "grad_norm": 1.6741015911102295, + "learning_rate": 5.855251919591875e-05, + "loss": 1.5119, + "step": 18065 + }, + { + "epoch": 0.6469819327806328, + "grad_norm": 1.6774035692214966, + "learning_rate": 5.8541963667929276e-05, + "loss": 1.4242, + "step": 18066 + }, + { + "epoch": 0.647017744910201, + "grad_norm": 1.69266939163208, + "learning_rate": 5.8531408697704124e-05, + "loss": 1.2887, + "step": 18067 + }, + { + "epoch": 0.6470535570397694, + "grad_norm": 1.6085079908370972, + "learning_rate": 5.852085428538545e-05, + "loss": 1.5291, + "step": 18068 + }, + { + "epoch": 0.6470893691693377, + "grad_norm": 1.8055675029754639, + "learning_rate": 5.851030043111512e-05, + "loss": 1.6574, + "step": 18069 + }, + { + "epoch": 0.647125181298906, + "grad_norm": 1.7611117362976074, + "learning_rate": 5.849974713503521e-05, + "loss": 1.5388, + "step": 18070 + }, + { + "epoch": 0.6471609934284742, + "grad_norm": 1.4818743467330933, + "learning_rate": 5.848919439728765e-05, + "loss": 1.2733, + "step": 18071 + }, + { + "epoch": 0.6471968055580425, + "grad_norm": 1.5272941589355469, + "learning_rate": 5.847864221801446e-05, + "loss": 1.5622, + "step": 18072 + }, + { + "epoch": 0.6472326176876108, + "grad_norm": 1.7738280296325684, + "learning_rate": 5.8468090597357595e-05, + "loss": 1.6174, + "step": 18073 + }, + { + "epoch": 0.647268429817179, + "grad_norm": 2.0849111080169678, + "learning_rate": 5.845753953545894e-05, + "loss": 1.4549, + "step": 18074 + }, + { + "epoch": 0.6473042419467474, + "grad_norm": 2.5184481143951416, + "learning_rate": 5.8446989032460574e-05, + "loss": 1.7102, + "step": 18075 + }, + { + "epoch": 0.6473400540763157, + "grad_norm": 1.650846004486084, + "learning_rate": 5.84364390885043e-05, + "loss": 1.2554, + "step": 18076 + }, + { + "epoch": 0.6473758662058839, + "grad_norm": 1.4134989976882935, + "learning_rate": 5.8425889703732193e-05, + "loss": 1.3435, + "step": 18077 + }, + { + "epoch": 0.6474116783354522, + "grad_norm": 1.4370696544647217, + "learning_rate": 5.841534087828604e-05, + "loss": 1.2767, + "step": 18078 + }, + { + "epoch": 0.6474474904650205, + "grad_norm": 1.943215012550354, + "learning_rate": 5.840479261230791e-05, + "loss": 1.6713, + "step": 18079 + }, + { + "epoch": 0.6474833025945887, + "grad_norm": 1.4532264471054077, + "learning_rate": 5.839424490593957e-05, + "loss": 1.5316, + "step": 18080 + }, + { + "epoch": 0.647519114724157, + "grad_norm": 2.2967021465301514, + "learning_rate": 5.8383697759323045e-05, + "loss": 1.1804, + "step": 18081 + }, + { + "epoch": 0.6475549268537254, + "grad_norm": 1.5080279111862183, + "learning_rate": 5.8373151172600207e-05, + "loss": 1.2792, + "step": 18082 + }, + { + "epoch": 0.6475907389832937, + "grad_norm": 1.3563507795333862, + "learning_rate": 5.836260514591287e-05, + "loss": 1.6779, + "step": 18083 + }, + { + "epoch": 0.6476265511128619, + "grad_norm": 1.6162803173065186, + "learning_rate": 5.8352059679402994e-05, + "loss": 1.4045, + "step": 18084 + }, + { + "epoch": 0.6476623632424302, + "grad_norm": 1.5351731777191162, + "learning_rate": 5.834151477321242e-05, + "loss": 1.2639, + "step": 18085 + }, + { + "epoch": 0.6476981753719985, + "grad_norm": 1.8498332500457764, + "learning_rate": 5.833097042748308e-05, + "loss": 1.5596, + "step": 18086 + }, + { + "epoch": 0.6477339875015667, + "grad_norm": 1.558227777481079, + "learning_rate": 5.832042664235673e-05, + "loss": 1.3214, + "step": 18087 + }, + { + "epoch": 0.647769799631135, + "grad_norm": 1.8643543720245361, + "learning_rate": 5.8309883417975275e-05, + "loss": 1.2727, + "step": 18088 + }, + { + "epoch": 0.6478056117607034, + "grad_norm": 1.2156808376312256, + "learning_rate": 5.829934075448058e-05, + "loss": 1.1644, + "step": 18089 + }, + { + "epoch": 0.6478414238902717, + "grad_norm": 1.5507395267486572, + "learning_rate": 5.8288798652014485e-05, + "loss": 1.4555, + "step": 18090 + }, + { + "epoch": 0.6478772360198399, + "grad_norm": 1.6052091121673584, + "learning_rate": 5.827825711071877e-05, + "loss": 1.2833, + "step": 18091 + }, + { + "epoch": 0.6479130481494082, + "grad_norm": 1.7828127145767212, + "learning_rate": 5.8267716130735295e-05, + "loss": 1.452, + "step": 18092 + }, + { + "epoch": 0.6479488602789765, + "grad_norm": 2.0876612663269043, + "learning_rate": 5.82571757122059e-05, + "loss": 1.4555, + "step": 18093 + }, + { + "epoch": 0.6479846724085447, + "grad_norm": 1.3170816898345947, + "learning_rate": 5.824663585527232e-05, + "loss": 1.533, + "step": 18094 + }, + { + "epoch": 0.648020484538113, + "grad_norm": 1.6874192953109741, + "learning_rate": 5.8236096560076405e-05, + "loss": 1.3156, + "step": 18095 + }, + { + "epoch": 0.6480562966676814, + "grad_norm": 1.6135790348052979, + "learning_rate": 5.8225557826759935e-05, + "loss": 1.5129, + "step": 18096 + }, + { + "epoch": 0.6480921087972497, + "grad_norm": 1.4755977392196655, + "learning_rate": 5.821501965546474e-05, + "loss": 1.4064, + "step": 18097 + }, + { + "epoch": 0.6481279209268179, + "grad_norm": 2.032099723815918, + "learning_rate": 5.820448204633251e-05, + "loss": 1.5011, + "step": 18098 + }, + { + "epoch": 0.6481637330563862, + "grad_norm": 1.5562313795089722, + "learning_rate": 5.819394499950508e-05, + "loss": 1.4597, + "step": 18099 + }, + { + "epoch": 0.6481995451859545, + "grad_norm": 1.6698648929595947, + "learning_rate": 5.8183408515124216e-05, + "loss": 1.4674, + "step": 18100 + }, + { + "epoch": 0.6482353573155227, + "grad_norm": 1.8499637842178345, + "learning_rate": 5.817287259333162e-05, + "loss": 1.574, + "step": 18101 + }, + { + "epoch": 0.648271169445091, + "grad_norm": 2.0695765018463135, + "learning_rate": 5.816233723426907e-05, + "loss": 1.3691, + "step": 18102 + }, + { + "epoch": 0.6483069815746594, + "grad_norm": 1.5621305704116821, + "learning_rate": 5.81518024380783e-05, + "loss": 1.9263, + "step": 18103 + }, + { + "epoch": 0.6483427937042276, + "grad_norm": 1.3065502643585205, + "learning_rate": 5.814126820490109e-05, + "loss": 1.5336, + "step": 18104 + }, + { + "epoch": 0.6483786058337959, + "grad_norm": 1.9310072660446167, + "learning_rate": 5.8130734534879075e-05, + "loss": 1.8141, + "step": 18105 + }, + { + "epoch": 0.6484144179633642, + "grad_norm": 2.5569870471954346, + "learning_rate": 5.812020142815403e-05, + "loss": 1.4777, + "step": 18106 + }, + { + "epoch": 0.6484502300929325, + "grad_norm": 1.4359135627746582, + "learning_rate": 5.810966888486768e-05, + "loss": 1.0495, + "step": 18107 + }, + { + "epoch": 0.6484860422225007, + "grad_norm": 1.7345796823501587, + "learning_rate": 5.809913690516169e-05, + "loss": 1.1905, + "step": 18108 + }, + { + "epoch": 0.648521854352069, + "grad_norm": 1.5571644306182861, + "learning_rate": 5.808860548917778e-05, + "loss": 1.4347, + "step": 18109 + }, + { + "epoch": 0.6485576664816374, + "grad_norm": 2.2641749382019043, + "learning_rate": 5.807807463705754e-05, + "loss": 1.3108, + "step": 18110 + }, + { + "epoch": 0.6485934786112056, + "grad_norm": 1.5269492864608765, + "learning_rate": 5.8067544348942825e-05, + "loss": 1.6782, + "step": 18111 + }, + { + "epoch": 0.6486292907407739, + "grad_norm": 1.2696325778961182, + "learning_rate": 5.805701462497517e-05, + "loss": 1.6576, + "step": 18112 + }, + { + "epoch": 0.6486651028703422, + "grad_norm": 2.062124252319336, + "learning_rate": 5.804648546529627e-05, + "loss": 1.7734, + "step": 18113 + }, + { + "epoch": 0.6487009149999104, + "grad_norm": 1.5553981065750122, + "learning_rate": 5.803595687004779e-05, + "loss": 1.5541, + "step": 18114 + }, + { + "epoch": 0.6487367271294787, + "grad_norm": 2.4918081760406494, + "learning_rate": 5.802542883937143e-05, + "loss": 1.7596, + "step": 18115 + }, + { + "epoch": 0.648772539259047, + "grad_norm": 2.098888635635376, + "learning_rate": 5.801490137340879e-05, + "loss": 1.3388, + "step": 18116 + }, + { + "epoch": 0.6488083513886154, + "grad_norm": 1.448175311088562, + "learning_rate": 5.80043744723014e-05, + "loss": 1.3633, + "step": 18117 + }, + { + "epoch": 0.6488441635181836, + "grad_norm": 1.5029058456420898, + "learning_rate": 5.7993848136191065e-05, + "loss": 1.5644, + "step": 18118 + }, + { + "epoch": 0.6488799756477519, + "grad_norm": 2.209278106689453, + "learning_rate": 5.7983322365219287e-05, + "loss": 1.5103, + "step": 18119 + }, + { + "epoch": 0.6489157877773202, + "grad_norm": 1.7898609638214111, + "learning_rate": 5.797279715952774e-05, + "loss": 1.6017, + "step": 18120 + }, + { + "epoch": 0.6489515999068884, + "grad_norm": 2.122300624847412, + "learning_rate": 5.796227251925792e-05, + "loss": 1.7573, + "step": 18121 + }, + { + "epoch": 0.6489874120364567, + "grad_norm": 1.6340723037719727, + "learning_rate": 5.795174844455157e-05, + "loss": 1.5623, + "step": 18122 + }, + { + "epoch": 0.649023224166025, + "grad_norm": 1.4408314228057861, + "learning_rate": 5.7941224935550166e-05, + "loss": 1.5066, + "step": 18123 + }, + { + "epoch": 0.6490590362955934, + "grad_norm": 1.8224083185195923, + "learning_rate": 5.793070199239534e-05, + "loss": 1.2898, + "step": 18124 + }, + { + "epoch": 0.6490948484251616, + "grad_norm": 1.6745611429214478, + "learning_rate": 5.7920179615228684e-05, + "loss": 1.5694, + "step": 18125 + }, + { + "epoch": 0.6491306605547299, + "grad_norm": 1.6290042400360107, + "learning_rate": 5.790965780419171e-05, + "loss": 1.2513, + "step": 18126 + }, + { + "epoch": 0.6491664726842982, + "grad_norm": 1.885898232460022, + "learning_rate": 5.7899136559426015e-05, + "loss": 1.334, + "step": 18127 + }, + { + "epoch": 0.6492022848138664, + "grad_norm": 1.878179669380188, + "learning_rate": 5.788861588107306e-05, + "loss": 1.6988, + "step": 18128 + }, + { + "epoch": 0.6492380969434347, + "grad_norm": 2.0154190063476562, + "learning_rate": 5.787809576927454e-05, + "loss": 1.3509, + "step": 18129 + }, + { + "epoch": 0.649273909073003, + "grad_norm": 1.7065162658691406, + "learning_rate": 5.786757622417187e-05, + "loss": 1.6076, + "step": 18130 + }, + { + "epoch": 0.6493097212025714, + "grad_norm": 1.7891151905059814, + "learning_rate": 5.7857057245906656e-05, + "loss": 1.5914, + "step": 18131 + }, + { + "epoch": 0.6493455333321396, + "grad_norm": 1.5112416744232178, + "learning_rate": 5.784653883462029e-05, + "loss": 1.3508, + "step": 18132 + }, + { + "epoch": 0.6493813454617079, + "grad_norm": 1.3441530466079712, + "learning_rate": 5.7836020990454444e-05, + "loss": 1.4401, + "step": 18133 + }, + { + "epoch": 0.6494171575912762, + "grad_norm": 1.4599716663360596, + "learning_rate": 5.7825503713550555e-05, + "loss": 1.3342, + "step": 18134 + }, + { + "epoch": 0.6494529697208444, + "grad_norm": 1.3666131496429443, + "learning_rate": 5.7814987004050084e-05, + "loss": 1.2142, + "step": 18135 + }, + { + "epoch": 0.6494887818504127, + "grad_norm": 1.5471409559249878, + "learning_rate": 5.780447086209453e-05, + "loss": 1.1893, + "step": 18136 + }, + { + "epoch": 0.649524593979981, + "grad_norm": 1.7935959100723267, + "learning_rate": 5.779395528782541e-05, + "loss": 1.6171, + "step": 18137 + }, + { + "epoch": 0.6495604061095493, + "grad_norm": 2.097639799118042, + "learning_rate": 5.7783440281384205e-05, + "loss": 1.445, + "step": 18138 + }, + { + "epoch": 0.6495962182391176, + "grad_norm": 1.3056614398956299, + "learning_rate": 5.777292584291227e-05, + "loss": 1.4843, + "step": 18139 + }, + { + "epoch": 0.6496320303686859, + "grad_norm": 1.3585635423660278, + "learning_rate": 5.7762411972551254e-05, + "loss": 1.2193, + "step": 18140 + }, + { + "epoch": 0.6496678424982542, + "grad_norm": 1.2631720304489136, + "learning_rate": 5.775189867044244e-05, + "loss": 1.4512, + "step": 18141 + }, + { + "epoch": 0.6497036546278224, + "grad_norm": 1.6314293146133423, + "learning_rate": 5.7741385936727375e-05, + "loss": 1.6188, + "step": 18142 + }, + { + "epoch": 0.6497394667573907, + "grad_norm": 1.9369200468063354, + "learning_rate": 5.7730873771547423e-05, + "loss": 1.5405, + "step": 18143 + }, + { + "epoch": 0.649775278886959, + "grad_norm": 1.9945943355560303, + "learning_rate": 5.772036217504404e-05, + "loss": 1.4426, + "step": 18144 + }, + { + "epoch": 0.6498110910165273, + "grad_norm": 1.2491214275360107, + "learning_rate": 5.770985114735868e-05, + "loss": 1.3435, + "step": 18145 + }, + { + "epoch": 0.6498469031460956, + "grad_norm": 1.999806523323059, + "learning_rate": 5.76993406886327e-05, + "loss": 1.4863, + "step": 18146 + }, + { + "epoch": 0.6498827152756639, + "grad_norm": 1.5527349710464478, + "learning_rate": 5.768883079900751e-05, + "loss": 1.517, + "step": 18147 + }, + { + "epoch": 0.6499185274052321, + "grad_norm": 1.8263483047485352, + "learning_rate": 5.767832147862452e-05, + "loss": 1.562, + "step": 18148 + }, + { + "epoch": 0.6499543395348004, + "grad_norm": 1.532959222793579, + "learning_rate": 5.7667812727625184e-05, + "loss": 1.107, + "step": 18149 + }, + { + "epoch": 0.6499901516643687, + "grad_norm": 1.593616008758545, + "learning_rate": 5.765730454615072e-05, + "loss": 1.4869, + "step": 18150 + }, + { + "epoch": 0.650025963793937, + "grad_norm": 1.7059237957000732, + "learning_rate": 5.764679693434269e-05, + "loss": 1.5631, + "step": 18151 + }, + { + "epoch": 0.6500617759235053, + "grad_norm": 2.210345506668091, + "learning_rate": 5.763628989234238e-05, + "loss": 1.4786, + "step": 18152 + }, + { + "epoch": 0.6500975880530736, + "grad_norm": 1.792277455329895, + "learning_rate": 5.76257834202911e-05, + "loss": 1.3773, + "step": 18153 + }, + { + "epoch": 0.6501334001826419, + "grad_norm": 1.5056220293045044, + "learning_rate": 5.761527751833026e-05, + "loss": 1.4831, + "step": 18154 + }, + { + "epoch": 0.6501692123122101, + "grad_norm": 2.4533936977386475, + "learning_rate": 5.760477218660119e-05, + "loss": 1.5878, + "step": 18155 + }, + { + "epoch": 0.6502050244417784, + "grad_norm": 1.424676537513733, + "learning_rate": 5.759426742524524e-05, + "loss": 1.2297, + "step": 18156 + }, + { + "epoch": 0.6502408365713467, + "grad_norm": 1.37893807888031, + "learning_rate": 5.75837632344037e-05, + "loss": 1.6512, + "step": 18157 + }, + { + "epoch": 0.650276648700915, + "grad_norm": 1.7505850791931152, + "learning_rate": 5.757325961421791e-05, + "loss": 1.4344, + "step": 18158 + }, + { + "epoch": 0.6503124608304833, + "grad_norm": 1.5535842180252075, + "learning_rate": 5.756275656482918e-05, + "loss": 1.3422, + "step": 18159 + }, + { + "epoch": 0.6503482729600516, + "grad_norm": 1.5393567085266113, + "learning_rate": 5.7552254086378863e-05, + "loss": 1.4882, + "step": 18160 + }, + { + "epoch": 0.6503840850896199, + "grad_norm": 1.7456070184707642, + "learning_rate": 5.754175217900817e-05, + "loss": 1.766, + "step": 18161 + }, + { + "epoch": 0.6504198972191881, + "grad_norm": 2.0329902172088623, + "learning_rate": 5.753125084285844e-05, + "loss": 1.5138, + "step": 18162 + }, + { + "epoch": 0.6504557093487564, + "grad_norm": 1.3546897172927856, + "learning_rate": 5.752075007807098e-05, + "loss": 1.5011, + "step": 18163 + }, + { + "epoch": 0.6504915214783247, + "grad_norm": 1.7984650135040283, + "learning_rate": 5.751024988478701e-05, + "loss": 1.5396, + "step": 18164 + }, + { + "epoch": 0.6505273336078929, + "grad_norm": 1.3731608390808105, + "learning_rate": 5.749975026314781e-05, + "loss": 1.4057, + "step": 18165 + }, + { + "epoch": 0.6505631457374613, + "grad_norm": 2.18913197517395, + "learning_rate": 5.748925121329465e-05, + "loss": 1.5947, + "step": 18166 + }, + { + "epoch": 0.6505989578670296, + "grad_norm": 1.4021323919296265, + "learning_rate": 5.747875273536882e-05, + "loss": 1.316, + "step": 18167 + }, + { + "epoch": 0.6506347699965979, + "grad_norm": 1.6417192220687866, + "learning_rate": 5.746825482951148e-05, + "loss": 1.4414, + "step": 18168 + }, + { + "epoch": 0.6506705821261661, + "grad_norm": 2.237081289291382, + "learning_rate": 5.7457757495863916e-05, + "loss": 1.2582, + "step": 18169 + }, + { + "epoch": 0.6507063942557344, + "grad_norm": 1.3216358423233032, + "learning_rate": 5.744726073456739e-05, + "loss": 1.7212, + "step": 18170 + }, + { + "epoch": 0.6507422063853027, + "grad_norm": 1.4945186376571655, + "learning_rate": 5.7436764545763034e-05, + "loss": 1.6712, + "step": 18171 + }, + { + "epoch": 0.6507780185148709, + "grad_norm": 2.1347923278808594, + "learning_rate": 5.7426268929592105e-05, + "loss": 1.6936, + "step": 18172 + }, + { + "epoch": 0.6508138306444393, + "grad_norm": 1.752018690109253, + "learning_rate": 5.7415773886195834e-05, + "loss": 1.61, + "step": 18173 + }, + { + "epoch": 0.6508496427740076, + "grad_norm": 1.5680664777755737, + "learning_rate": 5.740527941571541e-05, + "loss": 1.3054, + "step": 18174 + }, + { + "epoch": 0.6508854549035759, + "grad_norm": 1.7406561374664307, + "learning_rate": 5.739478551829198e-05, + "loss": 1.3754, + "step": 18175 + }, + { + "epoch": 0.6509212670331441, + "grad_norm": 1.6612472534179688, + "learning_rate": 5.738429219406676e-05, + "loss": 1.2253, + "step": 18176 + }, + { + "epoch": 0.6509570791627124, + "grad_norm": 1.6799381971359253, + "learning_rate": 5.7373799443180906e-05, + "loss": 1.5481, + "step": 18177 + }, + { + "epoch": 0.6509928912922807, + "grad_norm": 2.118349313735962, + "learning_rate": 5.7363307265775635e-05, + "loss": 1.381, + "step": 18178 + }, + { + "epoch": 0.6510287034218489, + "grad_norm": 1.8382169008255005, + "learning_rate": 5.7352815661992046e-05, + "loss": 1.4768, + "step": 18179 + }, + { + "epoch": 0.6510645155514173, + "grad_norm": 1.4919451475143433, + "learning_rate": 5.734232463197129e-05, + "loss": 1.1821, + "step": 18180 + }, + { + "epoch": 0.6511003276809856, + "grad_norm": 1.5245254039764404, + "learning_rate": 5.7331834175854596e-05, + "loss": 1.4771, + "step": 18181 + }, + { + "epoch": 0.6511361398105538, + "grad_norm": 1.4875792264938354, + "learning_rate": 5.732134429378297e-05, + "loss": 1.3327, + "step": 18182 + }, + { + "epoch": 0.6511719519401221, + "grad_norm": 2.057316303253174, + "learning_rate": 5.731085498589761e-05, + "loss": 1.2137, + "step": 18183 + }, + { + "epoch": 0.6512077640696904, + "grad_norm": 1.833107352256775, + "learning_rate": 5.730036625233963e-05, + "loss": 2.0461, + "step": 18184 + }, + { + "epoch": 0.6512435761992587, + "grad_norm": 1.508131742477417, + "learning_rate": 5.728987809325019e-05, + "loss": 1.6103, + "step": 18185 + }, + { + "epoch": 0.6512793883288269, + "grad_norm": 1.5430577993392944, + "learning_rate": 5.727939050877031e-05, + "loss": 1.5925, + "step": 18186 + }, + { + "epoch": 0.6513152004583953, + "grad_norm": 2.0990476608276367, + "learning_rate": 5.726890349904113e-05, + "loss": 1.5422, + "step": 18187 + }, + { + "epoch": 0.6513510125879636, + "grad_norm": 2.0384583473205566, + "learning_rate": 5.725841706420376e-05, + "loss": 1.45, + "step": 18188 + }, + { + "epoch": 0.6513868247175318, + "grad_norm": 1.6039574146270752, + "learning_rate": 5.724793120439923e-05, + "loss": 1.7583, + "step": 18189 + }, + { + "epoch": 0.6514226368471001, + "grad_norm": 1.6907927989959717, + "learning_rate": 5.723744591976863e-05, + "loss": 1.2868, + "step": 18190 + }, + { + "epoch": 0.6514584489766684, + "grad_norm": 1.6348273754119873, + "learning_rate": 5.722696121045303e-05, + "loss": 1.482, + "step": 18191 + }, + { + "epoch": 0.6514942611062366, + "grad_norm": 1.6397366523742676, + "learning_rate": 5.7216477076593544e-05, + "loss": 1.4915, + "step": 18192 + }, + { + "epoch": 0.6515300732358049, + "grad_norm": 1.57917058467865, + "learning_rate": 5.7205993518331134e-05, + "loss": 1.2427, + "step": 18193 + }, + { + "epoch": 0.6515658853653733, + "grad_norm": 1.8741562366485596, + "learning_rate": 5.719551053580687e-05, + "loss": 1.4965, + "step": 18194 + }, + { + "epoch": 0.6516016974949416, + "grad_norm": 1.567555546760559, + "learning_rate": 5.718502812916186e-05, + "loss": 1.158, + "step": 18195 + }, + { + "epoch": 0.6516375096245098, + "grad_norm": 1.4905515909194946, + "learning_rate": 5.7174546298537005e-05, + "loss": 1.5231, + "step": 18196 + }, + { + "epoch": 0.6516733217540781, + "grad_norm": 1.853344202041626, + "learning_rate": 5.71640650440734e-05, + "loss": 1.7658, + "step": 18197 + }, + { + "epoch": 0.6517091338836464, + "grad_norm": 2.142179250717163, + "learning_rate": 5.715358436591205e-05, + "loss": 1.9089, + "step": 18198 + }, + { + "epoch": 0.6517449460132146, + "grad_norm": 2.0253007411956787, + "learning_rate": 5.7143104264193984e-05, + "loss": 1.3156, + "step": 18199 + }, + { + "epoch": 0.6517807581427829, + "grad_norm": 1.5804954767227173, + "learning_rate": 5.7132624739060134e-05, + "loss": 1.1425, + "step": 18200 + }, + { + "epoch": 0.6518165702723513, + "grad_norm": 1.8263784646987915, + "learning_rate": 5.712214579065152e-05, + "loss": 1.3975, + "step": 18201 + }, + { + "epoch": 0.6518523824019196, + "grad_norm": 1.8882803916931152, + "learning_rate": 5.711166741910912e-05, + "loss": 1.453, + "step": 18202 + }, + { + "epoch": 0.6518881945314878, + "grad_norm": 1.7997485399246216, + "learning_rate": 5.710118962457396e-05, + "loss": 1.3057, + "step": 18203 + }, + { + "epoch": 0.6519240066610561, + "grad_norm": 1.5908443927764893, + "learning_rate": 5.709071240718695e-05, + "loss": 1.2405, + "step": 18204 + }, + { + "epoch": 0.6519598187906244, + "grad_norm": 1.749822974205017, + "learning_rate": 5.7080235767088994e-05, + "loss": 1.4987, + "step": 18205 + }, + { + "epoch": 0.6519956309201926, + "grad_norm": 1.3174817562103271, + "learning_rate": 5.706975970442117e-05, + "loss": 1.3869, + "step": 18206 + }, + { + "epoch": 0.6520314430497609, + "grad_norm": 1.5723850727081299, + "learning_rate": 5.7059284219324315e-05, + "loss": 1.6457, + "step": 18207 + }, + { + "epoch": 0.6520672551793293, + "grad_norm": 1.735958218574524, + "learning_rate": 5.7048809311939446e-05, + "loss": 1.3457, + "step": 18208 + }, + { + "epoch": 0.6521030673088976, + "grad_norm": 1.7627031803131104, + "learning_rate": 5.703833498240736e-05, + "loss": 1.6223, + "step": 18209 + }, + { + "epoch": 0.6521388794384658, + "grad_norm": 2.1572265625, + "learning_rate": 5.702786123086914e-05, + "loss": 1.2937, + "step": 18210 + }, + { + "epoch": 0.6521746915680341, + "grad_norm": 1.6451489925384521, + "learning_rate": 5.701738805746558e-05, + "loss": 1.5857, + "step": 18211 + }, + { + "epoch": 0.6522105036976024, + "grad_norm": 1.494231104850769, + "learning_rate": 5.700691546233762e-05, + "loss": 1.493, + "step": 18212 + }, + { + "epoch": 0.6522463158271706, + "grad_norm": 1.4229929447174072, + "learning_rate": 5.699644344562619e-05, + "loss": 1.355, + "step": 18213 + }, + { + "epoch": 0.6522821279567389, + "grad_norm": 1.5671422481536865, + "learning_rate": 5.698597200747211e-05, + "loss": 1.2931, + "step": 18214 + }, + { + "epoch": 0.6523179400863073, + "grad_norm": 1.9639018774032593, + "learning_rate": 5.697550114801633e-05, + "loss": 1.4782, + "step": 18215 + }, + { + "epoch": 0.6523537522158755, + "grad_norm": 1.4121224880218506, + "learning_rate": 5.696503086739961e-05, + "loss": 1.2247, + "step": 18216 + }, + { + "epoch": 0.6523895643454438, + "grad_norm": 2.129319429397583, + "learning_rate": 5.695456116576296e-05, + "loss": 1.4432, + "step": 18217 + }, + { + "epoch": 0.6524253764750121, + "grad_norm": 1.39368736743927, + "learning_rate": 5.6944092043247124e-05, + "loss": 1.3975, + "step": 18218 + }, + { + "epoch": 0.6524611886045804, + "grad_norm": 1.3888850212097168, + "learning_rate": 5.693362349999303e-05, + "loss": 1.5083, + "step": 18219 + }, + { + "epoch": 0.6524970007341486, + "grad_norm": 1.4592702388763428, + "learning_rate": 5.6923155536141404e-05, + "loss": 1.449, + "step": 18220 + }, + { + "epoch": 0.6525328128637169, + "grad_norm": 2.164543628692627, + "learning_rate": 5.691268815183324e-05, + "loss": 1.2571, + "step": 18221 + }, + { + "epoch": 0.6525686249932853, + "grad_norm": 2.2034432888031006, + "learning_rate": 5.690222134720927e-05, + "loss": 1.562, + "step": 18222 + }, + { + "epoch": 0.6526044371228535, + "grad_norm": 1.5978227853775024, + "learning_rate": 5.6891755122410254e-05, + "loss": 1.0764, + "step": 18223 + }, + { + "epoch": 0.6526402492524218, + "grad_norm": 1.8049519062042236, + "learning_rate": 5.688128947757713e-05, + "loss": 1.4481, + "step": 18224 + }, + { + "epoch": 0.6526760613819901, + "grad_norm": 1.7930026054382324, + "learning_rate": 5.687082441285061e-05, + "loss": 1.5977, + "step": 18225 + }, + { + "epoch": 0.6527118735115583, + "grad_norm": 1.4128412008285522, + "learning_rate": 5.6860359928371546e-05, + "loss": 1.7345, + "step": 18226 + }, + { + "epoch": 0.6527476856411266, + "grad_norm": 1.6190688610076904, + "learning_rate": 5.6849896024280614e-05, + "loss": 1.2488, + "step": 18227 + }, + { + "epoch": 0.6527834977706949, + "grad_norm": 2.397874593734741, + "learning_rate": 5.6839432700718743e-05, + "loss": 1.2673, + "step": 18228 + }, + { + "epoch": 0.6528193099002633, + "grad_norm": 2.7797553539276123, + "learning_rate": 5.682896995782661e-05, + "loss": 1.5766, + "step": 18229 + }, + { + "epoch": 0.6528551220298315, + "grad_norm": 2.0291695594787598, + "learning_rate": 5.6818507795745025e-05, + "loss": 1.2149, + "step": 18230 + }, + { + "epoch": 0.6528909341593998, + "grad_norm": 1.5554500818252563, + "learning_rate": 5.6808046214614684e-05, + "loss": 1.534, + "step": 18231 + }, + { + "epoch": 0.6529267462889681, + "grad_norm": 1.7645570039749146, + "learning_rate": 5.679758521457637e-05, + "loss": 1.5774, + "step": 18232 + }, + { + "epoch": 0.6529625584185363, + "grad_norm": 1.3924230337142944, + "learning_rate": 5.678712479577086e-05, + "loss": 1.1851, + "step": 18233 + }, + { + "epoch": 0.6529983705481046, + "grad_norm": 1.7153980731964111, + "learning_rate": 5.67766649583388e-05, + "loss": 1.3035, + "step": 18234 + }, + { + "epoch": 0.6530341826776729, + "grad_norm": 1.4884206056594849, + "learning_rate": 5.676620570242097e-05, + "loss": 1.3867, + "step": 18235 + }, + { + "epoch": 0.6530699948072413, + "grad_norm": 2.1972150802612305, + "learning_rate": 5.675574702815807e-05, + "loss": 1.7816, + "step": 18236 + }, + { + "epoch": 0.6531058069368095, + "grad_norm": 2.568058967590332, + "learning_rate": 5.674528893569084e-05, + "loss": 1.2583, + "step": 18237 + }, + { + "epoch": 0.6531416190663778, + "grad_norm": 1.8737038373947144, + "learning_rate": 5.673483142515988e-05, + "loss": 1.2972, + "step": 18238 + }, + { + "epoch": 0.6531774311959461, + "grad_norm": 1.451999545097351, + "learning_rate": 5.672437449670605e-05, + "loss": 1.5523, + "step": 18239 + }, + { + "epoch": 0.6532132433255143, + "grad_norm": 1.7182345390319824, + "learning_rate": 5.6713918150469916e-05, + "loss": 1.3202, + "step": 18240 + }, + { + "epoch": 0.6532490554550826, + "grad_norm": 2.7142200469970703, + "learning_rate": 5.6703462386592145e-05, + "loss": 1.6818, + "step": 18241 + }, + { + "epoch": 0.6532848675846509, + "grad_norm": 1.4753506183624268, + "learning_rate": 5.6693007205213444e-05, + "loss": 1.4071, + "step": 18242 + }, + { + "epoch": 0.6533206797142193, + "grad_norm": 1.5137279033660889, + "learning_rate": 5.668255260647447e-05, + "loss": 1.6646, + "step": 18243 + }, + { + "epoch": 0.6533564918437875, + "grad_norm": 1.8348652124404907, + "learning_rate": 5.667209859051592e-05, + "loss": 1.3507, + "step": 18244 + }, + { + "epoch": 0.6533923039733558, + "grad_norm": 1.683578610420227, + "learning_rate": 5.6661645157478336e-05, + "loss": 1.1708, + "step": 18245 + }, + { + "epoch": 0.6534281161029241, + "grad_norm": 1.2713277339935303, + "learning_rate": 5.665119230750243e-05, + "loss": 1.412, + "step": 18246 + }, + { + "epoch": 0.6534639282324923, + "grad_norm": 1.712204098701477, + "learning_rate": 5.664074004072881e-05, + "loss": 1.2872, + "step": 18247 + }, + { + "epoch": 0.6534997403620606, + "grad_norm": 1.6084846258163452, + "learning_rate": 5.663028835729815e-05, + "loss": 1.4954, + "step": 18248 + }, + { + "epoch": 0.6535355524916289, + "grad_norm": 1.7049838304519653, + "learning_rate": 5.661983725735096e-05, + "loss": 1.5376, + "step": 18249 + }, + { + "epoch": 0.6535713646211972, + "grad_norm": 2.174365997314453, + "learning_rate": 5.6609386741027915e-05, + "loss": 1.7882, + "step": 18250 + }, + { + "epoch": 0.6536071767507655, + "grad_norm": 1.3150783777236938, + "learning_rate": 5.659893680846965e-05, + "loss": 1.4351, + "step": 18251 + }, + { + "epoch": 0.6536429888803338, + "grad_norm": 1.477768063545227, + "learning_rate": 5.658848745981667e-05, + "loss": 1.6592, + "step": 18252 + }, + { + "epoch": 0.653678801009902, + "grad_norm": 1.5167200565338135, + "learning_rate": 5.6578038695209566e-05, + "loss": 1.3087, + "step": 18253 + }, + { + "epoch": 0.6537146131394703, + "grad_norm": 1.7359503507614136, + "learning_rate": 5.656759051478897e-05, + "loss": 1.5542, + "step": 18254 + }, + { + "epoch": 0.6537504252690386, + "grad_norm": 1.6979146003723145, + "learning_rate": 5.655714291869544e-05, + "loss": 1.6755, + "step": 18255 + }, + { + "epoch": 0.6537862373986069, + "grad_norm": 1.4741054773330688, + "learning_rate": 5.654669590706948e-05, + "loss": 1.3516, + "step": 18256 + }, + { + "epoch": 0.6538220495281752, + "grad_norm": 1.841018795967102, + "learning_rate": 5.653624948005167e-05, + "loss": 1.3925, + "step": 18257 + }, + { + "epoch": 0.6538578616577435, + "grad_norm": 1.7704675197601318, + "learning_rate": 5.6525803637782614e-05, + "loss": 1.8108, + "step": 18258 + }, + { + "epoch": 0.6538936737873118, + "grad_norm": 1.8896334171295166, + "learning_rate": 5.651535838040275e-05, + "loss": 1.6515, + "step": 18259 + }, + { + "epoch": 0.65392948591688, + "grad_norm": 1.491990327835083, + "learning_rate": 5.6504913708052646e-05, + "loss": 1.5978, + "step": 18260 + }, + { + "epoch": 0.6539652980464483, + "grad_norm": 1.973260521888733, + "learning_rate": 5.6494469620872814e-05, + "loss": 1.4268, + "step": 18261 + }, + { + "epoch": 0.6540011101760166, + "grad_norm": 1.601784348487854, + "learning_rate": 5.648402611900383e-05, + "loss": 1.2062, + "step": 18262 + }, + { + "epoch": 0.6540369223055849, + "grad_norm": 1.3702232837677002, + "learning_rate": 5.647358320258609e-05, + "loss": 1.3752, + "step": 18263 + }, + { + "epoch": 0.6540727344351532, + "grad_norm": 1.3767213821411133, + "learning_rate": 5.6463140871760144e-05, + "loss": 1.5021, + "step": 18264 + }, + { + "epoch": 0.6541085465647215, + "grad_norm": 1.3063963651657104, + "learning_rate": 5.6452699126666486e-05, + "loss": 1.5594, + "step": 18265 + }, + { + "epoch": 0.6541443586942898, + "grad_norm": 1.5058377981185913, + "learning_rate": 5.644225796744562e-05, + "loss": 1.166, + "step": 18266 + }, + { + "epoch": 0.654180170823858, + "grad_norm": 1.676900863647461, + "learning_rate": 5.6431817394237964e-05, + "loss": 1.589, + "step": 18267 + }, + { + "epoch": 0.6542159829534263, + "grad_norm": 1.7111408710479736, + "learning_rate": 5.6421377407183997e-05, + "loss": 1.5941, + "step": 18268 + }, + { + "epoch": 0.6542517950829946, + "grad_norm": 1.4880335330963135, + "learning_rate": 5.641093800642423e-05, + "loss": 1.2594, + "step": 18269 + }, + { + "epoch": 0.6542876072125628, + "grad_norm": 2.1994059085845947, + "learning_rate": 5.640049919209902e-05, + "loss": 1.6901, + "step": 18270 + }, + { + "epoch": 0.6543234193421312, + "grad_norm": 1.6333506107330322, + "learning_rate": 5.6390060964348845e-05, + "loss": 1.4167, + "step": 18271 + }, + { + "epoch": 0.6543592314716995, + "grad_norm": 1.4062167406082153, + "learning_rate": 5.637962332331416e-05, + "loss": 1.2515, + "step": 18272 + }, + { + "epoch": 0.6543950436012678, + "grad_norm": 2.1119511127471924, + "learning_rate": 5.636918626913541e-05, + "loss": 1.6651, + "step": 18273 + }, + { + "epoch": 0.654430855730836, + "grad_norm": 1.8894344568252563, + "learning_rate": 5.6358749801952946e-05, + "loss": 1.7571, + "step": 18274 + }, + { + "epoch": 0.6544666678604043, + "grad_norm": 1.7292144298553467, + "learning_rate": 5.63483139219072e-05, + "loss": 1.4696, + "step": 18275 + }, + { + "epoch": 0.6545024799899726, + "grad_norm": 1.5995243787765503, + "learning_rate": 5.633787862913864e-05, + "loss": 1.1873, + "step": 18276 + }, + { + "epoch": 0.6545382921195408, + "grad_norm": 1.5615102052688599, + "learning_rate": 5.6327443923787546e-05, + "loss": 1.3503, + "step": 18277 + }, + { + "epoch": 0.6545741042491092, + "grad_norm": 1.3338347673416138, + "learning_rate": 5.631700980599437e-05, + "loss": 1.4405, + "step": 18278 + }, + { + "epoch": 0.6546099163786775, + "grad_norm": 1.7192423343658447, + "learning_rate": 5.630657627589948e-05, + "loss": 1.7277, + "step": 18279 + }, + { + "epoch": 0.6546457285082458, + "grad_norm": 2.087306499481201, + "learning_rate": 5.629614333364328e-05, + "loss": 1.3391, + "step": 18280 + }, + { + "epoch": 0.654681540637814, + "grad_norm": 1.4687906503677368, + "learning_rate": 5.628571097936606e-05, + "loss": 1.3238, + "step": 18281 + }, + { + "epoch": 0.6547173527673823, + "grad_norm": 2.546259641647339, + "learning_rate": 5.627527921320821e-05, + "loss": 1.4288, + "step": 18282 + }, + { + "epoch": 0.6547531648969506, + "grad_norm": 1.732775092124939, + "learning_rate": 5.626484803531008e-05, + "loss": 1.4941, + "step": 18283 + }, + { + "epoch": 0.6547889770265188, + "grad_norm": 1.4935340881347656, + "learning_rate": 5.625441744581205e-05, + "loss": 1.154, + "step": 18284 + }, + { + "epoch": 0.6548247891560872, + "grad_norm": 2.029008150100708, + "learning_rate": 5.624398744485435e-05, + "loss": 1.4768, + "step": 18285 + }, + { + "epoch": 0.6548606012856555, + "grad_norm": 1.9982153177261353, + "learning_rate": 5.623355803257737e-05, + "loss": 1.6078, + "step": 18286 + }, + { + "epoch": 0.6548964134152238, + "grad_norm": 1.5812995433807373, + "learning_rate": 5.622312920912145e-05, + "loss": 1.6001, + "step": 18287 + }, + { + "epoch": 0.654932225544792, + "grad_norm": 2.2737843990325928, + "learning_rate": 5.621270097462682e-05, + "loss": 1.7967, + "step": 18288 + }, + { + "epoch": 0.6549680376743603, + "grad_norm": 2.59417724609375, + "learning_rate": 5.620227332923382e-05, + "loss": 1.2416, + "step": 18289 + }, + { + "epoch": 0.6550038498039286, + "grad_norm": 1.4791409969329834, + "learning_rate": 5.619184627308273e-05, + "loss": 1.3457, + "step": 18290 + }, + { + "epoch": 0.6550396619334968, + "grad_norm": 1.6634405851364136, + "learning_rate": 5.618141980631389e-05, + "loss": 1.632, + "step": 18291 + }, + { + "epoch": 0.6550754740630652, + "grad_norm": 2.015676975250244, + "learning_rate": 5.617099392906751e-05, + "loss": 1.3177, + "step": 18292 + }, + { + "epoch": 0.6551112861926335, + "grad_norm": 1.8615721464157104, + "learning_rate": 5.61605686414838e-05, + "loss": 1.6381, + "step": 18293 + }, + { + "epoch": 0.6551470983222017, + "grad_norm": 1.3926013708114624, + "learning_rate": 5.615014394370317e-05, + "loss": 0.9642, + "step": 18294 + }, + { + "epoch": 0.65518291045177, + "grad_norm": 1.9081752300262451, + "learning_rate": 5.6139719835865745e-05, + "loss": 1.5377, + "step": 18295 + }, + { + "epoch": 0.6552187225813383, + "grad_norm": 1.6380687952041626, + "learning_rate": 5.612929631811181e-05, + "loss": 1.3983, + "step": 18296 + }, + { + "epoch": 0.6552545347109066, + "grad_norm": 1.7119892835617065, + "learning_rate": 5.611887339058162e-05, + "loss": 1.6677, + "step": 18297 + }, + { + "epoch": 0.6552903468404748, + "grad_norm": 1.7411001920700073, + "learning_rate": 5.610845105341542e-05, + "loss": 1.6975, + "step": 18298 + }, + { + "epoch": 0.6553261589700432, + "grad_norm": 2.2733662128448486, + "learning_rate": 5.609802930675335e-05, + "loss": 1.448, + "step": 18299 + }, + { + "epoch": 0.6553619710996115, + "grad_norm": 3.091132640838623, + "learning_rate": 5.608760815073567e-05, + "loss": 1.3997, + "step": 18300 + }, + { + "epoch": 0.6553977832291797, + "grad_norm": 1.3527662754058838, + "learning_rate": 5.6077187585502624e-05, + "loss": 1.2382, + "step": 18301 + }, + { + "epoch": 0.655433595358748, + "grad_norm": 1.2386977672576904, + "learning_rate": 5.6066767611194316e-05, + "loss": 1.4487, + "step": 18302 + }, + { + "epoch": 0.6554694074883163, + "grad_norm": 2.7018849849700928, + "learning_rate": 5.6056348227951025e-05, + "loss": 1.5491, + "step": 18303 + }, + { + "epoch": 0.6555052196178845, + "grad_norm": 1.7741349935531616, + "learning_rate": 5.6045929435912805e-05, + "loss": 1.446, + "step": 18304 + }, + { + "epoch": 0.6555410317474528, + "grad_norm": 1.929344892501831, + "learning_rate": 5.603551123521997e-05, + "loss": 1.2201, + "step": 18305 + }, + { + "epoch": 0.6555768438770212, + "grad_norm": 1.3161699771881104, + "learning_rate": 5.60250936260126e-05, + "loss": 1.4935, + "step": 18306 + }, + { + "epoch": 0.6556126560065895, + "grad_norm": 1.606744647026062, + "learning_rate": 5.601467660843087e-05, + "loss": 1.2209, + "step": 18307 + }, + { + "epoch": 0.6556484681361577, + "grad_norm": 1.6163558959960938, + "learning_rate": 5.600426018261493e-05, + "loss": 1.7188, + "step": 18308 + }, + { + "epoch": 0.655684280265726, + "grad_norm": 1.3618152141571045, + "learning_rate": 5.599384434870496e-05, + "loss": 1.2453, + "step": 18309 + }, + { + "epoch": 0.6557200923952943, + "grad_norm": 1.6127922534942627, + "learning_rate": 5.5983429106841046e-05, + "loss": 1.5509, + "step": 18310 + }, + { + "epoch": 0.6557559045248625, + "grad_norm": 1.6013727188110352, + "learning_rate": 5.597301445716323e-05, + "loss": 1.6438, + "step": 18311 + }, + { + "epoch": 0.6557917166544308, + "grad_norm": 1.5582809448242188, + "learning_rate": 5.59626003998118e-05, + "loss": 1.4424, + "step": 18312 + }, + { + "epoch": 0.6558275287839992, + "grad_norm": 1.5921964645385742, + "learning_rate": 5.595218693492674e-05, + "loss": 1.3418, + "step": 18313 + }, + { + "epoch": 0.6558633409135675, + "grad_norm": 1.9930670261383057, + "learning_rate": 5.594177406264822e-05, + "loss": 1.6266, + "step": 18314 + }, + { + "epoch": 0.6558991530431357, + "grad_norm": 1.9355586767196655, + "learning_rate": 5.593136178311622e-05, + "loss": 1.26, + "step": 18315 + }, + { + "epoch": 0.655934965172704, + "grad_norm": 2.1823208332061768, + "learning_rate": 5.592095009647099e-05, + "loss": 1.57, + "step": 18316 + }, + { + "epoch": 0.6559707773022723, + "grad_norm": 1.3704748153686523, + "learning_rate": 5.591053900285248e-05, + "loss": 1.3415, + "step": 18317 + }, + { + "epoch": 0.6560065894318405, + "grad_norm": 1.7008016109466553, + "learning_rate": 5.590012850240083e-05, + "loss": 1.5811, + "step": 18318 + }, + { + "epoch": 0.6560424015614088, + "grad_norm": 2.976754665374756, + "learning_rate": 5.5889718595256026e-05, + "loss": 1.6042, + "step": 18319 + }, + { + "epoch": 0.6560782136909772, + "grad_norm": 1.7134968042373657, + "learning_rate": 5.587930928155816e-05, + "loss": 1.2071, + "step": 18320 + }, + { + "epoch": 0.6561140258205455, + "grad_norm": 1.4464317560195923, + "learning_rate": 5.586890056144732e-05, + "loss": 1.6427, + "step": 18321 + }, + { + "epoch": 0.6561498379501137, + "grad_norm": 2.000373125076294, + "learning_rate": 5.585849243506342e-05, + "loss": 1.3813, + "step": 18322 + }, + { + "epoch": 0.656185650079682, + "grad_norm": 1.743994951248169, + "learning_rate": 5.584808490254664e-05, + "loss": 1.4915, + "step": 18323 + }, + { + "epoch": 0.6562214622092503, + "grad_norm": 1.9050312042236328, + "learning_rate": 5.5837677964036894e-05, + "loss": 1.6474, + "step": 18324 + }, + { + "epoch": 0.6562572743388185, + "grad_norm": 1.8708951473236084, + "learning_rate": 5.582727161967425e-05, + "loss": 1.3392, + "step": 18325 + }, + { + "epoch": 0.6562930864683868, + "grad_norm": 1.4769537448883057, + "learning_rate": 5.5816865869598625e-05, + "loss": 1.4556, + "step": 18326 + }, + { + "epoch": 0.6563288985979552, + "grad_norm": 1.4558557271957397, + "learning_rate": 5.5806460713950145e-05, + "loss": 1.4495, + "step": 18327 + }, + { + "epoch": 0.6563647107275234, + "grad_norm": 2.0436086654663086, + "learning_rate": 5.579605615286874e-05, + "loss": 1.5254, + "step": 18328 + }, + { + "epoch": 0.6564005228570917, + "grad_norm": 1.4093581438064575, + "learning_rate": 5.578565218649433e-05, + "loss": 1.4388, + "step": 18329 + }, + { + "epoch": 0.65643633498666, + "grad_norm": 1.895922064781189, + "learning_rate": 5.577524881496694e-05, + "loss": 1.2775, + "step": 18330 + }, + { + "epoch": 0.6564721471162283, + "grad_norm": 2.0171585083007812, + "learning_rate": 5.5764846038426535e-05, + "loss": 1.3864, + "step": 18331 + }, + { + "epoch": 0.6565079592457965, + "grad_norm": 2.5566394329071045, + "learning_rate": 5.57544438570131e-05, + "loss": 1.6843, + "step": 18332 + }, + { + "epoch": 0.6565437713753648, + "grad_norm": 1.510467290878296, + "learning_rate": 5.574404227086648e-05, + "loss": 1.4668, + "step": 18333 + }, + { + "epoch": 0.6565795835049332, + "grad_norm": 1.726326584815979, + "learning_rate": 5.573364128012677e-05, + "loss": 1.2047, + "step": 18334 + }, + { + "epoch": 0.6566153956345014, + "grad_norm": 1.3015578985214233, + "learning_rate": 5.572324088493377e-05, + "loss": 1.3682, + "step": 18335 + }, + { + "epoch": 0.6566512077640697, + "grad_norm": 2.0535759925842285, + "learning_rate": 5.571284108542748e-05, + "loss": 1.4943, + "step": 18336 + }, + { + "epoch": 0.656687019893638, + "grad_norm": 1.3764218091964722, + "learning_rate": 5.5702441881747755e-05, + "loss": 1.1779, + "step": 18337 + }, + { + "epoch": 0.6567228320232062, + "grad_norm": 2.7973520755767822, + "learning_rate": 5.5692043274034544e-05, + "loss": 1.6795, + "step": 18338 + }, + { + "epoch": 0.6567586441527745, + "grad_norm": 1.9879013299942017, + "learning_rate": 5.568164526242776e-05, + "loss": 1.4525, + "step": 18339 + }, + { + "epoch": 0.6567944562823428, + "grad_norm": 1.5268080234527588, + "learning_rate": 5.5671247847067254e-05, + "loss": 1.7501, + "step": 18340 + }, + { + "epoch": 0.6568302684119112, + "grad_norm": 1.5404284000396729, + "learning_rate": 5.566085102809291e-05, + "loss": 1.3821, + "step": 18341 + }, + { + "epoch": 0.6568660805414794, + "grad_norm": 1.526283860206604, + "learning_rate": 5.565045480564463e-05, + "loss": 1.3779, + "step": 18342 + }, + { + "epoch": 0.6569018926710477, + "grad_norm": 1.485095739364624, + "learning_rate": 5.5640059179862314e-05, + "loss": 1.3956, + "step": 18343 + }, + { + "epoch": 0.656937704800616, + "grad_norm": 1.6767833232879639, + "learning_rate": 5.562966415088574e-05, + "loss": 1.6607, + "step": 18344 + }, + { + "epoch": 0.6569735169301842, + "grad_norm": 1.563358187675476, + "learning_rate": 5.5619269718854805e-05, + "loss": 1.8258, + "step": 18345 + }, + { + "epoch": 0.6570093290597525, + "grad_norm": 1.4730910062789917, + "learning_rate": 5.560887588390938e-05, + "loss": 1.1704, + "step": 18346 + }, + { + "epoch": 0.6570451411893208, + "grad_norm": 1.5037370920181274, + "learning_rate": 5.559848264618923e-05, + "loss": 1.1723, + "step": 18347 + }, + { + "epoch": 0.6570809533188892, + "grad_norm": 1.9317376613616943, + "learning_rate": 5.5588090005834224e-05, + "loss": 1.3406, + "step": 18348 + }, + { + "epoch": 0.6571167654484574, + "grad_norm": 1.5945675373077393, + "learning_rate": 5.5577697962984195e-05, + "loss": 1.4684, + "step": 18349 + }, + { + "epoch": 0.6571525775780257, + "grad_norm": 1.4103537797927856, + "learning_rate": 5.556730651777897e-05, + "loss": 1.3841, + "step": 18350 + }, + { + "epoch": 0.657188389707594, + "grad_norm": 1.6418267488479614, + "learning_rate": 5.555691567035828e-05, + "loss": 1.4893, + "step": 18351 + }, + { + "epoch": 0.6572242018371622, + "grad_norm": 1.3882322311401367, + "learning_rate": 5.554652542086196e-05, + "loss": 1.4669, + "step": 18352 + }, + { + "epoch": 0.6572600139667305, + "grad_norm": 1.9791117906570435, + "learning_rate": 5.5536135769429795e-05, + "loss": 1.3963, + "step": 18353 + }, + { + "epoch": 0.6572958260962988, + "grad_norm": 2.5772440433502197, + "learning_rate": 5.552574671620161e-05, + "loss": 1.4643, + "step": 18354 + }, + { + "epoch": 0.6573316382258672, + "grad_norm": 2.2931079864501953, + "learning_rate": 5.551535826131711e-05, + "loss": 1.4, + "step": 18355 + }, + { + "epoch": 0.6573674503554354, + "grad_norm": 2.633115768432617, + "learning_rate": 5.5504970404916066e-05, + "loss": 1.1993, + "step": 18356 + }, + { + "epoch": 0.6574032624850037, + "grad_norm": 1.4638112783432007, + "learning_rate": 5.54945831471383e-05, + "loss": 1.5759, + "step": 18357 + }, + { + "epoch": 0.657439074614572, + "grad_norm": 1.9851884841918945, + "learning_rate": 5.548419648812346e-05, + "loss": 1.5184, + "step": 18358 + }, + { + "epoch": 0.6574748867441402, + "grad_norm": 1.440222978591919, + "learning_rate": 5.547381042801135e-05, + "loss": 1.5711, + "step": 18359 + }, + { + "epoch": 0.6575106988737085, + "grad_norm": 1.8197391033172607, + "learning_rate": 5.5463424966941676e-05, + "loss": 1.0896, + "step": 18360 + }, + { + "epoch": 0.6575465110032768, + "grad_norm": 1.7370043992996216, + "learning_rate": 5.545304010505421e-05, + "loss": 1.5494, + "step": 18361 + }, + { + "epoch": 0.6575823231328451, + "grad_norm": 1.4990167617797852, + "learning_rate": 5.54426558424886e-05, + "loss": 1.5032, + "step": 18362 + }, + { + "epoch": 0.6576181352624134, + "grad_norm": 2.4379074573516846, + "learning_rate": 5.543227217938457e-05, + "loss": 1.5814, + "step": 18363 + }, + { + "epoch": 0.6576539473919817, + "grad_norm": 1.6144640445709229, + "learning_rate": 5.5421889115881875e-05, + "loss": 1.3925, + "step": 18364 + }, + { + "epoch": 0.65768975952155, + "grad_norm": 1.5260868072509766, + "learning_rate": 5.5411506652120115e-05, + "loss": 1.5798, + "step": 18365 + }, + { + "epoch": 0.6577255716511182, + "grad_norm": 1.313092589378357, + "learning_rate": 5.540112478823902e-05, + "loss": 1.3203, + "step": 18366 + }, + { + "epoch": 0.6577613837806865, + "grad_norm": 1.5859086513519287, + "learning_rate": 5.5390743524378266e-05, + "loss": 1.5731, + "step": 18367 + }, + { + "epoch": 0.6577971959102548, + "grad_norm": 1.383134365081787, + "learning_rate": 5.538036286067756e-05, + "loss": 1.347, + "step": 18368 + }, + { + "epoch": 0.6578330080398231, + "grad_norm": 1.9735016822814941, + "learning_rate": 5.5369982797276454e-05, + "loss": 1.6849, + "step": 18369 + }, + { + "epoch": 0.6578688201693914, + "grad_norm": 1.9564305543899536, + "learning_rate": 5.5359603334314695e-05, + "loss": 1.2313, + "step": 18370 + }, + { + "epoch": 0.6579046322989597, + "grad_norm": 1.9404182434082031, + "learning_rate": 5.534922447193187e-05, + "loss": 1.7799, + "step": 18371 + }, + { + "epoch": 0.657940444428528, + "grad_norm": 1.4757919311523438, + "learning_rate": 5.533884621026767e-05, + "loss": 1.3568, + "step": 18372 + }, + { + "epoch": 0.6579762565580962, + "grad_norm": 1.4914617538452148, + "learning_rate": 5.5328468549461657e-05, + "loss": 1.5968, + "step": 18373 + }, + { + "epoch": 0.6580120686876645, + "grad_norm": 1.624997615814209, + "learning_rate": 5.531809148965347e-05, + "loss": 1.6594, + "step": 18374 + }, + { + "epoch": 0.6580478808172328, + "grad_norm": 1.5839698314666748, + "learning_rate": 5.530771503098278e-05, + "loss": 1.5126, + "step": 18375 + }, + { + "epoch": 0.6580836929468011, + "grad_norm": 1.6808476448059082, + "learning_rate": 5.529733917358908e-05, + "loss": 1.7064, + "step": 18376 + }, + { + "epoch": 0.6581195050763694, + "grad_norm": 1.3805513381958008, + "learning_rate": 5.528696391761201e-05, + "loss": 1.4942, + "step": 18377 + }, + { + "epoch": 0.6581553172059377, + "grad_norm": 1.3921979665756226, + "learning_rate": 5.527658926319119e-05, + "loss": 1.5189, + "step": 18378 + }, + { + "epoch": 0.6581911293355059, + "grad_norm": 1.867268681526184, + "learning_rate": 5.52662152104662e-05, + "loss": 1.3446, + "step": 18379 + }, + { + "epoch": 0.6582269414650742, + "grad_norm": 1.994785189628601, + "learning_rate": 5.5255841759576544e-05, + "loss": 1.253, + "step": 18380 + }, + { + "epoch": 0.6582627535946425, + "grad_norm": 1.367926836013794, + "learning_rate": 5.524546891066182e-05, + "loss": 1.4459, + "step": 18381 + }, + { + "epoch": 0.6582985657242107, + "grad_norm": 1.71571946144104, + "learning_rate": 5.5235096663861617e-05, + "loss": 1.8809, + "step": 18382 + }, + { + "epoch": 0.6583343778537791, + "grad_norm": 1.36968195438385, + "learning_rate": 5.5224725019315416e-05, + "loss": 1.5383, + "step": 18383 + }, + { + "epoch": 0.6583701899833474, + "grad_norm": 2.195829391479492, + "learning_rate": 5.521435397716278e-05, + "loss": 1.4893, + "step": 18384 + }, + { + "epoch": 0.6584060021129157, + "grad_norm": 1.5559254884719849, + "learning_rate": 5.520398353754324e-05, + "loss": 1.3855, + "step": 18385 + }, + { + "epoch": 0.6584418142424839, + "grad_norm": 1.5635045766830444, + "learning_rate": 5.519361370059637e-05, + "loss": 1.3387, + "step": 18386 + }, + { + "epoch": 0.6584776263720522, + "grad_norm": 1.6653882265090942, + "learning_rate": 5.518324446646157e-05, + "loss": 1.7117, + "step": 18387 + }, + { + "epoch": 0.6585134385016205, + "grad_norm": 1.4553028345108032, + "learning_rate": 5.517287583527843e-05, + "loss": 1.4408, + "step": 18388 + }, + { + "epoch": 0.6585492506311887, + "grad_norm": 1.7703020572662354, + "learning_rate": 5.51625078071864e-05, + "loss": 1.4806, + "step": 18389 + }, + { + "epoch": 0.6585850627607571, + "grad_norm": 1.610481858253479, + "learning_rate": 5.5152140382325044e-05, + "loss": 1.4433, + "step": 18390 + }, + { + "epoch": 0.6586208748903254, + "grad_norm": 1.5309536457061768, + "learning_rate": 5.5141773560833756e-05, + "loss": 1.7228, + "step": 18391 + }, + { + "epoch": 0.6586566870198937, + "grad_norm": 1.8306372165679932, + "learning_rate": 5.5131407342852026e-05, + "loss": 1.1958, + "step": 18392 + }, + { + "epoch": 0.6586924991494619, + "grad_norm": 1.701582431793213, + "learning_rate": 5.5121041728519386e-05, + "loss": 1.4244, + "step": 18393 + }, + { + "epoch": 0.6587283112790302, + "grad_norm": 1.6949228048324585, + "learning_rate": 5.5110676717975194e-05, + "loss": 1.3162, + "step": 18394 + }, + { + "epoch": 0.6587641234085985, + "grad_norm": 1.2450233697891235, + "learning_rate": 5.510031231135895e-05, + "loss": 1.482, + "step": 18395 + }, + { + "epoch": 0.6587999355381667, + "grad_norm": 1.6780352592468262, + "learning_rate": 5.508994850881008e-05, + "loss": 1.4635, + "step": 18396 + }, + { + "epoch": 0.6588357476677351, + "grad_norm": 2.5002284049987793, + "learning_rate": 5.507958531046806e-05, + "loss": 1.4665, + "step": 18397 + }, + { + "epoch": 0.6588715597973034, + "grad_norm": 1.386528730392456, + "learning_rate": 5.506922271647228e-05, + "loss": 1.7111, + "step": 18398 + }, + { + "epoch": 0.6589073719268717, + "grad_norm": 1.9130560159683228, + "learning_rate": 5.505886072696208e-05, + "loss": 1.3824, + "step": 18399 + }, + { + "epoch": 0.6589431840564399, + "grad_norm": 1.4952514171600342, + "learning_rate": 5.504849934207701e-05, + "loss": 1.4765, + "step": 18400 + }, + { + "epoch": 0.6589789961860082, + "grad_norm": 1.4171199798583984, + "learning_rate": 5.503813856195637e-05, + "loss": 1.3389, + "step": 18401 + }, + { + "epoch": 0.6590148083155765, + "grad_norm": 1.6343742609024048, + "learning_rate": 5.5027778386739606e-05, + "loss": 1.488, + "step": 18402 + }, + { + "epoch": 0.6590506204451447, + "grad_norm": 1.364643931388855, + "learning_rate": 5.5017418816565994e-05, + "loss": 1.5663, + "step": 18403 + }, + { + "epoch": 0.6590864325747131, + "grad_norm": 2.7921016216278076, + "learning_rate": 5.500705985157508e-05, + "loss": 1.5698, + "step": 18404 + }, + { + "epoch": 0.6591222447042814, + "grad_norm": 1.4503824710845947, + "learning_rate": 5.499670149190609e-05, + "loss": 1.3045, + "step": 18405 + }, + { + "epoch": 0.6591580568338496, + "grad_norm": 1.6705472469329834, + "learning_rate": 5.498634373769843e-05, + "loss": 1.6779, + "step": 18406 + }, + { + "epoch": 0.6591938689634179, + "grad_norm": 1.5752532482147217, + "learning_rate": 5.497598658909149e-05, + "loss": 1.4986, + "step": 18407 + }, + { + "epoch": 0.6592296810929862, + "grad_norm": 1.343363642692566, + "learning_rate": 5.496563004622455e-05, + "loss": 1.4247, + "step": 18408 + }, + { + "epoch": 0.6592654932225545, + "grad_norm": 1.5922566652297974, + "learning_rate": 5.495527410923699e-05, + "loss": 1.5302, + "step": 18409 + }, + { + "epoch": 0.6593013053521227, + "grad_norm": 1.7536109685897827, + "learning_rate": 5.494491877826804e-05, + "loss": 1.1251, + "step": 18410 + }, + { + "epoch": 0.6593371174816911, + "grad_norm": 1.656254768371582, + "learning_rate": 5.493456405345716e-05, + "loss": 1.4341, + "step": 18411 + }, + { + "epoch": 0.6593729296112594, + "grad_norm": 1.5095834732055664, + "learning_rate": 5.492420993494357e-05, + "loss": 1.4502, + "step": 18412 + }, + { + "epoch": 0.6594087417408276, + "grad_norm": 1.864039421081543, + "learning_rate": 5.491385642286662e-05, + "loss": 1.2983, + "step": 18413 + }, + { + "epoch": 0.6594445538703959, + "grad_norm": 1.595781922340393, + "learning_rate": 5.49035035173655e-05, + "loss": 1.332, + "step": 18414 + }, + { + "epoch": 0.6594803659999642, + "grad_norm": 1.7632122039794922, + "learning_rate": 5.4893151218579655e-05, + "loss": 1.5275, + "step": 18415 + }, + { + "epoch": 0.6595161781295324, + "grad_norm": 1.8122587203979492, + "learning_rate": 5.488279952664826e-05, + "loss": 1.4587, + "step": 18416 + }, + { + "epoch": 0.6595519902591007, + "grad_norm": 2.209500789642334, + "learning_rate": 5.4872448441710536e-05, + "loss": 1.665, + "step": 18417 + }, + { + "epoch": 0.659587802388669, + "grad_norm": 1.7916383743286133, + "learning_rate": 5.4862097963905865e-05, + "loss": 1.6607, + "step": 18418 + }, + { + "epoch": 0.6596236145182374, + "grad_norm": 1.0907057523727417, + "learning_rate": 5.485174809337342e-05, + "loss": 1.2834, + "step": 18419 + }, + { + "epoch": 0.6596594266478056, + "grad_norm": 1.6302517652511597, + "learning_rate": 5.484139883025251e-05, + "loss": 1.4004, + "step": 18420 + }, + { + "epoch": 0.6596952387773739, + "grad_norm": 1.5323625802993774, + "learning_rate": 5.4831050174682243e-05, + "loss": 1.5594, + "step": 18421 + }, + { + "epoch": 0.6597310509069422, + "grad_norm": 1.9435088634490967, + "learning_rate": 5.482070212680201e-05, + "loss": 1.5246, + "step": 18422 + }, + { + "epoch": 0.6597668630365104, + "grad_norm": 2.1833503246307373, + "learning_rate": 5.481035468675092e-05, + "loss": 1.4532, + "step": 18423 + }, + { + "epoch": 0.6598026751660787, + "grad_norm": 1.534131646156311, + "learning_rate": 5.4800007854668254e-05, + "loss": 1.3368, + "step": 18424 + }, + { + "epoch": 0.659838487295647, + "grad_norm": 2.482542037963867, + "learning_rate": 5.478966163069313e-05, + "loss": 1.5933, + "step": 18425 + }, + { + "epoch": 0.6598742994252154, + "grad_norm": 1.8399754762649536, + "learning_rate": 5.47793160149648e-05, + "loss": 1.5144, + "step": 18426 + }, + { + "epoch": 0.6599101115547836, + "grad_norm": 1.5900001525878906, + "learning_rate": 5.476897100762248e-05, + "loss": 1.6037, + "step": 18427 + }, + { + "epoch": 0.6599459236843519, + "grad_norm": 1.6989408731460571, + "learning_rate": 5.475862660880529e-05, + "loss": 1.1131, + "step": 18428 + }, + { + "epoch": 0.6599817358139202, + "grad_norm": 1.6499292850494385, + "learning_rate": 5.4748282818652386e-05, + "loss": 1.4069, + "step": 18429 + }, + { + "epoch": 0.6600175479434884, + "grad_norm": 1.6214898824691772, + "learning_rate": 5.473793963730299e-05, + "loss": 1.3842, + "step": 18430 + }, + { + "epoch": 0.6600533600730567, + "grad_norm": 1.480055332183838, + "learning_rate": 5.4727597064896276e-05, + "loss": 1.4681, + "step": 18431 + }, + { + "epoch": 0.660089172202625, + "grad_norm": 1.5914380550384521, + "learning_rate": 5.4717255101571253e-05, + "loss": 1.3343, + "step": 18432 + }, + { + "epoch": 0.6601249843321934, + "grad_norm": 1.7542521953582764, + "learning_rate": 5.470691374746724e-05, + "loss": 1.5553, + "step": 18433 + }, + { + "epoch": 0.6601607964617616, + "grad_norm": 2.195375442504883, + "learning_rate": 5.469657300272326e-05, + "loss": 1.4683, + "step": 18434 + }, + { + "epoch": 0.6601966085913299, + "grad_norm": 1.4942426681518555, + "learning_rate": 5.468623286747844e-05, + "loss": 1.2783, + "step": 18435 + }, + { + "epoch": 0.6602324207208982, + "grad_norm": 1.32637619972229, + "learning_rate": 5.4675893341871886e-05, + "loss": 1.4105, + "step": 18436 + }, + { + "epoch": 0.6602682328504664, + "grad_norm": 1.3210185766220093, + "learning_rate": 5.4665554426042734e-05, + "loss": 1.0158, + "step": 18437 + }, + { + "epoch": 0.6603040449800347, + "grad_norm": 1.8867077827453613, + "learning_rate": 5.465521612013012e-05, + "loss": 1.3697, + "step": 18438 + }, + { + "epoch": 0.660339857109603, + "grad_norm": 1.9586296081542969, + "learning_rate": 5.464487842427302e-05, + "loss": 1.6983, + "step": 18439 + }, + { + "epoch": 0.6603756692391713, + "grad_norm": 1.508347511291504, + "learning_rate": 5.463454133861059e-05, + "loss": 1.2857, + "step": 18440 + }, + { + "epoch": 0.6604114813687396, + "grad_norm": 1.4944887161254883, + "learning_rate": 5.462420486328188e-05, + "loss": 1.3681, + "step": 18441 + }, + { + "epoch": 0.6604472934983079, + "grad_norm": 1.4582613706588745, + "learning_rate": 5.461386899842601e-05, + "loss": 1.1148, + "step": 18442 + }, + { + "epoch": 0.6604831056278762, + "grad_norm": 1.7177790403366089, + "learning_rate": 5.460353374418195e-05, + "loss": 1.6026, + "step": 18443 + }, + { + "epoch": 0.6605189177574444, + "grad_norm": 1.6896547079086304, + "learning_rate": 5.459319910068879e-05, + "loss": 1.4609, + "step": 18444 + }, + { + "epoch": 0.6605547298870127, + "grad_norm": 1.8076750040054321, + "learning_rate": 5.4582865068085585e-05, + "loss": 1.537, + "step": 18445 + }, + { + "epoch": 0.660590542016581, + "grad_norm": 1.894033670425415, + "learning_rate": 5.4572531646511325e-05, + "loss": 1.5241, + "step": 18446 + }, + { + "epoch": 0.6606263541461493, + "grad_norm": 1.7730125188827515, + "learning_rate": 5.456219883610505e-05, + "loss": 1.5734, + "step": 18447 + }, + { + "epoch": 0.6606621662757176, + "grad_norm": 1.4446651935577393, + "learning_rate": 5.455186663700578e-05, + "loss": 1.6482, + "step": 18448 + }, + { + "epoch": 0.6606979784052859, + "grad_norm": 1.781532883644104, + "learning_rate": 5.4541535049352566e-05, + "loss": 1.6462, + "step": 18449 + }, + { + "epoch": 0.6607337905348541, + "grad_norm": 1.8322112560272217, + "learning_rate": 5.4531204073284316e-05, + "loss": 1.7429, + "step": 18450 + }, + { + "epoch": 0.6607696026644224, + "grad_norm": 1.6175395250320435, + "learning_rate": 5.4520873708940056e-05, + "loss": 1.3589, + "step": 18451 + }, + { + "epoch": 0.6608054147939907, + "grad_norm": 1.7501140832901, + "learning_rate": 5.451054395645883e-05, + "loss": 1.6793, + "step": 18452 + }, + { + "epoch": 0.660841226923559, + "grad_norm": 1.5763481855392456, + "learning_rate": 5.450021481597951e-05, + "loss": 1.3531, + "step": 18453 + }, + { + "epoch": 0.6608770390531273, + "grad_norm": 1.719218134880066, + "learning_rate": 5.448988628764111e-05, + "loss": 1.3094, + "step": 18454 + }, + { + "epoch": 0.6609128511826956, + "grad_norm": 1.532935619354248, + "learning_rate": 5.4479558371582584e-05, + "loss": 1.3702, + "step": 18455 + }, + { + "epoch": 0.6609486633122639, + "grad_norm": 1.7714307308197021, + "learning_rate": 5.446923106794293e-05, + "loss": 1.5029, + "step": 18456 + }, + { + "epoch": 0.6609844754418321, + "grad_norm": 1.7432639598846436, + "learning_rate": 5.4458904376860997e-05, + "loss": 1.5302, + "step": 18457 + }, + { + "epoch": 0.6610202875714004, + "grad_norm": 1.608049750328064, + "learning_rate": 5.444857829847576e-05, + "loss": 1.1499, + "step": 18458 + }, + { + "epoch": 0.6610560997009687, + "grad_norm": 1.6713452339172363, + "learning_rate": 5.443825283292615e-05, + "loss": 1.4979, + "step": 18459 + }, + { + "epoch": 0.661091911830537, + "grad_norm": 1.5441710948944092, + "learning_rate": 5.4427927980351124e-05, + "loss": 1.309, + "step": 18460 + }, + { + "epoch": 0.6611277239601053, + "grad_norm": 1.8041974306106567, + "learning_rate": 5.441760374088949e-05, + "loss": 1.2219, + "step": 18461 + }, + { + "epoch": 0.6611635360896736, + "grad_norm": 1.397838830947876, + "learning_rate": 5.4407280114680206e-05, + "loss": 1.2665, + "step": 18462 + }, + { + "epoch": 0.6611993482192419, + "grad_norm": 1.60728919506073, + "learning_rate": 5.439695710186219e-05, + "loss": 1.5308, + "step": 18463 + }, + { + "epoch": 0.6612351603488101, + "grad_norm": 1.4930665493011475, + "learning_rate": 5.4386634702574255e-05, + "loss": 1.5292, + "step": 18464 + }, + { + "epoch": 0.6612709724783784, + "grad_norm": 1.4357012510299683, + "learning_rate": 5.437631291695533e-05, + "loss": 1.4713, + "step": 18465 + }, + { + "epoch": 0.6613067846079467, + "grad_norm": 1.4895292520523071, + "learning_rate": 5.436599174514425e-05, + "loss": 1.5546, + "step": 18466 + }, + { + "epoch": 0.6613425967375149, + "grad_norm": 1.6815274953842163, + "learning_rate": 5.435567118727993e-05, + "loss": 1.2031, + "step": 18467 + }, + { + "epoch": 0.6613784088670833, + "grad_norm": 1.4173117876052856, + "learning_rate": 5.434535124350113e-05, + "loss": 1.6375, + "step": 18468 + }, + { + "epoch": 0.6614142209966516, + "grad_norm": 1.7272834777832031, + "learning_rate": 5.433503191394675e-05, + "loss": 1.4441, + "step": 18469 + }, + { + "epoch": 0.6614500331262199, + "grad_norm": 1.6564527750015259, + "learning_rate": 5.432471319875565e-05, + "loss": 1.4019, + "step": 18470 + }, + { + "epoch": 0.6614858452557881, + "grad_norm": 1.524398922920227, + "learning_rate": 5.431439509806657e-05, + "loss": 1.3863, + "step": 18471 + }, + { + "epoch": 0.6615216573853564, + "grad_norm": 1.881738543510437, + "learning_rate": 5.4304077612018375e-05, + "loss": 1.5866, + "step": 18472 + }, + { + "epoch": 0.6615574695149247, + "grad_norm": 1.785723328590393, + "learning_rate": 5.429376074074988e-05, + "loss": 1.595, + "step": 18473 + }, + { + "epoch": 0.6615932816444929, + "grad_norm": 1.6554456949234009, + "learning_rate": 5.4283444484399904e-05, + "loss": 1.2719, + "step": 18474 + }, + { + "epoch": 0.6616290937740613, + "grad_norm": 2.057368516921997, + "learning_rate": 5.427312884310718e-05, + "loss": 1.5038, + "step": 18475 + }, + { + "epoch": 0.6616649059036296, + "grad_norm": 1.655664086341858, + "learning_rate": 5.426281381701053e-05, + "loss": 1.4807, + "step": 18476 + }, + { + "epoch": 0.6617007180331979, + "grad_norm": 1.4010266065597534, + "learning_rate": 5.4252499406248724e-05, + "loss": 1.34, + "step": 18477 + }, + { + "epoch": 0.6617365301627661, + "grad_norm": 1.8840229511260986, + "learning_rate": 5.424218561096055e-05, + "loss": 1.7182, + "step": 18478 + }, + { + "epoch": 0.6617723422923344, + "grad_norm": 1.9274563789367676, + "learning_rate": 5.423187243128472e-05, + "loss": 1.5018, + "step": 18479 + }, + { + "epoch": 0.6618081544219027, + "grad_norm": 1.7007853984832764, + "learning_rate": 5.4221559867360014e-05, + "loss": 1.5183, + "step": 18480 + }, + { + "epoch": 0.6618439665514709, + "grad_norm": 1.4750630855560303, + "learning_rate": 5.4211247919325206e-05, + "loss": 1.5261, + "step": 18481 + }, + { + "epoch": 0.6618797786810393, + "grad_norm": 1.4993181228637695, + "learning_rate": 5.4200936587318954e-05, + "loss": 1.4829, + "step": 18482 + }, + { + "epoch": 0.6619155908106076, + "grad_norm": 2.4123010635375977, + "learning_rate": 5.4190625871480016e-05, + "loss": 1.646, + "step": 18483 + }, + { + "epoch": 0.6619514029401758, + "grad_norm": 1.5104199647903442, + "learning_rate": 5.4180315771947123e-05, + "loss": 1.4037, + "step": 18484 + }, + { + "epoch": 0.6619872150697441, + "grad_norm": 1.7275927066802979, + "learning_rate": 5.417000628885902e-05, + "loss": 1.6069, + "step": 18485 + }, + { + "epoch": 0.6620230271993124, + "grad_norm": 1.8526519536972046, + "learning_rate": 5.415969742235432e-05, + "loss": 1.7501, + "step": 18486 + }, + { + "epoch": 0.6620588393288807, + "grad_norm": 2.1937081813812256, + "learning_rate": 5.414938917257177e-05, + "loss": 1.4062, + "step": 18487 + }, + { + "epoch": 0.6620946514584489, + "grad_norm": 1.4393888711929321, + "learning_rate": 5.4139081539650084e-05, + "loss": 1.3466, + "step": 18488 + }, + { + "epoch": 0.6621304635880173, + "grad_norm": 1.2877326011657715, + "learning_rate": 5.412877452372784e-05, + "loss": 1.4105, + "step": 18489 + }, + { + "epoch": 0.6621662757175856, + "grad_norm": 1.5457091331481934, + "learning_rate": 5.411846812494379e-05, + "loss": 1.4512, + "step": 18490 + }, + { + "epoch": 0.6622020878471538, + "grad_norm": 2.318279266357422, + "learning_rate": 5.410816234343656e-05, + "loss": 1.4582, + "step": 18491 + }, + { + "epoch": 0.6622378999767221, + "grad_norm": 2.045260429382324, + "learning_rate": 5.4097857179344846e-05, + "loss": 1.4889, + "step": 18492 + }, + { + "epoch": 0.6622737121062904, + "grad_norm": 2.0480244159698486, + "learning_rate": 5.4087552632807225e-05, + "loss": 1.3724, + "step": 18493 + }, + { + "epoch": 0.6623095242358586, + "grad_norm": 1.5088456869125366, + "learning_rate": 5.407724870396235e-05, + "loss": 1.4677, + "step": 18494 + }, + { + "epoch": 0.6623453363654269, + "grad_norm": 1.4615206718444824, + "learning_rate": 5.4066945392948896e-05, + "loss": 1.0757, + "step": 18495 + }, + { + "epoch": 0.6623811484949953, + "grad_norm": 1.892917275428772, + "learning_rate": 5.40566426999054e-05, + "loss": 1.5292, + "step": 18496 + }, + { + "epoch": 0.6624169606245636, + "grad_norm": 1.6644346714019775, + "learning_rate": 5.404634062497057e-05, + "loss": 1.5624, + "step": 18497 + }, + { + "epoch": 0.6624527727541318, + "grad_norm": 2.008518695831299, + "learning_rate": 5.403603916828286e-05, + "loss": 1.6561, + "step": 18498 + }, + { + "epoch": 0.6624885848837001, + "grad_norm": 1.7898550033569336, + "learning_rate": 5.4025738329981035e-05, + "loss": 1.4068, + "step": 18499 + }, + { + "epoch": 0.6625243970132684, + "grad_norm": 1.7520909309387207, + "learning_rate": 5.401543811020356e-05, + "loss": 1.2904, + "step": 18500 + }, + { + "epoch": 0.6625602091428366, + "grad_norm": 1.633162021636963, + "learning_rate": 5.400513850908905e-05, + "loss": 1.3964, + "step": 18501 + }, + { + "epoch": 0.6625960212724049, + "grad_norm": 2.3457908630371094, + "learning_rate": 5.3994839526776065e-05, + "loss": 1.8168, + "step": 18502 + }, + { + "epoch": 0.6626318334019733, + "grad_norm": 1.3527460098266602, + "learning_rate": 5.398454116340322e-05, + "loss": 1.3832, + "step": 18503 + }, + { + "epoch": 0.6626676455315416, + "grad_norm": 1.643717646598816, + "learning_rate": 5.3974243419109016e-05, + "loss": 1.1241, + "step": 18504 + }, + { + "epoch": 0.6627034576611098, + "grad_norm": 1.5871790647506714, + "learning_rate": 5.396394629403192e-05, + "loss": 1.5845, + "step": 18505 + }, + { + "epoch": 0.6627392697906781, + "grad_norm": 1.3667486906051636, + "learning_rate": 5.395364978831061e-05, + "loss": 1.5827, + "step": 18506 + }, + { + "epoch": 0.6627750819202464, + "grad_norm": 1.8814173936843872, + "learning_rate": 5.394335390208352e-05, + "loss": 1.1862, + "step": 18507 + }, + { + "epoch": 0.6628108940498146, + "grad_norm": 1.5491501092910767, + "learning_rate": 5.393305863548924e-05, + "loss": 1.4095, + "step": 18508 + }, + { + "epoch": 0.6628467061793829, + "grad_norm": 1.667093276977539, + "learning_rate": 5.392276398866615e-05, + "loss": 1.4577, + "step": 18509 + }, + { + "epoch": 0.6628825183089513, + "grad_norm": 1.9760993719100952, + "learning_rate": 5.391246996175291e-05, + "loss": 1.5, + "step": 18510 + }, + { + "epoch": 0.6629183304385196, + "grad_norm": 2.045569658279419, + "learning_rate": 5.39021765548879e-05, + "loss": 1.6401, + "step": 18511 + }, + { + "epoch": 0.6629541425680878, + "grad_norm": 1.5511012077331543, + "learning_rate": 5.3891883768209686e-05, + "loss": 1.3945, + "step": 18512 + }, + { + "epoch": 0.6629899546976561, + "grad_norm": 1.8980215787887573, + "learning_rate": 5.388159160185665e-05, + "loss": 1.3399, + "step": 18513 + }, + { + "epoch": 0.6630257668272244, + "grad_norm": 1.6040958166122437, + "learning_rate": 5.387130005596732e-05, + "loss": 1.5281, + "step": 18514 + }, + { + "epoch": 0.6630615789567926, + "grad_norm": 2.547346830368042, + "learning_rate": 5.386100913068017e-05, + "loss": 1.5416, + "step": 18515 + }, + { + "epoch": 0.6630973910863609, + "grad_norm": 1.6553081274032593, + "learning_rate": 5.385071882613357e-05, + "loss": 1.2632, + "step": 18516 + }, + { + "epoch": 0.6631332032159293, + "grad_norm": 1.6644037961959839, + "learning_rate": 5.3840429142466096e-05, + "loss": 1.786, + "step": 18517 + }, + { + "epoch": 0.6631690153454975, + "grad_norm": 1.7519235610961914, + "learning_rate": 5.383014007981606e-05, + "loss": 1.5851, + "step": 18518 + }, + { + "epoch": 0.6632048274750658, + "grad_norm": 1.6565033197402954, + "learning_rate": 5.381985163832197e-05, + "loss": 1.5017, + "step": 18519 + }, + { + "epoch": 0.6632406396046341, + "grad_norm": 1.5082019567489624, + "learning_rate": 5.380956381812213e-05, + "loss": 1.5402, + "step": 18520 + }, + { + "epoch": 0.6632764517342024, + "grad_norm": 1.5436220169067383, + "learning_rate": 5.379927661935511e-05, + "loss": 1.2787, + "step": 18521 + }, + { + "epoch": 0.6633122638637706, + "grad_norm": 1.599420189857483, + "learning_rate": 5.3788990042159224e-05, + "loss": 1.3577, + "step": 18522 + }, + { + "epoch": 0.6633480759933389, + "grad_norm": 1.4425898790359497, + "learning_rate": 5.377870408667285e-05, + "loss": 1.3236, + "step": 18523 + }, + { + "epoch": 0.6633838881229073, + "grad_norm": 1.4394543170928955, + "learning_rate": 5.3768418753034375e-05, + "loss": 1.5358, + "step": 18524 + }, + { + "epoch": 0.6634197002524755, + "grad_norm": 1.8449910879135132, + "learning_rate": 5.375813404138219e-05, + "loss": 1.5375, + "step": 18525 + }, + { + "epoch": 0.6634555123820438, + "grad_norm": 1.8355271816253662, + "learning_rate": 5.37478499518547e-05, + "loss": 1.309, + "step": 18526 + }, + { + "epoch": 0.6634913245116121, + "grad_norm": 1.466902256011963, + "learning_rate": 5.3737566484590164e-05, + "loss": 1.3721, + "step": 18527 + }, + { + "epoch": 0.6635271366411803, + "grad_norm": 1.5819684267044067, + "learning_rate": 5.372728363972706e-05, + "loss": 1.4732, + "step": 18528 + }, + { + "epoch": 0.6635629487707486, + "grad_norm": 1.633408546447754, + "learning_rate": 5.371700141740364e-05, + "loss": 1.9911, + "step": 18529 + }, + { + "epoch": 0.6635987609003169, + "grad_norm": 1.6528607606887817, + "learning_rate": 5.3706719817758286e-05, + "loss": 1.5908, + "step": 18530 + }, + { + "epoch": 0.6636345730298853, + "grad_norm": 2.3122167587280273, + "learning_rate": 5.3696438840929276e-05, + "loss": 1.552, + "step": 18531 + }, + { + "epoch": 0.6636703851594535, + "grad_norm": 1.6594113111495972, + "learning_rate": 5.368615848705496e-05, + "loss": 1.7639, + "step": 18532 + }, + { + "epoch": 0.6637061972890218, + "grad_norm": 1.49611496925354, + "learning_rate": 5.367587875627367e-05, + "loss": 1.2976, + "step": 18533 + }, + { + "epoch": 0.6637420094185901, + "grad_norm": 1.8043385744094849, + "learning_rate": 5.366559964872364e-05, + "loss": 1.3638, + "step": 18534 + }, + { + "epoch": 0.6637778215481583, + "grad_norm": 1.6629353761672974, + "learning_rate": 5.36553211645432e-05, + "loss": 1.504, + "step": 18535 + }, + { + "epoch": 0.6638136336777266, + "grad_norm": 2.4057998657226562, + "learning_rate": 5.3645043303870634e-05, + "loss": 1.3973, + "step": 18536 + }, + { + "epoch": 0.6638494458072949, + "grad_norm": 1.4145482778549194, + "learning_rate": 5.363476606684425e-05, + "loss": 1.411, + "step": 18537 + }, + { + "epoch": 0.6638852579368633, + "grad_norm": 1.9292107820510864, + "learning_rate": 5.3624489453602255e-05, + "loss": 1.4168, + "step": 18538 + }, + { + "epoch": 0.6639210700664315, + "grad_norm": 2.106179714202881, + "learning_rate": 5.361421346428294e-05, + "loss": 1.4649, + "step": 18539 + }, + { + "epoch": 0.6639568821959998, + "grad_norm": 1.8230222463607788, + "learning_rate": 5.3603938099024576e-05, + "loss": 1.3864, + "step": 18540 + }, + { + "epoch": 0.6639926943255681, + "grad_norm": 1.3416988849639893, + "learning_rate": 5.359366335796534e-05, + "loss": 1.3887, + "step": 18541 + }, + { + "epoch": 0.6640285064551363, + "grad_norm": 1.5865682363510132, + "learning_rate": 5.35833892412435e-05, + "loss": 1.6314, + "step": 18542 + }, + { + "epoch": 0.6640643185847046, + "grad_norm": 1.7829359769821167, + "learning_rate": 5.3573115748997284e-05, + "loss": 1.3659, + "step": 18543 + }, + { + "epoch": 0.6641001307142729, + "grad_norm": 1.5039947032928467, + "learning_rate": 5.356284288136496e-05, + "loss": 1.2948, + "step": 18544 + }, + { + "epoch": 0.6641359428438413, + "grad_norm": 1.5445626974105835, + "learning_rate": 5.3552570638484644e-05, + "loss": 1.2314, + "step": 18545 + }, + { + "epoch": 0.6641717549734095, + "grad_norm": 1.75049889087677, + "learning_rate": 5.3542299020494567e-05, + "loss": 1.4201, + "step": 18546 + }, + { + "epoch": 0.6642075671029778, + "grad_norm": 1.5456581115722656, + "learning_rate": 5.3532028027532947e-05, + "loss": 1.6774, + "step": 18547 + }, + { + "epoch": 0.6642433792325461, + "grad_norm": 1.814924955368042, + "learning_rate": 5.352175765973797e-05, + "loss": 1.4873, + "step": 18548 + }, + { + "epoch": 0.6642791913621143, + "grad_norm": 1.8714579343795776, + "learning_rate": 5.351148791724776e-05, + "loss": 1.3834, + "step": 18549 + }, + { + "epoch": 0.6643150034916826, + "grad_norm": 1.5379172563552856, + "learning_rate": 5.3501218800200514e-05, + "loss": 1.426, + "step": 18550 + }, + { + "epoch": 0.6643508156212509, + "grad_norm": 1.540379524230957, + "learning_rate": 5.349095030873443e-05, + "loss": 1.4127, + "step": 18551 + }, + { + "epoch": 0.6643866277508192, + "grad_norm": 1.8711189031600952, + "learning_rate": 5.348068244298758e-05, + "loss": 1.3488, + "step": 18552 + }, + { + "epoch": 0.6644224398803875, + "grad_norm": 1.440477728843689, + "learning_rate": 5.347041520309815e-05, + "loss": 1.506, + "step": 18553 + }, + { + "epoch": 0.6644582520099558, + "grad_norm": 1.2828096151351929, + "learning_rate": 5.346014858920425e-05, + "loss": 1.4859, + "step": 18554 + }, + { + "epoch": 0.664494064139524, + "grad_norm": 1.695265769958496, + "learning_rate": 5.3449882601444054e-05, + "loss": 1.2407, + "step": 18555 + }, + { + "epoch": 0.6645298762690923, + "grad_norm": 2.340801239013672, + "learning_rate": 5.343961723995561e-05, + "loss": 1.5744, + "step": 18556 + }, + { + "epoch": 0.6645656883986606, + "grad_norm": 1.5414936542510986, + "learning_rate": 5.342935250487706e-05, + "loss": 1.3723, + "step": 18557 + }, + { + "epoch": 0.6646015005282289, + "grad_norm": 1.4003174304962158, + "learning_rate": 5.341908839634654e-05, + "loss": 1.6129, + "step": 18558 + }, + { + "epoch": 0.6646373126577972, + "grad_norm": 1.514754295349121, + "learning_rate": 5.340882491450205e-05, + "loss": 1.3989, + "step": 18559 + }, + { + "epoch": 0.6646731247873655, + "grad_norm": 1.9830070734024048, + "learning_rate": 5.339856205948175e-05, + "loss": 1.7887, + "step": 18560 + }, + { + "epoch": 0.6647089369169338, + "grad_norm": 1.8465994596481323, + "learning_rate": 5.338829983142366e-05, + "loss": 1.9694, + "step": 18561 + }, + { + "epoch": 0.664744749046502, + "grad_norm": 1.5064421892166138, + "learning_rate": 5.337803823046592e-05, + "loss": 1.391, + "step": 18562 + }, + { + "epoch": 0.6647805611760703, + "grad_norm": 1.7226872444152832, + "learning_rate": 5.33677772567465e-05, + "loss": 1.4824, + "step": 18563 + }, + { + "epoch": 0.6648163733056386, + "grad_norm": 2.0793983936309814, + "learning_rate": 5.335751691040348e-05, + "loss": 1.7713, + "step": 18564 + }, + { + "epoch": 0.6648521854352069, + "grad_norm": 1.8134207725524902, + "learning_rate": 5.334725719157492e-05, + "loss": 1.3679, + "step": 18565 + }, + { + "epoch": 0.6648879975647752, + "grad_norm": 2.0199649333953857, + "learning_rate": 5.333699810039885e-05, + "loss": 1.5586, + "step": 18566 + }, + { + "epoch": 0.6649238096943435, + "grad_norm": 1.5236722230911255, + "learning_rate": 5.3326739637013255e-05, + "loss": 1.5598, + "step": 18567 + }, + { + "epoch": 0.6649596218239118, + "grad_norm": 1.9306199550628662, + "learning_rate": 5.3316481801556173e-05, + "loss": 1.6249, + "step": 18568 + }, + { + "epoch": 0.66499543395348, + "grad_norm": 1.7623952627182007, + "learning_rate": 5.3306224594165654e-05, + "loss": 1.5419, + "step": 18569 + }, + { + "epoch": 0.6650312460830483, + "grad_norm": 1.5631310939788818, + "learning_rate": 5.3295968014979613e-05, + "loss": 1.6085, + "step": 18570 + }, + { + "epoch": 0.6650670582126166, + "grad_norm": 1.6538745164871216, + "learning_rate": 5.328571206413607e-05, + "loss": 1.1281, + "step": 18571 + }, + { + "epoch": 0.6651028703421848, + "grad_norm": 1.778361439704895, + "learning_rate": 5.3275456741773025e-05, + "loss": 1.6629, + "step": 18572 + }, + { + "epoch": 0.6651386824717532, + "grad_norm": 1.872376561164856, + "learning_rate": 5.3265202048028474e-05, + "loss": 1.7646, + "step": 18573 + }, + { + "epoch": 0.6651744946013215, + "grad_norm": 1.703460693359375, + "learning_rate": 5.32549479830403e-05, + "loss": 1.2561, + "step": 18574 + }, + { + "epoch": 0.6652103067308898, + "grad_norm": 1.7537646293640137, + "learning_rate": 5.324469454694651e-05, + "loss": 1.3299, + "step": 18575 + }, + { + "epoch": 0.665246118860458, + "grad_norm": 1.5586273670196533, + "learning_rate": 5.323444173988509e-05, + "loss": 1.397, + "step": 18576 + }, + { + "epoch": 0.6652819309900263, + "grad_norm": 1.939870834350586, + "learning_rate": 5.3224189561993886e-05, + "loss": 1.5419, + "step": 18577 + }, + { + "epoch": 0.6653177431195946, + "grad_norm": 1.8316444158554077, + "learning_rate": 5.321393801341088e-05, + "loss": 1.1162, + "step": 18578 + }, + { + "epoch": 0.6653535552491628, + "grad_norm": 1.9167686700820923, + "learning_rate": 5.320368709427399e-05, + "loss": 1.6643, + "step": 18579 + }, + { + "epoch": 0.6653893673787312, + "grad_norm": 1.688992977142334, + "learning_rate": 5.3193436804721154e-05, + "loss": 1.4744, + "step": 18580 + }, + { + "epoch": 0.6654251795082995, + "grad_norm": 1.4735733270645142, + "learning_rate": 5.318318714489021e-05, + "loss": 1.5833, + "step": 18581 + }, + { + "epoch": 0.6654609916378678, + "grad_norm": 1.6011989116668701, + "learning_rate": 5.317293811491911e-05, + "loss": 1.3692, + "step": 18582 + }, + { + "epoch": 0.665496803767436, + "grad_norm": 1.7354061603546143, + "learning_rate": 5.316268971494571e-05, + "loss": 1.3453, + "step": 18583 + }, + { + "epoch": 0.6655326158970043, + "grad_norm": 1.5282679796218872, + "learning_rate": 5.315244194510795e-05, + "loss": 1.5145, + "step": 18584 + }, + { + "epoch": 0.6655684280265726, + "grad_norm": 1.8057043552398682, + "learning_rate": 5.3142194805543625e-05, + "loss": 1.2936, + "step": 18585 + }, + { + "epoch": 0.6656042401561408, + "grad_norm": 1.4133145809173584, + "learning_rate": 5.313194829639061e-05, + "loss": 1.4274, + "step": 18586 + }, + { + "epoch": 0.6656400522857092, + "grad_norm": 1.4651211500167847, + "learning_rate": 5.312170241778682e-05, + "loss": 1.5827, + "step": 18587 + }, + { + "epoch": 0.6656758644152775, + "grad_norm": 1.4076982736587524, + "learning_rate": 5.311145716987003e-05, + "loss": 1.4423, + "step": 18588 + }, + { + "epoch": 0.6657116765448458, + "grad_norm": 1.5630006790161133, + "learning_rate": 5.310121255277809e-05, + "loss": 1.2223, + "step": 18589 + }, + { + "epoch": 0.665747488674414, + "grad_norm": 1.5476254224777222, + "learning_rate": 5.3090968566648836e-05, + "loss": 1.3305, + "step": 18590 + }, + { + "epoch": 0.6657833008039823, + "grad_norm": 1.4601191282272339, + "learning_rate": 5.308072521162013e-05, + "loss": 1.3458, + "step": 18591 + }, + { + "epoch": 0.6658191129335506, + "grad_norm": 1.768479347229004, + "learning_rate": 5.307048248782975e-05, + "loss": 1.4372, + "step": 18592 + }, + { + "epoch": 0.6658549250631188, + "grad_norm": 1.4947692155838013, + "learning_rate": 5.306024039541542e-05, + "loss": 1.4677, + "step": 18593 + }, + { + "epoch": 0.6658907371926872, + "grad_norm": 1.4802711009979248, + "learning_rate": 5.3049998934515076e-05, + "loss": 1.3701, + "step": 18594 + }, + { + "epoch": 0.6659265493222555, + "grad_norm": 1.391737461090088, + "learning_rate": 5.30397581052664e-05, + "loss": 1.2477, + "step": 18595 + }, + { + "epoch": 0.6659623614518237, + "grad_norm": 2.0155186653137207, + "learning_rate": 5.302951790780725e-05, + "loss": 1.8189, + "step": 18596 + }, + { + "epoch": 0.665998173581392, + "grad_norm": 1.9427070617675781, + "learning_rate": 5.3019278342275256e-05, + "loss": 1.421, + "step": 18597 + }, + { + "epoch": 0.6660339857109603, + "grad_norm": 1.6364455223083496, + "learning_rate": 5.300903940880837e-05, + "loss": 1.5302, + "step": 18598 + }, + { + "epoch": 0.6660697978405286, + "grad_norm": 1.3640127182006836, + "learning_rate": 5.299880110754418e-05, + "loss": 1.4539, + "step": 18599 + }, + { + "epoch": 0.6661056099700968, + "grad_norm": 1.8149890899658203, + "learning_rate": 5.298856343862051e-05, + "loss": 1.3804, + "step": 18600 + }, + { + "epoch": 0.6661414220996652, + "grad_norm": 2.145496368408203, + "learning_rate": 5.2978326402175125e-05, + "loss": 1.3423, + "step": 18601 + }, + { + "epoch": 0.6661772342292335, + "grad_norm": 1.472667932510376, + "learning_rate": 5.296808999834565e-05, + "loss": 1.0807, + "step": 18602 + }, + { + "epoch": 0.6662130463588017, + "grad_norm": 2.075178861618042, + "learning_rate": 5.295785422726991e-05, + "loss": 1.5908, + "step": 18603 + }, + { + "epoch": 0.66624885848837, + "grad_norm": 1.5914913415908813, + "learning_rate": 5.2947619089085463e-05, + "loss": 1.2944, + "step": 18604 + }, + { + "epoch": 0.6662846706179383, + "grad_norm": 1.5347884893417358, + "learning_rate": 5.2937384583930204e-05, + "loss": 1.5397, + "step": 18605 + }, + { + "epoch": 0.6663204827475065, + "grad_norm": 1.6247977018356323, + "learning_rate": 5.2927150711941675e-05, + "loss": 1.1967, + "step": 18606 + }, + { + "epoch": 0.6663562948770748, + "grad_norm": 1.4314088821411133, + "learning_rate": 5.2916917473257665e-05, + "loss": 1.5123, + "step": 18607 + }, + { + "epoch": 0.6663921070066432, + "grad_norm": 1.2514408826828003, + "learning_rate": 5.2906684868015724e-05, + "loss": 1.664, + "step": 18608 + }, + { + "epoch": 0.6664279191362115, + "grad_norm": 1.1693495512008667, + "learning_rate": 5.2896452896353656e-05, + "loss": 1.2389, + "step": 18609 + }, + { + "epoch": 0.6664637312657797, + "grad_norm": 2.151035785675049, + "learning_rate": 5.2886221558409065e-05, + "loss": 1.4911, + "step": 18610 + }, + { + "epoch": 0.666499543395348, + "grad_norm": 1.815909743309021, + "learning_rate": 5.287599085431951e-05, + "loss": 1.7131, + "step": 18611 + }, + { + "epoch": 0.6665353555249163, + "grad_norm": 1.4415584802627563, + "learning_rate": 5.2865760784222786e-05, + "loss": 1.3634, + "step": 18612 + }, + { + "epoch": 0.6665711676544845, + "grad_norm": 2.0365536212921143, + "learning_rate": 5.2855531348256424e-05, + "loss": 1.6613, + "step": 18613 + }, + { + "epoch": 0.6666069797840528, + "grad_norm": 1.3029762506484985, + "learning_rate": 5.2845302546558105e-05, + "loss": 1.4112, + "step": 18614 + }, + { + "epoch": 0.6666427919136212, + "grad_norm": 2.367351770401001, + "learning_rate": 5.283507437926534e-05, + "loss": 1.5832, + "step": 18615 + }, + { + "epoch": 0.6666786040431895, + "grad_norm": 1.4028692245483398, + "learning_rate": 5.2824846846515886e-05, + "loss": 1.381, + "step": 18616 + }, + { + "epoch": 0.6667144161727577, + "grad_norm": 1.3128471374511719, + "learning_rate": 5.281461994844723e-05, + "loss": 1.2977, + "step": 18617 + }, + { + "epoch": 0.666750228302326, + "grad_norm": 1.9322855472564697, + "learning_rate": 5.280439368519703e-05, + "loss": 1.5668, + "step": 18618 + }, + { + "epoch": 0.6667860404318943, + "grad_norm": 2.173131227493286, + "learning_rate": 5.27941680569028e-05, + "loss": 1.5765, + "step": 18619 + }, + { + "epoch": 0.6668218525614625, + "grad_norm": 1.7341821193695068, + "learning_rate": 5.2783943063702155e-05, + "loss": 1.6393, + "step": 18620 + }, + { + "epoch": 0.6668576646910308, + "grad_norm": 2.0364861488342285, + "learning_rate": 5.277371870573269e-05, + "loss": 1.6499, + "step": 18621 + }, + { + "epoch": 0.6668934768205992, + "grad_norm": 1.3587638139724731, + "learning_rate": 5.276349498313188e-05, + "loss": 1.3826, + "step": 18622 + }, + { + "epoch": 0.6669292889501675, + "grad_norm": 1.7243382930755615, + "learning_rate": 5.2753271896037316e-05, + "loss": 1.3895, + "step": 18623 + }, + { + "epoch": 0.6669651010797357, + "grad_norm": 1.643670916557312, + "learning_rate": 5.274304944458652e-05, + "loss": 1.5037, + "step": 18624 + }, + { + "epoch": 0.667000913209304, + "grad_norm": 2.013648509979248, + "learning_rate": 5.273282762891709e-05, + "loss": 1.341, + "step": 18625 + }, + { + "epoch": 0.6670367253388723, + "grad_norm": 1.607382893562317, + "learning_rate": 5.2722606449166426e-05, + "loss": 1.2063, + "step": 18626 + }, + { + "epoch": 0.6670725374684405, + "grad_norm": 1.6168111562728882, + "learning_rate": 5.271238590547216e-05, + "loss": 1.5996, + "step": 18627 + }, + { + "epoch": 0.6671083495980088, + "grad_norm": 1.732462763786316, + "learning_rate": 5.270216599797176e-05, + "loss": 1.466, + "step": 18628 + }, + { + "epoch": 0.6671441617275772, + "grad_norm": 1.7212103605270386, + "learning_rate": 5.269194672680267e-05, + "loss": 1.5072, + "step": 18629 + }, + { + "epoch": 0.6671799738571454, + "grad_norm": 1.5116984844207764, + "learning_rate": 5.268172809210241e-05, + "loss": 1.268, + "step": 18630 + }, + { + "epoch": 0.6672157859867137, + "grad_norm": 1.6461093425750732, + "learning_rate": 5.267151009400846e-05, + "loss": 1.543, + "step": 18631 + }, + { + "epoch": 0.667251598116282, + "grad_norm": 1.5249959230422974, + "learning_rate": 5.266129273265834e-05, + "loss": 1.5185, + "step": 18632 + }, + { + "epoch": 0.6672874102458503, + "grad_norm": 1.881767749786377, + "learning_rate": 5.2651076008189415e-05, + "loss": 1.3663, + "step": 18633 + }, + { + "epoch": 0.6673232223754185, + "grad_norm": 1.7331863641738892, + "learning_rate": 5.2640859920739194e-05, + "loss": 1.5388, + "step": 18634 + }, + { + "epoch": 0.6673590345049868, + "grad_norm": 1.5982319116592407, + "learning_rate": 5.263064447044511e-05, + "loss": 1.6447, + "step": 18635 + }, + { + "epoch": 0.6673948466345552, + "grad_norm": 1.6819671392440796, + "learning_rate": 5.262042965744465e-05, + "loss": 1.4766, + "step": 18636 + }, + { + "epoch": 0.6674306587641234, + "grad_norm": 2.2758350372314453, + "learning_rate": 5.261021548187515e-05, + "loss": 1.1752, + "step": 18637 + }, + { + "epoch": 0.6674664708936917, + "grad_norm": 1.5425456762313843, + "learning_rate": 5.260000194387407e-05, + "loss": 1.1864, + "step": 18638 + }, + { + "epoch": 0.66750228302326, + "grad_norm": 1.5215123891830444, + "learning_rate": 5.2589789043578855e-05, + "loss": 1.2848, + "step": 18639 + }, + { + "epoch": 0.6675380951528282, + "grad_norm": 1.7015901803970337, + "learning_rate": 5.257957678112684e-05, + "loss": 1.3467, + "step": 18640 + }, + { + "epoch": 0.6675739072823965, + "grad_norm": 1.6739884614944458, + "learning_rate": 5.2569365156655446e-05, + "loss": 1.7775, + "step": 18641 + }, + { + "epoch": 0.6676097194119648, + "grad_norm": 1.2966762781143188, + "learning_rate": 5.255915417030206e-05, + "loss": 1.268, + "step": 18642 + }, + { + "epoch": 0.6676455315415332, + "grad_norm": 1.2526237964630127, + "learning_rate": 5.254894382220412e-05, + "loss": 1.3321, + "step": 18643 + }, + { + "epoch": 0.6676813436711014, + "grad_norm": 1.7547239065170288, + "learning_rate": 5.2538734112498876e-05, + "loss": 1.5343, + "step": 18644 + }, + { + "epoch": 0.6677171558006697, + "grad_norm": 1.8325891494750977, + "learning_rate": 5.252852504132375e-05, + "loss": 1.3332, + "step": 18645 + }, + { + "epoch": 0.667752967930238, + "grad_norm": 2.117971658706665, + "learning_rate": 5.251831660881612e-05, + "loss": 1.454, + "step": 18646 + }, + { + "epoch": 0.6677887800598062, + "grad_norm": 1.859230875968933, + "learning_rate": 5.2508108815113264e-05, + "loss": 1.4038, + "step": 18647 + }, + { + "epoch": 0.6678245921893745, + "grad_norm": 2.172140121459961, + "learning_rate": 5.249790166035253e-05, + "loss": 1.1993, + "step": 18648 + }, + { + "epoch": 0.6678604043189428, + "grad_norm": 1.781510353088379, + "learning_rate": 5.2487695144671264e-05, + "loss": 1.5604, + "step": 18649 + }, + { + "epoch": 0.6678962164485112, + "grad_norm": 1.547957181930542, + "learning_rate": 5.247748926820683e-05, + "loss": 1.305, + "step": 18650 + }, + { + "epoch": 0.6679320285780794, + "grad_norm": 1.5986849069595337, + "learning_rate": 5.246728403109642e-05, + "loss": 1.8007, + "step": 18651 + }, + { + "epoch": 0.6679678407076477, + "grad_norm": 1.7270187139511108, + "learning_rate": 5.245707943347738e-05, + "loss": 1.1008, + "step": 18652 + }, + { + "epoch": 0.668003652837216, + "grad_norm": 1.8409010171890259, + "learning_rate": 5.244687547548703e-05, + "loss": 1.206, + "step": 18653 + }, + { + "epoch": 0.6680394649667842, + "grad_norm": 1.8632875680923462, + "learning_rate": 5.243667215726267e-05, + "loss": 1.5249, + "step": 18654 + }, + { + "epoch": 0.6680752770963525, + "grad_norm": 1.8973302841186523, + "learning_rate": 5.242646947894148e-05, + "loss": 1.3558, + "step": 18655 + }, + { + "epoch": 0.6681110892259208, + "grad_norm": 1.471661925315857, + "learning_rate": 5.241626744066079e-05, + "loss": 1.5711, + "step": 18656 + }, + { + "epoch": 0.6681469013554892, + "grad_norm": 1.3317395448684692, + "learning_rate": 5.240606604255787e-05, + "loss": 1.1836, + "step": 18657 + }, + { + "epoch": 0.6681827134850574, + "grad_norm": 1.742253065109253, + "learning_rate": 5.239586528476992e-05, + "loss": 1.5158, + "step": 18658 + }, + { + "epoch": 0.6682185256146257, + "grad_norm": 1.6909421682357788, + "learning_rate": 5.2385665167434175e-05, + "loss": 1.2893, + "step": 18659 + }, + { + "epoch": 0.668254337744194, + "grad_norm": 1.7493952512741089, + "learning_rate": 5.2375465690687895e-05, + "loss": 1.4646, + "step": 18660 + }, + { + "epoch": 0.6682901498737622, + "grad_norm": 1.4725412130355835, + "learning_rate": 5.236526685466834e-05, + "loss": 1.5871, + "step": 18661 + }, + { + "epoch": 0.6683259620033305, + "grad_norm": 1.6244035959243774, + "learning_rate": 5.235506865951263e-05, + "loss": 1.4979, + "step": 18662 + }, + { + "epoch": 0.6683617741328988, + "grad_norm": 1.7314326763153076, + "learning_rate": 5.234487110535802e-05, + "loss": 1.364, + "step": 18663 + }, + { + "epoch": 0.6683975862624671, + "grad_norm": 1.394898772239685, + "learning_rate": 5.233467419234173e-05, + "loss": 1.3766, + "step": 18664 + }, + { + "epoch": 0.6684333983920354, + "grad_norm": 1.64047110080719, + "learning_rate": 5.2324477920600876e-05, + "loss": 1.3212, + "step": 18665 + }, + { + "epoch": 0.6684692105216037, + "grad_norm": 1.883249044418335, + "learning_rate": 5.231428229027269e-05, + "loss": 1.3998, + "step": 18666 + }, + { + "epoch": 0.668505022651172, + "grad_norm": 1.5769582986831665, + "learning_rate": 5.23040873014943e-05, + "loss": 1.3623, + "step": 18667 + }, + { + "epoch": 0.6685408347807402, + "grad_norm": 1.6904163360595703, + "learning_rate": 5.229389295440295e-05, + "loss": 1.5583, + "step": 18668 + }, + { + "epoch": 0.6685766469103085, + "grad_norm": 1.9609761238098145, + "learning_rate": 5.228369924913567e-05, + "loss": 1.2652, + "step": 18669 + }, + { + "epoch": 0.6686124590398768, + "grad_norm": 1.8444284200668335, + "learning_rate": 5.22735061858297e-05, + "loss": 1.3636, + "step": 18670 + }, + { + "epoch": 0.6686482711694451, + "grad_norm": 1.3868964910507202, + "learning_rate": 5.2263313764622124e-05, + "loss": 1.17, + "step": 18671 + }, + { + "epoch": 0.6686840832990134, + "grad_norm": 1.3549922704696655, + "learning_rate": 5.225312198565013e-05, + "loss": 1.4785, + "step": 18672 + }, + { + "epoch": 0.6687198954285817, + "grad_norm": 1.5612128973007202, + "learning_rate": 5.224293084905074e-05, + "loss": 1.3645, + "step": 18673 + }, + { + "epoch": 0.66875570755815, + "grad_norm": 1.5936546325683594, + "learning_rate": 5.223274035496113e-05, + "loss": 1.5845, + "step": 18674 + }, + { + "epoch": 0.6687915196877182, + "grad_norm": 1.3002891540527344, + "learning_rate": 5.222255050351841e-05, + "loss": 1.4866, + "step": 18675 + }, + { + "epoch": 0.6688273318172865, + "grad_norm": 1.6563246250152588, + "learning_rate": 5.221236129485961e-05, + "loss": 1.1432, + "step": 18676 + }, + { + "epoch": 0.6688631439468548, + "grad_norm": 1.0846136808395386, + "learning_rate": 5.2202172729121844e-05, + "loss": 1.3841, + "step": 18677 + }, + { + "epoch": 0.6688989560764231, + "grad_norm": 1.6632499694824219, + "learning_rate": 5.219198480644221e-05, + "loss": 1.7012, + "step": 18678 + }, + { + "epoch": 0.6689347682059914, + "grad_norm": 1.6548495292663574, + "learning_rate": 5.2181797526957764e-05, + "loss": 1.4284, + "step": 18679 + }, + { + "epoch": 0.6689705803355597, + "grad_norm": 1.82999849319458, + "learning_rate": 5.2171610890805524e-05, + "loss": 1.2064, + "step": 18680 + }, + { + "epoch": 0.6690063924651279, + "grad_norm": 1.543445348739624, + "learning_rate": 5.216142489812256e-05, + "loss": 1.4014, + "step": 18681 + }, + { + "epoch": 0.6690422045946962, + "grad_norm": 2.067296028137207, + "learning_rate": 5.215123954904596e-05, + "loss": 1.4993, + "step": 18682 + }, + { + "epoch": 0.6690780167242645, + "grad_norm": 1.2235534191131592, + "learning_rate": 5.2141054843712675e-05, + "loss": 1.3685, + "step": 18683 + }, + { + "epoch": 0.6691138288538327, + "grad_norm": 1.3292855024337769, + "learning_rate": 5.213087078225975e-05, + "loss": 1.5753, + "step": 18684 + }, + { + "epoch": 0.6691496409834011, + "grad_norm": 1.5418928861618042, + "learning_rate": 5.212068736482423e-05, + "loss": 1.7125, + "step": 18685 + }, + { + "epoch": 0.6691854531129694, + "grad_norm": 1.34342360496521, + "learning_rate": 5.211050459154313e-05, + "loss": 1.6613, + "step": 18686 + }, + { + "epoch": 0.6692212652425377, + "grad_norm": 1.657388687133789, + "learning_rate": 5.210032246255338e-05, + "loss": 1.372, + "step": 18687 + }, + { + "epoch": 0.6692570773721059, + "grad_norm": 1.9113507270812988, + "learning_rate": 5.209014097799201e-05, + "loss": 1.6323, + "step": 18688 + }, + { + "epoch": 0.6692928895016742, + "grad_norm": 1.7546972036361694, + "learning_rate": 5.207996013799603e-05, + "loss": 1.4294, + "step": 18689 + }, + { + "epoch": 0.6693287016312425, + "grad_norm": 2.2239842414855957, + "learning_rate": 5.206977994270233e-05, + "loss": 1.5034, + "step": 18690 + }, + { + "epoch": 0.6693645137608107, + "grad_norm": 2.2625253200531006, + "learning_rate": 5.205960039224795e-05, + "loss": 1.7117, + "step": 18691 + }, + { + "epoch": 0.6694003258903791, + "grad_norm": 2.014033079147339, + "learning_rate": 5.2049421486769744e-05, + "loss": 1.5873, + "step": 18692 + }, + { + "epoch": 0.6694361380199474, + "grad_norm": 1.4991728067398071, + "learning_rate": 5.203924322640479e-05, + "loss": 1.2493, + "step": 18693 + }, + { + "epoch": 0.6694719501495157, + "grad_norm": 1.4453134536743164, + "learning_rate": 5.2029065611289926e-05, + "loss": 1.5998, + "step": 18694 + }, + { + "epoch": 0.6695077622790839, + "grad_norm": 1.5553264617919922, + "learning_rate": 5.2018888641562126e-05, + "loss": 1.2448, + "step": 18695 + }, + { + "epoch": 0.6695435744086522, + "grad_norm": 1.590145230293274, + "learning_rate": 5.200871231735822e-05, + "loss": 1.3542, + "step": 18696 + }, + { + "epoch": 0.6695793865382205, + "grad_norm": 1.796265959739685, + "learning_rate": 5.1998536638815266e-05, + "loss": 1.4915, + "step": 18697 + }, + { + "epoch": 0.6696151986677887, + "grad_norm": 1.4939812421798706, + "learning_rate": 5.198836160607008e-05, + "loss": 1.4749, + "step": 18698 + }, + { + "epoch": 0.6696510107973571, + "grad_norm": 1.3615853786468506, + "learning_rate": 5.197818721925949e-05, + "loss": 1.4405, + "step": 18699 + }, + { + "epoch": 0.6696868229269254, + "grad_norm": 1.585461974143982, + "learning_rate": 5.196801347852051e-05, + "loss": 1.3092, + "step": 18700 + }, + { + "epoch": 0.6697226350564937, + "grad_norm": 1.6997668743133545, + "learning_rate": 5.195784038398992e-05, + "loss": 1.5083, + "step": 18701 + }, + { + "epoch": 0.6697584471860619, + "grad_norm": 1.4121911525726318, + "learning_rate": 5.194766793580466e-05, + "loss": 1.1211, + "step": 18702 + }, + { + "epoch": 0.6697942593156302, + "grad_norm": 1.5166962146759033, + "learning_rate": 5.193749613410146e-05, + "loss": 1.2174, + "step": 18703 + }, + { + "epoch": 0.6698300714451985, + "grad_norm": 1.7505559921264648, + "learning_rate": 5.1927324979017335e-05, + "loss": 1.4222, + "step": 18704 + }, + { + "epoch": 0.6698658835747667, + "grad_norm": 1.718768835067749, + "learning_rate": 5.191715447068901e-05, + "loss": 1.2597, + "step": 18705 + }, + { + "epoch": 0.6699016957043351, + "grad_norm": 1.9997526407241821, + "learning_rate": 5.190698460925338e-05, + "loss": 1.5697, + "step": 18706 + }, + { + "epoch": 0.6699375078339034, + "grad_norm": 1.2355728149414062, + "learning_rate": 5.1896815394847195e-05, + "loss": 1.5054, + "step": 18707 + }, + { + "epoch": 0.6699733199634716, + "grad_norm": 1.4850220680236816, + "learning_rate": 5.188664682760731e-05, + "loss": 1.6727, + "step": 18708 + }, + { + "epoch": 0.6700091320930399, + "grad_norm": 1.334753155708313, + "learning_rate": 5.1876478907670576e-05, + "loss": 1.4472, + "step": 18709 + }, + { + "epoch": 0.6700449442226082, + "grad_norm": 1.5342482328414917, + "learning_rate": 5.186631163517367e-05, + "loss": 1.2759, + "step": 18710 + }, + { + "epoch": 0.6700807563521765, + "grad_norm": 1.6298881769180298, + "learning_rate": 5.185614501025353e-05, + "loss": 1.4384, + "step": 18711 + }, + { + "epoch": 0.6701165684817447, + "grad_norm": 1.4282634258270264, + "learning_rate": 5.184597903304681e-05, + "loss": 1.4343, + "step": 18712 + }, + { + "epoch": 0.6701523806113131, + "grad_norm": 1.7089579105377197, + "learning_rate": 5.183581370369037e-05, + "loss": 1.67, + "step": 18713 + }, + { + "epoch": 0.6701881927408814, + "grad_norm": 1.507480502128601, + "learning_rate": 5.182564902232086e-05, + "loss": 1.4983, + "step": 18714 + }, + { + "epoch": 0.6702240048704496, + "grad_norm": 1.9724632501602173, + "learning_rate": 5.1815484989075157e-05, + "loss": 1.4372, + "step": 18715 + }, + { + "epoch": 0.6702598170000179, + "grad_norm": 1.533915400505066, + "learning_rate": 5.1805321604089974e-05, + "loss": 1.4643, + "step": 18716 + }, + { + "epoch": 0.6702956291295862, + "grad_norm": 1.8036576509475708, + "learning_rate": 5.1795158867501966e-05, + "loss": 1.322, + "step": 18717 + }, + { + "epoch": 0.6703314412591544, + "grad_norm": 1.8307304382324219, + "learning_rate": 5.1784996779447926e-05, + "loss": 1.4068, + "step": 18718 + }, + { + "epoch": 0.6703672533887227, + "grad_norm": 1.794142484664917, + "learning_rate": 5.177483534006455e-05, + "loss": 1.6021, + "step": 18719 + }, + { + "epoch": 0.6704030655182911, + "grad_norm": 1.9921245574951172, + "learning_rate": 5.1764674549488614e-05, + "loss": 1.0792, + "step": 18720 + }, + { + "epoch": 0.6704388776478594, + "grad_norm": 1.671963095664978, + "learning_rate": 5.175451440785671e-05, + "loss": 1.7145, + "step": 18721 + }, + { + "epoch": 0.6704746897774276, + "grad_norm": 2.1620876789093018, + "learning_rate": 5.174435491530559e-05, + "loss": 1.6685, + "step": 18722 + }, + { + "epoch": 0.6705105019069959, + "grad_norm": 1.408326268196106, + "learning_rate": 5.173419607197193e-05, + "loss": 1.1387, + "step": 18723 + }, + { + "epoch": 0.6705463140365642, + "grad_norm": 1.5527623891830444, + "learning_rate": 5.172403787799245e-05, + "loss": 1.1697, + "step": 18724 + }, + { + "epoch": 0.6705821261661324, + "grad_norm": 1.6615278720855713, + "learning_rate": 5.1713880333503704e-05, + "loss": 1.6103, + "step": 18725 + }, + { + "epoch": 0.6706179382957007, + "grad_norm": 2.0983941555023193, + "learning_rate": 5.1703723438642436e-05, + "loss": 1.6692, + "step": 18726 + }, + { + "epoch": 0.6706537504252691, + "grad_norm": 1.7213473320007324, + "learning_rate": 5.16935671935453e-05, + "loss": 1.4165, + "step": 18727 + }, + { + "epoch": 0.6706895625548374, + "grad_norm": 2.2718751430511475, + "learning_rate": 5.1683411598348876e-05, + "loss": 1.4563, + "step": 18728 + }, + { + "epoch": 0.6707253746844056, + "grad_norm": 1.7657798528671265, + "learning_rate": 5.167325665318983e-05, + "loss": 1.6676, + "step": 18729 + }, + { + "epoch": 0.6707611868139739, + "grad_norm": 1.8499521017074585, + "learning_rate": 5.1663102358204754e-05, + "loss": 1.5885, + "step": 18730 + }, + { + "epoch": 0.6707969989435422, + "grad_norm": 1.6801620721817017, + "learning_rate": 5.165294871353035e-05, + "loss": 1.2257, + "step": 18731 + }, + { + "epoch": 0.6708328110731104, + "grad_norm": 1.3649678230285645, + "learning_rate": 5.16427957193031e-05, + "loss": 1.4112, + "step": 18732 + }, + { + "epoch": 0.6708686232026787, + "grad_norm": 1.5700057744979858, + "learning_rate": 5.163264337565967e-05, + "loss": 1.5946, + "step": 18733 + }, + { + "epoch": 0.6709044353322471, + "grad_norm": 1.8407195806503296, + "learning_rate": 5.1622491682736675e-05, + "loss": 1.2455, + "step": 18734 + }, + { + "epoch": 0.6709402474618154, + "grad_norm": 2.2659173011779785, + "learning_rate": 5.16123406406706e-05, + "loss": 1.8748, + "step": 18735 + }, + { + "epoch": 0.6709760595913836, + "grad_norm": 2.0498504638671875, + "learning_rate": 5.160219024959807e-05, + "loss": 1.9133, + "step": 18736 + }, + { + "epoch": 0.6710118717209519, + "grad_norm": 1.556580662727356, + "learning_rate": 5.159204050965565e-05, + "loss": 1.183, + "step": 18737 + }, + { + "epoch": 0.6710476838505202, + "grad_norm": 1.2161188125610352, + "learning_rate": 5.158189142097991e-05, + "loss": 1.2662, + "step": 18738 + }, + { + "epoch": 0.6710834959800884, + "grad_norm": 1.29056715965271, + "learning_rate": 5.157174298370734e-05, + "loss": 1.2986, + "step": 18739 + }, + { + "epoch": 0.6711193081096567, + "grad_norm": 2.0055019855499268, + "learning_rate": 5.15615951979745e-05, + "loss": 1.2297, + "step": 18740 + }, + { + "epoch": 0.6711551202392251, + "grad_norm": 1.7979457378387451, + "learning_rate": 5.155144806391789e-05, + "loss": 1.413, + "step": 18741 + }, + { + "epoch": 0.6711909323687933, + "grad_norm": 1.5609806776046753, + "learning_rate": 5.154130158167412e-05, + "loss": 1.4996, + "step": 18742 + }, + { + "epoch": 0.6712267444983616, + "grad_norm": 1.425581455230713, + "learning_rate": 5.153115575137959e-05, + "loss": 1.2355, + "step": 18743 + }, + { + "epoch": 0.6712625566279299, + "grad_norm": 2.3990538120269775, + "learning_rate": 5.152101057317082e-05, + "loss": 1.3091, + "step": 18744 + }, + { + "epoch": 0.6712983687574982, + "grad_norm": 1.8333427906036377, + "learning_rate": 5.151086604718438e-05, + "loss": 1.1728, + "step": 18745 + }, + { + "epoch": 0.6713341808870664, + "grad_norm": 1.5617541074752808, + "learning_rate": 5.150072217355664e-05, + "loss": 1.1859, + "step": 18746 + }, + { + "epoch": 0.6713699930166347, + "grad_norm": 1.6639366149902344, + "learning_rate": 5.149057895242412e-05, + "loss": 1.5773, + "step": 18747 + }, + { + "epoch": 0.6714058051462031, + "grad_norm": 1.8866615295410156, + "learning_rate": 5.148043638392329e-05, + "loss": 1.2715, + "step": 18748 + }, + { + "epoch": 0.6714416172757713, + "grad_norm": 1.8568611145019531, + "learning_rate": 5.147029446819065e-05, + "loss": 1.4749, + "step": 18749 + }, + { + "epoch": 0.6714774294053396, + "grad_norm": 2.128819704055786, + "learning_rate": 5.146015320536255e-05, + "loss": 1.2029, + "step": 18750 + }, + { + "epoch": 0.6715132415349079, + "grad_norm": 1.3638184070587158, + "learning_rate": 5.145001259557548e-05, + "loss": 1.3943, + "step": 18751 + }, + { + "epoch": 0.6715490536644761, + "grad_norm": 1.7398433685302734, + "learning_rate": 5.14398726389659e-05, + "loss": 1.082, + "step": 18752 + }, + { + "epoch": 0.6715848657940444, + "grad_norm": 2.225938320159912, + "learning_rate": 5.142973333567016e-05, + "loss": 1.4355, + "step": 18753 + }, + { + "epoch": 0.6716206779236127, + "grad_norm": 2.0473129749298096, + "learning_rate": 5.141959468582471e-05, + "loss": 1.4406, + "step": 18754 + }, + { + "epoch": 0.6716564900531811, + "grad_norm": 1.4689384698867798, + "learning_rate": 5.140945668956595e-05, + "loss": 1.4992, + "step": 18755 + }, + { + "epoch": 0.6716923021827493, + "grad_norm": 1.6381616592407227, + "learning_rate": 5.1399319347030306e-05, + "loss": 1.6428, + "step": 18756 + }, + { + "epoch": 0.6717281143123176, + "grad_norm": 1.9008187055587769, + "learning_rate": 5.1389182658354105e-05, + "loss": 1.8243, + "step": 18757 + }, + { + "epoch": 0.6717639264418859, + "grad_norm": 3.256089448928833, + "learning_rate": 5.137904662367373e-05, + "loss": 1.5297, + "step": 18758 + }, + { + "epoch": 0.6717997385714541, + "grad_norm": 1.397004246711731, + "learning_rate": 5.136891124312557e-05, + "loss": 1.2896, + "step": 18759 + }, + { + "epoch": 0.6718355507010224, + "grad_norm": 1.8906328678131104, + "learning_rate": 5.135877651684603e-05, + "loss": 1.5225, + "step": 18760 + }, + { + "epoch": 0.6718713628305907, + "grad_norm": 1.9930126667022705, + "learning_rate": 5.1348642444971364e-05, + "loss": 1.4106, + "step": 18761 + }, + { + "epoch": 0.6719071749601591, + "grad_norm": 1.5230907201766968, + "learning_rate": 5.133850902763795e-05, + "loss": 1.4992, + "step": 18762 + }, + { + "epoch": 0.6719429870897273, + "grad_norm": 1.6010066270828247, + "learning_rate": 5.132837626498217e-05, + "loss": 1.4002, + "step": 18763 + }, + { + "epoch": 0.6719787992192956, + "grad_norm": 1.5590764284133911, + "learning_rate": 5.1318244157140285e-05, + "loss": 1.2998, + "step": 18764 + }, + { + "epoch": 0.6720146113488639, + "grad_norm": 1.6329294443130493, + "learning_rate": 5.13081127042486e-05, + "loss": 1.1118, + "step": 18765 + }, + { + "epoch": 0.6720504234784321, + "grad_norm": 1.5088551044464111, + "learning_rate": 5.129798190644348e-05, + "loss": 1.2605, + "step": 18766 + }, + { + "epoch": 0.6720862356080004, + "grad_norm": 1.7145344018936157, + "learning_rate": 5.128785176386122e-05, + "loss": 1.3524, + "step": 18767 + }, + { + "epoch": 0.6721220477375687, + "grad_norm": 1.9711799621582031, + "learning_rate": 5.127772227663803e-05, + "loss": 1.4702, + "step": 18768 + }, + { + "epoch": 0.672157859867137, + "grad_norm": 2.004667043685913, + "learning_rate": 5.1267593444910254e-05, + "loss": 1.5904, + "step": 18769 + }, + { + "epoch": 0.6721936719967053, + "grad_norm": 2.5094735622406006, + "learning_rate": 5.125746526881417e-05, + "loss": 1.4116, + "step": 18770 + }, + { + "epoch": 0.6722294841262736, + "grad_norm": 1.3970831632614136, + "learning_rate": 5.1247337748486005e-05, + "loss": 1.3415, + "step": 18771 + }, + { + "epoch": 0.6722652962558419, + "grad_norm": 1.901026725769043, + "learning_rate": 5.1237210884061994e-05, + "loss": 1.382, + "step": 18772 + }, + { + "epoch": 0.6723011083854101, + "grad_norm": 1.8950276374816895, + "learning_rate": 5.1227084675678425e-05, + "loss": 1.2152, + "step": 18773 + }, + { + "epoch": 0.6723369205149784, + "grad_norm": 1.730116844177246, + "learning_rate": 5.121695912347156e-05, + "loss": 1.4224, + "step": 18774 + }, + { + "epoch": 0.6723727326445467, + "grad_norm": 1.482043743133545, + "learning_rate": 5.120683422757755e-05, + "loss": 1.526, + "step": 18775 + }, + { + "epoch": 0.672408544774115, + "grad_norm": 1.451438069343567, + "learning_rate": 5.119670998813264e-05, + "loss": 1.4829, + "step": 18776 + }, + { + "epoch": 0.6724443569036833, + "grad_norm": 2.14319109916687, + "learning_rate": 5.1186586405273055e-05, + "loss": 1.3499, + "step": 18777 + }, + { + "epoch": 0.6724801690332516, + "grad_norm": 2.263322353363037, + "learning_rate": 5.117646347913501e-05, + "loss": 1.144, + "step": 18778 + }, + { + "epoch": 0.6725159811628199, + "grad_norm": 1.8152785301208496, + "learning_rate": 5.116634120985467e-05, + "loss": 1.3347, + "step": 18779 + }, + { + "epoch": 0.6725517932923881, + "grad_norm": 1.4670072793960571, + "learning_rate": 5.115621959756815e-05, + "loss": 1.3096, + "step": 18780 + }, + { + "epoch": 0.6725876054219564, + "grad_norm": 1.4545962810516357, + "learning_rate": 5.1146098642411765e-05, + "loss": 1.4713, + "step": 18781 + }, + { + "epoch": 0.6726234175515247, + "grad_norm": 1.4335076808929443, + "learning_rate": 5.113597834452157e-05, + "loss": 1.323, + "step": 18782 + }, + { + "epoch": 0.672659229681093, + "grad_norm": 1.4578428268432617, + "learning_rate": 5.1125858704033745e-05, + "loss": 1.6607, + "step": 18783 + }, + { + "epoch": 0.6726950418106613, + "grad_norm": 1.8013838529586792, + "learning_rate": 5.111573972108446e-05, + "loss": 1.7419, + "step": 18784 + }, + { + "epoch": 0.6727308539402296, + "grad_norm": 1.3669114112854004, + "learning_rate": 5.1105621395809875e-05, + "loss": 1.3873, + "step": 18785 + }, + { + "epoch": 0.6727666660697978, + "grad_norm": 1.4641414880752563, + "learning_rate": 5.1095503728346095e-05, + "loss": 1.4673, + "step": 18786 + }, + { + "epoch": 0.6728024781993661, + "grad_norm": 2.017226219177246, + "learning_rate": 5.108538671882914e-05, + "loss": 1.6612, + "step": 18787 + }, + { + "epoch": 0.6728382903289344, + "grad_norm": 1.617396593093872, + "learning_rate": 5.10752703673953e-05, + "loss": 1.5628, + "step": 18788 + }, + { + "epoch": 0.6728741024585027, + "grad_norm": 1.8963992595672607, + "learning_rate": 5.106515467418054e-05, + "loss": 1.2682, + "step": 18789 + }, + { + "epoch": 0.672909914588071, + "grad_norm": 2.118973731994629, + "learning_rate": 5.1055039639321046e-05, + "loss": 1.3527, + "step": 18790 + }, + { + "epoch": 0.6729457267176393, + "grad_norm": 1.6391757726669312, + "learning_rate": 5.104492526295278e-05, + "loss": 1.6404, + "step": 18791 + }, + { + "epoch": 0.6729815388472076, + "grad_norm": 1.6890900135040283, + "learning_rate": 5.103481154521197e-05, + "loss": 1.1516, + "step": 18792 + }, + { + "epoch": 0.6730173509767758, + "grad_norm": 2.0314505100250244, + "learning_rate": 5.102469848623459e-05, + "loss": 1.0655, + "step": 18793 + }, + { + "epoch": 0.6730531631063441, + "grad_norm": 1.655391812324524, + "learning_rate": 5.10145860861567e-05, + "loss": 1.3118, + "step": 18794 + }, + { + "epoch": 0.6730889752359124, + "grad_norm": 1.847886085510254, + "learning_rate": 5.1004474345114404e-05, + "loss": 1.6991, + "step": 18795 + }, + { + "epoch": 0.6731247873654806, + "grad_norm": 1.691873550415039, + "learning_rate": 5.099436326324367e-05, + "loss": 1.4669, + "step": 18796 + }, + { + "epoch": 0.673160599495049, + "grad_norm": 1.4754281044006348, + "learning_rate": 5.098425284068062e-05, + "loss": 1.2964, + "step": 18797 + }, + { + "epoch": 0.6731964116246173, + "grad_norm": 1.3968569040298462, + "learning_rate": 5.0974143077561135e-05, + "loss": 1.4885, + "step": 18798 + }, + { + "epoch": 0.6732322237541856, + "grad_norm": 1.5505691766738892, + "learning_rate": 5.0964033974021386e-05, + "loss": 1.4887, + "step": 18799 + }, + { + "epoch": 0.6732680358837538, + "grad_norm": 1.2982192039489746, + "learning_rate": 5.095392553019728e-05, + "loss": 1.4633, + "step": 18800 + }, + { + "epoch": 0.6733038480133221, + "grad_norm": 1.4484449625015259, + "learning_rate": 5.094381774622488e-05, + "loss": 1.5766, + "step": 18801 + }, + { + "epoch": 0.6733396601428904, + "grad_norm": 2.2219207286834717, + "learning_rate": 5.0933710622240036e-05, + "loss": 1.6196, + "step": 18802 + }, + { + "epoch": 0.6733754722724586, + "grad_norm": 1.9814118146896362, + "learning_rate": 5.0923604158378924e-05, + "loss": 1.5831, + "step": 18803 + }, + { + "epoch": 0.673411284402027, + "grad_norm": 2.133800983428955, + "learning_rate": 5.091349835477741e-05, + "loss": 1.3814, + "step": 18804 + }, + { + "epoch": 0.6734470965315953, + "grad_norm": 1.7097578048706055, + "learning_rate": 5.0903393211571414e-05, + "loss": 1.5687, + "step": 18805 + }, + { + "epoch": 0.6734829086611636, + "grad_norm": 1.711245059967041, + "learning_rate": 5.089328872889694e-05, + "loss": 1.2619, + "step": 18806 + }, + { + "epoch": 0.6735187207907318, + "grad_norm": 1.5886105298995972, + "learning_rate": 5.0883184906889924e-05, + "loss": 1.3635, + "step": 18807 + }, + { + "epoch": 0.6735545329203001, + "grad_norm": 1.8277711868286133, + "learning_rate": 5.087308174568632e-05, + "loss": 1.7714, + "step": 18808 + }, + { + "epoch": 0.6735903450498684, + "grad_norm": 1.5488297939300537, + "learning_rate": 5.086297924542198e-05, + "loss": 1.2493, + "step": 18809 + }, + { + "epoch": 0.6736261571794366, + "grad_norm": 1.1911660432815552, + "learning_rate": 5.085287740623292e-05, + "loss": 1.4918, + "step": 18810 + }, + { + "epoch": 0.673661969309005, + "grad_norm": 2.2458438873291016, + "learning_rate": 5.0842776228255e-05, + "loss": 1.3248, + "step": 18811 + }, + { + "epoch": 0.6736977814385733, + "grad_norm": 1.5594806671142578, + "learning_rate": 5.083267571162412e-05, + "loss": 1.5582, + "step": 18812 + }, + { + "epoch": 0.6737335935681416, + "grad_norm": 2.4863622188568115, + "learning_rate": 5.082257585647614e-05, + "loss": 1.5133, + "step": 18813 + }, + { + "epoch": 0.6737694056977098, + "grad_norm": 2.48881196975708, + "learning_rate": 5.0812476662946975e-05, + "loss": 1.3237, + "step": 18814 + }, + { + "epoch": 0.6738052178272781, + "grad_norm": 1.7107995748519897, + "learning_rate": 5.0802378131172525e-05, + "loss": 1.8406, + "step": 18815 + }, + { + "epoch": 0.6738410299568464, + "grad_norm": 1.973778247833252, + "learning_rate": 5.079228026128857e-05, + "loss": 1.2656, + "step": 18816 + }, + { + "epoch": 0.6738768420864146, + "grad_norm": 1.4093636274337769, + "learning_rate": 5.078218305343102e-05, + "loss": 1.5239, + "step": 18817 + }, + { + "epoch": 0.6739126542159829, + "grad_norm": 2.64251446723938, + "learning_rate": 5.07720865077357e-05, + "loss": 1.4003, + "step": 18818 + }, + { + "epoch": 0.6739484663455513, + "grad_norm": 1.3757315874099731, + "learning_rate": 5.0761990624338504e-05, + "loss": 1.4934, + "step": 18819 + }, + { + "epoch": 0.6739842784751195, + "grad_norm": 1.2844980955123901, + "learning_rate": 5.075189540337514e-05, + "loss": 1.5985, + "step": 18820 + }, + { + "epoch": 0.6740200906046878, + "grad_norm": 2.0259010791778564, + "learning_rate": 5.074180084498157e-05, + "loss": 1.5542, + "step": 18821 + }, + { + "epoch": 0.6740559027342561, + "grad_norm": 1.7492506504058838, + "learning_rate": 5.0731706949293525e-05, + "loss": 1.4885, + "step": 18822 + }, + { + "epoch": 0.6740917148638244, + "grad_norm": 1.6474558115005493, + "learning_rate": 5.072161371644677e-05, + "loss": 1.5, + "step": 18823 + }, + { + "epoch": 0.6741275269933926, + "grad_norm": 1.8024877309799194, + "learning_rate": 5.0711521146577156e-05, + "loss": 1.3702, + "step": 18824 + }, + { + "epoch": 0.6741633391229609, + "grad_norm": 1.5845979452133179, + "learning_rate": 5.070142923982043e-05, + "loss": 1.2484, + "step": 18825 + }, + { + "epoch": 0.6741991512525293, + "grad_norm": 1.974819540977478, + "learning_rate": 5.069133799631243e-05, + "loss": 1.4982, + "step": 18826 + }, + { + "epoch": 0.6742349633820975, + "grad_norm": 2.497584819793701, + "learning_rate": 5.0681247416188826e-05, + "loss": 1.5294, + "step": 18827 + }, + { + "epoch": 0.6742707755116658, + "grad_norm": 1.8491204977035522, + "learning_rate": 5.067115749958543e-05, + "loss": 1.4184, + "step": 18828 + }, + { + "epoch": 0.6743065876412341, + "grad_norm": 1.4940153360366821, + "learning_rate": 5.066106824663798e-05, + "loss": 1.4861, + "step": 18829 + }, + { + "epoch": 0.6743423997708023, + "grad_norm": 1.228395938873291, + "learning_rate": 5.065097965748224e-05, + "loss": 1.543, + "step": 18830 + }, + { + "epoch": 0.6743782119003706, + "grad_norm": 1.5438652038574219, + "learning_rate": 5.0640891732253905e-05, + "loss": 1.3178, + "step": 18831 + }, + { + "epoch": 0.6744140240299389, + "grad_norm": 1.4345051050186157, + "learning_rate": 5.063080447108868e-05, + "loss": 1.5065, + "step": 18832 + }, + { + "epoch": 0.6744498361595073, + "grad_norm": 1.7704910039901733, + "learning_rate": 5.0620717874122336e-05, + "loss": 1.4677, + "step": 18833 + }, + { + "epoch": 0.6744856482890755, + "grad_norm": 1.8506509065628052, + "learning_rate": 5.06106319414905e-05, + "loss": 1.8333, + "step": 18834 + }, + { + "epoch": 0.6745214604186438, + "grad_norm": 1.587844967842102, + "learning_rate": 5.0600546673328916e-05, + "loss": 1.3863, + "step": 18835 + }, + { + "epoch": 0.6745572725482121, + "grad_norm": 1.68564772605896, + "learning_rate": 5.059046206977325e-05, + "loss": 1.9733, + "step": 18836 + }, + { + "epoch": 0.6745930846777803, + "grad_norm": 1.5647304058074951, + "learning_rate": 5.0580378130959216e-05, + "loss": 1.8071, + "step": 18837 + }, + { + "epoch": 0.6746288968073486, + "grad_norm": 1.7359753847122192, + "learning_rate": 5.05702948570224e-05, + "loss": 1.1989, + "step": 18838 + }, + { + "epoch": 0.6746647089369169, + "grad_norm": 2.1853439807891846, + "learning_rate": 5.056021224809853e-05, + "loss": 1.3432, + "step": 18839 + }, + { + "epoch": 0.6747005210664853, + "grad_norm": 1.6081196069717407, + "learning_rate": 5.055013030432326e-05, + "loss": 1.1698, + "step": 18840 + }, + { + "epoch": 0.6747363331960535, + "grad_norm": 1.6497262716293335, + "learning_rate": 5.054004902583216e-05, + "loss": 1.4807, + "step": 18841 + }, + { + "epoch": 0.6747721453256218, + "grad_norm": 2.125016689300537, + "learning_rate": 5.052996841276091e-05, + "loss": 1.4742, + "step": 18842 + }, + { + "epoch": 0.6748079574551901, + "grad_norm": 1.5704431533813477, + "learning_rate": 5.0519888465245116e-05, + "loss": 1.5663, + "step": 18843 + }, + { + "epoch": 0.6748437695847583, + "grad_norm": 2.178602695465088, + "learning_rate": 5.050980918342043e-05, + "loss": 1.57, + "step": 18844 + }, + { + "epoch": 0.6748795817143266, + "grad_norm": 1.4927809238433838, + "learning_rate": 5.04997305674224e-05, + "loss": 1.3771, + "step": 18845 + }, + { + "epoch": 0.6749153938438949, + "grad_norm": 3.4965755939483643, + "learning_rate": 5.048965261738664e-05, + "loss": 1.6496, + "step": 18846 + }, + { + "epoch": 0.6749512059734633, + "grad_norm": 1.5973447561264038, + "learning_rate": 5.047957533344874e-05, + "loss": 1.3466, + "step": 18847 + }, + { + "epoch": 0.6749870181030315, + "grad_norm": 1.5947939157485962, + "learning_rate": 5.0469498715744314e-05, + "loss": 1.3644, + "step": 18848 + }, + { + "epoch": 0.6750228302325998, + "grad_norm": 1.60691237449646, + "learning_rate": 5.045942276440885e-05, + "loss": 1.6925, + "step": 18849 + }, + { + "epoch": 0.6750586423621681, + "grad_norm": 1.4345650672912598, + "learning_rate": 5.0449347479577946e-05, + "loss": 1.163, + "step": 18850 + }, + { + "epoch": 0.6750944544917363, + "grad_norm": 2.119398593902588, + "learning_rate": 5.043927286138721e-05, + "loss": 1.5569, + "step": 18851 + }, + { + "epoch": 0.6751302666213046, + "grad_norm": 1.8614598512649536, + "learning_rate": 5.0429198909972086e-05, + "loss": 1.5067, + "step": 18852 + }, + { + "epoch": 0.6751660787508729, + "grad_norm": 2.991887331008911, + "learning_rate": 5.041912562546813e-05, + "loss": 1.673, + "step": 18853 + }, + { + "epoch": 0.6752018908804412, + "grad_norm": 1.687841534614563, + "learning_rate": 5.040905300801091e-05, + "loss": 1.235, + "step": 18854 + }, + { + "epoch": 0.6752377030100095, + "grad_norm": 2.048767566680908, + "learning_rate": 5.039898105773594e-05, + "loss": 1.5541, + "step": 18855 + }, + { + "epoch": 0.6752735151395778, + "grad_norm": 1.4294754266738892, + "learning_rate": 5.038890977477866e-05, + "loss": 1.2713, + "step": 18856 + }, + { + "epoch": 0.675309327269146, + "grad_norm": 2.4173076152801514, + "learning_rate": 5.037883915927462e-05, + "loss": 1.4883, + "step": 18857 + }, + { + "epoch": 0.6753451393987143, + "grad_norm": 1.8582412004470825, + "learning_rate": 5.036876921135931e-05, + "loss": 1.2395, + "step": 18858 + }, + { + "epoch": 0.6753809515282826, + "grad_norm": 1.710937738418579, + "learning_rate": 5.035869993116816e-05, + "loss": 1.5521, + "step": 18859 + }, + { + "epoch": 0.6754167636578509, + "grad_norm": 1.5905033349990845, + "learning_rate": 5.034863131883667e-05, + "loss": 1.5004, + "step": 18860 + }, + { + "epoch": 0.6754525757874192, + "grad_norm": 1.7289782762527466, + "learning_rate": 5.03385633745003e-05, + "loss": 1.4988, + "step": 18861 + }, + { + "epoch": 0.6754883879169875, + "grad_norm": 1.712811827659607, + "learning_rate": 5.032849609829454e-05, + "loss": 1.6214, + "step": 18862 + }, + { + "epoch": 0.6755242000465558, + "grad_norm": 2.12488055229187, + "learning_rate": 5.0318429490354754e-05, + "loss": 1.4074, + "step": 18863 + }, + { + "epoch": 0.675560012176124, + "grad_norm": 1.458487629890442, + "learning_rate": 5.030836355081643e-05, + "loss": 1.3575, + "step": 18864 + }, + { + "epoch": 0.6755958243056923, + "grad_norm": 2.2130470275878906, + "learning_rate": 5.0298298279814956e-05, + "loss": 1.617, + "step": 18865 + }, + { + "epoch": 0.6756316364352606, + "grad_norm": 1.495819091796875, + "learning_rate": 5.0288233677485806e-05, + "loss": 1.4228, + "step": 18866 + }, + { + "epoch": 0.6756674485648289, + "grad_norm": 1.9383759498596191, + "learning_rate": 5.027816974396432e-05, + "loss": 1.2309, + "step": 18867 + }, + { + "epoch": 0.6757032606943972, + "grad_norm": 1.6474690437316895, + "learning_rate": 5.0268106479385924e-05, + "loss": 1.1464, + "step": 18868 + }, + { + "epoch": 0.6757390728239655, + "grad_norm": 1.8009663820266724, + "learning_rate": 5.025804388388604e-05, + "loss": 1.7125, + "step": 18869 + }, + { + "epoch": 0.6757748849535338, + "grad_norm": 1.71844482421875, + "learning_rate": 5.024798195759998e-05, + "loss": 1.6644, + "step": 18870 + }, + { + "epoch": 0.675810697083102, + "grad_norm": 1.4656023979187012, + "learning_rate": 5.023792070066313e-05, + "loss": 1.552, + "step": 18871 + }, + { + "epoch": 0.6758465092126703, + "grad_norm": 1.699833869934082, + "learning_rate": 5.022786011321089e-05, + "loss": 1.2069, + "step": 18872 + }, + { + "epoch": 0.6758823213422386, + "grad_norm": 1.6764025688171387, + "learning_rate": 5.021780019537862e-05, + "loss": 1.5697, + "step": 18873 + }, + { + "epoch": 0.6759181334718068, + "grad_norm": 1.3379871845245361, + "learning_rate": 5.02077409473016e-05, + "loss": 1.5602, + "step": 18874 + }, + { + "epoch": 0.6759539456013752, + "grad_norm": 2.3208539485931396, + "learning_rate": 5.019768236911519e-05, + "loss": 1.5169, + "step": 18875 + }, + { + "epoch": 0.6759897577309435, + "grad_norm": 1.4288512468338013, + "learning_rate": 5.018762446095476e-05, + "loss": 1.1977, + "step": 18876 + }, + { + "epoch": 0.6760255698605118, + "grad_norm": 1.9357956647872925, + "learning_rate": 5.017756722295557e-05, + "loss": 1.1796, + "step": 18877 + }, + { + "epoch": 0.67606138199008, + "grad_norm": 1.7917031049728394, + "learning_rate": 5.016751065525292e-05, + "loss": 1.3224, + "step": 18878 + }, + { + "epoch": 0.6760971941196483, + "grad_norm": 1.693039059638977, + "learning_rate": 5.015745475798215e-05, + "loss": 1.3998, + "step": 18879 + }, + { + "epoch": 0.6761330062492166, + "grad_norm": 1.3678598403930664, + "learning_rate": 5.014739953127857e-05, + "loss": 1.185, + "step": 18880 + }, + { + "epoch": 0.6761688183787848, + "grad_norm": 1.3098324537277222, + "learning_rate": 5.013734497527739e-05, + "loss": 1.3323, + "step": 18881 + }, + { + "epoch": 0.6762046305083532, + "grad_norm": 1.487381935119629, + "learning_rate": 5.0127291090113917e-05, + "loss": 1.3497, + "step": 18882 + }, + { + "epoch": 0.6762404426379215, + "grad_norm": 2.2838425636291504, + "learning_rate": 5.011723787592344e-05, + "loss": 1.5023, + "step": 18883 + }, + { + "epoch": 0.6762762547674898, + "grad_norm": 1.4968982934951782, + "learning_rate": 5.0107185332841155e-05, + "loss": 1.221, + "step": 18884 + }, + { + "epoch": 0.676312066897058, + "grad_norm": 2.1041419506073, + "learning_rate": 5.009713346100235e-05, + "loss": 2.0279, + "step": 18885 + }, + { + "epoch": 0.6763478790266263, + "grad_norm": 1.8831173181533813, + "learning_rate": 5.008708226054219e-05, + "loss": 1.6359, + "step": 18886 + }, + { + "epoch": 0.6763836911561946, + "grad_norm": 1.8215208053588867, + "learning_rate": 5.007703173159604e-05, + "loss": 1.1922, + "step": 18887 + }, + { + "epoch": 0.6764195032857628, + "grad_norm": 1.7861186265945435, + "learning_rate": 5.0066981874298967e-05, + "loss": 1.351, + "step": 18888 + }, + { + "epoch": 0.6764553154153312, + "grad_norm": 1.681621789932251, + "learning_rate": 5.0056932688786294e-05, + "loss": 1.6124, + "step": 18889 + }, + { + "epoch": 0.6764911275448995, + "grad_norm": 1.4928348064422607, + "learning_rate": 5.00468841751931e-05, + "loss": 1.384, + "step": 18890 + }, + { + "epoch": 0.6765269396744678, + "grad_norm": 1.405524730682373, + "learning_rate": 5.0036836333654715e-05, + "loss": 1.1791, + "step": 18891 + }, + { + "epoch": 0.676562751804036, + "grad_norm": 1.3856693506240845, + "learning_rate": 5.0026789164306255e-05, + "loss": 1.5605, + "step": 18892 + }, + { + "epoch": 0.6765985639336043, + "grad_norm": 1.7855710983276367, + "learning_rate": 5.00167426672828e-05, + "loss": 1.2509, + "step": 18893 + }, + { + "epoch": 0.6766343760631726, + "grad_norm": 1.4088751077651978, + "learning_rate": 5.000669684271968e-05, + "loss": 1.4611, + "step": 18894 + }, + { + "epoch": 0.6766701881927408, + "grad_norm": 1.4906492233276367, + "learning_rate": 4.999665169075193e-05, + "loss": 1.2085, + "step": 18895 + }, + { + "epoch": 0.6767060003223092, + "grad_norm": 1.3844395875930786, + "learning_rate": 4.998660721151476e-05, + "loss": 1.0361, + "step": 18896 + }, + { + "epoch": 0.6767418124518775, + "grad_norm": 1.425974726676941, + "learning_rate": 4.997656340514321e-05, + "loss": 1.4404, + "step": 18897 + }, + { + "epoch": 0.6767776245814457, + "grad_norm": 2.3073105812072754, + "learning_rate": 4.996652027177255e-05, + "loss": 1.4989, + "step": 18898 + }, + { + "epoch": 0.676813436711014, + "grad_norm": 2.7531237602233887, + "learning_rate": 4.995647781153778e-05, + "loss": 1.6744, + "step": 18899 + }, + { + "epoch": 0.6768492488405823, + "grad_norm": 2.012333869934082, + "learning_rate": 4.99464360245741e-05, + "loss": 1.572, + "step": 18900 + }, + { + "epoch": 0.6768850609701506, + "grad_norm": 1.7777117490768433, + "learning_rate": 4.9936394911016504e-05, + "loss": 1.5683, + "step": 18901 + }, + { + "epoch": 0.6769208730997188, + "grad_norm": 1.54745614528656, + "learning_rate": 4.992635447100015e-05, + "loss": 1.6978, + "step": 18902 + }, + { + "epoch": 0.6769566852292872, + "grad_norm": 2.0074431896209717, + "learning_rate": 4.9916314704660126e-05, + "loss": 1.3513, + "step": 18903 + }, + { + "epoch": 0.6769924973588555, + "grad_norm": 1.5793383121490479, + "learning_rate": 4.9906275612131424e-05, + "loss": 1.4361, + "step": 18904 + }, + { + "epoch": 0.6770283094884237, + "grad_norm": 2.9052045345306396, + "learning_rate": 4.9896237193549244e-05, + "loss": 1.671, + "step": 18905 + }, + { + "epoch": 0.677064121617992, + "grad_norm": 2.1424288749694824, + "learning_rate": 4.988619944904852e-05, + "loss": 1.5612, + "step": 18906 + }, + { + "epoch": 0.6770999337475603, + "grad_norm": 1.384803295135498, + "learning_rate": 4.987616237876438e-05, + "loss": 1.2562, + "step": 18907 + }, + { + "epoch": 0.6771357458771285, + "grad_norm": 1.6141088008880615, + "learning_rate": 4.9866125982831745e-05, + "loss": 1.4878, + "step": 18908 + }, + { + "epoch": 0.6771715580066968, + "grad_norm": 2.246306896209717, + "learning_rate": 4.9856090261385793e-05, + "loss": 1.3069, + "step": 18909 + }, + { + "epoch": 0.6772073701362652, + "grad_norm": 1.907278060913086, + "learning_rate": 4.984605521456146e-05, + "loss": 1.3377, + "step": 18910 + }, + { + "epoch": 0.6772431822658335, + "grad_norm": 1.491541862487793, + "learning_rate": 4.983602084249372e-05, + "loss": 1.7067, + "step": 18911 + }, + { + "epoch": 0.6772789943954017, + "grad_norm": 1.672043800354004, + "learning_rate": 4.982598714531762e-05, + "loss": 1.3417, + "step": 18912 + }, + { + "epoch": 0.67731480652497, + "grad_norm": 1.6884132623672485, + "learning_rate": 4.981595412316815e-05, + "loss": 1.5376, + "step": 18913 + }, + { + "epoch": 0.6773506186545383, + "grad_norm": 1.6035497188568115, + "learning_rate": 4.980592177618031e-05, + "loss": 1.2029, + "step": 18914 + }, + { + "epoch": 0.6773864307841065, + "grad_norm": 1.87796151638031, + "learning_rate": 4.979589010448902e-05, + "loss": 1.2361, + "step": 18915 + }, + { + "epoch": 0.6774222429136748, + "grad_norm": 1.6658316850662231, + "learning_rate": 4.978585910822926e-05, + "loss": 1.2257, + "step": 18916 + }, + { + "epoch": 0.6774580550432432, + "grad_norm": 1.7389546632766724, + "learning_rate": 4.977582878753599e-05, + "loss": 1.6042, + "step": 18917 + }, + { + "epoch": 0.6774938671728115, + "grad_norm": 1.2868759632110596, + "learning_rate": 4.9765799142544215e-05, + "loss": 1.479, + "step": 18918 + }, + { + "epoch": 0.6775296793023797, + "grad_norm": 1.504171371459961, + "learning_rate": 4.975577017338876e-05, + "loss": 1.2906, + "step": 18919 + }, + { + "epoch": 0.677565491431948, + "grad_norm": 1.5632758140563965, + "learning_rate": 4.9745741880204613e-05, + "loss": 1.4864, + "step": 18920 + }, + { + "epoch": 0.6776013035615163, + "grad_norm": 1.4741456508636475, + "learning_rate": 4.973571426312673e-05, + "loss": 1.2293, + "step": 18921 + }, + { + "epoch": 0.6776371156910845, + "grad_norm": 1.4424731731414795, + "learning_rate": 4.9725687322289926e-05, + "loss": 1.4578, + "step": 18922 + }, + { + "epoch": 0.6776729278206528, + "grad_norm": 1.7455040216445923, + "learning_rate": 4.971566105782916e-05, + "loss": 1.6631, + "step": 18923 + }, + { + "epoch": 0.6777087399502212, + "grad_norm": 2.1548445224761963, + "learning_rate": 4.9705635469879306e-05, + "loss": 1.4788, + "step": 18924 + }, + { + "epoch": 0.6777445520797895, + "grad_norm": 1.7783231735229492, + "learning_rate": 4.969561055857529e-05, + "loss": 1.2761, + "step": 18925 + }, + { + "epoch": 0.6777803642093577, + "grad_norm": 1.5366977453231812, + "learning_rate": 4.9685586324051915e-05, + "loss": 1.4405, + "step": 18926 + }, + { + "epoch": 0.677816176338926, + "grad_norm": 1.5162105560302734, + "learning_rate": 4.967556276644406e-05, + "loss": 1.2265, + "step": 18927 + }, + { + "epoch": 0.6778519884684943, + "grad_norm": 1.6410696506500244, + "learning_rate": 4.966553988588665e-05, + "loss": 1.3447, + "step": 18928 + }, + { + "epoch": 0.6778878005980625, + "grad_norm": 2.0513529777526855, + "learning_rate": 4.965551768251442e-05, + "loss": 1.3223, + "step": 18929 + }, + { + "epoch": 0.6779236127276308, + "grad_norm": 1.720237374305725, + "learning_rate": 4.9645496156462266e-05, + "loss": 1.5513, + "step": 18930 + }, + { + "epoch": 0.6779594248571992, + "grad_norm": 1.4774359464645386, + "learning_rate": 4.963547530786501e-05, + "loss": 1.555, + "step": 18931 + }, + { + "epoch": 0.6779952369867674, + "grad_norm": 1.7184780836105347, + "learning_rate": 4.962545513685751e-05, + "loss": 1.3769, + "step": 18932 + }, + { + "epoch": 0.6780310491163357, + "grad_norm": 1.2298649549484253, + "learning_rate": 4.961543564357449e-05, + "loss": 1.2371, + "step": 18933 + }, + { + "epoch": 0.678066861245904, + "grad_norm": 1.4467629194259644, + "learning_rate": 4.9605416828150795e-05, + "loss": 1.5372, + "step": 18934 + }, + { + "epoch": 0.6781026733754723, + "grad_norm": 1.4205254316329956, + "learning_rate": 4.959539869072121e-05, + "loss": 1.6665, + "step": 18935 + }, + { + "epoch": 0.6781384855050405, + "grad_norm": 1.92221999168396, + "learning_rate": 4.958538123142056e-05, + "loss": 1.1817, + "step": 18936 + }, + { + "epoch": 0.6781742976346088, + "grad_norm": 1.6638027429580688, + "learning_rate": 4.957536445038353e-05, + "loss": 1.3898, + "step": 18937 + }, + { + "epoch": 0.6782101097641772, + "grad_norm": 1.847952961921692, + "learning_rate": 4.9565348347744934e-05, + "loss": 1.5683, + "step": 18938 + }, + { + "epoch": 0.6782459218937454, + "grad_norm": 1.6598225831985474, + "learning_rate": 4.955533292363955e-05, + "loss": 1.3324, + "step": 18939 + }, + { + "epoch": 0.6782817340233137, + "grad_norm": 1.522696852684021, + "learning_rate": 4.954531817820206e-05, + "loss": 1.4984, + "step": 18940 + }, + { + "epoch": 0.678317546152882, + "grad_norm": 1.516022801399231, + "learning_rate": 4.953530411156724e-05, + "loss": 1.3997, + "step": 18941 + }, + { + "epoch": 0.6783533582824502, + "grad_norm": 1.4114199876785278, + "learning_rate": 4.95252907238698e-05, + "loss": 1.3475, + "step": 18942 + }, + { + "epoch": 0.6783891704120185, + "grad_norm": 1.7230263948440552, + "learning_rate": 4.95152780152445e-05, + "loss": 1.5503, + "step": 18943 + }, + { + "epoch": 0.6784249825415868, + "grad_norm": 1.6824709177017212, + "learning_rate": 4.9505265985825976e-05, + "loss": 1.6195, + "step": 18944 + }, + { + "epoch": 0.6784607946711552, + "grad_norm": 1.916304588317871, + "learning_rate": 4.9495254635748975e-05, + "loss": 1.4234, + "step": 18945 + }, + { + "epoch": 0.6784966068007234, + "grad_norm": 1.438887596130371, + "learning_rate": 4.948524396514821e-05, + "loss": 1.351, + "step": 18946 + }, + { + "epoch": 0.6785324189302917, + "grad_norm": 1.4166353940963745, + "learning_rate": 4.947523397415829e-05, + "loss": 1.6459, + "step": 18947 + }, + { + "epoch": 0.67856823105986, + "grad_norm": 1.5291413068771362, + "learning_rate": 4.9465224662913925e-05, + "loss": 1.2955, + "step": 18948 + }, + { + "epoch": 0.6786040431894282, + "grad_norm": 1.7831827402114868, + "learning_rate": 4.9455216031549766e-05, + "loss": 1.5585, + "step": 18949 + }, + { + "epoch": 0.6786398553189965, + "grad_norm": 1.5598150491714478, + "learning_rate": 4.9445208080200536e-05, + "loss": 1.3291, + "step": 18950 + }, + { + "epoch": 0.6786756674485648, + "grad_norm": 1.9024678468704224, + "learning_rate": 4.943520080900076e-05, + "loss": 1.536, + "step": 18951 + }, + { + "epoch": 0.6787114795781332, + "grad_norm": 1.5680241584777832, + "learning_rate": 4.9425194218085145e-05, + "loss": 1.4492, + "step": 18952 + }, + { + "epoch": 0.6787472917077014, + "grad_norm": 1.4497802257537842, + "learning_rate": 4.94151883075883e-05, + "loss": 1.5935, + "step": 18953 + }, + { + "epoch": 0.6787831038372697, + "grad_norm": 1.754224419593811, + "learning_rate": 4.940518307764489e-05, + "loss": 1.3861, + "step": 18954 + }, + { + "epoch": 0.678818915966838, + "grad_norm": 1.9332152605056763, + "learning_rate": 4.939517852838944e-05, + "loss": 1.2178, + "step": 18955 + }, + { + "epoch": 0.6788547280964062, + "grad_norm": 1.623934268951416, + "learning_rate": 4.938517465995659e-05, + "loss": 1.5194, + "step": 18956 + }, + { + "epoch": 0.6788905402259745, + "grad_norm": 1.711578369140625, + "learning_rate": 4.937517147248096e-05, + "loss": 1.8629, + "step": 18957 + }, + { + "epoch": 0.6789263523555428, + "grad_norm": 2.0465073585510254, + "learning_rate": 4.936516896609707e-05, + "loss": 1.6172, + "step": 18958 + }, + { + "epoch": 0.6789621644851112, + "grad_norm": 2.6911144256591797, + "learning_rate": 4.9355167140939494e-05, + "loss": 1.484, + "step": 18959 + }, + { + "epoch": 0.6789979766146794, + "grad_norm": 2.0437419414520264, + "learning_rate": 4.934516599714284e-05, + "loss": 1.3593, + "step": 18960 + }, + { + "epoch": 0.6790337887442477, + "grad_norm": 1.5595124959945679, + "learning_rate": 4.933516553484167e-05, + "loss": 1.9304, + "step": 18961 + }, + { + "epoch": 0.679069600873816, + "grad_norm": 1.714911699295044, + "learning_rate": 4.9325165754170446e-05, + "loss": 1.1749, + "step": 18962 + }, + { + "epoch": 0.6791054130033842, + "grad_norm": 1.7814215421676636, + "learning_rate": 4.931516665526376e-05, + "loss": 1.5922, + "step": 18963 + }, + { + "epoch": 0.6791412251329525, + "grad_norm": 1.5183871984481812, + "learning_rate": 4.930516823825616e-05, + "loss": 1.4136, + "step": 18964 + }, + { + "epoch": 0.6791770372625208, + "grad_norm": 1.6205239295959473, + "learning_rate": 4.9295170503282095e-05, + "loss": 1.5224, + "step": 18965 + }, + { + "epoch": 0.6792128493920891, + "grad_norm": 1.9067203998565674, + "learning_rate": 4.928517345047611e-05, + "loss": 1.3327, + "step": 18966 + }, + { + "epoch": 0.6792486615216574, + "grad_norm": 1.7289701700210571, + "learning_rate": 4.927517707997269e-05, + "loss": 1.3748, + "step": 18967 + }, + { + "epoch": 0.6792844736512257, + "grad_norm": 1.5952321290969849, + "learning_rate": 4.926518139190638e-05, + "loss": 1.404, + "step": 18968 + }, + { + "epoch": 0.679320285780794, + "grad_norm": 1.9176799058914185, + "learning_rate": 4.925518638641157e-05, + "loss": 1.9199, + "step": 18969 + }, + { + "epoch": 0.6793560979103622, + "grad_norm": 1.8414347171783447, + "learning_rate": 4.924519206362276e-05, + "loss": 1.3767, + "step": 18970 + }, + { + "epoch": 0.6793919100399305, + "grad_norm": 1.6279493570327759, + "learning_rate": 4.9235198423674435e-05, + "loss": 1.5276, + "step": 18971 + }, + { + "epoch": 0.6794277221694988, + "grad_norm": 1.5713403224945068, + "learning_rate": 4.9225205466701064e-05, + "loss": 1.5543, + "step": 18972 + }, + { + "epoch": 0.6794635342990671, + "grad_norm": 1.7283962965011597, + "learning_rate": 4.9215213192837064e-05, + "loss": 1.6197, + "step": 18973 + }, + { + "epoch": 0.6794993464286354, + "grad_norm": 1.273970365524292, + "learning_rate": 4.920522160221679e-05, + "loss": 1.3018, + "step": 18974 + }, + { + "epoch": 0.6795351585582037, + "grad_norm": 1.8756721019744873, + "learning_rate": 4.91952306949748e-05, + "loss": 1.1821, + "step": 18975 + }, + { + "epoch": 0.679570970687772, + "grad_norm": 1.870096206665039, + "learning_rate": 4.918524047124543e-05, + "loss": 1.6877, + "step": 18976 + }, + { + "epoch": 0.6796067828173402, + "grad_norm": 1.3298908472061157, + "learning_rate": 4.9175250931163085e-05, + "loss": 1.5494, + "step": 18977 + }, + { + "epoch": 0.6796425949469085, + "grad_norm": 1.4009315967559814, + "learning_rate": 4.916526207486219e-05, + "loss": 1.4666, + "step": 18978 + }, + { + "epoch": 0.6796784070764768, + "grad_norm": 1.6784093379974365, + "learning_rate": 4.915527390247716e-05, + "loss": 1.684, + "step": 18979 + }, + { + "epoch": 0.6797142192060451, + "grad_norm": 4.324677467346191, + "learning_rate": 4.914528641414233e-05, + "loss": 1.8545, + "step": 18980 + }, + { + "epoch": 0.6797500313356134, + "grad_norm": 1.4996967315673828, + "learning_rate": 4.9135299609992004e-05, + "loss": 1.3009, + "step": 18981 + }, + { + "epoch": 0.6797858434651817, + "grad_norm": 1.5409586429595947, + "learning_rate": 4.912531349016067e-05, + "loss": 1.4616, + "step": 18982 + }, + { + "epoch": 0.6798216555947499, + "grad_norm": 1.7398431301116943, + "learning_rate": 4.911532805478259e-05, + "loss": 1.2676, + "step": 18983 + }, + { + "epoch": 0.6798574677243182, + "grad_norm": 1.7865056991577148, + "learning_rate": 4.910534330399219e-05, + "loss": 1.4283, + "step": 18984 + }, + { + "epoch": 0.6798932798538865, + "grad_norm": 1.7800086736679077, + "learning_rate": 4.909535923792365e-05, + "loss": 1.3978, + "step": 18985 + }, + { + "epoch": 0.6799290919834547, + "grad_norm": 1.393025517463684, + "learning_rate": 4.9085375856711465e-05, + "loss": 1.5825, + "step": 18986 + }, + { + "epoch": 0.6799649041130231, + "grad_norm": 1.8537535667419434, + "learning_rate": 4.907539316048985e-05, + "loss": 1.4, + "step": 18987 + }, + { + "epoch": 0.6800007162425914, + "grad_norm": 2.8984549045562744, + "learning_rate": 4.906541114939313e-05, + "loss": 1.4936, + "step": 18988 + }, + { + "epoch": 0.6800365283721597, + "grad_norm": 2.0612361431121826, + "learning_rate": 4.9055429823555624e-05, + "loss": 1.5854, + "step": 18989 + }, + { + "epoch": 0.6800723405017279, + "grad_norm": 1.7456159591674805, + "learning_rate": 4.9045449183111566e-05, + "loss": 1.1949, + "step": 18990 + }, + { + "epoch": 0.6801081526312962, + "grad_norm": 1.8124316930770874, + "learning_rate": 4.903546922819531e-05, + "loss": 1.5129, + "step": 18991 + }, + { + "epoch": 0.6801439647608645, + "grad_norm": 1.6117404699325562, + "learning_rate": 4.9025489958940985e-05, + "loss": 1.5364, + "step": 18992 + }, + { + "epoch": 0.6801797768904327, + "grad_norm": 1.6479026079177856, + "learning_rate": 4.9015511375483026e-05, + "loss": 1.2438, + "step": 18993 + }, + { + "epoch": 0.6802155890200011, + "grad_norm": 1.5882446765899658, + "learning_rate": 4.900553347795556e-05, + "loss": 1.2344, + "step": 18994 + }, + { + "epoch": 0.6802514011495694, + "grad_norm": 1.787760615348816, + "learning_rate": 4.899555626649289e-05, + "loss": 1.066, + "step": 18995 + }, + { + "epoch": 0.6802872132791377, + "grad_norm": 2.0816078186035156, + "learning_rate": 4.898557974122915e-05, + "loss": 1.528, + "step": 18996 + }, + { + "epoch": 0.6803230254087059, + "grad_norm": 2.4905543327331543, + "learning_rate": 4.8975603902298704e-05, + "loss": 1.3881, + "step": 18997 + }, + { + "epoch": 0.6803588375382742, + "grad_norm": 1.876359224319458, + "learning_rate": 4.896562874983569e-05, + "loss": 1.1194, + "step": 18998 + }, + { + "epoch": 0.6803946496678425, + "grad_norm": 1.7218693494796753, + "learning_rate": 4.8955654283974284e-05, + "loss": 1.4642, + "step": 18999 + }, + { + "epoch": 0.6804304617974107, + "grad_norm": 1.4344909191131592, + "learning_rate": 4.89456805048487e-05, + "loss": 1.5587, + "step": 19000 + }, + { + "epoch": 0.6804662739269791, + "grad_norm": 2.262502670288086, + "learning_rate": 4.893570741259312e-05, + "loss": 1.7689, + "step": 19001 + }, + { + "epoch": 0.6805020860565474, + "grad_norm": 1.9338393211364746, + "learning_rate": 4.892573500734179e-05, + "loss": 1.4208, + "step": 19002 + }, + { + "epoch": 0.6805378981861157, + "grad_norm": 1.5523253679275513, + "learning_rate": 4.891576328922872e-05, + "loss": 1.3446, + "step": 19003 + }, + { + "epoch": 0.6805737103156839, + "grad_norm": 2.149492025375366, + "learning_rate": 4.890579225838824e-05, + "loss": 1.575, + "step": 19004 + }, + { + "epoch": 0.6806095224452522, + "grad_norm": 1.690125823020935, + "learning_rate": 4.8895821914954376e-05, + "loss": 1.3665, + "step": 19005 + }, + { + "epoch": 0.6806453345748205, + "grad_norm": 1.6124178171157837, + "learning_rate": 4.888585225906136e-05, + "loss": 1.3594, + "step": 19006 + }, + { + "epoch": 0.6806811467043887, + "grad_norm": 2.0891807079315186, + "learning_rate": 4.8875883290843214e-05, + "loss": 1.3593, + "step": 19007 + }, + { + "epoch": 0.6807169588339571, + "grad_norm": 2.1637518405914307, + "learning_rate": 4.886591501043413e-05, + "loss": 1.6377, + "step": 19008 + }, + { + "epoch": 0.6807527709635254, + "grad_norm": 1.429513692855835, + "learning_rate": 4.885594741796823e-05, + "loss": 1.3786, + "step": 19009 + }, + { + "epoch": 0.6807885830930936, + "grad_norm": 1.508383870124817, + "learning_rate": 4.884598051357955e-05, + "loss": 1.5587, + "step": 19010 + }, + { + "epoch": 0.6808243952226619, + "grad_norm": 1.646700143814087, + "learning_rate": 4.883601429740222e-05, + "loss": 1.3783, + "step": 19011 + }, + { + "epoch": 0.6808602073522302, + "grad_norm": 1.7303789854049683, + "learning_rate": 4.882604876957032e-05, + "loss": 1.667, + "step": 19012 + }, + { + "epoch": 0.6808960194817985, + "grad_norm": 1.8976233005523682, + "learning_rate": 4.881608393021796e-05, + "loss": 1.5103, + "step": 19013 + }, + { + "epoch": 0.6809318316113667, + "grad_norm": 1.9167366027832031, + "learning_rate": 4.880611977947909e-05, + "loss": 1.4182, + "step": 19014 + }, + { + "epoch": 0.6809676437409351, + "grad_norm": 1.572396993637085, + "learning_rate": 4.879615631748793e-05, + "loss": 1.7171, + "step": 19015 + }, + { + "epoch": 0.6810034558705034, + "grad_norm": 1.5460922718048096, + "learning_rate": 4.8786193544378424e-05, + "loss": 1.5438, + "step": 19016 + }, + { + "epoch": 0.6810392680000716, + "grad_norm": 1.4947601556777954, + "learning_rate": 4.8776231460284595e-05, + "loss": 1.655, + "step": 19017 + }, + { + "epoch": 0.6810750801296399, + "grad_norm": 1.6060175895690918, + "learning_rate": 4.876627006534049e-05, + "loss": 1.5582, + "step": 19018 + }, + { + "epoch": 0.6811108922592082, + "grad_norm": 1.5002024173736572, + "learning_rate": 4.8756309359680145e-05, + "loss": 1.3706, + "step": 19019 + }, + { + "epoch": 0.6811467043887764, + "grad_norm": 1.6578645706176758, + "learning_rate": 4.874634934343759e-05, + "loss": 1.451, + "step": 19020 + }, + { + "epoch": 0.6811825165183447, + "grad_norm": 1.4593560695648193, + "learning_rate": 4.873639001674676e-05, + "loss": 1.2581, + "step": 19021 + }, + { + "epoch": 0.6812183286479131, + "grad_norm": 1.5626795291900635, + "learning_rate": 4.872643137974167e-05, + "loss": 1.5399, + "step": 19022 + }, + { + "epoch": 0.6812541407774814, + "grad_norm": 1.6952016353607178, + "learning_rate": 4.87164734325563e-05, + "loss": 1.4, + "step": 19023 + }, + { + "epoch": 0.6812899529070496, + "grad_norm": 1.9732818603515625, + "learning_rate": 4.870651617532468e-05, + "loss": 1.5631, + "step": 19024 + }, + { + "epoch": 0.6813257650366179, + "grad_norm": 1.7756799459457397, + "learning_rate": 4.869655960818068e-05, + "loss": 1.2446, + "step": 19025 + }, + { + "epoch": 0.6813615771661862, + "grad_norm": 1.959659457206726, + "learning_rate": 4.868660373125829e-05, + "loss": 1.4592, + "step": 19026 + }, + { + "epoch": 0.6813973892957544, + "grad_norm": 2.5035407543182373, + "learning_rate": 4.8676648544691495e-05, + "loss": 1.5678, + "step": 19027 + }, + { + "epoch": 0.6814332014253227, + "grad_norm": 1.5704766511917114, + "learning_rate": 4.866669404861416e-05, + "loss": 1.5047, + "step": 19028 + }, + { + "epoch": 0.6814690135548911, + "grad_norm": 2.848478078842163, + "learning_rate": 4.8656740243160236e-05, + "loss": 1.8893, + "step": 19029 + }, + { + "epoch": 0.6815048256844594, + "grad_norm": 1.4593943357467651, + "learning_rate": 4.864678712846365e-05, + "loss": 1.2788, + "step": 19030 + }, + { + "epoch": 0.6815406378140276, + "grad_norm": 1.6539605855941772, + "learning_rate": 4.863683470465833e-05, + "loss": 1.2998, + "step": 19031 + }, + { + "epoch": 0.6815764499435959, + "grad_norm": 1.4724090099334717, + "learning_rate": 4.862688297187812e-05, + "loss": 1.5732, + "step": 19032 + }, + { + "epoch": 0.6816122620731642, + "grad_norm": 2.0338592529296875, + "learning_rate": 4.8616931930256926e-05, + "loss": 1.4792, + "step": 19033 + }, + { + "epoch": 0.6816480742027324, + "grad_norm": 1.6050416231155396, + "learning_rate": 4.860698157992867e-05, + "loss": 1.0027, + "step": 19034 + }, + { + "epoch": 0.6816838863323007, + "grad_norm": 1.8929771184921265, + "learning_rate": 4.859703192102715e-05, + "loss": 1.3524, + "step": 19035 + }, + { + "epoch": 0.6817196984618691, + "grad_norm": 1.4023357629776, + "learning_rate": 4.858708295368626e-05, + "loss": 1.4543, + "step": 19036 + }, + { + "epoch": 0.6817555105914374, + "grad_norm": 1.6556355953216553, + "learning_rate": 4.857713467803985e-05, + "loss": 1.6469, + "step": 19037 + }, + { + "epoch": 0.6817913227210056, + "grad_norm": 2.0519371032714844, + "learning_rate": 4.85671870942218e-05, + "loss": 1.5793, + "step": 19038 + }, + { + "epoch": 0.6818271348505739, + "grad_norm": 1.6347476243972778, + "learning_rate": 4.855724020236586e-05, + "loss": 1.6161, + "step": 19039 + }, + { + "epoch": 0.6818629469801422, + "grad_norm": 1.3786358833312988, + "learning_rate": 4.854729400260591e-05, + "loss": 1.5669, + "step": 19040 + }, + { + "epoch": 0.6818987591097104, + "grad_norm": 1.5543736219406128, + "learning_rate": 4.853734849507574e-05, + "loss": 1.5422, + "step": 19041 + }, + { + "epoch": 0.6819345712392787, + "grad_norm": 1.4965298175811768, + "learning_rate": 4.8527403679909214e-05, + "loss": 1.1551, + "step": 19042 + }, + { + "epoch": 0.6819703833688471, + "grad_norm": 1.4858630895614624, + "learning_rate": 4.851745955724002e-05, + "loss": 1.5663, + "step": 19043 + }, + { + "epoch": 0.6820061954984153, + "grad_norm": 1.4483484029769897, + "learning_rate": 4.8507516127202014e-05, + "loss": 1.1364, + "step": 19044 + }, + { + "epoch": 0.6820420076279836, + "grad_norm": 1.929543375968933, + "learning_rate": 4.849757338992898e-05, + "loss": 1.5264, + "step": 19045 + }, + { + "epoch": 0.6820778197575519, + "grad_norm": 1.2829221487045288, + "learning_rate": 4.848763134555465e-05, + "loss": 1.1193, + "step": 19046 + }, + { + "epoch": 0.6821136318871202, + "grad_norm": 1.5956387519836426, + "learning_rate": 4.847768999421277e-05, + "loss": 1.2601, + "step": 19047 + }, + { + "epoch": 0.6821494440166884, + "grad_norm": 2.0086724758148193, + "learning_rate": 4.8467749336037124e-05, + "loss": 1.1854, + "step": 19048 + }, + { + "epoch": 0.6821852561462567, + "grad_norm": 1.4452983140945435, + "learning_rate": 4.8457809371161476e-05, + "loss": 1.3118, + "step": 19049 + }, + { + "epoch": 0.6822210682758251, + "grad_norm": 1.4968582391738892, + "learning_rate": 4.844787009971949e-05, + "loss": 1.4051, + "step": 19050 + }, + { + "epoch": 0.6822568804053933, + "grad_norm": 1.3636406660079956, + "learning_rate": 4.8437931521844894e-05, + "loss": 1.3358, + "step": 19051 + }, + { + "epoch": 0.6822926925349616, + "grad_norm": 1.9346957206726074, + "learning_rate": 4.8427993637671474e-05, + "loss": 1.9677, + "step": 19052 + }, + { + "epoch": 0.6823285046645299, + "grad_norm": 1.4876221418380737, + "learning_rate": 4.841805644733283e-05, + "loss": 1.6532, + "step": 19053 + }, + { + "epoch": 0.6823643167940981, + "grad_norm": 1.683264136314392, + "learning_rate": 4.8408119950962704e-05, + "loss": 1.6318, + "step": 19054 + }, + { + "epoch": 0.6824001289236664, + "grad_norm": 1.3608731031417847, + "learning_rate": 4.839818414869477e-05, + "loss": 1.3656, + "step": 19055 + }, + { + "epoch": 0.6824359410532347, + "grad_norm": 1.9160677194595337, + "learning_rate": 4.8388249040662744e-05, + "loss": 1.5936, + "step": 19056 + }, + { + "epoch": 0.6824717531828031, + "grad_norm": 1.7479875087738037, + "learning_rate": 4.8378314627000224e-05, + "loss": 1.7994, + "step": 19057 + }, + { + "epoch": 0.6825075653123713, + "grad_norm": 2.653571367263794, + "learning_rate": 4.836838090784088e-05, + "loss": 1.5861, + "step": 19058 + }, + { + "epoch": 0.6825433774419396, + "grad_norm": 1.605258584022522, + "learning_rate": 4.835844788331839e-05, + "loss": 1.5436, + "step": 19059 + }, + { + "epoch": 0.6825791895715079, + "grad_norm": 1.4455996751785278, + "learning_rate": 4.8348515553566396e-05, + "loss": 1.2487, + "step": 19060 + }, + { + "epoch": 0.6826150017010761, + "grad_norm": 1.330987572669983, + "learning_rate": 4.833858391871846e-05, + "loss": 1.4588, + "step": 19061 + }, + { + "epoch": 0.6826508138306444, + "grad_norm": 2.3444483280181885, + "learning_rate": 4.832865297890825e-05, + "loss": 1.5377, + "step": 19062 + }, + { + "epoch": 0.6826866259602127, + "grad_norm": 1.7140589952468872, + "learning_rate": 4.83187227342694e-05, + "loss": 1.2662, + "step": 19063 + }, + { + "epoch": 0.6827224380897811, + "grad_norm": 1.2725976705551147, + "learning_rate": 4.830879318493542e-05, + "loss": 1.3755, + "step": 19064 + }, + { + "epoch": 0.6827582502193493, + "grad_norm": 1.4281116724014282, + "learning_rate": 4.829886433103995e-05, + "loss": 1.4356, + "step": 19065 + }, + { + "epoch": 0.6827940623489176, + "grad_norm": 1.4100066423416138, + "learning_rate": 4.828893617271658e-05, + "loss": 1.3519, + "step": 19066 + }, + { + "epoch": 0.6828298744784859, + "grad_norm": 1.5357187986373901, + "learning_rate": 4.8279008710098916e-05, + "loss": 1.6721, + "step": 19067 + }, + { + "epoch": 0.6828656866080541, + "grad_norm": 1.6073108911514282, + "learning_rate": 4.8269081943320424e-05, + "loss": 1.4553, + "step": 19068 + }, + { + "epoch": 0.6829014987376224, + "grad_norm": 1.8977751731872559, + "learning_rate": 4.825915587251472e-05, + "loss": 1.4387, + "step": 19069 + }, + { + "epoch": 0.6829373108671907, + "grad_norm": 1.570206642150879, + "learning_rate": 4.824923049781536e-05, + "loss": 1.2027, + "step": 19070 + }, + { + "epoch": 0.682973122996759, + "grad_norm": 1.6373735666275024, + "learning_rate": 4.8239305819355805e-05, + "loss": 1.7076, + "step": 19071 + }, + { + "epoch": 0.6830089351263273, + "grad_norm": 2.1128671169281006, + "learning_rate": 4.822938183726967e-05, + "loss": 1.4418, + "step": 19072 + }, + { + "epoch": 0.6830447472558956, + "grad_norm": 1.6894068717956543, + "learning_rate": 4.821945855169035e-05, + "loss": 1.7592, + "step": 19073 + }, + { + "epoch": 0.6830805593854639, + "grad_norm": 1.6356310844421387, + "learning_rate": 4.8209535962751494e-05, + "loss": 1.4973, + "step": 19074 + }, + { + "epoch": 0.6831163715150321, + "grad_norm": 1.7310285568237305, + "learning_rate": 4.81996140705865e-05, + "loss": 1.5372, + "step": 19075 + }, + { + "epoch": 0.6831521836446004, + "grad_norm": 1.616203784942627, + "learning_rate": 4.8189692875328864e-05, + "loss": 1.6467, + "step": 19076 + }, + { + "epoch": 0.6831879957741687, + "grad_norm": 1.7261005640029907, + "learning_rate": 4.817977237711213e-05, + "loss": 1.5143, + "step": 19077 + }, + { + "epoch": 0.683223807903737, + "grad_norm": 1.4400568008422852, + "learning_rate": 4.816985257606967e-05, + "loss": 1.3393, + "step": 19078 + }, + { + "epoch": 0.6832596200333053, + "grad_norm": 2.898552179336548, + "learning_rate": 4.815993347233503e-05, + "loss": 1.7106, + "step": 19079 + }, + { + "epoch": 0.6832954321628736, + "grad_norm": 1.6326524019241333, + "learning_rate": 4.8150015066041545e-05, + "loss": 1.2989, + "step": 19080 + }, + { + "epoch": 0.6833312442924419, + "grad_norm": 1.612429141998291, + "learning_rate": 4.814009735732279e-05, + "loss": 1.4414, + "step": 19081 + }, + { + "epoch": 0.6833670564220101, + "grad_norm": 1.492590308189392, + "learning_rate": 4.8130180346312105e-05, + "loss": 1.5981, + "step": 19082 + }, + { + "epoch": 0.6834028685515784, + "grad_norm": 1.8465263843536377, + "learning_rate": 4.812026403314297e-05, + "loss": 1.37, + "step": 19083 + }, + { + "epoch": 0.6834386806811467, + "grad_norm": 1.8145172595977783, + "learning_rate": 4.811034841794868e-05, + "loss": 1.3823, + "step": 19084 + }, + { + "epoch": 0.683474492810715, + "grad_norm": 1.4729621410369873, + "learning_rate": 4.8100433500862794e-05, + "loss": 1.4076, + "step": 19085 + }, + { + "epoch": 0.6835103049402833, + "grad_norm": 2.7632548809051514, + "learning_rate": 4.809051928201864e-05, + "loss": 1.2546, + "step": 19086 + }, + { + "epoch": 0.6835461170698516, + "grad_norm": 1.296805739402771, + "learning_rate": 4.808060576154951e-05, + "loss": 1.1367, + "step": 19087 + }, + { + "epoch": 0.6835819291994198, + "grad_norm": 1.7383794784545898, + "learning_rate": 4.8070692939588934e-05, + "loss": 1.4106, + "step": 19088 + }, + { + "epoch": 0.6836177413289881, + "grad_norm": 2.4695076942443848, + "learning_rate": 4.8060780816270165e-05, + "loss": 1.3039, + "step": 19089 + }, + { + "epoch": 0.6836535534585564, + "grad_norm": 1.2985374927520752, + "learning_rate": 4.805086939172663e-05, + "loss": 1.5119, + "step": 19090 + }, + { + "epoch": 0.6836893655881247, + "grad_norm": 1.8521593809127808, + "learning_rate": 4.804095866609156e-05, + "loss": 1.4941, + "step": 19091 + }, + { + "epoch": 0.683725177717693, + "grad_norm": 1.4810516834259033, + "learning_rate": 4.803104863949844e-05, + "loss": 1.5291, + "step": 19092 + }, + { + "epoch": 0.6837609898472613, + "grad_norm": 1.5847200155258179, + "learning_rate": 4.80211393120805e-05, + "loss": 1.6587, + "step": 19093 + }, + { + "epoch": 0.6837968019768296, + "grad_norm": 1.4615356922149658, + "learning_rate": 4.801123068397111e-05, + "loss": 1.2841, + "step": 19094 + }, + { + "epoch": 0.6838326141063978, + "grad_norm": 2.2419803142547607, + "learning_rate": 4.800132275530351e-05, + "loss": 1.2867, + "step": 19095 + }, + { + "epoch": 0.6838684262359661, + "grad_norm": 1.2413926124572754, + "learning_rate": 4.799141552621105e-05, + "loss": 1.3555, + "step": 19096 + }, + { + "epoch": 0.6839042383655344, + "grad_norm": 1.478020191192627, + "learning_rate": 4.798150899682704e-05, + "loss": 1.4722, + "step": 19097 + }, + { + "epoch": 0.6839400504951026, + "grad_norm": 1.8806427717208862, + "learning_rate": 4.79716031672847e-05, + "loss": 1.7017, + "step": 19098 + }, + { + "epoch": 0.683975862624671, + "grad_norm": 1.7698049545288086, + "learning_rate": 4.7961698037717306e-05, + "loss": 1.4021, + "step": 19099 + }, + { + "epoch": 0.6840116747542393, + "grad_norm": 1.7766374349594116, + "learning_rate": 4.795179360825815e-05, + "loss": 1.5161, + "step": 19100 + }, + { + "epoch": 0.6840474868838076, + "grad_norm": 1.346203088760376, + "learning_rate": 4.794188987904051e-05, + "loss": 1.3395, + "step": 19101 + }, + { + "epoch": 0.6840832990133758, + "grad_norm": 1.899402141571045, + "learning_rate": 4.793198685019753e-05, + "loss": 1.436, + "step": 19102 + }, + { + "epoch": 0.6841191111429441, + "grad_norm": 1.439682126045227, + "learning_rate": 4.7922084521862565e-05, + "loss": 1.3542, + "step": 19103 + }, + { + "epoch": 0.6841549232725124, + "grad_norm": 1.2503219842910767, + "learning_rate": 4.791218289416879e-05, + "loss": 1.2213, + "step": 19104 + }, + { + "epoch": 0.6841907354020806, + "grad_norm": 1.6100847721099854, + "learning_rate": 4.790228196724935e-05, + "loss": 1.4521, + "step": 19105 + }, + { + "epoch": 0.684226547531649, + "grad_norm": 1.7780996561050415, + "learning_rate": 4.789238174123751e-05, + "loss": 1.4513, + "step": 19106 + }, + { + "epoch": 0.6842623596612173, + "grad_norm": 3.03469181060791, + "learning_rate": 4.788248221626647e-05, + "loss": 1.3681, + "step": 19107 + }, + { + "epoch": 0.6842981717907856, + "grad_norm": 1.8146162033081055, + "learning_rate": 4.7872583392469436e-05, + "loss": 1.3714, + "step": 19108 + }, + { + "epoch": 0.6843339839203538, + "grad_norm": 1.5532610416412354, + "learning_rate": 4.786268526997951e-05, + "loss": 1.4951, + "step": 19109 + }, + { + "epoch": 0.6843697960499221, + "grad_norm": 1.6222275495529175, + "learning_rate": 4.7852787848929916e-05, + "loss": 1.6106, + "step": 19110 + }, + { + "epoch": 0.6844056081794904, + "grad_norm": 1.9357314109802246, + "learning_rate": 4.7842891129453784e-05, + "loss": 1.4879, + "step": 19111 + }, + { + "epoch": 0.6844414203090586, + "grad_norm": 2.5077223777770996, + "learning_rate": 4.783299511168432e-05, + "loss": 1.3873, + "step": 19112 + }, + { + "epoch": 0.684477232438627, + "grad_norm": 2.1032609939575195, + "learning_rate": 4.7823099795754566e-05, + "loss": 1.42, + "step": 19113 + }, + { + "epoch": 0.6845130445681953, + "grad_norm": 1.413138508796692, + "learning_rate": 4.781320518179772e-05, + "loss": 1.5738, + "step": 19114 + }, + { + "epoch": 0.6845488566977636, + "grad_norm": 1.6410788297653198, + "learning_rate": 4.780331126994691e-05, + "loss": 1.6905, + "step": 19115 + }, + { + "epoch": 0.6845846688273318, + "grad_norm": 1.455068826675415, + "learning_rate": 4.779341806033517e-05, + "loss": 1.2427, + "step": 19116 + }, + { + "epoch": 0.6846204809569001, + "grad_norm": 1.6743617057800293, + "learning_rate": 4.778352555309565e-05, + "loss": 1.2889, + "step": 19117 + }, + { + "epoch": 0.6846562930864684, + "grad_norm": 1.5681772232055664, + "learning_rate": 4.777363374836146e-05, + "loss": 1.461, + "step": 19118 + }, + { + "epoch": 0.6846921052160366, + "grad_norm": 1.5241618156433105, + "learning_rate": 4.7763742646265674e-05, + "loss": 1.6756, + "step": 19119 + }, + { + "epoch": 0.684727917345605, + "grad_norm": 1.7373355627059937, + "learning_rate": 4.7753852246941335e-05, + "loss": 1.3074, + "step": 19120 + }, + { + "epoch": 0.6847637294751733, + "grad_norm": 1.4865138530731201, + "learning_rate": 4.774396255052151e-05, + "loss": 1.3415, + "step": 19121 + }, + { + "epoch": 0.6847995416047415, + "grad_norm": 1.9230724573135376, + "learning_rate": 4.773407355713929e-05, + "loss": 1.6522, + "step": 19122 + }, + { + "epoch": 0.6848353537343098, + "grad_norm": 1.556607961654663, + "learning_rate": 4.7724185266927666e-05, + "loss": 1.5193, + "step": 19123 + }, + { + "epoch": 0.6848711658638781, + "grad_norm": 1.8414274454116821, + "learning_rate": 4.7714297680019704e-05, + "loss": 1.4167, + "step": 19124 + }, + { + "epoch": 0.6849069779934464, + "grad_norm": 1.8002582788467407, + "learning_rate": 4.770441079654841e-05, + "loss": 1.4511, + "step": 19125 + }, + { + "epoch": 0.6849427901230146, + "grad_norm": 1.7834447622299194, + "learning_rate": 4.7694524616646865e-05, + "loss": 1.7016, + "step": 19126 + }, + { + "epoch": 0.684978602252583, + "grad_norm": 1.2345691919326782, + "learning_rate": 4.768463914044797e-05, + "loss": 1.3703, + "step": 19127 + }, + { + "epoch": 0.6850144143821513, + "grad_norm": 1.528149127960205, + "learning_rate": 4.767475436808478e-05, + "loss": 1.4703, + "step": 19128 + }, + { + "epoch": 0.6850502265117195, + "grad_norm": 1.437954306602478, + "learning_rate": 4.766487029969028e-05, + "loss": 1.4768, + "step": 19129 + }, + { + "epoch": 0.6850860386412878, + "grad_norm": 1.682350754737854, + "learning_rate": 4.765498693539747e-05, + "loss": 1.3122, + "step": 19130 + }, + { + "epoch": 0.6851218507708561, + "grad_norm": 2.119572401046753, + "learning_rate": 4.764510427533926e-05, + "loss": 1.4844, + "step": 19131 + }, + { + "epoch": 0.6851576629004243, + "grad_norm": 2.683337926864624, + "learning_rate": 4.763522231964864e-05, + "loss": 1.2952, + "step": 19132 + }, + { + "epoch": 0.6851934750299926, + "grad_norm": 1.712811827659607, + "learning_rate": 4.76253410684586e-05, + "loss": 1.5528, + "step": 19133 + }, + { + "epoch": 0.685229287159561, + "grad_norm": 1.5293190479278564, + "learning_rate": 4.761546052190199e-05, + "loss": 1.2965, + "step": 19134 + }, + { + "epoch": 0.6852650992891293, + "grad_norm": 1.6144920587539673, + "learning_rate": 4.7605580680111785e-05, + "loss": 1.313, + "step": 19135 + }, + { + "epoch": 0.6853009114186975, + "grad_norm": 1.2514257431030273, + "learning_rate": 4.7595701543220916e-05, + "loss": 1.3082, + "step": 19136 + }, + { + "epoch": 0.6853367235482658, + "grad_norm": 1.5040967464447021, + "learning_rate": 4.758582311136231e-05, + "loss": 1.9045, + "step": 19137 + }, + { + "epoch": 0.6853725356778341, + "grad_norm": 1.5086262226104736, + "learning_rate": 4.757594538466883e-05, + "loss": 1.5548, + "step": 19138 + }, + { + "epoch": 0.6854083478074023, + "grad_norm": 1.7702431678771973, + "learning_rate": 4.756606836327337e-05, + "loss": 1.2219, + "step": 19139 + }, + { + "epoch": 0.6854441599369706, + "grad_norm": 1.6345676183700562, + "learning_rate": 4.755619204730886e-05, + "loss": 1.534, + "step": 19140 + }, + { + "epoch": 0.685479972066539, + "grad_norm": 1.7369307279586792, + "learning_rate": 4.75463164369081e-05, + "loss": 1.377, + "step": 19141 + }, + { + "epoch": 0.6855157841961073, + "grad_norm": 2.0801753997802734, + "learning_rate": 4.7536441532204e-05, + "loss": 1.633, + "step": 19142 + }, + { + "epoch": 0.6855515963256755, + "grad_norm": 1.7809818983078003, + "learning_rate": 4.752656733332941e-05, + "loss": 1.6268, + "step": 19143 + }, + { + "epoch": 0.6855874084552438, + "grad_norm": 2.1582162380218506, + "learning_rate": 4.751669384041719e-05, + "loss": 1.4487, + "step": 19144 + }, + { + "epoch": 0.6856232205848121, + "grad_norm": 1.6797009706497192, + "learning_rate": 4.750682105360014e-05, + "loss": 1.5636, + "step": 19145 + }, + { + "epoch": 0.6856590327143803, + "grad_norm": 2.4573843479156494, + "learning_rate": 4.749694897301108e-05, + "loss": 1.4913, + "step": 19146 + }, + { + "epoch": 0.6856948448439486, + "grad_norm": 1.3981280326843262, + "learning_rate": 4.7487077598782856e-05, + "loss": 1.392, + "step": 19147 + }, + { + "epoch": 0.685730656973517, + "grad_norm": 1.4135205745697021, + "learning_rate": 4.747720693104831e-05, + "loss": 1.4138, + "step": 19148 + }, + { + "epoch": 0.6857664691030853, + "grad_norm": 1.4483755826950073, + "learning_rate": 4.7467336969940156e-05, + "loss": 1.1195, + "step": 19149 + }, + { + "epoch": 0.6858022812326535, + "grad_norm": 1.7879372835159302, + "learning_rate": 4.745746771559122e-05, + "loss": 1.1896, + "step": 19150 + }, + { + "epoch": 0.6858380933622218, + "grad_norm": 1.420569896697998, + "learning_rate": 4.744759916813432e-05, + "loss": 1.5825, + "step": 19151 + }, + { + "epoch": 0.6858739054917901, + "grad_norm": 1.677254319190979, + "learning_rate": 4.743773132770214e-05, + "loss": 0.9403, + "step": 19152 + }, + { + "epoch": 0.6859097176213583, + "grad_norm": 1.9434819221496582, + "learning_rate": 4.7427864194427484e-05, + "loss": 1.7483, + "step": 19153 + }, + { + "epoch": 0.6859455297509266, + "grad_norm": 1.6031337976455688, + "learning_rate": 4.74179977684431e-05, + "loss": 1.3214, + "step": 19154 + }, + { + "epoch": 0.685981341880495, + "grad_norm": 1.5116249322891235, + "learning_rate": 4.740813204988178e-05, + "loss": 1.3434, + "step": 19155 + }, + { + "epoch": 0.6860171540100632, + "grad_norm": 2.093729257583618, + "learning_rate": 4.739826703887616e-05, + "loss": 1.4053, + "step": 19156 + }, + { + "epoch": 0.6860529661396315, + "grad_norm": 1.526672124862671, + "learning_rate": 4.7388402735559014e-05, + "loss": 1.429, + "step": 19157 + }, + { + "epoch": 0.6860887782691998, + "grad_norm": 1.9945696592330933, + "learning_rate": 4.737853914006307e-05, + "loss": 1.2103, + "step": 19158 + }, + { + "epoch": 0.686124590398768, + "grad_norm": 1.6721898317337036, + "learning_rate": 4.736867625252097e-05, + "loss": 1.1878, + "step": 19159 + }, + { + "epoch": 0.6861604025283363, + "grad_norm": 1.792447805404663, + "learning_rate": 4.735881407306545e-05, + "loss": 1.1916, + "step": 19160 + }, + { + "epoch": 0.6861962146579046, + "grad_norm": 3.155874729156494, + "learning_rate": 4.734895260182918e-05, + "loss": 1.503, + "step": 19161 + }, + { + "epoch": 0.686232026787473, + "grad_norm": 1.5070594549179077, + "learning_rate": 4.733909183894487e-05, + "loss": 1.1776, + "step": 19162 + }, + { + "epoch": 0.6862678389170412, + "grad_norm": 1.7831507921218872, + "learning_rate": 4.732923178454512e-05, + "loss": 1.4696, + "step": 19163 + }, + { + "epoch": 0.6863036510466095, + "grad_norm": 1.283743977546692, + "learning_rate": 4.731937243876262e-05, + "loss": 1.2818, + "step": 19164 + }, + { + "epoch": 0.6863394631761778, + "grad_norm": 2.2443976402282715, + "learning_rate": 4.730951380173e-05, + "loss": 1.6624, + "step": 19165 + }, + { + "epoch": 0.686375275305746, + "grad_norm": 1.4468728303909302, + "learning_rate": 4.729965587357995e-05, + "loss": 1.0889, + "step": 19166 + }, + { + "epoch": 0.6864110874353143, + "grad_norm": 1.9218125343322754, + "learning_rate": 4.728979865444505e-05, + "loss": 1.3015, + "step": 19167 + }, + { + "epoch": 0.6864468995648826, + "grad_norm": 1.458131194114685, + "learning_rate": 4.7279942144457847e-05, + "loss": 1.6985, + "step": 19168 + }, + { + "epoch": 0.686482711694451, + "grad_norm": 1.8317525386810303, + "learning_rate": 4.7270086343751085e-05, + "loss": 1.5711, + "step": 19169 + }, + { + "epoch": 0.6865185238240192, + "grad_norm": 2.029881715774536, + "learning_rate": 4.7260231252457265e-05, + "loss": 1.7153, + "step": 19170 + }, + { + "epoch": 0.6865543359535875, + "grad_norm": 1.9590765237808228, + "learning_rate": 4.7250376870709e-05, + "loss": 1.1425, + "step": 19171 + }, + { + "epoch": 0.6865901480831558, + "grad_norm": 1.7559995651245117, + "learning_rate": 4.7240523198638875e-05, + "loss": 1.3486, + "step": 19172 + }, + { + "epoch": 0.686625960212724, + "grad_norm": 2.3266706466674805, + "learning_rate": 4.723067023637949e-05, + "loss": 1.6996, + "step": 19173 + }, + { + "epoch": 0.6866617723422923, + "grad_norm": 1.5278242826461792, + "learning_rate": 4.722081798406337e-05, + "loss": 1.6506, + "step": 19174 + }, + { + "epoch": 0.6866975844718606, + "grad_norm": 2.3450798988342285, + "learning_rate": 4.721096644182299e-05, + "loss": 1.345, + "step": 19175 + }, + { + "epoch": 0.686733396601429, + "grad_norm": 2.0274198055267334, + "learning_rate": 4.720111560979104e-05, + "loss": 1.4916, + "step": 19176 + }, + { + "epoch": 0.6867692087309972, + "grad_norm": 1.3534672260284424, + "learning_rate": 4.719126548809993e-05, + "loss": 1.4593, + "step": 19177 + }, + { + "epoch": 0.6868050208605655, + "grad_norm": 1.6133003234863281, + "learning_rate": 4.7181416076882266e-05, + "loss": 1.4837, + "step": 19178 + }, + { + "epoch": 0.6868408329901338, + "grad_norm": 1.8048996925354004, + "learning_rate": 4.7171567376270443e-05, + "loss": 1.5241, + "step": 19179 + }, + { + "epoch": 0.686876645119702, + "grad_norm": 1.4059399366378784, + "learning_rate": 4.716171938639711e-05, + "loss": 1.4154, + "step": 19180 + }, + { + "epoch": 0.6869124572492703, + "grad_norm": 1.9644883871078491, + "learning_rate": 4.715187210739466e-05, + "loss": 1.2348, + "step": 19181 + }, + { + "epoch": 0.6869482693788386, + "grad_norm": 2.089815378189087, + "learning_rate": 4.714202553939562e-05, + "loss": 1.4767, + "step": 19182 + }, + { + "epoch": 0.686984081508407, + "grad_norm": 1.4828203916549683, + "learning_rate": 4.713217968253242e-05, + "loss": 1.6034, + "step": 19183 + }, + { + "epoch": 0.6870198936379752, + "grad_norm": 1.8886553049087524, + "learning_rate": 4.712233453693754e-05, + "loss": 1.5413, + "step": 19184 + }, + { + "epoch": 0.6870557057675435, + "grad_norm": 2.037841796875, + "learning_rate": 4.711249010274349e-05, + "loss": 1.6245, + "step": 19185 + }, + { + "epoch": 0.6870915178971118, + "grad_norm": 1.8798308372497559, + "learning_rate": 4.710264638008258e-05, + "loss": 1.6074, + "step": 19186 + }, + { + "epoch": 0.68712733002668, + "grad_norm": 2.2628753185272217, + "learning_rate": 4.709280336908741e-05, + "loss": 1.5401, + "step": 19187 + }, + { + "epoch": 0.6871631421562483, + "grad_norm": 1.5651167631149292, + "learning_rate": 4.7082961069890284e-05, + "loss": 1.5082, + "step": 19188 + }, + { + "epoch": 0.6871989542858166, + "grad_norm": 1.807701826095581, + "learning_rate": 4.707311948262371e-05, + "loss": 1.448, + "step": 19189 + }, + { + "epoch": 0.687234766415385, + "grad_norm": 1.4807206392288208, + "learning_rate": 4.7063278607419944e-05, + "loss": 1.4774, + "step": 19190 + }, + { + "epoch": 0.6872705785449532, + "grad_norm": 1.4907841682434082, + "learning_rate": 4.705343844441158e-05, + "loss": 1.3822, + "step": 19191 + }, + { + "epoch": 0.6873063906745215, + "grad_norm": 2.6561944484710693, + "learning_rate": 4.704359899373089e-05, + "loss": 1.3852, + "step": 19192 + }, + { + "epoch": 0.6873422028040898, + "grad_norm": 1.906999111175537, + "learning_rate": 4.703376025551023e-05, + "loss": 1.4777, + "step": 19193 + }, + { + "epoch": 0.687378014933658, + "grad_norm": 1.8173359632492065, + "learning_rate": 4.7023922229882013e-05, + "loss": 1.3468, + "step": 19194 + }, + { + "epoch": 0.6874138270632263, + "grad_norm": 1.8602315187454224, + "learning_rate": 4.701408491697859e-05, + "loss": 1.5548, + "step": 19195 + }, + { + "epoch": 0.6874496391927946, + "grad_norm": 1.9973033666610718, + "learning_rate": 4.700424831693233e-05, + "loss": 1.4175, + "step": 19196 + }, + { + "epoch": 0.6874854513223629, + "grad_norm": 1.5902940034866333, + "learning_rate": 4.699441242987548e-05, + "loss": 1.2324, + "step": 19197 + }, + { + "epoch": 0.6875212634519312, + "grad_norm": 1.6441142559051514, + "learning_rate": 4.698457725594052e-05, + "loss": 1.7983, + "step": 19198 + }, + { + "epoch": 0.6875570755814995, + "grad_norm": 1.9269689321517944, + "learning_rate": 4.697474279525964e-05, + "loss": 1.3264, + "step": 19199 + }, + { + "epoch": 0.6875928877110677, + "grad_norm": 1.7410550117492676, + "learning_rate": 4.6964909047965246e-05, + "loss": 1.2364, + "step": 19200 + }, + { + "epoch": 0.687628699840636, + "grad_norm": 1.4461188316345215, + "learning_rate": 4.6955076014189545e-05, + "loss": 1.3768, + "step": 19201 + }, + { + "epoch": 0.6876645119702043, + "grad_norm": 1.9367955923080444, + "learning_rate": 4.694524369406488e-05, + "loss": 1.7564, + "step": 19202 + }, + { + "epoch": 0.6877003240997726, + "grad_norm": 2.228534460067749, + "learning_rate": 4.693541208772356e-05, + "loss": 1.5539, + "step": 19203 + }, + { + "epoch": 0.6877361362293409, + "grad_norm": 1.685943365097046, + "learning_rate": 4.692558119529778e-05, + "loss": 1.4786, + "step": 19204 + }, + { + "epoch": 0.6877719483589092, + "grad_norm": 1.4004324674606323, + "learning_rate": 4.691575101691985e-05, + "loss": 1.3005, + "step": 19205 + }, + { + "epoch": 0.6878077604884775, + "grad_norm": 1.7593801021575928, + "learning_rate": 4.6905921552722024e-05, + "loss": 1.1818, + "step": 19206 + }, + { + "epoch": 0.6878435726180457, + "grad_norm": 1.8382956981658936, + "learning_rate": 4.6896092802836555e-05, + "loss": 1.292, + "step": 19207 + }, + { + "epoch": 0.687879384747614, + "grad_norm": 1.941030502319336, + "learning_rate": 4.6886264767395635e-05, + "loss": 1.4367, + "step": 19208 + }, + { + "epoch": 0.6879151968771823, + "grad_norm": 2.078139543533325, + "learning_rate": 4.687643744653151e-05, + "loss": 1.4142, + "step": 19209 + }, + { + "epoch": 0.6879510090067505, + "grad_norm": 1.504701852798462, + "learning_rate": 4.6866610840376424e-05, + "loss": 1.4804, + "step": 19210 + }, + { + "epoch": 0.6879868211363188, + "grad_norm": 1.9218412637710571, + "learning_rate": 4.6856784949062516e-05, + "loss": 1.6285, + "step": 19211 + }, + { + "epoch": 0.6880226332658872, + "grad_norm": 1.6151492595672607, + "learning_rate": 4.6846959772722023e-05, + "loss": 1.4061, + "step": 19212 + }, + { + "epoch": 0.6880584453954555, + "grad_norm": 1.6147105693817139, + "learning_rate": 4.6837135311487125e-05, + "loss": 1.1338, + "step": 19213 + }, + { + "epoch": 0.6880942575250237, + "grad_norm": 1.5472468137741089, + "learning_rate": 4.6827311565490026e-05, + "loss": 1.6359, + "step": 19214 + }, + { + "epoch": 0.688130069654592, + "grad_norm": 1.5259733200073242, + "learning_rate": 4.681748853486283e-05, + "loss": 1.3292, + "step": 19215 + }, + { + "epoch": 0.6881658817841603, + "grad_norm": 1.8748228549957275, + "learning_rate": 4.6807666219737724e-05, + "loss": 1.451, + "step": 19216 + }, + { + "epoch": 0.6882016939137285, + "grad_norm": 1.626900315284729, + "learning_rate": 4.679784462024686e-05, + "loss": 1.6105, + "step": 19217 + }, + { + "epoch": 0.6882375060432968, + "grad_norm": 1.2625377178192139, + "learning_rate": 4.6788023736522405e-05, + "loss": 1.0924, + "step": 19218 + }, + { + "epoch": 0.6882733181728652, + "grad_norm": 1.3529118299484253, + "learning_rate": 4.677820356869641e-05, + "loss": 1.4678, + "step": 19219 + }, + { + "epoch": 0.6883091303024335, + "grad_norm": 2.293600082397461, + "learning_rate": 4.676838411690103e-05, + "loss": 1.5526, + "step": 19220 + }, + { + "epoch": 0.6883449424320017, + "grad_norm": 2.429054021835327, + "learning_rate": 4.675856538126843e-05, + "loss": 1.4861, + "step": 19221 + }, + { + "epoch": 0.68838075456157, + "grad_norm": 1.7435716390609741, + "learning_rate": 4.674874736193061e-05, + "loss": 1.6183, + "step": 19222 + }, + { + "epoch": 0.6884165666911383, + "grad_norm": 1.8326066732406616, + "learning_rate": 4.67389300590197e-05, + "loss": 1.3012, + "step": 19223 + }, + { + "epoch": 0.6884523788207065, + "grad_norm": 1.5112701654434204, + "learning_rate": 4.67291134726678e-05, + "loss": 1.1489, + "step": 19224 + }, + { + "epoch": 0.6884881909502748, + "grad_norm": 1.7756952047348022, + "learning_rate": 4.6719297603006994e-05, + "loss": 1.5206, + "step": 19225 + }, + { + "epoch": 0.6885240030798432, + "grad_norm": 2.031517744064331, + "learning_rate": 4.6709482450169275e-05, + "loss": 1.5716, + "step": 19226 + }, + { + "epoch": 0.6885598152094115, + "grad_norm": 1.7005447149276733, + "learning_rate": 4.6699668014286724e-05, + "loss": 1.6381, + "step": 19227 + }, + { + "epoch": 0.6885956273389797, + "grad_norm": 1.2920862436294556, + "learning_rate": 4.668985429549143e-05, + "loss": 1.6038, + "step": 19228 + }, + { + "epoch": 0.688631439468548, + "grad_norm": 1.2875529527664185, + "learning_rate": 4.6680041293915336e-05, + "loss": 1.2111, + "step": 19229 + }, + { + "epoch": 0.6886672515981163, + "grad_norm": 1.2606359720230103, + "learning_rate": 4.6670229009690516e-05, + "loss": 1.4477, + "step": 19230 + }, + { + "epoch": 0.6887030637276845, + "grad_norm": 2.3484201431274414, + "learning_rate": 4.666041744294898e-05, + "loss": 1.7855, + "step": 19231 + }, + { + "epoch": 0.6887388758572528, + "grad_norm": 1.6336703300476074, + "learning_rate": 4.665060659382274e-05, + "loss": 1.1235, + "step": 19232 + }, + { + "epoch": 0.6887746879868212, + "grad_norm": 2.369983196258545, + "learning_rate": 4.664079646244376e-05, + "loss": 1.323, + "step": 19233 + }, + { + "epoch": 0.6888105001163894, + "grad_norm": 1.3482604026794434, + "learning_rate": 4.663098704894402e-05, + "loss": 1.5943, + "step": 19234 + }, + { + "epoch": 0.6888463122459577, + "grad_norm": 1.468631386756897, + "learning_rate": 4.662117835345552e-05, + "loss": 1.4307, + "step": 19235 + }, + { + "epoch": 0.688882124375526, + "grad_norm": 1.9677053689956665, + "learning_rate": 4.661137037611024e-05, + "loss": 1.1598, + "step": 19236 + }, + { + "epoch": 0.6889179365050943, + "grad_norm": 1.6794158220291138, + "learning_rate": 4.660156311704007e-05, + "loss": 1.7906, + "step": 19237 + }, + { + "epoch": 0.6889537486346625, + "grad_norm": 1.7666298151016235, + "learning_rate": 4.659175657637699e-05, + "loss": 1.2263, + "step": 19238 + }, + { + "epoch": 0.6889895607642308, + "grad_norm": 2.3482303619384766, + "learning_rate": 4.658195075425297e-05, + "loss": 1.285, + "step": 19239 + }, + { + "epoch": 0.6890253728937992, + "grad_norm": 1.2648680210113525, + "learning_rate": 4.657214565079986e-05, + "loss": 1.4305, + "step": 19240 + }, + { + "epoch": 0.6890611850233674, + "grad_norm": 2.5876047611236572, + "learning_rate": 4.656234126614961e-05, + "loss": 1.7217, + "step": 19241 + }, + { + "epoch": 0.6890969971529357, + "grad_norm": 1.4185413122177124, + "learning_rate": 4.655253760043413e-05, + "loss": 1.5885, + "step": 19242 + }, + { + "epoch": 0.689132809282504, + "grad_norm": 1.3307768106460571, + "learning_rate": 4.654273465378536e-05, + "loss": 1.2001, + "step": 19243 + }, + { + "epoch": 0.6891686214120722, + "grad_norm": 1.5933369398117065, + "learning_rate": 4.65329324263351e-05, + "loss": 1.2176, + "step": 19244 + }, + { + "epoch": 0.6892044335416405, + "grad_norm": 1.4384766817092896, + "learning_rate": 4.652313091821526e-05, + "loss": 1.4176, + "step": 19245 + }, + { + "epoch": 0.6892402456712088, + "grad_norm": 1.2715067863464355, + "learning_rate": 4.651333012955775e-05, + "loss": 1.6592, + "step": 19246 + }, + { + "epoch": 0.6892760578007772, + "grad_norm": 1.4462275505065918, + "learning_rate": 4.650353006049436e-05, + "loss": 1.4901, + "step": 19247 + }, + { + "epoch": 0.6893118699303454, + "grad_norm": 1.8860747814178467, + "learning_rate": 4.649373071115697e-05, + "loss": 1.4205, + "step": 19248 + }, + { + "epoch": 0.6893476820599137, + "grad_norm": 1.4204922914505005, + "learning_rate": 4.6483932081677407e-05, + "loss": 1.3774, + "step": 19249 + }, + { + "epoch": 0.689383494189482, + "grad_norm": 1.5421233177185059, + "learning_rate": 4.647413417218756e-05, + "loss": 1.0301, + "step": 19250 + }, + { + "epoch": 0.6894193063190502, + "grad_norm": 1.686322808265686, + "learning_rate": 4.646433698281913e-05, + "loss": 1.5836, + "step": 19251 + }, + { + "epoch": 0.6894551184486185, + "grad_norm": 1.406529426574707, + "learning_rate": 4.645454051370401e-05, + "loss": 1.1692, + "step": 19252 + }, + { + "epoch": 0.6894909305781868, + "grad_norm": 1.3951549530029297, + "learning_rate": 4.644474476497397e-05, + "loss": 1.3326, + "step": 19253 + }, + { + "epoch": 0.6895267427077552, + "grad_norm": 1.7773816585540771, + "learning_rate": 4.6434949736760844e-05, + "loss": 1.3854, + "step": 19254 + }, + { + "epoch": 0.6895625548373234, + "grad_norm": 2.3042869567871094, + "learning_rate": 4.642515542919635e-05, + "loss": 1.5021, + "step": 19255 + }, + { + "epoch": 0.6895983669668917, + "grad_norm": 1.5629322528839111, + "learning_rate": 4.641536184241228e-05, + "loss": 1.4215, + "step": 19256 + }, + { + "epoch": 0.68963417909646, + "grad_norm": 1.878865122795105, + "learning_rate": 4.640556897654042e-05, + "loss": 1.3999, + "step": 19257 + }, + { + "epoch": 0.6896699912260282, + "grad_norm": 1.6263047456741333, + "learning_rate": 4.639577683171248e-05, + "loss": 1.5993, + "step": 19258 + }, + { + "epoch": 0.6897058033555965, + "grad_norm": 1.5195845365524292, + "learning_rate": 4.638598540806021e-05, + "loss": 1.4672, + "step": 19259 + }, + { + "epoch": 0.6897416154851648, + "grad_norm": 1.7965352535247803, + "learning_rate": 4.637619470571535e-05, + "loss": 1.7661, + "step": 19260 + }, + { + "epoch": 0.6897774276147332, + "grad_norm": 1.6480170488357544, + "learning_rate": 4.636640472480965e-05, + "loss": 1.4935, + "step": 19261 + }, + { + "epoch": 0.6898132397443014, + "grad_norm": 1.973185658454895, + "learning_rate": 4.635661546547476e-05, + "loss": 1.669, + "step": 19262 + }, + { + "epoch": 0.6898490518738697, + "grad_norm": 1.6498275995254517, + "learning_rate": 4.634682692784241e-05, + "loss": 1.2816, + "step": 19263 + }, + { + "epoch": 0.689884864003438, + "grad_norm": 1.2615090608596802, + "learning_rate": 4.6337039112044346e-05, + "loss": 1.011, + "step": 19264 + }, + { + "epoch": 0.6899206761330062, + "grad_norm": 1.4488906860351562, + "learning_rate": 4.632725201821215e-05, + "loss": 1.014, + "step": 19265 + }, + { + "epoch": 0.6899564882625745, + "grad_norm": 1.690446138381958, + "learning_rate": 4.6317465646477584e-05, + "loss": 1.1892, + "step": 19266 + }, + { + "epoch": 0.6899923003921428, + "grad_norm": 1.2341758012771606, + "learning_rate": 4.6307679996972205e-05, + "loss": 1.3111, + "step": 19267 + }, + { + "epoch": 0.6900281125217111, + "grad_norm": 1.505085825920105, + "learning_rate": 4.62978950698278e-05, + "loss": 1.3969, + "step": 19268 + }, + { + "epoch": 0.6900639246512794, + "grad_norm": 1.8536823987960815, + "learning_rate": 4.6288110865175914e-05, + "loss": 1.2291, + "step": 19269 + }, + { + "epoch": 0.6900997367808477, + "grad_norm": 1.2914131879806519, + "learning_rate": 4.627832738314821e-05, + "loss": 1.3716, + "step": 19270 + }, + { + "epoch": 0.690135548910416, + "grad_norm": 1.180411458015442, + "learning_rate": 4.6268544623876364e-05, + "loss": 1.2842, + "step": 19271 + }, + { + "epoch": 0.6901713610399842, + "grad_norm": 1.6704957485198975, + "learning_rate": 4.625876258749189e-05, + "loss": 1.3887, + "step": 19272 + }, + { + "epoch": 0.6902071731695525, + "grad_norm": 1.6580283641815186, + "learning_rate": 4.624898127412649e-05, + "loss": 1.4075, + "step": 19273 + }, + { + "epoch": 0.6902429852991208, + "grad_norm": 2.034627914428711, + "learning_rate": 4.623920068391163e-05, + "loss": 1.2691, + "step": 19274 + }, + { + "epoch": 0.6902787974286891, + "grad_norm": 1.591990351676941, + "learning_rate": 4.622942081697906e-05, + "loss": 1.4743, + "step": 19275 + }, + { + "epoch": 0.6903146095582574, + "grad_norm": 1.417268991470337, + "learning_rate": 4.6219641673460236e-05, + "loss": 1.257, + "step": 19276 + }, + { + "epoch": 0.6903504216878257, + "grad_norm": 1.6440649032592773, + "learning_rate": 4.62098632534868e-05, + "loss": 1.2822, + "step": 19277 + }, + { + "epoch": 0.690386233817394, + "grad_norm": 1.5523018836975098, + "learning_rate": 4.620008555719019e-05, + "loss": 1.3786, + "step": 19278 + }, + { + "epoch": 0.6904220459469622, + "grad_norm": 1.6315157413482666, + "learning_rate": 4.619030858470211e-05, + "loss": 1.5197, + "step": 19279 + }, + { + "epoch": 0.6904578580765305, + "grad_norm": 1.3777469396591187, + "learning_rate": 4.6180532336154014e-05, + "loss": 1.1425, + "step": 19280 + }, + { + "epoch": 0.6904936702060988, + "grad_norm": 1.4311867952346802, + "learning_rate": 4.617075681167736e-05, + "loss": 1.2902, + "step": 19281 + }, + { + "epoch": 0.6905294823356671, + "grad_norm": 1.3356856107711792, + "learning_rate": 4.616098201140382e-05, + "loss": 1.4217, + "step": 19282 + }, + { + "epoch": 0.6905652944652354, + "grad_norm": 1.3175742626190186, + "learning_rate": 4.615120793546478e-05, + "loss": 1.2804, + "step": 19283 + }, + { + "epoch": 0.6906011065948037, + "grad_norm": 1.5983734130859375, + "learning_rate": 4.6141434583991803e-05, + "loss": 1.4278, + "step": 19284 + }, + { + "epoch": 0.6906369187243719, + "grad_norm": 1.3159353733062744, + "learning_rate": 4.613166195711629e-05, + "loss": 1.3914, + "step": 19285 + }, + { + "epoch": 0.6906727308539402, + "grad_norm": 1.8552560806274414, + "learning_rate": 4.612189005496985e-05, + "loss": 1.2788, + "step": 19286 + }, + { + "epoch": 0.6907085429835085, + "grad_norm": 1.4018633365631104, + "learning_rate": 4.611211887768384e-05, + "loss": 1.4291, + "step": 19287 + }, + { + "epoch": 0.6907443551130767, + "grad_norm": 1.557361364364624, + "learning_rate": 4.6102348425389804e-05, + "loss": 1.2867, + "step": 19288 + }, + { + "epoch": 0.6907801672426451, + "grad_norm": 1.355259895324707, + "learning_rate": 4.609257869821911e-05, + "loss": 1.1086, + "step": 19289 + }, + { + "epoch": 0.6908159793722134, + "grad_norm": 1.498028039932251, + "learning_rate": 4.608280969630323e-05, + "loss": 1.7323, + "step": 19290 + }, + { + "epoch": 0.6908517915017817, + "grad_norm": 1.7910292148590088, + "learning_rate": 4.6073041419773635e-05, + "loss": 1.2225, + "step": 19291 + }, + { + "epoch": 0.6908876036313499, + "grad_norm": 1.6466988325119019, + "learning_rate": 4.606327386876167e-05, + "loss": 1.3962, + "step": 19292 + }, + { + "epoch": 0.6909234157609182, + "grad_norm": 1.5631616115570068, + "learning_rate": 4.605350704339879e-05, + "loss": 1.4526, + "step": 19293 + }, + { + "epoch": 0.6909592278904865, + "grad_norm": 2.4088470935821533, + "learning_rate": 4.604374094381637e-05, + "loss": 1.8184, + "step": 19294 + }, + { + "epoch": 0.6909950400200547, + "grad_norm": 1.8684895038604736, + "learning_rate": 4.603397557014587e-05, + "loss": 1.5012, + "step": 19295 + }, + { + "epoch": 0.6910308521496231, + "grad_norm": 1.6042251586914062, + "learning_rate": 4.602421092251854e-05, + "loss": 1.2943, + "step": 19296 + }, + { + "epoch": 0.6910666642791914, + "grad_norm": 2.03073787689209, + "learning_rate": 4.60144470010659e-05, + "loss": 1.9343, + "step": 19297 + }, + { + "epoch": 0.6911024764087597, + "grad_norm": 1.6332614421844482, + "learning_rate": 4.600468380591923e-05, + "loss": 1.4177, + "step": 19298 + }, + { + "epoch": 0.6911382885383279, + "grad_norm": 1.823080062866211, + "learning_rate": 4.599492133720986e-05, + "loss": 1.283, + "step": 19299 + }, + { + "epoch": 0.6911741006678962, + "grad_norm": 2.0920989513397217, + "learning_rate": 4.598515959506917e-05, + "loss": 1.3483, + "step": 19300 + }, + { + "epoch": 0.6912099127974645, + "grad_norm": 1.4750959873199463, + "learning_rate": 4.597539857962848e-05, + "loss": 1.3174, + "step": 19301 + }, + { + "epoch": 0.6912457249270327, + "grad_norm": 1.3776130676269531, + "learning_rate": 4.5965638291019145e-05, + "loss": 1.525, + "step": 19302 + }, + { + "epoch": 0.6912815370566011, + "grad_norm": 1.7445027828216553, + "learning_rate": 4.595587872937241e-05, + "loss": 1.5717, + "step": 19303 + }, + { + "epoch": 0.6913173491861694, + "grad_norm": 1.7076929807662964, + "learning_rate": 4.594611989481963e-05, + "loss": 1.1559, + "step": 19304 + }, + { + "epoch": 0.6913531613157377, + "grad_norm": 1.4765623807907104, + "learning_rate": 4.593636178749206e-05, + "loss": 1.6233, + "step": 19305 + }, + { + "epoch": 0.6913889734453059, + "grad_norm": 1.6138931512832642, + "learning_rate": 4.592660440752107e-05, + "loss": 1.3967, + "step": 19306 + }, + { + "epoch": 0.6914247855748742, + "grad_norm": 1.3620359897613525, + "learning_rate": 4.5916847755037806e-05, + "loss": 1.349, + "step": 19307 + }, + { + "epoch": 0.6914605977044425, + "grad_norm": 1.8544646501541138, + "learning_rate": 4.590709183017361e-05, + "loss": 1.5375, + "step": 19308 + }, + { + "epoch": 0.6914964098340107, + "grad_norm": 1.3989589214324951, + "learning_rate": 4.5897336633059737e-05, + "loss": 1.3476, + "step": 19309 + }, + { + "epoch": 0.6915322219635791, + "grad_norm": 2.901787281036377, + "learning_rate": 4.588758216382739e-05, + "loss": 1.6608, + "step": 19310 + }, + { + "epoch": 0.6915680340931474, + "grad_norm": 1.9451749324798584, + "learning_rate": 4.5877828422607824e-05, + "loss": 1.7291, + "step": 19311 + }, + { + "epoch": 0.6916038462227156, + "grad_norm": 1.8403728008270264, + "learning_rate": 4.5868075409532265e-05, + "loss": 1.2411, + "step": 19312 + }, + { + "epoch": 0.6916396583522839, + "grad_norm": 1.5571632385253906, + "learning_rate": 4.585832312473196e-05, + "loss": 1.6115, + "step": 19313 + }, + { + "epoch": 0.6916754704818522, + "grad_norm": 2.0901341438293457, + "learning_rate": 4.584857156833804e-05, + "loss": 1.6583, + "step": 19314 + }, + { + "epoch": 0.6917112826114205, + "grad_norm": 1.7531592845916748, + "learning_rate": 4.583882074048174e-05, + "loss": 1.2701, + "step": 19315 + }, + { + "epoch": 0.6917470947409887, + "grad_norm": 1.3582504987716675, + "learning_rate": 4.582907064129428e-05, + "loss": 1.3205, + "step": 19316 + }, + { + "epoch": 0.6917829068705571, + "grad_norm": 1.857133150100708, + "learning_rate": 4.5819321270906765e-05, + "loss": 1.5085, + "step": 19317 + }, + { + "epoch": 0.6918187190001254, + "grad_norm": 2.7562215328216553, + "learning_rate": 4.580957262945039e-05, + "loss": 1.4333, + "step": 19318 + }, + { + "epoch": 0.6918545311296936, + "grad_norm": 2.172489881515503, + "learning_rate": 4.5799824717056325e-05, + "loss": 1.6725, + "step": 19319 + }, + { + "epoch": 0.6918903432592619, + "grad_norm": 1.492525577545166, + "learning_rate": 4.579007753385573e-05, + "loss": 1.4247, + "step": 19320 + }, + { + "epoch": 0.6919261553888302, + "grad_norm": 1.6404074430465698, + "learning_rate": 4.578033107997969e-05, + "loss": 1.5811, + "step": 19321 + }, + { + "epoch": 0.6919619675183984, + "grad_norm": 1.5491911172866821, + "learning_rate": 4.577058535555935e-05, + "loss": 1.5538, + "step": 19322 + }, + { + "epoch": 0.6919977796479667, + "grad_norm": 1.4528319835662842, + "learning_rate": 4.576084036072584e-05, + "loss": 1.2544, + "step": 19323 + }, + { + "epoch": 0.6920335917775351, + "grad_norm": 1.7799615859985352, + "learning_rate": 4.575109609561029e-05, + "loss": 1.4805, + "step": 19324 + }, + { + "epoch": 0.6920694039071034, + "grad_norm": 1.4052472114562988, + "learning_rate": 4.5741352560343734e-05, + "loss": 1.364, + "step": 19325 + }, + { + "epoch": 0.6921052160366716, + "grad_norm": 1.4462934732437134, + "learning_rate": 4.5731609755057284e-05, + "loss": 1.5724, + "step": 19326 + }, + { + "epoch": 0.6921410281662399, + "grad_norm": 2.7505440711975098, + "learning_rate": 4.572186767988206e-05, + "loss": 1.4352, + "step": 19327 + }, + { + "epoch": 0.6921768402958082, + "grad_norm": 2.1405277252197266, + "learning_rate": 4.571212633494906e-05, + "loss": 1.8836, + "step": 19328 + }, + { + "epoch": 0.6922126524253764, + "grad_norm": 1.584587574005127, + "learning_rate": 4.5702385720389376e-05, + "loss": 1.2718, + "step": 19329 + }, + { + "epoch": 0.6922484645549447, + "grad_norm": 1.6535595655441284, + "learning_rate": 4.569264583633405e-05, + "loss": 1.3507, + "step": 19330 + }, + { + "epoch": 0.6922842766845131, + "grad_norm": 1.9386780261993408, + "learning_rate": 4.568290668291416e-05, + "loss": 1.345, + "step": 19331 + }, + { + "epoch": 0.6923200888140814, + "grad_norm": 1.418664813041687, + "learning_rate": 4.567316826026066e-05, + "loss": 1.1447, + "step": 19332 + }, + { + "epoch": 0.6923559009436496, + "grad_norm": 1.544927716255188, + "learning_rate": 4.5663430568504603e-05, + "loss": 1.5124, + "step": 19333 + }, + { + "epoch": 0.6923917130732179, + "grad_norm": 1.3872967958450317, + "learning_rate": 4.565369360777704e-05, + "loss": 1.6032, + "step": 19334 + }, + { + "epoch": 0.6924275252027862, + "grad_norm": 1.791256308555603, + "learning_rate": 4.564395737820888e-05, + "loss": 1.7009, + "step": 19335 + }, + { + "epoch": 0.6924633373323544, + "grad_norm": 1.7304942607879639, + "learning_rate": 4.563422187993117e-05, + "loss": 1.3953, + "step": 19336 + }, + { + "epoch": 0.6924991494619227, + "grad_norm": 1.745445966720581, + "learning_rate": 4.5624487113074874e-05, + "loss": 1.4433, + "step": 19337 + }, + { + "epoch": 0.6925349615914911, + "grad_norm": 1.7737088203430176, + "learning_rate": 4.5614753077771e-05, + "loss": 1.6636, + "step": 19338 + }, + { + "epoch": 0.6925707737210594, + "grad_norm": 1.6347934007644653, + "learning_rate": 4.560501977415044e-05, + "loss": 1.6366, + "step": 19339 + }, + { + "epoch": 0.6926065858506276, + "grad_norm": 1.919651985168457, + "learning_rate": 4.5595287202344175e-05, + "loss": 1.5327, + "step": 19340 + }, + { + "epoch": 0.6926423979801959, + "grad_norm": 1.5812537670135498, + "learning_rate": 4.558555536248313e-05, + "loss": 1.6632, + "step": 19341 + }, + { + "epoch": 0.6926782101097642, + "grad_norm": 1.4711294174194336, + "learning_rate": 4.55758242546983e-05, + "loss": 1.4774, + "step": 19342 + }, + { + "epoch": 0.6927140222393324, + "grad_norm": 1.3836665153503418, + "learning_rate": 4.5566093879120505e-05, + "loss": 1.2167, + "step": 19343 + }, + { + "epoch": 0.6927498343689007, + "grad_norm": 1.8161534070968628, + "learning_rate": 4.555636423588071e-05, + "loss": 1.4929, + "step": 19344 + }, + { + "epoch": 0.6927856464984691, + "grad_norm": 1.7006677389144897, + "learning_rate": 4.5546635325109844e-05, + "loss": 1.1194, + "step": 19345 + }, + { + "epoch": 0.6928214586280373, + "grad_norm": 1.4193700551986694, + "learning_rate": 4.553690714693872e-05, + "loss": 1.4037, + "step": 19346 + }, + { + "epoch": 0.6928572707576056, + "grad_norm": 1.9114856719970703, + "learning_rate": 4.5527179701498256e-05, + "loss": 1.4374, + "step": 19347 + }, + { + "epoch": 0.6928930828871739, + "grad_norm": 1.6580359935760498, + "learning_rate": 4.551745298891933e-05, + "loss": 1.3423, + "step": 19348 + }, + { + "epoch": 0.6929288950167422, + "grad_norm": 1.9556456804275513, + "learning_rate": 4.5507727009332824e-05, + "loss": 1.2934, + "step": 19349 + }, + { + "epoch": 0.6929647071463104, + "grad_norm": 2.0172038078308105, + "learning_rate": 4.549800176286954e-05, + "loss": 1.599, + "step": 19350 + }, + { + "epoch": 0.6930005192758787, + "grad_norm": 2.1281747817993164, + "learning_rate": 4.5488277249660325e-05, + "loss": 1.6343, + "step": 19351 + }, + { + "epoch": 0.6930363314054471, + "grad_norm": 1.7381844520568848, + "learning_rate": 4.5478553469836064e-05, + "loss": 1.2599, + "step": 19352 + }, + { + "epoch": 0.6930721435350153, + "grad_norm": 2.3697657585144043, + "learning_rate": 4.546883042352751e-05, + "loss": 1.7014, + "step": 19353 + }, + { + "epoch": 0.6931079556645836, + "grad_norm": 2.5228118896484375, + "learning_rate": 4.545910811086549e-05, + "loss": 1.4948, + "step": 19354 + }, + { + "epoch": 0.6931437677941519, + "grad_norm": 1.3983299732208252, + "learning_rate": 4.544938653198082e-05, + "loss": 1.4429, + "step": 19355 + }, + { + "epoch": 0.6931795799237201, + "grad_norm": 1.7777349948883057, + "learning_rate": 4.543966568700433e-05, + "loss": 1.4876, + "step": 19356 + }, + { + "epoch": 0.6932153920532884, + "grad_norm": 1.5651066303253174, + "learning_rate": 4.542994557606672e-05, + "loss": 1.683, + "step": 19357 + }, + { + "epoch": 0.6932512041828567, + "grad_norm": 1.167218565940857, + "learning_rate": 4.542022619929881e-05, + "loss": 1.5015, + "step": 19358 + }, + { + "epoch": 0.6932870163124251, + "grad_norm": 1.1746838092803955, + "learning_rate": 4.541050755683135e-05, + "loss": 1.4014, + "step": 19359 + }, + { + "epoch": 0.6933228284419933, + "grad_norm": 1.9746357202529907, + "learning_rate": 4.5400789648795136e-05, + "loss": 1.5701, + "step": 19360 + }, + { + "epoch": 0.6933586405715616, + "grad_norm": 2.039637804031372, + "learning_rate": 4.539107247532086e-05, + "loss": 1.6741, + "step": 19361 + }, + { + "epoch": 0.6933944527011299, + "grad_norm": 1.581687331199646, + "learning_rate": 4.5381356036539204e-05, + "loss": 1.2782, + "step": 19362 + }, + { + "epoch": 0.6934302648306981, + "grad_norm": 1.605271577835083, + "learning_rate": 4.537164033258101e-05, + "loss": 1.582, + "step": 19363 + }, + { + "epoch": 0.6934660769602664, + "grad_norm": 1.3678474426269531, + "learning_rate": 4.53619253635769e-05, + "loss": 1.5372, + "step": 19364 + }, + { + "epoch": 0.6935018890898347, + "grad_norm": 1.3803857564926147, + "learning_rate": 4.5352211129657596e-05, + "loss": 1.2848, + "step": 19365 + }, + { + "epoch": 0.6935377012194031, + "grad_norm": 1.4274797439575195, + "learning_rate": 4.5342497630953806e-05, + "loss": 1.5293, + "step": 19366 + }, + { + "epoch": 0.6935735133489713, + "grad_norm": 1.4727002382278442, + "learning_rate": 4.533278486759625e-05, + "loss": 1.2364, + "step": 19367 + }, + { + "epoch": 0.6936093254785396, + "grad_norm": 1.8014189004898071, + "learning_rate": 4.5323072839715555e-05, + "loss": 1.4292, + "step": 19368 + }, + { + "epoch": 0.6936451376081079, + "grad_norm": 1.8191452026367188, + "learning_rate": 4.531336154744231e-05, + "loss": 1.4423, + "step": 19369 + }, + { + "epoch": 0.6936809497376761, + "grad_norm": 1.5207409858703613, + "learning_rate": 4.530365099090732e-05, + "loss": 1.5638, + "step": 19370 + }, + { + "epoch": 0.6937167618672444, + "grad_norm": 1.5536220073699951, + "learning_rate": 4.5293941170241116e-05, + "loss": 1.5002, + "step": 19371 + }, + { + "epoch": 0.6937525739968127, + "grad_norm": 2.320056200027466, + "learning_rate": 4.528423208557441e-05, + "loss": 1.3389, + "step": 19372 + }, + { + "epoch": 0.693788386126381, + "grad_norm": 1.6876623630523682, + "learning_rate": 4.52745237370377e-05, + "loss": 1.2901, + "step": 19373 + }, + { + "epoch": 0.6938241982559493, + "grad_norm": 1.3508429527282715, + "learning_rate": 4.526481612476176e-05, + "loss": 1.3113, + "step": 19374 + }, + { + "epoch": 0.6938600103855176, + "grad_norm": 1.5658550262451172, + "learning_rate": 4.525510924887707e-05, + "loss": 1.5774, + "step": 19375 + }, + { + "epoch": 0.6938958225150859, + "grad_norm": 1.841926097869873, + "learning_rate": 4.524540310951432e-05, + "loss": 1.3422, + "step": 19376 + }, + { + "epoch": 0.6939316346446541, + "grad_norm": 2.058666467666626, + "learning_rate": 4.5235697706804e-05, + "loss": 1.4251, + "step": 19377 + }, + { + "epoch": 0.6939674467742224, + "grad_norm": 1.5321693420410156, + "learning_rate": 4.5225993040876724e-05, + "loss": 1.5993, + "step": 19378 + }, + { + "epoch": 0.6940032589037907, + "grad_norm": 1.959938645362854, + "learning_rate": 4.521628911186311e-05, + "loss": 1.6419, + "step": 19379 + }, + { + "epoch": 0.694039071033359, + "grad_norm": 1.6654266119003296, + "learning_rate": 4.5206585919893563e-05, + "loss": 1.5531, + "step": 19380 + }, + { + "epoch": 0.6940748831629273, + "grad_norm": 2.1616580486297607, + "learning_rate": 4.519688346509881e-05, + "loss": 1.935, + "step": 19381 + }, + { + "epoch": 0.6941106952924956, + "grad_norm": 1.5678056478500366, + "learning_rate": 4.5187181747609265e-05, + "loss": 1.4482, + "step": 19382 + }, + { + "epoch": 0.6941465074220639, + "grad_norm": 1.2296350002288818, + "learning_rate": 4.5177480767555525e-05, + "loss": 1.4928, + "step": 19383 + }, + { + "epoch": 0.6941823195516321, + "grad_norm": 1.5905427932739258, + "learning_rate": 4.516778052506798e-05, + "loss": 1.4104, + "step": 19384 + }, + { + "epoch": 0.6942181316812004, + "grad_norm": 2.694613456726074, + "learning_rate": 4.5158081020277296e-05, + "loss": 1.6461, + "step": 19385 + }, + { + "epoch": 0.6942539438107687, + "grad_norm": 1.8506790399551392, + "learning_rate": 4.5148382253313904e-05, + "loss": 1.4123, + "step": 19386 + }, + { + "epoch": 0.694289755940337, + "grad_norm": 1.9660435914993286, + "learning_rate": 4.513868422430823e-05, + "loss": 1.4974, + "step": 19387 + }, + { + "epoch": 0.6943255680699053, + "grad_norm": 1.9675049781799316, + "learning_rate": 4.5128986933390785e-05, + "loss": 1.1586, + "step": 19388 + }, + { + "epoch": 0.6943613801994736, + "grad_norm": 2.04257869720459, + "learning_rate": 4.5119290380692046e-05, + "loss": 1.4606, + "step": 19389 + }, + { + "epoch": 0.6943971923290418, + "grad_norm": 1.488710641860962, + "learning_rate": 4.5109594566342515e-05, + "loss": 1.605, + "step": 19390 + }, + { + "epoch": 0.6944330044586101, + "grad_norm": 1.9784034490585327, + "learning_rate": 4.50998994904725e-05, + "loss": 1.1938, + "step": 19391 + }, + { + "epoch": 0.6944688165881784, + "grad_norm": 1.4609148502349854, + "learning_rate": 4.50902051532126e-05, + "loss": 1.236, + "step": 19392 + }, + { + "epoch": 0.6945046287177467, + "grad_norm": 4.472887992858887, + "learning_rate": 4.508051155469312e-05, + "loss": 1.8098, + "step": 19393 + }, + { + "epoch": 0.694540440847315, + "grad_norm": 1.397859811782837, + "learning_rate": 4.5070818695044545e-05, + "loss": 1.4173, + "step": 19394 + }, + { + "epoch": 0.6945762529768833, + "grad_norm": 1.3701924085617065, + "learning_rate": 4.5061126574397225e-05, + "loss": 1.1701, + "step": 19395 + }, + { + "epoch": 0.6946120651064516, + "grad_norm": 1.419206976890564, + "learning_rate": 4.505143519288159e-05, + "loss": 1.4127, + "step": 19396 + }, + { + "epoch": 0.6946478772360198, + "grad_norm": 1.7711102962493896, + "learning_rate": 4.504174455062803e-05, + "loss": 1.4287, + "step": 19397 + }, + { + "epoch": 0.6946836893655881, + "grad_norm": 1.9416470527648926, + "learning_rate": 4.503205464776689e-05, + "loss": 1.193, + "step": 19398 + }, + { + "epoch": 0.6947195014951564, + "grad_norm": 1.6379601955413818, + "learning_rate": 4.502236548442853e-05, + "loss": 1.1867, + "step": 19399 + }, + { + "epoch": 0.6947553136247246, + "grad_norm": 1.5109336376190186, + "learning_rate": 4.501267706074335e-05, + "loss": 1.0608, + "step": 19400 + }, + { + "epoch": 0.694791125754293, + "grad_norm": 1.4156792163848877, + "learning_rate": 4.5002989376841684e-05, + "loss": 1.2618, + "step": 19401 + }, + { + "epoch": 0.6948269378838613, + "grad_norm": 1.7646799087524414, + "learning_rate": 4.499330243285383e-05, + "loss": 1.5062, + "step": 19402 + }, + { + "epoch": 0.6948627500134296, + "grad_norm": 1.45806884765625, + "learning_rate": 4.4983616228910144e-05, + "loss": 1.4862, + "step": 19403 + }, + { + "epoch": 0.6948985621429978, + "grad_norm": 2.211923837661743, + "learning_rate": 4.497393076514097e-05, + "loss": 1.3923, + "step": 19404 + }, + { + "epoch": 0.6949343742725661, + "grad_norm": 1.3924918174743652, + "learning_rate": 4.496424604167654e-05, + "loss": 1.6712, + "step": 19405 + }, + { + "epoch": 0.6949701864021344, + "grad_norm": 1.4601891040802002, + "learning_rate": 4.4954562058647187e-05, + "loss": 1.4383, + "step": 19406 + }, + { + "epoch": 0.6950059985317026, + "grad_norm": 1.7488446235656738, + "learning_rate": 4.49448788161832e-05, + "loss": 1.5504, + "step": 19407 + }, + { + "epoch": 0.695041810661271, + "grad_norm": 3.262617349624634, + "learning_rate": 4.493519631441488e-05, + "loss": 1.5263, + "step": 19408 + }, + { + "epoch": 0.6950776227908393, + "grad_norm": 1.4852958917617798, + "learning_rate": 4.492551455347245e-05, + "loss": 1.5381, + "step": 19409 + }, + { + "epoch": 0.6951134349204076, + "grad_norm": 1.5856446027755737, + "learning_rate": 4.491583353348616e-05, + "loss": 1.4643, + "step": 19410 + }, + { + "epoch": 0.6951492470499758, + "grad_norm": 2.515482187271118, + "learning_rate": 4.490615325458628e-05, + "loss": 1.7184, + "step": 19411 + }, + { + "epoch": 0.6951850591795441, + "grad_norm": 1.6691640615463257, + "learning_rate": 4.4896473716903085e-05, + "loss": 1.5105, + "step": 19412 + }, + { + "epoch": 0.6952208713091124, + "grad_norm": 1.558199405670166, + "learning_rate": 4.488679492056672e-05, + "loss": 1.3742, + "step": 19413 + }, + { + "epoch": 0.6952566834386806, + "grad_norm": 1.819074034690857, + "learning_rate": 4.487711686570744e-05, + "loss": 1.3235, + "step": 19414 + }, + { + "epoch": 0.695292495568249, + "grad_norm": 2.806718587875366, + "learning_rate": 4.4867439552455485e-05, + "loss": 1.4652, + "step": 19415 + }, + { + "epoch": 0.6953283076978173, + "grad_norm": 1.7254616022109985, + "learning_rate": 4.4857762980940974e-05, + "loss": 1.3435, + "step": 19416 + }, + { + "epoch": 0.6953641198273856, + "grad_norm": 1.4609277248382568, + "learning_rate": 4.484808715129414e-05, + "loss": 1.6299, + "step": 19417 + }, + { + "epoch": 0.6953999319569538, + "grad_norm": 2.2493855953216553, + "learning_rate": 4.483841206364514e-05, + "loss": 1.7297, + "step": 19418 + }, + { + "epoch": 0.6954357440865221, + "grad_norm": 1.397451639175415, + "learning_rate": 4.4828737718124204e-05, + "loss": 1.3422, + "step": 19419 + }, + { + "epoch": 0.6954715562160904, + "grad_norm": 1.560224175453186, + "learning_rate": 4.481906411486139e-05, + "loss": 1.0808, + "step": 19420 + }, + { + "epoch": 0.6955073683456586, + "grad_norm": 1.8856585025787354, + "learning_rate": 4.480939125398689e-05, + "loss": 1.556, + "step": 19421 + }, + { + "epoch": 0.695543180475227, + "grad_norm": 1.7038389444351196, + "learning_rate": 4.479971913563088e-05, + "loss": 1.2636, + "step": 19422 + }, + { + "epoch": 0.6955789926047953, + "grad_norm": 1.789746642112732, + "learning_rate": 4.4790047759923406e-05, + "loss": 1.4515, + "step": 19423 + }, + { + "epoch": 0.6956148047343635, + "grad_norm": 1.6814122200012207, + "learning_rate": 4.478037712699463e-05, + "loss": 1.6073, + "step": 19424 + }, + { + "epoch": 0.6956506168639318, + "grad_norm": 1.5029243230819702, + "learning_rate": 4.477070723697464e-05, + "loss": 1.3639, + "step": 19425 + }, + { + "epoch": 0.6956864289935001, + "grad_norm": 1.991762638092041, + "learning_rate": 4.476103808999359e-05, + "loss": 1.288, + "step": 19426 + }, + { + "epoch": 0.6957222411230684, + "grad_norm": 1.5815709829330444, + "learning_rate": 4.475136968618149e-05, + "loss": 1.4307, + "step": 19427 + }, + { + "epoch": 0.6957580532526366, + "grad_norm": 1.5167605876922607, + "learning_rate": 4.474170202566843e-05, + "loss": 1.3207, + "step": 19428 + }, + { + "epoch": 0.695793865382205, + "grad_norm": 1.501737356185913, + "learning_rate": 4.47320351085845e-05, + "loss": 1.1018, + "step": 19429 + }, + { + "epoch": 0.6958296775117733, + "grad_norm": 1.815674901008606, + "learning_rate": 4.472236893505978e-05, + "loss": 1.4576, + "step": 19430 + }, + { + "epoch": 0.6958654896413415, + "grad_norm": 1.786224603652954, + "learning_rate": 4.471270350522424e-05, + "loss": 1.2835, + "step": 19431 + }, + { + "epoch": 0.6959013017709098, + "grad_norm": 1.9200912714004517, + "learning_rate": 4.4703038819207975e-05, + "loss": 1.4545, + "step": 19432 + }, + { + "epoch": 0.6959371139004781, + "grad_norm": 2.014310359954834, + "learning_rate": 4.4693374877141015e-05, + "loss": 1.459, + "step": 19433 + }, + { + "epoch": 0.6959729260300463, + "grad_norm": 1.2802836894989014, + "learning_rate": 4.4683711679153325e-05, + "loss": 1.4649, + "step": 19434 + }, + { + "epoch": 0.6960087381596146, + "grad_norm": 1.7526559829711914, + "learning_rate": 4.467404922537495e-05, + "loss": 1.3993, + "step": 19435 + }, + { + "epoch": 0.696044550289183, + "grad_norm": 1.5115348100662231, + "learning_rate": 4.466438751593587e-05, + "loss": 1.4426, + "step": 19436 + }, + { + "epoch": 0.6960803624187513, + "grad_norm": 2.1051738262176514, + "learning_rate": 4.465472655096611e-05, + "loss": 1.542, + "step": 19437 + }, + { + "epoch": 0.6961161745483195, + "grad_norm": 1.828957438468933, + "learning_rate": 4.464506633059559e-05, + "loss": 1.3994, + "step": 19438 + }, + { + "epoch": 0.6961519866778878, + "grad_norm": 1.8577128648757935, + "learning_rate": 4.463540685495429e-05, + "loss": 1.5155, + "step": 19439 + }, + { + "epoch": 0.6961877988074561, + "grad_norm": 1.6411844491958618, + "learning_rate": 4.4625748124172204e-05, + "loss": 1.7316, + "step": 19440 + }, + { + "epoch": 0.6962236109370243, + "grad_norm": 1.6481902599334717, + "learning_rate": 4.461609013837923e-05, + "loss": 1.1623, + "step": 19441 + }, + { + "epoch": 0.6962594230665926, + "grad_norm": 1.815705418586731, + "learning_rate": 4.460643289770532e-05, + "loss": 1.5606, + "step": 19442 + }, + { + "epoch": 0.696295235196161, + "grad_norm": 2.0571019649505615, + "learning_rate": 4.4596776402280396e-05, + "loss": 1.4951, + "step": 19443 + }, + { + "epoch": 0.6963310473257293, + "grad_norm": 1.345685362815857, + "learning_rate": 4.458712065223442e-05, + "loss": 1.5385, + "step": 19444 + }, + { + "epoch": 0.6963668594552975, + "grad_norm": 2.47963285446167, + "learning_rate": 4.4577465647697223e-05, + "loss": 1.3012, + "step": 19445 + }, + { + "epoch": 0.6964026715848658, + "grad_norm": 1.4152905941009521, + "learning_rate": 4.456781138879873e-05, + "loss": 1.0918, + "step": 19446 + }, + { + "epoch": 0.6964384837144341, + "grad_norm": 1.7984778881072998, + "learning_rate": 4.455815787566884e-05, + "loss": 1.6954, + "step": 19447 + }, + { + "epoch": 0.6964742958440023, + "grad_norm": 1.517166018486023, + "learning_rate": 4.454850510843745e-05, + "loss": 1.5157, + "step": 19448 + }, + { + "epoch": 0.6965101079735706, + "grad_norm": 1.8566745519638062, + "learning_rate": 4.45388530872344e-05, + "loss": 1.1924, + "step": 19449 + }, + { + "epoch": 0.696545920103139, + "grad_norm": 1.6976255178451538, + "learning_rate": 4.452920181218947e-05, + "loss": 1.5863, + "step": 19450 + }, + { + "epoch": 0.6965817322327073, + "grad_norm": 1.6378042697906494, + "learning_rate": 4.451955128343266e-05, + "loss": 1.5929, + "step": 19451 + }, + { + "epoch": 0.6966175443622755, + "grad_norm": 1.7423259019851685, + "learning_rate": 4.450990150109367e-05, + "loss": 1.5762, + "step": 19452 + }, + { + "epoch": 0.6966533564918438, + "grad_norm": 1.9417845010757446, + "learning_rate": 4.4500252465302384e-05, + "loss": 1.7302, + "step": 19453 + }, + { + "epoch": 0.6966891686214121, + "grad_norm": 1.3983720541000366, + "learning_rate": 4.449060417618861e-05, + "loss": 1.582, + "step": 19454 + }, + { + "epoch": 0.6967249807509803, + "grad_norm": 1.4114207029342651, + "learning_rate": 4.44809566338822e-05, + "loss": 1.5144, + "step": 19455 + }, + { + "epoch": 0.6967607928805486, + "grad_norm": 1.3205265998840332, + "learning_rate": 4.447130983851285e-05, + "loss": 1.6001, + "step": 19456 + }, + { + "epoch": 0.696796605010117, + "grad_norm": 1.3919918537139893, + "learning_rate": 4.446166379021042e-05, + "loss": 1.5457, + "step": 19457 + }, + { + "epoch": 0.6968324171396852, + "grad_norm": 1.9750583171844482, + "learning_rate": 4.4452018489104684e-05, + "loss": 1.4017, + "step": 19458 + }, + { + "epoch": 0.6968682292692535, + "grad_norm": 1.6426039934158325, + "learning_rate": 4.4442373935325364e-05, + "loss": 1.3513, + "step": 19459 + }, + { + "epoch": 0.6969040413988218, + "grad_norm": 1.6933245658874512, + "learning_rate": 4.4432730129002265e-05, + "loss": 1.542, + "step": 19460 + }, + { + "epoch": 0.69693985352839, + "grad_norm": 1.6183847188949585, + "learning_rate": 4.442308707026504e-05, + "loss": 1.3853, + "step": 19461 + }, + { + "epoch": 0.6969756656579583, + "grad_norm": 1.628896713256836, + "learning_rate": 4.4413444759243564e-05, + "loss": 1.4663, + "step": 19462 + }, + { + "epoch": 0.6970114777875266, + "grad_norm": 1.992426872253418, + "learning_rate": 4.440380319606744e-05, + "loss": 2.0699, + "step": 19463 + }, + { + "epoch": 0.697047289917095, + "grad_norm": 1.5472890138626099, + "learning_rate": 4.439416238086643e-05, + "loss": 1.3658, + "step": 19464 + }, + { + "epoch": 0.6970831020466632, + "grad_norm": 1.2758383750915527, + "learning_rate": 4.438452231377025e-05, + "loss": 1.4012, + "step": 19465 + }, + { + "epoch": 0.6971189141762315, + "grad_norm": 1.9764435291290283, + "learning_rate": 4.4374882994908615e-05, + "loss": 1.5115, + "step": 19466 + }, + { + "epoch": 0.6971547263057998, + "grad_norm": 1.4551410675048828, + "learning_rate": 4.436524442441118e-05, + "loss": 1.1521, + "step": 19467 + }, + { + "epoch": 0.697190538435368, + "grad_norm": 1.883821964263916, + "learning_rate": 4.435560660240754e-05, + "loss": 1.1386, + "step": 19468 + }, + { + "epoch": 0.6972263505649363, + "grad_norm": 1.3271210193634033, + "learning_rate": 4.434596952902752e-05, + "loss": 1.5645, + "step": 19469 + }, + { + "epoch": 0.6972621626945046, + "grad_norm": 1.4583014249801636, + "learning_rate": 4.433633320440064e-05, + "loss": 1.4722, + "step": 19470 + }, + { + "epoch": 0.697297974824073, + "grad_norm": 1.2590141296386719, + "learning_rate": 4.432669762865664e-05, + "loss": 1.6447, + "step": 19471 + }, + { + "epoch": 0.6973337869536412, + "grad_norm": 1.6529279947280884, + "learning_rate": 4.431706280192503e-05, + "loss": 1.2192, + "step": 19472 + }, + { + "epoch": 0.6973695990832095, + "grad_norm": 1.5791985988616943, + "learning_rate": 4.4307428724335595e-05, + "loss": 1.4652, + "step": 19473 + }, + { + "epoch": 0.6974054112127778, + "grad_norm": 1.7167974710464478, + "learning_rate": 4.429779539601787e-05, + "loss": 1.4894, + "step": 19474 + }, + { + "epoch": 0.697441223342346, + "grad_norm": 2.156100273132324, + "learning_rate": 4.428816281710142e-05, + "loss": 1.4869, + "step": 19475 + }, + { + "epoch": 0.6974770354719143, + "grad_norm": 1.4207909107208252, + "learning_rate": 4.427853098771587e-05, + "loss": 1.5364, + "step": 19476 + }, + { + "epoch": 0.6975128476014826, + "grad_norm": 1.4815733432769775, + "learning_rate": 4.426889990799082e-05, + "loss": 1.434, + "step": 19477 + }, + { + "epoch": 0.697548659731051, + "grad_norm": 1.5822980403900146, + "learning_rate": 4.425926957805586e-05, + "loss": 0.9941, + "step": 19478 + }, + { + "epoch": 0.6975844718606192, + "grad_norm": 1.9064446687698364, + "learning_rate": 4.424963999804046e-05, + "loss": 1.3855, + "step": 19479 + }, + { + "epoch": 0.6976202839901875, + "grad_norm": 2.002734422683716, + "learning_rate": 4.4240011168074315e-05, + "loss": 1.2231, + "step": 19480 + }, + { + "epoch": 0.6976560961197558, + "grad_norm": 1.4305377006530762, + "learning_rate": 4.423038308828685e-05, + "loss": 1.5345, + "step": 19481 + }, + { + "epoch": 0.697691908249324, + "grad_norm": 1.573127031326294, + "learning_rate": 4.4220755758807695e-05, + "loss": 1.2691, + "step": 19482 + }, + { + "epoch": 0.6977277203788923, + "grad_norm": 1.9930295944213867, + "learning_rate": 4.421112917976628e-05, + "loss": 1.402, + "step": 19483 + }, + { + "epoch": 0.6977635325084606, + "grad_norm": 1.787060022354126, + "learning_rate": 4.420150335129215e-05, + "loss": 1.0692, + "step": 19484 + }, + { + "epoch": 0.697799344638029, + "grad_norm": 1.6993510723114014, + "learning_rate": 4.419187827351485e-05, + "loss": 1.5472, + "step": 19485 + }, + { + "epoch": 0.6978351567675972, + "grad_norm": 1.5781887769699097, + "learning_rate": 4.418225394656382e-05, + "loss": 1.5185, + "step": 19486 + }, + { + "epoch": 0.6978709688971655, + "grad_norm": 2.3409838676452637, + "learning_rate": 4.417263037056856e-05, + "loss": 1.5516, + "step": 19487 + }, + { + "epoch": 0.6979067810267338, + "grad_norm": 1.6964595317840576, + "learning_rate": 4.416300754565854e-05, + "loss": 1.6612, + "step": 19488 + }, + { + "epoch": 0.697942593156302, + "grad_norm": 1.3110731840133667, + "learning_rate": 4.415338547196326e-05, + "loss": 1.2798, + "step": 19489 + }, + { + "epoch": 0.6979784052858703, + "grad_norm": 1.5717277526855469, + "learning_rate": 4.414376414961208e-05, + "loss": 1.5264, + "step": 19490 + }, + { + "epoch": 0.6980142174154386, + "grad_norm": 1.8437398672103882, + "learning_rate": 4.4134143578734576e-05, + "loss": 1.4216, + "step": 19491 + }, + { + "epoch": 0.698050029545007, + "grad_norm": 1.6376128196716309, + "learning_rate": 4.41245237594601e-05, + "loss": 1.3332, + "step": 19492 + }, + { + "epoch": 0.6980858416745752, + "grad_norm": 1.9958080053329468, + "learning_rate": 4.411490469191806e-05, + "loss": 1.8033, + "step": 19493 + }, + { + "epoch": 0.6981216538041435, + "grad_norm": 1.52802574634552, + "learning_rate": 4.4105286376237874e-05, + "loss": 1.495, + "step": 19494 + }, + { + "epoch": 0.6981574659337118, + "grad_norm": 1.652212142944336, + "learning_rate": 4.409566881254897e-05, + "loss": 1.5077, + "step": 19495 + }, + { + "epoch": 0.69819327806328, + "grad_norm": 1.9880157709121704, + "learning_rate": 4.408605200098077e-05, + "loss": 1.7005, + "step": 19496 + }, + { + "epoch": 0.6982290901928483, + "grad_norm": 1.7847472429275513, + "learning_rate": 4.407643594166257e-05, + "loss": 1.4118, + "step": 19497 + }, + { + "epoch": 0.6982649023224166, + "grad_norm": 1.8568692207336426, + "learning_rate": 4.4066820634723805e-05, + "loss": 1.539, + "step": 19498 + }, + { + "epoch": 0.6983007144519849, + "grad_norm": 1.3895604610443115, + "learning_rate": 4.405720608029381e-05, + "loss": 1.5283, + "step": 19499 + }, + { + "epoch": 0.6983365265815532, + "grad_norm": 1.667009949684143, + "learning_rate": 4.404759227850198e-05, + "loss": 1.3788, + "step": 19500 + }, + { + "epoch": 0.6983723387111215, + "grad_norm": 1.8755829334259033, + "learning_rate": 4.403797922947759e-05, + "loss": 1.5928, + "step": 19501 + }, + { + "epoch": 0.6984081508406897, + "grad_norm": 1.7361255884170532, + "learning_rate": 4.4028366933349996e-05, + "loss": 1.3978, + "step": 19502 + }, + { + "epoch": 0.698443962970258, + "grad_norm": 1.4138158559799194, + "learning_rate": 4.4018755390248566e-05, + "loss": 1.3812, + "step": 19503 + }, + { + "epoch": 0.6984797750998263, + "grad_norm": 1.610290765762329, + "learning_rate": 4.400914460030254e-05, + "loss": 1.708, + "step": 19504 + }, + { + "epoch": 0.6985155872293946, + "grad_norm": 1.9601976871490479, + "learning_rate": 4.3999534563641253e-05, + "loss": 1.4285, + "step": 19505 + }, + { + "epoch": 0.6985513993589629, + "grad_norm": 1.4728658199310303, + "learning_rate": 4.3989925280393986e-05, + "loss": 1.671, + "step": 19506 + }, + { + "epoch": 0.6985872114885312, + "grad_norm": 1.6432338953018188, + "learning_rate": 4.3980316750690065e-05, + "loss": 1.3902, + "step": 19507 + }, + { + "epoch": 0.6986230236180995, + "grad_norm": 1.4036003351211548, + "learning_rate": 4.397070897465869e-05, + "loss": 1.653, + "step": 19508 + }, + { + "epoch": 0.6986588357476677, + "grad_norm": 1.8093888759613037, + "learning_rate": 4.396110195242915e-05, + "loss": 1.6891, + "step": 19509 + }, + { + "epoch": 0.698694647877236, + "grad_norm": 1.9804387092590332, + "learning_rate": 4.395149568413073e-05, + "loss": 1.5497, + "step": 19510 + }, + { + "epoch": 0.6987304600068043, + "grad_norm": 1.2982250452041626, + "learning_rate": 4.394189016989261e-05, + "loss": 1.3489, + "step": 19511 + }, + { + "epoch": 0.6987662721363725, + "grad_norm": 1.9783587455749512, + "learning_rate": 4.3932285409844046e-05, + "loss": 1.4654, + "step": 19512 + }, + { + "epoch": 0.6988020842659409, + "grad_norm": 2.4322969913482666, + "learning_rate": 4.392268140411425e-05, + "loss": 1.55, + "step": 19513 + }, + { + "epoch": 0.6988378963955092, + "grad_norm": 2.154332160949707, + "learning_rate": 4.391307815283249e-05, + "loss": 1.8838, + "step": 19514 + }, + { + "epoch": 0.6988737085250775, + "grad_norm": 1.7512561082839966, + "learning_rate": 4.390347565612787e-05, + "loss": 1.6804, + "step": 19515 + }, + { + "epoch": 0.6989095206546457, + "grad_norm": 1.7126457691192627, + "learning_rate": 4.3893873914129635e-05, + "loss": 1.0483, + "step": 19516 + }, + { + "epoch": 0.698945332784214, + "grad_norm": 1.6176780462265015, + "learning_rate": 4.388427292696695e-05, + "loss": 1.2061, + "step": 19517 + }, + { + "epoch": 0.6989811449137823, + "grad_norm": 1.496763825416565, + "learning_rate": 4.387467269476902e-05, + "loss": 1.4142, + "step": 19518 + }, + { + "epoch": 0.6990169570433505, + "grad_norm": 2.7602691650390625, + "learning_rate": 4.3865073217664944e-05, + "loss": 1.5354, + "step": 19519 + }, + { + "epoch": 0.6990527691729189, + "grad_norm": 2.1753737926483154, + "learning_rate": 4.38554744957839e-05, + "loss": 1.5896, + "step": 19520 + }, + { + "epoch": 0.6990885813024872, + "grad_norm": 2.356755018234253, + "learning_rate": 4.384587652925506e-05, + "loss": 1.5859, + "step": 19521 + }, + { + "epoch": 0.6991243934320555, + "grad_norm": 1.905924677848816, + "learning_rate": 4.383627931820747e-05, + "loss": 1.5317, + "step": 19522 + }, + { + "epoch": 0.6991602055616237, + "grad_norm": 2.0932862758636475, + "learning_rate": 4.382668286277031e-05, + "loss": 1.7417, + "step": 19523 + }, + { + "epoch": 0.699196017691192, + "grad_norm": 1.5198270082473755, + "learning_rate": 4.381708716307267e-05, + "loss": 1.3734, + "step": 19524 + }, + { + "epoch": 0.6992318298207603, + "grad_norm": 1.43095862865448, + "learning_rate": 4.3807492219243686e-05, + "loss": 1.4217, + "step": 19525 + }, + { + "epoch": 0.6992676419503285, + "grad_norm": 2.038198947906494, + "learning_rate": 4.379789803141238e-05, + "loss": 1.6064, + "step": 19526 + }, + { + "epoch": 0.6993034540798969, + "grad_norm": 1.9979199171066284, + "learning_rate": 4.378830459970785e-05, + "loss": 1.5887, + "step": 19527 + }, + { + "epoch": 0.6993392662094652, + "grad_norm": 1.6111292839050293, + "learning_rate": 4.3778711924259216e-05, + "loss": 1.5713, + "step": 19528 + }, + { + "epoch": 0.6993750783390335, + "grad_norm": 1.396235704421997, + "learning_rate": 4.3769120005195465e-05, + "loss": 1.1932, + "step": 19529 + }, + { + "epoch": 0.6994108904686017, + "grad_norm": 1.9758912324905396, + "learning_rate": 4.375952884264566e-05, + "loss": 1.4702, + "step": 19530 + }, + { + "epoch": 0.69944670259817, + "grad_norm": 1.5198663473129272, + "learning_rate": 4.374993843673886e-05, + "loss": 1.1155, + "step": 19531 + }, + { + "epoch": 0.6994825147277383, + "grad_norm": 1.6794745922088623, + "learning_rate": 4.37403487876041e-05, + "loss": 1.3603, + "step": 19532 + }, + { + "epoch": 0.6995183268573065, + "grad_norm": 1.6363730430603027, + "learning_rate": 4.373075989537035e-05, + "loss": 1.4035, + "step": 19533 + }, + { + "epoch": 0.6995541389868749, + "grad_norm": 1.9012279510498047, + "learning_rate": 4.372117176016665e-05, + "loss": 1.3656, + "step": 19534 + }, + { + "epoch": 0.6995899511164432, + "grad_norm": 1.7077008485794067, + "learning_rate": 4.371158438212199e-05, + "loss": 1.2935, + "step": 19535 + }, + { + "epoch": 0.6996257632460114, + "grad_norm": 1.684689998626709, + "learning_rate": 4.370199776136538e-05, + "loss": 1.5655, + "step": 19536 + }, + { + "epoch": 0.6996615753755797, + "grad_norm": 2.581475019454956, + "learning_rate": 4.3692411898025746e-05, + "loss": 1.4285, + "step": 19537 + }, + { + "epoch": 0.699697387505148, + "grad_norm": 2.067708730697632, + "learning_rate": 4.368282679223207e-05, + "loss": 1.7469, + "step": 19538 + }, + { + "epoch": 0.6997331996347163, + "grad_norm": 1.2996182441711426, + "learning_rate": 4.367324244411335e-05, + "loss": 1.0087, + "step": 19539 + }, + { + "epoch": 0.6997690117642845, + "grad_norm": 1.6188222169876099, + "learning_rate": 4.3663658853798476e-05, + "loss": 1.4346, + "step": 19540 + }, + { + "epoch": 0.6998048238938529, + "grad_norm": 1.531131386756897, + "learning_rate": 4.365407602141639e-05, + "loss": 1.5322, + "step": 19541 + }, + { + "epoch": 0.6998406360234212, + "grad_norm": 2.2801246643066406, + "learning_rate": 4.364449394709603e-05, + "loss": 1.7024, + "step": 19542 + }, + { + "epoch": 0.6998764481529894, + "grad_norm": 2.3423590660095215, + "learning_rate": 4.363491263096635e-05, + "loss": 1.7841, + "step": 19543 + }, + { + "epoch": 0.6999122602825577, + "grad_norm": 1.4121369123458862, + "learning_rate": 4.362533207315618e-05, + "loss": 1.5293, + "step": 19544 + }, + { + "epoch": 0.699948072412126, + "grad_norm": 2.1785571575164795, + "learning_rate": 4.361575227379444e-05, + "loss": 1.5964, + "step": 19545 + }, + { + "epoch": 0.6999838845416942, + "grad_norm": 1.5997748374938965, + "learning_rate": 4.360617323301007e-05, + "loss": 1.5425, + "step": 19546 + }, + { + "epoch": 0.7000196966712625, + "grad_norm": 2.7810516357421875, + "learning_rate": 4.359659495093186e-05, + "loss": 1.6532, + "step": 19547 + }, + { + "epoch": 0.7000555088008309, + "grad_norm": 1.4571328163146973, + "learning_rate": 4.35870174276887e-05, + "loss": 1.472, + "step": 19548 + }, + { + "epoch": 0.7000913209303992, + "grad_norm": 2.0664026737213135, + "learning_rate": 4.357744066340946e-05, + "loss": 1.59, + "step": 19549 + }, + { + "epoch": 0.7001271330599674, + "grad_norm": 1.6976768970489502, + "learning_rate": 4.356786465822301e-05, + "loss": 1.6835, + "step": 19550 + }, + { + "epoch": 0.7001629451895357, + "grad_norm": 1.4122880697250366, + "learning_rate": 4.3558289412258114e-05, + "loss": 1.3165, + "step": 19551 + }, + { + "epoch": 0.700198757319104, + "grad_norm": 1.928450584411621, + "learning_rate": 4.354871492564363e-05, + "loss": 1.4275, + "step": 19552 + }, + { + "epoch": 0.7002345694486722, + "grad_norm": 1.8557058572769165, + "learning_rate": 4.353914119850837e-05, + "loss": 1.0927, + "step": 19553 + }, + { + "epoch": 0.7002703815782405, + "grad_norm": 1.8503695726394653, + "learning_rate": 4.3529568230981165e-05, + "loss": 1.1398, + "step": 19554 + }, + { + "epoch": 0.7003061937078089, + "grad_norm": 1.4184296131134033, + "learning_rate": 4.351999602319079e-05, + "loss": 1.5463, + "step": 19555 + }, + { + "epoch": 0.7003420058373772, + "grad_norm": 2.062753200531006, + "learning_rate": 4.351042457526594e-05, + "loss": 1.2678, + "step": 19556 + }, + { + "epoch": 0.7003778179669454, + "grad_norm": 1.8937575817108154, + "learning_rate": 4.350085388733553e-05, + "loss": 1.1757, + "step": 19557 + }, + { + "epoch": 0.7004136300965137, + "grad_norm": 1.4495995044708252, + "learning_rate": 4.349128395952821e-05, + "loss": 1.4901, + "step": 19558 + }, + { + "epoch": 0.700449442226082, + "grad_norm": 1.6838757991790771, + "learning_rate": 4.3481714791972816e-05, + "loss": 1.8831, + "step": 19559 + }, + { + "epoch": 0.7004852543556502, + "grad_norm": 1.3522682189941406, + "learning_rate": 4.3472146384797973e-05, + "loss": 1.381, + "step": 19560 + }, + { + "epoch": 0.7005210664852185, + "grad_norm": 1.5966675281524658, + "learning_rate": 4.3462578738132557e-05, + "loss": 1.2488, + "step": 19561 + }, + { + "epoch": 0.7005568786147869, + "grad_norm": 1.5900689363479614, + "learning_rate": 4.345301185210517e-05, + "loss": 1.3429, + "step": 19562 + }, + { + "epoch": 0.7005926907443552, + "grad_norm": 2.1359074115753174, + "learning_rate": 4.344344572684459e-05, + "loss": 1.5794, + "step": 19563 + }, + { + "epoch": 0.7006285028739234, + "grad_norm": 1.512988567352295, + "learning_rate": 4.343388036247952e-05, + "loss": 1.2947, + "step": 19564 + }, + { + "epoch": 0.7006643150034917, + "grad_norm": 1.4352556467056274, + "learning_rate": 4.342431575913858e-05, + "loss": 1.2725, + "step": 19565 + }, + { + "epoch": 0.70070012713306, + "grad_norm": 1.7753974199295044, + "learning_rate": 4.341475191695054e-05, + "loss": 1.6684, + "step": 19566 + }, + { + "epoch": 0.7007359392626282, + "grad_norm": 1.5111207962036133, + "learning_rate": 4.340518883604395e-05, + "loss": 1.3503, + "step": 19567 + }, + { + "epoch": 0.7007717513921965, + "grad_norm": 1.4870178699493408, + "learning_rate": 4.339562651654761e-05, + "loss": 1.176, + "step": 19568 + }, + { + "epoch": 0.7008075635217649, + "grad_norm": 1.6527915000915527, + "learning_rate": 4.338606495859007e-05, + "loss": 1.715, + "step": 19569 + }, + { + "epoch": 0.7008433756513331, + "grad_norm": 2.162165403366089, + "learning_rate": 4.3376504162300035e-05, + "loss": 1.7323, + "step": 19570 + }, + { + "epoch": 0.7008791877809014, + "grad_norm": 1.7663673162460327, + "learning_rate": 4.336694412780605e-05, + "loss": 1.3105, + "step": 19571 + }, + { + "epoch": 0.7009149999104697, + "grad_norm": 1.6505602598190308, + "learning_rate": 4.3357384855236796e-05, + "loss": 1.3928, + "step": 19572 + }, + { + "epoch": 0.700950812040038, + "grad_norm": 1.5301345586776733, + "learning_rate": 4.33478263447209e-05, + "loss": 1.2207, + "step": 19573 + }, + { + "epoch": 0.7009866241696062, + "grad_norm": 1.6930309534072876, + "learning_rate": 4.333826859638684e-05, + "loss": 1.603, + "step": 19574 + }, + { + "epoch": 0.7010224362991745, + "grad_norm": 1.439517855644226, + "learning_rate": 4.332871161036337e-05, + "loss": 1.0671, + "step": 19575 + }, + { + "epoch": 0.7010582484287429, + "grad_norm": 1.5984947681427002, + "learning_rate": 4.331915538677894e-05, + "loss": 1.4029, + "step": 19576 + }, + { + "epoch": 0.7010940605583111, + "grad_norm": 1.5472118854522705, + "learning_rate": 4.3309599925762214e-05, + "loss": 1.2994, + "step": 19577 + }, + { + "epoch": 0.7011298726878794, + "grad_norm": 1.2958440780639648, + "learning_rate": 4.330004522744161e-05, + "loss": 1.6099, + "step": 19578 + }, + { + "epoch": 0.7011656848174477, + "grad_norm": 1.5507088899612427, + "learning_rate": 4.329049129194583e-05, + "loss": 1.4012, + "step": 19579 + }, + { + "epoch": 0.701201496947016, + "grad_norm": 2.030113935470581, + "learning_rate": 4.3280938119403346e-05, + "loss": 1.4628, + "step": 19580 + }, + { + "epoch": 0.7012373090765842, + "grad_norm": 1.508935570716858, + "learning_rate": 4.3271385709942636e-05, + "loss": 1.3467, + "step": 19581 + }, + { + "epoch": 0.7012731212061525, + "grad_norm": 1.4302729368209839, + "learning_rate": 4.326183406369226e-05, + "loss": 1.3545, + "step": 19582 + }, + { + "epoch": 0.7013089333357209, + "grad_norm": 2.1700620651245117, + "learning_rate": 4.325228318078073e-05, + "loss": 1.4485, + "step": 19583 + }, + { + "epoch": 0.7013447454652891, + "grad_norm": 1.713890790939331, + "learning_rate": 4.324273306133655e-05, + "loss": 1.7539, + "step": 19584 + }, + { + "epoch": 0.7013805575948574, + "grad_norm": 1.60768723487854, + "learning_rate": 4.3233183705488156e-05, + "loss": 1.1099, + "step": 19585 + }, + { + "epoch": 0.7014163697244257, + "grad_norm": 2.0066118240356445, + "learning_rate": 4.322363511336405e-05, + "loss": 1.4433, + "step": 19586 + }, + { + "epoch": 0.7014521818539939, + "grad_norm": 2.282560110092163, + "learning_rate": 4.32140872850927e-05, + "loss": 1.2975, + "step": 19587 + }, + { + "epoch": 0.7014879939835622, + "grad_norm": 1.573010802268982, + "learning_rate": 4.320454022080259e-05, + "loss": 1.4643, + "step": 19588 + }, + { + "epoch": 0.7015238061131305, + "grad_norm": 1.5037258863449097, + "learning_rate": 4.3194993920622095e-05, + "loss": 1.5382, + "step": 19589 + }, + { + "epoch": 0.7015596182426989, + "grad_norm": 1.8554325103759766, + "learning_rate": 4.318544838467968e-05, + "loss": 1.5909, + "step": 19590 + }, + { + "epoch": 0.7015954303722671, + "grad_norm": 2.168818950653076, + "learning_rate": 4.3175903613103815e-05, + "loss": 1.6751, + "step": 19591 + }, + { + "epoch": 0.7016312425018354, + "grad_norm": 1.7133216857910156, + "learning_rate": 4.316635960602283e-05, + "loss": 1.2098, + "step": 19592 + }, + { + "epoch": 0.7016670546314037, + "grad_norm": 1.6379142999649048, + "learning_rate": 4.3156816363565166e-05, + "loss": 1.4609, + "step": 19593 + }, + { + "epoch": 0.7017028667609719, + "grad_norm": 1.4387273788452148, + "learning_rate": 4.3147273885859215e-05, + "loss": 1.1765, + "step": 19594 + }, + { + "epoch": 0.7017386788905402, + "grad_norm": 1.2745956182479858, + "learning_rate": 4.3137732173033394e-05, + "loss": 1.553, + "step": 19595 + }, + { + "epoch": 0.7017744910201085, + "grad_norm": 1.6397004127502441, + "learning_rate": 4.3128191225216005e-05, + "loss": 1.2392, + "step": 19596 + }, + { + "epoch": 0.7018103031496769, + "grad_norm": 1.4887009859085083, + "learning_rate": 4.3118651042535444e-05, + "loss": 1.3739, + "step": 19597 + }, + { + "epoch": 0.7018461152792451, + "grad_norm": 1.4074066877365112, + "learning_rate": 4.31091116251201e-05, + "loss": 1.4146, + "step": 19598 + }, + { + "epoch": 0.7018819274088134, + "grad_norm": 1.663396954536438, + "learning_rate": 4.3099572973098236e-05, + "loss": 1.4598, + "step": 19599 + }, + { + "epoch": 0.7019177395383817, + "grad_norm": 2.8375847339630127, + "learning_rate": 4.309003508659822e-05, + "loss": 1.3936, + "step": 19600 + }, + { + "epoch": 0.7019535516679499, + "grad_norm": 1.628321886062622, + "learning_rate": 4.3080497965748376e-05, + "loss": 1.7446, + "step": 19601 + }, + { + "epoch": 0.7019893637975182, + "grad_norm": 1.359993815422058, + "learning_rate": 4.307096161067704e-05, + "loss": 1.1517, + "step": 19602 + }, + { + "epoch": 0.7020251759270865, + "grad_norm": 1.599370002746582, + "learning_rate": 4.3061426021512453e-05, + "loss": 1.2006, + "step": 19603 + }, + { + "epoch": 0.7020609880566548, + "grad_norm": 1.3849345445632935, + "learning_rate": 4.305189119838293e-05, + "loss": 1.2563, + "step": 19604 + }, + { + "epoch": 0.7020968001862231, + "grad_norm": 1.7541930675506592, + "learning_rate": 4.304235714141677e-05, + "loss": 1.5481, + "step": 19605 + }, + { + "epoch": 0.7021326123157914, + "grad_norm": 1.3335884809494019, + "learning_rate": 4.303282385074224e-05, + "loss": 1.4315, + "step": 19606 + }, + { + "epoch": 0.7021684244453597, + "grad_norm": 1.3472892045974731, + "learning_rate": 4.3023291326487556e-05, + "loss": 1.4085, + "step": 19607 + }, + { + "epoch": 0.7022042365749279, + "grad_norm": 1.5135165452957153, + "learning_rate": 4.301375956878099e-05, + "loss": 1.0491, + "step": 19608 + }, + { + "epoch": 0.7022400487044962, + "grad_norm": 2.763059616088867, + "learning_rate": 4.300422857775081e-05, + "loss": 1.5307, + "step": 19609 + }, + { + "epoch": 0.7022758608340645, + "grad_norm": 1.5471773147583008, + "learning_rate": 4.2994698353525184e-05, + "loss": 1.5784, + "step": 19610 + }, + { + "epoch": 0.7023116729636327, + "grad_norm": 1.618385910987854, + "learning_rate": 4.298516889623238e-05, + "loss": 1.3744, + "step": 19611 + }, + { + "epoch": 0.7023474850932011, + "grad_norm": 2.3037314414978027, + "learning_rate": 4.297564020600056e-05, + "loss": 1.6599, + "step": 19612 + }, + { + "epoch": 0.7023832972227694, + "grad_norm": 2.060870885848999, + "learning_rate": 4.2966112282957985e-05, + "loss": 1.4654, + "step": 19613 + }, + { + "epoch": 0.7024191093523376, + "grad_norm": 1.506044864654541, + "learning_rate": 4.295658512723277e-05, + "loss": 1.1635, + "step": 19614 + }, + { + "epoch": 0.7024549214819059, + "grad_norm": 1.5053657293319702, + "learning_rate": 4.2947058738953115e-05, + "loss": 1.5164, + "step": 19615 + }, + { + "epoch": 0.7024907336114742, + "grad_norm": 1.444291591644287, + "learning_rate": 4.293753311824724e-05, + "loss": 1.4666, + "step": 19616 + }, + { + "epoch": 0.7025265457410425, + "grad_norm": 1.5131818056106567, + "learning_rate": 4.2928008265243205e-05, + "loss": 1.1814, + "step": 19617 + }, + { + "epoch": 0.7025623578706107, + "grad_norm": 1.3681418895721436, + "learning_rate": 4.2918484180069205e-05, + "loss": 1.2736, + "step": 19618 + }, + { + "epoch": 0.7025981700001791, + "grad_norm": 1.5410593748092651, + "learning_rate": 4.290896086285338e-05, + "loss": 1.0753, + "step": 19619 + }, + { + "epoch": 0.7026339821297474, + "grad_norm": 1.949423909187317, + "learning_rate": 4.289943831372386e-05, + "loss": 1.4629, + "step": 19620 + }, + { + "epoch": 0.7026697942593156, + "grad_norm": 1.885960578918457, + "learning_rate": 4.2889916532808716e-05, + "loss": 1.6269, + "step": 19621 + }, + { + "epoch": 0.7027056063888839, + "grad_norm": 1.5690264701843262, + "learning_rate": 4.2880395520236086e-05, + "loss": 0.934, + "step": 19622 + }, + { + "epoch": 0.7027414185184522, + "grad_norm": 1.42863130569458, + "learning_rate": 4.287087527613405e-05, + "loss": 1.5016, + "step": 19623 + }, + { + "epoch": 0.7027772306480204, + "grad_norm": 1.8398011922836304, + "learning_rate": 4.2861355800630734e-05, + "loss": 1.6303, + "step": 19624 + }, + { + "epoch": 0.7028130427775887, + "grad_norm": 1.50557541847229, + "learning_rate": 4.285183709385413e-05, + "loss": 1.5899, + "step": 19625 + }, + { + "epoch": 0.7028488549071571, + "grad_norm": 2.3311352729797363, + "learning_rate": 4.284231915593234e-05, + "loss": 1.3451, + "step": 19626 + }, + { + "epoch": 0.7028846670367254, + "grad_norm": 1.7532374858856201, + "learning_rate": 4.283280198699346e-05, + "loss": 1.5832, + "step": 19627 + }, + { + "epoch": 0.7029204791662936, + "grad_norm": 1.79032564163208, + "learning_rate": 4.2823285587165454e-05, + "loss": 1.2493, + "step": 19628 + }, + { + "epoch": 0.7029562912958619, + "grad_norm": 2.0720388889312744, + "learning_rate": 4.281376995657638e-05, + "loss": 1.8434, + "step": 19629 + }, + { + "epoch": 0.7029921034254302, + "grad_norm": 1.5829167366027832, + "learning_rate": 4.2804255095354276e-05, + "loss": 1.6059, + "step": 19630 + }, + { + "epoch": 0.7030279155549984, + "grad_norm": 1.614844799041748, + "learning_rate": 4.279474100362717e-05, + "loss": 1.6384, + "step": 19631 + }, + { + "epoch": 0.7030637276845667, + "grad_norm": 1.5533795356750488, + "learning_rate": 4.278522768152301e-05, + "loss": 1.5463, + "step": 19632 + }, + { + "epoch": 0.7030995398141351, + "grad_norm": 1.7275530099868774, + "learning_rate": 4.27757151291698e-05, + "loss": 1.2183, + "step": 19633 + }, + { + "epoch": 0.7031353519437034, + "grad_norm": 1.5814635753631592, + "learning_rate": 4.2766203346695565e-05, + "loss": 1.5156, + "step": 19634 + }, + { + "epoch": 0.7031711640732716, + "grad_norm": 2.4060182571411133, + "learning_rate": 4.27566923342282e-05, + "loss": 1.5844, + "step": 19635 + }, + { + "epoch": 0.7032069762028399, + "grad_norm": 2.1266350746154785, + "learning_rate": 4.274718209189571e-05, + "loss": 1.4377, + "step": 19636 + }, + { + "epoch": 0.7032427883324082, + "grad_norm": 1.3293077945709229, + "learning_rate": 4.273767261982603e-05, + "loss": 1.4234, + "step": 19637 + }, + { + "epoch": 0.7032786004619764, + "grad_norm": 1.726370930671692, + "learning_rate": 4.272816391814714e-05, + "loss": 1.6574, + "step": 19638 + }, + { + "epoch": 0.7033144125915447, + "grad_norm": 1.4099525213241577, + "learning_rate": 4.271865598698689e-05, + "loss": 1.0711, + "step": 19639 + }, + { + "epoch": 0.7033502247211131, + "grad_norm": 1.8294678926467896, + "learning_rate": 4.2709148826473234e-05, + "loss": 1.6613, + "step": 19640 + }, + { + "epoch": 0.7033860368506814, + "grad_norm": 1.2897287607192993, + "learning_rate": 4.269964243673408e-05, + "loss": 1.2772, + "step": 19641 + }, + { + "epoch": 0.7034218489802496, + "grad_norm": 1.5318269729614258, + "learning_rate": 4.2690136817897363e-05, + "loss": 1.2968, + "step": 19642 + }, + { + "epoch": 0.7034576611098179, + "grad_norm": 1.5041943788528442, + "learning_rate": 4.2680631970090935e-05, + "loss": 1.3391, + "step": 19643 + }, + { + "epoch": 0.7034934732393862, + "grad_norm": 1.3494664430618286, + "learning_rate": 4.2671127893442586e-05, + "loss": 1.594, + "step": 19644 + }, + { + "epoch": 0.7035292853689544, + "grad_norm": 1.539660930633545, + "learning_rate": 4.266162458808034e-05, + "loss": 1.4438, + "step": 19645 + }, + { + "epoch": 0.7035650974985227, + "grad_norm": 1.8535983562469482, + "learning_rate": 4.2652122054131936e-05, + "loss": 1.7111, + "step": 19646 + }, + { + "epoch": 0.7036009096280911, + "grad_norm": 1.5027899742126465, + "learning_rate": 4.264262029172527e-05, + "loss": 1.62, + "step": 19647 + }, + { + "epoch": 0.7036367217576593, + "grad_norm": 2.0121660232543945, + "learning_rate": 4.2633119300988146e-05, + "loss": 1.4439, + "step": 19648 + }, + { + "epoch": 0.7036725338872276, + "grad_norm": 1.5586117506027222, + "learning_rate": 4.262361908204844e-05, + "loss": 1.2805, + "step": 19649 + }, + { + "epoch": 0.7037083460167959, + "grad_norm": 1.7597720623016357, + "learning_rate": 4.26141196350339e-05, + "loss": 1.2432, + "step": 19650 + }, + { + "epoch": 0.7037441581463642, + "grad_norm": 1.6648684740066528, + "learning_rate": 4.260462096007235e-05, + "loss": 1.3918, + "step": 19651 + }, + { + "epoch": 0.7037799702759324, + "grad_norm": 1.5515859127044678, + "learning_rate": 4.259512305729164e-05, + "loss": 1.4651, + "step": 19652 + }, + { + "epoch": 0.7038157824055007, + "grad_norm": 1.6097532510757446, + "learning_rate": 4.258562592681945e-05, + "loss": 1.4623, + "step": 19653 + }, + { + "epoch": 0.7038515945350691, + "grad_norm": 1.509201169013977, + "learning_rate": 4.2576129568783654e-05, + "loss": 1.6167, + "step": 19654 + }, + { + "epoch": 0.7038874066646373, + "grad_norm": 2.127002716064453, + "learning_rate": 4.2566633983311885e-05, + "loss": 1.636, + "step": 19655 + }, + { + "epoch": 0.7039232187942056, + "grad_norm": 2.234020948410034, + "learning_rate": 4.2557139170532045e-05, + "loss": 1.4049, + "step": 19656 + }, + { + "epoch": 0.7039590309237739, + "grad_norm": 1.5483022928237915, + "learning_rate": 4.2547645130571764e-05, + "loss": 1.6323, + "step": 19657 + }, + { + "epoch": 0.7039948430533421, + "grad_norm": 1.733654499053955, + "learning_rate": 4.253815186355881e-05, + "loss": 1.5718, + "step": 19658 + }, + { + "epoch": 0.7040306551829104, + "grad_norm": 1.627803921699524, + "learning_rate": 4.2528659369620905e-05, + "loss": 1.5466, + "step": 19659 + }, + { + "epoch": 0.7040664673124787, + "grad_norm": 1.8518602848052979, + "learning_rate": 4.2519167648885785e-05, + "loss": 1.5608, + "step": 19660 + }, + { + "epoch": 0.7041022794420471, + "grad_norm": 1.3604775667190552, + "learning_rate": 4.250967670148113e-05, + "loss": 1.2684, + "step": 19661 + }, + { + "epoch": 0.7041380915716153, + "grad_norm": 1.494215488433838, + "learning_rate": 4.250018652753454e-05, + "loss": 1.5892, + "step": 19662 + }, + { + "epoch": 0.7041739037011836, + "grad_norm": 1.7942612171173096, + "learning_rate": 4.2490697127173826e-05, + "loss": 1.3718, + "step": 19663 + }, + { + "epoch": 0.7042097158307519, + "grad_norm": 1.7529025077819824, + "learning_rate": 4.248120850052658e-05, + "loss": 1.3392, + "step": 19664 + }, + { + "epoch": 0.7042455279603201, + "grad_norm": 1.6615201234817505, + "learning_rate": 4.247172064772053e-05, + "loss": 1.4284, + "step": 19665 + }, + { + "epoch": 0.7042813400898884, + "grad_norm": 2.163020133972168, + "learning_rate": 4.246223356888318e-05, + "loss": 1.4905, + "step": 19666 + }, + { + "epoch": 0.7043171522194567, + "grad_norm": 1.709355115890503, + "learning_rate": 4.2452747264142335e-05, + "loss": 1.2316, + "step": 19667 + }, + { + "epoch": 0.7043529643490251, + "grad_norm": 1.8853095769882202, + "learning_rate": 4.244326173362555e-05, + "loss": 1.541, + "step": 19668 + }, + { + "epoch": 0.7043887764785933, + "grad_norm": 2.275879383087158, + "learning_rate": 4.2433776977460396e-05, + "loss": 1.6956, + "step": 19669 + }, + { + "epoch": 0.7044245886081616, + "grad_norm": 1.2705893516540527, + "learning_rate": 4.242429299577452e-05, + "loss": 1.2138, + "step": 19670 + }, + { + "epoch": 0.7044604007377299, + "grad_norm": 2.3329508304595947, + "learning_rate": 4.241480978869551e-05, + "loss": 1.461, + "step": 19671 + }, + { + "epoch": 0.7044962128672981, + "grad_norm": 1.6873449087142944, + "learning_rate": 4.2405327356351e-05, + "loss": 1.468, + "step": 19672 + }, + { + "epoch": 0.7045320249968664, + "grad_norm": 1.3374345302581787, + "learning_rate": 4.239584569886843e-05, + "loss": 1.4267, + "step": 19673 + }, + { + "epoch": 0.7045678371264347, + "grad_norm": 1.6494673490524292, + "learning_rate": 4.2386364816375545e-05, + "loss": 1.5675, + "step": 19674 + }, + { + "epoch": 0.704603649256003, + "grad_norm": 2.307361364364624, + "learning_rate": 4.2376884708999754e-05, + "loss": 1.659, + "step": 19675 + }, + { + "epoch": 0.7046394613855713, + "grad_norm": 1.6656765937805176, + "learning_rate": 4.23674053768687e-05, + "loss": 1.4741, + "step": 19676 + }, + { + "epoch": 0.7046752735151396, + "grad_norm": 2.520934820175171, + "learning_rate": 4.2357926820109816e-05, + "loss": 1.3717, + "step": 19677 + }, + { + "epoch": 0.7047110856447079, + "grad_norm": 1.7585898637771606, + "learning_rate": 4.234844903885068e-05, + "loss": 1.3923, + "step": 19678 + }, + { + "epoch": 0.7047468977742761, + "grad_norm": 1.6115695238113403, + "learning_rate": 4.233897203321883e-05, + "loss": 1.4965, + "step": 19679 + }, + { + "epoch": 0.7047827099038444, + "grad_norm": 1.3951077461242676, + "learning_rate": 4.23294958033417e-05, + "loss": 1.6331, + "step": 19680 + }, + { + "epoch": 0.7048185220334127, + "grad_norm": 1.9594569206237793, + "learning_rate": 4.232002034934681e-05, + "loss": 1.4238, + "step": 19681 + }, + { + "epoch": 0.704854334162981, + "grad_norm": 1.7811216115951538, + "learning_rate": 4.231054567136166e-05, + "loss": 1.4687, + "step": 19682 + }, + { + "epoch": 0.7048901462925493, + "grad_norm": 1.6424189805984497, + "learning_rate": 4.230107176951372e-05, + "loss": 1.3477, + "step": 19683 + }, + { + "epoch": 0.7049259584221176, + "grad_norm": 1.5223708152770996, + "learning_rate": 4.229159864393037e-05, + "loss": 1.6541, + "step": 19684 + }, + { + "epoch": 0.7049617705516859, + "grad_norm": 1.9250037670135498, + "learning_rate": 4.2282126294739186e-05, + "loss": 1.7329, + "step": 19685 + }, + { + "epoch": 0.7049975826812541, + "grad_norm": 1.6363168954849243, + "learning_rate": 4.227265472206756e-05, + "loss": 1.5585, + "step": 19686 + }, + { + "epoch": 0.7050333948108224, + "grad_norm": 1.635489821434021, + "learning_rate": 4.226318392604285e-05, + "loss": 1.2106, + "step": 19687 + }, + { + "epoch": 0.7050692069403907, + "grad_norm": 1.4792739152908325, + "learning_rate": 4.225371390679254e-05, + "loss": 1.4307, + "step": 19688 + }, + { + "epoch": 0.705105019069959, + "grad_norm": 1.7022123336791992, + "learning_rate": 4.224424466444401e-05, + "loss": 1.6674, + "step": 19689 + }, + { + "epoch": 0.7051408311995273, + "grad_norm": 1.5994065999984741, + "learning_rate": 4.2234776199124705e-05, + "loss": 1.3563, + "step": 19690 + }, + { + "epoch": 0.7051766433290956, + "grad_norm": 1.511945128440857, + "learning_rate": 4.222530851096194e-05, + "loss": 1.5933, + "step": 19691 + }, + { + "epoch": 0.7052124554586638, + "grad_norm": 1.8896557092666626, + "learning_rate": 4.221584160008313e-05, + "loss": 1.4839, + "step": 19692 + }, + { + "epoch": 0.7052482675882321, + "grad_norm": 1.4836983680725098, + "learning_rate": 4.220637546661562e-05, + "loss": 1.4684, + "step": 19693 + }, + { + "epoch": 0.7052840797178004, + "grad_norm": 1.3161821365356445, + "learning_rate": 4.2196910110686826e-05, + "loss": 1.5167, + "step": 19694 + }, + { + "epoch": 0.7053198918473687, + "grad_norm": 1.367013692855835, + "learning_rate": 4.218744553242402e-05, + "loss": 1.6204, + "step": 19695 + }, + { + "epoch": 0.705355703976937, + "grad_norm": 1.7846018075942993, + "learning_rate": 4.217798173195454e-05, + "loss": 1.3237, + "step": 19696 + }, + { + "epoch": 0.7053915161065053, + "grad_norm": 1.9348431825637817, + "learning_rate": 4.216851870940578e-05, + "loss": 1.5916, + "step": 19697 + }, + { + "epoch": 0.7054273282360736, + "grad_norm": 1.376637578010559, + "learning_rate": 4.2159056464904964e-05, + "loss": 1.316, + "step": 19698 + }, + { + "epoch": 0.7054631403656418, + "grad_norm": 2.1042535305023193, + "learning_rate": 4.2149594998579445e-05, + "loss": 1.1512, + "step": 19699 + }, + { + "epoch": 0.7054989524952101, + "grad_norm": 1.6151759624481201, + "learning_rate": 4.214013431055649e-05, + "loss": 1.2476, + "step": 19700 + }, + { + "epoch": 0.7055347646247784, + "grad_norm": 1.6883217096328735, + "learning_rate": 4.213067440096343e-05, + "loss": 1.5371, + "step": 19701 + }, + { + "epoch": 0.7055705767543466, + "grad_norm": 1.4629133939743042, + "learning_rate": 4.212121526992747e-05, + "loss": 1.6678, + "step": 19702 + }, + { + "epoch": 0.705606388883915, + "grad_norm": 1.3969576358795166, + "learning_rate": 4.211175691757591e-05, + "loss": 1.4144, + "step": 19703 + }, + { + "epoch": 0.7056422010134833, + "grad_norm": 1.6211917400360107, + "learning_rate": 4.2102299344036014e-05, + "loss": 1.4869, + "step": 19704 + }, + { + "epoch": 0.7056780131430516, + "grad_norm": 1.6717114448547363, + "learning_rate": 4.2092842549434954e-05, + "loss": 1.259, + "step": 19705 + }, + { + "epoch": 0.7057138252726198, + "grad_norm": 1.6694384813308716, + "learning_rate": 4.208338653390002e-05, + "loss": 1.5622, + "step": 19706 + }, + { + "epoch": 0.7057496374021881, + "grad_norm": 1.3916771411895752, + "learning_rate": 4.20739312975584e-05, + "loss": 1.5155, + "step": 19707 + }, + { + "epoch": 0.7057854495317564, + "grad_norm": 1.4139550924301147, + "learning_rate": 4.206447684053735e-05, + "loss": 1.3517, + "step": 19708 + }, + { + "epoch": 0.7058212616613246, + "grad_norm": 2.4061508178710938, + "learning_rate": 4.2055023162964e-05, + "loss": 1.742, + "step": 19709 + }, + { + "epoch": 0.705857073790893, + "grad_norm": 1.6226320266723633, + "learning_rate": 4.2045570264965574e-05, + "loss": 1.6635, + "step": 19710 + }, + { + "epoch": 0.7058928859204613, + "grad_norm": 1.434448003768921, + "learning_rate": 4.203611814666925e-05, + "loss": 1.2733, + "step": 19711 + }, + { + "epoch": 0.7059286980500296, + "grad_norm": 1.5434811115264893, + "learning_rate": 4.202666680820221e-05, + "loss": 1.2843, + "step": 19712 + }, + { + "epoch": 0.7059645101795978, + "grad_norm": 1.7157692909240723, + "learning_rate": 4.201721624969156e-05, + "loss": 1.4567, + "step": 19713 + }, + { + "epoch": 0.7060003223091661, + "grad_norm": 1.388255000114441, + "learning_rate": 4.200776647126447e-05, + "loss": 1.4829, + "step": 19714 + }, + { + "epoch": 0.7060361344387344, + "grad_norm": 1.2208225727081299, + "learning_rate": 4.199831747304811e-05, + "loss": 1.4678, + "step": 19715 + }, + { + "epoch": 0.7060719465683026, + "grad_norm": 1.6040128469467163, + "learning_rate": 4.198886925516954e-05, + "loss": 1.5309, + "step": 19716 + }, + { + "epoch": 0.706107758697871, + "grad_norm": 1.2374918460845947, + "learning_rate": 4.19794218177559e-05, + "loss": 1.4965, + "step": 19717 + }, + { + "epoch": 0.7061435708274393, + "grad_norm": 1.8679009675979614, + "learning_rate": 4.196997516093431e-05, + "loss": 1.6104, + "step": 19718 + }, + { + "epoch": 0.7061793829570076, + "grad_norm": 1.7728638648986816, + "learning_rate": 4.196052928483188e-05, + "loss": 1.5767, + "step": 19719 + }, + { + "epoch": 0.7062151950865758, + "grad_norm": 1.6892356872558594, + "learning_rate": 4.195108418957563e-05, + "loss": 1.5712, + "step": 19720 + }, + { + "epoch": 0.7062510072161441, + "grad_norm": 1.1942442655563354, + "learning_rate": 4.194163987529266e-05, + "loss": 1.105, + "step": 19721 + }, + { + "epoch": 0.7062868193457124, + "grad_norm": 2.3192408084869385, + "learning_rate": 4.1932196342110076e-05, + "loss": 1.4804, + "step": 19722 + }, + { + "epoch": 0.7063226314752806, + "grad_norm": 1.7069861888885498, + "learning_rate": 4.1922753590154854e-05, + "loss": 1.5507, + "step": 19723 + }, + { + "epoch": 0.706358443604849, + "grad_norm": 2.246624708175659, + "learning_rate": 4.1913311619554064e-05, + "loss": 1.5065, + "step": 19724 + }, + { + "epoch": 0.7063942557344173, + "grad_norm": 1.9762353897094727, + "learning_rate": 4.1903870430434736e-05, + "loss": 1.7526, + "step": 19725 + }, + { + "epoch": 0.7064300678639855, + "grad_norm": 1.4072891473770142, + "learning_rate": 4.189443002292392e-05, + "loss": 1.5152, + "step": 19726 + }, + { + "epoch": 0.7064658799935538, + "grad_norm": 2.209500312805176, + "learning_rate": 4.1884990397148584e-05, + "loss": 1.2394, + "step": 19727 + }, + { + "epoch": 0.7065016921231221, + "grad_norm": 2.366694450378418, + "learning_rate": 4.187555155323572e-05, + "loss": 1.7438, + "step": 19728 + }, + { + "epoch": 0.7065375042526904, + "grad_norm": 1.3919899463653564, + "learning_rate": 4.186611349131235e-05, + "loss": 1.5614, + "step": 19729 + }, + { + "epoch": 0.7065733163822586, + "grad_norm": 1.4385210275650024, + "learning_rate": 4.1856676211505465e-05, + "loss": 1.6848, + "step": 19730 + }, + { + "epoch": 0.706609128511827, + "grad_norm": 2.9990813732147217, + "learning_rate": 4.184723971394197e-05, + "loss": 1.3986, + "step": 19731 + }, + { + "epoch": 0.7066449406413953, + "grad_norm": 1.7250571250915527, + "learning_rate": 4.183780399874885e-05, + "loss": 1.7292, + "step": 19732 + }, + { + "epoch": 0.7066807527709635, + "grad_norm": 1.7180428504943848, + "learning_rate": 4.182836906605309e-05, + "loss": 1.2938, + "step": 19733 + }, + { + "epoch": 0.7067165649005318, + "grad_norm": 1.549298882484436, + "learning_rate": 4.1818934915981544e-05, + "loss": 1.4342, + "step": 19734 + }, + { + "epoch": 0.7067523770301001, + "grad_norm": 1.4171665906906128, + "learning_rate": 4.180950154866119e-05, + "loss": 1.6761, + "step": 19735 + }, + { + "epoch": 0.7067881891596683, + "grad_norm": 1.4383457899093628, + "learning_rate": 4.180006896421893e-05, + "loss": 1.3711, + "step": 19736 + }, + { + "epoch": 0.7068240012892366, + "grad_norm": 1.7060292959213257, + "learning_rate": 4.179063716278171e-05, + "loss": 1.7098, + "step": 19737 + }, + { + "epoch": 0.706859813418805, + "grad_norm": 1.4669756889343262, + "learning_rate": 4.178120614447634e-05, + "loss": 1.5862, + "step": 19738 + }, + { + "epoch": 0.7068956255483733, + "grad_norm": 1.5930043458938599, + "learning_rate": 4.177177590942974e-05, + "loss": 1.7459, + "step": 19739 + }, + { + "epoch": 0.7069314376779415, + "grad_norm": 1.4487543106079102, + "learning_rate": 4.176234645776883e-05, + "loss": 1.6866, + "step": 19740 + }, + { + "epoch": 0.7069672498075098, + "grad_norm": 1.9951848983764648, + "learning_rate": 4.1752917789620395e-05, + "loss": 1.4115, + "step": 19741 + }, + { + "epoch": 0.7070030619370781, + "grad_norm": 1.5408639907836914, + "learning_rate": 4.174348990511131e-05, + "loss": 1.3569, + "step": 19742 + }, + { + "epoch": 0.7070388740666463, + "grad_norm": 1.9540464878082275, + "learning_rate": 4.1734062804368426e-05, + "loss": 1.1295, + "step": 19743 + }, + { + "epoch": 0.7070746861962146, + "grad_norm": 1.8576759099960327, + "learning_rate": 4.17246364875186e-05, + "loss": 1.4529, + "step": 19744 + }, + { + "epoch": 0.707110498325783, + "grad_norm": 1.4856642484664917, + "learning_rate": 4.171521095468859e-05, + "loss": 1.6362, + "step": 19745 + }, + { + "epoch": 0.7071463104553513, + "grad_norm": 1.6320254802703857, + "learning_rate": 4.1705786206005235e-05, + "loss": 1.6359, + "step": 19746 + }, + { + "epoch": 0.7071821225849195, + "grad_norm": 1.697995901107788, + "learning_rate": 4.169636224159533e-05, + "loss": 1.277, + "step": 19747 + }, + { + "epoch": 0.7072179347144878, + "grad_norm": 1.8926945924758911, + "learning_rate": 4.1686939061585694e-05, + "loss": 1.8489, + "step": 19748 + }, + { + "epoch": 0.7072537468440561, + "grad_norm": 1.6354869604110718, + "learning_rate": 4.167751666610309e-05, + "loss": 1.7138, + "step": 19749 + }, + { + "epoch": 0.7072895589736243, + "grad_norm": 1.598158597946167, + "learning_rate": 4.166809505527418e-05, + "loss": 1.3413, + "step": 19750 + }, + { + "epoch": 0.7073253711031926, + "grad_norm": 1.6339963674545288, + "learning_rate": 4.165867422922589e-05, + "loss": 1.4861, + "step": 19751 + }, + { + "epoch": 0.707361183232761, + "grad_norm": 1.544044852256775, + "learning_rate": 4.1649254188084854e-05, + "loss": 1.2516, + "step": 19752 + }, + { + "epoch": 0.7073969953623293, + "grad_norm": 1.874853491783142, + "learning_rate": 4.1639834931977864e-05, + "loss": 1.657, + "step": 19753 + }, + { + "epoch": 0.7074328074918975, + "grad_norm": 1.7855067253112793, + "learning_rate": 4.163041646103154e-05, + "loss": 1.4241, + "step": 19754 + }, + { + "epoch": 0.7074686196214658, + "grad_norm": 2.4784038066864014, + "learning_rate": 4.162099877537274e-05, + "loss": 1.1453, + "step": 19755 + }, + { + "epoch": 0.7075044317510341, + "grad_norm": 1.4555649757385254, + "learning_rate": 4.161158187512808e-05, + "loss": 1.6522, + "step": 19756 + }, + { + "epoch": 0.7075402438806023, + "grad_norm": 2.270052909851074, + "learning_rate": 4.160216576042426e-05, + "loss": 1.7847, + "step": 19757 + }, + { + "epoch": 0.7075760560101706, + "grad_norm": 1.4863038063049316, + "learning_rate": 4.159275043138801e-05, + "loss": 1.3766, + "step": 19758 + }, + { + "epoch": 0.707611868139739, + "grad_norm": 1.55003023147583, + "learning_rate": 4.1583335888145915e-05, + "loss": 1.733, + "step": 19759 + }, + { + "epoch": 0.7076476802693072, + "grad_norm": 1.7473454475402832, + "learning_rate": 4.1573922130824725e-05, + "loss": 1.4507, + "step": 19760 + }, + { + "epoch": 0.7076834923988755, + "grad_norm": 1.6226022243499756, + "learning_rate": 4.156450915955099e-05, + "loss": 1.676, + "step": 19761 + }, + { + "epoch": 0.7077193045284438, + "grad_norm": 2.1244826316833496, + "learning_rate": 4.155509697445147e-05, + "loss": 1.7523, + "step": 19762 + }, + { + "epoch": 0.707755116658012, + "grad_norm": 1.449433445930481, + "learning_rate": 4.1545685575652695e-05, + "loss": 1.6795, + "step": 19763 + }, + { + "epoch": 0.7077909287875803, + "grad_norm": 1.377442717552185, + "learning_rate": 4.1536274963281355e-05, + "loss": 1.5041, + "step": 19764 + }, + { + "epoch": 0.7078267409171486, + "grad_norm": 1.4417132139205933, + "learning_rate": 4.152686513746399e-05, + "loss": 1.5012, + "step": 19765 + }, + { + "epoch": 0.707862553046717, + "grad_norm": 1.397241473197937, + "learning_rate": 4.151745609832722e-05, + "loss": 1.2836, + "step": 19766 + }, + { + "epoch": 0.7078983651762852, + "grad_norm": 1.9713245630264282, + "learning_rate": 4.150804784599769e-05, + "loss": 1.3324, + "step": 19767 + }, + { + "epoch": 0.7079341773058535, + "grad_norm": 1.558300256729126, + "learning_rate": 4.149864038060185e-05, + "loss": 1.6347, + "step": 19768 + }, + { + "epoch": 0.7079699894354218, + "grad_norm": 1.6256293058395386, + "learning_rate": 4.148923370226642e-05, + "loss": 1.4837, + "step": 19769 + }, + { + "epoch": 0.70800580156499, + "grad_norm": 1.6766290664672852, + "learning_rate": 4.147982781111783e-05, + "loss": 1.3036, + "step": 19770 + }, + { + "epoch": 0.7080416136945583, + "grad_norm": 1.964269757270813, + "learning_rate": 4.147042270728272e-05, + "loss": 1.138, + "step": 19771 + }, + { + "epoch": 0.7080774258241266, + "grad_norm": 1.2855141162872314, + "learning_rate": 4.146101839088749e-05, + "loss": 1.3127, + "step": 19772 + }, + { + "epoch": 0.708113237953695, + "grad_norm": 1.4238767623901367, + "learning_rate": 4.145161486205883e-05, + "loss": 1.6241, + "step": 19773 + }, + { + "epoch": 0.7081490500832632, + "grad_norm": 1.4786126613616943, + "learning_rate": 4.144221212092316e-05, + "loss": 1.4925, + "step": 19774 + }, + { + "epoch": 0.7081848622128315, + "grad_norm": 1.7244206666946411, + "learning_rate": 4.1432810167606964e-05, + "loss": 1.5353, + "step": 19775 + }, + { + "epoch": 0.7082206743423998, + "grad_norm": 1.447631597518921, + "learning_rate": 4.1423409002236755e-05, + "loss": 1.3844, + "step": 19776 + }, + { + "epoch": 0.708256486471968, + "grad_norm": 1.4471874237060547, + "learning_rate": 4.141400862493903e-05, + "loss": 1.3439, + "step": 19777 + }, + { + "epoch": 0.7082922986015363, + "grad_norm": 1.7642822265625, + "learning_rate": 4.140460903584027e-05, + "loss": 1.7151, + "step": 19778 + }, + { + "epoch": 0.7083281107311046, + "grad_norm": 1.674778699874878, + "learning_rate": 4.139521023506688e-05, + "loss": 1.6022, + "step": 19779 + }, + { + "epoch": 0.708363922860673, + "grad_norm": 1.7055083513259888, + "learning_rate": 4.1385812222745344e-05, + "loss": 1.5398, + "step": 19780 + }, + { + "epoch": 0.7083997349902412, + "grad_norm": 2.1311376094818115, + "learning_rate": 4.13764149990021e-05, + "loss": 1.2179, + "step": 19781 + }, + { + "epoch": 0.7084355471198095, + "grad_norm": 2.307271957397461, + "learning_rate": 4.136701856396361e-05, + "loss": 1.3747, + "step": 19782 + }, + { + "epoch": 0.7084713592493778, + "grad_norm": 1.5620884895324707, + "learning_rate": 4.135762291775622e-05, + "loss": 1.3208, + "step": 19783 + }, + { + "epoch": 0.708507171378946, + "grad_norm": 1.4920231103897095, + "learning_rate": 4.1348228060506364e-05, + "loss": 1.3738, + "step": 19784 + }, + { + "epoch": 0.7085429835085143, + "grad_norm": 1.612851858139038, + "learning_rate": 4.133883399234049e-05, + "loss": 1.3598, + "step": 19785 + }, + { + "epoch": 0.7085787956380826, + "grad_norm": 1.8075392246246338, + "learning_rate": 4.132944071338489e-05, + "loss": 1.9048, + "step": 19786 + }, + { + "epoch": 0.708614607767651, + "grad_norm": 1.3424999713897705, + "learning_rate": 4.132004822376598e-05, + "loss": 1.2624, + "step": 19787 + }, + { + "epoch": 0.7086504198972192, + "grad_norm": 1.75864577293396, + "learning_rate": 4.1310656523610144e-05, + "loss": 1.2026, + "step": 19788 + }, + { + "epoch": 0.7086862320267875, + "grad_norm": 1.7845993041992188, + "learning_rate": 4.130126561304376e-05, + "loss": 1.5058, + "step": 19789 + }, + { + "epoch": 0.7087220441563558, + "grad_norm": 2.524739980697632, + "learning_rate": 4.129187549219308e-05, + "loss": 1.5992, + "step": 19790 + }, + { + "epoch": 0.708757856285924, + "grad_norm": 1.766992211341858, + "learning_rate": 4.12824861611845e-05, + "loss": 1.4969, + "step": 19791 + }, + { + "epoch": 0.7087936684154923, + "grad_norm": 1.8509896993637085, + "learning_rate": 4.127309762014435e-05, + "loss": 1.6552, + "step": 19792 + }, + { + "epoch": 0.7088294805450606, + "grad_norm": 1.721527338027954, + "learning_rate": 4.12637098691989e-05, + "loss": 1.4734, + "step": 19793 + }, + { + "epoch": 0.7088652926746289, + "grad_norm": 2.155327558517456, + "learning_rate": 4.125432290847446e-05, + "loss": 1.1883, + "step": 19794 + }, + { + "epoch": 0.7089011048041972, + "grad_norm": 1.6428849697113037, + "learning_rate": 4.124493673809733e-05, + "loss": 1.1671, + "step": 19795 + }, + { + "epoch": 0.7089369169337655, + "grad_norm": 3.388340711593628, + "learning_rate": 4.123555135819382e-05, + "loss": 1.6063, + "step": 19796 + }, + { + "epoch": 0.7089727290633338, + "grad_norm": 1.632297396659851, + "learning_rate": 4.122616676889014e-05, + "loss": 1.5945, + "step": 19797 + }, + { + "epoch": 0.709008541192902, + "grad_norm": 1.713770866394043, + "learning_rate": 4.121678297031256e-05, + "loss": 1.6018, + "step": 19798 + }, + { + "epoch": 0.7090443533224703, + "grad_norm": 2.2215230464935303, + "learning_rate": 4.1207399962587356e-05, + "loss": 1.5367, + "step": 19799 + }, + { + "epoch": 0.7090801654520386, + "grad_norm": 1.535365343093872, + "learning_rate": 4.119801774584077e-05, + "loss": 1.2133, + "step": 19800 + }, + { + "epoch": 0.7091159775816069, + "grad_norm": 1.8413805961608887, + "learning_rate": 4.118863632019898e-05, + "loss": 1.5743, + "step": 19801 + }, + { + "epoch": 0.7091517897111752, + "grad_norm": 2.152158260345459, + "learning_rate": 4.117925568578822e-05, + "loss": 1.3583, + "step": 19802 + }, + { + "epoch": 0.7091876018407435, + "grad_norm": 1.7258599996566772, + "learning_rate": 4.116987584273474e-05, + "loss": 1.077, + "step": 19803 + }, + { + "epoch": 0.7092234139703117, + "grad_norm": 1.4642314910888672, + "learning_rate": 4.116049679116466e-05, + "loss": 1.1684, + "step": 19804 + }, + { + "epoch": 0.70925922609988, + "grad_norm": 1.8263181447982788, + "learning_rate": 4.11511185312042e-05, + "loss": 1.695, + "step": 19805 + }, + { + "epoch": 0.7092950382294483, + "grad_norm": 1.8259236812591553, + "learning_rate": 4.114174106297952e-05, + "loss": 1.166, + "step": 19806 + }, + { + "epoch": 0.7093308503590166, + "grad_norm": 1.4869673252105713, + "learning_rate": 4.113236438661684e-05, + "loss": 1.2245, + "step": 19807 + }, + { + "epoch": 0.7093666624885849, + "grad_norm": 1.906758427619934, + "learning_rate": 4.112298850224223e-05, + "loss": 1.4127, + "step": 19808 + }, + { + "epoch": 0.7094024746181532, + "grad_norm": 1.2898838520050049, + "learning_rate": 4.111361340998186e-05, + "loss": 1.4856, + "step": 19809 + }, + { + "epoch": 0.7094382867477215, + "grad_norm": 1.67406165599823, + "learning_rate": 4.11042391099619e-05, + "loss": 1.3279, + "step": 19810 + }, + { + "epoch": 0.7094740988772897, + "grad_norm": 1.590075969696045, + "learning_rate": 4.109486560230839e-05, + "loss": 1.5435, + "step": 19811 + }, + { + "epoch": 0.709509911006858, + "grad_norm": 1.462267518043518, + "learning_rate": 4.108549288714748e-05, + "loss": 1.507, + "step": 19812 + }, + { + "epoch": 0.7095457231364263, + "grad_norm": 1.8588240146636963, + "learning_rate": 4.107612096460528e-05, + "loss": 1.4161, + "step": 19813 + }, + { + "epoch": 0.7095815352659945, + "grad_norm": 1.5106000900268555, + "learning_rate": 4.1066749834807895e-05, + "loss": 1.3721, + "step": 19814 + }, + { + "epoch": 0.7096173473955629, + "grad_norm": 1.4201984405517578, + "learning_rate": 4.105737949788133e-05, + "loss": 1.5243, + "step": 19815 + }, + { + "epoch": 0.7096531595251312, + "grad_norm": 1.4662309885025024, + "learning_rate": 4.10480099539517e-05, + "loss": 1.6587, + "step": 19816 + }, + { + "epoch": 0.7096889716546995, + "grad_norm": 1.658463954925537, + "learning_rate": 4.103864120314506e-05, + "loss": 1.4674, + "step": 19817 + }, + { + "epoch": 0.7097247837842677, + "grad_norm": 1.8113993406295776, + "learning_rate": 4.1029273245587476e-05, + "loss": 1.505, + "step": 19818 + }, + { + "epoch": 0.709760595913836, + "grad_norm": 2.001314878463745, + "learning_rate": 4.101990608140492e-05, + "loss": 1.4655, + "step": 19819 + }, + { + "epoch": 0.7097964080434043, + "grad_norm": 1.4082540273666382, + "learning_rate": 4.101053971072345e-05, + "loss": 1.414, + "step": 19820 + }, + { + "epoch": 0.7098322201729725, + "grad_norm": 1.8123232126235962, + "learning_rate": 4.1001174133669116e-05, + "loss": 1.5416, + "step": 19821 + }, + { + "epoch": 0.7098680323025409, + "grad_norm": 1.7902493476867676, + "learning_rate": 4.099180935036784e-05, + "loss": 1.4017, + "step": 19822 + }, + { + "epoch": 0.7099038444321092, + "grad_norm": 1.911616563796997, + "learning_rate": 4.0982445360945654e-05, + "loss": 1.6244, + "step": 19823 + }, + { + "epoch": 0.7099396565616775, + "grad_norm": 1.7272944450378418, + "learning_rate": 4.097308216552854e-05, + "loss": 1.5453, + "step": 19824 + }, + { + "epoch": 0.7099754686912457, + "grad_norm": 1.4212089776992798, + "learning_rate": 4.0963719764242504e-05, + "loss": 1.5528, + "step": 19825 + }, + { + "epoch": 0.710011280820814, + "grad_norm": 2.0135669708251953, + "learning_rate": 4.0954358157213436e-05, + "loss": 1.2744, + "step": 19826 + }, + { + "epoch": 0.7100470929503823, + "grad_norm": 1.7139232158660889, + "learning_rate": 4.0944997344567304e-05, + "loss": 1.3807, + "step": 19827 + }, + { + "epoch": 0.7100829050799505, + "grad_norm": 1.644747018814087, + "learning_rate": 4.0935637326430095e-05, + "loss": 1.3795, + "step": 19828 + }, + { + "epoch": 0.7101187172095189, + "grad_norm": 1.47983717918396, + "learning_rate": 4.092627810292767e-05, + "loss": 1.5415, + "step": 19829 + }, + { + "epoch": 0.7101545293390872, + "grad_norm": 1.429914951324463, + "learning_rate": 4.0916919674185974e-05, + "loss": 1.4195, + "step": 19830 + }, + { + "epoch": 0.7101903414686555, + "grad_norm": 1.6520344018936157, + "learning_rate": 4.09075620403309e-05, + "loss": 1.4766, + "step": 19831 + }, + { + "epoch": 0.7102261535982237, + "grad_norm": 1.82496976852417, + "learning_rate": 4.0898205201488404e-05, + "loss": 1.6098, + "step": 19832 + }, + { + "epoch": 0.710261965727792, + "grad_norm": 1.775235891342163, + "learning_rate": 4.088884915778427e-05, + "loss": 1.431, + "step": 19833 + }, + { + "epoch": 0.7102977778573603, + "grad_norm": 1.3506832122802734, + "learning_rate": 4.087949390934443e-05, + "loss": 1.537, + "step": 19834 + }, + { + "epoch": 0.7103335899869285, + "grad_norm": 1.5820268392562866, + "learning_rate": 4.0870139456294745e-05, + "loss": 1.2898, + "step": 19835 + }, + { + "epoch": 0.7103694021164969, + "grad_norm": 1.426241397857666, + "learning_rate": 4.0860785798761094e-05, + "loss": 1.3136, + "step": 19836 + }, + { + "epoch": 0.7104052142460652, + "grad_norm": 1.4646059274673462, + "learning_rate": 4.0851432936869296e-05, + "loss": 1.1997, + "step": 19837 + }, + { + "epoch": 0.7104410263756334, + "grad_norm": 1.9413141012191772, + "learning_rate": 4.0842080870745084e-05, + "loss": 1.5866, + "step": 19838 + }, + { + "epoch": 0.7104768385052017, + "grad_norm": 1.6418945789337158, + "learning_rate": 4.083272960051444e-05, + "loss": 1.4602, + "step": 19839 + }, + { + "epoch": 0.71051265063477, + "grad_norm": 1.7183191776275635, + "learning_rate": 4.0823379126303064e-05, + "loss": 1.2933, + "step": 19840 + }, + { + "epoch": 0.7105484627643383, + "grad_norm": 1.4567809104919434, + "learning_rate": 4.0814029448236803e-05, + "loss": 1.3024, + "step": 19841 + }, + { + "epoch": 0.7105842748939065, + "grad_norm": 3.3251304626464844, + "learning_rate": 4.080468056644141e-05, + "loss": 1.2703, + "step": 19842 + }, + { + "epoch": 0.7106200870234749, + "grad_norm": 2.0006585121154785, + "learning_rate": 4.0795332481042736e-05, + "loss": 1.1981, + "step": 19843 + }, + { + "epoch": 0.7106558991530432, + "grad_norm": 1.7054616212844849, + "learning_rate": 4.078598519216645e-05, + "loss": 1.5251, + "step": 19844 + }, + { + "epoch": 0.7106917112826114, + "grad_norm": 2.462589979171753, + "learning_rate": 4.077663869993835e-05, + "loss": 1.3953, + "step": 19845 + }, + { + "epoch": 0.7107275234121797, + "grad_norm": 1.619739055633545, + "learning_rate": 4.076729300448423e-05, + "loss": 1.3919, + "step": 19846 + }, + { + "epoch": 0.710763335541748, + "grad_norm": 1.6942226886749268, + "learning_rate": 4.075794810592973e-05, + "loss": 1.2181, + "step": 19847 + }, + { + "epoch": 0.7107991476713162, + "grad_norm": 1.8538376092910767, + "learning_rate": 4.074860400440067e-05, + "loss": 1.4492, + "step": 19848 + }, + { + "epoch": 0.7108349598008845, + "grad_norm": 1.5637342929840088, + "learning_rate": 4.073926070002264e-05, + "loss": 1.4442, + "step": 19849 + }, + { + "epoch": 0.7108707719304529, + "grad_norm": 1.4168877601623535, + "learning_rate": 4.072991819292148e-05, + "loss": 1.9249, + "step": 19850 + }, + { + "epoch": 0.7109065840600212, + "grad_norm": 1.5848156213760376, + "learning_rate": 4.0720576483222795e-05, + "loss": 1.3855, + "step": 19851 + }, + { + "epoch": 0.7109423961895894, + "grad_norm": 1.6019319295883179, + "learning_rate": 4.0711235571052306e-05, + "loss": 1.6355, + "step": 19852 + }, + { + "epoch": 0.7109782083191577, + "grad_norm": 1.3284941911697388, + "learning_rate": 4.070189545653561e-05, + "loss": 1.214, + "step": 19853 + }, + { + "epoch": 0.711014020448726, + "grad_norm": 1.7176045179367065, + "learning_rate": 4.069255613979849e-05, + "loss": 1.523, + "step": 19854 + }, + { + "epoch": 0.7110498325782942, + "grad_norm": 1.7608132362365723, + "learning_rate": 4.068321762096652e-05, + "loss": 1.1746, + "step": 19855 + }, + { + "epoch": 0.7110856447078625, + "grad_norm": 1.74787437915802, + "learning_rate": 4.067387990016528e-05, + "loss": 1.2927, + "step": 19856 + }, + { + "epoch": 0.7111214568374309, + "grad_norm": 1.6266214847564697, + "learning_rate": 4.0664542977520526e-05, + "loss": 1.3775, + "step": 19857 + }, + { + "epoch": 0.7111572689669992, + "grad_norm": 1.6558783054351807, + "learning_rate": 4.065520685315777e-05, + "loss": 1.3691, + "step": 19858 + }, + { + "epoch": 0.7111930810965674, + "grad_norm": 1.853967308998108, + "learning_rate": 4.0645871527202695e-05, + "loss": 1.3375, + "step": 19859 + }, + { + "epoch": 0.7112288932261357, + "grad_norm": 1.459820032119751, + "learning_rate": 4.063653699978079e-05, + "loss": 1.3781, + "step": 19860 + }, + { + "epoch": 0.711264705355704, + "grad_norm": 1.3726294040679932, + "learning_rate": 4.062720327101778e-05, + "loss": 1.5798, + "step": 19861 + }, + { + "epoch": 0.7113005174852722, + "grad_norm": 1.7460395097732544, + "learning_rate": 4.0617870341039155e-05, + "loss": 1.499, + "step": 19862 + }, + { + "epoch": 0.7113363296148405, + "grad_norm": 1.8195322751998901, + "learning_rate": 4.060853820997046e-05, + "loss": 1.3855, + "step": 19863 + }, + { + "epoch": 0.7113721417444089, + "grad_norm": 1.6731899976730347, + "learning_rate": 4.059920687793727e-05, + "loss": 1.6307, + "step": 19864 + }, + { + "epoch": 0.7114079538739772, + "grad_norm": 1.573760986328125, + "learning_rate": 4.058987634506514e-05, + "loss": 1.4133, + "step": 19865 + }, + { + "epoch": 0.7114437660035454, + "grad_norm": 1.6020008325576782, + "learning_rate": 4.058054661147961e-05, + "loss": 1.6497, + "step": 19866 + }, + { + "epoch": 0.7114795781331137, + "grad_norm": 1.3755627870559692, + "learning_rate": 4.057121767730612e-05, + "loss": 1.3899, + "step": 19867 + }, + { + "epoch": 0.711515390262682, + "grad_norm": 1.3662739992141724, + "learning_rate": 4.05618895426703e-05, + "loss": 1.5769, + "step": 19868 + }, + { + "epoch": 0.7115512023922502, + "grad_norm": 1.8042664527893066, + "learning_rate": 4.055256220769755e-05, + "loss": 1.3367, + "step": 19869 + }, + { + "epoch": 0.7115870145218185, + "grad_norm": 1.653998851776123, + "learning_rate": 4.0543235672513434e-05, + "loss": 1.4023, + "step": 19870 + }, + { + "epoch": 0.7116228266513869, + "grad_norm": 2.058856964111328, + "learning_rate": 4.0533909937243365e-05, + "loss": 1.3852, + "step": 19871 + }, + { + "epoch": 0.7116586387809551, + "grad_norm": 1.6064378023147583, + "learning_rate": 4.0524585002012815e-05, + "loss": 1.3659, + "step": 19872 + }, + { + "epoch": 0.7116944509105234, + "grad_norm": 1.5606815814971924, + "learning_rate": 4.05152608669473e-05, + "loss": 1.3211, + "step": 19873 + }, + { + "epoch": 0.7117302630400917, + "grad_norm": 1.5185139179229736, + "learning_rate": 4.0505937532172175e-05, + "loss": 1.4429, + "step": 19874 + }, + { + "epoch": 0.71176607516966, + "grad_norm": 1.587683081626892, + "learning_rate": 4.049661499781293e-05, + "loss": 1.6571, + "step": 19875 + }, + { + "epoch": 0.7118018872992282, + "grad_norm": 1.8505325317382812, + "learning_rate": 4.048729326399498e-05, + "loss": 1.396, + "step": 19876 + }, + { + "epoch": 0.7118376994287965, + "grad_norm": 1.4003081321716309, + "learning_rate": 4.047797233084375e-05, + "loss": 1.402, + "step": 19877 + }, + { + "epoch": 0.7118735115583649, + "grad_norm": 1.828133225440979, + "learning_rate": 4.0468652198484603e-05, + "loss": 1.035, + "step": 19878 + }, + { + "epoch": 0.7119093236879331, + "grad_norm": 1.4773956537246704, + "learning_rate": 4.045933286704296e-05, + "loss": 1.5263, + "step": 19879 + }, + { + "epoch": 0.7119451358175014, + "grad_norm": 1.9004855155944824, + "learning_rate": 4.0450014336644204e-05, + "loss": 1.337, + "step": 19880 + }, + { + "epoch": 0.7119809479470697, + "grad_norm": 2.385497808456421, + "learning_rate": 4.0440696607413665e-05, + "loss": 1.6802, + "step": 19881 + }, + { + "epoch": 0.712016760076638, + "grad_norm": 1.733431339263916, + "learning_rate": 4.0431379679476735e-05, + "loss": 1.6468, + "step": 19882 + }, + { + "epoch": 0.7120525722062062, + "grad_norm": 1.4043059349060059, + "learning_rate": 4.042206355295875e-05, + "loss": 1.4577, + "step": 19883 + }, + { + "epoch": 0.7120883843357745, + "grad_norm": 1.3792320489883423, + "learning_rate": 4.0412748227985075e-05, + "loss": 1.4173, + "step": 19884 + }, + { + "epoch": 0.7121241964653429, + "grad_norm": 1.3899900913238525, + "learning_rate": 4.040343370468098e-05, + "loss": 1.3588, + "step": 19885 + }, + { + "epoch": 0.7121600085949111, + "grad_norm": 1.430406928062439, + "learning_rate": 4.039411998317182e-05, + "loss": 1.6367, + "step": 19886 + }, + { + "epoch": 0.7121958207244794, + "grad_norm": 1.4435672760009766, + "learning_rate": 4.038480706358287e-05, + "loss": 1.3293, + "step": 19887 + }, + { + "epoch": 0.7122316328540477, + "grad_norm": 1.3121012449264526, + "learning_rate": 4.0375494946039495e-05, + "loss": 1.0906, + "step": 19888 + }, + { + "epoch": 0.7122674449836159, + "grad_norm": 1.6354104280471802, + "learning_rate": 4.0366183630666885e-05, + "loss": 1.4199, + "step": 19889 + }, + { + "epoch": 0.7123032571131842, + "grad_norm": 1.617457389831543, + "learning_rate": 4.035687311759036e-05, + "loss": 1.2094, + "step": 19890 + }, + { + "epoch": 0.7123390692427525, + "grad_norm": 2.983320713043213, + "learning_rate": 4.03475634069352e-05, + "loss": 1.2032, + "step": 19891 + }, + { + "epoch": 0.7123748813723209, + "grad_norm": 1.9741209745407104, + "learning_rate": 4.033825449882659e-05, + "loss": 1.6208, + "step": 19892 + }, + { + "epoch": 0.7124106935018891, + "grad_norm": 1.3680564165115356, + "learning_rate": 4.032894639338981e-05, + "loss": 1.7192, + "step": 19893 + }, + { + "epoch": 0.7124465056314574, + "grad_norm": 1.6451799869537354, + "learning_rate": 4.031963909075009e-05, + "loss": 1.2906, + "step": 19894 + }, + { + "epoch": 0.7124823177610257, + "grad_norm": 1.493516206741333, + "learning_rate": 4.0310332591032675e-05, + "loss": 1.4103, + "step": 19895 + }, + { + "epoch": 0.7125181298905939, + "grad_norm": 1.9640635251998901, + "learning_rate": 4.030102689436271e-05, + "loss": 1.3533, + "step": 19896 + }, + { + "epoch": 0.7125539420201622, + "grad_norm": 1.6041368246078491, + "learning_rate": 4.0291722000865416e-05, + "loss": 1.2857, + "step": 19897 + }, + { + "epoch": 0.7125897541497305, + "grad_norm": 1.584389090538025, + "learning_rate": 4.0282417910666025e-05, + "loss": 1.5981, + "step": 19898 + }, + { + "epoch": 0.7126255662792988, + "grad_norm": 1.9366203546524048, + "learning_rate": 4.027311462388964e-05, + "loss": 1.6245, + "step": 19899 + }, + { + "epoch": 0.7126613784088671, + "grad_norm": 1.880853533744812, + "learning_rate": 4.026381214066145e-05, + "loss": 1.358, + "step": 19900 + }, + { + "epoch": 0.7126971905384354, + "grad_norm": 1.7722489833831787, + "learning_rate": 4.025451046110661e-05, + "loss": 1.5929, + "step": 19901 + }, + { + "epoch": 0.7127330026680037, + "grad_norm": 1.6037089824676514, + "learning_rate": 4.024520958535031e-05, + "loss": 1.5791, + "step": 19902 + }, + { + "epoch": 0.7127688147975719, + "grad_norm": 2.314192295074463, + "learning_rate": 4.023590951351759e-05, + "loss": 1.5203, + "step": 19903 + }, + { + "epoch": 0.7128046269271402, + "grad_norm": 2.308858633041382, + "learning_rate": 4.022661024573362e-05, + "loss": 1.2871, + "step": 19904 + }, + { + "epoch": 0.7128404390567085, + "grad_norm": 1.604375958442688, + "learning_rate": 4.0217311782123514e-05, + "loss": 1.7423, + "step": 19905 + }, + { + "epoch": 0.7128762511862768, + "grad_norm": 1.2021867036819458, + "learning_rate": 4.020801412281239e-05, + "loss": 1.246, + "step": 19906 + }, + { + "epoch": 0.7129120633158451, + "grad_norm": 2.0452868938446045, + "learning_rate": 4.019871726792528e-05, + "loss": 1.5018, + "step": 19907 + }, + { + "epoch": 0.7129478754454134, + "grad_norm": 1.4524165391921997, + "learning_rate": 4.0189421217587297e-05, + "loss": 1.4838, + "step": 19908 + }, + { + "epoch": 0.7129836875749817, + "grad_norm": 1.3504445552825928, + "learning_rate": 4.0180125971923524e-05, + "loss": 1.5034, + "step": 19909 + }, + { + "epoch": 0.7130194997045499, + "grad_norm": 2.032801389694214, + "learning_rate": 4.017083153105897e-05, + "loss": 1.5505, + "step": 19910 + }, + { + "epoch": 0.7130553118341182, + "grad_norm": 1.6039384603500366, + "learning_rate": 4.0161537895118695e-05, + "loss": 1.3706, + "step": 19911 + }, + { + "epoch": 0.7130911239636865, + "grad_norm": 1.8873577117919922, + "learning_rate": 4.0152245064227745e-05, + "loss": 1.2697, + "step": 19912 + }, + { + "epoch": 0.7131269360932548, + "grad_norm": 1.6298884153366089, + "learning_rate": 4.0142953038511176e-05, + "loss": 1.422, + "step": 19913 + }, + { + "epoch": 0.7131627482228231, + "grad_norm": 1.6877508163452148, + "learning_rate": 4.013366181809393e-05, + "loss": 1.2018, + "step": 19914 + }, + { + "epoch": 0.7131985603523914, + "grad_norm": 1.8622572422027588, + "learning_rate": 4.0124371403101034e-05, + "loss": 1.6352, + "step": 19915 + }, + { + "epoch": 0.7132343724819596, + "grad_norm": 2.3264732360839844, + "learning_rate": 4.0115081793657525e-05, + "loss": 1.5532, + "step": 19916 + }, + { + "epoch": 0.7132701846115279, + "grad_norm": 2.014953374862671, + "learning_rate": 4.010579298988832e-05, + "loss": 1.7364, + "step": 19917 + }, + { + "epoch": 0.7133059967410962, + "grad_norm": 2.2904231548309326, + "learning_rate": 4.00965049919184e-05, + "loss": 1.2316, + "step": 19918 + }, + { + "epoch": 0.7133418088706645, + "grad_norm": 1.9575514793395996, + "learning_rate": 4.0087217799872746e-05, + "loss": 1.4247, + "step": 19919 + }, + { + "epoch": 0.7133776210002328, + "grad_norm": 1.7283276319503784, + "learning_rate": 4.007793141387633e-05, + "loss": 1.4632, + "step": 19920 + }, + { + "epoch": 0.7134134331298011, + "grad_norm": 1.2572942972183228, + "learning_rate": 4.0068645834054e-05, + "loss": 1.251, + "step": 19921 + }, + { + "epoch": 0.7134492452593694, + "grad_norm": 1.8809939622879028, + "learning_rate": 4.0059361060530755e-05, + "loss": 1.8018, + "step": 19922 + }, + { + "epoch": 0.7134850573889376, + "grad_norm": 2.7003350257873535, + "learning_rate": 4.005007709343147e-05, + "loss": 1.3071, + "step": 19923 + }, + { + "epoch": 0.7135208695185059, + "grad_norm": 1.7158159017562866, + "learning_rate": 4.004079393288112e-05, + "loss": 1.5144, + "step": 19924 + }, + { + "epoch": 0.7135566816480742, + "grad_norm": 1.4738690853118896, + "learning_rate": 4.00315115790045e-05, + "loss": 1.6998, + "step": 19925 + }, + { + "epoch": 0.7135924937776424, + "grad_norm": 1.3729649782180786, + "learning_rate": 4.002223003192654e-05, + "loss": 1.2989, + "step": 19926 + }, + { + "epoch": 0.7136283059072108, + "grad_norm": 1.5292845964431763, + "learning_rate": 4.001294929177215e-05, + "loss": 1.683, + "step": 19927 + }, + { + "epoch": 0.7136641180367791, + "grad_norm": 1.4550150632858276, + "learning_rate": 4.0003669358666106e-05, + "loss": 1.4341, + "step": 19928 + }, + { + "epoch": 0.7136999301663474, + "grad_norm": 1.1493412256240845, + "learning_rate": 3.9994390232733304e-05, + "loss": 1.3642, + "step": 19929 + }, + { + "epoch": 0.7137357422959156, + "grad_norm": 1.5714877843856812, + "learning_rate": 3.9985111914098585e-05, + "loss": 1.3609, + "step": 19930 + }, + { + "epoch": 0.7137715544254839, + "grad_norm": 1.6583268642425537, + "learning_rate": 3.99758344028868e-05, + "loss": 1.5051, + "step": 19931 + }, + { + "epoch": 0.7138073665550522, + "grad_norm": 2.281008005142212, + "learning_rate": 3.99665576992227e-05, + "loss": 1.8181, + "step": 19932 + }, + { + "epoch": 0.7138431786846204, + "grad_norm": 1.5775352716445923, + "learning_rate": 3.995728180323114e-05, + "loss": 1.5004, + "step": 19933 + }, + { + "epoch": 0.7138789908141888, + "grad_norm": 1.927957534790039, + "learning_rate": 3.994800671503694e-05, + "loss": 1.4526, + "step": 19934 + }, + { + "epoch": 0.7139148029437571, + "grad_norm": 1.4535878896713257, + "learning_rate": 3.9938732434764805e-05, + "loss": 1.5883, + "step": 19935 + }, + { + "epoch": 0.7139506150733254, + "grad_norm": 1.65828537940979, + "learning_rate": 3.992945896253958e-05, + "loss": 1.6344, + "step": 19936 + }, + { + "epoch": 0.7139864272028936, + "grad_norm": 1.9846526384353638, + "learning_rate": 3.992018629848594e-05, + "loss": 1.3487, + "step": 19937 + }, + { + "epoch": 0.7140222393324619, + "grad_norm": 1.7143404483795166, + "learning_rate": 3.991091444272876e-05, + "loss": 1.2854, + "step": 19938 + }, + { + "epoch": 0.7140580514620302, + "grad_norm": 1.7557034492492676, + "learning_rate": 3.9901643395392685e-05, + "loss": 1.5111, + "step": 19939 + }, + { + "epoch": 0.7140938635915984, + "grad_norm": 1.5067017078399658, + "learning_rate": 3.989237315660248e-05, + "loss": 1.4103, + "step": 19940 + }, + { + "epoch": 0.7141296757211668, + "grad_norm": 2.0715649127960205, + "learning_rate": 3.988310372648285e-05, + "loss": 1.4079, + "step": 19941 + }, + { + "epoch": 0.7141654878507351, + "grad_norm": 1.3460463285446167, + "learning_rate": 3.9873835105158564e-05, + "loss": 1.5776, + "step": 19942 + }, + { + "epoch": 0.7142012999803034, + "grad_norm": 1.6910752058029175, + "learning_rate": 3.9864567292754266e-05, + "loss": 1.6837, + "step": 19943 + }, + { + "epoch": 0.7142371121098716, + "grad_norm": 2.667029619216919, + "learning_rate": 3.985530028939456e-05, + "loss": 1.3131, + "step": 19944 + }, + { + "epoch": 0.7142729242394399, + "grad_norm": 1.554051160812378, + "learning_rate": 3.9846034095204285e-05, + "loss": 1.3344, + "step": 19945 + }, + { + "epoch": 0.7143087363690082, + "grad_norm": 1.4796596765518188, + "learning_rate": 3.9836768710308e-05, + "loss": 1.6145, + "step": 19946 + }, + { + "epoch": 0.7143445484985764, + "grad_norm": 1.7634326219558716, + "learning_rate": 3.982750413483043e-05, + "loss": 1.2404, + "step": 19947 + }, + { + "epoch": 0.7143803606281448, + "grad_norm": 1.5787911415100098, + "learning_rate": 3.981824036889609e-05, + "loss": 1.3975, + "step": 19948 + }, + { + "epoch": 0.7144161727577131, + "grad_norm": 1.527350902557373, + "learning_rate": 3.9808977412629764e-05, + "loss": 1.4577, + "step": 19949 + }, + { + "epoch": 0.7144519848872813, + "grad_norm": 1.693781852722168, + "learning_rate": 3.979971526615598e-05, + "loss": 1.1326, + "step": 19950 + }, + { + "epoch": 0.7144877970168496, + "grad_norm": 1.5585476160049438, + "learning_rate": 3.9790453929599384e-05, + "loss": 1.2612, + "step": 19951 + }, + { + "epoch": 0.7145236091464179, + "grad_norm": 1.7941901683807373, + "learning_rate": 3.978119340308458e-05, + "loss": 1.5067, + "step": 19952 + }, + { + "epoch": 0.7145594212759862, + "grad_norm": 1.4558805227279663, + "learning_rate": 3.977193368673612e-05, + "loss": 1.2185, + "step": 19953 + }, + { + "epoch": 0.7145952334055544, + "grad_norm": 1.8235039710998535, + "learning_rate": 3.976267478067863e-05, + "loss": 1.4494, + "step": 19954 + }, + { + "epoch": 0.7146310455351228, + "grad_norm": 2.5660204887390137, + "learning_rate": 3.975341668503659e-05, + "loss": 1.5131, + "step": 19955 + }, + { + "epoch": 0.7146668576646911, + "grad_norm": 2.2466602325439453, + "learning_rate": 3.9744159399934676e-05, + "loss": 1.6216, + "step": 19956 + }, + { + "epoch": 0.7147026697942593, + "grad_norm": 1.3325560092926025, + "learning_rate": 3.973490292549735e-05, + "loss": 1.478, + "step": 19957 + }, + { + "epoch": 0.7147384819238276, + "grad_norm": 1.7814619541168213, + "learning_rate": 3.97256472618492e-05, + "loss": 1.5762, + "step": 19958 + }, + { + "epoch": 0.7147742940533959, + "grad_norm": 1.7509052753448486, + "learning_rate": 3.971639240911468e-05, + "loss": 1.2004, + "step": 19959 + }, + { + "epoch": 0.7148101061829641, + "grad_norm": 1.4850753545761108, + "learning_rate": 3.970713836741834e-05, + "loss": 1.1819, + "step": 19960 + }, + { + "epoch": 0.7148459183125324, + "grad_norm": 1.3009765148162842, + "learning_rate": 3.9697885136884716e-05, + "loss": 1.5121, + "step": 19961 + }, + { + "epoch": 0.7148817304421008, + "grad_norm": 1.357944369316101, + "learning_rate": 3.968863271763822e-05, + "loss": 1.4413, + "step": 19962 + }, + { + "epoch": 0.7149175425716691, + "grad_norm": 1.5240039825439453, + "learning_rate": 3.967938110980338e-05, + "loss": 1.3346, + "step": 19963 + }, + { + "epoch": 0.7149533547012373, + "grad_norm": 1.7359546422958374, + "learning_rate": 3.9670130313504675e-05, + "loss": 1.5019, + "step": 19964 + }, + { + "epoch": 0.7149891668308056, + "grad_norm": 1.9136874675750732, + "learning_rate": 3.9660880328866556e-05, + "loss": 1.3177, + "step": 19965 + }, + { + "epoch": 0.7150249789603739, + "grad_norm": 1.4809314012527466, + "learning_rate": 3.96516311560134e-05, + "loss": 1.5468, + "step": 19966 + }, + { + "epoch": 0.7150607910899421, + "grad_norm": 1.871216058731079, + "learning_rate": 3.964238279506979e-05, + "loss": 1.6134, + "step": 19967 + }, + { + "epoch": 0.7150966032195104, + "grad_norm": 1.5043405294418335, + "learning_rate": 3.963313524616005e-05, + "loss": 1.3557, + "step": 19968 + }, + { + "epoch": 0.7151324153490788, + "grad_norm": 1.8438221216201782, + "learning_rate": 3.962388850940857e-05, + "loss": 1.5986, + "step": 19969 + }, + { + "epoch": 0.7151682274786471, + "grad_norm": 1.7853418588638306, + "learning_rate": 3.9614642584939784e-05, + "loss": 1.4494, + "step": 19970 + }, + { + "epoch": 0.7152040396082153, + "grad_norm": 1.7872045040130615, + "learning_rate": 3.96053974728781e-05, + "loss": 1.6464, + "step": 19971 + }, + { + "epoch": 0.7152398517377836, + "grad_norm": 1.82921302318573, + "learning_rate": 3.9596153173347925e-05, + "loss": 1.6829, + "step": 19972 + }, + { + "epoch": 0.7152756638673519, + "grad_norm": 1.4460041522979736, + "learning_rate": 3.958690968647356e-05, + "loss": 1.5682, + "step": 19973 + }, + { + "epoch": 0.7153114759969201, + "grad_norm": 2.098288059234619, + "learning_rate": 3.9577667012379395e-05, + "loss": 1.6599, + "step": 19974 + }, + { + "epoch": 0.7153472881264884, + "grad_norm": 1.5381921529769897, + "learning_rate": 3.956842515118978e-05, + "loss": 1.543, + "step": 19975 + }, + { + "epoch": 0.7153831002560568, + "grad_norm": 1.5164440870285034, + "learning_rate": 3.955918410302909e-05, + "loss": 1.3406, + "step": 19976 + }, + { + "epoch": 0.715418912385625, + "grad_norm": 1.4968868494033813, + "learning_rate": 3.954994386802158e-05, + "loss": 1.4984, + "step": 19977 + }, + { + "epoch": 0.7154547245151933, + "grad_norm": 1.6591699123382568, + "learning_rate": 3.95407044462916e-05, + "loss": 1.326, + "step": 19978 + }, + { + "epoch": 0.7154905366447616, + "grad_norm": 1.5657908916473389, + "learning_rate": 3.953146583796349e-05, + "loss": 1.2784, + "step": 19979 + }, + { + "epoch": 0.7155263487743299, + "grad_norm": 1.5644398927688599, + "learning_rate": 3.952222804316148e-05, + "loss": 1.7664, + "step": 19980 + }, + { + "epoch": 0.7155621609038981, + "grad_norm": 1.6657533645629883, + "learning_rate": 3.9512991062009874e-05, + "loss": 1.3903, + "step": 19981 + }, + { + "epoch": 0.7155979730334664, + "grad_norm": 1.300279140472412, + "learning_rate": 3.9503754894632947e-05, + "loss": 1.5107, + "step": 19982 + }, + { + "epoch": 0.7156337851630348, + "grad_norm": 1.9345062971115112, + "learning_rate": 3.949451954115501e-05, + "loss": 1.638, + "step": 19983 + }, + { + "epoch": 0.715669597292603, + "grad_norm": 1.9588655233383179, + "learning_rate": 3.948528500170021e-05, + "loss": 1.377, + "step": 19984 + }, + { + "epoch": 0.7157054094221713, + "grad_norm": 1.5046672821044922, + "learning_rate": 3.9476051276392853e-05, + "loss": 1.4962, + "step": 19985 + }, + { + "epoch": 0.7157412215517396, + "grad_norm": 1.284699559211731, + "learning_rate": 3.946681836535721e-05, + "loss": 1.0213, + "step": 19986 + }, + { + "epoch": 0.7157770336813079, + "grad_norm": 1.8973195552825928, + "learning_rate": 3.945758626871738e-05, + "loss": 1.608, + "step": 19987 + }, + { + "epoch": 0.7158128458108761, + "grad_norm": 1.9553606510162354, + "learning_rate": 3.9448354986597645e-05, + "loss": 1.344, + "step": 19988 + }, + { + "epoch": 0.7158486579404444, + "grad_norm": 1.8713107109069824, + "learning_rate": 3.943912451912219e-05, + "loss": 1.4795, + "step": 19989 + }, + { + "epoch": 0.7158844700700128, + "grad_norm": 2.1087417602539062, + "learning_rate": 3.9429894866415226e-05, + "loss": 1.3713, + "step": 19990 + }, + { + "epoch": 0.715920282199581, + "grad_norm": 1.916988492012024, + "learning_rate": 3.9420666028600874e-05, + "loss": 1.7671, + "step": 19991 + }, + { + "epoch": 0.7159560943291493, + "grad_norm": 1.5088728666305542, + "learning_rate": 3.9411438005803305e-05, + "loss": 1.2984, + "step": 19992 + }, + { + "epoch": 0.7159919064587176, + "grad_norm": 1.6458569765090942, + "learning_rate": 3.9402210798146686e-05, + "loss": 1.4498, + "step": 19993 + }, + { + "epoch": 0.7160277185882858, + "grad_norm": 1.7323598861694336, + "learning_rate": 3.939298440575519e-05, + "loss": 1.6207, + "step": 19994 + }, + { + "epoch": 0.7160635307178541, + "grad_norm": 1.744994044303894, + "learning_rate": 3.9383758828752884e-05, + "loss": 1.537, + "step": 19995 + }, + { + "epoch": 0.7160993428474224, + "grad_norm": 1.408331036567688, + "learning_rate": 3.93745340672639e-05, + "loss": 1.5115, + "step": 19996 + }, + { + "epoch": 0.7161351549769908, + "grad_norm": 1.3587737083435059, + "learning_rate": 3.936531012141241e-05, + "loss": 1.3417, + "step": 19997 + }, + { + "epoch": 0.716170967106559, + "grad_norm": 1.5102040767669678, + "learning_rate": 3.935608699132242e-05, + "loss": 1.1886, + "step": 19998 + }, + { + "epoch": 0.7162067792361273, + "grad_norm": 1.4365791082382202, + "learning_rate": 3.9346864677118046e-05, + "loss": 1.4104, + "step": 19999 + }, + { + "epoch": 0.7162425913656956, + "grad_norm": 1.6736547946929932, + "learning_rate": 3.9337643178923376e-05, + "loss": 1.2907, + "step": 20000 + }, + { + "epoch": 0.7162784034952638, + "grad_norm": 1.6852706670761108, + "learning_rate": 3.932842249686251e-05, + "loss": 1.5204, + "step": 20001 + }, + { + "epoch": 0.7163142156248321, + "grad_norm": 1.6308528184890747, + "learning_rate": 3.9319202631059414e-05, + "loss": 1.3969, + "step": 20002 + }, + { + "epoch": 0.7163500277544004, + "grad_norm": 1.8392781019210815, + "learning_rate": 3.9309983581638173e-05, + "loss": 1.5858, + "step": 20003 + }, + { + "epoch": 0.7163858398839686, + "grad_norm": 1.9511610269546509, + "learning_rate": 3.9300765348722854e-05, + "loss": 1.1621, + "step": 20004 + }, + { + "epoch": 0.716421652013537, + "grad_norm": 1.9499866962432861, + "learning_rate": 3.929154793243741e-05, + "loss": 1.5144, + "step": 20005 + }, + { + "epoch": 0.7164574641431053, + "grad_norm": 1.5823554992675781, + "learning_rate": 3.928233133290589e-05, + "loss": 1.5373, + "step": 20006 + }, + { + "epoch": 0.7164932762726736, + "grad_norm": 2.0642759799957275, + "learning_rate": 3.927311555025227e-05, + "loss": 1.6982, + "step": 20007 + }, + { + "epoch": 0.7165290884022418, + "grad_norm": 1.6306183338165283, + "learning_rate": 3.926390058460058e-05, + "loss": 1.4509, + "step": 20008 + }, + { + "epoch": 0.7165649005318101, + "grad_norm": 1.752936601638794, + "learning_rate": 3.925468643607473e-05, + "loss": 1.5816, + "step": 20009 + }, + { + "epoch": 0.7166007126613784, + "grad_norm": 1.5219781398773193, + "learning_rate": 3.9245473104798726e-05, + "loss": 1.3056, + "step": 20010 + }, + { + "epoch": 0.7166365247909466, + "grad_norm": 1.620758295059204, + "learning_rate": 3.923626059089651e-05, + "loss": 1.2589, + "step": 20011 + }, + { + "epoch": 0.716672336920515, + "grad_norm": 1.6076922416687012, + "learning_rate": 3.9227048894492055e-05, + "loss": 1.6003, + "step": 20012 + }, + { + "epoch": 0.7167081490500833, + "grad_norm": 1.5931798219680786, + "learning_rate": 3.921783801570924e-05, + "loss": 1.341, + "step": 20013 + }, + { + "epoch": 0.7167439611796516, + "grad_norm": 1.723936676979065, + "learning_rate": 3.9208627954672014e-05, + "loss": 1.4163, + "step": 20014 + }, + { + "epoch": 0.7167797733092198, + "grad_norm": 1.620898962020874, + "learning_rate": 3.9199418711504307e-05, + "loss": 1.3572, + "step": 20015 + }, + { + "epoch": 0.7168155854387881, + "grad_norm": 1.4487621784210205, + "learning_rate": 3.919021028632998e-05, + "loss": 1.5906, + "step": 20016 + }, + { + "epoch": 0.7168513975683564, + "grad_norm": 1.5113482475280762, + "learning_rate": 3.918100267927292e-05, + "loss": 1.6878, + "step": 20017 + }, + { + "epoch": 0.7168872096979246, + "grad_norm": 1.350650668144226, + "learning_rate": 3.917179589045701e-05, + "loss": 1.3855, + "step": 20018 + }, + { + "epoch": 0.716923021827493, + "grad_norm": 2.0095953941345215, + "learning_rate": 3.9162589920006164e-05, + "loss": 1.6719, + "step": 20019 + }, + { + "epoch": 0.7169588339570613, + "grad_norm": 2.0317280292510986, + "learning_rate": 3.9153384768044163e-05, + "loss": 1.2714, + "step": 20020 + }, + { + "epoch": 0.7169946460866296, + "grad_norm": 1.5975542068481445, + "learning_rate": 3.9144180434694885e-05, + "loss": 1.4929, + "step": 20021 + }, + { + "epoch": 0.7170304582161978, + "grad_norm": 1.5322084426879883, + "learning_rate": 3.91349769200822e-05, + "loss": 1.0812, + "step": 20022 + }, + { + "epoch": 0.7170662703457661, + "grad_norm": 1.5115982294082642, + "learning_rate": 3.9125774224329845e-05, + "loss": 1.4755, + "step": 20023 + }, + { + "epoch": 0.7171020824753344, + "grad_norm": 1.715003252029419, + "learning_rate": 3.911657234756169e-05, + "loss": 1.4666, + "step": 20024 + }, + { + "epoch": 0.7171378946049026, + "grad_norm": 1.540813684463501, + "learning_rate": 3.9107371289901504e-05, + "loss": 1.4246, + "step": 20025 + }, + { + "epoch": 0.717173706734471, + "grad_norm": 1.781424880027771, + "learning_rate": 3.909817105147314e-05, + "loss": 1.3314, + "step": 20026 + }, + { + "epoch": 0.7172095188640393, + "grad_norm": 1.4089975357055664, + "learning_rate": 3.9088971632400286e-05, + "loss": 1.2955, + "step": 20027 + }, + { + "epoch": 0.7172453309936075, + "grad_norm": 1.7052937746047974, + "learning_rate": 3.907977303280674e-05, + "loss": 1.6117, + "step": 20028 + }, + { + "epoch": 0.7172811431231758, + "grad_norm": 1.4896774291992188, + "learning_rate": 3.907057525281628e-05, + "loss": 1.4082, + "step": 20029 + }, + { + "epoch": 0.7173169552527441, + "grad_norm": 1.4978755712509155, + "learning_rate": 3.906137829255266e-05, + "loss": 1.3267, + "step": 20030 + }, + { + "epoch": 0.7173527673823124, + "grad_norm": 1.3901692628860474, + "learning_rate": 3.90521821521396e-05, + "loss": 1.1466, + "step": 20031 + }, + { + "epoch": 0.7173885795118806, + "grad_norm": 1.3833082914352417, + "learning_rate": 3.904298683170074e-05, + "loss": 1.577, + "step": 20032 + }, + { + "epoch": 0.717424391641449, + "grad_norm": 1.5521522760391235, + "learning_rate": 3.903379233135994e-05, + "loss": 1.2189, + "step": 20033 + }, + { + "epoch": 0.7174602037710173, + "grad_norm": 1.3972352743148804, + "learning_rate": 3.9024598651240774e-05, + "loss": 1.5434, + "step": 20034 + }, + { + "epoch": 0.7174960159005855, + "grad_norm": 1.4136375188827515, + "learning_rate": 3.901540579146698e-05, + "loss": 1.4805, + "step": 20035 + }, + { + "epoch": 0.7175318280301538, + "grad_norm": 1.4638866186141968, + "learning_rate": 3.900621375216226e-05, + "loss": 1.5089, + "step": 20036 + }, + { + "epoch": 0.7175676401597221, + "grad_norm": 1.317194938659668, + "learning_rate": 3.8997022533450264e-05, + "loss": 1.3669, + "step": 20037 + }, + { + "epoch": 0.7176034522892903, + "grad_norm": 2.1901493072509766, + "learning_rate": 3.898783213545463e-05, + "loss": 1.1969, + "step": 20038 + }, + { + "epoch": 0.7176392644188586, + "grad_norm": 1.7815966606140137, + "learning_rate": 3.8978642558298994e-05, + "loss": 1.6605, + "step": 20039 + }, + { + "epoch": 0.717675076548427, + "grad_norm": 1.8504796028137207, + "learning_rate": 3.8969453802107057e-05, + "loss": 1.2785, + "step": 20040 + }, + { + "epoch": 0.7177108886779953, + "grad_norm": 1.529250144958496, + "learning_rate": 3.8960265867002364e-05, + "loss": 1.6428, + "step": 20041 + }, + { + "epoch": 0.7177467008075635, + "grad_norm": 1.3686065673828125, + "learning_rate": 3.895107875310858e-05, + "loss": 1.1473, + "step": 20042 + }, + { + "epoch": 0.7177825129371318, + "grad_norm": 2.0113461017608643, + "learning_rate": 3.894189246054922e-05, + "loss": 1.4563, + "step": 20043 + }, + { + "epoch": 0.7178183250667001, + "grad_norm": 1.429203748703003, + "learning_rate": 3.893270698944802e-05, + "loss": 1.3154, + "step": 20044 + }, + { + "epoch": 0.7178541371962683, + "grad_norm": 2.117223024368286, + "learning_rate": 3.892352233992843e-05, + "loss": 1.5677, + "step": 20045 + }, + { + "epoch": 0.7178899493258366, + "grad_norm": 1.57451331615448, + "learning_rate": 3.89143385121141e-05, + "loss": 1.3936, + "step": 20046 + }, + { + "epoch": 0.717925761455405, + "grad_norm": 1.4294334650039673, + "learning_rate": 3.8905155506128476e-05, + "loss": 1.4473, + "step": 20047 + }, + { + "epoch": 0.7179615735849733, + "grad_norm": 1.9604945182800293, + "learning_rate": 3.889597332209526e-05, + "loss": 0.9822, + "step": 20048 + }, + { + "epoch": 0.7179973857145415, + "grad_norm": 1.716846227645874, + "learning_rate": 3.888679196013789e-05, + "loss": 1.7099, + "step": 20049 + }, + { + "epoch": 0.7180331978441098, + "grad_norm": 1.586357831954956, + "learning_rate": 3.887761142037984e-05, + "loss": 1.4334, + "step": 20050 + }, + { + "epoch": 0.7180690099736781, + "grad_norm": 1.982703685760498, + "learning_rate": 3.886843170294475e-05, + "loss": 1.7308, + "step": 20051 + }, + { + "epoch": 0.7181048221032463, + "grad_norm": 2.7019457817077637, + "learning_rate": 3.8859252807956035e-05, + "loss": 1.4159, + "step": 20052 + }, + { + "epoch": 0.7181406342328146, + "grad_norm": 1.6056925058364868, + "learning_rate": 3.885007473553723e-05, + "loss": 1.2037, + "step": 20053 + }, + { + "epoch": 0.718176446362383, + "grad_norm": 1.4310388565063477, + "learning_rate": 3.8840897485811737e-05, + "loss": 1.2534, + "step": 20054 + }, + { + "epoch": 0.7182122584919512, + "grad_norm": 1.3919428586959839, + "learning_rate": 3.883172105890314e-05, + "loss": 1.6811, + "step": 20055 + }, + { + "epoch": 0.7182480706215195, + "grad_norm": 1.4286328554153442, + "learning_rate": 3.8822545454934836e-05, + "loss": 1.3516, + "step": 20056 + }, + { + "epoch": 0.7182838827510878, + "grad_norm": 1.5007710456848145, + "learning_rate": 3.881337067403022e-05, + "loss": 1.6128, + "step": 20057 + }, + { + "epoch": 0.7183196948806561, + "grad_norm": 1.7567851543426514, + "learning_rate": 3.8804196716312805e-05, + "loss": 1.6251, + "step": 20058 + }, + { + "epoch": 0.7183555070102243, + "grad_norm": 1.5256688594818115, + "learning_rate": 3.879502358190596e-05, + "loss": 1.5159, + "step": 20059 + }, + { + "epoch": 0.7183913191397926, + "grad_norm": 1.9770673513412476, + "learning_rate": 3.878585127093317e-05, + "loss": 1.393, + "step": 20060 + }, + { + "epoch": 0.718427131269361, + "grad_norm": 1.4122111797332764, + "learning_rate": 3.877667978351772e-05, + "loss": 1.4472, + "step": 20061 + }, + { + "epoch": 0.7184629433989292, + "grad_norm": 1.5489094257354736, + "learning_rate": 3.876750911978315e-05, + "loss": 1.3976, + "step": 20062 + }, + { + "epoch": 0.7184987555284975, + "grad_norm": 1.4532663822174072, + "learning_rate": 3.875833927985272e-05, + "loss": 1.2297, + "step": 20063 + }, + { + "epoch": 0.7185345676580658, + "grad_norm": 1.3802516460418701, + "learning_rate": 3.8749170263849865e-05, + "loss": 1.1974, + "step": 20064 + }, + { + "epoch": 0.718570379787634, + "grad_norm": 1.4403727054595947, + "learning_rate": 3.874000207189789e-05, + "loss": 1.4378, + "step": 20065 + }, + { + "epoch": 0.7186061919172023, + "grad_norm": 1.9045497179031372, + "learning_rate": 3.8730834704120164e-05, + "loss": 1.7718, + "step": 20066 + }, + { + "epoch": 0.7186420040467706, + "grad_norm": 1.5778992176055908, + "learning_rate": 3.8721668160640054e-05, + "loss": 1.6047, + "step": 20067 + }, + { + "epoch": 0.718677816176339, + "grad_norm": 1.8331999778747559, + "learning_rate": 3.871250244158083e-05, + "loss": 1.6248, + "step": 20068 + }, + { + "epoch": 0.7187136283059072, + "grad_norm": 1.9123164415359497, + "learning_rate": 3.870333754706583e-05, + "loss": 1.5269, + "step": 20069 + }, + { + "epoch": 0.7187494404354755, + "grad_norm": 2.063952922821045, + "learning_rate": 3.8694173477218355e-05, + "loss": 1.7413, + "step": 20070 + }, + { + "epoch": 0.7187852525650438, + "grad_norm": 1.6923632621765137, + "learning_rate": 3.8685010232161736e-05, + "loss": 1.0536, + "step": 20071 + }, + { + "epoch": 0.718821064694612, + "grad_norm": 1.7924041748046875, + "learning_rate": 3.8675847812019175e-05, + "loss": 1.3545, + "step": 20072 + }, + { + "epoch": 0.7188568768241803, + "grad_norm": 1.7178550958633423, + "learning_rate": 3.866668621691397e-05, + "loss": 1.3935, + "step": 20073 + }, + { + "epoch": 0.7188926889537486, + "grad_norm": 1.6161495447158813, + "learning_rate": 3.8657525446969436e-05, + "loss": 1.3834, + "step": 20074 + }, + { + "epoch": 0.718928501083317, + "grad_norm": 2.147392988204956, + "learning_rate": 3.864836550230874e-05, + "loss": 1.6886, + "step": 20075 + }, + { + "epoch": 0.7189643132128852, + "grad_norm": 1.8922128677368164, + "learning_rate": 3.863920638305512e-05, + "loss": 1.468, + "step": 20076 + }, + { + "epoch": 0.7190001253424535, + "grad_norm": 1.7388283014297485, + "learning_rate": 3.863004808933186e-05, + "loss": 1.554, + "step": 20077 + }, + { + "epoch": 0.7190359374720218, + "grad_norm": 1.6552281379699707, + "learning_rate": 3.8620890621262164e-05, + "loss": 1.6504, + "step": 20078 + }, + { + "epoch": 0.71907174960159, + "grad_norm": 1.4679269790649414, + "learning_rate": 3.8611733978969176e-05, + "loss": 1.451, + "step": 20079 + }, + { + "epoch": 0.7191075617311583, + "grad_norm": 1.8459839820861816, + "learning_rate": 3.860257816257612e-05, + "loss": 1.4404, + "step": 20080 + }, + { + "epoch": 0.7191433738607266, + "grad_norm": 1.5493810176849365, + "learning_rate": 3.859342317220619e-05, + "loss": 1.3396, + "step": 20081 + }, + { + "epoch": 0.719179185990295, + "grad_norm": 1.4488211870193481, + "learning_rate": 3.8584269007982565e-05, + "loss": 1.5612, + "step": 20082 + }, + { + "epoch": 0.7192149981198632, + "grad_norm": 2.1801438331604004, + "learning_rate": 3.857511567002835e-05, + "loss": 1.1745, + "step": 20083 + }, + { + "epoch": 0.7192508102494315, + "grad_norm": 1.371329665184021, + "learning_rate": 3.8565963158466714e-05, + "loss": 1.4076, + "step": 20084 + }, + { + "epoch": 0.7192866223789998, + "grad_norm": 1.5765669345855713, + "learning_rate": 3.855681147342084e-05, + "loss": 1.4628, + "step": 20085 + }, + { + "epoch": 0.719322434508568, + "grad_norm": 1.3415297269821167, + "learning_rate": 3.854766061501378e-05, + "loss": 1.6046, + "step": 20086 + }, + { + "epoch": 0.7193582466381363, + "grad_norm": 1.9021408557891846, + "learning_rate": 3.853851058336867e-05, + "loss": 1.4807, + "step": 20087 + }, + { + "epoch": 0.7193940587677046, + "grad_norm": 1.543630599975586, + "learning_rate": 3.852936137860863e-05, + "loss": 1.2401, + "step": 20088 + }, + { + "epoch": 0.719429870897273, + "grad_norm": 1.5069166421890259, + "learning_rate": 3.8520213000856763e-05, + "loss": 1.6359, + "step": 20089 + }, + { + "epoch": 0.7194656830268412, + "grad_norm": 1.6449099779129028, + "learning_rate": 3.85110654502361e-05, + "loss": 1.4342, + "step": 20090 + }, + { + "epoch": 0.7195014951564095, + "grad_norm": 1.4722683429718018, + "learning_rate": 3.8501918726869744e-05, + "loss": 1.6538, + "step": 20091 + }, + { + "epoch": 0.7195373072859778, + "grad_norm": 1.6070823669433594, + "learning_rate": 3.8492772830880776e-05, + "loss": 1.2967, + "step": 20092 + }, + { + "epoch": 0.719573119415546, + "grad_norm": 1.3607196807861328, + "learning_rate": 3.848362776239217e-05, + "loss": 1.559, + "step": 20093 + }, + { + "epoch": 0.7196089315451143, + "grad_norm": 1.7744579315185547, + "learning_rate": 3.847448352152701e-05, + "loss": 1.7096, + "step": 20094 + }, + { + "epoch": 0.7196447436746826, + "grad_norm": 1.426999807357788, + "learning_rate": 3.84653401084083e-05, + "loss": 1.4129, + "step": 20095 + }, + { + "epoch": 0.7196805558042509, + "grad_norm": 1.5814954042434692, + "learning_rate": 3.8456197523159096e-05, + "loss": 1.3991, + "step": 20096 + }, + { + "epoch": 0.7197163679338192, + "grad_norm": 1.6949788331985474, + "learning_rate": 3.844705576590235e-05, + "loss": 1.2601, + "step": 20097 + }, + { + "epoch": 0.7197521800633875, + "grad_norm": 1.731933832168579, + "learning_rate": 3.843791483676107e-05, + "loss": 1.5995, + "step": 20098 + }, + { + "epoch": 0.7197879921929558, + "grad_norm": 1.780846357345581, + "learning_rate": 3.842877473585823e-05, + "loss": 1.2574, + "step": 20099 + }, + { + "epoch": 0.719823804322524, + "grad_norm": 1.3572626113891602, + "learning_rate": 3.841963546331684e-05, + "loss": 1.4639, + "step": 20100 + }, + { + "epoch": 0.7198596164520923, + "grad_norm": 1.2794239521026611, + "learning_rate": 3.841049701925978e-05, + "loss": 1.2574, + "step": 20101 + }, + { + "epoch": 0.7198954285816606, + "grad_norm": 1.5886186361312866, + "learning_rate": 3.840135940381006e-05, + "loss": 1.3844, + "step": 20102 + }, + { + "epoch": 0.7199312407112289, + "grad_norm": 2.106218099594116, + "learning_rate": 3.839222261709061e-05, + "loss": 1.3317, + "step": 20103 + }, + { + "epoch": 0.7199670528407972, + "grad_norm": 2.0230941772460938, + "learning_rate": 3.83830866592243e-05, + "loss": 1.5562, + "step": 20104 + }, + { + "epoch": 0.7200028649703655, + "grad_norm": 1.5232678651809692, + "learning_rate": 3.8373951530334086e-05, + "loss": 1.0045, + "step": 20105 + }, + { + "epoch": 0.7200386770999337, + "grad_norm": 3.936558961868286, + "learning_rate": 3.836481723054286e-05, + "loss": 1.4348, + "step": 20106 + }, + { + "epoch": 0.720074489229502, + "grad_norm": 1.2785394191741943, + "learning_rate": 3.835568375997355e-05, + "loss": 1.5436, + "step": 20107 + }, + { + "epoch": 0.7201103013590703, + "grad_norm": 2.008934259414673, + "learning_rate": 3.8346551118748967e-05, + "loss": 1.3253, + "step": 20108 + }, + { + "epoch": 0.7201461134886386, + "grad_norm": 1.5482290983200073, + "learning_rate": 3.8337419306992e-05, + "loss": 1.4049, + "step": 20109 + }, + { + "epoch": 0.7201819256182069, + "grad_norm": 1.702072262763977, + "learning_rate": 3.8328288324825566e-05, + "loss": 1.4319, + "step": 20110 + }, + { + "epoch": 0.7202177377477752, + "grad_norm": 2.1846251487731934, + "learning_rate": 3.831915817237243e-05, + "loss": 1.3064, + "step": 20111 + }, + { + "epoch": 0.7202535498773435, + "grad_norm": 1.7581915855407715, + "learning_rate": 3.831002884975544e-05, + "loss": 1.4488, + "step": 20112 + }, + { + "epoch": 0.7202893620069117, + "grad_norm": 1.5173016786575317, + "learning_rate": 3.830090035709745e-05, + "loss": 1.2531, + "step": 20113 + }, + { + "epoch": 0.72032517413648, + "grad_norm": 1.3958326578140259, + "learning_rate": 3.8291772694521285e-05, + "loss": 1.5105, + "step": 20114 + }, + { + "epoch": 0.7203609862660483, + "grad_norm": 2.2686309814453125, + "learning_rate": 3.82826458621497e-05, + "loss": 1.3909, + "step": 20115 + }, + { + "epoch": 0.7203967983956165, + "grad_norm": 2.0584423542022705, + "learning_rate": 3.82735198601055e-05, + "loss": 1.6335, + "step": 20116 + }, + { + "epoch": 0.7204326105251849, + "grad_norm": 1.203965187072754, + "learning_rate": 3.8264394688511466e-05, + "loss": 1.2574, + "step": 20117 + }, + { + "epoch": 0.7204684226547532, + "grad_norm": 1.8112740516662598, + "learning_rate": 3.82552703474904e-05, + "loss": 1.4215, + "step": 20118 + }, + { + "epoch": 0.7205042347843215, + "grad_norm": 1.7669355869293213, + "learning_rate": 3.8246146837165e-05, + "loss": 1.6208, + "step": 20119 + }, + { + "epoch": 0.7205400469138897, + "grad_norm": 1.4590661525726318, + "learning_rate": 3.823702415765803e-05, + "loss": 1.3783, + "step": 20120 + }, + { + "epoch": 0.720575859043458, + "grad_norm": 2.1782760620117188, + "learning_rate": 3.822790230909227e-05, + "loss": 1.64, + "step": 20121 + }, + { + "epoch": 0.7206116711730263, + "grad_norm": 1.6818681955337524, + "learning_rate": 3.821878129159037e-05, + "loss": 1.6294, + "step": 20122 + }, + { + "epoch": 0.7206474833025945, + "grad_norm": 1.7230101823806763, + "learning_rate": 3.8209661105275077e-05, + "loss": 1.3779, + "step": 20123 + }, + { + "epoch": 0.7206832954321629, + "grad_norm": 1.3677054643630981, + "learning_rate": 3.820054175026908e-05, + "loss": 1.0637, + "step": 20124 + }, + { + "epoch": 0.7207191075617312, + "grad_norm": 1.5686241388320923, + "learning_rate": 3.8191423226695125e-05, + "loss": 1.278, + "step": 20125 + }, + { + "epoch": 0.7207549196912995, + "grad_norm": 1.5883556604385376, + "learning_rate": 3.81823055346758e-05, + "loss": 1.3986, + "step": 20126 + }, + { + "epoch": 0.7207907318208677, + "grad_norm": 1.4147443771362305, + "learning_rate": 3.817318867433383e-05, + "loss": 1.3892, + "step": 20127 + }, + { + "epoch": 0.720826543950436, + "grad_norm": 1.6232571601867676, + "learning_rate": 3.816407264579187e-05, + "loss": 1.2004, + "step": 20128 + }, + { + "epoch": 0.7208623560800043, + "grad_norm": 1.8220059871673584, + "learning_rate": 3.8154957449172524e-05, + "loss": 1.1985, + "step": 20129 + }, + { + "epoch": 0.7208981682095725, + "grad_norm": 1.627021074295044, + "learning_rate": 3.814584308459849e-05, + "loss": 1.3011, + "step": 20130 + }, + { + "epoch": 0.7209339803391409, + "grad_norm": 1.998015284538269, + "learning_rate": 3.8136729552192274e-05, + "loss": 1.4227, + "step": 20131 + }, + { + "epoch": 0.7209697924687092, + "grad_norm": 1.4177651405334473, + "learning_rate": 3.812761685207664e-05, + "loss": 1.2593, + "step": 20132 + }, + { + "epoch": 0.7210056045982774, + "grad_norm": 1.8777852058410645, + "learning_rate": 3.811850498437407e-05, + "loss": 1.3507, + "step": 20133 + }, + { + "epoch": 0.7210414167278457, + "grad_norm": 2.2260215282440186, + "learning_rate": 3.81093939492072e-05, + "loss": 1.3991, + "step": 20134 + }, + { + "epoch": 0.721077228857414, + "grad_norm": 1.6670490503311157, + "learning_rate": 3.810028374669859e-05, + "loss": 1.2042, + "step": 20135 + }, + { + "epoch": 0.7211130409869823, + "grad_norm": 2.2959189414978027, + "learning_rate": 3.8091174376970876e-05, + "loss": 1.3529, + "step": 20136 + }, + { + "epoch": 0.7211488531165505, + "grad_norm": 1.5722131729125977, + "learning_rate": 3.808206584014653e-05, + "loss": 1.2084, + "step": 20137 + }, + { + "epoch": 0.7211846652461189, + "grad_norm": 1.5049318075180054, + "learning_rate": 3.807295813634807e-05, + "loss": 1.6146, + "step": 20138 + }, + { + "epoch": 0.7212204773756872, + "grad_norm": 2.4031875133514404, + "learning_rate": 3.8063851265698134e-05, + "loss": 1.467, + "step": 20139 + }, + { + "epoch": 0.7212562895052554, + "grad_norm": 1.4516150951385498, + "learning_rate": 3.805474522831916e-05, + "loss": 1.5926, + "step": 20140 + }, + { + "epoch": 0.7212921016348237, + "grad_norm": 1.6928277015686035, + "learning_rate": 3.804564002433371e-05, + "loss": 1.4698, + "step": 20141 + }, + { + "epoch": 0.721327913764392, + "grad_norm": 2.470627546310425, + "learning_rate": 3.8036535653864193e-05, + "loss": 1.4727, + "step": 20142 + }, + { + "epoch": 0.7213637258939603, + "grad_norm": 1.5808488130569458, + "learning_rate": 3.8027432117033237e-05, + "loss": 1.5674, + "step": 20143 + }, + { + "epoch": 0.7213995380235285, + "grad_norm": 1.833074688911438, + "learning_rate": 3.80183294139632e-05, + "loss": 1.4833, + "step": 20144 + }, + { + "epoch": 0.7214353501530969, + "grad_norm": 1.6991946697235107, + "learning_rate": 3.8009227544776595e-05, + "loss": 1.6437, + "step": 20145 + }, + { + "epoch": 0.7214711622826652, + "grad_norm": 1.4071147441864014, + "learning_rate": 3.80001265095959e-05, + "loss": 0.9343, + "step": 20146 + }, + { + "epoch": 0.7215069744122334, + "grad_norm": 1.4687845706939697, + "learning_rate": 3.799102630854351e-05, + "loss": 1.4609, + "step": 20147 + }, + { + "epoch": 0.7215427865418017, + "grad_norm": 1.7575792074203491, + "learning_rate": 3.79819269417419e-05, + "loss": 1.5505, + "step": 20148 + }, + { + "epoch": 0.72157859867137, + "grad_norm": 2.31231689453125, + "learning_rate": 3.797282840931339e-05, + "loss": 1.5541, + "step": 20149 + }, + { + "epoch": 0.7216144108009382, + "grad_norm": 1.4037084579467773, + "learning_rate": 3.796373071138054e-05, + "loss": 1.6367, + "step": 20150 + }, + { + "epoch": 0.7216502229305065, + "grad_norm": 2.1780078411102295, + "learning_rate": 3.795463384806564e-05, + "loss": 1.6328, + "step": 20151 + }, + { + "epoch": 0.7216860350600749, + "grad_norm": 1.7256742715835571, + "learning_rate": 3.794553781949114e-05, + "loss": 1.3406, + "step": 20152 + }, + { + "epoch": 0.7217218471896432, + "grad_norm": 1.9494801759719849, + "learning_rate": 3.793644262577934e-05, + "loss": 1.3391, + "step": 20153 + }, + { + "epoch": 0.7217576593192114, + "grad_norm": 1.6500424146652222, + "learning_rate": 3.7927348267052666e-05, + "loss": 1.4641, + "step": 20154 + }, + { + "epoch": 0.7217934714487797, + "grad_norm": 1.6580140590667725, + "learning_rate": 3.791825474343348e-05, + "loss": 1.5671, + "step": 20155 + }, + { + "epoch": 0.721829283578348, + "grad_norm": 1.598800778388977, + "learning_rate": 3.790916205504406e-05, + "loss": 1.4017, + "step": 20156 + }, + { + "epoch": 0.7218650957079162, + "grad_norm": 1.5164865255355835, + "learning_rate": 3.7900070202006764e-05, + "loss": 1.1569, + "step": 20157 + }, + { + "epoch": 0.7219009078374845, + "grad_norm": 1.3159844875335693, + "learning_rate": 3.789097918444394e-05, + "loss": 1.4161, + "step": 20158 + }, + { + "epoch": 0.7219367199670529, + "grad_norm": 1.960456132888794, + "learning_rate": 3.78818890024779e-05, + "loss": 1.3394, + "step": 20159 + }, + { + "epoch": 0.7219725320966212, + "grad_norm": 1.6618950366973877, + "learning_rate": 3.787279965623085e-05, + "loss": 1.719, + "step": 20160 + }, + { + "epoch": 0.7220083442261894, + "grad_norm": 1.2125014066696167, + "learning_rate": 3.786371114582521e-05, + "loss": 1.1418, + "step": 20161 + }, + { + "epoch": 0.7220441563557577, + "grad_norm": 1.914084792137146, + "learning_rate": 3.785462347138319e-05, + "loss": 1.6195, + "step": 20162 + }, + { + "epoch": 0.722079968485326, + "grad_norm": 1.5807491540908813, + "learning_rate": 3.784553663302701e-05, + "loss": 1.5063, + "step": 20163 + }, + { + "epoch": 0.7221157806148942, + "grad_norm": 1.9150186777114868, + "learning_rate": 3.783645063087896e-05, + "loss": 1.2985, + "step": 20164 + }, + { + "epoch": 0.7221515927444625, + "grad_norm": 2.09673810005188, + "learning_rate": 3.782736546506128e-05, + "loss": 1.5551, + "step": 20165 + }, + { + "epoch": 0.7221874048740309, + "grad_norm": 1.5515776872634888, + "learning_rate": 3.781828113569624e-05, + "loss": 1.7333, + "step": 20166 + }, + { + "epoch": 0.7222232170035991, + "grad_norm": 1.499321460723877, + "learning_rate": 3.780919764290599e-05, + "loss": 1.4817, + "step": 20167 + }, + { + "epoch": 0.7222590291331674, + "grad_norm": 1.4306893348693848, + "learning_rate": 3.780011498681276e-05, + "loss": 1.2433, + "step": 20168 + }, + { + "epoch": 0.7222948412627357, + "grad_norm": 1.8531023263931274, + "learning_rate": 3.779103316753875e-05, + "loss": 1.205, + "step": 20169 + }, + { + "epoch": 0.722330653392304, + "grad_norm": 1.7147332429885864, + "learning_rate": 3.778195218520618e-05, + "loss": 1.3981, + "step": 20170 + }, + { + "epoch": 0.7223664655218722, + "grad_norm": 1.5731045007705688, + "learning_rate": 3.777287203993716e-05, + "loss": 1.5347, + "step": 20171 + }, + { + "epoch": 0.7224022776514405, + "grad_norm": 1.9253267049789429, + "learning_rate": 3.7763792731853865e-05, + "loss": 1.7233, + "step": 20172 + }, + { + "epoch": 0.7224380897810089, + "grad_norm": 1.4065598249435425, + "learning_rate": 3.77547142610785e-05, + "loss": 1.1908, + "step": 20173 + }, + { + "epoch": 0.7224739019105771, + "grad_norm": 1.5758453607559204, + "learning_rate": 3.774563662773314e-05, + "loss": 1.2734, + "step": 20174 + }, + { + "epoch": 0.7225097140401454, + "grad_norm": 1.502901315689087, + "learning_rate": 3.773655983193992e-05, + "loss": 1.5944, + "step": 20175 + }, + { + "epoch": 0.7225455261697137, + "grad_norm": 1.7962548732757568, + "learning_rate": 3.772748387382099e-05, + "loss": 1.2866, + "step": 20176 + }, + { + "epoch": 0.722581338299282, + "grad_norm": 2.224817991256714, + "learning_rate": 3.7718408753498456e-05, + "loss": 1.3286, + "step": 20177 + }, + { + "epoch": 0.7226171504288502, + "grad_norm": 1.7116045951843262, + "learning_rate": 3.770933447109437e-05, + "loss": 1.2324, + "step": 20178 + }, + { + "epoch": 0.7226529625584185, + "grad_norm": 1.5392895936965942, + "learning_rate": 3.7700261026730844e-05, + "loss": 1.2427, + "step": 20179 + }, + { + "epoch": 0.7226887746879869, + "grad_norm": 1.7307379245758057, + "learning_rate": 3.7691188420529974e-05, + "loss": 1.4682, + "step": 20180 + }, + { + "epoch": 0.7227245868175551, + "grad_norm": 1.6550228595733643, + "learning_rate": 3.768211665261375e-05, + "loss": 1.5901, + "step": 20181 + }, + { + "epoch": 0.7227603989471234, + "grad_norm": 1.491673469543457, + "learning_rate": 3.7673045723104275e-05, + "loss": 1.3454, + "step": 20182 + }, + { + "epoch": 0.7227962110766917, + "grad_norm": 1.3128697872161865, + "learning_rate": 3.7663975632123574e-05, + "loss": 1.4408, + "step": 20183 + }, + { + "epoch": 0.7228320232062599, + "grad_norm": 1.577620267868042, + "learning_rate": 3.76549063797937e-05, + "loss": 1.1176, + "step": 20184 + }, + { + "epoch": 0.7228678353358282, + "grad_norm": 2.135578155517578, + "learning_rate": 3.7645837966236605e-05, + "loss": 1.4422, + "step": 20185 + }, + { + "epoch": 0.7229036474653965, + "grad_norm": 1.4765774011611938, + "learning_rate": 3.763677039157433e-05, + "loss": 1.2351, + "step": 20186 + }, + { + "epoch": 0.7229394595949649, + "grad_norm": 1.3106032609939575, + "learning_rate": 3.762770365592887e-05, + "loss": 1.3763, + "step": 20187 + }, + { + "epoch": 0.7229752717245331, + "grad_norm": 1.6912516355514526, + "learning_rate": 3.7618637759422236e-05, + "loss": 1.4735, + "step": 20188 + }, + { + "epoch": 0.7230110838541014, + "grad_norm": 1.7040541172027588, + "learning_rate": 3.760957270217633e-05, + "loss": 1.4154, + "step": 20189 + }, + { + "epoch": 0.7230468959836697, + "grad_norm": 1.61090087890625, + "learning_rate": 3.7600508484313146e-05, + "loss": 1.4419, + "step": 20190 + }, + { + "epoch": 0.7230827081132379, + "grad_norm": 1.7219356298446655, + "learning_rate": 3.759144510595467e-05, + "loss": 1.5413, + "step": 20191 + }, + { + "epoch": 0.7231185202428062, + "grad_norm": 1.4904463291168213, + "learning_rate": 3.7582382567222754e-05, + "loss": 1.0032, + "step": 20192 + }, + { + "epoch": 0.7231543323723745, + "grad_norm": 1.4005389213562012, + "learning_rate": 3.757332086823937e-05, + "loss": 1.4532, + "step": 20193 + }, + { + "epoch": 0.7231901445019429, + "grad_norm": 1.4816468954086304, + "learning_rate": 3.756426000912644e-05, + "loss": 1.5392, + "step": 20194 + }, + { + "epoch": 0.7232259566315111, + "grad_norm": 1.534940481185913, + "learning_rate": 3.7555199990005874e-05, + "loss": 1.1027, + "step": 20195 + }, + { + "epoch": 0.7232617687610794, + "grad_norm": 1.4121631383895874, + "learning_rate": 3.754614081099952e-05, + "loss": 1.1582, + "step": 20196 + }, + { + "epoch": 0.7232975808906477, + "grad_norm": 1.200635552406311, + "learning_rate": 3.753708247222928e-05, + "loss": 1.3734, + "step": 20197 + }, + { + "epoch": 0.7233333930202159, + "grad_norm": 1.5664600133895874, + "learning_rate": 3.752802497381706e-05, + "loss": 1.4627, + "step": 20198 + }, + { + "epoch": 0.7233692051497842, + "grad_norm": 1.6560693979263306, + "learning_rate": 3.751896831588464e-05, + "loss": 1.3806, + "step": 20199 + }, + { + "epoch": 0.7234050172793525, + "grad_norm": 1.5157887935638428, + "learning_rate": 3.7509912498553914e-05, + "loss": 1.495, + "step": 20200 + }, + { + "epoch": 0.7234408294089208, + "grad_norm": 2.2800793647766113, + "learning_rate": 3.750085752194671e-05, + "loss": 1.3048, + "step": 20201 + }, + { + "epoch": 0.7234766415384891, + "grad_norm": 1.5321568250656128, + "learning_rate": 3.749180338618488e-05, + "loss": 1.5647, + "step": 20202 + }, + { + "epoch": 0.7235124536680574, + "grad_norm": 1.694242238998413, + "learning_rate": 3.7482750091390176e-05, + "loss": 1.4978, + "step": 20203 + }, + { + "epoch": 0.7235482657976257, + "grad_norm": 1.7178481817245483, + "learning_rate": 3.7473697637684416e-05, + "loss": 1.3819, + "step": 20204 + }, + { + "epoch": 0.7235840779271939, + "grad_norm": 1.4250034093856812, + "learning_rate": 3.746464602518941e-05, + "loss": 1.4255, + "step": 20205 + }, + { + "epoch": 0.7236198900567622, + "grad_norm": 1.6631689071655273, + "learning_rate": 3.745559525402696e-05, + "loss": 1.5659, + "step": 20206 + }, + { + "epoch": 0.7236557021863305, + "grad_norm": 1.8999367952346802, + "learning_rate": 3.744654532431876e-05, + "loss": 1.5464, + "step": 20207 + }, + { + "epoch": 0.7236915143158988, + "grad_norm": 1.983514428138733, + "learning_rate": 3.743749623618661e-05, + "loss": 1.4633, + "step": 20208 + }, + { + "epoch": 0.7237273264454671, + "grad_norm": 2.2431626319885254, + "learning_rate": 3.742844798975229e-05, + "loss": 1.4338, + "step": 20209 + }, + { + "epoch": 0.7237631385750354, + "grad_norm": 2.3390116691589355, + "learning_rate": 3.7419400585137444e-05, + "loss": 1.6569, + "step": 20210 + }, + { + "epoch": 0.7237989507046036, + "grad_norm": 1.4298285245895386, + "learning_rate": 3.741035402246385e-05, + "loss": 1.4772, + "step": 20211 + }, + { + "epoch": 0.7238347628341719, + "grad_norm": 1.4829427003860474, + "learning_rate": 3.74013083018532e-05, + "loss": 1.6161, + "step": 20212 + }, + { + "epoch": 0.7238705749637402, + "grad_norm": 1.5598061084747314, + "learning_rate": 3.7392263423427234e-05, + "loss": 1.6156, + "step": 20213 + }, + { + "epoch": 0.7239063870933085, + "grad_norm": 1.6008559465408325, + "learning_rate": 3.738321938730758e-05, + "loss": 1.3155, + "step": 20214 + }, + { + "epoch": 0.7239421992228768, + "grad_norm": 1.7069321870803833, + "learning_rate": 3.737417619361593e-05, + "loss": 1.5145, + "step": 20215 + }, + { + "epoch": 0.7239780113524451, + "grad_norm": 1.5327214002609253, + "learning_rate": 3.7365133842473995e-05, + "loss": 1.4894, + "step": 20216 + }, + { + "epoch": 0.7240138234820134, + "grad_norm": 1.2927590608596802, + "learning_rate": 3.735609233400336e-05, + "loss": 1.4621, + "step": 20217 + }, + { + "epoch": 0.7240496356115816, + "grad_norm": 1.4662754535675049, + "learning_rate": 3.734705166832569e-05, + "loss": 1.1987, + "step": 20218 + }, + { + "epoch": 0.7240854477411499, + "grad_norm": 1.7217975854873657, + "learning_rate": 3.7338011845562624e-05, + "loss": 1.4568, + "step": 20219 + }, + { + "epoch": 0.7241212598707182, + "grad_norm": 1.6077433824539185, + "learning_rate": 3.732897286583582e-05, + "loss": 1.4954, + "step": 20220 + }, + { + "epoch": 0.7241570720002865, + "grad_norm": 1.8069626092910767, + "learning_rate": 3.7319934729266814e-05, + "loss": 1.6723, + "step": 20221 + }, + { + "epoch": 0.7241928841298548, + "grad_norm": 1.7534021139144897, + "learning_rate": 3.731089743597723e-05, + "loss": 1.7045, + "step": 20222 + }, + { + "epoch": 0.7242286962594231, + "grad_norm": 1.8184095621109009, + "learning_rate": 3.7301860986088666e-05, + "loss": 1.724, + "step": 20223 + }, + { + "epoch": 0.7242645083889914, + "grad_norm": 2.0938992500305176, + "learning_rate": 3.729282537972272e-05, + "loss": 1.6204, + "step": 20224 + }, + { + "epoch": 0.7243003205185596, + "grad_norm": 1.575728178024292, + "learning_rate": 3.728379061700091e-05, + "loss": 1.3117, + "step": 20225 + }, + { + "epoch": 0.7243361326481279, + "grad_norm": 1.9121419191360474, + "learning_rate": 3.727475669804474e-05, + "loss": 1.3417, + "step": 20226 + }, + { + "epoch": 0.7243719447776962, + "grad_norm": 2.2062628269195557, + "learning_rate": 3.726572362297588e-05, + "loss": 1.7373, + "step": 20227 + }, + { + "epoch": 0.7244077569072644, + "grad_norm": 1.6102908849716187, + "learning_rate": 3.725669139191574e-05, + "loss": 1.3962, + "step": 20228 + }, + { + "epoch": 0.7244435690368328, + "grad_norm": 1.7527639865875244, + "learning_rate": 3.7247660004985897e-05, + "loss": 1.5474, + "step": 20229 + }, + { + "epoch": 0.7244793811664011, + "grad_norm": 2.023170232772827, + "learning_rate": 3.723862946230784e-05, + "loss": 1.3935, + "step": 20230 + }, + { + "epoch": 0.7245151932959694, + "grad_norm": 1.7724480628967285, + "learning_rate": 3.7229599764003096e-05, + "loss": 1.3569, + "step": 20231 + }, + { + "epoch": 0.7245510054255376, + "grad_norm": 1.76382315158844, + "learning_rate": 3.7220570910193096e-05, + "loss": 1.5279, + "step": 20232 + }, + { + "epoch": 0.7245868175551059, + "grad_norm": 1.4322667121887207, + "learning_rate": 3.721154290099933e-05, + "loss": 1.259, + "step": 20233 + }, + { + "epoch": 0.7246226296846742, + "grad_norm": 1.2567883729934692, + "learning_rate": 3.7202515736543296e-05, + "loss": 1.3401, + "step": 20234 + }, + { + "epoch": 0.7246584418142424, + "grad_norm": 1.4475733041763306, + "learning_rate": 3.7193489416946383e-05, + "loss": 1.4377, + "step": 20235 + }, + { + "epoch": 0.7246942539438108, + "grad_norm": 1.2449748516082764, + "learning_rate": 3.718446394233007e-05, + "loss": 1.5922, + "step": 20236 + }, + { + "epoch": 0.7247300660733791, + "grad_norm": 1.514506220817566, + "learning_rate": 3.717543931281572e-05, + "loss": 1.3933, + "step": 20237 + }, + { + "epoch": 0.7247658782029474, + "grad_norm": 1.5675554275512695, + "learning_rate": 3.7166415528524854e-05, + "loss": 1.0401, + "step": 20238 + }, + { + "epoch": 0.7248016903325156, + "grad_norm": 1.6422276496887207, + "learning_rate": 3.715739258957879e-05, + "loss": 1.411, + "step": 20239 + }, + { + "epoch": 0.7248375024620839, + "grad_norm": 1.1734343767166138, + "learning_rate": 3.714837049609898e-05, + "loss": 1.5028, + "step": 20240 + }, + { + "epoch": 0.7248733145916522, + "grad_norm": 1.268044352531433, + "learning_rate": 3.71393492482067e-05, + "loss": 1.3508, + "step": 20241 + }, + { + "epoch": 0.7249091267212204, + "grad_norm": 1.7053765058517456, + "learning_rate": 3.713032884602346e-05, + "loss": 1.4333, + "step": 20242 + }, + { + "epoch": 0.7249449388507888, + "grad_norm": 2.118558406829834, + "learning_rate": 3.712130928967056e-05, + "loss": 1.5188, + "step": 20243 + }, + { + "epoch": 0.7249807509803571, + "grad_norm": 1.3212299346923828, + "learning_rate": 3.711229057926925e-05, + "loss": 1.7121, + "step": 20244 + }, + { + "epoch": 0.7250165631099253, + "grad_norm": 1.4363412857055664, + "learning_rate": 3.710327271494103e-05, + "loss": 1.4603, + "step": 20245 + }, + { + "epoch": 0.7250523752394936, + "grad_norm": 2.0643067359924316, + "learning_rate": 3.709425569680711e-05, + "loss": 1.6963, + "step": 20246 + }, + { + "epoch": 0.7250881873690619, + "grad_norm": 1.556532621383667, + "learning_rate": 3.708523952498887e-05, + "loss": 1.3901, + "step": 20247 + }, + { + "epoch": 0.7251239994986302, + "grad_norm": 1.4042514562606812, + "learning_rate": 3.707622419960751e-05, + "loss": 1.5076, + "step": 20248 + }, + { + "epoch": 0.7251598116281984, + "grad_norm": 2.3042328357696533, + "learning_rate": 3.7067209720784456e-05, + "loss": 1.2477, + "step": 20249 + }, + { + "epoch": 0.7251956237577668, + "grad_norm": 1.4075384140014648, + "learning_rate": 3.705819608864092e-05, + "loss": 1.472, + "step": 20250 + }, + { + "epoch": 0.7252314358873351, + "grad_norm": 1.6488524675369263, + "learning_rate": 3.704918330329813e-05, + "loss": 1.6962, + "step": 20251 + }, + { + "epoch": 0.7252672480169033, + "grad_norm": 1.7311025857925415, + "learning_rate": 3.704017136487737e-05, + "loss": 1.3322, + "step": 20252 + }, + { + "epoch": 0.7253030601464716, + "grad_norm": 1.5187498331069946, + "learning_rate": 3.70311602734999e-05, + "loss": 1.6023, + "step": 20253 + }, + { + "epoch": 0.7253388722760399, + "grad_norm": 1.8480957746505737, + "learning_rate": 3.702215002928699e-05, + "loss": 1.3335, + "step": 20254 + }, + { + "epoch": 0.7253746844056081, + "grad_norm": 2.199833393096924, + "learning_rate": 3.701314063235972e-05, + "loss": 1.33, + "step": 20255 + }, + { + "epoch": 0.7254104965351764, + "grad_norm": 1.5730723142623901, + "learning_rate": 3.7004132082839485e-05, + "loss": 1.0013, + "step": 20256 + }, + { + "epoch": 0.7254463086647448, + "grad_norm": 1.6218773126602173, + "learning_rate": 3.699512438084736e-05, + "loss": 1.2575, + "step": 20257 + }, + { + "epoch": 0.7254821207943131, + "grad_norm": 2.079622507095337, + "learning_rate": 3.6986117526504595e-05, + "loss": 1.5331, + "step": 20258 + }, + { + "epoch": 0.7255179329238813, + "grad_norm": 1.8061600923538208, + "learning_rate": 3.6977111519932295e-05, + "loss": 1.3609, + "step": 20259 + }, + { + "epoch": 0.7255537450534496, + "grad_norm": 2.0372745990753174, + "learning_rate": 3.696810636125168e-05, + "loss": 1.6058, + "step": 20260 + }, + { + "epoch": 0.7255895571830179, + "grad_norm": 1.9786975383758545, + "learning_rate": 3.69591020505839e-05, + "loss": 1.6915, + "step": 20261 + }, + { + "epoch": 0.7256253693125861, + "grad_norm": 1.4099403619766235, + "learning_rate": 3.6950098588050074e-05, + "loss": 1.2547, + "step": 20262 + }, + { + "epoch": 0.7256611814421544, + "grad_norm": 1.5284037590026855, + "learning_rate": 3.6941095973771334e-05, + "loss": 1.3404, + "step": 20263 + }, + { + "epoch": 0.7256969935717228, + "grad_norm": 1.6371190547943115, + "learning_rate": 3.6932094207868806e-05, + "loss": 1.5385, + "step": 20264 + }, + { + "epoch": 0.7257328057012911, + "grad_norm": 2.2500505447387695, + "learning_rate": 3.692309329046364e-05, + "loss": 1.6577, + "step": 20265 + }, + { + "epoch": 0.7257686178308593, + "grad_norm": 1.5898199081420898, + "learning_rate": 3.691409322167685e-05, + "loss": 1.5561, + "step": 20266 + }, + { + "epoch": 0.7258044299604276, + "grad_norm": 1.3862446546554565, + "learning_rate": 3.690509400162957e-05, + "loss": 1.1933, + "step": 20267 + }, + { + "epoch": 0.7258402420899959, + "grad_norm": 2.0691616535186768, + "learning_rate": 3.689609563044288e-05, + "loss": 1.6209, + "step": 20268 + }, + { + "epoch": 0.7258760542195641, + "grad_norm": 1.7785332202911377, + "learning_rate": 3.68870981082378e-05, + "loss": 1.4089, + "step": 20269 + }, + { + "epoch": 0.7259118663491324, + "grad_norm": 1.5039234161376953, + "learning_rate": 3.687810143513541e-05, + "loss": 1.1616, + "step": 20270 + }, + { + "epoch": 0.7259476784787008, + "grad_norm": 1.4419066905975342, + "learning_rate": 3.686910561125675e-05, + "loss": 1.4868, + "step": 20271 + }, + { + "epoch": 0.725983490608269, + "grad_norm": 1.712319254875183, + "learning_rate": 3.6860110636722856e-05, + "loss": 1.5521, + "step": 20272 + }, + { + "epoch": 0.7260193027378373, + "grad_norm": 1.749627709388733, + "learning_rate": 3.6851116511654705e-05, + "loss": 1.3738, + "step": 20273 + }, + { + "epoch": 0.7260551148674056, + "grad_norm": 1.3292651176452637, + "learning_rate": 3.684212323617333e-05, + "loss": 1.4005, + "step": 20274 + }, + { + "epoch": 0.7260909269969739, + "grad_norm": 1.6305632591247559, + "learning_rate": 3.683313081039971e-05, + "loss": 1.5385, + "step": 20275 + }, + { + "epoch": 0.7261267391265421, + "grad_norm": 1.43117094039917, + "learning_rate": 3.6824139234454876e-05, + "loss": 1.5204, + "step": 20276 + }, + { + "epoch": 0.7261625512561104, + "grad_norm": 1.7682982683181763, + "learning_rate": 3.681514850845972e-05, + "loss": 1.5069, + "step": 20277 + }, + { + "epoch": 0.7261983633856788, + "grad_norm": 1.9330202341079712, + "learning_rate": 3.6806158632535235e-05, + "loss": 1.547, + "step": 20278 + }, + { + "epoch": 0.726234175515247, + "grad_norm": 1.5854822397232056, + "learning_rate": 3.679716960680242e-05, + "loss": 1.6898, + "step": 20279 + }, + { + "epoch": 0.7262699876448153, + "grad_norm": 1.5087810754776, + "learning_rate": 3.6788181431382106e-05, + "loss": 1.6337, + "step": 20280 + }, + { + "epoch": 0.7263057997743836, + "grad_norm": 1.8749876022338867, + "learning_rate": 3.6779194106395285e-05, + "loss": 1.6861, + "step": 20281 + }, + { + "epoch": 0.7263416119039519, + "grad_norm": 1.8120014667510986, + "learning_rate": 3.677020763196286e-05, + "loss": 1.673, + "step": 20282 + }, + { + "epoch": 0.7263774240335201, + "grad_norm": 3.601649522781372, + "learning_rate": 3.676122200820577e-05, + "loss": 2.1907, + "step": 20283 + }, + { + "epoch": 0.7264132361630884, + "grad_norm": 1.9699167013168335, + "learning_rate": 3.6752237235244825e-05, + "loss": 1.4932, + "step": 20284 + }, + { + "epoch": 0.7264490482926568, + "grad_norm": 1.2282700538635254, + "learning_rate": 3.6743253313200945e-05, + "loss": 1.2088, + "step": 20285 + }, + { + "epoch": 0.726484860422225, + "grad_norm": 1.4451675415039062, + "learning_rate": 3.673427024219502e-05, + "loss": 1.669, + "step": 20286 + }, + { + "epoch": 0.7265206725517933, + "grad_norm": 1.7398184537887573, + "learning_rate": 3.672528802234786e-05, + "loss": 1.1963, + "step": 20287 + }, + { + "epoch": 0.7265564846813616, + "grad_norm": 1.5839215517044067, + "learning_rate": 3.671630665378033e-05, + "loss": 1.6271, + "step": 20288 + }, + { + "epoch": 0.7265922968109298, + "grad_norm": 1.601536512374878, + "learning_rate": 3.670732613661326e-05, + "loss": 1.5271, + "step": 20289 + }, + { + "epoch": 0.7266281089404981, + "grad_norm": 1.5038881301879883, + "learning_rate": 3.669834647096752e-05, + "loss": 1.4387, + "step": 20290 + }, + { + "epoch": 0.7266639210700664, + "grad_norm": 3.174227476119995, + "learning_rate": 3.668936765696383e-05, + "loss": 1.489, + "step": 20291 + }, + { + "epoch": 0.7266997331996348, + "grad_norm": 1.7805509567260742, + "learning_rate": 3.6680389694723025e-05, + "loss": 1.6459, + "step": 20292 + }, + { + "epoch": 0.726735545329203, + "grad_norm": 2.242903232574463, + "learning_rate": 3.667141258436592e-05, + "loss": 1.7589, + "step": 20293 + }, + { + "epoch": 0.7267713574587713, + "grad_norm": 1.5864604711532593, + "learning_rate": 3.666243632601329e-05, + "loss": 1.6975, + "step": 20294 + }, + { + "epoch": 0.7268071695883396, + "grad_norm": 1.4965282678604126, + "learning_rate": 3.6653460919785855e-05, + "loss": 1.4238, + "step": 20295 + }, + { + "epoch": 0.7268429817179078, + "grad_norm": 1.7679928541183472, + "learning_rate": 3.6644486365804385e-05, + "loss": 1.2437, + "step": 20296 + }, + { + "epoch": 0.7268787938474761, + "grad_norm": 1.58811616897583, + "learning_rate": 3.663551266418966e-05, + "loss": 1.5576, + "step": 20297 + }, + { + "epoch": 0.7269146059770444, + "grad_norm": 1.8119292259216309, + "learning_rate": 3.662653981506235e-05, + "loss": 1.4472, + "step": 20298 + }, + { + "epoch": 0.7269504181066128, + "grad_norm": 2.19586443901062, + "learning_rate": 3.661756781854321e-05, + "loss": 1.371, + "step": 20299 + }, + { + "epoch": 0.726986230236181, + "grad_norm": 1.741518259048462, + "learning_rate": 3.660859667475293e-05, + "loss": 1.3297, + "step": 20300 + }, + { + "epoch": 0.7270220423657493, + "grad_norm": 3.0605978965759277, + "learning_rate": 3.659962638381224e-05, + "loss": 1.763, + "step": 20301 + }, + { + "epoch": 0.7270578544953176, + "grad_norm": 1.5931988954544067, + "learning_rate": 3.6590656945841775e-05, + "loss": 1.4453, + "step": 20302 + }, + { + "epoch": 0.7270936666248858, + "grad_norm": 1.3028597831726074, + "learning_rate": 3.6581688360962206e-05, + "loss": 1.4289, + "step": 20303 + }, + { + "epoch": 0.7271294787544541, + "grad_norm": 1.5821110010147095, + "learning_rate": 3.6572720629294276e-05, + "loss": 1.7085, + "step": 20304 + }, + { + "epoch": 0.7271652908840224, + "grad_norm": 1.6137861013412476, + "learning_rate": 3.656375375095853e-05, + "loss": 1.2497, + "step": 20305 + }, + { + "epoch": 0.7272011030135908, + "grad_norm": 1.879036545753479, + "learning_rate": 3.655478772607565e-05, + "loss": 1.6787, + "step": 20306 + }, + { + "epoch": 0.727236915143159, + "grad_norm": 1.7845239639282227, + "learning_rate": 3.654582255476626e-05, + "loss": 1.6337, + "step": 20307 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.2749297618865967, + "learning_rate": 3.6536858237151015e-05, + "loss": 1.226, + "step": 20308 + }, + { + "epoch": 0.7273085394022956, + "grad_norm": 1.7021535634994507, + "learning_rate": 3.652789477335045e-05, + "loss": 1.4401, + "step": 20309 + }, + { + "epoch": 0.7273443515318638, + "grad_norm": 2.237609624862671, + "learning_rate": 3.651893216348517e-05, + "loss": 1.3534, + "step": 20310 + }, + { + "epoch": 0.7273801636614321, + "grad_norm": 1.8463900089263916, + "learning_rate": 3.65099704076758e-05, + "loss": 1.2325, + "step": 20311 + }, + { + "epoch": 0.7274159757910004, + "grad_norm": 1.7602739334106445, + "learning_rate": 3.650100950604289e-05, + "loss": 1.4707, + "step": 20312 + }, + { + "epoch": 0.7274517879205687, + "grad_norm": 1.4012271165847778, + "learning_rate": 3.649204945870701e-05, + "loss": 1.4284, + "step": 20313 + }, + { + "epoch": 0.727487600050137, + "grad_norm": 1.6311163902282715, + "learning_rate": 3.6483090265788614e-05, + "loss": 1.2821, + "step": 20314 + }, + { + "epoch": 0.7275234121797053, + "grad_norm": 1.6619857549667358, + "learning_rate": 3.647413192740836e-05, + "loss": 1.284, + "step": 20315 + }, + { + "epoch": 0.7275592243092736, + "grad_norm": 1.6856287717819214, + "learning_rate": 3.64651744436867e-05, + "loss": 1.4404, + "step": 20316 + }, + { + "epoch": 0.7275950364388418, + "grad_norm": 2.37723708152771, + "learning_rate": 3.6456217814744165e-05, + "loss": 1.6775, + "step": 20317 + }, + { + "epoch": 0.7276308485684101, + "grad_norm": 1.4512906074523926, + "learning_rate": 3.644726204070125e-05, + "loss": 1.6137, + "step": 20318 + }, + { + "epoch": 0.7276666606979784, + "grad_norm": 1.3987200260162354, + "learning_rate": 3.643830712167847e-05, + "loss": 1.5412, + "step": 20319 + }, + { + "epoch": 0.7277024728275467, + "grad_norm": 1.5745872259140015, + "learning_rate": 3.6429353057796255e-05, + "loss": 1.3009, + "step": 20320 + }, + { + "epoch": 0.727738284957115, + "grad_norm": 2.594204902648926, + "learning_rate": 3.642039984917509e-05, + "loss": 1.5018, + "step": 20321 + }, + { + "epoch": 0.7277740970866833, + "grad_norm": 1.6258982419967651, + "learning_rate": 3.641144749593548e-05, + "loss": 1.3628, + "step": 20322 + }, + { + "epoch": 0.7278099092162515, + "grad_norm": 1.546212911605835, + "learning_rate": 3.640249599819777e-05, + "loss": 1.53, + "step": 20323 + }, + { + "epoch": 0.7278457213458198, + "grad_norm": 1.531711220741272, + "learning_rate": 3.639354535608248e-05, + "loss": 1.4359, + "step": 20324 + }, + { + "epoch": 0.7278815334753881, + "grad_norm": 1.293178677558899, + "learning_rate": 3.638459556970993e-05, + "loss": 1.3002, + "step": 20325 + }, + { + "epoch": 0.7279173456049564, + "grad_norm": 1.8886504173278809, + "learning_rate": 3.637564663920066e-05, + "loss": 1.5775, + "step": 20326 + }, + { + "epoch": 0.7279531577345247, + "grad_norm": 1.3030343055725098, + "learning_rate": 3.636669856467495e-05, + "loss": 1.443, + "step": 20327 + }, + { + "epoch": 0.727988969864093, + "grad_norm": 1.8650761842727661, + "learning_rate": 3.635775134625323e-05, + "loss": 1.484, + "step": 20328 + }, + { + "epoch": 0.7280247819936613, + "grad_norm": 1.1256822347640991, + "learning_rate": 3.634880498405587e-05, + "loss": 1.3617, + "step": 20329 + }, + { + "epoch": 0.7280605941232295, + "grad_norm": 1.5693796873092651, + "learning_rate": 3.6339859478203274e-05, + "loss": 1.2764, + "step": 20330 + }, + { + "epoch": 0.7280964062527978, + "grad_norm": 2.721557378768921, + "learning_rate": 3.6330914828815755e-05, + "loss": 1.6122, + "step": 20331 + }, + { + "epoch": 0.7281322183823661, + "grad_norm": 1.8987950086593628, + "learning_rate": 3.632197103601358e-05, + "loss": 1.1593, + "step": 20332 + }, + { + "epoch": 0.7281680305119343, + "grad_norm": 1.3564764261245728, + "learning_rate": 3.6313028099917226e-05, + "loss": 1.4216, + "step": 20333 + }, + { + "epoch": 0.7282038426415027, + "grad_norm": 1.3423123359680176, + "learning_rate": 3.6304086020646874e-05, + "loss": 1.2289, + "step": 20334 + }, + { + "epoch": 0.728239654771071, + "grad_norm": 2.045650005340576, + "learning_rate": 3.629514479832292e-05, + "loss": 1.5775, + "step": 20335 + }, + { + "epoch": 0.7282754669006393, + "grad_norm": 1.7621314525604248, + "learning_rate": 3.628620443306556e-05, + "loss": 1.4242, + "step": 20336 + }, + { + "epoch": 0.7283112790302075, + "grad_norm": 1.5578246116638184, + "learning_rate": 3.62772649249952e-05, + "loss": 1.4387, + "step": 20337 + }, + { + "epoch": 0.7283470911597758, + "grad_norm": 1.3517426252365112, + "learning_rate": 3.626832627423201e-05, + "loss": 1.7368, + "step": 20338 + }, + { + "epoch": 0.7283829032893441, + "grad_norm": 1.414923906326294, + "learning_rate": 3.6259388480896316e-05, + "loss": 1.4573, + "step": 20339 + }, + { + "epoch": 0.7284187154189123, + "grad_norm": 1.8400875329971313, + "learning_rate": 3.62504515451083e-05, + "loss": 1.3403, + "step": 20340 + }, + { + "epoch": 0.7284545275484807, + "grad_norm": 1.4856648445129395, + "learning_rate": 3.624151546698822e-05, + "loss": 1.5307, + "step": 20341 + }, + { + "epoch": 0.728490339678049, + "grad_norm": 1.957085371017456, + "learning_rate": 3.623258024665635e-05, + "loss": 1.3083, + "step": 20342 + }, + { + "epoch": 0.7285261518076173, + "grad_norm": 1.5131561756134033, + "learning_rate": 3.6223645884232784e-05, + "loss": 1.4215, + "step": 20343 + }, + { + "epoch": 0.7285619639371855, + "grad_norm": 1.8135859966278076, + "learning_rate": 3.621471237983787e-05, + "loss": 1.5306, + "step": 20344 + }, + { + "epoch": 0.7285977760667538, + "grad_norm": 2.384350538253784, + "learning_rate": 3.620577973359168e-05, + "loss": 1.492, + "step": 20345 + }, + { + "epoch": 0.7286335881963221, + "grad_norm": 1.0702521800994873, + "learning_rate": 3.619684794561448e-05, + "loss": 0.9597, + "step": 20346 + }, + { + "epoch": 0.7286694003258903, + "grad_norm": 1.8656784296035767, + "learning_rate": 3.618791701602635e-05, + "loss": 1.4725, + "step": 20347 + }, + { + "epoch": 0.7287052124554587, + "grad_norm": 1.8539940118789673, + "learning_rate": 3.617898694494749e-05, + "loss": 1.6358, + "step": 20348 + }, + { + "epoch": 0.728741024585027, + "grad_norm": 1.284334659576416, + "learning_rate": 3.6170057732498064e-05, + "loss": 1.6315, + "step": 20349 + }, + { + "epoch": 0.7287768367145953, + "grad_norm": 1.698467493057251, + "learning_rate": 3.616112937879814e-05, + "loss": 1.4993, + "step": 20350 + }, + { + "epoch": 0.7288126488441635, + "grad_norm": 1.6168347597122192, + "learning_rate": 3.6152201883967885e-05, + "loss": 1.56, + "step": 20351 + }, + { + "epoch": 0.7288484609737318, + "grad_norm": 1.26214599609375, + "learning_rate": 3.6143275248127394e-05, + "loss": 1.4896, + "step": 20352 + }, + { + "epoch": 0.7288842731033001, + "grad_norm": 1.7499330043792725, + "learning_rate": 3.61343494713968e-05, + "loss": 1.2003, + "step": 20353 + }, + { + "epoch": 0.7289200852328683, + "grad_norm": 2.1772499084472656, + "learning_rate": 3.612542455389608e-05, + "loss": 1.5853, + "step": 20354 + }, + { + "epoch": 0.7289558973624367, + "grad_norm": 1.5749849081039429, + "learning_rate": 3.611650049574545e-05, + "loss": 1.5532, + "step": 20355 + }, + { + "epoch": 0.728991709492005, + "grad_norm": 1.753783106803894, + "learning_rate": 3.61075772970649e-05, + "loss": 1.4504, + "step": 20356 + }, + { + "epoch": 0.7290275216215732, + "grad_norm": 2.087217330932617, + "learning_rate": 3.609865495797445e-05, + "loss": 1.6384, + "step": 20357 + }, + { + "epoch": 0.7290633337511415, + "grad_norm": 1.6963598728179932, + "learning_rate": 3.608973347859418e-05, + "loss": 1.388, + "step": 20358 + }, + { + "epoch": 0.7290991458807098, + "grad_norm": 1.416736364364624, + "learning_rate": 3.6080812859044086e-05, + "loss": 1.4647, + "step": 20359 + }, + { + "epoch": 0.7291349580102781, + "grad_norm": 2.7827279567718506, + "learning_rate": 3.607189309944427e-05, + "loss": 1.3479, + "step": 20360 + }, + { + "epoch": 0.7291707701398463, + "grad_norm": 1.966021180152893, + "learning_rate": 3.6062974199914615e-05, + "loss": 1.5521, + "step": 20361 + }, + { + "epoch": 0.7292065822694147, + "grad_norm": 1.5805641412734985, + "learning_rate": 3.6054056160575164e-05, + "loss": 1.3172, + "step": 20362 + }, + { + "epoch": 0.729242394398983, + "grad_norm": 1.9374136924743652, + "learning_rate": 3.6045138981545915e-05, + "loss": 1.4399, + "step": 20363 + }, + { + "epoch": 0.7292782065285512, + "grad_norm": 2.0594398975372314, + "learning_rate": 3.603622266294686e-05, + "loss": 1.1237, + "step": 20364 + }, + { + "epoch": 0.7293140186581195, + "grad_norm": 1.5763599872589111, + "learning_rate": 3.6027307204897886e-05, + "loss": 1.3851, + "step": 20365 + }, + { + "epoch": 0.7293498307876878, + "grad_norm": 1.818981409072876, + "learning_rate": 3.601839260751897e-05, + "loss": 1.3178, + "step": 20366 + }, + { + "epoch": 0.729385642917256, + "grad_norm": 1.4879289865493774, + "learning_rate": 3.600947887093009e-05, + "loss": 1.3811, + "step": 20367 + }, + { + "epoch": 0.7294214550468243, + "grad_norm": 1.7002722024917603, + "learning_rate": 3.600056599525109e-05, + "loss": 1.3244, + "step": 20368 + }, + { + "epoch": 0.7294572671763927, + "grad_norm": 1.7715823650360107, + "learning_rate": 3.5991653980601926e-05, + "loss": 1.2399, + "step": 20369 + }, + { + "epoch": 0.729493079305961, + "grad_norm": 2.60311222076416, + "learning_rate": 3.59827428271025e-05, + "loss": 1.6152, + "step": 20370 + }, + { + "epoch": 0.7295288914355292, + "grad_norm": 1.446850061416626, + "learning_rate": 3.597383253487272e-05, + "loss": 1.3049, + "step": 20371 + }, + { + "epoch": 0.7295647035650975, + "grad_norm": 1.3441686630249023, + "learning_rate": 3.59649231040324e-05, + "loss": 1.4572, + "step": 20372 + }, + { + "epoch": 0.7296005156946658, + "grad_norm": 1.666536569595337, + "learning_rate": 3.595601453470143e-05, + "loss": 1.5172, + "step": 20373 + }, + { + "epoch": 0.729636327824234, + "grad_norm": 1.4410146474838257, + "learning_rate": 3.594710682699972e-05, + "loss": 1.6782, + "step": 20374 + }, + { + "epoch": 0.7296721399538023, + "grad_norm": 1.8410130739212036, + "learning_rate": 3.5938199981047036e-05, + "loss": 1.5535, + "step": 20375 + }, + { + "epoch": 0.7297079520833707, + "grad_norm": 1.9997683763504028, + "learning_rate": 3.592929399696323e-05, + "loss": 1.3053, + "step": 20376 + }, + { + "epoch": 0.729743764212939, + "grad_norm": 1.5407726764678955, + "learning_rate": 3.592038887486813e-05, + "loss": 1.4949, + "step": 20377 + }, + { + "epoch": 0.7297795763425072, + "grad_norm": 2.03817081451416, + "learning_rate": 3.591148461488157e-05, + "loss": 1.5091, + "step": 20378 + }, + { + "epoch": 0.7298153884720755, + "grad_norm": 1.7712717056274414, + "learning_rate": 3.590258121712329e-05, + "loss": 1.8162, + "step": 20379 + }, + { + "epoch": 0.7298512006016438, + "grad_norm": 1.590785264968872, + "learning_rate": 3.589367868171309e-05, + "loss": 1.2453, + "step": 20380 + }, + { + "epoch": 0.729887012731212, + "grad_norm": 1.64919114112854, + "learning_rate": 3.5884777008770765e-05, + "loss": 1.5673, + "step": 20381 + }, + { + "epoch": 0.7299228248607803, + "grad_norm": 1.4542080163955688, + "learning_rate": 3.587587619841609e-05, + "loss": 0.9534, + "step": 20382 + }, + { + "epoch": 0.7299586369903487, + "grad_norm": 1.3254774808883667, + "learning_rate": 3.586697625076876e-05, + "loss": 1.4704, + "step": 20383 + }, + { + "epoch": 0.729994449119917, + "grad_norm": 1.9751310348510742, + "learning_rate": 3.585807716594853e-05, + "loss": 1.4692, + "step": 20384 + }, + { + "epoch": 0.7300302612494852, + "grad_norm": 1.78018057346344, + "learning_rate": 3.584917894407517e-05, + "loss": 1.4879, + "step": 20385 + }, + { + "epoch": 0.7300660733790535, + "grad_norm": 1.4248936176300049, + "learning_rate": 3.584028158526832e-05, + "loss": 1.4147, + "step": 20386 + }, + { + "epoch": 0.7301018855086218, + "grad_norm": 2.423626184463501, + "learning_rate": 3.583138508964773e-05, + "loss": 1.6582, + "step": 20387 + }, + { + "epoch": 0.73013769763819, + "grad_norm": 1.622358798980713, + "learning_rate": 3.582248945733307e-05, + "loss": 1.6244, + "step": 20388 + }, + { + "epoch": 0.7301735097677583, + "grad_norm": 2.3228282928466797, + "learning_rate": 3.581359468844408e-05, + "loss": 1.8354, + "step": 20389 + }, + { + "epoch": 0.7302093218973267, + "grad_norm": 1.6981981992721558, + "learning_rate": 3.580470078310034e-05, + "loss": 1.3045, + "step": 20390 + }, + { + "epoch": 0.730245134026895, + "grad_norm": 2.001455068588257, + "learning_rate": 3.579580774142155e-05, + "loss": 1.2112, + "step": 20391 + }, + { + "epoch": 0.7302809461564632, + "grad_norm": 2.1798324584960938, + "learning_rate": 3.5786915563527376e-05, + "loss": 1.155, + "step": 20392 + }, + { + "epoch": 0.7303167582860315, + "grad_norm": 1.86077880859375, + "learning_rate": 3.577802424953739e-05, + "loss": 1.7546, + "step": 20393 + }, + { + "epoch": 0.7303525704155998, + "grad_norm": 2.3468453884124756, + "learning_rate": 3.576913379957125e-05, + "loss": 1.22, + "step": 20394 + }, + { + "epoch": 0.730388382545168, + "grad_norm": 1.3611379861831665, + "learning_rate": 3.5760244213748565e-05, + "loss": 1.3976, + "step": 20395 + }, + { + "epoch": 0.7304241946747363, + "grad_norm": 1.5339411497116089, + "learning_rate": 3.575135549218895e-05, + "loss": 1.5949, + "step": 20396 + }, + { + "epoch": 0.7304600068043047, + "grad_norm": 2.1556475162506104, + "learning_rate": 3.5742467635011956e-05, + "loss": 1.4444, + "step": 20397 + }, + { + "epoch": 0.7304958189338729, + "grad_norm": 1.9611434936523438, + "learning_rate": 3.5733580642337174e-05, + "loss": 1.4917, + "step": 20398 + }, + { + "epoch": 0.7305316310634412, + "grad_norm": 1.5778285264968872, + "learning_rate": 3.572469451428415e-05, + "loss": 1.274, + "step": 20399 + }, + { + "epoch": 0.7305674431930095, + "grad_norm": 1.7948662042617798, + "learning_rate": 3.57158092509725e-05, + "loss": 1.245, + "step": 20400 + }, + { + "epoch": 0.7306032553225777, + "grad_norm": 1.58558988571167, + "learning_rate": 3.5706924852521674e-05, + "loss": 1.4525, + "step": 20401 + }, + { + "epoch": 0.730639067452146, + "grad_norm": 2.173316478729248, + "learning_rate": 3.5698041319051245e-05, + "loss": 1.7764, + "step": 20402 + }, + { + "epoch": 0.7306748795817143, + "grad_norm": 1.275415062904358, + "learning_rate": 3.5689158650680765e-05, + "loss": 1.5483, + "step": 20403 + }, + { + "epoch": 0.7307106917112826, + "grad_norm": 1.8517347574234009, + "learning_rate": 3.568027684752966e-05, + "loss": 1.6182, + "step": 20404 + }, + { + "epoch": 0.7307465038408509, + "grad_norm": 1.9856765270233154, + "learning_rate": 3.5671395909717477e-05, + "loss": 1.3489, + "step": 20405 + }, + { + "epoch": 0.7307823159704192, + "grad_norm": 1.7788243293762207, + "learning_rate": 3.566251583736367e-05, + "loss": 1.5068, + "step": 20406 + }, + { + "epoch": 0.7308181280999875, + "grad_norm": 1.406541109085083, + "learning_rate": 3.5653636630587764e-05, + "loss": 1.3148, + "step": 20407 + }, + { + "epoch": 0.7308539402295557, + "grad_norm": 1.4304836988449097, + "learning_rate": 3.5644758289509126e-05, + "loss": 1.1475, + "step": 20408 + }, + { + "epoch": 0.730889752359124, + "grad_norm": 1.7873742580413818, + "learning_rate": 3.563588081424727e-05, + "loss": 1.363, + "step": 20409 + }, + { + "epoch": 0.7309255644886923, + "grad_norm": 1.6076723337173462, + "learning_rate": 3.5627004204921645e-05, + "loss": 1.7126, + "step": 20410 + }, + { + "epoch": 0.7309613766182605, + "grad_norm": 1.798566460609436, + "learning_rate": 3.561812846165161e-05, + "loss": 1.4759, + "step": 20411 + }, + { + "epoch": 0.7309971887478289, + "grad_norm": 2.0032665729522705, + "learning_rate": 3.56092535845566e-05, + "loss": 1.3545, + "step": 20412 + }, + { + "epoch": 0.7310330008773972, + "grad_norm": 1.4603948593139648, + "learning_rate": 3.560037957375604e-05, + "loss": 1.3149, + "step": 20413 + }, + { + "epoch": 0.7310688130069655, + "grad_norm": 1.8404431343078613, + "learning_rate": 3.5591506429369325e-05, + "loss": 1.7072, + "step": 20414 + }, + { + "epoch": 0.7311046251365337, + "grad_norm": 1.4005978107452393, + "learning_rate": 3.558263415151578e-05, + "loss": 1.4017, + "step": 20415 + }, + { + "epoch": 0.731140437266102, + "grad_norm": 1.3122055530548096, + "learning_rate": 3.557376274031481e-05, + "loss": 1.3193, + "step": 20416 + }, + { + "epoch": 0.7311762493956703, + "grad_norm": 1.732801914215088, + "learning_rate": 3.556489219588575e-05, + "loss": 1.8159, + "step": 20417 + }, + { + "epoch": 0.7312120615252385, + "grad_norm": 1.7822617292404175, + "learning_rate": 3.5556022518347975e-05, + "loss": 1.529, + "step": 20418 + }, + { + "epoch": 0.7312478736548069, + "grad_norm": 1.499024510383606, + "learning_rate": 3.55471537078208e-05, + "loss": 1.2738, + "step": 20419 + }, + { + "epoch": 0.7312836857843752, + "grad_norm": 3.1258182525634766, + "learning_rate": 3.553828576442346e-05, + "loss": 2.0017, + "step": 20420 + }, + { + "epoch": 0.7313194979139435, + "grad_norm": 1.561532974243164, + "learning_rate": 3.552941868827542e-05, + "loss": 1.5181, + "step": 20421 + }, + { + "epoch": 0.7313553100435117, + "grad_norm": 1.2803188562393188, + "learning_rate": 3.552055247949584e-05, + "loss": 1.5402, + "step": 20422 + }, + { + "epoch": 0.73139112217308, + "grad_norm": 2.332247734069824, + "learning_rate": 3.5511687138204097e-05, + "loss": 1.2652, + "step": 20423 + }, + { + "epoch": 0.7314269343026483, + "grad_norm": 1.3746566772460938, + "learning_rate": 3.5502822664519345e-05, + "loss": 1.5465, + "step": 20424 + }, + { + "epoch": 0.7314627464322165, + "grad_norm": 1.6845834255218506, + "learning_rate": 3.549395905856099e-05, + "loss": 1.3739, + "step": 20425 + }, + { + "epoch": 0.7314985585617849, + "grad_norm": 1.6807371377944946, + "learning_rate": 3.5485096320448176e-05, + "loss": 1.6331, + "step": 20426 + }, + { + "epoch": 0.7315343706913532, + "grad_norm": 1.6530333757400513, + "learning_rate": 3.547623445030016e-05, + "loss": 1.2784, + "step": 20427 + }, + { + "epoch": 0.7315701828209215, + "grad_norm": 1.698762059211731, + "learning_rate": 3.546737344823623e-05, + "loss": 1.4305, + "step": 20428 + }, + { + "epoch": 0.7316059949504897, + "grad_norm": 2.0598812103271484, + "learning_rate": 3.545851331437551e-05, + "loss": 1.3792, + "step": 20429 + }, + { + "epoch": 0.731641807080058, + "grad_norm": 1.8785374164581299, + "learning_rate": 3.544965404883728e-05, + "loss": 1.1282, + "step": 20430 + }, + { + "epoch": 0.7316776192096263, + "grad_norm": 2.2922866344451904, + "learning_rate": 3.544079565174061e-05, + "loss": 1.4628, + "step": 20431 + }, + { + "epoch": 0.7317134313391945, + "grad_norm": 2.218919277191162, + "learning_rate": 3.543193812320483e-05, + "loss": 1.3955, + "step": 20432 + }, + { + "epoch": 0.7317492434687629, + "grad_norm": 1.631608009338379, + "learning_rate": 3.542308146334901e-05, + "loss": 1.6856, + "step": 20433 + }, + { + "epoch": 0.7317850555983312, + "grad_norm": 1.6857116222381592, + "learning_rate": 3.541422567229235e-05, + "loss": 1.4069, + "step": 20434 + }, + { + "epoch": 0.7318208677278994, + "grad_norm": 1.6167864799499512, + "learning_rate": 3.540537075015393e-05, + "loss": 1.6345, + "step": 20435 + }, + { + "epoch": 0.7318566798574677, + "grad_norm": 1.622429609298706, + "learning_rate": 3.539651669705297e-05, + "loss": 1.3154, + "step": 20436 + }, + { + "epoch": 0.731892491987036, + "grad_norm": 1.8493098020553589, + "learning_rate": 3.538766351310856e-05, + "loss": 1.664, + "step": 20437 + }, + { + "epoch": 0.7319283041166043, + "grad_norm": 2.0410239696502686, + "learning_rate": 3.537881119843972e-05, + "loss": 1.0608, + "step": 20438 + }, + { + "epoch": 0.7319641162461725, + "grad_norm": 1.2448198795318604, + "learning_rate": 3.5369959753165694e-05, + "loss": 1.3688, + "step": 20439 + }, + { + "epoch": 0.7319999283757409, + "grad_norm": 1.456103801727295, + "learning_rate": 3.536110917740545e-05, + "loss": 1.4517, + "step": 20440 + }, + { + "epoch": 0.7320357405053092, + "grad_norm": 1.7088440656661987, + "learning_rate": 3.5352259471278146e-05, + "loss": 1.3145, + "step": 20441 + }, + { + "epoch": 0.7320715526348774, + "grad_norm": 1.8974305391311646, + "learning_rate": 3.534341063490273e-05, + "loss": 1.3612, + "step": 20442 + }, + { + "epoch": 0.7321073647644457, + "grad_norm": 1.5721908807754517, + "learning_rate": 3.533456266839838e-05, + "loss": 1.4555, + "step": 20443 + }, + { + "epoch": 0.732143176894014, + "grad_norm": 1.6463236808776855, + "learning_rate": 3.532571557188409e-05, + "loss": 1.4655, + "step": 20444 + }, + { + "epoch": 0.7321789890235822, + "grad_norm": 2.0902090072631836, + "learning_rate": 3.531686934547884e-05, + "loss": 1.2894, + "step": 20445 + }, + { + "epoch": 0.7322148011531505, + "grad_norm": 1.4955464601516724, + "learning_rate": 3.5308023989301676e-05, + "loss": 1.286, + "step": 20446 + }, + { + "epoch": 0.7322506132827189, + "grad_norm": 2.225278854370117, + "learning_rate": 3.529917950347159e-05, + "loss": 1.6805, + "step": 20447 + }, + { + "epoch": 0.7322864254122872, + "grad_norm": 1.585610270500183, + "learning_rate": 3.529033588810764e-05, + "loss": 1.1171, + "step": 20448 + }, + { + "epoch": 0.7323222375418554, + "grad_norm": 1.6811686754226685, + "learning_rate": 3.52814931433287e-05, + "loss": 1.5252, + "step": 20449 + }, + { + "epoch": 0.7323580496714237, + "grad_norm": 2.0592916011810303, + "learning_rate": 3.52726512692538e-05, + "loss": 1.6063, + "step": 20450 + }, + { + "epoch": 0.732393861800992, + "grad_norm": 1.6226791143417358, + "learning_rate": 3.526381026600188e-05, + "loss": 1.3123, + "step": 20451 + }, + { + "epoch": 0.7324296739305602, + "grad_norm": 1.4268947839736938, + "learning_rate": 3.5254970133691925e-05, + "loss": 1.3222, + "step": 20452 + }, + { + "epoch": 0.7324654860601285, + "grad_norm": 1.6242847442626953, + "learning_rate": 3.5246130872442794e-05, + "loss": 1.3596, + "step": 20453 + }, + { + "epoch": 0.7325012981896969, + "grad_norm": 1.6318475008010864, + "learning_rate": 3.523729248237345e-05, + "loss": 1.2142, + "step": 20454 + }, + { + "epoch": 0.7325371103192652, + "grad_norm": 1.932065486907959, + "learning_rate": 3.522845496360283e-05, + "loss": 1.6373, + "step": 20455 + }, + { + "epoch": 0.7325729224488334, + "grad_norm": 1.579846978187561, + "learning_rate": 3.5219618316249766e-05, + "loss": 1.5628, + "step": 20456 + }, + { + "epoch": 0.7326087345784017, + "grad_norm": 1.3308910131454468, + "learning_rate": 3.521078254043317e-05, + "loss": 1.6539, + "step": 20457 + }, + { + "epoch": 0.73264454670797, + "grad_norm": 1.5070046186447144, + "learning_rate": 3.5201947636271934e-05, + "loss": 1.1408, + "step": 20458 + }, + { + "epoch": 0.7326803588375382, + "grad_norm": 2.3446309566497803, + "learning_rate": 3.519311360388494e-05, + "loss": 1.5686, + "step": 20459 + }, + { + "epoch": 0.7327161709671065, + "grad_norm": 1.3851144313812256, + "learning_rate": 3.518428044339097e-05, + "loss": 1.4275, + "step": 20460 + }, + { + "epoch": 0.7327519830966749, + "grad_norm": 1.6674261093139648, + "learning_rate": 3.5175448154908895e-05, + "loss": 1.459, + "step": 20461 + }, + { + "epoch": 0.7327877952262432, + "grad_norm": 1.6122231483459473, + "learning_rate": 3.516661673855759e-05, + "loss": 1.3823, + "step": 20462 + }, + { + "epoch": 0.7328236073558114, + "grad_norm": 1.8272632360458374, + "learning_rate": 3.51577861944558e-05, + "loss": 1.5136, + "step": 20463 + }, + { + "epoch": 0.7328594194853797, + "grad_norm": 2.42698335647583, + "learning_rate": 3.5148956522722346e-05, + "loss": 1.2525, + "step": 20464 + }, + { + "epoch": 0.732895231614948, + "grad_norm": 1.7631263732910156, + "learning_rate": 3.5140127723476034e-05, + "loss": 1.351, + "step": 20465 + }, + { + "epoch": 0.7329310437445162, + "grad_norm": 1.357838749885559, + "learning_rate": 3.513129979683567e-05, + "loss": 1.1883, + "step": 20466 + }, + { + "epoch": 0.7329668558740845, + "grad_norm": 1.483927845954895, + "learning_rate": 3.5122472742919965e-05, + "loss": 1.3296, + "step": 20467 + }, + { + "epoch": 0.7330026680036529, + "grad_norm": 2.0464913845062256, + "learning_rate": 3.51136465618477e-05, + "loss": 1.4515, + "step": 20468 + }, + { + "epoch": 0.7330384801332211, + "grad_norm": 1.5297338962554932, + "learning_rate": 3.510482125373762e-05, + "loss": 1.1064, + "step": 20469 + }, + { + "epoch": 0.7330742922627894, + "grad_norm": 1.6181906461715698, + "learning_rate": 3.50959968187085e-05, + "loss": 1.2575, + "step": 20470 + }, + { + "epoch": 0.7331101043923577, + "grad_norm": 1.5216774940490723, + "learning_rate": 3.508717325687898e-05, + "loss": 1.234, + "step": 20471 + }, + { + "epoch": 0.733145916521926, + "grad_norm": 2.048452615737915, + "learning_rate": 3.5078350568367825e-05, + "loss": 1.4341, + "step": 20472 + }, + { + "epoch": 0.7331817286514942, + "grad_norm": 1.6940174102783203, + "learning_rate": 3.5069528753293746e-05, + "loss": 1.5693, + "step": 20473 + }, + { + "epoch": 0.7332175407810625, + "grad_norm": 1.9572473764419556, + "learning_rate": 3.506070781177537e-05, + "loss": 1.419, + "step": 20474 + }, + { + "epoch": 0.7332533529106309, + "grad_norm": 1.8093883991241455, + "learning_rate": 3.505188774393141e-05, + "loss": 1.4015, + "step": 20475 + }, + { + "epoch": 0.7332891650401991, + "grad_norm": 1.1968109607696533, + "learning_rate": 3.504306854988052e-05, + "loss": 1.3426, + "step": 20476 + }, + { + "epoch": 0.7333249771697674, + "grad_norm": 1.6046714782714844, + "learning_rate": 3.5034250229741384e-05, + "loss": 1.5721, + "step": 20477 + }, + { + "epoch": 0.7333607892993357, + "grad_norm": 2.118436813354492, + "learning_rate": 3.5025432783632585e-05, + "loss": 1.56, + "step": 20478 + }, + { + "epoch": 0.733396601428904, + "grad_norm": 2.097813129425049, + "learning_rate": 3.501661621167277e-05, + "loss": 1.2924, + "step": 20479 + }, + { + "epoch": 0.7334324135584722, + "grad_norm": 1.8150711059570312, + "learning_rate": 3.50078005139806e-05, + "loss": 1.5607, + "step": 20480 + }, + { + "epoch": 0.7334682256880405, + "grad_norm": 2.765345335006714, + "learning_rate": 3.49989856906746e-05, + "loss": 1.6204, + "step": 20481 + }, + { + "epoch": 0.7335040378176089, + "grad_norm": 2.4202771186828613, + "learning_rate": 3.499017174187341e-05, + "loss": 1.797, + "step": 20482 + }, + { + "epoch": 0.7335398499471771, + "grad_norm": 1.5121333599090576, + "learning_rate": 3.498135866769561e-05, + "loss": 1.4781, + "step": 20483 + }, + { + "epoch": 0.7335756620767454, + "grad_norm": 1.4281476736068726, + "learning_rate": 3.497254646825978e-05, + "loss": 1.3851, + "step": 20484 + }, + { + "epoch": 0.7336114742063137, + "grad_norm": 1.7604395151138306, + "learning_rate": 3.496373514368443e-05, + "loss": 1.3445, + "step": 20485 + }, + { + "epoch": 0.7336472863358819, + "grad_norm": 1.9605915546417236, + "learning_rate": 3.495492469408813e-05, + "loss": 1.0695, + "step": 20486 + }, + { + "epoch": 0.7336830984654502, + "grad_norm": 1.3502087593078613, + "learning_rate": 3.494611511958942e-05, + "loss": 1.2486, + "step": 20487 + }, + { + "epoch": 0.7337189105950185, + "grad_norm": 2.571399688720703, + "learning_rate": 3.493730642030685e-05, + "loss": 1.5465, + "step": 20488 + }, + { + "epoch": 0.7337547227245869, + "grad_norm": 1.424328327178955, + "learning_rate": 3.492849859635885e-05, + "loss": 1.2137, + "step": 20489 + }, + { + "epoch": 0.7337905348541551, + "grad_norm": 1.9703563451766968, + "learning_rate": 3.4919691647863984e-05, + "loss": 1.3256, + "step": 20490 + }, + { + "epoch": 0.7338263469837234, + "grad_norm": 1.6080145835876465, + "learning_rate": 3.491088557494074e-05, + "loss": 1.4436, + "step": 20491 + }, + { + "epoch": 0.7338621591132917, + "grad_norm": 1.7678338289260864, + "learning_rate": 3.490208037770755e-05, + "loss": 1.6468, + "step": 20492 + }, + { + "epoch": 0.7338979712428599, + "grad_norm": 1.807364583015442, + "learning_rate": 3.4893276056282894e-05, + "loss": 1.4296, + "step": 20493 + }, + { + "epoch": 0.7339337833724282, + "grad_norm": 1.544736623764038, + "learning_rate": 3.4884472610785224e-05, + "loss": 1.3378, + "step": 20494 + }, + { + "epoch": 0.7339695955019965, + "grad_norm": 1.670017123222351, + "learning_rate": 3.487567004133302e-05, + "loss": 1.5572, + "step": 20495 + }, + { + "epoch": 0.7340054076315649, + "grad_norm": 1.2803508043289185, + "learning_rate": 3.4866868348044634e-05, + "loss": 1.5029, + "step": 20496 + }, + { + "epoch": 0.7340412197611331, + "grad_norm": 1.7097656726837158, + "learning_rate": 3.485806753103852e-05, + "loss": 1.4671, + "step": 20497 + }, + { + "epoch": 0.7340770318907014, + "grad_norm": 2.6056883335113525, + "learning_rate": 3.484926759043311e-05, + "loss": 1.5693, + "step": 20498 + }, + { + "epoch": 0.7341128440202697, + "grad_norm": 2.137596607208252, + "learning_rate": 3.484046852634674e-05, + "loss": 1.4641, + "step": 20499 + }, + { + "epoch": 0.7341486561498379, + "grad_norm": 1.4234815835952759, + "learning_rate": 3.483167033889781e-05, + "loss": 1.4199, + "step": 20500 + }, + { + "epoch": 0.7341844682794062, + "grad_norm": 1.8504927158355713, + "learning_rate": 3.4822873028204694e-05, + "loss": 1.445, + "step": 20501 + }, + { + "epoch": 0.7342202804089745, + "grad_norm": 1.4451167583465576, + "learning_rate": 3.481407659438579e-05, + "loss": 1.3129, + "step": 20502 + }, + { + "epoch": 0.7342560925385428, + "grad_norm": 1.8816039562225342, + "learning_rate": 3.480528103755937e-05, + "loss": 1.5152, + "step": 20503 + }, + { + "epoch": 0.7342919046681111, + "grad_norm": 1.3867018222808838, + "learning_rate": 3.479648635784378e-05, + "loss": 1.3284, + "step": 20504 + }, + { + "epoch": 0.7343277167976794, + "grad_norm": 2.107089042663574, + "learning_rate": 3.478769255535738e-05, + "loss": 1.5423, + "step": 20505 + }, + { + "epoch": 0.7343635289272477, + "grad_norm": 1.6862614154815674, + "learning_rate": 3.4778899630218483e-05, + "loss": 1.618, + "step": 20506 + }, + { + "epoch": 0.7343993410568159, + "grad_norm": 2.2158303260803223, + "learning_rate": 3.4770107582545365e-05, + "loss": 1.2988, + "step": 20507 + }, + { + "epoch": 0.7344351531863842, + "grad_norm": 1.4314731359481812, + "learning_rate": 3.4761316412456235e-05, + "loss": 1.4692, + "step": 20508 + }, + { + "epoch": 0.7344709653159525, + "grad_norm": 1.4693208932876587, + "learning_rate": 3.4752526120069516e-05, + "loss": 1.2102, + "step": 20509 + }, + { + "epoch": 0.7345067774455208, + "grad_norm": 1.7098275423049927, + "learning_rate": 3.474373670550336e-05, + "loss": 1.4572, + "step": 20510 + }, + { + "epoch": 0.7345425895750891, + "grad_norm": 1.5971198081970215, + "learning_rate": 3.4734948168876045e-05, + "loss": 1.6368, + "step": 20511 + }, + { + "epoch": 0.7345784017046574, + "grad_norm": 2.2307322025299072, + "learning_rate": 3.4726160510305824e-05, + "loss": 1.5739, + "step": 20512 + }, + { + "epoch": 0.7346142138342256, + "grad_norm": 1.6279468536376953, + "learning_rate": 3.471737372991095e-05, + "loss": 1.7299, + "step": 20513 + }, + { + "epoch": 0.7346500259637939, + "grad_norm": 2.171621084213257, + "learning_rate": 3.470858782780957e-05, + "loss": 1.7927, + "step": 20514 + }, + { + "epoch": 0.7346858380933622, + "grad_norm": 1.6976094245910645, + "learning_rate": 3.469980280411992e-05, + "loss": 1.2166, + "step": 20515 + }, + { + "epoch": 0.7347216502229305, + "grad_norm": 1.5864616632461548, + "learning_rate": 3.469101865896023e-05, + "loss": 1.4572, + "step": 20516 + }, + { + "epoch": 0.7347574623524988, + "grad_norm": 1.2989685535430908, + "learning_rate": 3.468223539244859e-05, + "loss": 1.2894, + "step": 20517 + }, + { + "epoch": 0.7347932744820671, + "grad_norm": 1.3882758617401123, + "learning_rate": 3.467345300470327e-05, + "loss": 1.3966, + "step": 20518 + }, + { + "epoch": 0.7348290866116354, + "grad_norm": 1.4743343591690063, + "learning_rate": 3.466467149584231e-05, + "loss": 1.7203, + "step": 20519 + }, + { + "epoch": 0.7348648987412036, + "grad_norm": 2.1220481395721436, + "learning_rate": 3.4655890865983975e-05, + "loss": 1.2081, + "step": 20520 + }, + { + "epoch": 0.7349007108707719, + "grad_norm": 1.9602645635604858, + "learning_rate": 3.464711111524631e-05, + "loss": 1.5757, + "step": 20521 + }, + { + "epoch": 0.7349365230003402, + "grad_norm": 1.7136316299438477, + "learning_rate": 3.4638332243747464e-05, + "loss": 1.1922, + "step": 20522 + }, + { + "epoch": 0.7349723351299084, + "grad_norm": 1.697861671447754, + "learning_rate": 3.4629554251605545e-05, + "loss": 1.5317, + "step": 20523 + }, + { + "epoch": 0.7350081472594768, + "grad_norm": 1.6139419078826904, + "learning_rate": 3.4620777138938695e-05, + "loss": 1.5648, + "step": 20524 + }, + { + "epoch": 0.7350439593890451, + "grad_norm": 1.9879515171051025, + "learning_rate": 3.461200090586495e-05, + "loss": 1.5372, + "step": 20525 + }, + { + "epoch": 0.7350797715186134, + "grad_norm": 1.6399869918823242, + "learning_rate": 3.4603225552502315e-05, + "loss": 1.3933, + "step": 20526 + }, + { + "epoch": 0.7351155836481816, + "grad_norm": 1.606967568397522, + "learning_rate": 3.4594451078969005e-05, + "loss": 1.2946, + "step": 20527 + }, + { + "epoch": 0.7351513957777499, + "grad_norm": 1.6109073162078857, + "learning_rate": 3.458567748538295e-05, + "loss": 1.5296, + "step": 20528 + }, + { + "epoch": 0.7351872079073182, + "grad_norm": 1.4543050527572632, + "learning_rate": 3.457690477186225e-05, + "loss": 1.4994, + "step": 20529 + }, + { + "epoch": 0.7352230200368864, + "grad_norm": 1.369817852973938, + "learning_rate": 3.4568132938524845e-05, + "loss": 1.2237, + "step": 20530 + }, + { + "epoch": 0.7352588321664548, + "grad_norm": 1.4970976114273071, + "learning_rate": 3.455936198548888e-05, + "loss": 1.4621, + "step": 20531 + }, + { + "epoch": 0.7352946442960231, + "grad_norm": 1.6439193487167358, + "learning_rate": 3.455059191287225e-05, + "loss": 1.5679, + "step": 20532 + }, + { + "epoch": 0.7353304564255914, + "grad_norm": 1.7179007530212402, + "learning_rate": 3.454182272079303e-05, + "loss": 1.494, + "step": 20533 + }, + { + "epoch": 0.7353662685551596, + "grad_norm": 1.4830936193466187, + "learning_rate": 3.45330544093691e-05, + "loss": 1.4933, + "step": 20534 + }, + { + "epoch": 0.7354020806847279, + "grad_norm": 1.8476966619491577, + "learning_rate": 3.4524286978718475e-05, + "loss": 1.3555, + "step": 20535 + }, + { + "epoch": 0.7354378928142962, + "grad_norm": 1.3938565254211426, + "learning_rate": 3.451552042895916e-05, + "loss": 1.3831, + "step": 20536 + }, + { + "epoch": 0.7354737049438644, + "grad_norm": 1.6916418075561523, + "learning_rate": 3.450675476020897e-05, + "loss": 1.3601, + "step": 20537 + }, + { + "epoch": 0.7355095170734328, + "grad_norm": 2.049785614013672, + "learning_rate": 3.449798997258599e-05, + "loss": 1.4742, + "step": 20538 + }, + { + "epoch": 0.7355453292030011, + "grad_norm": 1.3984603881835938, + "learning_rate": 3.4489226066208025e-05, + "loss": 1.2174, + "step": 20539 + }, + { + "epoch": 0.7355811413325694, + "grad_norm": 1.4211353063583374, + "learning_rate": 3.448046304119306e-05, + "loss": 1.4096, + "step": 20540 + }, + { + "epoch": 0.7356169534621376, + "grad_norm": 1.744436502456665, + "learning_rate": 3.44717008976589e-05, + "loss": 1.4444, + "step": 20541 + }, + { + "epoch": 0.7356527655917059, + "grad_norm": 1.7035998106002808, + "learning_rate": 3.446293963572349e-05, + "loss": 1.3967, + "step": 20542 + }, + { + "epoch": 0.7356885777212742, + "grad_norm": 1.3411145210266113, + "learning_rate": 3.4454179255504726e-05, + "loss": 1.2792, + "step": 20543 + }, + { + "epoch": 0.7357243898508424, + "grad_norm": 1.5670628547668457, + "learning_rate": 3.44454197571204e-05, + "loss": 1.4644, + "step": 20544 + }, + { + "epoch": 0.7357602019804108, + "grad_norm": 1.505676031112671, + "learning_rate": 3.4436661140688386e-05, + "loss": 1.36, + "step": 20545 + }, + { + "epoch": 0.7357960141099791, + "grad_norm": 1.3100101947784424, + "learning_rate": 3.442790340632652e-05, + "loss": 1.2666, + "step": 20546 + }, + { + "epoch": 0.7358318262395473, + "grad_norm": 1.3918676376342773, + "learning_rate": 3.441914655415268e-05, + "loss": 1.5261, + "step": 20547 + }, + { + "epoch": 0.7358676383691156, + "grad_norm": 1.698038935661316, + "learning_rate": 3.441039058428456e-05, + "loss": 1.4861, + "step": 20548 + }, + { + "epoch": 0.7359034504986839, + "grad_norm": 1.634504795074463, + "learning_rate": 3.440163549684009e-05, + "loss": 1.447, + "step": 20549 + }, + { + "epoch": 0.7359392626282522, + "grad_norm": 1.5148628950119019, + "learning_rate": 3.4392881291936995e-05, + "loss": 1.4647, + "step": 20550 + }, + { + "epoch": 0.7359750747578204, + "grad_norm": 1.5409778356552124, + "learning_rate": 3.438412796969304e-05, + "loss": 1.6875, + "step": 20551 + }, + { + "epoch": 0.7360108868873888, + "grad_norm": 1.373879075050354, + "learning_rate": 3.4375375530225984e-05, + "loss": 1.3546, + "step": 20552 + }, + { + "epoch": 0.7360466990169571, + "grad_norm": 1.4863821268081665, + "learning_rate": 3.436662397365361e-05, + "loss": 1.6509, + "step": 20553 + }, + { + "epoch": 0.7360825111465253, + "grad_norm": 2.397731065750122, + "learning_rate": 3.435787330009369e-05, + "loss": 1.1656, + "step": 20554 + }, + { + "epoch": 0.7361183232760936, + "grad_norm": 1.484554409980774, + "learning_rate": 3.4349123509663874e-05, + "loss": 1.5836, + "step": 20555 + }, + { + "epoch": 0.7361541354056619, + "grad_norm": 1.328442096710205, + "learning_rate": 3.434037460248191e-05, + "loss": 1.4112, + "step": 20556 + }, + { + "epoch": 0.7361899475352301, + "grad_norm": 1.321702480316162, + "learning_rate": 3.433162657866552e-05, + "loss": 1.3672, + "step": 20557 + }, + { + "epoch": 0.7362257596647984, + "grad_norm": 2.4087703227996826, + "learning_rate": 3.4322879438332414e-05, + "loss": 1.1557, + "step": 20558 + }, + { + "epoch": 0.7362615717943668, + "grad_norm": 1.7339824438095093, + "learning_rate": 3.431413318160022e-05, + "loss": 1.1772, + "step": 20559 + }, + { + "epoch": 0.7362973839239351, + "grad_norm": 1.9930311441421509, + "learning_rate": 3.430538780858663e-05, + "loss": 1.5279, + "step": 20560 + }, + { + "epoch": 0.7363331960535033, + "grad_norm": 1.6439847946166992, + "learning_rate": 3.429664331940935e-05, + "loss": 1.3942, + "step": 20561 + }, + { + "epoch": 0.7363690081830716, + "grad_norm": 1.385252594947815, + "learning_rate": 3.4287899714185944e-05, + "loss": 1.2338, + "step": 20562 + }, + { + "epoch": 0.7364048203126399, + "grad_norm": 1.9350563287734985, + "learning_rate": 3.427915699303408e-05, + "loss": 1.143, + "step": 20563 + }, + { + "epoch": 0.7364406324422081, + "grad_norm": 1.605703353881836, + "learning_rate": 3.427041515607139e-05, + "loss": 1.2759, + "step": 20564 + }, + { + "epoch": 0.7364764445717764, + "grad_norm": 1.4933596849441528, + "learning_rate": 3.426167420341552e-05, + "loss": 1.2269, + "step": 20565 + }, + { + "epoch": 0.7365122567013448, + "grad_norm": 1.7159199714660645, + "learning_rate": 3.4252934135183977e-05, + "loss": 1.2173, + "step": 20566 + }, + { + "epoch": 0.7365480688309131, + "grad_norm": 1.7331993579864502, + "learning_rate": 3.4244194951494414e-05, + "loss": 1.4148, + "step": 20567 + }, + { + "epoch": 0.7365838809604813, + "grad_norm": 1.2863825559616089, + "learning_rate": 3.4235456652464405e-05, + "loss": 1.3893, + "step": 20568 + }, + { + "epoch": 0.7366196930900496, + "grad_norm": 1.5917906761169434, + "learning_rate": 3.422671923821148e-05, + "loss": 1.2366, + "step": 20569 + }, + { + "epoch": 0.7366555052196179, + "grad_norm": 1.5418617725372314, + "learning_rate": 3.421798270885319e-05, + "loss": 1.2398, + "step": 20570 + }, + { + "epoch": 0.7366913173491861, + "grad_norm": 1.4444352388381958, + "learning_rate": 3.420924706450711e-05, + "loss": 1.5176, + "step": 20571 + }, + { + "epoch": 0.7367271294787544, + "grad_norm": 1.4469548463821411, + "learning_rate": 3.4200512305290764e-05, + "loss": 1.4863, + "step": 20572 + }, + { + "epoch": 0.7367629416083228, + "grad_norm": 1.415252923965454, + "learning_rate": 3.419177843132162e-05, + "loss": 1.3998, + "step": 20573 + }, + { + "epoch": 0.736798753737891, + "grad_norm": 2.118942975997925, + "learning_rate": 3.418304544271721e-05, + "loss": 1.3851, + "step": 20574 + }, + { + "epoch": 0.7368345658674593, + "grad_norm": 1.5836764574050903, + "learning_rate": 3.417431333959503e-05, + "loss": 1.4488, + "step": 20575 + }, + { + "epoch": 0.7368703779970276, + "grad_norm": 1.3505635261535645, + "learning_rate": 3.4165582122072594e-05, + "loss": 1.4246, + "step": 20576 + }, + { + "epoch": 0.7369061901265959, + "grad_norm": 1.835097312927246, + "learning_rate": 3.4156851790267283e-05, + "loss": 1.3717, + "step": 20577 + }, + { + "epoch": 0.7369420022561641, + "grad_norm": 1.7995364665985107, + "learning_rate": 3.4148122344296605e-05, + "loss": 1.7411, + "step": 20578 + }, + { + "epoch": 0.7369778143857324, + "grad_norm": 1.40411376953125, + "learning_rate": 3.413939378427804e-05, + "loss": 1.227, + "step": 20579 + }, + { + "epoch": 0.7370136265153008, + "grad_norm": 1.34280526638031, + "learning_rate": 3.413066611032894e-05, + "loss": 1.7086, + "step": 20580 + }, + { + "epoch": 0.737049438644869, + "grad_norm": 1.728757381439209, + "learning_rate": 3.412193932256675e-05, + "loss": 1.5762, + "step": 20581 + }, + { + "epoch": 0.7370852507744373, + "grad_norm": 1.793157935142517, + "learning_rate": 3.41132134211089e-05, + "loss": 1.6236, + "step": 20582 + }, + { + "epoch": 0.7371210629040056, + "grad_norm": 1.6262106895446777, + "learning_rate": 3.410448840607281e-05, + "loss": 1.2699, + "step": 20583 + }, + { + "epoch": 0.7371568750335739, + "grad_norm": 1.7049390077590942, + "learning_rate": 3.4095764277575795e-05, + "loss": 1.3049, + "step": 20584 + }, + { + "epoch": 0.7371926871631421, + "grad_norm": 1.4077553749084473, + "learning_rate": 3.4087041035735256e-05, + "loss": 1.4297, + "step": 20585 + }, + { + "epoch": 0.7372284992927104, + "grad_norm": 2.50809383392334, + "learning_rate": 3.40783186806686e-05, + "loss": 1.3544, + "step": 20586 + }, + { + "epoch": 0.7372643114222788, + "grad_norm": 1.5117441415786743, + "learning_rate": 3.406959721249309e-05, + "loss": 1.3596, + "step": 20587 + }, + { + "epoch": 0.737300123551847, + "grad_norm": 1.937361240386963, + "learning_rate": 3.406087663132611e-05, + "loss": 1.3741, + "step": 20588 + }, + { + "epoch": 0.7373359356814153, + "grad_norm": 1.7514429092407227, + "learning_rate": 3.4052156937284984e-05, + "loss": 1.0765, + "step": 20589 + }, + { + "epoch": 0.7373717478109836, + "grad_norm": 1.5857789516448975, + "learning_rate": 3.404343813048705e-05, + "loss": 1.1952, + "step": 20590 + }, + { + "epoch": 0.7374075599405518, + "grad_norm": 1.6648368835449219, + "learning_rate": 3.4034720211049544e-05, + "loss": 1.5579, + "step": 20591 + }, + { + "epoch": 0.7374433720701201, + "grad_norm": 1.51568603515625, + "learning_rate": 3.402600317908978e-05, + "loss": 1.0948, + "step": 20592 + }, + { + "epoch": 0.7374791841996884, + "grad_norm": 2.056521415710449, + "learning_rate": 3.401728703472505e-05, + "loss": 1.2874, + "step": 20593 + }, + { + "epoch": 0.7375149963292568, + "grad_norm": 1.7449597120285034, + "learning_rate": 3.400857177807265e-05, + "loss": 1.0096, + "step": 20594 + }, + { + "epoch": 0.737550808458825, + "grad_norm": 3.0433919429779053, + "learning_rate": 3.399985740924976e-05, + "loss": 1.276, + "step": 20595 + }, + { + "epoch": 0.7375866205883933, + "grad_norm": 2.3665289878845215, + "learning_rate": 3.399114392837365e-05, + "loss": 1.1514, + "step": 20596 + }, + { + "epoch": 0.7376224327179616, + "grad_norm": 1.5721759796142578, + "learning_rate": 3.3982431335561596e-05, + "loss": 1.3701, + "step": 20597 + }, + { + "epoch": 0.7376582448475298, + "grad_norm": 1.6225780248641968, + "learning_rate": 3.397371963093072e-05, + "loss": 1.0813, + "step": 20598 + }, + { + "epoch": 0.7376940569770981, + "grad_norm": 1.3954616785049438, + "learning_rate": 3.39650088145983e-05, + "loss": 1.326, + "step": 20599 + }, + { + "epoch": 0.7377298691066664, + "grad_norm": 1.427741289138794, + "learning_rate": 3.3956298886681496e-05, + "loss": 1.2924, + "step": 20600 + }, + { + "epoch": 0.7377656812362348, + "grad_norm": 2.5152885913848877, + "learning_rate": 3.3947589847297537e-05, + "loss": 1.5709, + "step": 20601 + }, + { + "epoch": 0.737801493365803, + "grad_norm": 1.531655192375183, + "learning_rate": 3.393888169656351e-05, + "loss": 1.6402, + "step": 20602 + }, + { + "epoch": 0.7378373054953713, + "grad_norm": 1.998115062713623, + "learning_rate": 3.393017443459663e-05, + "loss": 1.1815, + "step": 20603 + }, + { + "epoch": 0.7378731176249396, + "grad_norm": 1.5857163667678833, + "learning_rate": 3.392146806151405e-05, + "loss": 1.403, + "step": 20604 + }, + { + "epoch": 0.7379089297545078, + "grad_norm": 1.4481664896011353, + "learning_rate": 3.3912762577432864e-05, + "loss": 1.3232, + "step": 20605 + }, + { + "epoch": 0.7379447418840761, + "grad_norm": 1.4353240728378296, + "learning_rate": 3.3904057982470204e-05, + "loss": 1.5147, + "step": 20606 + }, + { + "epoch": 0.7379805540136444, + "grad_norm": 2.2608537673950195, + "learning_rate": 3.389535427674318e-05, + "loss": 1.2307, + "step": 20607 + }, + { + "epoch": 0.7380163661432128, + "grad_norm": 1.633110761642456, + "learning_rate": 3.3886651460368934e-05, + "loss": 1.3472, + "step": 20608 + }, + { + "epoch": 0.738052178272781, + "grad_norm": 1.5929673910140991, + "learning_rate": 3.3877949533464485e-05, + "loss": 1.7282, + "step": 20609 + }, + { + "epoch": 0.7380879904023493, + "grad_norm": 1.4499571323394775, + "learning_rate": 3.3869248496146935e-05, + "loss": 1.5401, + "step": 20610 + }, + { + "epoch": 0.7381238025319176, + "grad_norm": 2.3304126262664795, + "learning_rate": 3.3860548348533326e-05, + "loss": 1.6311, + "step": 20611 + }, + { + "epoch": 0.7381596146614858, + "grad_norm": 1.2469449043273926, + "learning_rate": 3.385184909074077e-05, + "loss": 1.4746, + "step": 20612 + }, + { + "epoch": 0.7381954267910541, + "grad_norm": 1.6836285591125488, + "learning_rate": 3.384315072288626e-05, + "loss": 1.4823, + "step": 20613 + }, + { + "epoch": 0.7382312389206224, + "grad_norm": 2.1215879917144775, + "learning_rate": 3.383445324508676e-05, + "loss": 1.3305, + "step": 20614 + }, + { + "epoch": 0.7382670510501907, + "grad_norm": 1.1666394472122192, + "learning_rate": 3.382575665745941e-05, + "loss": 0.8717, + "step": 20615 + }, + { + "epoch": 0.738302863179759, + "grad_norm": 1.4922236204147339, + "learning_rate": 3.3817060960121105e-05, + "loss": 1.6702, + "step": 20616 + }, + { + "epoch": 0.7383386753093273, + "grad_norm": 1.494726300239563, + "learning_rate": 3.380836615318891e-05, + "loss": 1.3197, + "step": 20617 + }, + { + "epoch": 0.7383744874388956, + "grad_norm": 1.798538327217102, + "learning_rate": 3.37996722367797e-05, + "loss": 1.4822, + "step": 20618 + }, + { + "epoch": 0.7384102995684638, + "grad_norm": 2.2261242866516113, + "learning_rate": 3.3790979211010576e-05, + "loss": 1.676, + "step": 20619 + }, + { + "epoch": 0.7384461116980321, + "grad_norm": 1.9397705793380737, + "learning_rate": 3.3782287075998386e-05, + "loss": 1.452, + "step": 20620 + }, + { + "epoch": 0.7384819238276004, + "grad_norm": 2.365359306335449, + "learning_rate": 3.377359583186012e-05, + "loss": 1.4546, + "step": 20621 + }, + { + "epoch": 0.7385177359571687, + "grad_norm": 1.9693297147750854, + "learning_rate": 3.376490547871272e-05, + "loss": 1.6731, + "step": 20622 + }, + { + "epoch": 0.738553548086737, + "grad_norm": 1.434880018234253, + "learning_rate": 3.375621601667305e-05, + "loss": 1.2206, + "step": 20623 + }, + { + "epoch": 0.7385893602163053, + "grad_norm": 1.6470460891723633, + "learning_rate": 3.3747527445858074e-05, + "loss": 1.6089, + "step": 20624 + }, + { + "epoch": 0.7386251723458735, + "grad_norm": 1.7691365480422974, + "learning_rate": 3.373883976638459e-05, + "loss": 1.1974, + "step": 20625 + }, + { + "epoch": 0.7386609844754418, + "grad_norm": 1.4882787466049194, + "learning_rate": 3.3730152978369614e-05, + "loss": 1.5165, + "step": 20626 + }, + { + "epoch": 0.7386967966050101, + "grad_norm": 1.4077999591827393, + "learning_rate": 3.3721467081929914e-05, + "loss": 1.0703, + "step": 20627 + }, + { + "epoch": 0.7387326087345784, + "grad_norm": 1.4659085273742676, + "learning_rate": 3.371278207718241e-05, + "loss": 1.2775, + "step": 20628 + }, + { + "epoch": 0.7387684208641467, + "grad_norm": 1.8654396533966064, + "learning_rate": 3.370409796424386e-05, + "loss": 1.6343, + "step": 20629 + }, + { + "epoch": 0.738804232993715, + "grad_norm": 1.9576849937438965, + "learning_rate": 3.369541474323122e-05, + "loss": 1.2882, + "step": 20630 + }, + { + "epoch": 0.7388400451232833, + "grad_norm": 1.4226261377334595, + "learning_rate": 3.3686732414261254e-05, + "loss": 1.4776, + "step": 20631 + }, + { + "epoch": 0.7388758572528515, + "grad_norm": 1.3740144968032837, + "learning_rate": 3.367805097745069e-05, + "loss": 1.4068, + "step": 20632 + }, + { + "epoch": 0.7389116693824198, + "grad_norm": 1.6114263534545898, + "learning_rate": 3.366937043291648e-05, + "loss": 1.3795, + "step": 20633 + }, + { + "epoch": 0.7389474815119881, + "grad_norm": 2.5558178424835205, + "learning_rate": 3.3660690780775286e-05, + "loss": 1.7663, + "step": 20634 + }, + { + "epoch": 0.7389832936415563, + "grad_norm": 1.6846691370010376, + "learning_rate": 3.3652012021143964e-05, + "loss": 1.6188, + "step": 20635 + }, + { + "epoch": 0.7390191057711247, + "grad_norm": 1.8610543012619019, + "learning_rate": 3.364333415413917e-05, + "loss": 1.2382, + "step": 20636 + }, + { + "epoch": 0.739054917900693, + "grad_norm": 1.7064975500106812, + "learning_rate": 3.363465717987778e-05, + "loss": 1.6473, + "step": 20637 + }, + { + "epoch": 0.7390907300302613, + "grad_norm": 1.726022720336914, + "learning_rate": 3.3625981098476444e-05, + "loss": 1.4555, + "step": 20638 + }, + { + "epoch": 0.7391265421598295, + "grad_norm": 1.8379038572311401, + "learning_rate": 3.3617305910051956e-05, + "loss": 1.499, + "step": 20639 + }, + { + "epoch": 0.7391623542893978, + "grad_norm": 1.7083884477615356, + "learning_rate": 3.3608631614720955e-05, + "loss": 1.7173, + "step": 20640 + }, + { + "epoch": 0.7391981664189661, + "grad_norm": 1.3180886507034302, + "learning_rate": 3.359995821260017e-05, + "loss": 1.41, + "step": 20641 + }, + { + "epoch": 0.7392339785485343, + "grad_norm": 1.614916443824768, + "learning_rate": 3.359128570380633e-05, + "loss": 1.2589, + "step": 20642 + }, + { + "epoch": 0.7392697906781027, + "grad_norm": 1.9553498029708862, + "learning_rate": 3.3582614088456055e-05, + "loss": 1.4967, + "step": 20643 + }, + { + "epoch": 0.739305602807671, + "grad_norm": 1.4962507486343384, + "learning_rate": 3.3573943366666026e-05, + "loss": 1.3142, + "step": 20644 + }, + { + "epoch": 0.7393414149372393, + "grad_norm": 1.409155249595642, + "learning_rate": 3.356527353855291e-05, + "loss": 1.4818, + "step": 20645 + }, + { + "epoch": 0.7393772270668075, + "grad_norm": 1.665363073348999, + "learning_rate": 3.355660460423338e-05, + "loss": 1.1948, + "step": 20646 + }, + { + "epoch": 0.7394130391963758, + "grad_norm": 1.674485445022583, + "learning_rate": 3.354793656382399e-05, + "loss": 1.4648, + "step": 20647 + }, + { + "epoch": 0.7394488513259441, + "grad_norm": 1.315306305885315, + "learning_rate": 3.35392694174414e-05, + "loss": 1.4364, + "step": 20648 + }, + { + "epoch": 0.7394846634555123, + "grad_norm": 1.6919763088226318, + "learning_rate": 3.3530603165202245e-05, + "loss": 1.3387, + "step": 20649 + }, + { + "epoch": 0.7395204755850807, + "grad_norm": 1.8464726209640503, + "learning_rate": 3.352193780722306e-05, + "loss": 1.4225, + "step": 20650 + }, + { + "epoch": 0.739556287714649, + "grad_norm": 1.7736725807189941, + "learning_rate": 3.351327334362043e-05, + "loss": 1.352, + "step": 20651 + }, + { + "epoch": 0.7395920998442173, + "grad_norm": 1.6074994802474976, + "learning_rate": 3.3504609774510964e-05, + "loss": 1.3402, + "step": 20652 + }, + { + "epoch": 0.7396279119737855, + "grad_norm": 1.3930764198303223, + "learning_rate": 3.349594710001123e-05, + "loss": 1.3765, + "step": 20653 + }, + { + "epoch": 0.7396637241033538, + "grad_norm": 1.6473559141159058, + "learning_rate": 3.3487285320237705e-05, + "loss": 1.2232, + "step": 20654 + }, + { + "epoch": 0.7396995362329221, + "grad_norm": 1.8392513990402222, + "learning_rate": 3.347862443530697e-05, + "loss": 1.4693, + "step": 20655 + }, + { + "epoch": 0.7397353483624903, + "grad_norm": 1.4013627767562866, + "learning_rate": 3.3469964445335566e-05, + "loss": 1.1409, + "step": 20656 + }, + { + "epoch": 0.7397711604920587, + "grad_norm": 1.6355818510055542, + "learning_rate": 3.346130535043993e-05, + "loss": 1.2311, + "step": 20657 + }, + { + "epoch": 0.739806972621627, + "grad_norm": 1.5286083221435547, + "learning_rate": 3.3452647150736615e-05, + "loss": 1.3021, + "step": 20658 + }, + { + "epoch": 0.7398427847511952, + "grad_norm": 1.5414046049118042, + "learning_rate": 3.3443989846342084e-05, + "loss": 1.1697, + "step": 20659 + }, + { + "epoch": 0.7398785968807635, + "grad_norm": 1.3940845727920532, + "learning_rate": 3.3435333437372854e-05, + "loss": 1.5052, + "step": 20660 + }, + { + "epoch": 0.7399144090103318, + "grad_norm": 1.6988524198532104, + "learning_rate": 3.3426677923945314e-05, + "loss": 1.4081, + "step": 20661 + }, + { + "epoch": 0.7399502211399, + "grad_norm": 1.670135498046875, + "learning_rate": 3.341802330617596e-05, + "loss": 1.4241, + "step": 20662 + }, + { + "epoch": 0.7399860332694683, + "grad_norm": 1.8642773628234863, + "learning_rate": 3.3409369584181216e-05, + "loss": 1.4035, + "step": 20663 + }, + { + "epoch": 0.7400218453990367, + "grad_norm": 1.446311593055725, + "learning_rate": 3.340071675807753e-05, + "loss": 1.6849, + "step": 20664 + }, + { + "epoch": 0.740057657528605, + "grad_norm": 1.5131405591964722, + "learning_rate": 3.3392064827981275e-05, + "loss": 1.6785, + "step": 20665 + }, + { + "epoch": 0.7400934696581732, + "grad_norm": 2.638115406036377, + "learning_rate": 3.338341379400885e-05, + "loss": 1.3795, + "step": 20666 + }, + { + "epoch": 0.7401292817877415, + "grad_norm": 1.5250235795974731, + "learning_rate": 3.337476365627672e-05, + "loss": 1.421, + "step": 20667 + }, + { + "epoch": 0.7401650939173098, + "grad_norm": 1.3569045066833496, + "learning_rate": 3.336611441490115e-05, + "loss": 1.533, + "step": 20668 + }, + { + "epoch": 0.740200906046878, + "grad_norm": 1.659124493598938, + "learning_rate": 3.335746606999858e-05, + "loss": 1.3057, + "step": 20669 + }, + { + "epoch": 0.7402367181764463, + "grad_norm": 1.662147045135498, + "learning_rate": 3.334881862168532e-05, + "loss": 1.3973, + "step": 20670 + }, + { + "epoch": 0.7402725303060147, + "grad_norm": 1.852306604385376, + "learning_rate": 3.334017207007778e-05, + "loss": 1.5306, + "step": 20671 + }, + { + "epoch": 0.740308342435583, + "grad_norm": 1.557503581047058, + "learning_rate": 3.33315264152922e-05, + "loss": 1.2448, + "step": 20672 + }, + { + "epoch": 0.7403441545651512, + "grad_norm": 2.1327314376831055, + "learning_rate": 3.332288165744494e-05, + "loss": 1.3542, + "step": 20673 + }, + { + "epoch": 0.7403799666947195, + "grad_norm": 1.6384520530700684, + "learning_rate": 3.3314237796652324e-05, + "loss": 1.7015, + "step": 20674 + }, + { + "epoch": 0.7404157788242878, + "grad_norm": 1.7940348386764526, + "learning_rate": 3.33055948330306e-05, + "loss": 1.4875, + "step": 20675 + }, + { + "epoch": 0.740451590953856, + "grad_norm": 2.3545427322387695, + "learning_rate": 3.329695276669605e-05, + "loss": 1.3333, + "step": 20676 + }, + { + "epoch": 0.7404874030834243, + "grad_norm": 1.4605990648269653, + "learning_rate": 3.3288311597764976e-05, + "loss": 1.713, + "step": 20677 + }, + { + "epoch": 0.7405232152129927, + "grad_norm": 2.598592758178711, + "learning_rate": 3.327967132635364e-05, + "loss": 1.4987, + "step": 20678 + }, + { + "epoch": 0.740559027342561, + "grad_norm": 1.3932163715362549, + "learning_rate": 3.3271031952578245e-05, + "loss": 1.2992, + "step": 20679 + }, + { + "epoch": 0.7405948394721292, + "grad_norm": 2.0070300102233887, + "learning_rate": 3.326239347655503e-05, + "loss": 1.5381, + "step": 20680 + }, + { + "epoch": 0.7406306516016975, + "grad_norm": 1.4914416074752808, + "learning_rate": 3.325375589840023e-05, + "loss": 1.4634, + "step": 20681 + }, + { + "epoch": 0.7406664637312658, + "grad_norm": 1.81553053855896, + "learning_rate": 3.3245119218230066e-05, + "loss": 1.3835, + "step": 20682 + }, + { + "epoch": 0.740702275860834, + "grad_norm": 1.629849910736084, + "learning_rate": 3.32364834361607e-05, + "loss": 1.4646, + "step": 20683 + }, + { + "epoch": 0.7407380879904023, + "grad_norm": 1.6208226680755615, + "learning_rate": 3.3227848552308326e-05, + "loss": 1.4159, + "step": 20684 + }, + { + "epoch": 0.7407739001199707, + "grad_norm": 1.9516637325286865, + "learning_rate": 3.321921456678915e-05, + "loss": 1.3905, + "step": 20685 + }, + { + "epoch": 0.740809712249539, + "grad_norm": 1.5003976821899414, + "learning_rate": 3.321058147971927e-05, + "loss": 1.7165, + "step": 20686 + }, + { + "epoch": 0.7408455243791072, + "grad_norm": 1.5799978971481323, + "learning_rate": 3.320194929121486e-05, + "loss": 1.6016, + "step": 20687 + }, + { + "epoch": 0.7408813365086755, + "grad_norm": 1.946989893913269, + "learning_rate": 3.319331800139207e-05, + "loss": 1.8459, + "step": 20688 + }, + { + "epoch": 0.7409171486382438, + "grad_norm": 1.4369703531265259, + "learning_rate": 3.318468761036704e-05, + "loss": 1.5164, + "step": 20689 + }, + { + "epoch": 0.740952960767812, + "grad_norm": 1.370896339416504, + "learning_rate": 3.3176058118255816e-05, + "loss": 1.4781, + "step": 20690 + }, + { + "epoch": 0.7409887728973803, + "grad_norm": 1.3151863813400269, + "learning_rate": 3.316742952517453e-05, + "loss": 1.3523, + "step": 20691 + }, + { + "epoch": 0.7410245850269487, + "grad_norm": 1.8912273645401, + "learning_rate": 3.3158801831239314e-05, + "loss": 1.345, + "step": 20692 + }, + { + "epoch": 0.741060397156517, + "grad_norm": 1.3911374807357788, + "learning_rate": 3.3150175036566166e-05, + "loss": 1.6394, + "step": 20693 + }, + { + "epoch": 0.7410962092860852, + "grad_norm": 1.4584274291992188, + "learning_rate": 3.314154914127118e-05, + "loss": 1.3457, + "step": 20694 + }, + { + "epoch": 0.7411320214156535, + "grad_norm": 1.4885680675506592, + "learning_rate": 3.31329241454704e-05, + "loss": 1.709, + "step": 20695 + }, + { + "epoch": 0.7411678335452218, + "grad_norm": 2.0191259384155273, + "learning_rate": 3.312430004927992e-05, + "loss": 1.3875, + "step": 20696 + }, + { + "epoch": 0.74120364567479, + "grad_norm": 2.2317590713500977, + "learning_rate": 3.311567685281568e-05, + "loss": 1.6509, + "step": 20697 + }, + { + "epoch": 0.7412394578043583, + "grad_norm": 1.3376753330230713, + "learning_rate": 3.310705455619374e-05, + "loss": 1.3804, + "step": 20698 + }, + { + "epoch": 0.7412752699339267, + "grad_norm": 1.5647464990615845, + "learning_rate": 3.309843315953008e-05, + "loss": 1.2623, + "step": 20699 + }, + { + "epoch": 0.7413110820634949, + "grad_norm": 1.6733916997909546, + "learning_rate": 3.3089812662940754e-05, + "loss": 1.1969, + "step": 20700 + }, + { + "epoch": 0.7413468941930632, + "grad_norm": 1.6514251232147217, + "learning_rate": 3.308119306654168e-05, + "loss": 1.2187, + "step": 20701 + }, + { + "epoch": 0.7413827063226315, + "grad_norm": 1.6864891052246094, + "learning_rate": 3.3072574370448783e-05, + "loss": 1.5163, + "step": 20702 + }, + { + "epoch": 0.7414185184521997, + "grad_norm": 1.344460129737854, + "learning_rate": 3.306395657477812e-05, + "loss": 1.4425, + "step": 20703 + }, + { + "epoch": 0.741454330581768, + "grad_norm": 1.5931432247161865, + "learning_rate": 3.3055339679645544e-05, + "loss": 1.5247, + "step": 20704 + }, + { + "epoch": 0.7414901427113363, + "grad_norm": 2.037649631500244, + "learning_rate": 3.304672368516704e-05, + "loss": 1.5746, + "step": 20705 + }, + { + "epoch": 0.7415259548409047, + "grad_norm": 1.2448827028274536, + "learning_rate": 3.303810859145848e-05, + "loss": 1.4724, + "step": 20706 + }, + { + "epoch": 0.7415617669704729, + "grad_norm": 1.5991346836090088, + "learning_rate": 3.302949439863584e-05, + "loss": 1.6548, + "step": 20707 + }, + { + "epoch": 0.7415975791000412, + "grad_norm": 1.6858115196228027, + "learning_rate": 3.3020881106814936e-05, + "loss": 1.6506, + "step": 20708 + }, + { + "epoch": 0.7416333912296095, + "grad_norm": 1.7268314361572266, + "learning_rate": 3.301226871611168e-05, + "loss": 1.278, + "step": 20709 + }, + { + "epoch": 0.7416692033591777, + "grad_norm": 1.3854961395263672, + "learning_rate": 3.3003657226641974e-05, + "loss": 1.3806, + "step": 20710 + }, + { + "epoch": 0.741705015488746, + "grad_norm": 1.8541377782821655, + "learning_rate": 3.2995046638521595e-05, + "loss": 1.0329, + "step": 20711 + }, + { + "epoch": 0.7417408276183143, + "grad_norm": 1.6589943170547485, + "learning_rate": 3.2986436951866486e-05, + "loss": 1.3059, + "step": 20712 + }, + { + "epoch": 0.7417766397478827, + "grad_norm": 1.4660264253616333, + "learning_rate": 3.2977828166792345e-05, + "loss": 1.7456, + "step": 20713 + }, + { + "epoch": 0.7418124518774509, + "grad_norm": 1.5078004598617554, + "learning_rate": 3.296922028341515e-05, + "loss": 1.3345, + "step": 20714 + }, + { + "epoch": 0.7418482640070192, + "grad_norm": 1.6876119375228882, + "learning_rate": 3.29606133018506e-05, + "loss": 1.5791, + "step": 20715 + }, + { + "epoch": 0.7418840761365875, + "grad_norm": 1.7531828880310059, + "learning_rate": 3.2952007222214545e-05, + "loss": 1.1627, + "step": 20716 + }, + { + "epoch": 0.7419198882661557, + "grad_norm": 3.027247667312622, + "learning_rate": 3.29434020446227e-05, + "loss": 1.1017, + "step": 20717 + }, + { + "epoch": 0.741955700395724, + "grad_norm": 1.8014774322509766, + "learning_rate": 3.293479776919093e-05, + "loss": 1.4673, + "step": 20718 + }, + { + "epoch": 0.7419915125252923, + "grad_norm": 1.9687037467956543, + "learning_rate": 3.292619439603495e-05, + "loss": 1.5301, + "step": 20719 + }, + { + "epoch": 0.7420273246548607, + "grad_norm": 1.6570146083831787, + "learning_rate": 3.291759192527045e-05, + "loss": 1.5799, + "step": 20720 + }, + { + "epoch": 0.7420631367844289, + "grad_norm": 1.3398667573928833, + "learning_rate": 3.290899035701328e-05, + "loss": 1.5945, + "step": 20721 + }, + { + "epoch": 0.7420989489139972, + "grad_norm": 1.6556895971298218, + "learning_rate": 3.2900389691379074e-05, + "loss": 1.3598, + "step": 20722 + }, + { + "epoch": 0.7421347610435655, + "grad_norm": 1.4631439447402954, + "learning_rate": 3.2891789928483594e-05, + "loss": 1.1738, + "step": 20723 + }, + { + "epoch": 0.7421705731731337, + "grad_norm": 1.3914145231246948, + "learning_rate": 3.2883191068442464e-05, + "loss": 1.4977, + "step": 20724 + }, + { + "epoch": 0.742206385302702, + "grad_norm": 1.4877334833145142, + "learning_rate": 3.287459311137149e-05, + "loss": 1.7173, + "step": 20725 + }, + { + "epoch": 0.7422421974322703, + "grad_norm": 1.5080418586730957, + "learning_rate": 3.286599605738624e-05, + "loss": 1.361, + "step": 20726 + }, + { + "epoch": 0.7422780095618386, + "grad_norm": 1.6488600969314575, + "learning_rate": 3.285739990660246e-05, + "loss": 1.0568, + "step": 20727 + }, + { + "epoch": 0.7423138216914069, + "grad_norm": 1.9895234107971191, + "learning_rate": 3.284880465913571e-05, + "loss": 1.6501, + "step": 20728 + }, + { + "epoch": 0.7423496338209752, + "grad_norm": 1.7913322448730469, + "learning_rate": 3.284021031510168e-05, + "loss": 1.395, + "step": 20729 + }, + { + "epoch": 0.7423854459505435, + "grad_norm": 1.5944781303405762, + "learning_rate": 3.2831616874616036e-05, + "loss": 1.5749, + "step": 20730 + }, + { + "epoch": 0.7424212580801117, + "grad_norm": 1.548054814338684, + "learning_rate": 3.282302433779426e-05, + "loss": 1.6684, + "step": 20731 + }, + { + "epoch": 0.74245707020968, + "grad_norm": 1.558003306388855, + "learning_rate": 3.281443270475212e-05, + "loss": 1.3898, + "step": 20732 + }, + { + "epoch": 0.7424928823392483, + "grad_norm": 1.7061145305633545, + "learning_rate": 3.280584197560508e-05, + "loss": 1.7347, + "step": 20733 + }, + { + "epoch": 0.7425286944688166, + "grad_norm": 1.6126638650894165, + "learning_rate": 3.2797252150468804e-05, + "loss": 1.5521, + "step": 20734 + }, + { + "epoch": 0.7425645065983849, + "grad_norm": 1.6583861112594604, + "learning_rate": 3.278866322945874e-05, + "loss": 1.4871, + "step": 20735 + }, + { + "epoch": 0.7426003187279532, + "grad_norm": 1.5712144374847412, + "learning_rate": 3.278007521269059e-05, + "loss": 1.5044, + "step": 20736 + }, + { + "epoch": 0.7426361308575214, + "grad_norm": 2.0407989025115967, + "learning_rate": 3.2771488100279814e-05, + "loss": 1.4875, + "step": 20737 + }, + { + "epoch": 0.7426719429870897, + "grad_norm": 1.7771286964416504, + "learning_rate": 3.2762901892341926e-05, + "loss": 1.2809, + "step": 20738 + }, + { + "epoch": 0.742707755116658, + "grad_norm": 2.0323855876922607, + "learning_rate": 3.2754316588992454e-05, + "loss": 1.4569, + "step": 20739 + }, + { + "epoch": 0.7427435672462263, + "grad_norm": 1.6384506225585938, + "learning_rate": 3.274573219034691e-05, + "loss": 1.7623, + "step": 20740 + }, + { + "epoch": 0.7427793793757946, + "grad_norm": 1.7023667097091675, + "learning_rate": 3.2737148696520824e-05, + "loss": 1.3912, + "step": 20741 + }, + { + "epoch": 0.7428151915053629, + "grad_norm": 1.7247306108474731, + "learning_rate": 3.272856610762961e-05, + "loss": 1.496, + "step": 20742 + }, + { + "epoch": 0.7428510036349312, + "grad_norm": 1.7189574241638184, + "learning_rate": 3.271998442378875e-05, + "loss": 1.613, + "step": 20743 + }, + { + "epoch": 0.7428868157644994, + "grad_norm": 2.2422940731048584, + "learning_rate": 3.271140364511377e-05, + "loss": 1.7311, + "step": 20744 + }, + { + "epoch": 0.7429226278940677, + "grad_norm": 1.8080083131790161, + "learning_rate": 3.270282377172001e-05, + "loss": 1.3932, + "step": 20745 + }, + { + "epoch": 0.742958440023636, + "grad_norm": 1.682898759841919, + "learning_rate": 3.269424480372295e-05, + "loss": 1.6761, + "step": 20746 + }, + { + "epoch": 0.7429942521532042, + "grad_norm": 1.3644120693206787, + "learning_rate": 3.268566674123802e-05, + "loss": 1.4248, + "step": 20747 + }, + { + "epoch": 0.7430300642827726, + "grad_norm": 2.0592284202575684, + "learning_rate": 3.267708958438063e-05, + "loss": 1.3517, + "step": 20748 + }, + { + "epoch": 0.7430658764123409, + "grad_norm": 1.3781828880310059, + "learning_rate": 3.266851333326614e-05, + "loss": 1.4983, + "step": 20749 + }, + { + "epoch": 0.7431016885419092, + "grad_norm": 1.7163243293762207, + "learning_rate": 3.265993798800995e-05, + "loss": 1.0821, + "step": 20750 + }, + { + "epoch": 0.7431375006714774, + "grad_norm": 1.565303087234497, + "learning_rate": 3.265136354872742e-05, + "loss": 1.3198, + "step": 20751 + }, + { + "epoch": 0.7431733128010457, + "grad_norm": 1.516454815864563, + "learning_rate": 3.2642790015533965e-05, + "loss": 1.1464, + "step": 20752 + }, + { + "epoch": 0.743209124930614, + "grad_norm": 1.5399000644683838, + "learning_rate": 3.2634217388544855e-05, + "loss": 1.6111, + "step": 20753 + }, + { + "epoch": 0.7432449370601822, + "grad_norm": 1.608729600906372, + "learning_rate": 3.2625645667875434e-05, + "loss": 1.4164, + "step": 20754 + }, + { + "epoch": 0.7432807491897506, + "grad_norm": 1.7107634544372559, + "learning_rate": 3.26170748536411e-05, + "loss": 1.4349, + "step": 20755 + }, + { + "epoch": 0.7433165613193189, + "grad_norm": 1.5795230865478516, + "learning_rate": 3.260850494595707e-05, + "loss": 1.3175, + "step": 20756 + }, + { + "epoch": 0.7433523734488872, + "grad_norm": 1.8577889204025269, + "learning_rate": 3.259993594493866e-05, + "loss": 1.6718, + "step": 20757 + }, + { + "epoch": 0.7433881855784554, + "grad_norm": 1.8486729860305786, + "learning_rate": 3.2591367850701194e-05, + "loss": 1.424, + "step": 20758 + }, + { + "epoch": 0.7434239977080237, + "grad_norm": 1.7383266687393188, + "learning_rate": 3.2582800663359933e-05, + "loss": 1.584, + "step": 20759 + }, + { + "epoch": 0.743459809837592, + "grad_norm": 1.9045100212097168, + "learning_rate": 3.257423438303011e-05, + "loss": 1.4333, + "step": 20760 + }, + { + "epoch": 0.7434956219671602, + "grad_norm": 1.3774325847625732, + "learning_rate": 3.256566900982699e-05, + "loss": 1.4165, + "step": 20761 + }, + { + "epoch": 0.7435314340967286, + "grad_norm": 1.4504196643829346, + "learning_rate": 3.255710454386585e-05, + "loss": 1.3648, + "step": 20762 + }, + { + "epoch": 0.7435672462262969, + "grad_norm": 1.3335399627685547, + "learning_rate": 3.2548540985261824e-05, + "loss": 1.5259, + "step": 20763 + }, + { + "epoch": 0.7436030583558652, + "grad_norm": 1.8252145051956177, + "learning_rate": 3.2539978334130174e-05, + "loss": 1.2774, + "step": 20764 + }, + { + "epoch": 0.7436388704854334, + "grad_norm": 1.58820378780365, + "learning_rate": 3.253141659058611e-05, + "loss": 1.4351, + "step": 20765 + }, + { + "epoch": 0.7436746826150017, + "grad_norm": 1.7370541095733643, + "learning_rate": 3.252285575474483e-05, + "loss": 1.6168, + "step": 20766 + }, + { + "epoch": 0.74371049474457, + "grad_norm": 2.1864709854125977, + "learning_rate": 3.251429582672145e-05, + "loss": 2.0227, + "step": 20767 + }, + { + "epoch": 0.7437463068741382, + "grad_norm": 2.4286386966705322, + "learning_rate": 3.2505736806631185e-05, + "loss": 1.8233, + "step": 20768 + }, + { + "epoch": 0.7437821190037066, + "grad_norm": 1.746517300605774, + "learning_rate": 3.249717869458916e-05, + "loss": 1.2833, + "step": 20769 + }, + { + "epoch": 0.7438179311332749, + "grad_norm": 1.7618461847305298, + "learning_rate": 3.248862149071056e-05, + "loss": 1.5458, + "step": 20770 + }, + { + "epoch": 0.7438537432628431, + "grad_norm": 1.8682754039764404, + "learning_rate": 3.248006519511043e-05, + "loss": 1.6408, + "step": 20771 + }, + { + "epoch": 0.7438895553924114, + "grad_norm": 1.6848514080047607, + "learning_rate": 3.247150980790394e-05, + "loss": 1.5804, + "step": 20772 + }, + { + "epoch": 0.7439253675219797, + "grad_norm": 1.2802892923355103, + "learning_rate": 3.2462955329206213e-05, + "loss": 1.3587, + "step": 20773 + }, + { + "epoch": 0.743961179651548, + "grad_norm": 1.8104581832885742, + "learning_rate": 3.245440175913227e-05, + "loss": 1.4791, + "step": 20774 + }, + { + "epoch": 0.7439969917811162, + "grad_norm": 1.7056621313095093, + "learning_rate": 3.244584909779722e-05, + "loss": 1.6028, + "step": 20775 + }, + { + "epoch": 0.7440328039106846, + "grad_norm": 2.1532299518585205, + "learning_rate": 3.243729734531614e-05, + "loss": 1.3802, + "step": 20776 + }, + { + "epoch": 0.7440686160402529, + "grad_norm": 1.530900239944458, + "learning_rate": 3.2428746501804106e-05, + "loss": 1.5308, + "step": 20777 + }, + { + "epoch": 0.7441044281698211, + "grad_norm": 1.56569242477417, + "learning_rate": 3.2420196567376096e-05, + "loss": 1.538, + "step": 20778 + }, + { + "epoch": 0.7441402402993894, + "grad_norm": 1.6894400119781494, + "learning_rate": 3.241164754214716e-05, + "loss": 1.3194, + "step": 20779 + }, + { + "epoch": 0.7441760524289577, + "grad_norm": 2.2958922386169434, + "learning_rate": 3.2403099426232365e-05, + "loss": 1.4457, + "step": 20780 + }, + { + "epoch": 0.744211864558526, + "grad_norm": 1.536910057067871, + "learning_rate": 3.239455221974663e-05, + "loss": 0.9383, + "step": 20781 + }, + { + "epoch": 0.7442476766880942, + "grad_norm": 1.4549648761749268, + "learning_rate": 3.2386005922804996e-05, + "loss": 1.4439, + "step": 20782 + }, + { + "epoch": 0.7442834888176626, + "grad_norm": 1.574837327003479, + "learning_rate": 3.237746053552244e-05, + "loss": 1.3761, + "step": 20783 + }, + { + "epoch": 0.7443193009472309, + "grad_norm": 1.8946826457977295, + "learning_rate": 3.2368916058013956e-05, + "loss": 1.4469, + "step": 20784 + }, + { + "epoch": 0.7443551130767991, + "grad_norm": 2.358954906463623, + "learning_rate": 3.236037249039444e-05, + "loss": 1.6472, + "step": 20785 + }, + { + "epoch": 0.7443909252063674, + "grad_norm": 1.663940668106079, + "learning_rate": 3.235182983277886e-05, + "loss": 1.4214, + "step": 20786 + }, + { + "epoch": 0.7444267373359357, + "grad_norm": 1.7048293352127075, + "learning_rate": 3.234328808528215e-05, + "loss": 1.4342, + "step": 20787 + }, + { + "epoch": 0.7444625494655039, + "grad_norm": 1.6530091762542725, + "learning_rate": 3.233474724801926e-05, + "loss": 1.5719, + "step": 20788 + }, + { + "epoch": 0.7444983615950722, + "grad_norm": 1.466918706893921, + "learning_rate": 3.232620732110503e-05, + "loss": 1.3972, + "step": 20789 + }, + { + "epoch": 0.7445341737246406, + "grad_norm": 1.7518681287765503, + "learning_rate": 3.231766830465439e-05, + "loss": 1.5575, + "step": 20790 + }, + { + "epoch": 0.7445699858542089, + "grad_norm": 1.4471641778945923, + "learning_rate": 3.230913019878224e-05, + "loss": 1.448, + "step": 20791 + }, + { + "epoch": 0.7446057979837771, + "grad_norm": 1.9745919704437256, + "learning_rate": 3.230059300360342e-05, + "loss": 1.4811, + "step": 20792 + }, + { + "epoch": 0.7446416101133454, + "grad_norm": 1.7885055541992188, + "learning_rate": 3.229205671923278e-05, + "loss": 1.5635, + "step": 20793 + }, + { + "epoch": 0.7446774222429137, + "grad_norm": 1.5172529220581055, + "learning_rate": 3.2283521345785176e-05, + "loss": 1.5204, + "step": 20794 + }, + { + "epoch": 0.7447132343724819, + "grad_norm": 1.5513358116149902, + "learning_rate": 3.227498688337548e-05, + "loss": 1.4141, + "step": 20795 + }, + { + "epoch": 0.7447490465020502, + "grad_norm": 1.9045683145523071, + "learning_rate": 3.226645333211845e-05, + "loss": 1.4086, + "step": 20796 + }, + { + "epoch": 0.7447848586316185, + "grad_norm": 1.5256716012954712, + "learning_rate": 3.225792069212892e-05, + "loss": 1.5432, + "step": 20797 + }, + { + "epoch": 0.7448206707611869, + "grad_norm": 1.4963194131851196, + "learning_rate": 3.224938896352171e-05, + "loss": 1.1279, + "step": 20798 + }, + { + "epoch": 0.7448564828907551, + "grad_norm": 1.3626344203948975, + "learning_rate": 3.2240858146411546e-05, + "loss": 1.1153, + "step": 20799 + }, + { + "epoch": 0.7448922950203234, + "grad_norm": 1.44807767868042, + "learning_rate": 3.2232328240913277e-05, + "loss": 1.3652, + "step": 20800 + }, + { + "epoch": 0.7449281071498917, + "grad_norm": 2.269850492477417, + "learning_rate": 3.222379924714155e-05, + "loss": 1.1314, + "step": 20801 + }, + { + "epoch": 0.7449639192794599, + "grad_norm": 1.7084736824035645, + "learning_rate": 3.221527116521124e-05, + "loss": 1.5226, + "step": 20802 + }, + { + "epoch": 0.7449997314090282, + "grad_norm": 2.213249444961548, + "learning_rate": 3.220674399523699e-05, + "loss": 1.3941, + "step": 20803 + }, + { + "epoch": 0.7450355435385965, + "grad_norm": 1.5036464929580688, + "learning_rate": 3.219821773733355e-05, + "loss": 1.5555, + "step": 20804 + }, + { + "epoch": 0.7450713556681648, + "grad_norm": 1.3128011226654053, + "learning_rate": 3.218969239161563e-05, + "loss": 1.3888, + "step": 20805 + }, + { + "epoch": 0.7451071677977331, + "grad_norm": 1.5558034181594849, + "learning_rate": 3.2181167958197964e-05, + "loss": 1.2282, + "step": 20806 + }, + { + "epoch": 0.7451429799273014, + "grad_norm": 2.2121920585632324, + "learning_rate": 3.2172644437195207e-05, + "loss": 1.733, + "step": 20807 + }, + { + "epoch": 0.7451787920568697, + "grad_norm": 1.9016462564468384, + "learning_rate": 3.216412182872196e-05, + "loss": 1.4272, + "step": 20808 + }, + { + "epoch": 0.7452146041864379, + "grad_norm": 1.629558801651001, + "learning_rate": 3.215560013289301e-05, + "loss": 1.5492, + "step": 20809 + }, + { + "epoch": 0.7452504163160062, + "grad_norm": 3.6835665702819824, + "learning_rate": 3.2147079349822925e-05, + "loss": 1.5285, + "step": 20810 + }, + { + "epoch": 0.7452862284455745, + "grad_norm": 1.4725559949874878, + "learning_rate": 3.2138559479626395e-05, + "loss": 1.4868, + "step": 20811 + }, + { + "epoch": 0.7453220405751428, + "grad_norm": 1.4919462203979492, + "learning_rate": 3.2130040522417946e-05, + "loss": 1.5146, + "step": 20812 + }, + { + "epoch": 0.7453578527047111, + "grad_norm": 1.767691731452942, + "learning_rate": 3.212152247831233e-05, + "loss": 1.2523, + "step": 20813 + }, + { + "epoch": 0.7453936648342794, + "grad_norm": 1.4259493350982666, + "learning_rate": 3.211300534742402e-05, + "loss": 1.7543, + "step": 20814 + }, + { + "epoch": 0.7454294769638476, + "grad_norm": 1.6164785623550415, + "learning_rate": 3.210448912986767e-05, + "loss": 1.5031, + "step": 20815 + }, + { + "epoch": 0.7454652890934159, + "grad_norm": 1.4968827962875366, + "learning_rate": 3.209597382575786e-05, + "loss": 1.4044, + "step": 20816 + }, + { + "epoch": 0.7455011012229842, + "grad_norm": 1.9254761934280396, + "learning_rate": 3.208745943520911e-05, + "loss": 0.9738, + "step": 20817 + }, + { + "epoch": 0.7455369133525525, + "grad_norm": 1.466422438621521, + "learning_rate": 3.207894595833603e-05, + "loss": 1.3917, + "step": 20818 + }, + { + "epoch": 0.7455727254821208, + "grad_norm": 1.6191308498382568, + "learning_rate": 3.207043339525304e-05, + "loss": 1.6631, + "step": 20819 + }, + { + "epoch": 0.7456085376116891, + "grad_norm": 1.7039192914962769, + "learning_rate": 3.206192174607482e-05, + "loss": 1.4387, + "step": 20820 + }, + { + "epoch": 0.7456443497412574, + "grad_norm": 1.9393994808197021, + "learning_rate": 3.205341101091578e-05, + "loss": 1.2903, + "step": 20821 + }, + { + "epoch": 0.7456801618708256, + "grad_norm": 1.7326945066452026, + "learning_rate": 3.2044901189890473e-05, + "loss": 1.5565, + "step": 20822 + }, + { + "epoch": 0.7457159740003939, + "grad_norm": 1.9713574647903442, + "learning_rate": 3.2036392283113304e-05, + "loss": 1.3975, + "step": 20823 + }, + { + "epoch": 0.7457517861299622, + "grad_norm": 1.6577719449996948, + "learning_rate": 3.202788429069887e-05, + "loss": 1.582, + "step": 20824 + }, + { + "epoch": 0.7457875982595304, + "grad_norm": 1.6279882192611694, + "learning_rate": 3.201937721276159e-05, + "loss": 1.5023, + "step": 20825 + }, + { + "epoch": 0.7458234103890988, + "grad_norm": 2.0655462741851807, + "learning_rate": 3.201087104941586e-05, + "loss": 1.4759, + "step": 20826 + }, + { + "epoch": 0.7458592225186671, + "grad_norm": 1.3762956857681274, + "learning_rate": 3.2002365800776154e-05, + "loss": 1.3902, + "step": 20827 + }, + { + "epoch": 0.7458950346482354, + "grad_norm": 1.3251439332962036, + "learning_rate": 3.199386146695691e-05, + "loss": 1.4928, + "step": 20828 + }, + { + "epoch": 0.7459308467778036, + "grad_norm": 1.480262041091919, + "learning_rate": 3.1985358048072574e-05, + "loss": 1.4624, + "step": 20829 + }, + { + "epoch": 0.7459666589073719, + "grad_norm": 1.2799279689788818, + "learning_rate": 3.197685554423745e-05, + "loss": 1.3353, + "step": 20830 + }, + { + "epoch": 0.7460024710369402, + "grad_norm": 1.4563822746276855, + "learning_rate": 3.1968353955566045e-05, + "loss": 1.0802, + "step": 20831 + }, + { + "epoch": 0.7460382831665084, + "grad_norm": 1.2952502965927124, + "learning_rate": 3.195985328217266e-05, + "loss": 1.1515, + "step": 20832 + }, + { + "epoch": 0.7460740952960768, + "grad_norm": 1.3360506296157837, + "learning_rate": 3.1951353524171715e-05, + "loss": 1.5667, + "step": 20833 + }, + { + "epoch": 0.7461099074256451, + "grad_norm": 1.5215046405792236, + "learning_rate": 3.194285468167749e-05, + "loss": 1.424, + "step": 20834 + }, + { + "epoch": 0.7461457195552134, + "grad_norm": 1.5855753421783447, + "learning_rate": 3.1934356754804385e-05, + "loss": 1.4095, + "step": 20835 + }, + { + "epoch": 0.7461815316847816, + "grad_norm": 1.7662148475646973, + "learning_rate": 3.192585974366673e-05, + "loss": 1.4234, + "step": 20836 + }, + { + "epoch": 0.7462173438143499, + "grad_norm": 1.916852593421936, + "learning_rate": 3.19173636483788e-05, + "loss": 1.4165, + "step": 20837 + }, + { + "epoch": 0.7462531559439182, + "grad_norm": 1.7372554540634155, + "learning_rate": 3.190886846905491e-05, + "loss": 1.3308, + "step": 20838 + }, + { + "epoch": 0.7462889680734864, + "grad_norm": 1.336731195449829, + "learning_rate": 3.190037420580937e-05, + "loss": 1.2971, + "step": 20839 + }, + { + "epoch": 0.7463247802030548, + "grad_norm": 1.8577148914337158, + "learning_rate": 3.1891880858756484e-05, + "loss": 1.6954, + "step": 20840 + }, + { + "epoch": 0.7463605923326231, + "grad_norm": 2.185137987136841, + "learning_rate": 3.1883388428010465e-05, + "loss": 1.6788, + "step": 20841 + }, + { + "epoch": 0.7463964044621914, + "grad_norm": 1.7305773496627808, + "learning_rate": 3.187489691368558e-05, + "loss": 1.7874, + "step": 20842 + }, + { + "epoch": 0.7464322165917596, + "grad_norm": 1.6137861013412476, + "learning_rate": 3.186640631589611e-05, + "loss": 1.2475, + "step": 20843 + }, + { + "epoch": 0.7464680287213279, + "grad_norm": 1.5062600374221802, + "learning_rate": 3.1857916634756234e-05, + "loss": 1.2966, + "step": 20844 + }, + { + "epoch": 0.7465038408508962, + "grad_norm": 1.881579875946045, + "learning_rate": 3.184942787038019e-05, + "loss": 1.6763, + "step": 20845 + }, + { + "epoch": 0.7465396529804644, + "grad_norm": 1.7157776355743408, + "learning_rate": 3.184094002288219e-05, + "loss": 1.4101, + "step": 20846 + }, + { + "epoch": 0.7465754651100328, + "grad_norm": 1.5331974029541016, + "learning_rate": 3.1832453092376446e-05, + "loss": 1.4136, + "step": 20847 + }, + { + "epoch": 0.7466112772396011, + "grad_norm": 2.0889968872070312, + "learning_rate": 3.182396707897709e-05, + "loss": 1.2539, + "step": 20848 + }, + { + "epoch": 0.7466470893691693, + "grad_norm": 1.3212164640426636, + "learning_rate": 3.1815481982798324e-05, + "loss": 1.3957, + "step": 20849 + }, + { + "epoch": 0.7466829014987376, + "grad_norm": 1.5992352962493896, + "learning_rate": 3.1806997803954316e-05, + "loss": 1.2118, + "step": 20850 + }, + { + "epoch": 0.7467187136283059, + "grad_norm": 1.8797727823257446, + "learning_rate": 3.1798514542559164e-05, + "loss": 1.2311, + "step": 20851 + }, + { + "epoch": 0.7467545257578742, + "grad_norm": 1.4515434503555298, + "learning_rate": 3.1790032198727014e-05, + "loss": 1.4252, + "step": 20852 + }, + { + "epoch": 0.7467903378874424, + "grad_norm": 1.5532227754592896, + "learning_rate": 3.178155077257201e-05, + "loss": 1.2506, + "step": 20853 + }, + { + "epoch": 0.7468261500170108, + "grad_norm": 1.7346625328063965, + "learning_rate": 3.177307026420827e-05, + "loss": 1.4919, + "step": 20854 + }, + { + "epoch": 0.7468619621465791, + "grad_norm": 1.302531361579895, + "learning_rate": 3.176459067374984e-05, + "loss": 1.3361, + "step": 20855 + }, + { + "epoch": 0.7468977742761473, + "grad_norm": 2.094909191131592, + "learning_rate": 3.175611200131081e-05, + "loss": 1.3555, + "step": 20856 + }, + { + "epoch": 0.7469335864057156, + "grad_norm": 1.6115424633026123, + "learning_rate": 3.174763424700528e-05, + "loss": 1.4763, + "step": 20857 + }, + { + "epoch": 0.7469693985352839, + "grad_norm": 2.035444974899292, + "learning_rate": 3.1739157410947316e-05, + "loss": 1.6644, + "step": 20858 + }, + { + "epoch": 0.7470052106648521, + "grad_norm": 1.8379755020141602, + "learning_rate": 3.173068149325091e-05, + "loss": 1.6804, + "step": 20859 + }, + { + "epoch": 0.7470410227944204, + "grad_norm": 1.5692671537399292, + "learning_rate": 3.172220649403011e-05, + "loss": 1.5543, + "step": 20860 + }, + { + "epoch": 0.7470768349239888, + "grad_norm": 1.5705647468566895, + "learning_rate": 3.1713732413399e-05, + "loss": 1.475, + "step": 20861 + }, + { + "epoch": 0.7471126470535571, + "grad_norm": 1.777363896369934, + "learning_rate": 3.1705259251471496e-05, + "loss": 1.4334, + "step": 20862 + }, + { + "epoch": 0.7471484591831253, + "grad_norm": 2.5444350242614746, + "learning_rate": 3.169678700836164e-05, + "loss": 1.3177, + "step": 20863 + }, + { + "epoch": 0.7471842713126936, + "grad_norm": 2.386817216873169, + "learning_rate": 3.168831568418341e-05, + "loss": 1.6777, + "step": 20864 + }, + { + "epoch": 0.7472200834422619, + "grad_norm": 1.4960191249847412, + "learning_rate": 3.16798452790508e-05, + "loss": 1.3594, + "step": 20865 + }, + { + "epoch": 0.7472558955718301, + "grad_norm": 1.516856074333191, + "learning_rate": 3.167137579307773e-05, + "loss": 1.5066, + "step": 20866 + }, + { + "epoch": 0.7472917077013984, + "grad_norm": 1.7102595567703247, + "learning_rate": 3.1662907226378145e-05, + "loss": 1.5464, + "step": 20867 + }, + { + "epoch": 0.7473275198309668, + "grad_norm": 1.462852120399475, + "learning_rate": 3.165443957906603e-05, + "loss": 1.2122, + "step": 20868 + }, + { + "epoch": 0.7473633319605351, + "grad_norm": 1.76582670211792, + "learning_rate": 3.164597285125525e-05, + "loss": 1.5579, + "step": 20869 + }, + { + "epoch": 0.7473991440901033, + "grad_norm": 2.204495906829834, + "learning_rate": 3.163750704305972e-05, + "loss": 1.5112, + "step": 20870 + }, + { + "epoch": 0.7474349562196716, + "grad_norm": 2.0119872093200684, + "learning_rate": 3.162904215459336e-05, + "loss": 1.4266, + "step": 20871 + }, + { + "epoch": 0.7474707683492399, + "grad_norm": 1.4584416151046753, + "learning_rate": 3.1620578185970075e-05, + "loss": 1.4588, + "step": 20872 + }, + { + "epoch": 0.7475065804788081, + "grad_norm": 1.7966676950454712, + "learning_rate": 3.161211513730368e-05, + "loss": 1.4396, + "step": 20873 + }, + { + "epoch": 0.7475423926083764, + "grad_norm": 1.828666090965271, + "learning_rate": 3.160365300870804e-05, + "loss": 1.3623, + "step": 20874 + }, + { + "epoch": 0.7475782047379448, + "grad_norm": 1.817426085472107, + "learning_rate": 3.159519180029705e-05, + "loss": 1.5559, + "step": 20875 + }, + { + "epoch": 0.747614016867513, + "grad_norm": 1.6124011278152466, + "learning_rate": 3.1586731512184545e-05, + "loss": 1.2511, + "step": 20876 + }, + { + "epoch": 0.7476498289970813, + "grad_norm": 1.5697824954986572, + "learning_rate": 3.157827214448428e-05, + "loss": 1.4974, + "step": 20877 + }, + { + "epoch": 0.7476856411266496, + "grad_norm": 1.6202102899551392, + "learning_rate": 3.1569813697310115e-05, + "loss": 1.536, + "step": 20878 + }, + { + "epoch": 0.7477214532562179, + "grad_norm": 1.5296554565429688, + "learning_rate": 3.156135617077587e-05, + "loss": 1.4601, + "step": 20879 + }, + { + "epoch": 0.7477572653857861, + "grad_norm": 1.323926568031311, + "learning_rate": 3.155289956499525e-05, + "loss": 1.3919, + "step": 20880 + }, + { + "epoch": 0.7477930775153544, + "grad_norm": 1.4089916944503784, + "learning_rate": 3.15444438800821e-05, + "loss": 1.5501, + "step": 20881 + }, + { + "epoch": 0.7478288896449228, + "grad_norm": 1.9115785360336304, + "learning_rate": 3.1535989116150146e-05, + "loss": 1.6466, + "step": 20882 + }, + { + "epoch": 0.747864701774491, + "grad_norm": 1.4998923540115356, + "learning_rate": 3.1527535273313166e-05, + "loss": 1.3259, + "step": 20883 + }, + { + "epoch": 0.7479005139040593, + "grad_norm": 1.8716665506362915, + "learning_rate": 3.151908235168486e-05, + "loss": 1.6371, + "step": 20884 + }, + { + "epoch": 0.7479363260336276, + "grad_norm": 1.64845871925354, + "learning_rate": 3.151063035137896e-05, + "loss": 1.4893, + "step": 20885 + }, + { + "epoch": 0.7479721381631959, + "grad_norm": 1.3553320169448853, + "learning_rate": 3.1502179272509216e-05, + "loss": 1.3028, + "step": 20886 + }, + { + "epoch": 0.7480079502927641, + "grad_norm": 1.9826897382736206, + "learning_rate": 3.149372911518926e-05, + "loss": 1.5283, + "step": 20887 + }, + { + "epoch": 0.7480437624223324, + "grad_norm": 1.5831016302108765, + "learning_rate": 3.1485279879532826e-05, + "loss": 1.4323, + "step": 20888 + }, + { + "epoch": 0.7480795745519008, + "grad_norm": 1.9086743593215942, + "learning_rate": 3.147683156565355e-05, + "loss": 1.4901, + "step": 20889 + }, + { + "epoch": 0.748115386681469, + "grad_norm": 1.5491160154342651, + "learning_rate": 3.146838417366517e-05, + "loss": 1.7092, + "step": 20890 + }, + { + "epoch": 0.7481511988110373, + "grad_norm": 1.6630737781524658, + "learning_rate": 3.145993770368124e-05, + "loss": 1.4201, + "step": 20891 + }, + { + "epoch": 0.7481870109406056, + "grad_norm": 1.6699607372283936, + "learning_rate": 3.1451492155815444e-05, + "loss": 1.2344, + "step": 20892 + }, + { + "epoch": 0.7482228230701738, + "grad_norm": 1.4572018384933472, + "learning_rate": 3.1443047530181394e-05, + "loss": 1.5246, + "step": 20893 + }, + { + "epoch": 0.7482586351997421, + "grad_norm": 1.2610589265823364, + "learning_rate": 3.143460382689274e-05, + "loss": 1.3851, + "step": 20894 + }, + { + "epoch": 0.7482944473293104, + "grad_norm": 1.5143929719924927, + "learning_rate": 3.142616104606304e-05, + "loss": 1.1789, + "step": 20895 + }, + { + "epoch": 0.7483302594588788, + "grad_norm": 1.5444740056991577, + "learning_rate": 3.141771918780584e-05, + "loss": 1.2268, + "step": 20896 + }, + { + "epoch": 0.748366071588447, + "grad_norm": 1.3086482286453247, + "learning_rate": 3.140927825223482e-05, + "loss": 1.2681, + "step": 20897 + }, + { + "epoch": 0.7484018837180153, + "grad_norm": 1.645628809928894, + "learning_rate": 3.140083823946346e-05, + "loss": 1.5036, + "step": 20898 + }, + { + "epoch": 0.7484376958475836, + "grad_norm": 1.3148621320724487, + "learning_rate": 3.139239914960532e-05, + "loss": 1.3868, + "step": 20899 + }, + { + "epoch": 0.7484735079771518, + "grad_norm": 1.466416597366333, + "learning_rate": 3.138396098277396e-05, + "loss": 1.4579, + "step": 20900 + }, + { + "epoch": 0.7485093201067201, + "grad_norm": 2.0066518783569336, + "learning_rate": 3.1375523739082936e-05, + "loss": 1.3685, + "step": 20901 + }, + { + "epoch": 0.7485451322362884, + "grad_norm": 1.307276964187622, + "learning_rate": 3.136708741864568e-05, + "loss": 1.0352, + "step": 20902 + }, + { + "epoch": 0.7485809443658568, + "grad_norm": 1.4867724180221558, + "learning_rate": 3.135865202157574e-05, + "loss": 1.5873, + "step": 20903 + }, + { + "epoch": 0.748616756495425, + "grad_norm": 1.7414683103561401, + "learning_rate": 3.135021754798663e-05, + "loss": 1.5434, + "step": 20904 + }, + { + "epoch": 0.7486525686249933, + "grad_norm": 1.3864867687225342, + "learning_rate": 3.134178399799175e-05, + "loss": 1.3304, + "step": 20905 + }, + { + "epoch": 0.7486883807545616, + "grad_norm": 1.6754690408706665, + "learning_rate": 3.1333351371704634e-05, + "loss": 1.5759, + "step": 20906 + }, + { + "epoch": 0.7487241928841298, + "grad_norm": 2.0780746936798096, + "learning_rate": 3.132491966923864e-05, + "loss": 1.8282, + "step": 20907 + }, + { + "epoch": 0.7487600050136981, + "grad_norm": 1.4286201000213623, + "learning_rate": 3.131648889070734e-05, + "loss": 1.4423, + "step": 20908 + }, + { + "epoch": 0.7487958171432664, + "grad_norm": 1.1952495574951172, + "learning_rate": 3.130805903622405e-05, + "loss": 1.5991, + "step": 20909 + }, + { + "epoch": 0.7488316292728348, + "grad_norm": 1.5690759420394897, + "learning_rate": 3.129963010590224e-05, + "loss": 1.4049, + "step": 20910 + }, + { + "epoch": 0.748867441402403, + "grad_norm": 1.718424916267395, + "learning_rate": 3.1291202099855245e-05, + "loss": 1.5049, + "step": 20911 + }, + { + "epoch": 0.7489032535319713, + "grad_norm": 1.4387127161026, + "learning_rate": 3.1282775018196554e-05, + "loss": 1.6321, + "step": 20912 + }, + { + "epoch": 0.7489390656615396, + "grad_norm": 1.6412798166275024, + "learning_rate": 3.127434886103948e-05, + "loss": 1.3564, + "step": 20913 + }, + { + "epoch": 0.7489748777911078, + "grad_norm": 1.4113123416900635, + "learning_rate": 3.1265923628497327e-05, + "loss": 1.3528, + "step": 20914 + }, + { + "epoch": 0.7490106899206761, + "grad_norm": 1.7293850183486938, + "learning_rate": 3.125749932068359e-05, + "loss": 1.3285, + "step": 20915 + }, + { + "epoch": 0.7490465020502444, + "grad_norm": 1.4474704265594482, + "learning_rate": 3.124907593771148e-05, + "loss": 1.4824, + "step": 20916 + }, + { + "epoch": 0.7490823141798127, + "grad_norm": 2.540874719619751, + "learning_rate": 3.1240653479694415e-05, + "loss": 1.2577, + "step": 20917 + }, + { + "epoch": 0.749118126309381, + "grad_norm": 1.6497268676757812, + "learning_rate": 3.123223194674559e-05, + "loss": 1.3947, + "step": 20918 + }, + { + "epoch": 0.7491539384389493, + "grad_norm": 1.7501670122146606, + "learning_rate": 3.122381133897846e-05, + "loss": 1.7379, + "step": 20919 + }, + { + "epoch": 0.7491897505685176, + "grad_norm": 1.541893482208252, + "learning_rate": 3.121539165650619e-05, + "loss": 1.1808, + "step": 20920 + }, + { + "epoch": 0.7492255626980858, + "grad_norm": 1.4933574199676514, + "learning_rate": 3.120697289944213e-05, + "loss": 1.4671, + "step": 20921 + }, + { + "epoch": 0.7492613748276541, + "grad_norm": 1.7959959506988525, + "learning_rate": 3.119855506789948e-05, + "loss": 1.4678, + "step": 20922 + }, + { + "epoch": 0.7492971869572224, + "grad_norm": 1.9064847230911255, + "learning_rate": 3.1190138161991536e-05, + "loss": 1.616, + "step": 20923 + }, + { + "epoch": 0.7493329990867907, + "grad_norm": 1.8895035982131958, + "learning_rate": 3.118172218183154e-05, + "loss": 1.3401, + "step": 20924 + }, + { + "epoch": 0.749368811216359, + "grad_norm": 3.042356252670288, + "learning_rate": 3.117330712753265e-05, + "loss": 1.5714, + "step": 20925 + }, + { + "epoch": 0.7494046233459273, + "grad_norm": 1.5778942108154297, + "learning_rate": 3.11648929992082e-05, + "loss": 1.4569, + "step": 20926 + }, + { + "epoch": 0.7494404354754955, + "grad_norm": 1.8818434476852417, + "learning_rate": 3.115647979697128e-05, + "loss": 1.3493, + "step": 20927 + }, + { + "epoch": 0.7494762476050638, + "grad_norm": 1.2638598680496216, + "learning_rate": 3.114806752093517e-05, + "loss": 1.4576, + "step": 20928 + }, + { + "epoch": 0.7495120597346321, + "grad_norm": 1.5679749250411987, + "learning_rate": 3.113965617121291e-05, + "loss": 1.2887, + "step": 20929 + }, + { + "epoch": 0.7495478718642004, + "grad_norm": 1.635933518409729, + "learning_rate": 3.1131245747917835e-05, + "loss": 1.6161, + "step": 20930 + }, + { + "epoch": 0.7495836839937687, + "grad_norm": 1.6987125873565674, + "learning_rate": 3.1122836251163014e-05, + "loss": 1.4761, + "step": 20931 + }, + { + "epoch": 0.749619496123337, + "grad_norm": 1.466423511505127, + "learning_rate": 3.111442768106155e-05, + "loss": 1.5627, + "step": 20932 + }, + { + "epoch": 0.7496553082529053, + "grad_norm": 1.3633636236190796, + "learning_rate": 3.1106020037726615e-05, + "loss": 1.2434, + "step": 20933 + }, + { + "epoch": 0.7496911203824735, + "grad_norm": 1.5454039573669434, + "learning_rate": 3.1097613321271304e-05, + "loss": 1.3154, + "step": 20934 + }, + { + "epoch": 0.7497269325120418, + "grad_norm": 1.5664490461349487, + "learning_rate": 3.108920753180875e-05, + "loss": 1.4282, + "step": 20935 + }, + { + "epoch": 0.7497627446416101, + "grad_norm": 2.946145534515381, + "learning_rate": 3.1080802669452e-05, + "loss": 1.5775, + "step": 20936 + }, + { + "epoch": 0.7497985567711783, + "grad_norm": 2.2433459758758545, + "learning_rate": 3.107239873431416e-05, + "loss": 1.4163, + "step": 20937 + }, + { + "epoch": 0.7498343689007467, + "grad_norm": 1.6156102418899536, + "learning_rate": 3.1063995726508296e-05, + "loss": 1.1802, + "step": 20938 + }, + { + "epoch": 0.749870181030315, + "grad_norm": 2.013106346130371, + "learning_rate": 3.105559364614743e-05, + "loss": 1.434, + "step": 20939 + }, + { + "epoch": 0.7499059931598833, + "grad_norm": 2.1555824279785156, + "learning_rate": 3.1047192493344624e-05, + "loss": 1.3182, + "step": 20940 + }, + { + "epoch": 0.7499418052894515, + "grad_norm": 1.372545599937439, + "learning_rate": 3.103879226821289e-05, + "loss": 1.2394, + "step": 20941 + }, + { + "epoch": 0.7499776174190198, + "grad_norm": 2.159738302230835, + "learning_rate": 3.1030392970865286e-05, + "loss": 1.2531, + "step": 20942 + }, + { + "epoch": 0.7500134295485881, + "grad_norm": 1.378603219985962, + "learning_rate": 3.102199460141475e-05, + "loss": 1.3648, + "step": 20943 + }, + { + "epoch": 0.7500492416781563, + "grad_norm": 1.817529320716858, + "learning_rate": 3.1013597159974304e-05, + "loss": 1.4851, + "step": 20944 + }, + { + "epoch": 0.7500850538077247, + "grad_norm": 1.4849520921707153, + "learning_rate": 3.1005200646656915e-05, + "loss": 1.3406, + "step": 20945 + }, + { + "epoch": 0.750120865937293, + "grad_norm": 2.093026638031006, + "learning_rate": 3.09968050615756e-05, + "loss": 1.2892, + "step": 20946 + }, + { + "epoch": 0.7501566780668613, + "grad_norm": 1.8699207305908203, + "learning_rate": 3.0988410404843216e-05, + "loss": 1.7263, + "step": 20947 + }, + { + "epoch": 0.7501924901964295, + "grad_norm": 2.0237114429473877, + "learning_rate": 3.0980016676572766e-05, + "loss": 1.4728, + "step": 20948 + }, + { + "epoch": 0.7502283023259978, + "grad_norm": 1.5000897645950317, + "learning_rate": 3.097162387687719e-05, + "loss": 1.3496, + "step": 20949 + }, + { + "epoch": 0.7502641144555661, + "grad_norm": 2.2044296264648438, + "learning_rate": 3.096323200586934e-05, + "loss": 1.1754, + "step": 20950 + }, + { + "epoch": 0.7502999265851343, + "grad_norm": 1.645936369895935, + "learning_rate": 3.0954841063662145e-05, + "loss": 1.5664, + "step": 20951 + }, + { + "epoch": 0.7503357387147027, + "grad_norm": 1.730129599571228, + "learning_rate": 3.094645105036851e-05, + "loss": 1.2231, + "step": 20952 + }, + { + "epoch": 0.750371550844271, + "grad_norm": 1.6695406436920166, + "learning_rate": 3.093806196610134e-05, + "loss": 1.5443, + "step": 20953 + }, + { + "epoch": 0.7504073629738393, + "grad_norm": 1.7872570753097534, + "learning_rate": 3.092967381097342e-05, + "loss": 1.2933, + "step": 20954 + }, + { + "epoch": 0.7504431751034075, + "grad_norm": 1.5018796920776367, + "learning_rate": 3.092128658509765e-05, + "loss": 1.4061, + "step": 20955 + }, + { + "epoch": 0.7504789872329758, + "grad_norm": 2.464582681655884, + "learning_rate": 3.09129002885869e-05, + "loss": 1.5651, + "step": 20956 + }, + { + "epoch": 0.7505147993625441, + "grad_norm": 1.7600558996200562, + "learning_rate": 3.090451492155392e-05, + "loss": 1.6592, + "step": 20957 + }, + { + "epoch": 0.7505506114921123, + "grad_norm": 1.316369652748108, + "learning_rate": 3.089613048411158e-05, + "loss": 1.3414, + "step": 20958 + }, + { + "epoch": 0.7505864236216807, + "grad_norm": 1.9160432815551758, + "learning_rate": 3.088774697637265e-05, + "loss": 1.3255, + "step": 20959 + }, + { + "epoch": 0.750622235751249, + "grad_norm": 2.339635133743286, + "learning_rate": 3.087936439844997e-05, + "loss": 1.357, + "step": 20960 + }, + { + "epoch": 0.7506580478808172, + "grad_norm": 1.4159473180770874, + "learning_rate": 3.087098275045626e-05, + "loss": 1.3368, + "step": 20961 + }, + { + "epoch": 0.7506938600103855, + "grad_norm": 2.2639219760894775, + "learning_rate": 3.08626020325043e-05, + "loss": 1.565, + "step": 20962 + }, + { + "epoch": 0.7507296721399538, + "grad_norm": 1.6115988492965698, + "learning_rate": 3.0854222244706857e-05, + "loss": 1.4387, + "step": 20963 + }, + { + "epoch": 0.750765484269522, + "grad_norm": 1.6219232082366943, + "learning_rate": 3.0845843387176686e-05, + "loss": 1.5412, + "step": 20964 + }, + { + "epoch": 0.7508012963990903, + "grad_norm": 1.478438377380371, + "learning_rate": 3.083746546002646e-05, + "loss": 1.3816, + "step": 20965 + }, + { + "epoch": 0.7508371085286587, + "grad_norm": 1.358998417854309, + "learning_rate": 3.082908846336891e-05, + "loss": 1.3876, + "step": 20966 + }, + { + "epoch": 0.750872920658227, + "grad_norm": 1.556822419166565, + "learning_rate": 3.082071239731681e-05, + "loss": 1.4421, + "step": 20967 + }, + { + "epoch": 0.7509087327877952, + "grad_norm": 1.4174234867095947, + "learning_rate": 3.0812337261982735e-05, + "loss": 1.4759, + "step": 20968 + }, + { + "epoch": 0.7509445449173635, + "grad_norm": 1.8507611751556396, + "learning_rate": 3.080396305747942e-05, + "loss": 1.6876, + "step": 20969 + }, + { + "epoch": 0.7509803570469318, + "grad_norm": 1.3811239004135132, + "learning_rate": 3.0795589783919543e-05, + "loss": 1.3176, + "step": 20970 + }, + { + "epoch": 0.7510161691765, + "grad_norm": 1.923411250114441, + "learning_rate": 3.078721744141575e-05, + "loss": 1.3883, + "step": 20971 + }, + { + "epoch": 0.7510519813060683, + "grad_norm": 2.107851266860962, + "learning_rate": 3.0778846030080644e-05, + "loss": 1.5351, + "step": 20972 + }, + { + "epoch": 0.7510877934356367, + "grad_norm": 1.7084307670593262, + "learning_rate": 3.077047555002688e-05, + "loss": 1.539, + "step": 20973 + }, + { + "epoch": 0.751123605565205, + "grad_norm": 2.1994099617004395, + "learning_rate": 3.0762106001367095e-05, + "loss": 1.2937, + "step": 20974 + }, + { + "epoch": 0.7511594176947732, + "grad_norm": 1.8254406452178955, + "learning_rate": 3.075373738421383e-05, + "loss": 1.2871, + "step": 20975 + }, + { + "epoch": 0.7511952298243415, + "grad_norm": 1.9883519411087036, + "learning_rate": 3.0745369698679715e-05, + "loss": 1.3643, + "step": 20976 + }, + { + "epoch": 0.7512310419539098, + "grad_norm": 1.6190694570541382, + "learning_rate": 3.0737002944877314e-05, + "loss": 1.2145, + "step": 20977 + }, + { + "epoch": 0.751266854083478, + "grad_norm": 1.9418599605560303, + "learning_rate": 3.072863712291922e-05, + "loss": 1.3982, + "step": 20978 + }, + { + "epoch": 0.7513026662130463, + "grad_norm": 1.3245214223861694, + "learning_rate": 3.0720272232917934e-05, + "loss": 1.4906, + "step": 20979 + }, + { + "epoch": 0.7513384783426147, + "grad_norm": 1.2552196979522705, + "learning_rate": 3.071190827498602e-05, + "loss": 1.4326, + "step": 20980 + }, + { + "epoch": 0.751374290472183, + "grad_norm": 1.699867844581604, + "learning_rate": 3.070354524923601e-05, + "loss": 1.4727, + "step": 20981 + }, + { + "epoch": 0.7514101026017512, + "grad_norm": 1.6668874025344849, + "learning_rate": 3.0695183155780435e-05, + "loss": 1.3547, + "step": 20982 + }, + { + "epoch": 0.7514459147313195, + "grad_norm": 1.845920443534851, + "learning_rate": 3.068682199473175e-05, + "loss": 1.4138, + "step": 20983 + }, + { + "epoch": 0.7514817268608878, + "grad_norm": 1.6310741901397705, + "learning_rate": 3.067846176620247e-05, + "loss": 1.2511, + "step": 20984 + }, + { + "epoch": 0.751517538990456, + "grad_norm": 1.9002145528793335, + "learning_rate": 3.06701024703051e-05, + "loss": 1.4841, + "step": 20985 + }, + { + "epoch": 0.7515533511200243, + "grad_norm": 2.533083438873291, + "learning_rate": 3.0661744107152025e-05, + "loss": 1.6083, + "step": 20986 + }, + { + "epoch": 0.7515891632495927, + "grad_norm": 1.722609519958496, + "learning_rate": 3.0653386676855756e-05, + "loss": 1.2962, + "step": 20987 + }, + { + "epoch": 0.751624975379161, + "grad_norm": 2.650226593017578, + "learning_rate": 3.064503017952871e-05, + "loss": 1.357, + "step": 20988 + }, + { + "epoch": 0.7516607875087292, + "grad_norm": 2.232440948486328, + "learning_rate": 3.0636674615283364e-05, + "loss": 1.5926, + "step": 20989 + }, + { + "epoch": 0.7516965996382975, + "grad_norm": 1.7429866790771484, + "learning_rate": 3.0628319984232056e-05, + "loss": 1.547, + "step": 20990 + }, + { + "epoch": 0.7517324117678658, + "grad_norm": 1.6939197778701782, + "learning_rate": 3.061996628648721e-05, + "loss": 1.1195, + "step": 20991 + }, + { + "epoch": 0.751768223897434, + "grad_norm": 1.5635935068130493, + "learning_rate": 3.0611613522161266e-05, + "loss": 1.1881, + "step": 20992 + }, + { + "epoch": 0.7518040360270023, + "grad_norm": 1.8957972526550293, + "learning_rate": 3.0603261691366525e-05, + "loss": 1.3517, + "step": 20993 + }, + { + "epoch": 0.7518398481565707, + "grad_norm": 2.051981210708618, + "learning_rate": 3.05949107942154e-05, + "loss": 1.413, + "step": 20994 + }, + { + "epoch": 0.751875660286139, + "grad_norm": 1.7447413206100464, + "learning_rate": 3.0586560830820174e-05, + "loss": 1.5783, + "step": 20995 + }, + { + "epoch": 0.7519114724157072, + "grad_norm": 1.535361886024475, + "learning_rate": 3.05782118012933e-05, + "loss": 1.3961, + "step": 20996 + }, + { + "epoch": 0.7519472845452755, + "grad_norm": 1.363828420639038, + "learning_rate": 3.0569863705747004e-05, + "loss": 1.5911, + "step": 20997 + }, + { + "epoch": 0.7519830966748438, + "grad_norm": 3.876757860183716, + "learning_rate": 3.0561516544293634e-05, + "loss": 1.889, + "step": 20998 + }, + { + "epoch": 0.752018908804412, + "grad_norm": 1.9156787395477295, + "learning_rate": 3.0553170317045485e-05, + "loss": 1.1729, + "step": 20999 + }, + { + "epoch": 0.7520547209339803, + "grad_norm": 2.1314427852630615, + "learning_rate": 3.054482502411489e-05, + "loss": 1.4521, + "step": 21000 + }, + { + "epoch": 0.7520905330635487, + "grad_norm": 1.6527806520462036, + "learning_rate": 3.0536480665614075e-05, + "loss": 1.6505, + "step": 21001 + }, + { + "epoch": 0.7521263451931169, + "grad_norm": 1.80655837059021, + "learning_rate": 3.052813724165525e-05, + "loss": 1.2674, + "step": 21002 + }, + { + "epoch": 0.7521621573226852, + "grad_norm": 2.413799285888672, + "learning_rate": 3.051979475235078e-05, + "loss": 1.1142, + "step": 21003 + }, + { + "epoch": 0.7521979694522535, + "grad_norm": 1.5094581842422485, + "learning_rate": 3.0511453197812834e-05, + "loss": 1.3326, + "step": 21004 + }, + { + "epoch": 0.7522337815818217, + "grad_norm": 1.4582942724227905, + "learning_rate": 3.050311257815368e-05, + "loss": 1.3724, + "step": 21005 + }, + { + "epoch": 0.75226959371139, + "grad_norm": 1.700408935546875, + "learning_rate": 3.0494772893485435e-05, + "loss": 1.1878, + "step": 21006 + }, + { + "epoch": 0.7523054058409583, + "grad_norm": 1.694864273071289, + "learning_rate": 3.0486434143920428e-05, + "loss": 1.2196, + "step": 21007 + }, + { + "epoch": 0.7523412179705267, + "grad_norm": 1.9889030456542969, + "learning_rate": 3.047809632957075e-05, + "loss": 1.3167, + "step": 21008 + }, + { + "epoch": 0.7523770301000949, + "grad_norm": 1.6860452890396118, + "learning_rate": 3.0469759450548607e-05, + "loss": 1.2681, + "step": 21009 + }, + { + "epoch": 0.7524128422296632, + "grad_norm": 1.7383288145065308, + "learning_rate": 3.0461423506966203e-05, + "loss": 1.2188, + "step": 21010 + }, + { + "epoch": 0.7524486543592315, + "grad_norm": 1.203225016593933, + "learning_rate": 3.0453088498935612e-05, + "loss": 1.459, + "step": 21011 + }, + { + "epoch": 0.7524844664887997, + "grad_norm": 1.667195439338684, + "learning_rate": 3.0444754426569032e-05, + "loss": 1.4978, + "step": 21012 + }, + { + "epoch": 0.752520278618368, + "grad_norm": 1.8023768663406372, + "learning_rate": 3.04364212899785e-05, + "loss": 1.3717, + "step": 21013 + }, + { + "epoch": 0.7525560907479363, + "grad_norm": 1.4114989042282104, + "learning_rate": 3.0428089089276257e-05, + "loss": 1.6978, + "step": 21014 + }, + { + "epoch": 0.7525919028775047, + "grad_norm": 1.4679601192474365, + "learning_rate": 3.04197578245743e-05, + "loss": 1.5715, + "step": 21015 + }, + { + "epoch": 0.7526277150070729, + "grad_norm": 1.8693400621414185, + "learning_rate": 3.041142749598479e-05, + "loss": 1.6269, + "step": 21016 + }, + { + "epoch": 0.7526635271366412, + "grad_norm": 2.4425885677337646, + "learning_rate": 3.0403098103619687e-05, + "loss": 1.4659, + "step": 21017 + }, + { + "epoch": 0.7526993392662095, + "grad_norm": 1.5194597244262695, + "learning_rate": 3.0394769647591194e-05, + "loss": 1.5173, + "step": 21018 + }, + { + "epoch": 0.7527351513957777, + "grad_norm": 1.4792680740356445, + "learning_rate": 3.0386442128011282e-05, + "loss": 1.0879, + "step": 21019 + }, + { + "epoch": 0.752770963525346, + "grad_norm": 1.689097285270691, + "learning_rate": 3.037811554499197e-05, + "loss": 1.0078, + "step": 21020 + }, + { + "epoch": 0.7528067756549143, + "grad_norm": 1.4809821844100952, + "learning_rate": 3.0369789898645306e-05, + "loss": 1.4036, + "step": 21021 + }, + { + "epoch": 0.7528425877844827, + "grad_norm": 1.493944764137268, + "learning_rate": 3.0361465189083305e-05, + "loss": 1.4354, + "step": 21022 + }, + { + "epoch": 0.7528783999140509, + "grad_norm": 1.486365795135498, + "learning_rate": 3.0353141416417997e-05, + "loss": 1.2398, + "step": 21023 + }, + { + "epoch": 0.7529142120436192, + "grad_norm": 1.6679426431655884, + "learning_rate": 3.034481858076127e-05, + "loss": 1.3382, + "step": 21024 + }, + { + "epoch": 0.7529500241731875, + "grad_norm": 1.6131479740142822, + "learning_rate": 3.0336496682225214e-05, + "loss": 1.4169, + "step": 21025 + }, + { + "epoch": 0.7529858363027557, + "grad_norm": 1.4379775524139404, + "learning_rate": 3.0328175720921715e-05, + "loss": 1.2663, + "step": 21026 + }, + { + "epoch": 0.753021648432324, + "grad_norm": 1.8165546655654907, + "learning_rate": 3.0319855696962762e-05, + "loss": 1.577, + "step": 21027 + }, + { + "epoch": 0.7530574605618923, + "grad_norm": 1.724393367767334, + "learning_rate": 3.0311536610460245e-05, + "loss": 1.671, + "step": 21028 + }, + { + "epoch": 0.7530932726914606, + "grad_norm": 1.8695555925369263, + "learning_rate": 3.0303218461526116e-05, + "loss": 1.7049, + "step": 21029 + }, + { + "epoch": 0.7531290848210289, + "grad_norm": 2.045958995819092, + "learning_rate": 3.02949012502723e-05, + "loss": 1.5631, + "step": 21030 + }, + { + "epoch": 0.7531648969505972, + "grad_norm": 1.6942603588104248, + "learning_rate": 3.028658497681065e-05, + "loss": 1.4666, + "step": 21031 + }, + { + "epoch": 0.7532007090801655, + "grad_norm": 2.18617582321167, + "learning_rate": 3.0278269641253075e-05, + "loss": 1.3933, + "step": 21032 + }, + { + "epoch": 0.7532365212097337, + "grad_norm": 2.1976447105407715, + "learning_rate": 3.0269955243711457e-05, + "loss": 1.314, + "step": 21033 + }, + { + "epoch": 0.753272333339302, + "grad_norm": 1.413646936416626, + "learning_rate": 3.0261641784297666e-05, + "loss": 1.5067, + "step": 21034 + }, + { + "epoch": 0.7533081454688703, + "grad_norm": 1.595048189163208, + "learning_rate": 3.0253329263123497e-05, + "loss": 1.3638, + "step": 21035 + }, + { + "epoch": 0.7533439575984386, + "grad_norm": 2.5425102710723877, + "learning_rate": 3.0245017680300813e-05, + "loss": 1.1756, + "step": 21036 + }, + { + "epoch": 0.7533797697280069, + "grad_norm": 1.6122390031814575, + "learning_rate": 3.0236707035941482e-05, + "loss": 1.591, + "step": 21037 + }, + { + "epoch": 0.7534155818575752, + "grad_norm": 1.5871548652648926, + "learning_rate": 3.0228397330157233e-05, + "loss": 1.2486, + "step": 21038 + }, + { + "epoch": 0.7534513939871434, + "grad_norm": 1.390411615371704, + "learning_rate": 3.022008856305989e-05, + "loss": 1.6519, + "step": 21039 + }, + { + "epoch": 0.7534872061167117, + "grad_norm": 1.7453995943069458, + "learning_rate": 3.0211780734761254e-05, + "loss": 1.1648, + "step": 21040 + }, + { + "epoch": 0.75352301824628, + "grad_norm": 1.4689477682113647, + "learning_rate": 3.020347384537312e-05, + "loss": 1.5223, + "step": 21041 + }, + { + "epoch": 0.7535588303758483, + "grad_norm": 1.7363903522491455, + "learning_rate": 3.019516789500718e-05, + "loss": 1.3932, + "step": 21042 + }, + { + "epoch": 0.7535946425054166, + "grad_norm": 1.9193975925445557, + "learning_rate": 3.0186862883775214e-05, + "loss": 1.2524, + "step": 21043 + }, + { + "epoch": 0.7536304546349849, + "grad_norm": 1.6323972940444946, + "learning_rate": 3.017855881178899e-05, + "loss": 1.7168, + "step": 21044 + }, + { + "epoch": 0.7536662667645532, + "grad_norm": 1.584980845451355, + "learning_rate": 3.0170255679160163e-05, + "loss": 1.1829, + "step": 21045 + }, + { + "epoch": 0.7537020788941214, + "grad_norm": 1.800627589225769, + "learning_rate": 3.0161953486000473e-05, + "loss": 1.3106, + "step": 21046 + }, + { + "epoch": 0.7537378910236897, + "grad_norm": 1.3021795749664307, + "learning_rate": 3.0153652232421603e-05, + "loss": 1.4727, + "step": 21047 + }, + { + "epoch": 0.753773703153258, + "grad_norm": 1.9696956872940063, + "learning_rate": 3.014535191853529e-05, + "loss": 1.3064, + "step": 21048 + }, + { + "epoch": 0.7538095152828262, + "grad_norm": 1.3597009181976318, + "learning_rate": 3.0137052544453126e-05, + "loss": 1.4837, + "step": 21049 + }, + { + "epoch": 0.7538453274123946, + "grad_norm": 1.9204648733139038, + "learning_rate": 3.0128754110286806e-05, + "loss": 1.6705, + "step": 21050 + }, + { + "epoch": 0.7538811395419629, + "grad_norm": 1.4181511402130127, + "learning_rate": 3.012045661614796e-05, + "loss": 1.4157, + "step": 21051 + }, + { + "epoch": 0.7539169516715312, + "grad_norm": 1.776105523109436, + "learning_rate": 3.0112160062148274e-05, + "loss": 1.5494, + "step": 21052 + }, + { + "epoch": 0.7539527638010994, + "grad_norm": 1.4429407119750977, + "learning_rate": 3.01038644483993e-05, + "loss": 1.3892, + "step": 21053 + }, + { + "epoch": 0.7539885759306677, + "grad_norm": 1.9590340852737427, + "learning_rate": 3.0095569775012665e-05, + "loss": 1.2104, + "step": 21054 + }, + { + "epoch": 0.754024388060236, + "grad_norm": 1.9207379817962646, + "learning_rate": 3.0087276042099997e-05, + "loss": 1.5865, + "step": 21055 + }, + { + "epoch": 0.7540602001898042, + "grad_norm": 1.3791786432266235, + "learning_rate": 3.007898324977282e-05, + "loss": 1.1746, + "step": 21056 + }, + { + "epoch": 0.7540960123193726, + "grad_norm": 1.5347812175750732, + "learning_rate": 3.0070691398142726e-05, + "loss": 1.4336, + "step": 21057 + }, + { + "epoch": 0.7541318244489409, + "grad_norm": 1.8425312042236328, + "learning_rate": 3.0062400487321286e-05, + "loss": 1.704, + "step": 21058 + }, + { + "epoch": 0.7541676365785092, + "grad_norm": 1.758529543876648, + "learning_rate": 3.0054110517420052e-05, + "loss": 1.6625, + "step": 21059 + }, + { + "epoch": 0.7542034487080774, + "grad_norm": 1.4791284799575806, + "learning_rate": 3.004582148855052e-05, + "loss": 1.6378, + "step": 21060 + }, + { + "epoch": 0.7542392608376457, + "grad_norm": 1.824438452720642, + "learning_rate": 3.0037533400824226e-05, + "loss": 1.3131, + "step": 21061 + }, + { + "epoch": 0.754275072967214, + "grad_norm": 1.6703860759735107, + "learning_rate": 3.0029246254352694e-05, + "loss": 1.6793, + "step": 21062 + }, + { + "epoch": 0.7543108850967822, + "grad_norm": 1.5747116804122925, + "learning_rate": 3.002096004924737e-05, + "loss": 1.4264, + "step": 21063 + }, + { + "epoch": 0.7543466972263506, + "grad_norm": 1.7274689674377441, + "learning_rate": 3.0012674785619766e-05, + "loss": 1.4875, + "step": 21064 + }, + { + "epoch": 0.7543825093559189, + "grad_norm": 1.6294996738433838, + "learning_rate": 3.0004390463581345e-05, + "loss": 1.2883, + "step": 21065 + }, + { + "epoch": 0.7544183214854872, + "grad_norm": 1.555970549583435, + "learning_rate": 2.9996107083243598e-05, + "loss": 1.3513, + "step": 21066 + }, + { + "epoch": 0.7544541336150554, + "grad_norm": 1.3730995655059814, + "learning_rate": 2.9987824644717898e-05, + "loss": 1.3771, + "step": 21067 + }, + { + "epoch": 0.7544899457446237, + "grad_norm": 1.953616738319397, + "learning_rate": 2.997954314811571e-05, + "loss": 1.5383, + "step": 21068 + }, + { + "epoch": 0.754525757874192, + "grad_norm": 1.8902431726455688, + "learning_rate": 2.9971262593548443e-05, + "loss": 1.5828, + "step": 21069 + }, + { + "epoch": 0.7545615700037602, + "grad_norm": 1.774874210357666, + "learning_rate": 2.996298298112754e-05, + "loss": 1.4622, + "step": 21070 + }, + { + "epoch": 0.7545973821333286, + "grad_norm": 1.683915615081787, + "learning_rate": 2.9954704310964332e-05, + "loss": 1.5168, + "step": 21071 + }, + { + "epoch": 0.7546331942628969, + "grad_norm": 1.5608534812927246, + "learning_rate": 2.9946426583170217e-05, + "loss": 1.3258, + "step": 21072 + }, + { + "epoch": 0.7546690063924651, + "grad_norm": 1.3177636861801147, + "learning_rate": 2.9938149797856608e-05, + "loss": 1.1626, + "step": 21073 + }, + { + "epoch": 0.7547048185220334, + "grad_norm": 1.6210861206054688, + "learning_rate": 2.992987395513479e-05, + "loss": 1.6502, + "step": 21074 + }, + { + "epoch": 0.7547406306516017, + "grad_norm": 1.682212471961975, + "learning_rate": 2.9921599055116135e-05, + "loss": 1.5752, + "step": 21075 + }, + { + "epoch": 0.75477644278117, + "grad_norm": 1.6862162351608276, + "learning_rate": 2.991332509791196e-05, + "loss": 1.6626, + "step": 21076 + }, + { + "epoch": 0.7548122549107382, + "grad_norm": 2.048794746398926, + "learning_rate": 2.9905052083633632e-05, + "loss": 1.5058, + "step": 21077 + }, + { + "epoch": 0.7548480670403066, + "grad_norm": 1.955229640007019, + "learning_rate": 2.9896780012392377e-05, + "loss": 1.5886, + "step": 21078 + }, + { + "epoch": 0.7548838791698749, + "grad_norm": 1.326578140258789, + "learning_rate": 2.9888508884299516e-05, + "loss": 1.5777, + "step": 21079 + }, + { + "epoch": 0.7549196912994431, + "grad_norm": 1.9599168300628662, + "learning_rate": 2.9880238699466367e-05, + "loss": 1.6122, + "step": 21080 + }, + { + "epoch": 0.7549555034290114, + "grad_norm": 1.659332036972046, + "learning_rate": 2.9871969458004135e-05, + "loss": 1.2969, + "step": 21081 + }, + { + "epoch": 0.7549913155585797, + "grad_norm": 2.504672050476074, + "learning_rate": 2.9863701160024083e-05, + "loss": 1.5504, + "step": 21082 + }, + { + "epoch": 0.755027127688148, + "grad_norm": 1.9664294719696045, + "learning_rate": 2.9855433805637467e-05, + "loss": 1.3975, + "step": 21083 + }, + { + "epoch": 0.7550629398177162, + "grad_norm": 2.3380939960479736, + "learning_rate": 2.9847167394955543e-05, + "loss": 1.267, + "step": 21084 + }, + { + "epoch": 0.7550987519472846, + "grad_norm": 1.6964002847671509, + "learning_rate": 2.9838901928089456e-05, + "loss": 1.1031, + "step": 21085 + }, + { + "epoch": 0.7551345640768529, + "grad_norm": 1.679082989692688, + "learning_rate": 2.983063740515044e-05, + "loss": 1.5012, + "step": 21086 + }, + { + "epoch": 0.7551703762064211, + "grad_norm": 1.3451464176177979, + "learning_rate": 2.9822373826249693e-05, + "loss": 1.3227, + "step": 21087 + }, + { + "epoch": 0.7552061883359894, + "grad_norm": 1.67902672290802, + "learning_rate": 2.9814111191498405e-05, + "loss": 1.4921, + "step": 21088 + }, + { + "epoch": 0.7552420004655577, + "grad_norm": 1.4258464574813843, + "learning_rate": 2.9805849501007733e-05, + "loss": 1.4449, + "step": 21089 + }, + { + "epoch": 0.7552778125951259, + "grad_norm": 1.4557456970214844, + "learning_rate": 2.979758875488874e-05, + "loss": 1.0844, + "step": 21090 + }, + { + "epoch": 0.7553136247246942, + "grad_norm": 1.5208238363265991, + "learning_rate": 2.9789328953252694e-05, + "loss": 1.5754, + "step": 21091 + }, + { + "epoch": 0.7553494368542626, + "grad_norm": 1.7373424768447876, + "learning_rate": 2.9781070096210627e-05, + "loss": 1.6214, + "step": 21092 + }, + { + "epoch": 0.7553852489838309, + "grad_norm": 1.8888261318206787, + "learning_rate": 2.9772812183873733e-05, + "loss": 1.4306, + "step": 21093 + }, + { + "epoch": 0.7554210611133991, + "grad_norm": 1.2451953887939453, + "learning_rate": 2.9764555216352997e-05, + "loss": 1.1014, + "step": 21094 + }, + { + "epoch": 0.7554568732429674, + "grad_norm": 2.3028512001037598, + "learning_rate": 2.975629919375963e-05, + "loss": 1.2671, + "step": 21095 + }, + { + "epoch": 0.7554926853725357, + "grad_norm": 1.363605260848999, + "learning_rate": 2.974804411620462e-05, + "loss": 1.3705, + "step": 21096 + }, + { + "epoch": 0.7555284975021039, + "grad_norm": 1.8250839710235596, + "learning_rate": 2.973978998379906e-05, + "loss": 1.4062, + "step": 21097 + }, + { + "epoch": 0.7555643096316722, + "grad_norm": 1.7646355628967285, + "learning_rate": 2.9731536796654026e-05, + "loss": 1.4773, + "step": 21098 + }, + { + "epoch": 0.7556001217612406, + "grad_norm": 2.1657111644744873, + "learning_rate": 2.9723284554880493e-05, + "loss": 1.1014, + "step": 21099 + }, + { + "epoch": 0.7556359338908089, + "grad_norm": 1.4602758884429932, + "learning_rate": 2.9715033258589543e-05, + "loss": 1.4667, + "step": 21100 + }, + { + "epoch": 0.7556717460203771, + "grad_norm": 1.4769114255905151, + "learning_rate": 2.9706782907892104e-05, + "loss": 1.1822, + "step": 21101 + }, + { + "epoch": 0.7557075581499454, + "grad_norm": 1.8590151071548462, + "learning_rate": 2.9698533502899294e-05, + "loss": 1.5489, + "step": 21102 + }, + { + "epoch": 0.7557433702795137, + "grad_norm": 1.3102487325668335, + "learning_rate": 2.9690285043722e-05, + "loss": 1.4329, + "step": 21103 + }, + { + "epoch": 0.7557791824090819, + "grad_norm": 1.4828435182571411, + "learning_rate": 2.9682037530471252e-05, + "loss": 1.3056, + "step": 21104 + }, + { + "epoch": 0.7558149945386502, + "grad_norm": 1.2443448305130005, + "learning_rate": 2.967379096325793e-05, + "loss": 1.3004, + "step": 21105 + }, + { + "epoch": 0.7558508066682186, + "grad_norm": 1.5931856632232666, + "learning_rate": 2.966554534219309e-05, + "loss": 1.6132, + "step": 21106 + }, + { + "epoch": 0.7558866187977868, + "grad_norm": 1.3592056035995483, + "learning_rate": 2.965730066738762e-05, + "loss": 1.4226, + "step": 21107 + }, + { + "epoch": 0.7559224309273551, + "grad_norm": 1.710934042930603, + "learning_rate": 2.964905693895237e-05, + "loss": 1.6701, + "step": 21108 + }, + { + "epoch": 0.7559582430569234, + "grad_norm": 1.9876086711883545, + "learning_rate": 2.9640814156998374e-05, + "loss": 1.5418, + "step": 21109 + }, + { + "epoch": 0.7559940551864917, + "grad_norm": 1.912034511566162, + "learning_rate": 2.9632572321636443e-05, + "loss": 1.5359, + "step": 21110 + }, + { + "epoch": 0.7560298673160599, + "grad_norm": 1.493776798248291, + "learning_rate": 2.9624331432977515e-05, + "loss": 1.5861, + "step": 21111 + }, + { + "epoch": 0.7560656794456282, + "grad_norm": 2.0025715827941895, + "learning_rate": 2.9616091491132357e-05, + "loss": 1.3727, + "step": 21112 + }, + { + "epoch": 0.7561014915751966, + "grad_norm": 1.6120564937591553, + "learning_rate": 2.9607852496211962e-05, + "loss": 1.554, + "step": 21113 + }, + { + "epoch": 0.7561373037047648, + "grad_norm": 1.4082764387130737, + "learning_rate": 2.9599614448327084e-05, + "loss": 1.3681, + "step": 21114 + }, + { + "epoch": 0.7561731158343331, + "grad_norm": 1.5112791061401367, + "learning_rate": 2.9591377347588623e-05, + "loss": 1.4086, + "step": 21115 + }, + { + "epoch": 0.7562089279639014, + "grad_norm": 1.5789004564285278, + "learning_rate": 2.958314119410732e-05, + "loss": 1.4716, + "step": 21116 + }, + { + "epoch": 0.7562447400934696, + "grad_norm": 1.2509442567825317, + "learning_rate": 2.9574905987994016e-05, + "loss": 1.6457, + "step": 21117 + }, + { + "epoch": 0.7562805522230379, + "grad_norm": 2.162203311920166, + "learning_rate": 2.9566671729359552e-05, + "loss": 1.665, + "step": 21118 + }, + { + "epoch": 0.7563163643526062, + "grad_norm": 1.7488106489181519, + "learning_rate": 2.9558438418314626e-05, + "loss": 1.1802, + "step": 21119 + }, + { + "epoch": 0.7563521764821746, + "grad_norm": 1.4477161169052124, + "learning_rate": 2.9550206054970063e-05, + "loss": 1.3837, + "step": 21120 + }, + { + "epoch": 0.7563879886117428, + "grad_norm": 1.7796725034713745, + "learning_rate": 2.9541974639436588e-05, + "loss": 1.5306, + "step": 21121 + }, + { + "epoch": 0.7564238007413111, + "grad_norm": 1.4333518743515015, + "learning_rate": 2.9533744171824996e-05, + "loss": 1.5737, + "step": 21122 + }, + { + "epoch": 0.7564596128708794, + "grad_norm": 1.4880192279815674, + "learning_rate": 2.9525514652245922e-05, + "loss": 1.3167, + "step": 21123 + }, + { + "epoch": 0.7564954250004476, + "grad_norm": 2.2075390815734863, + "learning_rate": 2.9517286080810204e-05, + "loss": 1.6719, + "step": 21124 + }, + { + "epoch": 0.7565312371300159, + "grad_norm": 1.4467953443527222, + "learning_rate": 2.950905845762849e-05, + "loss": 1.5902, + "step": 21125 + }, + { + "epoch": 0.7565670492595842, + "grad_norm": 1.8242559432983398, + "learning_rate": 2.9500831782811433e-05, + "loss": 1.417, + "step": 21126 + }, + { + "epoch": 0.7566028613891526, + "grad_norm": 1.713553547859192, + "learning_rate": 2.949260605646974e-05, + "loss": 1.6277, + "step": 21127 + }, + { + "epoch": 0.7566386735187208, + "grad_norm": 2.017822027206421, + "learning_rate": 2.948438127871409e-05, + "loss": 1.0925, + "step": 21128 + }, + { + "epoch": 0.7566744856482891, + "grad_norm": 1.8438435792922974, + "learning_rate": 2.947615744965516e-05, + "loss": 1.3623, + "step": 21129 + }, + { + "epoch": 0.7567102977778574, + "grad_norm": 1.515105128288269, + "learning_rate": 2.9467934569403542e-05, + "loss": 1.3707, + "step": 21130 + }, + { + "epoch": 0.7567461099074256, + "grad_norm": 1.3222929239273071, + "learning_rate": 2.945971263806987e-05, + "loss": 1.3478, + "step": 21131 + }, + { + "epoch": 0.7567819220369939, + "grad_norm": 1.3271212577819824, + "learning_rate": 2.9451491655764816e-05, + "loss": 1.2689, + "step": 21132 + }, + { + "epoch": 0.7568177341665622, + "grad_norm": 1.8101156949996948, + "learning_rate": 2.94432716225989e-05, + "loss": 1.4378, + "step": 21133 + }, + { + "epoch": 0.7568535462961306, + "grad_norm": 1.6955105066299438, + "learning_rate": 2.943505253868276e-05, + "loss": 1.4796, + "step": 21134 + }, + { + "epoch": 0.7568893584256988, + "grad_norm": 1.475974440574646, + "learning_rate": 2.942683440412697e-05, + "loss": 1.3474, + "step": 21135 + }, + { + "epoch": 0.7569251705552671, + "grad_norm": 1.774215579032898, + "learning_rate": 2.941861721904211e-05, + "loss": 1.1609, + "step": 21136 + }, + { + "epoch": 0.7569609826848354, + "grad_norm": 1.5521552562713623, + "learning_rate": 2.941040098353869e-05, + "loss": 1.486, + "step": 21137 + }, + { + "epoch": 0.7569967948144036, + "grad_norm": 1.7704689502716064, + "learning_rate": 2.940218569772726e-05, + "loss": 1.498, + "step": 21138 + }, + { + "epoch": 0.7570326069439719, + "grad_norm": 2.514124870300293, + "learning_rate": 2.9393971361718363e-05, + "loss": 1.3497, + "step": 21139 + }, + { + "epoch": 0.7570684190735402, + "grad_norm": 1.5722203254699707, + "learning_rate": 2.9385757975622542e-05, + "loss": 1.5558, + "step": 21140 + }, + { + "epoch": 0.7571042312031085, + "grad_norm": 1.9936293363571167, + "learning_rate": 2.937754553955022e-05, + "loss": 1.5372, + "step": 21141 + }, + { + "epoch": 0.7571400433326768, + "grad_norm": 1.4166237115859985, + "learning_rate": 2.936933405361194e-05, + "loss": 1.1757, + "step": 21142 + }, + { + "epoch": 0.7571758554622451, + "grad_norm": 1.6292997598648071, + "learning_rate": 2.936112351791819e-05, + "loss": 1.3799, + "step": 21143 + }, + { + "epoch": 0.7572116675918134, + "grad_norm": 2.5143439769744873, + "learning_rate": 2.9352913932579362e-05, + "loss": 1.3905, + "step": 21144 + }, + { + "epoch": 0.7572474797213816, + "grad_norm": 1.4618525505065918, + "learning_rate": 2.9344705297705966e-05, + "loss": 1.3163, + "step": 21145 + }, + { + "epoch": 0.7572832918509499, + "grad_norm": 1.6585017442703247, + "learning_rate": 2.933649761340841e-05, + "loss": 1.3903, + "step": 21146 + }, + { + "epoch": 0.7573191039805182, + "grad_norm": 1.43913733959198, + "learning_rate": 2.932829087979716e-05, + "loss": 1.3842, + "step": 21147 + }, + { + "epoch": 0.7573549161100865, + "grad_norm": 1.726843237876892, + "learning_rate": 2.9320085096982575e-05, + "loss": 1.6791, + "step": 21148 + }, + { + "epoch": 0.7573907282396548, + "grad_norm": 1.5694811344146729, + "learning_rate": 2.9311880265075066e-05, + "loss": 1.3672, + "step": 21149 + }, + { + "epoch": 0.7574265403692231, + "grad_norm": 1.3155571222305298, + "learning_rate": 2.9303676384185064e-05, + "loss": 1.2628, + "step": 21150 + }, + { + "epoch": 0.7574623524987913, + "grad_norm": 1.3629982471466064, + "learning_rate": 2.9295473454422863e-05, + "loss": 1.1417, + "step": 21151 + }, + { + "epoch": 0.7574981646283596, + "grad_norm": 1.683783769607544, + "learning_rate": 2.928727147589887e-05, + "loss": 1.7473, + "step": 21152 + }, + { + "epoch": 0.7575339767579279, + "grad_norm": 1.4711257219314575, + "learning_rate": 2.9279070448723432e-05, + "loss": 1.245, + "step": 21153 + }, + { + "epoch": 0.7575697888874962, + "grad_norm": 1.780172348022461, + "learning_rate": 2.927087037300691e-05, + "loss": 1.3952, + "step": 21154 + }, + { + "epoch": 0.7576056010170645, + "grad_norm": 1.5945377349853516, + "learning_rate": 2.926267124885955e-05, + "loss": 1.3441, + "step": 21155 + }, + { + "epoch": 0.7576414131466328, + "grad_norm": 2.19747257232666, + "learning_rate": 2.9254473076391708e-05, + "loss": 1.6498, + "step": 21156 + }, + { + "epoch": 0.7576772252762011, + "grad_norm": 1.6467136144638062, + "learning_rate": 2.924627585571368e-05, + "loss": 1.4511, + "step": 21157 + }, + { + "epoch": 0.7577130374057693, + "grad_norm": 1.4846833944320679, + "learning_rate": 2.9238079586935773e-05, + "loss": 1.4392, + "step": 21158 + }, + { + "epoch": 0.7577488495353376, + "grad_norm": 1.7187808752059937, + "learning_rate": 2.9229884270168195e-05, + "loss": 1.3017, + "step": 21159 + }, + { + "epoch": 0.7577846616649059, + "grad_norm": 1.69410240650177, + "learning_rate": 2.9221689905521245e-05, + "loss": 1.4055, + "step": 21160 + }, + { + "epoch": 0.7578204737944741, + "grad_norm": 1.315220832824707, + "learning_rate": 2.9213496493105187e-05, + "loss": 1.1048, + "step": 21161 + }, + { + "epoch": 0.7578562859240425, + "grad_norm": 1.9386459589004517, + "learning_rate": 2.920530403303019e-05, + "loss": 1.5724, + "step": 21162 + }, + { + "epoch": 0.7578920980536108, + "grad_norm": 1.3954391479492188, + "learning_rate": 2.9197112525406522e-05, + "loss": 1.0904, + "step": 21163 + }, + { + "epoch": 0.7579279101831791, + "grad_norm": 1.6443402767181396, + "learning_rate": 2.918892197034436e-05, + "loss": 1.4831, + "step": 21164 + }, + { + "epoch": 0.7579637223127473, + "grad_norm": 1.8026351928710938, + "learning_rate": 2.9180732367953956e-05, + "loss": 1.7182, + "step": 21165 + }, + { + "epoch": 0.7579995344423156, + "grad_norm": 1.4861021041870117, + "learning_rate": 2.9172543718345413e-05, + "loss": 1.3764, + "step": 21166 + }, + { + "epoch": 0.7580353465718839, + "grad_norm": 1.567413568496704, + "learning_rate": 2.9164356021628923e-05, + "loss": 1.3016, + "step": 21167 + }, + { + "epoch": 0.7580711587014521, + "grad_norm": 1.7095822095870972, + "learning_rate": 2.915616927791469e-05, + "loss": 1.23, + "step": 21168 + }, + { + "epoch": 0.7581069708310205, + "grad_norm": 1.373910903930664, + "learning_rate": 2.9147983487312793e-05, + "loss": 1.5581, + "step": 21169 + }, + { + "epoch": 0.7581427829605888, + "grad_norm": 1.7896029949188232, + "learning_rate": 2.913979864993338e-05, + "loss": 1.3731, + "step": 21170 + }, + { + "epoch": 0.7581785950901571, + "grad_norm": 1.2884318828582764, + "learning_rate": 2.9131614765886573e-05, + "loss": 1.4324, + "step": 21171 + }, + { + "epoch": 0.7582144072197253, + "grad_norm": 1.8694905042648315, + "learning_rate": 2.912343183528251e-05, + "loss": 1.6948, + "step": 21172 + }, + { + "epoch": 0.7582502193492936, + "grad_norm": 2.111467123031616, + "learning_rate": 2.9115249858231207e-05, + "loss": 1.6019, + "step": 21173 + }, + { + "epoch": 0.7582860314788619, + "grad_norm": 1.7151230573654175, + "learning_rate": 2.91070688348428e-05, + "loss": 1.6059, + "step": 21174 + }, + { + "epoch": 0.7583218436084301, + "grad_norm": 1.535046935081482, + "learning_rate": 2.9098888765227316e-05, + "loss": 1.5526, + "step": 21175 + }, + { + "epoch": 0.7583576557379985, + "grad_norm": 1.7479262351989746, + "learning_rate": 2.9090709649494873e-05, + "loss": 1.4793, + "step": 21176 + }, + { + "epoch": 0.7583934678675668, + "grad_norm": 1.8973602056503296, + "learning_rate": 2.908253148775546e-05, + "loss": 1.8296, + "step": 21177 + }, + { + "epoch": 0.758429279997135, + "grad_norm": 1.2485648393630981, + "learning_rate": 2.9074354280119042e-05, + "loss": 1.2522, + "step": 21178 + }, + { + "epoch": 0.7584650921267033, + "grad_norm": 1.4664356708526611, + "learning_rate": 2.9066178026695767e-05, + "loss": 1.2564, + "step": 21179 + }, + { + "epoch": 0.7585009042562716, + "grad_norm": 1.1946889162063599, + "learning_rate": 2.9058002727595546e-05, + "loss": 1.536, + "step": 21180 + }, + { + "epoch": 0.7585367163858399, + "grad_norm": 1.3045967817306519, + "learning_rate": 2.904982838292838e-05, + "loss": 1.2439, + "step": 21181 + }, + { + "epoch": 0.7585725285154081, + "grad_norm": 1.5080360174179077, + "learning_rate": 2.9041654992804256e-05, + "loss": 1.6046, + "step": 21182 + }, + { + "epoch": 0.7586083406449765, + "grad_norm": 1.595199704170227, + "learning_rate": 2.9033482557333158e-05, + "loss": 1.3323, + "step": 21183 + }, + { + "epoch": 0.7586441527745448, + "grad_norm": 1.783447265625, + "learning_rate": 2.9025311076624994e-05, + "loss": 1.5985, + "step": 21184 + }, + { + "epoch": 0.758679964904113, + "grad_norm": 1.9217463731765747, + "learning_rate": 2.9017140550789713e-05, + "loss": 1.1523, + "step": 21185 + }, + { + "epoch": 0.7587157770336813, + "grad_norm": 1.8757216930389404, + "learning_rate": 2.9008970979937276e-05, + "loss": 1.3591, + "step": 21186 + }, + { + "epoch": 0.7587515891632496, + "grad_norm": 1.840748906135559, + "learning_rate": 2.9000802364177527e-05, + "loss": 1.3812, + "step": 21187 + }, + { + "epoch": 0.7587874012928179, + "grad_norm": 1.5982416868209839, + "learning_rate": 2.8992634703620437e-05, + "loss": 1.5119, + "step": 21188 + }, + { + "epoch": 0.7588232134223861, + "grad_norm": 1.6878429651260376, + "learning_rate": 2.8984467998375786e-05, + "loss": 1.0781, + "step": 21189 + }, + { + "epoch": 0.7588590255519545, + "grad_norm": 1.7874064445495605, + "learning_rate": 2.8976302248553576e-05, + "loss": 1.2698, + "step": 21190 + }, + { + "epoch": 0.7588948376815228, + "grad_norm": 1.8459925651550293, + "learning_rate": 2.896813745426359e-05, + "loss": 1.1503, + "step": 21191 + }, + { + "epoch": 0.758930649811091, + "grad_norm": 1.5828872919082642, + "learning_rate": 2.8959973615615675e-05, + "loss": 1.2534, + "step": 21192 + }, + { + "epoch": 0.7589664619406593, + "grad_norm": 2.555840492248535, + "learning_rate": 2.8951810732719685e-05, + "loss": 1.5415, + "step": 21193 + }, + { + "epoch": 0.7590022740702276, + "grad_norm": 2.2074623107910156, + "learning_rate": 2.8943648805685464e-05, + "loss": 1.4024, + "step": 21194 + }, + { + "epoch": 0.7590380861997958, + "grad_norm": 2.5466811656951904, + "learning_rate": 2.893548783462279e-05, + "loss": 1.4226, + "step": 21195 + }, + { + "epoch": 0.7590738983293641, + "grad_norm": 1.3953499794006348, + "learning_rate": 2.8927327819641403e-05, + "loss": 1.4408, + "step": 21196 + }, + { + "epoch": 0.7591097104589324, + "grad_norm": 1.9189091920852661, + "learning_rate": 2.8919168760851202e-05, + "loss": 1.2927, + "step": 21197 + }, + { + "epoch": 0.7591455225885008, + "grad_norm": 1.6638646125793457, + "learning_rate": 2.891101065836187e-05, + "loss": 1.5423, + "step": 21198 + }, + { + "epoch": 0.759181334718069, + "grad_norm": 1.5958912372589111, + "learning_rate": 2.8902853512283225e-05, + "loss": 1.3269, + "step": 21199 + }, + { + "epoch": 0.7592171468476373, + "grad_norm": 2.3823351860046387, + "learning_rate": 2.8894697322724908e-05, + "loss": 1.2585, + "step": 21200 + }, + { + "epoch": 0.7592529589772056, + "grad_norm": 1.484283208847046, + "learning_rate": 2.8886542089796785e-05, + "loss": 1.3847, + "step": 21201 + }, + { + "epoch": 0.7592887711067738, + "grad_norm": 1.6040844917297363, + "learning_rate": 2.8878387813608477e-05, + "loss": 1.3717, + "step": 21202 + }, + { + "epoch": 0.7593245832363421, + "grad_norm": 1.6195263862609863, + "learning_rate": 2.8870234494269756e-05, + "loss": 1.4155, + "step": 21203 + }, + { + "epoch": 0.7593603953659104, + "grad_norm": 1.518485188484192, + "learning_rate": 2.8862082131890243e-05, + "loss": 1.3777, + "step": 21204 + }, + { + "epoch": 0.7593962074954788, + "grad_norm": 2.1455929279327393, + "learning_rate": 2.885393072657966e-05, + "loss": 1.7861, + "step": 21205 + }, + { + "epoch": 0.759432019625047, + "grad_norm": 1.6914993524551392, + "learning_rate": 2.8845780278447688e-05, + "loss": 1.2118, + "step": 21206 + }, + { + "epoch": 0.7594678317546153, + "grad_norm": 1.3727785348892212, + "learning_rate": 2.8837630787603908e-05, + "loss": 1.2499, + "step": 21207 + }, + { + "epoch": 0.7595036438841836, + "grad_norm": 1.734089732170105, + "learning_rate": 2.882948225415807e-05, + "loss": 1.5871, + "step": 21208 + }, + { + "epoch": 0.7595394560137518, + "grad_norm": 1.2891333103179932, + "learning_rate": 2.8821334678219712e-05, + "loss": 1.2715, + "step": 21209 + }, + { + "epoch": 0.7595752681433201, + "grad_norm": 2.27925443649292, + "learning_rate": 2.8813188059898512e-05, + "loss": 1.3765, + "step": 21210 + }, + { + "epoch": 0.7596110802728884, + "grad_norm": 1.3303371667861938, + "learning_rate": 2.8805042399303984e-05, + "loss": 1.5651, + "step": 21211 + }, + { + "epoch": 0.7596468924024568, + "grad_norm": 2.1135172843933105, + "learning_rate": 2.8796897696545832e-05, + "loss": 1.6908, + "step": 21212 + }, + { + "epoch": 0.759682704532025, + "grad_norm": 1.5996071100234985, + "learning_rate": 2.878875395173358e-05, + "loss": 1.4533, + "step": 21213 + }, + { + "epoch": 0.7597185166615933, + "grad_norm": 1.5009126663208008, + "learning_rate": 2.8780611164976767e-05, + "loss": 1.2599, + "step": 21214 + }, + { + "epoch": 0.7597543287911616, + "grad_norm": 1.407975196838379, + "learning_rate": 2.8772469336384954e-05, + "loss": 1.2601, + "step": 21215 + }, + { + "epoch": 0.7597901409207298, + "grad_norm": 1.3562133312225342, + "learning_rate": 2.876432846606769e-05, + "loss": 1.3456, + "step": 21216 + }, + { + "epoch": 0.7598259530502981, + "grad_norm": 1.6135796308517456, + "learning_rate": 2.8756188554134522e-05, + "loss": 1.2708, + "step": 21217 + }, + { + "epoch": 0.7598617651798664, + "grad_norm": 1.496204137802124, + "learning_rate": 2.8748049600694893e-05, + "loss": 1.3726, + "step": 21218 + }, + { + "epoch": 0.7598975773094347, + "grad_norm": 2.0687203407287598, + "learning_rate": 2.8739911605858394e-05, + "loss": 1.3556, + "step": 21219 + }, + { + "epoch": 0.759933389439003, + "grad_norm": 1.4460134506225586, + "learning_rate": 2.873177456973445e-05, + "loss": 1.1964, + "step": 21220 + }, + { + "epoch": 0.7599692015685713, + "grad_norm": 1.672918677330017, + "learning_rate": 2.872363849243257e-05, + "loss": 1.3602, + "step": 21221 + }, + { + "epoch": 0.7600050136981396, + "grad_norm": 1.2204346656799316, + "learning_rate": 2.871550337406217e-05, + "loss": 1.1142, + "step": 21222 + }, + { + "epoch": 0.7600408258277078, + "grad_norm": 1.909233808517456, + "learning_rate": 2.8707369214732716e-05, + "loss": 1.4387, + "step": 21223 + }, + { + "epoch": 0.7600766379572761, + "grad_norm": 1.3852407932281494, + "learning_rate": 2.8699236014553686e-05, + "loss": 1.5051, + "step": 21224 + }, + { + "epoch": 0.7601124500868444, + "grad_norm": 1.9627448320388794, + "learning_rate": 2.869110377363443e-05, + "loss": 1.5291, + "step": 21225 + }, + { + "epoch": 0.7601482622164127, + "grad_norm": 1.5735957622528076, + "learning_rate": 2.868297249208438e-05, + "loss": 1.5355, + "step": 21226 + }, + { + "epoch": 0.760184074345981, + "grad_norm": 1.7588822841644287, + "learning_rate": 2.867484217001296e-05, + "loss": 1.5529, + "step": 21227 + }, + { + "epoch": 0.7602198864755493, + "grad_norm": 1.5545251369476318, + "learning_rate": 2.866671280752956e-05, + "loss": 1.4601, + "step": 21228 + }, + { + "epoch": 0.7602556986051175, + "grad_norm": 1.5367778539657593, + "learning_rate": 2.8658584404743493e-05, + "loss": 1.306, + "step": 21229 + }, + { + "epoch": 0.7602915107346858, + "grad_norm": 1.8365488052368164, + "learning_rate": 2.865045696176415e-05, + "loss": 1.3618, + "step": 21230 + }, + { + "epoch": 0.7603273228642541, + "grad_norm": 1.5795180797576904, + "learning_rate": 2.8642330478700908e-05, + "loss": 1.201, + "step": 21231 + }, + { + "epoch": 0.7603631349938224, + "grad_norm": 2.363741397857666, + "learning_rate": 2.8634204955663024e-05, + "loss": 1.4861, + "step": 21232 + }, + { + "epoch": 0.7603989471233907, + "grad_norm": 1.6575523614883423, + "learning_rate": 2.862608039275987e-05, + "loss": 1.3186, + "step": 21233 + }, + { + "epoch": 0.760434759252959, + "grad_norm": 1.3435629606246948, + "learning_rate": 2.861795679010073e-05, + "loss": 1.7503, + "step": 21234 + }, + { + "epoch": 0.7604705713825273, + "grad_norm": 1.258905291557312, + "learning_rate": 2.8609834147794945e-05, + "loss": 1.1639, + "step": 21235 + }, + { + "epoch": 0.7605063835120955, + "grad_norm": 2.6588869094848633, + "learning_rate": 2.8601712465951713e-05, + "loss": 1.2825, + "step": 21236 + }, + { + "epoch": 0.7605421956416638, + "grad_norm": 1.9487318992614746, + "learning_rate": 2.8593591744680348e-05, + "loss": 1.6677, + "step": 21237 + }, + { + "epoch": 0.7605780077712321, + "grad_norm": 1.674471139907837, + "learning_rate": 2.858547198409013e-05, + "loss": 1.2264, + "step": 21238 + }, + { + "epoch": 0.7606138199008003, + "grad_norm": 1.990461826324463, + "learning_rate": 2.8577353184290236e-05, + "loss": 1.341, + "step": 21239 + }, + { + "epoch": 0.7606496320303687, + "grad_norm": 2.012136220932007, + "learning_rate": 2.8569235345389922e-05, + "loss": 1.583, + "step": 21240 + }, + { + "epoch": 0.760685444159937, + "grad_norm": 1.3574470281600952, + "learning_rate": 2.8561118467498415e-05, + "loss": 1.6975, + "step": 21241 + }, + { + "epoch": 0.7607212562895053, + "grad_norm": 1.79666006565094, + "learning_rate": 2.855300255072494e-05, + "loss": 1.5932, + "step": 21242 + }, + { + "epoch": 0.7607570684190735, + "grad_norm": 1.7653344869613647, + "learning_rate": 2.8544887595178616e-05, + "loss": 1.4088, + "step": 21243 + }, + { + "epoch": 0.7607928805486418, + "grad_norm": 1.3888636827468872, + "learning_rate": 2.853677360096867e-05, + "loss": 1.5872, + "step": 21244 + }, + { + "epoch": 0.7608286926782101, + "grad_norm": 1.8041224479675293, + "learning_rate": 2.8528660568204247e-05, + "loss": 1.7292, + "step": 21245 + }, + { + "epoch": 0.7608645048077783, + "grad_norm": 1.383401870727539, + "learning_rate": 2.8520548496994536e-05, + "loss": 1.2626, + "step": 21246 + }, + { + "epoch": 0.7609003169373467, + "grad_norm": 1.8830630779266357, + "learning_rate": 2.851243738744862e-05, + "loss": 1.5501, + "step": 21247 + }, + { + "epoch": 0.760936129066915, + "grad_norm": 2.0602715015411377, + "learning_rate": 2.8504327239675645e-05, + "loss": 1.3759, + "step": 21248 + }, + { + "epoch": 0.7609719411964833, + "grad_norm": 1.9153468608856201, + "learning_rate": 2.849621805378474e-05, + "loss": 1.3334, + "step": 21249 + }, + { + "epoch": 0.7610077533260515, + "grad_norm": 1.9879709482192993, + "learning_rate": 2.848810982988497e-05, + "loss": 1.5984, + "step": 21250 + }, + { + "epoch": 0.7610435654556198, + "grad_norm": 1.8865492343902588, + "learning_rate": 2.848000256808544e-05, + "loss": 1.3962, + "step": 21251 + }, + { + "epoch": 0.7610793775851881, + "grad_norm": 2.2912323474884033, + "learning_rate": 2.8471896268495214e-05, + "loss": 1.5664, + "step": 21252 + }, + { + "epoch": 0.7611151897147563, + "grad_norm": 1.8442264795303345, + "learning_rate": 2.84637909312234e-05, + "loss": 1.3058, + "step": 21253 + }, + { + "epoch": 0.7611510018443247, + "grad_norm": 2.1874215602874756, + "learning_rate": 2.845568655637896e-05, + "loss": 1.6448, + "step": 21254 + }, + { + "epoch": 0.761186813973893, + "grad_norm": 1.9503223896026611, + "learning_rate": 2.844758314407098e-05, + "loss": 1.1485, + "step": 21255 + }, + { + "epoch": 0.7612226261034613, + "grad_norm": 1.5357619524002075, + "learning_rate": 2.8439480694408506e-05, + "loss": 1.5472, + "step": 21256 + }, + { + "epoch": 0.7612584382330295, + "grad_norm": 1.703912377357483, + "learning_rate": 2.8431379207500476e-05, + "loss": 1.6161, + "step": 21257 + }, + { + "epoch": 0.7612942503625978, + "grad_norm": 2.0555579662323, + "learning_rate": 2.8423278683455922e-05, + "loss": 1.7808, + "step": 21258 + }, + { + "epoch": 0.7613300624921661, + "grad_norm": 1.4263310432434082, + "learning_rate": 2.8415179122383828e-05, + "loss": 1.0745, + "step": 21259 + }, + { + "epoch": 0.7613658746217343, + "grad_norm": 2.0803184509277344, + "learning_rate": 2.840708052439319e-05, + "loss": 1.5224, + "step": 21260 + }, + { + "epoch": 0.7614016867513027, + "grad_norm": 1.6625678539276123, + "learning_rate": 2.8398982889592908e-05, + "loss": 1.2313, + "step": 21261 + }, + { + "epoch": 0.761437498880871, + "grad_norm": 1.6900227069854736, + "learning_rate": 2.839088621809195e-05, + "loss": 1.4897, + "step": 21262 + }, + { + "epoch": 0.7614733110104392, + "grad_norm": 1.6262860298156738, + "learning_rate": 2.8382790509999257e-05, + "loss": 1.4142, + "step": 21263 + }, + { + "epoch": 0.7615091231400075, + "grad_norm": 1.7828209400177002, + "learning_rate": 2.8374695765423753e-05, + "loss": 1.7341, + "step": 21264 + }, + { + "epoch": 0.7615449352695758, + "grad_norm": 2.0078964233398438, + "learning_rate": 2.8366601984474305e-05, + "loss": 1.5635, + "step": 21265 + }, + { + "epoch": 0.761580747399144, + "grad_norm": 2.1161463260650635, + "learning_rate": 2.835850916725983e-05, + "loss": 1.4108, + "step": 21266 + }, + { + "epoch": 0.7616165595287123, + "grad_norm": 1.6242632865905762, + "learning_rate": 2.8350417313889233e-05, + "loss": 1.0424, + "step": 21267 + }, + { + "epoch": 0.7616523716582807, + "grad_norm": 1.9338126182556152, + "learning_rate": 2.8342326424471323e-05, + "loss": 1.4046, + "step": 21268 + }, + { + "epoch": 0.761688183787849, + "grad_norm": 1.9795308113098145, + "learning_rate": 2.8334236499114963e-05, + "loss": 1.5743, + "step": 21269 + }, + { + "epoch": 0.7617239959174172, + "grad_norm": 1.2497432231903076, + "learning_rate": 2.8326147537929027e-05, + "loss": 1.5952, + "step": 21270 + }, + { + "epoch": 0.7617598080469855, + "grad_norm": 1.5189738273620605, + "learning_rate": 2.8318059541022346e-05, + "loss": 1.6455, + "step": 21271 + }, + { + "epoch": 0.7617956201765538, + "grad_norm": 1.9544116258621216, + "learning_rate": 2.830997250850368e-05, + "loss": 1.438, + "step": 21272 + }, + { + "epoch": 0.761831432306122, + "grad_norm": 1.6780002117156982, + "learning_rate": 2.8301886440481862e-05, + "loss": 1.2535, + "step": 21273 + }, + { + "epoch": 0.7618672444356903, + "grad_norm": 2.1269612312316895, + "learning_rate": 2.8293801337065705e-05, + "loss": 1.2494, + "step": 21274 + }, + { + "epoch": 0.7619030565652587, + "grad_norm": 2.0308775901794434, + "learning_rate": 2.8285717198363924e-05, + "loss": 1.6469, + "step": 21275 + }, + { + "epoch": 0.761938868694827, + "grad_norm": 2.287074565887451, + "learning_rate": 2.8277634024485322e-05, + "loss": 1.2689, + "step": 21276 + }, + { + "epoch": 0.7619746808243952, + "grad_norm": 1.8691420555114746, + "learning_rate": 2.826955181553863e-05, + "loss": 1.2943, + "step": 21277 + }, + { + "epoch": 0.7620104929539635, + "grad_norm": 2.088134765625, + "learning_rate": 2.826147057163263e-05, + "loss": 1.2476, + "step": 21278 + }, + { + "epoch": 0.7620463050835318, + "grad_norm": 2.5623204708099365, + "learning_rate": 2.8253390292875982e-05, + "loss": 1.7233, + "step": 21279 + }, + { + "epoch": 0.7620821172131, + "grad_norm": 1.437081217765808, + "learning_rate": 2.8245310979377416e-05, + "loss": 1.507, + "step": 21280 + }, + { + "epoch": 0.7621179293426683, + "grad_norm": 1.5275484323501587, + "learning_rate": 2.8237232631245624e-05, + "loss": 1.5571, + "step": 21281 + }, + { + "epoch": 0.7621537414722367, + "grad_norm": 2.310119152069092, + "learning_rate": 2.8229155248589345e-05, + "loss": 1.687, + "step": 21282 + }, + { + "epoch": 0.762189553601805, + "grad_norm": 1.3541539907455444, + "learning_rate": 2.822107883151719e-05, + "loss": 1.5511, + "step": 21283 + }, + { + "epoch": 0.7622253657313732, + "grad_norm": 1.8010655641555786, + "learning_rate": 2.8213003380137783e-05, + "loss": 1.6483, + "step": 21284 + }, + { + "epoch": 0.7622611778609415, + "grad_norm": 1.4328798055648804, + "learning_rate": 2.820492889455987e-05, + "loss": 1.486, + "step": 21285 + }, + { + "epoch": 0.7622969899905098, + "grad_norm": 1.6243557929992676, + "learning_rate": 2.8196855374892006e-05, + "loss": 1.6467, + "step": 21286 + }, + { + "epoch": 0.762332802120078, + "grad_norm": 1.6708301305770874, + "learning_rate": 2.8188782821242855e-05, + "loss": 1.6195, + "step": 21287 + }, + { + "epoch": 0.7623686142496463, + "grad_norm": 1.5732182264328003, + "learning_rate": 2.8180711233720947e-05, + "loss": 1.3332, + "step": 21288 + }, + { + "epoch": 0.7624044263792147, + "grad_norm": 1.813924789428711, + "learning_rate": 2.8172640612434987e-05, + "loss": 1.285, + "step": 21289 + }, + { + "epoch": 0.762440238508783, + "grad_norm": 2.3234944343566895, + "learning_rate": 2.8164570957493473e-05, + "loss": 1.3432, + "step": 21290 + }, + { + "epoch": 0.7624760506383512, + "grad_norm": 1.5255062580108643, + "learning_rate": 2.8156502269004992e-05, + "loss": 1.472, + "step": 21291 + }, + { + "epoch": 0.7625118627679195, + "grad_norm": 2.036771535873413, + "learning_rate": 2.814843454707813e-05, + "loss": 1.3433, + "step": 21292 + }, + { + "epoch": 0.7625476748974878, + "grad_norm": 2.0992271900177, + "learning_rate": 2.8140367791821363e-05, + "loss": 1.2408, + "step": 21293 + }, + { + "epoch": 0.762583487027056, + "grad_norm": 1.3696377277374268, + "learning_rate": 2.813230200334329e-05, + "loss": 1.1414, + "step": 21294 + }, + { + "epoch": 0.7626192991566243, + "grad_norm": 2.283036231994629, + "learning_rate": 2.8124237181752334e-05, + "loss": 1.5427, + "step": 21295 + }, + { + "epoch": 0.7626551112861927, + "grad_norm": 1.7576613426208496, + "learning_rate": 2.8116173327157114e-05, + "loss": 1.338, + "step": 21296 + }, + { + "epoch": 0.762690923415761, + "grad_norm": 1.6552729606628418, + "learning_rate": 2.8108110439666024e-05, + "loss": 1.3773, + "step": 21297 + }, + { + "epoch": 0.7627267355453292, + "grad_norm": 1.3806850910186768, + "learning_rate": 2.8100048519387613e-05, + "loss": 1.3199, + "step": 21298 + }, + { + "epoch": 0.7627625476748975, + "grad_norm": 1.8089537620544434, + "learning_rate": 2.8091987566430233e-05, + "loss": 1.2466, + "step": 21299 + }, + { + "epoch": 0.7627983598044658, + "grad_norm": 1.5321838855743408, + "learning_rate": 2.808392758090247e-05, + "loss": 1.4227, + "step": 21300 + }, + { + "epoch": 0.762834171934034, + "grad_norm": 1.6883888244628906, + "learning_rate": 2.80758685629127e-05, + "loss": 1.4317, + "step": 21301 + }, + { + "epoch": 0.7628699840636023, + "grad_norm": 1.5351834297180176, + "learning_rate": 2.8067810512569282e-05, + "loss": 1.1779, + "step": 21302 + }, + { + "epoch": 0.7629057961931707, + "grad_norm": 2.15999174118042, + "learning_rate": 2.805975342998075e-05, + "loss": 1.4464, + "step": 21303 + }, + { + "epoch": 0.7629416083227389, + "grad_norm": 1.7024953365325928, + "learning_rate": 2.80516973152554e-05, + "loss": 1.3371, + "step": 21304 + }, + { + "epoch": 0.7629774204523072, + "grad_norm": 1.595112919807434, + "learning_rate": 2.8043642168501692e-05, + "loss": 1.4669, + "step": 21305 + }, + { + "epoch": 0.7630132325818755, + "grad_norm": 1.7060620784759521, + "learning_rate": 2.8035587989827904e-05, + "loss": 1.2248, + "step": 21306 + }, + { + "epoch": 0.7630490447114437, + "grad_norm": 1.508658766746521, + "learning_rate": 2.802753477934251e-05, + "loss": 1.4452, + "step": 21307 + }, + { + "epoch": 0.763084856841012, + "grad_norm": 3.327977180480957, + "learning_rate": 2.8019482537153762e-05, + "loss": 1.7054, + "step": 21308 + }, + { + "epoch": 0.7631206689705803, + "grad_norm": 1.5800703763961792, + "learning_rate": 2.801143126337007e-05, + "loss": 1.508, + "step": 21309 + }, + { + "epoch": 0.7631564811001487, + "grad_norm": 1.645244836807251, + "learning_rate": 2.8003380958099677e-05, + "loss": 1.5809, + "step": 21310 + }, + { + "epoch": 0.7631922932297169, + "grad_norm": 1.597456455230713, + "learning_rate": 2.7995331621450917e-05, + "loss": 1.2949, + "step": 21311 + }, + { + "epoch": 0.7632281053592852, + "grad_norm": 1.4710382223129272, + "learning_rate": 2.7987283253532125e-05, + "loss": 1.1875, + "step": 21312 + }, + { + "epoch": 0.7632639174888535, + "grad_norm": 2.0475046634674072, + "learning_rate": 2.7979235854451523e-05, + "loss": 1.7837, + "step": 21313 + }, + { + "epoch": 0.7632997296184217, + "grad_norm": 1.670210599899292, + "learning_rate": 2.79711894243174e-05, + "loss": 1.6562, + "step": 21314 + }, + { + "epoch": 0.76333554174799, + "grad_norm": 1.4853568077087402, + "learning_rate": 2.7963143963238005e-05, + "loss": 1.443, + "step": 21315 + }, + { + "epoch": 0.7633713538775583, + "grad_norm": 1.7376928329467773, + "learning_rate": 2.795509947132162e-05, + "loss": 1.2882, + "step": 21316 + }, + { + "epoch": 0.7634071660071267, + "grad_norm": 1.8859137296676636, + "learning_rate": 2.7947055948676392e-05, + "loss": 1.7188, + "step": 21317 + }, + { + "epoch": 0.7634429781366949, + "grad_norm": 1.6725388765335083, + "learning_rate": 2.793901339541063e-05, + "loss": 1.4696, + "step": 21318 + }, + { + "epoch": 0.7634787902662632, + "grad_norm": 1.5267573595046997, + "learning_rate": 2.79309718116325e-05, + "loss": 1.3887, + "step": 21319 + }, + { + "epoch": 0.7635146023958315, + "grad_norm": 2.08896541595459, + "learning_rate": 2.792293119745014e-05, + "loss": 1.4965, + "step": 21320 + }, + { + "epoch": 0.7635504145253997, + "grad_norm": 1.5745618343353271, + "learning_rate": 2.7914891552971776e-05, + "loss": 1.5053, + "step": 21321 + }, + { + "epoch": 0.763586226654968, + "grad_norm": 1.7663930654525757, + "learning_rate": 2.7906852878305567e-05, + "loss": 1.431, + "step": 21322 + }, + { + "epoch": 0.7636220387845363, + "grad_norm": 1.428335428237915, + "learning_rate": 2.789881517355969e-05, + "loss": 1.6087, + "step": 21323 + }, + { + "epoch": 0.7636578509141047, + "grad_norm": 1.546515703201294, + "learning_rate": 2.7890778438842214e-05, + "loss": 1.1567, + "step": 21324 + }, + { + "epoch": 0.7636936630436729, + "grad_norm": 2.3970510959625244, + "learning_rate": 2.7882742674261307e-05, + "loss": 1.8865, + "step": 21325 + }, + { + "epoch": 0.7637294751732412, + "grad_norm": 1.486477255821228, + "learning_rate": 2.78747078799251e-05, + "loss": 1.5473, + "step": 21326 + }, + { + "epoch": 0.7637652873028095, + "grad_norm": 1.832295298576355, + "learning_rate": 2.786667405594163e-05, + "loss": 1.3825, + "step": 21327 + }, + { + "epoch": 0.7638010994323777, + "grad_norm": 1.7424432039260864, + "learning_rate": 2.785864120241901e-05, + "loss": 1.503, + "step": 21328 + }, + { + "epoch": 0.763836911561946, + "grad_norm": 1.831028938293457, + "learning_rate": 2.7850609319465325e-05, + "loss": 1.5769, + "step": 21329 + }, + { + "epoch": 0.7638727236915143, + "grad_norm": 1.4432202577590942, + "learning_rate": 2.7842578407188656e-05, + "loss": 1.3591, + "step": 21330 + }, + { + "epoch": 0.7639085358210826, + "grad_norm": 1.3321956396102905, + "learning_rate": 2.7834548465696987e-05, + "loss": 1.4144, + "step": 21331 + }, + { + "epoch": 0.7639443479506509, + "grad_norm": 1.7596930265426636, + "learning_rate": 2.7826519495098378e-05, + "loss": 1.4145, + "step": 21332 + }, + { + "epoch": 0.7639801600802192, + "grad_norm": 1.5014865398406982, + "learning_rate": 2.7818491495500864e-05, + "loss": 1.4459, + "step": 21333 + }, + { + "epoch": 0.7640159722097875, + "grad_norm": 2.288329601287842, + "learning_rate": 2.7810464467012455e-05, + "loss": 1.7881, + "step": 21334 + }, + { + "epoch": 0.7640517843393557, + "grad_norm": 1.4313377141952515, + "learning_rate": 2.7802438409741106e-05, + "loss": 1.5355, + "step": 21335 + }, + { + "epoch": 0.764087596468924, + "grad_norm": 1.7000243663787842, + "learning_rate": 2.7794413323794822e-05, + "loss": 1.3548, + "step": 21336 + }, + { + "epoch": 0.7641234085984923, + "grad_norm": 1.5719764232635498, + "learning_rate": 2.7786389209281592e-05, + "loss": 1.3211, + "step": 21337 + }, + { + "epoch": 0.7641592207280606, + "grad_norm": 1.5126636028289795, + "learning_rate": 2.7778366066309326e-05, + "loss": 1.3154, + "step": 21338 + }, + { + "epoch": 0.7641950328576289, + "grad_norm": 1.7162196636199951, + "learning_rate": 2.7770343894985974e-05, + "loss": 1.4414, + "step": 21339 + }, + { + "epoch": 0.7642308449871972, + "grad_norm": 1.7152540683746338, + "learning_rate": 2.7762322695419485e-05, + "loss": 1.9311, + "step": 21340 + }, + { + "epoch": 0.7642666571167654, + "grad_norm": 2.3539340496063232, + "learning_rate": 2.7754302467717785e-05, + "loss": 1.6651, + "step": 21341 + }, + { + "epoch": 0.7643024692463337, + "grad_norm": 1.1298686265945435, + "learning_rate": 2.7746283211988734e-05, + "loss": 1.4936, + "step": 21342 + }, + { + "epoch": 0.764338281375902, + "grad_norm": 1.6960740089416504, + "learning_rate": 2.773826492834023e-05, + "loss": 1.3619, + "step": 21343 + }, + { + "epoch": 0.7643740935054703, + "grad_norm": 1.8697292804718018, + "learning_rate": 2.77302476168802e-05, + "loss": 1.4366, + "step": 21344 + }, + { + "epoch": 0.7644099056350386, + "grad_norm": 1.8246264457702637, + "learning_rate": 2.7722231277716437e-05, + "loss": 1.3414, + "step": 21345 + }, + { + "epoch": 0.7644457177646069, + "grad_norm": 2.1910572052001953, + "learning_rate": 2.771421591095682e-05, + "loss": 1.8065, + "step": 21346 + }, + { + "epoch": 0.7644815298941752, + "grad_norm": 1.7753005027770996, + "learning_rate": 2.7706201516709175e-05, + "loss": 1.0744, + "step": 21347 + }, + { + "epoch": 0.7645173420237434, + "grad_norm": 1.61532723903656, + "learning_rate": 2.769818809508138e-05, + "loss": 1.5135, + "step": 21348 + }, + { + "epoch": 0.7645531541533117, + "grad_norm": 1.4952168464660645, + "learning_rate": 2.769017564618117e-05, + "loss": 1.1487, + "step": 21349 + }, + { + "epoch": 0.76458896628288, + "grad_norm": 1.5021734237670898, + "learning_rate": 2.7682164170116365e-05, + "loss": 1.6482, + "step": 21350 + }, + { + "epoch": 0.7646247784124482, + "grad_norm": 1.436759114265442, + "learning_rate": 2.767415366699476e-05, + "loss": 1.1454, + "step": 21351 + }, + { + "epoch": 0.7646605905420166, + "grad_norm": 1.3185434341430664, + "learning_rate": 2.7666144136924166e-05, + "loss": 1.3639, + "step": 21352 + }, + { + "epoch": 0.7646964026715849, + "grad_norm": 1.667108416557312, + "learning_rate": 2.7658135580012256e-05, + "loss": 1.135, + "step": 21353 + }, + { + "epoch": 0.7647322148011532, + "grad_norm": 1.9361541271209717, + "learning_rate": 2.7650127996366826e-05, + "loss": 1.2538, + "step": 21354 + }, + { + "epoch": 0.7647680269307214, + "grad_norm": 1.9174108505249023, + "learning_rate": 2.764212138609562e-05, + "loss": 1.5039, + "step": 21355 + }, + { + "epoch": 0.7648038390602897, + "grad_norm": 2.271050214767456, + "learning_rate": 2.7634115749306312e-05, + "loss": 1.4584, + "step": 21356 + }, + { + "epoch": 0.764839651189858, + "grad_norm": 1.594559907913208, + "learning_rate": 2.762611108610663e-05, + "loss": 1.1316, + "step": 21357 + }, + { + "epoch": 0.7648754633194262, + "grad_norm": 1.610710859298706, + "learning_rate": 2.7618107396604263e-05, + "loss": 1.5176, + "step": 21358 + }, + { + "epoch": 0.7649112754489946, + "grad_norm": 1.653333306312561, + "learning_rate": 2.7610104680906933e-05, + "loss": 1.4896, + "step": 21359 + }, + { + "epoch": 0.7649470875785629, + "grad_norm": 1.548635721206665, + "learning_rate": 2.760210293912223e-05, + "loss": 1.7664, + "step": 21360 + }, + { + "epoch": 0.7649828997081312, + "grad_norm": 1.920652985572815, + "learning_rate": 2.759410217135786e-05, + "loss": 1.4373, + "step": 21361 + }, + { + "epoch": 0.7650187118376994, + "grad_norm": 1.9298456907272339, + "learning_rate": 2.7586102377721467e-05, + "loss": 1.2179, + "step": 21362 + }, + { + "epoch": 0.7650545239672677, + "grad_norm": 1.5186060667037964, + "learning_rate": 2.7578103558320623e-05, + "loss": 1.5287, + "step": 21363 + }, + { + "epoch": 0.765090336096836, + "grad_norm": 1.91496741771698, + "learning_rate": 2.7570105713262995e-05, + "loss": 1.4081, + "step": 21364 + }, + { + "epoch": 0.7651261482264042, + "grad_norm": 2.4558486938476562, + "learning_rate": 2.7562108842656152e-05, + "loss": 1.1442, + "step": 21365 + }, + { + "epoch": 0.7651619603559726, + "grad_norm": 1.984779715538025, + "learning_rate": 2.7554112946607735e-05, + "loss": 1.5647, + "step": 21366 + }, + { + "epoch": 0.7651977724855409, + "grad_norm": 1.4279773235321045, + "learning_rate": 2.7546118025225244e-05, + "loss": 1.4004, + "step": 21367 + }, + { + "epoch": 0.7652335846151092, + "grad_norm": 1.703660011291504, + "learning_rate": 2.7538124078616278e-05, + "loss": 1.4426, + "step": 21368 + }, + { + "epoch": 0.7652693967446774, + "grad_norm": 1.6676280498504639, + "learning_rate": 2.753013110688839e-05, + "loss": 1.3846, + "step": 21369 + }, + { + "epoch": 0.7653052088742457, + "grad_norm": 1.7314229011535645, + "learning_rate": 2.7522139110149125e-05, + "loss": 1.7917, + "step": 21370 + }, + { + "epoch": 0.765341021003814, + "grad_norm": 1.8637425899505615, + "learning_rate": 2.7514148088505998e-05, + "loss": 1.595, + "step": 21371 + }, + { + "epoch": 0.7653768331333822, + "grad_norm": 2.5171310901641846, + "learning_rate": 2.7506158042066454e-05, + "loss": 1.4561, + "step": 21372 + }, + { + "epoch": 0.7654126452629506, + "grad_norm": 1.657228708267212, + "learning_rate": 2.74981689709381e-05, + "loss": 1.5529, + "step": 21373 + }, + { + "epoch": 0.7654484573925189, + "grad_norm": 1.326586365699768, + "learning_rate": 2.749018087522832e-05, + "loss": 1.0958, + "step": 21374 + }, + { + "epoch": 0.7654842695220871, + "grad_norm": 1.817240595817566, + "learning_rate": 2.7482193755044637e-05, + "loss": 1.5557, + "step": 21375 + }, + { + "epoch": 0.7655200816516554, + "grad_norm": 1.442419171333313, + "learning_rate": 2.7474207610494495e-05, + "loss": 1.2791, + "step": 21376 + }, + { + "epoch": 0.7655558937812237, + "grad_norm": 1.3630636930465698, + "learning_rate": 2.7466222441685362e-05, + "loss": 1.1471, + "step": 21377 + }, + { + "epoch": 0.765591705910792, + "grad_norm": 1.8548414707183838, + "learning_rate": 2.7458238248724623e-05, + "loss": 1.5049, + "step": 21378 + }, + { + "epoch": 0.7656275180403602, + "grad_norm": 1.5222378969192505, + "learning_rate": 2.7450255031719707e-05, + "loss": 1.5707, + "step": 21379 + }, + { + "epoch": 0.7656633301699286, + "grad_norm": 1.5617293119430542, + "learning_rate": 2.7442272790778057e-05, + "loss": 1.1615, + "step": 21380 + }, + { + "epoch": 0.7656991422994969, + "grad_norm": 1.7830644845962524, + "learning_rate": 2.7434291526007004e-05, + "loss": 1.5264, + "step": 21381 + }, + { + "epoch": 0.7657349544290651, + "grad_norm": 1.7911914587020874, + "learning_rate": 2.742631123751399e-05, + "loss": 1.3538, + "step": 21382 + }, + { + "epoch": 0.7657707665586334, + "grad_norm": 1.432540774345398, + "learning_rate": 2.7418331925406293e-05, + "loss": 1.458, + "step": 21383 + }, + { + "epoch": 0.7658065786882017, + "grad_norm": 2.0374929904937744, + "learning_rate": 2.741035358979136e-05, + "loss": 1.5482, + "step": 21384 + }, + { + "epoch": 0.76584239081777, + "grad_norm": 1.5840460062026978, + "learning_rate": 2.7402376230776473e-05, + "loss": 1.3272, + "step": 21385 + }, + { + "epoch": 0.7658782029473382, + "grad_norm": 1.9470570087432861, + "learning_rate": 2.7394399848468953e-05, + "loss": 1.4448, + "step": 21386 + }, + { + "epoch": 0.7659140150769066, + "grad_norm": 1.5973190069198608, + "learning_rate": 2.7386424442976132e-05, + "loss": 1.3651, + "step": 21387 + }, + { + "epoch": 0.7659498272064749, + "grad_norm": 2.248100996017456, + "learning_rate": 2.7378450014405342e-05, + "loss": 1.4034, + "step": 21388 + }, + { + "epoch": 0.7659856393360431, + "grad_norm": 1.8361610174179077, + "learning_rate": 2.7370476562863835e-05, + "loss": 1.3061, + "step": 21389 + }, + { + "epoch": 0.7660214514656114, + "grad_norm": 1.7707544565200806, + "learning_rate": 2.7362504088458807e-05, + "loss": 1.3279, + "step": 21390 + }, + { + "epoch": 0.7660572635951797, + "grad_norm": 1.2263545989990234, + "learning_rate": 2.7354532591297666e-05, + "loss": 1.6046, + "step": 21391 + }, + { + "epoch": 0.7660930757247479, + "grad_norm": 1.5667316913604736, + "learning_rate": 2.7346562071487537e-05, + "loss": 1.3985, + "step": 21392 + }, + { + "epoch": 0.7661288878543162, + "grad_norm": 1.6585215330123901, + "learning_rate": 2.7338592529135744e-05, + "loss": 1.5769, + "step": 21393 + }, + { + "epoch": 0.7661646999838846, + "grad_norm": 1.4081734418869019, + "learning_rate": 2.7330623964349387e-05, + "loss": 1.5353, + "step": 21394 + }, + { + "epoch": 0.7662005121134529, + "grad_norm": 2.058746337890625, + "learning_rate": 2.732265637723582e-05, + "loss": 1.4592, + "step": 21395 + }, + { + "epoch": 0.7662363242430211, + "grad_norm": 1.8329404592514038, + "learning_rate": 2.7314689767902134e-05, + "loss": 1.3414, + "step": 21396 + }, + { + "epoch": 0.7662721363725894, + "grad_norm": 1.811803936958313, + "learning_rate": 2.7306724136455564e-05, + "loss": 1.3135, + "step": 21397 + }, + { + "epoch": 0.7663079485021577, + "grad_norm": 1.5890330076217651, + "learning_rate": 2.7298759483003223e-05, + "loss": 1.0912, + "step": 21398 + }, + { + "epoch": 0.7663437606317259, + "grad_norm": 1.5598218441009521, + "learning_rate": 2.7290795807652305e-05, + "loss": 1.7041, + "step": 21399 + }, + { + "epoch": 0.7663795727612942, + "grad_norm": 1.9628334045410156, + "learning_rate": 2.7282833110509952e-05, + "loss": 1.6035, + "step": 21400 + }, + { + "epoch": 0.7664153848908626, + "grad_norm": 1.3994742631912231, + "learning_rate": 2.7274871391683243e-05, + "loss": 1.0589, + "step": 21401 + }, + { + "epoch": 0.7664511970204309, + "grad_norm": 1.670101284980774, + "learning_rate": 2.7266910651279376e-05, + "loss": 1.189, + "step": 21402 + }, + { + "epoch": 0.7664870091499991, + "grad_norm": 1.4442775249481201, + "learning_rate": 2.725895088940539e-05, + "loss": 1.3845, + "step": 21403 + }, + { + "epoch": 0.7665228212795674, + "grad_norm": 1.8506652116775513, + "learning_rate": 2.7250992106168406e-05, + "loss": 1.6786, + "step": 21404 + }, + { + "epoch": 0.7665586334091357, + "grad_norm": 1.296434760093689, + "learning_rate": 2.724303430167543e-05, + "loss": 1.384, + "step": 21405 + }, + { + "epoch": 0.7665944455387039, + "grad_norm": 1.8313742876052856, + "learning_rate": 2.7235077476033645e-05, + "loss": 1.4286, + "step": 21406 + }, + { + "epoch": 0.7666302576682722, + "grad_norm": 1.7008599042892456, + "learning_rate": 2.7227121629350016e-05, + "loss": 1.5361, + "step": 21407 + }, + { + "epoch": 0.7666660697978406, + "grad_norm": 1.674019455909729, + "learning_rate": 2.7219166761731585e-05, + "loss": 1.6992, + "step": 21408 + }, + { + "epoch": 0.7667018819274088, + "grad_norm": 1.7322226762771606, + "learning_rate": 2.7211212873285376e-05, + "loss": 1.4695, + "step": 21409 + }, + { + "epoch": 0.7667376940569771, + "grad_norm": 1.8530694246292114, + "learning_rate": 2.72032599641184e-05, + "loss": 1.3144, + "step": 21410 + }, + { + "epoch": 0.7667735061865454, + "grad_norm": 1.8700464963912964, + "learning_rate": 2.7195308034337698e-05, + "loss": 1.2808, + "step": 21411 + }, + { + "epoch": 0.7668093183161137, + "grad_norm": 1.8522310256958008, + "learning_rate": 2.7187357084050147e-05, + "loss": 1.2198, + "step": 21412 + }, + { + "epoch": 0.7668451304456819, + "grad_norm": 2.2776641845703125, + "learning_rate": 2.7179407113362853e-05, + "loss": 1.6649, + "step": 21413 + }, + { + "epoch": 0.7668809425752502, + "grad_norm": 2.573241949081421, + "learning_rate": 2.7171458122382675e-05, + "loss": 1.4936, + "step": 21414 + }, + { + "epoch": 0.7669167547048186, + "grad_norm": 1.872636318206787, + "learning_rate": 2.7163510111216618e-05, + "loss": 1.4336, + "step": 21415 + }, + { + "epoch": 0.7669525668343868, + "grad_norm": 2.0823798179626465, + "learning_rate": 2.7155563079971535e-05, + "loss": 1.5788, + "step": 21416 + }, + { + "epoch": 0.7669883789639551, + "grad_norm": 1.9190607070922852, + "learning_rate": 2.71476170287544e-05, + "loss": 1.3688, + "step": 21417 + }, + { + "epoch": 0.7670241910935234, + "grad_norm": 1.5269092321395874, + "learning_rate": 2.713967195767214e-05, + "loss": 1.4576, + "step": 21418 + }, + { + "epoch": 0.7670600032230916, + "grad_norm": 1.5405995845794678, + "learning_rate": 2.713172786683157e-05, + "loss": 1.3906, + "step": 21419 + }, + { + "epoch": 0.7670958153526599, + "grad_norm": 1.7105742692947388, + "learning_rate": 2.712378475633961e-05, + "loss": 1.6096, + "step": 21420 + }, + { + "epoch": 0.7671316274822282, + "grad_norm": 1.8407995700836182, + "learning_rate": 2.7115842626303134e-05, + "loss": 1.4489, + "step": 21421 + }, + { + "epoch": 0.7671674396117966, + "grad_norm": 1.9620721340179443, + "learning_rate": 2.7107901476829e-05, + "loss": 1.6845, + "step": 21422 + }, + { + "epoch": 0.7672032517413648, + "grad_norm": 2.0660271644592285, + "learning_rate": 2.7099961308024004e-05, + "loss": 1.1152, + "step": 21423 + }, + { + "epoch": 0.7672390638709331, + "grad_norm": 1.7892515659332275, + "learning_rate": 2.7092022119994988e-05, + "loss": 1.3035, + "step": 21424 + }, + { + "epoch": 0.7672748760005014, + "grad_norm": 1.1042615175247192, + "learning_rate": 2.70840839128488e-05, + "loss": 1.2817, + "step": 21425 + }, + { + "epoch": 0.7673106881300696, + "grad_norm": 1.3992247581481934, + "learning_rate": 2.7076146686692184e-05, + "loss": 1.4926, + "step": 21426 + }, + { + "epoch": 0.7673465002596379, + "grad_norm": 2.207401990890503, + "learning_rate": 2.7068210441631947e-05, + "loss": 1.4668, + "step": 21427 + }, + { + "epoch": 0.7673823123892062, + "grad_norm": 1.5987458229064941, + "learning_rate": 2.7060275177774862e-05, + "loss": 1.3982, + "step": 21428 + }, + { + "epoch": 0.7674181245187746, + "grad_norm": 1.4975227117538452, + "learning_rate": 2.7052340895227714e-05, + "loss": 1.7043, + "step": 21429 + }, + { + "epoch": 0.7674539366483428, + "grad_norm": 1.8431814908981323, + "learning_rate": 2.7044407594097197e-05, + "loss": 1.618, + "step": 21430 + }, + { + "epoch": 0.7674897487779111, + "grad_norm": 1.6562138795852661, + "learning_rate": 2.703647527449007e-05, + "loss": 1.2938, + "step": 21431 + }, + { + "epoch": 0.7675255609074794, + "grad_norm": 1.7366337776184082, + "learning_rate": 2.7028543936513086e-05, + "loss": 1.255, + "step": 21432 + }, + { + "epoch": 0.7675613730370476, + "grad_norm": 1.6569148302078247, + "learning_rate": 2.7020613580272893e-05, + "loss": 1.4548, + "step": 21433 + }, + { + "epoch": 0.7675971851666159, + "grad_norm": 1.7070738077163696, + "learning_rate": 2.7012684205876192e-05, + "loss": 1.226, + "step": 21434 + }, + { + "epoch": 0.7676329972961842, + "grad_norm": 1.5093258619308472, + "learning_rate": 2.7004755813429683e-05, + "loss": 1.4913, + "step": 21435 + }, + { + "epoch": 0.7676688094257526, + "grad_norm": 1.564212441444397, + "learning_rate": 2.6996828403040064e-05, + "loss": 1.5233, + "step": 21436 + }, + { + "epoch": 0.7677046215553208, + "grad_norm": 1.472865343093872, + "learning_rate": 2.698890197481392e-05, + "loss": 1.4532, + "step": 21437 + }, + { + "epoch": 0.7677404336848891, + "grad_norm": 2.028477430343628, + "learning_rate": 2.6980976528857915e-05, + "loss": 1.3926, + "step": 21438 + }, + { + "epoch": 0.7677762458144574, + "grad_norm": 1.4569189548492432, + "learning_rate": 2.697305206527869e-05, + "loss": 1.2849, + "step": 21439 + }, + { + "epoch": 0.7678120579440256, + "grad_norm": 1.7602864503860474, + "learning_rate": 2.6965128584182886e-05, + "loss": 1.3607, + "step": 21440 + }, + { + "epoch": 0.7678478700735939, + "grad_norm": 1.7039145231246948, + "learning_rate": 2.6957206085677023e-05, + "loss": 1.6376, + "step": 21441 + }, + { + "epoch": 0.7678836822031622, + "grad_norm": 2.3543314933776855, + "learning_rate": 2.694928456986775e-05, + "loss": 1.5835, + "step": 21442 + }, + { + "epoch": 0.7679194943327305, + "grad_norm": 1.356101393699646, + "learning_rate": 2.6941364036861638e-05, + "loss": 1.0908, + "step": 21443 + }, + { + "epoch": 0.7679553064622988, + "grad_norm": 1.6495617628097534, + "learning_rate": 2.6933444486765212e-05, + "loss": 1.3593, + "step": 21444 + }, + { + "epoch": 0.7679911185918671, + "grad_norm": 1.7570487260818481, + "learning_rate": 2.6925525919685047e-05, + "loss": 1.6665, + "step": 21445 + }, + { + "epoch": 0.7680269307214354, + "grad_norm": 1.7626287937164307, + "learning_rate": 2.6917608335727675e-05, + "loss": 1.5255, + "step": 21446 + }, + { + "epoch": 0.7680627428510036, + "grad_norm": 1.4852346181869507, + "learning_rate": 2.6909691734999633e-05, + "loss": 1.3978, + "step": 21447 + }, + { + "epoch": 0.7680985549805719, + "grad_norm": 1.377875566482544, + "learning_rate": 2.690177611760738e-05, + "loss": 1.4612, + "step": 21448 + }, + { + "epoch": 0.7681343671101402, + "grad_norm": 1.5788637399673462, + "learning_rate": 2.6893861483657436e-05, + "loss": 1.529, + "step": 21449 + }, + { + "epoch": 0.7681701792397085, + "grad_norm": 1.2312066555023193, + "learning_rate": 2.688594783325632e-05, + "loss": 1.4575, + "step": 21450 + }, + { + "epoch": 0.7682059913692768, + "grad_norm": 2.2032153606414795, + "learning_rate": 2.687803516651044e-05, + "loss": 1.4401, + "step": 21451 + }, + { + "epoch": 0.7682418034988451, + "grad_norm": 1.4819837808609009, + "learning_rate": 2.6870123483526276e-05, + "loss": 1.329, + "step": 21452 + }, + { + "epoch": 0.7682776156284133, + "grad_norm": 1.5559954643249512, + "learning_rate": 2.6862212784410258e-05, + "loss": 1.3719, + "step": 21453 + }, + { + "epoch": 0.7683134277579816, + "grad_norm": 1.2472002506256104, + "learning_rate": 2.685430306926887e-05, + "loss": 1.3489, + "step": 21454 + }, + { + "epoch": 0.7683492398875499, + "grad_norm": 1.4845582246780396, + "learning_rate": 2.6846394338208446e-05, + "loss": 1.4089, + "step": 21455 + }, + { + "epoch": 0.7683850520171182, + "grad_norm": 2.007016181945801, + "learning_rate": 2.683848659133542e-05, + "loss": 1.4288, + "step": 21456 + }, + { + "epoch": 0.7684208641466865, + "grad_norm": 1.433982253074646, + "learning_rate": 2.68305798287562e-05, + "loss": 1.3565, + "step": 21457 + }, + { + "epoch": 0.7684566762762548, + "grad_norm": 1.5630964040756226, + "learning_rate": 2.682267405057717e-05, + "loss": 1.4758, + "step": 21458 + }, + { + "epoch": 0.7684924884058231, + "grad_norm": 2.2693796157836914, + "learning_rate": 2.6814769256904627e-05, + "loss": 1.3924, + "step": 21459 + }, + { + "epoch": 0.7685283005353913, + "grad_norm": 1.6751092672348022, + "learning_rate": 2.6806865447844974e-05, + "loss": 1.4485, + "step": 21460 + }, + { + "epoch": 0.7685641126649596, + "grad_norm": 1.8391053676605225, + "learning_rate": 2.6798962623504566e-05, + "loss": 1.6818, + "step": 21461 + }, + { + "epoch": 0.7685999247945279, + "grad_norm": 1.5685383081436157, + "learning_rate": 2.6791060783989653e-05, + "loss": 1.2592, + "step": 21462 + }, + { + "epoch": 0.7686357369240961, + "grad_norm": 1.350655436515808, + "learning_rate": 2.678315992940659e-05, + "loss": 1.3298, + "step": 21463 + }, + { + "epoch": 0.7686715490536645, + "grad_norm": 1.6182117462158203, + "learning_rate": 2.6775260059861673e-05, + "loss": 1.4552, + "step": 21464 + }, + { + "epoch": 0.7687073611832328, + "grad_norm": 1.374692440032959, + "learning_rate": 2.6767361175461202e-05, + "loss": 1.2673, + "step": 21465 + }, + { + "epoch": 0.7687431733128011, + "grad_norm": 1.6027882099151611, + "learning_rate": 2.6759463276311393e-05, + "loss": 1.2951, + "step": 21466 + }, + { + "epoch": 0.7687789854423693, + "grad_norm": 1.84087336063385, + "learning_rate": 2.675156636251853e-05, + "loss": 1.3431, + "step": 21467 + }, + { + "epoch": 0.7688147975719376, + "grad_norm": 2.3304686546325684, + "learning_rate": 2.6743670434188893e-05, + "loss": 1.6684, + "step": 21468 + }, + { + "epoch": 0.7688506097015059, + "grad_norm": 2.1128644943237305, + "learning_rate": 2.673577549142864e-05, + "loss": 1.5036, + "step": 21469 + }, + { + "epoch": 0.7688864218310741, + "grad_norm": 1.8388144969940186, + "learning_rate": 2.6727881534344057e-05, + "loss": 1.6742, + "step": 21470 + }, + { + "epoch": 0.7689222339606425, + "grad_norm": 1.973982810974121, + "learning_rate": 2.6719988563041264e-05, + "loss": 1.4739, + "step": 21471 + }, + { + "epoch": 0.7689580460902108, + "grad_norm": 1.7554025650024414, + "learning_rate": 2.6712096577626543e-05, + "loss": 1.4875, + "step": 21472 + }, + { + "epoch": 0.7689938582197791, + "grad_norm": 1.8182271718978882, + "learning_rate": 2.670420557820601e-05, + "loss": 1.8165, + "step": 21473 + }, + { + "epoch": 0.7690296703493473, + "grad_norm": 1.4545783996582031, + "learning_rate": 2.6696315564885844e-05, + "loss": 1.5724, + "step": 21474 + }, + { + "epoch": 0.7690654824789156, + "grad_norm": 1.3026014566421509, + "learning_rate": 2.6688426537772194e-05, + "loss": 1.5714, + "step": 21475 + }, + { + "epoch": 0.7691012946084839, + "grad_norm": 1.7563316822052002, + "learning_rate": 2.668053849697123e-05, + "loss": 1.3821, + "step": 21476 + }, + { + "epoch": 0.7691371067380521, + "grad_norm": 1.6058958768844604, + "learning_rate": 2.6672651442589046e-05, + "loss": 1.4226, + "step": 21477 + }, + { + "epoch": 0.7691729188676205, + "grad_norm": 1.446184754371643, + "learning_rate": 2.6664765374731693e-05, + "loss": 1.1693, + "step": 21478 + }, + { + "epoch": 0.7692087309971888, + "grad_norm": 1.7952556610107422, + "learning_rate": 2.665688029350538e-05, + "loss": 1.5924, + "step": 21479 + }, + { + "epoch": 0.769244543126757, + "grad_norm": 1.4149837493896484, + "learning_rate": 2.6648996199016118e-05, + "loss": 1.5261, + "step": 21480 + }, + { + "epoch": 0.7692803552563253, + "grad_norm": 2.5518479347229004, + "learning_rate": 2.6641113091370017e-05, + "loss": 1.4939, + "step": 21481 + }, + { + "epoch": 0.7693161673858936, + "grad_norm": 2.9114930629730225, + "learning_rate": 2.6633230970673062e-05, + "loss": 1.8433, + "step": 21482 + }, + { + "epoch": 0.7693519795154619, + "grad_norm": 1.5813169479370117, + "learning_rate": 2.66253498370314e-05, + "loss": 1.3981, + "step": 21483 + }, + { + "epoch": 0.7693877916450301, + "grad_norm": 1.501997947692871, + "learning_rate": 2.661746969055098e-05, + "loss": 1.425, + "step": 21484 + }, + { + "epoch": 0.7694236037745985, + "grad_norm": 1.4923934936523438, + "learning_rate": 2.660959053133786e-05, + "loss": 1.4173, + "step": 21485 + }, + { + "epoch": 0.7694594159041668, + "grad_norm": 1.368891954421997, + "learning_rate": 2.6601712359498045e-05, + "loss": 1.5218, + "step": 21486 + }, + { + "epoch": 0.769495228033735, + "grad_norm": 1.6902717351913452, + "learning_rate": 2.6593835175137494e-05, + "loss": 1.2863, + "step": 21487 + }, + { + "epoch": 0.7695310401633033, + "grad_norm": 1.7259552478790283, + "learning_rate": 2.6585958978362235e-05, + "loss": 1.4637, + "step": 21488 + }, + { + "epoch": 0.7695668522928716, + "grad_norm": 1.6730120182037354, + "learning_rate": 2.6578083769278127e-05, + "loss": 1.3071, + "step": 21489 + }, + { + "epoch": 0.7696026644224399, + "grad_norm": 2.149928331375122, + "learning_rate": 2.6570209547991265e-05, + "loss": 1.5697, + "step": 21490 + }, + { + "epoch": 0.7696384765520081, + "grad_norm": 2.8818540573120117, + "learning_rate": 2.6562336314607484e-05, + "loss": 1.3422, + "step": 21491 + }, + { + "epoch": 0.7696742886815765, + "grad_norm": 1.4298157691955566, + "learning_rate": 2.6554464069232776e-05, + "loss": 1.4364, + "step": 21492 + }, + { + "epoch": 0.7697101008111448, + "grad_norm": 1.7484629154205322, + "learning_rate": 2.6546592811972948e-05, + "loss": 1.5057, + "step": 21493 + }, + { + "epoch": 0.769745912940713, + "grad_norm": 2.24288272857666, + "learning_rate": 2.6538722542934035e-05, + "loss": 1.6272, + "step": 21494 + }, + { + "epoch": 0.7697817250702813, + "grad_norm": 1.6153521537780762, + "learning_rate": 2.6530853262221843e-05, + "loss": 1.3215, + "step": 21495 + }, + { + "epoch": 0.7698175371998496, + "grad_norm": 1.5541167259216309, + "learning_rate": 2.652298496994222e-05, + "loss": 1.3765, + "step": 21496 + }, + { + "epoch": 0.7698533493294178, + "grad_norm": 2.0340280532836914, + "learning_rate": 2.6515117666201062e-05, + "loss": 1.6024, + "step": 21497 + }, + { + "epoch": 0.7698891614589861, + "grad_norm": 1.580460786819458, + "learning_rate": 2.6507251351104212e-05, + "loss": 1.1933, + "step": 21498 + }, + { + "epoch": 0.7699249735885545, + "grad_norm": 1.4655922651290894, + "learning_rate": 2.649938602475751e-05, + "loss": 0.9808, + "step": 21499 + }, + { + "epoch": 0.7699607857181228, + "grad_norm": 1.751434326171875, + "learning_rate": 2.6491521687266717e-05, + "loss": 1.4647, + "step": 21500 + }, + { + "epoch": 0.769996597847691, + "grad_norm": 1.7684653997421265, + "learning_rate": 2.6483658338737726e-05, + "loss": 1.6803, + "step": 21501 + }, + { + "epoch": 0.7700324099772593, + "grad_norm": 1.5846799612045288, + "learning_rate": 2.6475795979276262e-05, + "loss": 1.5269, + "step": 21502 + }, + { + "epoch": 0.7700682221068276, + "grad_norm": 1.7030701637268066, + "learning_rate": 2.6467934608988155e-05, + "loss": 1.4744, + "step": 21503 + }, + { + "epoch": 0.7701040342363958, + "grad_norm": 2.3186452388763428, + "learning_rate": 2.6460074227979104e-05, + "loss": 1.5314, + "step": 21504 + }, + { + "epoch": 0.7701398463659641, + "grad_norm": 1.4595292806625366, + "learning_rate": 2.6452214836354893e-05, + "loss": 1.7642, + "step": 21505 + }, + { + "epoch": 0.7701756584955325, + "grad_norm": 1.770524501800537, + "learning_rate": 2.6444356434221296e-05, + "loss": 1.5098, + "step": 21506 + }, + { + "epoch": 0.7702114706251008, + "grad_norm": 3.266895055770874, + "learning_rate": 2.643649902168397e-05, + "loss": 1.4408, + "step": 21507 + }, + { + "epoch": 0.770247282754669, + "grad_norm": 1.3348801136016846, + "learning_rate": 2.6428642598848663e-05, + "loss": 1.5114, + "step": 21508 + }, + { + "epoch": 0.7702830948842373, + "grad_norm": 1.5498926639556885, + "learning_rate": 2.642078716582107e-05, + "loss": 1.3847, + "step": 21509 + }, + { + "epoch": 0.7703189070138056, + "grad_norm": 1.818127155303955, + "learning_rate": 2.6412932722706908e-05, + "loss": 1.3761, + "step": 21510 + }, + { + "epoch": 0.7703547191433738, + "grad_norm": 1.73329758644104, + "learning_rate": 2.6405079269611744e-05, + "loss": 1.3512, + "step": 21511 + }, + { + "epoch": 0.7703905312729421, + "grad_norm": 1.2978670597076416, + "learning_rate": 2.6397226806641375e-05, + "loss": 1.4232, + "step": 21512 + }, + { + "epoch": 0.7704263434025105, + "grad_norm": 1.5033830404281616, + "learning_rate": 2.6389375333901377e-05, + "loss": 1.3919, + "step": 21513 + }, + { + "epoch": 0.7704621555320788, + "grad_norm": 1.4706625938415527, + "learning_rate": 2.6381524851497353e-05, + "loss": 1.4614, + "step": 21514 + }, + { + "epoch": 0.770497967661647, + "grad_norm": 1.6710084676742554, + "learning_rate": 2.6373675359534955e-05, + "loss": 1.549, + "step": 21515 + }, + { + "epoch": 0.7705337797912153, + "grad_norm": 1.2906526327133179, + "learning_rate": 2.636582685811978e-05, + "loss": 1.4191, + "step": 21516 + }, + { + "epoch": 0.7705695919207836, + "grad_norm": 1.3973788022994995, + "learning_rate": 2.6357979347357454e-05, + "loss": 1.2873, + "step": 21517 + }, + { + "epoch": 0.7706054040503518, + "grad_norm": 1.4239815473556519, + "learning_rate": 2.635013282735349e-05, + "loss": 1.5933, + "step": 21518 + }, + { + "epoch": 0.7706412161799201, + "grad_norm": 2.066704273223877, + "learning_rate": 2.63422872982135e-05, + "loss": 1.3148, + "step": 21519 + }, + { + "epoch": 0.7706770283094885, + "grad_norm": 1.2484495639801025, + "learning_rate": 2.6334442760043044e-05, + "loss": 1.3429, + "step": 21520 + }, + { + "epoch": 0.7707128404390567, + "grad_norm": 1.6173884868621826, + "learning_rate": 2.632659921294761e-05, + "loss": 1.4515, + "step": 21521 + }, + { + "epoch": 0.770748652568625, + "grad_norm": 1.4764695167541504, + "learning_rate": 2.631875665703275e-05, + "loss": 1.4135, + "step": 21522 + }, + { + "epoch": 0.7707844646981933, + "grad_norm": 1.487092137336731, + "learning_rate": 2.6310915092403976e-05, + "loss": 1.2509, + "step": 21523 + }, + { + "epoch": 0.7708202768277616, + "grad_norm": 1.7480628490447998, + "learning_rate": 2.6303074519166827e-05, + "loss": 1.6969, + "step": 21524 + }, + { + "epoch": 0.7708560889573298, + "grad_norm": 1.725825309753418, + "learning_rate": 2.6295234937426706e-05, + "loss": 1.3108, + "step": 21525 + }, + { + "epoch": 0.7708919010868981, + "grad_norm": 1.542431116104126, + "learning_rate": 2.628739634728914e-05, + "loss": 1.0812, + "step": 21526 + }, + { + "epoch": 0.7709277132164665, + "grad_norm": 1.9049519300460815, + "learning_rate": 2.6279558748859555e-05, + "loss": 1.659, + "step": 21527 + }, + { + "epoch": 0.7709635253460347, + "grad_norm": 1.3463095426559448, + "learning_rate": 2.627172214224346e-05, + "loss": 1.5486, + "step": 21528 + }, + { + "epoch": 0.770999337475603, + "grad_norm": 2.2055890560150146, + "learning_rate": 2.626388652754621e-05, + "loss": 1.6108, + "step": 21529 + }, + { + "epoch": 0.7710351496051713, + "grad_norm": 1.8153146505355835, + "learning_rate": 2.6256051904873246e-05, + "loss": 1.1265, + "step": 21530 + }, + { + "epoch": 0.7710709617347395, + "grad_norm": 2.994877338409424, + "learning_rate": 2.6248218274330017e-05, + "loss": 1.6599, + "step": 21531 + }, + { + "epoch": 0.7711067738643078, + "grad_norm": 1.478348731994629, + "learning_rate": 2.6240385636021847e-05, + "loss": 1.4522, + "step": 21532 + }, + { + "epoch": 0.7711425859938761, + "grad_norm": 1.8544647693634033, + "learning_rate": 2.6232553990054144e-05, + "loss": 1.4124, + "step": 21533 + }, + { + "epoch": 0.7711783981234445, + "grad_norm": 1.606522798538208, + "learning_rate": 2.6224723336532274e-05, + "loss": 1.3303, + "step": 21534 + }, + { + "epoch": 0.7712142102530127, + "grad_norm": 1.5226876735687256, + "learning_rate": 2.6216893675561617e-05, + "loss": 1.5826, + "step": 21535 + }, + { + "epoch": 0.771250022382581, + "grad_norm": 1.7415231466293335, + "learning_rate": 2.6209065007247458e-05, + "loss": 1.3703, + "step": 21536 + }, + { + "epoch": 0.7712858345121493, + "grad_norm": 1.5941494703292847, + "learning_rate": 2.6201237331695138e-05, + "loss": 1.5155, + "step": 21537 + }, + { + "epoch": 0.7713216466417175, + "grad_norm": 1.5538673400878906, + "learning_rate": 2.619341064901001e-05, + "loss": 1.2863, + "step": 21538 + }, + { + "epoch": 0.7713574587712858, + "grad_norm": 1.6205617189407349, + "learning_rate": 2.6185584959297303e-05, + "loss": 1.4426, + "step": 21539 + }, + { + "epoch": 0.7713932709008541, + "grad_norm": 1.6439517736434937, + "learning_rate": 2.6177760262662345e-05, + "loss": 1.3722, + "step": 21540 + }, + { + "epoch": 0.7714290830304225, + "grad_norm": 2.5068721771240234, + "learning_rate": 2.6169936559210396e-05, + "loss": 1.5905, + "step": 21541 + }, + { + "epoch": 0.7714648951599907, + "grad_norm": 1.3637676239013672, + "learning_rate": 2.6162113849046745e-05, + "loss": 1.5606, + "step": 21542 + }, + { + "epoch": 0.771500707289559, + "grad_norm": 1.5745965242385864, + "learning_rate": 2.615429213227658e-05, + "loss": 1.7289, + "step": 21543 + }, + { + "epoch": 0.7715365194191273, + "grad_norm": 1.5057892799377441, + "learning_rate": 2.6146471409005158e-05, + "loss": 1.6678, + "step": 21544 + }, + { + "epoch": 0.7715723315486955, + "grad_norm": 1.9322911500930786, + "learning_rate": 2.61386516793377e-05, + "loss": 1.5913, + "step": 21545 + }, + { + "epoch": 0.7716081436782638, + "grad_norm": 1.3579450845718384, + "learning_rate": 2.6130832943379447e-05, + "loss": 1.5061, + "step": 21546 + }, + { + "epoch": 0.7716439558078321, + "grad_norm": 1.32883620262146, + "learning_rate": 2.612301520123551e-05, + "loss": 1.0265, + "step": 21547 + }, + { + "epoch": 0.7716797679374005, + "grad_norm": 1.9010030031204224, + "learning_rate": 2.6115198453011114e-05, + "loss": 1.4166, + "step": 21548 + }, + { + "epoch": 0.7717155800669687, + "grad_norm": 1.6484206914901733, + "learning_rate": 2.6107382698811446e-05, + "loss": 1.4505, + "step": 21549 + }, + { + "epoch": 0.771751392196537, + "grad_norm": 1.9972126483917236, + "learning_rate": 2.609956793874161e-05, + "loss": 1.6538, + "step": 21550 + }, + { + "epoch": 0.7717872043261053, + "grad_norm": 2.0636024475097656, + "learning_rate": 2.6091754172906747e-05, + "loss": 1.4478, + "step": 21551 + }, + { + "epoch": 0.7718230164556735, + "grad_norm": 1.371565341949463, + "learning_rate": 2.6083941401412005e-05, + "loss": 1.4717, + "step": 21552 + }, + { + "epoch": 0.7718588285852418, + "grad_norm": 1.405341386795044, + "learning_rate": 2.6076129624362512e-05, + "loss": 1.3875, + "step": 21553 + }, + { + "epoch": 0.7718946407148101, + "grad_norm": 1.5658798217773438, + "learning_rate": 2.6068318841863314e-05, + "loss": 1.1604, + "step": 21554 + }, + { + "epoch": 0.7719304528443784, + "grad_norm": 1.636956810951233, + "learning_rate": 2.6060509054019523e-05, + "loss": 1.1873, + "step": 21555 + }, + { + "epoch": 0.7719662649739467, + "grad_norm": 1.5121686458587646, + "learning_rate": 2.6052700260936237e-05, + "loss": 1.1696, + "step": 21556 + }, + { + "epoch": 0.772002077103515, + "grad_norm": 1.6735605001449585, + "learning_rate": 2.604489246271845e-05, + "loss": 1.6659, + "step": 21557 + }, + { + "epoch": 0.7720378892330833, + "grad_norm": 1.52809739112854, + "learning_rate": 2.6037085659471237e-05, + "loss": 1.4882, + "step": 21558 + }, + { + "epoch": 0.7720737013626515, + "grad_norm": 1.5789731740951538, + "learning_rate": 2.6029279851299636e-05, + "loss": 1.2542, + "step": 21559 + }, + { + "epoch": 0.7721095134922198, + "grad_norm": 1.3522050380706787, + "learning_rate": 2.6021475038308694e-05, + "loss": 1.6823, + "step": 21560 + }, + { + "epoch": 0.7721453256217881, + "grad_norm": 1.3396024703979492, + "learning_rate": 2.6013671220603343e-05, + "loss": 1.5947, + "step": 21561 + }, + { + "epoch": 0.7721811377513564, + "grad_norm": 1.3040772676467896, + "learning_rate": 2.6005868398288614e-05, + "loss": 1.1411, + "step": 21562 + }, + { + "epoch": 0.7722169498809247, + "grad_norm": 2.3642704486846924, + "learning_rate": 2.5998066571469482e-05, + "loss": 1.5471, + "step": 21563 + }, + { + "epoch": 0.772252762010493, + "grad_norm": 2.173736333847046, + "learning_rate": 2.599026574025093e-05, + "loss": 1.4395, + "step": 21564 + }, + { + "epoch": 0.7722885741400612, + "grad_norm": 1.3441381454467773, + "learning_rate": 2.5982465904737895e-05, + "loss": 1.3327, + "step": 21565 + }, + { + "epoch": 0.7723243862696295, + "grad_norm": 1.4685009717941284, + "learning_rate": 2.597466706503524e-05, + "loss": 1.6688, + "step": 21566 + }, + { + "epoch": 0.7723601983991978, + "grad_norm": 1.6783415079116821, + "learning_rate": 2.5966869221248013e-05, + "loss": 1.5029, + "step": 21567 + }, + { + "epoch": 0.772396010528766, + "grad_norm": 1.6194955110549927, + "learning_rate": 2.595907237348104e-05, + "loss": 1.4862, + "step": 21568 + }, + { + "epoch": 0.7724318226583344, + "grad_norm": 1.703182339668274, + "learning_rate": 2.595127652183924e-05, + "loss": 1.3713, + "step": 21569 + }, + { + "epoch": 0.7724676347879027, + "grad_norm": 1.670957088470459, + "learning_rate": 2.5943481666427506e-05, + "loss": 1.3126, + "step": 21570 + }, + { + "epoch": 0.772503446917471, + "grad_norm": 1.6317272186279297, + "learning_rate": 2.5935687807350718e-05, + "loss": 1.6463, + "step": 21571 + }, + { + "epoch": 0.7725392590470392, + "grad_norm": 1.457313895225525, + "learning_rate": 2.5927894944713695e-05, + "loss": 1.2296, + "step": 21572 + }, + { + "epoch": 0.7725750711766075, + "grad_norm": 2.3845486640930176, + "learning_rate": 2.5920103078621294e-05, + "loss": 1.4234, + "step": 21573 + }, + { + "epoch": 0.7726108833061758, + "grad_norm": 2.011991500854492, + "learning_rate": 2.591231220917837e-05, + "loss": 1.5692, + "step": 21574 + }, + { + "epoch": 0.772646695435744, + "grad_norm": 1.949367642402649, + "learning_rate": 2.59045223364897e-05, + "loss": 1.5255, + "step": 21575 + }, + { + "epoch": 0.7726825075653124, + "grad_norm": 1.472780704498291, + "learning_rate": 2.5896733460660138e-05, + "loss": 1.5582, + "step": 21576 + }, + { + "epoch": 0.7727183196948807, + "grad_norm": 1.6741362810134888, + "learning_rate": 2.5888945581794377e-05, + "loss": 1.4447, + "step": 21577 + }, + { + "epoch": 0.772754131824449, + "grad_norm": 1.7349146604537964, + "learning_rate": 2.5881158699997322e-05, + "loss": 1.4366, + "step": 21578 + }, + { + "epoch": 0.7727899439540172, + "grad_norm": 1.5743865966796875, + "learning_rate": 2.5873372815373633e-05, + "loss": 1.5798, + "step": 21579 + }, + { + "epoch": 0.7728257560835855, + "grad_norm": 1.3788039684295654, + "learning_rate": 2.5865587928028124e-05, + "loss": 1.1381, + "step": 21580 + }, + { + "epoch": 0.7728615682131538, + "grad_norm": 1.6148918867111206, + "learning_rate": 2.5857804038065446e-05, + "loss": 1.263, + "step": 21581 + }, + { + "epoch": 0.772897380342722, + "grad_norm": 1.67681086063385, + "learning_rate": 2.585002114559044e-05, + "loss": 1.6036, + "step": 21582 + }, + { + "epoch": 0.7729331924722904, + "grad_norm": 1.5975325107574463, + "learning_rate": 2.5842239250707757e-05, + "loss": 1.2015, + "step": 21583 + }, + { + "epoch": 0.7729690046018587, + "grad_norm": 1.4812545776367188, + "learning_rate": 2.5834458353522018e-05, + "loss": 1.3873, + "step": 21584 + }, + { + "epoch": 0.773004816731427, + "grad_norm": 1.8217486143112183, + "learning_rate": 2.5826678454138044e-05, + "loss": 1.6502, + "step": 21585 + }, + { + "epoch": 0.7730406288609952, + "grad_norm": 1.686937928199768, + "learning_rate": 2.5818899552660404e-05, + "loss": 1.5596, + "step": 21586 + }, + { + "epoch": 0.7730764409905635, + "grad_norm": 1.654982328414917, + "learning_rate": 2.5811121649193805e-05, + "loss": 1.3852, + "step": 21587 + }, + { + "epoch": 0.7731122531201318, + "grad_norm": 1.7891792058944702, + "learning_rate": 2.5803344743842817e-05, + "loss": 1.3776, + "step": 21588 + }, + { + "epoch": 0.7731480652497, + "grad_norm": 1.1618537902832031, + "learning_rate": 2.579556883671217e-05, + "loss": 1.4942, + "step": 21589 + }, + { + "epoch": 0.7731838773792683, + "grad_norm": 2.126289129257202, + "learning_rate": 2.578779392790641e-05, + "loss": 1.4412, + "step": 21590 + }, + { + "epoch": 0.7732196895088367, + "grad_norm": 1.313543438911438, + "learning_rate": 2.5780020017530182e-05, + "loss": 1.3064, + "step": 21591 + }, + { + "epoch": 0.773255501638405, + "grad_norm": 2.4040563106536865, + "learning_rate": 2.5772247105688006e-05, + "loss": 1.2878, + "step": 21592 + }, + { + "epoch": 0.7732913137679732, + "grad_norm": 1.7156052589416504, + "learning_rate": 2.5764475192484506e-05, + "loss": 1.463, + "step": 21593 + }, + { + "epoch": 0.7733271258975415, + "grad_norm": 1.546714425086975, + "learning_rate": 2.5756704278024268e-05, + "loss": 1.183, + "step": 21594 + }, + { + "epoch": 0.7733629380271098, + "grad_norm": 1.4804030656814575, + "learning_rate": 2.5748934362411747e-05, + "loss": 1.3276, + "step": 21595 + }, + { + "epoch": 0.773398750156678, + "grad_norm": 1.903746247291565, + "learning_rate": 2.574116544575159e-05, + "loss": 1.526, + "step": 21596 + }, + { + "epoch": 0.7734345622862463, + "grad_norm": 1.246662974357605, + "learning_rate": 2.573339752814825e-05, + "loss": 1.1375, + "step": 21597 + }, + { + "epoch": 0.7734703744158147, + "grad_norm": 2.333866834640503, + "learning_rate": 2.5725630609706264e-05, + "loss": 1.7398, + "step": 21598 + }, + { + "epoch": 0.773506186545383, + "grad_norm": 1.3863903284072876, + "learning_rate": 2.571786469053006e-05, + "loss": 1.7066, + "step": 21599 + }, + { + "epoch": 0.7735419986749512, + "grad_norm": 1.4405169486999512, + "learning_rate": 2.5710099770724227e-05, + "loss": 1.6633, + "step": 21600 + }, + { + "epoch": 0.7735778108045195, + "grad_norm": 1.563675880432129, + "learning_rate": 2.5702335850393166e-05, + "loss": 1.2315, + "step": 21601 + }, + { + "epoch": 0.7736136229340878, + "grad_norm": 1.655320644378662, + "learning_rate": 2.5694572929641326e-05, + "loss": 1.5362, + "step": 21602 + }, + { + "epoch": 0.773649435063656, + "grad_norm": 1.73147714138031, + "learning_rate": 2.5686811008573142e-05, + "loss": 1.2198, + "step": 21603 + }, + { + "epoch": 0.7736852471932243, + "grad_norm": 1.8005670309066772, + "learning_rate": 2.5679050087293067e-05, + "loss": 1.3593, + "step": 21604 + }, + { + "epoch": 0.7737210593227927, + "grad_norm": 1.3524696826934814, + "learning_rate": 2.5671290165905537e-05, + "loss": 1.5948, + "step": 21605 + }, + { + "epoch": 0.7737568714523609, + "grad_norm": 1.4053454399108887, + "learning_rate": 2.5663531244514892e-05, + "loss": 1.4189, + "step": 21606 + }, + { + "epoch": 0.7737926835819292, + "grad_norm": 1.6903241872787476, + "learning_rate": 2.5655773323225552e-05, + "loss": 1.6855, + "step": 21607 + }, + { + "epoch": 0.7738284957114975, + "grad_norm": 2.3848190307617188, + "learning_rate": 2.564801640214187e-05, + "loss": 1.481, + "step": 21608 + }, + { + "epoch": 0.7738643078410657, + "grad_norm": 1.8017464876174927, + "learning_rate": 2.564026048136826e-05, + "loss": 1.7375, + "step": 21609 + }, + { + "epoch": 0.773900119970634, + "grad_norm": 1.8137153387069702, + "learning_rate": 2.5632505561009002e-05, + "loss": 1.7897, + "step": 21610 + }, + { + "epoch": 0.7739359321002023, + "grad_norm": 1.650579810142517, + "learning_rate": 2.5624751641168442e-05, + "loss": 1.4311, + "step": 21611 + }, + { + "epoch": 0.7739717442297707, + "grad_norm": 1.6380149126052856, + "learning_rate": 2.5616998721950948e-05, + "loss": 1.357, + "step": 21612 + }, + { + "epoch": 0.7740075563593389, + "grad_norm": 1.4498231410980225, + "learning_rate": 2.5609246803460764e-05, + "loss": 1.6064, + "step": 21613 + }, + { + "epoch": 0.7740433684889072, + "grad_norm": 1.459731101989746, + "learning_rate": 2.5601495885802196e-05, + "loss": 1.58, + "step": 21614 + }, + { + "epoch": 0.7740791806184755, + "grad_norm": 2.7620160579681396, + "learning_rate": 2.559374596907954e-05, + "loss": 1.3054, + "step": 21615 + }, + { + "epoch": 0.7741149927480437, + "grad_norm": 1.4486792087554932, + "learning_rate": 2.5585997053397083e-05, + "loss": 1.7824, + "step": 21616 + }, + { + "epoch": 0.774150804877612, + "grad_norm": 1.2154338359832764, + "learning_rate": 2.5578249138859023e-05, + "loss": 1.4483, + "step": 21617 + }, + { + "epoch": 0.7741866170071803, + "grad_norm": 1.8012681007385254, + "learning_rate": 2.5570502225569625e-05, + "loss": 1.4442, + "step": 21618 + }, + { + "epoch": 0.7742224291367487, + "grad_norm": 2.442791700363159, + "learning_rate": 2.556275631363314e-05, + "loss": 1.7195, + "step": 21619 + }, + { + "epoch": 0.7742582412663169, + "grad_norm": 1.6836718320846558, + "learning_rate": 2.5555011403153715e-05, + "loss": 1.3161, + "step": 21620 + }, + { + "epoch": 0.7742940533958852, + "grad_norm": 1.6451094150543213, + "learning_rate": 2.5547267494235595e-05, + "loss": 1.1316, + "step": 21621 + }, + { + "epoch": 0.7743298655254535, + "grad_norm": 1.3877159357070923, + "learning_rate": 2.5539524586982944e-05, + "loss": 1.1024, + "step": 21622 + }, + { + "epoch": 0.7743656776550217, + "grad_norm": 1.4425606727600098, + "learning_rate": 2.553178268149997e-05, + "loss": 1.544, + "step": 21623 + }, + { + "epoch": 0.77440148978459, + "grad_norm": 1.7422723770141602, + "learning_rate": 2.5524041777890783e-05, + "loss": 1.1543, + "step": 21624 + }, + { + "epoch": 0.7744373019141583, + "grad_norm": 1.4674055576324463, + "learning_rate": 2.5516301876259542e-05, + "loss": 1.3218, + "step": 21625 + }, + { + "epoch": 0.7744731140437267, + "grad_norm": 1.6906684637069702, + "learning_rate": 2.5508562976710416e-05, + "loss": 1.2574, + "step": 21626 + }, + { + "epoch": 0.7745089261732949, + "grad_norm": 1.5787941217422485, + "learning_rate": 2.5500825079347458e-05, + "loss": 1.1405, + "step": 21627 + }, + { + "epoch": 0.7745447383028632, + "grad_norm": 2.0668768882751465, + "learning_rate": 2.5493088184274795e-05, + "loss": 1.4762, + "step": 21628 + }, + { + "epoch": 0.7745805504324315, + "grad_norm": 2.3916802406311035, + "learning_rate": 2.548535229159653e-05, + "loss": 1.322, + "step": 21629 + }, + { + "epoch": 0.7746163625619997, + "grad_norm": 1.847888708114624, + "learning_rate": 2.5477617401416765e-05, + "loss": 1.4714, + "step": 21630 + }, + { + "epoch": 0.774652174691568, + "grad_norm": 1.9050281047821045, + "learning_rate": 2.5469883513839498e-05, + "loss": 1.3579, + "step": 21631 + }, + { + "epoch": 0.7746879868211363, + "grad_norm": 1.6340993642807007, + "learning_rate": 2.5462150628968806e-05, + "loss": 1.6059, + "step": 21632 + }, + { + "epoch": 0.7747237989507046, + "grad_norm": 1.4270821809768677, + "learning_rate": 2.5454418746908737e-05, + "loss": 1.3574, + "step": 21633 + }, + { + "epoch": 0.7747596110802729, + "grad_norm": 2.2271409034729004, + "learning_rate": 2.544668786776333e-05, + "loss": 1.3854, + "step": 21634 + }, + { + "epoch": 0.7747954232098412, + "grad_norm": 1.591993808746338, + "learning_rate": 2.5438957991636546e-05, + "loss": 1.3642, + "step": 21635 + }, + { + "epoch": 0.7748312353394095, + "grad_norm": 1.3219497203826904, + "learning_rate": 2.5431229118632406e-05, + "loss": 1.3877, + "step": 21636 + }, + { + "epoch": 0.7748670474689777, + "grad_norm": 1.488739013671875, + "learning_rate": 2.542350124885492e-05, + "loss": 1.291, + "step": 21637 + }, + { + "epoch": 0.774902859598546, + "grad_norm": 1.77037513256073, + "learning_rate": 2.5415774382407997e-05, + "loss": 1.5163, + "step": 21638 + }, + { + "epoch": 0.7749386717281143, + "grad_norm": 1.768937587738037, + "learning_rate": 2.5408048519395622e-05, + "loss": 1.3945, + "step": 21639 + }, + { + "epoch": 0.7749744838576826, + "grad_norm": 1.5960173606872559, + "learning_rate": 2.5400323659921744e-05, + "loss": 1.361, + "step": 21640 + }, + { + "epoch": 0.7750102959872509, + "grad_norm": 2.2278168201446533, + "learning_rate": 2.539259980409031e-05, + "loss": 1.6758, + "step": 21641 + }, + { + "epoch": 0.7750461081168192, + "grad_norm": 2.10552978515625, + "learning_rate": 2.5384876952005177e-05, + "loss": 1.4251, + "step": 21642 + }, + { + "epoch": 0.7750819202463874, + "grad_norm": 1.4312798976898193, + "learning_rate": 2.537715510377028e-05, + "loss": 1.7312, + "step": 21643 + }, + { + "epoch": 0.7751177323759557, + "grad_norm": 1.418057918548584, + "learning_rate": 2.5369434259489534e-05, + "loss": 1.3847, + "step": 21644 + }, + { + "epoch": 0.775153544505524, + "grad_norm": 1.6209661960601807, + "learning_rate": 2.5361714419266757e-05, + "loss": 1.4997, + "step": 21645 + }, + { + "epoch": 0.7751893566350923, + "grad_norm": 1.4104539155960083, + "learning_rate": 2.5353995583205824e-05, + "loss": 1.4841, + "step": 21646 + }, + { + "epoch": 0.7752251687646606, + "grad_norm": 1.7481262683868408, + "learning_rate": 2.5346277751410607e-05, + "loss": 1.3121, + "step": 21647 + }, + { + "epoch": 0.7752609808942289, + "grad_norm": 1.7740089893341064, + "learning_rate": 2.5338560923984954e-05, + "loss": 1.8563, + "step": 21648 + }, + { + "epoch": 0.7752967930237972, + "grad_norm": 1.5107569694519043, + "learning_rate": 2.533084510103263e-05, + "loss": 1.2749, + "step": 21649 + }, + { + "epoch": 0.7753326051533654, + "grad_norm": 1.553844690322876, + "learning_rate": 2.532313028265746e-05, + "loss": 1.6047, + "step": 21650 + }, + { + "epoch": 0.7753684172829337, + "grad_norm": 1.7860406637191772, + "learning_rate": 2.531541646896325e-05, + "loss": 1.3046, + "step": 21651 + }, + { + "epoch": 0.775404229412502, + "grad_norm": 1.328852653503418, + "learning_rate": 2.5307703660053805e-05, + "loss": 1.031, + "step": 21652 + }, + { + "epoch": 0.7754400415420702, + "grad_norm": 1.684956669807434, + "learning_rate": 2.5299991856032835e-05, + "loss": 1.131, + "step": 21653 + }, + { + "epoch": 0.7754758536716386, + "grad_norm": 1.5441877841949463, + "learning_rate": 2.5292281057004108e-05, + "loss": 1.5972, + "step": 21654 + }, + { + "epoch": 0.7755116658012069, + "grad_norm": 2.099240303039551, + "learning_rate": 2.528457126307141e-05, + "loss": 1.5017, + "step": 21655 + }, + { + "epoch": 0.7755474779307752, + "grad_norm": 1.9487109184265137, + "learning_rate": 2.5276862474338404e-05, + "loss": 1.5184, + "step": 21656 + }, + { + "epoch": 0.7755832900603434, + "grad_norm": 1.8288156986236572, + "learning_rate": 2.5269154690908827e-05, + "loss": 1.393, + "step": 21657 + }, + { + "epoch": 0.7756191021899117, + "grad_norm": 1.4676835536956787, + "learning_rate": 2.526144791288637e-05, + "loss": 1.2301, + "step": 21658 + }, + { + "epoch": 0.77565491431948, + "grad_norm": 1.6494859457015991, + "learning_rate": 2.525374214037476e-05, + "loss": 1.4331, + "step": 21659 + }, + { + "epoch": 0.7756907264490482, + "grad_norm": 2.2830381393432617, + "learning_rate": 2.5246037373477606e-05, + "loss": 1.47, + "step": 21660 + }, + { + "epoch": 0.7757265385786166, + "grad_norm": 1.7719887495040894, + "learning_rate": 2.523833361229859e-05, + "loss": 1.7523, + "step": 21661 + }, + { + "epoch": 0.7757623507081849, + "grad_norm": 1.495537519454956, + "learning_rate": 2.5230630856941394e-05, + "loss": 1.2791, + "step": 21662 + }, + { + "epoch": 0.7757981628377532, + "grad_norm": 1.52877676486969, + "learning_rate": 2.5222929107509584e-05, + "loss": 1.5248, + "step": 21663 + }, + { + "epoch": 0.7758339749673214, + "grad_norm": 1.7707349061965942, + "learning_rate": 2.5215228364106835e-05, + "loss": 1.3937, + "step": 21664 + }, + { + "epoch": 0.7758697870968897, + "grad_norm": 1.9539955854415894, + "learning_rate": 2.5207528626836662e-05, + "loss": 1.7118, + "step": 21665 + }, + { + "epoch": 0.775905599226458, + "grad_norm": 1.8302944898605347, + "learning_rate": 2.5199829895802775e-05, + "loss": 1.5033, + "step": 21666 + }, + { + "epoch": 0.7759414113560262, + "grad_norm": 2.0510201454162598, + "learning_rate": 2.519213217110866e-05, + "loss": 1.6254, + "step": 21667 + }, + { + "epoch": 0.7759772234855946, + "grad_norm": 2.2383663654327393, + "learning_rate": 2.5184435452857913e-05, + "loss": 1.6945, + "step": 21668 + }, + { + "epoch": 0.7760130356151629, + "grad_norm": 1.3355016708374023, + "learning_rate": 2.517673974115409e-05, + "loss": 1.1414, + "step": 21669 + }, + { + "epoch": 0.7760488477447312, + "grad_norm": 1.586307406425476, + "learning_rate": 2.5169045036100736e-05, + "loss": 1.6073, + "step": 21670 + }, + { + "epoch": 0.7760846598742994, + "grad_norm": 1.2950676679611206, + "learning_rate": 2.5161351337801363e-05, + "loss": 1.399, + "step": 21671 + }, + { + "epoch": 0.7761204720038677, + "grad_norm": 1.7637110948562622, + "learning_rate": 2.5153658646359412e-05, + "loss": 1.2684, + "step": 21672 + }, + { + "epoch": 0.776156284133436, + "grad_norm": 1.1106634140014648, + "learning_rate": 2.51459669618785e-05, + "loss": 1.3017, + "step": 21673 + }, + { + "epoch": 0.7761920962630042, + "grad_norm": 1.6944717168807983, + "learning_rate": 2.5138276284462016e-05, + "loss": 1.1044, + "step": 21674 + }, + { + "epoch": 0.7762279083925726, + "grad_norm": 1.657623291015625, + "learning_rate": 2.513058661421349e-05, + "loss": 1.4931, + "step": 21675 + }, + { + "epoch": 0.7762637205221409, + "grad_norm": 1.4893158674240112, + "learning_rate": 2.512289795123629e-05, + "loss": 1.48, + "step": 21676 + }, + { + "epoch": 0.7762995326517091, + "grad_norm": 1.7651742696762085, + "learning_rate": 2.5115210295633974e-05, + "loss": 1.2103, + "step": 21677 + }, + { + "epoch": 0.7763353447812774, + "grad_norm": 2.091895580291748, + "learning_rate": 2.5107523647509877e-05, + "loss": 1.4103, + "step": 21678 + }, + { + "epoch": 0.7763711569108457, + "grad_norm": 1.813442587852478, + "learning_rate": 2.5099838006967446e-05, + "loss": 1.3978, + "step": 21679 + }, + { + "epoch": 0.776406969040414, + "grad_norm": 1.8207119703292847, + "learning_rate": 2.5092153374110107e-05, + "loss": 1.2788, + "step": 21680 + }, + { + "epoch": 0.7764427811699822, + "grad_norm": 1.47296142578125, + "learning_rate": 2.5084469749041185e-05, + "loss": 1.4844, + "step": 21681 + }, + { + "epoch": 0.7764785932995506, + "grad_norm": 1.7010948657989502, + "learning_rate": 2.5076787131864132e-05, + "loss": 1.5089, + "step": 21682 + }, + { + "epoch": 0.7765144054291189, + "grad_norm": 2.0861148834228516, + "learning_rate": 2.506910552268219e-05, + "loss": 1.2157, + "step": 21683 + }, + { + "epoch": 0.7765502175586871, + "grad_norm": 1.6660261154174805, + "learning_rate": 2.5061424921598853e-05, + "loss": 1.4986, + "step": 21684 + }, + { + "epoch": 0.7765860296882554, + "grad_norm": 1.3567243814468384, + "learning_rate": 2.5053745328717336e-05, + "loss": 1.2758, + "step": 21685 + }, + { + "epoch": 0.7766218418178237, + "grad_norm": 1.429267406463623, + "learning_rate": 2.504606674414104e-05, + "loss": 1.2731, + "step": 21686 + }, + { + "epoch": 0.776657653947392, + "grad_norm": 1.6606378555297852, + "learning_rate": 2.5038389167973177e-05, + "loss": 1.4486, + "step": 21687 + }, + { + "epoch": 0.7766934660769602, + "grad_norm": 1.906919002532959, + "learning_rate": 2.5030712600317143e-05, + "loss": 1.6785, + "step": 21688 + }, + { + "epoch": 0.7767292782065286, + "grad_norm": 1.29216468334198, + "learning_rate": 2.5023037041276175e-05, + "loss": 1.4627, + "step": 21689 + }, + { + "epoch": 0.7767650903360969, + "grad_norm": 2.332552433013916, + "learning_rate": 2.5015362490953497e-05, + "loss": 1.4408, + "step": 21690 + }, + { + "epoch": 0.7768009024656651, + "grad_norm": 1.672184705734253, + "learning_rate": 2.5007688949452402e-05, + "loss": 1.2021, + "step": 21691 + }, + { + "epoch": 0.7768367145952334, + "grad_norm": 1.5943087339401245, + "learning_rate": 2.5000016416876103e-05, + "loss": 1.4337, + "step": 21692 + }, + { + "epoch": 0.7768725267248017, + "grad_norm": 1.5827921628952026, + "learning_rate": 2.499234489332788e-05, + "loss": 1.4747, + "step": 21693 + }, + { + "epoch": 0.7769083388543699, + "grad_norm": 1.874254584312439, + "learning_rate": 2.4984674378910845e-05, + "loss": 1.3405, + "step": 21694 + }, + { + "epoch": 0.7769441509839382, + "grad_norm": 1.8608965873718262, + "learning_rate": 2.4977004873728315e-05, + "loss": 1.196, + "step": 21695 + }, + { + "epoch": 0.7769799631135066, + "grad_norm": 2.278493881225586, + "learning_rate": 2.496933637788338e-05, + "loss": 1.4088, + "step": 21696 + }, + { + "epoch": 0.7770157752430749, + "grad_norm": 1.8424022197723389, + "learning_rate": 2.496166889147926e-05, + "loss": 1.3471, + "step": 21697 + }, + { + "epoch": 0.7770515873726431, + "grad_norm": 1.493421196937561, + "learning_rate": 2.495400241461907e-05, + "loss": 1.5338, + "step": 21698 + }, + { + "epoch": 0.7770873995022114, + "grad_norm": 1.4196568727493286, + "learning_rate": 2.494633694740598e-05, + "loss": 1.4372, + "step": 21699 + }, + { + "epoch": 0.7771232116317797, + "grad_norm": 1.808488130569458, + "learning_rate": 2.4938672489943138e-05, + "loss": 1.3776, + "step": 21700 + }, + { + "epoch": 0.7771590237613479, + "grad_norm": 1.3471368551254272, + "learning_rate": 2.493100904233361e-05, + "loss": 1.3275, + "step": 21701 + }, + { + "epoch": 0.7771948358909162, + "grad_norm": 1.5173574686050415, + "learning_rate": 2.4923346604680532e-05, + "loss": 1.1689, + "step": 21702 + }, + { + "epoch": 0.7772306480204846, + "grad_norm": 1.4176759719848633, + "learning_rate": 2.4915685177086967e-05, + "loss": 1.2773, + "step": 21703 + }, + { + "epoch": 0.7772664601500529, + "grad_norm": 1.5612668991088867, + "learning_rate": 2.4908024759656046e-05, + "loss": 1.5212, + "step": 21704 + }, + { + "epoch": 0.7773022722796211, + "grad_norm": 1.379515528678894, + "learning_rate": 2.490036535249073e-05, + "loss": 1.3418, + "step": 21705 + }, + { + "epoch": 0.7773380844091894, + "grad_norm": 1.7329143285751343, + "learning_rate": 2.489270695569418e-05, + "loss": 1.4872, + "step": 21706 + }, + { + "epoch": 0.7773738965387577, + "grad_norm": 1.6147024631500244, + "learning_rate": 2.4885049569369378e-05, + "loss": 1.492, + "step": 21707 + }, + { + "epoch": 0.7774097086683259, + "grad_norm": 1.3835676908493042, + "learning_rate": 2.4877393193619315e-05, + "loss": 1.2585, + "step": 21708 + }, + { + "epoch": 0.7774455207978942, + "grad_norm": 1.4012019634246826, + "learning_rate": 2.4869737828547024e-05, + "loss": 1.4471, + "step": 21709 + }, + { + "epoch": 0.7774813329274626, + "grad_norm": 1.916560173034668, + "learning_rate": 2.4862083474255503e-05, + "loss": 1.3948, + "step": 21710 + }, + { + "epoch": 0.7775171450570308, + "grad_norm": 2.5519630908966064, + "learning_rate": 2.485443013084775e-05, + "loss": 1.78, + "step": 21711 + }, + { + "epoch": 0.7775529571865991, + "grad_norm": 1.7727234363555908, + "learning_rate": 2.484677779842669e-05, + "loss": 1.4787, + "step": 21712 + }, + { + "epoch": 0.7775887693161674, + "grad_norm": 1.8907781839370728, + "learning_rate": 2.4839126477095287e-05, + "loss": 1.3687, + "step": 21713 + }, + { + "epoch": 0.7776245814457357, + "grad_norm": 1.9135130643844604, + "learning_rate": 2.4831476166956515e-05, + "loss": 1.402, + "step": 21714 + }, + { + "epoch": 0.7776603935753039, + "grad_norm": 1.6929961442947388, + "learning_rate": 2.482382686811324e-05, + "loss": 1.4129, + "step": 21715 + }, + { + "epoch": 0.7776962057048722, + "grad_norm": 1.5624363422393799, + "learning_rate": 2.4816178580668415e-05, + "loss": 1.4647, + "step": 21716 + }, + { + "epoch": 0.7777320178344406, + "grad_norm": 1.6534024477005005, + "learning_rate": 2.4808531304724913e-05, + "loss": 1.497, + "step": 21717 + }, + { + "epoch": 0.7777678299640088, + "grad_norm": 1.207729458808899, + "learning_rate": 2.4800885040385668e-05, + "loss": 1.4953, + "step": 21718 + }, + { + "epoch": 0.7778036420935771, + "grad_norm": 1.8488044738769531, + "learning_rate": 2.4793239787753487e-05, + "loss": 1.4934, + "step": 21719 + }, + { + "epoch": 0.7778394542231454, + "grad_norm": 1.6745057106018066, + "learning_rate": 2.478559554693125e-05, + "loss": 1.3223, + "step": 21720 + }, + { + "epoch": 0.7778752663527136, + "grad_norm": 1.441912293434143, + "learning_rate": 2.4777952318021814e-05, + "loss": 1.5393, + "step": 21721 + }, + { + "epoch": 0.7779110784822819, + "grad_norm": 1.9352595806121826, + "learning_rate": 2.4770310101128026e-05, + "loss": 1.526, + "step": 21722 + }, + { + "epoch": 0.7779468906118502, + "grad_norm": 1.4422829151153564, + "learning_rate": 2.476266889635265e-05, + "loss": 1.6112, + "step": 21723 + }, + { + "epoch": 0.7779827027414186, + "grad_norm": 1.7226628065109253, + "learning_rate": 2.475502870379851e-05, + "loss": 1.1627, + "step": 21724 + }, + { + "epoch": 0.7780185148709868, + "grad_norm": 2.2136683464050293, + "learning_rate": 2.474738952356842e-05, + "loss": 1.2243, + "step": 21725 + }, + { + "epoch": 0.7780543270005551, + "grad_norm": 1.4644486904144287, + "learning_rate": 2.4739751355765116e-05, + "loss": 1.4847, + "step": 21726 + }, + { + "epoch": 0.7780901391301234, + "grad_norm": 1.2714347839355469, + "learning_rate": 2.4732114200491386e-05, + "loss": 1.321, + "step": 21727 + }, + { + "epoch": 0.7781259512596916, + "grad_norm": 1.5196048021316528, + "learning_rate": 2.4724478057849965e-05, + "loss": 1.4103, + "step": 21728 + }, + { + "epoch": 0.7781617633892599, + "grad_norm": 1.713654637336731, + "learning_rate": 2.4716842927943617e-05, + "loss": 1.4538, + "step": 21729 + }, + { + "epoch": 0.7781975755188282, + "grad_norm": 1.7022624015808105, + "learning_rate": 2.4709208810875017e-05, + "loss": 1.7523, + "step": 21730 + }, + { + "epoch": 0.7782333876483966, + "grad_norm": 1.4414794445037842, + "learning_rate": 2.4701575706746882e-05, + "loss": 1.2275, + "step": 21731 + }, + { + "epoch": 0.7782691997779648, + "grad_norm": 1.515198826789856, + "learning_rate": 2.4693943615661963e-05, + "loss": 1.3186, + "step": 21732 + }, + { + "epoch": 0.7783050119075331, + "grad_norm": 1.7945395708084106, + "learning_rate": 2.4686312537722855e-05, + "loss": 1.4992, + "step": 21733 + }, + { + "epoch": 0.7783408240371014, + "grad_norm": 1.8566440343856812, + "learning_rate": 2.4678682473032267e-05, + "loss": 1.4913, + "step": 21734 + }, + { + "epoch": 0.7783766361666696, + "grad_norm": 1.767237901687622, + "learning_rate": 2.4671053421692845e-05, + "loss": 1.3864, + "step": 21735 + }, + { + "epoch": 0.7784124482962379, + "grad_norm": 1.450467586517334, + "learning_rate": 2.466342538380727e-05, + "loss": 1.5967, + "step": 21736 + }, + { + "epoch": 0.7784482604258062, + "grad_norm": 2.0914173126220703, + "learning_rate": 2.46557983594781e-05, + "loss": 1.4615, + "step": 21737 + }, + { + "epoch": 0.7784840725553746, + "grad_norm": 1.6237757205963135, + "learning_rate": 2.4648172348807963e-05, + "loss": 1.4614, + "step": 21738 + }, + { + "epoch": 0.7785198846849428, + "grad_norm": 1.7426936626434326, + "learning_rate": 2.464054735189948e-05, + "loss": 1.3176, + "step": 21739 + }, + { + "epoch": 0.7785556968145111, + "grad_norm": 1.7049877643585205, + "learning_rate": 2.4632923368855254e-05, + "loss": 1.2699, + "step": 21740 + }, + { + "epoch": 0.7785915089440794, + "grad_norm": 1.7951089143753052, + "learning_rate": 2.4625300399777806e-05, + "loss": 1.4016, + "step": 21741 + }, + { + "epoch": 0.7786273210736476, + "grad_norm": 1.667818546295166, + "learning_rate": 2.46176784447697e-05, + "loss": 1.3115, + "step": 21742 + }, + { + "epoch": 0.7786631332032159, + "grad_norm": 1.8031182289123535, + "learning_rate": 2.4610057503933537e-05, + "loss": 1.5707, + "step": 21743 + }, + { + "epoch": 0.7786989453327842, + "grad_norm": 1.7315683364868164, + "learning_rate": 2.4602437577371763e-05, + "loss": 1.3129, + "step": 21744 + }, + { + "epoch": 0.7787347574623525, + "grad_norm": 1.3032207489013672, + "learning_rate": 2.4594818665186937e-05, + "loss": 1.3991, + "step": 21745 + }, + { + "epoch": 0.7787705695919208, + "grad_norm": 1.8537158966064453, + "learning_rate": 2.4587200767481565e-05, + "loss": 1.1288, + "step": 21746 + }, + { + "epoch": 0.7788063817214891, + "grad_norm": 2.1073696613311768, + "learning_rate": 2.457958388435816e-05, + "loss": 1.5075, + "step": 21747 + }, + { + "epoch": 0.7788421938510574, + "grad_norm": 1.4380515813827515, + "learning_rate": 2.4571968015919144e-05, + "loss": 1.2178, + "step": 21748 + }, + { + "epoch": 0.7788780059806256, + "grad_norm": 1.5372939109802246, + "learning_rate": 2.4564353162266996e-05, + "loss": 1.037, + "step": 21749 + }, + { + "epoch": 0.7789138181101939, + "grad_norm": 1.7316334247589111, + "learning_rate": 2.4556739323504195e-05, + "loss": 1.3958, + "step": 21750 + }, + { + "epoch": 0.7789496302397622, + "grad_norm": 2.508051633834839, + "learning_rate": 2.454912649973313e-05, + "loss": 1.6857, + "step": 21751 + }, + { + "epoch": 0.7789854423693305, + "grad_norm": 1.7578423023223877, + "learning_rate": 2.4541514691056245e-05, + "loss": 1.3476, + "step": 21752 + }, + { + "epoch": 0.7790212544988988, + "grad_norm": 2.091949462890625, + "learning_rate": 2.453390389757595e-05, + "loss": 1.594, + "step": 21753 + }, + { + "epoch": 0.7790570666284671, + "grad_norm": 1.5613199472427368, + "learning_rate": 2.4526294119394653e-05, + "loss": 1.5924, + "step": 21754 + }, + { + "epoch": 0.7790928787580353, + "grad_norm": 1.5921406745910645, + "learning_rate": 2.451868535661469e-05, + "loss": 1.3637, + "step": 21755 + }, + { + "epoch": 0.7791286908876036, + "grad_norm": 1.6087360382080078, + "learning_rate": 2.451107760933845e-05, + "loss": 1.5249, + "step": 21756 + }, + { + "epoch": 0.7791645030171719, + "grad_norm": 1.8573392629623413, + "learning_rate": 2.4503470877668287e-05, + "loss": 1.5981, + "step": 21757 + }, + { + "epoch": 0.7792003151467402, + "grad_norm": 1.5626813173294067, + "learning_rate": 2.4495865161706567e-05, + "loss": 1.3329, + "step": 21758 + }, + { + "epoch": 0.7792361272763085, + "grad_norm": 2.4538016319274902, + "learning_rate": 2.448826046155559e-05, + "loss": 1.4858, + "step": 21759 + }, + { + "epoch": 0.7792719394058768, + "grad_norm": 1.3803672790527344, + "learning_rate": 2.4480656777317613e-05, + "loss": 1.4366, + "step": 21760 + }, + { + "epoch": 0.7793077515354451, + "grad_norm": 1.7788008451461792, + "learning_rate": 2.447305410909504e-05, + "loss": 1.3099, + "step": 21761 + }, + { + "epoch": 0.7793435636650133, + "grad_norm": 1.5982377529144287, + "learning_rate": 2.4465452456990067e-05, + "loss": 1.4482, + "step": 21762 + }, + { + "epoch": 0.7793793757945816, + "grad_norm": 1.2722399234771729, + "learning_rate": 2.4457851821105006e-05, + "loss": 1.6228, + "step": 21763 + }, + { + "epoch": 0.7794151879241499, + "grad_norm": 1.843069314956665, + "learning_rate": 2.4450252201542102e-05, + "loss": 1.3791, + "step": 21764 + }, + { + "epoch": 0.7794510000537181, + "grad_norm": 1.3950681686401367, + "learning_rate": 2.444265359840363e-05, + "loss": 1.3212, + "step": 21765 + }, + { + "epoch": 0.7794868121832865, + "grad_norm": 1.4858050346374512, + "learning_rate": 2.4435056011791768e-05, + "loss": 1.3646, + "step": 21766 + }, + { + "epoch": 0.7795226243128548, + "grad_norm": 1.639343023300171, + "learning_rate": 2.4427459441808754e-05, + "loss": 1.739, + "step": 21767 + }, + { + "epoch": 0.7795584364424231, + "grad_norm": 1.5904186964035034, + "learning_rate": 2.4419863888556815e-05, + "loss": 1.3763, + "step": 21768 + }, + { + "epoch": 0.7795942485719913, + "grad_norm": 1.7701135873794556, + "learning_rate": 2.4412269352138097e-05, + "loss": 1.5168, + "step": 21769 + }, + { + "epoch": 0.7796300607015596, + "grad_norm": 1.9620040655136108, + "learning_rate": 2.4404675832654812e-05, + "loss": 1.421, + "step": 21770 + }, + { + "epoch": 0.7796658728311279, + "grad_norm": 1.3738888502120972, + "learning_rate": 2.4397083330209046e-05, + "loss": 1.1076, + "step": 21771 + }, + { + "epoch": 0.7797016849606961, + "grad_norm": 1.2282766103744507, + "learning_rate": 2.438949184490307e-05, + "loss": 1.3747, + "step": 21772 + }, + { + "epoch": 0.7797374970902645, + "grad_norm": 1.7531875371932983, + "learning_rate": 2.438190137683891e-05, + "loss": 1.4378, + "step": 21773 + }, + { + "epoch": 0.7797733092198328, + "grad_norm": 1.9136475324630737, + "learning_rate": 2.4374311926118765e-05, + "loss": 1.4021, + "step": 21774 + }, + { + "epoch": 0.7798091213494011, + "grad_norm": 1.4294815063476562, + "learning_rate": 2.4366723492844644e-05, + "loss": 1.5454, + "step": 21775 + }, + { + "epoch": 0.7798449334789693, + "grad_norm": 1.2743984460830688, + "learning_rate": 2.435913607711876e-05, + "loss": 1.4154, + "step": 21776 + }, + { + "epoch": 0.7798807456085376, + "grad_norm": 1.6209650039672852, + "learning_rate": 2.4351549679043118e-05, + "loss": 1.8038, + "step": 21777 + }, + { + "epoch": 0.7799165577381059, + "grad_norm": 2.175903081893921, + "learning_rate": 2.4343964298719746e-05, + "loss": 1.2831, + "step": 21778 + }, + { + "epoch": 0.7799523698676741, + "grad_norm": 1.389844298362732, + "learning_rate": 2.4336379936250808e-05, + "loss": 1.459, + "step": 21779 + }, + { + "epoch": 0.7799881819972425, + "grad_norm": 1.7427324056625366, + "learning_rate": 2.4328796591738236e-05, + "loss": 1.4667, + "step": 21780 + }, + { + "epoch": 0.7800239941268108, + "grad_norm": 2.2379322052001953, + "learning_rate": 2.432121426528414e-05, + "loss": 1.25, + "step": 21781 + }, + { + "epoch": 0.780059806256379, + "grad_norm": 1.569846510887146, + "learning_rate": 2.431363295699042e-05, + "loss": 1.5934, + "step": 21782 + }, + { + "epoch": 0.7800956183859473, + "grad_norm": 1.7577472925186157, + "learning_rate": 2.43060526669592e-05, + "loss": 1.2713, + "step": 21783 + }, + { + "epoch": 0.7801314305155156, + "grad_norm": 1.8201638460159302, + "learning_rate": 2.4298473395292378e-05, + "loss": 1.416, + "step": 21784 + }, + { + "epoch": 0.7801672426450839, + "grad_norm": 2.0842766761779785, + "learning_rate": 2.4290895142091974e-05, + "loss": 1.3904, + "step": 21785 + }, + { + "epoch": 0.7802030547746521, + "grad_norm": 1.799854040145874, + "learning_rate": 2.428331790745989e-05, + "loss": 1.6248, + "step": 21786 + }, + { + "epoch": 0.7802388669042205, + "grad_norm": 1.5186867713928223, + "learning_rate": 2.42757416914981e-05, + "loss": 1.6752, + "step": 21787 + }, + { + "epoch": 0.7802746790337888, + "grad_norm": 1.4737837314605713, + "learning_rate": 2.4268166494308553e-05, + "loss": 1.4571, + "step": 21788 + }, + { + "epoch": 0.780310491163357, + "grad_norm": 1.4512406587600708, + "learning_rate": 2.426059231599308e-05, + "loss": 1.2519, + "step": 21789 + }, + { + "epoch": 0.7803463032929253, + "grad_norm": 1.9012922048568726, + "learning_rate": 2.42530191566537e-05, + "loss": 1.3534, + "step": 21790 + }, + { + "epoch": 0.7803821154224936, + "grad_norm": 1.6925349235534668, + "learning_rate": 2.4245447016392207e-05, + "loss": 1.4462, + "step": 21791 + }, + { + "epoch": 0.7804179275520619, + "grad_norm": 1.83090078830719, + "learning_rate": 2.4237875895310548e-05, + "loss": 1.4723, + "step": 21792 + }, + { + "epoch": 0.7804537396816301, + "grad_norm": 1.7170168161392212, + "learning_rate": 2.4230305793510478e-05, + "loss": 1.6161, + "step": 21793 + }, + { + "epoch": 0.7804895518111985, + "grad_norm": 1.244886040687561, + "learning_rate": 2.4222736711093964e-05, + "loss": 1.4318, + "step": 21794 + }, + { + "epoch": 0.7805253639407668, + "grad_norm": 1.9996095895767212, + "learning_rate": 2.4215168648162778e-05, + "loss": 1.3049, + "step": 21795 + }, + { + "epoch": 0.780561176070335, + "grad_norm": 1.6705350875854492, + "learning_rate": 2.420760160481872e-05, + "loss": 1.4626, + "step": 21796 + }, + { + "epoch": 0.7805969881999033, + "grad_norm": 1.3955320119857788, + "learning_rate": 2.4200035581163614e-05, + "loss": 1.4416, + "step": 21797 + }, + { + "epoch": 0.7806328003294716, + "grad_norm": 2.089934825897217, + "learning_rate": 2.4192470577299263e-05, + "loss": 1.2793, + "step": 21798 + }, + { + "epoch": 0.7806686124590398, + "grad_norm": 1.3678085803985596, + "learning_rate": 2.418490659332746e-05, + "loss": 1.2663, + "step": 21799 + }, + { + "epoch": 0.7807044245886081, + "grad_norm": 1.447238564491272, + "learning_rate": 2.4177343629349912e-05, + "loss": 1.412, + "step": 21800 + }, + { + "epoch": 0.7807402367181765, + "grad_norm": 2.202078342437744, + "learning_rate": 2.4169781685468407e-05, + "loss": 1.4239, + "step": 21801 + }, + { + "epoch": 0.7807760488477448, + "grad_norm": 1.9966267347335815, + "learning_rate": 2.416222076178467e-05, + "loss": 1.31, + "step": 21802 + }, + { + "epoch": 0.780811860977313, + "grad_norm": 2.0359063148498535, + "learning_rate": 2.4154660858400456e-05, + "loss": 1.5723, + "step": 21803 + }, + { + "epoch": 0.7808476731068813, + "grad_norm": 1.8663406372070312, + "learning_rate": 2.414710197541743e-05, + "loss": 1.5869, + "step": 21804 + }, + { + "epoch": 0.7808834852364496, + "grad_norm": 1.834692120552063, + "learning_rate": 2.4139544112937283e-05, + "loss": 1.5305, + "step": 21805 + }, + { + "epoch": 0.7809192973660178, + "grad_norm": 1.34640634059906, + "learning_rate": 2.413198727106176e-05, + "loss": 1.6293, + "step": 21806 + }, + { + "epoch": 0.7809551094955861, + "grad_norm": 2.504323959350586, + "learning_rate": 2.412443144989246e-05, + "loss": 1.2204, + "step": 21807 + }, + { + "epoch": 0.7809909216251545, + "grad_norm": 2.4626822471618652, + "learning_rate": 2.411687664953106e-05, + "loss": 1.5398, + "step": 21808 + }, + { + "epoch": 0.7810267337547228, + "grad_norm": 1.4830766916275024, + "learning_rate": 2.41093228700792e-05, + "loss": 1.4997, + "step": 21809 + }, + { + "epoch": 0.781062545884291, + "grad_norm": 1.2905964851379395, + "learning_rate": 2.4101770111638534e-05, + "loss": 1.2161, + "step": 21810 + }, + { + "epoch": 0.7810983580138593, + "grad_norm": 1.5993380546569824, + "learning_rate": 2.409421837431063e-05, + "loss": 1.3006, + "step": 21811 + }, + { + "epoch": 0.7811341701434276, + "grad_norm": 1.530030608177185, + "learning_rate": 2.4086667658197093e-05, + "loss": 1.3426, + "step": 21812 + }, + { + "epoch": 0.7811699822729958, + "grad_norm": 1.3062310218811035, + "learning_rate": 2.4079117963399554e-05, + "loss": 1.4719, + "step": 21813 + }, + { + "epoch": 0.7812057944025641, + "grad_norm": 1.582737684249878, + "learning_rate": 2.4071569290019535e-05, + "loss": 1.2686, + "step": 21814 + }, + { + "epoch": 0.7812416065321325, + "grad_norm": 1.5829455852508545, + "learning_rate": 2.4064021638158596e-05, + "loss": 1.7681, + "step": 21815 + }, + { + "epoch": 0.7812774186617008, + "grad_norm": 1.5450665950775146, + "learning_rate": 2.40564750079183e-05, + "loss": 1.3582, + "step": 21816 + }, + { + "epoch": 0.781313230791269, + "grad_norm": 1.877021312713623, + "learning_rate": 2.404892939940021e-05, + "loss": 1.5996, + "step": 21817 + }, + { + "epoch": 0.7813490429208373, + "grad_norm": 1.5125741958618164, + "learning_rate": 2.404138481270577e-05, + "loss": 1.4883, + "step": 21818 + }, + { + "epoch": 0.7813848550504056, + "grad_norm": 1.710242748260498, + "learning_rate": 2.4033841247936517e-05, + "loss": 1.5537, + "step": 21819 + }, + { + "epoch": 0.7814206671799738, + "grad_norm": 2.031437397003174, + "learning_rate": 2.4026298705193972e-05, + "loss": 1.6633, + "step": 21820 + }, + { + "epoch": 0.7814564793095421, + "grad_norm": 1.5349557399749756, + "learning_rate": 2.4018757184579545e-05, + "loss": 1.2571, + "step": 21821 + }, + { + "epoch": 0.7814922914391105, + "grad_norm": 1.6792068481445312, + "learning_rate": 2.401121668619474e-05, + "loss": 1.5264, + "step": 21822 + }, + { + "epoch": 0.7815281035686787, + "grad_norm": 1.345013976097107, + "learning_rate": 2.4003677210140986e-05, + "loss": 1.1816, + "step": 21823 + }, + { + "epoch": 0.781563915698247, + "grad_norm": 1.3718589544296265, + "learning_rate": 2.3996138756519758e-05, + "loss": 1.336, + "step": 21824 + }, + { + "epoch": 0.7815997278278153, + "grad_norm": 1.577532410621643, + "learning_rate": 2.3988601325432415e-05, + "loss": 1.72, + "step": 21825 + }, + { + "epoch": 0.7816355399573836, + "grad_norm": 1.7576181888580322, + "learning_rate": 2.39810649169804e-05, + "loss": 1.5281, + "step": 21826 + }, + { + "epoch": 0.7816713520869518, + "grad_norm": 1.4686623811721802, + "learning_rate": 2.3973529531265095e-05, + "loss": 1.1936, + "step": 21827 + }, + { + "epoch": 0.7817071642165201, + "grad_norm": 1.418623685836792, + "learning_rate": 2.396599516838791e-05, + "loss": 1.2775, + "step": 21828 + }, + { + "epoch": 0.7817429763460885, + "grad_norm": 1.3246876001358032, + "learning_rate": 2.3958461828450164e-05, + "loss": 1.3822, + "step": 21829 + }, + { + "epoch": 0.7817787884756567, + "grad_norm": 1.7767356634140015, + "learning_rate": 2.3950929511553223e-05, + "loss": 1.4197, + "step": 21830 + }, + { + "epoch": 0.781814600605225, + "grad_norm": 1.667604684829712, + "learning_rate": 2.3943398217798452e-05, + "loss": 1.3056, + "step": 21831 + }, + { + "epoch": 0.7818504127347933, + "grad_norm": 1.5106874704360962, + "learning_rate": 2.393586794728713e-05, + "loss": 1.2763, + "step": 21832 + }, + { + "epoch": 0.7818862248643615, + "grad_norm": 1.9181849956512451, + "learning_rate": 2.3928338700120578e-05, + "loss": 1.4003, + "step": 21833 + }, + { + "epoch": 0.7819220369939298, + "grad_norm": 1.5137401819229126, + "learning_rate": 2.3920810476400112e-05, + "loss": 1.6359, + "step": 21834 + }, + { + "epoch": 0.7819578491234981, + "grad_norm": 1.4727210998535156, + "learning_rate": 2.391328327622704e-05, + "loss": 1.39, + "step": 21835 + }, + { + "epoch": 0.7819936612530665, + "grad_norm": 1.6446338891983032, + "learning_rate": 2.3905757099702564e-05, + "loss": 1.6383, + "step": 21836 + }, + { + "epoch": 0.7820294733826347, + "grad_norm": 1.7629797458648682, + "learning_rate": 2.3898231946927963e-05, + "loss": 1.5718, + "step": 21837 + }, + { + "epoch": 0.782065285512203, + "grad_norm": 1.1665034294128418, + "learning_rate": 2.3890707818004522e-05, + "loss": 1.5076, + "step": 21838 + }, + { + "epoch": 0.7821010976417713, + "grad_norm": 1.5729949474334717, + "learning_rate": 2.3883184713033414e-05, + "loss": 1.2453, + "step": 21839 + }, + { + "epoch": 0.7821369097713395, + "grad_norm": 1.3064537048339844, + "learning_rate": 2.387566263211586e-05, + "loss": 1.4201, + "step": 21840 + }, + { + "epoch": 0.7821727219009078, + "grad_norm": 1.7350980043411255, + "learning_rate": 2.3868141575353077e-05, + "loss": 1.461, + "step": 21841 + }, + { + "epoch": 0.7822085340304761, + "grad_norm": 1.9520295858383179, + "learning_rate": 2.3860621542846273e-05, + "loss": 1.5801, + "step": 21842 + }, + { + "epoch": 0.7822443461600445, + "grad_norm": 1.5008251667022705, + "learning_rate": 2.3853102534696557e-05, + "loss": 0.9946, + "step": 21843 + }, + { + "epoch": 0.7822801582896127, + "grad_norm": 1.6487131118774414, + "learning_rate": 2.384558455100514e-05, + "loss": 1.537, + "step": 21844 + }, + { + "epoch": 0.782315970419181, + "grad_norm": 1.7409266233444214, + "learning_rate": 2.3838067591873136e-05, + "loss": 1.2899, + "step": 21845 + }, + { + "epoch": 0.7823517825487493, + "grad_norm": 1.6287227869033813, + "learning_rate": 2.3830551657401723e-05, + "loss": 1.2628, + "step": 21846 + }, + { + "epoch": 0.7823875946783175, + "grad_norm": 1.5295032262802124, + "learning_rate": 2.3823036747691995e-05, + "loss": 1.081, + "step": 21847 + }, + { + "epoch": 0.7824234068078858, + "grad_norm": 1.78057062625885, + "learning_rate": 2.3815522862844985e-05, + "loss": 1.4897, + "step": 21848 + }, + { + "epoch": 0.7824592189374541, + "grad_norm": 1.901196002960205, + "learning_rate": 2.3808010002961902e-05, + "loss": 1.394, + "step": 21849 + }, + { + "epoch": 0.7824950310670225, + "grad_norm": 1.6514678001403809, + "learning_rate": 2.3800498168143726e-05, + "loss": 1.161, + "step": 21850 + }, + { + "epoch": 0.7825308431965907, + "grad_norm": 1.6724635362625122, + "learning_rate": 2.379298735849156e-05, + "loss": 1.4317, + "step": 21851 + }, + { + "epoch": 0.782566655326159, + "grad_norm": 1.8803189992904663, + "learning_rate": 2.378547757410645e-05, + "loss": 1.1883, + "step": 21852 + }, + { + "epoch": 0.7826024674557273, + "grad_norm": 1.2810298204421997, + "learning_rate": 2.377796881508947e-05, + "loss": 1.2568, + "step": 21853 + }, + { + "epoch": 0.7826382795852955, + "grad_norm": 1.5125658512115479, + "learning_rate": 2.3770461081541563e-05, + "loss": 1.3659, + "step": 21854 + }, + { + "epoch": 0.7826740917148638, + "grad_norm": 2.5842549800872803, + "learning_rate": 2.3762954373563763e-05, + "loss": 1.5197, + "step": 21855 + }, + { + "epoch": 0.7827099038444321, + "grad_norm": 1.7202882766723633, + "learning_rate": 2.375544869125711e-05, + "loss": 1.4987, + "step": 21856 + }, + { + "epoch": 0.7827457159740004, + "grad_norm": 1.7006173133850098, + "learning_rate": 2.3747944034722524e-05, + "loss": 1.3801, + "step": 21857 + }, + { + "epoch": 0.7827815281035687, + "grad_norm": 1.8149973154067993, + "learning_rate": 2.3740440404061015e-05, + "loss": 1.214, + "step": 21858 + }, + { + "epoch": 0.782817340233137, + "grad_norm": 1.8355730772018433, + "learning_rate": 2.3732937799373455e-05, + "loss": 1.286, + "step": 21859 + }, + { + "epoch": 0.7828531523627053, + "grad_norm": 1.700737476348877, + "learning_rate": 2.37254362207609e-05, + "loss": 1.4152, + "step": 21860 + }, + { + "epoch": 0.7828889644922735, + "grad_norm": 1.5340129137039185, + "learning_rate": 2.3717935668324186e-05, + "loss": 1.4033, + "step": 21861 + }, + { + "epoch": 0.7829247766218418, + "grad_norm": 1.533355474472046, + "learning_rate": 2.371043614216425e-05, + "loss": 1.205, + "step": 21862 + }, + { + "epoch": 0.7829605887514101, + "grad_norm": 2.0897789001464844, + "learning_rate": 2.3702937642381985e-05, + "loss": 1.3425, + "step": 21863 + }, + { + "epoch": 0.7829964008809784, + "grad_norm": 1.8037362098693848, + "learning_rate": 2.369544016907831e-05, + "loss": 1.2512, + "step": 21864 + }, + { + "epoch": 0.7830322130105467, + "grad_norm": 2.2047224044799805, + "learning_rate": 2.3687943722354056e-05, + "loss": 1.8162, + "step": 21865 + }, + { + "epoch": 0.783068025140115, + "grad_norm": 1.6913021802902222, + "learning_rate": 2.3680448302310032e-05, + "loss": 1.2962, + "step": 21866 + }, + { + "epoch": 0.7831038372696832, + "grad_norm": 1.605500340461731, + "learning_rate": 2.367295390904719e-05, + "loss": 1.8098, + "step": 21867 + }, + { + "epoch": 0.7831396493992515, + "grad_norm": 2.315788984298706, + "learning_rate": 2.3665460542666263e-05, + "loss": 1.3339, + "step": 21868 + }, + { + "epoch": 0.7831754615288198, + "grad_norm": 2.22713041305542, + "learning_rate": 2.3657968203268133e-05, + "loss": 1.512, + "step": 21869 + }, + { + "epoch": 0.783211273658388, + "grad_norm": 2.339245319366455, + "learning_rate": 2.365047689095351e-05, + "loss": 1.2251, + "step": 21870 + }, + { + "epoch": 0.7832470857879564, + "grad_norm": 2.6898417472839355, + "learning_rate": 2.3642986605823292e-05, + "loss": 1.8021, + "step": 21871 + }, + { + "epoch": 0.7832828979175247, + "grad_norm": 1.8308742046356201, + "learning_rate": 2.3635497347978176e-05, + "loss": 1.4888, + "step": 21872 + }, + { + "epoch": 0.783318710047093, + "grad_norm": 2.016420841217041, + "learning_rate": 2.3628009117518956e-05, + "loss": 1.4186, + "step": 21873 + }, + { + "epoch": 0.7833545221766612, + "grad_norm": 1.976356863975525, + "learning_rate": 2.3620521914546334e-05, + "loss": 1.5075, + "step": 21874 + }, + { + "epoch": 0.7833903343062295, + "grad_norm": 2.4779212474823, + "learning_rate": 2.361303573916107e-05, + "loss": 1.5026, + "step": 21875 + }, + { + "epoch": 0.7834261464357978, + "grad_norm": 1.4665014743804932, + "learning_rate": 2.360555059146391e-05, + "loss": 1.2472, + "step": 21876 + }, + { + "epoch": 0.783461958565366, + "grad_norm": 1.507570505142212, + "learning_rate": 2.359806647155547e-05, + "loss": 1.339, + "step": 21877 + }, + { + "epoch": 0.7834977706949344, + "grad_norm": 1.5394234657287598, + "learning_rate": 2.3590583379536535e-05, + "loss": 1.5506, + "step": 21878 + }, + { + "epoch": 0.7835335828245027, + "grad_norm": 1.5413225889205933, + "learning_rate": 2.358310131550773e-05, + "loss": 1.5083, + "step": 21879 + }, + { + "epoch": 0.783569394954071, + "grad_norm": 1.6394191980361938, + "learning_rate": 2.3575620279569743e-05, + "loss": 1.3997, + "step": 21880 + }, + { + "epoch": 0.7836052070836392, + "grad_norm": 1.2522400617599487, + "learning_rate": 2.3568140271823147e-05, + "loss": 1.5342, + "step": 21881 + }, + { + "epoch": 0.7836410192132075, + "grad_norm": 1.663016676902771, + "learning_rate": 2.3560661292368702e-05, + "loss": 1.2195, + "step": 21882 + }, + { + "epoch": 0.7836768313427758, + "grad_norm": 1.80072820186615, + "learning_rate": 2.355318334130695e-05, + "loss": 1.3811, + "step": 21883 + }, + { + "epoch": 0.783712643472344, + "grad_norm": 1.770574688911438, + "learning_rate": 2.3545706418738476e-05, + "loss": 1.476, + "step": 21884 + }, + { + "epoch": 0.7837484556019124, + "grad_norm": 1.4288097620010376, + "learning_rate": 2.3538230524763914e-05, + "loss": 1.4681, + "step": 21885 + }, + { + "epoch": 0.7837842677314807, + "grad_norm": 1.5454676151275635, + "learning_rate": 2.353075565948383e-05, + "loss": 1.1621, + "step": 21886 + }, + { + "epoch": 0.783820079861049, + "grad_norm": 1.748083472251892, + "learning_rate": 2.352328182299881e-05, + "loss": 1.2265, + "step": 21887 + }, + { + "epoch": 0.7838558919906172, + "grad_norm": 1.7457653284072876, + "learning_rate": 2.351580901540933e-05, + "loss": 1.3389, + "step": 21888 + }, + { + "epoch": 0.7838917041201855, + "grad_norm": 1.464296817779541, + "learning_rate": 2.3508337236816047e-05, + "loss": 1.4847, + "step": 21889 + }, + { + "epoch": 0.7839275162497538, + "grad_norm": 4.318163871765137, + "learning_rate": 2.3500866487319384e-05, + "loss": 1.2113, + "step": 21890 + }, + { + "epoch": 0.783963328379322, + "grad_norm": 2.506546974182129, + "learning_rate": 2.3493396767019915e-05, + "loss": 1.8466, + "step": 21891 + }, + { + "epoch": 0.7839991405088904, + "grad_norm": 1.6039185523986816, + "learning_rate": 2.348592807601808e-05, + "loss": 1.4501, + "step": 21892 + }, + { + "epoch": 0.7840349526384587, + "grad_norm": 1.9799975156784058, + "learning_rate": 2.3478460414414382e-05, + "loss": 1.2276, + "step": 21893 + }, + { + "epoch": 0.784070764768027, + "grad_norm": 1.671531319618225, + "learning_rate": 2.3470993782309324e-05, + "loss": 1.3684, + "step": 21894 + }, + { + "epoch": 0.7841065768975952, + "grad_norm": 1.4565225839614868, + "learning_rate": 2.3463528179803305e-05, + "loss": 1.2772, + "step": 21895 + }, + { + "epoch": 0.7841423890271635, + "grad_norm": 1.440674901008606, + "learning_rate": 2.3456063606996783e-05, + "loss": 1.4639, + "step": 21896 + }, + { + "epoch": 0.7841782011567318, + "grad_norm": 3.0526788234710693, + "learning_rate": 2.34486000639902e-05, + "loss": 1.2457, + "step": 21897 + }, + { + "epoch": 0.7842140132863, + "grad_norm": 1.7137478590011597, + "learning_rate": 2.3441137550883974e-05, + "loss": 1.3624, + "step": 21898 + }, + { + "epoch": 0.7842498254158684, + "grad_norm": 1.5892608165740967, + "learning_rate": 2.3433676067778465e-05, + "loss": 1.6998, + "step": 21899 + }, + { + "epoch": 0.7842856375454367, + "grad_norm": 1.6187303066253662, + "learning_rate": 2.3426215614774094e-05, + "loss": 1.4802, + "step": 21900 + }, + { + "epoch": 0.784321449675005, + "grad_norm": 1.3965293169021606, + "learning_rate": 2.3418756191971235e-05, + "loss": 1.6476, + "step": 21901 + }, + { + "epoch": 0.7843572618045732, + "grad_norm": 1.4413694143295288, + "learning_rate": 2.34112977994702e-05, + "loss": 1.5815, + "step": 21902 + }, + { + "epoch": 0.7843930739341415, + "grad_norm": 2.764780282974243, + "learning_rate": 2.340384043737136e-05, + "loss": 1.3733, + "step": 21903 + }, + { + "epoch": 0.7844288860637098, + "grad_norm": 1.5479607582092285, + "learning_rate": 2.339638410577505e-05, + "loss": 1.4778, + "step": 21904 + }, + { + "epoch": 0.784464698193278, + "grad_norm": 1.5237574577331543, + "learning_rate": 2.3388928804781608e-05, + "loss": 1.1754, + "step": 21905 + }, + { + "epoch": 0.7845005103228464, + "grad_norm": 2.09736704826355, + "learning_rate": 2.3381474534491276e-05, + "loss": 1.315, + "step": 21906 + }, + { + "epoch": 0.7845363224524147, + "grad_norm": 1.7399797439575195, + "learning_rate": 2.337402129500438e-05, + "loss": 1.087, + "step": 21907 + }, + { + "epoch": 0.7845721345819829, + "grad_norm": 2.0457167625427246, + "learning_rate": 2.3366569086421175e-05, + "loss": 1.4334, + "step": 21908 + }, + { + "epoch": 0.7846079467115512, + "grad_norm": 1.504894733428955, + "learning_rate": 2.3359117908841966e-05, + "loss": 1.5605, + "step": 21909 + }, + { + "epoch": 0.7846437588411195, + "grad_norm": 1.2692909240722656, + "learning_rate": 2.3351667762366948e-05, + "loss": 1.4162, + "step": 21910 + }, + { + "epoch": 0.7846795709706877, + "grad_norm": 1.8870021104812622, + "learning_rate": 2.334421864709636e-05, + "loss": 1.6566, + "step": 21911 + }, + { + "epoch": 0.784715383100256, + "grad_norm": 2.1324985027313232, + "learning_rate": 2.3336770563130463e-05, + "loss": 1.5475, + "step": 21912 + }, + { + "epoch": 0.7847511952298244, + "grad_norm": 2.145512580871582, + "learning_rate": 2.33293235105694e-05, + "loss": 1.5966, + "step": 21913 + }, + { + "epoch": 0.7847870073593927, + "grad_norm": 1.569747805595398, + "learning_rate": 2.332187748951339e-05, + "loss": 1.2442, + "step": 21914 + }, + { + "epoch": 0.7848228194889609, + "grad_norm": 1.6398723125457764, + "learning_rate": 2.331443250006261e-05, + "loss": 1.4996, + "step": 21915 + }, + { + "epoch": 0.7848586316185292, + "grad_norm": 1.8749650716781616, + "learning_rate": 2.3306988542317255e-05, + "loss": 1.2954, + "step": 21916 + }, + { + "epoch": 0.7848944437480975, + "grad_norm": 1.3666261434555054, + "learning_rate": 2.3299545616377415e-05, + "loss": 1.1951, + "step": 21917 + }, + { + "epoch": 0.7849302558776657, + "grad_norm": 1.1874198913574219, + "learning_rate": 2.329210372234325e-05, + "loss": 1.4692, + "step": 21918 + }, + { + "epoch": 0.784966068007234, + "grad_norm": 1.760377049446106, + "learning_rate": 2.3284662860314922e-05, + "loss": 1.1574, + "step": 21919 + }, + { + "epoch": 0.7850018801368024, + "grad_norm": 1.4190868139266968, + "learning_rate": 2.3277223030392458e-05, + "loss": 1.2522, + "step": 21920 + }, + { + "epoch": 0.7850376922663707, + "grad_norm": 1.8131829500198364, + "learning_rate": 2.3269784232675995e-05, + "loss": 1.4429, + "step": 21921 + }, + { + "epoch": 0.7850735043959389, + "grad_norm": 1.7088327407836914, + "learning_rate": 2.3262346467265605e-05, + "loss": 1.3675, + "step": 21922 + }, + { + "epoch": 0.7851093165255072, + "grad_norm": 1.543448567390442, + "learning_rate": 2.3254909734261398e-05, + "loss": 1.5017, + "step": 21923 + }, + { + "epoch": 0.7851451286550755, + "grad_norm": 2.0293030738830566, + "learning_rate": 2.324747403376336e-05, + "loss": 1.5802, + "step": 21924 + }, + { + "epoch": 0.7851809407846437, + "grad_norm": 1.8709771633148193, + "learning_rate": 2.3240039365871546e-05, + "loss": 1.7063, + "step": 21925 + }, + { + "epoch": 0.785216752914212, + "grad_norm": 1.7643381357192993, + "learning_rate": 2.3232605730686018e-05, + "loss": 1.4052, + "step": 21926 + }, + { + "epoch": 0.7852525650437804, + "grad_norm": 1.706915020942688, + "learning_rate": 2.3225173128306733e-05, + "loss": 1.2809, + "step": 21927 + }, + { + "epoch": 0.7852883771733487, + "grad_norm": 1.7989875078201294, + "learning_rate": 2.3217741558833706e-05, + "loss": 1.5696, + "step": 21928 + }, + { + "epoch": 0.7853241893029169, + "grad_norm": 1.6285196542739868, + "learning_rate": 2.321031102236694e-05, + "loss": 1.1875, + "step": 21929 + }, + { + "epoch": 0.7853600014324852, + "grad_norm": 1.5075069665908813, + "learning_rate": 2.3202881519006393e-05, + "loss": 1.3279, + "step": 21930 + }, + { + "epoch": 0.7853958135620535, + "grad_norm": 1.7992472648620605, + "learning_rate": 2.3195453048852e-05, + "loss": 1.5811, + "step": 21931 + }, + { + "epoch": 0.7854316256916217, + "grad_norm": 1.7869278192520142, + "learning_rate": 2.3188025612003718e-05, + "loss": 1.7683, + "step": 21932 + }, + { + "epoch": 0.78546743782119, + "grad_norm": 1.4537584781646729, + "learning_rate": 2.318059920856146e-05, + "loss": 1.3219, + "step": 21933 + }, + { + "epoch": 0.7855032499507584, + "grad_norm": 1.5661792755126953, + "learning_rate": 2.3173173838625183e-05, + "loss": 1.4321, + "step": 21934 + }, + { + "epoch": 0.7855390620803266, + "grad_norm": 1.5315093994140625, + "learning_rate": 2.316574950229472e-05, + "loss": 1.5277, + "step": 21935 + }, + { + "epoch": 0.7855748742098949, + "grad_norm": 2.292695999145508, + "learning_rate": 2.315832619967e-05, + "loss": 1.0601, + "step": 21936 + }, + { + "epoch": 0.7856106863394632, + "grad_norm": 1.7151291370391846, + "learning_rate": 2.3150903930850896e-05, + "loss": 1.4839, + "step": 21937 + }, + { + "epoch": 0.7856464984690315, + "grad_norm": 1.6335084438323975, + "learning_rate": 2.3143482695937235e-05, + "loss": 1.2555, + "step": 21938 + }, + { + "epoch": 0.7856823105985997, + "grad_norm": 1.5263906717300415, + "learning_rate": 2.3136062495028876e-05, + "loss": 1.4354, + "step": 21939 + }, + { + "epoch": 0.785718122728168, + "grad_norm": 1.3875404596328735, + "learning_rate": 2.312864332822564e-05, + "loss": 1.4227, + "step": 21940 + }, + { + "epoch": 0.7857539348577364, + "grad_norm": 2.1404190063476562, + "learning_rate": 2.3121225195627382e-05, + "loss": 1.6252, + "step": 21941 + }, + { + "epoch": 0.7857897469873046, + "grad_norm": 1.6052498817443848, + "learning_rate": 2.3113808097333854e-05, + "loss": 1.5175, + "step": 21942 + }, + { + "epoch": 0.7858255591168729, + "grad_norm": 1.234230637550354, + "learning_rate": 2.3106392033444856e-05, + "loss": 1.189, + "step": 21943 + }, + { + "epoch": 0.7858613712464412, + "grad_norm": 1.5555446147918701, + "learning_rate": 2.3098977004060185e-05, + "loss": 1.3555, + "step": 21944 + }, + { + "epoch": 0.7858971833760094, + "grad_norm": 1.4354848861694336, + "learning_rate": 2.309156300927957e-05, + "loss": 1.4896, + "step": 21945 + }, + { + "epoch": 0.7859329955055777, + "grad_norm": 1.6179566383361816, + "learning_rate": 2.308415004920277e-05, + "loss": 1.3944, + "step": 21946 + }, + { + "epoch": 0.785968807635146, + "grad_norm": 2.2030816078186035, + "learning_rate": 2.307673812392951e-05, + "loss": 1.605, + "step": 21947 + }, + { + "epoch": 0.7860046197647144, + "grad_norm": 1.3560844659805298, + "learning_rate": 2.3069327233559533e-05, + "loss": 1.143, + "step": 21948 + }, + { + "epoch": 0.7860404318942826, + "grad_norm": 1.650636076927185, + "learning_rate": 2.306191737819251e-05, + "loss": 1.3462, + "step": 21949 + }, + { + "epoch": 0.7860762440238509, + "grad_norm": 1.7846274375915527, + "learning_rate": 2.3054508557928144e-05, + "loss": 1.6821, + "step": 21950 + }, + { + "epoch": 0.7861120561534192, + "grad_norm": 1.6764312982559204, + "learning_rate": 2.3047100772866114e-05, + "loss": 1.4222, + "step": 21951 + }, + { + "epoch": 0.7861478682829874, + "grad_norm": 2.348435878753662, + "learning_rate": 2.3039694023106106e-05, + "loss": 1.341, + "step": 21952 + }, + { + "epoch": 0.7861836804125557, + "grad_norm": 1.734067678451538, + "learning_rate": 2.303228830874775e-05, + "loss": 1.3237, + "step": 21953 + }, + { + "epoch": 0.786219492542124, + "grad_norm": 2.134556770324707, + "learning_rate": 2.3024883629890604e-05, + "loss": 1.5639, + "step": 21954 + }, + { + "epoch": 0.7862553046716924, + "grad_norm": 1.831900954246521, + "learning_rate": 2.3017479986634426e-05, + "loss": 1.4596, + "step": 21955 + }, + { + "epoch": 0.7862911168012606, + "grad_norm": 2.242368698120117, + "learning_rate": 2.3010077379078722e-05, + "loss": 1.2863, + "step": 21956 + }, + { + "epoch": 0.7863269289308289, + "grad_norm": 1.671486496925354, + "learning_rate": 2.300267580732315e-05, + "loss": 1.3689, + "step": 21957 + }, + { + "epoch": 0.7863627410603972, + "grad_norm": 1.524246335029602, + "learning_rate": 2.2995275271467187e-05, + "loss": 1.4023, + "step": 21958 + }, + { + "epoch": 0.7863985531899654, + "grad_norm": 2.152644157409668, + "learning_rate": 2.2987875771610534e-05, + "loss": 1.3216, + "step": 21959 + }, + { + "epoch": 0.7864343653195337, + "grad_norm": 1.521066427230835, + "learning_rate": 2.2980477307852642e-05, + "loss": 1.1968, + "step": 21960 + }, + { + "epoch": 0.786470177449102, + "grad_norm": 1.6909211874008179, + "learning_rate": 2.297307988029308e-05, + "loss": 1.565, + "step": 21961 + }, + { + "epoch": 0.7865059895786704, + "grad_norm": 1.748152732849121, + "learning_rate": 2.29656834890314e-05, + "loss": 1.3081, + "step": 21962 + }, + { + "epoch": 0.7865418017082386, + "grad_norm": 1.6962015628814697, + "learning_rate": 2.2958288134167048e-05, + "loss": 1.2831, + "step": 21963 + }, + { + "epoch": 0.7865776138378069, + "grad_norm": 1.4486021995544434, + "learning_rate": 2.295089381579959e-05, + "loss": 1.4332, + "step": 21964 + }, + { + "epoch": 0.7866134259673752, + "grad_norm": 1.4903663396835327, + "learning_rate": 2.2943500534028406e-05, + "loss": 1.4113, + "step": 21965 + }, + { + "epoch": 0.7866492380969434, + "grad_norm": 1.5759414434432983, + "learning_rate": 2.2936108288953083e-05, + "loss": 1.2901, + "step": 21966 + }, + { + "epoch": 0.7866850502265117, + "grad_norm": 1.741335153579712, + "learning_rate": 2.292871708067299e-05, + "loss": 1.257, + "step": 21967 + }, + { + "epoch": 0.78672086235608, + "grad_norm": 2.2893245220184326, + "learning_rate": 2.2921326909287634e-05, + "loss": 1.5896, + "step": 21968 + }, + { + "epoch": 0.7867566744856483, + "grad_norm": 1.9488850831985474, + "learning_rate": 2.291393777489632e-05, + "loss": 1.4638, + "step": 21969 + }, + { + "epoch": 0.7867924866152166, + "grad_norm": 1.3496569395065308, + "learning_rate": 2.290654967759862e-05, + "loss": 1.6248, + "step": 21970 + }, + { + "epoch": 0.7868282987447849, + "grad_norm": 2.044699192047119, + "learning_rate": 2.289916261749383e-05, + "loss": 1.306, + "step": 21971 + }, + { + "epoch": 0.7868641108743532, + "grad_norm": 1.3044853210449219, + "learning_rate": 2.2891776594681315e-05, + "loss": 1.4078, + "step": 21972 + }, + { + "epoch": 0.7868999230039214, + "grad_norm": 1.7480604648590088, + "learning_rate": 2.2884391609260525e-05, + "loss": 1.2894, + "step": 21973 + }, + { + "epoch": 0.7869357351334897, + "grad_norm": 1.638363242149353, + "learning_rate": 2.2877007661330762e-05, + "loss": 1.638, + "step": 21974 + }, + { + "epoch": 0.786971547263058, + "grad_norm": 1.6407605409622192, + "learning_rate": 2.2869624750991393e-05, + "loss": 1.3685, + "step": 21975 + }, + { + "epoch": 0.7870073593926263, + "grad_norm": 1.5192142724990845, + "learning_rate": 2.2862242878341678e-05, + "loss": 1.8149, + "step": 21976 + }, + { + "epoch": 0.7870431715221946, + "grad_norm": 1.6428052186965942, + "learning_rate": 2.285486204348105e-05, + "loss": 1.4984, + "step": 21977 + }, + { + "epoch": 0.7870789836517629, + "grad_norm": 1.9950889348983765, + "learning_rate": 2.284748224650871e-05, + "loss": 1.5231, + "step": 21978 + }, + { + "epoch": 0.7871147957813311, + "grad_norm": 2.0278079509735107, + "learning_rate": 2.2840103487524e-05, + "loss": 1.3566, + "step": 21979 + }, + { + "epoch": 0.7871506079108994, + "grad_norm": 1.671902060508728, + "learning_rate": 2.283272576662615e-05, + "loss": 1.5062, + "step": 21980 + }, + { + "epoch": 0.7871864200404677, + "grad_norm": 1.5835262537002563, + "learning_rate": 2.2825349083914426e-05, + "loss": 1.4686, + "step": 21981 + }, + { + "epoch": 0.787222232170036, + "grad_norm": 1.7632768154144287, + "learning_rate": 2.2817973439488117e-05, + "loss": 1.3356, + "step": 21982 + }, + { + "epoch": 0.7872580442996042, + "grad_norm": 1.718269944190979, + "learning_rate": 2.2810598833446382e-05, + "loss": 1.306, + "step": 21983 + }, + { + "epoch": 0.7872938564291726, + "grad_norm": 1.9307987689971924, + "learning_rate": 2.2803225265888484e-05, + "loss": 1.5905, + "step": 21984 + }, + { + "epoch": 0.7873296685587409, + "grad_norm": 1.7241172790527344, + "learning_rate": 2.2795852736913604e-05, + "loss": 1.4503, + "step": 21985 + }, + { + "epoch": 0.7873654806883091, + "grad_norm": 1.7079567909240723, + "learning_rate": 2.2788481246620973e-05, + "loss": 1.2879, + "step": 21986 + }, + { + "epoch": 0.7874012928178774, + "grad_norm": 1.3769007921218872, + "learning_rate": 2.2781110795109674e-05, + "loss": 1.525, + "step": 21987 + }, + { + "epoch": 0.7874371049474457, + "grad_norm": 1.4252955913543701, + "learning_rate": 2.2773741382478975e-05, + "loss": 1.5648, + "step": 21988 + }, + { + "epoch": 0.787472917077014, + "grad_norm": 1.4553250074386597, + "learning_rate": 2.276637300882797e-05, + "loss": 1.5559, + "step": 21989 + }, + { + "epoch": 0.7875087292065822, + "grad_norm": 1.6732171773910522, + "learning_rate": 2.2759005674255774e-05, + "loss": 1.2551, + "step": 21990 + }, + { + "epoch": 0.7875445413361506, + "grad_norm": 1.594787836074829, + "learning_rate": 2.275163937886151e-05, + "loss": 1.4659, + "step": 21991 + }, + { + "epoch": 0.7875803534657189, + "grad_norm": 1.33848237991333, + "learning_rate": 2.2744274122744304e-05, + "loss": 1.2285, + "step": 21992 + }, + { + "epoch": 0.7876161655952871, + "grad_norm": 1.8487385511398315, + "learning_rate": 2.2736909906003266e-05, + "loss": 1.879, + "step": 21993 + }, + { + "epoch": 0.7876519777248554, + "grad_norm": 1.378639817237854, + "learning_rate": 2.2729546728737416e-05, + "loss": 1.6364, + "step": 21994 + }, + { + "epoch": 0.7876877898544237, + "grad_norm": 1.6893432140350342, + "learning_rate": 2.2722184591045835e-05, + "loss": 1.5968, + "step": 21995 + }, + { + "epoch": 0.7877236019839919, + "grad_norm": 1.7341252565383911, + "learning_rate": 2.2714823493027583e-05, + "loss": 1.5264, + "step": 21996 + }, + { + "epoch": 0.7877594141135602, + "grad_norm": 1.3586167097091675, + "learning_rate": 2.2707463434781718e-05, + "loss": 1.3724, + "step": 21997 + }, + { + "epoch": 0.7877952262431286, + "grad_norm": 1.5334831476211548, + "learning_rate": 2.2700104416407208e-05, + "loss": 1.736, + "step": 21998 + }, + { + "epoch": 0.7878310383726969, + "grad_norm": 1.5055005550384521, + "learning_rate": 2.2692746438003078e-05, + "loss": 1.2053, + "step": 21999 + }, + { + "epoch": 0.7878668505022651, + "grad_norm": 2.348088502883911, + "learning_rate": 2.2685389499668352e-05, + "loss": 1.5909, + "step": 22000 + }, + { + "epoch": 0.7879026626318334, + "grad_norm": 1.4939537048339844, + "learning_rate": 2.2678033601501957e-05, + "loss": 1.5303, + "step": 22001 + }, + { + "epoch": 0.7879384747614017, + "grad_norm": 1.1772562265396118, + "learning_rate": 2.2670678743602892e-05, + "loss": 1.2715, + "step": 22002 + }, + { + "epoch": 0.7879742868909699, + "grad_norm": 1.5099385976791382, + "learning_rate": 2.2663324926070086e-05, + "loss": 1.3447, + "step": 22003 + }, + { + "epoch": 0.7880100990205382, + "grad_norm": 1.5559260845184326, + "learning_rate": 2.2655972149002512e-05, + "loss": 1.1992, + "step": 22004 + }, + { + "epoch": 0.7880459111501066, + "grad_norm": 2.2137198448181152, + "learning_rate": 2.2648620412499045e-05, + "loss": 1.7298, + "step": 22005 + }, + { + "epoch": 0.7880817232796749, + "grad_norm": 1.509599208831787, + "learning_rate": 2.264126971665861e-05, + "loss": 1.1957, + "step": 22006 + }, + { + "epoch": 0.7881175354092431, + "grad_norm": 1.8853390216827393, + "learning_rate": 2.2633920061580127e-05, + "loss": 1.7451, + "step": 22007 + }, + { + "epoch": 0.7881533475388114, + "grad_norm": 1.300235629081726, + "learning_rate": 2.262657144736243e-05, + "loss": 1.2197, + "step": 22008 + }, + { + "epoch": 0.7881891596683797, + "grad_norm": 1.5796751976013184, + "learning_rate": 2.2619223874104423e-05, + "loss": 1.5641, + "step": 22009 + }, + { + "epoch": 0.7882249717979479, + "grad_norm": 1.7323791980743408, + "learning_rate": 2.261187734190493e-05, + "loss": 1.5017, + "step": 22010 + }, + { + "epoch": 0.7882607839275162, + "grad_norm": 1.4595657587051392, + "learning_rate": 2.2604531850862832e-05, + "loss": 1.5183, + "step": 22011 + }, + { + "epoch": 0.7882965960570846, + "grad_norm": 2.023123264312744, + "learning_rate": 2.2597187401076903e-05, + "loss": 1.6822, + "step": 22012 + }, + { + "epoch": 0.7883324081866528, + "grad_norm": 1.8436816930770874, + "learning_rate": 2.2589843992645977e-05, + "loss": 1.6796, + "step": 22013 + }, + { + "epoch": 0.7883682203162211, + "grad_norm": 1.3676259517669678, + "learning_rate": 2.258250162566887e-05, + "loss": 1.4, + "step": 22014 + }, + { + "epoch": 0.7884040324457894, + "grad_norm": 1.615670084953308, + "learning_rate": 2.2575160300244314e-05, + "loss": 1.2904, + "step": 22015 + }, + { + "epoch": 0.7884398445753577, + "grad_norm": 1.8132332563400269, + "learning_rate": 2.2567820016471107e-05, + "loss": 1.19, + "step": 22016 + }, + { + "epoch": 0.7884756567049259, + "grad_norm": 1.5410048961639404, + "learning_rate": 2.256048077444801e-05, + "loss": 1.4224, + "step": 22017 + }, + { + "epoch": 0.7885114688344942, + "grad_norm": 1.866053581237793, + "learning_rate": 2.2553142574273777e-05, + "loss": 1.4266, + "step": 22018 + }, + { + "epoch": 0.7885472809640626, + "grad_norm": 1.684851884841919, + "learning_rate": 2.2545805416047073e-05, + "loss": 1.4562, + "step": 22019 + }, + { + "epoch": 0.7885830930936308, + "grad_norm": 1.7239848375320435, + "learning_rate": 2.253846929986666e-05, + "loss": 1.2378, + "step": 22020 + }, + { + "epoch": 0.7886189052231991, + "grad_norm": 2.0426554679870605, + "learning_rate": 2.253113422583122e-05, + "loss": 1.1809, + "step": 22021 + }, + { + "epoch": 0.7886547173527674, + "grad_norm": 1.4321867227554321, + "learning_rate": 2.252380019403947e-05, + "loss": 1.1409, + "step": 22022 + }, + { + "epoch": 0.7886905294823356, + "grad_norm": 1.3526005744934082, + "learning_rate": 2.251646720459003e-05, + "loss": 1.1834, + "step": 22023 + }, + { + "epoch": 0.7887263416119039, + "grad_norm": 1.6505836248397827, + "learning_rate": 2.250913525758157e-05, + "loss": 1.3677, + "step": 22024 + }, + { + "epoch": 0.7887621537414722, + "grad_norm": 1.5732851028442383, + "learning_rate": 2.2501804353112765e-05, + "loss": 1.6171, + "step": 22025 + }, + { + "epoch": 0.7887979658710406, + "grad_norm": 1.22661554813385, + "learning_rate": 2.249447449128219e-05, + "loss": 1.4751, + "step": 22026 + }, + { + "epoch": 0.7888337780006088, + "grad_norm": 1.3099799156188965, + "learning_rate": 2.248714567218849e-05, + "loss": 1.1526, + "step": 22027 + }, + { + "epoch": 0.7888695901301771, + "grad_norm": 2.0355942249298096, + "learning_rate": 2.2479817895930256e-05, + "loss": 1.4029, + "step": 22028 + }, + { + "epoch": 0.7889054022597454, + "grad_norm": 1.6288301944732666, + "learning_rate": 2.247249116260611e-05, + "loss": 1.5374, + "step": 22029 + }, + { + "epoch": 0.7889412143893136, + "grad_norm": 1.6688481569290161, + "learning_rate": 2.2465165472314564e-05, + "loss": 1.449, + "step": 22030 + }, + { + "epoch": 0.7889770265188819, + "grad_norm": 1.6263892650604248, + "learning_rate": 2.2457840825154198e-05, + "loss": 1.3263, + "step": 22031 + }, + { + "epoch": 0.7890128386484502, + "grad_norm": 1.6034423112869263, + "learning_rate": 2.24505172212236e-05, + "loss": 1.4507, + "step": 22032 + }, + { + "epoch": 0.7890486507780186, + "grad_norm": 2.850820541381836, + "learning_rate": 2.2443194660621225e-05, + "loss": 1.6127, + "step": 22033 + }, + { + "epoch": 0.7890844629075868, + "grad_norm": 1.3363112211227417, + "learning_rate": 2.243587314344563e-05, + "loss": 1.3583, + "step": 22034 + }, + { + "epoch": 0.7891202750371551, + "grad_norm": 1.8845534324645996, + "learning_rate": 2.242855266979531e-05, + "loss": 1.353, + "step": 22035 + }, + { + "epoch": 0.7891560871667234, + "grad_norm": 1.926565170288086, + "learning_rate": 2.242123323976878e-05, + "loss": 1.6149, + "step": 22036 + }, + { + "epoch": 0.7891918992962916, + "grad_norm": 1.9906445741653442, + "learning_rate": 2.2413914853464455e-05, + "loss": 1.466, + "step": 22037 + }, + { + "epoch": 0.7892277114258599, + "grad_norm": 1.8436861038208008, + "learning_rate": 2.240659751098083e-05, + "loss": 1.4323, + "step": 22038 + }, + { + "epoch": 0.7892635235554282, + "grad_norm": 1.6157275438308716, + "learning_rate": 2.2399281212416346e-05, + "loss": 1.3719, + "step": 22039 + }, + { + "epoch": 0.7892993356849966, + "grad_norm": 1.502375841140747, + "learning_rate": 2.2391965957869464e-05, + "loss": 1.5709, + "step": 22040 + }, + { + "epoch": 0.7893351478145648, + "grad_norm": 3.7131903171539307, + "learning_rate": 2.2384651747438578e-05, + "loss": 1.413, + "step": 22041 + }, + { + "epoch": 0.7893709599441331, + "grad_norm": 1.402735948562622, + "learning_rate": 2.237733858122203e-05, + "loss": 1.5297, + "step": 22042 + }, + { + "epoch": 0.7894067720737014, + "grad_norm": 1.9113142490386963, + "learning_rate": 2.2370026459318315e-05, + "loss": 1.4642, + "step": 22043 + }, + { + "epoch": 0.7894425842032696, + "grad_norm": 2.179842710494995, + "learning_rate": 2.236271538182574e-05, + "loss": 1.5229, + "step": 22044 + }, + { + "epoch": 0.7894783963328379, + "grad_norm": 1.7188292741775513, + "learning_rate": 2.2355405348842672e-05, + "loss": 1.3425, + "step": 22045 + }, + { + "epoch": 0.7895142084624062, + "grad_norm": 2.006430149078369, + "learning_rate": 2.2348096360467484e-05, + "loss": 1.4632, + "step": 22046 + }, + { + "epoch": 0.7895500205919745, + "grad_norm": 1.5962008237838745, + "learning_rate": 2.2340788416798518e-05, + "loss": 1.4263, + "step": 22047 + }, + { + "epoch": 0.7895858327215428, + "grad_norm": 1.4921599626541138, + "learning_rate": 2.233348151793404e-05, + "loss": 1.2119, + "step": 22048 + }, + { + "epoch": 0.7896216448511111, + "grad_norm": 1.3751320838928223, + "learning_rate": 2.232617566397238e-05, + "loss": 1.2617, + "step": 22049 + }, + { + "epoch": 0.7896574569806794, + "grad_norm": 1.339401364326477, + "learning_rate": 2.2318870855011874e-05, + "loss": 1.5102, + "step": 22050 + }, + { + "epoch": 0.7896932691102476, + "grad_norm": 1.2918152809143066, + "learning_rate": 2.231156709115073e-05, + "loss": 1.358, + "step": 22051 + }, + { + "epoch": 0.7897290812398159, + "grad_norm": 1.8511288166046143, + "learning_rate": 2.230426437248726e-05, + "loss": 1.1148, + "step": 22052 + }, + { + "epoch": 0.7897648933693842, + "grad_norm": 1.64247727394104, + "learning_rate": 2.229696269911965e-05, + "loss": 1.1974, + "step": 22053 + }, + { + "epoch": 0.7898007054989525, + "grad_norm": 1.3478108644485474, + "learning_rate": 2.228966207114622e-05, + "loss": 1.6666, + "step": 22054 + }, + { + "epoch": 0.7898365176285208, + "grad_norm": 1.3576805591583252, + "learning_rate": 2.228236248866512e-05, + "loss": 1.3979, + "step": 22055 + }, + { + "epoch": 0.7898723297580891, + "grad_norm": 2.1761605739593506, + "learning_rate": 2.2275063951774587e-05, + "loss": 1.5848, + "step": 22056 + }, + { + "epoch": 0.7899081418876573, + "grad_norm": 1.8470492362976074, + "learning_rate": 2.2267766460572814e-05, + "loss": 1.3471, + "step": 22057 + }, + { + "epoch": 0.7899439540172256, + "grad_norm": 1.5361154079437256, + "learning_rate": 2.226047001515801e-05, + "loss": 1.6824, + "step": 22058 + }, + { + "epoch": 0.7899797661467939, + "grad_norm": 1.5846877098083496, + "learning_rate": 2.225317461562829e-05, + "loss": 1.3616, + "step": 22059 + }, + { + "epoch": 0.7900155782763622, + "grad_norm": 1.971751093864441, + "learning_rate": 2.2245880262081774e-05, + "loss": 1.6053, + "step": 22060 + }, + { + "epoch": 0.7900513904059305, + "grad_norm": 1.9126818180084229, + "learning_rate": 2.223858695461669e-05, + "loss": 1.4891, + "step": 22061 + }, + { + "epoch": 0.7900872025354988, + "grad_norm": 1.4563158750534058, + "learning_rate": 2.2231294693331096e-05, + "loss": 1.4378, + "step": 22062 + }, + { + "epoch": 0.7901230146650671, + "grad_norm": 1.2912527322769165, + "learning_rate": 2.222400347832314e-05, + "loss": 1.4947, + "step": 22063 + }, + { + "epoch": 0.7901588267946353, + "grad_norm": 1.532296061515808, + "learning_rate": 2.221671330969084e-05, + "loss": 1.4986, + "step": 22064 + }, + { + "epoch": 0.7901946389242036, + "grad_norm": 1.5457974672317505, + "learning_rate": 2.220942418753238e-05, + "loss": 1.4653, + "step": 22065 + }, + { + "epoch": 0.7902304510537719, + "grad_norm": 1.6285170316696167, + "learning_rate": 2.220213611194576e-05, + "loss": 1.5869, + "step": 22066 + }, + { + "epoch": 0.7902662631833401, + "grad_norm": 1.7709981203079224, + "learning_rate": 2.2194849083029057e-05, + "loss": 1.2938, + "step": 22067 + }, + { + "epoch": 0.7903020753129085, + "grad_norm": 1.4877737760543823, + "learning_rate": 2.2187563100880282e-05, + "loss": 1.5076, + "step": 22068 + }, + { + "epoch": 0.7903378874424768, + "grad_norm": 1.7979744672775269, + "learning_rate": 2.2180278165597467e-05, + "loss": 1.6471, + "step": 22069 + }, + { + "epoch": 0.7903736995720451, + "grad_norm": 1.7051743268966675, + "learning_rate": 2.2172994277278668e-05, + "loss": 1.4707, + "step": 22070 + }, + { + "epoch": 0.7904095117016133, + "grad_norm": 2.3612325191497803, + "learning_rate": 2.2165711436021774e-05, + "loss": 1.3351, + "step": 22071 + }, + { + "epoch": 0.7904453238311816, + "grad_norm": 1.6688679456710815, + "learning_rate": 2.2158429641924895e-05, + "loss": 1.2814, + "step": 22072 + }, + { + "epoch": 0.7904811359607499, + "grad_norm": 1.6498329639434814, + "learning_rate": 2.2151148895085906e-05, + "loss": 1.6461, + "step": 22073 + }, + { + "epoch": 0.7905169480903181, + "grad_norm": 2.0248377323150635, + "learning_rate": 2.2143869195602816e-05, + "loss": 1.5359, + "step": 22074 + }, + { + "epoch": 0.7905527602198865, + "grad_norm": 1.48524010181427, + "learning_rate": 2.2136590543573497e-05, + "loss": 1.4948, + "step": 22075 + }, + { + "epoch": 0.7905885723494548, + "grad_norm": 1.3780854940414429, + "learning_rate": 2.212931293909596e-05, + "loss": 1.2927, + "step": 22076 + }, + { + "epoch": 0.7906243844790231, + "grad_norm": 1.5393905639648438, + "learning_rate": 2.2122036382268074e-05, + "loss": 0.9784, + "step": 22077 + }, + { + "epoch": 0.7906601966085913, + "grad_norm": 1.6730303764343262, + "learning_rate": 2.21147608731877e-05, + "loss": 1.6592, + "step": 22078 + }, + { + "epoch": 0.7906960087381596, + "grad_norm": 1.3171392679214478, + "learning_rate": 2.210748641195276e-05, + "loss": 1.4827, + "step": 22079 + }, + { + "epoch": 0.7907318208677279, + "grad_norm": 1.4257128238677979, + "learning_rate": 2.210021299866112e-05, + "loss": 1.3873, + "step": 22080 + }, + { + "epoch": 0.7907676329972961, + "grad_norm": 1.7365443706512451, + "learning_rate": 2.209294063341065e-05, + "loss": 1.4258, + "step": 22081 + }, + { + "epoch": 0.7908034451268645, + "grad_norm": 1.7937761545181274, + "learning_rate": 2.2085669316299117e-05, + "loss": 1.8, + "step": 22082 + }, + { + "epoch": 0.7908392572564328, + "grad_norm": 2.080672025680542, + "learning_rate": 2.207839904742446e-05, + "loss": 1.4081, + "step": 22083 + }, + { + "epoch": 0.790875069386001, + "grad_norm": 2.0380451679229736, + "learning_rate": 2.2071129826884397e-05, + "loss": 1.2983, + "step": 22084 + }, + { + "epoch": 0.7909108815155693, + "grad_norm": 1.4107887744903564, + "learning_rate": 2.2063861654776798e-05, + "loss": 1.4714, + "step": 22085 + }, + { + "epoch": 0.7909466936451376, + "grad_norm": 1.3979287147521973, + "learning_rate": 2.205659453119938e-05, + "loss": 1.2243, + "step": 22086 + }, + { + "epoch": 0.7909825057747059, + "grad_norm": 1.476107120513916, + "learning_rate": 2.204932845624994e-05, + "loss": 1.5036, + "step": 22087 + }, + { + "epoch": 0.7910183179042741, + "grad_norm": 1.4876946210861206, + "learning_rate": 2.204206343002626e-05, + "loss": 1.4428, + "step": 22088 + }, + { + "epoch": 0.7910541300338425, + "grad_norm": 1.7648842334747314, + "learning_rate": 2.2034799452626043e-05, + "loss": 1.5549, + "step": 22089 + }, + { + "epoch": 0.7910899421634108, + "grad_norm": 1.5976637601852417, + "learning_rate": 2.2027536524147017e-05, + "loss": 1.6226, + "step": 22090 + }, + { + "epoch": 0.791125754292979, + "grad_norm": 1.9662413597106934, + "learning_rate": 2.2020274644686922e-05, + "loss": 1.2422, + "step": 22091 + }, + { + "epoch": 0.7911615664225473, + "grad_norm": 1.4056212902069092, + "learning_rate": 2.2013013814343465e-05, + "loss": 1.2724, + "step": 22092 + }, + { + "epoch": 0.7911973785521156, + "grad_norm": 1.9056984186172485, + "learning_rate": 2.200575403321429e-05, + "loss": 1.7059, + "step": 22093 + }, + { + "epoch": 0.7912331906816839, + "grad_norm": 1.6030707359313965, + "learning_rate": 2.1998495301397083e-05, + "loss": 1.666, + "step": 22094 + }, + { + "epoch": 0.7912690028112521, + "grad_norm": 1.5489169359207153, + "learning_rate": 2.1991237618989535e-05, + "loss": 1.2683, + "step": 22095 + }, + { + "epoch": 0.7913048149408205, + "grad_norm": 1.811343789100647, + "learning_rate": 2.1983980986089235e-05, + "loss": 1.1581, + "step": 22096 + }, + { + "epoch": 0.7913406270703888, + "grad_norm": 1.4421796798706055, + "learning_rate": 2.197672540279384e-05, + "loss": 1.3113, + "step": 22097 + }, + { + "epoch": 0.791376439199957, + "grad_norm": 2.07533860206604, + "learning_rate": 2.196947086920096e-05, + "loss": 1.4348, + "step": 22098 + }, + { + "epoch": 0.7914122513295253, + "grad_norm": 1.6728070974349976, + "learning_rate": 2.1962217385408225e-05, + "loss": 1.508, + "step": 22099 + }, + { + "epoch": 0.7914480634590936, + "grad_norm": 1.8283659219741821, + "learning_rate": 2.1954964951513168e-05, + "loss": 1.5883, + "step": 22100 + }, + { + "epoch": 0.7914838755886618, + "grad_norm": 1.3747072219848633, + "learning_rate": 2.1947713567613382e-05, + "loss": 1.1613, + "step": 22101 + }, + { + "epoch": 0.7915196877182301, + "grad_norm": 1.5672385692596436, + "learning_rate": 2.194046323380643e-05, + "loss": 1.4233, + "step": 22102 + }, + { + "epoch": 0.7915554998477985, + "grad_norm": 1.5005253553390503, + "learning_rate": 2.193321395018989e-05, + "loss": 1.303, + "step": 22103 + }, + { + "epoch": 0.7915913119773668, + "grad_norm": 1.7410175800323486, + "learning_rate": 2.192596571686123e-05, + "loss": 1.4531, + "step": 22104 + }, + { + "epoch": 0.791627124106935, + "grad_norm": 3.041870594024658, + "learning_rate": 2.1918718533917982e-05, + "loss": 1.7464, + "step": 22105 + }, + { + "epoch": 0.7916629362365033, + "grad_norm": 1.4313814640045166, + "learning_rate": 2.1911472401457688e-05, + "loss": 1.426, + "step": 22106 + }, + { + "epoch": 0.7916987483660716, + "grad_norm": 2.0393707752227783, + "learning_rate": 2.1904227319577786e-05, + "loss": 1.4193, + "step": 22107 + }, + { + "epoch": 0.7917345604956398, + "grad_norm": 1.6360301971435547, + "learning_rate": 2.1896983288375773e-05, + "loss": 1.0499, + "step": 22108 + }, + { + "epoch": 0.7917703726252081, + "grad_norm": 2.0467658042907715, + "learning_rate": 2.188974030794909e-05, + "loss": 1.6207, + "step": 22109 + }, + { + "epoch": 0.7918061847547765, + "grad_norm": 1.2687796354293823, + "learning_rate": 2.1882498378395232e-05, + "loss": 1.5234, + "step": 22110 + }, + { + "epoch": 0.7918419968843448, + "grad_norm": 1.6729602813720703, + "learning_rate": 2.1875257499811563e-05, + "loss": 1.2955, + "step": 22111 + }, + { + "epoch": 0.791877809013913, + "grad_norm": 1.8214075565338135, + "learning_rate": 2.1868017672295537e-05, + "loss": 1.5294, + "step": 22112 + }, + { + "epoch": 0.7919136211434813, + "grad_norm": 1.2164210081100464, + "learning_rate": 2.1860778895944566e-05, + "loss": 1.3928, + "step": 22113 + }, + { + "epoch": 0.7919494332730496, + "grad_norm": 1.5118670463562012, + "learning_rate": 2.1853541170856007e-05, + "loss": 1.39, + "step": 22114 + }, + { + "epoch": 0.7919852454026178, + "grad_norm": 1.5947706699371338, + "learning_rate": 2.1846304497127247e-05, + "loss": 1.4102, + "step": 22115 + }, + { + "epoch": 0.7920210575321861, + "grad_norm": 1.5643930435180664, + "learning_rate": 2.183906887485565e-05, + "loss": 1.5654, + "step": 22116 + }, + { + "epoch": 0.7920568696617545, + "grad_norm": 1.5630208253860474, + "learning_rate": 2.1831834304138587e-05, + "loss": 1.5264, + "step": 22117 + }, + { + "epoch": 0.7920926817913228, + "grad_norm": 2.1537749767303467, + "learning_rate": 2.1824600785073335e-05, + "loss": 1.46, + "step": 22118 + }, + { + "epoch": 0.792128493920891, + "grad_norm": 1.5308321714401245, + "learning_rate": 2.1817368317757235e-05, + "loss": 1.3042, + "step": 22119 + }, + { + "epoch": 0.7921643060504593, + "grad_norm": 1.5003904104232788, + "learning_rate": 2.1810136902287625e-05, + "loss": 0.9253, + "step": 22120 + }, + { + "epoch": 0.7922001181800276, + "grad_norm": 1.9857511520385742, + "learning_rate": 2.1802906538761748e-05, + "loss": 1.3661, + "step": 22121 + }, + { + "epoch": 0.7922359303095958, + "grad_norm": 2.4356906414031982, + "learning_rate": 2.179567722727689e-05, + "loss": 1.4112, + "step": 22122 + }, + { + "epoch": 0.7922717424391641, + "grad_norm": 1.3355191946029663, + "learning_rate": 2.178844896793032e-05, + "loss": 1.0901, + "step": 22123 + }, + { + "epoch": 0.7923075545687325, + "grad_norm": 1.3789863586425781, + "learning_rate": 2.1781221760819303e-05, + "loss": 1.6212, + "step": 22124 + }, + { + "epoch": 0.7923433666983007, + "grad_norm": 1.8660190105438232, + "learning_rate": 2.1773995606041044e-05, + "loss": 1.2014, + "step": 22125 + }, + { + "epoch": 0.792379178827869, + "grad_norm": 1.63323175907135, + "learning_rate": 2.1766770503692748e-05, + "loss": 1.5686, + "step": 22126 + }, + { + "epoch": 0.7924149909574373, + "grad_norm": 1.8331819772720337, + "learning_rate": 2.1759546453871647e-05, + "loss": 1.3657, + "step": 22127 + }, + { + "epoch": 0.7924508030870056, + "grad_norm": 1.5052270889282227, + "learning_rate": 2.1752323456674962e-05, + "loss": 1.0748, + "step": 22128 + }, + { + "epoch": 0.7924866152165738, + "grad_norm": 1.5629814863204956, + "learning_rate": 2.1745101512199806e-05, + "loss": 1.4885, + "step": 22129 + }, + { + "epoch": 0.7925224273461421, + "grad_norm": 1.492061734199524, + "learning_rate": 2.173788062054336e-05, + "loss": 1.3173, + "step": 22130 + }, + { + "epoch": 0.7925582394757105, + "grad_norm": 1.879479169845581, + "learning_rate": 2.1730660781802804e-05, + "loss": 1.4771, + "step": 22131 + }, + { + "epoch": 0.7925940516052787, + "grad_norm": 1.4208660125732422, + "learning_rate": 2.1723441996075223e-05, + "loss": 1.3431, + "step": 22132 + }, + { + "epoch": 0.792629863734847, + "grad_norm": 1.137665867805481, + "learning_rate": 2.1716224263457763e-05, + "loss": 1.3964, + "step": 22133 + }, + { + "epoch": 0.7926656758644153, + "grad_norm": 1.7951220273971558, + "learning_rate": 2.1709007584047524e-05, + "loss": 1.682, + "step": 22134 + }, + { + "epoch": 0.7927014879939835, + "grad_norm": 1.7697921991348267, + "learning_rate": 2.170179195794163e-05, + "loss": 1.3207, + "step": 22135 + }, + { + "epoch": 0.7927373001235518, + "grad_norm": 1.775586724281311, + "learning_rate": 2.1694577385237104e-05, + "loss": 1.291, + "step": 22136 + }, + { + "epoch": 0.7927731122531201, + "grad_norm": 1.5712333917617798, + "learning_rate": 2.168736386603102e-05, + "loss": 1.2253, + "step": 22137 + }, + { + "epoch": 0.7928089243826885, + "grad_norm": 1.932370901107788, + "learning_rate": 2.168015140042048e-05, + "loss": 1.6061, + "step": 22138 + }, + { + "epoch": 0.7928447365122567, + "grad_norm": 1.4659613370895386, + "learning_rate": 2.167293998850244e-05, + "loss": 1.1571, + "step": 22139 + }, + { + "epoch": 0.792880548641825, + "grad_norm": 1.5071487426757812, + "learning_rate": 2.1665729630373965e-05, + "loss": 1.5654, + "step": 22140 + }, + { + "epoch": 0.7929163607713933, + "grad_norm": 1.8615208864212036, + "learning_rate": 2.165852032613205e-05, + "loss": 1.2381, + "step": 22141 + }, + { + "epoch": 0.7929521729009615, + "grad_norm": 1.6738359928131104, + "learning_rate": 2.1651312075873718e-05, + "loss": 1.3059, + "step": 22142 + }, + { + "epoch": 0.7929879850305298, + "grad_norm": 1.4147003889083862, + "learning_rate": 2.1644104879695892e-05, + "loss": 1.2178, + "step": 22143 + }, + { + "epoch": 0.7930237971600981, + "grad_norm": 1.8694628477096558, + "learning_rate": 2.1636898737695567e-05, + "loss": 1.411, + "step": 22144 + }, + { + "epoch": 0.7930596092896665, + "grad_norm": 1.5314850807189941, + "learning_rate": 2.1629693649969683e-05, + "loss": 1.3889, + "step": 22145 + }, + { + "epoch": 0.7930954214192347, + "grad_norm": 1.4080699682235718, + "learning_rate": 2.1622489616615203e-05, + "loss": 1.335, + "step": 22146 + }, + { + "epoch": 0.793131233548803, + "grad_norm": 1.9649324417114258, + "learning_rate": 2.1615286637729037e-05, + "loss": 1.4941, + "step": 22147 + }, + { + "epoch": 0.7931670456783713, + "grad_norm": 1.2882145643234253, + "learning_rate": 2.1608084713408018e-05, + "loss": 1.2824, + "step": 22148 + }, + { + "epoch": 0.7932028578079395, + "grad_norm": 1.4714865684509277, + "learning_rate": 2.1600883843749165e-05, + "loss": 1.5583, + "step": 22149 + }, + { + "epoch": 0.7932386699375078, + "grad_norm": 1.5543267726898193, + "learning_rate": 2.159368402884926e-05, + "loss": 1.4382, + "step": 22150 + }, + { + "epoch": 0.7932744820670761, + "grad_norm": 1.4382576942443848, + "learning_rate": 2.1586485268805225e-05, + "loss": 1.3659, + "step": 22151 + }, + { + "epoch": 0.7933102941966444, + "grad_norm": 1.7490313053131104, + "learning_rate": 2.1579287563713823e-05, + "loss": 1.0572, + "step": 22152 + }, + { + "epoch": 0.7933461063262127, + "grad_norm": 1.7107819318771362, + "learning_rate": 2.1572090913672017e-05, + "loss": 1.3691, + "step": 22153 + }, + { + "epoch": 0.793381918455781, + "grad_norm": 2.4976677894592285, + "learning_rate": 2.1564895318776534e-05, + "loss": 1.4157, + "step": 22154 + }, + { + "epoch": 0.7934177305853493, + "grad_norm": 1.6344050168991089, + "learning_rate": 2.1557700779124214e-05, + "loss": 1.4745, + "step": 22155 + }, + { + "epoch": 0.7934535427149175, + "grad_norm": 2.2682435512542725, + "learning_rate": 2.1550507294811863e-05, + "loss": 1.4652, + "step": 22156 + }, + { + "epoch": 0.7934893548444858, + "grad_norm": 1.6466939449310303, + "learning_rate": 2.1543314865936225e-05, + "loss": 1.5305, + "step": 22157 + }, + { + "epoch": 0.7935251669740541, + "grad_norm": 1.3412882089614868, + "learning_rate": 2.1536123492594106e-05, + "loss": 1.2984, + "step": 22158 + }, + { + "epoch": 0.7935609791036224, + "grad_norm": 1.7352933883666992, + "learning_rate": 2.1528933174882183e-05, + "loss": 1.6905, + "step": 22159 + }, + { + "epoch": 0.7935967912331907, + "grad_norm": 1.512288212776184, + "learning_rate": 2.1521743912897296e-05, + "loss": 1.3645, + "step": 22160 + }, + { + "epoch": 0.793632603362759, + "grad_norm": 1.663425326347351, + "learning_rate": 2.1514555706736084e-05, + "loss": 1.2875, + "step": 22161 + }, + { + "epoch": 0.7936684154923273, + "grad_norm": 1.7665363550186157, + "learning_rate": 2.1507368556495323e-05, + "loss": 1.4851, + "step": 22162 + }, + { + "epoch": 0.7937042276218955, + "grad_norm": 1.8287420272827148, + "learning_rate": 2.150018246227161e-05, + "loss": 1.4071, + "step": 22163 + }, + { + "epoch": 0.7937400397514638, + "grad_norm": 1.557187557220459, + "learning_rate": 2.1492997424161744e-05, + "loss": 1.3892, + "step": 22164 + }, + { + "epoch": 0.7937758518810321, + "grad_norm": 1.6745768785476685, + "learning_rate": 2.1485813442262325e-05, + "loss": 1.1793, + "step": 22165 + }, + { + "epoch": 0.7938116640106004, + "grad_norm": 2.1988525390625, + "learning_rate": 2.1478630516669952e-05, + "loss": 1.5253, + "step": 22166 + }, + { + "epoch": 0.7938474761401687, + "grad_norm": 1.7745083570480347, + "learning_rate": 2.1471448647481384e-05, + "loss": 1.0861, + "step": 22167 + }, + { + "epoch": 0.793883288269737, + "grad_norm": 1.9180774688720703, + "learning_rate": 2.1464267834793152e-05, + "loss": 1.3288, + "step": 22168 + }, + { + "epoch": 0.7939191003993052, + "grad_norm": 1.5834040641784668, + "learning_rate": 2.1457088078701916e-05, + "loss": 1.544, + "step": 22169 + }, + { + "epoch": 0.7939549125288735, + "grad_norm": 1.3772550821304321, + "learning_rate": 2.144990937930419e-05, + "loss": 1.4562, + "step": 22170 + }, + { + "epoch": 0.7939907246584418, + "grad_norm": 1.783267855644226, + "learning_rate": 2.1442731736696666e-05, + "loss": 1.5635, + "step": 22171 + }, + { + "epoch": 0.79402653678801, + "grad_norm": 1.4774640798568726, + "learning_rate": 2.143555515097583e-05, + "loss": 1.3106, + "step": 22172 + }, + { + "epoch": 0.7940623489175784, + "grad_norm": 1.496637225151062, + "learning_rate": 2.1428379622238283e-05, + "loss": 1.4957, + "step": 22173 + }, + { + "epoch": 0.7940981610471467, + "grad_norm": 2.0105698108673096, + "learning_rate": 2.1421205150580514e-05, + "loss": 1.9181, + "step": 22174 + }, + { + "epoch": 0.794133973176715, + "grad_norm": 1.443631887435913, + "learning_rate": 2.1414031736099072e-05, + "loss": 1.3103, + "step": 22175 + }, + { + "epoch": 0.7941697853062832, + "grad_norm": 1.856333613395691, + "learning_rate": 2.1406859378890486e-05, + "loss": 1.6582, + "step": 22176 + }, + { + "epoch": 0.7942055974358515, + "grad_norm": 1.6512917280197144, + "learning_rate": 2.1399688079051205e-05, + "loss": 1.1163, + "step": 22177 + }, + { + "epoch": 0.7942414095654198, + "grad_norm": 1.48997962474823, + "learning_rate": 2.1392517836677738e-05, + "loss": 1.2154, + "step": 22178 + }, + { + "epoch": 0.794277221694988, + "grad_norm": 1.7168711423873901, + "learning_rate": 2.1385348651866542e-05, + "loss": 1.384, + "step": 22179 + }, + { + "epoch": 0.7943130338245564, + "grad_norm": 1.7471309900283813, + "learning_rate": 2.13781805247141e-05, + "loss": 1.5881, + "step": 22180 + }, + { + "epoch": 0.7943488459541247, + "grad_norm": 1.4693900346755981, + "learning_rate": 2.137101345531677e-05, + "loss": 1.5126, + "step": 22181 + }, + { + "epoch": 0.794384658083693, + "grad_norm": 1.2849462032318115, + "learning_rate": 2.136384744377109e-05, + "loss": 1.44, + "step": 22182 + }, + { + "epoch": 0.7944204702132612, + "grad_norm": 1.9792394638061523, + "learning_rate": 2.135668249017341e-05, + "loss": 1.6862, + "step": 22183 + }, + { + "epoch": 0.7944562823428295, + "grad_norm": 2.0086636543273926, + "learning_rate": 2.134951859462009e-05, + "loss": 1.3668, + "step": 22184 + }, + { + "epoch": 0.7944920944723978, + "grad_norm": 1.3767683506011963, + "learning_rate": 2.1342355757207544e-05, + "loss": 1.2261, + "step": 22185 + }, + { + "epoch": 0.794527906601966, + "grad_norm": 1.7332990169525146, + "learning_rate": 2.1335193978032152e-05, + "loss": 1.233, + "step": 22186 + }, + { + "epoch": 0.7945637187315344, + "grad_norm": 1.5471128225326538, + "learning_rate": 2.1328033257190272e-05, + "loss": 1.2935, + "step": 22187 + }, + { + "epoch": 0.7945995308611027, + "grad_norm": 1.7943533658981323, + "learning_rate": 2.13208735947782e-05, + "loss": 1.4358, + "step": 22188 + }, + { + "epoch": 0.794635342990671, + "grad_norm": 1.4228322505950928, + "learning_rate": 2.1313714990892285e-05, + "loss": 1.4599, + "step": 22189 + }, + { + "epoch": 0.7946711551202392, + "grad_norm": 1.6347566843032837, + "learning_rate": 2.1306557445628837e-05, + "loss": 1.4877, + "step": 22190 + }, + { + "epoch": 0.7947069672498075, + "grad_norm": 1.4751968383789062, + "learning_rate": 2.1299400959084183e-05, + "loss": 1.5368, + "step": 22191 + }, + { + "epoch": 0.7947427793793758, + "grad_norm": 1.509900689125061, + "learning_rate": 2.1292245531354538e-05, + "loss": 1.2897, + "step": 22192 + }, + { + "epoch": 0.794778591508944, + "grad_norm": 1.7221992015838623, + "learning_rate": 2.128509116253621e-05, + "loss": 1.5262, + "step": 22193 + }, + { + "epoch": 0.7948144036385124, + "grad_norm": 1.6442131996154785, + "learning_rate": 2.1277937852725472e-05, + "loss": 1.1642, + "step": 22194 + }, + { + "epoch": 0.7948502157680807, + "grad_norm": 1.5237714052200317, + "learning_rate": 2.1270785602018505e-05, + "loss": 1.2724, + "step": 22195 + }, + { + "epoch": 0.794886027897649, + "grad_norm": 2.1376285552978516, + "learning_rate": 2.1263634410511567e-05, + "loss": 1.4343, + "step": 22196 + }, + { + "epoch": 0.7949218400272172, + "grad_norm": 2.232311487197876, + "learning_rate": 2.125648427830086e-05, + "loss": 1.6066, + "step": 22197 + }, + { + "epoch": 0.7949576521567855, + "grad_norm": 1.5320496559143066, + "learning_rate": 2.1249335205482613e-05, + "loss": 1.4768, + "step": 22198 + }, + { + "epoch": 0.7949934642863538, + "grad_norm": 1.4314378499984741, + "learning_rate": 2.1242187192152964e-05, + "loss": 1.3451, + "step": 22199 + }, + { + "epoch": 0.795029276415922, + "grad_norm": 1.5462158918380737, + "learning_rate": 2.1235040238408087e-05, + "loss": 1.6463, + "step": 22200 + }, + { + "epoch": 0.7950650885454904, + "grad_norm": 1.7351806163787842, + "learning_rate": 2.122789434434417e-05, + "loss": 1.5408, + "step": 22201 + }, + { + "epoch": 0.7951009006750587, + "grad_norm": 1.6399816274642944, + "learning_rate": 2.1220749510057304e-05, + "loss": 1.4684, + "step": 22202 + }, + { + "epoch": 0.7951367128046269, + "grad_norm": 2.056469440460205, + "learning_rate": 2.1213605735643625e-05, + "loss": 1.8509, + "step": 22203 + }, + { + "epoch": 0.7951725249341952, + "grad_norm": 1.7393684387207031, + "learning_rate": 2.1206463021199263e-05, + "loss": 1.3368, + "step": 22204 + }, + { + "epoch": 0.7952083370637635, + "grad_norm": 1.5154002904891968, + "learning_rate": 2.1199321366820336e-05, + "loss": 1.4059, + "step": 22205 + }, + { + "epoch": 0.7952441491933318, + "grad_norm": 2.056178569793701, + "learning_rate": 2.1192180772602867e-05, + "loss": 1.588, + "step": 22206 + }, + { + "epoch": 0.7952799613229, + "grad_norm": 1.605133295059204, + "learning_rate": 2.1185041238642934e-05, + "loss": 1.3137, + "step": 22207 + }, + { + "epoch": 0.7953157734524684, + "grad_norm": 1.9872175455093384, + "learning_rate": 2.117790276503665e-05, + "loss": 1.3687, + "step": 22208 + }, + { + "epoch": 0.7953515855820367, + "grad_norm": 1.4879945516586304, + "learning_rate": 2.1170765351879985e-05, + "loss": 1.5544, + "step": 22209 + }, + { + "epoch": 0.7953873977116049, + "grad_norm": 1.7017463445663452, + "learning_rate": 2.116362899926898e-05, + "loss": 1.3034, + "step": 22210 + }, + { + "epoch": 0.7954232098411732, + "grad_norm": 1.7935137748718262, + "learning_rate": 2.1156493707299664e-05, + "loss": 1.1258, + "step": 22211 + }, + { + "epoch": 0.7954590219707415, + "grad_norm": 2.020554780960083, + "learning_rate": 2.1149359476068043e-05, + "loss": 1.7252, + "step": 22212 + }, + { + "epoch": 0.7954948341003097, + "grad_norm": 2.046848773956299, + "learning_rate": 2.1142226305670054e-05, + "loss": 1.5382, + "step": 22213 + }, + { + "epoch": 0.795530646229878, + "grad_norm": 1.9651658535003662, + "learning_rate": 2.1135094196201698e-05, + "loss": 1.5724, + "step": 22214 + }, + { + "epoch": 0.7955664583594464, + "grad_norm": 1.6604115962982178, + "learning_rate": 2.112796314775892e-05, + "loss": 1.4821, + "step": 22215 + }, + { + "epoch": 0.7956022704890147, + "grad_norm": 1.4332162141799927, + "learning_rate": 2.112083316043768e-05, + "loss": 1.5424, + "step": 22216 + }, + { + "epoch": 0.7956380826185829, + "grad_norm": 1.3632845878601074, + "learning_rate": 2.1113704234333866e-05, + "loss": 1.4333, + "step": 22217 + }, + { + "epoch": 0.7956738947481512, + "grad_norm": 1.6020152568817139, + "learning_rate": 2.1106576369543395e-05, + "loss": 1.3168, + "step": 22218 + }, + { + "epoch": 0.7957097068777195, + "grad_norm": 1.4405721426010132, + "learning_rate": 2.109944956616221e-05, + "loss": 1.3853, + "step": 22219 + }, + { + "epoch": 0.7957455190072877, + "grad_norm": 1.601324439048767, + "learning_rate": 2.109232382428612e-05, + "loss": 1.4282, + "step": 22220 + }, + { + "epoch": 0.795781331136856, + "grad_norm": 1.5386035442352295, + "learning_rate": 2.1085199144011037e-05, + "loss": 1.5231, + "step": 22221 + }, + { + "epoch": 0.7958171432664244, + "grad_norm": 1.4716813564300537, + "learning_rate": 2.1078075525432805e-05, + "loss": 1.3985, + "step": 22222 + }, + { + "epoch": 0.7958529553959927, + "grad_norm": 2.0186898708343506, + "learning_rate": 2.1070952968647296e-05, + "loss": 1.3269, + "step": 22223 + }, + { + "epoch": 0.7958887675255609, + "grad_norm": 1.4052791595458984, + "learning_rate": 2.1063831473750272e-05, + "loss": 1.1185, + "step": 22224 + }, + { + "epoch": 0.7959245796551292, + "grad_norm": 1.8976056575775146, + "learning_rate": 2.1056711040837574e-05, + "loss": 1.6696, + "step": 22225 + }, + { + "epoch": 0.7959603917846975, + "grad_norm": 1.5097455978393555, + "learning_rate": 2.104959167000503e-05, + "loss": 1.5236, + "step": 22226 + }, + { + "epoch": 0.7959962039142657, + "grad_norm": 1.326099157333374, + "learning_rate": 2.104247336134836e-05, + "loss": 1.2501, + "step": 22227 + }, + { + "epoch": 0.796032016043834, + "grad_norm": 1.6710323095321655, + "learning_rate": 2.103535611496337e-05, + "loss": 1.243, + "step": 22228 + }, + { + "epoch": 0.7960678281734024, + "grad_norm": 2.964059829711914, + "learning_rate": 2.1028239930945794e-05, + "loss": 1.1575, + "step": 22229 + }, + { + "epoch": 0.7961036403029706, + "grad_norm": 1.9996397495269775, + "learning_rate": 2.1021124809391423e-05, + "loss": 1.2896, + "step": 22230 + }, + { + "epoch": 0.7961394524325389, + "grad_norm": 2.0370073318481445, + "learning_rate": 2.1014010750395907e-05, + "loss": 1.5759, + "step": 22231 + }, + { + "epoch": 0.7961752645621072, + "grad_norm": 2.4355692863464355, + "learning_rate": 2.1006897754055e-05, + "loss": 1.5545, + "step": 22232 + }, + { + "epoch": 0.7962110766916755, + "grad_norm": 1.777365803718567, + "learning_rate": 2.099978582046438e-05, + "loss": 1.3352, + "step": 22233 + }, + { + "epoch": 0.7962468888212437, + "grad_norm": 1.5381382703781128, + "learning_rate": 2.099267494971977e-05, + "loss": 1.5469, + "step": 22234 + }, + { + "epoch": 0.796282700950812, + "grad_norm": 1.5520857572555542, + "learning_rate": 2.0985565141916808e-05, + "loss": 1.0467, + "step": 22235 + }, + { + "epoch": 0.7963185130803804, + "grad_norm": 1.8397648334503174, + "learning_rate": 2.097845639715109e-05, + "loss": 1.5229, + "step": 22236 + }, + { + "epoch": 0.7963543252099486, + "grad_norm": 1.6012263298034668, + "learning_rate": 2.0971348715518368e-05, + "loss": 1.1753, + "step": 22237 + }, + { + "epoch": 0.7963901373395169, + "grad_norm": 1.5385147333145142, + "learning_rate": 2.0964242097114184e-05, + "loss": 1.825, + "step": 22238 + }, + { + "epoch": 0.7964259494690852, + "grad_norm": 1.9621472358703613, + "learning_rate": 2.0957136542034172e-05, + "loss": 1.3496, + "step": 22239 + }, + { + "epoch": 0.7964617615986535, + "grad_norm": 1.6801329851150513, + "learning_rate": 2.0950032050373925e-05, + "loss": 1.2043, + "step": 22240 + }, + { + "epoch": 0.7964975737282217, + "grad_norm": 2.1597633361816406, + "learning_rate": 2.0942928622229064e-05, + "loss": 1.5174, + "step": 22241 + }, + { + "epoch": 0.79653338585779, + "grad_norm": 1.5973130464553833, + "learning_rate": 2.093582625769509e-05, + "loss": 1.7511, + "step": 22242 + }, + { + "epoch": 0.7965691979873584, + "grad_norm": 2.0921790599823, + "learning_rate": 2.0928724956867585e-05, + "loss": 1.3482, + "step": 22243 + }, + { + "epoch": 0.7966050101169266, + "grad_norm": 2.008749008178711, + "learning_rate": 2.0921624719842126e-05, + "loss": 1.5626, + "step": 22244 + }, + { + "epoch": 0.7966408222464949, + "grad_norm": 1.3919188976287842, + "learning_rate": 2.091452554671417e-05, + "loss": 1.3621, + "step": 22245 + }, + { + "epoch": 0.7966766343760632, + "grad_norm": 2.2559168338775635, + "learning_rate": 2.0907427437579287e-05, + "loss": 1.3592, + "step": 22246 + }, + { + "epoch": 0.7967124465056314, + "grad_norm": 1.4418983459472656, + "learning_rate": 2.0900330392532895e-05, + "loss": 1.529, + "step": 22247 + }, + { + "epoch": 0.7967482586351997, + "grad_norm": 1.5940090417861938, + "learning_rate": 2.089323441167058e-05, + "loss": 1.6447, + "step": 22248 + }, + { + "epoch": 0.796784070764768, + "grad_norm": 1.591691255569458, + "learning_rate": 2.088613949508772e-05, + "loss": 1.244, + "step": 22249 + }, + { + "epoch": 0.7968198828943364, + "grad_norm": 1.6028300523757935, + "learning_rate": 2.0879045642879814e-05, + "loss": 1.3213, + "step": 22250 + }, + { + "epoch": 0.7968556950239046, + "grad_norm": 1.5213801860809326, + "learning_rate": 2.0871952855142286e-05, + "loss": 1.2998, + "step": 22251 + }, + { + "epoch": 0.7968915071534729, + "grad_norm": 3.0959925651550293, + "learning_rate": 2.0864861131970594e-05, + "loss": 1.4941, + "step": 22252 + }, + { + "epoch": 0.7969273192830412, + "grad_norm": 1.3404817581176758, + "learning_rate": 2.0857770473460115e-05, + "loss": 1.2366, + "step": 22253 + }, + { + "epoch": 0.7969631314126094, + "grad_norm": 2.0209555625915527, + "learning_rate": 2.08506808797062e-05, + "loss": 1.3125, + "step": 22254 + }, + { + "epoch": 0.7969989435421777, + "grad_norm": 1.8329658508300781, + "learning_rate": 2.084359235080433e-05, + "loss": 1.4451, + "step": 22255 + }, + { + "epoch": 0.797034755671746, + "grad_norm": 1.6439001560211182, + "learning_rate": 2.0836504886849785e-05, + "loss": 1.7872, + "step": 22256 + }, + { + "epoch": 0.7970705678013144, + "grad_norm": 2.4358198642730713, + "learning_rate": 2.082941848793799e-05, + "loss": 1.4332, + "step": 22257 + }, + { + "epoch": 0.7971063799308826, + "grad_norm": 1.5745573043823242, + "learning_rate": 2.0822333154164187e-05, + "loss": 1.1289, + "step": 22258 + }, + { + "epoch": 0.7971421920604509, + "grad_norm": 1.8613457679748535, + "learning_rate": 2.0815248885623817e-05, + "loss": 1.2301, + "step": 22259 + }, + { + "epoch": 0.7971780041900192, + "grad_norm": 1.6447163820266724, + "learning_rate": 2.0808165682412107e-05, + "loss": 1.2927, + "step": 22260 + }, + { + "epoch": 0.7972138163195874, + "grad_norm": 2.150489568710327, + "learning_rate": 2.08010835446244e-05, + "loss": 1.5737, + "step": 22261 + }, + { + "epoch": 0.7972496284491557, + "grad_norm": 2.0756678581237793, + "learning_rate": 2.0794002472355933e-05, + "loss": 1.6523, + "step": 22262 + }, + { + "epoch": 0.797285440578724, + "grad_norm": 1.8237303495407104, + "learning_rate": 2.0786922465701997e-05, + "loss": 1.5846, + "step": 22263 + }, + { + "epoch": 0.7973212527082923, + "grad_norm": 1.4604321718215942, + "learning_rate": 2.0779843524757858e-05, + "loss": 1.128, + "step": 22264 + }, + { + "epoch": 0.7973570648378606, + "grad_norm": 1.7635831832885742, + "learning_rate": 2.0772765649618688e-05, + "loss": 1.3801, + "step": 22265 + }, + { + "epoch": 0.7973928769674289, + "grad_norm": 2.052753448486328, + "learning_rate": 2.0765688840379816e-05, + "loss": 1.3685, + "step": 22266 + }, + { + "epoch": 0.7974286890969972, + "grad_norm": 1.3673149347305298, + "learning_rate": 2.075861309713637e-05, + "loss": 0.9523, + "step": 22267 + }, + { + "epoch": 0.7974645012265654, + "grad_norm": 2.1413803100585938, + "learning_rate": 2.0751538419983598e-05, + "loss": 1.1637, + "step": 22268 + }, + { + "epoch": 0.7975003133561337, + "grad_norm": 1.6249077320098877, + "learning_rate": 2.0744464809016593e-05, + "loss": 1.676, + "step": 22269 + }, + { + "epoch": 0.797536125485702, + "grad_norm": 1.6026825904846191, + "learning_rate": 2.0737392264330635e-05, + "loss": 1.188, + "step": 22270 + }, + { + "epoch": 0.7975719376152703, + "grad_norm": 1.4662625789642334, + "learning_rate": 2.073032078602083e-05, + "loss": 1.4825, + "step": 22271 + }, + { + "epoch": 0.7976077497448386, + "grad_norm": 1.4503960609436035, + "learning_rate": 2.0723250374182278e-05, + "loss": 1.6084, + "step": 22272 + }, + { + "epoch": 0.7976435618744069, + "grad_norm": 1.5225666761398315, + "learning_rate": 2.071618102891013e-05, + "loss": 1.307, + "step": 22273 + }, + { + "epoch": 0.7976793740039752, + "grad_norm": 1.6680306196212769, + "learning_rate": 2.070911275029951e-05, + "loss": 1.533, + "step": 22274 + }, + { + "epoch": 0.7977151861335434, + "grad_norm": 2.219346761703491, + "learning_rate": 2.0702045538445515e-05, + "loss": 1.7578, + "step": 22275 + }, + { + "epoch": 0.7977509982631117, + "grad_norm": 1.6483958959579468, + "learning_rate": 2.069497939344316e-05, + "loss": 1.3416, + "step": 22276 + }, + { + "epoch": 0.79778681039268, + "grad_norm": 1.3028994798660278, + "learning_rate": 2.0687914315387613e-05, + "loss": 1.5492, + "step": 22277 + }, + { + "epoch": 0.7978226225222483, + "grad_norm": 1.6726864576339722, + "learning_rate": 2.0680850304373843e-05, + "loss": 1.4142, + "step": 22278 + }, + { + "epoch": 0.7978584346518166, + "grad_norm": 1.879017949104309, + "learning_rate": 2.0673787360496954e-05, + "loss": 1.5515, + "step": 22279 + }, + { + "epoch": 0.7978942467813849, + "grad_norm": 1.7818199396133423, + "learning_rate": 2.06667254838519e-05, + "loss": 1.4629, + "step": 22280 + }, + { + "epoch": 0.7979300589109531, + "grad_norm": 1.6430115699768066, + "learning_rate": 2.0659664674533728e-05, + "loss": 1.6182, + "step": 22281 + }, + { + "epoch": 0.7979658710405214, + "grad_norm": 2.086963176727295, + "learning_rate": 2.065260493263744e-05, + "loss": 1.5188, + "step": 22282 + }, + { + "epoch": 0.7980016831700897, + "grad_norm": 1.5534586906433105, + "learning_rate": 2.0645546258257987e-05, + "loss": 1.5905, + "step": 22283 + }, + { + "epoch": 0.798037495299658, + "grad_norm": 1.8073506355285645, + "learning_rate": 2.063848865149035e-05, + "loss": 1.5429, + "step": 22284 + }, + { + "epoch": 0.7980733074292263, + "grad_norm": 1.6143696308135986, + "learning_rate": 2.0631432112429473e-05, + "loss": 1.5414, + "step": 22285 + }, + { + "epoch": 0.7981091195587946, + "grad_norm": 1.9073354005813599, + "learning_rate": 2.062437664117033e-05, + "loss": 1.6396, + "step": 22286 + }, + { + "epoch": 0.7981449316883629, + "grad_norm": 1.7292273044586182, + "learning_rate": 2.061732223780778e-05, + "loss": 1.3851, + "step": 22287 + }, + { + "epoch": 0.7981807438179311, + "grad_norm": 1.2550760507583618, + "learning_rate": 2.061026890243677e-05, + "loss": 1.3337, + "step": 22288 + }, + { + "epoch": 0.7982165559474994, + "grad_norm": 1.5007202625274658, + "learning_rate": 2.06032166351522e-05, + "loss": 1.1814, + "step": 22289 + }, + { + "epoch": 0.7982523680770677, + "grad_norm": 1.4860681295394897, + "learning_rate": 2.059616543604892e-05, + "loss": 1.4702, + "step": 22290 + }, + { + "epoch": 0.7982881802066359, + "grad_norm": 1.7105454206466675, + "learning_rate": 2.0589115305221807e-05, + "loss": 1.556, + "step": 22291 + }, + { + "epoch": 0.7983239923362043, + "grad_norm": 1.458163857460022, + "learning_rate": 2.058206624276572e-05, + "loss": 1.5388, + "step": 22292 + }, + { + "epoch": 0.7983598044657726, + "grad_norm": 1.4146854877471924, + "learning_rate": 2.0575018248775513e-05, + "loss": 1.6969, + "step": 22293 + }, + { + "epoch": 0.7983956165953409, + "grad_norm": 1.3511170148849487, + "learning_rate": 2.056797132334596e-05, + "loss": 1.4063, + "step": 22294 + }, + { + "epoch": 0.7984314287249091, + "grad_norm": 1.6224746704101562, + "learning_rate": 2.056092546657189e-05, + "loss": 1.4332, + "step": 22295 + }, + { + "epoch": 0.7984672408544774, + "grad_norm": 1.4703950881958008, + "learning_rate": 2.0553880678548097e-05, + "loss": 1.3809, + "step": 22296 + }, + { + "epoch": 0.7985030529840457, + "grad_norm": 1.9702130556106567, + "learning_rate": 2.0546836959369387e-05, + "loss": 1.431, + "step": 22297 + }, + { + "epoch": 0.7985388651136139, + "grad_norm": 1.900384783744812, + "learning_rate": 2.0539794309130478e-05, + "loss": 1.1501, + "step": 22298 + }, + { + "epoch": 0.7985746772431823, + "grad_norm": 1.9203412532806396, + "learning_rate": 2.0532752727926142e-05, + "loss": 1.6926, + "step": 22299 + }, + { + "epoch": 0.7986104893727506, + "grad_norm": 1.5627378225326538, + "learning_rate": 2.0525712215851132e-05, + "loss": 1.3586, + "step": 22300 + }, + { + "epoch": 0.7986463015023189, + "grad_norm": 1.3566069602966309, + "learning_rate": 2.0518672773000124e-05, + "loss": 1.417, + "step": 22301 + }, + { + "epoch": 0.7986821136318871, + "grad_norm": 2.314469814300537, + "learning_rate": 2.051163439946786e-05, + "loss": 1.7983, + "step": 22302 + }, + { + "epoch": 0.7987179257614554, + "grad_norm": 1.9831676483154297, + "learning_rate": 2.050459709534901e-05, + "loss": 1.6987, + "step": 22303 + }, + { + "epoch": 0.7987537378910237, + "grad_norm": 2.6953423023223877, + "learning_rate": 2.0497560860738295e-05, + "loss": 1.2652, + "step": 22304 + }, + { + "epoch": 0.7987895500205919, + "grad_norm": 1.602420449256897, + "learning_rate": 2.0490525695730323e-05, + "loss": 1.4029, + "step": 22305 + }, + { + "epoch": 0.7988253621501603, + "grad_norm": 1.531333565711975, + "learning_rate": 2.048349160041977e-05, + "loss": 1.3618, + "step": 22306 + }, + { + "epoch": 0.7988611742797286, + "grad_norm": 1.4855061769485474, + "learning_rate": 2.0476458574901293e-05, + "loss": 1.2672, + "step": 22307 + }, + { + "epoch": 0.7988969864092968, + "grad_norm": 2.015868663787842, + "learning_rate": 2.046942661926946e-05, + "loss": 1.4248, + "step": 22308 + }, + { + "epoch": 0.7989327985388651, + "grad_norm": 1.3681994676589966, + "learning_rate": 2.04623957336189e-05, + "loss": 1.1357, + "step": 22309 + }, + { + "epoch": 0.7989686106684334, + "grad_norm": 2.770206928253174, + "learning_rate": 2.0455365918044224e-05, + "loss": 1.257, + "step": 22310 + }, + { + "epoch": 0.7990044227980017, + "grad_norm": 1.8386939764022827, + "learning_rate": 2.044833717264001e-05, + "loss": 1.4291, + "step": 22311 + }, + { + "epoch": 0.7990402349275699, + "grad_norm": 1.7308319807052612, + "learning_rate": 2.044130949750077e-05, + "loss": 1.3881, + "step": 22312 + }, + { + "epoch": 0.7990760470571383, + "grad_norm": 2.2315123081207275, + "learning_rate": 2.04342828927211e-05, + "loss": 1.3908, + "step": 22313 + }, + { + "epoch": 0.7991118591867066, + "grad_norm": 1.8249056339263916, + "learning_rate": 2.0427257358395546e-05, + "loss": 1.5637, + "step": 22314 + }, + { + "epoch": 0.7991476713162748, + "grad_norm": 1.5045298337936401, + "learning_rate": 2.0420232894618573e-05, + "loss": 1.42, + "step": 22315 + }, + { + "epoch": 0.7991834834458431, + "grad_norm": 1.8735840320587158, + "learning_rate": 2.041320950148472e-05, + "loss": 1.581, + "step": 22316 + }, + { + "epoch": 0.7992192955754114, + "grad_norm": 1.6931583881378174, + "learning_rate": 2.0406187179088477e-05, + "loss": 1.4165, + "step": 22317 + }, + { + "epoch": 0.7992551077049797, + "grad_norm": 2.089118719100952, + "learning_rate": 2.0399165927524334e-05, + "loss": 1.4857, + "step": 22318 + }, + { + "epoch": 0.7992909198345479, + "grad_norm": 1.5633699893951416, + "learning_rate": 2.0392145746886714e-05, + "loss": 1.5717, + "step": 22319 + }, + { + "epoch": 0.7993267319641163, + "grad_norm": 1.8018674850463867, + "learning_rate": 2.038512663727009e-05, + "loss": 1.4133, + "step": 22320 + }, + { + "epoch": 0.7993625440936846, + "grad_norm": 1.3811540603637695, + "learning_rate": 2.0378108598768887e-05, + "loss": 1.2667, + "step": 22321 + }, + { + "epoch": 0.7993983562232528, + "grad_norm": 1.6007475852966309, + "learning_rate": 2.0371091631477557e-05, + "loss": 1.2712, + "step": 22322 + }, + { + "epoch": 0.7994341683528211, + "grad_norm": 1.5358997583389282, + "learning_rate": 2.036407573549044e-05, + "loss": 1.5853, + "step": 22323 + }, + { + "epoch": 0.7994699804823894, + "grad_norm": 2.150416374206543, + "learning_rate": 2.035706091090197e-05, + "loss": 1.3255, + "step": 22324 + }, + { + "epoch": 0.7995057926119576, + "grad_norm": 1.9435093402862549, + "learning_rate": 2.035004715780654e-05, + "loss": 1.1444, + "step": 22325 + }, + { + "epoch": 0.7995416047415259, + "grad_norm": 1.7371599674224854, + "learning_rate": 2.0343034476298452e-05, + "loss": 1.5803, + "step": 22326 + }, + { + "epoch": 0.7995774168710943, + "grad_norm": 1.5360907316207886, + "learning_rate": 2.0336022866472092e-05, + "loss": 1.5967, + "step": 22327 + }, + { + "epoch": 0.7996132290006626, + "grad_norm": 1.229755163192749, + "learning_rate": 2.0329012328421783e-05, + "loss": 1.2461, + "step": 22328 + }, + { + "epoch": 0.7996490411302308, + "grad_norm": 1.8806687593460083, + "learning_rate": 2.0322002862241863e-05, + "loss": 1.5251, + "step": 22329 + }, + { + "epoch": 0.7996848532597991, + "grad_norm": 1.3057172298431396, + "learning_rate": 2.0314994468026606e-05, + "loss": 1.4201, + "step": 22330 + }, + { + "epoch": 0.7997206653893674, + "grad_norm": 1.5392056703567505, + "learning_rate": 2.03079871458703e-05, + "loss": 1.6202, + "step": 22331 + }, + { + "epoch": 0.7997564775189356, + "grad_norm": 1.719313621520996, + "learning_rate": 2.0300980895867263e-05, + "loss": 1.3479, + "step": 22332 + }, + { + "epoch": 0.7997922896485039, + "grad_norm": 1.619842767715454, + "learning_rate": 2.029397571811169e-05, + "loss": 1.5288, + "step": 22333 + }, + { + "epoch": 0.7998281017780723, + "grad_norm": 2.1151888370513916, + "learning_rate": 2.0286971612697902e-05, + "loss": 1.6217, + "step": 22334 + }, + { + "epoch": 0.7998639139076406, + "grad_norm": 1.7302162647247314, + "learning_rate": 2.027996857972002e-05, + "loss": 1.2988, + "step": 22335 + }, + { + "epoch": 0.7998997260372088, + "grad_norm": 1.6297879219055176, + "learning_rate": 2.0272966619272392e-05, + "loss": 1.0922, + "step": 22336 + }, + { + "epoch": 0.7999355381667771, + "grad_norm": 1.6117581129074097, + "learning_rate": 2.026596573144913e-05, + "loss": 0.9956, + "step": 22337 + }, + { + "epoch": 0.7999713502963454, + "grad_norm": 1.5663176774978638, + "learning_rate": 2.025896591634444e-05, + "loss": 1.3235, + "step": 22338 + }, + { + "epoch": 0.8000071624259136, + "grad_norm": 1.4366532564163208, + "learning_rate": 2.0251967174052523e-05, + "loss": 1.0999, + "step": 22339 + }, + { + "epoch": 0.8000429745554819, + "grad_norm": 1.7538988590240479, + "learning_rate": 2.024496950466753e-05, + "loss": 1.7191, + "step": 22340 + }, + { + "epoch": 0.8000787866850503, + "grad_norm": 1.5960266590118408, + "learning_rate": 2.023797290828361e-05, + "loss": 1.3942, + "step": 22341 + }, + { + "epoch": 0.8001145988146185, + "grad_norm": 1.8154847621917725, + "learning_rate": 2.0230977384994808e-05, + "loss": 1.1857, + "step": 22342 + }, + { + "epoch": 0.8001504109441868, + "grad_norm": 1.5289015769958496, + "learning_rate": 2.022398293489538e-05, + "loss": 1.6854, + "step": 22343 + }, + { + "epoch": 0.8001862230737551, + "grad_norm": 1.8163173198699951, + "learning_rate": 2.0216989558079326e-05, + "loss": 1.5605, + "step": 22344 + }, + { + "epoch": 0.8002220352033234, + "grad_norm": 1.35292387008667, + "learning_rate": 2.020999725464079e-05, + "loss": 1.3525, + "step": 22345 + }, + { + "epoch": 0.8002578473328916, + "grad_norm": 1.5341395139694214, + "learning_rate": 2.0203006024673764e-05, + "loss": 1.326, + "step": 22346 + }, + { + "epoch": 0.8002936594624599, + "grad_norm": 1.8680927753448486, + "learning_rate": 2.0196015868272412e-05, + "loss": 1.5703, + "step": 22347 + }, + { + "epoch": 0.8003294715920283, + "grad_norm": 1.65446937084198, + "learning_rate": 2.0189026785530705e-05, + "loss": 1.2258, + "step": 22348 + }, + { + "epoch": 0.8003652837215965, + "grad_norm": 2.1270790100097656, + "learning_rate": 2.01820387765427e-05, + "loss": 1.492, + "step": 22349 + }, + { + "epoch": 0.8004010958511648, + "grad_norm": 1.5399971008300781, + "learning_rate": 2.0175051841402426e-05, + "loss": 1.5374, + "step": 22350 + }, + { + "epoch": 0.8004369079807331, + "grad_norm": 1.526645302772522, + "learning_rate": 2.016806598020383e-05, + "loss": 1.2816, + "step": 22351 + }, + { + "epoch": 0.8004727201103014, + "grad_norm": 1.623569130897522, + "learning_rate": 2.0161081193040964e-05, + "loss": 1.244, + "step": 22352 + }, + { + "epoch": 0.8005085322398696, + "grad_norm": 1.273235559463501, + "learning_rate": 2.0154097480007716e-05, + "loss": 1.1387, + "step": 22353 + }, + { + "epoch": 0.8005443443694379, + "grad_norm": 1.4888707399368286, + "learning_rate": 2.0147114841198144e-05, + "loss": 1.4681, + "step": 22354 + }, + { + "epoch": 0.8005801564990063, + "grad_norm": 2.1212549209594727, + "learning_rate": 2.014013327670611e-05, + "loss": 1.2677, + "step": 22355 + }, + { + "epoch": 0.8006159686285745, + "grad_norm": 2.3081042766571045, + "learning_rate": 2.0133152786625598e-05, + "loss": 1.5632, + "step": 22356 + }, + { + "epoch": 0.8006517807581428, + "grad_norm": 1.2556418180465698, + "learning_rate": 2.012617337105044e-05, + "loss": 1.5076, + "step": 22357 + }, + { + "epoch": 0.8006875928877111, + "grad_norm": 1.5202817916870117, + "learning_rate": 2.0119195030074645e-05, + "loss": 1.3706, + "step": 22358 + }, + { + "epoch": 0.8007234050172793, + "grad_norm": 1.4703209400177002, + "learning_rate": 2.011221776379204e-05, + "loss": 1.6011, + "step": 22359 + }, + { + "epoch": 0.8007592171468476, + "grad_norm": 2.0287888050079346, + "learning_rate": 2.0105241572296463e-05, + "loss": 1.3434, + "step": 22360 + }, + { + "epoch": 0.8007950292764159, + "grad_norm": 1.8298935890197754, + "learning_rate": 2.0098266455681812e-05, + "loss": 1.6273, + "step": 22361 + }, + { + "epoch": 0.8008308414059843, + "grad_norm": 1.9720675945281982, + "learning_rate": 2.009129241404192e-05, + "loss": 1.537, + "step": 22362 + }, + { + "epoch": 0.8008666535355525, + "grad_norm": 1.583101749420166, + "learning_rate": 2.0084319447470645e-05, + "loss": 1.4068, + "step": 22363 + }, + { + "epoch": 0.8009024656651208, + "grad_norm": 2.1829562187194824, + "learning_rate": 2.007734755606171e-05, + "loss": 1.4279, + "step": 22364 + }, + { + "epoch": 0.8009382777946891, + "grad_norm": 1.5320464372634888, + "learning_rate": 2.0070376739909024e-05, + "loss": 1.4431, + "step": 22365 + }, + { + "epoch": 0.8009740899242573, + "grad_norm": 1.512054443359375, + "learning_rate": 2.0063406999106293e-05, + "loss": 1.4185, + "step": 22366 + }, + { + "epoch": 0.8010099020538256, + "grad_norm": 1.5620276927947998, + "learning_rate": 2.005643833374733e-05, + "loss": 1.3393, + "step": 22367 + }, + { + "epoch": 0.8010457141833939, + "grad_norm": 1.6267145872116089, + "learning_rate": 2.0049470743925845e-05, + "loss": 1.2916, + "step": 22368 + }, + { + "epoch": 0.8010815263129623, + "grad_norm": 1.743434190750122, + "learning_rate": 2.0042504229735604e-05, + "loss": 1.4186, + "step": 22369 + }, + { + "epoch": 0.8011173384425305, + "grad_norm": 1.7056375741958618, + "learning_rate": 2.0035538791270358e-05, + "loss": 1.3937, + "step": 22370 + }, + { + "epoch": 0.8011531505720988, + "grad_norm": 1.7276453971862793, + "learning_rate": 2.002857442862377e-05, + "loss": 1.4292, + "step": 22371 + }, + { + "epoch": 0.8011889627016671, + "grad_norm": 1.376869797706604, + "learning_rate": 2.002161114188955e-05, + "loss": 1.5486, + "step": 22372 + }, + { + "epoch": 0.8012247748312353, + "grad_norm": 1.5992333889007568, + "learning_rate": 2.0014648931161386e-05, + "loss": 1.3535, + "step": 22373 + }, + { + "epoch": 0.8012605869608036, + "grad_norm": 1.8465687036514282, + "learning_rate": 2.000768779653298e-05, + "loss": 1.1694, + "step": 22374 + }, + { + "epoch": 0.8012963990903719, + "grad_norm": 1.4649271965026855, + "learning_rate": 2.000072773809789e-05, + "loss": 1.6326, + "step": 22375 + }, + { + "epoch": 0.8013322112199402, + "grad_norm": 1.352648377418518, + "learning_rate": 1.9993768755949882e-05, + "loss": 1.5192, + "step": 22376 + }, + { + "epoch": 0.8013680233495085, + "grad_norm": 1.7649281024932861, + "learning_rate": 1.99868108501825e-05, + "loss": 1.1998, + "step": 22377 + }, + { + "epoch": 0.8014038354790768, + "grad_norm": 1.4050350189208984, + "learning_rate": 1.9979854020889356e-05, + "loss": 1.4443, + "step": 22378 + }, + { + "epoch": 0.8014396476086451, + "grad_norm": 1.8340182304382324, + "learning_rate": 1.9972898268164052e-05, + "loss": 1.5995, + "step": 22379 + }, + { + "epoch": 0.8014754597382133, + "grad_norm": 1.7190074920654297, + "learning_rate": 1.9965943592100166e-05, + "loss": 1.402, + "step": 22380 + }, + { + "epoch": 0.8015112718677816, + "grad_norm": 2.5135326385498047, + "learning_rate": 1.995898999279131e-05, + "loss": 1.2719, + "step": 22381 + }, + { + "epoch": 0.8015470839973499, + "grad_norm": 2.2622857093811035, + "learning_rate": 1.9952037470330964e-05, + "loss": 1.7018, + "step": 22382 + }, + { + "epoch": 0.8015828961269181, + "grad_norm": 1.4016941785812378, + "learning_rate": 1.994508602481271e-05, + "loss": 1.4652, + "step": 22383 + }, + { + "epoch": 0.8016187082564865, + "grad_norm": 1.9714092016220093, + "learning_rate": 1.993813565633005e-05, + "loss": 1.7556, + "step": 22384 + }, + { + "epoch": 0.8016545203860548, + "grad_norm": 1.8598322868347168, + "learning_rate": 1.993118636497654e-05, + "loss": 1.633, + "step": 22385 + }, + { + "epoch": 0.801690332515623, + "grad_norm": 1.6280977725982666, + "learning_rate": 1.99242381508456e-05, + "loss": 1.2321, + "step": 22386 + }, + { + "epoch": 0.8017261446451913, + "grad_norm": 1.4744327068328857, + "learning_rate": 1.9917291014030747e-05, + "loss": 1.1498, + "step": 22387 + }, + { + "epoch": 0.8017619567747596, + "grad_norm": 1.4115815162658691, + "learning_rate": 1.991034495462547e-05, + "loss": 1.2791, + "step": 22388 + }, + { + "epoch": 0.8017977689043279, + "grad_norm": 1.562197208404541, + "learning_rate": 1.990339997272317e-05, + "loss": 1.2146, + "step": 22389 + }, + { + "epoch": 0.8018335810338961, + "grad_norm": 2.2312514781951904, + "learning_rate": 1.9896456068417302e-05, + "loss": 1.4709, + "step": 22390 + }, + { + "epoch": 0.8018693931634645, + "grad_norm": 1.673010230064392, + "learning_rate": 1.9889513241801295e-05, + "loss": 1.3413, + "step": 22391 + }, + { + "epoch": 0.8019052052930328, + "grad_norm": 1.657200813293457, + "learning_rate": 1.988257149296857e-05, + "loss": 1.7043, + "step": 22392 + }, + { + "epoch": 0.801941017422601, + "grad_norm": 1.4257102012634277, + "learning_rate": 1.987563082201249e-05, + "loss": 1.4941, + "step": 22393 + }, + { + "epoch": 0.8019768295521693, + "grad_norm": 1.373645544052124, + "learning_rate": 1.9868691229026437e-05, + "loss": 1.5311, + "step": 22394 + }, + { + "epoch": 0.8020126416817376, + "grad_norm": 1.3798229694366455, + "learning_rate": 1.986175271410381e-05, + "loss": 1.1315, + "step": 22395 + }, + { + "epoch": 0.8020484538113059, + "grad_norm": 1.657395839691162, + "learning_rate": 1.9854815277337902e-05, + "loss": 1.2784, + "step": 22396 + }, + { + "epoch": 0.8020842659408741, + "grad_norm": 2.0209593772888184, + "learning_rate": 1.9847878918822073e-05, + "loss": 1.2659, + "step": 22397 + }, + { + "epoch": 0.8021200780704425, + "grad_norm": 1.9298338890075684, + "learning_rate": 1.9840943638649635e-05, + "loss": 1.2645, + "step": 22398 + }, + { + "epoch": 0.8021558902000108, + "grad_norm": 1.6281800270080566, + "learning_rate": 1.9834009436913948e-05, + "loss": 1.4317, + "step": 22399 + }, + { + "epoch": 0.802191702329579, + "grad_norm": 1.7945187091827393, + "learning_rate": 1.9827076313708216e-05, + "loss": 1.671, + "step": 22400 + }, + { + "epoch": 0.8022275144591473, + "grad_norm": 1.9173634052276611, + "learning_rate": 1.9820144269125763e-05, + "loss": 1.6113, + "step": 22401 + }, + { + "epoch": 0.8022633265887156, + "grad_norm": 1.6076704263687134, + "learning_rate": 1.981321330325987e-05, + "loss": 1.5395, + "step": 22402 + }, + { + "epoch": 0.8022991387182838, + "grad_norm": 1.4223419427871704, + "learning_rate": 1.980628341620373e-05, + "loss": 1.3787, + "step": 22403 + }, + { + "epoch": 0.8023349508478521, + "grad_norm": 1.6627603769302368, + "learning_rate": 1.9799354608050614e-05, + "loss": 1.4402, + "step": 22404 + }, + { + "epoch": 0.8023707629774205, + "grad_norm": 1.7014139890670776, + "learning_rate": 1.979242687889372e-05, + "loss": 1.4458, + "step": 22405 + }, + { + "epoch": 0.8024065751069888, + "grad_norm": 1.6976075172424316, + "learning_rate": 1.9785500228826292e-05, + "loss": 1.6349, + "step": 22406 + }, + { + "epoch": 0.802442387236557, + "grad_norm": 1.68596351146698, + "learning_rate": 1.977857465794146e-05, + "loss": 1.7244, + "step": 22407 + }, + { + "epoch": 0.8024781993661253, + "grad_norm": 2.4121077060699463, + "learning_rate": 1.977165016633242e-05, + "loss": 1.5179, + "step": 22408 + }, + { + "epoch": 0.8025140114956936, + "grad_norm": 2.1352639198303223, + "learning_rate": 1.9764726754092354e-05, + "loss": 1.5501, + "step": 22409 + }, + { + "epoch": 0.8025498236252618, + "grad_norm": 1.6876717805862427, + "learning_rate": 1.975780442131442e-05, + "loss": 1.5704, + "step": 22410 + }, + { + "epoch": 0.8025856357548301, + "grad_norm": 1.326467752456665, + "learning_rate": 1.9750883168091684e-05, + "loss": 1.539, + "step": 22411 + }, + { + "epoch": 0.8026214478843985, + "grad_norm": 1.339929223060608, + "learning_rate": 1.9743962994517316e-05, + "loss": 1.4927, + "step": 22412 + }, + { + "epoch": 0.8026572600139668, + "grad_norm": 1.7197191715240479, + "learning_rate": 1.9737043900684416e-05, + "loss": 1.3768, + "step": 22413 + }, + { + "epoch": 0.802693072143535, + "grad_norm": 1.7365750074386597, + "learning_rate": 1.9730125886686033e-05, + "loss": 1.2812, + "step": 22414 + }, + { + "epoch": 0.8027288842731033, + "grad_norm": 1.6885199546813965, + "learning_rate": 1.972320895261528e-05, + "loss": 1.4585, + "step": 22415 + }, + { + "epoch": 0.8027646964026716, + "grad_norm": 1.6332989931106567, + "learning_rate": 1.9716293098565186e-05, + "loss": 1.4209, + "step": 22416 + }, + { + "epoch": 0.8028005085322398, + "grad_norm": 1.5932855606079102, + "learning_rate": 1.9709378324628848e-05, + "loss": 1.5682, + "step": 22417 + }, + { + "epoch": 0.8028363206618081, + "grad_norm": 1.284571886062622, + "learning_rate": 1.970246463089922e-05, + "loss": 1.3149, + "step": 22418 + }, + { + "epoch": 0.8028721327913765, + "grad_norm": 1.5631963014602661, + "learning_rate": 1.9695552017469364e-05, + "loss": 1.5476, + "step": 22419 + }, + { + "epoch": 0.8029079449209447, + "grad_norm": 1.6943448781967163, + "learning_rate": 1.9688640484432287e-05, + "loss": 1.3432, + "step": 22420 + }, + { + "epoch": 0.802943757050513, + "grad_norm": 1.4390662908554077, + "learning_rate": 1.968173003188094e-05, + "loss": 1.2311, + "step": 22421 + }, + { + "epoch": 0.8029795691800813, + "grad_norm": 1.7541425228118896, + "learning_rate": 1.96748206599083e-05, + "loss": 1.4092, + "step": 22422 + }, + { + "epoch": 0.8030153813096496, + "grad_norm": 1.399704098701477, + "learning_rate": 1.9667912368607344e-05, + "loss": 1.3498, + "step": 22423 + }, + { + "epoch": 0.8030511934392178, + "grad_norm": 2.5454680919647217, + "learning_rate": 1.9661005158071033e-05, + "loss": 1.2805, + "step": 22424 + }, + { + "epoch": 0.8030870055687861, + "grad_norm": 2.0977659225463867, + "learning_rate": 1.965409902839225e-05, + "loss": 1.7922, + "step": 22425 + }, + { + "epoch": 0.8031228176983545, + "grad_norm": 1.8387500047683716, + "learning_rate": 1.9647193979663915e-05, + "loss": 1.3484, + "step": 22426 + }, + { + "epoch": 0.8031586298279227, + "grad_norm": 1.7446051836013794, + "learning_rate": 1.9640290011978935e-05, + "loss": 1.3899, + "step": 22427 + }, + { + "epoch": 0.803194441957491, + "grad_norm": 2.0197689533233643, + "learning_rate": 1.9633387125430226e-05, + "loss": 1.6515, + "step": 22428 + }, + { + "epoch": 0.8032302540870593, + "grad_norm": 1.3845274448394775, + "learning_rate": 1.9626485320110632e-05, + "loss": 1.3537, + "step": 22429 + }, + { + "epoch": 0.8032660662166276, + "grad_norm": 1.8746153116226196, + "learning_rate": 1.961958459611295e-05, + "loss": 1.1188, + "step": 22430 + }, + { + "epoch": 0.8033018783461958, + "grad_norm": 1.311545491218567, + "learning_rate": 1.9612684953530124e-05, + "loss": 1.3475, + "step": 22431 + }, + { + "epoch": 0.8033376904757641, + "grad_norm": 2.6298389434814453, + "learning_rate": 1.9605786392454904e-05, + "loss": 1.6516, + "step": 22432 + }, + { + "epoch": 0.8033735026053325, + "grad_norm": 1.625884771347046, + "learning_rate": 1.9598888912980117e-05, + "loss": 1.1758, + "step": 22433 + }, + { + "epoch": 0.8034093147349007, + "grad_norm": 2.10892391204834, + "learning_rate": 1.9591992515198588e-05, + "loss": 1.4329, + "step": 22434 + }, + { + "epoch": 0.803445126864469, + "grad_norm": 1.5895978212356567, + "learning_rate": 1.9585097199203096e-05, + "loss": 1.466, + "step": 22435 + }, + { + "epoch": 0.8034809389940373, + "grad_norm": 1.5277785062789917, + "learning_rate": 1.957820296508637e-05, + "loss": 1.7112, + "step": 22436 + }, + { + "epoch": 0.8035167511236055, + "grad_norm": 2.111467123031616, + "learning_rate": 1.9571309812941184e-05, + "loss": 1.1178, + "step": 22437 + }, + { + "epoch": 0.8035525632531738, + "grad_norm": 1.8915823698043823, + "learning_rate": 1.9564417742860307e-05, + "loss": 1.5494, + "step": 22438 + }, + { + "epoch": 0.8035883753827421, + "grad_norm": 1.9859658479690552, + "learning_rate": 1.9557526754936405e-05, + "loss": 1.6199, + "step": 22439 + }, + { + "epoch": 0.8036241875123105, + "grad_norm": 1.3071140050888062, + "learning_rate": 1.955063684926225e-05, + "loss": 1.3199, + "step": 22440 + }, + { + "epoch": 0.8036599996418787, + "grad_norm": 1.539048194885254, + "learning_rate": 1.9543748025930452e-05, + "loss": 1.3173, + "step": 22441 + }, + { + "epoch": 0.803695811771447, + "grad_norm": 1.7450897693634033, + "learning_rate": 1.9536860285033797e-05, + "loss": 1.5985, + "step": 22442 + }, + { + "epoch": 0.8037316239010153, + "grad_norm": 1.7976315021514893, + "learning_rate": 1.9529973626664865e-05, + "loss": 1.4624, + "step": 22443 + }, + { + "epoch": 0.8037674360305835, + "grad_norm": 2.065857172012329, + "learning_rate": 1.952308805091636e-05, + "loss": 1.5356, + "step": 22444 + }, + { + "epoch": 0.8038032481601518, + "grad_norm": 2.313326597213745, + "learning_rate": 1.9516203557880852e-05, + "loss": 1.5283, + "step": 22445 + }, + { + "epoch": 0.8038390602897201, + "grad_norm": 2.275071859359741, + "learning_rate": 1.9509320147651068e-05, + "loss": 1.4142, + "step": 22446 + }, + { + "epoch": 0.8038748724192885, + "grad_norm": 2.0012121200561523, + "learning_rate": 1.950243782031954e-05, + "loss": 1.5083, + "step": 22447 + }, + { + "epoch": 0.8039106845488567, + "grad_norm": 1.4849156141281128, + "learning_rate": 1.9495556575978836e-05, + "loss": 1.3491, + "step": 22448 + }, + { + "epoch": 0.803946496678425, + "grad_norm": 1.4966169595718384, + "learning_rate": 1.948867641472163e-05, + "loss": 1.4678, + "step": 22449 + }, + { + "epoch": 0.8039823088079933, + "grad_norm": 1.564099669456482, + "learning_rate": 1.9481797336640396e-05, + "loss": 1.1639, + "step": 22450 + }, + { + "epoch": 0.8040181209375615, + "grad_norm": 1.6799238920211792, + "learning_rate": 1.9474919341827746e-05, + "loss": 1.2108, + "step": 22451 + }, + { + "epoch": 0.8040539330671298, + "grad_norm": 1.8090828657150269, + "learning_rate": 1.946804243037613e-05, + "loss": 1.3893, + "step": 22452 + }, + { + "epoch": 0.8040897451966981, + "grad_norm": 1.8733340501785278, + "learning_rate": 1.9461166602378176e-05, + "loss": 1.6914, + "step": 22453 + }, + { + "epoch": 0.8041255573262664, + "grad_norm": 1.3128604888916016, + "learning_rate": 1.9454291857926323e-05, + "loss": 1.5375, + "step": 22454 + }, + { + "epoch": 0.8041613694558347, + "grad_norm": 1.3559684753417969, + "learning_rate": 1.94474181971131e-05, + "loss": 1.5298, + "step": 22455 + }, + { + "epoch": 0.804197181585403, + "grad_norm": 1.790544033050537, + "learning_rate": 1.9440545620030924e-05, + "loss": 1.469, + "step": 22456 + }, + { + "epoch": 0.8042329937149713, + "grad_norm": 1.5944311618804932, + "learning_rate": 1.9433674126772306e-05, + "loss": 1.4043, + "step": 22457 + }, + { + "epoch": 0.8042688058445395, + "grad_norm": 1.7077056169509888, + "learning_rate": 1.9426803717429696e-05, + "loss": 1.4089, + "step": 22458 + }, + { + "epoch": 0.8043046179741078, + "grad_norm": 1.622973918914795, + "learning_rate": 1.941993439209546e-05, + "loss": 1.3107, + "step": 22459 + }, + { + "epoch": 0.8043404301036761, + "grad_norm": 1.6063894033432007, + "learning_rate": 1.9413066150862113e-05, + "loss": 1.2024, + "step": 22460 + }, + { + "epoch": 0.8043762422332444, + "grad_norm": 1.5905756950378418, + "learning_rate": 1.9406198993822e-05, + "loss": 1.5908, + "step": 22461 + }, + { + "epoch": 0.8044120543628127, + "grad_norm": 1.9238442182540894, + "learning_rate": 1.9399332921067537e-05, + "loss": 2.0081, + "step": 22462 + }, + { + "epoch": 0.804447866492381, + "grad_norm": 1.5469660758972168, + "learning_rate": 1.939246793269103e-05, + "loss": 1.6734, + "step": 22463 + }, + { + "epoch": 0.8044836786219492, + "grad_norm": 2.7361080646514893, + "learning_rate": 1.938560402878494e-05, + "loss": 1.7897, + "step": 22464 + }, + { + "epoch": 0.8045194907515175, + "grad_norm": 1.9339004755020142, + "learning_rate": 1.9378741209441565e-05, + "loss": 1.3695, + "step": 22465 + }, + { + "epoch": 0.8045553028810858, + "grad_norm": 1.4602885246276855, + "learning_rate": 1.9371879474753208e-05, + "loss": 1.5427, + "step": 22466 + }, + { + "epoch": 0.8045911150106541, + "grad_norm": 1.5360411405563354, + "learning_rate": 1.9365018824812208e-05, + "loss": 1.4808, + "step": 22467 + }, + { + "epoch": 0.8046269271402224, + "grad_norm": 1.5212199687957764, + "learning_rate": 1.9358159259710874e-05, + "loss": 1.1769, + "step": 22468 + }, + { + "epoch": 0.8046627392697907, + "grad_norm": 2.0291521549224854, + "learning_rate": 1.9351300779541503e-05, + "loss": 1.5694, + "step": 22469 + }, + { + "epoch": 0.804698551399359, + "grad_norm": 1.705421805381775, + "learning_rate": 1.9344443384396337e-05, + "loss": 1.6615, + "step": 22470 + }, + { + "epoch": 0.8047343635289272, + "grad_norm": 2.076650381088257, + "learning_rate": 1.9337587074367637e-05, + "loss": 1.5295, + "step": 22471 + }, + { + "epoch": 0.8047701756584955, + "grad_norm": 1.3150924444198608, + "learning_rate": 1.9330731849547655e-05, + "loss": 1.019, + "step": 22472 + }, + { + "epoch": 0.8048059877880638, + "grad_norm": 1.6172802448272705, + "learning_rate": 1.9323877710028658e-05, + "loss": 1.4884, + "step": 22473 + }, + { + "epoch": 0.804841799917632, + "grad_norm": 1.3734067678451538, + "learning_rate": 1.9317024655902782e-05, + "loss": 1.3739, + "step": 22474 + }, + { + "epoch": 0.8048776120472004, + "grad_norm": 1.7881556749343872, + "learning_rate": 1.9310172687262273e-05, + "loss": 1.3257, + "step": 22475 + }, + { + "epoch": 0.8049134241767687, + "grad_norm": 1.5242328643798828, + "learning_rate": 1.9303321804199338e-05, + "loss": 1.2506, + "step": 22476 + }, + { + "epoch": 0.804949236306337, + "grad_norm": 1.535390019416809, + "learning_rate": 1.9296472006806087e-05, + "loss": 0.9729, + "step": 22477 + }, + { + "epoch": 0.8049850484359052, + "grad_norm": 1.8442243337631226, + "learning_rate": 1.9289623295174697e-05, + "loss": 1.6024, + "step": 22478 + }, + { + "epoch": 0.8050208605654735, + "grad_norm": 1.8470580577850342, + "learning_rate": 1.9282775669397324e-05, + "loss": 1.6681, + "step": 22479 + }, + { + "epoch": 0.8050566726950418, + "grad_norm": 1.2782084941864014, + "learning_rate": 1.9275929129566116e-05, + "loss": 1.4994, + "step": 22480 + }, + { + "epoch": 0.80509248482461, + "grad_norm": 1.826900839805603, + "learning_rate": 1.9269083675773126e-05, + "loss": 1.5691, + "step": 22481 + }, + { + "epoch": 0.8051282969541784, + "grad_norm": 1.380555510520935, + "learning_rate": 1.9262239308110474e-05, + "loss": 1.4017, + "step": 22482 + }, + { + "epoch": 0.8051641090837467, + "grad_norm": 1.2143405675888062, + "learning_rate": 1.925539602667028e-05, + "loss": 1.4549, + "step": 22483 + }, + { + "epoch": 0.805199921213315, + "grad_norm": 1.643473505973816, + "learning_rate": 1.924855383154456e-05, + "loss": 1.7101, + "step": 22484 + }, + { + "epoch": 0.8052357333428832, + "grad_norm": 1.6362136602401733, + "learning_rate": 1.924171272282538e-05, + "loss": 1.5971, + "step": 22485 + }, + { + "epoch": 0.8052715454724515, + "grad_norm": 1.5110559463500977, + "learning_rate": 1.9234872700604777e-05, + "loss": 1.3248, + "step": 22486 + }, + { + "epoch": 0.8053073576020198, + "grad_norm": 1.5972952842712402, + "learning_rate": 1.9228033764974818e-05, + "loss": 1.3044, + "step": 22487 + }, + { + "epoch": 0.805343169731588, + "grad_norm": 1.8326233625411987, + "learning_rate": 1.9221195916027445e-05, + "loss": 1.611, + "step": 22488 + }, + { + "epoch": 0.8053789818611564, + "grad_norm": 2.9135138988494873, + "learning_rate": 1.921435915385469e-05, + "loss": 1.9834, + "step": 22489 + }, + { + "epoch": 0.8054147939907247, + "grad_norm": 1.453930139541626, + "learning_rate": 1.9207523478548518e-05, + "loss": 1.3931, + "step": 22490 + }, + { + "epoch": 0.805450606120293, + "grad_norm": 1.5433127880096436, + "learning_rate": 1.9200688890200936e-05, + "loss": 1.0553, + "step": 22491 + }, + { + "epoch": 0.8054864182498612, + "grad_norm": 1.3644789457321167, + "learning_rate": 1.9193855388903824e-05, + "loss": 1.187, + "step": 22492 + }, + { + "epoch": 0.8055222303794295, + "grad_norm": 1.4411420822143555, + "learning_rate": 1.918702297474917e-05, + "loss": 1.4759, + "step": 22493 + }, + { + "epoch": 0.8055580425089978, + "grad_norm": 1.6154589653015137, + "learning_rate": 1.9180191647828906e-05, + "loss": 1.3965, + "step": 22494 + }, + { + "epoch": 0.805593854638566, + "grad_norm": 1.5406982898712158, + "learning_rate": 1.917336140823488e-05, + "loss": 1.3631, + "step": 22495 + }, + { + "epoch": 0.8056296667681344, + "grad_norm": 1.679787278175354, + "learning_rate": 1.916653225605901e-05, + "loss": 1.577, + "step": 22496 + }, + { + "epoch": 0.8056654788977027, + "grad_norm": 1.5739904642105103, + "learning_rate": 1.915970419139319e-05, + "loss": 1.641, + "step": 22497 + }, + { + "epoch": 0.805701291027271, + "grad_norm": 1.5268478393554688, + "learning_rate": 1.91528772143293e-05, + "loss": 1.0907, + "step": 22498 + }, + { + "epoch": 0.8057371031568392, + "grad_norm": 1.8088730573654175, + "learning_rate": 1.9146051324959134e-05, + "loss": 1.4507, + "step": 22499 + }, + { + "epoch": 0.8057729152864075, + "grad_norm": 1.8089536428451538, + "learning_rate": 1.9139226523374566e-05, + "loss": 1.4117, + "step": 22500 + }, + { + "epoch": 0.8058087274159758, + "grad_norm": 1.642994999885559, + "learning_rate": 1.9132402809667416e-05, + "loss": 1.4066, + "step": 22501 + }, + { + "epoch": 0.805844539545544, + "grad_norm": 2.0810434818267822, + "learning_rate": 1.9125580183929448e-05, + "loss": 1.5808, + "step": 22502 + }, + { + "epoch": 0.8058803516751124, + "grad_norm": 1.3621398210525513, + "learning_rate": 1.9118758646252477e-05, + "loss": 1.1892, + "step": 22503 + }, + { + "epoch": 0.8059161638046807, + "grad_norm": 1.652403473854065, + "learning_rate": 1.9111938196728284e-05, + "loss": 1.3299, + "step": 22504 + }, + { + "epoch": 0.8059519759342489, + "grad_norm": 1.8718891143798828, + "learning_rate": 1.9105118835448644e-05, + "loss": 1.6525, + "step": 22505 + }, + { + "epoch": 0.8059877880638172, + "grad_norm": 1.4959759712219238, + "learning_rate": 1.9098300562505266e-05, + "loss": 1.2341, + "step": 22506 + }, + { + "epoch": 0.8060236001933855, + "grad_norm": 1.535392165184021, + "learning_rate": 1.9091483377989895e-05, + "loss": 1.4926, + "step": 22507 + }, + { + "epoch": 0.8060594123229537, + "grad_norm": 2.2390332221984863, + "learning_rate": 1.9084667281994273e-05, + "loss": 1.5246, + "step": 22508 + }, + { + "epoch": 0.806095224452522, + "grad_norm": 1.3575999736785889, + "learning_rate": 1.9077852274610055e-05, + "loss": 1.6471, + "step": 22509 + }, + { + "epoch": 0.8061310365820904, + "grad_norm": 2.580101251602173, + "learning_rate": 1.9071038355928948e-05, + "loss": 1.5999, + "step": 22510 + }, + { + "epoch": 0.8061668487116587, + "grad_norm": 1.4733290672302246, + "learning_rate": 1.9064225526042644e-05, + "loss": 1.2515, + "step": 22511 + }, + { + "epoch": 0.8062026608412269, + "grad_norm": 1.3597815036773682, + "learning_rate": 1.90574137850428e-05, + "loss": 1.4526, + "step": 22512 + }, + { + "epoch": 0.8062384729707952, + "grad_norm": 1.9659523963928223, + "learning_rate": 1.9050603133021017e-05, + "loss": 1.6421, + "step": 22513 + }, + { + "epoch": 0.8062742851003635, + "grad_norm": 1.9482402801513672, + "learning_rate": 1.904379357006896e-05, + "loss": 1.3654, + "step": 22514 + }, + { + "epoch": 0.8063100972299317, + "grad_norm": 1.8046506643295288, + "learning_rate": 1.9036985096278227e-05, + "loss": 1.6794, + "step": 22515 + }, + { + "epoch": 0.8063459093595, + "grad_norm": 2.277416944503784, + "learning_rate": 1.903017771174046e-05, + "loss": 1.6218, + "step": 22516 + }, + { + "epoch": 0.8063817214890684, + "grad_norm": 1.3956186771392822, + "learning_rate": 1.9023371416547177e-05, + "loss": 1.5363, + "step": 22517 + }, + { + "epoch": 0.8064175336186367, + "grad_norm": 1.5622445344924927, + "learning_rate": 1.9016566210789977e-05, + "loss": 1.4348, + "step": 22518 + }, + { + "epoch": 0.8064533457482049, + "grad_norm": 1.2763302326202393, + "learning_rate": 1.9009762094560446e-05, + "loss": 1.5905, + "step": 22519 + }, + { + "epoch": 0.8064891578777732, + "grad_norm": 1.6055020093917847, + "learning_rate": 1.9002959067950066e-05, + "loss": 1.5085, + "step": 22520 + }, + { + "epoch": 0.8065249700073415, + "grad_norm": 1.4015109539031982, + "learning_rate": 1.8996157131050395e-05, + "loss": 1.4095, + "step": 22521 + }, + { + "epoch": 0.8065607821369097, + "grad_norm": 2.2692770957946777, + "learning_rate": 1.8989356283952943e-05, + "loss": 1.556, + "step": 22522 + }, + { + "epoch": 0.806596594266478, + "grad_norm": 1.5125993490219116, + "learning_rate": 1.898255652674924e-05, + "loss": 1.4581, + "step": 22523 + }, + { + "epoch": 0.8066324063960464, + "grad_norm": 1.379358172416687, + "learning_rate": 1.8975757859530696e-05, + "loss": 1.5579, + "step": 22524 + }, + { + "epoch": 0.8066682185256147, + "grad_norm": 1.793256163597107, + "learning_rate": 1.8968960282388826e-05, + "loss": 1.6786, + "step": 22525 + }, + { + "epoch": 0.8067040306551829, + "grad_norm": 2.028078317642212, + "learning_rate": 1.896216379541509e-05, + "loss": 1.3897, + "step": 22526 + }, + { + "epoch": 0.8067398427847512, + "grad_norm": 1.6242742538452148, + "learning_rate": 1.895536839870089e-05, + "loss": 1.2961, + "step": 22527 + }, + { + "epoch": 0.8067756549143195, + "grad_norm": 1.5608689785003662, + "learning_rate": 1.894857409233769e-05, + "loss": 1.0402, + "step": 22528 + }, + { + "epoch": 0.8068114670438877, + "grad_norm": 1.6409008502960205, + "learning_rate": 1.8941780876416826e-05, + "loss": 1.4753, + "step": 22529 + }, + { + "epoch": 0.806847279173456, + "grad_norm": 1.9912757873535156, + "learning_rate": 1.893498875102979e-05, + "loss": 1.7401, + "step": 22530 + }, + { + "epoch": 0.8068830913030244, + "grad_norm": 1.8302267789840698, + "learning_rate": 1.8928197716267894e-05, + "loss": 1.5594, + "step": 22531 + }, + { + "epoch": 0.8069189034325926, + "grad_norm": 1.399596929550171, + "learning_rate": 1.892140777222252e-05, + "loss": 1.4571, + "step": 22532 + }, + { + "epoch": 0.8069547155621609, + "grad_norm": 1.794324278831482, + "learning_rate": 1.8914618918985028e-05, + "loss": 1.2283, + "step": 22533 + }, + { + "epoch": 0.8069905276917292, + "grad_norm": 1.7145918607711792, + "learning_rate": 1.890783115664676e-05, + "loss": 1.2539, + "step": 22534 + }, + { + "epoch": 0.8070263398212975, + "grad_norm": 1.4848459959030151, + "learning_rate": 1.8901044485299034e-05, + "loss": 1.41, + "step": 22535 + }, + { + "epoch": 0.8070621519508657, + "grad_norm": 1.7278633117675781, + "learning_rate": 1.889425890503308e-05, + "loss": 1.5744, + "step": 22536 + }, + { + "epoch": 0.807097964080434, + "grad_norm": 1.4958399534225464, + "learning_rate": 1.888747441594031e-05, + "loss": 1.3393, + "step": 22537 + }, + { + "epoch": 0.8071337762100024, + "grad_norm": 1.7114999294281006, + "learning_rate": 1.888069101811193e-05, + "loss": 1.1309, + "step": 22538 + }, + { + "epoch": 0.8071695883395706, + "grad_norm": 1.3507667779922485, + "learning_rate": 1.887390871163922e-05, + "loss": 1.3367, + "step": 22539 + }, + { + "epoch": 0.8072054004691389, + "grad_norm": 2.3738009929656982, + "learning_rate": 1.886712749661339e-05, + "loss": 1.5551, + "step": 22540 + }, + { + "epoch": 0.8072412125987072, + "grad_norm": 1.762641191482544, + "learning_rate": 1.8860347373125753e-05, + "loss": 1.4421, + "step": 22541 + }, + { + "epoch": 0.8072770247282754, + "grad_norm": 1.4975340366363525, + "learning_rate": 1.8853568341267448e-05, + "loss": 1.3572, + "step": 22542 + }, + { + "epoch": 0.8073128368578437, + "grad_norm": 1.7372536659240723, + "learning_rate": 1.884679040112971e-05, + "loss": 1.1709, + "step": 22543 + }, + { + "epoch": 0.807348648987412, + "grad_norm": 2.144090175628662, + "learning_rate": 1.884001355280376e-05, + "loss": 1.6421, + "step": 22544 + }, + { + "epoch": 0.8073844611169804, + "grad_norm": 1.7356644868850708, + "learning_rate": 1.8833237796380708e-05, + "loss": 1.3723, + "step": 22545 + }, + { + "epoch": 0.8074202732465486, + "grad_norm": 1.4547863006591797, + "learning_rate": 1.8826463131951767e-05, + "loss": 1.3705, + "step": 22546 + }, + { + "epoch": 0.8074560853761169, + "grad_norm": 1.4055238962173462, + "learning_rate": 1.8819689559608012e-05, + "loss": 1.2637, + "step": 22547 + }, + { + "epoch": 0.8074918975056852, + "grad_norm": 1.7052263021469116, + "learning_rate": 1.8812917079440673e-05, + "loss": 1.4931, + "step": 22548 + }, + { + "epoch": 0.8075277096352534, + "grad_norm": 1.6998118162155151, + "learning_rate": 1.8806145691540777e-05, + "loss": 1.7741, + "step": 22549 + }, + { + "epoch": 0.8075635217648217, + "grad_norm": 1.4957939386367798, + "learning_rate": 1.8799375395999487e-05, + "loss": 1.4767, + "step": 22550 + }, + { + "epoch": 0.80759933389439, + "grad_norm": 1.5346788167953491, + "learning_rate": 1.8792606192907813e-05, + "loss": 1.3791, + "step": 22551 + }, + { + "epoch": 0.8076351460239584, + "grad_norm": 1.6711095571517944, + "learning_rate": 1.878583808235692e-05, + "loss": 1.3828, + "step": 22552 + }, + { + "epoch": 0.8076709581535266, + "grad_norm": 2.229304313659668, + "learning_rate": 1.8779071064437813e-05, + "loss": 1.8454, + "step": 22553 + }, + { + "epoch": 0.8077067702830949, + "grad_norm": 1.6836011409759521, + "learning_rate": 1.877230513924152e-05, + "loss": 1.14, + "step": 22554 + }, + { + "epoch": 0.8077425824126632, + "grad_norm": 1.786728858947754, + "learning_rate": 1.8765540306859076e-05, + "loss": 1.4788, + "step": 22555 + }, + { + "epoch": 0.8077783945422314, + "grad_norm": 3.058734178543091, + "learning_rate": 1.8758776567381508e-05, + "loss": 1.3963, + "step": 22556 + }, + { + "epoch": 0.8078142066717997, + "grad_norm": 1.7298702001571655, + "learning_rate": 1.8752013920899836e-05, + "loss": 1.3446, + "step": 22557 + }, + { + "epoch": 0.807850018801368, + "grad_norm": 1.8953697681427002, + "learning_rate": 1.874525236750495e-05, + "loss": 1.6026, + "step": 22558 + }, + { + "epoch": 0.8078858309309364, + "grad_norm": 1.685193657875061, + "learning_rate": 1.8738491907287946e-05, + "loss": 1.5745, + "step": 22559 + }, + { + "epoch": 0.8079216430605046, + "grad_norm": 1.7449640035629272, + "learning_rate": 1.8731732540339684e-05, + "loss": 1.4257, + "step": 22560 + }, + { + "epoch": 0.8079574551900729, + "grad_norm": 1.563020944595337, + "learning_rate": 1.872497426675116e-05, + "loss": 1.2951, + "step": 22561 + }, + { + "epoch": 0.8079932673196412, + "grad_norm": 1.595478892326355, + "learning_rate": 1.8718217086613242e-05, + "loss": 1.1794, + "step": 22562 + }, + { + "epoch": 0.8080290794492094, + "grad_norm": 1.6257182359695435, + "learning_rate": 1.871146100001687e-05, + "loss": 1.3228, + "step": 22563 + }, + { + "epoch": 0.8080648915787777, + "grad_norm": 1.8950823545455933, + "learning_rate": 1.8704706007052963e-05, + "loss": 1.4481, + "step": 22564 + }, + { + "epoch": 0.808100703708346, + "grad_norm": 1.5394660234451294, + "learning_rate": 1.8697952107812344e-05, + "loss": 1.249, + "step": 22565 + }, + { + "epoch": 0.8081365158379143, + "grad_norm": 1.2871990203857422, + "learning_rate": 1.86911993023859e-05, + "loss": 1.3532, + "step": 22566 + }, + { + "epoch": 0.8081723279674826, + "grad_norm": 1.3102869987487793, + "learning_rate": 1.8684447590864494e-05, + "loss": 1.4537, + "step": 22567 + }, + { + "epoch": 0.8082081400970509, + "grad_norm": 1.5285694599151611, + "learning_rate": 1.867769697333899e-05, + "loss": 1.2677, + "step": 22568 + }, + { + "epoch": 0.8082439522266192, + "grad_norm": 1.455868124961853, + "learning_rate": 1.8670947449900113e-05, + "loss": 1.2461, + "step": 22569 + }, + { + "epoch": 0.8082797643561874, + "grad_norm": 1.5081433057785034, + "learning_rate": 1.8664199020638785e-05, + "loss": 1.6674, + "step": 22570 + }, + { + "epoch": 0.8083155764857557, + "grad_norm": 1.4699350595474243, + "learning_rate": 1.8657451685645756e-05, + "loss": 1.5963, + "step": 22571 + }, + { + "epoch": 0.808351388615324, + "grad_norm": 1.518787145614624, + "learning_rate": 1.8650705445011752e-05, + "loss": 1.1539, + "step": 22572 + }, + { + "epoch": 0.8083872007448923, + "grad_norm": 1.761563777923584, + "learning_rate": 1.8643960298827566e-05, + "loss": 1.2219, + "step": 22573 + }, + { + "epoch": 0.8084230128744606, + "grad_norm": 1.9616132974624634, + "learning_rate": 1.8637216247183966e-05, + "loss": 1.5145, + "step": 22574 + }, + { + "epoch": 0.8084588250040289, + "grad_norm": 1.8252731561660767, + "learning_rate": 1.8630473290171692e-05, + "loss": 1.6176, + "step": 22575 + }, + { + "epoch": 0.8084946371335971, + "grad_norm": 1.8796595335006714, + "learning_rate": 1.8623731427881418e-05, + "loss": 1.5545, + "step": 22576 + }, + { + "epoch": 0.8085304492631654, + "grad_norm": 1.679461121559143, + "learning_rate": 1.8616990660403865e-05, + "loss": 1.4918, + "step": 22577 + }, + { + "epoch": 0.8085662613927337, + "grad_norm": 1.4722779989242554, + "learning_rate": 1.8610250987829725e-05, + "loss": 1.3942, + "step": 22578 + }, + { + "epoch": 0.808602073522302, + "grad_norm": 1.8476933240890503, + "learning_rate": 1.86035124102497e-05, + "loss": 1.3052, + "step": 22579 + }, + { + "epoch": 0.8086378856518703, + "grad_norm": 1.712782621383667, + "learning_rate": 1.85967749277544e-05, + "loss": 1.0278, + "step": 22580 + }, + { + "epoch": 0.8086736977814386, + "grad_norm": 1.7284092903137207, + "learning_rate": 1.8590038540434485e-05, + "loss": 1.5429, + "step": 22581 + }, + { + "epoch": 0.8087095099110069, + "grad_norm": 1.3968522548675537, + "learning_rate": 1.8583303248380625e-05, + "loss": 1.2947, + "step": 22582 + }, + { + "epoch": 0.8087453220405751, + "grad_norm": 1.7425910234451294, + "learning_rate": 1.8576569051683368e-05, + "loss": 1.552, + "step": 22583 + }, + { + "epoch": 0.8087811341701434, + "grad_norm": 1.8411518335342407, + "learning_rate": 1.8569835950433344e-05, + "loss": 1.7172, + "step": 22584 + }, + { + "epoch": 0.8088169462997117, + "grad_norm": 1.879120945930481, + "learning_rate": 1.856310394472114e-05, + "loss": 1.374, + "step": 22585 + }, + { + "epoch": 0.80885275842928, + "grad_norm": 1.4328593015670776, + "learning_rate": 1.8556373034637353e-05, + "loss": 1.5658, + "step": 22586 + }, + { + "epoch": 0.8088885705588483, + "grad_norm": 1.5288828611373901, + "learning_rate": 1.8549643220272494e-05, + "loss": 1.4516, + "step": 22587 + }, + { + "epoch": 0.8089243826884166, + "grad_norm": 2.343038558959961, + "learning_rate": 1.8542914501717113e-05, + "loss": 1.4335, + "step": 22588 + }, + { + "epoch": 0.8089601948179849, + "grad_norm": 1.6488909721374512, + "learning_rate": 1.853618687906177e-05, + "loss": 0.9991, + "step": 22589 + }, + { + "epoch": 0.8089960069475531, + "grad_norm": 1.3302148580551147, + "learning_rate": 1.852946035239693e-05, + "loss": 1.339, + "step": 22590 + }, + { + "epoch": 0.8090318190771214, + "grad_norm": 1.6579506397247314, + "learning_rate": 1.8522734921813113e-05, + "loss": 1.4514, + "step": 22591 + }, + { + "epoch": 0.8090676312066897, + "grad_norm": 1.6817675828933716, + "learning_rate": 1.85160105874008e-05, + "loss": 1.1681, + "step": 22592 + }, + { + "epoch": 0.8091034433362579, + "grad_norm": 1.8572685718536377, + "learning_rate": 1.8509287349250482e-05, + "loss": 1.456, + "step": 22593 + }, + { + "epoch": 0.8091392554658263, + "grad_norm": 1.4762178659439087, + "learning_rate": 1.850256520745256e-05, + "loss": 1.4556, + "step": 22594 + }, + { + "epoch": 0.8091750675953946, + "grad_norm": 1.8500964641571045, + "learning_rate": 1.84958441620975e-05, + "loss": 1.5579, + "step": 22595 + }, + { + "epoch": 0.8092108797249629, + "grad_norm": 1.5180333852767944, + "learning_rate": 1.8489124213275745e-05, + "loss": 1.34, + "step": 22596 + }, + { + "epoch": 0.8092466918545311, + "grad_norm": 2.089555501937866, + "learning_rate": 1.8482405361077658e-05, + "loss": 1.6457, + "step": 22597 + }, + { + "epoch": 0.8092825039840994, + "grad_norm": 1.4597141742706299, + "learning_rate": 1.847568760559366e-05, + "loss": 1.3754, + "step": 22598 + }, + { + "epoch": 0.8093183161136677, + "grad_norm": 1.6133396625518799, + "learning_rate": 1.8468970946914134e-05, + "loss": 1.1957, + "step": 22599 + }, + { + "epoch": 0.8093541282432359, + "grad_norm": 1.5091086626052856, + "learning_rate": 1.8462255385129447e-05, + "loss": 1.3334, + "step": 22600 + }, + { + "epoch": 0.8093899403728043, + "grad_norm": 1.8339647054672241, + "learning_rate": 1.8455540920329916e-05, + "loss": 1.7524, + "step": 22601 + }, + { + "epoch": 0.8094257525023726, + "grad_norm": 2.3774209022521973, + "learning_rate": 1.8448827552605907e-05, + "loss": 1.4291, + "step": 22602 + }, + { + "epoch": 0.8094615646319409, + "grad_norm": 1.3259594440460205, + "learning_rate": 1.8442115282047723e-05, + "loss": 1.5422, + "step": 22603 + }, + { + "epoch": 0.8094973767615091, + "grad_norm": 1.5684672594070435, + "learning_rate": 1.8435404108745702e-05, + "loss": 1.4201, + "step": 22604 + }, + { + "epoch": 0.8095331888910774, + "grad_norm": 1.6213066577911377, + "learning_rate": 1.8428694032790074e-05, + "loss": 1.1581, + "step": 22605 + }, + { + "epoch": 0.8095690010206457, + "grad_norm": 1.6540571451187134, + "learning_rate": 1.8421985054271163e-05, + "loss": 1.319, + "step": 22606 + }, + { + "epoch": 0.8096048131502139, + "grad_norm": 2.1115972995758057, + "learning_rate": 1.8415277173279234e-05, + "loss": 1.3976, + "step": 22607 + }, + { + "epoch": 0.8096406252797823, + "grad_norm": 1.4200903177261353, + "learning_rate": 1.840857038990449e-05, + "loss": 1.2876, + "step": 22608 + }, + { + "epoch": 0.8096764374093506, + "grad_norm": 2.16260027885437, + "learning_rate": 1.840186470423718e-05, + "loss": 1.3341, + "step": 22609 + }, + { + "epoch": 0.8097122495389188, + "grad_norm": 1.4035898447036743, + "learning_rate": 1.8395160116367528e-05, + "loss": 1.276, + "step": 22610 + }, + { + "epoch": 0.8097480616684871, + "grad_norm": 2.398355484008789, + "learning_rate": 1.8388456626385765e-05, + "loss": 1.5374, + "step": 22611 + }, + { + "epoch": 0.8097838737980554, + "grad_norm": 1.7936995029449463, + "learning_rate": 1.838175423438202e-05, + "loss": 1.4219, + "step": 22612 + }, + { + "epoch": 0.8098196859276237, + "grad_norm": 1.4915958642959595, + "learning_rate": 1.837505294044649e-05, + "loss": 1.4166, + "step": 22613 + }, + { + "epoch": 0.8098554980571919, + "grad_norm": 1.7961300611495972, + "learning_rate": 1.836835274466936e-05, + "loss": 1.4323, + "step": 22614 + }, + { + "epoch": 0.8098913101867603, + "grad_norm": 2.53918719291687, + "learning_rate": 1.836165364714072e-05, + "loss": 1.3317, + "step": 22615 + }, + { + "epoch": 0.8099271223163286, + "grad_norm": 1.416369080543518, + "learning_rate": 1.835495564795072e-05, + "loss": 1.3952, + "step": 22616 + }, + { + "epoch": 0.8099629344458968, + "grad_norm": 2.098452091217041, + "learning_rate": 1.8348258747189484e-05, + "loss": 1.4207, + "step": 22617 + }, + { + "epoch": 0.8099987465754651, + "grad_norm": 1.722528100013733, + "learning_rate": 1.8341562944947134e-05, + "loss": 1.4759, + "step": 22618 + }, + { + "epoch": 0.8100345587050334, + "grad_norm": 1.612025499343872, + "learning_rate": 1.8334868241313685e-05, + "loss": 1.5993, + "step": 22619 + }, + { + "epoch": 0.8100703708346016, + "grad_norm": 1.5949660539627075, + "learning_rate": 1.832817463637925e-05, + "loss": 1.4045, + "step": 22620 + }, + { + "epoch": 0.8101061829641699, + "grad_norm": 1.33736252784729, + "learning_rate": 1.832148213023387e-05, + "loss": 1.2006, + "step": 22621 + }, + { + "epoch": 0.8101419950937383, + "grad_norm": 2.1266708374023438, + "learning_rate": 1.8314790722967624e-05, + "loss": 1.522, + "step": 22622 + }, + { + "epoch": 0.8101778072233066, + "grad_norm": 1.595590353012085, + "learning_rate": 1.8308100414670504e-05, + "loss": 1.4679, + "step": 22623 + }, + { + "epoch": 0.8102136193528748, + "grad_norm": 1.3091840744018555, + "learning_rate": 1.830141120543246e-05, + "loss": 1.2643, + "step": 22624 + }, + { + "epoch": 0.8102494314824431, + "grad_norm": 1.8592644929885864, + "learning_rate": 1.829472309534359e-05, + "loss": 1.4972, + "step": 22625 + }, + { + "epoch": 0.8102852436120114, + "grad_norm": 1.5312573909759521, + "learning_rate": 1.828803608449382e-05, + "loss": 1.4416, + "step": 22626 + }, + { + "epoch": 0.8103210557415796, + "grad_norm": 1.3308011293411255, + "learning_rate": 1.828135017297311e-05, + "loss": 1.3562, + "step": 22627 + }, + { + "epoch": 0.8103568678711479, + "grad_norm": 1.6629422903060913, + "learning_rate": 1.8274665360871425e-05, + "loss": 1.2463, + "step": 22628 + }, + { + "epoch": 0.8103926800007163, + "grad_norm": 2.118683099746704, + "learning_rate": 1.8267981648278733e-05, + "loss": 1.5011, + "step": 22629 + }, + { + "epoch": 0.8104284921302846, + "grad_norm": 1.60123872756958, + "learning_rate": 1.8261299035284883e-05, + "loss": 1.4359, + "step": 22630 + }, + { + "epoch": 0.8104643042598528, + "grad_norm": 1.698397159576416, + "learning_rate": 1.825461752197983e-05, + "loss": 1.4867, + "step": 22631 + }, + { + "epoch": 0.8105001163894211, + "grad_norm": 1.6148022413253784, + "learning_rate": 1.8247937108453482e-05, + "loss": 1.4307, + "step": 22632 + }, + { + "epoch": 0.8105359285189894, + "grad_norm": 1.858364224433899, + "learning_rate": 1.8241257794795653e-05, + "loss": 1.3802, + "step": 22633 + }, + { + "epoch": 0.8105717406485576, + "grad_norm": 1.6542712450027466, + "learning_rate": 1.8234579581096266e-05, + "loss": 1.419, + "step": 22634 + }, + { + "epoch": 0.8106075527781259, + "grad_norm": 1.4909992218017578, + "learning_rate": 1.82279024674451e-05, + "loss": 1.3149, + "step": 22635 + }, + { + "epoch": 0.8106433649076943, + "grad_norm": 1.499837040901184, + "learning_rate": 1.8221226453932074e-05, + "loss": 1.3224, + "step": 22636 + }, + { + "epoch": 0.8106791770372626, + "grad_norm": 2.104693651199341, + "learning_rate": 1.821455154064693e-05, + "loss": 1.3378, + "step": 22637 + }, + { + "epoch": 0.8107149891668308, + "grad_norm": 2.1410248279571533, + "learning_rate": 1.8207877727679523e-05, + "loss": 1.29, + "step": 22638 + }, + { + "epoch": 0.8107508012963991, + "grad_norm": 1.8281619548797607, + "learning_rate": 1.820120501511957e-05, + "loss": 1.3479, + "step": 22639 + }, + { + "epoch": 0.8107866134259674, + "grad_norm": 1.728197693824768, + "learning_rate": 1.8194533403056935e-05, + "loss": 1.5324, + "step": 22640 + }, + { + "epoch": 0.8108224255555356, + "grad_norm": 1.576294183731079, + "learning_rate": 1.8187862891581343e-05, + "loss": 1.4459, + "step": 22641 + }, + { + "epoch": 0.8108582376851039, + "grad_norm": 1.467061996459961, + "learning_rate": 1.8181193480782466e-05, + "loss": 1.1734, + "step": 22642 + }, + { + "epoch": 0.8108940498146723, + "grad_norm": 1.5236502885818481, + "learning_rate": 1.8174525170750145e-05, + "loss": 1.0675, + "step": 22643 + }, + { + "epoch": 0.8109298619442405, + "grad_norm": 1.7928649187088013, + "learning_rate": 1.816785796157402e-05, + "loss": 1.6023, + "step": 22644 + }, + { + "epoch": 0.8109656740738088, + "grad_norm": 2.2303380966186523, + "learning_rate": 1.8161191853343827e-05, + "loss": 1.3182, + "step": 22645 + }, + { + "epoch": 0.8110014862033771, + "grad_norm": 1.2965222597122192, + "learning_rate": 1.815452684614919e-05, + "loss": 1.5056, + "step": 22646 + }, + { + "epoch": 0.8110372983329454, + "grad_norm": 2.0349364280700684, + "learning_rate": 1.8147862940079875e-05, + "loss": 1.7971, + "step": 22647 + }, + { + "epoch": 0.8110731104625136, + "grad_norm": 1.655045509338379, + "learning_rate": 1.8141200135225444e-05, + "loss": 1.6047, + "step": 22648 + }, + { + "epoch": 0.8111089225920819, + "grad_norm": 1.3515609502792358, + "learning_rate": 1.8134538431675608e-05, + "loss": 1.519, + "step": 22649 + }, + { + "epoch": 0.8111447347216503, + "grad_norm": 1.5632681846618652, + "learning_rate": 1.8127877829519935e-05, + "loss": 1.5632, + "step": 22650 + }, + { + "epoch": 0.8111805468512185, + "grad_norm": 1.6890665292739868, + "learning_rate": 1.8121218328848054e-05, + "loss": 1.2321, + "step": 22651 + }, + { + "epoch": 0.8112163589807868, + "grad_norm": 1.3986566066741943, + "learning_rate": 1.8114559929749586e-05, + "loss": 1.0644, + "step": 22652 + }, + { + "epoch": 0.8112521711103551, + "grad_norm": 2.0749661922454834, + "learning_rate": 1.8107902632314044e-05, + "loss": 1.2105, + "step": 22653 + }, + { + "epoch": 0.8112879832399233, + "grad_norm": 1.7615954875946045, + "learning_rate": 1.8101246436631093e-05, + "loss": 1.3472, + "step": 22654 + }, + { + "epoch": 0.8113237953694916, + "grad_norm": 1.7018330097198486, + "learning_rate": 1.8094591342790202e-05, + "loss": 1.5256, + "step": 22655 + }, + { + "epoch": 0.8113596074990599, + "grad_norm": 1.8836838006973267, + "learning_rate": 1.8087937350880957e-05, + "loss": 1.4031, + "step": 22656 + }, + { + "epoch": 0.8113954196286283, + "grad_norm": 1.560875415802002, + "learning_rate": 1.8081284460992808e-05, + "loss": 1.2982, + "step": 22657 + }, + { + "epoch": 0.8114312317581965, + "grad_norm": 1.307560920715332, + "learning_rate": 1.8074632673215365e-05, + "loss": 1.4884, + "step": 22658 + }, + { + "epoch": 0.8114670438877648, + "grad_norm": 2.2579801082611084, + "learning_rate": 1.806798198763805e-05, + "loss": 1.6971, + "step": 22659 + }, + { + "epoch": 0.8115028560173331, + "grad_norm": 1.286411166191101, + "learning_rate": 1.806133240435034e-05, + "loss": 1.2505, + "step": 22660 + }, + { + "epoch": 0.8115386681469013, + "grad_norm": 1.7834134101867676, + "learning_rate": 1.8054683923441694e-05, + "loss": 1.4802, + "step": 22661 + }, + { + "epoch": 0.8115744802764696, + "grad_norm": 1.6677281856536865, + "learning_rate": 1.804803654500159e-05, + "loss": 1.3463, + "step": 22662 + }, + { + "epoch": 0.8116102924060379, + "grad_norm": 1.974789023399353, + "learning_rate": 1.8041390269119463e-05, + "loss": 1.7445, + "step": 22663 + }, + { + "epoch": 0.8116461045356063, + "grad_norm": 2.2561769485473633, + "learning_rate": 1.8034745095884687e-05, + "loss": 1.4882, + "step": 22664 + }, + { + "epoch": 0.8116819166651745, + "grad_norm": 1.5701444149017334, + "learning_rate": 1.802810102538668e-05, + "loss": 1.6699, + "step": 22665 + }, + { + "epoch": 0.8117177287947428, + "grad_norm": 2.0393314361572266, + "learning_rate": 1.8021458057714845e-05, + "loss": 1.4406, + "step": 22666 + }, + { + "epoch": 0.8117535409243111, + "grad_norm": 2.785156488418579, + "learning_rate": 1.8014816192958574e-05, + "loss": 1.2964, + "step": 22667 + }, + { + "epoch": 0.8117893530538793, + "grad_norm": 1.4221245050430298, + "learning_rate": 1.8008175431207173e-05, + "loss": 1.4925, + "step": 22668 + }, + { + "epoch": 0.8118251651834476, + "grad_norm": 1.7360721826553345, + "learning_rate": 1.8001535772550006e-05, + "loss": 1.3764, + "step": 22669 + }, + { + "epoch": 0.8118609773130159, + "grad_norm": 1.4904028177261353, + "learning_rate": 1.7994897217076423e-05, + "loss": 1.1439, + "step": 22670 + }, + { + "epoch": 0.8118967894425843, + "grad_norm": 1.3772350549697876, + "learning_rate": 1.7988259764875705e-05, + "loss": 1.4313, + "step": 22671 + }, + { + "epoch": 0.8119326015721525, + "grad_norm": 3.0207619667053223, + "learning_rate": 1.7981623416037163e-05, + "loss": 1.1208, + "step": 22672 + }, + { + "epoch": 0.8119684137017208, + "grad_norm": 1.3616470098495483, + "learning_rate": 1.7974988170650075e-05, + "loss": 1.4309, + "step": 22673 + }, + { + "epoch": 0.8120042258312891, + "grad_norm": 1.674111247062683, + "learning_rate": 1.7968354028803748e-05, + "loss": 1.4841, + "step": 22674 + }, + { + "epoch": 0.8120400379608573, + "grad_norm": 1.7297577857971191, + "learning_rate": 1.7961720990587382e-05, + "loss": 1.0489, + "step": 22675 + }, + { + "epoch": 0.8120758500904256, + "grad_norm": 1.3222150802612305, + "learning_rate": 1.7955089056090234e-05, + "loss": 1.5225, + "step": 22676 + }, + { + "epoch": 0.8121116622199939, + "grad_norm": 1.6286765336990356, + "learning_rate": 1.7948458225401553e-05, + "loss": 1.7537, + "step": 22677 + }, + { + "epoch": 0.8121474743495622, + "grad_norm": 1.5624029636383057, + "learning_rate": 1.7941828498610503e-05, + "loss": 1.5025, + "step": 22678 + }, + { + "epoch": 0.8121832864791305, + "grad_norm": 1.589890956878662, + "learning_rate": 1.793519987580631e-05, + "loss": 1.1414, + "step": 22679 + }, + { + "epoch": 0.8122190986086988, + "grad_norm": 2.1349070072174072, + "learning_rate": 1.7928572357078143e-05, + "loss": 1.079, + "step": 22680 + }, + { + "epoch": 0.812254910738267, + "grad_norm": 1.6790281534194946, + "learning_rate": 1.7921945942515195e-05, + "loss": 1.5152, + "step": 22681 + }, + { + "epoch": 0.8122907228678353, + "grad_norm": 1.7030316591262817, + "learning_rate": 1.7915320632206566e-05, + "loss": 1.0342, + "step": 22682 + }, + { + "epoch": 0.8123265349974036, + "grad_norm": 1.1964973211288452, + "learning_rate": 1.7908696426241422e-05, + "loss": 1.2793, + "step": 22683 + }, + { + "epoch": 0.8123623471269719, + "grad_norm": 1.6645698547363281, + "learning_rate": 1.790207332470887e-05, + "loss": 1.5247, + "step": 22684 + }, + { + "epoch": 0.8123981592565402, + "grad_norm": 2.757772207260132, + "learning_rate": 1.7895451327698054e-05, + "loss": 1.5142, + "step": 22685 + }, + { + "epoch": 0.8124339713861085, + "grad_norm": 1.540874719619751, + "learning_rate": 1.788883043529801e-05, + "loss": 1.4304, + "step": 22686 + }, + { + "epoch": 0.8124697835156768, + "grad_norm": 1.5452402830123901, + "learning_rate": 1.788221064759783e-05, + "loss": 1.5054, + "step": 22687 + }, + { + "epoch": 0.812505595645245, + "grad_norm": 1.5464129447937012, + "learning_rate": 1.78755919646866e-05, + "loss": 1.294, + "step": 22688 + }, + { + "epoch": 0.8125414077748133, + "grad_norm": 2.5941967964172363, + "learning_rate": 1.7868974386653336e-05, + "loss": 1.4364, + "step": 22689 + }, + { + "epoch": 0.8125772199043816, + "grad_norm": 1.8890364170074463, + "learning_rate": 1.786235791358707e-05, + "loss": 1.3582, + "step": 22690 + }, + { + "epoch": 0.8126130320339499, + "grad_norm": 2.1199588775634766, + "learning_rate": 1.7855742545576836e-05, + "loss": 1.8747, + "step": 22691 + }, + { + "epoch": 0.8126488441635182, + "grad_norm": 2.581043004989624, + "learning_rate": 1.7849128282711647e-05, + "loss": 1.471, + "step": 22692 + }, + { + "epoch": 0.8126846562930865, + "grad_norm": 1.238418698310852, + "learning_rate": 1.784251512508045e-05, + "loss": 1.4502, + "step": 22693 + }, + { + "epoch": 0.8127204684226548, + "grad_norm": 1.6828545331954956, + "learning_rate": 1.783590307277223e-05, + "loss": 1.8595, + "step": 22694 + }, + { + "epoch": 0.812756280552223, + "grad_norm": 1.997360348701477, + "learning_rate": 1.7829292125875984e-05, + "loss": 1.4948, + "step": 22695 + }, + { + "epoch": 0.8127920926817913, + "grad_norm": 1.4616385698318481, + "learning_rate": 1.7822682284480585e-05, + "loss": 1.6081, + "step": 22696 + }, + { + "epoch": 0.8128279048113596, + "grad_norm": 1.9267231225967407, + "learning_rate": 1.7816073548675004e-05, + "loss": 1.3649, + "step": 22697 + }, + { + "epoch": 0.8128637169409278, + "grad_norm": 2.3256096839904785, + "learning_rate": 1.780946591854814e-05, + "loss": 1.3092, + "step": 22698 + }, + { + "epoch": 0.8128995290704962, + "grad_norm": 1.9210196733474731, + "learning_rate": 1.780285939418892e-05, + "loss": 1.3377, + "step": 22699 + }, + { + "epoch": 0.8129353412000645, + "grad_norm": 1.4383782148361206, + "learning_rate": 1.7796253975686172e-05, + "loss": 1.3547, + "step": 22700 + }, + { + "epoch": 0.8129711533296328, + "grad_norm": 1.6383898258209229, + "learning_rate": 1.7789649663128793e-05, + "loss": 1.1577, + "step": 22701 + }, + { + "epoch": 0.813006965459201, + "grad_norm": 1.306366205215454, + "learning_rate": 1.7783046456605658e-05, + "loss": 1.4998, + "step": 22702 + }, + { + "epoch": 0.8130427775887693, + "grad_norm": 1.700914978981018, + "learning_rate": 1.7776444356205556e-05, + "loss": 1.4648, + "step": 22703 + }, + { + "epoch": 0.8130785897183376, + "grad_norm": 1.3967289924621582, + "learning_rate": 1.7769843362017336e-05, + "loss": 1.6431, + "step": 22704 + }, + { + "epoch": 0.8131144018479058, + "grad_norm": 1.7971333265304565, + "learning_rate": 1.77632434741298e-05, + "loss": 1.5475, + "step": 22705 + }, + { + "epoch": 0.8131502139774742, + "grad_norm": 1.5884615182876587, + "learning_rate": 1.7756644692631773e-05, + "loss": 1.3453, + "step": 22706 + }, + { + "epoch": 0.8131860261070425, + "grad_norm": 2.3906350135803223, + "learning_rate": 1.7750047017611983e-05, + "loss": 1.4263, + "step": 22707 + }, + { + "epoch": 0.8132218382366108, + "grad_norm": 1.615214228630066, + "learning_rate": 1.7743450449159217e-05, + "loss": 1.0195, + "step": 22708 + }, + { + "epoch": 0.813257650366179, + "grad_norm": 1.702710509300232, + "learning_rate": 1.7736854987362217e-05, + "loss": 1.5819, + "step": 22709 + }, + { + "epoch": 0.8132934624957473, + "grad_norm": 1.4902650117874146, + "learning_rate": 1.773026063230975e-05, + "loss": 1.6003, + "step": 22710 + }, + { + "epoch": 0.8133292746253156, + "grad_norm": 1.6640796661376953, + "learning_rate": 1.7723667384090503e-05, + "loss": 1.5305, + "step": 22711 + }, + { + "epoch": 0.8133650867548838, + "grad_norm": 1.3970403671264648, + "learning_rate": 1.7717075242793123e-05, + "loss": 1.4203, + "step": 22712 + }, + { + "epoch": 0.8134008988844522, + "grad_norm": 1.4592771530151367, + "learning_rate": 1.771048420850643e-05, + "loss": 1.5402, + "step": 22713 + }, + { + "epoch": 0.8134367110140205, + "grad_norm": 1.4296540021896362, + "learning_rate": 1.770389428131899e-05, + "loss": 1.4899, + "step": 22714 + }, + { + "epoch": 0.8134725231435888, + "grad_norm": 1.440382957458496, + "learning_rate": 1.769730546131949e-05, + "loss": 1.4909, + "step": 22715 + }, + { + "epoch": 0.813508335273157, + "grad_norm": 1.3283933401107788, + "learning_rate": 1.7690717748596585e-05, + "loss": 1.6033, + "step": 22716 + }, + { + "epoch": 0.8135441474027253, + "grad_norm": 1.8007736206054688, + "learning_rate": 1.7684131143238937e-05, + "loss": 1.4637, + "step": 22717 + }, + { + "epoch": 0.8135799595322936, + "grad_norm": 1.7173250913619995, + "learning_rate": 1.767754564533509e-05, + "loss": 1.4911, + "step": 22718 + }, + { + "epoch": 0.8136157716618618, + "grad_norm": 1.7633723020553589, + "learning_rate": 1.7670961254973682e-05, + "loss": 1.6723, + "step": 22719 + }, + { + "epoch": 0.8136515837914302, + "grad_norm": 2.1998519897460938, + "learning_rate": 1.766437797224332e-05, + "loss": 1.2689, + "step": 22720 + }, + { + "epoch": 0.8136873959209985, + "grad_norm": 1.6470333337783813, + "learning_rate": 1.7657795797232525e-05, + "loss": 1.4681, + "step": 22721 + }, + { + "epoch": 0.8137232080505667, + "grad_norm": 1.4363399744033813, + "learning_rate": 1.7651214730029897e-05, + "loss": 1.5119, + "step": 22722 + }, + { + "epoch": 0.813759020180135, + "grad_norm": 1.6144115924835205, + "learning_rate": 1.7644634770723888e-05, + "loss": 1.2808, + "step": 22723 + }, + { + "epoch": 0.8137948323097033, + "grad_norm": 1.3689866065979004, + "learning_rate": 1.763805591940315e-05, + "loss": 1.2235, + "step": 22724 + }, + { + "epoch": 0.8138306444392716, + "grad_norm": 1.7141789197921753, + "learning_rate": 1.7631478176156113e-05, + "loss": 1.2982, + "step": 22725 + }, + { + "epoch": 0.8138664565688398, + "grad_norm": 1.6982225179672241, + "learning_rate": 1.762490154107128e-05, + "loss": 1.4998, + "step": 22726 + }, + { + "epoch": 0.8139022686984082, + "grad_norm": 1.3144981861114502, + "learning_rate": 1.761832601423714e-05, + "loss": 1.3089, + "step": 22727 + }, + { + "epoch": 0.8139380808279765, + "grad_norm": 1.2160977125167847, + "learning_rate": 1.7611751595742188e-05, + "loss": 1.0489, + "step": 22728 + }, + { + "epoch": 0.8139738929575447, + "grad_norm": 1.6171183586120605, + "learning_rate": 1.760517828567484e-05, + "loss": 1.6103, + "step": 22729 + }, + { + "epoch": 0.814009705087113, + "grad_norm": 1.4586262702941895, + "learning_rate": 1.759860608412349e-05, + "loss": 1.0743, + "step": 22730 + }, + { + "epoch": 0.8140455172166813, + "grad_norm": 1.4333014488220215, + "learning_rate": 1.7592034991176654e-05, + "loss": 1.4654, + "step": 22731 + }, + { + "epoch": 0.8140813293462495, + "grad_norm": 1.6419109106063843, + "learning_rate": 1.7585465006922662e-05, + "loss": 1.2001, + "step": 22732 + }, + { + "epoch": 0.8141171414758178, + "grad_norm": 1.2818939685821533, + "learning_rate": 1.757889613144995e-05, + "loss": 1.3681, + "step": 22733 + }, + { + "epoch": 0.8141529536053862, + "grad_norm": 1.5382506847381592, + "learning_rate": 1.7572328364846836e-05, + "loss": 1.4991, + "step": 22734 + }, + { + "epoch": 0.8141887657349545, + "grad_norm": 1.436592698097229, + "learning_rate": 1.7565761707201767e-05, + "loss": 1.2363, + "step": 22735 + }, + { + "epoch": 0.8142245778645227, + "grad_norm": 1.5679230690002441, + "learning_rate": 1.7559196158603018e-05, + "loss": 1.3993, + "step": 22736 + }, + { + "epoch": 0.814260389994091, + "grad_norm": 1.6096587181091309, + "learning_rate": 1.755263171913897e-05, + "loss": 1.2466, + "step": 22737 + }, + { + "epoch": 0.8142962021236593, + "grad_norm": 1.662288784980774, + "learning_rate": 1.7546068388897885e-05, + "loss": 1.4613, + "step": 22738 + }, + { + "epoch": 0.8143320142532275, + "grad_norm": 1.7570534944534302, + "learning_rate": 1.753950616796809e-05, + "loss": 1.493, + "step": 22739 + }, + { + "epoch": 0.8143678263827958, + "grad_norm": 1.7655926942825317, + "learning_rate": 1.75329450564379e-05, + "loss": 1.3305, + "step": 22740 + }, + { + "epoch": 0.8144036385123642, + "grad_norm": 1.5644776821136475, + "learning_rate": 1.75263850543955e-05, + "loss": 1.2155, + "step": 22741 + }, + { + "epoch": 0.8144394506419325, + "grad_norm": 2.489290714263916, + "learning_rate": 1.7519826161929266e-05, + "loss": 1.6175, + "step": 22742 + }, + { + "epoch": 0.8144752627715007, + "grad_norm": 1.4218047857284546, + "learning_rate": 1.7513268379127356e-05, + "loss": 1.5427, + "step": 22743 + }, + { + "epoch": 0.814511074901069, + "grad_norm": 1.5447837114334106, + "learning_rate": 1.750671170607804e-05, + "loss": 1.0916, + "step": 22744 + }, + { + "epoch": 0.8145468870306373, + "grad_norm": 1.4648761749267578, + "learning_rate": 1.7500156142869462e-05, + "loss": 1.311, + "step": 22745 + }, + { + "epoch": 0.8145826991602055, + "grad_norm": 1.8207685947418213, + "learning_rate": 1.7493601689589913e-05, + "loss": 1.169, + "step": 22746 + }, + { + "epoch": 0.8146185112897738, + "grad_norm": 1.640710711479187, + "learning_rate": 1.748704834632753e-05, + "loss": 1.5011, + "step": 22747 + }, + { + "epoch": 0.8146543234193422, + "grad_norm": 1.3525753021240234, + "learning_rate": 1.748049611317045e-05, + "loss": 1.5688, + "step": 22748 + }, + { + "epoch": 0.8146901355489105, + "grad_norm": 1.8010128736495972, + "learning_rate": 1.7473944990206858e-05, + "loss": 1.1804, + "step": 22749 + }, + { + "epoch": 0.8147259476784787, + "grad_norm": 1.575965404510498, + "learning_rate": 1.7467394977524876e-05, + "loss": 1.385, + "step": 22750 + }, + { + "epoch": 0.814761759808047, + "grad_norm": 1.5879836082458496, + "learning_rate": 1.7460846075212666e-05, + "loss": 1.0562, + "step": 22751 + }, + { + "epoch": 0.8147975719376153, + "grad_norm": 1.2580584287643433, + "learning_rate": 1.745429828335826e-05, + "loss": 1.467, + "step": 22752 + }, + { + "epoch": 0.8148333840671835, + "grad_norm": 1.7832731008529663, + "learning_rate": 1.7447751602049835e-05, + "loss": 1.1617, + "step": 22753 + }, + { + "epoch": 0.8148691961967518, + "grad_norm": 1.6994675397872925, + "learning_rate": 1.74412060313754e-05, + "loss": 1.5291, + "step": 22754 + }, + { + "epoch": 0.8149050083263202, + "grad_norm": 1.608042597770691, + "learning_rate": 1.7434661571423084e-05, + "loss": 1.4662, + "step": 22755 + }, + { + "epoch": 0.8149408204558884, + "grad_norm": 1.4791151285171509, + "learning_rate": 1.7428118222280855e-05, + "loss": 1.4369, + "step": 22756 + }, + { + "epoch": 0.8149766325854567, + "grad_norm": 1.5534361600875854, + "learning_rate": 1.7421575984036797e-05, + "loss": 1.4834, + "step": 22757 + }, + { + "epoch": 0.815012444715025, + "grad_norm": 1.302477240562439, + "learning_rate": 1.7415034856778934e-05, + "loss": 1.4125, + "step": 22758 + }, + { + "epoch": 0.8150482568445933, + "grad_norm": 1.541063666343689, + "learning_rate": 1.7408494840595224e-05, + "loss": 1.2171, + "step": 22759 + }, + { + "epoch": 0.8150840689741615, + "grad_norm": 2.021052122116089, + "learning_rate": 1.7401955935573688e-05, + "loss": 1.6671, + "step": 22760 + }, + { + "epoch": 0.8151198811037298, + "grad_norm": 2.089958429336548, + "learning_rate": 1.739541814180228e-05, + "loss": 1.5129, + "step": 22761 + }, + { + "epoch": 0.8151556932332982, + "grad_norm": 1.526520848274231, + "learning_rate": 1.7388881459369e-05, + "loss": 1.4541, + "step": 22762 + }, + { + "epoch": 0.8151915053628664, + "grad_norm": 1.9976831674575806, + "learning_rate": 1.738234588836174e-05, + "loss": 1.3426, + "step": 22763 + }, + { + "epoch": 0.8152273174924347, + "grad_norm": 1.4353028535842896, + "learning_rate": 1.737581142886844e-05, + "loss": 1.292, + "step": 22764 + }, + { + "epoch": 0.815263129622003, + "grad_norm": 1.6733012199401855, + "learning_rate": 1.7369278080977037e-05, + "loss": 1.4501, + "step": 22765 + }, + { + "epoch": 0.8152989417515712, + "grad_norm": 1.6345852613449097, + "learning_rate": 1.7362745844775396e-05, + "loss": 1.4787, + "step": 22766 + }, + { + "epoch": 0.8153347538811395, + "grad_norm": 1.6095476150512695, + "learning_rate": 1.7356214720351414e-05, + "loss": 1.1239, + "step": 22767 + }, + { + "epoch": 0.8153705660107078, + "grad_norm": 2.355900526046753, + "learning_rate": 1.7349684707792956e-05, + "loss": 1.4855, + "step": 22768 + }, + { + "epoch": 0.8154063781402762, + "grad_norm": 1.734176516532898, + "learning_rate": 1.7343155807187915e-05, + "loss": 1.3749, + "step": 22769 + }, + { + "epoch": 0.8154421902698444, + "grad_norm": 2.209763765335083, + "learning_rate": 1.7336628018624058e-05, + "loss": 1.4435, + "step": 22770 + }, + { + "epoch": 0.8154780023994127, + "grad_norm": 1.8333766460418701, + "learning_rate": 1.7330101342189254e-05, + "loss": 1.4678, + "step": 22771 + }, + { + "epoch": 0.815513814528981, + "grad_norm": 2.5825207233428955, + "learning_rate": 1.732357577797129e-05, + "loss": 1.0537, + "step": 22772 + }, + { + "epoch": 0.8155496266585492, + "grad_norm": 1.7145079374313354, + "learning_rate": 1.7317051326057998e-05, + "loss": 1.4113, + "step": 22773 + }, + { + "epoch": 0.8155854387881175, + "grad_norm": 1.4004257917404175, + "learning_rate": 1.7310527986537095e-05, + "loss": 1.4613, + "step": 22774 + }, + { + "epoch": 0.8156212509176858, + "grad_norm": 1.3757426738739014, + "learning_rate": 1.7304005759496377e-05, + "loss": 1.5297, + "step": 22775 + }, + { + "epoch": 0.815657063047254, + "grad_norm": 1.3896691799163818, + "learning_rate": 1.729748464502362e-05, + "loss": 1.3216, + "step": 22776 + }, + { + "epoch": 0.8156928751768224, + "grad_norm": 1.7896462678909302, + "learning_rate": 1.729096464320651e-05, + "loss": 1.4618, + "step": 22777 + }, + { + "epoch": 0.8157286873063907, + "grad_norm": 1.3658167123794556, + "learning_rate": 1.7284445754132772e-05, + "loss": 1.6294, + "step": 22778 + }, + { + "epoch": 0.815764499435959, + "grad_norm": 1.7124221324920654, + "learning_rate": 1.727792797789013e-05, + "loss": 1.4627, + "step": 22779 + }, + { + "epoch": 0.8158003115655272, + "grad_norm": 1.5317054986953735, + "learning_rate": 1.7271411314566287e-05, + "loss": 1.4144, + "step": 22780 + }, + { + "epoch": 0.8158361236950955, + "grad_norm": 1.3445594310760498, + "learning_rate": 1.7264895764248868e-05, + "loss": 1.4471, + "step": 22781 + }, + { + "epoch": 0.8158719358246638, + "grad_norm": 1.3334885835647583, + "learning_rate": 1.7258381327025552e-05, + "loss": 1.3444, + "step": 22782 + }, + { + "epoch": 0.815907747954232, + "grad_norm": 1.8246225118637085, + "learning_rate": 1.7251868002984005e-05, + "loss": 1.5161, + "step": 22783 + }, + { + "epoch": 0.8159435600838004, + "grad_norm": 1.5593852996826172, + "learning_rate": 1.7245355792211826e-05, + "loss": 1.6261, + "step": 22784 + }, + { + "epoch": 0.8159793722133687, + "grad_norm": 1.728540301322937, + "learning_rate": 1.723884469479663e-05, + "loss": 1.3071, + "step": 22785 + }, + { + "epoch": 0.816015184342937, + "grad_norm": 1.4976931810379028, + "learning_rate": 1.7232334710826025e-05, + "loss": 1.4344, + "step": 22786 + }, + { + "epoch": 0.8160509964725052, + "grad_norm": 1.657505989074707, + "learning_rate": 1.722582584038762e-05, + "loss": 1.5328, + "step": 22787 + }, + { + "epoch": 0.8160868086020735, + "grad_norm": 1.5885651111602783, + "learning_rate": 1.7219318083568937e-05, + "loss": 1.1093, + "step": 22788 + }, + { + "epoch": 0.8161226207316418, + "grad_norm": 1.4087108373641968, + "learning_rate": 1.7212811440457545e-05, + "loss": 1.2369, + "step": 22789 + }, + { + "epoch": 0.81615843286121, + "grad_norm": 1.7424745559692383, + "learning_rate": 1.7206305911141017e-05, + "loss": 1.5162, + "step": 22790 + }, + { + "epoch": 0.8161942449907784, + "grad_norm": 1.5498261451721191, + "learning_rate": 1.7199801495706812e-05, + "loss": 1.087, + "step": 22791 + }, + { + "epoch": 0.8162300571203467, + "grad_norm": 1.4619683027267456, + "learning_rate": 1.719329819424248e-05, + "loss": 1.4748, + "step": 22792 + }, + { + "epoch": 0.816265869249915, + "grad_norm": 1.8188879489898682, + "learning_rate": 1.7186796006835514e-05, + "loss": 1.5167, + "step": 22793 + }, + { + "epoch": 0.8163016813794832, + "grad_norm": 1.7282185554504395, + "learning_rate": 1.7180294933573405e-05, + "loss": 1.5527, + "step": 22794 + }, + { + "epoch": 0.8163374935090515, + "grad_norm": 1.842283844947815, + "learning_rate": 1.7173794974543568e-05, + "loss": 1.3138, + "step": 22795 + }, + { + "epoch": 0.8163733056386198, + "grad_norm": 2.1575214862823486, + "learning_rate": 1.7167296129833488e-05, + "loss": 1.7364, + "step": 22796 + }, + { + "epoch": 0.816409117768188, + "grad_norm": 1.707844614982605, + "learning_rate": 1.7160798399530586e-05, + "loss": 1.282, + "step": 22797 + }, + { + "epoch": 0.8164449298977564, + "grad_norm": 1.5645873546600342, + "learning_rate": 1.7154301783722315e-05, + "loss": 1.1909, + "step": 22798 + }, + { + "epoch": 0.8164807420273247, + "grad_norm": 1.1664835214614868, + "learning_rate": 1.7147806282496027e-05, + "loss": 1.4765, + "step": 22799 + }, + { + "epoch": 0.816516554156893, + "grad_norm": 1.5875605344772339, + "learning_rate": 1.7141311895939137e-05, + "loss": 1.5326, + "step": 22800 + }, + { + "epoch": 0.8165523662864612, + "grad_norm": 1.8210203647613525, + "learning_rate": 1.7134818624139036e-05, + "loss": 1.204, + "step": 22801 + }, + { + "epoch": 0.8165881784160295, + "grad_norm": 1.5193594694137573, + "learning_rate": 1.7128326467183032e-05, + "loss": 1.2251, + "step": 22802 + }, + { + "epoch": 0.8166239905455978, + "grad_norm": 1.719142198562622, + "learning_rate": 1.7121835425158506e-05, + "loss": 1.6163, + "step": 22803 + }, + { + "epoch": 0.816659802675166, + "grad_norm": 1.998476266860962, + "learning_rate": 1.711534549815278e-05, + "loss": 1.3592, + "step": 22804 + }, + { + "epoch": 0.8166956148047344, + "grad_norm": 1.435386300086975, + "learning_rate": 1.7108856686253183e-05, + "loss": 1.2512, + "step": 22805 + }, + { + "epoch": 0.8167314269343027, + "grad_norm": 1.4732717275619507, + "learning_rate": 1.710236898954698e-05, + "loss": 1.3226, + "step": 22806 + }, + { + "epoch": 0.8167672390638709, + "grad_norm": 1.7154697179794312, + "learning_rate": 1.7095882408121468e-05, + "loss": 1.2685, + "step": 22807 + }, + { + "epoch": 0.8168030511934392, + "grad_norm": 1.3846417665481567, + "learning_rate": 1.708939694206395e-05, + "loss": 1.543, + "step": 22808 + }, + { + "epoch": 0.8168388633230075, + "grad_norm": 1.680985689163208, + "learning_rate": 1.708291259146162e-05, + "loss": 1.3678, + "step": 22809 + }, + { + "epoch": 0.8168746754525757, + "grad_norm": 1.3640410900115967, + "learning_rate": 1.7076429356401748e-05, + "loss": 1.5972, + "step": 22810 + }, + { + "epoch": 0.816910487582144, + "grad_norm": 1.7066844701766968, + "learning_rate": 1.706994723697155e-05, + "loss": 1.1552, + "step": 22811 + }, + { + "epoch": 0.8169462997117124, + "grad_norm": 2.2980430126190186, + "learning_rate": 1.7063466233258275e-05, + "loss": 1.3957, + "step": 22812 + }, + { + "epoch": 0.8169821118412807, + "grad_norm": 1.7297414541244507, + "learning_rate": 1.7056986345349046e-05, + "loss": 1.5695, + "step": 22813 + }, + { + "epoch": 0.8170179239708489, + "grad_norm": 2.15028977394104, + "learning_rate": 1.7050507573331077e-05, + "loss": 1.2113, + "step": 22814 + }, + { + "epoch": 0.8170537361004172, + "grad_norm": 1.3350656032562256, + "learning_rate": 1.7044029917291536e-05, + "loss": 1.1719, + "step": 22815 + }, + { + "epoch": 0.8170895482299855, + "grad_norm": 1.7217737436294556, + "learning_rate": 1.7037553377317595e-05, + "loss": 1.2585, + "step": 22816 + }, + { + "epoch": 0.8171253603595537, + "grad_norm": 1.6362788677215576, + "learning_rate": 1.7031077953496356e-05, + "loss": 1.433, + "step": 22817 + }, + { + "epoch": 0.817161172489122, + "grad_norm": 1.7198362350463867, + "learning_rate": 1.7024603645914896e-05, + "loss": 1.3631, + "step": 22818 + }, + { + "epoch": 0.8171969846186904, + "grad_norm": 1.4463917016983032, + "learning_rate": 1.7018130454660395e-05, + "loss": 1.486, + "step": 22819 + }, + { + "epoch": 0.8172327967482587, + "grad_norm": 2.732374906539917, + "learning_rate": 1.7011658379819904e-05, + "loss": 1.2624, + "step": 22820 + }, + { + "epoch": 0.8172686088778269, + "grad_norm": 1.4441410303115845, + "learning_rate": 1.7005187421480517e-05, + "loss": 1.2175, + "step": 22821 + }, + { + "epoch": 0.8173044210073952, + "grad_norm": 1.7178325653076172, + "learning_rate": 1.699871757972924e-05, + "loss": 1.5518, + "step": 22822 + }, + { + "epoch": 0.8173402331369635, + "grad_norm": 2.081376075744629, + "learning_rate": 1.6992248854653192e-05, + "loss": 1.4921, + "step": 22823 + }, + { + "epoch": 0.8173760452665317, + "grad_norm": 2.20284104347229, + "learning_rate": 1.698578124633934e-05, + "loss": 1.6761, + "step": 22824 + }, + { + "epoch": 0.8174118573961, + "grad_norm": 1.9613070487976074, + "learning_rate": 1.6979314754874733e-05, + "loss": 1.6807, + "step": 22825 + }, + { + "epoch": 0.8174476695256684, + "grad_norm": 1.3908569812774658, + "learning_rate": 1.6972849380346367e-05, + "loss": 1.3088, + "step": 22826 + }, + { + "epoch": 0.8174834816552367, + "grad_norm": 1.7931389808654785, + "learning_rate": 1.696638512284119e-05, + "loss": 1.5472, + "step": 22827 + }, + { + "epoch": 0.8175192937848049, + "grad_norm": 2.0211594104766846, + "learning_rate": 1.6959921982446225e-05, + "loss": 1.6009, + "step": 22828 + }, + { + "epoch": 0.8175551059143732, + "grad_norm": 1.730381965637207, + "learning_rate": 1.6953459959248354e-05, + "loss": 1.2824, + "step": 22829 + }, + { + "epoch": 0.8175909180439415, + "grad_norm": 1.6810767650604248, + "learning_rate": 1.69469990533346e-05, + "loss": 1.3071, + "step": 22830 + }, + { + "epoch": 0.8176267301735097, + "grad_norm": 1.639445185661316, + "learning_rate": 1.694053926479181e-05, + "loss": 1.2268, + "step": 22831 + }, + { + "epoch": 0.817662542303078, + "grad_norm": 1.8805431127548218, + "learning_rate": 1.6934080593706958e-05, + "loss": 1.2137, + "step": 22832 + }, + { + "epoch": 0.8176983544326464, + "grad_norm": 1.8979594707489014, + "learning_rate": 1.692762304016685e-05, + "loss": 1.4345, + "step": 22833 + }, + { + "epoch": 0.8177341665622146, + "grad_norm": 1.65707528591156, + "learning_rate": 1.6921166604258475e-05, + "loss": 1.3987, + "step": 22834 + }, + { + "epoch": 0.8177699786917829, + "grad_norm": 1.6859840154647827, + "learning_rate": 1.691471128606864e-05, + "loss": 1.6425, + "step": 22835 + }, + { + "epoch": 0.8178057908213512, + "grad_norm": 1.899539589881897, + "learning_rate": 1.6908257085684143e-05, + "loss": 1.5508, + "step": 22836 + }, + { + "epoch": 0.8178416029509195, + "grad_norm": 1.8419172763824463, + "learning_rate": 1.6901804003191914e-05, + "loss": 1.3642, + "step": 22837 + }, + { + "epoch": 0.8178774150804877, + "grad_norm": 1.5693961381912231, + "learning_rate": 1.6895352038678692e-05, + "loss": 1.6069, + "step": 22838 + }, + { + "epoch": 0.817913227210056, + "grad_norm": 1.3967636823654175, + "learning_rate": 1.6888901192231342e-05, + "loss": 1.4029, + "step": 22839 + }, + { + "epoch": 0.8179490393396244, + "grad_norm": 1.2297767400741577, + "learning_rate": 1.6882451463936566e-05, + "loss": 1.2852, + "step": 22840 + }, + { + "epoch": 0.8179848514691926, + "grad_norm": 1.8269104957580566, + "learning_rate": 1.6876002853881244e-05, + "loss": 1.4348, + "step": 22841 + }, + { + "epoch": 0.8180206635987609, + "grad_norm": 1.6397745609283447, + "learning_rate": 1.6869555362152056e-05, + "loss": 1.2093, + "step": 22842 + }, + { + "epoch": 0.8180564757283292, + "grad_norm": 1.4336014986038208, + "learning_rate": 1.6863108988835797e-05, + "loss": 1.1987, + "step": 22843 + }, + { + "epoch": 0.8180922878578974, + "grad_norm": 2.0080535411834717, + "learning_rate": 1.685666373401914e-05, + "loss": 1.517, + "step": 22844 + }, + { + "epoch": 0.8181280999874657, + "grad_norm": 1.9427084922790527, + "learning_rate": 1.685021959778883e-05, + "loss": 1.4457, + "step": 22845 + }, + { + "epoch": 0.818163912117034, + "grad_norm": 1.9075568914413452, + "learning_rate": 1.6843776580231586e-05, + "loss": 1.2082, + "step": 22846 + }, + { + "epoch": 0.8181997242466024, + "grad_norm": 1.3768006563186646, + "learning_rate": 1.6837334681434037e-05, + "loss": 1.3797, + "step": 22847 + }, + { + "epoch": 0.8182355363761706, + "grad_norm": 2.21150541305542, + "learning_rate": 1.683089390148287e-05, + "loss": 1.6664, + "step": 22848 + }, + { + "epoch": 0.8182713485057389, + "grad_norm": 2.0442562103271484, + "learning_rate": 1.6824454240464748e-05, + "loss": 1.4715, + "step": 22849 + }, + { + "epoch": 0.8183071606353072, + "grad_norm": 1.8075312376022339, + "learning_rate": 1.6818015698466338e-05, + "loss": 1.2415, + "step": 22850 + }, + { + "epoch": 0.8183429727648754, + "grad_norm": 1.6639543771743774, + "learning_rate": 1.681157827557418e-05, + "loss": 1.3192, + "step": 22851 + }, + { + "epoch": 0.8183787848944437, + "grad_norm": 1.579306960105896, + "learning_rate": 1.680514197187497e-05, + "loss": 1.5554, + "step": 22852 + }, + { + "epoch": 0.818414597024012, + "grad_norm": 1.3775279521942139, + "learning_rate": 1.6798706787455264e-05, + "loss": 1.4963, + "step": 22853 + }, + { + "epoch": 0.8184504091535804, + "grad_norm": 2.1027517318725586, + "learning_rate": 1.6792272722401626e-05, + "loss": 1.2141, + "step": 22854 + }, + { + "epoch": 0.8184862212831486, + "grad_norm": 1.735244870185852, + "learning_rate": 1.6785839776800615e-05, + "loss": 1.4396, + "step": 22855 + }, + { + "epoch": 0.8185220334127169, + "grad_norm": 1.4876716136932373, + "learning_rate": 1.677940795073879e-05, + "loss": 1.7742, + "step": 22856 + }, + { + "epoch": 0.8185578455422852, + "grad_norm": 1.8125195503234863, + "learning_rate": 1.6772977244302714e-05, + "loss": 1.6015, + "step": 22857 + }, + { + "epoch": 0.8185936576718534, + "grad_norm": 1.8210275173187256, + "learning_rate": 1.6766547657578844e-05, + "loss": 1.4319, + "step": 22858 + }, + { + "epoch": 0.8186294698014217, + "grad_norm": 1.593743085861206, + "learning_rate": 1.6760119190653724e-05, + "loss": 1.2788, + "step": 22859 + }, + { + "epoch": 0.81866528193099, + "grad_norm": 1.4638431072235107, + "learning_rate": 1.6753691843613818e-05, + "loss": 1.3581, + "step": 22860 + }, + { + "epoch": 0.8187010940605584, + "grad_norm": 1.6058062314987183, + "learning_rate": 1.6747265616545625e-05, + "loss": 1.0587, + "step": 22861 + }, + { + "epoch": 0.8187369061901266, + "grad_norm": 1.6514182090759277, + "learning_rate": 1.674084050953557e-05, + "loss": 1.288, + "step": 22862 + }, + { + "epoch": 0.8187727183196949, + "grad_norm": 1.6830849647521973, + "learning_rate": 1.6734416522670114e-05, + "loss": 1.466, + "step": 22863 + }, + { + "epoch": 0.8188085304492632, + "grad_norm": 1.5081751346588135, + "learning_rate": 1.6727993656035702e-05, + "loss": 1.1591, + "step": 22864 + }, + { + "epoch": 0.8188443425788314, + "grad_norm": 1.801693081855774, + "learning_rate": 1.672157190971869e-05, + "loss": 1.4321, + "step": 22865 + }, + { + "epoch": 0.8188801547083997, + "grad_norm": 1.6656297445297241, + "learning_rate": 1.671515128380551e-05, + "loss": 1.4196, + "step": 22866 + }, + { + "epoch": 0.818915966837968, + "grad_norm": 1.7084009647369385, + "learning_rate": 1.6708731778382546e-05, + "loss": 1.4801, + "step": 22867 + }, + { + "epoch": 0.8189517789675363, + "grad_norm": 1.5573643445968628, + "learning_rate": 1.6702313393536173e-05, + "loss": 1.4912, + "step": 22868 + }, + { + "epoch": 0.8189875910971046, + "grad_norm": 1.7359768152236938, + "learning_rate": 1.6695896129352705e-05, + "loss": 1.6653, + "step": 22869 + }, + { + "epoch": 0.8190234032266729, + "grad_norm": 1.4117817878723145, + "learning_rate": 1.66894799859185e-05, + "loss": 1.2851, + "step": 22870 + }, + { + "epoch": 0.8190592153562412, + "grad_norm": 1.3178173303604126, + "learning_rate": 1.6683064963319906e-05, + "loss": 0.9814, + "step": 22871 + }, + { + "epoch": 0.8190950274858094, + "grad_norm": 1.5659393072128296, + "learning_rate": 1.6676651061643177e-05, + "loss": 1.4266, + "step": 22872 + }, + { + "epoch": 0.8191308396153777, + "grad_norm": 1.5842785835266113, + "learning_rate": 1.6670238280974627e-05, + "loss": 1.5485, + "step": 22873 + }, + { + "epoch": 0.819166651744946, + "grad_norm": 1.5451146364212036, + "learning_rate": 1.6663826621400537e-05, + "loss": 1.4251, + "step": 22874 + }, + { + "epoch": 0.8192024638745143, + "grad_norm": 1.8988233804702759, + "learning_rate": 1.6657416083007184e-05, + "loss": 1.5944, + "step": 22875 + }, + { + "epoch": 0.8192382760040826, + "grad_norm": 1.5368101596832275, + "learning_rate": 1.6651006665880776e-05, + "loss": 1.3794, + "step": 22876 + }, + { + "epoch": 0.8192740881336509, + "grad_norm": 3.667703628540039, + "learning_rate": 1.6644598370107554e-05, + "loss": 1.618, + "step": 22877 + }, + { + "epoch": 0.8193099002632191, + "grad_norm": 1.5384092330932617, + "learning_rate": 1.6638191195773744e-05, + "loss": 1.3134, + "step": 22878 + }, + { + "epoch": 0.8193457123927874, + "grad_norm": 1.6481189727783203, + "learning_rate": 1.6631785142965563e-05, + "loss": 1.5524, + "step": 22879 + }, + { + "epoch": 0.8193815245223557, + "grad_norm": 1.4624632596969604, + "learning_rate": 1.6625380211769147e-05, + "loss": 1.3622, + "step": 22880 + }, + { + "epoch": 0.819417336651924, + "grad_norm": 1.7683665752410889, + "learning_rate": 1.6618976402270704e-05, + "loss": 1.4516, + "step": 22881 + }, + { + "epoch": 0.8194531487814923, + "grad_norm": 1.6138049364089966, + "learning_rate": 1.66125737145564e-05, + "loss": 1.5001, + "step": 22882 + }, + { + "epoch": 0.8194889609110606, + "grad_norm": 1.509700059890747, + "learning_rate": 1.6606172148712328e-05, + "loss": 1.4476, + "step": 22883 + }, + { + "epoch": 0.8195247730406289, + "grad_norm": 1.956424593925476, + "learning_rate": 1.659977170482464e-05, + "loss": 1.2311, + "step": 22884 + }, + { + "epoch": 0.8195605851701971, + "grad_norm": 1.6657973527908325, + "learning_rate": 1.6593372382979455e-05, + "loss": 1.3247, + "step": 22885 + }, + { + "epoch": 0.8195963972997654, + "grad_norm": 1.5629335641860962, + "learning_rate": 1.658697418326287e-05, + "loss": 1.169, + "step": 22886 + }, + { + "epoch": 0.8196322094293337, + "grad_norm": 1.9260749816894531, + "learning_rate": 1.658057710576093e-05, + "loss": 1.5238, + "step": 22887 + }, + { + "epoch": 0.819668021558902, + "grad_norm": 1.5545670986175537, + "learning_rate": 1.657418115055973e-05, + "loss": 1.4566, + "step": 22888 + }, + { + "epoch": 0.8197038336884703, + "grad_norm": 2.068680763244629, + "learning_rate": 1.6567786317745327e-05, + "loss": 1.4548, + "step": 22889 + }, + { + "epoch": 0.8197396458180386, + "grad_norm": 1.982734203338623, + "learning_rate": 1.6561392607403713e-05, + "loss": 1.3612, + "step": 22890 + }, + { + "epoch": 0.8197754579476069, + "grad_norm": 1.426444172859192, + "learning_rate": 1.655500001962095e-05, + "loss": 1.4532, + "step": 22891 + }, + { + "epoch": 0.8198112700771751, + "grad_norm": 1.4528121948242188, + "learning_rate": 1.6548608554483e-05, + "loss": 1.4581, + "step": 22892 + }, + { + "epoch": 0.8198470822067434, + "grad_norm": 1.5887699127197266, + "learning_rate": 1.6542218212075923e-05, + "loss": 1.633, + "step": 22893 + }, + { + "epoch": 0.8198828943363117, + "grad_norm": 1.5253174304962158, + "learning_rate": 1.6535828992485613e-05, + "loss": 1.286, + "step": 22894 + }, + { + "epoch": 0.8199187064658799, + "grad_norm": 1.6747887134552002, + "learning_rate": 1.6529440895798065e-05, + "loss": 1.3259, + "step": 22895 + }, + { + "epoch": 0.8199545185954483, + "grad_norm": 1.4363501071929932, + "learning_rate": 1.6523053922099242e-05, + "loss": 1.3707, + "step": 22896 + }, + { + "epoch": 0.8199903307250166, + "grad_norm": 1.5821083784103394, + "learning_rate": 1.651666807147503e-05, + "loss": 1.4022, + "step": 22897 + }, + { + "epoch": 0.8200261428545849, + "grad_norm": 1.7014988660812378, + "learning_rate": 1.651028334401137e-05, + "loss": 1.6497, + "step": 22898 + }, + { + "epoch": 0.8200619549841531, + "grad_norm": 1.6083884239196777, + "learning_rate": 1.6503899739794138e-05, + "loss": 1.8573, + "step": 22899 + }, + { + "epoch": 0.8200977671137214, + "grad_norm": 1.4160634279251099, + "learning_rate": 1.6497517258909267e-05, + "loss": 1.4247, + "step": 22900 + }, + { + "epoch": 0.8201335792432897, + "grad_norm": 1.8023004531860352, + "learning_rate": 1.6491135901442567e-05, + "loss": 1.4982, + "step": 22901 + }, + { + "epoch": 0.8201693913728579, + "grad_norm": 1.3858096599578857, + "learning_rate": 1.648475566747991e-05, + "loss": 1.4687, + "step": 22902 + }, + { + "epoch": 0.8202052035024263, + "grad_norm": 3.3740923404693604, + "learning_rate": 1.6478376557107145e-05, + "loss": 1.4911, + "step": 22903 + }, + { + "epoch": 0.8202410156319946, + "grad_norm": 1.7053208351135254, + "learning_rate": 1.647199857041011e-05, + "loss": 1.6013, + "step": 22904 + }, + { + "epoch": 0.8202768277615629, + "grad_norm": 1.5999023914337158, + "learning_rate": 1.6465621707474587e-05, + "loss": 1.5928, + "step": 22905 + }, + { + "epoch": 0.8203126398911311, + "grad_norm": 2.348151206970215, + "learning_rate": 1.6459245968386327e-05, + "loss": 1.6773, + "step": 22906 + }, + { + "epoch": 0.8203484520206994, + "grad_norm": 1.8074836730957031, + "learning_rate": 1.64528713532312e-05, + "loss": 1.5876, + "step": 22907 + }, + { + "epoch": 0.8203842641502677, + "grad_norm": 1.8491315841674805, + "learning_rate": 1.64464978620949e-05, + "loss": 1.4896, + "step": 22908 + }, + { + "epoch": 0.8204200762798359, + "grad_norm": 1.720956563949585, + "learning_rate": 1.6440125495063185e-05, + "loss": 1.3682, + "step": 22909 + }, + { + "epoch": 0.8204558884094043, + "grad_norm": 1.3919970989227295, + "learning_rate": 1.643375425222181e-05, + "loss": 1.6958, + "step": 22910 + }, + { + "epoch": 0.8204917005389726, + "grad_norm": 1.6565746068954468, + "learning_rate": 1.6427384133656498e-05, + "loss": 1.1266, + "step": 22911 + }, + { + "epoch": 0.8205275126685408, + "grad_norm": 1.7313103675842285, + "learning_rate": 1.64210151394529e-05, + "loss": 1.4588, + "step": 22912 + }, + { + "epoch": 0.8205633247981091, + "grad_norm": 1.5546603202819824, + "learning_rate": 1.641464726969675e-05, + "loss": 1.3061, + "step": 22913 + }, + { + "epoch": 0.8205991369276774, + "grad_norm": 1.9309124946594238, + "learning_rate": 1.6408280524473706e-05, + "loss": 1.6808, + "step": 22914 + }, + { + "epoch": 0.8206349490572457, + "grad_norm": 1.3096654415130615, + "learning_rate": 1.640191490386942e-05, + "loss": 1.4659, + "step": 22915 + }, + { + "epoch": 0.8206707611868139, + "grad_norm": 1.6897329092025757, + "learning_rate": 1.6395550407969552e-05, + "loss": 1.2159, + "step": 22916 + }, + { + "epoch": 0.8207065733163823, + "grad_norm": 1.5496948957443237, + "learning_rate": 1.6389187036859655e-05, + "loss": 1.4833, + "step": 22917 + }, + { + "epoch": 0.8207423854459506, + "grad_norm": 2.1769845485687256, + "learning_rate": 1.638282479062545e-05, + "loss": 1.7208, + "step": 22918 + }, + { + "epoch": 0.8207781975755188, + "grad_norm": 1.4986789226531982, + "learning_rate": 1.637646366935246e-05, + "loss": 1.3027, + "step": 22919 + }, + { + "epoch": 0.8208140097050871, + "grad_norm": 1.4664111137390137, + "learning_rate": 1.6370103673126267e-05, + "loss": 1.5531, + "step": 22920 + }, + { + "epoch": 0.8208498218346554, + "grad_norm": 1.5579311847686768, + "learning_rate": 1.6363744802032476e-05, + "loss": 1.5757, + "step": 22921 + }, + { + "epoch": 0.8208856339642236, + "grad_norm": 1.549301266670227, + "learning_rate": 1.6357387056156626e-05, + "loss": 1.5303, + "step": 22922 + }, + { + "epoch": 0.8209214460937919, + "grad_norm": 1.373755931854248, + "learning_rate": 1.6351030435584245e-05, + "loss": 1.5735, + "step": 22923 + }, + { + "epoch": 0.8209572582233603, + "grad_norm": 1.8269177675247192, + "learning_rate": 1.6344674940400805e-05, + "loss": 1.607, + "step": 22924 + }, + { + "epoch": 0.8209930703529286, + "grad_norm": 1.6167367696762085, + "learning_rate": 1.633832057069191e-05, + "loss": 1.2946, + "step": 22925 + }, + { + "epoch": 0.8210288824824968, + "grad_norm": 1.9242609739303589, + "learning_rate": 1.6331967326542963e-05, + "loss": 1.5833, + "step": 22926 + }, + { + "epoch": 0.8210646946120651, + "grad_norm": 1.805993676185608, + "learning_rate": 1.63256152080395e-05, + "loss": 1.2707, + "step": 22927 + }, + { + "epoch": 0.8211005067416334, + "grad_norm": 1.4096330404281616, + "learning_rate": 1.6319264215266894e-05, + "loss": 1.2395, + "step": 22928 + }, + { + "epoch": 0.8211363188712016, + "grad_norm": 1.5715415477752686, + "learning_rate": 1.6312914348310704e-05, + "loss": 1.3947, + "step": 22929 + }, + { + "epoch": 0.8211721310007699, + "grad_norm": 1.2842967510223389, + "learning_rate": 1.6306565607256285e-05, + "loss": 1.0019, + "step": 22930 + }, + { + "epoch": 0.8212079431303383, + "grad_norm": 1.9133511781692505, + "learning_rate": 1.6300217992189082e-05, + "loss": 1.8043, + "step": 22931 + }, + { + "epoch": 0.8212437552599066, + "grad_norm": 1.7129391431808472, + "learning_rate": 1.6293871503194458e-05, + "loss": 1.3144, + "step": 22932 + }, + { + "epoch": 0.8212795673894748, + "grad_norm": 1.822322130203247, + "learning_rate": 1.6287526140357822e-05, + "loss": 1.4964, + "step": 22933 + }, + { + "epoch": 0.8213153795190431, + "grad_norm": 1.658325433731079, + "learning_rate": 1.6281181903764565e-05, + "loss": 1.5703, + "step": 22934 + }, + { + "epoch": 0.8213511916486114, + "grad_norm": 1.6791235208511353, + "learning_rate": 1.627483879349997e-05, + "loss": 1.4307, + "step": 22935 + }, + { + "epoch": 0.8213870037781796, + "grad_norm": 1.326132893562317, + "learning_rate": 1.626849680964947e-05, + "loss": 1.3838, + "step": 22936 + }, + { + "epoch": 0.8214228159077479, + "grad_norm": 1.37123441696167, + "learning_rate": 1.6262155952298307e-05, + "loss": 1.1731, + "step": 22937 + }, + { + "epoch": 0.8214586280373163, + "grad_norm": 1.4056603908538818, + "learning_rate": 1.625581622153186e-05, + "loss": 1.4067, + "step": 22938 + }, + { + "epoch": 0.8214944401668846, + "grad_norm": 1.6344373226165771, + "learning_rate": 1.6249477617435327e-05, + "loss": 1.4917, + "step": 22939 + }, + { + "epoch": 0.8215302522964528, + "grad_norm": 1.7621803283691406, + "learning_rate": 1.6243140140094093e-05, + "loss": 1.5295, + "step": 22940 + }, + { + "epoch": 0.8215660644260211, + "grad_norm": 1.3223881721496582, + "learning_rate": 1.6236803789593368e-05, + "loss": 1.452, + "step": 22941 + }, + { + "epoch": 0.8216018765555894, + "grad_norm": 1.669700026512146, + "learning_rate": 1.6230468566018375e-05, + "loss": 1.4759, + "step": 22942 + }, + { + "epoch": 0.8216376886851576, + "grad_norm": 1.5705947875976562, + "learning_rate": 1.6224134469454366e-05, + "loss": 1.4296, + "step": 22943 + }, + { + "epoch": 0.8216735008147259, + "grad_norm": 1.8033430576324463, + "learning_rate": 1.6217801499986573e-05, + "loss": 1.5341, + "step": 22944 + }, + { + "epoch": 0.8217093129442943, + "grad_norm": 1.3500990867614746, + "learning_rate": 1.6211469657700217e-05, + "loss": 1.4331, + "step": 22945 + }, + { + "epoch": 0.8217451250738625, + "grad_norm": 1.8312416076660156, + "learning_rate": 1.6205138942680408e-05, + "loss": 1.6042, + "step": 22946 + }, + { + "epoch": 0.8217809372034308, + "grad_norm": 2.076237201690674, + "learning_rate": 1.6198809355012412e-05, + "loss": 1.4416, + "step": 22947 + }, + { + "epoch": 0.8218167493329991, + "grad_norm": 2.3092427253723145, + "learning_rate": 1.6192480894781316e-05, + "loss": 1.4371, + "step": 22948 + }, + { + "epoch": 0.8218525614625674, + "grad_norm": 1.5285634994506836, + "learning_rate": 1.6186153562072316e-05, + "loss": 1.6155, + "step": 22949 + }, + { + "epoch": 0.8218883735921356, + "grad_norm": 1.3765846490859985, + "learning_rate": 1.617982735697048e-05, + "loss": 1.0938, + "step": 22950 + }, + { + "epoch": 0.8219241857217039, + "grad_norm": 1.4085685014724731, + "learning_rate": 1.6173502279560936e-05, + "loss": 1.4514, + "step": 22951 + }, + { + "epoch": 0.8219599978512723, + "grad_norm": 1.2525451183319092, + "learning_rate": 1.6167178329928823e-05, + "loss": 1.2433, + "step": 22952 + }, + { + "epoch": 0.8219958099808405, + "grad_norm": 1.4547854661941528, + "learning_rate": 1.6160855508159168e-05, + "loss": 1.2984, + "step": 22953 + }, + { + "epoch": 0.8220316221104088, + "grad_norm": 1.432128667831421, + "learning_rate": 1.6154533814337058e-05, + "loss": 1.5026, + "step": 22954 + }, + { + "epoch": 0.8220674342399771, + "grad_norm": 1.9972702264785767, + "learning_rate": 1.614821324854754e-05, + "loss": 1.4845, + "step": 22955 + }, + { + "epoch": 0.8221032463695453, + "grad_norm": 1.6977585554122925, + "learning_rate": 1.6141893810875675e-05, + "loss": 1.6666, + "step": 22956 + }, + { + "epoch": 0.8221390584991136, + "grad_norm": 1.9989867210388184, + "learning_rate": 1.6135575501406432e-05, + "loss": 1.7976, + "step": 22957 + }, + { + "epoch": 0.8221748706286819, + "grad_norm": 1.8499654531478882, + "learning_rate": 1.6129258320224848e-05, + "loss": 1.3219, + "step": 22958 + }, + { + "epoch": 0.8222106827582503, + "grad_norm": 2.332317352294922, + "learning_rate": 1.612294226741593e-05, + "loss": 1.4394, + "step": 22959 + }, + { + "epoch": 0.8222464948878185, + "grad_norm": 2.4020214080810547, + "learning_rate": 1.6116627343064605e-05, + "loss": 1.7889, + "step": 22960 + }, + { + "epoch": 0.8222823070173868, + "grad_norm": 2.036348342895508, + "learning_rate": 1.611031354725586e-05, + "loss": 1.3781, + "step": 22961 + }, + { + "epoch": 0.8223181191469551, + "grad_norm": 2.3071060180664062, + "learning_rate": 1.6104000880074642e-05, + "loss": 1.5629, + "step": 22962 + }, + { + "epoch": 0.8223539312765233, + "grad_norm": 1.9471147060394287, + "learning_rate": 1.6097689341605894e-05, + "loss": 1.464, + "step": 22963 + }, + { + "epoch": 0.8223897434060916, + "grad_norm": 1.335502028465271, + "learning_rate": 1.6091378931934474e-05, + "loss": 1.3216, + "step": 22964 + }, + { + "epoch": 0.8224255555356599, + "grad_norm": 1.5332695245742798, + "learning_rate": 1.6085069651145334e-05, + "loss": 1.3925, + "step": 22965 + }, + { + "epoch": 0.8224613676652283, + "grad_norm": 1.4483823776245117, + "learning_rate": 1.6078761499323326e-05, + "loss": 1.5104, + "step": 22966 + }, + { + "epoch": 0.8224971797947965, + "grad_norm": 1.384883999824524, + "learning_rate": 1.6072454476553357e-05, + "loss": 1.522, + "step": 22967 + }, + { + "epoch": 0.8225329919243648, + "grad_norm": 1.8154127597808838, + "learning_rate": 1.6066148582920237e-05, + "loss": 1.4938, + "step": 22968 + }, + { + "epoch": 0.8225688040539331, + "grad_norm": 1.559505581855774, + "learning_rate": 1.6059843818508814e-05, + "loss": 1.2616, + "step": 22969 + }, + { + "epoch": 0.8226046161835013, + "grad_norm": 1.9433817863464355, + "learning_rate": 1.605354018340395e-05, + "loss": 1.1542, + "step": 22970 + }, + { + "epoch": 0.8226404283130696, + "grad_norm": 1.6815561056137085, + "learning_rate": 1.6047237677690386e-05, + "loss": 1.3772, + "step": 22971 + }, + { + "epoch": 0.8226762404426379, + "grad_norm": 1.572665810585022, + "learning_rate": 1.6040936301452957e-05, + "loss": 1.7138, + "step": 22972 + }, + { + "epoch": 0.8227120525722063, + "grad_norm": 1.9919036626815796, + "learning_rate": 1.603463605477643e-05, + "loss": 1.4052, + "step": 22973 + }, + { + "epoch": 0.8227478647017745, + "grad_norm": 1.5342936515808105, + "learning_rate": 1.602833693774558e-05, + "loss": 1.0727, + "step": 22974 + }, + { + "epoch": 0.8227836768313428, + "grad_norm": 1.8794121742248535, + "learning_rate": 1.6022038950445127e-05, + "loss": 1.6559, + "step": 22975 + }, + { + "epoch": 0.8228194889609111, + "grad_norm": 1.6604645252227783, + "learning_rate": 1.6015742092959818e-05, + "loss": 1.4897, + "step": 22976 + }, + { + "epoch": 0.8228553010904793, + "grad_norm": 1.3487586975097656, + "learning_rate": 1.6009446365374383e-05, + "loss": 1.3302, + "step": 22977 + }, + { + "epoch": 0.8228911132200476, + "grad_norm": 1.6160697937011719, + "learning_rate": 1.6003151767773485e-05, + "loss": 1.1905, + "step": 22978 + }, + { + "epoch": 0.8229269253496159, + "grad_norm": 2.0315003395080566, + "learning_rate": 1.5996858300241834e-05, + "loss": 1.4471, + "step": 22979 + }, + { + "epoch": 0.8229627374791842, + "grad_norm": 1.3102805614471436, + "learning_rate": 1.5990565962864103e-05, + "loss": 1.4466, + "step": 22980 + }, + { + "epoch": 0.8229985496087525, + "grad_norm": 1.7477620840072632, + "learning_rate": 1.5984274755724958e-05, + "loss": 1.5908, + "step": 22981 + }, + { + "epoch": 0.8230343617383208, + "grad_norm": 1.6393771171569824, + "learning_rate": 1.5977984678909008e-05, + "loss": 1.3496, + "step": 22982 + }, + { + "epoch": 0.823070173867889, + "grad_norm": 2.298346519470215, + "learning_rate": 1.597169573250089e-05, + "loss": 1.623, + "step": 22983 + }, + { + "epoch": 0.8231059859974573, + "grad_norm": 1.93317711353302, + "learning_rate": 1.5965407916585208e-05, + "loss": 1.3362, + "step": 22984 + }, + { + "epoch": 0.8231417981270256, + "grad_norm": 1.3797252178192139, + "learning_rate": 1.59591212312466e-05, + "loss": 1.1094, + "step": 22985 + }, + { + "epoch": 0.8231776102565939, + "grad_norm": 1.945043921470642, + "learning_rate": 1.595283567656959e-05, + "loss": 1.5189, + "step": 22986 + }, + { + "epoch": 0.8232134223861622, + "grad_norm": 1.6652638912200928, + "learning_rate": 1.5946551252638754e-05, + "loss": 1.728, + "step": 22987 + }, + { + "epoch": 0.8232492345157305, + "grad_norm": 1.7131400108337402, + "learning_rate": 1.594026795953868e-05, + "loss": 1.1332, + "step": 22988 + }, + { + "epoch": 0.8232850466452988, + "grad_norm": 1.8453240394592285, + "learning_rate": 1.5933985797353844e-05, + "loss": 1.5197, + "step": 22989 + }, + { + "epoch": 0.823320858774867, + "grad_norm": 2.4557993412017822, + "learning_rate": 1.5927704766168793e-05, + "loss": 1.5135, + "step": 22990 + }, + { + "epoch": 0.8233566709044353, + "grad_norm": 1.5093895196914673, + "learning_rate": 1.5921424866068026e-05, + "loss": 1.7071, + "step": 22991 + }, + { + "epoch": 0.8233924830340036, + "grad_norm": 1.661433219909668, + "learning_rate": 1.5915146097136056e-05, + "loss": 1.4739, + "step": 22992 + }, + { + "epoch": 0.8234282951635719, + "grad_norm": 1.4338335990905762, + "learning_rate": 1.5908868459457317e-05, + "loss": 1.4, + "step": 22993 + }, + { + "epoch": 0.8234641072931402, + "grad_norm": 1.3473552465438843, + "learning_rate": 1.5902591953116287e-05, + "loss": 0.9404, + "step": 22994 + }, + { + "epoch": 0.8234999194227085, + "grad_norm": 1.7865482568740845, + "learning_rate": 1.589631657819741e-05, + "loss": 1.5638, + "step": 22995 + }, + { + "epoch": 0.8235357315522768, + "grad_norm": 1.594167709350586, + "learning_rate": 1.5890042334785104e-05, + "loss": 1.2829, + "step": 22996 + }, + { + "epoch": 0.823571543681845, + "grad_norm": 1.8517452478408813, + "learning_rate": 1.5883769222963775e-05, + "loss": 1.4047, + "step": 22997 + }, + { + "epoch": 0.8236073558114133, + "grad_norm": 1.841001033782959, + "learning_rate": 1.587749724281783e-05, + "loss": 1.166, + "step": 22998 + }, + { + "epoch": 0.8236431679409816, + "grad_norm": 1.69795823097229, + "learning_rate": 1.5871226394431672e-05, + "loss": 1.6688, + "step": 22999 + }, + { + "epoch": 0.8236789800705498, + "grad_norm": 1.5626368522644043, + "learning_rate": 1.586495667788962e-05, + "loss": 1.3707, + "step": 23000 + }, + { + "epoch": 0.8237147922001182, + "grad_norm": 1.3981742858886719, + "learning_rate": 1.5858688093276042e-05, + "loss": 1.0959, + "step": 23001 + }, + { + "epoch": 0.8237506043296865, + "grad_norm": 2.0931923389434814, + "learning_rate": 1.5852420640675313e-05, + "loss": 1.35, + "step": 23002 + }, + { + "epoch": 0.8237864164592548, + "grad_norm": 2.05471134185791, + "learning_rate": 1.5846154320171703e-05, + "loss": 1.7002, + "step": 23003 + }, + { + "epoch": 0.823822228588823, + "grad_norm": 1.5984715223312378, + "learning_rate": 1.583988913184953e-05, + "loss": 1.4799, + "step": 23004 + }, + { + "epoch": 0.8238580407183913, + "grad_norm": 1.48750638961792, + "learning_rate": 1.583362507579309e-05, + "loss": 1.209, + "step": 23005 + }, + { + "epoch": 0.8238938528479596, + "grad_norm": 1.6805229187011719, + "learning_rate": 1.582736215208669e-05, + "loss": 1.3504, + "step": 23006 + }, + { + "epoch": 0.8239296649775278, + "grad_norm": 1.4237467050552368, + "learning_rate": 1.582110036081452e-05, + "loss": 1.0545, + "step": 23007 + }, + { + "epoch": 0.8239654771070962, + "grad_norm": 1.6374319791793823, + "learning_rate": 1.581483970206087e-05, + "loss": 1.4251, + "step": 23008 + }, + { + "epoch": 0.8240012892366645, + "grad_norm": 1.6855441331863403, + "learning_rate": 1.580858017590996e-05, + "loss": 1.3527, + "step": 23009 + }, + { + "epoch": 0.8240371013662328, + "grad_norm": 1.4074074029922485, + "learning_rate": 1.5802321782446028e-05, + "loss": 1.3838, + "step": 23010 + }, + { + "epoch": 0.824072913495801, + "grad_norm": 1.5578721761703491, + "learning_rate": 1.5796064521753252e-05, + "loss": 1.4217, + "step": 23011 + }, + { + "epoch": 0.8241087256253693, + "grad_norm": 1.4159759283065796, + "learning_rate": 1.5789808393915763e-05, + "loss": 1.5785, + "step": 23012 + }, + { + "epoch": 0.8241445377549376, + "grad_norm": 1.645815372467041, + "learning_rate": 1.5783553399017825e-05, + "loss": 1.3073, + "step": 23013 + }, + { + "epoch": 0.8241803498845058, + "grad_norm": 1.8680726289749146, + "learning_rate": 1.577729953714352e-05, + "loss": 1.4468, + "step": 23014 + }, + { + "epoch": 0.8242161620140742, + "grad_norm": 1.9843326807022095, + "learning_rate": 1.577104680837703e-05, + "loss": 1.527, + "step": 23015 + }, + { + "epoch": 0.8242519741436425, + "grad_norm": 1.7631583213806152, + "learning_rate": 1.576479521280242e-05, + "loss": 1.6603, + "step": 23016 + }, + { + "epoch": 0.8242877862732108, + "grad_norm": 1.3905339241027832, + "learning_rate": 1.575854475050388e-05, + "loss": 1.0263, + "step": 23017 + }, + { + "epoch": 0.824323598402779, + "grad_norm": 1.588399887084961, + "learning_rate": 1.5752295421565423e-05, + "loss": 1.5257, + "step": 23018 + }, + { + "epoch": 0.8243594105323473, + "grad_norm": 2.075728178024292, + "learning_rate": 1.574604722607117e-05, + "loss": 1.4954, + "step": 23019 + }, + { + "epoch": 0.8243952226619156, + "grad_norm": 1.729669213294983, + "learning_rate": 1.573980016410519e-05, + "loss": 1.5427, + "step": 23020 + }, + { + "epoch": 0.8244310347914838, + "grad_norm": 1.457468867301941, + "learning_rate": 1.573355423575149e-05, + "loss": 1.5744, + "step": 23021 + }, + { + "epoch": 0.8244668469210522, + "grad_norm": 2.133328676223755, + "learning_rate": 1.572730944109415e-05, + "loss": 1.3736, + "step": 23022 + }, + { + "epoch": 0.8245026590506205, + "grad_norm": 1.4043382406234741, + "learning_rate": 1.5721065780217103e-05, + "loss": 1.4783, + "step": 23023 + }, + { + "epoch": 0.8245384711801887, + "grad_norm": 2.212798833847046, + "learning_rate": 1.5714823253204447e-05, + "loss": 1.6248, + "step": 23024 + }, + { + "epoch": 0.824574283309757, + "grad_norm": 1.4132270812988281, + "learning_rate": 1.5708581860140113e-05, + "loss": 1.2569, + "step": 23025 + }, + { + "epoch": 0.8246100954393253, + "grad_norm": 1.8781002759933472, + "learning_rate": 1.5702341601108094e-05, + "loss": 1.5612, + "step": 23026 + }, + { + "epoch": 0.8246459075688936, + "grad_norm": 1.9271550178527832, + "learning_rate": 1.56961024761923e-05, + "loss": 1.4968, + "step": 23027 + }, + { + "epoch": 0.8246817196984618, + "grad_norm": 1.5880156755447388, + "learning_rate": 1.5689864485476736e-05, + "loss": 1.3802, + "step": 23028 + }, + { + "epoch": 0.8247175318280302, + "grad_norm": 1.7311402559280396, + "learning_rate": 1.5683627629045295e-05, + "loss": 1.5533, + "step": 23029 + }, + { + "epoch": 0.8247533439575985, + "grad_norm": 1.6801574230194092, + "learning_rate": 1.5677391906981842e-05, + "loss": 1.268, + "step": 23030 + }, + { + "epoch": 0.8247891560871667, + "grad_norm": 1.9536669254302979, + "learning_rate": 1.5671157319370357e-05, + "loss": 1.4332, + "step": 23031 + }, + { + "epoch": 0.824824968216735, + "grad_norm": 1.3697175979614258, + "learning_rate": 1.5664923866294655e-05, + "loss": 1.2412, + "step": 23032 + }, + { + "epoch": 0.8248607803463033, + "grad_norm": 1.6064835786819458, + "learning_rate": 1.565869154783863e-05, + "loss": 1.5359, + "step": 23033 + }, + { + "epoch": 0.8248965924758715, + "grad_norm": 1.9790090322494507, + "learning_rate": 1.5652460364086084e-05, + "loss": 1.4676, + "step": 23034 + }, + { + "epoch": 0.8249324046054398, + "grad_norm": 1.8181332349777222, + "learning_rate": 1.5646230315120923e-05, + "loss": 1.3089, + "step": 23035 + }, + { + "epoch": 0.8249682167350082, + "grad_norm": 1.756899356842041, + "learning_rate": 1.5640001401026904e-05, + "loss": 1.4869, + "step": 23036 + }, + { + "epoch": 0.8250040288645765, + "grad_norm": 1.8076976537704468, + "learning_rate": 1.5633773621887872e-05, + "loss": 1.1611, + "step": 23037 + }, + { + "epoch": 0.8250398409941447, + "grad_norm": 1.5319172143936157, + "learning_rate": 1.5627546977787565e-05, + "loss": 1.4351, + "step": 23038 + }, + { + "epoch": 0.825075653123713, + "grad_norm": 1.5525784492492676, + "learning_rate": 1.5621321468809778e-05, + "loss": 1.4875, + "step": 23039 + }, + { + "epoch": 0.8251114652532813, + "grad_norm": 2.0940845012664795, + "learning_rate": 1.56150970950383e-05, + "loss": 1.1187, + "step": 23040 + }, + { + "epoch": 0.8251472773828495, + "grad_norm": 1.537346601486206, + "learning_rate": 1.5608873856556828e-05, + "loss": 1.3419, + "step": 23041 + }, + { + "epoch": 0.8251830895124178, + "grad_norm": 1.681191325187683, + "learning_rate": 1.5602651753449083e-05, + "loss": 1.2589, + "step": 23042 + }, + { + "epoch": 0.8252189016419862, + "grad_norm": 1.7197188138961792, + "learning_rate": 1.5596430785798798e-05, + "loss": 1.5691, + "step": 23043 + }, + { + "epoch": 0.8252547137715545, + "grad_norm": 1.6822034120559692, + "learning_rate": 1.55902109536897e-05, + "loss": 1.2887, + "step": 23044 + }, + { + "epoch": 0.8252905259011227, + "grad_norm": 2.1965887546539307, + "learning_rate": 1.558399225720537e-05, + "loss": 1.1151, + "step": 23045 + }, + { + "epoch": 0.825326338030691, + "grad_norm": 1.4301897287368774, + "learning_rate": 1.5577774696429592e-05, + "loss": 1.5266, + "step": 23046 + }, + { + "epoch": 0.8253621501602593, + "grad_norm": 1.6066406965255737, + "learning_rate": 1.5571558271445952e-05, + "loss": 1.5269, + "step": 23047 + }, + { + "epoch": 0.8253979622898275, + "grad_norm": 1.7901389598846436, + "learning_rate": 1.556534298233807e-05, + "loss": 1.2883, + "step": 23048 + }, + { + "epoch": 0.8254337744193958, + "grad_norm": 1.3142569065093994, + "learning_rate": 1.5559128829189597e-05, + "loss": 1.6844, + "step": 23049 + }, + { + "epoch": 0.8254695865489642, + "grad_norm": 1.5939184427261353, + "learning_rate": 1.5552915812084113e-05, + "loss": 1.3867, + "step": 23050 + }, + { + "epoch": 0.8255053986785325, + "grad_norm": 2.42671275138855, + "learning_rate": 1.5546703931105233e-05, + "loss": 1.5441, + "step": 23051 + }, + { + "epoch": 0.8255412108081007, + "grad_norm": 2.1185123920440674, + "learning_rate": 1.5540493186336503e-05, + "loss": 1.5006, + "step": 23052 + }, + { + "epoch": 0.825577022937669, + "grad_norm": 1.4602375030517578, + "learning_rate": 1.5534283577861497e-05, + "loss": 1.3002, + "step": 23053 + }, + { + "epoch": 0.8256128350672373, + "grad_norm": 1.9092152118682861, + "learning_rate": 1.552807510576374e-05, + "loss": 1.5135, + "step": 23054 + }, + { + "epoch": 0.8256486471968055, + "grad_norm": 2.400681734085083, + "learning_rate": 1.5521867770126795e-05, + "loss": 1.466, + "step": 23055 + }, + { + "epoch": 0.8256844593263738, + "grad_norm": 1.5862021446228027, + "learning_rate": 1.5515661571034134e-05, + "loss": 1.2344, + "step": 23056 + }, + { + "epoch": 0.8257202714559422, + "grad_norm": 2.3739986419677734, + "learning_rate": 1.5509456508569275e-05, + "loss": 1.4641, + "step": 23057 + }, + { + "epoch": 0.8257560835855104, + "grad_norm": 1.941270351409912, + "learning_rate": 1.5503252582815707e-05, + "loss": 1.5392, + "step": 23058 + }, + { + "epoch": 0.8257918957150787, + "grad_norm": 1.816595435142517, + "learning_rate": 1.5497049793856868e-05, + "loss": 1.1223, + "step": 23059 + }, + { + "epoch": 0.825827707844647, + "grad_norm": 1.351844072341919, + "learning_rate": 1.5490848141776214e-05, + "loss": 1.1367, + "step": 23060 + }, + { + "epoch": 0.8258635199742153, + "grad_norm": 1.4327374696731567, + "learning_rate": 1.548464762665719e-05, + "loss": 1.3438, + "step": 23061 + }, + { + "epoch": 0.8258993321037835, + "grad_norm": 1.515933632850647, + "learning_rate": 1.5478448248583244e-05, + "loss": 1.2234, + "step": 23062 + }, + { + "epoch": 0.8259351442333518, + "grad_norm": 1.2966159582138062, + "learning_rate": 1.5472250007637724e-05, + "loss": 1.4408, + "step": 23063 + }, + { + "epoch": 0.8259709563629202, + "grad_norm": 1.9407875537872314, + "learning_rate": 1.546605290390405e-05, + "loss": 1.5882, + "step": 23064 + }, + { + "epoch": 0.8260067684924884, + "grad_norm": 2.346935749053955, + "learning_rate": 1.545985693746561e-05, + "loss": 1.293, + "step": 23065 + }, + { + "epoch": 0.8260425806220567, + "grad_norm": 1.5052082538604736, + "learning_rate": 1.545366210840573e-05, + "loss": 1.3619, + "step": 23066 + }, + { + "epoch": 0.826078392751625, + "grad_norm": 1.547210693359375, + "learning_rate": 1.5447468416807766e-05, + "loss": 1.5157, + "step": 23067 + }, + { + "epoch": 0.8261142048811932, + "grad_norm": 1.6457045078277588, + "learning_rate": 1.5441275862755043e-05, + "loss": 1.1963, + "step": 23068 + }, + { + "epoch": 0.8261500170107615, + "grad_norm": 1.628400206565857, + "learning_rate": 1.5435084446330917e-05, + "loss": 1.6531, + "step": 23069 + }, + { + "epoch": 0.8261858291403298, + "grad_norm": 1.6187325716018677, + "learning_rate": 1.5428894167618622e-05, + "loss": 1.6925, + "step": 23070 + }, + { + "epoch": 0.8262216412698982, + "grad_norm": 1.649355411529541, + "learning_rate": 1.5422705026701468e-05, + "loss": 1.3264, + "step": 23071 + }, + { + "epoch": 0.8262574533994664, + "grad_norm": 1.924761176109314, + "learning_rate": 1.5416517023662713e-05, + "loss": 1.6002, + "step": 23072 + }, + { + "epoch": 0.8262932655290347, + "grad_norm": 1.5182862281799316, + "learning_rate": 1.541033015858565e-05, + "loss": 1.3946, + "step": 23073 + }, + { + "epoch": 0.826329077658603, + "grad_norm": 1.9584239721298218, + "learning_rate": 1.540414443155345e-05, + "loss": 1.4373, + "step": 23074 + }, + { + "epoch": 0.8263648897881712, + "grad_norm": 1.620587706565857, + "learning_rate": 1.5397959842649367e-05, + "loss": 1.5348, + "step": 23075 + }, + { + "epoch": 0.8264007019177395, + "grad_norm": 1.8111913204193115, + "learning_rate": 1.5391776391956638e-05, + "loss": 1.8022, + "step": 23076 + }, + { + "epoch": 0.8264365140473078, + "grad_norm": 1.7288938760757446, + "learning_rate": 1.5385594079558387e-05, + "loss": 1.4836, + "step": 23077 + }, + { + "epoch": 0.8264723261768762, + "grad_norm": 1.5004984140396118, + "learning_rate": 1.5379412905537828e-05, + "loss": 1.4875, + "step": 23078 + }, + { + "epoch": 0.8265081383064444, + "grad_norm": 2.3542582988739014, + "learning_rate": 1.5373232869978116e-05, + "loss": 1.5091, + "step": 23079 + }, + { + "epoch": 0.8265439504360127, + "grad_norm": 1.4314318895339966, + "learning_rate": 1.5367053972962408e-05, + "loss": 1.3066, + "step": 23080 + }, + { + "epoch": 0.826579762565581, + "grad_norm": 1.7067185640335083, + "learning_rate": 1.5360876214573806e-05, + "loss": 1.3752, + "step": 23081 + }, + { + "epoch": 0.8266155746951492, + "grad_norm": 1.7711745500564575, + "learning_rate": 1.5354699594895438e-05, + "loss": 1.4336, + "step": 23082 + }, + { + "epoch": 0.8266513868247175, + "grad_norm": 2.183694839477539, + "learning_rate": 1.534852411401043e-05, + "loss": 1.2257, + "step": 23083 + }, + { + "epoch": 0.8266871989542858, + "grad_norm": 1.6529567241668701, + "learning_rate": 1.5342349772001808e-05, + "loss": 1.4126, + "step": 23084 + }, + { + "epoch": 0.8267230110838542, + "grad_norm": 2.0304243564605713, + "learning_rate": 1.5336176568952666e-05, + "loss": 1.3902, + "step": 23085 + }, + { + "epoch": 0.8267588232134224, + "grad_norm": 1.541917324066162, + "learning_rate": 1.5330004504946072e-05, + "loss": 1.6367, + "step": 23086 + }, + { + "epoch": 0.8267946353429907, + "grad_norm": 1.8716808557510376, + "learning_rate": 1.532383358006506e-05, + "loss": 1.4428, + "step": 23087 + }, + { + "epoch": 0.826830447472559, + "grad_norm": 1.713232398033142, + "learning_rate": 1.5317663794392634e-05, + "loss": 1.2549, + "step": 23088 + }, + { + "epoch": 0.8268662596021272, + "grad_norm": 1.4402745962142944, + "learning_rate": 1.53114951480118e-05, + "loss": 1.3368, + "step": 23089 + }, + { + "epoch": 0.8269020717316955, + "grad_norm": 2.111379384994507, + "learning_rate": 1.5305327641005584e-05, + "loss": 1.5311, + "step": 23090 + }, + { + "epoch": 0.8269378838612638, + "grad_norm": 1.7146985530853271, + "learning_rate": 1.5299161273456907e-05, + "loss": 1.5118, + "step": 23091 + }, + { + "epoch": 0.8269736959908321, + "grad_norm": 1.5040332078933716, + "learning_rate": 1.529299604544876e-05, + "loss": 1.0758, + "step": 23092 + }, + { + "epoch": 0.8270095081204004, + "grad_norm": 1.687289834022522, + "learning_rate": 1.5286831957064095e-05, + "loss": 1.294, + "step": 23093 + }, + { + "epoch": 0.8270453202499687, + "grad_norm": 1.5885672569274902, + "learning_rate": 1.528066900838585e-05, + "loss": 1.3912, + "step": 23094 + }, + { + "epoch": 0.827081132379537, + "grad_norm": 1.3895875215530396, + "learning_rate": 1.5274507199496913e-05, + "loss": 1.2929, + "step": 23095 + }, + { + "epoch": 0.8271169445091052, + "grad_norm": 1.6085909605026245, + "learning_rate": 1.526834653048018e-05, + "loss": 1.3754, + "step": 23096 + }, + { + "epoch": 0.8271527566386735, + "grad_norm": 1.6051825284957886, + "learning_rate": 1.526218700141855e-05, + "loss": 1.41, + "step": 23097 + }, + { + "epoch": 0.8271885687682418, + "grad_norm": 1.4668638706207275, + "learning_rate": 1.5256028612394913e-05, + "loss": 1.2962, + "step": 23098 + }, + { + "epoch": 0.8272243808978101, + "grad_norm": 1.2707617282867432, + "learning_rate": 1.5249871363492107e-05, + "loss": 1.3112, + "step": 23099 + }, + { + "epoch": 0.8272601930273784, + "grad_norm": 1.6298561096191406, + "learning_rate": 1.5243715254792912e-05, + "loss": 1.605, + "step": 23100 + }, + { + "epoch": 0.8272960051569467, + "grad_norm": 1.4971948862075806, + "learning_rate": 1.5237560286380247e-05, + "loss": 1.1962, + "step": 23101 + }, + { + "epoch": 0.827331817286515, + "grad_norm": 1.3633439540863037, + "learning_rate": 1.5231406458336839e-05, + "loss": 1.4394, + "step": 23102 + }, + { + "epoch": 0.8273676294160832, + "grad_norm": 1.8583279848098755, + "learning_rate": 1.5225253770745529e-05, + "loss": 1.6863, + "step": 23103 + }, + { + "epoch": 0.8274034415456515, + "grad_norm": 1.6401985883712769, + "learning_rate": 1.5219102223689074e-05, + "loss": 1.5851, + "step": 23104 + }, + { + "epoch": 0.8274392536752198, + "grad_norm": 1.8377301692962646, + "learning_rate": 1.5212951817250253e-05, + "loss": 1.2312, + "step": 23105 + }, + { + "epoch": 0.8274750658047881, + "grad_norm": 1.7044795751571655, + "learning_rate": 1.5206802551511778e-05, + "loss": 1.8088, + "step": 23106 + }, + { + "epoch": 0.8275108779343564, + "grad_norm": 1.5256201028823853, + "learning_rate": 1.5200654426556405e-05, + "loss": 1.4694, + "step": 23107 + }, + { + "epoch": 0.8275466900639247, + "grad_norm": 1.6948037147521973, + "learning_rate": 1.5194507442466865e-05, + "loss": 1.3478, + "step": 23108 + }, + { + "epoch": 0.8275825021934929, + "grad_norm": 1.5120340585708618, + "learning_rate": 1.5188361599325817e-05, + "loss": 1.3614, + "step": 23109 + }, + { + "epoch": 0.8276183143230612, + "grad_norm": 1.5714203119277954, + "learning_rate": 1.5182216897215984e-05, + "loss": 1.6772, + "step": 23110 + }, + { + "epoch": 0.8276541264526295, + "grad_norm": 1.667129397392273, + "learning_rate": 1.5176073336219965e-05, + "loss": 1.205, + "step": 23111 + }, + { + "epoch": 0.8276899385821977, + "grad_norm": 1.733232855796814, + "learning_rate": 1.5169930916420516e-05, + "loss": 1.3543, + "step": 23112 + }, + { + "epoch": 0.8277257507117661, + "grad_norm": 1.5604608058929443, + "learning_rate": 1.5163789637900194e-05, + "loss": 1.5258, + "step": 23113 + }, + { + "epoch": 0.8277615628413344, + "grad_norm": 1.5304710865020752, + "learning_rate": 1.5157649500741678e-05, + "loss": 1.4189, + "step": 23114 + }, + { + "epoch": 0.8277973749709027, + "grad_norm": 2.146677017211914, + "learning_rate": 1.5151510505027499e-05, + "loss": 1.3662, + "step": 23115 + }, + { + "epoch": 0.8278331871004709, + "grad_norm": 2.4017913341522217, + "learning_rate": 1.5145372650840361e-05, + "loss": 1.4622, + "step": 23116 + }, + { + "epoch": 0.8278689992300392, + "grad_norm": 1.358140468597412, + "learning_rate": 1.5139235938262763e-05, + "loss": 1.4776, + "step": 23117 + }, + { + "epoch": 0.8279048113596075, + "grad_norm": 2.4922754764556885, + "learning_rate": 1.513310036737724e-05, + "loss": 1.7687, + "step": 23118 + }, + { + "epoch": 0.8279406234891757, + "grad_norm": 1.3777458667755127, + "learning_rate": 1.5126965938266436e-05, + "loss": 1.504, + "step": 23119 + }, + { + "epoch": 0.8279764356187441, + "grad_norm": 1.5722744464874268, + "learning_rate": 1.5120832651012795e-05, + "loss": 1.3428, + "step": 23120 + }, + { + "epoch": 0.8280122477483124, + "grad_norm": 2.008540153503418, + "learning_rate": 1.5114700505698886e-05, + "loss": 1.3799, + "step": 23121 + }, + { + "epoch": 0.8280480598778807, + "grad_norm": 1.4039058685302734, + "learning_rate": 1.5108569502407155e-05, + "loss": 1.3482, + "step": 23122 + }, + { + "epoch": 0.8280838720074489, + "grad_norm": 1.4145723581314087, + "learning_rate": 1.5102439641220156e-05, + "loss": 1.3316, + "step": 23123 + }, + { + "epoch": 0.8281196841370172, + "grad_norm": 1.5959590673446655, + "learning_rate": 1.5096310922220291e-05, + "loss": 1.7295, + "step": 23124 + }, + { + "epoch": 0.8281554962665855, + "grad_norm": 1.5081450939178467, + "learning_rate": 1.5090183345490084e-05, + "loss": 1.2043, + "step": 23125 + }, + { + "epoch": 0.8281913083961537, + "grad_norm": 1.61405348777771, + "learning_rate": 1.50840569111119e-05, + "loss": 1.5296, + "step": 23126 + }, + { + "epoch": 0.8282271205257221, + "grad_norm": 2.1915838718414307, + "learning_rate": 1.5077931619168196e-05, + "loss": 1.2124, + "step": 23127 + }, + { + "epoch": 0.8282629326552904, + "grad_norm": 1.5725104808807373, + "learning_rate": 1.5071807469741406e-05, + "loss": 1.4422, + "step": 23128 + }, + { + "epoch": 0.8282987447848587, + "grad_norm": 1.9400559663772583, + "learning_rate": 1.5065684462913853e-05, + "loss": 1.4408, + "step": 23129 + }, + { + "epoch": 0.8283345569144269, + "grad_norm": 1.7734508514404297, + "learning_rate": 1.5059562598768007e-05, + "loss": 1.6698, + "step": 23130 + }, + { + "epoch": 0.8283703690439952, + "grad_norm": 1.7858387231826782, + "learning_rate": 1.5053441877386154e-05, + "loss": 1.4474, + "step": 23131 + }, + { + "epoch": 0.8284061811735635, + "grad_norm": 1.9878989458084106, + "learning_rate": 1.5047322298850685e-05, + "loss": 1.1392, + "step": 23132 + }, + { + "epoch": 0.8284419933031317, + "grad_norm": 1.5624104738235474, + "learning_rate": 1.504120386324387e-05, + "loss": 1.5743, + "step": 23133 + }, + { + "epoch": 0.8284778054327001, + "grad_norm": 1.580743432044983, + "learning_rate": 1.5035086570648115e-05, + "loss": 1.2758, + "step": 23134 + }, + { + "epoch": 0.8285136175622684, + "grad_norm": 1.7219854593276978, + "learning_rate": 1.5028970421145684e-05, + "loss": 1.2629, + "step": 23135 + }, + { + "epoch": 0.8285494296918366, + "grad_norm": 1.9313101768493652, + "learning_rate": 1.5022855414818816e-05, + "loss": 1.3722, + "step": 23136 + }, + { + "epoch": 0.8285852418214049, + "grad_norm": 1.6862903833389282, + "learning_rate": 1.5016741551749813e-05, + "loss": 1.4906, + "step": 23137 + }, + { + "epoch": 0.8286210539509732, + "grad_norm": 2.225872039794922, + "learning_rate": 1.5010628832020945e-05, + "loss": 1.6626, + "step": 23138 + }, + { + "epoch": 0.8286568660805415, + "grad_norm": 1.539825201034546, + "learning_rate": 1.5004517255714456e-05, + "loss": 1.7301, + "step": 23139 + }, + { + "epoch": 0.8286926782101097, + "grad_norm": 1.5023062229156494, + "learning_rate": 1.4998406822912525e-05, + "loss": 1.6564, + "step": 23140 + }, + { + "epoch": 0.8287284903396781, + "grad_norm": 2.535398483276367, + "learning_rate": 1.4992297533697387e-05, + "loss": 1.4908, + "step": 23141 + }, + { + "epoch": 0.8287643024692464, + "grad_norm": 1.7443580627441406, + "learning_rate": 1.4986189388151229e-05, + "loss": 1.3793, + "step": 23142 + }, + { + "epoch": 0.8288001145988146, + "grad_norm": 1.728061318397522, + "learning_rate": 1.4980082386356264e-05, + "loss": 1.246, + "step": 23143 + }, + { + "epoch": 0.8288359267283829, + "grad_norm": 1.9140900373458862, + "learning_rate": 1.4973976528394596e-05, + "loss": 1.2107, + "step": 23144 + }, + { + "epoch": 0.8288717388579512, + "grad_norm": 2.0008881092071533, + "learning_rate": 1.4967871814348399e-05, + "loss": 1.4527, + "step": 23145 + }, + { + "epoch": 0.8289075509875194, + "grad_norm": 1.3603802919387817, + "learning_rate": 1.4961768244299823e-05, + "loss": 1.2612, + "step": 23146 + }, + { + "epoch": 0.8289433631170877, + "grad_norm": 1.8144201040267944, + "learning_rate": 1.4955665818330944e-05, + "loss": 1.5788, + "step": 23147 + }, + { + "epoch": 0.8289791752466561, + "grad_norm": 1.1382317543029785, + "learning_rate": 1.4949564536523874e-05, + "loss": 1.1458, + "step": 23148 + }, + { + "epoch": 0.8290149873762244, + "grad_norm": 1.77524995803833, + "learning_rate": 1.4943464398960716e-05, + "loss": 1.6568, + "step": 23149 + }, + { + "epoch": 0.8290507995057926, + "grad_norm": 1.7425357103347778, + "learning_rate": 1.4937365405723547e-05, + "loss": 1.3626, + "step": 23150 + }, + { + "epoch": 0.8290866116353609, + "grad_norm": 1.9161444902420044, + "learning_rate": 1.493126755689439e-05, + "loss": 1.4522, + "step": 23151 + }, + { + "epoch": 0.8291224237649292, + "grad_norm": 1.7542110681533813, + "learning_rate": 1.4925170852555282e-05, + "loss": 1.3236, + "step": 23152 + }, + { + "epoch": 0.8291582358944974, + "grad_norm": 1.5818971395492554, + "learning_rate": 1.4919075292788298e-05, + "loss": 1.3392, + "step": 23153 + }, + { + "epoch": 0.8291940480240657, + "grad_norm": 1.3822681903839111, + "learning_rate": 1.4912980877675387e-05, + "loss": 1.3119, + "step": 23154 + }, + { + "epoch": 0.8292298601536341, + "grad_norm": 1.6355019807815552, + "learning_rate": 1.4906887607298548e-05, + "loss": 1.9485, + "step": 23155 + }, + { + "epoch": 0.8292656722832024, + "grad_norm": 1.661881685256958, + "learning_rate": 1.4900795481739793e-05, + "loss": 1.6326, + "step": 23156 + }, + { + "epoch": 0.8293014844127706, + "grad_norm": 1.375794768333435, + "learning_rate": 1.4894704501081069e-05, + "loss": 1.4931, + "step": 23157 + }, + { + "epoch": 0.8293372965423389, + "grad_norm": 1.861142873764038, + "learning_rate": 1.488861466540431e-05, + "loss": 1.466, + "step": 23158 + }, + { + "epoch": 0.8293731086719072, + "grad_norm": 1.3190573453903198, + "learning_rate": 1.488252597479145e-05, + "loss": 1.3289, + "step": 23159 + }, + { + "epoch": 0.8294089208014754, + "grad_norm": 1.687562108039856, + "learning_rate": 1.4876438429324414e-05, + "loss": 1.1553, + "step": 23160 + }, + { + "epoch": 0.8294447329310437, + "grad_norm": 1.8980015516281128, + "learning_rate": 1.487035202908511e-05, + "loss": 1.4203, + "step": 23161 + }, + { + "epoch": 0.8294805450606121, + "grad_norm": 2.008462905883789, + "learning_rate": 1.4864266774155389e-05, + "loss": 1.3749, + "step": 23162 + }, + { + "epoch": 0.8295163571901804, + "grad_norm": 1.576193928718567, + "learning_rate": 1.4858182664617148e-05, + "loss": 1.2269, + "step": 23163 + }, + { + "epoch": 0.8295521693197486, + "grad_norm": 1.642438530921936, + "learning_rate": 1.4852099700552259e-05, + "loss": 1.38, + "step": 23164 + }, + { + "epoch": 0.8295879814493169, + "grad_norm": 1.7365227937698364, + "learning_rate": 1.4846017882042506e-05, + "loss": 1.4031, + "step": 23165 + }, + { + "epoch": 0.8296237935788852, + "grad_norm": 1.6109135150909424, + "learning_rate": 1.4839937209169741e-05, + "loss": 1.2866, + "step": 23166 + }, + { + "epoch": 0.8296596057084534, + "grad_norm": 1.110206127166748, + "learning_rate": 1.4833857682015773e-05, + "loss": 1.3672, + "step": 23167 + }, + { + "epoch": 0.8296954178380217, + "grad_norm": 1.5050301551818848, + "learning_rate": 1.4827779300662425e-05, + "loss": 1.4046, + "step": 23168 + }, + { + "epoch": 0.8297312299675901, + "grad_norm": 1.5726349353790283, + "learning_rate": 1.4821702065191413e-05, + "loss": 1.3428, + "step": 23169 + }, + { + "epoch": 0.8297670420971583, + "grad_norm": 1.4734699726104736, + "learning_rate": 1.4815625975684522e-05, + "loss": 1.3572, + "step": 23170 + }, + { + "epoch": 0.8298028542267266, + "grad_norm": 1.7343571186065674, + "learning_rate": 1.4809551032223534e-05, + "loss": 1.4396, + "step": 23171 + }, + { + "epoch": 0.8298386663562949, + "grad_norm": 1.371505618095398, + "learning_rate": 1.480347723489013e-05, + "loss": 1.4953, + "step": 23172 + }, + { + "epoch": 0.8298744784858632, + "grad_norm": 1.4669415950775146, + "learning_rate": 1.4797404583766028e-05, + "loss": 0.9984, + "step": 23173 + }, + { + "epoch": 0.8299102906154314, + "grad_norm": 1.7243847846984863, + "learning_rate": 1.4791333078932956e-05, + "loss": 1.4721, + "step": 23174 + }, + { + "epoch": 0.8299461027449997, + "grad_norm": 1.569770097732544, + "learning_rate": 1.4785262720472615e-05, + "loss": 1.4345, + "step": 23175 + }, + { + "epoch": 0.829981914874568, + "grad_norm": 1.4598844051361084, + "learning_rate": 1.4779193508466604e-05, + "loss": 1.3252, + "step": 23176 + }, + { + "epoch": 0.8300177270041363, + "grad_norm": 1.655402660369873, + "learning_rate": 1.4773125442996626e-05, + "loss": 1.622, + "step": 23177 + }, + { + "epoch": 0.8300535391337046, + "grad_norm": 1.8673683404922485, + "learning_rate": 1.4767058524144318e-05, + "loss": 1.3777, + "step": 23178 + }, + { + "epoch": 0.8300893512632729, + "grad_norm": 1.8644596338272095, + "learning_rate": 1.476099275199131e-05, + "loss": 1.4903, + "step": 23179 + }, + { + "epoch": 0.8301251633928411, + "grad_norm": 1.84309720993042, + "learning_rate": 1.4754928126619172e-05, + "loss": 1.3268, + "step": 23180 + }, + { + "epoch": 0.8301609755224094, + "grad_norm": 1.785307765007019, + "learning_rate": 1.4748864648109518e-05, + "loss": 1.3626, + "step": 23181 + }, + { + "epoch": 0.8301967876519777, + "grad_norm": 1.6583218574523926, + "learning_rate": 1.4742802316543947e-05, + "loss": 1.3632, + "step": 23182 + }, + { + "epoch": 0.830232599781546, + "grad_norm": 2.204003095626831, + "learning_rate": 1.4736741132003984e-05, + "loss": 1.4275, + "step": 23183 + }, + { + "epoch": 0.8302684119111143, + "grad_norm": 1.5974963903427124, + "learning_rate": 1.4730681094571175e-05, + "loss": 1.5503, + "step": 23184 + }, + { + "epoch": 0.8303042240406826, + "grad_norm": 1.756859540939331, + "learning_rate": 1.4724622204327066e-05, + "loss": 1.56, + "step": 23185 + }, + { + "epoch": 0.8303400361702509, + "grad_norm": 1.429917335510254, + "learning_rate": 1.471856446135319e-05, + "loss": 1.5715, + "step": 23186 + }, + { + "epoch": 0.8303758482998191, + "grad_norm": 1.7499074935913086, + "learning_rate": 1.4712507865730996e-05, + "loss": 1.5858, + "step": 23187 + }, + { + "epoch": 0.8304116604293874, + "grad_norm": 1.723129153251648, + "learning_rate": 1.4706452417542006e-05, + "loss": 1.4264, + "step": 23188 + }, + { + "epoch": 0.8304474725589557, + "grad_norm": 1.859376311302185, + "learning_rate": 1.4700398116867697e-05, + "loss": 1.3564, + "step": 23189 + }, + { + "epoch": 0.830483284688524, + "grad_norm": 1.5590065717697144, + "learning_rate": 1.4694344963789474e-05, + "loss": 1.5782, + "step": 23190 + }, + { + "epoch": 0.8305190968180923, + "grad_norm": 1.856689214706421, + "learning_rate": 1.4688292958388816e-05, + "loss": 1.1994, + "step": 23191 + }, + { + "epoch": 0.8305549089476606, + "grad_norm": 1.3962154388427734, + "learning_rate": 1.4682242100747123e-05, + "loss": 1.4828, + "step": 23192 + }, + { + "epoch": 0.8305907210772289, + "grad_norm": 1.6056325435638428, + "learning_rate": 1.467619239094583e-05, + "loss": 1.4341, + "step": 23193 + }, + { + "epoch": 0.8306265332067971, + "grad_norm": 2.161923885345459, + "learning_rate": 1.4670143829066296e-05, + "loss": 1.4834, + "step": 23194 + }, + { + "epoch": 0.8306623453363654, + "grad_norm": 1.8729352951049805, + "learning_rate": 1.4664096415189899e-05, + "loss": 1.5906, + "step": 23195 + }, + { + "epoch": 0.8306981574659337, + "grad_norm": 1.8302282094955444, + "learning_rate": 1.465805014939804e-05, + "loss": 1.3676, + "step": 23196 + }, + { + "epoch": 0.8307339695955019, + "grad_norm": 1.3889516592025757, + "learning_rate": 1.465200503177201e-05, + "loss": 1.4782, + "step": 23197 + }, + { + "epoch": 0.8307697817250703, + "grad_norm": 2.007490396499634, + "learning_rate": 1.4645961062393177e-05, + "loss": 1.4347, + "step": 23198 + }, + { + "epoch": 0.8308055938546386, + "grad_norm": 1.6827696561813354, + "learning_rate": 1.4639918241342798e-05, + "loss": 1.4996, + "step": 23199 + }, + { + "epoch": 0.8308414059842069, + "grad_norm": 1.323423981666565, + "learning_rate": 1.4633876568702254e-05, + "loss": 1.1966, + "step": 23200 + }, + { + "epoch": 0.8308772181137751, + "grad_norm": 1.5560758113861084, + "learning_rate": 1.4627836044552767e-05, + "loss": 1.4059, + "step": 23201 + }, + { + "epoch": 0.8309130302433434, + "grad_norm": 2.4446370601654053, + "learning_rate": 1.462179666897563e-05, + "loss": 1.4578, + "step": 23202 + }, + { + "epoch": 0.8309488423729117, + "grad_norm": 1.423201322555542, + "learning_rate": 1.4615758442052085e-05, + "loss": 1.1859, + "step": 23203 + }, + { + "epoch": 0.8309846545024799, + "grad_norm": 1.891046166419983, + "learning_rate": 1.4609721363863393e-05, + "loss": 1.3277, + "step": 23204 + }, + { + "epoch": 0.8310204666320483, + "grad_norm": 2.2019035816192627, + "learning_rate": 1.4603685434490756e-05, + "loss": 1.154, + "step": 23205 + }, + { + "epoch": 0.8310562787616166, + "grad_norm": 1.9172054529190063, + "learning_rate": 1.4597650654015327e-05, + "loss": 1.2394, + "step": 23206 + }, + { + "epoch": 0.8310920908911849, + "grad_norm": 1.5021252632141113, + "learning_rate": 1.45916170225184e-05, + "loss": 1.5499, + "step": 23207 + }, + { + "epoch": 0.8311279030207531, + "grad_norm": 2.5014986991882324, + "learning_rate": 1.4585584540081066e-05, + "loss": 1.4685, + "step": 23208 + }, + { + "epoch": 0.8311637151503214, + "grad_norm": 1.3708525896072388, + "learning_rate": 1.4579553206784546e-05, + "loss": 1.3963, + "step": 23209 + }, + { + "epoch": 0.8311995272798897, + "grad_norm": 1.7175711393356323, + "learning_rate": 1.45735230227099e-05, + "loss": 1.5637, + "step": 23210 + }, + { + "epoch": 0.8312353394094579, + "grad_norm": 1.8059896230697632, + "learning_rate": 1.4567493987938364e-05, + "loss": 1.3623, + "step": 23211 + }, + { + "epoch": 0.8312711515390263, + "grad_norm": 1.5208191871643066, + "learning_rate": 1.456146610255097e-05, + "loss": 1.4858, + "step": 23212 + }, + { + "epoch": 0.8313069636685946, + "grad_norm": 2.4135520458221436, + "learning_rate": 1.4555439366628843e-05, + "loss": 1.4645, + "step": 23213 + }, + { + "epoch": 0.8313427757981628, + "grad_norm": 1.7877299785614014, + "learning_rate": 1.4549413780253085e-05, + "loss": 1.4712, + "step": 23214 + }, + { + "epoch": 0.8313785879277311, + "grad_norm": 1.7445772886276245, + "learning_rate": 1.454338934350472e-05, + "loss": 1.6327, + "step": 23215 + }, + { + "epoch": 0.8314144000572994, + "grad_norm": 1.4530085325241089, + "learning_rate": 1.453736605646484e-05, + "loss": 1.4826, + "step": 23216 + }, + { + "epoch": 0.8314502121868677, + "grad_norm": 1.793205738067627, + "learning_rate": 1.4531343919214414e-05, + "loss": 1.3664, + "step": 23217 + }, + { + "epoch": 0.8314860243164359, + "grad_norm": 1.2321491241455078, + "learning_rate": 1.4525322931834562e-05, + "loss": 1.1971, + "step": 23218 + }, + { + "epoch": 0.8315218364460043, + "grad_norm": 1.8643945455551147, + "learning_rate": 1.4519303094406211e-05, + "loss": 1.5384, + "step": 23219 + }, + { + "epoch": 0.8315576485755726, + "grad_norm": 1.8356062173843384, + "learning_rate": 1.4513284407010385e-05, + "loss": 1.5823, + "step": 23220 + }, + { + "epoch": 0.8315934607051408, + "grad_norm": 1.8508111238479614, + "learning_rate": 1.450726686972802e-05, + "loss": 1.5022, + "step": 23221 + }, + { + "epoch": 0.8316292728347091, + "grad_norm": 1.2595747709274292, + "learning_rate": 1.4501250482640139e-05, + "loss": 1.4594, + "step": 23222 + }, + { + "epoch": 0.8316650849642774, + "grad_norm": 2.1554951667785645, + "learning_rate": 1.4495235245827642e-05, + "loss": 1.2048, + "step": 23223 + }, + { + "epoch": 0.8317008970938456, + "grad_norm": 4.19765567779541, + "learning_rate": 1.4489221159371447e-05, + "loss": 1.5112, + "step": 23224 + }, + { + "epoch": 0.8317367092234139, + "grad_norm": 1.5159118175506592, + "learning_rate": 1.4483208223352474e-05, + "loss": 1.6076, + "step": 23225 + }, + { + "epoch": 0.8317725213529823, + "grad_norm": 1.4949398040771484, + "learning_rate": 1.4477196437851625e-05, + "loss": 1.5265, + "step": 23226 + }, + { + "epoch": 0.8318083334825506, + "grad_norm": 1.6657307147979736, + "learning_rate": 1.4471185802949816e-05, + "loss": 1.501, + "step": 23227 + }, + { + "epoch": 0.8318441456121188, + "grad_norm": 1.5736427307128906, + "learning_rate": 1.4465176318727825e-05, + "loss": 1.6235, + "step": 23228 + }, + { + "epoch": 0.8318799577416871, + "grad_norm": 1.6373560428619385, + "learning_rate": 1.4459167985266597e-05, + "loss": 1.6095, + "step": 23229 + }, + { + "epoch": 0.8319157698712554, + "grad_norm": 1.5352661609649658, + "learning_rate": 1.4453160802646903e-05, + "loss": 1.4128, + "step": 23230 + }, + { + "epoch": 0.8319515820008236, + "grad_norm": 1.6256799697875977, + "learning_rate": 1.444715477094961e-05, + "loss": 1.4257, + "step": 23231 + }, + { + "epoch": 0.8319873941303919, + "grad_norm": 1.462746262550354, + "learning_rate": 1.4441149890255467e-05, + "loss": 1.2893, + "step": 23232 + }, + { + "epoch": 0.8320232062599603, + "grad_norm": 1.7766444683074951, + "learning_rate": 1.4435146160645285e-05, + "loss": 1.4783, + "step": 23233 + }, + { + "epoch": 0.8320590183895286, + "grad_norm": 1.7726175785064697, + "learning_rate": 1.4429143582199866e-05, + "loss": 1.4622, + "step": 23234 + }, + { + "epoch": 0.8320948305190968, + "grad_norm": 1.3048326969146729, + "learning_rate": 1.4423142154999925e-05, + "loss": 1.1044, + "step": 23235 + }, + { + "epoch": 0.8321306426486651, + "grad_norm": 1.4011657238006592, + "learning_rate": 1.4417141879126218e-05, + "loss": 1.5252, + "step": 23236 + }, + { + "epoch": 0.8321664547782334, + "grad_norm": 1.5536365509033203, + "learning_rate": 1.4411142754659468e-05, + "loss": 1.2663, + "step": 23237 + }, + { + "epoch": 0.8322022669078016, + "grad_norm": 1.8643995523452759, + "learning_rate": 1.4405144781680424e-05, + "loss": 1.2179, + "step": 23238 + }, + { + "epoch": 0.8322380790373699, + "grad_norm": 1.3420333862304688, + "learning_rate": 1.4399147960269688e-05, + "loss": 1.5695, + "step": 23239 + }, + { + "epoch": 0.8322738911669383, + "grad_norm": 1.519026279449463, + "learning_rate": 1.439315229050805e-05, + "loss": 1.4541, + "step": 23240 + }, + { + "epoch": 0.8323097032965066, + "grad_norm": 1.8381019830703735, + "learning_rate": 1.4387157772476134e-05, + "loss": 1.4799, + "step": 23241 + }, + { + "epoch": 0.8323455154260748, + "grad_norm": 1.4542158842086792, + "learning_rate": 1.4381164406254544e-05, + "loss": 1.4863, + "step": 23242 + }, + { + "epoch": 0.8323813275556431, + "grad_norm": 1.423120141029358, + "learning_rate": 1.4375172191923947e-05, + "loss": 1.3069, + "step": 23243 + }, + { + "epoch": 0.8324171396852114, + "grad_norm": 1.5357532501220703, + "learning_rate": 1.4369181129564957e-05, + "loss": 1.473, + "step": 23244 + }, + { + "epoch": 0.8324529518147796, + "grad_norm": 1.6310615539550781, + "learning_rate": 1.4363191219258209e-05, + "loss": 1.4536, + "step": 23245 + }, + { + "epoch": 0.8324887639443479, + "grad_norm": 1.673390507698059, + "learning_rate": 1.4357202461084229e-05, + "loss": 1.3385, + "step": 23246 + }, + { + "epoch": 0.8325245760739163, + "grad_norm": 2.130810022354126, + "learning_rate": 1.4351214855123629e-05, + "loss": 1.3291, + "step": 23247 + }, + { + "epoch": 0.8325603882034845, + "grad_norm": 1.4513099193572998, + "learning_rate": 1.4345228401456945e-05, + "loss": 1.2643, + "step": 23248 + }, + { + "epoch": 0.8325962003330528, + "grad_norm": 1.4877856969833374, + "learning_rate": 1.4339243100164757e-05, + "loss": 1.6895, + "step": 23249 + }, + { + "epoch": 0.8326320124626211, + "grad_norm": 1.3141956329345703, + "learning_rate": 1.4333258951327534e-05, + "loss": 1.3533, + "step": 23250 + }, + { + "epoch": 0.8326678245921894, + "grad_norm": 1.7010196447372437, + "learning_rate": 1.4327275955025798e-05, + "loss": 1.2068, + "step": 23251 + }, + { + "epoch": 0.8327036367217576, + "grad_norm": 1.50443434715271, + "learning_rate": 1.4321294111340089e-05, + "loss": 1.283, + "step": 23252 + }, + { + "epoch": 0.8327394488513259, + "grad_norm": 1.7006819248199463, + "learning_rate": 1.4315313420350829e-05, + "loss": 1.3811, + "step": 23253 + }, + { + "epoch": 0.8327752609808943, + "grad_norm": 1.890299916267395, + "learning_rate": 1.4309333882138488e-05, + "loss": 1.3671, + "step": 23254 + }, + { + "epoch": 0.8328110731104625, + "grad_norm": 1.7135711908340454, + "learning_rate": 1.4303355496783544e-05, + "loss": 1.3083, + "step": 23255 + }, + { + "epoch": 0.8328468852400308, + "grad_norm": 1.436191201210022, + "learning_rate": 1.4297378264366423e-05, + "loss": 1.5384, + "step": 23256 + }, + { + "epoch": 0.8328826973695991, + "grad_norm": 1.4963761568069458, + "learning_rate": 1.4291402184967507e-05, + "loss": 1.4136, + "step": 23257 + }, + { + "epoch": 0.8329185094991673, + "grad_norm": 2.164684295654297, + "learning_rate": 1.4285427258667217e-05, + "loss": 1.5224, + "step": 23258 + }, + { + "epoch": 0.8329543216287356, + "grad_norm": 1.8645986318588257, + "learning_rate": 1.4279453485545968e-05, + "loss": 1.9139, + "step": 23259 + }, + { + "epoch": 0.8329901337583039, + "grad_norm": 1.6702115535736084, + "learning_rate": 1.4273480865684074e-05, + "loss": 1.3873, + "step": 23260 + }, + { + "epoch": 0.8330259458878723, + "grad_norm": 1.5879836082458496, + "learning_rate": 1.4267509399161916e-05, + "loss": 1.2909, + "step": 23261 + }, + { + "epoch": 0.8330617580174405, + "grad_norm": 1.5265380144119263, + "learning_rate": 1.4261539086059839e-05, + "loss": 1.5102, + "step": 23262 + }, + { + "epoch": 0.8330975701470088, + "grad_norm": 1.9467709064483643, + "learning_rate": 1.4255569926458168e-05, + "loss": 1.4075, + "step": 23263 + }, + { + "epoch": 0.8331333822765771, + "grad_norm": 1.7792093753814697, + "learning_rate": 1.4249601920437194e-05, + "loss": 1.3639, + "step": 23264 + }, + { + "epoch": 0.8331691944061453, + "grad_norm": 1.6893055438995361, + "learning_rate": 1.42436350680772e-05, + "loss": 1.7997, + "step": 23265 + }, + { + "epoch": 0.8332050065357136, + "grad_norm": 1.7191356420516968, + "learning_rate": 1.4237669369458495e-05, + "loss": 1.5456, + "step": 23266 + }, + { + "epoch": 0.8332408186652819, + "grad_norm": 2.7596893310546875, + "learning_rate": 1.4231704824661329e-05, + "loss": 1.5299, + "step": 23267 + }, + { + "epoch": 0.8332766307948503, + "grad_norm": 1.2917113304138184, + "learning_rate": 1.4225741433765927e-05, + "loss": 1.2855, + "step": 23268 + }, + { + "epoch": 0.8333124429244185, + "grad_norm": 1.6655503511428833, + "learning_rate": 1.4219779196852534e-05, + "loss": 1.5177, + "step": 23269 + }, + { + "epoch": 0.8333482550539868, + "grad_norm": 1.3067896366119385, + "learning_rate": 1.4213818114001387e-05, + "loss": 1.131, + "step": 23270 + }, + { + "epoch": 0.8333840671835551, + "grad_norm": 1.9188344478607178, + "learning_rate": 1.4207858185292643e-05, + "loss": 1.4093, + "step": 23271 + }, + { + "epoch": 0.8334198793131233, + "grad_norm": 1.8857660293579102, + "learning_rate": 1.4201899410806496e-05, + "loss": 1.612, + "step": 23272 + }, + { + "epoch": 0.8334556914426916, + "grad_norm": 1.7722171545028687, + "learning_rate": 1.4195941790623124e-05, + "loss": 1.6784, + "step": 23273 + }, + { + "epoch": 0.8334915035722599, + "grad_norm": 1.8155393600463867, + "learning_rate": 1.4189985324822697e-05, + "loss": 1.6842, + "step": 23274 + }, + { + "epoch": 0.8335273157018283, + "grad_norm": 1.6256564855575562, + "learning_rate": 1.4184030013485305e-05, + "loss": 1.5721, + "step": 23275 + }, + { + "epoch": 0.8335631278313965, + "grad_norm": 1.669029712677002, + "learning_rate": 1.4178075856691097e-05, + "loss": 1.4226, + "step": 23276 + }, + { + "epoch": 0.8335989399609648, + "grad_norm": 1.8676083087921143, + "learning_rate": 1.4172122854520198e-05, + "loss": 1.7837, + "step": 23277 + }, + { + "epoch": 0.8336347520905331, + "grad_norm": 1.5746208429336548, + "learning_rate": 1.4166171007052653e-05, + "loss": 1.2264, + "step": 23278 + }, + { + "epoch": 0.8336705642201013, + "grad_norm": 2.2983994483947754, + "learning_rate": 1.4160220314368555e-05, + "loss": 1.6691, + "step": 23279 + }, + { + "epoch": 0.8337063763496696, + "grad_norm": 1.4600412845611572, + "learning_rate": 1.4154270776547974e-05, + "loss": 1.1893, + "step": 23280 + }, + { + "epoch": 0.8337421884792379, + "grad_norm": 3.8261477947235107, + "learning_rate": 1.4148322393670976e-05, + "loss": 1.55, + "step": 23281 + }, + { + "epoch": 0.8337780006088062, + "grad_norm": 1.9612890481948853, + "learning_rate": 1.4142375165817523e-05, + "loss": 1.4173, + "step": 23282 + }, + { + "epoch": 0.8338138127383745, + "grad_norm": 2.1106479167938232, + "learning_rate": 1.413642909306767e-05, + "loss": 1.5264, + "step": 23283 + }, + { + "epoch": 0.8338496248679428, + "grad_norm": 1.3178765773773193, + "learning_rate": 1.4130484175501435e-05, + "loss": 1.2424, + "step": 23284 + }, + { + "epoch": 0.833885436997511, + "grad_norm": 2.719381332397461, + "learning_rate": 1.412454041319874e-05, + "loss": 1.6399, + "step": 23285 + }, + { + "epoch": 0.8339212491270793, + "grad_norm": 2.3354897499084473, + "learning_rate": 1.4118597806239585e-05, + "loss": 1.4628, + "step": 23286 + }, + { + "epoch": 0.8339570612566476, + "grad_norm": 1.822835922241211, + "learning_rate": 1.4112656354703924e-05, + "loss": 1.2929, + "step": 23287 + }, + { + "epoch": 0.8339928733862159, + "grad_norm": 1.2769025564193726, + "learning_rate": 1.41067160586717e-05, + "loss": 1.4113, + "step": 23288 + }, + { + "epoch": 0.8340286855157842, + "grad_norm": 1.8172305822372437, + "learning_rate": 1.4100776918222802e-05, + "loss": 1.2328, + "step": 23289 + }, + { + "epoch": 0.8340644976453525, + "grad_norm": 1.472387433052063, + "learning_rate": 1.4094838933437138e-05, + "loss": 1.4583, + "step": 23290 + }, + { + "epoch": 0.8341003097749208, + "grad_norm": 1.9044989347457886, + "learning_rate": 1.4088902104394607e-05, + "loss": 1.7, + "step": 23291 + }, + { + "epoch": 0.834136121904489, + "grad_norm": 2.0929458141326904, + "learning_rate": 1.4082966431175116e-05, + "loss": 1.1544, + "step": 23292 + }, + { + "epoch": 0.8341719340340573, + "grad_norm": 1.484946846961975, + "learning_rate": 1.4077031913858474e-05, + "loss": 1.5744, + "step": 23293 + }, + { + "epoch": 0.8342077461636256, + "grad_norm": 1.6925033330917358, + "learning_rate": 1.4071098552524497e-05, + "loss": 1.4153, + "step": 23294 + }, + { + "epoch": 0.8342435582931939, + "grad_norm": 1.6346595287322998, + "learning_rate": 1.4065166347253089e-05, + "loss": 1.2867, + "step": 23295 + }, + { + "epoch": 0.8342793704227622, + "grad_norm": 1.8432413339614868, + "learning_rate": 1.4059235298124006e-05, + "loss": 1.3611, + "step": 23296 + }, + { + "epoch": 0.8343151825523305, + "grad_norm": 1.546345591545105, + "learning_rate": 1.4053305405217044e-05, + "loss": 1.531, + "step": 23297 + }, + { + "epoch": 0.8343509946818988, + "grad_norm": 1.4019263982772827, + "learning_rate": 1.4047376668612e-05, + "loss": 1.3993, + "step": 23298 + }, + { + "epoch": 0.834386806811467, + "grad_norm": 1.7556198835372925, + "learning_rate": 1.4041449088388659e-05, + "loss": 1.0783, + "step": 23299 + }, + { + "epoch": 0.8344226189410353, + "grad_norm": 1.5568428039550781, + "learning_rate": 1.4035522664626721e-05, + "loss": 1.4352, + "step": 23300 + }, + { + "epoch": 0.8344584310706036, + "grad_norm": 3.3579607009887695, + "learning_rate": 1.4029597397405925e-05, + "loss": 1.6769, + "step": 23301 + }, + { + "epoch": 0.8344942432001718, + "grad_norm": 1.5644280910491943, + "learning_rate": 1.4023673286806039e-05, + "loss": 1.089, + "step": 23302 + }, + { + "epoch": 0.8345300553297402, + "grad_norm": 1.8802729845046997, + "learning_rate": 1.4017750332906698e-05, + "loss": 1.7205, + "step": 23303 + }, + { + "epoch": 0.8345658674593085, + "grad_norm": 1.711993932723999, + "learning_rate": 1.4011828535787642e-05, + "loss": 1.5439, + "step": 23304 + }, + { + "epoch": 0.8346016795888768, + "grad_norm": 1.9384276866912842, + "learning_rate": 1.400590789552847e-05, + "loss": 1.3889, + "step": 23305 + }, + { + "epoch": 0.834637491718445, + "grad_norm": 2.598912000656128, + "learning_rate": 1.3999988412208931e-05, + "loss": 1.5202, + "step": 23306 + }, + { + "epoch": 0.8346733038480133, + "grad_norm": 1.5115162134170532, + "learning_rate": 1.3994070085908596e-05, + "loss": 1.5137, + "step": 23307 + }, + { + "epoch": 0.8347091159775816, + "grad_norm": 1.8529272079467773, + "learning_rate": 1.3988152916707121e-05, + "loss": 1.2628, + "step": 23308 + }, + { + "epoch": 0.8347449281071498, + "grad_norm": 1.7130944728851318, + "learning_rate": 1.3982236904684064e-05, + "loss": 1.2679, + "step": 23309 + }, + { + "epoch": 0.8347807402367182, + "grad_norm": 1.4939451217651367, + "learning_rate": 1.3976322049919088e-05, + "loss": 1.5153, + "step": 23310 + }, + { + "epoch": 0.8348165523662865, + "grad_norm": 1.5681949853897095, + "learning_rate": 1.3970408352491749e-05, + "loss": 1.4088, + "step": 23311 + }, + { + "epoch": 0.8348523644958548, + "grad_norm": 1.7648473978042603, + "learning_rate": 1.3964495812481548e-05, + "loss": 1.3955, + "step": 23312 + }, + { + "epoch": 0.834888176625423, + "grad_norm": 1.4568158388137817, + "learning_rate": 1.3958584429968124e-05, + "loss": 1.4164, + "step": 23313 + }, + { + "epoch": 0.8349239887549913, + "grad_norm": 1.6647497415542603, + "learning_rate": 1.3952674205030935e-05, + "loss": 1.1866, + "step": 23314 + }, + { + "epoch": 0.8349598008845596, + "grad_norm": 1.7173484563827515, + "learning_rate": 1.394676513774954e-05, + "loss": 1.5996, + "step": 23315 + }, + { + "epoch": 0.8349956130141278, + "grad_norm": 1.6371338367462158, + "learning_rate": 1.3940857228203386e-05, + "loss": 1.4188, + "step": 23316 + }, + { + "epoch": 0.8350314251436962, + "grad_norm": 1.3010733127593994, + "learning_rate": 1.393495047647202e-05, + "loss": 1.516, + "step": 23317 + }, + { + "epoch": 0.8350672372732645, + "grad_norm": 1.5296361446380615, + "learning_rate": 1.3929044882634867e-05, + "loss": 1.4748, + "step": 23318 + }, + { + "epoch": 0.8351030494028328, + "grad_norm": 1.7259113788604736, + "learning_rate": 1.3923140446771409e-05, + "loss": 1.8208, + "step": 23319 + }, + { + "epoch": 0.835138861532401, + "grad_norm": 1.6335358619689941, + "learning_rate": 1.3917237168961051e-05, + "loss": 1.3154, + "step": 23320 + }, + { + "epoch": 0.8351746736619693, + "grad_norm": 2.0975825786590576, + "learning_rate": 1.3911335049283225e-05, + "loss": 1.6693, + "step": 23321 + }, + { + "epoch": 0.8352104857915376, + "grad_norm": 1.558631181716919, + "learning_rate": 1.390543408781736e-05, + "loss": 1.5984, + "step": 23322 + }, + { + "epoch": 0.8352462979211058, + "grad_norm": 1.3782986402511597, + "learning_rate": 1.3899534284642779e-05, + "loss": 1.1026, + "step": 23323 + }, + { + "epoch": 0.8352821100506742, + "grad_norm": 1.7837612628936768, + "learning_rate": 1.3893635639838942e-05, + "loss": 1.2879, + "step": 23324 + }, + { + "epoch": 0.8353179221802425, + "grad_norm": 1.4235824346542358, + "learning_rate": 1.3887738153485153e-05, + "loss": 1.5469, + "step": 23325 + }, + { + "epoch": 0.8353537343098107, + "grad_norm": 1.4114874601364136, + "learning_rate": 1.388184182566079e-05, + "loss": 1.523, + "step": 23326 + }, + { + "epoch": 0.835389546439379, + "grad_norm": 1.7939683198928833, + "learning_rate": 1.3875946656445126e-05, + "loss": 1.5065, + "step": 23327 + }, + { + "epoch": 0.8354253585689473, + "grad_norm": 1.2166051864624023, + "learning_rate": 1.3870052645917542e-05, + "loss": 1.257, + "step": 23328 + }, + { + "epoch": 0.8354611706985156, + "grad_norm": 2.7868435382843018, + "learning_rate": 1.3864159794157305e-05, + "loss": 0.9794, + "step": 23329 + }, + { + "epoch": 0.8354969828280838, + "grad_norm": 2.0114903450012207, + "learning_rate": 1.3858268101243666e-05, + "loss": 1.338, + "step": 23330 + }, + { + "epoch": 0.8355327949576522, + "grad_norm": 1.9516043663024902, + "learning_rate": 1.3852377567255913e-05, + "loss": 1.3652, + "step": 23331 + }, + { + "epoch": 0.8355686070872205, + "grad_norm": 1.8483436107635498, + "learning_rate": 1.3846488192273298e-05, + "loss": 1.8179, + "step": 23332 + }, + { + "epoch": 0.8356044192167887, + "grad_norm": 1.333381175994873, + "learning_rate": 1.384059997637508e-05, + "loss": 1.4457, + "step": 23333 + }, + { + "epoch": 0.835640231346357, + "grad_norm": 1.6573841571807861, + "learning_rate": 1.3834712919640424e-05, + "loss": 1.1238, + "step": 23334 + }, + { + "epoch": 0.8356760434759253, + "grad_norm": 1.6716316938400269, + "learning_rate": 1.382882702214856e-05, + "loss": 1.6315, + "step": 23335 + }, + { + "epoch": 0.8357118556054935, + "grad_norm": 2.7846570014953613, + "learning_rate": 1.382294228397868e-05, + "loss": 1.7287, + "step": 23336 + }, + { + "epoch": 0.8357476677350618, + "grad_norm": 1.2655292749404907, + "learning_rate": 1.3817058705209973e-05, + "loss": 1.5531, + "step": 23337 + }, + { + "epoch": 0.8357834798646302, + "grad_norm": 1.5303046703338623, + "learning_rate": 1.3811176285921557e-05, + "loss": 1.3362, + "step": 23338 + }, + { + "epoch": 0.8358192919941985, + "grad_norm": 2.3918161392211914, + "learning_rate": 1.3805295026192577e-05, + "loss": 1.4692, + "step": 23339 + }, + { + "epoch": 0.8358551041237667, + "grad_norm": 1.6694695949554443, + "learning_rate": 1.3799414926102194e-05, + "loss": 1.3039, + "step": 23340 + }, + { + "epoch": 0.835890916253335, + "grad_norm": 1.2947691679000854, + "learning_rate": 1.3793535985729478e-05, + "loss": 1.5269, + "step": 23341 + }, + { + "epoch": 0.8359267283829033, + "grad_norm": 1.4575728178024292, + "learning_rate": 1.3787658205153532e-05, + "loss": 1.3698, + "step": 23342 + }, + { + "epoch": 0.8359625405124715, + "grad_norm": 1.376752257347107, + "learning_rate": 1.3781781584453435e-05, + "loss": 1.4291, + "step": 23343 + }, + { + "epoch": 0.8359983526420398, + "grad_norm": 1.3669610023498535, + "learning_rate": 1.3775906123708282e-05, + "loss": 1.2259, + "step": 23344 + }, + { + "epoch": 0.8360341647716082, + "grad_norm": 1.7606292963027954, + "learning_rate": 1.3770031822997064e-05, + "loss": 1.4199, + "step": 23345 + }, + { + "epoch": 0.8360699769011765, + "grad_norm": 1.6351981163024902, + "learning_rate": 1.3764158682398843e-05, + "loss": 1.3858, + "step": 23346 + }, + { + "epoch": 0.8361057890307447, + "grad_norm": 1.4310319423675537, + "learning_rate": 1.3758286701992651e-05, + "loss": 1.4934, + "step": 23347 + }, + { + "epoch": 0.836141601160313, + "grad_norm": 1.3345412015914917, + "learning_rate": 1.375241588185744e-05, + "loss": 1.3918, + "step": 23348 + }, + { + "epoch": 0.8361774132898813, + "grad_norm": 1.554101586341858, + "learning_rate": 1.3746546222072232e-05, + "loss": 1.2231, + "step": 23349 + }, + { + "epoch": 0.8362132254194495, + "grad_norm": 1.5663398504257202, + "learning_rate": 1.3740677722715977e-05, + "loss": 1.5471, + "step": 23350 + }, + { + "epoch": 0.8362490375490178, + "grad_norm": 1.472374439239502, + "learning_rate": 1.373481038386767e-05, + "loss": 1.4637, + "step": 23351 + }, + { + "epoch": 0.8362848496785862, + "grad_norm": 1.973220705986023, + "learning_rate": 1.3728944205606186e-05, + "loss": 1.3186, + "step": 23352 + }, + { + "epoch": 0.8363206618081545, + "grad_norm": 1.8892866373062134, + "learning_rate": 1.3723079188010469e-05, + "loss": 1.4246, + "step": 23353 + }, + { + "epoch": 0.8363564739377227, + "grad_norm": 1.4619778394699097, + "learning_rate": 1.3717215331159439e-05, + "loss": 1.3644, + "step": 23354 + }, + { + "epoch": 0.836392286067291, + "grad_norm": 1.3195886611938477, + "learning_rate": 1.3711352635132002e-05, + "loss": 1.1605, + "step": 23355 + }, + { + "epoch": 0.8364280981968593, + "grad_norm": 1.655963659286499, + "learning_rate": 1.3705491100006995e-05, + "loss": 1.3923, + "step": 23356 + }, + { + "epoch": 0.8364639103264275, + "grad_norm": 1.574783444404602, + "learning_rate": 1.3699630725863289e-05, + "loss": 1.3442, + "step": 23357 + }, + { + "epoch": 0.8364997224559958, + "grad_norm": 1.387752652168274, + "learning_rate": 1.3693771512779752e-05, + "loss": 1.4718, + "step": 23358 + }, + { + "epoch": 0.8365355345855642, + "grad_norm": 1.196673035621643, + "learning_rate": 1.3687913460835167e-05, + "loss": 1.6578, + "step": 23359 + }, + { + "epoch": 0.8365713467151324, + "grad_norm": 1.6154292821884155, + "learning_rate": 1.3682056570108382e-05, + "loss": 1.5265, + "step": 23360 + }, + { + "epoch": 0.8366071588447007, + "grad_norm": 1.4177491664886475, + "learning_rate": 1.3676200840678167e-05, + "loss": 1.1989, + "step": 23361 + }, + { + "epoch": 0.836642970974269, + "grad_norm": 1.6271674633026123, + "learning_rate": 1.3670346272623357e-05, + "loss": 1.2113, + "step": 23362 + }, + { + "epoch": 0.8366787831038373, + "grad_norm": 1.4818865060806274, + "learning_rate": 1.366449286602265e-05, + "loss": 1.4199, + "step": 23363 + }, + { + "epoch": 0.8367145952334055, + "grad_norm": 1.7398416996002197, + "learning_rate": 1.3658640620954832e-05, + "loss": 1.4518, + "step": 23364 + }, + { + "epoch": 0.8367504073629738, + "grad_norm": 1.4655635356903076, + "learning_rate": 1.3652789537498656e-05, + "loss": 1.5296, + "step": 23365 + }, + { + "epoch": 0.8367862194925422, + "grad_norm": 1.5625853538513184, + "learning_rate": 1.364693961573279e-05, + "loss": 1.3327, + "step": 23366 + }, + { + "epoch": 0.8368220316221104, + "grad_norm": 1.5531121492385864, + "learning_rate": 1.3641090855735972e-05, + "loss": 1.0688, + "step": 23367 + }, + { + "epoch": 0.8368578437516787, + "grad_norm": 1.4807175397872925, + "learning_rate": 1.3635243257586872e-05, + "loss": 1.5016, + "step": 23368 + }, + { + "epoch": 0.836893655881247, + "grad_norm": 1.3785326480865479, + "learning_rate": 1.3629396821364193e-05, + "loss": 1.4082, + "step": 23369 + }, + { + "epoch": 0.8369294680108152, + "grad_norm": 2.476111888885498, + "learning_rate": 1.3623551547146552e-05, + "loss": 1.5001, + "step": 23370 + }, + { + "epoch": 0.8369652801403835, + "grad_norm": 1.6185861825942993, + "learning_rate": 1.3617707435012606e-05, + "loss": 1.3256, + "step": 23371 + }, + { + "epoch": 0.8370010922699518, + "grad_norm": 1.973078727722168, + "learning_rate": 1.3611864485040982e-05, + "loss": 1.6241, + "step": 23372 + }, + { + "epoch": 0.8370369043995202, + "grad_norm": 1.578711748123169, + "learning_rate": 1.3606022697310316e-05, + "loss": 1.5142, + "step": 23373 + }, + { + "epoch": 0.8370727165290884, + "grad_norm": 1.5022464990615845, + "learning_rate": 1.3600182071899148e-05, + "loss": 1.6072, + "step": 23374 + }, + { + "epoch": 0.8371085286586567, + "grad_norm": 1.498950481414795, + "learning_rate": 1.359434260888608e-05, + "loss": 1.4262, + "step": 23375 + }, + { + "epoch": 0.837144340788225, + "grad_norm": 1.854566216468811, + "learning_rate": 1.3588504308349703e-05, + "loss": 1.3126, + "step": 23376 + }, + { + "epoch": 0.8371801529177932, + "grad_norm": 1.531659483909607, + "learning_rate": 1.3582667170368513e-05, + "loss": 1.4459, + "step": 23377 + }, + { + "epoch": 0.8372159650473615, + "grad_norm": 1.3971465826034546, + "learning_rate": 1.3576831195021067e-05, + "loss": 1.5605, + "step": 23378 + }, + { + "epoch": 0.8372517771769298, + "grad_norm": 1.5634061098098755, + "learning_rate": 1.357099638238587e-05, + "loss": 1.3963, + "step": 23379 + }, + { + "epoch": 0.8372875893064982, + "grad_norm": 1.8827449083328247, + "learning_rate": 1.356516273254147e-05, + "loss": 1.2539, + "step": 23380 + }, + { + "epoch": 0.8373234014360664, + "grad_norm": 2.146848678588867, + "learning_rate": 1.3559330245566282e-05, + "loss": 1.4573, + "step": 23381 + }, + { + "epoch": 0.8373592135656347, + "grad_norm": 1.4957455396652222, + "learning_rate": 1.3553498921538798e-05, + "loss": 1.4717, + "step": 23382 + }, + { + "epoch": 0.837395025695203, + "grad_norm": 1.9274322986602783, + "learning_rate": 1.3547668760537514e-05, + "loss": 1.6203, + "step": 23383 + }, + { + "epoch": 0.8374308378247712, + "grad_norm": 2.2220675945281982, + "learning_rate": 1.3541839762640796e-05, + "loss": 1.3541, + "step": 23384 + }, + { + "epoch": 0.8374666499543395, + "grad_norm": 1.7440205812454224, + "learning_rate": 1.3536011927927117e-05, + "loss": 1.4781, + "step": 23385 + }, + { + "epoch": 0.8375024620839078, + "grad_norm": 1.5415992736816406, + "learning_rate": 1.3530185256474848e-05, + "loss": 1.4387, + "step": 23386 + }, + { + "epoch": 0.8375382742134762, + "grad_norm": 1.6011251211166382, + "learning_rate": 1.3524359748362437e-05, + "loss": 1.4087, + "step": 23387 + }, + { + "epoch": 0.8375740863430444, + "grad_norm": 2.023725986480713, + "learning_rate": 1.3518535403668186e-05, + "loss": 1.3027, + "step": 23388 + }, + { + "epoch": 0.8376098984726127, + "grad_norm": 1.5329786539077759, + "learning_rate": 1.3512712222470491e-05, + "loss": 1.1108, + "step": 23389 + }, + { + "epoch": 0.837645710602181, + "grad_norm": 1.8306118249893188, + "learning_rate": 1.3506890204847722e-05, + "loss": 1.6126, + "step": 23390 + }, + { + "epoch": 0.8376815227317492, + "grad_norm": 1.6327990293502808, + "learning_rate": 1.3501069350878149e-05, + "loss": 1.2224, + "step": 23391 + }, + { + "epoch": 0.8377173348613175, + "grad_norm": 1.5039691925048828, + "learning_rate": 1.3495249660640142e-05, + "loss": 1.1612, + "step": 23392 + }, + { + "epoch": 0.8377531469908858, + "grad_norm": 1.8655650615692139, + "learning_rate": 1.3489431134211916e-05, + "loss": 1.4411, + "step": 23393 + }, + { + "epoch": 0.8377889591204541, + "grad_norm": 1.7704803943634033, + "learning_rate": 1.3483613771671843e-05, + "loss": 1.5848, + "step": 23394 + }, + { + "epoch": 0.8378247712500224, + "grad_norm": 1.495496153831482, + "learning_rate": 1.3477797573098128e-05, + "loss": 1.3929, + "step": 23395 + }, + { + "epoch": 0.8378605833795907, + "grad_norm": 1.5385957956314087, + "learning_rate": 1.347198253856905e-05, + "loss": 1.2143, + "step": 23396 + }, + { + "epoch": 0.837896395509159, + "grad_norm": 2.3165247440338135, + "learning_rate": 1.3466168668162827e-05, + "loss": 1.2349, + "step": 23397 + }, + { + "epoch": 0.8379322076387272, + "grad_norm": 1.8588618040084839, + "learning_rate": 1.3460355961957704e-05, + "loss": 1.4881, + "step": 23398 + }, + { + "epoch": 0.8379680197682955, + "grad_norm": 1.4896241426467896, + "learning_rate": 1.3454544420031878e-05, + "loss": 1.6074, + "step": 23399 + }, + { + "epoch": 0.8380038318978638, + "grad_norm": 1.9073976278305054, + "learning_rate": 1.3448734042463463e-05, + "loss": 1.7668, + "step": 23400 + }, + { + "epoch": 0.8380396440274321, + "grad_norm": 1.4687809944152832, + "learning_rate": 1.3442924829330738e-05, + "loss": 1.6483, + "step": 23401 + }, + { + "epoch": 0.8380754561570004, + "grad_norm": 1.6176726818084717, + "learning_rate": 1.3437116780711778e-05, + "loss": 1.2699, + "step": 23402 + }, + { + "epoch": 0.8381112682865687, + "grad_norm": 1.1954487562179565, + "learning_rate": 1.3431309896684785e-05, + "loss": 1.2813, + "step": 23403 + }, + { + "epoch": 0.838147080416137, + "grad_norm": 1.294426441192627, + "learning_rate": 1.3425504177327808e-05, + "loss": 1.1881, + "step": 23404 + }, + { + "epoch": 0.8381828925457052, + "grad_norm": 1.699479341506958, + "learning_rate": 1.341969962271904e-05, + "loss": 1.2483, + "step": 23405 + }, + { + "epoch": 0.8382187046752735, + "grad_norm": 1.4392833709716797, + "learning_rate": 1.3413896232936506e-05, + "loss": 1.3719, + "step": 23406 + }, + { + "epoch": 0.8382545168048418, + "grad_norm": 1.4115568399429321, + "learning_rate": 1.3408094008058314e-05, + "loss": 1.3079, + "step": 23407 + }, + { + "epoch": 0.8382903289344101, + "grad_norm": 1.4831119775772095, + "learning_rate": 1.3402292948162554e-05, + "loss": 1.259, + "step": 23408 + }, + { + "epoch": 0.8383261410639784, + "grad_norm": 1.8980132341384888, + "learning_rate": 1.3396493053327208e-05, + "loss": 1.2174, + "step": 23409 + }, + { + "epoch": 0.8383619531935467, + "grad_norm": 1.7400474548339844, + "learning_rate": 1.339069432363036e-05, + "loss": 1.3569, + "step": 23410 + }, + { + "epoch": 0.8383977653231149, + "grad_norm": 1.8057924509048462, + "learning_rate": 1.3384896759149957e-05, + "loss": 1.4134, + "step": 23411 + }, + { + "epoch": 0.8384335774526832, + "grad_norm": 1.5717723369598389, + "learning_rate": 1.3379100359964082e-05, + "loss": 1.6989, + "step": 23412 + }, + { + "epoch": 0.8384693895822515, + "grad_norm": 1.7798326015472412, + "learning_rate": 1.337330512615066e-05, + "loss": 1.3593, + "step": 23413 + }, + { + "epoch": 0.8385052017118197, + "grad_norm": 1.4081906080245972, + "learning_rate": 1.3367511057787707e-05, + "loss": 1.275, + "step": 23414 + }, + { + "epoch": 0.8385410138413881, + "grad_norm": 1.6106690168380737, + "learning_rate": 1.3361718154953096e-05, + "loss": 1.5284, + "step": 23415 + }, + { + "epoch": 0.8385768259709564, + "grad_norm": 1.369853138923645, + "learning_rate": 1.3355926417724852e-05, + "loss": 1.051, + "step": 23416 + }, + { + "epoch": 0.8386126381005247, + "grad_norm": 1.8405297994613647, + "learning_rate": 1.3350135846180856e-05, + "loss": 1.6593, + "step": 23417 + }, + { + "epoch": 0.8386484502300929, + "grad_norm": 1.8503670692443848, + "learning_rate": 1.3344346440398992e-05, + "loss": 1.4476, + "step": 23418 + }, + { + "epoch": 0.8386842623596612, + "grad_norm": 2.0508267879486084, + "learning_rate": 1.3338558200457174e-05, + "loss": 1.3094, + "step": 23419 + }, + { + "epoch": 0.8387200744892295, + "grad_norm": 2.294262647628784, + "learning_rate": 1.3332771126433263e-05, + "loss": 1.4598, + "step": 23420 + }, + { + "epoch": 0.8387558866187977, + "grad_norm": 1.7309292554855347, + "learning_rate": 1.3326985218405152e-05, + "loss": 1.1046, + "step": 23421 + }, + { + "epoch": 0.8387916987483661, + "grad_norm": 1.3808698654174805, + "learning_rate": 1.3321200476450602e-05, + "loss": 1.5908, + "step": 23422 + }, + { + "epoch": 0.8388275108779344, + "grad_norm": 1.4959542751312256, + "learning_rate": 1.3315416900647548e-05, + "loss": 1.298, + "step": 23423 + }, + { + "epoch": 0.8388633230075027, + "grad_norm": 1.4639296531677246, + "learning_rate": 1.3309634491073707e-05, + "loss": 1.5601, + "step": 23424 + }, + { + "epoch": 0.8388991351370709, + "grad_norm": 1.8734369277954102, + "learning_rate": 1.330385324780694e-05, + "loss": 1.5761, + "step": 23425 + }, + { + "epoch": 0.8389349472666392, + "grad_norm": 1.7705508470535278, + "learning_rate": 1.3298073170924986e-05, + "loss": 1.6338, + "step": 23426 + }, + { + "epoch": 0.8389707593962075, + "grad_norm": 1.5225541591644287, + "learning_rate": 1.3292294260505611e-05, + "loss": 1.3579, + "step": 23427 + }, + { + "epoch": 0.8390065715257757, + "grad_norm": 1.517962098121643, + "learning_rate": 1.328651651662659e-05, + "loss": 1.0889, + "step": 23428 + }, + { + "epoch": 0.8390423836553441, + "grad_norm": 1.481013536453247, + "learning_rate": 1.3280739939365617e-05, + "loss": 1.5388, + "step": 23429 + }, + { + "epoch": 0.8390781957849124, + "grad_norm": 2.0160789489746094, + "learning_rate": 1.3274964528800437e-05, + "loss": 1.8554, + "step": 23430 + }, + { + "epoch": 0.8391140079144807, + "grad_norm": 1.6885130405426025, + "learning_rate": 1.3269190285008737e-05, + "loss": 1.433, + "step": 23431 + }, + { + "epoch": 0.8391498200440489, + "grad_norm": 1.642545461654663, + "learning_rate": 1.3263417208068218e-05, + "loss": 1.3426, + "step": 23432 + }, + { + "epoch": 0.8391856321736172, + "grad_norm": 1.5598057508468628, + "learning_rate": 1.325764529805651e-05, + "loss": 1.3445, + "step": 23433 + }, + { + "epoch": 0.8392214443031855, + "grad_norm": 1.5403119325637817, + "learning_rate": 1.3251874555051336e-05, + "loss": 1.5846, + "step": 23434 + }, + { + "epoch": 0.8392572564327537, + "grad_norm": 1.8304781913757324, + "learning_rate": 1.3246104979130281e-05, + "loss": 1.3213, + "step": 23435 + }, + { + "epoch": 0.8392930685623221, + "grad_norm": 1.7894020080566406, + "learning_rate": 1.324033657037097e-05, + "loss": 1.6969, + "step": 23436 + }, + { + "epoch": 0.8393288806918904, + "grad_norm": 1.809398889541626, + "learning_rate": 1.323456932885101e-05, + "loss": 1.4837, + "step": 23437 + }, + { + "epoch": 0.8393646928214586, + "grad_norm": 1.6280808448791504, + "learning_rate": 1.3228803254648004e-05, + "loss": 1.6715, + "step": 23438 + }, + { + "epoch": 0.8394005049510269, + "grad_norm": 1.8543540239334106, + "learning_rate": 1.3223038347839544e-05, + "loss": 1.3243, + "step": 23439 + }, + { + "epoch": 0.8394363170805952, + "grad_norm": 1.5658224821090698, + "learning_rate": 1.321727460850315e-05, + "loss": 1.4643, + "step": 23440 + }, + { + "epoch": 0.8394721292101635, + "grad_norm": 1.7527110576629639, + "learning_rate": 1.321151203671639e-05, + "loss": 1.2166, + "step": 23441 + }, + { + "epoch": 0.8395079413397317, + "grad_norm": 1.9235097169876099, + "learning_rate": 1.320575063255678e-05, + "loss": 1.3466, + "step": 23442 + }, + { + "epoch": 0.8395437534693001, + "grad_norm": 1.8319237232208252, + "learning_rate": 1.3199990396101858e-05, + "loss": 1.3571, + "step": 23443 + }, + { + "epoch": 0.8395795655988684, + "grad_norm": 2.8446924686431885, + "learning_rate": 1.3194231327429085e-05, + "loss": 1.7009, + "step": 23444 + }, + { + "epoch": 0.8396153777284366, + "grad_norm": 2.0625321865081787, + "learning_rate": 1.3188473426615956e-05, + "loss": 1.5693, + "step": 23445 + }, + { + "epoch": 0.8396511898580049, + "grad_norm": 1.4854190349578857, + "learning_rate": 1.3182716693739949e-05, + "loss": 1.1659, + "step": 23446 + }, + { + "epoch": 0.8396870019875732, + "grad_norm": 1.87632155418396, + "learning_rate": 1.3176961128878495e-05, + "loss": 1.3416, + "step": 23447 + }, + { + "epoch": 0.8397228141171414, + "grad_norm": 1.6211196184158325, + "learning_rate": 1.3171206732109031e-05, + "loss": 1.342, + "step": 23448 + }, + { + "epoch": 0.8397586262467097, + "grad_norm": 1.543217658996582, + "learning_rate": 1.3165453503508984e-05, + "loss": 1.5375, + "step": 23449 + }, + { + "epoch": 0.8397944383762781, + "grad_norm": 1.9819585084915161, + "learning_rate": 1.3159701443155759e-05, + "loss": 1.6551, + "step": 23450 + }, + { + "epoch": 0.8398302505058464, + "grad_norm": 1.3575177192687988, + "learning_rate": 1.3153950551126725e-05, + "loss": 1.5489, + "step": 23451 + }, + { + "epoch": 0.8398660626354146, + "grad_norm": 1.597785472869873, + "learning_rate": 1.3148200827499269e-05, + "loss": 1.4854, + "step": 23452 + }, + { + "epoch": 0.8399018747649829, + "grad_norm": 1.6049631834030151, + "learning_rate": 1.3142452272350747e-05, + "loss": 1.3046, + "step": 23453 + }, + { + "epoch": 0.8399376868945512, + "grad_norm": 1.8065446615219116, + "learning_rate": 1.3136704885758477e-05, + "loss": 1.4958, + "step": 23454 + }, + { + "epoch": 0.8399734990241194, + "grad_norm": 1.5930157899856567, + "learning_rate": 1.3130958667799798e-05, + "loss": 1.6832, + "step": 23455 + }, + { + "epoch": 0.8400093111536877, + "grad_norm": 1.551112413406372, + "learning_rate": 1.3125213618552013e-05, + "loss": 1.4293, + "step": 23456 + }, + { + "epoch": 0.8400451232832561, + "grad_norm": 2.0661537647247314, + "learning_rate": 1.3119469738092449e-05, + "loss": 1.3053, + "step": 23457 + }, + { + "epoch": 0.8400809354128244, + "grad_norm": 1.5538454055786133, + "learning_rate": 1.3113727026498323e-05, + "loss": 1.2876, + "step": 23458 + }, + { + "epoch": 0.8401167475423926, + "grad_norm": 1.8126628398895264, + "learning_rate": 1.310798548384693e-05, + "loss": 1.5399, + "step": 23459 + }, + { + "epoch": 0.8401525596719609, + "grad_norm": 1.6119327545166016, + "learning_rate": 1.3102245110215495e-05, + "loss": 1.3692, + "step": 23460 + }, + { + "epoch": 0.8401883718015292, + "grad_norm": 1.673466444015503, + "learning_rate": 1.30965059056813e-05, + "loss": 1.1661, + "step": 23461 + }, + { + "epoch": 0.8402241839310974, + "grad_norm": 3.0586817264556885, + "learning_rate": 1.3090767870321496e-05, + "loss": 1.6918, + "step": 23462 + }, + { + "epoch": 0.8402599960606657, + "grad_norm": 1.7422840595245361, + "learning_rate": 1.30850310042133e-05, + "loss": 1.4367, + "step": 23463 + }, + { + "epoch": 0.8402958081902341, + "grad_norm": 1.521484613418579, + "learning_rate": 1.3079295307433925e-05, + "loss": 1.5184, + "step": 23464 + }, + { + "epoch": 0.8403316203198024, + "grad_norm": 1.3086313009262085, + "learning_rate": 1.307356078006049e-05, + "loss": 1.2359, + "step": 23465 + }, + { + "epoch": 0.8403674324493706, + "grad_norm": 1.7559353113174438, + "learning_rate": 1.3067827422170165e-05, + "loss": 1.593, + "step": 23466 + }, + { + "epoch": 0.8404032445789389, + "grad_norm": 1.462621808052063, + "learning_rate": 1.3062095233840089e-05, + "loss": 1.3325, + "step": 23467 + }, + { + "epoch": 0.8404390567085072, + "grad_norm": 1.2271201610565186, + "learning_rate": 1.30563642151474e-05, + "loss": 1.4682, + "step": 23468 + }, + { + "epoch": 0.8404748688380754, + "grad_norm": 2.206615924835205, + "learning_rate": 1.3050634366169156e-05, + "loss": 1.0847, + "step": 23469 + }, + { + "epoch": 0.8405106809676437, + "grad_norm": 1.6466546058654785, + "learning_rate": 1.3044905686982479e-05, + "loss": 1.3145, + "step": 23470 + }, + { + "epoch": 0.8405464930972121, + "grad_norm": 1.7529257535934448, + "learning_rate": 1.3039178177664458e-05, + "loss": 1.2314, + "step": 23471 + }, + { + "epoch": 0.8405823052267803, + "grad_norm": 1.4902821779251099, + "learning_rate": 1.3033451838292088e-05, + "loss": 1.453, + "step": 23472 + }, + { + "epoch": 0.8406181173563486, + "grad_norm": 1.2909750938415527, + "learning_rate": 1.3027726668942452e-05, + "loss": 1.4757, + "step": 23473 + }, + { + "epoch": 0.8406539294859169, + "grad_norm": 1.7654305696487427, + "learning_rate": 1.3022002669692568e-05, + "loss": 1.2688, + "step": 23474 + }, + { + "epoch": 0.8406897416154852, + "grad_norm": 1.5162429809570312, + "learning_rate": 1.3016279840619461e-05, + "loss": 1.7285, + "step": 23475 + }, + { + "epoch": 0.8407255537450534, + "grad_norm": 1.8618532419204712, + "learning_rate": 1.3010558181800091e-05, + "loss": 1.5294, + "step": 23476 + }, + { + "epoch": 0.8407613658746217, + "grad_norm": 1.9252269268035889, + "learning_rate": 1.3004837693311445e-05, + "loss": 1.8202, + "step": 23477 + }, + { + "epoch": 0.8407971780041901, + "grad_norm": 1.5172358751296997, + "learning_rate": 1.2999118375230523e-05, + "loss": 1.452, + "step": 23478 + }, + { + "epoch": 0.8408329901337583, + "grad_norm": 1.8392778635025024, + "learning_rate": 1.2993400227634211e-05, + "loss": 1.2184, + "step": 23479 + }, + { + "epoch": 0.8408688022633266, + "grad_norm": 1.4657130241394043, + "learning_rate": 1.2987683250599481e-05, + "loss": 1.2283, + "step": 23480 + }, + { + "epoch": 0.8409046143928949, + "grad_norm": 1.8285713195800781, + "learning_rate": 1.2981967444203224e-05, + "loss": 1.2617, + "step": 23481 + }, + { + "epoch": 0.8409404265224631, + "grad_norm": 1.4383978843688965, + "learning_rate": 1.297625280852237e-05, + "loss": 1.2994, + "step": 23482 + }, + { + "epoch": 0.8409762386520314, + "grad_norm": 1.733881950378418, + "learning_rate": 1.297053934363377e-05, + "loss": 1.5984, + "step": 23483 + }, + { + "epoch": 0.8410120507815997, + "grad_norm": 1.4433633089065552, + "learning_rate": 1.2964827049614291e-05, + "loss": 1.3371, + "step": 23484 + }, + { + "epoch": 0.8410478629111681, + "grad_norm": 1.5623021125793457, + "learning_rate": 1.295911592654081e-05, + "loss": 1.8593, + "step": 23485 + }, + { + "epoch": 0.8410836750407363, + "grad_norm": 1.315675973892212, + "learning_rate": 1.2953405974490163e-05, + "loss": 1.326, + "step": 23486 + }, + { + "epoch": 0.8411194871703046, + "grad_norm": 2.0233829021453857, + "learning_rate": 1.2947697193539154e-05, + "loss": 1.499, + "step": 23487 + }, + { + "epoch": 0.8411552992998729, + "grad_norm": 1.2258970737457275, + "learning_rate": 1.2941989583764547e-05, + "loss": 1.0147, + "step": 23488 + }, + { + "epoch": 0.8411911114294411, + "grad_norm": 1.2284082174301147, + "learning_rate": 1.2936283145243222e-05, + "loss": 1.2718, + "step": 23489 + }, + { + "epoch": 0.8412269235590094, + "grad_norm": 1.7486215829849243, + "learning_rate": 1.2930577878051887e-05, + "loss": 1.5571, + "step": 23490 + }, + { + "epoch": 0.8412627356885777, + "grad_norm": 1.4280521869659424, + "learning_rate": 1.2924873782267322e-05, + "loss": 1.5505, + "step": 23491 + }, + { + "epoch": 0.8412985478181461, + "grad_norm": 1.516255259513855, + "learning_rate": 1.2919170857966223e-05, + "loss": 1.2592, + "step": 23492 + }, + { + "epoch": 0.8413343599477143, + "grad_norm": 1.6720364093780518, + "learning_rate": 1.2913469105225407e-05, + "loss": 1.5911, + "step": 23493 + }, + { + "epoch": 0.8413701720772826, + "grad_norm": 1.5660908222198486, + "learning_rate": 1.29077685241215e-05, + "loss": 1.3434, + "step": 23494 + }, + { + "epoch": 0.8414059842068509, + "grad_norm": 1.516623616218567, + "learning_rate": 1.290206911473123e-05, + "loss": 1.3898, + "step": 23495 + }, + { + "epoch": 0.8414417963364191, + "grad_norm": 1.6085141897201538, + "learning_rate": 1.2896370877131293e-05, + "loss": 1.5737, + "step": 23496 + }, + { + "epoch": 0.8414776084659874, + "grad_norm": 1.5538430213928223, + "learning_rate": 1.2890673811398301e-05, + "loss": 1.4526, + "step": 23497 + }, + { + "epoch": 0.8415134205955557, + "grad_norm": 1.6788676977157593, + "learning_rate": 1.2884977917608964e-05, + "loss": 1.6376, + "step": 23498 + }, + { + "epoch": 0.841549232725124, + "grad_norm": 1.8180345296859741, + "learning_rate": 1.287928319583983e-05, + "loss": 1.3496, + "step": 23499 + }, + { + "epoch": 0.8415850448546923, + "grad_norm": 1.3776708841323853, + "learning_rate": 1.2873589646167605e-05, + "loss": 1.3057, + "step": 23500 + }, + { + "epoch": 0.8416208569842606, + "grad_norm": 2.0103936195373535, + "learning_rate": 1.2867897268668826e-05, + "loss": 1.289, + "step": 23501 + }, + { + "epoch": 0.8416566691138289, + "grad_norm": 2.1895813941955566, + "learning_rate": 1.2862206063420113e-05, + "loss": 1.3973, + "step": 23502 + }, + { + "epoch": 0.8416924812433971, + "grad_norm": 1.5710906982421875, + "learning_rate": 1.2856516030497979e-05, + "loss": 1.3817, + "step": 23503 + }, + { + "epoch": 0.8417282933729654, + "grad_norm": 1.7813724279403687, + "learning_rate": 1.2850827169979063e-05, + "loss": 1.259, + "step": 23504 + }, + { + "epoch": 0.8417641055025337, + "grad_norm": 1.3750540018081665, + "learning_rate": 1.284513948193985e-05, + "loss": 1.3579, + "step": 23505 + }, + { + "epoch": 0.841799917632102, + "grad_norm": 1.7597627639770508, + "learning_rate": 1.2839452966456822e-05, + "loss": 1.2448, + "step": 23506 + }, + { + "epoch": 0.8418357297616703, + "grad_norm": 1.2781704664230347, + "learning_rate": 1.2833767623606563e-05, + "loss": 1.4802, + "step": 23507 + }, + { + "epoch": 0.8418715418912386, + "grad_norm": 1.4671543836593628, + "learning_rate": 1.28280834534655e-05, + "loss": 1.1333, + "step": 23508 + }, + { + "epoch": 0.8419073540208069, + "grad_norm": 1.8261942863464355, + "learning_rate": 1.2822400456110162e-05, + "loss": 1.3526, + "step": 23509 + }, + { + "epoch": 0.8419431661503751, + "grad_norm": 1.9040741920471191, + "learning_rate": 1.281671863161693e-05, + "loss": 1.2746, + "step": 23510 + }, + { + "epoch": 0.8419789782799434, + "grad_norm": 1.4849711656570435, + "learning_rate": 1.2811037980062324e-05, + "loss": 1.4179, + "step": 23511 + }, + { + "epoch": 0.8420147904095117, + "grad_norm": 1.4805055856704712, + "learning_rate": 1.2805358501522724e-05, + "loss": 1.4121, + "step": 23512 + }, + { + "epoch": 0.84205060253908, + "grad_norm": 1.6600576639175415, + "learning_rate": 1.279968019607457e-05, + "loss": 1.4056, + "step": 23513 + }, + { + "epoch": 0.8420864146686483, + "grad_norm": 1.296726107597351, + "learning_rate": 1.2794003063794225e-05, + "loss": 1.6649, + "step": 23514 + }, + { + "epoch": 0.8421222267982166, + "grad_norm": 1.372146725654602, + "learning_rate": 1.2788327104758068e-05, + "loss": 1.5393, + "step": 23515 + }, + { + "epoch": 0.8421580389277848, + "grad_norm": 1.7383742332458496, + "learning_rate": 1.278265231904251e-05, + "loss": 1.5947, + "step": 23516 + }, + { + "epoch": 0.8421938510573531, + "grad_norm": 1.5140732526779175, + "learning_rate": 1.277697870672383e-05, + "loss": 1.3892, + "step": 23517 + }, + { + "epoch": 0.8422296631869214, + "grad_norm": 1.4669893980026245, + "learning_rate": 1.2771306267878392e-05, + "loss": 1.2139, + "step": 23518 + }, + { + "epoch": 0.8422654753164897, + "grad_norm": 2.693443536758423, + "learning_rate": 1.2765635002582521e-05, + "loss": 1.362, + "step": 23519 + }, + { + "epoch": 0.842301287446058, + "grad_norm": 1.3767279386520386, + "learning_rate": 1.2759964910912524e-05, + "loss": 1.2912, + "step": 23520 + }, + { + "epoch": 0.8423370995756263, + "grad_norm": 1.9460740089416504, + "learning_rate": 1.275429599294462e-05, + "loss": 1.6095, + "step": 23521 + }, + { + "epoch": 0.8423729117051946, + "grad_norm": 1.8773564100265503, + "learning_rate": 1.2748628248755167e-05, + "loss": 1.5899, + "step": 23522 + }, + { + "epoch": 0.8424087238347628, + "grad_norm": 1.7842673063278198, + "learning_rate": 1.2742961678420385e-05, + "loss": 1.4446, + "step": 23523 + }, + { + "epoch": 0.8424445359643311, + "grad_norm": 1.3329923152923584, + "learning_rate": 1.2737296282016464e-05, + "loss": 1.4283, + "step": 23524 + }, + { + "epoch": 0.8424803480938994, + "grad_norm": 1.8569622039794922, + "learning_rate": 1.2731632059619669e-05, + "loss": 1.3498, + "step": 23525 + }, + { + "epoch": 0.8425161602234676, + "grad_norm": 2.4255120754241943, + "learning_rate": 1.2725969011306204e-05, + "loss": 1.6669, + "step": 23526 + }, + { + "epoch": 0.842551972353036, + "grad_norm": 1.7957884073257446, + "learning_rate": 1.2720307137152266e-05, + "loss": 1.6592, + "step": 23527 + }, + { + "epoch": 0.8425877844826043, + "grad_norm": 1.8802419900894165, + "learning_rate": 1.271464643723399e-05, + "loss": 1.3524, + "step": 23528 + }, + { + "epoch": 0.8426235966121726, + "grad_norm": 1.8517471551895142, + "learning_rate": 1.2708986911627551e-05, + "loss": 1.5125, + "step": 23529 + }, + { + "epoch": 0.8426594087417408, + "grad_norm": 1.721115231513977, + "learning_rate": 1.27033285604091e-05, + "loss": 1.3382, + "step": 23530 + }, + { + "epoch": 0.8426952208713091, + "grad_norm": 1.90151846408844, + "learning_rate": 1.2697671383654786e-05, + "loss": 1.58, + "step": 23531 + }, + { + "epoch": 0.8427310330008774, + "grad_norm": 1.4302324056625366, + "learning_rate": 1.2692015381440658e-05, + "loss": 1.1196, + "step": 23532 + }, + { + "epoch": 0.8427668451304456, + "grad_norm": 1.452170491218567, + "learning_rate": 1.2686360553842857e-05, + "loss": 1.4946, + "step": 23533 + }, + { + "epoch": 0.842802657260014, + "grad_norm": 1.3650285005569458, + "learning_rate": 1.2680706900937455e-05, + "loss": 1.3334, + "step": 23534 + }, + { + "epoch": 0.8428384693895823, + "grad_norm": 1.7403556108474731, + "learning_rate": 1.2675054422800503e-05, + "loss": 1.6425, + "step": 23535 + }, + { + "epoch": 0.8428742815191506, + "grad_norm": 1.6567943096160889, + "learning_rate": 1.2669403119508039e-05, + "loss": 1.5255, + "step": 23536 + }, + { + "epoch": 0.8429100936487188, + "grad_norm": 1.5474529266357422, + "learning_rate": 1.2663752991136112e-05, + "loss": 1.6668, + "step": 23537 + }, + { + "epoch": 0.8429459057782871, + "grad_norm": 1.6595933437347412, + "learning_rate": 1.2658104037760753e-05, + "loss": 1.3727, + "step": 23538 + }, + { + "epoch": 0.8429817179078554, + "grad_norm": 1.8614715337753296, + "learning_rate": 1.2652456259457924e-05, + "loss": 1.4968, + "step": 23539 + }, + { + "epoch": 0.8430175300374236, + "grad_norm": 2.24442720413208, + "learning_rate": 1.2646809656303627e-05, + "loss": 1.5371, + "step": 23540 + }, + { + "epoch": 0.843053342166992, + "grad_norm": 1.914910912513733, + "learning_rate": 1.2641164228373847e-05, + "loss": 1.4012, + "step": 23541 + }, + { + "epoch": 0.8430891542965603, + "grad_norm": 3.1867616176605225, + "learning_rate": 1.2635519975744503e-05, + "loss": 1.6351, + "step": 23542 + }, + { + "epoch": 0.8431249664261286, + "grad_norm": 1.5233803987503052, + "learning_rate": 1.2629876898491532e-05, + "loss": 1.4552, + "step": 23543 + }, + { + "epoch": 0.8431607785556968, + "grad_norm": 1.6941332817077637, + "learning_rate": 1.2624234996690875e-05, + "loss": 1.233, + "step": 23544 + }, + { + "epoch": 0.8431965906852651, + "grad_norm": 1.8111356496810913, + "learning_rate": 1.2618594270418448e-05, + "loss": 1.1046, + "step": 23545 + }, + { + "epoch": 0.8432324028148334, + "grad_norm": 1.4550955295562744, + "learning_rate": 1.2612954719750103e-05, + "loss": 1.3761, + "step": 23546 + }, + { + "epoch": 0.8432682149444016, + "grad_norm": 1.7182893753051758, + "learning_rate": 1.2607316344761733e-05, + "loss": 1.261, + "step": 23547 + }, + { + "epoch": 0.84330402707397, + "grad_norm": 1.9735833406448364, + "learning_rate": 1.2601679145529189e-05, + "loss": 1.2702, + "step": 23548 + }, + { + "epoch": 0.8433398392035383, + "grad_norm": 1.8338557481765747, + "learning_rate": 1.2596043122128343e-05, + "loss": 1.2931, + "step": 23549 + }, + { + "epoch": 0.8433756513331065, + "grad_norm": 2.4479641914367676, + "learning_rate": 1.2590408274634969e-05, + "loss": 1.2256, + "step": 23550 + }, + { + "epoch": 0.8434114634626748, + "grad_norm": 1.5408915281295776, + "learning_rate": 1.2584774603124905e-05, + "loss": 1.352, + "step": 23551 + }, + { + "epoch": 0.8434472755922431, + "grad_norm": 1.7759418487548828, + "learning_rate": 1.2579142107673959e-05, + "loss": 1.282, + "step": 23552 + }, + { + "epoch": 0.8434830877218114, + "grad_norm": 1.3212677240371704, + "learning_rate": 1.2573510788357867e-05, + "loss": 1.3845, + "step": 23553 + }, + { + "epoch": 0.8435188998513796, + "grad_norm": 2.116630792617798, + "learning_rate": 1.2567880645252417e-05, + "loss": 1.23, + "step": 23554 + }, + { + "epoch": 0.843554711980948, + "grad_norm": 2.0516796112060547, + "learning_rate": 1.2562251678433356e-05, + "loss": 1.6818, + "step": 23555 + }, + { + "epoch": 0.8435905241105163, + "grad_norm": 1.6188548803329468, + "learning_rate": 1.2556623887976427e-05, + "loss": 1.6243, + "step": 23556 + }, + { + "epoch": 0.8436263362400845, + "grad_norm": 1.391708254814148, + "learning_rate": 1.255099727395732e-05, + "loss": 1.2512, + "step": 23557 + }, + { + "epoch": 0.8436621483696528, + "grad_norm": 1.5969374179840088, + "learning_rate": 1.2545371836451736e-05, + "loss": 1.2388, + "step": 23558 + }, + { + "epoch": 0.8436979604992211, + "grad_norm": 1.3789353370666504, + "learning_rate": 1.2539747575535387e-05, + "loss": 1.4325, + "step": 23559 + }, + { + "epoch": 0.8437337726287893, + "grad_norm": 2.3373169898986816, + "learning_rate": 1.2534124491283893e-05, + "loss": 1.6212, + "step": 23560 + }, + { + "epoch": 0.8437695847583576, + "grad_norm": 2.5310919284820557, + "learning_rate": 1.2528502583772938e-05, + "loss": 1.6208, + "step": 23561 + }, + { + "epoch": 0.843805396887926, + "grad_norm": 1.8715249300003052, + "learning_rate": 1.252288185307815e-05, + "loss": 1.5006, + "step": 23562 + }, + { + "epoch": 0.8438412090174943, + "grad_norm": 1.8814998865127563, + "learning_rate": 1.2517262299275167e-05, + "loss": 1.52, + "step": 23563 + }, + { + "epoch": 0.8438770211470625, + "grad_norm": 1.4319605827331543, + "learning_rate": 1.2511643922439564e-05, + "loss": 1.4662, + "step": 23564 + }, + { + "epoch": 0.8439128332766308, + "grad_norm": 1.735001564025879, + "learning_rate": 1.2506026722646924e-05, + "loss": 0.974, + "step": 23565 + }, + { + "epoch": 0.8439486454061991, + "grad_norm": 1.460253119468689, + "learning_rate": 1.2500410699972853e-05, + "loss": 1.0201, + "step": 23566 + }, + { + "epoch": 0.8439844575357673, + "grad_norm": 2.5098659992218018, + "learning_rate": 1.2494795854492903e-05, + "loss": 1.7105, + "step": 23567 + }, + { + "epoch": 0.8440202696653356, + "grad_norm": 1.4424971342086792, + "learning_rate": 1.2489182186282577e-05, + "loss": 1.2676, + "step": 23568 + }, + { + "epoch": 0.8440560817949039, + "grad_norm": 1.512639045715332, + "learning_rate": 1.2483569695417418e-05, + "loss": 1.4629, + "step": 23569 + }, + { + "epoch": 0.8440918939244723, + "grad_norm": 1.5117937326431274, + "learning_rate": 1.2477958381972977e-05, + "loss": 1.133, + "step": 23570 + }, + { + "epoch": 0.8441277060540405, + "grad_norm": 1.7218340635299683, + "learning_rate": 1.2472348246024679e-05, + "loss": 1.2329, + "step": 23571 + }, + { + "epoch": 0.8441635181836088, + "grad_norm": 1.6333949565887451, + "learning_rate": 1.2466739287648032e-05, + "loss": 1.5322, + "step": 23572 + }, + { + "epoch": 0.8441993303131771, + "grad_norm": 1.4452733993530273, + "learning_rate": 1.24611315069185e-05, + "loss": 1.3076, + "step": 23573 + }, + { + "epoch": 0.8442351424427453, + "grad_norm": 1.46990168094635, + "learning_rate": 1.2455524903911552e-05, + "loss": 1.3682, + "step": 23574 + }, + { + "epoch": 0.8442709545723136, + "grad_norm": 2.660382032394409, + "learning_rate": 1.2449919478702587e-05, + "loss": 1.623, + "step": 23575 + }, + { + "epoch": 0.8443067667018819, + "grad_norm": 2.3135154247283936, + "learning_rate": 1.2444315231366988e-05, + "loss": 1.5553, + "step": 23576 + }, + { + "epoch": 0.8443425788314503, + "grad_norm": 1.4820939302444458, + "learning_rate": 1.2438712161980226e-05, + "loss": 1.5055, + "step": 23577 + }, + { + "epoch": 0.8443783909610185, + "grad_norm": 1.7251002788543701, + "learning_rate": 1.2433110270617632e-05, + "loss": 1.7433, + "step": 23578 + }, + { + "epoch": 0.8444142030905868, + "grad_norm": 1.5508625507354736, + "learning_rate": 1.2427509557354578e-05, + "loss": 1.2144, + "step": 23579 + }, + { + "epoch": 0.8444500152201551, + "grad_norm": 1.7173960208892822, + "learning_rate": 1.2421910022266425e-05, + "loss": 1.3851, + "step": 23580 + }, + { + "epoch": 0.8444858273497233, + "grad_norm": 1.360556960105896, + "learning_rate": 1.2416311665428526e-05, + "loss": 1.4602, + "step": 23581 + }, + { + "epoch": 0.8445216394792916, + "grad_norm": 1.280659794807434, + "learning_rate": 1.2410714486916164e-05, + "loss": 1.4802, + "step": 23582 + }, + { + "epoch": 0.8445574516088599, + "grad_norm": 1.5254395008087158, + "learning_rate": 1.2405118486804646e-05, + "loss": 1.4026, + "step": 23583 + }, + { + "epoch": 0.8445932637384282, + "grad_norm": 1.5753604173660278, + "learning_rate": 1.2399523665169298e-05, + "loss": 1.353, + "step": 23584 + }, + { + "epoch": 0.8446290758679965, + "grad_norm": 3.215534210205078, + "learning_rate": 1.239393002208533e-05, + "loss": 1.3249, + "step": 23585 + }, + { + "epoch": 0.8446648879975648, + "grad_norm": 1.6247320175170898, + "learning_rate": 1.238833755762806e-05, + "loss": 1.4145, + "step": 23586 + }, + { + "epoch": 0.844700700127133, + "grad_norm": 2.186195135116577, + "learning_rate": 1.2382746271872658e-05, + "loss": 1.7662, + "step": 23587 + }, + { + "epoch": 0.8447365122567013, + "grad_norm": 1.3721939325332642, + "learning_rate": 1.2377156164894422e-05, + "loss": 1.0364, + "step": 23588 + }, + { + "epoch": 0.8447723243862696, + "grad_norm": 1.7674872875213623, + "learning_rate": 1.2371567236768511e-05, + "loss": 1.2248, + "step": 23589 + }, + { + "epoch": 0.8448081365158379, + "grad_norm": 1.807494044303894, + "learning_rate": 1.2365979487570122e-05, + "loss": 1.3406, + "step": 23590 + }, + { + "epoch": 0.8448439486454062, + "grad_norm": 3.335813522338867, + "learning_rate": 1.2360392917374442e-05, + "loss": 1.4115, + "step": 23591 + }, + { + "epoch": 0.8448797607749745, + "grad_norm": 1.336086392402649, + "learning_rate": 1.235480752625665e-05, + "loss": 1.582, + "step": 23592 + }, + { + "epoch": 0.8449155729045428, + "grad_norm": 2.0651471614837646, + "learning_rate": 1.234922331429188e-05, + "loss": 1.2444, + "step": 23593 + }, + { + "epoch": 0.844951385034111, + "grad_norm": 1.5631545782089233, + "learning_rate": 1.2343640281555191e-05, + "loss": 1.1421, + "step": 23594 + }, + { + "epoch": 0.8449871971636793, + "grad_norm": 2.9039080142974854, + "learning_rate": 1.2338058428121802e-05, + "loss": 1.5782, + "step": 23595 + }, + { + "epoch": 0.8450230092932476, + "grad_norm": 1.8071566820144653, + "learning_rate": 1.233247775406674e-05, + "loss": 1.4459, + "step": 23596 + }, + { + "epoch": 0.8450588214228159, + "grad_norm": 1.7052816152572632, + "learning_rate": 1.2326898259465125e-05, + "loss": 1.6382, + "step": 23597 + }, + { + "epoch": 0.8450946335523842, + "grad_norm": 1.8090574741363525, + "learning_rate": 1.2321319944391963e-05, + "loss": 1.3878, + "step": 23598 + }, + { + "epoch": 0.8451304456819525, + "grad_norm": 1.5731558799743652, + "learning_rate": 1.2315742808922382e-05, + "loss": 1.2031, + "step": 23599 + }, + { + "epoch": 0.8451662578115208, + "grad_norm": 1.4847115278244019, + "learning_rate": 1.2310166853131366e-05, + "loss": 1.4141, + "step": 23600 + }, + { + "epoch": 0.845202069941089, + "grad_norm": 1.2769142389297485, + "learning_rate": 1.2304592077093958e-05, + "loss": 1.3135, + "step": 23601 + }, + { + "epoch": 0.8452378820706573, + "grad_norm": 1.4810028076171875, + "learning_rate": 1.2299018480885117e-05, + "loss": 1.0614, + "step": 23602 + }, + { + "epoch": 0.8452736942002256, + "grad_norm": 1.6237541437149048, + "learning_rate": 1.2293446064579873e-05, + "loss": 1.2855, + "step": 23603 + }, + { + "epoch": 0.8453095063297938, + "grad_norm": 1.9380799531936646, + "learning_rate": 1.2287874828253187e-05, + "loss": 1.5237, + "step": 23604 + }, + { + "epoch": 0.8453453184593622, + "grad_norm": 2.556157350540161, + "learning_rate": 1.2282304771979958e-05, + "loss": 1.4906, + "step": 23605 + }, + { + "epoch": 0.8453811305889305, + "grad_norm": 2.639732837677002, + "learning_rate": 1.2276735895835223e-05, + "loss": 1.5245, + "step": 23606 + }, + { + "epoch": 0.8454169427184988, + "grad_norm": 1.5100208520889282, + "learning_rate": 1.2271168199893834e-05, + "loss": 1.6371, + "step": 23607 + }, + { + "epoch": 0.845452754848067, + "grad_norm": 1.5558120012283325, + "learning_rate": 1.2265601684230732e-05, + "loss": 1.3295, + "step": 23608 + }, + { + "epoch": 0.8454885669776353, + "grad_norm": 1.4905898571014404, + "learning_rate": 1.2260036348920745e-05, + "loss": 1.6187, + "step": 23609 + }, + { + "epoch": 0.8455243791072036, + "grad_norm": 1.5503627061843872, + "learning_rate": 1.2254472194038835e-05, + "loss": 1.4804, + "step": 23610 + }, + { + "epoch": 0.8455601912367718, + "grad_norm": 1.5962167978286743, + "learning_rate": 1.224890921965981e-05, + "loss": 1.4638, + "step": 23611 + }, + { + "epoch": 0.8455960033663402, + "grad_norm": 1.7058485746383667, + "learning_rate": 1.2243347425858508e-05, + "loss": 1.4108, + "step": 23612 + }, + { + "epoch": 0.8456318154959085, + "grad_norm": 2.0142135620117188, + "learning_rate": 1.2237786812709773e-05, + "loss": 1.6224, + "step": 23613 + }, + { + "epoch": 0.8456676276254768, + "grad_norm": 1.3440698385238647, + "learning_rate": 1.2232227380288408e-05, + "loss": 1.5021, + "step": 23614 + }, + { + "epoch": 0.845703439755045, + "grad_norm": 1.5169098377227783, + "learning_rate": 1.2226669128669232e-05, + "loss": 1.3667, + "step": 23615 + }, + { + "epoch": 0.8457392518846133, + "grad_norm": 2.082235813140869, + "learning_rate": 1.2221112057926954e-05, + "loss": 1.5047, + "step": 23616 + }, + { + "epoch": 0.8457750640141816, + "grad_norm": 1.3239139318466187, + "learning_rate": 1.2215556168136443e-05, + "loss": 1.5601, + "step": 23617 + }, + { + "epoch": 0.8458108761437498, + "grad_norm": 1.6118295192718506, + "learning_rate": 1.2210001459372355e-05, + "loss": 1.5155, + "step": 23618 + }, + { + "epoch": 0.8458466882733182, + "grad_norm": 1.7186284065246582, + "learning_rate": 1.2204447931709484e-05, + "loss": 1.6944, + "step": 23619 + }, + { + "epoch": 0.8458825004028865, + "grad_norm": 1.6014262437820435, + "learning_rate": 1.2198895585222503e-05, + "loss": 1.5028, + "step": 23620 + }, + { + "epoch": 0.8459183125324548, + "grad_norm": 1.8666081428527832, + "learning_rate": 1.219334441998612e-05, + "loss": 1.4784, + "step": 23621 + }, + { + "epoch": 0.845954124662023, + "grad_norm": 1.4160361289978027, + "learning_rate": 1.2187794436075039e-05, + "loss": 1.0623, + "step": 23622 + }, + { + "epoch": 0.8459899367915913, + "grad_norm": 1.6083838939666748, + "learning_rate": 1.2182245633563905e-05, + "loss": 1.2759, + "step": 23623 + }, + { + "epoch": 0.8460257489211596, + "grad_norm": 1.7315397262573242, + "learning_rate": 1.2176698012527376e-05, + "loss": 1.3641, + "step": 23624 + }, + { + "epoch": 0.8460615610507278, + "grad_norm": 1.6413108110427856, + "learning_rate": 1.2171151573040085e-05, + "loss": 1.5172, + "step": 23625 + }, + { + "epoch": 0.8460973731802962, + "grad_norm": 2.129379987716675, + "learning_rate": 1.2165606315176691e-05, + "loss": 1.7346, + "step": 23626 + }, + { + "epoch": 0.8461331853098645, + "grad_norm": 1.699070930480957, + "learning_rate": 1.2160062239011739e-05, + "loss": 1.3449, + "step": 23627 + }, + { + "epoch": 0.8461689974394327, + "grad_norm": 1.3669846057891846, + "learning_rate": 1.2154519344619841e-05, + "loss": 1.5008, + "step": 23628 + }, + { + "epoch": 0.846204809569001, + "grad_norm": 1.5487329959869385, + "learning_rate": 1.2148977632075598e-05, + "loss": 1.569, + "step": 23629 + }, + { + "epoch": 0.8462406216985693, + "grad_norm": 2.4239537715911865, + "learning_rate": 1.2143437101453514e-05, + "loss": 1.4715, + "step": 23630 + }, + { + "epoch": 0.8462764338281376, + "grad_norm": 1.408677101135254, + "learning_rate": 1.2137897752828165e-05, + "loss": 1.3176, + "step": 23631 + }, + { + "epoch": 0.8463122459577058, + "grad_norm": 1.3192760944366455, + "learning_rate": 1.2132359586274067e-05, + "loss": 1.4138, + "step": 23632 + }, + { + "epoch": 0.8463480580872742, + "grad_norm": 1.8194007873535156, + "learning_rate": 1.212682260186575e-05, + "loss": 1.7193, + "step": 23633 + }, + { + "epoch": 0.8463838702168425, + "grad_norm": 1.4878827333450317, + "learning_rate": 1.2121286799677667e-05, + "loss": 0.9791, + "step": 23634 + }, + { + "epoch": 0.8464196823464107, + "grad_norm": 1.69992995262146, + "learning_rate": 1.2115752179784312e-05, + "loss": 1.4171, + "step": 23635 + }, + { + "epoch": 0.846455494475979, + "grad_norm": 1.8349891901016235, + "learning_rate": 1.211021874226015e-05, + "loss": 1.8357, + "step": 23636 + }, + { + "epoch": 0.8464913066055473, + "grad_norm": 1.3100531101226807, + "learning_rate": 1.2104686487179639e-05, + "loss": 1.4282, + "step": 23637 + }, + { + "epoch": 0.8465271187351155, + "grad_norm": 1.4935340881347656, + "learning_rate": 1.209915541461718e-05, + "loss": 1.4017, + "step": 23638 + }, + { + "epoch": 0.8465629308646838, + "grad_norm": 1.7822998762130737, + "learning_rate": 1.20936255246472e-05, + "loss": 1.1659, + "step": 23639 + }, + { + "epoch": 0.8465987429942522, + "grad_norm": 1.7286075353622437, + "learning_rate": 1.2088096817344118e-05, + "loss": 1.8532, + "step": 23640 + }, + { + "epoch": 0.8466345551238205, + "grad_norm": 1.912163496017456, + "learning_rate": 1.2082569292782275e-05, + "loss": 1.4164, + "step": 23641 + }, + { + "epoch": 0.8466703672533887, + "grad_norm": 1.4342498779296875, + "learning_rate": 1.2077042951036055e-05, + "loss": 1.1178, + "step": 23642 + }, + { + "epoch": 0.846706179382957, + "grad_norm": 2.4091644287109375, + "learning_rate": 1.207151779217981e-05, + "loss": 1.2119, + "step": 23643 + }, + { + "epoch": 0.8467419915125253, + "grad_norm": 1.5198973417282104, + "learning_rate": 1.2065993816287901e-05, + "loss": 1.1991, + "step": 23644 + }, + { + "epoch": 0.8467778036420935, + "grad_norm": 1.4124888181686401, + "learning_rate": 1.2060471023434594e-05, + "loss": 1.6583, + "step": 23645 + }, + { + "epoch": 0.8468136157716618, + "grad_norm": 1.6898411512374878, + "learning_rate": 1.2054949413694216e-05, + "loss": 1.2365, + "step": 23646 + }, + { + "epoch": 0.8468494279012302, + "grad_norm": 1.4667853116989136, + "learning_rate": 1.2049428987141065e-05, + "loss": 1.5452, + "step": 23647 + }, + { + "epoch": 0.8468852400307985, + "grad_norm": 1.607649326324463, + "learning_rate": 1.204390974384939e-05, + "loss": 1.347, + "step": 23648 + }, + { + "epoch": 0.8469210521603667, + "grad_norm": 1.8424022197723389, + "learning_rate": 1.2038391683893446e-05, + "loss": 1.7185, + "step": 23649 + }, + { + "epoch": 0.846956864289935, + "grad_norm": 1.5463377237319946, + "learning_rate": 1.2032874807347484e-05, + "loss": 1.5278, + "step": 23650 + }, + { + "epoch": 0.8469926764195033, + "grad_norm": 1.372738242149353, + "learning_rate": 1.2027359114285741e-05, + "loss": 1.4912, + "step": 23651 + }, + { + "epoch": 0.8470284885490715, + "grad_norm": 1.7172951698303223, + "learning_rate": 1.2021844604782384e-05, + "loss": 1.6273, + "step": 23652 + }, + { + "epoch": 0.8470643006786398, + "grad_norm": 1.8303214311599731, + "learning_rate": 1.2016331278911619e-05, + "loss": 1.2777, + "step": 23653 + }, + { + "epoch": 0.8471001128082082, + "grad_norm": 1.78458833694458, + "learning_rate": 1.201081913674763e-05, + "loss": 1.6966, + "step": 23654 + }, + { + "epoch": 0.8471359249377765, + "grad_norm": 1.1796941757202148, + "learning_rate": 1.2005308178364593e-05, + "loss": 1.457, + "step": 23655 + }, + { + "epoch": 0.8471717370673447, + "grad_norm": 1.6931653022766113, + "learning_rate": 1.1999798403836615e-05, + "loss": 1.0998, + "step": 23656 + }, + { + "epoch": 0.847207549196913, + "grad_norm": 1.5502809286117554, + "learning_rate": 1.1994289813237835e-05, + "loss": 1.3704, + "step": 23657 + }, + { + "epoch": 0.8472433613264813, + "grad_norm": 1.3517582416534424, + "learning_rate": 1.1988782406642385e-05, + "loss": 1.149, + "step": 23658 + }, + { + "epoch": 0.8472791734560495, + "grad_norm": 1.7601218223571777, + "learning_rate": 1.1983276184124314e-05, + "loss": 1.4605, + "step": 23659 + }, + { + "epoch": 0.8473149855856178, + "grad_norm": 1.4631924629211426, + "learning_rate": 1.1977771145757733e-05, + "loss": 1.5351, + "step": 23660 + }, + { + "epoch": 0.8473507977151862, + "grad_norm": 1.9184998273849487, + "learning_rate": 1.1972267291616702e-05, + "loss": 1.5686, + "step": 23661 + }, + { + "epoch": 0.8473866098447544, + "grad_norm": 1.5700645446777344, + "learning_rate": 1.1966764621775284e-05, + "loss": 1.1868, + "step": 23662 + }, + { + "epoch": 0.8474224219743227, + "grad_norm": 1.7700568437576294, + "learning_rate": 1.1961263136307477e-05, + "loss": 1.2635, + "step": 23663 + }, + { + "epoch": 0.847458234103891, + "grad_norm": 1.7117582559585571, + "learning_rate": 1.195576283528731e-05, + "loss": 1.5889, + "step": 23664 + }, + { + "epoch": 0.8474940462334593, + "grad_norm": 1.486767053604126, + "learning_rate": 1.1950263718788812e-05, + "loss": 1.6332, + "step": 23665 + }, + { + "epoch": 0.8475298583630275, + "grad_norm": 1.6834611892700195, + "learning_rate": 1.1944765786885914e-05, + "loss": 1.4774, + "step": 23666 + }, + { + "epoch": 0.8475656704925958, + "grad_norm": 1.8783506155014038, + "learning_rate": 1.1939269039652612e-05, + "loss": 1.5829, + "step": 23667 + }, + { + "epoch": 0.8476014826221642, + "grad_norm": 1.8544740676879883, + "learning_rate": 1.1933773477162847e-05, + "loss": 1.6606, + "step": 23668 + }, + { + "epoch": 0.8476372947517324, + "grad_norm": 1.5424895286560059, + "learning_rate": 1.192827909949059e-05, + "loss": 1.3291, + "step": 23669 + }, + { + "epoch": 0.8476731068813007, + "grad_norm": 1.9440540075302124, + "learning_rate": 1.1922785906709711e-05, + "loss": 1.461, + "step": 23670 + }, + { + "epoch": 0.847708919010869, + "grad_norm": 2.229961395263672, + "learning_rate": 1.1917293898894145e-05, + "loss": 1.4525, + "step": 23671 + }, + { + "epoch": 0.8477447311404372, + "grad_norm": 1.5149637460708618, + "learning_rate": 1.1911803076117777e-05, + "loss": 1.3611, + "step": 23672 + }, + { + "epoch": 0.8477805432700055, + "grad_norm": 1.4159404039382935, + "learning_rate": 1.1906313438454464e-05, + "loss": 1.444, + "step": 23673 + }, + { + "epoch": 0.8478163553995738, + "grad_norm": 1.359679937362671, + "learning_rate": 1.1900824985978066e-05, + "loss": 1.1712, + "step": 23674 + }, + { + "epoch": 0.8478521675291422, + "grad_norm": 1.24271821975708, + "learning_rate": 1.1895337718762422e-05, + "loss": 1.493, + "step": 23675 + }, + { + "epoch": 0.8478879796587104, + "grad_norm": 1.549537181854248, + "learning_rate": 1.1889851636881388e-05, + "loss": 1.3426, + "step": 23676 + }, + { + "epoch": 0.8479237917882787, + "grad_norm": 1.8525686264038086, + "learning_rate": 1.1884366740408726e-05, + "loss": 1.4973, + "step": 23677 + }, + { + "epoch": 0.847959603917847, + "grad_norm": 1.6255896091461182, + "learning_rate": 1.1878883029418253e-05, + "loss": 1.6222, + "step": 23678 + }, + { + "epoch": 0.8479954160474152, + "grad_norm": 1.3341083526611328, + "learning_rate": 1.1873400503983733e-05, + "loss": 1.4786, + "step": 23679 + }, + { + "epoch": 0.8480312281769835, + "grad_norm": 1.8212075233459473, + "learning_rate": 1.1867919164178964e-05, + "loss": 1.4576, + "step": 23680 + }, + { + "epoch": 0.8480670403065518, + "grad_norm": 1.6231342554092407, + "learning_rate": 1.1862439010077653e-05, + "loss": 1.5524, + "step": 23681 + }, + { + "epoch": 0.8481028524361202, + "grad_norm": 1.5469783544540405, + "learning_rate": 1.1856960041753495e-05, + "loss": 1.3759, + "step": 23682 + }, + { + "epoch": 0.8481386645656884, + "grad_norm": 1.431070327758789, + "learning_rate": 1.185148225928029e-05, + "loss": 1.7274, + "step": 23683 + }, + { + "epoch": 0.8481744766952567, + "grad_norm": 1.7324550151824951, + "learning_rate": 1.1846005662731663e-05, + "loss": 1.6348, + "step": 23684 + }, + { + "epoch": 0.848210288824825, + "grad_norm": 1.7936501502990723, + "learning_rate": 1.1840530252181336e-05, + "loss": 1.6526, + "step": 23685 + }, + { + "epoch": 0.8482461009543932, + "grad_norm": 1.3822287321090698, + "learning_rate": 1.1835056027702918e-05, + "loss": 1.3794, + "step": 23686 + }, + { + "epoch": 0.8482819130839615, + "grad_norm": 1.904231071472168, + "learning_rate": 1.1829582989370148e-05, + "loss": 1.6432, + "step": 23687 + }, + { + "epoch": 0.8483177252135298, + "grad_norm": 2.0496718883514404, + "learning_rate": 1.1824111137256577e-05, + "loss": 1.3671, + "step": 23688 + }, + { + "epoch": 0.8483535373430982, + "grad_norm": 2.090599298477173, + "learning_rate": 1.1818640471435848e-05, + "loss": 1.6222, + "step": 23689 + }, + { + "epoch": 0.8483893494726664, + "grad_norm": 1.9762262105941772, + "learning_rate": 1.1813170991981593e-05, + "loss": 1.5517, + "step": 23690 + }, + { + "epoch": 0.8484251616022347, + "grad_norm": 1.5518031120300293, + "learning_rate": 1.1807702698967349e-05, + "loss": 1.4667, + "step": 23691 + }, + { + "epoch": 0.848460973731803, + "grad_norm": 1.2942707538604736, + "learning_rate": 1.1802235592466727e-05, + "loss": 1.093, + "step": 23692 + }, + { + "epoch": 0.8484967858613712, + "grad_norm": 2.022648334503174, + "learning_rate": 1.179676967255321e-05, + "loss": 1.55, + "step": 23693 + }, + { + "epoch": 0.8485325979909395, + "grad_norm": 1.2496730089187622, + "learning_rate": 1.1791304939300429e-05, + "loss": 0.9355, + "step": 23694 + }, + { + "epoch": 0.8485684101205078, + "grad_norm": 1.1895911693572998, + "learning_rate": 1.1785841392781838e-05, + "loss": 1.1347, + "step": 23695 + }, + { + "epoch": 0.8486042222500761, + "grad_norm": 1.413276195526123, + "learning_rate": 1.1780379033070988e-05, + "loss": 1.4609, + "step": 23696 + }, + { + "epoch": 0.8486400343796444, + "grad_norm": 1.4332022666931152, + "learning_rate": 1.1774917860241297e-05, + "loss": 1.478, + "step": 23697 + }, + { + "epoch": 0.8486758465092127, + "grad_norm": 1.54298734664917, + "learning_rate": 1.1769457874366318e-05, + "loss": 1.2612, + "step": 23698 + }, + { + "epoch": 0.848711658638781, + "grad_norm": 1.376090168952942, + "learning_rate": 1.1763999075519482e-05, + "loss": 1.2319, + "step": 23699 + }, + { + "epoch": 0.8487474707683492, + "grad_norm": 1.784658670425415, + "learning_rate": 1.1758541463774186e-05, + "loss": 1.53, + "step": 23700 + }, + { + "epoch": 0.8487832828979175, + "grad_norm": 2.318807363510132, + "learning_rate": 1.1753085039203926e-05, + "loss": 1.2602, + "step": 23701 + }, + { + "epoch": 0.8488190950274858, + "grad_norm": 1.3570774793624878, + "learning_rate": 1.1747629801882054e-05, + "loss": 1.635, + "step": 23702 + }, + { + "epoch": 0.8488549071570541, + "grad_norm": 2.0229833126068115, + "learning_rate": 1.1742175751882012e-05, + "loss": 1.5221, + "step": 23703 + }, + { + "epoch": 0.8488907192866224, + "grad_norm": 1.5566153526306152, + "learning_rate": 1.1736722889277107e-05, + "loss": 1.4815, + "step": 23704 + }, + { + "epoch": 0.8489265314161907, + "grad_norm": 2.224531888961792, + "learning_rate": 1.1731271214140783e-05, + "loss": 1.5376, + "step": 23705 + }, + { + "epoch": 0.848962343545759, + "grad_norm": 2.213486671447754, + "learning_rate": 1.1725820726546322e-05, + "loss": 1.5942, + "step": 23706 + }, + { + "epoch": 0.8489981556753272, + "grad_norm": 1.877007246017456, + "learning_rate": 1.1720371426567111e-05, + "loss": 1.4773, + "step": 23707 + }, + { + "epoch": 0.8490339678048955, + "grad_norm": 1.609832763671875, + "learning_rate": 1.1714923314276405e-05, + "loss": 1.2998, + "step": 23708 + }, + { + "epoch": 0.8490697799344638, + "grad_norm": 1.7458049058914185, + "learning_rate": 1.170947638974752e-05, + "loss": 1.4685, + "step": 23709 + }, + { + "epoch": 0.8491055920640321, + "grad_norm": 2.7425076961517334, + "learning_rate": 1.1704030653053766e-05, + "loss": 1.5496, + "step": 23710 + }, + { + "epoch": 0.8491414041936004, + "grad_norm": 1.5261955261230469, + "learning_rate": 1.1698586104268372e-05, + "loss": 1.4367, + "step": 23711 + }, + { + "epoch": 0.8491772163231687, + "grad_norm": 1.8349772691726685, + "learning_rate": 1.169314274346459e-05, + "loss": 1.3865, + "step": 23712 + }, + { + "epoch": 0.8492130284527369, + "grad_norm": 1.9247404336929321, + "learning_rate": 1.1687700570715677e-05, + "loss": 1.3468, + "step": 23713 + }, + { + "epoch": 0.8492488405823052, + "grad_norm": 1.6313281059265137, + "learning_rate": 1.1682259586094845e-05, + "loss": 1.3691, + "step": 23714 + }, + { + "epoch": 0.8492846527118735, + "grad_norm": 1.6033432483673096, + "learning_rate": 1.1676819789675264e-05, + "loss": 1.3165, + "step": 23715 + }, + { + "epoch": 0.8493204648414417, + "grad_norm": 1.6103150844573975, + "learning_rate": 1.1671381181530171e-05, + "loss": 1.4167, + "step": 23716 + }, + { + "epoch": 0.8493562769710101, + "grad_norm": 1.6567761898040771, + "learning_rate": 1.1665943761732712e-05, + "loss": 1.4802, + "step": 23717 + }, + { + "epoch": 0.8493920891005784, + "grad_norm": 1.9496479034423828, + "learning_rate": 1.1660507530356024e-05, + "loss": 1.4007, + "step": 23718 + }, + { + "epoch": 0.8494279012301467, + "grad_norm": 1.6476153135299683, + "learning_rate": 1.1655072487473251e-05, + "loss": 1.3635, + "step": 23719 + }, + { + "epoch": 0.8494637133597149, + "grad_norm": 1.7189626693725586, + "learning_rate": 1.1649638633157523e-05, + "loss": 1.6882, + "step": 23720 + }, + { + "epoch": 0.8494995254892832, + "grad_norm": 1.4892523288726807, + "learning_rate": 1.1644205967481959e-05, + "loss": 1.2611, + "step": 23721 + }, + { + "epoch": 0.8495353376188515, + "grad_norm": 1.507788896560669, + "learning_rate": 1.1638774490519622e-05, + "loss": 1.5566, + "step": 23722 + }, + { + "epoch": 0.8495711497484197, + "grad_norm": 2.1150612831115723, + "learning_rate": 1.1633344202343587e-05, + "loss": 1.3369, + "step": 23723 + }, + { + "epoch": 0.8496069618779881, + "grad_norm": 1.6760443449020386, + "learning_rate": 1.162791510302692e-05, + "loss": 1.0433, + "step": 23724 + }, + { + "epoch": 0.8496427740075564, + "grad_norm": 1.6764628887176514, + "learning_rate": 1.1622487192642694e-05, + "loss": 1.3644, + "step": 23725 + }, + { + "epoch": 0.8496785861371247, + "grad_norm": 1.971871018409729, + "learning_rate": 1.1617060471263875e-05, + "loss": 1.2388, + "step": 23726 + }, + { + "epoch": 0.8497143982666929, + "grad_norm": 2.7978687286376953, + "learning_rate": 1.1611634938963512e-05, + "loss": 1.2827, + "step": 23727 + }, + { + "epoch": 0.8497502103962612, + "grad_norm": 2.1130940914154053, + "learning_rate": 1.1606210595814593e-05, + "loss": 1.4592, + "step": 23728 + }, + { + "epoch": 0.8497860225258295, + "grad_norm": 1.4283808469772339, + "learning_rate": 1.1600787441890082e-05, + "loss": 1.4418, + "step": 23729 + }, + { + "epoch": 0.8498218346553977, + "grad_norm": 1.7132954597473145, + "learning_rate": 1.1595365477262944e-05, + "loss": 1.5116, + "step": 23730 + }, + { + "epoch": 0.8498576467849661, + "grad_norm": 1.5262160301208496, + "learning_rate": 1.1589944702006129e-05, + "loss": 1.1867, + "step": 23731 + }, + { + "epoch": 0.8498934589145344, + "grad_norm": 2.8375978469848633, + "learning_rate": 1.158452511619259e-05, + "loss": 1.5645, + "step": 23732 + }, + { + "epoch": 0.8499292710441027, + "grad_norm": 1.3872442245483398, + "learning_rate": 1.1579106719895205e-05, + "loss": 1.4656, + "step": 23733 + }, + { + "epoch": 0.8499650831736709, + "grad_norm": 2.849210262298584, + "learning_rate": 1.157368951318687e-05, + "loss": 1.7573, + "step": 23734 + }, + { + "epoch": 0.8500008953032392, + "grad_norm": 1.7254695892333984, + "learning_rate": 1.1568273496140513e-05, + "loss": 1.4489, + "step": 23735 + }, + { + "epoch": 0.8500367074328075, + "grad_norm": 1.4236689805984497, + "learning_rate": 1.1562858668828936e-05, + "loss": 1.6148, + "step": 23736 + }, + { + "epoch": 0.8500725195623757, + "grad_norm": 1.5997804403305054, + "learning_rate": 1.1557445031325032e-05, + "loss": 1.3078, + "step": 23737 + }, + { + "epoch": 0.8501083316919441, + "grad_norm": 2.8871943950653076, + "learning_rate": 1.1552032583701612e-05, + "loss": 1.547, + "step": 23738 + }, + { + "epoch": 0.8501441438215124, + "grad_norm": 1.5523663759231567, + "learning_rate": 1.1546621326031526e-05, + "loss": 1.407, + "step": 23739 + }, + { + "epoch": 0.8501799559510806, + "grad_norm": 1.4825963973999023, + "learning_rate": 1.154121125838754e-05, + "loss": 1.4152, + "step": 23740 + }, + { + "epoch": 0.8502157680806489, + "grad_norm": 1.7809697389602661, + "learning_rate": 1.1535802380842453e-05, + "loss": 1.2848, + "step": 23741 + }, + { + "epoch": 0.8502515802102172, + "grad_norm": 1.843872308731079, + "learning_rate": 1.1530394693469026e-05, + "loss": 1.4454, + "step": 23742 + }, + { + "epoch": 0.8502873923397855, + "grad_norm": 1.6552728414535522, + "learning_rate": 1.1524988196340048e-05, + "loss": 1.4238, + "step": 23743 + }, + { + "epoch": 0.8503232044693537, + "grad_norm": 1.1542280912399292, + "learning_rate": 1.1519582889528202e-05, + "loss": 1.3864, + "step": 23744 + }, + { + "epoch": 0.8503590165989221, + "grad_norm": 1.427795648574829, + "learning_rate": 1.1514178773106243e-05, + "loss": 1.2635, + "step": 23745 + }, + { + "epoch": 0.8503948287284904, + "grad_norm": 1.981183409690857, + "learning_rate": 1.150877584714689e-05, + "loss": 1.3967, + "step": 23746 + }, + { + "epoch": 0.8504306408580586, + "grad_norm": 1.5299640893936157, + "learning_rate": 1.1503374111722786e-05, + "loss": 1.6025, + "step": 23747 + }, + { + "epoch": 0.8504664529876269, + "grad_norm": 1.5333250761032104, + "learning_rate": 1.149797356690664e-05, + "loss": 1.37, + "step": 23748 + }, + { + "epoch": 0.8505022651171952, + "grad_norm": 1.6742618083953857, + "learning_rate": 1.149257421277109e-05, + "loss": 1.4453, + "step": 23749 + }, + { + "epoch": 0.8505380772467634, + "grad_norm": 1.8862006664276123, + "learning_rate": 1.1487176049388814e-05, + "loss": 1.4248, + "step": 23750 + }, + { + "epoch": 0.8505738893763317, + "grad_norm": 1.4315221309661865, + "learning_rate": 1.1481779076832388e-05, + "loss": 1.3406, + "step": 23751 + }, + { + "epoch": 0.8506097015059001, + "grad_norm": 1.759994387626648, + "learning_rate": 1.1476383295174452e-05, + "loss": 1.6171, + "step": 23752 + }, + { + "epoch": 0.8506455136354684, + "grad_norm": 2.4337728023529053, + "learning_rate": 1.1470988704487607e-05, + "loss": 1.2688, + "step": 23753 + }, + { + "epoch": 0.8506813257650366, + "grad_norm": 1.9323700666427612, + "learning_rate": 1.146559530484439e-05, + "loss": 1.4573, + "step": 23754 + }, + { + "epoch": 0.8507171378946049, + "grad_norm": 1.7567484378814697, + "learning_rate": 1.146020309631739e-05, + "loss": 1.5855, + "step": 23755 + }, + { + "epoch": 0.8507529500241732, + "grad_norm": 1.5216503143310547, + "learning_rate": 1.145481207897915e-05, + "loss": 1.3883, + "step": 23756 + }, + { + "epoch": 0.8507887621537414, + "grad_norm": 1.536760926246643, + "learning_rate": 1.144942225290222e-05, + "loss": 1.4571, + "step": 23757 + }, + { + "epoch": 0.8508245742833097, + "grad_norm": 2.1241402626037598, + "learning_rate": 1.1444033618159068e-05, + "loss": 1.9264, + "step": 23758 + }, + { + "epoch": 0.8508603864128781, + "grad_norm": 1.6003999710083008, + "learning_rate": 1.143864617482222e-05, + "loss": 1.7025, + "step": 23759 + }, + { + "epoch": 0.8508961985424464, + "grad_norm": 1.6267139911651611, + "learning_rate": 1.1433259922964146e-05, + "loss": 1.2968, + "step": 23760 + }, + { + "epoch": 0.8509320106720146, + "grad_norm": 1.788483738899231, + "learning_rate": 1.1427874862657339e-05, + "loss": 1.4734, + "step": 23761 + }, + { + "epoch": 0.8509678228015829, + "grad_norm": 1.3906162977218628, + "learning_rate": 1.1422490993974199e-05, + "loss": 1.0226, + "step": 23762 + }, + { + "epoch": 0.8510036349311512, + "grad_norm": 2.156611204147339, + "learning_rate": 1.1417108316987201e-05, + "loss": 1.4735, + "step": 23763 + }, + { + "epoch": 0.8510394470607194, + "grad_norm": 1.2706553936004639, + "learning_rate": 1.1411726831768754e-05, + "loss": 1.6045, + "step": 23764 + }, + { + "epoch": 0.8510752591902877, + "grad_norm": 1.6218516826629639, + "learning_rate": 1.1406346538391243e-05, + "loss": 1.5444, + "step": 23765 + }, + { + "epoch": 0.8511110713198561, + "grad_norm": 2.312868118286133, + "learning_rate": 1.1400967436927056e-05, + "loss": 1.3219, + "step": 23766 + }, + { + "epoch": 0.8511468834494244, + "grad_norm": 1.7224329710006714, + "learning_rate": 1.1395589527448558e-05, + "loss": 1.5206, + "step": 23767 + }, + { + "epoch": 0.8511826955789926, + "grad_norm": 1.7545734643936157, + "learning_rate": 1.1390212810028144e-05, + "loss": 1.8438, + "step": 23768 + }, + { + "epoch": 0.8512185077085609, + "grad_norm": 1.76412034034729, + "learning_rate": 1.1384837284738114e-05, + "loss": 1.1991, + "step": 23769 + }, + { + "epoch": 0.8512543198381292, + "grad_norm": 1.5692112445831299, + "learning_rate": 1.1379462951650755e-05, + "loss": 1.4919, + "step": 23770 + }, + { + "epoch": 0.8512901319676974, + "grad_norm": 1.500417947769165, + "learning_rate": 1.137408981083845e-05, + "loss": 1.0246, + "step": 23771 + }, + { + "epoch": 0.8513259440972657, + "grad_norm": 2.5152666568756104, + "learning_rate": 1.1368717862373424e-05, + "loss": 1.4325, + "step": 23772 + }, + { + "epoch": 0.8513617562268341, + "grad_norm": 1.9641667604446411, + "learning_rate": 1.136334710632797e-05, + "loss": 1.3025, + "step": 23773 + }, + { + "epoch": 0.8513975683564023, + "grad_norm": 1.6455731391906738, + "learning_rate": 1.1357977542774356e-05, + "loss": 1.2259, + "step": 23774 + }, + { + "epoch": 0.8514333804859706, + "grad_norm": 2.0032389163970947, + "learning_rate": 1.1352609171784834e-05, + "loss": 1.2823, + "step": 23775 + }, + { + "epoch": 0.8514691926155389, + "grad_norm": 1.3516676425933838, + "learning_rate": 1.1347241993431578e-05, + "loss": 1.4081, + "step": 23776 + }, + { + "epoch": 0.8515050047451072, + "grad_norm": 2.0282421112060547, + "learning_rate": 1.1341876007786845e-05, + "loss": 1.5955, + "step": 23777 + }, + { + "epoch": 0.8515408168746754, + "grad_norm": 2.355489492416382, + "learning_rate": 1.1336511214922819e-05, + "loss": 1.2484, + "step": 23778 + }, + { + "epoch": 0.8515766290042437, + "grad_norm": 1.3258121013641357, + "learning_rate": 1.1331147614911641e-05, + "loss": 1.5587, + "step": 23779 + }, + { + "epoch": 0.8516124411338121, + "grad_norm": 1.586104154586792, + "learning_rate": 1.1325785207825524e-05, + "loss": 1.3955, + "step": 23780 + }, + { + "epoch": 0.8516482532633803, + "grad_norm": 1.611124038696289, + "learning_rate": 1.132042399373654e-05, + "loss": 1.3186, + "step": 23781 + }, + { + "epoch": 0.8516840653929486, + "grad_norm": 1.5817452669143677, + "learning_rate": 1.131506397271691e-05, + "loss": 1.7507, + "step": 23782 + }, + { + "epoch": 0.8517198775225169, + "grad_norm": 1.5858224630355835, + "learning_rate": 1.1309705144838678e-05, + "loss": 1.5766, + "step": 23783 + }, + { + "epoch": 0.8517556896520851, + "grad_norm": 2.0559260845184326, + "learning_rate": 1.1304347510173963e-05, + "loss": 1.5677, + "step": 23784 + }, + { + "epoch": 0.8517915017816534, + "grad_norm": 1.479744791984558, + "learning_rate": 1.129899106879484e-05, + "loss": 1.4073, + "step": 23785 + }, + { + "epoch": 0.8518273139112217, + "grad_norm": 1.3566659688949585, + "learning_rate": 1.1293635820773397e-05, + "loss": 1.2683, + "step": 23786 + }, + { + "epoch": 0.8518631260407901, + "grad_norm": 1.4227968454360962, + "learning_rate": 1.1288281766181651e-05, + "loss": 1.4598, + "step": 23787 + }, + { + "epoch": 0.8518989381703583, + "grad_norm": 1.8010733127593994, + "learning_rate": 1.1282928905091616e-05, + "loss": 1.268, + "step": 23788 + }, + { + "epoch": 0.8519347502999266, + "grad_norm": 1.2809944152832031, + "learning_rate": 1.1277577237575377e-05, + "loss": 1.6804, + "step": 23789 + }, + { + "epoch": 0.8519705624294949, + "grad_norm": 1.514971375465393, + "learning_rate": 1.1272226763704863e-05, + "loss": 1.7797, + "step": 23790 + }, + { + "epoch": 0.8520063745590631, + "grad_norm": 2.1083321571350098, + "learning_rate": 1.1266877483552118e-05, + "loss": 1.374, + "step": 23791 + }, + { + "epoch": 0.8520421866886314, + "grad_norm": 1.2880560159683228, + "learning_rate": 1.126152939718903e-05, + "loss": 1.582, + "step": 23792 + }, + { + "epoch": 0.8520779988181997, + "grad_norm": 1.4016788005828857, + "learning_rate": 1.125618250468764e-05, + "loss": 1.5074, + "step": 23793 + }, + { + "epoch": 0.8521138109477681, + "grad_norm": 1.4981015920639038, + "learning_rate": 1.1250836806119824e-05, + "loss": 1.3985, + "step": 23794 + }, + { + "epoch": 0.8521496230773363, + "grad_norm": 1.716374397277832, + "learning_rate": 1.1245492301557547e-05, + "loss": 1.3592, + "step": 23795 + }, + { + "epoch": 0.8521854352069046, + "grad_norm": 1.9721206426620483, + "learning_rate": 1.1240148991072662e-05, + "loss": 1.3089, + "step": 23796 + }, + { + "epoch": 0.8522212473364729, + "grad_norm": 1.718814492225647, + "learning_rate": 1.123480687473708e-05, + "loss": 1.5038, + "step": 23797 + }, + { + "epoch": 0.8522570594660411, + "grad_norm": 2.706646203994751, + "learning_rate": 1.1229465952622686e-05, + "loss": 1.4359, + "step": 23798 + }, + { + "epoch": 0.8522928715956094, + "grad_norm": 1.659623622894287, + "learning_rate": 1.122412622480129e-05, + "loss": 1.4396, + "step": 23799 + }, + { + "epoch": 0.8523286837251777, + "grad_norm": 1.6759545803070068, + "learning_rate": 1.1218787691344801e-05, + "loss": 1.6162, + "step": 23800 + }, + { + "epoch": 0.852364495854746, + "grad_norm": 2.4348177909851074, + "learning_rate": 1.1213450352324983e-05, + "loss": 1.8575, + "step": 23801 + }, + { + "epoch": 0.8524003079843143, + "grad_norm": 1.6588647365570068, + "learning_rate": 1.1208114207813691e-05, + "loss": 1.6102, + "step": 23802 + }, + { + "epoch": 0.8524361201138826, + "grad_norm": 1.6332166194915771, + "learning_rate": 1.1202779257882645e-05, + "loss": 1.357, + "step": 23803 + }, + { + "epoch": 0.8524719322434509, + "grad_norm": 1.6586291790008545, + "learning_rate": 1.1197445502603698e-05, + "loss": 1.4706, + "step": 23804 + }, + { + "epoch": 0.8525077443730191, + "grad_norm": 1.4110736846923828, + "learning_rate": 1.1192112942048582e-05, + "loss": 1.311, + "step": 23805 + }, + { + "epoch": 0.8525435565025874, + "grad_norm": 1.5690315961837769, + "learning_rate": 1.1186781576289007e-05, + "loss": 1.3135, + "step": 23806 + }, + { + "epoch": 0.8525793686321557, + "grad_norm": 1.8064452409744263, + "learning_rate": 1.1181451405396725e-05, + "loss": 1.6223, + "step": 23807 + }, + { + "epoch": 0.852615180761724, + "grad_norm": 1.4428201913833618, + "learning_rate": 1.1176122429443458e-05, + "loss": 1.4009, + "step": 23808 + }, + { + "epoch": 0.8526509928912923, + "grad_norm": 1.7909198999404907, + "learning_rate": 1.1170794648500893e-05, + "loss": 1.3315, + "step": 23809 + }, + { + "epoch": 0.8526868050208606, + "grad_norm": 1.5551303625106812, + "learning_rate": 1.116546806264067e-05, + "loss": 1.4318, + "step": 23810 + }, + { + "epoch": 0.8527226171504289, + "grad_norm": 1.7789667844772339, + "learning_rate": 1.1160142671934537e-05, + "loss": 1.3732, + "step": 23811 + }, + { + "epoch": 0.8527584292799971, + "grad_norm": 1.6274534463882446, + "learning_rate": 1.1154818476454054e-05, + "loss": 1.4462, + "step": 23812 + }, + { + "epoch": 0.8527942414095654, + "grad_norm": 1.3836264610290527, + "learning_rate": 1.114949547627091e-05, + "loss": 1.5112, + "step": 23813 + }, + { + "epoch": 0.8528300535391337, + "grad_norm": 1.9895684719085693, + "learning_rate": 1.1144173671456682e-05, + "loss": 1.3913, + "step": 23814 + }, + { + "epoch": 0.852865865668702, + "grad_norm": 2.5720231533050537, + "learning_rate": 1.1138853062082977e-05, + "loss": 1.4404, + "step": 23815 + }, + { + "epoch": 0.8529016777982703, + "grad_norm": 1.6841319799423218, + "learning_rate": 1.1133533648221405e-05, + "loss": 1.2546, + "step": 23816 + }, + { + "epoch": 0.8529374899278386, + "grad_norm": 1.8937181234359741, + "learning_rate": 1.1128215429943477e-05, + "loss": 1.3837, + "step": 23817 + }, + { + "epoch": 0.8529733020574068, + "grad_norm": 1.6849440336227417, + "learning_rate": 1.1122898407320791e-05, + "loss": 1.3869, + "step": 23818 + }, + { + "epoch": 0.8530091141869751, + "grad_norm": 1.8310136795043945, + "learning_rate": 1.1117582580424857e-05, + "loss": 1.2157, + "step": 23819 + }, + { + "epoch": 0.8530449263165434, + "grad_norm": 1.718644380569458, + "learning_rate": 1.1112267949327216e-05, + "loss": 1.3847, + "step": 23820 + }, + { + "epoch": 0.8530807384461117, + "grad_norm": 1.7329232692718506, + "learning_rate": 1.1106954514099332e-05, + "loss": 1.3567, + "step": 23821 + }, + { + "epoch": 0.85311655057568, + "grad_norm": 1.7883803844451904, + "learning_rate": 1.1101642274812706e-05, + "loss": 1.5081, + "step": 23822 + }, + { + "epoch": 0.8531523627052483, + "grad_norm": 1.6889233589172363, + "learning_rate": 1.1096331231538847e-05, + "loss": 1.2767, + "step": 23823 + }, + { + "epoch": 0.8531881748348166, + "grad_norm": 1.7603107690811157, + "learning_rate": 1.1091021384349143e-05, + "loss": 1.3097, + "step": 23824 + }, + { + "epoch": 0.8532239869643848, + "grad_norm": 1.8802781105041504, + "learning_rate": 1.1085712733315068e-05, + "loss": 1.3812, + "step": 23825 + }, + { + "epoch": 0.8532597990939531, + "grad_norm": 1.5397545099258423, + "learning_rate": 1.1080405278508033e-05, + "loss": 1.4916, + "step": 23826 + }, + { + "epoch": 0.8532956112235214, + "grad_norm": 1.7694755792617798, + "learning_rate": 1.1075099019999468e-05, + "loss": 1.6139, + "step": 23827 + }, + { + "epoch": 0.8533314233530896, + "grad_norm": 1.587158441543579, + "learning_rate": 1.106979395786072e-05, + "loss": 1.7875, + "step": 23828 + }, + { + "epoch": 0.853367235482658, + "grad_norm": 1.5730561017990112, + "learning_rate": 1.1064490092163181e-05, + "loss": 1.2947, + "step": 23829 + }, + { + "epoch": 0.8534030476122263, + "grad_norm": 1.4817062616348267, + "learning_rate": 1.1059187422978211e-05, + "loss": 1.5571, + "step": 23830 + }, + { + "epoch": 0.8534388597417946, + "grad_norm": 2.2228071689605713, + "learning_rate": 1.1053885950377174e-05, + "loss": 1.4434, + "step": 23831 + }, + { + "epoch": 0.8534746718713628, + "grad_norm": 1.7988253831863403, + "learning_rate": 1.1048585674431345e-05, + "loss": 1.5946, + "step": 23832 + }, + { + "epoch": 0.8535104840009311, + "grad_norm": 1.7007036209106445, + "learning_rate": 1.1043286595212054e-05, + "loss": 1.3761, + "step": 23833 + }, + { + "epoch": 0.8535462961304994, + "grad_norm": 1.7106313705444336, + "learning_rate": 1.1037988712790626e-05, + "loss": 1.3455, + "step": 23834 + }, + { + "epoch": 0.8535821082600676, + "grad_norm": 1.68916916847229, + "learning_rate": 1.1032692027238279e-05, + "loss": 1.4494, + "step": 23835 + }, + { + "epoch": 0.853617920389636, + "grad_norm": 1.5535471439361572, + "learning_rate": 1.10273965386263e-05, + "loss": 1.4831, + "step": 23836 + }, + { + "epoch": 0.8536537325192043, + "grad_norm": 1.5345951318740845, + "learning_rate": 1.1022102247025934e-05, + "loss": 1.6866, + "step": 23837 + }, + { + "epoch": 0.8536895446487726, + "grad_norm": 1.2427003383636475, + "learning_rate": 1.1016809152508434e-05, + "loss": 1.3729, + "step": 23838 + }, + { + "epoch": 0.8537253567783408, + "grad_norm": 1.175044298171997, + "learning_rate": 1.1011517255144965e-05, + "loss": 1.3661, + "step": 23839 + }, + { + "epoch": 0.8537611689079091, + "grad_norm": 1.9435486793518066, + "learning_rate": 1.1006226555006749e-05, + "loss": 1.1561, + "step": 23840 + }, + { + "epoch": 0.8537969810374774, + "grad_norm": 1.4957728385925293, + "learning_rate": 1.1000937052164973e-05, + "loss": 1.2308, + "step": 23841 + }, + { + "epoch": 0.8538327931670456, + "grad_norm": 1.851515293121338, + "learning_rate": 1.0995648746690768e-05, + "loss": 1.1669, + "step": 23842 + }, + { + "epoch": 0.853868605296614, + "grad_norm": 1.4531022310256958, + "learning_rate": 1.0990361638655311e-05, + "loss": 1.3593, + "step": 23843 + }, + { + "epoch": 0.8539044174261823, + "grad_norm": 1.6683907508850098, + "learning_rate": 1.0985075728129712e-05, + "loss": 1.413, + "step": 23844 + }, + { + "epoch": 0.8539402295557506, + "grad_norm": 2.5751333236694336, + "learning_rate": 1.0979791015185125e-05, + "loss": 1.5377, + "step": 23845 + }, + { + "epoch": 0.8539760416853188, + "grad_norm": 1.8338749408721924, + "learning_rate": 1.0974507499892605e-05, + "loss": 1.6186, + "step": 23846 + }, + { + "epoch": 0.8540118538148871, + "grad_norm": 1.6571879386901855, + "learning_rate": 1.0969225182323239e-05, + "loss": 1.3968, + "step": 23847 + }, + { + "epoch": 0.8540476659444554, + "grad_norm": 1.5902742147445679, + "learning_rate": 1.0963944062548125e-05, + "loss": 1.466, + "step": 23848 + }, + { + "epoch": 0.8540834780740236, + "grad_norm": 1.432607650756836, + "learning_rate": 1.0958664140638297e-05, + "loss": 1.4103, + "step": 23849 + }, + { + "epoch": 0.854119290203592, + "grad_norm": 1.4626818895339966, + "learning_rate": 1.0953385416664785e-05, + "loss": 1.0073, + "step": 23850 + }, + { + "epoch": 0.8541551023331603, + "grad_norm": 2.2217087745666504, + "learning_rate": 1.09481078906986e-05, + "loss": 1.4524, + "step": 23851 + }, + { + "epoch": 0.8541909144627285, + "grad_norm": 1.7042936086654663, + "learning_rate": 1.0942831562810774e-05, + "loss": 1.5566, + "step": 23852 + }, + { + "epoch": 0.8542267265922968, + "grad_norm": 1.6705306768417358, + "learning_rate": 1.093755643307226e-05, + "loss": 1.3809, + "step": 23853 + }, + { + "epoch": 0.8542625387218651, + "grad_norm": 2.15169620513916, + "learning_rate": 1.0932282501554037e-05, + "loss": 1.3918, + "step": 23854 + }, + { + "epoch": 0.8542983508514334, + "grad_norm": 1.519851565361023, + "learning_rate": 1.0927009768327068e-05, + "loss": 1.2011, + "step": 23855 + }, + { + "epoch": 0.8543341629810016, + "grad_norm": 1.5760105848312378, + "learning_rate": 1.0921738233462297e-05, + "loss": 1.5071, + "step": 23856 + }, + { + "epoch": 0.85436997511057, + "grad_norm": 1.225251317024231, + "learning_rate": 1.0916467897030625e-05, + "loss": 1.1271, + "step": 23857 + }, + { + "epoch": 0.8544057872401383, + "grad_norm": 1.4910647869110107, + "learning_rate": 1.091119875910297e-05, + "loss": 1.2668, + "step": 23858 + }, + { + "epoch": 0.8544415993697065, + "grad_norm": 1.352760910987854, + "learning_rate": 1.0905930819750232e-05, + "loss": 1.1062, + "step": 23859 + }, + { + "epoch": 0.8544774114992748, + "grad_norm": 1.927417278289795, + "learning_rate": 1.0900664079043255e-05, + "loss": 1.2948, + "step": 23860 + }, + { + "epoch": 0.8545132236288431, + "grad_norm": 2.803440570831299, + "learning_rate": 1.0895398537052914e-05, + "loss": 1.2944, + "step": 23861 + }, + { + "epoch": 0.8545490357584113, + "grad_norm": 1.5908805131912231, + "learning_rate": 1.0890134193850043e-05, + "loss": 1.4876, + "step": 23862 + }, + { + "epoch": 0.8545848478879796, + "grad_norm": 1.509763240814209, + "learning_rate": 1.0884871049505507e-05, + "loss": 1.4407, + "step": 23863 + }, + { + "epoch": 0.854620660017548, + "grad_norm": 1.424763560295105, + "learning_rate": 1.0879609104090049e-05, + "loss": 1.6699, + "step": 23864 + }, + { + "epoch": 0.8546564721471163, + "grad_norm": 1.295540452003479, + "learning_rate": 1.0874348357674492e-05, + "loss": 1.2205, + "step": 23865 + }, + { + "epoch": 0.8546922842766845, + "grad_norm": 1.5044755935668945, + "learning_rate": 1.0869088810329642e-05, + "loss": 1.3277, + "step": 23866 + }, + { + "epoch": 0.8547280964062528, + "grad_norm": 1.0691648721694946, + "learning_rate": 1.0863830462126202e-05, + "loss": 1.3522, + "step": 23867 + }, + { + "epoch": 0.8547639085358211, + "grad_norm": 1.2039045095443726, + "learning_rate": 1.085857331313498e-05, + "loss": 1.4813, + "step": 23868 + }, + { + "epoch": 0.8547997206653893, + "grad_norm": 1.421851396560669, + "learning_rate": 1.0853317363426618e-05, + "loss": 1.3372, + "step": 23869 + }, + { + "epoch": 0.8548355327949576, + "grad_norm": 1.878260612487793, + "learning_rate": 1.0848062613071918e-05, + "loss": 1.6804, + "step": 23870 + }, + { + "epoch": 0.854871344924526, + "grad_norm": 1.3401705026626587, + "learning_rate": 1.0842809062141524e-05, + "loss": 1.519, + "step": 23871 + }, + { + "epoch": 0.8549071570540943, + "grad_norm": 1.784934163093567, + "learning_rate": 1.083755671070613e-05, + "loss": 1.5975, + "step": 23872 + }, + { + "epoch": 0.8549429691836625, + "grad_norm": 1.439996361732483, + "learning_rate": 1.0832305558836397e-05, + "loss": 1.2901, + "step": 23873 + }, + { + "epoch": 0.8549787813132308, + "grad_norm": 2.1497843265533447, + "learning_rate": 1.0827055606602998e-05, + "loss": 1.6519, + "step": 23874 + }, + { + "epoch": 0.8550145934427991, + "grad_norm": 1.6868973970413208, + "learning_rate": 1.0821806854076533e-05, + "loss": 1.6856, + "step": 23875 + }, + { + "epoch": 0.8550504055723673, + "grad_norm": 1.7532434463500977, + "learning_rate": 1.0816559301327589e-05, + "loss": 1.3696, + "step": 23876 + }, + { + "epoch": 0.8550862177019356, + "grad_norm": 1.7454650402069092, + "learning_rate": 1.0811312948426844e-05, + "loss": 1.4143, + "step": 23877 + }, + { + "epoch": 0.855122029831504, + "grad_norm": 1.5443825721740723, + "learning_rate": 1.0806067795444818e-05, + "loss": 1.7368, + "step": 23878 + }, + { + "epoch": 0.8551578419610723, + "grad_norm": 1.5286147594451904, + "learning_rate": 1.0800823842452113e-05, + "loss": 1.492, + "step": 23879 + }, + { + "epoch": 0.8551936540906405, + "grad_norm": 2.415445566177368, + "learning_rate": 1.0795581089519236e-05, + "loss": 1.6911, + "step": 23880 + }, + { + "epoch": 0.8552294662202088, + "grad_norm": 2.523890733718872, + "learning_rate": 1.0790339536716776e-05, + "loss": 1.4355, + "step": 23881 + }, + { + "epoch": 0.8552652783497771, + "grad_norm": 2.055680751800537, + "learning_rate": 1.078509918411521e-05, + "loss": 1.2303, + "step": 23882 + }, + { + "epoch": 0.8553010904793453, + "grad_norm": 1.5757873058319092, + "learning_rate": 1.0779860031785061e-05, + "loss": 1.5363, + "step": 23883 + }, + { + "epoch": 0.8553369026089136, + "grad_norm": 1.436819314956665, + "learning_rate": 1.0774622079796826e-05, + "loss": 1.586, + "step": 23884 + }, + { + "epoch": 0.855372714738482, + "grad_norm": 2.188523769378662, + "learning_rate": 1.0769385328220938e-05, + "loss": 1.1043, + "step": 23885 + }, + { + "epoch": 0.8554085268680502, + "grad_norm": 1.9349035024642944, + "learning_rate": 1.0764149777127897e-05, + "loss": 1.5145, + "step": 23886 + }, + { + "epoch": 0.8554443389976185, + "grad_norm": 1.4277589321136475, + "learning_rate": 1.0758915426588068e-05, + "loss": 1.4414, + "step": 23887 + }, + { + "epoch": 0.8554801511271868, + "grad_norm": 1.9857627153396606, + "learning_rate": 1.0753682276671961e-05, + "loss": 1.3229, + "step": 23888 + }, + { + "epoch": 0.855515963256755, + "grad_norm": 1.2367472648620605, + "learning_rate": 1.074845032744991e-05, + "loss": 1.3326, + "step": 23889 + }, + { + "epoch": 0.8555517753863233, + "grad_norm": 1.7371845245361328, + "learning_rate": 1.0743219578992369e-05, + "loss": 1.3939, + "step": 23890 + }, + { + "epoch": 0.8555875875158916, + "grad_norm": 1.5299841165542603, + "learning_rate": 1.0737990031369627e-05, + "loss": 1.4069, + "step": 23891 + }, + { + "epoch": 0.85562339964546, + "grad_norm": 3.103381395339966, + "learning_rate": 1.0732761684652127e-05, + "loss": 1.3743, + "step": 23892 + }, + { + "epoch": 0.8556592117750282, + "grad_norm": 1.619565486907959, + "learning_rate": 1.0727534538910177e-05, + "loss": 1.7073, + "step": 23893 + }, + { + "epoch": 0.8556950239045965, + "grad_norm": 1.3993152379989624, + "learning_rate": 1.0722308594214081e-05, + "loss": 1.3665, + "step": 23894 + }, + { + "epoch": 0.8557308360341648, + "grad_norm": 1.5730746984481812, + "learning_rate": 1.0717083850634158e-05, + "loss": 1.4241, + "step": 23895 + }, + { + "epoch": 0.855766648163733, + "grad_norm": 1.6564350128173828, + "learning_rate": 1.0711860308240706e-05, + "loss": 1.5138, + "step": 23896 + }, + { + "epoch": 0.8558024602933013, + "grad_norm": 1.9914555549621582, + "learning_rate": 1.0706637967104016e-05, + "loss": 1.4409, + "step": 23897 + }, + { + "epoch": 0.8558382724228696, + "grad_norm": 1.646231770515442, + "learning_rate": 1.0701416827294297e-05, + "loss": 1.4735, + "step": 23898 + }, + { + "epoch": 0.855874084552438, + "grad_norm": 1.485743522644043, + "learning_rate": 1.069619688888187e-05, + "loss": 1.4125, + "step": 23899 + }, + { + "epoch": 0.8559098966820062, + "grad_norm": 1.75136399269104, + "learning_rate": 1.0690978151936892e-05, + "loss": 1.4939, + "step": 23900 + }, + { + "epoch": 0.8559457088115745, + "grad_norm": 1.9949253797531128, + "learning_rate": 1.0685760616529628e-05, + "loss": 1.5425, + "step": 23901 + }, + { + "epoch": 0.8559815209411428, + "grad_norm": 1.7424404621124268, + "learning_rate": 1.068054428273022e-05, + "loss": 1.4624, + "step": 23902 + }, + { + "epoch": 0.856017333070711, + "grad_norm": 1.6785800457000732, + "learning_rate": 1.0675329150608892e-05, + "loss": 1.4386, + "step": 23903 + }, + { + "epoch": 0.8560531452002793, + "grad_norm": 1.4667755365371704, + "learning_rate": 1.0670115220235799e-05, + "loss": 1.3168, + "step": 23904 + }, + { + "epoch": 0.8560889573298476, + "grad_norm": 1.4324119091033936, + "learning_rate": 1.0664902491681051e-05, + "loss": 1.4958, + "step": 23905 + }, + { + "epoch": 0.856124769459416, + "grad_norm": 3.299861192703247, + "learning_rate": 1.0659690965014813e-05, + "loss": 1.3856, + "step": 23906 + }, + { + "epoch": 0.8561605815889842, + "grad_norm": 1.4514801502227783, + "learning_rate": 1.0654480640307195e-05, + "loss": 1.5554, + "step": 23907 + }, + { + "epoch": 0.8561963937185525, + "grad_norm": 1.6911697387695312, + "learning_rate": 1.0649271517628313e-05, + "loss": 1.5904, + "step": 23908 + }, + { + "epoch": 0.8562322058481208, + "grad_norm": 2.0003654956817627, + "learning_rate": 1.0644063597048182e-05, + "loss": 1.2916, + "step": 23909 + }, + { + "epoch": 0.856268017977689, + "grad_norm": 2.0778403282165527, + "learning_rate": 1.063885687863696e-05, + "loss": 1.2572, + "step": 23910 + }, + { + "epoch": 0.8563038301072573, + "grad_norm": 1.5954288244247437, + "learning_rate": 1.0633651362464647e-05, + "loss": 1.4367, + "step": 23911 + }, + { + "epoch": 0.8563396422368256, + "grad_norm": 1.7671774625778198, + "learning_rate": 1.0628447048601265e-05, + "loss": 1.7623, + "step": 23912 + }, + { + "epoch": 0.856375454366394, + "grad_norm": 1.8786360025405884, + "learning_rate": 1.0623243937116845e-05, + "loss": 1.3735, + "step": 23913 + }, + { + "epoch": 0.8564112664959622, + "grad_norm": 1.2363111972808838, + "learning_rate": 1.06180420280814e-05, + "loss": 1.4896, + "step": 23914 + }, + { + "epoch": 0.8564470786255305, + "grad_norm": 1.6070228815078735, + "learning_rate": 1.0612841321564915e-05, + "loss": 1.3349, + "step": 23915 + }, + { + "epoch": 0.8564828907550988, + "grad_norm": 1.6431260108947754, + "learning_rate": 1.0607641817637326e-05, + "loss": 1.4696, + "step": 23916 + }, + { + "epoch": 0.856518702884667, + "grad_norm": 2.0966637134552, + "learning_rate": 1.060244351636861e-05, + "loss": 1.2884, + "step": 23917 + }, + { + "epoch": 0.8565545150142353, + "grad_norm": 1.5325583219528198, + "learning_rate": 1.0597246417828698e-05, + "loss": 1.2232, + "step": 23918 + }, + { + "epoch": 0.8565903271438036, + "grad_norm": 2.3397915363311768, + "learning_rate": 1.0592050522087549e-05, + "loss": 1.5092, + "step": 23919 + }, + { + "epoch": 0.856626139273372, + "grad_norm": 1.8545010089874268, + "learning_rate": 1.0586855829215003e-05, + "loss": 1.2814, + "step": 23920 + }, + { + "epoch": 0.8566619514029402, + "grad_norm": 1.6768451929092407, + "learning_rate": 1.0581662339280973e-05, + "loss": 1.5897, + "step": 23921 + }, + { + "epoch": 0.8566977635325085, + "grad_norm": 1.569795846939087, + "learning_rate": 1.0576470052355358e-05, + "loss": 1.4008, + "step": 23922 + }, + { + "epoch": 0.8567335756620768, + "grad_norm": 1.2324440479278564, + "learning_rate": 1.057127896850797e-05, + "loss": 1.1082, + "step": 23923 + }, + { + "epoch": 0.856769387791645, + "grad_norm": 2.4131672382354736, + "learning_rate": 1.0566089087808672e-05, + "loss": 1.3353, + "step": 23924 + }, + { + "epoch": 0.8568051999212133, + "grad_norm": 1.468252420425415, + "learning_rate": 1.056090041032729e-05, + "loss": 1.425, + "step": 23925 + }, + { + "epoch": 0.8568410120507816, + "grad_norm": 2.575080156326294, + "learning_rate": 1.0555712936133633e-05, + "loss": 1.446, + "step": 23926 + }, + { + "epoch": 0.8568768241803499, + "grad_norm": 1.8932536840438843, + "learning_rate": 1.0550526665297466e-05, + "loss": 1.3337, + "step": 23927 + }, + { + "epoch": 0.8569126363099182, + "grad_norm": 2.1886892318725586, + "learning_rate": 1.0545341597888581e-05, + "loss": 1.3273, + "step": 23928 + }, + { + "epoch": 0.8569484484394865, + "grad_norm": 1.8264682292938232, + "learning_rate": 1.0540157733976763e-05, + "loss": 1.5032, + "step": 23929 + }, + { + "epoch": 0.8569842605690547, + "grad_norm": 1.7340106964111328, + "learning_rate": 1.0534975073631703e-05, + "loss": 1.3628, + "step": 23930 + }, + { + "epoch": 0.857020072698623, + "grad_norm": 1.687674641609192, + "learning_rate": 1.0529793616923157e-05, + "loss": 1.4686, + "step": 23931 + }, + { + "epoch": 0.8570558848281913, + "grad_norm": 1.507163643836975, + "learning_rate": 1.052461336392082e-05, + "loss": 0.9268, + "step": 23932 + }, + { + "epoch": 0.8570916969577596, + "grad_norm": 1.2720131874084473, + "learning_rate": 1.0519434314694422e-05, + "loss": 1.4392, + "step": 23933 + }, + { + "epoch": 0.8571275090873279, + "grad_norm": 1.435158133506775, + "learning_rate": 1.0514256469313588e-05, + "loss": 1.6138, + "step": 23934 + }, + { + "epoch": 0.8571633212168962, + "grad_norm": 2.085623025894165, + "learning_rate": 1.0509079827848012e-05, + "loss": 1.5755, + "step": 23935 + }, + { + "epoch": 0.8571991333464645, + "grad_norm": 1.562156081199646, + "learning_rate": 1.0503904390367325e-05, + "loss": 1.3898, + "step": 23936 + }, + { + "epoch": 0.8572349454760327, + "grad_norm": 2.3077609539031982, + "learning_rate": 1.0498730156941184e-05, + "loss": 1.6702, + "step": 23937 + }, + { + "epoch": 0.857270757605601, + "grad_norm": 1.676255702972412, + "learning_rate": 1.0493557127639164e-05, + "loss": 1.3625, + "step": 23938 + }, + { + "epoch": 0.8573065697351693, + "grad_norm": 1.866253137588501, + "learning_rate": 1.0488385302530878e-05, + "loss": 1.3196, + "step": 23939 + }, + { + "epoch": 0.8573423818647375, + "grad_norm": 1.4433153867721558, + "learning_rate": 1.0483214681685927e-05, + "loss": 1.6465, + "step": 23940 + }, + { + "epoch": 0.8573781939943059, + "grad_norm": 2.484351873397827, + "learning_rate": 1.047804526517383e-05, + "loss": 1.3115, + "step": 23941 + }, + { + "epoch": 0.8574140061238742, + "grad_norm": 1.8021259307861328, + "learning_rate": 1.0472877053064156e-05, + "loss": 1.4364, + "step": 23942 + }, + { + "epoch": 0.8574498182534425, + "grad_norm": 1.6426368951797485, + "learning_rate": 1.0467710045426449e-05, + "loss": 1.6388, + "step": 23943 + }, + { + "epoch": 0.8574856303830107, + "grad_norm": 1.6121195554733276, + "learning_rate": 1.046254424233023e-05, + "loss": 1.1081, + "step": 23944 + }, + { + "epoch": 0.857521442512579, + "grad_norm": 1.808910846710205, + "learning_rate": 1.0457379643844966e-05, + "loss": 1.5212, + "step": 23945 + }, + { + "epoch": 0.8575572546421473, + "grad_norm": 1.7684128284454346, + "learning_rate": 1.0452216250040148e-05, + "loss": 1.5465, + "step": 23946 + }, + { + "epoch": 0.8575930667717155, + "grad_norm": 1.6211570501327515, + "learning_rate": 1.0447054060985284e-05, + "loss": 1.3641, + "step": 23947 + }, + { + "epoch": 0.8576288789012839, + "grad_norm": 1.535387396812439, + "learning_rate": 1.0441893076749765e-05, + "loss": 1.6225, + "step": 23948 + }, + { + "epoch": 0.8576646910308522, + "grad_norm": 1.648389220237732, + "learning_rate": 1.0436733297403056e-05, + "loss": 1.1475, + "step": 23949 + }, + { + "epoch": 0.8577005031604205, + "grad_norm": 1.5785759687423706, + "learning_rate": 1.043157472301457e-05, + "loss": 1.3503, + "step": 23950 + }, + { + "epoch": 0.8577363152899887, + "grad_norm": 1.7339171171188354, + "learning_rate": 1.0426417353653739e-05, + "loss": 1.1188, + "step": 23951 + }, + { + "epoch": 0.857772127419557, + "grad_norm": 1.6951241493225098, + "learning_rate": 1.0421261189389885e-05, + "loss": 1.3101, + "step": 23952 + }, + { + "epoch": 0.8578079395491253, + "grad_norm": 2.160072088241577, + "learning_rate": 1.0416106230292432e-05, + "loss": 1.5236, + "step": 23953 + }, + { + "epoch": 0.8578437516786935, + "grad_norm": 1.4381632804870605, + "learning_rate": 1.0410952476430703e-05, + "loss": 1.3926, + "step": 23954 + }, + { + "epoch": 0.8578795638082619, + "grad_norm": 2.0479938983917236, + "learning_rate": 1.0405799927874072e-05, + "loss": 1.4182, + "step": 23955 + }, + { + "epoch": 0.8579153759378302, + "grad_norm": 1.3435159921646118, + "learning_rate": 1.0400648584691808e-05, + "loss": 1.4873, + "step": 23956 + }, + { + "epoch": 0.8579511880673985, + "grad_norm": 1.5888739824295044, + "learning_rate": 1.0395498446953245e-05, + "loss": 1.4416, + "step": 23957 + }, + { + "epoch": 0.8579870001969667, + "grad_norm": 1.7520204782485962, + "learning_rate": 1.0390349514727694e-05, + "loss": 1.2782, + "step": 23958 + }, + { + "epoch": 0.858022812326535, + "grad_norm": 1.682873249053955, + "learning_rate": 1.0385201788084375e-05, + "loss": 1.6233, + "step": 23959 + }, + { + "epoch": 0.8580586244561033, + "grad_norm": 1.4209063053131104, + "learning_rate": 1.0380055267092581e-05, + "loss": 1.7817, + "step": 23960 + }, + { + "epoch": 0.8580944365856715, + "grad_norm": 1.583618402481079, + "learning_rate": 1.0374909951821532e-05, + "loss": 1.3119, + "step": 23961 + }, + { + "epoch": 0.8581302487152399, + "grad_norm": 1.6202590465545654, + "learning_rate": 1.0369765842340484e-05, + "loss": 1.3546, + "step": 23962 + }, + { + "epoch": 0.8581660608448082, + "grad_norm": 1.4713712930679321, + "learning_rate": 1.0364622938718627e-05, + "loss": 1.2915, + "step": 23963 + }, + { + "epoch": 0.8582018729743764, + "grad_norm": 1.4419574737548828, + "learning_rate": 1.0359481241025105e-05, + "loss": 1.3699, + "step": 23964 + }, + { + "epoch": 0.8582376851039447, + "grad_norm": 2.112922430038452, + "learning_rate": 1.0354340749329172e-05, + "loss": 1.1557, + "step": 23965 + }, + { + "epoch": 0.858273497233513, + "grad_norm": 1.5085670948028564, + "learning_rate": 1.0349201463699932e-05, + "loss": 1.4269, + "step": 23966 + }, + { + "epoch": 0.8583093093630813, + "grad_norm": 1.2277408838272095, + "learning_rate": 1.0344063384206537e-05, + "loss": 1.3207, + "step": 23967 + }, + { + "epoch": 0.8583451214926495, + "grad_norm": 1.8692991733551025, + "learning_rate": 1.0338926510918134e-05, + "loss": 1.8965, + "step": 23968 + }, + { + "epoch": 0.8583809336222178, + "grad_norm": 1.5993348360061646, + "learning_rate": 1.0333790843903835e-05, + "loss": 1.3095, + "step": 23969 + }, + { + "epoch": 0.8584167457517862, + "grad_norm": 1.7335530519485474, + "learning_rate": 1.0328656383232692e-05, + "loss": 1.3135, + "step": 23970 + }, + { + "epoch": 0.8584525578813544, + "grad_norm": 1.4201031923294067, + "learning_rate": 1.0323523128973822e-05, + "loss": 1.2858, + "step": 23971 + }, + { + "epoch": 0.8584883700109227, + "grad_norm": 1.8706300258636475, + "learning_rate": 1.0318391081196288e-05, + "loss": 1.4368, + "step": 23972 + }, + { + "epoch": 0.858524182140491, + "grad_norm": 1.5056337118148804, + "learning_rate": 1.0313260239969102e-05, + "loss": 1.4255, + "step": 23973 + }, + { + "epoch": 0.8585599942700592, + "grad_norm": 1.665971040725708, + "learning_rate": 1.0308130605361333e-05, + "loss": 1.4536, + "step": 23974 + }, + { + "epoch": 0.8585958063996275, + "grad_norm": 1.5636063814163208, + "learning_rate": 1.0303002177441934e-05, + "loss": 1.3331, + "step": 23975 + }, + { + "epoch": 0.8586316185291958, + "grad_norm": 1.4892923831939697, + "learning_rate": 1.0297874956279974e-05, + "loss": 1.2562, + "step": 23976 + }, + { + "epoch": 0.8586674306587642, + "grad_norm": 1.3059931993484497, + "learning_rate": 1.0292748941944385e-05, + "loss": 1.2593, + "step": 23977 + }, + { + "epoch": 0.8587032427883324, + "grad_norm": 1.7474253177642822, + "learning_rate": 1.0287624134504158e-05, + "loss": 1.2735, + "step": 23978 + }, + { + "epoch": 0.8587390549179007, + "grad_norm": 1.916402816772461, + "learning_rate": 1.0282500534028195e-05, + "loss": 1.5238, + "step": 23979 + }, + { + "epoch": 0.858774867047469, + "grad_norm": 1.8782771825790405, + "learning_rate": 1.0277378140585491e-05, + "loss": 1.4562, + "step": 23980 + }, + { + "epoch": 0.8588106791770372, + "grad_norm": 1.3739666938781738, + "learning_rate": 1.0272256954244941e-05, + "loss": 1.6322, + "step": 23981 + }, + { + "epoch": 0.8588464913066055, + "grad_norm": 1.7305517196655273, + "learning_rate": 1.0267136975075386e-05, + "loss": 1.2822, + "step": 23982 + }, + { + "epoch": 0.8588823034361738, + "grad_norm": 1.8392616510391235, + "learning_rate": 1.0262018203145796e-05, + "loss": 1.331, + "step": 23983 + }, + { + "epoch": 0.8589181155657422, + "grad_norm": 1.3434126377105713, + "learning_rate": 1.0256900638524979e-05, + "loss": 1.5343, + "step": 23984 + }, + { + "epoch": 0.8589539276953104, + "grad_norm": 1.373840093612671, + "learning_rate": 1.0251784281281829e-05, + "loss": 1.4114, + "step": 23985 + }, + { + "epoch": 0.8589897398248787, + "grad_norm": 2.5821125507354736, + "learning_rate": 1.0246669131485109e-05, + "loss": 1.411, + "step": 23986 + }, + { + "epoch": 0.859025551954447, + "grad_norm": 1.8943785429000854, + "learning_rate": 1.0241555189203722e-05, + "loss": 1.3037, + "step": 23987 + }, + { + "epoch": 0.8590613640840152, + "grad_norm": 1.3766758441925049, + "learning_rate": 1.0236442454506411e-05, + "loss": 1.3691, + "step": 23988 + }, + { + "epoch": 0.8590971762135835, + "grad_norm": 2.0536932945251465, + "learning_rate": 1.0231330927462002e-05, + "loss": 1.1827, + "step": 23989 + }, + { + "epoch": 0.8591329883431518, + "grad_norm": 1.7797173261642456, + "learning_rate": 1.0226220608139214e-05, + "loss": 1.53, + "step": 23990 + }, + { + "epoch": 0.8591688004727202, + "grad_norm": 1.6189501285552979, + "learning_rate": 1.022111149660684e-05, + "loss": 1.8422, + "step": 23991 + }, + { + "epoch": 0.8592046126022884, + "grad_norm": 1.5615276098251343, + "learning_rate": 1.021600359293361e-05, + "loss": 1.3183, + "step": 23992 + }, + { + "epoch": 0.8592404247318567, + "grad_norm": 1.7850444316864014, + "learning_rate": 1.0210896897188216e-05, + "loss": 1.5006, + "step": 23993 + }, + { + "epoch": 0.859276236861425, + "grad_norm": 2.1221442222595215, + "learning_rate": 1.0205791409439413e-05, + "loss": 1.2876, + "step": 23994 + }, + { + "epoch": 0.8593120489909932, + "grad_norm": 1.399099349975586, + "learning_rate": 1.0200687129755837e-05, + "loss": 1.0976, + "step": 23995 + }, + { + "epoch": 0.8593478611205615, + "grad_norm": 1.5655336380004883, + "learning_rate": 1.0195584058206209e-05, + "loss": 1.1475, + "step": 23996 + }, + { + "epoch": 0.8593836732501298, + "grad_norm": 1.4112638235092163, + "learning_rate": 1.0190482194859119e-05, + "loss": 1.2891, + "step": 23997 + }, + { + "epoch": 0.8594194853796981, + "grad_norm": 1.8032960891723633, + "learning_rate": 1.018538153978329e-05, + "loss": 1.3916, + "step": 23998 + }, + { + "epoch": 0.8594552975092664, + "grad_norm": 1.1614011526107788, + "learning_rate": 1.0180282093047288e-05, + "loss": 0.9882, + "step": 23999 + }, + { + "epoch": 0.8594911096388347, + "grad_norm": 1.553151249885559, + "learning_rate": 1.0175183854719716e-05, + "loss": 1.6078, + "step": 24000 + }, + { + "epoch": 0.859526921768403, + "grad_norm": 1.4639896154403687, + "learning_rate": 1.0170086824869184e-05, + "loss": 1.3421, + "step": 24001 + }, + { + "epoch": 0.8595627338979712, + "grad_norm": 1.3828316926956177, + "learning_rate": 1.0164991003564261e-05, + "loss": 1.2658, + "step": 24002 + }, + { + "epoch": 0.8595985460275395, + "grad_norm": 2.3339357376098633, + "learning_rate": 1.0159896390873524e-05, + "loss": 1.5719, + "step": 24003 + }, + { + "epoch": 0.8596343581571078, + "grad_norm": 1.4203572273254395, + "learning_rate": 1.0154802986865475e-05, + "loss": 1.5003, + "step": 24004 + }, + { + "epoch": 0.8596701702866761, + "grad_norm": 1.435115933418274, + "learning_rate": 1.0149710791608657e-05, + "loss": 1.3779, + "step": 24005 + }, + { + "epoch": 0.8597059824162444, + "grad_norm": 1.3784613609313965, + "learning_rate": 1.0144619805171584e-05, + "loss": 1.4091, + "step": 24006 + }, + { + "epoch": 0.8597417945458127, + "grad_norm": 1.42812979221344, + "learning_rate": 1.0139530027622768e-05, + "loss": 1.2791, + "step": 24007 + }, + { + "epoch": 0.859777606675381, + "grad_norm": 1.809586763381958, + "learning_rate": 1.0134441459030642e-05, + "loss": 1.3902, + "step": 24008 + }, + { + "epoch": 0.8598134188049492, + "grad_norm": 1.387447714805603, + "learning_rate": 1.0129354099463683e-05, + "loss": 1.4874, + "step": 24009 + }, + { + "epoch": 0.8598492309345175, + "grad_norm": 1.933063268661499, + "learning_rate": 1.0124267948990363e-05, + "loss": 1.7334, + "step": 24010 + }, + { + "epoch": 0.8598850430640858, + "grad_norm": 1.4610179662704468, + "learning_rate": 1.0119183007679067e-05, + "loss": 1.3521, + "step": 24011 + }, + { + "epoch": 0.8599208551936541, + "grad_norm": 1.5905406475067139, + "learning_rate": 1.0114099275598232e-05, + "loss": 1.497, + "step": 24012 + }, + { + "epoch": 0.8599566673232224, + "grad_norm": 1.7359148263931274, + "learning_rate": 1.0109016752816247e-05, + "loss": 1.3048, + "step": 24013 + }, + { + "epoch": 0.8599924794527907, + "grad_norm": 2.0931310653686523, + "learning_rate": 1.0103935439401502e-05, + "loss": 1.3526, + "step": 24014 + }, + { + "epoch": 0.8600282915823589, + "grad_norm": 1.846803903579712, + "learning_rate": 1.0098855335422331e-05, + "loss": 1.521, + "step": 24015 + }, + { + "epoch": 0.8600641037119272, + "grad_norm": 1.602505087852478, + "learning_rate": 1.00937764409471e-05, + "loss": 1.6741, + "step": 24016 + }, + { + "epoch": 0.8600999158414955, + "grad_norm": 1.551328420639038, + "learning_rate": 1.0088698756044146e-05, + "loss": 1.2546, + "step": 24017 + }, + { + "epoch": 0.8601357279710637, + "grad_norm": 1.5050621032714844, + "learning_rate": 1.0083622280781769e-05, + "loss": 1.4149, + "step": 24018 + }, + { + "epoch": 0.8601715401006321, + "grad_norm": 1.488797903060913, + "learning_rate": 1.0078547015228257e-05, + "loss": 1.5439, + "step": 24019 + }, + { + "epoch": 0.8602073522302004, + "grad_norm": 1.7439039945602417, + "learning_rate": 1.0073472959451913e-05, + "loss": 1.1836, + "step": 24020 + }, + { + "epoch": 0.8602431643597687, + "grad_norm": 1.8317373991012573, + "learning_rate": 1.0068400113521014e-05, + "loss": 1.4543, + "step": 24021 + }, + { + "epoch": 0.8602789764893369, + "grad_norm": 1.8038636445999146, + "learning_rate": 1.0063328477503764e-05, + "loss": 1.5558, + "step": 24022 + }, + { + "epoch": 0.8603147886189052, + "grad_norm": 2.062847137451172, + "learning_rate": 1.0058258051468417e-05, + "loss": 1.2856, + "step": 24023 + }, + { + "epoch": 0.8603506007484735, + "grad_norm": 2.013709306716919, + "learning_rate": 1.0053188835483197e-05, + "loss": 1.5779, + "step": 24024 + }, + { + "epoch": 0.8603864128780417, + "grad_norm": 2.6702752113342285, + "learning_rate": 1.0048120829616314e-05, + "loss": 1.3096, + "step": 24025 + }, + { + "epoch": 0.8604222250076101, + "grad_norm": 1.617053508758545, + "learning_rate": 1.0043054033935917e-05, + "loss": 1.5229, + "step": 24026 + }, + { + "epoch": 0.8604580371371784, + "grad_norm": 1.3419846296310425, + "learning_rate": 1.0037988448510193e-05, + "loss": 1.154, + "step": 24027 + }, + { + "epoch": 0.8604938492667467, + "grad_norm": 1.271104097366333, + "learning_rate": 1.0032924073407313e-05, + "loss": 1.4895, + "step": 24028 + }, + { + "epoch": 0.8605296613963149, + "grad_norm": 1.682997703552246, + "learning_rate": 1.0027860908695363e-05, + "loss": 1.4205, + "step": 24029 + }, + { + "epoch": 0.8605654735258832, + "grad_norm": 1.687303066253662, + "learning_rate": 1.0022798954442491e-05, + "loss": 1.2232, + "step": 24030 + }, + { + "epoch": 0.8606012856554515, + "grad_norm": 1.7541818618774414, + "learning_rate": 1.001773821071681e-05, + "loss": 1.2612, + "step": 24031 + }, + { + "epoch": 0.8606370977850197, + "grad_norm": 2.01399302482605, + "learning_rate": 1.0012678677586396e-05, + "loss": 1.4981, + "step": 24032 + }, + { + "epoch": 0.8606729099145881, + "grad_norm": 1.589170217514038, + "learning_rate": 1.0007620355119307e-05, + "loss": 1.3366, + "step": 24033 + }, + { + "epoch": 0.8607087220441564, + "grad_norm": 1.6481248140335083, + "learning_rate": 1.000256324338359e-05, + "loss": 1.3699, + "step": 24034 + }, + { + "epoch": 0.8607445341737247, + "grad_norm": 1.8044880628585815, + "learning_rate": 9.997507342447333e-06, + "loss": 1.5593, + "step": 24035 + }, + { + "epoch": 0.8607803463032929, + "grad_norm": 1.9255050420761108, + "learning_rate": 9.992452652378493e-06, + "loss": 1.5335, + "step": 24036 + }, + { + "epoch": 0.8608161584328612, + "grad_norm": 1.7744263410568237, + "learning_rate": 9.987399173245093e-06, + "loss": 1.1219, + "step": 24037 + }, + { + "epoch": 0.8608519705624295, + "grad_norm": 1.6102805137634277, + "learning_rate": 9.982346905115137e-06, + "loss": 1.3634, + "step": 24038 + }, + { + "epoch": 0.8608877826919977, + "grad_norm": 1.4857194423675537, + "learning_rate": 9.977295848056612e-06, + "loss": 1.3996, + "step": 24039 + }, + { + "epoch": 0.8609235948215661, + "grad_norm": 1.8359849452972412, + "learning_rate": 9.97224600213742e-06, + "loss": 1.3318, + "step": 24040 + }, + { + "epoch": 0.8609594069511344, + "grad_norm": 1.50896155834198, + "learning_rate": 9.96719736742554e-06, + "loss": 1.6464, + "step": 24041 + }, + { + "epoch": 0.8609952190807026, + "grad_norm": 1.3213436603546143, + "learning_rate": 9.962149943988885e-06, + "loss": 1.2947, + "step": 24042 + }, + { + "epoch": 0.8610310312102709, + "grad_norm": 1.540392279624939, + "learning_rate": 9.957103731895379e-06, + "loss": 1.5812, + "step": 24043 + }, + { + "epoch": 0.8610668433398392, + "grad_norm": 1.953053593635559, + "learning_rate": 9.952058731212877e-06, + "loss": 1.8253, + "step": 24044 + }, + { + "epoch": 0.8611026554694075, + "grad_norm": 1.9430025815963745, + "learning_rate": 9.947014942009269e-06, + "loss": 1.4107, + "step": 24045 + }, + { + "epoch": 0.8611384675989757, + "grad_norm": 1.5683186054229736, + "learning_rate": 9.941972364352436e-06, + "loss": 1.6623, + "step": 24046 + }, + { + "epoch": 0.8611742797285441, + "grad_norm": 1.603224754333496, + "learning_rate": 9.936930998310179e-06, + "loss": 1.4686, + "step": 24047 + }, + { + "epoch": 0.8612100918581124, + "grad_norm": 1.5715124607086182, + "learning_rate": 9.931890843950342e-06, + "loss": 1.481, + "step": 24048 + }, + { + "epoch": 0.8612459039876806, + "grad_norm": 1.7702678442001343, + "learning_rate": 9.92685190134074e-06, + "loss": 1.6707, + "step": 24049 + }, + { + "epoch": 0.8612817161172489, + "grad_norm": 1.6073265075683594, + "learning_rate": 9.921814170549171e-06, + "loss": 1.2719, + "step": 24050 + }, + { + "epoch": 0.8613175282468172, + "grad_norm": 2.0798747539520264, + "learning_rate": 9.916777651643383e-06, + "loss": 1.3234, + "step": 24051 + }, + { + "epoch": 0.8613533403763854, + "grad_norm": 2.2637381553649902, + "learning_rate": 9.911742344691156e-06, + "loss": 1.5011, + "step": 24052 + }, + { + "epoch": 0.8613891525059537, + "grad_norm": 1.9088646173477173, + "learning_rate": 9.906708249760244e-06, + "loss": 1.2568, + "step": 24053 + }, + { + "epoch": 0.8614249646355221, + "grad_norm": 1.5362104177474976, + "learning_rate": 9.901675366918339e-06, + "loss": 1.2768, + "step": 24054 + }, + { + "epoch": 0.8614607767650904, + "grad_norm": 1.4941067695617676, + "learning_rate": 9.896643696233177e-06, + "loss": 1.0898, + "step": 24055 + }, + { + "epoch": 0.8614965888946586, + "grad_norm": 1.436546802520752, + "learning_rate": 9.891613237772458e-06, + "loss": 1.5783, + "step": 24056 + }, + { + "epoch": 0.8615324010242269, + "grad_norm": 1.2099878787994385, + "learning_rate": 9.88658399160386e-06, + "loss": 1.3196, + "step": 24057 + }, + { + "epoch": 0.8615682131537952, + "grad_norm": 1.3755364418029785, + "learning_rate": 9.88155595779502e-06, + "loss": 1.4722, + "step": 24058 + }, + { + "epoch": 0.8616040252833634, + "grad_norm": 1.5416947603225708, + "learning_rate": 9.876529136413593e-06, + "loss": 1.2288, + "step": 24059 + }, + { + "epoch": 0.8616398374129317, + "grad_norm": 1.4445111751556396, + "learning_rate": 9.871503527527226e-06, + "loss": 1.4655, + "step": 24060 + }, + { + "epoch": 0.8616756495425001, + "grad_norm": 1.4712949991226196, + "learning_rate": 9.866479131203544e-06, + "loss": 1.3464, + "step": 24061 + }, + { + "epoch": 0.8617114616720684, + "grad_norm": 1.454877495765686, + "learning_rate": 9.861455947510112e-06, + "loss": 1.4811, + "step": 24062 + }, + { + "epoch": 0.8617472738016366, + "grad_norm": 1.4645787477493286, + "learning_rate": 9.856433976514479e-06, + "loss": 1.4101, + "step": 24063 + }, + { + "epoch": 0.8617830859312049, + "grad_norm": 2.1881892681121826, + "learning_rate": 9.8514132182843e-06, + "loss": 1.2949, + "step": 24064 + }, + { + "epoch": 0.8618188980607732, + "grad_norm": 1.827864170074463, + "learning_rate": 9.846393672887044e-06, + "loss": 1.496, + "step": 24065 + }, + { + "epoch": 0.8618547101903414, + "grad_norm": 1.3500398397445679, + "learning_rate": 9.841375340390268e-06, + "loss": 1.4182, + "step": 24066 + }, + { + "epoch": 0.8618905223199097, + "grad_norm": 1.408724069595337, + "learning_rate": 9.836358220861508e-06, + "loss": 1.3504, + "step": 24067 + }, + { + "epoch": 0.8619263344494781, + "grad_norm": 1.4061379432678223, + "learning_rate": 9.831342314368252e-06, + "loss": 1.1648, + "step": 24068 + }, + { + "epoch": 0.8619621465790464, + "grad_norm": 1.690593957901001, + "learning_rate": 9.826327620977972e-06, + "loss": 1.3875, + "step": 24069 + }, + { + "epoch": 0.8619979587086146, + "grad_norm": 1.5156593322753906, + "learning_rate": 9.82131414075811e-06, + "loss": 1.1197, + "step": 24070 + }, + { + "epoch": 0.8620337708381829, + "grad_norm": 1.518989086151123, + "learning_rate": 9.816301873776178e-06, + "loss": 1.2619, + "step": 24071 + }, + { + "epoch": 0.8620695829677512, + "grad_norm": 1.8217155933380127, + "learning_rate": 9.81129082009955e-06, + "loss": 1.471, + "step": 24072 + }, + { + "epoch": 0.8621053950973194, + "grad_norm": 1.4540605545043945, + "learning_rate": 9.8062809797957e-06, + "loss": 1.2745, + "step": 24073 + }, + { + "epoch": 0.8621412072268877, + "grad_norm": 1.4706782102584839, + "learning_rate": 9.801272352931957e-06, + "loss": 1.4899, + "step": 24074 + }, + { + "epoch": 0.8621770193564561, + "grad_norm": 1.7271860837936401, + "learning_rate": 9.796264939575784e-06, + "loss": 1.5868, + "step": 24075 + }, + { + "epoch": 0.8622128314860243, + "grad_norm": 1.2994468212127686, + "learning_rate": 9.791258739794484e-06, + "loss": 1.5339, + "step": 24076 + }, + { + "epoch": 0.8622486436155926, + "grad_norm": 1.6923842430114746, + "learning_rate": 9.78625375365545e-06, + "loss": 1.3637, + "step": 24077 + }, + { + "epoch": 0.8622844557451609, + "grad_norm": 1.8347572088241577, + "learning_rate": 9.781249981226015e-06, + "loss": 1.6562, + "step": 24078 + }, + { + "epoch": 0.8623202678747292, + "grad_norm": 2.0344903469085693, + "learning_rate": 9.77624742257347e-06, + "loss": 1.68, + "step": 24079 + }, + { + "epoch": 0.8623560800042974, + "grad_norm": 1.6433473825454712, + "learning_rate": 9.771246077765151e-06, + "loss": 1.554, + "step": 24080 + }, + { + "epoch": 0.8623918921338657, + "grad_norm": 1.4841326475143433, + "learning_rate": 9.766245946868302e-06, + "loss": 1.5479, + "step": 24081 + }, + { + "epoch": 0.8624277042634341, + "grad_norm": 1.3508626222610474, + "learning_rate": 9.761247029950249e-06, + "loss": 1.2383, + "step": 24082 + }, + { + "epoch": 0.8624635163930023, + "grad_norm": 1.4519118070602417, + "learning_rate": 9.756249327078204e-06, + "loss": 1.3748, + "step": 24083 + }, + { + "epoch": 0.8624993285225706, + "grad_norm": 2.055274486541748, + "learning_rate": 9.751252838319436e-06, + "loss": 1.6204, + "step": 24084 + }, + { + "epoch": 0.8625351406521389, + "grad_norm": 1.8891230821609497, + "learning_rate": 9.746257563741102e-06, + "loss": 1.2223, + "step": 24085 + }, + { + "epoch": 0.8625709527817071, + "grad_norm": 1.5189100503921509, + "learning_rate": 9.741263503410503e-06, + "loss": 1.4791, + "step": 24086 + }, + { + "epoch": 0.8626067649112754, + "grad_norm": 1.586703896522522, + "learning_rate": 9.736270657394774e-06, + "loss": 1.1318, + "step": 24087 + }, + { + "epoch": 0.8626425770408437, + "grad_norm": 1.821293830871582, + "learning_rate": 9.731279025761076e-06, + "loss": 1.2319, + "step": 24088 + }, + { + "epoch": 0.8626783891704121, + "grad_norm": 1.7827171087265015, + "learning_rate": 9.726288608576573e-06, + "loss": 1.3376, + "step": 24089 + }, + { + "epoch": 0.8627142012999803, + "grad_norm": 1.368043303489685, + "learning_rate": 9.721299405908412e-06, + "loss": 1.5547, + "step": 24090 + }, + { + "epoch": 0.8627500134295486, + "grad_norm": 2.012338638305664, + "learning_rate": 9.716311417823742e-06, + "loss": 1.7127, + "step": 24091 + }, + { + "epoch": 0.8627858255591169, + "grad_norm": 2.133498191833496, + "learning_rate": 9.711324644389609e-06, + "loss": 1.4458, + "step": 24092 + }, + { + "epoch": 0.8628216376886851, + "grad_norm": 1.5990008115768433, + "learning_rate": 9.706339085673167e-06, + "loss": 1.2618, + "step": 24093 + }, + { + "epoch": 0.8628574498182534, + "grad_norm": 1.9938685894012451, + "learning_rate": 9.701354741741454e-06, + "loss": 1.6879, + "step": 24094 + }, + { + "epoch": 0.8628932619478217, + "grad_norm": 1.8313326835632324, + "learning_rate": 9.696371612661548e-06, + "loss": 1.3386, + "step": 24095 + }, + { + "epoch": 0.8629290740773901, + "grad_norm": 1.820915699005127, + "learning_rate": 9.691389698500463e-06, + "loss": 1.2029, + "step": 24096 + }, + { + "epoch": 0.8629648862069583, + "grad_norm": 1.3561536073684692, + "learning_rate": 9.686408999325236e-06, + "loss": 1.3453, + "step": 24097 + }, + { + "epoch": 0.8630006983365266, + "grad_norm": 2.9096224308013916, + "learning_rate": 9.6814295152029e-06, + "loss": 1.2922, + "step": 24098 + }, + { + "epoch": 0.8630365104660949, + "grad_norm": 1.629462718963623, + "learning_rate": 9.676451246200401e-06, + "loss": 1.4125, + "step": 24099 + }, + { + "epoch": 0.8630723225956631, + "grad_norm": 1.6376150846481323, + "learning_rate": 9.671474192384755e-06, + "loss": 1.2534, + "step": 24100 + }, + { + "epoch": 0.8631081347252314, + "grad_norm": 1.6502249240875244, + "learning_rate": 9.666498353822905e-06, + "loss": 1.478, + "step": 24101 + }, + { + "epoch": 0.8631439468547997, + "grad_norm": 2.37195086479187, + "learning_rate": 9.661523730581813e-06, + "loss": 1.2485, + "step": 24102 + }, + { + "epoch": 0.863179758984368, + "grad_norm": 1.9842101335525513, + "learning_rate": 9.656550322728353e-06, + "loss": 1.3154, + "step": 24103 + }, + { + "epoch": 0.8632155711139363, + "grad_norm": 1.7502880096435547, + "learning_rate": 9.651578130329508e-06, + "loss": 1.5923, + "step": 24104 + }, + { + "epoch": 0.8632513832435046, + "grad_norm": 1.3002649545669556, + "learning_rate": 9.646607153452147e-06, + "loss": 1.1142, + "step": 24105 + }, + { + "epoch": 0.8632871953730729, + "grad_norm": 1.5864512920379639, + "learning_rate": 9.641637392163116e-06, + "loss": 1.1018, + "step": 24106 + }, + { + "epoch": 0.8633230075026411, + "grad_norm": 1.585464596748352, + "learning_rate": 9.636668846529296e-06, + "loss": 1.3463, + "step": 24107 + }, + { + "epoch": 0.8633588196322094, + "grad_norm": 1.4023091793060303, + "learning_rate": 9.631701516617542e-06, + "loss": 1.2446, + "step": 24108 + }, + { + "epoch": 0.8633946317617777, + "grad_norm": 1.7487534284591675, + "learning_rate": 9.626735402494703e-06, + "loss": 1.1232, + "step": 24109 + }, + { + "epoch": 0.863430443891346, + "grad_norm": 2.0945987701416016, + "learning_rate": 9.621770504227534e-06, + "loss": 1.4021, + "step": 24110 + }, + { + "epoch": 0.8634662560209143, + "grad_norm": 1.8034226894378662, + "learning_rate": 9.616806821882873e-06, + "loss": 1.2245, + "step": 24111 + }, + { + "epoch": 0.8635020681504826, + "grad_norm": 1.5060043334960938, + "learning_rate": 9.611844355527477e-06, + "loss": 1.4211, + "step": 24112 + }, + { + "epoch": 0.8635378802800509, + "grad_norm": 1.9010181427001953, + "learning_rate": 9.60688310522816e-06, + "loss": 1.5169, + "step": 24113 + }, + { + "epoch": 0.8635736924096191, + "grad_norm": 2.2144103050231934, + "learning_rate": 9.6019230710516e-06, + "loss": 1.7569, + "step": 24114 + }, + { + "epoch": 0.8636095045391874, + "grad_norm": 1.660400390625, + "learning_rate": 9.596964253064567e-06, + "loss": 1.3721, + "step": 24115 + }, + { + "epoch": 0.8636453166687557, + "grad_norm": 1.677625298500061, + "learning_rate": 9.592006651333785e-06, + "loss": 1.4996, + "step": 24116 + }, + { + "epoch": 0.863681128798324, + "grad_norm": 1.848111867904663, + "learning_rate": 9.587050265925912e-06, + "loss": 1.7495, + "step": 24117 + }, + { + "epoch": 0.8637169409278923, + "grad_norm": 1.6364637613296509, + "learning_rate": 9.582095096907651e-06, + "loss": 1.3123, + "step": 24118 + }, + { + "epoch": 0.8637527530574606, + "grad_norm": 1.3802284002304077, + "learning_rate": 9.57714114434568e-06, + "loss": 1.4342, + "step": 24119 + }, + { + "epoch": 0.8637885651870288, + "grad_norm": 1.6368407011032104, + "learning_rate": 9.572188408306649e-06, + "loss": 1.496, + "step": 24120 + }, + { + "epoch": 0.8638243773165971, + "grad_norm": 1.535515546798706, + "learning_rate": 9.567236888857166e-06, + "loss": 1.4658, + "step": 24121 + }, + { + "epoch": 0.8638601894461654, + "grad_norm": 1.625495195388794, + "learning_rate": 9.562286586063861e-06, + "loss": 1.8956, + "step": 24122 + }, + { + "epoch": 0.8638960015757337, + "grad_norm": 1.8822096586227417, + "learning_rate": 9.557337499993346e-06, + "loss": 1.4238, + "step": 24123 + }, + { + "epoch": 0.863931813705302, + "grad_norm": 2.1007256507873535, + "learning_rate": 9.552389630712178e-06, + "loss": 1.5239, + "step": 24124 + }, + { + "epoch": 0.8639676258348703, + "grad_norm": 2.123950958251953, + "learning_rate": 9.547442978286946e-06, + "loss": 1.5102, + "step": 24125 + }, + { + "epoch": 0.8640034379644386, + "grad_norm": 1.6733735799789429, + "learning_rate": 9.542497542784178e-06, + "loss": 1.2072, + "step": 24126 + }, + { + "epoch": 0.8640392500940068, + "grad_norm": 1.3590643405914307, + "learning_rate": 9.537553324270455e-06, + "loss": 1.7378, + "step": 24127 + }, + { + "epoch": 0.8640750622235751, + "grad_norm": 1.9480410814285278, + "learning_rate": 9.53261032281224e-06, + "loss": 1.5894, + "step": 24128 + }, + { + "epoch": 0.8641108743531434, + "grad_norm": 1.5625706911087036, + "learning_rate": 9.527668538476054e-06, + "loss": 1.3688, + "step": 24129 + }, + { + "epoch": 0.8641466864827116, + "grad_norm": 1.3763489723205566, + "learning_rate": 9.522727971328393e-06, + "loss": 0.9818, + "step": 24130 + }, + { + "epoch": 0.86418249861228, + "grad_norm": 2.1239705085754395, + "learning_rate": 9.517788621435742e-06, + "loss": 1.7518, + "step": 24131 + }, + { + "epoch": 0.8642183107418483, + "grad_norm": 2.0525310039520264, + "learning_rate": 9.512850488864511e-06, + "loss": 1.6703, + "step": 24132 + }, + { + "epoch": 0.8642541228714166, + "grad_norm": 2.882822036743164, + "learning_rate": 9.50791357368115e-06, + "loss": 1.3199, + "step": 24133 + }, + { + "epoch": 0.8642899350009848, + "grad_norm": 1.5467312335968018, + "learning_rate": 9.502977875952113e-06, + "loss": 1.4511, + "step": 24134 + }, + { + "epoch": 0.8643257471305531, + "grad_norm": 1.8754770755767822, + "learning_rate": 9.49804339574375e-06, + "loss": 1.5401, + "step": 24135 + }, + { + "epoch": 0.8643615592601214, + "grad_norm": 1.4859979152679443, + "learning_rate": 9.493110133122474e-06, + "loss": 1.5788, + "step": 24136 + }, + { + "epoch": 0.8643973713896896, + "grad_norm": 1.6180216073989868, + "learning_rate": 9.488178088154654e-06, + "loss": 1.0272, + "step": 24137 + }, + { + "epoch": 0.864433183519258, + "grad_norm": 1.5847938060760498, + "learning_rate": 9.48324726090667e-06, + "loss": 1.4547, + "step": 24138 + }, + { + "epoch": 0.8644689956488263, + "grad_norm": 1.4334745407104492, + "learning_rate": 9.478317651444812e-06, + "loss": 1.2533, + "step": 24139 + }, + { + "epoch": 0.8645048077783946, + "grad_norm": 1.6126441955566406, + "learning_rate": 9.47338925983543e-06, + "loss": 1.2693, + "step": 24140 + }, + { + "epoch": 0.8645406199079628, + "grad_norm": 1.5606629848480225, + "learning_rate": 9.468462086144847e-06, + "loss": 1.3871, + "step": 24141 + }, + { + "epoch": 0.8645764320375311, + "grad_norm": 1.4781626462936401, + "learning_rate": 9.46353613043931e-06, + "loss": 1.239, + "step": 24142 + }, + { + "epoch": 0.8646122441670994, + "grad_norm": 1.2007153034210205, + "learning_rate": 9.4586113927851e-06, + "loss": 1.2876, + "step": 24143 + }, + { + "epoch": 0.8646480562966676, + "grad_norm": 1.7525100708007812, + "learning_rate": 9.453687873248495e-06, + "loss": 1.5107, + "step": 24144 + }, + { + "epoch": 0.864683868426236, + "grad_norm": 1.4336668252944946, + "learning_rate": 9.448765571895735e-06, + "loss": 1.4305, + "step": 24145 + }, + { + "epoch": 0.8647196805558043, + "grad_norm": 2.0794224739074707, + "learning_rate": 9.443844488793018e-06, + "loss": 1.7878, + "step": 24146 + }, + { + "epoch": 0.8647554926853726, + "grad_norm": 1.4945629835128784, + "learning_rate": 9.438924624006563e-06, + "loss": 1.5938, + "step": 24147 + }, + { + "epoch": 0.8647913048149408, + "grad_norm": 1.2787604331970215, + "learning_rate": 9.434005977602556e-06, + "loss": 1.2151, + "step": 24148 + }, + { + "epoch": 0.8648271169445091, + "grad_norm": 1.4332270622253418, + "learning_rate": 9.429088549647203e-06, + "loss": 1.2921, + "step": 24149 + }, + { + "epoch": 0.8648629290740774, + "grad_norm": 1.8126541376113892, + "learning_rate": 9.424172340206616e-06, + "loss": 1.3874, + "step": 24150 + }, + { + "epoch": 0.8648987412036456, + "grad_norm": 1.3045158386230469, + "learning_rate": 9.419257349346956e-06, + "loss": 1.3823, + "step": 24151 + }, + { + "epoch": 0.864934553333214, + "grad_norm": 1.5711522102355957, + "learning_rate": 9.414343577134355e-06, + "loss": 1.5328, + "step": 24152 + }, + { + "epoch": 0.8649703654627823, + "grad_norm": 1.600151777267456, + "learning_rate": 9.409431023634908e-06, + "loss": 1.39, + "step": 24153 + }, + { + "epoch": 0.8650061775923505, + "grad_norm": 1.6828088760375977, + "learning_rate": 9.404519688914703e-06, + "loss": 1.2594, + "step": 24154 + }, + { + "epoch": 0.8650419897219188, + "grad_norm": 1.626774787902832, + "learning_rate": 9.399609573039836e-06, + "loss": 1.4374, + "step": 24155 + }, + { + "epoch": 0.8650778018514871, + "grad_norm": 1.709047555923462, + "learning_rate": 9.394700676076374e-06, + "loss": 1.2774, + "step": 24156 + }, + { + "epoch": 0.8651136139810554, + "grad_norm": 1.3657279014587402, + "learning_rate": 9.389792998090319e-06, + "loss": 1.4303, + "step": 24157 + }, + { + "epoch": 0.8651494261106236, + "grad_norm": 1.8122633695602417, + "learning_rate": 9.384886539147718e-06, + "loss": 1.4683, + "step": 24158 + }, + { + "epoch": 0.865185238240192, + "grad_norm": 2.004755973815918, + "learning_rate": 9.379981299314611e-06, + "loss": 1.7386, + "step": 24159 + }, + { + "epoch": 0.8652210503697603, + "grad_norm": 1.629216194152832, + "learning_rate": 9.375077278656941e-06, + "loss": 1.3271, + "step": 24160 + }, + { + "epoch": 0.8652568624993285, + "grad_norm": 1.6570649147033691, + "learning_rate": 9.370174477240712e-06, + "loss": 1.3823, + "step": 24161 + }, + { + "epoch": 0.8652926746288968, + "grad_norm": 1.4738001823425293, + "learning_rate": 9.36527289513187e-06, + "loss": 1.4896, + "step": 24162 + }, + { + "epoch": 0.8653284867584651, + "grad_norm": 1.5269134044647217, + "learning_rate": 9.3603725323964e-06, + "loss": 1.3027, + "step": 24163 + }, + { + "epoch": 0.8653642988880333, + "grad_norm": 2.1662659645080566, + "learning_rate": 9.355473389100178e-06, + "loss": 1.4489, + "step": 24164 + }, + { + "epoch": 0.8654001110176016, + "grad_norm": 1.455833911895752, + "learning_rate": 9.350575465309142e-06, + "loss": 1.3106, + "step": 24165 + }, + { + "epoch": 0.86543592314717, + "grad_norm": 1.9036498069763184, + "learning_rate": 9.345678761089194e-06, + "loss": 1.6722, + "step": 24166 + }, + { + "epoch": 0.8654717352767383, + "grad_norm": 1.4704736471176147, + "learning_rate": 9.340783276506193e-06, + "loss": 1.2371, + "step": 24167 + }, + { + "epoch": 0.8655075474063065, + "grad_norm": 1.6556400060653687, + "learning_rate": 9.335889011626032e-06, + "loss": 1.2193, + "step": 24168 + }, + { + "epoch": 0.8655433595358748, + "grad_norm": 1.7995213270187378, + "learning_rate": 9.330995966514489e-06, + "loss": 1.4554, + "step": 24169 + }, + { + "epoch": 0.8655791716654431, + "grad_norm": 1.5920368432998657, + "learning_rate": 9.32610414123748e-06, + "loss": 1.2939, + "step": 24170 + }, + { + "epoch": 0.8656149837950113, + "grad_norm": 1.5632593631744385, + "learning_rate": 9.321213535860763e-06, + "loss": 1.2844, + "step": 24171 + }, + { + "epoch": 0.8656507959245796, + "grad_norm": 1.6625022888183594, + "learning_rate": 9.316324150450173e-06, + "loss": 1.2916, + "step": 24172 + }, + { + "epoch": 0.865686608054148, + "grad_norm": 1.8836668729782104, + "learning_rate": 9.311435985071426e-06, + "loss": 1.4325, + "step": 24173 + }, + { + "epoch": 0.8657224201837163, + "grad_norm": 1.6699721813201904, + "learning_rate": 9.30654903979037e-06, + "loss": 1.4862, + "step": 24174 + }, + { + "epoch": 0.8657582323132845, + "grad_norm": 1.752537488937378, + "learning_rate": 9.301663314672704e-06, + "loss": 1.4677, + "step": 24175 + }, + { + "epoch": 0.8657940444428528, + "grad_norm": 2.023043155670166, + "learning_rate": 9.296778809784123e-06, + "loss": 1.2724, + "step": 24176 + }, + { + "epoch": 0.8658298565724211, + "grad_norm": 1.663191556930542, + "learning_rate": 9.29189552519043e-06, + "loss": 1.4344, + "step": 24177 + }, + { + "epoch": 0.8658656687019893, + "grad_norm": 1.4714888334274292, + "learning_rate": 9.287013460957261e-06, + "loss": 1.2581, + "step": 24178 + }, + { + "epoch": 0.8659014808315576, + "grad_norm": 1.4269914627075195, + "learning_rate": 9.28213261715033e-06, + "loss": 1.4169, + "step": 24179 + }, + { + "epoch": 0.865937292961126, + "grad_norm": 1.6043585538864136, + "learning_rate": 9.27725299383525e-06, + "loss": 1.5219, + "step": 24180 + }, + { + "epoch": 0.8659731050906943, + "grad_norm": 1.789746642112732, + "learning_rate": 9.272374591077748e-06, + "loss": 1.2993, + "step": 24181 + }, + { + "epoch": 0.8660089172202625, + "grad_norm": 1.4630588293075562, + "learning_rate": 9.267497408943393e-06, + "loss": 1.5637, + "step": 24182 + }, + { + "epoch": 0.8660447293498308, + "grad_norm": 1.9434194564819336, + "learning_rate": 9.262621447497844e-06, + "loss": 1.3992, + "step": 24183 + }, + { + "epoch": 0.8660805414793991, + "grad_norm": 1.6045405864715576, + "learning_rate": 9.257746706806658e-06, + "loss": 1.2954, + "step": 24184 + }, + { + "epoch": 0.8661163536089673, + "grad_norm": 1.6443135738372803, + "learning_rate": 9.252873186935452e-06, + "loss": 1.2575, + "step": 24185 + }, + { + "epoch": 0.8661521657385356, + "grad_norm": 2.0701043605804443, + "learning_rate": 9.248000887949782e-06, + "loss": 1.5517, + "step": 24186 + }, + { + "epoch": 0.866187977868104, + "grad_norm": 1.616709589958191, + "learning_rate": 9.243129809915175e-06, + "loss": 1.4381, + "step": 24187 + }, + { + "epoch": 0.8662237899976722, + "grad_norm": 1.3892414569854736, + "learning_rate": 9.238259952897221e-06, + "loss": 1.4791, + "step": 24188 + }, + { + "epoch": 0.8662596021272405, + "grad_norm": 1.8243262767791748, + "learning_rate": 9.233391316961393e-06, + "loss": 1.6, + "step": 24189 + }, + { + "epoch": 0.8662954142568088, + "grad_norm": 1.4055460691452026, + "learning_rate": 9.228523902173214e-06, + "loss": 1.2116, + "step": 24190 + }, + { + "epoch": 0.866331226386377, + "grad_norm": 1.6112884283065796, + "learning_rate": 9.223657708598133e-06, + "loss": 1.3763, + "step": 24191 + }, + { + "epoch": 0.8663670385159453, + "grad_norm": 1.9256858825683594, + "learning_rate": 9.218792736301674e-06, + "loss": 1.5623, + "step": 24192 + }, + { + "epoch": 0.8664028506455136, + "grad_norm": 1.4280561208724976, + "learning_rate": 9.213928985349252e-06, + "loss": 1.3904, + "step": 24193 + }, + { + "epoch": 0.866438662775082, + "grad_norm": 1.9041266441345215, + "learning_rate": 9.209066455806303e-06, + "loss": 1.5014, + "step": 24194 + }, + { + "epoch": 0.8664744749046502, + "grad_norm": 1.7539561986923218, + "learning_rate": 9.204205147738254e-06, + "loss": 1.3906, + "step": 24195 + }, + { + "epoch": 0.8665102870342185, + "grad_norm": 1.7698585987091064, + "learning_rate": 9.199345061210495e-06, + "loss": 1.4611, + "step": 24196 + }, + { + "epoch": 0.8665460991637868, + "grad_norm": 1.589356541633606, + "learning_rate": 9.194486196288454e-06, + "loss": 1.4255, + "step": 24197 + }, + { + "epoch": 0.866581911293355, + "grad_norm": 1.2249170541763306, + "learning_rate": 9.189628553037445e-06, + "loss": 1.4406, + "step": 24198 + }, + { + "epoch": 0.8666177234229233, + "grad_norm": 1.425215482711792, + "learning_rate": 9.184772131522845e-06, + "loss": 1.3896, + "step": 24199 + }, + { + "epoch": 0.8666535355524916, + "grad_norm": 1.1758129596710205, + "learning_rate": 9.179916931809995e-06, + "loss": 1.323, + "step": 24200 + }, + { + "epoch": 0.86668934768206, + "grad_norm": 1.505043387413025, + "learning_rate": 9.175062953964242e-06, + "loss": 1.5766, + "step": 24201 + }, + { + "epoch": 0.8667251598116282, + "grad_norm": 1.791693091392517, + "learning_rate": 9.170210198050833e-06, + "loss": 1.1097, + "step": 24202 + }, + { + "epoch": 0.8667609719411965, + "grad_norm": 1.9780690670013428, + "learning_rate": 9.165358664135082e-06, + "loss": 1.4566, + "step": 24203 + }, + { + "epoch": 0.8667967840707648, + "grad_norm": 1.7749983072280884, + "learning_rate": 9.160508352282282e-06, + "loss": 1.4966, + "step": 24204 + }, + { + "epoch": 0.866832596200333, + "grad_norm": 2.2015771865844727, + "learning_rate": 9.155659262557648e-06, + "loss": 1.4599, + "step": 24205 + }, + { + "epoch": 0.8668684083299013, + "grad_norm": 1.5244667530059814, + "learning_rate": 9.150811395026448e-06, + "loss": 1.4312, + "step": 24206 + }, + { + "epoch": 0.8669042204594696, + "grad_norm": 2.0955560207366943, + "learning_rate": 9.145964749753888e-06, + "loss": 1.7519, + "step": 24207 + }, + { + "epoch": 0.866940032589038, + "grad_norm": 1.8348698616027832, + "learning_rate": 9.141119326805193e-06, + "loss": 1.5078, + "step": 24208 + }, + { + "epoch": 0.8669758447186062, + "grad_norm": 1.442132830619812, + "learning_rate": 9.13627512624552e-06, + "loss": 1.3298, + "step": 24209 + }, + { + "epoch": 0.8670116568481745, + "grad_norm": 1.7003968954086304, + "learning_rate": 9.131432148140062e-06, + "loss": 1.665, + "step": 24210 + }, + { + "epoch": 0.8670474689777428, + "grad_norm": 2.5281410217285156, + "learning_rate": 9.126590392553992e-06, + "loss": 1.4594, + "step": 24211 + }, + { + "epoch": 0.867083281107311, + "grad_norm": 2.014244556427002, + "learning_rate": 9.12174985955241e-06, + "loss": 1.2994, + "step": 24212 + }, + { + "epoch": 0.8671190932368793, + "grad_norm": 1.3764430284500122, + "learning_rate": 9.116910549200452e-06, + "loss": 1.1514, + "step": 24213 + }, + { + "epoch": 0.8671549053664476, + "grad_norm": 2.000931978225708, + "learning_rate": 9.112072461563248e-06, + "loss": 1.557, + "step": 24214 + }, + { + "epoch": 0.867190717496016, + "grad_norm": 1.996650218963623, + "learning_rate": 9.107235596705877e-06, + "loss": 1.3858, + "step": 24215 + }, + { + "epoch": 0.8672265296255842, + "grad_norm": 1.7024036645889282, + "learning_rate": 9.102399954693396e-06, + "loss": 1.5788, + "step": 24216 + }, + { + "epoch": 0.8672623417551525, + "grad_norm": 1.849635362625122, + "learning_rate": 9.097565535590869e-06, + "loss": 1.4354, + "step": 24217 + }, + { + "epoch": 0.8672981538847208, + "grad_norm": 1.4759061336517334, + "learning_rate": 9.092732339463339e-06, + "loss": 1.2717, + "step": 24218 + }, + { + "epoch": 0.867333966014289, + "grad_norm": 1.4988124370574951, + "learning_rate": 9.087900366375868e-06, + "loss": 1.5389, + "step": 24219 + }, + { + "epoch": 0.8673697781438573, + "grad_norm": 1.6259818077087402, + "learning_rate": 9.083069616393392e-06, + "loss": 1.3329, + "step": 24220 + }, + { + "epoch": 0.8674055902734256, + "grad_norm": 2.060462713241577, + "learning_rate": 9.078240089580948e-06, + "loss": 1.7412, + "step": 24221 + }, + { + "epoch": 0.8674414024029939, + "grad_norm": 1.432573676109314, + "learning_rate": 9.073411786003527e-06, + "loss": 1.658, + "step": 24222 + }, + { + "epoch": 0.8674772145325622, + "grad_norm": 1.677588939666748, + "learning_rate": 9.068584705726035e-06, + "loss": 1.2352, + "step": 24223 + }, + { + "epoch": 0.8675130266621305, + "grad_norm": 2.123272180557251, + "learning_rate": 9.063758848813452e-06, + "loss": 1.4148, + "step": 24224 + }, + { + "epoch": 0.8675488387916988, + "grad_norm": 1.933234453201294, + "learning_rate": 9.058934215330695e-06, + "loss": 1.3967, + "step": 24225 + }, + { + "epoch": 0.867584650921267, + "grad_norm": 1.572487711906433, + "learning_rate": 9.054110805342686e-06, + "loss": 1.4756, + "step": 24226 + }, + { + "epoch": 0.8676204630508353, + "grad_norm": 2.7359111309051514, + "learning_rate": 9.049288618914276e-06, + "loss": 1.6118, + "step": 24227 + }, + { + "epoch": 0.8676562751804036, + "grad_norm": 1.771386981010437, + "learning_rate": 9.044467656110389e-06, + "loss": 1.3877, + "step": 24228 + }, + { + "epoch": 0.8676920873099719, + "grad_norm": 1.7388691902160645, + "learning_rate": 9.039647916995874e-06, + "loss": 1.2226, + "step": 24229 + }, + { + "epoch": 0.8677278994395402, + "grad_norm": 2.149545192718506, + "learning_rate": 9.034829401635547e-06, + "loss": 1.3794, + "step": 24230 + }, + { + "epoch": 0.8677637115691085, + "grad_norm": 1.9506735801696777, + "learning_rate": 9.030012110094255e-06, + "loss": 1.5796, + "step": 24231 + }, + { + "epoch": 0.8677995236986767, + "grad_norm": 1.4188599586486816, + "learning_rate": 9.025196042436802e-06, + "loss": 1.5882, + "step": 24232 + }, + { + "epoch": 0.867835335828245, + "grad_norm": 1.2901597023010254, + "learning_rate": 9.020381198728011e-06, + "loss": 1.6489, + "step": 24233 + }, + { + "epoch": 0.8678711479578133, + "grad_norm": 1.7088090181350708, + "learning_rate": 9.015567579032614e-06, + "loss": 1.5318, + "step": 24234 + }, + { + "epoch": 0.8679069600873816, + "grad_norm": 1.7100541591644287, + "learning_rate": 9.010755183415398e-06, + "loss": 1.4519, + "step": 24235 + }, + { + "epoch": 0.8679427722169499, + "grad_norm": 1.6215293407440186, + "learning_rate": 9.005944011941103e-06, + "loss": 1.4229, + "step": 24236 + }, + { + "epoch": 0.8679785843465182, + "grad_norm": 1.7219916582107544, + "learning_rate": 9.001134064674476e-06, + "loss": 1.4134, + "step": 24237 + }, + { + "epoch": 0.8680143964760865, + "grad_norm": 1.3429756164550781, + "learning_rate": 8.99632534168019e-06, + "loss": 1.6244, + "step": 24238 + }, + { + "epoch": 0.8680502086056547, + "grad_norm": 1.7451717853546143, + "learning_rate": 8.991517843022968e-06, + "loss": 1.5262, + "step": 24239 + }, + { + "epoch": 0.868086020735223, + "grad_norm": 1.512374758720398, + "learning_rate": 8.986711568767493e-06, + "loss": 1.2379, + "step": 24240 + }, + { + "epoch": 0.8681218328647913, + "grad_norm": 2.092271089553833, + "learning_rate": 8.981906518978389e-06, + "loss": 1.6425, + "step": 24241 + }, + { + "epoch": 0.8681576449943595, + "grad_norm": 1.376625657081604, + "learning_rate": 8.977102693720341e-06, + "loss": 1.5389, + "step": 24242 + }, + { + "epoch": 0.8681934571239279, + "grad_norm": 1.661430835723877, + "learning_rate": 8.97230009305795e-06, + "loss": 1.507, + "step": 24243 + }, + { + "epoch": 0.8682292692534962, + "grad_norm": 2.269094944000244, + "learning_rate": 8.967498717055878e-06, + "loss": 1.596, + "step": 24244 + }, + { + "epoch": 0.8682650813830645, + "grad_norm": 1.464607834815979, + "learning_rate": 8.96269856577866e-06, + "loss": 1.4243, + "step": 24245 + }, + { + "epoch": 0.8683008935126327, + "grad_norm": 1.3154370784759521, + "learning_rate": 8.9578996392909e-06, + "loss": 1.2494, + "step": 24246 + }, + { + "epoch": 0.868336705642201, + "grad_norm": 1.6771907806396484, + "learning_rate": 8.953101937657194e-06, + "loss": 1.4635, + "step": 24247 + }, + { + "epoch": 0.8683725177717693, + "grad_norm": 1.5361515283584595, + "learning_rate": 8.94830546094203e-06, + "loss": 1.6276, + "step": 24248 + }, + { + "epoch": 0.8684083299013375, + "grad_norm": 1.7470072507858276, + "learning_rate": 8.943510209209971e-06, + "loss": 1.6124, + "step": 24249 + }, + { + "epoch": 0.8684441420309059, + "grad_norm": 1.491672396659851, + "learning_rate": 8.93871618252553e-06, + "loss": 1.2757, + "step": 24250 + }, + { + "epoch": 0.8684799541604742, + "grad_norm": 1.5673433542251587, + "learning_rate": 8.933923380953224e-06, + "loss": 1.0436, + "step": 24251 + }, + { + "epoch": 0.8685157662900425, + "grad_norm": 1.1775404214859009, + "learning_rate": 8.92913180455749e-06, + "loss": 1.2953, + "step": 24252 + }, + { + "epoch": 0.8685515784196107, + "grad_norm": 1.688716173171997, + "learning_rate": 8.924341453402817e-06, + "loss": 1.3285, + "step": 24253 + }, + { + "epoch": 0.868587390549179, + "grad_norm": 1.6891595125198364, + "learning_rate": 8.919552327553648e-06, + "loss": 1.5579, + "step": 24254 + }, + { + "epoch": 0.8686232026787473, + "grad_norm": 1.63966703414917, + "learning_rate": 8.914764427074428e-06, + "loss": 1.4901, + "step": 24255 + }, + { + "epoch": 0.8686590148083155, + "grad_norm": 1.675835371017456, + "learning_rate": 8.909977752029574e-06, + "loss": 1.3802, + "step": 24256 + }, + { + "epoch": 0.8686948269378839, + "grad_norm": 2.029541492462158, + "learning_rate": 8.905192302483433e-06, + "loss": 1.4674, + "step": 24257 + }, + { + "epoch": 0.8687306390674522, + "grad_norm": 1.7141716480255127, + "learning_rate": 8.900408078500454e-06, + "loss": 1.4041, + "step": 24258 + }, + { + "epoch": 0.8687664511970205, + "grad_norm": 1.9804600477218628, + "learning_rate": 8.895625080144965e-06, + "loss": 1.4072, + "step": 24259 + }, + { + "epoch": 0.8688022633265887, + "grad_norm": 1.474840760231018, + "learning_rate": 8.890843307481322e-06, + "loss": 1.3555, + "step": 24260 + }, + { + "epoch": 0.868838075456157, + "grad_norm": 1.8305799961090088, + "learning_rate": 8.886062760573854e-06, + "loss": 1.4299, + "step": 24261 + }, + { + "epoch": 0.8688738875857253, + "grad_norm": 1.6935603618621826, + "learning_rate": 8.88128343948691e-06, + "loss": 1.416, + "step": 24262 + }, + { + "epoch": 0.8689096997152935, + "grad_norm": 1.5021482706069946, + "learning_rate": 8.876505344284758e-06, + "loss": 1.318, + "step": 24263 + }, + { + "epoch": 0.8689455118448619, + "grad_norm": 2.0541741847991943, + "learning_rate": 8.871728475031649e-06, + "loss": 1.6846, + "step": 24264 + }, + { + "epoch": 0.8689813239744302, + "grad_norm": 1.6807490587234497, + "learning_rate": 8.86695283179192e-06, + "loss": 1.2177, + "step": 24265 + }, + { + "epoch": 0.8690171361039984, + "grad_norm": 1.686801552772522, + "learning_rate": 8.862178414629774e-06, + "loss": 1.6565, + "step": 24266 + }, + { + "epoch": 0.8690529482335667, + "grad_norm": 1.6299697160720825, + "learning_rate": 8.857405223609472e-06, + "loss": 1.5448, + "step": 24267 + }, + { + "epoch": 0.869088760363135, + "grad_norm": 2.021028518676758, + "learning_rate": 8.852633258795185e-06, + "loss": 1.4393, + "step": 24268 + }, + { + "epoch": 0.8691245724927033, + "grad_norm": 1.4445551633834839, + "learning_rate": 8.847862520251182e-06, + "loss": 1.1612, + "step": 24269 + }, + { + "epoch": 0.8691603846222715, + "grad_norm": 1.3987027406692505, + "learning_rate": 8.843093008041591e-06, + "loss": 1.5816, + "step": 24270 + }, + { + "epoch": 0.8691961967518399, + "grad_norm": 1.6368407011032104, + "learning_rate": 8.838324722230595e-06, + "loss": 1.1717, + "step": 24271 + }, + { + "epoch": 0.8692320088814082, + "grad_norm": 1.2989962100982666, + "learning_rate": 8.833557662882374e-06, + "loss": 1.4289, + "step": 24272 + }, + { + "epoch": 0.8692678210109764, + "grad_norm": 1.8490816354751587, + "learning_rate": 8.828791830061022e-06, + "loss": 1.2979, + "step": 24273 + }, + { + "epoch": 0.8693036331405447, + "grad_norm": 1.353345274925232, + "learning_rate": 8.824027223830688e-06, + "loss": 1.4558, + "step": 24274 + }, + { + "epoch": 0.869339445270113, + "grad_norm": 1.5162136554718018, + "learning_rate": 8.819263844255432e-06, + "loss": 1.5938, + "step": 24275 + }, + { + "epoch": 0.8693752573996812, + "grad_norm": 1.5665966272354126, + "learning_rate": 8.81450169139939e-06, + "loss": 1.3911, + "step": 24276 + }, + { + "epoch": 0.8694110695292495, + "grad_norm": 1.4457532167434692, + "learning_rate": 8.809740765326591e-06, + "loss": 1.2053, + "step": 24277 + }, + { + "epoch": 0.8694468816588179, + "grad_norm": 2.0011956691741943, + "learning_rate": 8.804981066101126e-06, + "loss": 1.682, + "step": 24278 + }, + { + "epoch": 0.8694826937883862, + "grad_norm": 1.6146661043167114, + "learning_rate": 8.800222593786967e-06, + "loss": 1.6388, + "step": 24279 + }, + { + "epoch": 0.8695185059179544, + "grad_norm": 1.7931370735168457, + "learning_rate": 8.795465348448218e-06, + "loss": 1.3895, + "step": 24280 + }, + { + "epoch": 0.8695543180475227, + "grad_norm": 3.107492446899414, + "learning_rate": 8.790709330148828e-06, + "loss": 1.2655, + "step": 24281 + }, + { + "epoch": 0.869590130177091, + "grad_norm": 2.4997458457946777, + "learning_rate": 8.78595453895278e-06, + "loss": 1.363, + "step": 24282 + }, + { + "epoch": 0.8696259423066592, + "grad_norm": 1.7735134363174438, + "learning_rate": 8.781200974924053e-06, + "loss": 1.5793, + "step": 24283 + }, + { + "epoch": 0.8696617544362275, + "grad_norm": 1.5544660091400146, + "learning_rate": 8.7764486381266e-06, + "loss": 1.3186, + "step": 24284 + }, + { + "epoch": 0.8696975665657959, + "grad_norm": 1.9983875751495361, + "learning_rate": 8.77169752862439e-06, + "loss": 1.5833, + "step": 24285 + }, + { + "epoch": 0.8697333786953642, + "grad_norm": 1.391190767288208, + "learning_rate": 8.76694764648126e-06, + "loss": 1.5987, + "step": 24286 + }, + { + "epoch": 0.8697691908249324, + "grad_norm": 1.8695008754730225, + "learning_rate": 8.762198991761217e-06, + "loss": 1.2977, + "step": 24287 + }, + { + "epoch": 0.8698050029545007, + "grad_norm": 1.3335654735565186, + "learning_rate": 8.757451564528074e-06, + "loss": 1.2562, + "step": 24288 + }, + { + "epoch": 0.869840815084069, + "grad_norm": 1.5261073112487793, + "learning_rate": 8.752705364845748e-06, + "loss": 1.3221, + "step": 24289 + }, + { + "epoch": 0.8698766272136372, + "grad_norm": 1.8547850847244263, + "learning_rate": 8.747960392778053e-06, + "loss": 1.3653, + "step": 24290 + }, + { + "epoch": 0.8699124393432055, + "grad_norm": 1.607026219367981, + "learning_rate": 8.74321664838884e-06, + "loss": 1.3366, + "step": 24291 + }, + { + "epoch": 0.8699482514727739, + "grad_norm": 1.540658950805664, + "learning_rate": 8.738474131741958e-06, + "loss": 1.1223, + "step": 24292 + }, + { + "epoch": 0.8699840636023422, + "grad_norm": 2.3103864192962646, + "learning_rate": 8.733732842901166e-06, + "loss": 1.3668, + "step": 24293 + }, + { + "epoch": 0.8700198757319104, + "grad_norm": 1.749535322189331, + "learning_rate": 8.728992781930278e-06, + "loss": 1.3763, + "step": 24294 + }, + { + "epoch": 0.8700556878614787, + "grad_norm": 1.4846516847610474, + "learning_rate": 8.724253948893057e-06, + "loss": 1.4243, + "step": 24295 + }, + { + "epoch": 0.870091499991047, + "grad_norm": 1.5358998775482178, + "learning_rate": 8.719516343853273e-06, + "loss": 1.5699, + "step": 24296 + }, + { + "epoch": 0.8701273121206152, + "grad_norm": 1.1911826133728027, + "learning_rate": 8.71477996687463e-06, + "loss": 1.5351, + "step": 24297 + }, + { + "epoch": 0.8701631242501835, + "grad_norm": 2.5798017978668213, + "learning_rate": 8.710044818020902e-06, + "loss": 1.2529, + "step": 24298 + }, + { + "epoch": 0.8701989363797519, + "grad_norm": 1.7193776369094849, + "learning_rate": 8.705310897355768e-06, + "loss": 1.4123, + "step": 24299 + }, + { + "epoch": 0.8702347485093201, + "grad_norm": 1.6127065420150757, + "learning_rate": 8.700578204942889e-06, + "loss": 1.6177, + "step": 24300 + }, + { + "epoch": 0.8702705606388884, + "grad_norm": 1.6372747421264648, + "learning_rate": 8.69584674084597e-06, + "loss": 0.992, + "step": 24301 + }, + { + "epoch": 0.8703063727684567, + "grad_norm": 1.3636530637741089, + "learning_rate": 8.69111650512866e-06, + "loss": 1.3641, + "step": 24302 + }, + { + "epoch": 0.870342184898025, + "grad_norm": 1.8584938049316406, + "learning_rate": 8.686387497854609e-06, + "loss": 1.3151, + "step": 24303 + }, + { + "epoch": 0.8703779970275932, + "grad_norm": 1.4046014547348022, + "learning_rate": 8.681659719087421e-06, + "loss": 1.7345, + "step": 24304 + }, + { + "epoch": 0.8704138091571615, + "grad_norm": 1.3433245420455933, + "learning_rate": 8.676933168890699e-06, + "loss": 1.2366, + "step": 24305 + }, + { + "epoch": 0.8704496212867299, + "grad_norm": 2.0000412464141846, + "learning_rate": 8.67220784732804e-06, + "loss": 1.1339, + "step": 24306 + }, + { + "epoch": 0.8704854334162981, + "grad_norm": 1.5002861022949219, + "learning_rate": 8.667483754463046e-06, + "loss": 1.4521, + "step": 24307 + }, + { + "epoch": 0.8705212455458664, + "grad_norm": 1.84786856174469, + "learning_rate": 8.662760890359233e-06, + "loss": 1.2317, + "step": 24308 + }, + { + "epoch": 0.8705570576754347, + "grad_norm": 1.4311268329620361, + "learning_rate": 8.658039255080153e-06, + "loss": 1.4301, + "step": 24309 + }, + { + "epoch": 0.870592869805003, + "grad_norm": 1.6267906427383423, + "learning_rate": 8.65331884868934e-06, + "loss": 1.6193, + "step": 24310 + }, + { + "epoch": 0.8706286819345712, + "grad_norm": 1.5863173007965088, + "learning_rate": 8.64859967125029e-06, + "loss": 1.4066, + "step": 24311 + }, + { + "epoch": 0.8706644940641395, + "grad_norm": 1.7492663860321045, + "learning_rate": 8.643881722826486e-06, + "loss": 1.1955, + "step": 24312 + }, + { + "epoch": 0.8707003061937079, + "grad_norm": 1.6891770362854004, + "learning_rate": 8.639165003481408e-06, + "loss": 1.2417, + "step": 24313 + }, + { + "epoch": 0.8707361183232761, + "grad_norm": 1.444863200187683, + "learning_rate": 8.634449513278553e-06, + "loss": 1.4051, + "step": 24314 + }, + { + "epoch": 0.8707719304528444, + "grad_norm": 2.0153067111968994, + "learning_rate": 8.629735252281301e-06, + "loss": 1.3517, + "step": 24315 + }, + { + "epoch": 0.8708077425824127, + "grad_norm": 1.9217655658721924, + "learning_rate": 8.625022220553091e-06, + "loss": 1.282, + "step": 24316 + }, + { + "epoch": 0.8708435547119809, + "grad_norm": 1.494198203086853, + "learning_rate": 8.620310418157374e-06, + "loss": 1.2652, + "step": 24317 + }, + { + "epoch": 0.8708793668415492, + "grad_norm": 1.5170859098434448, + "learning_rate": 8.615599845157484e-06, + "loss": 1.7015, + "step": 24318 + }, + { + "epoch": 0.8709151789711175, + "grad_norm": 1.4580488204956055, + "learning_rate": 8.61089050161683e-06, + "loss": 1.7065, + "step": 24319 + }, + { + "epoch": 0.8709509911006859, + "grad_norm": 2.421257734298706, + "learning_rate": 8.60618238759875e-06, + "loss": 1.3544, + "step": 24320 + }, + { + "epoch": 0.8709868032302541, + "grad_norm": 1.232548713684082, + "learning_rate": 8.601475503166623e-06, + "loss": 1.237, + "step": 24321 + }, + { + "epoch": 0.8710226153598224, + "grad_norm": 1.703403115272522, + "learning_rate": 8.596769848383723e-06, + "loss": 1.4257, + "step": 24322 + }, + { + "epoch": 0.8710584274893907, + "grad_norm": 1.6034023761749268, + "learning_rate": 8.592065423313378e-06, + "loss": 1.2383, + "step": 24323 + }, + { + "epoch": 0.8710942396189589, + "grad_norm": 1.508851408958435, + "learning_rate": 8.587362228018892e-06, + "loss": 1.5151, + "step": 24324 + }, + { + "epoch": 0.8711300517485272, + "grad_norm": 1.593440055847168, + "learning_rate": 8.582660262563558e-06, + "loss": 1.5984, + "step": 24325 + }, + { + "epoch": 0.8711658638780955, + "grad_norm": 2.054398775100708, + "learning_rate": 8.577959527010582e-06, + "loss": 1.169, + "step": 24326 + }, + { + "epoch": 0.8712016760076638, + "grad_norm": 1.829206943511963, + "learning_rate": 8.573260021423236e-06, + "loss": 1.2653, + "step": 24327 + }, + { + "epoch": 0.8712374881372321, + "grad_norm": 1.868972897529602, + "learning_rate": 8.568561745864766e-06, + "loss": 1.5099, + "step": 24328 + }, + { + "epoch": 0.8712733002668004, + "grad_norm": 1.5950393676757812, + "learning_rate": 8.563864700398338e-06, + "loss": 1.6054, + "step": 24329 + }, + { + "epoch": 0.8713091123963687, + "grad_norm": 1.582476258277893, + "learning_rate": 8.559168885087165e-06, + "loss": 1.603, + "step": 24330 + }, + { + "epoch": 0.8713449245259369, + "grad_norm": 1.48959481716156, + "learning_rate": 8.554474299994431e-06, + "loss": 1.4577, + "step": 24331 + }, + { + "epoch": 0.8713807366555052, + "grad_norm": 1.9258126020431519, + "learning_rate": 8.549780945183306e-06, + "loss": 1.4982, + "step": 24332 + }, + { + "epoch": 0.8714165487850735, + "grad_norm": 2.175291061401367, + "learning_rate": 8.545088820716895e-06, + "loss": 1.5243, + "step": 24333 + }, + { + "epoch": 0.8714523609146418, + "grad_norm": 1.4352703094482422, + "learning_rate": 8.54039792665835e-06, + "loss": 1.2539, + "step": 24334 + }, + { + "epoch": 0.8714881730442101, + "grad_norm": 1.5487613677978516, + "learning_rate": 8.535708263070785e-06, + "loss": 1.5298, + "step": 24335 + }, + { + "epoch": 0.8715239851737784, + "grad_norm": 1.46132230758667, + "learning_rate": 8.531019830017272e-06, + "loss": 1.4797, + "step": 24336 + }, + { + "epoch": 0.8715597973033467, + "grad_norm": 1.3812408447265625, + "learning_rate": 8.526332627560906e-06, + "loss": 1.1609, + "step": 24337 + }, + { + "epoch": 0.8715956094329149, + "grad_norm": 1.4614298343658447, + "learning_rate": 8.521646655764736e-06, + "loss": 1.2881, + "step": 24338 + }, + { + "epoch": 0.8716314215624832, + "grad_norm": 1.8450669050216675, + "learning_rate": 8.516961914691835e-06, + "loss": 1.1969, + "step": 24339 + }, + { + "epoch": 0.8716672336920515, + "grad_norm": 2.6135294437408447, + "learning_rate": 8.512278404405182e-06, + "loss": 1.5435, + "step": 24340 + }, + { + "epoch": 0.8717030458216198, + "grad_norm": 1.3671050071716309, + "learning_rate": 8.507596124967821e-06, + "loss": 1.4997, + "step": 24341 + }, + { + "epoch": 0.8717388579511881, + "grad_norm": 1.8961228132247925, + "learning_rate": 8.50291507644273e-06, + "loss": 1.3494, + "step": 24342 + }, + { + "epoch": 0.8717746700807564, + "grad_norm": 1.6030116081237793, + "learning_rate": 8.498235258892907e-06, + "loss": 1.6106, + "step": 24343 + }, + { + "epoch": 0.8718104822103246, + "grad_norm": 1.4362305402755737, + "learning_rate": 8.493556672381297e-06, + "loss": 1.4159, + "step": 24344 + }, + { + "epoch": 0.8718462943398929, + "grad_norm": 1.694785237312317, + "learning_rate": 8.488879316970832e-06, + "loss": 1.3855, + "step": 24345 + }, + { + "epoch": 0.8718821064694612, + "grad_norm": 1.857426643371582, + "learning_rate": 8.484203192724482e-06, + "loss": 1.4179, + "step": 24346 + }, + { + "epoch": 0.8719179185990295, + "grad_norm": 1.4686394929885864, + "learning_rate": 8.479528299705108e-06, + "loss": 1.217, + "step": 24347 + }, + { + "epoch": 0.8719537307285978, + "grad_norm": 1.5443724393844604, + "learning_rate": 8.474854637975638e-06, + "loss": 1.3838, + "step": 24348 + }, + { + "epoch": 0.8719895428581661, + "grad_norm": 1.604174017906189, + "learning_rate": 8.47018220759893e-06, + "loss": 1.8491, + "step": 24349 + }, + { + "epoch": 0.8720253549877344, + "grad_norm": 1.6408330202102661, + "learning_rate": 8.465511008637872e-06, + "loss": 1.503, + "step": 24350 + }, + { + "epoch": 0.8720611671173026, + "grad_norm": 2.2034032344818115, + "learning_rate": 8.460841041155277e-06, + "loss": 1.7287, + "step": 24351 + }, + { + "epoch": 0.8720969792468709, + "grad_norm": 2.2767324447631836, + "learning_rate": 8.456172305213995e-06, + "loss": 1.2505, + "step": 24352 + }, + { + "epoch": 0.8721327913764392, + "grad_norm": 1.8010529279708862, + "learning_rate": 8.45150480087684e-06, + "loss": 1.4255, + "step": 24353 + }, + { + "epoch": 0.8721686035060074, + "grad_norm": 1.6472227573394775, + "learning_rate": 8.44683852820659e-06, + "loss": 1.5661, + "step": 24354 + }, + { + "epoch": 0.8722044156355758, + "grad_norm": 1.3859483003616333, + "learning_rate": 8.442173487266047e-06, + "loss": 1.047, + "step": 24355 + }, + { + "epoch": 0.8722402277651441, + "grad_norm": 1.3888520002365112, + "learning_rate": 8.437509678117916e-06, + "loss": 1.2829, + "step": 24356 + }, + { + "epoch": 0.8722760398947124, + "grad_norm": 1.3679659366607666, + "learning_rate": 8.432847100825025e-06, + "loss": 1.2947, + "step": 24357 + }, + { + "epoch": 0.8723118520242806, + "grad_norm": 2.372196912765503, + "learning_rate": 8.428185755450047e-06, + "loss": 1.318, + "step": 24358 + }, + { + "epoch": 0.8723476641538489, + "grad_norm": 1.549730896949768, + "learning_rate": 8.423525642055719e-06, + "loss": 1.5575, + "step": 24359 + }, + { + "epoch": 0.8723834762834172, + "grad_norm": 1.8781365156173706, + "learning_rate": 8.418866760704735e-06, + "loss": 1.4678, + "step": 24360 + }, + { + "epoch": 0.8724192884129854, + "grad_norm": 2.044919729232788, + "learning_rate": 8.414209111459747e-06, + "loss": 1.4384, + "step": 24361 + }, + { + "epoch": 0.8724551005425537, + "grad_norm": 1.8532249927520752, + "learning_rate": 8.409552694383472e-06, + "loss": 1.1847, + "step": 24362 + }, + { + "epoch": 0.8724909126721221, + "grad_norm": 1.6493250131607056, + "learning_rate": 8.404897509538468e-06, + "loss": 1.3719, + "step": 24363 + }, + { + "epoch": 0.8725267248016904, + "grad_norm": 1.678415298461914, + "learning_rate": 8.400243556987464e-06, + "loss": 1.0674, + "step": 24364 + }, + { + "epoch": 0.8725625369312586, + "grad_norm": 1.7217615842819214, + "learning_rate": 8.39559083679301e-06, + "loss": 1.647, + "step": 24365 + }, + { + "epoch": 0.8725983490608269, + "grad_norm": 1.3149148225784302, + "learning_rate": 8.390939349017735e-06, + "loss": 1.2332, + "step": 24366 + }, + { + "epoch": 0.8726341611903952, + "grad_norm": 1.6030855178833008, + "learning_rate": 8.386289093724175e-06, + "loss": 1.3858, + "step": 24367 + }, + { + "epoch": 0.8726699733199634, + "grad_norm": 1.845257043838501, + "learning_rate": 8.38164007097495e-06, + "loss": 1.278, + "step": 24368 + }, + { + "epoch": 0.8727057854495317, + "grad_norm": 1.8243887424468994, + "learning_rate": 8.376992280832574e-06, + "loss": 1.4684, + "step": 24369 + }, + { + "epoch": 0.8727415975791001, + "grad_norm": 1.6303426027297974, + "learning_rate": 8.372345723359553e-06, + "loss": 1.6383, + "step": 24370 + }, + { + "epoch": 0.8727774097086684, + "grad_norm": 1.4345486164093018, + "learning_rate": 8.367700398618472e-06, + "loss": 1.2597, + "step": 24371 + }, + { + "epoch": 0.8728132218382366, + "grad_norm": 1.5594035387039185, + "learning_rate": 8.363056306671757e-06, + "loss": 1.1994, + "step": 24372 + }, + { + "epoch": 0.8728490339678049, + "grad_norm": 1.6975489854812622, + "learning_rate": 8.358413447581937e-06, + "loss": 1.1322, + "step": 24373 + }, + { + "epoch": 0.8728848460973732, + "grad_norm": 1.6841044425964355, + "learning_rate": 8.353771821411415e-06, + "loss": 1.8269, + "step": 24374 + }, + { + "epoch": 0.8729206582269414, + "grad_norm": 1.4348398447036743, + "learning_rate": 8.349131428222723e-06, + "loss": 1.4246, + "step": 24375 + }, + { + "epoch": 0.8729564703565097, + "grad_norm": 2.420299768447876, + "learning_rate": 8.344492268078219e-06, + "loss": 1.3549, + "step": 24376 + }, + { + "epoch": 0.8729922824860781, + "grad_norm": 1.3469812870025635, + "learning_rate": 8.339854341040376e-06, + "loss": 1.4121, + "step": 24377 + }, + { + "epoch": 0.8730280946156463, + "grad_norm": 1.9128682613372803, + "learning_rate": 8.335217647171533e-06, + "loss": 1.5137, + "step": 24378 + }, + { + "epoch": 0.8730639067452146, + "grad_norm": 1.6239603757858276, + "learning_rate": 8.330582186534097e-06, + "loss": 1.3636, + "step": 24379 + }, + { + "epoch": 0.8730997188747829, + "grad_norm": 1.9807374477386475, + "learning_rate": 8.32594795919045e-06, + "loss": 1.7255, + "step": 24380 + }, + { + "epoch": 0.8731355310043512, + "grad_norm": 1.6199429035186768, + "learning_rate": 8.321314965202898e-06, + "loss": 1.3875, + "step": 24381 + }, + { + "epoch": 0.8731713431339194, + "grad_norm": 1.5612422227859497, + "learning_rate": 8.316683204633814e-06, + "loss": 1.3494, + "step": 24382 + }, + { + "epoch": 0.8732071552634877, + "grad_norm": 2.1652863025665283, + "learning_rate": 8.312052677545478e-06, + "loss": 1.4789, + "step": 24383 + }, + { + "epoch": 0.8732429673930561, + "grad_norm": 1.5461417436599731, + "learning_rate": 8.307423384000224e-06, + "loss": 1.6285, + "step": 24384 + }, + { + "epoch": 0.8732787795226243, + "grad_norm": 1.2789685726165771, + "learning_rate": 8.302795324060287e-06, + "loss": 1.2883, + "step": 24385 + }, + { + "epoch": 0.8733145916521926, + "grad_norm": 2.073131799697876, + "learning_rate": 8.298168497787984e-06, + "loss": 1.3102, + "step": 24386 + }, + { + "epoch": 0.8733504037817609, + "grad_norm": 1.3319644927978516, + "learning_rate": 8.293542905245543e-06, + "loss": 1.1161, + "step": 24387 + }, + { + "epoch": 0.8733862159113291, + "grad_norm": 1.5022635459899902, + "learning_rate": 8.288918546495172e-06, + "loss": 1.5562, + "step": 24388 + }, + { + "epoch": 0.8734220280408974, + "grad_norm": 2.067774534225464, + "learning_rate": 8.284295421599097e-06, + "loss": 1.5094, + "step": 24389 + }, + { + "epoch": 0.8734578401704657, + "grad_norm": 1.600555658340454, + "learning_rate": 8.279673530619525e-06, + "loss": 1.6114, + "step": 24390 + }, + { + "epoch": 0.8734936523000341, + "grad_norm": 1.613824486732483, + "learning_rate": 8.27505287361866e-06, + "loss": 1.443, + "step": 24391 + }, + { + "epoch": 0.8735294644296023, + "grad_norm": 1.7431727647781372, + "learning_rate": 8.270433450658621e-06, + "loss": 1.5098, + "step": 24392 + }, + { + "epoch": 0.8735652765591706, + "grad_norm": 1.3850111961364746, + "learning_rate": 8.265815261801568e-06, + "loss": 1.5287, + "step": 24393 + }, + { + "epoch": 0.8736010886887389, + "grad_norm": 1.909886360168457, + "learning_rate": 8.261198307109651e-06, + "loss": 1.533, + "step": 24394 + }, + { + "epoch": 0.8736369008183071, + "grad_norm": 1.8211636543273926, + "learning_rate": 8.25658258664499e-06, + "loss": 1.6334, + "step": 24395 + }, + { + "epoch": 0.8736727129478754, + "grad_norm": 1.3177467584609985, + "learning_rate": 8.251968100469653e-06, + "loss": 1.3133, + "step": 24396 + }, + { + "epoch": 0.8737085250774437, + "grad_norm": 1.4866849184036255, + "learning_rate": 8.247354848645738e-06, + "loss": 1.5015, + "step": 24397 + }, + { + "epoch": 0.8737443372070121, + "grad_norm": 1.5447543859481812, + "learning_rate": 8.242742831235339e-06, + "loss": 1.5953, + "step": 24398 + }, + { + "epoch": 0.8737801493365803, + "grad_norm": 1.414366602897644, + "learning_rate": 8.23813204830045e-06, + "loss": 1.1296, + "step": 24399 + }, + { + "epoch": 0.8738159614661486, + "grad_norm": 1.600353479385376, + "learning_rate": 8.233522499903123e-06, + "loss": 1.287, + "step": 24400 + }, + { + "epoch": 0.8738517735957169, + "grad_norm": 1.3291378021240234, + "learning_rate": 8.228914186105397e-06, + "loss": 1.2978, + "step": 24401 + }, + { + "epoch": 0.8738875857252851, + "grad_norm": 1.3929870128631592, + "learning_rate": 8.224307106969264e-06, + "loss": 1.3377, + "step": 24402 + }, + { + "epoch": 0.8739233978548534, + "grad_norm": 1.5553607940673828, + "learning_rate": 8.219701262556678e-06, + "loss": 1.3124, + "step": 24403 + }, + { + "epoch": 0.8739592099844217, + "grad_norm": 1.6648629903793335, + "learning_rate": 8.21509665292962e-06, + "loss": 1.2963, + "step": 24404 + }, + { + "epoch": 0.87399502211399, + "grad_norm": 1.5509344339370728, + "learning_rate": 8.210493278150066e-06, + "loss": 1.3433, + "step": 24405 + }, + { + "epoch": 0.8740308342435583, + "grad_norm": 1.741041660308838, + "learning_rate": 8.205891138279898e-06, + "loss": 1.3943, + "step": 24406 + }, + { + "epoch": 0.8740666463731266, + "grad_norm": 1.877867341041565, + "learning_rate": 8.201290233381075e-06, + "loss": 1.662, + "step": 24407 + }, + { + "epoch": 0.8741024585026949, + "grad_norm": 1.2536922693252563, + "learning_rate": 8.196690563515463e-06, + "loss": 0.8697, + "step": 24408 + }, + { + "epoch": 0.8741382706322631, + "grad_norm": 2.1341769695281982, + "learning_rate": 8.192092128744988e-06, + "loss": 1.1999, + "step": 24409 + }, + { + "epoch": 0.8741740827618314, + "grad_norm": 1.4079196453094482, + "learning_rate": 8.187494929131478e-06, + "loss": 1.447, + "step": 24410 + }, + { + "epoch": 0.8742098948913997, + "grad_norm": 1.608680009841919, + "learning_rate": 8.182898964736785e-06, + "loss": 1.5774, + "step": 24411 + }, + { + "epoch": 0.874245707020968, + "grad_norm": 1.5955612659454346, + "learning_rate": 8.178304235622758e-06, + "loss": 1.735, + "step": 24412 + }, + { + "epoch": 0.8742815191505363, + "grad_norm": 1.7357336282730103, + "learning_rate": 8.173710741851215e-06, + "loss": 1.4914, + "step": 24413 + }, + { + "epoch": 0.8743173312801046, + "grad_norm": 1.8703807592391968, + "learning_rate": 8.169118483483928e-06, + "loss": 1.552, + "step": 24414 + }, + { + "epoch": 0.8743531434096729, + "grad_norm": 1.617150902748108, + "learning_rate": 8.164527460582705e-06, + "loss": 1.4209, + "step": 24415 + }, + { + "epoch": 0.8743889555392411, + "grad_norm": 2.064117670059204, + "learning_rate": 8.159937673209327e-06, + "loss": 1.6424, + "step": 24416 + }, + { + "epoch": 0.8744247676688094, + "grad_norm": 1.8013323545455933, + "learning_rate": 8.155349121425504e-06, + "loss": 1.4639, + "step": 24417 + }, + { + "epoch": 0.8744605797983777, + "grad_norm": 1.68295419216156, + "learning_rate": 8.150761805292983e-06, + "loss": 1.3943, + "step": 24418 + }, + { + "epoch": 0.874496391927946, + "grad_norm": 1.8984606266021729, + "learning_rate": 8.146175724873485e-06, + "loss": 1.3342, + "step": 24419 + }, + { + "epoch": 0.8745322040575143, + "grad_norm": 1.3739839792251587, + "learning_rate": 8.141590880228722e-06, + "loss": 1.4143, + "step": 24420 + }, + { + "epoch": 0.8745680161870826, + "grad_norm": 1.4901645183563232, + "learning_rate": 8.137007271420349e-06, + "loss": 1.2584, + "step": 24421 + }, + { + "epoch": 0.8746038283166508, + "grad_norm": 1.6518315076828003, + "learning_rate": 8.132424898510061e-06, + "loss": 1.285, + "step": 24422 + }, + { + "epoch": 0.8746396404462191, + "grad_norm": 1.4844005107879639, + "learning_rate": 8.127843761559506e-06, + "loss": 1.5776, + "step": 24423 + }, + { + "epoch": 0.8746754525757874, + "grad_norm": 2.039109945297241, + "learning_rate": 8.123263860630282e-06, + "loss": 1.5137, + "step": 24424 + }, + { + "epoch": 0.8747112647053557, + "grad_norm": 1.6008793115615845, + "learning_rate": 8.118685195784037e-06, + "loss": 1.2386, + "step": 24425 + }, + { + "epoch": 0.874747076834924, + "grad_norm": 1.5261561870574951, + "learning_rate": 8.114107767082358e-06, + "loss": 1.5291, + "step": 24426 + }, + { + "epoch": 0.8747828889644923, + "grad_norm": 1.8413728475570679, + "learning_rate": 8.109531574586859e-06, + "loss": 1.0062, + "step": 24427 + }, + { + "epoch": 0.8748187010940606, + "grad_norm": 2.2254388332366943, + "learning_rate": 8.10495661835906e-06, + "loss": 1.4563, + "step": 24428 + }, + { + "epoch": 0.8748545132236288, + "grad_norm": 2.0652709007263184, + "learning_rate": 8.100382898460546e-06, + "loss": 1.7514, + "step": 24429 + }, + { + "epoch": 0.8748903253531971, + "grad_norm": 1.7497395277023315, + "learning_rate": 8.095810414952832e-06, + "loss": 1.2252, + "step": 24430 + }, + { + "epoch": 0.8749261374827654, + "grad_norm": 1.9762898683547974, + "learning_rate": 8.091239167897446e-06, + "loss": 1.4887, + "step": 24431 + }, + { + "epoch": 0.8749619496123336, + "grad_norm": 1.4887042045593262, + "learning_rate": 8.086669157355876e-06, + "loss": 1.6301, + "step": 24432 + }, + { + "epoch": 0.874997761741902, + "grad_norm": 1.460737705230713, + "learning_rate": 8.082100383389613e-06, + "loss": 1.2551, + "step": 24433 + }, + { + "epoch": 0.8750335738714703, + "grad_norm": 1.5650736093521118, + "learning_rate": 8.077532846060143e-06, + "loss": 1.298, + "step": 24434 + }, + { + "epoch": 0.8750693860010386, + "grad_norm": 1.7930569648742676, + "learning_rate": 8.072966545428873e-06, + "loss": 1.467, + "step": 24435 + }, + { + "epoch": 0.8751051981306068, + "grad_norm": 2.4084696769714355, + "learning_rate": 8.068401481557263e-06, + "loss": 1.3533, + "step": 24436 + }, + { + "epoch": 0.8751410102601751, + "grad_norm": 1.8376762866973877, + "learning_rate": 8.063837654506734e-06, + "loss": 1.3193, + "step": 24437 + }, + { + "epoch": 0.8751768223897434, + "grad_norm": 1.538162350654602, + "learning_rate": 8.059275064338689e-06, + "loss": 1.3677, + "step": 24438 + }, + { + "epoch": 0.8752126345193116, + "grad_norm": 1.8487403392791748, + "learning_rate": 8.054713711114491e-06, + "loss": 1.4857, + "step": 24439 + }, + { + "epoch": 0.87524844664888, + "grad_norm": 2.232276678085327, + "learning_rate": 8.050153594895526e-06, + "loss": 1.6025, + "step": 24440 + }, + { + "epoch": 0.8752842587784483, + "grad_norm": 1.5102007389068604, + "learning_rate": 8.045594715743144e-06, + "loss": 0.9663, + "step": 24441 + }, + { + "epoch": 0.8753200709080166, + "grad_norm": 1.7949094772338867, + "learning_rate": 8.04103707371866e-06, + "loss": 1.3811, + "step": 24442 + }, + { + "epoch": 0.8753558830375848, + "grad_norm": 1.8622803688049316, + "learning_rate": 8.036480668883394e-06, + "loss": 1.3921, + "step": 24443 + }, + { + "epoch": 0.8753916951671531, + "grad_norm": 1.5665391683578491, + "learning_rate": 8.031925501298666e-06, + "loss": 1.338, + "step": 24444 + }, + { + "epoch": 0.8754275072967214, + "grad_norm": 1.5506495237350464, + "learning_rate": 8.027371571025765e-06, + "loss": 1.5878, + "step": 24445 + }, + { + "epoch": 0.8754633194262896, + "grad_norm": 1.6303693056106567, + "learning_rate": 8.022818878125926e-06, + "loss": 1.5915, + "step": 24446 + }, + { + "epoch": 0.875499131555858, + "grad_norm": 1.6259897947311401, + "learning_rate": 8.018267422660419e-06, + "loss": 1.731, + "step": 24447 + }, + { + "epoch": 0.8755349436854263, + "grad_norm": 1.2927833795547485, + "learning_rate": 8.013717204690474e-06, + "loss": 1.2528, + "step": 24448 + }, + { + "epoch": 0.8755707558149946, + "grad_norm": 1.6778570413589478, + "learning_rate": 8.00916822427733e-06, + "loss": 1.3148, + "step": 24449 + }, + { + "epoch": 0.8756065679445628, + "grad_norm": 1.671055555343628, + "learning_rate": 8.004620481482161e-06, + "loss": 1.2224, + "step": 24450 + }, + { + "epoch": 0.8756423800741311, + "grad_norm": 2.1273000240325928, + "learning_rate": 8.00007397636613e-06, + "loss": 1.5564, + "step": 24451 + }, + { + "epoch": 0.8756781922036994, + "grad_norm": 1.361875295639038, + "learning_rate": 7.995528708990463e-06, + "loss": 1.5403, + "step": 24452 + }, + { + "epoch": 0.8757140043332676, + "grad_norm": 1.7198835611343384, + "learning_rate": 7.990984679416269e-06, + "loss": 1.28, + "step": 24453 + }, + { + "epoch": 0.875749816462836, + "grad_norm": 1.4493197202682495, + "learning_rate": 7.986441887704687e-06, + "loss": 1.2897, + "step": 24454 + }, + { + "epoch": 0.8757856285924043, + "grad_norm": 1.5954997539520264, + "learning_rate": 7.981900333916848e-06, + "loss": 1.2537, + "step": 24455 + }, + { + "epoch": 0.8758214407219725, + "grad_norm": 1.679919958114624, + "learning_rate": 7.977360018113855e-06, + "loss": 1.438, + "step": 24456 + }, + { + "epoch": 0.8758572528515408, + "grad_norm": 1.6092506647109985, + "learning_rate": 7.972820940356785e-06, + "loss": 1.3889, + "step": 24457 + }, + { + "epoch": 0.8758930649811091, + "grad_norm": 2.2504239082336426, + "learning_rate": 7.968283100706664e-06, + "loss": 1.4879, + "step": 24458 + }, + { + "epoch": 0.8759288771106774, + "grad_norm": 2.6346192359924316, + "learning_rate": 7.963746499224611e-06, + "loss": 1.4057, + "step": 24459 + }, + { + "epoch": 0.8759646892402456, + "grad_norm": 2.1243059635162354, + "learning_rate": 7.959211135971622e-06, + "loss": 1.4884, + "step": 24460 + }, + { + "epoch": 0.876000501369814, + "grad_norm": 1.6332237720489502, + "learning_rate": 7.954677011008749e-06, + "loss": 1.1818, + "step": 24461 + }, + { + "epoch": 0.8760363134993823, + "grad_norm": 1.5698171854019165, + "learning_rate": 7.95014412439692e-06, + "loss": 1.4265, + "step": 24462 + }, + { + "epoch": 0.8760721256289505, + "grad_norm": 1.746675729751587, + "learning_rate": 7.945612476197207e-06, + "loss": 1.6762, + "step": 24463 + }, + { + "epoch": 0.8761079377585188, + "grad_norm": 1.5642447471618652, + "learning_rate": 7.941082066470507e-06, + "loss": 1.2722, + "step": 24464 + }, + { + "epoch": 0.8761437498880871, + "grad_norm": 2.2350451946258545, + "learning_rate": 7.936552895277826e-06, + "loss": 1.7897, + "step": 24465 + }, + { + "epoch": 0.8761795620176553, + "grad_norm": 2.5448694229125977, + "learning_rate": 7.932024962680062e-06, + "loss": 1.3325, + "step": 24466 + }, + { + "epoch": 0.8762153741472236, + "grad_norm": 1.8758574724197388, + "learning_rate": 7.927498268738132e-06, + "loss": 1.0997, + "step": 24467 + }, + { + "epoch": 0.876251186276792, + "grad_norm": 2.587082624435425, + "learning_rate": 7.922972813512974e-06, + "loss": 1.5028, + "step": 24468 + }, + { + "epoch": 0.8762869984063603, + "grad_norm": 1.386791467666626, + "learning_rate": 7.918448597065408e-06, + "loss": 1.5839, + "step": 24469 + }, + { + "epoch": 0.8763228105359285, + "grad_norm": 1.7087793350219727, + "learning_rate": 7.913925619456374e-06, + "loss": 1.6286, + "step": 24470 + }, + { + "epoch": 0.8763586226654968, + "grad_norm": 1.446853756904602, + "learning_rate": 7.909403880746669e-06, + "loss": 1.3785, + "step": 24471 + }, + { + "epoch": 0.8763944347950651, + "grad_norm": 2.251840829849243, + "learning_rate": 7.904883380997164e-06, + "loss": 1.453, + "step": 24472 + }, + { + "epoch": 0.8764302469246333, + "grad_norm": 1.5899436473846436, + "learning_rate": 7.900364120268622e-06, + "loss": 1.61, + "step": 24473 + }, + { + "epoch": 0.8764660590542016, + "grad_norm": 1.766593098640442, + "learning_rate": 7.895846098621917e-06, + "loss": 1.1907, + "step": 24474 + }, + { + "epoch": 0.87650187118377, + "grad_norm": 1.9567400217056274, + "learning_rate": 7.891329316117801e-06, + "loss": 1.363, + "step": 24475 + }, + { + "epoch": 0.8765376833133383, + "grad_norm": 1.5550438165664673, + "learning_rate": 7.886813772817026e-06, + "loss": 1.1627, + "step": 24476 + }, + { + "epoch": 0.8765734954429065, + "grad_norm": 1.8496496677398682, + "learning_rate": 7.88229946878034e-06, + "loss": 1.1868, + "step": 24477 + }, + { + "epoch": 0.8766093075724748, + "grad_norm": 1.919206142425537, + "learning_rate": 7.877786404068498e-06, + "loss": 1.7735, + "step": 24478 + }, + { + "epoch": 0.8766451197020431, + "grad_norm": 1.3942233324050903, + "learning_rate": 7.873274578742229e-06, + "loss": 1.4318, + "step": 24479 + }, + { + "epoch": 0.8766809318316113, + "grad_norm": 1.6327040195465088, + "learning_rate": 7.868763992862182e-06, + "loss": 1.1995, + "step": 24480 + }, + { + "epoch": 0.8767167439611796, + "grad_norm": 1.5736989974975586, + "learning_rate": 7.864254646489099e-06, + "loss": 1.1477, + "step": 24481 + }, + { + "epoch": 0.876752556090748, + "grad_norm": 1.382622480392456, + "learning_rate": 7.859746539683621e-06, + "loss": 1.565, + "step": 24482 + }, + { + "epoch": 0.8767883682203162, + "grad_norm": 1.465562105178833, + "learning_rate": 7.855239672506408e-06, + "loss": 1.4932, + "step": 24483 + }, + { + "epoch": 0.8768241803498845, + "grad_norm": 1.737520456314087, + "learning_rate": 7.85073404501807e-06, + "loss": 1.3897, + "step": 24484 + }, + { + "epoch": 0.8768599924794528, + "grad_norm": 1.8517922163009644, + "learning_rate": 7.846229657279246e-06, + "loss": 1.6007, + "step": 24485 + }, + { + "epoch": 0.8768958046090211, + "grad_norm": 1.5409523248672485, + "learning_rate": 7.841726509350545e-06, + "loss": 1.3853, + "step": 24486 + }, + { + "epoch": 0.8769316167385893, + "grad_norm": 1.7995675802230835, + "learning_rate": 7.837224601292525e-06, + "loss": 1.4295, + "step": 24487 + }, + { + "epoch": 0.8769674288681576, + "grad_norm": 2.4543213844299316, + "learning_rate": 7.832723933165764e-06, + "loss": 1.4674, + "step": 24488 + }, + { + "epoch": 0.877003240997726, + "grad_norm": 1.7001053094863892, + "learning_rate": 7.828224505030823e-06, + "loss": 1.3251, + "step": 24489 + }, + { + "epoch": 0.8770390531272942, + "grad_norm": 1.9298815727233887, + "learning_rate": 7.823726316948232e-06, + "loss": 1.528, + "step": 24490 + }, + { + "epoch": 0.8770748652568625, + "grad_norm": 1.8918613195419312, + "learning_rate": 7.819229368978498e-06, + "loss": 1.6044, + "step": 24491 + }, + { + "epoch": 0.8771106773864308, + "grad_norm": 1.5881575345993042, + "learning_rate": 7.814733661182116e-06, + "loss": 1.6521, + "step": 24492 + }, + { + "epoch": 0.877146489515999, + "grad_norm": 1.3574950695037842, + "learning_rate": 7.810239193619618e-06, + "loss": 1.2039, + "step": 24493 + }, + { + "epoch": 0.8771823016455673, + "grad_norm": 1.7643013000488281, + "learning_rate": 7.805745966351407e-06, + "loss": 0.9625, + "step": 24494 + }, + { + "epoch": 0.8772181137751356, + "grad_norm": 1.515485167503357, + "learning_rate": 7.801253979437962e-06, + "loss": 1.5384, + "step": 24495 + }, + { + "epoch": 0.877253925904704, + "grad_norm": 1.3017157316207886, + "learning_rate": 7.796763232939719e-06, + "loss": 1.2687, + "step": 24496 + }, + { + "epoch": 0.8772897380342722, + "grad_norm": 1.8666355609893799, + "learning_rate": 7.79227372691711e-06, + "loss": 1.4063, + "step": 24497 + }, + { + "epoch": 0.8773255501638405, + "grad_norm": 1.7395696640014648, + "learning_rate": 7.787785461430498e-06, + "loss": 1.4451, + "step": 24498 + }, + { + "epoch": 0.8773613622934088, + "grad_norm": 1.4882618188858032, + "learning_rate": 7.783298436540288e-06, + "loss": 1.4995, + "step": 24499 + }, + { + "epoch": 0.877397174422977, + "grad_norm": 1.7713781595230103, + "learning_rate": 7.778812652306844e-06, + "loss": 1.5565, + "step": 24500 + }, + { + "epoch": 0.8774329865525453, + "grad_norm": 1.427703619003296, + "learning_rate": 7.774328108790541e-06, + "loss": 1.297, + "step": 24501 + }, + { + "epoch": 0.8774687986821136, + "grad_norm": 2.4432625770568848, + "learning_rate": 7.769844806051674e-06, + "loss": 1.7429, + "step": 24502 + }, + { + "epoch": 0.877504610811682, + "grad_norm": 1.2416493892669678, + "learning_rate": 7.765362744150573e-06, + "loss": 1.3341, + "step": 24503 + }, + { + "epoch": 0.8775404229412502, + "grad_norm": 1.4567826986312866, + "learning_rate": 7.760881923147567e-06, + "loss": 1.5861, + "step": 24504 + }, + { + "epoch": 0.8775762350708185, + "grad_norm": 1.5808217525482178, + "learning_rate": 7.756402343102897e-06, + "loss": 1.5537, + "step": 24505 + }, + { + "epoch": 0.8776120472003868, + "grad_norm": 1.6409419775009155, + "learning_rate": 7.751924004076837e-06, + "loss": 1.1795, + "step": 24506 + }, + { + "epoch": 0.877647859329955, + "grad_norm": 1.580722689628601, + "learning_rate": 7.747446906129662e-06, + "loss": 1.4825, + "step": 24507 + }, + { + "epoch": 0.8776836714595233, + "grad_norm": 1.7144967317581177, + "learning_rate": 7.742971049321601e-06, + "loss": 1.392, + "step": 24508 + }, + { + "epoch": 0.8777194835890916, + "grad_norm": 1.1489810943603516, + "learning_rate": 7.738496433712839e-06, + "loss": 1.3657, + "step": 24509 + }, + { + "epoch": 0.87775529571866, + "grad_norm": 1.4573662281036377, + "learning_rate": 7.734023059363605e-06, + "loss": 1.3468, + "step": 24510 + }, + { + "epoch": 0.8777911078482282, + "grad_norm": 1.420721173286438, + "learning_rate": 7.729550926334094e-06, + "loss": 1.3038, + "step": 24511 + }, + { + "epoch": 0.8778269199777965, + "grad_norm": 1.3883370161056519, + "learning_rate": 7.72508003468444e-06, + "loss": 1.7398, + "step": 24512 + }, + { + "epoch": 0.8778627321073648, + "grad_norm": 1.651623010635376, + "learning_rate": 7.720610384474802e-06, + "loss": 1.4708, + "step": 24513 + }, + { + "epoch": 0.877898544236933, + "grad_norm": 1.322274088859558, + "learning_rate": 7.716141975765322e-06, + "loss": 1.4895, + "step": 24514 + }, + { + "epoch": 0.8779343563665013, + "grad_norm": 1.9884264469146729, + "learning_rate": 7.711674808616132e-06, + "loss": 1.4892, + "step": 24515 + }, + { + "epoch": 0.8779701684960696, + "grad_norm": 1.6073400974273682, + "learning_rate": 7.70720888308729e-06, + "loss": 1.5275, + "step": 24516 + }, + { + "epoch": 0.878005980625638, + "grad_norm": 2.0732638835906982, + "learning_rate": 7.70274419923892e-06, + "loss": 1.4096, + "step": 24517 + }, + { + "epoch": 0.8780417927552062, + "grad_norm": 1.6035428047180176, + "learning_rate": 7.69828075713106e-06, + "loss": 1.4707, + "step": 24518 + }, + { + "epoch": 0.8780776048847745, + "grad_norm": 1.8507529497146606, + "learning_rate": 7.693818556823784e-06, + "loss": 1.6353, + "step": 24519 + }, + { + "epoch": 0.8781134170143428, + "grad_norm": 1.507523536682129, + "learning_rate": 7.6893575983771e-06, + "loss": 1.3616, + "step": 24520 + }, + { + "epoch": 0.878149229143911, + "grad_norm": 1.8455419540405273, + "learning_rate": 7.68489788185105e-06, + "loss": 1.6077, + "step": 24521 + }, + { + "epoch": 0.8781850412734793, + "grad_norm": 1.4134904146194458, + "learning_rate": 7.680439407305629e-06, + "loss": 1.493, + "step": 24522 + }, + { + "epoch": 0.8782208534030476, + "grad_norm": 1.6321110725402832, + "learning_rate": 7.675982174800788e-06, + "loss": 1.4207, + "step": 24523 + }, + { + "epoch": 0.8782566655326159, + "grad_norm": 1.6989505290985107, + "learning_rate": 7.671526184396527e-06, + "loss": 1.5392, + "step": 24524 + }, + { + "epoch": 0.8782924776621842, + "grad_norm": 1.6748425960540771, + "learning_rate": 7.667071436152784e-06, + "loss": 1.3084, + "step": 24525 + }, + { + "epoch": 0.8783282897917525, + "grad_norm": 1.3120161294937134, + "learning_rate": 7.662617930129502e-06, + "loss": 1.4486, + "step": 24526 + }, + { + "epoch": 0.8783641019213208, + "grad_norm": 1.5598652362823486, + "learning_rate": 7.658165666386585e-06, + "loss": 1.503, + "step": 24527 + }, + { + "epoch": 0.878399914050889, + "grad_norm": 1.6046355962753296, + "learning_rate": 7.653714644983923e-06, + "loss": 1.3375, + "step": 24528 + }, + { + "epoch": 0.8784357261804573, + "grad_norm": 1.784735918045044, + "learning_rate": 7.649264865981443e-06, + "loss": 1.6171, + "step": 24529 + }, + { + "epoch": 0.8784715383100256, + "grad_norm": 1.5161055326461792, + "learning_rate": 7.644816329438952e-06, + "loss": 1.4565, + "step": 24530 + }, + { + "epoch": 0.8785073504395939, + "grad_norm": 1.4199373722076416, + "learning_rate": 7.640369035416339e-06, + "loss": 1.6551, + "step": 24531 + }, + { + "epoch": 0.8785431625691622, + "grad_norm": 1.6012083292007446, + "learning_rate": 7.63592298397342e-06, + "loss": 1.2885, + "step": 24532 + }, + { + "epoch": 0.8785789746987305, + "grad_norm": 1.571999430656433, + "learning_rate": 7.631478175170026e-06, + "loss": 1.4313, + "step": 24533 + }, + { + "epoch": 0.8786147868282987, + "grad_norm": 1.8361530303955078, + "learning_rate": 7.627034609065942e-06, + "loss": 1.4186, + "step": 24534 + }, + { + "epoch": 0.878650598957867, + "grad_norm": 1.6053346395492554, + "learning_rate": 7.622592285720942e-06, + "loss": 1.3284, + "step": 24535 + }, + { + "epoch": 0.8786864110874353, + "grad_norm": 2.1718499660491943, + "learning_rate": 7.618151205194813e-06, + "loss": 1.2265, + "step": 24536 + }, + { + "epoch": 0.8787222232170036, + "grad_norm": 1.7951548099517822, + "learning_rate": 7.613711367547316e-06, + "loss": 1.5307, + "step": 24537 + }, + { + "epoch": 0.8787580353465719, + "grad_norm": 2.2277722358703613, + "learning_rate": 7.609272772838138e-06, + "loss": 1.8082, + "step": 24538 + }, + { + "epoch": 0.8787938474761402, + "grad_norm": 1.3839033842086792, + "learning_rate": 7.604835421127021e-06, + "loss": 1.268, + "step": 24539 + }, + { + "epoch": 0.8788296596057085, + "grad_norm": 1.9207768440246582, + "learning_rate": 7.600399312473683e-06, + "loss": 1.7225, + "step": 24540 + }, + { + "epoch": 0.8788654717352767, + "grad_norm": 1.5807201862335205, + "learning_rate": 7.595964446937764e-06, + "loss": 1.6244, + "step": 24541 + }, + { + "epoch": 0.878901283864845, + "grad_norm": 1.7699261903762817, + "learning_rate": 7.591530824578952e-06, + "loss": 1.6804, + "step": 24542 + }, + { + "epoch": 0.8789370959944133, + "grad_norm": 1.3223094940185547, + "learning_rate": 7.587098445456897e-06, + "loss": 1.3716, + "step": 24543 + }, + { + "epoch": 0.8789729081239815, + "grad_norm": 1.6015185117721558, + "learning_rate": 7.582667309631242e-06, + "loss": 1.3739, + "step": 24544 + }, + { + "epoch": 0.8790087202535499, + "grad_norm": 1.5923868417739868, + "learning_rate": 7.578237417161571e-06, + "loss": 1.4783, + "step": 24545 + }, + { + "epoch": 0.8790445323831182, + "grad_norm": 2.426682710647583, + "learning_rate": 7.573808768107504e-06, + "loss": 1.4759, + "step": 24546 + }, + { + "epoch": 0.8790803445126865, + "grad_norm": 1.7707291841506958, + "learning_rate": 7.569381362528638e-06, + "loss": 1.2952, + "step": 24547 + }, + { + "epoch": 0.8791161566422547, + "grad_norm": 1.452986240386963, + "learning_rate": 7.5649552004844915e-06, + "loss": 1.4157, + "step": 24548 + }, + { + "epoch": 0.879151968771823, + "grad_norm": 1.917758822441101, + "learning_rate": 7.560530282034662e-06, + "loss": 1.3931, + "step": 24549 + }, + { + "epoch": 0.8791877809013913, + "grad_norm": 1.3515937328338623, + "learning_rate": 7.556106607238633e-06, + "loss": 1.4283, + "step": 24550 + }, + { + "epoch": 0.8792235930309595, + "grad_norm": 1.6925004720687866, + "learning_rate": 7.551684176155971e-06, + "loss": 1.3824, + "step": 24551 + }, + { + "epoch": 0.8792594051605279, + "grad_norm": 1.9788447618484497, + "learning_rate": 7.547262988846126e-06, + "loss": 1.2657, + "step": 24552 + }, + { + "epoch": 0.8792952172900962, + "grad_norm": 1.8088815212249756, + "learning_rate": 7.542843045368609e-06, + "loss": 1.6715, + "step": 24553 + }, + { + "epoch": 0.8793310294196645, + "grad_norm": 1.4498738050460815, + "learning_rate": 7.538424345782902e-06, + "loss": 1.3569, + "step": 24554 + }, + { + "epoch": 0.8793668415492327, + "grad_norm": 1.5786463022232056, + "learning_rate": 7.534006890148404e-06, + "loss": 1.2862, + "step": 24555 + }, + { + "epoch": 0.879402653678801, + "grad_norm": 1.7088158130645752, + "learning_rate": 7.52959067852459e-06, + "loss": 1.4563, + "step": 24556 + }, + { + "epoch": 0.8794384658083693, + "grad_norm": 1.9697120189666748, + "learning_rate": 7.525175710970811e-06, + "loss": 1.4709, + "step": 24557 + }, + { + "epoch": 0.8794742779379375, + "grad_norm": 1.5277745723724365, + "learning_rate": 7.520761987546554e-06, + "loss": 1.2802, + "step": 24558 + }, + { + "epoch": 0.8795100900675059, + "grad_norm": 2.2481420040130615, + "learning_rate": 7.516349508311138e-06, + "loss": 1.3251, + "step": 24559 + }, + { + "epoch": 0.8795459021970742, + "grad_norm": 1.6836209297180176, + "learning_rate": 7.51193827332396e-06, + "loss": 1.5707, + "step": 24560 + }, + { + "epoch": 0.8795817143266424, + "grad_norm": 1.6443015336990356, + "learning_rate": 7.507528282644316e-06, + "loss": 1.592, + "step": 24561 + }, + { + "epoch": 0.8796175264562107, + "grad_norm": 1.5825704336166382, + "learning_rate": 7.503119536331604e-06, + "loss": 1.4901, + "step": 24562 + }, + { + "epoch": 0.879653338585779, + "grad_norm": 1.8732868432998657, + "learning_rate": 7.49871203444511e-06, + "loss": 1.2958, + "step": 24563 + }, + { + "epoch": 0.8796891507153473, + "grad_norm": 1.4926637411117554, + "learning_rate": 7.494305777044086e-06, + "loss": 1.3454, + "step": 24564 + }, + { + "epoch": 0.8797249628449155, + "grad_norm": 1.602117896080017, + "learning_rate": 7.489900764187896e-06, + "loss": 1.6786, + "step": 24565 + }, + { + "epoch": 0.8797607749744839, + "grad_norm": 2.2812118530273438, + "learning_rate": 7.485496995935748e-06, + "loss": 1.5049, + "step": 24566 + }, + { + "epoch": 0.8797965871040522, + "grad_norm": 1.250343918800354, + "learning_rate": 7.481094472346905e-06, + "loss": 1.3796, + "step": 24567 + }, + { + "epoch": 0.8798323992336204, + "grad_norm": 1.6296786069869995, + "learning_rate": 7.476693193480577e-06, + "loss": 1.5116, + "step": 24568 + }, + { + "epoch": 0.8798682113631887, + "grad_norm": 1.5637654066085815, + "learning_rate": 7.472293159396027e-06, + "loss": 1.5897, + "step": 24569 + }, + { + "epoch": 0.879904023492757, + "grad_norm": 1.8617569208145142, + "learning_rate": 7.4678943701523954e-06, + "loss": 1.4735, + "step": 24570 + }, + { + "epoch": 0.8799398356223253, + "grad_norm": 2.2335526943206787, + "learning_rate": 7.4634968258089135e-06, + "loss": 1.5358, + "step": 24571 + }, + { + "epoch": 0.8799756477518935, + "grad_norm": 1.9554426670074463, + "learning_rate": 7.4591005264246895e-06, + "loss": 1.4661, + "step": 24572 + }, + { + "epoch": 0.8800114598814619, + "grad_norm": 2.1411080360412598, + "learning_rate": 7.454705472058909e-06, + "loss": 1.3836, + "step": 24573 + }, + { + "epoch": 0.8800472720110302, + "grad_norm": 2.0509655475616455, + "learning_rate": 7.450311662770704e-06, + "loss": 1.215, + "step": 24574 + }, + { + "epoch": 0.8800830841405984, + "grad_norm": 1.5214743614196777, + "learning_rate": 7.445919098619159e-06, + "loss": 1.2725, + "step": 24575 + }, + { + "epoch": 0.8801188962701667, + "grad_norm": 1.6003098487854004, + "learning_rate": 7.441527779663382e-06, + "loss": 1.5988, + "step": 24576 + }, + { + "epoch": 0.880154708399735, + "grad_norm": 1.7950252294540405, + "learning_rate": 7.43713770596246e-06, + "loss": 1.3825, + "step": 24577 + }, + { + "epoch": 0.8801905205293032, + "grad_norm": 1.5984803438186646, + "learning_rate": 7.4327488775754794e-06, + "loss": 1.3248, + "step": 24578 + }, + { + "epoch": 0.8802263326588715, + "grad_norm": 1.6045230627059937, + "learning_rate": 7.428361294561415e-06, + "loss": 1.2454, + "step": 24579 + }, + { + "epoch": 0.8802621447884399, + "grad_norm": 2.036458969116211, + "learning_rate": 7.423974956979374e-06, + "loss": 1.4281, + "step": 24580 + }, + { + "epoch": 0.8802979569180082, + "grad_norm": 1.4843889474868774, + "learning_rate": 7.419589864888332e-06, + "loss": 1.4315, + "step": 24581 + }, + { + "epoch": 0.8803337690475764, + "grad_norm": 1.5990513563156128, + "learning_rate": 7.415206018347287e-06, + "loss": 1.1032, + "step": 24582 + }, + { + "epoch": 0.8803695811771447, + "grad_norm": 1.3269641399383545, + "learning_rate": 7.410823417415203e-06, + "loss": 1.2407, + "step": 24583 + }, + { + "epoch": 0.880405393306713, + "grad_norm": 1.8271068334579468, + "learning_rate": 7.406442062151064e-06, + "loss": 1.3426, + "step": 24584 + }, + { + "epoch": 0.8804412054362812, + "grad_norm": 2.1693825721740723, + "learning_rate": 7.402061952613826e-06, + "loss": 1.126, + "step": 24585 + }, + { + "epoch": 0.8804770175658495, + "grad_norm": 1.887425422668457, + "learning_rate": 7.397683088862395e-06, + "loss": 1.5462, + "step": 24586 + }, + { + "epoch": 0.8805128296954179, + "grad_norm": 1.5298993587493896, + "learning_rate": 7.393305470955681e-06, + "loss": 1.1619, + "step": 24587 + }, + { + "epoch": 0.8805486418249862, + "grad_norm": 2.064223051071167, + "learning_rate": 7.388929098952579e-06, + "loss": 1.2954, + "step": 24588 + }, + { + "epoch": 0.8805844539545544, + "grad_norm": 1.1978247165679932, + "learning_rate": 7.384553972912011e-06, + "loss": 1.537, + "step": 24589 + }, + { + "epoch": 0.8806202660841227, + "grad_norm": 1.5021591186523438, + "learning_rate": 7.380180092892775e-06, + "loss": 1.4642, + "step": 24590 + }, + { + "epoch": 0.880656078213691, + "grad_norm": 2.6436407566070557, + "learning_rate": 7.375807458953743e-06, + "loss": 1.812, + "step": 24591 + }, + { + "epoch": 0.8806918903432592, + "grad_norm": 1.5209324359893799, + "learning_rate": 7.37143607115377e-06, + "loss": 1.3444, + "step": 24592 + }, + { + "epoch": 0.8807277024728275, + "grad_norm": 1.360662579536438, + "learning_rate": 7.36706592955162e-06, + "loss": 1.3151, + "step": 24593 + }, + { + "epoch": 0.8807635146023959, + "grad_norm": 1.9590721130371094, + "learning_rate": 7.362697034206112e-06, + "loss": 1.177, + "step": 24594 + }, + { + "epoch": 0.8807993267319641, + "grad_norm": 1.7141327857971191, + "learning_rate": 7.358329385176033e-06, + "loss": 1.4844, + "step": 24595 + }, + { + "epoch": 0.8808351388615324, + "grad_norm": 1.7014778852462769, + "learning_rate": 7.353962982520135e-06, + "loss": 1.5032, + "step": 24596 + }, + { + "epoch": 0.8808709509911007, + "grad_norm": 1.8111387491226196, + "learning_rate": 7.34959782629715e-06, + "loss": 1.264, + "step": 24597 + }, + { + "epoch": 0.880906763120669, + "grad_norm": 1.8225723505020142, + "learning_rate": 7.345233916565808e-06, + "loss": 1.4492, + "step": 24598 + }, + { + "epoch": 0.8809425752502372, + "grad_norm": 1.3599964380264282, + "learning_rate": 7.340871253384851e-06, + "loss": 1.4305, + "step": 24599 + }, + { + "epoch": 0.8809783873798055, + "grad_norm": 1.8249688148498535, + "learning_rate": 7.336509836812933e-06, + "loss": 1.1971, + "step": 24600 + }, + { + "epoch": 0.8810141995093739, + "grad_norm": 1.7372807264328003, + "learning_rate": 7.3321496669087495e-06, + "loss": 1.4024, + "step": 24601 + }, + { + "epoch": 0.8810500116389421, + "grad_norm": 1.6618596315383911, + "learning_rate": 7.327790743730956e-06, + "loss": 1.3195, + "step": 24602 + }, + { + "epoch": 0.8810858237685104, + "grad_norm": 1.5446751117706299, + "learning_rate": 7.323433067338214e-06, + "loss": 1.426, + "step": 24603 + }, + { + "epoch": 0.8811216358980787, + "grad_norm": 1.6365025043487549, + "learning_rate": 7.319076637789124e-06, + "loss": 1.1681, + "step": 24604 + }, + { + "epoch": 0.881157448027647, + "grad_norm": 1.3218605518341064, + "learning_rate": 7.314721455142304e-06, + "loss": 1.3863, + "step": 24605 + }, + { + "epoch": 0.8811932601572152, + "grad_norm": 1.4129111766815186, + "learning_rate": 7.310367519456352e-06, + "loss": 1.5447, + "step": 24606 + }, + { + "epoch": 0.8812290722867835, + "grad_norm": 2.340679407119751, + "learning_rate": 7.306014830789865e-06, + "loss": 1.4607, + "step": 24607 + }, + { + "epoch": 0.8812648844163519, + "grad_norm": 2.0600128173828125, + "learning_rate": 7.3016633892013634e-06, + "loss": 1.5905, + "step": 24608 + }, + { + "epoch": 0.8813006965459201, + "grad_norm": 1.7693760395050049, + "learning_rate": 7.2973131947494e-06, + "loss": 1.3652, + "step": 24609 + }, + { + "epoch": 0.8813365086754884, + "grad_norm": 1.6240615844726562, + "learning_rate": 7.292964247492539e-06, + "loss": 1.3207, + "step": 24610 + }, + { + "epoch": 0.8813723208050567, + "grad_norm": 1.4567763805389404, + "learning_rate": 7.288616547489235e-06, + "loss": 1.3986, + "step": 24611 + }, + { + "epoch": 0.8814081329346249, + "grad_norm": 1.3678109645843506, + "learning_rate": 7.284270094798018e-06, + "loss": 1.2433, + "step": 24612 + }, + { + "epoch": 0.8814439450641932, + "grad_norm": 1.4017517566680908, + "learning_rate": 7.279924889477341e-06, + "loss": 1.4941, + "step": 24613 + }, + { + "epoch": 0.8814797571937615, + "grad_norm": 2.248542308807373, + "learning_rate": 7.27558093158569e-06, + "loss": 1.5972, + "step": 24614 + }, + { + "epoch": 0.8815155693233299, + "grad_norm": 1.4761277437210083, + "learning_rate": 7.2712382211814865e-06, + "loss": 1.4937, + "step": 24615 + }, + { + "epoch": 0.8815513814528981, + "grad_norm": 1.3250051736831665, + "learning_rate": 7.266896758323149e-06, + "loss": 1.1396, + "step": 24616 + }, + { + "epoch": 0.8815871935824664, + "grad_norm": 2.3315348625183105, + "learning_rate": 7.2625565430691214e-06, + "loss": 1.128, + "step": 24617 + }, + { + "epoch": 0.8816230057120347, + "grad_norm": 1.9104201793670654, + "learning_rate": 7.258217575477755e-06, + "loss": 1.364, + "step": 24618 + }, + { + "epoch": 0.8816588178416029, + "grad_norm": 1.6324379444122314, + "learning_rate": 7.253879855607437e-06, + "loss": 1.4386, + "step": 24619 + }, + { + "epoch": 0.8816946299711712, + "grad_norm": 1.622747540473938, + "learning_rate": 7.249543383516544e-06, + "loss": 1.3758, + "step": 24620 + }, + { + "epoch": 0.8817304421007395, + "grad_norm": 1.622939109802246, + "learning_rate": 7.245208159263417e-06, + "loss": 1.5202, + "step": 24621 + }, + { + "epoch": 0.8817662542303079, + "grad_norm": 1.5521178245544434, + "learning_rate": 7.240874182906343e-06, + "loss": 1.3341, + "step": 24622 + }, + { + "epoch": 0.8818020663598761, + "grad_norm": 1.466652512550354, + "learning_rate": 7.236541454503664e-06, + "loss": 1.5546, + "step": 24623 + }, + { + "epoch": 0.8818378784894444, + "grad_norm": 1.4532678127288818, + "learning_rate": 7.232209974113668e-06, + "loss": 1.3809, + "step": 24624 + }, + { + "epoch": 0.8818736906190127, + "grad_norm": 2.0897300243377686, + "learning_rate": 7.2278797417946405e-06, + "loss": 1.6313, + "step": 24625 + }, + { + "epoch": 0.8819095027485809, + "grad_norm": 1.290476679801941, + "learning_rate": 7.2235507576048024e-06, + "loss": 0.8929, + "step": 24626 + }, + { + "epoch": 0.8819453148781492, + "grad_norm": 1.5610119104385376, + "learning_rate": 7.219223021602417e-06, + "loss": 1.1235, + "step": 24627 + }, + { + "epoch": 0.8819811270077175, + "grad_norm": 1.5145996809005737, + "learning_rate": 7.214896533845716e-06, + "loss": 1.5866, + "step": 24628 + }, + { + "epoch": 0.8820169391372858, + "grad_norm": 1.9009678363800049, + "learning_rate": 7.210571294392898e-06, + "loss": 1.5812, + "step": 24629 + }, + { + "epoch": 0.8820527512668541, + "grad_norm": 1.3867923021316528, + "learning_rate": 7.206247303302138e-06, + "loss": 1.3757, + "step": 24630 + }, + { + "epoch": 0.8820885633964224, + "grad_norm": 2.2626278400421143, + "learning_rate": 7.201924560631634e-06, + "loss": 1.6227, + "step": 24631 + }, + { + "epoch": 0.8821243755259907, + "grad_norm": 1.3764382600784302, + "learning_rate": 7.197603066439551e-06, + "loss": 1.1527, + "step": 24632 + }, + { + "epoch": 0.8821601876555589, + "grad_norm": 1.6069098711013794, + "learning_rate": 7.193282820783987e-06, + "loss": 1.3079, + "step": 24633 + }, + { + "epoch": 0.8821959997851272, + "grad_norm": 1.8032952547073364, + "learning_rate": 7.188963823723105e-06, + "loss": 1.4146, + "step": 24634 + }, + { + "epoch": 0.8822318119146955, + "grad_norm": 1.4079662561416626, + "learning_rate": 7.184646075315005e-06, + "loss": 1.1088, + "step": 24635 + }, + { + "epoch": 0.8822676240442638, + "grad_norm": 1.5412856340408325, + "learning_rate": 7.18032957561775e-06, + "loss": 1.5314, + "step": 24636 + }, + { + "epoch": 0.8823034361738321, + "grad_norm": 1.5019010305404663, + "learning_rate": 7.176014324689428e-06, + "loss": 1.5056, + "step": 24637 + }, + { + "epoch": 0.8823392483034004, + "grad_norm": 1.4662113189697266, + "learning_rate": 7.171700322588115e-06, + "loss": 1.3196, + "step": 24638 + }, + { + "epoch": 0.8823750604329686, + "grad_norm": 1.510379433631897, + "learning_rate": 7.167387569371842e-06, + "loss": 1.2243, + "step": 24639 + }, + { + "epoch": 0.8824108725625369, + "grad_norm": 1.4555983543395996, + "learning_rate": 7.1630760650986065e-06, + "loss": 1.353, + "step": 24640 + }, + { + "epoch": 0.8824466846921052, + "grad_norm": 1.6988203525543213, + "learning_rate": 7.158765809826429e-06, + "loss": 1.5011, + "step": 24641 + }, + { + "epoch": 0.8824824968216735, + "grad_norm": 1.522375464439392, + "learning_rate": 7.154456803613297e-06, + "loss": 1.2669, + "step": 24642 + }, + { + "epoch": 0.8825183089512418, + "grad_norm": 1.9415677785873413, + "learning_rate": 7.150149046517218e-06, + "loss": 1.7027, + "step": 24643 + }, + { + "epoch": 0.8825541210808101, + "grad_norm": 1.302003264427185, + "learning_rate": 7.145842538596104e-06, + "loss": 1.3639, + "step": 24644 + }, + { + "epoch": 0.8825899332103784, + "grad_norm": 1.48600435256958, + "learning_rate": 7.141537279907873e-06, + "loss": 1.3566, + "step": 24645 + }, + { + "epoch": 0.8826257453399466, + "grad_norm": 1.4763420820236206, + "learning_rate": 7.1372332705105125e-06, + "loss": 1.1619, + "step": 24646 + }, + { + "epoch": 0.8826615574695149, + "grad_norm": 1.6381981372833252, + "learning_rate": 7.132930510461889e-06, + "loss": 1.5885, + "step": 24647 + }, + { + "epoch": 0.8826973695990832, + "grad_norm": 1.9131838083267212, + "learning_rate": 7.128628999819886e-06, + "loss": 1.5576, + "step": 24648 + }, + { + "epoch": 0.8827331817286515, + "grad_norm": 2.0828161239624023, + "learning_rate": 7.1243287386423826e-06, + "loss": 1.5515, + "step": 24649 + }, + { + "epoch": 0.8827689938582198, + "grad_norm": 1.6835256814956665, + "learning_rate": 7.120029726987254e-06, + "loss": 1.5039, + "step": 24650 + }, + { + "epoch": 0.8828048059877881, + "grad_norm": 1.6714760065078735, + "learning_rate": 7.11573196491232e-06, + "loss": 1.5961, + "step": 24651 + }, + { + "epoch": 0.8828406181173564, + "grad_norm": 1.5934548377990723, + "learning_rate": 7.111435452475368e-06, + "loss": 1.5434, + "step": 24652 + }, + { + "epoch": 0.8828764302469246, + "grad_norm": 1.663367509841919, + "learning_rate": 7.1071401897342625e-06, + "loss": 1.3505, + "step": 24653 + }, + { + "epoch": 0.8829122423764929, + "grad_norm": 1.8386807441711426, + "learning_rate": 7.1028461767467466e-06, + "loss": 1.7194, + "step": 24654 + }, + { + "epoch": 0.8829480545060612, + "grad_norm": 1.7250741720199585, + "learning_rate": 7.0985534135706296e-06, + "loss": 1.4797, + "step": 24655 + }, + { + "epoch": 0.8829838666356294, + "grad_norm": 1.8594763278961182, + "learning_rate": 7.0942619002635995e-06, + "loss": 1.5841, + "step": 24656 + }, + { + "epoch": 0.8830196787651978, + "grad_norm": 1.341265320777893, + "learning_rate": 7.089971636883475e-06, + "loss": 1.3489, + "step": 24657 + }, + { + "epoch": 0.8830554908947661, + "grad_norm": 1.5680384635925293, + "learning_rate": 7.085682623487921e-06, + "loss": 1.3567, + "step": 24658 + }, + { + "epoch": 0.8830913030243344, + "grad_norm": 1.4889854192733765, + "learning_rate": 7.0813948601346715e-06, + "loss": 1.5167, + "step": 24659 + }, + { + "epoch": 0.8831271151539026, + "grad_norm": 2.0524439811706543, + "learning_rate": 7.077108346881378e-06, + "loss": 1.2639, + "step": 24660 + }, + { + "epoch": 0.8831629272834709, + "grad_norm": 1.2576946020126343, + "learning_rate": 7.07282308378574e-06, + "loss": 1.4163, + "step": 24661 + }, + { + "epoch": 0.8831987394130392, + "grad_norm": 1.7365771532058716, + "learning_rate": 7.068539070905411e-06, + "loss": 1.8081, + "step": 24662 + }, + { + "epoch": 0.8832345515426074, + "grad_norm": 1.9017252922058105, + "learning_rate": 7.064256308297978e-06, + "loss": 1.7369, + "step": 24663 + }, + { + "epoch": 0.8832703636721758, + "grad_norm": 1.9138576984405518, + "learning_rate": 7.05997479602114e-06, + "loss": 1.2112, + "step": 24664 + }, + { + "epoch": 0.8833061758017441, + "grad_norm": 1.3498430252075195, + "learning_rate": 7.0556945341324284e-06, + "loss": 1.4391, + "step": 24665 + }, + { + "epoch": 0.8833419879313124, + "grad_norm": 1.7585177421569824, + "learning_rate": 7.051415522689487e-06, + "loss": 1.632, + "step": 24666 + }, + { + "epoch": 0.8833778000608806, + "grad_norm": 1.7154725790023804, + "learning_rate": 7.047137761749811e-06, + "loss": 1.4024, + "step": 24667 + }, + { + "epoch": 0.8834136121904489, + "grad_norm": 1.4533997774124146, + "learning_rate": 7.042861251371036e-06, + "loss": 1.1525, + "step": 24668 + }, + { + "epoch": 0.8834494243200172, + "grad_norm": 1.5111123323440552, + "learning_rate": 7.038585991610647e-06, + "loss": 1.3345, + "step": 24669 + }, + { + "epoch": 0.8834852364495854, + "grad_norm": 2.691683769226074, + "learning_rate": 7.034311982526165e-06, + "loss": 1.4586, + "step": 24670 + }, + { + "epoch": 0.8835210485791538, + "grad_norm": 2.8711907863616943, + "learning_rate": 7.0300392241751e-06, + "loss": 1.7666, + "step": 24671 + }, + { + "epoch": 0.8835568607087221, + "grad_norm": 1.3507050275802612, + "learning_rate": 7.025767716614928e-06, + "loss": 1.2691, + "step": 24672 + }, + { + "epoch": 0.8835926728382903, + "grad_norm": 1.9385850429534912, + "learning_rate": 7.021497459903137e-06, + "loss": 1.6612, + "step": 24673 + }, + { + "epoch": 0.8836284849678586, + "grad_norm": 2.0036303997039795, + "learning_rate": 7.017228454097136e-06, + "loss": 1.4667, + "step": 24674 + }, + { + "epoch": 0.8836642970974269, + "grad_norm": 1.8919461965560913, + "learning_rate": 7.012960699254423e-06, + "loss": 1.2746, + "step": 24675 + }, + { + "epoch": 0.8837001092269952, + "grad_norm": 1.8504756689071655, + "learning_rate": 7.0086941954323634e-06, + "loss": 1.3695, + "step": 24676 + }, + { + "epoch": 0.8837359213565634, + "grad_norm": 1.6944911479949951, + "learning_rate": 7.004428942688379e-06, + "loss": 1.3329, + "step": 24677 + }, + { + "epoch": 0.8837717334861318, + "grad_norm": 1.8815547227859497, + "learning_rate": 7.000164941079846e-06, + "loss": 1.3605, + "step": 24678 + }, + { + "epoch": 0.8838075456157001, + "grad_norm": 1.4075126647949219, + "learning_rate": 6.995902190664116e-06, + "loss": 1.439, + "step": 24679 + }, + { + "epoch": 0.8838433577452683, + "grad_norm": 1.4690585136413574, + "learning_rate": 6.99164069149858e-06, + "loss": 1.342, + "step": 24680 + }, + { + "epoch": 0.8838791698748366, + "grad_norm": 1.5376348495483398, + "learning_rate": 6.9873804436405345e-06, + "loss": 1.4158, + "step": 24681 + }, + { + "epoch": 0.8839149820044049, + "grad_norm": 1.4911822080612183, + "learning_rate": 6.9831214471473e-06, + "loss": 1.1987, + "step": 24682 + }, + { + "epoch": 0.8839507941339732, + "grad_norm": 1.6924875974655151, + "learning_rate": 6.978863702076188e-06, + "loss": 1.4808, + "step": 24683 + }, + { + "epoch": 0.8839866062635414, + "grad_norm": 1.6052711009979248, + "learning_rate": 6.974607208484496e-06, + "loss": 1.5073, + "step": 24684 + }, + { + "epoch": 0.8840224183931098, + "grad_norm": 1.507482886314392, + "learning_rate": 6.970351966429445e-06, + "loss": 1.3385, + "step": 24685 + }, + { + "epoch": 0.8840582305226781, + "grad_norm": 1.743933081626892, + "learning_rate": 6.966097975968311e-06, + "loss": 1.159, + "step": 24686 + }, + { + "epoch": 0.8840940426522463, + "grad_norm": 1.7662138938903809, + "learning_rate": 6.961845237158337e-06, + "loss": 1.4932, + "step": 24687 + }, + { + "epoch": 0.8841298547818146, + "grad_norm": 1.4913418292999268, + "learning_rate": 6.957593750056712e-06, + "loss": 1.1511, + "step": 24688 + }, + { + "epoch": 0.8841656669113829, + "grad_norm": 1.4417868852615356, + "learning_rate": 6.953343514720656e-06, + "loss": 1.5253, + "step": 24689 + }, + { + "epoch": 0.8842014790409511, + "grad_norm": 1.7345951795578003, + "learning_rate": 6.949094531207334e-06, + "loss": 1.3248, + "step": 24690 + }, + { + "epoch": 0.8842372911705194, + "grad_norm": 1.930124044418335, + "learning_rate": 6.944846799573934e-06, + "loss": 1.337, + "step": 24691 + }, + { + "epoch": 0.8842731033000878, + "grad_norm": 2.2706034183502197, + "learning_rate": 6.940600319877566e-06, + "loss": 1.4853, + "step": 24692 + }, + { + "epoch": 0.8843089154296561, + "grad_norm": 1.682254672050476, + "learning_rate": 6.936355092175384e-06, + "loss": 1.4649, + "step": 24693 + }, + { + "epoch": 0.8843447275592243, + "grad_norm": 1.652011752128601, + "learning_rate": 6.932111116524509e-06, + "loss": 1.3665, + "step": 24694 + }, + { + "epoch": 0.8843805396887926, + "grad_norm": 1.7562085390090942, + "learning_rate": 6.92786839298204e-06, + "loss": 1.4918, + "step": 24695 + }, + { + "epoch": 0.8844163518183609, + "grad_norm": 1.5120139122009277, + "learning_rate": 6.923626921605031e-06, + "loss": 1.2655, + "step": 24696 + }, + { + "epoch": 0.8844521639479291, + "grad_norm": 1.5884032249450684, + "learning_rate": 6.9193867024505695e-06, + "loss": 1.5416, + "step": 24697 + }, + { + "epoch": 0.8844879760774974, + "grad_norm": 1.2141194343566895, + "learning_rate": 6.9151477355757e-06, + "loss": 1.3824, + "step": 24698 + }, + { + "epoch": 0.8845237882070658, + "grad_norm": 1.8356338739395142, + "learning_rate": 6.910910021037431e-06, + "loss": 1.3849, + "step": 24699 + }, + { + "epoch": 0.884559600336634, + "grad_norm": 1.3843994140625, + "learning_rate": 6.906673558892807e-06, + "loss": 1.4973, + "step": 24700 + }, + { + "epoch": 0.8845954124662023, + "grad_norm": 1.4231609106063843, + "learning_rate": 6.902438349198792e-06, + "loss": 1.7378, + "step": 24701 + }, + { + "epoch": 0.8846312245957706, + "grad_norm": 1.7714993953704834, + "learning_rate": 6.898204392012408e-06, + "loss": 1.7244, + "step": 24702 + }, + { + "epoch": 0.8846670367253389, + "grad_norm": 1.7537744045257568, + "learning_rate": 6.893971687390566e-06, + "loss": 1.4904, + "step": 24703 + }, + { + "epoch": 0.8847028488549071, + "grad_norm": 1.8601810932159424, + "learning_rate": 6.889740235390241e-06, + "loss": 1.3624, + "step": 24704 + }, + { + "epoch": 0.8847386609844754, + "grad_norm": 1.8371527194976807, + "learning_rate": 6.885510036068377e-06, + "loss": 1.6462, + "step": 24705 + }, + { + "epoch": 0.8847744731140438, + "grad_norm": 2.1824593544006348, + "learning_rate": 6.881281089481839e-06, + "loss": 1.3354, + "step": 24706 + }, + { + "epoch": 0.884810285243612, + "grad_norm": 1.5834816694259644, + "learning_rate": 6.877053395687561e-06, + "loss": 1.5739, + "step": 24707 + }, + { + "epoch": 0.8848460973731803, + "grad_norm": 1.9149326086044312, + "learning_rate": 6.872826954742406e-06, + "loss": 1.5595, + "step": 24708 + }, + { + "epoch": 0.8848819095027486, + "grad_norm": 2.170591354370117, + "learning_rate": 6.868601766703253e-06, + "loss": 1.2154, + "step": 24709 + }, + { + "epoch": 0.8849177216323169, + "grad_norm": 1.5158920288085938, + "learning_rate": 6.8643778316269226e-06, + "loss": 1.2864, + "step": 24710 + }, + { + "epoch": 0.8849535337618851, + "grad_norm": 1.6316123008728027, + "learning_rate": 6.860155149570246e-06, + "loss": 1.5006, + "step": 24711 + }, + { + "epoch": 0.8849893458914534, + "grad_norm": 1.3980909585952759, + "learning_rate": 6.855933720590047e-06, + "loss": 1.4343, + "step": 24712 + }, + { + "epoch": 0.8850251580210218, + "grad_norm": 1.5064276456832886, + "learning_rate": 6.8517135447431215e-06, + "loss": 1.3443, + "step": 24713 + }, + { + "epoch": 0.88506097015059, + "grad_norm": 1.580866813659668, + "learning_rate": 6.847494622086226e-06, + "loss": 1.3168, + "step": 24714 + }, + { + "epoch": 0.8850967822801583, + "grad_norm": 1.6514077186584473, + "learning_rate": 6.843276952676125e-06, + "loss": 1.3983, + "step": 24715 + }, + { + "epoch": 0.8851325944097266, + "grad_norm": 1.7922331094741821, + "learning_rate": 6.839060536569597e-06, + "loss": 1.4731, + "step": 24716 + }, + { + "epoch": 0.8851684065392948, + "grad_norm": 1.8965702056884766, + "learning_rate": 6.834845373823317e-06, + "loss": 1.2081, + "step": 24717 + }, + { + "epoch": 0.8852042186688631, + "grad_norm": 1.391337275505066, + "learning_rate": 6.830631464494019e-06, + "loss": 1.4714, + "step": 24718 + }, + { + "epoch": 0.8852400307984314, + "grad_norm": 1.5723334550857544, + "learning_rate": 6.826418808638391e-06, + "loss": 1.647, + "step": 24719 + }, + { + "epoch": 0.8852758429279998, + "grad_norm": 1.616264820098877, + "learning_rate": 6.82220740631313e-06, + "loss": 1.3395, + "step": 24720 + }, + { + "epoch": 0.885311655057568, + "grad_norm": 1.7802958488464355, + "learning_rate": 6.8179972575748706e-06, + "loss": 1.3974, + "step": 24721 + }, + { + "epoch": 0.8853474671871363, + "grad_norm": 1.9190346002578735, + "learning_rate": 6.813788362480256e-06, + "loss": 1.4053, + "step": 24722 + }, + { + "epoch": 0.8853832793167046, + "grad_norm": 1.4406262636184692, + "learning_rate": 6.809580721085929e-06, + "loss": 1.5542, + "step": 24723 + }, + { + "epoch": 0.8854190914462728, + "grad_norm": 1.6550371646881104, + "learning_rate": 6.805374333448478e-06, + "loss": 1.1518, + "step": 24724 + }, + { + "epoch": 0.8854549035758411, + "grad_norm": 1.6076737642288208, + "learning_rate": 6.801169199624502e-06, + "loss": 1.3962, + "step": 24725 + }, + { + "epoch": 0.8854907157054094, + "grad_norm": 1.676985263824463, + "learning_rate": 6.796965319670568e-06, + "loss": 1.6268, + "step": 24726 + }, + { + "epoch": 0.8855265278349778, + "grad_norm": 2.0720744132995605, + "learning_rate": 6.792762693643262e-06, + "loss": 1.3802, + "step": 24727 + }, + { + "epoch": 0.885562339964546, + "grad_norm": 1.511457085609436, + "learning_rate": 6.7885613215990965e-06, + "loss": 1.3425, + "step": 24728 + }, + { + "epoch": 0.8855981520941143, + "grad_norm": 2.5622329711914062, + "learning_rate": 6.7843612035945915e-06, + "loss": 1.6844, + "step": 24729 + }, + { + "epoch": 0.8856339642236826, + "grad_norm": 1.8360401391983032, + "learning_rate": 6.78016233968628e-06, + "loss": 1.3572, + "step": 24730 + }, + { + "epoch": 0.8856697763532508, + "grad_norm": 1.9595917463302612, + "learning_rate": 6.775964729930651e-06, + "loss": 1.4014, + "step": 24731 + }, + { + "epoch": 0.8857055884828191, + "grad_norm": 1.515363097190857, + "learning_rate": 6.771768374384168e-06, + "loss": 1.1346, + "step": 24732 + }, + { + "epoch": 0.8857414006123874, + "grad_norm": 1.5928148031234741, + "learning_rate": 6.767573273103245e-06, + "loss": 1.3723, + "step": 24733 + }, + { + "epoch": 0.8857772127419558, + "grad_norm": 1.628875970840454, + "learning_rate": 6.7633794261444005e-06, + "loss": 1.5718, + "step": 24734 + }, + { + "epoch": 0.885813024871524, + "grad_norm": 1.8851513862609863, + "learning_rate": 6.7591868335640016e-06, + "loss": 1.4976, + "step": 24735 + }, + { + "epoch": 0.8858488370010923, + "grad_norm": 1.4460524320602417, + "learning_rate": 6.754995495418482e-06, + "loss": 1.467, + "step": 24736 + }, + { + "epoch": 0.8858846491306606, + "grad_norm": 2.1435530185699463, + "learning_rate": 6.750805411764205e-06, + "loss": 1.3132, + "step": 24737 + }, + { + "epoch": 0.8859204612602288, + "grad_norm": 1.743093490600586, + "learning_rate": 6.746616582657583e-06, + "loss": 1.4094, + "step": 24738 + }, + { + "epoch": 0.8859562733897971, + "grad_norm": 2.133668899536133, + "learning_rate": 6.742429008154927e-06, + "loss": 1.4657, + "step": 24739 + }, + { + "epoch": 0.8859920855193654, + "grad_norm": 1.4859719276428223, + "learning_rate": 6.738242688312602e-06, + "loss": 1.3478, + "step": 24740 + }, + { + "epoch": 0.8860278976489337, + "grad_norm": 2.430969715118408, + "learning_rate": 6.734057623186929e-06, + "loss": 1.3177, + "step": 24741 + }, + { + "epoch": 0.886063709778502, + "grad_norm": 1.748104453086853, + "learning_rate": 6.729873812834198e-06, + "loss": 1.2021, + "step": 24742 + }, + { + "epoch": 0.8860995219080703, + "grad_norm": 2.162109375, + "learning_rate": 6.725691257310718e-06, + "loss": 1.6393, + "step": 24743 + }, + { + "epoch": 0.8861353340376386, + "grad_norm": 1.8088104724884033, + "learning_rate": 6.721509956672711e-06, + "loss": 1.4486, + "step": 24744 + }, + { + "epoch": 0.8861711461672068, + "grad_norm": 1.3711811304092407, + "learning_rate": 6.7173299109765e-06, + "loss": 1.4542, + "step": 24745 + }, + { + "epoch": 0.8862069582967751, + "grad_norm": 2.1322035789489746, + "learning_rate": 6.713151120278283e-06, + "loss": 1.5574, + "step": 24746 + }, + { + "epoch": 0.8862427704263434, + "grad_norm": 2.123325824737549, + "learning_rate": 6.7089735846342815e-06, + "loss": 1.3594, + "step": 24747 + }, + { + "epoch": 0.8862785825559117, + "grad_norm": 1.7514166831970215, + "learning_rate": 6.704797304100707e-06, + "loss": 1.2006, + "step": 24748 + }, + { + "epoch": 0.88631439468548, + "grad_norm": 1.8067326545715332, + "learning_rate": 6.700622278733748e-06, + "loss": 1.5183, + "step": 24749 + }, + { + "epoch": 0.8863502068150483, + "grad_norm": 1.8331403732299805, + "learning_rate": 6.69644850858957e-06, + "loss": 1.411, + "step": 24750 + }, + { + "epoch": 0.8863860189446165, + "grad_norm": 1.604770541191101, + "learning_rate": 6.692275993724295e-06, + "loss": 1.3225, + "step": 24751 + }, + { + "epoch": 0.8864218310741848, + "grad_norm": 2.0157501697540283, + "learning_rate": 6.688104734194123e-06, + "loss": 1.3786, + "step": 24752 + }, + { + "epoch": 0.8864576432037531, + "grad_norm": 1.6937917470932007, + "learning_rate": 6.683934730055119e-06, + "loss": 1.3432, + "step": 24753 + }, + { + "epoch": 0.8864934553333214, + "grad_norm": 1.6693098545074463, + "learning_rate": 6.679765981363417e-06, + "loss": 1.6571, + "step": 24754 + }, + { + "epoch": 0.8865292674628896, + "grad_norm": 1.3133947849273682, + "learning_rate": 6.675598488175061e-06, + "loss": 1.4155, + "step": 24755 + }, + { + "epoch": 0.886565079592458, + "grad_norm": 1.407610297203064, + "learning_rate": 6.671432250546184e-06, + "loss": 1.4084, + "step": 24756 + }, + { + "epoch": 0.8866008917220263, + "grad_norm": 1.6117279529571533, + "learning_rate": 6.6672672685327955e-06, + "loss": 1.511, + "step": 24757 + }, + { + "epoch": 0.8866367038515945, + "grad_norm": 1.8112707138061523, + "learning_rate": 6.663103542190918e-06, + "loss": 1.7054, + "step": 24758 + }, + { + "epoch": 0.8866725159811628, + "grad_norm": 1.5144332647323608, + "learning_rate": 6.658941071576597e-06, + "loss": 1.3524, + "step": 24759 + }, + { + "epoch": 0.8867083281107311, + "grad_norm": 1.8527796268463135, + "learning_rate": 6.654779856745807e-06, + "loss": 1.6036, + "step": 24760 + }, + { + "epoch": 0.8867441402402993, + "grad_norm": 1.8971564769744873, + "learning_rate": 6.650619897754573e-06, + "loss": 1.2975, + "step": 24761 + }, + { + "epoch": 0.8867799523698676, + "grad_norm": 1.2585480213165283, + "learning_rate": 6.646461194658804e-06, + "loss": 1.4554, + "step": 24762 + }, + { + "epoch": 0.886815764499436, + "grad_norm": 1.744014024734497, + "learning_rate": 6.642303747514511e-06, + "loss": 1.4196, + "step": 24763 + }, + { + "epoch": 0.8868515766290043, + "grad_norm": 1.7813466787338257, + "learning_rate": 6.638147556377583e-06, + "loss": 1.1593, + "step": 24764 + }, + { + "epoch": 0.8868873887585725, + "grad_norm": 1.533890962600708, + "learning_rate": 6.633992621303975e-06, + "loss": 1.6984, + "step": 24765 + }, + { + "epoch": 0.8869232008881408, + "grad_norm": 1.2901742458343506, + "learning_rate": 6.629838942349542e-06, + "loss": 1.3292, + "step": 24766 + }, + { + "epoch": 0.8869590130177091, + "grad_norm": 1.23209810256958, + "learning_rate": 6.625686519570184e-06, + "loss": 1.5023, + "step": 24767 + }, + { + "epoch": 0.8869948251472773, + "grad_norm": 1.7562414407730103, + "learning_rate": 6.621535353021791e-06, + "loss": 1.3996, + "step": 24768 + }, + { + "epoch": 0.8870306372768456, + "grad_norm": 1.7048919200897217, + "learning_rate": 6.617385442760171e-06, + "loss": 1.4065, + "step": 24769 + }, + { + "epoch": 0.887066449406414, + "grad_norm": 2.8547306060791016, + "learning_rate": 6.61323678884117e-06, + "loss": 1.6266, + "step": 24770 + }, + { + "epoch": 0.8871022615359823, + "grad_norm": 1.4367671012878418, + "learning_rate": 6.6090893913206106e-06, + "loss": 1.5252, + "step": 24771 + }, + { + "epoch": 0.8871380736655505, + "grad_norm": 1.2508554458618164, + "learning_rate": 6.604943250254303e-06, + "loss": 0.8535, + "step": 24772 + }, + { + "epoch": 0.8871738857951188, + "grad_norm": 1.5373388528823853, + "learning_rate": 6.600798365697991e-06, + "loss": 1.3155, + "step": 24773 + }, + { + "epoch": 0.8872096979246871, + "grad_norm": 1.792191743850708, + "learning_rate": 6.596654737707486e-06, + "loss": 1.4589, + "step": 24774 + }, + { + "epoch": 0.8872455100542553, + "grad_norm": 2.387756109237671, + "learning_rate": 6.592512366338499e-06, + "loss": 1.5162, + "step": 24775 + }, + { + "epoch": 0.8872813221838236, + "grad_norm": 1.9498742818832397, + "learning_rate": 6.588371251646774e-06, + "loss": 1.6594, + "step": 24776 + }, + { + "epoch": 0.887317134313392, + "grad_norm": 1.4522777795791626, + "learning_rate": 6.584231393688012e-06, + "loss": 1.5712, + "step": 24777 + }, + { + "epoch": 0.8873529464429603, + "grad_norm": 1.6063610315322876, + "learning_rate": 6.5800927925179115e-06, + "loss": 1.3801, + "step": 24778 + }, + { + "epoch": 0.8873887585725285, + "grad_norm": 1.822343111038208, + "learning_rate": 6.575955448192184e-06, + "loss": 1.4519, + "step": 24779 + }, + { + "epoch": 0.8874245707020968, + "grad_norm": 2.020034074783325, + "learning_rate": 6.5718193607664516e-06, + "loss": 1.1206, + "step": 24780 + }, + { + "epoch": 0.8874603828316651, + "grad_norm": 2.021859884262085, + "learning_rate": 6.5676845302963805e-06, + "loss": 1.403, + "step": 24781 + }, + { + "epoch": 0.8874961949612333, + "grad_norm": 1.5246386528015137, + "learning_rate": 6.563550956837594e-06, + "loss": 1.1573, + "step": 24782 + }, + { + "epoch": 0.8875320070908016, + "grad_norm": 1.686280369758606, + "learning_rate": 6.559418640445714e-06, + "loss": 1.4514, + "step": 24783 + }, + { + "epoch": 0.88756781922037, + "grad_norm": 1.4774922132492065, + "learning_rate": 6.555287581176317e-06, + "loss": 1.606, + "step": 24784 + }, + { + "epoch": 0.8876036313499382, + "grad_norm": 1.6550049781799316, + "learning_rate": 6.551157779084982e-06, + "loss": 1.2363, + "step": 24785 + }, + { + "epoch": 0.8876394434795065, + "grad_norm": 1.3732515573501587, + "learning_rate": 6.547029234227298e-06, + "loss": 1.4708, + "step": 24786 + }, + { + "epoch": 0.8876752556090748, + "grad_norm": 1.4818493127822876, + "learning_rate": 6.5429019466587745e-06, + "loss": 1.5955, + "step": 24787 + }, + { + "epoch": 0.8877110677386431, + "grad_norm": 1.542306900024414, + "learning_rate": 6.5387759164349585e-06, + "loss": 1.4008, + "step": 24788 + }, + { + "epoch": 0.8877468798682113, + "grad_norm": 1.4538143873214722, + "learning_rate": 6.5346511436113585e-06, + "loss": 1.2013, + "step": 24789 + }, + { + "epoch": 0.8877826919977796, + "grad_norm": 2.1333580017089844, + "learning_rate": 6.5305276282434765e-06, + "loss": 1.4146, + "step": 24790 + }, + { + "epoch": 0.887818504127348, + "grad_norm": 1.75994873046875, + "learning_rate": 6.526405370386757e-06, + "loss": 1.4529, + "step": 24791 + }, + { + "epoch": 0.8878543162569162, + "grad_norm": 1.6889221668243408, + "learning_rate": 6.522284370096687e-06, + "loss": 1.5275, + "step": 24792 + }, + { + "epoch": 0.8878901283864845, + "grad_norm": 1.9283156394958496, + "learning_rate": 6.518164627428724e-06, + "loss": 1.5469, + "step": 24793 + }, + { + "epoch": 0.8879259405160528, + "grad_norm": 1.4188671112060547, + "learning_rate": 6.514046142438246e-06, + "loss": 1.3232, + "step": 24794 + }, + { + "epoch": 0.887961752645621, + "grad_norm": 2.01699161529541, + "learning_rate": 6.509928915180697e-06, + "loss": 1.5934, + "step": 24795 + }, + { + "epoch": 0.8879975647751893, + "grad_norm": 1.3128671646118164, + "learning_rate": 6.505812945711454e-06, + "loss": 1.2986, + "step": 24796 + }, + { + "epoch": 0.8880333769047576, + "grad_norm": 1.3702200651168823, + "learning_rate": 6.501698234085929e-06, + "loss": 1.4046, + "step": 24797 + }, + { + "epoch": 0.888069189034326, + "grad_norm": 1.5460312366485596, + "learning_rate": 6.497584780359423e-06, + "loss": 1.3257, + "step": 24798 + }, + { + "epoch": 0.8881050011638942, + "grad_norm": 1.7780214548110962, + "learning_rate": 6.4934725845873016e-06, + "loss": 1.5938, + "step": 24799 + }, + { + "epoch": 0.8881408132934625, + "grad_norm": 1.7738583087921143, + "learning_rate": 6.489361646824898e-06, + "loss": 1.3187, + "step": 24800 + }, + { + "epoch": 0.8881766254230308, + "grad_norm": 1.6108421087265015, + "learning_rate": 6.485251967127526e-06, + "loss": 1.6039, + "step": 24801 + }, + { + "epoch": 0.888212437552599, + "grad_norm": 1.5561243295669556, + "learning_rate": 6.48114354555045e-06, + "loss": 1.3836, + "step": 24802 + }, + { + "epoch": 0.8882482496821673, + "grad_norm": 1.284730076789856, + "learning_rate": 6.47703638214896e-06, + "loss": 1.3728, + "step": 24803 + }, + { + "epoch": 0.8882840618117356, + "grad_norm": 1.3639391660690308, + "learning_rate": 6.4729304769783225e-06, + "loss": 1.3553, + "step": 24804 + }, + { + "epoch": 0.888319873941304, + "grad_norm": 1.3871612548828125, + "learning_rate": 6.468825830093739e-06, + "loss": 1.626, + "step": 24805 + }, + { + "epoch": 0.8883556860708722, + "grad_norm": 1.5964213609695435, + "learning_rate": 6.4647224415504745e-06, + "loss": 1.4741, + "step": 24806 + }, + { + "epoch": 0.8883914982004405, + "grad_norm": 1.4523588418960571, + "learning_rate": 6.460620311403709e-06, + "loss": 1.5063, + "step": 24807 + }, + { + "epoch": 0.8884273103300088, + "grad_norm": 1.8651169538497925, + "learning_rate": 6.456519439708653e-06, + "loss": 1.5239, + "step": 24808 + }, + { + "epoch": 0.888463122459577, + "grad_norm": 1.986107349395752, + "learning_rate": 6.452419826520451e-06, + "loss": 1.4542, + "step": 24809 + }, + { + "epoch": 0.8884989345891453, + "grad_norm": 1.5596935749053955, + "learning_rate": 6.44832147189427e-06, + "loss": 1.439, + "step": 24810 + }, + { + "epoch": 0.8885347467187136, + "grad_norm": 1.6743347644805908, + "learning_rate": 6.444224375885277e-06, + "loss": 1.5254, + "step": 24811 + }, + { + "epoch": 0.888570558848282, + "grad_norm": 1.4157236814498901, + "learning_rate": 6.44012853854854e-06, + "loss": 1.3591, + "step": 24812 + }, + { + "epoch": 0.8886063709778502, + "grad_norm": 1.396809458732605, + "learning_rate": 6.436033959939192e-06, + "loss": 1.1897, + "step": 24813 + }, + { + "epoch": 0.8886421831074185, + "grad_norm": 1.3337267637252808, + "learning_rate": 6.431940640112322e-06, + "loss": 1.7, + "step": 24814 + }, + { + "epoch": 0.8886779952369868, + "grad_norm": 1.819524884223938, + "learning_rate": 6.4278485791230195e-06, + "loss": 1.5505, + "step": 24815 + }, + { + "epoch": 0.888713807366555, + "grad_norm": 1.781790018081665, + "learning_rate": 6.423757777026285e-06, + "loss": 1.2891, + "step": 24816 + }, + { + "epoch": 0.8887496194961233, + "grad_norm": 1.4497478008270264, + "learning_rate": 6.419668233877197e-06, + "loss": 1.448, + "step": 24817 + }, + { + "epoch": 0.8887854316256916, + "grad_norm": 1.353127121925354, + "learning_rate": 6.415579949730755e-06, + "loss": 1.514, + "step": 24818 + }, + { + "epoch": 0.88882124375526, + "grad_norm": 1.5938628911972046, + "learning_rate": 6.411492924641982e-06, + "loss": 1.3923, + "step": 24819 + }, + { + "epoch": 0.8888570558848282, + "grad_norm": 1.3531911373138428, + "learning_rate": 6.407407158665846e-06, + "loss": 1.4075, + "step": 24820 + }, + { + "epoch": 0.8888928680143965, + "grad_norm": 1.8812788724899292, + "learning_rate": 6.403322651857313e-06, + "loss": 1.4905, + "step": 24821 + }, + { + "epoch": 0.8889286801439648, + "grad_norm": 1.4424673318862915, + "learning_rate": 6.399239404271362e-06, + "loss": 1.4742, + "step": 24822 + }, + { + "epoch": 0.888964492273533, + "grad_norm": 1.4213186502456665, + "learning_rate": 6.395157415962894e-06, + "loss": 1.4607, + "step": 24823 + }, + { + "epoch": 0.8890003044031013, + "grad_norm": 1.715586543083191, + "learning_rate": 6.39107668698683e-06, + "loss": 1.298, + "step": 24824 + }, + { + "epoch": 0.8890361165326696, + "grad_norm": 1.4657044410705566, + "learning_rate": 6.386997217398094e-06, + "loss": 1.6368, + "step": 24825 + }, + { + "epoch": 0.8890719286622379, + "grad_norm": 1.7812143564224243, + "learning_rate": 6.382919007251575e-06, + "loss": 1.4923, + "step": 24826 + }, + { + "epoch": 0.8891077407918062, + "grad_norm": 2.4855334758758545, + "learning_rate": 6.378842056602097e-06, + "loss": 1.3953, + "step": 24827 + }, + { + "epoch": 0.8891435529213745, + "grad_norm": 1.778143286705017, + "learning_rate": 6.374766365504547e-06, + "loss": 1.3601, + "step": 24828 + }, + { + "epoch": 0.8891793650509427, + "grad_norm": 1.8928757905960083, + "learning_rate": 6.370691934013761e-06, + "loss": 1.29, + "step": 24829 + }, + { + "epoch": 0.889215177180511, + "grad_norm": 1.5196993350982666, + "learning_rate": 6.366618762184529e-06, + "loss": 1.4735, + "step": 24830 + }, + { + "epoch": 0.8892509893100793, + "grad_norm": 1.5085698366165161, + "learning_rate": 6.36254685007166e-06, + "loss": 1.7028, + "step": 24831 + }, + { + "epoch": 0.8892868014396476, + "grad_norm": 1.4420183897018433, + "learning_rate": 6.358476197729934e-06, + "loss": 1.086, + "step": 24832 + }, + { + "epoch": 0.8893226135692159, + "grad_norm": 1.4737896919250488, + "learning_rate": 6.3544068052141415e-06, + "loss": 1.3265, + "step": 24833 + }, + { + "epoch": 0.8893584256987842, + "grad_norm": 1.763348937034607, + "learning_rate": 6.3503386725790034e-06, + "loss": 1.3154, + "step": 24834 + }, + { + "epoch": 0.8893942378283525, + "grad_norm": 1.7766228914260864, + "learning_rate": 6.346271799879244e-06, + "loss": 1.1381, + "step": 24835 + }, + { + "epoch": 0.8894300499579207, + "grad_norm": 1.7439228296279907, + "learning_rate": 6.342206187169608e-06, + "loss": 1.1344, + "step": 24836 + }, + { + "epoch": 0.889465862087489, + "grad_norm": 1.729805588722229, + "learning_rate": 6.338141834504785e-06, + "loss": 1.3619, + "step": 24837 + }, + { + "epoch": 0.8895016742170573, + "grad_norm": 2.5667622089385986, + "learning_rate": 6.3340787419394535e-06, + "loss": 1.2524, + "step": 24838 + }, + { + "epoch": 0.8895374863466255, + "grad_norm": 1.547900676727295, + "learning_rate": 6.330016909528236e-06, + "loss": 1.2293, + "step": 24839 + }, + { + "epoch": 0.8895732984761939, + "grad_norm": 1.8758774995803833, + "learning_rate": 6.325956337325845e-06, + "loss": 1.3833, + "step": 24840 + }, + { + "epoch": 0.8896091106057622, + "grad_norm": 1.384765625, + "learning_rate": 6.321897025386869e-06, + "loss": 1.1106, + "step": 24841 + }, + { + "epoch": 0.8896449227353305, + "grad_norm": 1.5886365175247192, + "learning_rate": 6.317838973765944e-06, + "loss": 1.6064, + "step": 24842 + }, + { + "epoch": 0.8896807348648987, + "grad_norm": 1.6912497282028198, + "learning_rate": 6.313782182517636e-06, + "loss": 1.5548, + "step": 24843 + }, + { + "epoch": 0.889716546994467, + "grad_norm": 1.452126145362854, + "learning_rate": 6.309726651696557e-06, + "loss": 1.4011, + "step": 24844 + }, + { + "epoch": 0.8897523591240353, + "grad_norm": 1.428489089012146, + "learning_rate": 6.305672381357264e-06, + "loss": 1.322, + "step": 24845 + }, + { + "epoch": 0.8897881712536035, + "grad_norm": 1.99895179271698, + "learning_rate": 6.301619371554257e-06, + "loss": 1.6756, + "step": 24846 + }, + { + "epoch": 0.8898239833831719, + "grad_norm": 2.284052848815918, + "learning_rate": 6.297567622342127e-06, + "loss": 1.495, + "step": 24847 + }, + { + "epoch": 0.8898597955127402, + "grad_norm": 1.3983558416366577, + "learning_rate": 6.29351713377535e-06, + "loss": 1.1678, + "step": 24848 + }, + { + "epoch": 0.8898956076423085, + "grad_norm": 1.4451669454574585, + "learning_rate": 6.289467905908442e-06, + "loss": 1.4856, + "step": 24849 + }, + { + "epoch": 0.8899314197718767, + "grad_norm": 1.845213770866394, + "learning_rate": 6.285419938795833e-06, + "loss": 1.5874, + "step": 24850 + }, + { + "epoch": 0.889967231901445, + "grad_norm": 1.7814226150512695, + "learning_rate": 6.281373232492038e-06, + "loss": 1.3611, + "step": 24851 + }, + { + "epoch": 0.8900030440310133, + "grad_norm": 2.072495222091675, + "learning_rate": 6.2773277870514675e-06, + "loss": 1.3698, + "step": 24852 + }, + { + "epoch": 0.8900388561605815, + "grad_norm": 1.623978853225708, + "learning_rate": 6.273283602528579e-06, + "loss": 1.0922, + "step": 24853 + }, + { + "epoch": 0.8900746682901499, + "grad_norm": 1.4474862813949585, + "learning_rate": 6.269240678977739e-06, + "loss": 1.3673, + "step": 24854 + }, + { + "epoch": 0.8901104804197182, + "grad_norm": 1.9575895071029663, + "learning_rate": 6.265199016453371e-06, + "loss": 1.5366, + "step": 24855 + }, + { + "epoch": 0.8901462925492865, + "grad_norm": 1.3855613470077515, + "learning_rate": 6.261158615009843e-06, + "loss": 1.3095, + "step": 24856 + }, + { + "epoch": 0.8901821046788547, + "grad_norm": 1.46273934841156, + "learning_rate": 6.25711947470149e-06, + "loss": 1.7024, + "step": 24857 + }, + { + "epoch": 0.890217916808423, + "grad_norm": 1.435225248336792, + "learning_rate": 6.253081595582699e-06, + "loss": 1.3673, + "step": 24858 + }, + { + "epoch": 0.8902537289379913, + "grad_norm": 1.5706956386566162, + "learning_rate": 6.249044977707763e-06, + "loss": 1.3358, + "step": 24859 + }, + { + "epoch": 0.8902895410675595, + "grad_norm": 1.7113782167434692, + "learning_rate": 6.245009621131004e-06, + "loss": 1.4194, + "step": 24860 + }, + { + "epoch": 0.8903253531971279, + "grad_norm": 1.5109610557556152, + "learning_rate": 6.2409755259066786e-06, + "loss": 1.3438, + "step": 24861 + }, + { + "epoch": 0.8903611653266962, + "grad_norm": 1.6783028841018677, + "learning_rate": 6.23694269208912e-06, + "loss": 1.2793, + "step": 24862 + }, + { + "epoch": 0.8903969774562644, + "grad_norm": 2.1244373321533203, + "learning_rate": 6.232911119732554e-06, + "loss": 1.4257, + "step": 24863 + }, + { + "epoch": 0.8904327895858327, + "grad_norm": 1.8511030673980713, + "learning_rate": 6.228880808891202e-06, + "loss": 1.1956, + "step": 24864 + }, + { + "epoch": 0.890468601715401, + "grad_norm": 1.8468077182769775, + "learning_rate": 6.224851759619299e-06, + "loss": 1.4459, + "step": 24865 + }, + { + "epoch": 0.8905044138449693, + "grad_norm": 1.4527554512023926, + "learning_rate": 6.220823971971046e-06, + "loss": 1.4473, + "step": 24866 + }, + { + "epoch": 0.8905402259745375, + "grad_norm": 1.7244460582733154, + "learning_rate": 6.216797446000666e-06, + "loss": 1.3496, + "step": 24867 + }, + { + "epoch": 0.8905760381041059, + "grad_norm": 1.394762635231018, + "learning_rate": 6.212772181762283e-06, + "loss": 1.3262, + "step": 24868 + }, + { + "epoch": 0.8906118502336742, + "grad_norm": 1.723497748374939, + "learning_rate": 6.208748179310087e-06, + "loss": 1.1949, + "step": 24869 + }, + { + "epoch": 0.8906476623632424, + "grad_norm": 1.4614174365997314, + "learning_rate": 6.204725438698189e-06, + "loss": 1.5436, + "step": 24870 + }, + { + "epoch": 0.8906834744928107, + "grad_norm": 1.726501703262329, + "learning_rate": 6.200703959980747e-06, + "loss": 1.3518, + "step": 24871 + }, + { + "epoch": 0.890719286622379, + "grad_norm": 1.5753802061080933, + "learning_rate": 6.196683743211818e-06, + "loss": 1.4447, + "step": 24872 + }, + { + "epoch": 0.8907550987519472, + "grad_norm": 1.9597927331924438, + "learning_rate": 6.192664788445513e-06, + "loss": 1.5194, + "step": 24873 + }, + { + "epoch": 0.8907909108815155, + "grad_norm": 1.322771668434143, + "learning_rate": 6.188647095735911e-06, + "loss": 1.6306, + "step": 24874 + }, + { + "epoch": 0.8908267230110839, + "grad_norm": 1.6770081520080566, + "learning_rate": 6.184630665137048e-06, + "loss": 1.2971, + "step": 24875 + }, + { + "epoch": 0.8908625351406522, + "grad_norm": 1.794987440109253, + "learning_rate": 6.180615496702968e-06, + "loss": 1.5026, + "step": 24876 + }, + { + "epoch": 0.8908983472702204, + "grad_norm": 1.406591534614563, + "learning_rate": 6.176601590487685e-06, + "loss": 1.1957, + "step": 24877 + }, + { + "epoch": 0.8909341593997887, + "grad_norm": 1.7108861207962036, + "learning_rate": 6.17258894654521e-06, + "loss": 1.3956, + "step": 24878 + }, + { + "epoch": 0.890969971529357, + "grad_norm": 1.513010859489441, + "learning_rate": 6.168577564929523e-06, + "loss": 1.6367, + "step": 24879 + }, + { + "epoch": 0.8910057836589252, + "grad_norm": 1.442939043045044, + "learning_rate": 6.16456744569458e-06, + "loss": 1.1821, + "step": 24880 + }, + { + "epoch": 0.8910415957884935, + "grad_norm": 2.025557279586792, + "learning_rate": 6.160558588894361e-06, + "loss": 1.5068, + "step": 24881 + }, + { + "epoch": 0.8910774079180619, + "grad_norm": 1.4389938116073608, + "learning_rate": 6.156550994582766e-06, + "loss": 1.0765, + "step": 24882 + }, + { + "epoch": 0.8911132200476302, + "grad_norm": 1.3525745868682861, + "learning_rate": 6.1525446628137306e-06, + "loss": 1.2608, + "step": 24883 + }, + { + "epoch": 0.8911490321771984, + "grad_norm": 1.968949794769287, + "learning_rate": 6.148539593641156e-06, + "loss": 1.5984, + "step": 24884 + }, + { + "epoch": 0.8911848443067667, + "grad_norm": 1.5880968570709229, + "learning_rate": 6.144535787118921e-06, + "loss": 1.5123, + "step": 24885 + }, + { + "epoch": 0.891220656436335, + "grad_norm": 2.515672206878662, + "learning_rate": 6.140533243300894e-06, + "loss": 1.5112, + "step": 24886 + }, + { + "epoch": 0.8912564685659032, + "grad_norm": 1.2918782234191895, + "learning_rate": 6.13653196224091e-06, + "loss": 1.0852, + "step": 24887 + }, + { + "epoch": 0.8912922806954715, + "grad_norm": 1.7701398134231567, + "learning_rate": 6.132531943992826e-06, + "loss": 1.1541, + "step": 24888 + }, + { + "epoch": 0.8913280928250399, + "grad_norm": 1.8440961837768555, + "learning_rate": 6.128533188610453e-06, + "loss": 1.3715, + "step": 24889 + }, + { + "epoch": 0.8913639049546082, + "grad_norm": 2.0885820388793945, + "learning_rate": 6.124535696147559e-06, + "loss": 1.8181, + "step": 24890 + }, + { + "epoch": 0.8913997170841764, + "grad_norm": 1.7771596908569336, + "learning_rate": 6.12053946665796e-06, + "loss": 1.2956, + "step": 24891 + }, + { + "epoch": 0.8914355292137447, + "grad_norm": 1.7697362899780273, + "learning_rate": 6.1165445001954095e-06, + "loss": 1.2343, + "step": 24892 + }, + { + "epoch": 0.891471341343313, + "grad_norm": 2.0563573837280273, + "learning_rate": 6.112550796813643e-06, + "loss": 1.5779, + "step": 24893 + }, + { + "epoch": 0.8915071534728812, + "grad_norm": 1.5106770992279053, + "learning_rate": 6.108558356566396e-06, + "loss": 1.5096, + "step": 24894 + }, + { + "epoch": 0.8915429656024495, + "grad_norm": 1.5133570432662964, + "learning_rate": 6.104567179507381e-06, + "loss": 1.21, + "step": 24895 + }, + { + "epoch": 0.8915787777320179, + "grad_norm": 1.82058584690094, + "learning_rate": 6.100577265690321e-06, + "loss": 1.6064, + "step": 24896 + }, + { + "epoch": 0.8916145898615861, + "grad_norm": 1.4742602109909058, + "learning_rate": 6.096588615168864e-06, + "loss": 1.2988, + "step": 24897 + }, + { + "epoch": 0.8916504019911544, + "grad_norm": 1.487149953842163, + "learning_rate": 6.092601227996664e-06, + "loss": 1.4094, + "step": 24898 + }, + { + "epoch": 0.8916862141207227, + "grad_norm": 1.983708381652832, + "learning_rate": 6.088615104227413e-06, + "loss": 1.4038, + "step": 24899 + }, + { + "epoch": 0.891722026250291, + "grad_norm": 1.6666429042816162, + "learning_rate": 6.084630243914679e-06, + "loss": 1.379, + "step": 24900 + }, + { + "epoch": 0.8917578383798592, + "grad_norm": 1.616948127746582, + "learning_rate": 6.080646647112109e-06, + "loss": 1.3444, + "step": 24901 + }, + { + "epoch": 0.8917936505094275, + "grad_norm": 1.2481621503829956, + "learning_rate": 6.076664313873293e-06, + "loss": 1.2012, + "step": 24902 + }, + { + "epoch": 0.8918294626389959, + "grad_norm": 1.9596446752548218, + "learning_rate": 6.07268324425182e-06, + "loss": 1.6332, + "step": 24903 + }, + { + "epoch": 0.8918652747685641, + "grad_norm": 2.1639833450317383, + "learning_rate": 6.068703438301226e-06, + "loss": 1.6922, + "step": 24904 + }, + { + "epoch": 0.8919010868981324, + "grad_norm": 1.326301097869873, + "learning_rate": 6.064724896075058e-06, + "loss": 1.5365, + "step": 24905 + }, + { + "epoch": 0.8919368990277007, + "grad_norm": 1.8309438228607178, + "learning_rate": 6.06074761762685e-06, + "loss": 1.2234, + "step": 24906 + }, + { + "epoch": 0.891972711157269, + "grad_norm": 1.4915207624435425, + "learning_rate": 6.056771603010125e-06, + "loss": 1.4147, + "step": 24907 + }, + { + "epoch": 0.8920085232868372, + "grad_norm": 1.525928020477295, + "learning_rate": 6.052796852278353e-06, + "loss": 1.5172, + "step": 24908 + }, + { + "epoch": 0.8920443354164055, + "grad_norm": 1.383877158164978, + "learning_rate": 6.048823365485012e-06, + "loss": 1.2705, + "step": 24909 + }, + { + "epoch": 0.8920801475459739, + "grad_norm": 1.3349719047546387, + "learning_rate": 6.044851142683572e-06, + "loss": 1.3585, + "step": 24910 + }, + { + "epoch": 0.8921159596755421, + "grad_norm": 2.2004175186157227, + "learning_rate": 6.040880183927455e-06, + "loss": 1.3931, + "step": 24911 + }, + { + "epoch": 0.8921517718051104, + "grad_norm": 1.7021312713623047, + "learning_rate": 6.036910489270098e-06, + "loss": 1.2756, + "step": 24912 + }, + { + "epoch": 0.8921875839346787, + "grad_norm": 1.871277093887329, + "learning_rate": 6.0329420587649124e-06, + "loss": 1.5018, + "step": 24913 + }, + { + "epoch": 0.8922233960642469, + "grad_norm": 2.3825483322143555, + "learning_rate": 6.028974892465289e-06, + "loss": 1.2814, + "step": 24914 + }, + { + "epoch": 0.8922592081938152, + "grad_norm": 2.0105133056640625, + "learning_rate": 6.025008990424585e-06, + "loss": 1.2743, + "step": 24915 + }, + { + "epoch": 0.8922950203233835, + "grad_norm": 2.2031428813934326, + "learning_rate": 6.021044352696159e-06, + "loss": 1.8541, + "step": 24916 + }, + { + "epoch": 0.8923308324529519, + "grad_norm": 1.5223588943481445, + "learning_rate": 6.017080979333378e-06, + "loss": 1.491, + "step": 24917 + }, + { + "epoch": 0.8923666445825201, + "grad_norm": 4.320333957672119, + "learning_rate": 6.013118870389523e-06, + "loss": 1.7306, + "step": 24918 + }, + { + "epoch": 0.8924024567120884, + "grad_norm": 1.5868854522705078, + "learning_rate": 6.009158025917927e-06, + "loss": 1.2724, + "step": 24919 + }, + { + "epoch": 0.8924382688416567, + "grad_norm": 1.3187925815582275, + "learning_rate": 6.00519844597186e-06, + "loss": 1.2816, + "step": 24920 + }, + { + "epoch": 0.8924740809712249, + "grad_norm": 1.4020674228668213, + "learning_rate": 6.001240130604624e-06, + "loss": 1.484, + "step": 24921 + }, + { + "epoch": 0.8925098931007932, + "grad_norm": 1.5411213636398315, + "learning_rate": 5.997283079869442e-06, + "loss": 1.261, + "step": 24922 + }, + { + "epoch": 0.8925457052303615, + "grad_norm": 2.140064001083374, + "learning_rate": 5.993327293819562e-06, + "loss": 1.439, + "step": 24923 + }, + { + "epoch": 0.8925815173599299, + "grad_norm": 1.97054922580719, + "learning_rate": 5.989372772508195e-06, + "loss": 1.3858, + "step": 24924 + }, + { + "epoch": 0.8926173294894981, + "grad_norm": 2.1375653743743896, + "learning_rate": 5.985419515988566e-06, + "loss": 1.3219, + "step": 24925 + }, + { + "epoch": 0.8926531416190664, + "grad_norm": 1.6442660093307495, + "learning_rate": 5.981467524313855e-06, + "loss": 1.1661, + "step": 24926 + }, + { + "epoch": 0.8926889537486347, + "grad_norm": 1.7716608047485352, + "learning_rate": 5.977516797537186e-06, + "loss": 1.5978, + "step": 24927 + }, + { + "epoch": 0.8927247658782029, + "grad_norm": 1.8038569688796997, + "learning_rate": 5.973567335711783e-06, + "loss": 1.3623, + "step": 24928 + }, + { + "epoch": 0.8927605780077712, + "grad_norm": 2.1996359825134277, + "learning_rate": 5.969619138890737e-06, + "loss": 1.1365, + "step": 24929 + }, + { + "epoch": 0.8927963901373395, + "grad_norm": 1.420465350151062, + "learning_rate": 5.965672207127171e-06, + "loss": 1.3239, + "step": 24930 + }, + { + "epoch": 0.8928322022669078, + "grad_norm": 1.2334009408950806, + "learning_rate": 5.961726540474189e-06, + "loss": 1.3553, + "step": 24931 + }, + { + "epoch": 0.8928680143964761, + "grad_norm": 1.63217294216156, + "learning_rate": 5.95778213898488e-06, + "loss": 1.382, + "step": 24932 + }, + { + "epoch": 0.8929038265260444, + "grad_norm": 1.6627821922302246, + "learning_rate": 5.9538390027123025e-06, + "loss": 1.3492, + "step": 24933 + }, + { + "epoch": 0.8929396386556127, + "grad_norm": 1.619146466255188, + "learning_rate": 5.949897131709514e-06, + "loss": 1.6053, + "step": 24934 + }, + { + "epoch": 0.8929754507851809, + "grad_norm": 1.4433724880218506, + "learning_rate": 5.94595652602955e-06, + "loss": 1.4331, + "step": 24935 + }, + { + "epoch": 0.8930112629147492, + "grad_norm": 1.6781110763549805, + "learning_rate": 5.9420171857254126e-06, + "loss": 1.6084, + "step": 24936 + }, + { + "epoch": 0.8930470750443175, + "grad_norm": 1.698456883430481, + "learning_rate": 5.938079110850114e-06, + "loss": 1.4175, + "step": 24937 + }, + { + "epoch": 0.8930828871738858, + "grad_norm": 2.07592511177063, + "learning_rate": 5.934142301456613e-06, + "loss": 1.6356, + "step": 24938 + }, + { + "epoch": 0.8931186993034541, + "grad_norm": 1.7842410802841187, + "learning_rate": 5.9302067575979115e-06, + "loss": 1.3476, + "step": 24939 + }, + { + "epoch": 0.8931545114330224, + "grad_norm": 1.0301146507263184, + "learning_rate": 5.926272479326922e-06, + "loss": 1.4211, + "step": 24940 + }, + { + "epoch": 0.8931903235625906, + "grad_norm": 1.4850590229034424, + "learning_rate": 5.922339466696591e-06, + "loss": 1.1895, + "step": 24941 + }, + { + "epoch": 0.8932261356921589, + "grad_norm": 1.8592171669006348, + "learning_rate": 5.918407719759844e-06, + "loss": 1.6507, + "step": 24942 + }, + { + "epoch": 0.8932619478217272, + "grad_norm": 1.9317047595977783, + "learning_rate": 5.914477238569549e-06, + "loss": 1.2963, + "step": 24943 + }, + { + "epoch": 0.8932977599512955, + "grad_norm": 2.370089292526245, + "learning_rate": 5.91054802317862e-06, + "loss": 1.7081, + "step": 24944 + }, + { + "epoch": 0.8933335720808638, + "grad_norm": 2.0202910900115967, + "learning_rate": 5.906620073639868e-06, + "loss": 1.5157, + "step": 24945 + }, + { + "epoch": 0.8933693842104321, + "grad_norm": 1.8120547533035278, + "learning_rate": 5.902693390006209e-06, + "loss": 1.6128, + "step": 24946 + }, + { + "epoch": 0.8934051963400004, + "grad_norm": 1.7174150943756104, + "learning_rate": 5.89876797233041e-06, + "loss": 1.5787, + "step": 24947 + }, + { + "epoch": 0.8934410084695686, + "grad_norm": 2.4649229049682617, + "learning_rate": 5.894843820665319e-06, + "loss": 1.7147, + "step": 24948 + }, + { + "epoch": 0.8934768205991369, + "grad_norm": 1.6594107151031494, + "learning_rate": 5.890920935063693e-06, + "loss": 1.503, + "step": 24949 + }, + { + "epoch": 0.8935126327287052, + "grad_norm": 2.6280345916748047, + "learning_rate": 5.8869993155783675e-06, + "loss": 1.2833, + "step": 24950 + }, + { + "epoch": 0.8935484448582734, + "grad_norm": 1.1953978538513184, + "learning_rate": 5.883078962262056e-06, + "loss": 1.6845, + "step": 24951 + }, + { + "epoch": 0.8935842569878418, + "grad_norm": 1.5348491668701172, + "learning_rate": 5.879159875167517e-06, + "loss": 1.4533, + "step": 24952 + }, + { + "epoch": 0.8936200691174101, + "grad_norm": 1.3968113660812378, + "learning_rate": 5.875242054347463e-06, + "loss": 1.3589, + "step": 24953 + }, + { + "epoch": 0.8936558812469784, + "grad_norm": 1.4299107789993286, + "learning_rate": 5.871325499854618e-06, + "loss": 1.2363, + "step": 24954 + }, + { + "epoch": 0.8936916933765466, + "grad_norm": 1.3905788660049438, + "learning_rate": 5.867410211741686e-06, + "loss": 1.6219, + "step": 24955 + }, + { + "epoch": 0.8937275055061149, + "grad_norm": 1.5503168106079102, + "learning_rate": 5.863496190061302e-06, + "loss": 1.6683, + "step": 24956 + }, + { + "epoch": 0.8937633176356832, + "grad_norm": 2.048081159591675, + "learning_rate": 5.859583434866167e-06, + "loss": 1.3548, + "step": 24957 + }, + { + "epoch": 0.8937991297652514, + "grad_norm": 1.719864010810852, + "learning_rate": 5.855671946208896e-06, + "loss": 1.3932, + "step": 24958 + }, + { + "epoch": 0.8938349418948198, + "grad_norm": 1.683786153793335, + "learning_rate": 5.851761724142147e-06, + "loss": 1.3284, + "step": 24959 + }, + { + "epoch": 0.8938707540243881, + "grad_norm": 1.8406994342803955, + "learning_rate": 5.8478527687184755e-06, + "loss": 1.3094, + "step": 24960 + }, + { + "epoch": 0.8939065661539564, + "grad_norm": 1.4304991960525513, + "learning_rate": 5.843945079990498e-06, + "loss": 1.4482, + "step": 24961 + }, + { + "epoch": 0.8939423782835246, + "grad_norm": 1.7396273612976074, + "learning_rate": 5.840038658010805e-06, + "loss": 1.2779, + "step": 24962 + }, + { + "epoch": 0.8939781904130929, + "grad_norm": 1.7192277908325195, + "learning_rate": 5.83613350283192e-06, + "loss": 1.3281, + "step": 24963 + }, + { + "epoch": 0.8940140025426612, + "grad_norm": 2.3847055435180664, + "learning_rate": 5.83222961450639e-06, + "loss": 1.6135, + "step": 24964 + }, + { + "epoch": 0.8940498146722294, + "grad_norm": 2.184673309326172, + "learning_rate": 5.828326993086741e-06, + "loss": 1.3081, + "step": 24965 + }, + { + "epoch": 0.8940856268017978, + "grad_norm": 1.49175226688385, + "learning_rate": 5.824425638625508e-06, + "loss": 1.1297, + "step": 24966 + }, + { + "epoch": 0.8941214389313661, + "grad_norm": 1.4161489009857178, + "learning_rate": 5.820525551175104e-06, + "loss": 1.3759, + "step": 24967 + }, + { + "epoch": 0.8941572510609344, + "grad_norm": 1.6693317890167236, + "learning_rate": 5.8166267307880885e-06, + "loss": 1.653, + "step": 24968 + }, + { + "epoch": 0.8941930631905026, + "grad_norm": 1.2906261682510376, + "learning_rate": 5.812729177516874e-06, + "loss": 1.375, + "step": 24969 + }, + { + "epoch": 0.8942288753200709, + "grad_norm": 1.3217500448226929, + "learning_rate": 5.808832891413873e-06, + "loss": 1.1146, + "step": 24970 + }, + { + "epoch": 0.8942646874496392, + "grad_norm": 1.2847282886505127, + "learning_rate": 5.804937872531524e-06, + "loss": 1.1061, + "step": 24971 + }, + { + "epoch": 0.8943004995792074, + "grad_norm": 1.6961958408355713, + "learning_rate": 5.8010441209222384e-06, + "loss": 1.4849, + "step": 24972 + }, + { + "epoch": 0.8943363117087758, + "grad_norm": 1.6735053062438965, + "learning_rate": 5.797151636638409e-06, + "loss": 1.1823, + "step": 24973 + }, + { + "epoch": 0.8943721238383441, + "grad_norm": 1.3020800352096558, + "learning_rate": 5.7932604197323826e-06, + "loss": 1.0688, + "step": 24974 + }, + { + "epoch": 0.8944079359679123, + "grad_norm": 1.8242229223251343, + "learning_rate": 5.789370470256517e-06, + "loss": 1.2799, + "step": 24975 + }, + { + "epoch": 0.8944437480974806, + "grad_norm": 1.5822815895080566, + "learning_rate": 5.785481788263147e-06, + "loss": 1.4551, + "step": 24976 + }, + { + "epoch": 0.8944795602270489, + "grad_norm": 1.5726370811462402, + "learning_rate": 5.7815943738046e-06, + "loss": 1.1166, + "step": 24977 + }, + { + "epoch": 0.8945153723566172, + "grad_norm": 1.3955146074295044, + "learning_rate": 5.777708226933165e-06, + "loss": 1.4049, + "step": 24978 + }, + { + "epoch": 0.8945511844861854, + "grad_norm": 1.5697683095932007, + "learning_rate": 5.773823347701124e-06, + "loss": 1.5281, + "step": 24979 + }, + { + "epoch": 0.8945869966157538, + "grad_norm": 1.8382532596588135, + "learning_rate": 5.7699397361607564e-06, + "loss": 1.2936, + "step": 24980 + }, + { + "epoch": 0.8946228087453221, + "grad_norm": 1.5816271305084229, + "learning_rate": 5.766057392364288e-06, + "loss": 1.4626, + "step": 24981 + }, + { + "epoch": 0.8946586208748903, + "grad_norm": 1.3491520881652832, + "learning_rate": 5.7621763163639655e-06, + "loss": 1.4557, + "step": 24982 + }, + { + "epoch": 0.8946944330044586, + "grad_norm": 1.6579393148422241, + "learning_rate": 5.758296508212013e-06, + "loss": 1.3494, + "step": 24983 + }, + { + "epoch": 0.8947302451340269, + "grad_norm": 1.8803688287734985, + "learning_rate": 5.7544179679606234e-06, + "loss": 1.9863, + "step": 24984 + }, + { + "epoch": 0.8947660572635951, + "grad_norm": 1.9124130010604858, + "learning_rate": 5.750540695661955e-06, + "loss": 1.3755, + "step": 24985 + }, + { + "epoch": 0.8948018693931634, + "grad_norm": 2.12723970413208, + "learning_rate": 5.746664691368187e-06, + "loss": 1.3116, + "step": 24986 + }, + { + "epoch": 0.8948376815227318, + "grad_norm": 1.3383265733718872, + "learning_rate": 5.742789955131489e-06, + "loss": 1.3311, + "step": 24987 + }, + { + "epoch": 0.8948734936523001, + "grad_norm": 2.026008367538452, + "learning_rate": 5.7389164870039535e-06, + "loss": 1.4909, + "step": 24988 + }, + { + "epoch": 0.8949093057818683, + "grad_norm": 1.573212742805481, + "learning_rate": 5.735044287037705e-06, + "loss": 1.3221, + "step": 24989 + }, + { + "epoch": 0.8949451179114366, + "grad_norm": 1.6352729797363281, + "learning_rate": 5.7311733552848355e-06, + "loss": 1.5761, + "step": 24990 + }, + { + "epoch": 0.8949809300410049, + "grad_norm": 1.3640257120132446, + "learning_rate": 5.727303691797459e-06, + "loss": 1.2875, + "step": 24991 + }, + { + "epoch": 0.8950167421705731, + "grad_norm": 2.3444406986236572, + "learning_rate": 5.723435296627588e-06, + "loss": 1.3894, + "step": 24992 + }, + { + "epoch": 0.8950525543001414, + "grad_norm": 1.89201021194458, + "learning_rate": 5.719568169827283e-06, + "loss": 1.1353, + "step": 24993 + }, + { + "epoch": 0.8950883664297098, + "grad_norm": 1.320304274559021, + "learning_rate": 5.71570231144859e-06, + "loss": 1.4152, + "step": 24994 + }, + { + "epoch": 0.8951241785592781, + "grad_norm": 2.161402940750122, + "learning_rate": 5.7118377215435e-06, + "loss": 1.3054, + "step": 24995 + }, + { + "epoch": 0.8951599906888463, + "grad_norm": 1.4582648277282715, + "learning_rate": 5.7079744001640065e-06, + "loss": 1.5407, + "step": 24996 + }, + { + "epoch": 0.8951958028184146, + "grad_norm": 1.544952630996704, + "learning_rate": 5.70411234736209e-06, + "loss": 1.2509, + "step": 24997 + }, + { + "epoch": 0.8952316149479829, + "grad_norm": 1.4822784662246704, + "learning_rate": 5.700251563189718e-06, + "loss": 1.6312, + "step": 24998 + }, + { + "epoch": 0.8952674270775511, + "grad_norm": 2.0058014392852783, + "learning_rate": 5.696392047698817e-06, + "loss": 1.6654, + "step": 24999 + }, + { + "epoch": 0.8953032392071194, + "grad_norm": 2.085139751434326, + "learning_rate": 5.6925338009413136e-06, + "loss": 1.3925, + "step": 25000 + }, + { + "epoch": 0.8953390513366878, + "grad_norm": 1.3141660690307617, + "learning_rate": 5.688676822969119e-06, + "loss": 1.3244, + "step": 25001 + }, + { + "epoch": 0.895374863466256, + "grad_norm": 1.5696529150009155, + "learning_rate": 5.684821113834138e-06, + "loss": 1.462, + "step": 25002 + }, + { + "epoch": 0.8954106755958243, + "grad_norm": 1.421932578086853, + "learning_rate": 5.680966673588217e-06, + "loss": 1.2906, + "step": 25003 + }, + { + "epoch": 0.8954464877253926, + "grad_norm": 2.3099539279937744, + "learning_rate": 5.677113502283227e-06, + "loss": 1.2604, + "step": 25004 + }, + { + "epoch": 0.8954822998549609, + "grad_norm": 1.683809757232666, + "learning_rate": 5.673261599971025e-06, + "loss": 1.7746, + "step": 25005 + }, + { + "epoch": 0.8955181119845291, + "grad_norm": 1.5996848344802856, + "learning_rate": 5.669410966703393e-06, + "loss": 1.1726, + "step": 25006 + }, + { + "epoch": 0.8955539241140974, + "grad_norm": 1.568306803703308, + "learning_rate": 5.665561602532165e-06, + "loss": 1.6927, + "step": 25007 + }, + { + "epoch": 0.8955897362436658, + "grad_norm": 1.9789416790008545, + "learning_rate": 5.661713507509126e-06, + "loss": 1.327, + "step": 25008 + }, + { + "epoch": 0.895625548373234, + "grad_norm": 1.8262815475463867, + "learning_rate": 5.657866681686053e-06, + "loss": 1.6479, + "step": 25009 + }, + { + "epoch": 0.8956613605028023, + "grad_norm": 1.8193621635437012, + "learning_rate": 5.654021125114672e-06, + "loss": 1.9024, + "step": 25010 + }, + { + "epoch": 0.8956971726323706, + "grad_norm": 2.2621870040893555, + "learning_rate": 5.6501768378467546e-06, + "loss": 1.37, + "step": 25011 + }, + { + "epoch": 0.8957329847619389, + "grad_norm": 2.0445046424865723, + "learning_rate": 5.646333819933991e-06, + "loss": 1.2324, + "step": 25012 + }, + { + "epoch": 0.8957687968915071, + "grad_norm": 2.021339178085327, + "learning_rate": 5.642492071428118e-06, + "loss": 1.6428, + "step": 25013 + }, + { + "epoch": 0.8958046090210754, + "grad_norm": 2.2916150093078613, + "learning_rate": 5.638651592380795e-06, + "loss": 1.6459, + "step": 25014 + }, + { + "epoch": 0.8958404211506438, + "grad_norm": 1.9969441890716553, + "learning_rate": 5.63481238284368e-06, + "loss": 1.1157, + "step": 25015 + }, + { + "epoch": 0.895876233280212, + "grad_norm": 1.4788435697555542, + "learning_rate": 5.630974442868475e-06, + "loss": 1.5839, + "step": 25016 + }, + { + "epoch": 0.8959120454097803, + "grad_norm": 1.4995172023773193, + "learning_rate": 5.627137772506752e-06, + "loss": 1.339, + "step": 25017 + }, + { + "epoch": 0.8959478575393486, + "grad_norm": 1.5286364555358887, + "learning_rate": 5.623302371810169e-06, + "loss": 1.2414, + "step": 25018 + }, + { + "epoch": 0.8959836696689168, + "grad_norm": 1.7818827629089355, + "learning_rate": 5.619468240830306e-06, + "loss": 1.4386, + "step": 25019 + }, + { + "epoch": 0.8960194817984851, + "grad_norm": 2.339745044708252, + "learning_rate": 5.615635379618778e-06, + "loss": 1.3534, + "step": 25020 + }, + { + "epoch": 0.8960552939280534, + "grad_norm": 1.4172555208206177, + "learning_rate": 5.61180378822711e-06, + "loss": 1.5372, + "step": 25021 + }, + { + "epoch": 0.8960911060576218, + "grad_norm": 1.7020219564437866, + "learning_rate": 5.607973466706873e-06, + "loss": 1.2331, + "step": 25022 + }, + { + "epoch": 0.89612691818719, + "grad_norm": 1.4076426029205322, + "learning_rate": 5.604144415109614e-06, + "loss": 1.187, + "step": 25023 + }, + { + "epoch": 0.8961627303167583, + "grad_norm": 1.5349396467208862, + "learning_rate": 5.600316633486802e-06, + "loss": 1.3395, + "step": 25024 + }, + { + "epoch": 0.8961985424463266, + "grad_norm": 2.8167307376861572, + "learning_rate": 5.596490121889975e-06, + "loss": 1.4081, + "step": 25025 + }, + { + "epoch": 0.8962343545758948, + "grad_norm": 1.624639630317688, + "learning_rate": 5.592664880370602e-06, + "loss": 1.3656, + "step": 25026 + }, + { + "epoch": 0.8962701667054631, + "grad_norm": 2.108041524887085, + "learning_rate": 5.588840908980153e-06, + "loss": 1.2949, + "step": 25027 + }, + { + "epoch": 0.8963059788350314, + "grad_norm": 1.5697460174560547, + "learning_rate": 5.585018207770054e-06, + "loss": 1.7335, + "step": 25028 + }, + { + "epoch": 0.8963417909645998, + "grad_norm": 1.555113434791565, + "learning_rate": 5.581196776791752e-06, + "loss": 1.3761, + "step": 25029 + }, + { + "epoch": 0.896377603094168, + "grad_norm": 1.8838539123535156, + "learning_rate": 5.5773766160966634e-06, + "loss": 1.6462, + "step": 25030 + }, + { + "epoch": 0.8964134152237363, + "grad_norm": 1.76872718334198, + "learning_rate": 5.5735577257361785e-06, + "loss": 1.1874, + "step": 25031 + }, + { + "epoch": 0.8964492273533046, + "grad_norm": 1.5243675708770752, + "learning_rate": 5.569740105761679e-06, + "loss": 1.387, + "step": 25032 + }, + { + "epoch": 0.8964850394828728, + "grad_norm": 1.6961188316345215, + "learning_rate": 5.565923756224489e-06, + "loss": 1.4076, + "step": 25033 + }, + { + "epoch": 0.8965208516124411, + "grad_norm": 2.217806816101074, + "learning_rate": 5.562108677176015e-06, + "loss": 1.3709, + "step": 25034 + }, + { + "epoch": 0.8965566637420094, + "grad_norm": 1.590507984161377, + "learning_rate": 5.558294868667535e-06, + "loss": 1.444, + "step": 25035 + }, + { + "epoch": 0.8965924758715778, + "grad_norm": 1.7941250801086426, + "learning_rate": 5.554482330750388e-06, + "loss": 1.5058, + "step": 25036 + }, + { + "epoch": 0.896628288001146, + "grad_norm": 1.739349365234375, + "learning_rate": 5.550671063475832e-06, + "loss": 1.4063, + "step": 25037 + }, + { + "epoch": 0.8966641001307143, + "grad_norm": 1.7744945287704468, + "learning_rate": 5.546861066895193e-06, + "loss": 1.6937, + "step": 25038 + }, + { + "epoch": 0.8966999122602826, + "grad_norm": 1.3879646062850952, + "learning_rate": 5.543052341059707e-06, + "loss": 1.3204, + "step": 25039 + }, + { + "epoch": 0.8967357243898508, + "grad_norm": 1.5593581199645996, + "learning_rate": 5.5392448860205785e-06, + "loss": 1.4807, + "step": 25040 + }, + { + "epoch": 0.8967715365194191, + "grad_norm": 2.365830659866333, + "learning_rate": 5.535438701829088e-06, + "loss": 1.6088, + "step": 25041 + }, + { + "epoch": 0.8968073486489874, + "grad_norm": 2.006086587905884, + "learning_rate": 5.5316337885364165e-06, + "loss": 1.7096, + "step": 25042 + }, + { + "epoch": 0.8968431607785557, + "grad_norm": 2.7186625003814697, + "learning_rate": 5.527830146193758e-06, + "loss": 1.707, + "step": 25043 + }, + { + "epoch": 0.896878972908124, + "grad_norm": 1.4928075075149536, + "learning_rate": 5.5240277748522694e-06, + "loss": 1.0195, + "step": 25044 + }, + { + "epoch": 0.8969147850376923, + "grad_norm": 2.0089306831359863, + "learning_rate": 5.520226674563145e-06, + "loss": 1.4221, + "step": 25045 + }, + { + "epoch": 0.8969505971672606, + "grad_norm": 1.3066225051879883, + "learning_rate": 5.516426845377476e-06, + "loss": 1.3159, + "step": 25046 + }, + { + "epoch": 0.8969864092968288, + "grad_norm": 1.5065250396728516, + "learning_rate": 5.512628287346433e-06, + "loss": 1.3258, + "step": 25047 + }, + { + "epoch": 0.8970222214263971, + "grad_norm": 1.5593777894973755, + "learning_rate": 5.5088310005210865e-06, + "loss": 1.1823, + "step": 25048 + }, + { + "epoch": 0.8970580335559654, + "grad_norm": 1.4893614053726196, + "learning_rate": 5.505034984952529e-06, + "loss": 1.4447, + "step": 25049 + }, + { + "epoch": 0.8970938456855337, + "grad_norm": 1.4334756135940552, + "learning_rate": 5.501240240691852e-06, + "loss": 1.6804, + "step": 25050 + }, + { + "epoch": 0.897129657815102, + "grad_norm": 1.6654868125915527, + "learning_rate": 5.49744676779006e-06, + "loss": 1.3822, + "step": 25051 + }, + { + "epoch": 0.8971654699446703, + "grad_norm": 1.5863392353057861, + "learning_rate": 5.4936545662982455e-06, + "loss": 1.5038, + "step": 25052 + }, + { + "epoch": 0.8972012820742385, + "grad_norm": 1.6635081768035889, + "learning_rate": 5.4898636362674e-06, + "loss": 1.2242, + "step": 25053 + }, + { + "epoch": 0.8972370942038068, + "grad_norm": 1.979265570640564, + "learning_rate": 5.486073977748541e-06, + "loss": 1.1738, + "step": 25054 + }, + { + "epoch": 0.8972729063333751, + "grad_norm": 1.9149836301803589, + "learning_rate": 5.482285590792613e-06, + "loss": 1.4048, + "step": 25055 + }, + { + "epoch": 0.8973087184629434, + "grad_norm": 1.5873132944107056, + "learning_rate": 5.478498475450644e-06, + "loss": 1.4913, + "step": 25056 + }, + { + "epoch": 0.8973445305925117, + "grad_norm": 1.8850102424621582, + "learning_rate": 5.47471263177356e-06, + "loss": 1.5126, + "step": 25057 + }, + { + "epoch": 0.89738034272208, + "grad_norm": 1.7988669872283936, + "learning_rate": 5.470928059812264e-06, + "loss": 1.1521, + "step": 25058 + }, + { + "epoch": 0.8974161548516483, + "grad_norm": 1.4897023439407349, + "learning_rate": 5.467144759617704e-06, + "loss": 1.529, + "step": 25059 + }, + { + "epoch": 0.8974519669812165, + "grad_norm": 1.9250378608703613, + "learning_rate": 5.463362731240773e-06, + "loss": 1.5413, + "step": 25060 + }, + { + "epoch": 0.8974877791107848, + "grad_norm": 1.302215576171875, + "learning_rate": 5.4595819747323636e-06, + "loss": 1.2577, + "step": 25061 + }, + { + "epoch": 0.8975235912403531, + "grad_norm": 1.791074514389038, + "learning_rate": 5.455802490143314e-06, + "loss": 1.4449, + "step": 25062 + }, + { + "epoch": 0.8975594033699213, + "grad_norm": 2.012291669845581, + "learning_rate": 5.4520242775244925e-06, + "loss": 1.218, + "step": 25063 + }, + { + "epoch": 0.8975952154994897, + "grad_norm": 1.5739734172821045, + "learning_rate": 5.4482473369267264e-06, + "loss": 1.1135, + "step": 25064 + }, + { + "epoch": 0.897631027629058, + "grad_norm": 1.5240012407302856, + "learning_rate": 5.444471668400841e-06, + "loss": 1.5621, + "step": 25065 + }, + { + "epoch": 0.8976668397586263, + "grad_norm": 1.944368600845337, + "learning_rate": 5.440697271997608e-06, + "loss": 1.0039, + "step": 25066 + }, + { + "epoch": 0.8977026518881945, + "grad_norm": 1.4575976133346558, + "learning_rate": 5.436924147767819e-06, + "loss": 1.547, + "step": 25067 + }, + { + "epoch": 0.8977384640177628, + "grad_norm": 2.7136669158935547, + "learning_rate": 5.433152295762256e-06, + "loss": 1.402, + "step": 25068 + }, + { + "epoch": 0.8977742761473311, + "grad_norm": 2.263066291809082, + "learning_rate": 5.429381716031634e-06, + "loss": 1.2947, + "step": 25069 + }, + { + "epoch": 0.8978100882768993, + "grad_norm": 1.668306589126587, + "learning_rate": 5.42561240862669e-06, + "loss": 1.6464, + "step": 25070 + }, + { + "epoch": 0.8978459004064677, + "grad_norm": 1.28193998336792, + "learning_rate": 5.421844373598139e-06, + "loss": 1.2046, + "step": 25071 + }, + { + "epoch": 0.897881712536036, + "grad_norm": 1.325032353401184, + "learning_rate": 5.418077610996686e-06, + "loss": 1.4681, + "step": 25072 + }, + { + "epoch": 0.8979175246656043, + "grad_norm": 1.4316115379333496, + "learning_rate": 5.4143121208729885e-06, + "loss": 1.579, + "step": 25073 + }, + { + "epoch": 0.8979533367951725, + "grad_norm": 1.4661067724227905, + "learning_rate": 5.410547903277707e-06, + "loss": 1.2959, + "step": 25074 + }, + { + "epoch": 0.8979891489247408, + "grad_norm": 1.6015124320983887, + "learning_rate": 5.4067849582615124e-06, + "loss": 1.5803, + "step": 25075 + }, + { + "epoch": 0.8980249610543091, + "grad_norm": 1.9564155340194702, + "learning_rate": 5.403023285874997e-06, + "loss": 1.4142, + "step": 25076 + }, + { + "epoch": 0.8980607731838773, + "grad_norm": 1.7000601291656494, + "learning_rate": 5.399262886168777e-06, + "loss": 1.4036, + "step": 25077 + }, + { + "epoch": 0.8980965853134457, + "grad_norm": 2.1915886402130127, + "learning_rate": 5.395503759193454e-06, + "loss": 1.7342, + "step": 25078 + }, + { + "epoch": 0.898132397443014, + "grad_norm": 1.5303142070770264, + "learning_rate": 5.391745904999601e-06, + "loss": 1.4086, + "step": 25079 + }, + { + "epoch": 0.8981682095725823, + "grad_norm": 1.7111073732376099, + "learning_rate": 5.387989323637765e-06, + "loss": 1.5384, + "step": 25080 + }, + { + "epoch": 0.8982040217021505, + "grad_norm": 1.3334734439849854, + "learning_rate": 5.384234015158495e-06, + "loss": 1.4625, + "step": 25081 + }, + { + "epoch": 0.8982398338317188, + "grad_norm": 2.577977418899536, + "learning_rate": 5.380479979612307e-06, + "loss": 1.4246, + "step": 25082 + }, + { + "epoch": 0.8982756459612871, + "grad_norm": 1.6402453184127808, + "learning_rate": 5.376727217049726e-06, + "loss": 1.4184, + "step": 25083 + }, + { + "epoch": 0.8983114580908553, + "grad_norm": 1.353309154510498, + "learning_rate": 5.372975727521201e-06, + "loss": 1.2795, + "step": 25084 + }, + { + "epoch": 0.8983472702204237, + "grad_norm": 1.778639793395996, + "learning_rate": 5.369225511077236e-06, + "loss": 1.47, + "step": 25085 + }, + { + "epoch": 0.898383082349992, + "grad_norm": 1.7196940183639526, + "learning_rate": 5.36547656776829e-06, + "loss": 1.4584, + "step": 25086 + }, + { + "epoch": 0.8984188944795602, + "grad_norm": 1.3862847089767456, + "learning_rate": 5.36172889764478e-06, + "loss": 1.7245, + "step": 25087 + }, + { + "epoch": 0.8984547066091285, + "grad_norm": 1.9796829223632812, + "learning_rate": 5.357982500757119e-06, + "loss": 1.4531, + "step": 25088 + }, + { + "epoch": 0.8984905187386968, + "grad_norm": 1.9251147508621216, + "learning_rate": 5.354237377155735e-06, + "loss": 1.4222, + "step": 25089 + }, + { + "epoch": 0.898526330868265, + "grad_norm": 1.603308916091919, + "learning_rate": 5.3504935268910095e-06, + "loss": 1.2329, + "step": 25090 + }, + { + "epoch": 0.8985621429978333, + "grad_norm": 1.7304174900054932, + "learning_rate": 5.346750950013301e-06, + "loss": 1.6948, + "step": 25091 + }, + { + "epoch": 0.8985979551274017, + "grad_norm": 1.523800253868103, + "learning_rate": 5.343009646572949e-06, + "loss": 1.5058, + "step": 25092 + }, + { + "epoch": 0.89863376725697, + "grad_norm": 1.9301091432571411, + "learning_rate": 5.3392696166203345e-06, + "loss": 1.6271, + "step": 25093 + }, + { + "epoch": 0.8986695793865382, + "grad_norm": 1.908660650253296, + "learning_rate": 5.335530860205718e-06, + "loss": 1.2108, + "step": 25094 + }, + { + "epoch": 0.8987053915161065, + "grad_norm": 1.437114953994751, + "learning_rate": 5.331793377379435e-06, + "loss": 1.5768, + "step": 25095 + }, + { + "epoch": 0.8987412036456748, + "grad_norm": 1.4844969511032104, + "learning_rate": 5.328057168191747e-06, + "loss": 1.2316, + "step": 25096 + }, + { + "epoch": 0.898777015775243, + "grad_norm": 1.9871184825897217, + "learning_rate": 5.324322232692947e-06, + "loss": 1.3748, + "step": 25097 + }, + { + "epoch": 0.8988128279048113, + "grad_norm": 1.5690678358078003, + "learning_rate": 5.32058857093326e-06, + "loss": 1.5002, + "step": 25098 + }, + { + "epoch": 0.8988486400343797, + "grad_norm": 2.270082712173462, + "learning_rate": 5.316856182962926e-06, + "loss": 1.6875, + "step": 25099 + }, + { + "epoch": 0.898884452163948, + "grad_norm": 1.716614007949829, + "learning_rate": 5.313125068832159e-06, + "loss": 1.4479, + "step": 25100 + }, + { + "epoch": 0.8989202642935162, + "grad_norm": 2.0033087730407715, + "learning_rate": 5.309395228591174e-06, + "loss": 1.2221, + "step": 25101 + }, + { + "epoch": 0.8989560764230845, + "grad_norm": 1.4407968521118164, + "learning_rate": 5.305666662290121e-06, + "loss": 1.3376, + "step": 25102 + }, + { + "epoch": 0.8989918885526528, + "grad_norm": 1.4618561267852783, + "learning_rate": 5.30193936997917e-06, + "loss": 1.6145, + "step": 25103 + }, + { + "epoch": 0.899027700682221, + "grad_norm": 2.3181114196777344, + "learning_rate": 5.298213351708492e-06, + "loss": 1.4054, + "step": 25104 + }, + { + "epoch": 0.8990635128117893, + "grad_norm": 1.5716500282287598, + "learning_rate": 5.29448860752817e-06, + "loss": 1.3271, + "step": 25105 + }, + { + "epoch": 0.8990993249413577, + "grad_norm": 1.612802505493164, + "learning_rate": 5.290765137488351e-06, + "loss": 1.4278, + "step": 25106 + }, + { + "epoch": 0.899135137070926, + "grad_norm": 1.4087631702423096, + "learning_rate": 5.287042941639131e-06, + "loss": 1.6085, + "step": 25107 + }, + { + "epoch": 0.8991709492004942, + "grad_norm": 2.212252616882324, + "learning_rate": 5.2833220200305785e-06, + "loss": 1.8866, + "step": 25108 + }, + { + "epoch": 0.8992067613300625, + "grad_norm": 1.4179940223693848, + "learning_rate": 5.279602372712744e-06, + "loss": 1.5334, + "step": 25109 + }, + { + "epoch": 0.8992425734596308, + "grad_norm": 1.4929925203323364, + "learning_rate": 5.275883999735676e-06, + "loss": 1.3277, + "step": 25110 + }, + { + "epoch": 0.899278385589199, + "grad_norm": 2.1377246379852295, + "learning_rate": 5.272166901149423e-06, + "loss": 1.1745, + "step": 25111 + }, + { + "epoch": 0.8993141977187673, + "grad_norm": 1.96725332736969, + "learning_rate": 5.2684510770039556e-06, + "loss": 1.696, + "step": 25112 + }, + { + "epoch": 0.8993500098483357, + "grad_norm": 2.2955400943756104, + "learning_rate": 5.264736527349279e-06, + "loss": 1.2985, + "step": 25113 + }, + { + "epoch": 0.899385821977904, + "grad_norm": 1.76557195186615, + "learning_rate": 5.261023252235386e-06, + "loss": 1.5453, + "step": 25114 + }, + { + "epoch": 0.8994216341074722, + "grad_norm": 1.7912245988845825, + "learning_rate": 5.257311251712227e-06, + "loss": 1.6088, + "step": 25115 + }, + { + "epoch": 0.8994574462370405, + "grad_norm": 1.81741201877594, + "learning_rate": 5.253600525829716e-06, + "loss": 1.8192, + "step": 25116 + }, + { + "epoch": 0.8994932583666088, + "grad_norm": 1.5481551885604858, + "learning_rate": 5.249891074637803e-06, + "loss": 1.2055, + "step": 25117 + }, + { + "epoch": 0.899529070496177, + "grad_norm": 2.0782535076141357, + "learning_rate": 5.2461828981863916e-06, + "loss": 1.502, + "step": 25118 + }, + { + "epoch": 0.8995648826257453, + "grad_norm": 1.717686653137207, + "learning_rate": 5.2424759965253645e-06, + "loss": 1.6172, + "step": 25119 + }, + { + "epoch": 0.8996006947553137, + "grad_norm": 2.288208246231079, + "learning_rate": 5.2387703697046045e-06, + "loss": 1.2769, + "step": 25120 + }, + { + "epoch": 0.899636506884882, + "grad_norm": 1.6821223497390747, + "learning_rate": 5.235066017773926e-06, + "loss": 1.1108, + "step": 25121 + }, + { + "epoch": 0.8996723190144502, + "grad_norm": 1.282433271408081, + "learning_rate": 5.2313629407832355e-06, + "loss": 1.3598, + "step": 25122 + }, + { + "epoch": 0.8997081311440185, + "grad_norm": 1.8116391897201538, + "learning_rate": 5.227661138782281e-06, + "loss": 1.7734, + "step": 25123 + }, + { + "epoch": 0.8997439432735868, + "grad_norm": 1.6620230674743652, + "learning_rate": 5.22396061182091e-06, + "loss": 1.3506, + "step": 25124 + }, + { + "epoch": 0.899779755403155, + "grad_norm": 1.840793251991272, + "learning_rate": 5.220261359948897e-06, + "loss": 1.705, + "step": 25125 + }, + { + "epoch": 0.8998155675327233, + "grad_norm": 1.7171930074691772, + "learning_rate": 5.216563383216022e-06, + "loss": 1.5466, + "step": 25126 + }, + { + "epoch": 0.8998513796622917, + "grad_norm": 1.6956909894943237, + "learning_rate": 5.2128666816720015e-06, + "loss": 1.7422, + "step": 25127 + }, + { + "epoch": 0.8998871917918599, + "grad_norm": 2.0536861419677734, + "learning_rate": 5.209171255366607e-06, + "loss": 1.3849, + "step": 25128 + }, + { + "epoch": 0.8999230039214282, + "grad_norm": 1.4225544929504395, + "learning_rate": 5.205477104349554e-06, + "loss": 1.457, + "step": 25129 + }, + { + "epoch": 0.8999588160509965, + "grad_norm": 1.5567337274551392, + "learning_rate": 5.2017842286705145e-06, + "loss": 1.4507, + "step": 25130 + }, + { + "epoch": 0.8999946281805647, + "grad_norm": 1.5060020685195923, + "learning_rate": 5.198092628379192e-06, + "loss": 1.6578, + "step": 25131 + }, + { + "epoch": 0.900030440310133, + "grad_norm": 2.0625991821289062, + "learning_rate": 5.194402303525225e-06, + "loss": 1.5786, + "step": 25132 + }, + { + "epoch": 0.9000662524397013, + "grad_norm": 1.730444073677063, + "learning_rate": 5.190713254158319e-06, + "loss": 1.2594, + "step": 25133 + }, + { + "epoch": 0.9001020645692697, + "grad_norm": 1.809962272644043, + "learning_rate": 5.187025480328056e-06, + "loss": 1.4024, + "step": 25134 + }, + { + "epoch": 0.9001378766988379, + "grad_norm": 1.6739686727523804, + "learning_rate": 5.183338982084074e-06, + "loss": 1.762, + "step": 25135 + }, + { + "epoch": 0.9001736888284062, + "grad_norm": 1.4611245393753052, + "learning_rate": 5.179653759475933e-06, + "loss": 1.3352, + "step": 25136 + }, + { + "epoch": 0.9002095009579745, + "grad_norm": 2.249504804611206, + "learning_rate": 5.175969812553272e-06, + "loss": 1.1882, + "step": 25137 + }, + { + "epoch": 0.9002453130875427, + "grad_norm": 1.9460619688034058, + "learning_rate": 5.172287141365628e-06, + "loss": 1.3776, + "step": 25138 + }, + { + "epoch": 0.900281125217111, + "grad_norm": 1.6175462007522583, + "learning_rate": 5.168605745962507e-06, + "loss": 1.5004, + "step": 25139 + }, + { + "epoch": 0.9003169373466793, + "grad_norm": 1.803956151008606, + "learning_rate": 5.164925626393502e-06, + "loss": 1.5692, + "step": 25140 + }, + { + "epoch": 0.9003527494762477, + "grad_norm": 1.1598551273345947, + "learning_rate": 5.161246782708073e-06, + "loss": 1.3922, + "step": 25141 + }, + { + "epoch": 0.9003885616058159, + "grad_norm": 1.4358941316604614, + "learning_rate": 5.15756921495576e-06, + "loss": 1.4219, + "step": 25142 + }, + { + "epoch": 0.9004243737353842, + "grad_norm": 1.7593889236450195, + "learning_rate": 5.153892923185977e-06, + "loss": 1.4586, + "step": 25143 + }, + { + "epoch": 0.9004601858649525, + "grad_norm": 1.4885627031326294, + "learning_rate": 5.150217907448263e-06, + "loss": 1.5213, + "step": 25144 + }, + { + "epoch": 0.9004959979945207, + "grad_norm": 1.5121746063232422, + "learning_rate": 5.146544167792011e-06, + "loss": 1.3499, + "step": 25145 + }, + { + "epoch": 0.900531810124089, + "grad_norm": 1.7629821300506592, + "learning_rate": 5.1428717042666385e-06, + "loss": 1.2924, + "step": 25146 + }, + { + "epoch": 0.9005676222536573, + "grad_norm": 1.4092707633972168, + "learning_rate": 5.1392005169215825e-06, + "loss": 1.3375, + "step": 25147 + }, + { + "epoch": 0.9006034343832257, + "grad_norm": 1.5190919637680054, + "learning_rate": 5.1355306058062044e-06, + "loss": 1.626, + "step": 25148 + }, + { + "epoch": 0.9006392465127939, + "grad_norm": 1.983694314956665, + "learning_rate": 5.13186197096992e-06, + "loss": 1.2005, + "step": 25149 + }, + { + "epoch": 0.9006750586423622, + "grad_norm": 1.4387457370758057, + "learning_rate": 5.128194612462034e-06, + "loss": 1.4169, + "step": 25150 + }, + { + "epoch": 0.9007108707719305, + "grad_norm": 2.042126178741455, + "learning_rate": 5.12452853033194e-06, + "loss": 1.8489, + "step": 25151 + }, + { + "epoch": 0.9007466829014987, + "grad_norm": 1.567436933517456, + "learning_rate": 5.120863724628922e-06, + "loss": 1.6615, + "step": 25152 + }, + { + "epoch": 0.900782495031067, + "grad_norm": 1.5410945415496826, + "learning_rate": 5.117200195402316e-06, + "loss": 1.1489, + "step": 25153 + }, + { + "epoch": 0.9008183071606353, + "grad_norm": 1.5345313549041748, + "learning_rate": 5.113537942701363e-06, + "loss": 1.6927, + "step": 25154 + }, + { + "epoch": 0.9008541192902035, + "grad_norm": 1.8208950757980347, + "learning_rate": 5.109876966575377e-06, + "loss": 1.3238, + "step": 25155 + }, + { + "epoch": 0.9008899314197719, + "grad_norm": 1.6119062900543213, + "learning_rate": 5.106217267073598e-06, + "loss": 1.6702, + "step": 25156 + }, + { + "epoch": 0.9009257435493402, + "grad_norm": 1.6705838441848755, + "learning_rate": 5.102558844245265e-06, + "loss": 1.004, + "step": 25157 + }, + { + "epoch": 0.9009615556789085, + "grad_norm": 1.38338041305542, + "learning_rate": 5.09890169813958e-06, + "loss": 1.4428, + "step": 25158 + }, + { + "epoch": 0.9009973678084767, + "grad_norm": 1.6136221885681152, + "learning_rate": 5.095245828805761e-06, + "loss": 1.3344, + "step": 25159 + }, + { + "epoch": 0.901033179938045, + "grad_norm": 1.925017237663269, + "learning_rate": 5.091591236293003e-06, + "loss": 1.5366, + "step": 25160 + }, + { + "epoch": 0.9010689920676133, + "grad_norm": 1.5999585390090942, + "learning_rate": 5.087937920650454e-06, + "loss": 1.3421, + "step": 25161 + }, + { + "epoch": 0.9011048041971815, + "grad_norm": 1.429749608039856, + "learning_rate": 5.0842858819272644e-06, + "loss": 1.6059, + "step": 25162 + }, + { + "epoch": 0.9011406163267499, + "grad_norm": 1.6445558071136475, + "learning_rate": 5.0806351201725944e-06, + "loss": 1.3776, + "step": 25163 + }, + { + "epoch": 0.9011764284563182, + "grad_norm": 1.7813189029693604, + "learning_rate": 5.076985635435527e-06, + "loss": 1.43, + "step": 25164 + }, + { + "epoch": 0.9012122405858864, + "grad_norm": 1.552821159362793, + "learning_rate": 5.073337427765179e-06, + "loss": 1.5426, + "step": 25165 + }, + { + "epoch": 0.9012480527154547, + "grad_norm": 1.7873458862304688, + "learning_rate": 5.069690497210633e-06, + "loss": 1.3311, + "step": 25166 + }, + { + "epoch": 0.901283864845023, + "grad_norm": 1.6734850406646729, + "learning_rate": 5.06604484382095e-06, + "loss": 1.394, + "step": 25167 + }, + { + "epoch": 0.9013196769745913, + "grad_norm": 1.4095349311828613, + "learning_rate": 5.062400467645178e-06, + "loss": 1.2577, + "step": 25168 + }, + { + "epoch": 0.9013554891041595, + "grad_norm": 1.764383316040039, + "learning_rate": 5.058757368732336e-06, + "loss": 1.3601, + "step": 25169 + }, + { + "epoch": 0.9013913012337279, + "grad_norm": 1.9552977085113525, + "learning_rate": 5.055115547131462e-06, + "loss": 1.6717, + "step": 25170 + }, + { + "epoch": 0.9014271133632962, + "grad_norm": 1.2154018878936768, + "learning_rate": 5.051475002891537e-06, + "loss": 1.1555, + "step": 25171 + }, + { + "epoch": 0.9014629254928644, + "grad_norm": 2.186868906021118, + "learning_rate": 5.047835736061535e-06, + "loss": 1.2166, + "step": 25172 + }, + { + "epoch": 0.9014987376224327, + "grad_norm": 1.7284328937530518, + "learning_rate": 5.044197746690427e-06, + "loss": 1.236, + "step": 25173 + }, + { + "epoch": 0.901534549752001, + "grad_norm": 1.353801965713501, + "learning_rate": 5.040561034827163e-06, + "loss": 1.3683, + "step": 25174 + }, + { + "epoch": 0.9015703618815692, + "grad_norm": 1.7040997743606567, + "learning_rate": 5.036925600520648e-06, + "loss": 1.4311, + "step": 25175 + }, + { + "epoch": 0.9016061740111375, + "grad_norm": 1.961835503578186, + "learning_rate": 5.0332914438197984e-06, + "loss": 1.1263, + "step": 25176 + }, + { + "epoch": 0.9016419861407059, + "grad_norm": 1.8802769184112549, + "learning_rate": 5.029658564773521e-06, + "loss": 1.5408, + "step": 25177 + }, + { + "epoch": 0.9016777982702742, + "grad_norm": 1.668648600578308, + "learning_rate": 5.026026963430697e-06, + "loss": 1.2913, + "step": 25178 + }, + { + "epoch": 0.9017136103998424, + "grad_norm": 1.8066505193710327, + "learning_rate": 5.022396639840166e-06, + "loss": 1.5094, + "step": 25179 + }, + { + "epoch": 0.9017494225294107, + "grad_norm": 1.4674502611160278, + "learning_rate": 5.018767594050766e-06, + "loss": 1.4396, + "step": 25180 + }, + { + "epoch": 0.901785234658979, + "grad_norm": 1.7392138242721558, + "learning_rate": 5.015139826111348e-06, + "loss": 1.8019, + "step": 25181 + }, + { + "epoch": 0.9018210467885472, + "grad_norm": 1.5244306325912476, + "learning_rate": 5.0115133360706945e-06, + "loss": 1.6719, + "step": 25182 + }, + { + "epoch": 0.9018568589181155, + "grad_norm": 1.4753122329711914, + "learning_rate": 5.0078881239776e-06, + "loss": 1.09, + "step": 25183 + }, + { + "epoch": 0.9018926710476839, + "grad_norm": 1.8521908521652222, + "learning_rate": 5.0042641898808364e-06, + "loss": 1.6567, + "step": 25184 + }, + { + "epoch": 0.9019284831772522, + "grad_norm": 1.287305235862732, + "learning_rate": 5.000641533829176e-06, + "loss": 1.3805, + "step": 25185 + }, + { + "epoch": 0.9019642953068204, + "grad_norm": 1.5233205556869507, + "learning_rate": 4.9970201558713345e-06, + "loss": 1.5226, + "step": 25186 + }, + { + "epoch": 0.9020001074363887, + "grad_norm": 1.4274674654006958, + "learning_rate": 4.99340005605603e-06, + "loss": 1.4153, + "step": 25187 + }, + { + "epoch": 0.902035919565957, + "grad_norm": 1.5530496835708618, + "learning_rate": 4.98978123443199e-06, + "loss": 1.6574, + "step": 25188 + }, + { + "epoch": 0.9020717316955252, + "grad_norm": 1.6646677255630493, + "learning_rate": 4.986163691047896e-06, + "loss": 1.177, + "step": 25189 + }, + { + "epoch": 0.9021075438250935, + "grad_norm": 1.5656930208206177, + "learning_rate": 4.982547425952399e-06, + "loss": 1.5505, + "step": 25190 + }, + { + "epoch": 0.9021433559546619, + "grad_norm": 1.369274377822876, + "learning_rate": 4.9789324391941615e-06, + "loss": 1.3201, + "step": 25191 + }, + { + "epoch": 0.9021791680842302, + "grad_norm": 1.6000550985336304, + "learning_rate": 4.975318730821843e-06, + "loss": 1.2981, + "step": 25192 + }, + { + "epoch": 0.9022149802137984, + "grad_norm": 1.3364497423171997, + "learning_rate": 4.971706300884016e-06, + "loss": 1.3928, + "step": 25193 + }, + { + "epoch": 0.9022507923433667, + "grad_norm": 1.555607557296753, + "learning_rate": 4.9680951494292975e-06, + "loss": 1.4731, + "step": 25194 + }, + { + "epoch": 0.902286604472935, + "grad_norm": 1.5311139822006226, + "learning_rate": 4.964485276506281e-06, + "loss": 1.416, + "step": 25195 + }, + { + "epoch": 0.9023224166025032, + "grad_norm": 2.2002294063568115, + "learning_rate": 4.960876682163551e-06, + "loss": 1.5436, + "step": 25196 + }, + { + "epoch": 0.9023582287320715, + "grad_norm": 1.4345999956130981, + "learning_rate": 4.957269366449613e-06, + "loss": 1.2206, + "step": 25197 + }, + { + "epoch": 0.9023940408616399, + "grad_norm": 1.821091651916504, + "learning_rate": 4.953663329413017e-06, + "loss": 1.3636, + "step": 25198 + }, + { + "epoch": 0.9024298529912081, + "grad_norm": 2.0151352882385254, + "learning_rate": 4.950058571102289e-06, + "loss": 1.5156, + "step": 25199 + }, + { + "epoch": 0.9024656651207764, + "grad_norm": 2.489975929260254, + "learning_rate": 4.946455091565916e-06, + "loss": 1.3341, + "step": 25200 + }, + { + "epoch": 0.9025014772503447, + "grad_norm": 1.6598349809646606, + "learning_rate": 4.942852890852367e-06, + "loss": 1.5651, + "step": 25201 + }, + { + "epoch": 0.902537289379913, + "grad_norm": 1.6895561218261719, + "learning_rate": 4.939251969010128e-06, + "loss": 1.4153, + "step": 25202 + }, + { + "epoch": 0.9025731015094812, + "grad_norm": 2.297715902328491, + "learning_rate": 4.935652326087648e-06, + "loss": 1.6323, + "step": 25203 + }, + { + "epoch": 0.9026089136390495, + "grad_norm": 1.411095142364502, + "learning_rate": 4.932053962133321e-06, + "loss": 1.3637, + "step": 25204 + }, + { + "epoch": 0.9026447257686179, + "grad_norm": 1.6230731010437012, + "learning_rate": 4.928456877195586e-06, + "loss": 1.3658, + "step": 25205 + }, + { + "epoch": 0.9026805378981861, + "grad_norm": 2.2268002033233643, + "learning_rate": 4.924861071322817e-06, + "loss": 1.602, + "step": 25206 + }, + { + "epoch": 0.9027163500277544, + "grad_norm": 1.9682823419570923, + "learning_rate": 4.92126654456343e-06, + "loss": 1.4974, + "step": 25207 + }, + { + "epoch": 0.9027521621573227, + "grad_norm": 1.9790271520614624, + "learning_rate": 4.917673296965741e-06, + "loss": 1.635, + "step": 25208 + }, + { + "epoch": 0.902787974286891, + "grad_norm": 1.872644066810608, + "learning_rate": 4.914081328578113e-06, + "loss": 1.2021, + "step": 25209 + }, + { + "epoch": 0.9028237864164592, + "grad_norm": 2.065763235092163, + "learning_rate": 4.910490639448884e-06, + "loss": 1.497, + "step": 25210 + }, + { + "epoch": 0.9028595985460275, + "grad_norm": 1.9289997816085815, + "learning_rate": 4.906901229626326e-06, + "loss": 1.7655, + "step": 25211 + }, + { + "epoch": 0.9028954106755959, + "grad_norm": 1.6984025239944458, + "learning_rate": 4.903313099158757e-06, + "loss": 1.4767, + "step": 25212 + }, + { + "epoch": 0.9029312228051641, + "grad_norm": 1.4198287725448608, + "learning_rate": 4.8997262480944385e-06, + "loss": 1.3188, + "step": 25213 + }, + { + "epoch": 0.9029670349347324, + "grad_norm": 1.8730920553207397, + "learning_rate": 4.896140676481653e-06, + "loss": 1.0944, + "step": 25214 + }, + { + "epoch": 0.9030028470643007, + "grad_norm": 1.4049091339111328, + "learning_rate": 4.892556384368607e-06, + "loss": 1.4781, + "step": 25215 + }, + { + "epoch": 0.9030386591938689, + "grad_norm": 1.9881552457809448, + "learning_rate": 4.8889733718035295e-06, + "loss": 1.507, + "step": 25216 + }, + { + "epoch": 0.9030744713234372, + "grad_norm": 1.4669917821884155, + "learning_rate": 4.885391638834646e-06, + "loss": 1.2178, + "step": 25217 + }, + { + "epoch": 0.9031102834530055, + "grad_norm": 2.0855116844177246, + "learning_rate": 4.88181118551011e-06, + "loss": 1.1911, + "step": 25218 + }, + { + "epoch": 0.9031460955825739, + "grad_norm": 1.5934829711914062, + "learning_rate": 4.878232011878136e-06, + "loss": 1.4744, + "step": 25219 + }, + { + "epoch": 0.9031819077121421, + "grad_norm": 2.0535361766815186, + "learning_rate": 4.874654117986821e-06, + "loss": 1.5789, + "step": 25220 + }, + { + "epoch": 0.9032177198417104, + "grad_norm": 1.54163658618927, + "learning_rate": 4.871077503884358e-06, + "loss": 1.5965, + "step": 25221 + }, + { + "epoch": 0.9032535319712787, + "grad_norm": 1.860435962677002, + "learning_rate": 4.86750216961882e-06, + "loss": 1.2296, + "step": 25222 + }, + { + "epoch": 0.9032893441008469, + "grad_norm": 1.6049903631210327, + "learning_rate": 4.863928115238336e-06, + "loss": 1.3, + "step": 25223 + }, + { + "epoch": 0.9033251562304152, + "grad_norm": 1.5540401935577393, + "learning_rate": 4.860355340790978e-06, + "loss": 1.128, + "step": 25224 + }, + { + "epoch": 0.9033609683599835, + "grad_norm": 1.7751628160476685, + "learning_rate": 4.85678384632483e-06, + "loss": 1.462, + "step": 25225 + }, + { + "epoch": 0.9033967804895519, + "grad_norm": 2.4625301361083984, + "learning_rate": 4.8532136318879315e-06, + "loss": 1.7404, + "step": 25226 + }, + { + "epoch": 0.9034325926191201, + "grad_norm": 1.419631838798523, + "learning_rate": 4.8496446975282885e-06, + "loss": 1.5578, + "step": 25227 + }, + { + "epoch": 0.9034684047486884, + "grad_norm": 1.7044492959976196, + "learning_rate": 4.846077043293973e-06, + "loss": 1.1315, + "step": 25228 + }, + { + "epoch": 0.9035042168782567, + "grad_norm": 1.758029818534851, + "learning_rate": 4.842510669232925e-06, + "loss": 1.3711, + "step": 25229 + }, + { + "epoch": 0.9035400290078249, + "grad_norm": 2.1265738010406494, + "learning_rate": 4.8389455753931726e-06, + "loss": 1.2978, + "step": 25230 + }, + { + "epoch": 0.9035758411373932, + "grad_norm": 1.8212213516235352, + "learning_rate": 4.835381761822633e-06, + "loss": 1.5178, + "step": 25231 + }, + { + "epoch": 0.9036116532669615, + "grad_norm": 1.3521546125411987, + "learning_rate": 4.831819228569301e-06, + "loss": 1.1085, + "step": 25232 + }, + { + "epoch": 0.9036474653965298, + "grad_norm": 1.5324915647506714, + "learning_rate": 4.828257975681072e-06, + "loss": 1.7123, + "step": 25233 + }, + { + "epoch": 0.9036832775260981, + "grad_norm": 2.087040901184082, + "learning_rate": 4.824698003205863e-06, + "loss": 1.4467, + "step": 25234 + }, + { + "epoch": 0.9037190896556664, + "grad_norm": 1.3491966724395752, + "learning_rate": 4.8211393111915915e-06, + "loss": 1.2438, + "step": 25235 + }, + { + "epoch": 0.9037549017852347, + "grad_norm": 1.7652667760849, + "learning_rate": 4.817581899686108e-06, + "loss": 1.6547, + "step": 25236 + }, + { + "epoch": 0.9037907139148029, + "grad_norm": 1.6787374019622803, + "learning_rate": 4.814025768737296e-06, + "loss": 1.2073, + "step": 25237 + }, + { + "epoch": 0.9038265260443712, + "grad_norm": 1.5756950378417969, + "learning_rate": 4.810470918392962e-06, + "loss": 1.3766, + "step": 25238 + }, + { + "epoch": 0.9038623381739395, + "grad_norm": 1.498803973197937, + "learning_rate": 4.8069173487009785e-06, + "loss": 1.3329, + "step": 25239 + }, + { + "epoch": 0.9038981503035078, + "grad_norm": 1.668433427810669, + "learning_rate": 4.803365059709131e-06, + "loss": 1.4101, + "step": 25240 + }, + { + "epoch": 0.9039339624330761, + "grad_norm": 1.7079111337661743, + "learning_rate": 4.799814051465212e-06, + "loss": 1.6624, + "step": 25241 + }, + { + "epoch": 0.9039697745626444, + "grad_norm": 1.6564069986343384, + "learning_rate": 4.7962643240169854e-06, + "loss": 1.2615, + "step": 25242 + }, + { + "epoch": 0.9040055866922126, + "grad_norm": 1.8872987031936646, + "learning_rate": 4.792715877412213e-06, + "loss": 1.494, + "step": 25243 + }, + { + "epoch": 0.9040413988217809, + "grad_norm": 1.6588809490203857, + "learning_rate": 4.789168711698655e-06, + "loss": 1.5164, + "step": 25244 + }, + { + "epoch": 0.9040772109513492, + "grad_norm": 1.7760930061340332, + "learning_rate": 4.785622826924019e-06, + "loss": 1.64, + "step": 25245 + }, + { + "epoch": 0.9041130230809175, + "grad_norm": 1.5673993825912476, + "learning_rate": 4.782078223135999e-06, + "loss": 1.6067, + "step": 25246 + }, + { + "epoch": 0.9041488352104858, + "grad_norm": 1.5011767148971558, + "learning_rate": 4.778534900382292e-06, + "loss": 1.441, + "step": 25247 + }, + { + "epoch": 0.9041846473400541, + "grad_norm": 1.4967342615127563, + "learning_rate": 4.774992858710581e-06, + "loss": 1.2915, + "step": 25248 + }, + { + "epoch": 0.9042204594696224, + "grad_norm": 1.8950409889221191, + "learning_rate": 4.771452098168494e-06, + "loss": 1.6408, + "step": 25249 + }, + { + "epoch": 0.9042562715991906, + "grad_norm": 1.2963480949401855, + "learning_rate": 4.767912618803705e-06, + "loss": 1.1706, + "step": 25250 + }, + { + "epoch": 0.9042920837287589, + "grad_norm": 1.2724201679229736, + "learning_rate": 4.764374420663808e-06, + "loss": 1.3548, + "step": 25251 + }, + { + "epoch": 0.9043278958583272, + "grad_norm": 2.205700159072876, + "learning_rate": 4.7608375037964e-06, + "loss": 1.2672, + "step": 25252 + }, + { + "epoch": 0.9043637079878954, + "grad_norm": 1.8525762557983398, + "learning_rate": 4.757301868249076e-06, + "loss": 1.2589, + "step": 25253 + }, + { + "epoch": 0.9043995201174638, + "grad_norm": 1.4828543663024902, + "learning_rate": 4.753767514069396e-06, + "loss": 1.5784, + "step": 25254 + }, + { + "epoch": 0.9044353322470321, + "grad_norm": 1.468957543373108, + "learning_rate": 4.750234441304924e-06, + "loss": 1.479, + "step": 25255 + }, + { + "epoch": 0.9044711443766004, + "grad_norm": 1.6229407787322998, + "learning_rate": 4.746702650003176e-06, + "loss": 1.2722, + "step": 25256 + }, + { + "epoch": 0.9045069565061686, + "grad_norm": 2.103856086730957, + "learning_rate": 4.743172140211683e-06, + "loss": 1.305, + "step": 25257 + }, + { + "epoch": 0.9045427686357369, + "grad_norm": 1.640134572982788, + "learning_rate": 4.7396429119779265e-06, + "loss": 1.3592, + "step": 25258 + }, + { + "epoch": 0.9045785807653052, + "grad_norm": 1.485317349433899, + "learning_rate": 4.736114965349414e-06, + "loss": 1.4747, + "step": 25259 + }, + { + "epoch": 0.9046143928948734, + "grad_norm": 1.4240471124649048, + "learning_rate": 4.732588300373586e-06, + "loss": 1.2655, + "step": 25260 + }, + { + "epoch": 0.9046502050244418, + "grad_norm": 1.6139092445373535, + "learning_rate": 4.729062917097882e-06, + "loss": 1.0557, + "step": 25261 + }, + { + "epoch": 0.9046860171540101, + "grad_norm": 2.000537633895874, + "learning_rate": 4.725538815569774e-06, + "loss": 1.6511, + "step": 25262 + }, + { + "epoch": 0.9047218292835784, + "grad_norm": 1.554819941520691, + "learning_rate": 4.722015995836626e-06, + "loss": 1.5654, + "step": 25263 + }, + { + "epoch": 0.9047576414131466, + "grad_norm": 2.3554747104644775, + "learning_rate": 4.718494457945855e-06, + "loss": 1.5196, + "step": 25264 + }, + { + "epoch": 0.9047934535427149, + "grad_norm": 1.2216877937316895, + "learning_rate": 4.714974201944833e-06, + "loss": 1.3913, + "step": 25265 + }, + { + "epoch": 0.9048292656722832, + "grad_norm": 1.3515695333480835, + "learning_rate": 4.711455227880935e-06, + "loss": 1.4451, + "step": 25266 + }, + { + "epoch": 0.9048650778018514, + "grad_norm": 1.557699203491211, + "learning_rate": 4.707937535801488e-06, + "loss": 1.023, + "step": 25267 + }, + { + "epoch": 0.9049008899314198, + "grad_norm": 1.7648674249649048, + "learning_rate": 4.704421125753822e-06, + "loss": 1.4819, + "step": 25268 + }, + { + "epoch": 0.9049367020609881, + "grad_norm": 2.089319944381714, + "learning_rate": 4.700905997785254e-06, + "loss": 1.3192, + "step": 25269 + }, + { + "epoch": 0.9049725141905564, + "grad_norm": 1.6241894960403442, + "learning_rate": 4.697392151943059e-06, + "loss": 1.2065, + "step": 25270 + }, + { + "epoch": 0.9050083263201246, + "grad_norm": 1.50771164894104, + "learning_rate": 4.693879588274519e-06, + "loss": 1.334, + "step": 25271 + }, + { + "epoch": 0.9050441384496929, + "grad_norm": 1.4906578063964844, + "learning_rate": 4.690368306826898e-06, + "loss": 1.547, + "step": 25272 + }, + { + "epoch": 0.9050799505792612, + "grad_norm": 1.5478224754333496, + "learning_rate": 4.686858307647446e-06, + "loss": 1.4141, + "step": 25273 + }, + { + "epoch": 0.9051157627088294, + "grad_norm": 1.7823578119277954, + "learning_rate": 4.683349590783348e-06, + "loss": 1.5894, + "step": 25274 + }, + { + "epoch": 0.9051515748383978, + "grad_norm": 1.496527075767517, + "learning_rate": 4.679842156281844e-06, + "loss": 1.2226, + "step": 25275 + }, + { + "epoch": 0.9051873869679661, + "grad_norm": 1.2077003717422485, + "learning_rate": 4.676336004190096e-06, + "loss": 1.1825, + "step": 25276 + }, + { + "epoch": 0.9052231990975343, + "grad_norm": 1.6871800422668457, + "learning_rate": 4.6728311345553115e-06, + "loss": 1.2391, + "step": 25277 + }, + { + "epoch": 0.9052590112271026, + "grad_norm": 1.8817455768585205, + "learning_rate": 4.669327547424607e-06, + "loss": 1.5434, + "step": 25278 + }, + { + "epoch": 0.9052948233566709, + "grad_norm": 1.5902183055877686, + "learning_rate": 4.665825242845134e-06, + "loss": 1.4759, + "step": 25279 + }, + { + "epoch": 0.9053306354862392, + "grad_norm": 1.7828710079193115, + "learning_rate": 4.662324220864011e-06, + "loss": 1.3709, + "step": 25280 + }, + { + "epoch": 0.9053664476158074, + "grad_norm": 1.9528136253356934, + "learning_rate": 4.658824481528335e-06, + "loss": 1.3893, + "step": 25281 + }, + { + "epoch": 0.9054022597453758, + "grad_norm": 2.271791934967041, + "learning_rate": 4.655326024885198e-06, + "loss": 1.5958, + "step": 25282 + }, + { + "epoch": 0.9054380718749441, + "grad_norm": 1.7135944366455078, + "learning_rate": 4.651828850981654e-06, + "loss": 1.7616, + "step": 25283 + }, + { + "epoch": 0.9054738840045123, + "grad_norm": 2.0489044189453125, + "learning_rate": 4.6483329598647874e-06, + "loss": 1.4232, + "step": 25284 + }, + { + "epoch": 0.9055096961340806, + "grad_norm": 1.7245614528656006, + "learning_rate": 4.644838351581582e-06, + "loss": 1.6448, + "step": 25285 + }, + { + "epoch": 0.9055455082636489, + "grad_norm": 1.375653624534607, + "learning_rate": 4.6413450261790894e-06, + "loss": 1.3573, + "step": 25286 + }, + { + "epoch": 0.9055813203932171, + "grad_norm": 1.45327889919281, + "learning_rate": 4.637852983704294e-06, + "loss": 1.5115, + "step": 25287 + }, + { + "epoch": 0.9056171325227854, + "grad_norm": 1.4247119426727295, + "learning_rate": 4.63436222420417e-06, + "loss": 1.488, + "step": 25288 + }, + { + "epoch": 0.9056529446523538, + "grad_norm": 1.779223918914795, + "learning_rate": 4.630872747725701e-06, + "loss": 1.5267, + "step": 25289 + }, + { + "epoch": 0.9056887567819221, + "grad_norm": 1.4207184314727783, + "learning_rate": 4.627384554315806e-06, + "loss": 1.3127, + "step": 25290 + }, + { + "epoch": 0.9057245689114903, + "grad_norm": 1.3641339540481567, + "learning_rate": 4.623897644021446e-06, + "loss": 1.1716, + "step": 25291 + }, + { + "epoch": 0.9057603810410586, + "grad_norm": 1.7971229553222656, + "learning_rate": 4.6204120168895085e-06, + "loss": 1.7832, + "step": 25292 + }, + { + "epoch": 0.9057961931706269, + "grad_norm": 1.6615642309188843, + "learning_rate": 4.616927672966898e-06, + "loss": 1.2579, + "step": 25293 + }, + { + "epoch": 0.9058320053001951, + "grad_norm": 1.582353949546814, + "learning_rate": 4.6134446123004885e-06, + "loss": 1.2602, + "step": 25294 + }, + { + "epoch": 0.9058678174297634, + "grad_norm": 1.9554626941680908, + "learning_rate": 4.609962834937153e-06, + "loss": 1.2429, + "step": 25295 + }, + { + "epoch": 0.9059036295593318, + "grad_norm": 1.6370717287063599, + "learning_rate": 4.606482340923712e-06, + "loss": 1.3997, + "step": 25296 + }, + { + "epoch": 0.9059394416889001, + "grad_norm": 1.5962556600570679, + "learning_rate": 4.6030031303070045e-06, + "loss": 1.5322, + "step": 25297 + }, + { + "epoch": 0.9059752538184683, + "grad_norm": 1.372740387916565, + "learning_rate": 4.599525203133848e-06, + "loss": 1.2582, + "step": 25298 + }, + { + "epoch": 0.9060110659480366, + "grad_norm": 1.5764113664627075, + "learning_rate": 4.596048559451005e-06, + "loss": 1.4412, + "step": 25299 + }, + { + "epoch": 0.9060468780776049, + "grad_norm": 1.6428806781768799, + "learning_rate": 4.592573199305272e-06, + "loss": 1.5331, + "step": 25300 + }, + { + "epoch": 0.9060826902071731, + "grad_norm": 2.063018560409546, + "learning_rate": 4.58909912274339e-06, + "loss": 1.4299, + "step": 25301 + }, + { + "epoch": 0.9061185023367414, + "grad_norm": 1.7012205123901367, + "learning_rate": 4.585626329812132e-06, + "loss": 1.6607, + "step": 25302 + }, + { + "epoch": 0.9061543144663098, + "grad_norm": 1.4773954153060913, + "learning_rate": 4.582154820558182e-06, + "loss": 1.2731, + "step": 25303 + }, + { + "epoch": 0.906190126595878, + "grad_norm": 1.781326413154602, + "learning_rate": 4.5786845950282486e-06, + "loss": 1.2667, + "step": 25304 + }, + { + "epoch": 0.9062259387254463, + "grad_norm": 1.574438452720642, + "learning_rate": 4.575215653269061e-06, + "loss": 1.4534, + "step": 25305 + }, + { + "epoch": 0.9062617508550146, + "grad_norm": 1.2895307540893555, + "learning_rate": 4.571747995327224e-06, + "loss": 1.4442, + "step": 25306 + }, + { + "epoch": 0.9062975629845829, + "grad_norm": 1.6740641593933105, + "learning_rate": 4.568281621249437e-06, + "loss": 1.269, + "step": 25307 + }, + { + "epoch": 0.9063333751141511, + "grad_norm": 2.1443235874176025, + "learning_rate": 4.564816531082316e-06, + "loss": 1.6352, + "step": 25308 + }, + { + "epoch": 0.9063691872437194, + "grad_norm": 1.7951942682266235, + "learning_rate": 4.561352724872503e-06, + "loss": 1.3721, + "step": 25309 + }, + { + "epoch": 0.9064049993732878, + "grad_norm": 1.5101568698883057, + "learning_rate": 4.557890202666571e-06, + "loss": 1.2677, + "step": 25310 + }, + { + "epoch": 0.906440811502856, + "grad_norm": 1.488024353981018, + "learning_rate": 4.5544289645111145e-06, + "loss": 1.3814, + "step": 25311 + }, + { + "epoch": 0.9064766236324243, + "grad_norm": 1.546871304512024, + "learning_rate": 4.5509690104526995e-06, + "loss": 1.1841, + "step": 25312 + }, + { + "epoch": 0.9065124357619926, + "grad_norm": 1.6033934354782104, + "learning_rate": 4.547510340537886e-06, + "loss": 1.4059, + "step": 25313 + }, + { + "epoch": 0.9065482478915609, + "grad_norm": 1.8143798112869263, + "learning_rate": 4.544052954813194e-06, + "loss": 1.2067, + "step": 25314 + }, + { + "epoch": 0.9065840600211291, + "grad_norm": 1.500676155090332, + "learning_rate": 4.540596853325119e-06, + "loss": 1.2754, + "step": 25315 + }, + { + "epoch": 0.9066198721506974, + "grad_norm": 1.7909812927246094, + "learning_rate": 4.537142036120212e-06, + "loss": 1.3087, + "step": 25316 + }, + { + "epoch": 0.9066556842802658, + "grad_norm": 1.3690929412841797, + "learning_rate": 4.533688503244893e-06, + "loss": 1.2588, + "step": 25317 + }, + { + "epoch": 0.906691496409834, + "grad_norm": 1.785294532775879, + "learning_rate": 4.5302362547456565e-06, + "loss": 1.3569, + "step": 25318 + }, + { + "epoch": 0.9067273085394023, + "grad_norm": 1.5315146446228027, + "learning_rate": 4.5267852906689555e-06, + "loss": 1.4828, + "step": 25319 + }, + { + "epoch": 0.9067631206689706, + "grad_norm": 1.328297734260559, + "learning_rate": 4.523335611061208e-06, + "loss": 1.5227, + "step": 25320 + }, + { + "epoch": 0.9067989327985388, + "grad_norm": 1.671176791191101, + "learning_rate": 4.51988721596881e-06, + "loss": 1.3934, + "step": 25321 + }, + { + "epoch": 0.9068347449281071, + "grad_norm": 1.6849404573440552, + "learning_rate": 4.51644010543818e-06, + "loss": 1.3083, + "step": 25322 + }, + { + "epoch": 0.9068705570576754, + "grad_norm": 1.6850334405899048, + "learning_rate": 4.512994279515692e-06, + "loss": 1.5317, + "step": 25323 + }, + { + "epoch": 0.9069063691872438, + "grad_norm": 1.4295352697372437, + "learning_rate": 4.509549738247676e-06, + "loss": 1.378, + "step": 25324 + }, + { + "epoch": 0.906942181316812, + "grad_norm": 2.1510770320892334, + "learning_rate": 4.5061064816805165e-06, + "loss": 1.4521, + "step": 25325 + }, + { + "epoch": 0.9069779934463803, + "grad_norm": 1.9867370128631592, + "learning_rate": 4.502664509860488e-06, + "loss": 1.4834, + "step": 25326 + }, + { + "epoch": 0.9070138055759486, + "grad_norm": 1.2240649461746216, + "learning_rate": 4.499223822833942e-06, + "loss": 1.3368, + "step": 25327 + }, + { + "epoch": 0.9070496177055168, + "grad_norm": 1.844240665435791, + "learning_rate": 4.4957844206471535e-06, + "loss": 1.3575, + "step": 25328 + }, + { + "epoch": 0.9070854298350851, + "grad_norm": 1.304735779762268, + "learning_rate": 4.492346303346395e-06, + "loss": 1.1249, + "step": 25329 + }, + { + "epoch": 0.9071212419646534, + "grad_norm": 1.8168039321899414, + "learning_rate": 4.488909470977909e-06, + "loss": 1.3287, + "step": 25330 + }, + { + "epoch": 0.9071570540942218, + "grad_norm": 1.2686028480529785, + "learning_rate": 4.485473923587957e-06, + "loss": 1.5049, + "step": 25331 + }, + { + "epoch": 0.90719286622379, + "grad_norm": 1.6690572500228882, + "learning_rate": 4.482039661222759e-06, + "loss": 1.43, + "step": 25332 + }, + { + "epoch": 0.9072286783533583, + "grad_norm": 1.5077745914459229, + "learning_rate": 4.478606683928476e-06, + "loss": 1.6013, + "step": 25333 + }, + { + "epoch": 0.9072644904829266, + "grad_norm": 1.62128484249115, + "learning_rate": 4.475174991751352e-06, + "loss": 1.1891, + "step": 25334 + }, + { + "epoch": 0.9073003026124948, + "grad_norm": 1.4367783069610596, + "learning_rate": 4.471744584737525e-06, + "loss": 1.5777, + "step": 25335 + }, + { + "epoch": 0.9073361147420631, + "grad_norm": 2.2138447761535645, + "learning_rate": 4.468315462933159e-06, + "loss": 1.4724, + "step": 25336 + }, + { + "epoch": 0.9073719268716314, + "grad_norm": 1.362805724143982, + "learning_rate": 4.464887626384362e-06, + "loss": 1.3973, + "step": 25337 + }, + { + "epoch": 0.9074077390011998, + "grad_norm": 1.5935183763504028, + "learning_rate": 4.461461075137285e-06, + "loss": 1.6762, + "step": 25338 + }, + { + "epoch": 0.907443551130768, + "grad_norm": 1.413731575012207, + "learning_rate": 4.458035809238026e-06, + "loss": 1.4027, + "step": 25339 + }, + { + "epoch": 0.9074793632603363, + "grad_norm": 1.4532414674758911, + "learning_rate": 4.454611828732636e-06, + "loss": 1.385, + "step": 25340 + }, + { + "epoch": 0.9075151753899046, + "grad_norm": 1.9259858131408691, + "learning_rate": 4.4511891336671885e-06, + "loss": 1.6331, + "step": 25341 + }, + { + "epoch": 0.9075509875194728, + "grad_norm": 1.7889636754989624, + "learning_rate": 4.447767724087759e-06, + "loss": 1.1333, + "step": 25342 + }, + { + "epoch": 0.9075867996490411, + "grad_norm": 1.47054922580719, + "learning_rate": 4.444347600040366e-06, + "loss": 1.4389, + "step": 25343 + }, + { + "epoch": 0.9076226117786094, + "grad_norm": 1.6787251234054565, + "learning_rate": 4.440928761570995e-06, + "loss": 1.1389, + "step": 25344 + }, + { + "epoch": 0.9076584239081777, + "grad_norm": 1.4817570447921753, + "learning_rate": 4.4375112087256864e-06, + "loss": 1.1824, + "step": 25345 + }, + { + "epoch": 0.907694236037746, + "grad_norm": 2.3525238037109375, + "learning_rate": 4.434094941550393e-06, + "loss": 1.3532, + "step": 25346 + }, + { + "epoch": 0.9077300481673143, + "grad_norm": 1.610073208808899, + "learning_rate": 4.430679960091089e-06, + "loss": 1.6939, + "step": 25347 + }, + { + "epoch": 0.9077658602968826, + "grad_norm": 2.0152218341827393, + "learning_rate": 4.427266264393693e-06, + "loss": 1.2251, + "step": 25348 + }, + { + "epoch": 0.9078016724264508, + "grad_norm": 1.7770185470581055, + "learning_rate": 4.423853854504156e-06, + "loss": 1.3707, + "step": 25349 + }, + { + "epoch": 0.9078374845560191, + "grad_norm": 2.1795554161071777, + "learning_rate": 4.420442730468388e-06, + "loss": 1.2333, + "step": 25350 + }, + { + "epoch": 0.9078732966855874, + "grad_norm": 1.550978660583496, + "learning_rate": 4.417032892332263e-06, + "loss": 1.528, + "step": 25351 + }, + { + "epoch": 0.9079091088151557, + "grad_norm": 1.501692533493042, + "learning_rate": 4.413624340141676e-06, + "loss": 1.2372, + "step": 25352 + }, + { + "epoch": 0.907944920944724, + "grad_norm": 1.6500540971755981, + "learning_rate": 4.410217073942468e-06, + "loss": 1.4474, + "step": 25353 + }, + { + "epoch": 0.9079807330742923, + "grad_norm": 1.5187865495681763, + "learning_rate": 4.4068110937805055e-06, + "loss": 1.1773, + "step": 25354 + }, + { + "epoch": 0.9080165452038605, + "grad_norm": 1.3688457012176514, + "learning_rate": 4.40340639970157e-06, + "loss": 1.2746, + "step": 25355 + }, + { + "epoch": 0.9080523573334288, + "grad_norm": 1.6650950908660889, + "learning_rate": 4.400002991751495e-06, + "loss": 1.6117, + "step": 25356 + }, + { + "epoch": 0.9080881694629971, + "grad_norm": 1.6353851556777954, + "learning_rate": 4.396600869976086e-06, + "loss": 1.2558, + "step": 25357 + }, + { + "epoch": 0.9081239815925654, + "grad_norm": 1.6381622552871704, + "learning_rate": 4.393200034421074e-06, + "loss": 1.4325, + "step": 25358 + }, + { + "epoch": 0.9081597937221337, + "grad_norm": 2.1839332580566406, + "learning_rate": 4.3898004851322335e-06, + "loss": 1.5382, + "step": 25359 + }, + { + "epoch": 0.908195605851702, + "grad_norm": 1.539484977722168, + "learning_rate": 4.386402222155295e-06, + "loss": 1.6129, + "step": 25360 + }, + { + "epoch": 0.9082314179812703, + "grad_norm": 1.4183225631713867, + "learning_rate": 4.383005245535998e-06, + "loss": 1.3707, + "step": 25361 + }, + { + "epoch": 0.9082672301108385, + "grad_norm": 1.6400363445281982, + "learning_rate": 4.379609555320008e-06, + "loss": 0.9586, + "step": 25362 + }, + { + "epoch": 0.9083030422404068, + "grad_norm": 1.407865047454834, + "learning_rate": 4.376215151553042e-06, + "loss": 1.3569, + "step": 25363 + }, + { + "epoch": 0.9083388543699751, + "grad_norm": 1.3456802368164062, + "learning_rate": 4.372822034280744e-06, + "loss": 0.978, + "step": 25364 + }, + { + "epoch": 0.9083746664995433, + "grad_norm": 2.59515380859375, + "learning_rate": 4.3694302035487965e-06, + "loss": 1.2745, + "step": 25365 + }, + { + "epoch": 0.9084104786291117, + "grad_norm": 1.7671518325805664, + "learning_rate": 4.366039659402798e-06, + "loss": 1.4961, + "step": 25366 + }, + { + "epoch": 0.90844629075868, + "grad_norm": 2.6446104049682617, + "learning_rate": 4.362650401888369e-06, + "loss": 1.2699, + "step": 25367 + }, + { + "epoch": 0.9084821028882483, + "grad_norm": 1.3323699235916138, + "learning_rate": 4.359262431051137e-06, + "loss": 1.4321, + "step": 25368 + }, + { + "epoch": 0.9085179150178165, + "grad_norm": 1.728229284286499, + "learning_rate": 4.355875746936644e-06, + "loss": 1.5239, + "step": 25369 + }, + { + "epoch": 0.9085537271473848, + "grad_norm": 1.4732418060302734, + "learning_rate": 4.352490349590477e-06, + "loss": 1.4759, + "step": 25370 + }, + { + "epoch": 0.9085895392769531, + "grad_norm": 1.4603962898254395, + "learning_rate": 4.349106239058165e-06, + "loss": 1.6496, + "step": 25371 + }, + { + "epoch": 0.9086253514065213, + "grad_norm": 1.721230149269104, + "learning_rate": 4.345723415385272e-06, + "loss": 1.7606, + "step": 25372 + }, + { + "epoch": 0.9086611635360897, + "grad_norm": 1.6769062280654907, + "learning_rate": 4.342341878617262e-06, + "loss": 1.7661, + "step": 25373 + }, + { + "epoch": 0.908696975665658, + "grad_norm": 2.0991551876068115, + "learning_rate": 4.338961628799665e-06, + "loss": 1.2793, + "step": 25374 + }, + { + "epoch": 0.9087327877952263, + "grad_norm": 1.581172227859497, + "learning_rate": 4.335582665977944e-06, + "loss": 1.4508, + "step": 25375 + }, + { + "epoch": 0.9087685999247945, + "grad_norm": 1.404631495475769, + "learning_rate": 4.332204990197564e-06, + "loss": 1.4273, + "step": 25376 + }, + { + "epoch": 0.9088044120543628, + "grad_norm": 1.1999675035476685, + "learning_rate": 4.328828601503943e-06, + "loss": 1.2674, + "step": 25377 + }, + { + "epoch": 0.9088402241839311, + "grad_norm": 1.3518636226654053, + "learning_rate": 4.325453499942545e-06, + "loss": 1.3804, + "step": 25378 + }, + { + "epoch": 0.9088760363134993, + "grad_norm": 1.5245866775512695, + "learning_rate": 4.322079685558755e-06, + "loss": 1.2018, + "step": 25379 + }, + { + "epoch": 0.9089118484430677, + "grad_norm": 1.940627932548523, + "learning_rate": 4.318707158397972e-06, + "loss": 1.5514, + "step": 25380 + }, + { + "epoch": 0.908947660572636, + "grad_norm": 1.9791686534881592, + "learning_rate": 4.3153359185055474e-06, + "loss": 1.3945, + "step": 25381 + }, + { + "epoch": 0.9089834727022043, + "grad_norm": 2.057812452316284, + "learning_rate": 4.311965965926867e-06, + "loss": 1.1848, + "step": 25382 + }, + { + "epoch": 0.9090192848317725, + "grad_norm": 1.4718939065933228, + "learning_rate": 4.308597300707262e-06, + "loss": 1.4995, + "step": 25383 + }, + { + "epoch": 0.9090550969613408, + "grad_norm": 1.5255402326583862, + "learning_rate": 4.305229922892029e-06, + "loss": 1.418, + "step": 25384 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.564485788345337, + "learning_rate": 4.301863832526498e-06, + "loss": 1.1865, + "step": 25385 + }, + { + "epoch": 0.9091267212204773, + "grad_norm": 1.8961577415466309, + "learning_rate": 4.298499029655967e-06, + "loss": 1.4887, + "step": 25386 + }, + { + "epoch": 0.9091625333500457, + "grad_norm": 1.5364948511123657, + "learning_rate": 4.295135514325654e-06, + "loss": 1.5881, + "step": 25387 + }, + { + "epoch": 0.909198345479614, + "grad_norm": 2.002300977706909, + "learning_rate": 4.291773286580858e-06, + "loss": 1.6364, + "step": 25388 + }, + { + "epoch": 0.9092341576091822, + "grad_norm": 2.0127434730529785, + "learning_rate": 4.288412346466797e-06, + "loss": 1.3339, + "step": 25389 + }, + { + "epoch": 0.9092699697387505, + "grad_norm": 1.5489211082458496, + "learning_rate": 4.28505269402869e-06, + "loss": 1.3824, + "step": 25390 + }, + { + "epoch": 0.9093057818683188, + "grad_norm": 1.6152890920639038, + "learning_rate": 4.281694329311736e-06, + "loss": 1.3801, + "step": 25391 + }, + { + "epoch": 0.909341593997887, + "grad_norm": 2.6852145195007324, + "learning_rate": 4.278337252361109e-06, + "loss": 1.5435, + "step": 25392 + }, + { + "epoch": 0.9093774061274553, + "grad_norm": 2.5400712490081787, + "learning_rate": 4.2749814632219946e-06, + "loss": 1.4603, + "step": 25393 + }, + { + "epoch": 0.9094132182570237, + "grad_norm": 2.176903486251831, + "learning_rate": 4.271626961939524e-06, + "loss": 1.4666, + "step": 25394 + }, + { + "epoch": 0.909449030386592, + "grad_norm": 1.6950504779815674, + "learning_rate": 4.268273748558815e-06, + "loss": 1.3297, + "step": 25395 + }, + { + "epoch": 0.9094848425161602, + "grad_norm": 1.5055739879608154, + "learning_rate": 4.264921823125012e-06, + "loss": 1.4014, + "step": 25396 + }, + { + "epoch": 0.9095206546457285, + "grad_norm": 1.2444562911987305, + "learning_rate": 4.261571185683211e-06, + "loss": 1.4675, + "step": 25397 + }, + { + "epoch": 0.9095564667752968, + "grad_norm": 2.0132944583892822, + "learning_rate": 4.258221836278453e-06, + "loss": 1.6443, + "step": 25398 + }, + { + "epoch": 0.909592278904865, + "grad_norm": 1.8058809041976929, + "learning_rate": 4.2548737749558255e-06, + "loss": 1.347, + "step": 25399 + }, + { + "epoch": 0.9096280910344333, + "grad_norm": 1.5747233629226685, + "learning_rate": 4.2515270017603695e-06, + "loss": 1.6271, + "step": 25400 + }, + { + "epoch": 0.9096639031640017, + "grad_norm": 1.754483699798584, + "learning_rate": 4.248181516737127e-06, + "loss": 1.4832, + "step": 25401 + }, + { + "epoch": 0.90969971529357, + "grad_norm": 1.453294038772583, + "learning_rate": 4.244837319931072e-06, + "loss": 1.5246, + "step": 25402 + }, + { + "epoch": 0.9097355274231382, + "grad_norm": 1.683763027191162, + "learning_rate": 4.2414944113872255e-06, + "loss": 1.3641, + "step": 25403 + }, + { + "epoch": 0.9097713395527065, + "grad_norm": 1.6361896991729736, + "learning_rate": 4.2381527911505625e-06, + "loss": 1.8167, + "step": 25404 + }, + { + "epoch": 0.9098071516822748, + "grad_norm": 1.424181580543518, + "learning_rate": 4.234812459266013e-06, + "loss": 1.5335, + "step": 25405 + }, + { + "epoch": 0.909842963811843, + "grad_norm": 2.2374167442321777, + "learning_rate": 4.231473415778531e-06, + "loss": 1.4462, + "step": 25406 + }, + { + "epoch": 0.9098787759414113, + "grad_norm": 1.852335810661316, + "learning_rate": 4.228135660733046e-06, + "loss": 1.648, + "step": 25407 + }, + { + "epoch": 0.9099145880709797, + "grad_norm": 2.4960708618164062, + "learning_rate": 4.224799194174467e-06, + "loss": 1.3632, + "step": 25408 + }, + { + "epoch": 0.909950400200548, + "grad_norm": 1.384843349456787, + "learning_rate": 4.221464016147669e-06, + "loss": 1.5738, + "step": 25409 + }, + { + "epoch": 0.9099862123301162, + "grad_norm": 1.7134809494018555, + "learning_rate": 4.218130126697517e-06, + "loss": 1.5194, + "step": 25410 + }, + { + "epoch": 0.9100220244596845, + "grad_norm": 1.2886848449707031, + "learning_rate": 4.214797525868897e-06, + "loss": 1.2627, + "step": 25411 + }, + { + "epoch": 0.9100578365892528, + "grad_norm": 1.9155702590942383, + "learning_rate": 4.2114662137066055e-06, + "loss": 1.8567, + "step": 25412 + }, + { + "epoch": 0.910093648718821, + "grad_norm": 1.4528340101242065, + "learning_rate": 4.208136190255485e-06, + "loss": 1.4138, + "step": 25413 + }, + { + "epoch": 0.9101294608483893, + "grad_norm": 1.6072170734405518, + "learning_rate": 4.204807455560311e-06, + "loss": 1.4734, + "step": 25414 + }, + { + "epoch": 0.9101652729779577, + "grad_norm": 1.2965353727340698, + "learning_rate": 4.201480009665915e-06, + "loss": 1.3501, + "step": 25415 + }, + { + "epoch": 0.910201085107526, + "grad_norm": 1.2253295183181763, + "learning_rate": 4.198153852617015e-06, + "loss": 1.2337, + "step": 25416 + }, + { + "epoch": 0.9102368972370942, + "grad_norm": 2.0200018882751465, + "learning_rate": 4.194828984458376e-06, + "loss": 1.5565, + "step": 25417 + }, + { + "epoch": 0.9102727093666625, + "grad_norm": 1.6863843202590942, + "learning_rate": 4.191505405234741e-06, + "loss": 1.4327, + "step": 25418 + }, + { + "epoch": 0.9103085214962308, + "grad_norm": 1.7367192506790161, + "learning_rate": 4.188183114990829e-06, + "loss": 1.4891, + "step": 25419 + }, + { + "epoch": 0.910344333625799, + "grad_norm": 2.484041452407837, + "learning_rate": 4.1848621137713154e-06, + "loss": 1.5881, + "step": 25420 + }, + { + "epoch": 0.9103801457553673, + "grad_norm": 1.6672288179397583, + "learning_rate": 4.181542401620875e-06, + "loss": 1.2391, + "step": 25421 + }, + { + "epoch": 0.9104159578849357, + "grad_norm": 1.5806713104248047, + "learning_rate": 4.178223978584206e-06, + "loss": 1.6583, + "step": 25422 + }, + { + "epoch": 0.910451770014504, + "grad_norm": 1.8388357162475586, + "learning_rate": 4.174906844705917e-06, + "loss": 1.246, + "step": 25423 + }, + { + "epoch": 0.9104875821440722, + "grad_norm": 1.3316192626953125, + "learning_rate": 4.171591000030672e-06, + "loss": 1.4997, + "step": 25424 + }, + { + "epoch": 0.9105233942736405, + "grad_norm": 1.8472051620483398, + "learning_rate": 4.168276444603026e-06, + "loss": 1.4753, + "step": 25425 + }, + { + "epoch": 0.9105592064032088, + "grad_norm": 1.7919559478759766, + "learning_rate": 4.164963178467629e-06, + "loss": 1.5098, + "step": 25426 + }, + { + "epoch": 0.910595018532777, + "grad_norm": 1.5775041580200195, + "learning_rate": 4.161651201669036e-06, + "loss": 1.531, + "step": 25427 + }, + { + "epoch": 0.9106308306623453, + "grad_norm": 2.2557876110076904, + "learning_rate": 4.1583405142517906e-06, + "loss": 1.5838, + "step": 25428 + }, + { + "epoch": 0.9106666427919137, + "grad_norm": 1.9684809446334839, + "learning_rate": 4.155031116260466e-06, + "loss": 1.2069, + "step": 25429 + }, + { + "epoch": 0.9107024549214819, + "grad_norm": 1.215849757194519, + "learning_rate": 4.15172300773955e-06, + "loss": 1.4853, + "step": 25430 + }, + { + "epoch": 0.9107382670510502, + "grad_norm": 1.6601389646530151, + "learning_rate": 4.148416188733584e-06, + "loss": 1.3532, + "step": 25431 + }, + { + "epoch": 0.9107740791806185, + "grad_norm": 1.467976450920105, + "learning_rate": 4.1451106592869995e-06, + "loss": 1.3732, + "step": 25432 + }, + { + "epoch": 0.9108098913101867, + "grad_norm": 1.4758421182632446, + "learning_rate": 4.14180641944435e-06, + "loss": 1.4902, + "step": 25433 + }, + { + "epoch": 0.910845703439755, + "grad_norm": 1.355146884918213, + "learning_rate": 4.138503469250021e-06, + "loss": 1.3033, + "step": 25434 + }, + { + "epoch": 0.9108815155693233, + "grad_norm": 1.4203933477401733, + "learning_rate": 4.13520180874849e-06, + "loss": 1.3695, + "step": 25435 + }, + { + "epoch": 0.9109173276988917, + "grad_norm": 2.0352320671081543, + "learning_rate": 4.131901437984153e-06, + "loss": 1.651, + "step": 25436 + }, + { + "epoch": 0.9109531398284599, + "grad_norm": 1.4367496967315674, + "learning_rate": 4.128602357001421e-06, + "loss": 1.3241, + "step": 25437 + }, + { + "epoch": 0.9109889519580282, + "grad_norm": 1.3705084323883057, + "learning_rate": 4.12530456584469e-06, + "loss": 1.5884, + "step": 25438 + }, + { + "epoch": 0.9110247640875965, + "grad_norm": 1.4604884386062622, + "learning_rate": 4.122008064558313e-06, + "loss": 1.295, + "step": 25439 + }, + { + "epoch": 0.9110605762171647, + "grad_norm": 1.8145109415054321, + "learning_rate": 4.118712853186634e-06, + "loss": 1.5224, + "step": 25440 + }, + { + "epoch": 0.911096388346733, + "grad_norm": 1.4539945125579834, + "learning_rate": 4.115418931773996e-06, + "loss": 1.6827, + "step": 25441 + }, + { + "epoch": 0.9111322004763013, + "grad_norm": 1.368727445602417, + "learning_rate": 4.112126300364727e-06, + "loss": 1.4317, + "step": 25442 + }, + { + "epoch": 0.9111680126058697, + "grad_norm": 1.5504074096679688, + "learning_rate": 4.108834959003094e-06, + "loss": 1.4278, + "step": 25443 + }, + { + "epoch": 0.9112038247354379, + "grad_norm": 1.5621628761291504, + "learning_rate": 4.1055449077334165e-06, + "loss": 1.2978, + "step": 25444 + }, + { + "epoch": 0.9112396368650062, + "grad_norm": 1.6070566177368164, + "learning_rate": 4.102256146599936e-06, + "loss": 1.158, + "step": 25445 + }, + { + "epoch": 0.9112754489945745, + "grad_norm": 2.3168065547943115, + "learning_rate": 4.098968675646886e-06, + "loss": 1.3911, + "step": 25446 + }, + { + "epoch": 0.9113112611241427, + "grad_norm": 1.7678879499435425, + "learning_rate": 4.095682494918507e-06, + "loss": 1.1655, + "step": 25447 + }, + { + "epoch": 0.911347073253711, + "grad_norm": 2.47255539894104, + "learning_rate": 4.092397604459019e-06, + "loss": 1.484, + "step": 25448 + }, + { + "epoch": 0.9113828853832793, + "grad_norm": 1.4481755495071411, + "learning_rate": 4.089114004312622e-06, + "loss": 1.3727, + "step": 25449 + }, + { + "epoch": 0.9114186975128477, + "grad_norm": 1.532240390777588, + "learning_rate": 4.085831694523456e-06, + "loss": 1.5036, + "step": 25450 + }, + { + "epoch": 0.9114545096424159, + "grad_norm": 1.6303825378417969, + "learning_rate": 4.082550675135721e-06, + "loss": 1.2653, + "step": 25451 + }, + { + "epoch": 0.9114903217719842, + "grad_norm": 2.0714213848114014, + "learning_rate": 4.079270946193525e-06, + "loss": 1.2285, + "step": 25452 + }, + { + "epoch": 0.9115261339015525, + "grad_norm": 2.202430486679077, + "learning_rate": 4.075992507741033e-06, + "loss": 1.3034, + "step": 25453 + }, + { + "epoch": 0.9115619460311207, + "grad_norm": 2.01389741897583, + "learning_rate": 4.07271535982231e-06, + "loss": 1.6597, + "step": 25454 + }, + { + "epoch": 0.911597758160689, + "grad_norm": 1.577217698097229, + "learning_rate": 4.0694395024814754e-06, + "loss": 1.4017, + "step": 25455 + }, + { + "epoch": 0.9116335702902573, + "grad_norm": 1.35993230342865, + "learning_rate": 4.066164935762595e-06, + "loss": 1.406, + "step": 25456 + }, + { + "epoch": 0.9116693824198256, + "grad_norm": 1.9778926372528076, + "learning_rate": 4.062891659709711e-06, + "loss": 1.3003, + "step": 25457 + }, + { + "epoch": 0.9117051945493939, + "grad_norm": 1.6684564352035522, + "learning_rate": 4.059619674366866e-06, + "loss": 1.5999, + "step": 25458 + }, + { + "epoch": 0.9117410066789622, + "grad_norm": 2.0282845497131348, + "learning_rate": 4.05634897977808e-06, + "loss": 1.3524, + "step": 25459 + }, + { + "epoch": 0.9117768188085305, + "grad_norm": 2.274070978164673, + "learning_rate": 4.053079575987384e-06, + "loss": 1.3631, + "step": 25460 + }, + { + "epoch": 0.9118126309380987, + "grad_norm": 1.6663633584976196, + "learning_rate": 4.049811463038722e-06, + "loss": 1.5455, + "step": 25461 + }, + { + "epoch": 0.911848443067667, + "grad_norm": 1.4195297956466675, + "learning_rate": 4.0465446409760795e-06, + "loss": 1.4995, + "step": 25462 + }, + { + "epoch": 0.9118842551972353, + "grad_norm": 1.5641138553619385, + "learning_rate": 4.043279109843412e-06, + "loss": 1.4892, + "step": 25463 + }, + { + "epoch": 0.9119200673268036, + "grad_norm": 1.4835494756698608, + "learning_rate": 4.04001486968465e-06, + "loss": 1.4001, + "step": 25464 + }, + { + "epoch": 0.9119558794563719, + "grad_norm": 1.8441171646118164, + "learning_rate": 4.036751920543702e-06, + "loss": 1.6182, + "step": 25465 + }, + { + "epoch": 0.9119916915859402, + "grad_norm": 2.2617461681365967, + "learning_rate": 4.033490262464468e-06, + "loss": 1.2301, + "step": 25466 + }, + { + "epoch": 0.9120275037155084, + "grad_norm": 1.4524611234664917, + "learning_rate": 4.030229895490856e-06, + "loss": 1.5047, + "step": 25467 + }, + { + "epoch": 0.9120633158450767, + "grad_norm": 1.821845531463623, + "learning_rate": 4.026970819666698e-06, + "loss": 1.2156, + "step": 25468 + }, + { + "epoch": 0.912099127974645, + "grad_norm": 1.517996072769165, + "learning_rate": 4.023713035035836e-06, + "loss": 1.4621, + "step": 25469 + }, + { + "epoch": 0.9121349401042133, + "grad_norm": 2.0149309635162354, + "learning_rate": 4.020456541642126e-06, + "loss": 1.4772, + "step": 25470 + }, + { + "epoch": 0.9121707522337816, + "grad_norm": 1.5749101638793945, + "learning_rate": 4.017201339529386e-06, + "loss": 1.3722, + "step": 25471 + }, + { + "epoch": 0.9122065643633499, + "grad_norm": 1.673729658126831, + "learning_rate": 4.013947428741372e-06, + "loss": 1.6295, + "step": 25472 + }, + { + "epoch": 0.9122423764929182, + "grad_norm": 1.4756914377212524, + "learning_rate": 4.01069480932188e-06, + "loss": 1.5144, + "step": 25473 + }, + { + "epoch": 0.9122781886224864, + "grad_norm": 1.3629252910614014, + "learning_rate": 4.007443481314699e-06, + "loss": 1.3908, + "step": 25474 + }, + { + "epoch": 0.9123140007520547, + "grad_norm": 1.5156805515289307, + "learning_rate": 4.0041934447635156e-06, + "loss": 1.3617, + "step": 25475 + }, + { + "epoch": 0.912349812881623, + "grad_norm": 1.570890188217163, + "learning_rate": 4.000944699712094e-06, + "loss": 1.2795, + "step": 25476 + }, + { + "epoch": 0.9123856250111912, + "grad_norm": 1.760048508644104, + "learning_rate": 3.997697246204124e-06, + "loss": 1.4411, + "step": 25477 + }, + { + "epoch": 0.9124214371407596, + "grad_norm": 1.7072031497955322, + "learning_rate": 3.994451084283324e-06, + "loss": 1.4699, + "step": 25478 + }, + { + "epoch": 0.9124572492703279, + "grad_norm": 1.4796770811080933, + "learning_rate": 3.991206213993326e-06, + "loss": 1.326, + "step": 25479 + }, + { + "epoch": 0.9124930613998962, + "grad_norm": 1.6207607984542847, + "learning_rate": 3.987962635377806e-06, + "loss": 1.2875, + "step": 25480 + }, + { + "epoch": 0.9125288735294644, + "grad_norm": 1.4966524839401245, + "learning_rate": 3.98472034848042e-06, + "loss": 1.3703, + "step": 25481 + }, + { + "epoch": 0.9125646856590327, + "grad_norm": 1.418821096420288, + "learning_rate": 3.9814793533447635e-06, + "loss": 1.2243, + "step": 25482 + }, + { + "epoch": 0.912600497788601, + "grad_norm": 1.7261139154434204, + "learning_rate": 3.978239650014437e-06, + "loss": 1.4594, + "step": 25483 + }, + { + "epoch": 0.9126363099181692, + "grad_norm": 1.719135046005249, + "learning_rate": 3.975001238533038e-06, + "loss": 1.5768, + "step": 25484 + }, + { + "epoch": 0.9126721220477376, + "grad_norm": 1.6824172735214233, + "learning_rate": 3.971764118944155e-06, + "loss": 1.45, + "step": 25485 + }, + { + "epoch": 0.9127079341773059, + "grad_norm": 1.4959074258804321, + "learning_rate": 3.968528291291296e-06, + "loss": 1.1142, + "step": 25486 + }, + { + "epoch": 0.9127437463068742, + "grad_norm": 1.5593359470367432, + "learning_rate": 3.965293755618027e-06, + "loss": 1.3833, + "step": 25487 + }, + { + "epoch": 0.9127795584364424, + "grad_norm": 1.45145845413208, + "learning_rate": 3.962060511967846e-06, + "loss": 1.1292, + "step": 25488 + }, + { + "epoch": 0.9128153705660107, + "grad_norm": 1.793111801147461, + "learning_rate": 3.9588285603842755e-06, + "loss": 1.384, + "step": 25489 + }, + { + "epoch": 0.912851182695579, + "grad_norm": 1.5112842321395874, + "learning_rate": 3.955597900910768e-06, + "loss": 1.4585, + "step": 25490 + }, + { + "epoch": 0.9128869948251472, + "grad_norm": 1.887883186340332, + "learning_rate": 3.9523685335908e-06, + "loss": 1.5188, + "step": 25491 + }, + { + "epoch": 0.9129228069547156, + "grad_norm": 1.8775691986083984, + "learning_rate": 3.9491404584678485e-06, + "loss": 1.4276, + "step": 25492 + }, + { + "epoch": 0.9129586190842839, + "grad_norm": 1.926689863204956, + "learning_rate": 3.945913675585289e-06, + "loss": 1.4073, + "step": 25493 + }, + { + "epoch": 0.9129944312138522, + "grad_norm": 1.671813726425171, + "learning_rate": 3.9426881849865646e-06, + "loss": 1.4181, + "step": 25494 + }, + { + "epoch": 0.9130302433434204, + "grad_norm": 1.5096828937530518, + "learning_rate": 3.939463986715064e-06, + "loss": 1.4566, + "step": 25495 + }, + { + "epoch": 0.9130660554729887, + "grad_norm": 1.3902485370635986, + "learning_rate": 3.936241080814174e-06, + "loss": 1.2241, + "step": 25496 + }, + { + "epoch": 0.913101867602557, + "grad_norm": 1.5125646591186523, + "learning_rate": 3.933019467327248e-06, + "loss": 1.548, + "step": 25497 + }, + { + "epoch": 0.9131376797321252, + "grad_norm": 1.8359897136688232, + "learning_rate": 3.9297991462976196e-06, + "loss": 1.5648, + "step": 25498 + }, + { + "epoch": 0.9131734918616936, + "grad_norm": 1.5971086025238037, + "learning_rate": 3.92658011776863e-06, + "loss": 1.3288, + "step": 25499 + }, + { + "epoch": 0.9132093039912619, + "grad_norm": 2.038939952850342, + "learning_rate": 3.923362381783568e-06, + "loss": 1.2699, + "step": 25500 + }, + { + "epoch": 0.9132451161208301, + "grad_norm": 2.618192195892334, + "learning_rate": 3.920145938385744e-06, + "loss": 1.3613, + "step": 25501 + }, + { + "epoch": 0.9132809282503984, + "grad_norm": 1.5217925310134888, + "learning_rate": 3.916930787618412e-06, + "loss": 1.46, + "step": 25502 + }, + { + "epoch": 0.9133167403799667, + "grad_norm": 2.0505995750427246, + "learning_rate": 3.913716929524857e-06, + "loss": 1.23, + "step": 25503 + }, + { + "epoch": 0.913352552509535, + "grad_norm": 1.67502760887146, + "learning_rate": 3.910504364148282e-06, + "loss": 1.3907, + "step": 25504 + }, + { + "epoch": 0.9133883646391032, + "grad_norm": 1.720183253288269, + "learning_rate": 3.907293091531927e-06, + "loss": 1.1388, + "step": 25505 + }, + { + "epoch": 0.9134241767686716, + "grad_norm": 1.7315115928649902, + "learning_rate": 3.904083111718993e-06, + "loss": 1.2282, + "step": 25506 + }, + { + "epoch": 0.9134599888982399, + "grad_norm": 1.504867672920227, + "learning_rate": 3.900874424752677e-06, + "loss": 1.4048, + "step": 25507 + }, + { + "epoch": 0.9134958010278081, + "grad_norm": 1.2139843702316284, + "learning_rate": 3.897667030676133e-06, + "loss": 1.311, + "step": 25508 + }, + { + "epoch": 0.9135316131573764, + "grad_norm": 1.6019784212112427, + "learning_rate": 3.8944609295324955e-06, + "loss": 1.394, + "step": 25509 + }, + { + "epoch": 0.9135674252869447, + "grad_norm": 1.34242582321167, + "learning_rate": 3.89125612136495e-06, + "loss": 1.1476, + "step": 25510 + }, + { + "epoch": 0.913603237416513, + "grad_norm": 1.3961800336837769, + "learning_rate": 3.888052606216564e-06, + "loss": 1.481, + "step": 25511 + }, + { + "epoch": 0.9136390495460812, + "grad_norm": 1.6185824871063232, + "learning_rate": 3.884850384130456e-06, + "loss": 1.1685, + "step": 25512 + }, + { + "epoch": 0.9136748616756496, + "grad_norm": 2.0552761554718018, + "learning_rate": 3.881649455149694e-06, + "loss": 1.3966, + "step": 25513 + }, + { + "epoch": 0.9137106738052179, + "grad_norm": 1.247956395149231, + "learning_rate": 3.878449819317376e-06, + "loss": 1.1779, + "step": 25514 + }, + { + "epoch": 0.9137464859347861, + "grad_norm": 1.512967586517334, + "learning_rate": 3.875251476676522e-06, + "loss": 1.199, + "step": 25515 + }, + { + "epoch": 0.9137822980643544, + "grad_norm": 1.5441175699234009, + "learning_rate": 3.872054427270167e-06, + "loss": 1.108, + "step": 25516 + }, + { + "epoch": 0.9138181101939227, + "grad_norm": 2.1521899700164795, + "learning_rate": 3.868858671141329e-06, + "loss": 1.6401, + "step": 25517 + }, + { + "epoch": 0.9138539223234909, + "grad_norm": 2.341466188430786, + "learning_rate": 3.865664208332986e-06, + "loss": 1.3783, + "step": 25518 + }, + { + "epoch": 0.9138897344530592, + "grad_norm": 1.7183805704116821, + "learning_rate": 3.862471038888138e-06, + "loss": 1.2954, + "step": 25519 + }, + { + "epoch": 0.9139255465826276, + "grad_norm": 1.7073190212249756, + "learning_rate": 3.859279162849716e-06, + "loss": 1.6382, + "step": 25520 + }, + { + "epoch": 0.9139613587121959, + "grad_norm": 1.8298503160476685, + "learning_rate": 3.856088580260697e-06, + "loss": 1.6447, + "step": 25521 + }, + { + "epoch": 0.9139971708417641, + "grad_norm": 1.38832688331604, + "learning_rate": 3.8528992911639806e-06, + "loss": 1.3147, + "step": 25522 + }, + { + "epoch": 0.9140329829713324, + "grad_norm": 1.9283418655395508, + "learning_rate": 3.8497112956024875e-06, + "loss": 1.2106, + "step": 25523 + }, + { + "epoch": 0.9140687951009007, + "grad_norm": 1.6704370975494385, + "learning_rate": 3.846524593619094e-06, + "loss": 1.6686, + "step": 25524 + }, + { + "epoch": 0.9141046072304689, + "grad_norm": 2.005181074142456, + "learning_rate": 3.8433391852567e-06, + "loss": 1.9029, + "step": 25525 + }, + { + "epoch": 0.9141404193600372, + "grad_norm": 1.8579343557357788, + "learning_rate": 3.840155070558149e-06, + "loss": 1.2547, + "step": 25526 + }, + { + "epoch": 0.9141762314896056, + "grad_norm": 1.4794865846633911, + "learning_rate": 3.836972249566239e-06, + "loss": 1.1888, + "step": 25527 + }, + { + "epoch": 0.9142120436191739, + "grad_norm": 1.6024411916732788, + "learning_rate": 3.83379072232386e-06, + "loss": 1.2923, + "step": 25528 + }, + { + "epoch": 0.9142478557487421, + "grad_norm": 1.8010493516921997, + "learning_rate": 3.830610488873765e-06, + "loss": 1.2005, + "step": 25529 + }, + { + "epoch": 0.9142836678783104, + "grad_norm": 1.4514111280441284, + "learning_rate": 3.827431549258764e-06, + "loss": 1.6173, + "step": 25530 + }, + { + "epoch": 0.9143194800078787, + "grad_norm": 1.4556541442871094, + "learning_rate": 3.824253903521602e-06, + "loss": 1.3869, + "step": 25531 + }, + { + "epoch": 0.9143552921374469, + "grad_norm": 1.7731586694717407, + "learning_rate": 3.821077551705065e-06, + "loss": 1.557, + "step": 25532 + }, + { + "epoch": 0.9143911042670152, + "grad_norm": 1.5040415525436401, + "learning_rate": 3.817902493851877e-06, + "loss": 1.1799, + "step": 25533 + }, + { + "epoch": 0.9144269163965836, + "grad_norm": 1.508422613143921, + "learning_rate": 3.814728730004724e-06, + "loss": 1.3727, + "step": 25534 + }, + { + "epoch": 0.9144627285261518, + "grad_norm": 1.460686206817627, + "learning_rate": 3.811556260206328e-06, + "loss": 1.3637, + "step": 25535 + }, + { + "epoch": 0.9144985406557201, + "grad_norm": 2.1639134883880615, + "learning_rate": 3.808385084499366e-06, + "loss": 1.4715, + "step": 25536 + }, + { + "epoch": 0.9145343527852884, + "grad_norm": 1.949466586112976, + "learning_rate": 3.8052152029265154e-06, + "loss": 1.3316, + "step": 25537 + }, + { + "epoch": 0.9145701649148567, + "grad_norm": 1.9992917776107788, + "learning_rate": 3.8020466155304078e-06, + "loss": 1.3247, + "step": 25538 + }, + { + "epoch": 0.9146059770444249, + "grad_norm": 1.9052329063415527, + "learning_rate": 3.798879322353666e-06, + "loss": 1.5343, + "step": 25539 + }, + { + "epoch": 0.9146417891739932, + "grad_norm": 1.6159497499465942, + "learning_rate": 3.7957133234389207e-06, + "loss": 1.5618, + "step": 25540 + }, + { + "epoch": 0.9146776013035616, + "grad_norm": 1.5084832906723022, + "learning_rate": 3.7925486188287727e-06, + "loss": 1.4374, + "step": 25541 + }, + { + "epoch": 0.9147134134331298, + "grad_norm": 1.6207736730575562, + "learning_rate": 3.7893852085657657e-06, + "loss": 1.5015, + "step": 25542 + }, + { + "epoch": 0.9147492255626981, + "grad_norm": 1.9216586351394653, + "learning_rate": 3.786223092692476e-06, + "loss": 1.6365, + "step": 25543 + }, + { + "epoch": 0.9147850376922664, + "grad_norm": 1.3661115169525146, + "learning_rate": 3.7830622712514696e-06, + "loss": 1.3516, + "step": 25544 + }, + { + "epoch": 0.9148208498218346, + "grad_norm": 1.7824134826660156, + "learning_rate": 3.779902744285224e-06, + "loss": 1.0715, + "step": 25545 + }, + { + "epoch": 0.9148566619514029, + "grad_norm": 2.196190357208252, + "learning_rate": 3.7767445118362832e-06, + "loss": 1.1834, + "step": 25546 + }, + { + "epoch": 0.9148924740809712, + "grad_norm": 1.838174819946289, + "learning_rate": 3.7735875739471237e-06, + "loss": 1.417, + "step": 25547 + }, + { + "epoch": 0.9149282862105395, + "grad_norm": 1.4311494827270508, + "learning_rate": 3.770431930660223e-06, + "loss": 1.3594, + "step": 25548 + }, + { + "epoch": 0.9149640983401078, + "grad_norm": 1.6186925172805786, + "learning_rate": 3.767277582018036e-06, + "loss": 1.2366, + "step": 25549 + }, + { + "epoch": 0.9149999104696761, + "grad_norm": 1.7173895835876465, + "learning_rate": 3.7641245280629842e-06, + "loss": 1.3939, + "step": 25550 + }, + { + "epoch": 0.9150357225992444, + "grad_norm": 2.2020442485809326, + "learning_rate": 3.760972768837523e-06, + "loss": 1.4908, + "step": 25551 + }, + { + "epoch": 0.9150715347288126, + "grad_norm": 1.694572925567627, + "learning_rate": 3.757822304384018e-06, + "loss": 1.3844, + "step": 25552 + }, + { + "epoch": 0.9151073468583809, + "grad_norm": 1.406884789466858, + "learning_rate": 3.7546731347448685e-06, + "loss": 1.3992, + "step": 25553 + }, + { + "epoch": 0.9151431589879492, + "grad_norm": 1.7870287895202637, + "learning_rate": 3.7515252599624516e-06, + "loss": 1.3392, + "step": 25554 + }, + { + "epoch": 0.9151789711175174, + "grad_norm": 1.6366087198257446, + "learning_rate": 3.748378680079112e-06, + "loss": 1.3854, + "step": 25555 + }, + { + "epoch": 0.9152147832470858, + "grad_norm": 1.4431771039962769, + "learning_rate": 3.745233395137182e-06, + "loss": 1.4213, + "step": 25556 + }, + { + "epoch": 0.9152505953766541, + "grad_norm": 1.887716293334961, + "learning_rate": 3.7420894051789723e-06, + "loss": 1.394, + "step": 25557 + }, + { + "epoch": 0.9152864075062224, + "grad_norm": 1.6417644023895264, + "learning_rate": 3.7389467102467823e-06, + "loss": 1.3629, + "step": 25558 + }, + { + "epoch": 0.9153222196357906, + "grad_norm": 1.8466694355010986, + "learning_rate": 3.7358053103829117e-06, + "loss": 1.4253, + "step": 25559 + }, + { + "epoch": 0.9153580317653589, + "grad_norm": 1.1880935430526733, + "learning_rate": 3.732665205629593e-06, + "loss": 1.306, + "step": 25560 + }, + { + "epoch": 0.9153938438949272, + "grad_norm": 1.6200544834136963, + "learning_rate": 3.7295263960290927e-06, + "loss": 1.3328, + "step": 25561 + }, + { + "epoch": 0.9154296560244954, + "grad_norm": 1.4610018730163574, + "learning_rate": 3.7263888816236435e-06, + "loss": 0.9073, + "step": 25562 + }, + { + "epoch": 0.9154654681540638, + "grad_norm": 1.679741621017456, + "learning_rate": 3.7232526624554344e-06, + "loss": 1.6995, + "step": 25563 + }, + { + "epoch": 0.9155012802836321, + "grad_norm": 1.569359540939331, + "learning_rate": 3.720117738566675e-06, + "loss": 1.2822, + "step": 25564 + }, + { + "epoch": 0.9155370924132004, + "grad_norm": 2.0695765018463135, + "learning_rate": 3.7169841099995438e-06, + "loss": 1.6033, + "step": 25565 + }, + { + "epoch": 0.9155729045427686, + "grad_norm": 1.6272145509719849, + "learning_rate": 3.7138517767961954e-06, + "loss": 1.2955, + "step": 25566 + }, + { + "epoch": 0.9156087166723369, + "grad_norm": 1.9181761741638184, + "learning_rate": 3.710720738998774e-06, + "loss": 1.4108, + "step": 25567 + }, + { + "epoch": 0.9156445288019052, + "grad_norm": 2.273002862930298, + "learning_rate": 3.7075909966493903e-06, + "loss": 1.6432, + "step": 25568 + }, + { + "epoch": 0.9156803409314734, + "grad_norm": 1.7246898412704468, + "learning_rate": 3.7044625497901774e-06, + "loss": 1.2398, + "step": 25569 + }, + { + "epoch": 0.9157161530610418, + "grad_norm": 1.4257093667984009, + "learning_rate": 3.7013353984631906e-06, + "loss": 1.7232, + "step": 25570 + }, + { + "epoch": 0.9157519651906101, + "grad_norm": 1.7329950332641602, + "learning_rate": 3.698209542710529e-06, + "loss": 1.4783, + "step": 25571 + }, + { + "epoch": 0.9157877773201784, + "grad_norm": 1.9357683658599854, + "learning_rate": 3.6950849825742375e-06, + "loss": 1.4423, + "step": 25572 + }, + { + "epoch": 0.9158235894497466, + "grad_norm": 2.051203489303589, + "learning_rate": 3.6919617180963595e-06, + "loss": 1.6041, + "step": 25573 + }, + { + "epoch": 0.9158594015793149, + "grad_norm": 1.6284784078598022, + "learning_rate": 3.6888397493188954e-06, + "loss": 1.5151, + "step": 25574 + }, + { + "epoch": 0.9158952137088832, + "grad_norm": 1.5720276832580566, + "learning_rate": 3.685719076283867e-06, + "loss": 1.398, + "step": 25575 + }, + { + "epoch": 0.9159310258384514, + "grad_norm": 1.6514610052108765, + "learning_rate": 3.68259969903324e-06, + "loss": 1.3815, + "step": 25576 + }, + { + "epoch": 0.9159668379680198, + "grad_norm": 1.2488956451416016, + "learning_rate": 3.6794816176090152e-06, + "loss": 1.3639, + "step": 25577 + }, + { + "epoch": 0.9160026500975881, + "grad_norm": 2.5675435066223145, + "learning_rate": 3.676364832053103e-06, + "loss": 1.2767, + "step": 25578 + }, + { + "epoch": 0.9160384622271563, + "grad_norm": 1.3877888917922974, + "learning_rate": 3.6732493424074587e-06, + "loss": 1.2629, + "step": 25579 + }, + { + "epoch": 0.9160742743567246, + "grad_norm": 1.5578382015228271, + "learning_rate": 3.6701351487140046e-06, + "loss": 1.5932, + "step": 25580 + }, + { + "epoch": 0.9161100864862929, + "grad_norm": 2.35837459564209, + "learning_rate": 3.667022251014607e-06, + "loss": 1.3385, + "step": 25581 + }, + { + "epoch": 0.9161458986158612, + "grad_norm": 1.7006118297576904, + "learning_rate": 3.6639106493511766e-06, + "loss": 1.262, + "step": 25582 + }, + { + "epoch": 0.9161817107454294, + "grad_norm": 1.6541268825531006, + "learning_rate": 3.660800343765547e-06, + "loss": 1.4949, + "step": 25583 + }, + { + "epoch": 0.9162175228749978, + "grad_norm": 1.5310859680175781, + "learning_rate": 3.657691334299607e-06, + "loss": 1.6051, + "step": 25584 + }, + { + "epoch": 0.9162533350045661, + "grad_norm": 1.7938069105148315, + "learning_rate": 3.6545836209951333e-06, + "loss": 1.4242, + "step": 25585 + }, + { + "epoch": 0.9162891471341343, + "grad_norm": 1.6601662635803223, + "learning_rate": 3.6514772038939714e-06, + "loss": 1.4616, + "step": 25586 + }, + { + "epoch": 0.9163249592637026, + "grad_norm": 2.1517858505249023, + "learning_rate": 3.64837208303791e-06, + "loss": 1.5088, + "step": 25587 + }, + { + "epoch": 0.9163607713932709, + "grad_norm": 1.3455365896224976, + "learning_rate": 3.6452682584687035e-06, + "loss": 1.12, + "step": 25588 + }, + { + "epoch": 0.9163965835228391, + "grad_norm": 1.8301458358764648, + "learning_rate": 3.642165730228131e-06, + "loss": 1.3795, + "step": 25589 + }, + { + "epoch": 0.9164323956524074, + "grad_norm": 1.7565302848815918, + "learning_rate": 3.6390644983579135e-06, + "loss": 1.4932, + "step": 25590 + }, + { + "epoch": 0.9164682077819758, + "grad_norm": 1.7920565605163574, + "learning_rate": 3.6359645628998073e-06, + "loss": 1.5721, + "step": 25591 + }, + { + "epoch": 0.9165040199115441, + "grad_norm": 1.578942894935608, + "learning_rate": 3.6328659238954897e-06, + "loss": 1.3415, + "step": 25592 + }, + { + "epoch": 0.9165398320411123, + "grad_norm": 1.4769006967544556, + "learning_rate": 3.62976858138665e-06, + "loss": 1.2929, + "step": 25593 + }, + { + "epoch": 0.9165756441706806, + "grad_norm": 1.8735318183898926, + "learning_rate": 3.6266725354149656e-06, + "loss": 1.7607, + "step": 25594 + }, + { + "epoch": 0.9166114563002489, + "grad_norm": 1.9881622791290283, + "learning_rate": 3.6235777860221033e-06, + "loss": 1.5311, + "step": 25595 + }, + { + "epoch": 0.9166472684298171, + "grad_norm": 1.5557109117507935, + "learning_rate": 3.620484333249674e-06, + "loss": 1.3811, + "step": 25596 + }, + { + "epoch": 0.9166830805593854, + "grad_norm": 1.9754571914672852, + "learning_rate": 3.6173921771393003e-06, + "loss": 1.4716, + "step": 25597 + }, + { + "epoch": 0.9167188926889538, + "grad_norm": 1.7173326015472412, + "learning_rate": 3.6143013177326046e-06, + "loss": 1.6932, + "step": 25598 + }, + { + "epoch": 0.9167547048185221, + "grad_norm": 1.4891917705535889, + "learning_rate": 3.611211755071142e-06, + "loss": 1.4409, + "step": 25599 + }, + { + "epoch": 0.9167905169480903, + "grad_norm": 2.2099905014038086, + "learning_rate": 3.608123489196502e-06, + "loss": 1.2602, + "step": 25600 + }, + { + "epoch": 0.9168263290776586, + "grad_norm": 1.5880131721496582, + "learning_rate": 3.605036520150218e-06, + "loss": 1.3322, + "step": 25601 + }, + { + "epoch": 0.9168621412072269, + "grad_norm": 1.5003440380096436, + "learning_rate": 3.601950847973845e-06, + "loss": 1.5016, + "step": 25602 + }, + { + "epoch": 0.9168979533367951, + "grad_norm": 1.5447492599487305, + "learning_rate": 3.598866472708862e-06, + "loss": 1.4572, + "step": 25603 + }, + { + "epoch": 0.9169337654663634, + "grad_norm": 1.4146647453308105, + "learning_rate": 3.595783394396779e-06, + "loss": 1.418, + "step": 25604 + }, + { + "epoch": 0.9169695775959318, + "grad_norm": 1.7357875108718872, + "learning_rate": 3.592701613079097e-06, + "loss": 1.1391, + "step": 25605 + }, + { + "epoch": 0.9170053897255, + "grad_norm": 1.6281183958053589, + "learning_rate": 3.5896211287972383e-06, + "loss": 1.4511, + "step": 25606 + }, + { + "epoch": 0.9170412018550683, + "grad_norm": 1.7715595960617065, + "learning_rate": 3.5865419415926803e-06, + "loss": 1.5196, + "step": 25607 + }, + { + "epoch": 0.9170770139846366, + "grad_norm": 1.6238850355148315, + "learning_rate": 3.583464051506813e-06, + "loss": 1.3839, + "step": 25608 + }, + { + "epoch": 0.9171128261142049, + "grad_norm": 1.6311407089233398, + "learning_rate": 3.5803874585811024e-06, + "loss": 1.4173, + "step": 25609 + }, + { + "epoch": 0.9171486382437731, + "grad_norm": 1.4624392986297607, + "learning_rate": 3.577312162856883e-06, + "loss": 1.3834, + "step": 25610 + }, + { + "epoch": 0.9171844503733414, + "grad_norm": 1.55482816696167, + "learning_rate": 3.574238164375554e-06, + "loss": 1.433, + "step": 25611 + }, + { + "epoch": 0.9172202625029098, + "grad_norm": 1.4993577003479004, + "learning_rate": 3.571165463178472e-06, + "loss": 1.4662, + "step": 25612 + }, + { + "epoch": 0.917256074632478, + "grad_norm": 1.3473643064498901, + "learning_rate": 3.568094059306981e-06, + "loss": 1.401, + "step": 25613 + }, + { + "epoch": 0.9172918867620463, + "grad_norm": 2.217761516571045, + "learning_rate": 3.5650239528024043e-06, + "loss": 1.266, + "step": 25614 + }, + { + "epoch": 0.9173276988916146, + "grad_norm": 1.6229770183563232, + "learning_rate": 3.5619551437060083e-06, + "loss": 1.5986, + "step": 25615 + }, + { + "epoch": 0.9173635110211829, + "grad_norm": 1.9005887508392334, + "learning_rate": 3.558887632059138e-06, + "loss": 0.9255, + "step": 25616 + }, + { + "epoch": 0.9173993231507511, + "grad_norm": 1.1790728569030762, + "learning_rate": 3.555821417903027e-06, + "loss": 1.1397, + "step": 25617 + }, + { + "epoch": 0.9174351352803194, + "grad_norm": 1.6984502077102661, + "learning_rate": 3.552756501278931e-06, + "loss": 1.413, + "step": 25618 + }, + { + "epoch": 0.9174709474098878, + "grad_norm": 1.8751317262649536, + "learning_rate": 3.549692882228084e-06, + "loss": 1.6012, + "step": 25619 + }, + { + "epoch": 0.917506759539456, + "grad_norm": 1.5878108739852905, + "learning_rate": 3.5466305607917195e-06, + "loss": 1.2097, + "step": 25620 + }, + { + "epoch": 0.9175425716690243, + "grad_norm": 1.529499888420105, + "learning_rate": 3.5435695370110154e-06, + "loss": 1.5132, + "step": 25621 + }, + { + "epoch": 0.9175783837985926, + "grad_norm": 1.7148410081863403, + "learning_rate": 3.540509810927173e-06, + "loss": 1.4484, + "step": 25622 + }, + { + "epoch": 0.9176141959281608, + "grad_norm": 1.3747161626815796, + "learning_rate": 3.537451382581336e-06, + "loss": 1.1541, + "step": 25623 + }, + { + "epoch": 0.9176500080577291, + "grad_norm": 1.7710306644439697, + "learning_rate": 3.534394252014661e-06, + "loss": 1.4204, + "step": 25624 + }, + { + "epoch": 0.9176858201872974, + "grad_norm": 1.4869321584701538, + "learning_rate": 3.531338419268293e-06, + "loss": 1.6908, + "step": 25625 + }, + { + "epoch": 0.9177216323168658, + "grad_norm": 1.8567776679992676, + "learning_rate": 3.52828388438331e-06, + "loss": 1.131, + "step": 25626 + }, + { + "epoch": 0.917757444446434, + "grad_norm": 1.6301764249801636, + "learning_rate": 3.5252306474008457e-06, + "loss": 1.3344, + "step": 25627 + }, + { + "epoch": 0.9177932565760023, + "grad_norm": 1.6216933727264404, + "learning_rate": 3.522178708361956e-06, + "loss": 1.4663, + "step": 25628 + }, + { + "epoch": 0.9178290687055706, + "grad_norm": 1.4409526586532593, + "learning_rate": 3.5191280673077086e-06, + "loss": 1.3619, + "step": 25629 + }, + { + "epoch": 0.9178648808351388, + "grad_norm": 2.590237855911255, + "learning_rate": 3.516078724279137e-06, + "loss": 1.1399, + "step": 25630 + }, + { + "epoch": 0.9179006929647071, + "grad_norm": 2.0231406688690186, + "learning_rate": 3.513030679317264e-06, + "loss": 1.4057, + "step": 25631 + }, + { + "epoch": 0.9179365050942754, + "grad_norm": 1.7201601266860962, + "learning_rate": 3.5099839324631233e-06, + "loss": 1.4519, + "step": 25632 + }, + { + "epoch": 0.9179723172238438, + "grad_norm": 1.846491813659668, + "learning_rate": 3.506938483757671e-06, + "loss": 1.2908, + "step": 25633 + }, + { + "epoch": 0.918008129353412, + "grad_norm": 2.1384713649749756, + "learning_rate": 3.503894333241886e-06, + "loss": 1.3344, + "step": 25634 + }, + { + "epoch": 0.9180439414829803, + "grad_norm": 1.8844846487045288, + "learning_rate": 3.500851480956746e-06, + "loss": 1.228, + "step": 25635 + }, + { + "epoch": 0.9180797536125486, + "grad_norm": 2.2950620651245117, + "learning_rate": 3.497809926943174e-06, + "loss": 1.5764, + "step": 25636 + }, + { + "epoch": 0.9181155657421168, + "grad_norm": 1.582805871963501, + "learning_rate": 3.4947696712420708e-06, + "loss": 1.4444, + "step": 25637 + }, + { + "epoch": 0.9181513778716851, + "grad_norm": 1.5671573877334595, + "learning_rate": 3.491730713894381e-06, + "loss": 1.3715, + "step": 25638 + }, + { + "epoch": 0.9181871900012534, + "grad_norm": 1.5163581371307373, + "learning_rate": 3.4886930549409724e-06, + "loss": 1.3593, + "step": 25639 + }, + { + "epoch": 0.9182230021308218, + "grad_norm": 1.5615180730819702, + "learning_rate": 3.485656694422701e-06, + "loss": 1.1561, + "step": 25640 + }, + { + "epoch": 0.91825881426039, + "grad_norm": 1.5961110591888428, + "learning_rate": 3.482621632380412e-06, + "loss": 1.4103, + "step": 25641 + }, + { + "epoch": 0.9182946263899583, + "grad_norm": 1.2680072784423828, + "learning_rate": 3.479587868854961e-06, + "loss": 1.6213, + "step": 25642 + }, + { + "epoch": 0.9183304385195266, + "grad_norm": 1.8912404775619507, + "learning_rate": 3.4765554038871607e-06, + "loss": 1.2256, + "step": 25643 + }, + { + "epoch": 0.9183662506490948, + "grad_norm": 1.5284357070922852, + "learning_rate": 3.4735242375177777e-06, + "loss": 1.577, + "step": 25644 + }, + { + "epoch": 0.9184020627786631, + "grad_norm": 1.3570201396942139, + "learning_rate": 3.470494369787636e-06, + "loss": 1.508, + "step": 25645 + }, + { + "epoch": 0.9184378749082314, + "grad_norm": 2.326610803604126, + "learning_rate": 3.4674658007374683e-06, + "loss": 1.3932, + "step": 25646 + }, + { + "epoch": 0.9184736870377997, + "grad_norm": 1.5897072553634644, + "learning_rate": 3.464438530408043e-06, + "loss": 1.6262, + "step": 25647 + }, + { + "epoch": 0.918509499167368, + "grad_norm": 1.651442289352417, + "learning_rate": 3.46141255884006e-06, + "loss": 1.0848, + "step": 25648 + }, + { + "epoch": 0.9185453112969363, + "grad_norm": 1.5025795698165894, + "learning_rate": 3.4583878860742434e-06, + "loss": 1.5907, + "step": 25649 + }, + { + "epoch": 0.9185811234265046, + "grad_norm": 2.048462152481079, + "learning_rate": 3.4553645121513046e-06, + "loss": 1.8224, + "step": 25650 + }, + { + "epoch": 0.9186169355560728, + "grad_norm": 1.4465118646621704, + "learning_rate": 3.4523424371118885e-06, + "loss": 1.4116, + "step": 25651 + }, + { + "epoch": 0.9186527476856411, + "grad_norm": 2.2787973880767822, + "learning_rate": 3.449321660996674e-06, + "loss": 1.4564, + "step": 25652 + }, + { + "epoch": 0.9186885598152094, + "grad_norm": 1.6836674213409424, + "learning_rate": 3.446302183846295e-06, + "loss": 1.1173, + "step": 25653 + }, + { + "epoch": 0.9187243719447777, + "grad_norm": 1.779667615890503, + "learning_rate": 3.443284005701375e-06, + "loss": 1.5232, + "step": 25654 + }, + { + "epoch": 0.918760184074346, + "grad_norm": 1.2427458763122559, + "learning_rate": 3.4402671266025253e-06, + "loss": 1.2979, + "step": 25655 + }, + { + "epoch": 0.9187959962039143, + "grad_norm": 1.405266523361206, + "learning_rate": 3.4372515465903145e-06, + "loss": 1.5639, + "step": 25656 + }, + { + "epoch": 0.9188318083334825, + "grad_norm": 1.3440146446228027, + "learning_rate": 3.434237265705342e-06, + "loss": 1.3727, + "step": 25657 + }, + { + "epoch": 0.9188676204630508, + "grad_norm": 1.969915747642517, + "learning_rate": 3.4312242839881325e-06, + "loss": 1.0968, + "step": 25658 + }, + { + "epoch": 0.9189034325926191, + "grad_norm": 1.9113439321517944, + "learning_rate": 3.4282126014792414e-06, + "loss": 1.6633, + "step": 25659 + }, + { + "epoch": 0.9189392447221874, + "grad_norm": 1.5810271501541138, + "learning_rate": 3.4252022182191813e-06, + "loss": 1.4188, + "step": 25660 + }, + { + "epoch": 0.9189750568517557, + "grad_norm": 2.1815831661224365, + "learning_rate": 3.4221931342484525e-06, + "loss": 1.7841, + "step": 25661 + }, + { + "epoch": 0.919010868981324, + "grad_norm": 1.472642183303833, + "learning_rate": 3.4191853496075343e-06, + "loss": 1.4584, + "step": 25662 + }, + { + "epoch": 0.9190466811108923, + "grad_norm": 1.7633564472198486, + "learning_rate": 3.4161788643369052e-06, + "loss": 1.2982, + "step": 25663 + }, + { + "epoch": 0.9190824932404605, + "grad_norm": 2.459702730178833, + "learning_rate": 3.4131736784769996e-06, + "loss": 1.5756, + "step": 25664 + }, + { + "epoch": 0.9191183053700288, + "grad_norm": 1.4552972316741943, + "learning_rate": 3.410169792068263e-06, + "loss": 1.2331, + "step": 25665 + }, + { + "epoch": 0.9191541174995971, + "grad_norm": 1.558521032333374, + "learning_rate": 3.407167205151085e-06, + "loss": 1.194, + "step": 25666 + }, + { + "epoch": 0.9191899296291653, + "grad_norm": 1.9817105531692505, + "learning_rate": 3.404165917765889e-06, + "loss": 1.4798, + "step": 25667 + }, + { + "epoch": 0.9192257417587337, + "grad_norm": 1.6500427722930908, + "learning_rate": 3.401165929953043e-06, + "loss": 1.4384, + "step": 25668 + }, + { + "epoch": 0.919261553888302, + "grad_norm": 1.5914006233215332, + "learning_rate": 3.398167241752892e-06, + "loss": 1.521, + "step": 25669 + }, + { + "epoch": 0.9192973660178703, + "grad_norm": 1.3515881299972534, + "learning_rate": 3.395169853205793e-06, + "loss": 1.5902, + "step": 25670 + }, + { + "epoch": 0.9193331781474385, + "grad_norm": 1.7092448472976685, + "learning_rate": 3.3921737643520803e-06, + "loss": 1.537, + "step": 25671 + }, + { + "epoch": 0.9193689902770068, + "grad_norm": 1.3170454502105713, + "learning_rate": 3.3891789752320656e-06, + "loss": 1.3537, + "step": 25672 + }, + { + "epoch": 0.9194048024065751, + "grad_norm": 2.0973784923553467, + "learning_rate": 3.3861854858860177e-06, + "loss": 1.5353, + "step": 25673 + }, + { + "epoch": 0.9194406145361433, + "grad_norm": 1.5266790390014648, + "learning_rate": 3.3831932963542147e-06, + "loss": 1.3477, + "step": 25674 + }, + { + "epoch": 0.9194764266657117, + "grad_norm": 1.6230286359786987, + "learning_rate": 3.3802024066769355e-06, + "loss": 1.3931, + "step": 25675 + }, + { + "epoch": 0.91951223879528, + "grad_norm": 2.351484775543213, + "learning_rate": 3.3772128168943816e-06, + "loss": 1.5749, + "step": 25676 + }, + { + "epoch": 0.9195480509248483, + "grad_norm": 2.194748878479004, + "learning_rate": 3.37422452704681e-06, + "loss": 1.4503, + "step": 25677 + }, + { + "epoch": 0.9195838630544165, + "grad_norm": 1.614140510559082, + "learning_rate": 3.3712375371743987e-06, + "loss": 1.5758, + "step": 25678 + }, + { + "epoch": 0.9196196751839848, + "grad_norm": 2.11362624168396, + "learning_rate": 3.3682518473173607e-06, + "loss": 1.3219, + "step": 25679 + }, + { + "epoch": 0.9196554873135531, + "grad_norm": 2.002917766571045, + "learning_rate": 3.3652674575158306e-06, + "loss": 1.2815, + "step": 25680 + }, + { + "epoch": 0.9196912994431213, + "grad_norm": 1.6976182460784912, + "learning_rate": 3.362284367809976e-06, + "loss": 1.3635, + "step": 25681 + }, + { + "epoch": 0.9197271115726897, + "grad_norm": 1.6870442628860474, + "learning_rate": 3.3593025782399424e-06, + "loss": 1.2197, + "step": 25682 + }, + { + "epoch": 0.919762923702258, + "grad_norm": 2.186999797821045, + "learning_rate": 3.3563220888458425e-06, + "loss": 1.3623, + "step": 25683 + }, + { + "epoch": 0.9197987358318263, + "grad_norm": 1.727008581161499, + "learning_rate": 3.353342899667755e-06, + "loss": 1.2574, + "step": 25684 + }, + { + "epoch": 0.9198345479613945, + "grad_norm": 1.7052570581436157, + "learning_rate": 3.3503650107457706e-06, + "loss": 1.7394, + "step": 25685 + }, + { + "epoch": 0.9198703600909628, + "grad_norm": 1.6224033832550049, + "learning_rate": 3.347388422119968e-06, + "loss": 1.3684, + "step": 25686 + }, + { + "epoch": 0.9199061722205311, + "grad_norm": 1.8566479682922363, + "learning_rate": 3.3444131338303708e-06, + "loss": 1.127, + "step": 25687 + }, + { + "epoch": 0.9199419843500993, + "grad_norm": 2.9639549255371094, + "learning_rate": 3.3414391459170134e-06, + "loss": 1.5899, + "step": 25688 + }, + { + "epoch": 0.9199777964796677, + "grad_norm": 1.5856989622116089, + "learning_rate": 3.33846645841992e-06, + "loss": 1.2898, + "step": 25689 + }, + { + "epoch": 0.920013608609236, + "grad_norm": 1.1478488445281982, + "learning_rate": 3.33549507137908e-06, + "loss": 1.3804, + "step": 25690 + }, + { + "epoch": 0.9200494207388042, + "grad_norm": 1.6433991193771362, + "learning_rate": 3.332524984834462e-06, + "loss": 1.6188, + "step": 25691 + }, + { + "epoch": 0.9200852328683725, + "grad_norm": 1.6301097869873047, + "learning_rate": 3.3295561988260227e-06, + "loss": 1.6194, + "step": 25692 + }, + { + "epoch": 0.9201210449979408, + "grad_norm": 3.6255156993865967, + "learning_rate": 3.326588713393719e-06, + "loss": 1.531, + "step": 25693 + }, + { + "epoch": 0.920156857127509, + "grad_norm": 1.6354080438613892, + "learning_rate": 3.3236225285774637e-06, + "loss": 1.4595, + "step": 25694 + }, + { + "epoch": 0.9201926692570773, + "grad_norm": 1.5773730278015137, + "learning_rate": 3.3206576444171577e-06, + "loss": 1.2759, + "step": 25695 + }, + { + "epoch": 0.9202284813866457, + "grad_norm": 1.889205813407898, + "learning_rate": 3.317694060952692e-06, + "loss": 1.1222, + "step": 25696 + }, + { + "epoch": 0.920264293516214, + "grad_norm": 1.4370874166488647, + "learning_rate": 3.314731778223956e-06, + "loss": 1.443, + "step": 25697 + }, + { + "epoch": 0.9203001056457822, + "grad_norm": 1.4879106283187866, + "learning_rate": 3.3117707962707746e-06, + "loss": 1.5812, + "step": 25698 + }, + { + "epoch": 0.9203359177753505, + "grad_norm": 1.411486268043518, + "learning_rate": 3.308811115133004e-06, + "loss": 1.5091, + "step": 25699 + }, + { + "epoch": 0.9203717299049188, + "grad_norm": 2.297484874725342, + "learning_rate": 3.3058527348504455e-06, + "loss": 1.1828, + "step": 25700 + }, + { + "epoch": 0.920407542034487, + "grad_norm": 1.640698790550232, + "learning_rate": 3.302895655462934e-06, + "loss": 1.6501, + "step": 25701 + }, + { + "epoch": 0.9204433541640553, + "grad_norm": 1.9217278957366943, + "learning_rate": 3.2999398770102276e-06, + "loss": 1.2598, + "step": 25702 + }, + { + "epoch": 0.9204791662936237, + "grad_norm": 1.5686508417129517, + "learning_rate": 3.296985399532071e-06, + "loss": 1.4808, + "step": 25703 + }, + { + "epoch": 0.920514978423192, + "grad_norm": 1.4743748903274536, + "learning_rate": 3.2940322230682664e-06, + "loss": 1.2292, + "step": 25704 + }, + { + "epoch": 0.9205507905527602, + "grad_norm": 1.3531376123428345, + "learning_rate": 3.291080347658504e-06, + "loss": 1.2794, + "step": 25705 + }, + { + "epoch": 0.9205866026823285, + "grad_norm": 1.6078829765319824, + "learning_rate": 3.2881297733425188e-06, + "loss": 1.4727, + "step": 25706 + }, + { + "epoch": 0.9206224148118968, + "grad_norm": 1.4829717874526978, + "learning_rate": 3.285180500159979e-06, + "loss": 1.5516, + "step": 25707 + }, + { + "epoch": 0.920658226941465, + "grad_norm": 1.7333753108978271, + "learning_rate": 3.2822325281505973e-06, + "loss": 1.3617, + "step": 25708 + }, + { + "epoch": 0.9206940390710333, + "grad_norm": 1.9004656076431274, + "learning_rate": 3.27928585735402e-06, + "loss": 1.8051, + "step": 25709 + }, + { + "epoch": 0.9207298512006017, + "grad_norm": 1.6876628398895264, + "learning_rate": 3.2763404878098815e-06, + "loss": 1.4722, + "step": 25710 + }, + { + "epoch": 0.92076566333017, + "grad_norm": 1.6359210014343262, + "learning_rate": 3.273396419557839e-06, + "loss": 1.2267, + "step": 25711 + }, + { + "epoch": 0.9208014754597382, + "grad_norm": 1.5168014764785767, + "learning_rate": 3.2704536526374506e-06, + "loss": 1.3787, + "step": 25712 + }, + { + "epoch": 0.9208372875893065, + "grad_norm": 2.176189661026001, + "learning_rate": 3.267512187088362e-06, + "loss": 1.6466, + "step": 25713 + }, + { + "epoch": 0.9208730997188748, + "grad_norm": 1.7817593812942505, + "learning_rate": 3.2645720229500965e-06, + "loss": 1.7843, + "step": 25714 + }, + { + "epoch": 0.920908911848443, + "grad_norm": 1.368715524673462, + "learning_rate": 3.2616331602622565e-06, + "loss": 1.5336, + "step": 25715 + }, + { + "epoch": 0.9209447239780113, + "grad_norm": 1.853217363357544, + "learning_rate": 3.2586955990643432e-06, + "loss": 1.4965, + "step": 25716 + }, + { + "epoch": 0.9209805361075797, + "grad_norm": 2.11053729057312, + "learning_rate": 3.255759339395903e-06, + "loss": 1.5349, + "step": 25717 + }, + { + "epoch": 0.921016348237148, + "grad_norm": 1.5958489179611206, + "learning_rate": 3.2528243812964156e-06, + "loss": 1.5431, + "step": 25718 + }, + { + "epoch": 0.9210521603667162, + "grad_norm": 1.3022187948226929, + "learning_rate": 3.2498907248054045e-06, + "loss": 1.6714, + "step": 25719 + }, + { + "epoch": 0.9210879724962845, + "grad_norm": 1.4858121871948242, + "learning_rate": 3.2469583699623053e-06, + "loss": 1.5852, + "step": 25720 + }, + { + "epoch": 0.9211237846258528, + "grad_norm": 2.0671451091766357, + "learning_rate": 3.2440273168065636e-06, + "loss": 1.0612, + "step": 25721 + }, + { + "epoch": 0.921159596755421, + "grad_norm": 1.670335054397583, + "learning_rate": 3.241097565377649e-06, + "loss": 1.5164, + "step": 25722 + }, + { + "epoch": 0.9211954088849893, + "grad_norm": 1.7597389221191406, + "learning_rate": 3.2381691157149395e-06, + "loss": 1.4661, + "step": 25723 + }, + { + "epoch": 0.9212312210145577, + "grad_norm": 1.881030797958374, + "learning_rate": 3.2352419678578714e-06, + "loss": 1.3387, + "step": 25724 + }, + { + "epoch": 0.921267033144126, + "grad_norm": 1.9583910703659058, + "learning_rate": 3.2323161218457796e-06, + "loss": 1.5953, + "step": 25725 + }, + { + "epoch": 0.9213028452736942, + "grad_norm": 1.6051225662231445, + "learning_rate": 3.229391577718066e-06, + "loss": 1.2554, + "step": 25726 + }, + { + "epoch": 0.9213386574032625, + "grad_norm": 1.7156764268875122, + "learning_rate": 3.226468335514077e-06, + "loss": 1.4546, + "step": 25727 + }, + { + "epoch": 0.9213744695328308, + "grad_norm": 1.7657434940338135, + "learning_rate": 3.223546395273114e-06, + "loss": 1.3154, + "step": 25728 + }, + { + "epoch": 0.921410281662399, + "grad_norm": 1.699459433555603, + "learning_rate": 3.220625757034501e-06, + "loss": 1.4964, + "step": 25729 + }, + { + "epoch": 0.9214460937919673, + "grad_norm": 1.7240272760391235, + "learning_rate": 3.2177064208375298e-06, + "loss": 1.6393, + "step": 25730 + }, + { + "epoch": 0.9214819059215357, + "grad_norm": 1.9153105020523071, + "learning_rate": 3.21478838672149e-06, + "loss": 1.7071, + "step": 25731 + }, + { + "epoch": 0.9215177180511039, + "grad_norm": 2.9040825366973877, + "learning_rate": 3.211871654725618e-06, + "loss": 1.7588, + "step": 25732 + }, + { + "epoch": 0.9215535301806722, + "grad_norm": 1.6486643552780151, + "learning_rate": 3.208956224889159e-06, + "loss": 1.3715, + "step": 25733 + }, + { + "epoch": 0.9215893423102405, + "grad_norm": 1.6218197345733643, + "learning_rate": 3.2060420972513494e-06, + "loss": 1.3616, + "step": 25734 + }, + { + "epoch": 0.9216251544398087, + "grad_norm": 1.4166431427001953, + "learning_rate": 3.203129271851402e-06, + "loss": 1.2931, + "step": 25735 + }, + { + "epoch": 0.921660966569377, + "grad_norm": 1.4935784339904785, + "learning_rate": 3.2002177487284736e-06, + "loss": 1.3154, + "step": 25736 + }, + { + "epoch": 0.9216967786989453, + "grad_norm": 1.6623226404190063, + "learning_rate": 3.197307527921756e-06, + "loss": 1.4607, + "step": 25737 + }, + { + "epoch": 0.9217325908285137, + "grad_norm": 1.6059545278549194, + "learning_rate": 3.194398609470406e-06, + "loss": 1.4489, + "step": 25738 + }, + { + "epoch": 0.9217684029580819, + "grad_norm": 1.5388230085372925, + "learning_rate": 3.1914909934135483e-06, + "loss": 1.2268, + "step": 25739 + }, + { + "epoch": 0.9218042150876502, + "grad_norm": 1.3560782670974731, + "learning_rate": 3.1885846797902964e-06, + "loss": 1.3344, + "step": 25740 + }, + { + "epoch": 0.9218400272172185, + "grad_norm": 1.4934988021850586, + "learning_rate": 3.185679668639763e-06, + "loss": 1.4104, + "step": 25741 + }, + { + "epoch": 0.9218758393467867, + "grad_norm": 1.6801812648773193, + "learning_rate": 3.1827759600010498e-06, + "loss": 1.6805, + "step": 25742 + }, + { + "epoch": 0.921911651476355, + "grad_norm": 1.7978261709213257, + "learning_rate": 3.179873553913171e-06, + "loss": 1.5292, + "step": 25743 + }, + { + "epoch": 0.9219474636059233, + "grad_norm": 1.6324962377548218, + "learning_rate": 3.1769724504152164e-06, + "loss": 1.1737, + "step": 25744 + }, + { + "epoch": 0.9219832757354917, + "grad_norm": 1.4262455701828003, + "learning_rate": 3.1740726495462223e-06, + "loss": 1.3579, + "step": 25745 + }, + { + "epoch": 0.9220190878650599, + "grad_norm": 1.422607183456421, + "learning_rate": 3.1711741513451576e-06, + "loss": 1.2943, + "step": 25746 + }, + { + "epoch": 0.9220548999946282, + "grad_norm": 1.3137279748916626, + "learning_rate": 3.1682769558510574e-06, + "loss": 1.6224, + "step": 25747 + }, + { + "epoch": 0.9220907121241965, + "grad_norm": 1.8328073024749756, + "learning_rate": 3.165381063102879e-06, + "loss": 1.3839, + "step": 25748 + }, + { + "epoch": 0.9221265242537647, + "grad_norm": 1.4281005859375, + "learning_rate": 3.162486473139603e-06, + "loss": 1.2144, + "step": 25749 + }, + { + "epoch": 0.922162336383333, + "grad_norm": 1.3970149755477905, + "learning_rate": 3.1595931860001536e-06, + "loss": 1.2464, + "step": 25750 + }, + { + "epoch": 0.9221981485129013, + "grad_norm": 1.5348528623580933, + "learning_rate": 3.1567012017234553e-06, + "loss": 1.4653, + "step": 25751 + }, + { + "epoch": 0.9222339606424697, + "grad_norm": 2.080173969268799, + "learning_rate": 3.1538105203484323e-06, + "loss": 1.4834, + "step": 25752 + }, + { + "epoch": 0.9222697727720379, + "grad_norm": 1.4271656274795532, + "learning_rate": 3.150921141913965e-06, + "loss": 1.6015, + "step": 25753 + }, + { + "epoch": 0.9223055849016062, + "grad_norm": 1.562336802482605, + "learning_rate": 3.148033066458933e-06, + "loss": 1.8834, + "step": 25754 + }, + { + "epoch": 0.9223413970311745, + "grad_norm": 1.3763641119003296, + "learning_rate": 3.145146294022172e-06, + "loss": 1.3171, + "step": 25755 + }, + { + "epoch": 0.9223772091607427, + "grad_norm": 1.3976991176605225, + "learning_rate": 3.1422608246425513e-06, + "loss": 1.185, + "step": 25756 + }, + { + "epoch": 0.922413021290311, + "grad_norm": 1.4210984706878662, + "learning_rate": 3.1393766583588614e-06, + "loss": 1.6143, + "step": 25757 + }, + { + "epoch": 0.9224488334198793, + "grad_norm": 1.7500970363616943, + "learning_rate": 3.136493795209916e-06, + "loss": 1.702, + "step": 25758 + }, + { + "epoch": 0.9224846455494476, + "grad_norm": 2.02065372467041, + "learning_rate": 3.1336122352345065e-06, + "loss": 1.5751, + "step": 25759 + }, + { + "epoch": 0.9225204576790159, + "grad_norm": 1.553234338760376, + "learning_rate": 3.130731978471402e-06, + "loss": 1.1833, + "step": 25760 + }, + { + "epoch": 0.9225562698085842, + "grad_norm": 1.5323034524917603, + "learning_rate": 3.1278530249593372e-06, + "loss": 1.5451, + "step": 25761 + }, + { + "epoch": 0.9225920819381525, + "grad_norm": 1.3176145553588867, + "learning_rate": 3.124975374737049e-06, + "loss": 1.055, + "step": 25762 + }, + { + "epoch": 0.9226278940677207, + "grad_norm": 2.0297014713287354, + "learning_rate": 3.1220990278432727e-06, + "loss": 1.3966, + "step": 25763 + }, + { + "epoch": 0.922663706197289, + "grad_norm": 1.19528067111969, + "learning_rate": 3.119223984316677e-06, + "loss": 1.4489, + "step": 25764 + }, + { + "epoch": 0.9226995183268573, + "grad_norm": 1.6679394245147705, + "learning_rate": 3.1163502441959647e-06, + "loss": 1.1535, + "step": 25765 + }, + { + "epoch": 0.9227353304564256, + "grad_norm": 1.6907655000686646, + "learning_rate": 3.113477807519782e-06, + "loss": 1.6175, + "step": 25766 + }, + { + "epoch": 0.9227711425859939, + "grad_norm": 1.7465883493423462, + "learning_rate": 3.110606674326788e-06, + "loss": 1.6063, + "step": 25767 + }, + { + "epoch": 0.9228069547155622, + "grad_norm": 1.6468842029571533, + "learning_rate": 3.1077368446555956e-06, + "loss": 1.378, + "step": 25768 + }, + { + "epoch": 0.9228427668451304, + "grad_norm": 1.6969002485275269, + "learning_rate": 3.104868318544818e-06, + "loss": 1.4272, + "step": 25769 + }, + { + "epoch": 0.9228785789746987, + "grad_norm": 1.4377319812774658, + "learning_rate": 3.1020010960330583e-06, + "loss": 1.6473, + "step": 25770 + }, + { + "epoch": 0.922914391104267, + "grad_norm": 2.1896963119506836, + "learning_rate": 3.0991351771588963e-06, + "loss": 1.2011, + "step": 25771 + }, + { + "epoch": 0.9229502032338353, + "grad_norm": 1.6838210821151733, + "learning_rate": 3.0962705619608565e-06, + "loss": 1.372, + "step": 25772 + }, + { + "epoch": 0.9229860153634036, + "grad_norm": 1.592898964881897, + "learning_rate": 3.093407250477509e-06, + "loss": 1.4824, + "step": 25773 + }, + { + "epoch": 0.9230218274929719, + "grad_norm": 1.7946579456329346, + "learning_rate": 3.0905452427473667e-06, + "loss": 1.4023, + "step": 25774 + }, + { + "epoch": 0.9230576396225402, + "grad_norm": 2.0756218433380127, + "learning_rate": 3.0876845388089327e-06, + "loss": 1.4185, + "step": 25775 + }, + { + "epoch": 0.9230934517521084, + "grad_norm": 1.2983440160751343, + "learning_rate": 3.084825138700698e-06, + "loss": 1.4131, + "step": 25776 + }, + { + "epoch": 0.9231292638816767, + "grad_norm": 2.0300850868225098, + "learning_rate": 3.08196704246112e-06, + "loss": 1.5474, + "step": 25777 + }, + { + "epoch": 0.923165076011245, + "grad_norm": 2.3360812664031982, + "learning_rate": 3.0791102501286804e-06, + "loss": 1.1895, + "step": 25778 + }, + { + "epoch": 0.9232008881408132, + "grad_norm": 1.792782187461853, + "learning_rate": 3.0762547617417703e-06, + "loss": 1.308, + "step": 25779 + }, + { + "epoch": 0.9232367002703816, + "grad_norm": 1.6354910135269165, + "learning_rate": 3.0734005773388364e-06, + "loss": 1.405, + "step": 25780 + }, + { + "epoch": 0.9232725123999499, + "grad_norm": 1.6714788675308228, + "learning_rate": 3.0705476969582813e-06, + "loss": 1.2023, + "step": 25781 + }, + { + "epoch": 0.9233083245295182, + "grad_norm": 1.793521523475647, + "learning_rate": 3.0676961206384746e-06, + "loss": 1.3668, + "step": 25782 + }, + { + "epoch": 0.9233441366590864, + "grad_norm": 1.6857377290725708, + "learning_rate": 3.0648458484177746e-06, + "loss": 1.5258, + "step": 25783 + }, + { + "epoch": 0.9233799487886547, + "grad_norm": 1.6202548742294312, + "learning_rate": 3.061996880334539e-06, + "loss": 1.2652, + "step": 25784 + }, + { + "epoch": 0.923415760918223, + "grad_norm": 1.563083291053772, + "learning_rate": 3.059149216427104e-06, + "loss": 1.5697, + "step": 25785 + }, + { + "epoch": 0.9234515730477912, + "grad_norm": 1.3261644840240479, + "learning_rate": 3.0563028567337614e-06, + "loss": 1.38, + "step": 25786 + }, + { + "epoch": 0.9234873851773596, + "grad_norm": 1.2361618280410767, + "learning_rate": 3.053457801292814e-06, + "loss": 1.3723, + "step": 25787 + }, + { + "epoch": 0.9235231973069279, + "grad_norm": 1.499376893043518, + "learning_rate": 3.0506140501425417e-06, + "loss": 1.4285, + "step": 25788 + }, + { + "epoch": 0.9235590094364962, + "grad_norm": 1.281010389328003, + "learning_rate": 3.0477716033212032e-06, + "loss": 1.4848, + "step": 25789 + }, + { + "epoch": 0.9235948215660644, + "grad_norm": 2.0112226009368896, + "learning_rate": 3.044930460867046e-06, + "loss": 1.4768, + "step": 25790 + }, + { + "epoch": 0.9236306336956327, + "grad_norm": 1.588922142982483, + "learning_rate": 3.042090622818272e-06, + "loss": 1.2698, + "step": 25791 + }, + { + "epoch": 0.923666445825201, + "grad_norm": 1.5995523929595947, + "learning_rate": 3.039252089213118e-06, + "loss": 1.3193, + "step": 25792 + }, + { + "epoch": 0.9237022579547692, + "grad_norm": 2.97119140625, + "learning_rate": 3.0364148600897423e-06, + "loss": 1.5305, + "step": 25793 + }, + { + "epoch": 0.9237380700843376, + "grad_norm": 2.8016178607940674, + "learning_rate": 3.0335789354863362e-06, + "loss": 1.5246, + "step": 25794 + }, + { + "epoch": 0.9237738822139059, + "grad_norm": 1.5737606287002563, + "learning_rate": 3.0307443154410365e-06, + "loss": 1.42, + "step": 25795 + }, + { + "epoch": 0.9238096943434742, + "grad_norm": 1.4103105068206787, + "learning_rate": 3.027910999992012e-06, + "loss": 1.4139, + "step": 25796 + }, + { + "epoch": 0.9238455064730424, + "grad_norm": 1.6987563371658325, + "learning_rate": 3.0250789891773433e-06, + "loss": 1.6168, + "step": 25797 + }, + { + "epoch": 0.9238813186026107, + "grad_norm": 1.6376010179519653, + "learning_rate": 3.022248283035156e-06, + "loss": 1.3091, + "step": 25798 + }, + { + "epoch": 0.923917130732179, + "grad_norm": 1.1662794351577759, + "learning_rate": 3.0194188816035305e-06, + "loss": 1.2575, + "step": 25799 + }, + { + "epoch": 0.9239529428617472, + "grad_norm": 1.8269308805465698, + "learning_rate": 3.0165907849205254e-06, + "loss": 1.2645, + "step": 25800 + }, + { + "epoch": 0.9239887549913156, + "grad_norm": 1.7312307357788086, + "learning_rate": 3.013763993024188e-06, + "loss": 1.2861, + "step": 25801 + }, + { + "epoch": 0.9240245671208839, + "grad_norm": 1.7468078136444092, + "learning_rate": 3.010938505952543e-06, + "loss": 1.4326, + "step": 25802 + }, + { + "epoch": 0.9240603792504521, + "grad_norm": 1.40182363986969, + "learning_rate": 3.008114323743627e-06, + "loss": 1.2857, + "step": 25803 + }, + { + "epoch": 0.9240961913800204, + "grad_norm": 1.6253925561904907, + "learning_rate": 3.005291446435421e-06, + "loss": 1.4106, + "step": 25804 + }, + { + "epoch": 0.9241320035095887, + "grad_norm": 1.5819063186645508, + "learning_rate": 3.002469874065894e-06, + "loss": 1.2463, + "step": 25805 + }, + { + "epoch": 0.924167815639157, + "grad_norm": 1.8966995477676392, + "learning_rate": 2.999649606673027e-06, + "loss": 1.4093, + "step": 25806 + }, + { + "epoch": 0.9242036277687252, + "grad_norm": 1.4328267574310303, + "learning_rate": 2.996830644294757e-06, + "loss": 1.2878, + "step": 25807 + }, + { + "epoch": 0.9242394398982936, + "grad_norm": 2.1859099864959717, + "learning_rate": 2.994012986969008e-06, + "loss": 1.3069, + "step": 25808 + }, + { + "epoch": 0.9242752520278619, + "grad_norm": 1.6977571249008179, + "learning_rate": 2.991196634733662e-06, + "loss": 1.162, + "step": 25809 + }, + { + "epoch": 0.9243110641574301, + "grad_norm": 1.3135548830032349, + "learning_rate": 2.9883815876266653e-06, + "loss": 1.3733, + "step": 25810 + }, + { + "epoch": 0.9243468762869984, + "grad_norm": 1.7272981405258179, + "learning_rate": 2.985567845685833e-06, + "loss": 1.5297, + "step": 25811 + }, + { + "epoch": 0.9243826884165667, + "grad_norm": 1.472185492515564, + "learning_rate": 2.982755408949067e-06, + "loss": 1.3625, + "step": 25812 + }, + { + "epoch": 0.924418500546135, + "grad_norm": 1.6928966045379639, + "learning_rate": 2.97994427745415e-06, + "loss": 1.4356, + "step": 25813 + }, + { + "epoch": 0.9244543126757032, + "grad_norm": 1.5978893041610718, + "learning_rate": 2.977134451238972e-06, + "loss": 1.322, + "step": 25814 + }, + { + "epoch": 0.9244901248052716, + "grad_norm": 1.2723480463027954, + "learning_rate": 2.9743259303412707e-06, + "loss": 1.3138, + "step": 25815 + }, + { + "epoch": 0.9245259369348399, + "grad_norm": 1.316504716873169, + "learning_rate": 2.9715187147988823e-06, + "loss": 1.177, + "step": 25816 + }, + { + "epoch": 0.9245617490644081, + "grad_norm": 1.8215168714523315, + "learning_rate": 2.968712804649543e-06, + "loss": 1.5501, + "step": 25817 + }, + { + "epoch": 0.9245975611939764, + "grad_norm": 1.5576127767562866, + "learning_rate": 2.9659081999310112e-06, + "loss": 1.3463, + "step": 25818 + }, + { + "epoch": 0.9246333733235447, + "grad_norm": 1.40639328956604, + "learning_rate": 2.9631049006810243e-06, + "loss": 1.1919, + "step": 25819 + }, + { + "epoch": 0.9246691854531129, + "grad_norm": 1.4003286361694336, + "learning_rate": 2.9603029069372733e-06, + "loss": 1.5546, + "step": 25820 + }, + { + "epoch": 0.9247049975826812, + "grad_norm": 1.6173608303070068, + "learning_rate": 2.9575022187374958e-06, + "loss": 1.5284, + "step": 25821 + }, + { + "epoch": 0.9247408097122496, + "grad_norm": 1.644357442855835, + "learning_rate": 2.9547028361193495e-06, + "loss": 1.3983, + "step": 25822 + }, + { + "epoch": 0.9247766218418179, + "grad_norm": 1.6090519428253174, + "learning_rate": 2.951904759120494e-06, + "loss": 1.4281, + "step": 25823 + }, + { + "epoch": 0.9248124339713861, + "grad_norm": 1.6314555406570435, + "learning_rate": 2.9491079877785767e-06, + "loss": 1.3319, + "step": 25824 + }, + { + "epoch": 0.9248482461009544, + "grad_norm": 1.707886815071106, + "learning_rate": 2.9463125221312117e-06, + "loss": 1.2501, + "step": 25825 + }, + { + "epoch": 0.9248840582305227, + "grad_norm": 1.4632911682128906, + "learning_rate": 2.9435183622160465e-06, + "loss": 1.526, + "step": 25826 + }, + { + "epoch": 0.9249198703600909, + "grad_norm": 1.7945195436477661, + "learning_rate": 2.9407255080706297e-06, + "loss": 1.3357, + "step": 25827 + }, + { + "epoch": 0.9249556824896592, + "grad_norm": 1.7915374040603638, + "learning_rate": 2.937933959732553e-06, + "loss": 1.5678, + "step": 25828 + }, + { + "epoch": 0.9249914946192276, + "grad_norm": 1.4077473878860474, + "learning_rate": 2.9351437172393746e-06, + "loss": 1.5492, + "step": 25829 + }, + { + "epoch": 0.9250273067487959, + "grad_norm": 1.4841781854629517, + "learning_rate": 2.9323547806286432e-06, + "loss": 1.391, + "step": 25830 + }, + { + "epoch": 0.9250631188783641, + "grad_norm": 2.442631721496582, + "learning_rate": 2.9295671499378506e-06, + "loss": 1.6424, + "step": 25831 + }, + { + "epoch": 0.9250989310079324, + "grad_norm": 1.5331733226776123, + "learning_rate": 2.9267808252045338e-06, + "loss": 1.4525, + "step": 25832 + }, + { + "epoch": 0.9251347431375007, + "grad_norm": 2.0141773223876953, + "learning_rate": 2.923995806466173e-06, + "loss": 1.367, + "step": 25833 + }, + { + "epoch": 0.9251705552670689, + "grad_norm": 1.460148572921753, + "learning_rate": 2.9212120937602174e-06, + "loss": 1.5702, + "step": 25834 + }, + { + "epoch": 0.9252063673966372, + "grad_norm": 1.5255372524261475, + "learning_rate": 2.9184296871241357e-06, + "loss": 1.3125, + "step": 25835 + }, + { + "epoch": 0.9252421795262056, + "grad_norm": 1.992004156112671, + "learning_rate": 2.9156485865953544e-06, + "loss": 1.5373, + "step": 25836 + }, + { + "epoch": 0.9252779916557738, + "grad_norm": 1.4714744091033936, + "learning_rate": 2.9128687922112987e-06, + "loss": 1.3717, + "step": 25837 + }, + { + "epoch": 0.9253138037853421, + "grad_norm": 1.6506719589233398, + "learning_rate": 2.91009030400935e-06, + "loss": 1.3876, + "step": 25838 + }, + { + "epoch": 0.9253496159149104, + "grad_norm": 1.959061622619629, + "learning_rate": 2.9073131220269e-06, + "loss": 1.2011, + "step": 25839 + }, + { + "epoch": 0.9253854280444787, + "grad_norm": 1.3772485256195068, + "learning_rate": 2.9045372463013088e-06, + "loss": 1.3579, + "step": 25840 + }, + { + "epoch": 0.9254212401740469, + "grad_norm": 2.8980002403259277, + "learning_rate": 2.9017626768699346e-06, + "loss": 1.3286, + "step": 25841 + }, + { + "epoch": 0.9254570523036152, + "grad_norm": 2.0866899490356445, + "learning_rate": 2.8989894137700924e-06, + "loss": 1.7091, + "step": 25842 + }, + { + "epoch": 0.9254928644331836, + "grad_norm": 1.37162446975708, + "learning_rate": 2.8962174570390965e-06, + "loss": 1.5254, + "step": 25843 + }, + { + "epoch": 0.9255286765627518, + "grad_norm": 1.817568063735962, + "learning_rate": 2.8934468067142396e-06, + "loss": 1.1529, + "step": 25844 + }, + { + "epoch": 0.9255644886923201, + "grad_norm": 1.9801509380340576, + "learning_rate": 2.8906774628327917e-06, + "loss": 1.5762, + "step": 25845 + }, + { + "epoch": 0.9256003008218884, + "grad_norm": 1.6975454092025757, + "learning_rate": 2.8879094254320225e-06, + "loss": 1.2415, + "step": 25846 + }, + { + "epoch": 0.9256361129514566, + "grad_norm": 2.336732864379883, + "learning_rate": 2.8851426945491588e-06, + "loss": 1.4693, + "step": 25847 + }, + { + "epoch": 0.9256719250810249, + "grad_norm": 1.9778311252593994, + "learning_rate": 2.882377270221448e-06, + "loss": 1.4825, + "step": 25848 + }, + { + "epoch": 0.9257077372105932, + "grad_norm": 1.873709797859192, + "learning_rate": 2.8796131524860603e-06, + "loss": 1.408, + "step": 25849 + }, + { + "epoch": 0.9257435493401616, + "grad_norm": 1.574170470237732, + "learning_rate": 2.8768503413802108e-06, + "loss": 1.3283, + "step": 25850 + }, + { + "epoch": 0.9257793614697298, + "grad_norm": 1.7731971740722656, + "learning_rate": 2.8740888369410577e-06, + "loss": 1.6502, + "step": 25851 + }, + { + "epoch": 0.9258151735992981, + "grad_norm": 1.3365644216537476, + "learning_rate": 2.8713286392057614e-06, + "loss": 1.4837, + "step": 25852 + }, + { + "epoch": 0.9258509857288664, + "grad_norm": 1.4238344430923462, + "learning_rate": 2.868569748211436e-06, + "loss": 1.3521, + "step": 25853 + }, + { + "epoch": 0.9258867978584346, + "grad_norm": 1.5015554428100586, + "learning_rate": 2.8658121639952297e-06, + "loss": 1.5199, + "step": 25854 + }, + { + "epoch": 0.9259226099880029, + "grad_norm": 1.2220364809036255, + "learning_rate": 2.8630558865942237e-06, + "loss": 1.33, + "step": 25855 + }, + { + "epoch": 0.9259584221175712, + "grad_norm": 1.678745985031128, + "learning_rate": 2.8603009160454995e-06, + "loss": 1.8247, + "step": 25856 + }, + { + "epoch": 0.9259942342471396, + "grad_norm": 1.7295626401901245, + "learning_rate": 2.857547252386117e-06, + "loss": 1.4945, + "step": 25857 + }, + { + "epoch": 0.9260300463767078, + "grad_norm": 1.4009100198745728, + "learning_rate": 2.854794895653146e-06, + "loss": 1.1137, + "step": 25858 + }, + { + "epoch": 0.9260658585062761, + "grad_norm": 1.4563788175582886, + "learning_rate": 2.8520438458836007e-06, + "loss": 1.5361, + "step": 25859 + }, + { + "epoch": 0.9261016706358444, + "grad_norm": 1.6020429134368896, + "learning_rate": 2.849294103114486e-06, + "loss": 1.329, + "step": 25860 + }, + { + "epoch": 0.9261374827654126, + "grad_norm": 1.4788800477981567, + "learning_rate": 2.846545667382805e-06, + "loss": 1.3567, + "step": 25861 + }, + { + "epoch": 0.9261732948949809, + "grad_norm": 1.8151326179504395, + "learning_rate": 2.8437985387255394e-06, + "loss": 1.7524, + "step": 25862 + }, + { + "epoch": 0.9262091070245492, + "grad_norm": 1.3136143684387207, + "learning_rate": 2.8410527171796376e-06, + "loss": 1.109, + "step": 25863 + }, + { + "epoch": 0.9262449191541176, + "grad_norm": 3.2092204093933105, + "learning_rate": 2.838308202782036e-06, + "loss": 1.8549, + "step": 25864 + }, + { + "epoch": 0.9262807312836858, + "grad_norm": 1.5936020612716675, + "learning_rate": 2.835564995569684e-06, + "loss": 1.6723, + "step": 25865 + }, + { + "epoch": 0.9263165434132541, + "grad_norm": 1.55735445022583, + "learning_rate": 2.8328230955794733e-06, + "loss": 1.3075, + "step": 25866 + }, + { + "epoch": 0.9263523555428224, + "grad_norm": 1.6565905809402466, + "learning_rate": 2.8300825028482748e-06, + "loss": 1.3222, + "step": 25867 + }, + { + "epoch": 0.9263881676723906, + "grad_norm": 1.7441405057907104, + "learning_rate": 2.827343217412981e-06, + "loss": 1.2376, + "step": 25868 + }, + { + "epoch": 0.9264239798019589, + "grad_norm": 1.8766648769378662, + "learning_rate": 2.8246052393104516e-06, + "loss": 1.4782, + "step": 25869 + }, + { + "epoch": 0.9264597919315272, + "grad_norm": 1.460390329360962, + "learning_rate": 2.8218685685775015e-06, + "loss": 1.4786, + "step": 25870 + }, + { + "epoch": 0.9264956040610955, + "grad_norm": 1.593667984008789, + "learning_rate": 2.8191332052509567e-06, + "loss": 1.6359, + "step": 25871 + }, + { + "epoch": 0.9265314161906638, + "grad_norm": 1.744284987449646, + "learning_rate": 2.8163991493676212e-06, + "loss": 1.2058, + "step": 25872 + }, + { + "epoch": 0.9265672283202321, + "grad_norm": 1.8081164360046387, + "learning_rate": 2.8136664009642877e-06, + "loss": 1.678, + "step": 25873 + }, + { + "epoch": 0.9266030404498004, + "grad_norm": 1.7997514009475708, + "learning_rate": 2.8109349600777045e-06, + "loss": 1.6136, + "step": 25874 + }, + { + "epoch": 0.9266388525793686, + "grad_norm": 1.647782802581787, + "learning_rate": 2.8082048267446203e-06, + "loss": 1.1104, + "step": 25875 + }, + { + "epoch": 0.9266746647089369, + "grad_norm": 1.5916073322296143, + "learning_rate": 2.805476001001772e-06, + "loss": 1.5106, + "step": 25876 + }, + { + "epoch": 0.9267104768385052, + "grad_norm": 1.8126418590545654, + "learning_rate": 2.802748482885886e-06, + "loss": 1.3683, + "step": 25877 + }, + { + "epoch": 0.9267462889680735, + "grad_norm": 1.6614654064178467, + "learning_rate": 2.800022272433633e-06, + "loss": 1.3221, + "step": 25878 + }, + { + "epoch": 0.9267821010976418, + "grad_norm": 1.9782456159591675, + "learning_rate": 2.797297369681706e-06, + "loss": 1.3235, + "step": 25879 + }, + { + "epoch": 0.9268179132272101, + "grad_norm": 1.6730729341506958, + "learning_rate": 2.7945737746667643e-06, + "loss": 1.6093, + "step": 25880 + }, + { + "epoch": 0.9268537253567783, + "grad_norm": 1.5324561595916748, + "learning_rate": 2.7918514874254454e-06, + "loss": 1.3519, + "step": 25881 + }, + { + "epoch": 0.9268895374863466, + "grad_norm": 1.5539942979812622, + "learning_rate": 2.789130507994364e-06, + "loss": 1.1705, + "step": 25882 + }, + { + "epoch": 0.9269253496159149, + "grad_norm": 1.2986265420913696, + "learning_rate": 2.786410836410147e-06, + "loss": 1.4491, + "step": 25883 + }, + { + "epoch": 0.9269611617454832, + "grad_norm": 1.458767056465149, + "learning_rate": 2.783692472709376e-06, + "loss": 1.5184, + "step": 25884 + }, + { + "epoch": 0.9269969738750515, + "grad_norm": 1.4174643754959106, + "learning_rate": 2.7809754169286216e-06, + "loss": 1.3685, + "step": 25885 + }, + { + "epoch": 0.9270327860046198, + "grad_norm": 1.9586244821548462, + "learning_rate": 2.7782596691044327e-06, + "loss": 1.3631, + "step": 25886 + }, + { + "epoch": 0.9270685981341881, + "grad_norm": 1.8806778192520142, + "learning_rate": 2.7755452292733684e-06, + "loss": 1.5604, + "step": 25887 + }, + { + "epoch": 0.9271044102637563, + "grad_norm": 1.5468672513961792, + "learning_rate": 2.7728320974719225e-06, + "loss": 1.0677, + "step": 25888 + }, + { + "epoch": 0.9271402223933246, + "grad_norm": 1.300863265991211, + "learning_rate": 2.7701202737366096e-06, + "loss": 1.379, + "step": 25889 + }, + { + "epoch": 0.9271760345228929, + "grad_norm": 1.3959417343139648, + "learning_rate": 2.7674097581039004e-06, + "loss": 1.4749, + "step": 25890 + }, + { + "epoch": 0.9272118466524611, + "grad_norm": 1.5664010047912598, + "learning_rate": 2.7647005506102886e-06, + "loss": 1.2282, + "step": 25891 + }, + { + "epoch": 0.9272476587820295, + "grad_norm": 2.319263458251953, + "learning_rate": 2.7619926512921888e-06, + "loss": 1.4032, + "step": 25892 + }, + { + "epoch": 0.9272834709115978, + "grad_norm": 1.7327048778533936, + "learning_rate": 2.7592860601860616e-06, + "loss": 1.7905, + "step": 25893 + }, + { + "epoch": 0.9273192830411661, + "grad_norm": 1.7181236743927002, + "learning_rate": 2.7565807773282994e-06, + "loss": 1.3992, + "step": 25894 + }, + { + "epoch": 0.9273550951707343, + "grad_norm": 1.821083426475525, + "learning_rate": 2.7538768027553174e-06, + "loss": 1.6069, + "step": 25895 + }, + { + "epoch": 0.9273909073003026, + "grad_norm": 1.5232264995574951, + "learning_rate": 2.751174136503498e-06, + "loss": 1.3917, + "step": 25896 + }, + { + "epoch": 0.9274267194298709, + "grad_norm": 1.4377104043960571, + "learning_rate": 2.748472778609157e-06, + "loss": 1.4063, + "step": 25897 + }, + { + "epoch": 0.9274625315594391, + "grad_norm": 1.4285274744033813, + "learning_rate": 2.7457727291086867e-06, + "loss": 1.4704, + "step": 25898 + }, + { + "epoch": 0.9274983436890075, + "grad_norm": 1.573488473892212, + "learning_rate": 2.7430739880383915e-06, + "loss": 1.3264, + "step": 25899 + }, + { + "epoch": 0.9275341558185758, + "grad_norm": 1.4215755462646484, + "learning_rate": 2.7403765554345984e-06, + "loss": 1.3079, + "step": 25900 + }, + { + "epoch": 0.9275699679481441, + "grad_norm": 1.3711589574813843, + "learning_rate": 2.737680431333556e-06, + "loss": 1.3841, + "step": 25901 + }, + { + "epoch": 0.9276057800777123, + "grad_norm": 1.6720585823059082, + "learning_rate": 2.7349856157715793e-06, + "loss": 1.4348, + "step": 25902 + }, + { + "epoch": 0.9276415922072806, + "grad_norm": 1.738786220550537, + "learning_rate": 2.7322921087849063e-06, + "loss": 1.5032, + "step": 25903 + }, + { + "epoch": 0.9276774043368489, + "grad_norm": 2.6275978088378906, + "learning_rate": 2.7295999104097746e-06, + "loss": 1.3313, + "step": 25904 + }, + { + "epoch": 0.9277132164664171, + "grad_norm": 1.735266923904419, + "learning_rate": 2.726909020682422e-06, + "loss": 1.2613, + "step": 25905 + }, + { + "epoch": 0.9277490285959855, + "grad_norm": 1.6922903060913086, + "learning_rate": 2.72421943963902e-06, + "loss": 1.3072, + "step": 25906 + }, + { + "epoch": 0.9277848407255538, + "grad_norm": 1.5648711919784546, + "learning_rate": 2.7215311673157715e-06, + "loss": 1.5856, + "step": 25907 + }, + { + "epoch": 0.927820652855122, + "grad_norm": 1.460918664932251, + "learning_rate": 2.718844203748827e-06, + "loss": 1.3441, + "step": 25908 + }, + { + "epoch": 0.9278564649846903, + "grad_norm": 1.5979174375534058, + "learning_rate": 2.716158548974379e-06, + "loss": 1.6612, + "step": 25909 + }, + { + "epoch": 0.9278922771142586, + "grad_norm": 1.7928675413131714, + "learning_rate": 2.71347420302851e-06, + "loss": 1.5751, + "step": 25910 + }, + { + "epoch": 0.9279280892438269, + "grad_norm": 1.5402860641479492, + "learning_rate": 2.7107911659473682e-06, + "loss": 1.2454, + "step": 25911 + }, + { + "epoch": 0.9279639013733951, + "grad_norm": 1.4214240312576294, + "learning_rate": 2.708109437767015e-06, + "loss": 1.1149, + "step": 25912 + }, + { + "epoch": 0.9279997135029635, + "grad_norm": 1.4117375612258911, + "learning_rate": 2.705429018523575e-06, + "loss": 1.102, + "step": 25913 + }, + { + "epoch": 0.9280355256325318, + "grad_norm": 1.7786318063735962, + "learning_rate": 2.702749908253077e-06, + "loss": 1.2132, + "step": 25914 + }, + { + "epoch": 0.9280713377621, + "grad_norm": 1.6047890186309814, + "learning_rate": 2.70007210699158e-06, + "loss": 1.266, + "step": 25915 + }, + { + "epoch": 0.9281071498916683, + "grad_norm": 2.034290075302124, + "learning_rate": 2.697395614775089e-06, + "loss": 1.2727, + "step": 25916 + }, + { + "epoch": 0.9281429620212366, + "grad_norm": 1.5755672454833984, + "learning_rate": 2.694720431639641e-06, + "loss": 1.0504, + "step": 25917 + }, + { + "epoch": 0.9281787741508049, + "grad_norm": 1.8133633136749268, + "learning_rate": 2.6920465576212195e-06, + "loss": 1.6142, + "step": 25918 + }, + { + "epoch": 0.9282145862803731, + "grad_norm": 2.0033183097839355, + "learning_rate": 2.6893739927557725e-06, + "loss": 1.4867, + "step": 25919 + }, + { + "epoch": 0.9282503984099415, + "grad_norm": 1.8075355291366577, + "learning_rate": 2.6867027370793053e-06, + "loss": 1.2702, + "step": 25920 + }, + { + "epoch": 0.9282862105395098, + "grad_norm": 1.8477706909179688, + "learning_rate": 2.684032790627722e-06, + "loss": 1.1577, + "step": 25921 + }, + { + "epoch": 0.928322022669078, + "grad_norm": 1.8060771226882935, + "learning_rate": 2.6813641534369383e-06, + "loss": 1.4247, + "step": 25922 + }, + { + "epoch": 0.9283578347986463, + "grad_norm": 1.3779622316360474, + "learning_rate": 2.678696825542859e-06, + "loss": 1.5116, + "step": 25923 + }, + { + "epoch": 0.9283936469282146, + "grad_norm": 1.5192153453826904, + "learning_rate": 2.676030806981389e-06, + "loss": 1.442, + "step": 25924 + }, + { + "epoch": 0.9284294590577828, + "grad_norm": 1.7140536308288574, + "learning_rate": 2.673366097788399e-06, + "loss": 1.4663, + "step": 25925 + }, + { + "epoch": 0.9284652711873511, + "grad_norm": 1.3808923959732056, + "learning_rate": 2.670702697999705e-06, + "loss": 1.4116, + "step": 25926 + }, + { + "epoch": 0.9285010833169195, + "grad_norm": 1.517737865447998, + "learning_rate": 2.6680406076511677e-06, + "loss": 1.3334, + "step": 25927 + }, + { + "epoch": 0.9285368954464878, + "grad_norm": 1.7509211301803589, + "learning_rate": 2.6653798267785912e-06, + "loss": 1.2938, + "step": 25928 + }, + { + "epoch": 0.928572707576056, + "grad_norm": 1.7517247200012207, + "learning_rate": 2.6627203554177916e-06, + "loss": 1.1616, + "step": 25929 + }, + { + "epoch": 0.9286085197056243, + "grad_norm": 1.3450393676757812, + "learning_rate": 2.660062193604518e-06, + "loss": 1.4951, + "step": 25930 + }, + { + "epoch": 0.9286443318351926, + "grad_norm": 1.2545347213745117, + "learning_rate": 2.6574053413745524e-06, + "loss": 1.2913, + "step": 25931 + }, + { + "epoch": 0.9286801439647608, + "grad_norm": 1.784175992012024, + "learning_rate": 2.654749798763645e-06, + "loss": 1.8439, + "step": 25932 + }, + { + "epoch": 0.9287159560943291, + "grad_norm": 1.5474541187286377, + "learning_rate": 2.6520955658074997e-06, + "loss": 1.6509, + "step": 25933 + }, + { + "epoch": 0.9287517682238975, + "grad_norm": 1.63936448097229, + "learning_rate": 2.649442642541833e-06, + "loss": 1.5111, + "step": 25934 + }, + { + "epoch": 0.9287875803534658, + "grad_norm": 1.4332921504974365, + "learning_rate": 2.646791029002349e-06, + "loss": 1.5307, + "step": 25935 + }, + { + "epoch": 0.928823392483034, + "grad_norm": 1.8409286737442017, + "learning_rate": 2.6441407252247306e-06, + "loss": 1.5314, + "step": 25936 + }, + { + "epoch": 0.9288592046126023, + "grad_norm": 1.276275396347046, + "learning_rate": 2.641491731244605e-06, + "loss": 1.4106, + "step": 25937 + }, + { + "epoch": 0.9288950167421706, + "grad_norm": 1.8311176300048828, + "learning_rate": 2.6388440470976217e-06, + "loss": 1.326, + "step": 25938 + }, + { + "epoch": 0.9289308288717388, + "grad_norm": 2.0258560180664062, + "learning_rate": 2.6361976728194183e-06, + "loss": 1.3223, + "step": 25939 + }, + { + "epoch": 0.9289666410013071, + "grad_norm": 1.7356675863265991, + "learning_rate": 2.6335526084455665e-06, + "loss": 1.4339, + "step": 25940 + }, + { + "epoch": 0.9290024531308755, + "grad_norm": 1.5074106454849243, + "learning_rate": 2.630908854011682e-06, + "loss": 1.4021, + "step": 25941 + }, + { + "epoch": 0.9290382652604438, + "grad_norm": 1.8826628923416138, + "learning_rate": 2.628266409553315e-06, + "loss": 1.5917, + "step": 25942 + }, + { + "epoch": 0.929074077390012, + "grad_norm": 1.2302558422088623, + "learning_rate": 2.625625275106036e-06, + "loss": 1.2319, + "step": 25943 + }, + { + "epoch": 0.9291098895195803, + "grad_norm": 1.6267286539077759, + "learning_rate": 2.6229854507053507e-06, + "loss": 1.6428, + "step": 25944 + }, + { + "epoch": 0.9291457016491486, + "grad_norm": 1.582241415977478, + "learning_rate": 2.6203469363867973e-06, + "loss": 1.4682, + "step": 25945 + }, + { + "epoch": 0.9291815137787168, + "grad_norm": 1.7281001806259155, + "learning_rate": 2.6177097321858578e-06, + "loss": 1.4495, + "step": 25946 + }, + { + "epoch": 0.9292173259082851, + "grad_norm": 2.26235294342041, + "learning_rate": 2.615073838138027e-06, + "loss": 1.5354, + "step": 25947 + }, + { + "epoch": 0.9292531380378534, + "grad_norm": 1.591934084892273, + "learning_rate": 2.6124392542787645e-06, + "loss": 1.4247, + "step": 25948 + }, + { + "epoch": 0.9292889501674217, + "grad_norm": 2.2034246921539307, + "learning_rate": 2.609805980643498e-06, + "loss": 1.4845, + "step": 25949 + }, + { + "epoch": 0.92932476229699, + "grad_norm": 2.1909966468811035, + "learning_rate": 2.607174017267677e-06, + "loss": 1.3116, + "step": 25950 + }, + { + "epoch": 0.9293605744265583, + "grad_norm": 1.33592689037323, + "learning_rate": 2.6045433641866958e-06, + "loss": 1.4352, + "step": 25951 + }, + { + "epoch": 0.9293963865561266, + "grad_norm": 1.6754401922225952, + "learning_rate": 2.6019140214359585e-06, + "loss": 1.4147, + "step": 25952 + }, + { + "epoch": 0.9294321986856948, + "grad_norm": 2.1295597553253174, + "learning_rate": 2.599285989050826e-06, + "loss": 1.6047, + "step": 25953 + }, + { + "epoch": 0.9294680108152631, + "grad_norm": 2.042235851287842, + "learning_rate": 2.59665926706667e-06, + "loss": 1.5889, + "step": 25954 + }, + { + "epoch": 0.9295038229448314, + "grad_norm": 1.2845739126205444, + "learning_rate": 2.594033855518818e-06, + "loss": 1.4005, + "step": 25955 + }, + { + "epoch": 0.9295396350743997, + "grad_norm": 1.5340960025787354, + "learning_rate": 2.5914097544425975e-06, + "loss": 1.5359, + "step": 25956 + }, + { + "epoch": 0.929575447203968, + "grad_norm": 1.4688477516174316, + "learning_rate": 2.588786963873313e-06, + "loss": 1.603, + "step": 25957 + }, + { + "epoch": 0.9296112593335363, + "grad_norm": 1.4088783264160156, + "learning_rate": 2.586165483846248e-06, + "loss": 1.496, + "step": 25958 + }, + { + "epoch": 0.9296470714631045, + "grad_norm": 1.6192882061004639, + "learning_rate": 2.5835453143966627e-06, + "loss": 1.5361, + "step": 25959 + }, + { + "epoch": 0.9296828835926728, + "grad_norm": 1.3497220277786255, + "learning_rate": 2.580926455559829e-06, + "loss": 1.3451, + "step": 25960 + }, + { + "epoch": 0.9297186957222411, + "grad_norm": 1.6116409301757812, + "learning_rate": 2.5783089073709633e-06, + "loss": 1.3246, + "step": 25961 + }, + { + "epoch": 0.9297545078518094, + "grad_norm": 1.3760795593261719, + "learning_rate": 2.5756926698652816e-06, + "loss": 1.3463, + "step": 25962 + }, + { + "epoch": 0.9297903199813777, + "grad_norm": 1.4690639972686768, + "learning_rate": 2.5730777430779895e-06, + "loss": 1.2777, + "step": 25963 + }, + { + "epoch": 0.929826132110946, + "grad_norm": 1.1338255405426025, + "learning_rate": 2.57046412704427e-06, + "loss": 1.4689, + "step": 25964 + }, + { + "epoch": 0.9298619442405143, + "grad_norm": 2.178570032119751, + "learning_rate": 2.567851821799283e-06, + "loss": 1.7218, + "step": 25965 + }, + { + "epoch": 0.9298977563700825, + "grad_norm": 1.6850600242614746, + "learning_rate": 2.565240827378157e-06, + "loss": 1.2855, + "step": 25966 + }, + { + "epoch": 0.9299335684996508, + "grad_norm": 1.398482322692871, + "learning_rate": 2.562631143816041e-06, + "loss": 1.212, + "step": 25967 + }, + { + "epoch": 0.9299693806292191, + "grad_norm": 1.6965020895004272, + "learning_rate": 2.560022771148052e-06, + "loss": 1.4466, + "step": 25968 + }, + { + "epoch": 0.9300051927587873, + "grad_norm": 2.532001256942749, + "learning_rate": 2.55741570940925e-06, + "loss": 1.3208, + "step": 25969 + }, + { + "epoch": 0.9300410048883557, + "grad_norm": 1.4785244464874268, + "learning_rate": 2.5548099586347296e-06, + "loss": 1.2717, + "step": 25970 + }, + { + "epoch": 0.930076817017924, + "grad_norm": 1.5374987125396729, + "learning_rate": 2.552205518859552e-06, + "loss": 1.4995, + "step": 25971 + }, + { + "epoch": 0.9301126291474923, + "grad_norm": 1.8516207933425903, + "learning_rate": 2.549602390118755e-06, + "loss": 1.3219, + "step": 25972 + }, + { + "epoch": 0.9301484412770605, + "grad_norm": 1.6778855323791504, + "learning_rate": 2.5470005724473447e-06, + "loss": 1.4766, + "step": 25973 + }, + { + "epoch": 0.9301842534066288, + "grad_norm": 1.8335471153259277, + "learning_rate": 2.544400065880337e-06, + "loss": 1.6164, + "step": 25974 + }, + { + "epoch": 0.9302200655361971, + "grad_norm": 1.762180209159851, + "learning_rate": 2.5418008704527263e-06, + "loss": 1.3349, + "step": 25975 + }, + { + "epoch": 0.9302558776657653, + "grad_norm": 1.5323717594146729, + "learning_rate": 2.5392029861994625e-06, + "loss": 1.361, + "step": 25976 + }, + { + "epoch": 0.9302916897953337, + "grad_norm": 1.7938995361328125, + "learning_rate": 2.5366064131555066e-06, + "loss": 1.4333, + "step": 25977 + }, + { + "epoch": 0.930327501924902, + "grad_norm": 3.3588671684265137, + "learning_rate": 2.534011151355797e-06, + "loss": 1.7233, + "step": 25978 + }, + { + "epoch": 0.9303633140544703, + "grad_norm": 1.9261466264724731, + "learning_rate": 2.531417200835251e-06, + "loss": 1.1339, + "step": 25979 + }, + { + "epoch": 0.9303991261840385, + "grad_norm": 2.6531336307525635, + "learning_rate": 2.528824561628762e-06, + "loss": 1.3105, + "step": 25980 + }, + { + "epoch": 0.9304349383136068, + "grad_norm": 1.9853758811950684, + "learning_rate": 2.5262332337712025e-06, + "loss": 1.5984, + "step": 25981 + }, + { + "epoch": 0.9304707504431751, + "grad_norm": 2.0931293964385986, + "learning_rate": 2.5236432172974333e-06, + "loss": 1.4613, + "step": 25982 + }, + { + "epoch": 0.9305065625727433, + "grad_norm": 1.676774263381958, + "learning_rate": 2.521054512242338e-06, + "loss": 1.3516, + "step": 25983 + }, + { + "epoch": 0.9305423747023117, + "grad_norm": 2.231872797012329, + "learning_rate": 2.5184671186406996e-06, + "loss": 1.4991, + "step": 25984 + }, + { + "epoch": 0.93057818683188, + "grad_norm": 1.5376390218734741, + "learning_rate": 2.5158810365273345e-06, + "loss": 1.341, + "step": 25985 + }, + { + "epoch": 0.9306139989614483, + "grad_norm": 1.4266811609268188, + "learning_rate": 2.5132962659370595e-06, + "loss": 1.5016, + "step": 25986 + }, + { + "epoch": 0.9306498110910165, + "grad_norm": 1.5460529327392578, + "learning_rate": 2.510712806904625e-06, + "loss": 1.1129, + "step": 25987 + }, + { + "epoch": 0.9306856232205848, + "grad_norm": 1.3211203813552856, + "learning_rate": 2.5081306594647912e-06, + "loss": 1.3368, + "step": 25988 + }, + { + "epoch": 0.9307214353501531, + "grad_norm": 1.648240566253662, + "learning_rate": 2.505549823652309e-06, + "loss": 1.2672, + "step": 25989 + }, + { + "epoch": 0.9307572474797213, + "grad_norm": 1.4202256202697754, + "learning_rate": 2.5029702995019055e-06, + "loss": 1.5001, + "step": 25990 + }, + { + "epoch": 0.9307930596092897, + "grad_norm": 1.5760564804077148, + "learning_rate": 2.5003920870482644e-06, + "loss": 1.4344, + "step": 25991 + }, + { + "epoch": 0.930828871738858, + "grad_norm": 1.6077864170074463, + "learning_rate": 2.4978151863260914e-06, + "loss": 1.4363, + "step": 25992 + }, + { + "epoch": 0.9308646838684262, + "grad_norm": 1.4821381568908691, + "learning_rate": 2.495239597370047e-06, + "loss": 1.5983, + "step": 25993 + }, + { + "epoch": 0.9309004959979945, + "grad_norm": 1.120290994644165, + "learning_rate": 2.492665320214771e-06, + "loss": 1.4057, + "step": 25994 + }, + { + "epoch": 0.9309363081275628, + "grad_norm": 1.2401329278945923, + "learning_rate": 2.490092354894913e-06, + "loss": 1.1569, + "step": 25995 + }, + { + "epoch": 0.930972120257131, + "grad_norm": 1.77287757396698, + "learning_rate": 2.4875207014450785e-06, + "loss": 1.5907, + "step": 25996 + }, + { + "epoch": 0.9310079323866993, + "grad_norm": 1.4243831634521484, + "learning_rate": 2.4849503598998738e-06, + "loss": 1.4151, + "step": 25997 + }, + { + "epoch": 0.9310437445162677, + "grad_norm": 1.5774353742599487, + "learning_rate": 2.4823813302938814e-06, + "loss": 1.3764, + "step": 25998 + }, + { + "epoch": 0.931079556645836, + "grad_norm": 1.5646772384643555, + "learning_rate": 2.4798136126616634e-06, + "loss": 1.5245, + "step": 25999 + }, + { + "epoch": 0.9311153687754042, + "grad_norm": 1.583769679069519, + "learning_rate": 2.477247207037736e-06, + "loss": 1.7049, + "step": 26000 + }, + { + "epoch": 0.9311511809049725, + "grad_norm": 1.5972886085510254, + "learning_rate": 2.4746821134566833e-06, + "loss": 1.2635, + "step": 26001 + }, + { + "epoch": 0.9311869930345408, + "grad_norm": 1.2284495830535889, + "learning_rate": 2.4721183319529774e-06, + "loss": 1.3457, + "step": 26002 + }, + { + "epoch": 0.931222805164109, + "grad_norm": 1.774373173713684, + "learning_rate": 2.4695558625611015e-06, + "loss": 1.2893, + "step": 26003 + }, + { + "epoch": 0.9312586172936773, + "grad_norm": 2.132659435272217, + "learning_rate": 2.4669947053155617e-06, + "loss": 1.2126, + "step": 26004 + }, + { + "epoch": 0.9312944294232457, + "grad_norm": 1.3666845560073853, + "learning_rate": 2.464434860250786e-06, + "loss": 1.5706, + "step": 26005 + }, + { + "epoch": 0.931330241552814, + "grad_norm": 1.9008538722991943, + "learning_rate": 2.461876327401247e-06, + "loss": 1.3145, + "step": 26006 + }, + { + "epoch": 0.9313660536823822, + "grad_norm": 1.4239145517349243, + "learning_rate": 2.4593191068013164e-06, + "loss": 1.3138, + "step": 26007 + }, + { + "epoch": 0.9314018658119505, + "grad_norm": 1.7623815536499023, + "learning_rate": 2.4567631984854566e-06, + "loss": 1.5899, + "step": 26008 + }, + { + "epoch": 0.9314376779415188, + "grad_norm": 1.5793737173080444, + "learning_rate": 2.4542086024880174e-06, + "loss": 1.5624, + "step": 26009 + }, + { + "epoch": 0.931473490071087, + "grad_norm": 1.4286556243896484, + "learning_rate": 2.4516553188433823e-06, + "loss": 0.9605, + "step": 26010 + }, + { + "epoch": 0.9315093022006553, + "grad_norm": 1.671534538269043, + "learning_rate": 2.4491033475858795e-06, + "loss": 1.5661, + "step": 26011 + }, + { + "epoch": 0.9315451143302237, + "grad_norm": 2.992291212081909, + "learning_rate": 2.44655268874987e-06, + "loss": 1.1977, + "step": 26012 + }, + { + "epoch": 0.931580926459792, + "grad_norm": 1.70440673828125, + "learning_rate": 2.4440033423696717e-06, + "loss": 1.4343, + "step": 26013 + }, + { + "epoch": 0.9316167385893602, + "grad_norm": 2.031369924545288, + "learning_rate": 2.4414553084795455e-06, + "loss": 1.4702, + "step": 26014 + }, + { + "epoch": 0.9316525507189285, + "grad_norm": 1.6914634704589844, + "learning_rate": 2.4389085871138086e-06, + "loss": 1.4862, + "step": 26015 + }, + { + "epoch": 0.9316883628484968, + "grad_norm": 2.1513466835021973, + "learning_rate": 2.4363631783067108e-06, + "loss": 1.523, + "step": 26016 + }, + { + "epoch": 0.931724174978065, + "grad_norm": 1.6219446659088135, + "learning_rate": 2.4338190820925145e-06, + "loss": 1.4038, + "step": 26017 + }, + { + "epoch": 0.9317599871076333, + "grad_norm": 1.5040446519851685, + "learning_rate": 2.4312762985054137e-06, + "loss": 1.3775, + "step": 26018 + }, + { + "epoch": 0.9317957992372017, + "grad_norm": 3.2530477046966553, + "learning_rate": 2.4287348275796373e-06, + "loss": 1.0825, + "step": 26019 + }, + { + "epoch": 0.93183161136677, + "grad_norm": 1.5604746341705322, + "learning_rate": 2.4261946693493797e-06, + "loss": 1.0338, + "step": 26020 + }, + { + "epoch": 0.9318674234963382, + "grad_norm": 2.0306684970855713, + "learning_rate": 2.4236558238488025e-06, + "loss": 1.4578, + "step": 26021 + }, + { + "epoch": 0.9319032356259065, + "grad_norm": 2.1134817600250244, + "learning_rate": 2.421118291112079e-06, + "loss": 1.5503, + "step": 26022 + }, + { + "epoch": 0.9319390477554748, + "grad_norm": 1.4790269136428833, + "learning_rate": 2.4185820711733363e-06, + "loss": 1.5024, + "step": 26023 + }, + { + "epoch": 0.931974859885043, + "grad_norm": 1.7622898817062378, + "learning_rate": 2.4160471640667147e-06, + "loss": 1.5454, + "step": 26024 + }, + { + "epoch": 0.9320106720146113, + "grad_norm": 1.9339481592178345, + "learning_rate": 2.413513569826298e-06, + "loss": 1.3688, + "step": 26025 + }, + { + "epoch": 0.9320464841441797, + "grad_norm": 1.6316418647766113, + "learning_rate": 2.410981288486169e-06, + "loss": 1.4478, + "step": 26026 + }, + { + "epoch": 0.932082296273748, + "grad_norm": 1.73930823802948, + "learning_rate": 2.408450320080413e-06, + "loss": 1.5425, + "step": 26027 + }, + { + "epoch": 0.9321181084033162, + "grad_norm": 1.9473265409469604, + "learning_rate": 2.40592066464308e-06, + "loss": 1.5209, + "step": 26028 + }, + { + "epoch": 0.9321539205328845, + "grad_norm": 1.8710750341415405, + "learning_rate": 2.4033923222081868e-06, + "loss": 1.3462, + "step": 26029 + }, + { + "epoch": 0.9321897326624528, + "grad_norm": 1.5108715295791626, + "learning_rate": 2.400865292809762e-06, + "loss": 1.4425, + "step": 26030 + }, + { + "epoch": 0.932225544792021, + "grad_norm": 1.6120551824569702, + "learning_rate": 2.3983395764818008e-06, + "loss": 1.2536, + "step": 26031 + }, + { + "epoch": 0.9322613569215893, + "grad_norm": 1.3649541139602661, + "learning_rate": 2.395815173258287e-06, + "loss": 1.34, + "step": 26032 + }, + { + "epoch": 0.9322971690511577, + "grad_norm": 1.4536439180374146, + "learning_rate": 2.393292083173171e-06, + "loss": 1.5061, + "step": 26033 + }, + { + "epoch": 0.9323329811807259, + "grad_norm": 1.6911566257476807, + "learning_rate": 2.390770306260415e-06, + "loss": 1.3117, + "step": 26034 + }, + { + "epoch": 0.9323687933102942, + "grad_norm": 1.714871883392334, + "learning_rate": 2.388249842553936e-06, + "loss": 1.5929, + "step": 26035 + }, + { + "epoch": 0.9324046054398625, + "grad_norm": 1.709179162979126, + "learning_rate": 2.385730692087651e-06, + "loss": 1.5818, + "step": 26036 + }, + { + "epoch": 0.9324404175694307, + "grad_norm": 1.9201855659484863, + "learning_rate": 2.3832128548954334e-06, + "loss": 1.3567, + "step": 26037 + }, + { + "epoch": 0.932476229698999, + "grad_norm": 1.8978859186172485, + "learning_rate": 2.3806963310111786e-06, + "loss": 1.6552, + "step": 26038 + }, + { + "epoch": 0.9325120418285673, + "grad_norm": 1.9330658912658691, + "learning_rate": 2.3781811204687367e-06, + "loss": 1.3294, + "step": 26039 + }, + { + "epoch": 0.9325478539581357, + "grad_norm": 1.352783441543579, + "learning_rate": 2.375667223301936e-06, + "loss": 1.5671, + "step": 26040 + }, + { + "epoch": 0.9325836660877039, + "grad_norm": 1.6923365592956543, + "learning_rate": 2.3731546395446056e-06, + "loss": 1.4242, + "step": 26041 + }, + { + "epoch": 0.9326194782172722, + "grad_norm": 1.456566333770752, + "learning_rate": 2.370643369230563e-06, + "loss": 1.8199, + "step": 26042 + }, + { + "epoch": 0.9326552903468405, + "grad_norm": 1.8373398780822754, + "learning_rate": 2.3681334123935805e-06, + "loss": 1.5232, + "step": 26043 + }, + { + "epoch": 0.9326911024764087, + "grad_norm": 1.4119781255722046, + "learning_rate": 2.3656247690674092e-06, + "loss": 1.5827, + "step": 26044 + }, + { + "epoch": 0.932726914605977, + "grad_norm": 1.4931371212005615, + "learning_rate": 2.3631174392858335e-06, + "loss": 1.5398, + "step": 26045 + }, + { + "epoch": 0.9327627267355453, + "grad_norm": 2.504749298095703, + "learning_rate": 2.3606114230825704e-06, + "loss": 1.3982, + "step": 26046 + }, + { + "epoch": 0.9327985388651137, + "grad_norm": 1.844650149345398, + "learning_rate": 2.3581067204913267e-06, + "loss": 1.3822, + "step": 26047 + }, + { + "epoch": 0.9328343509946819, + "grad_norm": 1.961634635925293, + "learning_rate": 2.355603331545808e-06, + "loss": 1.5726, + "step": 26048 + }, + { + "epoch": 0.9328701631242502, + "grad_norm": 1.4602668285369873, + "learning_rate": 2.3531012562796995e-06, + "loss": 1.3054, + "step": 26049 + }, + { + "epoch": 0.9329059752538185, + "grad_norm": 1.6050527095794678, + "learning_rate": 2.3506004947266512e-06, + "loss": 1.5448, + "step": 26050 + }, + { + "epoch": 0.9329417873833867, + "grad_norm": 1.3830983638763428, + "learning_rate": 2.3481010469203256e-06, + "loss": 1.141, + "step": 26051 + }, + { + "epoch": 0.932977599512955, + "grad_norm": 1.515015721321106, + "learning_rate": 2.345602912894329e-06, + "loss": 1.4587, + "step": 26052 + }, + { + "epoch": 0.9330134116425233, + "grad_norm": 2.136948347091675, + "learning_rate": 2.3431060926822903e-06, + "loss": 1.4007, + "step": 26053 + }, + { + "epoch": 0.9330492237720917, + "grad_norm": 1.7469984292984009, + "learning_rate": 2.340610586317782e-06, + "loss": 1.6419, + "step": 26054 + }, + { + "epoch": 0.9330850359016599, + "grad_norm": 1.5936490297317505, + "learning_rate": 2.3381163938343776e-06, + "loss": 1.5191, + "step": 26055 + }, + { + "epoch": 0.9331208480312282, + "grad_norm": 1.4409581422805786, + "learning_rate": 2.3356235152656613e-06, + "loss": 1.4645, + "step": 26056 + }, + { + "epoch": 0.9331566601607965, + "grad_norm": 1.6994661092758179, + "learning_rate": 2.33313195064514e-06, + "loss": 1.4291, + "step": 26057 + }, + { + "epoch": 0.9331924722903647, + "grad_norm": 2.277144193649292, + "learning_rate": 2.330641700006353e-06, + "loss": 1.7529, + "step": 26058 + }, + { + "epoch": 0.933228284419933, + "grad_norm": 1.867384910583496, + "learning_rate": 2.328152763382796e-06, + "loss": 1.3866, + "step": 26059 + }, + { + "epoch": 0.9332640965495013, + "grad_norm": 1.7690753936767578, + "learning_rate": 2.325665140807964e-06, + "loss": 1.6213, + "step": 26060 + }, + { + "epoch": 0.9332999086790696, + "grad_norm": 1.4293922185897827, + "learning_rate": 2.323178832315298e-06, + "loss": 1.381, + "step": 26061 + }, + { + "epoch": 0.9333357208086379, + "grad_norm": 1.8195722103118896, + "learning_rate": 2.3206938379382813e-06, + "loss": 1.5304, + "step": 26062 + }, + { + "epoch": 0.9333715329382062, + "grad_norm": 1.5737478733062744, + "learning_rate": 2.318210157710332e-06, + "loss": 1.3585, + "step": 26063 + }, + { + "epoch": 0.9334073450677745, + "grad_norm": 1.6605415344238281, + "learning_rate": 2.3157277916648567e-06, + "loss": 1.1337, + "step": 26064 + }, + { + "epoch": 0.9334431571973427, + "grad_norm": 1.6333032846450806, + "learning_rate": 2.313246739835262e-06, + "loss": 1.4887, + "step": 26065 + }, + { + "epoch": 0.933478969326911, + "grad_norm": 1.589296579360962, + "learning_rate": 2.3107670022549323e-06, + "loss": 1.6372, + "step": 26066 + }, + { + "epoch": 0.9335147814564793, + "grad_norm": 1.6948356628417969, + "learning_rate": 2.3082885789572182e-06, + "loss": 1.316, + "step": 26067 + }, + { + "epoch": 0.9335505935860476, + "grad_norm": 1.5151407718658447, + "learning_rate": 2.305811469975472e-06, + "loss": 1.4232, + "step": 26068 + }, + { + "epoch": 0.9335864057156159, + "grad_norm": 1.8046801090240479, + "learning_rate": 2.30333567534301e-06, + "loss": 1.536, + "step": 26069 + }, + { + "epoch": 0.9336222178451842, + "grad_norm": 1.2985812425613403, + "learning_rate": 2.3008611950931404e-06, + "loss": 1.5431, + "step": 26070 + }, + { + "epoch": 0.9336580299747524, + "grad_norm": 1.8768430948257446, + "learning_rate": 2.2983880292591798e-06, + "loss": 1.6994, + "step": 26071 + }, + { + "epoch": 0.9336938421043207, + "grad_norm": 2.3779830932617188, + "learning_rate": 2.295916177874369e-06, + "loss": 1.2774, + "step": 26072 + }, + { + "epoch": 0.933729654233889, + "grad_norm": 2.449038028717041, + "learning_rate": 2.2934456409719698e-06, + "loss": 1.2834, + "step": 26073 + }, + { + "epoch": 0.9337654663634573, + "grad_norm": 1.443924069404602, + "learning_rate": 2.2909764185852447e-06, + "loss": 1.166, + "step": 26074 + }, + { + "epoch": 0.9338012784930256, + "grad_norm": 1.5654816627502441, + "learning_rate": 2.288508510747389e-06, + "loss": 1.4546, + "step": 26075 + }, + { + "epoch": 0.9338370906225939, + "grad_norm": 1.7187449932098389, + "learning_rate": 2.2860419174916104e-06, + "loss": 1.4493, + "step": 26076 + }, + { + "epoch": 0.9338729027521622, + "grad_norm": 1.4201176166534424, + "learning_rate": 2.2835766388510926e-06, + "loss": 1.2746, + "step": 26077 + }, + { + "epoch": 0.9339087148817304, + "grad_norm": 1.4719351530075073, + "learning_rate": 2.2811126748590207e-06, + "loss": 1.6015, + "step": 26078 + }, + { + "epoch": 0.9339445270112987, + "grad_norm": 1.7975047826766968, + "learning_rate": 2.278650025548512e-06, + "loss": 1.4907, + "step": 26079 + }, + { + "epoch": 0.933980339140867, + "grad_norm": 2.0674078464508057, + "learning_rate": 2.2761886909527187e-06, + "loss": 1.4675, + "step": 26080 + }, + { + "epoch": 0.9340161512704352, + "grad_norm": 1.5718308687210083, + "learning_rate": 2.273728671104769e-06, + "loss": 1.4185, + "step": 26081 + }, + { + "epoch": 0.9340519634000036, + "grad_norm": 1.3587315082550049, + "learning_rate": 2.271269966037726e-06, + "loss": 1.3692, + "step": 26082 + }, + { + "epoch": 0.9340877755295719, + "grad_norm": 2.1399447917938232, + "learning_rate": 2.2688125757846957e-06, + "loss": 1.3626, + "step": 26083 + }, + { + "epoch": 0.9341235876591402, + "grad_norm": 2.119439125061035, + "learning_rate": 2.2663565003787078e-06, + "loss": 1.5303, + "step": 26084 + }, + { + "epoch": 0.9341593997887084, + "grad_norm": 1.2838094234466553, + "learning_rate": 2.263901739852847e-06, + "loss": 1.4867, + "step": 26085 + }, + { + "epoch": 0.9341952119182767, + "grad_norm": 1.8111953735351562, + "learning_rate": 2.2614482942400984e-06, + "loss": 1.2078, + "step": 26086 + }, + { + "epoch": 0.934231024047845, + "grad_norm": 1.9275093078613281, + "learning_rate": 2.2589961635735015e-06, + "loss": 1.512, + "step": 26087 + }, + { + "epoch": 0.9342668361774132, + "grad_norm": 1.3588461875915527, + "learning_rate": 2.2565453478860297e-06, + "loss": 1.1154, + "step": 26088 + }, + { + "epoch": 0.9343026483069816, + "grad_norm": 1.3044812679290771, + "learning_rate": 2.254095847210669e-06, + "loss": 1.4511, + "step": 26089 + }, + { + "epoch": 0.9343384604365499, + "grad_norm": 1.3384042978286743, + "learning_rate": 2.2516476615803694e-06, + "loss": 1.2449, + "step": 26090 + }, + { + "epoch": 0.9343742725661182, + "grad_norm": 1.543078899383545, + "learning_rate": 2.249200791028039e-06, + "loss": 1.4121, + "step": 26091 + }, + { + "epoch": 0.9344100846956864, + "grad_norm": 1.5723681449890137, + "learning_rate": 2.2467552355866505e-06, + "loss": 1.5712, + "step": 26092 + }, + { + "epoch": 0.9344458968252547, + "grad_norm": 1.4810062646865845, + "learning_rate": 2.2443109952890674e-06, + "loss": 1.3736, + "step": 26093 + }, + { + "epoch": 0.934481708954823, + "grad_norm": 1.5201414823532104, + "learning_rate": 2.241868070168185e-06, + "loss": 1.3559, + "step": 26094 + }, + { + "epoch": 0.9345175210843912, + "grad_norm": 1.811244010925293, + "learning_rate": 2.239426460256855e-06, + "loss": 1.2936, + "step": 26095 + }, + { + "epoch": 0.9345533332139596, + "grad_norm": 1.8156101703643799, + "learning_rate": 2.236986165587951e-06, + "loss": 1.601, + "step": 26096 + }, + { + "epoch": 0.9345891453435279, + "grad_norm": 1.2325727939605713, + "learning_rate": 2.2345471861942914e-06, + "loss": 1.2936, + "step": 26097 + }, + { + "epoch": 0.9346249574730962, + "grad_norm": 1.9929440021514893, + "learning_rate": 2.232109522108694e-06, + "loss": 1.3793, + "step": 26098 + }, + { + "epoch": 0.9346607696026644, + "grad_norm": 1.516374111175537, + "learning_rate": 2.2296731733639552e-06, + "loss": 1.2078, + "step": 26099 + }, + { + "epoch": 0.9346965817322327, + "grad_norm": 3.2850093841552734, + "learning_rate": 2.227238139992849e-06, + "loss": 1.2199, + "step": 26100 + }, + { + "epoch": 0.934732393861801, + "grad_norm": 1.9196382761001587, + "learning_rate": 2.224804422028137e-06, + "loss": 1.4492, + "step": 26101 + }, + { + "epoch": 0.9347682059913692, + "grad_norm": 1.7088524103164673, + "learning_rate": 2.2223720195025386e-06, + "loss": 1.8744, + "step": 26102 + }, + { + "epoch": 0.9348040181209376, + "grad_norm": 1.5560927391052246, + "learning_rate": 2.2199409324488275e-06, + "loss": 1.2936, + "step": 26103 + }, + { + "epoch": 0.9348398302505059, + "grad_norm": 1.8463906049728394, + "learning_rate": 2.2175111608996657e-06, + "loss": 1.4824, + "step": 26104 + }, + { + "epoch": 0.9348756423800741, + "grad_norm": 1.8507603406906128, + "learning_rate": 2.215082704887772e-06, + "loss": 1.5644, + "step": 26105 + }, + { + "epoch": 0.9349114545096424, + "grad_norm": 1.5813714265823364, + "learning_rate": 2.212655564445798e-06, + "loss": 1.2692, + "step": 26106 + }, + { + "epoch": 0.9349472666392107, + "grad_norm": 1.5400943756103516, + "learning_rate": 2.2102297396064176e-06, + "loss": 1.3857, + "step": 26107 + }, + { + "epoch": 0.934983078768779, + "grad_norm": 1.7825454473495483, + "learning_rate": 2.20780523040226e-06, + "loss": 1.1498, + "step": 26108 + }, + { + "epoch": 0.9350188908983472, + "grad_norm": 1.7498632669448853, + "learning_rate": 2.2053820368659215e-06, + "loss": 1.4762, + "step": 26109 + }, + { + "epoch": 0.9350547030279156, + "grad_norm": 1.3450285196304321, + "learning_rate": 2.202960159030032e-06, + "loss": 1.3703, + "step": 26110 + }, + { + "epoch": 0.9350905151574839, + "grad_norm": 1.6363308429718018, + "learning_rate": 2.200539596927165e-06, + "loss": 1.3916, + "step": 26111 + }, + { + "epoch": 0.9351263272870521, + "grad_norm": 1.3470873832702637, + "learning_rate": 2.1981203505898827e-06, + "loss": 1.4387, + "step": 26112 + }, + { + "epoch": 0.9351621394166204, + "grad_norm": 1.7252334356307983, + "learning_rate": 2.195702420050727e-06, + "loss": 1.7303, + "step": 26113 + }, + { + "epoch": 0.9351979515461887, + "grad_norm": 1.8090825080871582, + "learning_rate": 2.19328580534226e-06, + "loss": 1.1771, + "step": 26114 + }, + { + "epoch": 0.935233763675757, + "grad_norm": 1.476894736289978, + "learning_rate": 2.190870506496956e-06, + "loss": 1.4577, + "step": 26115 + }, + { + "epoch": 0.9352695758053252, + "grad_norm": 1.3571605682373047, + "learning_rate": 2.188456523547322e-06, + "loss": 1.389, + "step": 26116 + }, + { + "epoch": 0.9353053879348936, + "grad_norm": 1.636551856994629, + "learning_rate": 2.1860438565258433e-06, + "loss": 1.1224, + "step": 26117 + }, + { + "epoch": 0.9353412000644619, + "grad_norm": 1.9286612272262573, + "learning_rate": 2.183632505464972e-06, + "loss": 1.4867, + "step": 26118 + }, + { + "epoch": 0.9353770121940301, + "grad_norm": 1.419366717338562, + "learning_rate": 2.1812224703971597e-06, + "loss": 1.399, + "step": 26119 + }, + { + "epoch": 0.9354128243235984, + "grad_norm": 1.7759507894515991, + "learning_rate": 2.1788137513548134e-06, + "loss": 1.1488, + "step": 26120 + }, + { + "epoch": 0.9354486364531667, + "grad_norm": 1.542161464691162, + "learning_rate": 2.176406348370341e-06, + "loss": 1.4903, + "step": 26121 + }, + { + "epoch": 0.9354844485827349, + "grad_norm": 1.8545513153076172, + "learning_rate": 2.174000261476139e-06, + "loss": 1.3911, + "step": 26122 + }, + { + "epoch": 0.9355202607123032, + "grad_norm": 1.7891281843185425, + "learning_rate": 2.171595490704592e-06, + "loss": 1.4406, + "step": 26123 + }, + { + "epoch": 0.9355560728418716, + "grad_norm": 1.7108030319213867, + "learning_rate": 2.1691920360880303e-06, + "loss": 1.1197, + "step": 26124 + }, + { + "epoch": 0.9355918849714399, + "grad_norm": 1.6328608989715576, + "learning_rate": 2.166789897658794e-06, + "loss": 1.5987, + "step": 26125 + }, + { + "epoch": 0.9356276971010081, + "grad_norm": 1.4279072284698486, + "learning_rate": 2.1643890754492136e-06, + "loss": 1.3825, + "step": 26126 + }, + { + "epoch": 0.9356635092305764, + "grad_norm": 1.4188889265060425, + "learning_rate": 2.1619895694915624e-06, + "loss": 1.826, + "step": 26127 + }, + { + "epoch": 0.9356993213601447, + "grad_norm": 1.5467236042022705, + "learning_rate": 2.159591379818149e-06, + "loss": 1.4663, + "step": 26128 + }, + { + "epoch": 0.9357351334897129, + "grad_norm": 1.9561902284622192, + "learning_rate": 2.1571945064612243e-06, + "loss": 1.3251, + "step": 26129 + }, + { + "epoch": 0.9357709456192812, + "grad_norm": 1.4215346574783325, + "learning_rate": 2.1547989494530517e-06, + "loss": 1.3927, + "step": 26130 + }, + { + "epoch": 0.9358067577488496, + "grad_norm": 1.427872896194458, + "learning_rate": 2.1524047088258394e-06, + "loss": 1.5345, + "step": 26131 + }, + { + "epoch": 0.9358425698784179, + "grad_norm": 1.6102819442749023, + "learning_rate": 2.1500117846118053e-06, + "loss": 1.3043, + "step": 26132 + }, + { + "epoch": 0.9358783820079861, + "grad_norm": 1.5477509498596191, + "learning_rate": 2.147620176843157e-06, + "loss": 1.0714, + "step": 26133 + }, + { + "epoch": 0.9359141941375544, + "grad_norm": 1.82809317111969, + "learning_rate": 2.145229885552047e-06, + "loss": 1.4888, + "step": 26134 + }, + { + "epoch": 0.9359500062671227, + "grad_norm": 2.383084535598755, + "learning_rate": 2.142840910770638e-06, + "loss": 1.1927, + "step": 26135 + }, + { + "epoch": 0.9359858183966909, + "grad_norm": 2.7783987522125244, + "learning_rate": 2.140453252531083e-06, + "loss": 1.5051, + "step": 26136 + }, + { + "epoch": 0.9360216305262592, + "grad_norm": 1.496717929840088, + "learning_rate": 2.1380669108655105e-06, + "loss": 1.25, + "step": 26137 + }, + { + "epoch": 0.9360574426558276, + "grad_norm": 1.7770413160324097, + "learning_rate": 2.135681885806007e-06, + "loss": 1.5141, + "step": 26138 + }, + { + "epoch": 0.9360932547853958, + "grad_norm": 1.5932000875473022, + "learning_rate": 2.133298177384668e-06, + "loss": 1.4161, + "step": 26139 + }, + { + "epoch": 0.9361290669149641, + "grad_norm": 1.4183354377746582, + "learning_rate": 2.1309157856335694e-06, + "loss": 1.6704, + "step": 26140 + }, + { + "epoch": 0.9361648790445324, + "grad_norm": 2.0836100578308105, + "learning_rate": 2.128534710584751e-06, + "loss": 1.1774, + "step": 26141 + }, + { + "epoch": 0.9362006911741007, + "grad_norm": 1.5669517517089844, + "learning_rate": 2.126154952270254e-06, + "loss": 1.4349, + "step": 26142 + }, + { + "epoch": 0.9362365033036689, + "grad_norm": 1.7547202110290527, + "learning_rate": 2.1237765107220973e-06, + "loss": 1.235, + "step": 26143 + }, + { + "epoch": 0.9362723154332372, + "grad_norm": 1.7867295742034912, + "learning_rate": 2.121399385972278e-06, + "loss": 1.0921, + "step": 26144 + }, + { + "epoch": 0.9363081275628056, + "grad_norm": 1.2874763011932373, + "learning_rate": 2.11902357805277e-06, + "loss": 1.058, + "step": 26145 + }, + { + "epoch": 0.9363439396923738, + "grad_norm": 1.6540254354476929, + "learning_rate": 2.116649086995537e-06, + "loss": 1.5833, + "step": 26146 + }, + { + "epoch": 0.9363797518219421, + "grad_norm": 1.5691769123077393, + "learning_rate": 2.1142759128325306e-06, + "loss": 1.2805, + "step": 26147 + }, + { + "epoch": 0.9364155639515104, + "grad_norm": 2.247567653656006, + "learning_rate": 2.1119040555956925e-06, + "loss": 1.2188, + "step": 26148 + }, + { + "epoch": 0.9364513760810786, + "grad_norm": 1.526218295097351, + "learning_rate": 2.109533515316908e-06, + "loss": 1.198, + "step": 26149 + }, + { + "epoch": 0.9364871882106469, + "grad_norm": 1.5749701261520386, + "learning_rate": 2.1071642920280855e-06, + "loss": 1.3176, + "step": 26150 + }, + { + "epoch": 0.9365230003402152, + "grad_norm": 1.655874490737915, + "learning_rate": 2.1047963857610986e-06, + "loss": 1.3957, + "step": 26151 + }, + { + "epoch": 0.9365588124697836, + "grad_norm": 1.9014919996261597, + "learning_rate": 2.102429796547789e-06, + "loss": 1.6384, + "step": 26152 + }, + { + "epoch": 0.9365946245993518, + "grad_norm": 1.5637357234954834, + "learning_rate": 2.100064524420009e-06, + "loss": 1.303, + "step": 26153 + }, + { + "epoch": 0.9366304367289201, + "grad_norm": 2.0295560359954834, + "learning_rate": 2.0977005694095774e-06, + "loss": 1.3744, + "step": 26154 + }, + { + "epoch": 0.9366662488584884, + "grad_norm": 2.1295666694641113, + "learning_rate": 2.0953379315483134e-06, + "loss": 1.2995, + "step": 26155 + }, + { + "epoch": 0.9367020609880566, + "grad_norm": 1.3736835718154907, + "learning_rate": 2.0929766108679803e-06, + "loss": 1.4379, + "step": 26156 + }, + { + "epoch": 0.9367378731176249, + "grad_norm": 1.850738763809204, + "learning_rate": 2.0906166074003532e-06, + "loss": 1.3111, + "step": 26157 + }, + { + "epoch": 0.9367736852471932, + "grad_norm": 1.3870395421981812, + "learning_rate": 2.0882579211771837e-06, + "loss": 1.3912, + "step": 26158 + }, + { + "epoch": 0.9368094973767616, + "grad_norm": 1.872261881828308, + "learning_rate": 2.0859005522302245e-06, + "loss": 1.5057, + "step": 26159 + }, + { + "epoch": 0.9368453095063298, + "grad_norm": 1.5025333166122437, + "learning_rate": 2.0835445005911503e-06, + "loss": 1.2798, + "step": 26160 + }, + { + "epoch": 0.9368811216358981, + "grad_norm": 1.5420435667037964, + "learning_rate": 2.081189766291691e-06, + "loss": 1.303, + "step": 26161 + }, + { + "epoch": 0.9369169337654664, + "grad_norm": 1.7748241424560547, + "learning_rate": 2.0788363493635333e-06, + "loss": 1.4483, + "step": 26162 + }, + { + "epoch": 0.9369527458950346, + "grad_norm": 1.7625503540039062, + "learning_rate": 2.0764842498383063e-06, + "loss": 1.348, + "step": 26163 + }, + { + "epoch": 0.9369885580246029, + "grad_norm": 1.4400793313980103, + "learning_rate": 2.074133467747663e-06, + "loss": 1.6231, + "step": 26164 + }, + { + "epoch": 0.9370243701541712, + "grad_norm": 1.3040224313735962, + "learning_rate": 2.071784003123256e-06, + "loss": 1.4208, + "step": 26165 + }, + { + "epoch": 0.9370601822837396, + "grad_norm": 1.9658088684082031, + "learning_rate": 2.069435855996671e-06, + "loss": 1.5414, + "step": 26166 + }, + { + "epoch": 0.9370959944133078, + "grad_norm": 1.4681401252746582, + "learning_rate": 2.0670890263995047e-06, + "loss": 1.1518, + "step": 26167 + }, + { + "epoch": 0.9371318065428761, + "grad_norm": 1.5593922138214111, + "learning_rate": 2.0647435143633322e-06, + "loss": 1.3844, + "step": 26168 + }, + { + "epoch": 0.9371676186724444, + "grad_norm": 1.4829758405685425, + "learning_rate": 2.0623993199197055e-06, + "loss": 1.4184, + "step": 26169 + }, + { + "epoch": 0.9372034308020126, + "grad_norm": 1.5997012853622437, + "learning_rate": 2.0600564431001668e-06, + "loss": 1.2306, + "step": 26170 + }, + { + "epoch": 0.9372392429315809, + "grad_norm": 1.61320960521698, + "learning_rate": 2.057714883936235e-06, + "loss": 1.3906, + "step": 26171 + }, + { + "epoch": 0.9372750550611492, + "grad_norm": 2.0251548290252686, + "learning_rate": 2.0553746424594065e-06, + "loss": 1.2477, + "step": 26172 + }, + { + "epoch": 0.9373108671907175, + "grad_norm": 1.9987709522247314, + "learning_rate": 2.0530357187011907e-06, + "loss": 1.3705, + "step": 26173 + }, + { + "epoch": 0.9373466793202858, + "grad_norm": 1.7930504083633423, + "learning_rate": 2.050698112693028e-06, + "loss": 1.1294, + "step": 26174 + }, + { + "epoch": 0.9373824914498541, + "grad_norm": 2.174814462661743, + "learning_rate": 2.0483618244663714e-06, + "loss": 1.3507, + "step": 26175 + }, + { + "epoch": 0.9374183035794224, + "grad_norm": 2.3176262378692627, + "learning_rate": 2.0460268540526518e-06, + "loss": 1.405, + "step": 26176 + }, + { + "epoch": 0.9374541157089906, + "grad_norm": 2.033170223236084, + "learning_rate": 2.04369320148331e-06, + "loss": 1.5684, + "step": 26177 + }, + { + "epoch": 0.9374899278385589, + "grad_norm": 1.7996870279312134, + "learning_rate": 2.041360866789721e-06, + "loss": 1.1019, + "step": 26178 + }, + { + "epoch": 0.9375257399681272, + "grad_norm": 2.1224141120910645, + "learning_rate": 2.0390298500032377e-06, + "loss": 1.1435, + "step": 26179 + }, + { + "epoch": 0.9375615520976955, + "grad_norm": 1.8272570371627808, + "learning_rate": 2.0367001511552685e-06, + "loss": 1.5015, + "step": 26180 + }, + { + "epoch": 0.9375973642272638, + "grad_norm": 1.5879367589950562, + "learning_rate": 2.0343717702771325e-06, + "loss": 1.3781, + "step": 26181 + }, + { + "epoch": 0.9376331763568321, + "grad_norm": 1.2769107818603516, + "learning_rate": 2.0320447074001492e-06, + "loss": 1.6663, + "step": 26182 + }, + { + "epoch": 0.9376689884864003, + "grad_norm": 1.972424864768982, + "learning_rate": 2.0297189625556377e-06, + "loss": 1.2255, + "step": 26183 + }, + { + "epoch": 0.9377048006159686, + "grad_norm": 1.7590620517730713, + "learning_rate": 2.027394535774896e-06, + "loss": 1.394, + "step": 26184 + }, + { + "epoch": 0.9377406127455369, + "grad_norm": 1.6972074508666992, + "learning_rate": 2.0250714270891757e-06, + "loss": 1.5186, + "step": 26185 + }, + { + "epoch": 0.9377764248751052, + "grad_norm": 1.7677464485168457, + "learning_rate": 2.0227496365297304e-06, + "loss": 1.415, + "step": 26186 + }, + { + "epoch": 0.9378122370046735, + "grad_norm": 1.1106623411178589, + "learning_rate": 2.020429164127835e-06, + "loss": 0.9381, + "step": 26187 + }, + { + "epoch": 0.9378480491342418, + "grad_norm": 1.5314291715621948, + "learning_rate": 2.0181100099146533e-06, + "loss": 1.1579, + "step": 26188 + }, + { + "epoch": 0.9378838612638101, + "grad_norm": 1.614284873008728, + "learning_rate": 2.015792173921438e-06, + "loss": 1.1576, + "step": 26189 + }, + { + "epoch": 0.9379196733933783, + "grad_norm": 1.5244933366775513, + "learning_rate": 2.01347565617932e-06, + "loss": 1.3892, + "step": 26190 + }, + { + "epoch": 0.9379554855229466, + "grad_norm": 1.5229711532592773, + "learning_rate": 2.0111604567195185e-06, + "loss": 1.249, + "step": 26191 + }, + { + "epoch": 0.9379912976525149, + "grad_norm": 1.5737578868865967, + "learning_rate": 2.008846575573142e-06, + "loss": 1.0932, + "step": 26192 + }, + { + "epoch": 0.9380271097820831, + "grad_norm": 1.5744316577911377, + "learning_rate": 2.006534012771344e-06, + "loss": 1.4432, + "step": 26193 + }, + { + "epoch": 0.9380629219116515, + "grad_norm": 1.4843591451644897, + "learning_rate": 2.004222768345221e-06, + "loss": 1.482, + "step": 26194 + }, + { + "epoch": 0.9380987340412198, + "grad_norm": 1.2316222190856934, + "learning_rate": 2.0019128423258816e-06, + "loss": 1.3807, + "step": 26195 + }, + { + "epoch": 0.9381345461707881, + "grad_norm": 1.4942682981491089, + "learning_rate": 1.999604234744401e-06, + "loss": 1.4272, + "step": 26196 + }, + { + "epoch": 0.9381703583003563, + "grad_norm": 2.2705743312835693, + "learning_rate": 1.99729694563181e-06, + "loss": 1.5428, + "step": 26197 + }, + { + "epoch": 0.9382061704299246, + "grad_norm": 1.8797887563705444, + "learning_rate": 1.9949909750192064e-06, + "loss": 1.4058, + "step": 26198 + }, + { + "epoch": 0.9382419825594929, + "grad_norm": 1.439017653465271, + "learning_rate": 1.992686322937565e-06, + "loss": 1.211, + "step": 26199 + }, + { + "epoch": 0.9382777946890611, + "grad_norm": 1.3723461627960205, + "learning_rate": 1.990382989417916e-06, + "loss": 1.4947, + "step": 26200 + }, + { + "epoch": 0.9383136068186295, + "grad_norm": 1.789353847503662, + "learning_rate": 1.9880809744912244e-06, + "loss": 1.4519, + "step": 26201 + }, + { + "epoch": 0.9383494189481978, + "grad_norm": 1.4479377269744873, + "learning_rate": 1.985780278188487e-06, + "loss": 1.5526, + "step": 26202 + }, + { + "epoch": 0.9383852310777661, + "grad_norm": 1.693623661994934, + "learning_rate": 1.983480900540646e-06, + "loss": 1.5865, + "step": 26203 + }, + { + "epoch": 0.9384210432073343, + "grad_norm": 1.6015986204147339, + "learning_rate": 1.981182841578644e-06, + "loss": 1.5839, + "step": 26204 + }, + { + "epoch": 0.9384568553369026, + "grad_norm": 2.0890917778015137, + "learning_rate": 1.978886101333388e-06, + "loss": 1.1135, + "step": 26205 + }, + { + "epoch": 0.9384926674664709, + "grad_norm": 1.5381704568862915, + "learning_rate": 1.9765906798357767e-06, + "loss": 1.5057, + "step": 26206 + }, + { + "epoch": 0.9385284795960391, + "grad_norm": 1.5179535150527954, + "learning_rate": 1.9742965771167077e-06, + "loss": 1.5197, + "step": 26207 + }, + { + "epoch": 0.9385642917256075, + "grad_norm": 1.7586171627044678, + "learning_rate": 1.972003793207011e-06, + "loss": 1.5069, + "step": 26208 + }, + { + "epoch": 0.9386001038551758, + "grad_norm": 1.4683053493499756, + "learning_rate": 1.969712328137574e-06, + "loss": 1.2305, + "step": 26209 + }, + { + "epoch": 0.938635915984744, + "grad_norm": 1.755323886871338, + "learning_rate": 1.967422181939205e-06, + "loss": 1.4299, + "step": 26210 + }, + { + "epoch": 0.9386717281143123, + "grad_norm": 1.5153175592422485, + "learning_rate": 1.9651333546427232e-06, + "loss": 1.5045, + "step": 26211 + }, + { + "epoch": 0.9387075402438806, + "grad_norm": 1.4256856441497803, + "learning_rate": 1.9628458462789044e-06, + "loss": 1.1516, + "step": 26212 + }, + { + "epoch": 0.9387433523734489, + "grad_norm": 1.3613686561584473, + "learning_rate": 1.960559656878547e-06, + "loss": 1.4651, + "step": 26213 + }, + { + "epoch": 0.9387791645030171, + "grad_norm": 1.3513787984848022, + "learning_rate": 1.9582747864723917e-06, + "loss": 1.3124, + "step": 26214 + }, + { + "epoch": 0.9388149766325855, + "grad_norm": 1.5042704343795776, + "learning_rate": 1.9559912350911925e-06, + "loss": 1.398, + "step": 26215 + }, + { + "epoch": 0.9388507887621538, + "grad_norm": 1.5108270645141602, + "learning_rate": 1.953709002765647e-06, + "loss": 1.3385, + "step": 26216 + }, + { + "epoch": 0.938886600891722, + "grad_norm": 1.5499955415725708, + "learning_rate": 1.951428089526486e-06, + "loss": 1.5028, + "step": 26217 + }, + { + "epoch": 0.9389224130212903, + "grad_norm": 1.7364864349365234, + "learning_rate": 1.949148495404396e-06, + "loss": 1.306, + "step": 26218 + }, + { + "epoch": 0.9389582251508586, + "grad_norm": 1.5313931703567505, + "learning_rate": 1.9468702204300195e-06, + "loss": 1.5107, + "step": 26219 + }, + { + "epoch": 0.9389940372804269, + "grad_norm": 1.9169384241104126, + "learning_rate": 1.9445932646340314e-06, + "loss": 1.3873, + "step": 26220 + }, + { + "epoch": 0.9390298494099951, + "grad_norm": 1.4949196577072144, + "learning_rate": 1.9423176280470633e-06, + "loss": 1.5275, + "step": 26221 + }, + { + "epoch": 0.9390656615395635, + "grad_norm": 1.7438608407974243, + "learning_rate": 1.940043310699724e-06, + "loss": 1.3416, + "step": 26222 + }, + { + "epoch": 0.9391014736691318, + "grad_norm": 2.022535562515259, + "learning_rate": 1.937770312622611e-06, + "loss": 1.5157, + "step": 26223 + }, + { + "epoch": 0.9391372857987, + "grad_norm": 1.9582380056381226, + "learning_rate": 1.9354986338463e-06, + "loss": 1.2197, + "step": 26224 + }, + { + "epoch": 0.9391730979282683, + "grad_norm": 1.4199358224868774, + "learning_rate": 1.9332282744013774e-06, + "loss": 1.5727, + "step": 26225 + }, + { + "epoch": 0.9392089100578366, + "grad_norm": 1.7500290870666504, + "learning_rate": 1.9309592343183636e-06, + "loss": 1.5048, + "step": 26226 + }, + { + "epoch": 0.9392447221874048, + "grad_norm": 1.1745381355285645, + "learning_rate": 1.9286915136277894e-06, + "loss": 1.3728, + "step": 26227 + }, + { + "epoch": 0.9392805343169731, + "grad_norm": 1.6411765813827515, + "learning_rate": 1.926425112360164e-06, + "loss": 1.1032, + "step": 26228 + }, + { + "epoch": 0.9393163464465415, + "grad_norm": 2.0992393493652344, + "learning_rate": 1.924160030545996e-06, + "loss": 1.5346, + "step": 26229 + }, + { + "epoch": 0.9393521585761098, + "grad_norm": 1.2511874437332153, + "learning_rate": 1.9218962682157395e-06, + "loss": 1.4908, + "step": 26230 + }, + { + "epoch": 0.939387970705678, + "grad_norm": 1.5121225118637085, + "learning_rate": 1.919633825399858e-06, + "loss": 1.4154, + "step": 26231 + }, + { + "epoch": 0.9394237828352463, + "grad_norm": 1.8958724737167358, + "learning_rate": 1.9173727021287947e-06, + "loss": 1.4142, + "step": 26232 + }, + { + "epoch": 0.9394595949648146, + "grad_norm": 1.8122069835662842, + "learning_rate": 1.915112898432947e-06, + "loss": 1.5462, + "step": 26233 + }, + { + "epoch": 0.9394954070943828, + "grad_norm": 1.5994408130645752, + "learning_rate": 1.9128544143427463e-06, + "loss": 1.5741, + "step": 26234 + }, + { + "epoch": 0.9395312192239511, + "grad_norm": 1.965086817741394, + "learning_rate": 1.910597249888568e-06, + "loss": 1.4839, + "step": 26235 + }, + { + "epoch": 0.9395670313535195, + "grad_norm": 1.5724587440490723, + "learning_rate": 1.9083414051007776e-06, + "loss": 1.6046, + "step": 26236 + }, + { + "epoch": 0.9396028434830878, + "grad_norm": 1.785568118095398, + "learning_rate": 1.9060868800097164e-06, + "loss": 1.3867, + "step": 26237 + }, + { + "epoch": 0.939638655612656, + "grad_norm": 1.5689623355865479, + "learning_rate": 1.9038336746457276e-06, + "loss": 1.2745, + "step": 26238 + }, + { + "epoch": 0.9396744677422243, + "grad_norm": 2.110036611557007, + "learning_rate": 1.9015817890391308e-06, + "loss": 1.2477, + "step": 26239 + }, + { + "epoch": 0.9397102798717926, + "grad_norm": 2.069141149520874, + "learning_rate": 1.8993312232202021e-06, + "loss": 1.5739, + "step": 26240 + }, + { + "epoch": 0.9397460920013608, + "grad_norm": 1.2748523950576782, + "learning_rate": 1.8970819772192394e-06, + "loss": 1.4898, + "step": 26241 + }, + { + "epoch": 0.9397819041309291, + "grad_norm": 1.6000792980194092, + "learning_rate": 1.8948340510664853e-06, + "loss": 1.1472, + "step": 26242 + }, + { + "epoch": 0.9398177162604975, + "grad_norm": 1.5243643522262573, + "learning_rate": 1.8925874447922044e-06, + "loss": 1.3172, + "step": 26243 + }, + { + "epoch": 0.9398535283900658, + "grad_norm": 1.70756995677948, + "learning_rate": 1.8903421584266056e-06, + "loss": 1.4195, + "step": 26244 + }, + { + "epoch": 0.939889340519634, + "grad_norm": 1.3219189643859863, + "learning_rate": 1.8880981919998875e-06, + "loss": 1.3242, + "step": 26245 + }, + { + "epoch": 0.9399251526492023, + "grad_norm": 1.4109432697296143, + "learning_rate": 1.8858555455422699e-06, + "loss": 1.4621, + "step": 26246 + }, + { + "epoch": 0.9399609647787706, + "grad_norm": 1.9342769384384155, + "learning_rate": 1.8836142190839067e-06, + "loss": 1.5312, + "step": 26247 + }, + { + "epoch": 0.9399967769083388, + "grad_norm": 1.6864076852798462, + "learning_rate": 1.8813742126549404e-06, + "loss": 1.4521, + "step": 26248 + }, + { + "epoch": 0.9400325890379071, + "grad_norm": 1.7248731851577759, + "learning_rate": 1.879135526285525e-06, + "loss": 1.3108, + "step": 26249 + }, + { + "epoch": 0.9400684011674755, + "grad_norm": 1.3078269958496094, + "learning_rate": 1.876898160005791e-06, + "loss": 1.2327, + "step": 26250 + }, + { + "epoch": 0.9401042132970437, + "grad_norm": 1.7912778854370117, + "learning_rate": 1.8746621138458042e-06, + "loss": 1.3538, + "step": 26251 + }, + { + "epoch": 0.940140025426612, + "grad_norm": 1.9527921676635742, + "learning_rate": 1.8724273878356624e-06, + "loss": 1.23, + "step": 26252 + }, + { + "epoch": 0.9401758375561803, + "grad_norm": 1.850545883178711, + "learning_rate": 1.8701939820054414e-06, + "loss": 1.5436, + "step": 26253 + }, + { + "epoch": 0.9402116496857486, + "grad_norm": 1.3803719282150269, + "learning_rate": 1.8679618963851952e-06, + "loss": 1.543, + "step": 26254 + }, + { + "epoch": 0.9402474618153168, + "grad_norm": 1.3144088983535767, + "learning_rate": 1.8657311310049218e-06, + "loss": 1.5007, + "step": 26255 + }, + { + "epoch": 0.9402832739448851, + "grad_norm": 1.282601237297058, + "learning_rate": 1.863501685894664e-06, + "loss": 1.3551, + "step": 26256 + }, + { + "epoch": 0.9403190860744535, + "grad_norm": 2.010690927505493, + "learning_rate": 1.861273561084398e-06, + "loss": 1.3937, + "step": 26257 + }, + { + "epoch": 0.9403548982040217, + "grad_norm": 2.084581136703491, + "learning_rate": 1.8590467566041104e-06, + "loss": 1.7239, + "step": 26258 + }, + { + "epoch": 0.94039071033359, + "grad_norm": 1.5342340469360352, + "learning_rate": 1.8568212724837442e-06, + "loss": 1.2138, + "step": 26259 + }, + { + "epoch": 0.9404265224631583, + "grad_norm": 1.4099143743515015, + "learning_rate": 1.8545971087532644e-06, + "loss": 1.7193, + "step": 26260 + }, + { + "epoch": 0.9404623345927265, + "grad_norm": 1.9763034582138062, + "learning_rate": 1.8523742654425802e-06, + "loss": 1.4201, + "step": 26261 + }, + { + "epoch": 0.9404981467222948, + "grad_norm": 1.699539065361023, + "learning_rate": 1.8501527425816012e-06, + "loss": 1.6167, + "step": 26262 + }, + { + "epoch": 0.9405339588518631, + "grad_norm": 1.9624556303024292, + "learning_rate": 1.8479325402002034e-06, + "loss": 1.5734, + "step": 26263 + }, + { + "epoch": 0.9405697709814315, + "grad_norm": 2.5679054260253906, + "learning_rate": 1.8457136583282741e-06, + "loss": 1.3815, + "step": 26264 + }, + { + "epoch": 0.9406055831109997, + "grad_norm": 1.602765440940857, + "learning_rate": 1.8434960969956561e-06, + "loss": 1.489, + "step": 26265 + }, + { + "epoch": 0.940641395240568, + "grad_norm": 2.7537906169891357, + "learning_rate": 1.8412798562321809e-06, + "loss": 1.2926, + "step": 26266 + }, + { + "epoch": 0.9406772073701363, + "grad_norm": 1.3616611957550049, + "learning_rate": 1.8390649360676692e-06, + "loss": 1.625, + "step": 26267 + }, + { + "epoch": 0.9407130194997045, + "grad_norm": 1.3584762811660767, + "learning_rate": 1.8368513365319306e-06, + "loss": 1.11, + "step": 26268 + }, + { + "epoch": 0.9407488316292728, + "grad_norm": 1.5779452323913574, + "learning_rate": 1.83463905765473e-06, + "loss": 1.6435, + "step": 26269 + }, + { + "epoch": 0.9407846437588411, + "grad_norm": 1.533232569694519, + "learning_rate": 1.8324280994658327e-06, + "loss": 1.4304, + "step": 26270 + }, + { + "epoch": 0.9408204558884095, + "grad_norm": 1.4318612813949585, + "learning_rate": 1.8302184619949925e-06, + "loss": 1.3363, + "step": 26271 + }, + { + "epoch": 0.9408562680179777, + "grad_norm": 1.623886227607727, + "learning_rate": 1.8280101452719412e-06, + "loss": 1.2529, + "step": 26272 + }, + { + "epoch": 0.940892080147546, + "grad_norm": 1.5533499717712402, + "learning_rate": 1.825803149326366e-06, + "loss": 1.488, + "step": 26273 + }, + { + "epoch": 0.9409278922771143, + "grad_norm": 1.4572827816009521, + "learning_rate": 1.8235974741879769e-06, + "loss": 1.2201, + "step": 26274 + }, + { + "epoch": 0.9409637044066825, + "grad_norm": 1.7187683582305908, + "learning_rate": 1.8213931198864608e-06, + "loss": 1.6117, + "step": 26275 + }, + { + "epoch": 0.9409995165362508, + "grad_norm": 1.6530053615570068, + "learning_rate": 1.8191900864514388e-06, + "loss": 1.2012, + "step": 26276 + }, + { + "epoch": 0.9410353286658191, + "grad_norm": 1.9581012725830078, + "learning_rate": 1.816988373912587e-06, + "loss": 1.4425, + "step": 26277 + }, + { + "epoch": 0.9410711407953875, + "grad_norm": 1.6734392642974854, + "learning_rate": 1.8147879822994928e-06, + "loss": 1.3796, + "step": 26278 + }, + { + "epoch": 0.9411069529249557, + "grad_norm": 1.4807915687561035, + "learning_rate": 1.8125889116417883e-06, + "loss": 1.314, + "step": 26279 + }, + { + "epoch": 0.941142765054524, + "grad_norm": 2.1756036281585693, + "learning_rate": 1.8103911619690384e-06, + "loss": 1.6318, + "step": 26280 + }, + { + "epoch": 0.9411785771840923, + "grad_norm": 1.8008071184158325, + "learning_rate": 1.8081947333108195e-06, + "loss": 1.4568, + "step": 26281 + }, + { + "epoch": 0.9412143893136605, + "grad_norm": 1.7400346994400024, + "learning_rate": 1.805999625696686e-06, + "loss": 1.4965, + "step": 26282 + }, + { + "epoch": 0.9412502014432288, + "grad_norm": 1.8043440580368042, + "learning_rate": 1.8038058391561697e-06, + "loss": 1.1718, + "step": 26283 + }, + { + "epoch": 0.9412860135727971, + "grad_norm": 2.3016085624694824, + "learning_rate": 1.8016133737187913e-06, + "loss": 1.3647, + "step": 26284 + }, + { + "epoch": 0.9413218257023654, + "grad_norm": 1.9413868188858032, + "learning_rate": 1.799422229414016e-06, + "loss": 1.1064, + "step": 26285 + }, + { + "epoch": 0.9413576378319337, + "grad_norm": 1.3793493509292603, + "learning_rate": 1.7972324062713652e-06, + "loss": 1.5471, + "step": 26286 + }, + { + "epoch": 0.941393449961502, + "grad_norm": 1.6569875478744507, + "learning_rate": 1.7950439043202593e-06, + "loss": 1.3871, + "step": 26287 + }, + { + "epoch": 0.9414292620910703, + "grad_norm": 2.83836030960083, + "learning_rate": 1.7928567235901861e-06, + "loss": 1.6417, + "step": 26288 + }, + { + "epoch": 0.9414650742206385, + "grad_norm": 1.4256877899169922, + "learning_rate": 1.790670864110522e-06, + "loss": 1.3258, + "step": 26289 + }, + { + "epoch": 0.9415008863502068, + "grad_norm": 1.5324336290359497, + "learning_rate": 1.7884863259107209e-06, + "loss": 1.4885, + "step": 26290 + }, + { + "epoch": 0.9415366984797751, + "grad_norm": 1.494587779045105, + "learning_rate": 1.7863031090201377e-06, + "loss": 1.5611, + "step": 26291 + }, + { + "epoch": 0.9415725106093434, + "grad_norm": 1.7873804569244385, + "learning_rate": 1.7841212134681705e-06, + "loss": 1.4419, + "step": 26292 + }, + { + "epoch": 0.9416083227389117, + "grad_norm": 1.5246162414550781, + "learning_rate": 1.781940639284163e-06, + "loss": 1.4035, + "step": 26293 + }, + { + "epoch": 0.94164413486848, + "grad_norm": 2.191568613052368, + "learning_rate": 1.7797613864974472e-06, + "loss": 1.4957, + "step": 26294 + }, + { + "epoch": 0.9416799469980482, + "grad_norm": 1.926516056060791, + "learning_rate": 1.7775834551373548e-06, + "loss": 0.9744, + "step": 26295 + }, + { + "epoch": 0.9417157591276165, + "grad_norm": 2.023942470550537, + "learning_rate": 1.775406845233163e-06, + "loss": 1.1816, + "step": 26296 + }, + { + "epoch": 0.9417515712571848, + "grad_norm": 1.6293940544128418, + "learning_rate": 1.7732315568141811e-06, + "loss": 1.2553, + "step": 26297 + }, + { + "epoch": 0.941787383386753, + "grad_norm": 1.5346230268478394, + "learning_rate": 1.7710575899096637e-06, + "loss": 1.3784, + "step": 26298 + }, + { + "epoch": 0.9418231955163214, + "grad_norm": 1.4494526386260986, + "learning_rate": 1.7688849445488654e-06, + "loss": 1.2135, + "step": 26299 + }, + { + "epoch": 0.9418590076458897, + "grad_norm": 1.3800164461135864, + "learning_rate": 1.7667136207609958e-06, + "loss": 1.4168, + "step": 26300 + }, + { + "epoch": 0.941894819775458, + "grad_norm": 1.7155061960220337, + "learning_rate": 1.7645436185753095e-06, + "loss": 1.5418, + "step": 26301 + }, + { + "epoch": 0.9419306319050262, + "grad_norm": 1.9509527683258057, + "learning_rate": 1.7623749380209609e-06, + "loss": 1.2594, + "step": 26302 + }, + { + "epoch": 0.9419664440345945, + "grad_norm": 1.7195106744766235, + "learning_rate": 1.7602075791271377e-06, + "loss": 1.451, + "step": 26303 + }, + { + "epoch": 0.9420022561641628, + "grad_norm": 1.8029755353927612, + "learning_rate": 1.7580415419229946e-06, + "loss": 1.1763, + "step": 26304 + }, + { + "epoch": 0.942038068293731, + "grad_norm": 1.6189782619476318, + "learning_rate": 1.7558768264376856e-06, + "loss": 1.4517, + "step": 26305 + }, + { + "epoch": 0.9420738804232994, + "grad_norm": 1.5579745769500732, + "learning_rate": 1.7537134327003324e-06, + "loss": 1.4916, + "step": 26306 + }, + { + "epoch": 0.9421096925528677, + "grad_norm": 2.151827573776245, + "learning_rate": 1.7515513607400225e-06, + "loss": 1.6583, + "step": 26307 + }, + { + "epoch": 0.942145504682436, + "grad_norm": 1.8219877481460571, + "learning_rate": 1.749390610585877e-06, + "loss": 1.2741, + "step": 26308 + }, + { + "epoch": 0.9421813168120042, + "grad_norm": 1.3775928020477295, + "learning_rate": 1.7472311822669397e-06, + "loss": 1.3416, + "step": 26309 + }, + { + "epoch": 0.9422171289415725, + "grad_norm": 1.1605829000473022, + "learning_rate": 1.7450730758122757e-06, + "loss": 1.4166, + "step": 26310 + }, + { + "epoch": 0.9422529410711408, + "grad_norm": 1.6979670524597168, + "learning_rate": 1.7429162912508956e-06, + "loss": 1.4178, + "step": 26311 + }, + { + "epoch": 0.942288753200709, + "grad_norm": 1.6193804740905762, + "learning_rate": 1.7407608286118427e-06, + "loss": 1.5039, + "step": 26312 + }, + { + "epoch": 0.9423245653302774, + "grad_norm": 2.2689478397369385, + "learning_rate": 1.7386066879241159e-06, + "loss": 1.6966, + "step": 26313 + }, + { + "epoch": 0.9423603774598457, + "grad_norm": 1.6678130626678467, + "learning_rate": 1.736453869216681e-06, + "loss": 1.4604, + "step": 26314 + }, + { + "epoch": 0.942396189589414, + "grad_norm": 1.554015040397644, + "learning_rate": 1.7343023725185038e-06, + "loss": 1.471, + "step": 26315 + }, + { + "epoch": 0.9424320017189822, + "grad_norm": 1.548634648323059, + "learning_rate": 1.7321521978585387e-06, + "loss": 1.5321, + "step": 26316 + }, + { + "epoch": 0.9424678138485505, + "grad_norm": 1.383250117301941, + "learning_rate": 1.7300033452657184e-06, + "loss": 1.3117, + "step": 26317 + }, + { + "epoch": 0.9425036259781188, + "grad_norm": 1.8333631753921509, + "learning_rate": 1.7278558147689306e-06, + "loss": 1.4467, + "step": 26318 + }, + { + "epoch": 0.942539438107687, + "grad_norm": 2.024153232574463, + "learning_rate": 1.7257096063970856e-06, + "loss": 1.4771, + "step": 26319 + }, + { + "epoch": 0.9425752502372554, + "grad_norm": 2.0110878944396973, + "learning_rate": 1.7235647201790605e-06, + "loss": 1.9133, + "step": 26320 + }, + { + "epoch": 0.9426110623668237, + "grad_norm": 2.2840211391448975, + "learning_rate": 1.7214211561436987e-06, + "loss": 1.576, + "step": 26321 + }, + { + "epoch": 0.942646874496392, + "grad_norm": 1.5875118970870972, + "learning_rate": 1.719278914319844e-06, + "loss": 1.4262, + "step": 26322 + }, + { + "epoch": 0.9426826866259602, + "grad_norm": 1.5386816263198853, + "learning_rate": 1.7171379947363175e-06, + "loss": 1.3028, + "step": 26323 + }, + { + "epoch": 0.9427184987555285, + "grad_norm": 1.6080418825149536, + "learning_rate": 1.7149983974219297e-06, + "loss": 1.4332, + "step": 26324 + }, + { + "epoch": 0.9427543108850968, + "grad_norm": 1.3982714414596558, + "learning_rate": 1.7128601224054464e-06, + "loss": 1.2564, + "step": 26325 + }, + { + "epoch": 0.942790123014665, + "grad_norm": 1.6409122943878174, + "learning_rate": 1.7107231697156557e-06, + "loss": 1.3616, + "step": 26326 + }, + { + "epoch": 0.9428259351442334, + "grad_norm": 1.7633566856384277, + "learning_rate": 1.7085875393813123e-06, + "loss": 1.3169, + "step": 26327 + }, + { + "epoch": 0.9428617472738017, + "grad_norm": 1.4810881614685059, + "learning_rate": 1.7064532314311266e-06, + "loss": 1.3493, + "step": 26328 + }, + { + "epoch": 0.94289755940337, + "grad_norm": 1.8232383728027344, + "learning_rate": 1.70432024589382e-06, + "loss": 1.5052, + "step": 26329 + }, + { + "epoch": 0.9429333715329382, + "grad_norm": 2.7344441413879395, + "learning_rate": 1.702188582798092e-06, + "loss": 1.6025, + "step": 26330 + }, + { + "epoch": 0.9429691836625065, + "grad_norm": 2.21616792678833, + "learning_rate": 1.7000582421726308e-06, + "loss": 1.5006, + "step": 26331 + }, + { + "epoch": 0.9430049957920748, + "grad_norm": 1.2851320505142212, + "learning_rate": 1.6979292240460799e-06, + "loss": 1.2972, + "step": 26332 + }, + { + "epoch": 0.943040807921643, + "grad_norm": 1.8838286399841309, + "learning_rate": 1.695801528447094e-06, + "loss": 1.398, + "step": 26333 + }, + { + "epoch": 0.9430766200512114, + "grad_norm": 1.508453130722046, + "learning_rate": 1.6936751554042951e-06, + "loss": 1.4436, + "step": 26334 + }, + { + "epoch": 0.9431124321807797, + "grad_norm": 1.5754175186157227, + "learning_rate": 1.6915501049462934e-06, + "loss": 1.2449, + "step": 26335 + }, + { + "epoch": 0.9431482443103479, + "grad_norm": 1.53984797000885, + "learning_rate": 1.6894263771016661e-06, + "loss": 1.8434, + "step": 26336 + }, + { + "epoch": 0.9431840564399162, + "grad_norm": 1.8990575075149536, + "learning_rate": 1.6873039718990014e-06, + "loss": 1.317, + "step": 26337 + }, + { + "epoch": 0.9432198685694845, + "grad_norm": 1.4001402854919434, + "learning_rate": 1.6851828893668543e-06, + "loss": 1.2721, + "step": 26338 + }, + { + "epoch": 0.9432556806990527, + "grad_norm": 1.814524531364441, + "learning_rate": 1.6830631295337462e-06, + "loss": 1.4002, + "step": 26339 + }, + { + "epoch": 0.943291492828621, + "grad_norm": 1.9315727949142456, + "learning_rate": 1.68094469242821e-06, + "loss": 1.3913, + "step": 26340 + }, + { + "epoch": 0.9433273049581893, + "grad_norm": 1.8088985681533813, + "learning_rate": 1.6788275780787343e-06, + "loss": 1.4572, + "step": 26341 + }, + { + "epoch": 0.9433631170877577, + "grad_norm": 1.4374310970306396, + "learning_rate": 1.6767117865138182e-06, + "loss": 1.3947, + "step": 26342 + }, + { + "epoch": 0.9433989292173259, + "grad_norm": 1.5694729089736938, + "learning_rate": 1.6745973177619056e-06, + "loss": 1.3992, + "step": 26343 + }, + { + "epoch": 0.9434347413468942, + "grad_norm": 1.4492532014846802, + "learning_rate": 1.6724841718514629e-06, + "loss": 1.266, + "step": 26344 + }, + { + "epoch": 0.9434705534764625, + "grad_norm": 1.6183035373687744, + "learning_rate": 1.6703723488109112e-06, + "loss": 1.5549, + "step": 26345 + }, + { + "epoch": 0.9435063656060307, + "grad_norm": 1.5488311052322388, + "learning_rate": 1.6682618486686619e-06, + "loss": 1.3828, + "step": 26346 + }, + { + "epoch": 0.943542177735599, + "grad_norm": 1.724896788597107, + "learning_rate": 1.6661526714531029e-06, + "loss": 1.1049, + "step": 26347 + }, + { + "epoch": 0.9435779898651673, + "grad_norm": 1.9380396604537964, + "learning_rate": 1.6640448171926226e-06, + "loss": 1.3431, + "step": 26348 + }, + { + "epoch": 0.9436138019947357, + "grad_norm": 2.2302026748657227, + "learning_rate": 1.6619382859155873e-06, + "loss": 1.6598, + "step": 26349 + }, + { + "epoch": 0.9436496141243039, + "grad_norm": 2.1377158164978027, + "learning_rate": 1.659833077650319e-06, + "loss": 1.3672, + "step": 26350 + }, + { + "epoch": 0.9436854262538722, + "grad_norm": 1.8395555019378662, + "learning_rate": 1.6577291924251392e-06, + "loss": 1.4068, + "step": 26351 + }, + { + "epoch": 0.9437212383834405, + "grad_norm": 1.6605585813522339, + "learning_rate": 1.6556266302683588e-06, + "loss": 1.1427, + "step": 26352 + }, + { + "epoch": 0.9437570505130087, + "grad_norm": 1.5564028024673462, + "learning_rate": 1.6535253912082772e-06, + "loss": 1.1415, + "step": 26353 + }, + { + "epoch": 0.943792862642577, + "grad_norm": 1.9182465076446533, + "learning_rate": 1.6514254752731494e-06, + "loss": 1.3955, + "step": 26354 + }, + { + "epoch": 0.9438286747721453, + "grad_norm": 1.4966611862182617, + "learning_rate": 1.6493268824912312e-06, + "loss": 1.3372, + "step": 26355 + }, + { + "epoch": 0.9438644869017137, + "grad_norm": 1.8412766456604004, + "learning_rate": 1.647229612890766e-06, + "loss": 1.5312, + "step": 26356 + }, + { + "epoch": 0.9439002990312819, + "grad_norm": 1.7466552257537842, + "learning_rate": 1.6451336664999539e-06, + "loss": 1.268, + "step": 26357 + }, + { + "epoch": 0.9439361111608502, + "grad_norm": 1.5257654190063477, + "learning_rate": 1.6430390433469945e-06, + "loss": 1.4011, + "step": 26358 + }, + { + "epoch": 0.9439719232904185, + "grad_norm": 1.8344718217849731, + "learning_rate": 1.640945743460065e-06, + "loss": 1.5296, + "step": 26359 + }, + { + "epoch": 0.9440077354199867, + "grad_norm": 1.9049286842346191, + "learning_rate": 1.6388537668673542e-06, + "loss": 1.1641, + "step": 26360 + }, + { + "epoch": 0.944043547549555, + "grad_norm": 1.3105257749557495, + "learning_rate": 1.636763113596984e-06, + "loss": 1.3407, + "step": 26361 + }, + { + "epoch": 0.9440793596791233, + "grad_norm": 2.12705659866333, + "learning_rate": 1.6346737836770875e-06, + "loss": 1.4898, + "step": 26362 + }, + { + "epoch": 0.9441151718086916, + "grad_norm": 1.5113617181777954, + "learning_rate": 1.6325857771357756e-06, + "loss": 1.5729, + "step": 26363 + }, + { + "epoch": 0.9441509839382599, + "grad_norm": 1.867129921913147, + "learning_rate": 1.6304990940011255e-06, + "loss": 1.3769, + "step": 26364 + }, + { + "epoch": 0.9441867960678282, + "grad_norm": 1.3271534442901611, + "learning_rate": 1.6284137343012263e-06, + "loss": 1.2326, + "step": 26365 + }, + { + "epoch": 0.9442226081973965, + "grad_norm": 1.6503703594207764, + "learning_rate": 1.6263296980641328e-06, + "loss": 1.2799, + "step": 26366 + }, + { + "epoch": 0.9442584203269647, + "grad_norm": 1.6071670055389404, + "learning_rate": 1.62424698531789e-06, + "loss": 1.2021, + "step": 26367 + }, + { + "epoch": 0.944294232456533, + "grad_norm": 1.499289870262146, + "learning_rate": 1.6221655960904968e-06, + "loss": 1.0335, + "step": 26368 + }, + { + "epoch": 0.9443300445861013, + "grad_norm": 1.6527241468429565, + "learning_rate": 1.620085530409965e-06, + "loss": 1.3493, + "step": 26369 + }, + { + "epoch": 0.9443658567156696, + "grad_norm": 1.5437915325164795, + "learning_rate": 1.6180067883042937e-06, + "loss": 1.468, + "step": 26370 + }, + { + "epoch": 0.9444016688452379, + "grad_norm": 2.0119853019714355, + "learning_rate": 1.6159293698014278e-06, + "loss": 1.4788, + "step": 26371 + }, + { + "epoch": 0.9444374809748062, + "grad_norm": 1.936726689338684, + "learning_rate": 1.6138532749293335e-06, + "loss": 1.4597, + "step": 26372 + }, + { + "epoch": 0.9444732931043744, + "grad_norm": 1.780268907546997, + "learning_rate": 1.6117785037159216e-06, + "loss": 1.1247, + "step": 26373 + }, + { + "epoch": 0.9445091052339427, + "grad_norm": 1.8385742902755737, + "learning_rate": 1.6097050561891369e-06, + "loss": 1.5127, + "step": 26374 + }, + { + "epoch": 0.944544917363511, + "grad_norm": 1.4780793190002441, + "learning_rate": 1.6076329323768347e-06, + "loss": 1.4763, + "step": 26375 + }, + { + "epoch": 0.9445807294930793, + "grad_norm": 1.8092213869094849, + "learning_rate": 1.605562132306937e-06, + "loss": 1.3985, + "step": 26376 + }, + { + "epoch": 0.9446165416226476, + "grad_norm": 1.405903935432434, + "learning_rate": 1.6034926560072549e-06, + "loss": 1.1074, + "step": 26377 + }, + { + "epoch": 0.9446523537522159, + "grad_norm": 1.8108632564544678, + "learning_rate": 1.6014245035056775e-06, + "loss": 1.3274, + "step": 26378 + }, + { + "epoch": 0.9446881658817842, + "grad_norm": 1.407546877861023, + "learning_rate": 1.5993576748300043e-06, + "loss": 1.4492, + "step": 26379 + }, + { + "epoch": 0.9447239780113524, + "grad_norm": 1.6472501754760742, + "learning_rate": 1.5972921700080357e-06, + "loss": 1.2341, + "step": 26380 + }, + { + "epoch": 0.9447597901409207, + "grad_norm": 1.5353707075119019, + "learning_rate": 1.5952279890675826e-06, + "loss": 1.1901, + "step": 26381 + }, + { + "epoch": 0.944795602270489, + "grad_norm": 1.5727671384811401, + "learning_rate": 1.5931651320364006e-06, + "loss": 1.5107, + "step": 26382 + }, + { + "epoch": 0.9448314144000572, + "grad_norm": 1.323904037475586, + "learning_rate": 1.5911035989422562e-06, + "loss": 1.3362, + "step": 26383 + }, + { + "epoch": 0.9448672265296256, + "grad_norm": 1.4275002479553223, + "learning_rate": 1.5890433898128498e-06, + "loss": 1.4968, + "step": 26384 + }, + { + "epoch": 0.9449030386591939, + "grad_norm": 1.5640442371368408, + "learning_rate": 1.5869845046759369e-06, + "loss": 1.4972, + "step": 26385 + }, + { + "epoch": 0.9449388507887622, + "grad_norm": 2.0282034873962402, + "learning_rate": 1.5849269435592061e-06, + "loss": 1.6468, + "step": 26386 + }, + { + "epoch": 0.9449746629183304, + "grad_norm": 1.7705632448196411, + "learning_rate": 1.5828707064903359e-06, + "loss": 1.3092, + "step": 26387 + }, + { + "epoch": 0.9450104750478987, + "grad_norm": 1.7389479875564575, + "learning_rate": 1.5808157934969813e-06, + "loss": 1.0817, + "step": 26388 + }, + { + "epoch": 0.945046287177467, + "grad_norm": 1.5240728855133057, + "learning_rate": 1.5787622046068207e-06, + "loss": 1.3589, + "step": 26389 + }, + { + "epoch": 0.9450820993070352, + "grad_norm": 1.9323331117630005, + "learning_rate": 1.576709939847454e-06, + "loss": 1.2756, + "step": 26390 + }, + { + "epoch": 0.9451179114366036, + "grad_norm": 1.5090250968933105, + "learning_rate": 1.574658999246481e-06, + "loss": 1.1489, + "step": 26391 + }, + { + "epoch": 0.9451537235661719, + "grad_norm": 2.539374828338623, + "learning_rate": 1.5726093828315248e-06, + "loss": 1.2725, + "step": 26392 + }, + { + "epoch": 0.9451895356957402, + "grad_norm": 1.6672533750534058, + "learning_rate": 1.5705610906301404e-06, + "loss": 1.317, + "step": 26393 + }, + { + "epoch": 0.9452253478253084, + "grad_norm": 1.4400562047958374, + "learning_rate": 1.5685141226699064e-06, + "loss": 1.4577, + "step": 26394 + }, + { + "epoch": 0.9452611599548767, + "grad_norm": 1.512329339981079, + "learning_rate": 1.5664684789783224e-06, + "loss": 1.2943, + "step": 26395 + }, + { + "epoch": 0.945296972084445, + "grad_norm": 1.447803258895874, + "learning_rate": 1.5644241595829557e-06, + "loss": 1.2944, + "step": 26396 + }, + { + "epoch": 0.9453327842140132, + "grad_norm": 1.6319823265075684, + "learning_rate": 1.562381164511284e-06, + "loss": 1.2474, + "step": 26397 + }, + { + "epoch": 0.9453685963435816, + "grad_norm": 1.9061074256896973, + "learning_rate": 1.5603394937907967e-06, + "loss": 1.4807, + "step": 26398 + }, + { + "epoch": 0.9454044084731499, + "grad_norm": 1.5398566722869873, + "learning_rate": 1.5582991474489607e-06, + "loss": 1.526, + "step": 26399 + }, + { + "epoch": 0.9454402206027182, + "grad_norm": 4.880867958068848, + "learning_rate": 1.5562601255132314e-06, + "loss": 1.3848, + "step": 26400 + }, + { + "epoch": 0.9454760327322864, + "grad_norm": 1.713074803352356, + "learning_rate": 1.554222428011043e-06, + "loss": 1.3903, + "step": 26401 + }, + { + "epoch": 0.9455118448618547, + "grad_norm": 1.401732325553894, + "learning_rate": 1.5521860549698063e-06, + "loss": 1.6819, + "step": 26402 + }, + { + "epoch": 0.945547656991423, + "grad_norm": 1.8747241497039795, + "learning_rate": 1.550151006416911e-06, + "loss": 1.5883, + "step": 26403 + }, + { + "epoch": 0.9455834691209912, + "grad_norm": 1.228621244430542, + "learning_rate": 1.5481172823797463e-06, + "loss": 1.3525, + "step": 26404 + }, + { + "epoch": 0.9456192812505596, + "grad_norm": 1.479820728302002, + "learning_rate": 1.5460848828856677e-06, + "loss": 1.6433, + "step": 26405 + }, + { + "epoch": 0.9456550933801279, + "grad_norm": 1.6731551885604858, + "learning_rate": 1.5440538079620204e-06, + "loss": 1.3101, + "step": 26406 + }, + { + "epoch": 0.9456909055096961, + "grad_norm": 1.7374619245529175, + "learning_rate": 1.5420240576361378e-06, + "loss": 1.4818, + "step": 26407 + }, + { + "epoch": 0.9457267176392644, + "grad_norm": 1.6798261404037476, + "learning_rate": 1.5399956319353092e-06, + "loss": 1.2353, + "step": 26408 + }, + { + "epoch": 0.9457625297688327, + "grad_norm": 1.8133715391159058, + "learning_rate": 1.5379685308868464e-06, + "loss": 1.3315, + "step": 26409 + }, + { + "epoch": 0.945798341898401, + "grad_norm": 2.04004168510437, + "learning_rate": 1.535942754517994e-06, + "loss": 1.154, + "step": 26410 + }, + { + "epoch": 0.9458341540279692, + "grad_norm": 1.9576140642166138, + "learning_rate": 1.53391830285603e-06, + "loss": 1.4195, + "step": 26411 + }, + { + "epoch": 0.9458699661575376, + "grad_norm": 1.7399269342422485, + "learning_rate": 1.5318951759281885e-06, + "loss": 1.6082, + "step": 26412 + }, + { + "epoch": 0.9459057782871059, + "grad_norm": 1.564070224761963, + "learning_rate": 1.529873373761681e-06, + "loss": 1.7338, + "step": 26413 + }, + { + "epoch": 0.9459415904166741, + "grad_norm": 1.5939946174621582, + "learning_rate": 1.527852896383708e-06, + "loss": 1.4318, + "step": 26414 + }, + { + "epoch": 0.9459774025462424, + "grad_norm": 1.428228497505188, + "learning_rate": 1.5258337438214587e-06, + "loss": 1.4115, + "step": 26415 + }, + { + "epoch": 0.9460132146758107, + "grad_norm": 1.6301560401916504, + "learning_rate": 1.5238159161020893e-06, + "loss": 1.5172, + "step": 26416 + }, + { + "epoch": 0.946049026805379, + "grad_norm": 1.5005348920822144, + "learning_rate": 1.5217994132527448e-06, + "loss": 1.5653, + "step": 26417 + }, + { + "epoch": 0.9460848389349472, + "grad_norm": 1.9517136812210083, + "learning_rate": 1.5197842353005698e-06, + "loss": 1.4366, + "step": 26418 + }, + { + "epoch": 0.9461206510645156, + "grad_norm": 1.6520404815673828, + "learning_rate": 1.5177703822726652e-06, + "loss": 1.4942, + "step": 26419 + }, + { + "epoch": 0.9461564631940839, + "grad_norm": 1.7810132503509521, + "learning_rate": 1.5157578541961315e-06, + "loss": 1.1101, + "step": 26420 + }, + { + "epoch": 0.9461922753236521, + "grad_norm": 1.829916000366211, + "learning_rate": 1.5137466510980357e-06, + "loss": 1.4551, + "step": 26421 + }, + { + "epoch": 0.9462280874532204, + "grad_norm": 1.324633002281189, + "learning_rate": 1.5117367730054343e-06, + "loss": 1.3078, + "step": 26422 + }, + { + "epoch": 0.9462638995827887, + "grad_norm": 1.974655270576477, + "learning_rate": 1.5097282199453943e-06, + "loss": 1.5415, + "step": 26423 + }, + { + "epoch": 0.9462997117123569, + "grad_norm": 1.4947298765182495, + "learning_rate": 1.5077209919449053e-06, + "loss": 1.2912, + "step": 26424 + }, + { + "epoch": 0.9463355238419252, + "grad_norm": 1.5060384273529053, + "learning_rate": 1.505715089030979e-06, + "loss": 1.293, + "step": 26425 + }, + { + "epoch": 0.9463713359714936, + "grad_norm": 2.223021984100342, + "learning_rate": 1.503710511230616e-06, + "loss": 1.344, + "step": 26426 + }, + { + "epoch": 0.9464071481010619, + "grad_norm": 1.7746235132217407, + "learning_rate": 1.5017072585707725e-06, + "loss": 1.4105, + "step": 26427 + }, + { + "epoch": 0.9464429602306301, + "grad_norm": 1.6269015073776245, + "learning_rate": 1.4997053310784047e-06, + "loss": 1.2152, + "step": 26428 + }, + { + "epoch": 0.9464787723601984, + "grad_norm": 1.9540683031082153, + "learning_rate": 1.497704728780447e-06, + "loss": 1.4217, + "step": 26429 + }, + { + "epoch": 0.9465145844897667, + "grad_norm": 1.675334095954895, + "learning_rate": 1.4957054517038106e-06, + "loss": 1.4094, + "step": 26430 + }, + { + "epoch": 0.9465503966193349, + "grad_norm": 2.8871023654937744, + "learning_rate": 1.4937074998753965e-06, + "loss": 1.3651, + "step": 26431 + }, + { + "epoch": 0.9465862087489032, + "grad_norm": 1.911722183227539, + "learning_rate": 1.491710873322083e-06, + "loss": 1.441, + "step": 26432 + }, + { + "epoch": 0.9466220208784716, + "grad_norm": 1.7608087062835693, + "learning_rate": 1.489715572070738e-06, + "loss": 1.421, + "step": 26433 + }, + { + "epoch": 0.9466578330080399, + "grad_norm": 1.8853119611740112, + "learning_rate": 1.4877215961482062e-06, + "loss": 1.2649, + "step": 26434 + }, + { + "epoch": 0.9466936451376081, + "grad_norm": 1.8366423845291138, + "learning_rate": 1.4857289455812883e-06, + "loss": 1.5063, + "step": 26435 + }, + { + "epoch": 0.9467294572671764, + "grad_norm": 1.7807245254516602, + "learning_rate": 1.48373762039683e-06, + "loss": 1.3553, + "step": 26436 + }, + { + "epoch": 0.9467652693967447, + "grad_norm": 1.6845966577529907, + "learning_rate": 1.4817476206216096e-06, + "loss": 1.2403, + "step": 26437 + }, + { + "epoch": 0.9468010815263129, + "grad_norm": 1.5173600912094116, + "learning_rate": 1.4797589462823836e-06, + "loss": 1.5802, + "step": 26438 + }, + { + "epoch": 0.9468368936558812, + "grad_norm": 1.406801462173462, + "learning_rate": 1.4777715974059192e-06, + "loss": 1.5145, + "step": 26439 + }, + { + "epoch": 0.9468727057854496, + "grad_norm": 1.5290340185165405, + "learning_rate": 1.4757855740189508e-06, + "loss": 1.3885, + "step": 26440 + }, + { + "epoch": 0.9469085179150178, + "grad_norm": 2.0321245193481445, + "learning_rate": 1.4738008761482125e-06, + "loss": 1.6672, + "step": 26441 + }, + { + "epoch": 0.9469443300445861, + "grad_norm": 1.7161322832107544, + "learning_rate": 1.471817503820383e-06, + "loss": 1.3366, + "step": 26442 + }, + { + "epoch": 0.9469801421741544, + "grad_norm": 1.8721601963043213, + "learning_rate": 1.469835457062163e-06, + "loss": 1.3743, + "step": 26443 + }, + { + "epoch": 0.9470159543037227, + "grad_norm": 1.5677088499069214, + "learning_rate": 1.4678547359002092e-06, + "loss": 1.556, + "step": 26444 + }, + { + "epoch": 0.9470517664332909, + "grad_norm": 1.6195982694625854, + "learning_rate": 1.465875340361178e-06, + "loss": 1.4419, + "step": 26445 + }, + { + "epoch": 0.9470875785628592, + "grad_norm": 1.714670181274414, + "learning_rate": 1.4638972704716814e-06, + "loss": 1.7476, + "step": 26446 + }, + { + "epoch": 0.9471233906924276, + "grad_norm": 1.6740520000457764, + "learning_rate": 1.4619205262583536e-06, + "loss": 1.4975, + "step": 26447 + }, + { + "epoch": 0.9471592028219958, + "grad_norm": 1.2522761821746826, + "learning_rate": 1.4599451077477844e-06, + "loss": 1.4487, + "step": 26448 + }, + { + "epoch": 0.9471950149515641, + "grad_norm": 1.4357094764709473, + "learning_rate": 1.4579710149665416e-06, + "loss": 1.2878, + "step": 26449 + }, + { + "epoch": 0.9472308270811324, + "grad_norm": 1.617078423500061, + "learning_rate": 1.4559982479411927e-06, + "loss": 1.4942, + "step": 26450 + }, + { + "epoch": 0.9472666392107006, + "grad_norm": 1.5373728275299072, + "learning_rate": 1.4540268066982722e-06, + "loss": 1.2622, + "step": 26451 + }, + { + "epoch": 0.9473024513402689, + "grad_norm": 2.110975980758667, + "learning_rate": 1.452056691264303e-06, + "loss": 1.2574, + "step": 26452 + }, + { + "epoch": 0.9473382634698372, + "grad_norm": 2.1490426063537598, + "learning_rate": 1.4500879016657865e-06, + "loss": 1.5997, + "step": 26453 + }, + { + "epoch": 0.9473740755994056, + "grad_norm": 1.6299991607666016, + "learning_rate": 1.4481204379292234e-06, + "loss": 1.5127, + "step": 26454 + }, + { + "epoch": 0.9474098877289738, + "grad_norm": 1.6052266359329224, + "learning_rate": 1.4461543000810929e-06, + "loss": 1.6573, + "step": 26455 + }, + { + "epoch": 0.9474456998585421, + "grad_norm": 2.261275291442871, + "learning_rate": 1.4441894881478069e-06, + "loss": 1.5065, + "step": 26456 + }, + { + "epoch": 0.9474815119881104, + "grad_norm": 1.8761277198791504, + "learning_rate": 1.4422260021558331e-06, + "loss": 1.6339, + "step": 26457 + }, + { + "epoch": 0.9475173241176786, + "grad_norm": 1.7935103178024292, + "learning_rate": 1.440263842131573e-06, + "loss": 1.5145, + "step": 26458 + }, + { + "epoch": 0.9475531362472469, + "grad_norm": 1.3724159002304077, + "learning_rate": 1.4383030081014493e-06, + "loss": 1.3055, + "step": 26459 + }, + { + "epoch": 0.9475889483768152, + "grad_norm": 1.9284459352493286, + "learning_rate": 1.436343500091808e-06, + "loss": 1.5151, + "step": 26460 + }, + { + "epoch": 0.9476247605063836, + "grad_norm": 1.7961905002593994, + "learning_rate": 1.4343853181290168e-06, + "loss": 1.5256, + "step": 26461 + }, + { + "epoch": 0.9476605726359518, + "grad_norm": 1.7151559591293335, + "learning_rate": 1.4324284622394547e-06, + "loss": 1.4779, + "step": 26462 + }, + { + "epoch": 0.9476963847655201, + "grad_norm": 1.4853241443634033, + "learning_rate": 1.4304729324494115e-06, + "loss": 1.4664, + "step": 26463 + }, + { + "epoch": 0.9477321968950884, + "grad_norm": 1.5965665578842163, + "learning_rate": 1.4285187287851997e-06, + "loss": 1.3899, + "step": 26464 + }, + { + "epoch": 0.9477680090246566, + "grad_norm": 1.4473627805709839, + "learning_rate": 1.4265658512731316e-06, + "loss": 1.5785, + "step": 26465 + }, + { + "epoch": 0.9478038211542249, + "grad_norm": 1.9590723514556885, + "learning_rate": 1.4246142999394751e-06, + "loss": 1.564, + "step": 26466 + }, + { + "epoch": 0.9478396332837932, + "grad_norm": 1.9028202295303345, + "learning_rate": 1.4226640748104757e-06, + "loss": 1.5236, + "step": 26467 + }, + { + "epoch": 0.9478754454133616, + "grad_norm": 1.640310525894165, + "learning_rate": 1.4207151759123683e-06, + "loss": 1.5118, + "step": 26468 + }, + { + "epoch": 0.9479112575429298, + "grad_norm": 1.495152473449707, + "learning_rate": 1.418767603271387e-06, + "loss": 1.4463, + "step": 26469 + }, + { + "epoch": 0.9479470696724981, + "grad_norm": 1.3607423305511475, + "learning_rate": 1.4168213569137223e-06, + "loss": 1.2448, + "step": 26470 + }, + { + "epoch": 0.9479828818020664, + "grad_norm": 1.2964245080947876, + "learning_rate": 1.4148764368655754e-06, + "loss": 1.4987, + "step": 26471 + }, + { + "epoch": 0.9480186939316346, + "grad_norm": 1.689813494682312, + "learning_rate": 1.4129328431530807e-06, + "loss": 1.2008, + "step": 26472 + }, + { + "epoch": 0.9480545060612029, + "grad_norm": 1.8330782651901245, + "learning_rate": 1.4109905758024177e-06, + "loss": 1.8116, + "step": 26473 + }, + { + "epoch": 0.9480903181907712, + "grad_norm": 1.5751451253890991, + "learning_rate": 1.4090496348397097e-06, + "loss": 1.2216, + "step": 26474 + }, + { + "epoch": 0.9481261303203395, + "grad_norm": 2.155590057373047, + "learning_rate": 1.407110020291058e-06, + "loss": 1.3441, + "step": 26475 + }, + { + "epoch": 0.9481619424499078, + "grad_norm": 2.1611201763153076, + "learning_rate": 1.4051717321825643e-06, + "loss": 1.453, + "step": 26476 + }, + { + "epoch": 0.9481977545794761, + "grad_norm": 1.6963903903961182, + "learning_rate": 1.403234770540307e-06, + "loss": 1.1672, + "step": 26477 + }, + { + "epoch": 0.9482335667090444, + "grad_norm": 1.572092056274414, + "learning_rate": 1.4012991353903549e-06, + "loss": 1.6028, + "step": 26478 + }, + { + "epoch": 0.9482693788386126, + "grad_norm": 1.452835202217102, + "learning_rate": 1.3993648267587312e-06, + "loss": 1.4143, + "step": 26479 + }, + { + "epoch": 0.9483051909681809, + "grad_norm": 1.6740903854370117, + "learning_rate": 1.3974318446714706e-06, + "loss": 1.61, + "step": 26480 + }, + { + "epoch": 0.9483410030977492, + "grad_norm": 1.7781686782836914, + "learning_rate": 1.395500189154575e-06, + "loss": 1.4476, + "step": 26481 + }, + { + "epoch": 0.9483768152273175, + "grad_norm": 1.7080374956130981, + "learning_rate": 1.3935698602340452e-06, + "loss": 1.3767, + "step": 26482 + }, + { + "epoch": 0.9484126273568858, + "grad_norm": 1.3993390798568726, + "learning_rate": 1.3916408579358164e-06, + "loss": 1.5493, + "step": 26483 + }, + { + "epoch": 0.9484484394864541, + "grad_norm": 1.3815793991088867, + "learning_rate": 1.3897131822858789e-06, + "loss": 1.4565, + "step": 26484 + }, + { + "epoch": 0.9484842516160223, + "grad_norm": 2.051628589630127, + "learning_rate": 1.3877868333101562e-06, + "loss": 1.5791, + "step": 26485 + }, + { + "epoch": 0.9485200637455906, + "grad_norm": 1.3823293447494507, + "learning_rate": 1.385861811034561e-06, + "loss": 1.2821, + "step": 26486 + }, + { + "epoch": 0.9485558758751589, + "grad_norm": 2.022139310836792, + "learning_rate": 1.383938115484984e-06, + "loss": 1.4049, + "step": 26487 + }, + { + "epoch": 0.9485916880047272, + "grad_norm": 1.9774534702301025, + "learning_rate": 1.3820157466873152e-06, + "loss": 1.4567, + "step": 26488 + }, + { + "epoch": 0.9486275001342955, + "grad_norm": 1.6379338502883911, + "learning_rate": 1.3800947046674228e-06, + "loss": 1.175, + "step": 26489 + }, + { + "epoch": 0.9486633122638638, + "grad_norm": 1.9727866649627686, + "learning_rate": 1.3781749894511308e-06, + "loss": 1.3028, + "step": 26490 + }, + { + "epoch": 0.9486991243934321, + "grad_norm": 1.7060546875, + "learning_rate": 1.3762566010642962e-06, + "loss": 1.3837, + "step": 26491 + }, + { + "epoch": 0.9487349365230003, + "grad_norm": 1.596717119216919, + "learning_rate": 1.3743395395326985e-06, + "loss": 1.2626, + "step": 26492 + }, + { + "epoch": 0.9487707486525686, + "grad_norm": 1.3802907466888428, + "learning_rate": 1.3724238048821615e-06, + "loss": 1.1592, + "step": 26493 + }, + { + "epoch": 0.9488065607821369, + "grad_norm": 1.8134254217147827, + "learning_rate": 1.370509397138431e-06, + "loss": 1.674, + "step": 26494 + }, + { + "epoch": 0.9488423729117051, + "grad_norm": 1.4546838998794556, + "learning_rate": 1.3685963163272752e-06, + "loss": 1.401, + "step": 26495 + }, + { + "epoch": 0.9488781850412735, + "grad_norm": 1.6962960958480835, + "learning_rate": 1.3666845624744406e-06, + "loss": 1.3322, + "step": 26496 + }, + { + "epoch": 0.9489139971708418, + "grad_norm": 1.3423304557800293, + "learning_rate": 1.3647741356056287e-06, + "loss": 1.2208, + "step": 26497 + }, + { + "epoch": 0.9489498093004101, + "grad_norm": 1.656837821006775, + "learning_rate": 1.3628650357465522e-06, + "loss": 1.3385, + "step": 26498 + }, + { + "epoch": 0.9489856214299783, + "grad_norm": 1.3497300148010254, + "learning_rate": 1.3609572629228906e-06, + "loss": 1.2634, + "step": 26499 + }, + { + "epoch": 0.9490214335595466, + "grad_norm": 2.2240593433380127, + "learning_rate": 1.3590508171603233e-06, + "loss": 1.6153, + "step": 26500 + }, + { + "epoch": 0.9490572456891149, + "grad_norm": 1.696808934211731, + "learning_rate": 1.3571456984844743e-06, + "loss": 1.33, + "step": 26501 + }, + { + "epoch": 0.9490930578186831, + "grad_norm": 1.5843878984451294, + "learning_rate": 1.3552419069210009e-06, + "loss": 1.2396, + "step": 26502 + }, + { + "epoch": 0.9491288699482515, + "grad_norm": 1.4548217058181763, + "learning_rate": 1.3533394424954937e-06, + "loss": 1.2981, + "step": 26503 + }, + { + "epoch": 0.9491646820778198, + "grad_norm": 1.916221022605896, + "learning_rate": 1.3514383052335766e-06, + "loss": 1.3667, + "step": 26504 + }, + { + "epoch": 0.9492004942073881, + "grad_norm": 1.5517075061798096, + "learning_rate": 1.3495384951607958e-06, + "loss": 1.2612, + "step": 26505 + }, + { + "epoch": 0.9492363063369563, + "grad_norm": 3.5113589763641357, + "learning_rate": 1.3476400123027312e-06, + "loss": 1.6406, + "step": 26506 + }, + { + "epoch": 0.9492721184665246, + "grad_norm": 1.6516262292861938, + "learning_rate": 1.3457428566849173e-06, + "loss": 1.6534, + "step": 26507 + }, + { + "epoch": 0.9493079305960929, + "grad_norm": 1.575160264968872, + "learning_rate": 1.3438470283328785e-06, + "loss": 1.4022, + "step": 26508 + }, + { + "epoch": 0.9493437427256611, + "grad_norm": 1.442672610282898, + "learning_rate": 1.3419525272721168e-06, + "loss": 1.4947, + "step": 26509 + }, + { + "epoch": 0.9493795548552295, + "grad_norm": 1.832108736038208, + "learning_rate": 1.3400593535281224e-06, + "loss": 1.4931, + "step": 26510 + }, + { + "epoch": 0.9494153669847978, + "grad_norm": 1.702636957168579, + "learning_rate": 1.3381675071263755e-06, + "loss": 1.564, + "step": 26511 + }, + { + "epoch": 0.949451179114366, + "grad_norm": 1.6425777673721313, + "learning_rate": 1.3362769880923221e-06, + "loss": 1.2737, + "step": 26512 + }, + { + "epoch": 0.9494869912439343, + "grad_norm": 1.3201589584350586, + "learning_rate": 1.3343877964513863e-06, + "loss": 1.3811, + "step": 26513 + }, + { + "epoch": 0.9495228033735026, + "grad_norm": 2.1531147956848145, + "learning_rate": 1.3324999322290033e-06, + "loss": 1.2665, + "step": 26514 + }, + { + "epoch": 0.9495586155030709, + "grad_norm": 1.7345833778381348, + "learning_rate": 1.330613395450553e-06, + "loss": 1.6657, + "step": 26515 + }, + { + "epoch": 0.9495944276326391, + "grad_norm": 1.5731521844863892, + "learning_rate": 1.3287281861414258e-06, + "loss": 1.4453, + "step": 26516 + }, + { + "epoch": 0.9496302397622075, + "grad_norm": 1.4536832571029663, + "learning_rate": 1.3268443043269796e-06, + "loss": 1.5805, + "step": 26517 + }, + { + "epoch": 0.9496660518917758, + "grad_norm": 1.422829031944275, + "learning_rate": 1.3249617500325718e-06, + "loss": 1.7652, + "step": 26518 + }, + { + "epoch": 0.949701864021344, + "grad_norm": 1.5982391834259033, + "learning_rate": 1.3230805232835153e-06, + "loss": 1.4507, + "step": 26519 + }, + { + "epoch": 0.9497376761509123, + "grad_norm": 1.574022650718689, + "learning_rate": 1.3212006241051345e-06, + "loss": 1.5129, + "step": 26520 + }, + { + "epoch": 0.9497734882804806, + "grad_norm": 1.3596420288085938, + "learning_rate": 1.319322052522709e-06, + "loss": 1.2358, + "step": 26521 + }, + { + "epoch": 0.9498093004100489, + "grad_norm": 1.462693214416504, + "learning_rate": 1.3174448085615187e-06, + "loss": 1.4235, + "step": 26522 + }, + { + "epoch": 0.9498451125396171, + "grad_norm": 1.5312095880508423, + "learning_rate": 1.3155688922468101e-06, + "loss": 1.4207, + "step": 26523 + }, + { + "epoch": 0.9498809246691855, + "grad_norm": 1.4311823844909668, + "learning_rate": 1.3136943036038297e-06, + "loss": 1.3391, + "step": 26524 + }, + { + "epoch": 0.9499167367987538, + "grad_norm": 1.4623321294784546, + "learning_rate": 1.3118210426578015e-06, + "loss": 1.2909, + "step": 26525 + }, + { + "epoch": 0.949952548928322, + "grad_norm": 1.7718008756637573, + "learning_rate": 1.3099491094339279e-06, + "loss": 1.3087, + "step": 26526 + }, + { + "epoch": 0.9499883610578903, + "grad_norm": 1.7378346920013428, + "learning_rate": 1.3080785039573773e-06, + "loss": 1.4943, + "step": 26527 + }, + { + "epoch": 0.9500241731874586, + "grad_norm": 1.7058058977127075, + "learning_rate": 1.3062092262533189e-06, + "loss": 1.4128, + "step": 26528 + }, + { + "epoch": 0.9500599853170268, + "grad_norm": 1.367444396018982, + "learning_rate": 1.304341276346932e-06, + "loss": 1.0962, + "step": 26529 + }, + { + "epoch": 0.9500957974465951, + "grad_norm": 1.446695327758789, + "learning_rate": 1.3024746542633082e-06, + "loss": 1.2731, + "step": 26530 + }, + { + "epoch": 0.9501316095761635, + "grad_norm": 1.3863316774368286, + "learning_rate": 1.3006093600275825e-06, + "loss": 1.5057, + "step": 26531 + }, + { + "epoch": 0.9501674217057318, + "grad_norm": 2.1868419647216797, + "learning_rate": 1.2987453936648575e-06, + "loss": 1.3829, + "step": 26532 + }, + { + "epoch": 0.9502032338353, + "grad_norm": 1.467094898223877, + "learning_rate": 1.2968827552001793e-06, + "loss": 1.5164, + "step": 26533 + }, + { + "epoch": 0.9502390459648683, + "grad_norm": 1.3066911697387695, + "learning_rate": 1.2950214446586284e-06, + "loss": 1.3691, + "step": 26534 + }, + { + "epoch": 0.9502748580944366, + "grad_norm": 1.5992457866668701, + "learning_rate": 1.2931614620652511e-06, + "loss": 1.4246, + "step": 26535 + }, + { + "epoch": 0.9503106702240048, + "grad_norm": 1.3781858682632446, + "learning_rate": 1.2913028074450607e-06, + "loss": 1.4019, + "step": 26536 + }, + { + "epoch": 0.9503464823535731, + "grad_norm": 1.9242557287216187, + "learning_rate": 1.2894454808230593e-06, + "loss": 1.3886, + "step": 26537 + }, + { + "epoch": 0.9503822944831415, + "grad_norm": 1.6685802936553955, + "learning_rate": 1.2875894822242496e-06, + "loss": 1.3555, + "step": 26538 + }, + { + "epoch": 0.9504181066127098, + "grad_norm": 1.6586769819259644, + "learning_rate": 1.2857348116736002e-06, + "loss": 1.3652, + "step": 26539 + }, + { + "epoch": 0.950453918742278, + "grad_norm": 1.9376729726791382, + "learning_rate": 1.2838814691960355e-06, + "loss": 1.5063, + "step": 26540 + }, + { + "epoch": 0.9504897308718463, + "grad_norm": 1.4433695077896118, + "learning_rate": 1.2820294548165246e-06, + "loss": 1.2929, + "step": 26541 + }, + { + "epoch": 0.9505255430014146, + "grad_norm": 1.3399428129196167, + "learning_rate": 1.2801787685599698e-06, + "loss": 1.3715, + "step": 26542 + }, + { + "epoch": 0.9505613551309828, + "grad_norm": 1.332889437675476, + "learning_rate": 1.2783294104512734e-06, + "loss": 1.4932, + "step": 26543 + }, + { + "epoch": 0.9505971672605511, + "grad_norm": 1.9041041135787964, + "learning_rate": 1.2764813805153041e-06, + "loss": 1.6289, + "step": 26544 + }, + { + "epoch": 0.9506329793901195, + "grad_norm": 1.5188629627227783, + "learning_rate": 1.2746346787769425e-06, + "loss": 1.08, + "step": 26545 + }, + { + "epoch": 0.9506687915196878, + "grad_norm": 1.580101490020752, + "learning_rate": 1.272789305261013e-06, + "loss": 1.5524, + "step": 26546 + }, + { + "epoch": 0.950704603649256, + "grad_norm": 1.4802852869033813, + "learning_rate": 1.2709452599923731e-06, + "loss": 1.4638, + "step": 26547 + }, + { + "epoch": 0.9507404157788243, + "grad_norm": 3.1193196773529053, + "learning_rate": 1.2691025429958037e-06, + "loss": 1.2083, + "step": 26548 + }, + { + "epoch": 0.9507762279083926, + "grad_norm": 1.8259837627410889, + "learning_rate": 1.2672611542960954e-06, + "loss": 1.3745, + "step": 26549 + }, + { + "epoch": 0.9508120400379608, + "grad_norm": 1.8224256038665771, + "learning_rate": 1.2654210939180511e-06, + "loss": 1.4162, + "step": 26550 + }, + { + "epoch": 0.9508478521675291, + "grad_norm": 2.090353488922119, + "learning_rate": 1.2635823618863951e-06, + "loss": 1.4225, + "step": 26551 + }, + { + "epoch": 0.9508836642970975, + "grad_norm": 1.8558623790740967, + "learning_rate": 1.2617449582258744e-06, + "loss": 1.6497, + "step": 26552 + }, + { + "epoch": 0.9509194764266657, + "grad_norm": 1.5263234376907349, + "learning_rate": 1.2599088829612249e-06, + "loss": 1.4847, + "step": 26553 + }, + { + "epoch": 0.950955288556234, + "grad_norm": 1.3935396671295166, + "learning_rate": 1.2580741361171267e-06, + "loss": 1.5371, + "step": 26554 + }, + { + "epoch": 0.9509911006858023, + "grad_norm": 1.8329626321792603, + "learning_rate": 1.2562407177182712e-06, + "loss": 1.3644, + "step": 26555 + }, + { + "epoch": 0.9510269128153706, + "grad_norm": 1.6709609031677246, + "learning_rate": 1.2544086277893386e-06, + "loss": 1.6079, + "step": 26556 + }, + { + "epoch": 0.9510627249449388, + "grad_norm": 1.656237244606018, + "learning_rate": 1.2525778663549537e-06, + "loss": 1.2923, + "step": 26557 + }, + { + "epoch": 0.9510985370745071, + "grad_norm": 1.7023342847824097, + "learning_rate": 1.2507484334397634e-06, + "loss": 1.4957, + "step": 26558 + }, + { + "epoch": 0.9511343492040755, + "grad_norm": 1.3861737251281738, + "learning_rate": 1.2489203290683703e-06, + "loss": 1.3171, + "step": 26559 + }, + { + "epoch": 0.9511701613336437, + "grad_norm": 1.4072065353393555, + "learning_rate": 1.2470935532653772e-06, + "loss": 1.5584, + "step": 26560 + }, + { + "epoch": 0.951205973463212, + "grad_norm": 1.6545718908309937, + "learning_rate": 1.2452681060553639e-06, + "loss": 1.6098, + "step": 26561 + }, + { + "epoch": 0.9512417855927803, + "grad_norm": 1.359649658203125, + "learning_rate": 1.243443987462878e-06, + "loss": 1.4323, + "step": 26562 + }, + { + "epoch": 0.9512775977223485, + "grad_norm": 1.870990514755249, + "learning_rate": 1.2416211975124658e-06, + "loss": 1.4793, + "step": 26563 + }, + { + "epoch": 0.9513134098519168, + "grad_norm": 1.6119709014892578, + "learning_rate": 1.2397997362286528e-06, + "loss": 1.1674, + "step": 26564 + }, + { + "epoch": 0.9513492219814851, + "grad_norm": 1.4036765098571777, + "learning_rate": 1.2379796036359526e-06, + "loss": 1.4654, + "step": 26565 + }, + { + "epoch": 0.9513850341110535, + "grad_norm": 1.458703875541687, + "learning_rate": 1.2361607997588343e-06, + "loss": 1.538, + "step": 26566 + }, + { + "epoch": 0.9514208462406217, + "grad_norm": 1.934006690979004, + "learning_rate": 1.2343433246217673e-06, + "loss": 1.5755, + "step": 26567 + }, + { + "epoch": 0.95145665837019, + "grad_norm": 1.8799941539764404, + "learning_rate": 1.232527178249232e-06, + "loss": 1.5967, + "step": 26568 + }, + { + "epoch": 0.9514924704997583, + "grad_norm": 2.1603283882141113, + "learning_rate": 1.2307123606656312e-06, + "loss": 1.2299, + "step": 26569 + }, + { + "epoch": 0.9515282826293265, + "grad_norm": 2.093066692352295, + "learning_rate": 1.2288988718953897e-06, + "loss": 1.441, + "step": 26570 + }, + { + "epoch": 0.9515640947588948, + "grad_norm": 2.1063077449798584, + "learning_rate": 1.2270867119629103e-06, + "loss": 1.376, + "step": 26571 + }, + { + "epoch": 0.9515999068884631, + "grad_norm": 1.4114335775375366, + "learning_rate": 1.2252758808925736e-06, + "loss": 1.3026, + "step": 26572 + }, + { + "epoch": 0.9516357190180315, + "grad_norm": 1.5860474109649658, + "learning_rate": 1.2234663787087375e-06, + "loss": 1.3799, + "step": 26573 + }, + { + "epoch": 0.9516715311475997, + "grad_norm": 1.4082696437835693, + "learning_rate": 1.2216582054357495e-06, + "loss": 1.3476, + "step": 26574 + }, + { + "epoch": 0.951707343277168, + "grad_norm": 1.7118854522705078, + "learning_rate": 1.2198513610979346e-06, + "loss": 1.3482, + "step": 26575 + }, + { + "epoch": 0.9517431554067363, + "grad_norm": 2.112502336502075, + "learning_rate": 1.2180458457196064e-06, + "loss": 1.2543, + "step": 26576 + }, + { + "epoch": 0.9517789675363045, + "grad_norm": 1.6754275560379028, + "learning_rate": 1.2162416593250569e-06, + "loss": 1.5346, + "step": 26577 + }, + { + "epoch": 0.9518147796658728, + "grad_norm": 1.9799768924713135, + "learning_rate": 1.2144388019385333e-06, + "loss": 1.2265, + "step": 26578 + }, + { + "epoch": 0.9518505917954411, + "grad_norm": 1.3142305612564087, + "learning_rate": 1.2126372735843272e-06, + "loss": 1.4418, + "step": 26579 + }, + { + "epoch": 0.9518864039250094, + "grad_norm": 1.5378390550613403, + "learning_rate": 1.2108370742866526e-06, + "loss": 1.7047, + "step": 26580 + }, + { + "epoch": 0.9519222160545777, + "grad_norm": 1.4842941761016846, + "learning_rate": 1.2090382040697456e-06, + "loss": 1.3786, + "step": 26581 + }, + { + "epoch": 0.951958028184146, + "grad_norm": 1.9399491548538208, + "learning_rate": 1.2072406629577871e-06, + "loss": 1.6141, + "step": 26582 + }, + { + "epoch": 0.9519938403137143, + "grad_norm": 2.0363988876342773, + "learning_rate": 1.2054444509749906e-06, + "loss": 1.5161, + "step": 26583 + }, + { + "epoch": 0.9520296524432825, + "grad_norm": 1.677929162979126, + "learning_rate": 1.203649568145493e-06, + "loss": 1.4207, + "step": 26584 + }, + { + "epoch": 0.9520654645728508, + "grad_norm": 1.5547820329666138, + "learning_rate": 1.201856014493441e-06, + "loss": 1.4561, + "step": 26585 + }, + { + "epoch": 0.9521012767024191, + "grad_norm": 1.590218186378479, + "learning_rate": 1.2000637900429934e-06, + "loss": 1.3997, + "step": 26586 + }, + { + "epoch": 0.9521370888319874, + "grad_norm": 1.4837863445281982, + "learning_rate": 1.1982728948182308e-06, + "loss": 1.3099, + "step": 26587 + }, + { + "epoch": 0.9521729009615557, + "grad_norm": 1.7251883745193481, + "learning_rate": 1.1964833288432674e-06, + "loss": 1.3323, + "step": 26588 + }, + { + "epoch": 0.952208713091124, + "grad_norm": 2.328374147415161, + "learning_rate": 1.194695092142173e-06, + "loss": 1.6759, + "step": 26589 + }, + { + "epoch": 0.9522445252206923, + "grad_norm": 1.6122950315475464, + "learning_rate": 1.1929081847390056e-06, + "loss": 1.2681, + "step": 26590 + }, + { + "epoch": 0.9522803373502605, + "grad_norm": 1.4836642742156982, + "learning_rate": 1.191122606657813e-06, + "loss": 1.2356, + "step": 26591 + }, + { + "epoch": 0.9523161494798288, + "grad_norm": 1.837382435798645, + "learning_rate": 1.1893383579226091e-06, + "loss": 1.5641, + "step": 26592 + }, + { + "epoch": 0.9523519616093971, + "grad_norm": 2.236088275909424, + "learning_rate": 1.1875554385573972e-06, + "loss": 1.538, + "step": 26593 + }, + { + "epoch": 0.9523877737389654, + "grad_norm": 1.5058979988098145, + "learning_rate": 1.185773848586158e-06, + "loss": 1.551, + "step": 26594 + }, + { + "epoch": 0.9524235858685337, + "grad_norm": 2.073533773422241, + "learning_rate": 1.1839935880328946e-06, + "loss": 1.7001, + "step": 26595 + }, + { + "epoch": 0.952459397998102, + "grad_norm": 2.2686805725097656, + "learning_rate": 1.1822146569215097e-06, + "loss": 1.5851, + "step": 26596 + }, + { + "epoch": 0.9524952101276702, + "grad_norm": 1.4711614847183228, + "learning_rate": 1.1804370552759735e-06, + "loss": 1.4615, + "step": 26597 + }, + { + "epoch": 0.9525310222572385, + "grad_norm": 1.4939398765563965, + "learning_rate": 1.178660783120189e-06, + "loss": 1.4739, + "step": 26598 + }, + { + "epoch": 0.9525668343868068, + "grad_norm": 1.7867990732192993, + "learning_rate": 1.176885840478048e-06, + "loss": 1.6565, + "step": 26599 + }, + { + "epoch": 0.952602646516375, + "grad_norm": 1.488075613975525, + "learning_rate": 1.1751122273734316e-06, + "loss": 1.131, + "step": 26600 + }, + { + "epoch": 0.9526384586459434, + "grad_norm": 2.2243800163269043, + "learning_rate": 1.1733399438302206e-06, + "loss": 1.2857, + "step": 26601 + }, + { + "epoch": 0.9526742707755117, + "grad_norm": 1.2374006509780884, + "learning_rate": 1.1715689898722404e-06, + "loss": 1.6162, + "step": 26602 + }, + { + "epoch": 0.95271008290508, + "grad_norm": 1.3920713663101196, + "learning_rate": 1.1697993655233164e-06, + "loss": 1.394, + "step": 26603 + }, + { + "epoch": 0.9527458950346482, + "grad_norm": 1.5225958824157715, + "learning_rate": 1.1680310708072518e-06, + "loss": 1.46, + "step": 26604 + }, + { + "epoch": 0.9527817071642165, + "grad_norm": 1.5914231538772583, + "learning_rate": 1.1662641057478497e-06, + "loss": 1.4585, + "step": 26605 + }, + { + "epoch": 0.9528175192937848, + "grad_norm": 1.5339897871017456, + "learning_rate": 1.1644984703688799e-06, + "loss": 1.4752, + "step": 26606 + }, + { + "epoch": 0.952853331423353, + "grad_norm": 1.8416153192520142, + "learning_rate": 1.1627341646941015e-06, + "loss": 1.706, + "step": 26607 + }, + { + "epoch": 0.9528891435529214, + "grad_norm": 1.4199974536895752, + "learning_rate": 1.1609711887472286e-06, + "loss": 1.3469, + "step": 26608 + }, + { + "epoch": 0.9529249556824897, + "grad_norm": 1.518681287765503, + "learning_rate": 1.1592095425520088e-06, + "loss": 1.392, + "step": 26609 + }, + { + "epoch": 0.952960767812058, + "grad_norm": 1.9634442329406738, + "learning_rate": 1.1574492261321236e-06, + "loss": 1.1609, + "step": 26610 + }, + { + "epoch": 0.9529965799416262, + "grad_norm": 1.7711156606674194, + "learning_rate": 1.1556902395112645e-06, + "loss": 1.3154, + "step": 26611 + }, + { + "epoch": 0.9530323920711945, + "grad_norm": 1.8715670108795166, + "learning_rate": 1.1539325827130799e-06, + "loss": 1.1835, + "step": 26612 + }, + { + "epoch": 0.9530682042007628, + "grad_norm": 1.6497482061386108, + "learning_rate": 1.1521762557612502e-06, + "loss": 1.1409, + "step": 26613 + }, + { + "epoch": 0.953104016330331, + "grad_norm": 1.8769314289093018, + "learning_rate": 1.1504212586793683e-06, + "loss": 1.4843, + "step": 26614 + }, + { + "epoch": 0.9531398284598994, + "grad_norm": 1.9313020706176758, + "learning_rate": 1.1486675914910705e-06, + "loss": 1.6296, + "step": 26615 + }, + { + "epoch": 0.9531756405894677, + "grad_norm": 1.3665918111801147, + "learning_rate": 1.1469152542199379e-06, + "loss": 1.3802, + "step": 26616 + }, + { + "epoch": 0.953211452719036, + "grad_norm": 1.4282692670822144, + "learning_rate": 1.1451642468895518e-06, + "loss": 1.2054, + "step": 26617 + }, + { + "epoch": 0.9532472648486042, + "grad_norm": 1.5573898553848267, + "learning_rate": 1.14341456952346e-06, + "loss": 1.4843, + "step": 26618 + }, + { + "epoch": 0.9532830769781725, + "grad_norm": 1.994338870048523, + "learning_rate": 1.1416662221452211e-06, + "loss": 1.3083, + "step": 26619 + }, + { + "epoch": 0.9533188891077408, + "grad_norm": 1.591489315032959, + "learning_rate": 1.139919204778339e-06, + "loss": 1.3245, + "step": 26620 + }, + { + "epoch": 0.953354701237309, + "grad_norm": 1.2546374797821045, + "learning_rate": 1.1381735174463283e-06, + "loss": 0.9723, + "step": 26621 + }, + { + "epoch": 0.9533905133668774, + "grad_norm": 1.652453064918518, + "learning_rate": 1.1364291601726585e-06, + "loss": 1.4873, + "step": 26622 + }, + { + "epoch": 0.9534263254964457, + "grad_norm": 1.4358993768692017, + "learning_rate": 1.1346861329808112e-06, + "loss": 1.5944, + "step": 26623 + }, + { + "epoch": 0.953462137626014, + "grad_norm": 1.5629358291625977, + "learning_rate": 1.1329444358942454e-06, + "loss": 1.2247, + "step": 26624 + }, + { + "epoch": 0.9534979497555822, + "grad_norm": 1.5762356519699097, + "learning_rate": 1.1312040689363757e-06, + "loss": 1.4513, + "step": 26625 + }, + { + "epoch": 0.9535337618851505, + "grad_norm": 1.2871313095092773, + "learning_rate": 1.1294650321306277e-06, + "loss": 1.2398, + "step": 26626 + }, + { + "epoch": 0.9535695740147188, + "grad_norm": 1.5916223526000977, + "learning_rate": 1.127727325500394e-06, + "loss": 1.5777, + "step": 26627 + }, + { + "epoch": 0.953605386144287, + "grad_norm": 1.6760125160217285, + "learning_rate": 1.1259909490690556e-06, + "loss": 1.4228, + "step": 26628 + }, + { + "epoch": 0.9536411982738554, + "grad_norm": 1.6149671077728271, + "learning_rate": 1.1242559028599609e-06, + "loss": 1.4672, + "step": 26629 + }, + { + "epoch": 0.9536770104034237, + "grad_norm": 1.6717442274093628, + "learning_rate": 1.1225221868964686e-06, + "loss": 1.2505, + "step": 26630 + }, + { + "epoch": 0.9537128225329919, + "grad_norm": 1.4332804679870605, + "learning_rate": 1.1207898012018936e-06, + "loss": 1.3423, + "step": 26631 + }, + { + "epoch": 0.9537486346625602, + "grad_norm": 1.3053473234176636, + "learning_rate": 1.1190587457995506e-06, + "loss": 1.7562, + "step": 26632 + }, + { + "epoch": 0.9537844467921285, + "grad_norm": 1.356052279472351, + "learning_rate": 1.1173290207127207e-06, + "loss": 1.47, + "step": 26633 + }, + { + "epoch": 0.9538202589216968, + "grad_norm": 1.9330263137817383, + "learning_rate": 1.1156006259646856e-06, + "loss": 1.6352, + "step": 26634 + }, + { + "epoch": 0.953856071051265, + "grad_norm": 2.3237826824188232, + "learning_rate": 1.1138735615786933e-06, + "loss": 1.331, + "step": 26635 + }, + { + "epoch": 0.9538918831808334, + "grad_norm": 1.5061534643173218, + "learning_rate": 1.1121478275779696e-06, + "loss": 1.2172, + "step": 26636 + }, + { + "epoch": 0.9539276953104017, + "grad_norm": 1.64857816696167, + "learning_rate": 1.1104234239857402e-06, + "loss": 1.6107, + "step": 26637 + }, + { + "epoch": 0.9539635074399699, + "grad_norm": 1.9496674537658691, + "learning_rate": 1.1087003508252202e-06, + "loss": 1.3248, + "step": 26638 + }, + { + "epoch": 0.9539993195695382, + "grad_norm": 1.5698601007461548, + "learning_rate": 1.1069786081195687e-06, + "loss": 1.4504, + "step": 26639 + }, + { + "epoch": 0.9540351316991065, + "grad_norm": 1.6455700397491455, + "learning_rate": 1.105258195891945e-06, + "loss": 1.368, + "step": 26640 + }, + { + "epoch": 0.9540709438286747, + "grad_norm": 1.2808388471603394, + "learning_rate": 1.1035391141655195e-06, + "loss": 1.5479, + "step": 26641 + }, + { + "epoch": 0.954106755958243, + "grad_norm": 2.0161030292510986, + "learning_rate": 1.1018213629634178e-06, + "loss": 1.6358, + "step": 26642 + }, + { + "epoch": 0.9541425680878114, + "grad_norm": 1.532594919204712, + "learning_rate": 1.1001049423087217e-06, + "loss": 1.046, + "step": 26643 + }, + { + "epoch": 0.9541783802173797, + "grad_norm": 1.7464717626571655, + "learning_rate": 1.098389852224546e-06, + "loss": 1.1881, + "step": 26644 + }, + { + "epoch": 0.9542141923469479, + "grad_norm": 1.4636871814727783, + "learning_rate": 1.0966760927339726e-06, + "loss": 1.4925, + "step": 26645 + }, + { + "epoch": 0.9542500044765162, + "grad_norm": 1.6371227502822876, + "learning_rate": 1.094963663860027e-06, + "loss": 1.5365, + "step": 26646 + }, + { + "epoch": 0.9542858166060845, + "grad_norm": 1.4008408784866333, + "learning_rate": 1.0932525656257796e-06, + "loss": 1.4636, + "step": 26647 + }, + { + "epoch": 0.9543216287356527, + "grad_norm": 1.474956750869751, + "learning_rate": 1.0915427980542348e-06, + "loss": 1.5885, + "step": 26648 + }, + { + "epoch": 0.954357440865221, + "grad_norm": 1.310798168182373, + "learning_rate": 1.089834361168407e-06, + "loss": 1.1127, + "step": 26649 + }, + { + "epoch": 0.9543932529947894, + "grad_norm": 1.807181715965271, + "learning_rate": 1.088127254991267e-06, + "loss": 1.4589, + "step": 26650 + }, + { + "epoch": 0.9544290651243577, + "grad_norm": 1.9590734243392944, + "learning_rate": 1.086421479545785e-06, + "loss": 1.5383, + "step": 26651 + }, + { + "epoch": 0.9544648772539259, + "grad_norm": 1.3808904886245728, + "learning_rate": 1.0847170348549096e-06, + "loss": 1.3616, + "step": 26652 + }, + { + "epoch": 0.9545006893834942, + "grad_norm": 2.0818872451782227, + "learning_rate": 1.0830139209415779e-06, + "loss": 1.6911, + "step": 26653 + }, + { + "epoch": 0.9545365015130625, + "grad_norm": 1.6958940029144287, + "learning_rate": 1.081312137828716e-06, + "loss": 1.8023, + "step": 26654 + }, + { + "epoch": 0.9545723136426307, + "grad_norm": 1.6296740770339966, + "learning_rate": 1.0796116855391724e-06, + "loss": 1.0263, + "step": 26655 + }, + { + "epoch": 0.954608125772199, + "grad_norm": 1.9934402704238892, + "learning_rate": 1.0779125640958843e-06, + "loss": 1.5194, + "step": 26656 + }, + { + "epoch": 0.9546439379017674, + "grad_norm": 1.6104828119277954, + "learning_rate": 1.0762147735216665e-06, + "loss": 1.5336, + "step": 26657 + }, + { + "epoch": 0.9546797500313356, + "grad_norm": 1.7793383598327637, + "learning_rate": 1.0745183138393788e-06, + "loss": 1.5556, + "step": 26658 + }, + { + "epoch": 0.9547155621609039, + "grad_norm": 1.319137454032898, + "learning_rate": 1.0728231850718363e-06, + "loss": 1.6508, + "step": 26659 + }, + { + "epoch": 0.9547513742904722, + "grad_norm": 1.5470972061157227, + "learning_rate": 1.071129387241865e-06, + "loss": 1.492, + "step": 26660 + }, + { + "epoch": 0.9547871864200405, + "grad_norm": 1.630874752998352, + "learning_rate": 1.0694369203722354e-06, + "loss": 1.7498, + "step": 26661 + }, + { + "epoch": 0.9548229985496087, + "grad_norm": 2.3035964965820312, + "learning_rate": 1.0677457844857186e-06, + "loss": 1.5807, + "step": 26662 + }, + { + "epoch": 0.954858810679177, + "grad_norm": 1.6051185131072998, + "learning_rate": 1.0660559796050739e-06, + "loss": 1.4429, + "step": 26663 + }, + { + "epoch": 0.9548946228087454, + "grad_norm": 2.025474786758423, + "learning_rate": 1.0643675057530166e-06, + "loss": 1.2662, + "step": 26664 + }, + { + "epoch": 0.9549304349383136, + "grad_norm": 1.6414517164230347, + "learning_rate": 1.0626803629522951e-06, + "loss": 0.994, + "step": 26665 + }, + { + "epoch": 0.9549662470678819, + "grad_norm": 1.5560110807418823, + "learning_rate": 1.0609945512255692e-06, + "loss": 1.2765, + "step": 26666 + }, + { + "epoch": 0.9550020591974502, + "grad_norm": 1.2948863506317139, + "learning_rate": 1.0593100705955538e-06, + "loss": 1.4118, + "step": 26667 + }, + { + "epoch": 0.9550378713270185, + "grad_norm": 1.3893499374389648, + "learning_rate": 1.0576269210848867e-06, + "loss": 1.5091, + "step": 26668 + }, + { + "epoch": 0.9550736834565867, + "grad_norm": 1.5306798219680786, + "learning_rate": 1.055945102716227e-06, + "loss": 1.2156, + "step": 26669 + }, + { + "epoch": 0.955109495586155, + "grad_norm": 1.7268040180206299, + "learning_rate": 1.0542646155122015e-06, + "loss": 1.7405, + "step": 26670 + }, + { + "epoch": 0.9551453077157234, + "grad_norm": 2.0746474266052246, + "learning_rate": 1.0525854594954143e-06, + "loss": 1.3419, + "step": 26671 + }, + { + "epoch": 0.9551811198452916, + "grad_norm": 1.7448663711547852, + "learning_rate": 1.0509076346884583e-06, + "loss": 1.827, + "step": 26672 + }, + { + "epoch": 0.9552169319748599, + "grad_norm": 1.811957597732544, + "learning_rate": 1.0492311411138934e-06, + "loss": 1.6032, + "step": 26673 + }, + { + "epoch": 0.9552527441044282, + "grad_norm": 2.3352155685424805, + "learning_rate": 1.0475559787943012e-06, + "loss": 1.2959, + "step": 26674 + }, + { + "epoch": 0.9552885562339964, + "grad_norm": 1.8025147914886475, + "learning_rate": 1.0458821477521974e-06, + "loss": 1.5484, + "step": 26675 + }, + { + "epoch": 0.9553243683635647, + "grad_norm": 1.6638069152832031, + "learning_rate": 1.0442096480101082e-06, + "loss": 1.1597, + "step": 26676 + }, + { + "epoch": 0.955360180493133, + "grad_norm": 1.7823617458343506, + "learning_rate": 1.042538479590527e-06, + "loss": 1.7342, + "step": 26677 + }, + { + "epoch": 0.9553959926227014, + "grad_norm": 1.538569450378418, + "learning_rate": 1.0408686425159574e-06, + "loss": 1.3907, + "step": 26678 + }, + { + "epoch": 0.9554318047522696, + "grad_norm": 1.5950886011123657, + "learning_rate": 1.0392001368088377e-06, + "loss": 1.4772, + "step": 26679 + }, + { + "epoch": 0.9554676168818379, + "grad_norm": 2.025176525115967, + "learning_rate": 1.0375329624916386e-06, + "loss": 1.4865, + "step": 26680 + }, + { + "epoch": 0.9555034290114062, + "grad_norm": 1.6462210416793823, + "learning_rate": 1.0358671195867865e-06, + "loss": 1.2826, + "step": 26681 + }, + { + "epoch": 0.9555392411409744, + "grad_norm": 1.741531491279602, + "learning_rate": 1.0342026081166745e-06, + "loss": 1.3633, + "step": 26682 + }, + { + "epoch": 0.9555750532705427, + "grad_norm": 1.3028637170791626, + "learning_rate": 1.0325394281037293e-06, + "loss": 1.3505, + "step": 26683 + }, + { + "epoch": 0.955610865400111, + "grad_norm": 1.8471804857254028, + "learning_rate": 1.0308775795702775e-06, + "loss": 1.2296, + "step": 26684 + }, + { + "epoch": 0.9556466775296794, + "grad_norm": 1.38747239112854, + "learning_rate": 1.0292170625387342e-06, + "loss": 1.4824, + "step": 26685 + }, + { + "epoch": 0.9556824896592476, + "grad_norm": 1.3638286590576172, + "learning_rate": 1.0275578770313933e-06, + "loss": 1.3719, + "step": 26686 + }, + { + "epoch": 0.9557183017888159, + "grad_norm": 1.4321017265319824, + "learning_rate": 1.025900023070614e-06, + "loss": 1.4655, + "step": 26687 + }, + { + "epoch": 0.9557541139183842, + "grad_norm": 1.3580139875411987, + "learning_rate": 1.0242435006786677e-06, + "loss": 1.5321, + "step": 26688 + }, + { + "epoch": 0.9557899260479524, + "grad_norm": 1.8781472444534302, + "learning_rate": 1.0225883098778588e-06, + "loss": 1.3467, + "step": 26689 + }, + { + "epoch": 0.9558257381775207, + "grad_norm": 1.5805599689483643, + "learning_rate": 1.0209344506904694e-06, + "loss": 1.4534, + "step": 26690 + }, + { + "epoch": 0.955861550307089, + "grad_norm": 1.7332732677459717, + "learning_rate": 1.019281923138715e-06, + "loss": 1.4556, + "step": 26691 + }, + { + "epoch": 0.9558973624366573, + "grad_norm": 1.5968459844589233, + "learning_rate": 1.0176307272448448e-06, + "loss": 1.3131, + "step": 26692 + }, + { + "epoch": 0.9559331745662256, + "grad_norm": 1.7155627012252808, + "learning_rate": 1.015980863031074e-06, + "loss": 1.657, + "step": 26693 + }, + { + "epoch": 0.9559689866957939, + "grad_norm": 1.7028433084487915, + "learning_rate": 1.0143323305196184e-06, + "loss": 1.4293, + "step": 26694 + }, + { + "epoch": 0.9560047988253622, + "grad_norm": 1.3736257553100586, + "learning_rate": 1.0126851297326157e-06, + "loss": 1.1688, + "step": 26695 + }, + { + "epoch": 0.9560406109549304, + "grad_norm": 1.3911411762237549, + "learning_rate": 1.0110392606922703e-06, + "loss": 1.3592, + "step": 26696 + }, + { + "epoch": 0.9560764230844987, + "grad_norm": 1.8647515773773193, + "learning_rate": 1.0093947234206868e-06, + "loss": 1.5904, + "step": 26697 + }, + { + "epoch": 0.956112235214067, + "grad_norm": 2.0917210578918457, + "learning_rate": 1.0077515179400254e-06, + "loss": 1.5137, + "step": 26698 + }, + { + "epoch": 0.9561480473436353, + "grad_norm": 1.5692753791809082, + "learning_rate": 1.0061096442723683e-06, + "loss": 1.1976, + "step": 26699 + }, + { + "epoch": 0.9561838594732036, + "grad_norm": 1.5351765155792236, + "learning_rate": 1.004469102439809e-06, + "loss": 1.2061, + "step": 26700 + }, + { + "epoch": 0.9562196716027719, + "grad_norm": 1.3395408391952515, + "learning_rate": 1.0028298924644408e-06, + "loss": 1.1257, + "step": 26701 + }, + { + "epoch": 0.9562554837323402, + "grad_norm": 1.8311903476715088, + "learning_rate": 1.0011920143682796e-06, + "loss": 1.3361, + "step": 26702 + }, + { + "epoch": 0.9562912958619084, + "grad_norm": 1.7517163753509521, + "learning_rate": 9.995554681733855e-07, + "loss": 1.0912, + "step": 26703 + }, + { + "epoch": 0.9563271079914767, + "grad_norm": 1.5621646642684937, + "learning_rate": 9.97920253901774e-07, + "loss": 1.7053, + "step": 26704 + }, + { + "epoch": 0.956362920121045, + "grad_norm": 1.5236084461212158, + "learning_rate": 9.96286371575439e-07, + "loss": 1.6629, + "step": 26705 + }, + { + "epoch": 0.9563987322506133, + "grad_norm": 2.2787911891937256, + "learning_rate": 9.946538212163736e-07, + "loss": 1.6588, + "step": 26706 + }, + { + "epoch": 0.9564345443801816, + "grad_norm": 1.7489440441131592, + "learning_rate": 9.930226028465272e-07, + "loss": 1.5958, + "step": 26707 + }, + { + "epoch": 0.9564703565097499, + "grad_norm": 2.500220775604248, + "learning_rate": 9.913927164878488e-07, + "loss": 1.5689, + "step": 26708 + }, + { + "epoch": 0.9565061686393181, + "grad_norm": 1.724913239479065, + "learning_rate": 9.897641621622765e-07, + "loss": 1.1326, + "step": 26709 + }, + { + "epoch": 0.9565419807688864, + "grad_norm": 2.303943634033203, + "learning_rate": 9.88136939891704e-07, + "loss": 1.4504, + "step": 26710 + }, + { + "epoch": 0.9565777928984547, + "grad_norm": 2.162876844406128, + "learning_rate": 9.865110496980356e-07, + "loss": 1.5306, + "step": 26711 + }, + { + "epoch": 0.956613605028023, + "grad_norm": 1.4706625938415527, + "learning_rate": 9.84886491603154e-07, + "loss": 1.3547, + "step": 26712 + }, + { + "epoch": 0.9566494171575913, + "grad_norm": 1.7020697593688965, + "learning_rate": 9.832632656288864e-07, + "loss": 1.5613, + "step": 26713 + }, + { + "epoch": 0.9566852292871596, + "grad_norm": 1.5223006010055542, + "learning_rate": 9.81641371797104e-07, + "loss": 1.5805, + "step": 26714 + }, + { + "epoch": 0.9567210414167279, + "grad_norm": 1.1612199544906616, + "learning_rate": 9.800208101296115e-07, + "loss": 1.5998, + "step": 26715 + }, + { + "epoch": 0.9567568535462961, + "grad_norm": 1.6327825784683228, + "learning_rate": 9.784015806482028e-07, + "loss": 1.1863, + "step": 26716 + }, + { + "epoch": 0.9567926656758644, + "grad_norm": 1.9279059171676636, + "learning_rate": 9.767836833746714e-07, + "loss": 1.4357, + "step": 26717 + }, + { + "epoch": 0.9568284778054327, + "grad_norm": 1.5725735425949097, + "learning_rate": 9.751671183307888e-07, + "loss": 1.4529, + "step": 26718 + }, + { + "epoch": 0.956864289935001, + "grad_norm": 1.6223695278167725, + "learning_rate": 9.735518855383152e-07, + "loss": 1.4732, + "step": 26719 + }, + { + "epoch": 0.9569001020645693, + "grad_norm": 2.1366162300109863, + "learning_rate": 9.719379850189447e-07, + "loss": 1.4565, + "step": 26720 + }, + { + "epoch": 0.9569359141941376, + "grad_norm": 2.1586503982543945, + "learning_rate": 9.703254167944154e-07, + "loss": 1.2536, + "step": 26721 + }, + { + "epoch": 0.9569717263237059, + "grad_norm": 2.3456790447235107, + "learning_rate": 9.68714180886421e-07, + "loss": 1.3688, + "step": 26722 + }, + { + "epoch": 0.9570075384532741, + "grad_norm": 1.5606224536895752, + "learning_rate": 9.67104277316644e-07, + "loss": 1.6539, + "step": 26723 + }, + { + "epoch": 0.9570433505828424, + "grad_norm": 1.9346030950546265, + "learning_rate": 9.654957061067228e-07, + "loss": 1.4414, + "step": 26724 + }, + { + "epoch": 0.9570791627124107, + "grad_norm": 1.4974099397659302, + "learning_rate": 9.638884672783176e-07, + "loss": 1.3323, + "step": 26725 + }, + { + "epoch": 0.9571149748419789, + "grad_norm": 1.7413378953933716, + "learning_rate": 9.622825608530561e-07, + "loss": 1.1622, + "step": 26726 + }, + { + "epoch": 0.9571507869715473, + "grad_norm": 1.3514119386672974, + "learning_rate": 9.606779868525206e-07, + "loss": 1.0968, + "step": 26727 + }, + { + "epoch": 0.9571865991011156, + "grad_norm": 1.4090226888656616, + "learning_rate": 9.590747452983161e-07, + "loss": 1.6259, + "step": 26728 + }, + { + "epoch": 0.9572224112306839, + "grad_norm": 1.9629863500595093, + "learning_rate": 9.574728362120033e-07, + "loss": 1.0267, + "step": 26729 + }, + { + "epoch": 0.9572582233602521, + "grad_norm": 1.4839998483657837, + "learning_rate": 9.558722596151425e-07, + "loss": 1.3511, + "step": 26730 + }, + { + "epoch": 0.9572940354898204, + "grad_norm": 1.6294034719467163, + "learning_rate": 9.5427301552925e-07, + "loss": 1.4625, + "step": 26731 + }, + { + "epoch": 0.9573298476193887, + "grad_norm": 1.5633814334869385, + "learning_rate": 9.526751039758641e-07, + "loss": 1.6409, + "step": 26732 + }, + { + "epoch": 0.9573656597489569, + "grad_norm": 1.6685106754302979, + "learning_rate": 9.510785249764786e-07, + "loss": 1.4664, + "step": 26733 + }, + { + "epoch": 0.9574014718785253, + "grad_norm": 1.2557251453399658, + "learning_rate": 9.494832785525653e-07, + "loss": 1.4488, + "step": 26734 + }, + { + "epoch": 0.9574372840080936, + "grad_norm": 1.9086263179779053, + "learning_rate": 9.478893647255849e-07, + "loss": 1.2848, + "step": 26735 + }, + { + "epoch": 0.9574730961376618, + "grad_norm": 1.5768110752105713, + "learning_rate": 9.462967835169756e-07, + "loss": 1.4434, + "step": 26736 + }, + { + "epoch": 0.9575089082672301, + "grad_norm": 1.5403549671173096, + "learning_rate": 9.44705534948187e-07, + "loss": 1.4612, + "step": 26737 + }, + { + "epoch": 0.9575447203967984, + "grad_norm": 1.6581268310546875, + "learning_rate": 9.431156190406131e-07, + "loss": 1.2959, + "step": 26738 + }, + { + "epoch": 0.9575805325263667, + "grad_norm": 1.4390212297439575, + "learning_rate": 9.41527035815637e-07, + "loss": 1.2648, + "step": 26739 + }, + { + "epoch": 0.9576163446559349, + "grad_norm": 1.7185239791870117, + "learning_rate": 9.399397852946413e-07, + "loss": 1.5569, + "step": 26740 + }, + { + "epoch": 0.9576521567855032, + "grad_norm": 2.195068359375, + "learning_rate": 9.383538674989756e-07, + "loss": 1.7491, + "step": 26741 + }, + { + "epoch": 0.9576879689150716, + "grad_norm": 1.4795840978622437, + "learning_rate": 9.367692824499786e-07, + "loss": 1.5406, + "step": 26742 + }, + { + "epoch": 0.9577237810446398, + "grad_norm": 1.736876130104065, + "learning_rate": 9.351860301689775e-07, + "loss": 1.2074, + "step": 26743 + }, + { + "epoch": 0.9577595931742081, + "grad_norm": 1.4114105701446533, + "learning_rate": 9.336041106772553e-07, + "loss": 1.3731, + "step": 26744 + }, + { + "epoch": 0.9577954053037764, + "grad_norm": 1.434982180595398, + "learning_rate": 9.320235239961061e-07, + "loss": 1.257, + "step": 26745 + }, + { + "epoch": 0.9578312174333447, + "grad_norm": 2.827622413635254, + "learning_rate": 9.304442701467908e-07, + "loss": 1.6111, + "step": 26746 + }, + { + "epoch": 0.9578670295629129, + "grad_norm": 2.319575786590576, + "learning_rate": 9.288663491505478e-07, + "loss": 1.4424, + "step": 26747 + }, + { + "epoch": 0.9579028416924812, + "grad_norm": 1.5229041576385498, + "learning_rate": 9.27289761028638e-07, + "loss": 1.2918, + "step": 26748 + }, + { + "epoch": 0.9579386538220496, + "grad_norm": 1.6973974704742432, + "learning_rate": 9.257145058022331e-07, + "loss": 1.3539, + "step": 26749 + }, + { + "epoch": 0.9579744659516178, + "grad_norm": 1.4195101261138916, + "learning_rate": 9.241405834925388e-07, + "loss": 1.3205, + "step": 26750 + }, + { + "epoch": 0.9580102780811861, + "grad_norm": 1.6695009469985962, + "learning_rate": 9.225679941207488e-07, + "loss": 1.4648, + "step": 26751 + }, + { + "epoch": 0.9580460902107544, + "grad_norm": 1.8575265407562256, + "learning_rate": 9.20996737708002e-07, + "loss": 1.4749, + "step": 26752 + }, + { + "epoch": 0.9580819023403226, + "grad_norm": 1.3980778455734253, + "learning_rate": 9.19426814275437e-07, + "loss": 1.5525, + "step": 26753 + }, + { + "epoch": 0.9581177144698909, + "grad_norm": 1.5089133977890015, + "learning_rate": 9.178582238441702e-07, + "loss": 1.1665, + "step": 26754 + }, + { + "epoch": 0.9581535265994592, + "grad_norm": 1.390755534172058, + "learning_rate": 9.162909664353292e-07, + "loss": 1.6808, + "step": 26755 + }, + { + "epoch": 0.9581893387290276, + "grad_norm": 1.4605809450149536, + "learning_rate": 9.14725042069986e-07, + "loss": 1.5083, + "step": 26756 + }, + { + "epoch": 0.9582251508585958, + "grad_norm": 1.6914252042770386, + "learning_rate": 9.131604507691904e-07, + "loss": 1.635, + "step": 26757 + }, + { + "epoch": 0.9582609629881641, + "grad_norm": 1.5005120038986206, + "learning_rate": 9.115971925540257e-07, + "loss": 1.3975, + "step": 26758 + }, + { + "epoch": 0.9582967751177324, + "grad_norm": 1.4641746282577515, + "learning_rate": 9.100352674454971e-07, + "loss": 1.4, + "step": 26759 + }, + { + "epoch": 0.9583325872473006, + "grad_norm": 1.9565989971160889, + "learning_rate": 9.084746754646323e-07, + "loss": 1.4052, + "step": 26760 + }, + { + "epoch": 0.9583683993768689, + "grad_norm": 1.6255415678024292, + "learning_rate": 9.069154166324146e-07, + "loss": 1.3676, + "step": 26761 + }, + { + "epoch": 0.9584042115064372, + "grad_norm": 2.551762104034424, + "learning_rate": 9.053574909698381e-07, + "loss": 1.4617, + "step": 26762 + }, + { + "epoch": 0.9584400236360056, + "grad_norm": 1.3324257135391235, + "learning_rate": 9.038008984978419e-07, + "loss": 1.3645, + "step": 26763 + }, + { + "epoch": 0.9584758357655738, + "grad_norm": 3.2408790588378906, + "learning_rate": 9.022456392373868e-07, + "loss": 1.3834, + "step": 26764 + }, + { + "epoch": 0.9585116478951421, + "grad_norm": 1.4632230997085571, + "learning_rate": 9.006917132093895e-07, + "loss": 1.0799, + "step": 26765 + }, + { + "epoch": 0.9585474600247104, + "grad_norm": 1.6773384809494019, + "learning_rate": 8.991391204347555e-07, + "loss": 1.4687, + "step": 26766 + }, + { + "epoch": 0.9585832721542786, + "grad_norm": 1.8148605823516846, + "learning_rate": 8.97587860934368e-07, + "loss": 1.2124, + "step": 26767 + }, + { + "epoch": 0.9586190842838469, + "grad_norm": 1.7556577920913696, + "learning_rate": 8.960379347291103e-07, + "loss": 1.6248, + "step": 26768 + }, + { + "epoch": 0.9586548964134152, + "grad_norm": 2.0786924362182617, + "learning_rate": 8.944893418398326e-07, + "loss": 1.5726, + "step": 26769 + }, + { + "epoch": 0.9586907085429835, + "grad_norm": 1.6416964530944824, + "learning_rate": 8.929420822873513e-07, + "loss": 1.4571, + "step": 26770 + }, + { + "epoch": 0.9587265206725518, + "grad_norm": 1.4926072359085083, + "learning_rate": 8.913961560925055e-07, + "loss": 1.3523, + "step": 26771 + }, + { + "epoch": 0.9587623328021201, + "grad_norm": 1.511873483657837, + "learning_rate": 8.898515632760784e-07, + "loss": 1.0218, + "step": 26772 + }, + { + "epoch": 0.9587981449316884, + "grad_norm": 1.5736192464828491, + "learning_rate": 8.883083038588536e-07, + "loss": 1.7893, + "step": 26773 + }, + { + "epoch": 0.9588339570612566, + "grad_norm": 1.9865953922271729, + "learning_rate": 8.867663778616031e-07, + "loss": 1.5569, + "step": 26774 + }, + { + "epoch": 0.9588697691908249, + "grad_norm": 1.5352931022644043, + "learning_rate": 8.852257853050661e-07, + "loss": 1.7685, + "step": 26775 + }, + { + "epoch": 0.9589055813203932, + "grad_norm": 2.0710299015045166, + "learning_rate": 8.836865262099481e-07, + "loss": 1.5833, + "step": 26776 + }, + { + "epoch": 0.9589413934499615, + "grad_norm": 1.8323180675506592, + "learning_rate": 8.821486005969992e-07, + "loss": 1.1984, + "step": 26777 + }, + { + "epoch": 0.9589772055795298, + "grad_norm": 1.1376245021820068, + "learning_rate": 8.806120084868807e-07, + "loss": 1.3374, + "step": 26778 + }, + { + "epoch": 0.9590130177090981, + "grad_norm": 1.5147979259490967, + "learning_rate": 8.79076749900265e-07, + "loss": 1.4927, + "step": 26779 + }, + { + "epoch": 0.9590488298386664, + "grad_norm": 1.8162474632263184, + "learning_rate": 8.775428248578243e-07, + "loss": 1.8344, + "step": 26780 + }, + { + "epoch": 0.9590846419682346, + "grad_norm": 1.5417896509170532, + "learning_rate": 8.760102333801756e-07, + "loss": 1.3234, + "step": 26781 + }, + { + "epoch": 0.9591204540978029, + "grad_norm": 1.8801335096359253, + "learning_rate": 8.744789754879579e-07, + "loss": 1.6405, + "step": 26782 + }, + { + "epoch": 0.9591562662273712, + "grad_norm": 1.4927922487258911, + "learning_rate": 8.729490512017547e-07, + "loss": 1.3414, + "step": 26783 + }, + { + "epoch": 0.9591920783569395, + "grad_norm": 1.7957075834274292, + "learning_rate": 8.714204605421716e-07, + "loss": 1.4618, + "step": 26784 + }, + { + "epoch": 0.9592278904865078, + "grad_norm": 1.5633784532546997, + "learning_rate": 8.69893203529748e-07, + "loss": 1.208, + "step": 26785 + }, + { + "epoch": 0.9592637026160761, + "grad_norm": 2.0784788131713867, + "learning_rate": 8.683672801850451e-07, + "loss": 1.613, + "step": 26786 + }, + { + "epoch": 0.9592995147456443, + "grad_norm": 1.31596040725708, + "learning_rate": 8.668426905285909e-07, + "loss": 0.8767, + "step": 26787 + }, + { + "epoch": 0.9593353268752126, + "grad_norm": 1.5244179964065552, + "learning_rate": 8.653194345808913e-07, + "loss": 1.4995, + "step": 26788 + }, + { + "epoch": 0.9593711390047809, + "grad_norm": 1.6521975994110107, + "learning_rate": 8.63797512362452e-07, + "loss": 1.2428, + "step": 26789 + }, + { + "epoch": 0.9594069511343492, + "grad_norm": 1.871695637702942, + "learning_rate": 8.622769238937345e-07, + "loss": 1.2734, + "step": 26790 + }, + { + "epoch": 0.9594427632639175, + "grad_norm": 1.8507094383239746, + "learning_rate": 8.607576691952002e-07, + "loss": 1.5353, + "step": 26791 + }, + { + "epoch": 0.9594785753934858, + "grad_norm": 1.6126267910003662, + "learning_rate": 8.592397482872993e-07, + "loss": 1.3408, + "step": 26792 + }, + { + "epoch": 0.9595143875230541, + "grad_norm": 1.529656171798706, + "learning_rate": 8.577231611904379e-07, + "loss": 1.6628, + "step": 26793 + }, + { + "epoch": 0.9595501996526223, + "grad_norm": 1.551055908203125, + "learning_rate": 8.562079079250219e-07, + "loss": 1.3079, + "step": 26794 + }, + { + "epoch": 0.9595860117821906, + "grad_norm": 1.8348814249038696, + "learning_rate": 8.546939885114569e-07, + "loss": 1.394, + "step": 26795 + }, + { + "epoch": 0.9596218239117589, + "grad_norm": 1.4194611310958862, + "learning_rate": 8.531814029700935e-07, + "loss": 1.3471, + "step": 26796 + }, + { + "epoch": 0.9596576360413271, + "grad_norm": 1.5185151100158691, + "learning_rate": 8.516701513212821e-07, + "loss": 1.5099, + "step": 26797 + }, + { + "epoch": 0.9596934481708955, + "grad_norm": 1.589985728263855, + "learning_rate": 8.501602335853509e-07, + "loss": 1.2476, + "step": 26798 + }, + { + "epoch": 0.9597292603004638, + "grad_norm": 2.1401360034942627, + "learning_rate": 8.48651649782628e-07, + "loss": 1.6254, + "step": 26799 + }, + { + "epoch": 0.9597650724300321, + "grad_norm": 1.7402527332305908, + "learning_rate": 8.471443999333972e-07, + "loss": 1.423, + "step": 26800 + }, + { + "epoch": 0.9598008845596003, + "grad_norm": 1.600137710571289, + "learning_rate": 8.456384840579423e-07, + "loss": 1.5019, + "step": 26801 + }, + { + "epoch": 0.9598366966891686, + "grad_norm": 1.6320627927780151, + "learning_rate": 8.441339021765138e-07, + "loss": 1.3925, + "step": 26802 + }, + { + "epoch": 0.9598725088187369, + "grad_norm": 2.637521266937256, + "learning_rate": 8.426306543093732e-07, + "loss": 1.6728, + "step": 26803 + }, + { + "epoch": 0.9599083209483051, + "grad_norm": 1.9787664413452148, + "learning_rate": 8.411287404767265e-07, + "loss": 1.6501, + "step": 26804 + }, + { + "epoch": 0.9599441330778735, + "grad_norm": 1.341884970664978, + "learning_rate": 8.396281606987799e-07, + "loss": 1.4126, + "step": 26805 + }, + { + "epoch": 0.9599799452074418, + "grad_norm": 1.7115846872329712, + "learning_rate": 8.381289149957395e-07, + "loss": 1.5408, + "step": 26806 + }, + { + "epoch": 0.9600157573370101, + "grad_norm": 1.4531255960464478, + "learning_rate": 8.366310033877667e-07, + "loss": 1.589, + "step": 26807 + }, + { + "epoch": 0.9600515694665783, + "grad_norm": 2.3292057514190674, + "learning_rate": 8.351344258950123e-07, + "loss": 1.8701, + "step": 26808 + }, + { + "epoch": 0.9600873815961466, + "grad_norm": 1.7623049020767212, + "learning_rate": 8.336391825376044e-07, + "loss": 1.3654, + "step": 26809 + }, + { + "epoch": 0.9601231937257149, + "grad_norm": 1.5028655529022217, + "learning_rate": 8.321452733356605e-07, + "loss": 1.4138, + "step": 26810 + }, + { + "epoch": 0.9601590058552831, + "grad_norm": 1.9508914947509766, + "learning_rate": 8.306526983092977e-07, + "loss": 1.3763, + "step": 26811 + }, + { + "epoch": 0.9601948179848515, + "grad_norm": 1.7204222679138184, + "learning_rate": 8.291614574785777e-07, + "loss": 1.3373, + "step": 26812 + }, + { + "epoch": 0.9602306301144198, + "grad_norm": 1.9033406972885132, + "learning_rate": 8.276715508635624e-07, + "loss": 1.3409, + "step": 26813 + }, + { + "epoch": 0.960266442243988, + "grad_norm": 1.5977619886398315, + "learning_rate": 8.261829784843133e-07, + "loss": 1.2249, + "step": 26814 + }, + { + "epoch": 0.9603022543735563, + "grad_norm": 1.9996955394744873, + "learning_rate": 8.246957403608479e-07, + "loss": 1.5399, + "step": 26815 + }, + { + "epoch": 0.9603380665031246, + "grad_norm": 1.686150312423706, + "learning_rate": 8.232098365131613e-07, + "loss": 1.2805, + "step": 26816 + }, + { + "epoch": 0.9603738786326929, + "grad_norm": 1.4930355548858643, + "learning_rate": 8.217252669612708e-07, + "loss": 1.4063, + "step": 26817 + }, + { + "epoch": 0.9604096907622611, + "grad_norm": 1.8459813594818115, + "learning_rate": 8.20242031725138e-07, + "loss": 1.5286, + "step": 26818 + }, + { + "epoch": 0.9604455028918295, + "grad_norm": 1.9831894636154175, + "learning_rate": 8.187601308247028e-07, + "loss": 1.4814, + "step": 26819 + }, + { + "epoch": 0.9604813150213978, + "grad_norm": 1.6175930500030518, + "learning_rate": 8.172795642799269e-07, + "loss": 1.24, + "step": 26820 + }, + { + "epoch": 0.960517127150966, + "grad_norm": 1.5015861988067627, + "learning_rate": 8.158003321107167e-07, + "loss": 1.2342, + "step": 26821 + }, + { + "epoch": 0.9605529392805343, + "grad_norm": 1.2254098653793335, + "learning_rate": 8.143224343369671e-07, + "loss": 1.3663, + "step": 26822 + }, + { + "epoch": 0.9605887514101026, + "grad_norm": 1.7844117879867554, + "learning_rate": 8.128458709785736e-07, + "loss": 1.5133, + "step": 26823 + }, + { + "epoch": 0.9606245635396709, + "grad_norm": 1.9229958057403564, + "learning_rate": 8.113706420553868e-07, + "loss": 1.7314, + "step": 26824 + }, + { + "epoch": 0.9606603756692391, + "grad_norm": 1.4775229692459106, + "learning_rate": 8.098967475872798e-07, + "loss": 1.4693, + "step": 26825 + }, + { + "epoch": 0.9606961877988075, + "grad_norm": 1.6402254104614258, + "learning_rate": 8.084241875940591e-07, + "loss": 1.3664, + "step": 26826 + }, + { + "epoch": 0.9607319999283758, + "grad_norm": 1.7152255773544312, + "learning_rate": 8.069529620955418e-07, + "loss": 1.3495, + "step": 26827 + }, + { + "epoch": 0.960767812057944, + "grad_norm": 1.5333672761917114, + "learning_rate": 8.054830711115236e-07, + "loss": 1.533, + "step": 26828 + }, + { + "epoch": 0.9608036241875123, + "grad_norm": 1.5828067064285278, + "learning_rate": 8.040145146617883e-07, + "loss": 1.7531, + "step": 26829 + }, + { + "epoch": 0.9608394363170806, + "grad_norm": 1.8916761875152588, + "learning_rate": 8.025472927660649e-07, + "loss": 1.0839, + "step": 26830 + }, + { + "epoch": 0.9608752484466488, + "grad_norm": 1.5456327199935913, + "learning_rate": 8.010814054441262e-07, + "loss": 1.5393, + "step": 26831 + }, + { + "epoch": 0.9609110605762171, + "grad_norm": 1.50007164478302, + "learning_rate": 7.996168527156789e-07, + "loss": 1.049, + "step": 26832 + }, + { + "epoch": 0.9609468727057855, + "grad_norm": 1.552908182144165, + "learning_rate": 7.981536346004292e-07, + "loss": 1.4244, + "step": 26833 + }, + { + "epoch": 0.9609826848353538, + "grad_norm": 1.4380766153335571, + "learning_rate": 7.966917511180505e-07, + "loss": 1.1892, + "step": 26834 + }, + { + "epoch": 0.961018496964922, + "grad_norm": 2.1194546222686768, + "learning_rate": 7.952312022882269e-07, + "loss": 1.5398, + "step": 26835 + }, + { + "epoch": 0.9610543090944903, + "grad_norm": 1.35401451587677, + "learning_rate": 7.937719881306094e-07, + "loss": 1.566, + "step": 26836 + }, + { + "epoch": 0.9610901212240586, + "grad_norm": 2.0986156463623047, + "learning_rate": 7.923141086648156e-07, + "loss": 1.3092, + "step": 26837 + }, + { + "epoch": 0.9611259333536268, + "grad_norm": 1.8315749168395996, + "learning_rate": 7.908575639104631e-07, + "loss": 1.5848, + "step": 26838 + }, + { + "epoch": 0.9611617454831951, + "grad_norm": 1.6822844743728638, + "learning_rate": 7.894023538871587e-07, + "loss": 1.5722, + "step": 26839 + }, + { + "epoch": 0.9611975576127635, + "grad_norm": 1.2145322561264038, + "learning_rate": 7.879484786144753e-07, + "loss": 1.2355, + "step": 26840 + }, + { + "epoch": 0.9612333697423318, + "grad_norm": 1.5983326435089111, + "learning_rate": 7.864959381119641e-07, + "loss": 1.6364, + "step": 26841 + }, + { + "epoch": 0.9612691818719, + "grad_norm": 1.8771876096725464, + "learning_rate": 7.85044732399165e-07, + "loss": 1.4767, + "step": 26842 + }, + { + "epoch": 0.9613049940014683, + "grad_norm": 2.0207412242889404, + "learning_rate": 7.83594861495629e-07, + "loss": 1.4324, + "step": 26843 + }, + { + "epoch": 0.9613408061310366, + "grad_norm": 1.887495517730713, + "learning_rate": 7.821463254208405e-07, + "loss": 1.4604, + "step": 26844 + }, + { + "epoch": 0.9613766182606048, + "grad_norm": 1.4683293104171753, + "learning_rate": 7.80699124194284e-07, + "loss": 1.5779, + "step": 26845 + }, + { + "epoch": 0.9614124303901731, + "grad_norm": 1.590532660484314, + "learning_rate": 7.792532578354439e-07, + "loss": 1.4709, + "step": 26846 + }, + { + "epoch": 0.9614482425197415, + "grad_norm": 1.5941145420074463, + "learning_rate": 7.778087263637601e-07, + "loss": 1.5715, + "step": 26847 + }, + { + "epoch": 0.9614840546493097, + "grad_norm": 2.0739402770996094, + "learning_rate": 7.763655297986839e-07, + "loss": 1.6764, + "step": 26848 + }, + { + "epoch": 0.961519866778878, + "grad_norm": 1.8480732440948486, + "learning_rate": 7.749236681595995e-07, + "loss": 1.6399, + "step": 26849 + }, + { + "epoch": 0.9615556789084463, + "grad_norm": 1.648714542388916, + "learning_rate": 7.734831414659471e-07, + "loss": 1.5086, + "step": 26850 + }, + { + "epoch": 0.9615914910380146, + "grad_norm": 1.796379804611206, + "learning_rate": 7.720439497370668e-07, + "loss": 1.3986, + "step": 26851 + }, + { + "epoch": 0.9616273031675828, + "grad_norm": 1.7396949529647827, + "learning_rate": 7.706060929923542e-07, + "loss": 1.1852, + "step": 26852 + }, + { + "epoch": 0.9616631152971511, + "grad_norm": 1.374097228050232, + "learning_rate": 7.691695712511382e-07, + "loss": 1.4309, + "step": 26853 + }, + { + "epoch": 0.9616989274267195, + "grad_norm": 1.4233494997024536, + "learning_rate": 7.677343845327478e-07, + "loss": 1.1654, + "step": 26854 + }, + { + "epoch": 0.9617347395562877, + "grad_norm": 2.057396173477173, + "learning_rate": 7.663005328564787e-07, + "loss": 1.6129, + "step": 26855 + }, + { + "epoch": 0.961770551685856, + "grad_norm": 2.265347480773926, + "learning_rate": 7.648680162416489e-07, + "loss": 1.3166, + "step": 26856 + }, + { + "epoch": 0.9618063638154243, + "grad_norm": 1.4549821615219116, + "learning_rate": 7.634368347075093e-07, + "loss": 1.2215, + "step": 26857 + }, + { + "epoch": 0.9618421759449926, + "grad_norm": 1.7157262563705444, + "learning_rate": 7.620069882733227e-07, + "loss": 1.5253, + "step": 26858 + }, + { + "epoch": 0.9618779880745608, + "grad_norm": 2.227339506149292, + "learning_rate": 7.605784769583291e-07, + "loss": 1.5059, + "step": 26859 + }, + { + "epoch": 0.9619138002041291, + "grad_norm": 1.5298230648040771, + "learning_rate": 7.591513007817242e-07, + "loss": 1.3268, + "step": 26860 + }, + { + "epoch": 0.9619496123336975, + "grad_norm": 1.7910478115081787, + "learning_rate": 7.577254597627481e-07, + "loss": 1.3596, + "step": 26861 + }, + { + "epoch": 0.9619854244632657, + "grad_norm": 1.3827205896377563, + "learning_rate": 7.563009539205524e-07, + "loss": 1.4843, + "step": 26862 + }, + { + "epoch": 0.962021236592834, + "grad_norm": 1.5558743476867676, + "learning_rate": 7.548777832743214e-07, + "loss": 1.2898, + "step": 26863 + }, + { + "epoch": 0.9620570487224023, + "grad_norm": 2.3799028396606445, + "learning_rate": 7.534559478431735e-07, + "loss": 1.2981, + "step": 26864 + }, + { + "epoch": 0.9620928608519705, + "grad_norm": 1.3155205249786377, + "learning_rate": 7.52035447646271e-07, + "loss": 1.3632, + "step": 26865 + }, + { + "epoch": 0.9621286729815388, + "grad_norm": 1.3187897205352783, + "learning_rate": 7.506162827027097e-07, + "loss": 1.3126, + "step": 26866 + }, + { + "epoch": 0.9621644851111071, + "grad_norm": 1.7988380193710327, + "learning_rate": 7.491984530315854e-07, + "loss": 1.6965, + "step": 26867 + }, + { + "epoch": 0.9622002972406755, + "grad_norm": 1.1581627130508423, + "learning_rate": 7.477819586519719e-07, + "loss": 1.1106, + "step": 26868 + }, + { + "epoch": 0.9622361093702437, + "grad_norm": 1.507202386856079, + "learning_rate": 7.463667995829205e-07, + "loss": 1.4695, + "step": 26869 + }, + { + "epoch": 0.962271921499812, + "grad_norm": 2.0441157817840576, + "learning_rate": 7.449529758434826e-07, + "loss": 2.2557, + "step": 26870 + }, + { + "epoch": 0.9623077336293803, + "grad_norm": 1.7305978536605835, + "learning_rate": 7.435404874526542e-07, + "loss": 1.238, + "step": 26871 + }, + { + "epoch": 0.9623435457589485, + "grad_norm": 1.8210771083831787, + "learning_rate": 7.421293344294755e-07, + "loss": 1.5418, + "step": 26872 + }, + { + "epoch": 0.9623793578885168, + "grad_norm": 1.6746025085449219, + "learning_rate": 7.407195167929093e-07, + "loss": 1.2805, + "step": 26873 + }, + { + "epoch": 0.9624151700180851, + "grad_norm": 1.6006801128387451, + "learning_rate": 7.393110345619291e-07, + "loss": 1.269, + "step": 26874 + }, + { + "epoch": 0.9624509821476535, + "grad_norm": 2.0808916091918945, + "learning_rate": 7.379038877554755e-07, + "loss": 1.2495, + "step": 26875 + }, + { + "epoch": 0.9624867942772217, + "grad_norm": 1.9244236946105957, + "learning_rate": 7.364980763924889e-07, + "loss": 1.3046, + "step": 26876 + }, + { + "epoch": 0.96252260640679, + "grad_norm": 2.2387166023254395, + "learning_rate": 7.350936004918873e-07, + "loss": 1.4992, + "step": 26877 + }, + { + "epoch": 0.9625584185363583, + "grad_norm": 1.9266955852508545, + "learning_rate": 7.336904600725447e-07, + "loss": 1.306, + "step": 26878 + }, + { + "epoch": 0.9625942306659265, + "grad_norm": 1.6920135021209717, + "learning_rate": 7.322886551533681e-07, + "loss": 1.6659, + "step": 26879 + }, + { + "epoch": 0.9626300427954948, + "grad_norm": 1.9262080192565918, + "learning_rate": 7.308881857531869e-07, + "loss": 1.4505, + "step": 26880 + }, + { + "epoch": 0.9626658549250631, + "grad_norm": 1.438104271888733, + "learning_rate": 7.294890518908748e-07, + "loss": 1.3511, + "step": 26881 + }, + { + "epoch": 0.9627016670546314, + "grad_norm": 1.2074106931686401, + "learning_rate": 7.280912535852169e-07, + "loss": 1.4729, + "step": 26882 + }, + { + "epoch": 0.9627374791841997, + "grad_norm": 1.594930648803711, + "learning_rate": 7.266947908550536e-07, + "loss": 1.2189, + "step": 26883 + }, + { + "epoch": 0.962773291313768, + "grad_norm": 1.5803894996643066, + "learning_rate": 7.252996637191589e-07, + "loss": 1.0278, + "step": 26884 + }, + { + "epoch": 0.9628091034433363, + "grad_norm": 1.9029607772827148, + "learning_rate": 7.239058721962954e-07, + "loss": 1.3136, + "step": 26885 + }, + { + "epoch": 0.9628449155729045, + "grad_norm": 2.055635452270508, + "learning_rate": 7.22513416305226e-07, + "loss": 1.294, + "step": 26886 + }, + { + "epoch": 0.9628807277024728, + "grad_norm": 1.8848527669906616, + "learning_rate": 7.211222960646691e-07, + "loss": 1.4366, + "step": 26887 + }, + { + "epoch": 0.9629165398320411, + "grad_norm": 1.2585186958312988, + "learning_rate": 7.197325114933651e-07, + "loss": 1.1339, + "step": 26888 + }, + { + "epoch": 0.9629523519616094, + "grad_norm": 1.8972463607788086, + "learning_rate": 7.18344062609988e-07, + "loss": 1.5986, + "step": 26889 + }, + { + "epoch": 0.9629881640911777, + "grad_norm": 2.0489518642425537, + "learning_rate": 7.16956949433234e-07, + "loss": 1.325, + "step": 26890 + }, + { + "epoch": 0.963023976220746, + "grad_norm": 1.7141057252883911, + "learning_rate": 7.155711719817548e-07, + "loss": 1.3843, + "step": 26891 + }, + { + "epoch": 0.9630597883503142, + "grad_norm": 1.6005711555480957, + "learning_rate": 7.141867302742023e-07, + "loss": 1.265, + "step": 26892 + }, + { + "epoch": 0.9630956004798825, + "grad_norm": 1.6023467779159546, + "learning_rate": 7.128036243291947e-07, + "loss": 1.2884, + "step": 26893 + }, + { + "epoch": 0.9631314126094508, + "grad_norm": 1.2686173915863037, + "learning_rate": 7.114218541653395e-07, + "loss": 1.1523, + "step": 26894 + }, + { + "epoch": 0.9631672247390191, + "grad_norm": 1.4661974906921387, + "learning_rate": 7.100414198012439e-07, + "loss": 1.2727, + "step": 26895 + }, + { + "epoch": 0.9632030368685874, + "grad_norm": 1.7659021615982056, + "learning_rate": 7.086623212554488e-07, + "loss": 1.5836, + "step": 26896 + }, + { + "epoch": 0.9632388489981557, + "grad_norm": 1.6870405673980713, + "learning_rate": 7.072845585465282e-07, + "loss": 1.3479, + "step": 26897 + }, + { + "epoch": 0.963274661127724, + "grad_norm": 1.9497534036636353, + "learning_rate": 7.059081316930227e-07, + "loss": 1.334, + "step": 26898 + }, + { + "epoch": 0.9633104732572922, + "grad_norm": 1.5201635360717773, + "learning_rate": 7.045330407134398e-07, + "loss": 1.5913, + "step": 26899 + }, + { + "epoch": 0.9633462853868605, + "grad_norm": 1.6073867082595825, + "learning_rate": 7.03159285626287e-07, + "loss": 1.531, + "step": 26900 + }, + { + "epoch": 0.9633820975164288, + "grad_norm": 2.0254600048065186, + "learning_rate": 7.017868664500382e-07, + "loss": 1.5583, + "step": 26901 + }, + { + "epoch": 0.963417909645997, + "grad_norm": 2.1424214839935303, + "learning_rate": 7.004157832031677e-07, + "loss": 1.5673, + "step": 26902 + }, + { + "epoch": 0.9634537217755654, + "grad_norm": 1.7584158182144165, + "learning_rate": 6.990460359041051e-07, + "loss": 1.17, + "step": 26903 + }, + { + "epoch": 0.9634895339051337, + "grad_norm": 1.9335455894470215, + "learning_rate": 6.976776245712913e-07, + "loss": 1.464, + "step": 26904 + }, + { + "epoch": 0.963525346034702, + "grad_norm": 1.4011025428771973, + "learning_rate": 6.963105492231336e-07, + "loss": 1.5505, + "step": 26905 + }, + { + "epoch": 0.9635611581642702, + "grad_norm": 1.5605167150497437, + "learning_rate": 6.949448098780398e-07, + "loss": 1.2234, + "step": 26906 + }, + { + "epoch": 0.9635969702938385, + "grad_norm": 1.5061067342758179, + "learning_rate": 6.935804065543505e-07, + "loss": 1.1857, + "step": 26907 + }, + { + "epoch": 0.9636327824234068, + "grad_norm": 1.4534320831298828, + "learning_rate": 6.922173392704512e-07, + "loss": 1.625, + "step": 26908 + }, + { + "epoch": 0.963668594552975, + "grad_norm": 1.4374990463256836, + "learning_rate": 6.908556080446715e-07, + "loss": 1.3349, + "step": 26909 + }, + { + "epoch": 0.9637044066825434, + "grad_norm": 1.656003475189209, + "learning_rate": 6.894952128953191e-07, + "loss": 1.3548, + "step": 26910 + }, + { + "epoch": 0.9637402188121117, + "grad_norm": 1.9220056533813477, + "learning_rate": 6.881361538407127e-07, + "loss": 1.3714, + "step": 26911 + }, + { + "epoch": 0.96377603094168, + "grad_norm": 1.139539361000061, + "learning_rate": 6.867784308991266e-07, + "loss": 1.3443, + "step": 26912 + }, + { + "epoch": 0.9638118430712482, + "grad_norm": 1.2987382411956787, + "learning_rate": 6.854220440888459e-07, + "loss": 1.4791, + "step": 26913 + }, + { + "epoch": 0.9638476552008165, + "grad_norm": 1.2175135612487793, + "learning_rate": 6.840669934280897e-07, + "loss": 1.3745, + "step": 26914 + }, + { + "epoch": 0.9638834673303848, + "grad_norm": 1.5563571453094482, + "learning_rate": 6.827132789351098e-07, + "loss": 1.6409, + "step": 26915 + }, + { + "epoch": 0.963919279459953, + "grad_norm": 1.1882492303848267, + "learning_rate": 6.813609006281141e-07, + "loss": 1.41, + "step": 26916 + }, + { + "epoch": 0.9639550915895214, + "grad_norm": 1.3806182146072388, + "learning_rate": 6.800098585252989e-07, + "loss": 1.3614, + "step": 26917 + }, + { + "epoch": 0.9639909037190897, + "grad_norm": 1.8224741220474243, + "learning_rate": 6.786601526448277e-07, + "loss": 1.3087, + "step": 26918 + }, + { + "epoch": 0.964026715848658, + "grad_norm": 1.6787079572677612, + "learning_rate": 6.773117830048747e-07, + "loss": 1.5826, + "step": 26919 + }, + { + "epoch": 0.9640625279782262, + "grad_norm": 2.0062367916107178, + "learning_rate": 6.75964749623581e-07, + "loss": 1.6547, + "step": 26920 + }, + { + "epoch": 0.9640983401077945, + "grad_norm": 1.6232973337173462, + "learning_rate": 6.746190525190543e-07, + "loss": 1.529, + "step": 26921 + }, + { + "epoch": 0.9641341522373628, + "grad_norm": 1.5797841548919678, + "learning_rate": 6.732746917094135e-07, + "loss": 1.5343, + "step": 26922 + }, + { + "epoch": 0.964169964366931, + "grad_norm": 1.5757808685302734, + "learning_rate": 6.719316672127329e-07, + "loss": 1.5117, + "step": 26923 + }, + { + "epoch": 0.9642057764964994, + "grad_norm": 1.458806037902832, + "learning_rate": 6.70589979047087e-07, + "loss": 1.3089, + "step": 26924 + }, + { + "epoch": 0.9642415886260677, + "grad_norm": 1.3439996242523193, + "learning_rate": 6.692496272305282e-07, + "loss": 1.4445, + "step": 26925 + }, + { + "epoch": 0.964277400755636, + "grad_norm": 1.373887538909912, + "learning_rate": 6.679106117810974e-07, + "loss": 1.4416, + "step": 26926 + }, + { + "epoch": 0.9643132128852042, + "grad_norm": 1.9046086072921753, + "learning_rate": 6.665729327167913e-07, + "loss": 1.7455, + "step": 26927 + }, + { + "epoch": 0.9643490250147725, + "grad_norm": 1.4031591415405273, + "learning_rate": 6.652365900556179e-07, + "loss": 1.5515, + "step": 26928 + }, + { + "epoch": 0.9643848371443408, + "grad_norm": 1.617029070854187, + "learning_rate": 6.639015838155515e-07, + "loss": 1.713, + "step": 26929 + }, + { + "epoch": 0.964420649273909, + "grad_norm": 1.4018628597259521, + "learning_rate": 6.625679140145557e-07, + "loss": 0.8941, + "step": 26930 + }, + { + "epoch": 0.9644564614034774, + "grad_norm": 1.6945066452026367, + "learning_rate": 6.612355806705828e-07, + "loss": 1.3377, + "step": 26931 + }, + { + "epoch": 0.9644922735330457, + "grad_norm": 1.6313774585723877, + "learning_rate": 6.599045838015294e-07, + "loss": 1.0908, + "step": 26932 + }, + { + "epoch": 0.9645280856626139, + "grad_norm": 1.6346851587295532, + "learning_rate": 6.585749234253258e-07, + "loss": 1.3156, + "step": 26933 + }, + { + "epoch": 0.9645638977921822, + "grad_norm": 2.169135570526123, + "learning_rate": 6.572465995598575e-07, + "loss": 1.5391, + "step": 26934 + }, + { + "epoch": 0.9645997099217505, + "grad_norm": 1.4126479625701904, + "learning_rate": 6.559196122229994e-07, + "loss": 1.5252, + "step": 26935 + }, + { + "epoch": 0.9646355220513188, + "grad_norm": 2.0156967639923096, + "learning_rate": 6.545939614325924e-07, + "loss": 1.5874, + "step": 26936 + }, + { + "epoch": 0.964671334180887, + "grad_norm": 1.959639310836792, + "learning_rate": 6.532696472064781e-07, + "loss": 1.4451, + "step": 26937 + }, + { + "epoch": 0.9647071463104554, + "grad_norm": 1.6670535802841187, + "learning_rate": 6.519466695624755e-07, + "loss": 1.7676, + "step": 26938 + }, + { + "epoch": 0.9647429584400237, + "grad_norm": 1.3304088115692139, + "learning_rate": 6.506250285183812e-07, + "loss": 1.3935, + "step": 26939 + }, + { + "epoch": 0.9647787705695919, + "grad_norm": 1.6989744901657104, + "learning_rate": 6.493047240919703e-07, + "loss": 1.3568, + "step": 26940 + }, + { + "epoch": 0.9648145826991602, + "grad_norm": 1.3269007205963135, + "learning_rate": 6.479857563010062e-07, + "loss": 1.2906, + "step": 26941 + }, + { + "epoch": 0.9648503948287285, + "grad_norm": 1.288854956626892, + "learning_rate": 6.466681251632522e-07, + "loss": 1.3047, + "step": 26942 + }, + { + "epoch": 0.9648862069582967, + "grad_norm": 1.3249443769454956, + "learning_rate": 6.453518306964168e-07, + "loss": 1.4381, + "step": 26943 + }, + { + "epoch": 0.964922019087865, + "grad_norm": 1.3801206350326538, + "learning_rate": 6.440368729182078e-07, + "loss": 1.4704, + "step": 26944 + }, + { + "epoch": 0.9649578312174334, + "grad_norm": 1.7521178722381592, + "learning_rate": 6.427232518463333e-07, + "loss": 1.3048, + "step": 26945 + }, + { + "epoch": 0.9649936433470017, + "grad_norm": 1.3953019380569458, + "learning_rate": 6.414109674984458e-07, + "loss": 1.4156, + "step": 26946 + }, + { + "epoch": 0.9650294554765699, + "grad_norm": 1.5526511669158936, + "learning_rate": 6.401000198922202e-07, + "loss": 1.5402, + "step": 26947 + }, + { + "epoch": 0.9650652676061382, + "grad_norm": 1.5012935400009155, + "learning_rate": 6.387904090452757e-07, + "loss": 1.5017, + "step": 26948 + }, + { + "epoch": 0.9651010797357065, + "grad_norm": 1.2707685232162476, + "learning_rate": 6.374821349752424e-07, + "loss": 1.1305, + "step": 26949 + }, + { + "epoch": 0.9651368918652747, + "grad_norm": 1.609204649925232, + "learning_rate": 6.361751976997177e-07, + "loss": 1.5067, + "step": 26950 + }, + { + "epoch": 0.965172703994843, + "grad_norm": 1.312595009803772, + "learning_rate": 6.348695972362872e-07, + "loss": 1.4279, + "step": 26951 + }, + { + "epoch": 0.9652085161244114, + "grad_norm": 1.449548602104187, + "learning_rate": 6.33565333602515e-07, + "loss": 1.6088, + "step": 26952 + }, + { + "epoch": 0.9652443282539797, + "grad_norm": 1.653937816619873, + "learning_rate": 6.322624068159421e-07, + "loss": 1.2204, + "step": 26953 + }, + { + "epoch": 0.9652801403835479, + "grad_norm": 2.3628926277160645, + "learning_rate": 6.309608168941217e-07, + "loss": 1.6656, + "step": 26954 + }, + { + "epoch": 0.9653159525131162, + "grad_norm": 1.80828857421875, + "learning_rate": 6.296605638545172e-07, + "loss": 1.3403, + "step": 26955 + }, + { + "epoch": 0.9653517646426845, + "grad_norm": 1.901632308959961, + "learning_rate": 6.283616477146703e-07, + "loss": 1.1962, + "step": 26956 + }, + { + "epoch": 0.9653875767722527, + "grad_norm": 1.6647350788116455, + "learning_rate": 6.270640684920337e-07, + "loss": 1.7878, + "step": 26957 + }, + { + "epoch": 0.965423388901821, + "grad_norm": 1.296832799911499, + "learning_rate": 6.257678262040712e-07, + "loss": 1.3981, + "step": 26958 + }, + { + "epoch": 0.9654592010313894, + "grad_norm": 1.8085285425186157, + "learning_rate": 6.244729208682131e-07, + "loss": 1.1554, + "step": 26959 + }, + { + "epoch": 0.9654950131609576, + "grad_norm": 1.1620607376098633, + "learning_rate": 6.231793525018903e-07, + "loss": 1.0353, + "step": 26960 + }, + { + "epoch": 0.9655308252905259, + "grad_norm": 1.9427391290664673, + "learning_rate": 6.218871211224997e-07, + "loss": 1.4748, + "step": 26961 + }, + { + "epoch": 0.9655666374200942, + "grad_norm": 1.4789810180664062, + "learning_rate": 6.205962267474386e-07, + "loss": 1.1432, + "step": 26962 + }, + { + "epoch": 0.9656024495496625, + "grad_norm": 1.6948950290679932, + "learning_rate": 6.193066693940597e-07, + "loss": 1.1085, + "step": 26963 + }, + { + "epoch": 0.9656382616792307, + "grad_norm": 1.3911237716674805, + "learning_rate": 6.180184490797158e-07, + "loss": 1.3466, + "step": 26964 + }, + { + "epoch": 0.965674073808799, + "grad_norm": 1.7155574560165405, + "learning_rate": 6.167315658217376e-07, + "loss": 1.575, + "step": 26965 + }, + { + "epoch": 0.9657098859383674, + "grad_norm": 2.408755302429199, + "learning_rate": 6.154460196374445e-07, + "loss": 1.6934, + "step": 26966 + }, + { + "epoch": 0.9657456980679356, + "grad_norm": 1.5642789602279663, + "learning_rate": 6.141618105441227e-07, + "loss": 1.4295, + "step": 26967 + }, + { + "epoch": 0.9657815101975039, + "grad_norm": 1.8017868995666504, + "learning_rate": 6.128789385590583e-07, + "loss": 1.1765, + "step": 26968 + }, + { + "epoch": 0.9658173223270722, + "grad_norm": 2.101372241973877, + "learning_rate": 6.115974036995154e-07, + "loss": 1.272, + "step": 26969 + }, + { + "epoch": 0.9658531344566404, + "grad_norm": 1.5753008127212524, + "learning_rate": 6.103172059827134e-07, + "loss": 1.1554, + "step": 26970 + }, + { + "epoch": 0.9658889465862087, + "grad_norm": 1.654207468032837, + "learning_rate": 6.090383454259052e-07, + "loss": 1.7734, + "step": 26971 + }, + { + "epoch": 0.965924758715777, + "grad_norm": 2.283719539642334, + "learning_rate": 6.077608220462771e-07, + "loss": 0.987, + "step": 26972 + }, + { + "epoch": 0.9659605708453454, + "grad_norm": 1.7849880456924438, + "learning_rate": 6.064846358610154e-07, + "loss": 1.5324, + "step": 26973 + }, + { + "epoch": 0.9659963829749136, + "grad_norm": 2.171529769897461, + "learning_rate": 6.052097868872953e-07, + "loss": 1.4802, + "step": 26974 + }, + { + "epoch": 0.9660321951044819, + "grad_norm": 2.1995201110839844, + "learning_rate": 6.039362751422695e-07, + "loss": 1.3182, + "step": 26975 + }, + { + "epoch": 0.9660680072340502, + "grad_norm": 1.4685053825378418, + "learning_rate": 6.026641006430689e-07, + "loss": 1.4164, + "step": 26976 + }, + { + "epoch": 0.9661038193636184, + "grad_norm": 1.7886309623718262, + "learning_rate": 6.013932634068021e-07, + "loss": 1.6115, + "step": 26977 + }, + { + "epoch": 0.9661396314931867, + "grad_norm": 1.714840054512024, + "learning_rate": 6.001237634505885e-07, + "loss": 1.8374, + "step": 26978 + }, + { + "epoch": 0.966175443622755, + "grad_norm": 1.6897437572479248, + "learning_rate": 5.988556007914814e-07, + "loss": 1.9828, + "step": 26979 + }, + { + "epoch": 0.9662112557523234, + "grad_norm": 1.6048061847686768, + "learning_rate": 5.975887754465559e-07, + "loss": 1.5878, + "step": 26980 + }, + { + "epoch": 0.9662470678818916, + "grad_norm": 1.753430724143982, + "learning_rate": 5.96323287432854e-07, + "loss": 1.4545, + "step": 26981 + }, + { + "epoch": 0.9662828800114599, + "grad_norm": 1.6798697710037231, + "learning_rate": 5.950591367674064e-07, + "loss": 1.2394, + "step": 26982 + }, + { + "epoch": 0.9663186921410282, + "grad_norm": 2.274351119995117, + "learning_rate": 5.937963234672106e-07, + "loss": 1.3658, + "step": 26983 + }, + { + "epoch": 0.9663545042705964, + "grad_norm": 1.8976236581802368, + "learning_rate": 5.925348475492643e-07, + "loss": 1.0295, + "step": 26984 + }, + { + "epoch": 0.9663903164001647, + "grad_norm": 1.966462254524231, + "learning_rate": 5.912747090305315e-07, + "loss": 1.659, + "step": 26985 + }, + { + "epoch": 0.966426128529733, + "grad_norm": 1.4895193576812744, + "learning_rate": 5.900159079279654e-07, + "loss": 1.377, + "step": 26986 + }, + { + "epoch": 0.9664619406593014, + "grad_norm": 1.6803325414657593, + "learning_rate": 5.887584442585081e-07, + "loss": 1.2961, + "step": 26987 + }, + { + "epoch": 0.9664977527888696, + "grad_norm": 1.4784119129180908, + "learning_rate": 5.875023180390793e-07, + "loss": 1.6773, + "step": 26988 + }, + { + "epoch": 0.9665335649184379, + "grad_norm": 1.7596663236618042, + "learning_rate": 5.862475292865655e-07, + "loss": 1.3413, + "step": 26989 + }, + { + "epoch": 0.9665693770480062, + "grad_norm": 2.026500940322876, + "learning_rate": 5.849940780178642e-07, + "loss": 1.1128, + "step": 26990 + }, + { + "epoch": 0.9666051891775744, + "grad_norm": 1.9572885036468506, + "learning_rate": 5.837419642498288e-07, + "loss": 1.7062, + "step": 26991 + }, + { + "epoch": 0.9666410013071427, + "grad_norm": 1.2944083213806152, + "learning_rate": 5.824911879993123e-07, + "loss": 1.2943, + "step": 26992 + }, + { + "epoch": 0.966676813436711, + "grad_norm": 2.0377883911132812, + "learning_rate": 5.812417492831346e-07, + "loss": 1.4307, + "step": 26993 + }, + { + "epoch": 0.9667126255662793, + "grad_norm": 1.5536547899246216, + "learning_rate": 5.799936481181045e-07, + "loss": 1.448, + "step": 26994 + }, + { + "epoch": 0.9667484376958476, + "grad_norm": 2.0070462226867676, + "learning_rate": 5.787468845210198e-07, + "loss": 1.2658, + "step": 26995 + }, + { + "epoch": 0.9667842498254159, + "grad_norm": 1.606011986732483, + "learning_rate": 5.775014585086446e-07, + "loss": 1.7443, + "step": 26996 + }, + { + "epoch": 0.9668200619549842, + "grad_norm": 2.032684326171875, + "learning_rate": 5.762573700977547e-07, + "loss": 1.5145, + "step": 26997 + }, + { + "epoch": 0.9668558740845524, + "grad_norm": 1.5862538814544678, + "learning_rate": 5.750146193050698e-07, + "loss": 1.3515, + "step": 26998 + }, + { + "epoch": 0.9668916862141207, + "grad_norm": 1.840734839439392, + "learning_rate": 5.7377320614731e-07, + "loss": 1.6445, + "step": 26999 + }, + { + "epoch": 0.966927498343689, + "grad_norm": 2.2183516025543213, + "learning_rate": 5.725331306411841e-07, + "loss": 1.6061, + "step": 27000 + }, + { + "epoch": 0.9669633104732573, + "grad_norm": 1.8486427068710327, + "learning_rate": 5.712943928033787e-07, + "loss": 1.2393, + "step": 27001 + }, + { + "epoch": 0.9669991226028256, + "grad_norm": 1.6504371166229248, + "learning_rate": 5.700569926505361e-07, + "loss": 1.1869, + "step": 27002 + }, + { + "epoch": 0.9670349347323939, + "grad_norm": 1.9643383026123047, + "learning_rate": 5.688209301993319e-07, + "loss": 1.5403, + "step": 27003 + }, + { + "epoch": 0.9670707468619621, + "grad_norm": 1.8288096189498901, + "learning_rate": 5.675862054663861e-07, + "loss": 1.4967, + "step": 27004 + }, + { + "epoch": 0.9671065589915304, + "grad_norm": 2.0646116733551025, + "learning_rate": 5.663528184683186e-07, + "loss": 1.5351, + "step": 27005 + }, + { + "epoch": 0.9671423711210987, + "grad_norm": 2.1061975955963135, + "learning_rate": 5.651207692216942e-07, + "loss": 1.564, + "step": 27006 + }, + { + "epoch": 0.967178183250667, + "grad_norm": 1.4611119031906128, + "learning_rate": 5.638900577431216e-07, + "loss": 1.3468, + "step": 27007 + }, + { + "epoch": 0.9672139953802353, + "grad_norm": 1.4998737573623657, + "learning_rate": 5.626606840491433e-07, + "loss": 1.6063, + "step": 27008 + }, + { + "epoch": 0.9672498075098036, + "grad_norm": 1.3977317810058594, + "learning_rate": 5.614326481562904e-07, + "loss": 1.4851, + "step": 27009 + }, + { + "epoch": 0.9672856196393719, + "grad_norm": 1.8437548875808716, + "learning_rate": 5.602059500811052e-07, + "loss": 1.705, + "step": 27010 + }, + { + "epoch": 0.9673214317689401, + "grad_norm": 1.445892572402954, + "learning_rate": 5.589805898400746e-07, + "loss": 1.3189, + "step": 27011 + }, + { + "epoch": 0.9673572438985084, + "grad_norm": 1.510178804397583, + "learning_rate": 5.577565674496965e-07, + "loss": 1.3705, + "step": 27012 + }, + { + "epoch": 0.9673930560280767, + "grad_norm": 1.8084083795547485, + "learning_rate": 5.565338829264355e-07, + "loss": 1.4164, + "step": 27013 + }, + { + "epoch": 0.967428868157645, + "grad_norm": 1.2820135354995728, + "learning_rate": 5.553125362867228e-07, + "loss": 1.2558, + "step": 27014 + }, + { + "epoch": 0.9674646802872133, + "grad_norm": 1.898093581199646, + "learning_rate": 5.540925275470232e-07, + "loss": 1.796, + "step": 27015 + }, + { + "epoch": 0.9675004924167816, + "grad_norm": 1.603390097618103, + "learning_rate": 5.528738567237235e-07, + "loss": 1.697, + "step": 27016 + }, + { + "epoch": 0.9675363045463499, + "grad_norm": 1.477070927619934, + "learning_rate": 5.516565238332328e-07, + "loss": 1.2186, + "step": 27017 + }, + { + "epoch": 0.9675721166759181, + "grad_norm": 1.664985179901123, + "learning_rate": 5.504405288919156e-07, + "loss": 1.1862, + "step": 27018 + }, + { + "epoch": 0.9676079288054864, + "grad_norm": 1.6416716575622559, + "learning_rate": 5.492258719161481e-07, + "loss": 1.4381, + "step": 27019 + }, + { + "epoch": 0.9676437409350547, + "grad_norm": 1.1065994501113892, + "learning_rate": 5.480125529222613e-07, + "loss": 1.0456, + "step": 27020 + }, + { + "epoch": 0.9676795530646229, + "grad_norm": 1.6238538026809692, + "learning_rate": 5.468005719265868e-07, + "loss": 1.6424, + "step": 27021 + }, + { + "epoch": 0.9677153651941913, + "grad_norm": 1.3962935209274292, + "learning_rate": 5.455899289454225e-07, + "loss": 1.6138, + "step": 27022 + }, + { + "epoch": 0.9677511773237596, + "grad_norm": 1.931966781616211, + "learning_rate": 5.443806239950555e-07, + "loss": 1.2321, + "step": 27023 + }, + { + "epoch": 0.9677869894533279, + "grad_norm": 1.9834296703338623, + "learning_rate": 5.431726570917617e-07, + "loss": 1.26, + "step": 27024 + }, + { + "epoch": 0.9678228015828961, + "grad_norm": 1.65738844871521, + "learning_rate": 5.419660282517836e-07, + "loss": 1.153, + "step": 27025 + }, + { + "epoch": 0.9678586137124644, + "grad_norm": 1.481104850769043, + "learning_rate": 5.407607374913748e-07, + "loss": 1.4851, + "step": 27026 + }, + { + "epoch": 0.9678944258420327, + "grad_norm": 1.8398284912109375, + "learning_rate": 5.395567848267225e-07, + "loss": 1.6348, + "step": 27027 + }, + { + "epoch": 0.9679302379716009, + "grad_norm": 1.63754141330719, + "learning_rate": 5.383541702740469e-07, + "loss": 1.5278, + "step": 27028 + }, + { + "epoch": 0.9679660501011693, + "grad_norm": 1.5657932758331299, + "learning_rate": 5.37152893849513e-07, + "loss": 1.4017, + "step": 27029 + }, + { + "epoch": 0.9680018622307376, + "grad_norm": 1.9750888347625732, + "learning_rate": 5.359529555692966e-07, + "loss": 1.5823, + "step": 27030 + }, + { + "epoch": 0.9680376743603059, + "grad_norm": 1.7874226570129395, + "learning_rate": 5.347543554495293e-07, + "loss": 1.4156, + "step": 27031 + }, + { + "epoch": 0.9680734864898741, + "grad_norm": 2.025090456008911, + "learning_rate": 5.335570935063427e-07, + "loss": 1.6055, + "step": 27032 + }, + { + "epoch": 0.9681092986194424, + "grad_norm": 1.950339913368225, + "learning_rate": 5.323611697558462e-07, + "loss": 1.2355, + "step": 27033 + }, + { + "epoch": 0.9681451107490107, + "grad_norm": 1.6537014245986938, + "learning_rate": 5.311665842141155e-07, + "loss": 1.3523, + "step": 27034 + }, + { + "epoch": 0.9681809228785789, + "grad_norm": 1.566712498664856, + "learning_rate": 5.299733368972492e-07, + "loss": 1.5859, + "step": 27035 + }, + { + "epoch": 0.9682167350081473, + "grad_norm": 2.611342668533325, + "learning_rate": 5.287814278212677e-07, + "loss": 1.6179, + "step": 27036 + }, + { + "epoch": 0.9682525471377156, + "grad_norm": 1.9200516939163208, + "learning_rate": 5.275908570022359e-07, + "loss": 1.6782, + "step": 27037 + }, + { + "epoch": 0.9682883592672838, + "grad_norm": 1.4160640239715576, + "learning_rate": 5.26401624456152e-07, + "loss": 1.287, + "step": 27038 + }, + { + "epoch": 0.9683241713968521, + "grad_norm": 1.3941360712051392, + "learning_rate": 5.252137301990256e-07, + "loss": 1.3371, + "step": 27039 + }, + { + "epoch": 0.9683599835264204, + "grad_norm": 3.1313459873199463, + "learning_rate": 5.240271742468328e-07, + "loss": 1.3915, + "step": 27040 + }, + { + "epoch": 0.9683957956559887, + "grad_norm": 2.499171018600464, + "learning_rate": 5.228419566155385e-07, + "loss": 1.3878, + "step": 27041 + }, + { + "epoch": 0.9684316077855569, + "grad_norm": 1.5517430305480957, + "learning_rate": 5.216580773210966e-07, + "loss": 1.3956, + "step": 27042 + }, + { + "epoch": 0.9684674199151253, + "grad_norm": 1.5848878622055054, + "learning_rate": 5.204755363794167e-07, + "loss": 1.3835, + "step": 27043 + }, + { + "epoch": 0.9685032320446936, + "grad_norm": 1.4164799451828003, + "learning_rate": 5.192943338064305e-07, + "loss": 1.5956, + "step": 27044 + }, + { + "epoch": 0.9685390441742618, + "grad_norm": 1.7449172735214233, + "learning_rate": 5.18114469618014e-07, + "loss": 1.7308, + "step": 27045 + }, + { + "epoch": 0.9685748563038301, + "grad_norm": 1.5101077556610107, + "learning_rate": 5.169359438300436e-07, + "loss": 1.1332, + "step": 27046 + }, + { + "epoch": 0.9686106684333984, + "grad_norm": 1.4586726427078247, + "learning_rate": 5.157587564583733e-07, + "loss": 1.5053, + "step": 27047 + }, + { + "epoch": 0.9686464805629666, + "grad_norm": 1.5485001802444458, + "learning_rate": 5.145829075188457e-07, + "loss": 1.1668, + "step": 27048 + }, + { + "epoch": 0.9686822926925349, + "grad_norm": 1.5960198640823364, + "learning_rate": 5.134083970272819e-07, + "loss": 1.6091, + "step": 27049 + }, + { + "epoch": 0.9687181048221033, + "grad_norm": 1.9195680618286133, + "learning_rate": 5.1223522499948e-07, + "loss": 1.3194, + "step": 27050 + }, + { + "epoch": 0.9687539169516716, + "grad_norm": 1.7722783088684082, + "learning_rate": 5.110633914512164e-07, + "loss": 1.5024, + "step": 27051 + }, + { + "epoch": 0.9687897290812398, + "grad_norm": 1.3769675493240356, + "learning_rate": 5.098928963982674e-07, + "loss": 1.5481, + "step": 27052 + }, + { + "epoch": 0.9688255412108081, + "grad_norm": 1.214339256286621, + "learning_rate": 5.08723739856376e-07, + "loss": 1.1588, + "step": 27053 + }, + { + "epoch": 0.9688613533403764, + "grad_norm": 1.6598423719406128, + "learning_rate": 5.07555921841274e-07, + "loss": 1.4385, + "step": 27054 + }, + { + "epoch": 0.9688971654699446, + "grad_norm": 1.1218611001968384, + "learning_rate": 5.06389442368671e-07, + "loss": 1.3616, + "step": 27055 + }, + { + "epoch": 0.9689329775995129, + "grad_norm": 1.3781205415725708, + "learning_rate": 5.052243014542546e-07, + "loss": 1.5814, + "step": 27056 + }, + { + "epoch": 0.9689687897290813, + "grad_norm": 1.7898406982421875, + "learning_rate": 5.040604991137121e-07, + "loss": 1.1487, + "step": 27057 + }, + { + "epoch": 0.9690046018586496, + "grad_norm": 1.6360304355621338, + "learning_rate": 5.028980353626866e-07, + "loss": 1.4937, + "step": 27058 + }, + { + "epoch": 0.9690404139882178, + "grad_norm": 1.2866710424423218, + "learning_rate": 5.017369102168435e-07, + "loss": 1.2457, + "step": 27059 + }, + { + "epoch": 0.9690762261177861, + "grad_norm": 1.3128433227539062, + "learning_rate": 5.005771236917811e-07, + "loss": 1.4053, + "step": 27060 + }, + { + "epoch": 0.9691120382473544, + "grad_norm": 1.6207622289657593, + "learning_rate": 4.994186758030983e-07, + "loss": 1.5096, + "step": 27061 + }, + { + "epoch": 0.9691478503769226, + "grad_norm": 1.527236819267273, + "learning_rate": 4.982615665663937e-07, + "loss": 1.4559, + "step": 27062 + }, + { + "epoch": 0.9691836625064909, + "grad_norm": 1.528676152229309, + "learning_rate": 4.971057959972325e-07, + "loss": 1.1819, + "step": 27063 + }, + { + "epoch": 0.9692194746360593, + "grad_norm": 1.549570083618164, + "learning_rate": 4.95951364111169e-07, + "loss": 1.344, + "step": 27064 + }, + { + "epoch": 0.9692552867656276, + "grad_norm": 2.239349603652954, + "learning_rate": 4.947982709237131e-07, + "loss": 1.349, + "step": 27065 + }, + { + "epoch": 0.9692910988951958, + "grad_norm": 1.8288278579711914, + "learning_rate": 4.936465164504079e-07, + "loss": 1.8213, + "step": 27066 + }, + { + "epoch": 0.9693269110247641, + "grad_norm": 1.5280758142471313, + "learning_rate": 4.924961007067408e-07, + "loss": 1.7326, + "step": 27067 + }, + { + "epoch": 0.9693627231543324, + "grad_norm": 1.4166117906570435, + "learning_rate": 4.913470237081774e-07, + "loss": 1.5967, + "step": 27068 + }, + { + "epoch": 0.9693985352839006, + "grad_norm": 1.780877947807312, + "learning_rate": 4.90199285470172e-07, + "loss": 1.6305, + "step": 27069 + }, + { + "epoch": 0.9694343474134689, + "grad_norm": 1.342950463294983, + "learning_rate": 4.8905288600819e-07, + "loss": 1.2657, + "step": 27070 + }, + { + "epoch": 0.9694701595430373, + "grad_norm": 1.7711330652236938, + "learning_rate": 4.879078253376412e-07, + "loss": 1.4819, + "step": 27071 + }, + { + "epoch": 0.9695059716726055, + "grad_norm": 1.4958521127700806, + "learning_rate": 4.867641034739134e-07, + "loss": 1.4194, + "step": 27072 + }, + { + "epoch": 0.9695417838021738, + "grad_norm": 1.9198914766311646, + "learning_rate": 4.856217204324275e-07, + "loss": 1.6569, + "step": 27073 + }, + { + "epoch": 0.9695775959317421, + "grad_norm": 1.47645902633667, + "learning_rate": 4.844806762285381e-07, + "loss": 1.2356, + "step": 27074 + }, + { + "epoch": 0.9696134080613104, + "grad_norm": 1.583561658859253, + "learning_rate": 4.833409708775882e-07, + "loss": 1.4539, + "step": 27075 + }, + { + "epoch": 0.9696492201908786, + "grad_norm": 1.3329954147338867, + "learning_rate": 4.822026043949213e-07, + "loss": 1.4844, + "step": 27076 + }, + { + "epoch": 0.9696850323204469, + "grad_norm": 1.3243921995162964, + "learning_rate": 4.810655767958583e-07, + "loss": 1.1535, + "step": 27077 + }, + { + "epoch": 0.9697208444500153, + "grad_norm": 1.422972559928894, + "learning_rate": 4.799298880956759e-07, + "loss": 1.4926, + "step": 27078 + }, + { + "epoch": 0.9697566565795835, + "grad_norm": 1.622788667678833, + "learning_rate": 4.787955383096731e-07, + "loss": 1.1752, + "step": 27079 + }, + { + "epoch": 0.9697924687091518, + "grad_norm": 1.558379888534546, + "learning_rate": 4.776625274530933e-07, + "loss": 1.1288, + "step": 27080 + }, + { + "epoch": 0.9698282808387201, + "grad_norm": 1.6241610050201416, + "learning_rate": 4.765308555411907e-07, + "loss": 1.1744, + "step": 27081 + }, + { + "epoch": 0.9698640929682883, + "grad_norm": 1.4816008806228638, + "learning_rate": 4.754005225891978e-07, + "loss": 1.344, + "step": 27082 + }, + { + "epoch": 0.9698999050978566, + "grad_norm": 1.7448887825012207, + "learning_rate": 4.742715286123134e-07, + "loss": 1.3445, + "step": 27083 + }, + { + "epoch": 0.9699357172274249, + "grad_norm": 1.9753953218460083, + "learning_rate": 4.7314387362572545e-07, + "loss": 1.3754, + "step": 27084 + }, + { + "epoch": 0.9699715293569933, + "grad_norm": 1.8347922563552856, + "learning_rate": 4.7201755764459953e-07, + "loss": 1.4986, + "step": 27085 + }, + { + "epoch": 0.9700073414865615, + "grad_norm": 1.6232060194015503, + "learning_rate": 4.7089258068410133e-07, + "loss": 1.4232, + "step": 27086 + }, + { + "epoch": 0.9700431536161298, + "grad_norm": 2.0615720748901367, + "learning_rate": 4.6976894275935215e-07, + "loss": 1.9353, + "step": 27087 + }, + { + "epoch": 0.9700789657456981, + "grad_norm": 1.7156115770339966, + "learning_rate": 4.6864664388548417e-07, + "loss": 1.4939, + "step": 27088 + }, + { + "epoch": 0.9701147778752663, + "grad_norm": 1.5696758031845093, + "learning_rate": 4.6752568407759655e-07, + "loss": 1.2644, + "step": 27089 + }, + { + "epoch": 0.9701505900048346, + "grad_norm": 1.4447704553604126, + "learning_rate": 4.66406063350755e-07, + "loss": 1.4013, + "step": 27090 + }, + { + "epoch": 0.9701864021344029, + "grad_norm": 1.5259318351745605, + "learning_rate": 4.652877817200252e-07, + "loss": 1.5891, + "step": 27091 + }, + { + "epoch": 0.9702222142639713, + "grad_norm": 1.3538051843643188, + "learning_rate": 4.6417083920046176e-07, + "loss": 1.4276, + "step": 27092 + }, + { + "epoch": 0.9702580263935395, + "grad_norm": 1.2154916524887085, + "learning_rate": 4.630552358070972e-07, + "loss": 1.2274, + "step": 27093 + }, + { + "epoch": 0.9702938385231078, + "grad_norm": 1.5083569288253784, + "learning_rate": 4.6194097155491944e-07, + "loss": 1.7358, + "step": 27094 + }, + { + "epoch": 0.9703296506526761, + "grad_norm": 1.5696214437484741, + "learning_rate": 4.6082804645893874e-07, + "loss": 1.4356, + "step": 27095 + }, + { + "epoch": 0.9703654627822443, + "grad_norm": 1.5140817165374756, + "learning_rate": 4.597164605341209e-07, + "loss": 1.3379, + "step": 27096 + }, + { + "epoch": 0.9704012749118126, + "grad_norm": 1.9681334495544434, + "learning_rate": 4.5860621379540944e-07, + "loss": 1.3589, + "step": 27097 + }, + { + "epoch": 0.9704370870413809, + "grad_norm": 1.6079471111297607, + "learning_rate": 4.574973062577592e-07, + "loss": 1.2374, + "step": 27098 + }, + { + "epoch": 0.9704728991709493, + "grad_norm": 1.9617375135421753, + "learning_rate": 4.5638973793608043e-07, + "loss": 1.429, + "step": 27099 + }, + { + "epoch": 0.9705087113005175, + "grad_norm": 1.4626727104187012, + "learning_rate": 4.5528350884528335e-07, + "loss": 1.5835, + "step": 27100 + }, + { + "epoch": 0.9705445234300858, + "grad_norm": 1.6523990631103516, + "learning_rate": 4.5417861900023397e-07, + "loss": 1.3469, + "step": 27101 + }, + { + "epoch": 0.9705803355596541, + "grad_norm": 1.3021047115325928, + "learning_rate": 4.5307506841580914e-07, + "loss": 1.2336, + "step": 27102 + }, + { + "epoch": 0.9706161476892223, + "grad_norm": 1.75713050365448, + "learning_rate": 4.5197285710685265e-07, + "loss": 1.8113, + "step": 27103 + }, + { + "epoch": 0.9706519598187906, + "grad_norm": 1.3806419372558594, + "learning_rate": 4.5087198508819705e-07, + "loss": 1.3937, + "step": 27104 + }, + { + "epoch": 0.9706877719483589, + "grad_norm": 1.4847077131271362, + "learning_rate": 4.497724523746416e-07, + "loss": 1.3979, + "step": 27105 + }, + { + "epoch": 0.9707235840779272, + "grad_norm": 1.5948593616485596, + "learning_rate": 4.486742589809967e-07, + "loss": 1.5002, + "step": 27106 + }, + { + "epoch": 0.9707593962074955, + "grad_norm": 1.5786889791488647, + "learning_rate": 4.4757740492201717e-07, + "loss": 1.5142, + "step": 27107 + }, + { + "epoch": 0.9707952083370638, + "grad_norm": 2.1769702434539795, + "learning_rate": 4.464818902124801e-07, + "loss": 1.2433, + "step": 27108 + }, + { + "epoch": 0.970831020466632, + "grad_norm": 1.6344908475875854, + "learning_rate": 4.4538771486710706e-07, + "loss": 1.8311, + "step": 27109 + }, + { + "epoch": 0.9708668325962003, + "grad_norm": 2.086514472961426, + "learning_rate": 4.442948789006307e-07, + "loss": 1.4124, + "step": 27110 + }, + { + "epoch": 0.9709026447257686, + "grad_norm": 2.050281286239624, + "learning_rate": 4.432033823277504e-07, + "loss": 1.2815, + "step": 27111 + }, + { + "epoch": 0.9709384568553369, + "grad_norm": 2.2314565181732178, + "learning_rate": 4.4211322516314324e-07, + "loss": 1.7093, + "step": 27112 + }, + { + "epoch": 0.9709742689849052, + "grad_norm": 1.5358484983444214, + "learning_rate": 4.410244074214864e-07, + "loss": 1.4834, + "step": 27113 + }, + { + "epoch": 0.9710100811144735, + "grad_norm": 1.5115602016448975, + "learning_rate": 4.399369291174349e-07, + "loss": 1.2563, + "step": 27114 + }, + { + "epoch": 0.9710458932440418, + "grad_norm": 2.1405441761016846, + "learning_rate": 4.388507902655881e-07, + "loss": 1.3515, + "step": 27115 + }, + { + "epoch": 0.97108170537361, + "grad_norm": 1.8947888612747192, + "learning_rate": 4.377659908805898e-07, + "loss": 1.5495, + "step": 27116 + }, + { + "epoch": 0.9711175175031783, + "grad_norm": 1.3705615997314453, + "learning_rate": 4.366825309770284e-07, + "loss": 1.0637, + "step": 27117 + }, + { + "epoch": 0.9711533296327466, + "grad_norm": 2.133995532989502, + "learning_rate": 4.3560041056947e-07, + "loss": 1.4022, + "step": 27118 + }, + { + "epoch": 0.9711891417623149, + "grad_norm": 2.3694558143615723, + "learning_rate": 4.345196296724807e-07, + "loss": 1.4344, + "step": 27119 + }, + { + "epoch": 0.9712249538918832, + "grad_norm": 1.50997793674469, + "learning_rate": 4.334401883005934e-07, + "loss": 1.4774, + "step": 27120 + }, + { + "epoch": 0.9712607660214515, + "grad_norm": 2.032027244567871, + "learning_rate": 4.32362086468352e-07, + "loss": 1.5578, + "step": 27121 + }, + { + "epoch": 0.9712965781510198, + "grad_norm": 1.652375340461731, + "learning_rate": 4.312853241902337e-07, + "loss": 1.4457, + "step": 27122 + }, + { + "epoch": 0.971332390280588, + "grad_norm": 1.819873332977295, + "learning_rate": 4.3020990148073815e-07, + "loss": 1.6141, + "step": 27123 + }, + { + "epoch": 0.9713682024101563, + "grad_norm": 1.587869644165039, + "learning_rate": 4.2913581835433147e-07, + "loss": 1.6089, + "step": 27124 + }, + { + "epoch": 0.9714040145397246, + "grad_norm": 1.382434368133545, + "learning_rate": 4.2806307482546883e-07, + "loss": 1.5386, + "step": 27125 + }, + { + "epoch": 0.9714398266692928, + "grad_norm": 1.4083807468414307, + "learning_rate": 4.2699167090858303e-07, + "loss": 1.2496, + "step": 27126 + }, + { + "epoch": 0.9714756387988612, + "grad_norm": 1.9262927770614624, + "learning_rate": 4.259216066180738e-07, + "loss": 1.4704, + "step": 27127 + }, + { + "epoch": 0.9715114509284295, + "grad_norm": 1.735276460647583, + "learning_rate": 4.248528819683517e-07, + "loss": 1.3874, + "step": 27128 + }, + { + "epoch": 0.9715472630579978, + "grad_norm": 1.3945090770721436, + "learning_rate": 4.2378549697380533e-07, + "loss": 1.4935, + "step": 27129 + }, + { + "epoch": 0.971583075187566, + "grad_norm": 1.7358595132827759, + "learning_rate": 4.2271945164876756e-07, + "loss": 1.4826, + "step": 27130 + }, + { + "epoch": 0.9716188873171343, + "grad_norm": 1.6000685691833496, + "learning_rate": 4.216547460075937e-07, + "loss": 1.4768, + "step": 27131 + }, + { + "epoch": 0.9716546994467026, + "grad_norm": 1.8158823251724243, + "learning_rate": 4.2059138006460554e-07, + "loss": 1.4642, + "step": 27132 + }, + { + "epoch": 0.9716905115762708, + "grad_norm": 1.5935328006744385, + "learning_rate": 4.1952935383412494e-07, + "loss": 1.3549, + "step": 27133 + }, + { + "epoch": 0.9717263237058391, + "grad_norm": 1.1625258922576904, + "learning_rate": 4.1846866733041834e-07, + "loss": 1.3621, + "step": 27134 + }, + { + "epoch": 0.9717621358354075, + "grad_norm": 1.8553330898284912, + "learning_rate": 4.1740932056776317e-07, + "loss": 1.5953, + "step": 27135 + }, + { + "epoch": 0.9717979479649758, + "grad_norm": 1.6651084423065186, + "learning_rate": 4.1635131356041467e-07, + "loss": 1.6272, + "step": 27136 + }, + { + "epoch": 0.971833760094544, + "grad_norm": 1.5465548038482666, + "learning_rate": 4.1529464632260597e-07, + "loss": 1.3804, + "step": 27137 + }, + { + "epoch": 0.9718695722241123, + "grad_norm": 1.316346526145935, + "learning_rate": 4.1423931886854785e-07, + "loss": 1.5989, + "step": 27138 + }, + { + "epoch": 0.9719053843536806, + "grad_norm": 1.3082847595214844, + "learning_rate": 4.131853312124512e-07, + "loss": 1.5898, + "step": 27139 + }, + { + "epoch": 0.9719411964832488, + "grad_norm": 1.3995599746704102, + "learning_rate": 4.1213268336849355e-07, + "loss": 1.4302, + "step": 27140 + }, + { + "epoch": 0.9719770086128171, + "grad_norm": 1.8354229927062988, + "learning_rate": 4.1108137535081914e-07, + "loss": 1.3678, + "step": 27141 + }, + { + "epoch": 0.9720128207423855, + "grad_norm": 1.4459917545318604, + "learning_rate": 4.1003140717358336e-07, + "loss": 1.4699, + "step": 27142 + }, + { + "epoch": 0.9720486328719538, + "grad_norm": 1.557350993156433, + "learning_rate": 4.089827788509304e-07, + "loss": 1.2917, + "step": 27143 + }, + { + "epoch": 0.972084445001522, + "grad_norm": 1.7156989574432373, + "learning_rate": 4.079354903969379e-07, + "loss": 1.4024, + "step": 27144 + }, + { + "epoch": 0.9721202571310903, + "grad_norm": 2.2056097984313965, + "learning_rate": 4.068895418257057e-07, + "loss": 1.2329, + "step": 27145 + }, + { + "epoch": 0.9721560692606586, + "grad_norm": 1.616719126701355, + "learning_rate": 4.0584493315131146e-07, + "loss": 1.4484, + "step": 27146 + }, + { + "epoch": 0.9721918813902268, + "grad_norm": 1.549475908279419, + "learning_rate": 4.048016643878105e-07, + "loss": 1.6468, + "step": 27147 + }, + { + "epoch": 0.9722276935197951, + "grad_norm": 1.5446029901504517, + "learning_rate": 4.037597355492362e-07, + "loss": 1.5179, + "step": 27148 + }, + { + "epoch": 0.9722635056493635, + "grad_norm": 1.9477657079696655, + "learning_rate": 4.0271914664959944e-07, + "loss": 1.5566, + "step": 27149 + }, + { + "epoch": 0.9722993177789317, + "grad_norm": 1.5786957740783691, + "learning_rate": 4.016798977029113e-07, + "loss": 1.2298, + "step": 27150 + }, + { + "epoch": 0.9723351299085, + "grad_norm": 1.673638939857483, + "learning_rate": 4.006419887231383e-07, + "loss": 1.5531, + "step": 27151 + }, + { + "epoch": 0.9723709420380683, + "grad_norm": 1.7010632753372192, + "learning_rate": 3.9960541972426936e-07, + "loss": 1.5984, + "step": 27152 + }, + { + "epoch": 0.9724067541676366, + "grad_norm": 1.7504063844680786, + "learning_rate": 3.985701907202155e-07, + "loss": 1.3279, + "step": 27153 + }, + { + "epoch": 0.9724425662972048, + "grad_norm": 1.5936665534973145, + "learning_rate": 3.975363017249323e-07, + "loss": 1.2734, + "step": 27154 + }, + { + "epoch": 0.9724783784267731, + "grad_norm": 1.544960856437683, + "learning_rate": 3.9650375275231967e-07, + "loss": 1.4678, + "step": 27155 + }, + { + "epoch": 0.9725141905563415, + "grad_norm": 1.6201308965682983, + "learning_rate": 3.9547254381626653e-07, + "loss": 1.3237, + "step": 27156 + }, + { + "epoch": 0.9725500026859097, + "grad_norm": 1.5041776895523071, + "learning_rate": 3.944426749306507e-07, + "loss": 1.4167, + "step": 27157 + }, + { + "epoch": 0.972585814815478, + "grad_norm": 1.9468683004379272, + "learning_rate": 3.934141461093277e-07, + "loss": 1.2188, + "step": 27158 + }, + { + "epoch": 0.9726216269450463, + "grad_norm": 1.457405686378479, + "learning_rate": 3.9238695736614207e-07, + "loss": 1.3617, + "step": 27159 + }, + { + "epoch": 0.9726574390746145, + "grad_norm": 1.630591630935669, + "learning_rate": 3.913611087148938e-07, + "loss": 1.438, + "step": 27160 + }, + { + "epoch": 0.9726932512041828, + "grad_norm": 1.6744720935821533, + "learning_rate": 3.903366001694053e-07, + "loss": 1.4143, + "step": 27161 + }, + { + "epoch": 0.9727290633337511, + "grad_norm": 1.620065689086914, + "learning_rate": 3.8931343174344324e-07, + "loss": 1.2351, + "step": 27162 + }, + { + "epoch": 0.9727648754633195, + "grad_norm": 1.7324904203414917, + "learning_rate": 3.882916034507855e-07, + "loss": 1.212, + "step": 27163 + }, + { + "epoch": 0.9728006875928877, + "grad_norm": 1.5588440895080566, + "learning_rate": 3.8727111530516556e-07, + "loss": 1.4532, + "step": 27164 + }, + { + "epoch": 0.972836499722456, + "grad_norm": 1.5138558149337769, + "learning_rate": 3.86251967320328e-07, + "loss": 1.4652, + "step": 27165 + }, + { + "epoch": 0.9728723118520243, + "grad_norm": 1.6585930585861206, + "learning_rate": 3.85234159509984e-07, + "loss": 1.5071, + "step": 27166 + }, + { + "epoch": 0.9729081239815925, + "grad_norm": 1.572977900505066, + "learning_rate": 3.842176918878115e-07, + "loss": 1.5052, + "step": 27167 + }, + { + "epoch": 0.9729439361111608, + "grad_norm": 1.2999725341796875, + "learning_rate": 3.832025644674886e-07, + "loss": 1.2422, + "step": 27168 + }, + { + "epoch": 0.9729797482407291, + "grad_norm": 2.5449912548065186, + "learning_rate": 3.821887772626931e-07, + "loss": 1.3325, + "step": 27169 + }, + { + "epoch": 0.9730155603702975, + "grad_norm": 1.7839765548706055, + "learning_rate": 3.8117633028704745e-07, + "loss": 1.0184, + "step": 27170 + }, + { + "epoch": 0.9730513724998657, + "grad_norm": 1.3288191556930542, + "learning_rate": 3.801652235541631e-07, + "loss": 1.4608, + "step": 27171 + }, + { + "epoch": 0.973087184629434, + "grad_norm": 1.5566009283065796, + "learning_rate": 3.791554570776734e-07, + "loss": 1.4342, + "step": 27172 + }, + { + "epoch": 0.9731229967590023, + "grad_norm": 1.7924531698226929, + "learning_rate": 3.781470308711343e-07, + "loss": 1.2537, + "step": 27173 + }, + { + "epoch": 0.9731588088885705, + "grad_norm": 1.666527271270752, + "learning_rate": 3.771399449481239e-07, + "loss": 1.0901, + "step": 27174 + }, + { + "epoch": 0.9731946210181388, + "grad_norm": 1.79993736743927, + "learning_rate": 3.761341993221867e-07, + "loss": 1.2884, + "step": 27175 + }, + { + "epoch": 0.9732304331477071, + "grad_norm": 1.9205312728881836, + "learning_rate": 3.7512979400686763e-07, + "loss": 1.389, + "step": 27176 + }, + { + "epoch": 0.9732662452772755, + "grad_norm": 1.901877999305725, + "learning_rate": 3.741267290156669e-07, + "loss": 1.6964, + "step": 27177 + }, + { + "epoch": 0.9733020574068437, + "grad_norm": 1.5668498277664185, + "learning_rate": 3.7312500436208487e-07, + "loss": 1.1559, + "step": 27178 + }, + { + "epoch": 0.973337869536412, + "grad_norm": 1.4472907781600952, + "learning_rate": 3.7212462005959957e-07, + "loss": 1.306, + "step": 27179 + }, + { + "epoch": 0.9733736816659803, + "grad_norm": 1.4478232860565186, + "learning_rate": 3.7112557612165586e-07, + "loss": 1.3558, + "step": 27180 + }, + { + "epoch": 0.9734094937955485, + "grad_norm": 1.8058240413665771, + "learning_rate": 3.7012787256172075e-07, + "loss": 1.0638, + "step": 27181 + }, + { + "epoch": 0.9734453059251168, + "grad_norm": 1.9508631229400635, + "learning_rate": 3.6913150939318353e-07, + "loss": 1.5124, + "step": 27182 + }, + { + "epoch": 0.9734811180546851, + "grad_norm": 1.470751404762268, + "learning_rate": 3.6813648662947785e-07, + "loss": 1.427, + "step": 27183 + }, + { + "epoch": 0.9735169301842534, + "grad_norm": 1.1770071983337402, + "learning_rate": 3.67142804283982e-07, + "loss": 1.2635, + "step": 27184 + }, + { + "epoch": 0.9735527423138217, + "grad_norm": 1.5016038417816162, + "learning_rate": 3.661504623700629e-07, + "loss": 1.2942, + "step": 27185 + }, + { + "epoch": 0.97358855444339, + "grad_norm": 1.4237912893295288, + "learning_rate": 3.6515946090106557e-07, + "loss": 1.7626, + "step": 27186 + }, + { + "epoch": 0.9736243665729583, + "grad_norm": 1.517599105834961, + "learning_rate": 3.641697998903237e-07, + "loss": 1.4293, + "step": 27187 + }, + { + "epoch": 0.9736601787025265, + "grad_norm": 1.4845151901245117, + "learning_rate": 3.631814793511712e-07, + "loss": 1.1693, + "step": 27188 + }, + { + "epoch": 0.9736959908320948, + "grad_norm": 1.8167911767959595, + "learning_rate": 3.621944992968751e-07, + "loss": 1.401, + "step": 27189 + }, + { + "epoch": 0.9737318029616631, + "grad_norm": 1.7373158931732178, + "learning_rate": 3.612088597407359e-07, + "loss": 1.4933, + "step": 27190 + }, + { + "epoch": 0.9737676150912314, + "grad_norm": 1.7631866931915283, + "learning_rate": 3.6022456069600973e-07, + "loss": 1.3376, + "step": 27191 + }, + { + "epoch": 0.9738034272207997, + "grad_norm": 1.3551874160766602, + "learning_rate": 3.592416021759304e-07, + "loss": 1.0676, + "step": 27192 + }, + { + "epoch": 0.973839239350368, + "grad_norm": 1.2745553255081177, + "learning_rate": 3.5825998419372065e-07, + "loss": 1.3036, + "step": 27193 + }, + { + "epoch": 0.9738750514799362, + "grad_norm": 1.5684278011322021, + "learning_rate": 3.5727970676260327e-07, + "loss": 1.1635, + "step": 27194 + }, + { + "epoch": 0.9739108636095045, + "grad_norm": 1.5761404037475586, + "learning_rate": 3.563007698957566e-07, + "loss": 1.2785, + "step": 27195 + }, + { + "epoch": 0.9739466757390728, + "grad_norm": 1.679574728012085, + "learning_rate": 3.553231736063589e-07, + "loss": 1.723, + "step": 27196 + }, + { + "epoch": 0.9739824878686411, + "grad_norm": 1.4595870971679688, + "learning_rate": 3.5434691790754427e-07, + "loss": 1.6485, + "step": 27197 + }, + { + "epoch": 0.9740182999982094, + "grad_norm": 1.6029869318008423, + "learning_rate": 3.5337200281245765e-07, + "loss": 1.4157, + "step": 27198 + }, + { + "epoch": 0.9740541121277777, + "grad_norm": 1.6887636184692383, + "learning_rate": 3.52398428334233e-07, + "loss": 1.6353, + "step": 27199 + }, + { + "epoch": 0.974089924257346, + "grad_norm": 1.629212737083435, + "learning_rate": 3.514261944859376e-07, + "loss": 1.4481, + "step": 27200 + }, + { + "epoch": 0.9741257363869142, + "grad_norm": 1.597827672958374, + "learning_rate": 3.5045530128066106e-07, + "loss": 1.5889, + "step": 27201 + }, + { + "epoch": 0.9741615485164825, + "grad_norm": 1.69936203956604, + "learning_rate": 3.4948574873148174e-07, + "loss": 1.5639, + "step": 27202 + }, + { + "epoch": 0.9741973606460508, + "grad_norm": 2.234874963760376, + "learning_rate": 3.4851753685142265e-07, + "loss": 1.637, + "step": 27203 + }, + { + "epoch": 0.974233172775619, + "grad_norm": 1.872274398803711, + "learning_rate": 3.4755066565351767e-07, + "loss": 1.4049, + "step": 27204 + }, + { + "epoch": 0.9742689849051874, + "grad_norm": 1.5312186479568481, + "learning_rate": 3.465851351507787e-07, + "loss": 1.2672, + "step": 27205 + }, + { + "epoch": 0.9743047970347557, + "grad_norm": 1.6697921752929688, + "learning_rate": 3.456209453561954e-07, + "loss": 1.6174, + "step": 27206 + }, + { + "epoch": 0.974340609164324, + "grad_norm": 1.3625354766845703, + "learning_rate": 3.4465809628273504e-07, + "loss": 0.9797, + "step": 27207 + }, + { + "epoch": 0.9743764212938922, + "grad_norm": 1.308997631072998, + "learning_rate": 3.4369658794335403e-07, + "loss": 1.3414, + "step": 27208 + }, + { + "epoch": 0.9744122334234605, + "grad_norm": 1.4477964639663696, + "learning_rate": 3.4273642035099753e-07, + "loss": 1.2255, + "step": 27209 + }, + { + "epoch": 0.9744480455530288, + "grad_norm": 1.5210626125335693, + "learning_rate": 3.417775935185663e-07, + "loss": 1.4795, + "step": 27210 + }, + { + "epoch": 0.974483857682597, + "grad_norm": 1.6373701095581055, + "learning_rate": 3.408201074589612e-07, + "loss": 1.6343, + "step": 27211 + }, + { + "epoch": 0.9745196698121654, + "grad_norm": 1.727810263633728, + "learning_rate": 3.39863962185083e-07, + "loss": 1.565, + "step": 27212 + }, + { + "epoch": 0.9745554819417337, + "grad_norm": 1.3922851085662842, + "learning_rate": 3.3890915770977694e-07, + "loss": 1.2344, + "step": 27213 + }, + { + "epoch": 0.974591294071302, + "grad_norm": 1.5838536024093628, + "learning_rate": 3.379556940458883e-07, + "loss": 1.3403, + "step": 27214 + }, + { + "epoch": 0.9746271062008702, + "grad_norm": 1.6230381727218628, + "learning_rate": 3.3700357120626247e-07, + "loss": 1.4572, + "step": 27215 + }, + { + "epoch": 0.9746629183304385, + "grad_norm": 1.2696282863616943, + "learning_rate": 3.360527892036891e-07, + "loss": 1.6312, + "step": 27216 + }, + { + "epoch": 0.9746987304600068, + "grad_norm": 1.652672529220581, + "learning_rate": 3.351033480509691e-07, + "loss": 1.4796, + "step": 27217 + }, + { + "epoch": 0.974734542589575, + "grad_norm": 1.8868895769119263, + "learning_rate": 3.3415524776088116e-07, + "loss": 1.3838, + "step": 27218 + }, + { + "epoch": 0.9747703547191434, + "grad_norm": 1.9636809825897217, + "learning_rate": 3.332084883461706e-07, + "loss": 1.42, + "step": 27219 + }, + { + "epoch": 0.9748061668487117, + "grad_norm": 1.3785717487335205, + "learning_rate": 3.3226306981957166e-07, + "loss": 1.6404, + "step": 27220 + }, + { + "epoch": 0.97484197897828, + "grad_norm": 1.4908267259597778, + "learning_rate": 3.313189921938187e-07, + "loss": 1.2639, + "step": 27221 + }, + { + "epoch": 0.9748777911078482, + "grad_norm": 2.0432732105255127, + "learning_rate": 3.3037625548160143e-07, + "loss": 1.8091, + "step": 27222 + }, + { + "epoch": 0.9749136032374165, + "grad_norm": 2.1694438457489014, + "learning_rate": 3.294348596956098e-07, + "loss": 1.3754, + "step": 27223 + }, + { + "epoch": 0.9749494153669848, + "grad_norm": 1.6928741931915283, + "learning_rate": 3.2849480484851145e-07, + "loss": 1.213, + "step": 27224 + }, + { + "epoch": 0.974985227496553, + "grad_norm": 1.616788625717163, + "learning_rate": 3.275560909529407e-07, + "loss": 1.5529, + "step": 27225 + }, + { + "epoch": 0.9750210396261214, + "grad_norm": 1.5182127952575684, + "learning_rate": 3.2661871802154296e-07, + "loss": 1.4226, + "step": 27226 + }, + { + "epoch": 0.9750568517556897, + "grad_norm": 1.6872198581695557, + "learning_rate": 3.256826860669193e-07, + "loss": 1.3533, + "step": 27227 + }, + { + "epoch": 0.975092663885258, + "grad_norm": 2.066617012023926, + "learning_rate": 3.2474799510165965e-07, + "loss": 1.5938, + "step": 27228 + }, + { + "epoch": 0.9751284760148262, + "grad_norm": 1.7410343885421753, + "learning_rate": 3.238146451383428e-07, + "loss": 1.4438, + "step": 27229 + }, + { + "epoch": 0.9751642881443945, + "grad_norm": 1.7189652919769287, + "learning_rate": 3.228826361895254e-07, + "loss": 1.6799, + "step": 27230 + }, + { + "epoch": 0.9752001002739628, + "grad_norm": 1.2681914567947388, + "learning_rate": 3.21951968267753e-07, + "loss": 1.3594, + "step": 27231 + }, + { + "epoch": 0.975235912403531, + "grad_norm": 1.4830594062805176, + "learning_rate": 3.2102264138553774e-07, + "loss": 1.3071, + "step": 27232 + }, + { + "epoch": 0.9752717245330994, + "grad_norm": 1.574739694595337, + "learning_rate": 3.200946555553919e-07, + "loss": 1.2429, + "step": 27233 + }, + { + "epoch": 0.9753075366626677, + "grad_norm": 2.1808180809020996, + "learning_rate": 3.191680107897943e-07, + "loss": 1.2584, + "step": 27234 + }, + { + "epoch": 0.9753433487922359, + "grad_norm": 1.646176815032959, + "learning_rate": 3.1824270710121286e-07, + "loss": 1.3929, + "step": 27235 + }, + { + "epoch": 0.9753791609218042, + "grad_norm": 1.216214656829834, + "learning_rate": 3.173187445020931e-07, + "loss": 1.1705, + "step": 27236 + }, + { + "epoch": 0.9754149730513725, + "grad_norm": 1.5510247945785522, + "learning_rate": 3.1639612300485844e-07, + "loss": 1.4461, + "step": 27237 + }, + { + "epoch": 0.9754507851809407, + "grad_norm": 1.6966277360916138, + "learning_rate": 3.1547484262194336e-07, + "loss": 1.5542, + "step": 27238 + }, + { + "epoch": 0.975486597310509, + "grad_norm": 2.0955851078033447, + "learning_rate": 3.1455490336572693e-07, + "loss": 1.1453, + "step": 27239 + }, + { + "epoch": 0.9755224094400774, + "grad_norm": 1.7406843900680542, + "learning_rate": 3.13636305248588e-07, + "loss": 1.4391, + "step": 27240 + }, + { + "epoch": 0.9755582215696457, + "grad_norm": 1.6218078136444092, + "learning_rate": 3.1271904828288343e-07, + "loss": 1.4322, + "step": 27241 + }, + { + "epoch": 0.9755940336992139, + "grad_norm": 1.6305664777755737, + "learning_rate": 3.1180313248097004e-07, + "loss": 1.2418, + "step": 27242 + }, + { + "epoch": 0.9756298458287822, + "grad_norm": 1.9506369829177856, + "learning_rate": 3.10888557855149e-07, + "loss": 1.5543, + "step": 27243 + }, + { + "epoch": 0.9756656579583505, + "grad_norm": 1.932220697402954, + "learning_rate": 3.099753244177217e-07, + "loss": 1.4439, + "step": 27244 + }, + { + "epoch": 0.9757014700879187, + "grad_norm": 1.4722483158111572, + "learning_rate": 3.090634321810004e-07, + "loss": 1.3055, + "step": 27245 + }, + { + "epoch": 0.975737282217487, + "grad_norm": 2.48748779296875, + "learning_rate": 3.0815288115723095e-07, + "loss": 1.7239, + "step": 27246 + }, + { + "epoch": 0.9757730943470554, + "grad_norm": 1.9032803773880005, + "learning_rate": 3.0724367135868126e-07, + "loss": 1.4195, + "step": 27247 + }, + { + "epoch": 0.9758089064766237, + "grad_norm": 2.151289939880371, + "learning_rate": 3.063358027975638e-07, + "loss": 1.3273, + "step": 27248 + }, + { + "epoch": 0.9758447186061919, + "grad_norm": 1.8209279775619507, + "learning_rate": 3.054292754861021e-07, + "loss": 1.611, + "step": 27249 + }, + { + "epoch": 0.9758805307357602, + "grad_norm": 1.8769649267196655, + "learning_rate": 3.0452408943649756e-07, + "loss": 1.5656, + "step": 27250 + }, + { + "epoch": 0.9759163428653285, + "grad_norm": 1.3402371406555176, + "learning_rate": 3.0362024466092933e-07, + "loss": 1.4473, + "step": 27251 + }, + { + "epoch": 0.9759521549948967, + "grad_norm": 1.6566824913024902, + "learning_rate": 3.0271774117153207e-07, + "loss": 1.3018, + "step": 27252 + }, + { + "epoch": 0.975987967124465, + "grad_norm": 1.9619542360305786, + "learning_rate": 3.01816578980485e-07, + "loss": 1.405, + "step": 27253 + }, + { + "epoch": 0.9760237792540334, + "grad_norm": 1.8266921043395996, + "learning_rate": 3.009167580998895e-07, + "loss": 1.2748, + "step": 27254 + }, + { + "epoch": 0.9760595913836017, + "grad_norm": 2.3018181324005127, + "learning_rate": 3.0001827854184704e-07, + "loss": 1.6089, + "step": 27255 + }, + { + "epoch": 0.9760954035131699, + "grad_norm": 1.7781779766082764, + "learning_rate": 2.9912114031847015e-07, + "loss": 1.3481, + "step": 27256 + }, + { + "epoch": 0.9761312156427382, + "grad_norm": 1.4521387815475464, + "learning_rate": 2.9822534344180475e-07, + "loss": 1.3735, + "step": 27257 + }, + { + "epoch": 0.9761670277723065, + "grad_norm": 2.076186180114746, + "learning_rate": 2.9733088792391894e-07, + "loss": 1.0925, + "step": 27258 + }, + { + "epoch": 0.9762028399018747, + "grad_norm": 1.6280510425567627, + "learning_rate": 2.9643777377682535e-07, + "loss": 1.4911, + "step": 27259 + }, + { + "epoch": 0.976238652031443, + "grad_norm": 1.9344412088394165, + "learning_rate": 2.955460010125699e-07, + "loss": 1.3981, + "step": 27260 + }, + { + "epoch": 0.9762744641610114, + "grad_norm": 1.447930932044983, + "learning_rate": 2.946555696431208e-07, + "loss": 1.571, + "step": 27261 + }, + { + "epoch": 0.9763102762905796, + "grad_norm": 1.7467442750930786, + "learning_rate": 2.9376647968047954e-07, + "loss": 1.5898, + "step": 27262 + }, + { + "epoch": 0.9763460884201479, + "grad_norm": 1.490739107131958, + "learning_rate": 2.928787311365921e-07, + "loss": 1.4754, + "step": 27263 + }, + { + "epoch": 0.9763819005497162, + "grad_norm": 1.6217472553253174, + "learning_rate": 2.919923240234046e-07, + "loss": 1.4527, + "step": 27264 + }, + { + "epoch": 0.9764177126792845, + "grad_norm": 1.551711916923523, + "learning_rate": 2.911072583528518e-07, + "loss": 1.3942, + "step": 27265 + }, + { + "epoch": 0.9764535248088527, + "grad_norm": 1.4863595962524414, + "learning_rate": 2.9022353413683534e-07, + "loss": 1.371, + "step": 27266 + }, + { + "epoch": 0.976489336938421, + "grad_norm": 1.9599124193191528, + "learning_rate": 2.893411513872457e-07, + "loss": 1.4827, + "step": 27267 + }, + { + "epoch": 0.9765251490679894, + "grad_norm": 1.4526926279067993, + "learning_rate": 2.884601101159512e-07, + "loss": 1.252, + "step": 27268 + }, + { + "epoch": 0.9765609611975576, + "grad_norm": 1.6005815267562866, + "learning_rate": 2.87580410334809e-07, + "loss": 1.4749, + "step": 27269 + }, + { + "epoch": 0.9765967733271259, + "grad_norm": 1.9566835165023804, + "learning_rate": 2.8670205205565406e-07, + "loss": 1.6584, + "step": 27270 + }, + { + "epoch": 0.9766325854566942, + "grad_norm": 1.6158761978149414, + "learning_rate": 2.8582503529029916e-07, + "loss": 1.0847, + "step": 27271 + }, + { + "epoch": 0.9766683975862624, + "grad_norm": 1.6226449012756348, + "learning_rate": 2.84949360050546e-07, + "loss": 1.2105, + "step": 27272 + }, + { + "epoch": 0.9767042097158307, + "grad_norm": 1.3435142040252686, + "learning_rate": 2.8407502634817395e-07, + "loss": 1.3259, + "step": 27273 + }, + { + "epoch": 0.976740021845399, + "grad_norm": 1.224449872970581, + "learning_rate": 2.8320203419495153e-07, + "loss": 1.0536, + "step": 27274 + }, + { + "epoch": 0.9767758339749674, + "grad_norm": 1.4458800554275513, + "learning_rate": 2.8233038360262474e-07, + "loss": 1.2663, + "step": 27275 + }, + { + "epoch": 0.9768116461045356, + "grad_norm": 1.431313395500183, + "learning_rate": 2.814600745829177e-07, + "loss": 1.3087, + "step": 27276 + }, + { + "epoch": 0.9768474582341039, + "grad_norm": 1.2949119806289673, + "learning_rate": 2.805911071475209e-07, + "loss": 1.3999, + "step": 27277 + }, + { + "epoch": 0.9768832703636722, + "grad_norm": 1.5865280628204346, + "learning_rate": 2.797234813081584e-07, + "loss": 1.1732, + "step": 27278 + }, + { + "epoch": 0.9769190824932404, + "grad_norm": 1.7895091772079468, + "learning_rate": 2.788571970764764e-07, + "loss": 1.3668, + "step": 27279 + }, + { + "epoch": 0.9769548946228087, + "grad_norm": 1.5917913913726807, + "learning_rate": 2.7799225446414334e-07, + "loss": 1.3931, + "step": 27280 + }, + { + "epoch": 0.976990706752377, + "grad_norm": 1.47417414188385, + "learning_rate": 2.7712865348279436e-07, + "loss": 1.3361, + "step": 27281 + }, + { + "epoch": 0.9770265188819454, + "grad_norm": 1.7814379930496216, + "learning_rate": 2.762663941440424e-07, + "loss": 1.6861, + "step": 27282 + }, + { + "epoch": 0.9770623310115136, + "grad_norm": 1.5276187658309937, + "learning_rate": 2.7540547645950045e-07, + "loss": 1.3426, + "step": 27283 + }, + { + "epoch": 0.9770981431410819, + "grad_norm": 1.551092505455017, + "learning_rate": 2.745459004407369e-07, + "loss": 1.2497, + "step": 27284 + }, + { + "epoch": 0.9771339552706502, + "grad_norm": 1.6632046699523926, + "learning_rate": 2.736876660993204e-07, + "loss": 1.4969, + "step": 27285 + }, + { + "epoch": 0.9771697674002184, + "grad_norm": 1.940461277961731, + "learning_rate": 2.728307734467972e-07, + "loss": 1.3537, + "step": 27286 + }, + { + "epoch": 0.9772055795297867, + "grad_norm": 2.302395820617676, + "learning_rate": 2.719752224947025e-07, + "loss": 1.3458, + "step": 27287 + }, + { + "epoch": 0.977241391659355, + "grad_norm": 1.845481038093567, + "learning_rate": 2.711210132545383e-07, + "loss": 1.3436, + "step": 27288 + }, + { + "epoch": 0.9772772037889234, + "grad_norm": 1.4163119792938232, + "learning_rate": 2.7026814573779534e-07, + "loss": 1.3536, + "step": 27289 + }, + { + "epoch": 0.9773130159184916, + "grad_norm": 1.6288304328918457, + "learning_rate": 2.6941661995596446e-07, + "loss": 1.4439, + "step": 27290 + }, + { + "epoch": 0.9773488280480599, + "grad_norm": 1.5382457971572876, + "learning_rate": 2.6856643592048093e-07, + "loss": 1.5094, + "step": 27291 + }, + { + "epoch": 0.9773846401776282, + "grad_norm": 1.2881473302841187, + "learning_rate": 2.6771759364279114e-07, + "loss": 1.5564, + "step": 27292 + }, + { + "epoch": 0.9774204523071964, + "grad_norm": 1.9076626300811768, + "learning_rate": 2.668700931343082e-07, + "loss": 1.5091, + "step": 27293 + }, + { + "epoch": 0.9774562644367647, + "grad_norm": 2.171661376953125, + "learning_rate": 2.6602393440645636e-07, + "loss": 1.4619, + "step": 27294 + }, + { + "epoch": 0.977492076566333, + "grad_norm": 1.776262879371643, + "learning_rate": 2.651791174706042e-07, + "loss": 1.5386, + "step": 27295 + }, + { + "epoch": 0.9775278886959013, + "grad_norm": 1.5984774827957153, + "learning_rate": 2.6433564233810936e-07, + "loss": 1.5978, + "step": 27296 + }, + { + "epoch": 0.9775637008254696, + "grad_norm": 1.9452379941940308, + "learning_rate": 2.6349350902032944e-07, + "loss": 1.668, + "step": 27297 + }, + { + "epoch": 0.9775995129550379, + "grad_norm": 1.268587589263916, + "learning_rate": 2.6265271752859975e-07, + "loss": 1.3742, + "step": 27298 + }, + { + "epoch": 0.9776353250846062, + "grad_norm": 1.4393203258514404, + "learning_rate": 2.618132678742224e-07, + "loss": 1.2159, + "step": 27299 + }, + { + "epoch": 0.9776711372141744, + "grad_norm": 1.389753818511963, + "learning_rate": 2.609751600684995e-07, + "loss": 1.1791, + "step": 27300 + }, + { + "epoch": 0.9777069493437427, + "grad_norm": 1.9547994136810303, + "learning_rate": 2.601383941226998e-07, + "loss": 1.4618, + "step": 27301 + }, + { + "epoch": 0.977742761473311, + "grad_norm": 1.6610933542251587, + "learning_rate": 2.593029700480698e-07, + "loss": 1.6401, + "step": 27302 + }, + { + "epoch": 0.9777785736028793, + "grad_norm": 2.492784261703491, + "learning_rate": 2.584688878558783e-07, + "loss": 1.4826, + "step": 27303 + }, + { + "epoch": 0.9778143857324476, + "grad_norm": 1.273895263671875, + "learning_rate": 2.576361475573275e-07, + "loss": 1.4765, + "step": 27304 + }, + { + "epoch": 0.9778501978620159, + "grad_norm": 1.622200846672058, + "learning_rate": 2.568047491636194e-07, + "loss": 1.1659, + "step": 27305 + }, + { + "epoch": 0.9778860099915841, + "grad_norm": 1.9652822017669678, + "learning_rate": 2.559746926859452e-07, + "loss": 1.4729, + "step": 27306 + }, + { + "epoch": 0.9779218221211524, + "grad_norm": 1.710400938987732, + "learning_rate": 2.551459781354737e-07, + "loss": 1.716, + "step": 27307 + }, + { + "epoch": 0.9779576342507207, + "grad_norm": 2.0867536067962646, + "learning_rate": 2.543186055233515e-07, + "loss": 1.2675, + "step": 27308 + }, + { + "epoch": 0.977993446380289, + "grad_norm": 1.5373716354370117, + "learning_rate": 2.534925748607031e-07, + "loss": 1.2282, + "step": 27309 + }, + { + "epoch": 0.9780292585098573, + "grad_norm": 1.6054054498672485, + "learning_rate": 2.526678861586529e-07, + "loss": 1.6797, + "step": 27310 + }, + { + "epoch": 0.9780650706394256, + "grad_norm": 1.2186475992202759, + "learning_rate": 2.51844539428292e-07, + "loss": 1.5985, + "step": 27311 + }, + { + "epoch": 0.9781008827689939, + "grad_norm": 1.5311107635498047, + "learning_rate": 2.5102253468070047e-07, + "loss": 1.2037, + "step": 27312 + }, + { + "epoch": 0.9781366948985621, + "grad_norm": 2.0063748359680176, + "learning_rate": 2.50201871926925e-07, + "loss": 1.2174, + "step": 27313 + }, + { + "epoch": 0.9781725070281304, + "grad_norm": 1.6210482120513916, + "learning_rate": 2.4938255117802347e-07, + "loss": 1.3572, + "step": 27314 + }, + { + "epoch": 0.9782083191576987, + "grad_norm": 1.8170969486236572, + "learning_rate": 2.485645724450092e-07, + "loss": 1.2536, + "step": 27315 + }, + { + "epoch": 0.978244131287267, + "grad_norm": 1.7604585886001587, + "learning_rate": 2.4774793573888453e-07, + "loss": 1.336, + "step": 27316 + }, + { + "epoch": 0.9782799434168353, + "grad_norm": 1.353798747062683, + "learning_rate": 2.4693264107064075e-07, + "loss": 1.3545, + "step": 27317 + }, + { + "epoch": 0.9783157555464036, + "grad_norm": 1.6163965463638306, + "learning_rate": 2.4611868845124673e-07, + "loss": 1.1518, + "step": 27318 + }, + { + "epoch": 0.9783515676759719, + "grad_norm": 1.461172342300415, + "learning_rate": 2.453060778916605e-07, + "loss": 1.5765, + "step": 27319 + }, + { + "epoch": 0.9783873798055401, + "grad_norm": 1.6993408203125, + "learning_rate": 2.4449480940279547e-07, + "loss": 1.5854, + "step": 27320 + }, + { + "epoch": 0.9784231919351084, + "grad_norm": 1.431376576423645, + "learning_rate": 2.436848829955762e-07, + "loss": 1.1842, + "step": 27321 + }, + { + "epoch": 0.9784590040646767, + "grad_norm": 2.1553871631622314, + "learning_rate": 2.428762986809052e-07, + "loss": 1.4902, + "step": 27322 + }, + { + "epoch": 0.9784948161942449, + "grad_norm": 1.702622652053833, + "learning_rate": 2.420690564696626e-07, + "loss": 1.5358, + "step": 27323 + }, + { + "epoch": 0.9785306283238133, + "grad_norm": 1.2670756578445435, + "learning_rate": 2.4126315637269523e-07, + "loss": 0.9199, + "step": 27324 + }, + { + "epoch": 0.9785664404533816, + "grad_norm": 1.6474324464797974, + "learning_rate": 2.4045859840085005e-07, + "loss": 1.3944, + "step": 27325 + }, + { + "epoch": 0.9786022525829499, + "grad_norm": 1.6244606971740723, + "learning_rate": 2.3965538256496276e-07, + "loss": 1.1658, + "step": 27326 + }, + { + "epoch": 0.9786380647125181, + "grad_norm": 1.3171576261520386, + "learning_rate": 2.388535088758248e-07, + "loss": 1.4406, + "step": 27327 + }, + { + "epoch": 0.9786738768420864, + "grad_norm": 1.325938105583191, + "learning_rate": 2.3805297734422748e-07, + "loss": 1.3195, + "step": 27328 + }, + { + "epoch": 0.9787096889716547, + "grad_norm": 1.6224009990692139, + "learning_rate": 2.3725378798095112e-07, + "loss": 1.7065, + "step": 27329 + }, + { + "epoch": 0.9787455011012229, + "grad_norm": 1.4370075464248657, + "learning_rate": 2.364559407967426e-07, + "loss": 1.4062, + "step": 27330 + }, + { + "epoch": 0.9787813132307913, + "grad_norm": 1.7838104963302612, + "learning_rate": 2.3565943580232676e-07, + "loss": 1.6156, + "step": 27331 + }, + { + "epoch": 0.9788171253603596, + "grad_norm": 1.8117674589157104, + "learning_rate": 2.3486427300841717e-07, + "loss": 1.8238, + "step": 27332 + }, + { + "epoch": 0.9788529374899279, + "grad_norm": 1.5283007621765137, + "learning_rate": 2.3407045242573867e-07, + "loss": 1.1369, + "step": 27333 + }, + { + "epoch": 0.9788887496194961, + "grad_norm": 1.715983510017395, + "learning_rate": 2.3327797406494934e-07, + "loss": 1.3713, + "step": 27334 + }, + { + "epoch": 0.9789245617490644, + "grad_norm": 1.5859873294830322, + "learning_rate": 2.3248683793670735e-07, + "loss": 1.4738, + "step": 27335 + }, + { + "epoch": 0.9789603738786327, + "grad_norm": 1.5380550622940063, + "learning_rate": 2.316970440516708e-07, + "loss": 1.3575, + "step": 27336 + }, + { + "epoch": 0.9789961860082009, + "grad_norm": 1.4396132230758667, + "learning_rate": 2.309085924204535e-07, + "loss": 1.3689, + "step": 27337 + }, + { + "epoch": 0.9790319981377693, + "grad_norm": 1.9657173156738281, + "learning_rate": 2.301214830536691e-07, + "loss": 1.7597, + "step": 27338 + }, + { + "epoch": 0.9790678102673376, + "grad_norm": 2.931874990463257, + "learning_rate": 2.2933571596190918e-07, + "loss": 1.9935, + "step": 27339 + }, + { + "epoch": 0.9791036223969058, + "grad_norm": 1.8927358388900757, + "learning_rate": 2.2855129115574304e-07, + "loss": 1.2429, + "step": 27340 + }, + { + "epoch": 0.9791394345264741, + "grad_norm": 1.5039610862731934, + "learning_rate": 2.277682086457289e-07, + "loss": 1.6117, + "step": 27341 + }, + { + "epoch": 0.9791752466560424, + "grad_norm": 1.5220850706100464, + "learning_rate": 2.269864684423917e-07, + "loss": 1.2156, + "step": 27342 + }, + { + "epoch": 0.9792110587856107, + "grad_norm": 1.7066388130187988, + "learning_rate": 2.262060705562452e-07, + "loss": 1.4429, + "step": 27343 + }, + { + "epoch": 0.9792468709151789, + "grad_norm": 1.3088314533233643, + "learning_rate": 2.2542701499780327e-07, + "loss": 1.3944, + "step": 27344 + }, + { + "epoch": 0.9792826830447473, + "grad_norm": 2.0127079486846924, + "learning_rate": 2.246493017775353e-07, + "loss": 1.0906, + "step": 27345 + }, + { + "epoch": 0.9793184951743156, + "grad_norm": 1.8624136447906494, + "learning_rate": 2.2387293090592177e-07, + "loss": 1.7316, + "step": 27346 + }, + { + "epoch": 0.9793543073038838, + "grad_norm": 2.0756258964538574, + "learning_rate": 2.230979023933877e-07, + "loss": 1.341, + "step": 27347 + }, + { + "epoch": 0.9793901194334521, + "grad_norm": 1.8070365190505981, + "learning_rate": 2.2232421625036914e-07, + "loss": 1.7985, + "step": 27348 + }, + { + "epoch": 0.9794259315630204, + "grad_norm": 1.252737283706665, + "learning_rate": 2.2155187248728004e-07, + "loss": 1.3545, + "step": 27349 + }, + { + "epoch": 0.9794617436925886, + "grad_norm": 1.4469175338745117, + "learning_rate": 2.2078087111450097e-07, + "loss": 1.5457, + "step": 27350 + }, + { + "epoch": 0.9794975558221569, + "grad_norm": 1.5265394449234009, + "learning_rate": 2.200112121424125e-07, + "loss": 1.4557, + "step": 27351 + }, + { + "epoch": 0.9795333679517253, + "grad_norm": 1.837325096130371, + "learning_rate": 2.192428955813619e-07, + "loss": 1.3282, + "step": 27352 + }, + { + "epoch": 0.9795691800812936, + "grad_norm": 2.001613140106201, + "learning_rate": 2.184759214416854e-07, + "loss": 1.3861, + "step": 27353 + }, + { + "epoch": 0.9796049922108618, + "grad_norm": 2.0825603008270264, + "learning_rate": 2.177102897337191e-07, + "loss": 1.5685, + "step": 27354 + }, + { + "epoch": 0.9796408043404301, + "grad_norm": 1.7808622121810913, + "learning_rate": 2.169460004677437e-07, + "loss": 1.4419, + "step": 27355 + }, + { + "epoch": 0.9796766164699984, + "grad_norm": 1.5627222061157227, + "learning_rate": 2.16183053654051e-07, + "loss": 1.3342, + "step": 27356 + }, + { + "epoch": 0.9797124285995666, + "grad_norm": 1.7511463165283203, + "learning_rate": 2.1542144930289943e-07, + "loss": 1.1166, + "step": 27357 + }, + { + "epoch": 0.9797482407291349, + "grad_norm": 1.368848204612732, + "learning_rate": 2.1466118742453634e-07, + "loss": 1.4296, + "step": 27358 + }, + { + "epoch": 0.9797840528587033, + "grad_norm": 1.8091198205947876, + "learning_rate": 2.139022680292091e-07, + "loss": 1.3872, + "step": 27359 + }, + { + "epoch": 0.9798198649882716, + "grad_norm": 1.5368421077728271, + "learning_rate": 2.1314469112709844e-07, + "loss": 1.1766, + "step": 27360 + }, + { + "epoch": 0.9798556771178398, + "grad_norm": 1.9813166856765747, + "learning_rate": 2.1238845672841845e-07, + "loss": 1.4277, + "step": 27361 + }, + { + "epoch": 0.9798914892474081, + "grad_norm": 2.2372472286224365, + "learning_rate": 2.1163356484332764e-07, + "loss": 1.2097, + "step": 27362 + }, + { + "epoch": 0.9799273013769764, + "grad_norm": 1.3528220653533936, + "learning_rate": 2.1088001548199565e-07, + "loss": 1.6458, + "step": 27363 + }, + { + "epoch": 0.9799631135065446, + "grad_norm": 1.6533170938491821, + "learning_rate": 2.1012780865454773e-07, + "loss": 1.089, + "step": 27364 + }, + { + "epoch": 0.9799989256361129, + "grad_norm": 1.8356963396072388, + "learning_rate": 2.093769443711091e-07, + "loss": 1.4844, + "step": 27365 + }, + { + "epoch": 0.9800347377656813, + "grad_norm": 1.6935428380966187, + "learning_rate": 2.086274226417939e-07, + "loss": 1.5054, + "step": 27366 + }, + { + "epoch": 0.9800705498952496, + "grad_norm": 1.528594732284546, + "learning_rate": 2.0787924347666076e-07, + "loss": 1.5082, + "step": 27367 + }, + { + "epoch": 0.9801063620248178, + "grad_norm": 1.4918920993804932, + "learning_rate": 2.071324068858016e-07, + "loss": 1.2325, + "step": 27368 + }, + { + "epoch": 0.9801421741543861, + "grad_norm": 1.6513475179672241, + "learning_rate": 2.0638691287925282e-07, + "loss": 1.7074, + "step": 27369 + }, + { + "epoch": 0.9801779862839544, + "grad_norm": 1.2635741233825684, + "learning_rate": 2.0564276146703977e-07, + "loss": 1.2669, + "step": 27370 + }, + { + "epoch": 0.9802137984135226, + "grad_norm": 2.1320393085479736, + "learning_rate": 2.048999526591766e-07, + "loss": 1.4007, + "step": 27371 + }, + { + "epoch": 0.9802496105430909, + "grad_norm": 1.2138698101043701, + "learning_rate": 2.041584864656554e-07, + "loss": 1.3064, + "step": 27372 + }, + { + "epoch": 0.9802854226726593, + "grad_norm": 1.551811695098877, + "learning_rate": 2.03418362896457e-07, + "loss": 1.4349, + "step": 27373 + }, + { + "epoch": 0.9803212348022275, + "grad_norm": 1.5845892429351807, + "learning_rate": 2.0267958196154013e-07, + "loss": 1.3266, + "step": 27374 + }, + { + "epoch": 0.9803570469317958, + "grad_norm": 1.8183881044387817, + "learning_rate": 2.019421436708413e-07, + "loss": 1.7351, + "step": 27375 + }, + { + "epoch": 0.9803928590613641, + "grad_norm": 1.50227952003479, + "learning_rate": 2.012060480342748e-07, + "loss": 1.6177, + "step": 27376 + }, + { + "epoch": 0.9804286711909324, + "grad_norm": 1.61687171459198, + "learning_rate": 2.0047129506175488e-07, + "loss": 1.2517, + "step": 27377 + }, + { + "epoch": 0.9804644833205006, + "grad_norm": 1.2359267473220825, + "learning_rate": 1.9973788476315147e-07, + "loss": 1.3682, + "step": 27378 + }, + { + "epoch": 0.9805002954500689, + "grad_norm": 1.4068397283554077, + "learning_rate": 1.9900581714835666e-07, + "loss": 1.4976, + "step": 27379 + }, + { + "epoch": 0.9805361075796373, + "grad_norm": 1.6050434112548828, + "learning_rate": 1.9827509222719587e-07, + "loss": 1.3544, + "step": 27380 + }, + { + "epoch": 0.9805719197092055, + "grad_norm": 1.5304006338119507, + "learning_rate": 1.9754571000950572e-07, + "loss": 1.4646, + "step": 27381 + }, + { + "epoch": 0.9806077318387738, + "grad_norm": 1.6975919008255005, + "learning_rate": 1.968176705051117e-07, + "loss": 1.4701, + "step": 27382 + }, + { + "epoch": 0.9806435439683421, + "grad_norm": 1.4890451431274414, + "learning_rate": 1.960909737237837e-07, + "loss": 0.9229, + "step": 27383 + }, + { + "epoch": 0.9806793560979103, + "grad_norm": 1.4120088815689087, + "learning_rate": 1.9536561967532507e-07, + "loss": 1.7422, + "step": 27384 + }, + { + "epoch": 0.9807151682274786, + "grad_norm": 1.6301904916763306, + "learning_rate": 1.9464160836948354e-07, + "loss": 1.2318, + "step": 27385 + }, + { + "epoch": 0.9807509803570469, + "grad_norm": 1.9923317432403564, + "learning_rate": 1.9391893981599575e-07, + "loss": 1.6143, + "step": 27386 + }, + { + "epoch": 0.9807867924866153, + "grad_norm": 1.7062000036239624, + "learning_rate": 1.9319761402458726e-07, + "loss": 1.5476, + "step": 27387 + }, + { + "epoch": 0.9808226046161835, + "grad_norm": 2.0269036293029785, + "learning_rate": 1.924776310049725e-07, + "loss": 1.3775, + "step": 27388 + }, + { + "epoch": 0.9808584167457518, + "grad_norm": 2.484912395477295, + "learning_rate": 1.9175899076682158e-07, + "loss": 1.5035, + "step": 27389 + }, + { + "epoch": 0.9808942288753201, + "grad_norm": 1.5835357904434204, + "learning_rate": 1.9104169331981558e-07, + "loss": 1.264, + "step": 27390 + }, + { + "epoch": 0.9809300410048883, + "grad_norm": 1.6880030632019043, + "learning_rate": 1.9032573867359126e-07, + "loss": 1.5247, + "step": 27391 + }, + { + "epoch": 0.9809658531344566, + "grad_norm": 1.7247785329818726, + "learning_rate": 1.8961112683778536e-07, + "loss": 1.5215, + "step": 27392 + }, + { + "epoch": 0.9810016652640249, + "grad_norm": 1.686508297920227, + "learning_rate": 1.8889785782202352e-07, + "loss": 1.4663, + "step": 27393 + }, + { + "epoch": 0.9810374773935933, + "grad_norm": 1.2703536748886108, + "learning_rate": 1.8818593163589805e-07, + "loss": 1.281, + "step": 27394 + }, + { + "epoch": 0.9810732895231615, + "grad_norm": 2.6047658920288086, + "learning_rate": 1.8747534828897905e-07, + "loss": 1.5477, + "step": 27395 + }, + { + "epoch": 0.9811091016527298, + "grad_norm": 1.490902304649353, + "learning_rate": 1.867661077908256e-07, + "loss": 1.2813, + "step": 27396 + }, + { + "epoch": 0.9811449137822981, + "grad_norm": 1.706141710281372, + "learning_rate": 1.8605821015098556e-07, + "loss": 1.3684, + "step": 27397 + }, + { + "epoch": 0.9811807259118663, + "grad_norm": 3.6071617603302, + "learning_rate": 1.853516553789847e-07, + "loss": 1.553, + "step": 27398 + }, + { + "epoch": 0.9812165380414346, + "grad_norm": 1.1728358268737793, + "learning_rate": 1.8464644348432647e-07, + "loss": 1.2748, + "step": 27399 + }, + { + "epoch": 0.9812523501710029, + "grad_norm": 1.6108249425888062, + "learning_rate": 1.8394257447650332e-07, + "loss": 1.4349, + "step": 27400 + }, + { + "epoch": 0.9812881623005713, + "grad_norm": 1.6908518075942993, + "learning_rate": 1.832400483649632e-07, + "loss": 1.6162, + "step": 27401 + }, + { + "epoch": 0.9813239744301395, + "grad_norm": 1.55814528465271, + "learning_rate": 1.825388651591875e-07, + "loss": 1.6483, + "step": 27402 + }, + { + "epoch": 0.9813597865597078, + "grad_norm": 1.2762830257415771, + "learning_rate": 1.8183902486859083e-07, + "loss": 1.5125, + "step": 27403 + }, + { + "epoch": 0.9813955986892761, + "grad_norm": 1.6804869174957275, + "learning_rate": 1.8114052750259902e-07, + "loss": 1.4573, + "step": 27404 + }, + { + "epoch": 0.9814314108188443, + "grad_norm": 1.4806815385818481, + "learning_rate": 1.8044337307059345e-07, + "loss": 1.2894, + "step": 27405 + }, + { + "epoch": 0.9814672229484126, + "grad_norm": 1.408532738685608, + "learning_rate": 1.7974756158196658e-07, + "loss": 1.4177, + "step": 27406 + }, + { + "epoch": 0.9815030350779809, + "grad_norm": 1.4657418727874756, + "learning_rate": 1.7905309304608876e-07, + "loss": 1.3174, + "step": 27407 + }, + { + "epoch": 0.9815388472075492, + "grad_norm": 1.338862657546997, + "learning_rate": 1.7835996747228578e-07, + "loss": 1.3876, + "step": 27408 + }, + { + "epoch": 0.9815746593371175, + "grad_norm": 1.70529305934906, + "learning_rate": 1.7766818486988357e-07, + "loss": 1.5092, + "step": 27409 + }, + { + "epoch": 0.9816104714666858, + "grad_norm": 1.8063136339187622, + "learning_rate": 1.769777452481969e-07, + "loss": 1.3785, + "step": 27410 + }, + { + "epoch": 0.981646283596254, + "grad_norm": 1.7511066198349, + "learning_rate": 1.7628864861651827e-07, + "loss": 1.334, + "step": 27411 + }, + { + "epoch": 0.9816820957258223, + "grad_norm": 2.083026647567749, + "learning_rate": 1.7560089498410704e-07, + "loss": 1.3795, + "step": 27412 + }, + { + "epoch": 0.9817179078553906, + "grad_norm": 1.639681339263916, + "learning_rate": 1.749144843602224e-07, + "loss": 1.3635, + "step": 27413 + }, + { + "epoch": 0.9817537199849589, + "grad_norm": 1.5684409141540527, + "learning_rate": 1.7422941675410143e-07, + "loss": 1.3592, + "step": 27414 + }, + { + "epoch": 0.9817895321145272, + "grad_norm": 2.3734753131866455, + "learning_rate": 1.7354569217494788e-07, + "loss": 1.4517, + "step": 27415 + }, + { + "epoch": 0.9818253442440955, + "grad_norm": 1.4742026329040527, + "learning_rate": 1.728633106319766e-07, + "loss": 1.4555, + "step": 27416 + }, + { + "epoch": 0.9818611563736638, + "grad_norm": 1.7053145170211792, + "learning_rate": 1.721822721343691e-07, + "loss": 1.4405, + "step": 27417 + }, + { + "epoch": 0.981896968503232, + "grad_norm": 1.586531400680542, + "learning_rate": 1.7150257669127367e-07, + "loss": 1.1004, + "step": 27418 + }, + { + "epoch": 0.9819327806328003, + "grad_norm": 1.7251858711242676, + "learning_rate": 1.7082422431183853e-07, + "loss": 1.5102, + "step": 27419 + }, + { + "epoch": 0.9819685927623686, + "grad_norm": 1.6602956056594849, + "learning_rate": 1.701472150051897e-07, + "loss": 1.6386, + "step": 27420 + }, + { + "epoch": 0.9820044048919369, + "grad_norm": 1.4878696203231812, + "learning_rate": 1.6947154878045324e-07, + "loss": 1.6375, + "step": 27421 + }, + { + "epoch": 0.9820402170215052, + "grad_norm": 2.1307990550994873, + "learning_rate": 1.6879722564669964e-07, + "loss": 1.5758, + "step": 27422 + }, + { + "epoch": 0.9820760291510735, + "grad_norm": 1.5953096151351929, + "learning_rate": 1.6812424561299943e-07, + "loss": 1.3952, + "step": 27423 + }, + { + "epoch": 0.9821118412806418, + "grad_norm": 1.6359837055206299, + "learning_rate": 1.6745260868841207e-07, + "loss": 1.5067, + "step": 27424 + }, + { + "epoch": 0.98214765341021, + "grad_norm": 1.5047430992126465, + "learning_rate": 1.667823148819858e-07, + "loss": 1.2965, + "step": 27425 + }, + { + "epoch": 0.9821834655397783, + "grad_norm": 1.7252825498580933, + "learning_rate": 1.661133642027246e-07, + "loss": 1.1945, + "step": 27426 + }, + { + "epoch": 0.9822192776693466, + "grad_norm": 1.6850330829620361, + "learning_rate": 1.6544575665963236e-07, + "loss": 1.2893, + "step": 27427 + }, + { + "epoch": 0.9822550897989148, + "grad_norm": 1.569167971611023, + "learning_rate": 1.6477949226167967e-07, + "loss": 1.4836, + "step": 27428 + }, + { + "epoch": 0.9822909019284832, + "grad_norm": 1.9602831602096558, + "learning_rate": 1.6411457101784822e-07, + "loss": 1.337, + "step": 27429 + }, + { + "epoch": 0.9823267140580515, + "grad_norm": 1.4004954099655151, + "learning_rate": 1.6345099293708644e-07, + "loss": 1.324, + "step": 27430 + }, + { + "epoch": 0.9823625261876198, + "grad_norm": 3.534090042114258, + "learning_rate": 1.627887580282983e-07, + "loss": 1.4684, + "step": 27431 + }, + { + "epoch": 0.982398338317188, + "grad_norm": 1.5962355136871338, + "learning_rate": 1.6212786630041e-07, + "loss": 1.2013, + "step": 27432 + }, + { + "epoch": 0.9824341504467563, + "grad_norm": 1.6603145599365234, + "learning_rate": 1.6146831776231442e-07, + "loss": 1.3004, + "step": 27433 + }, + { + "epoch": 0.9824699625763246, + "grad_norm": 1.6587244272232056, + "learning_rate": 1.6081011242287115e-07, + "loss": 1.5779, + "step": 27434 + }, + { + "epoch": 0.9825057747058928, + "grad_norm": 1.366642951965332, + "learning_rate": 1.6015325029095084e-07, + "loss": 1.3437, + "step": 27435 + }, + { + "epoch": 0.9825415868354612, + "grad_norm": 1.6663930416107178, + "learning_rate": 1.5949773137537982e-07, + "loss": 1.0643, + "step": 27436 + }, + { + "epoch": 0.9825773989650295, + "grad_norm": 1.70339834690094, + "learning_rate": 1.588435556849843e-07, + "loss": 1.2619, + "step": 27437 + }, + { + "epoch": 0.9826132110945978, + "grad_norm": 1.9648959636688232, + "learning_rate": 1.5819072322856842e-07, + "loss": 1.2753, + "step": 27438 + }, + { + "epoch": 0.982649023224166, + "grad_norm": 1.460438847541809, + "learning_rate": 1.575392340149029e-07, + "loss": 1.3353, + "step": 27439 + }, + { + "epoch": 0.9826848353537343, + "grad_norm": 1.61161208152771, + "learning_rate": 1.5688908805275848e-07, + "loss": 1.6739, + "step": 27440 + }, + { + "epoch": 0.9827206474833026, + "grad_norm": 1.4673449993133545, + "learning_rate": 1.5624028535088375e-07, + "loss": 1.4056, + "step": 27441 + }, + { + "epoch": 0.9827564596128708, + "grad_norm": 1.7691820859909058, + "learning_rate": 1.5559282591801617e-07, + "loss": 1.679, + "step": 27442 + }, + { + "epoch": 0.9827922717424392, + "grad_norm": 1.7717576026916504, + "learning_rate": 1.5494670976284875e-07, + "loss": 1.574, + "step": 27443 + }, + { + "epoch": 0.9828280838720075, + "grad_norm": 1.412382960319519, + "learning_rate": 1.543019368940857e-07, + "loss": 1.3202, + "step": 27444 + }, + { + "epoch": 0.9828638960015758, + "grad_norm": 1.745245099067688, + "learning_rate": 1.5365850732039778e-07, + "loss": 1.6919, + "step": 27445 + }, + { + "epoch": 0.982899708131144, + "grad_norm": 1.7831408977508545, + "learning_rate": 1.5301642105043368e-07, + "loss": 1.3777, + "step": 27446 + }, + { + "epoch": 0.9829355202607123, + "grad_norm": 1.3212556838989258, + "learning_rate": 1.5237567809285314e-07, + "loss": 1.1704, + "step": 27447 + }, + { + "epoch": 0.9829713323902806, + "grad_norm": 1.4446337223052979, + "learning_rate": 1.5173627845624927e-07, + "loss": 1.2402, + "step": 27448 + }, + { + "epoch": 0.9830071445198488, + "grad_norm": 1.6273493766784668, + "learning_rate": 1.510982221492485e-07, + "loss": 1.616, + "step": 27449 + }, + { + "epoch": 0.9830429566494172, + "grad_norm": 1.3335165977478027, + "learning_rate": 1.5046150918042178e-07, + "loss": 1.4248, + "step": 27450 + }, + { + "epoch": 0.9830787687789855, + "grad_norm": 1.4034610986709595, + "learning_rate": 1.4982613955834001e-07, + "loss": 1.2438, + "step": 27451 + }, + { + "epoch": 0.9831145809085537, + "grad_norm": 1.4101903438568115, + "learning_rate": 1.4919211329156302e-07, + "loss": 1.3619, + "step": 27452 + }, + { + "epoch": 0.983150393038122, + "grad_norm": 2.1397769451141357, + "learning_rate": 1.4855943038858399e-07, + "loss": 1.52, + "step": 27453 + }, + { + "epoch": 0.9831862051676903, + "grad_norm": 1.4390918016433716, + "learning_rate": 1.4792809085795166e-07, + "loss": 1.2698, + "step": 27454 + }, + { + "epoch": 0.9832220172972586, + "grad_norm": 1.4676564931869507, + "learning_rate": 1.4729809470814815e-07, + "loss": 1.3267, + "step": 27455 + }, + { + "epoch": 0.9832578294268268, + "grad_norm": 2.3807873725891113, + "learning_rate": 1.4666944194764443e-07, + "loss": 1.358, + "step": 27456 + }, + { + "epoch": 0.9832936415563952, + "grad_norm": 1.7312041521072388, + "learning_rate": 1.4604213258491152e-07, + "loss": 1.5644, + "step": 27457 + }, + { + "epoch": 0.9833294536859635, + "grad_norm": 1.8110259771347046, + "learning_rate": 1.4541616662836488e-07, + "loss": 1.3413, + "step": 27458 + }, + { + "epoch": 0.9833652658155317, + "grad_norm": 1.3456915616989136, + "learning_rate": 1.4479154408645335e-07, + "loss": 1.2207, + "step": 27459 + }, + { + "epoch": 0.9834010779451, + "grad_norm": 1.6277025938034058, + "learning_rate": 1.441682649675591e-07, + "loss": 1.4059, + "step": 27460 + }, + { + "epoch": 0.9834368900746683, + "grad_norm": 1.6478482484817505, + "learning_rate": 1.435463292800754e-07, + "loss": 1.4681, + "step": 27461 + }, + { + "epoch": 0.9834727022042365, + "grad_norm": 2.0339913368225098, + "learning_rate": 1.4292573703237333e-07, + "loss": 1.4551, + "step": 27462 + }, + { + "epoch": 0.9835085143338048, + "grad_norm": 2.6009304523468018, + "learning_rate": 1.423064882328018e-07, + "loss": 1.6065, + "step": 27463 + }, + { + "epoch": 0.9835443264633732, + "grad_norm": 2.1348941326141357, + "learning_rate": 1.4168858288968745e-07, + "loss": 1.4424, + "step": 27464 + }, + { + "epoch": 0.9835801385929415, + "grad_norm": 2.100189208984375, + "learning_rate": 1.4107202101134588e-07, + "loss": 1.8112, + "step": 27465 + }, + { + "epoch": 0.9836159507225097, + "grad_norm": 1.6087077856063843, + "learning_rate": 1.404568026060704e-07, + "loss": 1.2046, + "step": 27466 + }, + { + "epoch": 0.983651762852078, + "grad_norm": 1.3044302463531494, + "learning_rate": 1.3984292768213225e-07, + "loss": 1.7624, + "step": 27467 + }, + { + "epoch": 0.9836875749816463, + "grad_norm": 1.8005906343460083, + "learning_rate": 1.3923039624780255e-07, + "loss": 1.6117, + "step": 27468 + }, + { + "epoch": 0.9837233871112145, + "grad_norm": 1.8799329996109009, + "learning_rate": 1.3861920831131914e-07, + "loss": 1.4647, + "step": 27469 + }, + { + "epoch": 0.9837591992407828, + "grad_norm": 1.3361802101135254, + "learning_rate": 1.380093638808977e-07, + "loss": 1.2524, + "step": 27470 + }, + { + "epoch": 0.9837950113703512, + "grad_norm": 1.5945254564285278, + "learning_rate": 1.3740086296475385e-07, + "loss": 1.6172, + "step": 27471 + }, + { + "epoch": 0.9838308234999195, + "grad_norm": 1.4752191305160522, + "learning_rate": 1.3679370557106997e-07, + "loss": 1.24, + "step": 27472 + }, + { + "epoch": 0.9838666356294877, + "grad_norm": 1.4537187814712524, + "learning_rate": 1.3618789170800618e-07, + "loss": 1.3304, + "step": 27473 + }, + { + "epoch": 0.983902447759056, + "grad_norm": 1.3634134531021118, + "learning_rate": 1.355834213837226e-07, + "loss": 1.4036, + "step": 27474 + }, + { + "epoch": 0.9839382598886243, + "grad_norm": 1.8052500486373901, + "learning_rate": 1.349802946063461e-07, + "loss": 1.4053, + "step": 27475 + }, + { + "epoch": 0.9839740720181925, + "grad_norm": 1.618231177330017, + "learning_rate": 1.3437851138399237e-07, + "loss": 1.4643, + "step": 27476 + }, + { + "epoch": 0.9840098841477608, + "grad_norm": 1.6881462335586548, + "learning_rate": 1.33778071724755e-07, + "loss": 1.4012, + "step": 27477 + }, + { + "epoch": 0.9840456962773292, + "grad_norm": 1.7199472188949585, + "learning_rate": 1.3317897563671633e-07, + "loss": 1.4276, + "step": 27478 + }, + { + "epoch": 0.9840815084068975, + "grad_norm": 1.4663841724395752, + "learning_rate": 1.3258122312793663e-07, + "loss": 1.4161, + "step": 27479 + }, + { + "epoch": 0.9841173205364657, + "grad_norm": 1.3081108331680298, + "learning_rate": 1.3198481420646504e-07, + "loss": 0.9065, + "step": 27480 + }, + { + "epoch": 0.984153132666034, + "grad_norm": 1.7379041910171509, + "learning_rate": 1.313897488803062e-07, + "loss": 1.4373, + "step": 27481 + }, + { + "epoch": 0.9841889447956023, + "grad_norm": 1.5058616399765015, + "learning_rate": 1.3079602715748706e-07, + "loss": 1.3416, + "step": 27482 + }, + { + "epoch": 0.9842247569251705, + "grad_norm": 1.4991620779037476, + "learning_rate": 1.3020364904597903e-07, + "loss": 1.4064, + "step": 27483 + }, + { + "epoch": 0.9842605690547388, + "grad_norm": 2.0297024250030518, + "learning_rate": 1.296126145537646e-07, + "loss": 1.5304, + "step": 27484 + }, + { + "epoch": 0.9842963811843072, + "grad_norm": 1.7815202474594116, + "learning_rate": 1.2902292368878188e-07, + "loss": 1.3314, + "step": 27485 + }, + { + "epoch": 0.9843321933138754, + "grad_norm": 1.5582129955291748, + "learning_rate": 1.2843457645896895e-07, + "loss": 1.2736, + "step": 27486 + }, + { + "epoch": 0.9843680054434437, + "grad_norm": 1.263076901435852, + "learning_rate": 1.278475728722528e-07, + "loss": 1.4625, + "step": 27487 + }, + { + "epoch": 0.984403817573012, + "grad_norm": 2.047459602355957, + "learning_rate": 1.27261912936516e-07, + "loss": 1.7214, + "step": 27488 + }, + { + "epoch": 0.9844396297025803, + "grad_norm": 1.9874540567398071, + "learning_rate": 1.2667759665964118e-07, + "loss": 1.1864, + "step": 27489 + }, + { + "epoch": 0.9844754418321485, + "grad_norm": 1.2950546741485596, + "learning_rate": 1.260946240494998e-07, + "loss": 1.496, + "step": 27490 + }, + { + "epoch": 0.9845112539617168, + "grad_norm": 1.5544168949127197, + "learning_rate": 1.255129951139189e-07, + "loss": 1.14, + "step": 27491 + }, + { + "epoch": 0.9845470660912852, + "grad_norm": 1.3458187580108643, + "learning_rate": 1.249327098607367e-07, + "loss": 1.3636, + "step": 27492 + }, + { + "epoch": 0.9845828782208534, + "grad_norm": 1.8006720542907715, + "learning_rate": 1.2435376829775803e-07, + "loss": 1.6198, + "step": 27493 + }, + { + "epoch": 0.9846186903504217, + "grad_norm": 1.344710111618042, + "learning_rate": 1.2377617043276556e-07, + "loss": 1.2155, + "step": 27494 + }, + { + "epoch": 0.98465450247999, + "grad_norm": 1.3209199905395508, + "learning_rate": 1.231999162735309e-07, + "loss": 1.2898, + "step": 27495 + }, + { + "epoch": 0.9846903146095582, + "grad_norm": 1.2484784126281738, + "learning_rate": 1.2262500582781445e-07, + "loss": 1.2969, + "step": 27496 + }, + { + "epoch": 0.9847261267391265, + "grad_norm": 1.6258513927459717, + "learning_rate": 1.2205143910334338e-07, + "loss": 1.3481, + "step": 27497 + }, + { + "epoch": 0.9847619388686948, + "grad_norm": 2.1914570331573486, + "learning_rate": 1.2147921610783374e-07, + "loss": 1.7681, + "step": 27498 + }, + { + "epoch": 0.9847977509982632, + "grad_norm": 1.3298579454421997, + "learning_rate": 1.209083368490016e-07, + "loss": 0.9676, + "step": 27499 + }, + { + "epoch": 0.9848335631278314, + "grad_norm": 2.284987211227417, + "learning_rate": 1.2033880133449638e-07, + "loss": 1.3902, + "step": 27500 + }, + { + "epoch": 0.9848693752573997, + "grad_norm": 2.084764003753662, + "learning_rate": 1.1977060957200083e-07, + "loss": 1.6581, + "step": 27501 + }, + { + "epoch": 0.984905187386968, + "grad_norm": 1.331100344657898, + "learning_rate": 1.1920376156916435e-07, + "loss": 1.7012, + "step": 27502 + }, + { + "epoch": 0.9849409995165362, + "grad_norm": 1.537713885307312, + "learning_rate": 1.1863825733359201e-07, + "loss": 1.6487, + "step": 27503 + }, + { + "epoch": 0.9849768116461045, + "grad_norm": 1.940346360206604, + "learning_rate": 1.1807409687291104e-07, + "loss": 1.5181, + "step": 27504 + }, + { + "epoch": 0.9850126237756728, + "grad_norm": 1.2605899572372437, + "learning_rate": 1.1751128019470426e-07, + "loss": 1.3682, + "step": 27505 + }, + { + "epoch": 0.9850484359052412, + "grad_norm": 1.9707603454589844, + "learning_rate": 1.1694980730654337e-07, + "loss": 1.4205, + "step": 27506 + }, + { + "epoch": 0.9850842480348094, + "grad_norm": 2.1587588787078857, + "learning_rate": 1.163896782159779e-07, + "loss": 1.4563, + "step": 27507 + }, + { + "epoch": 0.9851200601643777, + "grad_norm": 2.022172451019287, + "learning_rate": 1.1583089293055738e-07, + "loss": 1.4855, + "step": 27508 + }, + { + "epoch": 0.985155872293946, + "grad_norm": 1.5751782655715942, + "learning_rate": 1.152734514577869e-07, + "loss": 1.4265, + "step": 27509 + }, + { + "epoch": 0.9851916844235142, + "grad_norm": 1.86588454246521, + "learning_rate": 1.1471735380517156e-07, + "loss": 1.5442, + "step": 27510 + }, + { + "epoch": 0.9852274965530825, + "grad_norm": 1.3863067626953125, + "learning_rate": 1.141625999801943e-07, + "loss": 1.5659, + "step": 27511 + }, + { + "epoch": 0.9852633086826508, + "grad_norm": 1.3101366758346558, + "learning_rate": 1.1360918999030467e-07, + "loss": 1.3462, + "step": 27512 + }, + { + "epoch": 0.9852991208122192, + "grad_norm": 2.0936365127563477, + "learning_rate": 1.1305712384297451e-07, + "loss": 1.4808, + "step": 27513 + }, + { + "epoch": 0.9853349329417874, + "grad_norm": 1.8865522146224976, + "learning_rate": 1.1250640154560898e-07, + "loss": 1.5132, + "step": 27514 + }, + { + "epoch": 0.9853707450713557, + "grad_norm": 1.5039201974868774, + "learning_rate": 1.1195702310561329e-07, + "loss": 1.5329, + "step": 27515 + }, + { + "epoch": 0.985406557200924, + "grad_norm": 1.6145402193069458, + "learning_rate": 1.1140898853040372e-07, + "loss": 1.1151, + "step": 27516 + }, + { + "epoch": 0.9854423693304922, + "grad_norm": 1.550848126411438, + "learning_rate": 1.1086229782734103e-07, + "loss": 1.1668, + "step": 27517 + }, + { + "epoch": 0.9854781814600605, + "grad_norm": 1.7086671590805054, + "learning_rate": 1.1031695100376382e-07, + "loss": 1.6588, + "step": 27518 + }, + { + "epoch": 0.9855139935896288, + "grad_norm": 1.6773920059204102, + "learning_rate": 1.0977294806703286e-07, + "loss": 1.6271, + "step": 27519 + }, + { + "epoch": 0.9855498057191971, + "grad_norm": 1.6794071197509766, + "learning_rate": 1.0923028902446453e-07, + "loss": 1.5406, + "step": 27520 + }, + { + "epoch": 0.9855856178487654, + "grad_norm": 1.3817123174667358, + "learning_rate": 1.0868897388334187e-07, + "loss": 1.7427, + "step": 27521 + }, + { + "epoch": 0.9856214299783337, + "grad_norm": 1.4821956157684326, + "learning_rate": 1.0814900265095907e-07, + "loss": 1.5651, + "step": 27522 + }, + { + "epoch": 0.985657242107902, + "grad_norm": 1.8491703271865845, + "learning_rate": 1.0761037533457696e-07, + "loss": 1.5515, + "step": 27523 + }, + { + "epoch": 0.9856930542374702, + "grad_norm": 2.7087044715881348, + "learning_rate": 1.0707309194145643e-07, + "loss": 1.3486, + "step": 27524 + }, + { + "epoch": 0.9857288663670385, + "grad_norm": 1.8126064538955688, + "learning_rate": 1.0653715247881391e-07, + "loss": 1.4137, + "step": 27525 + }, + { + "epoch": 0.9857646784966068, + "grad_norm": 2.0652072429656982, + "learning_rate": 1.0600255695385475e-07, + "loss": 1.2903, + "step": 27526 + }, + { + "epoch": 0.9858004906261751, + "grad_norm": 1.4882497787475586, + "learning_rate": 1.054693053737843e-07, + "loss": 1.6498, + "step": 27527 + }, + { + "epoch": 0.9858363027557434, + "grad_norm": 1.413586139678955, + "learning_rate": 1.049373977457635e-07, + "loss": 1.3082, + "step": 27528 + }, + { + "epoch": 0.9858721148853117, + "grad_norm": 2.0597503185272217, + "learning_rate": 1.0440683407695328e-07, + "loss": 1.4598, + "step": 27529 + }, + { + "epoch": 0.98590792701488, + "grad_norm": 1.3489118814468384, + "learning_rate": 1.0387761437449239e-07, + "loss": 1.4809, + "step": 27530 + }, + { + "epoch": 0.9859437391444482, + "grad_norm": 1.6404932737350464, + "learning_rate": 1.0334973864550845e-07, + "loss": 1.1483, + "step": 27531 + }, + { + "epoch": 0.9859795512740165, + "grad_norm": 1.574805736541748, + "learning_rate": 1.0282320689708469e-07, + "loss": 1.4865, + "step": 27532 + }, + { + "epoch": 0.9860153634035848, + "grad_norm": 1.9894275665283203, + "learning_rate": 1.0229801913632653e-07, + "loss": 1.5721, + "step": 27533 + }, + { + "epoch": 0.986051175533153, + "grad_norm": 1.813362717628479, + "learning_rate": 1.0177417537028389e-07, + "loss": 1.6005, + "step": 27534 + }, + { + "epoch": 0.9860869876627214, + "grad_norm": 1.4725996255874634, + "learning_rate": 1.0125167560601778e-07, + "loss": 1.4278, + "step": 27535 + }, + { + "epoch": 0.9861227997922897, + "grad_norm": 1.5005663633346558, + "learning_rate": 1.0073051985054482e-07, + "loss": 1.2329, + "step": 27536 + }, + { + "epoch": 0.9861586119218579, + "grad_norm": 1.8271814584732056, + "learning_rate": 1.0021070811088162e-07, + "loss": 1.3513, + "step": 27537 + }, + { + "epoch": 0.9861944240514262, + "grad_norm": 1.5297101736068726, + "learning_rate": 9.969224039403369e-08, + "loss": 1.5614, + "step": 27538 + }, + { + "epoch": 0.9862302361809945, + "grad_norm": 1.3022998571395874, + "learning_rate": 9.9175116706951e-08, + "loss": 1.0406, + "step": 27539 + }, + { + "epoch": 0.9862660483105627, + "grad_norm": 1.8046320676803589, + "learning_rate": 9.86593370566058e-08, + "loss": 1.3661, + "step": 27540 + }, + { + "epoch": 0.986301860440131, + "grad_norm": 1.635322093963623, + "learning_rate": 9.814490144993693e-08, + "loss": 1.5341, + "step": 27541 + }, + { + "epoch": 0.9863376725696994, + "grad_norm": 1.8201141357421875, + "learning_rate": 9.763180989386112e-08, + "loss": 1.3124, + "step": 27542 + }, + { + "epoch": 0.9863734846992677, + "grad_norm": 1.5187281370162964, + "learning_rate": 9.712006239529503e-08, + "loss": 1.2908, + "step": 27543 + }, + { + "epoch": 0.9864092968288359, + "grad_norm": 1.995166540145874, + "learning_rate": 9.660965896111095e-08, + "loss": 1.5077, + "step": 27544 + }, + { + "epoch": 0.9864451089584042, + "grad_norm": 1.9507112503051758, + "learning_rate": 9.610059959817008e-08, + "loss": 1.1559, + "step": 27545 + }, + { + "epoch": 0.9864809210879725, + "grad_norm": 1.3299881219863892, + "learning_rate": 9.559288431333357e-08, + "loss": 1.5751, + "step": 27546 + }, + { + "epoch": 0.9865167332175407, + "grad_norm": 1.6809051036834717, + "learning_rate": 9.50865131134182e-08, + "loss": 1.2238, + "step": 27547 + }, + { + "epoch": 0.986552545347109, + "grad_norm": 1.5759165287017822, + "learning_rate": 9.458148600525185e-08, + "loss": 1.2708, + "step": 27548 + }, + { + "epoch": 0.9865883574766774, + "grad_norm": 1.5264739990234375, + "learning_rate": 9.407780299562908e-08, + "loss": 1.8756, + "step": 27549 + }, + { + "epoch": 0.9866241696062457, + "grad_norm": 1.4205260276794434, + "learning_rate": 9.357546409132223e-08, + "loss": 1.5195, + "step": 27550 + }, + { + "epoch": 0.9866599817358139, + "grad_norm": 1.4577481746673584, + "learning_rate": 9.307446929908148e-08, + "loss": 1.8085, + "step": 27551 + }, + { + "epoch": 0.9866957938653822, + "grad_norm": 1.9873889684677124, + "learning_rate": 9.257481862564587e-08, + "loss": 1.3263, + "step": 27552 + }, + { + "epoch": 0.9867316059949505, + "grad_norm": 1.4352340698242188, + "learning_rate": 9.207651207775448e-08, + "loss": 1.5041, + "step": 27553 + }, + { + "epoch": 0.9867674181245187, + "grad_norm": 1.8972949981689453, + "learning_rate": 9.157954966210192e-08, + "loss": 1.5607, + "step": 27554 + }, + { + "epoch": 0.986803230254087, + "grad_norm": 2.068582057952881, + "learning_rate": 9.108393138536064e-08, + "loss": 1.3331, + "step": 27555 + }, + { + "epoch": 0.9868390423836554, + "grad_norm": 1.2161344289779663, + "learning_rate": 9.05896572542253e-08, + "loss": 1.1665, + "step": 27556 + }, + { + "epoch": 0.9868748545132237, + "grad_norm": 2.187922239303589, + "learning_rate": 9.009672727533503e-08, + "loss": 1.7574, + "step": 27557 + }, + { + "epoch": 0.9869106666427919, + "grad_norm": 1.6480915546417236, + "learning_rate": 8.960514145530674e-08, + "loss": 1.1512, + "step": 27558 + }, + { + "epoch": 0.9869464787723602, + "grad_norm": 1.5355759859085083, + "learning_rate": 8.911489980076848e-08, + "loss": 1.5087, + "step": 27559 + }, + { + "epoch": 0.9869822909019285, + "grad_norm": 2.0616066455841064, + "learning_rate": 8.862600231832607e-08, + "loss": 1.545, + "step": 27560 + }, + { + "epoch": 0.9870181030314967, + "grad_norm": 1.3449019193649292, + "learning_rate": 8.813844901452984e-08, + "loss": 1.6202, + "step": 27561 + }, + { + "epoch": 0.987053915161065, + "grad_norm": 1.6275995969772339, + "learning_rate": 8.765223989596338e-08, + "loss": 1.4442, + "step": 27562 + }, + { + "epoch": 0.9870897272906334, + "grad_norm": 1.3992468118667603, + "learning_rate": 8.716737496915483e-08, + "loss": 1.2691, + "step": 27563 + }, + { + "epoch": 0.9871255394202016, + "grad_norm": 1.3481507301330566, + "learning_rate": 8.66838542406212e-08, + "loss": 1.29, + "step": 27564 + }, + { + "epoch": 0.9871613515497699, + "grad_norm": 1.4527891874313354, + "learning_rate": 8.620167771689058e-08, + "loss": 1.2298, + "step": 27565 + }, + { + "epoch": 0.9871971636793382, + "grad_norm": 1.4806122779846191, + "learning_rate": 8.572084540443558e-08, + "loss": 1.4566, + "step": 27566 + }, + { + "epoch": 0.9872329758089065, + "grad_norm": 1.745713472366333, + "learning_rate": 8.524135730971772e-08, + "loss": 1.4321, + "step": 27567 + }, + { + "epoch": 0.9872687879384747, + "grad_norm": 1.633641242980957, + "learning_rate": 8.476321343920957e-08, + "loss": 1.3979, + "step": 27568 + }, + { + "epoch": 0.987304600068043, + "grad_norm": 1.8932297229766846, + "learning_rate": 8.428641379931713e-08, + "loss": 1.4096, + "step": 27569 + }, + { + "epoch": 0.9873404121976114, + "grad_norm": 1.2509734630584717, + "learning_rate": 8.381095839647967e-08, + "loss": 1.2951, + "step": 27570 + }, + { + "epoch": 0.9873762243271796, + "grad_norm": 1.5911204814910889, + "learning_rate": 8.333684723708102e-08, + "loss": 1.2692, + "step": 27571 + }, + { + "epoch": 0.9874120364567479, + "grad_norm": 1.4687579870224, + "learning_rate": 8.286408032749382e-08, + "loss": 1.694, + "step": 27572 + }, + { + "epoch": 0.9874478485863162, + "grad_norm": 1.4162245988845825, + "learning_rate": 8.239265767410187e-08, + "loss": 1.3073, + "step": 27573 + }, + { + "epoch": 0.9874836607158844, + "grad_norm": 1.4807811975479126, + "learning_rate": 8.192257928322233e-08, + "loss": 1.3811, + "step": 27574 + }, + { + "epoch": 0.9875194728454527, + "grad_norm": 2.3424911499023438, + "learning_rate": 8.145384516118349e-08, + "loss": 1.4198, + "step": 27575 + }, + { + "epoch": 0.987555284975021, + "grad_norm": 1.6639134883880615, + "learning_rate": 8.098645531431359e-08, + "loss": 1.3878, + "step": 27576 + }, + { + "epoch": 0.9875910971045894, + "grad_norm": 1.885317087173462, + "learning_rate": 8.052040974887432e-08, + "loss": 1.669, + "step": 27577 + }, + { + "epoch": 0.9876269092341576, + "grad_norm": 1.4320241212844849, + "learning_rate": 8.005570847113841e-08, + "loss": 1.1812, + "step": 27578 + }, + { + "epoch": 0.9876627213637259, + "grad_norm": 1.843773603439331, + "learning_rate": 7.959235148737865e-08, + "loss": 1.4705, + "step": 27579 + }, + { + "epoch": 0.9876985334932942, + "grad_norm": 1.7358323335647583, + "learning_rate": 7.913033880381226e-08, + "loss": 1.5676, + "step": 27580 + }, + { + "epoch": 0.9877343456228624, + "grad_norm": 1.5030070543289185, + "learning_rate": 7.866967042665651e-08, + "loss": 1.2817, + "step": 27581 + }, + { + "epoch": 0.9877701577524307, + "grad_norm": 2.4306750297546387, + "learning_rate": 7.821034636211755e-08, + "loss": 1.5123, + "step": 27582 + }, + { + "epoch": 0.987805969881999, + "grad_norm": 2.3857903480529785, + "learning_rate": 7.77523666163571e-08, + "loss": 1.4726, + "step": 27583 + }, + { + "epoch": 0.9878417820115674, + "grad_norm": 1.7818009853363037, + "learning_rate": 7.729573119555911e-08, + "loss": 1.5097, + "step": 27584 + }, + { + "epoch": 0.9878775941411356, + "grad_norm": 1.879576325416565, + "learning_rate": 7.684044010585201e-08, + "loss": 1.3873, + "step": 27585 + }, + { + "epoch": 0.9879134062707039, + "grad_norm": 1.4474228620529175, + "learning_rate": 7.638649335336423e-08, + "loss": 1.5264, + "step": 27586 + }, + { + "epoch": 0.9879492184002722, + "grad_norm": 1.6005494594573975, + "learning_rate": 7.593389094420201e-08, + "loss": 1.217, + "step": 27587 + }, + { + "epoch": 0.9879850305298404, + "grad_norm": 1.4687864780426025, + "learning_rate": 7.548263288446045e-08, + "loss": 1.5201, + "step": 27588 + }, + { + "epoch": 0.9880208426594087, + "grad_norm": 1.9801404476165771, + "learning_rate": 7.503271918020138e-08, + "loss": 1.2031, + "step": 27589 + }, + { + "epoch": 0.988056654788977, + "grad_norm": 2.481457233428955, + "learning_rate": 7.458414983748663e-08, + "loss": 1.7782, + "step": 27590 + }, + { + "epoch": 0.9880924669185454, + "grad_norm": 1.6557172536849976, + "learning_rate": 7.41369248623447e-08, + "loss": 1.2114, + "step": 27591 + }, + { + "epoch": 0.9881282790481136, + "grad_norm": 2.5019419193267822, + "learning_rate": 7.369104426080409e-08, + "loss": 1.3993, + "step": 27592 + }, + { + "epoch": 0.9881640911776819, + "grad_norm": 1.443387508392334, + "learning_rate": 7.324650803884891e-08, + "loss": 1.6107, + "step": 27593 + }, + { + "epoch": 0.9881999033072502, + "grad_norm": 1.4562711715698242, + "learning_rate": 7.280331620246328e-08, + "loss": 1.4164, + "step": 27594 + }, + { + "epoch": 0.9882357154368184, + "grad_norm": 1.7285951375961304, + "learning_rate": 7.236146875762017e-08, + "loss": 1.7147, + "step": 27595 + }, + { + "epoch": 0.9882715275663867, + "grad_norm": 1.6666792631149292, + "learning_rate": 7.19209657102482e-08, + "loss": 1.2269, + "step": 27596 + }, + { + "epoch": 0.988307339695955, + "grad_norm": 1.4787660837173462, + "learning_rate": 7.148180706628704e-08, + "loss": 1.088, + "step": 27597 + }, + { + "epoch": 0.9883431518255233, + "grad_norm": 1.591696858406067, + "learning_rate": 7.104399283163199e-08, + "loss": 1.429, + "step": 27598 + }, + { + "epoch": 0.9883789639550916, + "grad_norm": 1.709412932395935, + "learning_rate": 7.060752301218942e-08, + "loss": 1.2527, + "step": 27599 + }, + { + "epoch": 0.9884147760846599, + "grad_norm": 1.553296446800232, + "learning_rate": 7.017239761381022e-08, + "loss": 1.4105, + "step": 27600 + }, + { + "epoch": 0.9884505882142282, + "grad_norm": 1.3799681663513184, + "learning_rate": 6.973861664237857e-08, + "loss": 1.441, + "step": 27601 + }, + { + "epoch": 0.9884864003437964, + "grad_norm": 1.5649609565734863, + "learning_rate": 6.930618010370094e-08, + "loss": 1.4126, + "step": 27602 + }, + { + "epoch": 0.9885222124733647, + "grad_norm": 1.5194275379180908, + "learning_rate": 6.887508800361708e-08, + "loss": 1.1752, + "step": 27603 + }, + { + "epoch": 0.988558024602933, + "grad_norm": 1.4459376335144043, + "learning_rate": 6.844534034791128e-08, + "loss": 1.2249, + "step": 27604 + }, + { + "epoch": 0.9885938367325013, + "grad_norm": 1.5532654523849487, + "learning_rate": 6.801693714236779e-08, + "loss": 1.541, + "step": 27605 + }, + { + "epoch": 0.9886296488620696, + "grad_norm": 1.2949588298797607, + "learning_rate": 6.758987839275976e-08, + "loss": 1.4817, + "step": 27606 + }, + { + "epoch": 0.9886654609916379, + "grad_norm": 1.7721437215805054, + "learning_rate": 6.716416410481596e-08, + "loss": 1.6125, + "step": 27607 + }, + { + "epoch": 0.9887012731212061, + "grad_norm": 1.6902053356170654, + "learning_rate": 6.673979428428733e-08, + "loss": 1.4175, + "step": 27608 + }, + { + "epoch": 0.9887370852507744, + "grad_norm": 1.6286303997039795, + "learning_rate": 6.631676893685823e-08, + "loss": 1.6024, + "step": 27609 + }, + { + "epoch": 0.9887728973803427, + "grad_norm": 1.3310688734054565, + "learning_rate": 6.589508806823518e-08, + "loss": 1.2111, + "step": 27610 + }, + { + "epoch": 0.988808709509911, + "grad_norm": 2.158609628677368, + "learning_rate": 6.547475168409145e-08, + "loss": 1.1975, + "step": 27611 + }, + { + "epoch": 0.9888445216394793, + "grad_norm": 1.4116407632827759, + "learning_rate": 6.505575979007805e-08, + "loss": 1.2926, + "step": 27612 + }, + { + "epoch": 0.9888803337690476, + "grad_norm": 1.7177308797836304, + "learning_rate": 6.463811239183492e-08, + "loss": 1.272, + "step": 27613 + }, + { + "epoch": 0.9889161458986159, + "grad_norm": 1.2889899015426636, + "learning_rate": 6.42218094949798e-08, + "loss": 1.3474, + "step": 27614 + }, + { + "epoch": 0.9889519580281841, + "grad_norm": 2.355888605117798, + "learning_rate": 6.38068511051082e-08, + "loss": 1.6049, + "step": 27615 + }, + { + "epoch": 0.9889877701577524, + "grad_norm": 1.743425726890564, + "learning_rate": 6.339323722780455e-08, + "loss": 1.738, + "step": 27616 + }, + { + "epoch": 0.9890235822873207, + "grad_norm": 1.74456787109375, + "learning_rate": 6.298096786864216e-08, + "loss": 1.4269, + "step": 27617 + }, + { + "epoch": 0.989059394416889, + "grad_norm": 1.743873119354248, + "learning_rate": 6.257004303316106e-08, + "loss": 1.6638, + "step": 27618 + }, + { + "epoch": 0.9890952065464573, + "grad_norm": 2.2863898277282715, + "learning_rate": 6.216046272687904e-08, + "loss": 1.3663, + "step": 27619 + }, + { + "epoch": 0.9891310186760256, + "grad_norm": 2.197343111038208, + "learning_rate": 6.1752226955325e-08, + "loss": 1.6005, + "step": 27620 + }, + { + "epoch": 0.9891668308055939, + "grad_norm": 1.486518144607544, + "learning_rate": 6.134533572398349e-08, + "loss": 1.5046, + "step": 27621 + }, + { + "epoch": 0.9892026429351621, + "grad_norm": 1.5760600566864014, + "learning_rate": 6.093978903833897e-08, + "loss": 1.3962, + "step": 27622 + }, + { + "epoch": 0.9892384550647304, + "grad_norm": 1.749154806137085, + "learning_rate": 6.053558690382045e-08, + "loss": 1.4331, + "step": 27623 + }, + { + "epoch": 0.9892742671942987, + "grad_norm": 1.6556254625320435, + "learning_rate": 6.013272932590131e-08, + "loss": 1.5677, + "step": 27624 + }, + { + "epoch": 0.9893100793238669, + "grad_norm": 1.8295544385910034, + "learning_rate": 5.973121630996615e-08, + "loss": 1.1101, + "step": 27625 + }, + { + "epoch": 0.9893458914534353, + "grad_norm": 1.4694961309432983, + "learning_rate": 5.9331047861443944e-08, + "loss": 1.3744, + "step": 27626 + }, + { + "epoch": 0.9893817035830036, + "grad_norm": 1.7429505586624146, + "learning_rate": 5.893222398569709e-08, + "loss": 1.4978, + "step": 27627 + }, + { + "epoch": 0.9894175157125719, + "grad_norm": 1.6086716651916504, + "learning_rate": 5.8534744688110156e-08, + "loss": 1.007, + "step": 27628 + }, + { + "epoch": 0.9894533278421401, + "grad_norm": 1.5418668985366821, + "learning_rate": 5.8138609974023316e-08, + "loss": 1.2586, + "step": 27629 + }, + { + "epoch": 0.9894891399717084, + "grad_norm": 1.3214435577392578, + "learning_rate": 5.774381984876565e-08, + "loss": 1.4815, + "step": 27630 + }, + { + "epoch": 0.9895249521012767, + "grad_norm": 1.4706875085830688, + "learning_rate": 5.735037431765511e-08, + "loss": 1.1197, + "step": 27631 + }, + { + "epoch": 0.9895607642308449, + "grad_norm": 1.5423040390014648, + "learning_rate": 5.6958273385965264e-08, + "loss": 1.1352, + "step": 27632 + }, + { + "epoch": 0.9895965763604133, + "grad_norm": 1.6856582164764404, + "learning_rate": 5.656751705899188e-08, + "loss": 1.5453, + "step": 27633 + }, + { + "epoch": 0.9896323884899816, + "grad_norm": 1.7213730812072754, + "learning_rate": 5.617810534198631e-08, + "loss": 1.136, + "step": 27634 + }, + { + "epoch": 0.9896682006195499, + "grad_norm": 2.0866880416870117, + "learning_rate": 5.57900382401777e-08, + "loss": 1.1906, + "step": 27635 + }, + { + "epoch": 0.9897040127491181, + "grad_norm": 2.5806922912597656, + "learning_rate": 5.540331575880631e-08, + "loss": 1.7527, + "step": 27636 + }, + { + "epoch": 0.9897398248786864, + "grad_norm": 1.7330875396728516, + "learning_rate": 5.501793790305687e-08, + "loss": 1.3741, + "step": 27637 + }, + { + "epoch": 0.9897756370082547, + "grad_norm": 1.4673677682876587, + "learning_rate": 5.4633904678125234e-08, + "loss": 1.5148, + "step": 27638 + }, + { + "epoch": 0.9898114491378229, + "grad_norm": 1.8655990362167358, + "learning_rate": 5.425121608917394e-08, + "loss": 1.6041, + "step": 27639 + }, + { + "epoch": 0.9898472612673913, + "grad_norm": 1.5611985921859741, + "learning_rate": 5.3869872141343313e-08, + "loss": 1.3083, + "step": 27640 + }, + { + "epoch": 0.9898830733969596, + "grad_norm": 1.6236423254013062, + "learning_rate": 5.348987283978479e-08, + "loss": 1.3885, + "step": 27641 + }, + { + "epoch": 0.9899188855265278, + "grad_norm": 1.4711804389953613, + "learning_rate": 5.3111218189594304e-08, + "loss": 1.2077, + "step": 27642 + }, + { + "epoch": 0.9899546976560961, + "grad_norm": 1.3890304565429688, + "learning_rate": 5.2733908195867764e-08, + "loss": 1.526, + "step": 27643 + }, + { + "epoch": 0.9899905097856644, + "grad_norm": 1.4793809652328491, + "learning_rate": 5.23579428636789e-08, + "loss": 1.4713, + "step": 27644 + }, + { + "epoch": 0.9900263219152327, + "grad_norm": 1.3004274368286133, + "learning_rate": 5.1983322198101425e-08, + "loss": 1.5026, + "step": 27645 + }, + { + "epoch": 0.9900621340448009, + "grad_norm": 2.2040960788726807, + "learning_rate": 5.161004620416465e-08, + "loss": 1.1282, + "step": 27646 + }, + { + "epoch": 0.9900979461743693, + "grad_norm": 1.7686691284179688, + "learning_rate": 5.1238114886875685e-08, + "loss": 1.3961, + "step": 27647 + }, + { + "epoch": 0.9901337583039376, + "grad_norm": 1.619179606437683, + "learning_rate": 5.086752825126384e-08, + "loss": 1.5506, + "step": 27648 + }, + { + "epoch": 0.9901695704335058, + "grad_norm": 1.7559400796890259, + "learning_rate": 5.049828630230291e-08, + "loss": 1.3118, + "step": 27649 + }, + { + "epoch": 0.9902053825630741, + "grad_norm": 1.9877136945724487, + "learning_rate": 5.01303890449667e-08, + "loss": 1.5718, + "step": 27650 + }, + { + "epoch": 0.9902411946926424, + "grad_norm": 1.8758115768432617, + "learning_rate": 4.976383648419569e-08, + "loss": 1.306, + "step": 27651 + }, + { + "epoch": 0.9902770068222106, + "grad_norm": 2.0945935249328613, + "learning_rate": 4.9398628624930385e-08, + "loss": 1.5006, + "step": 27652 + }, + { + "epoch": 0.9903128189517789, + "grad_norm": 1.786147117614746, + "learning_rate": 4.903476547206687e-08, + "loss": 1.273, + "step": 27653 + }, + { + "epoch": 0.9903486310813473, + "grad_norm": 1.7926175594329834, + "learning_rate": 4.8672247030523425e-08, + "loss": 1.3818, + "step": 27654 + }, + { + "epoch": 0.9903844432109156, + "grad_norm": 2.316483497619629, + "learning_rate": 4.8311073305162825e-08, + "loss": 1.248, + "step": 27655 + }, + { + "epoch": 0.9904202553404838, + "grad_norm": 2.2332239151000977, + "learning_rate": 4.795124430085896e-08, + "loss": 1.1146, + "step": 27656 + }, + { + "epoch": 0.9904560674700521, + "grad_norm": 1.80547297000885, + "learning_rate": 4.7592760022430185e-08, + "loss": 1.372, + "step": 27657 + }, + { + "epoch": 0.9904918795996204, + "grad_norm": 1.2292410135269165, + "learning_rate": 4.723562047471708e-08, + "loss": 1.6469, + "step": 27658 + }, + { + "epoch": 0.9905276917291886, + "grad_norm": 1.5700558423995972, + "learning_rate": 4.687982566251581e-08, + "loss": 1.3877, + "step": 27659 + }, + { + "epoch": 0.9905635038587569, + "grad_norm": 1.5725685358047485, + "learning_rate": 4.652537559062253e-08, + "loss": 1.337, + "step": 27660 + }, + { + "epoch": 0.9905993159883253, + "grad_norm": 1.444566249847412, + "learning_rate": 4.617227026378901e-08, + "loss": 1.2462, + "step": 27661 + }, + { + "epoch": 0.9906351281178936, + "grad_norm": 1.5620014667510986, + "learning_rate": 4.582050968677809e-08, + "loss": 1.3293, + "step": 27662 + }, + { + "epoch": 0.9906709402474618, + "grad_norm": 1.445263147354126, + "learning_rate": 4.5470093864330435e-08, + "loss": 1.4857, + "step": 27663 + }, + { + "epoch": 0.9907067523770301, + "grad_norm": 2.2626137733459473, + "learning_rate": 4.5121022801142275e-08, + "loss": 1.2967, + "step": 27664 + }, + { + "epoch": 0.9907425645065984, + "grad_norm": 1.5922274589538574, + "learning_rate": 4.477329650192097e-08, + "loss": 1.2494, + "step": 27665 + }, + { + "epoch": 0.9907783766361666, + "grad_norm": 1.4853841066360474, + "learning_rate": 4.442691497134055e-08, + "loss": 1.2511, + "step": 27666 + }, + { + "epoch": 0.9908141887657349, + "grad_norm": 1.8890957832336426, + "learning_rate": 4.408187821406395e-08, + "loss": 1.6964, + "step": 27667 + }, + { + "epoch": 0.9908500008953033, + "grad_norm": 1.2731983661651611, + "learning_rate": 4.373818623473191e-08, + "loss": 1.3958, + "step": 27668 + }, + { + "epoch": 0.9908858130248716, + "grad_norm": 1.4172587394714355, + "learning_rate": 4.3395839037962956e-08, + "loss": 1.3414, + "step": 27669 + }, + { + "epoch": 0.9909216251544398, + "grad_norm": 1.5718883275985718, + "learning_rate": 4.305483662837562e-08, + "loss": 1.5334, + "step": 27670 + }, + { + "epoch": 0.9909574372840081, + "grad_norm": 1.6942057609558105, + "learning_rate": 4.2715179010555106e-08, + "loss": 1.4271, + "step": 27671 + }, + { + "epoch": 0.9909932494135764, + "grad_norm": 1.4615954160690308, + "learning_rate": 4.2376866189053346e-08, + "loss": 1.4432, + "step": 27672 + }, + { + "epoch": 0.9910290615431446, + "grad_norm": 1.955811858177185, + "learning_rate": 4.2039898168444445e-08, + "loss": 1.4947, + "step": 27673 + }, + { + "epoch": 0.9910648736727129, + "grad_norm": 1.6968096494674683, + "learning_rate": 4.170427495324702e-08, + "loss": 1.558, + "step": 27674 + }, + { + "epoch": 0.9911006858022813, + "grad_norm": 1.3976668119430542, + "learning_rate": 4.1369996547979685e-08, + "loss": 1.142, + "step": 27675 + }, + { + "epoch": 0.9911364979318495, + "grad_norm": 2.15129017829895, + "learning_rate": 4.1037062957138825e-08, + "loss": 1.7194, + "step": 27676 + }, + { + "epoch": 0.9911723100614178, + "grad_norm": 1.6736570596694946, + "learning_rate": 4.070547418522086e-08, + "loss": 1.307, + "step": 27677 + }, + { + "epoch": 0.9912081221909861, + "grad_norm": 1.7805536985397339, + "learning_rate": 4.037523023666667e-08, + "loss": 1.5194, + "step": 27678 + }, + { + "epoch": 0.9912439343205544, + "grad_norm": 1.7508223056793213, + "learning_rate": 4.0046331115917157e-08, + "loss": 1.5008, + "step": 27679 + }, + { + "epoch": 0.9912797464501226, + "grad_norm": 1.3880499601364136, + "learning_rate": 3.9718776827413204e-08, + "loss": 1.5748, + "step": 27680 + }, + { + "epoch": 0.9913155585796909, + "grad_norm": 2.6532273292541504, + "learning_rate": 3.9392567375551306e-08, + "loss": 1.1765, + "step": 27681 + }, + { + "epoch": 0.9913513707092593, + "grad_norm": 1.741824984550476, + "learning_rate": 3.906770276471683e-08, + "loss": 1.1866, + "step": 27682 + }, + { + "epoch": 0.9913871828388275, + "grad_norm": 1.3869082927703857, + "learning_rate": 3.8744182999295164e-08, + "loss": 1.4337, + "step": 27683 + }, + { + "epoch": 0.9914229949683958, + "grad_norm": 2.356285333633423, + "learning_rate": 3.842200808362728e-08, + "loss": 1.7948, + "step": 27684 + }, + { + "epoch": 0.9914588070979641, + "grad_norm": 1.4748954772949219, + "learning_rate": 3.810117802204305e-08, + "loss": 1.323, + "step": 27685 + }, + { + "epoch": 0.9914946192275323, + "grad_norm": 1.8216158151626587, + "learning_rate": 3.778169281887234e-08, + "loss": 1.4063, + "step": 27686 + }, + { + "epoch": 0.9915304313571006, + "grad_norm": 1.3165158033370972, + "learning_rate": 3.746355247841171e-08, + "loss": 1.1767, + "step": 27687 + }, + { + "epoch": 0.9915662434866689, + "grad_norm": 2.005211114883423, + "learning_rate": 3.7146757004924425e-08, + "loss": 1.1474, + "step": 27688 + }, + { + "epoch": 0.9916020556162373, + "grad_norm": 1.54912531375885, + "learning_rate": 3.683130640269594e-08, + "loss": 1.4143, + "step": 27689 + }, + { + "epoch": 0.9916378677458055, + "grad_norm": 1.9291484355926514, + "learning_rate": 3.651720067595621e-08, + "loss": 1.4354, + "step": 27690 + }, + { + "epoch": 0.9916736798753738, + "grad_norm": 1.4543896913528442, + "learning_rate": 3.620443982892407e-08, + "loss": 1.5983, + "step": 27691 + }, + { + "epoch": 0.9917094920049421, + "grad_norm": 2.140312910079956, + "learning_rate": 3.589302386582949e-08, + "loss": 1.352, + "step": 27692 + }, + { + "epoch": 0.9917453041345103, + "grad_norm": 1.8118711709976196, + "learning_rate": 3.558295279084689e-08, + "loss": 1.4931, + "step": 27693 + }, + { + "epoch": 0.9917811162640786, + "grad_norm": 1.5152671337127686, + "learning_rate": 3.527422660815072e-08, + "loss": 1.3262, + "step": 27694 + }, + { + "epoch": 0.9918169283936469, + "grad_norm": 1.843450903892517, + "learning_rate": 3.4966845321893204e-08, + "loss": 1.3335, + "step": 27695 + }, + { + "epoch": 0.9918527405232153, + "grad_norm": 2.2369720935821533, + "learning_rate": 3.4660808936215485e-08, + "loss": 1.5118, + "step": 27696 + }, + { + "epoch": 0.9918885526527835, + "grad_norm": 1.4145985841751099, + "learning_rate": 3.435611745522538e-08, + "loss": 1.5992, + "step": 27697 + }, + { + "epoch": 0.9919243647823518, + "grad_norm": 1.7589112520217896, + "learning_rate": 3.405277088301961e-08, + "loss": 1.3502, + "step": 27698 + }, + { + "epoch": 0.9919601769119201, + "grad_norm": 1.60627281665802, + "learning_rate": 3.375076922370601e-08, + "loss": 1.2305, + "step": 27699 + }, + { + "epoch": 0.9919959890414883, + "grad_norm": 1.4741889238357544, + "learning_rate": 3.345011248131469e-08, + "loss": 1.3674, + "step": 27700 + }, + { + "epoch": 0.9920318011710566, + "grad_norm": 1.848053216934204, + "learning_rate": 3.3150800659909055e-08, + "loss": 1.4557, + "step": 27701 + }, + { + "epoch": 0.9920676133006249, + "grad_norm": 1.6344389915466309, + "learning_rate": 3.285283376350812e-08, + "loss": 1.3267, + "step": 27702 + }, + { + "epoch": 0.9921034254301933, + "grad_norm": 1.2263892889022827, + "learning_rate": 3.255621179613089e-08, + "loss": 1.3394, + "step": 27703 + }, + { + "epoch": 0.9921392375597615, + "grad_norm": 1.8323966264724731, + "learning_rate": 3.226093476175196e-08, + "loss": 1.4297, + "step": 27704 + }, + { + "epoch": 0.9921750496893298, + "grad_norm": 1.2270833253860474, + "learning_rate": 3.1967002664357036e-08, + "loss": 1.1901, + "step": 27705 + }, + { + "epoch": 0.9922108618188981, + "grad_norm": 1.5533965826034546, + "learning_rate": 3.167441550789851e-08, + "loss": 1.455, + "step": 27706 + }, + { + "epoch": 0.9922466739484663, + "grad_norm": 1.9076077938079834, + "learning_rate": 3.138317329630658e-08, + "loss": 1.3796, + "step": 27707 + }, + { + "epoch": 0.9922824860780346, + "grad_norm": 1.721117377281189, + "learning_rate": 3.109327603351142e-08, + "loss": 1.2874, + "step": 27708 + }, + { + "epoch": 0.9923182982076029, + "grad_norm": 1.3826229572296143, + "learning_rate": 3.080472372339882e-08, + "loss": 1.524, + "step": 27709 + }, + { + "epoch": 0.9923541103371712, + "grad_norm": 1.7644851207733154, + "learning_rate": 3.0517516369865665e-08, + "loss": 1.3418, + "step": 27710 + }, + { + "epoch": 0.9923899224667395, + "grad_norm": 1.6510146856307983, + "learning_rate": 3.0231653976764415e-08, + "loss": 1.2241, + "step": 27711 + }, + { + "epoch": 0.9924257345963078, + "grad_norm": 1.384408950805664, + "learning_rate": 2.994713654793646e-08, + "loss": 1.5097, + "step": 27712 + }, + { + "epoch": 0.992461546725876, + "grad_norm": 1.7258027791976929, + "learning_rate": 2.966396408722316e-08, + "loss": 1.3808, + "step": 27713 + }, + { + "epoch": 0.9924973588554443, + "grad_norm": 2.7897722721099854, + "learning_rate": 2.9382136598432587e-08, + "loss": 0.9909, + "step": 27714 + }, + { + "epoch": 0.9925331709850126, + "grad_norm": 1.8314100503921509, + "learning_rate": 2.9101654085350594e-08, + "loss": 1.4235, + "step": 27715 + }, + { + "epoch": 0.9925689831145809, + "grad_norm": 1.441307783126831, + "learning_rate": 2.8822516551751942e-08, + "loss": 1.1951, + "step": 27716 + }, + { + "epoch": 0.9926047952441492, + "grad_norm": 1.8378783464431763, + "learning_rate": 2.854472400138919e-08, + "loss": 1.2973, + "step": 27717 + }, + { + "epoch": 0.9926406073737175, + "grad_norm": 1.8195862770080566, + "learning_rate": 2.8268276438003782e-08, + "loss": 1.4349, + "step": 27718 + }, + { + "epoch": 0.9926764195032858, + "grad_norm": 1.6150768995285034, + "learning_rate": 2.799317386531497e-08, + "loss": 1.547, + "step": 27719 + }, + { + "epoch": 0.992712231632854, + "grad_norm": 1.6209663152694702, + "learning_rate": 2.7719416287030897e-08, + "loss": 1.4109, + "step": 27720 + }, + { + "epoch": 0.9927480437624223, + "grad_norm": 1.6728167533874512, + "learning_rate": 2.74470037068264e-08, + "loss": 1.4065, + "step": 27721 + }, + { + "epoch": 0.9927838558919906, + "grad_norm": 1.3521865606307983, + "learning_rate": 2.717593612835412e-08, + "loss": 1.4566, + "step": 27722 + }, + { + "epoch": 0.9928196680215589, + "grad_norm": 1.5996589660644531, + "learning_rate": 2.6906213555288884e-08, + "loss": 1.4181, + "step": 27723 + }, + { + "epoch": 0.9928554801511272, + "grad_norm": 1.476045846939087, + "learning_rate": 2.6637835991238924e-08, + "loss": 1.2518, + "step": 27724 + }, + { + "epoch": 0.9928912922806955, + "grad_norm": 2.2209970951080322, + "learning_rate": 2.6370803439812463e-08, + "loss": 1.0453, + "step": 27725 + }, + { + "epoch": 0.9929271044102638, + "grad_norm": 1.3425819873809814, + "learning_rate": 2.6105115904617726e-08, + "loss": 1.5999, + "step": 27726 + }, + { + "epoch": 0.992962916539832, + "grad_norm": 1.6703921556472778, + "learning_rate": 2.584077338921853e-08, + "loss": 1.3171, + "step": 27727 + }, + { + "epoch": 0.9929987286694003, + "grad_norm": 1.6346490383148193, + "learning_rate": 2.557777589717869e-08, + "loss": 1.4642, + "step": 27728 + }, + { + "epoch": 0.9930345407989686, + "grad_norm": 1.7949005365371704, + "learning_rate": 2.5316123432028714e-08, + "loss": 1.6685, + "step": 27729 + }, + { + "epoch": 0.9930703529285368, + "grad_norm": 1.5301461219787598, + "learning_rate": 2.5055815997299113e-08, + "loss": 1.7001, + "step": 27730 + }, + { + "epoch": 0.9931061650581052, + "grad_norm": 1.6735132932662964, + "learning_rate": 2.479685359647599e-08, + "loss": 1.4314, + "step": 27731 + }, + { + "epoch": 0.9931419771876735, + "grad_norm": 1.8682321310043335, + "learning_rate": 2.453923623305654e-08, + "loss": 1.4695, + "step": 27732 + }, + { + "epoch": 0.9931777893172418, + "grad_norm": 1.7532639503479004, + "learning_rate": 2.4282963910504664e-08, + "loss": 1.6481, + "step": 27733 + }, + { + "epoch": 0.99321360144681, + "grad_norm": 1.6439052820205688, + "learning_rate": 2.4028036632262053e-08, + "loss": 1.3373, + "step": 27734 + }, + { + "epoch": 0.9932494135763783, + "grad_norm": 2.574580192565918, + "learning_rate": 2.3774454401770396e-08, + "loss": 1.5889, + "step": 27735 + }, + { + "epoch": 0.9932852257059466, + "grad_norm": 1.586375117301941, + "learning_rate": 2.3522217222426978e-08, + "loss": 1.4725, + "step": 27736 + }, + { + "epoch": 0.9933210378355148, + "grad_norm": 1.4359043836593628, + "learning_rate": 2.3271325097629082e-08, + "loss": 1.3233, + "step": 27737 + }, + { + "epoch": 0.9933568499650832, + "grad_norm": 1.3824795484542847, + "learning_rate": 2.3021778030751784e-08, + "loss": 1.4131, + "step": 27738 + }, + { + "epoch": 0.9933926620946515, + "grad_norm": 1.66568124294281, + "learning_rate": 2.2773576025170163e-08, + "loss": 1.3049, + "step": 27739 + }, + { + "epoch": 0.9934284742242198, + "grad_norm": 1.4649953842163086, + "learning_rate": 2.2526719084192683e-08, + "loss": 1.4202, + "step": 27740 + }, + { + "epoch": 0.993464286353788, + "grad_norm": 1.3860225677490234, + "learning_rate": 2.2281207211172218e-08, + "loss": 1.4507, + "step": 27741 + }, + { + "epoch": 0.9935000984833563, + "grad_norm": 1.6990655660629272, + "learning_rate": 2.2037040409383924e-08, + "loss": 1.3812, + "step": 27742 + }, + { + "epoch": 0.9935359106129246, + "grad_norm": 1.6962313652038574, + "learning_rate": 2.1794218682125168e-08, + "loss": 1.4086, + "step": 27743 + }, + { + "epoch": 0.9935717227424928, + "grad_norm": 1.4338451623916626, + "learning_rate": 2.15527420326711e-08, + "loss": 1.5719, + "step": 27744 + }, + { + "epoch": 0.9936075348720612, + "grad_norm": 1.4438395500183105, + "learning_rate": 2.131261046425248e-08, + "loss": 1.2907, + "step": 27745 + }, + { + "epoch": 0.9936433470016295, + "grad_norm": 1.768540620803833, + "learning_rate": 2.107382398011115e-08, + "loss": 1.33, + "step": 27746 + }, + { + "epoch": 0.9936791591311978, + "grad_norm": 1.6516530513763428, + "learning_rate": 2.0836382583466762e-08, + "loss": 1.4927, + "step": 27747 + }, + { + "epoch": 0.993714971260766, + "grad_norm": 1.5076415538787842, + "learning_rate": 2.0600286277494552e-08, + "loss": 1.2049, + "step": 27748 + }, + { + "epoch": 0.9937507833903343, + "grad_norm": 1.4209827184677124, + "learning_rate": 2.0365535065391962e-08, + "loss": 1.297, + "step": 27749 + }, + { + "epoch": 0.9937865955199026, + "grad_norm": 1.4648021459579468, + "learning_rate": 2.013212895030092e-08, + "loss": 1.286, + "step": 27750 + }, + { + "epoch": 0.9938224076494708, + "grad_norm": 2.422774314880371, + "learning_rate": 1.9900067935363364e-08, + "loss": 1.6095, + "step": 27751 + }, + { + "epoch": 0.9938582197790392, + "grad_norm": 1.4855307340621948, + "learning_rate": 1.966935202371012e-08, + "loss": 1.5097, + "step": 27752 + }, + { + "epoch": 0.9938940319086075, + "grad_norm": 2.235036849975586, + "learning_rate": 1.9439981218438708e-08, + "loss": 1.4405, + "step": 27753 + }, + { + "epoch": 0.9939298440381757, + "grad_norm": 1.6031259298324585, + "learning_rate": 1.921195552263555e-08, + "loss": 1.4121, + "step": 27754 + }, + { + "epoch": 0.993965656167744, + "grad_norm": 1.9243065118789673, + "learning_rate": 1.8985274939375962e-08, + "loss": 1.5876, + "step": 27755 + }, + { + "epoch": 0.9940014682973123, + "grad_norm": 1.6919931173324585, + "learning_rate": 1.8759939471690858e-08, + "loss": 1.453, + "step": 27756 + }, + { + "epoch": 0.9940372804268806, + "grad_norm": 1.5720360279083252, + "learning_rate": 1.8535949122633346e-08, + "loss": 1.4789, + "step": 27757 + }, + { + "epoch": 0.9940730925564488, + "grad_norm": 1.6576461791992188, + "learning_rate": 1.831330389521213e-08, + "loss": 1.417, + "step": 27758 + }, + { + "epoch": 0.9941089046860172, + "grad_norm": 1.631563425064087, + "learning_rate": 1.8092003792413714e-08, + "loss": 1.3115, + "step": 27759 + }, + { + "epoch": 0.9941447168155855, + "grad_norm": 1.270306944847107, + "learning_rate": 1.7872048817213495e-08, + "loss": 1.5715, + "step": 27760 + }, + { + "epoch": 0.9941805289451537, + "grad_norm": 1.9433504343032837, + "learning_rate": 1.7653438972586868e-08, + "loss": 1.4304, + "step": 27761 + }, + { + "epoch": 0.994216341074722, + "grad_norm": 2.1544501781463623, + "learning_rate": 1.743617426145372e-08, + "loss": 1.4941, + "step": 27762 + }, + { + "epoch": 0.9942521532042903, + "grad_norm": 1.2280709743499756, + "learning_rate": 1.7220254686756142e-08, + "loss": 1.0029, + "step": 27763 + }, + { + "epoch": 0.9942879653338585, + "grad_norm": 1.3527542352676392, + "learning_rate": 1.700568025139182e-08, + "loss": 1.4559, + "step": 27764 + }, + { + "epoch": 0.9943237774634268, + "grad_norm": 1.9199059009552002, + "learning_rate": 1.679245095824733e-08, + "loss": 1.3947, + "step": 27765 + }, + { + "epoch": 0.9943595895929952, + "grad_norm": 2.5638864040374756, + "learning_rate": 1.658056681019815e-08, + "loss": 1.5441, + "step": 27766 + }, + { + "epoch": 0.9943954017225635, + "grad_norm": 1.6788527965545654, + "learning_rate": 1.637002781007535e-08, + "loss": 1.3188, + "step": 27767 + }, + { + "epoch": 0.9944312138521317, + "grad_norm": 1.2428195476531982, + "learning_rate": 1.6160833960732203e-08, + "loss": 1.4178, + "step": 27768 + }, + { + "epoch": 0.9944670259817, + "grad_norm": 1.7041432857513428, + "learning_rate": 1.595298526496647e-08, + "loss": 1.5002, + "step": 27769 + }, + { + "epoch": 0.9945028381112683, + "grad_norm": 1.5376256704330444, + "learning_rate": 1.5746481725598117e-08, + "loss": 1.6361, + "step": 27770 + }, + { + "epoch": 0.9945386502408365, + "grad_norm": 1.9216423034667969, + "learning_rate": 1.5541323345380497e-08, + "loss": 1.2145, + "step": 27771 + }, + { + "epoch": 0.9945744623704048, + "grad_norm": 1.9160414934158325, + "learning_rate": 1.533751012707807e-08, + "loss": 1.5186, + "step": 27772 + }, + { + "epoch": 0.9946102744999732, + "grad_norm": 1.4614883661270142, + "learning_rate": 1.5135042073444182e-08, + "loss": 1.6713, + "step": 27773 + }, + { + "epoch": 0.9946460866295415, + "grad_norm": 1.514203667640686, + "learning_rate": 1.4933919187198884e-08, + "loss": 1.041, + "step": 27774 + }, + { + "epoch": 0.9946818987591097, + "grad_norm": 1.3184031248092651, + "learning_rate": 1.4734141471051122e-08, + "loss": 1.3827, + "step": 27775 + }, + { + "epoch": 0.994717710888678, + "grad_norm": 1.7681211233139038, + "learning_rate": 1.4535708927676529e-08, + "loss": 1.5363, + "step": 27776 + }, + { + "epoch": 0.9947535230182463, + "grad_norm": 1.4674324989318848, + "learning_rate": 1.4338621559750742e-08, + "loss": 1.5941, + "step": 27777 + }, + { + "epoch": 0.9947893351478145, + "grad_norm": 1.7696936130523682, + "learning_rate": 1.4142879369927198e-08, + "loss": 1.4344, + "step": 27778 + }, + { + "epoch": 0.9948251472773828, + "grad_norm": 1.9830656051635742, + "learning_rate": 1.3948482360848225e-08, + "loss": 1.5663, + "step": 27779 + }, + { + "epoch": 0.9948609594069512, + "grad_norm": 1.470342993736267, + "learning_rate": 1.3755430535111747e-08, + "loss": 1.6629, + "step": 27780 + }, + { + "epoch": 0.9948967715365195, + "grad_norm": 1.7345906496047974, + "learning_rate": 1.3563723895326785e-08, + "loss": 1.4955, + "step": 27781 + }, + { + "epoch": 0.9949325836660877, + "grad_norm": 1.6475701332092285, + "learning_rate": 1.3373362444057957e-08, + "loss": 1.443, + "step": 27782 + }, + { + "epoch": 0.994968395795656, + "grad_norm": 1.7012711763381958, + "learning_rate": 1.3184346183892082e-08, + "loss": 1.341, + "step": 27783 + }, + { + "epoch": 0.9950042079252243, + "grad_norm": 1.4471287727355957, + "learning_rate": 1.2996675117349367e-08, + "loss": 1.3683, + "step": 27784 + }, + { + "epoch": 0.9950400200547925, + "grad_norm": 2.2064197063446045, + "learning_rate": 1.2810349246961117e-08, + "loss": 1.3698, + "step": 27785 + }, + { + "epoch": 0.9950758321843608, + "grad_norm": 2.3040659427642822, + "learning_rate": 1.262536857523644e-08, + "loss": 1.364, + "step": 27786 + }, + { + "epoch": 0.9951116443139292, + "grad_norm": 1.5879874229431152, + "learning_rate": 1.2441733104662234e-08, + "loss": 1.3092, + "step": 27787 + }, + { + "epoch": 0.9951474564434974, + "grad_norm": 1.4612466096878052, + "learning_rate": 1.2259442837714297e-08, + "loss": 1.4385, + "step": 27788 + }, + { + "epoch": 0.9951832685730657, + "grad_norm": 2.061544418334961, + "learning_rate": 1.2078497776835119e-08, + "loss": 1.4193, + "step": 27789 + }, + { + "epoch": 0.995219080702634, + "grad_norm": 1.4527156352996826, + "learning_rate": 1.1898897924467189e-08, + "loss": 1.4357, + "step": 27790 + }, + { + "epoch": 0.9952548928322023, + "grad_norm": 1.664737343788147, + "learning_rate": 1.1720643283019694e-08, + "loss": 1.6887, + "step": 27791 + }, + { + "epoch": 0.9952907049617705, + "grad_norm": 1.4561351537704468, + "learning_rate": 1.1543733854901817e-08, + "loss": 1.2292, + "step": 27792 + }, + { + "epoch": 0.9953265170913388, + "grad_norm": 1.2238926887512207, + "learning_rate": 1.1368169642489435e-08, + "loss": 1.2025, + "step": 27793 + }, + { + "epoch": 0.9953623292209072, + "grad_norm": 1.3671455383300781, + "learning_rate": 1.119395064813622e-08, + "loss": 1.3286, + "step": 27794 + }, + { + "epoch": 0.9953981413504754, + "grad_norm": 1.3853384256362915, + "learning_rate": 1.1021076874195846e-08, + "loss": 1.3408, + "step": 27795 + }, + { + "epoch": 0.9954339534800437, + "grad_norm": 1.7308851480484009, + "learning_rate": 1.0849548322988679e-08, + "loss": 1.3314, + "step": 27796 + }, + { + "epoch": 0.995469765609612, + "grad_norm": 1.6281728744506836, + "learning_rate": 1.0679364996823982e-08, + "loss": 1.4538, + "step": 27797 + }, + { + "epoch": 0.9955055777391802, + "grad_norm": 1.8174667358398438, + "learning_rate": 1.0510526897988815e-08, + "loss": 1.2637, + "step": 27798 + }, + { + "epoch": 0.9955413898687485, + "grad_norm": 2.2247347831726074, + "learning_rate": 1.0343034028759136e-08, + "loss": 1.6667, + "step": 27799 + }, + { + "epoch": 0.9955772019983168, + "grad_norm": 1.9621455669403076, + "learning_rate": 1.0176886391388695e-08, + "loss": 1.5749, + "step": 27800 + }, + { + "epoch": 0.9956130141278852, + "grad_norm": 1.6727039813995361, + "learning_rate": 1.0012083988109045e-08, + "loss": 1.2988, + "step": 27801 + }, + { + "epoch": 0.9956488262574534, + "grad_norm": 1.5761189460754395, + "learning_rate": 9.848626821140627e-09, + "loss": 1.5141, + "step": 27802 + }, + { + "epoch": 0.9956846383870217, + "grad_norm": 1.3723686933517456, + "learning_rate": 9.686514892681687e-09, + "loss": 1.2416, + "step": 27803 + }, + { + "epoch": 0.99572045051659, + "grad_norm": 1.3970915079116821, + "learning_rate": 9.525748204908258e-09, + "loss": 1.2696, + "step": 27804 + }, + { + "epoch": 0.9957562626461582, + "grad_norm": 1.6122543811798096, + "learning_rate": 9.366326759985278e-09, + "loss": 1.5499, + "step": 27805 + }, + { + "epoch": 0.9957920747757265, + "grad_norm": 1.3238275051116943, + "learning_rate": 9.208250560066578e-09, + "loss": 1.0434, + "step": 27806 + }, + { + "epoch": 0.9958278869052948, + "grad_norm": 1.784379243850708, + "learning_rate": 9.051519607272684e-09, + "loss": 1.6537, + "step": 27807 + }, + { + "epoch": 0.9958636990348632, + "grad_norm": 1.6680328845977783, + "learning_rate": 8.89613390370192e-09, + "loss": 1.658, + "step": 27808 + }, + { + "epoch": 0.9958995111644314, + "grad_norm": 1.6057300567626953, + "learning_rate": 8.742093451463707e-09, + "loss": 1.3095, + "step": 27809 + }, + { + "epoch": 0.9959353232939997, + "grad_norm": 1.358614444732666, + "learning_rate": 8.589398252611957e-09, + "loss": 1.3424, + "step": 27810 + }, + { + "epoch": 0.995971135423568, + "grad_norm": 1.5714879035949707, + "learning_rate": 8.43804830922279e-09, + "loss": 1.2419, + "step": 27811 + }, + { + "epoch": 0.9960069475531362, + "grad_norm": 1.897676706314087, + "learning_rate": 8.28804362331681e-09, + "loss": 1.6724, + "step": 27812 + }, + { + "epoch": 0.9960427596827045, + "grad_norm": 1.658821940422058, + "learning_rate": 8.139384196903522e-09, + "loss": 1.2458, + "step": 27813 + }, + { + "epoch": 0.9960785718122728, + "grad_norm": 1.9782476425170898, + "learning_rate": 7.992070032003529e-09, + "loss": 1.3115, + "step": 27814 + }, + { + "epoch": 0.9961143839418412, + "grad_norm": 1.8454557657241821, + "learning_rate": 7.84610113059303e-09, + "loss": 1.4479, + "step": 27815 + }, + { + "epoch": 0.9961501960714094, + "grad_norm": 1.320586085319519, + "learning_rate": 7.70147749462602e-09, + "loss": 1.4129, + "step": 27816 + }, + { + "epoch": 0.9961860082009777, + "grad_norm": 1.5092272758483887, + "learning_rate": 7.558199126056487e-09, + "loss": 1.5169, + "step": 27817 + }, + { + "epoch": 0.996221820330546, + "grad_norm": 1.7253291606903076, + "learning_rate": 7.416266026816221e-09, + "loss": 1.6159, + "step": 27818 + }, + { + "epoch": 0.9962576324601142, + "grad_norm": 1.819915771484375, + "learning_rate": 7.275678198803703e-09, + "loss": 1.2604, + "step": 27819 + }, + { + "epoch": 0.9962934445896825, + "grad_norm": 1.2910900115966797, + "learning_rate": 7.136435643917416e-09, + "loss": 1.3797, + "step": 27820 + }, + { + "epoch": 0.9963292567192508, + "grad_norm": 1.5493773221969604, + "learning_rate": 6.998538364022533e-09, + "loss": 1.6762, + "step": 27821 + }, + { + "epoch": 0.9963650688488191, + "grad_norm": 1.7264066934585571, + "learning_rate": 6.861986360995332e-09, + "loss": 1.2949, + "step": 27822 + }, + { + "epoch": 0.9964008809783874, + "grad_norm": 2.5256896018981934, + "learning_rate": 6.726779636645475e-09, + "loss": 1.688, + "step": 27823 + }, + { + "epoch": 0.9964366931079557, + "grad_norm": 1.2364569902420044, + "learning_rate": 6.592918192804831e-09, + "loss": 1.2282, + "step": 27824 + }, + { + "epoch": 0.996472505237524, + "grad_norm": 1.384142518043518, + "learning_rate": 6.460402031283064e-09, + "loss": 1.3266, + "step": 27825 + }, + { + "epoch": 0.9965083173670922, + "grad_norm": 1.398099660873413, + "learning_rate": 6.329231153845427e-09, + "loss": 1.0723, + "step": 27826 + }, + { + "epoch": 0.9965441294966605, + "grad_norm": 2.250969886779785, + "learning_rate": 6.199405562268279e-09, + "loss": 1.545, + "step": 27827 + }, + { + "epoch": 0.9965799416262288, + "grad_norm": 1.5381693840026855, + "learning_rate": 6.070925258294668e-09, + "loss": 1.4875, + "step": 27828 + }, + { + "epoch": 0.9966157537557971, + "grad_norm": 1.3616868257522583, + "learning_rate": 5.943790243656544e-09, + "loss": 1.3771, + "step": 27829 + }, + { + "epoch": 0.9966515658853654, + "grad_norm": 1.404511570930481, + "learning_rate": 5.818000520052547e-09, + "loss": 1.3444, + "step": 27830 + }, + { + "epoch": 0.9966873780149337, + "grad_norm": 1.3971214294433594, + "learning_rate": 5.69355608919242e-09, + "loss": 1.3699, + "step": 27831 + }, + { + "epoch": 0.996723190144502, + "grad_norm": 1.6311243772506714, + "learning_rate": 5.570456952741499e-09, + "loss": 1.3255, + "step": 27832 + }, + { + "epoch": 0.9967590022740702, + "grad_norm": 1.9845330715179443, + "learning_rate": 5.448703112365117e-09, + "loss": 1.5725, + "step": 27833 + }, + { + "epoch": 0.9967948144036385, + "grad_norm": 1.6047090291976929, + "learning_rate": 5.328294569673098e-09, + "loss": 1.4876, + "step": 27834 + }, + { + "epoch": 0.9968306265332068, + "grad_norm": 1.8364373445510864, + "learning_rate": 5.209231326319674e-09, + "loss": 1.3304, + "step": 27835 + }, + { + "epoch": 0.9968664386627751, + "grad_norm": 1.2628228664398193, + "learning_rate": 5.0915133838924656e-09, + "loss": 1.21, + "step": 27836 + }, + { + "epoch": 0.9969022507923434, + "grad_norm": 1.866013765335083, + "learning_rate": 4.975140743967987e-09, + "loss": 1.4851, + "step": 27837 + }, + { + "epoch": 0.9969380629219117, + "grad_norm": 1.4663585424423218, + "learning_rate": 4.860113408122757e-09, + "loss": 1.7521, + "step": 27838 + }, + { + "epoch": 0.9969738750514799, + "grad_norm": 1.7713894844055176, + "learning_rate": 4.746431377899985e-09, + "loss": 1.4532, + "step": 27839 + }, + { + "epoch": 0.9970096871810482, + "grad_norm": 1.5363188982009888, + "learning_rate": 4.634094654820675e-09, + "loss": 1.2711, + "step": 27840 + }, + { + "epoch": 0.9970454993106165, + "grad_norm": 1.7956669330596924, + "learning_rate": 4.523103240416937e-09, + "loss": 1.5147, + "step": 27841 + }, + { + "epoch": 0.9970813114401847, + "grad_norm": 2.0350286960601807, + "learning_rate": 4.413457136165367e-09, + "loss": 1.5831, + "step": 27842 + }, + { + "epoch": 0.9971171235697531, + "grad_norm": 1.6121965646743774, + "learning_rate": 4.3051563435425605e-09, + "loss": 1.3732, + "step": 27843 + }, + { + "epoch": 0.9971529356993214, + "grad_norm": 1.6830921173095703, + "learning_rate": 4.198200864014012e-09, + "loss": 1.6102, + "step": 27844 + }, + { + "epoch": 0.9971887478288897, + "grad_norm": 1.550490379333496, + "learning_rate": 4.092590699011911e-09, + "loss": 1.4481, + "step": 27845 + }, + { + "epoch": 0.9972245599584579, + "grad_norm": 1.875200629234314, + "learning_rate": 3.988325849957342e-09, + "loss": 1.3372, + "step": 27846 + }, + { + "epoch": 0.9972603720880262, + "grad_norm": 1.4787503480911255, + "learning_rate": 3.885406318260288e-09, + "loss": 1.4715, + "step": 27847 + }, + { + "epoch": 0.9972961842175945, + "grad_norm": 1.6989432573318481, + "learning_rate": 3.783832105286322e-09, + "loss": 1.3271, + "step": 27848 + }, + { + "epoch": 0.9973319963471627, + "grad_norm": 1.4826751947402954, + "learning_rate": 3.6836032124232256e-09, + "loss": 1.3739, + "step": 27849 + }, + { + "epoch": 0.9973678084767311, + "grad_norm": 1.7975443601608276, + "learning_rate": 3.5847196410143667e-09, + "loss": 1.6923, + "step": 27850 + }, + { + "epoch": 0.9974036206062994, + "grad_norm": 1.5432265996932983, + "learning_rate": 3.4871813923809117e-09, + "loss": 1.3351, + "step": 27851 + }, + { + "epoch": 0.9974394327358677, + "grad_norm": 1.5838642120361328, + "learning_rate": 3.390988467844025e-09, + "loss": 1.579, + "step": 27852 + }, + { + "epoch": 0.9974752448654359, + "grad_norm": 1.9075052738189697, + "learning_rate": 3.2961408686915662e-09, + "loss": 1.3764, + "step": 27853 + }, + { + "epoch": 0.9975110569950042, + "grad_norm": 1.5979747772216797, + "learning_rate": 3.2026385962113937e-09, + "loss": 1.4625, + "step": 27854 + }, + { + "epoch": 0.9975468691245725, + "grad_norm": 1.5051318407058716, + "learning_rate": 3.110481651646957e-09, + "loss": 1.3817, + "step": 27855 + }, + { + "epoch": 0.9975826812541407, + "grad_norm": 2.1045610904693604, + "learning_rate": 3.0196700362417065e-09, + "loss": 1.4953, + "step": 27856 + }, + { + "epoch": 0.9976184933837091, + "grad_norm": 1.5052860975265503, + "learning_rate": 2.930203751227989e-09, + "loss": 1.3494, + "step": 27857 + }, + { + "epoch": 0.9976543055132774, + "grad_norm": 1.5225681066513062, + "learning_rate": 2.8420827977937437e-09, + "loss": 1.5778, + "step": 27858 + }, + { + "epoch": 0.9976901176428457, + "grad_norm": 1.5422025918960571, + "learning_rate": 2.7553071771380112e-09, + "loss": 1.3814, + "step": 27859 + }, + { + "epoch": 0.9977259297724139, + "grad_norm": 3.553717851638794, + "learning_rate": 2.6698768904154236e-09, + "loss": 1.4703, + "step": 27860 + }, + { + "epoch": 0.9977617419019822, + "grad_norm": 1.6214388608932495, + "learning_rate": 2.585791938791715e-09, + "loss": 1.1495, + "step": 27861 + }, + { + "epoch": 0.9977975540315505, + "grad_norm": 1.3043503761291504, + "learning_rate": 2.5030523233771087e-09, + "loss": 1.4878, + "step": 27862 + }, + { + "epoch": 0.9978333661611187, + "grad_norm": 1.5366240739822388, + "learning_rate": 2.4216580453040314e-09, + "loss": 1.0653, + "step": 27863 + }, + { + "epoch": 0.9978691782906871, + "grad_norm": 1.7023508548736572, + "learning_rate": 2.3416091056605027e-09, + "loss": 1.5414, + "step": 27864 + }, + { + "epoch": 0.9979049904202554, + "grad_norm": 1.317268967628479, + "learning_rate": 2.2629055055234384e-09, + "loss": 1.4184, + "step": 27865 + }, + { + "epoch": 0.9979408025498236, + "grad_norm": 1.655472755432129, + "learning_rate": 2.18554724594755e-09, + "loss": 1.6407, + "step": 27866 + }, + { + "epoch": 0.9979766146793919, + "grad_norm": 1.6678001880645752, + "learning_rate": 2.1095343279764477e-09, + "loss": 1.4937, + "step": 27867 + }, + { + "epoch": 0.9980124268089602, + "grad_norm": 1.3210474252700806, + "learning_rate": 2.0348667526426392e-09, + "loss": 1.3464, + "step": 27868 + }, + { + "epoch": 0.9980482389385285, + "grad_norm": 1.340619444847107, + "learning_rate": 1.961544520934222e-09, + "loss": 1.1373, + "step": 27869 + }, + { + "epoch": 0.9980840510680967, + "grad_norm": 2.0693037509918213, + "learning_rate": 1.8895676338392952e-09, + "loss": 1.6612, + "step": 27870 + }, + { + "epoch": 0.9981198631976651, + "grad_norm": 1.7016184329986572, + "learning_rate": 1.8189360923459575e-09, + "loss": 1.3875, + "step": 27871 + }, + { + "epoch": 0.9981556753272334, + "grad_norm": 1.4674826860427856, + "learning_rate": 1.7496498973756936e-09, + "loss": 1.4655, + "step": 27872 + }, + { + "epoch": 0.9981914874568016, + "grad_norm": 1.7653722763061523, + "learning_rate": 1.6817090498832954e-09, + "loss": 1.4544, + "step": 27873 + }, + { + "epoch": 0.9982272995863699, + "grad_norm": 1.617550015449524, + "learning_rate": 1.615113550779146e-09, + "loss": 1.4976, + "step": 27874 + }, + { + "epoch": 0.9982631117159382, + "grad_norm": 1.6793699264526367, + "learning_rate": 1.5498634009514235e-09, + "loss": 1.6882, + "step": 27875 + }, + { + "epoch": 0.9982989238455064, + "grad_norm": 1.7984838485717773, + "learning_rate": 1.4859586012772042e-09, + "loss": 1.6261, + "step": 27876 + }, + { + "epoch": 0.9983347359750747, + "grad_norm": 1.3389443159103394, + "learning_rate": 1.4233991526224622e-09, + "loss": 1.2497, + "step": 27877 + }, + { + "epoch": 0.9983705481046431, + "grad_norm": 1.4950339794158936, + "learning_rate": 1.3621850558309668e-09, + "loss": 1.4324, + "step": 27878 + }, + { + "epoch": 0.9984063602342114, + "grad_norm": 1.9988292455673218, + "learning_rate": 1.3023163117242832e-09, + "loss": 1.3851, + "step": 27879 + }, + { + "epoch": 0.9984421723637796, + "grad_norm": 1.5393435955047607, + "learning_rate": 1.2437929211017718e-09, + "loss": 1.4378, + "step": 27880 + }, + { + "epoch": 0.9984779844933479, + "grad_norm": 1.34359872341156, + "learning_rate": 1.1866148847516912e-09, + "loss": 1.4509, + "step": 27881 + }, + { + "epoch": 0.9985137966229162, + "grad_norm": 1.7370821237564087, + "learning_rate": 1.1307822034511973e-09, + "loss": 1.3819, + "step": 27882 + }, + { + "epoch": 0.9985496087524844, + "grad_norm": 2.0988402366638184, + "learning_rate": 1.0762948779441396e-09, + "loss": 1.3734, + "step": 27883 + }, + { + "epoch": 0.9985854208820527, + "grad_norm": 1.5026785135269165, + "learning_rate": 1.0231529089632652e-09, + "loss": 1.4251, + "step": 27884 + }, + { + "epoch": 0.9986212330116211, + "grad_norm": 2.722470283508301, + "learning_rate": 9.713562972302193e-10, + "loss": 1.5023, + "step": 27885 + }, + { + "epoch": 0.9986570451411894, + "grad_norm": 1.7294663190841675, + "learning_rate": 9.209050434333399e-10, + "loss": 1.3209, + "step": 27886 + }, + { + "epoch": 0.9986928572707576, + "grad_norm": 2.4848201274871826, + "learning_rate": 8.717991482609655e-10, + "loss": 1.7707, + "step": 27887 + }, + { + "epoch": 0.9987286694003259, + "grad_norm": 1.535007357597351, + "learning_rate": 8.240386123681276e-10, + "loss": 1.4537, + "step": 27888 + }, + { + "epoch": 0.9987644815298942, + "grad_norm": 1.3857157230377197, + "learning_rate": 7.776234363987555e-10, + "loss": 1.79, + "step": 27889 + }, + { + "epoch": 0.9988002936594624, + "grad_norm": 1.4100427627563477, + "learning_rate": 7.325536209856765e-10, + "loss": 1.578, + "step": 27890 + }, + { + "epoch": 0.9988361057890307, + "grad_norm": 1.3705559968948364, + "learning_rate": 6.888291667173085e-10, + "loss": 1.3029, + "step": 27891 + }, + { + "epoch": 0.9988719179185991, + "grad_norm": 1.7069154977798462, + "learning_rate": 6.464500741820701e-10, + "loss": 1.4618, + "step": 27892 + }, + { + "epoch": 0.9989077300481674, + "grad_norm": 2.2990357875823975, + "learning_rate": 6.054163439683791e-10, + "loss": 1.4391, + "step": 27893 + }, + { + "epoch": 0.9989435421777356, + "grad_norm": 1.4286106824874878, + "learning_rate": 5.65727976620245e-10, + "loss": 1.5029, + "step": 27894 + }, + { + "epoch": 0.9989793543073039, + "grad_norm": 1.4237838983535767, + "learning_rate": 5.273849726705748e-10, + "loss": 1.4531, + "step": 27895 + }, + { + "epoch": 0.9990151664368722, + "grad_norm": 1.7203515768051147, + "learning_rate": 4.903873326300712e-10, + "loss": 1.3107, + "step": 27896 + }, + { + "epoch": 0.9990509785664404, + "grad_norm": 1.3520474433898926, + "learning_rate": 4.5473505700943664e-10, + "loss": 1.3599, + "step": 27897 + }, + { + "epoch": 0.9990867906960087, + "grad_norm": 1.73139488697052, + "learning_rate": 4.2042814627496483e-10, + "loss": 1.4747, + "step": 27898 + }, + { + "epoch": 0.9991226028255771, + "grad_norm": 1.3812834024429321, + "learning_rate": 3.874666008929495e-10, + "loss": 1.3831, + "step": 27899 + }, + { + "epoch": 0.9991584149551453, + "grad_norm": 1.7007373571395874, + "learning_rate": 3.558504213074798e-10, + "loss": 1.186, + "step": 27900 + }, + { + "epoch": 0.9991942270847136, + "grad_norm": 2.0018959045410156, + "learning_rate": 3.255796079404405e-10, + "loss": 1.5158, + "step": 27901 + }, + { + "epoch": 0.9992300392142819, + "grad_norm": 1.486700177192688, + "learning_rate": 2.9665416120261414e-10, + "loss": 1.6186, + "step": 27902 + }, + { + "epoch": 0.9992658513438502, + "grad_norm": 2.046440601348877, + "learning_rate": 2.69074081493681e-10, + "loss": 1.1622, + "step": 27903 + }, + { + "epoch": 0.9993016634734184, + "grad_norm": 1.8606266975402832, + "learning_rate": 2.4283936915781014e-10, + "loss": 1.4256, + "step": 27904 + }, + { + "epoch": 0.9993374756029867, + "grad_norm": 1.6825835704803467, + "learning_rate": 2.1795002457247748e-10, + "loss": 1.3446, + "step": 27905 + }, + { + "epoch": 0.9993732877325551, + "grad_norm": 1.6700379848480225, + "learning_rate": 1.9440604807074992e-10, + "loss": 1.4342, + "step": 27906 + }, + { + "epoch": 0.9994090998621233, + "grad_norm": 1.4203354120254517, + "learning_rate": 1.7220743995238763e-10, + "loss": 1.5311, + "step": 27907 + }, + { + "epoch": 0.9994449119916916, + "grad_norm": 1.6370415687561035, + "learning_rate": 1.5135420052825312e-10, + "loss": 1.5626, + "step": 27908 + }, + { + "epoch": 0.9994807241212599, + "grad_norm": 1.8386884927749634, + "learning_rate": 1.318463300870043e-10, + "loss": 1.1738, + "step": 27909 + }, + { + "epoch": 0.9995165362508281, + "grad_norm": 2.393831253051758, + "learning_rate": 1.136838288728903e-10, + "loss": 1.4101, + "step": 27910 + }, + { + "epoch": 0.9995523483803964, + "grad_norm": 1.738539695739746, + "learning_rate": 9.68666971412624e-11, + "loss": 1.7001, + "step": 27911 + }, + { + "epoch": 0.9995881605099647, + "grad_norm": 1.6437656879425049, + "learning_rate": 8.139493511416518e-11, + "loss": 1.5699, + "step": 27912 + }, + { + "epoch": 0.9996239726395331, + "grad_norm": 1.6513583660125732, + "learning_rate": 6.726854300254104e-11, + "loss": 1.7211, + "step": 27913 + }, + { + "epoch": 0.9996597847691013, + "grad_norm": 1.3128859996795654, + "learning_rate": 5.448752098402565e-11, + "loss": 1.0227, + "step": 27914 + }, + { + "epoch": 0.9996955968986696, + "grad_norm": 1.9902230501174927, + "learning_rate": 4.3051869258459163e-11, + "loss": 1.5862, + "step": 27915 + }, + { + "epoch": 0.9997314090282379, + "grad_norm": 2.3277649879455566, + "learning_rate": 3.296158795906834e-11, + "loss": 1.4745, + "step": 27916 + }, + { + "epoch": 0.9997672211578061, + "grad_norm": 1.625291109085083, + "learning_rate": 2.421667721907994e-11, + "loss": 1.3652, + "step": 27917 + }, + { + "epoch": 0.9998030332873744, + "grad_norm": 1.4065899848937988, + "learning_rate": 1.68171371606185e-11, + "loss": 1.5924, + "step": 27918 + }, + { + "epoch": 0.9998388454169427, + "grad_norm": 1.4588338136672974, + "learning_rate": 1.0762967894706321e-11, + "loss": 1.3983, + "step": 27919 + }, + { + "epoch": 0.9998746575465111, + "grad_norm": 1.98700749874115, + "learning_rate": 6.054169487956784e-12, + "loss": 1.338, + "step": 27920 + }, + { + "epoch": 0.9999104696760793, + "grad_norm": 1.688843846321106, + "learning_rate": 2.690742006983271e-12, + "loss": 1.1956, + "step": 27921 + }, + { + "epoch": 0.9999462818056476, + "grad_norm": 1.4015154838562012, + "learning_rate": 6.726855072969329e-13, + "loss": 1.3722, + "step": 27922 + }, + { + "epoch": 0.9999820939352159, + "grad_norm": 1.3305832147598267, + "learning_rate": 0.0, + "loss": 1.4175, + "step": 27923 + }, + { + "epoch": 0.9999820939352159, + "step": 27923, + "total_flos": 2.3643443035111424e+18, + "train_loss": 1.5265222442264104, + "train_runtime": 42480.1433, + "train_samples_per_second": 5.259, + "train_steps_per_second": 0.657 + } + ], + "logging_steps": 1.0, + "max_steps": 27923, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.3643443035111424e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}