diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8652 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998983843105376, + "eval_steps": 500, + "global_step": 1230, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000812925515699624, + "grad_norm": 34.804351806640625, + "learning_rate": 5.405405405405406e-07, + "loss": 2.022, + "step": 1 + }, + { + "epoch": 0.001625851031399248, + "grad_norm": 35.288848876953125, + "learning_rate": 1.0810810810810812e-06, + "loss": 2.1055, + "step": 2 + }, + { + "epoch": 0.002438776547098872, + "grad_norm": 37.58893585205078, + "learning_rate": 1.6216216216216219e-06, + "loss": 2.0685, + "step": 3 + }, + { + "epoch": 0.003251702062798496, + "grad_norm": 28.51118278503418, + "learning_rate": 2.1621621621621623e-06, + "loss": 2.0364, + "step": 4 + }, + { + "epoch": 0.00406462757849812, + "grad_norm": 24.874475479125977, + "learning_rate": 2.702702702702703e-06, + "loss": 1.9688, + "step": 5 + }, + { + "epoch": 0.004877553094197744, + "grad_norm": 12.156012535095215, + "learning_rate": 3.2432432432432437e-06, + "loss": 1.8677, + "step": 6 + }, + { + "epoch": 0.005690478609897368, + "grad_norm": 7.017012119293213, + "learning_rate": 3.7837837837837844e-06, + "loss": 1.7774, + "step": 7 + }, + { + "epoch": 0.006503404125596992, + "grad_norm": 11.024828910827637, + "learning_rate": 4.324324324324325e-06, + "loss": 1.8042, + "step": 8 + }, + { + "epoch": 0.007316329641296616, + "grad_norm": 6.988280296325684, + "learning_rate": 4.864864864864866e-06, + "loss": 1.7973, + "step": 9 + }, + { + "epoch": 0.00812925515699624, + "grad_norm": 8.541196823120117, + "learning_rate": 5.405405405405406e-06, + "loss": 1.7946, + "step": 10 + }, + { + "epoch": 0.008942180672695864, + "grad_norm": 7.084593772888184, + "learning_rate": 5.945945945945947e-06, + "loss": 1.8178, + "step": 11 + }, + { + "epoch": 0.009755106188395488, + "grad_norm": 5.755589962005615, + "learning_rate": 6.486486486486487e-06, + "loss": 1.7748, + "step": 12 + }, + { + "epoch": 0.010568031704095112, + "grad_norm": 4.855886459350586, + "learning_rate": 7.027027027027028e-06, + "loss": 1.6665, + "step": 13 + }, + { + "epoch": 0.011380957219794737, + "grad_norm": 5.280701160430908, + "learning_rate": 7.567567567567569e-06, + "loss": 1.7226, + "step": 14 + }, + { + "epoch": 0.01219388273549436, + "grad_norm": 4.513389587402344, + "learning_rate": 8.108108108108109e-06, + "loss": 1.7219, + "step": 15 + }, + { + "epoch": 0.013006808251193984, + "grad_norm": 4.92287015914917, + "learning_rate": 8.64864864864865e-06, + "loss": 1.697, + "step": 16 + }, + { + "epoch": 0.013819733766893608, + "grad_norm": 4.488801002502441, + "learning_rate": 9.189189189189191e-06, + "loss": 1.6584, + "step": 17 + }, + { + "epoch": 0.014632659282593233, + "grad_norm": 3.736851930618286, + "learning_rate": 9.729729729729732e-06, + "loss": 1.6752, + "step": 18 + }, + { + "epoch": 0.015445584798292857, + "grad_norm": 3.7089431285858154, + "learning_rate": 1.027027027027027e-05, + "loss": 1.602, + "step": 19 + }, + { + "epoch": 0.01625851031399248, + "grad_norm": 3.9821619987487793, + "learning_rate": 1.0810810810810812e-05, + "loss": 1.6492, + "step": 20 + }, + { + "epoch": 0.017071435829692106, + "grad_norm": 3.72698974609375, + "learning_rate": 1.1351351351351352e-05, + "loss": 1.6893, + "step": 21 + }, + { + "epoch": 0.017884361345391727, + "grad_norm": 3.0124993324279785, + "learning_rate": 1.1891891891891894e-05, + "loss": 1.5879, + "step": 22 + }, + { + "epoch": 0.01869728686109135, + "grad_norm": 9.361907005310059, + "learning_rate": 1.2432432432432433e-05, + "loss": 1.6172, + "step": 23 + }, + { + "epoch": 0.019510212376790976, + "grad_norm": 3.431147813796997, + "learning_rate": 1.2972972972972975e-05, + "loss": 1.6354, + "step": 24 + }, + { + "epoch": 0.0203231378924906, + "grad_norm": 3.3041067123413086, + "learning_rate": 1.3513513513513515e-05, + "loss": 1.5998, + "step": 25 + }, + { + "epoch": 0.021136063408190225, + "grad_norm": 3.4122121334075928, + "learning_rate": 1.4054054054054055e-05, + "loss": 1.5737, + "step": 26 + }, + { + "epoch": 0.02194898892388985, + "grad_norm": 3.538844585418701, + "learning_rate": 1.4594594594594596e-05, + "loss": 1.5737, + "step": 27 + }, + { + "epoch": 0.022761914439589474, + "grad_norm": 3.245887041091919, + "learning_rate": 1.5135135135135138e-05, + "loss": 1.5893, + "step": 28 + }, + { + "epoch": 0.023574839955289098, + "grad_norm": 3.7787671089172363, + "learning_rate": 1.5675675675675676e-05, + "loss": 1.5923, + "step": 29 + }, + { + "epoch": 0.02438776547098872, + "grad_norm": 3.557563066482544, + "learning_rate": 1.6216216216216218e-05, + "loss": 1.5906, + "step": 30 + }, + { + "epoch": 0.025200690986688343, + "grad_norm": 3.1536169052124023, + "learning_rate": 1.6756756756756757e-05, + "loss": 1.5976, + "step": 31 + }, + { + "epoch": 0.026013616502387968, + "grad_norm": 3.060678005218506, + "learning_rate": 1.72972972972973e-05, + "loss": 1.5239, + "step": 32 + }, + { + "epoch": 0.026826542018087592, + "grad_norm": 3.0163331031799316, + "learning_rate": 1.783783783783784e-05, + "loss": 1.5703, + "step": 33 + }, + { + "epoch": 0.027639467533787217, + "grad_norm": 3.0648066997528076, + "learning_rate": 1.8378378378378383e-05, + "loss": 1.5421, + "step": 34 + }, + { + "epoch": 0.02845239304948684, + "grad_norm": 2.8359413146972656, + "learning_rate": 1.891891891891892e-05, + "loss": 1.5698, + "step": 35 + }, + { + "epoch": 0.029265318565186466, + "grad_norm": 3.0256259441375732, + "learning_rate": 1.9459459459459463e-05, + "loss": 1.5258, + "step": 36 + }, + { + "epoch": 0.03007824408088609, + "grad_norm": 3.305952548980713, + "learning_rate": 2e-05, + "loss": 1.5979, + "step": 37 + }, + { + "epoch": 0.030891169596585714, + "grad_norm": 2.7958834171295166, + "learning_rate": 1.99999653272242e-05, + "loss": 1.5065, + "step": 38 + }, + { + "epoch": 0.031704095112285335, + "grad_norm": 3.515479564666748, + "learning_rate": 1.9999861309137232e-05, + "loss": 1.4837, + "step": 39 + }, + { + "epoch": 0.03251702062798496, + "grad_norm": 2.7845990657806396, + "learning_rate": 1.999968794646042e-05, + "loss": 1.5634, + "step": 40 + }, + { + "epoch": 0.033329946143684584, + "grad_norm": 3.0540645122528076, + "learning_rate": 1.9999445240395953e-05, + "loss": 1.5001, + "step": 41 + }, + { + "epoch": 0.03414287165938421, + "grad_norm": 3.059220790863037, + "learning_rate": 1.9999133192626893e-05, + "loss": 1.502, + "step": 42 + }, + { + "epoch": 0.03495579717508383, + "grad_norm": 2.594452142715454, + "learning_rate": 1.9998751805317152e-05, + "loss": 1.5245, + "step": 43 + }, + { + "epoch": 0.035768722690783454, + "grad_norm": 3.0076844692230225, + "learning_rate": 1.999830108111148e-05, + "loss": 1.5032, + "step": 44 + }, + { + "epoch": 0.03658164820648308, + "grad_norm": 2.9521396160125732, + "learning_rate": 1.999778102313545e-05, + "loss": 1.5381, + "step": 45 + }, + { + "epoch": 0.0373945737221827, + "grad_norm": 3.280303478240967, + "learning_rate": 1.999719163499543e-05, + "loss": 1.5478, + "step": 46 + }, + { + "epoch": 0.03820749923788233, + "grad_norm": 2.9089877605438232, + "learning_rate": 1.999653292077857e-05, + "loss": 1.4783, + "step": 47 + }, + { + "epoch": 0.03902042475358195, + "grad_norm": 5.1869635581970215, + "learning_rate": 1.999580488505276e-05, + "loss": 1.5067, + "step": 48 + }, + { + "epoch": 0.03983335026928158, + "grad_norm": 3.053921699523926, + "learning_rate": 1.9995007532866594e-05, + "loss": 1.503, + "step": 49 + }, + { + "epoch": 0.0406462757849812, + "grad_norm": 2.952059507369995, + "learning_rate": 1.9994140869749366e-05, + "loss": 1.4579, + "step": 50 + }, + { + "epoch": 0.04145920130068083, + "grad_norm": 2.609379291534424, + "learning_rate": 1.9993204901710995e-05, + "loss": 1.4679, + "step": 51 + }, + { + "epoch": 0.04227212681638045, + "grad_norm": 3.41717267036438, + "learning_rate": 1.9992199635241997e-05, + "loss": 1.5197, + "step": 52 + }, + { + "epoch": 0.04308505233208007, + "grad_norm": 2.8707101345062256, + "learning_rate": 1.999112507731346e-05, + "loss": 1.5074, + "step": 53 + }, + { + "epoch": 0.0438979778477797, + "grad_norm": 3.325697660446167, + "learning_rate": 1.9989981235376956e-05, + "loss": 1.427, + "step": 54 + }, + { + "epoch": 0.04471090336347932, + "grad_norm": 2.7196686267852783, + "learning_rate": 1.9988768117364526e-05, + "loss": 1.4868, + "step": 55 + }, + { + "epoch": 0.04552382887917895, + "grad_norm": 2.9488351345062256, + "learning_rate": 1.9987485731688595e-05, + "loss": 1.5011, + "step": 56 + }, + { + "epoch": 0.04633675439487857, + "grad_norm": 2.7776849269866943, + "learning_rate": 1.998613408724195e-05, + "loss": 1.4664, + "step": 57 + }, + { + "epoch": 0.047149679910578196, + "grad_norm": 2.719594717025757, + "learning_rate": 1.998471319339763e-05, + "loss": 1.4905, + "step": 58 + }, + { + "epoch": 0.04796260542627782, + "grad_norm": 2.8028323650360107, + "learning_rate": 1.9983223060008908e-05, + "loss": 1.4754, + "step": 59 + }, + { + "epoch": 0.04877553094197744, + "grad_norm": 3.1789817810058594, + "learning_rate": 1.9981663697409203e-05, + "loss": 1.4618, + "step": 60 + }, + { + "epoch": 0.049588456457677066, + "grad_norm": 3.077449321746826, + "learning_rate": 1.998003511641199e-05, + "loss": 1.453, + "step": 61 + }, + { + "epoch": 0.05040138197337669, + "grad_norm": 2.960418939590454, + "learning_rate": 1.997833732831076e-05, + "loss": 1.4564, + "step": 62 + }, + { + "epoch": 0.051214307489076315, + "grad_norm": 5.316094875335693, + "learning_rate": 1.9976570344878916e-05, + "loss": 1.4711, + "step": 63 + }, + { + "epoch": 0.052027233004775936, + "grad_norm": 3.257415771484375, + "learning_rate": 1.9974734178369702e-05, + "loss": 1.4606, + "step": 64 + }, + { + "epoch": 0.052840158520475564, + "grad_norm": 3.0437912940979004, + "learning_rate": 1.997282884151612e-05, + "loss": 1.5075, + "step": 65 + }, + { + "epoch": 0.053653084036175185, + "grad_norm": 3.3059332370758057, + "learning_rate": 1.9970854347530828e-05, + "loss": 1.484, + "step": 66 + }, + { + "epoch": 0.05446600955187481, + "grad_norm": 4.510897636413574, + "learning_rate": 1.9968810710106065e-05, + "loss": 1.5091, + "step": 67 + }, + { + "epoch": 0.05527893506757443, + "grad_norm": 3.3621528148651123, + "learning_rate": 1.9966697943413548e-05, + "loss": 1.4603, + "step": 68 + }, + { + "epoch": 0.056091860583274054, + "grad_norm": 2.878563642501831, + "learning_rate": 1.9964516062104377e-05, + "loss": 1.4438, + "step": 69 + }, + { + "epoch": 0.05690478609897368, + "grad_norm": 2.8587141036987305, + "learning_rate": 1.996226508130892e-05, + "loss": 1.441, + "step": 70 + }, + { + "epoch": 0.0577177116146733, + "grad_norm": 3.2675728797912598, + "learning_rate": 1.995994501663674e-05, + "loss": 1.4515, + "step": 71 + }, + { + "epoch": 0.05853063713037293, + "grad_norm": 3.018068790435791, + "learning_rate": 1.995755588417644e-05, + "loss": 1.4499, + "step": 72 + }, + { + "epoch": 0.05934356264607255, + "grad_norm": 3.715628147125244, + "learning_rate": 1.99550977004956e-05, + "loss": 1.4624, + "step": 73 + }, + { + "epoch": 0.06015648816177218, + "grad_norm": 2.7632699012756348, + "learning_rate": 1.9952570482640628e-05, + "loss": 1.4437, + "step": 74 + }, + { + "epoch": 0.0609694136774718, + "grad_norm": 3.3581650257110596, + "learning_rate": 1.9949974248136655e-05, + "loss": 1.4865, + "step": 75 + }, + { + "epoch": 0.06178233919317143, + "grad_norm": 3.090432643890381, + "learning_rate": 1.9947309014987414e-05, + "loss": 1.4416, + "step": 76 + }, + { + "epoch": 0.06259526470887104, + "grad_norm": 3.3709418773651123, + "learning_rate": 1.9944574801675106e-05, + "loss": 1.4184, + "step": 77 + }, + { + "epoch": 0.06340819022457067, + "grad_norm": 3.6959853172302246, + "learning_rate": 1.9941771627160287e-05, + "loss": 1.4694, + "step": 78 + }, + { + "epoch": 0.0642211157402703, + "grad_norm": 3.2907724380493164, + "learning_rate": 1.9938899510881732e-05, + "loss": 1.4121, + "step": 79 + }, + { + "epoch": 0.06503404125596993, + "grad_norm": 2.7885124683380127, + "learning_rate": 1.9935958472756283e-05, + "loss": 1.4033, + "step": 80 + }, + { + "epoch": 0.06584696677166954, + "grad_norm": 2.8771262168884277, + "learning_rate": 1.993294853317873e-05, + "loss": 1.4466, + "step": 81 + }, + { + "epoch": 0.06665989228736917, + "grad_norm": 3.572303056716919, + "learning_rate": 1.9929869713021668e-05, + "loss": 1.3854, + "step": 82 + }, + { + "epoch": 0.0674728178030688, + "grad_norm": 2.636934757232666, + "learning_rate": 1.9926722033635343e-05, + "loss": 1.4186, + "step": 83 + }, + { + "epoch": 0.06828574331876842, + "grad_norm": 3.1140427589416504, + "learning_rate": 1.9923505516847514e-05, + "loss": 1.424, + "step": 84 + }, + { + "epoch": 0.06909866883446804, + "grad_norm": 2.808480739593506, + "learning_rate": 1.9920220184963296e-05, + "loss": 1.4744, + "step": 85 + }, + { + "epoch": 0.06991159435016767, + "grad_norm": 3.118234872817993, + "learning_rate": 1.9916866060764994e-05, + "loss": 1.4277, + "step": 86 + }, + { + "epoch": 0.0707245198658673, + "grad_norm": 4.0702033042907715, + "learning_rate": 1.991344316751198e-05, + "loss": 1.4236, + "step": 87 + }, + { + "epoch": 0.07153744538156691, + "grad_norm": 2.938345193862915, + "learning_rate": 1.9909951528940485e-05, + "loss": 1.4119, + "step": 88 + }, + { + "epoch": 0.07235037089726654, + "grad_norm": 2.960853338241577, + "learning_rate": 1.990639116926348e-05, + "loss": 1.471, + "step": 89 + }, + { + "epoch": 0.07316329641296616, + "grad_norm": 3.146742343902588, + "learning_rate": 1.9902762113170467e-05, + "loss": 1.4751, + "step": 90 + }, + { + "epoch": 0.07397622192866579, + "grad_norm": 3.3954169750213623, + "learning_rate": 1.989906438582734e-05, + "loss": 1.467, + "step": 91 + }, + { + "epoch": 0.0747891474443654, + "grad_norm": 2.9790520668029785, + "learning_rate": 1.9895298012876192e-05, + "loss": 1.507, + "step": 92 + }, + { + "epoch": 0.07560207296006503, + "grad_norm": 2.577925682067871, + "learning_rate": 1.9891463020435144e-05, + "loss": 1.4728, + "step": 93 + }, + { + "epoch": 0.07641499847576466, + "grad_norm": 3.437133550643921, + "learning_rate": 1.9887559435098162e-05, + "loss": 1.4472, + "step": 94 + }, + { + "epoch": 0.07722792399146428, + "grad_norm": 2.806886911392212, + "learning_rate": 1.9883587283934875e-05, + "loss": 1.4497, + "step": 95 + }, + { + "epoch": 0.0780408495071639, + "grad_norm": 2.703793525695801, + "learning_rate": 1.9879546594490383e-05, + "loss": 1.4643, + "step": 96 + }, + { + "epoch": 0.07885377502286353, + "grad_norm": 3.2830615043640137, + "learning_rate": 1.987543739478507e-05, + "loss": 1.4162, + "step": 97 + }, + { + "epoch": 0.07966670053856316, + "grad_norm": 2.5376830101013184, + "learning_rate": 1.987125971331441e-05, + "loss": 1.494, + "step": 98 + }, + { + "epoch": 0.08047962605426277, + "grad_norm": 2.532893180847168, + "learning_rate": 1.9867013579048765e-05, + "loss": 1.4575, + "step": 99 + }, + { + "epoch": 0.0812925515699624, + "grad_norm": 2.838155508041382, + "learning_rate": 1.9862699021433186e-05, + "loss": 1.4007, + "step": 100 + }, + { + "epoch": 0.08210547708566203, + "grad_norm": 2.5777368545532227, + "learning_rate": 1.9858316070387208e-05, + "loss": 1.4213, + "step": 101 + }, + { + "epoch": 0.08291840260136166, + "grad_norm": 2.634209394454956, + "learning_rate": 1.9853864756304654e-05, + "loss": 1.4544, + "step": 102 + }, + { + "epoch": 0.08373132811706127, + "grad_norm": 2.9893202781677246, + "learning_rate": 1.9849345110053405e-05, + "loss": 1.4361, + "step": 103 + }, + { + "epoch": 0.0845442536327609, + "grad_norm": 2.668808698654175, + "learning_rate": 1.984475716297519e-05, + "loss": 1.4267, + "step": 104 + }, + { + "epoch": 0.08535717914846053, + "grad_norm": 3.2199463844299316, + "learning_rate": 1.984010094688539e-05, + "loss": 1.4731, + "step": 105 + }, + { + "epoch": 0.08617010466416014, + "grad_norm": 2.746006965637207, + "learning_rate": 1.9835376494072788e-05, + "loss": 1.385, + "step": 106 + }, + { + "epoch": 0.08698303017985977, + "grad_norm": 2.955232620239258, + "learning_rate": 1.9830583837299363e-05, + "loss": 1.3984, + "step": 107 + }, + { + "epoch": 0.0877959556955594, + "grad_norm": 5.357511520385742, + "learning_rate": 1.9825723009800058e-05, + "loss": 1.4562, + "step": 108 + }, + { + "epoch": 0.08860888121125902, + "grad_norm": 2.5583655834198, + "learning_rate": 1.9820794045282553e-05, + "loss": 1.4222, + "step": 109 + }, + { + "epoch": 0.08942180672695864, + "grad_norm": 2.6951992511749268, + "learning_rate": 1.9815796977927015e-05, + "loss": 1.4697, + "step": 110 + }, + { + "epoch": 0.09023473224265827, + "grad_norm": 2.714019775390625, + "learning_rate": 1.9810731842385892e-05, + "loss": 1.4696, + "step": 111 + }, + { + "epoch": 0.0910476577583579, + "grad_norm": 3.2998311519622803, + "learning_rate": 1.9805598673783644e-05, + "loss": 1.4034, + "step": 112 + }, + { + "epoch": 0.09186058327405751, + "grad_norm": 7.7324652671813965, + "learning_rate": 1.980039750771651e-05, + "loss": 1.4697, + "step": 113 + }, + { + "epoch": 0.09267350878975714, + "grad_norm": 2.6200242042541504, + "learning_rate": 1.9795128380252263e-05, + "loss": 1.451, + "step": 114 + }, + { + "epoch": 0.09348643430545676, + "grad_norm": 2.937061071395874, + "learning_rate": 1.978979132792996e-05, + "loss": 1.4348, + "step": 115 + }, + { + "epoch": 0.09429935982115639, + "grad_norm": 3.087085247039795, + "learning_rate": 1.9784386387759684e-05, + "loss": 1.4271, + "step": 116 + }, + { + "epoch": 0.095112285336856, + "grad_norm": 2.6796271800994873, + "learning_rate": 1.977891359722229e-05, + "loss": 1.4933, + "step": 117 + }, + { + "epoch": 0.09592521085255563, + "grad_norm": 3.4506633281707764, + "learning_rate": 1.9773372994269147e-05, + "loss": 1.427, + "step": 118 + }, + { + "epoch": 0.09673813636825526, + "grad_norm": 2.532562732696533, + "learning_rate": 1.976776461732187e-05, + "loss": 1.436, + "step": 119 + }, + { + "epoch": 0.09755106188395488, + "grad_norm": 3.619605541229248, + "learning_rate": 1.976208850527206e-05, + "loss": 1.4384, + "step": 120 + }, + { + "epoch": 0.0983639873996545, + "grad_norm": 3.5245602130889893, + "learning_rate": 1.9756344697481027e-05, + "loss": 1.4303, + "step": 121 + }, + { + "epoch": 0.09917691291535413, + "grad_norm": 2.649686336517334, + "learning_rate": 1.975053323377952e-05, + "loss": 1.4692, + "step": 122 + }, + { + "epoch": 0.09998983843105376, + "grad_norm": 2.516016721725464, + "learning_rate": 1.9744654154467468e-05, + "loss": 1.4154, + "step": 123 + }, + { + "epoch": 0.10080276394675337, + "grad_norm": 2.591168165206909, + "learning_rate": 1.9738707500313655e-05, + "loss": 1.403, + "step": 124 + }, + { + "epoch": 0.101615689462453, + "grad_norm": 2.522486686706543, + "learning_rate": 1.9732693312555492e-05, + "loss": 1.4575, + "step": 125 + }, + { + "epoch": 0.10242861497815263, + "grad_norm": 2.8282413482666016, + "learning_rate": 1.9726611632898693e-05, + "loss": 1.377, + "step": 126 + }, + { + "epoch": 0.10324154049385226, + "grad_norm": 2.598076820373535, + "learning_rate": 1.9720462503517e-05, + "loss": 1.4382, + "step": 127 + }, + { + "epoch": 0.10405446600955187, + "grad_norm": 2.688178777694702, + "learning_rate": 1.971424596705189e-05, + "loss": 1.4132, + "step": 128 + }, + { + "epoch": 0.1048673915252515, + "grad_norm": 2.7512471675872803, + "learning_rate": 1.9707962066612278e-05, + "loss": 1.4193, + "step": 129 + }, + { + "epoch": 0.10568031704095113, + "grad_norm": 2.8025805950164795, + "learning_rate": 1.970161084577422e-05, + "loss": 1.3829, + "step": 130 + }, + { + "epoch": 0.10649324255665074, + "grad_norm": 2.6514623165130615, + "learning_rate": 1.9695192348580606e-05, + "loss": 1.4362, + "step": 131 + }, + { + "epoch": 0.10730616807235037, + "grad_norm": 2.4559547901153564, + "learning_rate": 1.9688706619540863e-05, + "loss": 1.4357, + "step": 132 + }, + { + "epoch": 0.10811909358805, + "grad_norm": 2.8258724212646484, + "learning_rate": 1.968215370363063e-05, + "loss": 1.4501, + "step": 133 + }, + { + "epoch": 0.10893201910374962, + "grad_norm": 2.8553593158721924, + "learning_rate": 1.9675533646291463e-05, + "loss": 1.4841, + "step": 134 + }, + { + "epoch": 0.10974494461944924, + "grad_norm": 3.625079870223999, + "learning_rate": 1.9668846493430522e-05, + "loss": 1.47, + "step": 135 + }, + { + "epoch": 0.11055787013514887, + "grad_norm": 2.9793193340301514, + "learning_rate": 1.9662092291420233e-05, + "loss": 1.3969, + "step": 136 + }, + { + "epoch": 0.1113707956508485, + "grad_norm": 2.5699939727783203, + "learning_rate": 1.965527108709798e-05, + "loss": 1.4258, + "step": 137 + }, + { + "epoch": 0.11218372116654811, + "grad_norm": 2.7961106300354004, + "learning_rate": 1.964838292776579e-05, + "loss": 1.4637, + "step": 138 + }, + { + "epoch": 0.11299664668224774, + "grad_norm": 3.3331451416015625, + "learning_rate": 1.9641427861189973e-05, + "loss": 1.3976, + "step": 139 + }, + { + "epoch": 0.11380957219794736, + "grad_norm": 2.5645205974578857, + "learning_rate": 1.963440593560083e-05, + "loss": 1.409, + "step": 140 + }, + { + "epoch": 0.11462249771364699, + "grad_norm": 2.5996487140655518, + "learning_rate": 1.9627317199692287e-05, + "loss": 1.4834, + "step": 141 + }, + { + "epoch": 0.1154354232293466, + "grad_norm": 2.9811034202575684, + "learning_rate": 1.962016170262157e-05, + "loss": 1.4508, + "step": 142 + }, + { + "epoch": 0.11624834874504623, + "grad_norm": 2.4133377075195312, + "learning_rate": 1.961293949400888e-05, + "loss": 1.4077, + "step": 143 + }, + { + "epoch": 0.11706127426074586, + "grad_norm": 2.622091770172119, + "learning_rate": 1.960565062393701e-05, + "loss": 1.4046, + "step": 144 + }, + { + "epoch": 0.11787419977644548, + "grad_norm": 2.857346534729004, + "learning_rate": 1.9598295142951035e-05, + "loss": 1.4217, + "step": 145 + }, + { + "epoch": 0.1186871252921451, + "grad_norm": 2.600935220718384, + "learning_rate": 1.9590873102057948e-05, + "loss": 1.403, + "step": 146 + }, + { + "epoch": 0.11950005080784473, + "grad_norm": 2.820359945297241, + "learning_rate": 1.9583384552726294e-05, + "loss": 1.4358, + "step": 147 + }, + { + "epoch": 0.12031297632354436, + "grad_norm": 2.6272051334381104, + "learning_rate": 1.957582954688584e-05, + "loss": 1.4505, + "step": 148 + }, + { + "epoch": 0.12112590183924397, + "grad_norm": 2.8003182411193848, + "learning_rate": 1.9568208136927177e-05, + "loss": 1.3977, + "step": 149 + }, + { + "epoch": 0.1219388273549436, + "grad_norm": 3.560518264770508, + "learning_rate": 1.9560520375701408e-05, + "loss": 1.3992, + "step": 150 + }, + { + "epoch": 0.12275175287064323, + "grad_norm": 2.6377906799316406, + "learning_rate": 1.9552766316519726e-05, + "loss": 1.4022, + "step": 151 + }, + { + "epoch": 0.12356467838634286, + "grad_norm": 2.7982730865478516, + "learning_rate": 1.9544946013153093e-05, + "loss": 1.409, + "step": 152 + }, + { + "epoch": 0.12437760390204247, + "grad_norm": 2.7179160118103027, + "learning_rate": 1.9537059519831822e-05, + "loss": 1.415, + "step": 153 + }, + { + "epoch": 0.12519052941774209, + "grad_norm": 2.959397554397583, + "learning_rate": 1.9529106891245244e-05, + "loss": 1.4296, + "step": 154 + }, + { + "epoch": 0.12600345493344173, + "grad_norm": 3.3228979110717773, + "learning_rate": 1.9521088182541298e-05, + "loss": 1.4282, + "step": 155 + }, + { + "epoch": 0.12681638044914134, + "grad_norm": 2.763151168823242, + "learning_rate": 1.951300344932616e-05, + "loss": 1.3686, + "step": 156 + }, + { + "epoch": 0.12762930596484098, + "grad_norm": 2.863190174102783, + "learning_rate": 1.9504852747663862e-05, + "loss": 1.4227, + "step": 157 + }, + { + "epoch": 0.1284422314805406, + "grad_norm": 2.889604330062866, + "learning_rate": 1.9496636134075894e-05, + "loss": 1.4658, + "step": 158 + }, + { + "epoch": 0.1292551569962402, + "grad_norm": 3.024122476577759, + "learning_rate": 1.9488353665540813e-05, + "loss": 1.4081, + "step": 159 + }, + { + "epoch": 0.13006808251193985, + "grad_norm": 2.4810218811035156, + "learning_rate": 1.9480005399493857e-05, + "loss": 1.4296, + "step": 160 + }, + { + "epoch": 0.13088100802763947, + "grad_norm": 2.643673896789551, + "learning_rate": 1.9471591393826536e-05, + "loss": 1.3652, + "step": 161 + }, + { + "epoch": 0.13169393354333908, + "grad_norm": 2.888829231262207, + "learning_rate": 1.9463111706886234e-05, + "loss": 1.4003, + "step": 162 + }, + { + "epoch": 0.13250685905903872, + "grad_norm": 2.7480149269104004, + "learning_rate": 1.9454566397475813e-05, + "loss": 1.4195, + "step": 163 + }, + { + "epoch": 0.13331978457473834, + "grad_norm": 2.68972110748291, + "learning_rate": 1.944595552485319e-05, + "loss": 1.3848, + "step": 164 + }, + { + "epoch": 0.13413271009043795, + "grad_norm": 2.8888440132141113, + "learning_rate": 1.943727914873094e-05, + "loss": 1.481, + "step": 165 + }, + { + "epoch": 0.1349456356061376, + "grad_norm": 2.8409390449523926, + "learning_rate": 1.9428537329275862e-05, + "loss": 1.4176, + "step": 166 + }, + { + "epoch": 0.1357585611218372, + "grad_norm": 2.4992098808288574, + "learning_rate": 1.941973012710859e-05, + "loss": 1.395, + "step": 167 + }, + { + "epoch": 0.13657148663753685, + "grad_norm": 4.587447166442871, + "learning_rate": 1.941085760330316e-05, + "loss": 1.3905, + "step": 168 + }, + { + "epoch": 0.13738441215323646, + "grad_norm": 2.4778833389282227, + "learning_rate": 1.940191981938657e-05, + "loss": 1.3707, + "step": 169 + }, + { + "epoch": 0.13819733766893608, + "grad_norm": 2.7843387126922607, + "learning_rate": 1.9392916837338376e-05, + "loss": 1.3698, + "step": 170 + }, + { + "epoch": 0.13901026318463572, + "grad_norm": 2.731437921524048, + "learning_rate": 1.9383848719590257e-05, + "loss": 1.4358, + "step": 171 + }, + { + "epoch": 0.13982318870033533, + "grad_norm": 3.079371213912964, + "learning_rate": 1.9374715529025575e-05, + "loss": 1.4027, + "step": 172 + }, + { + "epoch": 0.14063611421603495, + "grad_norm": 3.6557998657226562, + "learning_rate": 1.9365517328978943e-05, + "loss": 1.428, + "step": 173 + }, + { + "epoch": 0.1414490397317346, + "grad_norm": 2.9291248321533203, + "learning_rate": 1.9356254183235785e-05, + "loss": 1.4039, + "step": 174 + }, + { + "epoch": 0.1422619652474342, + "grad_norm": 2.498507499694824, + "learning_rate": 1.93469261560319e-05, + "loss": 1.3731, + "step": 175 + }, + { + "epoch": 0.14307489076313382, + "grad_norm": 3.6117923259735107, + "learning_rate": 1.9337533312053002e-05, + "loss": 1.4263, + "step": 176 + }, + { + "epoch": 0.14388781627883346, + "grad_norm": 2.490755319595337, + "learning_rate": 1.9328075716434287e-05, + "loss": 1.4215, + "step": 177 + }, + { + "epoch": 0.14470074179453307, + "grad_norm": 2.9008986949920654, + "learning_rate": 1.931855343475998e-05, + "loss": 1.3968, + "step": 178 + }, + { + "epoch": 0.14551366731023269, + "grad_norm": 5.284730911254883, + "learning_rate": 1.930896653306286e-05, + "loss": 1.418, + "step": 179 + }, + { + "epoch": 0.14632659282593233, + "grad_norm": 2.826756000518799, + "learning_rate": 1.929931507782383e-05, + "loss": 1.3996, + "step": 180 + }, + { + "epoch": 0.14713951834163194, + "grad_norm": 2.8084652423858643, + "learning_rate": 1.9289599135971437e-05, + "loss": 1.374, + "step": 181 + }, + { + "epoch": 0.14795244385733158, + "grad_norm": 2.736046075820923, + "learning_rate": 1.9279818774881418e-05, + "loss": 1.3687, + "step": 182 + }, + { + "epoch": 0.1487653693730312, + "grad_norm": 2.7098567485809326, + "learning_rate": 1.9269974062376224e-05, + "loss": 1.4059, + "step": 183 + }, + { + "epoch": 0.1495782948887308, + "grad_norm": 2.7764878273010254, + "learning_rate": 1.926006506672456e-05, + "loss": 1.42, + "step": 184 + }, + { + "epoch": 0.15039122040443045, + "grad_norm": 2.7715649604797363, + "learning_rate": 1.9250091856640895e-05, + "loss": 1.4549, + "step": 185 + }, + { + "epoch": 0.15120414592013007, + "grad_norm": 2.4104158878326416, + "learning_rate": 1.9240054501285015e-05, + "loss": 1.4129, + "step": 186 + }, + { + "epoch": 0.15201707143582968, + "grad_norm": 2.75614595413208, + "learning_rate": 1.922995307026151e-05, + "loss": 1.3959, + "step": 187 + }, + { + "epoch": 0.15282999695152932, + "grad_norm": 2.813262939453125, + "learning_rate": 1.921978763361931e-05, + "loss": 1.4139, + "step": 188 + }, + { + "epoch": 0.15364292246722894, + "grad_norm": 2.5106594562530518, + "learning_rate": 1.9209558261851194e-05, + "loss": 1.3683, + "step": 189 + }, + { + "epoch": 0.15445584798292855, + "grad_norm": 2.9257330894470215, + "learning_rate": 1.919926502589331e-05, + "loss": 1.3387, + "step": 190 + }, + { + "epoch": 0.1552687734986282, + "grad_norm": 2.5029993057250977, + "learning_rate": 1.9188907997124666e-05, + "loss": 1.3892, + "step": 191 + }, + { + "epoch": 0.1560816990143278, + "grad_norm": 2.6917388439178467, + "learning_rate": 1.9178487247366652e-05, + "loss": 1.3946, + "step": 192 + }, + { + "epoch": 0.15689462453002745, + "grad_norm": 2.7038626670837402, + "learning_rate": 1.916800284888253e-05, + "loss": 1.4082, + "step": 193 + }, + { + "epoch": 0.15770755004572706, + "grad_norm": 2.586545467376709, + "learning_rate": 1.915745487437694e-05, + "loss": 1.3431, + "step": 194 + }, + { + "epoch": 0.15852047556142668, + "grad_norm": 3.043938159942627, + "learning_rate": 1.9146843396995396e-05, + "loss": 1.3967, + "step": 195 + }, + { + "epoch": 0.15933340107712632, + "grad_norm": 2.80709171295166, + "learning_rate": 1.9136168490323772e-05, + "loss": 1.3617, + "step": 196 + }, + { + "epoch": 0.16014632659282593, + "grad_norm": 5.03334903717041, + "learning_rate": 1.9125430228387794e-05, + "loss": 1.4326, + "step": 197 + }, + { + "epoch": 0.16095925210852555, + "grad_norm": 4.717489719390869, + "learning_rate": 1.9114628685652535e-05, + "loss": 1.3459, + "step": 198 + }, + { + "epoch": 0.1617721776242252, + "grad_norm": 3.0668435096740723, + "learning_rate": 1.9103763937021887e-05, + "loss": 1.3763, + "step": 199 + }, + { + "epoch": 0.1625851031399248, + "grad_norm": 2.712122678756714, + "learning_rate": 1.909283605783805e-05, + "loss": 1.3319, + "step": 200 + }, + { + "epoch": 0.16339802865562442, + "grad_norm": 2.7631924152374268, + "learning_rate": 1.9081845123881002e-05, + "loss": 1.3641, + "step": 201 + }, + { + "epoch": 0.16421095417132406, + "grad_norm": 3.499955654144287, + "learning_rate": 1.9070791211367984e-05, + "loss": 1.3259, + "step": 202 + }, + { + "epoch": 0.16502387968702367, + "grad_norm": 2.913755416870117, + "learning_rate": 1.9059674396952963e-05, + "loss": 1.3386, + "step": 203 + }, + { + "epoch": 0.16583680520272331, + "grad_norm": 2.5671772956848145, + "learning_rate": 1.90484947577261e-05, + "loss": 1.3301, + "step": 204 + }, + { + "epoch": 0.16664973071842293, + "grad_norm": 3.3566508293151855, + "learning_rate": 1.903725237121322e-05, + "loss": 1.3375, + "step": 205 + }, + { + "epoch": 0.16746265623412254, + "grad_norm": 2.3617210388183594, + "learning_rate": 1.902594731537527e-05, + "loss": 1.4476, + "step": 206 + }, + { + "epoch": 0.16827558174982218, + "grad_norm": 2.8202669620513916, + "learning_rate": 1.901457966860779e-05, + "loss": 1.334, + "step": 207 + }, + { + "epoch": 0.1690885072655218, + "grad_norm": 2.5990843772888184, + "learning_rate": 1.9003149509740347e-05, + "loss": 1.4321, + "step": 208 + }, + { + "epoch": 0.1699014327812214, + "grad_norm": 5.9826507568359375, + "learning_rate": 1.899165691803601e-05, + "loss": 1.4338, + "step": 209 + }, + { + "epoch": 0.17071435829692105, + "grad_norm": 3.9570019245147705, + "learning_rate": 1.8980101973190787e-05, + "loss": 1.3265, + "step": 210 + }, + { + "epoch": 0.17152728381262067, + "grad_norm": 2.8985307216644287, + "learning_rate": 1.896848475533309e-05, + "loss": 1.3297, + "step": 211 + }, + { + "epoch": 0.17234020932832028, + "grad_norm": 3.2575559616088867, + "learning_rate": 1.8956805345023145e-05, + "loss": 1.4086, + "step": 212 + }, + { + "epoch": 0.17315313484401992, + "grad_norm": 3.264796733856201, + "learning_rate": 1.894506382325248e-05, + "loss": 1.391, + "step": 213 + }, + { + "epoch": 0.17396606035971954, + "grad_norm": 2.767975330352783, + "learning_rate": 1.8933260271443313e-05, + "loss": 1.3731, + "step": 214 + }, + { + "epoch": 0.17477898587541915, + "grad_norm": 2.3556087017059326, + "learning_rate": 1.8921394771448032e-05, + "loss": 1.3288, + "step": 215 + }, + { + "epoch": 0.1755919113911188, + "grad_norm": 4.253211975097656, + "learning_rate": 1.89094674055486e-05, + "loss": 1.3776, + "step": 216 + }, + { + "epoch": 0.1764048369068184, + "grad_norm": 3.0681605339050293, + "learning_rate": 1.889747825645599e-05, + "loss": 1.4169, + "step": 217 + }, + { + "epoch": 0.17721776242251805, + "grad_norm": 2.3741588592529297, + "learning_rate": 1.8885427407309627e-05, + "loss": 1.3392, + "step": 218 + }, + { + "epoch": 0.17803068793821766, + "grad_norm": 2.968780279159546, + "learning_rate": 1.887331494167678e-05, + "loss": 1.4019, + "step": 219 + }, + { + "epoch": 0.17884361345391728, + "grad_norm": 2.3684914112091064, + "learning_rate": 1.8861140943552014e-05, + "loss": 1.3599, + "step": 220 + }, + { + "epoch": 0.17965653896961692, + "grad_norm": 3.0405993461608887, + "learning_rate": 1.884890549735659e-05, + "loss": 1.4245, + "step": 221 + }, + { + "epoch": 0.18046946448531653, + "grad_norm": 3.397047281265259, + "learning_rate": 1.8836608687937883e-05, + "loss": 1.392, + "step": 222 + }, + { + "epoch": 0.18128239000101615, + "grad_norm": 2.4693644046783447, + "learning_rate": 1.8824250600568798e-05, + "loss": 1.3726, + "step": 223 + }, + { + "epoch": 0.1820953155167158, + "grad_norm": 6.75039005279541, + "learning_rate": 1.8811831320947177e-05, + "loss": 1.3473, + "step": 224 + }, + { + "epoch": 0.1829082410324154, + "grad_norm": 2.922574758529663, + "learning_rate": 1.879935093519519e-05, + "loss": 1.4221, + "step": 225 + }, + { + "epoch": 0.18372116654811502, + "grad_norm": 5.6719136238098145, + "learning_rate": 1.878680952985877e-05, + "loss": 1.3844, + "step": 226 + }, + { + "epoch": 0.18453409206381466, + "grad_norm": 2.6967201232910156, + "learning_rate": 1.8774207191906976e-05, + "loss": 1.344, + "step": 227 + }, + { + "epoch": 0.18534701757951427, + "grad_norm": 3.049881935119629, + "learning_rate": 1.8761544008731426e-05, + "loss": 1.3912, + "step": 228 + }, + { + "epoch": 0.18615994309521391, + "grad_norm": 3.1408843994140625, + "learning_rate": 1.874882006814565e-05, + "loss": 1.4048, + "step": 229 + }, + { + "epoch": 0.18697286861091353, + "grad_norm": 2.6653666496276855, + "learning_rate": 1.8736035458384528e-05, + "loss": 1.3844, + "step": 230 + }, + { + "epoch": 0.18778579412661314, + "grad_norm": 2.6866488456726074, + "learning_rate": 1.8723190268103634e-05, + "loss": 1.3586, + "step": 231 + }, + { + "epoch": 0.18859871964231278, + "grad_norm": 3.2653231620788574, + "learning_rate": 1.8710284586378645e-05, + "loss": 1.3856, + "step": 232 + }, + { + "epoch": 0.1894116451580124, + "grad_norm": 2.841388463973999, + "learning_rate": 1.8697318502704734e-05, + "loss": 1.3868, + "step": 233 + }, + { + "epoch": 0.190224570673712, + "grad_norm": 2.797558307647705, + "learning_rate": 1.8684292106995916e-05, + "loss": 1.3885, + "step": 234 + }, + { + "epoch": 0.19103749618941165, + "grad_norm": 2.915003776550293, + "learning_rate": 1.8671205489584453e-05, + "loss": 1.3434, + "step": 235 + }, + { + "epoch": 0.19185042170511127, + "grad_norm": 3.2142281532287598, + "learning_rate": 1.865805874122021e-05, + "loss": 1.3975, + "step": 236 + }, + { + "epoch": 0.19266334722081088, + "grad_norm": 3.0831453800201416, + "learning_rate": 1.8644851953070045e-05, + "loss": 1.367, + "step": 237 + }, + { + "epoch": 0.19347627273651052, + "grad_norm": 3.2555181980133057, + "learning_rate": 1.863158521671716e-05, + "loss": 1.33, + "step": 238 + }, + { + "epoch": 0.19428919825221014, + "grad_norm": 2.8768310546875, + "learning_rate": 1.8618258624160465e-05, + "loss": 1.3867, + "step": 239 + }, + { + "epoch": 0.19510212376790975, + "grad_norm": 2.9737942218780518, + "learning_rate": 1.8604872267813954e-05, + "loss": 1.3726, + "step": 240 + }, + { + "epoch": 0.1959150492836094, + "grad_norm": 2.5942904949188232, + "learning_rate": 1.859142624050605e-05, + "loss": 1.3704, + "step": 241 + }, + { + "epoch": 0.196727974799309, + "grad_norm": 2.6901443004608154, + "learning_rate": 1.8577920635478976e-05, + "loss": 1.3523, + "step": 242 + }, + { + "epoch": 0.19754090031500865, + "grad_norm": 2.4508392810821533, + "learning_rate": 1.8564355546388094e-05, + "loss": 1.3758, + "step": 243 + }, + { + "epoch": 0.19835382583070826, + "grad_norm": 2.3041279315948486, + "learning_rate": 1.855073106730126e-05, + "loss": 1.3491, + "step": 244 + }, + { + "epoch": 0.19916675134640788, + "grad_norm": 2.8388736248016357, + "learning_rate": 1.8537047292698175e-05, + "loss": 1.3578, + "step": 245 + }, + { + "epoch": 0.19997967686210752, + "grad_norm": 3.058314085006714, + "learning_rate": 1.852330431746973e-05, + "loss": 1.3547, + "step": 246 + }, + { + "epoch": 0.20079260237780713, + "grad_norm": 2.881788492202759, + "learning_rate": 1.8509502236917353e-05, + "loss": 1.3823, + "step": 247 + }, + { + "epoch": 0.20160552789350675, + "grad_norm": 2.623408794403076, + "learning_rate": 1.8495641146752322e-05, + "loss": 1.4516, + "step": 248 + }, + { + "epoch": 0.2024184534092064, + "grad_norm": 2.662614345550537, + "learning_rate": 1.848172114309513e-05, + "loss": 1.3924, + "step": 249 + }, + { + "epoch": 0.203231378924906, + "grad_norm": 2.520263671875, + "learning_rate": 1.8467742322474822e-05, + "loss": 1.4097, + "step": 250 + }, + { + "epoch": 0.20404430444060562, + "grad_norm": 4.465703964233398, + "learning_rate": 1.845370478182829e-05, + "loss": 1.3645, + "step": 251 + }, + { + "epoch": 0.20485722995630526, + "grad_norm": 2.5109176635742188, + "learning_rate": 1.8439608618499637e-05, + "loss": 1.3238, + "step": 252 + }, + { + "epoch": 0.20567015547200487, + "grad_norm": 2.703659772872925, + "learning_rate": 1.842545393023949e-05, + "loss": 1.4027, + "step": 253 + }, + { + "epoch": 0.20648308098770451, + "grad_norm": 3.483933448791504, + "learning_rate": 1.841124081520431e-05, + "loss": 1.4167, + "step": 254 + }, + { + "epoch": 0.20729600650340413, + "grad_norm": 2.7172889709472656, + "learning_rate": 1.8396969371955724e-05, + "loss": 1.3017, + "step": 255 + }, + { + "epoch": 0.20810893201910374, + "grad_norm": 2.512045383453369, + "learning_rate": 1.838263969945985e-05, + "loss": 1.4112, + "step": 256 + }, + { + "epoch": 0.20892185753480338, + "grad_norm": 2.4449141025543213, + "learning_rate": 1.836825189708659e-05, + "loss": 1.3396, + "step": 257 + }, + { + "epoch": 0.209734783050503, + "grad_norm": 2.9280951023101807, + "learning_rate": 1.8353806064608953e-05, + "loss": 1.3461, + "step": 258 + }, + { + "epoch": 0.2105477085662026, + "grad_norm": 3.962769031524658, + "learning_rate": 1.833930230220236e-05, + "loss": 1.3347, + "step": 259 + }, + { + "epoch": 0.21136063408190225, + "grad_norm": 3.3168771266937256, + "learning_rate": 1.8324740710443955e-05, + "loss": 1.3264, + "step": 260 + }, + { + "epoch": 0.21217355959760187, + "grad_norm": 2.754786252975464, + "learning_rate": 1.831012139031189e-05, + "loss": 1.3859, + "step": 261 + }, + { + "epoch": 0.21298648511330148, + "grad_norm": 2.5179426670074463, + "learning_rate": 1.829544444318466e-05, + "loss": 1.3653, + "step": 262 + }, + { + "epoch": 0.21379941062900112, + "grad_norm": 2.9228906631469727, + "learning_rate": 1.8280709970840352e-05, + "loss": 1.3929, + "step": 263 + }, + { + "epoch": 0.21461233614470074, + "grad_norm": 2.732806921005249, + "learning_rate": 1.8265918075455985e-05, + "loss": 1.3197, + "step": 264 + }, + { + "epoch": 0.21542526166040035, + "grad_norm": 2.7236287593841553, + "learning_rate": 1.8251068859606777e-05, + "loss": 1.3156, + "step": 265 + }, + { + "epoch": 0.2162381871761, + "grad_norm": 3.677654504776001, + "learning_rate": 1.823616242626542e-05, + "loss": 1.3565, + "step": 266 + }, + { + "epoch": 0.2170511126917996, + "grad_norm": 2.4574098587036133, + "learning_rate": 1.8221198878801415e-05, + "loss": 1.3802, + "step": 267 + }, + { + "epoch": 0.21786403820749925, + "grad_norm": 3.2601144313812256, + "learning_rate": 1.8206178320980295e-05, + "loss": 1.3606, + "step": 268 + }, + { + "epoch": 0.21867696372319886, + "grad_norm": 2.4183156490325928, + "learning_rate": 1.819110085696295e-05, + "loss": 1.3327, + "step": 269 + }, + { + "epoch": 0.21948988923889848, + "grad_norm": 2.6820755004882812, + "learning_rate": 1.817596659130489e-05, + "loss": 1.3676, + "step": 270 + }, + { + "epoch": 0.22030281475459812, + "grad_norm": 2.619580030441284, + "learning_rate": 1.816077562895551e-05, + "loss": 1.408, + "step": 271 + }, + { + "epoch": 0.22111574027029773, + "grad_norm": 2.4499645233154297, + "learning_rate": 1.814552807525738e-05, + "loss": 1.3445, + "step": 272 + }, + { + "epoch": 0.22192866578599735, + "grad_norm": 2.5966873168945312, + "learning_rate": 1.81302240359455e-05, + "loss": 1.3354, + "step": 273 + }, + { + "epoch": 0.222741591301697, + "grad_norm": 8.227926254272461, + "learning_rate": 1.8114863617146576e-05, + "loss": 1.3495, + "step": 274 + }, + { + "epoch": 0.2235545168173966, + "grad_norm": 5.334491729736328, + "learning_rate": 1.8099446925378278e-05, + "loss": 1.3845, + "step": 275 + }, + { + "epoch": 0.22436744233309622, + "grad_norm": 2.436473846435547, + "learning_rate": 1.8083974067548506e-05, + "loss": 1.3152, + "step": 276 + }, + { + "epoch": 0.22518036784879586, + "grad_norm": 2.4906110763549805, + "learning_rate": 1.806844515095465e-05, + "loss": 1.3213, + "step": 277 + }, + { + "epoch": 0.22599329336449547, + "grad_norm": 2.627547264099121, + "learning_rate": 1.8052860283282832e-05, + "loss": 1.3394, + "step": 278 + }, + { + "epoch": 0.22680621888019512, + "grad_norm": 3.9034616947174072, + "learning_rate": 1.8037219572607177e-05, + "loss": 1.2956, + "step": 279 + }, + { + "epoch": 0.22761914439589473, + "grad_norm": 2.9307639598846436, + "learning_rate": 1.8021523127389066e-05, + "loss": 1.3507, + "step": 280 + }, + { + "epoch": 0.22843206991159434, + "grad_norm": 2.6711225509643555, + "learning_rate": 1.800577105647635e-05, + "loss": 1.4043, + "step": 281 + }, + { + "epoch": 0.22924499542729398, + "grad_norm": 2.9251246452331543, + "learning_rate": 1.7989963469102643e-05, + "loss": 1.3424, + "step": 282 + }, + { + "epoch": 0.2300579209429936, + "grad_norm": 2.2818679809570312, + "learning_rate": 1.797410047488653e-05, + "loss": 1.334, + "step": 283 + }, + { + "epoch": 0.2308708464586932, + "grad_norm": 2.6961264610290527, + "learning_rate": 1.7958182183830816e-05, + "loss": 1.3411, + "step": 284 + }, + { + "epoch": 0.23168377197439285, + "grad_norm": 2.5082268714904785, + "learning_rate": 1.794220870632177e-05, + "loss": 1.3815, + "step": 285 + }, + { + "epoch": 0.23249669749009247, + "grad_norm": 2.6569674015045166, + "learning_rate": 1.7926180153128358e-05, + "loss": 1.4037, + "step": 286 + }, + { + "epoch": 0.23330962300579208, + "grad_norm": 2.559483289718628, + "learning_rate": 1.791009663540146e-05, + "loss": 1.333, + "step": 287 + }, + { + "epoch": 0.23412254852149172, + "grad_norm": 2.6982040405273438, + "learning_rate": 1.789395826467312e-05, + "loss": 1.4168, + "step": 288 + }, + { + "epoch": 0.23493547403719134, + "grad_norm": 2.414900541305542, + "learning_rate": 1.7877765152855757e-05, + "loss": 1.3583, + "step": 289 + }, + { + "epoch": 0.23574839955289095, + "grad_norm": 2.465045928955078, + "learning_rate": 1.78615174122414e-05, + "loss": 1.44, + "step": 290 + }, + { + "epoch": 0.2365613250685906, + "grad_norm": 2.306795597076416, + "learning_rate": 1.78452151555009e-05, + "loss": 1.3215, + "step": 291 + }, + { + "epoch": 0.2373742505842902, + "grad_norm": 2.6841700077056885, + "learning_rate": 1.7828858495683162e-05, + "loss": 1.351, + "step": 292 + }, + { + "epoch": 0.23818717609998985, + "grad_norm": 2.4231340885162354, + "learning_rate": 1.781244754621434e-05, + "loss": 1.3923, + "step": 293 + }, + { + "epoch": 0.23900010161568946, + "grad_norm": 2.8300161361694336, + "learning_rate": 1.779598242089707e-05, + "loss": 1.3876, + "step": 294 + }, + { + "epoch": 0.23981302713138908, + "grad_norm": 2.6287200450897217, + "learning_rate": 1.7779463233909677e-05, + "loss": 1.3609, + "step": 295 + }, + { + "epoch": 0.24062595264708872, + "grad_norm": 2.656332015991211, + "learning_rate": 1.7762890099805362e-05, + "loss": 1.3538, + "step": 296 + }, + { + "epoch": 0.24143887816278833, + "grad_norm": 2.5331099033355713, + "learning_rate": 1.774626313351145e-05, + "loss": 1.3154, + "step": 297 + }, + { + "epoch": 0.24225180367848795, + "grad_norm": 2.8881306648254395, + "learning_rate": 1.7729582450328547e-05, + "loss": 1.3561, + "step": 298 + }, + { + "epoch": 0.2430647291941876, + "grad_norm": 2.4491260051727295, + "learning_rate": 1.771284816592978e-05, + "loss": 1.3494, + "step": 299 + }, + { + "epoch": 0.2438776547098872, + "grad_norm": 2.8161392211914062, + "learning_rate": 1.7696060396359956e-05, + "loss": 1.3125, + "step": 300 + }, + { + "epoch": 0.24469058022558682, + "grad_norm": 2.788238048553467, + "learning_rate": 1.7679219258034798e-05, + "loss": 1.41, + "step": 301 + }, + { + "epoch": 0.24550350574128646, + "grad_norm": 3.0948519706726074, + "learning_rate": 1.7662324867740102e-05, + "loss": 1.4138, + "step": 302 + }, + { + "epoch": 0.24631643125698607, + "grad_norm": 3.617783308029175, + "learning_rate": 1.7645377342630956e-05, + "loss": 1.3995, + "step": 303 + }, + { + "epoch": 0.24712935677268572, + "grad_norm": 2.713531255722046, + "learning_rate": 1.76283768002309e-05, + "loss": 1.354, + "step": 304 + }, + { + "epoch": 0.24794228228838533, + "grad_norm": 3.9215407371520996, + "learning_rate": 1.7611323358431145e-05, + "loss": 1.3939, + "step": 305 + }, + { + "epoch": 0.24875520780408494, + "grad_norm": 3.519932508468628, + "learning_rate": 1.759421713548971e-05, + "loss": 1.3311, + "step": 306 + }, + { + "epoch": 0.24956813331978459, + "grad_norm": 3.0680055618286133, + "learning_rate": 1.757705825003065e-05, + "loss": 1.4131, + "step": 307 + }, + { + "epoch": 0.25038105883548417, + "grad_norm": 2.456533908843994, + "learning_rate": 1.7559846821043205e-05, + "loss": 1.3132, + "step": 308 + }, + { + "epoch": 0.25119398435118384, + "grad_norm": 2.6937081813812256, + "learning_rate": 1.754258296788097e-05, + "loss": 1.3041, + "step": 309 + }, + { + "epoch": 0.25200690986688346, + "grad_norm": 5.319806098937988, + "learning_rate": 1.7525266810261096e-05, + "loss": 1.3544, + "step": 310 + }, + { + "epoch": 0.25281983538258307, + "grad_norm": 2.9595742225646973, + "learning_rate": 1.7507898468263422e-05, + "loss": 1.3528, + "step": 311 + }, + { + "epoch": 0.2536327608982827, + "grad_norm": 4.085862636566162, + "learning_rate": 1.7490478062329686e-05, + "loss": 1.3314, + "step": 312 + }, + { + "epoch": 0.2544456864139823, + "grad_norm": 2.4585909843444824, + "learning_rate": 1.7473005713262644e-05, + "loss": 1.3622, + "step": 313 + }, + { + "epoch": 0.25525861192968197, + "grad_norm": 2.4798450469970703, + "learning_rate": 1.7455481542225272e-05, + "loss": 1.3804, + "step": 314 + }, + { + "epoch": 0.2560715374453816, + "grad_norm": 2.686068534851074, + "learning_rate": 1.7437905670739893e-05, + "loss": 1.2945, + "step": 315 + }, + { + "epoch": 0.2568844629610812, + "grad_norm": 2.7424585819244385, + "learning_rate": 1.7420278220687366e-05, + "loss": 1.3561, + "step": 316 + }, + { + "epoch": 0.2576973884767808, + "grad_norm": 2.964237928390503, + "learning_rate": 1.7402599314306207e-05, + "loss": 1.3701, + "step": 317 + }, + { + "epoch": 0.2585103139924804, + "grad_norm": 2.7983458042144775, + "learning_rate": 1.7384869074191777e-05, + "loss": 1.3536, + "step": 318 + }, + { + "epoch": 0.25932323950818004, + "grad_norm": 2.6008524894714355, + "learning_rate": 1.7367087623295394e-05, + "loss": 1.3394, + "step": 319 + }, + { + "epoch": 0.2601361650238797, + "grad_norm": 2.4116249084472656, + "learning_rate": 1.7349255084923517e-05, + "loss": 1.3785, + "step": 320 + }, + { + "epoch": 0.2609490905395793, + "grad_norm": 2.9649388790130615, + "learning_rate": 1.7331371582736864e-05, + "loss": 1.3779, + "step": 321 + }, + { + "epoch": 0.26176201605527893, + "grad_norm": 2.692847490310669, + "learning_rate": 1.731343724074957e-05, + "loss": 1.3715, + "step": 322 + }, + { + "epoch": 0.26257494157097855, + "grad_norm": 2.6246955394744873, + "learning_rate": 1.7295452183328317e-05, + "loss": 1.3856, + "step": 323 + }, + { + "epoch": 0.26338786708667816, + "grad_norm": 2.822334051132202, + "learning_rate": 1.7277416535191478e-05, + "loss": 1.3289, + "step": 324 + }, + { + "epoch": 0.26420079260237783, + "grad_norm": 2.703158378601074, + "learning_rate": 1.7259330421408247e-05, + "loss": 1.3447, + "step": 325 + }, + { + "epoch": 0.26501371811807745, + "grad_norm": 2.5357322692871094, + "learning_rate": 1.7241193967397784e-05, + "loss": 1.3414, + "step": 326 + }, + { + "epoch": 0.26582664363377706, + "grad_norm": 2.7839202880859375, + "learning_rate": 1.7223007298928322e-05, + "loss": 1.3725, + "step": 327 + }, + { + "epoch": 0.2666395691494767, + "grad_norm": 2.6645684242248535, + "learning_rate": 1.7204770542116326e-05, + "loss": 1.3163, + "step": 328 + }, + { + "epoch": 0.2674524946651763, + "grad_norm": 4.677945137023926, + "learning_rate": 1.7186483823425582e-05, + "loss": 1.3583, + "step": 329 + }, + { + "epoch": 0.2682654201808759, + "grad_norm": 2.948094367980957, + "learning_rate": 1.7168147269666357e-05, + "loss": 1.3643, + "step": 330 + }, + { + "epoch": 0.26907834569657557, + "grad_norm": 2.5047991275787354, + "learning_rate": 1.714976100799449e-05, + "loss": 1.3542, + "step": 331 + }, + { + "epoch": 0.2698912712122752, + "grad_norm": 2.680239677429199, + "learning_rate": 1.713132516591053e-05, + "loss": 1.3204, + "step": 332 + }, + { + "epoch": 0.2707041967279748, + "grad_norm": 2.703165054321289, + "learning_rate": 1.7112839871258838e-05, + "loss": 1.3467, + "step": 333 + }, + { + "epoch": 0.2715171222436744, + "grad_norm": 2.5855846405029297, + "learning_rate": 1.7094305252226713e-05, + "loss": 1.3807, + "step": 334 + }, + { + "epoch": 0.272330047759374, + "grad_norm": 2.8401761054992676, + "learning_rate": 1.7075721437343488e-05, + "loss": 1.4032, + "step": 335 + }, + { + "epoch": 0.2731429732750737, + "grad_norm": 2.727287530899048, + "learning_rate": 1.705708855547966e-05, + "loss": 1.3416, + "step": 336 + }, + { + "epoch": 0.2739558987907733, + "grad_norm": 2.9767589569091797, + "learning_rate": 1.7038406735845967e-05, + "loss": 1.3062, + "step": 337 + }, + { + "epoch": 0.2747688243064729, + "grad_norm": 2.6532137393951416, + "learning_rate": 1.7019676107992523e-05, + "loss": 1.3717, + "step": 338 + }, + { + "epoch": 0.27558174982217254, + "grad_norm": 5.618951797485352, + "learning_rate": 1.70008968018079e-05, + "loss": 1.4021, + "step": 339 + }, + { + "epoch": 0.27639467533787215, + "grad_norm": 2.75219464302063, + "learning_rate": 1.6982068947518235e-05, + "loss": 1.3345, + "step": 340 + }, + { + "epoch": 0.27720760085357177, + "grad_norm": 2.7771074771881104, + "learning_rate": 1.6963192675686312e-05, + "loss": 1.3613, + "step": 341 + }, + { + "epoch": 0.27802052636927144, + "grad_norm": 2.4822003841400146, + "learning_rate": 1.694426811721069e-05, + "loss": 1.3465, + "step": 342 + }, + { + "epoch": 0.27883345188497105, + "grad_norm": 2.684894323348999, + "learning_rate": 1.6925295403324758e-05, + "loss": 1.337, + "step": 343 + }, + { + "epoch": 0.27964637740067066, + "grad_norm": 2.804255962371826, + "learning_rate": 1.6906274665595854e-05, + "loss": 1.2862, + "step": 344 + }, + { + "epoch": 0.2804593029163703, + "grad_norm": 2.7327306270599365, + "learning_rate": 1.688720603592432e-05, + "loss": 1.3826, + "step": 345 + }, + { + "epoch": 0.2812722284320699, + "grad_norm": 3.0967769622802734, + "learning_rate": 1.6868089646542632e-05, + "loss": 1.3406, + "step": 346 + }, + { + "epoch": 0.28208515394776956, + "grad_norm": 2.4972376823425293, + "learning_rate": 1.6848925630014445e-05, + "loss": 1.3315, + "step": 347 + }, + { + "epoch": 0.2828980794634692, + "grad_norm": 22.60991668701172, + "learning_rate": 1.6829714119233688e-05, + "loss": 1.3325, + "step": 348 + }, + { + "epoch": 0.2837110049791688, + "grad_norm": 3.207625389099121, + "learning_rate": 1.6810455247423634e-05, + "loss": 1.3926, + "step": 349 + }, + { + "epoch": 0.2845239304948684, + "grad_norm": 2.6568946838378906, + "learning_rate": 1.6791149148136003e-05, + "loss": 1.3464, + "step": 350 + }, + { + "epoch": 0.285336856010568, + "grad_norm": 2.9483156204223633, + "learning_rate": 1.677179595525e-05, + "loss": 1.2875, + "step": 351 + }, + { + "epoch": 0.28614978152626763, + "grad_norm": 2.841442584991455, + "learning_rate": 1.675239580297141e-05, + "loss": 1.3441, + "step": 352 + }, + { + "epoch": 0.2869627070419673, + "grad_norm": 3.3877551555633545, + "learning_rate": 1.6732948825831657e-05, + "loss": 1.3662, + "step": 353 + }, + { + "epoch": 0.2877756325576669, + "grad_norm": 2.9442946910858154, + "learning_rate": 1.671345515868688e-05, + "loss": 1.3075, + "step": 354 + }, + { + "epoch": 0.28858855807336653, + "grad_norm": 2.672950029373169, + "learning_rate": 1.6693914936716983e-05, + "loss": 1.2982, + "step": 355 + }, + { + "epoch": 0.28940148358906614, + "grad_norm": 2.7699198722839355, + "learning_rate": 1.6674328295424723e-05, + "loss": 1.3331, + "step": 356 + }, + { + "epoch": 0.29021440910476576, + "grad_norm": 2.578444719314575, + "learning_rate": 1.6654695370634738e-05, + "loss": 1.3768, + "step": 357 + }, + { + "epoch": 0.29102733462046537, + "grad_norm": 2.748466968536377, + "learning_rate": 1.6635016298492628e-05, + "loss": 1.3108, + "step": 358 + }, + { + "epoch": 0.29184026013616504, + "grad_norm": 2.818321943283081, + "learning_rate": 1.6615291215464005e-05, + "loss": 1.2586, + "step": 359 + }, + { + "epoch": 0.29265318565186466, + "grad_norm": 3.6742396354675293, + "learning_rate": 1.6595520258333545e-05, + "loss": 1.3112, + "step": 360 + }, + { + "epoch": 0.29346611116756427, + "grad_norm": 2.999140977859497, + "learning_rate": 1.657570356420404e-05, + "loss": 1.2923, + "step": 361 + }, + { + "epoch": 0.2942790366832639, + "grad_norm": 2.704463481903076, + "learning_rate": 1.6555841270495456e-05, + "loss": 1.3329, + "step": 362 + }, + { + "epoch": 0.2950919621989635, + "grad_norm": 3.2639801502227783, + "learning_rate": 1.6535933514943955e-05, + "loss": 1.3215, + "step": 363 + }, + { + "epoch": 0.29590488771466317, + "grad_norm": 3.2200841903686523, + "learning_rate": 1.6515980435600965e-05, + "loss": 1.3792, + "step": 364 + }, + { + "epoch": 0.2967178132303628, + "grad_norm": 2.9226245880126953, + "learning_rate": 1.6495982170832224e-05, + "loss": 1.3565, + "step": 365 + }, + { + "epoch": 0.2975307387460624, + "grad_norm": 3.096405029296875, + "learning_rate": 1.6475938859316795e-05, + "loss": 1.3857, + "step": 366 + }, + { + "epoch": 0.298343664261762, + "grad_norm": 2.7694365978240967, + "learning_rate": 1.6455850640046134e-05, + "loss": 1.3782, + "step": 367 + }, + { + "epoch": 0.2991565897774616, + "grad_norm": 3.011751890182495, + "learning_rate": 1.6435717652323097e-05, + "loss": 1.3426, + "step": 368 + }, + { + "epoch": 0.29996951529316124, + "grad_norm": 2.7828853130340576, + "learning_rate": 1.6415540035761008e-05, + "loss": 1.3429, + "step": 369 + }, + { + "epoch": 0.3007824408088609, + "grad_norm": 2.5543785095214844, + "learning_rate": 1.639531793028265e-05, + "loss": 1.3768, + "step": 370 + }, + { + "epoch": 0.3015953663245605, + "grad_norm": 2.8462271690368652, + "learning_rate": 1.637505147611934e-05, + "loss": 1.3203, + "step": 371 + }, + { + "epoch": 0.30240829184026013, + "grad_norm": 2.404257297515869, + "learning_rate": 1.6354740813809917e-05, + "loss": 1.3693, + "step": 372 + }, + { + "epoch": 0.30322121735595975, + "grad_norm": 2.674553394317627, + "learning_rate": 1.6334386084199787e-05, + "loss": 1.3518, + "step": 373 + }, + { + "epoch": 0.30403414287165936, + "grad_norm": 2.4954397678375244, + "learning_rate": 1.631398742843995e-05, + "loss": 1.3669, + "step": 374 + }, + { + "epoch": 0.30484706838735903, + "grad_norm": 3.333721876144409, + "learning_rate": 1.629354498798601e-05, + "loss": 1.3358, + "step": 375 + }, + { + "epoch": 0.30565999390305865, + "grad_norm": 2.859560966491699, + "learning_rate": 1.627305890459719e-05, + "loss": 1.3334, + "step": 376 + }, + { + "epoch": 0.30647291941875826, + "grad_norm": 2.8346803188323975, + "learning_rate": 1.625252932033538e-05, + "loss": 1.3366, + "step": 377 + }, + { + "epoch": 0.3072858449344579, + "grad_norm": 2.64909029006958, + "learning_rate": 1.6231956377564095e-05, + "loss": 1.3398, + "step": 378 + }, + { + "epoch": 0.3080987704501575, + "grad_norm": 3.935067653656006, + "learning_rate": 1.621134021894756e-05, + "loss": 1.2953, + "step": 379 + }, + { + "epoch": 0.3089116959658571, + "grad_norm": 5.056494235992432, + "learning_rate": 1.619068098744965e-05, + "loss": 1.3245, + "step": 380 + }, + { + "epoch": 0.30972462148155677, + "grad_norm": 2.9668800830841064, + "learning_rate": 1.6169978826332955e-05, + "loss": 1.3199, + "step": 381 + }, + { + "epoch": 0.3105375469972564, + "grad_norm": 2.6101276874542236, + "learning_rate": 1.6149233879157747e-05, + "loss": 1.3317, + "step": 382 + }, + { + "epoch": 0.311350472512956, + "grad_norm": 2.677374839782715, + "learning_rate": 1.6128446289781012e-05, + "loss": 1.304, + "step": 383 + }, + { + "epoch": 0.3121633980286556, + "grad_norm": 4.049331188201904, + "learning_rate": 1.610761620235543e-05, + "loss": 1.3241, + "step": 384 + }, + { + "epoch": 0.31297632354435523, + "grad_norm": 2.566908836364746, + "learning_rate": 1.60867437613284e-05, + "loss": 1.3392, + "step": 385 + }, + { + "epoch": 0.3137892490600549, + "grad_norm": 2.550367832183838, + "learning_rate": 1.6065829111441e-05, + "loss": 1.3274, + "step": 386 + }, + { + "epoch": 0.3146021745757545, + "grad_norm": 4.543491363525391, + "learning_rate": 1.6044872397727037e-05, + "loss": 1.2993, + "step": 387 + }, + { + "epoch": 0.3154151000914541, + "grad_norm": 2.8900489807128906, + "learning_rate": 1.6023873765511993e-05, + "loss": 1.3274, + "step": 388 + }, + { + "epoch": 0.31622802560715374, + "grad_norm": 2.4930450916290283, + "learning_rate": 1.6002833360412044e-05, + "loss": 1.3074, + "step": 389 + }, + { + "epoch": 0.31704095112285335, + "grad_norm": 3.0221235752105713, + "learning_rate": 1.5981751328333036e-05, + "loss": 1.3077, + "step": 390 + }, + { + "epoch": 0.31785387663855297, + "grad_norm": 3.0569851398468018, + "learning_rate": 1.5960627815469486e-05, + "loss": 1.3705, + "step": 391 + }, + { + "epoch": 0.31866680215425264, + "grad_norm": 7.261632442474365, + "learning_rate": 1.5939462968303554e-05, + "loss": 1.3564, + "step": 392 + }, + { + "epoch": 0.31947972766995225, + "grad_norm": 3.0555789470672607, + "learning_rate": 1.5918256933604047e-05, + "loss": 1.3451, + "step": 393 + }, + { + "epoch": 0.32029265318565187, + "grad_norm": 3.360779047012329, + "learning_rate": 1.589700985842538e-05, + "loss": 1.2764, + "step": 394 + }, + { + "epoch": 0.3211055787013515, + "grad_norm": 2.9022507667541504, + "learning_rate": 1.5875721890106574e-05, + "loss": 1.3424, + "step": 395 + }, + { + "epoch": 0.3219185042170511, + "grad_norm": 5.119380474090576, + "learning_rate": 1.5854393176270205e-05, + "loss": 1.3392, + "step": 396 + }, + { + "epoch": 0.32273142973275076, + "grad_norm": 2.7554409503936768, + "learning_rate": 1.5833023864821427e-05, + "loss": 1.3762, + "step": 397 + }, + { + "epoch": 0.3235443552484504, + "grad_norm": 2.553323984146118, + "learning_rate": 1.5811614103946905e-05, + "loss": 1.3066, + "step": 398 + }, + { + "epoch": 0.32435728076415, + "grad_norm": 3.514381170272827, + "learning_rate": 1.5790164042113805e-05, + "loss": 1.3575, + "step": 399 + }, + { + "epoch": 0.3251702062798496, + "grad_norm": 2.89054012298584, + "learning_rate": 1.576867382806877e-05, + "loss": 1.3106, + "step": 400 + }, + { + "epoch": 0.3259831317955492, + "grad_norm": 2.9955763816833496, + "learning_rate": 1.5747143610836873e-05, + "loss": 1.3634, + "step": 401 + }, + { + "epoch": 0.32679605731124883, + "grad_norm": 3.175438404083252, + "learning_rate": 1.5725573539720592e-05, + "loss": 1.2876, + "step": 402 + }, + { + "epoch": 0.3276089828269485, + "grad_norm": 2.6269116401672363, + "learning_rate": 1.570396376429877e-05, + "loss": 1.342, + "step": 403 + }, + { + "epoch": 0.3284219083426481, + "grad_norm": 2.900568962097168, + "learning_rate": 1.5682314434425593e-05, + "loss": 1.3133, + "step": 404 + }, + { + "epoch": 0.32923483385834773, + "grad_norm": 2.6711323261260986, + "learning_rate": 1.5660625700229526e-05, + "loss": 1.2702, + "step": 405 + }, + { + "epoch": 0.33004775937404734, + "grad_norm": 2.8045928478240967, + "learning_rate": 1.5638897712112303e-05, + "loss": 1.3336, + "step": 406 + }, + { + "epoch": 0.33086068488974696, + "grad_norm": 2.9632303714752197, + "learning_rate": 1.561713062074785e-05, + "loss": 1.3546, + "step": 407 + }, + { + "epoch": 0.33167361040544663, + "grad_norm": 2.5156984329223633, + "learning_rate": 1.5595324577081265e-05, + "loss": 1.3587, + "step": 408 + }, + { + "epoch": 0.33248653592114624, + "grad_norm": 2.6634364128112793, + "learning_rate": 1.5573479732327758e-05, + "loss": 1.3317, + "step": 409 + }, + { + "epoch": 0.33329946143684586, + "grad_norm": 4.38008451461792, + "learning_rate": 1.555159623797161e-05, + "loss": 1.3078, + "step": 410 + }, + { + "epoch": 0.33411238695254547, + "grad_norm": 3.089078903198242, + "learning_rate": 1.552967424576512e-05, + "loss": 1.328, + "step": 411 + }, + { + "epoch": 0.3349253124682451, + "grad_norm": 2.9011247158050537, + "learning_rate": 1.5507713907727557e-05, + "loss": 1.349, + "step": 412 + }, + { + "epoch": 0.3357382379839447, + "grad_norm": 2.431152582168579, + "learning_rate": 1.5485715376144087e-05, + "loss": 1.383, + "step": 413 + }, + { + "epoch": 0.33655116349964437, + "grad_norm": 2.6097633838653564, + "learning_rate": 1.5463678803564753e-05, + "loss": 1.3414, + "step": 414 + }, + { + "epoch": 0.337364089015344, + "grad_norm": 2.9973533153533936, + "learning_rate": 1.5441604342803374e-05, + "loss": 1.3359, + "step": 415 + }, + { + "epoch": 0.3381770145310436, + "grad_norm": 2.849950075149536, + "learning_rate": 1.5419492146936518e-05, + "loss": 1.3378, + "step": 416 + }, + { + "epoch": 0.3389899400467432, + "grad_norm": 2.600947856903076, + "learning_rate": 1.5397342369302425e-05, + "loss": 1.3411, + "step": 417 + }, + { + "epoch": 0.3398028655624428, + "grad_norm": 2.946190595626831, + "learning_rate": 1.5375155163499953e-05, + "loss": 1.2981, + "step": 418 + }, + { + "epoch": 0.34061579107814244, + "grad_norm": 3.5300893783569336, + "learning_rate": 1.5352930683387502e-05, + "loss": 1.3717, + "step": 419 + }, + { + "epoch": 0.3414287165938421, + "grad_norm": 2.342288017272949, + "learning_rate": 1.5330669083081956e-05, + "loss": 1.2734, + "step": 420 + }, + { + "epoch": 0.3422416421095417, + "grad_norm": 3.7037856578826904, + "learning_rate": 1.5308370516957617e-05, + "loss": 1.3402, + "step": 421 + }, + { + "epoch": 0.34305456762524134, + "grad_norm": 2.5814309120178223, + "learning_rate": 1.528603513964511e-05, + "loss": 1.3207, + "step": 422 + }, + { + "epoch": 0.34386749314094095, + "grad_norm": 2.4542317390441895, + "learning_rate": 1.5263663106030347e-05, + "loss": 1.3257, + "step": 423 + }, + { + "epoch": 0.34468041865664056, + "grad_norm": 2.689870595932007, + "learning_rate": 1.5241254571253433e-05, + "loss": 1.3105, + "step": 424 + }, + { + "epoch": 0.34549334417234023, + "grad_norm": 2.900061845779419, + "learning_rate": 1.5218809690707583e-05, + "loss": 1.3113, + "step": 425 + }, + { + "epoch": 0.34630626968803985, + "grad_norm": 2.7165238857269287, + "learning_rate": 1.5196328620038059e-05, + "loss": 1.335, + "step": 426 + }, + { + "epoch": 0.34711919520373946, + "grad_norm": 2.3893747329711914, + "learning_rate": 1.5173811515141083e-05, + "loss": 1.3062, + "step": 427 + }, + { + "epoch": 0.3479321207194391, + "grad_norm": 2.568575143814087, + "learning_rate": 1.5151258532162771e-05, + "loss": 1.3338, + "step": 428 + }, + { + "epoch": 0.3487450462351387, + "grad_norm": 3.406301736831665, + "learning_rate": 1.5128669827498024e-05, + "loss": 1.3189, + "step": 429 + }, + { + "epoch": 0.3495579717508383, + "grad_norm": 2.752307653427124, + "learning_rate": 1.5106045557789453e-05, + "loss": 1.331, + "step": 430 + }, + { + "epoch": 0.350370897266538, + "grad_norm": 2.570742130279541, + "learning_rate": 1.5083385879926309e-05, + "loss": 1.2887, + "step": 431 + }, + { + "epoch": 0.3511838227822376, + "grad_norm": 2.4754555225372314, + "learning_rate": 1.5060690951043385e-05, + "loss": 1.3432, + "step": 432 + }, + { + "epoch": 0.3519967482979372, + "grad_norm": 3.853609561920166, + "learning_rate": 1.5037960928519902e-05, + "loss": 1.3625, + "step": 433 + }, + { + "epoch": 0.3528096738136368, + "grad_norm": 2.6506130695343018, + "learning_rate": 1.501519596997847e-05, + "loss": 1.2797, + "step": 434 + }, + { + "epoch": 0.35362259932933643, + "grad_norm": 2.8529601097106934, + "learning_rate": 1.499239623328394e-05, + "loss": 1.2868, + "step": 435 + }, + { + "epoch": 0.3544355248450361, + "grad_norm": 4.091727256774902, + "learning_rate": 1.4969561876542348e-05, + "loss": 1.2648, + "step": 436 + }, + { + "epoch": 0.3552484503607357, + "grad_norm": 2.5217483043670654, + "learning_rate": 1.4946693058099802e-05, + "loss": 1.2792, + "step": 437 + }, + { + "epoch": 0.3560613758764353, + "grad_norm": 3.035297155380249, + "learning_rate": 1.4923789936541378e-05, + "loss": 1.3267, + "step": 438 + }, + { + "epoch": 0.35687430139213494, + "grad_norm": 4.371755599975586, + "learning_rate": 1.4900852670690044e-05, + "loss": 1.3114, + "step": 439 + }, + { + "epoch": 0.35768722690783455, + "grad_norm": 2.904101610183716, + "learning_rate": 1.487788141960553e-05, + "loss": 1.3716, + "step": 440 + }, + { + "epoch": 0.35850015242353417, + "grad_norm": 2.663241147994995, + "learning_rate": 1.4854876342583246e-05, + "loss": 1.3269, + "step": 441 + }, + { + "epoch": 0.35931307793923384, + "grad_norm": 2.626646041870117, + "learning_rate": 1.4831837599153165e-05, + "loss": 1.3077, + "step": 442 + }, + { + "epoch": 0.36012600345493345, + "grad_norm": 2.8876073360443115, + "learning_rate": 1.4808765349078729e-05, + "loss": 1.2807, + "step": 443 + }, + { + "epoch": 0.36093892897063307, + "grad_norm": 2.5428106784820557, + "learning_rate": 1.4785659752355724e-05, + "loss": 1.3242, + "step": 444 + }, + { + "epoch": 0.3617518544863327, + "grad_norm": 2.7515244483947754, + "learning_rate": 1.4762520969211186e-05, + "loss": 1.3356, + "step": 445 + }, + { + "epoch": 0.3625647800020323, + "grad_norm": 2.771684408187866, + "learning_rate": 1.4739349160102285e-05, + "loss": 1.3255, + "step": 446 + }, + { + "epoch": 0.36337770551773196, + "grad_norm": 2.7270543575286865, + "learning_rate": 1.4716144485715209e-05, + "loss": 1.2797, + "step": 447 + }, + { + "epoch": 0.3641906310334316, + "grad_norm": 3.5211868286132812, + "learning_rate": 1.4692907106964051e-05, + "loss": 1.3098, + "step": 448 + }, + { + "epoch": 0.3650035565491312, + "grad_norm": 5.923196315765381, + "learning_rate": 1.4669637184989696e-05, + "loss": 1.3212, + "step": 449 + }, + { + "epoch": 0.3658164820648308, + "grad_norm": 2.50697922706604, + "learning_rate": 1.4646334881158704e-05, + "loss": 1.3195, + "step": 450 + }, + { + "epoch": 0.3666294075805304, + "grad_norm": 3.3721578121185303, + "learning_rate": 1.4623000357062184e-05, + "loss": 1.2747, + "step": 451 + }, + { + "epoch": 0.36744233309623003, + "grad_norm": 2.429243803024292, + "learning_rate": 1.459963377451468e-05, + "loss": 1.3122, + "step": 452 + }, + { + "epoch": 0.3682552586119297, + "grad_norm": 4.240250587463379, + "learning_rate": 1.457623529555305e-05, + "loss": 1.3447, + "step": 453 + }, + { + "epoch": 0.3690681841276293, + "grad_norm": 2.631667137145996, + "learning_rate": 1.4552805082435333e-05, + "loss": 1.3171, + "step": 454 + }, + { + "epoch": 0.36988110964332893, + "grad_norm": 2.906388521194458, + "learning_rate": 1.4529343297639638e-05, + "loss": 1.3193, + "step": 455 + }, + { + "epoch": 0.37069403515902855, + "grad_norm": 3.047884464263916, + "learning_rate": 1.4505850103863007e-05, + "loss": 1.3181, + "step": 456 + }, + { + "epoch": 0.37150696067472816, + "grad_norm": 2.3922433853149414, + "learning_rate": 1.448232566402028e-05, + "loss": 1.3203, + "step": 457 + }, + { + "epoch": 0.37231988619042783, + "grad_norm": 3.278813123703003, + "learning_rate": 1.4458770141242992e-05, + "loss": 1.3309, + "step": 458 + }, + { + "epoch": 0.37313281170612744, + "grad_norm": 2.7148866653442383, + "learning_rate": 1.4435183698878212e-05, + "loss": 1.3408, + "step": 459 + }, + { + "epoch": 0.37394573722182706, + "grad_norm": 2.913823366165161, + "learning_rate": 1.4411566500487425e-05, + "loss": 1.3426, + "step": 460 + }, + { + "epoch": 0.37475866273752667, + "grad_norm": 2.435643196105957, + "learning_rate": 1.4387918709845395e-05, + "loss": 1.3357, + "step": 461 + }, + { + "epoch": 0.3755715882532263, + "grad_norm": 2.6099560260772705, + "learning_rate": 1.4364240490939032e-05, + "loss": 1.3013, + "step": 462 + }, + { + "epoch": 0.3763845137689259, + "grad_norm": 2.7896599769592285, + "learning_rate": 1.4340532007966252e-05, + "loss": 1.3284, + "step": 463 + }, + { + "epoch": 0.37719743928462557, + "grad_norm": 2.857205867767334, + "learning_rate": 1.4316793425334836e-05, + "loss": 1.2926, + "step": 464 + }, + { + "epoch": 0.3780103648003252, + "grad_norm": 2.4580750465393066, + "learning_rate": 1.4293024907661295e-05, + "loss": 1.3926, + "step": 465 + }, + { + "epoch": 0.3788232903160248, + "grad_norm": 2.6340065002441406, + "learning_rate": 1.4269226619769727e-05, + "loss": 1.3315, + "step": 466 + }, + { + "epoch": 0.3796362158317244, + "grad_norm": 3.416398525238037, + "learning_rate": 1.424539872669067e-05, + "loss": 1.2822, + "step": 467 + }, + { + "epoch": 0.380449141347424, + "grad_norm": 2.4222054481506348, + "learning_rate": 1.4221541393659966e-05, + "loss": 1.2894, + "step": 468 + }, + { + "epoch": 0.38126206686312364, + "grad_norm": 2.797074794769287, + "learning_rate": 1.4197654786117604e-05, + "loss": 1.3519, + "step": 469 + }, + { + "epoch": 0.3820749923788233, + "grad_norm": 2.563831329345703, + "learning_rate": 1.4173739069706586e-05, + "loss": 1.3474, + "step": 470 + }, + { + "epoch": 0.3828879178945229, + "grad_norm": 2.4004971981048584, + "learning_rate": 1.414979441027176e-05, + "loss": 1.3007, + "step": 471 + }, + { + "epoch": 0.38370084341022254, + "grad_norm": 2.532390594482422, + "learning_rate": 1.4125820973858693e-05, + "loss": 1.2613, + "step": 472 + }, + { + "epoch": 0.38451376892592215, + "grad_norm": 2.5733683109283447, + "learning_rate": 1.41018189267125e-05, + "loss": 1.3212, + "step": 473 + }, + { + "epoch": 0.38532669444162176, + "grad_norm": 2.710106134414673, + "learning_rate": 1.4077788435276701e-05, + "loss": 1.3235, + "step": 474 + }, + { + "epoch": 0.38613961995732143, + "grad_norm": 2.996795892715454, + "learning_rate": 1.4053729666192067e-05, + "loss": 1.3722, + "step": 475 + }, + { + "epoch": 0.38695254547302105, + "grad_norm": 2.4392545223236084, + "learning_rate": 1.4029642786295452e-05, + "loss": 1.3706, + "step": 476 + }, + { + "epoch": 0.38776547098872066, + "grad_norm": 2.6843369007110596, + "learning_rate": 1.400552796261866e-05, + "loss": 1.3382, + "step": 477 + }, + { + "epoch": 0.3885783965044203, + "grad_norm": 2.405515193939209, + "learning_rate": 1.3981385362387268e-05, + "loss": 1.316, + "step": 478 + }, + { + "epoch": 0.3893913220201199, + "grad_norm": 2.425203800201416, + "learning_rate": 1.3957215153019463e-05, + "loss": 1.3578, + "step": 479 + }, + { + "epoch": 0.3902042475358195, + "grad_norm": 2.5134634971618652, + "learning_rate": 1.3933017502124897e-05, + "loss": 1.3531, + "step": 480 + }, + { + "epoch": 0.3910171730515192, + "grad_norm": 2.4274141788482666, + "learning_rate": 1.3908792577503514e-05, + "loss": 1.3705, + "step": 481 + }, + { + "epoch": 0.3918300985672188, + "grad_norm": 2.881443500518799, + "learning_rate": 1.3884540547144393e-05, + "loss": 1.3196, + "step": 482 + }, + { + "epoch": 0.3926430240829184, + "grad_norm": 2.5505170822143555, + "learning_rate": 1.3860261579224574e-05, + "loss": 1.3221, + "step": 483 + }, + { + "epoch": 0.393455949598618, + "grad_norm": 2.5604939460754395, + "learning_rate": 1.3835955842107897e-05, + "loss": 1.2565, + "step": 484 + }, + { + "epoch": 0.39426887511431763, + "grad_norm": 2.8203351497650146, + "learning_rate": 1.3811623504343845e-05, + "loss": 1.323, + "step": 485 + }, + { + "epoch": 0.3950818006300173, + "grad_norm": 3.9116978645324707, + "learning_rate": 1.378726473466635e-05, + "loss": 1.3188, + "step": 486 + }, + { + "epoch": 0.3958947261457169, + "grad_norm": 2.918548822402954, + "learning_rate": 1.3762879701992642e-05, + "loss": 1.337, + "step": 487 + }, + { + "epoch": 0.3967076516614165, + "grad_norm": 3.048039674758911, + "learning_rate": 1.373846857542208e-05, + "loss": 1.3379, + "step": 488 + }, + { + "epoch": 0.39752057717711614, + "grad_norm": 2.6825406551361084, + "learning_rate": 1.3714031524234965e-05, + "loss": 1.3096, + "step": 489 + }, + { + "epoch": 0.39833350269281576, + "grad_norm": 2.5955066680908203, + "learning_rate": 1.3689568717891381e-05, + "loss": 1.2947, + "step": 490 + }, + { + "epoch": 0.39914642820851537, + "grad_norm": 2.5204849243164062, + "learning_rate": 1.3665080326029997e-05, + "loss": 1.2852, + "step": 491 + }, + { + "epoch": 0.39995935372421504, + "grad_norm": 3.158151865005493, + "learning_rate": 1.364056651846693e-05, + "loss": 1.3323, + "step": 492 + }, + { + "epoch": 0.40077227923991465, + "grad_norm": 2.787951946258545, + "learning_rate": 1.3616027465194525e-05, + "loss": 1.325, + "step": 493 + }, + { + "epoch": 0.40158520475561427, + "grad_norm": 3.462423324584961, + "learning_rate": 1.35914633363802e-05, + "loss": 1.2689, + "step": 494 + }, + { + "epoch": 0.4023981302713139, + "grad_norm": 3.3612263202667236, + "learning_rate": 1.356687430236526e-05, + "loss": 1.2846, + "step": 495 + }, + { + "epoch": 0.4032110557870135, + "grad_norm": 2.521135091781616, + "learning_rate": 1.3542260533663723e-05, + "loss": 1.2845, + "step": 496 + }, + { + "epoch": 0.40402398130271316, + "grad_norm": 2.702359914779663, + "learning_rate": 1.351762220096112e-05, + "loss": 1.2982, + "step": 497 + }, + { + "epoch": 0.4048369068184128, + "grad_norm": 2.928270101547241, + "learning_rate": 1.3492959475113332e-05, + "loss": 1.2878, + "step": 498 + }, + { + "epoch": 0.4056498323341124, + "grad_norm": 2.491701126098633, + "learning_rate": 1.3468272527145388e-05, + "loss": 1.2913, + "step": 499 + }, + { + "epoch": 0.406462757849812, + "grad_norm": 2.8777735233306885, + "learning_rate": 1.3443561528250295e-05, + "loss": 1.328, + "step": 500 + }, + { + "epoch": 0.4072756833655116, + "grad_norm": 3.4918212890625, + "learning_rate": 1.3418826649787834e-05, + "loss": 1.3415, + "step": 501 + }, + { + "epoch": 0.40808860888121123, + "grad_norm": 2.6940505504608154, + "learning_rate": 1.3394068063283387e-05, + "loss": 1.3017, + "step": 502 + }, + { + "epoch": 0.4089015343969109, + "grad_norm": 3.9722023010253906, + "learning_rate": 1.3369285940426737e-05, + "loss": 1.3161, + "step": 503 + }, + { + "epoch": 0.4097144599126105, + "grad_norm": 2.6105010509490967, + "learning_rate": 1.334448045307088e-05, + "loss": 1.2853, + "step": 504 + }, + { + "epoch": 0.41052738542831013, + "grad_norm": 4.094304084777832, + "learning_rate": 1.331965177323084e-05, + "loss": 1.3059, + "step": 505 + }, + { + "epoch": 0.41134031094400975, + "grad_norm": 2.5570600032806396, + "learning_rate": 1.3294800073082464e-05, + "loss": 1.2957, + "step": 506 + }, + { + "epoch": 0.41215323645970936, + "grad_norm": 2.60870099067688, + "learning_rate": 1.3269925524961237e-05, + "loss": 1.2887, + "step": 507 + }, + { + "epoch": 0.41296616197540903, + "grad_norm": 2.3958325386047363, + "learning_rate": 1.3245028301361086e-05, + "loss": 1.3207, + "step": 508 + }, + { + "epoch": 0.41377908749110864, + "grad_norm": 2.718470811843872, + "learning_rate": 1.3220108574933185e-05, + "loss": 1.2884, + "step": 509 + }, + { + "epoch": 0.41459201300680826, + "grad_norm": 2.9990408420562744, + "learning_rate": 1.3195166518484748e-05, + "loss": 1.3104, + "step": 510 + }, + { + "epoch": 0.41540493852250787, + "grad_norm": 3.256333589553833, + "learning_rate": 1.317020230497784e-05, + "loss": 1.2586, + "step": 511 + }, + { + "epoch": 0.4162178640382075, + "grad_norm": 3.0497708320617676, + "learning_rate": 1.3145216107528178e-05, + "loss": 1.2946, + "step": 512 + }, + { + "epoch": 0.4170307895539071, + "grad_norm": 2.6696412563323975, + "learning_rate": 1.3120208099403926e-05, + "loss": 1.3413, + "step": 513 + }, + { + "epoch": 0.41784371506960677, + "grad_norm": 2.592937469482422, + "learning_rate": 1.3095178454024496e-05, + "loss": 1.2827, + "step": 514 + }, + { + "epoch": 0.4186566405853064, + "grad_norm": 2.450669288635254, + "learning_rate": 1.3070127344959348e-05, + "loss": 1.2505, + "step": 515 + }, + { + "epoch": 0.419469566101006, + "grad_norm": 4.529777526855469, + "learning_rate": 1.3045054945926775e-05, + "loss": 1.3001, + "step": 516 + }, + { + "epoch": 0.4202824916167056, + "grad_norm": 3.2491648197174072, + "learning_rate": 1.3019961430792711e-05, + "loss": 1.2932, + "step": 517 + }, + { + "epoch": 0.4210954171324052, + "grad_norm": 3.3505818843841553, + "learning_rate": 1.2994846973569524e-05, + "loss": 1.3516, + "step": 518 + }, + { + "epoch": 0.4219083426481049, + "grad_norm": 3.5476715564727783, + "learning_rate": 1.2969711748414804e-05, + "loss": 1.2834, + "step": 519 + }, + { + "epoch": 0.4227212681638045, + "grad_norm": 2.738903522491455, + "learning_rate": 1.2944555929630152e-05, + "loss": 1.2978, + "step": 520 + }, + { + "epoch": 0.4235341936795041, + "grad_norm": 2.5854766368865967, + "learning_rate": 1.2919379691659979e-05, + "loss": 1.293, + "step": 521 + }, + { + "epoch": 0.42434711919520374, + "grad_norm": 3.76955246925354, + "learning_rate": 1.2894183209090304e-05, + "loss": 1.2517, + "step": 522 + }, + { + "epoch": 0.42516004471090335, + "grad_norm": 2.566361904144287, + "learning_rate": 1.2868966656647522e-05, + "loss": 1.3295, + "step": 523 + }, + { + "epoch": 0.42597297022660296, + "grad_norm": 2.7477164268493652, + "learning_rate": 1.2843730209197203e-05, + "loss": 1.3067, + "step": 524 + }, + { + "epoch": 0.42678589574230263, + "grad_norm": 3.0560967922210693, + "learning_rate": 1.2818474041742885e-05, + "loss": 1.2951, + "step": 525 + }, + { + "epoch": 0.42759882125800225, + "grad_norm": 2.9634625911712646, + "learning_rate": 1.2793198329424858e-05, + "loss": 1.268, + "step": 526 + }, + { + "epoch": 0.42841174677370186, + "grad_norm": 2.8108301162719727, + "learning_rate": 1.2767903247518945e-05, + "loss": 1.3319, + "step": 527 + }, + { + "epoch": 0.4292246722894015, + "grad_norm": 3.85799241065979, + "learning_rate": 1.2742588971435276e-05, + "loss": 1.3764, + "step": 528 + }, + { + "epoch": 0.4300375978051011, + "grad_norm": 2.564434766769409, + "learning_rate": 1.2717255676717106e-05, + "loss": 1.2854, + "step": 529 + }, + { + "epoch": 0.4308505233208007, + "grad_norm": 5.098544597625732, + "learning_rate": 1.2691903539039563e-05, + "loss": 1.3143, + "step": 530 + }, + { + "epoch": 0.4316634488365004, + "grad_norm": 7.195343017578125, + "learning_rate": 1.2666532734208437e-05, + "loss": 1.3026, + "step": 531 + }, + { + "epoch": 0.4324763743522, + "grad_norm": 2.743298053741455, + "learning_rate": 1.264114343815898e-05, + "loss": 1.3124, + "step": 532 + }, + { + "epoch": 0.4332892998678996, + "grad_norm": 3.183859348297119, + "learning_rate": 1.2615735826954664e-05, + "loss": 1.3132, + "step": 533 + }, + { + "epoch": 0.4341022253835992, + "grad_norm": 7.095142364501953, + "learning_rate": 1.2590310076785974e-05, + "loss": 1.2599, + "step": 534 + }, + { + "epoch": 0.43491515089929883, + "grad_norm": 2.91894268989563, + "learning_rate": 1.256486636396917e-05, + "loss": 1.3251, + "step": 535 + }, + { + "epoch": 0.4357280764149985, + "grad_norm": 2.931509494781494, + "learning_rate": 1.2539404864945087e-05, + "loss": 1.3347, + "step": 536 + }, + { + "epoch": 0.4365410019306981, + "grad_norm": 2.4552268981933594, + "learning_rate": 1.2513925756277894e-05, + "loss": 1.3469, + "step": 537 + }, + { + "epoch": 0.43735392744639773, + "grad_norm": 2.846196174621582, + "learning_rate": 1.2488429214653871e-05, + "loss": 1.2654, + "step": 538 + }, + { + "epoch": 0.43816685296209734, + "grad_norm": 3.494403600692749, + "learning_rate": 1.24629154168802e-05, + "loss": 1.2688, + "step": 539 + }, + { + "epoch": 0.43897977847779696, + "grad_norm": 3.00067138671875, + "learning_rate": 1.2437384539883715e-05, + "loss": 1.2865, + "step": 540 + }, + { + "epoch": 0.43979270399349657, + "grad_norm": 3.0412096977233887, + "learning_rate": 1.2411836760709686e-05, + "loss": 1.269, + "step": 541 + }, + { + "epoch": 0.44060562950919624, + "grad_norm": 2.3580715656280518, + "learning_rate": 1.2386272256520606e-05, + "loss": 1.2752, + "step": 542 + }, + { + "epoch": 0.44141855502489585, + "grad_norm": 9.030720710754395, + "learning_rate": 1.2360691204594937e-05, + "loss": 1.3074, + "step": 543 + }, + { + "epoch": 0.44223148054059547, + "grad_norm": 3.970172882080078, + "learning_rate": 1.2335093782325889e-05, + "loss": 1.3117, + "step": 544 + }, + { + "epoch": 0.4430444060562951, + "grad_norm": 2.8179943561553955, + "learning_rate": 1.2309480167220203e-05, + "loss": 1.3196, + "step": 545 + }, + { + "epoch": 0.4438573315719947, + "grad_norm": 2.9376232624053955, + "learning_rate": 1.2283850536896907e-05, + "loss": 1.2614, + "step": 546 + }, + { + "epoch": 0.44467025708769436, + "grad_norm": 2.811709403991699, + "learning_rate": 1.2258205069086082e-05, + "loss": 1.2666, + "step": 547 + }, + { + "epoch": 0.445483182603394, + "grad_norm": 3.060638427734375, + "learning_rate": 1.2232543941627641e-05, + "loss": 1.2891, + "step": 548 + }, + { + "epoch": 0.4462961081190936, + "grad_norm": 2.581530809402466, + "learning_rate": 1.2206867332470091e-05, + "loss": 1.2875, + "step": 549 + }, + { + "epoch": 0.4471090336347932, + "grad_norm": 2.588129997253418, + "learning_rate": 1.2181175419669293e-05, + "loss": 1.2964, + "step": 550 + }, + { + "epoch": 0.4479219591504928, + "grad_norm": 3.0943429470062256, + "learning_rate": 1.215546838138723e-05, + "loss": 1.29, + "step": 551 + }, + { + "epoch": 0.44873488466619244, + "grad_norm": 2.960190534591675, + "learning_rate": 1.212974639589078e-05, + "loss": 1.2812, + "step": 552 + }, + { + "epoch": 0.4495478101818921, + "grad_norm": 2.7364282608032227, + "learning_rate": 1.2104009641550472e-05, + "loss": 1.2783, + "step": 553 + }, + { + "epoch": 0.4503607356975917, + "grad_norm": 2.509277105331421, + "learning_rate": 1.2078258296839245e-05, + "loss": 1.2859, + "step": 554 + }, + { + "epoch": 0.45117366121329133, + "grad_norm": 2.769371747970581, + "learning_rate": 1.2052492540331218e-05, + "loss": 1.2866, + "step": 555 + }, + { + "epoch": 0.45198658672899095, + "grad_norm": 3.057968854904175, + "learning_rate": 1.2026712550700457e-05, + "loss": 1.3051, + "step": 556 + }, + { + "epoch": 0.45279951224469056, + "grad_norm": 3.4182374477386475, + "learning_rate": 1.200091850671972e-05, + "loss": 1.3266, + "step": 557 + }, + { + "epoch": 0.45361243776039023, + "grad_norm": 2.6871426105499268, + "learning_rate": 1.1975110587259222e-05, + "loss": 1.2596, + "step": 558 + }, + { + "epoch": 0.45442536327608984, + "grad_norm": 3.463675022125244, + "learning_rate": 1.1949288971285411e-05, + "loss": 1.2767, + "step": 559 + }, + { + "epoch": 0.45523828879178946, + "grad_norm": 2.8260090351104736, + "learning_rate": 1.1923453837859706e-05, + "loss": 1.2734, + "step": 560 + }, + { + "epoch": 0.4560512143074891, + "grad_norm": 2.6161341667175293, + "learning_rate": 1.1897605366137264e-05, + "loss": 1.2377, + "step": 561 + }, + { + "epoch": 0.4568641398231887, + "grad_norm": 2.847534418106079, + "learning_rate": 1.1871743735365735e-05, + "loss": 1.3128, + "step": 562 + }, + { + "epoch": 0.4576770653388883, + "grad_norm": 3.116063117980957, + "learning_rate": 1.1845869124884027e-05, + "loss": 1.3114, + "step": 563 + }, + { + "epoch": 0.45848999085458797, + "grad_norm": 3.2849061489105225, + "learning_rate": 1.1819981714121054e-05, + "loss": 1.2761, + "step": 564 + }, + { + "epoch": 0.4593029163702876, + "grad_norm": 2.484531879425049, + "learning_rate": 1.1794081682594491e-05, + "loss": 1.2978, + "step": 565 + }, + { + "epoch": 0.4601158418859872, + "grad_norm": 3.111940383911133, + "learning_rate": 1.176816920990954e-05, + "loss": 1.2928, + "step": 566 + }, + { + "epoch": 0.4609287674016868, + "grad_norm": 3.063422918319702, + "learning_rate": 1.174224447575767e-05, + "loss": 1.3137, + "step": 567 + }, + { + "epoch": 0.4617416929173864, + "grad_norm": 4.031757831573486, + "learning_rate": 1.171630765991538e-05, + "loss": 1.2986, + "step": 568 + }, + { + "epoch": 0.4625546184330861, + "grad_norm": 2.650336980819702, + "learning_rate": 1.169035894224295e-05, + "loss": 1.3328, + "step": 569 + }, + { + "epoch": 0.4633675439487857, + "grad_norm": 2.574526309967041, + "learning_rate": 1.1664398502683194e-05, + "loss": 1.3078, + "step": 570 + }, + { + "epoch": 0.4641804694644853, + "grad_norm": 2.3674449920654297, + "learning_rate": 1.1638426521260211e-05, + "loss": 1.2819, + "step": 571 + }, + { + "epoch": 0.46499339498018494, + "grad_norm": 2.8870980739593506, + "learning_rate": 1.1612443178078138e-05, + "loss": 1.2661, + "step": 572 + }, + { + "epoch": 0.46580632049588455, + "grad_norm": 2.4961047172546387, + "learning_rate": 1.1586448653319908e-05, + "loss": 1.3042, + "step": 573 + }, + { + "epoch": 0.46661924601158417, + "grad_norm": 2.6196508407592773, + "learning_rate": 1.156044312724598e-05, + "loss": 1.2306, + "step": 574 + }, + { + "epoch": 0.46743217152728384, + "grad_norm": 2.7249913215637207, + "learning_rate": 1.153442678019311e-05, + "loss": 1.3095, + "step": 575 + }, + { + "epoch": 0.46824509704298345, + "grad_norm": 2.9108643531799316, + "learning_rate": 1.1508399792573095e-05, + "loss": 1.2513, + "step": 576 + }, + { + "epoch": 0.46905802255868306, + "grad_norm": 2.7690494060516357, + "learning_rate": 1.1482362344871514e-05, + "loss": 1.3445, + "step": 577 + }, + { + "epoch": 0.4698709480743827, + "grad_norm": 3.629122734069824, + "learning_rate": 1.1456314617646482e-05, + "loss": 1.2616, + "step": 578 + }, + { + "epoch": 0.4706838735900823, + "grad_norm": 2.6831417083740234, + "learning_rate": 1.1430256791527406e-05, + "loss": 1.2786, + "step": 579 + }, + { + "epoch": 0.4714967991057819, + "grad_norm": 2.5316171646118164, + "learning_rate": 1.1404189047213716e-05, + "loss": 1.3195, + "step": 580 + }, + { + "epoch": 0.4723097246214816, + "grad_norm": 4.602120399475098, + "learning_rate": 1.137811156547362e-05, + "loss": 1.2378, + "step": 581 + }, + { + "epoch": 0.4731226501371812, + "grad_norm": 2.5073766708374023, + "learning_rate": 1.1352024527142855e-05, + "loss": 1.2426, + "step": 582 + }, + { + "epoch": 0.4739355756528808, + "grad_norm": 2.5561444759368896, + "learning_rate": 1.1325928113123431e-05, + "loss": 1.318, + "step": 583 + }, + { + "epoch": 0.4747485011685804, + "grad_norm": 2.8386447429656982, + "learning_rate": 1.129982250438237e-05, + "loss": 1.2529, + "step": 584 + }, + { + "epoch": 0.47556142668428003, + "grad_norm": 2.3654778003692627, + "learning_rate": 1.1273707881950445e-05, + "loss": 1.2822, + "step": 585 + }, + { + "epoch": 0.4763743521999797, + "grad_norm": 3.125446081161499, + "learning_rate": 1.1247584426920962e-05, + "loss": 1.3588, + "step": 586 + }, + { + "epoch": 0.4771872777156793, + "grad_norm": 3.600827217102051, + "learning_rate": 1.1221452320448449e-05, + "loss": 1.3023, + "step": 587 + }, + { + "epoch": 0.47800020323137893, + "grad_norm": 3.858783483505249, + "learning_rate": 1.1195311743747445e-05, + "loss": 1.2784, + "step": 588 + }, + { + "epoch": 0.47881312874707854, + "grad_norm": 2.841679334640503, + "learning_rate": 1.116916287809122e-05, + "loss": 1.3084, + "step": 589 + }, + { + "epoch": 0.47962605426277816, + "grad_norm": 2.9722323417663574, + "learning_rate": 1.1143005904810527e-05, + "loss": 1.2983, + "step": 590 + }, + { + "epoch": 0.48043897977847777, + "grad_norm": 2.560037136077881, + "learning_rate": 1.1116841005292339e-05, + "loss": 1.3175, + "step": 591 + }, + { + "epoch": 0.48125190529417744, + "grad_norm": 3.1770455837249756, + "learning_rate": 1.1090668360978589e-05, + "loss": 1.2603, + "step": 592 + }, + { + "epoch": 0.48206483080987705, + "grad_norm": 2.4485607147216797, + "learning_rate": 1.106448815336493e-05, + "loss": 1.2792, + "step": 593 + }, + { + "epoch": 0.48287775632557667, + "grad_norm": 3.7001748085021973, + "learning_rate": 1.1038300563999455e-05, + "loss": 1.2846, + "step": 594 + }, + { + "epoch": 0.4836906818412763, + "grad_norm": 2.6942710876464844, + "learning_rate": 1.1012105774481446e-05, + "loss": 1.2864, + "step": 595 + }, + { + "epoch": 0.4845036073569759, + "grad_norm": 2.5104377269744873, + "learning_rate": 1.0985903966460115e-05, + "loss": 1.256, + "step": 596 + }, + { + "epoch": 0.48531653287267557, + "grad_norm": 2.4864704608917236, + "learning_rate": 1.0959695321633346e-05, + "loss": 1.2838, + "step": 597 + }, + { + "epoch": 0.4861294583883752, + "grad_norm": 3.2645606994628906, + "learning_rate": 1.0933480021746432e-05, + "loss": 1.2966, + "step": 598 + }, + { + "epoch": 0.4869423839040748, + "grad_norm": 28.041383743286133, + "learning_rate": 1.0907258248590816e-05, + "loss": 1.2513, + "step": 599 + }, + { + "epoch": 0.4877553094197744, + "grad_norm": 2.736785888671875, + "learning_rate": 1.0881030184002827e-05, + "loss": 1.3217, + "step": 600 + }, + { + "epoch": 0.488568234935474, + "grad_norm": 4.294330596923828, + "learning_rate": 1.0854796009862434e-05, + "loss": 1.3007, + "step": 601 + }, + { + "epoch": 0.48938116045117364, + "grad_norm": 2.629371404647827, + "learning_rate": 1.0828555908091958e-05, + "loss": 1.2884, + "step": 602 + }, + { + "epoch": 0.4901940859668733, + "grad_norm": 3.166304588317871, + "learning_rate": 1.0802310060654832e-05, + "loss": 1.3127, + "step": 603 + }, + { + "epoch": 0.4910070114825729, + "grad_norm": 2.5344200134277344, + "learning_rate": 1.0776058649554336e-05, + "loss": 1.249, + "step": 604 + }, + { + "epoch": 0.49181993699827253, + "grad_norm": 3.2902913093566895, + "learning_rate": 1.0749801856832325e-05, + "loss": 1.2341, + "step": 605 + }, + { + "epoch": 0.49263286251397215, + "grad_norm": 2.5863964557647705, + "learning_rate": 1.0723539864567983e-05, + "loss": 1.3534, + "step": 606 + }, + { + "epoch": 0.49344578802967176, + "grad_norm": 3.1407294273376465, + "learning_rate": 1.0697272854876537e-05, + "loss": 1.2452, + "step": 607 + }, + { + "epoch": 0.49425871354537143, + "grad_norm": 2.339702844619751, + "learning_rate": 1.0671001009908015e-05, + "loss": 1.2597, + "step": 608 + }, + { + "epoch": 0.49507163906107104, + "grad_norm": 2.5861027240753174, + "learning_rate": 1.0644724511845976e-05, + "loss": 1.304, + "step": 609 + }, + { + "epoch": 0.49588456457677066, + "grad_norm": 2.6124143600463867, + "learning_rate": 1.0618443542906251e-05, + "loss": 1.2333, + "step": 610 + }, + { + "epoch": 0.4966974900924703, + "grad_norm": 2.53468918800354, + "learning_rate": 1.059215828533566e-05, + "loss": 1.2587, + "step": 611 + }, + { + "epoch": 0.4975104156081699, + "grad_norm": 5.205654621124268, + "learning_rate": 1.0565868921410776e-05, + "loss": 1.2758, + "step": 612 + }, + { + "epoch": 0.4983233411238695, + "grad_norm": 3.3307433128356934, + "learning_rate": 1.0539575633436645e-05, + "loss": 1.3197, + "step": 613 + }, + { + "epoch": 0.49913626663956917, + "grad_norm": 2.4654664993286133, + "learning_rate": 1.0513278603745523e-05, + "loss": 1.2733, + "step": 614 + }, + { + "epoch": 0.4999491921552688, + "grad_norm": 2.5150272846221924, + "learning_rate": 1.0486978014695606e-05, + "loss": 1.2841, + "step": 615 + }, + { + "epoch": 0.5007621176709683, + "grad_norm": 2.660186767578125, + "learning_rate": 1.0460674048669783e-05, + "loss": 1.3007, + "step": 616 + }, + { + "epoch": 0.501575043186668, + "grad_norm": 2.7415716648101807, + "learning_rate": 1.0434366888074363e-05, + "loss": 1.2974, + "step": 617 + }, + { + "epoch": 0.5023879687023677, + "grad_norm": 2.479142427444458, + "learning_rate": 1.0408056715337797e-05, + "loss": 1.301, + "step": 618 + }, + { + "epoch": 0.5032008942180672, + "grad_norm": 2.4590210914611816, + "learning_rate": 1.0381743712909424e-05, + "loss": 1.2253, + "step": 619 + }, + { + "epoch": 0.5040138197337669, + "grad_norm": 2.4704954624176025, + "learning_rate": 1.0355428063258224e-05, + "loss": 1.1927, + "step": 620 + }, + { + "epoch": 0.5048267452494665, + "grad_norm": 3.5037641525268555, + "learning_rate": 1.0329109948871512e-05, + "loss": 1.2727, + "step": 621 + }, + { + "epoch": 0.5056396707651661, + "grad_norm": 2.6537327766418457, + "learning_rate": 1.0302789552253702e-05, + "loss": 1.2295, + "step": 622 + }, + { + "epoch": 0.5064525962808658, + "grad_norm": 3.4443886280059814, + "learning_rate": 1.0276467055925044e-05, + "loss": 1.2403, + "step": 623 + }, + { + "epoch": 0.5072655217965654, + "grad_norm": 4.377493858337402, + "learning_rate": 1.0250142642420335e-05, + "loss": 1.2667, + "step": 624 + }, + { + "epoch": 0.508078447312265, + "grad_norm": 2.712472677230835, + "learning_rate": 1.0223816494287675e-05, + "loss": 1.3323, + "step": 625 + }, + { + "epoch": 0.5088913728279646, + "grad_norm": 2.922093152999878, + "learning_rate": 1.0197488794087188e-05, + "loss": 1.2713, + "step": 626 + }, + { + "epoch": 0.5097042983436643, + "grad_norm": 11.951809883117676, + "learning_rate": 1.0171159724389766e-05, + "loss": 1.2997, + "step": 627 + }, + { + "epoch": 0.5105172238593639, + "grad_norm": 2.5700554847717285, + "learning_rate": 1.0144829467775794e-05, + "loss": 1.261, + "step": 628 + }, + { + "epoch": 0.5113301493750635, + "grad_norm": 2.6800413131713867, + "learning_rate": 1.0118498206833886e-05, + "loss": 1.3292, + "step": 629 + }, + { + "epoch": 0.5121430748907632, + "grad_norm": 4.24453592300415, + "learning_rate": 1.0092166124159628e-05, + "loss": 1.3281, + "step": 630 + }, + { + "epoch": 0.5129560004064627, + "grad_norm": 2.7513749599456787, + "learning_rate": 1.0065833402354302e-05, + "loss": 1.2944, + "step": 631 + }, + { + "epoch": 0.5137689259221624, + "grad_norm": 2.610588788986206, + "learning_rate": 1.003950022402361e-05, + "loss": 1.3129, + "step": 632 + }, + { + "epoch": 0.5145818514378621, + "grad_norm": 2.949564218521118, + "learning_rate": 1.0013166771776441e-05, + "loss": 1.2961, + "step": 633 + }, + { + "epoch": 0.5153947769535616, + "grad_norm": 2.5617198944091797, + "learning_rate": 9.986833228223562e-06, + "loss": 1.2898, + "step": 634 + }, + { + "epoch": 0.5162077024692613, + "grad_norm": 2.779733896255493, + "learning_rate": 9.96049977597639e-06, + "loss": 1.2988, + "step": 635 + }, + { + "epoch": 0.5170206279849608, + "grad_norm": 2.8505136966705322, + "learning_rate": 9.934166597645703e-06, + "loss": 1.2652, + "step": 636 + }, + { + "epoch": 0.5178335535006605, + "grad_norm": 2.847262144088745, + "learning_rate": 9.907833875840374e-06, + "loss": 1.3076, + "step": 637 + }, + { + "epoch": 0.5186464790163601, + "grad_norm": 4.957255840301514, + "learning_rate": 9.881501793166117e-06, + "loss": 1.214, + "step": 638 + }, + { + "epoch": 0.5194594045320597, + "grad_norm": 2.7829556465148926, + "learning_rate": 9.85517053222421e-06, + "loss": 1.2379, + "step": 639 + }, + { + "epoch": 0.5202723300477594, + "grad_norm": 2.7060935497283936, + "learning_rate": 9.82884027561024e-06, + "loss": 1.3016, + "step": 640 + }, + { + "epoch": 0.521085255563459, + "grad_norm": 6.336554527282715, + "learning_rate": 9.802511205912815e-06, + "loss": 1.269, + "step": 641 + }, + { + "epoch": 0.5218981810791586, + "grad_norm": 3.0378448963165283, + "learning_rate": 9.776183505712327e-06, + "loss": 1.317, + "step": 642 + }, + { + "epoch": 0.5227111065948582, + "grad_norm": 5.806065082550049, + "learning_rate": 9.749857357579667e-06, + "loss": 1.3165, + "step": 643 + }, + { + "epoch": 0.5235240321105579, + "grad_norm": 2.7738869190216064, + "learning_rate": 9.723532944074961e-06, + "loss": 1.2835, + "step": 644 + }, + { + "epoch": 0.5243369576262575, + "grad_norm": 2.6603453159332275, + "learning_rate": 9.6972104477463e-06, + "loss": 1.2673, + "step": 645 + }, + { + "epoch": 0.5251498831419571, + "grad_norm": 2.9316189289093018, + "learning_rate": 9.670890051128493e-06, + "loss": 1.249, + "step": 646 + }, + { + "epoch": 0.5259628086576568, + "grad_norm": 2.8541407585144043, + "learning_rate": 9.644571936741778e-06, + "loss": 1.2835, + "step": 647 + }, + { + "epoch": 0.5267757341733563, + "grad_norm": 2.6935575008392334, + "learning_rate": 9.618256287090576e-06, + "loss": 1.2859, + "step": 648 + }, + { + "epoch": 0.527588659689056, + "grad_norm": 3.057039260864258, + "learning_rate": 9.591943284662206e-06, + "loss": 1.2538, + "step": 649 + }, + { + "epoch": 0.5284015852047557, + "grad_norm": 3.2430379390716553, + "learning_rate": 9.56563311192564e-06, + "loss": 1.294, + "step": 650 + }, + { + "epoch": 0.5292145107204552, + "grad_norm": 2.378072500228882, + "learning_rate": 9.53932595133022e-06, + "loss": 1.2793, + "step": 651 + }, + { + "epoch": 0.5300274362361549, + "grad_norm": 3.2185440063476562, + "learning_rate": 9.513021985304399e-06, + "loss": 1.2868, + "step": 652 + }, + { + "epoch": 0.5308403617518545, + "grad_norm": 3.272632122039795, + "learning_rate": 9.486721396254484e-06, + "loss": 1.2128, + "step": 653 + }, + { + "epoch": 0.5316532872675541, + "grad_norm": 3.163884401321411, + "learning_rate": 9.460424366563355e-06, + "loss": 1.2962, + "step": 654 + }, + { + "epoch": 0.5324662127832538, + "grad_norm": 3.096857786178589, + "learning_rate": 9.434131078589224e-06, + "loss": 1.2575, + "step": 655 + }, + { + "epoch": 0.5332791382989533, + "grad_norm": 2.711069107055664, + "learning_rate": 9.407841714664343e-06, + "loss": 1.2969, + "step": 656 + }, + { + "epoch": 0.534092063814653, + "grad_norm": 4.4655866622924805, + "learning_rate": 9.381556457093752e-06, + "loss": 1.2229, + "step": 657 + }, + { + "epoch": 0.5349049893303526, + "grad_norm": 2.7365305423736572, + "learning_rate": 9.355275488154025e-06, + "loss": 1.285, + "step": 658 + }, + { + "epoch": 0.5357179148460522, + "grad_norm": 3.4264895915985107, + "learning_rate": 9.32899899009199e-06, + "loss": 1.3222, + "step": 659 + }, + { + "epoch": 0.5365308403617518, + "grad_norm": 2.9572296142578125, + "learning_rate": 9.30272714512347e-06, + "loss": 1.2771, + "step": 660 + }, + { + "epoch": 0.5373437658774515, + "grad_norm": 3.124464988708496, + "learning_rate": 9.276460135432019e-06, + "loss": 1.2362, + "step": 661 + }, + { + "epoch": 0.5381566913931511, + "grad_norm": 3.484861373901367, + "learning_rate": 9.250198143167675e-06, + "loss": 1.2624, + "step": 662 + }, + { + "epoch": 0.5389696169088507, + "grad_norm": 3.191455602645874, + "learning_rate": 9.223941350445666e-06, + "loss": 1.3271, + "step": 663 + }, + { + "epoch": 0.5397825424245504, + "grad_norm": 3.055478572845459, + "learning_rate": 9.19768993934517e-06, + "loss": 1.2476, + "step": 664 + }, + { + "epoch": 0.5405954679402499, + "grad_norm": 2.8661985397338867, + "learning_rate": 9.171444091908046e-06, + "loss": 1.2575, + "step": 665 + }, + { + "epoch": 0.5414083934559496, + "grad_norm": 3.042300224304199, + "learning_rate": 9.145203990137571e-06, + "loss": 1.2472, + "step": 666 + }, + { + "epoch": 0.5422213189716493, + "grad_norm": 3.324767827987671, + "learning_rate": 9.118969815997174e-06, + "loss": 1.2608, + "step": 667 + }, + { + "epoch": 0.5430342444873488, + "grad_norm": 2.8374948501586914, + "learning_rate": 9.092741751409186e-06, + "loss": 1.2865, + "step": 668 + }, + { + "epoch": 0.5438471700030485, + "grad_norm": 3.3593552112579346, + "learning_rate": 9.06651997825357e-06, + "loss": 1.2746, + "step": 669 + }, + { + "epoch": 0.544660095518748, + "grad_norm": 3.2432382106781006, + "learning_rate": 9.040304678366658e-06, + "loss": 1.2864, + "step": 670 + }, + { + "epoch": 0.5454730210344477, + "grad_norm": 2.890409469604492, + "learning_rate": 9.014096033539889e-06, + "loss": 1.2685, + "step": 671 + }, + { + "epoch": 0.5462859465501474, + "grad_norm": 3.0769150257110596, + "learning_rate": 8.987894225518556e-06, + "loss": 1.2701, + "step": 672 + }, + { + "epoch": 0.547098872065847, + "grad_norm": 3.453287363052368, + "learning_rate": 8.961699436000548e-06, + "loss": 1.2218, + "step": 673 + }, + { + "epoch": 0.5479117975815466, + "grad_norm": 3.1950011253356934, + "learning_rate": 8.93551184663507e-06, + "loss": 1.2267, + "step": 674 + }, + { + "epoch": 0.5487247230972462, + "grad_norm": 3.445006847381592, + "learning_rate": 8.909331639021414e-06, + "loss": 1.283, + "step": 675 + }, + { + "epoch": 0.5495376486129459, + "grad_norm": 2.5453741550445557, + "learning_rate": 8.883158994707666e-06, + "loss": 1.3102, + "step": 676 + }, + { + "epoch": 0.5503505741286454, + "grad_norm": 4.167499542236328, + "learning_rate": 8.856994095189477e-06, + "loss": 1.2881, + "step": 677 + }, + { + "epoch": 0.5511634996443451, + "grad_norm": 2.6888363361358643, + "learning_rate": 8.830837121908783e-06, + "loss": 1.2332, + "step": 678 + }, + { + "epoch": 0.5519764251600447, + "grad_norm": 2.9484667778015137, + "learning_rate": 8.804688256252557e-06, + "loss": 1.2676, + "step": 679 + }, + { + "epoch": 0.5527893506757443, + "grad_norm": 2.5477519035339355, + "learning_rate": 8.778547679551555e-06, + "loss": 1.2956, + "step": 680 + }, + { + "epoch": 0.553602276191444, + "grad_norm": 2.3307385444641113, + "learning_rate": 8.75241557307904e-06, + "loss": 1.3021, + "step": 681 + }, + { + "epoch": 0.5544152017071435, + "grad_norm": 3.1104202270507812, + "learning_rate": 8.726292118049555e-06, + "loss": 1.2861, + "step": 682 + }, + { + "epoch": 0.5552281272228432, + "grad_norm": 3.2731287479400635, + "learning_rate": 8.700177495617635e-06, + "loss": 1.33, + "step": 683 + }, + { + "epoch": 0.5560410527385429, + "grad_norm": 2.923478364944458, + "learning_rate": 8.674071886876572e-06, + "loss": 1.2946, + "step": 684 + }, + { + "epoch": 0.5568539782542424, + "grad_norm": 3.1030538082122803, + "learning_rate": 8.647975472857148e-06, + "loss": 1.2481, + "step": 685 + }, + { + "epoch": 0.5576669037699421, + "grad_norm": 2.6904759407043457, + "learning_rate": 8.621888434526382e-06, + "loss": 1.2637, + "step": 686 + }, + { + "epoch": 0.5584798292856417, + "grad_norm": 3.6781442165374756, + "learning_rate": 8.595810952786289e-06, + "loss": 1.2875, + "step": 687 + }, + { + "epoch": 0.5592927548013413, + "grad_norm": 4.897818565368652, + "learning_rate": 8.569743208472594e-06, + "loss": 1.2804, + "step": 688 + }, + { + "epoch": 0.560105680317041, + "grad_norm": 2.9090828895568848, + "learning_rate": 8.543685382353518e-06, + "loss": 1.2817, + "step": 689 + }, + { + "epoch": 0.5609186058327406, + "grad_norm": 3.3284378051757812, + "learning_rate": 8.51763765512849e-06, + "loss": 1.2928, + "step": 690 + }, + { + "epoch": 0.5617315313484402, + "grad_norm": 3.440209150314331, + "learning_rate": 8.491600207426907e-06, + "loss": 1.2667, + "step": 691 + }, + { + "epoch": 0.5625444568641398, + "grad_norm": 3.1297762393951416, + "learning_rate": 8.465573219806893e-06, + "loss": 1.2752, + "step": 692 + }, + { + "epoch": 0.5633573823798395, + "grad_norm": 3.460277795791626, + "learning_rate": 8.439556872754025e-06, + "loss": 1.2611, + "step": 693 + }, + { + "epoch": 0.5641703078955391, + "grad_norm": 2.6390557289123535, + "learning_rate": 8.413551346680095e-06, + "loss": 1.2339, + "step": 694 + }, + { + "epoch": 0.5649832334112387, + "grad_norm": 2.365945339202881, + "learning_rate": 8.38755682192186e-06, + "loss": 1.2333, + "step": 695 + }, + { + "epoch": 0.5657961589269384, + "grad_norm": 3.140129804611206, + "learning_rate": 8.36157347873979e-06, + "loss": 1.2614, + "step": 696 + }, + { + "epoch": 0.5666090844426379, + "grad_norm": 4.027166366577148, + "learning_rate": 8.335601497316809e-06, + "loss": 1.263, + "step": 697 + }, + { + "epoch": 0.5674220099583376, + "grad_norm": 2.6872942447662354, + "learning_rate": 8.309641057757052e-06, + "loss": 1.2479, + "step": 698 + }, + { + "epoch": 0.5682349354740371, + "grad_norm": 2.575493574142456, + "learning_rate": 8.283692340084623e-06, + "loss": 1.2818, + "step": 699 + }, + { + "epoch": 0.5690478609897368, + "grad_norm": 2.6429176330566406, + "learning_rate": 8.257755524242333e-06, + "loss": 1.2921, + "step": 700 + }, + { + "epoch": 0.5698607865054365, + "grad_norm": 4.695654392242432, + "learning_rate": 8.231830790090461e-06, + "loss": 1.2046, + "step": 701 + }, + { + "epoch": 0.570673712021136, + "grad_norm": 2.4642715454101562, + "learning_rate": 8.205918317405508e-06, + "loss": 1.3013, + "step": 702 + }, + { + "epoch": 0.5714866375368357, + "grad_norm": 2.567474842071533, + "learning_rate": 8.18001828587895e-06, + "loss": 1.3458, + "step": 703 + }, + { + "epoch": 0.5722995630525353, + "grad_norm": 2.934668779373169, + "learning_rate": 8.154130875115978e-06, + "loss": 1.2804, + "step": 704 + }, + { + "epoch": 0.5731124885682349, + "grad_norm": 2.669285297393799, + "learning_rate": 8.12825626463427e-06, + "loss": 1.2329, + "step": 705 + }, + { + "epoch": 0.5739254140839346, + "grad_norm": 2.7390220165252686, + "learning_rate": 8.102394633862743e-06, + "loss": 1.2177, + "step": 706 + }, + { + "epoch": 0.5747383395996342, + "grad_norm": 3.19964861869812, + "learning_rate": 8.0765461621403e-06, + "loss": 1.2625, + "step": 707 + }, + { + "epoch": 0.5755512651153338, + "grad_norm": 2.753469705581665, + "learning_rate": 8.050711028714589e-06, + "loss": 1.2357, + "step": 708 + }, + { + "epoch": 0.5763641906310334, + "grad_norm": 3.3288702964782715, + "learning_rate": 8.02488941274078e-06, + "loss": 1.217, + "step": 709 + }, + { + "epoch": 0.5771771161467331, + "grad_norm": 2.808100700378418, + "learning_rate": 7.999081493280283e-06, + "loss": 1.3156, + "step": 710 + }, + { + "epoch": 0.5779900416624327, + "grad_norm": 2.8736870288848877, + "learning_rate": 7.973287449299545e-06, + "loss": 1.3122, + "step": 711 + }, + { + "epoch": 0.5788029671781323, + "grad_norm": 6.863023281097412, + "learning_rate": 7.947507459668784e-06, + "loss": 1.2218, + "step": 712 + }, + { + "epoch": 0.579615892693832, + "grad_norm": 4.454842567443848, + "learning_rate": 7.921741703160758e-06, + "loss": 1.1918, + "step": 713 + }, + { + "epoch": 0.5804288182095315, + "grad_norm": 2.4465959072113037, + "learning_rate": 7.895990358449533e-06, + "loss": 1.2705, + "step": 714 + }, + { + "epoch": 0.5812417437252312, + "grad_norm": 3.3625428676605225, + "learning_rate": 7.87025360410922e-06, + "loss": 1.2644, + "step": 715 + }, + { + "epoch": 0.5820546692409307, + "grad_norm": 2.846947431564331, + "learning_rate": 7.844531618612772e-06, + "loss": 1.2612, + "step": 716 + }, + { + "epoch": 0.5828675947566304, + "grad_norm": 3.332118034362793, + "learning_rate": 7.81882458033071e-06, + "loss": 1.2597, + "step": 717 + }, + { + "epoch": 0.5836805202723301, + "grad_norm": 2.646106719970703, + "learning_rate": 7.79313266752991e-06, + "loss": 1.2613, + "step": 718 + }, + { + "epoch": 0.5844934457880296, + "grad_norm": 2.8592135906219482, + "learning_rate": 7.767456058372362e-06, + "loss": 1.282, + "step": 719 + }, + { + "epoch": 0.5853063713037293, + "grad_norm": 2.748481035232544, + "learning_rate": 7.741794930913922e-06, + "loss": 1.2869, + "step": 720 + }, + { + "epoch": 0.5861192968194289, + "grad_norm": 2.8134074211120605, + "learning_rate": 7.7161494631031e-06, + "loss": 1.3079, + "step": 721 + }, + { + "epoch": 0.5869322223351285, + "grad_norm": 3.059119939804077, + "learning_rate": 7.690519832779799e-06, + "loss": 1.2705, + "step": 722 + }, + { + "epoch": 0.5877451478508282, + "grad_norm": 2.6439130306243896, + "learning_rate": 7.664906217674115e-06, + "loss": 1.2413, + "step": 723 + }, + { + "epoch": 0.5885580733665278, + "grad_norm": 2.812056303024292, + "learning_rate": 7.639308795405066e-06, + "loss": 1.2543, + "step": 724 + }, + { + "epoch": 0.5893709988822274, + "grad_norm": 3.2603330612182617, + "learning_rate": 7.613727743479395e-06, + "loss": 1.2442, + "step": 725 + }, + { + "epoch": 0.590183924397927, + "grad_norm": 2.544433116912842, + "learning_rate": 7.588163239290316e-06, + "loss": 1.3034, + "step": 726 + }, + { + "epoch": 0.5909968499136267, + "grad_norm": 4.0246262550354, + "learning_rate": 7.562615460116289e-06, + "loss": 1.3188, + "step": 727 + }, + { + "epoch": 0.5918097754293263, + "grad_norm": 4.249239444732666, + "learning_rate": 7.537084583119802e-06, + "loss": 1.3091, + "step": 728 + }, + { + "epoch": 0.5926227009450259, + "grad_norm": 2.7686362266540527, + "learning_rate": 7.511570785346129e-06, + "loss": 1.2449, + "step": 729 + }, + { + "epoch": 0.5934356264607256, + "grad_norm": 2.8529245853424072, + "learning_rate": 7.486074243722109e-06, + "loss": 1.2392, + "step": 730 + }, + { + "epoch": 0.5942485519764251, + "grad_norm": 3.073486328125, + "learning_rate": 7.460595135054916e-06, + "loss": 1.2848, + "step": 731 + }, + { + "epoch": 0.5950614774921248, + "grad_norm": 3.365366220474243, + "learning_rate": 7.435133636030831e-06, + "loss": 1.2912, + "step": 732 + }, + { + "epoch": 0.5958744030078245, + "grad_norm": 2.4938106536865234, + "learning_rate": 7.4096899232140295e-06, + "loss": 1.2965, + "step": 733 + }, + { + "epoch": 0.596687328523524, + "grad_norm": 2.9927473068237305, + "learning_rate": 7.384264173045339e-06, + "loss": 1.2748, + "step": 734 + }, + { + "epoch": 0.5975002540392237, + "grad_norm": 7.3427205085754395, + "learning_rate": 7.358856561841021e-06, + "loss": 1.2457, + "step": 735 + }, + { + "epoch": 0.5983131795549232, + "grad_norm": 3.274311065673828, + "learning_rate": 7.333467265791563e-06, + "loss": 1.2225, + "step": 736 + }, + { + "epoch": 0.5991261050706229, + "grad_norm": 4.503856658935547, + "learning_rate": 7.308096460960441e-06, + "loss": 1.2603, + "step": 737 + }, + { + "epoch": 0.5999390305863225, + "grad_norm": 3.6017913818359375, + "learning_rate": 7.282744323282895e-06, + "loss": 1.2278, + "step": 738 + }, + { + "epoch": 0.6007519561020221, + "grad_norm": 3.0930585861206055, + "learning_rate": 7.2574110285647244e-06, + "loss": 1.2649, + "step": 739 + }, + { + "epoch": 0.6015648816177218, + "grad_norm": 2.6793737411499023, + "learning_rate": 7.232096752481061e-06, + "loss": 1.215, + "step": 740 + }, + { + "epoch": 0.6023778071334214, + "grad_norm": 3.0066819190979004, + "learning_rate": 7.206801670575145e-06, + "loss": 1.2953, + "step": 741 + }, + { + "epoch": 0.603190732649121, + "grad_norm": 3.2586004734039307, + "learning_rate": 7.181525958257116e-06, + "loss": 1.1988, + "step": 742 + }, + { + "epoch": 0.6040036581648206, + "grad_norm": 3.186267375946045, + "learning_rate": 7.156269790802801e-06, + "loss": 1.2425, + "step": 743 + }, + { + "epoch": 0.6048165836805203, + "grad_norm": 3.919509172439575, + "learning_rate": 7.131033343352483e-06, + "loss": 1.3432, + "step": 744 + }, + { + "epoch": 0.6056295091962199, + "grad_norm": 3.8313186168670654, + "learning_rate": 7.105816790909699e-06, + "loss": 1.2491, + "step": 745 + }, + { + "epoch": 0.6064424347119195, + "grad_norm": 2.7689011096954346, + "learning_rate": 7.080620308340024e-06, + "loss": 1.2673, + "step": 746 + }, + { + "epoch": 0.6072553602276192, + "grad_norm": 4.105691909790039, + "learning_rate": 7.055444070369852e-06, + "loss": 1.2688, + "step": 747 + }, + { + "epoch": 0.6080682857433187, + "grad_norm": 3.336580276489258, + "learning_rate": 7.0302882515852025e-06, + "loss": 1.2613, + "step": 748 + }, + { + "epoch": 0.6088812112590184, + "grad_norm": 3.7272021770477295, + "learning_rate": 7.005153026430476e-06, + "loss": 1.1882, + "step": 749 + }, + { + "epoch": 0.6096941367747181, + "grad_norm": 4.220558166503906, + "learning_rate": 6.980038569207291e-06, + "loss": 1.1853, + "step": 750 + }, + { + "epoch": 0.6105070622904176, + "grad_norm": 2.8943638801574707, + "learning_rate": 6.954945054073228e-06, + "loss": 1.2408, + "step": 751 + }, + { + "epoch": 0.6113199878061173, + "grad_norm": 2.740449905395508, + "learning_rate": 6.929872655040655e-06, + "loss": 1.2233, + "step": 752 + }, + { + "epoch": 0.6121329133218169, + "grad_norm": 3.1293320655822754, + "learning_rate": 6.904821545975507e-06, + "loss": 1.2362, + "step": 753 + }, + { + "epoch": 0.6129458388375165, + "grad_norm": 2.9130334854125977, + "learning_rate": 6.879791900596077e-06, + "loss": 1.2525, + "step": 754 + }, + { + "epoch": 0.6137587643532162, + "grad_norm": 2.6800663471221924, + "learning_rate": 6.854783892471823e-06, + "loss": 1.2811, + "step": 755 + }, + { + "epoch": 0.6145716898689157, + "grad_norm": 2.7140908241271973, + "learning_rate": 6.829797695022163e-06, + "loss": 1.2693, + "step": 756 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 2.687870740890503, + "learning_rate": 6.804833481515256e-06, + "loss": 1.2124, + "step": 757 + }, + { + "epoch": 0.616197540900315, + "grad_norm": 3.170487880706787, + "learning_rate": 6.7798914250668154e-06, + "loss": 1.2373, + "step": 758 + }, + { + "epoch": 0.6170104664160146, + "grad_norm": 2.6142961978912354, + "learning_rate": 6.7549716986389146e-06, + "loss": 1.2527, + "step": 759 + }, + { + "epoch": 0.6178233919317142, + "grad_norm": 3.4092085361480713, + "learning_rate": 6.730074475038766e-06, + "loss": 1.2401, + "step": 760 + }, + { + "epoch": 0.6186363174474139, + "grad_norm": 3.256838083267212, + "learning_rate": 6.7051999269175405e-06, + "loss": 1.1863, + "step": 761 + }, + { + "epoch": 0.6194492429631135, + "grad_norm": 2.8312947750091553, + "learning_rate": 6.680348226769162e-06, + "loss": 1.241, + "step": 762 + }, + { + "epoch": 0.6202621684788131, + "grad_norm": 2.799750804901123, + "learning_rate": 6.655519546929121e-06, + "loss": 1.2601, + "step": 763 + }, + { + "epoch": 0.6210750939945128, + "grad_norm": 3.188913106918335, + "learning_rate": 6.630714059573267e-06, + "loss": 1.2719, + "step": 764 + }, + { + "epoch": 0.6218880195102123, + "grad_norm": 5.547321796417236, + "learning_rate": 6.6059319367166165e-06, + "loss": 1.2307, + "step": 765 + }, + { + "epoch": 0.622700945025912, + "grad_norm": 3.2380361557006836, + "learning_rate": 6.581173350212169e-06, + "loss": 1.2125, + "step": 766 + }, + { + "epoch": 0.6235138705416117, + "grad_norm": 2.61883282661438, + "learning_rate": 6.55643847174971e-06, + "loss": 1.2556, + "step": 767 + }, + { + "epoch": 0.6243267960573112, + "grad_norm": 3.0079920291900635, + "learning_rate": 6.531727472854617e-06, + "loss": 1.2761, + "step": 768 + }, + { + "epoch": 0.6251397215730109, + "grad_norm": 3.995910882949829, + "learning_rate": 6.507040524886672e-06, + "loss": 1.302, + "step": 769 + }, + { + "epoch": 0.6259526470887105, + "grad_norm": 2.7787578105926514, + "learning_rate": 6.482377799038882e-06, + "loss": 1.2249, + "step": 770 + }, + { + "epoch": 0.6267655726044101, + "grad_norm": 3.6458895206451416, + "learning_rate": 6.45773946633628e-06, + "loss": 1.2833, + "step": 771 + }, + { + "epoch": 0.6275784981201098, + "grad_norm": 2.9308435916900635, + "learning_rate": 6.4331256976347434e-06, + "loss": 1.309, + "step": 772 + }, + { + "epoch": 0.6283914236358094, + "grad_norm": 3.7917234897613525, + "learning_rate": 6.408536663619803e-06, + "loss": 1.2996, + "step": 773 + }, + { + "epoch": 0.629204349151509, + "grad_norm": 8.85531234741211, + "learning_rate": 6.383972534805478e-06, + "loss": 1.2499, + "step": 774 + }, + { + "epoch": 0.6300172746672086, + "grad_norm": 4.16661262512207, + "learning_rate": 6.359433481533074e-06, + "loss": 1.1928, + "step": 775 + }, + { + "epoch": 0.6308302001829083, + "grad_norm": 3.6679298877716064, + "learning_rate": 6.3349196739700024e-06, + "loss": 1.2917, + "step": 776 + }, + { + "epoch": 0.6316431256986078, + "grad_norm": 3.2031593322753906, + "learning_rate": 6.310431282108622e-06, + "loss": 1.2926, + "step": 777 + }, + { + "epoch": 0.6324560512143075, + "grad_norm": 2.7538363933563232, + "learning_rate": 6.2859684757650365e-06, + "loss": 1.2634, + "step": 778 + }, + { + "epoch": 0.6332689767300071, + "grad_norm": 3.4906575679779053, + "learning_rate": 6.261531424577923e-06, + "loss": 1.2711, + "step": 779 + }, + { + "epoch": 0.6340819022457067, + "grad_norm": 3.4287617206573486, + "learning_rate": 6.2371202980073596e-06, + "loss": 1.2412, + "step": 780 + }, + { + "epoch": 0.6348948277614064, + "grad_norm": 3.5826241970062256, + "learning_rate": 6.212735265333655e-06, + "loss": 1.1782, + "step": 781 + }, + { + "epoch": 0.6357077532771059, + "grad_norm": 3.369983673095703, + "learning_rate": 6.188376495656156e-06, + "loss": 1.2628, + "step": 782 + }, + { + "epoch": 0.6365206787928056, + "grad_norm": 3.6163413524627686, + "learning_rate": 6.164044157892102e-06, + "loss": 1.3304, + "step": 783 + }, + { + "epoch": 0.6373336043085053, + "grad_norm": 2.6903252601623535, + "learning_rate": 6.13973842077543e-06, + "loss": 1.2458, + "step": 784 + }, + { + "epoch": 0.6381465298242048, + "grad_norm": 3.919074296951294, + "learning_rate": 6.11545945285561e-06, + "loss": 1.253, + "step": 785 + }, + { + "epoch": 0.6389594553399045, + "grad_norm": 2.9155240058898926, + "learning_rate": 6.091207422496489e-06, + "loss": 1.2661, + "step": 786 + }, + { + "epoch": 0.6397723808556041, + "grad_norm": 3.2426347732543945, + "learning_rate": 6.066982497875109e-06, + "loss": 1.2556, + "step": 787 + }, + { + "epoch": 0.6405853063713037, + "grad_norm": 3.078899383544922, + "learning_rate": 6.042784846980542e-06, + "loss": 1.2572, + "step": 788 + }, + { + "epoch": 0.6413982318870034, + "grad_norm": 3.3044381141662598, + "learning_rate": 6.018614637612733e-06, + "loss": 1.2301, + "step": 789 + }, + { + "epoch": 0.642211157402703, + "grad_norm": 2.8474955558776855, + "learning_rate": 5.99447203738134e-06, + "loss": 1.2042, + "step": 790 + }, + { + "epoch": 0.6430240829184026, + "grad_norm": 2.9787845611572266, + "learning_rate": 5.9703572137045495e-06, + "loss": 1.2608, + "step": 791 + }, + { + "epoch": 0.6438370084341022, + "grad_norm": 3.380209445953369, + "learning_rate": 5.946270333807937e-06, + "loss": 1.2973, + "step": 792 + }, + { + "epoch": 0.6446499339498019, + "grad_norm": 2.81736421585083, + "learning_rate": 5.922211564723302e-06, + "loss": 1.2791, + "step": 793 + }, + { + "epoch": 0.6454628594655015, + "grad_norm": 2.9054102897644043, + "learning_rate": 5.898181073287504e-06, + "loss": 1.2692, + "step": 794 + }, + { + "epoch": 0.6462757849812011, + "grad_norm": 3.2480154037475586, + "learning_rate": 5.87417902614131e-06, + "loss": 1.311, + "step": 795 + }, + { + "epoch": 0.6470887104969008, + "grad_norm": 2.8822832107543945, + "learning_rate": 5.850205589728239e-06, + "loss": 1.2528, + "step": 796 + }, + { + "epoch": 0.6479016360126003, + "grad_norm": 2.8832008838653564, + "learning_rate": 5.826260930293417e-06, + "loss": 1.2631, + "step": 797 + }, + { + "epoch": 0.6487145615283, + "grad_norm": 3.547271490097046, + "learning_rate": 5.802345213882396e-06, + "loss": 1.2543, + "step": 798 + }, + { + "epoch": 0.6495274870439995, + "grad_norm": 9.93248176574707, + "learning_rate": 5.778458606340037e-06, + "loss": 1.3218, + "step": 799 + }, + { + "epoch": 0.6503404125596992, + "grad_norm": 4.664019584655762, + "learning_rate": 5.754601273309333e-06, + "loss": 1.2487, + "step": 800 + }, + { + "epoch": 0.6511533380753989, + "grad_norm": 3.191390037536621, + "learning_rate": 5.730773380230276e-06, + "loss": 1.1966, + "step": 801 + }, + { + "epoch": 0.6519662635910984, + "grad_norm": 3.228309392929077, + "learning_rate": 5.70697509233871e-06, + "loss": 1.2556, + "step": 802 + }, + { + "epoch": 0.6527791891067981, + "grad_norm": 3.1456098556518555, + "learning_rate": 5.683206574665165e-06, + "loss": 1.2308, + "step": 803 + }, + { + "epoch": 0.6535921146224977, + "grad_norm": 2.800039052963257, + "learning_rate": 5.6594679920337514e-06, + "loss": 1.2599, + "step": 804 + }, + { + "epoch": 0.6544050401381973, + "grad_norm": 2.9048550128936768, + "learning_rate": 5.635759509060969e-06, + "loss": 1.2707, + "step": 805 + }, + { + "epoch": 0.655217965653897, + "grad_norm": 4.015383720397949, + "learning_rate": 5.612081290154607e-06, + "loss": 1.1853, + "step": 806 + }, + { + "epoch": 0.6560308911695966, + "grad_norm": 2.6166458129882812, + "learning_rate": 5.58843349951258e-06, + "loss": 1.2589, + "step": 807 + }, + { + "epoch": 0.6568438166852962, + "grad_norm": 4.735121726989746, + "learning_rate": 5.564816301121792e-06, + "loss": 1.2395, + "step": 808 + }, + { + "epoch": 0.6576567422009958, + "grad_norm": 3.5069589614868164, + "learning_rate": 5.541229858757011e-06, + "loss": 1.2888, + "step": 809 + }, + { + "epoch": 0.6584696677166955, + "grad_norm": 2.354539394378662, + "learning_rate": 5.517674335979721e-06, + "loss": 1.1898, + "step": 810 + }, + { + "epoch": 0.6592825932323951, + "grad_norm": 3.2337725162506104, + "learning_rate": 5.494149896136998e-06, + "loss": 1.311, + "step": 811 + }, + { + "epoch": 0.6600955187480947, + "grad_norm": 2.6511757373809814, + "learning_rate": 5.470656702360367e-06, + "loss": 1.2788, + "step": 812 + }, + { + "epoch": 0.6609084442637944, + "grad_norm": 3.772780179977417, + "learning_rate": 5.447194917564671e-06, + "loss": 1.2211, + "step": 813 + }, + { + "epoch": 0.6617213697794939, + "grad_norm": 2.540316581726074, + "learning_rate": 5.423764704446954e-06, + "loss": 1.2647, + "step": 814 + }, + { + "epoch": 0.6625342952951936, + "grad_norm": 2.792747735977173, + "learning_rate": 5.400366225485326e-06, + "loss": 1.2184, + "step": 815 + }, + { + "epoch": 0.6633472208108933, + "grad_norm": 3.32261061668396, + "learning_rate": 5.376999642937817e-06, + "loss": 1.2727, + "step": 816 + }, + { + "epoch": 0.6641601463265928, + "grad_norm": 4.128072738647461, + "learning_rate": 5.353665118841296e-06, + "loss": 1.2718, + "step": 817 + }, + { + "epoch": 0.6649730718422925, + "grad_norm": 2.9913909435272217, + "learning_rate": 5.330362815010306e-06, + "loss": 1.2698, + "step": 818 + }, + { + "epoch": 0.665785997357992, + "grad_norm": 2.9993457794189453, + "learning_rate": 5.307092893035951e-06, + "loss": 1.2447, + "step": 819 + }, + { + "epoch": 0.6665989228736917, + "grad_norm": 2.801236629486084, + "learning_rate": 5.2838555142847925e-06, + "loss": 1.209, + "step": 820 + }, + { + "epoch": 0.6674118483893913, + "grad_norm": 3.982821464538574, + "learning_rate": 5.260650839897719e-06, + "loss": 1.3099, + "step": 821 + }, + { + "epoch": 0.6682247739050909, + "grad_norm": 2.9553382396698, + "learning_rate": 5.237479030788817e-06, + "loss": 1.2652, + "step": 822 + }, + { + "epoch": 0.6690376994207906, + "grad_norm": 3.233414888381958, + "learning_rate": 5.214340247644278e-06, + "loss": 1.2256, + "step": 823 + }, + { + "epoch": 0.6698506249364902, + "grad_norm": 3.1418299674987793, + "learning_rate": 5.191234650921273e-06, + "loss": 1.2225, + "step": 824 + }, + { + "epoch": 0.6706635504521898, + "grad_norm": 2.8071773052215576, + "learning_rate": 5.168162400846835e-06, + "loss": 1.3381, + "step": 825 + }, + { + "epoch": 0.6714764759678894, + "grad_norm": 3.2606897354125977, + "learning_rate": 5.145123657416759e-06, + "loss": 1.2671, + "step": 826 + }, + { + "epoch": 0.6722894014835891, + "grad_norm": 2.5103461742401123, + "learning_rate": 5.122118580394473e-06, + "loss": 1.2349, + "step": 827 + }, + { + "epoch": 0.6731023269992887, + "grad_norm": 2.882448196411133, + "learning_rate": 5.099147329309959e-06, + "loss": 1.2466, + "step": 828 + }, + { + "epoch": 0.6739152525149883, + "grad_norm": 3.0320730209350586, + "learning_rate": 5.076210063458622e-06, + "loss": 1.2157, + "step": 829 + }, + { + "epoch": 0.674728178030688, + "grad_norm": 3.285125970840454, + "learning_rate": 5.0533069419002e-06, + "loss": 1.3087, + "step": 830 + }, + { + "epoch": 0.6755411035463875, + "grad_norm": 3.9807510375976562, + "learning_rate": 5.030438123457655e-06, + "loss": 1.2153, + "step": 831 + }, + { + "epoch": 0.6763540290620872, + "grad_norm": 3.12975811958313, + "learning_rate": 5.007603766716063e-06, + "loss": 1.2064, + "step": 832 + }, + { + "epoch": 0.6771669545777869, + "grad_norm": 2.9132258892059326, + "learning_rate": 4.984804030021533e-06, + "loss": 1.2132, + "step": 833 + }, + { + "epoch": 0.6779798800934864, + "grad_norm": 2.872042417526245, + "learning_rate": 4.962039071480102e-06, + "loss": 1.2618, + "step": 834 + }, + { + "epoch": 0.6787928056091861, + "grad_norm": 3.7190613746643066, + "learning_rate": 4.939309048956622e-06, + "loss": 1.2482, + "step": 835 + }, + { + "epoch": 0.6796057311248856, + "grad_norm": 5.171625137329102, + "learning_rate": 4.9166141200736885e-06, + "loss": 1.2848, + "step": 836 + }, + { + "epoch": 0.6804186566405853, + "grad_norm": 3.5912961959838867, + "learning_rate": 4.89395444221055e-06, + "loss": 1.2525, + "step": 837 + }, + { + "epoch": 0.6812315821562849, + "grad_norm": 3.9113729000091553, + "learning_rate": 4.871330172501979e-06, + "loss": 1.2444, + "step": 838 + }, + { + "epoch": 0.6820445076719845, + "grad_norm": 5.135432720184326, + "learning_rate": 4.848741467837228e-06, + "loss": 1.2189, + "step": 839 + }, + { + "epoch": 0.6828574331876842, + "grad_norm": 3.0934841632843018, + "learning_rate": 4.826188484858918e-06, + "loss": 1.2357, + "step": 840 + }, + { + "epoch": 0.6836703587033838, + "grad_norm": 3.951188325881958, + "learning_rate": 4.803671379961945e-06, + "loss": 1.2539, + "step": 841 + }, + { + "epoch": 0.6844832842190834, + "grad_norm": 6.205260753631592, + "learning_rate": 4.781190309292421e-06, + "loss": 1.2537, + "step": 842 + }, + { + "epoch": 0.685296209734783, + "grad_norm": 4.493546485900879, + "learning_rate": 4.758745428746569e-06, + "loss": 1.252, + "step": 843 + }, + { + "epoch": 0.6861091352504827, + "grad_norm": 4.0202436447143555, + "learning_rate": 4.736336893969652e-06, + "loss": 1.1887, + "step": 844 + }, + { + "epoch": 0.6869220607661823, + "grad_norm": 2.65285587310791, + "learning_rate": 4.7139648603548925e-06, + "loss": 1.2612, + "step": 845 + }, + { + "epoch": 0.6877349862818819, + "grad_norm": 3.629551410675049, + "learning_rate": 4.691629483042387e-06, + "loss": 1.2411, + "step": 846 + }, + { + "epoch": 0.6885479117975816, + "grad_norm": 3.20709228515625, + "learning_rate": 4.669330916918043e-06, + "loss": 1.1949, + "step": 847 + }, + { + "epoch": 0.6893608373132811, + "grad_norm": 3.19427752494812, + "learning_rate": 4.647069316612502e-06, + "loss": 1.2134, + "step": 848 + }, + { + "epoch": 0.6901737628289808, + "grad_norm": 3.6364243030548096, + "learning_rate": 4.624844836500052e-06, + "loss": 1.2915, + "step": 849 + }, + { + "epoch": 0.6909866883446805, + "grad_norm": 3.5689237117767334, + "learning_rate": 4.60265763069758e-06, + "loss": 1.2234, + "step": 850 + }, + { + "epoch": 0.69179961386038, + "grad_norm": 3.1175014972686768, + "learning_rate": 4.580507853063487e-06, + "loss": 1.1833, + "step": 851 + }, + { + "epoch": 0.6926125393760797, + "grad_norm": 2.945756196975708, + "learning_rate": 4.5583956571966295e-06, + "loss": 1.2231, + "step": 852 + }, + { + "epoch": 0.6934254648917793, + "grad_norm": 4.729986667633057, + "learning_rate": 4.5363211964352524e-06, + "loss": 1.2578, + "step": 853 + }, + { + "epoch": 0.6942383904074789, + "grad_norm": 2.7775003910064697, + "learning_rate": 4.514284623855915e-06, + "loss": 1.2678, + "step": 854 + }, + { + "epoch": 0.6950513159231786, + "grad_norm": 4.027686595916748, + "learning_rate": 4.4922860922724466e-06, + "loss": 1.1692, + "step": 855 + }, + { + "epoch": 0.6958642414388782, + "grad_norm": 3.3442118167877197, + "learning_rate": 4.470325754234881e-06, + "loss": 1.2515, + "step": 856 + }, + { + "epoch": 0.6966771669545778, + "grad_norm": 3.197281837463379, + "learning_rate": 4.448403762028391e-06, + "loss": 1.2789, + "step": 857 + }, + { + "epoch": 0.6974900924702774, + "grad_norm": 3.1467063426971436, + "learning_rate": 4.426520267672244e-06, + "loss": 1.2498, + "step": 858 + }, + { + "epoch": 0.698303017985977, + "grad_norm": 8.657835960388184, + "learning_rate": 4.40467542291874e-06, + "loss": 1.2149, + "step": 859 + }, + { + "epoch": 0.6991159435016766, + "grad_norm": 5.045658111572266, + "learning_rate": 4.382869379252152e-06, + "loss": 1.2143, + "step": 860 + }, + { + "epoch": 0.6999288690173763, + "grad_norm": 3.543026924133301, + "learning_rate": 4.361102287887698e-06, + "loss": 1.2727, + "step": 861 + }, + { + "epoch": 0.700741794533076, + "grad_norm": 3.2592012882232666, + "learning_rate": 4.339374299770477e-06, + "loss": 1.2528, + "step": 862 + }, + { + "epoch": 0.7015547200487755, + "grad_norm": 3.284749984741211, + "learning_rate": 4.31768556557441e-06, + "loss": 1.1814, + "step": 863 + }, + { + "epoch": 0.7023676455644752, + "grad_norm": 2.9172427654266357, + "learning_rate": 4.296036235701235e-06, + "loss": 1.2536, + "step": 864 + }, + { + "epoch": 0.7031805710801747, + "grad_norm": 8.07040023803711, + "learning_rate": 4.274426460279412e-06, + "loss": 1.2113, + "step": 865 + }, + { + "epoch": 0.7039934965958744, + "grad_norm": 3.0349769592285156, + "learning_rate": 4.252856389163128e-06, + "loss": 1.2279, + "step": 866 + }, + { + "epoch": 0.7048064221115741, + "grad_norm": 2.7983269691467285, + "learning_rate": 4.231326171931231e-06, + "loss": 1.2585, + "step": 867 + }, + { + "epoch": 0.7056193476272736, + "grad_norm": 3.153099775314331, + "learning_rate": 4.209835957886196e-06, + "loss": 1.2576, + "step": 868 + }, + { + "epoch": 0.7064322731429733, + "grad_norm": 3.4303712844848633, + "learning_rate": 4.188385896053098e-06, + "loss": 1.2569, + "step": 869 + }, + { + "epoch": 0.7072451986586729, + "grad_norm": 3.310842990875244, + "learning_rate": 4.166976135178575e-06, + "loss": 1.2162, + "step": 870 + }, + { + "epoch": 0.7080581241743725, + "grad_norm": 3.982365846633911, + "learning_rate": 4.1456068237297964e-06, + "loss": 1.2409, + "step": 871 + }, + { + "epoch": 0.7088710496900722, + "grad_norm": 3.0641191005706787, + "learning_rate": 4.124278109893432e-06, + "loss": 1.2563, + "step": 872 + }, + { + "epoch": 0.7096839752057718, + "grad_norm": 2.9682273864746094, + "learning_rate": 4.10299014157462e-06, + "loss": 1.1857, + "step": 873 + }, + { + "epoch": 0.7104969007214714, + "grad_norm": 6.076914310455322, + "learning_rate": 4.0817430663959536e-06, + "loss": 1.2108, + "step": 874 + }, + { + "epoch": 0.711309826237171, + "grad_norm": 8.528678894042969, + "learning_rate": 4.06053703169645e-06, + "loss": 1.2185, + "step": 875 + }, + { + "epoch": 0.7121227517528707, + "grad_norm": 3.4424145221710205, + "learning_rate": 4.039372184530521e-06, + "loss": 1.2461, + "step": 876 + }, + { + "epoch": 0.7129356772685703, + "grad_norm": 3.1624224185943604, + "learning_rate": 4.0182486716669656e-06, + "loss": 1.2282, + "step": 877 + }, + { + "epoch": 0.7137486027842699, + "grad_norm": 4.986435890197754, + "learning_rate": 3.9971666395879605e-06, + "loss": 1.2048, + "step": 878 + }, + { + "epoch": 0.7145615282999696, + "grad_norm": 3.537174701690674, + "learning_rate": 3.9761262344880096e-06, + "loss": 1.2752, + "step": 879 + }, + { + "epoch": 0.7153744538156691, + "grad_norm": 2.7389779090881348, + "learning_rate": 3.9551276022729644e-06, + "loss": 1.2434, + "step": 880 + }, + { + "epoch": 0.7161873793313688, + "grad_norm": 3.5238423347473145, + "learning_rate": 3.9341708885590034e-06, + "loss": 1.2409, + "step": 881 + }, + { + "epoch": 0.7170003048470683, + "grad_norm": 3.9080941677093506, + "learning_rate": 3.913256238671607e-06, + "loss": 1.2019, + "step": 882 + }, + { + "epoch": 0.717813230362768, + "grad_norm": 4.038003921508789, + "learning_rate": 3.89238379764457e-06, + "loss": 1.2212, + "step": 883 + }, + { + "epoch": 0.7186261558784677, + "grad_norm": 3.344622850418091, + "learning_rate": 3.871553710218988e-06, + "loss": 1.2067, + "step": 884 + }, + { + "epoch": 0.7194390813941672, + "grad_norm": 3.5090816020965576, + "learning_rate": 3.850766120842252e-06, + "loss": 1.2171, + "step": 885 + }, + { + "epoch": 0.7202520069098669, + "grad_norm": 3.003899335861206, + "learning_rate": 3.830021173667048e-06, + "loss": 1.2371, + "step": 886 + }, + { + "epoch": 0.7210649324255665, + "grad_norm": 3.3116228580474854, + "learning_rate": 3.809319012550352e-06, + "loss": 1.2123, + "step": 887 + }, + { + "epoch": 0.7218778579412661, + "grad_norm": 3.532245397567749, + "learning_rate": 3.788659781052444e-06, + "loss": 1.2629, + "step": 888 + }, + { + "epoch": 0.7226907834569658, + "grad_norm": 4.061065196990967, + "learning_rate": 3.7680436224359084e-06, + "loss": 1.174, + "step": 889 + }, + { + "epoch": 0.7235037089726654, + "grad_norm": 3.3992788791656494, + "learning_rate": 3.747470679664624e-06, + "loss": 1.2209, + "step": 890 + }, + { + "epoch": 0.724316634488365, + "grad_norm": 3.4010937213897705, + "learning_rate": 3.7269410954028107e-06, + "loss": 1.2426, + "step": 891 + }, + { + "epoch": 0.7251295600040646, + "grad_norm": 2.854327917098999, + "learning_rate": 3.706455012013994e-06, + "loss": 1.1932, + "step": 892 + }, + { + "epoch": 0.7259424855197643, + "grad_norm": 3.451002836227417, + "learning_rate": 3.6860125715600513e-06, + "loss": 1.253, + "step": 893 + }, + { + "epoch": 0.7267554110354639, + "grad_norm": 3.123344898223877, + "learning_rate": 3.665613915800217e-06, + "loss": 1.2187, + "step": 894 + }, + { + "epoch": 0.7275683365511635, + "grad_norm": 3.021973133087158, + "learning_rate": 3.6452591861900886e-06, + "loss": 1.2165, + "step": 895 + }, + { + "epoch": 0.7283812620668632, + "grad_norm": 3.234985589981079, + "learning_rate": 3.6249485238806637e-06, + "loss": 1.212, + "step": 896 + }, + { + "epoch": 0.7291941875825627, + "grad_norm": 3.7146785259246826, + "learning_rate": 3.6046820697173514e-06, + "loss": 1.2697, + "step": 897 + }, + { + "epoch": 0.7300071130982624, + "grad_norm": 3.134507417678833, + "learning_rate": 3.5844599642389965e-06, + "loss": 1.2433, + "step": 898 + }, + { + "epoch": 0.7308200386139619, + "grad_norm": 2.9155194759368896, + "learning_rate": 3.564282347676903e-06, + "loss": 1.2403, + "step": 899 + }, + { + "epoch": 0.7316329641296616, + "grad_norm": 3.148232936859131, + "learning_rate": 3.54414935995387e-06, + "loss": 1.2575, + "step": 900 + }, + { + "epoch": 0.7324458896453613, + "grad_norm": 2.685274124145508, + "learning_rate": 3.524061140683206e-06, + "loss": 1.2124, + "step": 901 + }, + { + "epoch": 0.7332588151610608, + "grad_norm": 3.4557571411132812, + "learning_rate": 3.5040178291677816e-06, + "loss": 1.2105, + "step": 902 + }, + { + "epoch": 0.7340717406767605, + "grad_norm": 2.8230202198028564, + "learning_rate": 3.4840195643990383e-06, + "loss": 1.1745, + "step": 903 + }, + { + "epoch": 0.7348846661924601, + "grad_norm": 3.311697483062744, + "learning_rate": 3.464066485056048e-06, + "loss": 1.222, + "step": 904 + }, + { + "epoch": 0.7356975917081597, + "grad_norm": 3.2953929901123047, + "learning_rate": 3.444158729504549e-06, + "loss": 1.2688, + "step": 905 + }, + { + "epoch": 0.7365105172238594, + "grad_norm": 3.3319778442382812, + "learning_rate": 3.4242964357959597e-06, + "loss": 1.2539, + "step": 906 + }, + { + "epoch": 0.737323442739559, + "grad_norm": 3.124361753463745, + "learning_rate": 3.4044797416664564e-06, + "loss": 1.2527, + "step": 907 + }, + { + "epoch": 0.7381363682552586, + "grad_norm": 2.9690327644348145, + "learning_rate": 3.3847087845359996e-06, + "loss": 1.2722, + "step": 908 + }, + { + "epoch": 0.7389492937709582, + "grad_norm": 5.119561672210693, + "learning_rate": 3.364983701507376e-06, + "loss": 1.2233, + "step": 909 + }, + { + "epoch": 0.7397622192866579, + "grad_norm": 2.818423271179199, + "learning_rate": 3.3453046293652657e-06, + "loss": 1.2438, + "step": 910 + }, + { + "epoch": 0.7405751448023575, + "grad_norm": 3.0988523960113525, + "learning_rate": 3.3256717045752794e-06, + "loss": 1.223, + "step": 911 + }, + { + "epoch": 0.7413880703180571, + "grad_norm": 3.082066297531128, + "learning_rate": 3.3060850632830167e-06, + "loss": 1.244, + "step": 912 + }, + { + "epoch": 0.7422009958337568, + "grad_norm": 2.944265127182007, + "learning_rate": 3.286544841313126e-06, + "loss": 1.2308, + "step": 913 + }, + { + "epoch": 0.7430139213494563, + "grad_norm": 3.608762502670288, + "learning_rate": 3.2670511741683475e-06, + "loss": 1.2018, + "step": 914 + }, + { + "epoch": 0.743826846865156, + "grad_norm": 3.958385705947876, + "learning_rate": 3.2476041970285945e-06, + "loss": 1.2136, + "step": 915 + }, + { + "epoch": 0.7446397723808557, + "grad_norm": 2.9133267402648926, + "learning_rate": 3.2282040447500063e-06, + "loss": 1.2649, + "step": 916 + }, + { + "epoch": 0.7454526978965552, + "grad_norm": 3.8698244094848633, + "learning_rate": 3.208850851863998e-06, + "loss": 1.2265, + "step": 917 + }, + { + "epoch": 0.7462656234122549, + "grad_norm": 4.550247669219971, + "learning_rate": 3.189544752576369e-06, + "loss": 1.2046, + "step": 918 + }, + { + "epoch": 0.7470785489279544, + "grad_norm": 2.9886014461517334, + "learning_rate": 3.1702858807663175e-06, + "loss": 1.2812, + "step": 919 + }, + { + "epoch": 0.7478914744436541, + "grad_norm": 3.3736209869384766, + "learning_rate": 3.151074369985556e-06, + "loss": 1.2482, + "step": 920 + }, + { + "epoch": 0.7487043999593537, + "grad_norm": 2.7061290740966797, + "learning_rate": 3.131910353457369e-06, + "loss": 1.2474, + "step": 921 + }, + { + "epoch": 0.7495173254750533, + "grad_norm": 4.058886528015137, + "learning_rate": 3.112793964075681e-06, + "loss": 1.1897, + "step": 922 + }, + { + "epoch": 0.750330250990753, + "grad_norm": 3.3311798572540283, + "learning_rate": 3.0937253344041507e-06, + "loss": 1.2129, + "step": 923 + }, + { + "epoch": 0.7511431765064526, + "grad_norm": 3.2716569900512695, + "learning_rate": 3.074704596675242e-06, + "loss": 1.1763, + "step": 924 + }, + { + "epoch": 0.7519561020221522, + "grad_norm": 3.360356569290161, + "learning_rate": 3.055731882789311e-06, + "loss": 1.2771, + "step": 925 + }, + { + "epoch": 0.7527690275378518, + "grad_norm": 3.9494638442993164, + "learning_rate": 3.0368073243136874e-06, + "loss": 1.2551, + "step": 926 + }, + { + "epoch": 0.7535819530535515, + "grad_norm": 3.3180434703826904, + "learning_rate": 3.0179310524817707e-06, + "loss": 1.245, + "step": 927 + }, + { + "epoch": 0.7543948785692511, + "grad_norm": 4.963752746582031, + "learning_rate": 2.9991031981921026e-06, + "loss": 1.2266, + "step": 928 + }, + { + "epoch": 0.7552078040849507, + "grad_norm": 3.1220555305480957, + "learning_rate": 2.9803238920074784e-06, + "loss": 1.2057, + "step": 929 + }, + { + "epoch": 0.7560207296006504, + "grad_norm": 2.8764801025390625, + "learning_rate": 2.961593264154038e-06, + "loss": 1.2157, + "step": 930 + }, + { + "epoch": 0.7568336551163499, + "grad_norm": 2.682791233062744, + "learning_rate": 2.9429114445203423e-06, + "loss": 1.1899, + "step": 931 + }, + { + "epoch": 0.7576465806320496, + "grad_norm": 5.8080878257751465, + "learning_rate": 2.924278562656514e-06, + "loss": 1.1661, + "step": 932 + }, + { + "epoch": 0.7584595061477493, + "grad_norm": 3.5146303176879883, + "learning_rate": 2.90569474777329e-06, + "loss": 1.2712, + "step": 933 + }, + { + "epoch": 0.7592724316634488, + "grad_norm": 3.092174530029297, + "learning_rate": 2.8871601287411634e-06, + "loss": 1.2297, + "step": 934 + }, + { + "epoch": 0.7600853571791485, + "grad_norm": 2.807847499847412, + "learning_rate": 2.8686748340894744e-06, + "loss": 1.2369, + "step": 935 + }, + { + "epoch": 0.760898282694848, + "grad_norm": 2.8753178119659424, + "learning_rate": 2.850238992005514e-06, + "loss": 1.2812, + "step": 936 + }, + { + "epoch": 0.7617112082105477, + "grad_norm": 4.227181434631348, + "learning_rate": 2.8318527303336465e-06, + "loss": 1.2143, + "step": 937 + }, + { + "epoch": 0.7625241337262473, + "grad_norm": 3.921201229095459, + "learning_rate": 2.81351617657442e-06, + "loss": 1.2446, + "step": 938 + }, + { + "epoch": 0.763337059241947, + "grad_norm": 3.164557695388794, + "learning_rate": 2.795229457883678e-06, + "loss": 1.2085, + "step": 939 + }, + { + "epoch": 0.7641499847576466, + "grad_norm": 3.0904717445373535, + "learning_rate": 2.7769927010716814e-06, + "loss": 1.2436, + "step": 940 + }, + { + "epoch": 0.7649629102733462, + "grad_norm": 9.615850448608398, + "learning_rate": 2.7588060326022205e-06, + "loss": 1.2179, + "step": 941 + }, + { + "epoch": 0.7657758357890458, + "grad_norm": 7.9210357666015625, + "learning_rate": 2.740669578591755e-06, + "loss": 1.1704, + "step": 942 + }, + { + "epoch": 0.7665887613047454, + "grad_norm": 3.03359055519104, + "learning_rate": 2.7225834648085282e-06, + "loss": 1.1919, + "step": 943 + }, + { + "epoch": 0.7674016868204451, + "grad_norm": 3.331894636154175, + "learning_rate": 2.7045478166716843e-06, + "loss": 1.2297, + "step": 944 + }, + { + "epoch": 0.7682146123361447, + "grad_norm": 2.9995782375335693, + "learning_rate": 2.6865627592504295e-06, + "loss": 1.1936, + "step": 945 + }, + { + "epoch": 0.7690275378518443, + "grad_norm": 11.267196655273438, + "learning_rate": 2.668628417263137e-06, + "loss": 1.2385, + "step": 946 + }, + { + "epoch": 0.769840463367544, + "grad_norm": 4.058920383453369, + "learning_rate": 2.6507449150764852e-06, + "loss": 1.2078, + "step": 947 + }, + { + "epoch": 0.7706533888832435, + "grad_norm": 2.8774616718292236, + "learning_rate": 2.632912376704607e-06, + "loss": 1.2585, + "step": 948 + }, + { + "epoch": 0.7714663143989432, + "grad_norm": 3.4053540229797363, + "learning_rate": 2.615130925808228e-06, + "loss": 1.2739, + "step": 949 + }, + { + "epoch": 0.7722792399146429, + "grad_norm": 3.0022501945495605, + "learning_rate": 2.597400685693795e-06, + "loss": 1.2136, + "step": 950 + }, + { + "epoch": 0.7730921654303424, + "grad_norm": 3.6466481685638428, + "learning_rate": 2.5797217793126373e-06, + "loss": 1.3104, + "step": 951 + }, + { + "epoch": 0.7739050909460421, + "grad_norm": 4.021648406982422, + "learning_rate": 2.5620943292601074e-06, + "loss": 1.2621, + "step": 952 + }, + { + "epoch": 0.7747180164617417, + "grad_norm": 2.996817111968994, + "learning_rate": 2.5445184577747305e-06, + "loss": 1.2194, + "step": 953 + }, + { + "epoch": 0.7755309419774413, + "grad_norm": 3.8881189823150635, + "learning_rate": 2.52699428673736e-06, + "loss": 1.2516, + "step": 954 + }, + { + "epoch": 0.776343867493141, + "grad_norm": 3.279557228088379, + "learning_rate": 2.5095219376703183e-06, + "loss": 1.2116, + "step": 955 + }, + { + "epoch": 0.7771567930088406, + "grad_norm": 3.1030569076538086, + "learning_rate": 2.4921015317365794e-06, + "loss": 1.2902, + "step": 956 + }, + { + "epoch": 0.7779697185245402, + "grad_norm": 3.7724967002868652, + "learning_rate": 2.4747331897389103e-06, + "loss": 1.2783, + "step": 957 + }, + { + "epoch": 0.7787826440402398, + "grad_norm": 2.808138132095337, + "learning_rate": 2.4574170321190305e-06, + "loss": 1.2191, + "step": 958 + }, + { + "epoch": 0.7795955695559394, + "grad_norm": 2.6033871173858643, + "learning_rate": 2.440153178956798e-06, + "loss": 1.2282, + "step": 959 + }, + { + "epoch": 0.780408495071639, + "grad_norm": 2.870957612991333, + "learning_rate": 2.42294174996935e-06, + "loss": 1.2118, + "step": 960 + }, + { + "epoch": 0.7812214205873387, + "grad_norm": 2.913543462753296, + "learning_rate": 2.40578286451029e-06, + "loss": 1.2352, + "step": 961 + }, + { + "epoch": 0.7820343461030383, + "grad_norm": 3.7069716453552246, + "learning_rate": 2.38867664156886e-06, + "loss": 1.2218, + "step": 962 + }, + { + "epoch": 0.7828472716187379, + "grad_norm": 4.073693752288818, + "learning_rate": 2.3716231997691007e-06, + "loss": 1.1997, + "step": 963 + }, + { + "epoch": 0.7836601971344376, + "grad_norm": 2.7815756797790527, + "learning_rate": 2.3546226573690444e-06, + "loss": 1.1898, + "step": 964 + }, + { + "epoch": 0.7844731226501371, + "grad_norm": 3.2033910751342773, + "learning_rate": 2.3376751322599e-06, + "loss": 1.2575, + "step": 965 + }, + { + "epoch": 0.7852860481658368, + "grad_norm": 2.805227518081665, + "learning_rate": 2.320780741965206e-06, + "loss": 1.221, + "step": 966 + }, + { + "epoch": 0.7860989736815365, + "grad_norm": 2.747638463973999, + "learning_rate": 2.3039396036400463e-06, + "loss": 1.2199, + "step": 967 + }, + { + "epoch": 0.786911899197236, + "grad_norm": 2.758178234100342, + "learning_rate": 2.287151834070226e-06, + "loss": 1.1847, + "step": 968 + }, + { + "epoch": 0.7877248247129357, + "grad_norm": 3.467595338821411, + "learning_rate": 2.2704175496714552e-06, + "loss": 1.2456, + "step": 969 + }, + { + "epoch": 0.7885377502286353, + "grad_norm": 5.487158298492432, + "learning_rate": 2.2537368664885527e-06, + "loss": 1.2061, + "step": 970 + }, + { + "epoch": 0.7893506757443349, + "grad_norm": 3.063075542449951, + "learning_rate": 2.2371099001946385e-06, + "loss": 1.264, + "step": 971 + }, + { + "epoch": 0.7901636012600346, + "grad_norm": 2.6598317623138428, + "learning_rate": 2.2205367660903267e-06, + "loss": 1.1971, + "step": 972 + }, + { + "epoch": 0.7909765267757342, + "grad_norm": 3.249379873275757, + "learning_rate": 2.2040175791029305e-06, + "loss": 1.2442, + "step": 973 + }, + { + "epoch": 0.7917894522914338, + "grad_norm": 3.2312817573547363, + "learning_rate": 2.187552453785662e-06, + "loss": 1.1871, + "step": 974 + }, + { + "epoch": 0.7926023778071334, + "grad_norm": 3.060171604156494, + "learning_rate": 2.1711415043168395e-06, + "loss": 1.2198, + "step": 975 + }, + { + "epoch": 0.793415303322833, + "grad_norm": 3.2674033641815186, + "learning_rate": 2.1547848444991025e-06, + "loss": 1.2343, + "step": 976 + }, + { + "epoch": 0.7942282288385327, + "grad_norm": 3.822357654571533, + "learning_rate": 2.138482587758605e-06, + "loss": 1.1876, + "step": 977 + }, + { + "epoch": 0.7950411543542323, + "grad_norm": 3.4773342609405518, + "learning_rate": 2.1222348471442477e-06, + "loss": 1.1976, + "step": 978 + }, + { + "epoch": 0.795854079869932, + "grad_norm": 3.8379478454589844, + "learning_rate": 2.1060417353268845e-06, + "loss": 1.198, + "step": 979 + }, + { + "epoch": 0.7966670053856315, + "grad_norm": 4.963233470916748, + "learning_rate": 2.0899033645985423e-06, + "loss": 1.2991, + "step": 980 + }, + { + "epoch": 0.7974799309013312, + "grad_norm": 3.4560701847076416, + "learning_rate": 2.073819846871646e-06, + "loss": 1.1936, + "step": 981 + }, + { + "epoch": 0.7982928564170307, + "grad_norm": 2.69124698638916, + "learning_rate": 2.0577912936782317e-06, + "loss": 1.1708, + "step": 982 + }, + { + "epoch": 0.7991057819327304, + "grad_norm": 2.973618268966675, + "learning_rate": 2.041817816169187e-06, + "loss": 1.2535, + "step": 983 + }, + { + "epoch": 0.7999187074484301, + "grad_norm": 3.1709506511688232, + "learning_rate": 2.025899525113474e-06, + "loss": 1.2015, + "step": 984 + }, + { + "epoch": 0.8007316329641296, + "grad_norm": 2.750272274017334, + "learning_rate": 2.010036530897359e-06, + "loss": 1.2677, + "step": 985 + }, + { + "epoch": 0.8015445584798293, + "grad_norm": 2.7218148708343506, + "learning_rate": 1.9942289435236506e-06, + "loss": 1.2679, + "step": 986 + }, + { + "epoch": 0.8023574839955289, + "grad_norm": 3.0237209796905518, + "learning_rate": 1.978476872610939e-06, + "loss": 1.2425, + "step": 987 + }, + { + "epoch": 0.8031704095112285, + "grad_norm": 4.8593363761901855, + "learning_rate": 1.962780427392823e-06, + "loss": 1.2754, + "step": 988 + }, + { + "epoch": 0.8039833350269282, + "grad_norm": 4.2402544021606445, + "learning_rate": 1.9471397167171714e-06, + "loss": 1.1841, + "step": 989 + }, + { + "epoch": 0.8047962605426278, + "grad_norm": 2.8616418838500977, + "learning_rate": 1.931554849045355e-06, + "loss": 1.1712, + "step": 990 + }, + { + "epoch": 0.8056091860583274, + "grad_norm": 3.0303030014038086, + "learning_rate": 1.916025932451493e-06, + "loss": 1.2217, + "step": 991 + }, + { + "epoch": 0.806422111574027, + "grad_norm": 3.096165180206299, + "learning_rate": 1.9005530746217238e-06, + "loss": 1.1515, + "step": 992 + }, + { + "epoch": 0.8072350370897267, + "grad_norm": 5.142411231994629, + "learning_rate": 1.8851363828534253e-06, + "loss": 1.167, + "step": 993 + }, + { + "epoch": 0.8080479626054263, + "grad_norm": 3.1720876693725586, + "learning_rate": 1.869775964054501e-06, + "loss": 1.1896, + "step": 994 + }, + { + "epoch": 0.8088608881211259, + "grad_norm": 3.833009719848633, + "learning_rate": 1.8544719247426224e-06, + "loss": 1.2517, + "step": 995 + }, + { + "epoch": 0.8096738136368256, + "grad_norm": 3.188974618911743, + "learning_rate": 1.8392243710444911e-06, + "loss": 1.2795, + "step": 996 + }, + { + "epoch": 0.8104867391525251, + "grad_norm": 3.601663589477539, + "learning_rate": 1.8240334086951117e-06, + "loss": 1.2366, + "step": 997 + }, + { + "epoch": 0.8112996646682248, + "grad_norm": 3.1258544921875, + "learning_rate": 1.8088991430370506e-06, + "loss": 1.2002, + "step": 998 + }, + { + "epoch": 0.8121125901839243, + "grad_norm": 2.71299409866333, + "learning_rate": 1.7938216790197071e-06, + "loss": 1.2609, + "step": 999 + }, + { + "epoch": 0.812925515699624, + "grad_norm": 3.2866601943969727, + "learning_rate": 1.77880112119859e-06, + "loss": 1.2571, + "step": 1000 + }, + { + "epoch": 0.8137384412153237, + "grad_norm": 3.1053292751312256, + "learning_rate": 1.7638375737345804e-06, + "loss": 1.2316, + "step": 1001 + }, + { + "epoch": 0.8145513667310232, + "grad_norm": 2.839862823486328, + "learning_rate": 1.7489311403932274e-06, + "loss": 1.2464, + "step": 1002 + }, + { + "epoch": 0.8153642922467229, + "grad_norm": 2.750040292739868, + "learning_rate": 1.7340819245440166e-06, + "loss": 1.2639, + "step": 1003 + }, + { + "epoch": 0.8161772177624225, + "grad_norm": 3.918286085128784, + "learning_rate": 1.7192900291596493e-06, + "loss": 1.2379, + "step": 1004 + }, + { + "epoch": 0.8169901432781221, + "grad_norm": 3.579942226409912, + "learning_rate": 1.7045555568153415e-06, + "loss": 1.1943, + "step": 1005 + }, + { + "epoch": 0.8178030687938218, + "grad_norm": 3.2873690128326416, + "learning_rate": 1.6898786096881104e-06, + "loss": 1.2457, + "step": 1006 + }, + { + "epoch": 0.8186159943095214, + "grad_norm": 2.721126079559326, + "learning_rate": 1.6752592895560493e-06, + "loss": 1.2681, + "step": 1007 + }, + { + "epoch": 0.819428919825221, + "grad_norm": 2.9273929595947266, + "learning_rate": 1.6606976977976408e-06, + "loss": 1.1985, + "step": 1008 + }, + { + "epoch": 0.8202418453409206, + "grad_norm": 3.6816606521606445, + "learning_rate": 1.6461939353910494e-06, + "loss": 1.2128, + "step": 1009 + }, + { + "epoch": 0.8210547708566203, + "grad_norm": 2.8991682529449463, + "learning_rate": 1.631748102913412e-06, + "loss": 1.224, + "step": 1010 + }, + { + "epoch": 0.8218676963723199, + "grad_norm": 3.2517406940460205, + "learning_rate": 1.6173603005401505e-06, + "loss": 1.1936, + "step": 1011 + }, + { + "epoch": 0.8226806218880195, + "grad_norm": 3.0502426624298096, + "learning_rate": 1.6030306280442764e-06, + "loss": 1.2555, + "step": 1012 + }, + { + "epoch": 0.8234935474037192, + "grad_norm": 3.2694664001464844, + "learning_rate": 1.588759184795694e-06, + "loss": 1.2643, + "step": 1013 + }, + { + "epoch": 0.8243064729194187, + "grad_norm": 2.9429259300231934, + "learning_rate": 1.574546069760514e-06, + "loss": 1.2221, + "step": 1014 + }, + { + "epoch": 0.8251193984351184, + "grad_norm": 3.2481369972229004, + "learning_rate": 1.5603913815003634e-06, + "loss": 1.1949, + "step": 1015 + }, + { + "epoch": 0.8259323239508181, + "grad_norm": 3.006603717803955, + "learning_rate": 1.5462952181717117e-06, + "loss": 1.1593, + "step": 1016 + }, + { + "epoch": 0.8267452494665176, + "grad_norm": 2.8126094341278076, + "learning_rate": 1.532257677525183e-06, + "loss": 1.2094, + "step": 1017 + }, + { + "epoch": 0.8275581749822173, + "grad_norm": 3.258910894393921, + "learning_rate": 1.5182788569048689e-06, + "loss": 1.1524, + "step": 1018 + }, + { + "epoch": 0.8283711004979168, + "grad_norm": 3.097121477127075, + "learning_rate": 1.5043588532476827e-06, + "loss": 1.2063, + "step": 1019 + }, + { + "epoch": 0.8291840260136165, + "grad_norm": 3.5429606437683105, + "learning_rate": 1.49049776308265e-06, + "loss": 1.1579, + "step": 1020 + }, + { + "epoch": 0.8299969515293161, + "grad_norm": 3.0676991939544678, + "learning_rate": 1.476695682530268e-06, + "loss": 1.2063, + "step": 1021 + }, + { + "epoch": 0.8308098770450157, + "grad_norm": 3.191493272781372, + "learning_rate": 1.4629527073018267e-06, + "loss": 1.2724, + "step": 1022 + }, + { + "epoch": 0.8316228025607154, + "grad_norm": 4.181521415710449, + "learning_rate": 1.449268932698743e-06, + "loss": 1.2627, + "step": 1023 + }, + { + "epoch": 0.832435728076415, + "grad_norm": 3.7330870628356934, + "learning_rate": 1.4356444536119085e-06, + "loss": 1.1875, + "step": 1024 + }, + { + "epoch": 0.8332486535921146, + "grad_norm": 3.5213124752044678, + "learning_rate": 1.422079364521024e-06, + "loss": 1.2345, + "step": 1025 + }, + { + "epoch": 0.8340615791078142, + "grad_norm": 3.672848701477051, + "learning_rate": 1.4085737594939497e-06, + "loss": 1.2451, + "step": 1026 + }, + { + "epoch": 0.8348745046235139, + "grad_norm": 3.2613043785095215, + "learning_rate": 1.3951277321860468e-06, + "loss": 1.261, + "step": 1027 + }, + { + "epoch": 0.8356874301392135, + "grad_norm": 3.1444427967071533, + "learning_rate": 1.381741375839537e-06, + "loss": 1.2205, + "step": 1028 + }, + { + "epoch": 0.8365003556549131, + "grad_norm": 3.7306652069091797, + "learning_rate": 1.3684147832828409e-06, + "loss": 1.2343, + "step": 1029 + }, + { + "epoch": 0.8373132811706128, + "grad_norm": 3.6698615550994873, + "learning_rate": 1.355148046929956e-06, + "loss": 1.2195, + "step": 1030 + }, + { + "epoch": 0.8381262066863123, + "grad_norm": 4.807132244110107, + "learning_rate": 1.3419412587797908e-06, + "loss": 1.1946, + "step": 1031 + }, + { + "epoch": 0.838939132202012, + "grad_norm": 3.0877437591552734, + "learning_rate": 1.3287945104155487e-06, + "loss": 1.1901, + "step": 1032 + }, + { + "epoch": 0.8397520577177117, + "grad_norm": 6.123032093048096, + "learning_rate": 1.3157078930040856e-06, + "loss": 1.2338, + "step": 1033 + }, + { + "epoch": 0.8405649832334112, + "grad_norm": 3.8207807540893555, + "learning_rate": 1.3026814972952674e-06, + "loss": 1.2064, + "step": 1034 + }, + { + "epoch": 0.8413779087491109, + "grad_norm": 3.591054916381836, + "learning_rate": 1.2897154136213542e-06, + "loss": 1.248, + "step": 1035 + }, + { + "epoch": 0.8421908342648105, + "grad_norm": 3.14103364944458, + "learning_rate": 1.2768097318963701e-06, + "loss": 1.2247, + "step": 1036 + }, + { + "epoch": 0.8430037597805101, + "grad_norm": 3.2605819702148438, + "learning_rate": 1.2639645416154744e-06, + "loss": 1.2265, + "step": 1037 + }, + { + "epoch": 0.8438166852962098, + "grad_norm": 3.2860848903656006, + "learning_rate": 1.2511799318543493e-06, + "loss": 1.2083, + "step": 1038 + }, + { + "epoch": 0.8446296108119093, + "grad_norm": 3.6271586418151855, + "learning_rate": 1.2384559912685768e-06, + "loss": 1.2562, + "step": 1039 + }, + { + "epoch": 0.845442536327609, + "grad_norm": 3.0439271926879883, + "learning_rate": 1.2257928080930236e-06, + "loss": 1.1838, + "step": 1040 + }, + { + "epoch": 0.8462554618433086, + "grad_norm": 2.9285664558410645, + "learning_rate": 1.2131904701412345e-06, + "loss": 1.2271, + "step": 1041 + }, + { + "epoch": 0.8470683873590082, + "grad_norm": 4.422233581542969, + "learning_rate": 1.2006490648048118e-06, + "loss": 1.2218, + "step": 1042 + }, + { + "epoch": 0.8478813128747078, + "grad_norm": 3.193469524383545, + "learning_rate": 1.1881686790528279e-06, + "loss": 1.2167, + "step": 1043 + }, + { + "epoch": 0.8486942383904075, + "grad_norm": 2.9041225910186768, + "learning_rate": 1.1757493994312052e-06, + "loss": 1.1652, + "step": 1044 + }, + { + "epoch": 0.8495071639061071, + "grad_norm": 2.902376890182495, + "learning_rate": 1.1633913120621188e-06, + "loss": 1.209, + "step": 1045 + }, + { + "epoch": 0.8503200894218067, + "grad_norm": 2.7561545372009277, + "learning_rate": 1.151094502643414e-06, + "loss": 1.2105, + "step": 1046 + }, + { + "epoch": 0.8511330149375064, + "grad_norm": 3.4532971382141113, + "learning_rate": 1.1388590564479895e-06, + "loss": 1.2457, + "step": 1047 + }, + { + "epoch": 0.8519459404532059, + "grad_norm": 4.540160179138184, + "learning_rate": 1.1266850583232224e-06, + "loss": 1.1941, + "step": 1048 + }, + { + "epoch": 0.8527588659689056, + "grad_norm": 2.99617075920105, + "learning_rate": 1.1145725926903772e-06, + "loss": 1.2138, + "step": 1049 + }, + { + "epoch": 0.8535717914846053, + "grad_norm": 3.2309064865112305, + "learning_rate": 1.1025217435440116e-06, + "loss": 1.2373, + "step": 1050 + }, + { + "epoch": 0.8543847170003048, + "grad_norm": 2.7454960346221924, + "learning_rate": 1.0905325944514034e-06, + "loss": 1.2473, + "step": 1051 + }, + { + "epoch": 0.8551976425160045, + "grad_norm": 8.090238571166992, + "learning_rate": 1.078605228551971e-06, + "loss": 1.2342, + "step": 1052 + }, + { + "epoch": 0.8560105680317041, + "grad_norm": 3.7213146686553955, + "learning_rate": 1.0667397285566893e-06, + "loss": 1.2232, + "step": 1053 + }, + { + "epoch": 0.8568234935474037, + "grad_norm": 3.4427578449249268, + "learning_rate": 1.0549361767475241e-06, + "loss": 1.2474, + "step": 1054 + }, + { + "epoch": 0.8576364190631034, + "grad_norm": 3.212726593017578, + "learning_rate": 1.0431946549768567e-06, + "loss": 1.2727, + "step": 1055 + }, + { + "epoch": 0.858449344578803, + "grad_norm": 3.895224094390869, + "learning_rate": 1.0315152446669142e-06, + "loss": 1.2451, + "step": 1056 + }, + { + "epoch": 0.8592622700945026, + "grad_norm": 2.8261964321136475, + "learning_rate": 1.019898026809214e-06, + "loss": 1.2416, + "step": 1057 + }, + { + "epoch": 0.8600751956102022, + "grad_norm": 3.2146759033203125, + "learning_rate": 1.0083430819639962e-06, + "loss": 1.2258, + "step": 1058 + }, + { + "epoch": 0.8608881211259019, + "grad_norm": 5.239031791687012, + "learning_rate": 9.968504902596566e-07, + "loss": 1.2089, + "step": 1059 + }, + { + "epoch": 0.8617010466416014, + "grad_norm": 3.1487622261047363, + "learning_rate": 9.85420331392214e-07, + "loss": 1.2445, + "step": 1060 + }, + { + "epoch": 0.8625139721573011, + "grad_norm": 3.5642974376678467, + "learning_rate": 9.74052684624731e-07, + "loss": 1.2724, + "step": 1061 + }, + { + "epoch": 0.8633268976730007, + "grad_norm": 3.3064541816711426, + "learning_rate": 9.62747628786782e-07, + "loss": 1.235, + "step": 1062 + }, + { + "epoch": 0.8641398231887003, + "grad_norm": 2.7583703994750977, + "learning_rate": 9.515052422739035e-07, + "loss": 1.1864, + "step": 1063 + }, + { + "epoch": 0.8649527487044, + "grad_norm": 2.8002755641937256, + "learning_rate": 9.403256030470386e-07, + "loss": 1.1888, + "step": 1064 + }, + { + "epoch": 0.8657656742200995, + "grad_norm": 4.0211710929870605, + "learning_rate": 9.292087886320166e-07, + "loss": 1.2513, + "step": 1065 + }, + { + "epoch": 0.8665785997357992, + "grad_norm": 3.937668561935425, + "learning_rate": 9.181548761189996e-07, + "loss": 1.2111, + "step": 1066 + }, + { + "epoch": 0.8673915252514989, + "grad_norm": 31.291566848754883, + "learning_rate": 9.071639421619527e-07, + "loss": 1.2234, + "step": 1067 + }, + { + "epoch": 0.8682044507671984, + "grad_norm": 4.150018692016602, + "learning_rate": 8.962360629781164e-07, + "loss": 1.2205, + "step": 1068 + }, + { + "epoch": 0.8690173762828981, + "grad_norm": 2.8017213344573975, + "learning_rate": 8.853713143474685e-07, + "loss": 1.27, + "step": 1069 + }, + { + "epoch": 0.8698303017985977, + "grad_norm": 2.9798476696014404, + "learning_rate": 8.745697716122081e-07, + "loss": 1.2169, + "step": 1070 + }, + { + "epoch": 0.8706432273142973, + "grad_norm": 4.344991683959961, + "learning_rate": 8.638315096762318e-07, + "loss": 1.2217, + "step": 1071 + }, + { + "epoch": 0.871456152829997, + "grad_norm": 2.9421257972717285, + "learning_rate": 8.531566030046035e-07, + "loss": 1.2399, + "step": 1072 + }, + { + "epoch": 0.8722690783456966, + "grad_norm": 3.4676921367645264, + "learning_rate": 8.425451256230588e-07, + "loss": 1.1957, + "step": 1073 + }, + { + "epoch": 0.8730820038613962, + "grad_norm": 3.2855141162872314, + "learning_rate": 8.319971511174718e-07, + "loss": 1.2399, + "step": 1074 + }, + { + "epoch": 0.8738949293770958, + "grad_norm": 2.990471839904785, + "learning_rate": 8.215127526333499e-07, + "loss": 1.2787, + "step": 1075 + }, + { + "epoch": 0.8747078548927955, + "grad_norm": 3.183928966522217, + "learning_rate": 8.110920028753355e-07, + "loss": 1.1831, + "step": 1076 + }, + { + "epoch": 0.8755207804084951, + "grad_norm": 2.8277997970581055, + "learning_rate": 8.007349741066939e-07, + "loss": 1.248, + "step": 1077 + }, + { + "epoch": 0.8763337059241947, + "grad_norm": 2.7392983436584473, + "learning_rate": 7.904417381488083e-07, + "loss": 1.23, + "step": 1078 + }, + { + "epoch": 0.8771466314398944, + "grad_norm": 5.617170333862305, + "learning_rate": 7.802123663806938e-07, + "loss": 1.2267, + "step": 1079 + }, + { + "epoch": 0.8779595569555939, + "grad_norm": 2.906653642654419, + "learning_rate": 7.700469297384927e-07, + "loss": 1.2245, + "step": 1080 + }, + { + "epoch": 0.8787724824712936, + "grad_norm": 2.7728428840637207, + "learning_rate": 7.599454987149868e-07, + "loss": 1.2131, + "step": 1081 + }, + { + "epoch": 0.8795854079869931, + "grad_norm": 2.683861017227173, + "learning_rate": 7.499081433591071e-07, + "loss": 1.1936, + "step": 1082 + }, + { + "epoch": 0.8803983335026928, + "grad_norm": 2.6362993717193604, + "learning_rate": 7.399349332754458e-07, + "loss": 1.2169, + "step": 1083 + }, + { + "epoch": 0.8812112590183925, + "grad_norm": 3.3068742752075195, + "learning_rate": 7.300259376237795e-07, + "loss": 1.2098, + "step": 1084 + }, + { + "epoch": 0.882024184534092, + "grad_norm": 2.825416326522827, + "learning_rate": 7.201812251185869e-07, + "loss": 1.2543, + "step": 1085 + }, + { + "epoch": 0.8828371100497917, + "grad_norm": 3.172919750213623, + "learning_rate": 7.104008640285642e-07, + "loss": 1.1768, + "step": 1086 + }, + { + "epoch": 0.8836500355654913, + "grad_norm": 3.052677869796753, + "learning_rate": 7.006849221761736e-07, + "loss": 1.2068, + "step": 1087 + }, + { + "epoch": 0.8844629610811909, + "grad_norm": 2.8510589599609375, + "learning_rate": 6.910334669371433e-07, + "loss": 1.2043, + "step": 1088 + }, + { + "epoch": 0.8852758865968906, + "grad_norm": 3.4369497299194336, + "learning_rate": 6.814465652400237e-07, + "loss": 1.2467, + "step": 1089 + }, + { + "epoch": 0.8860888121125902, + "grad_norm": 2.667567491531372, + "learning_rate": 6.719242835657147e-07, + "loss": 1.2594, + "step": 1090 + }, + { + "epoch": 0.8869017376282898, + "grad_norm": 2.983642816543579, + "learning_rate": 6.62466687947001e-07, + "loss": 1.2199, + "step": 1091 + }, + { + "epoch": 0.8877146631439894, + "grad_norm": 3.583439350128174, + "learning_rate": 6.530738439681017e-07, + "loss": 1.1827, + "step": 1092 + }, + { + "epoch": 0.8885275886596891, + "grad_norm": 4.706247806549072, + "learning_rate": 6.437458167642164e-07, + "loss": 1.2292, + "step": 1093 + }, + { + "epoch": 0.8893405141753887, + "grad_norm": 4.394626140594482, + "learning_rate": 6.344826710210584e-07, + "loss": 1.2975, + "step": 1094 + }, + { + "epoch": 0.8901534396910883, + "grad_norm": 4.5692572593688965, + "learning_rate": 6.252844709744255e-07, + "loss": 1.1853, + "step": 1095 + }, + { + "epoch": 0.890966365206788, + "grad_norm": 3.4114434719085693, + "learning_rate": 6.161512804097436e-07, + "loss": 1.2067, + "step": 1096 + }, + { + "epoch": 0.8917792907224875, + "grad_norm": 7.298144340515137, + "learning_rate": 6.070831626616236e-07, + "loss": 1.2149, + "step": 1097 + }, + { + "epoch": 0.8925922162381872, + "grad_norm": 2.7437572479248047, + "learning_rate": 5.980801806134318e-07, + "loss": 1.2002, + "step": 1098 + }, + { + "epoch": 0.8934051417538869, + "grad_norm": 3.101397752761841, + "learning_rate": 5.891423966968413e-07, + "loss": 1.2594, + "step": 1099 + }, + { + "epoch": 0.8942180672695864, + "grad_norm": 3.186479091644287, + "learning_rate": 5.80269872891408e-07, + "loss": 1.1895, + "step": 1100 + }, + { + "epoch": 0.8950309927852861, + "grad_norm": 3.5605878829956055, + "learning_rate": 5.714626707241411e-07, + "loss": 1.1804, + "step": 1101 + }, + { + "epoch": 0.8958439183009856, + "grad_norm": 3.0213913917541504, + "learning_rate": 5.627208512690641e-07, + "loss": 1.2619, + "step": 1102 + }, + { + "epoch": 0.8966568438166853, + "grad_norm": 3.0476791858673096, + "learning_rate": 5.5404447514681e-07, + "loss": 1.1429, + "step": 1103 + }, + { + "epoch": 0.8974697693323849, + "grad_norm": 2.9802823066711426, + "learning_rate": 5.45433602524188e-07, + "loss": 1.2353, + "step": 1104 + }, + { + "epoch": 0.8982826948480845, + "grad_norm": 3.168029308319092, + "learning_rate": 5.368882931137675e-07, + "loss": 1.1771, + "step": 1105 + }, + { + "epoch": 0.8990956203637842, + "grad_norm": 2.8624963760375977, + "learning_rate": 5.284086061734672e-07, + "loss": 1.1929, + "step": 1106 + }, + { + "epoch": 0.8999085458794838, + "grad_norm": 3.3826193809509277, + "learning_rate": 5.199946005061462e-07, + "loss": 1.1379, + "step": 1107 + }, + { + "epoch": 0.9007214713951834, + "grad_norm": 3.2084782123565674, + "learning_rate": 5.116463344591893e-07, + "loss": 1.1694, + "step": 1108 + }, + { + "epoch": 0.901534396910883, + "grad_norm": 3.6624932289123535, + "learning_rate": 5.033638659241102e-07, + "loss": 1.219, + "step": 1109 + }, + { + "epoch": 0.9023473224265827, + "grad_norm": 3.2314536571502686, + "learning_rate": 4.951472523361401e-07, + "loss": 1.2457, + "step": 1110 + }, + { + "epoch": 0.9031602479422823, + "grad_norm": 3.1179494857788086, + "learning_rate": 4.869965506738416e-07, + "loss": 1.232, + "step": 1111 + }, + { + "epoch": 0.9039731734579819, + "grad_norm": 2.875725030899048, + "learning_rate": 4.789118174587071e-07, + "loss": 1.2515, + "step": 1112 + }, + { + "epoch": 0.9047860989736816, + "grad_norm": 2.5742199420928955, + "learning_rate": 4.7089310875475856e-07, + "loss": 1.2554, + "step": 1113 + }, + { + "epoch": 0.9055990244893811, + "grad_norm": 3.2250759601593018, + "learning_rate": 4.6294048016817917e-07, + "loss": 1.2281, + "step": 1114 + }, + { + "epoch": 0.9064119500050808, + "grad_norm": 2.866562843322754, + "learning_rate": 4.550539868469106e-07, + "loss": 1.2559, + "step": 1115 + }, + { + "epoch": 0.9072248755207805, + "grad_norm": 2.9703938961029053, + "learning_rate": 4.4723368348027375e-07, + "loss": 1.307, + "step": 1116 + }, + { + "epoch": 0.90803780103648, + "grad_norm": 3.0078420639038086, + "learning_rate": 4.394796242985933e-07, + "loss": 1.2285, + "step": 1117 + }, + { + "epoch": 0.9088507265521797, + "grad_norm": 3.0581750869750977, + "learning_rate": 4.317918630728235e-07, + "loss": 1.1751, + "step": 1118 + }, + { + "epoch": 0.9096636520678792, + "grad_norm": 4.224788188934326, + "learning_rate": 4.241704531141633e-07, + "loss": 1.155, + "step": 1119 + }, + { + "epoch": 0.9104765775835789, + "grad_norm": 3.2800920009613037, + "learning_rate": 4.166154472737061e-07, + "loss": 1.199, + "step": 1120 + }, + { + "epoch": 0.9112895030992785, + "grad_norm": 5.579473495483398, + "learning_rate": 4.091268979420537e-07, + "loss": 1.1558, + "step": 1121 + }, + { + "epoch": 0.9121024286149781, + "grad_norm": 3.660987615585327, + "learning_rate": 4.0170485704896453e-07, + "loss": 1.2258, + "step": 1122 + }, + { + "epoch": 0.9129153541306778, + "grad_norm": 11.064430236816406, + "learning_rate": 3.943493760629924e-07, + "loss": 1.1699, + "step": 1123 + }, + { + "epoch": 0.9137282796463774, + "grad_norm": 4.9747138023376465, + "learning_rate": 3.8706050599112363e-07, + "loss": 1.2415, + "step": 1124 + }, + { + "epoch": 0.914541205162077, + "grad_norm": 3.7896888256073, + "learning_rate": 3.798382973784298e-07, + "loss": 1.2221, + "step": 1125 + }, + { + "epoch": 0.9153541306777766, + "grad_norm": 3.383769989013672, + "learning_rate": 3.7268280030771655e-07, + "loss": 1.196, + "step": 1126 + }, + { + "epoch": 0.9161670561934763, + "grad_norm": 3.491272211074829, + "learning_rate": 3.655940643991718e-07, + "loss": 1.1786, + "step": 1127 + }, + { + "epoch": 0.9169799817091759, + "grad_norm": 3.1759097576141357, + "learning_rate": 3.585721388100283e-07, + "loss": 1.1696, + "step": 1128 + }, + { + "epoch": 0.9177929072248755, + "grad_norm": 2.7568089962005615, + "learning_rate": 3.516170722342127e-07, + "loss": 1.1703, + "step": 1129 + }, + { + "epoch": 0.9186058327405752, + "grad_norm": 2.992725372314453, + "learning_rate": 3.4472891290201927e-07, + "loss": 1.1739, + "step": 1130 + }, + { + "epoch": 0.9194187582562747, + "grad_norm": 4.317306041717529, + "learning_rate": 3.3790770857976995e-07, + "loss": 1.184, + "step": 1131 + }, + { + "epoch": 0.9202316837719744, + "grad_norm": 3.9048075675964355, + "learning_rate": 3.3115350656948043e-07, + "loss": 1.2651, + "step": 1132 + }, + { + "epoch": 0.9210446092876741, + "grad_norm": 3.3990674018859863, + "learning_rate": 3.2446635370853686e-07, + "loss": 1.205, + "step": 1133 + }, + { + "epoch": 0.9218575348033736, + "grad_norm": 4.0517754554748535, + "learning_rate": 3.1784629636937404e-07, + "loss": 1.1996, + "step": 1134 + }, + { + "epoch": 0.9226704603190733, + "grad_norm": 3.340564489364624, + "learning_rate": 3.1129338045914004e-07, + "loss": 1.2215, + "step": 1135 + }, + { + "epoch": 0.9234833858347729, + "grad_norm": 3.5760183334350586, + "learning_rate": 3.0480765141939316e-07, + "loss": 1.2191, + "step": 1136 + }, + { + "epoch": 0.9242963113504725, + "grad_norm": 2.8496994972229004, + "learning_rate": 2.9838915422578e-07, + "loss": 1.2217, + "step": 1137 + }, + { + "epoch": 0.9251092368661722, + "grad_norm": 3.025475025177002, + "learning_rate": 2.920379333877221e-07, + "loss": 1.2332, + "step": 1138 + }, + { + "epoch": 0.9259221623818717, + "grad_norm": 4.238699436187744, + "learning_rate": 2.8575403294811123e-07, + "loss": 1.2223, + "step": 1139 + }, + { + "epoch": 0.9267350878975714, + "grad_norm": 2.9650015830993652, + "learning_rate": 2.795374964830022e-07, + "loss": 1.2149, + "step": 1140 + }, + { + "epoch": 0.927548013413271, + "grad_norm": 2.731064796447754, + "learning_rate": 2.733883671013082e-07, + "loss": 1.2116, + "step": 1141 + }, + { + "epoch": 0.9283609389289706, + "grad_norm": 4.153676986694336, + "learning_rate": 2.673066874445096e-07, + "loss": 1.1189, + "step": 1142 + }, + { + "epoch": 0.9291738644446702, + "grad_norm": 3.843541383743286, + "learning_rate": 2.612924996863453e-07, + "loss": 1.1933, + "step": 1143 + }, + { + "epoch": 0.9299867899603699, + "grad_norm": 3.0720019340515137, + "learning_rate": 2.5534584553253526e-07, + "loss": 1.1859, + "step": 1144 + }, + { + "epoch": 0.9307997154760695, + "grad_norm": 3.4368112087249756, + "learning_rate": 2.494667662204797e-07, + "loss": 1.22, + "step": 1145 + }, + { + "epoch": 0.9316126409917691, + "grad_norm": 2.524754285812378, + "learning_rate": 2.436553025189758e-07, + "loss": 1.2561, + "step": 1146 + }, + { + "epoch": 0.9324255665074688, + "grad_norm": 3.2625484466552734, + "learning_rate": 2.3791149472794373e-07, + "loss": 1.2026, + "step": 1147 + }, + { + "epoch": 0.9332384920231683, + "grad_norm": 3.4842891693115234, + "learning_rate": 2.3223538267813317e-07, + "loss": 1.234, + "step": 1148 + }, + { + "epoch": 0.934051417538868, + "grad_norm": 2.9896857738494873, + "learning_rate": 2.2662700573085505e-07, + "loss": 1.2008, + "step": 1149 + }, + { + "epoch": 0.9348643430545677, + "grad_norm": 3.3465092182159424, + "learning_rate": 2.2108640277771153e-07, + "loss": 1.2392, + "step": 1150 + }, + { + "epoch": 0.9356772685702672, + "grad_norm": 2.6980130672454834, + "learning_rate": 2.156136122403174e-07, + "loss": 1.2083, + "step": 1151 + }, + { + "epoch": 0.9364901940859669, + "grad_norm": 3.4942784309387207, + "learning_rate": 2.1020867207004026e-07, + "loss": 1.2232, + "step": 1152 + }, + { + "epoch": 0.9373031196016665, + "grad_norm": 2.874210834503174, + "learning_rate": 2.048716197477374e-07, + "loss": 1.2447, + "step": 1153 + }, + { + "epoch": 0.9381160451173661, + "grad_norm": 3.429757833480835, + "learning_rate": 1.996024922834905e-07, + "loss": 1.1562, + "step": 1154 + }, + { + "epoch": 0.9389289706330658, + "grad_norm": 2.96549654006958, + "learning_rate": 1.9440132621635687e-07, + "loss": 1.2543, + "step": 1155 + }, + { + "epoch": 0.9397418961487654, + "grad_norm": 3.1660540103912354, + "learning_rate": 1.8926815761410867e-07, + "loss": 1.1931, + "step": 1156 + }, + { + "epoch": 0.940554821664465, + "grad_norm": 2.848574161529541, + "learning_rate": 1.8420302207298623e-07, + "loss": 1.1837, + "step": 1157 + }, + { + "epoch": 0.9413677471801646, + "grad_norm": 4.005343437194824, + "learning_rate": 1.792059547174507e-07, + "loss": 1.2423, + "step": 1158 + }, + { + "epoch": 0.9421806726958643, + "grad_norm": 2.7809975147247314, + "learning_rate": 1.7427699019994415e-07, + "loss": 1.1665, + "step": 1159 + }, + { + "epoch": 0.9429935982115638, + "grad_norm": 4.211681365966797, + "learning_rate": 1.6941616270063854e-07, + "loss": 1.2526, + "step": 1160 + }, + { + "epoch": 0.9438065237272635, + "grad_norm": 4.117452144622803, + "learning_rate": 1.6462350592721498e-07, + "loss": 1.1957, + "step": 1161 + }, + { + "epoch": 0.9446194492429631, + "grad_norm": 2.9959964752197266, + "learning_rate": 1.5989905311461274e-07, + "loss": 1.2342, + "step": 1162 + }, + { + "epoch": 0.9454323747586627, + "grad_norm": 3.091280460357666, + "learning_rate": 1.5524283702481158e-07, + "loss": 1.2168, + "step": 1163 + }, + { + "epoch": 0.9462453002743624, + "grad_norm": 4.000481128692627, + "learning_rate": 1.5065488994659983e-07, + "loss": 1.2206, + "step": 1164 + }, + { + "epoch": 0.9470582257900619, + "grad_norm": 3.2974343299865723, + "learning_rate": 1.461352436953478e-07, + "loss": 1.1955, + "step": 1165 + }, + { + "epoch": 0.9478711513057616, + "grad_norm": 3.589606285095215, + "learning_rate": 1.4168392961279254e-07, + "loss": 1.1277, + "step": 1166 + }, + { + "epoch": 0.9486840768214613, + "grad_norm": 3.071859121322632, + "learning_rate": 1.3730097856681668e-07, + "loss": 1.1837, + "step": 1167 + }, + { + "epoch": 0.9494970023371608, + "grad_norm": 3.4584462642669678, + "learning_rate": 1.329864209512377e-07, + "loss": 1.249, + "step": 1168 + }, + { + "epoch": 0.9503099278528605, + "grad_norm": 4.1693434715271, + "learning_rate": 1.2874028668559247e-07, + "loss": 1.2234, + "step": 1169 + }, + { + "epoch": 0.9511228533685601, + "grad_norm": 3.1776278018951416, + "learning_rate": 1.245626052149318e-07, + "loss": 1.2047, + "step": 1170 + }, + { + "epoch": 0.9519357788842597, + "grad_norm": 3.347137689590454, + "learning_rate": 1.2045340550961958e-07, + "loss": 1.2995, + "step": 1171 + }, + { + "epoch": 0.9527487043999594, + "grad_norm": 3.2806451320648193, + "learning_rate": 1.164127160651285e-07, + "loss": 1.1546, + "step": 1172 + }, + { + "epoch": 0.953561629915659, + "grad_norm": 4.498492240905762, + "learning_rate": 1.1244056490184008e-07, + "loss": 1.2469, + "step": 1173 + }, + { + "epoch": 0.9543745554313586, + "grad_norm": 3.0195493698120117, + "learning_rate": 1.0853697956485942e-07, + "loss": 1.2373, + "step": 1174 + }, + { + "epoch": 0.9551874809470582, + "grad_norm": 4.176177501678467, + "learning_rate": 1.0470198712381086e-07, + "loss": 1.2486, + "step": 1175 + }, + { + "epoch": 0.9560004064627579, + "grad_norm": 3.222987413406372, + "learning_rate": 1.009356141726614e-07, + "loss": 1.1905, + "step": 1176 + }, + { + "epoch": 0.9568133319784575, + "grad_norm": 2.6555376052856445, + "learning_rate": 9.723788682953539e-08, + "loss": 1.1666, + "step": 1177 + }, + { + "epoch": 0.9576262574941571, + "grad_norm": 4.015134334564209, + "learning_rate": 9.360883073652238e-08, + "loss": 1.2675, + "step": 1178 + }, + { + "epoch": 0.9584391830098568, + "grad_norm": 3.029994487762451, + "learning_rate": 9.004847105951509e-08, + "loss": 1.1977, + "step": 1179 + }, + { + "epoch": 0.9592521085255563, + "grad_norm": 2.7363007068634033, + "learning_rate": 8.655683248802282e-08, + "loss": 1.2359, + "step": 1180 + }, + { + "epoch": 0.960065034041256, + "grad_norm": 4.360199451446533, + "learning_rate": 8.313393923500613e-08, + "loss": 1.2099, + "step": 1181 + }, + { + "epoch": 0.9608779595569555, + "grad_norm": 2.9082043170928955, + "learning_rate": 7.977981503670795e-08, + "loss": 1.2632, + "step": 1182 + }, + { + "epoch": 0.9616908850726552, + "grad_norm": 3.0049242973327637, + "learning_rate": 7.64944831524872e-08, + "loss": 1.2128, + "step": 1183 + }, + { + "epoch": 0.9625038105883549, + "grad_norm": 2.9180142879486084, + "learning_rate": 7.327796636465767e-08, + "loss": 1.2075, + "step": 1184 + }, + { + "epoch": 0.9633167361040544, + "grad_norm": 2.8545587062835693, + "learning_rate": 7.01302869783338e-08, + "loss": 1.1809, + "step": 1185 + }, + { + "epoch": 0.9641296616197541, + "grad_norm": 3.2359890937805176, + "learning_rate": 6.705146682127184e-08, + "loss": 1.2404, + "step": 1186 + }, + { + "epoch": 0.9649425871354537, + "grad_norm": 7.442730903625488, + "learning_rate": 6.404152724371892e-08, + "loss": 1.2081, + "step": 1187 + }, + { + "epoch": 0.9657555126511533, + "grad_norm": 2.9155330657958984, + "learning_rate": 6.110048911826871e-08, + "loss": 1.1837, + "step": 1188 + }, + { + "epoch": 0.966568438166853, + "grad_norm": 5.689270496368408, + "learning_rate": 5.82283728397115e-08, + "loss": 1.2039, + "step": 1189 + }, + { + "epoch": 0.9673813636825526, + "grad_norm": 2.791161060333252, + "learning_rate": 5.542519832489546e-08, + "loss": 1.2032, + "step": 1190 + }, + { + "epoch": 0.9681942891982522, + "grad_norm": 3.127793312072754, + "learning_rate": 5.269098501259007e-08, + "loss": 1.2016, + "step": 1191 + }, + { + "epoch": 0.9690072147139518, + "grad_norm": 2.8209614753723145, + "learning_rate": 5.002575186334735e-08, + "loss": 1.1624, + "step": 1192 + }, + { + "epoch": 0.9698201402296515, + "grad_norm": 3.3611080646514893, + "learning_rate": 4.742951735937418e-08, + "loss": 1.2068, + "step": 1193 + }, + { + "epoch": 0.9706330657453511, + "grad_norm": 5.118293285369873, + "learning_rate": 4.490229950440239e-08, + "loss": 1.2398, + "step": 1194 + }, + { + "epoch": 0.9714459912610507, + "grad_norm": 9.395883560180664, + "learning_rate": 4.2444115823562226e-08, + "loss": 1.3143, + "step": 1195 + }, + { + "epoch": 0.9722589167767504, + "grad_norm": 3.1017065048217773, + "learning_rate": 4.005498336326463e-08, + "loss": 1.1918, + "step": 1196 + }, + { + "epoch": 0.9730718422924499, + "grad_norm": 3.226966142654419, + "learning_rate": 3.773491869108137e-08, + "loss": 1.2046, + "step": 1197 + }, + { + "epoch": 0.9738847678081496, + "grad_norm": 3.233693838119507, + "learning_rate": 3.548393789562732e-08, + "loss": 1.2325, + "step": 1198 + }, + { + "epoch": 0.9746976933238493, + "grad_norm": 3.159299612045288, + "learning_rate": 3.3302056586453916e-08, + "loss": 1.1693, + "step": 1199 + }, + { + "epoch": 0.9755106188395488, + "grad_norm": 2.7059924602508545, + "learning_rate": 3.118928989393699e-08, + "loss": 1.2422, + "step": 1200 + }, + { + "epoch": 0.9763235443552485, + "grad_norm": 3.511061668395996, + "learning_rate": 2.9145652469174666e-08, + "loss": 1.2184, + "step": 1201 + }, + { + "epoch": 0.977136469870948, + "grad_norm": 4.077070236206055, + "learning_rate": 2.7171158483882963e-08, + "loss": 1.2309, + "step": 1202 + }, + { + "epoch": 0.9779493953866477, + "grad_norm": 3.434537887573242, + "learning_rate": 2.5265821630298116e-08, + "loss": 1.1943, + "step": 1203 + }, + { + "epoch": 0.9787623209023473, + "grad_norm": 3.698641300201416, + "learning_rate": 2.3429655121085525e-08, + "loss": 1.2671, + "step": 1204 + }, + { + "epoch": 0.9795752464180469, + "grad_norm": 6.674719333648682, + "learning_rate": 2.1662671689242076e-08, + "loss": 1.1961, + "step": 1205 + }, + { + "epoch": 0.9803881719337466, + "grad_norm": 4.9146952629089355, + "learning_rate": 1.996488358801174e-08, + "loss": 1.2345, + "step": 1206 + }, + { + "epoch": 0.9812010974494462, + "grad_norm": 2.7147114276885986, + "learning_rate": 1.8336302590798992e-08, + "loss": 1.2118, + "step": 1207 + }, + { + "epoch": 0.9820140229651458, + "grad_norm": 2.809692859649658, + "learning_rate": 1.677693999109109e-08, + "loss": 1.2162, + "step": 1208 + }, + { + "epoch": 0.9828269484808454, + "grad_norm": 3.857846975326538, + "learning_rate": 1.5286806602372583e-08, + "loss": 1.1792, + "step": 1209 + }, + { + "epoch": 0.9836398739965451, + "grad_norm": 3.8911325931549072, + "learning_rate": 1.3865912758054267e-08, + "loss": 1.2332, + "step": 1210 + }, + { + "epoch": 0.9844527995122447, + "grad_norm": 3.5572190284729004, + "learning_rate": 1.2514268311405452e-08, + "loss": 1.2174, + "step": 1211 + }, + { + "epoch": 0.9852657250279443, + "grad_norm": 3.22208833694458, + "learning_rate": 1.1231882635477364e-08, + "loss": 1.2146, + "step": 1212 + }, + { + "epoch": 0.986078650543644, + "grad_norm": 4.469923973083496, + "learning_rate": 1.0018764623045407e-08, + "loss": 1.2168, + "step": 1213 + }, + { + "epoch": 0.9868915760593435, + "grad_norm": 3.1559510231018066, + "learning_rate": 8.874922686541442e-09, + "loss": 1.2074, + "step": 1214 + }, + { + "epoch": 0.9877045015750432, + "grad_norm": 2.6890878677368164, + "learning_rate": 7.800364758002721e-09, + "loss": 1.2358, + "step": 1215 + }, + { + "epoch": 0.9885174270907429, + "grad_norm": 3.4091622829437256, + "learning_rate": 6.795098289008595e-09, + "loss": 1.2484, + "step": 1216 + }, + { + "epoch": 0.9893303526064424, + "grad_norm": 3.0762569904327393, + "learning_rate": 5.859130250636113e-09, + "loss": 1.1787, + "step": 1217 + }, + { + "epoch": 0.9901432781221421, + "grad_norm": 2.616163492202759, + "learning_rate": 4.992467133406731e-09, + "loss": 1.2092, + "step": 1218 + }, + { + "epoch": 0.9909562036378416, + "grad_norm": 3.0248591899871826, + "learning_rate": 4.195114947244117e-09, + "loss": 1.1998, + "step": 1219 + }, + { + "epoch": 0.9917691291535413, + "grad_norm": 5.664068698883057, + "learning_rate": 3.4670792214297476e-09, + "loss": 1.2539, + "step": 1220 + }, + { + "epoch": 0.9925820546692409, + "grad_norm": 3.449087619781494, + "learning_rate": 2.808365004569602e-09, + "loss": 1.2463, + "step": 1221 + }, + { + "epoch": 0.9933949801849405, + "grad_norm": 2.958399534225464, + "learning_rate": 2.2189768645519693e-09, + "loss": 1.2076, + "step": 1222 + }, + { + "epoch": 0.9942079057006402, + "grad_norm": 3.4361188411712646, + "learning_rate": 1.6989188885219165e-09, + "loss": 1.2436, + "step": 1223 + }, + { + "epoch": 0.9950208312163398, + "grad_norm": 3.0529403686523438, + "learning_rate": 1.2481946828502011e-09, + "loss": 1.1955, + "step": 1224 + }, + { + "epoch": 0.9958337567320394, + "grad_norm": 3.090090274810791, + "learning_rate": 8.668073731088467e-10, + "loss": 1.1455, + "step": 1225 + }, + { + "epoch": 0.996646682247739, + "grad_norm": 3.2662580013275146, + "learning_rate": 5.547596040489378e-10, + "loss": 1.2283, + "step": 1226 + }, + { + "epoch": 0.9974596077634387, + "grad_norm": 2.7874884605407715, + "learning_rate": 3.1205353958285724e-10, + "loss": 1.2011, + "step": 1227 + }, + { + "epoch": 0.9982725332791383, + "grad_norm": 2.9483141899108887, + "learning_rate": 1.3869086276985243e-10, + "loss": 1.272, + "step": 1228 + }, + { + "epoch": 0.9990854587948379, + "grad_norm": 3.550588607788086, + "learning_rate": 3.467277580271322e-11, + "loss": 1.1665, + "step": 1229 + }, + { + "epoch": 0.9998983843105376, + "grad_norm": 3.500861406326294, + "learning_rate": 0.0, + "loss": 1.2382, + "step": 1230 + }, + { + "epoch": 0.9998983843105376, + "step": 1230, + "total_flos": 3.1215366383127757e+18, + "train_loss": 1.3087712280149382, + "train_runtime": 25084.8125, + "train_samples_per_second": 6.277, + "train_steps_per_second": 0.049 + } + ], + "logging_steps": 1.0, + "max_steps": 1230, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 7975, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.1215366383127757e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}