diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78503 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9997324056729997, + "eval_steps": 500, + "global_step": 11210, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001783962180001784, + "grad_norm": 7.837475776672363, + "learning_rate": 0.0004999999901825538, + "loss": 6.5872, + "step": 1 + }, + { + "epoch": 0.0003567924360003568, + "grad_norm": 32.637916564941406, + "learning_rate": 0.000499999960730216, + "loss": 7.833, + "step": 2 + }, + { + "epoch": 0.0005351886540005352, + "grad_norm": 19.442291259765625, + "learning_rate": 0.0004999999116429887, + "loss": 6.3418, + "step": 3 + }, + { + "epoch": 0.0007135848720007136, + "grad_norm": 7.7614946365356445, + "learning_rate": 0.0004999998429208761, + "loss": 5.3582, + "step": 4 + }, + { + "epoch": 0.000891981090000892, + "grad_norm": 5.833192348480225, + "learning_rate": 0.0004999997545638834, + "loss": 4.4165, + "step": 5 + }, + { + "epoch": 0.0010703773080010704, + "grad_norm": 2.9772305488586426, + "learning_rate": 0.0004999996465720175, + "loss": 4.1561, + "step": 6 + }, + { + "epoch": 0.0012487735260012487, + "grad_norm": 3.942150354385376, + "learning_rate": 0.0004999995189452869, + "loss": 4.0736, + "step": 7 + }, + { + "epoch": 0.0014271697440014271, + "grad_norm": 2.2194981575012207, + "learning_rate": 0.0004999993716837017, + "loss": 3.9339, + "step": 8 + }, + { + "epoch": 0.0016055659620016055, + "grad_norm": 2.6826322078704834, + "learning_rate": 0.0004999992047872735, + "loss": 4.0066, + "step": 9 + }, + { + "epoch": 0.001783962180001784, + "grad_norm": 2.3299083709716797, + "learning_rate": 0.0004999990182560153, + "loss": 3.9545, + "step": 10 + }, + { + "epoch": 0.0019623583980019625, + "grad_norm": 2.3098955154418945, + "learning_rate": 0.0004999988120899418, + "loss": 3.8027, + "step": 11 + }, + { + "epoch": 0.0021407546160021407, + "grad_norm": 1.8366882801055908, + "learning_rate": 0.0004999985862890691, + "loss": 3.8252, + "step": 12 + }, + { + "epoch": 0.0023191508340023193, + "grad_norm": 1.689456820487976, + "learning_rate": 0.0004999983408534151, + "loss": 3.6765, + "step": 13 + }, + { + "epoch": 0.0024975470520024975, + "grad_norm": 2.1124043464660645, + "learning_rate": 0.000499998075782999, + "loss": 3.7322, + "step": 14 + }, + { + "epoch": 0.002675943270002676, + "grad_norm": 1.5068916082382202, + "learning_rate": 0.0004999977910778417, + "loss": 3.7449, + "step": 15 + }, + { + "epoch": 0.0028543394880028543, + "grad_norm": 1.2359281778335571, + "learning_rate": 0.0004999974867379652, + "loss": 3.6367, + "step": 16 + }, + { + "epoch": 0.003032735706003033, + "grad_norm": 1.5040236711502075, + "learning_rate": 0.0004999971627633939, + "loss": 3.5622, + "step": 17 + }, + { + "epoch": 0.003211131924003211, + "grad_norm": 1.7240302562713623, + "learning_rate": 0.000499996819154153, + "loss": 3.4124, + "step": 18 + }, + { + "epoch": 0.0033895281420033897, + "grad_norm": 1.6077510118484497, + "learning_rate": 0.0004999964559102693, + "loss": 3.6737, + "step": 19 + }, + { + "epoch": 0.003567924360003568, + "grad_norm": 1.3428990840911865, + "learning_rate": 0.0004999960730317718, + "loss": 3.4728, + "step": 20 + }, + { + "epoch": 0.0037463205780037465, + "grad_norm": 1.5660542249679565, + "learning_rate": 0.0004999956705186902, + "loss": 3.2715, + "step": 21 + }, + { + "epoch": 0.003924716796003925, + "grad_norm": 1.5012935400009155, + "learning_rate": 0.0004999952483710562, + "loss": 3.6257, + "step": 22 + }, + { + "epoch": 0.004103113014004103, + "grad_norm": 1.3174484968185425, + "learning_rate": 0.0004999948065889029, + "loss": 3.5645, + "step": 23 + }, + { + "epoch": 0.004281509232004281, + "grad_norm": 1.1299705505371094, + "learning_rate": 0.0004999943451722653, + "loss": 3.2416, + "step": 24 + }, + { + "epoch": 0.00445990545000446, + "grad_norm": 0.8905799984931946, + "learning_rate": 0.0004999938641211792, + "loss": 3.2397, + "step": 25 + }, + { + "epoch": 0.004638301668004639, + "grad_norm": 1.1655199527740479, + "learning_rate": 0.0004999933634356826, + "loss": 3.1717, + "step": 26 + }, + { + "epoch": 0.004816697886004816, + "grad_norm": 1.191267967224121, + "learning_rate": 0.0004999928431158149, + "loss": 3.3958, + "step": 27 + }, + { + "epoch": 0.004995094104004995, + "grad_norm": 1.0441665649414062, + "learning_rate": 0.0004999923031616169, + "loss": 3.3732, + "step": 28 + }, + { + "epoch": 0.005173490322005174, + "grad_norm": 1.2923051118850708, + "learning_rate": 0.0004999917435731309, + "loss": 3.3872, + "step": 29 + }, + { + "epoch": 0.005351886540005352, + "grad_norm": 1.1339762210845947, + "learning_rate": 0.000499991164350401, + "loss": 2.9439, + "step": 30 + }, + { + "epoch": 0.00553028275800553, + "grad_norm": 0.8818377256393433, + "learning_rate": 0.0004999905654934726, + "loss": 3.1922, + "step": 31 + }, + { + "epoch": 0.0057086789760057086, + "grad_norm": 0.7961766123771667, + "learning_rate": 0.0004999899470023929, + "loss": 3.2558, + "step": 32 + }, + { + "epoch": 0.005887075194005887, + "grad_norm": 1.180748701095581, + "learning_rate": 0.0004999893088772102, + "loss": 3.2286, + "step": 33 + }, + { + "epoch": 0.006065471412006066, + "grad_norm": 1.035274624824524, + "learning_rate": 0.0004999886511179748, + "loss": 3.4693, + "step": 34 + }, + { + "epoch": 0.0062438676300062435, + "grad_norm": 1.15408194065094, + "learning_rate": 0.0004999879737247382, + "loss": 3.2316, + "step": 35 + }, + { + "epoch": 0.006422263848006422, + "grad_norm": 0.8637914657592773, + "learning_rate": 0.0004999872766975539, + "loss": 2.9908, + "step": 36 + }, + { + "epoch": 0.006600660066006601, + "grad_norm": 1.0279821157455444, + "learning_rate": 0.0004999865600364764, + "loss": 3.1329, + "step": 37 + }, + { + "epoch": 0.006779056284006779, + "grad_norm": 0.9054232239723206, + "learning_rate": 0.000499985823741562, + "loss": 3.2613, + "step": 38 + }, + { + "epoch": 0.006957452502006957, + "grad_norm": 0.883553683757782, + "learning_rate": 0.0004999850678128687, + "loss": 3.0738, + "step": 39 + }, + { + "epoch": 0.007135848720007136, + "grad_norm": 0.9399454593658447, + "learning_rate": 0.0004999842922504556, + "loss": 3.2061, + "step": 40 + }, + { + "epoch": 0.007314244938007314, + "grad_norm": 1.0862022638320923, + "learning_rate": 0.0004999834970543839, + "loss": 3.0835, + "step": 41 + }, + { + "epoch": 0.007492641156007493, + "grad_norm": 0.9416713118553162, + "learning_rate": 0.0004999826822247159, + "loss": 3.0841, + "step": 42 + }, + { + "epoch": 0.007671037374007671, + "grad_norm": 1.0028291940689087, + "learning_rate": 0.0004999818477615155, + "loss": 3.0349, + "step": 43 + }, + { + "epoch": 0.00784943359200785, + "grad_norm": 0.8068011403083801, + "learning_rate": 0.0004999809936648484, + "loss": 3.0634, + "step": 44 + }, + { + "epoch": 0.008027829810008028, + "grad_norm": 2.0067076683044434, + "learning_rate": 0.0004999801199347817, + "loss": 3.2768, + "step": 45 + }, + { + "epoch": 0.008206226028008206, + "grad_norm": 0.8227775692939758, + "learning_rate": 0.000499979226571384, + "loss": 3.2637, + "step": 46 + }, + { + "epoch": 0.008384622246008385, + "grad_norm": 0.9798381924629211, + "learning_rate": 0.0004999783135747252, + "loss": 2.9619, + "step": 47 + }, + { + "epoch": 0.008563018464008563, + "grad_norm": 0.9678353071212769, + "learning_rate": 0.0004999773809448774, + "loss": 2.7488, + "step": 48 + }, + { + "epoch": 0.00874141468200874, + "grad_norm": 1.023210048675537, + "learning_rate": 0.0004999764286819137, + "loss": 3.1175, + "step": 49 + }, + { + "epoch": 0.00891981090000892, + "grad_norm": 0.892423689365387, + "learning_rate": 0.0004999754567859087, + "loss": 2.7036, + "step": 50 + }, + { + "epoch": 0.009098207118009098, + "grad_norm": 1.0287269353866577, + "learning_rate": 0.000499974465256939, + "loss": 2.892, + "step": 51 + }, + { + "epoch": 0.009276603336009277, + "grad_norm": 1.1491647958755493, + "learning_rate": 0.0004999734540950824, + "loss": 2.9976, + "step": 52 + }, + { + "epoch": 0.009454999554009455, + "grad_norm": 1.1946396827697754, + "learning_rate": 0.0004999724233004183, + "loss": 2.8391, + "step": 53 + }, + { + "epoch": 0.009633395772009633, + "grad_norm": 1.0518914461135864, + "learning_rate": 0.0004999713728730276, + "loss": 2.7981, + "step": 54 + }, + { + "epoch": 0.009811791990009812, + "grad_norm": 0.9127726554870605, + "learning_rate": 0.0004999703028129929, + "loss": 3.0045, + "step": 55 + }, + { + "epoch": 0.00999018820800999, + "grad_norm": 1.4292141199111938, + "learning_rate": 0.000499969213120398, + "loss": 2.8518, + "step": 56 + }, + { + "epoch": 0.010168584426010168, + "grad_norm": 1.256000280380249, + "learning_rate": 0.0004999681037953288, + "loss": 2.8525, + "step": 57 + }, + { + "epoch": 0.010346980644010347, + "grad_norm": 1.166292428970337, + "learning_rate": 0.0004999669748378723, + "loss": 2.9721, + "step": 58 + }, + { + "epoch": 0.010525376862010525, + "grad_norm": 1.197026014328003, + "learning_rate": 0.0004999658262481172, + "loss": 2.6609, + "step": 59 + }, + { + "epoch": 0.010703773080010704, + "grad_norm": 1.027642846107483, + "learning_rate": 0.0004999646580261537, + "loss": 2.7145, + "step": 60 + }, + { + "epoch": 0.010882169298010882, + "grad_norm": 1.0411521196365356, + "learning_rate": 0.0004999634701720734, + "loss": 2.895, + "step": 61 + }, + { + "epoch": 0.01106056551601106, + "grad_norm": 1.0868782997131348, + "learning_rate": 0.0004999622626859699, + "loss": 2.8614, + "step": 62 + }, + { + "epoch": 0.01123896173401124, + "grad_norm": 1.0062867403030396, + "learning_rate": 0.0004999610355679377, + "loss": 2.4979, + "step": 63 + }, + { + "epoch": 0.011417357952011417, + "grad_norm": 1.0268216133117676, + "learning_rate": 0.0004999597888180734, + "loss": 2.9, + "step": 64 + }, + { + "epoch": 0.011595754170011597, + "grad_norm": 0.9433121085166931, + "learning_rate": 0.0004999585224364748, + "loss": 3.0019, + "step": 65 + }, + { + "epoch": 0.011774150388011774, + "grad_norm": 0.9315336346626282, + "learning_rate": 0.0004999572364232414, + "loss": 2.7774, + "step": 66 + }, + { + "epoch": 0.011952546606011952, + "grad_norm": 0.8882467746734619, + "learning_rate": 0.0004999559307784743, + "loss": 2.7762, + "step": 67 + }, + { + "epoch": 0.012130942824012132, + "grad_norm": 0.9650992751121521, + "learning_rate": 0.000499954605502276, + "loss": 2.5736, + "step": 68 + }, + { + "epoch": 0.01230933904201231, + "grad_norm": 1.0080440044403076, + "learning_rate": 0.0004999532605947505, + "loss": 2.8355, + "step": 69 + }, + { + "epoch": 0.012487735260012487, + "grad_norm": 1.0923867225646973, + "learning_rate": 0.0004999518960560034, + "loss": 2.5415, + "step": 70 + }, + { + "epoch": 0.012666131478012667, + "grad_norm": 0.8062555193901062, + "learning_rate": 0.000499950511886142, + "loss": 2.1503, + "step": 71 + }, + { + "epoch": 0.012844527696012844, + "grad_norm": 1.1151552200317383, + "learning_rate": 0.000499949108085275, + "loss": 2.9247, + "step": 72 + }, + { + "epoch": 0.013022923914013024, + "grad_norm": 0.7492188811302185, + "learning_rate": 0.0004999476846535125, + "loss": 2.577, + "step": 73 + }, + { + "epoch": 0.013201320132013201, + "grad_norm": 1.0736455917358398, + "learning_rate": 0.0004999462415909664, + "loss": 2.7738, + "step": 74 + }, + { + "epoch": 0.01337971635001338, + "grad_norm": 0.9508681297302246, + "learning_rate": 0.0004999447788977502, + "loss": 2.4073, + "step": 75 + }, + { + "epoch": 0.013558112568013559, + "grad_norm": 0.8389919996261597, + "learning_rate": 0.0004999432965739786, + "loss": 2.7485, + "step": 76 + }, + { + "epoch": 0.013736508786013736, + "grad_norm": 1.055034875869751, + "learning_rate": 0.0004999417946197679, + "loss": 2.5765, + "step": 77 + }, + { + "epoch": 0.013914905004013914, + "grad_norm": 1.1897584199905396, + "learning_rate": 0.0004999402730352363, + "loss": 2.7678, + "step": 78 + }, + { + "epoch": 0.014093301222014094, + "grad_norm": 0.9991359710693359, + "learning_rate": 0.0004999387318205032, + "loss": 2.7551, + "step": 79 + }, + { + "epoch": 0.014271697440014271, + "grad_norm": 0.9848081469535828, + "learning_rate": 0.0004999371709756897, + "loss": 2.5171, + "step": 80 + }, + { + "epoch": 0.014450093658014451, + "grad_norm": 1.1168460845947266, + "learning_rate": 0.0004999355905009183, + "loss": 2.3899, + "step": 81 + }, + { + "epoch": 0.014628489876014629, + "grad_norm": 0.9936979413032532, + "learning_rate": 0.0004999339903963133, + "loss": 2.6265, + "step": 82 + }, + { + "epoch": 0.014806886094014806, + "grad_norm": 1.0733612775802612, + "learning_rate": 0.0004999323706620001, + "loss": 2.5981, + "step": 83 + }, + { + "epoch": 0.014985282312014986, + "grad_norm": 1.0748244524002075, + "learning_rate": 0.000499930731298106, + "loss": 2.5156, + "step": 84 + }, + { + "epoch": 0.015163678530015164, + "grad_norm": 0.8287347555160522, + "learning_rate": 0.00049992907230476, + "loss": 2.0732, + "step": 85 + }, + { + "epoch": 0.015342074748015341, + "grad_norm": 0.8893202543258667, + "learning_rate": 0.0004999273936820922, + "loss": 2.4538, + "step": 86 + }, + { + "epoch": 0.01552047096601552, + "grad_norm": 0.8320645689964294, + "learning_rate": 0.0004999256954302344, + "loss": 2.5106, + "step": 87 + }, + { + "epoch": 0.0156988671840157, + "grad_norm": 0.9351370930671692, + "learning_rate": 0.0004999239775493199, + "loss": 2.529, + "step": 88 + }, + { + "epoch": 0.015877263402015878, + "grad_norm": 0.8308570981025696, + "learning_rate": 0.0004999222400394839, + "loss": 2.234, + "step": 89 + }, + { + "epoch": 0.016055659620016056, + "grad_norm": 1.0600675344467163, + "learning_rate": 0.0004999204829008628, + "loss": 2.5008, + "step": 90 + }, + { + "epoch": 0.016234055838016234, + "grad_norm": 0.9695082902908325, + "learning_rate": 0.0004999187061335943, + "loss": 2.3691, + "step": 91 + }, + { + "epoch": 0.01641245205601641, + "grad_norm": 0.9149223566055298, + "learning_rate": 0.0004999169097378184, + "loss": 2.5968, + "step": 92 + }, + { + "epoch": 0.016590848274016592, + "grad_norm": 0.9797092080116272, + "learning_rate": 0.0004999150937136758, + "loss": 2.535, + "step": 93 + }, + { + "epoch": 0.01676924449201677, + "grad_norm": 0.7779546976089478, + "learning_rate": 0.0004999132580613094, + "loss": 2.0829, + "step": 94 + }, + { + "epoch": 0.016947640710016948, + "grad_norm": 0.8772634863853455, + "learning_rate": 0.0004999114027808632, + "loss": 2.2751, + "step": 95 + }, + { + "epoch": 0.017126036928017126, + "grad_norm": 0.828109860420227, + "learning_rate": 0.0004999095278724829, + "loss": 2.5915, + "step": 96 + }, + { + "epoch": 0.017304433146017303, + "grad_norm": 1.0454559326171875, + "learning_rate": 0.0004999076333363159, + "loss": 2.3756, + "step": 97 + }, + { + "epoch": 0.01748282936401748, + "grad_norm": 1.0165445804595947, + "learning_rate": 0.000499905719172511, + "loss": 2.3156, + "step": 98 + }, + { + "epoch": 0.017661225582017662, + "grad_norm": 0.9280771017074585, + "learning_rate": 0.0004999037853812183, + "loss": 2.4135, + "step": 99 + }, + { + "epoch": 0.01783962180001784, + "grad_norm": 0.9677728414535522, + "learning_rate": 0.0004999018319625898, + "loss": 2.2452, + "step": 100 + }, + { + "epoch": 0.018018018018018018, + "grad_norm": 0.9852823615074158, + "learning_rate": 0.0004998998589167791, + "loss": 2.1909, + "step": 101 + }, + { + "epoch": 0.018196414236018196, + "grad_norm": 1.07912278175354, + "learning_rate": 0.0004998978662439411, + "loss": 2.1278, + "step": 102 + }, + { + "epoch": 0.018374810454018373, + "grad_norm": 0.9431922435760498, + "learning_rate": 0.000499895853944232, + "loss": 2.2322, + "step": 103 + }, + { + "epoch": 0.018553206672018555, + "grad_norm": 0.8767147064208984, + "learning_rate": 0.0004998938220178102, + "loss": 2.1025, + "step": 104 + }, + { + "epoch": 0.018731602890018732, + "grad_norm": 1.079720377922058, + "learning_rate": 0.0004998917704648352, + "loss": 2.5964, + "step": 105 + }, + { + "epoch": 0.01890999910801891, + "grad_norm": 1.181444764137268, + "learning_rate": 0.000499889699285468, + "loss": 2.1149, + "step": 106 + }, + { + "epoch": 0.019088395326019088, + "grad_norm": 1.055641770362854, + "learning_rate": 0.0004998876084798714, + "loss": 2.3814, + "step": 107 + }, + { + "epoch": 0.019266791544019266, + "grad_norm": 0.9991637468338013, + "learning_rate": 0.0004998854980482095, + "loss": 2.4501, + "step": 108 + }, + { + "epoch": 0.019445187762019447, + "grad_norm": 0.878653883934021, + "learning_rate": 0.0004998833679906482, + "loss": 2.2617, + "step": 109 + }, + { + "epoch": 0.019623583980019624, + "grad_norm": 0.9334123134613037, + "learning_rate": 0.0004998812183073547, + "loss": 2.325, + "step": 110 + }, + { + "epoch": 0.019801980198019802, + "grad_norm": 0.7886718511581421, + "learning_rate": 0.0004998790489984978, + "loss": 2.1459, + "step": 111 + }, + { + "epoch": 0.01998037641601998, + "grad_norm": 0.8349006175994873, + "learning_rate": 0.0004998768600642479, + "loss": 2.3084, + "step": 112 + }, + { + "epoch": 0.020158772634020158, + "grad_norm": 0.9214372038841248, + "learning_rate": 0.000499874651504777, + "loss": 2.3816, + "step": 113 + }, + { + "epoch": 0.020337168852020335, + "grad_norm": 0.8497713804244995, + "learning_rate": 0.0004998724233202585, + "loss": 2.2302, + "step": 114 + }, + { + "epoch": 0.020515565070020517, + "grad_norm": 0.7643554210662842, + "learning_rate": 0.0004998701755108674, + "loss": 2.3578, + "step": 115 + }, + { + "epoch": 0.020693961288020694, + "grad_norm": 1.057411551475525, + "learning_rate": 0.0004998679080767802, + "loss": 2.1724, + "step": 116 + }, + { + "epoch": 0.020872357506020872, + "grad_norm": 1.0113954544067383, + "learning_rate": 0.000499865621018175, + "loss": 2.375, + "step": 117 + }, + { + "epoch": 0.02105075372402105, + "grad_norm": 0.8105971813201904, + "learning_rate": 0.0004998633143352315, + "loss": 2.0368, + "step": 118 + }, + { + "epoch": 0.021229149942021228, + "grad_norm": 1.2691274881362915, + "learning_rate": 0.0004998609880281309, + "loss": 2.2663, + "step": 119 + }, + { + "epoch": 0.02140754616002141, + "grad_norm": 0.8678051829338074, + "learning_rate": 0.0004998586420970557, + "loss": 2.0612, + "step": 120 + }, + { + "epoch": 0.021585942378021587, + "grad_norm": 0.7809403538703918, + "learning_rate": 0.0004998562765421903, + "loss": 2.0222, + "step": 121 + }, + { + "epoch": 0.021764338596021764, + "grad_norm": 0.8958665728569031, + "learning_rate": 0.0004998538913637205, + "loss": 2.2086, + "step": 122 + }, + { + "epoch": 0.021942734814021942, + "grad_norm": 0.8660983443260193, + "learning_rate": 0.0004998514865618335, + "loss": 2.0885, + "step": 123 + }, + { + "epoch": 0.02212113103202212, + "grad_norm": 0.8884083032608032, + "learning_rate": 0.0004998490621367184, + "loss": 2.0391, + "step": 124 + }, + { + "epoch": 0.0222995272500223, + "grad_norm": 0.9233642220497131, + "learning_rate": 0.0004998466180885653, + "loss": 2.3625, + "step": 125 + }, + { + "epoch": 0.02247792346802248, + "grad_norm": 0.9836124181747437, + "learning_rate": 0.0004998441544175666, + "loss": 2.2373, + "step": 126 + }, + { + "epoch": 0.022656319686022657, + "grad_norm": 0.8701616525650024, + "learning_rate": 0.0004998416711239153, + "loss": 2.2678, + "step": 127 + }, + { + "epoch": 0.022834715904022834, + "grad_norm": 0.8650346994400024, + "learning_rate": 0.0004998391682078067, + "loss": 2.1102, + "step": 128 + }, + { + "epoch": 0.023013112122023012, + "grad_norm": 1.1175918579101562, + "learning_rate": 0.0004998366456694374, + "loss": 2.1723, + "step": 129 + }, + { + "epoch": 0.023191508340023193, + "grad_norm": 0.8660110235214233, + "learning_rate": 0.0004998341035090055, + "loss": 2.0303, + "step": 130 + }, + { + "epoch": 0.02336990455802337, + "grad_norm": 0.7795286774635315, + "learning_rate": 0.0004998315417267105, + "loss": 2.2806, + "step": 131 + }, + { + "epoch": 0.02354830077602355, + "grad_norm": 0.8152791857719421, + "learning_rate": 0.0004998289603227538, + "loss": 2.0361, + "step": 132 + }, + { + "epoch": 0.023726696994023726, + "grad_norm": 0.8521414995193481, + "learning_rate": 0.0004998263592973381, + "loss": 2.3676, + "step": 133 + }, + { + "epoch": 0.023905093212023904, + "grad_norm": 0.8271788954734802, + "learning_rate": 0.0004998237386506676, + "loss": 2.2346, + "step": 134 + }, + { + "epoch": 0.024083489430024082, + "grad_norm": 0.8122360706329346, + "learning_rate": 0.0004998210983829482, + "loss": 2.2109, + "step": 135 + }, + { + "epoch": 0.024261885648024263, + "grad_norm": 0.8585101962089539, + "learning_rate": 0.0004998184384943871, + "loss": 2.2372, + "step": 136 + }, + { + "epoch": 0.02444028186602444, + "grad_norm": 1.338280200958252, + "learning_rate": 0.0004998157589851935, + "loss": 2.3498, + "step": 137 + }, + { + "epoch": 0.02461867808402462, + "grad_norm": 0.8073638677597046, + "learning_rate": 0.0004998130598555776, + "loss": 1.9356, + "step": 138 + }, + { + "epoch": 0.024797074302024796, + "grad_norm": 1.4015569686889648, + "learning_rate": 0.0004998103411057517, + "loss": 2.12, + "step": 139 + }, + { + "epoch": 0.024975470520024974, + "grad_norm": 3.684471845626831, + "learning_rate": 0.0004998076027359289, + "loss": 2.1707, + "step": 140 + }, + { + "epoch": 0.025153866738025155, + "grad_norm": 1.0060230493545532, + "learning_rate": 0.0004998048447463245, + "loss": 2.1753, + "step": 141 + }, + { + "epoch": 0.025332262956025333, + "grad_norm": 0.8520100116729736, + "learning_rate": 0.0004998020671371551, + "loss": 2.2941, + "step": 142 + }, + { + "epoch": 0.02551065917402551, + "grad_norm": 0.8406434059143066, + "learning_rate": 0.0004997992699086389, + "loss": 1.9893, + "step": 143 + }, + { + "epoch": 0.02568905539202569, + "grad_norm": 1.1668790578842163, + "learning_rate": 0.0004997964530609956, + "loss": 2.0301, + "step": 144 + }, + { + "epoch": 0.025867451610025866, + "grad_norm": 0.8116411566734314, + "learning_rate": 0.0004997936165944462, + "loss": 1.976, + "step": 145 + }, + { + "epoch": 0.026045847828026047, + "grad_norm": 1.0072658061981201, + "learning_rate": 0.0004997907605092138, + "loss": 2.0916, + "step": 146 + }, + { + "epoch": 0.026224244046026225, + "grad_norm": 1.1760748624801636, + "learning_rate": 0.0004997878848055225, + "loss": 1.9554, + "step": 147 + }, + { + "epoch": 0.026402640264026403, + "grad_norm": 0.9567267894744873, + "learning_rate": 0.0004997849894835982, + "loss": 2.1948, + "step": 148 + }, + { + "epoch": 0.02658103648202658, + "grad_norm": 1.109018325805664, + "learning_rate": 0.0004997820745436683, + "loss": 1.9971, + "step": 149 + }, + { + "epoch": 0.02675943270002676, + "grad_norm": 0.9176613092422485, + "learning_rate": 0.0004997791399859618, + "loss": 2.4487, + "step": 150 + }, + { + "epoch": 0.026937828918026936, + "grad_norm": 0.8620409369468689, + "learning_rate": 0.0004997761858107091, + "loss": 2.1291, + "step": 151 + }, + { + "epoch": 0.027116225136027117, + "grad_norm": 0.7816916108131409, + "learning_rate": 0.0004997732120181423, + "loss": 2.1157, + "step": 152 + }, + { + "epoch": 0.027294621354027295, + "grad_norm": 0.8684608340263367, + "learning_rate": 0.0004997702186084949, + "loss": 2.0322, + "step": 153 + }, + { + "epoch": 0.027473017572027473, + "grad_norm": 0.9221858382225037, + "learning_rate": 0.000499767205582002, + "loss": 1.9962, + "step": 154 + }, + { + "epoch": 0.02765141379002765, + "grad_norm": 0.9276618361473083, + "learning_rate": 0.0004997641729389002, + "loss": 2.2969, + "step": 155 + }, + { + "epoch": 0.02782981000802783, + "grad_norm": 0.9027115106582642, + "learning_rate": 0.0004997611206794278, + "loss": 2.1261, + "step": 156 + }, + { + "epoch": 0.02800820622602801, + "grad_norm": 0.7453658580780029, + "learning_rate": 0.0004997580488038245, + "loss": 1.9345, + "step": 157 + }, + { + "epoch": 0.028186602444028187, + "grad_norm": 0.8788852095603943, + "learning_rate": 0.0004997549573123314, + "loss": 2.096, + "step": 158 + }, + { + "epoch": 0.028364998662028365, + "grad_norm": 0.8055654764175415, + "learning_rate": 0.0004997518462051916, + "loss": 1.8397, + "step": 159 + }, + { + "epoch": 0.028543394880028543, + "grad_norm": 0.9946992993354797, + "learning_rate": 0.0004997487154826492, + "loss": 1.8003, + "step": 160 + }, + { + "epoch": 0.02872179109802872, + "grad_norm": 0.9577097296714783, + "learning_rate": 0.0004997455651449502, + "loss": 2.1386, + "step": 161 + }, + { + "epoch": 0.028900187316028902, + "grad_norm": 1.0292500257492065, + "learning_rate": 0.0004997423951923419, + "loss": 2.0328, + "step": 162 + }, + { + "epoch": 0.02907858353402908, + "grad_norm": 0.8872460126876831, + "learning_rate": 0.0004997392056250733, + "loss": 1.9567, + "step": 163 + }, + { + "epoch": 0.029256979752029257, + "grad_norm": 4.496621608734131, + "learning_rate": 0.0004997359964433952, + "loss": 2.2306, + "step": 164 + }, + { + "epoch": 0.029435375970029435, + "grad_norm": 1.3478816747665405, + "learning_rate": 0.0004997327676475593, + "loss": 1.9389, + "step": 165 + }, + { + "epoch": 0.029613772188029613, + "grad_norm": 1.18978750705719, + "learning_rate": 0.0004997295192378192, + "loss": 1.9183, + "step": 166 + }, + { + "epoch": 0.02979216840602979, + "grad_norm": 0.8998153805732727, + "learning_rate": 0.0004997262512144302, + "loss": 2.0974, + "step": 167 + }, + { + "epoch": 0.02997056462402997, + "grad_norm": 0.9464337229728699, + "learning_rate": 0.0004997229635776488, + "loss": 1.8474, + "step": 168 + }, + { + "epoch": 0.03014896084203015, + "grad_norm": 0.8955067992210388, + "learning_rate": 0.0004997196563277334, + "loss": 2.0262, + "step": 169 + }, + { + "epoch": 0.030327357060030327, + "grad_norm": 0.727933943271637, + "learning_rate": 0.0004997163294649437, + "loss": 1.7326, + "step": 170 + }, + { + "epoch": 0.030505753278030505, + "grad_norm": 1.0948885679244995, + "learning_rate": 0.0004997129829895409, + "loss": 2.1483, + "step": 171 + }, + { + "epoch": 0.030684149496030683, + "grad_norm": 0.9044740200042725, + "learning_rate": 0.0004997096169017879, + "loss": 1.8832, + "step": 172 + }, + { + "epoch": 0.030862545714030864, + "grad_norm": 1.0726819038391113, + "learning_rate": 0.0004997062312019489, + "loss": 1.9131, + "step": 173 + }, + { + "epoch": 0.03104094193203104, + "grad_norm": 0.9715486764907837, + "learning_rate": 0.0004997028258902902, + "loss": 2.0829, + "step": 174 + }, + { + "epoch": 0.03121933815003122, + "grad_norm": 0.8400108218193054, + "learning_rate": 0.0004996994009670788, + "loss": 1.9984, + "step": 175 + }, + { + "epoch": 0.0313977343680314, + "grad_norm": 0.9738950729370117, + "learning_rate": 0.000499695956432584, + "loss": 1.9813, + "step": 176 + }, + { + "epoch": 0.03157613058603158, + "grad_norm": 1.125439167022705, + "learning_rate": 0.0004996924922870762, + "loss": 1.8655, + "step": 177 + }, + { + "epoch": 0.031754526804031756, + "grad_norm": 1.0050427913665771, + "learning_rate": 0.0004996890085308275, + "loss": 2.1677, + "step": 178 + }, + { + "epoch": 0.031932923022031934, + "grad_norm": 0.7742742300033569, + "learning_rate": 0.0004996855051641116, + "loss": 2.0985, + "step": 179 + }, + { + "epoch": 0.03211131924003211, + "grad_norm": 1.3308488130569458, + "learning_rate": 0.0004996819821872035, + "loss": 1.7866, + "step": 180 + }, + { + "epoch": 0.03228971545803229, + "grad_norm": 0.776884913444519, + "learning_rate": 0.00049967843960038, + "loss": 1.8104, + "step": 181 + }, + { + "epoch": 0.03246811167603247, + "grad_norm": 0.8655677437782288, + "learning_rate": 0.0004996748774039192, + "loss": 2.0098, + "step": 182 + }, + { + "epoch": 0.032646507894032645, + "grad_norm": 0.7239806652069092, + "learning_rate": 0.000499671295598101, + "loss": 1.6073, + "step": 183 + }, + { + "epoch": 0.03282490411203282, + "grad_norm": 0.8207644820213318, + "learning_rate": 0.0004996676941832069, + "loss": 1.8347, + "step": 184 + }, + { + "epoch": 0.033003300330033, + "grad_norm": 0.7517510652542114, + "learning_rate": 0.0004996640731595194, + "loss": 2.0775, + "step": 185 + }, + { + "epoch": 0.033181696548033185, + "grad_norm": 0.7683122158050537, + "learning_rate": 0.000499660432527323, + "loss": 1.9587, + "step": 186 + }, + { + "epoch": 0.03336009276603336, + "grad_norm": 0.7496309280395508, + "learning_rate": 0.0004996567722869036, + "loss": 1.9149, + "step": 187 + }, + { + "epoch": 0.03353848898403354, + "grad_norm": 0.7267098426818848, + "learning_rate": 0.0004996530924385489, + "loss": 2.1301, + "step": 188 + }, + { + "epoch": 0.03371688520203372, + "grad_norm": 1.1069225072860718, + "learning_rate": 0.0004996493929825477, + "loss": 1.74, + "step": 189 + }, + { + "epoch": 0.033895281420033896, + "grad_norm": 0.795403778553009, + "learning_rate": 0.0004996456739191905, + "loss": 2.1685, + "step": 190 + }, + { + "epoch": 0.034073677638034074, + "grad_norm": 0.7450090050697327, + "learning_rate": 0.0004996419352487696, + "loss": 2.1011, + "step": 191 + }, + { + "epoch": 0.03425207385603425, + "grad_norm": 0.7762404084205627, + "learning_rate": 0.0004996381769715785, + "loss": 2.0786, + "step": 192 + }, + { + "epoch": 0.03443047007403443, + "grad_norm": 0.7745615839958191, + "learning_rate": 0.0004996343990879125, + "loss": 1.6322, + "step": 193 + }, + { + "epoch": 0.03460886629203461, + "grad_norm": 0.8149433732032776, + "learning_rate": 0.0004996306015980681, + "loss": 1.646, + "step": 194 + }, + { + "epoch": 0.034787262510034785, + "grad_norm": 0.8039779663085938, + "learning_rate": 0.0004996267845023437, + "loss": 1.6153, + "step": 195 + }, + { + "epoch": 0.03496565872803496, + "grad_norm": 0.7436607480049133, + "learning_rate": 0.0004996229478010392, + "loss": 1.9986, + "step": 196 + }, + { + "epoch": 0.03514405494603515, + "grad_norm": 1.0629464387893677, + "learning_rate": 0.0004996190914944556, + "loss": 1.7696, + "step": 197 + }, + { + "epoch": 0.035322451164035325, + "grad_norm": 0.7059940099716187, + "learning_rate": 0.0004996152155828961, + "loss": 1.75, + "step": 198 + }, + { + "epoch": 0.0355008473820355, + "grad_norm": 0.7760869860649109, + "learning_rate": 0.0004996113200666649, + "loss": 1.6804, + "step": 199 + }, + { + "epoch": 0.03567924360003568, + "grad_norm": 0.723596453666687, + "learning_rate": 0.000499607404946068, + "loss": 1.7709, + "step": 200 + }, + { + "epoch": 0.03585763981803586, + "grad_norm": 0.7342411279678345, + "learning_rate": 0.0004996034702214131, + "loss": 2.1074, + "step": 201 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 1.2572263479232788, + "learning_rate": 0.000499599515893009, + "loss": 1.8228, + "step": 202 + }, + { + "epoch": 0.03621443225403621, + "grad_norm": 0.9054358005523682, + "learning_rate": 0.0004995955419611663, + "loss": 2.0835, + "step": 203 + }, + { + "epoch": 0.03639282847203639, + "grad_norm": 0.8486645817756653, + "learning_rate": 0.0004995915484261971, + "loss": 2.1984, + "step": 204 + }, + { + "epoch": 0.03657122469003657, + "grad_norm": 0.7532051801681519, + "learning_rate": 0.0004995875352884152, + "loss": 1.9638, + "step": 205 + }, + { + "epoch": 0.03674962090803675, + "grad_norm": 0.9060899615287781, + "learning_rate": 0.0004995835025481357, + "loss": 1.8939, + "step": 206 + }, + { + "epoch": 0.03692801712603693, + "grad_norm": 0.7113338112831116, + "learning_rate": 0.0004995794502056751, + "loss": 1.6417, + "step": 207 + }, + { + "epoch": 0.03710641334403711, + "grad_norm": 3.6220555305480957, + "learning_rate": 0.0004995753782613521, + "loss": 1.7458, + "step": 208 + }, + { + "epoch": 0.03728480956203729, + "grad_norm": 1.1314635276794434, + "learning_rate": 0.0004995712867154863, + "loss": 1.9954, + "step": 209 + }, + { + "epoch": 0.037463205780037465, + "grad_norm": 0.8598477244377136, + "learning_rate": 0.0004995671755683989, + "loss": 1.7875, + "step": 210 + }, + { + "epoch": 0.03764160199803764, + "grad_norm": 0.725925087928772, + "learning_rate": 0.0004995630448204131, + "loss": 1.9205, + "step": 211 + }, + { + "epoch": 0.03781999821603782, + "grad_norm": 0.9679766297340393, + "learning_rate": 0.000499558894471853, + "loss": 2.1183, + "step": 212 + }, + { + "epoch": 0.037998394434038, + "grad_norm": 0.7008562684059143, + "learning_rate": 0.0004995547245230448, + "loss": 1.8477, + "step": 213 + }, + { + "epoch": 0.038176790652038176, + "grad_norm": 0.7993139624595642, + "learning_rate": 0.0004995505349743158, + "loss": 1.6427, + "step": 214 + }, + { + "epoch": 0.03835518687003835, + "grad_norm": 1.6259403228759766, + "learning_rate": 0.0004995463258259953, + "loss": 1.7742, + "step": 215 + }, + { + "epoch": 0.03853358308803853, + "grad_norm": 0.8765113353729248, + "learning_rate": 0.0004995420970784137, + "loss": 1.6492, + "step": 216 + }, + { + "epoch": 0.03871197930603871, + "grad_norm": 0.6928495168685913, + "learning_rate": 0.0004995378487319032, + "loss": 1.7392, + "step": 217 + }, + { + "epoch": 0.038890375524038893, + "grad_norm": 2.987717390060425, + "learning_rate": 0.0004995335807867975, + "loss": 1.8262, + "step": 218 + }, + { + "epoch": 0.03906877174203907, + "grad_norm": 1.99364173412323, + "learning_rate": 0.0004995292932434317, + "loss": 2.138, + "step": 219 + }, + { + "epoch": 0.03924716796003925, + "grad_norm": 1.4841055870056152, + "learning_rate": 0.0004995249861021425, + "loss": 1.7652, + "step": 220 + }, + { + "epoch": 0.03942556417803943, + "grad_norm": 0.9593233466148376, + "learning_rate": 0.0004995206593632685, + "loss": 2.2273, + "step": 221 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 0.7397097945213318, + "learning_rate": 0.0004995163130271491, + "loss": 1.6452, + "step": 222 + }, + { + "epoch": 0.03978235661403978, + "grad_norm": 0.7248395085334778, + "learning_rate": 0.0004995119470941259, + "loss": 1.7841, + "step": 223 + }, + { + "epoch": 0.03996075283203996, + "grad_norm": 0.8945801854133606, + "learning_rate": 0.0004995075615645418, + "loss": 1.8039, + "step": 224 + }, + { + "epoch": 0.04013914905004014, + "grad_norm": 1.0206605195999146, + "learning_rate": 0.0004995031564387411, + "loss": 1.8187, + "step": 225 + }, + { + "epoch": 0.040317545268040315, + "grad_norm": 0.802527129650116, + "learning_rate": 0.00049949873171707, + "loss": 1.7456, + "step": 226 + }, + { + "epoch": 0.04049594148604049, + "grad_norm": 0.8061395883560181, + "learning_rate": 0.0004994942873998757, + "loss": 1.7857, + "step": 227 + }, + { + "epoch": 0.04067433770404067, + "grad_norm": 0.8295297026634216, + "learning_rate": 0.0004994898234875074, + "loss": 1.6685, + "step": 228 + }, + { + "epoch": 0.040852733922040856, + "grad_norm": 0.9258911609649658, + "learning_rate": 0.000499485339980316, + "loss": 2.0146, + "step": 229 + }, + { + "epoch": 0.04103113014004103, + "grad_norm": 0.7650581002235413, + "learning_rate": 0.0004994808368786531, + "loss": 1.7862, + "step": 230 + }, + { + "epoch": 0.04120952635804121, + "grad_norm": 0.8616735339164734, + "learning_rate": 0.0004994763141828728, + "loss": 1.7706, + "step": 231 + }, + { + "epoch": 0.04138792257604139, + "grad_norm": 0.8802286386489868, + "learning_rate": 0.00049947177189333, + "loss": 1.7055, + "step": 232 + }, + { + "epoch": 0.04156631879404157, + "grad_norm": 0.6563844680786133, + "learning_rate": 0.0004994672100103818, + "loss": 1.7655, + "step": 233 + }, + { + "epoch": 0.041744715012041744, + "grad_norm": 0.8318186402320862, + "learning_rate": 0.0004994626285343861, + "loss": 1.7316, + "step": 234 + }, + { + "epoch": 0.04192311123004192, + "grad_norm": 0.8289990425109863, + "learning_rate": 0.0004994580274657029, + "loss": 1.9734, + "step": 235 + }, + { + "epoch": 0.0421015074480421, + "grad_norm": 0.8461620807647705, + "learning_rate": 0.0004994534068046936, + "loss": 1.8266, + "step": 236 + }, + { + "epoch": 0.04227990366604228, + "grad_norm": 0.8519775867462158, + "learning_rate": 0.0004994487665517212, + "loss": 1.8579, + "step": 237 + }, + { + "epoch": 0.042458299884042455, + "grad_norm": 0.707134485244751, + "learning_rate": 0.0004994441067071499, + "loss": 1.845, + "step": 238 + }, + { + "epoch": 0.04263669610204264, + "grad_norm": 1.2753993272781372, + "learning_rate": 0.0004994394272713459, + "loss": 1.8513, + "step": 239 + }, + { + "epoch": 0.04281509232004282, + "grad_norm": 0.9627553224563599, + "learning_rate": 0.0004994347282446765, + "loss": 2.2214, + "step": 240 + }, + { + "epoch": 0.042993488538042995, + "grad_norm": 0.8274680972099304, + "learning_rate": 0.0004994300096275108, + "loss": 1.8324, + "step": 241 + }, + { + "epoch": 0.04317188475604317, + "grad_norm": 1.0614216327667236, + "learning_rate": 0.0004994252714202198, + "loss": 1.8447, + "step": 242 + }, + { + "epoch": 0.04335028097404335, + "grad_norm": 0.853103518486023, + "learning_rate": 0.0004994205136231751, + "loss": 1.8614, + "step": 243 + }, + { + "epoch": 0.04352867719204353, + "grad_norm": 0.6934187412261963, + "learning_rate": 0.0004994157362367506, + "loss": 1.6686, + "step": 244 + }, + { + "epoch": 0.043707073410043706, + "grad_norm": 0.856378972530365, + "learning_rate": 0.0004994109392613215, + "loss": 1.7998, + "step": 245 + }, + { + "epoch": 0.043885469628043884, + "grad_norm": 0.8739482760429382, + "learning_rate": 0.0004994061226972647, + "loss": 1.7582, + "step": 246 + }, + { + "epoch": 0.04406386584604406, + "grad_norm": 0.904869794845581, + "learning_rate": 0.0004994012865449582, + "loss": 1.966, + "step": 247 + }, + { + "epoch": 0.04424226206404424, + "grad_norm": 0.8391656279563904, + "learning_rate": 0.0004993964308047821, + "loss": 1.7177, + "step": 248 + }, + { + "epoch": 0.04442065828204442, + "grad_norm": 0.9165405035018921, + "learning_rate": 0.0004993915554771175, + "loss": 1.8742, + "step": 249 + }, + { + "epoch": 0.0445990545000446, + "grad_norm": 0.7904878854751587, + "learning_rate": 0.0004993866605623475, + "loss": 1.8811, + "step": 250 + }, + { + "epoch": 0.04477745071804478, + "grad_norm": 0.7866482734680176, + "learning_rate": 0.0004993817460608566, + "loss": 1.6096, + "step": 251 + }, + { + "epoch": 0.04495584693604496, + "grad_norm": 0.8852795362472534, + "learning_rate": 0.0004993768119730306, + "loss": 1.8237, + "step": 252 + }, + { + "epoch": 0.045134243154045135, + "grad_norm": 0.9104618430137634, + "learning_rate": 0.0004993718582992572, + "loss": 1.8859, + "step": 253 + }, + { + "epoch": 0.04531263937204531, + "grad_norm": 0.7363934516906738, + "learning_rate": 0.0004993668850399252, + "loss": 1.3366, + "step": 254 + }, + { + "epoch": 0.04549103559004549, + "grad_norm": 0.7602558732032776, + "learning_rate": 0.0004993618921954254, + "loss": 2.0173, + "step": 255 + }, + { + "epoch": 0.04566943180804567, + "grad_norm": 0.8223975896835327, + "learning_rate": 0.00049935687976615, + "loss": 1.6692, + "step": 256 + }, + { + "epoch": 0.045847828026045846, + "grad_norm": 0.7281943559646606, + "learning_rate": 0.0004993518477524924, + "loss": 1.6527, + "step": 257 + }, + { + "epoch": 0.046026224244046024, + "grad_norm": 0.7140815854072571, + "learning_rate": 0.0004993467961548482, + "loss": 1.895, + "step": 258 + }, + { + "epoch": 0.0462046204620462, + "grad_norm": 1.0137954950332642, + "learning_rate": 0.0004993417249736138, + "loss": 2.0086, + "step": 259 + }, + { + "epoch": 0.046383016680046386, + "grad_norm": 0.7407935261726379, + "learning_rate": 0.0004993366342091876, + "loss": 1.6048, + "step": 260 + }, + { + "epoch": 0.046561412898046564, + "grad_norm": 0.7034161686897278, + "learning_rate": 0.0004993315238619695, + "loss": 1.8503, + "step": 261 + }, + { + "epoch": 0.04673980911604674, + "grad_norm": 0.6958416104316711, + "learning_rate": 0.0004993263939323608, + "loss": 1.6034, + "step": 262 + }, + { + "epoch": 0.04691820533404692, + "grad_norm": 0.7423539161682129, + "learning_rate": 0.0004993212444207644, + "loss": 1.598, + "step": 263 + }, + { + "epoch": 0.0470966015520471, + "grad_norm": 0.7731009125709534, + "learning_rate": 0.0004993160753275849, + "loss": 1.9353, + "step": 264 + }, + { + "epoch": 0.047274997770047275, + "grad_norm": 0.7424893379211426, + "learning_rate": 0.0004993108866532279, + "loss": 1.712, + "step": 265 + }, + { + "epoch": 0.04745339398804745, + "grad_norm": 0.8562301993370056, + "learning_rate": 0.0004993056783981013, + "loss": 1.8745, + "step": 266 + }, + { + "epoch": 0.04763179020604763, + "grad_norm": 1.6230698823928833, + "learning_rate": 0.000499300450562614, + "loss": 1.7503, + "step": 267 + }, + { + "epoch": 0.04781018642404781, + "grad_norm": 0.938933253288269, + "learning_rate": 0.0004992952031471765, + "loss": 1.683, + "step": 268 + }, + { + "epoch": 0.047988582642047986, + "grad_norm": 0.8198557496070862, + "learning_rate": 0.0004992899361522011, + "loss": 1.9077, + "step": 269 + }, + { + "epoch": 0.048166978860048164, + "grad_norm": 0.9028448462486267, + "learning_rate": 0.0004992846495781013, + "loss": 1.9227, + "step": 270 + }, + { + "epoch": 0.04834537507804835, + "grad_norm": 0.7884741425514221, + "learning_rate": 0.0004992793434252925, + "loss": 1.58, + "step": 271 + }, + { + "epoch": 0.048523771296048526, + "grad_norm": 1.0796072483062744, + "learning_rate": 0.0004992740176941912, + "loss": 1.6761, + "step": 272 + }, + { + "epoch": 0.048702167514048704, + "grad_norm": 0.9148755073547363, + "learning_rate": 0.0004992686723852161, + "loss": 1.7409, + "step": 273 + }, + { + "epoch": 0.04888056373204888, + "grad_norm": 0.7027503252029419, + "learning_rate": 0.0004992633074987864, + "loss": 1.4971, + "step": 274 + }, + { + "epoch": 0.04905895995004906, + "grad_norm": 0.8004087209701538, + "learning_rate": 0.000499257923035324, + "loss": 1.7677, + "step": 275 + }, + { + "epoch": 0.04923735616804924, + "grad_norm": 0.7200493216514587, + "learning_rate": 0.0004992525189952516, + "loss": 1.7596, + "step": 276 + }, + { + "epoch": 0.049415752386049415, + "grad_norm": 0.7689985036849976, + "learning_rate": 0.0004992470953789936, + "loss": 1.7032, + "step": 277 + }, + { + "epoch": 0.04959414860404959, + "grad_norm": 0.7748706936836243, + "learning_rate": 0.000499241652186976, + "loss": 1.972, + "step": 278 + }, + { + "epoch": 0.04977254482204977, + "grad_norm": 1.406790018081665, + "learning_rate": 0.0004992361894196263, + "loss": 1.5758, + "step": 279 + }, + { + "epoch": 0.04995094104004995, + "grad_norm": 0.6651303172111511, + "learning_rate": 0.0004992307070773734, + "loss": 1.6668, + "step": 280 + }, + { + "epoch": 0.050129337258050126, + "grad_norm": 0.7641019821166992, + "learning_rate": 0.0004992252051606481, + "loss": 1.8233, + "step": 281 + }, + { + "epoch": 0.05030773347605031, + "grad_norm": 0.6978675127029419, + "learning_rate": 0.0004992196836698825, + "loss": 1.6394, + "step": 282 + }, + { + "epoch": 0.05048612969405049, + "grad_norm": 0.7156147956848145, + "learning_rate": 0.0004992141426055101, + "loss": 1.6568, + "step": 283 + }, + { + "epoch": 0.050664525912050666, + "grad_norm": 0.791042149066925, + "learning_rate": 0.0004992085819679662, + "loss": 1.8138, + "step": 284 + }, + { + "epoch": 0.050842922130050844, + "grad_norm": 0.7295787334442139, + "learning_rate": 0.0004992030017576876, + "loss": 1.3886, + "step": 285 + }, + { + "epoch": 0.05102131834805102, + "grad_norm": 1.0952125787734985, + "learning_rate": 0.0004991974019751124, + "loss": 1.6662, + "step": 286 + }, + { + "epoch": 0.0511997145660512, + "grad_norm": 1.1296299695968628, + "learning_rate": 0.0004991917826206805, + "loss": 1.5014, + "step": 287 + }, + { + "epoch": 0.05137811078405138, + "grad_norm": 0.7258694767951965, + "learning_rate": 0.0004991861436948333, + "loss": 1.5864, + "step": 288 + }, + { + "epoch": 0.051556507002051555, + "grad_norm": 0.9456144571304321, + "learning_rate": 0.0004991804851980135, + "loss": 1.9251, + "step": 289 + }, + { + "epoch": 0.05173490322005173, + "grad_norm": 0.7830838561058044, + "learning_rate": 0.0004991748071306657, + "loss": 1.6641, + "step": 290 + }, + { + "epoch": 0.05191329943805191, + "grad_norm": 0.7046554684638977, + "learning_rate": 0.0004991691094932357, + "loss": 1.7749, + "step": 291 + }, + { + "epoch": 0.052091695656052095, + "grad_norm": 1.1261860132217407, + "learning_rate": 0.000499163392286171, + "loss": 1.7228, + "step": 292 + }, + { + "epoch": 0.05227009187405227, + "grad_norm": 0.7298058867454529, + "learning_rate": 0.0004991576555099208, + "loss": 1.666, + "step": 293 + }, + { + "epoch": 0.05244848809205245, + "grad_norm": 0.7975763082504272, + "learning_rate": 0.0004991518991649356, + "loss": 1.803, + "step": 294 + }, + { + "epoch": 0.05262688431005263, + "grad_norm": 1.0635559558868408, + "learning_rate": 0.0004991461232516675, + "loss": 1.4519, + "step": 295 + }, + { + "epoch": 0.052805280528052806, + "grad_norm": 0.9030710458755493, + "learning_rate": 0.00049914032777057, + "loss": 1.607, + "step": 296 + }, + { + "epoch": 0.052983676746052984, + "grad_norm": 1.0126208066940308, + "learning_rate": 0.0004991345127220983, + "loss": 1.6796, + "step": 297 + }, + { + "epoch": 0.05316207296405316, + "grad_norm": 0.8473824858665466, + "learning_rate": 0.0004991286781067094, + "loss": 1.6065, + "step": 298 + }, + { + "epoch": 0.05334046918205334, + "grad_norm": 0.7614424228668213, + "learning_rate": 0.0004991228239248611, + "loss": 1.4427, + "step": 299 + }, + { + "epoch": 0.05351886540005352, + "grad_norm": 0.8425862193107605, + "learning_rate": 0.0004991169501770135, + "loss": 1.6097, + "step": 300 + }, + { + "epoch": 0.053697261618053695, + "grad_norm": 0.7978277802467346, + "learning_rate": 0.0004991110568636278, + "loss": 1.6084, + "step": 301 + }, + { + "epoch": 0.05387565783605387, + "grad_norm": 0.9924404621124268, + "learning_rate": 0.000499105143985167, + "loss": 1.6886, + "step": 302 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 1.2324185371398926, + "learning_rate": 0.0004990992115420954, + "loss": 2.0878, + "step": 303 + }, + { + "epoch": 0.054232450272054235, + "grad_norm": 0.8193588256835938, + "learning_rate": 0.0004990932595348789, + "loss": 1.3275, + "step": 304 + }, + { + "epoch": 0.05441084649005441, + "grad_norm": 0.9794548153877258, + "learning_rate": 0.0004990872879639849, + "loss": 1.9818, + "step": 305 + }, + { + "epoch": 0.05458924270805459, + "grad_norm": 0.9556685090065002, + "learning_rate": 0.0004990812968298825, + "loss": 1.7266, + "step": 306 + }, + { + "epoch": 0.05476763892605477, + "grad_norm": 0.8109720945358276, + "learning_rate": 0.0004990752861330424, + "loss": 1.626, + "step": 307 + }, + { + "epoch": 0.054946035144054946, + "grad_norm": 0.7442929744720459, + "learning_rate": 0.0004990692558739363, + "loss": 1.6846, + "step": 308 + }, + { + "epoch": 0.055124431362055124, + "grad_norm": 0.8138058185577393, + "learning_rate": 0.0004990632060530381, + "loss": 1.4798, + "step": 309 + }, + { + "epoch": 0.0553028275800553, + "grad_norm": 1.426184892654419, + "learning_rate": 0.0004990571366708229, + "loss": 1.4324, + "step": 310 + }, + { + "epoch": 0.05548122379805548, + "grad_norm": 0.9690107703208923, + "learning_rate": 0.0004990510477277673, + "loss": 1.7228, + "step": 311 + }, + { + "epoch": 0.05565962001605566, + "grad_norm": 0.8152466416358948, + "learning_rate": 0.0004990449392243497, + "loss": 1.5993, + "step": 312 + }, + { + "epoch": 0.05583801623405584, + "grad_norm": 0.7555820345878601, + "learning_rate": 0.0004990388111610496, + "loss": 1.5512, + "step": 313 + }, + { + "epoch": 0.05601641245205602, + "grad_norm": 0.8185862302780151, + "learning_rate": 0.0004990326635383486, + "loss": 1.8048, + "step": 314 + }, + { + "epoch": 0.0561948086700562, + "grad_norm": 1.3000257015228271, + "learning_rate": 0.0004990264963567292, + "loss": 1.9454, + "step": 315 + }, + { + "epoch": 0.056373204888056375, + "grad_norm": 1.501250982284546, + "learning_rate": 0.0004990203096166761, + "loss": 1.7511, + "step": 316 + }, + { + "epoch": 0.05655160110605655, + "grad_norm": 0.7413790822029114, + "learning_rate": 0.000499014103318675, + "loss": 1.6659, + "step": 317 + }, + { + "epoch": 0.05672999732405673, + "grad_norm": 0.6846387982368469, + "learning_rate": 0.0004990078774632134, + "loss": 1.5829, + "step": 318 + }, + { + "epoch": 0.05690839354205691, + "grad_norm": 0.8253748416900635, + "learning_rate": 0.0004990016320507802, + "loss": 1.6716, + "step": 319 + }, + { + "epoch": 0.057086789760057086, + "grad_norm": 0.7216707468032837, + "learning_rate": 0.000498995367081866, + "loss": 1.6294, + "step": 320 + }, + { + "epoch": 0.05726518597805726, + "grad_norm": 0.8067865371704102, + "learning_rate": 0.0004989890825569628, + "loss": 1.7566, + "step": 321 + }, + { + "epoch": 0.05744358219605744, + "grad_norm": 0.867411732673645, + "learning_rate": 0.0004989827784765643, + "loss": 1.5055, + "step": 322 + }, + { + "epoch": 0.05762197841405762, + "grad_norm": 0.7605225443840027, + "learning_rate": 0.0004989764548411654, + "loss": 1.6925, + "step": 323 + }, + { + "epoch": 0.057800374632057804, + "grad_norm": 0.7990824580192566, + "learning_rate": 0.0004989701116512629, + "loss": 1.8807, + "step": 324 + }, + { + "epoch": 0.05797877085005798, + "grad_norm": 0.7364970445632935, + "learning_rate": 0.000498963748907355, + "loss": 1.6086, + "step": 325 + }, + { + "epoch": 0.05815716706805816, + "grad_norm": 0.7894652485847473, + "learning_rate": 0.0004989573666099415, + "loss": 1.9226, + "step": 326 + }, + { + "epoch": 0.05833556328605834, + "grad_norm": 0.8756299018859863, + "learning_rate": 0.0004989509647595234, + "loss": 1.7023, + "step": 327 + }, + { + "epoch": 0.058513959504058514, + "grad_norm": 1.0760093927383423, + "learning_rate": 0.0004989445433566037, + "loss": 1.4588, + "step": 328 + }, + { + "epoch": 0.05869235572205869, + "grad_norm": 0.8876066207885742, + "learning_rate": 0.0004989381024016867, + "loss": 1.7266, + "step": 329 + }, + { + "epoch": 0.05887075194005887, + "grad_norm": 4.222714900970459, + "learning_rate": 0.0004989316418952782, + "loss": 1.6749, + "step": 330 + }, + { + "epoch": 0.05904914815805905, + "grad_norm": 1.0172480344772339, + "learning_rate": 0.0004989251618378859, + "loss": 1.6138, + "step": 331 + }, + { + "epoch": 0.059227544376059225, + "grad_norm": 1.1439893245697021, + "learning_rate": 0.0004989186622300183, + "loss": 1.6511, + "step": 332 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 0.7901877760887146, + "learning_rate": 0.0004989121430721862, + "loss": 1.5239, + "step": 333 + }, + { + "epoch": 0.05958433681205958, + "grad_norm": 0.8215880393981934, + "learning_rate": 0.0004989056043649014, + "loss": 1.6242, + "step": 334 + }, + { + "epoch": 0.059762733030059766, + "grad_norm": 0.7913690805435181, + "learning_rate": 0.0004988990461086777, + "loss": 1.9562, + "step": 335 + }, + { + "epoch": 0.05994112924805994, + "grad_norm": 0.7676241397857666, + "learning_rate": 0.0004988924683040298, + "loss": 1.6554, + "step": 336 + }, + { + "epoch": 0.06011952546606012, + "grad_norm": 0.7700140476226807, + "learning_rate": 0.0004988858709514747, + "loss": 1.5174, + "step": 337 + }, + { + "epoch": 0.0602979216840603, + "grad_norm": 0.7194883227348328, + "learning_rate": 0.0004988792540515304, + "loss": 1.711, + "step": 338 + }, + { + "epoch": 0.06047631790206048, + "grad_norm": 0.737006425857544, + "learning_rate": 0.0004988726176047164, + "loss": 1.7161, + "step": 339 + }, + { + "epoch": 0.060654714120060654, + "grad_norm": 0.8978502154350281, + "learning_rate": 0.0004988659616115544, + "loss": 1.4597, + "step": 340 + }, + { + "epoch": 0.06083311033806083, + "grad_norm": 0.850236177444458, + "learning_rate": 0.0004988592860725667, + "loss": 1.7958, + "step": 341 + }, + { + "epoch": 0.06101150655606101, + "grad_norm": 0.6929709911346436, + "learning_rate": 0.0004988525909882779, + "loss": 1.4684, + "step": 342 + }, + { + "epoch": 0.06118990277406119, + "grad_norm": 0.8175438642501831, + "learning_rate": 0.0004988458763592135, + "loss": 1.5871, + "step": 343 + }, + { + "epoch": 0.061368298992061365, + "grad_norm": 1.5031802654266357, + "learning_rate": 0.0004988391421859011, + "loss": 1.8358, + "step": 344 + }, + { + "epoch": 0.06154669521006155, + "grad_norm": 0.8213221430778503, + "learning_rate": 0.0004988323884688696, + "loss": 1.6868, + "step": 345 + }, + { + "epoch": 0.06172509142806173, + "grad_norm": 0.855012059211731, + "learning_rate": 0.0004988256152086495, + "loss": 1.7024, + "step": 346 + }, + { + "epoch": 0.061903487646061905, + "grad_norm": 0.9920960664749146, + "learning_rate": 0.0004988188224057724, + "loss": 1.641, + "step": 347 + }, + { + "epoch": 0.06208188386406208, + "grad_norm": 0.7211305499076843, + "learning_rate": 0.0004988120100607723, + "loss": 1.3817, + "step": 348 + }, + { + "epoch": 0.06226028008206226, + "grad_norm": 0.8306823968887329, + "learning_rate": 0.0004988051781741839, + "loss": 1.6479, + "step": 349 + }, + { + "epoch": 0.06243867630006244, + "grad_norm": 0.6879592537879944, + "learning_rate": 0.0004987983267465439, + "loss": 1.7006, + "step": 350 + }, + { + "epoch": 0.06261707251806262, + "grad_norm": 0.7928709387779236, + "learning_rate": 0.0004987914557783905, + "loss": 1.7222, + "step": 351 + }, + { + "epoch": 0.0627954687360628, + "grad_norm": 0.8470203876495361, + "learning_rate": 0.000498784565270263, + "loss": 1.7225, + "step": 352 + }, + { + "epoch": 0.06297386495406297, + "grad_norm": 0.7351035475730896, + "learning_rate": 0.0004987776552227029, + "loss": 1.574, + "step": 353 + }, + { + "epoch": 0.06315226117206316, + "grad_norm": 0.8006918430328369, + "learning_rate": 0.0004987707256362529, + "loss": 1.9217, + "step": 354 + }, + { + "epoch": 0.06333065739006333, + "grad_norm": 0.7959868907928467, + "learning_rate": 0.0004987637765114571, + "loss": 1.6191, + "step": 355 + }, + { + "epoch": 0.06350905360806351, + "grad_norm": 0.6624523401260376, + "learning_rate": 0.0004987568078488613, + "loss": 1.4849, + "step": 356 + }, + { + "epoch": 0.06368744982606368, + "grad_norm": 0.665393054485321, + "learning_rate": 0.0004987498196490129, + "loss": 1.67, + "step": 357 + }, + { + "epoch": 0.06386584604406387, + "grad_norm": 1.2895721197128296, + "learning_rate": 0.0004987428119124607, + "loss": 1.6514, + "step": 358 + }, + { + "epoch": 0.06404424226206404, + "grad_norm": 0.7853053212165833, + "learning_rate": 0.0004987357846397551, + "loss": 1.528, + "step": 359 + }, + { + "epoch": 0.06422263848006422, + "grad_norm": 0.7855213284492493, + "learning_rate": 0.000498728737831448, + "loss": 1.3609, + "step": 360 + }, + { + "epoch": 0.06440103469806441, + "grad_norm": 0.9577522277832031, + "learning_rate": 0.0004987216714880929, + "loss": 1.6681, + "step": 361 + }, + { + "epoch": 0.06457943091606458, + "grad_norm": 0.7544964551925659, + "learning_rate": 0.0004987145856102448, + "loss": 1.5707, + "step": 362 + }, + { + "epoch": 0.06475782713406476, + "grad_norm": 0.9541155695915222, + "learning_rate": 0.00049870748019846, + "loss": 1.5559, + "step": 363 + }, + { + "epoch": 0.06493622335206493, + "grad_norm": 0.8656521439552307, + "learning_rate": 0.0004987003552532969, + "loss": 1.5096, + "step": 364 + }, + { + "epoch": 0.06511461957006512, + "grad_norm": 0.9872167706489563, + "learning_rate": 0.0004986932107753148, + "loss": 1.575, + "step": 365 + }, + { + "epoch": 0.06529301578806529, + "grad_norm": 0.7449053525924683, + "learning_rate": 0.000498686046765075, + "loss": 1.3222, + "step": 366 + }, + { + "epoch": 0.06547141200606547, + "grad_norm": 0.7896831631660461, + "learning_rate": 0.0004986788632231401, + "loss": 1.6182, + "step": 367 + }, + { + "epoch": 0.06564980822406564, + "grad_norm": 0.8574457168579102, + "learning_rate": 0.0004986716601500744, + "loss": 1.6993, + "step": 368 + }, + { + "epoch": 0.06582820444206583, + "grad_norm": 0.8115587830543518, + "learning_rate": 0.0004986644375464434, + "loss": 1.6287, + "step": 369 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 1.8400505781173706, + "learning_rate": 0.0004986571954128145, + "loss": 1.6949, + "step": 370 + }, + { + "epoch": 0.06618499687806619, + "grad_norm": 0.8843457698822021, + "learning_rate": 0.0004986499337497565, + "loss": 1.3939, + "step": 371 + }, + { + "epoch": 0.06636339309606637, + "grad_norm": 0.9792248010635376, + "learning_rate": 0.0004986426525578398, + "loss": 1.6735, + "step": 372 + }, + { + "epoch": 0.06654178931406654, + "grad_norm": 0.8991268277168274, + "learning_rate": 0.000498635351837636, + "loss": 1.6638, + "step": 373 + }, + { + "epoch": 0.06672018553206673, + "grad_norm": 1.0878491401672363, + "learning_rate": 0.0004986280315897188, + "loss": 1.3675, + "step": 374 + }, + { + "epoch": 0.0668985817500669, + "grad_norm": 0.9121967554092407, + "learning_rate": 0.0004986206918146629, + "loss": 1.4673, + "step": 375 + }, + { + "epoch": 0.06707697796806708, + "grad_norm": 0.9130116105079651, + "learning_rate": 0.0004986133325130448, + "loss": 1.8798, + "step": 376 + }, + { + "epoch": 0.06725537418606725, + "grad_norm": 2.4044368267059326, + "learning_rate": 0.0004986059536854427, + "loss": 1.396, + "step": 377 + }, + { + "epoch": 0.06743377040406744, + "grad_norm": 1.2057360410690308, + "learning_rate": 0.0004985985553324359, + "loss": 1.6874, + "step": 378 + }, + { + "epoch": 0.06761216662206761, + "grad_norm": 0.9697250723838806, + "learning_rate": 0.0004985911374546056, + "loss": 1.6581, + "step": 379 + }, + { + "epoch": 0.06779056284006779, + "grad_norm": 8.421307563781738, + "learning_rate": 0.0004985837000525343, + "loss": 1.4384, + "step": 380 + }, + { + "epoch": 0.06796895905806796, + "grad_norm": 1.0877493619918823, + "learning_rate": 0.0004985762431268062, + "loss": 1.6454, + "step": 381 + }, + { + "epoch": 0.06814735527606815, + "grad_norm": 0.8904185891151428, + "learning_rate": 0.0004985687666780069, + "loss": 1.611, + "step": 382 + }, + { + "epoch": 0.06832575149406833, + "grad_norm": 0.9544228911399841, + "learning_rate": 0.0004985612707067237, + "loss": 1.3702, + "step": 383 + }, + { + "epoch": 0.0685041477120685, + "grad_norm": 0.9072893261909485, + "learning_rate": 0.0004985537552135451, + "loss": 1.6441, + "step": 384 + }, + { + "epoch": 0.06868254393006869, + "grad_norm": 0.8297262191772461, + "learning_rate": 0.0004985462201990617, + "loss": 1.7376, + "step": 385 + }, + { + "epoch": 0.06886094014806886, + "grad_norm": 1.0085870027542114, + "learning_rate": 0.000498538665663865, + "loss": 1.5545, + "step": 386 + }, + { + "epoch": 0.06903933636606904, + "grad_norm": 0.7521477341651917, + "learning_rate": 0.0004985310916085485, + "loss": 1.77, + "step": 387 + }, + { + "epoch": 0.06921773258406921, + "grad_norm": 0.8971685767173767, + "learning_rate": 0.000498523498033707, + "loss": 1.6057, + "step": 388 + }, + { + "epoch": 0.0693961288020694, + "grad_norm": 0.7471177577972412, + "learning_rate": 0.000498515884939937, + "loss": 1.6315, + "step": 389 + }, + { + "epoch": 0.06957452502006957, + "grad_norm": 1.5577987432479858, + "learning_rate": 0.0004985082523278363, + "loss": 1.7456, + "step": 390 + }, + { + "epoch": 0.06975292123806975, + "grad_norm": 0.7326628565788269, + "learning_rate": 0.0004985006001980044, + "loss": 1.8009, + "step": 391 + }, + { + "epoch": 0.06993131745606992, + "grad_norm": 0.6636003851890564, + "learning_rate": 0.0004984929285510423, + "loss": 1.6494, + "step": 392 + }, + { + "epoch": 0.07010971367407011, + "grad_norm": 0.7074577808380127, + "learning_rate": 0.0004984852373875524, + "loss": 1.5621, + "step": 393 + }, + { + "epoch": 0.0702881098920703, + "grad_norm": 0.682092547416687, + "learning_rate": 0.000498477526708139, + "loss": 1.3208, + "step": 394 + }, + { + "epoch": 0.07046650611007046, + "grad_norm": 0.692871630191803, + "learning_rate": 0.0004984697965134076, + "loss": 1.2526, + "step": 395 + }, + { + "epoch": 0.07064490232807065, + "grad_norm": 0.6794890761375427, + "learning_rate": 0.0004984620468039653, + "loss": 1.772, + "step": 396 + }, + { + "epoch": 0.07082329854607082, + "grad_norm": 0.9123995304107666, + "learning_rate": 0.0004984542775804207, + "loss": 1.6829, + "step": 397 + }, + { + "epoch": 0.071001694764071, + "grad_norm": 0.9195268154144287, + "learning_rate": 0.0004984464888433842, + "loss": 1.5221, + "step": 398 + }, + { + "epoch": 0.07118009098207118, + "grad_norm": 0.6694832444190979, + "learning_rate": 0.0004984386805934672, + "loss": 1.5026, + "step": 399 + }, + { + "epoch": 0.07135848720007136, + "grad_norm": 0.7524397969245911, + "learning_rate": 0.0004984308528312833, + "loss": 1.6038, + "step": 400 + }, + { + "epoch": 0.07153688341807153, + "grad_norm": 1.0407260656356812, + "learning_rate": 0.000498423005557447, + "loss": 1.9059, + "step": 401 + }, + { + "epoch": 0.07171527963607172, + "grad_norm": 0.9182829260826111, + "learning_rate": 0.0004984151387725748, + "loss": 1.8141, + "step": 402 + }, + { + "epoch": 0.07189367585407189, + "grad_norm": 0.67998206615448, + "learning_rate": 0.0004984072524772845, + "loss": 1.607, + "step": 403 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 0.6888518333435059, + "learning_rate": 0.0004983993466721955, + "loss": 1.8157, + "step": 404 + }, + { + "epoch": 0.07225046829007226, + "grad_norm": 0.9161537289619446, + "learning_rate": 0.0004983914213579287, + "loss": 1.28, + "step": 405 + }, + { + "epoch": 0.07242886450807243, + "grad_norm": 0.8046262860298157, + "learning_rate": 0.0004983834765351065, + "loss": 1.7245, + "step": 406 + }, + { + "epoch": 0.07260726072607261, + "grad_norm": 0.7060288786888123, + "learning_rate": 0.000498375512204353, + "loss": 1.3604, + "step": 407 + }, + { + "epoch": 0.07278565694407278, + "grad_norm": 0.6991462707519531, + "learning_rate": 0.0004983675283662936, + "loss": 1.3504, + "step": 408 + }, + { + "epoch": 0.07296405316207297, + "grad_norm": 0.7093139886856079, + "learning_rate": 0.0004983595250215556, + "loss": 1.3739, + "step": 409 + }, + { + "epoch": 0.07314244938007314, + "grad_norm": 0.7139406800270081, + "learning_rate": 0.0004983515021707672, + "loss": 1.6803, + "step": 410 + }, + { + "epoch": 0.07332084559807332, + "grad_norm": 0.6523078680038452, + "learning_rate": 0.0004983434598145587, + "loss": 1.5356, + "step": 411 + }, + { + "epoch": 0.0734992418160735, + "grad_norm": 0.7209059596061707, + "learning_rate": 0.0004983353979535617, + "loss": 1.5146, + "step": 412 + }, + { + "epoch": 0.07367763803407368, + "grad_norm": 0.8449965715408325, + "learning_rate": 0.0004983273165884096, + "loss": 1.5411, + "step": 413 + }, + { + "epoch": 0.07385603425207386, + "grad_norm": 0.8013091683387756, + "learning_rate": 0.0004983192157197368, + "loss": 1.6204, + "step": 414 + }, + { + "epoch": 0.07403443047007403, + "grad_norm": 0.7177372574806213, + "learning_rate": 0.0004983110953481796, + "loss": 1.3559, + "step": 415 + }, + { + "epoch": 0.07421282668807422, + "grad_norm": 1.297348976135254, + "learning_rate": 0.0004983029554743759, + "loss": 1.3917, + "step": 416 + }, + { + "epoch": 0.07439122290607439, + "grad_norm": 0.7415265440940857, + "learning_rate": 0.0004982947960989649, + "loss": 1.4611, + "step": 417 + }, + { + "epoch": 0.07456961912407457, + "grad_norm": 0.7333908677101135, + "learning_rate": 0.0004982866172225875, + "loss": 1.5049, + "step": 418 + }, + { + "epoch": 0.07474801534207474, + "grad_norm": 0.6998400092124939, + "learning_rate": 0.000498278418845886, + "loss": 1.5281, + "step": 419 + }, + { + "epoch": 0.07492641156007493, + "grad_norm": 0.8220930099487305, + "learning_rate": 0.0004982702009695044, + "loss": 1.6659, + "step": 420 + }, + { + "epoch": 0.0751048077780751, + "grad_norm": 0.6637287139892578, + "learning_rate": 0.0004982619635940879, + "loss": 1.5122, + "step": 421 + }, + { + "epoch": 0.07528320399607528, + "grad_norm": 0.6980449557304382, + "learning_rate": 0.0004982537067202837, + "loss": 1.4508, + "step": 422 + }, + { + "epoch": 0.07546160021407546, + "grad_norm": 0.7847539782524109, + "learning_rate": 0.0004982454303487403, + "loss": 1.5511, + "step": 423 + }, + { + "epoch": 0.07563999643207564, + "grad_norm": 0.7317553758621216, + "learning_rate": 0.0004982371344801074, + "loss": 1.6737, + "step": 424 + }, + { + "epoch": 0.07581839265007582, + "grad_norm": 0.816692054271698, + "learning_rate": 0.000498228819115037, + "loss": 1.6792, + "step": 425 + }, + { + "epoch": 0.075996788868076, + "grad_norm": 0.6910188794136047, + "learning_rate": 0.0004982204842541818, + "loss": 1.6721, + "step": 426 + }, + { + "epoch": 0.07617518508607618, + "grad_norm": 0.6814787983894348, + "learning_rate": 0.0004982121298981967, + "loss": 1.3176, + "step": 427 + }, + { + "epoch": 0.07635358130407635, + "grad_norm": 1.1283695697784424, + "learning_rate": 0.0004982037560477377, + "loss": 1.6094, + "step": 428 + }, + { + "epoch": 0.07653197752207654, + "grad_norm": 0.7835364937782288, + "learning_rate": 0.0004981953627034625, + "loss": 1.6079, + "step": 429 + }, + { + "epoch": 0.0767103737400767, + "grad_norm": 0.9054422974586487, + "learning_rate": 0.0004981869498660304, + "loss": 1.5844, + "step": 430 + }, + { + "epoch": 0.07688876995807689, + "grad_norm": 1.4797576665878296, + "learning_rate": 0.000498178517536102, + "loss": 1.6547, + "step": 431 + }, + { + "epoch": 0.07706716617607706, + "grad_norm": 0.9180254340171814, + "learning_rate": 0.0004981700657143396, + "loss": 1.6352, + "step": 432 + }, + { + "epoch": 0.07724556239407725, + "grad_norm": 0.703322172164917, + "learning_rate": 0.0004981615944014071, + "loss": 1.3254, + "step": 433 + }, + { + "epoch": 0.07742395861207742, + "grad_norm": 0.794771134853363, + "learning_rate": 0.0004981531035979697, + "loss": 1.6643, + "step": 434 + }, + { + "epoch": 0.0776023548300776, + "grad_norm": 0.7554542422294617, + "learning_rate": 0.0004981445933046944, + "loss": 1.393, + "step": 435 + }, + { + "epoch": 0.07778075104807779, + "grad_norm": 0.8355424404144287, + "learning_rate": 0.0004981360635222495, + "loss": 1.7901, + "step": 436 + }, + { + "epoch": 0.07795914726607796, + "grad_norm": 0.6845025420188904, + "learning_rate": 0.0004981275142513049, + "loss": 1.6268, + "step": 437 + }, + { + "epoch": 0.07813754348407814, + "grad_norm": 0.7474300265312195, + "learning_rate": 0.0004981189454925322, + "loss": 1.2496, + "step": 438 + }, + { + "epoch": 0.07831593970207831, + "grad_norm": 0.6945734620094299, + "learning_rate": 0.0004981103572466042, + "loss": 1.7841, + "step": 439 + }, + { + "epoch": 0.0784943359200785, + "grad_norm": 0.6879515051841736, + "learning_rate": 0.0004981017495141955, + "loss": 1.4973, + "step": 440 + }, + { + "epoch": 0.07867273213807867, + "grad_norm": 0.9907708168029785, + "learning_rate": 0.0004980931222959823, + "loss": 1.9778, + "step": 441 + }, + { + "epoch": 0.07885112835607885, + "grad_norm": 1.6716070175170898, + "learning_rate": 0.000498084475592642, + "loss": 1.8797, + "step": 442 + }, + { + "epoch": 0.07902952457407902, + "grad_norm": 0.8019148707389832, + "learning_rate": 0.0004980758094048536, + "loss": 1.4991, + "step": 443 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 0.7047806978225708, + "learning_rate": 0.000498067123733298, + "loss": 1.4349, + "step": 444 + }, + { + "epoch": 0.07938631701007938, + "grad_norm": 0.7296415567398071, + "learning_rate": 0.0004980584185786573, + "loss": 1.504, + "step": 445 + }, + { + "epoch": 0.07956471322807956, + "grad_norm": 0.8090648651123047, + "learning_rate": 0.0004980496939416151, + "loss": 1.5402, + "step": 446 + }, + { + "epoch": 0.07974310944607975, + "grad_norm": 0.7694994807243347, + "learning_rate": 0.0004980409498228566, + "loss": 1.6298, + "step": 447 + }, + { + "epoch": 0.07992150566407992, + "grad_norm": 1.529492735862732, + "learning_rate": 0.0004980321862230688, + "loss": 1.4919, + "step": 448 + }, + { + "epoch": 0.0800999018820801, + "grad_norm": 0.7055628299713135, + "learning_rate": 0.0004980234031429397, + "loss": 1.3756, + "step": 449 + }, + { + "epoch": 0.08027829810008028, + "grad_norm": 0.7243545055389404, + "learning_rate": 0.0004980146005831592, + "loss": 1.4947, + "step": 450 + }, + { + "epoch": 0.08045669431808046, + "grad_norm": 0.7006012797355652, + "learning_rate": 0.0004980057785444189, + "loss": 1.6987, + "step": 451 + }, + { + "epoch": 0.08063509053608063, + "grad_norm": 0.7193490862846375, + "learning_rate": 0.0004979969370274113, + "loss": 1.6076, + "step": 452 + }, + { + "epoch": 0.08081348675408082, + "grad_norm": 0.7039264440536499, + "learning_rate": 0.0004979880760328312, + "loss": 1.708, + "step": 453 + }, + { + "epoch": 0.08099188297208099, + "grad_norm": 0.7600789666175842, + "learning_rate": 0.0004979791955613741, + "loss": 1.9332, + "step": 454 + }, + { + "epoch": 0.08117027919008117, + "grad_norm": 4.260866165161133, + "learning_rate": 0.0004979702956137378, + "loss": 1.5614, + "step": 455 + }, + { + "epoch": 0.08134867540808134, + "grad_norm": 0.7909694314002991, + "learning_rate": 0.0004979613761906212, + "loss": 1.5609, + "step": 456 + }, + { + "epoch": 0.08152707162608153, + "grad_norm": 1.7039103507995605, + "learning_rate": 0.0004979524372927248, + "loss": 1.6149, + "step": 457 + }, + { + "epoch": 0.08170546784408171, + "grad_norm": 0.7937911152839661, + "learning_rate": 0.0004979434789207506, + "loss": 1.4368, + "step": 458 + }, + { + "epoch": 0.08188386406208188, + "grad_norm": 1.0153762102127075, + "learning_rate": 0.0004979345010754024, + "loss": 1.4232, + "step": 459 + }, + { + "epoch": 0.08206226028008207, + "grad_norm": 0.7721362709999084, + "learning_rate": 0.0004979255037573851, + "loss": 1.6384, + "step": 460 + }, + { + "epoch": 0.08224065649808224, + "grad_norm": 0.6385205388069153, + "learning_rate": 0.0004979164869674055, + "loss": 1.2598, + "step": 461 + }, + { + "epoch": 0.08241905271608242, + "grad_norm": 0.7460764646530151, + "learning_rate": 0.0004979074507061716, + "loss": 1.4009, + "step": 462 + }, + { + "epoch": 0.08259744893408259, + "grad_norm": 0.6863163113594055, + "learning_rate": 0.0004978983949743932, + "loss": 1.156, + "step": 463 + }, + { + "epoch": 0.08277584515208278, + "grad_norm": 0.9043185710906982, + "learning_rate": 0.0004978893197727817, + "loss": 1.4024, + "step": 464 + }, + { + "epoch": 0.08295424137008295, + "grad_norm": 0.8068817853927612, + "learning_rate": 0.0004978802251020494, + "loss": 1.5392, + "step": 465 + }, + { + "epoch": 0.08313263758808313, + "grad_norm": 0.8445210456848145, + "learning_rate": 0.0004978711109629112, + "loss": 1.3355, + "step": 466 + }, + { + "epoch": 0.08331103380608332, + "grad_norm": 0.9372073411941528, + "learning_rate": 0.0004978619773560824, + "loss": 1.5414, + "step": 467 + }, + { + "epoch": 0.08348943002408349, + "grad_norm": 0.7277926206588745, + "learning_rate": 0.0004978528242822807, + "loss": 1.3025, + "step": 468 + }, + { + "epoch": 0.08366782624208367, + "grad_norm": 0.8729844093322754, + "learning_rate": 0.0004978436517422248, + "loss": 1.5157, + "step": 469 + }, + { + "epoch": 0.08384622246008384, + "grad_norm": 0.6946988105773926, + "learning_rate": 0.0004978344597366351, + "loss": 1.5723, + "step": 470 + }, + { + "epoch": 0.08402461867808403, + "grad_norm": 0.6131045818328857, + "learning_rate": 0.0004978252482662337, + "loss": 1.3419, + "step": 471 + }, + { + "epoch": 0.0842030148960842, + "grad_norm": 0.8643115758895874, + "learning_rate": 0.0004978160173317438, + "loss": 1.609, + "step": 472 + }, + { + "epoch": 0.08438141111408438, + "grad_norm": 0.8250529170036316, + "learning_rate": 0.0004978067669338906, + "loss": 1.7085, + "step": 473 + }, + { + "epoch": 0.08455980733208456, + "grad_norm": 0.8330903649330139, + "learning_rate": 0.0004977974970734006, + "loss": 1.5074, + "step": 474 + }, + { + "epoch": 0.08473820355008474, + "grad_norm": 0.7635675072669983, + "learning_rate": 0.0004977882077510018, + "loss": 1.4812, + "step": 475 + }, + { + "epoch": 0.08491659976808491, + "grad_norm": 1.1247259378433228, + "learning_rate": 0.0004977788989674238, + "loss": 1.4147, + "step": 476 + }, + { + "epoch": 0.0850949959860851, + "grad_norm": 0.6841705441474915, + "learning_rate": 0.0004977695707233977, + "loss": 1.3934, + "step": 477 + }, + { + "epoch": 0.08527339220408528, + "grad_norm": 0.7214035987854004, + "learning_rate": 0.0004977602230196561, + "loss": 1.3389, + "step": 478 + }, + { + "epoch": 0.08545178842208545, + "grad_norm": 0.7651268243789673, + "learning_rate": 0.0004977508558569332, + "loss": 1.5814, + "step": 479 + }, + { + "epoch": 0.08563018464008564, + "grad_norm": 0.7980949282646179, + "learning_rate": 0.0004977414692359648, + "loss": 1.5766, + "step": 480 + }, + { + "epoch": 0.0858085808580858, + "grad_norm": 2.5435357093811035, + "learning_rate": 0.0004977320631574879, + "loss": 1.4437, + "step": 481 + }, + { + "epoch": 0.08598697707608599, + "grad_norm": 0.8370219469070435, + "learning_rate": 0.0004977226376222415, + "loss": 1.3928, + "step": 482 + }, + { + "epoch": 0.08616537329408616, + "grad_norm": 0.6488362550735474, + "learning_rate": 0.0004977131926309656, + "loss": 1.6902, + "step": 483 + }, + { + "epoch": 0.08634376951208635, + "grad_norm": 0.7219479084014893, + "learning_rate": 0.0004977037281844023, + "loss": 1.5692, + "step": 484 + }, + { + "epoch": 0.08652216573008652, + "grad_norm": 1.9494524002075195, + "learning_rate": 0.0004976942442832946, + "loss": 1.5357, + "step": 485 + }, + { + "epoch": 0.0867005619480867, + "grad_norm": 0.8252092003822327, + "learning_rate": 0.0004976847409283876, + "loss": 1.566, + "step": 486 + }, + { + "epoch": 0.08687895816608687, + "grad_norm": 0.624910295009613, + "learning_rate": 0.0004976752181204277, + "loss": 1.5799, + "step": 487 + }, + { + "epoch": 0.08705735438408706, + "grad_norm": 0.8229494094848633, + "learning_rate": 0.0004976656758601628, + "loss": 1.3698, + "step": 488 + }, + { + "epoch": 0.08723575060208724, + "grad_norm": 0.8363699316978455, + "learning_rate": 0.0004976561141483421, + "loss": 1.3137, + "step": 489 + }, + { + "epoch": 0.08741414682008741, + "grad_norm": 0.665203869342804, + "learning_rate": 0.0004976465329857169, + "loss": 1.579, + "step": 490 + }, + { + "epoch": 0.0875925430380876, + "grad_norm": 0.7317224144935608, + "learning_rate": 0.0004976369323730396, + "loss": 1.6404, + "step": 491 + }, + { + "epoch": 0.08777093925608777, + "grad_norm": 0.7230332493782043, + "learning_rate": 0.0004976273123110642, + "loss": 1.5054, + "step": 492 + }, + { + "epoch": 0.08794933547408795, + "grad_norm": 0.9114980697631836, + "learning_rate": 0.0004976176728005462, + "loss": 1.5787, + "step": 493 + }, + { + "epoch": 0.08812773169208812, + "grad_norm": 0.7472654581069946, + "learning_rate": 0.0004976080138422428, + "loss": 1.6348, + "step": 494 + }, + { + "epoch": 0.08830612791008831, + "grad_norm": 0.7824138402938843, + "learning_rate": 0.0004975983354369124, + "loss": 1.5032, + "step": 495 + }, + { + "epoch": 0.08848452412808848, + "grad_norm": 0.6334506869316101, + "learning_rate": 0.0004975886375853155, + "loss": 1.3295, + "step": 496 + }, + { + "epoch": 0.08866292034608866, + "grad_norm": 1.3684911727905273, + "learning_rate": 0.0004975789202882133, + "loss": 1.6436, + "step": 497 + }, + { + "epoch": 0.08884131656408883, + "grad_norm": 0.7262062430381775, + "learning_rate": 0.0004975691835463694, + "loss": 1.4747, + "step": 498 + }, + { + "epoch": 0.08901971278208902, + "grad_norm": 0.6823199987411499, + "learning_rate": 0.0004975594273605484, + "loss": 1.4783, + "step": 499 + }, + { + "epoch": 0.0891981090000892, + "grad_norm": 0.5289890766143799, + "learning_rate": 0.0004975496517315164, + "loss": 1.1087, + "step": 500 + }, + { + "epoch": 0.08937650521808937, + "grad_norm": 0.8217448592185974, + "learning_rate": 0.0004975398566600413, + "loss": 1.9442, + "step": 501 + }, + { + "epoch": 0.08955490143608956, + "grad_norm": 0.6704925894737244, + "learning_rate": 0.0004975300421468925, + "loss": 1.6343, + "step": 502 + }, + { + "epoch": 0.08973329765408973, + "grad_norm": 0.8133907914161682, + "learning_rate": 0.0004975202081928405, + "loss": 1.7413, + "step": 503 + }, + { + "epoch": 0.08991169387208992, + "grad_norm": 0.6570431590080261, + "learning_rate": 0.0004975103547986581, + "loss": 1.4866, + "step": 504 + }, + { + "epoch": 0.09009009009009009, + "grad_norm": 0.7188069820404053, + "learning_rate": 0.0004975004819651188, + "loss": 1.6916, + "step": 505 + }, + { + "epoch": 0.09026848630809027, + "grad_norm": 1.0071742534637451, + "learning_rate": 0.0004974905896929981, + "loss": 1.458, + "step": 506 + }, + { + "epoch": 0.09044688252609044, + "grad_norm": 0.7110443711280823, + "learning_rate": 0.0004974806779830731, + "loss": 1.4134, + "step": 507 + }, + { + "epoch": 0.09062527874409063, + "grad_norm": 0.6911363005638123, + "learning_rate": 0.0004974707468361221, + "loss": 1.5558, + "step": 508 + }, + { + "epoch": 0.0908036749620908, + "grad_norm": 0.632495641708374, + "learning_rate": 0.0004974607962529252, + "loss": 1.2942, + "step": 509 + }, + { + "epoch": 0.09098207118009098, + "grad_norm": 0.7343394160270691, + "learning_rate": 0.0004974508262342638, + "loss": 1.5239, + "step": 510 + }, + { + "epoch": 0.09116046739809117, + "grad_norm": 1.4106143712997437, + "learning_rate": 0.000497440836780921, + "loss": 1.3772, + "step": 511 + }, + { + "epoch": 0.09133886361609134, + "grad_norm": 0.7864501476287842, + "learning_rate": 0.0004974308278936813, + "loss": 1.672, + "step": 512 + }, + { + "epoch": 0.09151725983409152, + "grad_norm": 0.7116146087646484, + "learning_rate": 0.000497420799573331, + "loss": 1.3471, + "step": 513 + }, + { + "epoch": 0.09169565605209169, + "grad_norm": 0.6454455256462097, + "learning_rate": 0.0004974107518206575, + "loss": 1.5374, + "step": 514 + }, + { + "epoch": 0.09187405227009188, + "grad_norm": 1.5066065788269043, + "learning_rate": 0.00049740068463645, + "loss": 1.1575, + "step": 515 + }, + { + "epoch": 0.09205244848809205, + "grad_norm": 0.8750366568565369, + "learning_rate": 0.0004973905980214992, + "loss": 1.342, + "step": 516 + }, + { + "epoch": 0.09223084470609223, + "grad_norm": 1.1781761646270752, + "learning_rate": 0.0004973804919765973, + "loss": 1.5306, + "step": 517 + }, + { + "epoch": 0.0924092409240924, + "grad_norm": 0.7154638767242432, + "learning_rate": 0.0004973703665025381, + "loss": 1.3626, + "step": 518 + }, + { + "epoch": 0.09258763714209259, + "grad_norm": 0.8861932158470154, + "learning_rate": 0.0004973602216001166, + "loss": 1.4045, + "step": 519 + }, + { + "epoch": 0.09276603336009277, + "grad_norm": 0.8264048099517822, + "learning_rate": 0.0004973500572701299, + "loss": 1.5786, + "step": 520 + }, + { + "epoch": 0.09294442957809294, + "grad_norm": 0.6503618955612183, + "learning_rate": 0.0004973398735133761, + "loss": 1.2953, + "step": 521 + }, + { + "epoch": 0.09312282579609313, + "grad_norm": 1.5494425296783447, + "learning_rate": 0.000497329670330655, + "loss": 1.7067, + "step": 522 + }, + { + "epoch": 0.0933012220140933, + "grad_norm": 0.7740722298622131, + "learning_rate": 0.0004973194477227681, + "loss": 1.7424, + "step": 523 + }, + { + "epoch": 0.09347961823209348, + "grad_norm": 0.6886745095252991, + "learning_rate": 0.0004973092056905181, + "loss": 1.4517, + "step": 524 + }, + { + "epoch": 0.09365801445009365, + "grad_norm": 0.9685157537460327, + "learning_rate": 0.0004972989442347097, + "loss": 1.3515, + "step": 525 + }, + { + "epoch": 0.09383641066809384, + "grad_norm": 0.6641685962677002, + "learning_rate": 0.0004972886633561486, + "loss": 1.4647, + "step": 526 + }, + { + "epoch": 0.09401480688609401, + "grad_norm": 0.705767035484314, + "learning_rate": 0.0004972783630556421, + "loss": 1.6306, + "step": 527 + }, + { + "epoch": 0.0941932031040942, + "grad_norm": 0.6863012909889221, + "learning_rate": 0.0004972680433339995, + "loss": 1.5197, + "step": 528 + }, + { + "epoch": 0.09437159932209437, + "grad_norm": 3.830077886581421, + "learning_rate": 0.0004972577041920312, + "loss": 1.2183, + "step": 529 + }, + { + "epoch": 0.09454999554009455, + "grad_norm": 0.8609939217567444, + "learning_rate": 0.0004972473456305493, + "loss": 1.5419, + "step": 530 + }, + { + "epoch": 0.09472839175809473, + "grad_norm": 1.5111016035079956, + "learning_rate": 0.0004972369676503671, + "loss": 1.4507, + "step": 531 + }, + { + "epoch": 0.0949067879760949, + "grad_norm": 0.8439610600471497, + "learning_rate": 0.0004972265702523, + "loss": 1.5155, + "step": 532 + }, + { + "epoch": 0.09508518419409509, + "grad_norm": 0.9149817228317261, + "learning_rate": 0.0004972161534371643, + "loss": 1.727, + "step": 533 + }, + { + "epoch": 0.09526358041209526, + "grad_norm": 1.0200440883636475, + "learning_rate": 0.0004972057172057784, + "loss": 1.6166, + "step": 534 + }, + { + "epoch": 0.09544197663009545, + "grad_norm": 0.9794648885726929, + "learning_rate": 0.0004971952615589619, + "loss": 1.5486, + "step": 535 + }, + { + "epoch": 0.09562037284809562, + "grad_norm": 1.2553775310516357, + "learning_rate": 0.0004971847864975358, + "loss": 1.5297, + "step": 536 + }, + { + "epoch": 0.0957987690660958, + "grad_norm": 0.9116058349609375, + "learning_rate": 0.0004971742920223231, + "loss": 1.7309, + "step": 537 + }, + { + "epoch": 0.09597716528409597, + "grad_norm": 1.0907950401306152, + "learning_rate": 0.0004971637781341478, + "loss": 1.4598, + "step": 538 + }, + { + "epoch": 0.09615556150209616, + "grad_norm": 1.32576322555542, + "learning_rate": 0.0004971532448338357, + "loss": 1.3408, + "step": 539 + }, + { + "epoch": 0.09633395772009633, + "grad_norm": 0.7169795036315918, + "learning_rate": 0.0004971426921222141, + "loss": 1.3837, + "step": 540 + }, + { + "epoch": 0.09651235393809651, + "grad_norm": 1.4852778911590576, + "learning_rate": 0.0004971321200001119, + "loss": 1.3068, + "step": 541 + }, + { + "epoch": 0.0966907501560967, + "grad_norm": 5.810755729675293, + "learning_rate": 0.0004971215284683592, + "loss": 1.3056, + "step": 542 + }, + { + "epoch": 0.09686914637409687, + "grad_norm": 1.1541168689727783, + "learning_rate": 0.0004971109175277882, + "loss": 1.3768, + "step": 543 + }, + { + "epoch": 0.09704754259209705, + "grad_norm": 0.8170933127403259, + "learning_rate": 0.000497100287179232, + "loss": 1.6132, + "step": 544 + }, + { + "epoch": 0.09722593881009722, + "grad_norm": 0.8202213644981384, + "learning_rate": 0.0004970896374235255, + "loss": 1.7538, + "step": 545 + }, + { + "epoch": 0.09740433502809741, + "grad_norm": 0.7768821120262146, + "learning_rate": 0.0004970789682615052, + "loss": 1.5705, + "step": 546 + }, + { + "epoch": 0.09758273124609758, + "grad_norm": 1.4286799430847168, + "learning_rate": 0.0004970682796940091, + "loss": 1.5751, + "step": 547 + }, + { + "epoch": 0.09776112746409776, + "grad_norm": 2.075110912322998, + "learning_rate": 0.0004970575717218767, + "loss": 1.5745, + "step": 548 + }, + { + "epoch": 0.09793952368209793, + "grad_norm": 0.8903990387916565, + "learning_rate": 0.0004970468443459489, + "loss": 1.4809, + "step": 549 + }, + { + "epoch": 0.09811791990009812, + "grad_norm": 0.8855850696563721, + "learning_rate": 0.0004970360975670682, + "loss": 1.7035, + "step": 550 + }, + { + "epoch": 0.09829631611809829, + "grad_norm": 0.8159774541854858, + "learning_rate": 0.0004970253313860787, + "loss": 1.2233, + "step": 551 + }, + { + "epoch": 0.09847471233609847, + "grad_norm": 2.709984302520752, + "learning_rate": 0.0004970145458038261, + "loss": 1.497, + "step": 552 + }, + { + "epoch": 0.09865310855409866, + "grad_norm": 1.0269032716751099, + "learning_rate": 0.0004970037408211573, + "loss": 1.5952, + "step": 553 + }, + { + "epoch": 0.09883150477209883, + "grad_norm": 1.1201132535934448, + "learning_rate": 0.000496992916438921, + "loss": 1.3275, + "step": 554 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 0.8947777152061462, + "learning_rate": 0.0004969820726579673, + "loss": 1.4586, + "step": 555 + }, + { + "epoch": 0.09918829720809919, + "grad_norm": 0.9649630784988403, + "learning_rate": 0.0004969712094791479, + "loss": 1.4422, + "step": 556 + }, + { + "epoch": 0.09936669342609937, + "grad_norm": 0.9738620519638062, + "learning_rate": 0.0004969603269033159, + "loss": 1.4171, + "step": 557 + }, + { + "epoch": 0.09954508964409954, + "grad_norm": 0.7529584169387817, + "learning_rate": 0.0004969494249313262, + "loss": 1.2664, + "step": 558 + }, + { + "epoch": 0.09972348586209973, + "grad_norm": 0.8182567358016968, + "learning_rate": 0.000496938503564035, + "loss": 1.5923, + "step": 559 + }, + { + "epoch": 0.0999018820800999, + "grad_norm": 0.7630960941314697, + "learning_rate": 0.0004969275628023, + "loss": 1.3102, + "step": 560 + }, + { + "epoch": 0.10008027829810008, + "grad_norm": 0.7095026969909668, + "learning_rate": 0.0004969166026469803, + "loss": 1.3446, + "step": 561 + }, + { + "epoch": 0.10025867451610025, + "grad_norm": 0.7493692636489868, + "learning_rate": 0.0004969056230989371, + "loss": 1.5735, + "step": 562 + }, + { + "epoch": 0.10043707073410044, + "grad_norm": 0.6589444279670715, + "learning_rate": 0.0004968946241590324, + "loss": 1.2374, + "step": 563 + }, + { + "epoch": 0.10061546695210062, + "grad_norm": 1.1761233806610107, + "learning_rate": 0.0004968836058281301, + "loss": 1.4974, + "step": 564 + }, + { + "epoch": 0.10079386317010079, + "grad_norm": 0.6165998578071594, + "learning_rate": 0.0004968725681070957, + "loss": 1.1654, + "step": 565 + }, + { + "epoch": 0.10097225938810098, + "grad_norm": 0.7280291318893433, + "learning_rate": 0.0004968615109967961, + "loss": 1.4225, + "step": 566 + }, + { + "epoch": 0.10115065560610115, + "grad_norm": 1.2140183448791504, + "learning_rate": 0.0004968504344980997, + "loss": 1.6689, + "step": 567 + }, + { + "epoch": 0.10132905182410133, + "grad_norm": 0.7183876037597656, + "learning_rate": 0.0004968393386118763, + "loss": 1.5669, + "step": 568 + }, + { + "epoch": 0.1015074480421015, + "grad_norm": 0.6638524532318115, + "learning_rate": 0.0004968282233389974, + "loss": 1.1266, + "step": 569 + }, + { + "epoch": 0.10168584426010169, + "grad_norm": 0.7190263271331787, + "learning_rate": 0.0004968170886803362, + "loss": 1.363, + "step": 570 + }, + { + "epoch": 0.10186424047810186, + "grad_norm": 0.8314970135688782, + "learning_rate": 0.000496805934636767, + "loss": 1.6896, + "step": 571 + }, + { + "epoch": 0.10204263669610204, + "grad_norm": 0.6682543158531189, + "learning_rate": 0.0004967947612091659, + "loss": 1.4487, + "step": 572 + }, + { + "epoch": 0.10222103291410223, + "grad_norm": 0.806235134601593, + "learning_rate": 0.0004967835683984105, + "loss": 1.3844, + "step": 573 + }, + { + "epoch": 0.1023994291321024, + "grad_norm": 0.7934510707855225, + "learning_rate": 0.0004967723562053798, + "loss": 1.5275, + "step": 574 + }, + { + "epoch": 0.10257782535010258, + "grad_norm": 0.6725155115127563, + "learning_rate": 0.0004967611246309544, + "loss": 1.3457, + "step": 575 + }, + { + "epoch": 0.10275622156810275, + "grad_norm": 1.1390475034713745, + "learning_rate": 0.0004967498736760165, + "loss": 1.5195, + "step": 576 + }, + { + "epoch": 0.10293461778610294, + "grad_norm": 0.7463436126708984, + "learning_rate": 0.0004967386033414498, + "loss": 1.4778, + "step": 577 + }, + { + "epoch": 0.10311301400410311, + "grad_norm": 0.6401159763336182, + "learning_rate": 0.0004967273136281392, + "loss": 1.0325, + "step": 578 + }, + { + "epoch": 0.1032914102221033, + "grad_norm": 0.7498372793197632, + "learning_rate": 0.0004967160045369716, + "loss": 1.6773, + "step": 579 + }, + { + "epoch": 0.10346980644010347, + "grad_norm": 0.6850476861000061, + "learning_rate": 0.0004967046760688353, + "loss": 1.4975, + "step": 580 + }, + { + "epoch": 0.10364820265810365, + "grad_norm": 0.6645762324333191, + "learning_rate": 0.0004966933282246198, + "loss": 1.5224, + "step": 581 + }, + { + "epoch": 0.10382659887610382, + "grad_norm": 0.9928523898124695, + "learning_rate": 0.0004966819610052165, + "loss": 1.2436, + "step": 582 + }, + { + "epoch": 0.104004995094104, + "grad_norm": 2.335838794708252, + "learning_rate": 0.0004966705744115182, + "loss": 1.4003, + "step": 583 + }, + { + "epoch": 0.10418339131210419, + "grad_norm": 0.7920882701873779, + "learning_rate": 0.0004966591684444191, + "loss": 1.3903, + "step": 584 + }, + { + "epoch": 0.10436178753010436, + "grad_norm": 1.7378654479980469, + "learning_rate": 0.000496647743104815, + "loss": 1.5465, + "step": 585 + }, + { + "epoch": 0.10454018374810455, + "grad_norm": 1.0265392065048218, + "learning_rate": 0.0004966362983936033, + "loss": 1.639, + "step": 586 + }, + { + "epoch": 0.10471857996610472, + "grad_norm": 25.1031436920166, + "learning_rate": 0.0004966248343116828, + "loss": 1.8437, + "step": 587 + }, + { + "epoch": 0.1048969761841049, + "grad_norm": 2.059586524963379, + "learning_rate": 0.0004966133508599541, + "loss": 1.4263, + "step": 588 + }, + { + "epoch": 0.10507537240210507, + "grad_norm": 1.6019502878189087, + "learning_rate": 0.0004966018480393188, + "loss": 1.2302, + "step": 589 + }, + { + "epoch": 0.10525376862010526, + "grad_norm": 7.920602798461914, + "learning_rate": 0.0004965903258506806, + "loss": 1.5145, + "step": 590 + }, + { + "epoch": 0.10543216483810543, + "grad_norm": 8.575254440307617, + "learning_rate": 0.0004965787842949443, + "loss": 1.138, + "step": 591 + }, + { + "epoch": 0.10561056105610561, + "grad_norm": 9.764083862304688, + "learning_rate": 0.0004965672233730163, + "loss": 1.8386, + "step": 592 + }, + { + "epoch": 0.10578895727410578, + "grad_norm": 2.423666000366211, + "learning_rate": 0.0004965556430858049, + "loss": 1.2881, + "step": 593 + }, + { + "epoch": 0.10596735349210597, + "grad_norm": 1.2473442554473877, + "learning_rate": 0.0004965440434342191, + "loss": 1.1922, + "step": 594 + }, + { + "epoch": 0.10614574971010615, + "grad_norm": 0.9045082330703735, + "learning_rate": 0.0004965324244191704, + "loss": 1.3325, + "step": 595 + }, + { + "epoch": 0.10632414592810632, + "grad_norm": 1.219525694847107, + "learning_rate": 0.0004965207860415711, + "loss": 1.6335, + "step": 596 + }, + { + "epoch": 0.10650254214610651, + "grad_norm": 5.38252592086792, + "learning_rate": 0.0004965091283023355, + "loss": 1.425, + "step": 597 + }, + { + "epoch": 0.10668093836410668, + "grad_norm": 2.2056992053985596, + "learning_rate": 0.0004964974512023789, + "loss": 1.5319, + "step": 598 + }, + { + "epoch": 0.10685933458210686, + "grad_norm": 0.9872840642929077, + "learning_rate": 0.0004964857547426186, + "loss": 1.1613, + "step": 599 + }, + { + "epoch": 0.10703773080010703, + "grad_norm": 1.287277102470398, + "learning_rate": 0.0004964740389239732, + "loss": 1.6958, + "step": 600 + }, + { + "epoch": 0.10721612701810722, + "grad_norm": 1.0453811883926392, + "learning_rate": 0.0004964623037473628, + "loss": 1.7414, + "step": 601 + }, + { + "epoch": 0.10739452323610739, + "grad_norm": 1.040404200553894, + "learning_rate": 0.0004964505492137092, + "loss": 1.482, + "step": 602 + }, + { + "epoch": 0.10757291945410757, + "grad_norm": 0.5612449645996094, + "learning_rate": 0.0004964387753239355, + "loss": 1.3086, + "step": 603 + }, + { + "epoch": 0.10775131567210774, + "grad_norm": 42.32016372680664, + "learning_rate": 0.0004964269820789664, + "loss": 1.3425, + "step": 604 + }, + { + "epoch": 0.10792971189010793, + "grad_norm": 51.25515365600586, + "learning_rate": 0.0004964151694797282, + "loss": 1.6645, + "step": 605 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 1.5014013051986694, + "learning_rate": 0.0004964033375271486, + "loss": 1.5083, + "step": 606 + }, + { + "epoch": 0.10828650432610828, + "grad_norm": 2.11055588722229, + "learning_rate": 0.0004963914862221569, + "loss": 1.2624, + "step": 607 + }, + { + "epoch": 0.10846490054410847, + "grad_norm": 1.610378623008728, + "learning_rate": 0.000496379615565684, + "loss": 1.5693, + "step": 608 + }, + { + "epoch": 0.10864329676210864, + "grad_norm": 0.88837069272995, + "learning_rate": 0.000496367725558662, + "loss": 1.5379, + "step": 609 + }, + { + "epoch": 0.10882169298010883, + "grad_norm": 1.831912875175476, + "learning_rate": 0.000496355816202025, + "loss": 1.3983, + "step": 610 + }, + { + "epoch": 0.109000089198109, + "grad_norm": 0.7277733087539673, + "learning_rate": 0.0004963438874967081, + "loss": 1.5717, + "step": 611 + }, + { + "epoch": 0.10917848541610918, + "grad_norm": 0.7317838072776794, + "learning_rate": 0.0004963319394436483, + "loss": 1.2432, + "step": 612 + }, + { + "epoch": 0.10935688163410935, + "grad_norm": 5.225924491882324, + "learning_rate": 0.000496319972043784, + "loss": 1.3503, + "step": 613 + }, + { + "epoch": 0.10953527785210954, + "grad_norm": 1.8268285989761353, + "learning_rate": 0.0004963079852980551, + "loss": 1.5332, + "step": 614 + }, + { + "epoch": 0.1097136740701097, + "grad_norm": 0.9872994422912598, + "learning_rate": 0.000496295979207403, + "loss": 1.5261, + "step": 615 + }, + { + "epoch": 0.10989207028810989, + "grad_norm": 8.904617309570312, + "learning_rate": 0.0004962839537727706, + "loss": 1.4517, + "step": 616 + }, + { + "epoch": 0.11007046650611008, + "grad_norm": 1.2641127109527588, + "learning_rate": 0.0004962719089951027, + "loss": 1.6554, + "step": 617 + }, + { + "epoch": 0.11024886272411025, + "grad_norm": 0.7095901370048523, + "learning_rate": 0.0004962598448753448, + "loss": 1.5814, + "step": 618 + }, + { + "epoch": 0.11042725894211043, + "grad_norm": 0.8702186346054077, + "learning_rate": 0.0004962477614144448, + "loss": 1.2384, + "step": 619 + }, + { + "epoch": 0.1106056551601106, + "grad_norm": 0.9688485860824585, + "learning_rate": 0.0004962356586133515, + "loss": 1.6793, + "step": 620 + }, + { + "epoch": 0.11078405137811079, + "grad_norm": 0.9615238308906555, + "learning_rate": 0.0004962235364730157, + "loss": 1.4395, + "step": 621 + }, + { + "epoch": 0.11096244759611096, + "grad_norm": 1.0574663877487183, + "learning_rate": 0.0004962113949943891, + "loss": 1.6388, + "step": 622 + }, + { + "epoch": 0.11114084381411114, + "grad_norm": 0.8646270036697388, + "learning_rate": 0.0004961992341784256, + "loss": 1.4026, + "step": 623 + }, + { + "epoch": 0.11131924003211131, + "grad_norm": 0.6639731526374817, + "learning_rate": 0.0004961870540260801, + "loss": 1.2448, + "step": 624 + }, + { + "epoch": 0.1114976362501115, + "grad_norm": 0.8900001049041748, + "learning_rate": 0.0004961748545383094, + "loss": 1.66, + "step": 625 + }, + { + "epoch": 0.11167603246811168, + "grad_norm": 1.1087528467178345, + "learning_rate": 0.0004961626357160716, + "loss": 1.3272, + "step": 626 + }, + { + "epoch": 0.11185442868611185, + "grad_norm": 0.6758600473403931, + "learning_rate": 0.0004961503975603262, + "loss": 1.1703, + "step": 627 + }, + { + "epoch": 0.11203282490411204, + "grad_norm": 0.7017180323600769, + "learning_rate": 0.0004961381400720346, + "loss": 1.6468, + "step": 628 + }, + { + "epoch": 0.11221122112211221, + "grad_norm": 1.1828702688217163, + "learning_rate": 0.0004961258632521595, + "loss": 1.2218, + "step": 629 + }, + { + "epoch": 0.1123896173401124, + "grad_norm": 0.7270997762680054, + "learning_rate": 0.0004961135671016647, + "loss": 1.435, + "step": 630 + }, + { + "epoch": 0.11256801355811256, + "grad_norm": 0.6998748183250427, + "learning_rate": 0.0004961012516215166, + "loss": 1.4037, + "step": 631 + }, + { + "epoch": 0.11274640977611275, + "grad_norm": 0.6797101497650146, + "learning_rate": 0.0004960889168126819, + "loss": 1.3327, + "step": 632 + }, + { + "epoch": 0.11292480599411292, + "grad_norm": 0.7535882592201233, + "learning_rate": 0.0004960765626761296, + "loss": 1.2727, + "step": 633 + }, + { + "epoch": 0.1131032022121131, + "grad_norm": 0.7744285464286804, + "learning_rate": 0.00049606418921283, + "loss": 1.3627, + "step": 634 + }, + { + "epoch": 0.11328159843011328, + "grad_norm": 1.608994483947754, + "learning_rate": 0.0004960517964237548, + "loss": 1.4621, + "step": 635 + }, + { + "epoch": 0.11345999464811346, + "grad_norm": 5.228842258453369, + "learning_rate": 0.0004960393843098775, + "loss": 1.2311, + "step": 636 + }, + { + "epoch": 0.11363839086611364, + "grad_norm": 0.7488652467727661, + "learning_rate": 0.0004960269528721728, + "loss": 1.3578, + "step": 637 + }, + { + "epoch": 0.11381678708411382, + "grad_norm": 0.6869979500770569, + "learning_rate": 0.0004960145021116171, + "loss": 1.4778, + "step": 638 + }, + { + "epoch": 0.113995183302114, + "grad_norm": 0.8673625588417053, + "learning_rate": 0.0004960020320291882, + "loss": 1.5803, + "step": 639 + }, + { + "epoch": 0.11417357952011417, + "grad_norm": 1.1181416511535645, + "learning_rate": 0.0004959895426258656, + "loss": 1.2287, + "step": 640 + }, + { + "epoch": 0.11435197573811436, + "grad_norm": 1.0992931127548218, + "learning_rate": 0.0004959770339026301, + "loss": 1.5307, + "step": 641 + }, + { + "epoch": 0.11453037195611453, + "grad_norm": 0.8505348563194275, + "learning_rate": 0.0004959645058604644, + "loss": 1.2554, + "step": 642 + }, + { + "epoch": 0.11470876817411471, + "grad_norm": 1.2507725954055786, + "learning_rate": 0.0004959519585003521, + "loss": 1.3439, + "step": 643 + }, + { + "epoch": 0.11488716439211488, + "grad_norm": 0.9388852119445801, + "learning_rate": 0.0004959393918232789, + "loss": 1.6679, + "step": 644 + }, + { + "epoch": 0.11506556061011507, + "grad_norm": 0.9033030271530151, + "learning_rate": 0.0004959268058302318, + "loss": 1.5723, + "step": 645 + }, + { + "epoch": 0.11524395682811524, + "grad_norm": 0.7030625939369202, + "learning_rate": 0.0004959142005221991, + "loss": 1.4426, + "step": 646 + }, + { + "epoch": 0.11542235304611542, + "grad_norm": 0.7073930501937866, + "learning_rate": 0.0004959015759001708, + "loss": 1.3419, + "step": 647 + }, + { + "epoch": 0.11560074926411561, + "grad_norm": 0.7212799191474915, + "learning_rate": 0.0004958889319651386, + "loss": 1.4213, + "step": 648 + }, + { + "epoch": 0.11577914548211578, + "grad_norm": 0.7988331317901611, + "learning_rate": 0.0004958762687180956, + "loss": 1.5477, + "step": 649 + }, + { + "epoch": 0.11595754170011596, + "grad_norm": 0.6533039212226868, + "learning_rate": 0.0004958635861600362, + "loss": 1.3518, + "step": 650 + }, + { + "epoch": 0.11613593791811613, + "grad_norm": 1.167569637298584, + "learning_rate": 0.0004958508842919565, + "loss": 1.5068, + "step": 651 + }, + { + "epoch": 0.11631433413611632, + "grad_norm": 0.8685398101806641, + "learning_rate": 0.0004958381631148543, + "loss": 1.5344, + "step": 652 + }, + { + "epoch": 0.11649273035411649, + "grad_norm": 0.6582388281822205, + "learning_rate": 0.0004958254226297284, + "loss": 1.4228, + "step": 653 + }, + { + "epoch": 0.11667112657211667, + "grad_norm": 0.6874244213104248, + "learning_rate": 0.0004958126628375797, + "loss": 1.3626, + "step": 654 + }, + { + "epoch": 0.11684952279011684, + "grad_norm": 0.6385990977287292, + "learning_rate": 0.0004957998837394102, + "loss": 1.4008, + "step": 655 + }, + { + "epoch": 0.11702791900811703, + "grad_norm": 0.7000042796134949, + "learning_rate": 0.0004957870853362237, + "loss": 1.4344, + "step": 656 + }, + { + "epoch": 0.1172063152261172, + "grad_norm": 0.6392539143562317, + "learning_rate": 0.0004957742676290251, + "loss": 1.2199, + "step": 657 + }, + { + "epoch": 0.11738471144411738, + "grad_norm": 0.6661501526832581, + "learning_rate": 0.0004957614306188214, + "loss": 1.5327, + "step": 658 + }, + { + "epoch": 0.11756310766211757, + "grad_norm": 0.6941922903060913, + "learning_rate": 0.0004957485743066207, + "loss": 1.157, + "step": 659 + }, + { + "epoch": 0.11774150388011774, + "grad_norm": 0.8714244365692139, + "learning_rate": 0.0004957356986934326, + "loss": 1.4848, + "step": 660 + }, + { + "epoch": 0.11791990009811792, + "grad_norm": 0.5838067531585693, + "learning_rate": 0.0004957228037802687, + "loss": 1.3737, + "step": 661 + }, + { + "epoch": 0.1180982963161181, + "grad_norm": 0.9917201399803162, + "learning_rate": 0.0004957098895681414, + "loss": 1.5201, + "step": 662 + }, + { + "epoch": 0.11827669253411828, + "grad_norm": 5.74738883972168, + "learning_rate": 0.0004956969560580651, + "loss": 1.3064, + "step": 663 + }, + { + "epoch": 0.11845508875211845, + "grad_norm": 0.6317707896232605, + "learning_rate": 0.0004956840032510556, + "loss": 1.1942, + "step": 664 + }, + { + "epoch": 0.11863348497011864, + "grad_norm": 0.7513093948364258, + "learning_rate": 0.0004956710311481302, + "loss": 1.4782, + "step": 665 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.8310014605522156, + "learning_rate": 0.0004956580397503078, + "loss": 1.5899, + "step": 666 + }, + { + "epoch": 0.11899027740611899, + "grad_norm": 1.0249019861221313, + "learning_rate": 0.0004956450290586087, + "loss": 1.1852, + "step": 667 + }, + { + "epoch": 0.11916867362411916, + "grad_norm": 1.868385910987854, + "learning_rate": 0.0004956319990740547, + "loss": 1.4115, + "step": 668 + }, + { + "epoch": 0.11934706984211935, + "grad_norm": 0.7085257768630981, + "learning_rate": 0.0004956189497976691, + "loss": 1.578, + "step": 669 + }, + { + "epoch": 0.11952546606011953, + "grad_norm": 0.607820451259613, + "learning_rate": 0.0004956058812304769, + "loss": 1.4083, + "step": 670 + }, + { + "epoch": 0.1197038622781197, + "grad_norm": 0.6980863213539124, + "learning_rate": 0.0004955927933735046, + "loss": 1.2609, + "step": 671 + }, + { + "epoch": 0.11988225849611989, + "grad_norm": 0.5971283316612244, + "learning_rate": 0.0004955796862277799, + "loss": 1.2083, + "step": 672 + }, + { + "epoch": 0.12006065471412006, + "grad_norm": 0.641476571559906, + "learning_rate": 0.0004955665597943323, + "loss": 1.2241, + "step": 673 + }, + { + "epoch": 0.12023905093212024, + "grad_norm": 0.7836436629295349, + "learning_rate": 0.0004955534140741928, + "loss": 1.511, + "step": 674 + }, + { + "epoch": 0.12041744715012041, + "grad_norm": 0.716119110584259, + "learning_rate": 0.0004955402490683939, + "loss": 1.7792, + "step": 675 + }, + { + "epoch": 0.1205958433681206, + "grad_norm": 0.7199398279190063, + "learning_rate": 0.0004955270647779695, + "loss": 1.3504, + "step": 676 + }, + { + "epoch": 0.12077423958612077, + "grad_norm": 1.583276629447937, + "learning_rate": 0.000495513861203955, + "loss": 1.4377, + "step": 677 + }, + { + "epoch": 0.12095263580412095, + "grad_norm": 0.5903739333152771, + "learning_rate": 0.0004955006383473876, + "loss": 1.2851, + "step": 678 + }, + { + "epoch": 0.12113103202212114, + "grad_norm": 0.85763019323349, + "learning_rate": 0.0004954873962093056, + "loss": 1.6958, + "step": 679 + }, + { + "epoch": 0.12130942824012131, + "grad_norm": 0.5933085083961487, + "learning_rate": 0.0004954741347907492, + "loss": 1.4475, + "step": 680 + }, + { + "epoch": 0.1214878244581215, + "grad_norm": 0.6299702525138855, + "learning_rate": 0.0004954608540927599, + "loss": 1.5114, + "step": 681 + }, + { + "epoch": 0.12166622067612166, + "grad_norm": 0.588097333908081, + "learning_rate": 0.0004954475541163807, + "loss": 1.2504, + "step": 682 + }, + { + "epoch": 0.12184461689412185, + "grad_norm": 0.7014090418815613, + "learning_rate": 0.0004954342348626562, + "loss": 1.7175, + "step": 683 + }, + { + "epoch": 0.12202301311212202, + "grad_norm": 0.6995732188224792, + "learning_rate": 0.0004954208963326327, + "loss": 1.4363, + "step": 684 + }, + { + "epoch": 0.1222014093301222, + "grad_norm": 0.5883218050003052, + "learning_rate": 0.0004954075385273574, + "loss": 1.385, + "step": 685 + }, + { + "epoch": 0.12237980554812238, + "grad_norm": 0.6858941912651062, + "learning_rate": 0.0004953941614478797, + "loss": 1.5209, + "step": 686 + }, + { + "epoch": 0.12255820176612256, + "grad_norm": 2.024874687194824, + "learning_rate": 0.0004953807650952502, + "loss": 1.4826, + "step": 687 + }, + { + "epoch": 0.12273659798412273, + "grad_norm": 0.6671762466430664, + "learning_rate": 0.000495367349470521, + "loss": 1.3409, + "step": 688 + }, + { + "epoch": 0.12291499420212292, + "grad_norm": 0.6432123184204102, + "learning_rate": 0.0004953539145747457, + "loss": 1.3691, + "step": 689 + }, + { + "epoch": 0.1230933904201231, + "grad_norm": 0.6529083251953125, + "learning_rate": 0.0004953404604089796, + "loss": 1.4347, + "step": 690 + }, + { + "epoch": 0.12327178663812327, + "grad_norm": 0.6190119981765747, + "learning_rate": 0.0004953269869742792, + "loss": 1.2487, + "step": 691 + }, + { + "epoch": 0.12345018285612346, + "grad_norm": 0.6484703421592712, + "learning_rate": 0.000495313494271703, + "loss": 1.5288, + "step": 692 + }, + { + "epoch": 0.12362857907412363, + "grad_norm": 0.6164755821228027, + "learning_rate": 0.0004952999823023104, + "loss": 1.2645, + "step": 693 + }, + { + "epoch": 0.12380697529212381, + "grad_norm": 0.6002447009086609, + "learning_rate": 0.0004952864510671628, + "loss": 1.2468, + "step": 694 + }, + { + "epoch": 0.12398537151012398, + "grad_norm": 0.6057468056678772, + "learning_rate": 0.0004952729005673229, + "loss": 1.5349, + "step": 695 + }, + { + "epoch": 0.12416376772812417, + "grad_norm": 0.5792039036750793, + "learning_rate": 0.0004952593308038549, + "loss": 1.1507, + "step": 696 + }, + { + "epoch": 0.12434216394612434, + "grad_norm": 0.7286936044692993, + "learning_rate": 0.0004952457417778247, + "loss": 1.4709, + "step": 697 + }, + { + "epoch": 0.12452056016412452, + "grad_norm": 0.6991254687309265, + "learning_rate": 0.0004952321334902993, + "loss": 1.4715, + "step": 698 + }, + { + "epoch": 0.12469895638212469, + "grad_norm": 2.0999631881713867, + "learning_rate": 0.0004952185059423478, + "loss": 1.4724, + "step": 699 + }, + { + "epoch": 0.12487735260012488, + "grad_norm": 0.7571597099304199, + "learning_rate": 0.0004952048591350403, + "loss": 1.4842, + "step": 700 + }, + { + "epoch": 0.12505574881812506, + "grad_norm": 0.7109989523887634, + "learning_rate": 0.0004951911930694487, + "loss": 1.2872, + "step": 701 + }, + { + "epoch": 0.12523414503612523, + "grad_norm": 3.125427484512329, + "learning_rate": 0.0004951775077466463, + "loss": 1.1666, + "step": 702 + }, + { + "epoch": 0.1254125412541254, + "grad_norm": 0.9019857048988342, + "learning_rate": 0.0004951638031677081, + "loss": 1.3661, + "step": 703 + }, + { + "epoch": 0.1255909374721256, + "grad_norm": 0.7541314363479614, + "learning_rate": 0.00049515007933371, + "loss": 1.2676, + "step": 704 + }, + { + "epoch": 0.12576933369012577, + "grad_norm": 0.6455008387565613, + "learning_rate": 0.0004951363362457304, + "loss": 1.2095, + "step": 705 + }, + { + "epoch": 0.12594772990812594, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0004951225739048484, + "loss": 1.5632, + "step": 706 + }, + { + "epoch": 0.12612612612612611, + "grad_norm": 0.6640974879264832, + "learning_rate": 0.0004951087923121449, + "loss": 1.4562, + "step": 707 + }, + { + "epoch": 0.1263045223441263, + "grad_norm": 0.6812661290168762, + "learning_rate": 0.0004950949914687023, + "loss": 1.2051, + "step": 708 + }, + { + "epoch": 0.12648291856212648, + "grad_norm": 1.516214370727539, + "learning_rate": 0.0004950811713756047, + "loss": 1.3933, + "step": 709 + }, + { + "epoch": 0.12666131478012665, + "grad_norm": 0.8335700631141663, + "learning_rate": 0.0004950673320339372, + "loss": 1.6518, + "step": 710 + }, + { + "epoch": 0.12683971099812685, + "grad_norm": 0.6542342901229858, + "learning_rate": 0.0004950534734447869, + "loss": 1.3769, + "step": 711 + }, + { + "epoch": 0.12701810721612702, + "grad_norm": 0.8510260581970215, + "learning_rate": 0.0004950395956092423, + "loss": 1.3511, + "step": 712 + }, + { + "epoch": 0.1271965034341272, + "grad_norm": 2.36539626121521, + "learning_rate": 0.0004950256985283934, + "loss": 1.35, + "step": 713 + }, + { + "epoch": 0.12737489965212737, + "grad_norm": 0.6594715118408203, + "learning_rate": 0.0004950117822033315, + "loss": 1.2426, + "step": 714 + }, + { + "epoch": 0.12755329587012756, + "grad_norm": 0.619835615158081, + "learning_rate": 0.0004949978466351495, + "loss": 1.256, + "step": 715 + }, + { + "epoch": 0.12773169208812774, + "grad_norm": 0.792833685874939, + "learning_rate": 0.0004949838918249423, + "loss": 1.4526, + "step": 716 + }, + { + "epoch": 0.1279100883061279, + "grad_norm": 0.7402137517929077, + "learning_rate": 0.0004949699177738056, + "loss": 1.4895, + "step": 717 + }, + { + "epoch": 0.12808848452412808, + "grad_norm": 0.7301040291786194, + "learning_rate": 0.0004949559244828369, + "loss": 1.435, + "step": 718 + }, + { + "epoch": 0.12826688074212828, + "grad_norm": 0.6888275742530823, + "learning_rate": 0.0004949419119531354, + "loss": 1.4007, + "step": 719 + }, + { + "epoch": 0.12844527696012845, + "grad_norm": 13.487852096557617, + "learning_rate": 0.0004949278801858015, + "loss": 1.2581, + "step": 720 + }, + { + "epoch": 0.12862367317812862, + "grad_norm": 0.8741459846496582, + "learning_rate": 0.0004949138291819372, + "loss": 1.3751, + "step": 721 + }, + { + "epoch": 0.12880206939612882, + "grad_norm": 14.084300994873047, + "learning_rate": 0.0004948997589426463, + "loss": 1.6692, + "step": 722 + }, + { + "epoch": 0.128980465614129, + "grad_norm": 1.240329623222351, + "learning_rate": 0.0004948856694690337, + "loss": 1.6749, + "step": 723 + }, + { + "epoch": 0.12915886183212916, + "grad_norm": 0.7830163240432739, + "learning_rate": 0.000494871560762206, + "loss": 1.5147, + "step": 724 + }, + { + "epoch": 0.12933725805012933, + "grad_norm": 0.8917336463928223, + "learning_rate": 0.0004948574328232713, + "loss": 1.4268, + "step": 725 + }, + { + "epoch": 0.12951565426812953, + "grad_norm": 0.8025479912757874, + "learning_rate": 0.000494843285653339, + "loss": 1.4808, + "step": 726 + }, + { + "epoch": 0.1296940504861297, + "grad_norm": 0.8113172650337219, + "learning_rate": 0.0004948291192535206, + "loss": 1.6472, + "step": 727 + }, + { + "epoch": 0.12987244670412987, + "grad_norm": 0.6620337963104248, + "learning_rate": 0.0004948149336249286, + "loss": 1.4085, + "step": 728 + }, + { + "epoch": 0.13005084292213004, + "grad_norm": 1.0298174619674683, + "learning_rate": 0.0004948007287686769, + "loss": 1.2783, + "step": 729 + }, + { + "epoch": 0.13022923914013024, + "grad_norm": 0.6767082810401917, + "learning_rate": 0.0004947865046858814, + "loss": 1.2708, + "step": 730 + }, + { + "epoch": 0.1304076353581304, + "grad_norm": 0.7776159644126892, + "learning_rate": 0.0004947722613776591, + "loss": 1.1688, + "step": 731 + }, + { + "epoch": 0.13058603157613058, + "grad_norm": 0.7061336636543274, + "learning_rate": 0.0004947579988451288, + "loss": 1.4495, + "step": 732 + }, + { + "epoch": 0.13076442779413078, + "grad_norm": 0.5810830593109131, + "learning_rate": 0.0004947437170894105, + "loss": 1.0836, + "step": 733 + }, + { + "epoch": 0.13094282401213095, + "grad_norm": 0.6294886469841003, + "learning_rate": 0.000494729416111626, + "loss": 1.3136, + "step": 734 + }, + { + "epoch": 0.13112122023013112, + "grad_norm": 0.6610152721405029, + "learning_rate": 0.0004947150959128986, + "loss": 1.4622, + "step": 735 + }, + { + "epoch": 0.1312996164481313, + "grad_norm": 0.6016314625740051, + "learning_rate": 0.0004947007564943527, + "loss": 1.1905, + "step": 736 + }, + { + "epoch": 0.1314780126661315, + "grad_norm": 0.6377681493759155, + "learning_rate": 0.0004946863978571148, + "loss": 1.2777, + "step": 737 + }, + { + "epoch": 0.13165640888413166, + "grad_norm": 0.6914573907852173, + "learning_rate": 0.0004946720200023125, + "loss": 1.4642, + "step": 738 + }, + { + "epoch": 0.13183480510213183, + "grad_norm": 0.6614610552787781, + "learning_rate": 0.000494657622931075, + "loss": 1.4415, + "step": 739 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 1.029931902885437, + "learning_rate": 0.0004946432066445331, + "loss": 1.2224, + "step": 740 + }, + { + "epoch": 0.1321915975381322, + "grad_norm": 0.7482788562774658, + "learning_rate": 0.000494628771143819, + "loss": 1.4412, + "step": 741 + }, + { + "epoch": 0.13236999375613237, + "grad_norm": 0.9311326146125793, + "learning_rate": 0.0004946143164300665, + "loss": 1.5545, + "step": 742 + }, + { + "epoch": 0.13254838997413254, + "grad_norm": 0.6425170302391052, + "learning_rate": 0.0004945998425044109, + "loss": 1.3134, + "step": 743 + }, + { + "epoch": 0.13272678619213274, + "grad_norm": 0.674411952495575, + "learning_rate": 0.0004945853493679889, + "loss": 1.4689, + "step": 744 + }, + { + "epoch": 0.1329051824101329, + "grad_norm": 0.6528111696243286, + "learning_rate": 0.0004945708370219388, + "loss": 1.3335, + "step": 745 + }, + { + "epoch": 0.13308357862813308, + "grad_norm": 0.663161039352417, + "learning_rate": 0.0004945563054674005, + "loss": 1.6369, + "step": 746 + }, + { + "epoch": 0.13326197484613325, + "grad_norm": 1.808029294013977, + "learning_rate": 0.0004945417547055151, + "loss": 1.2105, + "step": 747 + }, + { + "epoch": 0.13344037106413345, + "grad_norm": 0.683954656124115, + "learning_rate": 0.0004945271847374255, + "loss": 1.4248, + "step": 748 + }, + { + "epoch": 0.13361876728213362, + "grad_norm": 0.5650470852851868, + "learning_rate": 0.0004945125955642761, + "loss": 1.2412, + "step": 749 + }, + { + "epoch": 0.1337971635001338, + "grad_norm": 0.7181941866874695, + "learning_rate": 0.0004944979871872126, + "loss": 1.5134, + "step": 750 + }, + { + "epoch": 0.13397555971813396, + "grad_norm": 0.6759237051010132, + "learning_rate": 0.0004944833596073825, + "loss": 1.4109, + "step": 751 + }, + { + "epoch": 0.13415395593613416, + "grad_norm": 1.112088918685913, + "learning_rate": 0.0004944687128259345, + "loss": 1.4521, + "step": 752 + }, + { + "epoch": 0.13433235215413433, + "grad_norm": 0.8726287484169006, + "learning_rate": 0.000494454046844019, + "loss": 1.4233, + "step": 753 + }, + { + "epoch": 0.1345107483721345, + "grad_norm": 0.61822509765625, + "learning_rate": 0.0004944393616627879, + "loss": 1.1642, + "step": 754 + }, + { + "epoch": 0.1346891445901347, + "grad_norm": 0.7049110531806946, + "learning_rate": 0.0004944246572833945, + "loss": 1.3176, + "step": 755 + }, + { + "epoch": 0.13486754080813487, + "grad_norm": 0.7006151676177979, + "learning_rate": 0.0004944099337069936, + "loss": 1.4621, + "step": 756 + }, + { + "epoch": 0.13504593702613504, + "grad_norm": 0.907748818397522, + "learning_rate": 0.0004943951909347419, + "loss": 1.4472, + "step": 757 + }, + { + "epoch": 0.13522433324413521, + "grad_norm": 1.7227107286453247, + "learning_rate": 0.0004943804289677969, + "loss": 1.381, + "step": 758 + }, + { + "epoch": 0.1354027294621354, + "grad_norm": 0.8136081695556641, + "learning_rate": 0.0004943656478073182, + "loss": 1.2206, + "step": 759 + }, + { + "epoch": 0.13558112568013558, + "grad_norm": 1.1220283508300781, + "learning_rate": 0.0004943508474544667, + "loss": 1.5002, + "step": 760 + }, + { + "epoch": 0.13575952189813575, + "grad_norm": 2.818793296813965, + "learning_rate": 0.0004943360279104047, + "loss": 1.4188, + "step": 761 + }, + { + "epoch": 0.13593791811613593, + "grad_norm": 7.106777191162109, + "learning_rate": 0.0004943211891762964, + "loss": 1.6358, + "step": 762 + }, + { + "epoch": 0.13611631433413612, + "grad_norm": 1.419669270515442, + "learning_rate": 0.0004943063312533069, + "loss": 1.4433, + "step": 763 + }, + { + "epoch": 0.1362947105521363, + "grad_norm": 1.2436542510986328, + "learning_rate": 0.0004942914541426033, + "loss": 1.5007, + "step": 764 + }, + { + "epoch": 0.13647310677013647, + "grad_norm": 1.268811821937561, + "learning_rate": 0.0004942765578453541, + "loss": 1.6141, + "step": 765 + }, + { + "epoch": 0.13665150298813666, + "grad_norm": 1.0248541831970215, + "learning_rate": 0.000494261642362729, + "loss": 1.2407, + "step": 766 + }, + { + "epoch": 0.13682989920613683, + "grad_norm": 0.8062700629234314, + "learning_rate": 0.0004942467076958999, + "loss": 1.2817, + "step": 767 + }, + { + "epoch": 0.137008295424137, + "grad_norm": 1.0193575620651245, + "learning_rate": 0.0004942317538460392, + "loss": 1.2323, + "step": 768 + }, + { + "epoch": 0.13718669164213718, + "grad_norm": 1.2625796794891357, + "learning_rate": 0.0004942167808143218, + "loss": 1.6804, + "step": 769 + }, + { + "epoch": 0.13736508786013737, + "grad_norm": 0.7931640148162842, + "learning_rate": 0.0004942017886019236, + "loss": 1.2993, + "step": 770 + }, + { + "epoch": 0.13754348407813755, + "grad_norm": 1.35048508644104, + "learning_rate": 0.0004941867772100218, + "loss": 1.4314, + "step": 771 + }, + { + "epoch": 0.13772188029613772, + "grad_norm": 1.2043832540512085, + "learning_rate": 0.0004941717466397957, + "loss": 1.4401, + "step": 772 + }, + { + "epoch": 0.1379002765141379, + "grad_norm": 0.8232206106185913, + "learning_rate": 0.0004941566968924258, + "loss": 1.4218, + "step": 773 + }, + { + "epoch": 0.13807867273213809, + "grad_norm": 0.7783534526824951, + "learning_rate": 0.0004941416279690939, + "loss": 1.5267, + "step": 774 + }, + { + "epoch": 0.13825706895013826, + "grad_norm": 0.6729874014854431, + "learning_rate": 0.0004941265398709835, + "loss": 1.368, + "step": 775 + }, + { + "epoch": 0.13843546516813843, + "grad_norm": 0.7547962069511414, + "learning_rate": 0.0004941114325992798, + "loss": 1.3377, + "step": 776 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.7145437002182007, + "learning_rate": 0.0004940963061551693, + "loss": 1.3902, + "step": 777 + }, + { + "epoch": 0.1387922576041388, + "grad_norm": 0.6485962271690369, + "learning_rate": 0.0004940811605398399, + "loss": 1.0085, + "step": 778 + }, + { + "epoch": 0.13897065382213897, + "grad_norm": 0.630803644657135, + "learning_rate": 0.0004940659957544813, + "loss": 1.4345, + "step": 779 + }, + { + "epoch": 0.13914905004013914, + "grad_norm": 1.3144874572753906, + "learning_rate": 0.0004940508118002842, + "loss": 1.2102, + "step": 780 + }, + { + "epoch": 0.13932744625813934, + "grad_norm": 0.6318569779396057, + "learning_rate": 0.0004940356086784415, + "loss": 1.1991, + "step": 781 + }, + { + "epoch": 0.1395058424761395, + "grad_norm": 0.9692159295082092, + "learning_rate": 0.0004940203863901472, + "loss": 1.1838, + "step": 782 + }, + { + "epoch": 0.13968423869413968, + "grad_norm": 0.6397761106491089, + "learning_rate": 0.0004940051449365966, + "loss": 1.301, + "step": 783 + }, + { + "epoch": 0.13986263491213985, + "grad_norm": 0.7027068734169006, + "learning_rate": 0.000493989884318987, + "loss": 1.1906, + "step": 784 + }, + { + "epoch": 0.14004103113014005, + "grad_norm": 0.5374085307121277, + "learning_rate": 0.0004939746045385168, + "loss": 1.2157, + "step": 785 + }, + { + "epoch": 0.14021942734814022, + "grad_norm": 0.7548506855964661, + "learning_rate": 0.0004939593055963863, + "loss": 1.2354, + "step": 786 + }, + { + "epoch": 0.1403978235661404, + "grad_norm": 0.8988766670227051, + "learning_rate": 0.0004939439874937967, + "loss": 1.2367, + "step": 787 + }, + { + "epoch": 0.1405762197841406, + "grad_norm": 0.6512690186500549, + "learning_rate": 0.0004939286502319515, + "loss": 1.1598, + "step": 788 + }, + { + "epoch": 0.14075461600214076, + "grad_norm": 0.664986789226532, + "learning_rate": 0.0004939132938120551, + "loss": 1.4823, + "step": 789 + }, + { + "epoch": 0.14093301222014093, + "grad_norm": 0.6778481006622314, + "learning_rate": 0.0004938979182353134, + "loss": 1.2677, + "step": 790 + }, + { + "epoch": 0.1411114084381411, + "grad_norm": 0.570472240447998, + "learning_rate": 0.0004938825235029343, + "loss": 0.8828, + "step": 791 + }, + { + "epoch": 0.1412898046561413, + "grad_norm": 0.9883807897567749, + "learning_rate": 0.0004938671096161267, + "loss": 1.2965, + "step": 792 + }, + { + "epoch": 0.14146820087414147, + "grad_norm": 0.7180954217910767, + "learning_rate": 0.0004938516765761011, + "loss": 1.4603, + "step": 793 + }, + { + "epoch": 0.14164659709214164, + "grad_norm": 0.6821011900901794, + "learning_rate": 0.00049383622438407, + "loss": 1.6537, + "step": 794 + }, + { + "epoch": 0.1418249933101418, + "grad_norm": 0.6514670848846436, + "learning_rate": 0.0004938207530412467, + "loss": 1.1712, + "step": 795 + }, + { + "epoch": 0.142003389528142, + "grad_norm": 0.5831206440925598, + "learning_rate": 0.0004938052625488464, + "loss": 1.2547, + "step": 796 + }, + { + "epoch": 0.14218178574614218, + "grad_norm": 0.6372058987617493, + "learning_rate": 0.0004937897529080856, + "loss": 1.3056, + "step": 797 + }, + { + "epoch": 0.14236018196414235, + "grad_norm": 0.6786195635795593, + "learning_rate": 0.0004937742241201826, + "loss": 1.5176, + "step": 798 + }, + { + "epoch": 0.14253857818214255, + "grad_norm": 0.6339353322982788, + "learning_rate": 0.000493758676186357, + "loss": 1.2932, + "step": 799 + }, + { + "epoch": 0.14271697440014272, + "grad_norm": 0.6407662630081177, + "learning_rate": 0.0004937431091078297, + "loss": 1.2913, + "step": 800 + }, + { + "epoch": 0.1428953706181429, + "grad_norm": 0.7081295251846313, + "learning_rate": 0.0004937275228858235, + "loss": 1.2895, + "step": 801 + }, + { + "epoch": 0.14307376683614306, + "grad_norm": 0.5862438678741455, + "learning_rate": 0.0004937119175215627, + "loss": 1.2476, + "step": 802 + }, + { + "epoch": 0.14325216305414326, + "grad_norm": 0.6276576519012451, + "learning_rate": 0.0004936962930162728, + "loss": 1.2236, + "step": 803 + }, + { + "epoch": 0.14343055927214343, + "grad_norm": 0.6015689969062805, + "learning_rate": 0.0004936806493711808, + "loss": 1.2797, + "step": 804 + }, + { + "epoch": 0.1436089554901436, + "grad_norm": 1.0078672170639038, + "learning_rate": 0.0004936649865875155, + "loss": 1.5178, + "step": 805 + }, + { + "epoch": 0.14378735170814377, + "grad_norm": 0.6235846281051636, + "learning_rate": 0.000493649304666507, + "loss": 1.2727, + "step": 806 + }, + { + "epoch": 0.14396574792614397, + "grad_norm": 0.6958335638046265, + "learning_rate": 0.0004936336036093869, + "loss": 1.4622, + "step": 807 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.7218196392059326, + "learning_rate": 0.0004936178834173884, + "loss": 1.3002, + "step": 808 + }, + { + "epoch": 0.1443225403621443, + "grad_norm": 0.5912010669708252, + "learning_rate": 0.0004936021440917462, + "loss": 1.406, + "step": 809 + }, + { + "epoch": 0.1445009365801445, + "grad_norm": 1.7334743738174438, + "learning_rate": 0.0004935863856336965, + "loss": 1.2606, + "step": 810 + }, + { + "epoch": 0.14467933279814468, + "grad_norm": 0.7477133274078369, + "learning_rate": 0.000493570608044477, + "loss": 1.3254, + "step": 811 + }, + { + "epoch": 0.14485772901614485, + "grad_norm": 0.6125087141990662, + "learning_rate": 0.0004935548113253266, + "loss": 1.448, + "step": 812 + }, + { + "epoch": 0.14503612523414502, + "grad_norm": 0.6391841173171997, + "learning_rate": 0.0004935389954774861, + "loss": 1.0645, + "step": 813 + }, + { + "epoch": 0.14521452145214522, + "grad_norm": 0.6377097368240356, + "learning_rate": 0.0004935231605021977, + "loss": 1.2738, + "step": 814 + }, + { + "epoch": 0.1453929176701454, + "grad_norm": 0.6253454685211182, + "learning_rate": 0.0004935073064007052, + "loss": 1.6608, + "step": 815 + }, + { + "epoch": 0.14557131388814556, + "grad_norm": 0.7210360169410706, + "learning_rate": 0.0004934914331742536, + "loss": 1.5487, + "step": 816 + }, + { + "epoch": 0.14574971010614576, + "grad_norm": 0.589590311050415, + "learning_rate": 0.0004934755408240896, + "loss": 1.1513, + "step": 817 + }, + { + "epoch": 0.14592810632414593, + "grad_norm": 0.6044402718544006, + "learning_rate": 0.0004934596293514614, + "loss": 1.2131, + "step": 818 + }, + { + "epoch": 0.1461065025421461, + "grad_norm": 0.6587312817573547, + "learning_rate": 0.0004934436987576186, + "loss": 1.5939, + "step": 819 + }, + { + "epoch": 0.14628489876014628, + "grad_norm": 0.5897753238677979, + "learning_rate": 0.0004934277490438126, + "loss": 1.2628, + "step": 820 + }, + { + "epoch": 0.14646329497814647, + "grad_norm": 0.5983254909515381, + "learning_rate": 0.0004934117802112959, + "loss": 1.1709, + "step": 821 + }, + { + "epoch": 0.14664169119614665, + "grad_norm": 0.6504422426223755, + "learning_rate": 0.0004933957922613227, + "loss": 1.4531, + "step": 822 + }, + { + "epoch": 0.14682008741414682, + "grad_norm": 0.5873280763626099, + "learning_rate": 0.0004933797851951487, + "loss": 1.1531, + "step": 823 + }, + { + "epoch": 0.146998483632147, + "grad_norm": 2.0532479286193848, + "learning_rate": 0.0004933637590140311, + "loss": 1.4533, + "step": 824 + }, + { + "epoch": 0.14717687985014719, + "grad_norm": 0.6756530404090881, + "learning_rate": 0.0004933477137192287, + "loss": 1.4504, + "step": 825 + }, + { + "epoch": 0.14735527606814736, + "grad_norm": 0.5908591747283936, + "learning_rate": 0.0004933316493120015, + "loss": 1.1947, + "step": 826 + }, + { + "epoch": 0.14753367228614753, + "grad_norm": 0.5556735396385193, + "learning_rate": 0.0004933155657936112, + "loss": 1.0962, + "step": 827 + }, + { + "epoch": 0.14771206850414773, + "grad_norm": 0.5515411496162415, + "learning_rate": 0.0004932994631653212, + "loss": 1.1774, + "step": 828 + }, + { + "epoch": 0.1478904647221479, + "grad_norm": 0.8183857202529907, + "learning_rate": 0.000493283341428396, + "loss": 1.3823, + "step": 829 + }, + { + "epoch": 0.14806886094014807, + "grad_norm": 0.6181221008300781, + "learning_rate": 0.0004932672005841019, + "loss": 1.4347, + "step": 830 + }, + { + "epoch": 0.14824725715814824, + "grad_norm": 0.632533848285675, + "learning_rate": 0.0004932510406337065, + "loss": 1.6746, + "step": 831 + }, + { + "epoch": 0.14842565337614844, + "grad_norm": 0.679986834526062, + "learning_rate": 0.0004932348615784791, + "loss": 1.2888, + "step": 832 + }, + { + "epoch": 0.1486040495941486, + "grad_norm": 0.6554014086723328, + "learning_rate": 0.0004932186634196903, + "loss": 1.4259, + "step": 833 + }, + { + "epoch": 0.14878244581214878, + "grad_norm": 0.6169226169586182, + "learning_rate": 0.0004932024461586124, + "loss": 1.206, + "step": 834 + }, + { + "epoch": 0.14896084203014895, + "grad_norm": 0.6929764747619629, + "learning_rate": 0.000493186209796519, + "loss": 1.2828, + "step": 835 + }, + { + "epoch": 0.14913923824814915, + "grad_norm": 0.5675246119499207, + "learning_rate": 0.0004931699543346854, + "loss": 1.4443, + "step": 836 + }, + { + "epoch": 0.14931763446614932, + "grad_norm": 0.6860451698303223, + "learning_rate": 0.0004931536797743881, + "loss": 1.3001, + "step": 837 + }, + { + "epoch": 0.1494960306841495, + "grad_norm": 0.6203287243843079, + "learning_rate": 0.0004931373861169055, + "loss": 1.6086, + "step": 838 + }, + { + "epoch": 0.1496744269021497, + "grad_norm": 0.5558333992958069, + "learning_rate": 0.0004931210733635172, + "loss": 1.2781, + "step": 839 + }, + { + "epoch": 0.14985282312014986, + "grad_norm": 0.6401587128639221, + "learning_rate": 0.0004931047415155044, + "loss": 1.278, + "step": 840 + }, + { + "epoch": 0.15003121933815003, + "grad_norm": 0.6404081583023071, + "learning_rate": 0.00049308839057415, + "loss": 1.2592, + "step": 841 + }, + { + "epoch": 0.1502096155561502, + "grad_norm": 0.646500289440155, + "learning_rate": 0.0004930720205407378, + "loss": 1.4263, + "step": 842 + }, + { + "epoch": 0.1503880117741504, + "grad_norm": 0.6502192616462708, + "learning_rate": 0.0004930556314165538, + "loss": 1.3815, + "step": 843 + }, + { + "epoch": 0.15056640799215057, + "grad_norm": 0.7043633460998535, + "learning_rate": 0.0004930392232028851, + "loss": 1.3289, + "step": 844 + }, + { + "epoch": 0.15074480421015074, + "grad_norm": 0.566808819770813, + "learning_rate": 0.0004930227959010204, + "loss": 1.226, + "step": 845 + }, + { + "epoch": 0.1509232004281509, + "grad_norm": 0.5885351896286011, + "learning_rate": 0.0004930063495122498, + "loss": 1.0517, + "step": 846 + }, + { + "epoch": 0.1511015966461511, + "grad_norm": 0.696162760257721, + "learning_rate": 0.0004929898840378651, + "loss": 1.4365, + "step": 847 + }, + { + "epoch": 0.15127999286415128, + "grad_norm": 0.676964282989502, + "learning_rate": 0.0004929733994791597, + "loss": 1.5412, + "step": 848 + }, + { + "epoch": 0.15145838908215145, + "grad_norm": 0.5642474293708801, + "learning_rate": 0.0004929568958374278, + "loss": 1.155, + "step": 849 + }, + { + "epoch": 0.15163678530015165, + "grad_norm": 0.9656538367271423, + "learning_rate": 0.0004929403731139659, + "loss": 1.7266, + "step": 850 + }, + { + "epoch": 0.15181518151815182, + "grad_norm": 0.6227328777313232, + "learning_rate": 0.0004929238313100717, + "loss": 1.4751, + "step": 851 + }, + { + "epoch": 0.151993577736152, + "grad_norm": 0.6424437761306763, + "learning_rate": 0.0004929072704270444, + "loss": 1.4661, + "step": 852 + }, + { + "epoch": 0.15217197395415216, + "grad_norm": 0.5338221192359924, + "learning_rate": 0.0004928906904661845, + "loss": 1.2984, + "step": 853 + }, + { + "epoch": 0.15235037017215236, + "grad_norm": 0.594628632068634, + "learning_rate": 0.0004928740914287944, + "loss": 1.1771, + "step": 854 + }, + { + "epoch": 0.15252876639015253, + "grad_norm": 1.8328267335891724, + "learning_rate": 0.0004928574733161775, + "loss": 1.241, + "step": 855 + }, + { + "epoch": 0.1527071626081527, + "grad_norm": 4.153714656829834, + "learning_rate": 0.0004928408361296393, + "loss": 1.2908, + "step": 856 + }, + { + "epoch": 0.15288555882615287, + "grad_norm": 0.6561400294303894, + "learning_rate": 0.0004928241798704862, + "loss": 1.1795, + "step": 857 + }, + { + "epoch": 0.15306395504415307, + "grad_norm": 0.9632255434989929, + "learning_rate": 0.0004928075045400267, + "loss": 1.3548, + "step": 858 + }, + { + "epoch": 0.15324235126215324, + "grad_norm": 0.6676768064498901, + "learning_rate": 0.0004927908101395701, + "loss": 1.3005, + "step": 859 + }, + { + "epoch": 0.1534207474801534, + "grad_norm": 0.6931350827217102, + "learning_rate": 0.0004927740966704278, + "loss": 1.4234, + "step": 860 + }, + { + "epoch": 0.1535991436981536, + "grad_norm": 0.582984983921051, + "learning_rate": 0.0004927573641339125, + "loss": 1.286, + "step": 861 + }, + { + "epoch": 0.15377753991615378, + "grad_norm": 1.9343520402908325, + "learning_rate": 0.0004927406125313382, + "loss": 1.1929, + "step": 862 + }, + { + "epoch": 0.15395593613415395, + "grad_norm": 0.8297525644302368, + "learning_rate": 0.0004927238418640208, + "loss": 1.3183, + "step": 863 + }, + { + "epoch": 0.15413433235215412, + "grad_norm": 0.6805528998374939, + "learning_rate": 0.0004927070521332772, + "loss": 1.3282, + "step": 864 + }, + { + "epoch": 0.15431272857015432, + "grad_norm": 0.8657769560813904, + "learning_rate": 0.0004926902433404261, + "loss": 1.253, + "step": 865 + }, + { + "epoch": 0.1544911247881545, + "grad_norm": 0.7054359912872314, + "learning_rate": 0.0004926734154867878, + "loss": 1.2187, + "step": 866 + }, + { + "epoch": 0.15466952100615466, + "grad_norm": 0.6124011874198914, + "learning_rate": 0.0004926565685736839, + "loss": 1.1525, + "step": 867 + }, + { + "epoch": 0.15484791722415484, + "grad_norm": 0.6714728474617004, + "learning_rate": 0.0004926397026024375, + "loss": 1.1008, + "step": 868 + }, + { + "epoch": 0.15502631344215503, + "grad_norm": 0.7281408905982971, + "learning_rate": 0.0004926228175743733, + "loss": 1.4181, + "step": 869 + }, + { + "epoch": 0.1552047096601552, + "grad_norm": 0.6333009004592896, + "learning_rate": 0.0004926059134908173, + "loss": 1.3329, + "step": 870 + }, + { + "epoch": 0.15538310587815538, + "grad_norm": 0.6089525818824768, + "learning_rate": 0.0004925889903530973, + "loss": 1.257, + "step": 871 + }, + { + "epoch": 0.15556150209615557, + "grad_norm": 0.6068894267082214, + "learning_rate": 0.0004925720481625426, + "loss": 1.126, + "step": 872 + }, + { + "epoch": 0.15573989831415574, + "grad_norm": 0.594664990901947, + "learning_rate": 0.0004925550869204835, + "loss": 1.3666, + "step": 873 + }, + { + "epoch": 0.15591829453215592, + "grad_norm": 0.6410947442054749, + "learning_rate": 0.0004925381066282522, + "loss": 1.6397, + "step": 874 + }, + { + "epoch": 0.1560966907501561, + "grad_norm": 0.6146419644355774, + "learning_rate": 0.0004925211072871824, + "loss": 1.4024, + "step": 875 + }, + { + "epoch": 0.15627508696815628, + "grad_norm": 0.6187763810157776, + "learning_rate": 0.0004925040888986091, + "loss": 1.4961, + "step": 876 + }, + { + "epoch": 0.15645348318615646, + "grad_norm": 0.6881486773490906, + "learning_rate": 0.0004924870514638691, + "loss": 1.5297, + "step": 877 + }, + { + "epoch": 0.15663187940415663, + "grad_norm": 0.69078129529953, + "learning_rate": 0.0004924699949843004, + "loss": 1.2651, + "step": 878 + }, + { + "epoch": 0.1568102756221568, + "grad_norm": 0.5774492621421814, + "learning_rate": 0.0004924529194612428, + "loss": 1.4284, + "step": 879 + }, + { + "epoch": 0.156988671840157, + "grad_norm": 0.5534959435462952, + "learning_rate": 0.0004924358248960372, + "loss": 1.4157, + "step": 880 + }, + { + "epoch": 0.15716706805815717, + "grad_norm": 0.5735335350036621, + "learning_rate": 0.000492418711290026, + "loss": 1.1534, + "step": 881 + }, + { + "epoch": 0.15734546427615734, + "grad_norm": 0.5932490229606628, + "learning_rate": 0.0004924015786445537, + "loss": 1.3835, + "step": 882 + }, + { + "epoch": 0.15752386049415754, + "grad_norm": 0.6592042446136475, + "learning_rate": 0.0004923844269609657, + "loss": 1.2475, + "step": 883 + }, + { + "epoch": 0.1577022567121577, + "grad_norm": 0.6756531000137329, + "learning_rate": 0.0004923672562406092, + "loss": 1.5197, + "step": 884 + }, + { + "epoch": 0.15788065293015788, + "grad_norm": 0.6795757412910461, + "learning_rate": 0.0004923500664848326, + "loss": 1.466, + "step": 885 + }, + { + "epoch": 0.15805904914815805, + "grad_norm": 0.6750994920730591, + "learning_rate": 0.0004923328576949862, + "loss": 1.494, + "step": 886 + }, + { + "epoch": 0.15823744536615825, + "grad_norm": 0.5709784030914307, + "learning_rate": 0.0004923156298724213, + "loss": 1.2837, + "step": 887 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.6380211114883423, + "learning_rate": 0.000492298383018491, + "loss": 1.4147, + "step": 888 + }, + { + "epoch": 0.1585942378021586, + "grad_norm": 0.6981920599937439, + "learning_rate": 0.0004922811171345502, + "loss": 1.297, + "step": 889 + }, + { + "epoch": 0.15877263402015876, + "grad_norm": 0.6991307139396667, + "learning_rate": 0.0004922638322219546, + "loss": 1.445, + "step": 890 + }, + { + "epoch": 0.15895103023815896, + "grad_norm": 0.6050926446914673, + "learning_rate": 0.000492246528282062, + "loss": 1.3701, + "step": 891 + }, + { + "epoch": 0.15912942645615913, + "grad_norm": 0.6252057552337646, + "learning_rate": 0.0004922292053162312, + "loss": 1.1218, + "step": 892 + }, + { + "epoch": 0.1593078226741593, + "grad_norm": 0.6167547702789307, + "learning_rate": 0.0004922118633258228, + "loss": 1.2547, + "step": 893 + }, + { + "epoch": 0.1594862188921595, + "grad_norm": 0.5581344366073608, + "learning_rate": 0.0004921945023121989, + "loss": 1.2666, + "step": 894 + }, + { + "epoch": 0.15966461511015967, + "grad_norm": 0.6345050930976868, + "learning_rate": 0.0004921771222767231, + "loss": 1.3042, + "step": 895 + }, + { + "epoch": 0.15984301132815984, + "grad_norm": 0.6487632393836975, + "learning_rate": 0.0004921597232207604, + "loss": 1.3426, + "step": 896 + }, + { + "epoch": 0.16002140754616, + "grad_norm": 0.6063089966773987, + "learning_rate": 0.0004921423051456772, + "loss": 1.2359, + "step": 897 + }, + { + "epoch": 0.1601998037641602, + "grad_norm": 0.6432583928108215, + "learning_rate": 0.0004921248680528414, + "loss": 1.2015, + "step": 898 + }, + { + "epoch": 0.16037819998216038, + "grad_norm": 0.6074740290641785, + "learning_rate": 0.0004921074119436229, + "loss": 1.0048, + "step": 899 + }, + { + "epoch": 0.16055659620016055, + "grad_norm": 0.6494312882423401, + "learning_rate": 0.0004920899368193923, + "loss": 1.2296, + "step": 900 + }, + { + "epoch": 0.16073499241816072, + "grad_norm": 0.6664918661117554, + "learning_rate": 0.0004920724426815222, + "loss": 1.4211, + "step": 901 + }, + { + "epoch": 0.16091338863616092, + "grad_norm": 0.6054812073707581, + "learning_rate": 0.0004920549295313867, + "loss": 1.1764, + "step": 902 + }, + { + "epoch": 0.1610917848541611, + "grad_norm": 0.5707181096076965, + "learning_rate": 0.0004920373973703612, + "loss": 1.3884, + "step": 903 + }, + { + "epoch": 0.16127018107216126, + "grad_norm": 0.5909110307693481, + "learning_rate": 0.0004920198461998227, + "loss": 1.2596, + "step": 904 + }, + { + "epoch": 0.16144857729016146, + "grad_norm": 0.592135488986969, + "learning_rate": 0.0004920022760211496, + "loss": 1.2927, + "step": 905 + }, + { + "epoch": 0.16162697350816163, + "grad_norm": 0.6415541172027588, + "learning_rate": 0.0004919846868357218, + "loss": 1.3682, + "step": 906 + }, + { + "epoch": 0.1618053697261618, + "grad_norm": 0.5409737229347229, + "learning_rate": 0.0004919670786449208, + "loss": 1.2196, + "step": 907 + }, + { + "epoch": 0.16198376594416197, + "grad_norm": 25.429161071777344, + "learning_rate": 0.0004919494514501298, + "loss": 1.6991, + "step": 908 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 4.827995777130127, + "learning_rate": 0.0004919318052527328, + "loss": 1.4752, + "step": 909 + }, + { + "epoch": 0.16234055838016234, + "grad_norm": 0.8540498614311218, + "learning_rate": 0.0004919141400541161, + "loss": 1.6279, + "step": 910 + }, + { + "epoch": 0.1625189545981625, + "grad_norm": 0.6418678760528564, + "learning_rate": 0.0004918964558556668, + "loss": 1.4733, + "step": 911 + }, + { + "epoch": 0.16269735081616268, + "grad_norm": 0.7732519507408142, + "learning_rate": 0.0004918787526587739, + "loss": 1.2503, + "step": 912 + }, + { + "epoch": 0.16287574703416288, + "grad_norm": 0.678031325340271, + "learning_rate": 0.000491861030464828, + "loss": 1.208, + "step": 913 + }, + { + "epoch": 0.16305414325216305, + "grad_norm": 0.6401492357254028, + "learning_rate": 0.0004918432892752208, + "loss": 1.0655, + "step": 914 + }, + { + "epoch": 0.16323253947016322, + "grad_norm": 0.904754102230072, + "learning_rate": 0.0004918255290913457, + "loss": 1.3065, + "step": 915 + }, + { + "epoch": 0.16341093568816342, + "grad_norm": 0.6582340002059937, + "learning_rate": 0.0004918077499145977, + "loss": 1.3543, + "step": 916 + }, + { + "epoch": 0.1635893319061636, + "grad_norm": 1.050347089767456, + "learning_rate": 0.000491789951746373, + "loss": 1.3799, + "step": 917 + }, + { + "epoch": 0.16376772812416376, + "grad_norm": 6.393340587615967, + "learning_rate": 0.0004917721345880696, + "loss": 1.2498, + "step": 918 + }, + { + "epoch": 0.16394612434216393, + "grad_norm": 4.43874454498291, + "learning_rate": 0.0004917542984410867, + "loss": 1.3987, + "step": 919 + }, + { + "epoch": 0.16412452056016413, + "grad_norm": 2.1143059730529785, + "learning_rate": 0.0004917364433068253, + "loss": 1.4354, + "step": 920 + }, + { + "epoch": 0.1643029167781643, + "grad_norm": 2.551389455795288, + "learning_rate": 0.0004917185691866876, + "loss": 1.4108, + "step": 921 + }, + { + "epoch": 0.16448131299616447, + "grad_norm": 4.1894426345825195, + "learning_rate": 0.0004917006760820776, + "loss": 1.4468, + "step": 922 + }, + { + "epoch": 0.16465970921416467, + "grad_norm": 0.6970653533935547, + "learning_rate": 0.0004916827639944005, + "loss": 1.3746, + "step": 923 + }, + { + "epoch": 0.16483810543216484, + "grad_norm": 0.559752345085144, + "learning_rate": 0.0004916648329250631, + "loss": 1.1099, + "step": 924 + }, + { + "epoch": 0.16501650165016502, + "grad_norm": 0.6053521037101746, + "learning_rate": 0.0004916468828754737, + "loss": 1.3007, + "step": 925 + }, + { + "epoch": 0.16519489786816519, + "grad_norm": 0.5920356512069702, + "learning_rate": 0.000491628913847042, + "loss": 1.5636, + "step": 926 + }, + { + "epoch": 0.16537329408616538, + "grad_norm": 0.5610060095787048, + "learning_rate": 0.0004916109258411795, + "loss": 1.0346, + "step": 927 + }, + { + "epoch": 0.16555169030416556, + "grad_norm": 0.6758180260658264, + "learning_rate": 0.0004915929188592989, + "loss": 1.0975, + "step": 928 + }, + { + "epoch": 0.16573008652216573, + "grad_norm": 0.6995829343795776, + "learning_rate": 0.0004915748929028145, + "loss": 1.574, + "step": 929 + }, + { + "epoch": 0.1659084827401659, + "grad_norm": 0.6252689957618713, + "learning_rate": 0.0004915568479731417, + "loss": 1.3219, + "step": 930 + }, + { + "epoch": 0.1660868789581661, + "grad_norm": 0.7008538842201233, + "learning_rate": 0.0004915387840716982, + "loss": 1.3542, + "step": 931 + }, + { + "epoch": 0.16626527517616627, + "grad_norm": 0.6104601621627808, + "learning_rate": 0.0004915207011999025, + "loss": 1.2171, + "step": 932 + }, + { + "epoch": 0.16644367139416644, + "grad_norm": 0.5563907623291016, + "learning_rate": 0.0004915025993591748, + "loss": 1.2237, + "step": 933 + }, + { + "epoch": 0.16662206761216664, + "grad_norm": 0.7092035412788391, + "learning_rate": 0.000491484478550937, + "loss": 1.5328, + "step": 934 + }, + { + "epoch": 0.1668004638301668, + "grad_norm": 0.5745419263839722, + "learning_rate": 0.0004914663387766121, + "loss": 1.2546, + "step": 935 + }, + { + "epoch": 0.16697886004816698, + "grad_norm": 0.6714827418327332, + "learning_rate": 0.0004914481800376248, + "loss": 1.2575, + "step": 936 + }, + { + "epoch": 0.16715725626616715, + "grad_norm": 0.6761769652366638, + "learning_rate": 0.0004914300023354015, + "loss": 1.4926, + "step": 937 + }, + { + "epoch": 0.16733565248416735, + "grad_norm": 0.5255780220031738, + "learning_rate": 0.0004914118056713695, + "loss": 1.0914, + "step": 938 + }, + { + "epoch": 0.16751404870216752, + "grad_norm": 0.5681695938110352, + "learning_rate": 0.0004913935900469584, + "loss": 1.1869, + "step": 939 + }, + { + "epoch": 0.1676924449201677, + "grad_norm": 0.6561183929443359, + "learning_rate": 0.0004913753554635985, + "loss": 1.6089, + "step": 940 + }, + { + "epoch": 0.16787084113816786, + "grad_norm": 0.6278382539749146, + "learning_rate": 0.0004913571019227221, + "loss": 1.3546, + "step": 941 + }, + { + "epoch": 0.16804923735616806, + "grad_norm": 0.6552897095680237, + "learning_rate": 0.0004913388294257627, + "loss": 1.1408, + "step": 942 + }, + { + "epoch": 0.16822763357416823, + "grad_norm": 0.7079983353614807, + "learning_rate": 0.0004913205379741555, + "loss": 1.704, + "step": 943 + }, + { + "epoch": 0.1684060297921684, + "grad_norm": 0.8556866645812988, + "learning_rate": 0.0004913022275693372, + "loss": 1.3536, + "step": 944 + }, + { + "epoch": 0.1685844260101686, + "grad_norm": 0.6923747062683105, + "learning_rate": 0.0004912838982127456, + "loss": 1.2432, + "step": 945 + }, + { + "epoch": 0.16876282222816877, + "grad_norm": 0.6427940726280212, + "learning_rate": 0.0004912655499058207, + "loss": 1.0336, + "step": 946 + }, + { + "epoch": 0.16894121844616894, + "grad_norm": 0.7308516502380371, + "learning_rate": 0.0004912471826500032, + "loss": 1.5323, + "step": 947 + }, + { + "epoch": 0.1691196146641691, + "grad_norm": 0.6658211946487427, + "learning_rate": 0.0004912287964467358, + "loss": 1.3749, + "step": 948 + }, + { + "epoch": 0.1692980108821693, + "grad_norm": 0.5936264395713806, + "learning_rate": 0.0004912103912974626, + "loss": 1.1564, + "step": 949 + }, + { + "epoch": 0.16947640710016948, + "grad_norm": 0.6166514754295349, + "learning_rate": 0.000491191967203629, + "loss": 1.5516, + "step": 950 + }, + { + "epoch": 0.16965480331816965, + "grad_norm": 0.6150327920913696, + "learning_rate": 0.0004911735241666821, + "loss": 1.4399, + "step": 951 + }, + { + "epoch": 0.16983319953616982, + "grad_norm": 0.6488068699836731, + "learning_rate": 0.0004911550621880704, + "loss": 1.4561, + "step": 952 + }, + { + "epoch": 0.17001159575417002, + "grad_norm": 0.565139889717102, + "learning_rate": 0.0004911365812692439, + "loss": 1.3605, + "step": 953 + }, + { + "epoch": 0.1701899919721702, + "grad_norm": 0.5767154693603516, + "learning_rate": 0.0004911180814116541, + "loss": 1.2484, + "step": 954 + }, + { + "epoch": 0.17036838819017036, + "grad_norm": 0.6060677766799927, + "learning_rate": 0.0004910995626167539, + "loss": 1.1449, + "step": 955 + }, + { + "epoch": 0.17054678440817056, + "grad_norm": 0.5569210648536682, + "learning_rate": 0.0004910810248859979, + "loss": 1.2146, + "step": 956 + }, + { + "epoch": 0.17072518062617073, + "grad_norm": 1.7405076026916504, + "learning_rate": 0.0004910624682208418, + "loss": 1.1172, + "step": 957 + }, + { + "epoch": 0.1709035768441709, + "grad_norm": 0.6181856393814087, + "learning_rate": 0.0004910438926227433, + "loss": 1.3699, + "step": 958 + }, + { + "epoch": 0.17108197306217107, + "grad_norm": 0.6462327837944031, + "learning_rate": 0.0004910252980931611, + "loss": 1.3112, + "step": 959 + }, + { + "epoch": 0.17126036928017127, + "grad_norm": 0.5451391935348511, + "learning_rate": 0.0004910066846335558, + "loss": 1.1995, + "step": 960 + }, + { + "epoch": 0.17143876549817144, + "grad_norm": 0.5842812061309814, + "learning_rate": 0.0004909880522453891, + "loss": 1.2324, + "step": 961 + }, + { + "epoch": 0.1716171617161716, + "grad_norm": 0.56785649061203, + "learning_rate": 0.0004909694009301247, + "loss": 1.0771, + "step": 962 + }, + { + "epoch": 0.17179555793417178, + "grad_norm": 0.6656644344329834, + "learning_rate": 0.000490950730689227, + "loss": 1.3911, + "step": 963 + }, + { + "epoch": 0.17197395415217198, + "grad_norm": 0.6386826038360596, + "learning_rate": 0.0004909320415241627, + "loss": 1.1475, + "step": 964 + }, + { + "epoch": 0.17215235037017215, + "grad_norm": 0.6195803284645081, + "learning_rate": 0.0004909133334363996, + "loss": 1.3983, + "step": 965 + }, + { + "epoch": 0.17233074658817232, + "grad_norm": 0.6595250368118286, + "learning_rate": 0.000490894606427407, + "loss": 1.1298, + "step": 966 + }, + { + "epoch": 0.17250914280617252, + "grad_norm": 0.5828666090965271, + "learning_rate": 0.0004908758604986555, + "loss": 1.0375, + "step": 967 + }, + { + "epoch": 0.1726875390241727, + "grad_norm": 0.5903612375259399, + "learning_rate": 0.0004908570956516176, + "loss": 1.3681, + "step": 968 + }, + { + "epoch": 0.17286593524217286, + "grad_norm": 0.587887167930603, + "learning_rate": 0.0004908383118877672, + "loss": 1.1854, + "step": 969 + }, + { + "epoch": 0.17304433146017303, + "grad_norm": 0.5458610653877258, + "learning_rate": 0.0004908195092085794, + "loss": 1.3502, + "step": 970 + }, + { + "epoch": 0.17322272767817323, + "grad_norm": 0.5689892768859863, + "learning_rate": 0.0004908006876155309, + "loss": 1.072, + "step": 971 + }, + { + "epoch": 0.1734011238961734, + "grad_norm": 0.5631921887397766, + "learning_rate": 0.0004907818471101, + "loss": 1.1866, + "step": 972 + }, + { + "epoch": 0.17357952011417357, + "grad_norm": 2.34973406791687, + "learning_rate": 0.0004907629876937665, + "loss": 1.3553, + "step": 973 + }, + { + "epoch": 0.17375791633217375, + "grad_norm": 0.5664592385292053, + "learning_rate": 0.0004907441093680115, + "loss": 1.2058, + "step": 974 + }, + { + "epoch": 0.17393631255017394, + "grad_norm": 0.6143642663955688, + "learning_rate": 0.0004907252121343178, + "loss": 1.4795, + "step": 975 + }, + { + "epoch": 0.17411470876817411, + "grad_norm": 0.5856114625930786, + "learning_rate": 0.0004907062959941695, + "loss": 1.2277, + "step": 976 + }, + { + "epoch": 0.17429310498617429, + "grad_norm": 0.5994637608528137, + "learning_rate": 0.0004906873609490523, + "loss": 1.3448, + "step": 977 + }, + { + "epoch": 0.17447150120417448, + "grad_norm": 0.5852928161621094, + "learning_rate": 0.0004906684070004534, + "loss": 1.1422, + "step": 978 + }, + { + "epoch": 0.17464989742217465, + "grad_norm": 1.4769107103347778, + "learning_rate": 0.0004906494341498614, + "loss": 1.351, + "step": 979 + }, + { + "epoch": 0.17482829364017483, + "grad_norm": 0.6843876838684082, + "learning_rate": 0.0004906304423987663, + "loss": 1.1986, + "step": 980 + }, + { + "epoch": 0.175006689858175, + "grad_norm": 0.7055404782295227, + "learning_rate": 0.0004906114317486599, + "loss": 1.3418, + "step": 981 + }, + { + "epoch": 0.1751850860761752, + "grad_norm": 0.6515609622001648, + "learning_rate": 0.000490592402201035, + "loss": 1.011, + "step": 982 + }, + { + "epoch": 0.17536348229417537, + "grad_norm": 0.6670236587524414, + "learning_rate": 0.0004905733537573867, + "loss": 1.2945, + "step": 983 + }, + { + "epoch": 0.17554187851217554, + "grad_norm": 0.6488009691238403, + "learning_rate": 0.0004905542864192104, + "loss": 1.3415, + "step": 984 + }, + { + "epoch": 0.1757202747301757, + "grad_norm": 1.1373000144958496, + "learning_rate": 0.0004905352001880041, + "loss": 1.562, + "step": 985 + }, + { + "epoch": 0.1758986709481759, + "grad_norm": 0.5701956748962402, + "learning_rate": 0.0004905160950652667, + "loss": 1.0377, + "step": 986 + }, + { + "epoch": 0.17607706716617608, + "grad_norm": 2.7552855014801025, + "learning_rate": 0.0004904969710524987, + "loss": 1.2074, + "step": 987 + }, + { + "epoch": 0.17625546338417625, + "grad_norm": 0.6393254995346069, + "learning_rate": 0.0004904778281512021, + "loss": 1.1729, + "step": 988 + }, + { + "epoch": 0.17643385960217645, + "grad_norm": 0.6372584104537964, + "learning_rate": 0.0004904586663628804, + "loss": 1.2313, + "step": 989 + }, + { + "epoch": 0.17661225582017662, + "grad_norm": 0.6509175896644592, + "learning_rate": 0.0004904394856890384, + "loss": 1.3779, + "step": 990 + }, + { + "epoch": 0.1767906520381768, + "grad_norm": 0.668499231338501, + "learning_rate": 0.0004904202861311827, + "loss": 1.3286, + "step": 991 + }, + { + "epoch": 0.17696904825617696, + "grad_norm": 0.6670610308647156, + "learning_rate": 0.0004904010676908213, + "loss": 1.3467, + "step": 992 + }, + { + "epoch": 0.17714744447417716, + "grad_norm": 0.6278952956199646, + "learning_rate": 0.0004903818303694633, + "loss": 1.2545, + "step": 993 + }, + { + "epoch": 0.17732584069217733, + "grad_norm": 0.6193589568138123, + "learning_rate": 0.0004903625741686199, + "loss": 1.2678, + "step": 994 + }, + { + "epoch": 0.1775042369101775, + "grad_norm": 0.6413539052009583, + "learning_rate": 0.0004903432990898033, + "loss": 1.5118, + "step": 995 + }, + { + "epoch": 0.17768263312817767, + "grad_norm": 1.0503791570663452, + "learning_rate": 0.0004903240051345276, + "loss": 1.3905, + "step": 996 + }, + { + "epoch": 0.17786102934617787, + "grad_norm": 0.5560838580131531, + "learning_rate": 0.0004903046923043077, + "loss": 0.9987, + "step": 997 + }, + { + "epoch": 0.17803942556417804, + "grad_norm": 0.7070960402488708, + "learning_rate": 0.0004902853606006609, + "loss": 1.3521, + "step": 998 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.7161464691162109, + "learning_rate": 0.0004902660100251051, + "loss": 1.1022, + "step": 999 + }, + { + "epoch": 0.1783962180001784, + "grad_norm": 0.6511627435684204, + "learning_rate": 0.0004902466405791604, + "loss": 1.284, + "step": 1000 + }, + { + "epoch": 0.17857461421817858, + "grad_norm": 0.6245794892311096, + "learning_rate": 0.0004902272522643478, + "loss": 1.1996, + "step": 1001 + }, + { + "epoch": 0.17875301043617875, + "grad_norm": 0.6342217922210693, + "learning_rate": 0.0004902078450821904, + "loss": 1.2702, + "step": 1002 + }, + { + "epoch": 0.17893140665417892, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0004901884190342121, + "loss": 1.2978, + "step": 1003 + }, + { + "epoch": 0.17910980287217912, + "grad_norm": 0.653351902961731, + "learning_rate": 0.0004901689741219386, + "loss": 1.4452, + "step": 1004 + }, + { + "epoch": 0.1792881990901793, + "grad_norm": 0.6309232115745544, + "learning_rate": 0.0004901495103468974, + "loss": 1.4723, + "step": 1005 + }, + { + "epoch": 0.17946659530817946, + "grad_norm": 0.617276668548584, + "learning_rate": 0.000490130027710617, + "loss": 1.2532, + "step": 1006 + }, + { + "epoch": 0.17964499152617963, + "grad_norm": 0.599210798740387, + "learning_rate": 0.0004901105262146275, + "loss": 1.2724, + "step": 1007 + }, + { + "epoch": 0.17982338774417983, + "grad_norm": 0.9026014804840088, + "learning_rate": 0.0004900910058604606, + "loss": 1.2691, + "step": 1008 + }, + { + "epoch": 0.18000178396218, + "grad_norm": 0.7611701488494873, + "learning_rate": 0.0004900714666496494, + "loss": 1.2221, + "step": 1009 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 0.6050606966018677, + "learning_rate": 0.0004900519085837285, + "loss": 1.2423, + "step": 1010 + }, + { + "epoch": 0.18035857639818037, + "grad_norm": 0.6327321529388428, + "learning_rate": 0.0004900323316642341, + "loss": 1.5898, + "step": 1011 + }, + { + "epoch": 0.18053697261618054, + "grad_norm": 3.784043788909912, + "learning_rate": 0.0004900127358927036, + "loss": 1.3516, + "step": 1012 + }, + { + "epoch": 0.1807153688341807, + "grad_norm": 1.5190318822860718, + "learning_rate": 0.0004899931212706761, + "loss": 1.2567, + "step": 1013 + }, + { + "epoch": 0.18089376505218088, + "grad_norm": 0.7018091082572937, + "learning_rate": 0.0004899734877996922, + "loss": 1.3094, + "step": 1014 + }, + { + "epoch": 0.18107216127018108, + "grad_norm": 1.1731371879577637, + "learning_rate": 0.0004899538354812937, + "loss": 1.3155, + "step": 1015 + }, + { + "epoch": 0.18125055748818125, + "grad_norm": 0.6494358777999878, + "learning_rate": 0.0004899341643170243, + "loss": 1.2901, + "step": 1016 + }, + { + "epoch": 0.18142895370618142, + "grad_norm": 0.6760775446891785, + "learning_rate": 0.0004899144743084289, + "loss": 1.3969, + "step": 1017 + }, + { + "epoch": 0.1816073499241816, + "grad_norm": 0.6156429648399353, + "learning_rate": 0.000489894765457054, + "loss": 1.1901, + "step": 1018 + }, + { + "epoch": 0.1817857461421818, + "grad_norm": 0.9414156079292297, + "learning_rate": 0.0004898750377644474, + "loss": 1.1387, + "step": 1019 + }, + { + "epoch": 0.18196414236018196, + "grad_norm": 0.6034333109855652, + "learning_rate": 0.0004898552912321586, + "loss": 1.3314, + "step": 1020 + }, + { + "epoch": 0.18214253857818213, + "grad_norm": 0.5967845916748047, + "learning_rate": 0.0004898355258617383, + "loss": 1.3435, + "step": 1021 + }, + { + "epoch": 0.18232093479618233, + "grad_norm": 0.6214697957038879, + "learning_rate": 0.000489815741654739, + "loss": 1.1333, + "step": 1022 + }, + { + "epoch": 0.1824993310141825, + "grad_norm": 0.7541596293449402, + "learning_rate": 0.0004897959386127148, + "loss": 1.1722, + "step": 1023 + }, + { + "epoch": 0.18267772723218267, + "grad_norm": 0.6246985793113708, + "learning_rate": 0.0004897761167372205, + "loss": 1.6439, + "step": 1024 + }, + { + "epoch": 0.18285612345018284, + "grad_norm": 2.4204161167144775, + "learning_rate": 0.0004897562760298134, + "loss": 1.442, + "step": 1025 + }, + { + "epoch": 0.18303451966818304, + "grad_norm": 8.560677528381348, + "learning_rate": 0.0004897364164920514, + "loss": 1.6211, + "step": 1026 + }, + { + "epoch": 0.18321291588618321, + "grad_norm": 0.8490382432937622, + "learning_rate": 0.0004897165381254945, + "loss": 1.2842, + "step": 1027 + }, + { + "epoch": 0.18339131210418338, + "grad_norm": 0.6359208226203918, + "learning_rate": 0.0004896966409317038, + "loss": 1.1782, + "step": 1028 + }, + { + "epoch": 0.18356970832218358, + "grad_norm": 0.6399941444396973, + "learning_rate": 0.0004896767249122421, + "loss": 1.1316, + "step": 1029 + }, + { + "epoch": 0.18374810454018375, + "grad_norm": 0.680789589881897, + "learning_rate": 0.0004896567900686736, + "loss": 1.2123, + "step": 1030 + }, + { + "epoch": 0.18392650075818393, + "grad_norm": 0.7316074371337891, + "learning_rate": 0.0004896368364025639, + "loss": 1.4407, + "step": 1031 + }, + { + "epoch": 0.1841048969761841, + "grad_norm": 1.80794095993042, + "learning_rate": 0.0004896168639154802, + "loss": 1.3116, + "step": 1032 + }, + { + "epoch": 0.1842832931941843, + "grad_norm": 1.869382619857788, + "learning_rate": 0.000489596872608991, + "loss": 1.3759, + "step": 1033 + }, + { + "epoch": 0.18446168941218447, + "grad_norm": 0.9936087727546692, + "learning_rate": 0.0004895768624846667, + "loss": 1.4019, + "step": 1034 + }, + { + "epoch": 0.18464008563018464, + "grad_norm": 1.2543047666549683, + "learning_rate": 0.0004895568335440786, + "loss": 1.2554, + "step": 1035 + }, + { + "epoch": 0.1848184818481848, + "grad_norm": 1.1524609327316284, + "learning_rate": 0.0004895367857888, + "loss": 1.1582, + "step": 1036 + }, + { + "epoch": 0.184996878066185, + "grad_norm": 0.8680905103683472, + "learning_rate": 0.0004895167192204053, + "loss": 1.4434, + "step": 1037 + }, + { + "epoch": 0.18517527428418518, + "grad_norm": 0.7361879348754883, + "learning_rate": 0.0004894966338404705, + "loss": 1.3415, + "step": 1038 + }, + { + "epoch": 0.18535367050218535, + "grad_norm": 0.6387485861778259, + "learning_rate": 0.0004894765296505732, + "loss": 1.2213, + "step": 1039 + }, + { + "epoch": 0.18553206672018555, + "grad_norm": 2.19881534576416, + "learning_rate": 0.0004894564066522923, + "loss": 1.1374, + "step": 1040 + }, + { + "epoch": 0.18571046293818572, + "grad_norm": 0.7040894627571106, + "learning_rate": 0.0004894362648472082, + "loss": 1.3711, + "step": 1041 + }, + { + "epoch": 0.1858888591561859, + "grad_norm": 0.6495254039764404, + "learning_rate": 0.000489416104236903, + "loss": 1.106, + "step": 1042 + }, + { + "epoch": 0.18606725537418606, + "grad_norm": 0.832444429397583, + "learning_rate": 0.00048939592482296, + "loss": 1.3915, + "step": 1043 + }, + { + "epoch": 0.18624565159218626, + "grad_norm": 0.5508606433868408, + "learning_rate": 0.000489375726606964, + "loss": 0.9048, + "step": 1044 + }, + { + "epoch": 0.18642404781018643, + "grad_norm": 0.6425449848175049, + "learning_rate": 0.0004893555095905013, + "loss": 1.3206, + "step": 1045 + }, + { + "epoch": 0.1866024440281866, + "grad_norm": 0.5754274129867554, + "learning_rate": 0.0004893352737751601, + "loss": 1.2648, + "step": 1046 + }, + { + "epoch": 0.18678084024618677, + "grad_norm": 0.5647794604301453, + "learning_rate": 0.0004893150191625295, + "loss": 1.171, + "step": 1047 + }, + { + "epoch": 0.18695923646418697, + "grad_norm": 0.6045464277267456, + "learning_rate": 0.0004892947457542002, + "loss": 0.9138, + "step": 1048 + }, + { + "epoch": 0.18713763268218714, + "grad_norm": 0.7438944578170776, + "learning_rate": 0.0004892744535517646, + "loss": 1.4838, + "step": 1049 + }, + { + "epoch": 0.1873160289001873, + "grad_norm": 0.5550808310508728, + "learning_rate": 0.0004892541425568163, + "loss": 1.2546, + "step": 1050 + }, + { + "epoch": 0.1874944251181875, + "grad_norm": 0.574200451374054, + "learning_rate": 0.0004892338127709507, + "loss": 1.2506, + "step": 1051 + }, + { + "epoch": 0.18767282133618768, + "grad_norm": 1.666972041130066, + "learning_rate": 0.0004892134641957642, + "loss": 1.389, + "step": 1052 + }, + { + "epoch": 0.18785121755418785, + "grad_norm": 0.6545609831809998, + "learning_rate": 0.0004891930968328554, + "loss": 1.2801, + "step": 1053 + }, + { + "epoch": 0.18802961377218802, + "grad_norm": 1.3686062097549438, + "learning_rate": 0.0004891727106838236, + "loss": 1.2763, + "step": 1054 + }, + { + "epoch": 0.18820800999018822, + "grad_norm": 1.248139500617981, + "learning_rate": 0.0004891523057502701, + "loss": 1.4209, + "step": 1055 + }, + { + "epoch": 0.1883864062081884, + "grad_norm": 0.7310791611671448, + "learning_rate": 0.0004891318820337974, + "loss": 1.4907, + "step": 1056 + }, + { + "epoch": 0.18856480242618856, + "grad_norm": 0.6431208848953247, + "learning_rate": 0.0004891114395360096, + "loss": 1.5441, + "step": 1057 + }, + { + "epoch": 0.18874319864418873, + "grad_norm": 2.24438738822937, + "learning_rate": 0.0004890909782585121, + "loss": 1.2141, + "step": 1058 + }, + { + "epoch": 0.18892159486218893, + "grad_norm": 4.204579830169678, + "learning_rate": 0.0004890704982029122, + "loss": 1.4053, + "step": 1059 + }, + { + "epoch": 0.1890999910801891, + "grad_norm": 0.8106665015220642, + "learning_rate": 0.0004890499993708182, + "loss": 1.1461, + "step": 1060 + }, + { + "epoch": 0.18927838729818927, + "grad_norm": 0.7268997430801392, + "learning_rate": 0.0004890294817638401, + "loss": 1.1508, + "step": 1061 + }, + { + "epoch": 0.18945678351618947, + "grad_norm": 0.9513046741485596, + "learning_rate": 0.0004890089453835894, + "loss": 1.2753, + "step": 1062 + }, + { + "epoch": 0.18963517973418964, + "grad_norm": 0.7538788318634033, + "learning_rate": 0.000488988390231679, + "loss": 1.088, + "step": 1063 + }, + { + "epoch": 0.1898135759521898, + "grad_norm": 0.6763918399810791, + "learning_rate": 0.0004889678163097233, + "loss": 1.1825, + "step": 1064 + }, + { + "epoch": 0.18999197217018998, + "grad_norm": 0.7881626486778259, + "learning_rate": 0.0004889472236193381, + "loss": 1.3298, + "step": 1065 + }, + { + "epoch": 0.19017036838819018, + "grad_norm": 0.6589952111244202, + "learning_rate": 0.0004889266121621407, + "loss": 1.2218, + "step": 1066 + }, + { + "epoch": 0.19034876460619035, + "grad_norm": 0.7166324853897095, + "learning_rate": 0.0004889059819397501, + "loss": 1.5962, + "step": 1067 + }, + { + "epoch": 0.19052716082419052, + "grad_norm": 0.6386669874191284, + "learning_rate": 0.0004888853329537865, + "loss": 1.2342, + "step": 1068 + }, + { + "epoch": 0.1907055570421907, + "grad_norm": 0.7325186729431152, + "learning_rate": 0.0004888646652058716, + "loss": 1.2617, + "step": 1069 + }, + { + "epoch": 0.1908839532601909, + "grad_norm": 0.5728015899658203, + "learning_rate": 0.0004888439786976287, + "loss": 1.3414, + "step": 1070 + }, + { + "epoch": 0.19106234947819106, + "grad_norm": 0.6659836769104004, + "learning_rate": 0.0004888232734306825, + "loss": 1.2854, + "step": 1071 + }, + { + "epoch": 0.19124074569619123, + "grad_norm": 0.7242840528488159, + "learning_rate": 0.0004888025494066592, + "loss": 1.4503, + "step": 1072 + }, + { + "epoch": 0.19141914191419143, + "grad_norm": 0.5824533700942993, + "learning_rate": 0.0004887818066271864, + "loss": 1.3502, + "step": 1073 + }, + { + "epoch": 0.1915975381321916, + "grad_norm": 0.6332762241363525, + "learning_rate": 0.0004887610450938932, + "loss": 1.0815, + "step": 1074 + }, + { + "epoch": 0.19177593435019177, + "grad_norm": 1.1359984874725342, + "learning_rate": 0.0004887402648084104, + "loss": 1.2975, + "step": 1075 + }, + { + "epoch": 0.19195433056819194, + "grad_norm": 0.8233804106712341, + "learning_rate": 0.0004887194657723699, + "loss": 1.1015, + "step": 1076 + }, + { + "epoch": 0.19213272678619214, + "grad_norm": 0.6143079996109009, + "learning_rate": 0.0004886986479874052, + "loss": 1.2998, + "step": 1077 + }, + { + "epoch": 0.1923111230041923, + "grad_norm": 1.4513376951217651, + "learning_rate": 0.0004886778114551515, + "loss": 1.3469, + "step": 1078 + }, + { + "epoch": 0.19248951922219248, + "grad_norm": 0.6782541275024414, + "learning_rate": 0.0004886569561772452, + "loss": 1.1655, + "step": 1079 + }, + { + "epoch": 0.19266791544019266, + "grad_norm": 0.6781834959983826, + "learning_rate": 0.0004886360821553242, + "loss": 1.4877, + "step": 1080 + }, + { + "epoch": 0.19284631165819285, + "grad_norm": 0.6629863977432251, + "learning_rate": 0.000488615189391028, + "loss": 1.5273, + "step": 1081 + }, + { + "epoch": 0.19302470787619302, + "grad_norm": 0.5616167783737183, + "learning_rate": 0.0004885942778859976, + "loss": 1.1904, + "step": 1082 + }, + { + "epoch": 0.1932031040941932, + "grad_norm": 0.5528833270072937, + "learning_rate": 0.0004885733476418752, + "loss": 0.9634, + "step": 1083 + }, + { + "epoch": 0.1933815003121934, + "grad_norm": 0.7818634510040283, + "learning_rate": 0.0004885523986603048, + "loss": 1.3033, + "step": 1084 + }, + { + "epoch": 0.19355989653019356, + "grad_norm": 0.6581936478614807, + "learning_rate": 0.0004885314309429316, + "loss": 1.4896, + "step": 1085 + }, + { + "epoch": 0.19373829274819374, + "grad_norm": 0.652714192867279, + "learning_rate": 0.0004885104444914026, + "loss": 1.2453, + "step": 1086 + }, + { + "epoch": 0.1939166889661939, + "grad_norm": 3.307530641555786, + "learning_rate": 0.0004884894393073658, + "loss": 1.173, + "step": 1087 + }, + { + "epoch": 0.1940950851841941, + "grad_norm": 0.685991644859314, + "learning_rate": 0.0004884684153924711, + "loss": 0.9765, + "step": 1088 + }, + { + "epoch": 0.19427348140219428, + "grad_norm": 0.678576648235321, + "learning_rate": 0.0004884473727483697, + "loss": 1.1766, + "step": 1089 + }, + { + "epoch": 0.19445187762019445, + "grad_norm": 0.7492129802703857, + "learning_rate": 0.0004884263113767143, + "loss": 1.4711, + "step": 1090 + }, + { + "epoch": 0.19463027383819462, + "grad_norm": 0.6048595309257507, + "learning_rate": 0.0004884052312791588, + "loss": 1.0355, + "step": 1091 + }, + { + "epoch": 0.19480867005619482, + "grad_norm": 0.6212369203567505, + "learning_rate": 0.0004883841324573592, + "loss": 1.2895, + "step": 1092 + }, + { + "epoch": 0.194987066274195, + "grad_norm": 0.5634220838546753, + "learning_rate": 0.0004883630149129725, + "loss": 0.9151, + "step": 1093 + }, + { + "epoch": 0.19516546249219516, + "grad_norm": 0.6526708602905273, + "learning_rate": 0.000488341878647657, + "loss": 1.3422, + "step": 1094 + }, + { + "epoch": 0.19534385871019536, + "grad_norm": 0.5777380466461182, + "learning_rate": 0.000488320723663073, + "loss": 1.3257, + "step": 1095 + }, + { + "epoch": 0.19552225492819553, + "grad_norm": 0.9771785736083984, + "learning_rate": 0.0004882995499608819, + "loss": 1.4458, + "step": 1096 + }, + { + "epoch": 0.1957006511461957, + "grad_norm": 0.942666232585907, + "learning_rate": 0.00048827835754274674, + "loss": 1.3359, + "step": 1097 + }, + { + "epoch": 0.19587904736419587, + "grad_norm": 0.6071109175682068, + "learning_rate": 0.00048825714641033185, + "loss": 1.2979, + "step": 1098 + }, + { + "epoch": 0.19605744358219607, + "grad_norm": 0.9285879135131836, + "learning_rate": 0.0004882359165653033, + "loss": 1.2042, + "step": 1099 + }, + { + "epoch": 0.19623583980019624, + "grad_norm": 0.6125981211662292, + "learning_rate": 0.00048821466800932835, + "loss": 1.2767, + "step": 1100 + }, + { + "epoch": 0.1964142360181964, + "grad_norm": 0.549712061882019, + "learning_rate": 0.00048819340074407583, + "loss": 1.0559, + "step": 1101 + }, + { + "epoch": 0.19659263223619658, + "grad_norm": 0.6521591544151306, + "learning_rate": 0.00048817211477121617, + "loss": 1.0833, + "step": 1102 + }, + { + "epoch": 0.19677102845419678, + "grad_norm": 0.6663747429847717, + "learning_rate": 0.0004881508100924211, + "loss": 1.4768, + "step": 1103 + }, + { + "epoch": 0.19694942467219695, + "grad_norm": 1.3679718971252441, + "learning_rate": 0.00048812948670936385, + "loss": 1.43, + "step": 1104 + }, + { + "epoch": 0.19712782089019712, + "grad_norm": 0.7124596238136292, + "learning_rate": 0.0004881081446237192, + "loss": 1.0491, + "step": 1105 + }, + { + "epoch": 0.19730621710819732, + "grad_norm": 0.6204227209091187, + "learning_rate": 0.0004880867838371634, + "loss": 1.2674, + "step": 1106 + }, + { + "epoch": 0.1974846133261975, + "grad_norm": 1.057226300239563, + "learning_rate": 0.00048806540435137404, + "loss": 1.2598, + "step": 1107 + }, + { + "epoch": 0.19766300954419766, + "grad_norm": 0.7484321594238281, + "learning_rate": 0.00048804400616803026, + "loss": 1.2401, + "step": 1108 + }, + { + "epoch": 0.19784140576219783, + "grad_norm": 12.229165077209473, + "learning_rate": 0.0004880225892888126, + "loss": 1.2016, + "step": 1109 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.7973271608352661, + "learning_rate": 0.0004880011537154033, + "loss": 1.4108, + "step": 1110 + }, + { + "epoch": 0.1981981981981982, + "grad_norm": 1.3274434804916382, + "learning_rate": 0.0004879796994494858, + "loss": 1.013, + "step": 1111 + }, + { + "epoch": 0.19837659441619837, + "grad_norm": 0.8325060606002808, + "learning_rate": 0.00048795822649274506, + "loss": 1.3261, + "step": 1112 + }, + { + "epoch": 0.19855499063419854, + "grad_norm": 0.9182140231132507, + "learning_rate": 0.0004879367348468676, + "loss": 1.1414, + "step": 1113 + }, + { + "epoch": 0.19873338685219874, + "grad_norm": 0.7945131063461304, + "learning_rate": 0.0004879152245135415, + "loss": 1.3566, + "step": 1114 + }, + { + "epoch": 0.1989117830701989, + "grad_norm": 0.7313281297683716, + "learning_rate": 0.00048789369549445596, + "loss": 1.2533, + "step": 1115 + }, + { + "epoch": 0.19909017928819908, + "grad_norm": 0.7198520302772522, + "learning_rate": 0.00048787214779130196, + "loss": 1.4449, + "step": 1116 + }, + { + "epoch": 0.19926857550619928, + "grad_norm": 0.6107659935951233, + "learning_rate": 0.00048785058140577185, + "loss": 1.1473, + "step": 1117 + }, + { + "epoch": 0.19944697172419945, + "grad_norm": 0.6234009861946106, + "learning_rate": 0.0004878289963395594, + "loss": 1.0506, + "step": 1118 + }, + { + "epoch": 0.19962536794219962, + "grad_norm": 0.6401305198669434, + "learning_rate": 0.0004878073925943599, + "loss": 1.2247, + "step": 1119 + }, + { + "epoch": 0.1998037641601998, + "grad_norm": 0.6568447947502136, + "learning_rate": 0.0004877857701718702, + "loss": 1.2032, + "step": 1120 + }, + { + "epoch": 0.1999821603782, + "grad_norm": 0.6193853616714478, + "learning_rate": 0.0004877641290737884, + "loss": 1.2807, + "step": 1121 + }, + { + "epoch": 0.20016055659620016, + "grad_norm": 0.605709969997406, + "learning_rate": 0.0004877424693018142, + "loss": 1.3687, + "step": 1122 + }, + { + "epoch": 0.20033895281420033, + "grad_norm": 0.5979405045509338, + "learning_rate": 0.0004877207908576488, + "loss": 1.1466, + "step": 1123 + }, + { + "epoch": 0.2005173490322005, + "grad_norm": 0.5929616093635559, + "learning_rate": 0.00048769909374299483, + "loss": 1.3955, + "step": 1124 + }, + { + "epoch": 0.2006957452502007, + "grad_norm": 0.6005091071128845, + "learning_rate": 0.00048767737795955623, + "loss": 1.468, + "step": 1125 + }, + { + "epoch": 0.20087414146820087, + "grad_norm": 0.5641143918037415, + "learning_rate": 0.0004876556435090387, + "loss": 1.495, + "step": 1126 + }, + { + "epoch": 0.20105253768620104, + "grad_norm": 0.6153658628463745, + "learning_rate": 0.0004876338903931492, + "loss": 1.1535, + "step": 1127 + }, + { + "epoch": 0.20123093390420124, + "grad_norm": 0.5413753390312195, + "learning_rate": 0.0004876121186135962, + "loss": 1.002, + "step": 1128 + }, + { + "epoch": 0.2014093301222014, + "grad_norm": 0.573294460773468, + "learning_rate": 0.00048759032817208964, + "loss": 1.3631, + "step": 1129 + }, + { + "epoch": 0.20158772634020158, + "grad_norm": 0.569232702255249, + "learning_rate": 0.00048756851907034093, + "loss": 1.105, + "step": 1130 + }, + { + "epoch": 0.20176612255820175, + "grad_norm": 0.6847321391105652, + "learning_rate": 0.000487546691310063, + "loss": 1.1282, + "step": 1131 + }, + { + "epoch": 0.20194451877620195, + "grad_norm": 0.7410063743591309, + "learning_rate": 0.00048752484489297023, + "loss": 1.2903, + "step": 1132 + }, + { + "epoch": 0.20212291499420212, + "grad_norm": 0.6502756476402283, + "learning_rate": 0.0004875029798207783, + "loss": 1.4029, + "step": 1133 + }, + { + "epoch": 0.2023013112122023, + "grad_norm": 0.569385826587677, + "learning_rate": 0.0004874810960952045, + "loss": 1.2896, + "step": 1134 + }, + { + "epoch": 0.2024797074302025, + "grad_norm": 0.5342479348182678, + "learning_rate": 0.00048745919371796765, + "loss": 1.1572, + "step": 1135 + }, + { + "epoch": 0.20265810364820266, + "grad_norm": 0.750543475151062, + "learning_rate": 0.0004874372726907879, + "loss": 1.3087, + "step": 1136 + }, + { + "epoch": 0.20283649986620284, + "grad_norm": 0.6126813292503357, + "learning_rate": 0.00048741533301538685, + "loss": 1.4071, + "step": 1137 + }, + { + "epoch": 0.203014896084203, + "grad_norm": 0.648357093334198, + "learning_rate": 0.00048739337469348785, + "loss": 1.3212, + "step": 1138 + }, + { + "epoch": 0.2031932923022032, + "grad_norm": 0.6305087208747864, + "learning_rate": 0.00048737139772681525, + "loss": 1.1886, + "step": 1139 + }, + { + "epoch": 0.20337168852020338, + "grad_norm": 0.5908694863319397, + "learning_rate": 0.0004873494021170953, + "loss": 1.1184, + "step": 1140 + }, + { + "epoch": 0.20355008473820355, + "grad_norm": 0.6326526999473572, + "learning_rate": 0.0004873273878660555, + "loss": 1.358, + "step": 1141 + }, + { + "epoch": 0.20372848095620372, + "grad_norm": 0.9167863130569458, + "learning_rate": 0.00048730535497542465, + "loss": 1.3994, + "step": 1142 + }, + { + "epoch": 0.20390687717420392, + "grad_norm": 0.5850123167037964, + "learning_rate": 0.0004872833034469334, + "loss": 1.4304, + "step": 1143 + }, + { + "epoch": 0.2040852733922041, + "grad_norm": 0.5937606692314148, + "learning_rate": 0.00048726123328231367, + "loss": 1.1403, + "step": 1144 + }, + { + "epoch": 0.20426366961020426, + "grad_norm": 0.6057407855987549, + "learning_rate": 0.00048723914448329863, + "loss": 1.5015, + "step": 1145 + }, + { + "epoch": 0.20444206582820446, + "grad_norm": 0.5308231711387634, + "learning_rate": 0.0004872170370516234, + "loss": 1.2105, + "step": 1146 + }, + { + "epoch": 0.20462046204620463, + "grad_norm": 1.8873573541641235, + "learning_rate": 0.0004871949109890241, + "loss": 1.3811, + "step": 1147 + }, + { + "epoch": 0.2047988582642048, + "grad_norm": 0.6975631713867188, + "learning_rate": 0.0004871727662972386, + "loss": 1.331, + "step": 1148 + }, + { + "epoch": 0.20497725448220497, + "grad_norm": 4.755294322967529, + "learning_rate": 0.00048715060297800606, + "loss": 1.3582, + "step": 1149 + }, + { + "epoch": 0.20515565070020517, + "grad_norm": 5.158052921295166, + "learning_rate": 0.0004871284210330673, + "loss": 1.445, + "step": 1150 + }, + { + "epoch": 0.20533404691820534, + "grad_norm": 9.641525268554688, + "learning_rate": 0.0004871062204641643, + "loss": 1.25, + "step": 1151 + }, + { + "epoch": 0.2055124431362055, + "grad_norm": 1.7820740938186646, + "learning_rate": 0.00048708400127304085, + "loss": 1.4857, + "step": 1152 + }, + { + "epoch": 0.20569083935420568, + "grad_norm": 1.1601697206497192, + "learning_rate": 0.0004870617634614419, + "loss": 1.3956, + "step": 1153 + }, + { + "epoch": 0.20586923557220588, + "grad_norm": 0.9471563100814819, + "learning_rate": 0.0004870395070311141, + "loss": 1.2032, + "step": 1154 + }, + { + "epoch": 0.20604763179020605, + "grad_norm": 1.270116925239563, + "learning_rate": 0.00048701723198380545, + "loss": 1.5996, + "step": 1155 + }, + { + "epoch": 0.20622602800820622, + "grad_norm": 1.2946140766143799, + "learning_rate": 0.00048699493832126534, + "loss": 1.2046, + "step": 1156 + }, + { + "epoch": 0.20640442422620642, + "grad_norm": 0.6727069020271301, + "learning_rate": 0.00048697262604524485, + "loss": 1.1865, + "step": 1157 + }, + { + "epoch": 0.2065828204442066, + "grad_norm": 0.7931843400001526, + "learning_rate": 0.00048695029515749615, + "loss": 1.2591, + "step": 1158 + }, + { + "epoch": 0.20676121666220676, + "grad_norm": 0.7511464953422546, + "learning_rate": 0.0004869279456597733, + "loss": 1.2946, + "step": 1159 + }, + { + "epoch": 0.20693961288020693, + "grad_norm": 0.6185610294342041, + "learning_rate": 0.00048690557755383157, + "loss": 1.0467, + "step": 1160 + }, + { + "epoch": 0.20711800909820713, + "grad_norm": 1.2515671253204346, + "learning_rate": 0.00048688319084142775, + "loss": 1.3274, + "step": 1161 + }, + { + "epoch": 0.2072964053162073, + "grad_norm": 66.89749908447266, + "learning_rate": 0.00048686078552432, + "loss": 1.4026, + "step": 1162 + }, + { + "epoch": 0.20747480153420747, + "grad_norm": 1.6944903135299683, + "learning_rate": 0.0004868383616042682, + "loss": 1.2216, + "step": 1163 + }, + { + "epoch": 0.20765319775220764, + "grad_norm": 0.8577490448951721, + "learning_rate": 0.0004868159190830333, + "loss": 1.1645, + "step": 1164 + }, + { + "epoch": 0.20783159397020784, + "grad_norm": 0.8082415461540222, + "learning_rate": 0.00048679345796237805, + "loss": 1.3516, + "step": 1165 + }, + { + "epoch": 0.208009990188208, + "grad_norm": 0.6959142088890076, + "learning_rate": 0.00048677097824406646, + "loss": 1.4433, + "step": 1166 + }, + { + "epoch": 0.20818838640620818, + "grad_norm": 0.6914120316505432, + "learning_rate": 0.0004867484799298642, + "loss": 1.2154, + "step": 1167 + }, + { + "epoch": 0.20836678262420838, + "grad_norm": 0.6086413860321045, + "learning_rate": 0.00048672596302153814, + "loss": 1.3274, + "step": 1168 + }, + { + "epoch": 0.20854517884220855, + "grad_norm": 0.6582692861557007, + "learning_rate": 0.0004867034275208569, + "loss": 1.0341, + "step": 1169 + }, + { + "epoch": 0.20872357506020872, + "grad_norm": 0.9762216210365295, + "learning_rate": 0.0004866808734295903, + "loss": 1.4215, + "step": 1170 + }, + { + "epoch": 0.2089019712782089, + "grad_norm": 45.90314483642578, + "learning_rate": 0.00048665830074950966, + "loss": 1.5744, + "step": 1171 + }, + { + "epoch": 0.2090803674962091, + "grad_norm": 2.054628372192383, + "learning_rate": 0.00048663570948238806, + "loss": 1.3446, + "step": 1172 + }, + { + "epoch": 0.20925876371420926, + "grad_norm": 0.6909850239753723, + "learning_rate": 0.00048661309962999956, + "loss": 1.378, + "step": 1173 + }, + { + "epoch": 0.20943715993220943, + "grad_norm": 0.8046495318412781, + "learning_rate": 0.00048659047119412014, + "loss": 1.2083, + "step": 1174 + }, + { + "epoch": 0.2096155561502096, + "grad_norm": 0.6526099443435669, + "learning_rate": 0.00048656782417652686, + "loss": 1.3512, + "step": 1175 + }, + { + "epoch": 0.2097939523682098, + "grad_norm": 0.6259192824363708, + "learning_rate": 0.0004865451585789985, + "loss": 1.0709, + "step": 1176 + }, + { + "epoch": 0.20997234858620997, + "grad_norm": 0.5453342795372009, + "learning_rate": 0.0004865224744033151, + "loss": 1.0345, + "step": 1177 + }, + { + "epoch": 0.21015074480421014, + "grad_norm": 0.6736587285995483, + "learning_rate": 0.0004864997716512584, + "loss": 1.1594, + "step": 1178 + }, + { + "epoch": 0.21032914102221034, + "grad_norm": 0.6084941625595093, + "learning_rate": 0.0004864770503246114, + "loss": 1.1488, + "step": 1179 + }, + { + "epoch": 0.2105075372402105, + "grad_norm": 0.6036438941955566, + "learning_rate": 0.00048645431042515866, + "loss": 1.2402, + "step": 1180 + }, + { + "epoch": 0.21068593345821068, + "grad_norm": 0.6231617331504822, + "learning_rate": 0.0004864315519546861, + "loss": 0.8676, + "step": 1181 + }, + { + "epoch": 0.21086432967621085, + "grad_norm": 0.6669137477874756, + "learning_rate": 0.00048640877491498127, + "loss": 1.3004, + "step": 1182 + }, + { + "epoch": 0.21104272589421105, + "grad_norm": 0.6060863137245178, + "learning_rate": 0.0004863859793078329, + "loss": 1.2152, + "step": 1183 + }, + { + "epoch": 0.21122112211221122, + "grad_norm": 0.5332959294319153, + "learning_rate": 0.0004863631651350315, + "loss": 1.0145, + "step": 1184 + }, + { + "epoch": 0.2113995183302114, + "grad_norm": 0.5904478430747986, + "learning_rate": 0.0004863403323983688, + "loss": 1.2652, + "step": 1185 + }, + { + "epoch": 0.21157791454821157, + "grad_norm": 0.6089318990707397, + "learning_rate": 0.0004863174810996381, + "loss": 1.2047, + "step": 1186 + }, + { + "epoch": 0.21175631076621176, + "grad_norm": 0.5423917770385742, + "learning_rate": 0.00048629461124063413, + "loss": 1.0264, + "step": 1187 + }, + { + "epoch": 0.21193470698421193, + "grad_norm": 0.5629234910011292, + "learning_rate": 0.00048627172282315304, + "loss": 1.1713, + "step": 1188 + }, + { + "epoch": 0.2121131032022121, + "grad_norm": 0.7865850925445557, + "learning_rate": 0.00048624881584899253, + "loss": 1.3266, + "step": 1189 + }, + { + "epoch": 0.2122914994202123, + "grad_norm": 0.5741007328033447, + "learning_rate": 0.00048622589031995173, + "loss": 1.0702, + "step": 1190 + }, + { + "epoch": 0.21246989563821247, + "grad_norm": 0.7023731470108032, + "learning_rate": 0.0004862029462378311, + "loss": 1.2425, + "step": 1191 + }, + { + "epoch": 0.21264829185621265, + "grad_norm": 0.6102539300918579, + "learning_rate": 0.0004861799836044328, + "loss": 1.3116, + "step": 1192 + }, + { + "epoch": 0.21282668807421282, + "grad_norm": 0.6883063316345215, + "learning_rate": 0.0004861570024215602, + "loss": 1.4091, + "step": 1193 + }, + { + "epoch": 0.21300508429221301, + "grad_norm": 0.5917825698852539, + "learning_rate": 0.00048613400269101824, + "loss": 1.3487, + "step": 1194 + }, + { + "epoch": 0.21318348051021319, + "grad_norm": 0.5880262851715088, + "learning_rate": 0.00048611098441461335, + "loss": 1.2805, + "step": 1195 + }, + { + "epoch": 0.21336187672821336, + "grad_norm": 0.633368730545044, + "learning_rate": 0.00048608794759415333, + "loss": 1.0161, + "step": 1196 + }, + { + "epoch": 0.21354027294621353, + "grad_norm": 0.659824550151825, + "learning_rate": 0.00048606489223144744, + "loss": 1.2684, + "step": 1197 + }, + { + "epoch": 0.21371866916421373, + "grad_norm": 5.748763561248779, + "learning_rate": 0.0004860418183283066, + "loss": 1.2505, + "step": 1198 + }, + { + "epoch": 0.2138970653822139, + "grad_norm": 0.6809664368629456, + "learning_rate": 0.00048601872588654283, + "loss": 1.2108, + "step": 1199 + }, + { + "epoch": 0.21407546160021407, + "grad_norm": 0.6129047870635986, + "learning_rate": 0.00048599561490796995, + "loss": 1.0522, + "step": 1200 + }, + { + "epoch": 0.21425385781821427, + "grad_norm": 0.6308250427246094, + "learning_rate": 0.0004859724853944031, + "loss": 1.2155, + "step": 1201 + }, + { + "epoch": 0.21443225403621444, + "grad_norm": 0.5974701642990112, + "learning_rate": 0.00048594933734765866, + "loss": 1.2953, + "step": 1202 + }, + { + "epoch": 0.2146106502542146, + "grad_norm": 0.6131543517112732, + "learning_rate": 0.00048592617076955493, + "loss": 1.3646, + "step": 1203 + }, + { + "epoch": 0.21478904647221478, + "grad_norm": 0.6104893684387207, + "learning_rate": 0.00048590298566191116, + "loss": 1.629, + "step": 1204 + }, + { + "epoch": 0.21496744269021498, + "grad_norm": 0.8043147921562195, + "learning_rate": 0.0004858797820265485, + "loss": 1.2002, + "step": 1205 + }, + { + "epoch": 0.21514583890821515, + "grad_norm": 0.61087965965271, + "learning_rate": 0.0004858565598652892, + "loss": 1.3669, + "step": 1206 + }, + { + "epoch": 0.21532423512621532, + "grad_norm": 0.5664741396903992, + "learning_rate": 0.0004858333191799572, + "loss": 1.5041, + "step": 1207 + }, + { + "epoch": 0.2155026313442155, + "grad_norm": 0.6045664548873901, + "learning_rate": 0.0004858100599723778, + "loss": 1.3528, + "step": 1208 + }, + { + "epoch": 0.2156810275622157, + "grad_norm": 0.8556034564971924, + "learning_rate": 0.00048578678224437777, + "loss": 1.3253, + "step": 1209 + }, + { + "epoch": 0.21585942378021586, + "grad_norm": 0.7069472670555115, + "learning_rate": 0.0004857634859977854, + "loss": 1.3242, + "step": 1210 + }, + { + "epoch": 0.21603781999821603, + "grad_norm": 0.5635353922843933, + "learning_rate": 0.00048574017123443025, + "loss": 1.1976, + "step": 1211 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.6107478141784668, + "learning_rate": 0.00048571683795614346, + "loss": 1.2678, + "step": 1212 + }, + { + "epoch": 0.2163946124342164, + "grad_norm": 0.6950213313102722, + "learning_rate": 0.0004856934861647577, + "loss": 1.1423, + "step": 1213 + }, + { + "epoch": 0.21657300865221657, + "grad_norm": 0.7915967106819153, + "learning_rate": 0.00048567011586210697, + "loss": 1.5536, + "step": 1214 + }, + { + "epoch": 0.21675140487021674, + "grad_norm": 0.5662855505943298, + "learning_rate": 0.00048564672705002663, + "loss": 1.081, + "step": 1215 + }, + { + "epoch": 0.21692980108821694, + "grad_norm": 0.7298009991645813, + "learning_rate": 0.0004856233197303539, + "loss": 1.328, + "step": 1216 + }, + { + "epoch": 0.2171081973062171, + "grad_norm": 0.5519408583641052, + "learning_rate": 0.000485599893904927, + "loss": 1.2547, + "step": 1217 + }, + { + "epoch": 0.21728659352421728, + "grad_norm": 0.6197732090950012, + "learning_rate": 0.0004855764495755858, + "loss": 1.2905, + "step": 1218 + }, + { + "epoch": 0.21746498974221745, + "grad_norm": 0.6033538579940796, + "learning_rate": 0.00048555298674417166, + "loss": 1.1542, + "step": 1219 + }, + { + "epoch": 0.21764338596021765, + "grad_norm": 0.7289498448371887, + "learning_rate": 0.00048552950541252727, + "loss": 1.4689, + "step": 1220 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.5700170397758484, + "learning_rate": 0.0004855060055824969, + "loss": 0.9664, + "step": 1221 + }, + { + "epoch": 0.218000178396218, + "grad_norm": 0.6593589782714844, + "learning_rate": 0.00048548248725592617, + "loss": 1.2289, + "step": 1222 + }, + { + "epoch": 0.2181785746142182, + "grad_norm": 0.7251504063606262, + "learning_rate": 0.00048545895043466226, + "loss": 1.4661, + "step": 1223 + }, + { + "epoch": 0.21835697083221836, + "grad_norm": 0.6131605505943298, + "learning_rate": 0.00048543539512055367, + "loss": 1.182, + "step": 1224 + }, + { + "epoch": 0.21853536705021853, + "grad_norm": 0.6947142481803894, + "learning_rate": 0.00048541182131545054, + "loss": 1.5384, + "step": 1225 + }, + { + "epoch": 0.2187137632682187, + "grad_norm": 0.6361053586006165, + "learning_rate": 0.00048538822902120424, + "loss": 1.4648, + "step": 1226 + }, + { + "epoch": 0.2188921594862189, + "grad_norm": 0.5528272390365601, + "learning_rate": 0.0004853646182396677, + "loss": 1.1138, + "step": 1227 + }, + { + "epoch": 0.21907055570421907, + "grad_norm": 0.6252143979072571, + "learning_rate": 0.0004853409889726953, + "loss": 1.402, + "step": 1228 + }, + { + "epoch": 0.21924895192221924, + "grad_norm": 0.6612663865089417, + "learning_rate": 0.000485317341222143, + "loss": 1.3349, + "step": 1229 + }, + { + "epoch": 0.2194273481402194, + "grad_norm": 0.5366940498352051, + "learning_rate": 0.00048529367498986785, + "loss": 1.0664, + "step": 1230 + }, + { + "epoch": 0.2196057443582196, + "grad_norm": 1.6709811687469482, + "learning_rate": 0.00048526999027772883, + "loss": 1.0385, + "step": 1231 + }, + { + "epoch": 0.21978414057621978, + "grad_norm": 0.737723708152771, + "learning_rate": 0.000485246287087586, + "loss": 1.3627, + "step": 1232 + }, + { + "epoch": 0.21996253679421995, + "grad_norm": 0.6255607008934021, + "learning_rate": 0.0004852225654213009, + "loss": 1.1994, + "step": 1233 + }, + { + "epoch": 0.22014093301222015, + "grad_norm": 0.6627714037895203, + "learning_rate": 0.00048519882528073687, + "loss": 1.3014, + "step": 1234 + }, + { + "epoch": 0.22031932923022032, + "grad_norm": 0.5637165307998657, + "learning_rate": 0.00048517506666775835, + "loss": 1.1434, + "step": 1235 + }, + { + "epoch": 0.2204977254482205, + "grad_norm": 0.8759331703186035, + "learning_rate": 0.00048515128958423123, + "loss": 1.3106, + "step": 1236 + }, + { + "epoch": 0.22067612166622066, + "grad_norm": 0.5403386950492859, + "learning_rate": 0.000485127494032023, + "loss": 1.0078, + "step": 1237 + }, + { + "epoch": 0.22085451788422086, + "grad_norm": 0.628709614276886, + "learning_rate": 0.0004851036800130026, + "loss": 1.1152, + "step": 1238 + }, + { + "epoch": 0.22103291410222103, + "grad_norm": 0.7338658571243286, + "learning_rate": 0.00048507984752904035, + "loss": 1.4054, + "step": 1239 + }, + { + "epoch": 0.2212113103202212, + "grad_norm": 0.6589036583900452, + "learning_rate": 0.000485055996582008, + "loss": 1.252, + "step": 1240 + }, + { + "epoch": 0.2213897065382214, + "grad_norm": 0.672472357749939, + "learning_rate": 0.0004850321271737789, + "loss": 1.4335, + "step": 1241 + }, + { + "epoch": 0.22156810275622157, + "grad_norm": 0.6786160469055176, + "learning_rate": 0.0004850082393062276, + "loss": 1.3884, + "step": 1242 + }, + { + "epoch": 0.22174649897422175, + "grad_norm": 1.999408483505249, + "learning_rate": 0.00048498433298123036, + "loss": 1.2055, + "step": 1243 + }, + { + "epoch": 0.22192489519222192, + "grad_norm": 0.7847235202789307, + "learning_rate": 0.00048496040820066467, + "loss": 1.1009, + "step": 1244 + }, + { + "epoch": 0.22210329141022211, + "grad_norm": 0.6475226879119873, + "learning_rate": 0.0004849364649664097, + "loss": 1.3426, + "step": 1245 + }, + { + "epoch": 0.22228168762822229, + "grad_norm": 0.56676185131073, + "learning_rate": 0.0004849125032803459, + "loss": 1.0316, + "step": 1246 + }, + { + "epoch": 0.22246008384622246, + "grad_norm": 0.6990684270858765, + "learning_rate": 0.00048488852314435503, + "loss": 1.2661, + "step": 1247 + }, + { + "epoch": 0.22263848006422263, + "grad_norm": 0.6938690543174744, + "learning_rate": 0.0004848645245603208, + "loss": 1.3882, + "step": 1248 + }, + { + "epoch": 0.22281687628222283, + "grad_norm": 2.238471508026123, + "learning_rate": 0.00048484050753012784, + "loss": 1.3287, + "step": 1249 + }, + { + "epoch": 0.222995272500223, + "grad_norm": 0.8358233571052551, + "learning_rate": 0.0004848164720556624, + "loss": 1.1643, + "step": 1250 + }, + { + "epoch": 0.22317366871822317, + "grad_norm": 2.564124822616577, + "learning_rate": 0.00048479241813881237, + "loss": 1.2204, + "step": 1251 + }, + { + "epoch": 0.22335206493622337, + "grad_norm": 1.5827924013137817, + "learning_rate": 0.00048476834578146683, + "loss": 1.4587, + "step": 1252 + }, + { + "epoch": 0.22353046115422354, + "grad_norm": 1.2166929244995117, + "learning_rate": 0.00048474425498551643, + "loss": 1.6144, + "step": 1253 + }, + { + "epoch": 0.2237088573722237, + "grad_norm": 8.74429702758789, + "learning_rate": 0.00048472014575285326, + "loss": 2.0011, + "step": 1254 + }, + { + "epoch": 0.22388725359022388, + "grad_norm": 11.300551414489746, + "learning_rate": 0.0004846960180853709, + "loss": 2.7429, + "step": 1255 + }, + { + "epoch": 0.22406564980822408, + "grad_norm": 1.6926195621490479, + "learning_rate": 0.00048467187198496426, + "loss": 1.1161, + "step": 1256 + }, + { + "epoch": 0.22424404602622425, + "grad_norm": 0.9503287672996521, + "learning_rate": 0.00048464770745352984, + "loss": 1.4126, + "step": 1257 + }, + { + "epoch": 0.22442244224422442, + "grad_norm": 1.7149012088775635, + "learning_rate": 0.00048462352449296533, + "loss": 1.318, + "step": 1258 + }, + { + "epoch": 0.2246008384622246, + "grad_norm": 9.479455947875977, + "learning_rate": 0.00048459932310517017, + "loss": 1.2574, + "step": 1259 + }, + { + "epoch": 0.2247792346802248, + "grad_norm": 12.350288391113281, + "learning_rate": 0.0004845751032920452, + "loss": 1.4817, + "step": 1260 + }, + { + "epoch": 0.22495763089822496, + "grad_norm": 16.48604393005371, + "learning_rate": 0.0004845508650554926, + "loss": 1.3518, + "step": 1261 + }, + { + "epoch": 0.22513602711622513, + "grad_norm": 6.238397598266602, + "learning_rate": 0.00048452660839741593, + "loss": 0.9875, + "step": 1262 + }, + { + "epoch": 0.22531442333422533, + "grad_norm": 1.4720149040222168, + "learning_rate": 0.0004845023333197204, + "loss": 1.3612, + "step": 1263 + }, + { + "epoch": 0.2254928195522255, + "grad_norm": 2.780336856842041, + "learning_rate": 0.0004844780398243125, + "loss": 1.3331, + "step": 1264 + }, + { + "epoch": 0.22567121577022567, + "grad_norm": 1.0210349559783936, + "learning_rate": 0.0004844537279131002, + "loss": 1.5075, + "step": 1265 + }, + { + "epoch": 0.22584961198822584, + "grad_norm": 1.9614717960357666, + "learning_rate": 0.0004844293975879931, + "loss": 1.1595, + "step": 1266 + }, + { + "epoch": 0.22602800820622604, + "grad_norm": 0.7749541997909546, + "learning_rate": 0.000484405048850902, + "loss": 1.3533, + "step": 1267 + }, + { + "epoch": 0.2262064044242262, + "grad_norm": 1.7619514465332031, + "learning_rate": 0.00048438068170373916, + "loss": 1.0186, + "step": 1268 + }, + { + "epoch": 0.22638480064222638, + "grad_norm": 0.9682341814041138, + "learning_rate": 0.0004843562961484185, + "loss": 1.18, + "step": 1269 + }, + { + "epoch": 0.22656319686022655, + "grad_norm": 0.7163513898849487, + "learning_rate": 0.00048433189218685516, + "loss": 1.2096, + "step": 1270 + }, + { + "epoch": 0.22674159307822675, + "grad_norm": 0.5259701013565063, + "learning_rate": 0.0004843074698209658, + "loss": 1.1834, + "step": 1271 + }, + { + "epoch": 0.22691998929622692, + "grad_norm": 0.8285301923751831, + "learning_rate": 0.00048428302905266865, + "loss": 1.1552, + "step": 1272 + }, + { + "epoch": 0.2270983855142271, + "grad_norm": 0.6478269100189209, + "learning_rate": 0.0004842585698838832, + "loss": 1.2377, + "step": 1273 + }, + { + "epoch": 0.2272767817322273, + "grad_norm": 0.7141563296318054, + "learning_rate": 0.0004842340923165305, + "loss": 1.0266, + "step": 1274 + }, + { + "epoch": 0.22745517795022746, + "grad_norm": 1.3157190084457397, + "learning_rate": 0.000484209596352533, + "loss": 1.139, + "step": 1275 + }, + { + "epoch": 0.22763357416822763, + "grad_norm": 1.4093172550201416, + "learning_rate": 0.0004841850819938145, + "loss": 1.0584, + "step": 1276 + }, + { + "epoch": 0.2278119703862278, + "grad_norm": 0.805065929889679, + "learning_rate": 0.0004841605492423006, + "loss": 1.3242, + "step": 1277 + }, + { + "epoch": 0.227990366604228, + "grad_norm": 0.5310962796211243, + "learning_rate": 0.00048413599809991783, + "loss": 1.0605, + "step": 1278 + }, + { + "epoch": 0.22816876282222817, + "grad_norm": 0.5967015027999878, + "learning_rate": 0.00048411142856859447, + "loss": 1.1277, + "step": 1279 + }, + { + "epoch": 0.22834715904022834, + "grad_norm": 0.6595317721366882, + "learning_rate": 0.00048408684065026034, + "loss": 1.3209, + "step": 1280 + }, + { + "epoch": 0.2285255552582285, + "grad_norm": 0.583308219909668, + "learning_rate": 0.0004840622343468465, + "loss": 1.1515, + "step": 1281 + }, + { + "epoch": 0.2287039514762287, + "grad_norm": 0.9470168948173523, + "learning_rate": 0.0004840376096602854, + "loss": 1.1452, + "step": 1282 + }, + { + "epoch": 0.22888234769422888, + "grad_norm": 0.6106230616569519, + "learning_rate": 0.00048401296659251125, + "loss": 1.5261, + "step": 1283 + }, + { + "epoch": 0.22906074391222905, + "grad_norm": 0.656694233417511, + "learning_rate": 0.00048398830514545933, + "loss": 1.4598, + "step": 1284 + }, + { + "epoch": 0.22923914013022925, + "grad_norm": 0.7199791669845581, + "learning_rate": 0.0004839636253210667, + "loss": 1.0677, + "step": 1285 + }, + { + "epoch": 0.22941753634822942, + "grad_norm": 0.748662531375885, + "learning_rate": 0.0004839389271212715, + "loss": 1.1233, + "step": 1286 + }, + { + "epoch": 0.2295959325662296, + "grad_norm": 0.8158852458000183, + "learning_rate": 0.00048391421054801376, + "loss": 1.2534, + "step": 1287 + }, + { + "epoch": 0.22977432878422976, + "grad_norm": 1.9527506828308105, + "learning_rate": 0.0004838894756032345, + "loss": 1.2561, + "step": 1288 + }, + { + "epoch": 0.22995272500222996, + "grad_norm": 1.1054450273513794, + "learning_rate": 0.0004838647222888766, + "loss": 1.0608, + "step": 1289 + }, + { + "epoch": 0.23013112122023013, + "grad_norm": 0.5444337725639343, + "learning_rate": 0.0004838399506068839, + "loss": 1.193, + "step": 1290 + }, + { + "epoch": 0.2303095174382303, + "grad_norm": 0.6436626315116882, + "learning_rate": 0.0004838151605592022, + "loss": 1.4263, + "step": 1291 + }, + { + "epoch": 0.23048791365623048, + "grad_norm": 0.7222276926040649, + "learning_rate": 0.00048379035214777833, + "loss": 1.482, + "step": 1292 + }, + { + "epoch": 0.23066630987423067, + "grad_norm": 0.7238490581512451, + "learning_rate": 0.00048376552537456085, + "loss": 1.1443, + "step": 1293 + }, + { + "epoch": 0.23084470609223084, + "grad_norm": 0.9711647033691406, + "learning_rate": 0.00048374068024149966, + "loss": 1.2471, + "step": 1294 + }, + { + "epoch": 0.23102310231023102, + "grad_norm": 0.54137122631073, + "learning_rate": 0.000483715816750546, + "loss": 1.1579, + "step": 1295 + }, + { + "epoch": 0.23120149852823121, + "grad_norm": 0.8173871636390686, + "learning_rate": 0.0004836909349036527, + "loss": 1.4205, + "step": 1296 + }, + { + "epoch": 0.23137989474623138, + "grad_norm": 8.518444061279297, + "learning_rate": 0.0004836660347027738, + "loss": 1.0929, + "step": 1297 + }, + { + "epoch": 0.23155829096423156, + "grad_norm": 0.603255569934845, + "learning_rate": 0.0004836411161498652, + "loss": 1.2047, + "step": 1298 + }, + { + "epoch": 0.23173668718223173, + "grad_norm": 0.6280937790870667, + "learning_rate": 0.0004836161792468839, + "loss": 1.2314, + "step": 1299 + }, + { + "epoch": 0.23191508340023193, + "grad_norm": 0.5769376158714294, + "learning_rate": 0.00048359122399578835, + "loss": 1.077, + "step": 1300 + }, + { + "epoch": 0.2320934796182321, + "grad_norm": 0.6672999858856201, + "learning_rate": 0.00048356625039853865, + "loss": 1.1322, + "step": 1301 + }, + { + "epoch": 0.23227187583623227, + "grad_norm": 1.1087464094161987, + "learning_rate": 0.00048354125845709604, + "loss": 1.5064, + "step": 1302 + }, + { + "epoch": 0.23245027205423244, + "grad_norm": 0.6089938879013062, + "learning_rate": 0.0004835162481734237, + "loss": 0.9383, + "step": 1303 + }, + { + "epoch": 0.23262866827223264, + "grad_norm": 0.6506130695343018, + "learning_rate": 0.0004834912195494855, + "loss": 1.2445, + "step": 1304 + }, + { + "epoch": 0.2328070644902328, + "grad_norm": 0.6899453401565552, + "learning_rate": 0.0004834661725872475, + "loss": 1.2801, + "step": 1305 + }, + { + "epoch": 0.23298546070823298, + "grad_norm": 0.9193609952926636, + "learning_rate": 0.00048344110728867673, + "loss": 1.3748, + "step": 1306 + }, + { + "epoch": 0.23316385692623318, + "grad_norm": 0.905149519443512, + "learning_rate": 0.0004834160236557419, + "loss": 1.2297, + "step": 1307 + }, + { + "epoch": 0.23334225314423335, + "grad_norm": 0.5802024602890015, + "learning_rate": 0.00048339092169041306, + "loss": 1.0281, + "step": 1308 + }, + { + "epoch": 0.23352064936223352, + "grad_norm": 0.644010603427887, + "learning_rate": 0.0004833658013946616, + "loss": 1.4227, + "step": 1309 + }, + { + "epoch": 0.2336990455802337, + "grad_norm": 0.5992226600646973, + "learning_rate": 0.0004833406627704605, + "loss": 1.229, + "step": 1310 + }, + { + "epoch": 0.2338774417982339, + "grad_norm": 0.5435013771057129, + "learning_rate": 0.00048331550581978423, + "loss": 1.2623, + "step": 1311 + }, + { + "epoch": 0.23405583801623406, + "grad_norm": 0.5842602252960205, + "learning_rate": 0.0004832903305446085, + "loss": 1.3458, + "step": 1312 + }, + { + "epoch": 0.23423423423423423, + "grad_norm": 0.6166499257087708, + "learning_rate": 0.00048326513694691055, + "loss": 1.1492, + "step": 1313 + }, + { + "epoch": 0.2344126304522344, + "grad_norm": 0.586531400680542, + "learning_rate": 0.0004832399250286692, + "loss": 1.179, + "step": 1314 + }, + { + "epoch": 0.2345910266702346, + "grad_norm": 1.1618958711624146, + "learning_rate": 0.0004832146947918645, + "loss": 1.0842, + "step": 1315 + }, + { + "epoch": 0.23476942288823477, + "grad_norm": 0.5545247197151184, + "learning_rate": 0.000483189446238478, + "loss": 1.0436, + "step": 1316 + }, + { + "epoch": 0.23494781910623494, + "grad_norm": 0.5850834846496582, + "learning_rate": 0.00048316417937049275, + "loss": 1.1775, + "step": 1317 + }, + { + "epoch": 0.23512621532423514, + "grad_norm": 0.654733419418335, + "learning_rate": 0.0004831388941898932, + "loss": 1.2601, + "step": 1318 + }, + { + "epoch": 0.2353046115422353, + "grad_norm": 0.5766509175300598, + "learning_rate": 0.00048311359069866524, + "loss": 1.2645, + "step": 1319 + }, + { + "epoch": 0.23548300776023548, + "grad_norm": 0.577642560005188, + "learning_rate": 0.00048308826889879614, + "loss": 1.1026, + "step": 1320 + }, + { + "epoch": 0.23566140397823565, + "grad_norm": 0.6444463729858398, + "learning_rate": 0.0004830629287922748, + "loss": 1.2969, + "step": 1321 + }, + { + "epoch": 0.23583980019623585, + "grad_norm": 0.7246226072311401, + "learning_rate": 0.00048303757038109117, + "loss": 1.1957, + "step": 1322 + }, + { + "epoch": 0.23601819641423602, + "grad_norm": 0.7728081345558167, + "learning_rate": 0.00048301219366723714, + "loss": 1.1193, + "step": 1323 + }, + { + "epoch": 0.2361965926322362, + "grad_norm": 0.6015149354934692, + "learning_rate": 0.00048298679865270574, + "loss": 1.3462, + "step": 1324 + }, + { + "epoch": 0.23637498885023636, + "grad_norm": 0.5546750426292419, + "learning_rate": 0.0004829613853394914, + "loss": 1.2633, + "step": 1325 + }, + { + "epoch": 0.23655338506823656, + "grad_norm": 0.7235816717147827, + "learning_rate": 0.0004829359537295901, + "loss": 1.5118, + "step": 1326 + }, + { + "epoch": 0.23673178128623673, + "grad_norm": 0.6748101711273193, + "learning_rate": 0.0004829105038249992, + "loss": 1.1664, + "step": 1327 + }, + { + "epoch": 0.2369101775042369, + "grad_norm": 0.7603464722633362, + "learning_rate": 0.0004828850356277176, + "loss": 1.2698, + "step": 1328 + }, + { + "epoch": 0.2370885737222371, + "grad_norm": 0.6377245187759399, + "learning_rate": 0.0004828595491397455, + "loss": 0.9762, + "step": 1329 + }, + { + "epoch": 0.23726696994023727, + "grad_norm": 0.6112083792686462, + "learning_rate": 0.0004828340443630846, + "loss": 0.9882, + "step": 1330 + }, + { + "epoch": 0.23744536615823744, + "grad_norm": 0.5406150817871094, + "learning_rate": 0.00048280852129973807, + "loss": 0.9828, + "step": 1331 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.592891275882721, + "learning_rate": 0.0004827829799517105, + "loss": 1.0252, + "step": 1332 + }, + { + "epoch": 0.2378021585942378, + "grad_norm": 0.564150869846344, + "learning_rate": 0.0004827574203210078, + "loss": 1.2145, + "step": 1333 + }, + { + "epoch": 0.23798055481223798, + "grad_norm": 0.8308655619621277, + "learning_rate": 0.0004827318424096375, + "loss": 0.9988, + "step": 1334 + }, + { + "epoch": 0.23815895103023815, + "grad_norm": 0.6468952894210815, + "learning_rate": 0.00048270624621960846, + "loss": 1.0637, + "step": 1335 + }, + { + "epoch": 0.23833734724823832, + "grad_norm": 0.5933250784873962, + "learning_rate": 0.000482680631752931, + "loss": 1.2847, + "step": 1336 + }, + { + "epoch": 0.23851574346623852, + "grad_norm": 0.8203790187835693, + "learning_rate": 0.0004826549990116168, + "loss": 1.3192, + "step": 1337 + }, + { + "epoch": 0.2386941396842387, + "grad_norm": 0.8273621201515198, + "learning_rate": 0.0004826293479976791, + "loss": 1.2088, + "step": 1338 + }, + { + "epoch": 0.23887253590223886, + "grad_norm": 0.6572198271751404, + "learning_rate": 0.0004826036787131326, + "loss": 1.3699, + "step": 1339 + }, + { + "epoch": 0.23905093212023906, + "grad_norm": 0.6077407002449036, + "learning_rate": 0.0004825779911599932, + "loss": 1.2602, + "step": 1340 + }, + { + "epoch": 0.23922932833823923, + "grad_norm": 0.58214271068573, + "learning_rate": 0.00048255228534027845, + "loss": 0.9227, + "step": 1341 + }, + { + "epoch": 0.2394077245562394, + "grad_norm": 0.5967569947242737, + "learning_rate": 0.0004825265612560073, + "loss": 1.1136, + "step": 1342 + }, + { + "epoch": 0.23958612077423957, + "grad_norm": 0.5940139889717102, + "learning_rate": 0.0004825008189092001, + "loss": 1.1137, + "step": 1343 + }, + { + "epoch": 0.23976451699223977, + "grad_norm": 0.5328962802886963, + "learning_rate": 0.00048247505830187863, + "loss": 1.0354, + "step": 1344 + }, + { + "epoch": 0.23994291321023994, + "grad_norm": 0.548477828502655, + "learning_rate": 0.00048244927943606617, + "loss": 0.9724, + "step": 1345 + }, + { + "epoch": 0.24012130942824012, + "grad_norm": 0.6590238809585571, + "learning_rate": 0.0004824234823137873, + "loss": 1.2702, + "step": 1346 + }, + { + "epoch": 0.24029970564624029, + "grad_norm": 0.5709405541419983, + "learning_rate": 0.0004823976669370681, + "loss": 1.3085, + "step": 1347 + }, + { + "epoch": 0.24047810186424048, + "grad_norm": 0.6574723720550537, + "learning_rate": 0.0004823718333079362, + "loss": 1.5324, + "step": 1348 + }, + { + "epoch": 0.24065649808224066, + "grad_norm": 0.5990821123123169, + "learning_rate": 0.00048234598142842043, + "loss": 1.1941, + "step": 1349 + }, + { + "epoch": 0.24083489430024083, + "grad_norm": 0.6324416995048523, + "learning_rate": 0.0004823201113005514, + "loss": 1.3861, + "step": 1350 + }, + { + "epoch": 0.24101329051824102, + "grad_norm": 0.5532766580581665, + "learning_rate": 0.0004822942229263607, + "loss": 1.1808, + "step": 1351 + }, + { + "epoch": 0.2411916867362412, + "grad_norm": 0.5371944904327393, + "learning_rate": 0.00048226831630788174, + "loss": 0.9721, + "step": 1352 + }, + { + "epoch": 0.24137008295424137, + "grad_norm": 0.6007786393165588, + "learning_rate": 0.0004822423914471492, + "loss": 1.2506, + "step": 1353 + }, + { + "epoch": 0.24154847917224154, + "grad_norm": 0.5308194756507874, + "learning_rate": 0.0004822164483461991, + "loss": 1.1275, + "step": 1354 + }, + { + "epoch": 0.24172687539024174, + "grad_norm": 0.5771823525428772, + "learning_rate": 0.00048219048700706913, + "loss": 1.138, + "step": 1355 + }, + { + "epoch": 0.2419052716082419, + "grad_norm": 0.6079913377761841, + "learning_rate": 0.00048216450743179817, + "loss": 1.4897, + "step": 1356 + }, + { + "epoch": 0.24208366782624208, + "grad_norm": 0.5161705613136292, + "learning_rate": 0.0004821385096224268, + "loss": 1.0735, + "step": 1357 + }, + { + "epoch": 0.24226206404424228, + "grad_norm": 0.6866333484649658, + "learning_rate": 0.00048211249358099675, + "loss": 1.316, + "step": 1358 + }, + { + "epoch": 0.24244046026224245, + "grad_norm": 0.5263490080833435, + "learning_rate": 0.0004820864593095513, + "loss": 1.1486, + "step": 1359 + }, + { + "epoch": 0.24261885648024262, + "grad_norm": 0.5457320213317871, + "learning_rate": 0.0004820604068101352, + "loss": 1.076, + "step": 1360 + }, + { + "epoch": 0.2427972526982428, + "grad_norm": 0.5485474467277527, + "learning_rate": 0.00048203433608479465, + "loss": 1.4718, + "step": 1361 + }, + { + "epoch": 0.242975648916243, + "grad_norm": 0.5592158436775208, + "learning_rate": 0.0004820082471355772, + "loss": 1.123, + "step": 1362 + }, + { + "epoch": 0.24315404513424316, + "grad_norm": 0.6344873905181885, + "learning_rate": 0.0004819821399645319, + "loss": 1.2592, + "step": 1363 + }, + { + "epoch": 0.24333244135224333, + "grad_norm": 0.5310791730880737, + "learning_rate": 0.00048195601457370907, + "loss": 1.1477, + "step": 1364 + }, + { + "epoch": 0.2435108375702435, + "grad_norm": 0.5795305371284485, + "learning_rate": 0.0004819298709651607, + "loss": 1.3579, + "step": 1365 + }, + { + "epoch": 0.2436892337882437, + "grad_norm": 0.6433924436569214, + "learning_rate": 0.0004819037091409401, + "loss": 1.3058, + "step": 1366 + }, + { + "epoch": 0.24386763000624387, + "grad_norm": 0.6262935996055603, + "learning_rate": 0.00048187752910310196, + "loss": 1.4104, + "step": 1367 + }, + { + "epoch": 0.24404602622424404, + "grad_norm": 0.6437758207321167, + "learning_rate": 0.0004818513308537025, + "loss": 1.4279, + "step": 1368 + }, + { + "epoch": 0.24422442244224424, + "grad_norm": 0.5018840432167053, + "learning_rate": 0.00048182511439479926, + "loss": 1.0561, + "step": 1369 + }, + { + "epoch": 0.2444028186602444, + "grad_norm": 0.5912278890609741, + "learning_rate": 0.00048179887972845124, + "loss": 1.2844, + "step": 1370 + }, + { + "epoch": 0.24458121487824458, + "grad_norm": 0.5428308844566345, + "learning_rate": 0.0004817726268567191, + "loss": 1.1105, + "step": 1371 + }, + { + "epoch": 0.24475961109624475, + "grad_norm": 0.5821102857589722, + "learning_rate": 0.0004817463557816644, + "loss": 1.3421, + "step": 1372 + }, + { + "epoch": 0.24493800731424495, + "grad_norm": 0.5811814665794373, + "learning_rate": 0.0004817200665053508, + "loss": 1.1854, + "step": 1373 + }, + { + "epoch": 0.24511640353224512, + "grad_norm": 0.5271221399307251, + "learning_rate": 0.00048169375902984283, + "loss": 1.1517, + "step": 1374 + }, + { + "epoch": 0.2452947997502453, + "grad_norm": 0.592476487159729, + "learning_rate": 0.00048166743335720675, + "loss": 1.459, + "step": 1375 + }, + { + "epoch": 0.24547319596824546, + "grad_norm": 0.5430518984794617, + "learning_rate": 0.00048164108948951014, + "loss": 1.0736, + "step": 1376 + }, + { + "epoch": 0.24565159218624566, + "grad_norm": 0.601703405380249, + "learning_rate": 0.00048161472742882204, + "loss": 1.358, + "step": 1377 + }, + { + "epoch": 0.24582998840424583, + "grad_norm": 0.7517264485359192, + "learning_rate": 0.0004815883471772129, + "loss": 1.1916, + "step": 1378 + }, + { + "epoch": 0.246008384622246, + "grad_norm": 0.6537923216819763, + "learning_rate": 0.00048156194873675466, + "loss": 1.3555, + "step": 1379 + }, + { + "epoch": 0.2461867808402462, + "grad_norm": 0.5317115187644958, + "learning_rate": 0.0004815355321095206, + "loss": 1.1173, + "step": 1380 + }, + { + "epoch": 0.24636517705824637, + "grad_norm": 0.5854525566101074, + "learning_rate": 0.00048150909729758554, + "loss": 1.2466, + "step": 1381 + }, + { + "epoch": 0.24654357327624654, + "grad_norm": 0.5315707325935364, + "learning_rate": 0.0004814826443030256, + "loss": 1.0387, + "step": 1382 + }, + { + "epoch": 0.2467219694942467, + "grad_norm": 0.5110334157943726, + "learning_rate": 0.00048145617312791837, + "loss": 1.0514, + "step": 1383 + }, + { + "epoch": 0.2469003657122469, + "grad_norm": 0.5571600198745728, + "learning_rate": 0.0004814296837743428, + "loss": 1.1521, + "step": 1384 + }, + { + "epoch": 0.24707876193024708, + "grad_norm": 0.5861849188804626, + "learning_rate": 0.0004814031762443796, + "loss": 1.26, + "step": 1385 + }, + { + "epoch": 0.24725715814824725, + "grad_norm": 0.5479332208633423, + "learning_rate": 0.00048137665054011044, + "loss": 1.1034, + "step": 1386 + }, + { + "epoch": 0.24743555436624742, + "grad_norm": 0.5551263689994812, + "learning_rate": 0.0004813501066636187, + "loss": 1.0253, + "step": 1387 + }, + { + "epoch": 0.24761395058424762, + "grad_norm": 0.6580809354782104, + "learning_rate": 0.00048132354461698923, + "loss": 1.2498, + "step": 1388 + }, + { + "epoch": 0.2477923468022478, + "grad_norm": 0.5350015759468079, + "learning_rate": 0.0004812969644023081, + "loss": 1.1355, + "step": 1389 + }, + { + "epoch": 0.24797074302024796, + "grad_norm": 0.5560418367385864, + "learning_rate": 0.00048127036602166285, + "loss": 1.0944, + "step": 1390 + }, + { + "epoch": 0.24814913923824816, + "grad_norm": 0.5966114401817322, + "learning_rate": 0.00048124374947714263, + "loss": 1.0267, + "step": 1391 + }, + { + "epoch": 0.24832753545624833, + "grad_norm": 0.5361310243606567, + "learning_rate": 0.0004812171147708378, + "loss": 1.1974, + "step": 1392 + }, + { + "epoch": 0.2485059316742485, + "grad_norm": 0.5971125960350037, + "learning_rate": 0.00048119046190484027, + "loss": 1.2728, + "step": 1393 + }, + { + "epoch": 0.24868432789224867, + "grad_norm": 0.5017391443252563, + "learning_rate": 0.0004811637908812434, + "loss": 1.0279, + "step": 1394 + }, + { + "epoch": 0.24886272411024887, + "grad_norm": 0.5720884203910828, + "learning_rate": 0.00048113710170214185, + "loss": 1.3989, + "step": 1395 + }, + { + "epoch": 0.24904112032824904, + "grad_norm": 0.5782856345176697, + "learning_rate": 0.00048111039436963177, + "loss": 1.2528, + "step": 1396 + }, + { + "epoch": 0.24921951654624921, + "grad_norm": 0.5723220109939575, + "learning_rate": 0.00048108366888581077, + "loss": 1.0068, + "step": 1397 + }, + { + "epoch": 0.24939791276424939, + "grad_norm": 0.5289225578308105, + "learning_rate": 0.00048105692525277793, + "loss": 0.9357, + "step": 1398 + }, + { + "epoch": 0.24957630898224958, + "grad_norm": 0.5223532319068909, + "learning_rate": 0.00048103016347263356, + "loss": 1.0046, + "step": 1399 + }, + { + "epoch": 0.24975470520024975, + "grad_norm": 0.6456426978111267, + "learning_rate": 0.0004810033835474796, + "loss": 1.5626, + "step": 1400 + }, + { + "epoch": 0.24993310141824993, + "grad_norm": 0.5884763598442078, + "learning_rate": 0.00048097658547941927, + "loss": 1.4618, + "step": 1401 + }, + { + "epoch": 0.2501114976362501, + "grad_norm": 0.5833611488342285, + "learning_rate": 0.00048094976927055735, + "loss": 1.396, + "step": 1402 + }, + { + "epoch": 0.25028989385425027, + "grad_norm": 0.5198600888252258, + "learning_rate": 0.00048092293492299986, + "loss": 1.1263, + "step": 1403 + }, + { + "epoch": 0.25046829007225047, + "grad_norm": 0.5795050859451294, + "learning_rate": 0.00048089608243885454, + "loss": 1.2508, + "step": 1404 + }, + { + "epoch": 0.25064668629025066, + "grad_norm": 0.7242421507835388, + "learning_rate": 0.0004808692118202302, + "loss": 1.2361, + "step": 1405 + }, + { + "epoch": 0.2508250825082508, + "grad_norm": 0.6138631105422974, + "learning_rate": 0.00048084232306923737, + "loss": 1.3318, + "step": 1406 + }, + { + "epoch": 0.251003478726251, + "grad_norm": 0.5212423801422119, + "learning_rate": 0.0004808154161879877, + "loss": 1.2418, + "step": 1407 + }, + { + "epoch": 0.2511818749442512, + "grad_norm": 0.5970634818077087, + "learning_rate": 0.0004807884911785947, + "loss": 1.1968, + "step": 1408 + }, + { + "epoch": 0.25136027116225135, + "grad_norm": 0.6800307035446167, + "learning_rate": 0.0004807615480431729, + "loss": 1.2915, + "step": 1409 + }, + { + "epoch": 0.25153866738025155, + "grad_norm": 0.5408998727798462, + "learning_rate": 0.0004807345867838384, + "loss": 0.9737, + "step": 1410 + }, + { + "epoch": 0.25171706359825174, + "grad_norm": 0.6598271727561951, + "learning_rate": 0.00048070760740270873, + "loss": 1.2425, + "step": 1411 + }, + { + "epoch": 0.2518954598162519, + "grad_norm": 0.6781688332557678, + "learning_rate": 0.0004806806099019029, + "loss": 1.0545, + "step": 1412 + }, + { + "epoch": 0.2520738560342521, + "grad_norm": 0.6171199679374695, + "learning_rate": 0.00048065359428354115, + "loss": 1.1862, + "step": 1413 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 0.5618746876716614, + "learning_rate": 0.00048062656054974546, + "loss": 1.3709, + "step": 1414 + }, + { + "epoch": 0.25243064847025243, + "grad_norm": 2.3955955505371094, + "learning_rate": 0.0004805995087026389, + "loss": 1.2006, + "step": 1415 + }, + { + "epoch": 0.2526090446882526, + "grad_norm": 0.6166795492172241, + "learning_rate": 0.0004805724387443462, + "loss": 1.3305, + "step": 1416 + }, + { + "epoch": 0.25278744090625277, + "grad_norm": 0.5519673228263855, + "learning_rate": 0.00048054535067699333, + "loss": 1.1071, + "step": 1417 + }, + { + "epoch": 0.25296583712425297, + "grad_norm": 0.6297189593315125, + "learning_rate": 0.0004805182445027079, + "loss": 1.4462, + "step": 1418 + }, + { + "epoch": 0.25314423334225317, + "grad_norm": 0.5049257278442383, + "learning_rate": 0.0004804911202236187, + "loss": 0.9542, + "step": 1419 + }, + { + "epoch": 0.2533226295602533, + "grad_norm": 0.5491045713424683, + "learning_rate": 0.0004804639778418561, + "loss": 1.1949, + "step": 1420 + }, + { + "epoch": 0.2535010257782535, + "grad_norm": 0.6792704463005066, + "learning_rate": 0.00048043681735955183, + "loss": 1.4088, + "step": 1421 + }, + { + "epoch": 0.2536794219962537, + "grad_norm": 0.543935239315033, + "learning_rate": 0.0004804096387788391, + "loss": 1.3297, + "step": 1422 + }, + { + "epoch": 0.25385781821425385, + "grad_norm": 0.6691182851791382, + "learning_rate": 0.00048038244210185253, + "loss": 1.1259, + "step": 1423 + }, + { + "epoch": 0.25403621443225405, + "grad_norm": 0.6011500954627991, + "learning_rate": 0.0004803552273307281, + "loss": 1.2238, + "step": 1424 + }, + { + "epoch": 0.2542146106502542, + "grad_norm": 0.667797863483429, + "learning_rate": 0.0004803279944676032, + "loss": 1.206, + "step": 1425 + }, + { + "epoch": 0.2543930068682544, + "grad_norm": 0.676861047744751, + "learning_rate": 0.0004803007435146168, + "loss": 1.0519, + "step": 1426 + }, + { + "epoch": 0.2545714030862546, + "grad_norm": 0.5236627459526062, + "learning_rate": 0.00048027347447390914, + "loss": 1.0689, + "step": 1427 + }, + { + "epoch": 0.25474979930425473, + "grad_norm": 0.6431361436843872, + "learning_rate": 0.00048024618734762183, + "loss": 1.0586, + "step": 1428 + }, + { + "epoch": 0.25492819552225493, + "grad_norm": 0.6550641655921936, + "learning_rate": 0.00048021888213789797, + "loss": 1.2463, + "step": 1429 + }, + { + "epoch": 0.25510659174025513, + "grad_norm": 0.5982055068016052, + "learning_rate": 0.0004801915588468823, + "loss": 1.1063, + "step": 1430 + }, + { + "epoch": 0.25528498795825527, + "grad_norm": 0.5872926115989685, + "learning_rate": 0.00048016421747672054, + "loss": 1.2272, + "step": 1431 + }, + { + "epoch": 0.25546338417625547, + "grad_norm": 0.6466538310050964, + "learning_rate": 0.0004801368580295603, + "loss": 1.1145, + "step": 1432 + }, + { + "epoch": 0.25564178039425567, + "grad_norm": 0.5384798645973206, + "learning_rate": 0.0004801094805075502, + "loss": 1.1214, + "step": 1433 + }, + { + "epoch": 0.2558201766122558, + "grad_norm": 0.5813401937484741, + "learning_rate": 0.00048008208491284054, + "loss": 1.2142, + "step": 1434 + }, + { + "epoch": 0.255998572830256, + "grad_norm": 0.5265066623687744, + "learning_rate": 0.00048005467124758296, + "loss": 1.0926, + "step": 1435 + }, + { + "epoch": 0.25617696904825615, + "grad_norm": 0.5532288551330566, + "learning_rate": 0.0004800272395139305, + "loss": 1.2191, + "step": 1436 + }, + { + "epoch": 0.25635536526625635, + "grad_norm": 0.5451021194458008, + "learning_rate": 0.00047999978971403765, + "loss": 1.1107, + "step": 1437 + }, + { + "epoch": 0.25653376148425655, + "grad_norm": 0.5395956039428711, + "learning_rate": 0.0004799723218500602, + "loss": 1.2165, + "step": 1438 + }, + { + "epoch": 0.2567121577022567, + "grad_norm": 0.5500022172927856, + "learning_rate": 0.00047994483592415574, + "loss": 1.3835, + "step": 1439 + }, + { + "epoch": 0.2568905539202569, + "grad_norm": 0.5290274024009705, + "learning_rate": 0.0004799173319384826, + "loss": 0.966, + "step": 1440 + }, + { + "epoch": 0.2570689501382571, + "grad_norm": 0.6000217795372009, + "learning_rate": 0.0004798898098952013, + "loss": 1.3913, + "step": 1441 + }, + { + "epoch": 0.25724734635625723, + "grad_norm": 0.5701634883880615, + "learning_rate": 0.0004798622697964732, + "loss": 1.0587, + "step": 1442 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.5983652472496033, + "learning_rate": 0.00047983471164446135, + "loss": 1.2844, + "step": 1443 + }, + { + "epoch": 0.25760413879225763, + "grad_norm": 0.5445163249969482, + "learning_rate": 0.0004798071354413302, + "loss": 1.1172, + "step": 1444 + }, + { + "epoch": 0.2577825350102578, + "grad_norm": 0.5581110715866089, + "learning_rate": 0.0004797795411892455, + "loss": 1.1949, + "step": 1445 + }, + { + "epoch": 0.257960931228258, + "grad_norm": 0.585059642791748, + "learning_rate": 0.0004797519288903745, + "loss": 1.187, + "step": 1446 + }, + { + "epoch": 0.2581393274462581, + "grad_norm": 0.6527815461158752, + "learning_rate": 0.00047972429854688595, + "loss": 1.1534, + "step": 1447 + }, + { + "epoch": 0.2583177236642583, + "grad_norm": 0.7034025192260742, + "learning_rate": 0.00047969665016094976, + "loss": 1.0801, + "step": 1448 + }, + { + "epoch": 0.2584961198822585, + "grad_norm": 1.5441768169403076, + "learning_rate": 0.00047966898373473754, + "loss": 1.3137, + "step": 1449 + }, + { + "epoch": 0.25867451610025866, + "grad_norm": 0.5990326404571533, + "learning_rate": 0.0004796412992704221, + "loss": 1.3426, + "step": 1450 + }, + { + "epoch": 0.25885291231825885, + "grad_norm": 0.5593619346618652, + "learning_rate": 0.0004796135967701779, + "loss": 1.2633, + "step": 1451 + }, + { + "epoch": 0.25903130853625905, + "grad_norm": 0.5932374000549316, + "learning_rate": 0.00047958587623618066, + "loss": 1.111, + "step": 1452 + }, + { + "epoch": 0.2592097047542592, + "grad_norm": 0.9718675017356873, + "learning_rate": 0.0004795581376706075, + "loss": 0.9386, + "step": 1453 + }, + { + "epoch": 0.2593881009722594, + "grad_norm": 0.5909976363182068, + "learning_rate": 0.00047953038107563696, + "loss": 1.0972, + "step": 1454 + }, + { + "epoch": 0.2595664971902596, + "grad_norm": 0.550532877445221, + "learning_rate": 0.000479502606453449, + "loss": 0.9393, + "step": 1455 + }, + { + "epoch": 0.25974489340825974, + "grad_norm": 0.5949730277061462, + "learning_rate": 0.00047947481380622523, + "loss": 1.1803, + "step": 1456 + }, + { + "epoch": 0.25992328962625993, + "grad_norm": 4.032388210296631, + "learning_rate": 0.0004794470031361482, + "loss": 1.5666, + "step": 1457 + }, + { + "epoch": 0.2601016858442601, + "grad_norm": 0.5784013271331787, + "learning_rate": 0.0004794191744454024, + "loss": 1.3467, + "step": 1458 + }, + { + "epoch": 0.2602800820622603, + "grad_norm": 0.6242582201957703, + "learning_rate": 0.00047939132773617334, + "loss": 1.1208, + "step": 1459 + }, + { + "epoch": 0.2604584782802605, + "grad_norm": 0.7698078155517578, + "learning_rate": 0.00047936346301064806, + "loss": 1.0162, + "step": 1460 + }, + { + "epoch": 0.2606368744982606, + "grad_norm": 0.8937914967536926, + "learning_rate": 0.0004793355802710151, + "loss": 1.3314, + "step": 1461 + }, + { + "epoch": 0.2608152707162608, + "grad_norm": 0.8126354217529297, + "learning_rate": 0.0004793076795194644, + "loss": 1.1186, + "step": 1462 + }, + { + "epoch": 0.260993666934261, + "grad_norm": 0.6322470307350159, + "learning_rate": 0.0004792797607581872, + "loss": 1.1986, + "step": 1463 + }, + { + "epoch": 0.26117206315226116, + "grad_norm": 0.6327349543571472, + "learning_rate": 0.0004792518239893763, + "loss": 1.1737, + "step": 1464 + }, + { + "epoch": 0.26135045937026136, + "grad_norm": 0.5458834767341614, + "learning_rate": 0.00047922386921522576, + "loss": 1.0218, + "step": 1465 + }, + { + "epoch": 0.26152885558826156, + "grad_norm": 0.685711681842804, + "learning_rate": 0.0004791958964379312, + "loss": 1.2088, + "step": 1466 + }, + { + "epoch": 0.2617072518062617, + "grad_norm": 0.7403172850608826, + "learning_rate": 0.0004791679056596895, + "loss": 1.4212, + "step": 1467 + }, + { + "epoch": 0.2618856480242619, + "grad_norm": 0.6377992033958435, + "learning_rate": 0.0004791398968826991, + "loss": 1.2633, + "step": 1468 + }, + { + "epoch": 0.26206404424226204, + "grad_norm": 0.6038281321525574, + "learning_rate": 0.0004791118701091599, + "loss": 1.2117, + "step": 1469 + }, + { + "epoch": 0.26224244046026224, + "grad_norm": 2.225592613220215, + "learning_rate": 0.0004790838253412729, + "loss": 1.2892, + "step": 1470 + }, + { + "epoch": 0.26242083667826244, + "grad_norm": 0.5641833543777466, + "learning_rate": 0.0004790557625812409, + "loss": 1.1663, + "step": 1471 + }, + { + "epoch": 0.2625992328962626, + "grad_norm": 0.5233538746833801, + "learning_rate": 0.00047902768183126797, + "loss": 1.1302, + "step": 1472 + }, + { + "epoch": 0.2627776291142628, + "grad_norm": 0.5378060340881348, + "learning_rate": 0.0004789995830935594, + "loss": 1.0252, + "step": 1473 + }, + { + "epoch": 0.262956025332263, + "grad_norm": 0.5588990449905396, + "learning_rate": 0.0004789714663703221, + "loss": 1.2761, + "step": 1474 + }, + { + "epoch": 0.2631344215502631, + "grad_norm": 0.5507195591926575, + "learning_rate": 0.00047894333166376434, + "loss": 1.2709, + "step": 1475 + }, + { + "epoch": 0.2633128177682633, + "grad_norm": 0.6437628865242004, + "learning_rate": 0.00047891517897609594, + "loss": 1.1084, + "step": 1476 + }, + { + "epoch": 0.2634912139862635, + "grad_norm": 0.5252813100814819, + "learning_rate": 0.0004788870083095278, + "loss": 1.1753, + "step": 1477 + }, + { + "epoch": 0.26366961020426366, + "grad_norm": 0.721181333065033, + "learning_rate": 0.00047885881966627255, + "loss": 1.3439, + "step": 1478 + }, + { + "epoch": 0.26384800642226386, + "grad_norm": 0.6929256916046143, + "learning_rate": 0.000478830613048544, + "loss": 1.15, + "step": 1479 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.5794677734375, + "learning_rate": 0.0004788023884585577, + "loss": 0.991, + "step": 1480 + }, + { + "epoch": 0.2642047988582642, + "grad_norm": 0.5833317041397095, + "learning_rate": 0.00047877414589853024, + "loss": 1.23, + "step": 1481 + }, + { + "epoch": 0.2643831950762644, + "grad_norm": 0.5699701309204102, + "learning_rate": 0.0004787458853706798, + "loss": 1.0603, + "step": 1482 + }, + { + "epoch": 0.26456159129426454, + "grad_norm": 0.6602557897567749, + "learning_rate": 0.00047871760687722597, + "loss": 1.2557, + "step": 1483 + }, + { + "epoch": 0.26473998751226474, + "grad_norm": 0.5460817813873291, + "learning_rate": 0.0004786893104203897, + "loss": 1.0461, + "step": 1484 + }, + { + "epoch": 0.26491838373026494, + "grad_norm": 0.5744298100471497, + "learning_rate": 0.0004786609960023934, + "loss": 1.3121, + "step": 1485 + }, + { + "epoch": 0.2650967799482651, + "grad_norm": 0.6171600222587585, + "learning_rate": 0.00047863266362546095, + "loss": 1.264, + "step": 1486 + }, + { + "epoch": 0.2652751761662653, + "grad_norm": 0.5980626344680786, + "learning_rate": 0.00047860431329181744, + "loss": 1.4093, + "step": 1487 + }, + { + "epoch": 0.2654535723842655, + "grad_norm": 0.674322247505188, + "learning_rate": 0.0004785759450036895, + "loss": 1.2626, + "step": 1488 + }, + { + "epoch": 0.2656319686022656, + "grad_norm": 0.7270940542221069, + "learning_rate": 0.0004785475587633052, + "loss": 1.0291, + "step": 1489 + }, + { + "epoch": 0.2658103648202658, + "grad_norm": 0.5281042456626892, + "learning_rate": 0.00047851915457289404, + "loss": 1.0533, + "step": 1490 + }, + { + "epoch": 0.26598876103826596, + "grad_norm": 0.498197466135025, + "learning_rate": 0.0004784907324346868, + "loss": 0.8067, + "step": 1491 + }, + { + "epoch": 0.26616715725626616, + "grad_norm": 0.583159863948822, + "learning_rate": 0.00047846229235091575, + "loss": 0.9819, + "step": 1492 + }, + { + "epoch": 0.26634555347426636, + "grad_norm": 0.5671996474266052, + "learning_rate": 0.0004784338343238146, + "loss": 1.2338, + "step": 1493 + }, + { + "epoch": 0.2665239496922665, + "grad_norm": 0.5304687023162842, + "learning_rate": 0.0004784053583556184, + "loss": 1.2323, + "step": 1494 + }, + { + "epoch": 0.2667023459102667, + "grad_norm": 0.8755894303321838, + "learning_rate": 0.0004783768644485636, + "loss": 1.0927, + "step": 1495 + }, + { + "epoch": 0.2668807421282669, + "grad_norm": 0.5539624691009521, + "learning_rate": 0.0004783483526048882, + "loss": 1.1342, + "step": 1496 + }, + { + "epoch": 0.26705913834626704, + "grad_norm": 0.5056388974189758, + "learning_rate": 0.0004783198228268314, + "loss": 1.0296, + "step": 1497 + }, + { + "epoch": 0.26723753456426724, + "grad_norm": 0.5661635994911194, + "learning_rate": 0.00047829127511663395, + "loss": 1.0945, + "step": 1498 + }, + { + "epoch": 0.26741593078226744, + "grad_norm": 0.5565497875213623, + "learning_rate": 0.00047826270947653803, + "loss": 1.0299, + "step": 1499 + }, + { + "epoch": 0.2675943270002676, + "grad_norm": 0.5293715596199036, + "learning_rate": 0.0004782341259087872, + "loss": 1.1213, + "step": 1500 + }, + { + "epoch": 0.2677727232182678, + "grad_norm": 0.6879062056541443, + "learning_rate": 0.00047820552441562626, + "loss": 1.1906, + "step": 1501 + }, + { + "epoch": 0.2679511194362679, + "grad_norm": 0.6181793212890625, + "learning_rate": 0.00047817690499930165, + "loss": 1.2696, + "step": 1502 + }, + { + "epoch": 0.2681295156542681, + "grad_norm": 0.8244830369949341, + "learning_rate": 0.00047814826766206115, + "loss": 1.1163, + "step": 1503 + }, + { + "epoch": 0.2683079118722683, + "grad_norm": 0.6630935072898865, + "learning_rate": 0.0004781196124061539, + "loss": 1.2271, + "step": 1504 + }, + { + "epoch": 0.26848630809026847, + "grad_norm": 0.6512210369110107, + "learning_rate": 0.0004780909392338304, + "loss": 1.3788, + "step": 1505 + }, + { + "epoch": 0.26866470430826866, + "grad_norm": 0.900384247303009, + "learning_rate": 0.00047806224814734275, + "loss": 1.1045, + "step": 1506 + }, + { + "epoch": 0.26884310052626886, + "grad_norm": 1.1172149181365967, + "learning_rate": 0.0004780335391489442, + "loss": 1.116, + "step": 1507 + }, + { + "epoch": 0.269021496744269, + "grad_norm": 0.6222420930862427, + "learning_rate": 0.00047800481224088965, + "loss": 1.0094, + "step": 1508 + }, + { + "epoch": 0.2691998929622692, + "grad_norm": 0.7224119901657104, + "learning_rate": 0.00047797606742543526, + "loss": 1.0397, + "step": 1509 + }, + { + "epoch": 0.2693782891802694, + "grad_norm": 0.5364158749580383, + "learning_rate": 0.00047794730470483867, + "loss": 1.0348, + "step": 1510 + }, + { + "epoch": 0.26955668539826955, + "grad_norm": 0.5481920838356018, + "learning_rate": 0.00047791852408135885, + "loss": 0.8935, + "step": 1511 + }, + { + "epoch": 0.26973508161626975, + "grad_norm": 4.288261890411377, + "learning_rate": 0.0004778897255572562, + "loss": 0.963, + "step": 1512 + }, + { + "epoch": 0.2699134778342699, + "grad_norm": 0.6022629141807556, + "learning_rate": 0.00047786090913479255, + "loss": 1.0822, + "step": 1513 + }, + { + "epoch": 0.2700918740522701, + "grad_norm": 0.609437882900238, + "learning_rate": 0.00047783207481623126, + "loss": 0.8158, + "step": 1514 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.5600314736366272, + "learning_rate": 0.00047780322260383674, + "loss": 1.1993, + "step": 1515 + }, + { + "epoch": 0.27044866648827043, + "grad_norm": 0.6165355443954468, + "learning_rate": 0.00047777435249987525, + "loss": 1.1175, + "step": 1516 + }, + { + "epoch": 0.2706270627062706, + "grad_norm": 0.5497164726257324, + "learning_rate": 0.00047774546450661407, + "loss": 0.9579, + "step": 1517 + }, + { + "epoch": 0.2708054589242708, + "grad_norm": 0.5620499849319458, + "learning_rate": 0.0004777165586263221, + "loss": 1.162, + "step": 1518 + }, + { + "epoch": 0.27098385514227097, + "grad_norm": 0.6000719666481018, + "learning_rate": 0.00047768763486126964, + "loss": 1.0962, + "step": 1519 + }, + { + "epoch": 0.27116225136027117, + "grad_norm": 0.6456661224365234, + "learning_rate": 0.00047765869321372834, + "loss": 1.2405, + "step": 1520 + }, + { + "epoch": 0.27134064757827137, + "grad_norm": 0.6640433073043823, + "learning_rate": 0.00047762973368597117, + "loss": 1.2917, + "step": 1521 + }, + { + "epoch": 0.2715190437962715, + "grad_norm": 0.5571081042289734, + "learning_rate": 0.0004776007562802728, + "loss": 1.1319, + "step": 1522 + }, + { + "epoch": 0.2716974400142717, + "grad_norm": 0.6709672212600708, + "learning_rate": 0.0004775717609989089, + "loss": 1.1581, + "step": 1523 + }, + { + "epoch": 0.27187583623227185, + "grad_norm": 0.6140976548194885, + "learning_rate": 0.00047754274784415673, + "loss": 1.0881, + "step": 1524 + }, + { + "epoch": 0.27205423245027205, + "grad_norm": 0.5703726410865784, + "learning_rate": 0.0004775137168182952, + "loss": 1.1018, + "step": 1525 + }, + { + "epoch": 0.27223262866827225, + "grad_norm": 1.8433053493499756, + "learning_rate": 0.00047748466792360425, + "loss": 1.0196, + "step": 1526 + }, + { + "epoch": 0.2724110248862724, + "grad_norm": 0.5568527579307556, + "learning_rate": 0.0004774556011623653, + "loss": 0.9479, + "step": 1527 + }, + { + "epoch": 0.2725894211042726, + "grad_norm": 0.6627964377403259, + "learning_rate": 0.00047742651653686133, + "loss": 0.8791, + "step": 1528 + }, + { + "epoch": 0.2727678173222728, + "grad_norm": 0.655723512172699, + "learning_rate": 0.00047739741404937666, + "loss": 1.5567, + "step": 1529 + }, + { + "epoch": 0.27294621354027293, + "grad_norm": 1.177575707435608, + "learning_rate": 0.00047736829370219694, + "loss": 1.1893, + "step": 1530 + }, + { + "epoch": 0.27312460975827313, + "grad_norm": 0.5675112009048462, + "learning_rate": 0.0004773391554976093, + "loss": 1.2078, + "step": 1531 + }, + { + "epoch": 0.2733030059762733, + "grad_norm": 0.6315929889678955, + "learning_rate": 0.00047730999943790216, + "loss": 1.0456, + "step": 1532 + }, + { + "epoch": 0.27348140219427347, + "grad_norm": 1.2165286540985107, + "learning_rate": 0.0004772808255253655, + "loss": 1.3296, + "step": 1533 + }, + { + "epoch": 0.27365979841227367, + "grad_norm": 0.6034713983535767, + "learning_rate": 0.00047725163376229063, + "loss": 1.2784, + "step": 1534 + }, + { + "epoch": 0.2738381946302738, + "grad_norm": 0.5982022285461426, + "learning_rate": 0.0004772224241509702, + "loss": 1.2338, + "step": 1535 + }, + { + "epoch": 0.274016590848274, + "grad_norm": 0.797183096408844, + "learning_rate": 0.00047719319669369843, + "loss": 1.2911, + "step": 1536 + }, + { + "epoch": 0.2741949870662742, + "grad_norm": 0.5509144067764282, + "learning_rate": 0.0004771639513927707, + "loss": 1.3141, + "step": 1537 + }, + { + "epoch": 0.27437338328427435, + "grad_norm": 0.6552438735961914, + "learning_rate": 0.000477134688250484, + "loss": 1.3726, + "step": 1538 + }, + { + "epoch": 0.27455177950227455, + "grad_norm": 0.5664661526679993, + "learning_rate": 0.0004771054072691367, + "loss": 1.484, + "step": 1539 + }, + { + "epoch": 0.27473017572027475, + "grad_norm": 0.657057523727417, + "learning_rate": 0.0004770761084510283, + "loss": 1.5254, + "step": 1540 + }, + { + "epoch": 0.2749085719382749, + "grad_norm": 0.5660455822944641, + "learning_rate": 0.00047704679179846014, + "loss": 1.1427, + "step": 1541 + }, + { + "epoch": 0.2750869681562751, + "grad_norm": 0.5524396300315857, + "learning_rate": 0.00047701745731373467, + "loss": 1.1335, + "step": 1542 + }, + { + "epoch": 0.2752653643742753, + "grad_norm": 0.5262104272842407, + "learning_rate": 0.00047698810499915577, + "loss": 1.0167, + "step": 1543 + }, + { + "epoch": 0.27544376059227543, + "grad_norm": 0.5536545515060425, + "learning_rate": 0.0004769587348570288, + "loss": 1.0548, + "step": 1544 + }, + { + "epoch": 0.27562215681027563, + "grad_norm": 0.5155540108680725, + "learning_rate": 0.0004769293468896605, + "loss": 1.1564, + "step": 1545 + }, + { + "epoch": 0.2758005530282758, + "grad_norm": 0.5515694618225098, + "learning_rate": 0.00047689994109935884, + "loss": 1.1428, + "step": 1546 + }, + { + "epoch": 0.275978949246276, + "grad_norm": 0.5768205523490906, + "learning_rate": 0.00047687051748843357, + "loss": 1.371, + "step": 1547 + }, + { + "epoch": 0.27615734546427617, + "grad_norm": 0.6669476628303528, + "learning_rate": 0.0004768410760591955, + "loss": 1.3972, + "step": 1548 + }, + { + "epoch": 0.2763357416822763, + "grad_norm": 1.1227506399154663, + "learning_rate": 0.0004768116168139568, + "loss": 0.9764, + "step": 1549 + }, + { + "epoch": 0.2765141379002765, + "grad_norm": 0.5911063551902771, + "learning_rate": 0.00047678213975503136, + "loss": 1.2668, + "step": 1550 + }, + { + "epoch": 0.2766925341182767, + "grad_norm": 0.47411879897117615, + "learning_rate": 0.00047675264488473436, + "loss": 0.9734, + "step": 1551 + }, + { + "epoch": 0.27687093033627685, + "grad_norm": 0.5374860763549805, + "learning_rate": 0.0004767231322053821, + "loss": 1.1893, + "step": 1552 + }, + { + "epoch": 0.27704932655427705, + "grad_norm": 0.5706403851509094, + "learning_rate": 0.00047669360171929265, + "loss": 1.1484, + "step": 1553 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.6070848703384399, + "learning_rate": 0.0004766640534287853, + "loss": 1.3872, + "step": 1554 + }, + { + "epoch": 0.2774061189902774, + "grad_norm": 0.5827165246009827, + "learning_rate": 0.00047663448733618066, + "loss": 1.3347, + "step": 1555 + }, + { + "epoch": 0.2775845152082776, + "grad_norm": 3.5079944133758545, + "learning_rate": 0.00047660490344380094, + "loss": 1.1723, + "step": 1556 + }, + { + "epoch": 0.27776291142627774, + "grad_norm": 0.6186500787734985, + "learning_rate": 0.00047657530175396955, + "loss": 1.5012, + "step": 1557 + }, + { + "epoch": 0.27794130764427794, + "grad_norm": 0.6020243763923645, + "learning_rate": 0.0004765456822690116, + "loss": 1.0751, + "step": 1558 + }, + { + "epoch": 0.27811970386227813, + "grad_norm": 0.5379880666732788, + "learning_rate": 0.00047651604499125325, + "loss": 0.9193, + "step": 1559 + }, + { + "epoch": 0.2782981000802783, + "grad_norm": 0.6437864899635315, + "learning_rate": 0.0004764863899230221, + "loss": 1.2344, + "step": 1560 + }, + { + "epoch": 0.2784764962982785, + "grad_norm": 0.5693362951278687, + "learning_rate": 0.00047645671706664737, + "loss": 1.1764, + "step": 1561 + }, + { + "epoch": 0.2786548925162787, + "grad_norm": 0.5577165484428406, + "learning_rate": 0.00047642702642445954, + "loss": 1.2728, + "step": 1562 + }, + { + "epoch": 0.2788332887342788, + "grad_norm": 0.5637865662574768, + "learning_rate": 0.0004763973179987906, + "loss": 1.3064, + "step": 1563 + }, + { + "epoch": 0.279011684952279, + "grad_norm": 0.5422176122665405, + "learning_rate": 0.00047636759179197366, + "loss": 0.9754, + "step": 1564 + }, + { + "epoch": 0.2791900811702792, + "grad_norm": 0.5627367496490479, + "learning_rate": 0.00047633784780634343, + "loss": 1.0802, + "step": 1565 + }, + { + "epoch": 0.27936847738827936, + "grad_norm": 0.6842982769012451, + "learning_rate": 0.0004763080860442361, + "loss": 1.0098, + "step": 1566 + }, + { + "epoch": 0.27954687360627956, + "grad_norm": 0.6131782531738281, + "learning_rate": 0.00047627830650798903, + "loss": 1.4362, + "step": 1567 + }, + { + "epoch": 0.2797252698242797, + "grad_norm": 0.6176859736442566, + "learning_rate": 0.00047624850919994113, + "loss": 1.3148, + "step": 1568 + }, + { + "epoch": 0.2799036660422799, + "grad_norm": 0.6015002727508545, + "learning_rate": 0.00047621869412243275, + "loss": 1.1703, + "step": 1569 + }, + { + "epoch": 0.2800820622602801, + "grad_norm": 0.750379204750061, + "learning_rate": 0.0004761888612778054, + "loss": 1.2353, + "step": 1570 + }, + { + "epoch": 0.28026045847828024, + "grad_norm": 0.567866325378418, + "learning_rate": 0.0004761590106684023, + "loss": 1.2543, + "step": 1571 + }, + { + "epoch": 0.28043885469628044, + "grad_norm": 0.5942226052284241, + "learning_rate": 0.0004761291422965678, + "loss": 1.2128, + "step": 1572 + }, + { + "epoch": 0.28061725091428064, + "grad_norm": 0.4971591830253601, + "learning_rate": 0.00047609925616464777, + "loss": 1.0342, + "step": 1573 + }, + { + "epoch": 0.2807956471322808, + "grad_norm": 0.5671881437301636, + "learning_rate": 0.0004760693522749894, + "loss": 1.0644, + "step": 1574 + }, + { + "epoch": 0.280974043350281, + "grad_norm": 0.887310266494751, + "learning_rate": 0.00047603943062994147, + "loss": 1.0307, + "step": 1575 + }, + { + "epoch": 0.2811524395682812, + "grad_norm": 0.5171691179275513, + "learning_rate": 0.00047600949123185386, + "loss": 0.9699, + "step": 1576 + }, + { + "epoch": 0.2813308357862813, + "grad_norm": 0.799001932144165, + "learning_rate": 0.00047597953408307813, + "loss": 1.2391, + "step": 1577 + }, + { + "epoch": 0.2815092320042815, + "grad_norm": 0.5518134832382202, + "learning_rate": 0.00047594955918596704, + "loss": 0.9825, + "step": 1578 + }, + { + "epoch": 0.28168762822228166, + "grad_norm": 0.512997567653656, + "learning_rate": 0.00047591956654287484, + "loss": 1.1596, + "step": 1579 + }, + { + "epoch": 0.28186602444028186, + "grad_norm": 0.5603839755058289, + "learning_rate": 0.00047588955615615705, + "loss": 1.1735, + "step": 1580 + }, + { + "epoch": 0.28204442065828206, + "grad_norm": 0.5397359728813171, + "learning_rate": 0.0004758595280281707, + "loss": 1.1707, + "step": 1581 + }, + { + "epoch": 0.2822228168762822, + "grad_norm": 0.5633308291435242, + "learning_rate": 0.0004758294821612742, + "loss": 1.1632, + "step": 1582 + }, + { + "epoch": 0.2824012130942824, + "grad_norm": 1.6372519731521606, + "learning_rate": 0.00047579941855782745, + "loss": 0.9778, + "step": 1583 + }, + { + "epoch": 0.2825796093122826, + "grad_norm": 0.790706217288971, + "learning_rate": 0.00047576933722019146, + "loss": 1.2109, + "step": 1584 + }, + { + "epoch": 0.28275800553028274, + "grad_norm": 0.5796968340873718, + "learning_rate": 0.0004757392381507289, + "loss": 1.3131, + "step": 1585 + }, + { + "epoch": 0.28293640174828294, + "grad_norm": 0.7055728435516357, + "learning_rate": 0.0004757091213518037, + "loss": 1.1037, + "step": 1586 + }, + { + "epoch": 0.28311479796628314, + "grad_norm": 1.4809973239898682, + "learning_rate": 0.00047567898682578124, + "loss": 1.1257, + "step": 1587 + }, + { + "epoch": 0.2832931941842833, + "grad_norm": 0.6054453253746033, + "learning_rate": 0.00047564883457502835, + "loss": 1.0544, + "step": 1588 + }, + { + "epoch": 0.2834715904022835, + "grad_norm": 6.443731307983398, + "learning_rate": 0.0004756186646019131, + "loss": 0.8428, + "step": 1589 + }, + { + "epoch": 0.2836499866202836, + "grad_norm": 9.057873725891113, + "learning_rate": 0.0004755884769088049, + "loss": 1.5444, + "step": 1590 + }, + { + "epoch": 0.2838283828382838, + "grad_norm": 0.7593368291854858, + "learning_rate": 0.00047555827149807484, + "loss": 1.1721, + "step": 1591 + }, + { + "epoch": 0.284006779056284, + "grad_norm": 0.6235383749008179, + "learning_rate": 0.00047552804837209525, + "loss": 1.3072, + "step": 1592 + }, + { + "epoch": 0.28418517527428416, + "grad_norm": 0.9655506610870361, + "learning_rate": 0.0004754978075332398, + "loss": 1.1129, + "step": 1593 + }, + { + "epoch": 0.28436357149228436, + "grad_norm": 0.7398672699928284, + "learning_rate": 0.0004754675489838835, + "loss": 0.8643, + "step": 1594 + }, + { + "epoch": 0.28454196771028456, + "grad_norm": 0.6276271343231201, + "learning_rate": 0.000475437272726403, + "loss": 1.1541, + "step": 1595 + }, + { + "epoch": 0.2847203639282847, + "grad_norm": 0.6548689603805542, + "learning_rate": 0.0004754069787631761, + "loss": 1.024, + "step": 1596 + }, + { + "epoch": 0.2848987601462849, + "grad_norm": 0.7249738574028015, + "learning_rate": 0.0004753766670965821, + "loss": 1.0359, + "step": 1597 + }, + { + "epoch": 0.2850771563642851, + "grad_norm": 0.6566236615180969, + "learning_rate": 0.0004753463377290016, + "loss": 0.8981, + "step": 1598 + }, + { + "epoch": 0.28525555258228524, + "grad_norm": 0.7142555117607117, + "learning_rate": 0.0004753159906628167, + "loss": 1.2243, + "step": 1599 + }, + { + "epoch": 0.28543394880028544, + "grad_norm": 0.5189533233642578, + "learning_rate": 0.00047528562590041097, + "loss": 1.1284, + "step": 1600 + }, + { + "epoch": 0.2856123450182856, + "grad_norm": 0.619817316532135, + "learning_rate": 0.000475255243444169, + "loss": 1.0063, + "step": 1601 + }, + { + "epoch": 0.2857907412362858, + "grad_norm": 3.838261127471924, + "learning_rate": 0.00047522484329647725, + "loss": 1.0014, + "step": 1602 + }, + { + "epoch": 0.285969137454286, + "grad_norm": 0.6709682941436768, + "learning_rate": 0.0004751944254597232, + "loss": 1.1519, + "step": 1603 + }, + { + "epoch": 0.2861475336722861, + "grad_norm": 0.6216676235198975, + "learning_rate": 0.0004751639899362958, + "loss": 1.17, + "step": 1604 + }, + { + "epoch": 0.2863259298902863, + "grad_norm": 2.6110217571258545, + "learning_rate": 0.00047513353672858565, + "loss": 1.3296, + "step": 1605 + }, + { + "epoch": 0.2865043261082865, + "grad_norm": 0.6154805421829224, + "learning_rate": 0.0004751030658389843, + "loss": 1.4798, + "step": 1606 + }, + { + "epoch": 0.28668272232628667, + "grad_norm": 0.6929340958595276, + "learning_rate": 0.00047507257726988515, + "loss": 0.8626, + "step": 1607 + }, + { + "epoch": 0.28686111854428686, + "grad_norm": 0.5480323433876038, + "learning_rate": 0.00047504207102368255, + "loss": 1.0958, + "step": 1608 + }, + { + "epoch": 0.28703951476228706, + "grad_norm": 0.6278098821640015, + "learning_rate": 0.00047501154710277255, + "loss": 1.4003, + "step": 1609 + }, + { + "epoch": 0.2872179109802872, + "grad_norm": 0.5721479058265686, + "learning_rate": 0.0004749810055095525, + "loss": 0.9552, + "step": 1610 + }, + { + "epoch": 0.2873963071982874, + "grad_norm": 0.5661513209342957, + "learning_rate": 0.000474950446246421, + "loss": 1.3078, + "step": 1611 + }, + { + "epoch": 0.28757470341628755, + "grad_norm": 0.508573055267334, + "learning_rate": 0.00047491986931577835, + "loss": 1.1201, + "step": 1612 + }, + { + "epoch": 0.28775309963428775, + "grad_norm": 0.5927449464797974, + "learning_rate": 0.00047488927472002596, + "loss": 1.26, + "step": 1613 + }, + { + "epoch": 0.28793149585228794, + "grad_norm": 0.5334576964378357, + "learning_rate": 0.00047485866246156665, + "loss": 1.313, + "step": 1614 + }, + { + "epoch": 0.2881098920702881, + "grad_norm": 0.5566635131835938, + "learning_rate": 0.0004748280325428048, + "loss": 1.4478, + "step": 1615 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 0.47653868794441223, + "learning_rate": 0.000474797384966146, + "loss": 1.0392, + "step": 1616 + }, + { + "epoch": 0.2884666845062885, + "grad_norm": 0.6128924489021301, + "learning_rate": 0.0004747667197339974, + "loss": 1.2961, + "step": 1617 + }, + { + "epoch": 0.2886450807242886, + "grad_norm": 0.4769141376018524, + "learning_rate": 0.0004747360368487672, + "loss": 0.9468, + "step": 1618 + }, + { + "epoch": 0.2888234769422888, + "grad_norm": 0.5372063517570496, + "learning_rate": 0.0004747053363128655, + "loss": 1.1281, + "step": 1619 + }, + { + "epoch": 0.289001873160289, + "grad_norm": 0.499072402715683, + "learning_rate": 0.0004746746181287034, + "loss": 1.094, + "step": 1620 + }, + { + "epoch": 0.28918026937828917, + "grad_norm": 0.6395072937011719, + "learning_rate": 0.0004746438822986934, + "loss": 1.3721, + "step": 1621 + }, + { + "epoch": 0.28935866559628937, + "grad_norm": 0.5611110925674438, + "learning_rate": 0.00047461312882524954, + "loss": 1.1692, + "step": 1622 + }, + { + "epoch": 0.28953706181428956, + "grad_norm": 0.6090511679649353, + "learning_rate": 0.0004745823577107873, + "loss": 1.175, + "step": 1623 + }, + { + "epoch": 0.2897154580322897, + "grad_norm": 0.5791161060333252, + "learning_rate": 0.0004745515689577233, + "loss": 1.2759, + "step": 1624 + }, + { + "epoch": 0.2898938542502899, + "grad_norm": 0.5539647936820984, + "learning_rate": 0.0004745207625684756, + "loss": 1.1852, + "step": 1625 + }, + { + "epoch": 0.29007225046829005, + "grad_norm": 1.770546317100525, + "learning_rate": 0.000474489938545464, + "loss": 1.0354, + "step": 1626 + }, + { + "epoch": 0.29025064668629025, + "grad_norm": 0.640678882598877, + "learning_rate": 0.0004744590968911091, + "loss": 1.2128, + "step": 1627 + }, + { + "epoch": 0.29042904290429045, + "grad_norm": 0.6019294261932373, + "learning_rate": 0.00047442823760783336, + "loss": 1.3319, + "step": 1628 + }, + { + "epoch": 0.2906074391222906, + "grad_norm": 0.6891437768936157, + "learning_rate": 0.0004743973606980604, + "loss": 0.8761, + "step": 1629 + }, + { + "epoch": 0.2907858353402908, + "grad_norm": 0.5156970620155334, + "learning_rate": 0.0004743664661642153, + "loss": 1.139, + "step": 1630 + }, + { + "epoch": 0.290964231558291, + "grad_norm": 0.5733403563499451, + "learning_rate": 0.0004743355540087245, + "loss": 1.3032, + "step": 1631 + }, + { + "epoch": 0.29114262777629113, + "grad_norm": 0.7720729112625122, + "learning_rate": 0.00047430462423401587, + "loss": 1.1981, + "step": 1632 + }, + { + "epoch": 0.29132102399429133, + "grad_norm": 0.4814129173755646, + "learning_rate": 0.00047427367684251855, + "loss": 0.7758, + "step": 1633 + }, + { + "epoch": 0.2914994202122915, + "grad_norm": 0.6132440567016602, + "learning_rate": 0.0004742427118366632, + "loss": 1.4416, + "step": 1634 + }, + { + "epoch": 0.29167781643029167, + "grad_norm": 0.5137991309165955, + "learning_rate": 0.0004742117292188817, + "loss": 0.913, + "step": 1635 + }, + { + "epoch": 0.29185621264829187, + "grad_norm": 0.561093270778656, + "learning_rate": 0.0004741807289916075, + "loss": 1.0205, + "step": 1636 + }, + { + "epoch": 0.292034608866292, + "grad_norm": 0.5749049782752991, + "learning_rate": 0.00047414971115727536, + "loss": 1.1973, + "step": 1637 + }, + { + "epoch": 0.2922130050842922, + "grad_norm": 0.5146419405937195, + "learning_rate": 0.00047411867571832135, + "loss": 1.0004, + "step": 1638 + }, + { + "epoch": 0.2923914013022924, + "grad_norm": 0.6099346280097961, + "learning_rate": 0.00047408762267718297, + "loss": 1.1748, + "step": 1639 + }, + { + "epoch": 0.29256979752029255, + "grad_norm": 0.5745267271995544, + "learning_rate": 0.0004740565520362991, + "loss": 1.1932, + "step": 1640 + }, + { + "epoch": 0.29274819373829275, + "grad_norm": 0.5441005229949951, + "learning_rate": 0.0004740254637981101, + "loss": 1.2205, + "step": 1641 + }, + { + "epoch": 0.29292658995629295, + "grad_norm": 0.5015900135040283, + "learning_rate": 0.00047399435796505754, + "loss": 0.8114, + "step": 1642 + }, + { + "epoch": 0.2931049861742931, + "grad_norm": 0.6179179549217224, + "learning_rate": 0.0004739632345395846, + "loss": 1.1337, + "step": 1643 + }, + { + "epoch": 0.2932833823922933, + "grad_norm": 0.5103684663772583, + "learning_rate": 0.0004739320935241355, + "loss": 0.8642, + "step": 1644 + }, + { + "epoch": 0.2934617786102935, + "grad_norm": 0.68010014295578, + "learning_rate": 0.0004739009349211561, + "loss": 1.284, + "step": 1645 + }, + { + "epoch": 0.29364017482829363, + "grad_norm": 0.5158190131187439, + "learning_rate": 0.0004738697587330937, + "loss": 1.1043, + "step": 1646 + }, + { + "epoch": 0.29381857104629383, + "grad_norm": 0.5020380616188049, + "learning_rate": 0.00047383856496239677, + "loss": 0.9218, + "step": 1647 + }, + { + "epoch": 0.293996967264294, + "grad_norm": 0.5497924089431763, + "learning_rate": 0.00047380735361151526, + "loss": 1.3251, + "step": 1648 + }, + { + "epoch": 0.29417536348229417, + "grad_norm": 0.5309639573097229, + "learning_rate": 0.00047377612468290053, + "loss": 1.0265, + "step": 1649 + }, + { + "epoch": 0.29435375970029437, + "grad_norm": 0.528611958026886, + "learning_rate": 0.0004737448781790052, + "loss": 1.1084, + "step": 1650 + }, + { + "epoch": 0.2945321559182945, + "grad_norm": 0.7254324555397034, + "learning_rate": 0.0004737136141022836, + "loss": 1.2578, + "step": 1651 + }, + { + "epoch": 0.2947105521362947, + "grad_norm": 0.6030469536781311, + "learning_rate": 0.0004736823324551909, + "loss": 1.0314, + "step": 1652 + }, + { + "epoch": 0.2948889483542949, + "grad_norm": 0.5361880660057068, + "learning_rate": 0.00047365103324018405, + "loss": 1.1275, + "step": 1653 + }, + { + "epoch": 0.29506734457229505, + "grad_norm": 0.5910381078720093, + "learning_rate": 0.00047361971645972135, + "loss": 0.9726, + "step": 1654 + }, + { + "epoch": 0.29524574079029525, + "grad_norm": 0.6346861124038696, + "learning_rate": 0.00047358838211626234, + "loss": 1.0678, + "step": 1655 + }, + { + "epoch": 0.29542413700829545, + "grad_norm": 2.6803693771362305, + "learning_rate": 0.0004735570302122679, + "loss": 1.2372, + "step": 1656 + }, + { + "epoch": 0.2956025332262956, + "grad_norm": 0.571451723575592, + "learning_rate": 0.0004735256607502006, + "loss": 1.1094, + "step": 1657 + }, + { + "epoch": 0.2957809294442958, + "grad_norm": 0.5796542167663574, + "learning_rate": 0.0004734942737325242, + "loss": 1.05, + "step": 1658 + }, + { + "epoch": 0.29595932566229594, + "grad_norm": 0.6353934407234192, + "learning_rate": 0.00047346286916170356, + "loss": 1.1633, + "step": 1659 + }, + { + "epoch": 0.29613772188029613, + "grad_norm": 0.7311161160469055, + "learning_rate": 0.00047343144704020543, + "loss": 1.1351, + "step": 1660 + }, + { + "epoch": 0.29631611809829633, + "grad_norm": 0.5946211218833923, + "learning_rate": 0.00047340000737049756, + "loss": 1.0721, + "step": 1661 + }, + { + "epoch": 0.2964945143162965, + "grad_norm": 0.6070974469184875, + "learning_rate": 0.00047336855015504923, + "loss": 1.0058, + "step": 1662 + }, + { + "epoch": 0.2966729105342967, + "grad_norm": 0.5165389180183411, + "learning_rate": 0.0004733370753963311, + "loss": 0.9556, + "step": 1663 + }, + { + "epoch": 0.2968513067522969, + "grad_norm": 0.7445652484893799, + "learning_rate": 0.0004733055830968152, + "loss": 0.9277, + "step": 1664 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.520530641078949, + "learning_rate": 0.0004732740732589749, + "loss": 1.0856, + "step": 1665 + }, + { + "epoch": 0.2972080991882972, + "grad_norm": 0.6134181618690491, + "learning_rate": 0.00047324254588528497, + "loss": 1.3027, + "step": 1666 + }, + { + "epoch": 0.2973864954062974, + "grad_norm": 0.5679133534431458, + "learning_rate": 0.00047321100097822154, + "loss": 1.1749, + "step": 1667 + }, + { + "epoch": 0.29756489162429756, + "grad_norm": 0.5943863391876221, + "learning_rate": 0.0004731794385402621, + "loss": 1.2488, + "step": 1668 + }, + { + "epoch": 0.29774328784229775, + "grad_norm": 0.5449469089508057, + "learning_rate": 0.00047314785857388575, + "loss": 1.1212, + "step": 1669 + }, + { + "epoch": 0.2979216840602979, + "grad_norm": 0.882116973400116, + "learning_rate": 0.0004731162610815725, + "loss": 1.2176, + "step": 1670 + }, + { + "epoch": 0.2981000802782981, + "grad_norm": 0.6479673385620117, + "learning_rate": 0.0004730846460658041, + "loss": 1.3039, + "step": 1671 + }, + { + "epoch": 0.2982784764962983, + "grad_norm": 0.5808786749839783, + "learning_rate": 0.0004730530135290637, + "loss": 1.0677, + "step": 1672 + }, + { + "epoch": 0.29845687271429844, + "grad_norm": 0.527851402759552, + "learning_rate": 0.0004730213634738355, + "loss": 1.0218, + "step": 1673 + }, + { + "epoch": 0.29863526893229864, + "grad_norm": 0.8989919424057007, + "learning_rate": 0.00047298969590260545, + "loss": 1.0192, + "step": 1674 + }, + { + "epoch": 0.29881366515029884, + "grad_norm": 0.5305653214454651, + "learning_rate": 0.0004729580108178606, + "loss": 1.0954, + "step": 1675 + }, + { + "epoch": 0.298992061368299, + "grad_norm": 0.6758263111114502, + "learning_rate": 0.0004729263082220896, + "loss": 1.1321, + "step": 1676 + }, + { + "epoch": 0.2991704575862992, + "grad_norm": 0.7814352512359619, + "learning_rate": 0.0004728945881177822, + "loss": 0.8791, + "step": 1677 + }, + { + "epoch": 0.2993488538042994, + "grad_norm": 1.4971033334732056, + "learning_rate": 0.00047286285050742984, + "loss": 1.2261, + "step": 1678 + }, + { + "epoch": 0.2995272500222995, + "grad_norm": 0.6250385046005249, + "learning_rate": 0.0004728310953935251, + "loss": 1.0965, + "step": 1679 + }, + { + "epoch": 0.2997056462402997, + "grad_norm": 0.5605043172836304, + "learning_rate": 0.00047279932277856195, + "loss": 1.2065, + "step": 1680 + }, + { + "epoch": 0.29988404245829986, + "grad_norm": 0.5820797085762024, + "learning_rate": 0.0004727675326650359, + "loss": 0.892, + "step": 1681 + }, + { + "epoch": 0.30006243867630006, + "grad_norm": 0.626677930355072, + "learning_rate": 0.0004727357250554437, + "loss": 1.2638, + "step": 1682 + }, + { + "epoch": 0.30024083489430026, + "grad_norm": 0.5350278615951538, + "learning_rate": 0.00047270389995228353, + "loss": 1.2105, + "step": 1683 + }, + { + "epoch": 0.3004192311123004, + "grad_norm": 0.6105818748474121, + "learning_rate": 0.0004726720573580549, + "loss": 1.0251, + "step": 1684 + }, + { + "epoch": 0.3005976273303006, + "grad_norm": 0.6602348685264587, + "learning_rate": 0.00047264019727525866, + "loss": 1.3229, + "step": 1685 + }, + { + "epoch": 0.3007760235483008, + "grad_norm": 0.5458388924598694, + "learning_rate": 0.00047260831970639716, + "loss": 1.2786, + "step": 1686 + }, + { + "epoch": 0.30095441976630094, + "grad_norm": 0.6178166270256042, + "learning_rate": 0.000472576424653974, + "loss": 1.1324, + "step": 1687 + }, + { + "epoch": 0.30113281598430114, + "grad_norm": 0.6096773147583008, + "learning_rate": 0.0004725445121204943, + "loss": 1.0911, + "step": 1688 + }, + { + "epoch": 0.30131121220230134, + "grad_norm": 0.6458967924118042, + "learning_rate": 0.0004725125821084643, + "loss": 1.0113, + "step": 1689 + }, + { + "epoch": 0.3014896084203015, + "grad_norm": 0.6352248191833496, + "learning_rate": 0.00047248063462039194, + "loss": 1.206, + "step": 1690 + }, + { + "epoch": 0.3016680046383017, + "grad_norm": 0.7942283749580383, + "learning_rate": 0.0004724486696587862, + "loss": 1.3854, + "step": 1691 + }, + { + "epoch": 0.3018464008563018, + "grad_norm": 0.6200905442237854, + "learning_rate": 0.00047241668722615773, + "loss": 1.0581, + "step": 1692 + }, + { + "epoch": 0.302024797074302, + "grad_norm": 0.6159539222717285, + "learning_rate": 0.0004723846873250183, + "loss": 1.15, + "step": 1693 + }, + { + "epoch": 0.3022031932923022, + "grad_norm": 0.5503310561180115, + "learning_rate": 0.00047235266995788127, + "loss": 1.0242, + "step": 1694 + }, + { + "epoch": 0.30238158951030236, + "grad_norm": 0.8836873173713684, + "learning_rate": 0.00047232063512726125, + "loss": 1.2961, + "step": 1695 + }, + { + "epoch": 0.30255998572830256, + "grad_norm": 0.6245954036712646, + "learning_rate": 0.0004722885828356742, + "loss": 1.2354, + "step": 1696 + }, + { + "epoch": 0.30273838194630276, + "grad_norm": 1.8814623355865479, + "learning_rate": 0.0004722565130856375, + "loss": 0.9584, + "step": 1697 + }, + { + "epoch": 0.3029167781643029, + "grad_norm": 0.6291859149932861, + "learning_rate": 0.0004722244258796699, + "loss": 1.3523, + "step": 1698 + }, + { + "epoch": 0.3030951743823031, + "grad_norm": 0.52773517370224, + "learning_rate": 0.00047219232122029154, + "loss": 1.1431, + "step": 1699 + }, + { + "epoch": 0.3032735706003033, + "grad_norm": 0.6015790104866028, + "learning_rate": 0.0004721601991100239, + "loss": 1.2411, + "step": 1700 + }, + { + "epoch": 0.30345196681830344, + "grad_norm": 0.5504578351974487, + "learning_rate": 0.0004721280595513898, + "loss": 1.1042, + "step": 1701 + }, + { + "epoch": 0.30363036303630364, + "grad_norm": 0.5806344747543335, + "learning_rate": 0.00047209590254691347, + "loss": 1.4875, + "step": 1702 + }, + { + "epoch": 0.3038087592543038, + "grad_norm": 0.6146537065505981, + "learning_rate": 0.0004720637280991206, + "loss": 1.3344, + "step": 1703 + }, + { + "epoch": 0.303987155472304, + "grad_norm": 0.5845167636871338, + "learning_rate": 0.00047203153621053803, + "loss": 1.3061, + "step": 1704 + }, + { + "epoch": 0.3041655516903042, + "grad_norm": 0.555408775806427, + "learning_rate": 0.0004719993268836942, + "loss": 0.9783, + "step": 1705 + }, + { + "epoch": 0.3043439479083043, + "grad_norm": 0.7523461580276489, + "learning_rate": 0.00047196710012111865, + "loss": 1.0337, + "step": 1706 + }, + { + "epoch": 0.3045223441263045, + "grad_norm": 0.5478354096412659, + "learning_rate": 0.00047193485592534267, + "loss": 1.1427, + "step": 1707 + }, + { + "epoch": 0.3047007403443047, + "grad_norm": 0.5235744118690491, + "learning_rate": 0.0004719025942988986, + "loss": 1.1299, + "step": 1708 + }, + { + "epoch": 0.30487913656230486, + "grad_norm": 0.5737703442573547, + "learning_rate": 0.00047187031524432033, + "loss": 1.259, + "step": 1709 + }, + { + "epoch": 0.30505753278030506, + "grad_norm": 0.515281617641449, + "learning_rate": 0.0004718380187641429, + "loss": 1.0624, + "step": 1710 + }, + { + "epoch": 0.30523592899830526, + "grad_norm": 0.4825386703014374, + "learning_rate": 0.000471805704860903, + "loss": 0.9467, + "step": 1711 + }, + { + "epoch": 0.3054143252163054, + "grad_norm": 0.73399418592453, + "learning_rate": 0.00047177337353713843, + "loss": 1.2483, + "step": 1712 + }, + { + "epoch": 0.3055927214343056, + "grad_norm": 0.5785844922065735, + "learning_rate": 0.00047174102479538853, + "loss": 1.1172, + "step": 1713 + }, + { + "epoch": 0.30577111765230575, + "grad_norm": 0.559877336025238, + "learning_rate": 0.000471708658638194, + "loss": 1.1775, + "step": 1714 + }, + { + "epoch": 0.30594951387030594, + "grad_norm": 0.5484994649887085, + "learning_rate": 0.00047167627506809686, + "loss": 1.0763, + "step": 1715 + }, + { + "epoch": 0.30612791008830614, + "grad_norm": 0.6489263772964478, + "learning_rate": 0.0004716438740876404, + "loss": 1.0646, + "step": 1716 + }, + { + "epoch": 0.3063063063063063, + "grad_norm": 0.5778820514678955, + "learning_rate": 0.0004716114556993695, + "loss": 1.0459, + "step": 1717 + }, + { + "epoch": 0.3064847025243065, + "grad_norm": 2.8568458557128906, + "learning_rate": 0.00047157901990583026, + "loss": 0.9765, + "step": 1718 + }, + { + "epoch": 0.3066630987423067, + "grad_norm": 0.6178241968154907, + "learning_rate": 0.0004715465667095701, + "loss": 1.2187, + "step": 1719 + }, + { + "epoch": 0.3068414949603068, + "grad_norm": 0.5349613428115845, + "learning_rate": 0.0004715140961131379, + "loss": 1.063, + "step": 1720 + }, + { + "epoch": 0.307019891178307, + "grad_norm": 0.556450366973877, + "learning_rate": 0.00047148160811908395, + "loss": 1.0956, + "step": 1721 + }, + { + "epoch": 0.3071982873963072, + "grad_norm": 0.4912685453891754, + "learning_rate": 0.0004714491027299599, + "loss": 0.8442, + "step": 1722 + }, + { + "epoch": 0.30737668361430737, + "grad_norm": 0.5774837136268616, + "learning_rate": 0.0004714165799483185, + "loss": 1.2867, + "step": 1723 + }, + { + "epoch": 0.30755507983230757, + "grad_norm": 0.5329927206039429, + "learning_rate": 0.0004713840397767142, + "loss": 0.8711, + "step": 1724 + }, + { + "epoch": 0.3077334760503077, + "grad_norm": 0.5508580803871155, + "learning_rate": 0.00047135148221770273, + "loss": 1.1184, + "step": 1725 + }, + { + "epoch": 0.3079118722683079, + "grad_norm": 0.6747270822525024, + "learning_rate": 0.0004713189072738411, + "loss": 1.3457, + "step": 1726 + }, + { + "epoch": 0.3080902684863081, + "grad_norm": 0.5561022162437439, + "learning_rate": 0.0004712863149476877, + "loss": 1.0277, + "step": 1727 + }, + { + "epoch": 0.30826866470430825, + "grad_norm": 0.6054466962814331, + "learning_rate": 0.0004712537052418024, + "loss": 1.3957, + "step": 1728 + }, + { + "epoch": 0.30844706092230845, + "grad_norm": 84.19719696044922, + "learning_rate": 0.00047122107815874626, + "loss": 1.2228, + "step": 1729 + }, + { + "epoch": 0.30862545714030865, + "grad_norm": 0.6073416471481323, + "learning_rate": 0.00047118843370108187, + "loss": 1.0421, + "step": 1730 + }, + { + "epoch": 0.3088038533583088, + "grad_norm": 0.7347621917724609, + "learning_rate": 0.00047115577187137304, + "loss": 1.2824, + "step": 1731 + }, + { + "epoch": 0.308982249576309, + "grad_norm": 0.5585038661956787, + "learning_rate": 0.00047112309267218513, + "loss": 1.0725, + "step": 1732 + }, + { + "epoch": 0.3091606457943092, + "grad_norm": 0.5505072474479675, + "learning_rate": 0.00047109039610608464, + "loss": 1.1923, + "step": 1733 + }, + { + "epoch": 0.30933904201230933, + "grad_norm": 2.2346675395965576, + "learning_rate": 0.0004710576821756395, + "loss": 1.3058, + "step": 1734 + }, + { + "epoch": 0.3095174382303095, + "grad_norm": 0.5852141976356506, + "learning_rate": 0.0004710249508834192, + "loss": 1.3095, + "step": 1735 + }, + { + "epoch": 0.30969583444830967, + "grad_norm": 3.471550464630127, + "learning_rate": 0.00047099220223199444, + "loss": 1.2149, + "step": 1736 + }, + { + "epoch": 0.30987423066630987, + "grad_norm": 4.984081268310547, + "learning_rate": 0.0004709594362239371, + "loss": 1.286, + "step": 1737 + }, + { + "epoch": 0.31005262688431007, + "grad_norm": 0.8435764908790588, + "learning_rate": 0.0004709266528618208, + "loss": 0.8668, + "step": 1738 + }, + { + "epoch": 0.3102310231023102, + "grad_norm": 0.6224226355552673, + "learning_rate": 0.0004708938521482202, + "loss": 1.2869, + "step": 1739 + }, + { + "epoch": 0.3104094193203104, + "grad_norm": 0.569222092628479, + "learning_rate": 0.00047086103408571157, + "loss": 1.2534, + "step": 1740 + }, + { + "epoch": 0.3105878155383106, + "grad_norm": 5.230443000793457, + "learning_rate": 0.0004708281986768724, + "loss": 1.1547, + "step": 1741 + }, + { + "epoch": 0.31076621175631075, + "grad_norm": 0.726719319820404, + "learning_rate": 0.00047079534592428144, + "loss": 1.1936, + "step": 1742 + }, + { + "epoch": 0.31094460797431095, + "grad_norm": 0.794158399105072, + "learning_rate": 0.0004707624758305191, + "loss": 1.0309, + "step": 1743 + }, + { + "epoch": 0.31112300419231115, + "grad_norm": 0.580272376537323, + "learning_rate": 0.0004707295883981668, + "loss": 1.0352, + "step": 1744 + }, + { + "epoch": 0.3113014004103113, + "grad_norm": 0.6194749474525452, + "learning_rate": 0.00047069668362980776, + "loss": 1.2419, + "step": 1745 + }, + { + "epoch": 0.3114797966283115, + "grad_norm": 0.6249500513076782, + "learning_rate": 0.0004706637615280261, + "loss": 1.0573, + "step": 1746 + }, + { + "epoch": 0.31165819284631163, + "grad_norm": 0.6039573550224304, + "learning_rate": 0.0004706308220954075, + "loss": 1.0936, + "step": 1747 + }, + { + "epoch": 0.31183658906431183, + "grad_norm": 0.570054829120636, + "learning_rate": 0.00047059786533453917, + "loss": 0.9585, + "step": 1748 + }, + { + "epoch": 0.31201498528231203, + "grad_norm": 0.5929118394851685, + "learning_rate": 0.0004705648912480094, + "loss": 0.9652, + "step": 1749 + }, + { + "epoch": 0.3121933815003122, + "grad_norm": 0.6323100328445435, + "learning_rate": 0.000470531899838408, + "loss": 1.3066, + "step": 1750 + }, + { + "epoch": 0.31237177771831237, + "grad_norm": 0.5521016120910645, + "learning_rate": 0.00047049889110832604, + "loss": 1.082, + "step": 1751 + }, + { + "epoch": 0.31255017393631257, + "grad_norm": 0.5617358684539795, + "learning_rate": 0.00047046586506035613, + "loss": 1.2556, + "step": 1752 + }, + { + "epoch": 0.3127285701543127, + "grad_norm": 1.5406019687652588, + "learning_rate": 0.000470432821697092, + "loss": 1.303, + "step": 1753 + }, + { + "epoch": 0.3129069663723129, + "grad_norm": 0.5704224109649658, + "learning_rate": 0.0004703997610211289, + "loss": 1.0045, + "step": 1754 + }, + { + "epoch": 0.3130853625903131, + "grad_norm": 0.5109691023826599, + "learning_rate": 0.00047036668303506346, + "loss": 0.9764, + "step": 1755 + }, + { + "epoch": 0.31326375880831325, + "grad_norm": 0.5933413505554199, + "learning_rate": 0.0004703335877414936, + "loss": 1.1017, + "step": 1756 + }, + { + "epoch": 0.31344215502631345, + "grad_norm": 0.6716773509979248, + "learning_rate": 0.0004703004751430185, + "loss": 1.3598, + "step": 1757 + }, + { + "epoch": 0.3136205512443136, + "grad_norm": 0.5351828932762146, + "learning_rate": 0.0004702673452422389, + "loss": 0.9856, + "step": 1758 + }, + { + "epoch": 0.3137989474623138, + "grad_norm": 0.6115684509277344, + "learning_rate": 0.0004702341980417568, + "loss": 1.1048, + "step": 1759 + }, + { + "epoch": 0.313977343680314, + "grad_norm": 0.5649376511573792, + "learning_rate": 0.00047020103354417553, + "loss": 1.3311, + "step": 1760 + }, + { + "epoch": 0.31415573989831413, + "grad_norm": 0.5363413095474243, + "learning_rate": 0.0004701678517520999, + "loss": 1.1215, + "step": 1761 + }, + { + "epoch": 0.31433413611631433, + "grad_norm": 0.5385435223579407, + "learning_rate": 0.00047013465266813594, + "loss": 1.0061, + "step": 1762 + }, + { + "epoch": 0.31451253233431453, + "grad_norm": 0.6086006760597229, + "learning_rate": 0.0004701014362948911, + "loss": 1.2379, + "step": 1763 + }, + { + "epoch": 0.3146909285523147, + "grad_norm": 0.49953147768974304, + "learning_rate": 0.0004700682026349741, + "loss": 1.0193, + "step": 1764 + }, + { + "epoch": 0.3148693247703149, + "grad_norm": 0.5452679395675659, + "learning_rate": 0.0004700349516909952, + "loss": 1.2415, + "step": 1765 + }, + { + "epoch": 0.31504772098831507, + "grad_norm": 0.5797132253646851, + "learning_rate": 0.00047000168346556594, + "loss": 1.3095, + "step": 1766 + }, + { + "epoch": 0.3152261172063152, + "grad_norm": 0.586613655090332, + "learning_rate": 0.00046996839796129903, + "loss": 1.0955, + "step": 1767 + }, + { + "epoch": 0.3154045134243154, + "grad_norm": 0.5819451808929443, + "learning_rate": 0.0004699350951808089, + "loss": 1.2054, + "step": 1768 + }, + { + "epoch": 0.31558290964231556, + "grad_norm": 0.6394988298416138, + "learning_rate": 0.00046990177512671097, + "loss": 1.3011, + "step": 1769 + }, + { + "epoch": 0.31576130586031576, + "grad_norm": 0.5528479218482971, + "learning_rate": 0.00046986843780162223, + "loss": 1.3098, + "step": 1770 + }, + { + "epoch": 0.31593970207831595, + "grad_norm": 0.5443571209907532, + "learning_rate": 0.0004698350832081611, + "loss": 1.3117, + "step": 1771 + }, + { + "epoch": 0.3161180982963161, + "grad_norm": 0.4671507477760315, + "learning_rate": 0.00046980171134894714, + "loss": 1.0019, + "step": 1772 + }, + { + "epoch": 0.3162964945143163, + "grad_norm": 0.5098404884338379, + "learning_rate": 0.00046976832222660127, + "loss": 0.9793, + "step": 1773 + }, + { + "epoch": 0.3164748907323165, + "grad_norm": 0.7493408918380737, + "learning_rate": 0.000469734915843746, + "loss": 1.1057, + "step": 1774 + }, + { + "epoch": 0.31665328695031664, + "grad_norm": 0.5108188986778259, + "learning_rate": 0.00046970149220300496, + "loss": 1.0821, + "step": 1775 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.4858883321285248, + "learning_rate": 0.0004696680513070033, + "loss": 0.9252, + "step": 1776 + }, + { + "epoch": 0.31701007938631703, + "grad_norm": 0.5160908102989197, + "learning_rate": 0.00046963459315836744, + "loss": 1.2089, + "step": 1777 + }, + { + "epoch": 0.3171884756043172, + "grad_norm": 0.5474086403846741, + "learning_rate": 0.0004696011177597251, + "loss": 0.9082, + "step": 1778 + }, + { + "epoch": 0.3173668718223174, + "grad_norm": 0.5507884621620178, + "learning_rate": 0.0004695676251137055, + "loss": 1.2827, + "step": 1779 + }, + { + "epoch": 0.3175452680403175, + "grad_norm": 0.5097134113311768, + "learning_rate": 0.0004695341152229391, + "loss": 0.9908, + "step": 1780 + }, + { + "epoch": 0.3177236642583177, + "grad_norm": 0.5379379391670227, + "learning_rate": 0.00046950058809005775, + "loss": 1.1212, + "step": 1781 + }, + { + "epoch": 0.3179020604763179, + "grad_norm": 0.5245591998100281, + "learning_rate": 0.00046946704371769466, + "loss": 1.0469, + "step": 1782 + }, + { + "epoch": 0.31808045669431806, + "grad_norm": 0.5721558928489685, + "learning_rate": 0.0004694334821084845, + "loss": 1.5169, + "step": 1783 + }, + { + "epoch": 0.31825885291231826, + "grad_norm": 0.4801085889339447, + "learning_rate": 0.00046939990326506296, + "loss": 0.9409, + "step": 1784 + }, + { + "epoch": 0.31843724913031846, + "grad_norm": 0.5646408796310425, + "learning_rate": 0.0004693663071900675, + "loss": 1.1511, + "step": 1785 + }, + { + "epoch": 0.3186156453483186, + "grad_norm": 0.6158486008644104, + "learning_rate": 0.00046933269388613663, + "loss": 1.3416, + "step": 1786 + }, + { + "epoch": 0.3187940415663188, + "grad_norm": 0.5548638701438904, + "learning_rate": 0.0004692990633559104, + "loss": 1.0945, + "step": 1787 + }, + { + "epoch": 0.318972437784319, + "grad_norm": 0.5320894718170166, + "learning_rate": 0.0004692654156020302, + "loss": 1.1061, + "step": 1788 + }, + { + "epoch": 0.31915083400231914, + "grad_norm": 0.5542237162590027, + "learning_rate": 0.0004692317506271385, + "loss": 1.2178, + "step": 1789 + }, + { + "epoch": 0.31932923022031934, + "grad_norm": 0.49565958976745605, + "learning_rate": 0.0004691980684338795, + "loss": 0.9038, + "step": 1790 + }, + { + "epoch": 0.3195076264383195, + "grad_norm": 0.4972078502178192, + "learning_rate": 0.00046916436902489847, + "loss": 1.0761, + "step": 1791 + }, + { + "epoch": 0.3196860226563197, + "grad_norm": 0.5097312927246094, + "learning_rate": 0.00046913065240284226, + "loss": 0.9506, + "step": 1792 + }, + { + "epoch": 0.3198644188743199, + "grad_norm": 0.5239170789718628, + "learning_rate": 0.00046909691857035884, + "loss": 1.1015, + "step": 1793 + }, + { + "epoch": 0.32004281509232, + "grad_norm": 0.5465912818908691, + "learning_rate": 0.0004690631675300978, + "loss": 1.04, + "step": 1794 + }, + { + "epoch": 0.3202212113103202, + "grad_norm": 0.5346068143844604, + "learning_rate": 0.00046902939928470977, + "loss": 1.0701, + "step": 1795 + }, + { + "epoch": 0.3203996075283204, + "grad_norm": 0.5574382543563843, + "learning_rate": 0.00046899561383684707, + "loss": 1.3175, + "step": 1796 + }, + { + "epoch": 0.32057800374632056, + "grad_norm": 0.5112574100494385, + "learning_rate": 0.00046896181118916304, + "loss": 0.9372, + "step": 1797 + }, + { + "epoch": 0.32075639996432076, + "grad_norm": 0.5132248997688293, + "learning_rate": 0.00046892799134431263, + "loss": 1.0925, + "step": 1798 + }, + { + "epoch": 0.32093479618232096, + "grad_norm": 0.5472792983055115, + "learning_rate": 0.000468894154304952, + "loss": 0.8669, + "step": 1799 + }, + { + "epoch": 0.3211131924003211, + "grad_norm": 0.5460713505744934, + "learning_rate": 0.0004688603000737386, + "loss": 0.9067, + "step": 1800 + }, + { + "epoch": 0.3212915886183213, + "grad_norm": 0.5631927847862244, + "learning_rate": 0.00046882642865333146, + "loss": 1.0512, + "step": 1801 + }, + { + "epoch": 0.32146998483632144, + "grad_norm": 0.5665701627731323, + "learning_rate": 0.00046879254004639085, + "loss": 1.1049, + "step": 1802 + }, + { + "epoch": 0.32164838105432164, + "grad_norm": 0.6018973588943481, + "learning_rate": 0.00046875863425557823, + "loss": 1.1779, + "step": 1803 + }, + { + "epoch": 0.32182677727232184, + "grad_norm": 0.5643223524093628, + "learning_rate": 0.00046872471128355664, + "loss": 1.3245, + "step": 1804 + }, + { + "epoch": 0.322005173490322, + "grad_norm": 0.5305078625679016, + "learning_rate": 0.0004686907711329903, + "loss": 1.1447, + "step": 1805 + }, + { + "epoch": 0.3221835697083222, + "grad_norm": 0.5465999245643616, + "learning_rate": 0.000468656813806545, + "loss": 1.1083, + "step": 1806 + }, + { + "epoch": 0.3223619659263224, + "grad_norm": 0.5518834590911865, + "learning_rate": 0.0004686228393068875, + "loss": 1.4034, + "step": 1807 + }, + { + "epoch": 0.3225403621443225, + "grad_norm": 0.5887888669967651, + "learning_rate": 0.0004685888476366864, + "loss": 1.2044, + "step": 1808 + }, + { + "epoch": 0.3227187583623227, + "grad_norm": 0.5048173069953918, + "learning_rate": 0.0004685548387986113, + "loss": 1.0136, + "step": 1809 + }, + { + "epoch": 0.3228971545803229, + "grad_norm": 0.5357009172439575, + "learning_rate": 0.0004685208127953331, + "loss": 1.1239, + "step": 1810 + }, + { + "epoch": 0.32307555079832306, + "grad_norm": 0.4944056272506714, + "learning_rate": 0.00046848676962952434, + "loss": 1.1098, + "step": 1811 + }, + { + "epoch": 0.32325394701632326, + "grad_norm": 0.5399764776229858, + "learning_rate": 0.00046845270930385876, + "loss": 1.2677, + "step": 1812 + }, + { + "epoch": 0.3234323432343234, + "grad_norm": 0.541485607624054, + "learning_rate": 0.0004684186318210114, + "loss": 0.9894, + "step": 1813 + }, + { + "epoch": 0.3236107394523236, + "grad_norm": 0.4978035092353821, + "learning_rate": 0.0004683845371836586, + "loss": 1.0137, + "step": 1814 + }, + { + "epoch": 0.3237891356703238, + "grad_norm": 0.560565710067749, + "learning_rate": 0.0004683504253944783, + "loss": 1.1744, + "step": 1815 + }, + { + "epoch": 0.32396753188832395, + "grad_norm": 0.6290241479873657, + "learning_rate": 0.00046831629645614954, + "loss": 1.3022, + "step": 1816 + }, + { + "epoch": 0.32414592810632414, + "grad_norm": 0.5639137029647827, + "learning_rate": 0.0004682821503713528, + "loss": 1.3467, + "step": 1817 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.9618589282035828, + "learning_rate": 0.0004682479871427699, + "loss": 1.1614, + "step": 1818 + }, + { + "epoch": 0.3245027205423245, + "grad_norm": 0.5054482817649841, + "learning_rate": 0.000468213806773084, + "loss": 1.0439, + "step": 1819 + }, + { + "epoch": 0.3246811167603247, + "grad_norm": 0.604265570640564, + "learning_rate": 0.00046817960926497966, + "loss": 1.2124, + "step": 1820 + }, + { + "epoch": 0.3248595129783249, + "grad_norm": 0.6163999438285828, + "learning_rate": 0.00046814539462114267, + "loss": 1.3959, + "step": 1821 + }, + { + "epoch": 0.325037909196325, + "grad_norm": 0.5077233910560608, + "learning_rate": 0.00046811116284426027, + "loss": 0.9646, + "step": 1822 + }, + { + "epoch": 0.3252163054143252, + "grad_norm": 1.0014747381210327, + "learning_rate": 0.000468076913937021, + "loss": 1.2883, + "step": 1823 + }, + { + "epoch": 0.32539470163232537, + "grad_norm": 0.5301506519317627, + "learning_rate": 0.0004680426479021147, + "loss": 1.2094, + "step": 1824 + }, + { + "epoch": 0.32557309785032557, + "grad_norm": 0.4948747456073761, + "learning_rate": 0.00046800836474223274, + "loss": 0.9282, + "step": 1825 + }, + { + "epoch": 0.32575149406832576, + "grad_norm": 0.5325037837028503, + "learning_rate": 0.0004679740644600676, + "loss": 1.316, + "step": 1826 + }, + { + "epoch": 0.3259298902863259, + "grad_norm": 1.2007477283477783, + "learning_rate": 0.0004679397470583133, + "loss": 1.1872, + "step": 1827 + }, + { + "epoch": 0.3261082865043261, + "grad_norm": 0.5436804294586182, + "learning_rate": 0.00046790541253966493, + "loss": 1.32, + "step": 1828 + }, + { + "epoch": 0.3262866827223263, + "grad_norm": 1.843799114227295, + "learning_rate": 0.0004678710609068193, + "loss": 1.0474, + "step": 1829 + }, + { + "epoch": 0.32646507894032645, + "grad_norm": 0.5451673269271851, + "learning_rate": 0.0004678366921624743, + "loss": 0.938, + "step": 1830 + }, + { + "epoch": 0.32664347515832665, + "grad_norm": 0.6391348242759705, + "learning_rate": 0.0004678023063093293, + "loss": 1.0066, + "step": 1831 + }, + { + "epoch": 0.32682187137632684, + "grad_norm": 0.8246572613716125, + "learning_rate": 0.0004677679033500848, + "loss": 1.2115, + "step": 1832 + }, + { + "epoch": 0.327000267594327, + "grad_norm": 0.5913776159286499, + "learning_rate": 0.0004677334832874429, + "loss": 1.0055, + "step": 1833 + }, + { + "epoch": 0.3271786638123272, + "grad_norm": 0.7291193008422852, + "learning_rate": 0.00046769904612410694, + "loss": 1.2235, + "step": 1834 + }, + { + "epoch": 0.3273570600303274, + "grad_norm": 0.5839000940322876, + "learning_rate": 0.0004676645918627816, + "loss": 1.0966, + "step": 1835 + }, + { + "epoch": 0.32753545624832753, + "grad_norm": 0.6921905875205994, + "learning_rate": 0.0004676301205061728, + "loss": 1.0023, + "step": 1836 + }, + { + "epoch": 0.3277138524663277, + "grad_norm": 0.5414957404136658, + "learning_rate": 0.0004675956320569881, + "loss": 1.0105, + "step": 1837 + }, + { + "epoch": 0.32789224868432787, + "grad_norm": 0.6554968357086182, + "learning_rate": 0.000467561126517936, + "loss": 1.3459, + "step": 1838 + }, + { + "epoch": 0.32807064490232807, + "grad_norm": 0.6246145367622375, + "learning_rate": 0.00046752660389172673, + "loss": 1.2114, + "step": 1839 + }, + { + "epoch": 0.32824904112032827, + "grad_norm": 0.5389582514762878, + "learning_rate": 0.0004674920641810716, + "loss": 0.8482, + "step": 1840 + }, + { + "epoch": 0.3284274373383284, + "grad_norm": 0.5214718580245972, + "learning_rate": 0.0004674575073886833, + "loss": 0.8899, + "step": 1841 + }, + { + "epoch": 0.3286058335563286, + "grad_norm": 0.6277864575386047, + "learning_rate": 0.00046742293351727596, + "loss": 1.1186, + "step": 1842 + }, + { + "epoch": 0.3287842297743288, + "grad_norm": 0.4919874668121338, + "learning_rate": 0.00046738834256956495, + "loss": 0.8872, + "step": 1843 + }, + { + "epoch": 0.32896262599232895, + "grad_norm": 0.545361340045929, + "learning_rate": 0.00046735373454826715, + "loss": 1.0465, + "step": 1844 + }, + { + "epoch": 0.32914102221032915, + "grad_norm": 1.7328273057937622, + "learning_rate": 0.00046731910945610044, + "loss": 1.1124, + "step": 1845 + }, + { + "epoch": 0.32931941842832935, + "grad_norm": 0.6040815711021423, + "learning_rate": 0.0004672844672957846, + "loss": 1.1342, + "step": 1846 + }, + { + "epoch": 0.3294978146463295, + "grad_norm": 0.4832199513912201, + "learning_rate": 0.00046724980807004, + "loss": 0.825, + "step": 1847 + }, + { + "epoch": 0.3296762108643297, + "grad_norm": 0.7503164410591125, + "learning_rate": 0.00046721513178158905, + "loss": 0.9456, + "step": 1848 + }, + { + "epoch": 0.32985460708232983, + "grad_norm": 0.5634304881095886, + "learning_rate": 0.0004671804384331551, + "loss": 1.1604, + "step": 1849 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.5407414436340332, + "learning_rate": 0.00046714572802746305, + "loss": 1.0941, + "step": 1850 + }, + { + "epoch": 0.33021139951833023, + "grad_norm": 0.5838576555252075, + "learning_rate": 0.0004671110005672389, + "loss": 1.3654, + "step": 1851 + }, + { + "epoch": 0.33038979573633037, + "grad_norm": 0.5957556962966919, + "learning_rate": 0.00046707625605521016, + "loss": 1.153, + "step": 1852 + }, + { + "epoch": 0.33056819195433057, + "grad_norm": 0.5474072098731995, + "learning_rate": 0.0004670414944941057, + "loss": 1.1032, + "step": 1853 + }, + { + "epoch": 0.33074658817233077, + "grad_norm": 0.48801934719085693, + "learning_rate": 0.00046700671588665574, + "loss": 0.8788, + "step": 1854 + }, + { + "epoch": 0.3309249843903309, + "grad_norm": 0.6753059029579163, + "learning_rate": 0.00046697192023559167, + "loss": 1.2628, + "step": 1855 + }, + { + "epoch": 0.3311033806083311, + "grad_norm": 0.5072078108787537, + "learning_rate": 0.00046693710754364625, + "loss": 0.9892, + "step": 1856 + }, + { + "epoch": 0.3312817768263313, + "grad_norm": 0.6533268690109253, + "learning_rate": 0.0004669022778135539, + "loss": 1.1148, + "step": 1857 + }, + { + "epoch": 0.33146017304433145, + "grad_norm": 0.5230735540390015, + "learning_rate": 0.0004668674310480499, + "loss": 0.8552, + "step": 1858 + }, + { + "epoch": 0.33163856926233165, + "grad_norm": 0.5872694253921509, + "learning_rate": 0.00046683256724987124, + "loss": 1.1711, + "step": 1859 + }, + { + "epoch": 0.3318169654803318, + "grad_norm": 0.6011821627616882, + "learning_rate": 0.000466797686421756, + "loss": 1.1871, + "step": 1860 + }, + { + "epoch": 0.331995361698332, + "grad_norm": 0.643999457359314, + "learning_rate": 0.0004667627885664438, + "loss": 1.0446, + "step": 1861 + }, + { + "epoch": 0.3321737579163322, + "grad_norm": 0.5912252068519592, + "learning_rate": 0.00046672787368667553, + "loss": 1.1304, + "step": 1862 + }, + { + "epoch": 0.33235215413433233, + "grad_norm": 0.7412044405937195, + "learning_rate": 0.0004666929417851933, + "loss": 1.3677, + "step": 1863 + }, + { + "epoch": 0.33253055035233253, + "grad_norm": 0.5437285900115967, + "learning_rate": 0.0004666579928647406, + "loss": 1.039, + "step": 1864 + }, + { + "epoch": 0.33270894657033273, + "grad_norm": 0.5716599822044373, + "learning_rate": 0.0004666230269280625, + "loss": 1.1614, + "step": 1865 + }, + { + "epoch": 0.3328873427883329, + "grad_norm": 0.48691585659980774, + "learning_rate": 0.00046658804397790496, + "loss": 0.9387, + "step": 1866 + }, + { + "epoch": 0.3330657390063331, + "grad_norm": 0.5562944412231445, + "learning_rate": 0.00046655304401701565, + "loss": 1.1568, + "step": 1867 + }, + { + "epoch": 0.33324413522433327, + "grad_norm": 0.5540555119514465, + "learning_rate": 0.0004665180270481435, + "loss": 1.283, + "step": 1868 + }, + { + "epoch": 0.3334225314423334, + "grad_norm": 0.44283947348594666, + "learning_rate": 0.0004664829930740387, + "loss": 0.9378, + "step": 1869 + }, + { + "epoch": 0.3336009276603336, + "grad_norm": 0.7194570899009705, + "learning_rate": 0.00046644794209745277, + "loss": 1.3644, + "step": 1870 + }, + { + "epoch": 0.33377932387833376, + "grad_norm": 0.5517498254776001, + "learning_rate": 0.00046641287412113857, + "loss": 1.2408, + "step": 1871 + }, + { + "epoch": 0.33395772009633395, + "grad_norm": 0.504384458065033, + "learning_rate": 0.00046637778914785044, + "loss": 0.9018, + "step": 1872 + }, + { + "epoch": 0.33413611631433415, + "grad_norm": 1.1955726146697998, + "learning_rate": 0.0004663426871803438, + "loss": 0.9939, + "step": 1873 + }, + { + "epoch": 0.3343145125323343, + "grad_norm": 0.5495529174804688, + "learning_rate": 0.00046630756822137567, + "loss": 1.2006, + "step": 1874 + }, + { + "epoch": 0.3344929087503345, + "grad_norm": 0.5532088875770569, + "learning_rate": 0.00046627243227370413, + "loss": 1.0194, + "step": 1875 + }, + { + "epoch": 0.3346713049683347, + "grad_norm": 0.4987606406211853, + "learning_rate": 0.00046623727934008886, + "loss": 0.9675, + "step": 1876 + }, + { + "epoch": 0.33484970118633484, + "grad_norm": 0.6175581216812134, + "learning_rate": 0.0004662021094232908, + "loss": 1.1119, + "step": 1877 + }, + { + "epoch": 0.33502809740433503, + "grad_norm": 0.5381768345832825, + "learning_rate": 0.00046616692252607204, + "loss": 1.0028, + "step": 1878 + }, + { + "epoch": 0.33520649362233523, + "grad_norm": 0.5099619030952454, + "learning_rate": 0.00046613171865119617, + "loss": 0.9536, + "step": 1879 + }, + { + "epoch": 0.3353848898403354, + "grad_norm": 0.5081475377082825, + "learning_rate": 0.0004660964978014282, + "loss": 1.0749, + "step": 1880 + }, + { + "epoch": 0.3355632860583356, + "grad_norm": 0.5307644009590149, + "learning_rate": 0.00046606125997953425, + "loss": 1.3082, + "step": 1881 + }, + { + "epoch": 0.3357416822763357, + "grad_norm": 0.5692281126976013, + "learning_rate": 0.0004660260051882819, + "loss": 1.2991, + "step": 1882 + }, + { + "epoch": 0.3359200784943359, + "grad_norm": 0.5353856682777405, + "learning_rate": 0.00046599073343044014, + "loss": 1.0892, + "step": 1883 + }, + { + "epoch": 0.3360984747123361, + "grad_norm": 0.5255775451660156, + "learning_rate": 0.000465955444708779, + "loss": 1.2572, + "step": 1884 + }, + { + "epoch": 0.33627687093033626, + "grad_norm": 0.5284912586212158, + "learning_rate": 0.0004659201390260703, + "loss": 1.1411, + "step": 1885 + }, + { + "epoch": 0.33645526714833646, + "grad_norm": 0.5372970700263977, + "learning_rate": 0.0004658848163850867, + "loss": 0.9713, + "step": 1886 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.4853118062019348, + "learning_rate": 0.00046584947678860264, + "loss": 0.839, + "step": 1887 + }, + { + "epoch": 0.3368120595843368, + "grad_norm": 0.543500542640686, + "learning_rate": 0.0004658141202393935, + "loss": 1.2338, + "step": 1888 + }, + { + "epoch": 0.336990455802337, + "grad_norm": 0.5506289005279541, + "learning_rate": 0.00046577874674023624, + "loss": 0.9966, + "step": 1889 + }, + { + "epoch": 0.3371688520203372, + "grad_norm": 0.81171053647995, + "learning_rate": 0.00046574335629390894, + "loss": 0.9698, + "step": 1890 + }, + { + "epoch": 0.33734724823833734, + "grad_norm": 0.5851067304611206, + "learning_rate": 0.0004657079489031915, + "loss": 1.2608, + "step": 1891 + }, + { + "epoch": 0.33752564445633754, + "grad_norm": 0.5415493845939636, + "learning_rate": 0.0004656725245708644, + "loss": 1.2209, + "step": 1892 + }, + { + "epoch": 0.3377040406743377, + "grad_norm": 1.5110948085784912, + "learning_rate": 0.0004656370832997101, + "loss": 1.1676, + "step": 1893 + }, + { + "epoch": 0.3378824368923379, + "grad_norm": 0.4858531951904297, + "learning_rate": 0.00046560162509251204, + "loss": 0.8358, + "step": 1894 + }, + { + "epoch": 0.3380608331103381, + "grad_norm": 0.5556557774543762, + "learning_rate": 0.00046556614995205516, + "loss": 1.1761, + "step": 1895 + }, + { + "epoch": 0.3382392293283382, + "grad_norm": 0.5492879748344421, + "learning_rate": 0.00046553065788112563, + "loss": 1.0874, + "step": 1896 + }, + { + "epoch": 0.3384176255463384, + "grad_norm": 0.6402779817581177, + "learning_rate": 0.000465495148882511, + "loss": 1.186, + "step": 1897 + }, + { + "epoch": 0.3385960217643386, + "grad_norm": 0.5240576863288879, + "learning_rate": 0.0004654596229590001, + "loss": 1.0237, + "step": 1898 + }, + { + "epoch": 0.33877441798233876, + "grad_norm": 0.5123072862625122, + "learning_rate": 0.0004654240801133831, + "loss": 1.2273, + "step": 1899 + }, + { + "epoch": 0.33895281420033896, + "grad_norm": 0.5299893617630005, + "learning_rate": 0.0004653885203484515, + "loss": 1.009, + "step": 1900 + }, + { + "epoch": 0.33913121041833916, + "grad_norm": 0.548248291015625, + "learning_rate": 0.0004653529436669983, + "loss": 1.0276, + "step": 1901 + }, + { + "epoch": 0.3393096066363393, + "grad_norm": 0.5219201445579529, + "learning_rate": 0.00046531735007181754, + "loss": 1.1695, + "step": 1902 + }, + { + "epoch": 0.3394880028543395, + "grad_norm": 0.4921947419643402, + "learning_rate": 0.00046528173956570474, + "loss": 0.9397, + "step": 1903 + }, + { + "epoch": 0.33966639907233964, + "grad_norm": 0.6388809680938721, + "learning_rate": 0.00046524611215145685, + "loss": 1.3252, + "step": 1904 + }, + { + "epoch": 0.33984479529033984, + "grad_norm": 0.5055054426193237, + "learning_rate": 0.0004652104678318718, + "loss": 1.0557, + "step": 1905 + }, + { + "epoch": 0.34002319150834004, + "grad_norm": 0.5897945761680603, + "learning_rate": 0.0004651748066097493, + "loss": 1.3774, + "step": 1906 + }, + { + "epoch": 0.3402015877263402, + "grad_norm": 0.4601035714149475, + "learning_rate": 0.00046513912848789013, + "loss": 0.8346, + "step": 1907 + }, + { + "epoch": 0.3403799839443404, + "grad_norm": 0.515330970287323, + "learning_rate": 0.00046510343346909636, + "loss": 1.0378, + "step": 1908 + }, + { + "epoch": 0.3405583801623406, + "grad_norm": 0.5221341252326965, + "learning_rate": 0.0004650677215561714, + "loss": 1.2198, + "step": 1909 + }, + { + "epoch": 0.3407367763803407, + "grad_norm": 0.6407569050788879, + "learning_rate": 0.00046503199275192025, + "loss": 1.2767, + "step": 1910 + }, + { + "epoch": 0.3409151725983409, + "grad_norm": 0.5762472748756409, + "learning_rate": 0.0004649962470591489, + "loss": 1.2465, + "step": 1911 + }, + { + "epoch": 0.3410935688163411, + "grad_norm": 0.514964759349823, + "learning_rate": 0.00046496048448066484, + "loss": 1.1349, + "step": 1912 + }, + { + "epoch": 0.34127196503434126, + "grad_norm": 0.482735276222229, + "learning_rate": 0.00046492470501927676, + "loss": 0.9251, + "step": 1913 + }, + { + "epoch": 0.34145036125234146, + "grad_norm": 0.49272486567497253, + "learning_rate": 0.0004648889086777949, + "loss": 0.8477, + "step": 1914 + }, + { + "epoch": 0.3416287574703416, + "grad_norm": 0.5680924654006958, + "learning_rate": 0.00046485309545903064, + "loss": 1.2822, + "step": 1915 + }, + { + "epoch": 0.3418071536883418, + "grad_norm": 0.7367702722549438, + "learning_rate": 0.00046481726536579674, + "loss": 1.1015, + "step": 1916 + }, + { + "epoch": 0.341985549906342, + "grad_norm": 0.8924338817596436, + "learning_rate": 0.0004647814184009072, + "loss": 1.113, + "step": 1917 + }, + { + "epoch": 0.34216394612434214, + "grad_norm": 0.48184382915496826, + "learning_rate": 0.00046474555456717747, + "loss": 0.9971, + "step": 1918 + }, + { + "epoch": 0.34234234234234234, + "grad_norm": 0.6294741034507751, + "learning_rate": 0.0004647096738674243, + "loss": 1.1922, + "step": 1919 + }, + { + "epoch": 0.34252073856034254, + "grad_norm": 0.5504223108291626, + "learning_rate": 0.0004646737763044658, + "loss": 1.0714, + "step": 1920 + }, + { + "epoch": 0.3426991347783427, + "grad_norm": 0.6702556610107422, + "learning_rate": 0.0004646378618811212, + "loss": 1.2097, + "step": 1921 + }, + { + "epoch": 0.3428775309963429, + "grad_norm": 0.5340100526809692, + "learning_rate": 0.0004646019306002114, + "loss": 1.2057, + "step": 1922 + }, + { + "epoch": 0.3430559272143431, + "grad_norm": 0.49807506799697876, + "learning_rate": 0.0004645659824645583, + "loss": 0.8643, + "step": 1923 + }, + { + "epoch": 0.3432343234323432, + "grad_norm": 0.5424205660820007, + "learning_rate": 0.0004645300174769852, + "loss": 1.257, + "step": 1924 + }, + { + "epoch": 0.3434127196503434, + "grad_norm": 0.9090192317962646, + "learning_rate": 0.0004644940356403169, + "loss": 1.1699, + "step": 1925 + }, + { + "epoch": 0.34359111586834357, + "grad_norm": 0.6018331050872803, + "learning_rate": 0.0004644580369573793, + "loss": 1.1373, + "step": 1926 + }, + { + "epoch": 0.34376951208634376, + "grad_norm": 0.5179253220558167, + "learning_rate": 0.00046442202143099986, + "loss": 0.8974, + "step": 1927 + }, + { + "epoch": 0.34394790830434396, + "grad_norm": 0.5565230846405029, + "learning_rate": 0.00046438598906400707, + "loss": 1.0234, + "step": 1928 + }, + { + "epoch": 0.3441263045223441, + "grad_norm": 0.6058740019798279, + "learning_rate": 0.0004643499398592309, + "loss": 1.2254, + "step": 1929 + }, + { + "epoch": 0.3443047007403443, + "grad_norm": 0.6367807984352112, + "learning_rate": 0.0004643138738195028, + "loss": 0.9239, + "step": 1930 + }, + { + "epoch": 0.3444830969583445, + "grad_norm": 0.49235963821411133, + "learning_rate": 0.0004642777909476552, + "loss": 1.0013, + "step": 1931 + }, + { + "epoch": 0.34466149317634465, + "grad_norm": 0.5881189107894897, + "learning_rate": 0.00046424169124652216, + "loss": 1.2899, + "step": 1932 + }, + { + "epoch": 0.34483988939434485, + "grad_norm": 0.5489835143089294, + "learning_rate": 0.00046420557471893887, + "loss": 0.8001, + "step": 1933 + }, + { + "epoch": 0.34501828561234504, + "grad_norm": 0.5183538198471069, + "learning_rate": 0.0004641694413677419, + "loss": 0.9889, + "step": 1934 + }, + { + "epoch": 0.3451966818303452, + "grad_norm": 0.493448942899704, + "learning_rate": 0.0004641332911957692, + "loss": 0.9633, + "step": 1935 + }, + { + "epoch": 0.3453750780483454, + "grad_norm": 0.6520765423774719, + "learning_rate": 0.0004640971242058599, + "loss": 1.3168, + "step": 1936 + }, + { + "epoch": 0.34555347426634553, + "grad_norm": 0.5311633944511414, + "learning_rate": 0.00046406094040085465, + "loss": 1.1453, + "step": 1937 + }, + { + "epoch": 0.3457318704843457, + "grad_norm": 0.6454037427902222, + "learning_rate": 0.0004640247397835953, + "loss": 1.2844, + "step": 1938 + }, + { + "epoch": 0.3459102667023459, + "grad_norm": 0.5690452456474304, + "learning_rate": 0.00046398852235692494, + "loss": 1.0391, + "step": 1939 + }, + { + "epoch": 0.34608866292034607, + "grad_norm": 0.555842936038971, + "learning_rate": 0.0004639522881236881, + "loss": 1.3138, + "step": 1940 + }, + { + "epoch": 0.34626705913834627, + "grad_norm": 0.5336434841156006, + "learning_rate": 0.0004639160370867307, + "loss": 1.127, + "step": 1941 + }, + { + "epoch": 0.34644545535634647, + "grad_norm": 0.6537164449691772, + "learning_rate": 0.00046387976924889976, + "loss": 1.3808, + "step": 1942 + }, + { + "epoch": 0.3466238515743466, + "grad_norm": 0.5720651149749756, + "learning_rate": 0.00046384348461304373, + "loss": 1.4115, + "step": 1943 + }, + { + "epoch": 0.3468022477923468, + "grad_norm": 0.5567306280136108, + "learning_rate": 0.00046380718318201247, + "loss": 1.0522, + "step": 1944 + }, + { + "epoch": 0.346980644010347, + "grad_norm": 0.658480167388916, + "learning_rate": 0.00046377086495865707, + "loss": 1.1822, + "step": 1945 + }, + { + "epoch": 0.34715904022834715, + "grad_norm": 0.5562303066253662, + "learning_rate": 0.00046373452994582994, + "loss": 1.2199, + "step": 1946 + }, + { + "epoch": 0.34733743644634735, + "grad_norm": 0.585731029510498, + "learning_rate": 0.00046369817814638476, + "loss": 1.0057, + "step": 1947 + }, + { + "epoch": 0.3475158326643475, + "grad_norm": 0.5658949017524719, + "learning_rate": 0.0004636618095631767, + "loss": 1.1701, + "step": 1948 + }, + { + "epoch": 0.3476942288823477, + "grad_norm": 1.204848051071167, + "learning_rate": 0.00046362542419906195, + "loss": 1.1882, + "step": 1949 + }, + { + "epoch": 0.3478726251003479, + "grad_norm": 0.5047358274459839, + "learning_rate": 0.0004635890220568984, + "loss": 1.0479, + "step": 1950 + }, + { + "epoch": 0.34805102131834803, + "grad_norm": 0.6019084453582764, + "learning_rate": 0.0004635526031395449, + "loss": 1.2855, + "step": 1951 + }, + { + "epoch": 0.34822941753634823, + "grad_norm": 0.5449895858764648, + "learning_rate": 0.00046351616744986194, + "loss": 1.2736, + "step": 1952 + }, + { + "epoch": 0.3484078137543484, + "grad_norm": 0.5767678022384644, + "learning_rate": 0.00046347971499071104, + "loss": 1.0872, + "step": 1953 + }, + { + "epoch": 0.34858620997234857, + "grad_norm": 0.589439868927002, + "learning_rate": 0.0004634432457649552, + "loss": 1.1026, + "step": 1954 + }, + { + "epoch": 0.34876460619034877, + "grad_norm": 0.534095823764801, + "learning_rate": 0.00046340675977545867, + "loss": 1.1419, + "step": 1955 + }, + { + "epoch": 0.34894300240834897, + "grad_norm": 0.5593207478523254, + "learning_rate": 0.00046337025702508704, + "loss": 1.1605, + "step": 1956 + }, + { + "epoch": 0.3491213986263491, + "grad_norm": 0.5366277694702148, + "learning_rate": 0.00046333373751670734, + "loss": 1.039, + "step": 1957 + }, + { + "epoch": 0.3492997948443493, + "grad_norm": 0.5536123514175415, + "learning_rate": 0.00046329720125318766, + "loss": 1.1518, + "step": 1958 + }, + { + "epoch": 0.34947819106234945, + "grad_norm": 0.4309159219264984, + "learning_rate": 0.0004632606482373976, + "loss": 0.8568, + "step": 1959 + }, + { + "epoch": 0.34965658728034965, + "grad_norm": 0.7334818243980408, + "learning_rate": 0.0004632240784722079, + "loss": 1.2216, + "step": 1960 + }, + { + "epoch": 0.34983498349834985, + "grad_norm": 0.5555823445320129, + "learning_rate": 0.0004631874919604909, + "loss": 1.1513, + "step": 1961 + }, + { + "epoch": 0.35001337971635, + "grad_norm": 0.5582665205001831, + "learning_rate": 0.00046315088870512003, + "loss": 1.096, + "step": 1962 + }, + { + "epoch": 0.3501917759343502, + "grad_norm": 0.5533860921859741, + "learning_rate": 0.00046311426870897, + "loss": 1.1806, + "step": 1963 + }, + { + "epoch": 0.3503701721523504, + "grad_norm": 0.6270779371261597, + "learning_rate": 0.00046307763197491717, + "loss": 1.1347, + "step": 1964 + }, + { + "epoch": 0.35054856837035053, + "grad_norm": 0.5797653794288635, + "learning_rate": 0.0004630409785058387, + "loss": 1.176, + "step": 1965 + }, + { + "epoch": 0.35072696458835073, + "grad_norm": 0.5614036917686462, + "learning_rate": 0.0004630043083046135, + "loss": 1.2459, + "step": 1966 + }, + { + "epoch": 0.35090536080635093, + "grad_norm": 0.5455766916275024, + "learning_rate": 0.0004629676213741216, + "loss": 1.2499, + "step": 1967 + }, + { + "epoch": 0.3510837570243511, + "grad_norm": 0.5755292177200317, + "learning_rate": 0.00046293091771724433, + "loss": 0.9262, + "step": 1968 + }, + { + "epoch": 0.35126215324235127, + "grad_norm": 0.5601824522018433, + "learning_rate": 0.00046289419733686445, + "loss": 1.2802, + "step": 1969 + }, + { + "epoch": 0.3514405494603514, + "grad_norm": 0.5256120562553406, + "learning_rate": 0.00046285746023586586, + "loss": 1.0818, + "step": 1970 + }, + { + "epoch": 0.3516189456783516, + "grad_norm": 0.5001375675201416, + "learning_rate": 0.000462820706417134, + "loss": 0.8263, + "step": 1971 + }, + { + "epoch": 0.3517973418963518, + "grad_norm": 0.6029314994812012, + "learning_rate": 0.0004627839358835554, + "loss": 1.1909, + "step": 1972 + }, + { + "epoch": 0.35197573811435195, + "grad_norm": 0.5109052658081055, + "learning_rate": 0.00046274714863801813, + "loss": 0.9954, + "step": 1973 + }, + { + "epoch": 0.35215413433235215, + "grad_norm": 0.6349180936813354, + "learning_rate": 0.0004627103446834113, + "loss": 1.1535, + "step": 1974 + }, + { + "epoch": 0.35233253055035235, + "grad_norm": 0.5841493606567383, + "learning_rate": 0.0004626735240226255, + "loss": 0.9702, + "step": 1975 + }, + { + "epoch": 0.3525109267683525, + "grad_norm": 0.5837200880050659, + "learning_rate": 0.00046263668665855276, + "loss": 1.2831, + "step": 1976 + }, + { + "epoch": 0.3526893229863527, + "grad_norm": 0.5340337157249451, + "learning_rate": 0.000462599832594086, + "loss": 1.0643, + "step": 1977 + }, + { + "epoch": 0.3528677192043529, + "grad_norm": 0.5065892338752747, + "learning_rate": 0.00046256296183212, + "loss": 0.9666, + "step": 1978 + }, + { + "epoch": 0.35304611542235304, + "grad_norm": 0.5460461378097534, + "learning_rate": 0.00046252607437555037, + "loss": 1.0391, + "step": 1979 + }, + { + "epoch": 0.35322451164035323, + "grad_norm": 0.5758165717124939, + "learning_rate": 0.0004624891702272744, + "loss": 1.0593, + "step": 1980 + }, + { + "epoch": 0.3534029078583534, + "grad_norm": 0.739166259765625, + "learning_rate": 0.0004624522493901904, + "loss": 1.0089, + "step": 1981 + }, + { + "epoch": 0.3535813040763536, + "grad_norm": 0.5259382724761963, + "learning_rate": 0.0004624153118671981, + "loss": 0.8894, + "step": 1982 + }, + { + "epoch": 0.3537597002943538, + "grad_norm": 0.47911664843559265, + "learning_rate": 0.00046237835766119867, + "loss": 0.8527, + "step": 1983 + }, + { + "epoch": 0.3539380965123539, + "grad_norm": 0.5415805578231812, + "learning_rate": 0.0004623413867750944, + "loss": 1.1951, + "step": 1984 + }, + { + "epoch": 0.3541164927303541, + "grad_norm": 0.5259605050086975, + "learning_rate": 0.00046230439921178903, + "loss": 1.1803, + "step": 1985 + }, + { + "epoch": 0.3542948889483543, + "grad_norm": 0.5450373291969299, + "learning_rate": 0.00046226739497418745, + "loss": 1.0821, + "step": 1986 + }, + { + "epoch": 0.35447328516635446, + "grad_norm": 0.5040337443351746, + "learning_rate": 0.0004622303740651961, + "loss": 1.0438, + "step": 1987 + }, + { + "epoch": 0.35465168138435466, + "grad_norm": 0.531369686126709, + "learning_rate": 0.00046219333648772246, + "loss": 1.0472, + "step": 1988 + }, + { + "epoch": 0.35483007760235485, + "grad_norm": 0.5246288180351257, + "learning_rate": 0.00046215628224467546, + "loss": 1.0166, + "step": 1989 + }, + { + "epoch": 0.355008473820355, + "grad_norm": 0.5832285284996033, + "learning_rate": 0.0004621192113389654, + "loss": 1.3904, + "step": 1990 + }, + { + "epoch": 0.3551868700383552, + "grad_norm": 0.5046032667160034, + "learning_rate": 0.0004620821237735037, + "loss": 0.9671, + "step": 1991 + }, + { + "epoch": 0.35536526625635534, + "grad_norm": 0.505187451839447, + "learning_rate": 0.00046204501955120333, + "loss": 1.1395, + "step": 1992 + }, + { + "epoch": 0.35554366247435554, + "grad_norm": 0.5264232158660889, + "learning_rate": 0.00046200789867497836, + "loss": 1.1737, + "step": 1993 + }, + { + "epoch": 0.35572205869235574, + "grad_norm": 0.6432974338531494, + "learning_rate": 0.00046197076114774425, + "loss": 1.4899, + "step": 1994 + }, + { + "epoch": 0.3559004549103559, + "grad_norm": 0.5268731713294983, + "learning_rate": 0.00046193360697241773, + "loss": 1.3145, + "step": 1995 + }, + { + "epoch": 0.3560788511283561, + "grad_norm": 0.5824031829833984, + "learning_rate": 0.000461896436151917, + "loss": 1.0268, + "step": 1996 + }, + { + "epoch": 0.3562572473463563, + "grad_norm": 0.5190456509590149, + "learning_rate": 0.0004618592486891613, + "loss": 0.9869, + "step": 1997 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.5111797451972961, + "learning_rate": 0.0004618220445870714, + "loss": 0.8515, + "step": 1998 + }, + { + "epoch": 0.3566140397823566, + "grad_norm": 0.6227758526802063, + "learning_rate": 0.0004617848238485693, + "loss": 1.2615, + "step": 1999 + }, + { + "epoch": 0.3567924360003568, + "grad_norm": 0.5635523200035095, + "learning_rate": 0.0004617475864765782, + "loss": 1.1813, + "step": 2000 + }, + { + "epoch": 0.35697083221835696, + "grad_norm": 0.6283045411109924, + "learning_rate": 0.0004617103324740228, + "loss": 1.0593, + "step": 2001 + }, + { + "epoch": 0.35714922843635716, + "grad_norm": 0.5376402735710144, + "learning_rate": 0.00046167306184382906, + "loss": 0.9929, + "step": 2002 + }, + { + "epoch": 0.3573276246543573, + "grad_norm": 0.5043478608131409, + "learning_rate": 0.00046163577458892415, + "loss": 1.1796, + "step": 2003 + }, + { + "epoch": 0.3575060208723575, + "grad_norm": 0.5074126720428467, + "learning_rate": 0.00046159847071223646, + "loss": 0.8408, + "step": 2004 + }, + { + "epoch": 0.3576844170903577, + "grad_norm": 0.6142756938934326, + "learning_rate": 0.00046156115021669605, + "loss": 1.159, + "step": 2005 + }, + { + "epoch": 0.35786281330835784, + "grad_norm": 0.4879589378833771, + "learning_rate": 0.00046152381310523384, + "loss": 0.8117, + "step": 2006 + }, + { + "epoch": 0.35804120952635804, + "grad_norm": 0.5088807940483093, + "learning_rate": 0.0004614864593807825, + "loss": 1.0544, + "step": 2007 + }, + { + "epoch": 0.35821960574435824, + "grad_norm": 0.5416921377182007, + "learning_rate": 0.00046144908904627557, + "loss": 1.1541, + "step": 2008 + }, + { + "epoch": 0.3583980019623584, + "grad_norm": 0.5295444130897522, + "learning_rate": 0.0004614117021046482, + "loss": 1.1063, + "step": 2009 + }, + { + "epoch": 0.3585763981803586, + "grad_norm": 0.5223750472068787, + "learning_rate": 0.0004613742985588367, + "loss": 1.0807, + "step": 2010 + }, + { + "epoch": 0.3587547943983588, + "grad_norm": 0.5627544522285461, + "learning_rate": 0.0004613368784117787, + "loss": 1.0165, + "step": 2011 + }, + { + "epoch": 0.3589331906163589, + "grad_norm": 0.5210590958595276, + "learning_rate": 0.0004612994416664134, + "loss": 1.0561, + "step": 2012 + }, + { + "epoch": 0.3591115868343591, + "grad_norm": 0.5011810660362244, + "learning_rate": 0.00046126198832568065, + "loss": 1.175, + "step": 2013 + }, + { + "epoch": 0.35928998305235926, + "grad_norm": 0.47756415605545044, + "learning_rate": 0.0004612245183925224, + "loss": 1.052, + "step": 2014 + }, + { + "epoch": 0.35946837927035946, + "grad_norm": 0.5775620937347412, + "learning_rate": 0.0004611870318698813, + "loss": 1.0646, + "step": 2015 + }, + { + "epoch": 0.35964677548835966, + "grad_norm": 0.5540309548377991, + "learning_rate": 0.00046114952876070167, + "loss": 1.2858, + "step": 2016 + }, + { + "epoch": 0.3598251717063598, + "grad_norm": 0.5066989660263062, + "learning_rate": 0.0004611120090679289, + "loss": 0.9978, + "step": 2017 + }, + { + "epoch": 0.36000356792436, + "grad_norm": 1.06368887424469, + "learning_rate": 0.00046107447279450977, + "loss": 1.2269, + "step": 2018 + }, + { + "epoch": 0.3601819641423602, + "grad_norm": 0.5219504833221436, + "learning_rate": 0.00046103691994339236, + "loss": 0.9725, + "step": 2019 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 0.5700012445449829, + "learning_rate": 0.00046099935051752605, + "loss": 1.2358, + "step": 2020 + }, + { + "epoch": 0.36053875657836054, + "grad_norm": 0.514971911907196, + "learning_rate": 0.0004609617645198616, + "loss": 1.0931, + "step": 2021 + }, + { + "epoch": 0.36071715279636074, + "grad_norm": 0.5299590229988098, + "learning_rate": 0.00046092416195335094, + "loss": 1.1742, + "step": 2022 + }, + { + "epoch": 0.3608955490143609, + "grad_norm": 0.5127617120742798, + "learning_rate": 0.00046088654282094735, + "loss": 1.0115, + "step": 2023 + }, + { + "epoch": 0.3610739452323611, + "grad_norm": 0.5403224229812622, + "learning_rate": 0.00046084890712560546, + "loss": 1.2271, + "step": 2024 + }, + { + "epoch": 0.3612523414503612, + "grad_norm": 0.5359017252922058, + "learning_rate": 0.0004608112548702811, + "loss": 1.1065, + "step": 2025 + }, + { + "epoch": 0.3614307376683614, + "grad_norm": 0.5404418110847473, + "learning_rate": 0.00046077358605793156, + "loss": 1.0741, + "step": 2026 + }, + { + "epoch": 0.3616091338863616, + "grad_norm": 0.541814386844635, + "learning_rate": 0.0004607359006915153, + "loss": 1.2468, + "step": 2027 + }, + { + "epoch": 0.36178753010436177, + "grad_norm": 0.5587732791900635, + "learning_rate": 0.000460698198773992, + "loss": 1.0027, + "step": 2028 + }, + { + "epoch": 0.36196592632236196, + "grad_norm": 0.5569438338279724, + "learning_rate": 0.00046066048030832284, + "loss": 1.045, + "step": 2029 + }, + { + "epoch": 0.36214432254036216, + "grad_norm": 0.5397847294807434, + "learning_rate": 0.0004606227452974703, + "loss": 1.1198, + "step": 2030 + }, + { + "epoch": 0.3623227187583623, + "grad_norm": 0.4841654598712921, + "learning_rate": 0.0004605849937443979, + "loss": 1.0022, + "step": 2031 + }, + { + "epoch": 0.3625011149763625, + "grad_norm": 0.5069999694824219, + "learning_rate": 0.00046054722565207077, + "loss": 1.1917, + "step": 2032 + }, + { + "epoch": 0.3626795111943627, + "grad_norm": 0.5341327786445618, + "learning_rate": 0.0004605094410234551, + "loss": 0.9297, + "step": 2033 + }, + { + "epoch": 0.36285790741236285, + "grad_norm": 0.4698069095611572, + "learning_rate": 0.00046047163986151854, + "loss": 0.7909, + "step": 2034 + }, + { + "epoch": 0.36303630363036304, + "grad_norm": 0.6396573185920715, + "learning_rate": 0.0004604338221692299, + "loss": 1.26, + "step": 2035 + }, + { + "epoch": 0.3632146998483632, + "grad_norm": 0.5331511497497559, + "learning_rate": 0.0004603959879495595, + "loss": 1.0438, + "step": 2036 + }, + { + "epoch": 0.3633930960663634, + "grad_norm": 0.6419988870620728, + "learning_rate": 0.00046035813720547876, + "loss": 1.232, + "step": 2037 + }, + { + "epoch": 0.3635714922843636, + "grad_norm": 0.5257839560508728, + "learning_rate": 0.0004603202699399604, + "loss": 0.8447, + "step": 2038 + }, + { + "epoch": 0.3637498885023637, + "grad_norm": 0.49433434009552, + "learning_rate": 0.0004602823861559785, + "loss": 0.9731, + "step": 2039 + }, + { + "epoch": 0.3639282847203639, + "grad_norm": 0.5541132688522339, + "learning_rate": 0.00046024448585650857, + "loss": 1.0478, + "step": 2040 + }, + { + "epoch": 0.3641066809383641, + "grad_norm": 0.5009471774101257, + "learning_rate": 0.00046020656904452716, + "loss": 1.0364, + "step": 2041 + }, + { + "epoch": 0.36428507715636427, + "grad_norm": 0.5421225428581238, + "learning_rate": 0.0004601686357230123, + "loss": 1.0979, + "step": 2042 + }, + { + "epoch": 0.36446347337436447, + "grad_norm": 0.49714601039886475, + "learning_rate": 0.0004601306858949432, + "loss": 0.9815, + "step": 2043 + }, + { + "epoch": 0.36464186959236466, + "grad_norm": 0.5659156441688538, + "learning_rate": 0.0004600927195633005, + "loss": 1.0267, + "step": 2044 + }, + { + "epoch": 0.3648202658103648, + "grad_norm": 0.5777513980865479, + "learning_rate": 0.000460054736731066, + "loss": 1.1066, + "step": 2045 + }, + { + "epoch": 0.364998662028365, + "grad_norm": 0.4904979169368744, + "learning_rate": 0.00046001673740122287, + "loss": 1.0805, + "step": 2046 + }, + { + "epoch": 0.36517705824636515, + "grad_norm": 0.5406635403633118, + "learning_rate": 0.0004599787215767556, + "loss": 0.9333, + "step": 2047 + }, + { + "epoch": 0.36535545446436535, + "grad_norm": 0.5242641568183899, + "learning_rate": 0.0004599406892606498, + "loss": 0.9977, + "step": 2048 + }, + { + "epoch": 0.36553385068236555, + "grad_norm": 0.538411021232605, + "learning_rate": 0.00045990264045589263, + "loss": 1.0446, + "step": 2049 + }, + { + "epoch": 0.3657122469003657, + "grad_norm": 0.5017095804214478, + "learning_rate": 0.0004598645751654724, + "loss": 0.9025, + "step": 2050 + }, + { + "epoch": 0.3658906431183659, + "grad_norm": 0.5771719813346863, + "learning_rate": 0.0004598264933923788, + "loss": 1.1323, + "step": 2051 + }, + { + "epoch": 0.3660690393363661, + "grad_norm": 0.5927982926368713, + "learning_rate": 0.0004597883951396027, + "loss": 1.4047, + "step": 2052 + }, + { + "epoch": 0.36624743555436623, + "grad_norm": 0.5014179944992065, + "learning_rate": 0.00045975028041013633, + "loss": 0.9932, + "step": 2053 + }, + { + "epoch": 0.36642583177236643, + "grad_norm": 0.5538795590400696, + "learning_rate": 0.00045971214920697325, + "loss": 1.128, + "step": 2054 + }, + { + "epoch": 0.3666042279903666, + "grad_norm": 0.5837451219558716, + "learning_rate": 0.00045967400153310814, + "loss": 1.1844, + "step": 2055 + }, + { + "epoch": 0.36678262420836677, + "grad_norm": 0.4952576458454132, + "learning_rate": 0.00045963583739153715, + "loss": 0.9711, + "step": 2056 + }, + { + "epoch": 0.36696102042636697, + "grad_norm": 0.4757169187068939, + "learning_rate": 0.00045959765678525776, + "loss": 1.0064, + "step": 2057 + }, + { + "epoch": 0.36713941664436717, + "grad_norm": 0.5288332104682922, + "learning_rate": 0.00045955945971726855, + "loss": 1.4253, + "step": 2058 + }, + { + "epoch": 0.3673178128623673, + "grad_norm": 0.5419899225234985, + "learning_rate": 0.00045952124619056957, + "loss": 1.2097, + "step": 2059 + }, + { + "epoch": 0.3674962090803675, + "grad_norm": 0.5925987958908081, + "learning_rate": 0.00045948301620816215, + "loss": 1.1952, + "step": 2060 + }, + { + "epoch": 0.36767460529836765, + "grad_norm": 0.5184533596038818, + "learning_rate": 0.00045944476977304873, + "loss": 1.0255, + "step": 2061 + }, + { + "epoch": 0.36785300151636785, + "grad_norm": 0.5200556516647339, + "learning_rate": 0.0004594065068882332, + "loss": 1.1104, + "step": 2062 + }, + { + "epoch": 0.36803139773436805, + "grad_norm": 0.5943669080734253, + "learning_rate": 0.0004593682275567208, + "loss": 1.1162, + "step": 2063 + }, + { + "epoch": 0.3682097939523682, + "grad_norm": 1.1349503993988037, + "learning_rate": 0.00045932993178151785, + "loss": 1.0188, + "step": 2064 + }, + { + "epoch": 0.3683881901703684, + "grad_norm": 0.5693401098251343, + "learning_rate": 0.00045929161956563216, + "loss": 1.0564, + "step": 2065 + }, + { + "epoch": 0.3685665863883686, + "grad_norm": 0.5493849515914917, + "learning_rate": 0.00045925329091207266, + "loss": 0.9755, + "step": 2066 + }, + { + "epoch": 0.36874498260636873, + "grad_norm": 0.5377547740936279, + "learning_rate": 0.00045921494582384985, + "loss": 1.0967, + "step": 2067 + }, + { + "epoch": 0.36892337882436893, + "grad_norm": 0.5204565525054932, + "learning_rate": 0.00045917658430397526, + "loss": 0.9365, + "step": 2068 + }, + { + "epoch": 0.36910177504236913, + "grad_norm": 0.5643702149391174, + "learning_rate": 0.0004591382063554617, + "loss": 1.2203, + "step": 2069 + }, + { + "epoch": 0.36928017126036927, + "grad_norm": 0.49709245562553406, + "learning_rate": 0.00045909981198132336, + "loss": 0.9819, + "step": 2070 + }, + { + "epoch": 0.36945856747836947, + "grad_norm": 0.5403639674186707, + "learning_rate": 0.00045906140118457577, + "loss": 1.0955, + "step": 2071 + }, + { + "epoch": 0.3696369636963696, + "grad_norm": 0.643334686756134, + "learning_rate": 0.00045902297396823577, + "loss": 1.3619, + "step": 2072 + }, + { + "epoch": 0.3698153599143698, + "grad_norm": 0.5306541919708252, + "learning_rate": 0.0004589845303353213, + "loss": 0.9215, + "step": 2073 + }, + { + "epoch": 0.36999375613237, + "grad_norm": 0.5806733965873718, + "learning_rate": 0.00045894607028885173, + "loss": 1.0277, + "step": 2074 + }, + { + "epoch": 0.37017215235037015, + "grad_norm": 0.5571582317352295, + "learning_rate": 0.0004589075938318478, + "loss": 0.845, + "step": 2075 + }, + { + "epoch": 0.37035054856837035, + "grad_norm": 0.56407231092453, + "learning_rate": 0.00045886910096733127, + "loss": 1.0461, + "step": 2076 + }, + { + "epoch": 0.37052894478637055, + "grad_norm": 0.5526430010795593, + "learning_rate": 0.0004588305916983254, + "loss": 0.915, + "step": 2077 + }, + { + "epoch": 0.3707073410043707, + "grad_norm": 0.5798369646072388, + "learning_rate": 0.00045879206602785486, + "loss": 1.0021, + "step": 2078 + }, + { + "epoch": 0.3708857372223709, + "grad_norm": 0.5774779915809631, + "learning_rate": 0.0004587535239589452, + "loss": 1.1147, + "step": 2079 + }, + { + "epoch": 0.3710641334403711, + "grad_norm": 0.5789695978164673, + "learning_rate": 0.00045871496549462364, + "loss": 1.1118, + "step": 2080 + }, + { + "epoch": 0.37124252965837123, + "grad_norm": 0.4506717622280121, + "learning_rate": 0.0004586763906379184, + "loss": 0.8033, + "step": 2081 + }, + { + "epoch": 0.37142092587637143, + "grad_norm": 0.4841929078102112, + "learning_rate": 0.0004586377993918594, + "loss": 1.1138, + "step": 2082 + }, + { + "epoch": 0.3715993220943716, + "grad_norm": 0.5651434659957886, + "learning_rate": 0.0004585991917594774, + "loss": 1.1166, + "step": 2083 + }, + { + "epoch": 0.3717777183123718, + "grad_norm": 0.5341058969497681, + "learning_rate": 0.0004585605677438046, + "loss": 1.315, + "step": 2084 + }, + { + "epoch": 0.371956114530372, + "grad_norm": 1.9402390718460083, + "learning_rate": 0.0004585219273478745, + "loss": 1.2177, + "step": 2085 + }, + { + "epoch": 0.3721345107483721, + "grad_norm": 0.5437238812446594, + "learning_rate": 0.00045848327057472206, + "loss": 0.9626, + "step": 2086 + }, + { + "epoch": 0.3723129069663723, + "grad_norm": 0.5549441576004028, + "learning_rate": 0.0004584445974273832, + "loss": 1.0923, + "step": 2087 + }, + { + "epoch": 0.3724913031843725, + "grad_norm": 0.5701561570167542, + "learning_rate": 0.0004584059079088954, + "loss": 1.1516, + "step": 2088 + }, + { + "epoch": 0.37266969940237266, + "grad_norm": 0.5430149435997009, + "learning_rate": 0.00045836720202229727, + "loss": 1.2877, + "step": 2089 + }, + { + "epoch": 0.37284809562037285, + "grad_norm": 0.555594801902771, + "learning_rate": 0.00045832847977062875, + "loss": 0.9959, + "step": 2090 + }, + { + "epoch": 0.37302649183837305, + "grad_norm": 0.5235658288002014, + "learning_rate": 0.0004582897411569311, + "loss": 1.0502, + "step": 2091 + }, + { + "epoch": 0.3732048880563732, + "grad_norm": 0.6449779868125916, + "learning_rate": 0.0004582509861842468, + "loss": 1.1183, + "step": 2092 + }, + { + "epoch": 0.3733832842743734, + "grad_norm": 0.592603325843811, + "learning_rate": 0.0004582122148556196, + "loss": 1.1536, + "step": 2093 + }, + { + "epoch": 0.37356168049237354, + "grad_norm": 0.4949917197227478, + "learning_rate": 0.0004581734271740948, + "loss": 1.0221, + "step": 2094 + }, + { + "epoch": 0.37374007671037374, + "grad_norm": 0.6672910451889038, + "learning_rate": 0.0004581346231427185, + "loss": 1.064, + "step": 2095 + }, + { + "epoch": 0.37391847292837394, + "grad_norm": 0.4738200604915619, + "learning_rate": 0.00045809580276453843, + "loss": 1.0283, + "step": 2096 + }, + { + "epoch": 0.3740968691463741, + "grad_norm": 1.385906457901001, + "learning_rate": 0.00045805696604260364, + "loss": 0.9435, + "step": 2097 + }, + { + "epoch": 0.3742752653643743, + "grad_norm": 0.6020336747169495, + "learning_rate": 0.0004580181129799643, + "loss": 1.0175, + "step": 2098 + }, + { + "epoch": 0.3744536615823745, + "grad_norm": 0.7528032660484314, + "learning_rate": 0.0004579792435796718, + "loss": 1.2405, + "step": 2099 + }, + { + "epoch": 0.3746320578003746, + "grad_norm": 0.5541552305221558, + "learning_rate": 0.00045794035784477903, + "loss": 1.0247, + "step": 2100 + }, + { + "epoch": 0.3748104540183748, + "grad_norm": 0.5061083436012268, + "learning_rate": 0.00045790145577834007, + "loss": 0.8521, + "step": 2101 + }, + { + "epoch": 0.374988850236375, + "grad_norm": 1.0418980121612549, + "learning_rate": 0.0004578625373834102, + "loss": 1.3496, + "step": 2102 + }, + { + "epoch": 0.37516724645437516, + "grad_norm": 0.5358394384384155, + "learning_rate": 0.00045782360266304615, + "loss": 0.9427, + "step": 2103 + }, + { + "epoch": 0.37534564267237536, + "grad_norm": 0.5550572276115417, + "learning_rate": 0.0004577846516203057, + "loss": 1.29, + "step": 2104 + }, + { + "epoch": 0.3755240388903755, + "grad_norm": 0.5666816830635071, + "learning_rate": 0.0004577456842582482, + "loss": 0.9784, + "step": 2105 + }, + { + "epoch": 0.3757024351083757, + "grad_norm": 0.6263391375541687, + "learning_rate": 0.000457706700579934, + "loss": 1.3776, + "step": 2106 + }, + { + "epoch": 0.3758808313263759, + "grad_norm": 0.5329514145851135, + "learning_rate": 0.000457667700588425, + "loss": 0.9403, + "step": 2107 + }, + { + "epoch": 0.37605922754437604, + "grad_norm": 0.6623142957687378, + "learning_rate": 0.00045762868428678405, + "loss": 1.1171, + "step": 2108 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.5798327326774597, + "learning_rate": 0.00045758965167807565, + "loss": 1.1791, + "step": 2109 + }, + { + "epoch": 0.37641601998037644, + "grad_norm": 0.5603600144386292, + "learning_rate": 0.00045755060276536543, + "loss": 1.0435, + "step": 2110 + }, + { + "epoch": 0.3765944161983766, + "grad_norm": 0.5544630885124207, + "learning_rate": 0.00045751153755172006, + "loss": 1.1783, + "step": 2111 + }, + { + "epoch": 0.3767728124163768, + "grad_norm": 0.633995771408081, + "learning_rate": 0.0004574724560402078, + "loss": 1.1487, + "step": 2112 + }, + { + "epoch": 0.376951208634377, + "grad_norm": 0.6015441417694092, + "learning_rate": 0.00045743335823389826, + "loss": 1.0372, + "step": 2113 + }, + { + "epoch": 0.3771296048523771, + "grad_norm": 0.5980955958366394, + "learning_rate": 0.00045739424413586194, + "loss": 1.2749, + "step": 2114 + }, + { + "epoch": 0.3773080010703773, + "grad_norm": 0.5361245274543762, + "learning_rate": 0.00045735511374917095, + "loss": 1.0022, + "step": 2115 + }, + { + "epoch": 0.37748639728837746, + "grad_norm": 0.5869659781455994, + "learning_rate": 0.0004573159670768986, + "loss": 1.214, + "step": 2116 + }, + { + "epoch": 0.37766479350637766, + "grad_norm": 0.6182494759559631, + "learning_rate": 0.00045727680412211937, + "loss": 0.9911, + "step": 2117 + }, + { + "epoch": 0.37784318972437786, + "grad_norm": 0.5352988243103027, + "learning_rate": 0.0004572376248879091, + "loss": 1.0801, + "step": 2118 + }, + { + "epoch": 0.378021585942378, + "grad_norm": 0.5704354047775269, + "learning_rate": 0.000457198429377345, + "loss": 0.9513, + "step": 2119 + }, + { + "epoch": 0.3781999821603782, + "grad_norm": 0.5778263807296753, + "learning_rate": 0.00045715921759350545, + "loss": 1.1593, + "step": 2120 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.5286616683006287, + "learning_rate": 0.0004571199895394701, + "loss": 1.2545, + "step": 2121 + }, + { + "epoch": 0.37855677459637854, + "grad_norm": 0.5010552406311035, + "learning_rate": 0.00045708074521831984, + "loss": 0.8689, + "step": 2122 + }, + { + "epoch": 0.37873517081437874, + "grad_norm": 0.8215789198875427, + "learning_rate": 0.000457041484633137, + "loss": 1.1025, + "step": 2123 + }, + { + "epoch": 0.37891356703237894, + "grad_norm": 0.8685697317123413, + "learning_rate": 0.000457002207787005, + "loss": 1.0335, + "step": 2124 + }, + { + "epoch": 0.3790919632503791, + "grad_norm": 0.5928163528442383, + "learning_rate": 0.00045696291468300874, + "loss": 1.1333, + "step": 2125 + }, + { + "epoch": 0.3792703594683793, + "grad_norm": 0.5363757014274597, + "learning_rate": 0.00045692360532423423, + "loss": 1.3034, + "step": 2126 + }, + { + "epoch": 0.3794487556863794, + "grad_norm": 0.6252255439758301, + "learning_rate": 0.00045688427971376876, + "loss": 1.1585, + "step": 2127 + }, + { + "epoch": 0.3796271519043796, + "grad_norm": 0.6210330128669739, + "learning_rate": 0.00045684493785470105, + "loss": 0.9278, + "step": 2128 + }, + { + "epoch": 0.3798055481223798, + "grad_norm": 0.8249781131744385, + "learning_rate": 0.00045680557975012086, + "loss": 1.4528, + "step": 2129 + }, + { + "epoch": 0.37998394434037996, + "grad_norm": 0.5406894683837891, + "learning_rate": 0.00045676620540311953, + "loss": 0.9968, + "step": 2130 + }, + { + "epoch": 0.38016234055838016, + "grad_norm": 0.631753146648407, + "learning_rate": 0.00045672681481678936, + "loss": 1.05, + "step": 2131 + }, + { + "epoch": 0.38034073677638036, + "grad_norm": 0.5768477916717529, + "learning_rate": 0.000456687407994224, + "loss": 1.1387, + "step": 2132 + }, + { + "epoch": 0.3805191329943805, + "grad_norm": 0.5343456268310547, + "learning_rate": 0.00045664798493851873, + "loss": 1.0884, + "step": 2133 + }, + { + "epoch": 0.3806975292123807, + "grad_norm": 0.5370036363601685, + "learning_rate": 0.00045660854565276955, + "loss": 1.0014, + "step": 2134 + }, + { + "epoch": 0.3808759254303809, + "grad_norm": 0.5855540633201599, + "learning_rate": 0.0004565690901400742, + "loss": 1.2283, + "step": 2135 + }, + { + "epoch": 0.38105432164838104, + "grad_norm": 0.5994966626167297, + "learning_rate": 0.00045652961840353135, + "loss": 0.801, + "step": 2136 + }, + { + "epoch": 0.38123271786638124, + "grad_norm": 0.7402109503746033, + "learning_rate": 0.0004564901304462411, + "loss": 0.9229, + "step": 2137 + }, + { + "epoch": 0.3814111140843814, + "grad_norm": 0.8662267327308655, + "learning_rate": 0.0004564506262713049, + "loss": 1.0499, + "step": 2138 + }, + { + "epoch": 0.3815895103023816, + "grad_norm": 0.5050637125968933, + "learning_rate": 0.0004564111058818254, + "loss": 0.9188, + "step": 2139 + }, + { + "epoch": 0.3817679065203818, + "grad_norm": 0.5128204822540283, + "learning_rate": 0.0004563715692809064, + "loss": 0.9959, + "step": 2140 + }, + { + "epoch": 0.3819463027383819, + "grad_norm": 0.5767316818237305, + "learning_rate": 0.00045633201647165325, + "loss": 1.1759, + "step": 2141 + }, + { + "epoch": 0.3821246989563821, + "grad_norm": 0.5560585856437683, + "learning_rate": 0.0004562924474571722, + "loss": 1.2575, + "step": 2142 + }, + { + "epoch": 0.3823030951743823, + "grad_norm": 0.5354772210121155, + "learning_rate": 0.0004562528622405712, + "loss": 1.0975, + "step": 2143 + }, + { + "epoch": 0.38248149139238247, + "grad_norm": 0.5775118470191956, + "learning_rate": 0.0004562132608249591, + "loss": 1.0457, + "step": 2144 + }, + { + "epoch": 0.38265988761038267, + "grad_norm": 0.5361456274986267, + "learning_rate": 0.00045617364321344625, + "loss": 1.0162, + "step": 2145 + }, + { + "epoch": 0.38283828382838286, + "grad_norm": 0.6208562254905701, + "learning_rate": 0.00045613400940914417, + "loss": 1.0852, + "step": 2146 + }, + { + "epoch": 0.383016680046383, + "grad_norm": 0.5321409106254578, + "learning_rate": 0.0004560943594151657, + "loss": 1.0978, + "step": 2147 + }, + { + "epoch": 0.3831950762643832, + "grad_norm": 0.4865065813064575, + "learning_rate": 0.00045605469323462493, + "loss": 0.8982, + "step": 2148 + }, + { + "epoch": 0.38337347248238335, + "grad_norm": 0.5791817903518677, + "learning_rate": 0.0004560150108706372, + "loss": 1.0734, + "step": 2149 + }, + { + "epoch": 0.38355186870038355, + "grad_norm": 0.5767939686775208, + "learning_rate": 0.0004559753123263193, + "loss": 1.112, + "step": 2150 + }, + { + "epoch": 0.38373026491838375, + "grad_norm": 0.5261261463165283, + "learning_rate": 0.0004559355976047889, + "loss": 1.0849, + "step": 2151 + }, + { + "epoch": 0.3839086611363839, + "grad_norm": 0.5370740294456482, + "learning_rate": 0.00045589586670916526, + "loss": 0.8849, + "step": 2152 + }, + { + "epoch": 0.3840870573543841, + "grad_norm": 0.4868060350418091, + "learning_rate": 0.0004558561196425689, + "loss": 0.9771, + "step": 2153 + }, + { + "epoch": 0.3842654535723843, + "grad_norm": 0.5003844499588013, + "learning_rate": 0.0004558163564081215, + "loss": 0.9858, + "step": 2154 + }, + { + "epoch": 0.38444384979038443, + "grad_norm": 0.4873872399330139, + "learning_rate": 0.000455776577008946, + "loss": 1.082, + "step": 2155 + }, + { + "epoch": 0.3846222460083846, + "grad_norm": 0.5657439231872559, + "learning_rate": 0.0004557367814481668, + "loss": 1.1766, + "step": 2156 + }, + { + "epoch": 0.3848006422263848, + "grad_norm": 0.5301082134246826, + "learning_rate": 0.00045569696972890916, + "loss": 0.9749, + "step": 2157 + }, + { + "epoch": 0.38497903844438497, + "grad_norm": 0.5159010291099548, + "learning_rate": 0.0004556571418543001, + "loss": 1.0766, + "step": 2158 + }, + { + "epoch": 0.38515743466238517, + "grad_norm": 0.5375173687934875, + "learning_rate": 0.00045561729782746767, + "loss": 1.0563, + "step": 2159 + }, + { + "epoch": 0.3853358308803853, + "grad_norm": 0.5973328351974487, + "learning_rate": 0.0004555774376515411, + "loss": 1.3044, + "step": 2160 + }, + { + "epoch": 0.3855142270983855, + "grad_norm": 0.4792105555534363, + "learning_rate": 0.00045553756132965105, + "loss": 0.8944, + "step": 2161 + }, + { + "epoch": 0.3856926233163857, + "grad_norm": 0.5174294114112854, + "learning_rate": 0.00045549766886492945, + "loss": 1.0259, + "step": 2162 + }, + { + "epoch": 0.38587101953438585, + "grad_norm": 0.48922815918922424, + "learning_rate": 0.0004554577602605093, + "loss": 0.8496, + "step": 2163 + }, + { + "epoch": 0.38604941575238605, + "grad_norm": 0.49794623255729675, + "learning_rate": 0.0004554178355195251, + "loss": 0.8427, + "step": 2164 + }, + { + "epoch": 0.38622781197038625, + "grad_norm": 0.5163681507110596, + "learning_rate": 0.00045537789464511247, + "loss": 1.0703, + "step": 2165 + }, + { + "epoch": 0.3864062081883864, + "grad_norm": 0.5273317694664001, + "learning_rate": 0.00045533793764040845, + "loss": 1.0474, + "step": 2166 + }, + { + "epoch": 0.3865846044063866, + "grad_norm": 0.563606858253479, + "learning_rate": 0.0004552979645085511, + "loss": 1.3796, + "step": 2167 + }, + { + "epoch": 0.3867630006243868, + "grad_norm": 0.5003736615180969, + "learning_rate": 0.0004552579752526799, + "loss": 0.8776, + "step": 2168 + }, + { + "epoch": 0.38694139684238693, + "grad_norm": 0.5471118688583374, + "learning_rate": 0.0004552179698759358, + "loss": 1.2153, + "step": 2169 + }, + { + "epoch": 0.38711979306038713, + "grad_norm": 0.5452420115470886, + "learning_rate": 0.0004551779483814605, + "loss": 1.1194, + "step": 2170 + }, + { + "epoch": 0.3872981892783873, + "grad_norm": 0.5115416049957275, + "learning_rate": 0.0004551379107723976, + "loss": 1.2556, + "step": 2171 + }, + { + "epoch": 0.38747658549638747, + "grad_norm": 0.5057587027549744, + "learning_rate": 0.0004550978570518913, + "loss": 0.9426, + "step": 2172 + }, + { + "epoch": 0.38765498171438767, + "grad_norm": 0.5084266662597656, + "learning_rate": 0.00045505778722308764, + "loss": 0.9323, + "step": 2173 + }, + { + "epoch": 0.3878333779323878, + "grad_norm": 0.5315088629722595, + "learning_rate": 0.00045501770128913364, + "loss": 1.1016, + "step": 2174 + }, + { + "epoch": 0.388011774150388, + "grad_norm": 0.6192378401756287, + "learning_rate": 0.0004549775992531776, + "loss": 1.3612, + "step": 2175 + }, + { + "epoch": 0.3881901703683882, + "grad_norm": 0.49113729596138, + "learning_rate": 0.0004549374811183692, + "loss": 0.984, + "step": 2176 + }, + { + "epoch": 0.38836856658638835, + "grad_norm": 0.5000951886177063, + "learning_rate": 0.0004548973468878591, + "loss": 1.0827, + "step": 2177 + }, + { + "epoch": 0.38854696280438855, + "grad_norm": 0.5333259701728821, + "learning_rate": 0.00045485719656479957, + "loss": 1.0745, + "step": 2178 + }, + { + "epoch": 0.38872535902238875, + "grad_norm": 0.5190644860267639, + "learning_rate": 0.000454817030152344, + "loss": 1.1067, + "step": 2179 + }, + { + "epoch": 0.3889037552403889, + "grad_norm": 0.5464503765106201, + "learning_rate": 0.00045477684765364703, + "loss": 1.1484, + "step": 2180 + }, + { + "epoch": 0.3890821514583891, + "grad_norm": 0.5542097091674805, + "learning_rate": 0.0004547366490718645, + "loss": 1.1321, + "step": 2181 + }, + { + "epoch": 0.38926054767638923, + "grad_norm": 0.5173625349998474, + "learning_rate": 0.0004546964344101537, + "loss": 1.2251, + "step": 2182 + }, + { + "epoch": 0.38943894389438943, + "grad_norm": 0.5054041743278503, + "learning_rate": 0.0004546562036716732, + "loss": 0.9788, + "step": 2183 + }, + { + "epoch": 0.38961734011238963, + "grad_norm": 0.48386964201927185, + "learning_rate": 0.0004546159568595823, + "loss": 1.0653, + "step": 2184 + }, + { + "epoch": 0.3897957363303898, + "grad_norm": 0.5080329775810242, + "learning_rate": 0.00045457569397704226, + "loss": 1.1174, + "step": 2185 + }, + { + "epoch": 0.38997413254839, + "grad_norm": 1.7605923414230347, + "learning_rate": 0.0004545354150272153, + "loss": 1.0394, + "step": 2186 + }, + { + "epoch": 0.39015252876639017, + "grad_norm": 0.45817258954048157, + "learning_rate": 0.0004544951200132648, + "loss": 0.8228, + "step": 2187 + }, + { + "epoch": 0.3903309249843903, + "grad_norm": 0.5302280783653259, + "learning_rate": 0.00045445480893835567, + "loss": 1.1017, + "step": 2188 + }, + { + "epoch": 0.3905093212023905, + "grad_norm": 0.5459249019622803, + "learning_rate": 0.0004544144818056537, + "loss": 1.2306, + "step": 2189 + }, + { + "epoch": 0.3906877174203907, + "grad_norm": 0.6084957122802734, + "learning_rate": 0.0004543741386183264, + "loss": 1.0647, + "step": 2190 + }, + { + "epoch": 0.39086611363839086, + "grad_norm": 0.6476156711578369, + "learning_rate": 0.00045433377937954215, + "loss": 1.2749, + "step": 2191 + }, + { + "epoch": 0.39104450985639105, + "grad_norm": 0.5217106938362122, + "learning_rate": 0.00045429340409247084, + "loss": 0.9928, + "step": 2192 + }, + { + "epoch": 0.3912229060743912, + "grad_norm": 0.5622875690460205, + "learning_rate": 0.0004542530127602834, + "loss": 1.0067, + "step": 2193 + }, + { + "epoch": 0.3914013022923914, + "grad_norm": 1.472040057182312, + "learning_rate": 0.00045421260538615235, + "loss": 1.1993, + "step": 2194 + }, + { + "epoch": 0.3915796985103916, + "grad_norm": 0.5135729908943176, + "learning_rate": 0.00045417218197325106, + "loss": 0.9331, + "step": 2195 + }, + { + "epoch": 0.39175809472839174, + "grad_norm": 0.662834644317627, + "learning_rate": 0.00045413174252475455, + "loss": 1.1606, + "step": 2196 + }, + { + "epoch": 0.39193649094639194, + "grad_norm": 0.9087008237838745, + "learning_rate": 0.00045409128704383873, + "loss": 1.2032, + "step": 2197 + }, + { + "epoch": 0.39211488716439213, + "grad_norm": 0.5726129412651062, + "learning_rate": 0.0004540508155336811, + "loss": 1.1915, + "step": 2198 + }, + { + "epoch": 0.3922932833823923, + "grad_norm": 0.6241844296455383, + "learning_rate": 0.0004540103279974602, + "loss": 1.2713, + "step": 2199 + }, + { + "epoch": 0.3924716796003925, + "grad_norm": 0.5253695249557495, + "learning_rate": 0.000453969824438356, + "loss": 0.9503, + "step": 2200 + }, + { + "epoch": 0.3926500758183927, + "grad_norm": 0.4767884612083435, + "learning_rate": 0.00045392930485954955, + "loss": 0.9004, + "step": 2201 + }, + { + "epoch": 0.3928284720363928, + "grad_norm": 0.5421171188354492, + "learning_rate": 0.0004538887692642232, + "loss": 1.057, + "step": 2202 + }, + { + "epoch": 0.393006868254393, + "grad_norm": 0.6273959279060364, + "learning_rate": 0.0004538482176555607, + "loss": 1.0529, + "step": 2203 + }, + { + "epoch": 0.39318526447239316, + "grad_norm": 0.5183041095733643, + "learning_rate": 0.00045380765003674684, + "loss": 1.0245, + "step": 2204 + }, + { + "epoch": 0.39336366069039336, + "grad_norm": 0.5289639830589294, + "learning_rate": 0.00045376706641096786, + "loss": 1.1824, + "step": 2205 + }, + { + "epoch": 0.39354205690839356, + "grad_norm": 0.520853579044342, + "learning_rate": 0.00045372646678141127, + "loss": 1.0964, + "step": 2206 + }, + { + "epoch": 0.3937204531263937, + "grad_norm": 0.6077725291252136, + "learning_rate": 0.0004536858511512656, + "loss": 1.155, + "step": 2207 + }, + { + "epoch": 0.3938988493443939, + "grad_norm": 0.5906806588172913, + "learning_rate": 0.0004536452195237208, + "loss": 1.3433, + "step": 2208 + }, + { + "epoch": 0.3940772455623941, + "grad_norm": 1.7862430810928345, + "learning_rate": 0.00045360457190196814, + "loss": 1.2213, + "step": 2209 + }, + { + "epoch": 0.39425564178039424, + "grad_norm": 0.530116856098175, + "learning_rate": 0.0004535639082892, + "loss": 1.0639, + "step": 2210 + }, + { + "epoch": 0.39443403799839444, + "grad_norm": 0.5256516933441162, + "learning_rate": 0.00045352322868861004, + "loss": 1.1793, + "step": 2211 + }, + { + "epoch": 0.39461243421639464, + "grad_norm": 0.46268409490585327, + "learning_rate": 0.00045348253310339336, + "loss": 0.9103, + "step": 2212 + }, + { + "epoch": 0.3947908304343948, + "grad_norm": 0.5721630454063416, + "learning_rate": 0.0004534418215367461, + "loss": 1.3037, + "step": 2213 + }, + { + "epoch": 0.394969226652395, + "grad_norm": 0.550315797328949, + "learning_rate": 0.0004534010939918657, + "loss": 1.0648, + "step": 2214 + }, + { + "epoch": 0.3951476228703951, + "grad_norm": 0.48966914415359497, + "learning_rate": 0.0004533603504719509, + "loss": 0.9247, + "step": 2215 + }, + { + "epoch": 0.3953260190883953, + "grad_norm": 0.5156149864196777, + "learning_rate": 0.0004533195909802017, + "loss": 1.1591, + "step": 2216 + }, + { + "epoch": 0.3955044153063955, + "grad_norm": 0.5864170789718628, + "learning_rate": 0.0004532788155198193, + "loss": 0.967, + "step": 2217 + }, + { + "epoch": 0.39568281152439566, + "grad_norm": 0.4873606562614441, + "learning_rate": 0.00045323802409400626, + "loss": 0.9788, + "step": 2218 + }, + { + "epoch": 0.39586120774239586, + "grad_norm": 0.5299291014671326, + "learning_rate": 0.00045319721670596623, + "loss": 1.16, + "step": 2219 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5208775997161865, + "learning_rate": 0.00045315639335890423, + "loss": 1.1583, + "step": 2220 + }, + { + "epoch": 0.3962180001783962, + "grad_norm": 0.5344860553741455, + "learning_rate": 0.00045311555405602656, + "loss": 1.0034, + "step": 2221 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 0.5340254306793213, + "learning_rate": 0.00045307469880054063, + "loss": 1.0769, + "step": 2222 + }, + { + "epoch": 0.3965747926143966, + "grad_norm": 0.48012998700141907, + "learning_rate": 0.00045303382759565524, + "loss": 0.9031, + "step": 2223 + }, + { + "epoch": 0.39675318883239674, + "grad_norm": 0.511830747127533, + "learning_rate": 0.0004529929404445805, + "loss": 0.9729, + "step": 2224 + }, + { + "epoch": 0.39693158505039694, + "grad_norm": 0.5484462976455688, + "learning_rate": 0.00045295203735052747, + "loss": 0.9895, + "step": 2225 + }, + { + "epoch": 0.3971099812683971, + "grad_norm": 0.5423020720481873, + "learning_rate": 0.0004529111183167088, + "loss": 1.1713, + "step": 2226 + }, + { + "epoch": 0.3972883774863973, + "grad_norm": 0.5270297527313232, + "learning_rate": 0.00045287018334633824, + "loss": 1.0229, + "step": 2227 + }, + { + "epoch": 0.3974667737043975, + "grad_norm": 0.5546413064002991, + "learning_rate": 0.00045282923244263076, + "loss": 1.2354, + "step": 2228 + }, + { + "epoch": 0.3976451699223976, + "grad_norm": 0.5324910283088684, + "learning_rate": 0.00045278826560880267, + "loss": 1.225, + "step": 2229 + }, + { + "epoch": 0.3978235661403978, + "grad_norm": 0.48066118359565735, + "learning_rate": 0.00045274728284807144, + "loss": 0.8824, + "step": 2230 + }, + { + "epoch": 0.398001962358398, + "grad_norm": 0.5693602561950684, + "learning_rate": 0.00045270628416365586, + "loss": 1.0692, + "step": 2231 + }, + { + "epoch": 0.39818035857639816, + "grad_norm": 0.5462480783462524, + "learning_rate": 0.00045266526955877595, + "loss": 1.0537, + "step": 2232 + }, + { + "epoch": 0.39835875479439836, + "grad_norm": 0.510565459728241, + "learning_rate": 0.000452624239036653, + "loss": 1.0072, + "step": 2233 + }, + { + "epoch": 0.39853715101239856, + "grad_norm": 0.4951585829257965, + "learning_rate": 0.0004525831926005095, + "loss": 0.9697, + "step": 2234 + }, + { + "epoch": 0.3987155472303987, + "grad_norm": 0.49053049087524414, + "learning_rate": 0.0004525421302535693, + "loss": 0.9513, + "step": 2235 + }, + { + "epoch": 0.3988939434483989, + "grad_norm": 1.259932041168213, + "learning_rate": 0.0004525010519990572, + "loss": 0.9762, + "step": 2236 + }, + { + "epoch": 0.39907233966639905, + "grad_norm": 0.5876604914665222, + "learning_rate": 0.00045245995784019973, + "loss": 1.1267, + "step": 2237 + }, + { + "epoch": 0.39925073588439924, + "grad_norm": 2.0633914470672607, + "learning_rate": 0.00045241884778022423, + "loss": 1.0779, + "step": 2238 + }, + { + "epoch": 0.39942913210239944, + "grad_norm": 0.4787190854549408, + "learning_rate": 0.0004523777218223596, + "loss": 0.9998, + "step": 2239 + }, + { + "epoch": 0.3996075283203996, + "grad_norm": 2.782982110977173, + "learning_rate": 0.0004523365799698358, + "loss": 1.0978, + "step": 2240 + }, + { + "epoch": 0.3997859245383998, + "grad_norm": 0.5323127508163452, + "learning_rate": 0.00045229542222588405, + "loss": 1.0, + "step": 2241 + }, + { + "epoch": 0.3999643207564, + "grad_norm": 0.49209773540496826, + "learning_rate": 0.0004522542485937369, + "loss": 0.9856, + "step": 2242 + }, + { + "epoch": 0.4001427169744001, + "grad_norm": 0.9652454257011414, + "learning_rate": 0.000452213059076628, + "loss": 1.0638, + "step": 2243 + }, + { + "epoch": 0.4003211131924003, + "grad_norm": 0.5381683111190796, + "learning_rate": 0.00045217185367779265, + "loss": 1.0115, + "step": 2244 + }, + { + "epoch": 0.4004995094104005, + "grad_norm": 0.48560935258865356, + "learning_rate": 0.0004521306324004668, + "loss": 1.0621, + "step": 2245 + }, + { + "epoch": 0.40067790562840067, + "grad_norm": 0.5368553996086121, + "learning_rate": 0.00045208939524788805, + "loss": 0.9944, + "step": 2246 + }, + { + "epoch": 0.40085630184640086, + "grad_norm": 0.5911176204681396, + "learning_rate": 0.00045204814222329513, + "loss": 1.1529, + "step": 2247 + }, + { + "epoch": 0.401034698064401, + "grad_norm": 0.4951234459877014, + "learning_rate": 0.0004520068733299282, + "loss": 1.0468, + "step": 2248 + }, + { + "epoch": 0.4012130942824012, + "grad_norm": 0.5037215948104858, + "learning_rate": 0.0004519655885710283, + "loss": 0.9121, + "step": 2249 + }, + { + "epoch": 0.4013914905004014, + "grad_norm": 0.6309484243392944, + "learning_rate": 0.000451924287949838, + "loss": 1.2108, + "step": 2250 + }, + { + "epoch": 0.40156988671840155, + "grad_norm": 0.5557146072387695, + "learning_rate": 0.000451882971469601, + "loss": 1.1713, + "step": 2251 + }, + { + "epoch": 0.40174828293640175, + "grad_norm": 0.8298324346542358, + "learning_rate": 0.0004518416391335623, + "loss": 1.1566, + "step": 2252 + }, + { + "epoch": 0.40192667915440194, + "grad_norm": 4.0279459953308105, + "learning_rate": 0.00045180029094496813, + "loss": 1.1309, + "step": 2253 + }, + { + "epoch": 0.4021050753724021, + "grad_norm": 0.5673984885215759, + "learning_rate": 0.0004517589269070659, + "loss": 1.0474, + "step": 2254 + }, + { + "epoch": 0.4022834715904023, + "grad_norm": 0.4788002669811249, + "learning_rate": 0.0004517175470231044, + "loss": 0.8327, + "step": 2255 + }, + { + "epoch": 0.4024618678084025, + "grad_norm": 0.549028217792511, + "learning_rate": 0.0004516761512963337, + "loss": 1.1673, + "step": 2256 + }, + { + "epoch": 0.40264026402640263, + "grad_norm": 0.7566819787025452, + "learning_rate": 0.0004516347397300047, + "loss": 1.103, + "step": 2257 + }, + { + "epoch": 0.4028186602444028, + "grad_norm": 0.5571449995040894, + "learning_rate": 0.00045159331232737, + "loss": 1.4132, + "step": 2258 + }, + { + "epoch": 0.40299705646240297, + "grad_norm": 0.47221845388412476, + "learning_rate": 0.00045155186909168345, + "loss": 0.8425, + "step": 2259 + }, + { + "epoch": 0.40317545268040317, + "grad_norm": 0.956231415271759, + "learning_rate": 0.0004515104100261997, + "loss": 1.0242, + "step": 2260 + }, + { + "epoch": 0.40335384889840337, + "grad_norm": 0.5245352387428284, + "learning_rate": 0.0004514689351341751, + "loss": 1.2424, + "step": 2261 + }, + { + "epoch": 0.4035322451164035, + "grad_norm": 0.5289427638053894, + "learning_rate": 0.00045142744441886706, + "loss": 0.9463, + "step": 2262 + }, + { + "epoch": 0.4037106413344037, + "grad_norm": 0.5312831997871399, + "learning_rate": 0.00045138593788353424, + "loss": 1.0078, + "step": 2263 + }, + { + "epoch": 0.4038890375524039, + "grad_norm": 0.5654627680778503, + "learning_rate": 0.00045134441553143647, + "loss": 1.2575, + "step": 2264 + }, + { + "epoch": 0.40406743377040405, + "grad_norm": 0.5695852637290955, + "learning_rate": 0.00045130287736583493, + "loss": 1.0575, + "step": 2265 + }, + { + "epoch": 0.40424582998840425, + "grad_norm": 0.5789728164672852, + "learning_rate": 0.00045126132338999203, + "loss": 1.0646, + "step": 2266 + }, + { + "epoch": 0.40442422620640445, + "grad_norm": 0.526038408279419, + "learning_rate": 0.0004512197536071715, + "loss": 1.035, + "step": 2267 + }, + { + "epoch": 0.4046026224244046, + "grad_norm": 0.537186324596405, + "learning_rate": 0.000451178168020638, + "loss": 1.0141, + "step": 2268 + }, + { + "epoch": 0.4047810186424048, + "grad_norm": 0.5389789342880249, + "learning_rate": 0.0004511365666336578, + "loss": 1.0923, + "step": 2269 + }, + { + "epoch": 0.404959414860405, + "grad_norm": 0.5258745551109314, + "learning_rate": 0.00045109494944949827, + "loss": 1.277, + "step": 2270 + }, + { + "epoch": 0.40513781107840513, + "grad_norm": 0.5116296410560608, + "learning_rate": 0.0004510533164714278, + "loss": 0.9776, + "step": 2271 + }, + { + "epoch": 0.40531620729640533, + "grad_norm": 0.4564659297466278, + "learning_rate": 0.0004510116677027165, + "loss": 1.0638, + "step": 2272 + }, + { + "epoch": 0.40549460351440547, + "grad_norm": 0.5655273199081421, + "learning_rate": 0.00045097000314663527, + "loss": 1.0859, + "step": 2273 + }, + { + "epoch": 0.40567299973240567, + "grad_norm": 0.5012032389640808, + "learning_rate": 0.0004509283228064565, + "loss": 0.9221, + "step": 2274 + }, + { + "epoch": 0.40585139595040587, + "grad_norm": 0.48415374755859375, + "learning_rate": 0.00045088662668545375, + "loss": 1.0236, + "step": 2275 + }, + { + "epoch": 0.406029792168406, + "grad_norm": 0.47426506876945496, + "learning_rate": 0.00045084491478690177, + "loss": 1.0458, + "step": 2276 + }, + { + "epoch": 0.4062081883864062, + "grad_norm": 0.4942657947540283, + "learning_rate": 0.0004508031871140765, + "loss": 1.1705, + "step": 2277 + }, + { + "epoch": 0.4063865846044064, + "grad_norm": 0.5193724036216736, + "learning_rate": 0.0004507614436702555, + "loss": 1.0264, + "step": 2278 + }, + { + "epoch": 0.40656498082240655, + "grad_norm": 0.4963577091693878, + "learning_rate": 0.000450719684458717, + "loss": 0.8334, + "step": 2279 + }, + { + "epoch": 0.40674337704040675, + "grad_norm": 1.6158602237701416, + "learning_rate": 0.0004506779094827409, + "loss": 1.128, + "step": 2280 + }, + { + "epoch": 0.40692177325840695, + "grad_norm": 1.3288167715072632, + "learning_rate": 0.00045063611874560815, + "loss": 1.006, + "step": 2281 + }, + { + "epoch": 0.4071001694764071, + "grad_norm": 0.5889169573783875, + "learning_rate": 0.000450594312250601, + "loss": 1.0914, + "step": 2282 + }, + { + "epoch": 0.4072785656944073, + "grad_norm": 0.562699019908905, + "learning_rate": 0.00045055249000100283, + "loss": 1.0634, + "step": 2283 + }, + { + "epoch": 0.40745696191240743, + "grad_norm": 0.5045656561851501, + "learning_rate": 0.00045051065200009844, + "loss": 1.0903, + "step": 2284 + }, + { + "epoch": 0.40763535813040763, + "grad_norm": 0.5693420767784119, + "learning_rate": 0.0004504687982511737, + "loss": 1.0265, + "step": 2285 + }, + { + "epoch": 0.40781375434840783, + "grad_norm": 0.556315004825592, + "learning_rate": 0.00045042692875751585, + "loss": 1.0908, + "step": 2286 + }, + { + "epoch": 0.407992150566408, + "grad_norm": 1.0736392736434937, + "learning_rate": 0.00045038504352241324, + "loss": 1.2133, + "step": 2287 + }, + { + "epoch": 0.4081705467844082, + "grad_norm": 0.6816602349281311, + "learning_rate": 0.00045034314254915555, + "loss": 0.9846, + "step": 2288 + }, + { + "epoch": 0.40834894300240837, + "grad_norm": 0.6575304865837097, + "learning_rate": 0.0004503012258410336, + "loss": 1.014, + "step": 2289 + }, + { + "epoch": 0.4085273392204085, + "grad_norm": 4.394046306610107, + "learning_rate": 0.00045025929340133963, + "loss": 1.0483, + "step": 2290 + }, + { + "epoch": 0.4087057354384087, + "grad_norm": 1.7720059156417847, + "learning_rate": 0.000450217345233367, + "loss": 0.9815, + "step": 2291 + }, + { + "epoch": 0.4088841316564089, + "grad_norm": 0.8023939728736877, + "learning_rate": 0.00045017538134041013, + "loss": 0.9606, + "step": 2292 + }, + { + "epoch": 0.40906252787440905, + "grad_norm": 0.7400693893432617, + "learning_rate": 0.000450133401725765, + "loss": 1.2293, + "step": 2293 + }, + { + "epoch": 0.40924092409240925, + "grad_norm": 0.6982590556144714, + "learning_rate": 0.0004500914063927286, + "loss": 1.056, + "step": 2294 + }, + { + "epoch": 0.4094193203104094, + "grad_norm": 0.7190932035446167, + "learning_rate": 0.00045004939534459923, + "loss": 1.333, + "step": 2295 + }, + { + "epoch": 0.4095977165284096, + "grad_norm": 0.5682306885719299, + "learning_rate": 0.0004500073685846765, + "loss": 1.0389, + "step": 2296 + }, + { + "epoch": 0.4097761127464098, + "grad_norm": 0.5392178297042847, + "learning_rate": 0.0004499653261162611, + "loss": 0.8848, + "step": 2297 + }, + { + "epoch": 0.40995450896440994, + "grad_norm": 0.7155362963676453, + "learning_rate": 0.0004499232679426549, + "loss": 1.0928, + "step": 2298 + }, + { + "epoch": 0.41013290518241013, + "grad_norm": 0.5469278693199158, + "learning_rate": 0.00044988119406716144, + "loss": 1.0405, + "step": 2299 + }, + { + "epoch": 0.41031130140041033, + "grad_norm": 0.5402587056159973, + "learning_rate": 0.000449839104493085, + "loss": 0.8677, + "step": 2300 + }, + { + "epoch": 0.4104896976184105, + "grad_norm": 0.5472204089164734, + "learning_rate": 0.0004497969992237312, + "loss": 0.8406, + "step": 2301 + }, + { + "epoch": 0.4106680938364107, + "grad_norm": 0.529851496219635, + "learning_rate": 0.00044975487826240715, + "loss": 1.058, + "step": 2302 + }, + { + "epoch": 0.4108464900544109, + "grad_norm": 0.5373932719230652, + "learning_rate": 0.00044971274161242084, + "loss": 1.0915, + "step": 2303 + }, + { + "epoch": 0.411024886272411, + "grad_norm": 0.5543463826179504, + "learning_rate": 0.0004496705892770818, + "loss": 1.0177, + "step": 2304 + }, + { + "epoch": 0.4112032824904112, + "grad_norm": 0.5836469531059265, + "learning_rate": 0.0004496284212597006, + "loss": 0.9446, + "step": 2305 + }, + { + "epoch": 0.41138167870841136, + "grad_norm": 0.6079695820808411, + "learning_rate": 0.00044958623756358905, + "loss": 0.9742, + "step": 2306 + }, + { + "epoch": 0.41156007492641156, + "grad_norm": 0.4826619625091553, + "learning_rate": 0.00044954403819206037, + "loss": 1.0292, + "step": 2307 + }, + { + "epoch": 0.41173847114441176, + "grad_norm": 0.6502671241760254, + "learning_rate": 0.00044950182314842875, + "loss": 1.0641, + "step": 2308 + }, + { + "epoch": 0.4119168673624119, + "grad_norm": 0.4860910475254059, + "learning_rate": 0.0004494595924360098, + "loss": 0.9264, + "step": 2309 + }, + { + "epoch": 0.4120952635804121, + "grad_norm": 0.6258044838905334, + "learning_rate": 0.00044941734605812033, + "loss": 0.9347, + "step": 2310 + }, + { + "epoch": 0.4122736597984123, + "grad_norm": 0.5750169157981873, + "learning_rate": 0.00044937508401807826, + "loss": 1.2065, + "step": 2311 + }, + { + "epoch": 0.41245205601641244, + "grad_norm": 0.560260534286499, + "learning_rate": 0.0004493328063192029, + "loss": 1.2164, + "step": 2312 + }, + { + "epoch": 0.41263045223441264, + "grad_norm": 0.5267819166183472, + "learning_rate": 0.0004492905129648147, + "loss": 1.1227, + "step": 2313 + }, + { + "epoch": 0.41280884845241284, + "grad_norm": 0.5849962830543518, + "learning_rate": 0.0004492482039582354, + "loss": 1.2503, + "step": 2314 + }, + { + "epoch": 0.412987244670413, + "grad_norm": 0.5180212259292603, + "learning_rate": 0.00044920587930278796, + "loss": 0.92, + "step": 2315 + }, + { + "epoch": 0.4131656408884132, + "grad_norm": 0.4263732135295868, + "learning_rate": 0.0004491635390017964, + "loss": 0.8326, + "step": 2316 + }, + { + "epoch": 0.4133440371064133, + "grad_norm": 0.5409104824066162, + "learning_rate": 0.0004491211830585862, + "loss": 1.1766, + "step": 2317 + }, + { + "epoch": 0.4135224333244135, + "grad_norm": 0.5557611584663391, + "learning_rate": 0.000449078811476484, + "loss": 1.0369, + "step": 2318 + }, + { + "epoch": 0.4137008295424137, + "grad_norm": 0.5291704535484314, + "learning_rate": 0.0004490364242588176, + "loss": 1.024, + "step": 2319 + }, + { + "epoch": 0.41387922576041386, + "grad_norm": 0.4819130301475525, + "learning_rate": 0.0004489940214089161, + "loss": 1.0032, + "step": 2320 + }, + { + "epoch": 0.41405762197841406, + "grad_norm": 0.5592676401138306, + "learning_rate": 0.0004489516029301098, + "loss": 1.1959, + "step": 2321 + }, + { + "epoch": 0.41423601819641426, + "grad_norm": 0.612910270690918, + "learning_rate": 0.0004489091688257303, + "loss": 1.1389, + "step": 2322 + }, + { + "epoch": 0.4144144144144144, + "grad_norm": 0.47932949662208557, + "learning_rate": 0.00044886671909911014, + "loss": 0.9523, + "step": 2323 + }, + { + "epoch": 0.4145928106324146, + "grad_norm": 0.49534985423088074, + "learning_rate": 0.0004488242537535835, + "loss": 1.1314, + "step": 2324 + }, + { + "epoch": 0.4147712068504148, + "grad_norm": 0.4879145920276642, + "learning_rate": 0.00044878177279248553, + "loss": 1.0072, + "step": 2325 + }, + { + "epoch": 0.41494960306841494, + "grad_norm": 0.6173017621040344, + "learning_rate": 0.00044873927621915267, + "loss": 1.3328, + "step": 2326 + }, + { + "epoch": 0.41512799928641514, + "grad_norm": 0.4791456162929535, + "learning_rate": 0.00044869676403692254, + "loss": 1.0026, + "step": 2327 + }, + { + "epoch": 0.4153063955044153, + "grad_norm": 0.4833122491836548, + "learning_rate": 0.0004486542362491341, + "loss": 0.9301, + "step": 2328 + }, + { + "epoch": 0.4154847917224155, + "grad_norm": 0.6367433667182922, + "learning_rate": 0.00044861169285912746, + "loss": 1.2782, + "step": 2329 + }, + { + "epoch": 0.4156631879404157, + "grad_norm": 0.47810354828834534, + "learning_rate": 0.0004485691338702439, + "loss": 0.8764, + "step": 2330 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.5335472226142883, + "learning_rate": 0.000448526559285826, + "loss": 1.033, + "step": 2331 + }, + { + "epoch": 0.416019980376416, + "grad_norm": 0.5508213043212891, + "learning_rate": 0.00044848396910921763, + "loss": 1.2068, + "step": 2332 + }, + { + "epoch": 0.4161983765944162, + "grad_norm": 0.5242919325828552, + "learning_rate": 0.00044844136334376366, + "loss": 1.1176, + "step": 2333 + }, + { + "epoch": 0.41637677281241636, + "grad_norm": 0.49891209602355957, + "learning_rate": 0.0004483987419928104, + "loss": 0.8453, + "step": 2334 + }, + { + "epoch": 0.41655516903041656, + "grad_norm": 0.4879932999610901, + "learning_rate": 0.0004483561050597054, + "loss": 0.8443, + "step": 2335 + }, + { + "epoch": 0.41673356524841676, + "grad_norm": 0.49955081939697266, + "learning_rate": 0.00044831345254779724, + "loss": 0.9135, + "step": 2336 + }, + { + "epoch": 0.4169119614664169, + "grad_norm": 0.5931764841079712, + "learning_rate": 0.0004482707844604359, + "loss": 0.8997, + "step": 2337 + }, + { + "epoch": 0.4170903576844171, + "grad_norm": 0.882546603679657, + "learning_rate": 0.00044822810080097245, + "loss": 1.1035, + "step": 2338 + }, + { + "epoch": 0.41726875390241724, + "grad_norm": 0.5271819233894348, + "learning_rate": 0.00044818540157275924, + "loss": 1.0151, + "step": 2339 + }, + { + "epoch": 0.41744715012041744, + "grad_norm": 0.5230399370193481, + "learning_rate": 0.00044814268677914983, + "loss": 1.21, + "step": 2340 + }, + { + "epoch": 0.41762554633841764, + "grad_norm": 0.5473104119300842, + "learning_rate": 0.0004480999564234991, + "loss": 1.2911, + "step": 2341 + }, + { + "epoch": 0.4178039425564178, + "grad_norm": 0.5277518033981323, + "learning_rate": 0.0004480572105091631, + "loss": 1.1298, + "step": 2342 + }, + { + "epoch": 0.417982338774418, + "grad_norm": 1.1111489534378052, + "learning_rate": 0.00044801444903949894, + "loss": 1.1337, + "step": 2343 + }, + { + "epoch": 0.4181607349924182, + "grad_norm": 0.5263912677764893, + "learning_rate": 0.00044797167201786526, + "loss": 0.9498, + "step": 2344 + }, + { + "epoch": 0.4183391312104183, + "grad_norm": 0.4941869378089905, + "learning_rate": 0.00044792887944762155, + "loss": 0.9439, + "step": 2345 + }, + { + "epoch": 0.4185175274284185, + "grad_norm": 0.5406994223594666, + "learning_rate": 0.0004478860713321289, + "loss": 1.0799, + "step": 2346 + }, + { + "epoch": 0.4186959236464187, + "grad_norm": 0.4971858263015747, + "learning_rate": 0.0004478432476747493, + "loss": 0.909, + "step": 2347 + }, + { + "epoch": 0.41887431986441886, + "grad_norm": 0.5575050115585327, + "learning_rate": 0.0004478004084788462, + "loss": 0.8755, + "step": 2348 + }, + { + "epoch": 0.41905271608241906, + "grad_norm": 0.6695868372917175, + "learning_rate": 0.00044775755374778413, + "loss": 1.1526, + "step": 2349 + }, + { + "epoch": 0.4192311123004192, + "grad_norm": 0.5547947883605957, + "learning_rate": 0.00044771468348492896, + "loss": 1.1636, + "step": 2350 + }, + { + "epoch": 0.4194095085184194, + "grad_norm": 1.2415190935134888, + "learning_rate": 0.00044767179769364754, + "loss": 0.9455, + "step": 2351 + }, + { + "epoch": 0.4195879047364196, + "grad_norm": 0.5791499614715576, + "learning_rate": 0.0004476288963773082, + "loss": 0.9658, + "step": 2352 + }, + { + "epoch": 0.41976630095441975, + "grad_norm": 0.576765239238739, + "learning_rate": 0.00044758597953928044, + "loss": 1.0575, + "step": 2353 + }, + { + "epoch": 0.41994469717241995, + "grad_norm": 0.5667539238929749, + "learning_rate": 0.0004475430471829348, + "loss": 1.1187, + "step": 2354 + }, + { + "epoch": 0.42012309339042014, + "grad_norm": 0.5511680245399475, + "learning_rate": 0.00044750009931164336, + "loss": 0.9857, + "step": 2355 + }, + { + "epoch": 0.4203014896084203, + "grad_norm": 2.5863170623779297, + "learning_rate": 0.00044745713592877904, + "loss": 0.9495, + "step": 2356 + }, + { + "epoch": 0.4204798858264205, + "grad_norm": 0.5761575102806091, + "learning_rate": 0.00044741415703771615, + "loss": 0.8793, + "step": 2357 + }, + { + "epoch": 0.4206582820444207, + "grad_norm": 2.08255672454834, + "learning_rate": 0.0004473711626418304, + "loss": 1.1166, + "step": 2358 + }, + { + "epoch": 0.4208366782624208, + "grad_norm": 1.4676316976547241, + "learning_rate": 0.00044732815274449856, + "loss": 1.1489, + "step": 2359 + }, + { + "epoch": 0.421015074480421, + "grad_norm": 0.5451486706733704, + "learning_rate": 0.00044728512734909845, + "loss": 0.9492, + "step": 2360 + }, + { + "epoch": 0.42119347069842117, + "grad_norm": 0.650875985622406, + "learning_rate": 0.0004472420864590093, + "loss": 1.0389, + "step": 2361 + }, + { + "epoch": 0.42137186691642137, + "grad_norm": 0.5705146193504333, + "learning_rate": 0.00044719903007761153, + "loss": 0.9989, + "step": 2362 + }, + { + "epoch": 0.42155026313442157, + "grad_norm": 0.4973379671573639, + "learning_rate": 0.00044715595820828694, + "loss": 1.0576, + "step": 2363 + }, + { + "epoch": 0.4217286593524217, + "grad_norm": 0.5207223892211914, + "learning_rate": 0.0004471128708544181, + "loss": 1.2406, + "step": 2364 + }, + { + "epoch": 0.4219070555704219, + "grad_norm": 0.5396470427513123, + "learning_rate": 0.00044706976801938927, + "loss": 1.1947, + "step": 2365 + }, + { + "epoch": 0.4220854517884221, + "grad_norm": 0.5786265730857849, + "learning_rate": 0.0004470266497065856, + "loss": 1.1694, + "step": 2366 + }, + { + "epoch": 0.42226384800642225, + "grad_norm": 0.5583718419075012, + "learning_rate": 0.0004469835159193937, + "loss": 1.1314, + "step": 2367 + }, + { + "epoch": 0.42244224422442245, + "grad_norm": 0.4932156205177307, + "learning_rate": 0.0004469403666612013, + "loss": 0.8307, + "step": 2368 + }, + { + "epoch": 0.42262064044242265, + "grad_norm": 0.48325487971305847, + "learning_rate": 0.0004468972019353972, + "loss": 1.0107, + "step": 2369 + }, + { + "epoch": 0.4227990366604228, + "grad_norm": 0.583512008190155, + "learning_rate": 0.0004468540217453715, + "loss": 0.897, + "step": 2370 + }, + { + "epoch": 0.422977432878423, + "grad_norm": 0.5151441097259521, + "learning_rate": 0.0004468108260945157, + "loss": 0.955, + "step": 2371 + }, + { + "epoch": 0.42315582909642313, + "grad_norm": 0.5649638772010803, + "learning_rate": 0.00044676761498622236, + "loss": 1.0448, + "step": 2372 + }, + { + "epoch": 0.42333422531442333, + "grad_norm": 0.7743201851844788, + "learning_rate": 0.0004467243884238852, + "loss": 1.0063, + "step": 2373 + }, + { + "epoch": 0.4235126215324235, + "grad_norm": 0.535677433013916, + "learning_rate": 0.0004466811464108992, + "loss": 1.2873, + "step": 2374 + }, + { + "epoch": 0.42369101775042367, + "grad_norm": 0.9543548822402954, + "learning_rate": 0.00044663788895066064, + "loss": 0.9269, + "step": 2375 + }, + { + "epoch": 0.42386941396842387, + "grad_norm": 0.5472463965415955, + "learning_rate": 0.00044659461604656687, + "loss": 1.1469, + "step": 2376 + }, + { + "epoch": 0.42404781018642407, + "grad_norm": 0.5283387303352356, + "learning_rate": 0.00044655132770201657, + "loss": 0.9037, + "step": 2377 + }, + { + "epoch": 0.4242262064044242, + "grad_norm": 0.8139501810073853, + "learning_rate": 0.00044650802392040957, + "loss": 0.9288, + "step": 2378 + }, + { + "epoch": 0.4244046026224244, + "grad_norm": 0.4796660542488098, + "learning_rate": 0.0004464647047051469, + "loss": 1.0693, + "step": 2379 + }, + { + "epoch": 0.4245829988404246, + "grad_norm": 0.648838222026825, + "learning_rate": 0.0004464213700596309, + "loss": 1.3307, + "step": 2380 + }, + { + "epoch": 0.42476139505842475, + "grad_norm": 0.5429944396018982, + "learning_rate": 0.0004463780199872651, + "loss": 1.1117, + "step": 2381 + }, + { + "epoch": 0.42493979127642495, + "grad_norm": 0.47701987624168396, + "learning_rate": 0.000446334654491454, + "loss": 0.9715, + "step": 2382 + }, + { + "epoch": 0.4251181874944251, + "grad_norm": 0.479390412569046, + "learning_rate": 0.00044629127357560366, + "loss": 0.7807, + "step": 2383 + }, + { + "epoch": 0.4252965837124253, + "grad_norm": 0.5513942241668701, + "learning_rate": 0.00044624787724312123, + "loss": 0.945, + "step": 2384 + }, + { + "epoch": 0.4254749799304255, + "grad_norm": 0.5833590626716614, + "learning_rate": 0.0004462044654974149, + "loss": 1.0181, + "step": 2385 + }, + { + "epoch": 0.42565337614842563, + "grad_norm": 0.5582590699195862, + "learning_rate": 0.00044616103834189426, + "loss": 1.2099, + "step": 2386 + }, + { + "epoch": 0.42583177236642583, + "grad_norm": 0.7029179930686951, + "learning_rate": 0.0004461175957799701, + "loss": 1.0159, + "step": 2387 + }, + { + "epoch": 0.42601016858442603, + "grad_norm": 0.6166037321090698, + "learning_rate": 0.0004460741378150544, + "loss": 1.157, + "step": 2388 + }, + { + "epoch": 0.4261885648024262, + "grad_norm": 0.4319639801979065, + "learning_rate": 0.0004460306644505603, + "loss": 1.0279, + "step": 2389 + }, + { + "epoch": 0.42636696102042637, + "grad_norm": 0.5709377527236938, + "learning_rate": 0.00044598717568990214, + "loss": 1.1857, + "step": 2390 + }, + { + "epoch": 0.42654535723842657, + "grad_norm": 0.5123762488365173, + "learning_rate": 0.00044594367153649554, + "loss": 0.8416, + "step": 2391 + }, + { + "epoch": 0.4267237534564267, + "grad_norm": 0.5814945101737976, + "learning_rate": 0.0004459001519937573, + "loss": 0.8822, + "step": 2392 + }, + { + "epoch": 0.4269021496744269, + "grad_norm": 0.5209176540374756, + "learning_rate": 0.0004458566170651055, + "loss": 0.8742, + "step": 2393 + }, + { + "epoch": 0.42708054589242705, + "grad_norm": 0.5272184610366821, + "learning_rate": 0.00044581306675395917, + "loss": 0.9116, + "step": 2394 + }, + { + "epoch": 0.42725894211042725, + "grad_norm": 0.6671530604362488, + "learning_rate": 0.00044576950106373894, + "loss": 1.5091, + "step": 2395 + }, + { + "epoch": 0.42743733832842745, + "grad_norm": 0.5785043835639954, + "learning_rate": 0.00044572591999786625, + "loss": 0.9558, + "step": 2396 + }, + { + "epoch": 0.4276157345464276, + "grad_norm": 0.5078967213630676, + "learning_rate": 0.0004456823235597641, + "loss": 0.9042, + "step": 2397 + }, + { + "epoch": 0.4277941307644278, + "grad_norm": 0.5760866403579712, + "learning_rate": 0.0004456387117528564, + "loss": 1.1446, + "step": 2398 + }, + { + "epoch": 0.427972526982428, + "grad_norm": 0.5377951264381409, + "learning_rate": 0.00044559508458056853, + "loss": 1.185, + "step": 2399 + }, + { + "epoch": 0.42815092320042814, + "grad_norm": 0.5057874917984009, + "learning_rate": 0.0004455514420463269, + "loss": 1.0324, + "step": 2400 + }, + { + "epoch": 0.42832931941842833, + "grad_norm": 0.45790576934814453, + "learning_rate": 0.0004455077841535591, + "loss": 1.1008, + "step": 2401 + }, + { + "epoch": 0.42850771563642853, + "grad_norm": 0.5488193035125732, + "learning_rate": 0.0004454641109056941, + "loss": 1.0925, + "step": 2402 + }, + { + "epoch": 0.4286861118544287, + "grad_norm": 0.5256093740463257, + "learning_rate": 0.0004454204223061619, + "loss": 1.1341, + "step": 2403 + }, + { + "epoch": 0.4288645080724289, + "grad_norm": 0.4891919493675232, + "learning_rate": 0.00044537671835839386, + "loss": 1.0355, + "step": 2404 + }, + { + "epoch": 0.429042904290429, + "grad_norm": 0.4800609052181244, + "learning_rate": 0.0004453329990658225, + "loss": 0.9841, + "step": 2405 + }, + { + "epoch": 0.4292213005084292, + "grad_norm": 0.5242645740509033, + "learning_rate": 0.00044528926443188137, + "loss": 0.9167, + "step": 2406 + }, + { + "epoch": 0.4293996967264294, + "grad_norm": 0.7704287767410278, + "learning_rate": 0.0004452455144600055, + "loss": 1.1246, + "step": 2407 + }, + { + "epoch": 0.42957809294442956, + "grad_norm": 0.5126969814300537, + "learning_rate": 0.0004452017491536309, + "loss": 1.1712, + "step": 2408 + }, + { + "epoch": 0.42975648916242976, + "grad_norm": 0.5316195487976074, + "learning_rate": 0.00044515796851619493, + "loss": 1.017, + "step": 2409 + }, + { + "epoch": 0.42993488538042995, + "grad_norm": 0.5696223974227905, + "learning_rate": 0.0004451141725511362, + "loss": 1.0804, + "step": 2410 + }, + { + "epoch": 0.4301132815984301, + "grad_norm": 0.45580875873565674, + "learning_rate": 0.00044507036126189414, + "loss": 0.9472, + "step": 2411 + }, + { + "epoch": 0.4302916778164303, + "grad_norm": 0.579031765460968, + "learning_rate": 0.00044502653465191, + "loss": 1.0372, + "step": 2412 + }, + { + "epoch": 0.4304700740344305, + "grad_norm": 0.5286597609519958, + "learning_rate": 0.00044498269272462567, + "loss": 1.2533, + "step": 2413 + }, + { + "epoch": 0.43064847025243064, + "grad_norm": 0.6354557275772095, + "learning_rate": 0.00044493883548348456, + "loss": 1.0672, + "step": 2414 + }, + { + "epoch": 0.43082686647043084, + "grad_norm": 0.6015318036079407, + "learning_rate": 0.0004448949629319313, + "loss": 1.1149, + "step": 2415 + }, + { + "epoch": 0.431005262688431, + "grad_norm": 0.5161615014076233, + "learning_rate": 0.0004448510750734114, + "loss": 1.1315, + "step": 2416 + }, + { + "epoch": 0.4311836589064312, + "grad_norm": 0.5163938403129578, + "learning_rate": 0.000444807171911372, + "loss": 1.0624, + "step": 2417 + }, + { + "epoch": 0.4313620551244314, + "grad_norm": 0.558363676071167, + "learning_rate": 0.00044476325344926113, + "loss": 1.3461, + "step": 2418 + }, + { + "epoch": 0.4315404513424315, + "grad_norm": 0.5142703652381897, + "learning_rate": 0.00044471931969052817, + "loss": 0.9696, + "step": 2419 + }, + { + "epoch": 0.4317188475604317, + "grad_norm": 0.5983433127403259, + "learning_rate": 0.00044467537063862353, + "loss": 1.0887, + "step": 2420 + }, + { + "epoch": 0.4318972437784319, + "grad_norm": 0.5289214253425598, + "learning_rate": 0.00044463140629699916, + "loss": 1.2276, + "step": 2421 + }, + { + "epoch": 0.43207563999643206, + "grad_norm": 0.5035809874534607, + "learning_rate": 0.00044458742666910785, + "loss": 1.048, + "step": 2422 + }, + { + "epoch": 0.43225403621443226, + "grad_norm": 0.5544365048408508, + "learning_rate": 0.0004445434317584038, + "loss": 1.0358, + "step": 2423 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.4954850673675537, + "learning_rate": 0.00044449942156834236, + "loss": 0.9533, + "step": 2424 + }, + { + "epoch": 0.4326108286504326, + "grad_norm": 0.5363656282424927, + "learning_rate": 0.0004444553961023801, + "loss": 1.0444, + "step": 2425 + }, + { + "epoch": 0.4327892248684328, + "grad_norm": 0.4978641867637634, + "learning_rate": 0.00044441135536397455, + "loss": 1.0043, + "step": 2426 + }, + { + "epoch": 0.43296762108643294, + "grad_norm": 0.536332905292511, + "learning_rate": 0.0004443672993565849, + "loss": 0.8316, + "step": 2427 + }, + { + "epoch": 0.43314601730443314, + "grad_norm": 0.47319361567497253, + "learning_rate": 0.0004443232280836712, + "loss": 0.9772, + "step": 2428 + }, + { + "epoch": 0.43332441352243334, + "grad_norm": 0.47262993454933167, + "learning_rate": 0.0004442791415486948, + "loss": 0.8884, + "step": 2429 + }, + { + "epoch": 0.4335028097404335, + "grad_norm": 0.5313016176223755, + "learning_rate": 0.0004442350397551182, + "loss": 1.0347, + "step": 2430 + }, + { + "epoch": 0.4336812059584337, + "grad_norm": 0.47843775153160095, + "learning_rate": 0.0004441909227064052, + "loss": 0.9594, + "step": 2431 + }, + { + "epoch": 0.4338596021764339, + "grad_norm": 0.5179790258407593, + "learning_rate": 0.00044414679040602066, + "loss": 0.9443, + "step": 2432 + }, + { + "epoch": 0.434037998394434, + "grad_norm": 0.5228581428527832, + "learning_rate": 0.0004441026428574308, + "loss": 0.8851, + "step": 2433 + }, + { + "epoch": 0.4342163946124342, + "grad_norm": 0.5385475754737854, + "learning_rate": 0.0004440584800641029, + "loss": 1.1273, + "step": 2434 + }, + { + "epoch": 0.4343947908304344, + "grad_norm": 0.5584160685539246, + "learning_rate": 0.0004440143020295054, + "loss": 1.1722, + "step": 2435 + }, + { + "epoch": 0.43457318704843456, + "grad_norm": 0.5520399808883667, + "learning_rate": 0.0004439701087571082, + "loss": 1.1687, + "step": 2436 + }, + { + "epoch": 0.43475158326643476, + "grad_norm": 0.5202842950820923, + "learning_rate": 0.000443925900250382, + "loss": 1.1505, + "step": 2437 + }, + { + "epoch": 0.4349299794844349, + "grad_norm": 0.49009421467781067, + "learning_rate": 0.0004438816765127992, + "loss": 1.0714, + "step": 2438 + }, + { + "epoch": 0.4351083757024351, + "grad_norm": 0.5388837456703186, + "learning_rate": 0.0004438374375478329, + "loss": 0.9845, + "step": 2439 + }, + { + "epoch": 0.4352867719204353, + "grad_norm": 0.5120062232017517, + "learning_rate": 0.0004437931833589577, + "loss": 0.8921, + "step": 2440 + }, + { + "epoch": 0.43546516813843544, + "grad_norm": 0.4710652530193329, + "learning_rate": 0.00044374891394964925, + "loss": 0.9668, + "step": 2441 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 1.251098871231079, + "learning_rate": 0.0004437046293233845, + "loss": 1.2214, + "step": 2442 + }, + { + "epoch": 0.43582196057443584, + "grad_norm": 0.5326117873191833, + "learning_rate": 0.00044366032948364145, + "loss": 1.2166, + "step": 2443 + }, + { + "epoch": 0.436000356792436, + "grad_norm": 0.558892011642456, + "learning_rate": 0.0004436160144338995, + "loss": 1.0941, + "step": 2444 + }, + { + "epoch": 0.4361787530104362, + "grad_norm": 0.5412200093269348, + "learning_rate": 0.0004435716841776391, + "loss": 0.9558, + "step": 2445 + }, + { + "epoch": 0.4363571492284364, + "grad_norm": 0.50641268491745, + "learning_rate": 0.00044352733871834193, + "loss": 0.87, + "step": 2446 + }, + { + "epoch": 0.4365355454464365, + "grad_norm": 0.5298489332199097, + "learning_rate": 0.0004434829780594909, + "loss": 1.0362, + "step": 2447 + }, + { + "epoch": 0.4367139416644367, + "grad_norm": 0.5438408851623535, + "learning_rate": 0.00044343860220456995, + "loss": 0.8945, + "step": 2448 + }, + { + "epoch": 0.43689233788243687, + "grad_norm": 0.5930485725402832, + "learning_rate": 0.00044339421115706444, + "loss": 1.1371, + "step": 2449 + }, + { + "epoch": 0.43707073410043706, + "grad_norm": 0.5204629302024841, + "learning_rate": 0.00044334980492046085, + "loss": 1.047, + "step": 2450 + }, + { + "epoch": 0.43724913031843726, + "grad_norm": 0.46508142352104187, + "learning_rate": 0.0004433053834982468, + "loss": 0.9044, + "step": 2451 + }, + { + "epoch": 0.4374275265364374, + "grad_norm": 0.5729008913040161, + "learning_rate": 0.00044326094689391105, + "loss": 1.0573, + "step": 2452 + }, + { + "epoch": 0.4376059227544376, + "grad_norm": 0.5557507276535034, + "learning_rate": 0.00044321649511094375, + "loss": 1.1433, + "step": 2453 + }, + { + "epoch": 0.4377843189724378, + "grad_norm": 0.4726797044277191, + "learning_rate": 0.00044317202815283605, + "loss": 1.0822, + "step": 2454 + }, + { + "epoch": 0.43796271519043795, + "grad_norm": 0.5183228254318237, + "learning_rate": 0.00044312754602308035, + "loss": 1.1219, + "step": 2455 + }, + { + "epoch": 0.43814111140843814, + "grad_norm": 0.5738768577575684, + "learning_rate": 0.00044308304872517037, + "loss": 1.2338, + "step": 2456 + }, + { + "epoch": 0.43831950762643834, + "grad_norm": 0.5397445559501648, + "learning_rate": 0.0004430385362626008, + "loss": 1.1709, + "step": 2457 + }, + { + "epoch": 0.4384979038444385, + "grad_norm": 0.5338584780693054, + "learning_rate": 0.0004429940086388676, + "loss": 1.1662, + "step": 2458 + }, + { + "epoch": 0.4386763000624387, + "grad_norm": 0.5330877304077148, + "learning_rate": 0.00044294946585746815, + "loss": 0.9068, + "step": 2459 + }, + { + "epoch": 0.4388546962804388, + "grad_norm": 0.5154666304588318, + "learning_rate": 0.0004429049079219006, + "loss": 0.8542, + "step": 2460 + }, + { + "epoch": 0.439033092498439, + "grad_norm": 0.5239487886428833, + "learning_rate": 0.00044286033483566456, + "loss": 1.1177, + "step": 2461 + }, + { + "epoch": 0.4392114887164392, + "grad_norm": 1.6719294786453247, + "learning_rate": 0.00044281574660226086, + "loss": 0.7778, + "step": 2462 + }, + { + "epoch": 0.43938988493443937, + "grad_norm": 0.515557587146759, + "learning_rate": 0.0004427711432251914, + "loss": 0.9718, + "step": 2463 + }, + { + "epoch": 0.43956828115243957, + "grad_norm": 0.51800537109375, + "learning_rate": 0.0004427265247079594, + "loss": 0.8828, + "step": 2464 + }, + { + "epoch": 0.43974667737043976, + "grad_norm": 0.7687791585922241, + "learning_rate": 0.00044268189105406896, + "loss": 1.1033, + "step": 2465 + }, + { + "epoch": 0.4399250735884399, + "grad_norm": 0.6673977971076965, + "learning_rate": 0.00044263724226702573, + "loss": 1.1026, + "step": 2466 + }, + { + "epoch": 0.4401034698064401, + "grad_norm": 1.0776582956314087, + "learning_rate": 0.0004425925783503364, + "loss": 1.1335, + "step": 2467 + }, + { + "epoch": 0.4402818660244403, + "grad_norm": 0.5958316326141357, + "learning_rate": 0.00044254789930750884, + "loss": 0.968, + "step": 2468 + }, + { + "epoch": 0.44046026224244045, + "grad_norm": 0.5925278663635254, + "learning_rate": 0.0004425032051420522, + "loss": 1.2383, + "step": 2469 + }, + { + "epoch": 0.44063865846044065, + "grad_norm": 0.5400102138519287, + "learning_rate": 0.00044245849585747656, + "loss": 1.0723, + "step": 2470 + }, + { + "epoch": 0.4408170546784408, + "grad_norm": 0.5745581388473511, + "learning_rate": 0.00044241377145729356, + "loss": 1.1578, + "step": 2471 + }, + { + "epoch": 0.440995450896441, + "grad_norm": 0.514392077922821, + "learning_rate": 0.00044236903194501566, + "loss": 0.9727, + "step": 2472 + }, + { + "epoch": 0.4411738471144412, + "grad_norm": 1.0300344228744507, + "learning_rate": 0.00044232427732415687, + "loss": 1.1997, + "step": 2473 + }, + { + "epoch": 0.44135224333244133, + "grad_norm": 0.45754095911979675, + "learning_rate": 0.0004422795075982321, + "loss": 0.9891, + "step": 2474 + }, + { + "epoch": 0.44153063955044153, + "grad_norm": 0.5444067120552063, + "learning_rate": 0.0004422347227707575, + "loss": 1.0424, + "step": 2475 + }, + { + "epoch": 0.4417090357684417, + "grad_norm": 2.9430816173553467, + "learning_rate": 0.0004421899228452505, + "loss": 1.0375, + "step": 2476 + }, + { + "epoch": 0.44188743198644187, + "grad_norm": 0.570728600025177, + "learning_rate": 0.00044214510782522966, + "loss": 1.257, + "step": 2477 + }, + { + "epoch": 0.44206582820444207, + "grad_norm": 0.5782069563865662, + "learning_rate": 0.0004421002777142148, + "loss": 1.0603, + "step": 2478 + }, + { + "epoch": 0.44224422442244227, + "grad_norm": 0.505310595035553, + "learning_rate": 0.00044205543251572664, + "loss": 0.9608, + "step": 2479 + }, + { + "epoch": 0.4424226206404424, + "grad_norm": 0.5967997312545776, + "learning_rate": 0.00044201057223328767, + "loss": 1.2066, + "step": 2480 + }, + { + "epoch": 0.4426010168584426, + "grad_norm": 0.5399075746536255, + "learning_rate": 0.00044196569687042085, + "loss": 1.0442, + "step": 2481 + }, + { + "epoch": 0.4427794130764428, + "grad_norm": 0.5512840151786804, + "learning_rate": 0.0004419208064306509, + "loss": 1.1372, + "step": 2482 + }, + { + "epoch": 0.44295780929444295, + "grad_norm": 0.540708601474762, + "learning_rate": 0.0004418759009175033, + "loss": 1.2523, + "step": 2483 + }, + { + "epoch": 0.44313620551244315, + "grad_norm": 0.5288528800010681, + "learning_rate": 0.0004418309803345051, + "loss": 1.1503, + "step": 2484 + }, + { + "epoch": 0.4433146017304433, + "grad_norm": 0.48563152551651, + "learning_rate": 0.0004417860446851841, + "loss": 0.9179, + "step": 2485 + }, + { + "epoch": 0.4434929979484435, + "grad_norm": 0.4692551791667938, + "learning_rate": 0.00044174109397306983, + "loss": 0.9259, + "step": 2486 + }, + { + "epoch": 0.4436713941664437, + "grad_norm": 0.5046235918998718, + "learning_rate": 0.0004416961282016926, + "loss": 0.8989, + "step": 2487 + }, + { + "epoch": 0.44384979038444383, + "grad_norm": 0.5326201915740967, + "learning_rate": 0.00044165114737458377, + "loss": 1.1311, + "step": 2488 + }, + { + "epoch": 0.44402818660244403, + "grad_norm": 0.4889252185821533, + "learning_rate": 0.00044160615149527643, + "loss": 0.9787, + "step": 2489 + }, + { + "epoch": 0.44420658282044423, + "grad_norm": 0.5168465971946716, + "learning_rate": 0.0004415611405673044, + "loss": 1.1548, + "step": 2490 + }, + { + "epoch": 0.44438497903844437, + "grad_norm": 0.49787530303001404, + "learning_rate": 0.0004415161145942028, + "loss": 1.1097, + "step": 2491 + }, + { + "epoch": 0.44456337525644457, + "grad_norm": 0.47516435384750366, + "learning_rate": 0.000441471073579508, + "loss": 1.0759, + "step": 2492 + }, + { + "epoch": 0.44474177147444477, + "grad_norm": 0.5186420679092407, + "learning_rate": 0.0004414260175267574, + "loss": 0.9333, + "step": 2493 + }, + { + "epoch": 0.4449201676924449, + "grad_norm": 0.538212239742279, + "learning_rate": 0.0004413809464394899, + "loss": 1.0112, + "step": 2494 + }, + { + "epoch": 0.4450985639104451, + "grad_norm": 0.6391366124153137, + "learning_rate": 0.0004413358603212451, + "loss": 1.0763, + "step": 2495 + }, + { + "epoch": 0.44527696012844525, + "grad_norm": 0.49085238575935364, + "learning_rate": 0.00044129075917556415, + "loss": 0.9748, + "step": 2496 + }, + { + "epoch": 0.44545535634644545, + "grad_norm": 3.5137956142425537, + "learning_rate": 0.00044124564300598943, + "loss": 0.9487, + "step": 2497 + }, + { + "epoch": 0.44563375256444565, + "grad_norm": 0.5351426601409912, + "learning_rate": 0.00044120051181606413, + "loss": 1.0537, + "step": 2498 + }, + { + "epoch": 0.4458121487824458, + "grad_norm": 0.5824923515319824, + "learning_rate": 0.0004411553656093329, + "loss": 0.9824, + "step": 2499 + }, + { + "epoch": 0.445990545000446, + "grad_norm": 0.5528644323348999, + "learning_rate": 0.00044111020438934153, + "loss": 1.1647, + "step": 2500 + }, + { + "epoch": 0.4461689412184462, + "grad_norm": 0.6510967016220093, + "learning_rate": 0.0004410650281596369, + "loss": 1.101, + "step": 2501 + }, + { + "epoch": 0.44634733743644633, + "grad_norm": 0.5696078538894653, + "learning_rate": 0.00044101983692376723, + "loss": 1.0418, + "step": 2502 + }, + { + "epoch": 0.44652573365444653, + "grad_norm": 0.5028705596923828, + "learning_rate": 0.0004409746306852818, + "loss": 0.8853, + "step": 2503 + }, + { + "epoch": 0.44670412987244673, + "grad_norm": 0.5344279408454895, + "learning_rate": 0.00044092940944773104, + "loss": 1.0267, + "step": 2504 + }, + { + "epoch": 0.4468825260904469, + "grad_norm": 0.5304036140441895, + "learning_rate": 0.00044088417321466657, + "loss": 1.0281, + "step": 2505 + }, + { + "epoch": 0.4470609223084471, + "grad_norm": 0.5835813879966736, + "learning_rate": 0.0004408389219896413, + "loss": 1.1255, + "step": 2506 + }, + { + "epoch": 0.4472393185264472, + "grad_norm": 0.48519715666770935, + "learning_rate": 0.0004407936557762093, + "loss": 1.0046, + "step": 2507 + }, + { + "epoch": 0.4474177147444474, + "grad_norm": 0.5301933288574219, + "learning_rate": 0.0004407483745779256, + "loss": 1.0998, + "step": 2508 + }, + { + "epoch": 0.4475961109624476, + "grad_norm": 0.4589798152446747, + "learning_rate": 0.00044070307839834665, + "loss": 0.7962, + "step": 2509 + }, + { + "epoch": 0.44777450718044776, + "grad_norm": 0.496530681848526, + "learning_rate": 0.00044065776724103, + "loss": 0.7726, + "step": 2510 + }, + { + "epoch": 0.44795290339844795, + "grad_norm": 0.4880942702293396, + "learning_rate": 0.00044061244110953437, + "loss": 0.8395, + "step": 2511 + }, + { + "epoch": 0.44813129961644815, + "grad_norm": 0.5023302435874939, + "learning_rate": 0.0004405671000074196, + "loss": 0.9507, + "step": 2512 + }, + { + "epoch": 0.4483096958344483, + "grad_norm": 0.561840832233429, + "learning_rate": 0.00044052174393824687, + "loss": 1.0694, + "step": 2513 + }, + { + "epoch": 0.4484880920524485, + "grad_norm": 0.5529274344444275, + "learning_rate": 0.00044047637290557835, + "loss": 1.221, + "step": 2514 + }, + { + "epoch": 0.4486664882704487, + "grad_norm": 0.44090861082077026, + "learning_rate": 0.00044043098691297746, + "loss": 1.0016, + "step": 2515 + }, + { + "epoch": 0.44884488448844884, + "grad_norm": 0.5670448541641235, + "learning_rate": 0.0004403855859640088, + "loss": 0.9971, + "step": 2516 + }, + { + "epoch": 0.44902328070644904, + "grad_norm": 0.45753970742225647, + "learning_rate": 0.00044034017006223817, + "loss": 0.7955, + "step": 2517 + }, + { + "epoch": 0.4492016769244492, + "grad_norm": 0.7095730900764465, + "learning_rate": 0.0004402947392112324, + "loss": 0.7677, + "step": 2518 + }, + { + "epoch": 0.4493800731424494, + "grad_norm": 0.5540285110473633, + "learning_rate": 0.00044024929341455977, + "loss": 1.34, + "step": 2519 + }, + { + "epoch": 0.4495584693604496, + "grad_norm": 0.5628980994224548, + "learning_rate": 0.00044020383267578955, + "loss": 1.0199, + "step": 2520 + }, + { + "epoch": 0.4497368655784497, + "grad_norm": 0.5465995073318481, + "learning_rate": 0.0004401583569984921, + "loss": 1.1564, + "step": 2521 + }, + { + "epoch": 0.4499152617964499, + "grad_norm": 0.4945712089538574, + "learning_rate": 0.00044011286638623916, + "loss": 1.0976, + "step": 2522 + }, + { + "epoch": 0.4500936580144501, + "grad_norm": 0.5624112486839294, + "learning_rate": 0.00044006736084260345, + "loss": 1.0677, + "step": 2523 + }, + { + "epoch": 0.45027205423245026, + "grad_norm": 0.5418776869773865, + "learning_rate": 0.0004400218403711591, + "loss": 1.2596, + "step": 2524 + }, + { + "epoch": 0.45045045045045046, + "grad_norm": 0.4920550584793091, + "learning_rate": 0.0004399763049754811, + "loss": 1.0434, + "step": 2525 + }, + { + "epoch": 0.45062884666845066, + "grad_norm": 0.5869379639625549, + "learning_rate": 0.0004399307546591459, + "loss": 1.0799, + "step": 2526 + }, + { + "epoch": 0.4508072428864508, + "grad_norm": 0.619750440120697, + "learning_rate": 0.0004398851894257309, + "loss": 1.1675, + "step": 2527 + }, + { + "epoch": 0.450985639104451, + "grad_norm": 0.5530658960342407, + "learning_rate": 0.00043983960927881493, + "loss": 0.9814, + "step": 2528 + }, + { + "epoch": 0.45116403532245114, + "grad_norm": 0.5448163151741028, + "learning_rate": 0.0004397940142219776, + "loss": 1.2051, + "step": 2529 + }, + { + "epoch": 0.45134243154045134, + "grad_norm": 0.5526171922683716, + "learning_rate": 0.00043974840425880027, + "loss": 1.2632, + "step": 2530 + }, + { + "epoch": 0.45152082775845154, + "grad_norm": 0.46508967876434326, + "learning_rate": 0.0004397027793928648, + "loss": 0.859, + "step": 2531 + }, + { + "epoch": 0.4516992239764517, + "grad_norm": 0.9926976561546326, + "learning_rate": 0.00043965713962775465, + "loss": 0.924, + "step": 2532 + }, + { + "epoch": 0.4518776201944519, + "grad_norm": 0.5205312967300415, + "learning_rate": 0.00043961148496705443, + "loss": 0.8696, + "step": 2533 + }, + { + "epoch": 0.4520560164124521, + "grad_norm": 0.5645661354064941, + "learning_rate": 0.0004395658154143498, + "loss": 1.0412, + "step": 2534 + }, + { + "epoch": 0.4522344126304522, + "grad_norm": 0.6434807181358337, + "learning_rate": 0.00043952013097322754, + "loss": 1.1406, + "step": 2535 + }, + { + "epoch": 0.4524128088484524, + "grad_norm": 0.5284314751625061, + "learning_rate": 0.00043947443164727573, + "loss": 0.9631, + "step": 2536 + }, + { + "epoch": 0.4525912050664526, + "grad_norm": 0.5045761466026306, + "learning_rate": 0.00043942871744008375, + "loss": 0.9317, + "step": 2537 + }, + { + "epoch": 0.45276960128445276, + "grad_norm": 0.5030120015144348, + "learning_rate": 0.00043938298835524166, + "loss": 0.8189, + "step": 2538 + }, + { + "epoch": 0.45294799750245296, + "grad_norm": 0.4574146270751953, + "learning_rate": 0.0004393372443963412, + "loss": 0.7983, + "step": 2539 + }, + { + "epoch": 0.4531263937204531, + "grad_norm": 0.5439005494117737, + "learning_rate": 0.00043929148556697505, + "loss": 1.0398, + "step": 2540 + }, + { + "epoch": 0.4533047899384533, + "grad_norm": 0.5667563080787659, + "learning_rate": 0.0004392457118707371, + "loss": 1.1755, + "step": 2541 + }, + { + "epoch": 0.4534831861564535, + "grad_norm": 0.5080005526542664, + "learning_rate": 0.0004391999233112224, + "loss": 1.0836, + "step": 2542 + }, + { + "epoch": 0.45366158237445364, + "grad_norm": 0.4875447750091553, + "learning_rate": 0.00043915411989202706, + "loss": 0.8928, + "step": 2543 + }, + { + "epoch": 0.45383997859245384, + "grad_norm": 0.5923610925674438, + "learning_rate": 0.0004391083016167486, + "loss": 1.0488, + "step": 2544 + }, + { + "epoch": 0.45401837481045404, + "grad_norm": 0.49262282252311707, + "learning_rate": 0.00043906246848898544, + "loss": 0.9689, + "step": 2545 + }, + { + "epoch": 0.4541967710284542, + "grad_norm": 0.4940873086452484, + "learning_rate": 0.0004390166205123375, + "loss": 1.0168, + "step": 2546 + }, + { + "epoch": 0.4543751672464544, + "grad_norm": 0.44292840361595154, + "learning_rate": 0.00043897075769040543, + "loss": 0.6536, + "step": 2547 + }, + { + "epoch": 0.4545535634644546, + "grad_norm": 0.4958019256591797, + "learning_rate": 0.00043892488002679144, + "loss": 0.918, + "step": 2548 + }, + { + "epoch": 0.4547319596824547, + "grad_norm": 8.831953048706055, + "learning_rate": 0.0004388789875250986, + "loss": 1.2642, + "step": 2549 + }, + { + "epoch": 0.4549103559004549, + "grad_norm": 0.8993814587593079, + "learning_rate": 0.0004388330801889314, + "loss": 1.0951, + "step": 2550 + }, + { + "epoch": 0.45508875211845506, + "grad_norm": 0.47165387868881226, + "learning_rate": 0.00043878715802189526, + "loss": 0.9744, + "step": 2551 + }, + { + "epoch": 0.45526714833645526, + "grad_norm": 0.4588290750980377, + "learning_rate": 0.0004387412210275971, + "loss": 0.7959, + "step": 2552 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.5076047778129578, + "learning_rate": 0.00043869526920964466, + "loss": 1.1043, + "step": 2553 + }, + { + "epoch": 0.4556239407724556, + "grad_norm": 0.5313244462013245, + "learning_rate": 0.0004386493025716469, + "loss": 1.1507, + "step": 2554 + }, + { + "epoch": 0.4558023369904558, + "grad_norm": 0.7201479077339172, + "learning_rate": 0.00043860332111721407, + "loss": 1.0751, + "step": 2555 + }, + { + "epoch": 0.455980733208456, + "grad_norm": 0.503410816192627, + "learning_rate": 0.0004385573248499576, + "loss": 1.0373, + "step": 2556 + }, + { + "epoch": 0.45615912942645614, + "grad_norm": 0.5491756200790405, + "learning_rate": 0.00043851131377349004, + "loss": 1.0991, + "step": 2557 + }, + { + "epoch": 0.45633752564445634, + "grad_norm": 0.5328734517097473, + "learning_rate": 0.0004384652878914249, + "loss": 1.0833, + "step": 2558 + }, + { + "epoch": 0.45651592186245654, + "grad_norm": 0.564434826374054, + "learning_rate": 0.00043841924720737724, + "loss": 1.1307, + "step": 2559 + }, + { + "epoch": 0.4566943180804567, + "grad_norm": 0.5131104588508606, + "learning_rate": 0.000438373191724963, + "loss": 0.9291, + "step": 2560 + }, + { + "epoch": 0.4568727142984569, + "grad_norm": 0.4851999282836914, + "learning_rate": 0.0004383271214477993, + "loss": 0.8158, + "step": 2561 + }, + { + "epoch": 0.457051110516457, + "grad_norm": 0.5580843091011047, + "learning_rate": 0.00043828103637950455, + "loss": 1.1642, + "step": 2562 + }, + { + "epoch": 0.4572295067344572, + "grad_norm": 0.5292948484420776, + "learning_rate": 0.00043823493652369824, + "loss": 1.0218, + "step": 2563 + }, + { + "epoch": 0.4574079029524574, + "grad_norm": 0.4889855682849884, + "learning_rate": 0.000438188821884001, + "loss": 0.9337, + "step": 2564 + }, + { + "epoch": 0.45758629917045757, + "grad_norm": 0.5168807506561279, + "learning_rate": 0.0004381426924640346, + "loss": 1.0195, + "step": 2565 + }, + { + "epoch": 0.45776469538845777, + "grad_norm": 0.5251139998435974, + "learning_rate": 0.0004380965482674222, + "loss": 1.0455, + "step": 2566 + }, + { + "epoch": 0.45794309160645796, + "grad_norm": 0.49821674823760986, + "learning_rate": 0.00043805038929778785, + "loss": 0.9241, + "step": 2567 + }, + { + "epoch": 0.4581214878244581, + "grad_norm": 0.48444879055023193, + "learning_rate": 0.0004380042155587568, + "loss": 1.0078, + "step": 2568 + }, + { + "epoch": 0.4582998840424583, + "grad_norm": 0.4438982903957367, + "learning_rate": 0.00043795802705395555, + "loss": 0.7366, + "step": 2569 + }, + { + "epoch": 0.4584782802604585, + "grad_norm": 0.48436489701271057, + "learning_rate": 0.0004379118237870118, + "loss": 0.9042, + "step": 2570 + }, + { + "epoch": 0.45865667647845865, + "grad_norm": 0.5146360397338867, + "learning_rate": 0.0004378656057615542, + "loss": 0.9988, + "step": 2571 + }, + { + "epoch": 0.45883507269645885, + "grad_norm": 0.5639342665672302, + "learning_rate": 0.00043781937298121275, + "loss": 1.2857, + "step": 2572 + }, + { + "epoch": 0.459013468914459, + "grad_norm": 0.5131158828735352, + "learning_rate": 0.00043777312544961865, + "loss": 1.1494, + "step": 2573 + }, + { + "epoch": 0.4591918651324592, + "grad_norm": 0.5281994342803955, + "learning_rate": 0.0004377268631704041, + "loss": 1.1338, + "step": 2574 + }, + { + "epoch": 0.4593702613504594, + "grad_norm": 0.6242533922195435, + "learning_rate": 0.0004376805861472024, + "loss": 1.1631, + "step": 2575 + }, + { + "epoch": 0.45954865756845953, + "grad_norm": 0.48754405975341797, + "learning_rate": 0.0004376342943836483, + "loss": 1.016, + "step": 2576 + }, + { + "epoch": 0.4597270537864597, + "grad_norm": 0.5026589632034302, + "learning_rate": 0.0004375879878833775, + "loss": 1.0022, + "step": 2577 + }, + { + "epoch": 0.4599054500044599, + "grad_norm": 0.4625113606452942, + "learning_rate": 0.0004375416666500268, + "loss": 0.942, + "step": 2578 + }, + { + "epoch": 0.46008384622246007, + "grad_norm": 0.552786111831665, + "learning_rate": 0.00043749533068723436, + "loss": 1.1234, + "step": 2579 + }, + { + "epoch": 0.46026224244046027, + "grad_norm": 0.5415485501289368, + "learning_rate": 0.0004374489799986393, + "loss": 0.8123, + "step": 2580 + }, + { + "epoch": 0.46044063865846047, + "grad_norm": 0.49887794256210327, + "learning_rate": 0.000437402614587882, + "loss": 1.0032, + "step": 2581 + }, + { + "epoch": 0.4606190348764606, + "grad_norm": 0.5717129111289978, + "learning_rate": 0.00043735623445860397, + "loss": 1.2778, + "step": 2582 + }, + { + "epoch": 0.4607974310944608, + "grad_norm": 0.4855397939682007, + "learning_rate": 0.00043730983961444794, + "loss": 1.0836, + "step": 2583 + }, + { + "epoch": 0.46097582731246095, + "grad_norm": 0.46733468770980835, + "learning_rate": 0.0004372634300590578, + "loss": 0.9161, + "step": 2584 + }, + { + "epoch": 0.46115422353046115, + "grad_norm": 0.5448064804077148, + "learning_rate": 0.0004372170057960783, + "loss": 1.0957, + "step": 2585 + }, + { + "epoch": 0.46133261974846135, + "grad_norm": 0.5003434419631958, + "learning_rate": 0.0004371705668291558, + "loss": 1.1793, + "step": 2586 + }, + { + "epoch": 0.4615110159664615, + "grad_norm": 0.5096003413200378, + "learning_rate": 0.00043712411316193755, + "loss": 1.29, + "step": 2587 + }, + { + "epoch": 0.4616894121844617, + "grad_norm": 0.5056552290916443, + "learning_rate": 0.00043707764479807194, + "loss": 1.0382, + "step": 2588 + }, + { + "epoch": 0.4618678084024619, + "grad_norm": 0.4839148223400116, + "learning_rate": 0.0004370311617412086, + "loss": 0.9115, + "step": 2589 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.49606168270111084, + "learning_rate": 0.0004369846639949984, + "loss": 1.1419, + "step": 2590 + }, + { + "epoch": 0.46222460083846223, + "grad_norm": 0.4850931763648987, + "learning_rate": 0.000436938151563093, + "loss": 1.0411, + "step": 2591 + }, + { + "epoch": 0.46240299705646243, + "grad_norm": 0.5619013905525208, + "learning_rate": 0.0004368916244491458, + "loss": 1.0051, + "step": 2592 + }, + { + "epoch": 0.46258139327446257, + "grad_norm": 0.5425794720649719, + "learning_rate": 0.00043684508265681065, + "loss": 1.3862, + "step": 2593 + }, + { + "epoch": 0.46275978949246277, + "grad_norm": 0.5217241644859314, + "learning_rate": 0.00043679852618974327, + "loss": 1.2932, + "step": 2594 + }, + { + "epoch": 0.4629381857104629, + "grad_norm": 0.5080055594444275, + "learning_rate": 0.0004367519550515999, + "loss": 1.1496, + "step": 2595 + }, + { + "epoch": 0.4631165819284631, + "grad_norm": 0.5273397564888, + "learning_rate": 0.0004367053692460385, + "loss": 1.1263, + "step": 2596 + }, + { + "epoch": 0.4632949781464633, + "grad_norm": 0.5620000958442688, + "learning_rate": 0.0004366587687767176, + "loss": 0.9767, + "step": 2597 + }, + { + "epoch": 0.46347337436446345, + "grad_norm": 0.49073854088783264, + "learning_rate": 0.0004366121536472974, + "loss": 1.2086, + "step": 2598 + }, + { + "epoch": 0.46365177058246365, + "grad_norm": 0.5440914034843445, + "learning_rate": 0.000436565523861439, + "loss": 1.248, + "step": 2599 + }, + { + "epoch": 0.46383016680046385, + "grad_norm": 0.5432100892066956, + "learning_rate": 0.00043651887942280454, + "loss": 0.9665, + "step": 2600 + }, + { + "epoch": 0.464008563018464, + "grad_norm": 0.5001043081283569, + "learning_rate": 0.00043647222033505763, + "loss": 0.8812, + "step": 2601 + }, + { + "epoch": 0.4641869592364642, + "grad_norm": 0.4610103964805603, + "learning_rate": 0.0004364255466018627, + "loss": 0.9032, + "step": 2602 + }, + { + "epoch": 0.4643653554544644, + "grad_norm": 0.5426071286201477, + "learning_rate": 0.0004363788582268857, + "loss": 1.1128, + "step": 2603 + }, + { + "epoch": 0.46454375167246453, + "grad_norm": 0.5425922274589539, + "learning_rate": 0.00043633215521379326, + "loss": 1.134, + "step": 2604 + }, + { + "epoch": 0.46472214789046473, + "grad_norm": 0.5755911469459534, + "learning_rate": 0.0004362854375662536, + "loss": 1.345, + "step": 2605 + }, + { + "epoch": 0.4649005441084649, + "grad_norm": 0.5014548301696777, + "learning_rate": 0.0004362387052879358, + "loss": 1.0121, + "step": 2606 + }, + { + "epoch": 0.4650789403264651, + "grad_norm": 0.5047395825386047, + "learning_rate": 0.00043619195838251023, + "loss": 1.0983, + "step": 2607 + }, + { + "epoch": 0.46525733654446527, + "grad_norm": 0.5320248603820801, + "learning_rate": 0.0004361451968536484, + "loss": 0.9432, + "step": 2608 + }, + { + "epoch": 0.4654357327624654, + "grad_norm": 0.5131525993347168, + "learning_rate": 0.0004360984207050229, + "loss": 0.9492, + "step": 2609 + }, + { + "epoch": 0.4656141289804656, + "grad_norm": 0.57234787940979, + "learning_rate": 0.0004360516299403075, + "loss": 1.3292, + "step": 2610 + }, + { + "epoch": 0.4657925251984658, + "grad_norm": 0.502707302570343, + "learning_rate": 0.0004360048245631772, + "loss": 1.0897, + "step": 2611 + }, + { + "epoch": 0.46597092141646596, + "grad_norm": 0.5083783268928528, + "learning_rate": 0.00043595800457730795, + "loss": 0.9891, + "step": 2612 + }, + { + "epoch": 0.46614931763446615, + "grad_norm": 0.5372927188873291, + "learning_rate": 0.00043591116998637717, + "loss": 1.1991, + "step": 2613 + }, + { + "epoch": 0.46632771385246635, + "grad_norm": 0.46498778462409973, + "learning_rate": 0.00043586432079406297, + "loss": 0.8831, + "step": 2614 + }, + { + "epoch": 0.4665061100704665, + "grad_norm": 0.5247387886047363, + "learning_rate": 0.0004358174570040451, + "loss": 1.2432, + "step": 2615 + }, + { + "epoch": 0.4666845062884667, + "grad_norm": 0.5532581210136414, + "learning_rate": 0.000435770578620004, + "loss": 1.1609, + "step": 2616 + }, + { + "epoch": 0.46686290250646684, + "grad_norm": 0.5167216062545776, + "learning_rate": 0.0004357236856456217, + "loss": 1.0657, + "step": 2617 + }, + { + "epoch": 0.46704129872446704, + "grad_norm": 0.48156222701072693, + "learning_rate": 0.000435676778084581, + "loss": 1.1393, + "step": 2618 + }, + { + "epoch": 0.46721969494246723, + "grad_norm": 0.5826752185821533, + "learning_rate": 0.0004356298559405661, + "loss": 1.1713, + "step": 2619 + }, + { + "epoch": 0.4673980911604674, + "grad_norm": 0.5204340815544128, + "learning_rate": 0.00043558291921726215, + "loss": 1.1773, + "step": 2620 + }, + { + "epoch": 0.4675764873784676, + "grad_norm": 0.5820581316947937, + "learning_rate": 0.00043553596791835557, + "loss": 0.9531, + "step": 2621 + }, + { + "epoch": 0.4677548835964678, + "grad_norm": 0.535647988319397, + "learning_rate": 0.000435489002047534, + "loss": 0.8793, + "step": 2622 + }, + { + "epoch": 0.4679332798144679, + "grad_norm": 0.4820907413959503, + "learning_rate": 0.000435442021608486, + "loss": 1.0139, + "step": 2623 + }, + { + "epoch": 0.4681116760324681, + "grad_norm": 0.5509294867515564, + "learning_rate": 0.0004353950266049014, + "loss": 1.0987, + "step": 2624 + }, + { + "epoch": 0.4682900722504683, + "grad_norm": 0.554036021232605, + "learning_rate": 0.00043534801704047115, + "loss": 1.0175, + "step": 2625 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 0.5267329812049866, + "learning_rate": 0.0004353009929188875, + "loss": 1.0262, + "step": 2626 + }, + { + "epoch": 0.46864686468646866, + "grad_norm": 0.5234902501106262, + "learning_rate": 0.0004352539542438436, + "loss": 0.8124, + "step": 2627 + }, + { + "epoch": 0.4688252609044688, + "grad_norm": 0.6302539706230164, + "learning_rate": 0.00043520690101903374, + "loss": 1.3212, + "step": 2628 + }, + { + "epoch": 0.469003657122469, + "grad_norm": 0.5413920879364014, + "learning_rate": 0.00043515983324815365, + "loss": 1.1461, + "step": 2629 + }, + { + "epoch": 0.4691820533404692, + "grad_norm": 0.4936441481113434, + "learning_rate": 0.0004351127509349, + "loss": 0.9603, + "step": 2630 + }, + { + "epoch": 0.46936044955846934, + "grad_norm": 0.4682817757129669, + "learning_rate": 0.00043506565408297045, + "loss": 0.8536, + "step": 2631 + }, + { + "epoch": 0.46953884577646954, + "grad_norm": 0.6088268160820007, + "learning_rate": 0.0004350185426960641, + "loss": 1.1274, + "step": 2632 + }, + { + "epoch": 0.46971724199446974, + "grad_norm": 0.5248781442642212, + "learning_rate": 0.00043497141677788107, + "loss": 1.1042, + "step": 2633 + }, + { + "epoch": 0.4698956382124699, + "grad_norm": 0.48056891560554504, + "learning_rate": 0.0004349242763321225, + "loss": 1.0877, + "step": 2634 + }, + { + "epoch": 0.4700740344304701, + "grad_norm": 0.48219484090805054, + "learning_rate": 0.0004348771213624909, + "loss": 1.0188, + "step": 2635 + }, + { + "epoch": 0.4702524306484703, + "grad_norm": 0.5074059963226318, + "learning_rate": 0.0004348299518726897, + "loss": 0.8514, + "step": 2636 + }, + { + "epoch": 0.4704308268664704, + "grad_norm": 0.48974233865737915, + "learning_rate": 0.00043478276786642364, + "loss": 1.0564, + "step": 2637 + }, + { + "epoch": 0.4706092230844706, + "grad_norm": 0.45620694756507874, + "learning_rate": 0.0004347355693473985, + "loss": 0.8984, + "step": 2638 + }, + { + "epoch": 0.47078761930247076, + "grad_norm": 0.5297985672950745, + "learning_rate": 0.00043468835631932124, + "loss": 0.9644, + "step": 2639 + }, + { + "epoch": 0.47096601552047096, + "grad_norm": 0.5142080187797546, + "learning_rate": 0.0004346411287859, + "loss": 1.0379, + "step": 2640 + }, + { + "epoch": 0.47114441173847116, + "grad_norm": 0.5011320114135742, + "learning_rate": 0.00043459388675084386, + "loss": 1.1448, + "step": 2641 + }, + { + "epoch": 0.4713228079564713, + "grad_norm": 0.5459404587745667, + "learning_rate": 0.00043454663021786337, + "loss": 1.0494, + "step": 2642 + }, + { + "epoch": 0.4715012041744715, + "grad_norm": 0.4870483875274658, + "learning_rate": 0.00043449935919066997, + "loss": 1.1025, + "step": 2643 + }, + { + "epoch": 0.4716796003924717, + "grad_norm": 0.4572905898094177, + "learning_rate": 0.00043445207367297624, + "loss": 1.0561, + "step": 2644 + }, + { + "epoch": 0.47185799661047184, + "grad_norm": 0.5108131766319275, + "learning_rate": 0.00043440477366849607, + "loss": 0.8723, + "step": 2645 + }, + { + "epoch": 0.47203639282847204, + "grad_norm": 0.49937039613723755, + "learning_rate": 0.00043435745918094437, + "loss": 1.0159, + "step": 2646 + }, + { + "epoch": 0.47221478904647224, + "grad_norm": 0.5099616050720215, + "learning_rate": 0.00043431013021403707, + "loss": 0.9302, + "step": 2647 + }, + { + "epoch": 0.4723931852644724, + "grad_norm": 0.5088136792182922, + "learning_rate": 0.0004342627867714915, + "loss": 1.054, + "step": 2648 + }, + { + "epoch": 0.4725715814824726, + "grad_norm": 0.48349374532699585, + "learning_rate": 0.0004342154288570259, + "loss": 1.0577, + "step": 2649 + }, + { + "epoch": 0.4727499777004727, + "grad_norm": 0.528062105178833, + "learning_rate": 0.0004341680564743599, + "loss": 1.0324, + "step": 2650 + }, + { + "epoch": 0.4729283739184729, + "grad_norm": 1.0809184312820435, + "learning_rate": 0.0004341206696272139, + "loss": 1.1805, + "step": 2651 + }, + { + "epoch": 0.4731067701364731, + "grad_norm": 0.5047591328620911, + "learning_rate": 0.00043407326831930985, + "loss": 1.0002, + "step": 2652 + }, + { + "epoch": 0.47328516635447326, + "grad_norm": 0.5000717639923096, + "learning_rate": 0.00043402585255437035, + "loss": 1.0015, + "step": 2653 + }, + { + "epoch": 0.47346356257247346, + "grad_norm": 0.608333170413971, + "learning_rate": 0.0004339784223361197, + "loss": 0.9044, + "step": 2654 + }, + { + "epoch": 0.47364195879047366, + "grad_norm": 0.5130841135978699, + "learning_rate": 0.00043393097766828293, + "loss": 1.1026, + "step": 2655 + }, + { + "epoch": 0.4738203550084738, + "grad_norm": 0.5287538170814514, + "learning_rate": 0.0004338835185545863, + "loss": 1.0682, + "step": 2656 + }, + { + "epoch": 0.473998751226474, + "grad_norm": 0.5100160837173462, + "learning_rate": 0.00043383604499875727, + "loss": 1.231, + "step": 2657 + }, + { + "epoch": 0.4741771474444742, + "grad_norm": 0.4938991963863373, + "learning_rate": 0.0004337885570045244, + "loss": 1.0292, + "step": 2658 + }, + { + "epoch": 0.47435554366247434, + "grad_norm": 0.45146873593330383, + "learning_rate": 0.0004337410545756173, + "loss": 0.8997, + "step": 2659 + }, + { + "epoch": 0.47453393988047454, + "grad_norm": 0.45835426449775696, + "learning_rate": 0.0004336935377157668, + "loss": 0.773, + "step": 2660 + }, + { + "epoch": 0.4747123360984747, + "grad_norm": 0.4573782980442047, + "learning_rate": 0.000433646006428705, + "loss": 0.8717, + "step": 2661 + }, + { + "epoch": 0.4748907323164749, + "grad_norm": 0.4976518154144287, + "learning_rate": 0.00043359846071816484, + "loss": 0.862, + "step": 2662 + }, + { + "epoch": 0.4750691285344751, + "grad_norm": 0.5328902006149292, + "learning_rate": 0.0004335509005878806, + "loss": 0.9372, + "step": 2663 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.5134164690971375, + "learning_rate": 0.0004335033260415876, + "loss": 0.9136, + "step": 2664 + }, + { + "epoch": 0.4754259209704754, + "grad_norm": 0.5302520990371704, + "learning_rate": 0.00043345573708302235, + "loss": 1.0568, + "step": 2665 + }, + { + "epoch": 0.4756043171884756, + "grad_norm": 0.4945065379142761, + "learning_rate": 0.0004334081337159225, + "loss": 0.8435, + "step": 2666 + }, + { + "epoch": 0.47578271340647577, + "grad_norm": 1.0435439348220825, + "learning_rate": 0.0004333605159440266, + "loss": 1.0573, + "step": 2667 + }, + { + "epoch": 0.47596110962447596, + "grad_norm": 2.6927244663238525, + "learning_rate": 0.0004333128837710748, + "loss": 1.0532, + "step": 2668 + }, + { + "epoch": 0.47613950584247616, + "grad_norm": 0.5720566511154175, + "learning_rate": 0.000433265237200808, + "loss": 1.0378, + "step": 2669 + }, + { + "epoch": 0.4763179020604763, + "grad_norm": 0.49117255210876465, + "learning_rate": 0.00043321757623696836, + "loss": 1.0109, + "step": 2670 + }, + { + "epoch": 0.4764962982784765, + "grad_norm": 0.5019311904907227, + "learning_rate": 0.00043316990088329907, + "loss": 1.0556, + "step": 2671 + }, + { + "epoch": 0.47667469449647665, + "grad_norm": 1.1491711139678955, + "learning_rate": 0.00043312221114354463, + "loss": 1.0185, + "step": 2672 + }, + { + "epoch": 0.47685309071447685, + "grad_norm": 1.0251749753952026, + "learning_rate": 0.00043307450702145047, + "loss": 1.1779, + "step": 2673 + }, + { + "epoch": 0.47703148693247704, + "grad_norm": 0.5985758900642395, + "learning_rate": 0.00043302678852076337, + "loss": 1.1387, + "step": 2674 + }, + { + "epoch": 0.4772098831504772, + "grad_norm": 0.6510484218597412, + "learning_rate": 0.000432979055645231, + "loss": 1.1599, + "step": 2675 + }, + { + "epoch": 0.4773882793684774, + "grad_norm": 0.5804511904716492, + "learning_rate": 0.0004329313083986024, + "loss": 1.0075, + "step": 2676 + }, + { + "epoch": 0.4775666755864776, + "grad_norm": 0.5480517745018005, + "learning_rate": 0.00043288354678462757, + "loss": 1.1907, + "step": 2677 + }, + { + "epoch": 0.47774507180447773, + "grad_norm": 0.5310084223747253, + "learning_rate": 0.0004328357708070576, + "loss": 0.9193, + "step": 2678 + }, + { + "epoch": 0.4779234680224779, + "grad_norm": 0.5491518974304199, + "learning_rate": 0.00043278798046964494, + "loss": 1.2297, + "step": 2679 + }, + { + "epoch": 0.4781018642404781, + "grad_norm": 0.4863007664680481, + "learning_rate": 0.0004327401757761429, + "loss": 0.9668, + "step": 2680 + }, + { + "epoch": 0.47828026045847827, + "grad_norm": 0.5505911111831665, + "learning_rate": 0.00043269235673030614, + "loss": 1.0962, + "step": 2681 + }, + { + "epoch": 0.47845865667647847, + "grad_norm": 0.4990256428718567, + "learning_rate": 0.00043264452333589034, + "loss": 0.8941, + "step": 2682 + }, + { + "epoch": 0.4786370528944786, + "grad_norm": 0.5696044564247131, + "learning_rate": 0.0004325966755966522, + "loss": 1.1817, + "step": 2683 + }, + { + "epoch": 0.4788154491124788, + "grad_norm": 0.5234004259109497, + "learning_rate": 0.00043254881351634976, + "loss": 1.0851, + "step": 2684 + }, + { + "epoch": 0.478993845330479, + "grad_norm": 0.5903022289276123, + "learning_rate": 0.0004325009370987421, + "loss": 1.2182, + "step": 2685 + }, + { + "epoch": 0.47917224154847915, + "grad_norm": 0.5186327695846558, + "learning_rate": 0.0004324530463475893, + "loss": 0.9566, + "step": 2686 + }, + { + "epoch": 0.47935063776647935, + "grad_norm": 0.5512902736663818, + "learning_rate": 0.00043240514126665274, + "loss": 0.9735, + "step": 2687 + }, + { + "epoch": 0.47952903398447955, + "grad_norm": 0.5918879508972168, + "learning_rate": 0.00043235722185969497, + "loss": 1.1593, + "step": 2688 + }, + { + "epoch": 0.4797074302024797, + "grad_norm": 0.5881208777427673, + "learning_rate": 0.0004323092881304794, + "loss": 1.0421, + "step": 2689 + }, + { + "epoch": 0.4798858264204799, + "grad_norm": 0.5843618512153625, + "learning_rate": 0.00043226134008277084, + "loss": 1.199, + "step": 2690 + }, + { + "epoch": 0.4800642226384801, + "grad_norm": 0.5014129877090454, + "learning_rate": 0.0004322133777203351, + "loss": 0.852, + "step": 2691 + }, + { + "epoch": 0.48024261885648023, + "grad_norm": 0.5492891073226929, + "learning_rate": 0.000432165401046939, + "loss": 0.9382, + "step": 2692 + }, + { + "epoch": 0.48042101507448043, + "grad_norm": 0.5708451271057129, + "learning_rate": 0.00043211741006635076, + "loss": 0.9182, + "step": 2693 + }, + { + "epoch": 0.48059941129248057, + "grad_norm": 0.5316191911697388, + "learning_rate": 0.00043206940478233947, + "loss": 1.05, + "step": 2694 + }, + { + "epoch": 0.48077780751048077, + "grad_norm": 0.5057091116905212, + "learning_rate": 0.0004320213851986755, + "loss": 0.8626, + "step": 2695 + }, + { + "epoch": 0.48095620372848097, + "grad_norm": 0.5974101424217224, + "learning_rate": 0.00043197335131913025, + "loss": 1.1793, + "step": 2696 + }, + { + "epoch": 0.4811345999464811, + "grad_norm": 0.5790625214576721, + "learning_rate": 0.00043192530314747625, + "loss": 1.0063, + "step": 2697 + }, + { + "epoch": 0.4813129961644813, + "grad_norm": 0.4502911865711212, + "learning_rate": 0.0004318772406874873, + "loss": 0.8672, + "step": 2698 + }, + { + "epoch": 0.4814913923824815, + "grad_norm": 0.5690745711326599, + "learning_rate": 0.00043182916394293817, + "loss": 0.9999, + "step": 2699 + }, + { + "epoch": 0.48166978860048165, + "grad_norm": 0.584483802318573, + "learning_rate": 0.00043178107291760463, + "loss": 1.0083, + "step": 2700 + }, + { + "epoch": 0.48184818481848185, + "grad_norm": 0.48927435278892517, + "learning_rate": 0.00043173296761526395, + "loss": 0.7844, + "step": 2701 + }, + { + "epoch": 0.48202658103648205, + "grad_norm": 0.6203662157058716, + "learning_rate": 0.0004316848480396941, + "loss": 0.9332, + "step": 2702 + }, + { + "epoch": 0.4822049772544822, + "grad_norm": 0.5448145866394043, + "learning_rate": 0.00043163671419467454, + "loss": 1.0309, + "step": 2703 + }, + { + "epoch": 0.4823833734724824, + "grad_norm": 0.540557324886322, + "learning_rate": 0.0004315885660839857, + "loss": 0.9622, + "step": 2704 + }, + { + "epoch": 0.4825617696904826, + "grad_norm": 0.4740718603134155, + "learning_rate": 0.0004315404037114089, + "loss": 1.003, + "step": 2705 + }, + { + "epoch": 0.48274016590848273, + "grad_norm": 0.5453110933303833, + "learning_rate": 0.0004314922270807269, + "loss": 1.2003, + "step": 2706 + }, + { + "epoch": 0.48291856212648293, + "grad_norm": 0.4645708501338959, + "learning_rate": 0.0004314440361957235, + "loss": 0.9038, + "step": 2707 + }, + { + "epoch": 0.4830969583444831, + "grad_norm": 0.48761603236198425, + "learning_rate": 0.00043139583106018367, + "loss": 1.1861, + "step": 2708 + }, + { + "epoch": 0.4832753545624833, + "grad_norm": 0.48210960626602173, + "learning_rate": 0.0004313476116778933, + "loss": 1.0031, + "step": 2709 + }, + { + "epoch": 0.48345375078048347, + "grad_norm": 0.4890519380569458, + "learning_rate": 0.00043129937805263944, + "loss": 0.9203, + "step": 2710 + }, + { + "epoch": 0.4836321469984836, + "grad_norm": 0.5152431726455688, + "learning_rate": 0.00043125113018821054, + "loss": 1.001, + "step": 2711 + }, + { + "epoch": 0.4838105432164838, + "grad_norm": 0.5150320529937744, + "learning_rate": 0.00043120286808839587, + "loss": 0.9304, + "step": 2712 + }, + { + "epoch": 0.483988939434484, + "grad_norm": 0.4525447487831116, + "learning_rate": 0.0004311545917569859, + "loss": 0.8478, + "step": 2713 + }, + { + "epoch": 0.48416733565248415, + "grad_norm": 0.47689542174339294, + "learning_rate": 0.0004311063011977723, + "loss": 1.032, + "step": 2714 + }, + { + "epoch": 0.48434573187048435, + "grad_norm": 0.5542166829109192, + "learning_rate": 0.0004310579964145477, + "loss": 1.2234, + "step": 2715 + }, + { + "epoch": 0.48452412808848455, + "grad_norm": 0.513331949710846, + "learning_rate": 0.00043100967741110593, + "loss": 0.9973, + "step": 2716 + }, + { + "epoch": 0.4847025243064847, + "grad_norm": 0.4602411389350891, + "learning_rate": 0.000430961344191242, + "loss": 0.8423, + "step": 2717 + }, + { + "epoch": 0.4848809205244849, + "grad_norm": 0.5340333580970764, + "learning_rate": 0.000430912996758752, + "loss": 1.2688, + "step": 2718 + }, + { + "epoch": 0.48505931674248504, + "grad_norm": 0.47053262591362, + "learning_rate": 0.00043086463511743313, + "loss": 0.8783, + "step": 2719 + }, + { + "epoch": 0.48523771296048523, + "grad_norm": 0.5520641207695007, + "learning_rate": 0.0004308162592710836, + "loss": 1.1515, + "step": 2720 + }, + { + "epoch": 0.48541610917848543, + "grad_norm": 0.48937782645225525, + "learning_rate": 0.0004307678692235029, + "loss": 1.0731, + "step": 2721 + }, + { + "epoch": 0.4855945053964856, + "grad_norm": 0.5234431624412537, + "learning_rate": 0.0004307194649784915, + "loss": 1.081, + "step": 2722 + }, + { + "epoch": 0.4857729016144858, + "grad_norm": 0.5052714943885803, + "learning_rate": 0.0004306710465398511, + "loss": 1.2472, + "step": 2723 + }, + { + "epoch": 0.485951297832486, + "grad_norm": 0.466281920671463, + "learning_rate": 0.00043062261391138454, + "loss": 0.875, + "step": 2724 + }, + { + "epoch": 0.4861296940504861, + "grad_norm": 0.47504422068595886, + "learning_rate": 0.00043057416709689554, + "loss": 1.0092, + "step": 2725 + }, + { + "epoch": 0.4863080902684863, + "grad_norm": 0.46300631761550903, + "learning_rate": 0.00043052570610018913, + "loss": 0.9929, + "step": 2726 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.4238641560077667, + "learning_rate": 0.0004304772309250715, + "loss": 0.8075, + "step": 2727 + }, + { + "epoch": 0.48666488270448666, + "grad_norm": 0.509427011013031, + "learning_rate": 0.00043042874157534985, + "loss": 0.9091, + "step": 2728 + }, + { + "epoch": 0.48684327892248686, + "grad_norm": 1.843088150024414, + "learning_rate": 0.0004303802380548324, + "loss": 1.6363, + "step": 2729 + }, + { + "epoch": 0.487021675140487, + "grad_norm": 0.5678585767745972, + "learning_rate": 0.0004303317203673287, + "loss": 1.1911, + "step": 2730 + }, + { + "epoch": 0.4872000713584872, + "grad_norm": 0.49592193961143494, + "learning_rate": 0.00043028318851664934, + "loss": 0.8901, + "step": 2731 + }, + { + "epoch": 0.4873784675764874, + "grad_norm": 0.4731305241584778, + "learning_rate": 0.0004302346425066059, + "loss": 0.9217, + "step": 2732 + }, + { + "epoch": 0.48755686379448754, + "grad_norm": 0.6168728470802307, + "learning_rate": 0.0004301860823410112, + "loss": 1.081, + "step": 2733 + }, + { + "epoch": 0.48773526001248774, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.00043013750802367915, + "loss": 0.7949, + "step": 2734 + }, + { + "epoch": 0.48791365623048794, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.00043008891955842465, + "loss": 1.1409, + "step": 2735 + }, + { + "epoch": 0.4880920524484881, + "grad_norm": 2.6161890029907227, + "learning_rate": 0.00043004031694906397, + "loss": 1.1929, + "step": 2736 + }, + { + "epoch": 0.4882704486664883, + "grad_norm": 0.7953986525535583, + "learning_rate": 0.0004299917001994143, + "loss": 0.956, + "step": 2737 + }, + { + "epoch": 0.4884488448844885, + "grad_norm": 1.4645198583602905, + "learning_rate": 0.00042994306931329394, + "loss": 1.0847, + "step": 2738 + }, + { + "epoch": 0.4886272411024886, + "grad_norm": 0.5251899361610413, + "learning_rate": 0.00042989442429452237, + "loss": 1.0796, + "step": 2739 + }, + { + "epoch": 0.4888056373204888, + "grad_norm": 0.4881310760974884, + "learning_rate": 0.0004298457651469201, + "loss": 0.9101, + "step": 2740 + }, + { + "epoch": 0.48898403353848896, + "grad_norm": 0.7399401068687439, + "learning_rate": 0.0004297970918743088, + "loss": 1.1056, + "step": 2741 + }, + { + "epoch": 0.48916242975648916, + "grad_norm": 0.531498908996582, + "learning_rate": 0.00042974840448051135, + "loss": 1.104, + "step": 2742 + }, + { + "epoch": 0.48934082597448936, + "grad_norm": 0.5090386867523193, + "learning_rate": 0.00042969970296935153, + "loss": 0.7713, + "step": 2743 + }, + { + "epoch": 0.4895192221924895, + "grad_norm": 0.5436068177223206, + "learning_rate": 0.00042965098734465434, + "loss": 0.9884, + "step": 2744 + }, + { + "epoch": 0.4896976184104897, + "grad_norm": 0.5477145314216614, + "learning_rate": 0.00042960225761024597, + "loss": 1.1816, + "step": 2745 + }, + { + "epoch": 0.4898760146284899, + "grad_norm": 0.5642074942588806, + "learning_rate": 0.00042955351376995355, + "loss": 1.057, + "step": 2746 + }, + { + "epoch": 0.49005441084649004, + "grad_norm": 0.5721654295921326, + "learning_rate": 0.0004295047558276054, + "loss": 0.8484, + "step": 2747 + }, + { + "epoch": 0.49023280706449024, + "grad_norm": 0.4718813896179199, + "learning_rate": 0.000429455983787031, + "loss": 0.8456, + "step": 2748 + }, + { + "epoch": 0.49041120328249044, + "grad_norm": 0.4769846498966217, + "learning_rate": 0.0004294071976520608, + "loss": 0.7766, + "step": 2749 + }, + { + "epoch": 0.4905895995004906, + "grad_norm": 0.571636438369751, + "learning_rate": 0.0004293583974265266, + "loss": 0.921, + "step": 2750 + }, + { + "epoch": 0.4907679957184908, + "grad_norm": 0.5464839935302734, + "learning_rate": 0.000429309583114261, + "loss": 1.1658, + "step": 2751 + }, + { + "epoch": 0.4909463919364909, + "grad_norm": 0.5281213521957397, + "learning_rate": 0.00042926075471909787, + "loss": 1.1353, + "step": 2752 + }, + { + "epoch": 0.4911247881544911, + "grad_norm": 0.5068462491035461, + "learning_rate": 0.0004292119122448723, + "loss": 0.9014, + "step": 2753 + }, + { + "epoch": 0.4913031843724913, + "grad_norm": 0.532954752445221, + "learning_rate": 0.0004291630556954202, + "loss": 1.0867, + "step": 2754 + }, + { + "epoch": 0.49148158059049146, + "grad_norm": 0.48287615180015564, + "learning_rate": 0.0004291141850745788, + "loss": 0.9296, + "step": 2755 + }, + { + "epoch": 0.49165997680849166, + "grad_norm": 0.4885706603527069, + "learning_rate": 0.0004290653003861864, + "loss": 0.9466, + "step": 2756 + }, + { + "epoch": 0.49183837302649186, + "grad_norm": 0.5004509091377258, + "learning_rate": 0.0004290164016340824, + "loss": 0.9552, + "step": 2757 + }, + { + "epoch": 0.492016769244492, + "grad_norm": 0.5846733450889587, + "learning_rate": 0.0004289674888221072, + "loss": 1.3886, + "step": 2758 + }, + { + "epoch": 0.4921951654624922, + "grad_norm": 0.4464746415615082, + "learning_rate": 0.00042891856195410237, + "loss": 0.7713, + "step": 2759 + }, + { + "epoch": 0.4923735616804924, + "grad_norm": 0.48179852962493896, + "learning_rate": 0.0004288696210339108, + "loss": 0.9217, + "step": 2760 + }, + { + "epoch": 0.49255195789849254, + "grad_norm": 0.5003469586372375, + "learning_rate": 0.0004288206660653762, + "loss": 0.8922, + "step": 2761 + }, + { + "epoch": 0.49273035411649274, + "grad_norm": 0.5266236066818237, + "learning_rate": 0.00042877169705234335, + "loss": 0.9248, + "step": 2762 + }, + { + "epoch": 0.4929087503344929, + "grad_norm": 0.5324473977088928, + "learning_rate": 0.00042872271399865835, + "loss": 1.0357, + "step": 2763 + }, + { + "epoch": 0.4930871465524931, + "grad_norm": 0.7425055503845215, + "learning_rate": 0.0004286737169081684, + "loss": 1.026, + "step": 2764 + }, + { + "epoch": 0.4932655427704933, + "grad_norm": 0.8448666334152222, + "learning_rate": 0.0004286247057847215, + "loss": 0.9976, + "step": 2765 + }, + { + "epoch": 0.4934439389884934, + "grad_norm": 0.5740126371383667, + "learning_rate": 0.0004285756806321671, + "loss": 1.1577, + "step": 2766 + }, + { + "epoch": 0.4936223352064936, + "grad_norm": 0.5506383180618286, + "learning_rate": 0.0004285266414543556, + "loss": 1.0981, + "step": 2767 + }, + { + "epoch": 0.4938007314244938, + "grad_norm": 0.49879491329193115, + "learning_rate": 0.00042847758825513847, + "loss": 0.8611, + "step": 2768 + }, + { + "epoch": 0.49397912764249396, + "grad_norm": 0.5577161908149719, + "learning_rate": 0.0004284285210383685, + "loss": 0.8755, + "step": 2769 + }, + { + "epoch": 0.49415752386049416, + "grad_norm": 1.8733638525009155, + "learning_rate": 0.00042837943980789914, + "loss": 1.0863, + "step": 2770 + }, + { + "epoch": 0.49433592007849436, + "grad_norm": 0.620151162147522, + "learning_rate": 0.00042833034456758533, + "loss": 1.0697, + "step": 2771 + }, + { + "epoch": 0.4945143162964945, + "grad_norm": 0.6251631379127502, + "learning_rate": 0.00042828123532128305, + "loss": 1.163, + "step": 2772 + }, + { + "epoch": 0.4946927125144947, + "grad_norm": 0.55173259973526, + "learning_rate": 0.0004282321120728493, + "loss": 1.0081, + "step": 2773 + }, + { + "epoch": 0.49487110873249485, + "grad_norm": 0.5649335384368896, + "learning_rate": 0.0004281829748261421, + "loss": 1.2093, + "step": 2774 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.5768599510192871, + "learning_rate": 0.0004281338235850208, + "loss": 1.1961, + "step": 2775 + }, + { + "epoch": 0.49522790116849524, + "grad_norm": 0.4870094358921051, + "learning_rate": 0.0004280846583533456, + "loss": 1.0099, + "step": 2776 + }, + { + "epoch": 0.4954062973864954, + "grad_norm": 0.5303916335105896, + "learning_rate": 0.00042803547913497795, + "loss": 0.934, + "step": 2777 + }, + { + "epoch": 0.4955846936044956, + "grad_norm": 1.3991183042526245, + "learning_rate": 0.00042798628593378044, + "loss": 1.0462, + "step": 2778 + }, + { + "epoch": 0.4957630898224958, + "grad_norm": 0.5671733617782593, + "learning_rate": 0.00042793707875361667, + "loss": 1.0136, + "step": 2779 + }, + { + "epoch": 0.4959414860404959, + "grad_norm": 0.5110854506492615, + "learning_rate": 0.00042788785759835117, + "loss": 0.9371, + "step": 2780 + }, + { + "epoch": 0.4961198822584961, + "grad_norm": 0.48012104630470276, + "learning_rate": 0.00042783862247185, + "loss": 1.0671, + "step": 2781 + }, + { + "epoch": 0.4962982784764963, + "grad_norm": 1.609937310218811, + "learning_rate": 0.0004277893733779798, + "loss": 1.1734, + "step": 2782 + }, + { + "epoch": 0.49647667469449647, + "grad_norm": 0.49931612610816956, + "learning_rate": 0.0004277401103206089, + "loss": 0.8801, + "step": 2783 + }, + { + "epoch": 0.49665507091249667, + "grad_norm": 0.5576810240745544, + "learning_rate": 0.0004276908333036061, + "loss": 0.998, + "step": 2784 + }, + { + "epoch": 0.4968334671304968, + "grad_norm": 0.5244370102882385, + "learning_rate": 0.00042764154233084184, + "loss": 1.1198, + "step": 2785 + }, + { + "epoch": 0.497011863348497, + "grad_norm": 0.5110731720924377, + "learning_rate": 0.00042759223740618723, + "loss": 1.0418, + "step": 2786 + }, + { + "epoch": 0.4971902595664972, + "grad_norm": 0.5166656374931335, + "learning_rate": 0.0004275429185335147, + "loss": 1.0333, + "step": 2787 + }, + { + "epoch": 0.49736865578449735, + "grad_norm": 0.558217465877533, + "learning_rate": 0.00042749358571669783, + "loss": 1.0004, + "step": 2788 + }, + { + "epoch": 0.49754705200249755, + "grad_norm": 0.5724664926528931, + "learning_rate": 0.000427444238959611, + "loss": 1.0876, + "step": 2789 + }, + { + "epoch": 0.49772544822049775, + "grad_norm": 0.5535562038421631, + "learning_rate": 0.00042739487826613006, + "loss": 1.0584, + "step": 2790 + }, + { + "epoch": 0.4979038444384979, + "grad_norm": 0.6063182950019836, + "learning_rate": 0.0004273455036401317, + "loss": 0.9144, + "step": 2791 + }, + { + "epoch": 0.4980822406564981, + "grad_norm": 0.6180481314659119, + "learning_rate": 0.00042729611508549384, + "loss": 1.1755, + "step": 2792 + }, + { + "epoch": 0.4982606368744983, + "grad_norm": 0.524019718170166, + "learning_rate": 0.0004272467126060954, + "loss": 1.0019, + "step": 2793 + }, + { + "epoch": 0.49843903309249843, + "grad_norm": 0.49826425313949585, + "learning_rate": 0.00042719729620581637, + "loss": 0.9266, + "step": 2794 + }, + { + "epoch": 0.4986174293104986, + "grad_norm": 0.5382309556007385, + "learning_rate": 0.000427147865888538, + "loss": 1.105, + "step": 2795 + }, + { + "epoch": 0.49879582552849877, + "grad_norm": 0.4761876165866852, + "learning_rate": 0.0004270984216581425, + "loss": 0.8737, + "step": 2796 + }, + { + "epoch": 0.49897422174649897, + "grad_norm": 0.4881121516227722, + "learning_rate": 0.0004270489635185131, + "loss": 0.9282, + "step": 2797 + }, + { + "epoch": 0.49915261796449917, + "grad_norm": 0.6625795960426331, + "learning_rate": 0.00042699949147353435, + "loss": 1.1568, + "step": 2798 + }, + { + "epoch": 0.4993310141824993, + "grad_norm": 0.5215954184532166, + "learning_rate": 0.00042695000552709164, + "loss": 1.0541, + "step": 2799 + }, + { + "epoch": 0.4995094104004995, + "grad_norm": 0.4769946336746216, + "learning_rate": 0.0004269005056830717, + "loss": 0.9529, + "step": 2800 + }, + { + "epoch": 0.4996878066184997, + "grad_norm": 0.46005573868751526, + "learning_rate": 0.00042685099194536216, + "loss": 1.0152, + "step": 2801 + }, + { + "epoch": 0.49986620283649985, + "grad_norm": 0.8600802421569824, + "learning_rate": 0.00042680146431785184, + "loss": 0.891, + "step": 2802 + }, + { + "epoch": 0.5000445990545, + "grad_norm": 0.47280198335647583, + "learning_rate": 0.00042675192280443053, + "loss": 1.0193, + "step": 2803 + }, + { + "epoch": 0.5002229952725002, + "grad_norm": 0.44590240716934204, + "learning_rate": 0.00042670236740898935, + "loss": 0.8403, + "step": 2804 + }, + { + "epoch": 0.5004013914905004, + "grad_norm": 0.5397332906723022, + "learning_rate": 0.00042665279813542024, + "loss": 1.0327, + "step": 2805 + }, + { + "epoch": 0.5005797877085005, + "grad_norm": 0.49650824069976807, + "learning_rate": 0.0004266032149876163, + "loss": 0.8893, + "step": 2806 + }, + { + "epoch": 0.5007581839265007, + "grad_norm": 0.5721169710159302, + "learning_rate": 0.000426553617969472, + "loss": 0.8712, + "step": 2807 + }, + { + "epoch": 0.5009365801445009, + "grad_norm": 0.4694698750972748, + "learning_rate": 0.00042650400708488245, + "loss": 0.8884, + "step": 2808 + }, + { + "epoch": 0.5011149763625011, + "grad_norm": 0.5037152767181396, + "learning_rate": 0.00042645438233774414, + "loss": 1.2473, + "step": 2809 + }, + { + "epoch": 0.5012933725805013, + "grad_norm": 0.4780758321285248, + "learning_rate": 0.00042640474373195457, + "loss": 0.9399, + "step": 2810 + }, + { + "epoch": 0.5014717687985015, + "grad_norm": 0.5098987817764282, + "learning_rate": 0.00042635509127141236, + "loss": 1.0123, + "step": 2811 + }, + { + "epoch": 0.5016501650165016, + "grad_norm": 0.4265352487564087, + "learning_rate": 0.0004263054249600172, + "loss": 0.704, + "step": 2812 + }, + { + "epoch": 0.5018285612345018, + "grad_norm": 0.45242688059806824, + "learning_rate": 0.0004262557448016697, + "loss": 0.8694, + "step": 2813 + }, + { + "epoch": 0.502006957452502, + "grad_norm": 0.503936767578125, + "learning_rate": 0.00042620605080027197, + "loss": 0.9914, + "step": 2814 + }, + { + "epoch": 0.5021853536705022, + "grad_norm": 0.45868226885795593, + "learning_rate": 0.0004261563429597268, + "loss": 0.9463, + "step": 2815 + }, + { + "epoch": 0.5023637498885024, + "grad_norm": 0.5536309480667114, + "learning_rate": 0.0004261066212839383, + "loss": 1.2046, + "step": 2816 + }, + { + "epoch": 0.5025421461065025, + "grad_norm": 0.43166589736938477, + "learning_rate": 0.00042605688577681156, + "loss": 0.793, + "step": 2817 + }, + { + "epoch": 0.5027205423245027, + "grad_norm": 0.5023233294487, + "learning_rate": 0.00042600713644225274, + "loss": 1.0378, + "step": 2818 + }, + { + "epoch": 0.5028989385425029, + "grad_norm": 0.5103394985198975, + "learning_rate": 0.0004259573732841692, + "loss": 1.0274, + "step": 2819 + }, + { + "epoch": 0.5030773347605031, + "grad_norm": 0.4885903298854828, + "learning_rate": 0.0004259075963064692, + "loss": 0.9873, + "step": 2820 + }, + { + "epoch": 0.5032557309785033, + "grad_norm": 0.46850115060806274, + "learning_rate": 0.0004258578055130623, + "loss": 1.0053, + "step": 2821 + }, + { + "epoch": 0.5034341271965035, + "grad_norm": 1.0507224798202515, + "learning_rate": 0.0004258080009078591, + "loss": 0.9356, + "step": 2822 + }, + { + "epoch": 0.5036125234145036, + "grad_norm": 0.48565196990966797, + "learning_rate": 0.0004257581824947711, + "loss": 0.9913, + "step": 2823 + }, + { + "epoch": 0.5037909196325038, + "grad_norm": 0.4672144651412964, + "learning_rate": 0.00042570835027771114, + "loss": 1.0402, + "step": 2824 + }, + { + "epoch": 0.503969315850504, + "grad_norm": 0.5116181373596191, + "learning_rate": 0.00042565850426059295, + "loss": 1.0377, + "step": 2825 + }, + { + "epoch": 0.5041477120685042, + "grad_norm": 0.5363556742668152, + "learning_rate": 0.0004256086444473314, + "loss": 1.0358, + "step": 2826 + }, + { + "epoch": 0.5043261082865044, + "grad_norm": 0.4890466332435608, + "learning_rate": 0.0004255587708418425, + "loss": 1.0442, + "step": 2827 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 0.50138258934021, + "learning_rate": 0.0004255088834480433, + "loss": 1.009, + "step": 2828 + }, + { + "epoch": 0.5046829007225047, + "grad_norm": 0.5070481300354004, + "learning_rate": 0.00042545898226985186, + "loss": 1.0232, + "step": 2829 + }, + { + "epoch": 0.5048612969405049, + "grad_norm": 0.5242338180541992, + "learning_rate": 0.00042540906731118746, + "loss": 1.099, + "step": 2830 + }, + { + "epoch": 0.505039693158505, + "grad_norm": 0.5118682980537415, + "learning_rate": 0.00042535913857597046, + "loss": 1.0284, + "step": 2831 + }, + { + "epoch": 0.5052180893765053, + "grad_norm": 0.4739261269569397, + "learning_rate": 0.00042530919606812215, + "loss": 1.0342, + "step": 2832 + }, + { + "epoch": 0.5053964855945055, + "grad_norm": 0.4524725675582886, + "learning_rate": 0.00042525923979156507, + "loss": 0.8644, + "step": 2833 + }, + { + "epoch": 0.5055748818125055, + "grad_norm": 0.499525249004364, + "learning_rate": 0.00042520926975022266, + "loss": 1.0513, + "step": 2834 + }, + { + "epoch": 0.5057532780305057, + "grad_norm": 0.4836816191673279, + "learning_rate": 0.00042515928594801964, + "loss": 0.8886, + "step": 2835 + }, + { + "epoch": 0.5059316742485059, + "grad_norm": 0.5527642965316772, + "learning_rate": 0.00042510928838888163, + "loss": 1.2704, + "step": 2836 + }, + { + "epoch": 0.5061100704665061, + "grad_norm": 0.5419808030128479, + "learning_rate": 0.0004250592770767355, + "loss": 1.1396, + "step": 2837 + }, + { + "epoch": 0.5062884666845063, + "grad_norm": 0.4811457395553589, + "learning_rate": 0.0004250092520155091, + "loss": 0.9047, + "step": 2838 + }, + { + "epoch": 0.5064668629025064, + "grad_norm": 0.4570630192756653, + "learning_rate": 0.0004249592132091313, + "loss": 0.8595, + "step": 2839 + }, + { + "epoch": 0.5066452591205066, + "grad_norm": 0.5125581622123718, + "learning_rate": 0.0004249091606615322, + "loss": 1.2342, + "step": 2840 + }, + { + "epoch": 0.5068236553385068, + "grad_norm": 0.5075557231903076, + "learning_rate": 0.0004248590943766429, + "loss": 1.032, + "step": 2841 + }, + { + "epoch": 0.507002051556507, + "grad_norm": 0.7127680778503418, + "learning_rate": 0.00042480901435839566, + "loss": 1.1435, + "step": 2842 + }, + { + "epoch": 0.5071804477745072, + "grad_norm": 0.5281385779380798, + "learning_rate": 0.0004247589206107236, + "loss": 1.1247, + "step": 2843 + }, + { + "epoch": 0.5073588439925074, + "grad_norm": 0.6512944102287292, + "learning_rate": 0.00042470881313756107, + "loss": 1.1724, + "step": 2844 + }, + { + "epoch": 0.5075372402105075, + "grad_norm": 0.6242824196815491, + "learning_rate": 0.0004246586919428436, + "loss": 1.0296, + "step": 2845 + }, + { + "epoch": 0.5077156364285077, + "grad_norm": 0.5622745752334595, + "learning_rate": 0.0004246085570305076, + "loss": 0.9505, + "step": 2846 + }, + { + "epoch": 0.5078940326465079, + "grad_norm": 0.5400723218917847, + "learning_rate": 0.0004245584084044907, + "loss": 1.081, + "step": 2847 + }, + { + "epoch": 0.5080724288645081, + "grad_norm": 0.8070915937423706, + "learning_rate": 0.00042450824606873145, + "loss": 0.9311, + "step": 2848 + }, + { + "epoch": 0.5082508250825083, + "grad_norm": 0.6803063154220581, + "learning_rate": 0.00042445807002716967, + "loss": 1.2538, + "step": 2849 + }, + { + "epoch": 0.5084292213005084, + "grad_norm": 0.6261130571365356, + "learning_rate": 0.0004244078802837462, + "loss": 1.1796, + "step": 2850 + }, + { + "epoch": 0.5086076175185086, + "grad_norm": 0.5159333348274231, + "learning_rate": 0.00042435767684240286, + "loss": 1.074, + "step": 2851 + }, + { + "epoch": 0.5087860137365088, + "grad_norm": 0.5446671843528748, + "learning_rate": 0.0004243074597070826, + "loss": 0.9219, + "step": 2852 + }, + { + "epoch": 0.508964409954509, + "grad_norm": 0.5136865377426147, + "learning_rate": 0.00042425722888172937, + "loss": 0.966, + "step": 2853 + }, + { + "epoch": 0.5091428061725092, + "grad_norm": 0.7158051133155823, + "learning_rate": 0.00042420698437028846, + "loss": 0.9874, + "step": 2854 + }, + { + "epoch": 0.5093212023905094, + "grad_norm": 0.4794641137123108, + "learning_rate": 0.000424156726176706, + "loss": 0.9166, + "step": 2855 + }, + { + "epoch": 0.5094995986085095, + "grad_norm": 0.5229628086090088, + "learning_rate": 0.0004241064543049292, + "loss": 0.9187, + "step": 2856 + }, + { + "epoch": 0.5096779948265097, + "grad_norm": 0.5953057408332825, + "learning_rate": 0.00042405616875890634, + "loss": 1.205, + "step": 2857 + }, + { + "epoch": 0.5098563910445099, + "grad_norm": 0.629094660282135, + "learning_rate": 0.0004240058695425869, + "loss": 1.5532, + "step": 2858 + }, + { + "epoch": 0.5100347872625101, + "grad_norm": 0.5075649619102478, + "learning_rate": 0.0004239555566599214, + "loss": 1.0137, + "step": 2859 + }, + { + "epoch": 0.5102131834805103, + "grad_norm": 0.5661181807518005, + "learning_rate": 0.00042390523011486133, + "loss": 0.9655, + "step": 2860 + }, + { + "epoch": 0.5103915796985103, + "grad_norm": 0.5415096282958984, + "learning_rate": 0.00042385488991135927, + "loss": 1.1261, + "step": 2861 + }, + { + "epoch": 0.5105699759165105, + "grad_norm": 0.5538334846496582, + "learning_rate": 0.00042380453605336897, + "loss": 0.8759, + "step": 2862 + }, + { + "epoch": 0.5107483721345107, + "grad_norm": 0.5417324900627136, + "learning_rate": 0.00042375416854484527, + "loss": 0.9578, + "step": 2863 + }, + { + "epoch": 0.5109267683525109, + "grad_norm": 0.5347568392753601, + "learning_rate": 0.0004237037873897439, + "loss": 1.0563, + "step": 2864 + }, + { + "epoch": 0.5111051645705111, + "grad_norm": 0.5405890941619873, + "learning_rate": 0.00042365339259202184, + "loss": 1.1051, + "step": 2865 + }, + { + "epoch": 0.5112835607885113, + "grad_norm": 0.4867878258228302, + "learning_rate": 0.00042360298415563706, + "loss": 0.9752, + "step": 2866 + }, + { + "epoch": 0.5114619570065114, + "grad_norm": 0.519904375076294, + "learning_rate": 0.0004235525620845486, + "loss": 1.0394, + "step": 2867 + }, + { + "epoch": 0.5116403532245116, + "grad_norm": 0.6319023966789246, + "learning_rate": 0.00042350212638271655, + "loss": 1.163, + "step": 2868 + }, + { + "epoch": 0.5118187494425118, + "grad_norm": 0.5480625033378601, + "learning_rate": 0.00042345167705410227, + "loss": 1.1205, + "step": 2869 + }, + { + "epoch": 0.511997145660512, + "grad_norm": 0.5435265898704529, + "learning_rate": 0.00042340121410266784, + "loss": 0.9511, + "step": 2870 + }, + { + "epoch": 0.5121755418785122, + "grad_norm": 0.4731312096118927, + "learning_rate": 0.00042335073753237666, + "loss": 0.8017, + "step": 2871 + }, + { + "epoch": 0.5123539380965123, + "grad_norm": 0.49716416001319885, + "learning_rate": 0.00042330024734719317, + "loss": 0.8897, + "step": 2872 + }, + { + "epoch": 0.5125323343145125, + "grad_norm": 0.48113831877708435, + "learning_rate": 0.00042324974355108285, + "loss": 0.9106, + "step": 2873 + }, + { + "epoch": 0.5127107305325127, + "grad_norm": 0.4926007390022278, + "learning_rate": 0.0004231992261480122, + "loss": 0.9383, + "step": 2874 + }, + { + "epoch": 0.5128891267505129, + "grad_norm": 0.537855327129364, + "learning_rate": 0.000423148695141949, + "loss": 1.0949, + "step": 2875 + }, + { + "epoch": 0.5130675229685131, + "grad_norm": 0.5071426033973694, + "learning_rate": 0.0004230981505368616, + "loss": 1.0838, + "step": 2876 + }, + { + "epoch": 0.5132459191865133, + "grad_norm": 0.5214564800262451, + "learning_rate": 0.00042304759233672, + "loss": 1.0188, + "step": 2877 + }, + { + "epoch": 0.5134243154045134, + "grad_norm": 0.48505401611328125, + "learning_rate": 0.000422997020545495, + "loss": 1.0605, + "step": 2878 + }, + { + "epoch": 0.5136027116225136, + "grad_norm": 0.5197986364364624, + "learning_rate": 0.0004229464351671585, + "loss": 0.8301, + "step": 2879 + }, + { + "epoch": 0.5137811078405138, + "grad_norm": 0.4954800605773926, + "learning_rate": 0.00042289583620568326, + "loss": 1.0674, + "step": 2880 + }, + { + "epoch": 0.513959504058514, + "grad_norm": 0.5165004730224609, + "learning_rate": 0.00042284522366504355, + "loss": 0.9053, + "step": 2881 + }, + { + "epoch": 0.5141379002765142, + "grad_norm": 0.6076773405075073, + "learning_rate": 0.00042279459754921436, + "loss": 1.08, + "step": 2882 + }, + { + "epoch": 0.5143162964945143, + "grad_norm": 0.7019060850143433, + "learning_rate": 0.0004227439578621718, + "loss": 0.994, + "step": 2883 + }, + { + "epoch": 0.5144946927125145, + "grad_norm": 0.5617488026618958, + "learning_rate": 0.00042269330460789314, + "loss": 1.1718, + "step": 2884 + }, + { + "epoch": 0.5146730889305147, + "grad_norm": 0.46552005410194397, + "learning_rate": 0.0004226426377903566, + "loss": 0.8396, + "step": 2885 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.49444034695625305, + "learning_rate": 0.00042259195741354167, + "loss": 0.8349, + "step": 2886 + }, + { + "epoch": 0.5150298813665151, + "grad_norm": 0.46994128823280334, + "learning_rate": 0.0004225412634814287, + "loss": 0.9429, + "step": 2887 + }, + { + "epoch": 0.5152082775845153, + "grad_norm": 0.5488457083702087, + "learning_rate": 0.0004224905559979991, + "loss": 0.9858, + "step": 2888 + }, + { + "epoch": 0.5153866738025153, + "grad_norm": 0.5592421889305115, + "learning_rate": 0.0004224398349672354, + "loss": 1.0363, + "step": 2889 + }, + { + "epoch": 0.5155650700205155, + "grad_norm": 0.4703425168991089, + "learning_rate": 0.00042238910039312134, + "loss": 0.9297, + "step": 2890 + }, + { + "epoch": 0.5157434662385157, + "grad_norm": 0.9002471566200256, + "learning_rate": 0.00042233835227964146, + "loss": 0.8405, + "step": 2891 + }, + { + "epoch": 0.515921862456516, + "grad_norm": 0.5256131291389465, + "learning_rate": 0.0004222875906307816, + "loss": 0.9992, + "step": 2892 + }, + { + "epoch": 0.5161002586745161, + "grad_norm": 0.47402289509773254, + "learning_rate": 0.0004222368154505285, + "loss": 0.7949, + "step": 2893 + }, + { + "epoch": 0.5162786548925162, + "grad_norm": 0.5556320548057556, + "learning_rate": 0.0004221860267428701, + "loss": 1.0749, + "step": 2894 + }, + { + "epoch": 0.5164570511105164, + "grad_norm": 0.5696842670440674, + "learning_rate": 0.0004221352245117952, + "loss": 0.9672, + "step": 2895 + }, + { + "epoch": 0.5166354473285166, + "grad_norm": 0.586262047290802, + "learning_rate": 0.00042208440876129384, + "loss": 1.2136, + "step": 2896 + }, + { + "epoch": 0.5168138435465168, + "grad_norm": 0.49055540561676025, + "learning_rate": 0.000422033579495357, + "loss": 0.9203, + "step": 2897 + }, + { + "epoch": 0.516992239764517, + "grad_norm": 0.5185585618019104, + "learning_rate": 0.00042198273671797693, + "loss": 0.9001, + "step": 2898 + }, + { + "epoch": 0.5171706359825172, + "grad_norm": 0.5147084593772888, + "learning_rate": 0.00042193188043314675, + "loss": 1.0424, + "step": 2899 + }, + { + "epoch": 0.5173490322005173, + "grad_norm": 0.5919014811515808, + "learning_rate": 0.0004218810106448606, + "loss": 1.3816, + "step": 2900 + }, + { + "epoch": 0.5175274284185175, + "grad_norm": 0.494163453578949, + "learning_rate": 0.0004218301273571139, + "loss": 0.8105, + "step": 2901 + }, + { + "epoch": 0.5177058246365177, + "grad_norm": 0.558854877948761, + "learning_rate": 0.000421779230573903, + "loss": 1.0603, + "step": 2902 + }, + { + "epoch": 0.5178842208545179, + "grad_norm": 0.47792795300483704, + "learning_rate": 0.00042172832029922514, + "loss": 0.9657, + "step": 2903 + }, + { + "epoch": 0.5180626170725181, + "grad_norm": 0.4979441463947296, + "learning_rate": 0.0004216773965370789, + "loss": 0.8444, + "step": 2904 + }, + { + "epoch": 0.5182410132905182, + "grad_norm": 0.5534027814865112, + "learning_rate": 0.00042162645929146394, + "loss": 1.0369, + "step": 2905 + }, + { + "epoch": 0.5184194095085184, + "grad_norm": 0.525301992893219, + "learning_rate": 0.0004215755085663806, + "loss": 1.0527, + "step": 2906 + }, + { + "epoch": 0.5185978057265186, + "grad_norm": 0.5665169954299927, + "learning_rate": 0.0004215245443658307, + "loss": 1.0721, + "step": 2907 + }, + { + "epoch": 0.5187762019445188, + "grad_norm": 0.4932265281677246, + "learning_rate": 0.0004214735666938169, + "loss": 1.0579, + "step": 2908 + }, + { + "epoch": 0.518954598162519, + "grad_norm": 0.47899937629699707, + "learning_rate": 0.0004214225755543429, + "loss": 0.9099, + "step": 2909 + }, + { + "epoch": 0.5191329943805192, + "grad_norm": 0.459412157535553, + "learning_rate": 0.00042137157095141367, + "loss": 0.8198, + "step": 2910 + }, + { + "epoch": 0.5193113905985193, + "grad_norm": 0.4750489890575409, + "learning_rate": 0.00042132055288903505, + "loss": 0.9241, + "step": 2911 + }, + { + "epoch": 0.5194897868165195, + "grad_norm": 0.4672413468360901, + "learning_rate": 0.0004212695213712138, + "loss": 0.7955, + "step": 2912 + }, + { + "epoch": 0.5196681830345197, + "grad_norm": 0.44326889514923096, + "learning_rate": 0.0004212184764019581, + "loss": 0.8813, + "step": 2913 + }, + { + "epoch": 0.5198465792525199, + "grad_norm": 0.5115212798118591, + "learning_rate": 0.00042116741798527694, + "loss": 1.0218, + "step": 2914 + }, + { + "epoch": 0.5200249754705201, + "grad_norm": 0.471718430519104, + "learning_rate": 0.0004211163461251804, + "loss": 0.9756, + "step": 2915 + }, + { + "epoch": 0.5202033716885202, + "grad_norm": 0.4659128785133362, + "learning_rate": 0.0004210652608256798, + "loss": 0.9583, + "step": 2916 + }, + { + "epoch": 0.5203817679065204, + "grad_norm": 0.5611960887908936, + "learning_rate": 0.00042101416209078707, + "loss": 1.1081, + "step": 2917 + }, + { + "epoch": 0.5205601641245206, + "grad_norm": 0.5797203183174133, + "learning_rate": 0.00042096304992451575, + "loss": 1.1466, + "step": 2918 + }, + { + "epoch": 0.5207385603425208, + "grad_norm": 0.522687554359436, + "learning_rate": 0.00042091192433088, + "loss": 0.9893, + "step": 2919 + }, + { + "epoch": 0.520916956560521, + "grad_norm": 0.5880950093269348, + "learning_rate": 0.00042086078531389524, + "loss": 1.0184, + "step": 2920 + }, + { + "epoch": 0.5210953527785211, + "grad_norm": 1.1611589193344116, + "learning_rate": 0.000420809632877578, + "loss": 1.0494, + "step": 2921 + }, + { + "epoch": 0.5212737489965212, + "grad_norm": 0.5259320735931396, + "learning_rate": 0.00042075846702594567, + "loss": 1.0708, + "step": 2922 + }, + { + "epoch": 0.5214521452145214, + "grad_norm": 0.5445079803466797, + "learning_rate": 0.0004207072877630168, + "loss": 1.2258, + "step": 2923 + }, + { + "epoch": 0.5216305414325216, + "grad_norm": 0.43372058868408203, + "learning_rate": 0.00042065609509281106, + "loss": 0.9371, + "step": 2924 + }, + { + "epoch": 0.5218089376505218, + "grad_norm": 0.4435960650444031, + "learning_rate": 0.000420604889019349, + "loss": 0.8437, + "step": 2925 + }, + { + "epoch": 0.521987333868522, + "grad_norm": 0.6090930104255676, + "learning_rate": 0.00042055366954665244, + "loss": 1.4102, + "step": 2926 + }, + { + "epoch": 0.5221657300865221, + "grad_norm": 0.5087897777557373, + "learning_rate": 0.000420502436678744, + "loss": 0.9809, + "step": 2927 + }, + { + "epoch": 0.5223441263045223, + "grad_norm": 0.4611089825630188, + "learning_rate": 0.0004204511904196476, + "loss": 0.9503, + "step": 2928 + }, + { + "epoch": 0.5225225225225225, + "grad_norm": 0.5042673945426941, + "learning_rate": 0.000420399930773388, + "loss": 0.865, + "step": 2929 + }, + { + "epoch": 0.5227009187405227, + "grad_norm": 0.4814876914024353, + "learning_rate": 0.00042034865774399124, + "loss": 1.1542, + "step": 2930 + }, + { + "epoch": 0.5228793149585229, + "grad_norm": 0.47714847326278687, + "learning_rate": 0.0004202973713354842, + "loss": 0.9069, + "step": 2931 + }, + { + "epoch": 0.5230577111765231, + "grad_norm": 0.5801002979278564, + "learning_rate": 0.0004202460715518948, + "loss": 1.1885, + "step": 2932 + }, + { + "epoch": 0.5232361073945232, + "grad_norm": 0.45451635122299194, + "learning_rate": 0.0004201947583972523, + "loss": 0.8125, + "step": 2933 + }, + { + "epoch": 0.5234145036125234, + "grad_norm": 0.5364030003547668, + "learning_rate": 0.00042014343187558666, + "loss": 0.9351, + "step": 2934 + }, + { + "epoch": 0.5235928998305236, + "grad_norm": 0.5005396604537964, + "learning_rate": 0.0004200920919909292, + "loss": 1.0081, + "step": 2935 + }, + { + "epoch": 0.5237712960485238, + "grad_norm": 0.5246373414993286, + "learning_rate": 0.00042004073874731196, + "loss": 1.0374, + "step": 2936 + }, + { + "epoch": 0.523949692266524, + "grad_norm": 0.5010136365890503, + "learning_rate": 0.0004199893721487682, + "loss": 1.0534, + "step": 2937 + }, + { + "epoch": 0.5241280884845241, + "grad_norm": 0.9451296329498291, + "learning_rate": 0.00041993799219933235, + "loss": 1.0364, + "step": 2938 + }, + { + "epoch": 0.5243064847025243, + "grad_norm": 0.5468368530273438, + "learning_rate": 0.0004198865989030398, + "loss": 1.0716, + "step": 2939 + }, + { + "epoch": 0.5244848809205245, + "grad_norm": 0.46857789158821106, + "learning_rate": 0.00041983519226392686, + "loss": 1.0187, + "step": 2940 + }, + { + "epoch": 0.5246632771385247, + "grad_norm": 0.4949086606502533, + "learning_rate": 0.00041978377228603093, + "loss": 0.8706, + "step": 2941 + }, + { + "epoch": 0.5248416733565249, + "grad_norm": 0.4794335961341858, + "learning_rate": 0.00041973233897339067, + "loss": 0.9308, + "step": 2942 + }, + { + "epoch": 0.5250200695745251, + "grad_norm": 0.4741996228694916, + "learning_rate": 0.0004196808923300455, + "loss": 1.0586, + "step": 2943 + }, + { + "epoch": 0.5251984657925252, + "grad_norm": 0.49954500794410706, + "learning_rate": 0.0004196294323600361, + "loss": 0.9257, + "step": 2944 + }, + { + "epoch": 0.5253768620105254, + "grad_norm": 0.5138105154037476, + "learning_rate": 0.000419577959067404, + "loss": 0.9921, + "step": 2945 + }, + { + "epoch": 0.5255552582285256, + "grad_norm": 0.4848662316799164, + "learning_rate": 0.00041952647245619204, + "loss": 1.0388, + "step": 2946 + }, + { + "epoch": 0.5257336544465258, + "grad_norm": 0.4525747001171112, + "learning_rate": 0.00041947497253044385, + "loss": 0.9866, + "step": 2947 + }, + { + "epoch": 0.525912050664526, + "grad_norm": 0.5498698949813843, + "learning_rate": 0.0004194234592942043, + "loss": 1.1586, + "step": 2948 + }, + { + "epoch": 0.526090446882526, + "grad_norm": 0.4874838590621948, + "learning_rate": 0.0004193719327515192, + "loss": 1.0105, + "step": 2949 + }, + { + "epoch": 0.5262688431005262, + "grad_norm": 0.5352873206138611, + "learning_rate": 0.0004193203929064353, + "loss": 1.0994, + "step": 2950 + }, + { + "epoch": 0.5264472393185264, + "grad_norm": 0.4859972596168518, + "learning_rate": 0.0004192688397630006, + "loss": 0.9533, + "step": 2951 + }, + { + "epoch": 0.5266256355365266, + "grad_norm": 0.5137537717819214, + "learning_rate": 0.0004192172733252641, + "loss": 1.0008, + "step": 2952 + }, + { + "epoch": 0.5268040317545268, + "grad_norm": 0.47844600677490234, + "learning_rate": 0.00041916569359727574, + "loss": 1.0954, + "step": 2953 + }, + { + "epoch": 0.526982427972527, + "grad_norm": 0.52599036693573, + "learning_rate": 0.00041911410058308667, + "loss": 1.0523, + "step": 2954 + }, + { + "epoch": 0.5271608241905271, + "grad_norm": 0.47040775418281555, + "learning_rate": 0.0004190624942867489, + "loss": 0.8066, + "step": 2955 + }, + { + "epoch": 0.5273392204085273, + "grad_norm": 0.5250223278999329, + "learning_rate": 0.0004190108747123156, + "loss": 1.2687, + "step": 2956 + }, + { + "epoch": 0.5275176166265275, + "grad_norm": 0.5163100361824036, + "learning_rate": 0.0004189592418638408, + "loss": 1.1398, + "step": 2957 + }, + { + "epoch": 0.5276960128445277, + "grad_norm": 0.47173264622688293, + "learning_rate": 0.00041890759574538, + "loss": 0.9907, + "step": 2958 + }, + { + "epoch": 0.5278744090625279, + "grad_norm": 0.5113806128501892, + "learning_rate": 0.0004188559363609893, + "loss": 1.2076, + "step": 2959 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.47075924277305603, + "learning_rate": 0.0004188042637147259, + "loss": 0.8586, + "step": 2960 + }, + { + "epoch": 0.5282312014985282, + "grad_norm": 0.4917936325073242, + "learning_rate": 0.00041875257781064833, + "loss": 0.913, + "step": 2961 + }, + { + "epoch": 0.5284095977165284, + "grad_norm": 0.49223649501800537, + "learning_rate": 0.000418700878652816, + "loss": 1.0004, + "step": 2962 + }, + { + "epoch": 0.5285879939345286, + "grad_norm": 0.4502630829811096, + "learning_rate": 0.0004186491662452892, + "loss": 0.8698, + "step": 2963 + }, + { + "epoch": 0.5287663901525288, + "grad_norm": 0.5051430463790894, + "learning_rate": 0.00041859744059212945, + "loss": 1.093, + "step": 2964 + }, + { + "epoch": 0.528944786370529, + "grad_norm": 0.7281007170677185, + "learning_rate": 0.0004185457016973993, + "loss": 1.0823, + "step": 2965 + }, + { + "epoch": 0.5291231825885291, + "grad_norm": 0.4868094027042389, + "learning_rate": 0.00041849394956516227, + "loss": 1.0735, + "step": 2966 + }, + { + "epoch": 0.5293015788065293, + "grad_norm": 0.4833069443702698, + "learning_rate": 0.0004184421841994829, + "loss": 0.8735, + "step": 2967 + }, + { + "epoch": 0.5294799750245295, + "grad_norm": 0.5559689402580261, + "learning_rate": 0.0004183904056044269, + "loss": 1.0569, + "step": 2968 + }, + { + "epoch": 0.5296583712425297, + "grad_norm": 0.5828696489334106, + "learning_rate": 0.0004183386137840609, + "loss": 1.126, + "step": 2969 + }, + { + "epoch": 0.5298367674605299, + "grad_norm": 0.48288482427597046, + "learning_rate": 0.0004182868087424526, + "loss": 1.1427, + "step": 2970 + }, + { + "epoch": 0.53001516367853, + "grad_norm": 0.4690031111240387, + "learning_rate": 0.0004182349904836708, + "loss": 0.9756, + "step": 2971 + }, + { + "epoch": 0.5301935598965302, + "grad_norm": 0.5362717509269714, + "learning_rate": 0.00041818315901178527, + "loss": 1.1166, + "step": 2972 + }, + { + "epoch": 0.5303719561145304, + "grad_norm": 0.5287902355194092, + "learning_rate": 0.0004181313143308667, + "loss": 1.1939, + "step": 2973 + }, + { + "epoch": 0.5305503523325306, + "grad_norm": 0.4559158682823181, + "learning_rate": 0.0004180794564449872, + "loss": 0.7655, + "step": 2974 + }, + { + "epoch": 0.5307287485505308, + "grad_norm": 0.5267648696899414, + "learning_rate": 0.0004180275853582194, + "loss": 0.9355, + "step": 2975 + }, + { + "epoch": 0.530907144768531, + "grad_norm": 0.46050286293029785, + "learning_rate": 0.00041797570107463737, + "loss": 0.931, + "step": 2976 + }, + { + "epoch": 0.531085540986531, + "grad_norm": 0.4590972363948822, + "learning_rate": 0.0004179238035983161, + "loss": 0.9246, + "step": 2977 + }, + { + "epoch": 0.5312639372045312, + "grad_norm": 0.5887408256530762, + "learning_rate": 0.00041787189293333155, + "loss": 0.9576, + "step": 2978 + }, + { + "epoch": 0.5314423334225314, + "grad_norm": 0.5028809309005737, + "learning_rate": 0.00041781996908376077, + "loss": 1.0257, + "step": 2979 + }, + { + "epoch": 0.5316207296405316, + "grad_norm": 0.4969048500061035, + "learning_rate": 0.00041776803205368187, + "loss": 1.0092, + "step": 2980 + }, + { + "epoch": 0.5317991258585318, + "grad_norm": 0.5167542099952698, + "learning_rate": 0.00041771608184717384, + "loss": 1.0632, + "step": 2981 + }, + { + "epoch": 0.5319775220765319, + "grad_norm": 0.4737110733985901, + "learning_rate": 0.00041766411846831696, + "loss": 0.8571, + "step": 2982 + }, + { + "epoch": 0.5321559182945321, + "grad_norm": 0.5081421732902527, + "learning_rate": 0.00041761214192119234, + "loss": 1.216, + "step": 2983 + }, + { + "epoch": 0.5323343145125323, + "grad_norm": 0.47318655252456665, + "learning_rate": 0.0004175601522098823, + "loss": 0.939, + "step": 2984 + }, + { + "epoch": 0.5325127107305325, + "grad_norm": 0.5063821077346802, + "learning_rate": 0.0004175081493384699, + "loss": 0.9156, + "step": 2985 + }, + { + "epoch": 0.5326911069485327, + "grad_norm": 0.483039915561676, + "learning_rate": 0.00041745613331103964, + "loss": 0.9842, + "step": 2986 + }, + { + "epoch": 0.5328695031665329, + "grad_norm": 0.4720330536365509, + "learning_rate": 0.0004174041041316767, + "loss": 1.0204, + "step": 2987 + }, + { + "epoch": 0.533047899384533, + "grad_norm": 11.117390632629395, + "learning_rate": 0.0004173520618044675, + "loss": 1.2214, + "step": 2988 + }, + { + "epoch": 0.5332262956025332, + "grad_norm": 0.5497750043869019, + "learning_rate": 0.00041730000633349927, + "loss": 1.0853, + "step": 2989 + }, + { + "epoch": 0.5334046918205334, + "grad_norm": 0.5697376132011414, + "learning_rate": 0.00041724793772286066, + "loss": 1.0129, + "step": 2990 + }, + { + "epoch": 0.5335830880385336, + "grad_norm": 0.527197003364563, + "learning_rate": 0.0004171958559766409, + "loss": 1.2534, + "step": 2991 + }, + { + "epoch": 0.5337614842565338, + "grad_norm": 1.886084794998169, + "learning_rate": 0.0004171437610989306, + "loss": 1.1865, + "step": 2992 + }, + { + "epoch": 0.5339398804745339, + "grad_norm": 0.5286929607391357, + "learning_rate": 0.00041709165309382123, + "loss": 1.2268, + "step": 2993 + }, + { + "epoch": 0.5341182766925341, + "grad_norm": 0.5276958346366882, + "learning_rate": 0.0004170395319654054, + "loss": 1.024, + "step": 2994 + }, + { + "epoch": 0.5342966729105343, + "grad_norm": 0.44607049226760864, + "learning_rate": 0.0004169873977177765, + "loss": 0.8801, + "step": 2995 + }, + { + "epoch": 0.5344750691285345, + "grad_norm": 0.4721534252166748, + "learning_rate": 0.0004169352503550293, + "loss": 0.9466, + "step": 2996 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.5831915736198425, + "learning_rate": 0.00041688308988125944, + "loss": 1.2001, + "step": 2997 + }, + { + "epoch": 0.5348318615645349, + "grad_norm": 0.49683353304862976, + "learning_rate": 0.00041683091630056334, + "loss": 0.9686, + "step": 2998 + }, + { + "epoch": 0.535010257782535, + "grad_norm": 0.6138126850128174, + "learning_rate": 0.000416778729617039, + "loss": 1.0234, + "step": 2999 + }, + { + "epoch": 0.5351886540005352, + "grad_norm": 0.506582498550415, + "learning_rate": 0.000416726529834785, + "loss": 0.9276, + "step": 3000 + }, + { + "epoch": 0.5353670502185354, + "grad_norm": 0.5279721617698669, + "learning_rate": 0.000416674316957901, + "loss": 1.1615, + "step": 3001 + }, + { + "epoch": 0.5355454464365356, + "grad_norm": 0.48710203170776367, + "learning_rate": 0.000416622090990488, + "loss": 1.0638, + "step": 3002 + }, + { + "epoch": 0.5357238426545358, + "grad_norm": 0.5252576470375061, + "learning_rate": 0.00041656985193664763, + "loss": 0.9231, + "step": 3003 + }, + { + "epoch": 0.5359022388725359, + "grad_norm": 0.529524028301239, + "learning_rate": 0.00041651759980048276, + "loss": 0.9436, + "step": 3004 + }, + { + "epoch": 0.536080635090536, + "grad_norm": 0.4857736825942993, + "learning_rate": 0.00041646533458609725, + "loss": 0.8424, + "step": 3005 + }, + { + "epoch": 0.5362590313085362, + "grad_norm": 0.5015032291412354, + "learning_rate": 0.00041641305629759595, + "loss": 1.0656, + "step": 3006 + }, + { + "epoch": 0.5364374275265364, + "grad_norm": 0.4542873501777649, + "learning_rate": 0.0004163607649390849, + "loss": 1.0041, + "step": 3007 + }, + { + "epoch": 0.5366158237445366, + "grad_norm": 0.5027053952217102, + "learning_rate": 0.000416308460514671, + "loss": 1.209, + "step": 3008 + }, + { + "epoch": 0.5367942199625368, + "grad_norm": 0.4859675168991089, + "learning_rate": 0.00041625614302846206, + "loss": 0.9146, + "step": 3009 + }, + { + "epoch": 0.5369726161805369, + "grad_norm": 0.5812960863113403, + "learning_rate": 0.0004162038124845673, + "loss": 1.044, + "step": 3010 + }, + { + "epoch": 0.5371510123985371, + "grad_norm": 0.5171328186988831, + "learning_rate": 0.00041615146888709654, + "loss": 0.9578, + "step": 3011 + }, + { + "epoch": 0.5373294086165373, + "grad_norm": 0.5038743615150452, + "learning_rate": 0.000416099112240161, + "loss": 0.9988, + "step": 3012 + }, + { + "epoch": 0.5375078048345375, + "grad_norm": 0.5556420087814331, + "learning_rate": 0.0004160467425478726, + "loss": 1.0005, + "step": 3013 + }, + { + "epoch": 0.5376862010525377, + "grad_norm": 0.45622992515563965, + "learning_rate": 0.0004159943598143445, + "loss": 0.9728, + "step": 3014 + }, + { + "epoch": 0.5378645972705378, + "grad_norm": 0.5222346782684326, + "learning_rate": 0.00041594196404369076, + "loss": 1.1148, + "step": 3015 + }, + { + "epoch": 0.538042993488538, + "grad_norm": 0.6195734143257141, + "learning_rate": 0.0004158895552400267, + "loss": 1.0752, + "step": 3016 + }, + { + "epoch": 0.5382213897065382, + "grad_norm": 0.5263445973396301, + "learning_rate": 0.0004158371334074683, + "loss": 1.023, + "step": 3017 + }, + { + "epoch": 0.5383997859245384, + "grad_norm": 0.4883100688457489, + "learning_rate": 0.00041578469855013277, + "loss": 0.8856, + "step": 3018 + }, + { + "epoch": 0.5385781821425386, + "grad_norm": 0.517374575138092, + "learning_rate": 0.0004157322506721384, + "loss": 1.1114, + "step": 3019 + }, + { + "epoch": 0.5387565783605388, + "grad_norm": 0.5005238652229309, + "learning_rate": 0.00041567978977760444, + "loss": 1.1022, + "step": 3020 + }, + { + "epoch": 0.5389349745785389, + "grad_norm": 0.46176883578300476, + "learning_rate": 0.00041562731587065093, + "loss": 0.758, + "step": 3021 + }, + { + "epoch": 0.5391133707965391, + "grad_norm": 0.43858736753463745, + "learning_rate": 0.00041557482895539943, + "loss": 0.8694, + "step": 3022 + }, + { + "epoch": 0.5392917670145393, + "grad_norm": 0.4803151488304138, + "learning_rate": 0.0004155223290359721, + "loss": 0.8272, + "step": 3023 + }, + { + "epoch": 0.5394701632325395, + "grad_norm": 0.5557406544685364, + "learning_rate": 0.0004154698161164923, + "loss": 1.0889, + "step": 3024 + }, + { + "epoch": 0.5396485594505397, + "grad_norm": 2.459235668182373, + "learning_rate": 0.0004154172902010843, + "loss": 1.1591, + "step": 3025 + }, + { + "epoch": 0.5398269556685398, + "grad_norm": 0.5599625706672668, + "learning_rate": 0.0004153647512938735, + "loss": 1.1661, + "step": 3026 + }, + { + "epoch": 0.54000535188654, + "grad_norm": 0.5514055490493774, + "learning_rate": 0.00041531219939898635, + "loss": 1.0657, + "step": 3027 + }, + { + "epoch": 0.5401837481045402, + "grad_norm": 0.8572493195533752, + "learning_rate": 0.0004152596345205502, + "loss": 0.923, + "step": 3028 + }, + { + "epoch": 0.5403621443225404, + "grad_norm": 0.497321754693985, + "learning_rate": 0.00041520705666269343, + "loss": 0.9463, + "step": 3029 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.5045643448829651, + "learning_rate": 0.0004151544658295455, + "loss": 0.9698, + "step": 3030 + }, + { + "epoch": 0.5407189367585408, + "grad_norm": 0.519391655921936, + "learning_rate": 0.00041510186202523697, + "loss": 0.8543, + "step": 3031 + }, + { + "epoch": 0.5408973329765409, + "grad_norm": 1.3143996000289917, + "learning_rate": 0.0004150492452538992, + "loss": 0.9187, + "step": 3032 + }, + { + "epoch": 0.541075729194541, + "grad_norm": 0.5469959378242493, + "learning_rate": 0.0004149966155196648, + "loss": 1.3002, + "step": 3033 + }, + { + "epoch": 0.5412541254125413, + "grad_norm": 0.4593210518360138, + "learning_rate": 0.0004149439728266671, + "loss": 0.9523, + "step": 3034 + }, + { + "epoch": 0.5414325216305415, + "grad_norm": 0.4348868727684021, + "learning_rate": 0.0004148913171790408, + "loss": 0.7697, + "step": 3035 + }, + { + "epoch": 0.5416109178485417, + "grad_norm": 0.5638412833213806, + "learning_rate": 0.00041483864858092145, + "loss": 1.1035, + "step": 3036 + }, + { + "epoch": 0.5417893140665417, + "grad_norm": 0.4964679479598999, + "learning_rate": 0.00041478596703644553, + "loss": 1.0256, + "step": 3037 + }, + { + "epoch": 0.5419677102845419, + "grad_norm": 0.5189842581748962, + "learning_rate": 0.0004147332725497507, + "loss": 0.9451, + "step": 3038 + }, + { + "epoch": 0.5421461065025421, + "grad_norm": 0.5478013157844543, + "learning_rate": 0.0004146805651249755, + "loss": 0.938, + "step": 3039 + }, + { + "epoch": 0.5423245027205423, + "grad_norm": 0.5358539819717407, + "learning_rate": 0.0004146278447662597, + "loss": 0.8084, + "step": 3040 + }, + { + "epoch": 0.5425028989385425, + "grad_norm": 0.5389429330825806, + "learning_rate": 0.00041457511147774374, + "loss": 1.1265, + "step": 3041 + }, + { + "epoch": 0.5426812951565427, + "grad_norm": 0.4505546987056732, + "learning_rate": 0.0004145223652635693, + "loss": 0.8022, + "step": 3042 + }, + { + "epoch": 0.5428596913745428, + "grad_norm": 0.5185169577598572, + "learning_rate": 0.00041446960612787916, + "loss": 1.1086, + "step": 3043 + }, + { + "epoch": 0.543038087592543, + "grad_norm": 0.4975489377975464, + "learning_rate": 0.00041441683407481683, + "loss": 0.9063, + "step": 3044 + }, + { + "epoch": 0.5432164838105432, + "grad_norm": 0.48402321338653564, + "learning_rate": 0.0004143640491085272, + "loss": 0.9616, + "step": 3045 + }, + { + "epoch": 0.5433948800285434, + "grad_norm": 0.47887566685676575, + "learning_rate": 0.0004143112512331558, + "loss": 0.9543, + "step": 3046 + }, + { + "epoch": 0.5435732762465436, + "grad_norm": 0.5627540946006775, + "learning_rate": 0.00041425844045284957, + "loss": 1.1141, + "step": 3047 + }, + { + "epoch": 0.5437516724645437, + "grad_norm": 0.4868026077747345, + "learning_rate": 0.0004142056167717561, + "loss": 0.8547, + "step": 3048 + }, + { + "epoch": 0.5439300686825439, + "grad_norm": 0.4439375102519989, + "learning_rate": 0.000414152780194024, + "loss": 0.7576, + "step": 3049 + }, + { + "epoch": 0.5441084649005441, + "grad_norm": 0.5482441782951355, + "learning_rate": 0.00041409993072380333, + "loss": 1.1234, + "step": 3050 + }, + { + "epoch": 0.5442868611185443, + "grad_norm": 0.5158576965332031, + "learning_rate": 0.00041404706836524463, + "loss": 0.9731, + "step": 3051 + }, + { + "epoch": 0.5444652573365445, + "grad_norm": 0.44664448499679565, + "learning_rate": 0.0004139941931224998, + "loss": 0.9086, + "step": 3052 + }, + { + "epoch": 0.5446436535545447, + "grad_norm": 0.6744524240493774, + "learning_rate": 0.0004139413049997216, + "loss": 1.1988, + "step": 3053 + }, + { + "epoch": 0.5448220497725448, + "grad_norm": 0.47927117347717285, + "learning_rate": 0.0004138884040010639, + "loss": 1.06, + "step": 3054 + }, + { + "epoch": 0.545000445990545, + "grad_norm": 0.4434390962123871, + "learning_rate": 0.00041383549013068147, + "loss": 0.9022, + "step": 3055 + }, + { + "epoch": 0.5451788422085452, + "grad_norm": 0.5459398627281189, + "learning_rate": 0.0004137825633927301, + "loss": 1.183, + "step": 3056 + }, + { + "epoch": 0.5453572384265454, + "grad_norm": 0.48765432834625244, + "learning_rate": 0.00041372962379136676, + "loss": 1.0203, + "step": 3057 + }, + { + "epoch": 0.5455356346445456, + "grad_norm": 0.8096850514411926, + "learning_rate": 0.00041367667133074916, + "loss": 1.0457, + "step": 3058 + }, + { + "epoch": 0.5457140308625457, + "grad_norm": 0.4843488037586212, + "learning_rate": 0.0004136237060150363, + "loss": 1.0277, + "step": 3059 + }, + { + "epoch": 0.5458924270805459, + "grad_norm": 0.48028862476348877, + "learning_rate": 0.0004135707278483879, + "loss": 0.9519, + "step": 3060 + }, + { + "epoch": 0.5460708232985461, + "grad_norm": 0.7017949223518372, + "learning_rate": 0.00041351773683496497, + "loss": 0.9328, + "step": 3061 + }, + { + "epoch": 0.5462492195165463, + "grad_norm": 1.0810225009918213, + "learning_rate": 0.0004134647329789293, + "loss": 1.1302, + "step": 3062 + }, + { + "epoch": 0.5464276157345465, + "grad_norm": 0.5587711930274963, + "learning_rate": 0.0004134117162844439, + "loss": 1.0982, + "step": 3063 + }, + { + "epoch": 0.5466060119525467, + "grad_norm": 0.579770028591156, + "learning_rate": 0.00041335868675567263, + "loss": 1.14, + "step": 3064 + }, + { + "epoch": 0.5467844081705467, + "grad_norm": 0.7189714908599854, + "learning_rate": 0.0004133056443967804, + "loss": 1.0545, + "step": 3065 + }, + { + "epoch": 0.5469628043885469, + "grad_norm": 0.7987565398216248, + "learning_rate": 0.0004132525892119331, + "loss": 1.0576, + "step": 3066 + }, + { + "epoch": 0.5471412006065471, + "grad_norm": 0.6334807276725769, + "learning_rate": 0.00041319952120529767, + "loss": 1.0614, + "step": 3067 + }, + { + "epoch": 0.5473195968245473, + "grad_norm": 27.262441635131836, + "learning_rate": 0.00041314644038104216, + "loss": 1.2708, + "step": 3068 + }, + { + "epoch": 0.5474979930425475, + "grad_norm": 0.5688011646270752, + "learning_rate": 0.00041309334674333544, + "loss": 0.8942, + "step": 3069 + }, + { + "epoch": 0.5476763892605476, + "grad_norm": 0.5215781331062317, + "learning_rate": 0.00041304024029634737, + "loss": 0.895, + "step": 3070 + }, + { + "epoch": 0.5478547854785478, + "grad_norm": 0.5432342886924744, + "learning_rate": 0.00041298712104424903, + "loss": 0.9245, + "step": 3071 + }, + { + "epoch": 0.548033181696548, + "grad_norm": 0.49601709842681885, + "learning_rate": 0.0004129339889912123, + "loss": 0.9943, + "step": 3072 + }, + { + "epoch": 0.5482115779145482, + "grad_norm": 0.5069965124130249, + "learning_rate": 0.0004128808441414103, + "loss": 0.8476, + "step": 3073 + }, + { + "epoch": 0.5483899741325484, + "grad_norm": 0.7115893959999084, + "learning_rate": 0.0004128276864990168, + "loss": 1.1841, + "step": 3074 + }, + { + "epoch": 0.5485683703505486, + "grad_norm": 1.3543672561645508, + "learning_rate": 0.0004127745160682068, + "loss": 1.011, + "step": 3075 + }, + { + "epoch": 0.5487467665685487, + "grad_norm": 0.6685739159584045, + "learning_rate": 0.0004127213328531565, + "loss": 1.1114, + "step": 3076 + }, + { + "epoch": 0.5489251627865489, + "grad_norm": 1.056970477104187, + "learning_rate": 0.0004126681368580427, + "loss": 1.0183, + "step": 3077 + }, + { + "epoch": 0.5491035590045491, + "grad_norm": 0.7646797299385071, + "learning_rate": 0.00041261492808704336, + "loss": 1.1184, + "step": 3078 + }, + { + "epoch": 0.5492819552225493, + "grad_norm": 0.5978376269340515, + "learning_rate": 0.00041256170654433767, + "loss": 0.8961, + "step": 3079 + }, + { + "epoch": 0.5494603514405495, + "grad_norm": 0.5183614492416382, + "learning_rate": 0.0004125084722341054, + "loss": 0.8404, + "step": 3080 + }, + { + "epoch": 0.5496387476585496, + "grad_norm": 0.9243518114089966, + "learning_rate": 0.0004124552251605277, + "loss": 1.2223, + "step": 3081 + }, + { + "epoch": 0.5498171438765498, + "grad_norm": 0.5044293403625488, + "learning_rate": 0.0004124019653277865, + "loss": 0.8062, + "step": 3082 + }, + { + "epoch": 0.54999554009455, + "grad_norm": 0.8046514391899109, + "learning_rate": 0.0004123486927400649, + "loss": 0.9456, + "step": 3083 + }, + { + "epoch": 0.5501739363125502, + "grad_norm": 0.5840319991111755, + "learning_rate": 0.0004122954074015468, + "loss": 1.2686, + "step": 3084 + }, + { + "epoch": 0.5503523325305504, + "grad_norm": 1.5194361209869385, + "learning_rate": 0.0004122421093164172, + "loss": 1.1059, + "step": 3085 + }, + { + "epoch": 0.5505307287485506, + "grad_norm": 0.5864779353141785, + "learning_rate": 0.0004121887984888622, + "loss": 1.0505, + "step": 3086 + }, + { + "epoch": 0.5507091249665507, + "grad_norm": 1.8019944429397583, + "learning_rate": 0.00041213547492306875, + "loss": 1.0039, + "step": 3087 + }, + { + "epoch": 0.5508875211845509, + "grad_norm": 0.5829388499259949, + "learning_rate": 0.00041208213862322485, + "loss": 1.1364, + "step": 3088 + }, + { + "epoch": 0.5510659174025511, + "grad_norm": 0.7362989783287048, + "learning_rate": 0.0004120287895935196, + "loss": 1.0522, + "step": 3089 + }, + { + "epoch": 0.5512443136205513, + "grad_norm": 0.4867869019508362, + "learning_rate": 0.00041197542783814287, + "loss": 0.7512, + "step": 3090 + }, + { + "epoch": 0.5514227098385515, + "grad_norm": 0.6978092193603516, + "learning_rate": 0.0004119220533612857, + "loss": 1.1723, + "step": 3091 + }, + { + "epoch": 0.5516011060565515, + "grad_norm": 0.5516607761383057, + "learning_rate": 0.00041186866616714024, + "loss": 1.0468, + "step": 3092 + }, + { + "epoch": 0.5517795022745517, + "grad_norm": 0.7430167198181152, + "learning_rate": 0.0004118152662598994, + "loss": 1.1032, + "step": 3093 + }, + { + "epoch": 0.551957898492552, + "grad_norm": 0.48890167474746704, + "learning_rate": 0.0004117618536437571, + "loss": 0.7917, + "step": 3094 + }, + { + "epoch": 0.5521362947105521, + "grad_norm": 0.5151326060295105, + "learning_rate": 0.00041170842832290844, + "loss": 0.8906, + "step": 3095 + }, + { + "epoch": 0.5523146909285523, + "grad_norm": 0.4648483991622925, + "learning_rate": 0.0004116549903015495, + "loss": 0.9603, + "step": 3096 + }, + { + "epoch": 0.5524930871465525, + "grad_norm": 0.7839336395263672, + "learning_rate": 0.00041160153958387714, + "loss": 1.0892, + "step": 3097 + }, + { + "epoch": 0.5526714833645526, + "grad_norm": 0.48735418915748596, + "learning_rate": 0.0004115480761740893, + "loss": 0.9049, + "step": 3098 + }, + { + "epoch": 0.5528498795825528, + "grad_norm": 0.5337851047515869, + "learning_rate": 0.0004114946000763852, + "loss": 1.0263, + "step": 3099 + }, + { + "epoch": 0.553028275800553, + "grad_norm": 0.7853549122810364, + "learning_rate": 0.0004114411112949647, + "loss": 1.3071, + "step": 3100 + }, + { + "epoch": 0.5532066720185532, + "grad_norm": 0.5989134907722473, + "learning_rate": 0.0004113876098340288, + "loss": 1.1487, + "step": 3101 + }, + { + "epoch": 0.5533850682365534, + "grad_norm": 0.5571014881134033, + "learning_rate": 0.00041133409569777936, + "loss": 0.8642, + "step": 3102 + }, + { + "epoch": 0.5535634644545535, + "grad_norm": 0.5321959853172302, + "learning_rate": 0.0004112805688904196, + "loss": 0.9383, + "step": 3103 + }, + { + "epoch": 0.5537418606725537, + "grad_norm": 0.7023414373397827, + "learning_rate": 0.00041122702941615334, + "loss": 1.0434, + "step": 3104 + }, + { + "epoch": 0.5539202568905539, + "grad_norm": 0.6236468553543091, + "learning_rate": 0.00041117347727918555, + "loss": 0.9331, + "step": 3105 + }, + { + "epoch": 0.5540986531085541, + "grad_norm": 0.5762714743614197, + "learning_rate": 0.00041111991248372215, + "loss": 0.9617, + "step": 3106 + }, + { + "epoch": 0.5542770493265543, + "grad_norm": 0.9596529603004456, + "learning_rate": 0.00041106633503397016, + "loss": 1.0038, + "step": 3107 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 0.46744972467422485, + "learning_rate": 0.00041101274493413764, + "loss": 0.8293, + "step": 3108 + }, + { + "epoch": 0.5546338417625546, + "grad_norm": 0.514829158782959, + "learning_rate": 0.0004109591421884334, + "loss": 0.9105, + "step": 3109 + }, + { + "epoch": 0.5548122379805548, + "grad_norm": 0.5770788788795471, + "learning_rate": 0.0004109055268010674, + "loss": 1.164, + "step": 3110 + }, + { + "epoch": 0.554990634198555, + "grad_norm": 9.369170188903809, + "learning_rate": 0.00041085189877625053, + "loss": 1.8313, + "step": 3111 + }, + { + "epoch": 0.5551690304165552, + "grad_norm": 0.4911796748638153, + "learning_rate": 0.0004107982581181947, + "loss": 0.8055, + "step": 3112 + }, + { + "epoch": 0.5553474266345554, + "grad_norm": 0.5030617117881775, + "learning_rate": 0.00041074460483111287, + "loss": 0.9508, + "step": 3113 + }, + { + "epoch": 0.5555258228525555, + "grad_norm": 0.49075013399124146, + "learning_rate": 0.000410690938919219, + "loss": 0.9634, + "step": 3114 + }, + { + "epoch": 0.5557042190705557, + "grad_norm": 0.5361286401748657, + "learning_rate": 0.000410637260386728, + "loss": 0.8745, + "step": 3115 + }, + { + "epoch": 0.5558826152885559, + "grad_norm": 0.5814827084541321, + "learning_rate": 0.00041058356923785565, + "loss": 1.1225, + "step": 3116 + }, + { + "epoch": 0.5560610115065561, + "grad_norm": 0.48355334997177124, + "learning_rate": 0.0004105298654768189, + "loss": 0.758, + "step": 3117 + }, + { + "epoch": 0.5562394077245563, + "grad_norm": 0.6016453504562378, + "learning_rate": 0.0004104761491078355, + "loss": 1.1899, + "step": 3118 + }, + { + "epoch": 0.5564178039425565, + "grad_norm": 0.7221107482910156, + "learning_rate": 0.0004104224201351245, + "loss": 1.1253, + "step": 3119 + }, + { + "epoch": 0.5565962001605566, + "grad_norm": 0.5367442965507507, + "learning_rate": 0.00041036867856290567, + "loss": 0.8561, + "step": 3120 + }, + { + "epoch": 0.5567745963785568, + "grad_norm": 0.7140209078788757, + "learning_rate": 0.00041031492439539975, + "loss": 0.961, + "step": 3121 + }, + { + "epoch": 0.556952992596557, + "grad_norm": 0.7320627570152283, + "learning_rate": 0.0004102611576368287, + "loss": 1.2107, + "step": 3122 + }, + { + "epoch": 0.5571313888145571, + "grad_norm": 0.5843806862831116, + "learning_rate": 0.0004102073782914153, + "loss": 0.844, + "step": 3123 + }, + { + "epoch": 0.5573097850325573, + "grad_norm": 0.7023711204528809, + "learning_rate": 0.00041015358636338343, + "loss": 1.1463, + "step": 3124 + }, + { + "epoch": 0.5574881812505574, + "grad_norm": 0.5064725875854492, + "learning_rate": 0.0004100997818569577, + "loss": 0.9558, + "step": 3125 + }, + { + "epoch": 0.5576665774685576, + "grad_norm": 21.4315128326416, + "learning_rate": 0.00041004596477636405, + "loss": 1.645, + "step": 3126 + }, + { + "epoch": 0.5578449736865578, + "grad_norm": 0.8198938965797424, + "learning_rate": 0.0004099921351258292, + "loss": 0.9687, + "step": 3127 + }, + { + "epoch": 0.558023369904558, + "grad_norm": 8.272482872009277, + "learning_rate": 0.00040993829290958086, + "loss": 1.1445, + "step": 3128 + }, + { + "epoch": 0.5582017661225582, + "grad_norm": 0.8829731941223145, + "learning_rate": 0.0004098844381318478, + "loss": 1.2275, + "step": 3129 + }, + { + "epoch": 0.5583801623405584, + "grad_norm": 1.6794313192367554, + "learning_rate": 0.00040983057079685984, + "loss": 1.118, + "step": 3130 + }, + { + "epoch": 0.5585585585585585, + "grad_norm": 1.177979826927185, + "learning_rate": 0.0004097766909088476, + "loss": 1.0017, + "step": 3131 + }, + { + "epoch": 0.5587369547765587, + "grad_norm": 2.6292262077331543, + "learning_rate": 0.0004097227984720429, + "loss": 0.9328, + "step": 3132 + }, + { + "epoch": 0.5589153509945589, + "grad_norm": 1.0025197267532349, + "learning_rate": 0.0004096688934906782, + "loss": 1.0593, + "step": 3133 + }, + { + "epoch": 0.5590937472125591, + "grad_norm": 0.5532941222190857, + "learning_rate": 0.0004096149759689874, + "loss": 1.1529, + "step": 3134 + }, + { + "epoch": 0.5592721434305593, + "grad_norm": 0.6335615515708923, + "learning_rate": 0.00040956104591120503, + "loss": 1.0475, + "step": 3135 + }, + { + "epoch": 0.5594505396485594, + "grad_norm": 0.663866400718689, + "learning_rate": 0.00040950710332156683, + "loss": 0.8865, + "step": 3136 + }, + { + "epoch": 0.5596289358665596, + "grad_norm": 0.5401614308357239, + "learning_rate": 0.00040945314820430934, + "loss": 0.9367, + "step": 3137 + }, + { + "epoch": 0.5598073320845598, + "grad_norm": 0.5373167991638184, + "learning_rate": 0.0004093991805636702, + "loss": 0.8718, + "step": 3138 + }, + { + "epoch": 0.55998572830256, + "grad_norm": 0.5260427594184875, + "learning_rate": 0.00040934520040388807, + "loss": 0.911, + "step": 3139 + }, + { + "epoch": 0.5601641245205602, + "grad_norm": 0.5970147252082825, + "learning_rate": 0.00040929120772920243, + "loss": 1.0448, + "step": 3140 + }, + { + "epoch": 0.5603425207385604, + "grad_norm": 0.5675042271614075, + "learning_rate": 0.0004092372025438539, + "loss": 1.1786, + "step": 3141 + }, + { + "epoch": 0.5605209169565605, + "grad_norm": 0.5005537867546082, + "learning_rate": 0.0004091831848520839, + "loss": 0.827, + "step": 3142 + }, + { + "epoch": 0.5606993131745607, + "grad_norm": 0.5521116256713867, + "learning_rate": 0.00040912915465813525, + "loss": 0.8586, + "step": 3143 + }, + { + "epoch": 0.5608777093925609, + "grad_norm": 0.5322579145431519, + "learning_rate": 0.0004090751119662511, + "loss": 0.9931, + "step": 3144 + }, + { + "epoch": 0.5610561056105611, + "grad_norm": 0.5480323433876038, + "learning_rate": 0.00040902105678067627, + "loss": 1.031, + "step": 3145 + }, + { + "epoch": 0.5612345018285613, + "grad_norm": 0.5208452343940735, + "learning_rate": 0.00040896698910565597, + "loss": 1.0623, + "step": 3146 + }, + { + "epoch": 0.5614128980465614, + "grad_norm": 0.5295253992080688, + "learning_rate": 0.00040891290894543676, + "loss": 0.9008, + "step": 3147 + }, + { + "epoch": 0.5615912942645616, + "grad_norm": 0.521321713924408, + "learning_rate": 0.00040885881630426616, + "loss": 1.0494, + "step": 3148 + }, + { + "epoch": 0.5617696904825618, + "grad_norm": 0.49271395802497864, + "learning_rate": 0.0004088047111863924, + "loss": 0.8965, + "step": 3149 + }, + { + "epoch": 0.561948086700562, + "grad_norm": 0.5301600098609924, + "learning_rate": 0.000408750593596065, + "loss": 1.0071, + "step": 3150 + }, + { + "epoch": 0.5621264829185622, + "grad_norm": 0.5512881875038147, + "learning_rate": 0.0004086964635375342, + "loss": 1.0579, + "step": 3151 + }, + { + "epoch": 0.5623048791365624, + "grad_norm": 0.5429875254631042, + "learning_rate": 0.00040864232101505153, + "loss": 0.9885, + "step": 3152 + }, + { + "epoch": 0.5624832753545624, + "grad_norm": 0.4952228367328644, + "learning_rate": 0.00040858816603286924, + "loss": 0.8444, + "step": 3153 + }, + { + "epoch": 0.5626616715725626, + "grad_norm": 0.6742690205574036, + "learning_rate": 0.00040853399859524066, + "loss": 1.0294, + "step": 3154 + }, + { + "epoch": 0.5628400677905628, + "grad_norm": 0.5073679685592651, + "learning_rate": 0.00040847981870642004, + "loss": 1.0393, + "step": 3155 + }, + { + "epoch": 0.563018464008563, + "grad_norm": 0.5249845385551453, + "learning_rate": 0.0004084256263706626, + "loss": 1.0371, + "step": 3156 + }, + { + "epoch": 0.5631968602265632, + "grad_norm": 0.5406726002693176, + "learning_rate": 0.00040837142159222466, + "loss": 1.096, + "step": 3157 + }, + { + "epoch": 0.5633752564445633, + "grad_norm": 0.4763408899307251, + "learning_rate": 0.0004083172043753635, + "loss": 0.948, + "step": 3158 + }, + { + "epoch": 0.5635536526625635, + "grad_norm": 0.5455912947654724, + "learning_rate": 0.0004082629747243371, + "loss": 0.9939, + "step": 3159 + }, + { + "epoch": 0.5637320488805637, + "grad_norm": 0.5416297316551208, + "learning_rate": 0.00040820873264340484, + "loss": 0.823, + "step": 3160 + }, + { + "epoch": 0.5639104450985639, + "grad_norm": 0.5497506856918335, + "learning_rate": 0.0004081544781368268, + "loss": 0.8989, + "step": 3161 + }, + { + "epoch": 0.5640888413165641, + "grad_norm": 0.8488923907279968, + "learning_rate": 0.0004081002112088641, + "loss": 0.869, + "step": 3162 + }, + { + "epoch": 0.5642672375345643, + "grad_norm": 0.539082944393158, + "learning_rate": 0.0004080459318637789, + "loss": 1.0027, + "step": 3163 + }, + { + "epoch": 0.5644456337525644, + "grad_norm": 0.6145626306533813, + "learning_rate": 0.0004079916401058342, + "loss": 1.1312, + "step": 3164 + }, + { + "epoch": 0.5646240299705646, + "grad_norm": 0.5471964478492737, + "learning_rate": 0.00040793733593929405, + "loss": 1.0311, + "step": 3165 + }, + { + "epoch": 0.5648024261885648, + "grad_norm": 1.0628180503845215, + "learning_rate": 0.00040788301936842353, + "loss": 0.9842, + "step": 3166 + }, + { + "epoch": 0.564980822406565, + "grad_norm": 0.6143336892127991, + "learning_rate": 0.00040782869039748847, + "loss": 1.0282, + "step": 3167 + }, + { + "epoch": 0.5651592186245652, + "grad_norm": 0.4468381702899933, + "learning_rate": 0.0004077743490307562, + "loss": 0.6927, + "step": 3168 + }, + { + "epoch": 0.5653376148425653, + "grad_norm": 0.524698793888092, + "learning_rate": 0.0004077199952724944, + "loss": 1.0529, + "step": 3169 + }, + { + "epoch": 0.5655160110605655, + "grad_norm": 0.4760201573371887, + "learning_rate": 0.0004076656291269719, + "loss": 0.8662, + "step": 3170 + }, + { + "epoch": 0.5656944072785657, + "grad_norm": 0.5162889957427979, + "learning_rate": 0.00040761125059845887, + "loss": 0.9204, + "step": 3171 + }, + { + "epoch": 0.5658728034965659, + "grad_norm": 0.5113767981529236, + "learning_rate": 0.00040755685969122603, + "loss": 1.0988, + "step": 3172 + }, + { + "epoch": 0.5660511997145661, + "grad_norm": 0.5655469298362732, + "learning_rate": 0.0004075024564095452, + "loss": 0.918, + "step": 3173 + }, + { + "epoch": 0.5662295959325663, + "grad_norm": 0.5044064521789551, + "learning_rate": 0.0004074480407576892, + "loss": 1.1065, + "step": 3174 + }, + { + "epoch": 0.5664079921505664, + "grad_norm": 0.5524999499320984, + "learning_rate": 0.0004073936127399319, + "loss": 1.0733, + "step": 3175 + }, + { + "epoch": 0.5665863883685666, + "grad_norm": 0.4410164952278137, + "learning_rate": 0.000407339172360548, + "loss": 0.84, + "step": 3176 + }, + { + "epoch": 0.5667647845865668, + "grad_norm": 0.5362854599952698, + "learning_rate": 0.0004072847196238131, + "loss": 0.9016, + "step": 3177 + }, + { + "epoch": 0.566943180804567, + "grad_norm": 1.1655226945877075, + "learning_rate": 0.0004072302545340041, + "loss": 1.0215, + "step": 3178 + }, + { + "epoch": 0.5671215770225672, + "grad_norm": 5.327235698699951, + "learning_rate": 0.00040717577709539857, + "loss": 1.6297, + "step": 3179 + }, + { + "epoch": 0.5672999732405672, + "grad_norm": 0.6369319558143616, + "learning_rate": 0.00040712128731227513, + "loss": 1.1747, + "step": 3180 + }, + { + "epoch": 0.5674783694585674, + "grad_norm": 0.5702166557312012, + "learning_rate": 0.0004070667851889134, + "loss": 1.0747, + "step": 3181 + }, + { + "epoch": 0.5676567656765676, + "grad_norm": 0.6028847694396973, + "learning_rate": 0.0004070122707295939, + "loss": 0.7787, + "step": 3182 + }, + { + "epoch": 0.5678351618945678, + "grad_norm": 1.2918931245803833, + "learning_rate": 0.0004069577439385982, + "loss": 1.7838, + "step": 3183 + }, + { + "epoch": 0.568013558112568, + "grad_norm": 0.5654813647270203, + "learning_rate": 0.00040690320482020893, + "loss": 0.9124, + "step": 3184 + }, + { + "epoch": 0.5681919543305682, + "grad_norm": 0.902948796749115, + "learning_rate": 0.00040684865337870945, + "loss": 0.8613, + "step": 3185 + }, + { + "epoch": 0.5683703505485683, + "grad_norm": 1.1347112655639648, + "learning_rate": 0.00040679408961838426, + "loss": 1.1453, + "step": 3186 + }, + { + "epoch": 0.5685487467665685, + "grad_norm": 0.5079895853996277, + "learning_rate": 0.0004067395135435187, + "loss": 0.8407, + "step": 3187 + }, + { + "epoch": 0.5687271429845687, + "grad_norm": 0.7240801453590393, + "learning_rate": 0.0004066849251583992, + "loss": 0.8808, + "step": 3188 + }, + { + "epoch": 0.5689055392025689, + "grad_norm": 0.5793579816818237, + "learning_rate": 0.0004066303244673132, + "loss": 0.915, + "step": 3189 + }, + { + "epoch": 0.5690839354205691, + "grad_norm": 1.3618862628936768, + "learning_rate": 0.00040657571147454877, + "loss": 0.9031, + "step": 3190 + }, + { + "epoch": 0.5692623316385692, + "grad_norm": 14.210494995117188, + "learning_rate": 0.0004065210861843954, + "loss": 1.538, + "step": 3191 + }, + { + "epoch": 0.5694407278565694, + "grad_norm": 2.7108030319213867, + "learning_rate": 0.0004064664486011433, + "loss": 0.8688, + "step": 3192 + }, + { + "epoch": 0.5696191240745696, + "grad_norm": 1.628589391708374, + "learning_rate": 0.0004064117987290836, + "loss": 1.0493, + "step": 3193 + }, + { + "epoch": 0.5697975202925698, + "grad_norm": 4.802014350891113, + "learning_rate": 0.0004063571365725086, + "loss": 1.0736, + "step": 3194 + }, + { + "epoch": 0.56997591651057, + "grad_norm": 0.9495110511779785, + "learning_rate": 0.00040630246213571136, + "loss": 0.8673, + "step": 3195 + }, + { + "epoch": 0.5701543127285702, + "grad_norm": 1.0658725500106812, + "learning_rate": 0.000406247775422986, + "loss": 1.0411, + "step": 3196 + }, + { + "epoch": 0.5703327089465703, + "grad_norm": 0.6458380222320557, + "learning_rate": 0.00040619307643862757, + "loss": 1.1536, + "step": 3197 + }, + { + "epoch": 0.5705111051645705, + "grad_norm": 91.1878662109375, + "learning_rate": 0.00040613836518693213, + "loss": 1.0854, + "step": 3198 + }, + { + "epoch": 0.5706895013825707, + "grad_norm": 1.0638850927352905, + "learning_rate": 0.0004060836416721968, + "loss": 0.9599, + "step": 3199 + }, + { + "epoch": 0.5708678976005709, + "grad_norm": 0.7722949981689453, + "learning_rate": 0.00040602890589871933, + "loss": 0.9146, + "step": 3200 + }, + { + "epoch": 0.5710462938185711, + "grad_norm": 1.2300078868865967, + "learning_rate": 0.0004059741578707987, + "loss": 0.9866, + "step": 3201 + }, + { + "epoch": 0.5712246900365712, + "grad_norm": 0.5897446870803833, + "learning_rate": 0.00040591939759273486, + "loss": 0.8663, + "step": 3202 + }, + { + "epoch": 0.5714030862545714, + "grad_norm": 1.1152317523956299, + "learning_rate": 0.0004058646250688287, + "loss": 1.1437, + "step": 3203 + }, + { + "epoch": 0.5715814824725716, + "grad_norm": 0.5559517741203308, + "learning_rate": 0.00040580984030338187, + "loss": 0.7943, + "step": 3204 + }, + { + "epoch": 0.5717598786905718, + "grad_norm": 0.8801344633102417, + "learning_rate": 0.0004057550433006972, + "loss": 1.3015, + "step": 3205 + }, + { + "epoch": 0.571938274908572, + "grad_norm": 0.9479907155036926, + "learning_rate": 0.00040570023406507857, + "loss": 0.8974, + "step": 3206 + }, + { + "epoch": 0.5721166711265722, + "grad_norm": 0.6277695298194885, + "learning_rate": 0.0004056454126008305, + "loss": 1.0662, + "step": 3207 + }, + { + "epoch": 0.5722950673445723, + "grad_norm": 0.476595014333725, + "learning_rate": 0.0004055905789122587, + "loss": 0.7801, + "step": 3208 + }, + { + "epoch": 0.5724734635625724, + "grad_norm": 0.6994301080703735, + "learning_rate": 0.00040553573300366986, + "loss": 1.223, + "step": 3209 + }, + { + "epoch": 0.5726518597805726, + "grad_norm": 0.8489440679550171, + "learning_rate": 0.0004054808748793714, + "loss": 0.863, + "step": 3210 + }, + { + "epoch": 0.5728302559985728, + "grad_norm": 0.8950226902961731, + "learning_rate": 0.00040542600454367193, + "loss": 0.9557, + "step": 3211 + }, + { + "epoch": 0.573008652216573, + "grad_norm": 0.6158692240715027, + "learning_rate": 0.000405371122000881, + "loss": 1.0143, + "step": 3212 + }, + { + "epoch": 0.5731870484345731, + "grad_norm": 27.874290466308594, + "learning_rate": 0.00040531622725530894, + "loss": 0.8735, + "step": 3213 + }, + { + "epoch": 0.5733654446525733, + "grad_norm": 33.128963470458984, + "learning_rate": 0.0004052613203112673, + "loss": 1.7168, + "step": 3214 + }, + { + "epoch": 0.5735438408705735, + "grad_norm": 2.284363031387329, + "learning_rate": 0.0004052064011730684, + "loss": 0.9891, + "step": 3215 + }, + { + "epoch": 0.5737222370885737, + "grad_norm": 0.8609527945518494, + "learning_rate": 0.0004051514698450255, + "loss": 0.9398, + "step": 3216 + }, + { + "epoch": 0.5739006333065739, + "grad_norm": 0.9100723266601562, + "learning_rate": 0.0004050965263314529, + "loss": 0.7602, + "step": 3217 + }, + { + "epoch": 0.5740790295245741, + "grad_norm": 8.203483581542969, + "learning_rate": 0.0004050415706366659, + "loss": 0.9475, + "step": 3218 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.828872561454773, + "learning_rate": 0.0004049866027649807, + "loss": 0.9817, + "step": 3219 + }, + { + "epoch": 0.5744358219605744, + "grad_norm": 0.49633899331092834, + "learning_rate": 0.00040493162272071427, + "loss": 0.5957, + "step": 3220 + }, + { + "epoch": 0.5746142181785746, + "grad_norm": 4.844064712524414, + "learning_rate": 0.000404876630508185, + "loss": 1.3244, + "step": 3221 + }, + { + "epoch": 0.5747926143965748, + "grad_norm": 0.8198102116584778, + "learning_rate": 0.00040482162613171167, + "loss": 1.0323, + "step": 3222 + }, + { + "epoch": 0.574971010614575, + "grad_norm": 1.0812023878097534, + "learning_rate": 0.00040476660959561464, + "loss": 1.2488, + "step": 3223 + }, + { + "epoch": 0.5751494068325751, + "grad_norm": 0.8954762816429138, + "learning_rate": 0.0004047115809042146, + "loss": 0.7903, + "step": 3224 + }, + { + "epoch": 0.5753278030505753, + "grad_norm": 2.91465163230896, + "learning_rate": 0.0004046565400618336, + "loss": 1.1885, + "step": 3225 + }, + { + "epoch": 0.5755061992685755, + "grad_norm": 1.1171669960021973, + "learning_rate": 0.0004046014870727944, + "loss": 1.0482, + "step": 3226 + }, + { + "epoch": 0.5756845954865757, + "grad_norm": 0.8723791837692261, + "learning_rate": 0.0004045464219414211, + "loss": 0.8968, + "step": 3227 + }, + { + "epoch": 0.5758629917045759, + "grad_norm": 0.8948122262954712, + "learning_rate": 0.0004044913446720382, + "loss": 0.9063, + "step": 3228 + }, + { + "epoch": 0.5760413879225761, + "grad_norm": 0.7297793626785278, + "learning_rate": 0.0004044362552689716, + "loss": 0.9518, + "step": 3229 + }, + { + "epoch": 0.5762197841405762, + "grad_norm": 0.7813421487808228, + "learning_rate": 0.000404381153736548, + "loss": 0.923, + "step": 3230 + }, + { + "epoch": 0.5763981803585764, + "grad_norm": 0.5343239307403564, + "learning_rate": 0.00040432604007909504, + "loss": 0.9697, + "step": 3231 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 0.5333415269851685, + "learning_rate": 0.0004042709143009412, + "loss": 0.7906, + "step": 3232 + }, + { + "epoch": 0.5767549727945768, + "grad_norm": 0.5309287309646606, + "learning_rate": 0.0004042157764064163, + "loss": 0.8805, + "step": 3233 + }, + { + "epoch": 0.576933369012577, + "grad_norm": 0.8856096863746643, + "learning_rate": 0.00040416062639985053, + "loss": 0.9808, + "step": 3234 + }, + { + "epoch": 0.5771117652305772, + "grad_norm": 0.6096811890602112, + "learning_rate": 0.0004041054642855756, + "loss": 0.8629, + "step": 3235 + }, + { + "epoch": 0.5772901614485773, + "grad_norm": 0.5578605532646179, + "learning_rate": 0.0004040502900679237, + "loss": 1.0188, + "step": 3236 + }, + { + "epoch": 0.5774685576665775, + "grad_norm": 0.9917730689048767, + "learning_rate": 0.0004039951037512284, + "loss": 0.9557, + "step": 3237 + }, + { + "epoch": 0.5776469538845777, + "grad_norm": 0.8114546537399292, + "learning_rate": 0.00040393990533982397, + "loss": 0.9911, + "step": 3238 + }, + { + "epoch": 0.5778253501025779, + "grad_norm": 0.6181529760360718, + "learning_rate": 0.0004038846948380456, + "loss": 0.9745, + "step": 3239 + }, + { + "epoch": 0.578003746320578, + "grad_norm": 1.0224014520645142, + "learning_rate": 0.00040382947225022945, + "loss": 1.1888, + "step": 3240 + }, + { + "epoch": 0.5781821425385781, + "grad_norm": 0.6258061528205872, + "learning_rate": 0.0004037742375807127, + "loss": 0.9421, + "step": 3241 + }, + { + "epoch": 0.5783605387565783, + "grad_norm": 1.0917785167694092, + "learning_rate": 0.00040371899083383367, + "loss": 1.0569, + "step": 3242 + }, + { + "epoch": 0.5785389349745785, + "grad_norm": 0.6143338084220886, + "learning_rate": 0.00040366373201393115, + "loss": 1.0274, + "step": 3243 + }, + { + "epoch": 0.5787173311925787, + "grad_norm": 1.4802119731903076, + "learning_rate": 0.00040360846112534533, + "loss": 1.0424, + "step": 3244 + }, + { + "epoch": 0.5788957274105789, + "grad_norm": 7.478890895843506, + "learning_rate": 0.000403553178172417, + "loss": 2.1221, + "step": 3245 + }, + { + "epoch": 0.5790741236285791, + "grad_norm": 0.7259086966514587, + "learning_rate": 0.0004034978831594881, + "loss": 0.9648, + "step": 3246 + }, + { + "epoch": 0.5792525198465792, + "grad_norm": 0.8427548408508301, + "learning_rate": 0.00040344257609090155, + "loss": 0.8322, + "step": 3247 + }, + { + "epoch": 0.5794309160645794, + "grad_norm": 0.5752832889556885, + "learning_rate": 0.0004033872569710011, + "loss": 1.046, + "step": 3248 + }, + { + "epoch": 0.5796093122825796, + "grad_norm": 0.8446690440177917, + "learning_rate": 0.0004033319258041316, + "loss": 1.0119, + "step": 3249 + }, + { + "epoch": 0.5797877085005798, + "grad_norm": 0.5777229070663452, + "learning_rate": 0.0004032765825946385, + "loss": 0.9398, + "step": 3250 + }, + { + "epoch": 0.57996610471858, + "grad_norm": 1.039317011833191, + "learning_rate": 0.0004032212273468686, + "loss": 0.9415, + "step": 3251 + }, + { + "epoch": 0.5801445009365801, + "grad_norm": 0.9003728628158569, + "learning_rate": 0.0004031658600651694, + "loss": 1.0614, + "step": 3252 + }, + { + "epoch": 0.5803228971545803, + "grad_norm": 0.5967848896980286, + "learning_rate": 0.0004031104807538896, + "loss": 0.8657, + "step": 3253 + }, + { + "epoch": 0.5805012933725805, + "grad_norm": 0.9425020813941956, + "learning_rate": 0.0004030550894173783, + "loss": 1.0047, + "step": 3254 + }, + { + "epoch": 0.5806796895905807, + "grad_norm": 0.7595664262771606, + "learning_rate": 0.0004029996860599864, + "loss": 0.9598, + "step": 3255 + }, + { + "epoch": 0.5808580858085809, + "grad_norm": 0.629330039024353, + "learning_rate": 0.0004029442706860649, + "loss": 1.0411, + "step": 3256 + }, + { + "epoch": 0.5810364820265811, + "grad_norm": 0.588629961013794, + "learning_rate": 0.0004028888432999661, + "loss": 1.0998, + "step": 3257 + }, + { + "epoch": 0.5812148782445812, + "grad_norm": 0.8526011109352112, + "learning_rate": 0.0004028334039060434, + "loss": 1.2057, + "step": 3258 + }, + { + "epoch": 0.5813932744625814, + "grad_norm": 1.3286303281784058, + "learning_rate": 0.00040277795250865094, + "loss": 0.8182, + "step": 3259 + }, + { + "epoch": 0.5815716706805816, + "grad_norm": 0.985548734664917, + "learning_rate": 0.0004027224891121438, + "loss": 1.0537, + "step": 3260 + }, + { + "epoch": 0.5817500668985818, + "grad_norm": 0.6656898856163025, + "learning_rate": 0.0004026670137208782, + "loss": 0.7234, + "step": 3261 + }, + { + "epoch": 0.581928463116582, + "grad_norm": 1.2689619064331055, + "learning_rate": 0.00040261152633921097, + "loss": 1.115, + "step": 3262 + }, + { + "epoch": 0.5821068593345821, + "grad_norm": 1.3990188837051392, + "learning_rate": 0.00040255602697150005, + "loss": 1.0482, + "step": 3263 + }, + { + "epoch": 0.5822852555525823, + "grad_norm": 0.6655217409133911, + "learning_rate": 0.00040250051562210456, + "loss": 1.0004, + "step": 3264 + }, + { + "epoch": 0.5824636517705825, + "grad_norm": 0.6590464115142822, + "learning_rate": 0.0004024449922953841, + "loss": 1.2517, + "step": 3265 + }, + { + "epoch": 0.5826420479885827, + "grad_norm": 1.0634833574295044, + "learning_rate": 0.0004023894569956996, + "loss": 1.582, + "step": 3266 + }, + { + "epoch": 0.5828204442065829, + "grad_norm": 0.7259156703948975, + "learning_rate": 0.00040233390972741276, + "loss": 1.0507, + "step": 3267 + }, + { + "epoch": 0.582998840424583, + "grad_norm": 0.5403652191162109, + "learning_rate": 0.00040227835049488615, + "loss": 0.8231, + "step": 3268 + }, + { + "epoch": 0.5831772366425831, + "grad_norm": 0.6394296288490295, + "learning_rate": 0.0004022227793024834, + "loss": 0.9511, + "step": 3269 + }, + { + "epoch": 0.5833556328605833, + "grad_norm": 0.8414486050605774, + "learning_rate": 0.0004021671961545691, + "loss": 1.097, + "step": 3270 + }, + { + "epoch": 0.5835340290785835, + "grad_norm": 1.2080618143081665, + "learning_rate": 0.0004021116010555087, + "loss": 0.9097, + "step": 3271 + }, + { + "epoch": 0.5837124252965837, + "grad_norm": 0.6970852613449097, + "learning_rate": 0.00040205599400966864, + "loss": 0.9379, + "step": 3272 + }, + { + "epoch": 0.5838908215145839, + "grad_norm": 0.7079569101333618, + "learning_rate": 0.00040200037502141617, + "loss": 1.0276, + "step": 3273 + }, + { + "epoch": 0.584069217732584, + "grad_norm": 0.6691045761108398, + "learning_rate": 0.0004019447440951197, + "loss": 1.1062, + "step": 3274 + }, + { + "epoch": 0.5842476139505842, + "grad_norm": 0.7754026055335999, + "learning_rate": 0.0004018891012351484, + "loss": 1.0128, + "step": 3275 + }, + { + "epoch": 0.5844260101685844, + "grad_norm": 0.774585485458374, + "learning_rate": 0.0004018334464458725, + "loss": 0.9005, + "step": 3276 + }, + { + "epoch": 0.5846044063865846, + "grad_norm": 0.6378253698348999, + "learning_rate": 0.000401777779731663, + "loss": 1.1857, + "step": 3277 + }, + { + "epoch": 0.5847828026045848, + "grad_norm": 0.6450658440589905, + "learning_rate": 0.00040172210109689206, + "loss": 0.7742, + "step": 3278 + }, + { + "epoch": 0.584961198822585, + "grad_norm": 0.6561940908432007, + "learning_rate": 0.00040166641054593255, + "loss": 1.1851, + "step": 3279 + }, + { + "epoch": 0.5851395950405851, + "grad_norm": 0.5854855179786682, + "learning_rate": 0.0004016107080831584, + "loss": 0.9683, + "step": 3280 + }, + { + "epoch": 0.5853179912585853, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.00040155499371294454, + "loss": 0.9314, + "step": 3281 + }, + { + "epoch": 0.5854963874765855, + "grad_norm": 0.8320233225822449, + "learning_rate": 0.0004014992674396666, + "loss": 0.7973, + "step": 3282 + }, + { + "epoch": 0.5856747836945857, + "grad_norm": 0.6969453692436218, + "learning_rate": 0.00040144352926770147, + "loss": 0.9077, + "step": 3283 + }, + { + "epoch": 0.5858531799125859, + "grad_norm": 0.9570133686065674, + "learning_rate": 0.0004013877792014267, + "loss": 1.0072, + "step": 3284 + }, + { + "epoch": 0.586031576130586, + "grad_norm": 0.6976554989814758, + "learning_rate": 0.0004013320172452209, + "loss": 1.054, + "step": 3285 + }, + { + "epoch": 0.5862099723485862, + "grad_norm": 0.6149062514305115, + "learning_rate": 0.00040127624340346356, + "loss": 0.9388, + "step": 3286 + }, + { + "epoch": 0.5863883685665864, + "grad_norm": 0.517910897731781, + "learning_rate": 0.0004012204576805352, + "loss": 0.9243, + "step": 3287 + }, + { + "epoch": 0.5865667647845866, + "grad_norm": 0.5356748700141907, + "learning_rate": 0.0004011646600808172, + "loss": 0.9906, + "step": 3288 + }, + { + "epoch": 0.5867451610025868, + "grad_norm": 0.5061875581741333, + "learning_rate": 0.00040110885060869173, + "loss": 1.0385, + "step": 3289 + }, + { + "epoch": 0.586923557220587, + "grad_norm": 0.6880626082420349, + "learning_rate": 0.00040105302926854224, + "loss": 0.8258, + "step": 3290 + }, + { + "epoch": 0.5871019534385871, + "grad_norm": 0.4615683853626251, + "learning_rate": 0.00040099719606475286, + "loss": 0.8729, + "step": 3291 + }, + { + "epoch": 0.5872803496565873, + "grad_norm": 0.7369007468223572, + "learning_rate": 0.0004009413510017087, + "loss": 0.9377, + "step": 3292 + }, + { + "epoch": 0.5874587458745875, + "grad_norm": 0.5646438598632812, + "learning_rate": 0.0004008854940837957, + "loss": 1.1106, + "step": 3293 + }, + { + "epoch": 0.5876371420925877, + "grad_norm": 0.5093971490859985, + "learning_rate": 0.000400829625315401, + "loss": 0.9263, + "step": 3294 + }, + { + "epoch": 0.5878155383105879, + "grad_norm": 0.8065229058265686, + "learning_rate": 0.00040077374470091237, + "loss": 0.7977, + "step": 3295 + }, + { + "epoch": 0.587993934528588, + "grad_norm": 1.8008029460906982, + "learning_rate": 0.0004007178522447188, + "loss": 1.1636, + "step": 3296 + }, + { + "epoch": 0.5881723307465881, + "grad_norm": 0.5000921487808228, + "learning_rate": 0.00040066194795120984, + "loss": 1.0567, + "step": 3297 + }, + { + "epoch": 0.5883507269645883, + "grad_norm": 0.5407942533493042, + "learning_rate": 0.0004006060318247764, + "loss": 1.1229, + "step": 3298 + }, + { + "epoch": 0.5885291231825885, + "grad_norm": 0.8123578429222107, + "learning_rate": 0.00040055010386981006, + "loss": 1.0442, + "step": 3299 + }, + { + "epoch": 0.5887075194005887, + "grad_norm": 0.5218117237091064, + "learning_rate": 0.00040049416409070326, + "loss": 0.8096, + "step": 3300 + }, + { + "epoch": 0.5888859156185889, + "grad_norm": 0.5112317204475403, + "learning_rate": 0.0004004382124918497, + "loss": 0.9776, + "step": 3301 + }, + { + "epoch": 0.589064311836589, + "grad_norm": 0.539185643196106, + "learning_rate": 0.00040038224907764356, + "loss": 1.0191, + "step": 3302 + }, + { + "epoch": 0.5892427080545892, + "grad_norm": 0.5724341869354248, + "learning_rate": 0.0004003262738524804, + "loss": 1.1246, + "step": 3303 + }, + { + "epoch": 0.5894211042725894, + "grad_norm": 0.568626880645752, + "learning_rate": 0.00040027028682075626, + "loss": 1.0875, + "step": 3304 + }, + { + "epoch": 0.5895995004905896, + "grad_norm": 0.48848995566368103, + "learning_rate": 0.00040021428798686854, + "loss": 0.8293, + "step": 3305 + }, + { + "epoch": 0.5897778967085898, + "grad_norm": 0.5772598385810852, + "learning_rate": 0.00040015827735521525, + "loss": 0.8439, + "step": 3306 + }, + { + "epoch": 0.5899562929265899, + "grad_norm": 1.8581256866455078, + "learning_rate": 0.0004001022549301955, + "loss": 1.0762, + "step": 3307 + }, + { + "epoch": 0.5901346891445901, + "grad_norm": 0.6715808510780334, + "learning_rate": 0.00040004622071620924, + "loss": 0.8739, + "step": 3308 + }, + { + "epoch": 0.5903130853625903, + "grad_norm": 0.5865873694419861, + "learning_rate": 0.00039999017471765736, + "loss": 1.2157, + "step": 3309 + }, + { + "epoch": 0.5904914815805905, + "grad_norm": 0.6115093231201172, + "learning_rate": 0.0003999341169389417, + "loss": 0.9827, + "step": 3310 + }, + { + "epoch": 0.5906698777985907, + "grad_norm": 1.7687232494354248, + "learning_rate": 0.0003998780473844651, + "loss": 0.7466, + "step": 3311 + }, + { + "epoch": 0.5908482740165909, + "grad_norm": 0.5366143584251404, + "learning_rate": 0.00039982196605863095, + "loss": 0.6974, + "step": 3312 + }, + { + "epoch": 0.591026670234591, + "grad_norm": 0.48479849100112915, + "learning_rate": 0.0003997658729658442, + "loss": 0.8932, + "step": 3313 + }, + { + "epoch": 0.5912050664525912, + "grad_norm": 0.5348507761955261, + "learning_rate": 0.0003997097681105103, + "loss": 1.0747, + "step": 3314 + }, + { + "epoch": 0.5913834626705914, + "grad_norm": 0.5859421491622925, + "learning_rate": 0.00039965365149703555, + "loss": 0.9456, + "step": 3315 + }, + { + "epoch": 0.5915618588885916, + "grad_norm": 0.5688529014587402, + "learning_rate": 0.00039959752312982745, + "loss": 0.985, + "step": 3316 + }, + { + "epoch": 0.5917402551065918, + "grad_norm": 0.5745177865028381, + "learning_rate": 0.00039954138301329426, + "loss": 1.0356, + "step": 3317 + }, + { + "epoch": 0.5919186513245919, + "grad_norm": 0.45974186062812805, + "learning_rate": 0.00039948523115184516, + "loss": 0.8968, + "step": 3318 + }, + { + "epoch": 0.5920970475425921, + "grad_norm": 0.5509001612663269, + "learning_rate": 0.00039942906754989035, + "loss": 1.0219, + "step": 3319 + }, + { + "epoch": 0.5922754437605923, + "grad_norm": 3.9084794521331787, + "learning_rate": 0.0003993728922118408, + "loss": 1.1745, + "step": 3320 + }, + { + "epoch": 0.5924538399785925, + "grad_norm": 20.251436233520508, + "learning_rate": 0.0003993167051421087, + "loss": 0.842, + "step": 3321 + }, + { + "epoch": 0.5926322361965927, + "grad_norm": 0.7031044363975525, + "learning_rate": 0.0003992605063451068, + "loss": 0.9488, + "step": 3322 + }, + { + "epoch": 0.5928106324145929, + "grad_norm": 0.5919948816299438, + "learning_rate": 0.00039920429582524896, + "loss": 1.0003, + "step": 3323 + }, + { + "epoch": 0.592989028632593, + "grad_norm": 1.6224722862243652, + "learning_rate": 0.00039914807358694995, + "loss": 1.2006, + "step": 3324 + }, + { + "epoch": 0.5931674248505932, + "grad_norm": 0.5909742712974548, + "learning_rate": 0.00039909183963462535, + "loss": 0.9017, + "step": 3325 + }, + { + "epoch": 0.5933458210685933, + "grad_norm": 0.5360785126686096, + "learning_rate": 0.0003990355939726919, + "loss": 0.9806, + "step": 3326 + }, + { + "epoch": 0.5935242172865935, + "grad_norm": 0.6022999286651611, + "learning_rate": 0.00039897933660556703, + "loss": 1.1784, + "step": 3327 + }, + { + "epoch": 0.5937026135045937, + "grad_norm": 0.5530278086662292, + "learning_rate": 0.0003989230675376691, + "loss": 0.7749, + "step": 3328 + }, + { + "epoch": 0.5938810097225938, + "grad_norm": 0.48187515139579773, + "learning_rate": 0.0003988667867734176, + "loss": 0.8854, + "step": 3329 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.5293228030204773, + "learning_rate": 0.0003988104943172327, + "loss": 0.9786, + "step": 3330 + }, + { + "epoch": 0.5942378021585942, + "grad_norm": 1.1052613258361816, + "learning_rate": 0.00039875419017353564, + "loss": 1.1643, + "step": 3331 + }, + { + "epoch": 0.5944161983765944, + "grad_norm": 0.663966953754425, + "learning_rate": 0.00039869787434674853, + "loss": 1.2176, + "step": 3332 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 0.6256305575370789, + "learning_rate": 0.0003986415468412943, + "loss": 1.0519, + "step": 3333 + }, + { + "epoch": 0.5947729908125948, + "grad_norm": 14.08318042755127, + "learning_rate": 0.00039858520766159703, + "loss": 1.3631, + "step": 3334 + }, + { + "epoch": 0.5949513870305949, + "grad_norm": 0.6284887790679932, + "learning_rate": 0.00039852885681208134, + "loss": 0.8746, + "step": 3335 + }, + { + "epoch": 0.5951297832485951, + "grad_norm": 0.9674341678619385, + "learning_rate": 0.00039847249429717326, + "loss": 1.2364, + "step": 3336 + }, + { + "epoch": 0.5953081794665953, + "grad_norm": 0.6534430980682373, + "learning_rate": 0.00039841612012129937, + "loss": 1.1151, + "step": 3337 + }, + { + "epoch": 0.5954865756845955, + "grad_norm": 0.5643439888954163, + "learning_rate": 0.0003983597342888872, + "loss": 0.8678, + "step": 3338 + }, + { + "epoch": 0.5956649719025957, + "grad_norm": 0.5396862626075745, + "learning_rate": 0.0003983033368043654, + "loss": 0.9284, + "step": 3339 + }, + { + "epoch": 0.5958433681205958, + "grad_norm": 0.6234003305435181, + "learning_rate": 0.00039824692767216337, + "loss": 1.0596, + "step": 3340 + }, + { + "epoch": 0.596021764338596, + "grad_norm": 0.5749251246452332, + "learning_rate": 0.00039819050689671143, + "loss": 0.8237, + "step": 3341 + }, + { + "epoch": 0.5962001605565962, + "grad_norm": 0.5611945390701294, + "learning_rate": 0.0003981340744824408, + "loss": 0.9064, + "step": 3342 + }, + { + "epoch": 0.5963785567745964, + "grad_norm": 0.6729212403297424, + "learning_rate": 0.0003980776304337838, + "loss": 1.1313, + "step": 3343 + }, + { + "epoch": 0.5965569529925966, + "grad_norm": 0.5482896566390991, + "learning_rate": 0.0003980211747551733, + "loss": 0.9448, + "step": 3344 + }, + { + "epoch": 0.5967353492105968, + "grad_norm": 44.00550842285156, + "learning_rate": 0.0003979647074510435, + "loss": 1.6368, + "step": 3345 + }, + { + "epoch": 0.5969137454285969, + "grad_norm": 0.5960306525230408, + "learning_rate": 0.00039790822852582927, + "loss": 1.0672, + "step": 3346 + }, + { + "epoch": 0.5970921416465971, + "grad_norm": 6.083531856536865, + "learning_rate": 0.00039785173798396637, + "loss": 1.4715, + "step": 3347 + }, + { + "epoch": 0.5972705378645973, + "grad_norm": 0.8834306597709656, + "learning_rate": 0.00039779523582989163, + "loss": 1.1152, + "step": 3348 + }, + { + "epoch": 0.5974489340825975, + "grad_norm": 1.775227427482605, + "learning_rate": 0.0003977387220680427, + "loss": 1.1541, + "step": 3349 + }, + { + "epoch": 0.5976273303005977, + "grad_norm": 0.7069253325462341, + "learning_rate": 0.00039768219670285805, + "loss": 0.889, + "step": 3350 + }, + { + "epoch": 0.5978057265185978, + "grad_norm": 1.1338763236999512, + "learning_rate": 0.00039762565973877726, + "loss": 0.8112, + "step": 3351 + }, + { + "epoch": 0.597984122736598, + "grad_norm": 1.5388410091400146, + "learning_rate": 0.00039756911118024065, + "loss": 1.0269, + "step": 3352 + }, + { + "epoch": 0.5981625189545982, + "grad_norm": 0.7311750650405884, + "learning_rate": 0.0003975125510316896, + "loss": 0.9498, + "step": 3353 + }, + { + "epoch": 0.5983409151725984, + "grad_norm": 2.2909646034240723, + "learning_rate": 0.0003974559792975663, + "loss": 0.9101, + "step": 3354 + }, + { + "epoch": 0.5985193113905986, + "grad_norm": 0.8901395201683044, + "learning_rate": 0.0003973993959823137, + "loss": 0.8863, + "step": 3355 + }, + { + "epoch": 0.5986977076085988, + "grad_norm": 0.6024214029312134, + "learning_rate": 0.00039734280109037613, + "loss": 1.0322, + "step": 3356 + }, + { + "epoch": 0.5988761038265988, + "grad_norm": 0.6770859956741333, + "learning_rate": 0.0003972861946261983, + "loss": 1.1135, + "step": 3357 + }, + { + "epoch": 0.599054500044599, + "grad_norm": 0.73415607213974, + "learning_rate": 0.0003972295765942261, + "loss": 0.9596, + "step": 3358 + }, + { + "epoch": 0.5992328962625992, + "grad_norm": 0.6480002403259277, + "learning_rate": 0.00039717294699890627, + "loss": 0.9741, + "step": 3359 + }, + { + "epoch": 0.5994112924805994, + "grad_norm": 1.2426977157592773, + "learning_rate": 0.0003971163058446866, + "loss": 1.0937, + "step": 3360 + }, + { + "epoch": 0.5995896886985996, + "grad_norm": 0.8961933851242065, + "learning_rate": 0.0003970596531360156, + "loss": 0.9281, + "step": 3361 + }, + { + "epoch": 0.5997680849165997, + "grad_norm": 0.7000132203102112, + "learning_rate": 0.00039700298887734273, + "loss": 1.0097, + "step": 3362 + }, + { + "epoch": 0.5999464811345999, + "grad_norm": 1.2038791179656982, + "learning_rate": 0.0003969463130731183, + "loss": 0.9242, + "step": 3363 + }, + { + "epoch": 0.6001248773526001, + "grad_norm": 1.661060094833374, + "learning_rate": 0.00039688962572779373, + "loss": 1.1603, + "step": 3364 + }, + { + "epoch": 0.6003032735706003, + "grad_norm": 1.0910298824310303, + "learning_rate": 0.0003968329268458212, + "loss": 0.9369, + "step": 3365 + }, + { + "epoch": 0.6004816697886005, + "grad_norm": 1.3606114387512207, + "learning_rate": 0.00039677621643165363, + "loss": 0.9468, + "step": 3366 + }, + { + "epoch": 0.6006600660066007, + "grad_norm": 1.111076831817627, + "learning_rate": 0.0003967194944897453, + "loss": 1.1383, + "step": 3367 + }, + { + "epoch": 0.6008384622246008, + "grad_norm": 1.8882251977920532, + "learning_rate": 0.000396662761024551, + "loss": 0.8431, + "step": 3368 + }, + { + "epoch": 0.601016858442601, + "grad_norm": 1.5878911018371582, + "learning_rate": 0.0003966060160405266, + "loss": 1.0407, + "step": 3369 + }, + { + "epoch": 0.6011952546606012, + "grad_norm": 0.7476792931556702, + "learning_rate": 0.00039654925954212873, + "loss": 0.7851, + "step": 3370 + }, + { + "epoch": 0.6013736508786014, + "grad_norm": 0.8636188507080078, + "learning_rate": 0.00039649249153381514, + "loss": 1.0617, + "step": 3371 + }, + { + "epoch": 0.6015520470966016, + "grad_norm": 0.7655645608901978, + "learning_rate": 0.00039643571202004426, + "loss": 0.9721, + "step": 3372 + }, + { + "epoch": 0.6017304433146017, + "grad_norm": 0.6459742188453674, + "learning_rate": 0.0003963789210052755, + "loss": 0.8403, + "step": 3373 + }, + { + "epoch": 0.6019088395326019, + "grad_norm": 0.7047243714332581, + "learning_rate": 0.00039632211849396936, + "loss": 1.1643, + "step": 3374 + }, + { + "epoch": 0.6020872357506021, + "grad_norm": 0.5050498247146606, + "learning_rate": 0.000396265304490587, + "loss": 0.7855, + "step": 3375 + }, + { + "epoch": 0.6022656319686023, + "grad_norm": 0.9648879766464233, + "learning_rate": 0.0003962084789995906, + "loss": 0.9511, + "step": 3376 + }, + { + "epoch": 0.6024440281866025, + "grad_norm": 0.5397990345954895, + "learning_rate": 0.00039615164202544314, + "loss": 0.8164, + "step": 3377 + }, + { + "epoch": 0.6026224244046027, + "grad_norm": 2.0958774089813232, + "learning_rate": 0.0003960947935726086, + "loss": 0.8944, + "step": 3378 + }, + { + "epoch": 0.6028008206226028, + "grad_norm": 1.3503490686416626, + "learning_rate": 0.00039603793364555184, + "loss": 0.8005, + "step": 3379 + }, + { + "epoch": 0.602979216840603, + "grad_norm": 0.5871055722236633, + "learning_rate": 0.00039598106224873866, + "loss": 0.9893, + "step": 3380 + }, + { + "epoch": 0.6031576130586032, + "grad_norm": 1.3168357610702515, + "learning_rate": 0.0003959241793866356, + "loss": 0.9926, + "step": 3381 + }, + { + "epoch": 0.6033360092766034, + "grad_norm": 0.8099937438964844, + "learning_rate": 0.0003958672850637103, + "loss": 0.9152, + "step": 3382 + }, + { + "epoch": 0.6035144054946036, + "grad_norm": 0.5617466568946838, + "learning_rate": 0.0003958103792844313, + "loss": 0.9704, + "step": 3383 + }, + { + "epoch": 0.6036928017126036, + "grad_norm": 0.758649468421936, + "learning_rate": 0.00039575346205326776, + "loss": 0.869, + "step": 3384 + }, + { + "epoch": 0.6038711979306038, + "grad_norm": 0.5572230219841003, + "learning_rate": 0.0003956965333746901, + "loss": 1.0961, + "step": 3385 + }, + { + "epoch": 0.604049594148604, + "grad_norm": 78.27342224121094, + "learning_rate": 0.00039563959325316934, + "loss": 1.2738, + "step": 3386 + }, + { + "epoch": 0.6042279903666042, + "grad_norm": 2.010958194732666, + "learning_rate": 0.00039558264169317766, + "loss": 0.9795, + "step": 3387 + }, + { + "epoch": 0.6044063865846044, + "grad_norm": 1.1752885580062866, + "learning_rate": 0.0003955256786991879, + "loss": 1.334, + "step": 3388 + }, + { + "epoch": 0.6045847828026046, + "grad_norm": 0.719290018081665, + "learning_rate": 0.0003954687042756739, + "loss": 0.9268, + "step": 3389 + }, + { + "epoch": 0.6047631790206047, + "grad_norm": 0.9729273915290833, + "learning_rate": 0.00039541171842711063, + "loss": 1.1478, + "step": 3390 + }, + { + "epoch": 0.6049415752386049, + "grad_norm": 0.5412781834602356, + "learning_rate": 0.00039535472115797345, + "loss": 1.0843, + "step": 3391 + }, + { + "epoch": 0.6051199714566051, + "grad_norm": 0.6292357444763184, + "learning_rate": 0.00039529771247273903, + "loss": 1.2036, + "step": 3392 + }, + { + "epoch": 0.6052983676746053, + "grad_norm": 0.617896556854248, + "learning_rate": 0.0003952406923758849, + "loss": 1.0214, + "step": 3393 + }, + { + "epoch": 0.6054767638926055, + "grad_norm": 0.4818667471408844, + "learning_rate": 0.00039518366087188924, + "loss": 1.0368, + "step": 3394 + }, + { + "epoch": 0.6056551601106056, + "grad_norm": 0.7148341536521912, + "learning_rate": 0.0003951266179652313, + "loss": 0.9985, + "step": 3395 + }, + { + "epoch": 0.6058335563286058, + "grad_norm": 1.9086631536483765, + "learning_rate": 0.0003950695636603912, + "loss": 1.0418, + "step": 3396 + }, + { + "epoch": 0.606011952546606, + "grad_norm": 0.5101553201675415, + "learning_rate": 0.00039501249796185006, + "loss": 0.7728, + "step": 3397 + }, + { + "epoch": 0.6061903487646062, + "grad_norm": 0.5302074551582336, + "learning_rate": 0.00039495542087408976, + "loss": 0.8685, + "step": 3398 + }, + { + "epoch": 0.6063687449826064, + "grad_norm": 0.6329625844955444, + "learning_rate": 0.000394898332401593, + "loss": 1.0125, + "step": 3399 + }, + { + "epoch": 0.6065471412006066, + "grad_norm": 0.6048588752746582, + "learning_rate": 0.0003948412325488436, + "loss": 1.1885, + "step": 3400 + }, + { + "epoch": 0.6067255374186067, + "grad_norm": 0.5368157029151917, + "learning_rate": 0.00039478412132032615, + "loss": 1.0989, + "step": 3401 + }, + { + "epoch": 0.6069039336366069, + "grad_norm": 0.5177137851715088, + "learning_rate": 0.0003947269987205261, + "loss": 0.9014, + "step": 3402 + }, + { + "epoch": 0.6070823298546071, + "grad_norm": 0.6155733466148376, + "learning_rate": 0.00039466986475392987, + "loss": 1.1821, + "step": 3403 + }, + { + "epoch": 0.6072607260726073, + "grad_norm": 0.49641674757003784, + "learning_rate": 0.0003946127194250247, + "loss": 1.0431, + "step": 3404 + }, + { + "epoch": 0.6074391222906075, + "grad_norm": 0.48299992084503174, + "learning_rate": 0.00039455556273829877, + "loss": 0.8856, + "step": 3405 + }, + { + "epoch": 0.6076175185086076, + "grad_norm": 0.49870428442955017, + "learning_rate": 0.0003944983946982412, + "loss": 0.9731, + "step": 3406 + }, + { + "epoch": 0.6077959147266078, + "grad_norm": 0.47452113032341003, + "learning_rate": 0.00039444121530934185, + "loss": 0.7636, + "step": 3407 + }, + { + "epoch": 0.607974310944608, + "grad_norm": 0.5165370106697083, + "learning_rate": 0.0003943840245760916, + "loss": 1.1063, + "step": 3408 + }, + { + "epoch": 0.6081527071626082, + "grad_norm": 0.5465608835220337, + "learning_rate": 0.00039432682250298225, + "loss": 1.1446, + "step": 3409 + }, + { + "epoch": 0.6083311033806084, + "grad_norm": 0.4796379804611206, + "learning_rate": 0.00039426960909450627, + "loss": 0.9792, + "step": 3410 + }, + { + "epoch": 0.6085094995986086, + "grad_norm": 0.48459067940711975, + "learning_rate": 0.00039421238435515736, + "loss": 1.1648, + "step": 3411 + }, + { + "epoch": 0.6086878958166086, + "grad_norm": 0.5157718062400818, + "learning_rate": 0.00039415514828942976, + "loss": 1.0494, + "step": 3412 + }, + { + "epoch": 0.6088662920346088, + "grad_norm": 0.4796636998653412, + "learning_rate": 0.00039409790090181896, + "loss": 0.8812, + "step": 3413 + }, + { + "epoch": 0.609044688252609, + "grad_norm": 0.5595741868019104, + "learning_rate": 0.000394040642196821, + "loss": 0.8946, + "step": 3414 + }, + { + "epoch": 0.6092230844706092, + "grad_norm": 0.5820662975311279, + "learning_rate": 0.00039398337217893295, + "loss": 1.1571, + "step": 3415 + }, + { + "epoch": 0.6094014806886094, + "grad_norm": 0.4832160770893097, + "learning_rate": 0.0003939260908526528, + "loss": 0.8372, + "step": 3416 + }, + { + "epoch": 0.6095798769066095, + "grad_norm": 0.4839080274105072, + "learning_rate": 0.00039386879822247945, + "loss": 0.8548, + "step": 3417 + }, + { + "epoch": 0.6097582731246097, + "grad_norm": 0.45255133509635925, + "learning_rate": 0.00039381149429291263, + "loss": 0.9553, + "step": 3418 + }, + { + "epoch": 0.6099366693426099, + "grad_norm": 0.4793449342250824, + "learning_rate": 0.00039375417906845284, + "loss": 0.965, + "step": 3419 + }, + { + "epoch": 0.6101150655606101, + "grad_norm": 0.4889264404773712, + "learning_rate": 0.0003936968525536018, + "loss": 1.0507, + "step": 3420 + }, + { + "epoch": 0.6102934617786103, + "grad_norm": 0.5014956593513489, + "learning_rate": 0.00039363951475286164, + "loss": 0.9827, + "step": 3421 + }, + { + "epoch": 0.6104718579966105, + "grad_norm": 0.9582971334457397, + "learning_rate": 0.0003935821656707359, + "loss": 1.0391, + "step": 3422 + }, + { + "epoch": 0.6106502542146106, + "grad_norm": 0.5389576554298401, + "learning_rate": 0.00039352480531172873, + "loss": 1.0455, + "step": 3423 + }, + { + "epoch": 0.6108286504326108, + "grad_norm": 0.4678865373134613, + "learning_rate": 0.000393467433680345, + "loss": 0.8876, + "step": 3424 + }, + { + "epoch": 0.611007046650611, + "grad_norm": 0.581387996673584, + "learning_rate": 0.00039341005078109083, + "loss": 1.0716, + "step": 3425 + }, + { + "epoch": 0.6111854428686112, + "grad_norm": 0.47930246591567993, + "learning_rate": 0.000393352656618473, + "loss": 0.7716, + "step": 3426 + }, + { + "epoch": 0.6113638390866114, + "grad_norm": 0.5663626790046692, + "learning_rate": 0.0003932952511969991, + "loss": 0.8454, + "step": 3427 + }, + { + "epoch": 0.6115422353046115, + "grad_norm": 0.5635462403297424, + "learning_rate": 0.0003932378345211779, + "loss": 0.9529, + "step": 3428 + }, + { + "epoch": 0.6117206315226117, + "grad_norm": 0.46473008394241333, + "learning_rate": 0.0003931804065955188, + "loss": 0.8475, + "step": 3429 + }, + { + "epoch": 0.6118990277406119, + "grad_norm": 0.5133330225944519, + "learning_rate": 0.00039312296742453223, + "loss": 0.896, + "step": 3430 + }, + { + "epoch": 0.6120774239586121, + "grad_norm": 0.5049217343330383, + "learning_rate": 0.0003930655170127294, + "loss": 1.0923, + "step": 3431 + }, + { + "epoch": 0.6122558201766123, + "grad_norm": 0.4742034375667572, + "learning_rate": 0.00039300805536462237, + "loss": 0.7796, + "step": 3432 + }, + { + "epoch": 0.6124342163946125, + "grad_norm": 0.5486430525779724, + "learning_rate": 0.0003929505824847243, + "loss": 0.92, + "step": 3433 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 0.514842689037323, + "learning_rate": 0.00039289309837754895, + "loss": 1.0119, + "step": 3434 + }, + { + "epoch": 0.6127910088306128, + "grad_norm": 0.4459700584411621, + "learning_rate": 0.000392835603047611, + "loss": 0.6496, + "step": 3435 + }, + { + "epoch": 0.612969405048613, + "grad_norm": 0.525751531124115, + "learning_rate": 0.00039277809649942644, + "loss": 0.8839, + "step": 3436 + }, + { + "epoch": 0.6131478012666132, + "grad_norm": 0.5619456171989441, + "learning_rate": 0.0003927205787375115, + "loss": 0.8713, + "step": 3437 + }, + { + "epoch": 0.6133261974846134, + "grad_norm": 0.4977574348449707, + "learning_rate": 0.0003926630497663839, + "loss": 0.9844, + "step": 3438 + }, + { + "epoch": 0.6135045937026135, + "grad_norm": 0.5254572033882141, + "learning_rate": 0.0003926055095905616, + "loss": 1.085, + "step": 3439 + }, + { + "epoch": 0.6136829899206137, + "grad_norm": 0.4378454089164734, + "learning_rate": 0.000392547958214564, + "loss": 0.7315, + "step": 3440 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.5086297392845154, + "learning_rate": 0.0003924903956429111, + "loss": 1.0405, + "step": 3441 + }, + { + "epoch": 0.614039782356614, + "grad_norm": 0.47228503227233887, + "learning_rate": 0.00039243282188012387, + "loss": 0.8687, + "step": 3442 + }, + { + "epoch": 0.6142181785746142, + "grad_norm": 0.46949103474617004, + "learning_rate": 0.0003923752369307241, + "loss": 0.7208, + "step": 3443 + }, + { + "epoch": 0.6143965747926144, + "grad_norm": 0.5485444664955139, + "learning_rate": 0.00039231764079923447, + "loss": 1.071, + "step": 3444 + }, + { + "epoch": 0.6145749710106145, + "grad_norm": 0.610989511013031, + "learning_rate": 0.0003922600334901786, + "loss": 0.989, + "step": 3445 + }, + { + "epoch": 0.6147533672286147, + "grad_norm": 0.48183125257492065, + "learning_rate": 0.0003922024150080808, + "loss": 0.9965, + "step": 3446 + }, + { + "epoch": 0.6149317634466149, + "grad_norm": 0.540808379650116, + "learning_rate": 0.00039214478535746665, + "loss": 1.0388, + "step": 3447 + }, + { + "epoch": 0.6151101596646151, + "grad_norm": 0.4990650415420532, + "learning_rate": 0.0003920871445428622, + "loss": 1.1055, + "step": 3448 + }, + { + "epoch": 0.6152885558826153, + "grad_norm": 0.5104119181632996, + "learning_rate": 0.00039202949256879463, + "loss": 0.9729, + "step": 3449 + }, + { + "epoch": 0.6154669521006154, + "grad_norm": 0.48018768429756165, + "learning_rate": 0.0003919718294397917, + "loss": 0.9862, + "step": 3450 + }, + { + "epoch": 0.6156453483186156, + "grad_norm": 0.48195213079452515, + "learning_rate": 0.0003919141551603824, + "loss": 1.0078, + "step": 3451 + }, + { + "epoch": 0.6158237445366158, + "grad_norm": 0.4913448095321655, + "learning_rate": 0.0003918564697350965, + "loss": 0.8028, + "step": 3452 + }, + { + "epoch": 0.616002140754616, + "grad_norm": 1.1352670192718506, + "learning_rate": 0.00039179877316846453, + "loss": 1.019, + "step": 3453 + }, + { + "epoch": 0.6161805369726162, + "grad_norm": 0.5746923685073853, + "learning_rate": 0.0003917410654650179, + "loss": 0.8682, + "step": 3454 + }, + { + "epoch": 0.6163589331906164, + "grad_norm": 0.7708411812782288, + "learning_rate": 0.00039168334662928895, + "loss": 1.3231, + "step": 3455 + }, + { + "epoch": 0.6165373294086165, + "grad_norm": 0.794039249420166, + "learning_rate": 0.00039162561666581096, + "loss": 0.8659, + "step": 3456 + }, + { + "epoch": 0.6167157256266167, + "grad_norm": 0.5135617256164551, + "learning_rate": 0.000391567875579118, + "loss": 1.0136, + "step": 3457 + }, + { + "epoch": 0.6168941218446169, + "grad_norm": 0.47095632553100586, + "learning_rate": 0.00039151012337374495, + "loss": 0.8962, + "step": 3458 + }, + { + "epoch": 0.6170725180626171, + "grad_norm": 0.5985879302024841, + "learning_rate": 0.0003914523600542277, + "loss": 1.0213, + "step": 3459 + }, + { + "epoch": 0.6172509142806173, + "grad_norm": 0.6508655548095703, + "learning_rate": 0.000391394585625103, + "loss": 1.2593, + "step": 3460 + }, + { + "epoch": 0.6174293104986174, + "grad_norm": 0.5690905451774597, + "learning_rate": 0.00039133680009090845, + "loss": 0.9674, + "step": 3461 + }, + { + "epoch": 0.6176077067166176, + "grad_norm": 0.610589325428009, + "learning_rate": 0.0003912790034561824, + "loss": 0.8924, + "step": 3462 + }, + { + "epoch": 0.6177861029346178, + "grad_norm": 0.7118551135063171, + "learning_rate": 0.00039122119572546424, + "loss": 1.1188, + "step": 3463 + }, + { + "epoch": 0.617964499152618, + "grad_norm": 0.4738007187843323, + "learning_rate": 0.0003911633769032941, + "loss": 0.8131, + "step": 3464 + }, + { + "epoch": 0.6181428953706182, + "grad_norm": 0.5291326642036438, + "learning_rate": 0.000391105546994213, + "loss": 0.8803, + "step": 3465 + }, + { + "epoch": 0.6183212915886184, + "grad_norm": 17.467649459838867, + "learning_rate": 0.0003910477060027631, + "loss": 1.0413, + "step": 3466 + }, + { + "epoch": 0.6184996878066185, + "grad_norm": 0.5017330646514893, + "learning_rate": 0.00039098985393348697, + "loss": 0.7768, + "step": 3467 + }, + { + "epoch": 0.6186780840246187, + "grad_norm": 0.49945586919784546, + "learning_rate": 0.00039093199079092843, + "loss": 0.8372, + "step": 3468 + }, + { + "epoch": 0.6188564802426189, + "grad_norm": 0.5586909651756287, + "learning_rate": 0.000390874116579632, + "loss": 0.8797, + "step": 3469 + }, + { + "epoch": 0.619034876460619, + "grad_norm": 0.5252298712730408, + "learning_rate": 0.0003908162313041431, + "loss": 0.917, + "step": 3470 + }, + { + "epoch": 0.6192132726786193, + "grad_norm": 0.48841506242752075, + "learning_rate": 0.00039075833496900794, + "loss": 0.8785, + "step": 3471 + }, + { + "epoch": 0.6193916688966193, + "grad_norm": 0.4840584099292755, + "learning_rate": 0.0003907004275787737, + "loss": 0.9556, + "step": 3472 + }, + { + "epoch": 0.6195700651146195, + "grad_norm": 0.47886261343955994, + "learning_rate": 0.0003906425091379885, + "loss": 1.1327, + "step": 3473 + }, + { + "epoch": 0.6197484613326197, + "grad_norm": 0.51800936460495, + "learning_rate": 0.0003905845796512011, + "loss": 0.8071, + "step": 3474 + }, + { + "epoch": 0.6199268575506199, + "grad_norm": 0.5068585872650146, + "learning_rate": 0.00039052663912296135, + "loss": 0.959, + "step": 3475 + }, + { + "epoch": 0.6201052537686201, + "grad_norm": 0.4845023453235626, + "learning_rate": 0.00039046868755781986, + "loss": 0.8652, + "step": 3476 + }, + { + "epoch": 0.6202836499866203, + "grad_norm": 0.4877776503562927, + "learning_rate": 0.00039041072496032804, + "loss": 1.0887, + "step": 3477 + }, + { + "epoch": 0.6204620462046204, + "grad_norm": 0.4742729663848877, + "learning_rate": 0.0003903527513350383, + "loss": 1.0202, + "step": 3478 + }, + { + "epoch": 0.6206404424226206, + "grad_norm": 0.4939587712287903, + "learning_rate": 0.0003902947666865039, + "loss": 0.8183, + "step": 3479 + }, + { + "epoch": 0.6208188386406208, + "grad_norm": 0.4751134216785431, + "learning_rate": 0.0003902367710192789, + "loss": 1.1003, + "step": 3480 + }, + { + "epoch": 0.620997234858621, + "grad_norm": 0.45981040596961975, + "learning_rate": 0.00039017876433791824, + "loss": 0.8677, + "step": 3481 + }, + { + "epoch": 0.6211756310766212, + "grad_norm": 0.5187476873397827, + "learning_rate": 0.00039012074664697774, + "loss": 0.8187, + "step": 3482 + }, + { + "epoch": 0.6213540272946213, + "grad_norm": 0.4452672600746155, + "learning_rate": 0.0003900627179510141, + "loss": 0.8628, + "step": 3483 + }, + { + "epoch": 0.6215324235126215, + "grad_norm": 0.5037822127342224, + "learning_rate": 0.0003900046782545849, + "loss": 0.9356, + "step": 3484 + }, + { + "epoch": 0.6217108197306217, + "grad_norm": 0.4531655013561249, + "learning_rate": 0.00038994662756224843, + "loss": 0.8167, + "step": 3485 + }, + { + "epoch": 0.6218892159486219, + "grad_norm": 0.5035321116447449, + "learning_rate": 0.00038988856587856413, + "loss": 0.8075, + "step": 3486 + }, + { + "epoch": 0.6220676121666221, + "grad_norm": 0.4611433148384094, + "learning_rate": 0.00038983049320809207, + "loss": 1.1105, + "step": 3487 + }, + { + "epoch": 0.6222460083846223, + "grad_norm": 0.48017269372940063, + "learning_rate": 0.00038977240955539316, + "loss": 0.8857, + "step": 3488 + }, + { + "epoch": 0.6224244046026224, + "grad_norm": 0.5595654845237732, + "learning_rate": 0.0003897143149250295, + "loss": 0.8854, + "step": 3489 + }, + { + "epoch": 0.6226028008206226, + "grad_norm": 0.5068066716194153, + "learning_rate": 0.00038965620932156355, + "loss": 0.9231, + "step": 3490 + }, + { + "epoch": 0.6227811970386228, + "grad_norm": 0.5294225215911865, + "learning_rate": 0.00038959809274955907, + "loss": 0.9997, + "step": 3491 + }, + { + "epoch": 0.622959593256623, + "grad_norm": 0.5624359846115112, + "learning_rate": 0.0003895399652135805, + "loss": 0.9754, + "step": 3492 + }, + { + "epoch": 0.6231379894746232, + "grad_norm": 0.4356614053249359, + "learning_rate": 0.00038948182671819304, + "loss": 0.7682, + "step": 3493 + }, + { + "epoch": 0.6233163856926233, + "grad_norm": 0.5377020239830017, + "learning_rate": 0.00038942367726796297, + "loss": 1.2892, + "step": 3494 + }, + { + "epoch": 0.6234947819106235, + "grad_norm": 0.5128370523452759, + "learning_rate": 0.0003893655168674572, + "loss": 0.8542, + "step": 3495 + }, + { + "epoch": 0.6236731781286237, + "grad_norm": 0.5239914059638977, + "learning_rate": 0.0003893073455212438, + "loss": 0.922, + "step": 3496 + }, + { + "epoch": 0.6238515743466239, + "grad_norm": 0.5009093284606934, + "learning_rate": 0.00038924916323389145, + "loss": 1.2072, + "step": 3497 + }, + { + "epoch": 0.6240299705646241, + "grad_norm": 0.5097877383232117, + "learning_rate": 0.00038919097000996973, + "loss": 1.0078, + "step": 3498 + }, + { + "epoch": 0.6242083667826243, + "grad_norm": 0.5235947370529175, + "learning_rate": 0.0003891327658540491, + "loss": 1.0455, + "step": 3499 + }, + { + "epoch": 0.6243867630006243, + "grad_norm": 0.4533289670944214, + "learning_rate": 0.00038907455077070085, + "loss": 0.7731, + "step": 3500 + }, + { + "epoch": 0.6245651592186245, + "grad_norm": 0.5486941337585449, + "learning_rate": 0.0003890163247644973, + "loss": 0.9088, + "step": 3501 + }, + { + "epoch": 0.6247435554366247, + "grad_norm": 0.5437402129173279, + "learning_rate": 0.0003889580878400115, + "loss": 0.9288, + "step": 3502 + }, + { + "epoch": 0.6249219516546249, + "grad_norm": 0.49082422256469727, + "learning_rate": 0.00038889984000181724, + "loss": 0.8177, + "step": 3503 + }, + { + "epoch": 0.6251003478726251, + "grad_norm": 0.7093980312347412, + "learning_rate": 0.0003888415812544892, + "loss": 1.1146, + "step": 3504 + }, + { + "epoch": 0.6252787440906252, + "grad_norm": 0.45868930220603943, + "learning_rate": 0.00038878331160260317, + "loss": 0.9161, + "step": 3505 + }, + { + "epoch": 0.6254571403086254, + "grad_norm": 0.527919590473175, + "learning_rate": 0.00038872503105073563, + "loss": 1.0866, + "step": 3506 + }, + { + "epoch": 0.6256355365266256, + "grad_norm": 0.5056113600730896, + "learning_rate": 0.0003886667396034638, + "loss": 0.8475, + "step": 3507 + }, + { + "epoch": 0.6258139327446258, + "grad_norm": 0.46975040435791016, + "learning_rate": 0.00038860843726536593, + "loss": 0.8076, + "step": 3508 + }, + { + "epoch": 0.625992328962626, + "grad_norm": 0.5507307052612305, + "learning_rate": 0.00038855012404102104, + "loss": 0.9643, + "step": 3509 + }, + { + "epoch": 0.6261707251806262, + "grad_norm": 0.6463258862495422, + "learning_rate": 0.00038849179993500905, + "loss": 0.9585, + "step": 3510 + }, + { + "epoch": 0.6263491213986263, + "grad_norm": 0.5084063410758972, + "learning_rate": 0.0003884334649519106, + "loss": 1.1892, + "step": 3511 + }, + { + "epoch": 0.6265275176166265, + "grad_norm": 0.5068933367729187, + "learning_rate": 0.0003883751190963075, + "loss": 0.8802, + "step": 3512 + }, + { + "epoch": 0.6267059138346267, + "grad_norm": 0.5390312075614929, + "learning_rate": 0.0003883167623727821, + "loss": 1.0255, + "step": 3513 + }, + { + "epoch": 0.6268843100526269, + "grad_norm": 0.5518016815185547, + "learning_rate": 0.0003882583947859176, + "loss": 1.0363, + "step": 3514 + }, + { + "epoch": 0.6270627062706271, + "grad_norm": 0.435151606798172, + "learning_rate": 0.0003882000163402983, + "loss": 0.8027, + "step": 3515 + }, + { + "epoch": 0.6272411024886272, + "grad_norm": 0.4693804979324341, + "learning_rate": 0.00038814162704050925, + "loss": 0.7981, + "step": 3516 + }, + { + "epoch": 0.6274194987066274, + "grad_norm": 0.44543004035949707, + "learning_rate": 0.0003880832268911363, + "loss": 0.7381, + "step": 3517 + }, + { + "epoch": 0.6275978949246276, + "grad_norm": 0.47917303442955017, + "learning_rate": 0.00038802481589676605, + "loss": 0.8535, + "step": 3518 + }, + { + "epoch": 0.6277762911426278, + "grad_norm": 0.5061355233192444, + "learning_rate": 0.0003879663940619861, + "loss": 0.9866, + "step": 3519 + }, + { + "epoch": 0.627954687360628, + "grad_norm": 0.5138995051383972, + "learning_rate": 0.00038790796139138506, + "loss": 0.9151, + "step": 3520 + }, + { + "epoch": 0.6281330835786282, + "grad_norm": 0.5253726840019226, + "learning_rate": 0.000387849517889552, + "loss": 1.0164, + "step": 3521 + }, + { + "epoch": 0.6283114797966283, + "grad_norm": 0.4949781000614166, + "learning_rate": 0.00038779106356107715, + "loss": 1.0074, + "step": 3522 + }, + { + "epoch": 0.6284898760146285, + "grad_norm": 0.49461349844932556, + "learning_rate": 0.0003877325984105514, + "loss": 0.9045, + "step": 3523 + }, + { + "epoch": 0.6286682722326287, + "grad_norm": 0.4767039716243744, + "learning_rate": 0.00038767412244256673, + "loss": 0.7782, + "step": 3524 + }, + { + "epoch": 0.6288466684506289, + "grad_norm": 0.5934154391288757, + "learning_rate": 0.00038761563566171576, + "loss": 1.2435, + "step": 3525 + }, + { + "epoch": 0.6290250646686291, + "grad_norm": 0.5259369611740112, + "learning_rate": 0.00038755713807259184, + "loss": 0.9742, + "step": 3526 + }, + { + "epoch": 0.6292034608866292, + "grad_norm": 0.44396549463272095, + "learning_rate": 0.0003874986296797896, + "loss": 0.7857, + "step": 3527 + }, + { + "epoch": 0.6293818571046293, + "grad_norm": 0.5298401713371277, + "learning_rate": 0.0003874401104879041, + "loss": 0.9236, + "step": 3528 + }, + { + "epoch": 0.6295602533226295, + "grad_norm": 0.5219202637672424, + "learning_rate": 0.00038738158050153157, + "loss": 1.0415, + "step": 3529 + }, + { + "epoch": 0.6297386495406297, + "grad_norm": 0.5267150402069092, + "learning_rate": 0.0003873230397252687, + "loss": 0.9082, + "step": 3530 + }, + { + "epoch": 0.62991704575863, + "grad_norm": 0.5192800164222717, + "learning_rate": 0.0003872644881637135, + "loss": 0.8653, + "step": 3531 + }, + { + "epoch": 0.6300954419766301, + "grad_norm": 0.5463857054710388, + "learning_rate": 0.0003872059258214644, + "loss": 0.9292, + "step": 3532 + }, + { + "epoch": 0.6302738381946302, + "grad_norm": 0.659090518951416, + "learning_rate": 0.00038714735270312095, + "loss": 0.8808, + "step": 3533 + }, + { + "epoch": 0.6304522344126304, + "grad_norm": 0.5042756795883179, + "learning_rate": 0.0003870887688132834, + "loss": 0.8481, + "step": 3534 + }, + { + "epoch": 0.6306306306306306, + "grad_norm": 0.4895661771297455, + "learning_rate": 0.00038703017415655296, + "loss": 0.9983, + "step": 3535 + }, + { + "epoch": 0.6308090268486308, + "grad_norm": 0.46938470005989075, + "learning_rate": 0.00038697156873753163, + "loss": 0.9572, + "step": 3536 + }, + { + "epoch": 0.630987423066631, + "grad_norm": 0.4477916359901428, + "learning_rate": 0.00038691295256082227, + "loss": 0.798, + "step": 3537 + }, + { + "epoch": 0.6311658192846311, + "grad_norm": 0.5256043076515198, + "learning_rate": 0.0003868543256310284, + "loss": 0.9967, + "step": 3538 + }, + { + "epoch": 0.6313442155026313, + "grad_norm": 0.5086610913276672, + "learning_rate": 0.0003867956879527548, + "loss": 1.1852, + "step": 3539 + }, + { + "epoch": 0.6315226117206315, + "grad_norm": 0.47662028670310974, + "learning_rate": 0.00038673703953060677, + "loss": 0.9776, + "step": 3540 + }, + { + "epoch": 0.6317010079386317, + "grad_norm": 0.461160272359848, + "learning_rate": 0.00038667838036919046, + "loss": 0.8206, + "step": 3541 + }, + { + "epoch": 0.6318794041566319, + "grad_norm": 0.5197997093200684, + "learning_rate": 0.0003866197104731129, + "loss": 0.9646, + "step": 3542 + }, + { + "epoch": 0.6320578003746321, + "grad_norm": 0.48244959115982056, + "learning_rate": 0.0003865610298469821, + "loss": 0.7589, + "step": 3543 + }, + { + "epoch": 0.6322361965926322, + "grad_norm": 0.4731544852256775, + "learning_rate": 0.00038650233849540683, + "loss": 0.9059, + "step": 3544 + }, + { + "epoch": 0.6324145928106324, + "grad_norm": 0.520389199256897, + "learning_rate": 0.00038644363642299665, + "loss": 1.0436, + "step": 3545 + }, + { + "epoch": 0.6325929890286326, + "grad_norm": 0.49500808119773865, + "learning_rate": 0.00038638492363436195, + "loss": 0.9817, + "step": 3546 + }, + { + "epoch": 0.6327713852466328, + "grad_norm": 0.4835432469844818, + "learning_rate": 0.000386326200134114, + "loss": 0.9696, + "step": 3547 + }, + { + "epoch": 0.632949781464633, + "grad_norm": 0.45955875515937805, + "learning_rate": 0.000386267465926865, + "loss": 0.9491, + "step": 3548 + }, + { + "epoch": 0.6331281776826331, + "grad_norm": 0.5106682777404785, + "learning_rate": 0.00038620872101722783, + "loss": 0.9027, + "step": 3549 + }, + { + "epoch": 0.6333065739006333, + "grad_norm": 0.5530784130096436, + "learning_rate": 0.0003861499654098164, + "loss": 1.1835, + "step": 3550 + }, + { + "epoch": 0.6334849701186335, + "grad_norm": 0.45463332533836365, + "learning_rate": 0.0003860911991092452, + "loss": 0.8381, + "step": 3551 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.5137251615524292, + "learning_rate": 0.0003860324221201298, + "loss": 0.9746, + "step": 3552 + }, + { + "epoch": 0.6338417625546339, + "grad_norm": 0.4892425835132599, + "learning_rate": 0.00038597363444708657, + "loss": 1.0123, + "step": 3553 + }, + { + "epoch": 0.6340201587726341, + "grad_norm": 0.482229083776474, + "learning_rate": 0.00038591483609473257, + "loss": 0.796, + "step": 3554 + }, + { + "epoch": 0.6341985549906342, + "grad_norm": 0.44259580969810486, + "learning_rate": 0.0003858560270676858, + "loss": 1.0132, + "step": 3555 + }, + { + "epoch": 0.6343769512086344, + "grad_norm": 0.5081446170806885, + "learning_rate": 0.00038579720737056517, + "loss": 0.9065, + "step": 3556 + }, + { + "epoch": 0.6345553474266346, + "grad_norm": 0.4761013090610504, + "learning_rate": 0.0003857383770079902, + "loss": 0.9184, + "step": 3557 + }, + { + "epoch": 0.6347337436446348, + "grad_norm": 0.5290868282318115, + "learning_rate": 0.00038567953598458163, + "loss": 0.9764, + "step": 3558 + }, + { + "epoch": 0.634912139862635, + "grad_norm": 0.5481404066085815, + "learning_rate": 0.00038562068430496066, + "loss": 0.9443, + "step": 3559 + }, + { + "epoch": 0.635090536080635, + "grad_norm": 0.48112472891807556, + "learning_rate": 0.00038556182197374957, + "loss": 1.0576, + "step": 3560 + }, + { + "epoch": 0.6352689322986352, + "grad_norm": 0.46322664618492126, + "learning_rate": 0.0003855029489955713, + "loss": 0.8526, + "step": 3561 + }, + { + "epoch": 0.6354473285166354, + "grad_norm": 0.501327633857727, + "learning_rate": 0.0003854440653750496, + "loss": 1.0425, + "step": 3562 + }, + { + "epoch": 0.6356257247346356, + "grad_norm": 0.9595483541488647, + "learning_rate": 0.0003853851711168094, + "loss": 0.73, + "step": 3563 + }, + { + "epoch": 0.6358041209526358, + "grad_norm": 0.44412168860435486, + "learning_rate": 0.00038532626622547614, + "loss": 0.8409, + "step": 3564 + }, + { + "epoch": 0.635982517170636, + "grad_norm": 0.5374020338058472, + "learning_rate": 0.0003852673507056761, + "loss": 1.0608, + "step": 3565 + }, + { + "epoch": 0.6361609133886361, + "grad_norm": 0.5410022735595703, + "learning_rate": 0.0003852084245620365, + "loss": 1.0212, + "step": 3566 + }, + { + "epoch": 0.6363393096066363, + "grad_norm": 0.46647214889526367, + "learning_rate": 0.0003851494877991856, + "loss": 0.9998, + "step": 3567 + }, + { + "epoch": 0.6365177058246365, + "grad_norm": 0.49088791012763977, + "learning_rate": 0.0003850905404217519, + "loss": 1.0369, + "step": 3568 + }, + { + "epoch": 0.6366961020426367, + "grad_norm": 0.5109038352966309, + "learning_rate": 0.00038503158243436537, + "loss": 1.1344, + "step": 3569 + }, + { + "epoch": 0.6368744982606369, + "grad_norm": 0.5097025036811829, + "learning_rate": 0.0003849726138416565, + "loss": 0.9358, + "step": 3570 + }, + { + "epoch": 0.637052894478637, + "grad_norm": 0.4957421123981476, + "learning_rate": 0.00038491363464825655, + "loss": 1.1196, + "step": 3571 + }, + { + "epoch": 0.6372312906966372, + "grad_norm": 0.5967016220092773, + "learning_rate": 0.00038485464485879783, + "loss": 1.1693, + "step": 3572 + }, + { + "epoch": 0.6374096869146374, + "grad_norm": 0.4782984256744385, + "learning_rate": 0.0003847956444779133, + "loss": 1.0175, + "step": 3573 + }, + { + "epoch": 0.6375880831326376, + "grad_norm": 0.4927447438240051, + "learning_rate": 0.0003847366335102369, + "loss": 1.0854, + "step": 3574 + }, + { + "epoch": 0.6377664793506378, + "grad_norm": 0.4766659140586853, + "learning_rate": 0.0003846776119604033, + "loss": 0.8279, + "step": 3575 + }, + { + "epoch": 0.637944875568638, + "grad_norm": 0.46618908643722534, + "learning_rate": 0.000384618579833048, + "loss": 0.8925, + "step": 3576 + }, + { + "epoch": 0.6381232717866381, + "grad_norm": 0.48249977827072144, + "learning_rate": 0.0003845595371328074, + "loss": 0.7996, + "step": 3577 + }, + { + "epoch": 0.6383016680046383, + "grad_norm": 0.463788777589798, + "learning_rate": 0.0003845004838643186, + "loss": 0.8095, + "step": 3578 + }, + { + "epoch": 0.6384800642226385, + "grad_norm": 0.5117577910423279, + "learning_rate": 0.0003844414200322197, + "loss": 0.939, + "step": 3579 + }, + { + "epoch": 0.6386584604406387, + "grad_norm": 1.0510876178741455, + "learning_rate": 0.0003843823456411495, + "loss": 0.9856, + "step": 3580 + }, + { + "epoch": 0.6388368566586389, + "grad_norm": 0.4879765808582306, + "learning_rate": 0.00038432326069574776, + "loss": 0.8623, + "step": 3581 + }, + { + "epoch": 0.639015252876639, + "grad_norm": 0.5112807154655457, + "learning_rate": 0.0003842641652006549, + "loss": 0.7826, + "step": 3582 + }, + { + "epoch": 0.6391936490946392, + "grad_norm": 0.49718165397644043, + "learning_rate": 0.0003842050591605122, + "loss": 0.9188, + "step": 3583 + }, + { + "epoch": 0.6393720453126394, + "grad_norm": 0.4578002989292145, + "learning_rate": 0.00038414594257996207, + "loss": 0.8357, + "step": 3584 + }, + { + "epoch": 0.6395504415306396, + "grad_norm": 0.5022866725921631, + "learning_rate": 0.0003840868154636472, + "loss": 0.9341, + "step": 3585 + }, + { + "epoch": 0.6397288377486398, + "grad_norm": 0.5059497356414795, + "learning_rate": 0.00038402767781621163, + "loss": 0.9208, + "step": 3586 + }, + { + "epoch": 0.63990723396664, + "grad_norm": 0.46369996666908264, + "learning_rate": 0.0003839685296422999, + "loss": 1.0211, + "step": 3587 + }, + { + "epoch": 0.64008563018464, + "grad_norm": 0.46570396423339844, + "learning_rate": 0.0003839093709465574, + "loss": 0.7812, + "step": 3588 + }, + { + "epoch": 0.6402640264026402, + "grad_norm": 0.5327849388122559, + "learning_rate": 0.00038385020173363065, + "loss": 0.868, + "step": 3589 + }, + { + "epoch": 0.6404424226206404, + "grad_norm": 0.5208088755607605, + "learning_rate": 0.0003837910220081667, + "loss": 1.0375, + "step": 3590 + }, + { + "epoch": 0.6406208188386406, + "grad_norm": 0.6217350363731384, + "learning_rate": 0.00038373183177481336, + "loss": 0.9972, + "step": 3591 + }, + { + "epoch": 0.6407992150566408, + "grad_norm": 0.5081048607826233, + "learning_rate": 0.00038367263103821956, + "loss": 0.9218, + "step": 3592 + }, + { + "epoch": 0.6409776112746409, + "grad_norm": 0.540647566318512, + "learning_rate": 0.00038361341980303477, + "loss": 0.838, + "step": 3593 + }, + { + "epoch": 0.6411560074926411, + "grad_norm": 0.4405638873577118, + "learning_rate": 0.0003835541980739096, + "loss": 0.9328, + "step": 3594 + }, + { + "epoch": 0.6413344037106413, + "grad_norm": 0.501441240310669, + "learning_rate": 0.00038349496585549504, + "loss": 0.9606, + "step": 3595 + }, + { + "epoch": 0.6415127999286415, + "grad_norm": 0.6081332564353943, + "learning_rate": 0.00038343572315244337, + "loss": 1.032, + "step": 3596 + }, + { + "epoch": 0.6416911961466417, + "grad_norm": 0.5091875791549683, + "learning_rate": 0.00038337646996940746, + "loss": 0.7836, + "step": 3597 + }, + { + "epoch": 0.6418695923646419, + "grad_norm": 0.48131605982780457, + "learning_rate": 0.00038331720631104094, + "loss": 0.8946, + "step": 3598 + }, + { + "epoch": 0.642047988582642, + "grad_norm": 1.0239105224609375, + "learning_rate": 0.00038325793218199844, + "loss": 0.7923, + "step": 3599 + }, + { + "epoch": 0.6422263848006422, + "grad_norm": 0.5322194695472717, + "learning_rate": 0.00038319864758693537, + "loss": 1.0488, + "step": 3600 + }, + { + "epoch": 0.6424047810186424, + "grad_norm": 0.47548213601112366, + "learning_rate": 0.00038313935253050767, + "loss": 1.0053, + "step": 3601 + }, + { + "epoch": 0.6425831772366426, + "grad_norm": 0.5237056016921997, + "learning_rate": 0.00038308004701737263, + "loss": 1.1428, + "step": 3602 + }, + { + "epoch": 0.6427615734546428, + "grad_norm": 0.6333736181259155, + "learning_rate": 0.00038302073105218794, + "loss": 1.0623, + "step": 3603 + }, + { + "epoch": 0.6429399696726429, + "grad_norm": 0.49249467253685, + "learning_rate": 0.00038296140463961226, + "loss": 1.0911, + "step": 3604 + }, + { + "epoch": 0.6431183658906431, + "grad_norm": 0.479319304227829, + "learning_rate": 0.00038290206778430515, + "loss": 0.8087, + "step": 3605 + }, + { + "epoch": 0.6432967621086433, + "grad_norm": 0.5194647312164307, + "learning_rate": 0.00038284272049092673, + "loss": 0.8521, + "step": 3606 + }, + { + "epoch": 0.6434751583266435, + "grad_norm": 0.47771453857421875, + "learning_rate": 0.00038278336276413827, + "loss": 1.0094, + "step": 3607 + }, + { + "epoch": 0.6436535545446437, + "grad_norm": 1.2369436025619507, + "learning_rate": 0.00038272399460860166, + "loss": 1.007, + "step": 3608 + }, + { + "epoch": 0.6438319507626439, + "grad_norm": 0.4876594841480255, + "learning_rate": 0.00038266461602897957, + "loss": 1.0556, + "step": 3609 + }, + { + "epoch": 0.644010346980644, + "grad_norm": 0.5744562149047852, + "learning_rate": 0.0003826052270299356, + "loss": 1.0985, + "step": 3610 + }, + { + "epoch": 0.6441887431986442, + "grad_norm": 0.4735945165157318, + "learning_rate": 0.00038254582761613424, + "loss": 1.0116, + "step": 3611 + }, + { + "epoch": 0.6443671394166444, + "grad_norm": 0.5286600589752197, + "learning_rate": 0.0003824864177922406, + "loss": 1.1372, + "step": 3612 + }, + { + "epoch": 0.6445455356346446, + "grad_norm": 0.5075108408927917, + "learning_rate": 0.0003824269975629207, + "loss": 1.0383, + "step": 3613 + }, + { + "epoch": 0.6447239318526448, + "grad_norm": 0.5046312212944031, + "learning_rate": 0.00038236756693284143, + "loss": 1.0341, + "step": 3614 + }, + { + "epoch": 0.6449023280706448, + "grad_norm": 0.49919357895851135, + "learning_rate": 0.00038230812590667044, + "loss": 1.0578, + "step": 3615 + }, + { + "epoch": 0.645080724288645, + "grad_norm": 0.4575040340423584, + "learning_rate": 0.0003822486744890761, + "loss": 0.7656, + "step": 3616 + }, + { + "epoch": 0.6452591205066452, + "grad_norm": 0.4908180236816406, + "learning_rate": 0.00038218921268472786, + "loss": 0.7803, + "step": 3617 + }, + { + "epoch": 0.6454375167246454, + "grad_norm": 0.4667602479457855, + "learning_rate": 0.00038212974049829564, + "loss": 0.9217, + "step": 3618 + }, + { + "epoch": 0.6456159129426456, + "grad_norm": 0.49208131432533264, + "learning_rate": 0.00038207025793445047, + "loss": 1.3218, + "step": 3619 + }, + { + "epoch": 0.6457943091606458, + "grad_norm": 0.47899407148361206, + "learning_rate": 0.0003820107649978641, + "loss": 0.9374, + "step": 3620 + }, + { + "epoch": 0.6459727053786459, + "grad_norm": 0.5173959136009216, + "learning_rate": 0.00038195126169320915, + "loss": 1.1439, + "step": 3621 + }, + { + "epoch": 0.6461511015966461, + "grad_norm": 1.7555327415466309, + "learning_rate": 0.00038189174802515883, + "loss": 1.1495, + "step": 3622 + }, + { + "epoch": 0.6463294978146463, + "grad_norm": 0.5820402503013611, + "learning_rate": 0.0003818322239983873, + "loss": 0.9644, + "step": 3623 + }, + { + "epoch": 0.6465078940326465, + "grad_norm": 0.5061475038528442, + "learning_rate": 0.0003817726896175697, + "loss": 1.0867, + "step": 3624 + }, + { + "epoch": 0.6466862902506467, + "grad_norm": 0.5175919532775879, + "learning_rate": 0.00038171314488738176, + "loss": 0.9003, + "step": 3625 + }, + { + "epoch": 0.6468646864686468, + "grad_norm": 0.5276834964752197, + "learning_rate": 0.0003816535898125001, + "loss": 1.0232, + "step": 3626 + }, + { + "epoch": 0.647043082686647, + "grad_norm": 0.4682513177394867, + "learning_rate": 0.0003815940243976022, + "loss": 0.8202, + "step": 3627 + }, + { + "epoch": 0.6472214789046472, + "grad_norm": 0.5589001774787903, + "learning_rate": 0.00038153444864736616, + "loss": 0.9364, + "step": 3628 + }, + { + "epoch": 0.6473998751226474, + "grad_norm": 0.5058655738830566, + "learning_rate": 0.00038147486256647113, + "loss": 1.0972, + "step": 3629 + }, + { + "epoch": 0.6475782713406476, + "grad_norm": 0.44797369837760925, + "learning_rate": 0.0003814152661595971, + "loss": 0.8263, + "step": 3630 + }, + { + "epoch": 0.6477566675586478, + "grad_norm": 0.5152768492698669, + "learning_rate": 0.00038135565943142445, + "loss": 0.9512, + "step": 3631 + }, + { + "epoch": 0.6479350637766479, + "grad_norm": 0.4588833451271057, + "learning_rate": 0.00038129604238663494, + "loss": 0.8475, + "step": 3632 + }, + { + "epoch": 0.6481134599946481, + "grad_norm": 0.5147669911384583, + "learning_rate": 0.00038123641502991074, + "loss": 0.9934, + "step": 3633 + }, + { + "epoch": 0.6482918562126483, + "grad_norm": 0.46724918484687805, + "learning_rate": 0.0003811767773659349, + "loss": 0.7732, + "step": 3634 + }, + { + "epoch": 0.6484702524306485, + "grad_norm": 0.6038320064544678, + "learning_rate": 0.00038111712939939153, + "loss": 1.3013, + "step": 3635 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.4923225939273834, + "learning_rate": 0.0003810574711349652, + "loss": 1.205, + "step": 3636 + }, + { + "epoch": 0.6488270448666488, + "grad_norm": 0.4713652431964874, + "learning_rate": 0.0003809978025773415, + "loss": 0.8677, + "step": 3637 + }, + { + "epoch": 0.649005441084649, + "grad_norm": 0.4485718607902527, + "learning_rate": 0.00038093812373120675, + "loss": 0.7994, + "step": 3638 + }, + { + "epoch": 0.6491838373026492, + "grad_norm": 0.5080364942550659, + "learning_rate": 0.00038087843460124813, + "loss": 0.9032, + "step": 3639 + }, + { + "epoch": 0.6493622335206494, + "grad_norm": 0.5731015801429749, + "learning_rate": 0.0003808187351921535, + "loss": 0.8565, + "step": 3640 + }, + { + "epoch": 0.6495406297386496, + "grad_norm": 0.47458168864250183, + "learning_rate": 0.00038075902550861176, + "loss": 1.0262, + "step": 3641 + }, + { + "epoch": 0.6497190259566498, + "grad_norm": 1.2307530641555786, + "learning_rate": 0.0003806993055553124, + "loss": 0.8389, + "step": 3642 + }, + { + "epoch": 0.6498974221746499, + "grad_norm": 0.481864333152771, + "learning_rate": 0.00038063957533694594, + "loss": 0.9069, + "step": 3643 + }, + { + "epoch": 0.65007581839265, + "grad_norm": 0.46057215332984924, + "learning_rate": 0.0003805798348582034, + "loss": 0.7955, + "step": 3644 + }, + { + "epoch": 0.6502542146106502, + "grad_norm": 0.5290510058403015, + "learning_rate": 0.0003805200841237767, + "loss": 1.1641, + "step": 3645 + }, + { + "epoch": 0.6504326108286504, + "grad_norm": 0.4636685252189636, + "learning_rate": 0.0003804603231383589, + "loss": 0.8106, + "step": 3646 + }, + { + "epoch": 0.6506110070466506, + "grad_norm": 0.5197182893753052, + "learning_rate": 0.00038040055190664336, + "loss": 1.0978, + "step": 3647 + }, + { + "epoch": 0.6507894032646507, + "grad_norm": 0.5658067464828491, + "learning_rate": 0.0003803407704333246, + "loss": 1.0294, + "step": 3648 + }, + { + "epoch": 0.6509677994826509, + "grad_norm": 0.4995347261428833, + "learning_rate": 0.0003802809787230979, + "loss": 1.0974, + "step": 3649 + }, + { + "epoch": 0.6511461957006511, + "grad_norm": 0.5178309679031372, + "learning_rate": 0.00038022117678065915, + "loss": 1.0917, + "step": 3650 + }, + { + "epoch": 0.6513245919186513, + "grad_norm": 0.450339674949646, + "learning_rate": 0.0003801613646107052, + "loss": 0.7781, + "step": 3651 + }, + { + "epoch": 0.6515029881366515, + "grad_norm": 0.48041054606437683, + "learning_rate": 0.0003801015422179337, + "loss": 0.7998, + "step": 3652 + }, + { + "epoch": 0.6516813843546517, + "grad_norm": 0.5061138868331909, + "learning_rate": 0.00038004170960704306, + "loss": 0.9966, + "step": 3653 + }, + { + "epoch": 0.6518597805726518, + "grad_norm": 0.4858459234237671, + "learning_rate": 0.0003799818667827325, + "loss": 0.8406, + "step": 3654 + }, + { + "epoch": 0.652038176790652, + "grad_norm": 0.49688011407852173, + "learning_rate": 0.00037992201374970205, + "loss": 1.0493, + "step": 3655 + }, + { + "epoch": 0.6522165730086522, + "grad_norm": 0.47799569368362427, + "learning_rate": 0.0003798621505126526, + "loss": 0.8846, + "step": 3656 + }, + { + "epoch": 0.6523949692266524, + "grad_norm": 0.500594437122345, + "learning_rate": 0.0003798022770762857, + "loss": 1.0214, + "step": 3657 + }, + { + "epoch": 0.6525733654446526, + "grad_norm": 0.5715755224227905, + "learning_rate": 0.0003797423934453038, + "loss": 1.312, + "step": 3658 + }, + { + "epoch": 0.6527517616626527, + "grad_norm": 0.47365105152130127, + "learning_rate": 0.00037968249962441015, + "loss": 0.9041, + "step": 3659 + }, + { + "epoch": 0.6529301578806529, + "grad_norm": 0.5020959973335266, + "learning_rate": 0.00037962259561830883, + "loss": 0.9089, + "step": 3660 + }, + { + "epoch": 0.6531085540986531, + "grad_norm": 0.5108777284622192, + "learning_rate": 0.0003795626814317046, + "loss": 0.7617, + "step": 3661 + }, + { + "epoch": 0.6532869503166533, + "grad_norm": 0.6099095940589905, + "learning_rate": 0.0003795027570693032, + "loss": 1.2797, + "step": 3662 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.5367801189422607, + "learning_rate": 0.00037944282253581086, + "loss": 0.9654, + "step": 3663 + }, + { + "epoch": 0.6536437427526537, + "grad_norm": 0.4482106566429138, + "learning_rate": 0.000379382877835935, + "loss": 0.8033, + "step": 3664 + }, + { + "epoch": 0.6538221389706538, + "grad_norm": 0.48024243116378784, + "learning_rate": 0.0003793229229743836, + "loss": 0.7399, + "step": 3665 + }, + { + "epoch": 0.654000535188654, + "grad_norm": 0.49091434478759766, + "learning_rate": 0.00037926295795586546, + "loss": 0.9129, + "step": 3666 + }, + { + "epoch": 0.6541789314066542, + "grad_norm": 0.428303062915802, + "learning_rate": 0.00037920298278509027, + "loss": 0.8762, + "step": 3667 + }, + { + "epoch": 0.6543573276246544, + "grad_norm": 0.4690133035182953, + "learning_rate": 0.00037914299746676837, + "loss": 0.748, + "step": 3668 + }, + { + "epoch": 0.6545357238426546, + "grad_norm": 0.5279445052146912, + "learning_rate": 0.00037908300200561107, + "loss": 0.9056, + "step": 3669 + }, + { + "epoch": 0.6547141200606548, + "grad_norm": 0.5457646250724792, + "learning_rate": 0.0003790229964063303, + "loss": 0.8575, + "step": 3670 + }, + { + "epoch": 0.6548925162786549, + "grad_norm": 0.4674092233181, + "learning_rate": 0.00037896298067363897, + "loss": 0.7337, + "step": 3671 + }, + { + "epoch": 0.6550709124966551, + "grad_norm": 0.539322555065155, + "learning_rate": 0.00037890295481225056, + "loss": 1.1518, + "step": 3672 + }, + { + "epoch": 0.6552493087146553, + "grad_norm": 0.5358078479766846, + "learning_rate": 0.00037884291882687955, + "loss": 1.2239, + "step": 3673 + }, + { + "epoch": 0.6554277049326555, + "grad_norm": 0.4866465926170349, + "learning_rate": 0.0003787828727222412, + "loss": 1.1942, + "step": 3674 + }, + { + "epoch": 0.6556061011506557, + "grad_norm": 0.5002499222755432, + "learning_rate": 0.0003787228165030514, + "loss": 0.8127, + "step": 3675 + }, + { + "epoch": 0.6557844973686557, + "grad_norm": 0.5849499106407166, + "learning_rate": 0.00037866275017402694, + "loss": 0.9762, + "step": 3676 + }, + { + "epoch": 0.6559628935866559, + "grad_norm": 0.46588608622550964, + "learning_rate": 0.0003786026737398857, + "loss": 0.8933, + "step": 3677 + }, + { + "epoch": 0.6561412898046561, + "grad_norm": 0.48897257447242737, + "learning_rate": 0.0003785425872053455, + "loss": 0.8101, + "step": 3678 + }, + { + "epoch": 0.6563196860226563, + "grad_norm": 0.46128103137016296, + "learning_rate": 0.00037848249057512596, + "loss": 0.8682, + "step": 3679 + }, + { + "epoch": 0.6564980822406565, + "grad_norm": 0.5110675692558289, + "learning_rate": 0.00037842238385394684, + "loss": 0.8593, + "step": 3680 + }, + { + "epoch": 0.6566764784586567, + "grad_norm": 0.4921587407588959, + "learning_rate": 0.00037836226704652897, + "loss": 0.9287, + "step": 3681 + }, + { + "epoch": 0.6568548746766568, + "grad_norm": 0.5147567987442017, + "learning_rate": 0.00037830214015759393, + "loss": 0.8533, + "step": 3682 + }, + { + "epoch": 0.657033270894657, + "grad_norm": 0.4901014268398285, + "learning_rate": 0.0003782420031918641, + "loss": 0.9394, + "step": 3683 + }, + { + "epoch": 0.6572116671126572, + "grad_norm": 0.5078468918800354, + "learning_rate": 0.00037818185615406236, + "loss": 0.9224, + "step": 3684 + }, + { + "epoch": 0.6573900633306574, + "grad_norm": 0.5428446531295776, + "learning_rate": 0.0003781216990489129, + "loss": 1.1286, + "step": 3685 + }, + { + "epoch": 0.6575684595486576, + "grad_norm": 0.5101941227912903, + "learning_rate": 0.0003780615318811402, + "loss": 0.9386, + "step": 3686 + }, + { + "epoch": 0.6577468557666577, + "grad_norm": 0.5293329358100891, + "learning_rate": 0.00037800135465547, + "loss": 0.9849, + "step": 3687 + }, + { + "epoch": 0.6579252519846579, + "grad_norm": 0.47697338461875916, + "learning_rate": 0.00037794116737662847, + "loss": 0.8925, + "step": 3688 + }, + { + "epoch": 0.6581036482026581, + "grad_norm": 0.5124788284301758, + "learning_rate": 0.00037788097004934275, + "loss": 1.0044, + "step": 3689 + }, + { + "epoch": 0.6582820444206583, + "grad_norm": 0.4236263632774353, + "learning_rate": 0.00037782076267834063, + "loss": 0.8828, + "step": 3690 + }, + { + "epoch": 0.6584604406386585, + "grad_norm": 0.4617786109447479, + "learning_rate": 0.00037776054526835086, + "loss": 0.8969, + "step": 3691 + }, + { + "epoch": 0.6586388368566587, + "grad_norm": 0.596630871295929, + "learning_rate": 0.0003777003178241028, + "loss": 1.0182, + "step": 3692 + }, + { + "epoch": 0.6588172330746588, + "grad_norm": 0.4604688882827759, + "learning_rate": 0.00037764008035032676, + "loss": 0.9566, + "step": 3693 + }, + { + "epoch": 0.658995629292659, + "grad_norm": 0.8154518604278564, + "learning_rate": 0.00037757983285175367, + "loss": 0.8091, + "step": 3694 + }, + { + "epoch": 0.6591740255106592, + "grad_norm": 0.5884724259376526, + "learning_rate": 0.00037751957533311545, + "loss": 0.9988, + "step": 3695 + }, + { + "epoch": 0.6593524217286594, + "grad_norm": 0.43609222769737244, + "learning_rate": 0.0003774593077991447, + "loss": 0.8279, + "step": 3696 + }, + { + "epoch": 0.6595308179466596, + "grad_norm": 0.5332170128822327, + "learning_rate": 0.0003773990302545748, + "loss": 0.8307, + "step": 3697 + }, + { + "epoch": 0.6597092141646597, + "grad_norm": 0.4429665505886078, + "learning_rate": 0.0003773387427041398, + "loss": 0.9021, + "step": 3698 + }, + { + "epoch": 0.6598876103826599, + "grad_norm": 0.48425301909446716, + "learning_rate": 0.00037727844515257473, + "loss": 0.8488, + "step": 3699 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.45970770716667175, + "learning_rate": 0.00037721813760461544, + "loss": 0.9876, + "step": 3700 + }, + { + "epoch": 0.6602444028186603, + "grad_norm": 0.47570955753326416, + "learning_rate": 0.00037715782006499826, + "loss": 0.9802, + "step": 3701 + }, + { + "epoch": 0.6604227990366605, + "grad_norm": 0.5924715399742126, + "learning_rate": 0.0003770974925384607, + "loss": 0.9458, + "step": 3702 + }, + { + "epoch": 0.6606011952546607, + "grad_norm": 0.47865262627601624, + "learning_rate": 0.0003770371550297407, + "loss": 0.7191, + "step": 3703 + }, + { + "epoch": 0.6607795914726607, + "grad_norm": 0.6037271618843079, + "learning_rate": 0.00037697680754357726, + "loss": 1.1374, + "step": 3704 + }, + { + "epoch": 0.6609579876906609, + "grad_norm": 0.499804824590683, + "learning_rate": 0.00037691645008471, + "loss": 0.9418, + "step": 3705 + }, + { + "epoch": 0.6611363839086611, + "grad_norm": 0.4647560715675354, + "learning_rate": 0.00037685608265787936, + "loss": 0.7682, + "step": 3706 + }, + { + "epoch": 0.6613147801266613, + "grad_norm": 0.48037344217300415, + "learning_rate": 0.0003767957052678266, + "loss": 0.6937, + "step": 3707 + }, + { + "epoch": 0.6614931763446615, + "grad_norm": 0.5261335968971252, + "learning_rate": 0.00037673531791929365, + "loss": 1.2998, + "step": 3708 + }, + { + "epoch": 0.6616715725626616, + "grad_norm": 0.48943454027175903, + "learning_rate": 0.0003766749206170234, + "loss": 0.7968, + "step": 3709 + }, + { + "epoch": 0.6618499687806618, + "grad_norm": 0.4693853557109833, + "learning_rate": 0.0003766145133657594, + "loss": 0.9241, + "step": 3710 + }, + { + "epoch": 0.662028364998662, + "grad_norm": 0.49054959416389465, + "learning_rate": 0.00037655409617024606, + "loss": 0.913, + "step": 3711 + }, + { + "epoch": 0.6622067612166622, + "grad_norm": 0.49110257625579834, + "learning_rate": 0.0003764936690352284, + "loss": 1.0838, + "step": 3712 + }, + { + "epoch": 0.6623851574346624, + "grad_norm": 0.4667607843875885, + "learning_rate": 0.00037643323196545245, + "loss": 0.8978, + "step": 3713 + }, + { + "epoch": 0.6625635536526626, + "grad_norm": 0.5172199010848999, + "learning_rate": 0.0003763727849656648, + "loss": 1.2393, + "step": 3714 + }, + { + "epoch": 0.6627419498706627, + "grad_norm": 0.4816920757293701, + "learning_rate": 0.0003763123280406131, + "loss": 1.0024, + "step": 3715 + }, + { + "epoch": 0.6629203460886629, + "grad_norm": 0.5231166481971741, + "learning_rate": 0.00037625186119504537, + "loss": 1.0882, + "step": 3716 + }, + { + "epoch": 0.6630987423066631, + "grad_norm": 0.5482428669929504, + "learning_rate": 0.000376191384433711, + "loss": 1.3842, + "step": 3717 + }, + { + "epoch": 0.6632771385246633, + "grad_norm": 0.5781173706054688, + "learning_rate": 0.00037613089776135947, + "loss": 1.4105, + "step": 3718 + }, + { + "epoch": 0.6634555347426635, + "grad_norm": 0.5220258235931396, + "learning_rate": 0.0003760704011827415, + "loss": 0.9949, + "step": 3719 + }, + { + "epoch": 0.6636339309606636, + "grad_norm": 0.46553224325180054, + "learning_rate": 0.0003760098947026085, + "loss": 0.7985, + "step": 3720 + }, + { + "epoch": 0.6638123271786638, + "grad_norm": 0.47272759675979614, + "learning_rate": 0.00037594937832571254, + "loss": 0.819, + "step": 3721 + }, + { + "epoch": 0.663990723396664, + "grad_norm": 0.5201370716094971, + "learning_rate": 0.0003758888520568067, + "loss": 0.899, + "step": 3722 + }, + { + "epoch": 0.6641691196146642, + "grad_norm": 0.52834552526474, + "learning_rate": 0.0003758283159006446, + "loss": 0.9406, + "step": 3723 + }, + { + "epoch": 0.6643475158326644, + "grad_norm": 0.4457343518733978, + "learning_rate": 0.00037576776986198064, + "loss": 0.841, + "step": 3724 + }, + { + "epoch": 0.6645259120506646, + "grad_norm": 0.49147358536720276, + "learning_rate": 0.00037570721394557016, + "loss": 0.9662, + "step": 3725 + }, + { + "epoch": 0.6647043082686647, + "grad_norm": 0.5019490122795105, + "learning_rate": 0.00037564664815616924, + "loss": 1.1227, + "step": 3726 + }, + { + "epoch": 0.6648827044866649, + "grad_norm": 0.4801279902458191, + "learning_rate": 0.0003755860724985346, + "loss": 0.7102, + "step": 3727 + }, + { + "epoch": 0.6650611007046651, + "grad_norm": 0.5752885937690735, + "learning_rate": 0.00037552548697742386, + "loss": 1.1417, + "step": 3728 + }, + { + "epoch": 0.6652394969226653, + "grad_norm": 0.45870140194892883, + "learning_rate": 0.00037546489159759545, + "loss": 0.763, + "step": 3729 + }, + { + "epoch": 0.6654178931406655, + "grad_norm": 0.4713680148124695, + "learning_rate": 0.0003754042863638084, + "loss": 0.8104, + "step": 3730 + }, + { + "epoch": 0.6655962893586655, + "grad_norm": 0.5558474659919739, + "learning_rate": 0.0003753436712808227, + "loss": 1.0264, + "step": 3731 + }, + { + "epoch": 0.6657746855766657, + "grad_norm": 0.5492153167724609, + "learning_rate": 0.000375283046353399, + "loss": 1.1912, + "step": 3732 + }, + { + "epoch": 0.665953081794666, + "grad_norm": 0.4615228474140167, + "learning_rate": 0.00037522241158629866, + "loss": 0.9509, + "step": 3733 + }, + { + "epoch": 0.6661314780126661, + "grad_norm": 0.4563251733779907, + "learning_rate": 0.00037516176698428413, + "loss": 0.8065, + "step": 3734 + }, + { + "epoch": 0.6663098742306663, + "grad_norm": 0.5740782618522644, + "learning_rate": 0.0003751011125521182, + "loss": 0.9293, + "step": 3735 + }, + { + "epoch": 0.6664882704486665, + "grad_norm": 0.49503761529922485, + "learning_rate": 0.0003750404482945648, + "loss": 0.9861, + "step": 3736 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.4828610122203827, + "learning_rate": 0.0003749797742163883, + "loss": 1.1359, + "step": 3737 + }, + { + "epoch": 0.6668450628846668, + "grad_norm": 0.4599243402481079, + "learning_rate": 0.00037491909032235423, + "loss": 0.7794, + "step": 3738 + }, + { + "epoch": 0.667023459102667, + "grad_norm": 0.5307605862617493, + "learning_rate": 0.0003748583966172285, + "loss": 0.8625, + "step": 3739 + }, + { + "epoch": 0.6672018553206672, + "grad_norm": 0.49819859862327576, + "learning_rate": 0.000374797693105778, + "loss": 0.8321, + "step": 3740 + }, + { + "epoch": 0.6673802515386674, + "grad_norm": 0.5101906657218933, + "learning_rate": 0.0003747369797927704, + "loss": 1.0208, + "step": 3741 + }, + { + "epoch": 0.6675586477566675, + "grad_norm": 0.4965439438819885, + "learning_rate": 0.0003746762566829742, + "loss": 0.9561, + "step": 3742 + }, + { + "epoch": 0.6677370439746677, + "grad_norm": 0.4781574308872223, + "learning_rate": 0.00037461552378115833, + "loss": 0.8842, + "step": 3743 + }, + { + "epoch": 0.6679154401926679, + "grad_norm": 0.512837827205658, + "learning_rate": 0.00037455478109209284, + "loss": 0.9223, + "step": 3744 + }, + { + "epoch": 0.6680938364106681, + "grad_norm": 0.484651654958725, + "learning_rate": 0.0003744940286205485, + "loss": 0.8012, + "step": 3745 + }, + { + "epoch": 0.6682722326286683, + "grad_norm": 0.5075222849845886, + "learning_rate": 0.00037443326637129674, + "loss": 0.9905, + "step": 3746 + }, + { + "epoch": 0.6684506288466685, + "grad_norm": 0.48406416177749634, + "learning_rate": 0.0003743724943491097, + "loss": 1.0252, + "step": 3747 + }, + { + "epoch": 0.6686290250646686, + "grad_norm": 0.4906768202781677, + "learning_rate": 0.0003743117125587606, + "loss": 0.8734, + "step": 3748 + }, + { + "epoch": 0.6688074212826688, + "grad_norm": 0.4577210545539856, + "learning_rate": 0.00037425092100502297, + "loss": 0.9074, + "step": 3749 + }, + { + "epoch": 0.668985817500669, + "grad_norm": 0.4762527346611023, + "learning_rate": 0.0003741901196926715, + "loss": 0.9042, + "step": 3750 + }, + { + "epoch": 0.6691642137186692, + "grad_norm": 0.5320841073989868, + "learning_rate": 0.00037412930862648153, + "loss": 1.1009, + "step": 3751 + }, + { + "epoch": 0.6693426099366694, + "grad_norm": 0.4859885275363922, + "learning_rate": 0.00037406848781122904, + "loss": 0.9112, + "step": 3752 + }, + { + "epoch": 0.6695210061546695, + "grad_norm": 0.5073356628417969, + "learning_rate": 0.0003740076572516909, + "loss": 1.1243, + "step": 3753 + }, + { + "epoch": 0.6696994023726697, + "grad_norm": 0.492034912109375, + "learning_rate": 0.00037394681695264475, + "loss": 1.0717, + "step": 3754 + }, + { + "epoch": 0.6698777985906699, + "grad_norm": 0.40963229537010193, + "learning_rate": 0.0003738859669188689, + "loss": 0.7091, + "step": 3755 + }, + { + "epoch": 0.6700561948086701, + "grad_norm": 0.5281137824058533, + "learning_rate": 0.00037382510715514255, + "loss": 1.0417, + "step": 3756 + }, + { + "epoch": 0.6702345910266703, + "grad_norm": 0.48045825958251953, + "learning_rate": 0.0003737642376662456, + "loss": 0.9008, + "step": 3757 + }, + { + "epoch": 0.6704129872446705, + "grad_norm": 0.565067708492279, + "learning_rate": 0.0003737033584569586, + "loss": 0.9471, + "step": 3758 + }, + { + "epoch": 0.6705913834626706, + "grad_norm": 0.5361825227737427, + "learning_rate": 0.0003736424695320631, + "loss": 1.1245, + "step": 3759 + }, + { + "epoch": 0.6707697796806708, + "grad_norm": 0.47022882103919983, + "learning_rate": 0.00037358157089634127, + "loss": 0.9096, + "step": 3760 + }, + { + "epoch": 0.670948175898671, + "grad_norm": 0.49209490418434143, + "learning_rate": 0.000373520662554576, + "loss": 0.9196, + "step": 3761 + }, + { + "epoch": 0.6711265721166711, + "grad_norm": 0.4670359194278717, + "learning_rate": 0.0003734597445115511, + "loss": 0.7455, + "step": 3762 + }, + { + "epoch": 0.6713049683346713, + "grad_norm": 0.520520806312561, + "learning_rate": 0.000373398816772051, + "loss": 0.9116, + "step": 3763 + }, + { + "epoch": 0.6714833645526714, + "grad_norm": 0.5995724201202393, + "learning_rate": 0.0003733378793408609, + "loss": 1.3693, + "step": 3764 + }, + { + "epoch": 0.6716617607706716, + "grad_norm": 0.4709326922893524, + "learning_rate": 0.00037327693222276683, + "loss": 0.9914, + "step": 3765 + }, + { + "epoch": 0.6718401569886718, + "grad_norm": 0.4547140598297119, + "learning_rate": 0.00037321597542255554, + "loss": 0.9951, + "step": 3766 + }, + { + "epoch": 0.672018553206672, + "grad_norm": 0.46117034554481506, + "learning_rate": 0.0003731550089450146, + "loss": 0.9847, + "step": 3767 + }, + { + "epoch": 0.6721969494246722, + "grad_norm": 0.6557008028030396, + "learning_rate": 0.00037309403279493227, + "loss": 0.9359, + "step": 3768 + }, + { + "epoch": 0.6723753456426724, + "grad_norm": 0.4666654169559479, + "learning_rate": 0.00037303304697709755, + "loss": 0.8247, + "step": 3769 + }, + { + "epoch": 0.6725537418606725, + "grad_norm": 0.4964097738265991, + "learning_rate": 0.00037297205149630023, + "loss": 0.8682, + "step": 3770 + }, + { + "epoch": 0.6727321380786727, + "grad_norm": 0.47724971175193787, + "learning_rate": 0.000372911046357331, + "loss": 0.7927, + "step": 3771 + }, + { + "epoch": 0.6729105342966729, + "grad_norm": 0.4240172803401947, + "learning_rate": 0.00037285003156498097, + "loss": 0.6913, + "step": 3772 + }, + { + "epoch": 0.6730889305146731, + "grad_norm": 0.7888692021369934, + "learning_rate": 0.00037278900712404235, + "loss": 0.734, + "step": 3773 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.46591684222221375, + "learning_rate": 0.000372727973039308, + "loss": 0.8113, + "step": 3774 + }, + { + "epoch": 0.6734457229506734, + "grad_norm": 0.47991588711738586, + "learning_rate": 0.00037266692931557145, + "loss": 0.9792, + "step": 3775 + }, + { + "epoch": 0.6736241191686736, + "grad_norm": 0.4709876775741577, + "learning_rate": 0.00037260587595762705, + "loss": 0.8581, + "step": 3776 + }, + { + "epoch": 0.6738025153866738, + "grad_norm": 0.4677605628967285, + "learning_rate": 0.0003725448129702699, + "loss": 0.9992, + "step": 3777 + }, + { + "epoch": 0.673980911604674, + "grad_norm": 0.47087326645851135, + "learning_rate": 0.0003724837403582959, + "loss": 0.9046, + "step": 3778 + }, + { + "epoch": 0.6741593078226742, + "grad_norm": 0.44651949405670166, + "learning_rate": 0.0003724226581265016, + "loss": 0.6954, + "step": 3779 + }, + { + "epoch": 0.6743377040406744, + "grad_norm": 0.4970155656337738, + "learning_rate": 0.0003723615662796844, + "loss": 0.9304, + "step": 3780 + }, + { + "epoch": 0.6745161002586745, + "grad_norm": 0.5014781355857849, + "learning_rate": 0.0003723004648226425, + "loss": 1.093, + "step": 3781 + }, + { + "epoch": 0.6746944964766747, + "grad_norm": 0.43172088265419006, + "learning_rate": 0.0003722393537601748, + "loss": 0.7743, + "step": 3782 + }, + { + "epoch": 0.6748728926946749, + "grad_norm": 0.4725377559661865, + "learning_rate": 0.0003721782330970808, + "loss": 1.0071, + "step": 3783 + }, + { + "epoch": 0.6750512889126751, + "grad_norm": 0.4693569839000702, + "learning_rate": 0.0003721171028381609, + "loss": 0.9637, + "step": 3784 + }, + { + "epoch": 0.6752296851306753, + "grad_norm": 1.366469144821167, + "learning_rate": 0.0003720559629882163, + "loss": 0.8897, + "step": 3785 + }, + { + "epoch": 0.6754080813486754, + "grad_norm": 0.4972679615020752, + "learning_rate": 0.0003719948135520489, + "loss": 0.8672, + "step": 3786 + }, + { + "epoch": 0.6755864775666756, + "grad_norm": 0.47456255555152893, + "learning_rate": 0.00037193365453446126, + "loss": 0.8747, + "step": 3787 + }, + { + "epoch": 0.6757648737846758, + "grad_norm": 0.4588843286037445, + "learning_rate": 0.000371872485940257, + "loss": 0.7423, + "step": 3788 + }, + { + "epoch": 0.675943270002676, + "grad_norm": 0.5166525840759277, + "learning_rate": 0.0003718113077742401, + "loss": 1.0436, + "step": 3789 + }, + { + "epoch": 0.6761216662206762, + "grad_norm": 0.5156573057174683, + "learning_rate": 0.0003717501200412154, + "loss": 0.8229, + "step": 3790 + }, + { + "epoch": 0.6763000624386764, + "grad_norm": 0.4706706702709198, + "learning_rate": 0.00037168892274598884, + "loss": 0.9703, + "step": 3791 + }, + { + "epoch": 0.6764784586566764, + "grad_norm": 0.48749077320098877, + "learning_rate": 0.0003716277158933666, + "loss": 0.9563, + "step": 3792 + }, + { + "epoch": 0.6766568548746766, + "grad_norm": 0.49335891008377075, + "learning_rate": 0.00037156649948815585, + "loss": 1.1277, + "step": 3793 + }, + { + "epoch": 0.6768352510926768, + "grad_norm": 0.45877259969711304, + "learning_rate": 0.00037150527353516457, + "loss": 0.7111, + "step": 3794 + }, + { + "epoch": 0.677013647310677, + "grad_norm": 0.5235643982887268, + "learning_rate": 0.00037144403803920136, + "loss": 0.9515, + "step": 3795 + }, + { + "epoch": 0.6771920435286772, + "grad_norm": 0.43552201986312866, + "learning_rate": 0.00037138279300507574, + "loss": 0.7094, + "step": 3796 + }, + { + "epoch": 0.6773704397466773, + "grad_norm": 0.4871746301651001, + "learning_rate": 0.0003713215384375977, + "loss": 1.0252, + "step": 3797 + }, + { + "epoch": 0.6775488359646775, + "grad_norm": 0.6767199635505676, + "learning_rate": 0.00037126027434157826, + "loss": 1.0102, + "step": 3798 + }, + { + "epoch": 0.6777272321826777, + "grad_norm": 0.514312207698822, + "learning_rate": 0.0003711990007218291, + "loss": 0.921, + "step": 3799 + }, + { + "epoch": 0.6779056284006779, + "grad_norm": 0.5360898375511169, + "learning_rate": 0.00037113771758316255, + "loss": 0.9598, + "step": 3800 + }, + { + "epoch": 0.6780840246186781, + "grad_norm": 0.47847217321395874, + "learning_rate": 0.00037107642493039184, + "loss": 0.8728, + "step": 3801 + }, + { + "epoch": 0.6782624208366783, + "grad_norm": 0.49450331926345825, + "learning_rate": 0.0003710151227683307, + "loss": 0.7853, + "step": 3802 + }, + { + "epoch": 0.6784408170546784, + "grad_norm": 0.47471120953559875, + "learning_rate": 0.00037095381110179406, + "loss": 1.0686, + "step": 3803 + }, + { + "epoch": 0.6786192132726786, + "grad_norm": 0.5599477887153625, + "learning_rate": 0.000370892489935597, + "loss": 1.1936, + "step": 3804 + }, + { + "epoch": 0.6787976094906788, + "grad_norm": 0.525962233543396, + "learning_rate": 0.0003708311592745559, + "loss": 0.7819, + "step": 3805 + }, + { + "epoch": 0.678976005708679, + "grad_norm": 2.3493411540985107, + "learning_rate": 0.00037076981912348753, + "loss": 0.8769, + "step": 3806 + }, + { + "epoch": 0.6791544019266792, + "grad_norm": 0.4811551868915558, + "learning_rate": 0.0003707084694872095, + "loss": 0.9784, + "step": 3807 + }, + { + "epoch": 0.6793327981446793, + "grad_norm": 0.4610568583011627, + "learning_rate": 0.0003706471103705402, + "loss": 1.0097, + "step": 3808 + }, + { + "epoch": 0.6795111943626795, + "grad_norm": 0.5705133080482483, + "learning_rate": 0.0003705857417782989, + "loss": 1.113, + "step": 3809 + }, + { + "epoch": 0.6796895905806797, + "grad_norm": 0.4349512457847595, + "learning_rate": 0.00037052436371530517, + "loss": 0.7286, + "step": 3810 + }, + { + "epoch": 0.6798679867986799, + "grad_norm": 0.5243642330169678, + "learning_rate": 0.00037046297618637984, + "loss": 0.8463, + "step": 3811 + }, + { + "epoch": 0.6800463830166801, + "grad_norm": 0.7079116702079773, + "learning_rate": 0.0003704015791963442, + "loss": 1.1995, + "step": 3812 + }, + { + "epoch": 0.6802247792346803, + "grad_norm": 0.547192394733429, + "learning_rate": 0.00037034017275002043, + "loss": 0.9706, + "step": 3813 + }, + { + "epoch": 0.6804031754526804, + "grad_norm": 1.1593716144561768, + "learning_rate": 0.00037027875685223115, + "loss": 0.8588, + "step": 3814 + }, + { + "epoch": 0.6805815716706806, + "grad_norm": 0.46274659037590027, + "learning_rate": 0.0003702173315078001, + "loss": 0.9027, + "step": 3815 + }, + { + "epoch": 0.6807599678886808, + "grad_norm": 0.5039249658584595, + "learning_rate": 0.0003701558967215517, + "loss": 1.0564, + "step": 3816 + }, + { + "epoch": 0.680938364106681, + "grad_norm": 0.43996962904930115, + "learning_rate": 0.00037009445249831075, + "loss": 0.9811, + "step": 3817 + }, + { + "epoch": 0.6811167603246812, + "grad_norm": 0.5002626180648804, + "learning_rate": 0.00037003299884290315, + "loss": 1.0044, + "step": 3818 + }, + { + "epoch": 0.6812951565426812, + "grad_norm": 0.5452658534049988, + "learning_rate": 0.0003699715357601555, + "loss": 0.82, + "step": 3819 + }, + { + "epoch": 0.6814735527606814, + "grad_norm": 0.5324759483337402, + "learning_rate": 0.00036991006325489507, + "loss": 0.8513, + "step": 3820 + }, + { + "epoch": 0.6816519489786816, + "grad_norm": 0.4961846172809601, + "learning_rate": 0.00036984858133194985, + "loss": 0.8615, + "step": 3821 + }, + { + "epoch": 0.6818303451966818, + "grad_norm": 0.46880999207496643, + "learning_rate": 0.0003697870899961487, + "loss": 0.907, + "step": 3822 + }, + { + "epoch": 0.682008741414682, + "grad_norm": 0.4610176086425781, + "learning_rate": 0.0003697255892523211, + "loss": 0.9399, + "step": 3823 + }, + { + "epoch": 0.6821871376326822, + "grad_norm": 0.5100571513175964, + "learning_rate": 0.00036966407910529715, + "loss": 0.9109, + "step": 3824 + }, + { + "epoch": 0.6823655338506823, + "grad_norm": 0.4634707570075989, + "learning_rate": 0.00036960255955990787, + "loss": 0.8546, + "step": 3825 + }, + { + "epoch": 0.6825439300686825, + "grad_norm": 0.5142379403114319, + "learning_rate": 0.0003695410306209851, + "loss": 0.9521, + "step": 3826 + }, + { + "epoch": 0.6827223262866827, + "grad_norm": 0.4953165352344513, + "learning_rate": 0.0003694794922933612, + "loss": 0.9574, + "step": 3827 + }, + { + "epoch": 0.6829007225046829, + "grad_norm": 0.530571460723877, + "learning_rate": 0.0003694179445818694, + "loss": 1.0247, + "step": 3828 + }, + { + "epoch": 0.6830791187226831, + "grad_norm": 0.49260106682777405, + "learning_rate": 0.0003693563874913437, + "loss": 0.8785, + "step": 3829 + }, + { + "epoch": 0.6832575149406832, + "grad_norm": 0.48824089765548706, + "learning_rate": 0.0003692948210266186, + "loss": 0.9205, + "step": 3830 + }, + { + "epoch": 0.6834359111586834, + "grad_norm": 0.5169446468353271, + "learning_rate": 0.0003692332451925296, + "loss": 0.7974, + "step": 3831 + }, + { + "epoch": 0.6836143073766836, + "grad_norm": 0.5154136419296265, + "learning_rate": 0.0003691716599939129, + "loss": 0.8386, + "step": 3832 + }, + { + "epoch": 0.6837927035946838, + "grad_norm": 0.47398611903190613, + "learning_rate": 0.00036911006543560514, + "loss": 0.9594, + "step": 3833 + }, + { + "epoch": 0.683971099812684, + "grad_norm": 0.4491838812828064, + "learning_rate": 0.00036904846152244425, + "loss": 0.8647, + "step": 3834 + }, + { + "epoch": 0.6841494960306842, + "grad_norm": 0.5062149167060852, + "learning_rate": 0.0003689868482592684, + "loss": 1.0463, + "step": 3835 + }, + { + "epoch": 0.6843278922486843, + "grad_norm": 0.7602079510688782, + "learning_rate": 0.00036892522565091666, + "loss": 1.035, + "step": 3836 + }, + { + "epoch": 0.6845062884666845, + "grad_norm": 0.46411171555519104, + "learning_rate": 0.00036886359370222896, + "loss": 0.8845, + "step": 3837 + }, + { + "epoch": 0.6846846846846847, + "grad_norm": 1.1559884548187256, + "learning_rate": 0.00036880195241804567, + "loss": 1.0434, + "step": 3838 + }, + { + "epoch": 0.6848630809026849, + "grad_norm": 0.477621853351593, + "learning_rate": 0.0003687403018032082, + "loss": 1.0325, + "step": 3839 + }, + { + "epoch": 0.6850414771206851, + "grad_norm": 0.47134527564048767, + "learning_rate": 0.0003686786418625585, + "loss": 0.8306, + "step": 3840 + }, + { + "epoch": 0.6852198733386852, + "grad_norm": 0.47260260581970215, + "learning_rate": 0.0003686169726009393, + "loss": 1.0148, + "step": 3841 + }, + { + "epoch": 0.6853982695566854, + "grad_norm": 0.4394271671772003, + "learning_rate": 0.0003685552940231942, + "loss": 0.8118, + "step": 3842 + }, + { + "epoch": 0.6855766657746856, + "grad_norm": 0.5549933910369873, + "learning_rate": 0.0003684936061341673, + "loss": 0.8082, + "step": 3843 + }, + { + "epoch": 0.6857550619926858, + "grad_norm": 0.49931079149246216, + "learning_rate": 0.00036843190893870356, + "loss": 0.9249, + "step": 3844 + }, + { + "epoch": 0.685933458210686, + "grad_norm": 0.5112189054489136, + "learning_rate": 0.00036837020244164865, + "loss": 0.9894, + "step": 3845 + }, + { + "epoch": 0.6861118544286862, + "grad_norm": 0.544303297996521, + "learning_rate": 0.00036830848664784894, + "loss": 1.0843, + "step": 3846 + }, + { + "epoch": 0.6862902506466863, + "grad_norm": 0.5149818658828735, + "learning_rate": 0.00036824676156215164, + "loss": 1.1905, + "step": 3847 + }, + { + "epoch": 0.6864686468646864, + "grad_norm": 0.44846704602241516, + "learning_rate": 0.00036818502718940463, + "loss": 0.7878, + "step": 3848 + }, + { + "epoch": 0.6866470430826866, + "grad_norm": 0.5292502641677856, + "learning_rate": 0.00036812328353445637, + "loss": 1.015, + "step": 3849 + }, + { + "epoch": 0.6868254393006868, + "grad_norm": 0.4887588918209076, + "learning_rate": 0.00036806153060215627, + "loss": 0.9761, + "step": 3850 + }, + { + "epoch": 0.687003835518687, + "grad_norm": 0.4837714433670044, + "learning_rate": 0.00036799976839735436, + "loss": 0.9546, + "step": 3851 + }, + { + "epoch": 0.6871822317366871, + "grad_norm": 0.4452599883079529, + "learning_rate": 0.00036793799692490145, + "loss": 0.9103, + "step": 3852 + }, + { + "epoch": 0.6873606279546873, + "grad_norm": 0.4157600402832031, + "learning_rate": 0.000367876216189649, + "loss": 0.7636, + "step": 3853 + }, + { + "epoch": 0.6875390241726875, + "grad_norm": 0.4187622666358948, + "learning_rate": 0.0003678144261964492, + "loss": 0.7129, + "step": 3854 + }, + { + "epoch": 0.6877174203906877, + "grad_norm": 1.3808401823043823, + "learning_rate": 0.0003677526269501551, + "loss": 1.03, + "step": 3855 + }, + { + "epoch": 0.6878958166086879, + "grad_norm": 0.4691815972328186, + "learning_rate": 0.00036769081845562033, + "loss": 0.8812, + "step": 3856 + }, + { + "epoch": 0.6880742128266881, + "grad_norm": 0.5647638440132141, + "learning_rate": 0.0003676290007176994, + "loss": 1.0311, + "step": 3857 + }, + { + "epoch": 0.6882526090446882, + "grad_norm": 0.47513964772224426, + "learning_rate": 0.0003675671737412473, + "loss": 0.8746, + "step": 3858 + }, + { + "epoch": 0.6884310052626884, + "grad_norm": 0.5500690340995789, + "learning_rate": 0.00036750533753112004, + "loss": 1.1705, + "step": 3859 + }, + { + "epoch": 0.6886094014806886, + "grad_norm": 0.45833203196525574, + "learning_rate": 0.0003674434920921741, + "loss": 0.8107, + "step": 3860 + }, + { + "epoch": 0.6887877976986888, + "grad_norm": 0.6918094158172607, + "learning_rate": 0.00036738163742926677, + "loss": 0.9976, + "step": 3861 + }, + { + "epoch": 0.688966193916689, + "grad_norm": 0.49966564774513245, + "learning_rate": 0.0003673197735472563, + "loss": 0.7383, + "step": 3862 + }, + { + "epoch": 0.6891445901346891, + "grad_norm": 0.5262913107872009, + "learning_rate": 0.0003672579004510012, + "loss": 0.924, + "step": 3863 + }, + { + "epoch": 0.6893229863526893, + "grad_norm": 0.5179538726806641, + "learning_rate": 0.000367196018145361, + "loss": 0.9421, + "step": 3864 + }, + { + "epoch": 0.6895013825706895, + "grad_norm": 0.502363920211792, + "learning_rate": 0.00036713412663519606, + "loss": 1.0026, + "step": 3865 + }, + { + "epoch": 0.6896797787886897, + "grad_norm": 0.5639678239822388, + "learning_rate": 0.0003670722259253672, + "loss": 1.0164, + "step": 3866 + }, + { + "epoch": 0.6898581750066899, + "grad_norm": 0.4934401214122772, + "learning_rate": 0.0003670103160207361, + "loss": 1.126, + "step": 3867 + }, + { + "epoch": 0.6900365712246901, + "grad_norm": 0.4546942710876465, + "learning_rate": 0.0003669483969261651, + "loss": 0.8328, + "step": 3868 + }, + { + "epoch": 0.6902149674426902, + "grad_norm": 0.5671041011810303, + "learning_rate": 0.00036688646864651745, + "loss": 1.0304, + "step": 3869 + }, + { + "epoch": 0.6903933636606904, + "grad_norm": 0.5230646133422852, + "learning_rate": 0.0003668245311866567, + "loss": 0.9212, + "step": 3870 + }, + { + "epoch": 0.6905717598786906, + "grad_norm": 0.4050388038158417, + "learning_rate": 0.0003667625845514476, + "loss": 0.5513, + "step": 3871 + }, + { + "epoch": 0.6907501560966908, + "grad_norm": 0.6363729238510132, + "learning_rate": 0.00036670062874575535, + "loss": 0.9514, + "step": 3872 + }, + { + "epoch": 0.690928552314691, + "grad_norm": 0.49325573444366455, + "learning_rate": 0.0003666386637744459, + "loss": 1.0872, + "step": 3873 + }, + { + "epoch": 0.6911069485326911, + "grad_norm": 0.49835440516471863, + "learning_rate": 0.00036657668964238613, + "loss": 1.1338, + "step": 3874 + }, + { + "epoch": 0.6912853447506913, + "grad_norm": 0.4174972176551819, + "learning_rate": 0.0003665147063544432, + "loss": 0.7673, + "step": 3875 + }, + { + "epoch": 0.6914637409686915, + "grad_norm": 0.43042904138565063, + "learning_rate": 0.0003664527139154854, + "loss": 0.768, + "step": 3876 + }, + { + "epoch": 0.6916421371866917, + "grad_norm": 0.46625080704689026, + "learning_rate": 0.00036639071233038155, + "loss": 0.9156, + "step": 3877 + }, + { + "epoch": 0.6918205334046919, + "grad_norm": 0.4662775993347168, + "learning_rate": 0.0003663287016040013, + "loss": 0.8285, + "step": 3878 + }, + { + "epoch": 0.691998929622692, + "grad_norm": 0.4520246982574463, + "learning_rate": 0.0003662666817412148, + "loss": 0.8158, + "step": 3879 + }, + { + "epoch": 0.6921773258406921, + "grad_norm": 0.5535463690757751, + "learning_rate": 0.0003662046527468932, + "loss": 1.1017, + "step": 3880 + }, + { + "epoch": 0.6923557220586923, + "grad_norm": 1.1918361186981201, + "learning_rate": 0.00036614261462590824, + "loss": 0.793, + "step": 3881 + }, + { + "epoch": 0.6925341182766925, + "grad_norm": 0.5035448670387268, + "learning_rate": 0.00036608056738313225, + "loss": 0.8917, + "step": 3882 + }, + { + "epoch": 0.6927125144946927, + "grad_norm": 0.5091261267662048, + "learning_rate": 0.00036601851102343843, + "loss": 1.0292, + "step": 3883 + }, + { + "epoch": 0.6928909107126929, + "grad_norm": 0.45190200209617615, + "learning_rate": 0.0003659564455517007, + "loss": 1.043, + "step": 3884 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.5362012386322021, + "learning_rate": 0.0003658943709727936, + "loss": 1.0457, + "step": 3885 + }, + { + "epoch": 0.6932477031486932, + "grad_norm": 0.5333651304244995, + "learning_rate": 0.00036583228729159244, + "loss": 1.0046, + "step": 3886 + }, + { + "epoch": 0.6934260993666934, + "grad_norm": 0.5217045545578003, + "learning_rate": 0.0003657701945129734, + "loss": 0.9786, + "step": 3887 + }, + { + "epoch": 0.6936044955846936, + "grad_norm": 0.5744450688362122, + "learning_rate": 0.0003657080926418131, + "loss": 1.03, + "step": 3888 + }, + { + "epoch": 0.6937828918026938, + "grad_norm": 0.5800135135650635, + "learning_rate": 0.0003656459816829889, + "loss": 1.0379, + "step": 3889 + }, + { + "epoch": 0.693961288020694, + "grad_norm": 0.4809887707233429, + "learning_rate": 0.0003655838616413791, + "loss": 0.9534, + "step": 3890 + }, + { + "epoch": 0.6941396842386941, + "grad_norm": 0.46151599287986755, + "learning_rate": 0.0003655217325218626, + "loss": 0.9506, + "step": 3891 + }, + { + "epoch": 0.6943180804566943, + "grad_norm": 1.1912611722946167, + "learning_rate": 0.0003654595943293189, + "loss": 0.7983, + "step": 3892 + }, + { + "epoch": 0.6944964766746945, + "grad_norm": 0.48154789209365845, + "learning_rate": 0.00036539744706862837, + "loss": 0.8602, + "step": 3893 + }, + { + "epoch": 0.6946748728926947, + "grad_norm": 0.5169458985328674, + "learning_rate": 0.000365335290744672, + "loss": 1.0327, + "step": 3894 + }, + { + "epoch": 0.6948532691106949, + "grad_norm": 0.4628457725048065, + "learning_rate": 0.00036527312536233146, + "loss": 0.8947, + "step": 3895 + }, + { + "epoch": 0.695031665328695, + "grad_norm": 0.5199170708656311, + "learning_rate": 0.00036521095092648933, + "loss": 0.9375, + "step": 3896 + }, + { + "epoch": 0.6952100615466952, + "grad_norm": 0.5095205307006836, + "learning_rate": 0.0003651487674420287, + "loss": 1.0075, + "step": 3897 + }, + { + "epoch": 0.6953884577646954, + "grad_norm": 0.5311485528945923, + "learning_rate": 0.0003650865749138334, + "loss": 0.8594, + "step": 3898 + }, + { + "epoch": 0.6955668539826956, + "grad_norm": 0.5111739635467529, + "learning_rate": 0.000365024373346788, + "loss": 0.8538, + "step": 3899 + }, + { + "epoch": 0.6957452502006958, + "grad_norm": 0.48374149203300476, + "learning_rate": 0.0003649621627457779, + "loss": 0.9959, + "step": 3900 + }, + { + "epoch": 0.695923646418696, + "grad_norm": 0.4373980760574341, + "learning_rate": 0.00036489994311568897, + "loss": 0.8307, + "step": 3901 + }, + { + "epoch": 0.6961020426366961, + "grad_norm": 0.47654423117637634, + "learning_rate": 0.0003648377144614079, + "loss": 0.8287, + "step": 3902 + }, + { + "epoch": 0.6962804388546963, + "grad_norm": 0.5087730884552002, + "learning_rate": 0.0003647754767878222, + "loss": 1.21, + "step": 3903 + }, + { + "epoch": 0.6964588350726965, + "grad_norm": 0.503084123134613, + "learning_rate": 0.0003647132300998199, + "loss": 0.8816, + "step": 3904 + }, + { + "epoch": 0.6966372312906967, + "grad_norm": 0.4780065417289734, + "learning_rate": 0.00036465097440229003, + "loss": 0.9056, + "step": 3905 + }, + { + "epoch": 0.6968156275086969, + "grad_norm": 0.48324069380760193, + "learning_rate": 0.0003645887097001218, + "loss": 0.9316, + "step": 3906 + }, + { + "epoch": 0.6969940237266969, + "grad_norm": 0.4731059968471527, + "learning_rate": 0.0003645264359982057, + "loss": 0.8059, + "step": 3907 + }, + { + "epoch": 0.6971724199446971, + "grad_norm": 0.4973444640636444, + "learning_rate": 0.0003644641533014326, + "loss": 1.0185, + "step": 3908 + }, + { + "epoch": 0.6973508161626973, + "grad_norm": 0.5493341088294983, + "learning_rate": 0.00036440186161469407, + "loss": 1.1505, + "step": 3909 + }, + { + "epoch": 0.6975292123806975, + "grad_norm": 0.5255337357521057, + "learning_rate": 0.0003643395609428827, + "loss": 1.0592, + "step": 3910 + }, + { + "epoch": 0.6977076085986977, + "grad_norm": 0.4478547275066376, + "learning_rate": 0.0003642772512908913, + "loss": 0.8705, + "step": 3911 + }, + { + "epoch": 0.6978860048166979, + "grad_norm": 0.49308907985687256, + "learning_rate": 0.00036421493266361384, + "loss": 0.745, + "step": 3912 + }, + { + "epoch": 0.698064401034698, + "grad_norm": 0.5503551363945007, + "learning_rate": 0.00036415260506594463, + "loss": 1.0582, + "step": 3913 + }, + { + "epoch": 0.6982427972526982, + "grad_norm": 0.5110753178596497, + "learning_rate": 0.00036409026850277906, + "loss": 0.7994, + "step": 3914 + }, + { + "epoch": 0.6984211934706984, + "grad_norm": 0.5047644376754761, + "learning_rate": 0.00036402792297901283, + "loss": 0.9037, + "step": 3915 + }, + { + "epoch": 0.6985995896886986, + "grad_norm": 0.5323673486709595, + "learning_rate": 0.0003639655684995426, + "loss": 0.8984, + "step": 3916 + }, + { + "epoch": 0.6987779859066988, + "grad_norm": 0.49493974447250366, + "learning_rate": 0.0003639032050692656, + "loss": 0.949, + "step": 3917 + }, + { + "epoch": 0.6989563821246989, + "grad_norm": 0.4778236150741577, + "learning_rate": 0.00036384083269308, + "loss": 0.6483, + "step": 3918 + }, + { + "epoch": 0.6991347783426991, + "grad_norm": 0.519985556602478, + "learning_rate": 0.00036377845137588435, + "loss": 1.0307, + "step": 3919 + }, + { + "epoch": 0.6993131745606993, + "grad_norm": 0.49531275033950806, + "learning_rate": 0.00036371606112257807, + "loss": 0.7647, + "step": 3920 + }, + { + "epoch": 0.6994915707786995, + "grad_norm": 0.5061904788017273, + "learning_rate": 0.00036365366193806135, + "loss": 0.93, + "step": 3921 + }, + { + "epoch": 0.6996699669966997, + "grad_norm": 0.45195135474205017, + "learning_rate": 0.0003635912538272349, + "loss": 0.831, + "step": 3922 + }, + { + "epoch": 0.6998483632146999, + "grad_norm": 0.5661010146141052, + "learning_rate": 0.00036352883679500027, + "loss": 0.9327, + "step": 3923 + }, + { + "epoch": 0.7000267594327, + "grad_norm": 0.5504164099693298, + "learning_rate": 0.0003634664108462596, + "loss": 0.8443, + "step": 3924 + }, + { + "epoch": 0.7002051556507002, + "grad_norm": 0.6557819843292236, + "learning_rate": 0.0003634039759859158, + "loss": 1.0994, + "step": 3925 + }, + { + "epoch": 0.7003835518687004, + "grad_norm": 0.48669978976249695, + "learning_rate": 0.00036334153221887264, + "loss": 0.7105, + "step": 3926 + }, + { + "epoch": 0.7005619480867006, + "grad_norm": 0.4523518979549408, + "learning_rate": 0.00036327907955003425, + "loss": 0.8413, + "step": 3927 + }, + { + "epoch": 0.7007403443047008, + "grad_norm": 0.469539999961853, + "learning_rate": 0.0003632166179843058, + "loss": 0.7825, + "step": 3928 + }, + { + "epoch": 0.7009187405227009, + "grad_norm": 0.5297795534133911, + "learning_rate": 0.0003631541475265928, + "loss": 1.044, + "step": 3929 + }, + { + "epoch": 0.7010971367407011, + "grad_norm": 0.46915116906166077, + "learning_rate": 0.0003630916681818018, + "loss": 1.0134, + "step": 3930 + }, + { + "epoch": 0.7012755329587013, + "grad_norm": 0.7894530296325684, + "learning_rate": 0.0003630291799548398, + "loss": 1.0422, + "step": 3931 + }, + { + "epoch": 0.7014539291767015, + "grad_norm": 0.4228399395942688, + "learning_rate": 0.00036296668285061464, + "loss": 0.8499, + "step": 3932 + }, + { + "epoch": 0.7016323253947017, + "grad_norm": 0.5037341713905334, + "learning_rate": 0.00036290417687403483, + "loss": 0.9448, + "step": 3933 + }, + { + "epoch": 0.7018107216127019, + "grad_norm": 0.43407323956489563, + "learning_rate": 0.00036284166203000957, + "loss": 0.7291, + "step": 3934 + }, + { + "epoch": 0.701989117830702, + "grad_norm": 0.46290135383605957, + "learning_rate": 0.00036277913832344875, + "loss": 0.8746, + "step": 3935 + }, + { + "epoch": 0.7021675140487021, + "grad_norm": 0.4666912257671356, + "learning_rate": 0.0003627166057592629, + "loss": 0.8483, + "step": 3936 + }, + { + "epoch": 0.7023459102667023, + "grad_norm": 0.44675248861312866, + "learning_rate": 0.0003626540643423634, + "loss": 0.8229, + "step": 3937 + }, + { + "epoch": 0.7025243064847025, + "grad_norm": 0.46909114718437195, + "learning_rate": 0.0003625915140776621, + "loss": 0.7874, + "step": 3938 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 0.47172433137893677, + "learning_rate": 0.00036252895497007175, + "loss": 0.9997, + "step": 3939 + }, + { + "epoch": 0.7028810989207028, + "grad_norm": 0.5105620622634888, + "learning_rate": 0.0003624663870245057, + "loss": 0.9346, + "step": 3940 + }, + { + "epoch": 0.703059495138703, + "grad_norm": 0.4702140688896179, + "learning_rate": 0.0003624038102458781, + "loss": 0.9306, + "step": 3941 + }, + { + "epoch": 0.7032378913567032, + "grad_norm": 0.49693727493286133, + "learning_rate": 0.0003623412246391035, + "loss": 0.979, + "step": 3942 + }, + { + "epoch": 0.7034162875747034, + "grad_norm": 0.47724950313568115, + "learning_rate": 0.00036227863020909753, + "loss": 0.9922, + "step": 3943 + }, + { + "epoch": 0.7035946837927036, + "grad_norm": 0.4332534968852997, + "learning_rate": 0.0003622160269607762, + "loss": 0.8226, + "step": 3944 + }, + { + "epoch": 0.7037730800107038, + "grad_norm": 0.4521944522857666, + "learning_rate": 0.00036215341489905645, + "loss": 0.9031, + "step": 3945 + }, + { + "epoch": 0.7039514762287039, + "grad_norm": 0.46443673968315125, + "learning_rate": 0.00036209079402885577, + "loss": 0.8799, + "step": 3946 + }, + { + "epoch": 0.7041298724467041, + "grad_norm": 0.48729583621025085, + "learning_rate": 0.00036202816435509233, + "loss": 0.8665, + "step": 3947 + }, + { + "epoch": 0.7043082686647043, + "grad_norm": 0.48398932814598083, + "learning_rate": 0.00036196552588268506, + "loss": 0.8229, + "step": 3948 + }, + { + "epoch": 0.7044866648827045, + "grad_norm": 0.47495603561401367, + "learning_rate": 0.0003619028786165536, + "loss": 0.8878, + "step": 3949 + }, + { + "epoch": 0.7046650611007047, + "grad_norm": 0.5229374766349792, + "learning_rate": 0.0003618402225616182, + "loss": 0.9401, + "step": 3950 + }, + { + "epoch": 0.7048434573187048, + "grad_norm": 0.4787358343601227, + "learning_rate": 0.00036177755772279983, + "loss": 0.9042, + "step": 3951 + }, + { + "epoch": 0.705021853536705, + "grad_norm": 0.4975121021270752, + "learning_rate": 0.00036171488410502016, + "loss": 1.0987, + "step": 3952 + }, + { + "epoch": 0.7052002497547052, + "grad_norm": 0.44728225469589233, + "learning_rate": 0.00036165220171320166, + "loss": 0.774, + "step": 3953 + }, + { + "epoch": 0.7053786459727054, + "grad_norm": 0.5024580955505371, + "learning_rate": 0.0003615895105522672, + "loss": 0.9369, + "step": 3954 + }, + { + "epoch": 0.7055570421907056, + "grad_norm": 0.482075035572052, + "learning_rate": 0.00036152681062714064, + "loss": 0.9221, + "step": 3955 + }, + { + "epoch": 0.7057354384087058, + "grad_norm": 0.4962591230869293, + "learning_rate": 0.0003614641019427464, + "loss": 0.8499, + "step": 3956 + }, + { + "epoch": 0.7059138346267059, + "grad_norm": 0.5466794967651367, + "learning_rate": 0.0003614013845040095, + "loss": 0.8993, + "step": 3957 + }, + { + "epoch": 0.7060922308447061, + "grad_norm": 0.5112261176109314, + "learning_rate": 0.00036133865831585577, + "loss": 0.8757, + "step": 3958 + }, + { + "epoch": 0.7062706270627063, + "grad_norm": 0.4203137159347534, + "learning_rate": 0.0003612759233832118, + "loss": 0.7496, + "step": 3959 + }, + { + "epoch": 0.7064490232807065, + "grad_norm": 0.4720518887042999, + "learning_rate": 0.00036121317971100464, + "loss": 0.9851, + "step": 3960 + }, + { + "epoch": 0.7066274194987067, + "grad_norm": 0.43325722217559814, + "learning_rate": 0.0003611504273041623, + "loss": 0.6659, + "step": 3961 + }, + { + "epoch": 0.7068058157167068, + "grad_norm": 0.44432467222213745, + "learning_rate": 0.0003610876661676131, + "loss": 0.8509, + "step": 3962 + }, + { + "epoch": 0.706984211934707, + "grad_norm": 0.4786689281463623, + "learning_rate": 0.0003610248963062865, + "loss": 0.9216, + "step": 3963 + }, + { + "epoch": 0.7071626081527072, + "grad_norm": 0.4814835488796234, + "learning_rate": 0.00036096211772511226, + "loss": 1.0138, + "step": 3964 + }, + { + "epoch": 0.7073410043707073, + "grad_norm": 0.4865817725658417, + "learning_rate": 0.0003608993304290211, + "loss": 1.0312, + "step": 3965 + }, + { + "epoch": 0.7075194005887075, + "grad_norm": 0.49532395601272583, + "learning_rate": 0.00036083653442294417, + "loss": 0.7395, + "step": 3966 + }, + { + "epoch": 0.7076977968067077, + "grad_norm": 0.4562307298183441, + "learning_rate": 0.0003607737297118136, + "loss": 0.7416, + "step": 3967 + }, + { + "epoch": 0.7078761930247078, + "grad_norm": 0.5441547632217407, + "learning_rate": 0.00036071091630056204, + "loss": 1.2372, + "step": 3968 + }, + { + "epoch": 0.708054589242708, + "grad_norm": 0.4169851839542389, + "learning_rate": 0.00036064809419412264, + "loss": 0.8411, + "step": 3969 + }, + { + "epoch": 0.7082329854607082, + "grad_norm": 0.48741570115089417, + "learning_rate": 0.0003605852633974296, + "loss": 1.1458, + "step": 3970 + }, + { + "epoch": 0.7084113816787084, + "grad_norm": 0.47009482979774475, + "learning_rate": 0.0003605224239154175, + "loss": 0.8117, + "step": 3971 + }, + { + "epoch": 0.7085897778967086, + "grad_norm": 0.4695878028869629, + "learning_rate": 0.00036045957575302174, + "loss": 0.8465, + "step": 3972 + }, + { + "epoch": 0.7087681741147087, + "grad_norm": 0.46119317412376404, + "learning_rate": 0.0003603967189151785, + "loss": 0.7996, + "step": 3973 + }, + { + "epoch": 0.7089465703327089, + "grad_norm": 0.46736615896224976, + "learning_rate": 0.0003603338534068245, + "loss": 0.8513, + "step": 3974 + }, + { + "epoch": 0.7091249665507091, + "grad_norm": 0.588634729385376, + "learning_rate": 0.00036027097923289707, + "loss": 1.0962, + "step": 3975 + }, + { + "epoch": 0.7093033627687093, + "grad_norm": 0.4701951742172241, + "learning_rate": 0.00036020809639833446, + "loss": 0.8302, + "step": 3976 + }, + { + "epoch": 0.7094817589867095, + "grad_norm": 0.4576779305934906, + "learning_rate": 0.00036014520490807535, + "loss": 0.8504, + "step": 3977 + }, + { + "epoch": 0.7096601552047097, + "grad_norm": 0.5647838711738586, + "learning_rate": 0.00036008230476705915, + "loss": 0.8446, + "step": 3978 + }, + { + "epoch": 0.7098385514227098, + "grad_norm": 0.48183631896972656, + "learning_rate": 0.00036001939598022625, + "loss": 0.9849, + "step": 3979 + }, + { + "epoch": 0.71001694764071, + "grad_norm": 0.4493800699710846, + "learning_rate": 0.00035995647855251726, + "loss": 0.8262, + "step": 3980 + }, + { + "epoch": 0.7101953438587102, + "grad_norm": 0.5250908136367798, + "learning_rate": 0.00035989355248887384, + "loss": 0.7036, + "step": 3981 + }, + { + "epoch": 0.7103737400767104, + "grad_norm": 0.4472370743751526, + "learning_rate": 0.000359830617794238, + "loss": 0.8584, + "step": 3982 + }, + { + "epoch": 0.7105521362947106, + "grad_norm": 0.48600542545318604, + "learning_rate": 0.00035976767447355273, + "loss": 0.7812, + "step": 3983 + }, + { + "epoch": 0.7107305325127107, + "grad_norm": 0.49188825488090515, + "learning_rate": 0.00035970472253176155, + "loss": 0.8518, + "step": 3984 + }, + { + "epoch": 0.7109089287307109, + "grad_norm": 0.482270210981369, + "learning_rate": 0.0003596417619738087, + "loss": 0.767, + "step": 3985 + }, + { + "epoch": 0.7110873249487111, + "grad_norm": 0.460519403219223, + "learning_rate": 0.000359578792804639, + "loss": 0.9311, + "step": 3986 + }, + { + "epoch": 0.7112657211667113, + "grad_norm": 0.5443057417869568, + "learning_rate": 0.00035951581502919813, + "loss": 0.8974, + "step": 3987 + }, + { + "epoch": 0.7114441173847115, + "grad_norm": 0.44756433367729187, + "learning_rate": 0.0003594528286524322, + "loss": 0.8031, + "step": 3988 + }, + { + "epoch": 0.7116225136027117, + "grad_norm": 0.7050688862800598, + "learning_rate": 0.0003593898336792883, + "loss": 1.1276, + "step": 3989 + }, + { + "epoch": 0.7118009098207118, + "grad_norm": 0.5873562097549438, + "learning_rate": 0.0003593268301147139, + "loss": 1.1023, + "step": 3990 + }, + { + "epoch": 0.711979306038712, + "grad_norm": 0.4501679241657257, + "learning_rate": 0.0003592638179636573, + "loss": 0.8911, + "step": 3991 + }, + { + "epoch": 0.7121577022567122, + "grad_norm": 0.48664426803588867, + "learning_rate": 0.0003592007972310674, + "loss": 0.8171, + "step": 3992 + }, + { + "epoch": 0.7123360984747124, + "grad_norm": 0.45436549186706543, + "learning_rate": 0.000359137767921894, + "loss": 0.7589, + "step": 3993 + }, + { + "epoch": 0.7125144946927126, + "grad_norm": 0.5350733995437622, + "learning_rate": 0.0003590747300410871, + "loss": 1.1801, + "step": 3994 + }, + { + "epoch": 0.7126928909107126, + "grad_norm": 0.49254482984542847, + "learning_rate": 0.00035901168359359797, + "loss": 0.9704, + "step": 3995 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.4413221776485443, + "learning_rate": 0.0003589486285843781, + "loss": 0.9475, + "step": 3996 + }, + { + "epoch": 0.713049683346713, + "grad_norm": 0.4686570465564728, + "learning_rate": 0.0003588855650183798, + "loss": 0.8909, + "step": 3997 + }, + { + "epoch": 0.7132280795647132, + "grad_norm": 0.5192868113517761, + "learning_rate": 0.0003588224929005561, + "loss": 0.9769, + "step": 3998 + }, + { + "epoch": 0.7134064757827134, + "grad_norm": 0.4699283242225647, + "learning_rate": 0.0003587594122358607, + "loss": 0.8673, + "step": 3999 + }, + { + "epoch": 0.7135848720007136, + "grad_norm": 0.44109442830085754, + "learning_rate": 0.00035869632302924776, + "loss": 0.8594, + "step": 4000 + }, + { + "epoch": 0.7137632682187137, + "grad_norm": 0.44390279054641724, + "learning_rate": 0.00035863322528567246, + "loss": 0.8449, + "step": 4001 + }, + { + "epoch": 0.7139416644367139, + "grad_norm": 0.5160883069038391, + "learning_rate": 0.00035857011901009036, + "loss": 0.9412, + "step": 4002 + }, + { + "epoch": 0.7141200606547141, + "grad_norm": 0.49634233117103577, + "learning_rate": 0.00035850700420745783, + "loss": 1.0827, + "step": 4003 + }, + { + "epoch": 0.7142984568727143, + "grad_norm": 0.45757240056991577, + "learning_rate": 0.0003584438808827319, + "loss": 0.8453, + "step": 4004 + }, + { + "epoch": 0.7144768530907145, + "grad_norm": 2.338979959487915, + "learning_rate": 0.0003583807490408702, + "loss": 1.1377, + "step": 4005 + }, + { + "epoch": 0.7146552493087146, + "grad_norm": 0.46473217010498047, + "learning_rate": 0.00035831760868683117, + "loss": 0.9048, + "step": 4006 + }, + { + "epoch": 0.7148336455267148, + "grad_norm": 0.47779056429862976, + "learning_rate": 0.0003582544598255737, + "loss": 1.0637, + "step": 4007 + }, + { + "epoch": 0.715012041744715, + "grad_norm": 0.4706021845340729, + "learning_rate": 0.0003581913024620577, + "loss": 0.7752, + "step": 4008 + }, + { + "epoch": 0.7151904379627152, + "grad_norm": 0.4808463752269745, + "learning_rate": 0.0003581281366012431, + "loss": 0.8006, + "step": 4009 + }, + { + "epoch": 0.7153688341807154, + "grad_norm": 0.5095382928848267, + "learning_rate": 0.0003580649622480914, + "loss": 0.9802, + "step": 4010 + }, + { + "epoch": 0.7155472303987156, + "grad_norm": 0.5496415495872498, + "learning_rate": 0.000358001779407564, + "loss": 1.2067, + "step": 4011 + }, + { + "epoch": 0.7157256266167157, + "grad_norm": 0.47136303782463074, + "learning_rate": 0.0003579385880846232, + "loss": 0.8652, + "step": 4012 + }, + { + "epoch": 0.7159040228347159, + "grad_norm": 0.7725960612297058, + "learning_rate": 0.00035787538828423225, + "loss": 0.7905, + "step": 4013 + }, + { + "epoch": 0.7160824190527161, + "grad_norm": 0.5106984376907349, + "learning_rate": 0.0003578121800113548, + "loss": 0.8218, + "step": 4014 + }, + { + "epoch": 0.7162608152707163, + "grad_norm": 0.4280400276184082, + "learning_rate": 0.0003577489632709551, + "loss": 0.7157, + "step": 4015 + }, + { + "epoch": 0.7164392114887165, + "grad_norm": 0.439709335565567, + "learning_rate": 0.0003576857380679981, + "loss": 0.9857, + "step": 4016 + }, + { + "epoch": 0.7166176077067166, + "grad_norm": 0.4796721041202545, + "learning_rate": 0.0003576225044074496, + "loss": 0.7725, + "step": 4017 + }, + { + "epoch": 0.7167960039247168, + "grad_norm": 0.5130892992019653, + "learning_rate": 0.00035755926229427595, + "loss": 1.048, + "step": 4018 + }, + { + "epoch": 0.716974400142717, + "grad_norm": 0.6783162951469421, + "learning_rate": 0.0003574960117334441, + "loss": 0.9083, + "step": 4019 + }, + { + "epoch": 0.7171527963607172, + "grad_norm": 0.551443874835968, + "learning_rate": 0.00035743275272992177, + "loss": 0.9846, + "step": 4020 + }, + { + "epoch": 0.7173311925787174, + "grad_norm": 0.5131356716156006, + "learning_rate": 0.0003573694852886773, + "loss": 0.8985, + "step": 4021 + }, + { + "epoch": 0.7175095887967176, + "grad_norm": 0.5424959659576416, + "learning_rate": 0.0003573062094146796, + "loss": 1.0965, + "step": 4022 + }, + { + "epoch": 0.7176879850147176, + "grad_norm": 0.5547580122947693, + "learning_rate": 0.0003572429251128984, + "loss": 0.9774, + "step": 4023 + }, + { + "epoch": 0.7178663812327178, + "grad_norm": 0.4463725686073303, + "learning_rate": 0.000357179632388304, + "loss": 0.7453, + "step": 4024 + }, + { + "epoch": 0.718044777450718, + "grad_norm": 0.4756935238838196, + "learning_rate": 0.0003571163312458674, + "loss": 0.8413, + "step": 4025 + }, + { + "epoch": 0.7182231736687182, + "grad_norm": 0.47553977370262146, + "learning_rate": 0.0003570530216905603, + "loss": 0.7793, + "step": 4026 + }, + { + "epoch": 0.7184015698867184, + "grad_norm": 0.48220735788345337, + "learning_rate": 0.000356989703727355, + "loss": 0.858, + "step": 4027 + }, + { + "epoch": 0.7185799661047185, + "grad_norm": 1.1826428174972534, + "learning_rate": 0.0003569263773612242, + "loss": 1.007, + "step": 4028 + }, + { + "epoch": 0.7187583623227187, + "grad_norm": 0.5101688504219055, + "learning_rate": 0.0003568630425971419, + "loss": 1.0152, + "step": 4029 + }, + { + "epoch": 0.7189367585407189, + "grad_norm": 0.6145039200782776, + "learning_rate": 0.00035679969944008217, + "loss": 0.8944, + "step": 4030 + }, + { + "epoch": 0.7191151547587191, + "grad_norm": 0.4975384771823883, + "learning_rate": 0.00035673634789502, + "loss": 0.9689, + "step": 4031 + }, + { + "epoch": 0.7192935509767193, + "grad_norm": 0.5056141018867493, + "learning_rate": 0.00035667298796693097, + "loss": 0.8644, + "step": 4032 + }, + { + "epoch": 0.7194719471947195, + "grad_norm": 0.51641845703125, + "learning_rate": 0.00035660961966079146, + "loss": 1.1757, + "step": 4033 + }, + { + "epoch": 0.7196503434127196, + "grad_norm": 0.6307793259620667, + "learning_rate": 0.00035654624298157823, + "loss": 0.9336, + "step": 4034 + }, + { + "epoch": 0.7198287396307198, + "grad_norm": 0.5330437421798706, + "learning_rate": 0.0003564828579342689, + "loss": 0.9126, + "step": 4035 + }, + { + "epoch": 0.72000713584872, + "grad_norm": 0.4748086631298065, + "learning_rate": 0.00035641946452384183, + "loss": 0.9391, + "step": 4036 + }, + { + "epoch": 0.7201855320667202, + "grad_norm": 0.4811098873615265, + "learning_rate": 0.00035635606275527575, + "loss": 0.9427, + "step": 4037 + }, + { + "epoch": 0.7203639282847204, + "grad_norm": 0.551956832408905, + "learning_rate": 0.00035629265263355025, + "loss": 1.0388, + "step": 4038 + }, + { + "epoch": 0.7205423245027205, + "grad_norm": 0.5036234855651855, + "learning_rate": 0.0003562292341636456, + "loss": 0.9119, + "step": 4039 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 0.5007266402244568, + "learning_rate": 0.0003561658073505426, + "loss": 1.0323, + "step": 4040 + }, + { + "epoch": 0.7208991169387209, + "grad_norm": 0.5283923745155334, + "learning_rate": 0.0003561023721992228, + "loss": 0.8999, + "step": 4041 + }, + { + "epoch": 0.7210775131567211, + "grad_norm": 0.885420024394989, + "learning_rate": 0.0003560389287146683, + "loss": 0.9517, + "step": 4042 + }, + { + "epoch": 0.7212559093747213, + "grad_norm": 0.4583801329135895, + "learning_rate": 0.000355975476901862, + "loss": 0.8263, + "step": 4043 + }, + { + "epoch": 0.7214343055927215, + "grad_norm": 0.4987337291240692, + "learning_rate": 0.00035591201676578733, + "loss": 0.8681, + "step": 4044 + }, + { + "epoch": 0.7216127018107216, + "grad_norm": 0.5699477195739746, + "learning_rate": 0.00035584854831142846, + "loss": 0.9642, + "step": 4045 + }, + { + "epoch": 0.7217910980287218, + "grad_norm": 0.5087426900863647, + "learning_rate": 0.00035578507154377016, + "loss": 0.9291, + "step": 4046 + }, + { + "epoch": 0.721969494246722, + "grad_norm": 0.520293653011322, + "learning_rate": 0.00035572158646779786, + "loss": 0.988, + "step": 4047 + }, + { + "epoch": 0.7221478904647222, + "grad_norm": 0.511569082736969, + "learning_rate": 0.0003556580930884976, + "loss": 0.9026, + "step": 4048 + }, + { + "epoch": 0.7223262866827224, + "grad_norm": 0.4560140371322632, + "learning_rate": 0.0003555945914108562, + "loss": 0.8434, + "step": 4049 + }, + { + "epoch": 0.7225046829007225, + "grad_norm": 0.49847978353500366, + "learning_rate": 0.00035553108143986106, + "loss": 1.0567, + "step": 4050 + }, + { + "epoch": 0.7226830791187226, + "grad_norm": 0.4683671295642853, + "learning_rate": 0.0003554675631805002, + "loss": 0.7456, + "step": 4051 + }, + { + "epoch": 0.7228614753367228, + "grad_norm": 0.5200768113136292, + "learning_rate": 0.0003554040366377623, + "loss": 0.9804, + "step": 4052 + }, + { + "epoch": 0.723039871554723, + "grad_norm": 0.5331013202667236, + "learning_rate": 0.0003553405018166367, + "loss": 1.0793, + "step": 4053 + }, + { + "epoch": 0.7232182677727232, + "grad_norm": 0.48200780153274536, + "learning_rate": 0.00035527695872211354, + "loss": 0.9059, + "step": 4054 + }, + { + "epoch": 0.7233966639907234, + "grad_norm": 0.5711750388145447, + "learning_rate": 0.00035521340735918317, + "loss": 1.0259, + "step": 4055 + }, + { + "epoch": 0.7235750602087235, + "grad_norm": 0.5371613502502441, + "learning_rate": 0.00035514984773283713, + "loss": 0.9537, + "step": 4056 + }, + { + "epoch": 0.7237534564267237, + "grad_norm": 0.47757935523986816, + "learning_rate": 0.0003550862798480673, + "loss": 0.9782, + "step": 4057 + }, + { + "epoch": 0.7239318526447239, + "grad_norm": 0.5152607560157776, + "learning_rate": 0.0003550227037098663, + "loss": 0.9834, + "step": 4058 + }, + { + "epoch": 0.7241102488627241, + "grad_norm": 0.44710201025009155, + "learning_rate": 0.0003549591193232273, + "loss": 0.9038, + "step": 4059 + }, + { + "epoch": 0.7242886450807243, + "grad_norm": 0.4328174591064453, + "learning_rate": 0.00035489552669314427, + "loss": 0.5936, + "step": 4060 + }, + { + "epoch": 0.7244670412987244, + "grad_norm": 0.5450094938278198, + "learning_rate": 0.00035483192582461175, + "loss": 0.9184, + "step": 4061 + }, + { + "epoch": 0.7246454375167246, + "grad_norm": 0.4884642958641052, + "learning_rate": 0.0003547683167226249, + "loss": 0.982, + "step": 4062 + }, + { + "epoch": 0.7248238337347248, + "grad_norm": 0.4127320647239685, + "learning_rate": 0.00035470469939217944, + "loss": 0.6299, + "step": 4063 + }, + { + "epoch": 0.725002229952725, + "grad_norm": 0.5126365423202515, + "learning_rate": 0.0003546410738382719, + "loss": 0.7333, + "step": 4064 + }, + { + "epoch": 0.7251806261707252, + "grad_norm": 1.068241000175476, + "learning_rate": 0.0003545774400658996, + "loss": 0.7118, + "step": 4065 + }, + { + "epoch": 0.7253590223887254, + "grad_norm": 0.5835636258125305, + "learning_rate": 0.00035451379808006014, + "loss": 0.9209, + "step": 4066 + }, + { + "epoch": 0.7255374186067255, + "grad_norm": 0.4724158048629761, + "learning_rate": 0.0003544501478857519, + "loss": 0.7558, + "step": 4067 + }, + { + "epoch": 0.7257158148247257, + "grad_norm": 0.536562442779541, + "learning_rate": 0.000354386489487974, + "loss": 0.9178, + "step": 4068 + }, + { + "epoch": 0.7258942110427259, + "grad_norm": 1.3931955099105835, + "learning_rate": 0.0003543228228917262, + "loss": 1.0701, + "step": 4069 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.6029762029647827, + "learning_rate": 0.0003542591481020087, + "loss": 0.7614, + "step": 4070 + }, + { + "epoch": 0.7262510034787263, + "grad_norm": 2.0984432697296143, + "learning_rate": 0.0003541954651238226, + "loss": 0.9829, + "step": 4071 + }, + { + "epoch": 0.7264293996967264, + "grad_norm": 1.1561270952224731, + "learning_rate": 0.0003541317739621695, + "loss": 1.1042, + "step": 4072 + }, + { + "epoch": 0.7266077959147266, + "grad_norm": 0.9806638360023499, + "learning_rate": 0.0003540680746220518, + "loss": 0.7303, + "step": 4073 + }, + { + "epoch": 0.7267861921327268, + "grad_norm": 0.5632558465003967, + "learning_rate": 0.0003540043671084722, + "loss": 1.0854, + "step": 4074 + }, + { + "epoch": 0.726964588350727, + "grad_norm": 0.5446335077285767, + "learning_rate": 0.00035394065142643435, + "loss": 1.0717, + "step": 4075 + }, + { + "epoch": 0.7271429845687272, + "grad_norm": 1.9541873931884766, + "learning_rate": 0.0003538769275809425, + "loss": 0.9602, + "step": 4076 + }, + { + "epoch": 0.7273213807867274, + "grad_norm": 0.5483476519584656, + "learning_rate": 0.00035381319557700145, + "loss": 1.0379, + "step": 4077 + }, + { + "epoch": 0.7274997770047275, + "grad_norm": 0.5389299392700195, + "learning_rate": 0.00035374945541961656, + "loss": 0.9395, + "step": 4078 + }, + { + "epoch": 0.7276781732227277, + "grad_norm": 0.5572075247764587, + "learning_rate": 0.00035368570711379423, + "loss": 1.0089, + "step": 4079 + }, + { + "epoch": 0.7278565694407279, + "grad_norm": 0.4937756359577179, + "learning_rate": 0.00035362195066454116, + "loss": 0.9257, + "step": 4080 + }, + { + "epoch": 0.728034965658728, + "grad_norm": 0.5375781059265137, + "learning_rate": 0.00035355818607686455, + "loss": 0.9355, + "step": 4081 + }, + { + "epoch": 0.7282133618767282, + "grad_norm": 0.4889019727706909, + "learning_rate": 0.0003534944133557726, + "loss": 0.8256, + "step": 4082 + }, + { + "epoch": 0.7283917580947283, + "grad_norm": 0.5278054475784302, + "learning_rate": 0.000353430632506274, + "loss": 1.029, + "step": 4083 + }, + { + "epoch": 0.7285701543127285, + "grad_norm": 0.5437285900115967, + "learning_rate": 0.000353366843533378, + "loss": 1.1467, + "step": 4084 + }, + { + "epoch": 0.7287485505307287, + "grad_norm": 0.5061957240104675, + "learning_rate": 0.00035330304644209454, + "loss": 1.0266, + "step": 4085 + }, + { + "epoch": 0.7289269467487289, + "grad_norm": 0.5521642565727234, + "learning_rate": 0.00035323924123743436, + "loss": 0.9845, + "step": 4086 + }, + { + "epoch": 0.7291053429667291, + "grad_norm": 0.5419567823410034, + "learning_rate": 0.00035317542792440853, + "loss": 0.7985, + "step": 4087 + }, + { + "epoch": 0.7292837391847293, + "grad_norm": 0.510177731513977, + "learning_rate": 0.00035311160650802905, + "loss": 0.9469, + "step": 4088 + }, + { + "epoch": 0.7294621354027294, + "grad_norm": 0.5522062182426453, + "learning_rate": 0.0003530477769933083, + "loss": 0.9675, + "step": 4089 + }, + { + "epoch": 0.7296405316207296, + "grad_norm": 0.5909835696220398, + "learning_rate": 0.00035298393938525954, + "loss": 1.0044, + "step": 4090 + }, + { + "epoch": 0.7298189278387298, + "grad_norm": 0.5589690804481506, + "learning_rate": 0.0003529200936888965, + "loss": 1.0368, + "step": 4091 + }, + { + "epoch": 0.72999732405673, + "grad_norm": 0.5542118549346924, + "learning_rate": 0.00035285623990923356, + "loss": 0.7456, + "step": 4092 + }, + { + "epoch": 0.7301757202747302, + "grad_norm": 2.88665509223938, + "learning_rate": 0.00035279237805128585, + "loss": 0.7459, + "step": 4093 + }, + { + "epoch": 0.7303541164927303, + "grad_norm": 0.5373327732086182, + "learning_rate": 0.000352728508120069, + "loss": 1.2578, + "step": 4094 + }, + { + "epoch": 0.7305325127107305, + "grad_norm": 0.48414483666419983, + "learning_rate": 0.00035266463012059924, + "loss": 0.7918, + "step": 4095 + }, + { + "epoch": 0.7307109089287307, + "grad_norm": 0.45521920919418335, + "learning_rate": 0.00035260074405789365, + "loss": 0.8527, + "step": 4096 + }, + { + "epoch": 0.7308893051467309, + "grad_norm": 0.44525134563446045, + "learning_rate": 0.00035253684993696984, + "loss": 0.7479, + "step": 4097 + }, + { + "epoch": 0.7310677013647311, + "grad_norm": 0.4846455454826355, + "learning_rate": 0.0003524729477628459, + "loss": 0.7777, + "step": 4098 + }, + { + "epoch": 0.7312460975827313, + "grad_norm": 0.5616981983184814, + "learning_rate": 0.00035240903754054075, + "loss": 1.0553, + "step": 4099 + }, + { + "epoch": 0.7314244938007314, + "grad_norm": 0.5334004163742065, + "learning_rate": 0.00035234511927507387, + "loss": 0.8418, + "step": 4100 + }, + { + "epoch": 0.7316028900187316, + "grad_norm": 0.5169308185577393, + "learning_rate": 0.00035228119297146533, + "loss": 0.7405, + "step": 4101 + }, + { + "epoch": 0.7317812862367318, + "grad_norm": 0.7203896045684814, + "learning_rate": 0.00035221725863473596, + "loss": 0.8365, + "step": 4102 + }, + { + "epoch": 0.731959682454732, + "grad_norm": 0.4874943196773529, + "learning_rate": 0.0003521533162699071, + "loss": 0.8711, + "step": 4103 + }, + { + "epoch": 0.7321380786727322, + "grad_norm": 0.5255458950996399, + "learning_rate": 0.0003520893658820007, + "loss": 1.1175, + "step": 4104 + }, + { + "epoch": 0.7323164748907324, + "grad_norm": 0.4852457642555237, + "learning_rate": 0.0003520254074760394, + "loss": 0.9222, + "step": 4105 + }, + { + "epoch": 0.7324948711087325, + "grad_norm": 0.49769327044487, + "learning_rate": 0.00035196144105704654, + "loss": 0.8612, + "step": 4106 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.528090238571167, + "learning_rate": 0.00035189746663004607, + "loss": 0.8552, + "step": 4107 + }, + { + "epoch": 0.7328516635447329, + "grad_norm": 0.5445149540901184, + "learning_rate": 0.00035183348420006233, + "loss": 0.9377, + "step": 4108 + }, + { + "epoch": 0.733030059762733, + "grad_norm": 0.49806612730026245, + "learning_rate": 0.00035176949377212045, + "loss": 0.8447, + "step": 4109 + }, + { + "epoch": 0.7332084559807333, + "grad_norm": 0.5145043134689331, + "learning_rate": 0.00035170549535124647, + "loss": 0.8682, + "step": 4110 + }, + { + "epoch": 0.7333868521987333, + "grad_norm": 0.45539185404777527, + "learning_rate": 0.0003516414889424666, + "loss": 0.7736, + "step": 4111 + }, + { + "epoch": 0.7335652484167335, + "grad_norm": 0.5117284059524536, + "learning_rate": 0.00035157747455080794, + "loss": 0.8903, + "step": 4112 + }, + { + "epoch": 0.7337436446347337, + "grad_norm": 0.5072416663169861, + "learning_rate": 0.0003515134521812983, + "loss": 0.8481, + "step": 4113 + }, + { + "epoch": 0.7339220408527339, + "grad_norm": 0.46144771575927734, + "learning_rate": 0.0003514494218389656, + "loss": 0.7497, + "step": 4114 + }, + { + "epoch": 0.7341004370707341, + "grad_norm": 0.4636537432670593, + "learning_rate": 0.000351385383528839, + "loss": 0.9987, + "step": 4115 + }, + { + "epoch": 0.7342788332887343, + "grad_norm": 1.1357630491256714, + "learning_rate": 0.00035132133725594803, + "loss": 1.2211, + "step": 4116 + }, + { + "epoch": 0.7344572295067344, + "grad_norm": 0.48777201771736145, + "learning_rate": 0.0003512572830253228, + "loss": 0.9007, + "step": 4117 + }, + { + "epoch": 0.7346356257247346, + "grad_norm": 0.5612165331840515, + "learning_rate": 0.0003511932208419942, + "loss": 0.9013, + "step": 4118 + }, + { + "epoch": 0.7348140219427348, + "grad_norm": 0.6152064204216003, + "learning_rate": 0.00035112915071099354, + "loss": 1.0285, + "step": 4119 + }, + { + "epoch": 0.734992418160735, + "grad_norm": 0.6013907790184021, + "learning_rate": 0.000351065072637353, + "loss": 0.8736, + "step": 4120 + }, + { + "epoch": 0.7351708143787352, + "grad_norm": 0.5271583199501038, + "learning_rate": 0.000351000986626105, + "loss": 1.1314, + "step": 4121 + }, + { + "epoch": 0.7353492105967353, + "grad_norm": 0.583275556564331, + "learning_rate": 0.00035093689268228306, + "loss": 0.8167, + "step": 4122 + }, + { + "epoch": 0.7355276068147355, + "grad_norm": 0.49837273359298706, + "learning_rate": 0.000350872790810921, + "loss": 0.9489, + "step": 4123 + }, + { + "epoch": 0.7357060030327357, + "grad_norm": 0.4844464957714081, + "learning_rate": 0.0003508086810170533, + "loss": 0.9135, + "step": 4124 + }, + { + "epoch": 0.7358843992507359, + "grad_norm": 0.5695939660072327, + "learning_rate": 0.00035074456330571517, + "loss": 0.9493, + "step": 4125 + }, + { + "epoch": 0.7360627954687361, + "grad_norm": 0.5401460528373718, + "learning_rate": 0.0003506804376819425, + "loss": 0.9202, + "step": 4126 + }, + { + "epoch": 0.7362411916867363, + "grad_norm": 0.4903864860534668, + "learning_rate": 0.0003506163041507715, + "loss": 1.0455, + "step": 4127 + }, + { + "epoch": 0.7364195879047364, + "grad_norm": 0.7425463795661926, + "learning_rate": 0.00035055216271723933, + "loss": 0.9451, + "step": 4128 + }, + { + "epoch": 0.7365979841227366, + "grad_norm": 0.5137443542480469, + "learning_rate": 0.0003504880133863835, + "loss": 1.0321, + "step": 4129 + }, + { + "epoch": 0.7367763803407368, + "grad_norm": 0.5113863348960876, + "learning_rate": 0.0003504238561632424, + "loss": 1.0488, + "step": 4130 + }, + { + "epoch": 0.736954776558737, + "grad_norm": 0.469612717628479, + "learning_rate": 0.0003503596910528548, + "loss": 0.897, + "step": 4131 + }, + { + "epoch": 0.7371331727767372, + "grad_norm": 0.5417755246162415, + "learning_rate": 0.00035029551806026025, + "loss": 0.9615, + "step": 4132 + }, + { + "epoch": 0.7373115689947373, + "grad_norm": 2.0405690670013428, + "learning_rate": 0.00035023133719049894, + "loss": 0.8717, + "step": 4133 + }, + { + "epoch": 0.7374899652127375, + "grad_norm": 0.4939228594303131, + "learning_rate": 0.00035016714844861155, + "loss": 0.7748, + "step": 4134 + }, + { + "epoch": 0.7376683614307377, + "grad_norm": 0.5170210003852844, + "learning_rate": 0.00035010295183963936, + "loss": 0.9805, + "step": 4135 + }, + { + "epoch": 0.7378467576487379, + "grad_norm": 0.5120583772659302, + "learning_rate": 0.0003500387473686244, + "loss": 0.7109, + "step": 4136 + }, + { + "epoch": 0.7380251538667381, + "grad_norm": 0.5130133628845215, + "learning_rate": 0.0003499745350406093, + "loss": 0.9658, + "step": 4137 + }, + { + "epoch": 0.7382035500847383, + "grad_norm": 0.4344574511051178, + "learning_rate": 0.0003499103148606372, + "loss": 0.783, + "step": 4138 + }, + { + "epoch": 0.7383819463027383, + "grad_norm": 0.5518962144851685, + "learning_rate": 0.00034984608683375206, + "loss": 1.0517, + "step": 4139 + }, + { + "epoch": 0.7385603425207385, + "grad_norm": 0.46893310546875, + "learning_rate": 0.00034978185096499814, + "loss": 1.0466, + "step": 4140 + }, + { + "epoch": 0.7387387387387387, + "grad_norm": 0.566422164440155, + "learning_rate": 0.00034971760725942063, + "loss": 1.1278, + "step": 4141 + }, + { + "epoch": 0.7389171349567389, + "grad_norm": 0.47913649678230286, + "learning_rate": 0.00034965335572206515, + "loss": 0.965, + "step": 4142 + }, + { + "epoch": 0.7390955311747391, + "grad_norm": 0.4865665137767792, + "learning_rate": 0.000349589096357978, + "loss": 0.832, + "step": 4143 + }, + { + "epoch": 0.7392739273927392, + "grad_norm": 0.7697198987007141, + "learning_rate": 0.0003495248291722061, + "loss": 0.9949, + "step": 4144 + }, + { + "epoch": 0.7394523236107394, + "grad_norm": 0.49303874373435974, + "learning_rate": 0.00034946055416979686, + "loss": 1.1002, + "step": 4145 + }, + { + "epoch": 0.7396307198287396, + "grad_norm": 0.4551609456539154, + "learning_rate": 0.00034939627135579854, + "loss": 0.826, + "step": 4146 + }, + { + "epoch": 0.7398091160467398, + "grad_norm": 0.5011882781982422, + "learning_rate": 0.00034933198073525986, + "loss": 0.9986, + "step": 4147 + }, + { + "epoch": 0.73998751226474, + "grad_norm": 0.4582299590110779, + "learning_rate": 0.0003492676823132301, + "loss": 0.8219, + "step": 4148 + }, + { + "epoch": 0.7401659084827402, + "grad_norm": 0.4924689531326294, + "learning_rate": 0.00034920337609475936, + "loss": 1.0327, + "step": 4149 + }, + { + "epoch": 0.7403443047007403, + "grad_norm": 0.5348846912384033, + "learning_rate": 0.00034913906208489814, + "loss": 1.0617, + "step": 4150 + }, + { + "epoch": 0.7405227009187405, + "grad_norm": 0.4941270649433136, + "learning_rate": 0.0003490747402886977, + "loss": 0.8433, + "step": 4151 + }, + { + "epoch": 0.7407010971367407, + "grad_norm": 0.5572370290756226, + "learning_rate": 0.0003490104107112097, + "loss": 0.9576, + "step": 4152 + }, + { + "epoch": 0.7408794933547409, + "grad_norm": 0.5385303497314453, + "learning_rate": 0.00034894607335748674, + "loss": 0.9523, + "step": 4153 + }, + { + "epoch": 0.7410578895727411, + "grad_norm": 0.5019118189811707, + "learning_rate": 0.00034888172823258165, + "loss": 0.6984, + "step": 4154 + }, + { + "epoch": 0.7412362857907412, + "grad_norm": 0.4633517861366272, + "learning_rate": 0.0003488173753415482, + "loss": 0.8653, + "step": 4155 + }, + { + "epoch": 0.7414146820087414, + "grad_norm": 0.5317572355270386, + "learning_rate": 0.0003487530146894407, + "loss": 1.1626, + "step": 4156 + }, + { + "epoch": 0.7415930782267416, + "grad_norm": 0.45128071308135986, + "learning_rate": 0.0003486886462813138, + "loss": 0.8697, + "step": 4157 + }, + { + "epoch": 0.7417714744447418, + "grad_norm": 0.4344140589237213, + "learning_rate": 0.0003486242701222232, + "loss": 0.7773, + "step": 4158 + }, + { + "epoch": 0.741949870662742, + "grad_norm": 0.443740576505661, + "learning_rate": 0.0003485598862172248, + "loss": 0.8047, + "step": 4159 + }, + { + "epoch": 0.7421282668807422, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.00034849549457137543, + "loss": 0.8605, + "step": 4160 + }, + { + "epoch": 0.7423066630987423, + "grad_norm": 0.4627962112426758, + "learning_rate": 0.00034843109518973225, + "loss": 0.768, + "step": 4161 + }, + { + "epoch": 0.7424850593167425, + "grad_norm": 0.5590542554855347, + "learning_rate": 0.00034836668807735314, + "loss": 0.9791, + "step": 4162 + }, + { + "epoch": 0.7426634555347427, + "grad_norm": 0.45700347423553467, + "learning_rate": 0.00034830227323929674, + "loss": 0.8522, + "step": 4163 + }, + { + "epoch": 0.7428418517527429, + "grad_norm": 2.536508798599243, + "learning_rate": 0.00034823785068062213, + "loss": 0.9045, + "step": 4164 + }, + { + "epoch": 0.7430202479707431, + "grad_norm": 0.5341298580169678, + "learning_rate": 0.00034817342040638897, + "loss": 0.9195, + "step": 4165 + }, + { + "epoch": 0.7431986441887432, + "grad_norm": 0.45887595415115356, + "learning_rate": 0.00034810898242165766, + "loss": 0.8731, + "step": 4166 + }, + { + "epoch": 0.7433770404067434, + "grad_norm": 0.4361880421638489, + "learning_rate": 0.00034804453673148905, + "loss": 0.8234, + "step": 4167 + }, + { + "epoch": 0.7435554366247435, + "grad_norm": 0.4275573790073395, + "learning_rate": 0.0003479800833409448, + "loss": 0.7071, + "step": 4168 + }, + { + "epoch": 0.7437338328427437, + "grad_norm": 0.5099348425865173, + "learning_rate": 0.0003479156222550869, + "loss": 0.8876, + "step": 4169 + }, + { + "epoch": 0.743912229060744, + "grad_norm": 0.4776555895805359, + "learning_rate": 0.00034785115347897805, + "loss": 0.7928, + "step": 4170 + }, + { + "epoch": 0.7440906252787441, + "grad_norm": 63.1339225769043, + "learning_rate": 0.00034778667701768187, + "loss": 1.9367, + "step": 4171 + }, + { + "epoch": 0.7442690214967442, + "grad_norm": 0.5373305678367615, + "learning_rate": 0.00034772219287626207, + "loss": 0.9005, + "step": 4172 + }, + { + "epoch": 0.7444474177147444, + "grad_norm": 0.5590469837188721, + "learning_rate": 0.0003476577010597834, + "loss": 0.8572, + "step": 4173 + }, + { + "epoch": 0.7446258139327446, + "grad_norm": 0.4844629466533661, + "learning_rate": 0.0003475932015733109, + "loss": 0.9525, + "step": 4174 + }, + { + "epoch": 0.7448042101507448, + "grad_norm": 0.41167715191841125, + "learning_rate": 0.00034752869442191027, + "loss": 0.8344, + "step": 4175 + }, + { + "epoch": 0.744982606368745, + "grad_norm": 0.5152734518051147, + "learning_rate": 0.00034746417961064793, + "loss": 1.0936, + "step": 4176 + }, + { + "epoch": 0.7451610025867451, + "grad_norm": 0.48306646943092346, + "learning_rate": 0.0003473996571445909, + "loss": 0.7804, + "step": 4177 + }, + { + "epoch": 0.7453393988047453, + "grad_norm": 0.5196247100830078, + "learning_rate": 0.0003473351270288067, + "loss": 0.9043, + "step": 4178 + }, + { + "epoch": 0.7455177950227455, + "grad_norm": 0.5509787201881409, + "learning_rate": 0.0003472705892683636, + "loss": 0.8701, + "step": 4179 + }, + { + "epoch": 0.7456961912407457, + "grad_norm": 0.7097774744033813, + "learning_rate": 0.0003472060438683302, + "loss": 0.7195, + "step": 4180 + }, + { + "epoch": 0.7458745874587459, + "grad_norm": 0.49308526515960693, + "learning_rate": 0.00034714149083377594, + "loss": 1.0288, + "step": 4181 + }, + { + "epoch": 0.7460529836767461, + "grad_norm": 0.6980340480804443, + "learning_rate": 0.00034707693016977083, + "loss": 0.858, + "step": 4182 + }, + { + "epoch": 0.7462313798947462, + "grad_norm": 0.4881076216697693, + "learning_rate": 0.0003470123618813854, + "loss": 0.9786, + "step": 4183 + }, + { + "epoch": 0.7464097761127464, + "grad_norm": 0.5367513298988342, + "learning_rate": 0.00034694778597369076, + "loss": 0.8505, + "step": 4184 + }, + { + "epoch": 0.7465881723307466, + "grad_norm": 0.47334301471710205, + "learning_rate": 0.00034688320245175873, + "loss": 0.8669, + "step": 4185 + }, + { + "epoch": 0.7467665685487468, + "grad_norm": 0.6059718728065491, + "learning_rate": 0.0003468186113206617, + "loss": 1.1616, + "step": 4186 + }, + { + "epoch": 0.746944964766747, + "grad_norm": 0.5258612036705017, + "learning_rate": 0.00034675401258547266, + "loss": 0.8902, + "step": 4187 + }, + { + "epoch": 0.7471233609847471, + "grad_norm": 0.5637246370315552, + "learning_rate": 0.00034668940625126506, + "loss": 1.2349, + "step": 4188 + }, + { + "epoch": 0.7473017572027473, + "grad_norm": 78.5971450805664, + "learning_rate": 0.00034662479232311306, + "loss": 1.0211, + "step": 4189 + }, + { + "epoch": 0.7474801534207475, + "grad_norm": 0.6106373071670532, + "learning_rate": 0.00034656017080609154, + "loss": 1.0691, + "step": 4190 + }, + { + "epoch": 0.7476585496387477, + "grad_norm": 0.5662621259689331, + "learning_rate": 0.0003464955417052757, + "loss": 0.9236, + "step": 4191 + }, + { + "epoch": 0.7478369458567479, + "grad_norm": 0.5042510032653809, + "learning_rate": 0.0003464309050257415, + "loss": 0.8522, + "step": 4192 + }, + { + "epoch": 0.7480153420747481, + "grad_norm": 0.5739179849624634, + "learning_rate": 0.0003463662607725656, + "loss": 0.9128, + "step": 4193 + }, + { + "epoch": 0.7481937382927482, + "grad_norm": 0.4624471962451935, + "learning_rate": 0.000346301608950825, + "loss": 0.7043, + "step": 4194 + }, + { + "epoch": 0.7483721345107484, + "grad_norm": 0.5266431570053101, + "learning_rate": 0.00034623694956559747, + "loss": 0.7484, + "step": 4195 + }, + { + "epoch": 0.7485505307287486, + "grad_norm": 0.5885065197944641, + "learning_rate": 0.0003461722826219614, + "loss": 1.179, + "step": 4196 + }, + { + "epoch": 0.7487289269467488, + "grad_norm": 0.5153014659881592, + "learning_rate": 0.0003461076081249956, + "loss": 0.7813, + "step": 4197 + }, + { + "epoch": 0.748907323164749, + "grad_norm": 0.5379844307899475, + "learning_rate": 0.0003460429260797796, + "loss": 0.7575, + "step": 4198 + }, + { + "epoch": 0.749085719382749, + "grad_norm": 0.502864420413971, + "learning_rate": 0.00034597823649139346, + "loss": 0.995, + "step": 4199 + }, + { + "epoch": 0.7492641156007492, + "grad_norm": 0.4639131724834442, + "learning_rate": 0.00034591353936491806, + "loss": 0.8029, + "step": 4200 + }, + { + "epoch": 0.7494425118187494, + "grad_norm": 0.5059810876846313, + "learning_rate": 0.0003458488347054345, + "loss": 0.8779, + "step": 4201 + }, + { + "epoch": 0.7496209080367496, + "grad_norm": 0.4426543712615967, + "learning_rate": 0.00034578412251802466, + "loss": 0.7159, + "step": 4202 + }, + { + "epoch": 0.7497993042547498, + "grad_norm": 0.571412205696106, + "learning_rate": 0.0003457194028077711, + "loss": 0.9628, + "step": 4203 + }, + { + "epoch": 0.74997770047275, + "grad_norm": 0.5668926239013672, + "learning_rate": 0.00034565467557975683, + "loss": 1.0421, + "step": 4204 + }, + { + "epoch": 0.7501560966907501, + "grad_norm": 0.48620158433914185, + "learning_rate": 0.0003455899408390655, + "loss": 0.8639, + "step": 4205 + }, + { + "epoch": 0.7503344929087503, + "grad_norm": 0.5328919887542725, + "learning_rate": 0.0003455251985907814, + "loss": 0.9498, + "step": 4206 + }, + { + "epoch": 0.7505128891267505, + "grad_norm": 0.5057373642921448, + "learning_rate": 0.0003454604488399893, + "loss": 0.9308, + "step": 4207 + }, + { + "epoch": 0.7506912853447507, + "grad_norm": 0.43592992424964905, + "learning_rate": 0.0003453956915917745, + "loss": 0.8826, + "step": 4208 + }, + { + "epoch": 0.7508696815627509, + "grad_norm": 0.46315765380859375, + "learning_rate": 0.00034533092685122324, + "loss": 0.9092, + "step": 4209 + }, + { + "epoch": 0.751048077780751, + "grad_norm": 0.48972928524017334, + "learning_rate": 0.000345266154623422, + "loss": 0.9986, + "step": 4210 + }, + { + "epoch": 0.7512264739987512, + "grad_norm": 0.49682357907295227, + "learning_rate": 0.0003452013749134579, + "loss": 0.9315, + "step": 4211 + }, + { + "epoch": 0.7514048702167514, + "grad_norm": 0.5060596466064453, + "learning_rate": 0.00034513658772641887, + "loss": 1.0385, + "step": 4212 + }, + { + "epoch": 0.7515832664347516, + "grad_norm": 0.652193546295166, + "learning_rate": 0.00034507179306739324, + "loss": 0.9741, + "step": 4213 + }, + { + "epoch": 0.7517616626527518, + "grad_norm": 0.5387786030769348, + "learning_rate": 0.0003450069909414698, + "loss": 0.8605, + "step": 4214 + }, + { + "epoch": 0.751940058870752, + "grad_norm": 0.5394373536109924, + "learning_rate": 0.00034494218135373817, + "loss": 0.8344, + "step": 4215 + }, + { + "epoch": 0.7521184550887521, + "grad_norm": 0.501558780670166, + "learning_rate": 0.00034487736430928846, + "loss": 1.0367, + "step": 4216 + }, + { + "epoch": 0.7522968513067523, + "grad_norm": 0.49226245284080505, + "learning_rate": 0.00034481253981321144, + "loss": 0.8282, + "step": 4217 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.5037868022918701, + "learning_rate": 0.0003447477078705983, + "loss": 1.0139, + "step": 4218 + }, + { + "epoch": 0.7526536437427527, + "grad_norm": 0.4862633943557739, + "learning_rate": 0.00034468286848654106, + "loss": 0.8863, + "step": 4219 + }, + { + "epoch": 0.7528320399607529, + "grad_norm": 0.4658646881580353, + "learning_rate": 0.000344618021666132, + "loss": 0.8069, + "step": 4220 + }, + { + "epoch": 0.753010436178753, + "grad_norm": 0.44514206051826477, + "learning_rate": 0.0003445531674144642, + "loss": 1.0605, + "step": 4221 + }, + { + "epoch": 0.7531888323967532, + "grad_norm": 0.5069167613983154, + "learning_rate": 0.0003444883057366314, + "loss": 0.8131, + "step": 4222 + }, + { + "epoch": 0.7533672286147534, + "grad_norm": 0.48138898611068726, + "learning_rate": 0.00034442343663772755, + "loss": 0.8838, + "step": 4223 + }, + { + "epoch": 0.7535456248327536, + "grad_norm": 0.46832725405693054, + "learning_rate": 0.0003443585601228478, + "loss": 1.0179, + "step": 4224 + }, + { + "epoch": 0.7537240210507538, + "grad_norm": 0.5185703635215759, + "learning_rate": 0.00034429367619708733, + "loss": 0.8979, + "step": 4225 + }, + { + "epoch": 0.753902417268754, + "grad_norm": 0.5143477916717529, + "learning_rate": 0.0003442287848655421, + "loss": 0.8142, + "step": 4226 + }, + { + "epoch": 0.754080813486754, + "grad_norm": 0.5070206522941589, + "learning_rate": 0.00034416388613330864, + "loss": 1.0616, + "step": 4227 + }, + { + "epoch": 0.7542592097047542, + "grad_norm": 0.5132788419723511, + "learning_rate": 0.00034409898000548403, + "loss": 1.0537, + "step": 4228 + }, + { + "epoch": 0.7544376059227544, + "grad_norm": 0.4766051173210144, + "learning_rate": 0.00034403406648716604, + "loss": 0.9183, + "step": 4229 + }, + { + "epoch": 0.7546160021407546, + "grad_norm": 0.6825383305549622, + "learning_rate": 0.00034396914558345297, + "loss": 1.0334, + "step": 4230 + }, + { + "epoch": 0.7547943983587548, + "grad_norm": 0.4257564842700958, + "learning_rate": 0.0003439042172994436, + "loss": 0.75, + "step": 4231 + }, + { + "epoch": 0.7549727945767549, + "grad_norm": 0.5030829906463623, + "learning_rate": 0.0003438392816402375, + "loss": 1.0223, + "step": 4232 + }, + { + "epoch": 0.7551511907947551, + "grad_norm": 0.4708796441555023, + "learning_rate": 0.00034377433861093457, + "loss": 0.7668, + "step": 4233 + }, + { + "epoch": 0.7553295870127553, + "grad_norm": 0.49385902285575867, + "learning_rate": 0.0003437093882166354, + "loss": 0.7773, + "step": 4234 + }, + { + "epoch": 0.7555079832307555, + "grad_norm": 0.5581505298614502, + "learning_rate": 0.00034364443046244124, + "loss": 0.9124, + "step": 4235 + }, + { + "epoch": 0.7556863794487557, + "grad_norm": 0.4790385067462921, + "learning_rate": 0.0003435794653534538, + "loss": 0.8304, + "step": 4236 + }, + { + "epoch": 0.7558647756667559, + "grad_norm": 0.43401768803596497, + "learning_rate": 0.00034351449289477545, + "loss": 0.77, + "step": 4237 + }, + { + "epoch": 0.756043171884756, + "grad_norm": 0.5333930253982544, + "learning_rate": 0.000343449513091509, + "loss": 0.8807, + "step": 4238 + }, + { + "epoch": 0.7562215681027562, + "grad_norm": 0.4971325993537903, + "learning_rate": 0.0003433845259487581, + "loss": 0.8195, + "step": 4239 + }, + { + "epoch": 0.7563999643207564, + "grad_norm": 0.4987589716911316, + "learning_rate": 0.00034331953147162666, + "loss": 0.9902, + "step": 4240 + }, + { + "epoch": 0.7565783605387566, + "grad_norm": 0.45375093817710876, + "learning_rate": 0.0003432545296652194, + "loss": 0.9725, + "step": 4241 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 1.4552748203277588, + "learning_rate": 0.00034318952053464147, + "loss": 0.8451, + "step": 4242 + }, + { + "epoch": 0.7569351529747569, + "grad_norm": 0.44463515281677246, + "learning_rate": 0.0003431245040849987, + "loss": 0.7589, + "step": 4243 + }, + { + "epoch": 0.7571135491927571, + "grad_norm": 0.5243865251541138, + "learning_rate": 0.00034305948032139745, + "loss": 0.776, + "step": 4244 + }, + { + "epoch": 0.7572919454107573, + "grad_norm": 14.643073081970215, + "learning_rate": 0.00034299444924894474, + "loss": 0.9523, + "step": 4245 + }, + { + "epoch": 0.7574703416287575, + "grad_norm": 0.5148363709449768, + "learning_rate": 0.00034292941087274794, + "loss": 0.8977, + "step": 4246 + }, + { + "epoch": 0.7576487378467577, + "grad_norm": 0.5748273134231567, + "learning_rate": 0.0003428643651979152, + "loss": 0.9339, + "step": 4247 + }, + { + "epoch": 0.7578271340647579, + "grad_norm": 0.5037097930908203, + "learning_rate": 0.00034279931222955517, + "loss": 0.8882, + "step": 4248 + }, + { + "epoch": 0.758005530282758, + "grad_norm": 0.5515426397323608, + "learning_rate": 0.00034273425197277715, + "loss": 1.113, + "step": 4249 + }, + { + "epoch": 0.7581839265007582, + "grad_norm": 0.49536243081092834, + "learning_rate": 0.00034266918443269083, + "loss": 0.7182, + "step": 4250 + }, + { + "epoch": 0.7583623227187584, + "grad_norm": 0.49721240997314453, + "learning_rate": 0.0003426041096144067, + "loss": 1.0869, + "step": 4251 + }, + { + "epoch": 0.7585407189367586, + "grad_norm": 0.48414304852485657, + "learning_rate": 0.0003425390275230356, + "loss": 0.8055, + "step": 4252 + }, + { + "epoch": 0.7587191151547588, + "grad_norm": 0.48049938678741455, + "learning_rate": 0.00034247393816368914, + "loss": 0.7842, + "step": 4253 + }, + { + "epoch": 0.7588975113727588, + "grad_norm": 0.5838775634765625, + "learning_rate": 0.00034240884154147934, + "loss": 0.9039, + "step": 4254 + }, + { + "epoch": 0.759075907590759, + "grad_norm": 0.5820823907852173, + "learning_rate": 0.0003423437376615189, + "loss": 1.0101, + "step": 4255 + }, + { + "epoch": 0.7592543038087592, + "grad_norm": 0.6763784289360046, + "learning_rate": 0.00034227862652892103, + "loss": 1.065, + "step": 4256 + }, + { + "epoch": 0.7594327000267594, + "grad_norm": 0.5086767077445984, + "learning_rate": 0.0003422135081487996, + "loss": 1.1122, + "step": 4257 + }, + { + "epoch": 0.7596110962447596, + "grad_norm": 0.4516467750072479, + "learning_rate": 0.0003421483825262688, + "loss": 0.785, + "step": 4258 + }, + { + "epoch": 0.7597894924627598, + "grad_norm": 0.6000534892082214, + "learning_rate": 0.0003420832496664439, + "loss": 1.013, + "step": 4259 + }, + { + "epoch": 0.7599678886807599, + "grad_norm": 0.4857349395751953, + "learning_rate": 0.00034201810957444, + "loss": 1.0669, + "step": 4260 + }, + { + "epoch": 0.7601462848987601, + "grad_norm": 0.5024265646934509, + "learning_rate": 0.0003419529622553735, + "loss": 0.9337, + "step": 4261 + }, + { + "epoch": 0.7603246811167603, + "grad_norm": 0.4981759190559387, + "learning_rate": 0.0003418878077143608, + "loss": 1.0209, + "step": 4262 + }, + { + "epoch": 0.7605030773347605, + "grad_norm": 0.4379788637161255, + "learning_rate": 0.00034182264595651927, + "loss": 0.668, + "step": 4263 + }, + { + "epoch": 0.7606814735527607, + "grad_norm": 0.5320781469345093, + "learning_rate": 0.0003417574769869666, + "loss": 0.8506, + "step": 4264 + }, + { + "epoch": 0.7608598697707608, + "grad_norm": 0.5261650085449219, + "learning_rate": 0.0003416923008108213, + "loss": 1.0954, + "step": 4265 + }, + { + "epoch": 0.761038265988761, + "grad_norm": 0.47511598467826843, + "learning_rate": 0.00034162711743320205, + "loss": 0.9106, + "step": 4266 + }, + { + "epoch": 0.7612166622067612, + "grad_norm": 0.4774247109889984, + "learning_rate": 0.00034156192685922846, + "loss": 0.7263, + "step": 4267 + }, + { + "epoch": 0.7613950584247614, + "grad_norm": 0.46322304010391235, + "learning_rate": 0.00034149672909402056, + "loss": 0.7982, + "step": 4268 + }, + { + "epoch": 0.7615734546427616, + "grad_norm": 0.48991408944129944, + "learning_rate": 0.00034143152414269887, + "loss": 0.8453, + "step": 4269 + }, + { + "epoch": 0.7617518508607618, + "grad_norm": 0.47439679503440857, + "learning_rate": 0.00034136631201038466, + "loss": 0.8049, + "step": 4270 + }, + { + "epoch": 0.7619302470787619, + "grad_norm": 0.4984125792980194, + "learning_rate": 0.0003413010927021996, + "loss": 0.8084, + "step": 4271 + }, + { + "epoch": 0.7621086432967621, + "grad_norm": 0.547827959060669, + "learning_rate": 0.0003412358662232661, + "loss": 0.9195, + "step": 4272 + }, + { + "epoch": 0.7622870395147623, + "grad_norm": 0.4370548725128174, + "learning_rate": 0.0003411706325787068, + "loss": 0.7599, + "step": 4273 + }, + { + "epoch": 0.7624654357327625, + "grad_norm": 0.47955775260925293, + "learning_rate": 0.00034110539177364534, + "loss": 0.8049, + "step": 4274 + }, + { + "epoch": 0.7626438319507627, + "grad_norm": 0.5160841941833496, + "learning_rate": 0.00034104014381320557, + "loss": 0.8973, + "step": 4275 + }, + { + "epoch": 0.7628222281687628, + "grad_norm": 0.506690502166748, + "learning_rate": 0.0003409748887025121, + "loss": 1.0149, + "step": 4276 + }, + { + "epoch": 0.763000624386763, + "grad_norm": 0.5552113652229309, + "learning_rate": 0.00034090962644669, + "loss": 1.028, + "step": 4277 + }, + { + "epoch": 0.7631790206047632, + "grad_norm": 0.45480722188949585, + "learning_rate": 0.000340844357050865, + "loss": 0.7717, + "step": 4278 + }, + { + "epoch": 0.7633574168227634, + "grad_norm": 0.5435570478439331, + "learning_rate": 0.0003407790805201633, + "loss": 1.2671, + "step": 4279 + }, + { + "epoch": 0.7635358130407636, + "grad_norm": 0.43579912185668945, + "learning_rate": 0.0003407137968597116, + "loss": 0.8684, + "step": 4280 + }, + { + "epoch": 0.7637142092587638, + "grad_norm": 0.4706798791885376, + "learning_rate": 0.00034064850607463736, + "loss": 0.9247, + "step": 4281 + }, + { + "epoch": 0.7638926054767639, + "grad_norm": 0.4617820084095001, + "learning_rate": 0.0003405832081700685, + "loss": 0.8273, + "step": 4282 + }, + { + "epoch": 0.764071001694764, + "grad_norm": 0.49580156803131104, + "learning_rate": 0.0003405179031511334, + "loss": 0.9895, + "step": 4283 + }, + { + "epoch": 0.7642493979127643, + "grad_norm": 0.4556731879711151, + "learning_rate": 0.00034045259102296124, + "loss": 0.8607, + "step": 4284 + }, + { + "epoch": 0.7644277941307644, + "grad_norm": 0.461270272731781, + "learning_rate": 0.0003403872717906814, + "loss": 0.8178, + "step": 4285 + }, + { + "epoch": 0.7646061903487646, + "grad_norm": 0.4924364686012268, + "learning_rate": 0.00034032194545942417, + "loss": 0.8912, + "step": 4286 + }, + { + "epoch": 0.7647845865667647, + "grad_norm": 0.5016777515411377, + "learning_rate": 0.00034025661203432024, + "loss": 1.1841, + "step": 4287 + }, + { + "epoch": 0.7649629827847649, + "grad_norm": 0.48156607151031494, + "learning_rate": 0.0003401912715205008, + "loss": 0.7984, + "step": 4288 + }, + { + "epoch": 0.7651413790027651, + "grad_norm": 0.4831927716732025, + "learning_rate": 0.00034012592392309774, + "loss": 0.8091, + "step": 4289 + }, + { + "epoch": 0.7653197752207653, + "grad_norm": 0.48947733640670776, + "learning_rate": 0.0003400605692472433, + "loss": 0.8708, + "step": 4290 + }, + { + "epoch": 0.7654981714387655, + "grad_norm": 0.5063187479972839, + "learning_rate": 0.0003399952074980706, + "loss": 0.8559, + "step": 4291 + }, + { + "epoch": 0.7656765676567657, + "grad_norm": 0.5040058493614197, + "learning_rate": 0.00033992983868071303, + "loss": 0.8416, + "step": 4292 + }, + { + "epoch": 0.7658549638747658, + "grad_norm": 0.5163808465003967, + "learning_rate": 0.0003398644628003046, + "loss": 0.8919, + "step": 4293 + }, + { + "epoch": 0.766033360092766, + "grad_norm": 0.5070242881774902, + "learning_rate": 0.00033979907986197993, + "loss": 0.8687, + "step": 4294 + }, + { + "epoch": 0.7662117563107662, + "grad_norm": 0.5390428900718689, + "learning_rate": 0.00033973368987087423, + "loss": 0.9808, + "step": 4295 + }, + { + "epoch": 0.7663901525287664, + "grad_norm": 0.4794482886791229, + "learning_rate": 0.0003396682928321231, + "loss": 0.9719, + "step": 4296 + }, + { + "epoch": 0.7665685487467666, + "grad_norm": 0.482950896024704, + "learning_rate": 0.0003396028887508628, + "loss": 0.7459, + "step": 4297 + }, + { + "epoch": 0.7667469449647667, + "grad_norm": 0.5277696251869202, + "learning_rate": 0.00033953747763223026, + "loss": 1.1243, + "step": 4298 + }, + { + "epoch": 0.7669253411827669, + "grad_norm": 0.4380790889263153, + "learning_rate": 0.0003394720594813627, + "loss": 0.8427, + "step": 4299 + }, + { + "epoch": 0.7671037374007671, + "grad_norm": 0.5023663640022278, + "learning_rate": 0.0003394066343033981, + "loss": 0.8703, + "step": 4300 + }, + { + "epoch": 0.7672821336187673, + "grad_norm": 0.48462873697280884, + "learning_rate": 0.00033934120210347496, + "loss": 0.8443, + "step": 4301 + }, + { + "epoch": 0.7674605298367675, + "grad_norm": 1.0551936626434326, + "learning_rate": 0.0003392757628867322, + "loss": 0.7749, + "step": 4302 + }, + { + "epoch": 0.7676389260547677, + "grad_norm": 0.6180291771888733, + "learning_rate": 0.0003392103166583095, + "loss": 1.1561, + "step": 4303 + }, + { + "epoch": 0.7678173222727678, + "grad_norm": 0.5031440854072571, + "learning_rate": 0.0003391448634233468, + "loss": 0.8814, + "step": 4304 + }, + { + "epoch": 0.767995718490768, + "grad_norm": 0.4589061737060547, + "learning_rate": 0.00033907940318698504, + "loss": 0.9982, + "step": 4305 + }, + { + "epoch": 0.7681741147087682, + "grad_norm": 0.46637728810310364, + "learning_rate": 0.00033901393595436527, + "loss": 0.8032, + "step": 4306 + }, + { + "epoch": 0.7683525109267684, + "grad_norm": 0.5185346007347107, + "learning_rate": 0.00033894846173062915, + "loss": 1.0561, + "step": 4307 + }, + { + "epoch": 0.7685309071447686, + "grad_norm": 0.43067285418510437, + "learning_rate": 0.00033888298052091916, + "loss": 0.7942, + "step": 4308 + }, + { + "epoch": 0.7687093033627687, + "grad_norm": 0.6915571689605713, + "learning_rate": 0.00033881749233037817, + "loss": 1.0712, + "step": 4309 + }, + { + "epoch": 0.7688876995807689, + "grad_norm": 0.4629500210285187, + "learning_rate": 0.0003387519971641495, + "loss": 0.7828, + "step": 4310 + }, + { + "epoch": 0.7690660957987691, + "grad_norm": 0.65446937084198, + "learning_rate": 0.00033868649502737726, + "loss": 0.8419, + "step": 4311 + }, + { + "epoch": 0.7692444920167693, + "grad_norm": 0.4821755588054657, + "learning_rate": 0.0003386209859252058, + "loss": 0.9422, + "step": 4312 + }, + { + "epoch": 0.7694228882347695, + "grad_norm": 3.9539215564727783, + "learning_rate": 0.0003385554698627803, + "loss": 0.7106, + "step": 4313 + }, + { + "epoch": 0.7696012844527697, + "grad_norm": 0.5639221668243408, + "learning_rate": 0.00033848994684524623, + "loss": 0.8801, + "step": 4314 + }, + { + "epoch": 0.7697796806707697, + "grad_norm": 0.4910728931427002, + "learning_rate": 0.0003384244168777498, + "loss": 0.8116, + "step": 4315 + }, + { + "epoch": 0.7699580768887699, + "grad_norm": 0.5648394823074341, + "learning_rate": 0.0003383588799654378, + "loss": 0.9331, + "step": 4316 + }, + { + "epoch": 0.7701364731067701, + "grad_norm": 0.5196200609207153, + "learning_rate": 0.00033829333611345736, + "loss": 0.8242, + "step": 4317 + }, + { + "epoch": 0.7703148693247703, + "grad_norm": 0.4403199553489685, + "learning_rate": 0.0003382277853269564, + "loss": 0.8163, + "step": 4318 + }, + { + "epoch": 0.7704932655427705, + "grad_norm": 0.7118186950683594, + "learning_rate": 0.000338162227611083, + "loss": 0.8899, + "step": 4319 + }, + { + "epoch": 0.7706716617607706, + "grad_norm": 0.8601351976394653, + "learning_rate": 0.00033809666297098624, + "loss": 1.0249, + "step": 4320 + }, + { + "epoch": 0.7708500579787708, + "grad_norm": 0.8977810144424438, + "learning_rate": 0.0003380310914118155, + "loss": 0.771, + "step": 4321 + }, + { + "epoch": 0.771028454196771, + "grad_norm": 0.5474297404289246, + "learning_rate": 0.0003379655129387207, + "loss": 0.9783, + "step": 4322 + }, + { + "epoch": 0.7712068504147712, + "grad_norm": 6.160559177398682, + "learning_rate": 0.0003378999275568523, + "loss": 1.1071, + "step": 4323 + }, + { + "epoch": 0.7713852466327714, + "grad_norm": 0.6195570230484009, + "learning_rate": 0.0003378343352713614, + "loss": 0.8238, + "step": 4324 + }, + { + "epoch": 0.7715636428507716, + "grad_norm": 0.5602040886878967, + "learning_rate": 0.00033776873608739976, + "loss": 0.7919, + "step": 4325 + }, + { + "epoch": 0.7717420390687717, + "grad_norm": 0.5972548127174377, + "learning_rate": 0.00033770313001011933, + "loss": 0.9286, + "step": 4326 + }, + { + "epoch": 0.7719204352867719, + "grad_norm": 0.5935573577880859, + "learning_rate": 0.0003376375170446727, + "loss": 0.9524, + "step": 4327 + }, + { + "epoch": 0.7720988315047721, + "grad_norm": 0.7750356197357178, + "learning_rate": 0.00033757189719621326, + "loss": 0.8514, + "step": 4328 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.5310060381889343, + "learning_rate": 0.00033750627046989475, + "loss": 0.8609, + "step": 4329 + }, + { + "epoch": 0.7724556239407725, + "grad_norm": 0.5345288515090942, + "learning_rate": 0.00033744063687087136, + "loss": 0.9829, + "step": 4330 + }, + { + "epoch": 0.7726340201587726, + "grad_norm": 0.50868159532547, + "learning_rate": 0.000337374996404298, + "loss": 0.9928, + "step": 4331 + }, + { + "epoch": 0.7728124163767728, + "grad_norm": 0.46974533796310425, + "learning_rate": 0.00033730934907532994, + "loss": 0.832, + "step": 4332 + }, + { + "epoch": 0.772990812594773, + "grad_norm": 0.4263937771320343, + "learning_rate": 0.0003372436948891233, + "loss": 0.7312, + "step": 4333 + }, + { + "epoch": 0.7731692088127732, + "grad_norm": 0.5079303979873657, + "learning_rate": 0.0003371780338508343, + "loss": 0.9124, + "step": 4334 + }, + { + "epoch": 0.7733476050307734, + "grad_norm": 0.5083035826683044, + "learning_rate": 0.00033711236596562004, + "loss": 0.8987, + "step": 4335 + }, + { + "epoch": 0.7735260012487736, + "grad_norm": 0.4942117929458618, + "learning_rate": 0.00033704669123863813, + "loss": 1.0008, + "step": 4336 + }, + { + "epoch": 0.7737043974667737, + "grad_norm": 0.5014617443084717, + "learning_rate": 0.00033698100967504655, + "loss": 0.6739, + "step": 4337 + }, + { + "epoch": 0.7738827936847739, + "grad_norm": 0.4389418065547943, + "learning_rate": 0.0003369153212800038, + "loss": 0.6625, + "step": 4338 + }, + { + "epoch": 0.7740611899027741, + "grad_norm": 0.4890193045139313, + "learning_rate": 0.0003368496260586692, + "loss": 0.7249, + "step": 4339 + }, + { + "epoch": 0.7742395861207743, + "grad_norm": 0.49702176451683044, + "learning_rate": 0.00033678392401620226, + "loss": 0.8034, + "step": 4340 + }, + { + "epoch": 0.7744179823387745, + "grad_norm": 0.5406886339187622, + "learning_rate": 0.00033671821515776336, + "loss": 0.9704, + "step": 4341 + }, + { + "epoch": 0.7745963785567745, + "grad_norm": 0.460330992937088, + "learning_rate": 0.00033665249948851316, + "loss": 0.6877, + "step": 4342 + }, + { + "epoch": 0.7747747747747747, + "grad_norm": 0.4634692370891571, + "learning_rate": 0.0003365867770136129, + "loss": 0.7982, + "step": 4343 + }, + { + "epoch": 0.7749531709927749, + "grad_norm": 0.5319910049438477, + "learning_rate": 0.00033652104773822445, + "loss": 1.0181, + "step": 4344 + }, + { + "epoch": 0.7751315672107751, + "grad_norm": 0.48047569394111633, + "learning_rate": 0.00033645531166751015, + "loss": 0.9439, + "step": 4345 + }, + { + "epoch": 0.7753099634287753, + "grad_norm": 0.4725799858570099, + "learning_rate": 0.00033638956880663285, + "loss": 0.8295, + "step": 4346 + }, + { + "epoch": 0.7754883596467755, + "grad_norm": 0.4927878975868225, + "learning_rate": 0.000336323819160756, + "loss": 0.8699, + "step": 4347 + }, + { + "epoch": 0.7756667558647756, + "grad_norm": 0.5209059715270996, + "learning_rate": 0.00033625806273504354, + "loss": 0.8208, + "step": 4348 + }, + { + "epoch": 0.7758451520827758, + "grad_norm": 0.4795133173465729, + "learning_rate": 0.00033619229953465996, + "loss": 0.8273, + "step": 4349 + }, + { + "epoch": 0.776023548300776, + "grad_norm": 0.5821531414985657, + "learning_rate": 0.0003361265295647703, + "loss": 0.8388, + "step": 4350 + }, + { + "epoch": 0.7762019445187762, + "grad_norm": 0.5282803177833557, + "learning_rate": 0.00033606075283054005, + "loss": 0.6619, + "step": 4351 + }, + { + "epoch": 0.7763803407367764, + "grad_norm": 0.6072514653205872, + "learning_rate": 0.00033599496933713535, + "loss": 1.0466, + "step": 4352 + }, + { + "epoch": 0.7765587369547765, + "grad_norm": 0.4529389441013336, + "learning_rate": 0.0003359291790897227, + "loss": 0.8325, + "step": 4353 + }, + { + "epoch": 0.7767371331727767, + "grad_norm": 0.47834986448287964, + "learning_rate": 0.0003358633820934692, + "loss": 0.9251, + "step": 4354 + }, + { + "epoch": 0.7769155293907769, + "grad_norm": 0.5303228497505188, + "learning_rate": 0.0003357975783535428, + "loss": 1.1421, + "step": 4355 + }, + { + "epoch": 0.7770939256087771, + "grad_norm": 0.5310399532318115, + "learning_rate": 0.00033573176787511145, + "loss": 0.8054, + "step": 4356 + }, + { + "epoch": 0.7772723218267773, + "grad_norm": 0.46372997760772705, + "learning_rate": 0.0003356659506633439, + "loss": 1.0177, + "step": 4357 + }, + { + "epoch": 0.7774507180447775, + "grad_norm": 0.45827627182006836, + "learning_rate": 0.00033560012672340957, + "loss": 0.7673, + "step": 4358 + }, + { + "epoch": 0.7776291142627776, + "grad_norm": 0.5642030835151672, + "learning_rate": 0.000335534296060478, + "loss": 0.8029, + "step": 4359 + }, + { + "epoch": 0.7778075104807778, + "grad_norm": 0.5288789868354797, + "learning_rate": 0.00033546845867971976, + "loss": 1.0097, + "step": 4360 + }, + { + "epoch": 0.777985906698778, + "grad_norm": 0.5099871158599854, + "learning_rate": 0.0003354026145863054, + "loss": 0.8843, + "step": 4361 + }, + { + "epoch": 0.7781643029167782, + "grad_norm": 0.4645063579082489, + "learning_rate": 0.0003353367637854065, + "loss": 0.8116, + "step": 4362 + }, + { + "epoch": 0.7783426991347784, + "grad_norm": 0.48817193508148193, + "learning_rate": 0.00033527090628219494, + "loss": 0.7076, + "step": 4363 + }, + { + "epoch": 0.7785210953527785, + "grad_norm": 0.4992578625679016, + "learning_rate": 0.00033520504208184304, + "loss": 0.9631, + "step": 4364 + }, + { + "epoch": 0.7786994915707787, + "grad_norm": 0.4915018081665039, + "learning_rate": 0.00033513917118952385, + "loss": 0.8473, + "step": 4365 + }, + { + "epoch": 0.7788778877887789, + "grad_norm": 1.4712707996368408, + "learning_rate": 0.0003350732936104108, + "loss": 1.0, + "step": 4366 + }, + { + "epoch": 0.7790562840067791, + "grad_norm": 0.5193944573402405, + "learning_rate": 0.0003350074093496778, + "loss": 1.0957, + "step": 4367 + }, + { + "epoch": 0.7792346802247793, + "grad_norm": 0.4992116689682007, + "learning_rate": 0.0003349415184124995, + "loss": 0.9401, + "step": 4368 + }, + { + "epoch": 0.7794130764427795, + "grad_norm": 0.512789785861969, + "learning_rate": 0.00033487562080405085, + "loss": 0.915, + "step": 4369 + }, + { + "epoch": 0.7795914726607795, + "grad_norm": 1.0886067152023315, + "learning_rate": 0.00033480971652950753, + "loss": 0.9546, + "step": 4370 + }, + { + "epoch": 0.7797698688787797, + "grad_norm": 0.49638882279396057, + "learning_rate": 0.0003347438055940456, + "loss": 0.6824, + "step": 4371 + }, + { + "epoch": 0.77994826509678, + "grad_norm": 0.5366002917289734, + "learning_rate": 0.0003346778880028416, + "loss": 1.0622, + "step": 4372 + }, + { + "epoch": 0.7801266613147801, + "grad_norm": 0.45976123213768005, + "learning_rate": 0.00033461196376107275, + "loss": 0.7995, + "step": 4373 + }, + { + "epoch": 0.7803050575327803, + "grad_norm": 0.48608720302581787, + "learning_rate": 0.00033454603287391666, + "loss": 1.0287, + "step": 4374 + }, + { + "epoch": 0.7804834537507804, + "grad_norm": 3.942063570022583, + "learning_rate": 0.0003344800953465515, + "loss": 1.1003, + "step": 4375 + }, + { + "epoch": 0.7806618499687806, + "grad_norm": 0.4614802598953247, + "learning_rate": 0.000334414151184156, + "loss": 0.8365, + "step": 4376 + }, + { + "epoch": 0.7808402461867808, + "grad_norm": 0.4670039117336273, + "learning_rate": 0.00033434820039190944, + "loss": 0.9256, + "step": 4377 + }, + { + "epoch": 0.781018642404781, + "grad_norm": 0.5666428208351135, + "learning_rate": 0.0003342822429749915, + "loss": 0.8964, + "step": 4378 + }, + { + "epoch": 0.7811970386227812, + "grad_norm": 0.5543097257614136, + "learning_rate": 0.00033421627893858244, + "loss": 0.9927, + "step": 4379 + }, + { + "epoch": 0.7813754348407814, + "grad_norm": 0.5561456680297852, + "learning_rate": 0.0003341503082878631, + "loss": 0.9066, + "step": 4380 + }, + { + "epoch": 0.7815538310587815, + "grad_norm": 0.5354291796684265, + "learning_rate": 0.00033408433102801474, + "loss": 1.0043, + "step": 4381 + }, + { + "epoch": 0.7817322272767817, + "grad_norm": 0.45063936710357666, + "learning_rate": 0.00033401834716421925, + "loss": 0.6419, + "step": 4382 + }, + { + "epoch": 0.7819106234947819, + "grad_norm": 0.5044664144515991, + "learning_rate": 0.0003339523567016589, + "loss": 0.9381, + "step": 4383 + }, + { + "epoch": 0.7820890197127821, + "grad_norm": 0.4677816331386566, + "learning_rate": 0.0003338863596455166, + "loss": 0.879, + "step": 4384 + }, + { + "epoch": 0.7822674159307823, + "grad_norm": 0.4861041009426117, + "learning_rate": 0.00033382035600097563, + "loss": 0.8487, + "step": 4385 + }, + { + "epoch": 0.7824458121487824, + "grad_norm": 0.5526744723320007, + "learning_rate": 0.0003337543457732201, + "loss": 0.9411, + "step": 4386 + }, + { + "epoch": 0.7826242083667826, + "grad_norm": 4.370226860046387, + "learning_rate": 0.0003336883289674342, + "loss": 0.9109, + "step": 4387 + }, + { + "epoch": 0.7828026045847828, + "grad_norm": 0.4913937449455261, + "learning_rate": 0.00033362230558880296, + "loss": 0.8254, + "step": 4388 + }, + { + "epoch": 0.782981000802783, + "grad_norm": 0.5903784036636353, + "learning_rate": 0.00033355627564251184, + "loss": 1.2674, + "step": 4389 + }, + { + "epoch": 0.7831593970207832, + "grad_norm": 0.48795902729034424, + "learning_rate": 0.0003334902391337468, + "loss": 0.7577, + "step": 4390 + }, + { + "epoch": 0.7833377932387834, + "grad_norm": 0.4892320930957794, + "learning_rate": 0.00033342419606769433, + "loss": 0.9373, + "step": 4391 + }, + { + "epoch": 0.7835161894567835, + "grad_norm": 0.4674488604068756, + "learning_rate": 0.00033335814644954137, + "loss": 0.7285, + "step": 4392 + }, + { + "epoch": 0.7836945856747837, + "grad_norm": 0.4890602231025696, + "learning_rate": 0.00033329209028447543, + "loss": 0.9521, + "step": 4393 + }, + { + "epoch": 0.7838729818927839, + "grad_norm": 0.4913036823272705, + "learning_rate": 0.0003332260275776846, + "loss": 0.8138, + "step": 4394 + }, + { + "epoch": 0.7840513781107841, + "grad_norm": 0.4752870500087738, + "learning_rate": 0.0003331599583343574, + "loss": 1.065, + "step": 4395 + }, + { + "epoch": 0.7842297743287843, + "grad_norm": 0.542586624622345, + "learning_rate": 0.0003330938825596828, + "loss": 0.9805, + "step": 4396 + }, + { + "epoch": 0.7844081705467844, + "grad_norm": 0.4948160946369171, + "learning_rate": 0.0003330278002588505, + "loss": 0.7871, + "step": 4397 + }, + { + "epoch": 0.7845865667647846, + "grad_norm": 0.5883384943008423, + "learning_rate": 0.0003329617114370505, + "loss": 1.1591, + "step": 4398 + }, + { + "epoch": 0.7847649629827848, + "grad_norm": 0.484783798456192, + "learning_rate": 0.00033289561609947335, + "loss": 1.0079, + "step": 4399 + }, + { + "epoch": 0.784943359200785, + "grad_norm": 0.5016215443611145, + "learning_rate": 0.00033282951425131014, + "loss": 0.7401, + "step": 4400 + }, + { + "epoch": 0.7851217554187851, + "grad_norm": 0.45084336400032043, + "learning_rate": 0.00033276340589775255, + "loss": 0.7707, + "step": 4401 + }, + { + "epoch": 0.7853001516367853, + "grad_norm": 0.4217044413089752, + "learning_rate": 0.00033269729104399263, + "loss": 0.614, + "step": 4402 + }, + { + "epoch": 0.7854785478547854, + "grad_norm": 0.5516197085380554, + "learning_rate": 0.00033263116969522316, + "loss": 0.9417, + "step": 4403 + }, + { + "epoch": 0.7856569440727856, + "grad_norm": 0.4416463375091553, + "learning_rate": 0.00033256504185663713, + "loss": 0.7047, + "step": 4404 + }, + { + "epoch": 0.7858353402907858, + "grad_norm": 2.543560743331909, + "learning_rate": 0.00033249890753342826, + "loss": 0.981, + "step": 4405 + }, + { + "epoch": 0.786013736508786, + "grad_norm": 0.44152694940567017, + "learning_rate": 0.0003324327667307907, + "loss": 0.8247, + "step": 4406 + }, + { + "epoch": 0.7861921327267862, + "grad_norm": 0.7885439395904541, + "learning_rate": 0.00033236661945391905, + "loss": 0.9313, + "step": 4407 + }, + { + "epoch": 0.7863705289447863, + "grad_norm": 0.6830782294273376, + "learning_rate": 0.00033230046570800867, + "loss": 0.713, + "step": 4408 + }, + { + "epoch": 0.7865489251627865, + "grad_norm": 0.5507636666297913, + "learning_rate": 0.000332234305498255, + "loss": 1.0907, + "step": 4409 + }, + { + "epoch": 0.7867273213807867, + "grad_norm": 0.4962003231048584, + "learning_rate": 0.00033216813882985444, + "loss": 0.8278, + "step": 4410 + }, + { + "epoch": 0.7869057175987869, + "grad_norm": 0.5796182155609131, + "learning_rate": 0.00033210196570800365, + "loss": 0.881, + "step": 4411 + }, + { + "epoch": 0.7870841138167871, + "grad_norm": 0.48952221870422363, + "learning_rate": 0.00033203578613789973, + "loss": 0.9298, + "step": 4412 + }, + { + "epoch": 0.7872625100347873, + "grad_norm": 0.49012526869773865, + "learning_rate": 0.00033196960012474053, + "loss": 0.8109, + "step": 4413 + }, + { + "epoch": 0.7874409062527874, + "grad_norm": 0.49443262815475464, + "learning_rate": 0.0003319034076737242, + "loss": 0.9402, + "step": 4414 + }, + { + "epoch": 0.7876193024707876, + "grad_norm": 0.4793679714202881, + "learning_rate": 0.00033183720879004935, + "loss": 0.7636, + "step": 4415 + }, + { + "epoch": 0.7877976986887878, + "grad_norm": 0.49106279015541077, + "learning_rate": 0.0003317710034789154, + "loss": 0.8781, + "step": 4416 + }, + { + "epoch": 0.787976094906788, + "grad_norm": 0.46945035457611084, + "learning_rate": 0.00033170479174552217, + "loss": 0.7358, + "step": 4417 + }, + { + "epoch": 0.7881544911247882, + "grad_norm": 0.48162609338760376, + "learning_rate": 0.00033163857359506965, + "loss": 0.8671, + "step": 4418 + }, + { + "epoch": 0.7883328873427883, + "grad_norm": 0.4425477385520935, + "learning_rate": 0.00033157234903275867, + "loss": 0.8101, + "step": 4419 + }, + { + "epoch": 0.7885112835607885, + "grad_norm": 0.499623566865921, + "learning_rate": 0.00033150611806379054, + "loss": 0.8189, + "step": 4420 + }, + { + "epoch": 0.7886896797787887, + "grad_norm": 0.44681599736213684, + "learning_rate": 0.000331439880693367, + "loss": 0.7808, + "step": 4421 + }, + { + "epoch": 0.7888680759967889, + "grad_norm": 0.4531422555446625, + "learning_rate": 0.00033137363692669014, + "loss": 0.7667, + "step": 4422 + }, + { + "epoch": 0.7890464722147891, + "grad_norm": 0.43248048424720764, + "learning_rate": 0.00033130738676896297, + "loss": 0.7906, + "step": 4423 + }, + { + "epoch": 0.7892248684327893, + "grad_norm": 0.5049116611480713, + "learning_rate": 0.00033124113022538865, + "loss": 0.9246, + "step": 4424 + }, + { + "epoch": 0.7894032646507894, + "grad_norm": 0.48414015769958496, + "learning_rate": 0.00033117486730117093, + "loss": 0.8807, + "step": 4425 + }, + { + "epoch": 0.7895816608687896, + "grad_norm": 0.5417115688323975, + "learning_rate": 0.000331108598001514, + "loss": 0.9549, + "step": 4426 + }, + { + "epoch": 0.7897600570867898, + "grad_norm": 0.4763809144496918, + "learning_rate": 0.0003310423223316227, + "loss": 0.8302, + "step": 4427 + }, + { + "epoch": 0.78993845330479, + "grad_norm": 0.4397316873073578, + "learning_rate": 0.00033097604029670225, + "loss": 0.8886, + "step": 4428 + }, + { + "epoch": 0.7901168495227902, + "grad_norm": 0.451040118932724, + "learning_rate": 0.0003309097519019585, + "loss": 0.6568, + "step": 4429 + }, + { + "epoch": 0.7902952457407902, + "grad_norm": 0.4390012323856354, + "learning_rate": 0.0003308434571525976, + "loss": 0.7361, + "step": 4430 + }, + { + "epoch": 0.7904736419587904, + "grad_norm": 0.48856833577156067, + "learning_rate": 0.0003307771560538264, + "loss": 0.7811, + "step": 4431 + }, + { + "epoch": 0.7906520381767906, + "grad_norm": 0.49298450350761414, + "learning_rate": 0.00033071084861085217, + "loss": 0.8156, + "step": 4432 + }, + { + "epoch": 0.7908304343947908, + "grad_norm": 0.424003541469574, + "learning_rate": 0.00033064453482888257, + "loss": 0.6857, + "step": 4433 + }, + { + "epoch": 0.791008830612791, + "grad_norm": 0.7829723358154297, + "learning_rate": 0.0003305782147131259, + "loss": 0.9136, + "step": 4434 + }, + { + "epoch": 0.7911872268307912, + "grad_norm": 0.549010157585144, + "learning_rate": 0.0003305118882687909, + "loss": 0.772, + "step": 4435 + }, + { + "epoch": 0.7913656230487913, + "grad_norm": 0.4902380704879761, + "learning_rate": 0.00033044555550108693, + "loss": 0.7763, + "step": 4436 + }, + { + "epoch": 0.7915440192667915, + "grad_norm": 0.4383619725704193, + "learning_rate": 0.0003303792164152236, + "loss": 0.9407, + "step": 4437 + }, + { + "epoch": 0.7917224154847917, + "grad_norm": 0.486372709274292, + "learning_rate": 0.00033031287101641116, + "loss": 0.997, + "step": 4438 + }, + { + "epoch": 0.7919008117027919, + "grad_norm": 0.5471330285072327, + "learning_rate": 0.00033024651930986044, + "loss": 0.8833, + "step": 4439 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.47940507531166077, + "learning_rate": 0.0003301801613007826, + "loss": 1.008, + "step": 4440 + }, + { + "epoch": 0.7922576041387922, + "grad_norm": 0.4837360382080078, + "learning_rate": 0.0003301137969943894, + "loss": 0.9464, + "step": 4441 + }, + { + "epoch": 0.7924360003567924, + "grad_norm": 0.48073476552963257, + "learning_rate": 0.0003300474263958931, + "loss": 0.8527, + "step": 4442 + }, + { + "epoch": 0.7926143965747926, + "grad_norm": 0.47916659712791443, + "learning_rate": 0.00032998104951050634, + "loss": 0.8098, + "step": 4443 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 0.5394938588142395, + "learning_rate": 0.00032991466634344234, + "loss": 0.9535, + "step": 4444 + }, + { + "epoch": 0.792971189010793, + "grad_norm": 0.44547122716903687, + "learning_rate": 0.00032984827689991493, + "loss": 0.7269, + "step": 4445 + }, + { + "epoch": 0.7931495852287932, + "grad_norm": 0.5474995374679565, + "learning_rate": 0.0003297818811851381, + "loss": 1.0227, + "step": 4446 + }, + { + "epoch": 0.7933279814467933, + "grad_norm": 0.5473585724830627, + "learning_rate": 0.0003297154792043268, + "loss": 1.043, + "step": 4447 + }, + { + "epoch": 0.7935063776647935, + "grad_norm": 0.4513511061668396, + "learning_rate": 0.0003296490709626959, + "loss": 0.7503, + "step": 4448 + }, + { + "epoch": 0.7936847738827937, + "grad_norm": 0.47088685631752014, + "learning_rate": 0.0003295826564654614, + "loss": 0.8366, + "step": 4449 + }, + { + "epoch": 0.7938631701007939, + "grad_norm": 0.4641473889350891, + "learning_rate": 0.00032951623571783915, + "loss": 0.8643, + "step": 4450 + }, + { + "epoch": 0.7940415663187941, + "grad_norm": 0.43831127882003784, + "learning_rate": 0.00032944980872504613, + "loss": 0.7894, + "step": 4451 + }, + { + "epoch": 0.7942199625367942, + "grad_norm": 0.4277407228946686, + "learning_rate": 0.00032938337549229924, + "loss": 0.6743, + "step": 4452 + }, + { + "epoch": 0.7943983587547944, + "grad_norm": 0.454748272895813, + "learning_rate": 0.00032931693602481616, + "loss": 0.829, + "step": 4453 + }, + { + "epoch": 0.7945767549727946, + "grad_norm": 0.5081732273101807, + "learning_rate": 0.0003292504903278151, + "loss": 0.9585, + "step": 4454 + }, + { + "epoch": 0.7947551511907948, + "grad_norm": 0.4319037199020386, + "learning_rate": 0.0003291840384065146, + "loss": 0.7298, + "step": 4455 + }, + { + "epoch": 0.794933547408795, + "grad_norm": 0.5093657374382019, + "learning_rate": 0.00032911758026613384, + "loss": 1.0046, + "step": 4456 + }, + { + "epoch": 0.7951119436267952, + "grad_norm": 0.4518407881259918, + "learning_rate": 0.0003290511159118924, + "loss": 0.7589, + "step": 4457 + }, + { + "epoch": 0.7952903398447952, + "grad_norm": 0.4718686044216156, + "learning_rate": 0.0003289846453490103, + "loss": 0.7775, + "step": 4458 + }, + { + "epoch": 0.7954687360627954, + "grad_norm": 0.520844578742981, + "learning_rate": 0.00032891816858270816, + "loss": 1.2267, + "step": 4459 + }, + { + "epoch": 0.7956471322807956, + "grad_norm": 0.4900132119655609, + "learning_rate": 0.000328851685618207, + "loss": 0.941, + "step": 4460 + }, + { + "epoch": 0.7958255284987958, + "grad_norm": 0.4234750270843506, + "learning_rate": 0.00032878519646072833, + "loss": 0.8223, + "step": 4461 + }, + { + "epoch": 0.796003924716796, + "grad_norm": 0.42578887939453125, + "learning_rate": 0.0003287187011154943, + "loss": 0.6952, + "step": 4462 + }, + { + "epoch": 0.7961823209347961, + "grad_norm": 0.4930354356765747, + "learning_rate": 0.00032865219958772734, + "loss": 0.8822, + "step": 4463 + }, + { + "epoch": 0.7963607171527963, + "grad_norm": 0.5005689263343811, + "learning_rate": 0.0003285856918826505, + "loss": 0.8978, + "step": 4464 + }, + { + "epoch": 0.7965391133707965, + "grad_norm": 0.46632346510887146, + "learning_rate": 0.00032851917800548724, + "loss": 0.9696, + "step": 4465 + }, + { + "epoch": 0.7967175095887967, + "grad_norm": 0.4460965692996979, + "learning_rate": 0.0003284526579614615, + "loss": 0.8667, + "step": 4466 + }, + { + "epoch": 0.7968959058067969, + "grad_norm": 0.45909878611564636, + "learning_rate": 0.0003283861317557978, + "loss": 0.9574, + "step": 4467 + }, + { + "epoch": 0.7970743020247971, + "grad_norm": 0.48101097345352173, + "learning_rate": 0.0003283195993937209, + "loss": 0.8039, + "step": 4468 + }, + { + "epoch": 0.7972526982427972, + "grad_norm": 0.5066887140274048, + "learning_rate": 0.0003282530608804565, + "loss": 0.8576, + "step": 4469 + }, + { + "epoch": 0.7974310944607974, + "grad_norm": 0.44910162687301636, + "learning_rate": 0.0003281865162212304, + "loss": 0.846, + "step": 4470 + }, + { + "epoch": 0.7976094906787976, + "grad_norm": 0.46548551321029663, + "learning_rate": 0.000328119965421269, + "loss": 0.7717, + "step": 4471 + }, + { + "epoch": 0.7977878868967978, + "grad_norm": 0.603074312210083, + "learning_rate": 0.00032805340848579903, + "loss": 0.9443, + "step": 4472 + }, + { + "epoch": 0.797966283114798, + "grad_norm": 0.4617455005645752, + "learning_rate": 0.00032798684542004793, + "loss": 0.6775, + "step": 4473 + }, + { + "epoch": 0.7981446793327981, + "grad_norm": 0.5626710653305054, + "learning_rate": 0.00032792027622924357, + "loss": 1.0429, + "step": 4474 + }, + { + "epoch": 0.7983230755507983, + "grad_norm": 0.500339925289154, + "learning_rate": 0.00032785370091861435, + "loss": 1.0465, + "step": 4475 + }, + { + "epoch": 0.7985014717687985, + "grad_norm": 0.44158151745796204, + "learning_rate": 0.0003277871194933888, + "loss": 0.6398, + "step": 4476 + }, + { + "epoch": 0.7986798679867987, + "grad_norm": 0.47629523277282715, + "learning_rate": 0.0003277205319587965, + "loss": 0.8574, + "step": 4477 + }, + { + "epoch": 0.7988582642047989, + "grad_norm": 0.6063137650489807, + "learning_rate": 0.00032765393832006695, + "loss": 0.928, + "step": 4478 + }, + { + "epoch": 0.7990366604227991, + "grad_norm": 0.41350358724594116, + "learning_rate": 0.00032758733858243054, + "loss": 0.7943, + "step": 4479 + }, + { + "epoch": 0.7992150566407992, + "grad_norm": 0.4761079251766205, + "learning_rate": 0.000327520732751118, + "loss": 0.9293, + "step": 4480 + }, + { + "epoch": 0.7993934528587994, + "grad_norm": 0.46137312054634094, + "learning_rate": 0.0003274541208313604, + "loss": 0.6324, + "step": 4481 + }, + { + "epoch": 0.7995718490767996, + "grad_norm": 0.46572351455688477, + "learning_rate": 0.00032738750282838955, + "loss": 0.796, + "step": 4482 + }, + { + "epoch": 0.7997502452947998, + "grad_norm": 1.1689634323120117, + "learning_rate": 0.0003273208787474375, + "loss": 0.8631, + "step": 4483 + }, + { + "epoch": 0.7999286415128, + "grad_norm": 0.5234804749488831, + "learning_rate": 0.00032725424859373687, + "loss": 1.022, + "step": 4484 + }, + { + "epoch": 0.8001070377308, + "grad_norm": 0.46076735854148865, + "learning_rate": 0.0003271876123725208, + "loss": 0.8363, + "step": 4485 + }, + { + "epoch": 0.8002854339488003, + "grad_norm": 0.5089244246482849, + "learning_rate": 0.0003271209700890229, + "loss": 1.0501, + "step": 4486 + }, + { + "epoch": 0.8004638301668004, + "grad_norm": 0.49238088726997375, + "learning_rate": 0.0003270543217484772, + "loss": 0.7696, + "step": 4487 + }, + { + "epoch": 0.8006422263848006, + "grad_norm": 0.5077065229415894, + "learning_rate": 0.0003269876673561183, + "loss": 0.7879, + "step": 4488 + }, + { + "epoch": 0.8008206226028008, + "grad_norm": 0.4703711271286011, + "learning_rate": 0.000326921006917181, + "loss": 0.7882, + "step": 4489 + }, + { + "epoch": 0.800999018820801, + "grad_norm": 0.4334394633769989, + "learning_rate": 0.000326854340436901, + "loss": 0.6552, + "step": 4490 + }, + { + "epoch": 0.8011774150388011, + "grad_norm": 0.45196327567100525, + "learning_rate": 0.0003267876679205142, + "loss": 0.674, + "step": 4491 + }, + { + "epoch": 0.8013558112568013, + "grad_norm": 0.629697859287262, + "learning_rate": 0.0003267209893732569, + "loss": 1.0846, + "step": 4492 + }, + { + "epoch": 0.8015342074748015, + "grad_norm": 0.4871639907360077, + "learning_rate": 0.00032665430480036616, + "loss": 0.8173, + "step": 4493 + }, + { + "epoch": 0.8017126036928017, + "grad_norm": 2.4100522994995117, + "learning_rate": 0.0003265876142070794, + "loss": 0.8096, + "step": 4494 + }, + { + "epoch": 0.8018909999108019, + "grad_norm": 0.567620038986206, + "learning_rate": 0.00032652091759863424, + "loss": 1.2446, + "step": 4495 + }, + { + "epoch": 0.802069396128802, + "grad_norm": 0.440104603767395, + "learning_rate": 0.0003264542149802692, + "loss": 0.7097, + "step": 4496 + }, + { + "epoch": 0.8022477923468022, + "grad_norm": 0.49658969044685364, + "learning_rate": 0.0003263875063572231, + "loss": 1.0596, + "step": 4497 + }, + { + "epoch": 0.8024261885648024, + "grad_norm": 0.5105962157249451, + "learning_rate": 0.000326320791734735, + "loss": 0.9645, + "step": 4498 + }, + { + "epoch": 0.8026045847828026, + "grad_norm": 0.5017589926719666, + "learning_rate": 0.00032625407111804477, + "loss": 0.8783, + "step": 4499 + }, + { + "epoch": 0.8027829810008028, + "grad_norm": 0.4669455885887146, + "learning_rate": 0.0003261873445123926, + "loss": 0.7848, + "step": 4500 + }, + { + "epoch": 0.802961377218803, + "grad_norm": 0.5145031809806824, + "learning_rate": 0.0003261206119230192, + "loss": 0.9043, + "step": 4501 + }, + { + "epoch": 0.8031397734368031, + "grad_norm": 0.8292232155799866, + "learning_rate": 0.0003260538733551658, + "loss": 0.8444, + "step": 4502 + }, + { + "epoch": 0.8033181696548033, + "grad_norm": 0.4397391974925995, + "learning_rate": 0.0003259871288140738, + "loss": 0.8279, + "step": 4503 + }, + { + "epoch": 0.8034965658728035, + "grad_norm": 0.48555225133895874, + "learning_rate": 0.0003259203783049854, + "loss": 1.0585, + "step": 4504 + }, + { + "epoch": 0.8036749620908037, + "grad_norm": 0.4733032286167145, + "learning_rate": 0.00032585362183314327, + "loss": 0.899, + "step": 4505 + }, + { + "epoch": 0.8038533583088039, + "grad_norm": 0.6270719766616821, + "learning_rate": 0.00032578685940379027, + "loss": 0.8007, + "step": 4506 + }, + { + "epoch": 0.804031754526804, + "grad_norm": 1.0983858108520508, + "learning_rate": 0.00032572009102216983, + "loss": 0.8323, + "step": 4507 + }, + { + "epoch": 0.8042101507448042, + "grad_norm": 0.5489131212234497, + "learning_rate": 0.00032565331669352613, + "loss": 0.78, + "step": 4508 + }, + { + "epoch": 0.8043885469628044, + "grad_norm": 0.4885414242744446, + "learning_rate": 0.00032558653642310347, + "loss": 0.9142, + "step": 4509 + }, + { + "epoch": 0.8045669431808046, + "grad_norm": 0.5155977010726929, + "learning_rate": 0.0003255197502161468, + "loss": 0.9124, + "step": 4510 + }, + { + "epoch": 0.8047453393988048, + "grad_norm": 0.44706472754478455, + "learning_rate": 0.0003254529580779014, + "loss": 0.7907, + "step": 4511 + }, + { + "epoch": 0.804923735616805, + "grad_norm": 0.6217315793037415, + "learning_rate": 0.0003253861600136132, + "loss": 0.7259, + "step": 4512 + }, + { + "epoch": 0.8051021318348051, + "grad_norm": 0.5625154972076416, + "learning_rate": 0.0003253193560285284, + "loss": 0.9467, + "step": 4513 + }, + { + "epoch": 0.8052805280528053, + "grad_norm": 0.46225252747535706, + "learning_rate": 0.00032525254612789377, + "loss": 0.8054, + "step": 4514 + }, + { + "epoch": 0.8054589242708055, + "grad_norm": 0.4399617612361908, + "learning_rate": 0.0003251857303169565, + "loss": 0.7312, + "step": 4515 + }, + { + "epoch": 0.8056373204888057, + "grad_norm": 0.4609040319919586, + "learning_rate": 0.00032511890860096443, + "loss": 0.8695, + "step": 4516 + }, + { + "epoch": 0.8058157167068059, + "grad_norm": 0.5766742825508118, + "learning_rate": 0.00032505208098516567, + "loss": 0.9441, + "step": 4517 + }, + { + "epoch": 0.8059941129248059, + "grad_norm": 0.47482702136039734, + "learning_rate": 0.0003249852474748086, + "loss": 0.7759, + "step": 4518 + }, + { + "epoch": 0.8061725091428061, + "grad_norm": 0.4535680115222931, + "learning_rate": 0.0003249184080751426, + "loss": 0.9628, + "step": 4519 + }, + { + "epoch": 0.8063509053608063, + "grad_norm": 0.44655969738960266, + "learning_rate": 0.00032485156279141695, + "loss": 0.8394, + "step": 4520 + }, + { + "epoch": 0.8065293015788065, + "grad_norm": 0.5331109762191772, + "learning_rate": 0.00032478471162888185, + "loss": 1.0328, + "step": 4521 + }, + { + "epoch": 0.8067076977968067, + "grad_norm": 0.5180974006652832, + "learning_rate": 0.00032471785459278757, + "loss": 0.914, + "step": 4522 + }, + { + "epoch": 0.8068860940148069, + "grad_norm": 0.4896256923675537, + "learning_rate": 0.0003246509916883853, + "loss": 0.6507, + "step": 4523 + }, + { + "epoch": 0.807064490232807, + "grad_norm": 0.5117029547691345, + "learning_rate": 0.0003245841229209262, + "loss": 1.0229, + "step": 4524 + }, + { + "epoch": 0.8072428864508072, + "grad_norm": 0.4755803346633911, + "learning_rate": 0.00032451724829566216, + "loss": 0.8733, + "step": 4525 + }, + { + "epoch": 0.8074212826688074, + "grad_norm": 0.642292320728302, + "learning_rate": 0.0003244503678178455, + "loss": 1.0649, + "step": 4526 + }, + { + "epoch": 0.8075996788868076, + "grad_norm": 0.5174018740653992, + "learning_rate": 0.000324383481492729, + "loss": 1.0805, + "step": 4527 + }, + { + "epoch": 0.8077780751048078, + "grad_norm": 0.439130961894989, + "learning_rate": 0.0003243165893255659, + "loss": 0.7667, + "step": 4528 + }, + { + "epoch": 0.807956471322808, + "grad_norm": 0.5517845153808594, + "learning_rate": 0.00032424969132160985, + "loss": 1.0015, + "step": 4529 + }, + { + "epoch": 0.8081348675408081, + "grad_norm": 0.5080962777137756, + "learning_rate": 0.00032418278748611495, + "loss": 0.8835, + "step": 4530 + }, + { + "epoch": 0.8083132637588083, + "grad_norm": 0.5714918971061707, + "learning_rate": 0.00032411587782433594, + "loss": 1.2275, + "step": 4531 + }, + { + "epoch": 0.8084916599768085, + "grad_norm": 0.5037386417388916, + "learning_rate": 0.0003240489623415277, + "loss": 0.8592, + "step": 4532 + }, + { + "epoch": 0.8086700561948087, + "grad_norm": 0.5023531913757324, + "learning_rate": 0.00032398204104294585, + "loss": 0.9501, + "step": 4533 + }, + { + "epoch": 0.8088484524128089, + "grad_norm": 0.4968222677707672, + "learning_rate": 0.00032391511393384633, + "loss": 0.8091, + "step": 4534 + }, + { + "epoch": 0.809026848630809, + "grad_norm": 0.5576387643814087, + "learning_rate": 0.00032384818101948554, + "loss": 0.8792, + "step": 4535 + }, + { + "epoch": 0.8092052448488092, + "grad_norm": 0.4761325716972351, + "learning_rate": 0.0003237812423051204, + "loss": 0.8114, + "step": 4536 + }, + { + "epoch": 0.8093836410668094, + "grad_norm": 0.393627792596817, + "learning_rate": 0.00032371429779600824, + "loss": 0.5444, + "step": 4537 + }, + { + "epoch": 0.8095620372848096, + "grad_norm": 0.5613236427307129, + "learning_rate": 0.00032364734749740687, + "loss": 0.855, + "step": 4538 + }, + { + "epoch": 0.8097404335028098, + "grad_norm": 0.46514925360679626, + "learning_rate": 0.00032358039141457454, + "loss": 0.7157, + "step": 4539 + }, + { + "epoch": 0.80991882972081, + "grad_norm": 0.4529775083065033, + "learning_rate": 0.0003235134295527699, + "loss": 0.8365, + "step": 4540 + }, + { + "epoch": 0.8100972259388101, + "grad_norm": 0.47237035632133484, + "learning_rate": 0.0003234464619172522, + "loss": 0.852, + "step": 4541 + }, + { + "epoch": 0.8102756221568103, + "grad_norm": 0.4663850963115692, + "learning_rate": 0.00032337948851328093, + "loss": 0.7608, + "step": 4542 + }, + { + "epoch": 0.8104540183748105, + "grad_norm": 0.5153512954711914, + "learning_rate": 0.0003233125093461162, + "loss": 0.8918, + "step": 4543 + }, + { + "epoch": 0.8106324145928107, + "grad_norm": 0.40413349866867065, + "learning_rate": 0.0003232455244210186, + "loss": 0.6576, + "step": 4544 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.4308657944202423, + "learning_rate": 0.0003231785337432489, + "loss": 0.9653, + "step": 4545 + }, + { + "epoch": 0.8109892070288109, + "grad_norm": 0.4188125729560852, + "learning_rate": 0.00032311153731806873, + "loss": 0.7579, + "step": 4546 + }, + { + "epoch": 0.8111676032468111, + "grad_norm": 0.5259095430374146, + "learning_rate": 0.00032304453515073994, + "loss": 1.1485, + "step": 4547 + }, + { + "epoch": 0.8113459994648113, + "grad_norm": 0.4392109215259552, + "learning_rate": 0.0003229775272465247, + "loss": 0.8834, + "step": 4548 + }, + { + "epoch": 0.8115243956828115, + "grad_norm": 0.4974541962146759, + "learning_rate": 0.0003229105136106859, + "loss": 0.865, + "step": 4549 + }, + { + "epoch": 0.8117027919008117, + "grad_norm": 1.2180891036987305, + "learning_rate": 0.0003228434942484869, + "loss": 0.9709, + "step": 4550 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.4866902828216553, + "learning_rate": 0.000322776469165191, + "loss": 0.9429, + "step": 4551 + }, + { + "epoch": 0.812059584336812, + "grad_norm": 0.5774986147880554, + "learning_rate": 0.0003227094383660626, + "loss": 0.8982, + "step": 4552 + }, + { + "epoch": 0.8122379805548122, + "grad_norm": 0.4660821259021759, + "learning_rate": 0.0003226424018563661, + "loss": 0.8265, + "step": 4553 + }, + { + "epoch": 0.8124163767728124, + "grad_norm": 0.4910152554512024, + "learning_rate": 0.00032257535964136673, + "loss": 0.868, + "step": 4554 + }, + { + "epoch": 0.8125947729908126, + "grad_norm": 0.4666675627231598, + "learning_rate": 0.0003225083117263298, + "loss": 0.849, + "step": 4555 + }, + { + "epoch": 0.8127731692088128, + "grad_norm": 0.5173287391662598, + "learning_rate": 0.00032244125811652135, + "loss": 0.9152, + "step": 4556 + }, + { + "epoch": 0.8129515654268129, + "grad_norm": 0.503570020198822, + "learning_rate": 0.00032237419881720765, + "loss": 1.0545, + "step": 4557 + }, + { + "epoch": 0.8131299616448131, + "grad_norm": 0.4771646559238434, + "learning_rate": 0.00032230713383365545, + "loss": 0.8746, + "step": 4558 + }, + { + "epoch": 0.8133083578628133, + "grad_norm": 0.47005799412727356, + "learning_rate": 0.0003222400631711321, + "loss": 0.8811, + "step": 4559 + }, + { + "epoch": 0.8134867540808135, + "grad_norm": 0.48531290888786316, + "learning_rate": 0.00032217298683490526, + "loss": 0.7489, + "step": 4560 + }, + { + "epoch": 0.8136651502988137, + "grad_norm": 0.4748542010784149, + "learning_rate": 0.0003221059048302431, + "loss": 1.0024, + "step": 4561 + }, + { + "epoch": 0.8138435465168139, + "grad_norm": 0.46616560220718384, + "learning_rate": 0.00032203881716241426, + "loss": 0.8083, + "step": 4562 + }, + { + "epoch": 0.814021942734814, + "grad_norm": 1.1073871850967407, + "learning_rate": 0.00032197172383668763, + "loss": 0.6794, + "step": 4563 + }, + { + "epoch": 0.8142003389528142, + "grad_norm": 0.5361044406890869, + "learning_rate": 0.0003219046248583329, + "loss": 1.0918, + "step": 4564 + }, + { + "epoch": 0.8143787351708144, + "grad_norm": 0.44288361072540283, + "learning_rate": 0.00032183752023261973, + "loss": 0.8201, + "step": 4565 + }, + { + "epoch": 0.8145571313888146, + "grad_norm": 0.4622916579246521, + "learning_rate": 0.00032177040996481874, + "loss": 0.9029, + "step": 4566 + }, + { + "epoch": 0.8147355276068148, + "grad_norm": 0.48599693179130554, + "learning_rate": 0.0003217032940602006, + "loss": 1.0285, + "step": 4567 + }, + { + "epoch": 0.8149139238248149, + "grad_norm": 0.46129268407821655, + "learning_rate": 0.00032163617252403654, + "loss": 0.8161, + "step": 4568 + }, + { + "epoch": 0.8150923200428151, + "grad_norm": 0.5313239097595215, + "learning_rate": 0.0003215690453615985, + "loss": 1.0821, + "step": 4569 + }, + { + "epoch": 0.8152707162608153, + "grad_norm": 0.42274999618530273, + "learning_rate": 0.0003215019125781583, + "loss": 0.7682, + "step": 4570 + }, + { + "epoch": 0.8154491124788155, + "grad_norm": 0.46743467450141907, + "learning_rate": 0.00032143477417898866, + "loss": 0.8906, + "step": 4571 + }, + { + "epoch": 0.8156275086968157, + "grad_norm": 0.4919176697731018, + "learning_rate": 0.0003213676301693626, + "loss": 0.8782, + "step": 4572 + }, + { + "epoch": 0.8158059049148159, + "grad_norm": 0.479407399892807, + "learning_rate": 0.00032130048055455356, + "loss": 0.6938, + "step": 4573 + }, + { + "epoch": 0.815984301132816, + "grad_norm": 0.4484141767024994, + "learning_rate": 0.0003212333253398355, + "loss": 0.6904, + "step": 4574 + }, + { + "epoch": 0.8161626973508161, + "grad_norm": 0.5200289487838745, + "learning_rate": 0.0003211661645304827, + "loss": 0.9434, + "step": 4575 + }, + { + "epoch": 0.8163410935688163, + "grad_norm": 0.5177778601646423, + "learning_rate": 0.00032109899813177, + "loss": 0.8227, + "step": 4576 + }, + { + "epoch": 0.8165194897868165, + "grad_norm": 0.5002527236938477, + "learning_rate": 0.0003210318261489725, + "loss": 0.9783, + "step": 4577 + }, + { + "epoch": 0.8166978860048167, + "grad_norm": 0.47500887513160706, + "learning_rate": 0.0003209646485873661, + "loss": 0.7346, + "step": 4578 + }, + { + "epoch": 0.8168762822228168, + "grad_norm": 0.49525851011276245, + "learning_rate": 0.00032089746545222657, + "loss": 0.8249, + "step": 4579 + }, + { + "epoch": 0.817054678440817, + "grad_norm": 0.5161625742912292, + "learning_rate": 0.0003208302767488307, + "loss": 0.9479, + "step": 4580 + }, + { + "epoch": 0.8172330746588172, + "grad_norm": 0.47351545095443726, + "learning_rate": 0.00032076308248245533, + "loss": 0.8557, + "step": 4581 + }, + { + "epoch": 0.8174114708768174, + "grad_norm": 0.37471267580986023, + "learning_rate": 0.00032069588265837794, + "loss": 0.4742, + "step": 4582 + }, + { + "epoch": 0.8175898670948176, + "grad_norm": 0.5405718088150024, + "learning_rate": 0.0003206286772818764, + "loss": 1.0018, + "step": 4583 + }, + { + "epoch": 0.8177682633128178, + "grad_norm": 0.4767707884311676, + "learning_rate": 0.00032056146635822886, + "loss": 0.8955, + "step": 4584 + }, + { + "epoch": 0.8179466595308179, + "grad_norm": 0.48326289653778076, + "learning_rate": 0.00032049424989271416, + "loss": 0.8068, + "step": 4585 + }, + { + "epoch": 0.8181250557488181, + "grad_norm": 0.44311580061912537, + "learning_rate": 0.0003204270278906114, + "loss": 0.7806, + "step": 4586 + }, + { + "epoch": 0.8183034519668183, + "grad_norm": 0.46994298696517944, + "learning_rate": 0.00032035980035720015, + "loss": 0.9937, + "step": 4587 + }, + { + "epoch": 0.8184818481848185, + "grad_norm": 1.0689364671707153, + "learning_rate": 0.0003202925672977605, + "loss": 0.9123, + "step": 4588 + }, + { + "epoch": 0.8186602444028187, + "grad_norm": 0.5591593980789185, + "learning_rate": 0.0003202253287175728, + "loss": 1.0514, + "step": 4589 + }, + { + "epoch": 0.8188386406208188, + "grad_norm": 5.0126953125, + "learning_rate": 0.00032015808462191816, + "loss": 0.8152, + "step": 4590 + }, + { + "epoch": 0.819017036838819, + "grad_norm": 1.514142394065857, + "learning_rate": 0.00032009083501607753, + "loss": 0.7788, + "step": 4591 + }, + { + "epoch": 0.8191954330568192, + "grad_norm": 0.5694434642791748, + "learning_rate": 0.00032002357990533296, + "loss": 0.9116, + "step": 4592 + }, + { + "epoch": 0.8193738292748194, + "grad_norm": 1.0806207656860352, + "learning_rate": 0.0003199563192949666, + "loss": 1.0898, + "step": 4593 + }, + { + "epoch": 0.8195522254928196, + "grad_norm": 0.490933895111084, + "learning_rate": 0.000319889053190261, + "loss": 0.7681, + "step": 4594 + }, + { + "epoch": 0.8197306217108198, + "grad_norm": 0.5183424949645996, + "learning_rate": 0.00031982178159649925, + "loss": 0.7874, + "step": 4595 + }, + { + "epoch": 0.8199090179288199, + "grad_norm": 0.5265378355979919, + "learning_rate": 0.0003197545045189648, + "loss": 0.8632, + "step": 4596 + }, + { + "epoch": 0.8200874141468201, + "grad_norm": 0.4929676353931427, + "learning_rate": 0.0003196872219629417, + "loss": 0.833, + "step": 4597 + }, + { + "epoch": 0.8202658103648203, + "grad_norm": 0.48382702469825745, + "learning_rate": 0.00031961993393371405, + "loss": 0.6996, + "step": 4598 + }, + { + "epoch": 0.8204442065828205, + "grad_norm": 0.7561694383621216, + "learning_rate": 0.00031955264043656675, + "loss": 0.8474, + "step": 4599 + }, + { + "epoch": 0.8206226028008207, + "grad_norm": 0.6017073392868042, + "learning_rate": 0.000319485341476785, + "loss": 0.9392, + "step": 4600 + }, + { + "epoch": 0.8208009990188208, + "grad_norm": 0.5361237525939941, + "learning_rate": 0.00031941803705965447, + "loss": 0.8068, + "step": 4601 + }, + { + "epoch": 0.820979395236821, + "grad_norm": 0.5034250617027283, + "learning_rate": 0.00031935072719046115, + "loss": 1.0488, + "step": 4602 + }, + { + "epoch": 0.8211577914548212, + "grad_norm": 0.48589175939559937, + "learning_rate": 0.0003192834118744916, + "loss": 1.0141, + "step": 4603 + }, + { + "epoch": 0.8213361876728213, + "grad_norm": 0.43965524435043335, + "learning_rate": 0.0003192160911170327, + "loss": 0.6931, + "step": 4604 + }, + { + "epoch": 0.8215145838908215, + "grad_norm": 0.6355204582214355, + "learning_rate": 0.00031914876492337177, + "loss": 0.8362, + "step": 4605 + }, + { + "epoch": 0.8216929801088217, + "grad_norm": 0.502838671207428, + "learning_rate": 0.0003190814332987965, + "loss": 0.9824, + "step": 4606 + }, + { + "epoch": 0.8218713763268218, + "grad_norm": 0.48031777143478394, + "learning_rate": 0.00031901409624859536, + "loss": 0.8348, + "step": 4607 + }, + { + "epoch": 0.822049772544822, + "grad_norm": 0.47550857067108154, + "learning_rate": 0.00031894675377805665, + "loss": 0.8757, + "step": 4608 + }, + { + "epoch": 0.8222281687628222, + "grad_norm": 1.1182494163513184, + "learning_rate": 0.0003188794058924697, + "loss": 0.9129, + "step": 4609 + }, + { + "epoch": 0.8224065649808224, + "grad_norm": 0.5143523812294006, + "learning_rate": 0.00031881205259712384, + "loss": 0.9542, + "step": 4610 + }, + { + "epoch": 0.8225849611988226, + "grad_norm": 0.5969739556312561, + "learning_rate": 0.00031874469389730884, + "loss": 0.9868, + "step": 4611 + }, + { + "epoch": 0.8227633574168227, + "grad_norm": 0.5791580677032471, + "learning_rate": 0.0003186773297983153, + "loss": 0.8447, + "step": 4612 + }, + { + "epoch": 0.8229417536348229, + "grad_norm": 0.4807104766368866, + "learning_rate": 0.00031860996030543383, + "loss": 0.9425, + "step": 4613 + }, + { + "epoch": 0.8231201498528231, + "grad_norm": 0.48759788274765015, + "learning_rate": 0.00031854258542395546, + "loss": 0.7504, + "step": 4614 + }, + { + "epoch": 0.8232985460708233, + "grad_norm": 0.5369093418121338, + "learning_rate": 0.00031847520515917207, + "loss": 0.8581, + "step": 4615 + }, + { + "epoch": 0.8234769422888235, + "grad_norm": 0.5256065130233765, + "learning_rate": 0.00031840781951637554, + "loss": 0.8261, + "step": 4616 + }, + { + "epoch": 0.8236553385068237, + "grad_norm": 0.47315025329589844, + "learning_rate": 0.0003183404285008582, + "loss": 0.786, + "step": 4617 + }, + { + "epoch": 0.8238337347248238, + "grad_norm": 0.4942631125450134, + "learning_rate": 0.00031827303211791314, + "loss": 0.8627, + "step": 4618 + }, + { + "epoch": 0.824012130942824, + "grad_norm": 0.4402255713939667, + "learning_rate": 0.0003182056303728334, + "loss": 0.7299, + "step": 4619 + }, + { + "epoch": 0.8241905271608242, + "grad_norm": 0.5043277740478516, + "learning_rate": 0.00031813822327091286, + "loss": 0.8561, + "step": 4620 + }, + { + "epoch": 0.8243689233788244, + "grad_norm": 0.49375712871551514, + "learning_rate": 0.0003180708108174456, + "loss": 0.8253, + "step": 4621 + }, + { + "epoch": 0.8245473195968246, + "grad_norm": 1.158111572265625, + "learning_rate": 0.00031800339301772614, + "loss": 0.844, + "step": 4622 + }, + { + "epoch": 0.8247257158148247, + "grad_norm": 0.5060315728187561, + "learning_rate": 0.0003179359698770494, + "loss": 0.8552, + "step": 4623 + }, + { + "epoch": 0.8249041120328249, + "grad_norm": 0.5320479273796082, + "learning_rate": 0.00031786854140071084, + "loss": 0.9473, + "step": 4624 + }, + { + "epoch": 0.8250825082508251, + "grad_norm": 0.5282738208770752, + "learning_rate": 0.00031780110759400634, + "loss": 0.9197, + "step": 4625 + }, + { + "epoch": 0.8252609044688253, + "grad_norm": 0.4730012118816376, + "learning_rate": 0.00031773366846223197, + "loss": 0.7853, + "step": 4626 + }, + { + "epoch": 0.8254393006868255, + "grad_norm": 0.5142650008201599, + "learning_rate": 0.00031766622401068433, + "loss": 0.8487, + "step": 4627 + }, + { + "epoch": 0.8256176969048257, + "grad_norm": 0.4720161259174347, + "learning_rate": 0.0003175987742446607, + "loss": 0.8038, + "step": 4628 + }, + { + "epoch": 0.8257960931228258, + "grad_norm": 0.5759044885635376, + "learning_rate": 0.00031753131916945835, + "loss": 0.712, + "step": 4629 + }, + { + "epoch": 0.825974489340826, + "grad_norm": 0.6872929930686951, + "learning_rate": 0.0003174638587903753, + "loss": 0.9687, + "step": 4630 + }, + { + "epoch": 0.8261528855588262, + "grad_norm": 0.6308576464653015, + "learning_rate": 0.0003173963931127099, + "loss": 0.8172, + "step": 4631 + }, + { + "epoch": 0.8263312817768264, + "grad_norm": 0.5117534399032593, + "learning_rate": 0.0003173289221417606, + "loss": 0.9426, + "step": 4632 + }, + { + "epoch": 0.8265096779948266, + "grad_norm": 0.5493487119674683, + "learning_rate": 0.00031726144588282686, + "loss": 0.9628, + "step": 4633 + }, + { + "epoch": 0.8266880742128266, + "grad_norm": 3.2146034240722656, + "learning_rate": 0.0003171939643412081, + "loss": 0.9362, + "step": 4634 + }, + { + "epoch": 0.8268664704308268, + "grad_norm": 0.49968186020851135, + "learning_rate": 0.00031712647752220427, + "loss": 0.6978, + "step": 4635 + }, + { + "epoch": 0.827044866648827, + "grad_norm": 0.5384466052055359, + "learning_rate": 0.0003170589854311157, + "loss": 1.0049, + "step": 4636 + }, + { + "epoch": 0.8272232628668272, + "grad_norm": 0.547645628452301, + "learning_rate": 0.0003169914880732434, + "loss": 1.1843, + "step": 4637 + }, + { + "epoch": 0.8274016590848274, + "grad_norm": 0.4683157503604889, + "learning_rate": 0.0003169239854538884, + "loss": 0.7661, + "step": 4638 + }, + { + "epoch": 0.8275800553028276, + "grad_norm": 0.5019885301589966, + "learning_rate": 0.0003168564775783523, + "loss": 0.8682, + "step": 4639 + }, + { + "epoch": 0.8277584515208277, + "grad_norm": 0.6580917835235596, + "learning_rate": 0.0003167889644519374, + "loss": 0.8636, + "step": 4640 + }, + { + "epoch": 0.8279368477388279, + "grad_norm": 0.5439825654029846, + "learning_rate": 0.00031672144607994583, + "loss": 0.8918, + "step": 4641 + }, + { + "epoch": 0.8281152439568281, + "grad_norm": 0.5139893889427185, + "learning_rate": 0.00031665392246768066, + "loss": 0.9374, + "step": 4642 + }, + { + "epoch": 0.8282936401748283, + "grad_norm": 0.4646841883659363, + "learning_rate": 0.00031658639362044515, + "loss": 0.744, + "step": 4643 + }, + { + "epoch": 0.8284720363928285, + "grad_norm": 0.4877662658691406, + "learning_rate": 0.00031651885954354285, + "loss": 0.635, + "step": 4644 + }, + { + "epoch": 0.8286504326108286, + "grad_norm": 0.5172538757324219, + "learning_rate": 0.00031645132024227794, + "loss": 0.9473, + "step": 4645 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 0.8445831537246704, + "learning_rate": 0.000316383775721955, + "loss": 0.7455, + "step": 4646 + }, + { + "epoch": 0.829007225046829, + "grad_norm": 0.5219873785972595, + "learning_rate": 0.0003163162259878788, + "loss": 0.9679, + "step": 4647 + }, + { + "epoch": 0.8291856212648292, + "grad_norm": 0.5205322504043579, + "learning_rate": 0.0003162486710453548, + "loss": 0.7856, + "step": 4648 + }, + { + "epoch": 0.8293640174828294, + "grad_norm": 0.4920894503593445, + "learning_rate": 0.0003161811108996888, + "loss": 0.7642, + "step": 4649 + }, + { + "epoch": 0.8295424137008296, + "grad_norm": 0.4882299602031708, + "learning_rate": 0.00031611354555618673, + "loss": 0.7915, + "step": 4650 + }, + { + "epoch": 0.8297208099188297, + "grad_norm": 0.5416523814201355, + "learning_rate": 0.0003160459750201552, + "loss": 1.1522, + "step": 4651 + }, + { + "epoch": 0.8298992061368299, + "grad_norm": 0.4422816038131714, + "learning_rate": 0.0003159783992969012, + "loss": 0.7744, + "step": 4652 + }, + { + "epoch": 0.8300776023548301, + "grad_norm": 0.5312891006469727, + "learning_rate": 0.0003159108183917321, + "loss": 1.0687, + "step": 4653 + }, + { + "epoch": 0.8302559985728303, + "grad_norm": 0.4771096706390381, + "learning_rate": 0.00031584323230995584, + "loss": 0.8311, + "step": 4654 + }, + { + "epoch": 0.8304343947908305, + "grad_norm": 0.5976044535636902, + "learning_rate": 0.0003157756410568803, + "loss": 0.6884, + "step": 4655 + }, + { + "epoch": 0.8306127910088306, + "grad_norm": 0.6168167591094971, + "learning_rate": 0.0003157080446378143, + "loss": 0.8282, + "step": 4656 + }, + { + "epoch": 0.8307911872268308, + "grad_norm": 0.5120362639427185, + "learning_rate": 0.0003156404430580667, + "loss": 1.0674, + "step": 4657 + }, + { + "epoch": 0.830969583444831, + "grad_norm": 0.5644657015800476, + "learning_rate": 0.000315572836322947, + "loss": 1.0913, + "step": 4658 + }, + { + "epoch": 0.8311479796628312, + "grad_norm": 0.4254976511001587, + "learning_rate": 0.00031550522443776497, + "loss": 0.8372, + "step": 4659 + }, + { + "epoch": 0.8313263758808314, + "grad_norm": 0.4404276907444, + "learning_rate": 0.0003154376074078307, + "loss": 0.8619, + "step": 4660 + }, + { + "epoch": 0.8315047720988316, + "grad_norm": 0.4840773046016693, + "learning_rate": 0.00031536998523845497, + "loss": 0.87, + "step": 4661 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.4883440136909485, + "learning_rate": 0.0003153023579349487, + "loss": 1.0516, + "step": 4662 + }, + { + "epoch": 0.8318615645348318, + "grad_norm": 0.39201605319976807, + "learning_rate": 0.0003152347255026234, + "loss": 0.6851, + "step": 4663 + }, + { + "epoch": 0.832039960752832, + "grad_norm": 0.4473479688167572, + "learning_rate": 0.0003151670879467908, + "loss": 0.6324, + "step": 4664 + }, + { + "epoch": 0.8322183569708322, + "grad_norm": 1.3589231967926025, + "learning_rate": 0.0003150994452727631, + "loss": 0.954, + "step": 4665 + }, + { + "epoch": 0.8323967531888324, + "grad_norm": 0.44502779841423035, + "learning_rate": 0.00031503179748585303, + "loss": 0.6668, + "step": 4666 + }, + { + "epoch": 0.8325751494068325, + "grad_norm": 0.49202197790145874, + "learning_rate": 0.0003149641445913736, + "loss": 0.8664, + "step": 4667 + }, + { + "epoch": 0.8327535456248327, + "grad_norm": 0.6371123790740967, + "learning_rate": 0.0003148964865946381, + "loss": 0.7495, + "step": 4668 + }, + { + "epoch": 0.8329319418428329, + "grad_norm": 0.5421488881111145, + "learning_rate": 0.00031482882350096063, + "loss": 1.1947, + "step": 4669 + }, + { + "epoch": 0.8331103380608331, + "grad_norm": 0.49121710658073425, + "learning_rate": 0.0003147611553156552, + "loss": 0.8075, + "step": 4670 + }, + { + "epoch": 0.8332887342788333, + "grad_norm": 0.5611662268638611, + "learning_rate": 0.00031469348204403647, + "loss": 0.9551, + "step": 4671 + }, + { + "epoch": 0.8334671304968335, + "grad_norm": 0.4589937627315521, + "learning_rate": 0.0003146258036914195, + "loss": 0.8752, + "step": 4672 + }, + { + "epoch": 0.8336455267148336, + "grad_norm": 0.49224385619163513, + "learning_rate": 0.0003145581202631197, + "loss": 0.7851, + "step": 4673 + }, + { + "epoch": 0.8338239229328338, + "grad_norm": 0.4641133248806, + "learning_rate": 0.00031449043176445297, + "loss": 0.8309, + "step": 4674 + }, + { + "epoch": 0.834002319150834, + "grad_norm": 0.46971380710601807, + "learning_rate": 0.0003144227382007355, + "loss": 0.5713, + "step": 4675 + }, + { + "epoch": 0.8341807153688342, + "grad_norm": 0.6246667504310608, + "learning_rate": 0.00031435503957728383, + "loss": 0.9682, + "step": 4676 + }, + { + "epoch": 0.8343591115868344, + "grad_norm": 0.5531001091003418, + "learning_rate": 0.00031428733589941506, + "loss": 0.9917, + "step": 4677 + }, + { + "epoch": 0.8345375078048345, + "grad_norm": 0.5050147175788879, + "learning_rate": 0.00031421962717244654, + "loss": 0.8575, + "step": 4678 + }, + { + "epoch": 0.8347159040228347, + "grad_norm": 0.5118961930274963, + "learning_rate": 0.0003141519134016962, + "loss": 0.8731, + "step": 4679 + }, + { + "epoch": 0.8348943002408349, + "grad_norm": 0.4847746789455414, + "learning_rate": 0.0003140841945924822, + "loss": 0.8582, + "step": 4680 + }, + { + "epoch": 0.8350726964588351, + "grad_norm": 0.4473601281642914, + "learning_rate": 0.0003140164707501232, + "loss": 0.768, + "step": 4681 + }, + { + "epoch": 0.8352510926768353, + "grad_norm": 0.46980252861976624, + "learning_rate": 0.00031394874187993805, + "loss": 0.9835, + "step": 4682 + }, + { + "epoch": 0.8354294888948355, + "grad_norm": 0.42925623059272766, + "learning_rate": 0.00031388100798724624, + "loss": 0.7166, + "step": 4683 + }, + { + "epoch": 0.8356078851128356, + "grad_norm": 0.4442448019981384, + "learning_rate": 0.0003138132690773675, + "loss": 0.8631, + "step": 4684 + }, + { + "epoch": 0.8357862813308358, + "grad_norm": 0.5010333061218262, + "learning_rate": 0.00031374552515562215, + "loss": 0.7902, + "step": 4685 + }, + { + "epoch": 0.835964677548836, + "grad_norm": 0.473787784576416, + "learning_rate": 0.0003136777762273306, + "loss": 0.9016, + "step": 4686 + }, + { + "epoch": 0.8361430737668362, + "grad_norm": 0.4981488287448883, + "learning_rate": 0.000313610022297814, + "loss": 0.8935, + "step": 4687 + }, + { + "epoch": 0.8363214699848364, + "grad_norm": 0.5715591907501221, + "learning_rate": 0.0003135422633723936, + "loss": 0.9939, + "step": 4688 + }, + { + "epoch": 0.8364998662028365, + "grad_norm": 0.587060809135437, + "learning_rate": 0.0003134744994563912, + "loss": 1.0045, + "step": 4689 + }, + { + "epoch": 0.8366782624208366, + "grad_norm": 0.4668729901313782, + "learning_rate": 0.0003134067305551289, + "loss": 0.9922, + "step": 4690 + }, + { + "epoch": 0.8368566586388368, + "grad_norm": 0.44394639134407043, + "learning_rate": 0.0003133389566739292, + "loss": 0.8563, + "step": 4691 + }, + { + "epoch": 0.837035054856837, + "grad_norm": 0.41054239869117737, + "learning_rate": 0.0003132711778181152, + "loss": 0.6808, + "step": 4692 + }, + { + "epoch": 0.8372134510748372, + "grad_norm": 0.45655539631843567, + "learning_rate": 0.00031320339399301005, + "loss": 0.7885, + "step": 4693 + }, + { + "epoch": 0.8373918472928374, + "grad_norm": 0.528401255607605, + "learning_rate": 0.00031313560520393756, + "loss": 1.3023, + "step": 4694 + }, + { + "epoch": 0.8375702435108375, + "grad_norm": 0.5124956369400024, + "learning_rate": 0.0003130678114562218, + "loss": 0.9328, + "step": 4695 + }, + { + "epoch": 0.8377486397288377, + "grad_norm": 0.5178223252296448, + "learning_rate": 0.00031300001275518733, + "loss": 1.0182, + "step": 4696 + }, + { + "epoch": 0.8379270359468379, + "grad_norm": 0.49288439750671387, + "learning_rate": 0.00031293220910615896, + "loss": 1.0634, + "step": 4697 + }, + { + "epoch": 0.8381054321648381, + "grad_norm": 0.47642451524734497, + "learning_rate": 0.00031286440051446187, + "loss": 0.9013, + "step": 4698 + }, + { + "epoch": 0.8382838283828383, + "grad_norm": 0.44043177366256714, + "learning_rate": 0.0003127965869854219, + "loss": 0.7628, + "step": 4699 + }, + { + "epoch": 0.8384622246008384, + "grad_norm": 0.4588578939437866, + "learning_rate": 0.00031272876852436493, + "loss": 0.7756, + "step": 4700 + }, + { + "epoch": 0.8386406208188386, + "grad_norm": 0.4768531620502472, + "learning_rate": 0.0003126609451366176, + "loss": 0.7546, + "step": 4701 + }, + { + "epoch": 0.8388190170368388, + "grad_norm": 0.4645833671092987, + "learning_rate": 0.00031259311682750655, + "loss": 0.7292, + "step": 4702 + }, + { + "epoch": 0.838997413254839, + "grad_norm": 0.49236834049224854, + "learning_rate": 0.00031252528360235907, + "loss": 0.884, + "step": 4703 + }, + { + "epoch": 0.8391758094728392, + "grad_norm": 0.5257160067558289, + "learning_rate": 0.0003124574454665027, + "loss": 0.9921, + "step": 4704 + }, + { + "epoch": 0.8393542056908394, + "grad_norm": 2.2589757442474365, + "learning_rate": 0.0003123896024252654, + "loss": 0.7997, + "step": 4705 + }, + { + "epoch": 0.8395326019088395, + "grad_norm": 0.4541998505592346, + "learning_rate": 0.00031232175448397547, + "loss": 0.7263, + "step": 4706 + }, + { + "epoch": 0.8397109981268397, + "grad_norm": 0.8749123215675354, + "learning_rate": 0.00031225390164796193, + "loss": 0.7226, + "step": 4707 + }, + { + "epoch": 0.8398893943448399, + "grad_norm": 0.46198785305023193, + "learning_rate": 0.0003121860439225537, + "loss": 0.7642, + "step": 4708 + }, + { + "epoch": 0.8400677905628401, + "grad_norm": 0.4428096115589142, + "learning_rate": 0.0003121181813130804, + "loss": 0.7683, + "step": 4709 + }, + { + "epoch": 0.8402461867808403, + "grad_norm": 0.4557209014892578, + "learning_rate": 0.0003120503138248718, + "loss": 0.6965, + "step": 4710 + }, + { + "epoch": 0.8404245829988404, + "grad_norm": 0.498859167098999, + "learning_rate": 0.0003119824414632583, + "loss": 0.7745, + "step": 4711 + }, + { + "epoch": 0.8406029792168406, + "grad_norm": 0.44227057695388794, + "learning_rate": 0.00031191456423357045, + "loss": 0.8255, + "step": 4712 + }, + { + "epoch": 0.8407813754348408, + "grad_norm": 0.5205174684524536, + "learning_rate": 0.0003118466821411394, + "loss": 1.0533, + "step": 4713 + }, + { + "epoch": 0.840959771652841, + "grad_norm": 0.48816418647766113, + "learning_rate": 0.0003117787951912965, + "loss": 0.8577, + "step": 4714 + }, + { + "epoch": 0.8411381678708412, + "grad_norm": 0.5432656407356262, + "learning_rate": 0.00031171090338937376, + "loss": 1.0221, + "step": 4715 + }, + { + "epoch": 0.8413165640888414, + "grad_norm": 0.4946669340133667, + "learning_rate": 0.0003116430067407031, + "loss": 0.8704, + "step": 4716 + }, + { + "epoch": 0.8414949603068415, + "grad_norm": 0.4849919378757477, + "learning_rate": 0.0003115751052506173, + "loss": 0.8248, + "step": 4717 + }, + { + "epoch": 0.8416733565248417, + "grad_norm": 0.5005682110786438, + "learning_rate": 0.0003115071989244491, + "loss": 0.9195, + "step": 4718 + }, + { + "epoch": 0.8418517527428419, + "grad_norm": 0.5993044972419739, + "learning_rate": 0.00031143928776753213, + "loss": 0.8419, + "step": 4719 + }, + { + "epoch": 0.842030148960842, + "grad_norm": 0.704413115978241, + "learning_rate": 0.0003113713717851998, + "loss": 0.9343, + "step": 4720 + }, + { + "epoch": 0.8422085451788422, + "grad_norm": 0.5452966094017029, + "learning_rate": 0.0003113034509827864, + "loss": 0.7155, + "step": 4721 + }, + { + "epoch": 0.8423869413968423, + "grad_norm": 0.5383533835411072, + "learning_rate": 0.0003112355253656263, + "loss": 0.7432, + "step": 4722 + }, + { + "epoch": 0.8425653376148425, + "grad_norm": 0.624819278717041, + "learning_rate": 0.00031116759493905445, + "loss": 0.8583, + "step": 4723 + }, + { + "epoch": 0.8427437338328427, + "grad_norm": 0.982383668422699, + "learning_rate": 0.000311099659708406, + "loss": 0.6117, + "step": 4724 + }, + { + "epoch": 0.8429221300508429, + "grad_norm": 0.6751213669776917, + "learning_rate": 0.00031103171967901655, + "loss": 1.022, + "step": 4725 + }, + { + "epoch": 0.8431005262688431, + "grad_norm": 1.0229430198669434, + "learning_rate": 0.00031096377485622214, + "loss": 0.7461, + "step": 4726 + }, + { + "epoch": 0.8432789224868433, + "grad_norm": 0.5392288565635681, + "learning_rate": 0.000310895825245359, + "loss": 0.7769, + "step": 4727 + }, + { + "epoch": 0.8434573187048434, + "grad_norm": 1.207713007926941, + "learning_rate": 0.0003108278708517641, + "loss": 0.5888, + "step": 4728 + }, + { + "epoch": 0.8436357149228436, + "grad_norm": 1.1378090381622314, + "learning_rate": 0.0003107599116807743, + "loss": 0.7526, + "step": 4729 + }, + { + "epoch": 0.8438141111408438, + "grad_norm": 0.7071771621704102, + "learning_rate": 0.00031069194773772715, + "loss": 1.0347, + "step": 4730 + }, + { + "epoch": 0.843992507358844, + "grad_norm": 0.7279558181762695, + "learning_rate": 0.0003106239790279606, + "loss": 0.8911, + "step": 4731 + }, + { + "epoch": 0.8441709035768442, + "grad_norm": 0.49013325572013855, + "learning_rate": 0.0003105560055568128, + "loss": 0.6986, + "step": 4732 + }, + { + "epoch": 0.8443492997948443, + "grad_norm": 0.5166245698928833, + "learning_rate": 0.0003104880273296224, + "loss": 1.0091, + "step": 4733 + }, + { + "epoch": 0.8445276960128445, + "grad_norm": 0.5410045385360718, + "learning_rate": 0.00031042004435172834, + "loss": 1.0074, + "step": 4734 + }, + { + "epoch": 0.8447060922308447, + "grad_norm": 0.40799999237060547, + "learning_rate": 0.00031035205662847005, + "loss": 0.7189, + "step": 4735 + }, + { + "epoch": 0.8448844884488449, + "grad_norm": 0.468019038438797, + "learning_rate": 0.0003102840641651872, + "loss": 0.9389, + "step": 4736 + }, + { + "epoch": 0.8450628846668451, + "grad_norm": 0.4619864225387573, + "learning_rate": 0.00031021606696721984, + "loss": 0.7924, + "step": 4737 + }, + { + "epoch": 0.8452412808848453, + "grad_norm": 1.7745176553726196, + "learning_rate": 0.0003101480650399085, + "loss": 0.8259, + "step": 4738 + }, + { + "epoch": 0.8454196771028454, + "grad_norm": 0.5147114396095276, + "learning_rate": 0.000310080058388594, + "loss": 0.7269, + "step": 4739 + }, + { + "epoch": 0.8455980733208456, + "grad_norm": 1.0041489601135254, + "learning_rate": 0.00031001204701861765, + "loss": 0.8016, + "step": 4740 + }, + { + "epoch": 0.8457764695388458, + "grad_norm": 0.5194323062896729, + "learning_rate": 0.00030994403093532086, + "loss": 0.8482, + "step": 4741 + }, + { + "epoch": 0.845954865756846, + "grad_norm": 1.3192564249038696, + "learning_rate": 0.00030987601014404576, + "loss": 0.9586, + "step": 4742 + }, + { + "epoch": 0.8461332619748462, + "grad_norm": 0.5231749415397644, + "learning_rate": 0.00030980798465013454, + "loss": 0.9005, + "step": 4743 + }, + { + "epoch": 0.8463116581928463, + "grad_norm": 0.52949458360672, + "learning_rate": 0.00030973995445892987, + "loss": 0.9448, + "step": 4744 + }, + { + "epoch": 0.8464900544108465, + "grad_norm": 0.509884238243103, + "learning_rate": 0.00030967191957577503, + "loss": 0.7253, + "step": 4745 + }, + { + "epoch": 0.8466684506288467, + "grad_norm": 0.4490432143211365, + "learning_rate": 0.00030960388000601325, + "loss": 0.7781, + "step": 4746 + }, + { + "epoch": 0.8468468468468469, + "grad_norm": 0.5485023856163025, + "learning_rate": 0.0003095358357549883, + "loss": 0.8696, + "step": 4747 + }, + { + "epoch": 0.847025243064847, + "grad_norm": 0.46540600061416626, + "learning_rate": 0.00030946778682804457, + "loss": 0.837, + "step": 4748 + }, + { + "epoch": 0.8472036392828473, + "grad_norm": 0.46531352400779724, + "learning_rate": 0.0003093997332305264, + "loss": 0.879, + "step": 4749 + }, + { + "epoch": 0.8473820355008473, + "grad_norm": 0.5620687007904053, + "learning_rate": 0.00030933167496777875, + "loss": 1.2353, + "step": 4750 + }, + { + "epoch": 0.8475604317188475, + "grad_norm": 0.4240880310535431, + "learning_rate": 0.0003092636120451469, + "loss": 0.6647, + "step": 4751 + }, + { + "epoch": 0.8477388279368477, + "grad_norm": 0.5051021575927734, + "learning_rate": 0.0003091955444679763, + "loss": 0.9274, + "step": 4752 + }, + { + "epoch": 0.8479172241548479, + "grad_norm": 0.47416186332702637, + "learning_rate": 0.00030912747224161324, + "loss": 0.7746, + "step": 4753 + }, + { + "epoch": 0.8480956203728481, + "grad_norm": 0.4895223081111908, + "learning_rate": 0.000309059395371404, + "loss": 0.7968, + "step": 4754 + }, + { + "epoch": 0.8482740165908482, + "grad_norm": 0.5074788331985474, + "learning_rate": 0.00030899131386269527, + "loss": 1.1851, + "step": 4755 + }, + { + "epoch": 0.8484524128088484, + "grad_norm": 0.45386070013046265, + "learning_rate": 0.000308923227720834, + "loss": 0.8569, + "step": 4756 + }, + { + "epoch": 0.8486308090268486, + "grad_norm": 0.5263837575912476, + "learning_rate": 0.0003088551369511679, + "loss": 0.9021, + "step": 4757 + }, + { + "epoch": 0.8488092052448488, + "grad_norm": 0.6763511896133423, + "learning_rate": 0.00030878704155904465, + "loss": 0.8721, + "step": 4758 + }, + { + "epoch": 0.848987601462849, + "grad_norm": 0.48090678453445435, + "learning_rate": 0.0003087189415498124, + "loss": 0.8649, + "step": 4759 + }, + { + "epoch": 0.8491659976808492, + "grad_norm": 0.44986963272094727, + "learning_rate": 0.0003086508369288198, + "loss": 0.9613, + "step": 4760 + }, + { + "epoch": 0.8493443938988493, + "grad_norm": 4.108095169067383, + "learning_rate": 0.00030858272770141574, + "loss": 0.7813, + "step": 4761 + }, + { + "epoch": 0.8495227901168495, + "grad_norm": 0.4738391935825348, + "learning_rate": 0.0003085146138729494, + "loss": 0.8247, + "step": 4762 + }, + { + "epoch": 0.8497011863348497, + "grad_norm": 0.45482027530670166, + "learning_rate": 0.0003084464954487705, + "loss": 0.7193, + "step": 4763 + }, + { + "epoch": 0.8498795825528499, + "grad_norm": 0.6398509740829468, + "learning_rate": 0.00030837837243422896, + "loss": 0.7891, + "step": 4764 + }, + { + "epoch": 0.8500579787708501, + "grad_norm": 0.45158737897872925, + "learning_rate": 0.00030831024483467517, + "loss": 0.7787, + "step": 4765 + }, + { + "epoch": 0.8502363749888502, + "grad_norm": 0.46626511216163635, + "learning_rate": 0.00030824211265545985, + "loss": 0.8941, + "step": 4766 + }, + { + "epoch": 0.8504147712068504, + "grad_norm": 0.4571463167667389, + "learning_rate": 0.00030817397590193404, + "loss": 0.9276, + "step": 4767 + }, + { + "epoch": 0.8505931674248506, + "grad_norm": 0.4970678687095642, + "learning_rate": 0.0003081058345794493, + "loss": 0.8267, + "step": 4768 + }, + { + "epoch": 0.8507715636428508, + "grad_norm": 0.5197835564613342, + "learning_rate": 0.0003080376886933572, + "loss": 0.8656, + "step": 4769 + }, + { + "epoch": 0.850949959860851, + "grad_norm": 0.48982325196266174, + "learning_rate": 0.0003079695382490101, + "loss": 0.7117, + "step": 4770 + }, + { + "epoch": 0.8511283560788512, + "grad_norm": 0.5008729696273804, + "learning_rate": 0.0003079013832517603, + "loss": 0.8803, + "step": 4771 + }, + { + "epoch": 0.8513067522968513, + "grad_norm": 0.4859359562397003, + "learning_rate": 0.00030783322370696087, + "loss": 0.8494, + "step": 4772 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.4618992209434509, + "learning_rate": 0.00030776505961996494, + "loss": 0.6753, + "step": 4773 + }, + { + "epoch": 0.8516635447328517, + "grad_norm": 0.6842380166053772, + "learning_rate": 0.00030769689099612604, + "loss": 0.8521, + "step": 4774 + }, + { + "epoch": 0.8518419409508519, + "grad_norm": 0.46096381545066833, + "learning_rate": 0.00030762871784079815, + "loss": 0.6602, + "step": 4775 + }, + { + "epoch": 0.8520203371688521, + "grad_norm": 0.5396759510040283, + "learning_rate": 0.0003075605401593356, + "loss": 1.053, + "step": 4776 + }, + { + "epoch": 0.8521987333868521, + "grad_norm": 0.4654025435447693, + "learning_rate": 0.000307492357957093, + "loss": 0.6658, + "step": 4777 + }, + { + "epoch": 0.8523771296048523, + "grad_norm": 0.46096524596214294, + "learning_rate": 0.0003074241712394253, + "loss": 0.7662, + "step": 4778 + }, + { + "epoch": 0.8525555258228525, + "grad_norm": 0.5289954543113708, + "learning_rate": 0.0003073559800116879, + "loss": 0.7916, + "step": 4779 + }, + { + "epoch": 0.8527339220408527, + "grad_norm": 0.4729297459125519, + "learning_rate": 0.00030728778427923655, + "loss": 0.8108, + "step": 4780 + }, + { + "epoch": 0.8529123182588529, + "grad_norm": 1.2165110111236572, + "learning_rate": 0.0003072195840474273, + "loss": 1.0334, + "step": 4781 + }, + { + "epoch": 0.8530907144768531, + "grad_norm": 0.6247953176498413, + "learning_rate": 0.00030715137932161646, + "loss": 1.1616, + "step": 4782 + }, + { + "epoch": 0.8532691106948532, + "grad_norm": 0.5182445645332336, + "learning_rate": 0.00030708317010716093, + "loss": 1.0503, + "step": 4783 + }, + { + "epoch": 0.8534475069128534, + "grad_norm": 0.5359323620796204, + "learning_rate": 0.0003070149564094178, + "loss": 1.0459, + "step": 4784 + }, + { + "epoch": 0.8536259031308536, + "grad_norm": 0.4704750180244446, + "learning_rate": 0.0003069467382337445, + "loss": 0.8514, + "step": 4785 + }, + { + "epoch": 0.8538042993488538, + "grad_norm": 0.5520228743553162, + "learning_rate": 0.0003068785155854989, + "loss": 0.8987, + "step": 4786 + }, + { + "epoch": 0.853982695566854, + "grad_norm": 0.4699079394340515, + "learning_rate": 0.0003068102884700391, + "loss": 0.736, + "step": 4787 + }, + { + "epoch": 0.8541610917848541, + "grad_norm": 0.5631700754165649, + "learning_rate": 0.00030674205689272375, + "loss": 0.9386, + "step": 4788 + }, + { + "epoch": 0.8543394880028543, + "grad_norm": 0.4554060101509094, + "learning_rate": 0.00030667382085891175, + "loss": 0.7084, + "step": 4789 + }, + { + "epoch": 0.8545178842208545, + "grad_norm": 0.46370020508766174, + "learning_rate": 0.00030660558037396216, + "loss": 0.9635, + "step": 4790 + }, + { + "epoch": 0.8546962804388547, + "grad_norm": 0.46454691886901855, + "learning_rate": 0.0003065373354432346, + "loss": 0.8481, + "step": 4791 + }, + { + "epoch": 0.8548746766568549, + "grad_norm": 0.4899047017097473, + "learning_rate": 0.0003064690860720891, + "loss": 0.8236, + "step": 4792 + }, + { + "epoch": 0.8550530728748551, + "grad_norm": 0.9299049377441406, + "learning_rate": 0.0003064008322658859, + "loss": 0.7952, + "step": 4793 + }, + { + "epoch": 0.8552314690928552, + "grad_norm": 1.665650725364685, + "learning_rate": 0.0003063325740299855, + "loss": 1.0987, + "step": 4794 + }, + { + "epoch": 0.8554098653108554, + "grad_norm": 0.4786631464958191, + "learning_rate": 0.0003062643113697492, + "loss": 0.8729, + "step": 4795 + }, + { + "epoch": 0.8555882615288556, + "grad_norm": 1.025985836982727, + "learning_rate": 0.00030619604429053793, + "loss": 0.891, + "step": 4796 + }, + { + "epoch": 0.8557666577468558, + "grad_norm": 0.5616780519485474, + "learning_rate": 0.0003061277727977135, + "loss": 1.0594, + "step": 4797 + }, + { + "epoch": 0.855945053964856, + "grad_norm": 0.4509812593460083, + "learning_rate": 0.000306059496896638, + "loss": 0.6073, + "step": 4798 + }, + { + "epoch": 0.8561234501828561, + "grad_norm": 0.4322461783885956, + "learning_rate": 0.0003059912165926738, + "loss": 0.772, + "step": 4799 + }, + { + "epoch": 0.8563018464008563, + "grad_norm": 0.40975263714790344, + "learning_rate": 0.00030592293189118344, + "loss": 0.7681, + "step": 4800 + }, + { + "epoch": 0.8564802426188565, + "grad_norm": 0.5190335512161255, + "learning_rate": 0.00030585464279753015, + "loss": 0.8196, + "step": 4801 + }, + { + "epoch": 0.8566586388368567, + "grad_norm": 0.5089114904403687, + "learning_rate": 0.0003057863493170772, + "loss": 0.7172, + "step": 4802 + }, + { + "epoch": 0.8568370350548569, + "grad_norm": 0.548192024230957, + "learning_rate": 0.0003057180514551884, + "loss": 0.8321, + "step": 4803 + }, + { + "epoch": 0.8570154312728571, + "grad_norm": 0.4767385721206665, + "learning_rate": 0.0003056497492172278, + "loss": 0.8879, + "step": 4804 + }, + { + "epoch": 0.8571938274908572, + "grad_norm": 0.488613098859787, + "learning_rate": 0.00030558144260855986, + "loss": 0.9423, + "step": 4805 + }, + { + "epoch": 0.8573722237088574, + "grad_norm": 0.48882797360420227, + "learning_rate": 0.0003055131316345493, + "loss": 0.7837, + "step": 4806 + }, + { + "epoch": 0.8575506199268575, + "grad_norm": 0.5603160262107849, + "learning_rate": 0.0003054448163005613, + "loss": 0.8026, + "step": 4807 + }, + { + "epoch": 0.8577290161448577, + "grad_norm": 0.46545833349227905, + "learning_rate": 0.00030537649661196135, + "loss": 0.8075, + "step": 4808 + }, + { + "epoch": 0.857907412362858, + "grad_norm": 0.4614555537700653, + "learning_rate": 0.00030530817257411517, + "loss": 0.7662, + "step": 4809 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 0.5005578398704529, + "learning_rate": 0.0003052398441923888, + "loss": 0.9496, + "step": 4810 + }, + { + "epoch": 0.8582642047988582, + "grad_norm": 0.45669299364089966, + "learning_rate": 0.00030517151147214895, + "loss": 0.7207, + "step": 4811 + }, + { + "epoch": 0.8584426010168584, + "grad_norm": 0.4442518949508667, + "learning_rate": 0.0003051031744187623, + "loss": 0.7858, + "step": 4812 + }, + { + "epoch": 0.8586209972348586, + "grad_norm": 0.4947311580181122, + "learning_rate": 0.00030503483303759597, + "loss": 1.0889, + "step": 4813 + }, + { + "epoch": 0.8587993934528588, + "grad_norm": 0.4674645960330963, + "learning_rate": 0.00030496648733401764, + "loss": 0.7326, + "step": 4814 + }, + { + "epoch": 0.858977789670859, + "grad_norm": 0.49321508407592773, + "learning_rate": 0.00030489813731339504, + "loss": 0.8671, + "step": 4815 + }, + { + "epoch": 0.8591561858888591, + "grad_norm": 0.48469820618629456, + "learning_rate": 0.00030482978298109636, + "loss": 1.0099, + "step": 4816 + }, + { + "epoch": 0.8593345821068593, + "grad_norm": 0.5168629884719849, + "learning_rate": 0.0003047614243424901, + "loss": 1.0685, + "step": 4817 + }, + { + "epoch": 0.8595129783248595, + "grad_norm": 0.5016402006149292, + "learning_rate": 0.0003046930614029451, + "loss": 0.9678, + "step": 4818 + }, + { + "epoch": 0.8596913745428597, + "grad_norm": 0.458768367767334, + "learning_rate": 0.00030462469416783067, + "loss": 0.8762, + "step": 4819 + }, + { + "epoch": 0.8598697707608599, + "grad_norm": 0.6688326597213745, + "learning_rate": 0.0003045563226425162, + "loss": 0.9441, + "step": 4820 + }, + { + "epoch": 0.86004816697886, + "grad_norm": 0.5008928179740906, + "learning_rate": 0.0003044879468323716, + "loss": 0.7504, + "step": 4821 + }, + { + "epoch": 0.8602265631968602, + "grad_norm": 2.919682502746582, + "learning_rate": 0.0003044195667427672, + "loss": 1.1196, + "step": 4822 + }, + { + "epoch": 0.8604049594148604, + "grad_norm": 1.0007497072219849, + "learning_rate": 0.0003043511823790734, + "loss": 0.638, + "step": 4823 + }, + { + "epoch": 0.8605833556328606, + "grad_norm": 0.5225825905799866, + "learning_rate": 0.00030428279374666113, + "loss": 0.8362, + "step": 4824 + }, + { + "epoch": 0.8607617518508608, + "grad_norm": 0.533996045589447, + "learning_rate": 0.0003042144008509016, + "loss": 0.8759, + "step": 4825 + }, + { + "epoch": 0.860940148068861, + "grad_norm": 0.45570921897888184, + "learning_rate": 0.00030414600369716636, + "loss": 0.7521, + "step": 4826 + }, + { + "epoch": 0.8611185442868611, + "grad_norm": 2.225616216659546, + "learning_rate": 0.0003040776022908273, + "loss": 0.8446, + "step": 4827 + }, + { + "epoch": 0.8612969405048613, + "grad_norm": 0.5153058767318726, + "learning_rate": 0.00030400919663725655, + "loss": 1.0429, + "step": 4828 + }, + { + "epoch": 0.8614753367228615, + "grad_norm": 0.44439637660980225, + "learning_rate": 0.00030394078674182684, + "loss": 0.8713, + "step": 4829 + }, + { + "epoch": 0.8616537329408617, + "grad_norm": 0.5092588663101196, + "learning_rate": 0.0003038723726099109, + "loss": 0.7983, + "step": 4830 + }, + { + "epoch": 0.8618321291588619, + "grad_norm": 0.5052730441093445, + "learning_rate": 0.000303803954246882, + "loss": 0.9045, + "step": 4831 + }, + { + "epoch": 0.862010525376862, + "grad_norm": 0.5321127772331238, + "learning_rate": 0.00030373553165811377, + "loss": 0.9417, + "step": 4832 + }, + { + "epoch": 0.8621889215948622, + "grad_norm": 0.4712226688861847, + "learning_rate": 0.00030366710484897984, + "loss": 0.8645, + "step": 4833 + }, + { + "epoch": 0.8623673178128624, + "grad_norm": 0.5059930682182312, + "learning_rate": 0.0003035986738248547, + "loss": 0.9814, + "step": 4834 + }, + { + "epoch": 0.8625457140308626, + "grad_norm": 0.4837188124656677, + "learning_rate": 0.00030353023859111284, + "loss": 0.8914, + "step": 4835 + }, + { + "epoch": 0.8627241102488628, + "grad_norm": 0.4856887757778168, + "learning_rate": 0.0003034617991531289, + "loss": 0.9607, + "step": 4836 + }, + { + "epoch": 0.862902506466863, + "grad_norm": 0.5175963044166565, + "learning_rate": 0.0003033933555162784, + "loss": 0.9011, + "step": 4837 + }, + { + "epoch": 0.863080902684863, + "grad_norm": 0.4169839024543762, + "learning_rate": 0.0003033249076859367, + "loss": 0.7326, + "step": 4838 + }, + { + "epoch": 0.8632592989028632, + "grad_norm": 0.4630539119243622, + "learning_rate": 0.0003032564556674797, + "loss": 0.7955, + "step": 4839 + }, + { + "epoch": 0.8634376951208634, + "grad_norm": 0.47739243507385254, + "learning_rate": 0.0003031879994662836, + "loss": 0.8976, + "step": 4840 + }, + { + "epoch": 0.8636160913388636, + "grad_norm": 0.5732921957969666, + "learning_rate": 0.00030311953908772495, + "loss": 0.9195, + "step": 4841 + }, + { + "epoch": 0.8637944875568638, + "grad_norm": 0.45784977078437805, + "learning_rate": 0.0003030510745371805, + "loss": 0.8676, + "step": 4842 + }, + { + "epoch": 0.8639728837748639, + "grad_norm": 0.40974244475364685, + "learning_rate": 0.00030298260582002753, + "loss": 0.6207, + "step": 4843 + }, + { + "epoch": 0.8641512799928641, + "grad_norm": 0.5293233394622803, + "learning_rate": 0.00030291413294164336, + "loss": 0.8833, + "step": 4844 + }, + { + "epoch": 0.8643296762108643, + "grad_norm": 0.5298712253570557, + "learning_rate": 0.00030284565590740607, + "loss": 0.8459, + "step": 4845 + }, + { + "epoch": 0.8645080724288645, + "grad_norm": 0.5469498038291931, + "learning_rate": 0.00030277717472269373, + "loss": 0.8662, + "step": 4846 + }, + { + "epoch": 0.8646864686468647, + "grad_norm": 0.5502684116363525, + "learning_rate": 0.00030270868939288474, + "loss": 0.7796, + "step": 4847 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.5410329699516296, + "learning_rate": 0.00030264019992335805, + "loss": 0.6923, + "step": 4848 + }, + { + "epoch": 0.865043261082865, + "grad_norm": 0.5078060030937195, + "learning_rate": 0.00030257170631949265, + "loss": 0.8679, + "step": 4849 + }, + { + "epoch": 0.8652216573008652, + "grad_norm": 0.4987890422344208, + "learning_rate": 0.0003025032085866681, + "loss": 0.8628, + "step": 4850 + }, + { + "epoch": 0.8654000535188654, + "grad_norm": 0.5421337485313416, + "learning_rate": 0.000302434706730264, + "loss": 0.8687, + "step": 4851 + }, + { + "epoch": 0.8655784497368656, + "grad_norm": 0.4919837415218353, + "learning_rate": 0.0003023662007556607, + "loss": 0.9298, + "step": 4852 + }, + { + "epoch": 0.8657568459548658, + "grad_norm": 0.4684261381626129, + "learning_rate": 0.0003022976906682385, + "loss": 0.7224, + "step": 4853 + }, + { + "epoch": 0.8659352421728659, + "grad_norm": 0.5145940780639648, + "learning_rate": 0.00030222917647337833, + "loss": 0.9305, + "step": 4854 + }, + { + "epoch": 0.8661136383908661, + "grad_norm": 0.5013492107391357, + "learning_rate": 0.00030216065817646097, + "loss": 0.9581, + "step": 4855 + }, + { + "epoch": 0.8662920346088663, + "grad_norm": 0.4868033528327942, + "learning_rate": 0.000302092135782868, + "loss": 0.9121, + "step": 4856 + }, + { + "epoch": 0.8664704308268665, + "grad_norm": 0.5112666487693787, + "learning_rate": 0.0003020236092979811, + "loss": 0.9431, + "step": 4857 + }, + { + "epoch": 0.8666488270448667, + "grad_norm": 0.588441789150238, + "learning_rate": 0.0003019550787271823, + "loss": 0.9898, + "step": 4858 + }, + { + "epoch": 0.8668272232628669, + "grad_norm": 0.42368215322494507, + "learning_rate": 0.00030188654407585394, + "loss": 0.6658, + "step": 4859 + }, + { + "epoch": 0.867005619480867, + "grad_norm": 0.4669845700263977, + "learning_rate": 0.00030181800534937874, + "loss": 0.9835, + "step": 4860 + }, + { + "epoch": 0.8671840156988672, + "grad_norm": 0.8458566665649414, + "learning_rate": 0.00030174946255313986, + "loss": 0.8751, + "step": 4861 + }, + { + "epoch": 0.8673624119168674, + "grad_norm": 0.43754637241363525, + "learning_rate": 0.0003016809156925203, + "loss": 0.7733, + "step": 4862 + }, + { + "epoch": 0.8675408081348676, + "grad_norm": 0.539728045463562, + "learning_rate": 0.00030161236477290387, + "loss": 1.0081, + "step": 4863 + }, + { + "epoch": 0.8677192043528678, + "grad_norm": 0.6148440837860107, + "learning_rate": 0.00030154380979967457, + "loss": 1.0732, + "step": 4864 + }, + { + "epoch": 0.8678976005708678, + "grad_norm": 0.5512346625328064, + "learning_rate": 0.0003014752507782166, + "loss": 0.9659, + "step": 4865 + }, + { + "epoch": 0.868075996788868, + "grad_norm": 0.4736751616001129, + "learning_rate": 0.0003014066877139146, + "loss": 0.567, + "step": 4866 + }, + { + "epoch": 0.8682543930068682, + "grad_norm": 0.9616851806640625, + "learning_rate": 0.00030133812061215346, + "loss": 0.7497, + "step": 4867 + }, + { + "epoch": 0.8684327892248684, + "grad_norm": 0.5108407139778137, + "learning_rate": 0.00030126954947831843, + "loss": 0.954, + "step": 4868 + }, + { + "epoch": 0.8686111854428686, + "grad_norm": 0.5192012786865234, + "learning_rate": 0.000301200974317795, + "loss": 0.8119, + "step": 4869 + }, + { + "epoch": 0.8687895816608688, + "grad_norm": 0.5100924372673035, + "learning_rate": 0.0003011323951359692, + "loss": 0.9276, + "step": 4870 + }, + { + "epoch": 0.8689679778788689, + "grad_norm": 0.46075180172920227, + "learning_rate": 0.00030106381193822695, + "loss": 0.764, + "step": 4871 + }, + { + "epoch": 0.8691463740968691, + "grad_norm": 0.45364946126937866, + "learning_rate": 0.000300995224729955, + "loss": 0.7632, + "step": 4872 + }, + { + "epoch": 0.8693247703148693, + "grad_norm": 0.5815824270248413, + "learning_rate": 0.00030092663351654, + "loss": 0.9709, + "step": 4873 + }, + { + "epoch": 0.8695031665328695, + "grad_norm": 0.4578329920768738, + "learning_rate": 0.00030085803830336903, + "loss": 0.6986, + "step": 4874 + }, + { + "epoch": 0.8696815627508697, + "grad_norm": 0.9438912272453308, + "learning_rate": 0.0003007894390958297, + "loss": 0.9468, + "step": 4875 + }, + { + "epoch": 0.8698599589688698, + "grad_norm": 1.1074215173721313, + "learning_rate": 0.0003007208358993097, + "loss": 0.9185, + "step": 4876 + }, + { + "epoch": 0.87003835518687, + "grad_norm": 2.9237184524536133, + "learning_rate": 0.00030065222871919706, + "loss": 1.0407, + "step": 4877 + }, + { + "epoch": 0.8702167514048702, + "grad_norm": 0.5479034185409546, + "learning_rate": 0.00030058361756088014, + "loss": 1.0267, + "step": 4878 + }, + { + "epoch": 0.8703951476228704, + "grad_norm": 0.5139360427856445, + "learning_rate": 0.0003005150024297477, + "loss": 0.8951, + "step": 4879 + }, + { + "epoch": 0.8705735438408706, + "grad_norm": 0.48596107959747314, + "learning_rate": 0.00030044638333118873, + "loss": 0.8362, + "step": 4880 + }, + { + "epoch": 0.8707519400588708, + "grad_norm": 0.5204296708106995, + "learning_rate": 0.00030037776027059247, + "loss": 0.7555, + "step": 4881 + }, + { + "epoch": 0.8709303362768709, + "grad_norm": 0.49391311407089233, + "learning_rate": 0.00030030913325334864, + "loss": 0.9305, + "step": 4882 + }, + { + "epoch": 0.8711087324948711, + "grad_norm": 0.4564768970012665, + "learning_rate": 0.00030024050228484714, + "loss": 0.7117, + "step": 4883 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.434636652469635, + "learning_rate": 0.00030017186737047813, + "loss": 0.7125, + "step": 4884 + }, + { + "epoch": 0.8714655249308715, + "grad_norm": 0.542251706123352, + "learning_rate": 0.00030010322851563233, + "loss": 1.061, + "step": 4885 + }, + { + "epoch": 0.8716439211488717, + "grad_norm": 0.5167363882064819, + "learning_rate": 0.0003000345857257005, + "loss": 0.7555, + "step": 4886 + }, + { + "epoch": 0.8718223173668718, + "grad_norm": 0.4324584901332855, + "learning_rate": 0.0002999659390060738, + "loss": 0.6161, + "step": 4887 + }, + { + "epoch": 0.872000713584872, + "grad_norm": 0.49977007508277893, + "learning_rate": 0.0002998972883621439, + "loss": 0.7964, + "step": 4888 + }, + { + "epoch": 0.8721791098028722, + "grad_norm": 0.5305691361427307, + "learning_rate": 0.00029982863379930224, + "loss": 1.1202, + "step": 4889 + }, + { + "epoch": 0.8723575060208724, + "grad_norm": 0.5107898712158203, + "learning_rate": 0.0002997599753229412, + "loss": 1.0624, + "step": 4890 + }, + { + "epoch": 0.8725359022388726, + "grad_norm": 0.5440460443496704, + "learning_rate": 0.00029969131293845313, + "loss": 1.0675, + "step": 4891 + }, + { + "epoch": 0.8727142984568728, + "grad_norm": 0.48299506306648254, + "learning_rate": 0.00029962264665123076, + "loss": 0.8612, + "step": 4892 + }, + { + "epoch": 0.8728926946748728, + "grad_norm": 0.49671709537506104, + "learning_rate": 0.000299553976466667, + "loss": 0.9027, + "step": 4893 + }, + { + "epoch": 0.873071090892873, + "grad_norm": 0.5757230520248413, + "learning_rate": 0.00029948530239015534, + "loss": 0.9273, + "step": 4894 + }, + { + "epoch": 0.8732494871108732, + "grad_norm": 0.5014898180961609, + "learning_rate": 0.0002994166244270893, + "loss": 0.9075, + "step": 4895 + }, + { + "epoch": 0.8734278833288734, + "grad_norm": 0.5563370585441589, + "learning_rate": 0.0002993479425828628, + "loss": 1.0369, + "step": 4896 + }, + { + "epoch": 0.8736062795468736, + "grad_norm": 0.48165515065193176, + "learning_rate": 0.00029927925686287006, + "loss": 0.7797, + "step": 4897 + }, + { + "epoch": 0.8737846757648737, + "grad_norm": 0.5211619734764099, + "learning_rate": 0.0002992105672725058, + "loss": 0.8904, + "step": 4898 + }, + { + "epoch": 0.8739630719828739, + "grad_norm": 0.5099582672119141, + "learning_rate": 0.00029914187381716473, + "loss": 0.7874, + "step": 4899 + }, + { + "epoch": 0.8741414682008741, + "grad_norm": 0.48764798045158386, + "learning_rate": 0.00029907317650224204, + "loss": 0.6851, + "step": 4900 + }, + { + "epoch": 0.8743198644188743, + "grad_norm": 0.5231783986091614, + "learning_rate": 0.0002990044753331332, + "loss": 0.7969, + "step": 4901 + }, + { + "epoch": 0.8744982606368745, + "grad_norm": 0.5043377876281738, + "learning_rate": 0.00029893577031523403, + "loss": 0.7843, + "step": 4902 + }, + { + "epoch": 0.8746766568548747, + "grad_norm": 0.4754558801651001, + "learning_rate": 0.0002988670614539404, + "loss": 0.7068, + "step": 4903 + }, + { + "epoch": 0.8748550530728748, + "grad_norm": 0.4813019037246704, + "learning_rate": 0.0002987983487546488, + "loss": 0.8119, + "step": 4904 + }, + { + "epoch": 0.875033449290875, + "grad_norm": 0.5080868601799011, + "learning_rate": 0.0002987296322227559, + "loss": 0.993, + "step": 4905 + }, + { + "epoch": 0.8752118455088752, + "grad_norm": 0.5094708800315857, + "learning_rate": 0.00029866091186365865, + "loss": 1.0395, + "step": 4906 + }, + { + "epoch": 0.8753902417268754, + "grad_norm": 0.5148309469223022, + "learning_rate": 0.0002985921876827544, + "loss": 0.8391, + "step": 4907 + }, + { + "epoch": 0.8755686379448756, + "grad_norm": 0.47196096181869507, + "learning_rate": 0.00029852345968544057, + "loss": 0.7106, + "step": 4908 + }, + { + "epoch": 0.8757470341628757, + "grad_norm": 0.5898258090019226, + "learning_rate": 0.00029845472787711516, + "loss": 0.8076, + "step": 4909 + }, + { + "epoch": 0.8759254303808759, + "grad_norm": 0.5204131007194519, + "learning_rate": 0.0002983859922631762, + "loss": 0.7482, + "step": 4910 + }, + { + "epoch": 0.8761038265988761, + "grad_norm": 0.4837625026702881, + "learning_rate": 0.0002983172528490223, + "loss": 0.8764, + "step": 4911 + }, + { + "epoch": 0.8762822228168763, + "grad_norm": 0.46985456347465515, + "learning_rate": 0.00029824850964005215, + "loss": 0.6421, + "step": 4912 + }, + { + "epoch": 0.8764606190348765, + "grad_norm": 0.8495760560035706, + "learning_rate": 0.00029817976264166475, + "loss": 0.9803, + "step": 4913 + }, + { + "epoch": 0.8766390152528767, + "grad_norm": 0.5043582916259766, + "learning_rate": 0.00029811101185925955, + "loss": 1.0006, + "step": 4914 + }, + { + "epoch": 0.8768174114708768, + "grad_norm": 13.705134391784668, + "learning_rate": 0.00029804225729823615, + "loss": 0.9303, + "step": 4915 + }, + { + "epoch": 0.876995807688877, + "grad_norm": 0.5199925899505615, + "learning_rate": 0.00029797349896399457, + "loss": 0.8513, + "step": 4916 + }, + { + "epoch": 0.8771742039068772, + "grad_norm": 0.5620130300521851, + "learning_rate": 0.000297904736861935, + "loss": 0.7335, + "step": 4917 + }, + { + "epoch": 0.8773526001248774, + "grad_norm": 1.1893454790115356, + "learning_rate": 0.0002978359709974581, + "loss": 0.9265, + "step": 4918 + }, + { + "epoch": 0.8775309963428776, + "grad_norm": 0.9579685926437378, + "learning_rate": 0.0002977672013759645, + "loss": 0.7829, + "step": 4919 + }, + { + "epoch": 0.8777093925608777, + "grad_norm": 0.8897736668586731, + "learning_rate": 0.0002976984280028556, + "loss": 0.8619, + "step": 4920 + }, + { + "epoch": 0.8778877887788779, + "grad_norm": 0.6339399814605713, + "learning_rate": 0.00029762965088353256, + "loss": 0.9676, + "step": 4921 + }, + { + "epoch": 0.878066184996878, + "grad_norm": 0.5451995134353638, + "learning_rate": 0.00029756087002339734, + "loss": 1.0137, + "step": 4922 + }, + { + "epoch": 0.8782445812148783, + "grad_norm": 0.4819452166557312, + "learning_rate": 0.00029749208542785175, + "loss": 0.8346, + "step": 4923 + }, + { + "epoch": 0.8784229774328784, + "grad_norm": 0.500859260559082, + "learning_rate": 0.0002974232971022983, + "loss": 0.7695, + "step": 4924 + }, + { + "epoch": 0.8786013736508786, + "grad_norm": 0.7881564497947693, + "learning_rate": 0.00029735450505213943, + "loss": 1.0873, + "step": 4925 + }, + { + "epoch": 0.8787797698688787, + "grad_norm": 1.3103078603744507, + "learning_rate": 0.0002972857092827781, + "loss": 0.889, + "step": 4926 + }, + { + "epoch": 0.8789581660868789, + "grad_norm": 0.5103085041046143, + "learning_rate": 0.00029721690979961764, + "loss": 0.7738, + "step": 4927 + }, + { + "epoch": 0.8791365623048791, + "grad_norm": 0.7508847117424011, + "learning_rate": 0.0002971481066080613, + "loss": 0.7429, + "step": 4928 + }, + { + "epoch": 0.8793149585228793, + "grad_norm": 0.5857597589492798, + "learning_rate": 0.000297079299713513, + "loss": 0.9272, + "step": 4929 + }, + { + "epoch": 0.8794933547408795, + "grad_norm": 0.5782235264778137, + "learning_rate": 0.00029701048912137676, + "loss": 0.9134, + "step": 4930 + }, + { + "epoch": 0.8796717509588796, + "grad_norm": 0.5405158400535583, + "learning_rate": 0.00029694167483705684, + "loss": 0.8666, + "step": 4931 + }, + { + "epoch": 0.8798501471768798, + "grad_norm": 0.5647017359733582, + "learning_rate": 0.0002968728568659581, + "loss": 1.007, + "step": 4932 + }, + { + "epoch": 0.88002854339488, + "grad_norm": 0.537854015827179, + "learning_rate": 0.0002968040352134853, + "loss": 0.9957, + "step": 4933 + }, + { + "epoch": 0.8802069396128802, + "grad_norm": 0.5491927862167358, + "learning_rate": 0.00029673520988504376, + "loss": 0.9025, + "step": 4934 + }, + { + "epoch": 0.8803853358308804, + "grad_norm": 0.5295958518981934, + "learning_rate": 0.0002966663808860389, + "loss": 0.9705, + "step": 4935 + }, + { + "epoch": 0.8805637320488806, + "grad_norm": 0.5388423800468445, + "learning_rate": 0.0002965975482218766, + "loss": 0.7576, + "step": 4936 + }, + { + "epoch": 0.8807421282668807, + "grad_norm": 0.5519341230392456, + "learning_rate": 0.00029652871189796284, + "loss": 0.8774, + "step": 4937 + }, + { + "epoch": 0.8809205244848809, + "grad_norm": 0.5609647035598755, + "learning_rate": 0.00029645987191970414, + "loss": 0.9987, + "step": 4938 + }, + { + "epoch": 0.8810989207028811, + "grad_norm": 0.47703075408935547, + "learning_rate": 0.000296391028292507, + "loss": 0.7438, + "step": 4939 + }, + { + "epoch": 0.8812773169208813, + "grad_norm": 0.5322765111923218, + "learning_rate": 0.0002963221810217786, + "loss": 1.1033, + "step": 4940 + }, + { + "epoch": 0.8814557131388815, + "grad_norm": 0.4712493121623993, + "learning_rate": 0.000296253330112926, + "loss": 0.7569, + "step": 4941 + }, + { + "epoch": 0.8816341093568816, + "grad_norm": 0.5673714876174927, + "learning_rate": 0.00029618447557135677, + "loss": 0.7299, + "step": 4942 + }, + { + "epoch": 0.8818125055748818, + "grad_norm": 0.5213398337364197, + "learning_rate": 0.00029611561740247854, + "loss": 0.828, + "step": 4943 + }, + { + "epoch": 0.881990901792882, + "grad_norm": 0.4355573058128357, + "learning_rate": 0.0002960467556116997, + "loss": 0.6324, + "step": 4944 + }, + { + "epoch": 0.8821692980108822, + "grad_norm": 0.53075110912323, + "learning_rate": 0.0002959778902044285, + "loss": 0.852, + "step": 4945 + }, + { + "epoch": 0.8823476942288824, + "grad_norm": 0.5229327082633972, + "learning_rate": 0.00029590902118607353, + "loss": 0.8341, + "step": 4946 + }, + { + "epoch": 0.8825260904468826, + "grad_norm": 0.5234019160270691, + "learning_rate": 0.00029584014856204387, + "loss": 0.9364, + "step": 4947 + }, + { + "epoch": 0.8827044866648827, + "grad_norm": 0.5076524019241333, + "learning_rate": 0.0002957712723377487, + "loss": 0.8151, + "step": 4948 + }, + { + "epoch": 0.8828828828828829, + "grad_norm": 0.4554321765899658, + "learning_rate": 0.00029570239251859744, + "loss": 0.8135, + "step": 4949 + }, + { + "epoch": 0.8830612791008831, + "grad_norm": 0.47473978996276855, + "learning_rate": 0.0002956335091099999, + "loss": 0.7501, + "step": 4950 + }, + { + "epoch": 0.8832396753188833, + "grad_norm": 0.4464404284954071, + "learning_rate": 0.00029556462211736614, + "loss": 0.7138, + "step": 4951 + }, + { + "epoch": 0.8834180715368835, + "grad_norm": 0.5105646848678589, + "learning_rate": 0.0002954957315461066, + "loss": 0.9071, + "step": 4952 + }, + { + "epoch": 0.8835964677548835, + "grad_norm": 0.49081435799598694, + "learning_rate": 0.00029542683740163203, + "loss": 0.9046, + "step": 4953 + }, + { + "epoch": 0.8837748639728837, + "grad_norm": 0.45001259446144104, + "learning_rate": 0.0002953579396893531, + "loss": 0.7903, + "step": 4954 + }, + { + "epoch": 0.8839532601908839, + "grad_norm": 0.512221097946167, + "learning_rate": 0.00029528903841468106, + "loss": 0.7811, + "step": 4955 + }, + { + "epoch": 0.8841316564088841, + "grad_norm": 0.9528411030769348, + "learning_rate": 0.0002952201335830275, + "loss": 0.9261, + "step": 4956 + }, + { + "epoch": 0.8843100526268843, + "grad_norm": 0.5219822525978088, + "learning_rate": 0.00029515122519980407, + "loss": 0.7963, + "step": 4957 + }, + { + "epoch": 0.8844884488448845, + "grad_norm": 0.4960407614707947, + "learning_rate": 0.0002950823132704228, + "loss": 0.8326, + "step": 4958 + }, + { + "epoch": 0.8846668450628846, + "grad_norm": 0.6182901263237, + "learning_rate": 0.0002950133978002961, + "loss": 0.8034, + "step": 4959 + }, + { + "epoch": 0.8848452412808848, + "grad_norm": 0.5232715010643005, + "learning_rate": 0.00029494447879483657, + "loss": 0.9416, + "step": 4960 + }, + { + "epoch": 0.885023637498885, + "grad_norm": 0.514710009098053, + "learning_rate": 0.00029487555625945695, + "loss": 0.8282, + "step": 4961 + }, + { + "epoch": 0.8852020337168852, + "grad_norm": 0.49220260977745056, + "learning_rate": 0.0002948066301995704, + "loss": 0.7967, + "step": 4962 + }, + { + "epoch": 0.8853804299348854, + "grad_norm": 0.5210900902748108, + "learning_rate": 0.0002947377006205905, + "loss": 0.8346, + "step": 4963 + }, + { + "epoch": 0.8855588261528856, + "grad_norm": 0.5323128700256348, + "learning_rate": 0.0002946687675279308, + "loss": 0.7406, + "step": 4964 + }, + { + "epoch": 0.8857372223708857, + "grad_norm": 0.4670845866203308, + "learning_rate": 0.0002945998309270053, + "loss": 0.6944, + "step": 4965 + }, + { + "epoch": 0.8859156185888859, + "grad_norm": 0.4710376262664795, + "learning_rate": 0.0002945308908232283, + "loss": 0.6828, + "step": 4966 + }, + { + "epoch": 0.8860940148068861, + "grad_norm": 0.4978684186935425, + "learning_rate": 0.0002944619472220143, + "loss": 0.8463, + "step": 4967 + }, + { + "epoch": 0.8862724110248863, + "grad_norm": 0.5263177156448364, + "learning_rate": 0.0002943930001287781, + "loss": 0.8929, + "step": 4968 + }, + { + "epoch": 0.8864508072428865, + "grad_norm": 0.5331815481185913, + "learning_rate": 0.0002943240495489348, + "loss": 1.123, + "step": 4969 + }, + { + "epoch": 0.8866292034608866, + "grad_norm": 0.5104199051856995, + "learning_rate": 0.00029425509548789965, + "loss": 0.7605, + "step": 4970 + }, + { + "epoch": 0.8868075996788868, + "grad_norm": 0.5087189674377441, + "learning_rate": 0.00029418613795108837, + "loss": 0.8925, + "step": 4971 + }, + { + "epoch": 0.886985995896887, + "grad_norm": 0.4605477750301361, + "learning_rate": 0.0002941171769439168, + "loss": 0.8883, + "step": 4972 + }, + { + "epoch": 0.8871643921148872, + "grad_norm": 0.476589173078537, + "learning_rate": 0.0002940482124718012, + "loss": 0.8115, + "step": 4973 + }, + { + "epoch": 0.8873427883328874, + "grad_norm": 0.5065289735794067, + "learning_rate": 0.00029397924454015797, + "loss": 0.9306, + "step": 4974 + }, + { + "epoch": 0.8875211845508876, + "grad_norm": 0.5235819220542908, + "learning_rate": 0.0002939102731544037, + "loss": 1.0201, + "step": 4975 + }, + { + "epoch": 0.8876995807688877, + "grad_norm": 0.6401598453521729, + "learning_rate": 0.0002938412983199555, + "loss": 0.9119, + "step": 4976 + }, + { + "epoch": 0.8878779769868879, + "grad_norm": 0.5362576246261597, + "learning_rate": 0.00029377232004223065, + "loss": 1.006, + "step": 4977 + }, + { + "epoch": 0.8880563732048881, + "grad_norm": 0.4627399146556854, + "learning_rate": 0.00029370333832664657, + "loss": 0.7415, + "step": 4978 + }, + { + "epoch": 0.8882347694228883, + "grad_norm": 0.46277347207069397, + "learning_rate": 0.0002936343531786212, + "loss": 0.8446, + "step": 4979 + }, + { + "epoch": 0.8884131656408885, + "grad_norm": 0.48642510175704956, + "learning_rate": 0.0002935653646035724, + "loss": 0.9397, + "step": 4980 + }, + { + "epoch": 0.8885915618588885, + "grad_norm": 0.4110255837440491, + "learning_rate": 0.00029349637260691865, + "loss": 0.7168, + "step": 4981 + }, + { + "epoch": 0.8887699580768887, + "grad_norm": 0.47086673974990845, + "learning_rate": 0.0002934273771940785, + "loss": 0.7632, + "step": 4982 + }, + { + "epoch": 0.8889483542948889, + "grad_norm": 0.48255836963653564, + "learning_rate": 0.0002933583783704709, + "loss": 0.937, + "step": 4983 + }, + { + "epoch": 0.8891267505128891, + "grad_norm": 0.44749948382377625, + "learning_rate": 0.00029328937614151487, + "loss": 0.7854, + "step": 4984 + }, + { + "epoch": 0.8893051467308893, + "grad_norm": 0.5080204010009766, + "learning_rate": 0.0002932203705126298, + "loss": 0.7553, + "step": 4985 + }, + { + "epoch": 0.8894835429488895, + "grad_norm": 0.47025632858276367, + "learning_rate": 0.0002931513614892355, + "loss": 0.8807, + "step": 4986 + }, + { + "epoch": 0.8896619391668896, + "grad_norm": 0.46297869086265564, + "learning_rate": 0.0002930823490767519, + "loss": 0.8596, + "step": 4987 + }, + { + "epoch": 0.8898403353848898, + "grad_norm": 0.413412868976593, + "learning_rate": 0.0002930133332805991, + "loss": 0.7766, + "step": 4988 + }, + { + "epoch": 0.89001873160289, + "grad_norm": 0.4820116460323334, + "learning_rate": 0.0002929443141061975, + "loss": 0.8289, + "step": 4989 + }, + { + "epoch": 0.8901971278208902, + "grad_norm": 0.48540276288986206, + "learning_rate": 0.00029287529155896805, + "loss": 0.9301, + "step": 4990 + }, + { + "epoch": 0.8903755240388904, + "grad_norm": 1.949074149131775, + "learning_rate": 0.0002928062656443317, + "loss": 0.6227, + "step": 4991 + }, + { + "epoch": 0.8905539202568905, + "grad_norm": 2.410193920135498, + "learning_rate": 0.00029273723636770953, + "loss": 0.699, + "step": 4992 + }, + { + "epoch": 0.8907323164748907, + "grad_norm": 2.254347562789917, + "learning_rate": 0.00029266820373452334, + "loss": 0.7326, + "step": 4993 + }, + { + "epoch": 0.8909107126928909, + "grad_norm": 0.8029910326004028, + "learning_rate": 0.00029259916775019475, + "loss": 0.8962, + "step": 4994 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.6749674081802368, + "learning_rate": 0.0002925301284201458, + "loss": 0.9521, + "step": 4995 + }, + { + "epoch": 0.8912675051288913, + "grad_norm": 0.700823962688446, + "learning_rate": 0.00029246108574979896, + "loss": 0.8662, + "step": 4996 + }, + { + "epoch": 0.8914459013468915, + "grad_norm": 0.7377640604972839, + "learning_rate": 0.0002923920397445766, + "loss": 0.9585, + "step": 4997 + }, + { + "epoch": 0.8916242975648916, + "grad_norm": 0.5655547380447388, + "learning_rate": 0.00029232299040990174, + "loss": 0.8825, + "step": 4998 + }, + { + "epoch": 0.8918026937828918, + "grad_norm": 0.4939744472503662, + "learning_rate": 0.0002922539377511974, + "loss": 0.7917, + "step": 4999 + }, + { + "epoch": 0.891981090000892, + "grad_norm": 0.5311207175254822, + "learning_rate": 0.00029218488177388705, + "loss": 1.0183, + "step": 5000 + }, + { + "epoch": 0.8921594862188922, + "grad_norm": 0.5403327345848083, + "learning_rate": 0.00029211582248339424, + "loss": 0.7361, + "step": 5001 + }, + { + "epoch": 0.8923378824368924, + "grad_norm": 0.47330448031425476, + "learning_rate": 0.0002920467598851428, + "loss": 0.6321, + "step": 5002 + }, + { + "epoch": 0.8925162786548925, + "grad_norm": 0.5282453298568726, + "learning_rate": 0.0002919776939845569, + "loss": 0.8446, + "step": 5003 + }, + { + "epoch": 0.8926946748728927, + "grad_norm": 0.48607364296913147, + "learning_rate": 0.000291908624787061, + "loss": 0.8299, + "step": 5004 + }, + { + "epoch": 0.8928730710908929, + "grad_norm": 0.43826302886009216, + "learning_rate": 0.0002918395522980798, + "loss": 0.8356, + "step": 5005 + }, + { + "epoch": 0.8930514673088931, + "grad_norm": 0.4682931900024414, + "learning_rate": 0.00029177047652303816, + "loss": 0.835, + "step": 5006 + }, + { + "epoch": 0.8932298635268933, + "grad_norm": 0.6613882780075073, + "learning_rate": 0.0002917013974673612, + "loss": 0.7867, + "step": 5007 + }, + { + "epoch": 0.8934082597448935, + "grad_norm": 0.41586834192276, + "learning_rate": 0.00029163231513647454, + "loss": 0.772, + "step": 5008 + }, + { + "epoch": 0.8935866559628936, + "grad_norm": 0.5220624804496765, + "learning_rate": 0.00029156322953580367, + "loss": 0.8566, + "step": 5009 + }, + { + "epoch": 0.8937650521808937, + "grad_norm": 0.48346883058547974, + "learning_rate": 0.00029149414067077467, + "loss": 0.9499, + "step": 5010 + }, + { + "epoch": 0.893943448398894, + "grad_norm": 0.4970376789569855, + "learning_rate": 0.00029142504854681375, + "loss": 1.0051, + "step": 5011 + }, + { + "epoch": 0.8941218446168941, + "grad_norm": 0.532818615436554, + "learning_rate": 0.0002913559531693472, + "loss": 0.8754, + "step": 5012 + }, + { + "epoch": 0.8943002408348943, + "grad_norm": 0.4270954430103302, + "learning_rate": 0.00029128685454380207, + "loss": 0.6813, + "step": 5013 + }, + { + "epoch": 0.8944786370528944, + "grad_norm": 0.5017000436782837, + "learning_rate": 0.0002912177526756051, + "loss": 0.9326, + "step": 5014 + }, + { + "epoch": 0.8946570332708946, + "grad_norm": 0.5018951892852783, + "learning_rate": 0.0002911486475701835, + "loss": 0.867, + "step": 5015 + }, + { + "epoch": 0.8948354294888948, + "grad_norm": 0.5230339765548706, + "learning_rate": 0.0002910795392329649, + "loss": 0.8305, + "step": 5016 + }, + { + "epoch": 0.895013825706895, + "grad_norm": 0.4868851900100708, + "learning_rate": 0.00029101042766937693, + "loss": 0.8635, + "step": 5017 + }, + { + "epoch": 0.8951922219248952, + "grad_norm": 0.5056290626525879, + "learning_rate": 0.0002909413128848476, + "loss": 0.861, + "step": 5018 + }, + { + "epoch": 0.8953706181428954, + "grad_norm": 0.5414198040962219, + "learning_rate": 0.0002908721948848052, + "loss": 0.749, + "step": 5019 + }, + { + "epoch": 0.8955490143608955, + "grad_norm": 0.4794885814189911, + "learning_rate": 0.00029080307367467824, + "loss": 0.7712, + "step": 5020 + }, + { + "epoch": 0.8957274105788957, + "grad_norm": 0.454140841960907, + "learning_rate": 0.0002907339492598954, + "loss": 0.7231, + "step": 5021 + }, + { + "epoch": 0.8959058067968959, + "grad_norm": 0.5277546644210815, + "learning_rate": 0.0002906648216458857, + "loss": 0.8037, + "step": 5022 + }, + { + "epoch": 0.8960842030148961, + "grad_norm": 0.5510913729667664, + "learning_rate": 0.0002905956908380784, + "loss": 0.972, + "step": 5023 + }, + { + "epoch": 0.8962625992328963, + "grad_norm": 0.4881914556026459, + "learning_rate": 0.00029052655684190304, + "loss": 0.8542, + "step": 5024 + }, + { + "epoch": 0.8964409954508964, + "grad_norm": 0.5819581151008606, + "learning_rate": 0.0002904574196627893, + "loss": 0.6731, + "step": 5025 + }, + { + "epoch": 0.8966193916688966, + "grad_norm": 0.48652219772338867, + "learning_rate": 0.0002903882793061673, + "loss": 0.7228, + "step": 5026 + }, + { + "epoch": 0.8967977878868968, + "grad_norm": 0.49010154604911804, + "learning_rate": 0.00029031913577746716, + "loss": 0.7869, + "step": 5027 + }, + { + "epoch": 0.896976184104897, + "grad_norm": 0.4732781648635864, + "learning_rate": 0.00029024998908211945, + "loss": 0.8617, + "step": 5028 + }, + { + "epoch": 0.8971545803228972, + "grad_norm": 0.47294124960899353, + "learning_rate": 0.000290180839225555, + "loss": 1.0253, + "step": 5029 + }, + { + "epoch": 0.8973329765408974, + "grad_norm": 0.4338665306568146, + "learning_rate": 0.00029011168621320466, + "loss": 0.6808, + "step": 5030 + }, + { + "epoch": 0.8975113727588975, + "grad_norm": 0.5743736028671265, + "learning_rate": 0.00029004253005049976, + "loss": 0.6331, + "step": 5031 + }, + { + "epoch": 0.8976897689768977, + "grad_norm": 0.4324184060096741, + "learning_rate": 0.0002899733707428718, + "loss": 0.7521, + "step": 5032 + }, + { + "epoch": 0.8978681651948979, + "grad_norm": 0.95427405834198, + "learning_rate": 0.0002899042082957525, + "loss": 0.8488, + "step": 5033 + }, + { + "epoch": 0.8980465614128981, + "grad_norm": 0.4829496741294861, + "learning_rate": 0.00028983504271457385, + "loss": 0.8179, + "step": 5034 + }, + { + "epoch": 0.8982249576308983, + "grad_norm": 0.47819754481315613, + "learning_rate": 0.000289765874004768, + "loss": 0.8563, + "step": 5035 + }, + { + "epoch": 0.8984033538488984, + "grad_norm": 0.5096835494041443, + "learning_rate": 0.0002896967021717676, + "loss": 0.7857, + "step": 5036 + }, + { + "epoch": 0.8985817500668986, + "grad_norm": 0.47968176007270813, + "learning_rate": 0.0002896275272210053, + "loss": 0.7974, + "step": 5037 + }, + { + "epoch": 0.8987601462848988, + "grad_norm": 0.38809946179389954, + "learning_rate": 0.00028955834915791404, + "loss": 0.5523, + "step": 5038 + }, + { + "epoch": 0.898938542502899, + "grad_norm": 0.47436532378196716, + "learning_rate": 0.0002894891679879271, + "loss": 0.6803, + "step": 5039 + }, + { + "epoch": 0.8991169387208992, + "grad_norm": 0.4444058835506439, + "learning_rate": 0.0002894199837164779, + "loss": 0.6817, + "step": 5040 + }, + { + "epoch": 0.8992953349388993, + "grad_norm": 0.4691022038459778, + "learning_rate": 0.00028935079634900016, + "loss": 0.8635, + "step": 5041 + }, + { + "epoch": 0.8994737311568994, + "grad_norm": 0.4624834358692169, + "learning_rate": 0.0002892816058909277, + "loss": 0.8735, + "step": 5042 + }, + { + "epoch": 0.8996521273748996, + "grad_norm": 0.4934956431388855, + "learning_rate": 0.00028921241234769484, + "loss": 1.0094, + "step": 5043 + }, + { + "epoch": 0.8998305235928998, + "grad_norm": 0.481513649225235, + "learning_rate": 0.000289143215724736, + "loss": 0.8711, + "step": 5044 + }, + { + "epoch": 0.9000089198109, + "grad_norm": 0.47422125935554504, + "learning_rate": 0.0002890740160274859, + "loss": 0.8002, + "step": 5045 + }, + { + "epoch": 0.9001873160289002, + "grad_norm": 0.5127121806144714, + "learning_rate": 0.00028900481326137945, + "loss": 0.7709, + "step": 5046 + }, + { + "epoch": 0.9003657122469003, + "grad_norm": 0.4381260275840759, + "learning_rate": 0.00028893560743185166, + "loss": 0.693, + "step": 5047 + }, + { + "epoch": 0.9005441084649005, + "grad_norm": 0.49497178196907043, + "learning_rate": 0.000288866398544338, + "loss": 0.7078, + "step": 5048 + }, + { + "epoch": 0.9007225046829007, + "grad_norm": 0.511747419834137, + "learning_rate": 0.00028879718660427417, + "loss": 0.8916, + "step": 5049 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.48092395067214966, + "learning_rate": 0.00028872797161709593, + "loss": 0.8367, + "step": 5050 + }, + { + "epoch": 0.9010792971189011, + "grad_norm": 0.4676063060760498, + "learning_rate": 0.0002886587535882395, + "loss": 0.7425, + "step": 5051 + }, + { + "epoch": 0.9012576933369013, + "grad_norm": 0.45306262373924255, + "learning_rate": 0.00028858953252314126, + "loss": 0.7591, + "step": 5052 + }, + { + "epoch": 0.9014360895549014, + "grad_norm": 0.5242826342582703, + "learning_rate": 0.0002885203084272377, + "loss": 0.922, + "step": 5053 + }, + { + "epoch": 0.9016144857729016, + "grad_norm": 0.4822898209095001, + "learning_rate": 0.0002884510813059657, + "loss": 0.9461, + "step": 5054 + }, + { + "epoch": 0.9017928819909018, + "grad_norm": 0.46535515785217285, + "learning_rate": 0.0002883818511647623, + "loss": 0.8912, + "step": 5055 + }, + { + "epoch": 0.901971278208902, + "grad_norm": 0.4802592694759369, + "learning_rate": 0.0002883126180090648, + "loss": 0.9853, + "step": 5056 + }, + { + "epoch": 0.9021496744269022, + "grad_norm": 0.497173547744751, + "learning_rate": 0.0002882433818443109, + "loss": 0.9567, + "step": 5057 + }, + { + "epoch": 0.9023280706449023, + "grad_norm": 0.5876064896583557, + "learning_rate": 0.00028817414267593805, + "loss": 0.9769, + "step": 5058 + }, + { + "epoch": 0.9025064668629025, + "grad_norm": 0.4283727705478668, + "learning_rate": 0.0002881049005093846, + "loss": 0.7212, + "step": 5059 + }, + { + "epoch": 0.9026848630809027, + "grad_norm": 0.5137040019035339, + "learning_rate": 0.0002880356553500886, + "loss": 0.7794, + "step": 5060 + }, + { + "epoch": 0.9028632592989029, + "grad_norm": 0.6085144281387329, + "learning_rate": 0.00028796640720348866, + "loss": 0.8848, + "step": 5061 + }, + { + "epoch": 0.9030416555169031, + "grad_norm": 0.5343783497810364, + "learning_rate": 0.0002878971560750234, + "loss": 0.7601, + "step": 5062 + }, + { + "epoch": 0.9032200517349033, + "grad_norm": 0.5269213914871216, + "learning_rate": 0.0002878279019701318, + "loss": 0.8196, + "step": 5063 + }, + { + "epoch": 0.9033984479529034, + "grad_norm": 0.4757898449897766, + "learning_rate": 0.00028775864489425306, + "loss": 0.8186, + "step": 5064 + }, + { + "epoch": 0.9035768441709036, + "grad_norm": 0.5592185258865356, + "learning_rate": 0.0002876893848528266, + "loss": 1.1946, + "step": 5065 + }, + { + "epoch": 0.9037552403889038, + "grad_norm": 0.43977245688438416, + "learning_rate": 0.0002876201218512921, + "loss": 0.7215, + "step": 5066 + }, + { + "epoch": 0.903933636606904, + "grad_norm": 0.49869504570961, + "learning_rate": 0.0002875508558950894, + "loss": 0.8863, + "step": 5067 + }, + { + "epoch": 0.9041120328249042, + "grad_norm": 0.47307130694389343, + "learning_rate": 0.00028748158698965867, + "loss": 0.7705, + "step": 5068 + }, + { + "epoch": 0.9042904290429042, + "grad_norm": 0.43691718578338623, + "learning_rate": 0.00028741231514044013, + "loss": 0.6947, + "step": 5069 + }, + { + "epoch": 0.9044688252609044, + "grad_norm": 0.4944881200790405, + "learning_rate": 0.00028734304035287454, + "loss": 0.7556, + "step": 5070 + }, + { + "epoch": 0.9046472214789046, + "grad_norm": 0.48639604449272156, + "learning_rate": 0.00028727376263240265, + "loss": 0.7911, + "step": 5071 + }, + { + "epoch": 0.9048256176969048, + "grad_norm": 0.4833623468875885, + "learning_rate": 0.0002872044819844654, + "loss": 0.802, + "step": 5072 + }, + { + "epoch": 0.905004013914905, + "grad_norm": 0.47726356983184814, + "learning_rate": 0.0002871351984145042, + "loss": 0.8817, + "step": 5073 + }, + { + "epoch": 0.9051824101329052, + "grad_norm": 0.5050178170204163, + "learning_rate": 0.0002870659119279605, + "loss": 1.0378, + "step": 5074 + }, + { + "epoch": 0.9053608063509053, + "grad_norm": 0.4867967367172241, + "learning_rate": 0.00028699662253027606, + "loss": 0.7799, + "step": 5075 + }, + { + "epoch": 0.9055392025689055, + "grad_norm": 0.4266904890537262, + "learning_rate": 0.00028692733022689273, + "loss": 0.753, + "step": 5076 + }, + { + "epoch": 0.9057175987869057, + "grad_norm": 0.4467776417732239, + "learning_rate": 0.0002868580350232528, + "loss": 0.6849, + "step": 5077 + }, + { + "epoch": 0.9058959950049059, + "grad_norm": 0.46932464838027954, + "learning_rate": 0.0002867887369247987, + "loss": 1.0174, + "step": 5078 + }, + { + "epoch": 0.9060743912229061, + "grad_norm": 0.5115159153938293, + "learning_rate": 0.000286719435936973, + "loss": 0.8624, + "step": 5079 + }, + { + "epoch": 0.9062527874409062, + "grad_norm": 0.4789755046367645, + "learning_rate": 0.0002866501320652186, + "loss": 0.9705, + "step": 5080 + }, + { + "epoch": 0.9064311836589064, + "grad_norm": 0.4835361838340759, + "learning_rate": 0.0002865808253149786, + "loss": 0.7777, + "step": 5081 + }, + { + "epoch": 0.9066095798769066, + "grad_norm": 0.4759388566017151, + "learning_rate": 0.0002865115156916963, + "loss": 0.7886, + "step": 5082 + }, + { + "epoch": 0.9067879760949068, + "grad_norm": 0.5399003028869629, + "learning_rate": 0.0002864422032008153, + "loss": 0.9909, + "step": 5083 + }, + { + "epoch": 0.906966372312907, + "grad_norm": 0.5101630687713623, + "learning_rate": 0.0002863728878477793, + "loss": 0.932, + "step": 5084 + }, + { + "epoch": 0.9071447685309072, + "grad_norm": 0.5102178454399109, + "learning_rate": 0.0002863035696380324, + "loss": 0.9908, + "step": 5085 + }, + { + "epoch": 0.9073231647489073, + "grad_norm": 0.5420662760734558, + "learning_rate": 0.0002862342485770188, + "loss": 1.0111, + "step": 5086 + }, + { + "epoch": 0.9075015609669075, + "grad_norm": 0.4109286665916443, + "learning_rate": 0.00028616492467018286, + "loss": 0.8073, + "step": 5087 + }, + { + "epoch": 0.9076799571849077, + "grad_norm": 0.489951491355896, + "learning_rate": 0.0002860955979229693, + "loss": 0.8515, + "step": 5088 + }, + { + "epoch": 0.9078583534029079, + "grad_norm": 0.45059677958488464, + "learning_rate": 0.00028602626834082297, + "loss": 0.7568, + "step": 5089 + }, + { + "epoch": 0.9080367496209081, + "grad_norm": 0.4857634902000427, + "learning_rate": 0.00028595693592918905, + "loss": 0.8072, + "step": 5090 + }, + { + "epoch": 0.9082151458389082, + "grad_norm": 0.4536861479282379, + "learning_rate": 0.00028588760069351286, + "loss": 0.7684, + "step": 5091 + }, + { + "epoch": 0.9083935420569084, + "grad_norm": 0.4914097487926483, + "learning_rate": 0.0002858182626392399, + "loss": 0.7621, + "step": 5092 + }, + { + "epoch": 0.9085719382749086, + "grad_norm": 0.4842384159564972, + "learning_rate": 0.0002857489217718162, + "loss": 0.8194, + "step": 5093 + }, + { + "epoch": 0.9087503344929088, + "grad_norm": 0.6547600030899048, + "learning_rate": 0.00028567957809668744, + "loss": 0.9461, + "step": 5094 + }, + { + "epoch": 0.908928730710909, + "grad_norm": 0.7207086682319641, + "learning_rate": 0.00028561023161929996, + "loss": 0.7638, + "step": 5095 + }, + { + "epoch": 0.9091071269289092, + "grad_norm": 0.4441107511520386, + "learning_rate": 0.0002855408823451002, + "loss": 0.7315, + "step": 5096 + }, + { + "epoch": 0.9092855231469092, + "grad_norm": 0.42800113558769226, + "learning_rate": 0.00028547153027953483, + "loss": 0.6776, + "step": 5097 + }, + { + "epoch": 0.9094639193649094, + "grad_norm": 0.46136319637298584, + "learning_rate": 0.00028540217542805075, + "loss": 0.9207, + "step": 5098 + }, + { + "epoch": 0.9096423155829096, + "grad_norm": 0.4710569977760315, + "learning_rate": 0.0002853328177960951, + "loss": 0.8327, + "step": 5099 + }, + { + "epoch": 0.9098207118009098, + "grad_norm": 0.48937293887138367, + "learning_rate": 0.00028526345738911514, + "loss": 0.8316, + "step": 5100 + }, + { + "epoch": 0.90999910801891, + "grad_norm": 0.49789008498191833, + "learning_rate": 0.0002851940942125584, + "loss": 0.8759, + "step": 5101 + }, + { + "epoch": 0.9101775042369101, + "grad_norm": 0.4753686487674713, + "learning_rate": 0.0002851247282718726, + "loss": 0.8414, + "step": 5102 + }, + { + "epoch": 0.9103559004549103, + "grad_norm": 0.43923425674438477, + "learning_rate": 0.0002850553595725057, + "loss": 0.6973, + "step": 5103 + }, + { + "epoch": 0.9105342966729105, + "grad_norm": 0.4735580086708069, + "learning_rate": 0.000284985988119906, + "loss": 0.9447, + "step": 5104 + }, + { + "epoch": 0.9107126928909107, + "grad_norm": 0.543528139591217, + "learning_rate": 0.00028491661391952196, + "loss": 0.9476, + "step": 5105 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.48239144682884216, + "learning_rate": 0.00028484723697680197, + "loss": 0.9228, + "step": 5106 + }, + { + "epoch": 0.9110694853269111, + "grad_norm": 0.49379172921180725, + "learning_rate": 0.00028477785729719504, + "loss": 0.8531, + "step": 5107 + }, + { + "epoch": 0.9112478815449112, + "grad_norm": 0.4407452344894409, + "learning_rate": 0.00028470847488615015, + "loss": 0.6475, + "step": 5108 + }, + { + "epoch": 0.9114262777629114, + "grad_norm": 0.4452020227909088, + "learning_rate": 0.00028463908974911656, + "loss": 0.6565, + "step": 5109 + }, + { + "epoch": 0.9116046739809116, + "grad_norm": 0.48613160848617554, + "learning_rate": 0.0002845697018915437, + "loss": 0.6659, + "step": 5110 + }, + { + "epoch": 0.9117830701989118, + "grad_norm": 0.43221384286880493, + "learning_rate": 0.00028450031131888146, + "loss": 0.7194, + "step": 5111 + }, + { + "epoch": 0.911961466416912, + "grad_norm": 0.46625545620918274, + "learning_rate": 0.00028443091803657955, + "loss": 0.8089, + "step": 5112 + }, + { + "epoch": 0.9121398626349121, + "grad_norm": 0.4777173399925232, + "learning_rate": 0.0002843615220500881, + "loss": 0.8622, + "step": 5113 + }, + { + "epoch": 0.9123182588529123, + "grad_norm": 0.47781699895858765, + "learning_rate": 0.0002842921233648576, + "loss": 0.9369, + "step": 5114 + }, + { + "epoch": 0.9124966550709125, + "grad_norm": 0.5193250775337219, + "learning_rate": 0.0002842227219863385, + "loss": 0.967, + "step": 5115 + }, + { + "epoch": 0.9126750512889127, + "grad_norm": 0.46008825302124023, + "learning_rate": 0.00028415331791998145, + "loss": 0.9416, + "step": 5116 + }, + { + "epoch": 0.9128534475069129, + "grad_norm": 0.4777912199497223, + "learning_rate": 0.00028408391117123755, + "loss": 0.8793, + "step": 5117 + }, + { + "epoch": 0.9130318437249131, + "grad_norm": 0.45857903361320496, + "learning_rate": 0.00028401450174555794, + "loss": 0.908, + "step": 5118 + }, + { + "epoch": 0.9132102399429132, + "grad_norm": 0.4434804320335388, + "learning_rate": 0.000283945089648394, + "loss": 0.6586, + "step": 5119 + }, + { + "epoch": 0.9133886361609134, + "grad_norm": 0.47657546401023865, + "learning_rate": 0.0002838756748851973, + "loss": 0.5956, + "step": 5120 + }, + { + "epoch": 0.9135670323789136, + "grad_norm": 0.4849657118320465, + "learning_rate": 0.0002838062574614197, + "loss": 0.9679, + "step": 5121 + }, + { + "epoch": 0.9137454285969138, + "grad_norm": 0.5244645476341248, + "learning_rate": 0.00028373683738251314, + "loss": 1.0226, + "step": 5122 + }, + { + "epoch": 0.913923824814914, + "grad_norm": 0.4713945686817169, + "learning_rate": 0.00028366741465393, + "loss": 0.9147, + "step": 5123 + }, + { + "epoch": 0.914102221032914, + "grad_norm": 0.4651153087615967, + "learning_rate": 0.00028359798928112253, + "loss": 1.0022, + "step": 5124 + }, + { + "epoch": 0.9142806172509143, + "grad_norm": 0.4441218972206116, + "learning_rate": 0.0002835285612695434, + "loss": 0.7423, + "step": 5125 + }, + { + "epoch": 0.9144590134689145, + "grad_norm": 0.44443660974502563, + "learning_rate": 0.00028345913062464555, + "loss": 0.592, + "step": 5126 + }, + { + "epoch": 0.9146374096869146, + "grad_norm": 0.431494802236557, + "learning_rate": 0.00028338969735188196, + "loss": 0.7291, + "step": 5127 + }, + { + "epoch": 0.9148158059049148, + "grad_norm": 0.4383372366428375, + "learning_rate": 0.00028332026145670594, + "loss": 0.7891, + "step": 5128 + }, + { + "epoch": 0.914994202122915, + "grad_norm": 0.49089357256889343, + "learning_rate": 0.00028325082294457086, + "loss": 0.951, + "step": 5129 + }, + { + "epoch": 0.9151725983409151, + "grad_norm": 0.4921533167362213, + "learning_rate": 0.00028318138182093053, + "loss": 0.9203, + "step": 5130 + }, + { + "epoch": 0.9153509945589153, + "grad_norm": 0.48249009251594543, + "learning_rate": 0.0002831119380912387, + "loss": 0.7326, + "step": 5131 + }, + { + "epoch": 0.9155293907769155, + "grad_norm": 0.48014095425605774, + "learning_rate": 0.00028304249176094946, + "loss": 0.7696, + "step": 5132 + }, + { + "epoch": 0.9157077869949157, + "grad_norm": 0.5415335297584534, + "learning_rate": 0.00028297304283551725, + "loss": 0.9026, + "step": 5133 + }, + { + "epoch": 0.9158861832129159, + "grad_norm": 0.4653064012527466, + "learning_rate": 0.00028290359132039644, + "loss": 1.0105, + "step": 5134 + }, + { + "epoch": 0.916064579430916, + "grad_norm": 0.482500821352005, + "learning_rate": 0.00028283413722104164, + "loss": 0.8853, + "step": 5135 + }, + { + "epoch": 0.9162429756489162, + "grad_norm": 0.44477859139442444, + "learning_rate": 0.00028276468054290785, + "loss": 0.9191, + "step": 5136 + }, + { + "epoch": 0.9164213718669164, + "grad_norm": 0.4712953567504883, + "learning_rate": 0.00028269522129145013, + "loss": 0.7727, + "step": 5137 + }, + { + "epoch": 0.9165997680849166, + "grad_norm": 0.4651871621608734, + "learning_rate": 0.0002826257594721238, + "loss": 0.8326, + "step": 5138 + }, + { + "epoch": 0.9167781643029168, + "grad_norm": 0.4735598564147949, + "learning_rate": 0.00028255629509038447, + "loss": 0.8486, + "step": 5139 + }, + { + "epoch": 0.916956560520917, + "grad_norm": 0.4831654727458954, + "learning_rate": 0.00028248682815168767, + "loss": 0.7643, + "step": 5140 + }, + { + "epoch": 0.9171349567389171, + "grad_norm": 0.4459700584411621, + "learning_rate": 0.0002824173586614894, + "loss": 0.7463, + "step": 5141 + }, + { + "epoch": 0.9173133529569173, + "grad_norm": 0.5149909853935242, + "learning_rate": 0.0002823478866252456, + "loss": 0.9273, + "step": 5142 + }, + { + "epoch": 0.9174917491749175, + "grad_norm": 0.4949319362640381, + "learning_rate": 0.0002822784120484128, + "loss": 0.7418, + "step": 5143 + }, + { + "epoch": 0.9176701453929177, + "grad_norm": 0.47984182834625244, + "learning_rate": 0.00028220893493644737, + "loss": 0.7807, + "step": 5144 + }, + { + "epoch": 0.9178485416109179, + "grad_norm": 0.5391241908073425, + "learning_rate": 0.0002821394552948062, + "loss": 0.8166, + "step": 5145 + }, + { + "epoch": 0.918026937828918, + "grad_norm": 0.45041200518608093, + "learning_rate": 0.0002820699731289459, + "loss": 0.6369, + "step": 5146 + }, + { + "epoch": 0.9182053340469182, + "grad_norm": 0.45535093545913696, + "learning_rate": 0.00028200048844432375, + "loss": 0.6831, + "step": 5147 + }, + { + "epoch": 0.9183837302649184, + "grad_norm": 1.066908597946167, + "learning_rate": 0.000281931001246397, + "loss": 0.8285, + "step": 5148 + }, + { + "epoch": 0.9185621264829186, + "grad_norm": 0.46443071961402893, + "learning_rate": 0.0002818615115406231, + "loss": 0.7574, + "step": 5149 + }, + { + "epoch": 0.9187405227009188, + "grad_norm": 1.1375192403793335, + "learning_rate": 0.0002817920193324598, + "loss": 0.6816, + "step": 5150 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 0.49952948093414307, + "learning_rate": 0.000281722524627365, + "loss": 0.8653, + "step": 5151 + }, + { + "epoch": 0.9190973151369191, + "grad_norm": 0.978867769241333, + "learning_rate": 0.00028165302743079693, + "loss": 0.8213, + "step": 5152 + }, + { + "epoch": 0.9192757113549193, + "grad_norm": 0.4918247163295746, + "learning_rate": 0.0002815835277482135, + "loss": 0.8755, + "step": 5153 + }, + { + "epoch": 0.9194541075729195, + "grad_norm": 0.4470139443874359, + "learning_rate": 0.0002815140255850735, + "loss": 0.6909, + "step": 5154 + }, + { + "epoch": 0.9196325037909197, + "grad_norm": 0.47432026267051697, + "learning_rate": 0.0002814445209468354, + "loss": 0.8101, + "step": 5155 + }, + { + "epoch": 0.9198109000089199, + "grad_norm": 0.5474990606307983, + "learning_rate": 0.00028137501383895824, + "loss": 1.0695, + "step": 5156 + }, + { + "epoch": 0.9199892962269199, + "grad_norm": 0.47270458936691284, + "learning_rate": 0.00028130550426690095, + "loss": 0.7906, + "step": 5157 + }, + { + "epoch": 0.9201676924449201, + "grad_norm": 0.4652954339981079, + "learning_rate": 0.0002812359922361228, + "loss": 0.6902, + "step": 5158 + }, + { + "epoch": 0.9203460886629203, + "grad_norm": 0.4987577199935913, + "learning_rate": 0.00028116647775208335, + "loss": 0.7337, + "step": 5159 + }, + { + "epoch": 0.9205244848809205, + "grad_norm": 0.530262291431427, + "learning_rate": 0.0002810969608202421, + "loss": 0.9222, + "step": 5160 + }, + { + "epoch": 0.9207028810989207, + "grad_norm": 0.48284605145454407, + "learning_rate": 0.00028102744144605895, + "loss": 0.7397, + "step": 5161 + }, + { + "epoch": 0.9208812773169209, + "grad_norm": 0.4856660068035126, + "learning_rate": 0.00028095791963499384, + "loss": 0.7192, + "step": 5162 + }, + { + "epoch": 0.921059673534921, + "grad_norm": 0.7125374674797058, + "learning_rate": 0.0002808883953925071, + "loss": 1.0275, + "step": 5163 + }, + { + "epoch": 0.9212380697529212, + "grad_norm": 0.4433026611804962, + "learning_rate": 0.0002808188687240591, + "loss": 0.7499, + "step": 5164 + }, + { + "epoch": 0.9214164659709214, + "grad_norm": 0.44827622175216675, + "learning_rate": 0.00028074933963511035, + "loss": 0.8216, + "step": 5165 + }, + { + "epoch": 0.9215948621889216, + "grad_norm": 0.5151771306991577, + "learning_rate": 0.0002806798081311217, + "loss": 0.9786, + "step": 5166 + }, + { + "epoch": 0.9217732584069218, + "grad_norm": 0.5016061663627625, + "learning_rate": 0.0002806102742175542, + "loss": 1.008, + "step": 5167 + }, + { + "epoch": 0.9219516546249219, + "grad_norm": 0.4590628743171692, + "learning_rate": 0.00028054073789986883, + "loss": 0.7287, + "step": 5168 + }, + { + "epoch": 0.9221300508429221, + "grad_norm": 0.47854331135749817, + "learning_rate": 0.00028047119918352717, + "loss": 1.1118, + "step": 5169 + }, + { + "epoch": 0.9223084470609223, + "grad_norm": 0.49446046352386475, + "learning_rate": 0.00028040165807399054, + "loss": 1.0227, + "step": 5170 + }, + { + "epoch": 0.9224868432789225, + "grad_norm": 0.49778786301612854, + "learning_rate": 0.0002803321145767208, + "loss": 0.7794, + "step": 5171 + }, + { + "epoch": 0.9226652394969227, + "grad_norm": 0.46323299407958984, + "learning_rate": 0.0002802625686971798, + "loss": 0.8109, + "step": 5172 + }, + { + "epoch": 0.9228436357149229, + "grad_norm": 0.4294353425502777, + "learning_rate": 0.0002801930204408297, + "loss": 0.7047, + "step": 5173 + }, + { + "epoch": 0.923022031932923, + "grad_norm": 0.4814031720161438, + "learning_rate": 0.0002801234698131328, + "loss": 0.6691, + "step": 5174 + }, + { + "epoch": 0.9232004281509232, + "grad_norm": 0.5088090300559998, + "learning_rate": 0.0002800539168195515, + "loss": 0.7468, + "step": 5175 + }, + { + "epoch": 0.9233788243689234, + "grad_norm": 0.486738383769989, + "learning_rate": 0.00027998436146554857, + "loss": 0.8424, + "step": 5176 + }, + { + "epoch": 0.9235572205869236, + "grad_norm": 0.43971219658851624, + "learning_rate": 0.0002799148037565867, + "loss": 0.829, + "step": 5177 + }, + { + "epoch": 0.9237356168049238, + "grad_norm": 0.47104689478874207, + "learning_rate": 0.0002798452436981291, + "loss": 0.608, + "step": 5178 + }, + { + "epoch": 0.9239140130229239, + "grad_norm": 0.477444589138031, + "learning_rate": 0.0002797756812956389, + "loss": 0.7896, + "step": 5179 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.7103281617164612, + "learning_rate": 0.00027970611655457953, + "loss": 0.7084, + "step": 5180 + }, + { + "epoch": 0.9242708054589243, + "grad_norm": 1.2961317300796509, + "learning_rate": 0.0002796365494804144, + "loss": 0.9616, + "step": 5181 + }, + { + "epoch": 0.9244492016769245, + "grad_norm": 0.540886402130127, + "learning_rate": 0.00027956698007860754, + "loss": 0.8563, + "step": 5182 + }, + { + "epoch": 0.9246275978949247, + "grad_norm": 0.4906296133995056, + "learning_rate": 0.0002794974083546227, + "loss": 0.8361, + "step": 5183 + }, + { + "epoch": 0.9248059941129249, + "grad_norm": 0.47894391417503357, + "learning_rate": 0.0002794278343139242, + "loss": 0.881, + "step": 5184 + }, + { + "epoch": 0.9249843903309249, + "grad_norm": 0.5145757794380188, + "learning_rate": 0.0002793582579619762, + "loss": 0.8234, + "step": 5185 + }, + { + "epoch": 0.9251627865489251, + "grad_norm": 0.43651270866394043, + "learning_rate": 0.0002792886793042434, + "loss": 0.625, + "step": 5186 + }, + { + "epoch": 0.9253411827669253, + "grad_norm": 0.5134057998657227, + "learning_rate": 0.00027921909834619017, + "loss": 0.9787, + "step": 5187 + }, + { + "epoch": 0.9255195789849255, + "grad_norm": 0.6394853591918945, + "learning_rate": 0.0002791495150932815, + "loss": 0.9377, + "step": 5188 + }, + { + "epoch": 0.9256979752029257, + "grad_norm": 0.4257347583770752, + "learning_rate": 0.0002790799295509825, + "loss": 0.6986, + "step": 5189 + }, + { + "epoch": 0.9258763714209258, + "grad_norm": 0.7512332797050476, + "learning_rate": 0.0002790103417247584, + "loss": 0.826, + "step": 5190 + }, + { + "epoch": 0.926054767638926, + "grad_norm": 0.49140146374702454, + "learning_rate": 0.0002789407516200746, + "loss": 0.7517, + "step": 5191 + }, + { + "epoch": 0.9262331638569262, + "grad_norm": 0.4644830524921417, + "learning_rate": 0.0002788711592423966, + "loss": 0.7325, + "step": 5192 + }, + { + "epoch": 0.9264115600749264, + "grad_norm": 0.466547429561615, + "learning_rate": 0.0002788015645971901, + "loss": 0.7623, + "step": 5193 + }, + { + "epoch": 0.9265899562929266, + "grad_norm": 0.4740476906299591, + "learning_rate": 0.00027873196768992114, + "loss": 0.7261, + "step": 5194 + }, + { + "epoch": 0.9267683525109268, + "grad_norm": 0.4969778060913086, + "learning_rate": 0.00027866236852605575, + "loss": 0.8977, + "step": 5195 + }, + { + "epoch": 0.9269467487289269, + "grad_norm": 0.5204504132270813, + "learning_rate": 0.0002785927671110603, + "loss": 0.9751, + "step": 5196 + }, + { + "epoch": 0.9271251449469271, + "grad_norm": 0.5292174220085144, + "learning_rate": 0.00027852316345040125, + "loss": 0.927, + "step": 5197 + }, + { + "epoch": 0.9273035411649273, + "grad_norm": 0.5514991283416748, + "learning_rate": 0.0002784535575495453, + "loss": 1.0084, + "step": 5198 + }, + { + "epoch": 0.9274819373829275, + "grad_norm": 0.5040212869644165, + "learning_rate": 0.00027838394941395907, + "loss": 0.7041, + "step": 5199 + }, + { + "epoch": 0.9276603336009277, + "grad_norm": 0.5497444272041321, + "learning_rate": 0.00027831433904910963, + "loss": 1.0214, + "step": 5200 + }, + { + "epoch": 0.9278387298189278, + "grad_norm": 0.4607004225254059, + "learning_rate": 0.0002782447264604643, + "loss": 0.7548, + "step": 5201 + }, + { + "epoch": 0.928017126036928, + "grad_norm": 0.4299633502960205, + "learning_rate": 0.00027817511165349024, + "loss": 0.7039, + "step": 5202 + }, + { + "epoch": 0.9281955222549282, + "grad_norm": 0.5070714950561523, + "learning_rate": 0.000278105494633655, + "loss": 0.9753, + "step": 5203 + }, + { + "epoch": 0.9283739184729284, + "grad_norm": 0.49922603368759155, + "learning_rate": 0.0002780358754064263, + "loss": 0.8345, + "step": 5204 + }, + { + "epoch": 0.9285523146909286, + "grad_norm": 0.6079359650611877, + "learning_rate": 0.00027796625397727214, + "loss": 0.694, + "step": 5205 + }, + { + "epoch": 0.9287307109089288, + "grad_norm": 0.45466673374176025, + "learning_rate": 0.0002778966303516603, + "loss": 0.8286, + "step": 5206 + }, + { + "epoch": 0.9289091071269289, + "grad_norm": 0.5105868577957153, + "learning_rate": 0.00027782700453505925, + "loss": 0.7927, + "step": 5207 + }, + { + "epoch": 0.9290875033449291, + "grad_norm": 0.5233384370803833, + "learning_rate": 0.00027775737653293716, + "loss": 0.8307, + "step": 5208 + }, + { + "epoch": 0.9292658995629293, + "grad_norm": 0.508243978023529, + "learning_rate": 0.00027768774635076265, + "loss": 0.8838, + "step": 5209 + }, + { + "epoch": 0.9294442957809295, + "grad_norm": 0.46551835536956787, + "learning_rate": 0.0002776181139940045, + "loss": 0.7637, + "step": 5210 + }, + { + "epoch": 0.9296226919989297, + "grad_norm": 0.44400593638420105, + "learning_rate": 0.0002775484794681315, + "loss": 0.7503, + "step": 5211 + }, + { + "epoch": 0.9298010882169297, + "grad_norm": 0.4329807460308075, + "learning_rate": 0.0002774788427786128, + "loss": 0.6834, + "step": 5212 + }, + { + "epoch": 0.92997948443493, + "grad_norm": 0.5895024538040161, + "learning_rate": 0.0002774092039309176, + "loss": 0.928, + "step": 5213 + }, + { + "epoch": 0.9301578806529301, + "grad_norm": 0.46719425916671753, + "learning_rate": 0.0002773395629305154, + "loss": 0.6546, + "step": 5214 + }, + { + "epoch": 0.9303362768709303, + "grad_norm": 0.5935095548629761, + "learning_rate": 0.0002772699197828756, + "loss": 0.7523, + "step": 5215 + }, + { + "epoch": 0.9305146730889305, + "grad_norm": 0.5028880834579468, + "learning_rate": 0.00027720027449346806, + "loss": 0.9039, + "step": 5216 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.4530572295188904, + "learning_rate": 0.00027713062706776273, + "loss": 0.7171, + "step": 5217 + }, + { + "epoch": 0.9308714655249308, + "grad_norm": 0.4633224606513977, + "learning_rate": 0.0002770609775112295, + "loss": 0.8019, + "step": 5218 + }, + { + "epoch": 0.931049861742931, + "grad_norm": 0.5543755292892456, + "learning_rate": 0.00027699132582933886, + "loss": 1.0682, + "step": 5219 + }, + { + "epoch": 0.9312282579609312, + "grad_norm": 0.4689428508281708, + "learning_rate": 0.000276921672027561, + "loss": 0.7497, + "step": 5220 + }, + { + "epoch": 0.9314066541789314, + "grad_norm": 0.5170400142669678, + "learning_rate": 0.0002768520161113667, + "loss": 0.8323, + "step": 5221 + }, + { + "epoch": 0.9315850503969316, + "grad_norm": 0.4578782320022583, + "learning_rate": 0.0002767823580862265, + "loss": 1.0117, + "step": 5222 + }, + { + "epoch": 0.9317634466149317, + "grad_norm": 0.44367796182632446, + "learning_rate": 0.00027671269795761155, + "loss": 0.7195, + "step": 5223 + }, + { + "epoch": 0.9319418428329319, + "grad_norm": 0.48492223024368286, + "learning_rate": 0.00027664303573099274, + "loss": 0.9721, + "step": 5224 + }, + { + "epoch": 0.9321202390509321, + "grad_norm": 0.696104109287262, + "learning_rate": 0.00027657337141184134, + "loss": 1.0126, + "step": 5225 + }, + { + "epoch": 0.9322986352689323, + "grad_norm": 0.7282087206840515, + "learning_rate": 0.00027650370500562885, + "loss": 0.9032, + "step": 5226 + }, + { + "epoch": 0.9324770314869325, + "grad_norm": 0.42898091673851013, + "learning_rate": 0.00027643403651782673, + "loss": 0.6429, + "step": 5227 + }, + { + "epoch": 0.9326554277049327, + "grad_norm": 0.4315216541290283, + "learning_rate": 0.00027636436595390674, + "loss": 0.8703, + "step": 5228 + }, + { + "epoch": 0.9328338239229328, + "grad_norm": 0.44752299785614014, + "learning_rate": 0.0002762946933193408, + "loss": 0.8215, + "step": 5229 + }, + { + "epoch": 0.933012220140933, + "grad_norm": 0.4484764635562897, + "learning_rate": 0.00027622501861960104, + "loss": 0.6912, + "step": 5230 + }, + { + "epoch": 0.9331906163589332, + "grad_norm": 0.371706485748291, + "learning_rate": 0.0002761553418601595, + "loss": 0.5803, + "step": 5231 + }, + { + "epoch": 0.9333690125769334, + "grad_norm": 0.4724928140640259, + "learning_rate": 0.0002760856630464888, + "loss": 0.8421, + "step": 5232 + }, + { + "epoch": 0.9335474087949336, + "grad_norm": 0.49815791845321655, + "learning_rate": 0.0002760159821840612, + "loss": 0.6765, + "step": 5233 + }, + { + "epoch": 0.9337258050129337, + "grad_norm": 0.5424796342849731, + "learning_rate": 0.00027594629927834956, + "loss": 0.8134, + "step": 5234 + }, + { + "epoch": 0.9339042012309339, + "grad_norm": 0.5402620434761047, + "learning_rate": 0.0002758766143348268, + "loss": 0.8324, + "step": 5235 + }, + { + "epoch": 0.9340825974489341, + "grad_norm": 0.4638790190219879, + "learning_rate": 0.0002758069273589659, + "loss": 0.8668, + "step": 5236 + }, + { + "epoch": 0.9342609936669343, + "grad_norm": 0.5340316295623779, + "learning_rate": 0.00027573723835624004, + "loss": 0.8628, + "step": 5237 + }, + { + "epoch": 0.9344393898849345, + "grad_norm": 0.6846177577972412, + "learning_rate": 0.00027566754733212255, + "loss": 0.831, + "step": 5238 + }, + { + "epoch": 0.9346177861029347, + "grad_norm": 0.502400815486908, + "learning_rate": 0.0002755978542920869, + "loss": 0.8583, + "step": 5239 + }, + { + "epoch": 0.9347961823209348, + "grad_norm": 0.44814401865005493, + "learning_rate": 0.00027552815924160686, + "loss": 0.7743, + "step": 5240 + }, + { + "epoch": 0.934974578538935, + "grad_norm": 0.6513227224349976, + "learning_rate": 0.0002754584621861561, + "loss": 0.922, + "step": 5241 + }, + { + "epoch": 0.9351529747569352, + "grad_norm": 0.4567849040031433, + "learning_rate": 0.0002753887631312086, + "loss": 0.7956, + "step": 5242 + }, + { + "epoch": 0.9353313709749353, + "grad_norm": 0.4874018430709839, + "learning_rate": 0.00027531906208223865, + "loss": 0.9146, + "step": 5243 + }, + { + "epoch": 0.9355097671929355, + "grad_norm": 0.4673248827457428, + "learning_rate": 0.00027524935904472053, + "loss": 0.8524, + "step": 5244 + }, + { + "epoch": 0.9356881634109356, + "grad_norm": 0.5060569047927856, + "learning_rate": 0.0002751796540241286, + "loss": 1.0127, + "step": 5245 + }, + { + "epoch": 0.9358665596289358, + "grad_norm": 0.4823893904685974, + "learning_rate": 0.00027510994702593743, + "loss": 0.8641, + "step": 5246 + }, + { + "epoch": 0.936044955846936, + "grad_norm": 0.5004808902740479, + "learning_rate": 0.0002750402380556218, + "loss": 0.8461, + "step": 5247 + }, + { + "epoch": 0.9362233520649362, + "grad_norm": 0.4644457995891571, + "learning_rate": 0.0002749705271186567, + "loss": 0.7319, + "step": 5248 + }, + { + "epoch": 0.9364017482829364, + "grad_norm": 0.49020230770111084, + "learning_rate": 0.0002749008142205171, + "loss": 0.7813, + "step": 5249 + }, + { + "epoch": 0.9365801445009366, + "grad_norm": 0.568228006362915, + "learning_rate": 0.0002748310993666783, + "loss": 1.0021, + "step": 5250 + }, + { + "epoch": 0.9367585407189367, + "grad_norm": 0.46288859844207764, + "learning_rate": 0.00027476138256261575, + "loss": 0.7229, + "step": 5251 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 0.5582467913627625, + "learning_rate": 0.00027469166381380474, + "loss": 0.729, + "step": 5252 + }, + { + "epoch": 0.9371153331549371, + "grad_norm": 0.6614149808883667, + "learning_rate": 0.0002746219431257211, + "loss": 0.8033, + "step": 5253 + }, + { + "epoch": 0.9372937293729373, + "grad_norm": 0.4389813244342804, + "learning_rate": 0.0002745522205038406, + "loss": 0.6155, + "step": 5254 + }, + { + "epoch": 0.9374721255909375, + "grad_norm": 0.564568042755127, + "learning_rate": 0.0002744824959536393, + "loss": 0.8861, + "step": 5255 + }, + { + "epoch": 0.9376505218089376, + "grad_norm": 0.6031481623649597, + "learning_rate": 0.00027441276948059337, + "loss": 0.8549, + "step": 5256 + }, + { + "epoch": 0.9378289180269378, + "grad_norm": 0.4482172429561615, + "learning_rate": 0.0002743430410901789, + "loss": 0.6852, + "step": 5257 + }, + { + "epoch": 0.938007314244938, + "grad_norm": 0.47618693113327026, + "learning_rate": 0.0002742733107878726, + "loss": 0.6698, + "step": 5258 + }, + { + "epoch": 0.9381857104629382, + "grad_norm": 0.4913199543952942, + "learning_rate": 0.00027420357857915083, + "loss": 0.9027, + "step": 5259 + }, + { + "epoch": 0.9383641066809384, + "grad_norm": 0.510835587978363, + "learning_rate": 0.0002741338444694904, + "loss": 0.8386, + "step": 5260 + }, + { + "epoch": 0.9385425028989386, + "grad_norm": 1.6068651676177979, + "learning_rate": 0.00027406410846436826, + "loss": 0.9289, + "step": 5261 + }, + { + "epoch": 0.9387208991169387, + "grad_norm": 0.5296999216079712, + "learning_rate": 0.0002739943705692614, + "loss": 0.9303, + "step": 5262 + }, + { + "epoch": 0.9388992953349389, + "grad_norm": 0.5108866095542908, + "learning_rate": 0.00027392463078964696, + "loss": 0.8593, + "step": 5263 + }, + { + "epoch": 0.9390776915529391, + "grad_norm": 0.48683467507362366, + "learning_rate": 0.0002738548891310023, + "loss": 0.8258, + "step": 5264 + }, + { + "epoch": 0.9392560877709393, + "grad_norm": 0.48959386348724365, + "learning_rate": 0.00027378514559880495, + "loss": 0.802, + "step": 5265 + }, + { + "epoch": 0.9394344839889395, + "grad_norm": 0.5760032534599304, + "learning_rate": 0.0002737154001985325, + "loss": 0.8428, + "step": 5266 + }, + { + "epoch": 0.9396128802069396, + "grad_norm": 0.47354596853256226, + "learning_rate": 0.0002736456529356627, + "loss": 0.7726, + "step": 5267 + }, + { + "epoch": 0.9397912764249398, + "grad_norm": 0.5111311078071594, + "learning_rate": 0.00027357590381567353, + "loss": 0.8006, + "step": 5268 + }, + { + "epoch": 0.93996967264294, + "grad_norm": 0.4919393062591553, + "learning_rate": 0.00027350615284404305, + "loss": 0.8726, + "step": 5269 + }, + { + "epoch": 0.9401480688609402, + "grad_norm": 0.43833592534065247, + "learning_rate": 0.0002734364000262494, + "loss": 0.8131, + "step": 5270 + }, + { + "epoch": 0.9403264650789404, + "grad_norm": 0.4701617658138275, + "learning_rate": 0.00027336664536777093, + "loss": 0.8316, + "step": 5271 + }, + { + "epoch": 0.9405048612969406, + "grad_norm": 0.430813729763031, + "learning_rate": 0.0002732968888740863, + "loss": 0.7455, + "step": 5272 + }, + { + "epoch": 0.9406832575149406, + "grad_norm": 0.46511298418045044, + "learning_rate": 0.00027322713055067397, + "loss": 0.8424, + "step": 5273 + }, + { + "epoch": 0.9408616537329408, + "grad_norm": 0.5365491509437561, + "learning_rate": 0.0002731573704030128, + "loss": 0.902, + "step": 5274 + }, + { + "epoch": 0.941040049950941, + "grad_norm": 0.4637891948223114, + "learning_rate": 0.0002730876084365817, + "loss": 0.7835, + "step": 5275 + }, + { + "epoch": 0.9412184461689412, + "grad_norm": 0.47008875012397766, + "learning_rate": 0.00027301784465685983, + "loss": 0.8082, + "step": 5276 + }, + { + "epoch": 0.9413968423869414, + "grad_norm": 0.42450207471847534, + "learning_rate": 0.0002729480790693263, + "loss": 0.7095, + "step": 5277 + }, + { + "epoch": 0.9415752386049415, + "grad_norm": 0.5233426690101624, + "learning_rate": 0.0002728783116794606, + "loss": 0.9605, + "step": 5278 + }, + { + "epoch": 0.9417536348229417, + "grad_norm": 1.0770610570907593, + "learning_rate": 0.00027280854249274206, + "loss": 0.6148, + "step": 5279 + }, + { + "epoch": 0.9419320310409419, + "grad_norm": 0.6112112402915955, + "learning_rate": 0.00027273877151465036, + "loss": 1.1989, + "step": 5280 + }, + { + "epoch": 0.9421104272589421, + "grad_norm": 0.5142430663108826, + "learning_rate": 0.0002726689987506654, + "loss": 0.9081, + "step": 5281 + }, + { + "epoch": 0.9422888234769423, + "grad_norm": 0.5745399594306946, + "learning_rate": 0.00027259922420626705, + "loss": 0.8929, + "step": 5282 + }, + { + "epoch": 0.9424672196949425, + "grad_norm": 0.6627066731452942, + "learning_rate": 0.00027252944788693536, + "loss": 0.6532, + "step": 5283 + }, + { + "epoch": 0.9426456159129426, + "grad_norm": 0.584989607334137, + "learning_rate": 0.00027245966979815044, + "loss": 0.8091, + "step": 5284 + }, + { + "epoch": 0.9428240121309428, + "grad_norm": 0.4927927851676941, + "learning_rate": 0.0002723898899453929, + "loss": 0.7812, + "step": 5285 + }, + { + "epoch": 0.943002408348943, + "grad_norm": 0.5424949526786804, + "learning_rate": 0.00027232010833414287, + "loss": 0.9299, + "step": 5286 + }, + { + "epoch": 0.9431808045669432, + "grad_norm": 0.5809288024902344, + "learning_rate": 0.0002722503249698812, + "loss": 0.8294, + "step": 5287 + }, + { + "epoch": 0.9433592007849434, + "grad_norm": 0.601315438747406, + "learning_rate": 0.0002721805398580885, + "loss": 0.7725, + "step": 5288 + }, + { + "epoch": 0.9435375970029435, + "grad_norm": 0.5783292055130005, + "learning_rate": 0.0002721107530042458, + "loss": 0.7394, + "step": 5289 + }, + { + "epoch": 0.9437159932209437, + "grad_norm": 0.4998534321784973, + "learning_rate": 0.00027204096441383414, + "loss": 0.8316, + "step": 5290 + }, + { + "epoch": 0.9438943894389439, + "grad_norm": 0.8708257079124451, + "learning_rate": 0.0002719711740923346, + "loss": 0.811, + "step": 5291 + }, + { + "epoch": 0.9440727856569441, + "grad_norm": 0.5783541798591614, + "learning_rate": 0.00027190138204522847, + "loss": 0.9316, + "step": 5292 + }, + { + "epoch": 0.9442511818749443, + "grad_norm": 1.6886039972305298, + "learning_rate": 0.0002718315882779972, + "loss": 0.6536, + "step": 5293 + }, + { + "epoch": 0.9444295780929445, + "grad_norm": 0.5459927320480347, + "learning_rate": 0.0002717617927961224, + "loss": 0.8067, + "step": 5294 + }, + { + "epoch": 0.9446079743109446, + "grad_norm": 0.46528294682502747, + "learning_rate": 0.00027169199560508574, + "loss": 0.5931, + "step": 5295 + }, + { + "epoch": 0.9447863705289448, + "grad_norm": 0.5428159832954407, + "learning_rate": 0.0002716221967103691, + "loss": 0.8225, + "step": 5296 + }, + { + "epoch": 0.944964766746945, + "grad_norm": 0.4363666772842407, + "learning_rate": 0.0002715523961174545, + "loss": 0.6572, + "step": 5297 + }, + { + "epoch": 0.9451431629649452, + "grad_norm": 0.6103348731994629, + "learning_rate": 0.0002714825938318239, + "loss": 1.0737, + "step": 5298 + }, + { + "epoch": 0.9453215591829454, + "grad_norm": 0.4808749854564667, + "learning_rate": 0.0002714127898589596, + "loss": 0.8564, + "step": 5299 + }, + { + "epoch": 0.9454999554009454, + "grad_norm": 0.46589285135269165, + "learning_rate": 0.00027134298420434405, + "loss": 0.7157, + "step": 5300 + }, + { + "epoch": 0.9456783516189456, + "grad_norm": 0.4251248836517334, + "learning_rate": 0.0002712731768734597, + "loss": 0.6488, + "step": 5301 + }, + { + "epoch": 0.9458567478369458, + "grad_norm": 0.4899739623069763, + "learning_rate": 0.0002712033678717892, + "loss": 0.9113, + "step": 5302 + }, + { + "epoch": 0.946035144054946, + "grad_norm": 0.506604790687561, + "learning_rate": 0.00027113355720481523, + "loss": 0.8268, + "step": 5303 + }, + { + "epoch": 0.9462135402729462, + "grad_norm": 0.4809357225894928, + "learning_rate": 0.00027106374487802096, + "loss": 0.9743, + "step": 5304 + }, + { + "epoch": 0.9463919364909464, + "grad_norm": 0.4477202296257019, + "learning_rate": 0.00027099393089688906, + "loss": 0.8365, + "step": 5305 + }, + { + "epoch": 0.9465703327089465, + "grad_norm": 0.42474350333213806, + "learning_rate": 0.0002709241152669029, + "loss": 0.654, + "step": 5306 + }, + { + "epoch": 0.9467487289269467, + "grad_norm": 0.4127642810344696, + "learning_rate": 0.00027085429799354575, + "loss": 0.7342, + "step": 5307 + }, + { + "epoch": 0.9469271251449469, + "grad_norm": 0.4678402543067932, + "learning_rate": 0.00027078447908230105, + "loss": 0.7684, + "step": 5308 + }, + { + "epoch": 0.9471055213629471, + "grad_norm": 0.434625506401062, + "learning_rate": 0.00027071465853865224, + "loss": 0.6473, + "step": 5309 + }, + { + "epoch": 0.9472839175809473, + "grad_norm": 0.48882344365119934, + "learning_rate": 0.00027064483636808314, + "loss": 0.8451, + "step": 5310 + }, + { + "epoch": 0.9474623137989474, + "grad_norm": 0.506234347820282, + "learning_rate": 0.0002705750125760774, + "loss": 1.075, + "step": 5311 + }, + { + "epoch": 0.9476407100169476, + "grad_norm": 0.47516417503356934, + "learning_rate": 0.00027050518716811904, + "loss": 0.8066, + "step": 5312 + }, + { + "epoch": 0.9478191062349478, + "grad_norm": 0.45935487747192383, + "learning_rate": 0.0002704353601496921, + "loss": 0.8011, + "step": 5313 + }, + { + "epoch": 0.947997502452948, + "grad_norm": 0.4938845634460449, + "learning_rate": 0.0002703655315262808, + "loss": 0.9695, + "step": 5314 + }, + { + "epoch": 0.9481758986709482, + "grad_norm": 0.4160667359828949, + "learning_rate": 0.00027029570130336937, + "loss": 0.6122, + "step": 5315 + }, + { + "epoch": 0.9483542948889484, + "grad_norm": 0.5093261003494263, + "learning_rate": 0.00027022586948644234, + "loss": 1.0263, + "step": 5316 + }, + { + "epoch": 0.9485326911069485, + "grad_norm": 0.4197608530521393, + "learning_rate": 0.0002701560360809842, + "loss": 0.8012, + "step": 5317 + }, + { + "epoch": 0.9487110873249487, + "grad_norm": 0.47987252473831177, + "learning_rate": 0.0002700862010924797, + "loss": 0.7935, + "step": 5318 + }, + { + "epoch": 0.9488894835429489, + "grad_norm": 0.4315970242023468, + "learning_rate": 0.00027001636452641354, + "loss": 0.6472, + "step": 5319 + }, + { + "epoch": 0.9490678797609491, + "grad_norm": 0.4990069270133972, + "learning_rate": 0.0002699465263882708, + "loss": 0.7692, + "step": 5320 + }, + { + "epoch": 0.9492462759789493, + "grad_norm": 0.4555619955062866, + "learning_rate": 0.00026987668668353637, + "loss": 0.7722, + "step": 5321 + }, + { + "epoch": 0.9494246721969494, + "grad_norm": 0.5311444401741028, + "learning_rate": 0.00026980684541769563, + "loss": 1.0347, + "step": 5322 + }, + { + "epoch": 0.9496030684149496, + "grad_norm": 0.4669055640697479, + "learning_rate": 0.0002697370025962337, + "loss": 0.6929, + "step": 5323 + }, + { + "epoch": 0.9497814646329498, + "grad_norm": 0.4881044924259186, + "learning_rate": 0.0002696671582246361, + "loss": 0.8754, + "step": 5324 + }, + { + "epoch": 0.94995986085095, + "grad_norm": 0.45897579193115234, + "learning_rate": 0.0002695973123083884, + "loss": 0.8245, + "step": 5325 + }, + { + "epoch": 0.9501382570689502, + "grad_norm": 0.4809344410896301, + "learning_rate": 0.00026952746485297614, + "loss": 0.6617, + "step": 5326 + }, + { + "epoch": 0.9503166532869504, + "grad_norm": 0.4562990963459015, + "learning_rate": 0.00026945761586388524, + "loss": 0.8179, + "step": 5327 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.4504075050354004, + "learning_rate": 0.0002693877653466015, + "loss": 0.8011, + "step": 5328 + }, + { + "epoch": 0.9506734457229506, + "grad_norm": 0.44723081588745117, + "learning_rate": 0.0002693179133066111, + "loss": 0.7159, + "step": 5329 + }, + { + "epoch": 0.9508518419409508, + "grad_norm": 0.49957334995269775, + "learning_rate": 0.00026924805974940007, + "loss": 0.8277, + "step": 5330 + }, + { + "epoch": 0.951030238158951, + "grad_norm": 0.4783354699611664, + "learning_rate": 0.0002691782046804548, + "loss": 0.9436, + "step": 5331 + }, + { + "epoch": 0.9512086343769512, + "grad_norm": 0.39161327481269836, + "learning_rate": 0.00026910834810526147, + "loss": 0.5881, + "step": 5332 + }, + { + "epoch": 0.9513870305949513, + "grad_norm": 0.46436673402786255, + "learning_rate": 0.00026903849002930677, + "loss": 0.7549, + "step": 5333 + }, + { + "epoch": 0.9515654268129515, + "grad_norm": 0.4534311592578888, + "learning_rate": 0.00026896863045807715, + "loss": 0.6577, + "step": 5334 + }, + { + "epoch": 0.9517438230309517, + "grad_norm": 0.4262526333332062, + "learning_rate": 0.00026889876939705946, + "loss": 0.7005, + "step": 5335 + }, + { + "epoch": 0.9519222192489519, + "grad_norm": 0.49773791432380676, + "learning_rate": 0.00026882890685174065, + "loss": 0.8739, + "step": 5336 + }, + { + "epoch": 0.9521006154669521, + "grad_norm": 0.46525028347969055, + "learning_rate": 0.00026875904282760765, + "loss": 0.6493, + "step": 5337 + }, + { + "epoch": 0.9522790116849523, + "grad_norm": 0.42473146319389343, + "learning_rate": 0.00026868917733014743, + "loss": 0.6203, + "step": 5338 + }, + { + "epoch": 0.9524574079029524, + "grad_norm": 0.49023476243019104, + "learning_rate": 0.0002686193103648472, + "loss": 0.9317, + "step": 5339 + }, + { + "epoch": 0.9526358041209526, + "grad_norm": 0.5018748641014099, + "learning_rate": 0.00026854944193719445, + "loss": 0.9483, + "step": 5340 + }, + { + "epoch": 0.9528142003389528, + "grad_norm": 0.49157464504241943, + "learning_rate": 0.00026847957205267635, + "loss": 0.9226, + "step": 5341 + }, + { + "epoch": 0.952992596556953, + "grad_norm": 0.474185049533844, + "learning_rate": 0.0002684097007167807, + "loss": 0.8999, + "step": 5342 + }, + { + "epoch": 0.9531709927749532, + "grad_norm": 0.4604833126068115, + "learning_rate": 0.0002683398279349952, + "loss": 0.8502, + "step": 5343 + }, + { + "epoch": 0.9533493889929533, + "grad_norm": 0.5128932595252991, + "learning_rate": 0.0002682699537128074, + "loss": 0.8059, + "step": 5344 + }, + { + "epoch": 0.9535277852109535, + "grad_norm": 0.42968082427978516, + "learning_rate": 0.00026820007805570536, + "loss": 0.6065, + "step": 5345 + }, + { + "epoch": 0.9537061814289537, + "grad_norm": 0.5407286286354065, + "learning_rate": 0.00026813020096917695, + "loss": 0.9448, + "step": 5346 + }, + { + "epoch": 0.9538845776469539, + "grad_norm": 1.0155121088027954, + "learning_rate": 0.0002680603224587104, + "loss": 1.2441, + "step": 5347 + }, + { + "epoch": 0.9540629738649541, + "grad_norm": 0.5304948687553406, + "learning_rate": 0.0002679904425297938, + "loss": 1.0268, + "step": 5348 + }, + { + "epoch": 0.9542413700829543, + "grad_norm": 0.4759924113750458, + "learning_rate": 0.00026792056118791563, + "loss": 0.9434, + "step": 5349 + }, + { + "epoch": 0.9544197663009544, + "grad_norm": 0.3957015573978424, + "learning_rate": 0.00026785067843856437, + "loss": 0.6315, + "step": 5350 + }, + { + "epoch": 0.9545981625189546, + "grad_norm": 0.46599963307380676, + "learning_rate": 0.00026778079428722845, + "loss": 0.7155, + "step": 5351 + }, + { + "epoch": 0.9547765587369548, + "grad_norm": 0.4720132648944855, + "learning_rate": 0.0002677109087393966, + "loss": 0.8976, + "step": 5352 + }, + { + "epoch": 0.954954954954955, + "grad_norm": 0.5006893277168274, + "learning_rate": 0.00026764102180055766, + "loss": 0.8828, + "step": 5353 + }, + { + "epoch": 0.9551333511729552, + "grad_norm": 0.4419143795967102, + "learning_rate": 0.0002675711334762004, + "loss": 0.7284, + "step": 5354 + }, + { + "epoch": 0.9553117473909553, + "grad_norm": 0.4111958146095276, + "learning_rate": 0.0002675012437718139, + "loss": 0.6126, + "step": 5355 + }, + { + "epoch": 0.9554901436089555, + "grad_norm": 0.5170559287071228, + "learning_rate": 0.0002674313526928872, + "loss": 0.9313, + "step": 5356 + }, + { + "epoch": 0.9556685398269557, + "grad_norm": 0.4762997329235077, + "learning_rate": 0.0002673614602449096, + "loss": 0.8053, + "step": 5357 + }, + { + "epoch": 0.9558469360449559, + "grad_norm": 0.4590204358100891, + "learning_rate": 0.0002672915664333704, + "loss": 0.85, + "step": 5358 + }, + { + "epoch": 0.956025332262956, + "grad_norm": 0.5133612155914307, + "learning_rate": 0.000267221671263759, + "loss": 0.7038, + "step": 5359 + }, + { + "epoch": 0.9562037284809562, + "grad_norm": 0.4732131063938141, + "learning_rate": 0.0002671517747415649, + "loss": 0.8324, + "step": 5360 + }, + { + "epoch": 0.9563821246989563, + "grad_norm": 0.4754936099052429, + "learning_rate": 0.0002670818768722778, + "loss": 0.9258, + "step": 5361 + }, + { + "epoch": 0.9565605209169565, + "grad_norm": 0.4571801722049713, + "learning_rate": 0.0002670119776613875, + "loss": 0.8035, + "step": 5362 + }, + { + "epoch": 0.9567389171349567, + "grad_norm": 0.48625341057777405, + "learning_rate": 0.0002669420771143838, + "loss": 0.8987, + "step": 5363 + }, + { + "epoch": 0.9569173133529569, + "grad_norm": 0.4703410267829895, + "learning_rate": 0.0002668721752367566, + "loss": 0.9292, + "step": 5364 + }, + { + "epoch": 0.9570957095709571, + "grad_norm": 0.458659291267395, + "learning_rate": 0.00026680227203399604, + "loss": 0.7997, + "step": 5365 + }, + { + "epoch": 0.9572741057889572, + "grad_norm": 0.4934099018573761, + "learning_rate": 0.0002667323675115922, + "loss": 0.7636, + "step": 5366 + }, + { + "epoch": 0.9574525020069574, + "grad_norm": 0.4184805154800415, + "learning_rate": 0.0002666624616750355, + "loss": 0.6587, + "step": 5367 + }, + { + "epoch": 0.9576308982249576, + "grad_norm": 0.485462486743927, + "learning_rate": 0.00026659255452981623, + "loss": 0.9463, + "step": 5368 + }, + { + "epoch": 0.9578092944429578, + "grad_norm": 0.6165233254432678, + "learning_rate": 0.00026652264608142484, + "loss": 0.6606, + "step": 5369 + }, + { + "epoch": 0.957987690660958, + "grad_norm": 0.44948717951774597, + "learning_rate": 0.000266452736335352, + "loss": 0.7658, + "step": 5370 + }, + { + "epoch": 0.9581660868789582, + "grad_norm": 0.40302959084510803, + "learning_rate": 0.0002663828252970883, + "loss": 0.6658, + "step": 5371 + }, + { + "epoch": 0.9583444830969583, + "grad_norm": 0.4867773652076721, + "learning_rate": 0.00026631291297212444, + "loss": 0.7478, + "step": 5372 + }, + { + "epoch": 0.9585228793149585, + "grad_norm": 0.4726101756095886, + "learning_rate": 0.0002662429993659515, + "loss": 0.8439, + "step": 5373 + }, + { + "epoch": 0.9587012755329587, + "grad_norm": 0.5044912099838257, + "learning_rate": 0.0002661730844840604, + "loss": 0.9133, + "step": 5374 + }, + { + "epoch": 0.9588796717509589, + "grad_norm": 0.4630397856235504, + "learning_rate": 0.0002661031683319422, + "loss": 0.9888, + "step": 5375 + }, + { + "epoch": 0.9590580679689591, + "grad_norm": 0.4974066913127899, + "learning_rate": 0.00026603325091508807, + "loss": 1.1051, + "step": 5376 + }, + { + "epoch": 0.9592364641869592, + "grad_norm": 0.42207884788513184, + "learning_rate": 0.00026596333223898933, + "loss": 0.7558, + "step": 5377 + }, + { + "epoch": 0.9594148604049594, + "grad_norm": 0.5264635682106018, + "learning_rate": 0.00026589341230913736, + "loss": 0.9417, + "step": 5378 + }, + { + "epoch": 0.9595932566229596, + "grad_norm": 0.467877596616745, + "learning_rate": 0.0002658234911310236, + "loss": 0.6177, + "step": 5379 + }, + { + "epoch": 0.9597716528409598, + "grad_norm": 0.5078222155570984, + "learning_rate": 0.0002657535687101396, + "loss": 0.9433, + "step": 5380 + }, + { + "epoch": 0.95995004905896, + "grad_norm": 0.5080622434616089, + "learning_rate": 0.0002656836450519772, + "loss": 0.8492, + "step": 5381 + }, + { + "epoch": 0.9601284452769602, + "grad_norm": 0.4674982726573944, + "learning_rate": 0.000265613720162028, + "loss": 0.7062, + "step": 5382 + }, + { + "epoch": 0.9603068414949603, + "grad_norm": 0.544535219669342, + "learning_rate": 0.00026554379404578396, + "loss": 0.8553, + "step": 5383 + }, + { + "epoch": 0.9604852377129605, + "grad_norm": 0.4691609740257263, + "learning_rate": 0.00026547386670873707, + "loss": 0.747, + "step": 5384 + }, + { + "epoch": 0.9606636339309607, + "grad_norm": 0.5239211916923523, + "learning_rate": 0.00026540393815637924, + "loss": 0.7682, + "step": 5385 + }, + { + "epoch": 0.9608420301489609, + "grad_norm": 0.4878639280796051, + "learning_rate": 0.00026533400839420286, + "loss": 1.0158, + "step": 5386 + }, + { + "epoch": 0.9610204263669611, + "grad_norm": 0.45283398032188416, + "learning_rate": 0.0002652640774276999, + "loss": 0.9548, + "step": 5387 + }, + { + "epoch": 0.9611988225849611, + "grad_norm": 0.4687561094760895, + "learning_rate": 0.00026519414526236297, + "loss": 0.8433, + "step": 5388 + }, + { + "epoch": 0.9613772188029613, + "grad_norm": 0.515119731426239, + "learning_rate": 0.0002651242119036844, + "loss": 0.8019, + "step": 5389 + }, + { + "epoch": 0.9615556150209615, + "grad_norm": 0.42879781126976013, + "learning_rate": 0.00026505427735715675, + "loss": 0.7789, + "step": 5390 + }, + { + "epoch": 0.9617340112389617, + "grad_norm": 0.49096372723579407, + "learning_rate": 0.00026498434162827266, + "loss": 0.9128, + "step": 5391 + }, + { + "epoch": 0.9619124074569619, + "grad_norm": 0.45153510570526123, + "learning_rate": 0.00026491440472252475, + "loss": 0.7555, + "step": 5392 + }, + { + "epoch": 0.9620908036749621, + "grad_norm": 0.43661659955978394, + "learning_rate": 0.00026484446664540594, + "loss": 0.6922, + "step": 5393 + }, + { + "epoch": 0.9622691998929622, + "grad_norm": 0.4993753731250763, + "learning_rate": 0.00026477452740240914, + "loss": 0.7653, + "step": 5394 + }, + { + "epoch": 0.9624475961109624, + "grad_norm": 0.4706138074398041, + "learning_rate": 0.00026470458699902723, + "loss": 0.9497, + "step": 5395 + }, + { + "epoch": 0.9626259923289626, + "grad_norm": 0.4838847815990448, + "learning_rate": 0.00026463464544075344, + "loss": 0.8203, + "step": 5396 + }, + { + "epoch": 0.9628043885469628, + "grad_norm": 0.4903802275657654, + "learning_rate": 0.000264564702733081, + "loss": 0.8855, + "step": 5397 + }, + { + "epoch": 0.962982784764963, + "grad_norm": 0.5173416137695312, + "learning_rate": 0.00026449475888150293, + "loss": 0.872, + "step": 5398 + }, + { + "epoch": 0.9631611809829632, + "grad_norm": 0.4816221594810486, + "learning_rate": 0.0002644248138915128, + "loss": 0.8021, + "step": 5399 + }, + { + "epoch": 0.9633395772009633, + "grad_norm": 0.4750434160232544, + "learning_rate": 0.00026435486776860395, + "loss": 0.8306, + "step": 5400 + }, + { + "epoch": 0.9635179734189635, + "grad_norm": 0.47214189171791077, + "learning_rate": 0.00026428492051827, + "loss": 0.8921, + "step": 5401 + }, + { + "epoch": 0.9636963696369637, + "grad_norm": 0.4591917097568512, + "learning_rate": 0.0002642149721460045, + "loss": 0.7029, + "step": 5402 + }, + { + "epoch": 0.9638747658549639, + "grad_norm": 0.5314555168151855, + "learning_rate": 0.00026414502265730125, + "loss": 0.9167, + "step": 5403 + }, + { + "epoch": 0.9640531620729641, + "grad_norm": 0.4804200232028961, + "learning_rate": 0.000264075072057654, + "loss": 0.9891, + "step": 5404 + }, + { + "epoch": 0.9642315582909642, + "grad_norm": 0.4692907929420471, + "learning_rate": 0.00026400512035255663, + "loss": 0.6985, + "step": 5405 + }, + { + "epoch": 0.9644099545089644, + "grad_norm": 0.4961127042770386, + "learning_rate": 0.00026393516754750313, + "loss": 0.8653, + "step": 5406 + }, + { + "epoch": 0.9645883507269646, + "grad_norm": 0.455837607383728, + "learning_rate": 0.0002638652136479876, + "loss": 0.6942, + "step": 5407 + }, + { + "epoch": 0.9647667469449648, + "grad_norm": 0.5186680555343628, + "learning_rate": 0.0002637952586595041, + "loss": 1.0216, + "step": 5408 + }, + { + "epoch": 0.964945143162965, + "grad_norm": 0.49785247445106506, + "learning_rate": 0.00026372530258754695, + "loss": 0.7818, + "step": 5409 + }, + { + "epoch": 0.9651235393809652, + "grad_norm": 0.4302063286304474, + "learning_rate": 0.0002636553454376105, + "loss": 0.6761, + "step": 5410 + }, + { + "epoch": 0.9653019355989653, + "grad_norm": 0.4406273066997528, + "learning_rate": 0.00026358538721518905, + "loss": 0.7473, + "step": 5411 + }, + { + "epoch": 0.9654803318169655, + "grad_norm": 0.42952749133110046, + "learning_rate": 0.0002635154279257771, + "loss": 0.6809, + "step": 5412 + }, + { + "epoch": 0.9656587280349657, + "grad_norm": 0.502311110496521, + "learning_rate": 0.00026344546757486924, + "loss": 1.1199, + "step": 5413 + }, + { + "epoch": 0.9658371242529659, + "grad_norm": 0.4300912022590637, + "learning_rate": 0.00026337550616796024, + "loss": 0.9147, + "step": 5414 + }, + { + "epoch": 0.9660155204709661, + "grad_norm": 0.4525073766708374, + "learning_rate": 0.00026330554371054466, + "loss": 0.9604, + "step": 5415 + }, + { + "epoch": 0.9661939166889661, + "grad_norm": 0.41568025946617126, + "learning_rate": 0.00026323558020811745, + "loss": 0.7829, + "step": 5416 + }, + { + "epoch": 0.9663723129069663, + "grad_norm": 0.5461636781692505, + "learning_rate": 0.00026316561566617347, + "loss": 0.8039, + "step": 5417 + }, + { + "epoch": 0.9665507091249665, + "grad_norm": 0.4481653571128845, + "learning_rate": 0.00026309565009020766, + "loss": 0.8262, + "step": 5418 + }, + { + "epoch": 0.9667291053429667, + "grad_norm": 0.911888599395752, + "learning_rate": 0.00026302568348571514, + "loss": 0.8544, + "step": 5419 + }, + { + "epoch": 0.9669075015609669, + "grad_norm": 0.42414039373397827, + "learning_rate": 0.0002629557158581911, + "loss": 0.6467, + "step": 5420 + }, + { + "epoch": 0.9670858977789671, + "grad_norm": 0.5355694890022278, + "learning_rate": 0.00026288574721313064, + "loss": 0.7265, + "step": 5421 + }, + { + "epoch": 0.9672642939969672, + "grad_norm": 0.8004781007766724, + "learning_rate": 0.0002628157775560291, + "loss": 0.8402, + "step": 5422 + }, + { + "epoch": 0.9674426902149674, + "grad_norm": 0.5186832547187805, + "learning_rate": 0.00026274580689238206, + "loss": 0.9926, + "step": 5423 + }, + { + "epoch": 0.9676210864329676, + "grad_norm": 0.5586687922477722, + "learning_rate": 0.00026267583522768473, + "loss": 1.045, + "step": 5424 + }, + { + "epoch": 0.9677994826509678, + "grad_norm": 0.4568381607532501, + "learning_rate": 0.0002626058625674328, + "loss": 0.6261, + "step": 5425 + }, + { + "epoch": 0.967977878868968, + "grad_norm": 0.4433128237724304, + "learning_rate": 0.0002625358889171217, + "loss": 0.9197, + "step": 5426 + }, + { + "epoch": 0.9681562750869681, + "grad_norm": 0.4375893175601959, + "learning_rate": 0.00026246591428224743, + "loss": 0.6837, + "step": 5427 + }, + { + "epoch": 0.9683346713049683, + "grad_norm": 0.48804354667663574, + "learning_rate": 0.0002623959386683056, + "loss": 0.9795, + "step": 5428 + }, + { + "epoch": 0.9685130675229685, + "grad_norm": 0.43355077505111694, + "learning_rate": 0.00026232596208079203, + "loss": 0.7921, + "step": 5429 + }, + { + "epoch": 0.9686914637409687, + "grad_norm": 0.4870491623878479, + "learning_rate": 0.00026225598452520277, + "loss": 0.9069, + "step": 5430 + }, + { + "epoch": 0.9688698599589689, + "grad_norm": 0.463943749666214, + "learning_rate": 0.00026218600600703376, + "loss": 0.7366, + "step": 5431 + }, + { + "epoch": 0.9690482561769691, + "grad_norm": 0.46572330594062805, + "learning_rate": 0.000262116026531781, + "loss": 0.7924, + "step": 5432 + }, + { + "epoch": 0.9692266523949692, + "grad_norm": 0.42976751923561096, + "learning_rate": 0.00026204604610494077, + "loss": 0.638, + "step": 5433 + }, + { + "epoch": 0.9694050486129694, + "grad_norm": 0.5167571306228638, + "learning_rate": 0.0002619760647320092, + "loss": 0.7912, + "step": 5434 + }, + { + "epoch": 0.9695834448309696, + "grad_norm": 0.39845386147499084, + "learning_rate": 0.0002619060824184828, + "loss": 0.6637, + "step": 5435 + }, + { + "epoch": 0.9697618410489698, + "grad_norm": 0.4528100788593292, + "learning_rate": 0.00026183609916985776, + "loss": 0.6917, + "step": 5436 + }, + { + "epoch": 0.96994023726697, + "grad_norm": 0.4622432291507721, + "learning_rate": 0.00026176611499163056, + "loss": 0.721, + "step": 5437 + }, + { + "epoch": 0.9701186334849701, + "grad_norm": 0.447052538394928, + "learning_rate": 0.00026169612988929773, + "loss": 0.6908, + "step": 5438 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.4297953248023987, + "learning_rate": 0.00026162614386835597, + "loss": 0.7084, + "step": 5439 + }, + { + "epoch": 0.9704754259209705, + "grad_norm": 0.4557560682296753, + "learning_rate": 0.0002615561569343018, + "loss": 0.8465, + "step": 5440 + }, + { + "epoch": 0.9706538221389707, + "grad_norm": 0.42266544699668884, + "learning_rate": 0.000261486169092632, + "loss": 0.6536, + "step": 5441 + }, + { + "epoch": 0.9708322183569709, + "grad_norm": 0.4929708242416382, + "learning_rate": 0.0002614161803488435, + "loss": 0.9094, + "step": 5442 + }, + { + "epoch": 0.9710106145749711, + "grad_norm": 0.45966464281082153, + "learning_rate": 0.0002613461907084331, + "loss": 0.7899, + "step": 5443 + }, + { + "epoch": 0.9711890107929712, + "grad_norm": 0.5083897113800049, + "learning_rate": 0.0002612762001768978, + "loss": 0.9044, + "step": 5444 + }, + { + "epoch": 0.9713674070109714, + "grad_norm": 0.4741891026496887, + "learning_rate": 0.00026120620875973453, + "loss": 0.9287, + "step": 5445 + }, + { + "epoch": 0.9715458032289715, + "grad_norm": 1.0679816007614136, + "learning_rate": 0.00026113621646244045, + "loss": 0.999, + "step": 5446 + }, + { + "epoch": 0.9717241994469717, + "grad_norm": 0.4491601288318634, + "learning_rate": 0.0002610662232905127, + "loss": 0.8127, + "step": 5447 + }, + { + "epoch": 0.971902595664972, + "grad_norm": 0.538118302822113, + "learning_rate": 0.00026099622924944863, + "loss": 0.905, + "step": 5448 + }, + { + "epoch": 0.972080991882972, + "grad_norm": 0.4619334936141968, + "learning_rate": 0.0002609262343447454, + "loss": 0.6049, + "step": 5449 + }, + { + "epoch": 0.9722593881009722, + "grad_norm": 0.5011905431747437, + "learning_rate": 0.0002608562385819004, + "loss": 0.6995, + "step": 5450 + }, + { + "epoch": 0.9724377843189724, + "grad_norm": 0.5354875326156616, + "learning_rate": 0.0002607862419664111, + "loss": 0.585, + "step": 5451 + }, + { + "epoch": 0.9726161805369726, + "grad_norm": 0.46661576628685, + "learning_rate": 0.00026071624450377495, + "loss": 0.7073, + "step": 5452 + }, + { + "epoch": 0.9727945767549728, + "grad_norm": 0.511663556098938, + "learning_rate": 0.0002606462461994896, + "loss": 0.8606, + "step": 5453 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.48708683252334595, + "learning_rate": 0.0002605762470590527, + "loss": 0.6398, + "step": 5454 + }, + { + "epoch": 0.9731513691909731, + "grad_norm": 0.48029178380966187, + "learning_rate": 0.0002605062470879619, + "loss": 0.7369, + "step": 5455 + }, + { + "epoch": 0.9733297654089733, + "grad_norm": 0.481996089220047, + "learning_rate": 0.00026043624629171495, + "loss": 0.9269, + "step": 5456 + }, + { + "epoch": 0.9735081616269735, + "grad_norm": 0.5002886652946472, + "learning_rate": 0.0002603662446758097, + "loss": 0.8318, + "step": 5457 + }, + { + "epoch": 0.9736865578449737, + "grad_norm": 0.48849231004714966, + "learning_rate": 0.0002602962422457441, + "loss": 0.7965, + "step": 5458 + }, + { + "epoch": 0.9738649540629739, + "grad_norm": 0.4990347623825073, + "learning_rate": 0.000260226239007016, + "loss": 0.87, + "step": 5459 + }, + { + "epoch": 0.974043350280974, + "grad_norm": 0.5066533088684082, + "learning_rate": 0.0002601562349651235, + "loss": 0.7364, + "step": 5460 + }, + { + "epoch": 0.9742217464989742, + "grad_norm": 0.5521520972251892, + "learning_rate": 0.0002600862301255647, + "loss": 0.9395, + "step": 5461 + }, + { + "epoch": 0.9744001427169744, + "grad_norm": 0.7457981109619141, + "learning_rate": 0.00026001622449383776, + "loss": 0.8701, + "step": 5462 + }, + { + "epoch": 0.9745785389349746, + "grad_norm": 0.5160167217254639, + "learning_rate": 0.00025994621807544084, + "loss": 0.9626, + "step": 5463 + }, + { + "epoch": 0.9747569351529748, + "grad_norm": 0.504137396812439, + "learning_rate": 0.0002598762108758722, + "loss": 0.7476, + "step": 5464 + }, + { + "epoch": 0.974935331370975, + "grad_norm": 0.4687594473361969, + "learning_rate": 0.00025980620290063023, + "loss": 0.7256, + "step": 5465 + }, + { + "epoch": 0.9751137275889751, + "grad_norm": 0.44819939136505127, + "learning_rate": 0.0002597361941552133, + "loss": 0.6916, + "step": 5466 + }, + { + "epoch": 0.9752921238069753, + "grad_norm": 2.4401307106018066, + "learning_rate": 0.00025966618464511986, + "loss": 0.7513, + "step": 5467 + }, + { + "epoch": 0.9754705200249755, + "grad_norm": 0.48852846026420593, + "learning_rate": 0.0002595961743758484, + "loss": 0.9704, + "step": 5468 + }, + { + "epoch": 0.9756489162429757, + "grad_norm": 0.47070446610450745, + "learning_rate": 0.00025952616335289766, + "loss": 0.8079, + "step": 5469 + }, + { + "epoch": 0.9758273124609759, + "grad_norm": 0.49295809864997864, + "learning_rate": 0.00025945615158176605, + "loss": 1.0164, + "step": 5470 + }, + { + "epoch": 0.976005708678976, + "grad_norm": 0.5102391839027405, + "learning_rate": 0.00025938613906795237, + "loss": 1.1723, + "step": 5471 + }, + { + "epoch": 0.9761841048969762, + "grad_norm": 0.4273965656757355, + "learning_rate": 0.0002593161258169554, + "loss": 0.5914, + "step": 5472 + }, + { + "epoch": 0.9763625011149764, + "grad_norm": 0.4733925759792328, + "learning_rate": 0.00025924611183427386, + "loss": 0.8552, + "step": 5473 + }, + { + "epoch": 0.9765408973329766, + "grad_norm": 0.4754532277584076, + "learning_rate": 0.00025917609712540674, + "loss": 0.8698, + "step": 5474 + }, + { + "epoch": 0.9767192935509768, + "grad_norm": 0.5383017063140869, + "learning_rate": 0.0002591060816958529, + "loss": 1.0911, + "step": 5475 + }, + { + "epoch": 0.976897689768977, + "grad_norm": 0.50196772813797, + "learning_rate": 0.00025903606555111123, + "loss": 0.9058, + "step": 5476 + }, + { + "epoch": 0.977076085986977, + "grad_norm": 0.5124291181564331, + "learning_rate": 0.000258966048696681, + "loss": 0.9839, + "step": 5477 + }, + { + "epoch": 0.9772544822049772, + "grad_norm": 0.5031840801239014, + "learning_rate": 0.0002588960311380611, + "loss": 0.7175, + "step": 5478 + }, + { + "epoch": 0.9774328784229774, + "grad_norm": 0.46920791268348694, + "learning_rate": 0.0002588260128807507, + "loss": 0.8517, + "step": 5479 + }, + { + "epoch": 0.9776112746409776, + "grad_norm": 0.5168631672859192, + "learning_rate": 0.0002587559939302491, + "loss": 0.964, + "step": 5480 + }, + { + "epoch": 0.9777896708589778, + "grad_norm": 0.44529473781585693, + "learning_rate": 0.00025868597429205543, + "loss": 0.778, + "step": 5481 + }, + { + "epoch": 0.9779680670769779, + "grad_norm": 0.4642591178417206, + "learning_rate": 0.00025861595397166915, + "loss": 0.7513, + "step": 5482 + }, + { + "epoch": 0.9781464632949781, + "grad_norm": 0.47108370065689087, + "learning_rate": 0.00025854593297458956, + "loss": 0.6897, + "step": 5483 + }, + { + "epoch": 0.9783248595129783, + "grad_norm": 0.5786851048469543, + "learning_rate": 0.00025847591130631603, + "loss": 0.9704, + "step": 5484 + }, + { + "epoch": 0.9785032557309785, + "grad_norm": 0.4903102219104767, + "learning_rate": 0.0002584058889723481, + "loss": 0.9588, + "step": 5485 + }, + { + "epoch": 0.9786816519489787, + "grad_norm": 0.40852758288383484, + "learning_rate": 0.00025833586597818526, + "loss": 0.6194, + "step": 5486 + }, + { + "epoch": 0.9788600481669789, + "grad_norm": 0.42910122871398926, + "learning_rate": 0.00025826584232932704, + "loss": 0.7026, + "step": 5487 + }, + { + "epoch": 0.979038444384979, + "grad_norm": 0.6865221261978149, + "learning_rate": 0.00025819581803127316, + "loss": 0.932, + "step": 5488 + }, + { + "epoch": 0.9792168406029792, + "grad_norm": 0.44170790910720825, + "learning_rate": 0.0002581257930895233, + "loss": 0.7926, + "step": 5489 + }, + { + "epoch": 0.9793952368209794, + "grad_norm": 0.4942973256111145, + "learning_rate": 0.00025805576750957714, + "loss": 0.9616, + "step": 5490 + }, + { + "epoch": 0.9795736330389796, + "grad_norm": 0.5086950063705444, + "learning_rate": 0.0002579857412969345, + "loss": 0.9139, + "step": 5491 + }, + { + "epoch": 0.9797520292569798, + "grad_norm": 0.5349062085151672, + "learning_rate": 0.00025791571445709505, + "loss": 0.9468, + "step": 5492 + }, + { + "epoch": 0.9799304254749799, + "grad_norm": 0.5083227157592773, + "learning_rate": 0.0002578456869955589, + "loss": 0.9298, + "step": 5493 + }, + { + "epoch": 0.9801088216929801, + "grad_norm": 0.4308398962020874, + "learning_rate": 0.0002577756589178258, + "loss": 0.6871, + "step": 5494 + }, + { + "epoch": 0.9802872179109803, + "grad_norm": 0.3946779668331146, + "learning_rate": 0.0002577056302293958, + "loss": 0.7342, + "step": 5495 + }, + { + "epoch": 0.9804656141289805, + "grad_norm": 0.4864563047885895, + "learning_rate": 0.000257635600935769, + "loss": 0.9185, + "step": 5496 + }, + { + "epoch": 0.9806440103469807, + "grad_norm": 0.427357941865921, + "learning_rate": 0.00025756557104244534, + "loss": 0.7518, + "step": 5497 + }, + { + "epoch": 0.9808224065649809, + "grad_norm": 0.4161073565483093, + "learning_rate": 0.000257495540554925, + "loss": 0.5618, + "step": 5498 + }, + { + "epoch": 0.981000802782981, + "grad_norm": 0.437459796667099, + "learning_rate": 0.00025742550947870806, + "loss": 0.638, + "step": 5499 + }, + { + "epoch": 0.9811791990009812, + "grad_norm": 0.4389030635356903, + "learning_rate": 0.00025735547781929484, + "loss": 0.6333, + "step": 5500 + }, + { + "epoch": 0.9813575952189814, + "grad_norm": 0.47596055269241333, + "learning_rate": 0.00025728544558218557, + "loss": 0.7331, + "step": 5501 + }, + { + "epoch": 0.9815359914369816, + "grad_norm": 0.49048876762390137, + "learning_rate": 0.00025721541277288053, + "loss": 0.7336, + "step": 5502 + }, + { + "epoch": 0.9817143876549818, + "grad_norm": 0.4905683398246765, + "learning_rate": 0.0002571453793968801, + "loss": 0.8261, + "step": 5503 + }, + { + "epoch": 0.9818927838729818, + "grad_norm": 0.46471673250198364, + "learning_rate": 0.0002570753454596846, + "loss": 0.8081, + "step": 5504 + }, + { + "epoch": 0.982071180090982, + "grad_norm": 0.49744272232055664, + "learning_rate": 0.00025700531096679456, + "loss": 0.8207, + "step": 5505 + }, + { + "epoch": 0.9822495763089822, + "grad_norm": 0.4972292184829712, + "learning_rate": 0.0002569352759237104, + "loss": 1.0645, + "step": 5506 + }, + { + "epoch": 0.9824279725269824, + "grad_norm": 0.47991102933883667, + "learning_rate": 0.00025686524033593263, + "loss": 0.8118, + "step": 5507 + }, + { + "epoch": 0.9826063687449826, + "grad_norm": 0.41838109493255615, + "learning_rate": 0.00025679520420896184, + "loss": 0.7066, + "step": 5508 + }, + { + "epoch": 0.9827847649629828, + "grad_norm": 0.40462827682495117, + "learning_rate": 0.00025672516754829866, + "loss": 0.6267, + "step": 5509 + }, + { + "epoch": 0.9829631611809829, + "grad_norm": 0.4698152542114258, + "learning_rate": 0.00025665513035944373, + "loss": 0.9033, + "step": 5510 + }, + { + "epoch": 0.9831415573989831, + "grad_norm": 0.4284512400627136, + "learning_rate": 0.0002565850926478977, + "loss": 0.7329, + "step": 5511 + }, + { + "epoch": 0.9833199536169833, + "grad_norm": 0.4176551401615143, + "learning_rate": 0.0002565150544191613, + "loss": 0.7597, + "step": 5512 + }, + { + "epoch": 0.9834983498349835, + "grad_norm": 0.46959561109542847, + "learning_rate": 0.00025644501567873533, + "loss": 0.7277, + "step": 5513 + }, + { + "epoch": 0.9836767460529837, + "grad_norm": 0.4930439591407776, + "learning_rate": 0.0002563749764321207, + "loss": 0.8034, + "step": 5514 + }, + { + "epoch": 0.9838551422709838, + "grad_norm": 0.4476320743560791, + "learning_rate": 0.0002563049366848181, + "loss": 0.7959, + "step": 5515 + }, + { + "epoch": 0.984033538488984, + "grad_norm": 0.7321682572364807, + "learning_rate": 0.00025623489644232845, + "loss": 0.7308, + "step": 5516 + }, + { + "epoch": 0.9842119347069842, + "grad_norm": 0.447443425655365, + "learning_rate": 0.00025616485571015277, + "loss": 0.7, + "step": 5517 + }, + { + "epoch": 0.9843903309249844, + "grad_norm": 0.4954073131084442, + "learning_rate": 0.0002560948144937919, + "loss": 0.8115, + "step": 5518 + }, + { + "epoch": 0.9845687271429846, + "grad_norm": 0.5299807190895081, + "learning_rate": 0.00025602477279874697, + "loss": 0.9499, + "step": 5519 + }, + { + "epoch": 0.9847471233609848, + "grad_norm": 0.4475019574165344, + "learning_rate": 0.000255954730630519, + "loss": 0.7138, + "step": 5520 + }, + { + "epoch": 0.9849255195789849, + "grad_norm": 0.4217219650745392, + "learning_rate": 0.000255884687994609, + "loss": 0.697, + "step": 5521 + }, + { + "epoch": 0.9851039157969851, + "grad_norm": 0.4400602877140045, + "learning_rate": 0.0002558146448965182, + "loss": 0.7563, + "step": 5522 + }, + { + "epoch": 0.9852823120149853, + "grad_norm": 0.42823049426078796, + "learning_rate": 0.0002557446013417477, + "loss": 0.6568, + "step": 5523 + }, + { + "epoch": 0.9854607082329855, + "grad_norm": 0.4775795638561249, + "learning_rate": 0.00025567455733579867, + "loss": 0.6586, + "step": 5524 + }, + { + "epoch": 0.9856391044509857, + "grad_norm": 0.4584738314151764, + "learning_rate": 0.00025560451288417224, + "loss": 0.7584, + "step": 5525 + }, + { + "epoch": 0.9858175006689858, + "grad_norm": 0.6482552289962769, + "learning_rate": 0.00025553446799236987, + "loss": 0.9836, + "step": 5526 + }, + { + "epoch": 0.985995896886986, + "grad_norm": 2.181297779083252, + "learning_rate": 0.00025546442266589274, + "loss": 0.9366, + "step": 5527 + }, + { + "epoch": 0.9861742931049862, + "grad_norm": 0.4901970326900482, + "learning_rate": 0.0002553943769102422, + "loss": 0.7734, + "step": 5528 + }, + { + "epoch": 0.9863526893229864, + "grad_norm": 0.5219231843948364, + "learning_rate": 0.00025532433073091967, + "loss": 0.7347, + "step": 5529 + }, + { + "epoch": 0.9865310855409866, + "grad_norm": 0.5067287087440491, + "learning_rate": 0.0002552542841334265, + "loss": 0.8365, + "step": 5530 + }, + { + "epoch": 0.9867094817589868, + "grad_norm": 3.1279547214508057, + "learning_rate": 0.0002551842371232641, + "loss": 0.8209, + "step": 5531 + }, + { + "epoch": 0.9868878779769868, + "grad_norm": 0.48862382769584656, + "learning_rate": 0.00025511418970593393, + "loss": 0.8062, + "step": 5532 + }, + { + "epoch": 0.987066274194987, + "grad_norm": 1.2135387659072876, + "learning_rate": 0.0002550441418869374, + "loss": 0.8549, + "step": 5533 + }, + { + "epoch": 0.9872446704129872, + "grad_norm": 0.8009121417999268, + "learning_rate": 0.00025497409367177627, + "loss": 0.8502, + "step": 5534 + }, + { + "epoch": 0.9874230666309874, + "grad_norm": 0.6405108571052551, + "learning_rate": 0.0002549040450659519, + "loss": 0.9773, + "step": 5535 + }, + { + "epoch": 0.9876014628489876, + "grad_norm": 3.273838758468628, + "learning_rate": 0.00025483399607496604, + "loss": 0.9752, + "step": 5536 + }, + { + "epoch": 0.9877798590669877, + "grad_norm": 0.4358234703540802, + "learning_rate": 0.0002547639467043201, + "loss": 0.6485, + "step": 5537 + }, + { + "epoch": 0.9879582552849879, + "grad_norm": 0.5305948853492737, + "learning_rate": 0.00025469389695951595, + "loss": 0.8997, + "step": 5538 + }, + { + "epoch": 0.9881366515029881, + "grad_norm": 0.7307850122451782, + "learning_rate": 0.0002546238468460551, + "loss": 0.7562, + "step": 5539 + }, + { + "epoch": 0.9883150477209883, + "grad_norm": 0.47485119104385376, + "learning_rate": 0.0002545537963694392, + "loss": 0.7355, + "step": 5540 + }, + { + "epoch": 0.9884934439389885, + "grad_norm": 0.5570200085639954, + "learning_rate": 0.0002544837455351702, + "loss": 0.8597, + "step": 5541 + }, + { + "epoch": 0.9886718401569887, + "grad_norm": 0.592291533946991, + "learning_rate": 0.00025441369434874977, + "loss": 1.0744, + "step": 5542 + }, + { + "epoch": 0.9888502363749888, + "grad_norm": 0.487498939037323, + "learning_rate": 0.0002543436428156796, + "loss": 0.8186, + "step": 5543 + }, + { + "epoch": 0.989028632592989, + "grad_norm": 0.475384920835495, + "learning_rate": 0.0002542735909414617, + "loss": 0.6558, + "step": 5544 + }, + { + "epoch": 0.9892070288109892, + "grad_norm": 0.44877302646636963, + "learning_rate": 0.00025420353873159774, + "loss": 0.7219, + "step": 5545 + }, + { + "epoch": 0.9893854250289894, + "grad_norm": 0.49043965339660645, + "learning_rate": 0.0002541334861915897, + "loss": 0.8794, + "step": 5546 + }, + { + "epoch": 0.9895638212469896, + "grad_norm": 0.47729647159576416, + "learning_rate": 0.00025406343332693934, + "loss": 0.8159, + "step": 5547 + }, + { + "epoch": 0.9897422174649897, + "grad_norm": 0.48762455582618713, + "learning_rate": 0.0002539933801431487, + "loss": 0.7083, + "step": 5548 + }, + { + "epoch": 0.9899206136829899, + "grad_norm": 0.4554971158504486, + "learning_rate": 0.0002539233266457198, + "loss": 0.6679, + "step": 5549 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.5924381613731384, + "learning_rate": 0.0002538532728401544, + "loss": 0.9812, + "step": 5550 + }, + { + "epoch": 0.9902774061189903, + "grad_norm": 0.5297411680221558, + "learning_rate": 0.0002537832187319547, + "loss": 0.7363, + "step": 5551 + }, + { + "epoch": 0.9904558023369905, + "grad_norm": 0.5259188413619995, + "learning_rate": 0.00025371316432662254, + "loss": 0.924, + "step": 5552 + }, + { + "epoch": 0.9906341985549907, + "grad_norm": 0.5142114162445068, + "learning_rate": 0.0002536431096296601, + "loss": 0.94, + "step": 5553 + }, + { + "epoch": 0.9908125947729908, + "grad_norm": 0.4876602590084076, + "learning_rate": 0.00025357305464656943, + "loss": 0.9249, + "step": 5554 + }, + { + "epoch": 0.990990990990991, + "grad_norm": 0.4915853440761566, + "learning_rate": 0.00025350299938285253, + "loss": 1.1248, + "step": 5555 + }, + { + "epoch": 0.9911693872089912, + "grad_norm": 0.5007473826408386, + "learning_rate": 0.0002534329438440116, + "loss": 1.0163, + "step": 5556 + }, + { + "epoch": 0.9913477834269914, + "grad_norm": 0.4649515151977539, + "learning_rate": 0.0002533628880355487, + "loss": 0.8398, + "step": 5557 + }, + { + "epoch": 0.9915261796449916, + "grad_norm": 0.4449246823787689, + "learning_rate": 0.0002532928319629661, + "loss": 0.8213, + "step": 5558 + }, + { + "epoch": 0.9917045758629917, + "grad_norm": 0.47130802273750305, + "learning_rate": 0.00025322277563176584, + "loss": 0.8082, + "step": 5559 + }, + { + "epoch": 0.9918829720809919, + "grad_norm": 0.5442295670509338, + "learning_rate": 0.00025315271904745014, + "loss": 0.9156, + "step": 5560 + }, + { + "epoch": 0.992061368298992, + "grad_norm": 0.5047063827514648, + "learning_rate": 0.0002530826622155213, + "loss": 0.8635, + "step": 5561 + }, + { + "epoch": 0.9922397645169923, + "grad_norm": 0.504047155380249, + "learning_rate": 0.00025301260514148146, + "loss": 0.9712, + "step": 5562 + }, + { + "epoch": 0.9924181607349924, + "grad_norm": 0.46802255511283875, + "learning_rate": 0.0002529425478308329, + "loss": 0.8758, + "step": 5563 + }, + { + "epoch": 0.9925965569529926, + "grad_norm": 0.5220746397972107, + "learning_rate": 0.00025287249028907796, + "loss": 1.001, + "step": 5564 + }, + { + "epoch": 0.9927749531709927, + "grad_norm": 0.43903014063835144, + "learning_rate": 0.0002528024325217188, + "loss": 0.6244, + "step": 5565 + }, + { + "epoch": 0.9929533493889929, + "grad_norm": 0.4588782787322998, + "learning_rate": 0.0002527323745342578, + "loss": 0.6223, + "step": 5566 + }, + { + "epoch": 0.9931317456069931, + "grad_norm": 0.501520574092865, + "learning_rate": 0.0002526623163321973, + "loss": 0.9619, + "step": 5567 + }, + { + "epoch": 0.9933101418249933, + "grad_norm": 0.4871865510940552, + "learning_rate": 0.0002525922579210396, + "loss": 0.9316, + "step": 5568 + }, + { + "epoch": 0.9934885380429935, + "grad_norm": 0.4477481544017792, + "learning_rate": 0.0002525221993062871, + "loss": 0.7816, + "step": 5569 + }, + { + "epoch": 0.9936669342609936, + "grad_norm": 0.4530073404312134, + "learning_rate": 0.00025245214049344225, + "loss": 0.7685, + "step": 5570 + }, + { + "epoch": 0.9938453304789938, + "grad_norm": 0.4744289815425873, + "learning_rate": 0.0002523820814880072, + "loss": 0.7734, + "step": 5571 + }, + { + "epoch": 0.994023726696994, + "grad_norm": 0.4457191526889801, + "learning_rate": 0.0002523120222954845, + "loss": 0.895, + "step": 5572 + }, + { + "epoch": 0.9942021229149942, + "grad_norm": 0.4728289842605591, + "learning_rate": 0.00025224196292137664, + "loss": 0.7997, + "step": 5573 + }, + { + "epoch": 0.9943805191329944, + "grad_norm": 0.4260111451148987, + "learning_rate": 0.00025217190337118594, + "loss": 0.7461, + "step": 5574 + }, + { + "epoch": 0.9945589153509946, + "grad_norm": 0.4607539176940918, + "learning_rate": 0.0002521018436504149, + "loss": 0.7959, + "step": 5575 + }, + { + "epoch": 0.9947373115689947, + "grad_norm": 0.6200941801071167, + "learning_rate": 0.000252031783764566, + "loss": 0.8551, + "step": 5576 + }, + { + "epoch": 0.9949157077869949, + "grad_norm": 0.4465862810611725, + "learning_rate": 0.0002519617237191416, + "loss": 0.7163, + "step": 5577 + }, + { + "epoch": 0.9950941040049951, + "grad_norm": 0.4562668800354004, + "learning_rate": 0.00025189166351964425, + "loss": 0.7838, + "step": 5578 + }, + { + "epoch": 0.9952725002229953, + "grad_norm": 0.45672228932380676, + "learning_rate": 0.0002518216031715765, + "loss": 0.7227, + "step": 5579 + }, + { + "epoch": 0.9954508964409955, + "grad_norm": 0.4691329002380371, + "learning_rate": 0.0002517515426804408, + "loss": 0.9586, + "step": 5580 + }, + { + "epoch": 0.9956292926589956, + "grad_norm": 1.6988149881362915, + "learning_rate": 0.00025168148205173974, + "loss": 0.7888, + "step": 5581 + }, + { + "epoch": 0.9958076888769958, + "grad_norm": 0.46691417694091797, + "learning_rate": 0.0002516114212909758, + "loss": 0.6838, + "step": 5582 + }, + { + "epoch": 0.995986085094996, + "grad_norm": 0.45711347460746765, + "learning_rate": 0.0002515413604036515, + "loss": 0.7862, + "step": 5583 + }, + { + "epoch": 0.9961644813129962, + "grad_norm": 0.49638915061950684, + "learning_rate": 0.0002514712993952694, + "loss": 0.7689, + "step": 5584 + }, + { + "epoch": 0.9963428775309964, + "grad_norm": 0.4389818608760834, + "learning_rate": 0.0002514012382713321, + "loss": 0.6992, + "step": 5585 + }, + { + "epoch": 0.9965212737489966, + "grad_norm": 0.4768664538860321, + "learning_rate": 0.00025133117703734207, + "loss": 0.6965, + "step": 5586 + }, + { + "epoch": 0.9966996699669967, + "grad_norm": 0.5143462419509888, + "learning_rate": 0.0002512611156988021, + "loss": 1.0382, + "step": 5587 + }, + { + "epoch": 0.9968780661849969, + "grad_norm": 0.47159501910209656, + "learning_rate": 0.00025119105426121455, + "loss": 0.7476, + "step": 5588 + }, + { + "epoch": 0.9970564624029971, + "grad_norm": 0.4691692292690277, + "learning_rate": 0.0002511209927300822, + "loss": 0.8535, + "step": 5589 + }, + { + "epoch": 0.9972348586209973, + "grad_norm": 0.44228291511535645, + "learning_rate": 0.00025105093111090756, + "loss": 0.777, + "step": 5590 + }, + { + "epoch": 0.9974132548389975, + "grad_norm": 0.43507784605026245, + "learning_rate": 0.00025098086940919317, + "loss": 0.6588, + "step": 5591 + }, + { + "epoch": 0.9975916510569975, + "grad_norm": 0.4751743972301483, + "learning_rate": 0.00025091080763044177, + "loss": 0.6748, + "step": 5592 + }, + { + "epoch": 0.9977700472749977, + "grad_norm": 4.461918354034424, + "learning_rate": 0.0002508407457801559, + "loss": 0.669, + "step": 5593 + }, + { + "epoch": 0.9979484434929979, + "grad_norm": 0.8328418135643005, + "learning_rate": 0.00025077068386383816, + "loss": 0.8627, + "step": 5594 + }, + { + "epoch": 0.9981268397109981, + "grad_norm": 1.091975212097168, + "learning_rate": 0.00025070062188699136, + "loss": 0.8833, + "step": 5595 + }, + { + "epoch": 0.9983052359289983, + "grad_norm": 0.4713873565196991, + "learning_rate": 0.00025063055985511794, + "loss": 0.6602, + "step": 5596 + }, + { + "epoch": 0.9984836321469985, + "grad_norm": 0.5721749663352966, + "learning_rate": 0.0002505604977737207, + "loss": 0.8543, + "step": 5597 + }, + { + "epoch": 0.9986620283649986, + "grad_norm": 0.5203986167907715, + "learning_rate": 0.00025049043564830207, + "loss": 0.7544, + "step": 5598 + }, + { + "epoch": 0.9988404245829988, + "grad_norm": 0.7062872052192688, + "learning_rate": 0.00025042037348436497, + "loss": 0.8659, + "step": 5599 + }, + { + "epoch": 0.999018820800999, + "grad_norm": 0.476829469203949, + "learning_rate": 0.00025035031128741185, + "loss": 0.7615, + "step": 5600 + }, + { + "epoch": 0.9991972170189992, + "grad_norm": 0.42391034960746765, + "learning_rate": 0.0002502802490629454, + "loss": 0.5286, + "step": 5601 + }, + { + "epoch": 0.9993756132369994, + "grad_norm": 0.5202667713165283, + "learning_rate": 0.0002502101868164684, + "loss": 0.9911, + "step": 5602 + }, + { + "epoch": 0.9995540094549995, + "grad_norm": 0.47505563497543335, + "learning_rate": 0.0002501401245534834, + "loss": 0.9305, + "step": 5603 + }, + { + "epoch": 0.9997324056729997, + "grad_norm": 0.49714621901512146, + "learning_rate": 0.000250070062279493, + "loss": 0.8849, + "step": 5604 + }, + { + "epoch": 0.9999108018909999, + "grad_norm": 0.5128735899925232, + "learning_rate": 0.00025, + "loss": 0.8105, + "step": 5605 + }, + { + "epoch": 1.0, + "grad_norm": 0.8229463696479797, + "learning_rate": 0.000249929937720507, + "loss": 0.5342, + "step": 5606 + }, + { + "epoch": 1.0001783962180002, + "grad_norm": 0.48117566108703613, + "learning_rate": 0.00024985987544651667, + "loss": 0.6871, + "step": 5607 + }, + { + "epoch": 1.0003567924360004, + "grad_norm": 0.4833311438560486, + "learning_rate": 0.0002497898131835316, + "loss": 0.6285, + "step": 5608 + }, + { + "epoch": 1.0005351886540006, + "grad_norm": 0.5113769769668579, + "learning_rate": 0.0002497197509370546, + "loss": 0.7797, + "step": 5609 + }, + { + "epoch": 1.0007135848720008, + "grad_norm": 0.5292689204216003, + "learning_rate": 0.0002496496887125882, + "loss": 0.8098, + "step": 5610 + }, + { + "epoch": 1.000891981090001, + "grad_norm": 0.5784763097763062, + "learning_rate": 0.0002495796265156351, + "loss": 0.8951, + "step": 5611 + }, + { + "epoch": 1.001070377308001, + "grad_norm": 0.516196608543396, + "learning_rate": 0.000249509564351698, + "loss": 0.8567, + "step": 5612 + }, + { + "epoch": 1.0012487735260012, + "grad_norm": 0.49342212080955505, + "learning_rate": 0.0002494395022262793, + "loss": 0.6955, + "step": 5613 + }, + { + "epoch": 1.0014271697440014, + "grad_norm": 0.45294061303138733, + "learning_rate": 0.00024936944014488207, + "loss": 0.6438, + "step": 5614 + }, + { + "epoch": 1.0016055659620016, + "grad_norm": 0.5231368541717529, + "learning_rate": 0.0002492993781130087, + "loss": 0.7458, + "step": 5615 + }, + { + "epoch": 1.0017839621800018, + "grad_norm": 0.49573010206222534, + "learning_rate": 0.0002492293161361618, + "loss": 0.6514, + "step": 5616 + }, + { + "epoch": 1.001962358398002, + "grad_norm": 0.4060691297054291, + "learning_rate": 0.00024915925421984417, + "loss": 0.5311, + "step": 5617 + }, + { + "epoch": 1.0021407546160022, + "grad_norm": 0.5073151588439941, + "learning_rate": 0.0002490891923695583, + "loss": 0.7972, + "step": 5618 + }, + { + "epoch": 1.0023191508340024, + "grad_norm": 0.5470726490020752, + "learning_rate": 0.00024901913059080684, + "loss": 0.6951, + "step": 5619 + }, + { + "epoch": 1.0024975470520026, + "grad_norm": 0.4746871888637543, + "learning_rate": 0.0002489490688890925, + "loss": 0.678, + "step": 5620 + }, + { + "epoch": 1.0026759432700028, + "grad_norm": 0.6704702377319336, + "learning_rate": 0.0002488790072699178, + "loss": 0.9827, + "step": 5621 + }, + { + "epoch": 1.002854339488003, + "grad_norm": 0.5733181834220886, + "learning_rate": 0.00024880894573878546, + "loss": 0.6582, + "step": 5622 + }, + { + "epoch": 1.003032735706003, + "grad_norm": 0.4528675377368927, + "learning_rate": 0.00024873888430119794, + "loss": 0.6188, + "step": 5623 + }, + { + "epoch": 1.0032111319240031, + "grad_norm": 0.4826764166355133, + "learning_rate": 0.00024866882296265794, + "loss": 0.7185, + "step": 5624 + }, + { + "epoch": 1.0033895281420033, + "grad_norm": 0.4640424847602844, + "learning_rate": 0.000248598761728668, + "loss": 0.5546, + "step": 5625 + }, + { + "epoch": 1.0035679243600035, + "grad_norm": 0.57952880859375, + "learning_rate": 0.0002485287006047307, + "loss": 0.5323, + "step": 5626 + }, + { + "epoch": 1.0037463205780037, + "grad_norm": 0.4803033173084259, + "learning_rate": 0.0002484586395963486, + "loss": 0.7079, + "step": 5627 + }, + { + "epoch": 1.003924716796004, + "grad_norm": 0.47530657052993774, + "learning_rate": 0.0002483885787090242, + "loss": 0.7073, + "step": 5628 + }, + { + "epoch": 1.0041031130140041, + "grad_norm": 0.5128652453422546, + "learning_rate": 0.00024831851794826027, + "loss": 0.7797, + "step": 5629 + }, + { + "epoch": 1.0042815092320043, + "grad_norm": 0.4710089862346649, + "learning_rate": 0.0002482484573195592, + "loss": 0.7584, + "step": 5630 + }, + { + "epoch": 1.0044599054500045, + "grad_norm": 0.5013584494590759, + "learning_rate": 0.0002481783968284235, + "loss": 0.7168, + "step": 5631 + }, + { + "epoch": 1.0046383016680047, + "grad_norm": 0.48483264446258545, + "learning_rate": 0.00024810833648035576, + "loss": 0.7159, + "step": 5632 + }, + { + "epoch": 1.004816697886005, + "grad_norm": 0.43199649453163147, + "learning_rate": 0.00024803827628085845, + "loss": 0.6508, + "step": 5633 + }, + { + "epoch": 1.004995094104005, + "grad_norm": 0.48384973406791687, + "learning_rate": 0.00024796821623543407, + "loss": 0.8085, + "step": 5634 + }, + { + "epoch": 1.005173490322005, + "grad_norm": 0.5814610719680786, + "learning_rate": 0.00024789815634958517, + "loss": 0.7655, + "step": 5635 + }, + { + "epoch": 1.0053518865400053, + "grad_norm": 0.4795527160167694, + "learning_rate": 0.0002478280966288141, + "loss": 0.7431, + "step": 5636 + }, + { + "epoch": 1.0055302827580055, + "grad_norm": 0.4990082383155823, + "learning_rate": 0.0002477580370786234, + "loss": 0.7987, + "step": 5637 + }, + { + "epoch": 1.0057086789760057, + "grad_norm": 0.5206576585769653, + "learning_rate": 0.0002476879777045155, + "loss": 0.7974, + "step": 5638 + }, + { + "epoch": 1.0058870751940059, + "grad_norm": 0.38653305172920227, + "learning_rate": 0.00024761791851199286, + "loss": 0.5669, + "step": 5639 + }, + { + "epoch": 1.006065471412006, + "grad_norm": 0.49176260828971863, + "learning_rate": 0.0002475478595065578, + "loss": 0.7417, + "step": 5640 + }, + { + "epoch": 1.0062438676300063, + "grad_norm": 0.4860985279083252, + "learning_rate": 0.0002474778006937129, + "loss": 0.5978, + "step": 5641 + }, + { + "epoch": 1.0064222638480065, + "grad_norm": 0.4277432858943939, + "learning_rate": 0.0002474077420789604, + "loss": 0.4554, + "step": 5642 + }, + { + "epoch": 1.0066006600660067, + "grad_norm": 0.43579888343811035, + "learning_rate": 0.0002473376836678028, + "loss": 0.6738, + "step": 5643 + }, + { + "epoch": 1.0067790562840069, + "grad_norm": 0.5105220079421997, + "learning_rate": 0.00024726762546574215, + "loss": 0.9275, + "step": 5644 + }, + { + "epoch": 1.0069574525020069, + "grad_norm": 0.4739680886268616, + "learning_rate": 0.0002471975674782812, + "loss": 0.7133, + "step": 5645 + }, + { + "epoch": 1.007135848720007, + "grad_norm": 0.4873730540275574, + "learning_rate": 0.00024712750971092205, + "loss": 0.7338, + "step": 5646 + }, + { + "epoch": 1.0073142449380073, + "grad_norm": 0.45006611943244934, + "learning_rate": 0.0002470574521691671, + "loss": 0.6444, + "step": 5647 + }, + { + "epoch": 1.0074926411560075, + "grad_norm": 0.46951183676719666, + "learning_rate": 0.0002469873948585186, + "loss": 0.7305, + "step": 5648 + }, + { + "epoch": 1.0076710373740076, + "grad_norm": 0.4490543007850647, + "learning_rate": 0.00024691733778447875, + "loss": 0.6245, + "step": 5649 + }, + { + "epoch": 1.0078494335920078, + "grad_norm": 0.5618079900741577, + "learning_rate": 0.00024684728095254987, + "loss": 0.787, + "step": 5650 + }, + { + "epoch": 1.008027829810008, + "grad_norm": 7.213865756988525, + "learning_rate": 0.0002467772243682342, + "loss": 0.5768, + "step": 5651 + }, + { + "epoch": 1.0082062260280082, + "grad_norm": 0.5140239000320435, + "learning_rate": 0.000246707168037034, + "loss": 0.7213, + "step": 5652 + }, + { + "epoch": 1.0083846222460084, + "grad_norm": 0.5297546982765198, + "learning_rate": 0.00024663711196445135, + "loss": 0.7577, + "step": 5653 + }, + { + "epoch": 1.0085630184640086, + "grad_norm": 0.5135631561279297, + "learning_rate": 0.00024656705615598844, + "loss": 0.8142, + "step": 5654 + }, + { + "epoch": 1.0087414146820088, + "grad_norm": 0.4576781094074249, + "learning_rate": 0.0002464970006171475, + "loss": 0.6083, + "step": 5655 + }, + { + "epoch": 1.0089198109000088, + "grad_norm": 0.5894820690155029, + "learning_rate": 0.0002464269453534307, + "loss": 0.8443, + "step": 5656 + }, + { + "epoch": 1.009098207118009, + "grad_norm": 0.5274859070777893, + "learning_rate": 0.00024635689037034, + "loss": 0.9412, + "step": 5657 + }, + { + "epoch": 1.0092766033360092, + "grad_norm": 0.46422895789146423, + "learning_rate": 0.0002462868356733775, + "loss": 0.7357, + "step": 5658 + }, + { + "epoch": 1.0094549995540094, + "grad_norm": 0.49951404333114624, + "learning_rate": 0.0002462167812680453, + "loss": 0.8181, + "step": 5659 + }, + { + "epoch": 1.0096333957720096, + "grad_norm": 0.48407530784606934, + "learning_rate": 0.00024614672715984556, + "loss": 0.6384, + "step": 5660 + }, + { + "epoch": 1.0098117919900098, + "grad_norm": 1.810511827468872, + "learning_rate": 0.0002460766733542803, + "loss": 0.7134, + "step": 5661 + }, + { + "epoch": 1.00999018820801, + "grad_norm": 0.4726197421550751, + "learning_rate": 0.0002460066198568513, + "loss": 0.6611, + "step": 5662 + }, + { + "epoch": 1.0101685844260102, + "grad_norm": 0.512795090675354, + "learning_rate": 0.0002459365666730607, + "loss": 0.7829, + "step": 5663 + }, + { + "epoch": 1.0103469806440104, + "grad_norm": 0.6051098108291626, + "learning_rate": 0.0002458665138084104, + "loss": 0.7989, + "step": 5664 + }, + { + "epoch": 1.0105253768620106, + "grad_norm": 0.5258930921554565, + "learning_rate": 0.00024579646126840233, + "loss": 0.5785, + "step": 5665 + }, + { + "epoch": 1.0107037730800108, + "grad_norm": 0.41359806060791016, + "learning_rate": 0.0002457264090585384, + "loss": 0.5786, + "step": 5666 + }, + { + "epoch": 1.0108821692980108, + "grad_norm": 0.4931972622871399, + "learning_rate": 0.0002456563571843204, + "loss": 0.8471, + "step": 5667 + }, + { + "epoch": 1.011060565516011, + "grad_norm": 0.49193498492240906, + "learning_rate": 0.0002455863056512503, + "loss": 0.8302, + "step": 5668 + }, + { + "epoch": 1.0112389617340112, + "grad_norm": 0.5347012877464294, + "learning_rate": 0.0002455162544648299, + "loss": 1.1447, + "step": 5669 + }, + { + "epoch": 1.0114173579520114, + "grad_norm": 0.46643638610839844, + "learning_rate": 0.00024544620363056084, + "loss": 0.5934, + "step": 5670 + }, + { + "epoch": 1.0115957541700116, + "grad_norm": 0.6032198071479797, + "learning_rate": 0.00024537615315394504, + "loss": 0.5384, + "step": 5671 + }, + { + "epoch": 1.0117741503880118, + "grad_norm": 0.5089538097381592, + "learning_rate": 0.00024530610304048417, + "loss": 0.8041, + "step": 5672 + }, + { + "epoch": 1.011952546606012, + "grad_norm": 0.5048344731330872, + "learning_rate": 0.00024523605329567996, + "loss": 0.7037, + "step": 5673 + }, + { + "epoch": 1.0121309428240122, + "grad_norm": 0.40389716625213623, + "learning_rate": 0.00024516600392503397, + "loss": 0.5336, + "step": 5674 + }, + { + "epoch": 1.0123093390420124, + "grad_norm": 0.4871548116207123, + "learning_rate": 0.0002450959549340481, + "loss": 0.6761, + "step": 5675 + }, + { + "epoch": 1.0124877352600126, + "grad_norm": 0.49612149596214294, + "learning_rate": 0.00024502590632822374, + "loss": 0.7691, + "step": 5676 + }, + { + "epoch": 1.0126661314780128, + "grad_norm": 0.4323468506336212, + "learning_rate": 0.0002449558581130626, + "loss": 0.5754, + "step": 5677 + }, + { + "epoch": 1.0128445276960127, + "grad_norm": 0.4288893938064575, + "learning_rate": 0.00024488581029406614, + "loss": 0.6697, + "step": 5678 + }, + { + "epoch": 1.013022923914013, + "grad_norm": 0.4768436551094055, + "learning_rate": 0.00024481576287673596, + "loss": 0.657, + "step": 5679 + }, + { + "epoch": 1.0132013201320131, + "grad_norm": 0.4346489906311035, + "learning_rate": 0.00024474571586657353, + "loss": 0.663, + "step": 5680 + }, + { + "epoch": 1.0133797163500133, + "grad_norm": 0.5226327180862427, + "learning_rate": 0.0002446756692690804, + "loss": 0.9833, + "step": 5681 + }, + { + "epoch": 1.0135581125680135, + "grad_norm": 0.5677915811538696, + "learning_rate": 0.0002446056230897578, + "loss": 0.9305, + "step": 5682 + }, + { + "epoch": 1.0137365087860137, + "grad_norm": 0.5274161100387573, + "learning_rate": 0.0002445355773341073, + "loss": 0.7953, + "step": 5683 + }, + { + "epoch": 1.013914905004014, + "grad_norm": 0.557561457157135, + "learning_rate": 0.0002444655320076302, + "loss": 0.9001, + "step": 5684 + }, + { + "epoch": 1.0140933012220141, + "grad_norm": 0.3999897241592407, + "learning_rate": 0.0002443954871158278, + "loss": 0.6062, + "step": 5685 + }, + { + "epoch": 1.0142716974400143, + "grad_norm": 0.6423475742340088, + "learning_rate": 0.00024432544266420145, + "loss": 0.8672, + "step": 5686 + }, + { + "epoch": 1.0144500936580145, + "grad_norm": 0.5340455770492554, + "learning_rate": 0.0002442553986582524, + "loss": 0.7465, + "step": 5687 + }, + { + "epoch": 1.0146284898760147, + "grad_norm": 0.4833824634552002, + "learning_rate": 0.00024418535510348184, + "loss": 0.7415, + "step": 5688 + }, + { + "epoch": 1.0148068860940147, + "grad_norm": 0.452972948551178, + "learning_rate": 0.00024411531200539102, + "loss": 0.6568, + "step": 5689 + }, + { + "epoch": 1.014985282312015, + "grad_norm": 0.45635709166526794, + "learning_rate": 0.00024404526936948098, + "loss": 0.5435, + "step": 5690 + }, + { + "epoch": 1.015163678530015, + "grad_norm": 0.5237349271774292, + "learning_rate": 0.00024397522720125302, + "loss": 0.7549, + "step": 5691 + }, + { + "epoch": 1.0153420747480153, + "grad_norm": 0.627680778503418, + "learning_rate": 0.00024390518550620807, + "loss": 0.6618, + "step": 5692 + }, + { + "epoch": 1.0155204709660155, + "grad_norm": 0.5116132497787476, + "learning_rate": 0.00024383514428984727, + "loss": 0.7378, + "step": 5693 + }, + { + "epoch": 1.0156988671840157, + "grad_norm": 1.0455267429351807, + "learning_rate": 0.00024376510355767161, + "loss": 0.8657, + "step": 5694 + }, + { + "epoch": 1.015877263402016, + "grad_norm": 0.5515667200088501, + "learning_rate": 0.000243695063315182, + "loss": 0.7739, + "step": 5695 + }, + { + "epoch": 1.016055659620016, + "grad_norm": 0.5279601812362671, + "learning_rate": 0.0002436250235678794, + "loss": 0.7465, + "step": 5696 + }, + { + "epoch": 1.0162340558380163, + "grad_norm": 0.4827010929584503, + "learning_rate": 0.00024355498432126468, + "loss": 0.7674, + "step": 5697 + }, + { + "epoch": 1.0164124520560165, + "grad_norm": 0.5047492384910583, + "learning_rate": 0.00024348494558083873, + "loss": 0.7607, + "step": 5698 + }, + { + "epoch": 1.0165908482740167, + "grad_norm": 0.4985560476779938, + "learning_rate": 0.00024341490735210237, + "loss": 0.8236, + "step": 5699 + }, + { + "epoch": 1.0167692444920167, + "grad_norm": 0.48609042167663574, + "learning_rate": 0.00024334486964055634, + "loss": 0.7264, + "step": 5700 + }, + { + "epoch": 1.0169476407100169, + "grad_norm": 0.4970250129699707, + "learning_rate": 0.00024327483245170138, + "loss": 0.6943, + "step": 5701 + }, + { + "epoch": 1.017126036928017, + "grad_norm": 0.5171718001365662, + "learning_rate": 0.00024320479579103825, + "loss": 0.6666, + "step": 5702 + }, + { + "epoch": 1.0173044331460173, + "grad_norm": 0.48517653346061707, + "learning_rate": 0.00024313475966406746, + "loss": 0.7518, + "step": 5703 + }, + { + "epoch": 1.0174828293640175, + "grad_norm": 0.5177552700042725, + "learning_rate": 0.0002430647240762897, + "loss": 0.6988, + "step": 5704 + }, + { + "epoch": 1.0176612255820177, + "grad_norm": 0.4849720597267151, + "learning_rate": 0.00024299468903320542, + "loss": 0.6417, + "step": 5705 + }, + { + "epoch": 1.0178396218000179, + "grad_norm": 0.5243503451347351, + "learning_rate": 0.00024292465454031536, + "loss": 0.7603, + "step": 5706 + }, + { + "epoch": 1.018018018018018, + "grad_norm": 0.48335912823677063, + "learning_rate": 0.00024285462060311995, + "loss": 0.8571, + "step": 5707 + }, + { + "epoch": 1.0181964142360183, + "grad_norm": 0.48116248846054077, + "learning_rate": 0.00024278458722711948, + "loss": 0.717, + "step": 5708 + }, + { + "epoch": 1.0183748104540185, + "grad_norm": 0.5435453653335571, + "learning_rate": 0.0002427145544178145, + "loss": 0.7622, + "step": 5709 + }, + { + "epoch": 1.0185532066720187, + "grad_norm": 0.5478907227516174, + "learning_rate": 0.0002426445221807052, + "loss": 0.6225, + "step": 5710 + }, + { + "epoch": 1.0187316028900186, + "grad_norm": 0.44758474826812744, + "learning_rate": 0.000242574490521292, + "loss": 0.7621, + "step": 5711 + }, + { + "epoch": 1.0189099991080188, + "grad_norm": 0.4896325170993805, + "learning_rate": 0.0002425044594450751, + "loss": 0.803, + "step": 5712 + }, + { + "epoch": 1.019088395326019, + "grad_norm": 0.6267058253288269, + "learning_rate": 0.00024243442895755476, + "loss": 0.782, + "step": 5713 + }, + { + "epoch": 1.0192667915440192, + "grad_norm": 0.49481847882270813, + "learning_rate": 0.00024236439906423105, + "loss": 0.7271, + "step": 5714 + }, + { + "epoch": 1.0194451877620194, + "grad_norm": 0.42377015948295593, + "learning_rate": 0.00024229436977060427, + "loss": 0.529, + "step": 5715 + }, + { + "epoch": 1.0196235839800196, + "grad_norm": 0.4759003520011902, + "learning_rate": 0.00024222434108217428, + "loss": 0.7413, + "step": 5716 + }, + { + "epoch": 1.0198019801980198, + "grad_norm": 0.41128072142601013, + "learning_rate": 0.0002421543130044412, + "loss": 0.5624, + "step": 5717 + }, + { + "epoch": 1.01998037641602, + "grad_norm": 0.5285554528236389, + "learning_rate": 0.00024208428554290502, + "loss": 0.71, + "step": 5718 + }, + { + "epoch": 1.0201587726340202, + "grad_norm": 0.4953111410140991, + "learning_rate": 0.00024201425870306565, + "loss": 0.6101, + "step": 5719 + }, + { + "epoch": 1.0203371688520204, + "grad_norm": 0.5717734098434448, + "learning_rate": 0.0002419442324904229, + "loss": 0.8363, + "step": 5720 + }, + { + "epoch": 1.0205155650700206, + "grad_norm": 0.49335336685180664, + "learning_rate": 0.0002418742069104767, + "loss": 0.5884, + "step": 5721 + }, + { + "epoch": 1.0206939612880206, + "grad_norm": 0.7236892580986023, + "learning_rate": 0.0002418041819687268, + "loss": 0.7232, + "step": 5722 + }, + { + "epoch": 1.0208723575060208, + "grad_norm": 0.5189496278762817, + "learning_rate": 0.00024173415767067295, + "loss": 0.7079, + "step": 5723 + }, + { + "epoch": 1.021050753724021, + "grad_norm": 0.5228090882301331, + "learning_rate": 0.00024166413402181477, + "loss": 0.7657, + "step": 5724 + }, + { + "epoch": 1.0212291499420212, + "grad_norm": 0.4883708953857422, + "learning_rate": 0.00024159411102765195, + "loss": 0.6471, + "step": 5725 + }, + { + "epoch": 1.0214075461600214, + "grad_norm": 0.5337876677513123, + "learning_rate": 0.00024152408869368398, + "loss": 0.8151, + "step": 5726 + }, + { + "epoch": 1.0215859423780216, + "grad_norm": 0.6350103616714478, + "learning_rate": 0.00024145406702541047, + "loss": 0.694, + "step": 5727 + }, + { + "epoch": 1.0217643385960218, + "grad_norm": 0.49717575311660767, + "learning_rate": 0.00024138404602833092, + "loss": 0.6491, + "step": 5728 + }, + { + "epoch": 1.021942734814022, + "grad_norm": 0.4632187783718109, + "learning_rate": 0.0002413140257079446, + "loss": 0.6376, + "step": 5729 + }, + { + "epoch": 1.0221211310320222, + "grad_norm": 0.7505040764808655, + "learning_rate": 0.000241244006069751, + "loss": 0.7011, + "step": 5730 + }, + { + "epoch": 1.0222995272500224, + "grad_norm": 0.4758153557777405, + "learning_rate": 0.00024117398711924937, + "loss": 0.6989, + "step": 5731 + }, + { + "epoch": 1.0224779234680226, + "grad_norm": 0.43656665086746216, + "learning_rate": 0.000241103968861939, + "loss": 0.5964, + "step": 5732 + }, + { + "epoch": 1.0226563196860226, + "grad_norm": 0.5343319177627563, + "learning_rate": 0.00024103395130331909, + "loss": 0.855, + "step": 5733 + }, + { + "epoch": 1.0228347159040228, + "grad_norm": 0.6632677912712097, + "learning_rate": 0.00024096393444888878, + "loss": 0.5942, + "step": 5734 + }, + { + "epoch": 1.023013112122023, + "grad_norm": 0.40866273641586304, + "learning_rate": 0.0002408939183041472, + "loss": 0.5326, + "step": 5735 + }, + { + "epoch": 1.0231915083400231, + "grad_norm": 0.43526768684387207, + "learning_rate": 0.00024082390287459327, + "loss": 0.6601, + "step": 5736 + }, + { + "epoch": 1.0233699045580233, + "grad_norm": 0.5318346619606018, + "learning_rate": 0.00024075388816572612, + "loss": 0.709, + "step": 5737 + }, + { + "epoch": 1.0235483007760235, + "grad_norm": 0.5785126090049744, + "learning_rate": 0.0002406838741830446, + "loss": 0.6006, + "step": 5738 + }, + { + "epoch": 1.0237266969940237, + "grad_norm": 0.4882996380329132, + "learning_rate": 0.0002406138609320476, + "loss": 0.7155, + "step": 5739 + }, + { + "epoch": 1.023905093212024, + "grad_norm": 0.5886842012405396, + "learning_rate": 0.00024054384841823396, + "loss": 0.7278, + "step": 5740 + }, + { + "epoch": 1.0240834894300241, + "grad_norm": 0.5136778354644775, + "learning_rate": 0.00024047383664710243, + "loss": 0.7661, + "step": 5741 + }, + { + "epoch": 1.0242618856480243, + "grad_norm": 0.563352644443512, + "learning_rate": 0.0002404038256241516, + "loss": 0.9258, + "step": 5742 + }, + { + "epoch": 1.0244402818660245, + "grad_norm": 0.4579552710056305, + "learning_rate": 0.0002403338153548802, + "loss": 0.5869, + "step": 5743 + }, + { + "epoch": 1.0246186780840245, + "grad_norm": 0.5219502449035645, + "learning_rate": 0.00024026380584478676, + "loss": 0.7318, + "step": 5744 + }, + { + "epoch": 1.0247970743020247, + "grad_norm": 0.4938696622848511, + "learning_rate": 0.00024019379709936984, + "loss": 0.8765, + "step": 5745 + }, + { + "epoch": 1.024975470520025, + "grad_norm": 0.6994608640670776, + "learning_rate": 0.00024012378912412785, + "loss": 0.7989, + "step": 5746 + }, + { + "epoch": 1.025153866738025, + "grad_norm": 0.4768621027469635, + "learning_rate": 0.00024005378192455923, + "loss": 0.6248, + "step": 5747 + }, + { + "epoch": 1.0253322629560253, + "grad_norm": 1.2631269693374634, + "learning_rate": 0.00023998377550616228, + "loss": 0.6479, + "step": 5748 + }, + { + "epoch": 1.0255106591740255, + "grad_norm": 0.48576176166534424, + "learning_rate": 0.00023991376987443535, + "loss": 0.7885, + "step": 5749 + }, + { + "epoch": 1.0256890553920257, + "grad_norm": 0.5313836336135864, + "learning_rate": 0.00023984376503487657, + "loss": 0.8569, + "step": 5750 + }, + { + "epoch": 1.025867451610026, + "grad_norm": 1.306301236152649, + "learning_rate": 0.00023977376099298397, + "loss": 0.7603, + "step": 5751 + }, + { + "epoch": 1.026045847828026, + "grad_norm": 0.6186336278915405, + "learning_rate": 0.0002397037577542559, + "loss": 0.8546, + "step": 5752 + }, + { + "epoch": 1.0262242440460263, + "grad_norm": 0.4687788784503937, + "learning_rate": 0.00023963375532419032, + "loss": 0.7342, + "step": 5753 + }, + { + "epoch": 1.0264026402640265, + "grad_norm": 0.5840802192687988, + "learning_rate": 0.00023956375370828508, + "loss": 0.6639, + "step": 5754 + }, + { + "epoch": 1.0265810364820265, + "grad_norm": 0.5559926629066467, + "learning_rate": 0.00023949375291203815, + "loss": 0.8283, + "step": 5755 + }, + { + "epoch": 1.0267594327000267, + "grad_norm": 0.47672855854034424, + "learning_rate": 0.0002394237529409473, + "loss": 0.7104, + "step": 5756 + }, + { + "epoch": 1.0269378289180269, + "grad_norm": 0.49324122071266174, + "learning_rate": 0.00023935375380051038, + "loss": 0.7398, + "step": 5757 + }, + { + "epoch": 1.027116225136027, + "grad_norm": 0.547975480556488, + "learning_rate": 0.0002392837554962251, + "loss": 0.8606, + "step": 5758 + }, + { + "epoch": 1.0272946213540273, + "grad_norm": 0.48279041051864624, + "learning_rate": 0.00023921375803358897, + "loss": 0.7028, + "step": 5759 + }, + { + "epoch": 1.0274730175720275, + "grad_norm": 0.5213961601257324, + "learning_rate": 0.00023914376141809967, + "loss": 0.6459, + "step": 5760 + }, + { + "epoch": 1.0276514137900277, + "grad_norm": 0.46963804960250854, + "learning_rate": 0.00023907376565525469, + "loss": 0.6467, + "step": 5761 + }, + { + "epoch": 1.0278298100080279, + "grad_norm": 0.4742526113986969, + "learning_rate": 0.0002390037707505515, + "loss": 0.6972, + "step": 5762 + }, + { + "epoch": 1.028008206226028, + "grad_norm": 0.6087868213653564, + "learning_rate": 0.00023893377670948735, + "loss": 0.6751, + "step": 5763 + }, + { + "epoch": 1.0281866024440283, + "grad_norm": 0.5412265062332153, + "learning_rate": 0.00023886378353755964, + "loss": 0.7496, + "step": 5764 + }, + { + "epoch": 1.0283649986620285, + "grad_norm": 0.6296983957290649, + "learning_rate": 0.00023879379124026556, + "loss": 0.7903, + "step": 5765 + }, + { + "epoch": 1.0285433948800284, + "grad_norm": 0.5297571420669556, + "learning_rate": 0.00023872379982310224, + "loss": 0.7143, + "step": 5766 + }, + { + "epoch": 1.0287217910980286, + "grad_norm": 0.4944717586040497, + "learning_rate": 0.00023865380929156691, + "loss": 0.7, + "step": 5767 + }, + { + "epoch": 1.0289001873160288, + "grad_norm": 0.44193384051322937, + "learning_rate": 0.0002385838196511565, + "loss": 0.6133, + "step": 5768 + }, + { + "epoch": 1.029078583534029, + "grad_norm": 2.3271965980529785, + "learning_rate": 0.000238513830907368, + "loss": 0.712, + "step": 5769 + }, + { + "epoch": 1.0292569797520292, + "grad_norm": 0.45124170184135437, + "learning_rate": 0.00023844384306569825, + "loss": 0.6838, + "step": 5770 + }, + { + "epoch": 1.0294353759700294, + "grad_norm": 0.488295316696167, + "learning_rate": 0.0002383738561316441, + "loss": 0.6474, + "step": 5771 + }, + { + "epoch": 1.0296137721880296, + "grad_norm": 0.4569397270679474, + "learning_rate": 0.00023830387011070225, + "loss": 0.7513, + "step": 5772 + }, + { + "epoch": 1.0297921684060298, + "grad_norm": 0.5301244258880615, + "learning_rate": 0.00023823388500836945, + "loss": 0.7864, + "step": 5773 + }, + { + "epoch": 1.02997056462403, + "grad_norm": 0.4296034872531891, + "learning_rate": 0.00023816390083014234, + "loss": 0.5479, + "step": 5774 + }, + { + "epoch": 1.0301489608420302, + "grad_norm": 0.49546340107917786, + "learning_rate": 0.00023809391758151726, + "loss": 0.6903, + "step": 5775 + }, + { + "epoch": 1.0303273570600304, + "grad_norm": 0.47323548793792725, + "learning_rate": 0.0002380239352679908, + "loss": 0.6874, + "step": 5776 + }, + { + "epoch": 1.0305057532780304, + "grad_norm": 0.45178884267807007, + "learning_rate": 0.00023795395389505927, + "loss": 0.8751, + "step": 5777 + }, + { + "epoch": 1.0306841494960306, + "grad_norm": 0.5081565380096436, + "learning_rate": 0.00023788397346821905, + "loss": 0.7723, + "step": 5778 + }, + { + "epoch": 1.0308625457140308, + "grad_norm": 0.4901391565799713, + "learning_rate": 0.00023781399399296635, + "loss": 0.628, + "step": 5779 + }, + { + "epoch": 1.031040941932031, + "grad_norm": 0.4235868752002716, + "learning_rate": 0.0002377440154747973, + "loss": 0.5508, + "step": 5780 + }, + { + "epoch": 1.0312193381500312, + "grad_norm": 0.45451483130455017, + "learning_rate": 0.000237674037919208, + "loss": 0.6556, + "step": 5781 + }, + { + "epoch": 1.0313977343680314, + "grad_norm": 0.5195580720901489, + "learning_rate": 0.0002376040613316944, + "loss": 0.9728, + "step": 5782 + }, + { + "epoch": 1.0315761305860316, + "grad_norm": 0.49326059222221375, + "learning_rate": 0.00023753408571775255, + "loss": 0.6793, + "step": 5783 + }, + { + "epoch": 1.0317545268040318, + "grad_norm": 0.4820156395435333, + "learning_rate": 0.00023746411108287825, + "loss": 0.562, + "step": 5784 + }, + { + "epoch": 1.031932923022032, + "grad_norm": 0.48524320125579834, + "learning_rate": 0.00023739413743256726, + "loss": 0.6854, + "step": 5785 + }, + { + "epoch": 1.0321113192400322, + "grad_norm": 0.49960482120513916, + "learning_rate": 0.0002373241647723153, + "loss": 0.8011, + "step": 5786 + }, + { + "epoch": 1.0322897154580324, + "grad_norm": 0.5163941979408264, + "learning_rate": 0.00023725419310761803, + "loss": 0.7166, + "step": 5787 + }, + { + "epoch": 1.0324681116760324, + "grad_norm": 0.507465660572052, + "learning_rate": 0.0002371842224439709, + "loss": 0.8175, + "step": 5788 + }, + { + "epoch": 1.0326465078940326, + "grad_norm": 0.47466954588890076, + "learning_rate": 0.00023711425278686945, + "loss": 0.6952, + "step": 5789 + }, + { + "epoch": 1.0328249041120328, + "grad_norm": 0.4188692569732666, + "learning_rate": 0.000237044284141809, + "loss": 0.5577, + "step": 5790 + }, + { + "epoch": 1.033003300330033, + "grad_norm": 0.5226710438728333, + "learning_rate": 0.0002369743165142849, + "loss": 0.8422, + "step": 5791 + }, + { + "epoch": 1.0331816965480332, + "grad_norm": 0.49022915959358215, + "learning_rate": 0.00023690434990979238, + "loss": 0.8066, + "step": 5792 + }, + { + "epoch": 1.0333600927660334, + "grad_norm": 0.49656999111175537, + "learning_rate": 0.0002368343843338266, + "loss": 0.7251, + "step": 5793 + }, + { + "epoch": 1.0335384889840336, + "grad_norm": 0.40922585129737854, + "learning_rate": 0.00023676441979188258, + "loss": 0.5209, + "step": 5794 + }, + { + "epoch": 1.0337168852020338, + "grad_norm": 0.46607285737991333, + "learning_rate": 0.0002366944562894554, + "loss": 0.6608, + "step": 5795 + }, + { + "epoch": 1.033895281420034, + "grad_norm": 0.43443769216537476, + "learning_rate": 0.00023662449383203988, + "loss": 0.6759, + "step": 5796 + }, + { + "epoch": 1.0340736776380341, + "grad_norm": 0.5248075127601624, + "learning_rate": 0.0002365545324251307, + "loss": 0.8347, + "step": 5797 + }, + { + "epoch": 1.0342520738560343, + "grad_norm": 0.5106051564216614, + "learning_rate": 0.0002364845720742229, + "loss": 0.8222, + "step": 5798 + }, + { + "epoch": 1.0344304700740343, + "grad_norm": 0.4647291600704193, + "learning_rate": 0.00023641461278481096, + "loss": 0.5955, + "step": 5799 + }, + { + "epoch": 1.0346088662920345, + "grad_norm": 0.3780640661716461, + "learning_rate": 0.00023634465456238957, + "loss": 0.4401, + "step": 5800 + }, + { + "epoch": 1.0347872625100347, + "grad_norm": 0.4369570016860962, + "learning_rate": 0.00023627469741245306, + "loss": 0.5725, + "step": 5801 + }, + { + "epoch": 1.034965658728035, + "grad_norm": 0.4812033772468567, + "learning_rate": 0.0002362047413404959, + "loss": 0.7532, + "step": 5802 + }, + { + "epoch": 1.0351440549460351, + "grad_norm": 0.49813112616539, + "learning_rate": 0.00023613478635201246, + "loss": 0.7331, + "step": 5803 + }, + { + "epoch": 1.0353224511640353, + "grad_norm": 0.45200127363204956, + "learning_rate": 0.0002360648324524969, + "loss": 0.6147, + "step": 5804 + }, + { + "epoch": 1.0355008473820355, + "grad_norm": 0.5208128690719604, + "learning_rate": 0.0002359948796474434, + "loss": 0.6966, + "step": 5805 + }, + { + "epoch": 1.0356792436000357, + "grad_norm": 0.49845966696739197, + "learning_rate": 0.00023592492794234605, + "loss": 0.7405, + "step": 5806 + }, + { + "epoch": 1.035857639818036, + "grad_norm": 0.4411349594593048, + "learning_rate": 0.0002358549773426988, + "loss": 0.6018, + "step": 5807 + }, + { + "epoch": 1.0360360360360361, + "grad_norm": 0.5190544128417969, + "learning_rate": 0.00023578502785399558, + "loss": 0.9016, + "step": 5808 + }, + { + "epoch": 1.0362144322540363, + "grad_norm": 0.45174866914749146, + "learning_rate": 0.0002357150794817301, + "loss": 0.5387, + "step": 5809 + }, + { + "epoch": 1.0363928284720363, + "grad_norm": 0.56296706199646, + "learning_rate": 0.00023564513223139615, + "loss": 0.6236, + "step": 5810 + }, + { + "epoch": 1.0365712246900365, + "grad_norm": 0.4660092294216156, + "learning_rate": 0.0002355751861084873, + "loss": 0.6932, + "step": 5811 + }, + { + "epoch": 1.0367496209080367, + "grad_norm": 0.4947931170463562, + "learning_rate": 0.00023550524111849705, + "loss": 0.6975, + "step": 5812 + }, + { + "epoch": 1.0369280171260369, + "grad_norm": 0.49009427428245544, + "learning_rate": 0.0002354352972669191, + "loss": 0.6677, + "step": 5813 + }, + { + "epoch": 1.037106413344037, + "grad_norm": 0.5009847283363342, + "learning_rate": 0.00023536535455924654, + "loss": 0.7397, + "step": 5814 + }, + { + "epoch": 1.0372848095620373, + "grad_norm": 0.5054614543914795, + "learning_rate": 0.00023529541300097275, + "loss": 0.7261, + "step": 5815 + }, + { + "epoch": 1.0374632057800375, + "grad_norm": 0.46940669417381287, + "learning_rate": 0.0002352254725975909, + "loss": 0.7628, + "step": 5816 + }, + { + "epoch": 1.0376416019980377, + "grad_norm": 0.4963463842868805, + "learning_rate": 0.00023515553335459407, + "loss": 0.9402, + "step": 5817 + }, + { + "epoch": 1.0378199982160379, + "grad_norm": 0.46906179189682007, + "learning_rate": 0.00023508559527747527, + "loss": 0.6101, + "step": 5818 + }, + { + "epoch": 1.037998394434038, + "grad_norm": 0.5523307919502258, + "learning_rate": 0.0002350156583717274, + "loss": 0.6492, + "step": 5819 + }, + { + "epoch": 1.0381767906520383, + "grad_norm": 0.5532516837120056, + "learning_rate": 0.00023494572264284326, + "loss": 0.82, + "step": 5820 + }, + { + "epoch": 1.0383551868700382, + "grad_norm": 0.5024081468582153, + "learning_rate": 0.00023487578809631567, + "loss": 0.725, + "step": 5821 + }, + { + "epoch": 1.0385335830880384, + "grad_norm": 0.4690212905406952, + "learning_rate": 0.00023480585473763707, + "loss": 0.7607, + "step": 5822 + }, + { + "epoch": 1.0387119793060386, + "grad_norm": 0.39463046193122864, + "learning_rate": 0.00023473592257230015, + "loss": 0.5608, + "step": 5823 + }, + { + "epoch": 1.0388903755240388, + "grad_norm": 0.44406580924987793, + "learning_rate": 0.00023466599160579726, + "loss": 0.6521, + "step": 5824 + }, + { + "epoch": 1.039068771742039, + "grad_norm": 0.46876436471939087, + "learning_rate": 0.0002345960618436208, + "loss": 0.643, + "step": 5825 + }, + { + "epoch": 1.0392471679600392, + "grad_norm": 0.5072786211967468, + "learning_rate": 0.00023452613329126305, + "loss": 0.91, + "step": 5826 + }, + { + "epoch": 1.0394255641780394, + "grad_norm": 0.45098283886909485, + "learning_rate": 0.0002344562059542161, + "loss": 0.6326, + "step": 5827 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.507174551486969, + "learning_rate": 0.000234386279837972, + "loss": 0.8313, + "step": 5828 + }, + { + "epoch": 1.0397823566140398, + "grad_norm": 0.44841310381889343, + "learning_rate": 0.0002343163549480228, + "loss": 0.6486, + "step": 5829 + }, + { + "epoch": 1.03996075283204, + "grad_norm": 0.45851168036460876, + "learning_rate": 0.00023424643128986037, + "loss": 0.6491, + "step": 5830 + }, + { + "epoch": 1.0401391490500402, + "grad_norm": 0.4500889778137207, + "learning_rate": 0.0002341765088689764, + "loss": 0.6452, + "step": 5831 + }, + { + "epoch": 1.0403175452680402, + "grad_norm": 0.4633578658103943, + "learning_rate": 0.00023410658769086265, + "loss": 0.6371, + "step": 5832 + }, + { + "epoch": 1.0404959414860404, + "grad_norm": 0.46451932191848755, + "learning_rate": 0.0002340366677610107, + "loss": 0.6226, + "step": 5833 + }, + { + "epoch": 1.0406743377040406, + "grad_norm": 0.5377410054206848, + "learning_rate": 0.00023396674908491194, + "loss": 0.8014, + "step": 5834 + }, + { + "epoch": 1.0408527339220408, + "grad_norm": 0.46611928939819336, + "learning_rate": 0.00023389683166805784, + "loss": 0.7689, + "step": 5835 + }, + { + "epoch": 1.041031130140041, + "grad_norm": 0.43398740887641907, + "learning_rate": 0.00023382691551593964, + "loss": 0.5971, + "step": 5836 + }, + { + "epoch": 1.0412095263580412, + "grad_norm": 0.46025121212005615, + "learning_rate": 0.0002337570006340485, + "loss": 0.548, + "step": 5837 + }, + { + "epoch": 1.0413879225760414, + "grad_norm": 0.5142043828964233, + "learning_rate": 0.00023368708702787555, + "loss": 0.7064, + "step": 5838 + }, + { + "epoch": 1.0415663187940416, + "grad_norm": 0.4556623697280884, + "learning_rate": 0.00023361717470291176, + "loss": 0.7167, + "step": 5839 + }, + { + "epoch": 1.0417447150120418, + "grad_norm": 0.46669575572013855, + "learning_rate": 0.00023354726366464808, + "loss": 0.6693, + "step": 5840 + }, + { + "epoch": 1.041923111230042, + "grad_norm": 0.5870226621627808, + "learning_rate": 0.00023347735391857517, + "loss": 0.81, + "step": 5841 + }, + { + "epoch": 1.0421015074480422, + "grad_norm": 0.5324650406837463, + "learning_rate": 0.00023340744547018384, + "loss": 0.6455, + "step": 5842 + }, + { + "epoch": 1.0422799036660422, + "grad_norm": 0.4658534824848175, + "learning_rate": 0.00023333753832496443, + "loss": 0.716, + "step": 5843 + }, + { + "epoch": 1.0424582998840424, + "grad_norm": 0.6966456174850464, + "learning_rate": 0.0002332676324884077, + "loss": 0.4844, + "step": 5844 + }, + { + "epoch": 1.0426366961020426, + "grad_norm": 0.440407395362854, + "learning_rate": 0.00023319772796600395, + "loss": 0.5498, + "step": 5845 + }, + { + "epoch": 1.0428150923200428, + "grad_norm": 0.5414045453071594, + "learning_rate": 0.00023312782476324345, + "loss": 0.8414, + "step": 5846 + }, + { + "epoch": 1.042993488538043, + "grad_norm": 0.4795417785644531, + "learning_rate": 0.0002330579228856163, + "loss": 0.7409, + "step": 5847 + }, + { + "epoch": 1.0431718847560432, + "grad_norm": 0.47270727157592773, + "learning_rate": 0.00023298802233861254, + "loss": 0.5839, + "step": 5848 + }, + { + "epoch": 1.0433502809740434, + "grad_norm": 0.45291322469711304, + "learning_rate": 0.0002329181231277222, + "loss": 0.6999, + "step": 5849 + }, + { + "epoch": 1.0435286771920436, + "grad_norm": 0.49870064854621887, + "learning_rate": 0.00023284822525843513, + "loss": 0.6977, + "step": 5850 + }, + { + "epoch": 1.0437070734100438, + "grad_norm": 0.500630259513855, + "learning_rate": 0.00023277832873624108, + "loss": 0.8385, + "step": 5851 + }, + { + "epoch": 1.043885469628044, + "grad_norm": 0.4953528642654419, + "learning_rate": 0.00023270843356662968, + "loss": 0.7302, + "step": 5852 + }, + { + "epoch": 1.0440638658460442, + "grad_norm": 0.4294586181640625, + "learning_rate": 0.00023263853975509044, + "loss": 0.529, + "step": 5853 + }, + { + "epoch": 1.0442422620640441, + "grad_norm": 0.5091565847396851, + "learning_rate": 0.00023256864730711289, + "loss": 0.7218, + "step": 5854 + }, + { + "epoch": 1.0444206582820443, + "grad_norm": 0.4561258554458618, + "learning_rate": 0.00023249875622818623, + "loss": 0.6372, + "step": 5855 + }, + { + "epoch": 1.0445990545000445, + "grad_norm": 0.46639683842658997, + "learning_rate": 0.00023242886652379973, + "loss": 0.6476, + "step": 5856 + }, + { + "epoch": 1.0447774507180447, + "grad_norm": 0.4989601671695709, + "learning_rate": 0.00023235897819944245, + "loss": 0.7827, + "step": 5857 + }, + { + "epoch": 1.044955846936045, + "grad_norm": 0.4990067780017853, + "learning_rate": 0.00023228909126060335, + "loss": 0.7969, + "step": 5858 + }, + { + "epoch": 1.0451342431540451, + "grad_norm": 0.4401039183139801, + "learning_rate": 0.00023221920571277159, + "loss": 0.6276, + "step": 5859 + }, + { + "epoch": 1.0453126393720453, + "grad_norm": 0.496707558631897, + "learning_rate": 0.00023214932156143564, + "loss": 0.7849, + "step": 5860 + }, + { + "epoch": 1.0454910355900455, + "grad_norm": 0.5055187940597534, + "learning_rate": 0.00023207943881208435, + "loss": 0.75, + "step": 5861 + }, + { + "epoch": 1.0456694318080457, + "grad_norm": 0.4924319088459015, + "learning_rate": 0.0002320095574702062, + "loss": 0.741, + "step": 5862 + }, + { + "epoch": 1.045847828026046, + "grad_norm": 0.4404760003089905, + "learning_rate": 0.0002319396775412897, + "loss": 0.5559, + "step": 5863 + }, + { + "epoch": 1.0460262242440461, + "grad_norm": 0.4808800518512726, + "learning_rate": 0.0002318697990308231, + "loss": 0.776, + "step": 5864 + }, + { + "epoch": 1.046204620462046, + "grad_norm": 0.4353162348270416, + "learning_rate": 0.00023179992194429473, + "loss": 0.534, + "step": 5865 + }, + { + "epoch": 1.0463830166800463, + "grad_norm": 0.489243745803833, + "learning_rate": 0.00023173004628719262, + "loss": 0.72, + "step": 5866 + }, + { + "epoch": 1.0465614128980465, + "grad_norm": 0.5229580998420715, + "learning_rate": 0.0002316601720650049, + "loss": 0.8733, + "step": 5867 + }, + { + "epoch": 1.0467398091160467, + "grad_norm": 0.4031560719013214, + "learning_rate": 0.0002315902992832193, + "loss": 0.4158, + "step": 5868 + }, + { + "epoch": 1.046918205334047, + "grad_norm": 0.5013259053230286, + "learning_rate": 0.00023152042794732366, + "loss": 0.6999, + "step": 5869 + }, + { + "epoch": 1.047096601552047, + "grad_norm": 0.4599556624889374, + "learning_rate": 0.00023145055806280567, + "loss": 0.6991, + "step": 5870 + }, + { + "epoch": 1.0472749977700473, + "grad_norm": 0.4679622948169708, + "learning_rate": 0.00023138068963515288, + "loss": 0.9515, + "step": 5871 + }, + { + "epoch": 1.0474533939880475, + "grad_norm": 0.4390574097633362, + "learning_rate": 0.0002313108226698527, + "loss": 0.5369, + "step": 5872 + }, + { + "epoch": 1.0476317902060477, + "grad_norm": 0.42867380380630493, + "learning_rate": 0.00023124095717239241, + "loss": 0.5489, + "step": 5873 + }, + { + "epoch": 1.0478101864240479, + "grad_norm": 0.43418964743614197, + "learning_rate": 0.00023117109314825933, + "loss": 0.7322, + "step": 5874 + }, + { + "epoch": 1.047988582642048, + "grad_norm": 0.4745190441608429, + "learning_rate": 0.00023110123060294047, + "loss": 0.682, + "step": 5875 + }, + { + "epoch": 1.048166978860048, + "grad_norm": 0.4581608176231384, + "learning_rate": 0.00023103136954192286, + "loss": 0.6654, + "step": 5876 + }, + { + "epoch": 1.0483453750780483, + "grad_norm": 0.46257057785987854, + "learning_rate": 0.0002309615099706933, + "loss": 0.6197, + "step": 5877 + }, + { + "epoch": 1.0485237712960485, + "grad_norm": 0.528059184551239, + "learning_rate": 0.00023089165189473857, + "loss": 0.7513, + "step": 5878 + }, + { + "epoch": 1.0487021675140487, + "grad_norm": 0.44776850938796997, + "learning_rate": 0.00023082179531954525, + "loss": 0.6703, + "step": 5879 + }, + { + "epoch": 1.0488805637320489, + "grad_norm": 0.46082332730293274, + "learning_rate": 0.00023075194025059994, + "loss": 0.5771, + "step": 5880 + }, + { + "epoch": 1.049058959950049, + "grad_norm": 0.49523085355758667, + "learning_rate": 0.00023068208669338894, + "loss": 0.5816, + "step": 5881 + }, + { + "epoch": 1.0492373561680493, + "grad_norm": 0.6181836128234863, + "learning_rate": 0.0002306122346533985, + "loss": 0.5963, + "step": 5882 + }, + { + "epoch": 1.0494157523860494, + "grad_norm": 0.8168792128562927, + "learning_rate": 0.00023054238413611482, + "loss": 0.609, + "step": 5883 + }, + { + "epoch": 1.0495941486040496, + "grad_norm": 0.5016876459121704, + "learning_rate": 0.0002304725351470239, + "loss": 0.8368, + "step": 5884 + }, + { + "epoch": 1.0497725448220498, + "grad_norm": 0.4856698215007782, + "learning_rate": 0.00023040268769161168, + "loss": 0.5799, + "step": 5885 + }, + { + "epoch": 1.04995094104005, + "grad_norm": 0.8010654449462891, + "learning_rate": 0.00023033284177536396, + "loss": 0.7327, + "step": 5886 + }, + { + "epoch": 1.05012933725805, + "grad_norm": 0.48454806208610535, + "learning_rate": 0.00023026299740376633, + "loss": 0.7485, + "step": 5887 + }, + { + "epoch": 1.0503077334760502, + "grad_norm": 0.47134482860565186, + "learning_rate": 0.00023019315458230449, + "loss": 0.7593, + "step": 5888 + }, + { + "epoch": 1.0504861296940504, + "grad_norm": 0.5328679084777832, + "learning_rate": 0.00023012331331646359, + "loss": 0.6926, + "step": 5889 + }, + { + "epoch": 1.0506645259120506, + "grad_norm": 0.46135085821151733, + "learning_rate": 0.0002300534736117292, + "loss": 0.6585, + "step": 5890 + }, + { + "epoch": 1.0508429221300508, + "grad_norm": 0.7322617173194885, + "learning_rate": 0.00022998363547358641, + "loss": 0.7974, + "step": 5891 + }, + { + "epoch": 1.051021318348051, + "grad_norm": 0.46016210317611694, + "learning_rate": 0.0002299137989075203, + "loss": 0.7362, + "step": 5892 + }, + { + "epoch": 1.0511997145660512, + "grad_norm": 0.4320010244846344, + "learning_rate": 0.00022984396391901582, + "loss": 0.5613, + "step": 5893 + }, + { + "epoch": 1.0513781107840514, + "grad_norm": 0.45406126976013184, + "learning_rate": 0.0002297741305135577, + "loss": 0.5832, + "step": 5894 + }, + { + "epoch": 1.0515565070020516, + "grad_norm": 0.5657575130462646, + "learning_rate": 0.00022970429869663064, + "loss": 0.9168, + "step": 5895 + }, + { + "epoch": 1.0517349032200518, + "grad_norm": 0.47483891248703003, + "learning_rate": 0.00022963446847371925, + "loss": 0.5917, + "step": 5896 + }, + { + "epoch": 1.051913299438052, + "grad_norm": 0.4458398222923279, + "learning_rate": 0.00022956463985030794, + "loss": 0.5259, + "step": 5897 + }, + { + "epoch": 1.0520916956560522, + "grad_norm": 0.4733836054801941, + "learning_rate": 0.000229494812831881, + "loss": 0.6835, + "step": 5898 + }, + { + "epoch": 1.0522700918740522, + "grad_norm": 0.49233517050743103, + "learning_rate": 0.00022942498742392265, + "loss": 0.8004, + "step": 5899 + }, + { + "epoch": 1.0524484880920524, + "grad_norm": 0.7832338809967041, + "learning_rate": 0.00022935516363191695, + "loss": 0.6962, + "step": 5900 + }, + { + "epoch": 1.0526268843100526, + "grad_norm": 0.7512785792350769, + "learning_rate": 0.00022928534146134783, + "loss": 0.7948, + "step": 5901 + }, + { + "epoch": 1.0528052805280528, + "grad_norm": 0.6257414221763611, + "learning_rate": 0.00022921552091769907, + "loss": 0.5168, + "step": 5902 + }, + { + "epoch": 1.052983676746053, + "grad_norm": 0.5249746441841125, + "learning_rate": 0.0002291457020064543, + "loss": 0.9227, + "step": 5903 + }, + { + "epoch": 1.0531620729640532, + "grad_norm": 0.4546600878238678, + "learning_rate": 0.00022907588473309703, + "loss": 0.5357, + "step": 5904 + }, + { + "epoch": 1.0533404691820534, + "grad_norm": 0.5810390114784241, + "learning_rate": 0.00022900606910311098, + "loss": 0.6856, + "step": 5905 + }, + { + "epoch": 1.0535188654000536, + "grad_norm": 0.45941051840782166, + "learning_rate": 0.00022893625512197913, + "loss": 0.6229, + "step": 5906 + }, + { + "epoch": 1.0536972616180538, + "grad_norm": 0.5021553039550781, + "learning_rate": 0.00022886644279518473, + "loss": 0.9207, + "step": 5907 + }, + { + "epoch": 1.053875657836054, + "grad_norm": 0.48117703199386597, + "learning_rate": 0.00022879663212821083, + "loss": 0.648, + "step": 5908 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.5499297976493835, + "learning_rate": 0.00022872682312654032, + "loss": 0.7953, + "step": 5909 + }, + { + "epoch": 1.0542324502720541, + "grad_norm": 0.4166644513607025, + "learning_rate": 0.000228657015795656, + "loss": 0.506, + "step": 5910 + }, + { + "epoch": 1.0544108464900543, + "grad_norm": 0.5143164992332458, + "learning_rate": 0.00022858721014104043, + "loss": 0.7587, + "step": 5911 + }, + { + "epoch": 1.0545892427080545, + "grad_norm": 0.5195590257644653, + "learning_rate": 0.00022851740616817615, + "loss": 0.8337, + "step": 5912 + }, + { + "epoch": 1.0547676389260547, + "grad_norm": 0.5148821473121643, + "learning_rate": 0.00022844760388254556, + "loss": 0.9393, + "step": 5913 + }, + { + "epoch": 1.054946035144055, + "grad_norm": 0.5613206028938293, + "learning_rate": 0.00022837780328963095, + "loss": 1.0002, + "step": 5914 + }, + { + "epoch": 1.0551244313620551, + "grad_norm": 0.4363964796066284, + "learning_rate": 0.00022830800439491435, + "loss": 0.6343, + "step": 5915 + }, + { + "epoch": 1.0553028275800553, + "grad_norm": 0.46682143211364746, + "learning_rate": 0.00022823820720387766, + "loss": 0.6258, + "step": 5916 + }, + { + "epoch": 1.0554812237980555, + "grad_norm": 0.45022645592689514, + "learning_rate": 0.00022816841172200287, + "loss": 0.7482, + "step": 5917 + }, + { + "epoch": 1.0556596200160557, + "grad_norm": 0.5296302437782288, + "learning_rate": 0.00022809861795477162, + "loss": 0.7556, + "step": 5918 + }, + { + "epoch": 1.055838016234056, + "grad_norm": 0.48178842663764954, + "learning_rate": 0.00022802882590766544, + "loss": 0.8764, + "step": 5919 + }, + { + "epoch": 1.0560164124520561, + "grad_norm": 0.5427073240280151, + "learning_rate": 0.00022795903558616587, + "loss": 0.9599, + "step": 5920 + }, + { + "epoch": 1.056194808670056, + "grad_norm": 0.5280442833900452, + "learning_rate": 0.00022788924699575417, + "loss": 0.6732, + "step": 5921 + }, + { + "epoch": 1.0563732048880563, + "grad_norm": 0.5916957259178162, + "learning_rate": 0.00022781946014191145, + "loss": 0.8213, + "step": 5922 + }, + { + "epoch": 1.0565516011060565, + "grad_norm": 0.566719651222229, + "learning_rate": 0.00022774967503011884, + "loss": 0.7634, + "step": 5923 + }, + { + "epoch": 1.0567299973240567, + "grad_norm": 0.5337679982185364, + "learning_rate": 0.00022767989166585717, + "loss": 0.8209, + "step": 5924 + }, + { + "epoch": 1.056908393542057, + "grad_norm": 1.3235185146331787, + "learning_rate": 0.0002276101100546072, + "loss": 0.706, + "step": 5925 + }, + { + "epoch": 1.057086789760057, + "grad_norm": 0.4909132719039917, + "learning_rate": 0.0002275403302018496, + "loss": 0.6735, + "step": 5926 + }, + { + "epoch": 1.0572651859780573, + "grad_norm": 0.4729708135128021, + "learning_rate": 0.00022747055211306473, + "loss": 0.6506, + "step": 5927 + }, + { + "epoch": 1.0574435821960575, + "grad_norm": 0.48028764128685, + "learning_rate": 0.000227400775793733, + "loss": 0.577, + "step": 5928 + }, + { + "epoch": 1.0576219784140577, + "grad_norm": 0.4471667408943176, + "learning_rate": 0.00022733100124933464, + "loss": 0.5678, + "step": 5929 + }, + { + "epoch": 1.057800374632058, + "grad_norm": 0.4753149449825287, + "learning_rate": 0.00022726122848534965, + "loss": 0.7441, + "step": 5930 + }, + { + "epoch": 1.0579787708500579, + "grad_norm": 0.5008271932601929, + "learning_rate": 0.00022719145750725803, + "loss": 0.7606, + "step": 5931 + }, + { + "epoch": 1.058157167068058, + "grad_norm": 0.48235058784484863, + "learning_rate": 0.0002271216883205395, + "loss": 0.6566, + "step": 5932 + }, + { + "epoch": 1.0583355632860583, + "grad_norm": 0.49578502774238586, + "learning_rate": 0.00022705192093067377, + "loss": 0.6123, + "step": 5933 + }, + { + "epoch": 1.0585139595040585, + "grad_norm": 0.4736134111881256, + "learning_rate": 0.0002269821553431403, + "loss": 0.5684, + "step": 5934 + }, + { + "epoch": 1.0586923557220587, + "grad_norm": 0.4547494351863861, + "learning_rate": 0.00022691239156341828, + "loss": 0.6157, + "step": 5935 + }, + { + "epoch": 1.0588707519400589, + "grad_norm": 0.45939305424690247, + "learning_rate": 0.0002268426295969872, + "loss": 0.6121, + "step": 5936 + }, + { + "epoch": 1.059049148158059, + "grad_norm": 0.5370938181877136, + "learning_rate": 0.00022677286944932604, + "loss": 0.8359, + "step": 5937 + }, + { + "epoch": 1.0592275443760593, + "grad_norm": 0.5468327403068542, + "learning_rate": 0.0002267031111259137, + "loss": 0.7847, + "step": 5938 + }, + { + "epoch": 1.0594059405940595, + "grad_norm": 0.472802996635437, + "learning_rate": 0.00022663335463222906, + "loss": 0.6689, + "step": 5939 + }, + { + "epoch": 1.0595843368120597, + "grad_norm": 0.5464509129524231, + "learning_rate": 0.00022656359997375063, + "loss": 0.9584, + "step": 5940 + }, + { + "epoch": 1.0597627330300599, + "grad_norm": 0.5138636827468872, + "learning_rate": 0.000226493847155957, + "loss": 0.7645, + "step": 5941 + }, + { + "epoch": 1.05994112924806, + "grad_norm": 0.4933434724807739, + "learning_rate": 0.00022642409618432648, + "loss": 0.7348, + "step": 5942 + }, + { + "epoch": 1.06011952546606, + "grad_norm": 0.4958341121673584, + "learning_rate": 0.00022635434706433727, + "loss": 0.7549, + "step": 5943 + }, + { + "epoch": 1.0602979216840602, + "grad_norm": 0.4594402015209198, + "learning_rate": 0.00022628459980146752, + "loss": 0.5529, + "step": 5944 + }, + { + "epoch": 1.0604763179020604, + "grad_norm": 0.5959436893463135, + "learning_rate": 0.00022621485440119506, + "loss": 0.65, + "step": 5945 + }, + { + "epoch": 1.0606547141200606, + "grad_norm": 0.5254977345466614, + "learning_rate": 0.00022614511086899768, + "loss": 0.7693, + "step": 5946 + }, + { + "epoch": 1.0608331103380608, + "grad_norm": 0.4639701843261719, + "learning_rate": 0.00022607536921035313, + "loss": 0.7103, + "step": 5947 + }, + { + "epoch": 1.061011506556061, + "grad_norm": 0.5139868855476379, + "learning_rate": 0.00022600562943073872, + "loss": 0.7358, + "step": 5948 + }, + { + "epoch": 1.0611899027740612, + "grad_norm": 0.4823780059814453, + "learning_rate": 0.00022593589153563183, + "loss": 0.6385, + "step": 5949 + }, + { + "epoch": 1.0613682989920614, + "grad_norm": 0.475702702999115, + "learning_rate": 0.00022586615553050958, + "loss": 0.7695, + "step": 5950 + }, + { + "epoch": 1.0615466952100616, + "grad_norm": 0.4667466878890991, + "learning_rate": 0.00022579642142084918, + "loss": 0.7241, + "step": 5951 + }, + { + "epoch": 1.0617250914280618, + "grad_norm": 0.46129074692726135, + "learning_rate": 0.00022572668921212746, + "loss": 0.7244, + "step": 5952 + }, + { + "epoch": 1.0619034876460618, + "grad_norm": 0.45217975974082947, + "learning_rate": 0.0002256569589098211, + "loss": 0.5342, + "step": 5953 + }, + { + "epoch": 1.062081883864062, + "grad_norm": 0.4796280860900879, + "learning_rate": 0.0002255872305194067, + "loss": 0.7878, + "step": 5954 + }, + { + "epoch": 1.0622602800820622, + "grad_norm": 0.5046055912971497, + "learning_rate": 0.0002255175040463607, + "loss": 0.6931, + "step": 5955 + }, + { + "epoch": 1.0624386763000624, + "grad_norm": 0.5430681109428406, + "learning_rate": 0.00022544777949615942, + "loss": 0.6904, + "step": 5956 + }, + { + "epoch": 1.0626170725180626, + "grad_norm": 0.5339892506599426, + "learning_rate": 0.00022537805687427895, + "loss": 0.7239, + "step": 5957 + }, + { + "epoch": 1.0627954687360628, + "grad_norm": 0.510955274105072, + "learning_rate": 0.0002253083361861953, + "loss": 0.7252, + "step": 5958 + }, + { + "epoch": 1.062973864954063, + "grad_norm": 0.48307734727859497, + "learning_rate": 0.00022523861743738434, + "loss": 0.7571, + "step": 5959 + }, + { + "epoch": 1.0631522611720632, + "grad_norm": 0.4888105094432831, + "learning_rate": 0.00022516890063332173, + "loss": 0.6627, + "step": 5960 + }, + { + "epoch": 1.0633306573900634, + "grad_norm": 0.44385766983032227, + "learning_rate": 0.00022509918577948292, + "loss": 0.6508, + "step": 5961 + }, + { + "epoch": 1.0635090536080636, + "grad_norm": 0.4929356276988983, + "learning_rate": 0.00022502947288134334, + "loss": 0.6448, + "step": 5962 + }, + { + "epoch": 1.0636874498260638, + "grad_norm": 0.5029351711273193, + "learning_rate": 0.00022495976194437822, + "loss": 0.7504, + "step": 5963 + }, + { + "epoch": 1.063865846044064, + "grad_norm": 0.47953563928604126, + "learning_rate": 0.00022489005297406266, + "loss": 0.6591, + "step": 5964 + }, + { + "epoch": 1.064044242262064, + "grad_norm": 0.5105430483818054, + "learning_rate": 0.0002248203459758714, + "loss": 0.8449, + "step": 5965 + }, + { + "epoch": 1.0642226384800642, + "grad_norm": 0.5377670526504517, + "learning_rate": 0.00022475064095527948, + "loss": 0.8731, + "step": 5966 + }, + { + "epoch": 1.0644010346980644, + "grad_norm": 0.47288522124290466, + "learning_rate": 0.00022468093791776128, + "loss": 0.595, + "step": 5967 + }, + { + "epoch": 1.0645794309160646, + "grad_norm": 0.43630439043045044, + "learning_rate": 0.00022461123686879137, + "loss": 0.5941, + "step": 5968 + }, + { + "epoch": 1.0647578271340647, + "grad_norm": 0.4721398651599884, + "learning_rate": 0.00022454153781384395, + "loss": 0.6303, + "step": 5969 + }, + { + "epoch": 1.064936223352065, + "grad_norm": 0.5068725347518921, + "learning_rate": 0.00022447184075839323, + "loss": 0.7956, + "step": 5970 + }, + { + "epoch": 1.0651146195700651, + "grad_norm": 0.5235750675201416, + "learning_rate": 0.0002244021457079131, + "loss": 0.7384, + "step": 5971 + }, + { + "epoch": 1.0652930157880653, + "grad_norm": 0.44175130128860474, + "learning_rate": 0.00022433245266787749, + "loss": 0.6365, + "step": 5972 + }, + { + "epoch": 1.0654714120060655, + "grad_norm": 0.4824054539203644, + "learning_rate": 0.00022426276164376003, + "loss": 0.795, + "step": 5973 + }, + { + "epoch": 1.0656498082240657, + "grad_norm": 0.46271640062332153, + "learning_rate": 0.00022419307264103414, + "loss": 0.6397, + "step": 5974 + }, + { + "epoch": 1.0658282044420657, + "grad_norm": 0.4163155257701874, + "learning_rate": 0.0002241233856651732, + "loss": 0.4912, + "step": 5975 + }, + { + "epoch": 1.066006600660066, + "grad_norm": 0.46445098519325256, + "learning_rate": 0.00022405370072165043, + "loss": 0.587, + "step": 5976 + }, + { + "epoch": 1.0661849968780661, + "grad_norm": 0.4462120831012726, + "learning_rate": 0.00022398401781593884, + "loss": 0.6272, + "step": 5977 + }, + { + "epoch": 1.0663633930960663, + "grad_norm": 0.48081815242767334, + "learning_rate": 0.00022391433695351131, + "loss": 0.6237, + "step": 5978 + }, + { + "epoch": 1.0665417893140665, + "grad_norm": 0.5043575167655945, + "learning_rate": 0.00022384465813984054, + "loss": 0.6698, + "step": 5979 + }, + { + "epoch": 1.0667201855320667, + "grad_norm": 0.5385834574699402, + "learning_rate": 0.00022377498138039903, + "loss": 0.7248, + "step": 5980 + }, + { + "epoch": 1.066898581750067, + "grad_norm": 0.5373404622077942, + "learning_rate": 0.00022370530668065915, + "loss": 0.7383, + "step": 5981 + }, + { + "epoch": 1.067076977968067, + "grad_norm": 0.464138925075531, + "learning_rate": 0.0002236356340460932, + "loss": 0.5345, + "step": 5982 + }, + { + "epoch": 1.0672553741860673, + "grad_norm": 0.5200468301773071, + "learning_rate": 0.00022356596348217325, + "loss": 0.7235, + "step": 5983 + }, + { + "epoch": 1.0674337704040675, + "grad_norm": 0.5154780745506287, + "learning_rate": 0.00022349629499437116, + "loss": 0.8373, + "step": 5984 + }, + { + "epoch": 1.0676121666220677, + "grad_norm": 0.4483351707458496, + "learning_rate": 0.00022342662858815867, + "loss": 0.5755, + "step": 5985 + }, + { + "epoch": 1.067790562840068, + "grad_norm": 0.4439292848110199, + "learning_rate": 0.0002233569642690073, + "loss": 0.5772, + "step": 5986 + }, + { + "epoch": 1.0679689590580679, + "grad_norm": 0.49343034625053406, + "learning_rate": 0.00022328730204238852, + "loss": 0.7372, + "step": 5987 + }, + { + "epoch": 1.068147355276068, + "grad_norm": 0.4922356605529785, + "learning_rate": 0.00022321764191377347, + "loss": 0.7034, + "step": 5988 + }, + { + "epoch": 1.0683257514940683, + "grad_norm": 0.426301509141922, + "learning_rate": 0.00022314798388863336, + "loss": 0.6708, + "step": 5989 + }, + { + "epoch": 1.0685041477120685, + "grad_norm": 0.46274682879447937, + "learning_rate": 0.000223078327972439, + "loss": 0.5906, + "step": 5990 + }, + { + "epoch": 1.0686825439300687, + "grad_norm": 0.43291884660720825, + "learning_rate": 0.0002230086741706612, + "loss": 0.6506, + "step": 5991 + }, + { + "epoch": 1.0688609401480689, + "grad_norm": 0.4762420952320099, + "learning_rate": 0.00022293902248877052, + "loss": 0.6862, + "step": 5992 + }, + { + "epoch": 1.069039336366069, + "grad_norm": 0.4900970160961151, + "learning_rate": 0.00022286937293223736, + "loss": 0.7371, + "step": 5993 + }, + { + "epoch": 1.0692177325840693, + "grad_norm": 0.44312936067581177, + "learning_rate": 0.00022279972550653203, + "loss": 0.7384, + "step": 5994 + }, + { + "epoch": 1.0693961288020695, + "grad_norm": 0.5585013628005981, + "learning_rate": 0.00022273008021712448, + "loss": 0.9225, + "step": 5995 + }, + { + "epoch": 1.0695745250200697, + "grad_norm": 0.4900633990764618, + "learning_rate": 0.00022266043706948462, + "loss": 0.6349, + "step": 5996 + }, + { + "epoch": 1.0697529212380696, + "grad_norm": 0.47767671942710876, + "learning_rate": 0.00022259079606908237, + "loss": 0.728, + "step": 5997 + }, + { + "epoch": 1.0699313174560698, + "grad_norm": 0.477839857339859, + "learning_rate": 0.00022252115722138724, + "loss": 0.626, + "step": 5998 + }, + { + "epoch": 1.07010971367407, + "grad_norm": 0.47772642970085144, + "learning_rate": 0.00022245152053186853, + "loss": 0.7116, + "step": 5999 + }, + { + "epoch": 1.0702881098920702, + "grad_norm": 0.45496633648872375, + "learning_rate": 0.00022238188600599558, + "loss": 0.6855, + "step": 6000 + }, + { + "epoch": 1.0704665061100704, + "grad_norm": 0.5116882920265198, + "learning_rate": 0.00022231225364923736, + "loss": 0.7783, + "step": 6001 + }, + { + "epoch": 1.0706449023280706, + "grad_norm": 0.5786128044128418, + "learning_rate": 0.00022224262346706288, + "loss": 0.7753, + "step": 6002 + }, + { + "epoch": 1.0708232985460708, + "grad_norm": 0.4571937620639801, + "learning_rate": 0.00022217299546494078, + "loss": 0.6224, + "step": 6003 + }, + { + "epoch": 1.071001694764071, + "grad_norm": 0.4922640919685364, + "learning_rate": 0.00022210336964833966, + "loss": 0.7753, + "step": 6004 + }, + { + "epoch": 1.0711800909820712, + "grad_norm": 0.4930363595485687, + "learning_rate": 0.0002220337460227279, + "loss": 0.7542, + "step": 6005 + }, + { + "epoch": 1.0713584872000714, + "grad_norm": 0.4897131323814392, + "learning_rate": 0.00022196412459357372, + "loss": 0.7468, + "step": 6006 + }, + { + "epoch": 1.0715368834180716, + "grad_norm": 0.5091310739517212, + "learning_rate": 0.00022189450536634506, + "loss": 0.6646, + "step": 6007 + }, + { + "epoch": 1.0717152796360718, + "grad_norm": 0.5187907814979553, + "learning_rate": 0.00022182488834650987, + "loss": 0.816, + "step": 6008 + }, + { + "epoch": 1.0718936758540718, + "grad_norm": 0.5296580195426941, + "learning_rate": 0.00022175527353953585, + "loss": 0.8415, + "step": 6009 + }, + { + "epoch": 1.072072072072072, + "grad_norm": 0.4686925411224365, + "learning_rate": 0.00022168566095089043, + "loss": 0.5842, + "step": 6010 + }, + { + "epoch": 1.0722504682900722, + "grad_norm": 0.5479150414466858, + "learning_rate": 0.000221616050586041, + "loss": 0.7578, + "step": 6011 + }, + { + "epoch": 1.0724288645080724, + "grad_norm": 0.6131009459495544, + "learning_rate": 0.0002215464424504548, + "loss": 0.6768, + "step": 6012 + }, + { + "epoch": 1.0726072607260726, + "grad_norm": 0.47752901911735535, + "learning_rate": 0.00022147683654959876, + "loss": 0.6412, + "step": 6013 + }, + { + "epoch": 1.0727856569440728, + "grad_norm": 0.47805142402648926, + "learning_rate": 0.0002214072328889397, + "loss": 0.6528, + "step": 6014 + }, + { + "epoch": 1.072964053162073, + "grad_norm": 0.44289377331733704, + "learning_rate": 0.00022133763147394426, + "loss": 0.7201, + "step": 6015 + }, + { + "epoch": 1.0731424493800732, + "grad_norm": 0.5617145299911499, + "learning_rate": 0.00022126803231007893, + "loss": 0.7928, + "step": 6016 + }, + { + "epoch": 1.0733208455980734, + "grad_norm": 0.5184254050254822, + "learning_rate": 0.00022119843540280995, + "loss": 0.8179, + "step": 6017 + }, + { + "epoch": 1.0734992418160736, + "grad_norm": 0.4659225344657898, + "learning_rate": 0.00022112884075760347, + "loss": 0.5611, + "step": 6018 + }, + { + "epoch": 1.0736776380340736, + "grad_norm": 0.5036365389823914, + "learning_rate": 0.00022105924837992547, + "loss": 0.9739, + "step": 6019 + }, + { + "epoch": 1.0738560342520738, + "grad_norm": 0.4579788148403168, + "learning_rate": 0.0002209896582752416, + "loss": 0.6486, + "step": 6020 + }, + { + "epoch": 1.074034430470074, + "grad_norm": 0.4959608018398285, + "learning_rate": 0.00022092007044901746, + "loss": 0.9164, + "step": 6021 + }, + { + "epoch": 1.0742128266880742, + "grad_norm": 0.4846688508987427, + "learning_rate": 0.00022085048490671849, + "loss": 0.7229, + "step": 6022 + }, + { + "epoch": 1.0743912229060744, + "grad_norm": 0.4702696204185486, + "learning_rate": 0.0002207809016538099, + "loss": 0.5403, + "step": 6023 + }, + { + "epoch": 1.0745696191240746, + "grad_norm": 0.4317050576210022, + "learning_rate": 0.00022071132069575672, + "loss": 0.6298, + "step": 6024 + }, + { + "epoch": 1.0747480153420748, + "grad_norm": 0.4599125385284424, + "learning_rate": 0.00022064174203802382, + "loss": 0.6337, + "step": 6025 + }, + { + "epoch": 1.074926411560075, + "grad_norm": 0.5147960186004639, + "learning_rate": 0.00022057216568607582, + "loss": 0.6918, + "step": 6026 + }, + { + "epoch": 1.0751048077780752, + "grad_norm": 0.45055848360061646, + "learning_rate": 0.00022050259164537725, + "loss": 0.6336, + "step": 6027 + }, + { + "epoch": 1.0752832039960754, + "grad_norm": 0.5015688538551331, + "learning_rate": 0.00022043301992139247, + "loss": 0.7122, + "step": 6028 + }, + { + "epoch": 1.0754616002140756, + "grad_norm": 0.4941107928752899, + "learning_rate": 0.0002203634505195856, + "loss": 0.7284, + "step": 6029 + }, + { + "epoch": 1.0756399964320758, + "grad_norm": 0.4930790960788727, + "learning_rate": 0.00022029388344542056, + "loss": 0.6292, + "step": 6030 + }, + { + "epoch": 1.0758183926500757, + "grad_norm": 0.461085706949234, + "learning_rate": 0.00022022431870436114, + "loss": 0.5988, + "step": 6031 + }, + { + "epoch": 1.075996788868076, + "grad_norm": 0.5100793838500977, + "learning_rate": 0.00022015475630187095, + "loss": 0.773, + "step": 6032 + }, + { + "epoch": 1.0761751850860761, + "grad_norm": 0.45178812742233276, + "learning_rate": 0.00022008519624341333, + "loss": 0.6794, + "step": 6033 + }, + { + "epoch": 1.0763535813040763, + "grad_norm": 0.44591426849365234, + "learning_rate": 0.0002200156385344515, + "loss": 0.6673, + "step": 6034 + }, + { + "epoch": 1.0765319775220765, + "grad_norm": 0.47172197699546814, + "learning_rate": 0.00021994608318044853, + "loss": 0.7226, + "step": 6035 + }, + { + "epoch": 1.0767103737400767, + "grad_norm": 0.5064817070960999, + "learning_rate": 0.00021987653018686724, + "loss": 0.7875, + "step": 6036 + }, + { + "epoch": 1.076888769958077, + "grad_norm": 0.49598416686058044, + "learning_rate": 0.0002198069795591703, + "loss": 0.6392, + "step": 6037 + }, + { + "epoch": 1.0770671661760771, + "grad_norm": 0.5123727321624756, + "learning_rate": 0.00021973743130282024, + "loss": 0.7137, + "step": 6038 + }, + { + "epoch": 1.0772455623940773, + "grad_norm": 0.48429879546165466, + "learning_rate": 0.00021966788542327926, + "loss": 0.6153, + "step": 6039 + }, + { + "epoch": 1.0774239586120775, + "grad_norm": 0.5295233130455017, + "learning_rate": 0.00021959834192600958, + "loss": 0.7566, + "step": 6040 + }, + { + "epoch": 1.0776023548300775, + "grad_norm": 0.5196706652641296, + "learning_rate": 0.00021952880081647298, + "loss": 0.7511, + "step": 6041 + }, + { + "epoch": 1.0777807510480777, + "grad_norm": 0.4832461476325989, + "learning_rate": 0.00021945926210013112, + "loss": 0.5969, + "step": 6042 + }, + { + "epoch": 1.0779591472660779, + "grad_norm": 0.46703991293907166, + "learning_rate": 0.00021938972578244582, + "loss": 0.5534, + "step": 6043 + }, + { + "epoch": 1.078137543484078, + "grad_norm": 0.5281887054443359, + "learning_rate": 0.00021932019186887824, + "loss": 0.8217, + "step": 6044 + }, + { + "epoch": 1.0783159397020783, + "grad_norm": 0.48991912603378296, + "learning_rate": 0.00021925066036488969, + "loss": 0.6933, + "step": 6045 + }, + { + "epoch": 1.0784943359200785, + "grad_norm": 0.49540868401527405, + "learning_rate": 0.00021918113127594098, + "loss": 0.8362, + "step": 6046 + }, + { + "epoch": 1.0786727321380787, + "grad_norm": 0.4319058358669281, + "learning_rate": 0.00021911160460749295, + "loss": 0.5308, + "step": 6047 + }, + { + "epoch": 1.0788511283560789, + "grad_norm": 0.4906058609485626, + "learning_rate": 0.00021904208036500618, + "loss": 0.6291, + "step": 6048 + }, + { + "epoch": 1.079029524574079, + "grad_norm": 0.46787381172180176, + "learning_rate": 0.0002189725585539411, + "loss": 0.7964, + "step": 6049 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.5059190988540649, + "learning_rate": 0.00021890303917975794, + "loss": 0.7308, + "step": 6050 + }, + { + "epoch": 1.0793863170100795, + "grad_norm": 0.4896734356880188, + "learning_rate": 0.0002188335222479167, + "loss": 0.8345, + "step": 6051 + }, + { + "epoch": 1.0795647132280797, + "grad_norm": 0.48433777689933777, + "learning_rate": 0.0002187640077638772, + "loss": 0.5938, + "step": 6052 + }, + { + "epoch": 1.0797431094460797, + "grad_norm": 0.5422555804252625, + "learning_rate": 0.00021869449573309912, + "loss": 0.8484, + "step": 6053 + }, + { + "epoch": 1.0799215056640799, + "grad_norm": 0.4676606059074402, + "learning_rate": 0.00021862498616104188, + "loss": 0.6853, + "step": 6054 + }, + { + "epoch": 1.08009990188208, + "grad_norm": 0.5379725694656372, + "learning_rate": 0.00021855547905316467, + "loss": 0.8504, + "step": 6055 + }, + { + "epoch": 1.0802782981000802, + "grad_norm": 0.4376412034034729, + "learning_rate": 0.00021848597441492663, + "loss": 0.6913, + "step": 6056 + }, + { + "epoch": 1.0804566943180804, + "grad_norm": 0.48028087615966797, + "learning_rate": 0.0002184164722517865, + "loss": 0.7915, + "step": 6057 + }, + { + "epoch": 1.0806350905360806, + "grad_norm": 0.5001854300498962, + "learning_rate": 0.00021834697256920316, + "loss": 0.7001, + "step": 6058 + }, + { + "epoch": 1.0808134867540808, + "grad_norm": 0.4411584436893463, + "learning_rate": 0.00021827747537263496, + "loss": 0.5408, + "step": 6059 + }, + { + "epoch": 1.080991882972081, + "grad_norm": 0.4824419319629669, + "learning_rate": 0.0002182079806675402, + "loss": 0.6647, + "step": 6060 + }, + { + "epoch": 1.0811702791900812, + "grad_norm": 0.4544886350631714, + "learning_rate": 0.00021813848845937691, + "loss": 0.6396, + "step": 6061 + }, + { + "epoch": 1.0813486754080814, + "grad_norm": 0.5271000266075134, + "learning_rate": 0.00021806899875360307, + "loss": 0.7279, + "step": 6062 + }, + { + "epoch": 1.0815270716260814, + "grad_norm": 0.5003283619880676, + "learning_rate": 0.00021799951155567632, + "loss": 0.7433, + "step": 6063 + }, + { + "epoch": 1.0817054678440816, + "grad_norm": 0.5777568221092224, + "learning_rate": 0.00021793002687105415, + "loss": 0.8419, + "step": 6064 + }, + { + "epoch": 1.0818838640620818, + "grad_norm": 0.4634368121623993, + "learning_rate": 0.00021786054470519388, + "loss": 0.6258, + "step": 6065 + }, + { + "epoch": 1.082062260280082, + "grad_norm": 0.4333091378211975, + "learning_rate": 0.00021779106506355264, + "loss": 0.6735, + "step": 6066 + }, + { + "epoch": 1.0822406564980822, + "grad_norm": 0.45657244324684143, + "learning_rate": 0.00021772158795158725, + "loss": 0.6708, + "step": 6067 + }, + { + "epoch": 1.0824190527160824, + "grad_norm": 0.5117029547691345, + "learning_rate": 0.00021765211337475445, + "loss": 0.7658, + "step": 6068 + }, + { + "epoch": 1.0825974489340826, + "grad_norm": 0.45610618591308594, + "learning_rate": 0.00021758264133851072, + "loss": 0.5174, + "step": 6069 + }, + { + "epoch": 1.0827758451520828, + "grad_norm": 0.4404855966567993, + "learning_rate": 0.0002175131718483124, + "loss": 0.5822, + "step": 6070 + }, + { + "epoch": 1.082954241370083, + "grad_norm": 0.5023370385169983, + "learning_rate": 0.0002174437049096156, + "loss": 0.7341, + "step": 6071 + }, + { + "epoch": 1.0831326375880832, + "grad_norm": 0.5234084725379944, + "learning_rate": 0.00021737424052787618, + "loss": 0.8447, + "step": 6072 + }, + { + "epoch": 1.0833110338060834, + "grad_norm": 0.49975350499153137, + "learning_rate": 0.00021730477870854985, + "loss": 0.6664, + "step": 6073 + }, + { + "epoch": 1.0834894300240836, + "grad_norm": 0.5313869118690491, + "learning_rate": 0.00021723531945709216, + "loss": 0.7133, + "step": 6074 + }, + { + "epoch": 1.0836678262420836, + "grad_norm": 0.513853907585144, + "learning_rate": 0.0002171658627789584, + "loss": 0.9083, + "step": 6075 + }, + { + "epoch": 1.0838462224600838, + "grad_norm": 0.48460695147514343, + "learning_rate": 0.00021709640867960362, + "loss": 0.8195, + "step": 6076 + }, + { + "epoch": 1.084024618678084, + "grad_norm": 0.49029091000556946, + "learning_rate": 0.00021702695716448276, + "loss": 0.6181, + "step": 6077 + }, + { + "epoch": 1.0842030148960842, + "grad_norm": 0.48168060183525085, + "learning_rate": 0.00021695750823905053, + "loss": 0.6585, + "step": 6078 + }, + { + "epoch": 1.0843814111140844, + "grad_norm": 0.5174769759178162, + "learning_rate": 0.00021688806190876136, + "loss": 0.684, + "step": 6079 + }, + { + "epoch": 1.0845598073320846, + "grad_norm": 0.47358494997024536, + "learning_rate": 0.00021681861817906954, + "loss": 0.6681, + "step": 6080 + }, + { + "epoch": 1.0847382035500848, + "grad_norm": 0.5476037859916687, + "learning_rate": 0.00021674917705542918, + "loss": 0.8857, + "step": 6081 + }, + { + "epoch": 1.084916599768085, + "grad_norm": 0.47755712270736694, + "learning_rate": 0.00021667973854329415, + "loss": 0.6948, + "step": 6082 + }, + { + "epoch": 1.0850949959860852, + "grad_norm": 0.48054665327072144, + "learning_rate": 0.0002166103026481181, + "loss": 0.6114, + "step": 6083 + }, + { + "epoch": 1.0852733922040854, + "grad_norm": 0.5116613507270813, + "learning_rate": 0.00021654086937535449, + "loss": 0.7028, + "step": 6084 + }, + { + "epoch": 1.0854517884220853, + "grad_norm": 0.5386189222335815, + "learning_rate": 0.00021647143873045662, + "loss": 0.8417, + "step": 6085 + }, + { + "epoch": 1.0856301846400855, + "grad_norm": 0.770457923412323, + "learning_rate": 0.00021640201071887761, + "loss": 0.7823, + "step": 6086 + }, + { + "epoch": 1.0858085808580857, + "grad_norm": 0.45015949010849, + "learning_rate": 0.00021633258534607013, + "loss": 0.5068, + "step": 6087 + }, + { + "epoch": 1.085986977076086, + "grad_norm": 0.4800700843334198, + "learning_rate": 0.0002162631626174868, + "loss": 0.6321, + "step": 6088 + }, + { + "epoch": 1.0861653732940861, + "grad_norm": 0.5034077167510986, + "learning_rate": 0.0002161937425385803, + "loss": 0.7027, + "step": 6089 + }, + { + "epoch": 1.0863437695120863, + "grad_norm": 0.46986886858940125, + "learning_rate": 0.00021612432511480267, + "loss": 0.5984, + "step": 6090 + }, + { + "epoch": 1.0865221657300865, + "grad_norm": 1.3976149559020996, + "learning_rate": 0.00021605491035160603, + "loss": 0.81, + "step": 6091 + }, + { + "epoch": 1.0867005619480867, + "grad_norm": 0.4368259906768799, + "learning_rate": 0.0002159854982544421, + "loss": 0.5048, + "step": 6092 + }, + { + "epoch": 1.086878958166087, + "grad_norm": 0.45895346999168396, + "learning_rate": 0.00021591608882876249, + "loss": 0.6807, + "step": 6093 + }, + { + "epoch": 1.0870573543840871, + "grad_norm": 0.500730037689209, + "learning_rate": 0.00021584668208001856, + "loss": 0.7392, + "step": 6094 + }, + { + "epoch": 1.0872357506020873, + "grad_norm": 0.5281662344932556, + "learning_rate": 0.00021577727801366158, + "loss": 0.8316, + "step": 6095 + }, + { + "epoch": 1.0874141468200875, + "grad_norm": 0.5275529623031616, + "learning_rate": 0.00021570787663514242, + "loss": 0.7324, + "step": 6096 + }, + { + "epoch": 1.0875925430380875, + "grad_norm": 0.4927082061767578, + "learning_rate": 0.00021563847794991186, + "loss": 0.7293, + "step": 6097 + }, + { + "epoch": 1.0877709392560877, + "grad_norm": 0.49843451380729675, + "learning_rate": 0.0002155690819634205, + "loss": 0.7416, + "step": 6098 + }, + { + "epoch": 1.087949335474088, + "grad_norm": 0.46133577823638916, + "learning_rate": 0.00021549968868111863, + "loss": 0.5853, + "step": 6099 + }, + { + "epoch": 1.088127731692088, + "grad_norm": 0.45552465319633484, + "learning_rate": 0.00021543029810845634, + "loss": 0.6492, + "step": 6100 + }, + { + "epoch": 1.0883061279100883, + "grad_norm": 0.40714430809020996, + "learning_rate": 0.00021536091025088356, + "loss": 0.5971, + "step": 6101 + }, + { + "epoch": 1.0884845241280885, + "grad_norm": 0.46682703495025635, + "learning_rate": 0.00021529152511384997, + "loss": 0.7605, + "step": 6102 + }, + { + "epoch": 1.0886629203460887, + "grad_norm": 0.45366188883781433, + "learning_rate": 0.00021522214270280497, + "loss": 0.6504, + "step": 6103 + }, + { + "epoch": 1.088841316564089, + "grad_norm": 0.44408997893333435, + "learning_rate": 0.00021515276302319807, + "loss": 0.6159, + "step": 6104 + }, + { + "epoch": 1.089019712782089, + "grad_norm": 0.4182736277580261, + "learning_rate": 0.0002150833860804781, + "loss": 0.6284, + "step": 6105 + }, + { + "epoch": 1.0891981090000893, + "grad_norm": 0.4445931911468506, + "learning_rate": 0.00021501401188009397, + "loss": 0.5778, + "step": 6106 + }, + { + "epoch": 1.0893765052180893, + "grad_norm": 0.46306318044662476, + "learning_rate": 0.00021494464042749427, + "loss": 0.6435, + "step": 6107 + }, + { + "epoch": 1.0895549014360895, + "grad_norm": 0.49822935461997986, + "learning_rate": 0.0002148752717281275, + "loss": 0.6868, + "step": 6108 + }, + { + "epoch": 1.0897332976540897, + "grad_norm": 0.4217550754547119, + "learning_rate": 0.0002148059057874417, + "loss": 0.5048, + "step": 6109 + }, + { + "epoch": 1.0899116938720899, + "grad_norm": 0.497555136680603, + "learning_rate": 0.00021473654261088492, + "loss": 0.741, + "step": 6110 + }, + { + "epoch": 1.09009009009009, + "grad_norm": 0.47492715716362, + "learning_rate": 0.00021466718220390494, + "loss": 0.6734, + "step": 6111 + }, + { + "epoch": 1.0902684863080903, + "grad_norm": 0.5085891485214233, + "learning_rate": 0.0002145978245719493, + "loss": 0.6618, + "step": 6112 + }, + { + "epoch": 1.0904468825260905, + "grad_norm": 0.45178478956222534, + "learning_rate": 0.00021452846972046523, + "loss": 0.5666, + "step": 6113 + }, + { + "epoch": 1.0906252787440907, + "grad_norm": 0.44206705689430237, + "learning_rate": 0.0002144591176548999, + "loss": 0.5961, + "step": 6114 + }, + { + "epoch": 1.0908036749620909, + "grad_norm": 0.4810873866081238, + "learning_rate": 0.00021438976838070016, + "loss": 0.7622, + "step": 6115 + }, + { + "epoch": 1.090982071180091, + "grad_norm": 0.45664024353027344, + "learning_rate": 0.00021432042190331266, + "loss": 0.7886, + "step": 6116 + }, + { + "epoch": 1.0911604673980912, + "grad_norm": 0.439494788646698, + "learning_rate": 0.0002142510782281839, + "loss": 0.7159, + "step": 6117 + }, + { + "epoch": 1.0913388636160914, + "grad_norm": 0.5126174688339233, + "learning_rate": 0.00021418173736076007, + "loss": 0.806, + "step": 6118 + }, + { + "epoch": 1.0915172598340914, + "grad_norm": 0.465787798166275, + "learning_rate": 0.00021411239930648713, + "loss": 0.5724, + "step": 6119 + }, + { + "epoch": 1.0916956560520916, + "grad_norm": 0.5169677138328552, + "learning_rate": 0.00021404306407081094, + "loss": 0.7997, + "step": 6120 + }, + { + "epoch": 1.0918740522700918, + "grad_norm": 0.5252036452293396, + "learning_rate": 0.00021397373165917704, + "loss": 0.7794, + "step": 6121 + }, + { + "epoch": 1.092052448488092, + "grad_norm": 0.456243097782135, + "learning_rate": 0.00021390440207703075, + "loss": 0.6367, + "step": 6122 + }, + { + "epoch": 1.0922308447060922, + "grad_norm": 0.45093104243278503, + "learning_rate": 0.00021383507532981717, + "loss": 0.5247, + "step": 6123 + }, + { + "epoch": 1.0924092409240924, + "grad_norm": 0.40191787481307983, + "learning_rate": 0.00021376575142298122, + "loss": 0.4306, + "step": 6124 + }, + { + "epoch": 1.0925876371420926, + "grad_norm": 0.4974512755870819, + "learning_rate": 0.00021369643036196762, + "loss": 0.6029, + "step": 6125 + }, + { + "epoch": 1.0927660333600928, + "grad_norm": 0.4702337682247162, + "learning_rate": 0.0002136271121522207, + "loss": 0.6395, + "step": 6126 + }, + { + "epoch": 1.092944429578093, + "grad_norm": 0.48748883605003357, + "learning_rate": 0.00021355779679918475, + "loss": 0.6442, + "step": 6127 + }, + { + "epoch": 1.0931228257960932, + "grad_norm": 0.4701106548309326, + "learning_rate": 0.00021348848430830376, + "loss": 0.597, + "step": 6128 + }, + { + "epoch": 1.0933012220140932, + "grad_norm": 0.492400586605072, + "learning_rate": 0.00021341917468502148, + "loss": 0.6764, + "step": 6129 + }, + { + "epoch": 1.0934796182320934, + "grad_norm": 0.5202962756156921, + "learning_rate": 0.00021334986793478147, + "loss": 0.8682, + "step": 6130 + }, + { + "epoch": 1.0936580144500936, + "grad_norm": 0.4661516547203064, + "learning_rate": 0.00021328056406302707, + "loss": 0.7784, + "step": 6131 + }, + { + "epoch": 1.0938364106680938, + "grad_norm": 0.5273587107658386, + "learning_rate": 0.0002132112630752014, + "loss": 0.6089, + "step": 6132 + }, + { + "epoch": 1.094014806886094, + "grad_norm": 0.44249820709228516, + "learning_rate": 0.0002131419649767473, + "loss": 0.539, + "step": 6133 + }, + { + "epoch": 1.0941932031040942, + "grad_norm": 0.4947591722011566, + "learning_rate": 0.00021307266977310728, + "loss": 0.8595, + "step": 6134 + }, + { + "epoch": 1.0943715993220944, + "grad_norm": 0.46888822317123413, + "learning_rate": 0.00021300337746972398, + "loss": 0.719, + "step": 6135 + }, + { + "epoch": 1.0945499955400946, + "grad_norm": 0.5104032158851624, + "learning_rate": 0.00021293408807203948, + "loss": 0.7808, + "step": 6136 + }, + { + "epoch": 1.0947283917580948, + "grad_norm": 0.5240311622619629, + "learning_rate": 0.00021286480158549582, + "loss": 0.7997, + "step": 6137 + }, + { + "epoch": 1.094906787976095, + "grad_norm": 0.5189229249954224, + "learning_rate": 0.00021279551801553463, + "loss": 0.8566, + "step": 6138 + }, + { + "epoch": 1.0950851841940952, + "grad_norm": 0.4285975396633148, + "learning_rate": 0.00021272623736759742, + "loss": 0.7213, + "step": 6139 + }, + { + "epoch": 1.0952635804120954, + "grad_norm": 0.5412182211875916, + "learning_rate": 0.0002126569596471255, + "loss": 0.8526, + "step": 6140 + }, + { + "epoch": 1.0954419766300953, + "grad_norm": 0.5037285685539246, + "learning_rate": 0.00021258768485955988, + "loss": 0.7894, + "step": 6141 + }, + { + "epoch": 1.0956203728480955, + "grad_norm": 0.4721333980560303, + "learning_rate": 0.00021251841301034142, + "loss": 0.7696, + "step": 6142 + }, + { + "epoch": 1.0957987690660957, + "grad_norm": 0.5173086524009705, + "learning_rate": 0.00021244914410491062, + "loss": 0.8049, + "step": 6143 + }, + { + "epoch": 1.095977165284096, + "grad_norm": 0.47684210538864136, + "learning_rate": 0.00021237987814870795, + "loss": 0.6094, + "step": 6144 + }, + { + "epoch": 1.0961555615020961, + "grad_norm": 0.42279088497161865, + "learning_rate": 0.0002123106151471734, + "loss": 0.5532, + "step": 6145 + }, + { + "epoch": 1.0963339577200963, + "grad_norm": 0.5065193176269531, + "learning_rate": 0.00021224135510574703, + "loss": 0.6686, + "step": 6146 + }, + { + "epoch": 1.0965123539380965, + "grad_norm": 0.4805443584918976, + "learning_rate": 0.0002121720980298683, + "loss": 0.5822, + "step": 6147 + }, + { + "epoch": 1.0966907501560967, + "grad_norm": 0.4822993278503418, + "learning_rate": 0.0002121028439249767, + "loss": 0.7553, + "step": 6148 + }, + { + "epoch": 1.096869146374097, + "grad_norm": 0.4846903383731842, + "learning_rate": 0.00021203359279651132, + "loss": 0.6991, + "step": 6149 + }, + { + "epoch": 1.0970475425920971, + "grad_norm": 0.4852921962738037, + "learning_rate": 0.0002119643446499114, + "loss": 0.7108, + "step": 6150 + }, + { + "epoch": 1.0972259388100971, + "grad_norm": 0.4798181354999542, + "learning_rate": 0.00021189509949061543, + "loss": 0.6163, + "step": 6151 + }, + { + "epoch": 1.0974043350280973, + "grad_norm": 0.5046129822731018, + "learning_rate": 0.00021182585732406196, + "loss": 0.8545, + "step": 6152 + }, + { + "epoch": 1.0975827312460975, + "grad_norm": 0.5011611580848694, + "learning_rate": 0.0002117566181556892, + "loss": 0.7396, + "step": 6153 + }, + { + "epoch": 1.0977611274640977, + "grad_norm": 0.4669513702392578, + "learning_rate": 0.0002116873819909352, + "loss": 0.6778, + "step": 6154 + }, + { + "epoch": 1.097939523682098, + "grad_norm": 0.4715968370437622, + "learning_rate": 0.00021161814883523773, + "loss": 0.7572, + "step": 6155 + }, + { + "epoch": 1.098117919900098, + "grad_norm": 0.9391183257102966, + "learning_rate": 0.00021154891869403433, + "loss": 0.7393, + "step": 6156 + }, + { + "epoch": 1.0982963161180983, + "grad_norm": 0.49894896149635315, + "learning_rate": 0.00021147969157276234, + "loss": 0.7028, + "step": 6157 + }, + { + "epoch": 1.0984747123360985, + "grad_norm": 0.4233601987361908, + "learning_rate": 0.00021141046747685883, + "loss": 0.6457, + "step": 6158 + }, + { + "epoch": 1.0986531085540987, + "grad_norm": 0.5224723219871521, + "learning_rate": 0.00021134124641176052, + "loss": 0.8284, + "step": 6159 + }, + { + "epoch": 1.098831504772099, + "grad_norm": 0.4575176537036896, + "learning_rate": 0.00021127202838290413, + "loss": 0.6213, + "step": 6160 + }, + { + "epoch": 1.099009900990099, + "grad_norm": 0.4861457347869873, + "learning_rate": 0.00021120281339572595, + "loss": 0.7269, + "step": 6161 + }, + { + "epoch": 1.0991882972080993, + "grad_norm": 0.49578866362571716, + "learning_rate": 0.00021113360145566206, + "loss": 0.7596, + "step": 6162 + }, + { + "epoch": 1.0993666934260993, + "grad_norm": 0.43700939416885376, + "learning_rate": 0.00021106439256814844, + "loss": 0.6501, + "step": 6163 + }, + { + "epoch": 1.0995450896440995, + "grad_norm": 0.5099201798439026, + "learning_rate": 0.00021099518673862061, + "loss": 0.9244, + "step": 6164 + }, + { + "epoch": 1.0997234858620997, + "grad_norm": 0.434171587228775, + "learning_rate": 0.00021092598397251408, + "loss": 0.543, + "step": 6165 + }, + { + "epoch": 1.0999018820800999, + "grad_norm": 0.45645880699157715, + "learning_rate": 0.00021085678427526394, + "loss": 0.661, + "step": 6166 + }, + { + "epoch": 1.1000802782981, + "grad_norm": 0.45062491297721863, + "learning_rate": 0.00021078758765230514, + "loss": 0.5422, + "step": 6167 + }, + { + "epoch": 1.1002586745161003, + "grad_norm": 0.4871162176132202, + "learning_rate": 0.00021071839410907232, + "loss": 0.7189, + "step": 6168 + }, + { + "epoch": 1.1004370707341005, + "grad_norm": 0.3932052254676819, + "learning_rate": 0.00021064920365099994, + "loss": 0.4697, + "step": 6169 + }, + { + "epoch": 1.1006154669521007, + "grad_norm": 0.46333831548690796, + "learning_rate": 0.00021058001628352214, + "loss": 0.6093, + "step": 6170 + }, + { + "epoch": 1.1007938631701009, + "grad_norm": 0.4353841543197632, + "learning_rate": 0.00021051083201207297, + "loss": 0.555, + "step": 6171 + }, + { + "epoch": 1.100972259388101, + "grad_norm": 0.4251698851585388, + "learning_rate": 0.000210441650842086, + "loss": 0.616, + "step": 6172 + }, + { + "epoch": 1.101150655606101, + "grad_norm": 0.5128315091133118, + "learning_rate": 0.00021037247277899473, + "loss": 0.8073, + "step": 6173 + }, + { + "epoch": 1.1013290518241012, + "grad_norm": 0.49934786558151245, + "learning_rate": 0.00021030329782823244, + "loss": 0.7532, + "step": 6174 + }, + { + "epoch": 1.1015074480421014, + "grad_norm": 0.45256903767585754, + "learning_rate": 0.00021023412599523202, + "loss": 0.6656, + "step": 6175 + }, + { + "epoch": 1.1016858442601016, + "grad_norm": 0.4254659414291382, + "learning_rate": 0.00021016495728542626, + "loss": 0.5954, + "step": 6176 + }, + { + "epoch": 1.1018642404781018, + "grad_norm": 0.4867410957813263, + "learning_rate": 0.00021009579170424758, + "loss": 0.7753, + "step": 6177 + }, + { + "epoch": 1.102042636696102, + "grad_norm": 0.42600226402282715, + "learning_rate": 0.00021002662925712827, + "loss": 0.6506, + "step": 6178 + }, + { + "epoch": 1.1022210329141022, + "grad_norm": 0.4646792709827423, + "learning_rate": 0.00020995746994950036, + "loss": 0.722, + "step": 6179 + }, + { + "epoch": 1.1023994291321024, + "grad_norm": 0.4408302903175354, + "learning_rate": 0.00020988831378679536, + "loss": 0.5475, + "step": 6180 + }, + { + "epoch": 1.1025778253501026, + "grad_norm": 0.5261194705963135, + "learning_rate": 0.000209819160774445, + "loss": 0.8598, + "step": 6181 + }, + { + "epoch": 1.1027562215681028, + "grad_norm": 0.4445323944091797, + "learning_rate": 0.00020975001091788048, + "loss": 0.5923, + "step": 6182 + }, + { + "epoch": 1.102934617786103, + "grad_norm": 0.8575343489646912, + "learning_rate": 0.0002096808642225328, + "loss": 0.6453, + "step": 6183 + }, + { + "epoch": 1.1031130140041032, + "grad_norm": 0.47943076491355896, + "learning_rate": 0.00020961172069383275, + "loss": 0.6593, + "step": 6184 + }, + { + "epoch": 1.1032914102221032, + "grad_norm": 0.4768347442150116, + "learning_rate": 0.00020954258033721072, + "loss": 0.81, + "step": 6185 + }, + { + "epoch": 1.1034698064401034, + "grad_norm": 0.5110130906105042, + "learning_rate": 0.00020947344315809703, + "loss": 0.7159, + "step": 6186 + }, + { + "epoch": 1.1036482026581036, + "grad_norm": 0.5536441206932068, + "learning_rate": 0.00020940430916192165, + "loss": 0.7023, + "step": 6187 + }, + { + "epoch": 1.1038265988761038, + "grad_norm": 0.5131134986877441, + "learning_rate": 0.00020933517835411436, + "loss": 0.7933, + "step": 6188 + }, + { + "epoch": 1.104004995094104, + "grad_norm": 0.4293597936630249, + "learning_rate": 0.0002092660507401047, + "loss": 0.5803, + "step": 6189 + }, + { + "epoch": 1.1041833913121042, + "grad_norm": 0.5110434293746948, + "learning_rate": 0.00020919692632532182, + "loss": 0.7934, + "step": 6190 + }, + { + "epoch": 1.1043617875301044, + "grad_norm": 0.5271182656288147, + "learning_rate": 0.00020912780511519484, + "loss": 0.8077, + "step": 6191 + }, + { + "epoch": 1.1045401837481046, + "grad_norm": 0.4269711673259735, + "learning_rate": 0.00020905868711515248, + "loss": 0.6145, + "step": 6192 + }, + { + "epoch": 1.1047185799661048, + "grad_norm": 0.4809115529060364, + "learning_rate": 0.0002089895723306232, + "loss": 0.6951, + "step": 6193 + }, + { + "epoch": 1.104896976184105, + "grad_norm": 0.5060098171234131, + "learning_rate": 0.00020892046076703523, + "loss": 0.9445, + "step": 6194 + }, + { + "epoch": 1.105075372402105, + "grad_norm": 0.5290777683258057, + "learning_rate": 0.00020885135242981647, + "loss": 0.7443, + "step": 6195 + }, + { + "epoch": 1.1052537686201052, + "grad_norm": 0.47494322061538696, + "learning_rate": 0.00020878224732439493, + "loss": 0.7927, + "step": 6196 + }, + { + "epoch": 1.1054321648381054, + "grad_norm": 0.44407710433006287, + "learning_rate": 0.000208713145456198, + "loss": 0.5888, + "step": 6197 + }, + { + "epoch": 1.1056105610561056, + "grad_norm": 0.45321714878082275, + "learning_rate": 0.00020864404683065276, + "loss": 0.6145, + "step": 6198 + }, + { + "epoch": 1.1057889572741058, + "grad_norm": 0.4794192910194397, + "learning_rate": 0.00020857495145318634, + "loss": 0.7451, + "step": 6199 + }, + { + "epoch": 1.105967353492106, + "grad_norm": 0.4514195919036865, + "learning_rate": 0.0002085058593292254, + "loss": 0.6112, + "step": 6200 + }, + { + "epoch": 1.1061457497101062, + "grad_norm": 0.48285120725631714, + "learning_rate": 0.00020843677046419637, + "loss": 0.7315, + "step": 6201 + }, + { + "epoch": 1.1063241459281064, + "grad_norm": 0.4666329324245453, + "learning_rate": 0.00020836768486352553, + "loss": 0.5705, + "step": 6202 + }, + { + "epoch": 1.1065025421461065, + "grad_norm": 0.5958988070487976, + "learning_rate": 0.0002082986025326388, + "loss": 0.6226, + "step": 6203 + }, + { + "epoch": 1.1066809383641067, + "grad_norm": 0.5066222548484802, + "learning_rate": 0.00020822952347696188, + "loss": 0.5811, + "step": 6204 + }, + { + "epoch": 1.106859334582107, + "grad_norm": 0.4955041706562042, + "learning_rate": 0.00020816044770192028, + "loss": 0.8117, + "step": 6205 + }, + { + "epoch": 1.1070377308001071, + "grad_norm": 0.4577980637550354, + "learning_rate": 0.00020809137521293902, + "loss": 0.5482, + "step": 6206 + }, + { + "epoch": 1.1072161270181071, + "grad_norm": 0.581256091594696, + "learning_rate": 0.00020802230601544314, + "loss": 0.9406, + "step": 6207 + }, + { + "epoch": 1.1073945232361073, + "grad_norm": 0.43365752696990967, + "learning_rate": 0.00020795324011485728, + "loss": 0.5084, + "step": 6208 + }, + { + "epoch": 1.1075729194541075, + "grad_norm": 0.4679083526134491, + "learning_rate": 0.00020788417751660588, + "loss": 0.6819, + "step": 6209 + }, + { + "epoch": 1.1077513156721077, + "grad_norm": 0.41970065236091614, + "learning_rate": 0.00020781511822611296, + "loss": 0.4688, + "step": 6210 + }, + { + "epoch": 1.107929711890108, + "grad_norm": 0.4997738301753998, + "learning_rate": 0.00020774606224880255, + "loss": 0.5245, + "step": 6211 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 0.4666350781917572, + "learning_rate": 0.00020767700959009824, + "loss": 0.7218, + "step": 6212 + }, + { + "epoch": 1.1082865043261083, + "grad_norm": 0.4743230938911438, + "learning_rate": 0.0002076079602554234, + "loss": 0.6804, + "step": 6213 + }, + { + "epoch": 1.1084649005441085, + "grad_norm": 0.5221150517463684, + "learning_rate": 0.0002075389142502011, + "loss": 0.8767, + "step": 6214 + }, + { + "epoch": 1.1086432967621087, + "grad_norm": 0.4845503866672516, + "learning_rate": 0.0002074698715798542, + "loss": 0.6975, + "step": 6215 + }, + { + "epoch": 1.108821692980109, + "grad_norm": 0.424908846616745, + "learning_rate": 0.0002074008322498053, + "loss": 0.4744, + "step": 6216 + }, + { + "epoch": 1.1090000891981089, + "grad_norm": 0.44817301630973816, + "learning_rate": 0.00020733179626547667, + "loss": 0.5131, + "step": 6217 + }, + { + "epoch": 1.109178485416109, + "grad_norm": 0.49306273460388184, + "learning_rate": 0.0002072627636322905, + "loss": 0.7012, + "step": 6218 + }, + { + "epoch": 1.1093568816341093, + "grad_norm": 0.4819309413433075, + "learning_rate": 0.00020719373435566842, + "loss": 0.6232, + "step": 6219 + }, + { + "epoch": 1.1095352778521095, + "grad_norm": 0.5482651591300964, + "learning_rate": 0.00020712470844103198, + "loss": 0.8316, + "step": 6220 + }, + { + "epoch": 1.1097136740701097, + "grad_norm": 0.47659358382225037, + "learning_rate": 0.00020705568589380252, + "loss": 0.7383, + "step": 6221 + }, + { + "epoch": 1.1098920702881099, + "grad_norm": 0.4586566686630249, + "learning_rate": 0.00020698666671940103, + "loss": 0.6438, + "step": 6222 + }, + { + "epoch": 1.11007046650611, + "grad_norm": 0.5156270861625671, + "learning_rate": 0.0002069176509232482, + "loss": 0.7463, + "step": 6223 + }, + { + "epoch": 1.1102488627241103, + "grad_norm": 0.4540422260761261, + "learning_rate": 0.0002068486385107645, + "loss": 0.7645, + "step": 6224 + }, + { + "epoch": 1.1104272589421105, + "grad_norm": 0.39585331082344055, + "learning_rate": 0.00020677962948737022, + "loss": 0.5105, + "step": 6225 + }, + { + "epoch": 1.1106056551601107, + "grad_norm": 0.4956061840057373, + "learning_rate": 0.00020671062385848517, + "loss": 0.7741, + "step": 6226 + }, + { + "epoch": 1.1107840513781109, + "grad_norm": 0.45627540349960327, + "learning_rate": 0.00020664162162952913, + "loss": 0.6525, + "step": 6227 + }, + { + "epoch": 1.110962447596111, + "grad_norm": 0.45656707882881165, + "learning_rate": 0.00020657262280592147, + "loss": 0.6371, + "step": 6228 + }, + { + "epoch": 1.111140843814111, + "grad_norm": 0.5140594840049744, + "learning_rate": 0.0002065036273930813, + "loss": 0.8361, + "step": 6229 + }, + { + "epoch": 1.1113192400321112, + "grad_norm": 0.4385344386100769, + "learning_rate": 0.00020643463539642766, + "loss": 0.5181, + "step": 6230 + }, + { + "epoch": 1.1114976362501114, + "grad_norm": 0.5141750574111938, + "learning_rate": 0.00020636564682137887, + "loss": 0.5044, + "step": 6231 + }, + { + "epoch": 1.1116760324681116, + "grad_norm": 0.5330828428268433, + "learning_rate": 0.00020629666167335342, + "loss": 0.7067, + "step": 6232 + }, + { + "epoch": 1.1118544286861118, + "grad_norm": 0.5701097249984741, + "learning_rate": 0.00020622767995776936, + "loss": 0.8013, + "step": 6233 + }, + { + "epoch": 1.112032824904112, + "grad_norm": 0.5556646585464478, + "learning_rate": 0.00020615870168004449, + "loss": 0.8375, + "step": 6234 + }, + { + "epoch": 1.1122112211221122, + "grad_norm": 0.5408580303192139, + "learning_rate": 0.0002060897268455963, + "loss": 0.5886, + "step": 6235 + }, + { + "epoch": 1.1123896173401124, + "grad_norm": 0.4653705954551697, + "learning_rate": 0.0002060207554598421, + "loss": 0.6007, + "step": 6236 + }, + { + "epoch": 1.1125680135581126, + "grad_norm": 0.485100656747818, + "learning_rate": 0.00020595178752819883, + "loss": 0.6615, + "step": 6237 + }, + { + "epoch": 1.1127464097761128, + "grad_norm": 0.4773385226726532, + "learning_rate": 0.00020588282305608325, + "loss": 0.7062, + "step": 6238 + }, + { + "epoch": 1.1129248059941128, + "grad_norm": 0.538426399230957, + "learning_rate": 0.00020581386204891172, + "loss": 0.8382, + "step": 6239 + }, + { + "epoch": 1.113103202212113, + "grad_norm": 0.45640110969543457, + "learning_rate": 0.00020574490451210045, + "loss": 0.6707, + "step": 6240 + }, + { + "epoch": 1.1132815984301132, + "grad_norm": 0.5065681338310242, + "learning_rate": 0.00020567595045106523, + "loss": 0.7169, + "step": 6241 + }, + { + "epoch": 1.1134599946481134, + "grad_norm": 0.49156734347343445, + "learning_rate": 0.0002056069998712219, + "loss": 0.6726, + "step": 6242 + }, + { + "epoch": 1.1136383908661136, + "grad_norm": 0.4353838860988617, + "learning_rate": 0.00020553805277798574, + "loss": 0.5635, + "step": 6243 + }, + { + "epoch": 1.1138167870841138, + "grad_norm": 0.5049417614936829, + "learning_rate": 0.00020546910917677172, + "loss": 0.8726, + "step": 6244 + }, + { + "epoch": 1.113995183302114, + "grad_norm": 0.46186742186546326, + "learning_rate": 0.00020540016907299473, + "loss": 0.5505, + "step": 6245 + }, + { + "epoch": 1.1141735795201142, + "grad_norm": 0.38499900698661804, + "learning_rate": 0.0002053312324720692, + "loss": 0.4884, + "step": 6246 + }, + { + "epoch": 1.1143519757381144, + "grad_norm": 0.4557759165763855, + "learning_rate": 0.00020526229937940956, + "loss": 0.5865, + "step": 6247 + }, + { + "epoch": 1.1145303719561146, + "grad_norm": 0.4711809754371643, + "learning_rate": 0.00020519336980042956, + "loss": 0.6212, + "step": 6248 + }, + { + "epoch": 1.1147087681741148, + "grad_norm": 0.5073553323745728, + "learning_rate": 0.00020512444374054309, + "loss": 0.7891, + "step": 6249 + }, + { + "epoch": 1.114887164392115, + "grad_norm": 0.5361490845680237, + "learning_rate": 0.00020505552120516347, + "loss": 0.8727, + "step": 6250 + }, + { + "epoch": 1.115065560610115, + "grad_norm": 0.49489954113960266, + "learning_rate": 0.00020498660219970394, + "loss": 0.6391, + "step": 6251 + }, + { + "epoch": 1.1152439568281152, + "grad_norm": 0.44518017768859863, + "learning_rate": 0.00020491768672957722, + "loss": 0.6854, + "step": 6252 + }, + { + "epoch": 1.1154223530461154, + "grad_norm": 0.501320481300354, + "learning_rate": 0.00020484877480019602, + "loss": 0.8994, + "step": 6253 + }, + { + "epoch": 1.1156007492641156, + "grad_norm": 0.5842064023017883, + "learning_rate": 0.0002047798664169726, + "loss": 0.988, + "step": 6254 + }, + { + "epoch": 1.1157791454821158, + "grad_norm": 0.4702877998352051, + "learning_rate": 0.000204710961585319, + "loss": 0.8212, + "step": 6255 + }, + { + "epoch": 1.115957541700116, + "grad_norm": 0.4764789640903473, + "learning_rate": 0.00020464206031064694, + "loss": 0.6702, + "step": 6256 + }, + { + "epoch": 1.1161359379181162, + "grad_norm": 0.4664061367511749, + "learning_rate": 0.000204573162598368, + "loss": 0.6985, + "step": 6257 + }, + { + "epoch": 1.1163143341361164, + "grad_norm": 1.0998966693878174, + "learning_rate": 0.00020450426845389333, + "loss": 0.6394, + "step": 6258 + }, + { + "epoch": 1.1164927303541166, + "grad_norm": 0.4073130786418915, + "learning_rate": 0.00020443537788263384, + "loss": 0.5515, + "step": 6259 + }, + { + "epoch": 1.1166711265721168, + "grad_norm": 0.48286718130111694, + "learning_rate": 0.00020436649089000013, + "loss": 0.7877, + "step": 6260 + }, + { + "epoch": 1.1168495227901167, + "grad_norm": 0.505884051322937, + "learning_rate": 0.00020429760748140262, + "loss": 0.7128, + "step": 6261 + }, + { + "epoch": 1.117027919008117, + "grad_norm": 0.6370548605918884, + "learning_rate": 0.00020422872766225137, + "loss": 0.9009, + "step": 6262 + }, + { + "epoch": 1.1172063152261171, + "grad_norm": 0.4987682104110718, + "learning_rate": 0.00020415985143795612, + "loss": 0.6506, + "step": 6263 + }, + { + "epoch": 1.1173847114441173, + "grad_norm": 0.7356500029563904, + "learning_rate": 0.00020409097881392646, + "loss": 0.7871, + "step": 6264 + }, + { + "epoch": 1.1175631076621175, + "grad_norm": 0.578613817691803, + "learning_rate": 0.00020402210979557153, + "loss": 0.7906, + "step": 6265 + }, + { + "epoch": 1.1177415038801177, + "grad_norm": 0.4457342028617859, + "learning_rate": 0.00020395324438830033, + "loss": 0.6093, + "step": 6266 + }, + { + "epoch": 1.117919900098118, + "grad_norm": 0.4944089949131012, + "learning_rate": 0.00020388438259752147, + "loss": 0.7959, + "step": 6267 + }, + { + "epoch": 1.1180982963161181, + "grad_norm": 0.46992331743240356, + "learning_rate": 0.00020381552442864337, + "loss": 0.6263, + "step": 6268 + }, + { + "epoch": 1.1182766925341183, + "grad_norm": 0.42646077275276184, + "learning_rate": 0.00020374666988707407, + "loss": 0.5327, + "step": 6269 + }, + { + "epoch": 1.1184550887521185, + "grad_norm": 0.48363184928894043, + "learning_rate": 0.00020367781897822146, + "loss": 0.7761, + "step": 6270 + }, + { + "epoch": 1.1186334849701187, + "grad_norm": 0.47337979078292847, + "learning_rate": 0.00020360897170749299, + "loss": 0.6731, + "step": 6271 + }, + { + "epoch": 1.118811881188119, + "grad_norm": 0.4817976653575897, + "learning_rate": 0.00020354012808029587, + "loss": 0.6265, + "step": 6272 + }, + { + "epoch": 1.118990277406119, + "grad_norm": 0.45678937435150146, + "learning_rate": 0.00020347128810203717, + "loss": 0.6553, + "step": 6273 + }, + { + "epoch": 1.119168673624119, + "grad_norm": 0.483571857213974, + "learning_rate": 0.00020340245177812344, + "loss": 0.746, + "step": 6274 + }, + { + "epoch": 1.1193470698421193, + "grad_norm": 0.4845406115055084, + "learning_rate": 0.00020333361911396112, + "loss": 0.6712, + "step": 6275 + }, + { + "epoch": 1.1195254660601195, + "grad_norm": 0.464175820350647, + "learning_rate": 0.00020326479011495627, + "loss": 0.5484, + "step": 6276 + }, + { + "epoch": 1.1197038622781197, + "grad_norm": 0.48887899518013, + "learning_rate": 0.00020319596478651477, + "loss": 0.6694, + "step": 6277 + }, + { + "epoch": 1.1198822584961199, + "grad_norm": 0.4447461664676666, + "learning_rate": 0.00020312714313404197, + "loss": 0.4637, + "step": 6278 + }, + { + "epoch": 1.12006065471412, + "grad_norm": 0.5541086792945862, + "learning_rate": 0.00020305832516294314, + "loss": 0.7051, + "step": 6279 + }, + { + "epoch": 1.1202390509321203, + "grad_norm": 0.43977195024490356, + "learning_rate": 0.00020298951087862333, + "loss": 0.5394, + "step": 6280 + }, + { + "epoch": 1.1204174471501205, + "grad_norm": 0.5157051682472229, + "learning_rate": 0.00020292070028648707, + "loss": 0.7626, + "step": 6281 + }, + { + "epoch": 1.1205958433681207, + "grad_norm": 0.4950587749481201, + "learning_rate": 0.00020285189339193873, + "loss": 0.704, + "step": 6282 + }, + { + "epoch": 1.1207742395861207, + "grad_norm": 0.4870520532131195, + "learning_rate": 0.0002027830902003824, + "loss": 0.6548, + "step": 6283 + }, + { + "epoch": 1.1209526358041209, + "grad_norm": 0.5210698246955872, + "learning_rate": 0.00020271429071722186, + "loss": 0.6496, + "step": 6284 + }, + { + "epoch": 1.121131032022121, + "grad_norm": 0.4650278091430664, + "learning_rate": 0.00020264549494786066, + "loss": 0.5775, + "step": 6285 + }, + { + "epoch": 1.1213094282401213, + "grad_norm": 0.5056297779083252, + "learning_rate": 0.00020257670289770181, + "loss": 0.7124, + "step": 6286 + }, + { + "epoch": 1.1214878244581215, + "grad_norm": 0.4929437041282654, + "learning_rate": 0.00020250791457214823, + "loss": 0.9018, + "step": 6287 + }, + { + "epoch": 1.1216662206761217, + "grad_norm": 0.5105372071266174, + "learning_rate": 0.0002024391299766027, + "loss": 0.7514, + "step": 6288 + }, + { + "epoch": 1.1218446168941218, + "grad_norm": 0.4985578656196594, + "learning_rate": 0.00020237034911646745, + "loss": 0.6705, + "step": 6289 + }, + { + "epoch": 1.122023013112122, + "grad_norm": 0.5785099864006042, + "learning_rate": 0.0002023015719971445, + "loss": 1.1394, + "step": 6290 + }, + { + "epoch": 1.1222014093301222, + "grad_norm": 0.5343894362449646, + "learning_rate": 0.0002022327986240355, + "loss": 0.8509, + "step": 6291 + }, + { + "epoch": 1.1223798055481224, + "grad_norm": 0.4681146442890167, + "learning_rate": 0.00020216402900254197, + "loss": 0.5766, + "step": 6292 + }, + { + "epoch": 1.1225582017661226, + "grad_norm": 0.4778856039047241, + "learning_rate": 0.000202095263138065, + "loss": 0.8314, + "step": 6293 + }, + { + "epoch": 1.1227365979841228, + "grad_norm": 0.44656750559806824, + "learning_rate": 0.00020202650103600544, + "loss": 0.6106, + "step": 6294 + }, + { + "epoch": 1.1229149942021228, + "grad_norm": 0.5104446411132812, + "learning_rate": 0.00020195774270176386, + "loss": 0.6599, + "step": 6295 + }, + { + "epoch": 1.123093390420123, + "grad_norm": 0.44691547751426697, + "learning_rate": 0.0002018889881407405, + "loss": 0.5945, + "step": 6296 + }, + { + "epoch": 1.1232717866381232, + "grad_norm": 0.5005384683609009, + "learning_rate": 0.00020182023735833531, + "loss": 0.5805, + "step": 6297 + }, + { + "epoch": 1.1234501828561234, + "grad_norm": 0.5261300802230835, + "learning_rate": 0.000201751490359948, + "loss": 0.825, + "step": 6298 + }, + { + "epoch": 1.1236285790741236, + "grad_norm": 0.47424525022506714, + "learning_rate": 0.00020168274715097782, + "loss": 0.6754, + "step": 6299 + }, + { + "epoch": 1.1238069752921238, + "grad_norm": 0.48731860518455505, + "learning_rate": 0.00020161400773682387, + "loss": 0.6607, + "step": 6300 + }, + { + "epoch": 1.123985371510124, + "grad_norm": 0.4794895350933075, + "learning_rate": 0.00020154527212288493, + "loss": 0.6262, + "step": 6301 + }, + { + "epoch": 1.1241637677281242, + "grad_norm": 0.4823654592037201, + "learning_rate": 0.00020147654031455942, + "loss": 0.6762, + "step": 6302 + }, + { + "epoch": 1.1243421639461244, + "grad_norm": 0.5071163177490234, + "learning_rate": 0.0002014078123172456, + "loss": 0.7975, + "step": 6303 + }, + { + "epoch": 1.1245205601641246, + "grad_norm": 0.4861220419406891, + "learning_rate": 0.0002013390881363413, + "loss": 0.6742, + "step": 6304 + }, + { + "epoch": 1.1246989563821246, + "grad_norm": 0.5352680087089539, + "learning_rate": 0.00020127036777724407, + "loss": 0.7588, + "step": 6305 + }, + { + "epoch": 1.1248773526001248, + "grad_norm": 0.4422907829284668, + "learning_rate": 0.00020120165124535119, + "loss": 0.6386, + "step": 6306 + }, + { + "epoch": 1.125055748818125, + "grad_norm": 0.4726184606552124, + "learning_rate": 0.00020113293854605963, + "loss": 0.6324, + "step": 6307 + }, + { + "epoch": 1.1252341450361252, + "grad_norm": 0.5098016858100891, + "learning_rate": 0.00020106422968476604, + "loss": 0.7632, + "step": 6308 + }, + { + "epoch": 1.1254125412541254, + "grad_norm": 0.4972569942474365, + "learning_rate": 0.00020099552466686677, + "loss": 0.6985, + "step": 6309 + }, + { + "epoch": 1.1255909374721256, + "grad_norm": 0.4584605097770691, + "learning_rate": 0.00020092682349775797, + "loss": 0.6985, + "step": 6310 + }, + { + "epoch": 1.1257693336901258, + "grad_norm": 0.49379658699035645, + "learning_rate": 0.0002008581261828353, + "loss": 0.5424, + "step": 6311 + }, + { + "epoch": 1.125947729908126, + "grad_norm": 0.5629817247390747, + "learning_rate": 0.00020078943272749426, + "loss": 0.6988, + "step": 6312 + }, + { + "epoch": 1.1261261261261262, + "grad_norm": 0.49711212515830994, + "learning_rate": 0.00020072074313712995, + "loss": 0.6466, + "step": 6313 + }, + { + "epoch": 1.1263045223441264, + "grad_norm": 0.5518889427185059, + "learning_rate": 0.00020065205741713732, + "loss": 0.7526, + "step": 6314 + }, + { + "epoch": 1.1264829185621266, + "grad_norm": 0.44839757680892944, + "learning_rate": 0.00020058337557291085, + "loss": 0.6743, + "step": 6315 + }, + { + "epoch": 1.1266613147801268, + "grad_norm": 0.4963269531726837, + "learning_rate": 0.00020051469760984475, + "loss": 0.7584, + "step": 6316 + }, + { + "epoch": 1.126839710998127, + "grad_norm": 0.4421939551830292, + "learning_rate": 0.00020044602353333304, + "loss": 0.6155, + "step": 6317 + }, + { + "epoch": 1.127018107216127, + "grad_norm": 0.47304272651672363, + "learning_rate": 0.00020037735334876928, + "loss": 0.843, + "step": 6318 + }, + { + "epoch": 1.1271965034341271, + "grad_norm": 0.4691406786441803, + "learning_rate": 0.00020030868706154688, + "loss": 0.8837, + "step": 6319 + }, + { + "epoch": 1.1273748996521273, + "grad_norm": 0.470851868391037, + "learning_rate": 0.00020024002467705878, + "loss": 0.7696, + "step": 6320 + }, + { + "epoch": 1.1275532958701275, + "grad_norm": 0.5345527529716492, + "learning_rate": 0.00020017136620069777, + "loss": 0.9637, + "step": 6321 + }, + { + "epoch": 1.1277316920881277, + "grad_norm": 0.47198501229286194, + "learning_rate": 0.00020010271163785622, + "loss": 0.705, + "step": 6322 + }, + { + "epoch": 1.127910088306128, + "grad_norm": 0.5204360485076904, + "learning_rate": 0.00020003406099392625, + "loss": 0.6943, + "step": 6323 + }, + { + "epoch": 1.1280884845241281, + "grad_norm": 0.49591392278671265, + "learning_rate": 0.00019996541427429957, + "loss": 0.6205, + "step": 6324 + }, + { + "epoch": 1.1282668807421283, + "grad_norm": 0.45050108432769775, + "learning_rate": 0.0001998967714843677, + "loss": 0.5172, + "step": 6325 + }, + { + "epoch": 1.1284452769601285, + "grad_norm": 0.45870310068130493, + "learning_rate": 0.0001998281326295219, + "loss": 0.6089, + "step": 6326 + }, + { + "epoch": 1.1286236731781285, + "grad_norm": 0.4750981032848358, + "learning_rate": 0.00019975949771515296, + "loss": 0.6546, + "step": 6327 + }, + { + "epoch": 1.1288020693961287, + "grad_norm": 1.4412025213241577, + "learning_rate": 0.0001996908667466514, + "loss": 0.7441, + "step": 6328 + }, + { + "epoch": 1.128980465614129, + "grad_norm": 0.49182528257369995, + "learning_rate": 0.00019962223972940757, + "loss": 0.6518, + "step": 6329 + }, + { + "epoch": 1.129158861832129, + "grad_norm": 0.5143123269081116, + "learning_rate": 0.00019955361666881133, + "loss": 0.6357, + "step": 6330 + }, + { + "epoch": 1.1293372580501293, + "grad_norm": 1.9303398132324219, + "learning_rate": 0.00019948499757025239, + "loss": 0.5802, + "step": 6331 + }, + { + "epoch": 1.1295156542681295, + "grad_norm": 0.5342963337898254, + "learning_rate": 0.00019941638243911993, + "loss": 0.6309, + "step": 6332 + }, + { + "epoch": 1.1296940504861297, + "grad_norm": 0.4394116997718811, + "learning_rate": 0.00019934777128080292, + "loss": 0.6286, + "step": 6333 + }, + { + "epoch": 1.12987244670413, + "grad_norm": 0.5031057596206665, + "learning_rate": 0.00019927916410069027, + "loss": 0.6877, + "step": 6334 + }, + { + "epoch": 1.13005084292213, + "grad_norm": 0.4916764795780182, + "learning_rate": 0.00019921056090417026, + "loss": 0.7054, + "step": 6335 + }, + { + "epoch": 1.1302292391401303, + "grad_norm": 0.535476803779602, + "learning_rate": 0.00019914196169663095, + "loss": 0.7493, + "step": 6336 + }, + { + "epoch": 1.1304076353581305, + "grad_norm": 0.5123772025108337, + "learning_rate": 0.00019907336648346008, + "loss": 0.6594, + "step": 6337 + }, + { + "epoch": 1.1305860315761307, + "grad_norm": 0.5296303629875183, + "learning_rate": 0.00019900477527004507, + "loss": 0.7239, + "step": 6338 + }, + { + "epoch": 1.1307644277941309, + "grad_norm": 0.5358021259307861, + "learning_rate": 0.00019893618806177306, + "loss": 0.9083, + "step": 6339 + }, + { + "epoch": 1.1309428240121309, + "grad_norm": 0.47676989436149597, + "learning_rate": 0.00019886760486403088, + "loss": 0.5314, + "step": 6340 + }, + { + "epoch": 1.131121220230131, + "grad_norm": 0.5259501934051514, + "learning_rate": 0.00019879902568220497, + "loss": 0.7819, + "step": 6341 + }, + { + "epoch": 1.1312996164481313, + "grad_norm": 0.4975660741329193, + "learning_rate": 0.00019873045052168158, + "loss": 0.6725, + "step": 6342 + }, + { + "epoch": 1.1314780126661315, + "grad_norm": 0.5157343149185181, + "learning_rate": 0.00019866187938784657, + "loss": 0.6001, + "step": 6343 + }, + { + "epoch": 1.1316564088841317, + "grad_norm": 0.43325984477996826, + "learning_rate": 0.00019859331228608547, + "loss": 0.6975, + "step": 6344 + }, + { + "epoch": 1.1318348051021319, + "grad_norm": 0.49547433853149414, + "learning_rate": 0.0001985247492217835, + "loss": 0.6571, + "step": 6345 + }, + { + "epoch": 1.132013201320132, + "grad_norm": 0.49157583713531494, + "learning_rate": 0.00019845619020032553, + "loss": 0.7212, + "step": 6346 + }, + { + "epoch": 1.1321915975381323, + "grad_norm": 0.48966532945632935, + "learning_rate": 0.0001983876352270962, + "loss": 0.5914, + "step": 6347 + }, + { + "epoch": 1.1323699937561325, + "grad_norm": 0.5003472566604614, + "learning_rate": 0.0001983190843074797, + "loss": 0.5599, + "step": 6348 + }, + { + "epoch": 1.1325483899741324, + "grad_norm": 0.48509982228279114, + "learning_rate": 0.00019825053744686023, + "loss": 0.6925, + "step": 6349 + }, + { + "epoch": 1.1327267861921326, + "grad_norm": 0.4695090651512146, + "learning_rate": 0.00019818199465062122, + "loss": 0.7031, + "step": 6350 + }, + { + "epoch": 1.1329051824101328, + "grad_norm": 0.494907408952713, + "learning_rate": 0.00019811345592414607, + "loss": 0.6428, + "step": 6351 + }, + { + "epoch": 1.133083578628133, + "grad_norm": 0.5415506958961487, + "learning_rate": 0.00019804492127281772, + "loss": 0.9687, + "step": 6352 + }, + { + "epoch": 1.1332619748461332, + "grad_norm": 0.45336905121803284, + "learning_rate": 0.00019797639070201896, + "loss": 0.6972, + "step": 6353 + }, + { + "epoch": 1.1334403710641334, + "grad_norm": 0.4650866687297821, + "learning_rate": 0.00019790786421713204, + "loss": 0.6297, + "step": 6354 + }, + { + "epoch": 1.1336187672821336, + "grad_norm": 0.4537782669067383, + "learning_rate": 0.00019783934182353904, + "loss": 0.5633, + "step": 6355 + }, + { + "epoch": 1.1337971635001338, + "grad_norm": 0.45536094903945923, + "learning_rate": 0.00019777082352662173, + "loss": 0.6543, + "step": 6356 + }, + { + "epoch": 1.133975559718134, + "grad_norm": 0.4556442201137543, + "learning_rate": 0.00019770230933176147, + "loss": 0.6048, + "step": 6357 + }, + { + "epoch": 1.1341539559361342, + "grad_norm": 0.44016602635383606, + "learning_rate": 0.00019763379924433934, + "loss": 0.6132, + "step": 6358 + }, + { + "epoch": 1.1343323521541344, + "grad_norm": 0.5556267499923706, + "learning_rate": 0.00019756529326973602, + "loss": 0.8597, + "step": 6359 + }, + { + "epoch": 1.1345107483721346, + "grad_norm": 0.5245857834815979, + "learning_rate": 0.00019749679141333205, + "loss": 0.6483, + "step": 6360 + }, + { + "epoch": 1.1346891445901348, + "grad_norm": 0.4881051182746887, + "learning_rate": 0.00019742829368050744, + "loss": 0.6523, + "step": 6361 + }, + { + "epoch": 1.1348675408081348, + "grad_norm": 0.4933343231678009, + "learning_rate": 0.00019735980007664207, + "loss": 0.6629, + "step": 6362 + }, + { + "epoch": 1.135045937026135, + "grad_norm": 0.4620760977268219, + "learning_rate": 0.0001972913106071153, + "loss": 0.6428, + "step": 6363 + }, + { + "epoch": 1.1352243332441352, + "grad_norm": 0.5003635883331299, + "learning_rate": 0.00019722282527730628, + "loss": 0.6564, + "step": 6364 + }, + { + "epoch": 1.1354027294621354, + "grad_norm": 0.41504591703414917, + "learning_rate": 0.0001971543440925939, + "loss": 0.441, + "step": 6365 + }, + { + "epoch": 1.1355811256801356, + "grad_norm": 0.49655354022979736, + "learning_rate": 0.0001970858670583566, + "loss": 0.6645, + "step": 6366 + }, + { + "epoch": 1.1357595218981358, + "grad_norm": 0.4752812087535858, + "learning_rate": 0.00019701739417997256, + "loss": 0.6179, + "step": 6367 + }, + { + "epoch": 1.135937918116136, + "grad_norm": 0.46678200364112854, + "learning_rate": 0.00019694892546281954, + "loss": 0.5577, + "step": 6368 + }, + { + "epoch": 1.1361163143341362, + "grad_norm": 0.5003397464752197, + "learning_rate": 0.0001968804609122751, + "loss": 0.6509, + "step": 6369 + }, + { + "epoch": 1.1362947105521364, + "grad_norm": 0.5790354013442993, + "learning_rate": 0.00019681200053371645, + "loss": 0.6953, + "step": 6370 + }, + { + "epoch": 1.1364731067701364, + "grad_norm": 0.5248673558235168, + "learning_rate": 0.00019674354433252034, + "loss": 0.6928, + "step": 6371 + }, + { + "epoch": 1.1366515029881366, + "grad_norm": 0.5383015871047974, + "learning_rate": 0.00019667509231406332, + "loss": 0.6522, + "step": 6372 + }, + { + "epoch": 1.1368298992061368, + "grad_norm": 0.5394713282585144, + "learning_rate": 0.00019660664448372162, + "loss": 0.6593, + "step": 6373 + }, + { + "epoch": 1.137008295424137, + "grad_norm": 0.5048239827156067, + "learning_rate": 0.00019653820084687107, + "loss": 0.8368, + "step": 6374 + }, + { + "epoch": 1.1371866916421371, + "grad_norm": 0.4014268219470978, + "learning_rate": 0.00019646976140888725, + "loss": 0.5085, + "step": 6375 + }, + { + "epoch": 1.1373650878601373, + "grad_norm": 0.4501616954803467, + "learning_rate": 0.00019640132617514534, + "loss": 0.5781, + "step": 6376 + }, + { + "epoch": 1.1375434840781375, + "grad_norm": 0.5049312114715576, + "learning_rate": 0.00019633289515102017, + "loss": 0.8279, + "step": 6377 + }, + { + "epoch": 1.1377218802961377, + "grad_norm": 0.4441901743412018, + "learning_rate": 0.00019626446834188638, + "loss": 0.58, + "step": 6378 + }, + { + "epoch": 1.137900276514138, + "grad_norm": 0.4739856719970703, + "learning_rate": 0.00019619604575311797, + "loss": 0.755, + "step": 6379 + }, + { + "epoch": 1.1380786727321381, + "grad_norm": 0.4583461880683899, + "learning_rate": 0.0001961276273900891, + "loss": 0.5062, + "step": 6380 + }, + { + "epoch": 1.1382570689501383, + "grad_norm": 0.4415271282196045, + "learning_rate": 0.00019605921325817317, + "loss": 0.5786, + "step": 6381 + }, + { + "epoch": 1.1384354651681385, + "grad_norm": 0.5233315229415894, + "learning_rate": 0.00019599080336274343, + "loss": 0.8495, + "step": 6382 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.5158662796020508, + "learning_rate": 0.00019592239770917276, + "loss": 0.7314, + "step": 6383 + }, + { + "epoch": 1.1387922576041387, + "grad_norm": 0.6792814135551453, + "learning_rate": 0.00019585399630283367, + "loss": 0.7851, + "step": 6384 + }, + { + "epoch": 1.138970653822139, + "grad_norm": 0.44133511185646057, + "learning_rate": 0.00019578559914909844, + "loss": 0.6067, + "step": 6385 + }, + { + "epoch": 1.139149050040139, + "grad_norm": 0.4633074700832367, + "learning_rate": 0.00019571720625333888, + "loss": 0.6206, + "step": 6386 + }, + { + "epoch": 1.1393274462581393, + "grad_norm": 0.7552552223205566, + "learning_rate": 0.00019564881762092662, + "loss": 0.659, + "step": 6387 + }, + { + "epoch": 1.1395058424761395, + "grad_norm": 0.5166911482810974, + "learning_rate": 0.00019558043325723282, + "loss": 0.6211, + "step": 6388 + }, + { + "epoch": 1.1396842386941397, + "grad_norm": 0.5080016851425171, + "learning_rate": 0.00019551205316762838, + "loss": 0.8162, + "step": 6389 + }, + { + "epoch": 1.13986263491214, + "grad_norm": 0.5829038619995117, + "learning_rate": 0.00019544367735748388, + "loss": 0.9879, + "step": 6390 + }, + { + "epoch": 1.14004103113014, + "grad_norm": 0.49968674778938293, + "learning_rate": 0.00019537530583216945, + "loss": 0.6904, + "step": 6391 + }, + { + "epoch": 1.1402194273481403, + "grad_norm": 0.469959557056427, + "learning_rate": 0.00019530693859705497, + "loss": 0.748, + "step": 6392 + }, + { + "epoch": 1.1403978235661403, + "grad_norm": 0.5133938193321228, + "learning_rate": 0.00019523857565751003, + "loss": 0.7729, + "step": 6393 + }, + { + "epoch": 1.1405762197841405, + "grad_norm": 0.543165385723114, + "learning_rate": 0.00019517021701890365, + "loss": 0.7534, + "step": 6394 + }, + { + "epoch": 1.1407546160021407, + "grad_norm": 0.4762142300605774, + "learning_rate": 0.00019510186268660497, + "loss": 0.4908, + "step": 6395 + }, + { + "epoch": 1.1409330122201409, + "grad_norm": 0.5711979866027832, + "learning_rate": 0.00019503351266598234, + "loss": 0.7407, + "step": 6396 + }, + { + "epoch": 1.141111408438141, + "grad_norm": 0.4412236511707306, + "learning_rate": 0.00019496516696240399, + "loss": 0.6312, + "step": 6397 + }, + { + "epoch": 1.1412898046561413, + "grad_norm": 0.5444515347480774, + "learning_rate": 0.00019489682558123772, + "loss": 0.6745, + "step": 6398 + }, + { + "epoch": 1.1414682008741415, + "grad_norm": 0.4231199026107788, + "learning_rate": 0.00019482848852785107, + "loss": 0.4643, + "step": 6399 + }, + { + "epoch": 1.1416465970921417, + "grad_norm": 0.49194931983947754, + "learning_rate": 0.00019476015580761118, + "loss": 0.6983, + "step": 6400 + }, + { + "epoch": 1.1418249933101419, + "grad_norm": 0.5072386860847473, + "learning_rate": 0.0001946918274258849, + "loss": 0.7764, + "step": 6401 + }, + { + "epoch": 1.142003389528142, + "grad_norm": 0.4746543765068054, + "learning_rate": 0.0001946235033880387, + "loss": 0.7924, + "step": 6402 + }, + { + "epoch": 1.1421817857461423, + "grad_norm": 0.4350649416446686, + "learning_rate": 0.00019455518369943873, + "loss": 0.4785, + "step": 6403 + }, + { + "epoch": 1.1423601819641425, + "grad_norm": 0.5181750655174255, + "learning_rate": 0.00019448686836545073, + "loss": 0.8109, + "step": 6404 + }, + { + "epoch": 1.1425385781821427, + "grad_norm": 0.6082465052604675, + "learning_rate": 0.0001944185573914402, + "loss": 0.6327, + "step": 6405 + }, + { + "epoch": 1.1427169744001426, + "grad_norm": 0.5113035440444946, + "learning_rate": 0.00019435025078277227, + "loss": 0.7438, + "step": 6406 + }, + { + "epoch": 1.1428953706181428, + "grad_norm": 0.4369472861289978, + "learning_rate": 0.00019428194854481169, + "loss": 0.5909, + "step": 6407 + }, + { + "epoch": 1.143073766836143, + "grad_norm": 0.4993836283683777, + "learning_rate": 0.00019421365068292287, + "loss": 0.6107, + "step": 6408 + }, + { + "epoch": 1.1432521630541432, + "grad_norm": 0.41479700803756714, + "learning_rate": 0.0001941453572024699, + "loss": 0.496, + "step": 6409 + }, + { + "epoch": 1.1434305592721434, + "grad_norm": 0.5059956312179565, + "learning_rate": 0.00019407706810881657, + "loss": 0.7022, + "step": 6410 + }, + { + "epoch": 1.1436089554901436, + "grad_norm": 0.46626099944114685, + "learning_rate": 0.00019400878340732625, + "loss": 0.5752, + "step": 6411 + }, + { + "epoch": 1.1437873517081438, + "grad_norm": 0.44759035110473633, + "learning_rate": 0.00019394050310336198, + "loss": 0.5391, + "step": 6412 + }, + { + "epoch": 1.143965747926144, + "grad_norm": 0.4467226564884186, + "learning_rate": 0.0001938722272022865, + "loss": 0.4655, + "step": 6413 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 0.49334871768951416, + "learning_rate": 0.0001938039557094621, + "loss": 0.6975, + "step": 6414 + }, + { + "epoch": 1.1443225403621442, + "grad_norm": 0.5575069785118103, + "learning_rate": 0.00019373568863025086, + "loss": 0.7331, + "step": 6415 + }, + { + "epoch": 1.1445009365801444, + "grad_norm": 0.5732327699661255, + "learning_rate": 0.00019366742597001446, + "loss": 0.9117, + "step": 6416 + }, + { + "epoch": 1.1446793327981446, + "grad_norm": 0.5360227227210999, + "learning_rate": 0.00019359916773411414, + "loss": 0.8448, + "step": 6417 + }, + { + "epoch": 1.1448577290161448, + "grad_norm": 0.4648105204105377, + "learning_rate": 0.00019353091392791094, + "loss": 0.5582, + "step": 6418 + }, + { + "epoch": 1.145036125234145, + "grad_norm": 0.5667071342468262, + "learning_rate": 0.00019346266455676542, + "loss": 0.7995, + "step": 6419 + }, + { + "epoch": 1.1452145214521452, + "grad_norm": 0.4563814699649811, + "learning_rate": 0.00019339441962603794, + "loss": 0.7236, + "step": 6420 + }, + { + "epoch": 1.1453929176701454, + "grad_norm": 0.4414633512496948, + "learning_rate": 0.00019332617914108834, + "loss": 0.5218, + "step": 6421 + }, + { + "epoch": 1.1455713138881456, + "grad_norm": 0.46729329228401184, + "learning_rate": 0.00019325794310727626, + "loss": 0.7908, + "step": 6422 + }, + { + "epoch": 1.1457497101061458, + "grad_norm": 0.4846903383731842, + "learning_rate": 0.0001931897115299609, + "loss": 0.6743, + "step": 6423 + }, + { + "epoch": 1.145928106324146, + "grad_norm": 0.4348449409008026, + "learning_rate": 0.00019312148441450122, + "loss": 0.5317, + "step": 6424 + }, + { + "epoch": 1.1461065025421462, + "grad_norm": 0.4379520118236542, + "learning_rate": 0.0001930532617662555, + "loss": 0.6015, + "step": 6425 + }, + { + "epoch": 1.1462848987601464, + "grad_norm": 0.42976364493370056, + "learning_rate": 0.00019298504359058222, + "loss": 0.4379, + "step": 6426 + }, + { + "epoch": 1.1464632949781466, + "grad_norm": 0.48329517245292664, + "learning_rate": 0.00019291682989283908, + "loss": 0.6796, + "step": 6427 + }, + { + "epoch": 1.1466416911961466, + "grad_norm": 0.4555102288722992, + "learning_rate": 0.00019284862067838352, + "loss": 0.5488, + "step": 6428 + }, + { + "epoch": 1.1468200874141468, + "grad_norm": 0.49463093280792236, + "learning_rate": 0.0001927804159525728, + "loss": 0.7097, + "step": 6429 + }, + { + "epoch": 1.146998483632147, + "grad_norm": 0.5678471922874451, + "learning_rate": 0.0001927122157207635, + "loss": 0.9311, + "step": 6430 + }, + { + "epoch": 1.1471768798501472, + "grad_norm": 0.5055651068687439, + "learning_rate": 0.00019264401998831212, + "loss": 0.737, + "step": 6431 + }, + { + "epoch": 1.1473552760681474, + "grad_norm": 0.5009707808494568, + "learning_rate": 0.00019257582876057474, + "loss": 0.6349, + "step": 6432 + }, + { + "epoch": 1.1475336722861476, + "grad_norm": 0.4183083176612854, + "learning_rate": 0.00019250764204290709, + "loss": 0.5802, + "step": 6433 + }, + { + "epoch": 1.1477120685041478, + "grad_norm": 0.44495829939842224, + "learning_rate": 0.00019243945984066444, + "loss": 0.5597, + "step": 6434 + }, + { + "epoch": 1.147890464722148, + "grad_norm": 0.5188366770744324, + "learning_rate": 0.00019237128215920187, + "loss": 0.7409, + "step": 6435 + }, + { + "epoch": 1.1480688609401482, + "grad_norm": 0.49368321895599365, + "learning_rate": 0.000192303109003874, + "loss": 0.6452, + "step": 6436 + }, + { + "epoch": 1.1482472571581481, + "grad_norm": 0.41561809182167053, + "learning_rate": 0.00019223494038003516, + "loss": 0.6164, + "step": 6437 + }, + { + "epoch": 1.1484256533761483, + "grad_norm": 0.49593281745910645, + "learning_rate": 0.00019216677629303923, + "loss": 0.7516, + "step": 6438 + }, + { + "epoch": 1.1486040495941485, + "grad_norm": 0.5000485181808472, + "learning_rate": 0.00019209861674823975, + "loss": 0.6881, + "step": 6439 + }, + { + "epoch": 1.1487824458121487, + "grad_norm": 0.41080528497695923, + "learning_rate": 0.00019203046175098992, + "loss": 0.5175, + "step": 6440 + }, + { + "epoch": 1.148960842030149, + "grad_norm": 0.46818774938583374, + "learning_rate": 0.00019196231130664282, + "loss": 0.7355, + "step": 6441 + }, + { + "epoch": 1.1491392382481491, + "grad_norm": 0.4806887209415436, + "learning_rate": 0.00019189416542055078, + "loss": 0.6491, + "step": 6442 + }, + { + "epoch": 1.1493176344661493, + "grad_norm": 0.46855100989341736, + "learning_rate": 0.00019182602409806597, + "loss": 0.7616, + "step": 6443 + }, + { + "epoch": 1.1494960306841495, + "grad_norm": 0.49849364161491394, + "learning_rate": 0.00019175788734454019, + "loss": 0.722, + "step": 6444 + }, + { + "epoch": 1.1496744269021497, + "grad_norm": 0.48306459188461304, + "learning_rate": 0.0001916897551653249, + "loss": 0.6938, + "step": 6445 + }, + { + "epoch": 1.14985282312015, + "grad_norm": 0.4553467035293579, + "learning_rate": 0.0001916216275657711, + "loss": 0.5726, + "step": 6446 + }, + { + "epoch": 1.1500312193381501, + "grad_norm": 0.4541803300380707, + "learning_rate": 0.0001915535045512296, + "loss": 0.5465, + "step": 6447 + }, + { + "epoch": 1.1502096155561503, + "grad_norm": 0.5156320333480835, + "learning_rate": 0.00019148538612705066, + "loss": 0.6868, + "step": 6448 + }, + { + "epoch": 1.1503880117741505, + "grad_norm": 0.4388979375362396, + "learning_rate": 0.00019141727229858433, + "loss": 0.5943, + "step": 6449 + }, + { + "epoch": 1.1505664079921505, + "grad_norm": 0.4948444366455078, + "learning_rate": 0.00019134916307118028, + "loss": 0.6421, + "step": 6450 + }, + { + "epoch": 1.1507448042101507, + "grad_norm": 0.46321436762809753, + "learning_rate": 0.00019128105845018766, + "loss": 0.5047, + "step": 6451 + }, + { + "epoch": 1.1509232004281509, + "grad_norm": 0.46855947375297546, + "learning_rate": 0.00019121295844095544, + "loss": 0.6025, + "step": 6452 + }, + { + "epoch": 1.151101596646151, + "grad_norm": 0.45880499482154846, + "learning_rate": 0.00019114486304883216, + "loss": 0.7395, + "step": 6453 + }, + { + "epoch": 1.1512799928641513, + "grad_norm": 0.4672548472881317, + "learning_rate": 0.00019107677227916603, + "loss": 0.7361, + "step": 6454 + }, + { + "epoch": 1.1514583890821515, + "grad_norm": 0.4436180591583252, + "learning_rate": 0.0001910086861373048, + "loss": 0.5226, + "step": 6455 + }, + { + "epoch": 1.1516367853001517, + "grad_norm": 0.47873300313949585, + "learning_rate": 0.000190940604628596, + "loss": 0.7932, + "step": 6456 + }, + { + "epoch": 1.1518151815181519, + "grad_norm": 0.5310173034667969, + "learning_rate": 0.00019087252775838671, + "loss": 0.8148, + "step": 6457 + }, + { + "epoch": 1.151993577736152, + "grad_norm": 0.4951856732368469, + "learning_rate": 0.00019080445553202363, + "loss": 0.7325, + "step": 6458 + }, + { + "epoch": 1.152171973954152, + "grad_norm": 0.5043248534202576, + "learning_rate": 0.0001907363879548532, + "loss": 0.6899, + "step": 6459 + }, + { + "epoch": 1.1523503701721522, + "grad_norm": 0.5174239873886108, + "learning_rate": 0.00019066832503222128, + "loss": 0.6925, + "step": 6460 + }, + { + "epoch": 1.1525287663901524, + "grad_norm": 0.47568047046661377, + "learning_rate": 0.00019060026676947362, + "loss": 0.7057, + "step": 6461 + }, + { + "epoch": 1.1527071626081526, + "grad_norm": 0.4853077828884125, + "learning_rate": 0.0001905322131719555, + "loss": 0.7131, + "step": 6462 + }, + { + "epoch": 1.1528855588261528, + "grad_norm": 0.4645387828350067, + "learning_rate": 0.0001904641642450117, + "loss": 0.6627, + "step": 6463 + }, + { + "epoch": 1.153063955044153, + "grad_norm": 0.4763006269931793, + "learning_rate": 0.00019039611999398682, + "loss": 0.6816, + "step": 6464 + }, + { + "epoch": 1.1532423512621532, + "grad_norm": 0.5304715633392334, + "learning_rate": 0.00019032808042422503, + "loss": 0.7631, + "step": 6465 + }, + { + "epoch": 1.1534207474801534, + "grad_norm": 0.4912330210208893, + "learning_rate": 0.0001902600455410701, + "loss": 0.6471, + "step": 6466 + }, + { + "epoch": 1.1535991436981536, + "grad_norm": 0.4972264766693115, + "learning_rate": 0.00019019201534986553, + "loss": 0.7728, + "step": 6467 + }, + { + "epoch": 1.1537775399161538, + "grad_norm": 2.746565341949463, + "learning_rate": 0.0001901239898559543, + "loss": 0.6513, + "step": 6468 + }, + { + "epoch": 1.153955936134154, + "grad_norm": 0.5050514340400696, + "learning_rate": 0.00019005596906467918, + "loss": 0.8309, + "step": 6469 + }, + { + "epoch": 1.1541343323521542, + "grad_norm": 0.5243608951568604, + "learning_rate": 0.0001899879529813825, + "loss": 0.8084, + "step": 6470 + }, + { + "epoch": 1.1543127285701544, + "grad_norm": 0.4700818955898285, + "learning_rate": 0.00018991994161140596, + "loss": 0.5866, + "step": 6471 + }, + { + "epoch": 1.1544911247881544, + "grad_norm": 0.46044886112213135, + "learning_rate": 0.00018985193496009152, + "loss": 0.6026, + "step": 6472 + }, + { + "epoch": 1.1546695210061546, + "grad_norm": 0.47794002294540405, + "learning_rate": 0.0001897839330327802, + "loss": 0.593, + "step": 6473 + }, + { + "epoch": 1.1548479172241548, + "grad_norm": 0.49822142720222473, + "learning_rate": 0.00018971593583481282, + "loss": 0.8087, + "step": 6474 + }, + { + "epoch": 1.155026313442155, + "grad_norm": 0.49274197220802307, + "learning_rate": 0.00018964794337153002, + "loss": 0.6831, + "step": 6475 + }, + { + "epoch": 1.1552047096601552, + "grad_norm": 0.49197664856910706, + "learning_rate": 0.0001895799556482717, + "loss": 0.5862, + "step": 6476 + }, + { + "epoch": 1.1553831058781554, + "grad_norm": 0.49241068959236145, + "learning_rate": 0.00018951197267037765, + "loss": 0.6444, + "step": 6477 + }, + { + "epoch": 1.1555615020961556, + "grad_norm": 0.5059458017349243, + "learning_rate": 0.00018944399444318724, + "loss": 0.7429, + "step": 6478 + }, + { + "epoch": 1.1557398983141558, + "grad_norm": 0.4923364222049713, + "learning_rate": 0.00018937602097203942, + "loss": 0.6462, + "step": 6479 + }, + { + "epoch": 1.155918294532156, + "grad_norm": 0.5192193388938904, + "learning_rate": 0.0001893080522622729, + "loss": 0.7404, + "step": 6480 + }, + { + "epoch": 1.156096690750156, + "grad_norm": 0.5662673711776733, + "learning_rate": 0.0001892400883192258, + "loss": 0.7242, + "step": 6481 + }, + { + "epoch": 1.1562750869681562, + "grad_norm": 0.4407677948474884, + "learning_rate": 0.00018917212914823598, + "loss": 0.5281, + "step": 6482 + }, + { + "epoch": 1.1564534831861564, + "grad_norm": 0.4470885694026947, + "learning_rate": 0.00018910417475464104, + "loss": 0.6045, + "step": 6483 + }, + { + "epoch": 1.1566318794041566, + "grad_norm": 0.477516233921051, + "learning_rate": 0.00018903622514377798, + "loss": 0.6069, + "step": 6484 + }, + { + "epoch": 1.1568102756221568, + "grad_norm": 0.5461446046829224, + "learning_rate": 0.00018896828032098352, + "loss": 0.7073, + "step": 6485 + }, + { + "epoch": 1.156988671840157, + "grad_norm": 0.5411002039909363, + "learning_rate": 0.00018890034029159399, + "loss": 0.8649, + "step": 6486 + }, + { + "epoch": 1.1571670680581572, + "grad_norm": 0.5572760701179504, + "learning_rate": 0.0001888324050609455, + "loss": 0.9441, + "step": 6487 + }, + { + "epoch": 1.1573454642761574, + "grad_norm": 0.44463208317756653, + "learning_rate": 0.00018876447463437367, + "loss": 0.5227, + "step": 6488 + }, + { + "epoch": 1.1575238604941576, + "grad_norm": 0.47355902194976807, + "learning_rate": 0.0001886965490172136, + "loss": 0.4937, + "step": 6489 + }, + { + "epoch": 1.1577022567121578, + "grad_norm": 0.44894346594810486, + "learning_rate": 0.00018862862821480023, + "loss": 0.5107, + "step": 6490 + }, + { + "epoch": 1.157880652930158, + "grad_norm": 0.5426954627037048, + "learning_rate": 0.00018856071223246796, + "loss": 0.6947, + "step": 6491 + }, + { + "epoch": 1.1580590491481582, + "grad_norm": 0.44600164890289307, + "learning_rate": 0.0001884928010755509, + "loss": 0.6412, + "step": 6492 + }, + { + "epoch": 1.1582374453661584, + "grad_norm": 0.4314921498298645, + "learning_rate": 0.0001884248947493828, + "loss": 0.5661, + "step": 6493 + }, + { + "epoch": 1.1584158415841583, + "grad_norm": 0.44217392802238464, + "learning_rate": 0.00018835699325929692, + "loss": 0.5272, + "step": 6494 + }, + { + "epoch": 1.1585942378021585, + "grad_norm": 0.42524126172065735, + "learning_rate": 0.0001882890966106263, + "loss": 0.5549, + "step": 6495 + }, + { + "epoch": 1.1587726340201587, + "grad_norm": 0.4192121922969818, + "learning_rate": 0.00018822120480870352, + "loss": 0.4929, + "step": 6496 + }, + { + "epoch": 1.158951030238159, + "grad_norm": 0.4756525456905365, + "learning_rate": 0.00018815331785886066, + "loss": 0.6762, + "step": 6497 + }, + { + "epoch": 1.1591294264561591, + "grad_norm": 0.5104422569274902, + "learning_rate": 0.00018808543576642964, + "loss": 0.7025, + "step": 6498 + }, + { + "epoch": 1.1593078226741593, + "grad_norm": 0.46176689863204956, + "learning_rate": 0.00018801755853674183, + "loss": 0.5673, + "step": 6499 + }, + { + "epoch": 1.1594862188921595, + "grad_norm": 0.49885469675064087, + "learning_rate": 0.00018794968617512827, + "loss": 0.6967, + "step": 6500 + }, + { + "epoch": 1.1596646151101597, + "grad_norm": 0.4945884943008423, + "learning_rate": 0.00018788181868691965, + "loss": 0.7192, + "step": 6501 + }, + { + "epoch": 1.15984301132816, + "grad_norm": 0.4873616099357605, + "learning_rate": 0.00018781395607744627, + "loss": 0.6867, + "step": 6502 + }, + { + "epoch": 1.16002140754616, + "grad_norm": 0.393449068069458, + "learning_rate": 0.00018774609835203808, + "loss": 0.4874, + "step": 6503 + }, + { + "epoch": 1.16019980376416, + "grad_norm": 0.42069491744041443, + "learning_rate": 0.00018767824551602446, + "loss": 0.493, + "step": 6504 + }, + { + "epoch": 1.1603781999821603, + "grad_norm": 0.44530075788497925, + "learning_rate": 0.00018761039757473466, + "loss": 0.4976, + "step": 6505 + }, + { + "epoch": 1.1605565962001605, + "grad_norm": 0.4841354191303253, + "learning_rate": 0.0001875425545334974, + "loss": 0.7774, + "step": 6506 + }, + { + "epoch": 1.1607349924181607, + "grad_norm": 0.44460541009902954, + "learning_rate": 0.00018747471639764103, + "loss": 0.6535, + "step": 6507 + }, + { + "epoch": 1.160913388636161, + "grad_norm": 0.5261197686195374, + "learning_rate": 0.00018740688317249349, + "loss": 0.7753, + "step": 6508 + }, + { + "epoch": 1.161091784854161, + "grad_norm": 0.4835450351238251, + "learning_rate": 0.0001873390548633825, + "loss": 0.9085, + "step": 6509 + }, + { + "epoch": 1.1612701810721613, + "grad_norm": 0.4938504993915558, + "learning_rate": 0.00018727123147563508, + "loss": 0.7482, + "step": 6510 + }, + { + "epoch": 1.1614485772901615, + "grad_norm": 0.4450148642063141, + "learning_rate": 0.00018720341301457815, + "loss": 0.5129, + "step": 6511 + }, + { + "epoch": 1.1616269735081617, + "grad_norm": 0.49409425258636475, + "learning_rate": 0.00018713559948553815, + "loss": 0.6571, + "step": 6512 + }, + { + "epoch": 1.1618053697261619, + "grad_norm": 0.42532381415367126, + "learning_rate": 0.00018706779089384113, + "loss": 0.508, + "step": 6513 + }, + { + "epoch": 1.161983765944162, + "grad_norm": 0.5003641843795776, + "learning_rate": 0.0001869999872448127, + "loss": 0.6938, + "step": 6514 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.4792661666870117, + "learning_rate": 0.0001869321885437782, + "loss": 0.6612, + "step": 6515 + }, + { + "epoch": 1.1623405583801623, + "grad_norm": 0.44840648770332336, + "learning_rate": 0.00018686439479606245, + "loss": 0.609, + "step": 6516 + }, + { + "epoch": 1.1625189545981625, + "grad_norm": 0.48601728677749634, + "learning_rate": 0.00018679660600698996, + "loss": 0.747, + "step": 6517 + }, + { + "epoch": 1.1626973508161627, + "grad_norm": 0.5069820284843445, + "learning_rate": 0.0001867288221818848, + "loss": 0.5674, + "step": 6518 + }, + { + "epoch": 1.1628757470341629, + "grad_norm": 0.4526546001434326, + "learning_rate": 0.00018666104332607075, + "loss": 0.5845, + "step": 6519 + }, + { + "epoch": 1.163054143252163, + "grad_norm": 0.48952043056488037, + "learning_rate": 0.00018659326944487115, + "loss": 0.7082, + "step": 6520 + }, + { + "epoch": 1.1632325394701633, + "grad_norm": 0.5443047881126404, + "learning_rate": 0.0001865255005436089, + "loss": 0.718, + "step": 6521 + }, + { + "epoch": 1.1634109356881635, + "grad_norm": 0.5090129375457764, + "learning_rate": 0.00018645773662760647, + "loss": 0.9893, + "step": 6522 + }, + { + "epoch": 1.1635893319061636, + "grad_norm": 0.5263794660568237, + "learning_rate": 0.00018638997770218602, + "loss": 0.6613, + "step": 6523 + }, + { + "epoch": 1.1637677281241638, + "grad_norm": 0.6042819619178772, + "learning_rate": 0.0001863222237726694, + "loss": 0.7601, + "step": 6524 + }, + { + "epoch": 1.1639461243421638, + "grad_norm": 0.5052105784416199, + "learning_rate": 0.0001862544748443779, + "loss": 0.7175, + "step": 6525 + }, + { + "epoch": 1.164124520560164, + "grad_norm": 0.4983629882335663, + "learning_rate": 0.00018618673092263253, + "loss": 0.682, + "step": 6526 + }, + { + "epoch": 1.1643029167781642, + "grad_norm": 0.4915952682495117, + "learning_rate": 0.00018611899201275385, + "loss": 0.6723, + "step": 6527 + }, + { + "epoch": 1.1644813129961644, + "grad_norm": 0.5149100422859192, + "learning_rate": 0.000186051258120062, + "loss": 0.6966, + "step": 6528 + }, + { + "epoch": 1.1646597092141646, + "grad_norm": 0.5236539244651794, + "learning_rate": 0.00018598352924987689, + "loss": 0.8879, + "step": 6529 + }, + { + "epoch": 1.1648381054321648, + "grad_norm": 0.4917599856853485, + "learning_rate": 0.00018591580540751784, + "loss": 0.7433, + "step": 6530 + }, + { + "epoch": 1.165016501650165, + "grad_norm": 0.47887739539146423, + "learning_rate": 0.00018584808659830385, + "loss": 0.7286, + "step": 6531 + }, + { + "epoch": 1.1651948978681652, + "grad_norm": 0.4636463224887848, + "learning_rate": 0.0001857803728275534, + "loss": 0.6089, + "step": 6532 + }, + { + "epoch": 1.1653732940861654, + "grad_norm": 0.5316303372383118, + "learning_rate": 0.00018571266410058492, + "loss": 0.6789, + "step": 6533 + }, + { + "epoch": 1.1655516903041656, + "grad_norm": 0.49224573373794556, + "learning_rate": 0.00018564496042271624, + "loss": 0.788, + "step": 6534 + }, + { + "epoch": 1.1657300865221658, + "grad_norm": 0.5619474053382874, + "learning_rate": 0.0001855772617992646, + "loss": 0.7761, + "step": 6535 + }, + { + "epoch": 1.165908482740166, + "grad_norm": 0.49133652448654175, + "learning_rate": 0.00018550956823554706, + "loss": 0.6949, + "step": 6536 + }, + { + "epoch": 1.1660868789581662, + "grad_norm": 0.4631252884864807, + "learning_rate": 0.00018544187973688032, + "loss": 0.5078, + "step": 6537 + }, + { + "epoch": 1.1662652751761662, + "grad_norm": 0.4481963515281677, + "learning_rate": 0.00018537419630858053, + "loss": 0.6133, + "step": 6538 + }, + { + "epoch": 1.1664436713941664, + "grad_norm": 0.49360591173171997, + "learning_rate": 0.0001853065179559636, + "loss": 0.6333, + "step": 6539 + }, + { + "epoch": 1.1666220676121666, + "grad_norm": 0.5313968062400818, + "learning_rate": 0.00018523884468434488, + "loss": 0.8331, + "step": 6540 + }, + { + "epoch": 1.1668004638301668, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.00018517117649903943, + "loss": 0.7155, + "step": 6541 + }, + { + "epoch": 1.166978860048167, + "grad_norm": 0.4583684504032135, + "learning_rate": 0.00018510351340536192, + "loss": 0.6011, + "step": 6542 + }, + { + "epoch": 1.1671572562661672, + "grad_norm": 0.4290786385536194, + "learning_rate": 0.0001850358554086265, + "loss": 0.5471, + "step": 6543 + }, + { + "epoch": 1.1673356524841674, + "grad_norm": 0.4612555205821991, + "learning_rate": 0.00018496820251414703, + "loss": 0.6882, + "step": 6544 + }, + { + "epoch": 1.1675140487021676, + "grad_norm": 0.3945925831794739, + "learning_rate": 0.00018490055472723696, + "loss": 0.51, + "step": 6545 + }, + { + "epoch": 1.1676924449201678, + "grad_norm": 0.43021753430366516, + "learning_rate": 0.0001848329120532093, + "loss": 0.5537, + "step": 6546 + }, + { + "epoch": 1.1678708411381677, + "grad_norm": 0.46711066365242004, + "learning_rate": 0.00018476527449737666, + "loss": 0.7032, + "step": 6547 + }, + { + "epoch": 1.168049237356168, + "grad_norm": 0.4292619228363037, + "learning_rate": 0.0001846976420650513, + "loss": 0.569, + "step": 6548 + }, + { + "epoch": 1.1682276335741681, + "grad_norm": 0.45965656638145447, + "learning_rate": 0.00018463001476154507, + "loss": 0.7034, + "step": 6549 + }, + { + "epoch": 1.1684060297921683, + "grad_norm": 0.49549368023872375, + "learning_rate": 0.00018456239259216934, + "loss": 0.6292, + "step": 6550 + }, + { + "epoch": 1.1685844260101685, + "grad_norm": 0.5122089385986328, + "learning_rate": 0.0001844947755622351, + "loss": 0.7911, + "step": 6551 + }, + { + "epoch": 1.1687628222281687, + "grad_norm": 1.3904839754104614, + "learning_rate": 0.00018442716367705303, + "loss": 0.6504, + "step": 6552 + }, + { + "epoch": 1.168941218446169, + "grad_norm": 0.5320877432823181, + "learning_rate": 0.0001843595569419333, + "loss": 0.6984, + "step": 6553 + }, + { + "epoch": 1.1691196146641691, + "grad_norm": 0.44560760259628296, + "learning_rate": 0.0001842919553621857, + "loss": 0.5054, + "step": 6554 + }, + { + "epoch": 1.1692980108821693, + "grad_norm": 0.4876161217689514, + "learning_rate": 0.00018422435894311972, + "loss": 0.6423, + "step": 6555 + }, + { + "epoch": 1.1694764071001695, + "grad_norm": 0.4854297339916229, + "learning_rate": 0.00018415676769004426, + "loss": 0.6986, + "step": 6556 + }, + { + "epoch": 1.1696548033181697, + "grad_norm": 0.4554065465927124, + "learning_rate": 0.00018408918160826789, + "loss": 0.5424, + "step": 6557 + }, + { + "epoch": 1.16983319953617, + "grad_norm": 0.5047731399536133, + "learning_rate": 0.00018402160070309884, + "loss": 0.7302, + "step": 6558 + }, + { + "epoch": 1.1700115957541701, + "grad_norm": 0.47815465927124023, + "learning_rate": 0.0001839540249798449, + "loss": 0.7022, + "step": 6559 + }, + { + "epoch": 1.17018999197217, + "grad_norm": 0.6329243183135986, + "learning_rate": 0.0001838864544438134, + "loss": 0.5491, + "step": 6560 + }, + { + "epoch": 1.1703683881901703, + "grad_norm": 0.4645484685897827, + "learning_rate": 0.0001838188891003113, + "loss": 0.6384, + "step": 6561 + }, + { + "epoch": 1.1705467844081705, + "grad_norm": 0.49263012409210205, + "learning_rate": 0.0001837513289546452, + "loss": 0.7574, + "step": 6562 + }, + { + "epoch": 1.1707251806261707, + "grad_norm": 0.44261929392814636, + "learning_rate": 0.00018368377401212116, + "loss": 0.5943, + "step": 6563 + }, + { + "epoch": 1.170903576844171, + "grad_norm": 0.4851824641227722, + "learning_rate": 0.000183616224278045, + "loss": 0.5772, + "step": 6564 + }, + { + "epoch": 1.171081973062171, + "grad_norm": 0.4466243088245392, + "learning_rate": 0.00018354867975772205, + "loss": 0.5803, + "step": 6565 + }, + { + "epoch": 1.1712603692801713, + "grad_norm": 0.5128055810928345, + "learning_rate": 0.00018348114045645713, + "loss": 0.7322, + "step": 6566 + }, + { + "epoch": 1.1714387654981715, + "grad_norm": 0.5170561075210571, + "learning_rate": 0.00018341360637955489, + "loss": 0.7447, + "step": 6567 + }, + { + "epoch": 1.1716171617161717, + "grad_norm": 0.4576283097267151, + "learning_rate": 0.00018334607753231935, + "loss": 0.6132, + "step": 6568 + }, + { + "epoch": 1.1717955579341717, + "grad_norm": 0.44425490498542786, + "learning_rate": 0.00018327855392005418, + "loss": 0.5558, + "step": 6569 + }, + { + "epoch": 1.1719739541521719, + "grad_norm": 0.5051802396774292, + "learning_rate": 0.00018321103554806267, + "loss": 0.6139, + "step": 6570 + }, + { + "epoch": 1.172152350370172, + "grad_norm": 0.4622550904750824, + "learning_rate": 0.00018314352242164767, + "loss": 0.6846, + "step": 6571 + }, + { + "epoch": 1.1723307465881723, + "grad_norm": 0.47944626212120056, + "learning_rate": 0.00018307601454611166, + "loss": 0.6305, + "step": 6572 + }, + { + "epoch": 1.1725091428061725, + "grad_norm": 0.4514850080013275, + "learning_rate": 0.00018300851192675665, + "loss": 0.6158, + "step": 6573 + }, + { + "epoch": 1.1726875390241727, + "grad_norm": 0.4573253095149994, + "learning_rate": 0.00018294101456888432, + "loss": 0.6954, + "step": 6574 + }, + { + "epoch": 1.1728659352421729, + "grad_norm": 0.428072065114975, + "learning_rate": 0.00018287352247779582, + "loss": 0.4599, + "step": 6575 + }, + { + "epoch": 1.173044331460173, + "grad_norm": 0.4548456072807312, + "learning_rate": 0.00018280603565879207, + "loss": 0.6041, + "step": 6576 + }, + { + "epoch": 1.1732227276781733, + "grad_norm": 0.4897709786891937, + "learning_rate": 0.00018273855411717323, + "loss": 0.6279, + "step": 6577 + }, + { + "epoch": 1.1734011238961735, + "grad_norm": 0.5311153531074524, + "learning_rate": 0.00018267107785823936, + "loss": 0.645, + "step": 6578 + }, + { + "epoch": 1.1735795201141737, + "grad_norm": 0.48785173892974854, + "learning_rate": 0.00018260360688729016, + "loss": 0.5661, + "step": 6579 + }, + { + "epoch": 1.1737579163321739, + "grad_norm": 0.5114821791648865, + "learning_rate": 0.00018253614120962463, + "loss": 0.7209, + "step": 6580 + }, + { + "epoch": 1.173936312550174, + "grad_norm": 0.5291317105293274, + "learning_rate": 0.00018246868083054166, + "loss": 0.8096, + "step": 6581 + }, + { + "epoch": 1.174114708768174, + "grad_norm": 0.5284007787704468, + "learning_rate": 0.00018240122575533934, + "loss": 0.7968, + "step": 6582 + }, + { + "epoch": 1.1742931049861742, + "grad_norm": 0.49136999249458313, + "learning_rate": 0.00018233377598931566, + "loss": 0.6637, + "step": 6583 + }, + { + "epoch": 1.1744715012041744, + "grad_norm": 0.4023672342300415, + "learning_rate": 0.00018226633153776812, + "loss": 0.4708, + "step": 6584 + }, + { + "epoch": 1.1746498974221746, + "grad_norm": 0.39422962069511414, + "learning_rate": 0.00018219889240599375, + "loss": 0.5222, + "step": 6585 + }, + { + "epoch": 1.1748282936401748, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00018213145859928914, + "loss": 0.6543, + "step": 6586 + }, + { + "epoch": 1.175006689858175, + "grad_norm": 0.4648468792438507, + "learning_rate": 0.00018206403012295064, + "loss": 0.6523, + "step": 6587 + }, + { + "epoch": 1.1751850860761752, + "grad_norm": 0.5008534789085388, + "learning_rate": 0.00018199660698227393, + "loss": 0.7357, + "step": 6588 + }, + { + "epoch": 1.1753634822941754, + "grad_norm": 0.5412165522575378, + "learning_rate": 0.00018192918918255452, + "loss": 0.7517, + "step": 6589 + }, + { + "epoch": 1.1755418785121756, + "grad_norm": 0.4949028193950653, + "learning_rate": 0.0001818617767290872, + "loss": 0.636, + "step": 6590 + }, + { + "epoch": 1.1757202747301756, + "grad_norm": 0.49255791306495667, + "learning_rate": 0.00018179436962716667, + "loss": 0.6114, + "step": 6591 + }, + { + "epoch": 1.1758986709481758, + "grad_norm": 0.5153682827949524, + "learning_rate": 0.00018172696788208698, + "loss": 0.8209, + "step": 6592 + }, + { + "epoch": 1.176077067166176, + "grad_norm": 0.4873290956020355, + "learning_rate": 0.0001816595714991418, + "loss": 0.6777, + "step": 6593 + }, + { + "epoch": 1.1762554633841762, + "grad_norm": 0.4800300598144531, + "learning_rate": 0.00018159218048362452, + "loss": 0.659, + "step": 6594 + }, + { + "epoch": 1.1764338596021764, + "grad_norm": 0.511806070804596, + "learning_rate": 0.00018152479484082797, + "loss": 0.7344, + "step": 6595 + }, + { + "epoch": 1.1766122558201766, + "grad_norm": 0.5367228388786316, + "learning_rate": 0.0001814574145760445, + "loss": 0.9972, + "step": 6596 + }, + { + "epoch": 1.1767906520381768, + "grad_norm": 0.5340408682823181, + "learning_rate": 0.00018139003969456623, + "loss": 0.8547, + "step": 6597 + }, + { + "epoch": 1.176969048256177, + "grad_norm": 0.44577550888061523, + "learning_rate": 0.00018132267020168471, + "loss": 0.5981, + "step": 6598 + }, + { + "epoch": 1.1771474444741772, + "grad_norm": 0.5312680006027222, + "learning_rate": 0.00018125530610269114, + "loss": 0.7921, + "step": 6599 + }, + { + "epoch": 1.1773258406921774, + "grad_norm": 0.4474240839481354, + "learning_rate": 0.00018118794740287625, + "loss": 0.532, + "step": 6600 + }, + { + "epoch": 1.1775042369101776, + "grad_norm": 0.455321341753006, + "learning_rate": 0.00018112059410753034, + "loss": 0.5826, + "step": 6601 + }, + { + "epoch": 1.1776826331281778, + "grad_norm": 0.5083394646644592, + "learning_rate": 0.00018105324622194336, + "loss": 0.7073, + "step": 6602 + }, + { + "epoch": 1.177861029346178, + "grad_norm": 0.5182134509086609, + "learning_rate": 0.00018098590375140473, + "loss": 0.7853, + "step": 6603 + }, + { + "epoch": 1.178039425564178, + "grad_norm": 0.5327116250991821, + "learning_rate": 0.00018091856670120348, + "loss": 0.8714, + "step": 6604 + }, + { + "epoch": 1.1782178217821782, + "grad_norm": 0.4751187562942505, + "learning_rate": 0.00018085123507662832, + "loss": 0.6433, + "step": 6605 + }, + { + "epoch": 1.1783962180001784, + "grad_norm": 0.4442801773548126, + "learning_rate": 0.0001807839088829674, + "loss": 0.539, + "step": 6606 + }, + { + "epoch": 1.1785746142181786, + "grad_norm": 0.5099819302558899, + "learning_rate": 0.00018071658812550845, + "loss": 0.741, + "step": 6607 + }, + { + "epoch": 1.1787530104361787, + "grad_norm": 0.4700085520744324, + "learning_rate": 0.00018064927280953891, + "loss": 0.7668, + "step": 6608 + }, + { + "epoch": 1.178931406654179, + "grad_norm": 0.5318174362182617, + "learning_rate": 0.00018058196294034554, + "loss": 0.8966, + "step": 6609 + }, + { + "epoch": 1.1791098028721791, + "grad_norm": 0.46459850668907166, + "learning_rate": 0.000180514658523215, + "loss": 0.6678, + "step": 6610 + }, + { + "epoch": 1.1792881990901793, + "grad_norm": 0.47724831104278564, + "learning_rate": 0.00018044735956343328, + "loss": 0.6598, + "step": 6611 + }, + { + "epoch": 1.1794665953081795, + "grad_norm": 0.46925148367881775, + "learning_rate": 0.000180380066066286, + "loss": 0.5628, + "step": 6612 + }, + { + "epoch": 1.1796449915261795, + "grad_norm": 0.4924178421497345, + "learning_rate": 0.00018031277803705835, + "loss": 0.703, + "step": 6613 + }, + { + "epoch": 1.1798233877441797, + "grad_norm": 0.460475891828537, + "learning_rate": 0.00018024549548103518, + "loss": 0.5927, + "step": 6614 + }, + { + "epoch": 1.18000178396218, + "grad_norm": 0.5572229027748108, + "learning_rate": 0.0001801782184035008, + "loss": 0.8473, + "step": 6615 + }, + { + "epoch": 1.1801801801801801, + "grad_norm": 0.5910245180130005, + "learning_rate": 0.00018011094680973902, + "loss": 0.7021, + "step": 6616 + }, + { + "epoch": 1.1803585763981803, + "grad_norm": 0.5176860094070435, + "learning_rate": 0.00018004368070503342, + "loss": 0.7578, + "step": 6617 + }, + { + "epoch": 1.1805369726161805, + "grad_norm": 0.5350726842880249, + "learning_rate": 0.00017997642009466702, + "loss": 0.8098, + "step": 6618 + }, + { + "epoch": 1.1807153688341807, + "grad_norm": 0.552034318447113, + "learning_rate": 0.0001799091649839225, + "loss": 0.8272, + "step": 6619 + }, + { + "epoch": 1.180893765052181, + "grad_norm": 0.5275470018386841, + "learning_rate": 0.00017984191537808198, + "loss": 0.6984, + "step": 6620 + }, + { + "epoch": 1.181072161270181, + "grad_norm": 0.5647268891334534, + "learning_rate": 0.0001797746712824272, + "loss": 0.935, + "step": 6621 + }, + { + "epoch": 1.1812505574881813, + "grad_norm": 0.4905039370059967, + "learning_rate": 0.0001797074327022396, + "loss": 0.645, + "step": 6622 + }, + { + "epoch": 1.1814289537061815, + "grad_norm": 0.4301641881465912, + "learning_rate": 0.00017964019964279994, + "loss": 0.4728, + "step": 6623 + }, + { + "epoch": 1.1816073499241817, + "grad_norm": 0.46979376673698425, + "learning_rate": 0.0001795729721093886, + "loss": 0.6428, + "step": 6624 + }, + { + "epoch": 1.181785746142182, + "grad_norm": 0.48805493116378784, + "learning_rate": 0.00017950575010728582, + "loss": 0.6446, + "step": 6625 + }, + { + "epoch": 1.1819641423601819, + "grad_norm": 0.49873703718185425, + "learning_rate": 0.00017943853364177112, + "loss": 0.7287, + "step": 6626 + }, + { + "epoch": 1.182142538578182, + "grad_norm": 0.422006756067276, + "learning_rate": 0.00017937132271812368, + "loss": 0.4329, + "step": 6627 + }, + { + "epoch": 1.1823209347961823, + "grad_norm": 0.46575191617012024, + "learning_rate": 0.00017930411734162204, + "loss": 0.582, + "step": 6628 + }, + { + "epoch": 1.1824993310141825, + "grad_norm": 0.5291426181793213, + "learning_rate": 0.00017923691751754468, + "loss": 0.7962, + "step": 6629 + }, + { + "epoch": 1.1826777272321827, + "grad_norm": 0.48327967524528503, + "learning_rate": 0.00017916972325116931, + "loss": 0.6514, + "step": 6630 + }, + { + "epoch": 1.1828561234501829, + "grad_norm": 0.47633296251296997, + "learning_rate": 0.00017910253454777344, + "loss": 0.6475, + "step": 6631 + }, + { + "epoch": 1.183034519668183, + "grad_norm": 0.5271769165992737, + "learning_rate": 0.000179035351412634, + "loss": 0.7574, + "step": 6632 + }, + { + "epoch": 1.1832129158861833, + "grad_norm": 0.45109376311302185, + "learning_rate": 0.00017896817385102748, + "loss": 0.538, + "step": 6633 + }, + { + "epoch": 1.1833913121041835, + "grad_norm": 0.489512175321579, + "learning_rate": 0.00017890100186823004, + "loss": 0.7353, + "step": 6634 + }, + { + "epoch": 1.1835697083221837, + "grad_norm": 0.5405703186988831, + "learning_rate": 0.00017883383546951737, + "loss": 0.7591, + "step": 6635 + }, + { + "epoch": 1.1837481045401836, + "grad_norm": 0.4727650284767151, + "learning_rate": 0.00017876667466016458, + "loss": 0.7046, + "step": 6636 + }, + { + "epoch": 1.1839265007581838, + "grad_norm": 0.49231699109077454, + "learning_rate": 0.0001786995194454465, + "loss": 0.7205, + "step": 6637 + }, + { + "epoch": 1.184104896976184, + "grad_norm": 0.49565812945365906, + "learning_rate": 0.0001786323698306375, + "loss": 0.7502, + "step": 6638 + }, + { + "epoch": 1.1842832931941842, + "grad_norm": 0.4355218708515167, + "learning_rate": 0.00017856522582101133, + "loss": 0.5357, + "step": 6639 + }, + { + "epoch": 1.1844616894121844, + "grad_norm": 0.509213387966156, + "learning_rate": 0.00017849808742184176, + "loss": 0.8362, + "step": 6640 + }, + { + "epoch": 1.1846400856301846, + "grad_norm": 0.49817323684692383, + "learning_rate": 0.0001784309546384016, + "loss": 0.6041, + "step": 6641 + }, + { + "epoch": 1.1848184818481848, + "grad_norm": 0.44568195939064026, + "learning_rate": 0.00017836382747596341, + "loss": 0.6403, + "step": 6642 + }, + { + "epoch": 1.184996878066185, + "grad_norm": 0.42027372121810913, + "learning_rate": 0.00017829670593979944, + "loss": 0.5297, + "step": 6643 + }, + { + "epoch": 1.1851752742841852, + "grad_norm": 0.4120577871799469, + "learning_rate": 0.0001782295900351813, + "loss": 0.5559, + "step": 6644 + }, + { + "epoch": 1.1853536705021854, + "grad_norm": 0.4313491880893707, + "learning_rate": 0.00017816247976738025, + "loss": 0.629, + "step": 6645 + }, + { + "epoch": 1.1855320667201856, + "grad_norm": 0.45468538999557495, + "learning_rate": 0.00017809537514166718, + "loss": 0.6541, + "step": 6646 + }, + { + "epoch": 1.1857104629381858, + "grad_norm": 42.75632858276367, + "learning_rate": 0.00017802827616331235, + "loss": 1.0525, + "step": 6647 + }, + { + "epoch": 1.1858888591561858, + "grad_norm": 0.5608348846435547, + "learning_rate": 0.00017796118283758584, + "loss": 0.7031, + "step": 6648 + }, + { + "epoch": 1.186067255374186, + "grad_norm": 0.4569344222545624, + "learning_rate": 0.00017789409516975698, + "loss": 0.524, + "step": 6649 + }, + { + "epoch": 1.1862456515921862, + "grad_norm": 0.5035879015922546, + "learning_rate": 0.0001778270131650948, + "loss": 0.681, + "step": 6650 + }, + { + "epoch": 1.1864240478101864, + "grad_norm": 0.4764963984489441, + "learning_rate": 0.000177759936828868, + "loss": 0.6345, + "step": 6651 + }, + { + "epoch": 1.1866024440281866, + "grad_norm": 0.46746543049812317, + "learning_rate": 0.00017769286616634461, + "loss": 0.6118, + "step": 6652 + }, + { + "epoch": 1.1867808402461868, + "grad_norm": 0.4906679689884186, + "learning_rate": 0.00017762580118279244, + "loss": 0.6521, + "step": 6653 + }, + { + "epoch": 1.186959236464187, + "grad_norm": 0.5302233099937439, + "learning_rate": 0.0001775587418834787, + "loss": 0.6907, + "step": 6654 + }, + { + "epoch": 1.1871376326821872, + "grad_norm": 0.46473437547683716, + "learning_rate": 0.00017749168827367015, + "loss": 0.6949, + "step": 6655 + }, + { + "epoch": 1.1873160289001874, + "grad_norm": 0.43767476081848145, + "learning_rate": 0.00017742464035863325, + "loss": 0.5771, + "step": 6656 + }, + { + "epoch": 1.1874944251181876, + "grad_norm": 0.5204544067382812, + "learning_rate": 0.00017735759814363383, + "loss": 0.6368, + "step": 6657 + }, + { + "epoch": 1.1876728213361876, + "grad_norm": 0.49561235308647156, + "learning_rate": 0.00017729056163393744, + "loss": 0.5123, + "step": 6658 + }, + { + "epoch": 1.1878512175541878, + "grad_norm": 0.4810577929019928, + "learning_rate": 0.00017722353083480903, + "loss": 0.6761, + "step": 6659 + }, + { + "epoch": 1.188029613772188, + "grad_norm": 0.5160006284713745, + "learning_rate": 0.00017715650575151322, + "loss": 0.6518, + "step": 6660 + }, + { + "epoch": 1.1882080099901882, + "grad_norm": 0.42943114042282104, + "learning_rate": 0.0001770894863893141, + "loss": 0.529, + "step": 6661 + }, + { + "epoch": 1.1883864062081884, + "grad_norm": 0.4512742757797241, + "learning_rate": 0.00017702247275347532, + "loss": 0.5871, + "step": 6662 + }, + { + "epoch": 1.1885648024261886, + "grad_norm": 0.5028432011604309, + "learning_rate": 0.00017695546484926012, + "loss": 0.5853, + "step": 6663 + }, + { + "epoch": 1.1887431986441888, + "grad_norm": 0.5111497044563293, + "learning_rate": 0.00017688846268193125, + "loss": 0.6258, + "step": 6664 + }, + { + "epoch": 1.188921594862189, + "grad_norm": 0.4514205753803253, + "learning_rate": 0.0001768214662567511, + "loss": 0.5353, + "step": 6665 + }, + { + "epoch": 1.1890999910801892, + "grad_norm": 0.4557707607746124, + "learning_rate": 0.0001767544755789815, + "loss": 0.6054, + "step": 6666 + }, + { + "epoch": 1.1892783872981894, + "grad_norm": 0.5693210959434509, + "learning_rate": 0.00017668749065388384, + "loss": 0.7786, + "step": 6667 + }, + { + "epoch": 1.1894567835161896, + "grad_norm": 0.47501084208488464, + "learning_rate": 0.00017662051148671914, + "loss": 0.6898, + "step": 6668 + }, + { + "epoch": 1.1896351797341898, + "grad_norm": 0.4343969225883484, + "learning_rate": 0.00017655353808274793, + "loss": 0.5845, + "step": 6669 + }, + { + "epoch": 1.1898135759521897, + "grad_norm": 0.4890352487564087, + "learning_rate": 0.00017648657044723007, + "loss": 0.579, + "step": 6670 + }, + { + "epoch": 1.18999197217019, + "grad_norm": 0.4660862386226654, + "learning_rate": 0.00017641960858542544, + "loss": 0.5289, + "step": 6671 + }, + { + "epoch": 1.1901703683881901, + "grad_norm": 0.49059972167015076, + "learning_rate": 0.0001763526525025931, + "loss": 0.6006, + "step": 6672 + }, + { + "epoch": 1.1903487646061903, + "grad_norm": 0.4573802351951599, + "learning_rate": 0.00017628570220399177, + "loss": 0.5264, + "step": 6673 + }, + { + "epoch": 1.1905271608241905, + "grad_norm": 0.5544562935829163, + "learning_rate": 0.00017621875769487964, + "loss": 0.7761, + "step": 6674 + }, + { + "epoch": 1.1907055570421907, + "grad_norm": 0.4699319303035736, + "learning_rate": 0.00017615181898051452, + "loss": 0.604, + "step": 6675 + }, + { + "epoch": 1.190883953260191, + "grad_norm": 0.4405967891216278, + "learning_rate": 0.00017608488606615376, + "loss": 0.5839, + "step": 6676 + }, + { + "epoch": 1.1910623494781911, + "grad_norm": 0.4909313917160034, + "learning_rate": 0.00017601795895705422, + "loss": 0.6793, + "step": 6677 + }, + { + "epoch": 1.1912407456961913, + "grad_norm": 0.5177908539772034, + "learning_rate": 0.00017595103765847238, + "loss": 0.7589, + "step": 6678 + }, + { + "epoch": 1.1914191419141915, + "grad_norm": 0.4939672648906708, + "learning_rate": 0.00017588412217566413, + "loss": 0.6188, + "step": 6679 + }, + { + "epoch": 1.1915975381321915, + "grad_norm": 0.4704304039478302, + "learning_rate": 0.00017581721251388506, + "loss": 0.5345, + "step": 6680 + }, + { + "epoch": 1.1917759343501917, + "grad_norm": 0.43384838104248047, + "learning_rate": 0.0001757503086783902, + "loss": 0.5269, + "step": 6681 + }, + { + "epoch": 1.1919543305681919, + "grad_norm": 0.4727664887905121, + "learning_rate": 0.0001756834106744342, + "loss": 0.7044, + "step": 6682 + }, + { + "epoch": 1.192132726786192, + "grad_norm": 0.5356529355049133, + "learning_rate": 0.00017561651850727105, + "loss": 0.8679, + "step": 6683 + }, + { + "epoch": 1.1923111230041923, + "grad_norm": 0.45256125926971436, + "learning_rate": 0.00017554963218215458, + "loss": 0.6471, + "step": 6684 + }, + { + "epoch": 1.1924895192221925, + "grad_norm": 0.6792163252830505, + "learning_rate": 0.00017548275170433783, + "loss": 0.713, + "step": 6685 + }, + { + "epoch": 1.1926679154401927, + "grad_norm": 0.42677950859069824, + "learning_rate": 0.00017541587707907387, + "loss": 0.4628, + "step": 6686 + }, + { + "epoch": 1.1928463116581929, + "grad_norm": 0.5247853994369507, + "learning_rate": 0.00017534900831161476, + "loss": 0.8362, + "step": 6687 + }, + { + "epoch": 1.193024707876193, + "grad_norm": 0.48497462272644043, + "learning_rate": 0.00017528214540721241, + "loss": 0.6612, + "step": 6688 + }, + { + "epoch": 1.1932031040941933, + "grad_norm": 0.43292883038520813, + "learning_rate": 0.0001752152883711182, + "loss": 0.5158, + "step": 6689 + }, + { + "epoch": 1.1933815003121935, + "grad_norm": 0.5147132277488708, + "learning_rate": 0.00017514843720858308, + "loss": 0.795, + "step": 6690 + }, + { + "epoch": 1.1935598965301937, + "grad_norm": 0.5482494831085205, + "learning_rate": 0.00017508159192485746, + "loss": 0.6979, + "step": 6691 + }, + { + "epoch": 1.1937382927481937, + "grad_norm": 0.5331830978393555, + "learning_rate": 0.0001750147525251914, + "loss": 0.5722, + "step": 6692 + }, + { + "epoch": 1.1939166889661939, + "grad_norm": 0.5137856006622314, + "learning_rate": 0.0001749479190148344, + "loss": 0.7025, + "step": 6693 + }, + { + "epoch": 1.194095085184194, + "grad_norm": 0.4909903109073639, + "learning_rate": 0.00017488109139903558, + "loss": 0.5722, + "step": 6694 + }, + { + "epoch": 1.1942734814021942, + "grad_norm": 0.5011717081069946, + "learning_rate": 0.00017481426968304347, + "loss": 0.6142, + "step": 6695 + }, + { + "epoch": 1.1944518776201944, + "grad_norm": 0.635654628276825, + "learning_rate": 0.00017474745387210627, + "loss": 0.6943, + "step": 6696 + }, + { + "epoch": 1.1946302738381946, + "grad_norm": 0.5469489097595215, + "learning_rate": 0.00017468064397147166, + "loss": 0.6825, + "step": 6697 + }, + { + "epoch": 1.1948086700561948, + "grad_norm": 0.5090768337249756, + "learning_rate": 0.00017461383998638685, + "loss": 0.6433, + "step": 6698 + }, + { + "epoch": 1.194987066274195, + "grad_norm": 0.5082751512527466, + "learning_rate": 0.00017454704192209863, + "loss": 0.6789, + "step": 6699 + }, + { + "epoch": 1.1951654624921952, + "grad_norm": 0.4537877142429352, + "learning_rate": 0.0001744802497838532, + "loss": 0.6015, + "step": 6700 + }, + { + "epoch": 1.1953438587101954, + "grad_norm": 0.4903229773044586, + "learning_rate": 0.00017441346357689651, + "loss": 0.6014, + "step": 6701 + }, + { + "epoch": 1.1955222549281954, + "grad_norm": 0.4441526532173157, + "learning_rate": 0.00017434668330647385, + "loss": 0.5323, + "step": 6702 + }, + { + "epoch": 1.1957006511461956, + "grad_norm": 0.532263994216919, + "learning_rate": 0.00017427990897783013, + "loss": 0.729, + "step": 6703 + }, + { + "epoch": 1.1958790473641958, + "grad_norm": 0.49677395820617676, + "learning_rate": 0.0001742131405962098, + "loss": 0.646, + "step": 6704 + }, + { + "epoch": 1.196057443582196, + "grad_norm": 0.5097184181213379, + "learning_rate": 0.00017414637816685677, + "loss": 0.688, + "step": 6705 + }, + { + "epoch": 1.1962358398001962, + "grad_norm": 0.4844436049461365, + "learning_rate": 0.00017407962169501456, + "loss": 0.6241, + "step": 6706 + }, + { + "epoch": 1.1964142360181964, + "grad_norm": 0.49923595786094666, + "learning_rate": 0.00017401287118592624, + "loss": 0.7081, + "step": 6707 + }, + { + "epoch": 1.1965926322361966, + "grad_norm": 0.49586620926856995, + "learning_rate": 0.00017394612664483429, + "loss": 0.6703, + "step": 6708 + }, + { + "epoch": 1.1967710284541968, + "grad_norm": 0.5127303004264832, + "learning_rate": 0.00017387938807698078, + "loss": 0.5966, + "step": 6709 + }, + { + "epoch": 1.196949424672197, + "grad_norm": 0.5159239172935486, + "learning_rate": 0.0001738126554876074, + "loss": 0.6792, + "step": 6710 + }, + { + "epoch": 1.1971278208901972, + "grad_norm": 0.45261603593826294, + "learning_rate": 0.0001737459288819553, + "loss": 0.5966, + "step": 6711 + }, + { + "epoch": 1.1973062171081974, + "grad_norm": 0.4482129216194153, + "learning_rate": 0.00017367920826526508, + "loss": 0.5745, + "step": 6712 + }, + { + "epoch": 1.1974846133261976, + "grad_norm": 0.49750131368637085, + "learning_rate": 0.000173612493642777, + "loss": 0.6134, + "step": 6713 + }, + { + "epoch": 1.1976630095441976, + "grad_norm": 0.5099869966506958, + "learning_rate": 0.00017354578501973083, + "loss": 0.8435, + "step": 6714 + }, + { + "epoch": 1.1978414057621978, + "grad_norm": 0.4573536813259125, + "learning_rate": 0.00017347908240136585, + "loss": 0.5377, + "step": 6715 + }, + { + "epoch": 1.198019801980198, + "grad_norm": 0.5403563380241394, + "learning_rate": 0.00017341238579292063, + "loss": 0.792, + "step": 6716 + }, + { + "epoch": 1.1981981981981982, + "grad_norm": 0.44874098896980286, + "learning_rate": 0.00017334569519963377, + "loss": 0.6036, + "step": 6717 + }, + { + "epoch": 1.1983765944161984, + "grad_norm": 0.5061765313148499, + "learning_rate": 0.00017327901062674306, + "loss": 0.6612, + "step": 6718 + }, + { + "epoch": 1.1985549906341986, + "grad_norm": 0.4852476418018341, + "learning_rate": 0.00017321233207948583, + "loss": 0.5551, + "step": 6719 + }, + { + "epoch": 1.1987333868521988, + "grad_norm": 0.7161738276481628, + "learning_rate": 0.00017314565956309903, + "loss": 0.5223, + "step": 6720 + }, + { + "epoch": 1.198911783070199, + "grad_norm": 0.5031499266624451, + "learning_rate": 0.000173078993082819, + "loss": 0.7557, + "step": 6721 + }, + { + "epoch": 1.1990901792881992, + "grad_norm": 1.0971347093582153, + "learning_rate": 0.00017301233264388176, + "loss": 0.6716, + "step": 6722 + }, + { + "epoch": 1.1992685755061994, + "grad_norm": 0.6954957842826843, + "learning_rate": 0.0001729456782515228, + "loss": 0.669, + "step": 6723 + }, + { + "epoch": 1.1994469717241993, + "grad_norm": 0.5447986125946045, + "learning_rate": 0.0001728790299109771, + "loss": 0.7785, + "step": 6724 + }, + { + "epoch": 1.1996253679421995, + "grad_norm": 0.5347351431846619, + "learning_rate": 0.00017281238762747919, + "loss": 0.7014, + "step": 6725 + }, + { + "epoch": 1.1998037641601997, + "grad_norm": 0.5047118663787842, + "learning_rate": 0.00017274575140626317, + "loss": 0.574, + "step": 6726 + }, + { + "epoch": 1.1999821603782, + "grad_norm": 0.5407329797744751, + "learning_rate": 0.0001726791212525626, + "loss": 0.7382, + "step": 6727 + }, + { + "epoch": 1.2001605565962001, + "grad_norm": 0.51371830701828, + "learning_rate": 0.00017261249717161054, + "loss": 0.779, + "step": 6728 + }, + { + "epoch": 1.2003389528142003, + "grad_norm": 0.4466804563999176, + "learning_rate": 0.00017254587916863969, + "loss": 0.5177, + "step": 6729 + }, + { + "epoch": 1.2005173490322005, + "grad_norm": 0.45748934149742126, + "learning_rate": 0.0001724792672488821, + "loss": 0.6624, + "step": 6730 + }, + { + "epoch": 1.2006957452502007, + "grad_norm": 0.5126693844795227, + "learning_rate": 0.0001724126614175694, + "loss": 0.7792, + "step": 6731 + }, + { + "epoch": 1.200874141468201, + "grad_norm": 0.46355128288269043, + "learning_rate": 0.00017234606167993303, + "loss": 0.6419, + "step": 6732 + }, + { + "epoch": 1.2010525376862011, + "grad_norm": 0.4698454439640045, + "learning_rate": 0.0001722794680412036, + "loss": 0.6422, + "step": 6733 + }, + { + "epoch": 1.2012309339042013, + "grad_norm": 0.4981541335582733, + "learning_rate": 0.0001722128805066112, + "loss": 0.6687, + "step": 6734 + }, + { + "epoch": 1.2014093301222015, + "grad_norm": 0.5997437834739685, + "learning_rate": 0.00017214629908138574, + "loss": 0.9846, + "step": 6735 + }, + { + "epoch": 1.2015877263402015, + "grad_norm": 0.4991457164287567, + "learning_rate": 0.0001720797237707564, + "loss": 0.6268, + "step": 6736 + }, + { + "epoch": 1.2017661225582017, + "grad_norm": 0.4917805790901184, + "learning_rate": 0.0001720131545799521, + "loss": 0.62, + "step": 6737 + }, + { + "epoch": 1.201944518776202, + "grad_norm": 0.45788219571113586, + "learning_rate": 0.00017194659151420106, + "loss": 0.7157, + "step": 6738 + }, + { + "epoch": 1.202122914994202, + "grad_norm": 0.49863317608833313, + "learning_rate": 0.0001718800345787311, + "loss": 0.6979, + "step": 6739 + }, + { + "epoch": 1.2023013112122023, + "grad_norm": 0.591437578201294, + "learning_rate": 0.00017181348377876958, + "loss": 0.6895, + "step": 6740 + }, + { + "epoch": 1.2024797074302025, + "grad_norm": 0.4187926650047302, + "learning_rate": 0.00017174693911954354, + "loss": 0.5386, + "step": 6741 + }, + { + "epoch": 1.2026581036482027, + "grad_norm": 0.4731749892234802, + "learning_rate": 0.0001716804006062791, + "loss": 0.6097, + "step": 6742 + }, + { + "epoch": 1.202836499866203, + "grad_norm": 0.4960040748119354, + "learning_rate": 0.00017161386824420232, + "loss": 0.7277, + "step": 6743 + }, + { + "epoch": 1.203014896084203, + "grad_norm": 0.47977831959724426, + "learning_rate": 0.00017154734203853858, + "loss": 0.6146, + "step": 6744 + }, + { + "epoch": 1.2031932923022033, + "grad_norm": 0.6268126368522644, + "learning_rate": 0.00017148082199451288, + "loss": 0.6726, + "step": 6745 + }, + { + "epoch": 1.2033716885202033, + "grad_norm": 0.5123054385185242, + "learning_rate": 0.0001714143081173495, + "loss": 0.8116, + "step": 6746 + }, + { + "epoch": 1.2035500847382035, + "grad_norm": 0.45163649320602417, + "learning_rate": 0.00017134780041227265, + "loss": 0.6415, + "step": 6747 + }, + { + "epoch": 1.2037284809562037, + "grad_norm": 0.511045515537262, + "learning_rate": 0.00017128129888450573, + "loss": 0.5628, + "step": 6748 + }, + { + "epoch": 1.2039068771742039, + "grad_norm": 0.44558185338974, + "learning_rate": 0.00017121480353927165, + "loss": 0.5404, + "step": 6749 + }, + { + "epoch": 1.204085273392204, + "grad_norm": 0.5241187810897827, + "learning_rate": 0.00017114831438179304, + "loss": 0.7767, + "step": 6750 + }, + { + "epoch": 1.2042636696102043, + "grad_norm": 0.722863495349884, + "learning_rate": 0.00017108183141729188, + "loss": 0.685, + "step": 6751 + }, + { + "epoch": 1.2044420658282045, + "grad_norm": 0.6952433586120605, + "learning_rate": 0.00017101535465098973, + "loss": 0.6479, + "step": 6752 + }, + { + "epoch": 1.2046204620462047, + "grad_norm": 0.48947012424468994, + "learning_rate": 0.00017094888408810763, + "loss": 0.749, + "step": 6753 + }, + { + "epoch": 1.2047988582642049, + "grad_norm": 0.4884073734283447, + "learning_rate": 0.0001708824197338662, + "loss": 0.727, + "step": 6754 + }, + { + "epoch": 1.204977254482205, + "grad_norm": 0.5633352398872375, + "learning_rate": 0.00017081596159348544, + "loss": 0.8563, + "step": 6755 + }, + { + "epoch": 1.2051556507002052, + "grad_norm": 0.48596227169036865, + "learning_rate": 0.00017074950967218495, + "loss": 0.6633, + "step": 6756 + }, + { + "epoch": 1.2053340469182054, + "grad_norm": 0.45270687341690063, + "learning_rate": 0.0001706830639751839, + "loss": 0.5795, + "step": 6757 + }, + { + "epoch": 1.2055124431362054, + "grad_norm": 0.48366621136665344, + "learning_rate": 0.00017061662450770085, + "loss": 0.7047, + "step": 6758 + }, + { + "epoch": 1.2056908393542056, + "grad_norm": 0.48411786556243896, + "learning_rate": 0.00017055019127495396, + "loss": 0.7534, + "step": 6759 + }, + { + "epoch": 1.2058692355722058, + "grad_norm": 0.4793466627597809, + "learning_rate": 0.00017048376428216083, + "loss": 0.6564, + "step": 6760 + }, + { + "epoch": 1.206047631790206, + "grad_norm": 0.4573822021484375, + "learning_rate": 0.0001704173435345387, + "loss": 0.4675, + "step": 6761 + }, + { + "epoch": 1.2062260280082062, + "grad_norm": 0.540582537651062, + "learning_rate": 0.00017035092903730403, + "loss": 0.7691, + "step": 6762 + }, + { + "epoch": 1.2064044242262064, + "grad_norm": 0.5048598647117615, + "learning_rate": 0.00017028452079567325, + "loss": 0.7058, + "step": 6763 + }, + { + "epoch": 1.2065828204442066, + "grad_norm": 0.46692147850990295, + "learning_rate": 0.00017021811881486184, + "loss": 0.5641, + "step": 6764 + }, + { + "epoch": 1.2067612166622068, + "grad_norm": 0.4819284975528717, + "learning_rate": 0.00017015172310008508, + "loss": 0.733, + "step": 6765 + }, + { + "epoch": 1.206939612880207, + "grad_norm": 0.4647090435028076, + "learning_rate": 0.00017008533365655765, + "loss": 0.5602, + "step": 6766 + }, + { + "epoch": 1.2071180090982072, + "grad_norm": 0.42821335792541504, + "learning_rate": 0.0001700189504894937, + "loss": 0.5918, + "step": 6767 + }, + { + "epoch": 1.2072964053162072, + "grad_norm": 0.42195186018943787, + "learning_rate": 0.00016995257360410694, + "loss": 0.5887, + "step": 6768 + }, + { + "epoch": 1.2074748015342074, + "grad_norm": 0.5590615272521973, + "learning_rate": 0.00016988620300561065, + "loss": 0.5942, + "step": 6769 + }, + { + "epoch": 1.2076531977522076, + "grad_norm": 0.459682822227478, + "learning_rate": 0.0001698198386992174, + "loss": 0.6768, + "step": 6770 + }, + { + "epoch": 1.2078315939702078, + "grad_norm": 0.5003504157066345, + "learning_rate": 0.0001697534806901396, + "loss": 0.7413, + "step": 6771 + }, + { + "epoch": 1.208009990188208, + "grad_norm": 0.4521839916706085, + "learning_rate": 0.00016968712898358888, + "loss": 0.5958, + "step": 6772 + }, + { + "epoch": 1.2081883864062082, + "grad_norm": 0.47012245655059814, + "learning_rate": 0.00016962078358477648, + "loss": 0.6722, + "step": 6773 + }, + { + "epoch": 1.2083667826242084, + "grad_norm": 0.4755409359931946, + "learning_rate": 0.0001695544444989132, + "loss": 0.7082, + "step": 6774 + }, + { + "epoch": 1.2085451788422086, + "grad_norm": 0.44018933176994324, + "learning_rate": 0.00016948811173120914, + "loss": 0.6281, + "step": 6775 + }, + { + "epoch": 1.2087235750602088, + "grad_norm": 0.5177139639854431, + "learning_rate": 0.00016942178528687419, + "loss": 0.829, + "step": 6776 + }, + { + "epoch": 1.208901971278209, + "grad_norm": 0.42148521542549133, + "learning_rate": 0.00016935546517111744, + "loss": 0.5036, + "step": 6777 + }, + { + "epoch": 1.2090803674962092, + "grad_norm": 0.4840809404850006, + "learning_rate": 0.00016928915138914787, + "loss": 0.6851, + "step": 6778 + }, + { + "epoch": 1.2092587637142094, + "grad_norm": 0.4874308407306671, + "learning_rate": 0.0001692228439461736, + "loss": 0.7215, + "step": 6779 + }, + { + "epoch": 1.2094371599322093, + "grad_norm": 0.5354540944099426, + "learning_rate": 0.0001691565428474024, + "loss": 0.6564, + "step": 6780 + }, + { + "epoch": 1.2096155561502095, + "grad_norm": 0.47135040163993835, + "learning_rate": 0.0001690902480980415, + "loss": 0.6163, + "step": 6781 + }, + { + "epoch": 1.2097939523682097, + "grad_norm": 0.5024769902229309, + "learning_rate": 0.00016902395970329776, + "loss": 0.7379, + "step": 6782 + }, + { + "epoch": 1.20997234858621, + "grad_norm": 0.4286527633666992, + "learning_rate": 0.00016895767766837732, + "loss": 0.4737, + "step": 6783 + }, + { + "epoch": 1.2101507448042101, + "grad_norm": 0.4754869043827057, + "learning_rate": 0.00016889140199848605, + "loss": 0.5874, + "step": 6784 + }, + { + "epoch": 1.2103291410222103, + "grad_norm": 0.5316412448883057, + "learning_rate": 0.00016882513269882916, + "loss": 0.8397, + "step": 6785 + }, + { + "epoch": 1.2105075372402105, + "grad_norm": 0.46648043394088745, + "learning_rate": 0.00016875886977461136, + "loss": 0.6463, + "step": 6786 + }, + { + "epoch": 1.2106859334582107, + "grad_norm": 0.5333806276321411, + "learning_rate": 0.00016869261323103707, + "loss": 0.845, + "step": 6787 + }, + { + "epoch": 1.210864329676211, + "grad_norm": 0.48115333914756775, + "learning_rate": 0.00016862636307330987, + "loss": 0.6568, + "step": 6788 + }, + { + "epoch": 1.2110427258942111, + "grad_norm": 0.524509072303772, + "learning_rate": 0.00016856011930663312, + "loss": 0.8149, + "step": 6789 + }, + { + "epoch": 1.2112211221122111, + "grad_norm": 0.4812416732311249, + "learning_rate": 0.0001684938819362095, + "loss": 0.5716, + "step": 6790 + }, + { + "epoch": 1.2113995183302113, + "grad_norm": 0.4996643662452698, + "learning_rate": 0.0001684276509672414, + "loss": 0.8065, + "step": 6791 + }, + { + "epoch": 1.2115779145482115, + "grad_norm": 0.5265586376190186, + "learning_rate": 0.0001683614264049304, + "loss": 0.7074, + "step": 6792 + }, + { + "epoch": 1.2117563107662117, + "grad_norm": 0.4892607033252716, + "learning_rate": 0.00016829520825447787, + "loss": 0.7555, + "step": 6793 + }, + { + "epoch": 1.211934706984212, + "grad_norm": 0.493960440158844, + "learning_rate": 0.00016822899652108454, + "loss": 0.8655, + "step": 6794 + }, + { + "epoch": 1.212113103202212, + "grad_norm": 0.5139688849449158, + "learning_rate": 0.00016816279120995063, + "loss": 0.8795, + "step": 6795 + }, + { + "epoch": 1.2122914994202123, + "grad_norm": 0.41714024543762207, + "learning_rate": 0.00016809659232627588, + "loss": 0.4714, + "step": 6796 + }, + { + "epoch": 1.2124698956382125, + "grad_norm": 0.5181586742401123, + "learning_rate": 0.00016803039987525953, + "loss": 0.7304, + "step": 6797 + }, + { + "epoch": 1.2126482918562127, + "grad_norm": 0.47298526763916016, + "learning_rate": 0.00016796421386210028, + "loss": 0.5293, + "step": 6798 + }, + { + "epoch": 1.212826688074213, + "grad_norm": 0.47888705134391785, + "learning_rate": 0.0001678980342919964, + "loss": 0.5461, + "step": 6799 + }, + { + "epoch": 1.213005084292213, + "grad_norm": 0.5607367753982544, + "learning_rate": 0.0001678318611701456, + "loss": 0.8278, + "step": 6800 + }, + { + "epoch": 1.2131834805102133, + "grad_norm": 0.5705345869064331, + "learning_rate": 0.00016776569450174504, + "loss": 0.786, + "step": 6801 + }, + { + "epoch": 1.2133618767282133, + "grad_norm": 0.5020168423652649, + "learning_rate": 0.00016769953429199142, + "loss": 0.7604, + "step": 6802 + }, + { + "epoch": 1.2135402729462135, + "grad_norm": 0.4835543632507324, + "learning_rate": 0.00016763338054608096, + "loss": 0.6996, + "step": 6803 + }, + { + "epoch": 1.2137186691642137, + "grad_norm": 0.44707992672920227, + "learning_rate": 0.00016756723326920937, + "loss": 0.6119, + "step": 6804 + }, + { + "epoch": 1.2138970653822139, + "grad_norm": 0.40079420804977417, + "learning_rate": 0.0001675010924665718, + "loss": 0.5236, + "step": 6805 + }, + { + "epoch": 1.214075461600214, + "grad_norm": 0.4688962697982788, + "learning_rate": 0.0001674349581433629, + "loss": 0.5646, + "step": 6806 + }, + { + "epoch": 1.2142538578182143, + "grad_norm": 0.46147796511650085, + "learning_rate": 0.0001673688303047769, + "loss": 0.5085, + "step": 6807 + }, + { + "epoch": 1.2144322540362145, + "grad_norm": 0.4987983703613281, + "learning_rate": 0.00016730270895600732, + "loss": 0.7102, + "step": 6808 + }, + { + "epoch": 1.2146106502542147, + "grad_norm": 0.4857620894908905, + "learning_rate": 0.00016723659410224746, + "loss": 0.6561, + "step": 6809 + }, + { + "epoch": 1.2147890464722149, + "grad_norm": 0.5185416340827942, + "learning_rate": 0.00016717048574868987, + "loss": 0.688, + "step": 6810 + }, + { + "epoch": 1.214967442690215, + "grad_norm": 0.43132469058036804, + "learning_rate": 0.0001671043839005267, + "loss": 0.6281, + "step": 6811 + }, + { + "epoch": 1.215145838908215, + "grad_norm": 0.5003576278686523, + "learning_rate": 0.00016703828856294955, + "loss": 0.7138, + "step": 6812 + }, + { + "epoch": 1.2153242351262152, + "grad_norm": 0.5227103233337402, + "learning_rate": 0.00016697219974114955, + "loss": 0.7263, + "step": 6813 + }, + { + "epoch": 1.2155026313442154, + "grad_norm": 0.46056926250457764, + "learning_rate": 0.0001669061174403172, + "loss": 0.6924, + "step": 6814 + }, + { + "epoch": 1.2156810275622156, + "grad_norm": 0.5291325449943542, + "learning_rate": 0.00016684004166564264, + "loss": 0.7044, + "step": 6815 + }, + { + "epoch": 1.2158594237802158, + "grad_norm": 0.3964986503124237, + "learning_rate": 0.0001667739724223154, + "loss": 0.4319, + "step": 6816 + }, + { + "epoch": 1.216037819998216, + "grad_norm": 0.49517232179641724, + "learning_rate": 0.00016670790971552458, + "loss": 0.5721, + "step": 6817 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.47603654861450195, + "learning_rate": 0.0001666418535504587, + "loss": 0.6336, + "step": 6818 + }, + { + "epoch": 1.2163946124342164, + "grad_norm": 0.4822855293750763, + "learning_rate": 0.00016657580393230573, + "loss": 0.7157, + "step": 6819 + }, + { + "epoch": 1.2165730086522166, + "grad_norm": 0.45155301690101624, + "learning_rate": 0.00016650976086625324, + "loss": 0.5622, + "step": 6820 + }, + { + "epoch": 1.2167514048702168, + "grad_norm": 0.4612867832183838, + "learning_rate": 0.00016644372435748822, + "loss": 0.5382, + "step": 6821 + }, + { + "epoch": 1.216929801088217, + "grad_norm": 0.44989264011383057, + "learning_rate": 0.00016637769441119713, + "loss": 0.5503, + "step": 6822 + }, + { + "epoch": 1.2171081973062172, + "grad_norm": 0.5135206580162048, + "learning_rate": 0.00016631167103256582, + "loss": 0.7834, + "step": 6823 + }, + { + "epoch": 1.2172865935242172, + "grad_norm": 0.5145620107650757, + "learning_rate": 0.00016624565422677996, + "loss": 0.8026, + "step": 6824 + }, + { + "epoch": 1.2174649897422174, + "grad_norm": 0.4703541398048401, + "learning_rate": 0.0001661796439990244, + "loss": 0.677, + "step": 6825 + }, + { + "epoch": 1.2176433859602176, + "grad_norm": 0.4866558611392975, + "learning_rate": 0.00016611364035448348, + "loss": 0.6063, + "step": 6826 + }, + { + "epoch": 1.2178217821782178, + "grad_norm": 0.5036291480064392, + "learning_rate": 0.00016604764329834117, + "loss": 0.8213, + "step": 6827 + }, + { + "epoch": 1.218000178396218, + "grad_norm": 0.4724942743778229, + "learning_rate": 0.00016598165283578082, + "loss": 0.6396, + "step": 6828 + }, + { + "epoch": 1.2181785746142182, + "grad_norm": 0.44807612895965576, + "learning_rate": 0.0001659156689719853, + "loss": 0.5442, + "step": 6829 + }, + { + "epoch": 1.2183569708322184, + "grad_norm": 0.42465436458587646, + "learning_rate": 0.00016584969171213693, + "loss": 0.4956, + "step": 6830 + }, + { + "epoch": 1.2185353670502186, + "grad_norm": 0.9363518357276917, + "learning_rate": 0.0001657837210614176, + "loss": 0.8942, + "step": 6831 + }, + { + "epoch": 1.2187137632682188, + "grad_norm": 0.44213372468948364, + "learning_rate": 0.00016571775702500856, + "loss": 0.4695, + "step": 6832 + }, + { + "epoch": 1.218892159486219, + "grad_norm": 0.4889032542705536, + "learning_rate": 0.0001656517996080906, + "loss": 0.4976, + "step": 6833 + }, + { + "epoch": 1.219070555704219, + "grad_norm": 0.5484008193016052, + "learning_rate": 0.00016558584881584408, + "loss": 0.7479, + "step": 6834 + }, + { + "epoch": 1.2192489519222192, + "grad_norm": 0.4762975573539734, + "learning_rate": 0.00016551990465344857, + "loss": 0.6117, + "step": 6835 + }, + { + "epoch": 1.2194273481402194, + "grad_norm": 0.5832328200340271, + "learning_rate": 0.00016545396712608346, + "loss": 0.7153, + "step": 6836 + }, + { + "epoch": 1.2196057443582196, + "grad_norm": 0.46144014596939087, + "learning_rate": 0.00016538803623892734, + "loss": 0.568, + "step": 6837 + }, + { + "epoch": 1.2197841405762198, + "grad_norm": 0.4600122570991516, + "learning_rate": 0.0001653221119971584, + "loss": 0.5168, + "step": 6838 + }, + { + "epoch": 1.21996253679422, + "grad_norm": 0.51224684715271, + "learning_rate": 0.0001652561944059544, + "loss": 0.77, + "step": 6839 + }, + { + "epoch": 1.2201409330122202, + "grad_norm": 0.529111921787262, + "learning_rate": 0.00016519028347049242, + "loss": 0.7114, + "step": 6840 + }, + { + "epoch": 1.2203193292302204, + "grad_norm": 0.49239465594291687, + "learning_rate": 0.00016512437919594908, + "loss": 0.7044, + "step": 6841 + }, + { + "epoch": 1.2204977254482205, + "grad_norm": 0.4696052670478821, + "learning_rate": 0.00016505848158750047, + "loss": 0.647, + "step": 6842 + }, + { + "epoch": 1.2206761216662207, + "grad_norm": 0.4317275285720825, + "learning_rate": 0.00016499259065032217, + "loss": 0.5817, + "step": 6843 + }, + { + "epoch": 1.220854517884221, + "grad_norm": 0.5241662263870239, + "learning_rate": 0.00016492670638958924, + "loss": 0.7357, + "step": 6844 + }, + { + "epoch": 1.2210329141022211, + "grad_norm": 0.5198963284492493, + "learning_rate": 0.00016486082881047616, + "loss": 0.5708, + "step": 6845 + }, + { + "epoch": 1.2212113103202211, + "grad_norm": 0.49722328782081604, + "learning_rate": 0.00016479495791815702, + "loss": 0.6887, + "step": 6846 + }, + { + "epoch": 1.2213897065382213, + "grad_norm": 0.5908113121986389, + "learning_rate": 0.00016472909371780512, + "loss": 0.9454, + "step": 6847 + }, + { + "epoch": 1.2215681027562215, + "grad_norm": 0.5063602328300476, + "learning_rate": 0.00016466323621459352, + "loss": 0.6783, + "step": 6848 + }, + { + "epoch": 1.2217464989742217, + "grad_norm": 0.47944772243499756, + "learning_rate": 0.00016459738541369466, + "loss": 0.6294, + "step": 6849 + }, + { + "epoch": 1.221924895192222, + "grad_norm": 0.48459392786026, + "learning_rate": 0.00016453154132028036, + "loss": 0.6708, + "step": 6850 + }, + { + "epoch": 1.2221032914102221, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.00016446570393952205, + "loss": 0.6274, + "step": 6851 + }, + { + "epoch": 1.2222816876282223, + "grad_norm": 0.4757004976272583, + "learning_rate": 0.0001643998732765905, + "loss": 0.6813, + "step": 6852 + }, + { + "epoch": 1.2224600838462225, + "grad_norm": 0.6229151487350464, + "learning_rate": 0.0001643340493366561, + "loss": 0.5887, + "step": 6853 + }, + { + "epoch": 1.2226384800642227, + "grad_norm": 0.5339584946632385, + "learning_rate": 0.00016426823212488856, + "loss": 0.8846, + "step": 6854 + }, + { + "epoch": 1.222816876282223, + "grad_norm": 0.4785784184932709, + "learning_rate": 0.0001642024216464572, + "loss": 0.5552, + "step": 6855 + }, + { + "epoch": 1.2229952725002229, + "grad_norm": 0.45622581243515015, + "learning_rate": 0.00016413661790653074, + "loss": 0.5302, + "step": 6856 + }, + { + "epoch": 1.223173668718223, + "grad_norm": 0.4625503420829773, + "learning_rate": 0.00016407082091027736, + "loss": 0.5837, + "step": 6857 + }, + { + "epoch": 1.2233520649362233, + "grad_norm": 0.49430111050605774, + "learning_rate": 0.00016400503066286472, + "loss": 0.5746, + "step": 6858 + }, + { + "epoch": 1.2235304611542235, + "grad_norm": 0.6408543586730957, + "learning_rate": 0.00016393924716946002, + "loss": 0.6712, + "step": 6859 + }, + { + "epoch": 1.2237088573722237, + "grad_norm": 0.5817059874534607, + "learning_rate": 0.00016387347043522976, + "loss": 0.8973, + "step": 6860 + }, + { + "epoch": 1.2238872535902239, + "grad_norm": 0.7297067642211914, + "learning_rate": 0.00016380770046534005, + "loss": 0.9795, + "step": 6861 + }, + { + "epoch": 1.224065649808224, + "grad_norm": 0.5279180407524109, + "learning_rate": 0.00016374193726495647, + "loss": 0.6417, + "step": 6862 + }, + { + "epoch": 1.2242440460262243, + "grad_norm": 0.45359596610069275, + "learning_rate": 0.00016367618083924402, + "loss": 0.6208, + "step": 6863 + }, + { + "epoch": 1.2244224422442245, + "grad_norm": 0.47759270668029785, + "learning_rate": 0.00016361043119336719, + "loss": 0.4829, + "step": 6864 + }, + { + "epoch": 1.2246008384622247, + "grad_norm": 0.4140789210796356, + "learning_rate": 0.00016354468833248992, + "loss": 0.3787, + "step": 6865 + }, + { + "epoch": 1.2247792346802249, + "grad_norm": 0.4323157072067261, + "learning_rate": 0.00016347895226177561, + "loss": 0.5246, + "step": 6866 + }, + { + "epoch": 1.224957630898225, + "grad_norm": 0.42192304134368896, + "learning_rate": 0.0001634132229863872, + "loss": 0.5802, + "step": 6867 + }, + { + "epoch": 1.225136027116225, + "grad_norm": 0.39509162306785583, + "learning_rate": 0.00016334750051148696, + "loss": 0.3766, + "step": 6868 + }, + { + "epoch": 1.2253144233342252, + "grad_norm": 0.5555423498153687, + "learning_rate": 0.0001632817848422366, + "loss": 0.8368, + "step": 6869 + }, + { + "epoch": 1.2254928195522254, + "grad_norm": 0.489187628030777, + "learning_rate": 0.00016321607598379767, + "loss": 0.6287, + "step": 6870 + }, + { + "epoch": 1.2256712157702256, + "grad_norm": 0.4797876477241516, + "learning_rate": 0.00016315037394133082, + "loss": 0.5631, + "step": 6871 + }, + { + "epoch": 1.2258496119882258, + "grad_norm": 0.5317756533622742, + "learning_rate": 0.00016308467871999622, + "loss": 0.6423, + "step": 6872 + }, + { + "epoch": 1.226028008206226, + "grad_norm": 0.5093753933906555, + "learning_rate": 0.00016301899032495354, + "loss": 0.7641, + "step": 6873 + }, + { + "epoch": 1.2262064044242262, + "grad_norm": 0.4402000606060028, + "learning_rate": 0.0001629533087613619, + "loss": 0.5262, + "step": 6874 + }, + { + "epoch": 1.2263848006422264, + "grad_norm": 0.605444073677063, + "learning_rate": 0.00016288763403437994, + "loss": 0.8924, + "step": 6875 + }, + { + "epoch": 1.2265631968602266, + "grad_norm": 0.5263209342956543, + "learning_rate": 0.00016282196614916572, + "loss": 0.5979, + "step": 6876 + }, + { + "epoch": 1.2267415930782268, + "grad_norm": 0.49599960446357727, + "learning_rate": 0.0001627563051108768, + "loss": 0.554, + "step": 6877 + }, + { + "epoch": 1.2269199892962268, + "grad_norm": 0.5215542912483215, + "learning_rate": 0.0001626906509246701, + "loss": 0.673, + "step": 6878 + }, + { + "epoch": 1.227098385514227, + "grad_norm": 0.48019513487815857, + "learning_rate": 0.0001626250035957021, + "loss": 0.6647, + "step": 6879 + }, + { + "epoch": 1.2272767817322272, + "grad_norm": 0.5026001334190369, + "learning_rate": 0.00016255936312912876, + "loss": 0.5802, + "step": 6880 + }, + { + "epoch": 1.2274551779502274, + "grad_norm": 0.5892630815505981, + "learning_rate": 0.00016249372953010537, + "loss": 0.7349, + "step": 6881 + }, + { + "epoch": 1.2276335741682276, + "grad_norm": 0.49261564016342163, + "learning_rate": 0.00016242810280378678, + "loss": 0.6885, + "step": 6882 + }, + { + "epoch": 1.2278119703862278, + "grad_norm": 0.5151751637458801, + "learning_rate": 0.00016236248295532736, + "loss": 0.7812, + "step": 6883 + }, + { + "epoch": 1.227990366604228, + "grad_norm": 1.1146833896636963, + "learning_rate": 0.00016229686998988068, + "loss": 0.5243, + "step": 6884 + }, + { + "epoch": 1.2281687628222282, + "grad_norm": 0.5391646027565002, + "learning_rate": 0.00016223126391260023, + "loss": 0.8716, + "step": 6885 + }, + { + "epoch": 1.2283471590402284, + "grad_norm": 0.4731014370918274, + "learning_rate": 0.00016216566472863854, + "loss": 0.5053, + "step": 6886 + }, + { + "epoch": 1.2285255552582286, + "grad_norm": 0.5936664938926697, + "learning_rate": 0.00016210007244314774, + "loss": 0.8034, + "step": 6887 + }, + { + "epoch": 1.2287039514762288, + "grad_norm": 0.5338798761367798, + "learning_rate": 0.00016203448706127938, + "loss": 0.7982, + "step": 6888 + }, + { + "epoch": 1.228882347694229, + "grad_norm": 0.5306500792503357, + "learning_rate": 0.00016196890858818458, + "loss": 0.6466, + "step": 6889 + }, + { + "epoch": 1.229060743912229, + "grad_norm": 0.44917765259742737, + "learning_rate": 0.0001619033370290138, + "loss": 0.5473, + "step": 6890 + }, + { + "epoch": 1.2292391401302292, + "grad_norm": 0.520117998123169, + "learning_rate": 0.00016183777238891703, + "loss": 0.6827, + "step": 6891 + }, + { + "epoch": 1.2294175363482294, + "grad_norm": 0.5320703387260437, + "learning_rate": 0.0001617722146730437, + "loss": 0.8353, + "step": 6892 + }, + { + "epoch": 1.2295959325662296, + "grad_norm": 0.47005659341812134, + "learning_rate": 0.00016170666388654265, + "loss": 0.6324, + "step": 6893 + }, + { + "epoch": 1.2297743287842298, + "grad_norm": 0.45758256316185, + "learning_rate": 0.00016164112003456223, + "loss": 0.6812, + "step": 6894 + }, + { + "epoch": 1.22995272500223, + "grad_norm": 0.4741186201572418, + "learning_rate": 0.00016157558312225018, + "loss": 0.588, + "step": 6895 + }, + { + "epoch": 1.2301311212202302, + "grad_norm": 0.4784983694553375, + "learning_rate": 0.0001615100531547538, + "loss": 0.601, + "step": 6896 + }, + { + "epoch": 1.2303095174382304, + "grad_norm": 0.45370981097221375, + "learning_rate": 0.00016144453013721978, + "loss": 0.4963, + "step": 6897 + }, + { + "epoch": 1.2304879136562306, + "grad_norm": 0.5208538770675659, + "learning_rate": 0.00016137901407479421, + "loss": 0.5965, + "step": 6898 + }, + { + "epoch": 1.2306663098742308, + "grad_norm": 0.45619308948516846, + "learning_rate": 0.00016131350497262278, + "loss": 0.5699, + "step": 6899 + }, + { + "epoch": 1.2308447060922307, + "grad_norm": 0.5480870604515076, + "learning_rate": 0.00016124800283585044, + "loss": 0.6709, + "step": 6900 + }, + { + "epoch": 1.231023102310231, + "grad_norm": 0.5735124945640564, + "learning_rate": 0.00016118250766962184, + "loss": 0.7542, + "step": 6901 + }, + { + "epoch": 1.2312014985282311, + "grad_norm": 0.4865647852420807, + "learning_rate": 0.00016111701947908085, + "loss": 0.6056, + "step": 6902 + }, + { + "epoch": 1.2313798947462313, + "grad_norm": 0.7100339531898499, + "learning_rate": 0.00016105153826937086, + "loss": 0.9174, + "step": 6903 + }, + { + "epoch": 1.2315582909642315, + "grad_norm": 0.5096773505210876, + "learning_rate": 0.00016098606404563482, + "loss": 0.6564, + "step": 6904 + }, + { + "epoch": 1.2317366871822317, + "grad_norm": 0.5119658708572388, + "learning_rate": 0.000160920596813015, + "loss": 0.7196, + "step": 6905 + }, + { + "epoch": 1.231915083400232, + "grad_norm": 0.4193865656852722, + "learning_rate": 0.0001608551365766532, + "loss": 0.5337, + "step": 6906 + }, + { + "epoch": 1.2320934796182321, + "grad_norm": 0.47937509417533875, + "learning_rate": 0.00016078968334169057, + "loss": 0.6803, + "step": 6907 + }, + { + "epoch": 1.2322718758362323, + "grad_norm": 0.5272152423858643, + "learning_rate": 0.00016072423711326782, + "loss": 0.6065, + "step": 6908 + }, + { + "epoch": 1.2324502720542325, + "grad_norm": 0.40397560596466064, + "learning_rate": 0.0001606587978965251, + "loss": 0.546, + "step": 6909 + }, + { + "epoch": 1.2326286682722327, + "grad_norm": 0.428237646818161, + "learning_rate": 0.00016059336569660193, + "loss": 0.4717, + "step": 6910 + }, + { + "epoch": 1.232807064490233, + "grad_norm": 0.4891768991947174, + "learning_rate": 0.00016052794051863733, + "loss": 0.6167, + "step": 6911 + }, + { + "epoch": 1.232985460708233, + "grad_norm": 0.5653678178787231, + "learning_rate": 0.00016046252236776978, + "loss": 0.8338, + "step": 6912 + }, + { + "epoch": 1.233163856926233, + "grad_norm": 0.4607936441898346, + "learning_rate": 0.00016039711124913718, + "loss": 0.6023, + "step": 6913 + }, + { + "epoch": 1.2333422531442333, + "grad_norm": 0.5353094339370728, + "learning_rate": 0.00016033170716787698, + "loss": 0.785, + "step": 6914 + }, + { + "epoch": 1.2335206493622335, + "grad_norm": 0.47947603464126587, + "learning_rate": 0.00016026631012912578, + "loss": 0.6174, + "step": 6915 + }, + { + "epoch": 1.2336990455802337, + "grad_norm": 0.5786678791046143, + "learning_rate": 0.00016020092013802002, + "loss": 0.9001, + "step": 6916 + }, + { + "epoch": 1.2338774417982339, + "grad_norm": 0.4535296559333801, + "learning_rate": 0.00016013553719969537, + "loss": 0.5654, + "step": 6917 + }, + { + "epoch": 1.234055838016234, + "grad_norm": 0.5149961113929749, + "learning_rate": 0.00016007016131928703, + "loss": 0.6585, + "step": 6918 + }, + { + "epoch": 1.2342342342342343, + "grad_norm": 0.4400131106376648, + "learning_rate": 0.00016000479250192942, + "loss": 0.5134, + "step": 6919 + }, + { + "epoch": 1.2344126304522345, + "grad_norm": 0.4986150562763214, + "learning_rate": 0.0001599394307527567, + "loss": 0.6103, + "step": 6920 + }, + { + "epoch": 1.2345910266702347, + "grad_norm": 0.487212210893631, + "learning_rate": 0.00015987407607690235, + "loss": 0.7115, + "step": 6921 + }, + { + "epoch": 1.2347694228882347, + "grad_norm": 0.46623316407203674, + "learning_rate": 0.0001598087284794993, + "loss": 0.5599, + "step": 6922 + }, + { + "epoch": 1.2349478191062349, + "grad_norm": 0.4859059453010559, + "learning_rate": 0.00015974338796567982, + "loss": 0.6134, + "step": 6923 + }, + { + "epoch": 1.235126215324235, + "grad_norm": 0.5123870968818665, + "learning_rate": 0.00015967805454057587, + "loss": 0.7607, + "step": 6924 + }, + { + "epoch": 1.2353046115422353, + "grad_norm": 0.49646908044815063, + "learning_rate": 0.00015961272820931867, + "loss": 0.6267, + "step": 6925 + }, + { + "epoch": 1.2354830077602355, + "grad_norm": 0.4790204167366028, + "learning_rate": 0.0001595474089770389, + "loss": 0.6403, + "step": 6926 + }, + { + "epoch": 1.2356614039782357, + "grad_norm": 0.49380695819854736, + "learning_rate": 0.00015948209684886667, + "loss": 0.6544, + "step": 6927 + }, + { + "epoch": 1.2358398001962358, + "grad_norm": 0.5434766411781311, + "learning_rate": 0.00015941679182993157, + "loss": 0.8058, + "step": 6928 + }, + { + "epoch": 1.236018196414236, + "grad_norm": 0.6229785680770874, + "learning_rate": 0.00015935149392536273, + "loss": 0.7764, + "step": 6929 + }, + { + "epoch": 1.2361965926322362, + "grad_norm": 0.5308107733726501, + "learning_rate": 0.00015928620314028838, + "loss": 0.6995, + "step": 6930 + }, + { + "epoch": 1.2363749888502364, + "grad_norm": 0.6573651432991028, + "learning_rate": 0.0001592209194798368, + "loss": 0.7208, + "step": 6931 + }, + { + "epoch": 1.2365533850682366, + "grad_norm": 0.5968353748321533, + "learning_rate": 0.00015915564294913503, + "loss": 0.8999, + "step": 6932 + }, + { + "epoch": 1.2367317812862368, + "grad_norm": 0.45062559843063354, + "learning_rate": 0.00015909037355331003, + "loss": 0.5419, + "step": 6933 + }, + { + "epoch": 1.2369101775042368, + "grad_norm": 0.5150581002235413, + "learning_rate": 0.0001590251112974879, + "loss": 0.6129, + "step": 6934 + }, + { + "epoch": 1.237088573722237, + "grad_norm": 0.4370402693748474, + "learning_rate": 0.00015895985618679444, + "loss": 0.6262, + "step": 6935 + }, + { + "epoch": 1.2372669699402372, + "grad_norm": 0.5415884852409363, + "learning_rate": 0.0001588946082263547, + "loss": 0.8408, + "step": 6936 + }, + { + "epoch": 1.2374453661582374, + "grad_norm": 0.49960747361183167, + "learning_rate": 0.0001588293674212932, + "loss": 0.7211, + "step": 6937 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 0.47522178292274475, + "learning_rate": 0.00015876413377673395, + "loss": 0.4949, + "step": 6938 + }, + { + "epoch": 1.2378021585942378, + "grad_norm": 0.5218939781188965, + "learning_rate": 0.00015869890729780045, + "loss": 0.5721, + "step": 6939 + }, + { + "epoch": 1.237980554812238, + "grad_norm": 0.5287362933158875, + "learning_rate": 0.00015863368798961538, + "loss": 0.9253, + "step": 6940 + }, + { + "epoch": 1.2381589510302382, + "grad_norm": 0.5668135285377502, + "learning_rate": 0.00015856847585730117, + "loss": 0.8171, + "step": 6941 + }, + { + "epoch": 1.2383373472482384, + "grad_norm": 0.5259323716163635, + "learning_rate": 0.00015850327090597953, + "loss": 0.6072, + "step": 6942 + }, + { + "epoch": 1.2385157434662386, + "grad_norm": 0.4646325409412384, + "learning_rate": 0.00015843807314077157, + "loss": 0.5816, + "step": 6943 + }, + { + "epoch": 1.2386941396842386, + "grad_norm": 0.4163079857826233, + "learning_rate": 0.00015837288256679799, + "loss": 0.4746, + "step": 6944 + }, + { + "epoch": 1.2388725359022388, + "grad_norm": 0.5029598474502563, + "learning_rate": 0.00015830769918917872, + "loss": 0.7697, + "step": 6945 + }, + { + "epoch": 1.239050932120239, + "grad_norm": 0.5194973349571228, + "learning_rate": 0.00015824252301303336, + "loss": 0.7874, + "step": 6946 + }, + { + "epoch": 1.2392293283382392, + "grad_norm": 0.4863186478614807, + "learning_rate": 0.00015817735404348072, + "loss": 0.715, + "step": 6947 + }, + { + "epoch": 1.2394077245562394, + "grad_norm": 0.4885355830192566, + "learning_rate": 0.00015811219228563923, + "loss": 0.6721, + "step": 6948 + }, + { + "epoch": 1.2395861207742396, + "grad_norm": 0.46745818853378296, + "learning_rate": 0.00015804703774462657, + "loss": 0.6217, + "step": 6949 + }, + { + "epoch": 1.2397645169922398, + "grad_norm": 0.5246323347091675, + "learning_rate": 0.00015798189042556, + "loss": 0.7571, + "step": 6950 + }, + { + "epoch": 1.23994291321024, + "grad_norm": 0.5258684158325195, + "learning_rate": 0.0001579167503335562, + "loss": 0.8582, + "step": 6951 + }, + { + "epoch": 1.2401213094282402, + "grad_norm": 0.49560466408729553, + "learning_rate": 0.0001578516174737312, + "loss": 0.6666, + "step": 6952 + }, + { + "epoch": 1.2402997056462404, + "grad_norm": 0.4392854571342468, + "learning_rate": 0.00015778649185120048, + "loss": 0.5967, + "step": 6953 + }, + { + "epoch": 1.2404781018642406, + "grad_norm": 0.49791455268859863, + "learning_rate": 0.000157721373471079, + "loss": 0.6931, + "step": 6954 + }, + { + "epoch": 1.2406564980822408, + "grad_norm": 0.48088231682777405, + "learning_rate": 0.00015765626233848115, + "loss": 0.733, + "step": 6955 + }, + { + "epoch": 1.2408348943002407, + "grad_norm": 0.5019537806510925, + "learning_rate": 0.00015759115845852072, + "loss": 0.8829, + "step": 6956 + }, + { + "epoch": 1.241013290518241, + "grad_norm": 0.5134669542312622, + "learning_rate": 0.0001575260618363109, + "loss": 0.7432, + "step": 6957 + }, + { + "epoch": 1.2411916867362411, + "grad_norm": 0.5204741358757019, + "learning_rate": 0.00015746097247696443, + "loss": 0.7888, + "step": 6958 + }, + { + "epoch": 1.2413700829542413, + "grad_norm": 0.5397398471832275, + "learning_rate": 0.0001573958903855934, + "loss": 0.7761, + "step": 6959 + }, + { + "epoch": 1.2415484791722415, + "grad_norm": 0.4783801734447479, + "learning_rate": 0.00015733081556730926, + "loss": 0.7478, + "step": 6960 + }, + { + "epoch": 1.2417268753902417, + "grad_norm": 0.6109618544578552, + "learning_rate": 0.00015726574802722284, + "loss": 0.7975, + "step": 6961 + }, + { + "epoch": 1.241905271608242, + "grad_norm": 0.48145678639411926, + "learning_rate": 0.00015720068777044476, + "loss": 0.5358, + "step": 6962 + }, + { + "epoch": 1.2420836678262421, + "grad_norm": 0.5572999715805054, + "learning_rate": 0.0001571356348020848, + "loss": 0.6544, + "step": 6963 + }, + { + "epoch": 1.2422620640442423, + "grad_norm": 0.503771185874939, + "learning_rate": 0.00015707058912725207, + "loss": 0.5603, + "step": 6964 + }, + { + "epoch": 1.2424404602622425, + "grad_norm": 0.5243847370147705, + "learning_rate": 0.00015700555075105532, + "loss": 0.6095, + "step": 6965 + }, + { + "epoch": 1.2426188564802425, + "grad_norm": 0.5380844473838806, + "learning_rate": 0.00015694051967860256, + "loss": 0.6285, + "step": 6966 + }, + { + "epoch": 1.2427972526982427, + "grad_norm": 0.5606120824813843, + "learning_rate": 0.00015687549591500134, + "loss": 0.8092, + "step": 6967 + }, + { + "epoch": 1.242975648916243, + "grad_norm": 0.5026856660842896, + "learning_rate": 0.0001568104794653586, + "loss": 0.6013, + "step": 6968 + }, + { + "epoch": 1.243154045134243, + "grad_norm": 0.47534608840942383, + "learning_rate": 0.00015674547033478072, + "loss": 0.5524, + "step": 6969 + }, + { + "epoch": 1.2433324413522433, + "grad_norm": 0.5083494186401367, + "learning_rate": 0.00015668046852837343, + "loss": 0.7611, + "step": 6970 + }, + { + "epoch": 1.2435108375702435, + "grad_norm": 0.4332534372806549, + "learning_rate": 0.000156615474051242, + "loss": 0.4897, + "step": 6971 + }, + { + "epoch": 1.2436892337882437, + "grad_norm": 0.4423793852329254, + "learning_rate": 0.00015655048690849102, + "loss": 0.5234, + "step": 6972 + }, + { + "epoch": 1.243867630006244, + "grad_norm": 0.47594019770622253, + "learning_rate": 0.00015648550710522466, + "loss": 0.6987, + "step": 6973 + }, + { + "epoch": 1.244046026224244, + "grad_norm": 0.5025418400764465, + "learning_rate": 0.00015642053464654627, + "loss": 0.5931, + "step": 6974 + }, + { + "epoch": 1.2442244224422443, + "grad_norm": 0.48870721459388733, + "learning_rate": 0.00015635556953755883, + "loss": 0.7561, + "step": 6975 + }, + { + "epoch": 1.2444028186602445, + "grad_norm": 0.5276898741722107, + "learning_rate": 0.00015629061178336456, + "loss": 0.5714, + "step": 6976 + }, + { + "epoch": 1.2445812148782447, + "grad_norm": 0.5232852697372437, + "learning_rate": 0.00015622566138906547, + "loss": 0.6564, + "step": 6977 + }, + { + "epoch": 1.2447596110962447, + "grad_norm": 0.41942137479782104, + "learning_rate": 0.00015616071835976254, + "loss": 0.4946, + "step": 6978 + }, + { + "epoch": 1.2449380073142449, + "grad_norm": 0.6095328330993652, + "learning_rate": 0.00015609578270055636, + "loss": 0.6298, + "step": 6979 + }, + { + "epoch": 1.245116403532245, + "grad_norm": 0.4936719238758087, + "learning_rate": 0.00015603085441654702, + "loss": 0.744, + "step": 6980 + }, + { + "epoch": 1.2452947997502453, + "grad_norm": 0.4040975868701935, + "learning_rate": 0.00015596593351283394, + "loss": 0.5204, + "step": 6981 + }, + { + "epoch": 1.2454731959682455, + "grad_norm": 0.5431026816368103, + "learning_rate": 0.000155901019994516, + "loss": 0.7077, + "step": 6982 + }, + { + "epoch": 1.2456515921862457, + "grad_norm": 0.5668693780899048, + "learning_rate": 0.00015583611386669143, + "loss": 0.7685, + "step": 6983 + }, + { + "epoch": 1.2458299884042459, + "grad_norm": 0.4577692747116089, + "learning_rate": 0.00015577121513445796, + "loss": 0.4953, + "step": 6984 + }, + { + "epoch": 1.246008384622246, + "grad_norm": 0.5005066394805908, + "learning_rate": 0.0001557063238029127, + "loss": 0.6231, + "step": 6985 + }, + { + "epoch": 1.2461867808402463, + "grad_norm": 0.4887985289096832, + "learning_rate": 0.00015564143987715224, + "loss": 0.5986, + "step": 6986 + }, + { + "epoch": 1.2463651770582465, + "grad_norm": 0.5014304518699646, + "learning_rate": 0.00015557656336227243, + "loss": 0.6816, + "step": 6987 + }, + { + "epoch": 1.2465435732762464, + "grad_norm": 0.4619125425815582, + "learning_rate": 0.00015551169426336874, + "loss": 0.608, + "step": 6988 + }, + { + "epoch": 1.2467219694942466, + "grad_norm": 0.4961630702018738, + "learning_rate": 0.0001554468325855359, + "loss": 0.6304, + "step": 6989 + }, + { + "epoch": 1.2469003657122468, + "grad_norm": 0.5148522853851318, + "learning_rate": 0.00015538197833386814, + "loss": 0.81, + "step": 6990 + }, + { + "epoch": 1.247078761930247, + "grad_norm": 0.414350688457489, + "learning_rate": 0.000155317131513459, + "loss": 0.4424, + "step": 6991 + }, + { + "epoch": 1.2472571581482472, + "grad_norm": 0.4806058406829834, + "learning_rate": 0.00015525229212940168, + "loss": 0.6765, + "step": 6992 + }, + { + "epoch": 1.2474355543662474, + "grad_norm": 0.5416685342788696, + "learning_rate": 0.0001551874601867886, + "loss": 0.7167, + "step": 6993 + }, + { + "epoch": 1.2476139505842476, + "grad_norm": 0.4818612039089203, + "learning_rate": 0.00015512263569071152, + "loss": 0.6258, + "step": 6994 + }, + { + "epoch": 1.2477923468022478, + "grad_norm": 0.5225372910499573, + "learning_rate": 0.00015505781864626184, + "loss": 0.8175, + "step": 6995 + }, + { + "epoch": 1.247970743020248, + "grad_norm": 0.45656535029411316, + "learning_rate": 0.00015499300905853026, + "loss": 0.6183, + "step": 6996 + }, + { + "epoch": 1.2481491392382482, + "grad_norm": 0.5095564126968384, + "learning_rate": 0.00015492820693260682, + "loss": 0.3982, + "step": 6997 + }, + { + "epoch": 1.2483275354562484, + "grad_norm": 0.4879413843154907, + "learning_rate": 0.00015486341227358114, + "loss": 0.5668, + "step": 6998 + }, + { + "epoch": 1.2485059316742486, + "grad_norm": 0.463022381067276, + "learning_rate": 0.00015479862508654212, + "loss": 0.6153, + "step": 6999 + }, + { + "epoch": 1.2486843278922486, + "grad_norm": 52.42532730102539, + "learning_rate": 0.0001547338453765781, + "loss": 1.0213, + "step": 7000 + }, + { + "epoch": 1.2488627241102488, + "grad_norm": 0.45679882168769836, + "learning_rate": 0.00015466907314877682, + "loss": 0.5221, + "step": 7001 + }, + { + "epoch": 1.249041120328249, + "grad_norm": 0.5205798745155334, + "learning_rate": 0.00015460430840822552, + "loss": 0.6813, + "step": 7002 + }, + { + "epoch": 1.2492195165462492, + "grad_norm": 0.510740339756012, + "learning_rate": 0.00015453955116001084, + "loss": 0.6022, + "step": 7003 + }, + { + "epoch": 1.2493979127642494, + "grad_norm": 0.5229342579841614, + "learning_rate": 0.0001544748014092187, + "loss": 0.6381, + "step": 7004 + }, + { + "epoch": 1.2495763089822496, + "grad_norm": 0.48941877484321594, + "learning_rate": 0.00015441005916093454, + "loss": 0.6733, + "step": 7005 + }, + { + "epoch": 1.2497547052002498, + "grad_norm": 0.5428256392478943, + "learning_rate": 0.0001543453244202433, + "loss": 0.7319, + "step": 7006 + }, + { + "epoch": 1.24993310141825, + "grad_norm": 0.4766102433204651, + "learning_rate": 0.0001542805971922289, + "loss": 0.6504, + "step": 7007 + }, + { + "epoch": 1.2501114976362502, + "grad_norm": 0.573760986328125, + "learning_rate": 0.00015421587748197532, + "loss": 0.7209, + "step": 7008 + }, + { + "epoch": 1.2502898938542502, + "grad_norm": 0.5121127963066101, + "learning_rate": 0.00015415116529456552, + "loss": 0.7118, + "step": 7009 + }, + { + "epoch": 1.2504682900722504, + "grad_norm": 0.5492568612098694, + "learning_rate": 0.00015408646063508197, + "loss": 0.7907, + "step": 7010 + }, + { + "epoch": 1.2506466862902506, + "grad_norm": 0.4394141733646393, + "learning_rate": 0.00015402176350860653, + "loss": 0.5036, + "step": 7011 + }, + { + "epoch": 1.2508250825082508, + "grad_norm": 0.4782421886920929, + "learning_rate": 0.00015395707392022045, + "loss": 0.6055, + "step": 7012 + }, + { + "epoch": 1.251003478726251, + "grad_norm": 0.5013086795806885, + "learning_rate": 0.0001538923918750045, + "loss": 0.6325, + "step": 7013 + }, + { + "epoch": 1.2511818749442511, + "grad_norm": 0.5063600540161133, + "learning_rate": 0.00015382771737803866, + "loss": 0.7373, + "step": 7014 + }, + { + "epoch": 1.2513602711622513, + "grad_norm": 0.48954328894615173, + "learning_rate": 0.00015376305043440254, + "loss": 0.5793, + "step": 7015 + }, + { + "epoch": 1.2515386673802515, + "grad_norm": 0.4994099736213684, + "learning_rate": 0.00015369839104917505, + "loss": 0.5759, + "step": 7016 + }, + { + "epoch": 1.2517170635982517, + "grad_norm": 0.5017921924591064, + "learning_rate": 0.00015363373922743444, + "loss": 0.6372, + "step": 7017 + }, + { + "epoch": 1.251895459816252, + "grad_norm": 0.4896945059299469, + "learning_rate": 0.0001535690949742585, + "loss": 0.5314, + "step": 7018 + }, + { + "epoch": 1.2520738560342521, + "grad_norm": 0.524643063545227, + "learning_rate": 0.00015350445829472444, + "loss": 0.7481, + "step": 7019 + }, + { + "epoch": 1.2522522522522523, + "grad_norm": 0.42429521679878235, + "learning_rate": 0.00015343982919390858, + "loss": 0.5296, + "step": 7020 + }, + { + "epoch": 1.2524306484702525, + "grad_norm": 0.44809257984161377, + "learning_rate": 0.000153375207676887, + "loss": 0.5499, + "step": 7021 + }, + { + "epoch": 1.2526090446882527, + "grad_norm": 0.45676136016845703, + "learning_rate": 0.00015331059374873495, + "loss": 0.537, + "step": 7022 + }, + { + "epoch": 1.2527874409062527, + "grad_norm": 0.5065332651138306, + "learning_rate": 0.00015324598741452733, + "loss": 0.7571, + "step": 7023 + }, + { + "epoch": 1.252965837124253, + "grad_norm": 0.4751022160053253, + "learning_rate": 0.0001531813886793383, + "loss": 0.5616, + "step": 7024 + }, + { + "epoch": 1.253144233342253, + "grad_norm": 0.5255012512207031, + "learning_rate": 0.00015311679754824125, + "loss": 0.832, + "step": 7025 + }, + { + "epoch": 1.2533226295602533, + "grad_norm": 0.4546463191509247, + "learning_rate": 0.00015305221402630925, + "loss": 0.5618, + "step": 7026 + }, + { + "epoch": 1.2535010257782535, + "grad_norm": 0.46323102712631226, + "learning_rate": 0.00015298763811861466, + "loss": 0.501, + "step": 7027 + }, + { + "epoch": 1.2536794219962537, + "grad_norm": 0.5189341902732849, + "learning_rate": 0.0001529230698302292, + "loss": 0.6688, + "step": 7028 + }, + { + "epoch": 1.253857818214254, + "grad_norm": 0.49833574891090393, + "learning_rate": 0.0001528585091662241, + "loss": 0.6542, + "step": 7029 + }, + { + "epoch": 1.254036214432254, + "grad_norm": 0.5910803079605103, + "learning_rate": 0.00015279395613166985, + "loss": 0.8463, + "step": 7030 + }, + { + "epoch": 1.254214610650254, + "grad_norm": 0.4795231223106384, + "learning_rate": 0.00015272941073163647, + "loss": 0.709, + "step": 7031 + }, + { + "epoch": 1.2543930068682543, + "grad_norm": 0.520872950553894, + "learning_rate": 0.00015266487297119335, + "loss": 0.7132, + "step": 7032 + }, + { + "epoch": 1.2545714030862545, + "grad_norm": 0.44845151901245117, + "learning_rate": 0.00015260034285540915, + "loss": 0.5002, + "step": 7033 + }, + { + "epoch": 1.2547497993042547, + "grad_norm": 0.4645794928073883, + "learning_rate": 0.00015253582038935216, + "loss": 0.5065, + "step": 7034 + }, + { + "epoch": 1.2549281955222549, + "grad_norm": 0.4405880272388458, + "learning_rate": 0.00015247130557808985, + "loss": 0.5645, + "step": 7035 + }, + { + "epoch": 1.255106591740255, + "grad_norm": 0.4625834822654724, + "learning_rate": 0.00015240679842668924, + "loss": 0.4968, + "step": 7036 + }, + { + "epoch": 1.2552849879582553, + "grad_norm": 0.4433553218841553, + "learning_rate": 0.00015234229894021666, + "loss": 0.5323, + "step": 7037 + }, + { + "epoch": 1.2554633841762555, + "grad_norm": 0.510197639465332, + "learning_rate": 0.0001522778071237379, + "loss": 0.7702, + "step": 7038 + }, + { + "epoch": 1.2556417803942557, + "grad_norm": 0.5122009515762329, + "learning_rate": 0.00015221332298231816, + "loss": 0.6138, + "step": 7039 + }, + { + "epoch": 1.2558201766122559, + "grad_norm": 0.47657519578933716, + "learning_rate": 0.00015214884652102193, + "loss": 0.6609, + "step": 7040 + }, + { + "epoch": 1.255998572830256, + "grad_norm": 0.4612807631492615, + "learning_rate": 0.00015208437774491318, + "loss": 0.466, + "step": 7041 + }, + { + "epoch": 1.2561769690482563, + "grad_norm": 0.5131711959838867, + "learning_rate": 0.00015201991665905528, + "loss": 0.6583, + "step": 7042 + }, + { + "epoch": 1.2563553652662565, + "grad_norm": 0.4903299808502197, + "learning_rate": 0.00015195546326851096, + "loss": 0.5683, + "step": 7043 + }, + { + "epoch": 1.2565337614842567, + "grad_norm": 0.772181510925293, + "learning_rate": 0.00015189101757834235, + "loss": 0.58, + "step": 7044 + }, + { + "epoch": 1.2567121577022566, + "grad_norm": 0.43410158157348633, + "learning_rate": 0.00015182657959361107, + "loss": 0.5278, + "step": 7045 + }, + { + "epoch": 1.2568905539202568, + "grad_norm": 0.5446970462799072, + "learning_rate": 0.00015176214931937794, + "loss": 0.7534, + "step": 7046 + }, + { + "epoch": 1.257068950138257, + "grad_norm": 0.4579203426837921, + "learning_rate": 0.00015169772676070328, + "loss": 0.6194, + "step": 7047 + }, + { + "epoch": 1.2572473463562572, + "grad_norm": 0.4995640516281128, + "learning_rate": 0.0001516333119226469, + "loss": 0.6539, + "step": 7048 + }, + { + "epoch": 1.2574257425742574, + "grad_norm": 0.4806896150112152, + "learning_rate": 0.00015156890481026787, + "loss": 0.4902, + "step": 7049 + }, + { + "epoch": 1.2576041387922576, + "grad_norm": 0.5618352293968201, + "learning_rate": 0.00015150450542862466, + "loss": 0.7631, + "step": 7050 + }, + { + "epoch": 1.2577825350102578, + "grad_norm": 0.679996907711029, + "learning_rate": 0.00015144011378277522, + "loss": 0.8891, + "step": 7051 + }, + { + "epoch": 1.257960931228258, + "grad_norm": 0.47723329067230225, + "learning_rate": 0.00015137572987777688, + "loss": 0.526, + "step": 7052 + }, + { + "epoch": 1.258139327446258, + "grad_norm": 0.53426593542099, + "learning_rate": 0.00015131135371868615, + "loss": 0.5509, + "step": 7053 + }, + { + "epoch": 1.2583177236642582, + "grad_norm": 0.5936741232872009, + "learning_rate": 0.0001512469853105593, + "loss": 0.6007, + "step": 7054 + }, + { + "epoch": 1.2584961198822584, + "grad_norm": 0.5116682052612305, + "learning_rate": 0.00015118262465845179, + "loss": 0.7028, + "step": 7055 + }, + { + "epoch": 1.2586745161002586, + "grad_norm": 0.5201318264007568, + "learning_rate": 0.00015111827176741833, + "loss": 0.7666, + "step": 7056 + }, + { + "epoch": 1.2588529123182588, + "grad_norm": 0.4887928068637848, + "learning_rate": 0.0001510539266425133, + "loss": 0.6696, + "step": 7057 + }, + { + "epoch": 1.259031308536259, + "grad_norm": 0.499969482421875, + "learning_rate": 0.00015098958928879035, + "loss": 0.5775, + "step": 7058 + }, + { + "epoch": 1.2592097047542592, + "grad_norm": 0.4900054335594177, + "learning_rate": 0.0001509252597113024, + "loss": 0.7217, + "step": 7059 + }, + { + "epoch": 1.2593881009722594, + "grad_norm": 0.45761123299598694, + "learning_rate": 0.00015086093791510187, + "loss": 0.4968, + "step": 7060 + }, + { + "epoch": 1.2595664971902596, + "grad_norm": 0.4074775278568268, + "learning_rate": 0.00015079662390524062, + "loss": 0.457, + "step": 7061 + }, + { + "epoch": 1.2597448934082598, + "grad_norm": 0.5297835469245911, + "learning_rate": 0.00015073231768676987, + "loss": 0.6604, + "step": 7062 + }, + { + "epoch": 1.25992328962626, + "grad_norm": 0.5857179164886475, + "learning_rate": 0.00015066801926474015, + "loss": 0.8372, + "step": 7063 + }, + { + "epoch": 1.2601016858442602, + "grad_norm": 0.5072324872016907, + "learning_rate": 0.00015060372864420147, + "loss": 0.6661, + "step": 7064 + }, + { + "epoch": 1.2602800820622604, + "grad_norm": 0.5058489441871643, + "learning_rate": 0.00015053944583020318, + "loss": 0.6894, + "step": 7065 + }, + { + "epoch": 1.2604584782802606, + "grad_norm": 0.4949549436569214, + "learning_rate": 0.00015047517082779406, + "loss": 0.6834, + "step": 7066 + }, + { + "epoch": 1.2606368744982606, + "grad_norm": 0.47919708490371704, + "learning_rate": 0.0001504109036420221, + "loss": 0.6148, + "step": 7067 + }, + { + "epoch": 1.2608152707162608, + "grad_norm": 0.5355616211891174, + "learning_rate": 0.00015034664427793484, + "loss": 0.6993, + "step": 7068 + }, + { + "epoch": 1.260993666934261, + "grad_norm": 0.4291095435619354, + "learning_rate": 0.00015028239274057938, + "loss": 0.5921, + "step": 7069 + }, + { + "epoch": 1.2611720631522612, + "grad_norm": 0.5145418047904968, + "learning_rate": 0.0001502181490350019, + "loss": 0.715, + "step": 7070 + }, + { + "epoch": 1.2613504593702614, + "grad_norm": 0.48409318923950195, + "learning_rate": 0.00015015391316624798, + "loss": 0.6167, + "step": 7071 + }, + { + "epoch": 1.2615288555882616, + "grad_norm": 0.5300613641738892, + "learning_rate": 0.0001500896851393628, + "loss": 0.7082, + "step": 7072 + }, + { + "epoch": 1.2617072518062618, + "grad_norm": 0.5523662567138672, + "learning_rate": 0.00015002546495939073, + "loss": 0.6498, + "step": 7073 + }, + { + "epoch": 1.261885648024262, + "grad_norm": 0.48872825503349304, + "learning_rate": 0.00014996125263137564, + "loss": 0.6851, + "step": 7074 + }, + { + "epoch": 1.262064044242262, + "grad_norm": 0.4612981677055359, + "learning_rate": 0.0001498970481603607, + "loss": 0.7026, + "step": 7075 + }, + { + "epoch": 1.2622424404602621, + "grad_norm": 0.47795626521110535, + "learning_rate": 0.00014983285155138854, + "loss": 0.7194, + "step": 7076 + }, + { + "epoch": 1.2624208366782623, + "grad_norm": 0.4560318887233734, + "learning_rate": 0.00014976866280950107, + "loss": 0.587, + "step": 7077 + }, + { + "epoch": 1.2625992328962625, + "grad_norm": 0.4796944856643677, + "learning_rate": 0.00014970448193973979, + "loss": 0.6386, + "step": 7078 + }, + { + "epoch": 1.2627776291142627, + "grad_norm": 0.528049647808075, + "learning_rate": 0.00014964030894714525, + "loss": 0.5845, + "step": 7079 + }, + { + "epoch": 1.262956025332263, + "grad_norm": 0.46722978353500366, + "learning_rate": 0.0001495761438367577, + "loss": 0.6446, + "step": 7080 + }, + { + "epoch": 1.2631344215502631, + "grad_norm": 0.5087683200836182, + "learning_rate": 0.00014951198661361656, + "loss": 0.589, + "step": 7081 + }, + { + "epoch": 1.2633128177682633, + "grad_norm": 0.5003060698509216, + "learning_rate": 0.00014944783728276076, + "loss": 0.7562, + "step": 7082 + }, + { + "epoch": 1.2634912139862635, + "grad_norm": 0.5159687399864197, + "learning_rate": 0.0001493836958492285, + "loss": 0.6961, + "step": 7083 + }, + { + "epoch": 1.2636696102042637, + "grad_norm": 0.6173499822616577, + "learning_rate": 0.0001493195623180575, + "loss": 0.7186, + "step": 7084 + }, + { + "epoch": 1.263848006422264, + "grad_norm": 0.5042861104011536, + "learning_rate": 0.00014925543669428478, + "loss": 0.6666, + "step": 7085 + }, + { + "epoch": 1.2640264026402641, + "grad_norm": 0.5728045105934143, + "learning_rate": 0.00014919131898294668, + "loss": 0.7712, + "step": 7086 + }, + { + "epoch": 1.2642047988582643, + "grad_norm": 0.46245428919792175, + "learning_rate": 0.00014912720918907905, + "loss": 0.5197, + "step": 7087 + }, + { + "epoch": 1.2643831950762645, + "grad_norm": 0.4901646077632904, + "learning_rate": 0.00014906310731771697, + "loss": 0.6486, + "step": 7088 + }, + { + "epoch": 1.2645615912942645, + "grad_norm": 0.5244214534759521, + "learning_rate": 0.000148999013373895, + "loss": 0.6701, + "step": 7089 + }, + { + "epoch": 1.2647399875122647, + "grad_norm": 0.6227227449417114, + "learning_rate": 0.00014893492736264708, + "loss": 0.5513, + "step": 7090 + }, + { + "epoch": 1.2649183837302649, + "grad_norm": 0.4480903148651123, + "learning_rate": 0.00014887084928900653, + "loss": 0.5382, + "step": 7091 + }, + { + "epoch": 1.265096779948265, + "grad_norm": 0.41941893100738525, + "learning_rate": 0.00014880677915800585, + "loss": 0.451, + "step": 7092 + }, + { + "epoch": 1.2652751761662653, + "grad_norm": 0.44205811619758606, + "learning_rate": 0.00014874271697467724, + "loss": 0.5072, + "step": 7093 + }, + { + "epoch": 1.2654535723842655, + "grad_norm": 0.4529136121273041, + "learning_rate": 0.00014867866274405204, + "loss": 0.5873, + "step": 7094 + }, + { + "epoch": 1.2656319686022657, + "grad_norm": 0.4797188639640808, + "learning_rate": 0.00014861461647116105, + "loss": 0.579, + "step": 7095 + }, + { + "epoch": 1.2658103648202659, + "grad_norm": 0.47483915090560913, + "learning_rate": 0.00014855057816103452, + "loss": 0.6112, + "step": 7096 + }, + { + "epoch": 1.2659887610382659, + "grad_norm": 0.6069769263267517, + "learning_rate": 0.00014848654781870186, + "loss": 0.6348, + "step": 7097 + }, + { + "epoch": 1.266167157256266, + "grad_norm": 0.6138676404953003, + "learning_rate": 0.00014842252544919205, + "loss": 0.5382, + "step": 7098 + }, + { + "epoch": 1.2663455534742663, + "grad_norm": 0.5163396596908569, + "learning_rate": 0.00014835851105753333, + "loss": 0.6694, + "step": 7099 + }, + { + "epoch": 1.2665239496922664, + "grad_norm": 0.5151557922363281, + "learning_rate": 0.0001482945046487535, + "loss": 0.7113, + "step": 7100 + }, + { + "epoch": 1.2667023459102666, + "grad_norm": 0.5661108493804932, + "learning_rate": 0.00014823050622787948, + "loss": 0.8239, + "step": 7101 + }, + { + "epoch": 1.2668807421282668, + "grad_norm": 0.7180517315864563, + "learning_rate": 0.00014816651579993773, + "loss": 0.8975, + "step": 7102 + }, + { + "epoch": 1.267059138346267, + "grad_norm": 0.5672160983085632, + "learning_rate": 0.000148102533369954, + "loss": 0.6035, + "step": 7103 + }, + { + "epoch": 1.2672375345642672, + "grad_norm": 0.47604504227638245, + "learning_rate": 0.0001480385589429535, + "loss": 0.5376, + "step": 7104 + }, + { + "epoch": 1.2674159307822674, + "grad_norm": 0.6531439423561096, + "learning_rate": 0.0001479745925239606, + "loss": 0.5673, + "step": 7105 + }, + { + "epoch": 1.2675943270002676, + "grad_norm": 0.5715680122375488, + "learning_rate": 0.0001479106341179994, + "loss": 0.7327, + "step": 7106 + }, + { + "epoch": 1.2677727232182678, + "grad_norm": 0.5460495948791504, + "learning_rate": 0.00014784668373009298, + "loss": 0.5201, + "step": 7107 + }, + { + "epoch": 1.267951119436268, + "grad_norm": 0.5381412506103516, + "learning_rate": 0.00014778274136526408, + "loss": 0.7042, + "step": 7108 + }, + { + "epoch": 1.2681295156542682, + "grad_norm": 0.49676334857940674, + "learning_rate": 0.00014771880702853468, + "loss": 0.5428, + "step": 7109 + }, + { + "epoch": 1.2683079118722684, + "grad_norm": 0.726411759853363, + "learning_rate": 0.00014765488072492617, + "loss": 0.6615, + "step": 7110 + }, + { + "epoch": 1.2684863080902684, + "grad_norm": 0.5252700448036194, + "learning_rate": 0.00014759096245945929, + "loss": 0.6958, + "step": 7111 + }, + { + "epoch": 1.2686647043082686, + "grad_norm": 1.2057543992996216, + "learning_rate": 0.0001475270522371542, + "loss": 0.4825, + "step": 7112 + }, + { + "epoch": 1.2688431005262688, + "grad_norm": 0.4835338592529297, + "learning_rate": 0.00014746315006303027, + "loss": 0.6651, + "step": 7113 + }, + { + "epoch": 1.269021496744269, + "grad_norm": 3.452477216720581, + "learning_rate": 0.0001473992559421063, + "loss": 0.7079, + "step": 7114 + }, + { + "epoch": 1.2691998929622692, + "grad_norm": 0.5416814088821411, + "learning_rate": 0.00014733536987940075, + "loss": 0.5033, + "step": 7115 + }, + { + "epoch": 1.2693782891802694, + "grad_norm": 0.46173161268234253, + "learning_rate": 0.00014727149187993105, + "loss": 0.6123, + "step": 7116 + }, + { + "epoch": 1.2695566853982696, + "grad_norm": 0.4671329855918884, + "learning_rate": 0.00014720762194871424, + "loss": 0.6949, + "step": 7117 + }, + { + "epoch": 1.2697350816162698, + "grad_norm": 0.5171025395393372, + "learning_rate": 0.00014714376009076647, + "loss": 0.6301, + "step": 7118 + }, + { + "epoch": 1.2699134778342698, + "grad_norm": 0.47935229539871216, + "learning_rate": 0.00014707990631110355, + "loss": 0.6334, + "step": 7119 + }, + { + "epoch": 1.27009187405227, + "grad_norm": 0.4533785283565521, + "learning_rate": 0.0001470160606147405, + "loss": 0.5618, + "step": 7120 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 0.5093548893928528, + "learning_rate": 0.0001469522230066917, + "loss": 0.7452, + "step": 7121 + }, + { + "epoch": 1.2704486664882704, + "grad_norm": 0.43215861916542053, + "learning_rate": 0.000146888393491971, + "loss": 0.599, + "step": 7122 + }, + { + "epoch": 1.2706270627062706, + "grad_norm": 0.46486908197402954, + "learning_rate": 0.0001468245720755915, + "loss": 0.6116, + "step": 7123 + }, + { + "epoch": 1.2708054589242708, + "grad_norm": 0.5231932401657104, + "learning_rate": 0.00014676075876256567, + "loss": 0.5941, + "step": 7124 + }, + { + "epoch": 1.270983855142271, + "grad_norm": 0.5373736619949341, + "learning_rate": 0.00014669695355790552, + "loss": 0.5963, + "step": 7125 + }, + { + "epoch": 1.2711622513602712, + "grad_norm": 0.5684579014778137, + "learning_rate": 0.00014663315646662212, + "loss": 0.6317, + "step": 7126 + }, + { + "epoch": 1.2713406475782714, + "grad_norm": 0.5242096185684204, + "learning_rate": 0.00014656936749372614, + "loss": 0.8106, + "step": 7127 + }, + { + "epoch": 1.2715190437962716, + "grad_norm": 0.5143678188323975, + "learning_rate": 0.00014650558664422748, + "loss": 0.665, + "step": 7128 + }, + { + "epoch": 1.2716974400142718, + "grad_norm": 0.5314942598342896, + "learning_rate": 0.0001464418139231355, + "loss": 0.633, + "step": 7129 + }, + { + "epoch": 1.271875836232272, + "grad_norm": 0.5419167876243591, + "learning_rate": 0.0001463780493354589, + "loss": 0.6588, + "step": 7130 + }, + { + "epoch": 1.2720542324502722, + "grad_norm": 1.082300066947937, + "learning_rate": 0.00014631429288620575, + "loss": 0.7633, + "step": 7131 + }, + { + "epoch": 1.2722326286682724, + "grad_norm": 0.5155816674232483, + "learning_rate": 0.0001462505445803834, + "loss": 0.6571, + "step": 7132 + }, + { + "epoch": 1.2724110248862723, + "grad_norm": 0.7917349338531494, + "learning_rate": 0.00014618680442299864, + "loss": 0.5941, + "step": 7133 + }, + { + "epoch": 1.2725894211042725, + "grad_norm": 0.5167495012283325, + "learning_rate": 0.00014612307241905758, + "loss": 0.7426, + "step": 7134 + }, + { + "epoch": 1.2727678173222727, + "grad_norm": 0.6022337675094604, + "learning_rate": 0.00014605934857356571, + "loss": 0.7413, + "step": 7135 + }, + { + "epoch": 1.272946213540273, + "grad_norm": 0.4629674553871155, + "learning_rate": 0.0001459956328915279, + "loss": 0.5869, + "step": 7136 + }, + { + "epoch": 1.2731246097582731, + "grad_norm": 0.4646163582801819, + "learning_rate": 0.00014593192537794834, + "loss": 0.52, + "step": 7137 + }, + { + "epoch": 1.2733030059762733, + "grad_norm": 0.4827001988887787, + "learning_rate": 0.00014586822603783047, + "loss": 0.6536, + "step": 7138 + }, + { + "epoch": 1.2734814021942735, + "grad_norm": 0.442564457654953, + "learning_rate": 0.00014580453487617745, + "loss": 0.5261, + "step": 7139 + }, + { + "epoch": 1.2736597984122737, + "grad_norm": 0.5968412756919861, + "learning_rate": 0.0001457408518979913, + "loss": 0.8212, + "step": 7140 + }, + { + "epoch": 1.2738381946302737, + "grad_norm": 0.5755636692047119, + "learning_rate": 0.00014567717710827388, + "loss": 0.8274, + "step": 7141 + }, + { + "epoch": 1.274016590848274, + "grad_norm": 0.5358353853225708, + "learning_rate": 0.0001456135105120261, + "loss": 0.7647, + "step": 7142 + }, + { + "epoch": 1.274194987066274, + "grad_norm": 0.45210355520248413, + "learning_rate": 0.00014554985211424814, + "loss": 0.582, + "step": 7143 + }, + { + "epoch": 1.2743733832842743, + "grad_norm": 0.5111218094825745, + "learning_rate": 0.00014548620191994, + "loss": 0.7459, + "step": 7144 + }, + { + "epoch": 1.2745517795022745, + "grad_norm": 0.4721735715866089, + "learning_rate": 0.00014542255993410034, + "loss": 0.6472, + "step": 7145 + }, + { + "epoch": 1.2747301757202747, + "grad_norm": 0.4758826196193695, + "learning_rate": 0.000145358926161728, + "loss": 0.5357, + "step": 7146 + }, + { + "epoch": 1.274908571938275, + "grad_norm": 0.5536307096481323, + "learning_rate": 0.00014529530060782066, + "loss": 0.7217, + "step": 7147 + }, + { + "epoch": 1.275086968156275, + "grad_norm": 0.465385377407074, + "learning_rate": 0.00014523168327737517, + "loss": 0.5545, + "step": 7148 + }, + { + "epoch": 1.2752653643742753, + "grad_norm": 0.47780296206474304, + "learning_rate": 0.0001451680741753883, + "loss": 0.6572, + "step": 7149 + }, + { + "epoch": 1.2754437605922755, + "grad_norm": 0.4876740276813507, + "learning_rate": 0.00014510447330685572, + "loss": 0.6728, + "step": 7150 + }, + { + "epoch": 1.2756221568102757, + "grad_norm": 0.5187113881111145, + "learning_rate": 0.00014504088067677273, + "loss": 0.573, + "step": 7151 + }, + { + "epoch": 1.2758005530282759, + "grad_norm": 0.46036651730537415, + "learning_rate": 0.00014497729629013367, + "loss": 0.6419, + "step": 7152 + }, + { + "epoch": 1.275978949246276, + "grad_norm": 0.49554046988487244, + "learning_rate": 0.0001449137201519327, + "loss": 0.7062, + "step": 7153 + }, + { + "epoch": 1.2761573454642763, + "grad_norm": 0.44751298427581787, + "learning_rate": 0.00014485015226716296, + "loss": 0.5182, + "step": 7154 + }, + { + "epoch": 1.2763357416822763, + "grad_norm": 0.4852393567562103, + "learning_rate": 0.0001447865926408169, + "loss": 0.6061, + "step": 7155 + }, + { + "epoch": 1.2765141379002765, + "grad_norm": 0.49661117792129517, + "learning_rate": 0.00014472304127788663, + "loss": 0.7156, + "step": 7156 + }, + { + "epoch": 1.2766925341182767, + "grad_norm": 0.4922674298286438, + "learning_rate": 0.00014465949818336332, + "loss": 0.6471, + "step": 7157 + }, + { + "epoch": 1.2768709303362769, + "grad_norm": 0.4639200270175934, + "learning_rate": 0.0001445959633622378, + "loss": 0.5754, + "step": 7158 + }, + { + "epoch": 1.277049326554277, + "grad_norm": 0.4920192360877991, + "learning_rate": 0.00014453243681949985, + "loss": 0.5749, + "step": 7159 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.5195376873016357, + "learning_rate": 0.00014446891856013895, + "loss": 0.6442, + "step": 7160 + }, + { + "epoch": 1.2774061189902775, + "grad_norm": 0.5274832844734192, + "learning_rate": 0.00014440540858914384, + "loss": 0.6963, + "step": 7161 + }, + { + "epoch": 1.2775845152082776, + "grad_norm": 0.538936972618103, + "learning_rate": 0.0001443419069115024, + "loss": 0.7555, + "step": 7162 + }, + { + "epoch": 1.2777629114262776, + "grad_norm": 0.4810023009777069, + "learning_rate": 0.00014427841353220223, + "loss": 0.5429, + "step": 7163 + }, + { + "epoch": 1.2779413076442778, + "grad_norm": 0.49055951833724976, + "learning_rate": 0.00014421492845622985, + "loss": 0.6905, + "step": 7164 + }, + { + "epoch": 1.278119703862278, + "grad_norm": 0.4986650049686432, + "learning_rate": 0.0001441514516885716, + "loss": 0.8251, + "step": 7165 + }, + { + "epoch": 1.2782981000802782, + "grad_norm": 0.47552618384361267, + "learning_rate": 0.00014408798323421268, + "loss": 0.5881, + "step": 7166 + }, + { + "epoch": 1.2784764962982784, + "grad_norm": 0.4542260468006134, + "learning_rate": 0.00014402452309813808, + "loss": 0.549, + "step": 7167 + }, + { + "epoch": 1.2786548925162786, + "grad_norm": 0.44126781821250916, + "learning_rate": 0.00014396107128533182, + "loss": 0.499, + "step": 7168 + }, + { + "epoch": 1.2788332887342788, + "grad_norm": 0.45107805728912354, + "learning_rate": 0.00014389762780077725, + "loss": 0.5451, + "step": 7169 + }, + { + "epoch": 1.279011684952279, + "grad_norm": 0.5026717185974121, + "learning_rate": 0.00014383419264945747, + "loss": 0.634, + "step": 7170 + }, + { + "epoch": 1.2791900811702792, + "grad_norm": 0.4510766267776489, + "learning_rate": 0.00014377076583635442, + "loss": 0.5339, + "step": 7171 + }, + { + "epoch": 1.2793684773882794, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0001437073473664498, + "loss": 0.4696, + "step": 7172 + }, + { + "epoch": 1.2795468736062796, + "grad_norm": 0.48713698983192444, + "learning_rate": 0.0001436439372447243, + "loss": 0.5792, + "step": 7173 + }, + { + "epoch": 1.2797252698242798, + "grad_norm": 0.45994314551353455, + "learning_rate": 0.00014358053547615824, + "loss": 0.5733, + "step": 7174 + }, + { + "epoch": 1.27990366604228, + "grad_norm": 0.5316993594169617, + "learning_rate": 0.00014351714206573107, + "loss": 0.726, + "step": 7175 + }, + { + "epoch": 1.2800820622602802, + "grad_norm": 0.6356591582298279, + "learning_rate": 0.00014345375701842173, + "loss": 0.5762, + "step": 7176 + }, + { + "epoch": 1.2802604584782802, + "grad_norm": 0.5784276723861694, + "learning_rate": 0.00014339038033920858, + "loss": 0.6256, + "step": 7177 + }, + { + "epoch": 1.2804388546962804, + "grad_norm": 0.4795292615890503, + "learning_rate": 0.00014332701203306896, + "loss": 0.6083, + "step": 7178 + }, + { + "epoch": 1.2806172509142806, + "grad_norm": 0.46686503291130066, + "learning_rate": 0.00014326365210498001, + "loss": 0.5921, + "step": 7179 + }, + { + "epoch": 1.2807956471322808, + "grad_norm": 0.4302233159542084, + "learning_rate": 0.0001432003005599179, + "loss": 0.5002, + "step": 7180 + }, + { + "epoch": 1.280974043350281, + "grad_norm": 0.5263996124267578, + "learning_rate": 0.00014313695740285814, + "loss": 0.7031, + "step": 7181 + }, + { + "epoch": 1.2811524395682812, + "grad_norm": 0.48142942786216736, + "learning_rate": 0.00014307362263877581, + "loss": 0.5069, + "step": 7182 + }, + { + "epoch": 1.2813308357862814, + "grad_norm": 0.503164529800415, + "learning_rate": 0.00014301029627264512, + "loss": 0.6391, + "step": 7183 + }, + { + "epoch": 1.2815092320042816, + "grad_norm": 0.4538707137107849, + "learning_rate": 0.00014294697830943975, + "loss": 0.4964, + "step": 7184 + }, + { + "epoch": 1.2816876282222815, + "grad_norm": 0.5014795660972595, + "learning_rate": 0.00014288366875413256, + "loss": 0.7765, + "step": 7185 + }, + { + "epoch": 1.2818660244402817, + "grad_norm": 0.5531452298164368, + "learning_rate": 0.00014282036761169604, + "loss": 0.6846, + "step": 7186 + }, + { + "epoch": 1.282044420658282, + "grad_norm": 0.6034454703330994, + "learning_rate": 0.0001427570748871016, + "loss": 0.7885, + "step": 7187 + }, + { + "epoch": 1.2822228168762821, + "grad_norm": 0.4677254557609558, + "learning_rate": 0.0001426937905853205, + "loss": 0.6689, + "step": 7188 + }, + { + "epoch": 1.2824012130942823, + "grad_norm": 0.4888259172439575, + "learning_rate": 0.00014263051471132286, + "loss": 0.5989, + "step": 7189 + }, + { + "epoch": 1.2825796093122825, + "grad_norm": 0.5150851607322693, + "learning_rate": 0.0001425672472700783, + "loss": 0.7595, + "step": 7190 + }, + { + "epoch": 1.2827580055302827, + "grad_norm": 1.0994735956192017, + "learning_rate": 0.00014250398826655593, + "loss": 0.7208, + "step": 7191 + }, + { + "epoch": 1.282936401748283, + "grad_norm": 0.5045804977416992, + "learning_rate": 0.00014244073770572403, + "loss": 0.7697, + "step": 7192 + }, + { + "epoch": 1.2831147979662831, + "grad_norm": 0.7239548563957214, + "learning_rate": 0.00014237749559255043, + "loss": 0.7041, + "step": 7193 + }, + { + "epoch": 1.2832931941842833, + "grad_norm": 0.4901469647884369, + "learning_rate": 0.000142314261932002, + "loss": 0.7087, + "step": 7194 + }, + { + "epoch": 1.2834715904022835, + "grad_norm": 0.5142937302589417, + "learning_rate": 0.000142251036729045, + "loss": 0.6189, + "step": 7195 + }, + { + "epoch": 1.2836499866202837, + "grad_norm": 0.6226754784584045, + "learning_rate": 0.00014218781998864526, + "loss": 0.8731, + "step": 7196 + }, + { + "epoch": 1.283828382838284, + "grad_norm": 0.48372164368629456, + "learning_rate": 0.00014212461171576768, + "loss": 0.7479, + "step": 7197 + }, + { + "epoch": 1.2840067790562841, + "grad_norm": 0.5454527139663696, + "learning_rate": 0.0001420614119153768, + "loss": 0.6334, + "step": 7198 + }, + { + "epoch": 1.284185175274284, + "grad_norm": 0.5037645101547241, + "learning_rate": 0.00014199822059243606, + "loss": 0.5815, + "step": 7199 + }, + { + "epoch": 1.2843635714922843, + "grad_norm": 0.5184837579727173, + "learning_rate": 0.00014193503775190868, + "loss": 0.7455, + "step": 7200 + }, + { + "epoch": 1.2845419677102845, + "grad_norm": 0.52362459897995, + "learning_rate": 0.00014187186339875696, + "loss": 0.6227, + "step": 7201 + }, + { + "epoch": 1.2847203639282847, + "grad_norm": 0.3797712028026581, + "learning_rate": 0.00014180869753794247, + "loss": 0.4564, + "step": 7202 + }, + { + "epoch": 1.284898760146285, + "grad_norm": 0.49934831261634827, + "learning_rate": 0.00014174554017442638, + "loss": 0.6207, + "step": 7203 + }, + { + "epoch": 1.285077156364285, + "grad_norm": 0.5866370797157288, + "learning_rate": 0.0001416823913131689, + "loss": 0.6251, + "step": 7204 + }, + { + "epoch": 1.2852555525822853, + "grad_norm": 0.5229966640472412, + "learning_rate": 0.00014161925095912986, + "loss": 0.6002, + "step": 7205 + }, + { + "epoch": 1.2854339488002855, + "grad_norm": 0.5066050887107849, + "learning_rate": 0.00014155611911726814, + "loss": 0.6648, + "step": 7206 + }, + { + "epoch": 1.2856123450182855, + "grad_norm": 0.547694981098175, + "learning_rate": 0.00014149299579254215, + "loss": 0.7228, + "step": 7207 + }, + { + "epoch": 1.2857907412362857, + "grad_norm": 0.475382000207901, + "learning_rate": 0.00014142988098990968, + "loss": 0.6569, + "step": 7208 + }, + { + "epoch": 1.2859691374542859, + "grad_norm": 0.4719434082508087, + "learning_rate": 0.00014136677471432755, + "loss": 0.6066, + "step": 7209 + }, + { + "epoch": 1.286147533672286, + "grad_norm": 0.4972936511039734, + "learning_rate": 0.00014130367697075225, + "loss": 0.6338, + "step": 7210 + }, + { + "epoch": 1.2863259298902863, + "grad_norm": 0.4694492220878601, + "learning_rate": 0.0001412405877641393, + "loss": 0.6117, + "step": 7211 + }, + { + "epoch": 1.2865043261082865, + "grad_norm": 0.4956999719142914, + "learning_rate": 0.00014117750709944388, + "loss": 0.706, + "step": 7212 + }, + { + "epoch": 1.2866827223262867, + "grad_norm": 0.49469372630119324, + "learning_rate": 0.00014111443498162013, + "loss": 0.7405, + "step": 7213 + }, + { + "epoch": 1.2868611185442869, + "grad_norm": 0.48914238810539246, + "learning_rate": 0.00014105137141562192, + "loss": 0.6415, + "step": 7214 + }, + { + "epoch": 1.287039514762287, + "grad_norm": 0.43339380621910095, + "learning_rate": 0.0001409883164064021, + "loss": 0.4761, + "step": 7215 + }, + { + "epoch": 1.2872179109802873, + "grad_norm": 0.5167938470840454, + "learning_rate": 0.0001409252699589129, + "loss": 0.7187, + "step": 7216 + }, + { + "epoch": 1.2873963071982875, + "grad_norm": 0.45789870619773865, + "learning_rate": 0.00014086223207810614, + "loss": 0.5403, + "step": 7217 + }, + { + "epoch": 1.2875747034162877, + "grad_norm": 0.49316656589508057, + "learning_rate": 0.00014079920276893263, + "loss": 0.6763, + "step": 7218 + }, + { + "epoch": 1.2877530996342879, + "grad_norm": 0.4639813303947449, + "learning_rate": 0.00014073618203634282, + "loss": 0.5242, + "step": 7219 + }, + { + "epoch": 1.287931495852288, + "grad_norm": 0.47487959265708923, + "learning_rate": 0.00014067316988528616, + "loss": 0.5721, + "step": 7220 + }, + { + "epoch": 1.288109892070288, + "grad_norm": 0.4365386664867401, + "learning_rate": 0.00014061016632071173, + "loss": 0.4473, + "step": 7221 + }, + { + "epoch": 1.2882882882882882, + "grad_norm": 0.5029173493385315, + "learning_rate": 0.0001405471713475678, + "loss": 0.7293, + "step": 7222 + }, + { + "epoch": 1.2884666845062884, + "grad_norm": 0.6281735301017761, + "learning_rate": 0.00014048418497080185, + "loss": 0.7677, + "step": 7223 + }, + { + "epoch": 1.2886450807242886, + "grad_norm": 0.5290473103523254, + "learning_rate": 0.000140421207195361, + "loss": 0.6217, + "step": 7224 + }, + { + "epoch": 1.2888234769422888, + "grad_norm": 0.4867291748523712, + "learning_rate": 0.00014035823802619127, + "loss": 0.6605, + "step": 7225 + }, + { + "epoch": 1.289001873160289, + "grad_norm": 0.5893452167510986, + "learning_rate": 0.00014029527746823846, + "loss": 0.6862, + "step": 7226 + }, + { + "epoch": 1.2891802693782892, + "grad_norm": 0.5156494975090027, + "learning_rate": 0.00014023232552644733, + "loss": 0.658, + "step": 7227 + }, + { + "epoch": 1.2893586655962894, + "grad_norm": 0.48713207244873047, + "learning_rate": 0.00014016938220576204, + "loss": 0.6116, + "step": 7228 + }, + { + "epoch": 1.2895370618142896, + "grad_norm": 0.4963529109954834, + "learning_rate": 0.00014010644751112628, + "loss": 0.5374, + "step": 7229 + }, + { + "epoch": 1.2897154580322896, + "grad_norm": 0.5015867948532104, + "learning_rate": 0.00014004352144748273, + "loss": 0.6357, + "step": 7230 + }, + { + "epoch": 1.2898938542502898, + "grad_norm": 0.5196330547332764, + "learning_rate": 0.0001399806040197738, + "loss": 0.6529, + "step": 7231 + }, + { + "epoch": 1.29007225046829, + "grad_norm": 0.6642472147941589, + "learning_rate": 0.00013991769523294078, + "loss": 0.6634, + "step": 7232 + }, + { + "epoch": 1.2902506466862902, + "grad_norm": 0.5459299087524414, + "learning_rate": 0.00013985479509192472, + "loss": 0.7802, + "step": 7233 + }, + { + "epoch": 1.2904290429042904, + "grad_norm": 0.5235913991928101, + "learning_rate": 0.00013979190360166566, + "loss": 0.6494, + "step": 7234 + }, + { + "epoch": 1.2906074391222906, + "grad_norm": 0.4864553213119507, + "learning_rate": 0.00013972902076710297, + "loss": 0.5321, + "step": 7235 + }, + { + "epoch": 1.2907858353402908, + "grad_norm": 0.4859999418258667, + "learning_rate": 0.0001396661465931755, + "loss": 0.5942, + "step": 7236 + }, + { + "epoch": 1.290964231558291, + "grad_norm": 0.4459374248981476, + "learning_rate": 0.00013960328108482146, + "loss": 0.5264, + "step": 7237 + }, + { + "epoch": 1.2911426277762912, + "grad_norm": 0.5186498165130615, + "learning_rate": 0.00013954042424697827, + "loss": 0.7167, + "step": 7238 + }, + { + "epoch": 1.2913210239942914, + "grad_norm": 0.49951282143592834, + "learning_rate": 0.00013947757608458262, + "loss": 0.5215, + "step": 7239 + }, + { + "epoch": 1.2914994202122916, + "grad_norm": 0.4620944857597351, + "learning_rate": 0.00013941473660257047, + "loss": 0.7108, + "step": 7240 + }, + { + "epoch": 1.2916778164302918, + "grad_norm": 0.4689721167087555, + "learning_rate": 0.00013935190580587745, + "loss": 0.6203, + "step": 7241 + }, + { + "epoch": 1.291856212648292, + "grad_norm": 0.43373170495033264, + "learning_rate": 0.00013928908369943802, + "loss": 0.5734, + "step": 7242 + }, + { + "epoch": 1.292034608866292, + "grad_norm": 0.48551568388938904, + "learning_rate": 0.00013922627028818642, + "loss": 0.6898, + "step": 7243 + }, + { + "epoch": 1.2922130050842922, + "grad_norm": 0.4706454873085022, + "learning_rate": 0.00013916346557705579, + "loss": 0.6658, + "step": 7244 + }, + { + "epoch": 1.2923914013022924, + "grad_norm": 0.47324392199516296, + "learning_rate": 0.00013910066957097895, + "loss": 0.6189, + "step": 7245 + }, + { + "epoch": 1.2925697975202926, + "grad_norm": 0.4348990321159363, + "learning_rate": 0.00013903788227488773, + "loss": 0.5601, + "step": 7246 + }, + { + "epoch": 1.2927481937382928, + "grad_norm": 0.4262705445289612, + "learning_rate": 0.00013897510369371359, + "loss": 0.4819, + "step": 7247 + }, + { + "epoch": 1.292926589956293, + "grad_norm": 0.5076077580451965, + "learning_rate": 0.000138912333832387, + "loss": 0.8249, + "step": 7248 + }, + { + "epoch": 1.2931049861742931, + "grad_norm": 0.44083496928215027, + "learning_rate": 0.00013884957269583777, + "loss": 0.5899, + "step": 7249 + }, + { + "epoch": 1.2932833823922933, + "grad_norm": 0.5185807347297668, + "learning_rate": 0.00013878682028899543, + "loss": 0.6274, + "step": 7250 + }, + { + "epoch": 1.2934617786102935, + "grad_norm": 0.46207600831985474, + "learning_rate": 0.00013872407661678825, + "loss": 0.6729, + "step": 7251 + }, + { + "epoch": 1.2936401748282935, + "grad_norm": 0.49033087491989136, + "learning_rate": 0.00013866134168414421, + "loss": 0.5321, + "step": 7252 + }, + { + "epoch": 1.2938185710462937, + "grad_norm": 0.728157103061676, + "learning_rate": 0.00013859861549599058, + "loss": 0.4031, + "step": 7253 + }, + { + "epoch": 1.293996967264294, + "grad_norm": 0.5120450258255005, + "learning_rate": 0.00013853589805725363, + "loss": 0.6634, + "step": 7254 + }, + { + "epoch": 1.2941753634822941, + "grad_norm": 0.4849170446395874, + "learning_rate": 0.00013847318937285942, + "loss": 0.6161, + "step": 7255 + }, + { + "epoch": 1.2943537597002943, + "grad_norm": 0.5624775886535645, + "learning_rate": 0.00013841048944773278, + "loss": 0.6795, + "step": 7256 + }, + { + "epoch": 1.2945321559182945, + "grad_norm": 0.5056706070899963, + "learning_rate": 0.00013834779828679838, + "loss": 0.7651, + "step": 7257 + }, + { + "epoch": 1.2947105521362947, + "grad_norm": 0.5136926174163818, + "learning_rate": 0.00013828511589497977, + "loss": 0.8467, + "step": 7258 + }, + { + "epoch": 1.294888948354295, + "grad_norm": 0.4533271789550781, + "learning_rate": 0.0001382224422772002, + "loss": 0.536, + "step": 7259 + }, + { + "epoch": 1.295067344572295, + "grad_norm": 0.4658295214176178, + "learning_rate": 0.00013815977743838188, + "loss": 0.5347, + "step": 7260 + }, + { + "epoch": 1.2952457407902953, + "grad_norm": 0.4228724539279938, + "learning_rate": 0.00013809712138344643, + "loss": 0.5316, + "step": 7261 + }, + { + "epoch": 1.2954241370082955, + "grad_norm": 0.4298241436481476, + "learning_rate": 0.000138034474117315, + "loss": 0.4455, + "step": 7262 + }, + { + "epoch": 1.2956025332262957, + "grad_norm": 0.4992910623550415, + "learning_rate": 0.00013797183564490773, + "loss": 0.6047, + "step": 7263 + }, + { + "epoch": 1.295780929444296, + "grad_norm": 0.49004730582237244, + "learning_rate": 0.00013790920597114433, + "loss": 0.6561, + "step": 7264 + }, + { + "epoch": 1.2959593256622959, + "grad_norm": 1.2394171953201294, + "learning_rate": 0.00013784658510094356, + "loss": 0.5966, + "step": 7265 + }, + { + "epoch": 1.296137721880296, + "grad_norm": 0.40918824076652527, + "learning_rate": 0.00013778397303922387, + "loss": 0.4365, + "step": 7266 + }, + { + "epoch": 1.2963161180982963, + "grad_norm": 0.521958589553833, + "learning_rate": 0.0001377213697909025, + "loss": 0.6175, + "step": 7267 + }, + { + "epoch": 1.2964945143162965, + "grad_norm": 0.594232976436615, + "learning_rate": 0.00013765877536089648, + "loss": 0.7357, + "step": 7268 + }, + { + "epoch": 1.2966729105342967, + "grad_norm": 0.6050662398338318, + "learning_rate": 0.00013759618975412198, + "loss": 0.8643, + "step": 7269 + }, + { + "epoch": 1.2968513067522969, + "grad_norm": 0.4976935088634491, + "learning_rate": 0.00013753361297549421, + "loss": 0.5672, + "step": 7270 + }, + { + "epoch": 1.297029702970297, + "grad_norm": 0.4690896272659302, + "learning_rate": 0.00013747104502992823, + "loss": 0.5142, + "step": 7271 + }, + { + "epoch": 1.2972080991882973, + "grad_norm": 0.5143970251083374, + "learning_rate": 0.00013740848592233785, + "loss": 0.6846, + "step": 7272 + }, + { + "epoch": 1.2973864954062975, + "grad_norm": 0.5174264907836914, + "learning_rate": 0.00013734593565763664, + "loss": 0.6434, + "step": 7273 + }, + { + "epoch": 1.2975648916242974, + "grad_norm": 0.4763343334197998, + "learning_rate": 0.00013728339424073715, + "loss": 0.6206, + "step": 7274 + }, + { + "epoch": 1.2977432878422976, + "grad_norm": 0.44902318716049194, + "learning_rate": 0.00013722086167655128, + "loss": 0.4841, + "step": 7275 + }, + { + "epoch": 1.2979216840602978, + "grad_norm": 0.6365222930908203, + "learning_rate": 0.0001371583379699905, + "loss": 0.7602, + "step": 7276 + }, + { + "epoch": 1.298100080278298, + "grad_norm": 0.49694737792015076, + "learning_rate": 0.0001370958231259652, + "loss": 0.5375, + "step": 7277 + }, + { + "epoch": 1.2982784764962982, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00013703331714938545, + "loss": 0.6576, + "step": 7278 + }, + { + "epoch": 1.2984568727142984, + "grad_norm": 0.45106565952301025, + "learning_rate": 0.00013697082004516026, + "loss": 0.5242, + "step": 7279 + }, + { + "epoch": 1.2986352689322986, + "grad_norm": 0.4997382164001465, + "learning_rate": 0.00013690833181819834, + "loss": 0.6324, + "step": 7280 + }, + { + "epoch": 1.2988136651502988, + "grad_norm": 0.4807584583759308, + "learning_rate": 0.00013684585247340734, + "loss": 0.6275, + "step": 7281 + }, + { + "epoch": 1.298992061368299, + "grad_norm": 0.5849037170410156, + "learning_rate": 0.00013678338201569422, + "loss": 0.7473, + "step": 7282 + }, + { + "epoch": 1.2991704575862992, + "grad_norm": 0.5092651844024658, + "learning_rate": 0.00013672092044996576, + "loss": 0.6442, + "step": 7283 + }, + { + "epoch": 1.2993488538042994, + "grad_norm": 0.45579296350479126, + "learning_rate": 0.00013665846778112734, + "loss": 0.4597, + "step": 7284 + }, + { + "epoch": 1.2995272500222996, + "grad_norm": 0.5193167924880981, + "learning_rate": 0.00013659602401408416, + "loss": 0.614, + "step": 7285 + }, + { + "epoch": 1.2997056462402998, + "grad_norm": 0.511915922164917, + "learning_rate": 0.0001365335891537405, + "loss": 0.69, + "step": 7286 + }, + { + "epoch": 1.2998840424582998, + "grad_norm": 0.47553443908691406, + "learning_rate": 0.0001364711632049998, + "loss": 0.5546, + "step": 7287 + }, + { + "epoch": 1.3000624386763, + "grad_norm": 0.4340362250804901, + "learning_rate": 0.00013640874617276523, + "loss": 0.4619, + "step": 7288 + }, + { + "epoch": 1.3002408348943002, + "grad_norm": 0.558949887752533, + "learning_rate": 0.00013634633806193868, + "loss": 1.1185, + "step": 7289 + }, + { + "epoch": 1.3004192311123004, + "grad_norm": 0.45011255145072937, + "learning_rate": 0.00013628393887742197, + "loss": 0.5726, + "step": 7290 + }, + { + "epoch": 1.3005976273303006, + "grad_norm": 0.44671180844306946, + "learning_rate": 0.00013622154862411568, + "loss": 0.549, + "step": 7291 + }, + { + "epoch": 1.3007760235483008, + "grad_norm": 0.5005772709846497, + "learning_rate": 0.00013615916730692006, + "loss": 0.6959, + "step": 7292 + }, + { + "epoch": 1.300954419766301, + "grad_norm": 0.5107164978981018, + "learning_rate": 0.00013609679493073435, + "loss": 0.5958, + "step": 7293 + }, + { + "epoch": 1.3011328159843012, + "grad_norm": 0.46262556314468384, + "learning_rate": 0.00013603443150045745, + "loss": 0.531, + "step": 7294 + }, + { + "epoch": 1.3013112122023014, + "grad_norm": 0.49760428071022034, + "learning_rate": 0.0001359720770209873, + "loss": 0.5535, + "step": 7295 + }, + { + "epoch": 1.3014896084203014, + "grad_norm": 0.5045909881591797, + "learning_rate": 0.00013590973149722103, + "loss": 0.737, + "step": 7296 + }, + { + "epoch": 1.3016680046383016, + "grad_norm": 0.4710780680179596, + "learning_rate": 0.00013584739493405546, + "loss": 0.6419, + "step": 7297 + }, + { + "epoch": 1.3018464008563018, + "grad_norm": 0.5229588747024536, + "learning_rate": 0.00013578506733638622, + "loss": 0.6623, + "step": 7298 + }, + { + "epoch": 1.302024797074302, + "grad_norm": 0.48090076446533203, + "learning_rate": 0.0001357227487091087, + "loss": 0.645, + "step": 7299 + }, + { + "epoch": 1.3022031932923022, + "grad_norm": 0.4656026363372803, + "learning_rate": 0.0001356604390571174, + "loss": 0.482, + "step": 7300 + }, + { + "epoch": 1.3023815895103024, + "grad_norm": 0.4530593454837799, + "learning_rate": 0.00013559813838530588, + "loss": 0.4968, + "step": 7301 + }, + { + "epoch": 1.3025599857283026, + "grad_norm": 0.5168888568878174, + "learning_rate": 0.0001355358466985675, + "loss": 0.6378, + "step": 7302 + }, + { + "epoch": 1.3027383819463028, + "grad_norm": 0.45193690061569214, + "learning_rate": 0.00013547356400179432, + "loss": 0.4601, + "step": 7303 + }, + { + "epoch": 1.302916778164303, + "grad_norm": 0.5073840618133545, + "learning_rate": 0.00013541129029987826, + "loss": 0.5461, + "step": 7304 + }, + { + "epoch": 1.3030951743823032, + "grad_norm": 0.5524479150772095, + "learning_rate": 0.00013534902559771, + "loss": 0.6818, + "step": 7305 + }, + { + "epoch": 1.3032735706003034, + "grad_norm": 0.5372066497802734, + "learning_rate": 0.00013528676990018007, + "loss": 0.7605, + "step": 7306 + }, + { + "epoch": 1.3034519668183036, + "grad_norm": 0.5457733869552612, + "learning_rate": 0.00013522452321217788, + "loss": 0.7541, + "step": 7307 + }, + { + "epoch": 1.3036303630363038, + "grad_norm": 0.5292448997497559, + "learning_rate": 0.00013516228553859212, + "loss": 0.6048, + "step": 7308 + }, + { + "epoch": 1.3038087592543037, + "grad_norm": 0.45932427048683167, + "learning_rate": 0.00013510005688431115, + "loss": 0.5494, + "step": 7309 + }, + { + "epoch": 1.303987155472304, + "grad_norm": 0.4594475030899048, + "learning_rate": 0.00013503783725422216, + "loss": 0.4971, + "step": 7310 + }, + { + "epoch": 1.3041655516903041, + "grad_norm": 0.5574771165847778, + "learning_rate": 0.00013497562665321206, + "loss": 0.5786, + "step": 7311 + }, + { + "epoch": 1.3043439479083043, + "grad_norm": 0.46712732315063477, + "learning_rate": 0.00013491342508616667, + "loss": 0.5883, + "step": 7312 + }, + { + "epoch": 1.3045223441263045, + "grad_norm": 0.47527366876602173, + "learning_rate": 0.00013485123255797132, + "loss": 0.6923, + "step": 7313 + }, + { + "epoch": 1.3047007403443047, + "grad_norm": 0.45830777287483215, + "learning_rate": 0.0001347890490735107, + "loss": 0.5369, + "step": 7314 + }, + { + "epoch": 1.304879136562305, + "grad_norm": 0.5052610039710999, + "learning_rate": 0.0001347268746376685, + "loss": 0.5744, + "step": 7315 + }, + { + "epoch": 1.3050575327803051, + "grad_norm": 0.4734193682670593, + "learning_rate": 0.00013466470925532808, + "loss": 0.5339, + "step": 7316 + }, + { + "epoch": 1.3052359289983053, + "grad_norm": 0.4872669279575348, + "learning_rate": 0.00013460255293137164, + "loss": 0.5804, + "step": 7317 + }, + { + "epoch": 1.3054143252163053, + "grad_norm": 0.5103761553764343, + "learning_rate": 0.00013454040567068113, + "loss": 0.6856, + "step": 7318 + }, + { + "epoch": 1.3055927214343055, + "grad_norm": 0.5222267508506775, + "learning_rate": 0.00013447826747813748, + "loss": 0.7514, + "step": 7319 + }, + { + "epoch": 1.3057711176523057, + "grad_norm": 0.4690699875354767, + "learning_rate": 0.0001344161383586209, + "loss": 0.6305, + "step": 7320 + }, + { + "epoch": 1.305949513870306, + "grad_norm": 0.5072498917579651, + "learning_rate": 0.00013435401831701115, + "loss": 0.6352, + "step": 7321 + }, + { + "epoch": 1.306127910088306, + "grad_norm": 0.4983615577220917, + "learning_rate": 0.00013429190735818696, + "loss": 0.5993, + "step": 7322 + }, + { + "epoch": 1.3063063063063063, + "grad_norm": 0.5258157253265381, + "learning_rate": 0.0001342298054870267, + "loss": 0.7972, + "step": 7323 + }, + { + "epoch": 1.3064847025243065, + "grad_norm": 0.4537825882434845, + "learning_rate": 0.00013416771270840751, + "loss": 0.5255, + "step": 7324 + }, + { + "epoch": 1.3066630987423067, + "grad_norm": 0.47432446479797363, + "learning_rate": 0.00013410562902720647, + "loss": 0.4699, + "step": 7325 + }, + { + "epoch": 1.3068414949603069, + "grad_norm": 0.5209951996803284, + "learning_rate": 0.00013404355444829934, + "loss": 0.6747, + "step": 7326 + }, + { + "epoch": 1.307019891178307, + "grad_norm": 0.4715597927570343, + "learning_rate": 0.00013398148897656164, + "loss": 0.5054, + "step": 7327 + }, + { + "epoch": 1.3071982873963073, + "grad_norm": 0.8090279698371887, + "learning_rate": 0.00013391943261686782, + "loss": 0.4708, + "step": 7328 + }, + { + "epoch": 1.3073766836143075, + "grad_norm": 0.500850260257721, + "learning_rate": 0.00013385738537409174, + "loss": 0.6052, + "step": 7329 + }, + { + "epoch": 1.3075550798323077, + "grad_norm": 0.5126155018806458, + "learning_rate": 0.00013379534725310678, + "loss": 0.5697, + "step": 7330 + }, + { + "epoch": 1.3077334760503077, + "grad_norm": 0.5333918929100037, + "learning_rate": 0.00013373331825878516, + "loss": 0.695, + "step": 7331 + }, + { + "epoch": 1.3079118722683079, + "grad_norm": 0.5330007076263428, + "learning_rate": 0.00013367129839599872, + "loss": 0.5267, + "step": 7332 + }, + { + "epoch": 1.308090268486308, + "grad_norm": 0.5808795690536499, + "learning_rate": 0.0001336092876696185, + "loss": 0.8733, + "step": 7333 + }, + { + "epoch": 1.3082686647043082, + "grad_norm": 0.5341413021087646, + "learning_rate": 0.0001335472860845146, + "loss": 0.778, + "step": 7334 + }, + { + "epoch": 1.3084470609223084, + "grad_norm": 0.4854552447795868, + "learning_rate": 0.00013348529364555685, + "loss": 0.5773, + "step": 7335 + }, + { + "epoch": 1.3086254571403086, + "grad_norm": 0.4257410168647766, + "learning_rate": 0.0001334233103576139, + "loss": 0.5018, + "step": 7336 + }, + { + "epoch": 1.3088038533583088, + "grad_norm": 0.5281109809875488, + "learning_rate": 0.0001333613362255541, + "loss": 0.633, + "step": 7337 + }, + { + "epoch": 1.308982249576309, + "grad_norm": 0.45057862997055054, + "learning_rate": 0.00013329937125424466, + "loss": 0.5499, + "step": 7338 + }, + { + "epoch": 1.3091606457943092, + "grad_norm": 0.568513035774231, + "learning_rate": 0.00013323741544855246, + "loss": 0.7561, + "step": 7339 + }, + { + "epoch": 1.3093390420123092, + "grad_norm": 0.4433366358280182, + "learning_rate": 0.00013317546881334342, + "loss": 0.4844, + "step": 7340 + }, + { + "epoch": 1.3095174382303094, + "grad_norm": 0.4924166798591614, + "learning_rate": 0.00013311353135348267, + "loss": 0.5766, + "step": 7341 + }, + { + "epoch": 1.3096958344483096, + "grad_norm": 0.531856894493103, + "learning_rate": 0.00013305160307383495, + "loss": 0.6314, + "step": 7342 + }, + { + "epoch": 1.3098742306663098, + "grad_norm": 0.5930870175361633, + "learning_rate": 0.00013298968397926398, + "loss": 0.7782, + "step": 7343 + }, + { + "epoch": 1.31005262688431, + "grad_norm": 0.5085564255714417, + "learning_rate": 0.0001329277740746328, + "loss": 0.6933, + "step": 7344 + }, + { + "epoch": 1.3102310231023102, + "grad_norm": 0.5098043084144592, + "learning_rate": 0.000132865873364804, + "loss": 0.6053, + "step": 7345 + }, + { + "epoch": 1.3104094193203104, + "grad_norm": 0.49799683690071106, + "learning_rate": 0.00013280398185463898, + "loss": 0.6819, + "step": 7346 + }, + { + "epoch": 1.3105878155383106, + "grad_norm": 0.4547264277935028, + "learning_rate": 0.00013274209954899888, + "loss": 0.4916, + "step": 7347 + }, + { + "epoch": 1.3107662117563108, + "grad_norm": 0.48524683713912964, + "learning_rate": 0.00013268022645274375, + "loss": 0.7242, + "step": 7348 + }, + { + "epoch": 1.310944607974311, + "grad_norm": 0.4403549134731293, + "learning_rate": 0.00013261836257073324, + "loss": 0.5146, + "step": 7349 + }, + { + "epoch": 1.3111230041923112, + "grad_norm": 0.4839078485965729, + "learning_rate": 0.00013255650790782591, + "loss": 0.5517, + "step": 7350 + }, + { + "epoch": 1.3113014004103114, + "grad_norm": 0.4646989107131958, + "learning_rate": 0.00013249466246888, + "loss": 0.5391, + "step": 7351 + }, + { + "epoch": 1.3114797966283116, + "grad_norm": 0.46415239572525024, + "learning_rate": 0.00013243282625875267, + "loss": 0.6404, + "step": 7352 + }, + { + "epoch": 1.3116581928463116, + "grad_norm": 0.5185125470161438, + "learning_rate": 0.00013237099928230066, + "loss": 0.7389, + "step": 7353 + }, + { + "epoch": 1.3118365890643118, + "grad_norm": 0.4275851547718048, + "learning_rate": 0.0001323091815443797, + "loss": 0.5303, + "step": 7354 + }, + { + "epoch": 1.312014985282312, + "grad_norm": 0.41446274518966675, + "learning_rate": 0.00013224737304984494, + "loss": 0.4631, + "step": 7355 + }, + { + "epoch": 1.3121933815003122, + "grad_norm": 0.4250105917453766, + "learning_rate": 0.0001321855738035509, + "loss": 0.483, + "step": 7356 + }, + { + "epoch": 1.3123717777183124, + "grad_norm": 0.501487135887146, + "learning_rate": 0.0001321237838103511, + "loss": 0.6878, + "step": 7357 + }, + { + "epoch": 1.3125501739363126, + "grad_norm": 0.5788127183914185, + "learning_rate": 0.0001320620030750987, + "loss": 0.5583, + "step": 7358 + }, + { + "epoch": 1.3127285701543128, + "grad_norm": 0.5718269348144531, + "learning_rate": 0.00013200023160264568, + "loss": 0.7069, + "step": 7359 + }, + { + "epoch": 1.312906966372313, + "grad_norm": 0.5326660871505737, + "learning_rate": 0.00013193846939784374, + "loss": 0.7465, + "step": 7360 + }, + { + "epoch": 1.3130853625903132, + "grad_norm": 0.5201784372329712, + "learning_rate": 0.00013187671646554367, + "loss": 0.6788, + "step": 7361 + }, + { + "epoch": 1.3132637588083131, + "grad_norm": 0.5949232578277588, + "learning_rate": 0.0001318149728105954, + "loss": 0.6459, + "step": 7362 + }, + { + "epoch": 1.3134421550263133, + "grad_norm": 0.4743814766407013, + "learning_rate": 0.00013175323843784837, + "loss": 0.705, + "step": 7363 + }, + { + "epoch": 1.3136205512443135, + "grad_norm": 0.4792519211769104, + "learning_rate": 0.00013169151335215101, + "loss": 0.6492, + "step": 7364 + }, + { + "epoch": 1.3137989474623137, + "grad_norm": 0.4962617754936218, + "learning_rate": 0.00013162979755835142, + "loss": 0.5276, + "step": 7365 + }, + { + "epoch": 1.313977343680314, + "grad_norm": 0.438672810792923, + "learning_rate": 0.00013156809106129656, + "loss": 0.4126, + "step": 7366 + }, + { + "epoch": 1.3141557398983141, + "grad_norm": 0.43237635493278503, + "learning_rate": 0.00013150639386583278, + "loss": 0.5041, + "step": 7367 + }, + { + "epoch": 1.3143341361163143, + "grad_norm": 0.4956110119819641, + "learning_rate": 0.00013144470597680592, + "loss": 0.6361, + "step": 7368 + }, + { + "epoch": 1.3145125323343145, + "grad_norm": 0.4873587191104889, + "learning_rate": 0.00013138302739906072, + "loss": 0.7563, + "step": 7369 + }, + { + "epoch": 1.3146909285523147, + "grad_norm": 0.4835222363471985, + "learning_rate": 0.0001313213581374416, + "loss": 0.5999, + "step": 7370 + }, + { + "epoch": 1.314869324770315, + "grad_norm": 0.4809320867061615, + "learning_rate": 0.00013125969819679188, + "loss": 0.6086, + "step": 7371 + }, + { + "epoch": 1.3150477209883151, + "grad_norm": 0.4588382840156555, + "learning_rate": 0.00013119804758195442, + "loss": 0.4699, + "step": 7372 + }, + { + "epoch": 1.3152261172063153, + "grad_norm": 0.4489077031612396, + "learning_rate": 0.00013113640629777113, + "loss": 0.3956, + "step": 7373 + }, + { + "epoch": 1.3154045134243155, + "grad_norm": 0.5158323049545288, + "learning_rate": 0.0001310747743490833, + "loss": 0.6929, + "step": 7374 + }, + { + "epoch": 1.3155829096423155, + "grad_norm": 0.5298815965652466, + "learning_rate": 0.0001310131517407316, + "loss": 0.7762, + "step": 7375 + }, + { + "epoch": 1.3157613058603157, + "grad_norm": 0.5620563626289368, + "learning_rate": 0.0001309515384775557, + "loss": 0.5016, + "step": 7376 + }, + { + "epoch": 1.315939702078316, + "grad_norm": 0.5272053480148315, + "learning_rate": 0.0001308899345643948, + "loss": 0.8066, + "step": 7377 + }, + { + "epoch": 1.316118098296316, + "grad_norm": 0.4727185368537903, + "learning_rate": 0.00013082834000608724, + "loss": 0.5287, + "step": 7378 + }, + { + "epoch": 1.3162964945143163, + "grad_norm": 0.4828498959541321, + "learning_rate": 0.00013076675480747042, + "loss": 0.6108, + "step": 7379 + }, + { + "epoch": 1.3164748907323165, + "grad_norm": 0.45956501364707947, + "learning_rate": 0.00013070517897338147, + "loss": 0.5884, + "step": 7380 + }, + { + "epoch": 1.3166532869503167, + "grad_norm": 0.49907490611076355, + "learning_rate": 0.00013064361250865637, + "loss": 0.5318, + "step": 7381 + }, + { + "epoch": 1.316831683168317, + "grad_norm": 0.48141568899154663, + "learning_rate": 0.0001305820554181306, + "loss": 0.7263, + "step": 7382 + }, + { + "epoch": 1.317010079386317, + "grad_norm": 0.4059731960296631, + "learning_rate": 0.0001305205077066388, + "loss": 0.4103, + "step": 7383 + }, + { + "epoch": 1.317188475604317, + "grad_norm": 0.48073357343673706, + "learning_rate": 0.00013045896937901496, + "loss": 0.5717, + "step": 7384 + }, + { + "epoch": 1.3173668718223173, + "grad_norm": 0.4782210886478424, + "learning_rate": 0.00013039744044009212, + "loss": 0.6653, + "step": 7385 + }, + { + "epoch": 1.3175452680403175, + "grad_norm": 0.5318115949630737, + "learning_rate": 0.00013033592089470295, + "loss": 0.5354, + "step": 7386 + }, + { + "epoch": 1.3177236642583177, + "grad_norm": 0.5100518465042114, + "learning_rate": 0.00013027441074767903, + "loss": 0.6491, + "step": 7387 + }, + { + "epoch": 1.3179020604763179, + "grad_norm": 0.4386556148529053, + "learning_rate": 0.00013021291000385132, + "loss": 0.4745, + "step": 7388 + }, + { + "epoch": 1.318080456694318, + "grad_norm": 0.5141617655754089, + "learning_rate": 0.0001301514186680502, + "loss": 0.5839, + "step": 7389 + }, + { + "epoch": 1.3182588529123183, + "grad_norm": 7.867745399475098, + "learning_rate": 0.00013008993674510483, + "loss": 0.5498, + "step": 7390 + }, + { + "epoch": 1.3184372491303185, + "grad_norm": 0.5267961621284485, + "learning_rate": 0.00013002846423984448, + "loss": 0.6516, + "step": 7391 + }, + { + "epoch": 1.3186156453483187, + "grad_norm": 0.47720450162887573, + "learning_rate": 0.00012996700115709692, + "loss": 0.5947, + "step": 7392 + }, + { + "epoch": 1.3187940415663189, + "grad_norm": 0.514872133731842, + "learning_rate": 0.00012990554750168931, + "loss": 0.6416, + "step": 7393 + }, + { + "epoch": 1.318972437784319, + "grad_norm": 0.443766713142395, + "learning_rate": 0.00012984410327844843, + "loss": 0.4777, + "step": 7394 + }, + { + "epoch": 1.3191508340023193, + "grad_norm": 0.42529335618019104, + "learning_rate": 0.00012978266849219985, + "loss": 0.5089, + "step": 7395 + }, + { + "epoch": 1.3193292302203194, + "grad_norm": 0.540155291557312, + "learning_rate": 0.00012972124314776886, + "loss": 0.7208, + "step": 7396 + }, + { + "epoch": 1.3195076264383194, + "grad_norm": 0.4460678994655609, + "learning_rate": 0.0001296598272499796, + "loss": 0.5367, + "step": 7397 + }, + { + "epoch": 1.3196860226563196, + "grad_norm": 0.5822709798812866, + "learning_rate": 0.0001295984208036558, + "loss": 0.5829, + "step": 7398 + }, + { + "epoch": 1.3198644188743198, + "grad_norm": 0.4621860086917877, + "learning_rate": 0.00012953702381362023, + "loss": 0.5372, + "step": 7399 + }, + { + "epoch": 1.32004281509232, + "grad_norm": 0.5000967979431152, + "learning_rate": 0.00012947563628469487, + "loss": 0.5573, + "step": 7400 + }, + { + "epoch": 1.3202212113103202, + "grad_norm": 0.4751860797405243, + "learning_rate": 0.00012941425822170124, + "loss": 0.6386, + "step": 7401 + }, + { + "epoch": 1.3203996075283204, + "grad_norm": 0.5089471340179443, + "learning_rate": 0.0001293528896294598, + "loss": 0.664, + "step": 7402 + }, + { + "epoch": 1.3205780037463206, + "grad_norm": 0.5417841076850891, + "learning_rate": 0.00012929153051279062, + "loss": 0.5501, + "step": 7403 + }, + { + "epoch": 1.3207563999643208, + "grad_norm": 0.5360578298568726, + "learning_rate": 0.00012923018087651256, + "loss": 0.6679, + "step": 7404 + }, + { + "epoch": 1.320934796182321, + "grad_norm": 0.48479247093200684, + "learning_rate": 0.0001291688407254441, + "loss": 0.4484, + "step": 7405 + }, + { + "epoch": 1.321113192400321, + "grad_norm": 0.5442363023757935, + "learning_rate": 0.000129107510064403, + "loss": 0.6294, + "step": 7406 + }, + { + "epoch": 1.3212915886183212, + "grad_norm": 0.41899165511131287, + "learning_rate": 0.00012904618889820595, + "loss": 0.5342, + "step": 7407 + }, + { + "epoch": 1.3214699848363214, + "grad_norm": 0.49729251861572266, + "learning_rate": 0.0001289848772316693, + "loss": 0.7296, + "step": 7408 + }, + { + "epoch": 1.3216483810543216, + "grad_norm": 0.49834224581718445, + "learning_rate": 0.00012892357506960817, + "loss": 0.654, + "step": 7409 + }, + { + "epoch": 1.3218267772723218, + "grad_norm": 0.47109490633010864, + "learning_rate": 0.0001288622824168375, + "loss": 0.6164, + "step": 7410 + }, + { + "epoch": 1.322005173490322, + "grad_norm": 0.43739229440689087, + "learning_rate": 0.0001288009992781709, + "loss": 0.5467, + "step": 7411 + }, + { + "epoch": 1.3221835697083222, + "grad_norm": 0.4927254617214203, + "learning_rate": 0.00012873972565842173, + "loss": 0.5437, + "step": 7412 + }, + { + "epoch": 1.3223619659263224, + "grad_norm": 0.4604239761829376, + "learning_rate": 0.00012867846156240238, + "loss": 0.5555, + "step": 7413 + }, + { + "epoch": 1.3225403621443226, + "grad_norm": 0.4850262999534607, + "learning_rate": 0.00012861720699492435, + "loss": 0.5776, + "step": 7414 + }, + { + "epoch": 1.3227187583623228, + "grad_norm": 0.5016975998878479, + "learning_rate": 0.00012855596196079873, + "loss": 0.7419, + "step": 7415 + }, + { + "epoch": 1.322897154580323, + "grad_norm": 0.49091964960098267, + "learning_rate": 0.0001284947264648355, + "loss": 0.7162, + "step": 7416 + }, + { + "epoch": 1.3230755507983232, + "grad_norm": 0.4609431326389313, + "learning_rate": 0.00012843350051184425, + "loss": 0.5136, + "step": 7417 + }, + { + "epoch": 1.3232539470163234, + "grad_norm": 0.4845166802406311, + "learning_rate": 0.00012837228410663348, + "loss": 0.5677, + "step": 7418 + }, + { + "epoch": 1.3234323432343233, + "grad_norm": 0.5238844156265259, + "learning_rate": 0.00012831107725401125, + "loss": 0.6283, + "step": 7419 + }, + { + "epoch": 1.3236107394523235, + "grad_norm": 0.5406588912010193, + "learning_rate": 0.00012824987995878456, + "loss": 0.5869, + "step": 7420 + }, + { + "epoch": 1.3237891356703237, + "grad_norm": 0.526140570640564, + "learning_rate": 0.0001281886922257599, + "loss": 0.7051, + "step": 7421 + }, + { + "epoch": 1.323967531888324, + "grad_norm": 0.4669652581214905, + "learning_rate": 0.00012812751405974306, + "loss": 0.5571, + "step": 7422 + }, + { + "epoch": 1.3241459281063241, + "grad_norm": 0.47372832894325256, + "learning_rate": 0.0001280663454655387, + "loss": 0.5887, + "step": 7423 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.4857545793056488, + "learning_rate": 0.00012800518644795117, + "loss": 0.6542, + "step": 7424 + }, + { + "epoch": 1.3245027205423245, + "grad_norm": 0.5294866561889648, + "learning_rate": 0.0001279440370117838, + "loss": 0.6887, + "step": 7425 + }, + { + "epoch": 1.3246811167603247, + "grad_norm": 0.5368157029151917, + "learning_rate": 0.00012788289716183918, + "loss": 0.6384, + "step": 7426 + }, + { + "epoch": 1.324859512978325, + "grad_norm": 0.4165150225162506, + "learning_rate": 0.00012782176690291936, + "loss": 0.4424, + "step": 7427 + }, + { + "epoch": 1.325037909196325, + "grad_norm": 0.5419753789901733, + "learning_rate": 0.00012776064623982525, + "loss": 0.65, + "step": 7428 + }, + { + "epoch": 1.3252163054143251, + "grad_norm": 0.5392053127288818, + "learning_rate": 0.0001276995351773575, + "loss": 0.6592, + "step": 7429 + }, + { + "epoch": 1.3253947016323253, + "grad_norm": 0.49515146017074585, + "learning_rate": 0.00012763843372031554, + "loss": 0.6608, + "step": 7430 + }, + { + "epoch": 1.3255730978503255, + "grad_norm": 0.4864502549171448, + "learning_rate": 0.00012757734187349843, + "loss": 0.5878, + "step": 7431 + }, + { + "epoch": 1.3257514940683257, + "grad_norm": 0.5112424492835999, + "learning_rate": 0.0001275162596417041, + "loss": 0.4266, + "step": 7432 + }, + { + "epoch": 1.325929890286326, + "grad_norm": 0.5315030813217163, + "learning_rate": 0.00012745518702973014, + "loss": 0.7017, + "step": 7433 + }, + { + "epoch": 1.326108286504326, + "grad_norm": 0.5505911111831665, + "learning_rate": 0.00012739412404237305, + "loss": 0.7733, + "step": 7434 + }, + { + "epoch": 1.3262866827223263, + "grad_norm": 0.5162804126739502, + "learning_rate": 0.00012733307068442862, + "loss": 0.6944, + "step": 7435 + }, + { + "epoch": 1.3264650789403265, + "grad_norm": 0.4766071140766144, + "learning_rate": 0.000127272026960692, + "loss": 0.573, + "step": 7436 + }, + { + "epoch": 1.3266434751583267, + "grad_norm": 0.4825485944747925, + "learning_rate": 0.00012721099287595766, + "loss": 0.6408, + "step": 7437 + }, + { + "epoch": 1.326821871376327, + "grad_norm": 0.40906190872192383, + "learning_rate": 0.00012714996843501904, + "loss": 0.507, + "step": 7438 + }, + { + "epoch": 1.327000267594327, + "grad_norm": 0.5242511630058289, + "learning_rate": 0.0001270889536426691, + "loss": 0.7517, + "step": 7439 + }, + { + "epoch": 1.3271786638123273, + "grad_norm": 0.5269128084182739, + "learning_rate": 0.00012702794850369975, + "loss": 0.7014, + "step": 7440 + }, + { + "epoch": 1.3273570600303275, + "grad_norm": 0.5377137064933777, + "learning_rate": 0.00012696695302290251, + "loss": 0.7327, + "step": 7441 + }, + { + "epoch": 1.3275354562483275, + "grad_norm": 0.4645613729953766, + "learning_rate": 0.00012690596720506776, + "loss": 0.5923, + "step": 7442 + }, + { + "epoch": 1.3277138524663277, + "grad_norm": 0.5495232343673706, + "learning_rate": 0.00012684499105498543, + "loss": 0.7983, + "step": 7443 + }, + { + "epoch": 1.3278922486843279, + "grad_norm": 0.5172943472862244, + "learning_rate": 0.00012678402457744442, + "loss": 0.5517, + "step": 7444 + }, + { + "epoch": 1.328070644902328, + "grad_norm": 0.44186466932296753, + "learning_rate": 0.0001267230677772332, + "loss": 0.4636, + "step": 7445 + }, + { + "epoch": 1.3282490411203283, + "grad_norm": 0.4601621925830841, + "learning_rate": 0.00012666212065913922, + "loss": 0.5381, + "step": 7446 + }, + { + "epoch": 1.3284274373383285, + "grad_norm": 0.540649950504303, + "learning_rate": 0.00012660118322794907, + "loss": 0.584, + "step": 7447 + }, + { + "epoch": 1.3286058335563287, + "grad_norm": 0.45804736018180847, + "learning_rate": 0.000126540255488449, + "loss": 0.5367, + "step": 7448 + }, + { + "epoch": 1.3287842297743289, + "grad_norm": 0.46976906061172485, + "learning_rate": 0.000126479337445424, + "loss": 0.5498, + "step": 7449 + }, + { + "epoch": 1.3289626259923288, + "grad_norm": 0.42822468280792236, + "learning_rate": 0.0001264184291036588, + "loss": 0.519, + "step": 7450 + }, + { + "epoch": 1.329141022210329, + "grad_norm": 0.5430753827095032, + "learning_rate": 0.00012635753046793692, + "loss": 0.7222, + "step": 7451 + }, + { + "epoch": 1.3293194184283292, + "grad_norm": 0.4692500829696655, + "learning_rate": 0.00012629664154304137, + "loss": 0.4665, + "step": 7452 + }, + { + "epoch": 1.3294978146463294, + "grad_norm": 0.45048218965530396, + "learning_rate": 0.00012623576233375449, + "loss": 0.4468, + "step": 7453 + }, + { + "epoch": 1.3296762108643296, + "grad_norm": 0.5404579043388367, + "learning_rate": 0.00012617489284485746, + "loss": 0.6637, + "step": 7454 + }, + { + "epoch": 1.3298546070823298, + "grad_norm": 0.5273029208183289, + "learning_rate": 0.00012611403308113113, + "loss": 0.7473, + "step": 7455 + }, + { + "epoch": 1.33003300330033, + "grad_norm": 0.4636683762073517, + "learning_rate": 0.00012605318304735524, + "loss": 0.4647, + "step": 7456 + }, + { + "epoch": 1.3302113995183302, + "grad_norm": 0.5994560718536377, + "learning_rate": 0.00012599234274830913, + "loss": 0.7458, + "step": 7457 + }, + { + "epoch": 1.3303897957363304, + "grad_norm": 0.4628642201423645, + "learning_rate": 0.00012593151218877105, + "loss": 0.523, + "step": 7458 + }, + { + "epoch": 1.3305681919543306, + "grad_norm": 0.5867023468017578, + "learning_rate": 0.00012587069137351853, + "loss": 0.8639, + "step": 7459 + }, + { + "epoch": 1.3307465881723308, + "grad_norm": 0.5695323944091797, + "learning_rate": 0.00012580988030732858, + "loss": 0.7051, + "step": 7460 + }, + { + "epoch": 1.330924984390331, + "grad_norm": 0.5253010392189026, + "learning_rate": 0.00012574907899497707, + "loss": 0.7779, + "step": 7461 + }, + { + "epoch": 1.3311033806083312, + "grad_norm": 0.4339081346988678, + "learning_rate": 0.00012568828744123956, + "loss": 0.4361, + "step": 7462 + }, + { + "epoch": 1.3312817768263314, + "grad_norm": 0.4968469738960266, + "learning_rate": 0.0001256275056508903, + "loss": 0.6992, + "step": 7463 + }, + { + "epoch": 1.3314601730443314, + "grad_norm": 0.4775252044200897, + "learning_rate": 0.00012556673362870338, + "loss": 0.582, + "step": 7464 + }, + { + "epoch": 1.3316385692623316, + "grad_norm": 0.482811838388443, + "learning_rate": 0.00012550597137945152, + "loss": 0.6071, + "step": 7465 + }, + { + "epoch": 1.3318169654803318, + "grad_norm": 0.4188464879989624, + "learning_rate": 0.00012544521890790712, + "loss": 0.4437, + "step": 7466 + }, + { + "epoch": 1.331995361698332, + "grad_norm": 0.49079805612564087, + "learning_rate": 0.0001253844762188417, + "loss": 0.5477, + "step": 7467 + }, + { + "epoch": 1.3321737579163322, + "grad_norm": 0.4884779751300812, + "learning_rate": 0.00012532374331702584, + "loss": 0.5643, + "step": 7468 + }, + { + "epoch": 1.3323521541343324, + "grad_norm": 0.9924476742744446, + "learning_rate": 0.00012526302020722958, + "loss": 0.5498, + "step": 7469 + }, + { + "epoch": 1.3325305503523326, + "grad_norm": 0.4724234640598297, + "learning_rate": 0.00012520230689422196, + "loss": 0.5189, + "step": 7470 + }, + { + "epoch": 1.3327089465703328, + "grad_norm": 0.4852631390094757, + "learning_rate": 0.00012514160338277154, + "loss": 0.515, + "step": 7471 + }, + { + "epoch": 1.3328873427883328, + "grad_norm": 0.4969404637813568, + "learning_rate": 0.00012508090967764586, + "loss": 0.6241, + "step": 7472 + }, + { + "epoch": 1.333065739006333, + "grad_norm": 0.47249653935432434, + "learning_rate": 0.00012502022578361166, + "loss": 0.6354, + "step": 7473 + }, + { + "epoch": 1.3332441352243332, + "grad_norm": 0.4671791195869446, + "learning_rate": 0.00012495955170543528, + "loss": 0.5399, + "step": 7474 + }, + { + "epoch": 1.3334225314423334, + "grad_norm": 0.45596200227737427, + "learning_rate": 0.00012489888744788178, + "loss": 0.4377, + "step": 7475 + }, + { + "epoch": 1.3336009276603336, + "grad_norm": 0.48205843567848206, + "learning_rate": 0.00012483823301571593, + "loss": 0.6145, + "step": 7476 + }, + { + "epoch": 1.3337793238783338, + "grad_norm": 0.6052742004394531, + "learning_rate": 0.0001247775884137013, + "loss": 0.7105, + "step": 7477 + }, + { + "epoch": 1.333957720096334, + "grad_norm": 0.4180543124675751, + "learning_rate": 0.00012471695364660106, + "loss": 0.6102, + "step": 7478 + }, + { + "epoch": 1.3341361163143342, + "grad_norm": 0.5577853322029114, + "learning_rate": 0.0001246563287191774, + "loss": 0.5936, + "step": 7479 + }, + { + "epoch": 1.3343145125323344, + "grad_norm": 0.5047107934951782, + "learning_rate": 0.00012459571363619167, + "loss": 0.5753, + "step": 7480 + }, + { + "epoch": 1.3344929087503345, + "grad_norm": 0.4621174931526184, + "learning_rate": 0.00012453510840240457, + "loss": 0.5743, + "step": 7481 + }, + { + "epoch": 1.3346713049683347, + "grad_norm": 0.456528902053833, + "learning_rate": 0.00012447451302257607, + "loss": 0.5022, + "step": 7482 + }, + { + "epoch": 1.334849701186335, + "grad_norm": 0.43456900119781494, + "learning_rate": 0.00012441392750146542, + "loss": 0.4349, + "step": 7483 + }, + { + "epoch": 1.3350280974043351, + "grad_norm": 0.46913912892341614, + "learning_rate": 0.00012435335184383085, + "loss": 0.709, + "step": 7484 + }, + { + "epoch": 1.3352064936223353, + "grad_norm": 0.42998531460762024, + "learning_rate": 0.00012429278605442988, + "loss": 0.5149, + "step": 7485 + }, + { + "epoch": 1.3353848898403353, + "grad_norm": 0.5419842600822449, + "learning_rate": 0.00012423223013801945, + "loss": 0.7627, + "step": 7486 + }, + { + "epoch": 1.3355632860583355, + "grad_norm": 0.4358643889427185, + "learning_rate": 0.00012417168409935547, + "loss": 0.4494, + "step": 7487 + }, + { + "epoch": 1.3357416822763357, + "grad_norm": 0.6511504650115967, + "learning_rate": 0.00012411114794319336, + "loss": 0.7345, + "step": 7488 + }, + { + "epoch": 1.335920078494336, + "grad_norm": 0.40742728114128113, + "learning_rate": 0.00012405062167428744, + "loss": 0.4618, + "step": 7489 + }, + { + "epoch": 1.3360984747123361, + "grad_norm": 0.4425670802593231, + "learning_rate": 0.00012399010529739158, + "loss": 0.5878, + "step": 7490 + }, + { + "epoch": 1.3362768709303363, + "grad_norm": 0.4261229634284973, + "learning_rate": 0.00012392959881725853, + "loss": 0.4344, + "step": 7491 + }, + { + "epoch": 1.3364552671483365, + "grad_norm": 0.5334789156913757, + "learning_rate": 0.00012386910223864062, + "loss": 0.6877, + "step": 7492 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.4170753061771393, + "learning_rate": 0.00012380861556628915, + "loss": 0.4582, + "step": 7493 + }, + { + "epoch": 1.3368120595843367, + "grad_norm": 0.4649193286895752, + "learning_rate": 0.0001237481388049546, + "loss": 0.5534, + "step": 7494 + }, + { + "epoch": 1.3369904558023369, + "grad_norm": 0.4809529483318329, + "learning_rate": 0.00012368767195938701, + "loss": 0.4642, + "step": 7495 + }, + { + "epoch": 1.337168852020337, + "grad_norm": 0.467174232006073, + "learning_rate": 0.00012362721503433521, + "loss": 0.5001, + "step": 7496 + }, + { + "epoch": 1.3373472482383373, + "grad_norm": 0.5081356167793274, + "learning_rate": 0.00012356676803454758, + "loss": 0.6698, + "step": 7497 + }, + { + "epoch": 1.3375256444563375, + "grad_norm": 0.5005205273628235, + "learning_rate": 0.00012350633096477165, + "loss": 0.6773, + "step": 7498 + }, + { + "epoch": 1.3377040406743377, + "grad_norm": 0.5624291896820068, + "learning_rate": 0.00012344590382975395, + "loss": 0.7196, + "step": 7499 + }, + { + "epoch": 1.3378824368923379, + "grad_norm": 0.5750653147697449, + "learning_rate": 0.00012338548663424063, + "loss": 0.7924, + "step": 7500 + }, + { + "epoch": 1.338060833110338, + "grad_norm": 0.47942960262298584, + "learning_rate": 0.00012332507938297657, + "loss": 0.5147, + "step": 7501 + }, + { + "epoch": 1.3382392293283383, + "grad_norm": 0.4365304112434387, + "learning_rate": 0.0001232646820807064, + "loss": 0.5417, + "step": 7502 + }, + { + "epoch": 1.3384176255463385, + "grad_norm": 0.4866253435611725, + "learning_rate": 0.0001232042947321734, + "loss": 0.5912, + "step": 7503 + }, + { + "epoch": 1.3385960217643387, + "grad_norm": 0.42192235589027405, + "learning_rate": 0.00012314391734212068, + "loss": 0.4446, + "step": 7504 + }, + { + "epoch": 1.3387744179823389, + "grad_norm": 0.45548638701438904, + "learning_rate": 0.00012308354991529008, + "loss": 0.5527, + "step": 7505 + }, + { + "epoch": 1.338952814200339, + "grad_norm": 0.4403938949108124, + "learning_rate": 0.00012302319245642278, + "loss": 0.4453, + "step": 7506 + }, + { + "epoch": 1.3391312104183393, + "grad_norm": 0.4472365975379944, + "learning_rate": 0.00012296284497025938, + "loss": 0.485, + "step": 7507 + }, + { + "epoch": 1.3393096066363392, + "grad_norm": 0.49241968989372253, + "learning_rate": 0.00012290250746153935, + "loss": 0.674, + "step": 7508 + }, + { + "epoch": 1.3394880028543394, + "grad_norm": 0.5484983921051025, + "learning_rate": 0.0001228421799350018, + "loss": 0.6603, + "step": 7509 + }, + { + "epoch": 1.3396663990723396, + "grad_norm": 0.666890025138855, + "learning_rate": 0.00012278186239538463, + "loss": 0.5482, + "step": 7510 + }, + { + "epoch": 1.3398447952903398, + "grad_norm": 0.45667779445648193, + "learning_rate": 0.00012272155484742534, + "loss": 0.5867, + "step": 7511 + }, + { + "epoch": 1.34002319150834, + "grad_norm": 0.4096013009548187, + "learning_rate": 0.00012266125729586025, + "loss": 0.4004, + "step": 7512 + }, + { + "epoch": 1.3402015877263402, + "grad_norm": 0.4495585262775421, + "learning_rate": 0.00012260096974542524, + "loss": 0.4578, + "step": 7513 + }, + { + "epoch": 1.3403799839443404, + "grad_norm": 0.49030259251594543, + "learning_rate": 0.0001225406922008553, + "loss": 0.7084, + "step": 7514 + }, + { + "epoch": 1.3405583801623406, + "grad_norm": 0.49130570888519287, + "learning_rate": 0.0001224804246668845, + "loss": 0.6026, + "step": 7515 + }, + { + "epoch": 1.3407367763803406, + "grad_norm": 0.4424048364162445, + "learning_rate": 0.00012242016714824632, + "loss": 0.4755, + "step": 7516 + }, + { + "epoch": 1.3409151725983408, + "grad_norm": 0.5130663514137268, + "learning_rate": 0.00012235991964967325, + "loss": 0.6028, + "step": 7517 + }, + { + "epoch": 1.341093568816341, + "grad_norm": 0.4313303232192993, + "learning_rate": 0.0001222996821758972, + "loss": 0.4998, + "step": 7518 + }, + { + "epoch": 1.3412719650343412, + "grad_norm": 0.5465921759605408, + "learning_rate": 0.0001222394547316492, + "loss": 0.8042, + "step": 7519 + }, + { + "epoch": 1.3414503612523414, + "grad_norm": 0.5101335048675537, + "learning_rate": 0.00012217923732165938, + "loss": 0.6261, + "step": 7520 + }, + { + "epoch": 1.3416287574703416, + "grad_norm": 0.5270312428474426, + "learning_rate": 0.00012211902995065728, + "loss": 0.6911, + "step": 7521 + }, + { + "epoch": 1.3418071536883418, + "grad_norm": 0.5207296013832092, + "learning_rate": 0.0001220588326233715, + "loss": 0.7429, + "step": 7522 + }, + { + "epoch": 1.341985549906342, + "grad_norm": 0.4834536015987396, + "learning_rate": 0.00012199864534453003, + "loss": 0.594, + "step": 7523 + }, + { + "epoch": 1.3421639461243422, + "grad_norm": 0.46801358461380005, + "learning_rate": 0.00012193846811885978, + "loss": 0.5481, + "step": 7524 + }, + { + "epoch": 1.3423423423423424, + "grad_norm": 0.4802463948726654, + "learning_rate": 0.00012187830095108721, + "loss": 0.7895, + "step": 7525 + }, + { + "epoch": 1.3425207385603426, + "grad_norm": 0.49674612283706665, + "learning_rate": 0.00012181814384593776, + "loss": 0.693, + "step": 7526 + }, + { + "epoch": 1.3426991347783428, + "grad_norm": 0.44266417622566223, + "learning_rate": 0.00012175799680813593, + "loss": 0.4989, + "step": 7527 + }, + { + "epoch": 1.342877530996343, + "grad_norm": 0.5586349964141846, + "learning_rate": 0.00012169785984240605, + "loss": 0.556, + "step": 7528 + }, + { + "epoch": 1.3430559272143432, + "grad_norm": 0.485568106174469, + "learning_rate": 0.00012163773295347095, + "loss": 0.6456, + "step": 7529 + }, + { + "epoch": 1.3432343234323432, + "grad_norm": 0.47074824571609497, + "learning_rate": 0.00012157761614605314, + "loss": 0.5116, + "step": 7530 + }, + { + "epoch": 1.3434127196503434, + "grad_norm": 0.48266589641571045, + "learning_rate": 0.0001215175094248741, + "loss": 0.5583, + "step": 7531 + }, + { + "epoch": 1.3435911158683436, + "grad_norm": 0.5991159677505493, + "learning_rate": 0.0001214574127946545, + "loss": 0.8355, + "step": 7532 + }, + { + "epoch": 1.3437695120863438, + "grad_norm": 0.47576338052749634, + "learning_rate": 0.00012139732626011446, + "loss": 0.6401, + "step": 7533 + }, + { + "epoch": 1.343947908304344, + "grad_norm": 0.5145847201347351, + "learning_rate": 0.000121337249825973, + "loss": 0.6782, + "step": 7534 + }, + { + "epoch": 1.3441263045223442, + "grad_norm": 0.6812740564346313, + "learning_rate": 0.00012127718349694863, + "loss": 0.5941, + "step": 7535 + }, + { + "epoch": 1.3443047007403444, + "grad_norm": 0.5664840936660767, + "learning_rate": 0.00012121712727775882, + "loss": 0.755, + "step": 7536 + }, + { + "epoch": 1.3444830969583446, + "grad_norm": 0.43093249201774597, + "learning_rate": 0.00012115708117312049, + "loss": 0.4926, + "step": 7537 + }, + { + "epoch": 1.3446614931763445, + "grad_norm": 0.4539382755756378, + "learning_rate": 0.00012109704518774956, + "loss": 0.5437, + "step": 7538 + }, + { + "epoch": 1.3448398893943447, + "grad_norm": 0.46809741854667664, + "learning_rate": 0.00012103701932636114, + "loss": 0.4688, + "step": 7539 + }, + { + "epoch": 1.345018285612345, + "grad_norm": 0.479373037815094, + "learning_rate": 0.00012097700359366981, + "loss": 0.6389, + "step": 7540 + }, + { + "epoch": 1.3451966818303451, + "grad_norm": 0.3928679823875427, + "learning_rate": 0.00012091699799438899, + "loss": 0.36, + "step": 7541 + }, + { + "epoch": 1.3453750780483453, + "grad_norm": 0.4709312915802002, + "learning_rate": 0.00012085700253323173, + "loss": 0.5524, + "step": 7542 + }, + { + "epoch": 1.3455534742663455, + "grad_norm": 0.4242085814476013, + "learning_rate": 0.0001207970172149098, + "loss": 0.4975, + "step": 7543 + }, + { + "epoch": 1.3457318704843457, + "grad_norm": 0.4750572741031647, + "learning_rate": 0.00012073704204413452, + "loss": 0.6045, + "step": 7544 + }, + { + "epoch": 1.345910266702346, + "grad_norm": 0.5419875383377075, + "learning_rate": 0.00012067707702561645, + "loss": 0.7744, + "step": 7545 + }, + { + "epoch": 1.3460886629203461, + "grad_norm": 0.5909712910652161, + "learning_rate": 0.00012061712216406501, + "loss": 0.8503, + "step": 7546 + }, + { + "epoch": 1.3462670591383463, + "grad_norm": 0.4636007249355316, + "learning_rate": 0.00012055717746418918, + "loss": 0.5914, + "step": 7547 + }, + { + "epoch": 1.3464454553563465, + "grad_norm": 0.5388915538787842, + "learning_rate": 0.00012049724293069686, + "loss": 0.7088, + "step": 7548 + }, + { + "epoch": 1.3466238515743467, + "grad_norm": 0.5035228729248047, + "learning_rate": 0.00012043731856829543, + "loss": 0.6403, + "step": 7549 + }, + { + "epoch": 1.346802247792347, + "grad_norm": 0.4420143961906433, + "learning_rate": 0.00012037740438169118, + "loss": 0.5323, + "step": 7550 + }, + { + "epoch": 1.3469806440103471, + "grad_norm": 0.5462261438369751, + "learning_rate": 0.00012031750037558986, + "loss": 0.8891, + "step": 7551 + }, + { + "epoch": 1.347159040228347, + "grad_norm": 0.45456668734550476, + "learning_rate": 0.00012025760655469628, + "loss": 0.4707, + "step": 7552 + }, + { + "epoch": 1.3473374364463473, + "grad_norm": 0.4722398519515991, + "learning_rate": 0.00012019772292371437, + "loss": 0.5228, + "step": 7553 + }, + { + "epoch": 1.3475158326643475, + "grad_norm": 0.53815096616745, + "learning_rate": 0.0001201378494873475, + "loss": 0.804, + "step": 7554 + }, + { + "epoch": 1.3476942288823477, + "grad_norm": 0.48279356956481934, + "learning_rate": 0.00012007798625029798, + "loss": 0.7179, + "step": 7555 + }, + { + "epoch": 1.3478726251003479, + "grad_norm": 1.062436819076538, + "learning_rate": 0.0001200181332172676, + "loss": 0.4516, + "step": 7556 + }, + { + "epoch": 1.348051021318348, + "grad_norm": 0.44580498337745667, + "learning_rate": 0.000119958290392957, + "loss": 0.5279, + "step": 7557 + }, + { + "epoch": 1.3482294175363483, + "grad_norm": 0.48104509711265564, + "learning_rate": 0.00011989845778206629, + "loss": 0.6148, + "step": 7558 + }, + { + "epoch": 1.3484078137543485, + "grad_norm": 0.5047935843467712, + "learning_rate": 0.00011983863538929485, + "loss": 0.6991, + "step": 7559 + }, + { + "epoch": 1.3485862099723485, + "grad_norm": 0.5424177646636963, + "learning_rate": 0.00011977882321934086, + "loss": 0.6587, + "step": 7560 + }, + { + "epoch": 1.3487646061903487, + "grad_norm": 0.5127599239349365, + "learning_rate": 0.00011971902127690215, + "loss": 0.7465, + "step": 7561 + }, + { + "epoch": 1.3489430024083489, + "grad_norm": 0.4546005427837372, + "learning_rate": 0.00011965922956667535, + "loss": 0.5514, + "step": 7562 + }, + { + "epoch": 1.349121398626349, + "grad_norm": 0.49438631534576416, + "learning_rate": 0.00011959944809335668, + "loss": 0.6156, + "step": 7563 + }, + { + "epoch": 1.3492997948443493, + "grad_norm": 0.5039214491844177, + "learning_rate": 0.00011953967686164125, + "loss": 0.532, + "step": 7564 + }, + { + "epoch": 1.3494781910623495, + "grad_norm": 0.5272899270057678, + "learning_rate": 0.00011947991587622334, + "loss": 0.7789, + "step": 7565 + }, + { + "epoch": 1.3496565872803497, + "grad_norm": 0.5394687652587891, + "learning_rate": 0.00011942016514179677, + "loss": 0.7146, + "step": 7566 + }, + { + "epoch": 1.3498349834983498, + "grad_norm": 0.4609784185886383, + "learning_rate": 0.00011936042466305413, + "loss": 0.555, + "step": 7567 + }, + { + "epoch": 1.35001337971635, + "grad_norm": 0.48477205634117126, + "learning_rate": 0.00011930069444468764, + "loss": 0.5976, + "step": 7568 + }, + { + "epoch": 1.3501917759343502, + "grad_norm": 0.47605961561203003, + "learning_rate": 0.00011924097449138824, + "loss": 0.5233, + "step": 7569 + }, + { + "epoch": 1.3503701721523504, + "grad_norm": 0.46146827936172485, + "learning_rate": 0.00011918126480784655, + "loss": 0.4737, + "step": 7570 + }, + { + "epoch": 1.3505485683703506, + "grad_norm": 0.48921510577201843, + "learning_rate": 0.0001191215653987519, + "loss": 0.6749, + "step": 7571 + }, + { + "epoch": 1.3507269645883508, + "grad_norm": 0.49213096499443054, + "learning_rate": 0.0001190618762687933, + "loss": 0.6015, + "step": 7572 + }, + { + "epoch": 1.350905360806351, + "grad_norm": 0.5104438662528992, + "learning_rate": 0.0001190021974226585, + "loss": 0.5599, + "step": 7573 + }, + { + "epoch": 1.351083757024351, + "grad_norm": 0.55939120054245, + "learning_rate": 0.00011894252886503476, + "loss": 0.6426, + "step": 7574 + }, + { + "epoch": 1.3512621532423512, + "grad_norm": 0.5199212431907654, + "learning_rate": 0.00011888287060060845, + "loss": 0.7041, + "step": 7575 + }, + { + "epoch": 1.3514405494603514, + "grad_norm": 0.48316359519958496, + "learning_rate": 0.000118823222634065, + "loss": 0.5074, + "step": 7576 + }, + { + "epoch": 1.3516189456783516, + "grad_norm": 0.630157470703125, + "learning_rate": 0.0001187635849700893, + "loss": 0.5867, + "step": 7577 + }, + { + "epoch": 1.3517973418963518, + "grad_norm": 0.48873627185821533, + "learning_rate": 0.00011870395761336514, + "loss": 0.5635, + "step": 7578 + }, + { + "epoch": 1.351975738114352, + "grad_norm": 0.4779564142227173, + "learning_rate": 0.00011864434056857554, + "loss": 0.4933, + "step": 7579 + }, + { + "epoch": 1.3521541343323522, + "grad_norm": 0.5071210861206055, + "learning_rate": 0.00011858473384040302, + "loss": 0.7165, + "step": 7580 + }, + { + "epoch": 1.3523325305503524, + "grad_norm": 0.5788468718528748, + "learning_rate": 0.00011852513743352885, + "loss": 0.8357, + "step": 7581 + }, + { + "epoch": 1.3525109267683524, + "grad_norm": 0.4292871356010437, + "learning_rate": 0.0001184655513526339, + "loss": 0.5256, + "step": 7582 + }, + { + "epoch": 1.3526893229863526, + "grad_norm": 0.4763195216655731, + "learning_rate": 0.00011840597560239785, + "loss": 0.5758, + "step": 7583 + }, + { + "epoch": 1.3528677192043528, + "grad_norm": 0.5240211486816406, + "learning_rate": 0.00011834641018749994, + "loss": 0.5393, + "step": 7584 + }, + { + "epoch": 1.353046115422353, + "grad_norm": 0.46273085474967957, + "learning_rate": 0.00011828685511261833, + "loss": 0.4274, + "step": 7585 + }, + { + "epoch": 1.3532245116403532, + "grad_norm": 0.5363104343414307, + "learning_rate": 0.00011822731038243035, + "loss": 0.6826, + "step": 7586 + }, + { + "epoch": 1.3534029078583534, + "grad_norm": 0.46597224473953247, + "learning_rate": 0.00011816777600161278, + "loss": 0.5414, + "step": 7587 + }, + { + "epoch": 1.3535813040763536, + "grad_norm": 0.48101913928985596, + "learning_rate": 0.00011810825197484126, + "loss": 0.4022, + "step": 7588 + }, + { + "epoch": 1.3537597002943538, + "grad_norm": 0.5787967443466187, + "learning_rate": 0.00011804873830679089, + "loss": 0.7585, + "step": 7589 + }, + { + "epoch": 1.353938096512354, + "grad_norm": 0.39563214778900146, + "learning_rate": 0.0001179892350021359, + "loss": 0.3329, + "step": 7590 + }, + { + "epoch": 1.3541164927303542, + "grad_norm": 0.5252716541290283, + "learning_rate": 0.00011792974206554949, + "loss": 0.6866, + "step": 7591 + }, + { + "epoch": 1.3542948889483544, + "grad_norm": 0.5136657953262329, + "learning_rate": 0.00011787025950170441, + "loss": 0.6072, + "step": 7592 + }, + { + "epoch": 1.3544732851663546, + "grad_norm": 0.503588080406189, + "learning_rate": 0.0001178107873152722, + "loss": 0.6746, + "step": 7593 + }, + { + "epoch": 1.3546516813843548, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.00011775132551092397, + "loss": 0.73, + "step": 7594 + }, + { + "epoch": 1.354830077602355, + "grad_norm": 0.4741210639476776, + "learning_rate": 0.0001176918740933296, + "loss": 0.5221, + "step": 7595 + }, + { + "epoch": 1.355008473820355, + "grad_norm": 0.5205968618392944, + "learning_rate": 0.00011763243306715862, + "loss": 0.639, + "step": 7596 + }, + { + "epoch": 1.3551868700383551, + "grad_norm": 0.5402557253837585, + "learning_rate": 0.00011757300243707927, + "loss": 0.7846, + "step": 7597 + }, + { + "epoch": 1.3553652662563553, + "grad_norm": 0.46083173155784607, + "learning_rate": 0.00011751358220775943, + "loss": 0.5686, + "step": 7598 + }, + { + "epoch": 1.3555436624743555, + "grad_norm": 0.5107517242431641, + "learning_rate": 0.00011745417238386583, + "loss": 0.639, + "step": 7599 + }, + { + "epoch": 1.3557220586923557, + "grad_norm": 0.5665692687034607, + "learning_rate": 0.0001173947729700644, + "loss": 0.7715, + "step": 7600 + }, + { + "epoch": 1.355900454910356, + "grad_norm": 0.5142179727554321, + "learning_rate": 0.00011733538397102053, + "loss": 0.5198, + "step": 7601 + }, + { + "epoch": 1.3560788511283561, + "grad_norm": 0.49913138151168823, + "learning_rate": 0.00011727600539139841, + "loss": 0.6599, + "step": 7602 + }, + { + "epoch": 1.3562572473463563, + "grad_norm": 0.4611426293849945, + "learning_rate": 0.00011721663723586181, + "loss": 0.5418, + "step": 7603 + }, + { + "epoch": 1.3564356435643563, + "grad_norm": 0.49066436290740967, + "learning_rate": 0.00011715727950907329, + "loss": 0.6004, + "step": 7604 + }, + { + "epoch": 1.3566140397823565, + "grad_norm": 0.4376862645149231, + "learning_rate": 0.00011709793221569486, + "loss": 0.5262, + "step": 7605 + }, + { + "epoch": 1.3567924360003567, + "grad_norm": 0.46929794549942017, + "learning_rate": 0.00011703859536038774, + "loss": 0.4824, + "step": 7606 + }, + { + "epoch": 1.356970832218357, + "grad_norm": 0.5407469868659973, + "learning_rate": 0.00011697926894781205, + "loss": 0.692, + "step": 7607 + }, + { + "epoch": 1.357149228436357, + "grad_norm": 0.5103482007980347, + "learning_rate": 0.00011691995298262739, + "loss": 0.6721, + "step": 7608 + }, + { + "epoch": 1.3573276246543573, + "grad_norm": 0.532581090927124, + "learning_rate": 0.00011686064746949229, + "loss": 0.7574, + "step": 7609 + }, + { + "epoch": 1.3575060208723575, + "grad_norm": 0.4488193690776825, + "learning_rate": 0.00011680135241306472, + "loss": 0.5191, + "step": 7610 + }, + { + "epoch": 1.3576844170903577, + "grad_norm": 0.4854279160499573, + "learning_rate": 0.00011674206781800162, + "loss": 0.5963, + "step": 7611 + }, + { + "epoch": 1.357862813308358, + "grad_norm": 0.4275936186313629, + "learning_rate": 0.00011668279368895907, + "loss": 0.4465, + "step": 7612 + }, + { + "epoch": 1.358041209526358, + "grad_norm": 0.5791702270507812, + "learning_rate": 0.00011662353003059262, + "loss": 0.7993, + "step": 7613 + }, + { + "epoch": 1.3582196057443583, + "grad_norm": 0.5329555869102478, + "learning_rate": 0.00011656427684755666, + "loss": 0.6066, + "step": 7614 + }, + { + "epoch": 1.3583980019623585, + "grad_norm": 0.4500406086444855, + "learning_rate": 0.00011650503414450502, + "loss": 0.5459, + "step": 7615 + }, + { + "epoch": 1.3585763981803587, + "grad_norm": 0.5136424899101257, + "learning_rate": 0.0001164458019260905, + "loss": 0.6399, + "step": 7616 + }, + { + "epoch": 1.358754794398359, + "grad_norm": 0.49857786297798157, + "learning_rate": 0.0001163865801969653, + "loss": 0.632, + "step": 7617 + }, + { + "epoch": 1.3589331906163589, + "grad_norm": 0.529205858707428, + "learning_rate": 0.00011632736896178059, + "loss": 0.5917, + "step": 7618 + }, + { + "epoch": 1.359111586834359, + "grad_norm": 0.6193312406539917, + "learning_rate": 0.00011626816822518662, + "loss": 0.693, + "step": 7619 + }, + { + "epoch": 1.3592899830523593, + "grad_norm": 0.4899401068687439, + "learning_rate": 0.00011620897799183336, + "loss": 0.5771, + "step": 7620 + }, + { + "epoch": 1.3594683792703595, + "grad_norm": 0.5664353966712952, + "learning_rate": 0.0001161497982663693, + "loss": 0.7672, + "step": 7621 + }, + { + "epoch": 1.3596467754883597, + "grad_norm": 0.4969763159751892, + "learning_rate": 0.00011609062905344256, + "loss": 0.669, + "step": 7622 + }, + { + "epoch": 1.3598251717063599, + "grad_norm": 0.48748770356178284, + "learning_rate": 0.0001160314703577002, + "loss": 0.6251, + "step": 7623 + }, + { + "epoch": 1.36000356792436, + "grad_norm": 0.4871166944503784, + "learning_rate": 0.00011597232218378842, + "loss": 0.4868, + "step": 7624 + }, + { + "epoch": 1.3601819641423603, + "grad_norm": 0.4607223868370056, + "learning_rate": 0.00011591318453635286, + "loss": 0.617, + "step": 7625 + }, + { + "epoch": 1.3603603603603602, + "grad_norm": 0.5145749449729919, + "learning_rate": 0.000115854057420038, + "loss": 0.8115, + "step": 7626 + }, + { + "epoch": 1.3605387565783604, + "grad_norm": 0.4471273124217987, + "learning_rate": 0.00011579494083948783, + "loss": 0.4159, + "step": 7627 + }, + { + "epoch": 1.3607171527963606, + "grad_norm": 0.5210676193237305, + "learning_rate": 0.00011573583479934516, + "loss": 0.6254, + "step": 7628 + }, + { + "epoch": 1.3608955490143608, + "grad_norm": 0.44996219873428345, + "learning_rate": 0.00011567673930425232, + "loss": 0.4922, + "step": 7629 + }, + { + "epoch": 1.361073945232361, + "grad_norm": 0.5416077375411987, + "learning_rate": 0.0001156176543588505, + "loss": 0.6659, + "step": 7630 + }, + { + "epoch": 1.3612523414503612, + "grad_norm": 0.5629211068153381, + "learning_rate": 0.00011555857996778038, + "loss": 0.6857, + "step": 7631 + }, + { + "epoch": 1.3614307376683614, + "grad_norm": 0.552343487739563, + "learning_rate": 0.00011549951613568152, + "loss": 0.8255, + "step": 7632 + }, + { + "epoch": 1.3616091338863616, + "grad_norm": 0.4920686185359955, + "learning_rate": 0.0001154404628671927, + "loss": 0.5705, + "step": 7633 + }, + { + "epoch": 1.3617875301043618, + "grad_norm": 0.514735758304596, + "learning_rate": 0.0001153814201669521, + "loss": 0.5387, + "step": 7634 + }, + { + "epoch": 1.361965926322362, + "grad_norm": 0.6460976600646973, + "learning_rate": 0.00011532238803959666, + "loss": 0.5714, + "step": 7635 + }, + { + "epoch": 1.3621443225403622, + "grad_norm": 0.48432081937789917, + "learning_rate": 0.00011526336648976307, + "loss": 0.4946, + "step": 7636 + }, + { + "epoch": 1.3623227187583624, + "grad_norm": 0.5579152703285217, + "learning_rate": 0.00011520435552208672, + "loss": 0.6566, + "step": 7637 + }, + { + "epoch": 1.3625011149763626, + "grad_norm": 0.5772841572761536, + "learning_rate": 0.00011514535514120217, + "loss": 0.7587, + "step": 7638 + }, + { + "epoch": 1.3626795111943628, + "grad_norm": 0.43424656987190247, + "learning_rate": 0.00011508636535174349, + "loss": 0.4863, + "step": 7639 + }, + { + "epoch": 1.3628579074123628, + "grad_norm": 0.5111271739006042, + "learning_rate": 0.00011502738615834351, + "loss": 0.6528, + "step": 7640 + }, + { + "epoch": 1.363036303630363, + "grad_norm": 0.4701317548751831, + "learning_rate": 0.00011496841756563467, + "loss": 0.4773, + "step": 7641 + }, + { + "epoch": 1.3632146998483632, + "grad_norm": 0.5139216780662537, + "learning_rate": 0.00011490945957824808, + "loss": 0.647, + "step": 7642 + }, + { + "epoch": 1.3633930960663634, + "grad_norm": 0.5112646222114563, + "learning_rate": 0.00011485051220081449, + "loss": 0.582, + "step": 7643 + }, + { + "epoch": 1.3635714922843636, + "grad_norm": 0.5711879730224609, + "learning_rate": 0.00011479157543796353, + "loss": 0.7522, + "step": 7644 + }, + { + "epoch": 1.3637498885023638, + "grad_norm": 0.4305424988269806, + "learning_rate": 0.00011473264929432398, + "loss": 0.4716, + "step": 7645 + }, + { + "epoch": 1.363928284720364, + "grad_norm": 0.5155298113822937, + "learning_rate": 0.000114673733774524, + "loss": 0.6646, + "step": 7646 + }, + { + "epoch": 1.3641066809383642, + "grad_norm": 0.5119175910949707, + "learning_rate": 0.00011461482888319064, + "loss": 0.6522, + "step": 7647 + }, + { + "epoch": 1.3642850771563642, + "grad_norm": 0.5106381177902222, + "learning_rate": 0.00011455593462495047, + "loss": 0.7938, + "step": 7648 + }, + { + "epoch": 1.3644634733743644, + "grad_norm": 0.44368186593055725, + "learning_rate": 0.00011449705100442881, + "loss": 0.4743, + "step": 7649 + }, + { + "epoch": 1.3646418695923646, + "grad_norm": 0.4758698046207428, + "learning_rate": 0.00011443817802625044, + "loss": 0.5669, + "step": 7650 + }, + { + "epoch": 1.3648202658103648, + "grad_norm": 0.41871505975723267, + "learning_rate": 0.00011437931569503935, + "loss": 0.4859, + "step": 7651 + }, + { + "epoch": 1.364998662028365, + "grad_norm": 0.44119468331336975, + "learning_rate": 0.00011432046401541835, + "loss": 0.57, + "step": 7652 + }, + { + "epoch": 1.3651770582463651, + "grad_norm": 0.5434176921844482, + "learning_rate": 0.0001142616229920098, + "loss": 0.7457, + "step": 7653 + }, + { + "epoch": 1.3653554544643653, + "grad_norm": 0.4746476411819458, + "learning_rate": 0.00011420279262943487, + "loss": 0.5716, + "step": 7654 + }, + { + "epoch": 1.3655338506823655, + "grad_norm": 0.4947410523891449, + "learning_rate": 0.00011414397293231424, + "loss": 0.6332, + "step": 7655 + }, + { + "epoch": 1.3657122469003657, + "grad_norm": 0.4677318036556244, + "learning_rate": 0.00011408516390526747, + "loss": 0.5055, + "step": 7656 + }, + { + "epoch": 1.365890643118366, + "grad_norm": 0.535660445690155, + "learning_rate": 0.00011402636555291348, + "loss": 0.8911, + "step": 7657 + }, + { + "epoch": 1.3660690393363661, + "grad_norm": 0.4632878601551056, + "learning_rate": 0.00011396757787987025, + "loss": 0.5509, + "step": 7658 + }, + { + "epoch": 1.3662474355543663, + "grad_norm": 0.5350466966629028, + "learning_rate": 0.00011390880089075483, + "loss": 0.7065, + "step": 7659 + }, + { + "epoch": 1.3664258317723665, + "grad_norm": 0.5175812840461731, + "learning_rate": 0.00011385003459018369, + "loss": 0.7321, + "step": 7660 + }, + { + "epoch": 1.3666042279903667, + "grad_norm": 0.4582364857196808, + "learning_rate": 0.00011379127898277217, + "loss": 0.5169, + "step": 7661 + }, + { + "epoch": 1.3667826242083667, + "grad_norm": 0.45724961161613464, + "learning_rate": 0.00011373253407313508, + "loss": 0.5547, + "step": 7662 + }, + { + "epoch": 1.366961020426367, + "grad_norm": 0.48377981781959534, + "learning_rate": 0.00011367379986588603, + "loss": 0.4886, + "step": 7663 + }, + { + "epoch": 1.3671394166443671, + "grad_norm": 0.4916684627532959, + "learning_rate": 0.00011361507636563817, + "loss": 0.4752, + "step": 7664 + }, + { + "epoch": 1.3673178128623673, + "grad_norm": 0.48477721214294434, + "learning_rate": 0.00011355636357700342, + "loss": 0.5644, + "step": 7665 + }, + { + "epoch": 1.3674962090803675, + "grad_norm": 0.5753141641616821, + "learning_rate": 0.00011349766150459314, + "loss": 0.7344, + "step": 7666 + }, + { + "epoch": 1.3676746052983677, + "grad_norm": 0.48007839918136597, + "learning_rate": 0.0001134389701530179, + "loss": 0.5605, + "step": 7667 + }, + { + "epoch": 1.367853001516368, + "grad_norm": 0.46499067544937134, + "learning_rate": 0.00011338028952688709, + "loss": 0.5273, + "step": 7668 + }, + { + "epoch": 1.368031397734368, + "grad_norm": 0.4519871473312378, + "learning_rate": 0.00011332161963080961, + "loss": 0.5694, + "step": 7669 + }, + { + "epoch": 1.368209793952368, + "grad_norm": 0.4737212061882019, + "learning_rate": 0.00011326296046939332, + "loss": 0.5597, + "step": 7670 + }, + { + "epoch": 1.3683881901703683, + "grad_norm": 0.4954215884208679, + "learning_rate": 0.00011320431204724519, + "loss": 0.6356, + "step": 7671 + }, + { + "epoch": 1.3685665863883685, + "grad_norm": 0.4386094808578491, + "learning_rate": 0.00011314567436897161, + "loss": 0.6296, + "step": 7672 + }, + { + "epoch": 1.3687449826063687, + "grad_norm": 0.7949509620666504, + "learning_rate": 0.0001130870474391778, + "loss": 0.6376, + "step": 7673 + }, + { + "epoch": 1.3689233788243689, + "grad_norm": 0.47607821226119995, + "learning_rate": 0.00011302843126246842, + "loss": 0.5973, + "step": 7674 + }, + { + "epoch": 1.369101775042369, + "grad_norm": 0.4358568489551544, + "learning_rate": 0.00011296982584344704, + "loss": 0.4679, + "step": 7675 + }, + { + "epoch": 1.3692801712603693, + "grad_norm": 0.5030319690704346, + "learning_rate": 0.00011291123118671665, + "loss": 0.6087, + "step": 7676 + }, + { + "epoch": 1.3694585674783695, + "grad_norm": 0.5743786096572876, + "learning_rate": 0.00011285264729687908, + "loss": 0.8441, + "step": 7677 + }, + { + "epoch": 1.3696369636963697, + "grad_norm": 0.45528116822242737, + "learning_rate": 0.00011279407417853569, + "loss": 0.4794, + "step": 7678 + }, + { + "epoch": 1.3698153599143699, + "grad_norm": 0.538870632648468, + "learning_rate": 0.00011273551183628664, + "loss": 0.6597, + "step": 7679 + }, + { + "epoch": 1.36999375613237, + "grad_norm": 0.4873597025871277, + "learning_rate": 0.00011267696027473132, + "loss": 0.5062, + "step": 7680 + }, + { + "epoch": 1.3701721523503703, + "grad_norm": 0.573042094707489, + "learning_rate": 0.00011261841949846846, + "loss": 0.6462, + "step": 7681 + }, + { + "epoch": 1.3703505485683705, + "grad_norm": 0.4371779263019562, + "learning_rate": 0.00011255988951209589, + "loss": 0.4188, + "step": 7682 + }, + { + "epoch": 1.3705289447863707, + "grad_norm": 0.5239207744598389, + "learning_rate": 0.00011250137032021038, + "loss": 0.6176, + "step": 7683 + }, + { + "epoch": 1.3707073410043706, + "grad_norm": 0.4236181378364563, + "learning_rate": 0.00011244286192740815, + "loss": 0.4331, + "step": 7684 + }, + { + "epoch": 1.3708857372223708, + "grad_norm": 0.5079860091209412, + "learning_rate": 0.00011238436433828427, + "loss": 0.5725, + "step": 7685 + }, + { + "epoch": 1.371064133440371, + "grad_norm": 0.48492246866226196, + "learning_rate": 0.00011232587755743332, + "loss": 0.5257, + "step": 7686 + }, + { + "epoch": 1.3712425296583712, + "grad_norm": 0.43290892243385315, + "learning_rate": 0.00011226740158944856, + "loss": 0.4729, + "step": 7687 + }, + { + "epoch": 1.3714209258763714, + "grad_norm": 0.4931113123893738, + "learning_rate": 0.00011220893643892291, + "loss": 0.5433, + "step": 7688 + }, + { + "epoch": 1.3715993220943716, + "grad_norm": 0.492724746465683, + "learning_rate": 0.00011215048211044801, + "loss": 0.5631, + "step": 7689 + }, + { + "epoch": 1.3717777183123718, + "grad_norm": 0.48536619544029236, + "learning_rate": 0.000112092038608615, + "loss": 0.509, + "step": 7690 + }, + { + "epoch": 1.371956114530372, + "grad_norm": 0.43689486384391785, + "learning_rate": 0.00011203360593801396, + "loss": 0.5927, + "step": 7691 + }, + { + "epoch": 1.372134510748372, + "grad_norm": 0.40810397267341614, + "learning_rate": 0.00011197518410323401, + "loss": 0.4723, + "step": 7692 + }, + { + "epoch": 1.3723129069663722, + "grad_norm": 0.4644933044910431, + "learning_rate": 0.00011191677310886384, + "loss": 0.562, + "step": 7693 + }, + { + "epoch": 1.3724913031843724, + "grad_norm": 0.5269336104393005, + "learning_rate": 0.00011185837295949075, + "loss": 0.5993, + "step": 7694 + }, + { + "epoch": 1.3726696994023726, + "grad_norm": 0.5395172238349915, + "learning_rate": 0.00011179998365970174, + "loss": 0.7452, + "step": 7695 + }, + { + "epoch": 1.3728480956203728, + "grad_norm": 0.5476558804512024, + "learning_rate": 0.00011174160521408241, + "loss": 0.7672, + "step": 7696 + }, + { + "epoch": 1.373026491838373, + "grad_norm": 0.5317317247390747, + "learning_rate": 0.0001116832376272179, + "loss": 0.679, + "step": 7697 + }, + { + "epoch": 1.3732048880563732, + "grad_norm": 0.9869957566261292, + "learning_rate": 0.00011162488090369252, + "loss": 0.7226, + "step": 7698 + }, + { + "epoch": 1.3733832842743734, + "grad_norm": 0.5336429476737976, + "learning_rate": 0.00011156653504808934, + "loss": 0.7258, + "step": 7699 + }, + { + "epoch": 1.3735616804923736, + "grad_norm": 0.5142067670822144, + "learning_rate": 0.00011150820006499101, + "loss": 0.6718, + "step": 7700 + }, + { + "epoch": 1.3737400767103738, + "grad_norm": 0.49160951375961304, + "learning_rate": 0.00011144987595897896, + "loss": 0.623, + "step": 7701 + }, + { + "epoch": 1.373918472928374, + "grad_norm": 0.4981287717819214, + "learning_rate": 0.00011139156273463411, + "loss": 0.6447, + "step": 7702 + }, + { + "epoch": 1.3740968691463742, + "grad_norm": 0.4330550730228424, + "learning_rate": 0.0001113332603965363, + "loss": 0.3944, + "step": 7703 + }, + { + "epoch": 1.3742752653643744, + "grad_norm": 0.5066832900047302, + "learning_rate": 0.00011127496894926442, + "loss": 0.5891, + "step": 7704 + }, + { + "epoch": 1.3744536615823746, + "grad_norm": 0.5171502232551575, + "learning_rate": 0.00011121668839739691, + "loss": 0.7084, + "step": 7705 + }, + { + "epoch": 1.3746320578003746, + "grad_norm": 0.48667481541633606, + "learning_rate": 0.00011115841874551084, + "loss": 0.5493, + "step": 7706 + }, + { + "epoch": 1.3748104540183748, + "grad_norm": 0.4992111623287201, + "learning_rate": 0.00011110015999818293, + "loss": 0.584, + "step": 7707 + }, + { + "epoch": 1.374988850236375, + "grad_norm": 0.4618065357208252, + "learning_rate": 0.00011104191215998857, + "loss": 0.6038, + "step": 7708 + }, + { + "epoch": 1.3751672464543752, + "grad_norm": 0.4726942777633667, + "learning_rate": 0.00011098367523550273, + "loss": 0.5524, + "step": 7709 + }, + { + "epoch": 1.3753456426723754, + "grad_norm": 0.4904707372188568, + "learning_rate": 0.00011092544922929914, + "loss": 0.6269, + "step": 7710 + }, + { + "epoch": 1.3755240388903756, + "grad_norm": 0.4519045054912567, + "learning_rate": 0.0001108672341459509, + "loss": 0.5587, + "step": 7711 + }, + { + "epoch": 1.3757024351083758, + "grad_norm": 0.45719850063323975, + "learning_rate": 0.00011080902999003032, + "loss": 0.5617, + "step": 7712 + }, + { + "epoch": 1.375880831326376, + "grad_norm": 0.4688970446586609, + "learning_rate": 0.00011075083676610853, + "loss": 0.4525, + "step": 7713 + }, + { + "epoch": 1.376059227544376, + "grad_norm": 0.4638102650642395, + "learning_rate": 0.00011069265447875617, + "loss": 0.5887, + "step": 7714 + }, + { + "epoch": 1.3762376237623761, + "grad_norm": 0.5201415419578552, + "learning_rate": 0.0001106344831325427, + "loss": 0.6049, + "step": 7715 + }, + { + "epoch": 1.3764160199803763, + "grad_norm": 0.5137766003608704, + "learning_rate": 0.00011057632273203708, + "loss": 0.6497, + "step": 7716 + }, + { + "epoch": 1.3765944161983765, + "grad_norm": 0.5084298849105835, + "learning_rate": 0.00011051817328180702, + "loss": 0.6516, + "step": 7717 + }, + { + "epoch": 1.3767728124163767, + "grad_norm": 0.4721417725086212, + "learning_rate": 0.00011046003478641955, + "loss": 0.5664, + "step": 7718 + }, + { + "epoch": 1.376951208634377, + "grad_norm": 0.4899318814277649, + "learning_rate": 0.00011040190725044097, + "loss": 0.6067, + "step": 7719 + }, + { + "epoch": 1.3771296048523771, + "grad_norm": 0.48559266328811646, + "learning_rate": 0.00011034379067843644, + "loss": 0.5541, + "step": 7720 + }, + { + "epoch": 1.3773080010703773, + "grad_norm": 0.46098917722702026, + "learning_rate": 0.00011028568507497058, + "loss": 0.5172, + "step": 7721 + }, + { + "epoch": 1.3774863972883775, + "grad_norm": 0.5564523935317993, + "learning_rate": 0.00011022759044460678, + "loss": 0.7203, + "step": 7722 + }, + { + "epoch": 1.3776647935063777, + "grad_norm": 0.5125877261161804, + "learning_rate": 0.00011016950679190798, + "loss": 0.5827, + "step": 7723 + }, + { + "epoch": 1.377843189724378, + "grad_norm": 0.4653429090976715, + "learning_rate": 0.00011011143412143596, + "loss": 0.593, + "step": 7724 + }, + { + "epoch": 1.3780215859423781, + "grad_norm": 0.44072821736335754, + "learning_rate": 0.0001100533724377516, + "loss": 0.5665, + "step": 7725 + }, + { + "epoch": 1.3781999821603783, + "grad_norm": 0.48405829071998596, + "learning_rate": 0.00010999532174541524, + "loss": 0.6273, + "step": 7726 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 0.5710448622703552, + "learning_rate": 0.0001099372820489859, + "loss": 0.6321, + "step": 7727 + }, + { + "epoch": 1.3785567745963785, + "grad_norm": 0.47928139567375183, + "learning_rate": 0.00010987925335302229, + "loss": 0.6109, + "step": 7728 + }, + { + "epoch": 1.3787351708143787, + "grad_norm": 0.47921472787857056, + "learning_rate": 0.00010982123566208185, + "loss": 0.5726, + "step": 7729 + }, + { + "epoch": 1.3789135670323789, + "grad_norm": 0.4955821633338928, + "learning_rate": 0.00010976322898072117, + "loss": 0.5733, + "step": 7730 + }, + { + "epoch": 1.379091963250379, + "grad_norm": 0.45048630237579346, + "learning_rate": 0.00010970523331349619, + "loss": 0.5002, + "step": 7731 + }, + { + "epoch": 1.3792703594683793, + "grad_norm": 0.4999293088912964, + "learning_rate": 0.00010964724866496173, + "loss": 0.6949, + "step": 7732 + }, + { + "epoch": 1.3794487556863795, + "grad_norm": 0.5359913110733032, + "learning_rate": 0.00010958927503967206, + "loss": 0.6405, + "step": 7733 + }, + { + "epoch": 1.3796271519043797, + "grad_norm": 0.4533331096172333, + "learning_rate": 0.0001095313124421802, + "loss": 0.5499, + "step": 7734 + }, + { + "epoch": 1.3798055481223799, + "grad_norm": 0.4622163772583008, + "learning_rate": 0.00010947336087703872, + "loss": 0.532, + "step": 7735 + }, + { + "epoch": 1.3799839443403799, + "grad_norm": 0.4350248873233795, + "learning_rate": 0.0001094154203487989, + "loss": 0.4439, + "step": 7736 + }, + { + "epoch": 1.38016234055838, + "grad_norm": 0.47929033637046814, + "learning_rate": 0.00010935749086201158, + "loss": 0.5798, + "step": 7737 + }, + { + "epoch": 1.3803407367763803, + "grad_norm": 0.5002579689025879, + "learning_rate": 0.00010929957242122637, + "loss": 0.6427, + "step": 7738 + }, + { + "epoch": 1.3805191329943804, + "grad_norm": 0.5057860612869263, + "learning_rate": 0.00010924166503099211, + "loss": 0.7614, + "step": 7739 + }, + { + "epoch": 1.3806975292123806, + "grad_norm": 0.4656047224998474, + "learning_rate": 0.00010918376869585702, + "loss": 0.5467, + "step": 7740 + }, + { + "epoch": 1.3808759254303808, + "grad_norm": 0.5039976835250854, + "learning_rate": 0.00010912588342036802, + "loss": 0.6859, + "step": 7741 + }, + { + "epoch": 1.381054321648381, + "grad_norm": 0.6870278716087341, + "learning_rate": 0.00010906800920907153, + "loss": 0.5815, + "step": 7742 + }, + { + "epoch": 1.3812327178663812, + "grad_norm": 0.43213722109794617, + "learning_rate": 0.00010901014606651305, + "loss": 0.3694, + "step": 7743 + }, + { + "epoch": 1.3814111140843814, + "grad_norm": 0.5106995701789856, + "learning_rate": 0.00010895229399723694, + "loss": 0.5759, + "step": 7744 + }, + { + "epoch": 1.3815895103023816, + "grad_norm": 0.45927223563194275, + "learning_rate": 0.00010889445300578701, + "loss": 0.5318, + "step": 7745 + }, + { + "epoch": 1.3817679065203818, + "grad_norm": 0.5364645719528198, + "learning_rate": 0.00010883662309670597, + "loss": 0.6094, + "step": 7746 + }, + { + "epoch": 1.381946302738382, + "grad_norm": 0.5728722214698792, + "learning_rate": 0.00010877880427453588, + "loss": 0.7887, + "step": 7747 + }, + { + "epoch": 1.3821246989563822, + "grad_norm": 0.49186766147613525, + "learning_rate": 0.00010872099654381762, + "loss": 0.5248, + "step": 7748 + }, + { + "epoch": 1.3823030951743824, + "grad_norm": 0.48165062069892883, + "learning_rate": 0.00010866319990909163, + "loss": 0.6226, + "step": 7749 + }, + { + "epoch": 1.3824814913923824, + "grad_norm": 0.49330055713653564, + "learning_rate": 0.00010860541437489705, + "loss": 0.6703, + "step": 7750 + }, + { + "epoch": 1.3826598876103826, + "grad_norm": 0.5186200141906738, + "learning_rate": 0.00010854763994577232, + "loss": 0.7396, + "step": 7751 + }, + { + "epoch": 1.3828382838283828, + "grad_norm": 0.5332891941070557, + "learning_rate": 0.00010848987662625515, + "loss": 0.7148, + "step": 7752 + }, + { + "epoch": 1.383016680046383, + "grad_norm": 0.5629260540008545, + "learning_rate": 0.00010843212442088207, + "loss": 0.7122, + "step": 7753 + }, + { + "epoch": 1.3831950762643832, + "grad_norm": 0.44756463170051575, + "learning_rate": 0.00010837438333418914, + "loss": 0.5578, + "step": 7754 + }, + { + "epoch": 1.3833734724823834, + "grad_norm": 0.48155879974365234, + "learning_rate": 0.00010831665337071109, + "loss": 0.5866, + "step": 7755 + }, + { + "epoch": 1.3835518687003836, + "grad_norm": 0.5249548554420471, + "learning_rate": 0.0001082589345349822, + "loss": 0.6235, + "step": 7756 + }, + { + "epoch": 1.3837302649183838, + "grad_norm": 0.4601089656352997, + "learning_rate": 0.00010820122683153552, + "loss": 0.4966, + "step": 7757 + }, + { + "epoch": 1.3839086611363838, + "grad_norm": 0.4255712330341339, + "learning_rate": 0.00010814353026490345, + "loss": 0.3554, + "step": 7758 + }, + { + "epoch": 1.384087057354384, + "grad_norm": 0.3792378902435303, + "learning_rate": 0.00010808584483961755, + "loss": 0.3656, + "step": 7759 + }, + { + "epoch": 1.3842654535723842, + "grad_norm": 0.5113689303398132, + "learning_rate": 0.00010802817056020825, + "loss": 0.6168, + "step": 7760 + }, + { + "epoch": 1.3844438497903844, + "grad_norm": 0.5441461205482483, + "learning_rate": 0.00010797050743120542, + "loss": 0.7846, + "step": 7761 + }, + { + "epoch": 1.3846222460083846, + "grad_norm": 0.47966670989990234, + "learning_rate": 0.00010791285545713783, + "loss": 0.5587, + "step": 7762 + }, + { + "epoch": 1.3848006422263848, + "grad_norm": 0.46844810247421265, + "learning_rate": 0.00010785521464253334, + "loss": 0.551, + "step": 7763 + }, + { + "epoch": 1.384979038444385, + "grad_norm": 1.28788161277771, + "learning_rate": 0.00010779758499191919, + "loss": 0.7065, + "step": 7764 + }, + { + "epoch": 1.3851574346623852, + "grad_norm": 0.5162654519081116, + "learning_rate": 0.00010773996650982146, + "loss": 0.6795, + "step": 7765 + }, + { + "epoch": 1.3853358308803854, + "grad_norm": 0.5239789485931396, + "learning_rate": 0.00010768235920076561, + "loss": 0.7062, + "step": 7766 + }, + { + "epoch": 1.3855142270983856, + "grad_norm": 0.4905536472797394, + "learning_rate": 0.00010762476306927594, + "loss": 0.5994, + "step": 7767 + }, + { + "epoch": 1.3856926233163858, + "grad_norm": 0.5204237699508667, + "learning_rate": 0.00010756717811987618, + "loss": 0.5825, + "step": 7768 + }, + { + "epoch": 1.385871019534386, + "grad_norm": 0.5072759389877319, + "learning_rate": 0.0001075096043570889, + "loss": 0.6562, + "step": 7769 + }, + { + "epoch": 1.3860494157523862, + "grad_norm": 0.45566022396087646, + "learning_rate": 0.00010745204178543605, + "loss": 0.5398, + "step": 7770 + }, + { + "epoch": 1.3862278119703864, + "grad_norm": 0.4731004238128662, + "learning_rate": 0.00010739449040943849, + "loss": 0.6673, + "step": 7771 + }, + { + "epoch": 1.3864062081883863, + "grad_norm": 0.472790390253067, + "learning_rate": 0.0001073369502336161, + "loss": 0.499, + "step": 7772 + }, + { + "epoch": 1.3865846044063865, + "grad_norm": 0.46505823731422424, + "learning_rate": 0.00010727942126248843, + "loss": 0.5368, + "step": 7773 + }, + { + "epoch": 1.3867630006243867, + "grad_norm": 0.47594350576400757, + "learning_rate": 0.0001072219035005735, + "loss": 0.5684, + "step": 7774 + }, + { + "epoch": 1.386941396842387, + "grad_norm": 0.49051812291145325, + "learning_rate": 0.00010716439695238895, + "loss": 0.5684, + "step": 7775 + }, + { + "epoch": 1.3871197930603871, + "grad_norm": 0.5569090843200684, + "learning_rate": 0.00010710690162245118, + "loss": 0.666, + "step": 7776 + }, + { + "epoch": 1.3872981892783873, + "grad_norm": 0.4677387475967407, + "learning_rate": 0.00010704941751527578, + "loss": 0.5223, + "step": 7777 + }, + { + "epoch": 1.3874765854963875, + "grad_norm": 0.5612090229988098, + "learning_rate": 0.00010699194463537767, + "loss": 0.7503, + "step": 7778 + }, + { + "epoch": 1.3876549817143877, + "grad_norm": 0.5161556601524353, + "learning_rate": 0.00010693448298727062, + "loss": 0.6457, + "step": 7779 + }, + { + "epoch": 1.3878333779323877, + "grad_norm": 0.5419967174530029, + "learning_rate": 0.0001068770325754678, + "loss": 0.6822, + "step": 7780 + }, + { + "epoch": 1.388011774150388, + "grad_norm": 0.4641594886779785, + "learning_rate": 0.00010681959340448116, + "loss": 0.4709, + "step": 7781 + }, + { + "epoch": 1.388190170368388, + "grad_norm": 0.8507322669029236, + "learning_rate": 0.00010676216547882214, + "loss": 0.5894, + "step": 7782 + }, + { + "epoch": 1.3883685665863883, + "grad_norm": 0.5100333094596863, + "learning_rate": 0.00010670474880300098, + "loss": 0.6258, + "step": 7783 + }, + { + "epoch": 1.3885469628043885, + "grad_norm": 0.49771493673324585, + "learning_rate": 0.00010664734338152712, + "loss": 0.5853, + "step": 7784 + }, + { + "epoch": 1.3887253590223887, + "grad_norm": 0.5230436325073242, + "learning_rate": 0.0001065899492189093, + "loss": 0.6821, + "step": 7785 + }, + { + "epoch": 1.388903755240389, + "grad_norm": 0.5067762732505798, + "learning_rate": 0.00010653256631965505, + "loss": 0.7528, + "step": 7786 + }, + { + "epoch": 1.389082151458389, + "grad_norm": 0.48270925879478455, + "learning_rate": 0.0001064751946882714, + "loss": 0.5972, + "step": 7787 + }, + { + "epoch": 1.3892605476763893, + "grad_norm": 0.47450000047683716, + "learning_rate": 0.0001064178343292641, + "loss": 0.5595, + "step": 7788 + }, + { + "epoch": 1.3894389438943895, + "grad_norm": 0.48282551765441895, + "learning_rate": 0.00010636048524713832, + "loss": 0.515, + "step": 7789 + }, + { + "epoch": 1.3896173401123897, + "grad_norm": 0.5305141806602478, + "learning_rate": 0.0001063031474463983, + "loss": 0.5755, + "step": 7790 + }, + { + "epoch": 1.3897957363303899, + "grad_norm": 0.48532387614250183, + "learning_rate": 0.00010624582093154717, + "loss": 0.5692, + "step": 7791 + }, + { + "epoch": 1.38997413254839, + "grad_norm": 0.5484132170677185, + "learning_rate": 0.00010618850570708746, + "loss": 0.6092, + "step": 7792 + }, + { + "epoch": 1.3901525287663903, + "grad_norm": 0.5588973760604858, + "learning_rate": 0.00010613120177752056, + "loss": 0.7572, + "step": 7793 + }, + { + "epoch": 1.3903309249843903, + "grad_norm": 0.5241721272468567, + "learning_rate": 0.00010607390914734721, + "loss": 0.4951, + "step": 7794 + }, + { + "epoch": 1.3905093212023905, + "grad_norm": 0.49718037247657776, + "learning_rate": 0.00010601662782106705, + "loss": 0.7169, + "step": 7795 + }, + { + "epoch": 1.3906877174203907, + "grad_norm": 0.5253555774688721, + "learning_rate": 0.00010595935780317906, + "loss": 0.6851, + "step": 7796 + }, + { + "epoch": 1.3908661136383909, + "grad_norm": 0.5283133387565613, + "learning_rate": 0.00010590209909818113, + "loss": 0.6305, + "step": 7797 + }, + { + "epoch": 1.391044509856391, + "grad_norm": 0.5732675790786743, + "learning_rate": 0.0001058448517105702, + "loss": 0.781, + "step": 7798 + }, + { + "epoch": 1.3912229060743913, + "grad_norm": 0.4708344638347626, + "learning_rate": 0.0001057876156448427, + "loss": 0.5672, + "step": 7799 + }, + { + "epoch": 1.3914013022923915, + "grad_norm": 0.48784154653549194, + "learning_rate": 0.00010573039090549374, + "loss": 0.5962, + "step": 7800 + }, + { + "epoch": 1.3915796985103916, + "grad_norm": 0.4613569974899292, + "learning_rate": 0.00010567317749701785, + "loss": 0.5355, + "step": 7801 + }, + { + "epoch": 1.3917580947283916, + "grad_norm": 0.51911461353302, + "learning_rate": 0.00010561597542390842, + "loss": 0.609, + "step": 7802 + }, + { + "epoch": 1.3919364909463918, + "grad_norm": 0.478140652179718, + "learning_rate": 0.00010555878469065814, + "loss": 0.5447, + "step": 7803 + }, + { + "epoch": 1.392114887164392, + "grad_norm": 0.4372231662273407, + "learning_rate": 0.00010550160530175883, + "loss": 0.5127, + "step": 7804 + }, + { + "epoch": 1.3922932833823922, + "grad_norm": 0.4996614456176758, + "learning_rate": 0.00010544443726170119, + "loss": 0.5593, + "step": 7805 + }, + { + "epoch": 1.3924716796003924, + "grad_norm": 0.47102734446525574, + "learning_rate": 0.00010538728057497532, + "loss": 0.6029, + "step": 7806 + }, + { + "epoch": 1.3926500758183926, + "grad_norm": 0.5277090072631836, + "learning_rate": 0.0001053301352460701, + "loss": 0.6538, + "step": 7807 + }, + { + "epoch": 1.3928284720363928, + "grad_norm": 0.4949512183666229, + "learning_rate": 0.0001052730012794739, + "loss": 0.5816, + "step": 7808 + }, + { + "epoch": 1.393006868254393, + "grad_norm": 0.4444587230682373, + "learning_rate": 0.00010521587867967389, + "loss": 0.4541, + "step": 7809 + }, + { + "epoch": 1.3931852644723932, + "grad_norm": 0.46152183413505554, + "learning_rate": 0.0001051587674511564, + "loss": 0.4362, + "step": 7810 + }, + { + "epoch": 1.3933636606903934, + "grad_norm": 0.4519818127155304, + "learning_rate": 0.00010510166759840705, + "loss": 0.4725, + "step": 7811 + }, + { + "epoch": 1.3935420569083936, + "grad_norm": 0.48864850401878357, + "learning_rate": 0.00010504457912591028, + "loss": 0.4834, + "step": 7812 + }, + { + "epoch": 1.3937204531263938, + "grad_norm": 0.5076178908348083, + "learning_rate": 0.00010498750203815, + "loss": 0.6161, + "step": 7813 + }, + { + "epoch": 1.393898849344394, + "grad_norm": 0.4903450012207031, + "learning_rate": 0.0001049304363396088, + "loss": 0.5639, + "step": 7814 + }, + { + "epoch": 1.3940772455623942, + "grad_norm": 0.44738972187042236, + "learning_rate": 0.0001048733820347688, + "loss": 0.4897, + "step": 7815 + }, + { + "epoch": 1.3942556417803942, + "grad_norm": 0.5322644710540771, + "learning_rate": 0.00010481633912811081, + "loss": 0.7432, + "step": 7816 + }, + { + "epoch": 1.3944340379983944, + "grad_norm": 0.5089389085769653, + "learning_rate": 0.00010475930762411518, + "loss": 0.5638, + "step": 7817 + }, + { + "epoch": 1.3946124342163946, + "grad_norm": 0.5720763206481934, + "learning_rate": 0.00010470228752726094, + "loss": 0.6849, + "step": 7818 + }, + { + "epoch": 1.3947908304343948, + "grad_norm": 0.5013535618782043, + "learning_rate": 0.00010464527884202648, + "loss": 0.6032, + "step": 7819 + }, + { + "epoch": 1.394969226652395, + "grad_norm": 0.523189127445221, + "learning_rate": 0.00010458828157288938, + "loss": 0.7473, + "step": 7820 + }, + { + "epoch": 1.3951476228703952, + "grad_norm": 0.42415139079093933, + "learning_rate": 0.00010453129572432599, + "loss": 0.4601, + "step": 7821 + }, + { + "epoch": 1.3953260190883954, + "grad_norm": 0.4629247784614563, + "learning_rate": 0.0001044743213008121, + "loss": 0.4898, + "step": 7822 + }, + { + "epoch": 1.3955044153063956, + "grad_norm": 0.7844071984291077, + "learning_rate": 0.00010441735830682242, + "loss": 0.8687, + "step": 7823 + }, + { + "epoch": 1.3956828115243956, + "grad_norm": 0.46940067410469055, + "learning_rate": 0.00010436040674683067, + "loss": 0.5528, + "step": 7824 + }, + { + "epoch": 1.3958612077423957, + "grad_norm": 0.49708932638168335, + "learning_rate": 0.00010430346662530999, + "loss": 0.6975, + "step": 7825 + }, + { + "epoch": 1.396039603960396, + "grad_norm": 0.47257721424102783, + "learning_rate": 0.00010424653794673222, + "loss": 0.5658, + "step": 7826 + }, + { + "epoch": 1.3962180001783961, + "grad_norm": 0.4764406383037567, + "learning_rate": 0.00010418962071556876, + "loss": 0.7406, + "step": 7827 + }, + { + "epoch": 1.3963963963963963, + "grad_norm": 0.4551503360271454, + "learning_rate": 0.00010413271493628965, + "loss": 0.5595, + "step": 7828 + }, + { + "epoch": 1.3965747926143965, + "grad_norm": 0.4807843267917633, + "learning_rate": 0.00010407582061336443, + "loss": 0.4765, + "step": 7829 + }, + { + "epoch": 1.3967531888323967, + "grad_norm": 0.46619912981987, + "learning_rate": 0.00010401893775126146, + "loss": 0.5123, + "step": 7830 + }, + { + "epoch": 1.396931585050397, + "grad_norm": 0.4833389222621918, + "learning_rate": 0.00010396206635444819, + "loss": 0.5746, + "step": 7831 + }, + { + "epoch": 1.3971099812683971, + "grad_norm": 1.4866968393325806, + "learning_rate": 0.00010390520642739149, + "loss": 0.4599, + "step": 7832 + }, + { + "epoch": 1.3972883774863973, + "grad_norm": 0.4797883927822113, + "learning_rate": 0.00010384835797455691, + "loss": 0.4975, + "step": 7833 + }, + { + "epoch": 1.3974667737043975, + "grad_norm": 0.543132483959198, + "learning_rate": 0.00010379152100040942, + "loss": 0.7443, + "step": 7834 + }, + { + "epoch": 1.3976451699223977, + "grad_norm": 0.49611344933509827, + "learning_rate": 0.00010373469550941304, + "loss": 0.5435, + "step": 7835 + }, + { + "epoch": 1.397823566140398, + "grad_norm": 0.549901008605957, + "learning_rate": 0.00010367788150603061, + "loss": 0.8226, + "step": 7836 + }, + { + "epoch": 1.3980019623583981, + "grad_norm": 0.48870500922203064, + "learning_rate": 0.0001036210789947245, + "loss": 0.6682, + "step": 7837 + }, + { + "epoch": 1.398180358576398, + "grad_norm": 0.4649103879928589, + "learning_rate": 0.00010356428797995579, + "loss": 0.5182, + "step": 7838 + }, + { + "epoch": 1.3983587547943983, + "grad_norm": 0.49515223503112793, + "learning_rate": 0.00010350750846618495, + "loss": 0.6104, + "step": 7839 + }, + { + "epoch": 1.3985371510123985, + "grad_norm": 0.4876445531845093, + "learning_rate": 0.00010345074045787128, + "loss": 0.6187, + "step": 7840 + }, + { + "epoch": 1.3987155472303987, + "grad_norm": 0.47437965869903564, + "learning_rate": 0.00010339398395947347, + "loss": 0.7047, + "step": 7841 + }, + { + "epoch": 1.398893943448399, + "grad_norm": 0.4776071608066559, + "learning_rate": 0.00010333723897544908, + "loss": 0.655, + "step": 7842 + }, + { + "epoch": 1.399072339666399, + "grad_norm": 0.48358532786369324, + "learning_rate": 0.00010328050551025472, + "loss": 0.6231, + "step": 7843 + }, + { + "epoch": 1.3992507358843993, + "grad_norm": 0.5266225934028625, + "learning_rate": 0.00010322378356834641, + "loss": 0.7969, + "step": 7844 + }, + { + "epoch": 1.3994291321023995, + "grad_norm": 0.42173731327056885, + "learning_rate": 0.00010316707315417892, + "loss": 0.4284, + "step": 7845 + }, + { + "epoch": 1.3996075283203995, + "grad_norm": 0.47026675939559937, + "learning_rate": 0.00010311037427220637, + "loss": 0.5441, + "step": 7846 + }, + { + "epoch": 1.3997859245383997, + "grad_norm": 0.4756429195404053, + "learning_rate": 0.00010305368692688174, + "loss": 0.4786, + "step": 7847 + }, + { + "epoch": 1.3999643207563999, + "grad_norm": 0.48206183314323425, + "learning_rate": 0.00010299701112265739, + "loss": 0.623, + "step": 7848 + }, + { + "epoch": 1.4001427169744, + "grad_norm": 0.47061610221862793, + "learning_rate": 0.00010294034686398443, + "loss": 0.5716, + "step": 7849 + }, + { + "epoch": 1.4003211131924003, + "grad_norm": 0.46232593059539795, + "learning_rate": 0.00010288369415531335, + "loss": 0.5713, + "step": 7850 + }, + { + "epoch": 1.4004995094104005, + "grad_norm": 0.415913850069046, + "learning_rate": 0.00010282705300109372, + "loss": 0.4381, + "step": 7851 + }, + { + "epoch": 1.4006779056284007, + "grad_norm": 0.5227693319320679, + "learning_rate": 0.00010277042340577388, + "loss": 0.6813, + "step": 7852 + }, + { + "epoch": 1.4008563018464009, + "grad_norm": 0.4625205099582672, + "learning_rate": 0.00010271380537380176, + "loss": 0.5038, + "step": 7853 + }, + { + "epoch": 1.401034698064401, + "grad_norm": 0.405355840921402, + "learning_rate": 0.00010265719890962388, + "loss": 0.4174, + "step": 7854 + }, + { + "epoch": 1.4012130942824013, + "grad_norm": 0.4783884584903717, + "learning_rate": 0.00010260060401768628, + "loss": 0.5663, + "step": 7855 + }, + { + "epoch": 1.4013914905004015, + "grad_norm": 0.49855825304985046, + "learning_rate": 0.00010254402070243383, + "loss": 0.6415, + "step": 7856 + }, + { + "epoch": 1.4015698867184017, + "grad_norm": 0.5020763278007507, + "learning_rate": 0.00010248744896831044, + "loss": 0.6289, + "step": 7857 + }, + { + "epoch": 1.4017482829364019, + "grad_norm": 0.5697162747383118, + "learning_rate": 0.0001024308888197594, + "loss": 0.7057, + "step": 7858 + }, + { + "epoch": 1.401926679154402, + "grad_norm": 0.5255553126335144, + "learning_rate": 0.00010237434026122278, + "loss": 0.699, + "step": 7859 + }, + { + "epoch": 1.402105075372402, + "grad_norm": 0.48212558031082153, + "learning_rate": 0.00010231780329714202, + "loss": 0.4816, + "step": 7860 + }, + { + "epoch": 1.4022834715904022, + "grad_norm": 0.55892014503479, + "learning_rate": 0.00010226127793195736, + "loss": 0.7858, + "step": 7861 + }, + { + "epoch": 1.4024618678084024, + "grad_norm": 0.473724365234375, + "learning_rate": 0.00010220476417010843, + "loss": 0.7018, + "step": 7862 + }, + { + "epoch": 1.4026402640264026, + "grad_norm": 0.5031070709228516, + "learning_rate": 0.00010214826201603372, + "loss": 0.5582, + "step": 7863 + }, + { + "epoch": 1.4028186602444028, + "grad_norm": 0.5168371200561523, + "learning_rate": 0.00010209177147417073, + "loss": 0.6044, + "step": 7864 + }, + { + "epoch": 1.402997056462403, + "grad_norm": 0.5146030783653259, + "learning_rate": 0.00010203529254895652, + "loss": 0.6976, + "step": 7865 + }, + { + "epoch": 1.4031754526804032, + "grad_norm": 0.5497780442237854, + "learning_rate": 0.00010197882524482669, + "loss": 0.6612, + "step": 7866 + }, + { + "epoch": 1.4033538488984034, + "grad_norm": 0.530221164226532, + "learning_rate": 0.00010192236956621628, + "loss": 0.6626, + "step": 7867 + }, + { + "epoch": 1.4035322451164034, + "grad_norm": 0.5098912715911865, + "learning_rate": 0.00010186592551755927, + "loss": 0.6494, + "step": 7868 + }, + { + "epoch": 1.4037106413344036, + "grad_norm": 0.44442427158355713, + "learning_rate": 0.0001018094931032886, + "loss": 0.4793, + "step": 7869 + }, + { + "epoch": 1.4038890375524038, + "grad_norm": 0.5126553773880005, + "learning_rate": 0.0001017530723278367, + "loss": 0.6441, + "step": 7870 + }, + { + "epoch": 1.404067433770404, + "grad_norm": 0.568909227848053, + "learning_rate": 0.00010169666319563458, + "loss": 0.8517, + "step": 7871 + }, + { + "epoch": 1.4042458299884042, + "grad_norm": 0.500556230545044, + "learning_rate": 0.00010164026571111284, + "loss": 0.6562, + "step": 7872 + }, + { + "epoch": 1.4044242262064044, + "grad_norm": 0.4273224472999573, + "learning_rate": 0.00010158387987870065, + "loss": 0.4493, + "step": 7873 + }, + { + "epoch": 1.4046026224244046, + "grad_norm": 0.49929437041282654, + "learning_rate": 0.00010152750570282679, + "loss": 0.7073, + "step": 7874 + }, + { + "epoch": 1.4047810186424048, + "grad_norm": 0.47794604301452637, + "learning_rate": 0.00010147114318791864, + "loss": 0.5762, + "step": 7875 + }, + { + "epoch": 1.404959414860405, + "grad_norm": 0.4568422734737396, + "learning_rate": 0.00010141479233840309, + "loss": 0.5284, + "step": 7876 + }, + { + "epoch": 1.4051378110784052, + "grad_norm": 0.5395530462265015, + "learning_rate": 0.00010135845315870579, + "loss": 0.7679, + "step": 7877 + }, + { + "epoch": 1.4053162072964054, + "grad_norm": 0.5494503974914551, + "learning_rate": 0.00010130212565325153, + "loss": 0.7338, + "step": 7878 + }, + { + "epoch": 1.4054946035144056, + "grad_norm": 0.4741472899913788, + "learning_rate": 0.00010124580982646442, + "loss": 0.5062, + "step": 7879 + }, + { + "epoch": 1.4056729997324058, + "grad_norm": 0.47103115916252136, + "learning_rate": 0.00010118950568276722, + "loss": 0.548, + "step": 7880 + }, + { + "epoch": 1.405851395950406, + "grad_norm": 0.48441192507743835, + "learning_rate": 0.0001011332132265824, + "loss": 0.6215, + "step": 7881 + }, + { + "epoch": 1.406029792168406, + "grad_norm": 0.5685884356498718, + "learning_rate": 0.0001010769324623309, + "loss": 0.5983, + "step": 7882 + }, + { + "epoch": 1.4062081883864062, + "grad_norm": 0.5327664613723755, + "learning_rate": 0.00010102066339443299, + "loss": 0.418, + "step": 7883 + }, + { + "epoch": 1.4063865846044064, + "grad_norm": 0.4698556363582611, + "learning_rate": 0.00010096440602730816, + "loss": 0.6731, + "step": 7884 + }, + { + "epoch": 1.4065649808224066, + "grad_norm": 0.46319758892059326, + "learning_rate": 0.00010090816036537462, + "loss": 0.4945, + "step": 7885 + }, + { + "epoch": 1.4067433770404068, + "grad_norm": 0.5858322381973267, + "learning_rate": 0.00010085192641305013, + "loss": 0.7778, + "step": 7886 + }, + { + "epoch": 1.406921773258407, + "grad_norm": 0.4609849750995636, + "learning_rate": 0.00010079570417475106, + "loss": 0.521, + "step": 7887 + }, + { + "epoch": 1.4071001694764071, + "grad_norm": 0.6530886888504028, + "learning_rate": 0.00010073949365489323, + "loss": 0.3922, + "step": 7888 + }, + { + "epoch": 1.4072785656944073, + "grad_norm": 0.7445523142814636, + "learning_rate": 0.00010068329485789138, + "loss": 0.6121, + "step": 7889 + }, + { + "epoch": 1.4074569619124073, + "grad_norm": 0.5070978999137878, + "learning_rate": 0.0001006271077881592, + "loss": 0.5824, + "step": 7890 + }, + { + "epoch": 1.4076353581304075, + "grad_norm": 0.46640315651893616, + "learning_rate": 0.00010057093245010975, + "loss": 0.5733, + "step": 7891 + }, + { + "epoch": 1.4078137543484077, + "grad_norm": 0.5237272381782532, + "learning_rate": 0.00010051476884815491, + "loss": 0.5761, + "step": 7892 + }, + { + "epoch": 1.407992150566408, + "grad_norm": 0.4710404872894287, + "learning_rate": 0.00010045861698670589, + "loss": 0.5382, + "step": 7893 + }, + { + "epoch": 1.4081705467844081, + "grad_norm": 0.46644294261932373, + "learning_rate": 0.00010040247687017263, + "loss": 0.3815, + "step": 7894 + }, + { + "epoch": 1.4083489430024083, + "grad_norm": 0.4705592393875122, + "learning_rate": 0.00010034634850296445, + "loss": 0.4517, + "step": 7895 + }, + { + "epoch": 1.4085273392204085, + "grad_norm": 0.5408839583396912, + "learning_rate": 0.00010029023188948976, + "loss": 0.8073, + "step": 7896 + }, + { + "epoch": 1.4087057354384087, + "grad_norm": 0.5179523825645447, + "learning_rate": 0.00010023412703415574, + "loss": 0.6676, + "step": 7897 + }, + { + "epoch": 1.408884131656409, + "grad_norm": 0.49431368708610535, + "learning_rate": 0.00010017803394136902, + "loss": 0.4934, + "step": 7898 + }, + { + "epoch": 1.409062527874409, + "grad_norm": 0.5114967226982117, + "learning_rate": 0.00010012195261553494, + "loss": 0.6566, + "step": 7899 + }, + { + "epoch": 1.4092409240924093, + "grad_norm": 0.47235408425331116, + "learning_rate": 0.00010006588306105832, + "loss": 0.5071, + "step": 7900 + }, + { + "epoch": 1.4094193203104095, + "grad_norm": 0.4692701995372772, + "learning_rate": 0.00010000982528234262, + "loss": 0.5268, + "step": 7901 + }, + { + "epoch": 1.4095977165284097, + "grad_norm": 0.5551561117172241, + "learning_rate": 9.995377928379079e-05, + "loss": 0.6615, + "step": 7902 + }, + { + "epoch": 1.40977611274641, + "grad_norm": 0.48021402955055237, + "learning_rate": 9.989774506980457e-05, + "loss": 0.601, + "step": 7903 + }, + { + "epoch": 1.4099545089644099, + "grad_norm": 0.5263514518737793, + "learning_rate": 9.984172264478475e-05, + "loss": 0.6353, + "step": 7904 + }, + { + "epoch": 1.41013290518241, + "grad_norm": 0.48175185918807983, + "learning_rate": 9.978571201313153e-05, + "loss": 0.5538, + "step": 7905 + }, + { + "epoch": 1.4103113014004103, + "grad_norm": 0.47728538513183594, + "learning_rate": 9.972971317924374e-05, + "loss": 0.5215, + "step": 7906 + }, + { + "epoch": 1.4104896976184105, + "grad_norm": 0.49182960391044617, + "learning_rate": 9.967372614751971e-05, + "loss": 0.4886, + "step": 7907 + }, + { + "epoch": 1.4106680938364107, + "grad_norm": 0.532971203327179, + "learning_rate": 9.961775092235642e-05, + "loss": 0.6214, + "step": 7908 + }, + { + "epoch": 1.4108464900544109, + "grad_norm": 0.4523656368255615, + "learning_rate": 9.956178750815037e-05, + "loss": 0.4945, + "step": 7909 + }, + { + "epoch": 1.411024886272411, + "grad_norm": 0.49244529008865356, + "learning_rate": 9.950583590929671e-05, + "loss": 0.565, + "step": 7910 + }, + { + "epoch": 1.4112032824904113, + "grad_norm": 0.4732704162597656, + "learning_rate": 9.944989613018993e-05, + "loss": 0.471, + "step": 7911 + }, + { + "epoch": 1.4113816787084112, + "grad_norm": 0.5447127223014832, + "learning_rate": 9.939396817522362e-05, + "loss": 0.6094, + "step": 7912 + }, + { + "epoch": 1.4115600749264114, + "grad_norm": 0.5061089992523193, + "learning_rate": 9.933805204879013e-05, + "loss": 0.553, + "step": 7913 + }, + { + "epoch": 1.4117384711444116, + "grad_norm": 0.5438100695610046, + "learning_rate": 9.928214775528127e-05, + "loss": 0.6007, + "step": 7914 + }, + { + "epoch": 1.4119168673624118, + "grad_norm": 0.49287474155426025, + "learning_rate": 9.922625529908769e-05, + "loss": 0.5443, + "step": 7915 + }, + { + "epoch": 1.412095263580412, + "grad_norm": 0.5883775353431702, + "learning_rate": 9.917037468459905e-05, + "loss": 0.7577, + "step": 7916 + }, + { + "epoch": 1.4122736597984122, + "grad_norm": 0.540412187576294, + "learning_rate": 9.911450591620436e-05, + "loss": 0.6534, + "step": 7917 + }, + { + "epoch": 1.4124520560164124, + "grad_norm": 0.5728485584259033, + "learning_rate": 9.905864899829135e-05, + "loss": 0.7171, + "step": 7918 + }, + { + "epoch": 1.4126304522344126, + "grad_norm": 0.4712778329849243, + "learning_rate": 9.900280393524719e-05, + "loss": 0.5482, + "step": 7919 + }, + { + "epoch": 1.4128088484524128, + "grad_norm": 0.5077334642410278, + "learning_rate": 9.894697073145773e-05, + "loss": 0.7018, + "step": 7920 + }, + { + "epoch": 1.412987244670413, + "grad_norm": 0.489620566368103, + "learning_rate": 9.889114939130828e-05, + "loss": 0.6211, + "step": 7921 + }, + { + "epoch": 1.4131656408884132, + "grad_norm": 0.4577091634273529, + "learning_rate": 9.883533991918291e-05, + "loss": 0.5056, + "step": 7922 + }, + { + "epoch": 1.4133440371064134, + "grad_norm": 0.5330988764762878, + "learning_rate": 9.877954231946485e-05, + "loss": 0.5441, + "step": 7923 + }, + { + "epoch": 1.4135224333244136, + "grad_norm": 0.46641597151756287, + "learning_rate": 9.872375659653652e-05, + "loss": 0.5333, + "step": 7924 + }, + { + "epoch": 1.4137008295424138, + "grad_norm": 0.47872334718704224, + "learning_rate": 9.866798275477915e-05, + "loss": 0.5553, + "step": 7925 + }, + { + "epoch": 1.4138792257604138, + "grad_norm": 0.4439626634120941, + "learning_rate": 9.861222079857332e-05, + "loss": 0.4818, + "step": 7926 + }, + { + "epoch": 1.414057621978414, + "grad_norm": 0.5242343544960022, + "learning_rate": 9.85564707322986e-05, + "loss": 0.6662, + "step": 7927 + }, + { + "epoch": 1.4142360181964142, + "grad_norm": 0.47222810983657837, + "learning_rate": 9.850073256033337e-05, + "loss": 0.5828, + "step": 7928 + }, + { + "epoch": 1.4144144144144144, + "grad_norm": 0.4517582952976227, + "learning_rate": 9.844500628705555e-05, + "loss": 0.464, + "step": 7929 + }, + { + "epoch": 1.4145928106324146, + "grad_norm": 0.5746625065803528, + "learning_rate": 9.83892919168416e-05, + "loss": 0.7129, + "step": 7930 + }, + { + "epoch": 1.4147712068504148, + "grad_norm": 0.41956639289855957, + "learning_rate": 9.83335894540675e-05, + "loss": 0.4671, + "step": 7931 + }, + { + "epoch": 1.414949603068415, + "grad_norm": 0.46683749556541443, + "learning_rate": 9.827789890310795e-05, + "loss": 0.547, + "step": 7932 + }, + { + "epoch": 1.4151279992864152, + "grad_norm": 0.5841799378395081, + "learning_rate": 9.822222026833703e-05, + "loss": 0.8423, + "step": 7933 + }, + { + "epoch": 1.4153063955044152, + "grad_norm": 0.520228922367096, + "learning_rate": 9.816655355412748e-05, + "loss": 0.6169, + "step": 7934 + }, + { + "epoch": 1.4154847917224154, + "grad_norm": 0.47443145513534546, + "learning_rate": 9.81108987648516e-05, + "loss": 0.725, + "step": 7935 + }, + { + "epoch": 1.4156631879404156, + "grad_norm": 0.5204434990882874, + "learning_rate": 9.805525590488037e-05, + "loss": 0.7623, + "step": 7936 + }, + { + "epoch": 1.4158415841584158, + "grad_norm": 0.4761342704296112, + "learning_rate": 9.799962497858387e-05, + "loss": 0.7241, + "step": 7937 + }, + { + "epoch": 1.416019980376416, + "grad_norm": 0.4414876401424408, + "learning_rate": 9.794400599033146e-05, + "loss": 0.5095, + "step": 7938 + }, + { + "epoch": 1.4161983765944162, + "grad_norm": 0.543932318687439, + "learning_rate": 9.788839894449134e-05, + "loss": 0.692, + "step": 7939 + }, + { + "epoch": 1.4163767728124164, + "grad_norm": 0.5136645436286926, + "learning_rate": 9.783280384543097e-05, + "loss": 0.6467, + "step": 7940 + }, + { + "epoch": 1.4165551690304166, + "grad_norm": 0.5013181567192078, + "learning_rate": 9.777722069751663e-05, + "loss": 0.6199, + "step": 7941 + }, + { + "epoch": 1.4167335652484168, + "grad_norm": 0.4954874813556671, + "learning_rate": 9.772164950511386e-05, + "loss": 0.4462, + "step": 7942 + }, + { + "epoch": 1.416911961466417, + "grad_norm": 0.42616045475006104, + "learning_rate": 9.76660902725873e-05, + "loss": 0.4236, + "step": 7943 + }, + { + "epoch": 1.4170903576844172, + "grad_norm": 0.4762613773345947, + "learning_rate": 9.761054300430036e-05, + "loss": 0.6666, + "step": 7944 + }, + { + "epoch": 1.4172687539024174, + "grad_norm": 0.43737682700157166, + "learning_rate": 9.75550077046159e-05, + "loss": 0.4711, + "step": 7945 + }, + { + "epoch": 1.4174471501204176, + "grad_norm": 0.514951229095459, + "learning_rate": 9.749948437789544e-05, + "loss": 0.5771, + "step": 7946 + }, + { + "epoch": 1.4176255463384178, + "grad_norm": 1.177257776260376, + "learning_rate": 9.744397302849995e-05, + "loss": 0.6234, + "step": 7947 + }, + { + "epoch": 1.4178039425564177, + "grad_norm": 0.4536794424057007, + "learning_rate": 9.738847366078912e-05, + "loss": 0.6181, + "step": 7948 + }, + { + "epoch": 1.417982338774418, + "grad_norm": 0.6050822138786316, + "learning_rate": 9.733298627912185e-05, + "loss": 0.6667, + "step": 7949 + }, + { + "epoch": 1.4181607349924181, + "grad_norm": 0.5121973752975464, + "learning_rate": 9.727751088785621e-05, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 1.4183391312104183, + "grad_norm": 0.5665974020957947, + "learning_rate": 9.722204749134908e-05, + "loss": 0.634, + "step": 7951 + }, + { + "epoch": 1.4185175274284185, + "grad_norm": 0.5509992241859436, + "learning_rate": 9.716659609395665e-05, + "loss": 0.6952, + "step": 7952 + }, + { + "epoch": 1.4186959236464187, + "grad_norm": 0.4740985035896301, + "learning_rate": 9.711115670003393e-05, + "loss": 0.5432, + "step": 7953 + }, + { + "epoch": 1.418874319864419, + "grad_norm": 0.4883805513381958, + "learning_rate": 9.705572931393525e-05, + "loss": 0.5195, + "step": 7954 + }, + { + "epoch": 1.4190527160824191, + "grad_norm": 0.4700393080711365, + "learning_rate": 9.700031394001366e-05, + "loss": 0.5883, + "step": 7955 + }, + { + "epoch": 1.419231112300419, + "grad_norm": 0.46181434392929077, + "learning_rate": 9.694491058262162e-05, + "loss": 0.5271, + "step": 7956 + }, + { + "epoch": 1.4194095085184193, + "grad_norm": 0.48557132482528687, + "learning_rate": 9.688951924611048e-05, + "loss": 0.5342, + "step": 7957 + }, + { + "epoch": 1.4195879047364195, + "grad_norm": 0.4598628878593445, + "learning_rate": 9.683413993483053e-05, + "loss": 0.4381, + "step": 7958 + }, + { + "epoch": 1.4197663009544197, + "grad_norm": 0.7683700323104858, + "learning_rate": 9.677877265313143e-05, + "loss": 0.5352, + "step": 7959 + }, + { + "epoch": 1.41994469717242, + "grad_norm": 0.4990261197090149, + "learning_rate": 9.67234174053615e-05, + "loss": 0.5855, + "step": 7960 + }, + { + "epoch": 1.42012309339042, + "grad_norm": 0.46791011095046997, + "learning_rate": 9.666807419586849e-05, + "loss": 0.4852, + "step": 7961 + }, + { + "epoch": 1.4203014896084203, + "grad_norm": 0.5144142508506775, + "learning_rate": 9.661274302899891e-05, + "loss": 0.5664, + "step": 7962 + }, + { + "epoch": 1.4204798858264205, + "grad_norm": 0.4881432354450226, + "learning_rate": 9.655742390909845e-05, + "loss": 0.4927, + "step": 7963 + }, + { + "epoch": 1.4206582820444207, + "grad_norm": 0.4858154356479645, + "learning_rate": 9.650211684051193e-05, + "loss": 0.5022, + "step": 7964 + }, + { + "epoch": 1.4208366782624209, + "grad_norm": 0.4513051509857178, + "learning_rate": 9.644682182758304e-05, + "loss": 0.4369, + "step": 7965 + }, + { + "epoch": 1.421015074480421, + "grad_norm": 0.6125142574310303, + "learning_rate": 9.639153887465477e-05, + "loss": 0.6945, + "step": 7966 + }, + { + "epoch": 1.4211934706984213, + "grad_norm": 0.45510220527648926, + "learning_rate": 9.633626798606885e-05, + "loss": 0.5108, + "step": 7967 + }, + { + "epoch": 1.4213718669164215, + "grad_norm": 0.5278647541999817, + "learning_rate": 9.628100916616638e-05, + "loss": 0.8597, + "step": 7968 + }, + { + "epoch": 1.4215502631344217, + "grad_norm": 0.4560278654098511, + "learning_rate": 9.622576241928733e-05, + "loss": 0.5682, + "step": 7969 + }, + { + "epoch": 1.4217286593524217, + "grad_norm": 0.49285340309143066, + "learning_rate": 9.617052774977061e-05, + "loss": 0.522, + "step": 7970 + }, + { + "epoch": 1.4219070555704219, + "grad_norm": 0.4475046992301941, + "learning_rate": 9.611530516195454e-05, + "loss": 0.493, + "step": 7971 + }, + { + "epoch": 1.422085451788422, + "grad_norm": 0.46044278144836426, + "learning_rate": 9.606009466017602e-05, + "loss": 0.5749, + "step": 7972 + }, + { + "epoch": 1.4222638480064222, + "grad_norm": 0.53668612241745, + "learning_rate": 9.600489624877157e-05, + "loss": 0.562, + "step": 7973 + }, + { + "epoch": 1.4224422442244224, + "grad_norm": 0.4615837335586548, + "learning_rate": 9.59497099320763e-05, + "loss": 0.5402, + "step": 7974 + }, + { + "epoch": 1.4226206404424226, + "grad_norm": 0.531973659992218, + "learning_rate": 9.589453571442444e-05, + "loss": 0.7516, + "step": 7975 + }, + { + "epoch": 1.4227990366604228, + "grad_norm": 0.5027945637702942, + "learning_rate": 9.583937360014952e-05, + "loss": 0.6668, + "step": 7976 + }, + { + "epoch": 1.422977432878423, + "grad_norm": 0.49252915382385254, + "learning_rate": 9.578422359358377e-05, + "loss": 0.624, + "step": 7977 + }, + { + "epoch": 1.423155829096423, + "grad_norm": 0.4632330536842346, + "learning_rate": 9.572908569905883e-05, + "loss": 0.6206, + "step": 7978 + }, + { + "epoch": 1.4233342253144232, + "grad_norm": 0.46430516242980957, + "learning_rate": 9.5673959920905e-05, + "loss": 0.5605, + "step": 7979 + }, + { + "epoch": 1.4235126215324234, + "grad_norm": 0.5031800270080566, + "learning_rate": 9.561884626345205e-05, + "loss": 0.541, + "step": 7980 + }, + { + "epoch": 1.4236910177504236, + "grad_norm": 0.5108554363250732, + "learning_rate": 9.556374473102839e-05, + "loss": 0.5613, + "step": 7981 + }, + { + "epoch": 1.4238694139684238, + "grad_norm": 0.5381075143814087, + "learning_rate": 9.550865532796185e-05, + "loss": 0.635, + "step": 7982 + }, + { + "epoch": 1.424047810186424, + "grad_norm": 0.522135317325592, + "learning_rate": 9.545357805857901e-05, + "loss": 0.5884, + "step": 7983 + }, + { + "epoch": 1.4242262064044242, + "grad_norm": 0.5213260650634766, + "learning_rate": 9.539851292720562e-05, + "loss": 0.6214, + "step": 7984 + }, + { + "epoch": 1.4244046026224244, + "grad_norm": 0.4458264112472534, + "learning_rate": 9.53434599381665e-05, + "loss": 0.47, + "step": 7985 + }, + { + "epoch": 1.4245829988404246, + "grad_norm": 0.543368935585022, + "learning_rate": 9.528841909578545e-05, + "loss": 0.6638, + "step": 7986 + }, + { + "epoch": 1.4247613950584248, + "grad_norm": 0.5813699960708618, + "learning_rate": 9.523339040438536e-05, + "loss": 0.7685, + "step": 7987 + }, + { + "epoch": 1.424939791276425, + "grad_norm": 0.5484562516212463, + "learning_rate": 9.517837386828829e-05, + "loss": 0.6551, + "step": 7988 + }, + { + "epoch": 1.4251181874944252, + "grad_norm": 0.5792138576507568, + "learning_rate": 9.512336949181502e-05, + "loss": 0.6305, + "step": 7989 + }, + { + "epoch": 1.4252965837124254, + "grad_norm": 0.5417539477348328, + "learning_rate": 9.506837727928577e-05, + "loss": 0.7499, + "step": 7990 + }, + { + "epoch": 1.4254749799304256, + "grad_norm": 0.49307334423065186, + "learning_rate": 9.501339723501937e-05, + "loss": 0.5378, + "step": 7991 + }, + { + "epoch": 1.4256533761484256, + "grad_norm": 0.44642525911331177, + "learning_rate": 9.495842936333415e-05, + "loss": 0.4978, + "step": 7992 + }, + { + "epoch": 1.4258317723664258, + "grad_norm": 0.5179886817932129, + "learning_rate": 9.49034736685471e-05, + "loss": 0.6413, + "step": 7993 + }, + { + "epoch": 1.426010168584426, + "grad_norm": 0.48655420541763306, + "learning_rate": 9.484853015497458e-05, + "loss": 0.5178, + "step": 7994 + }, + { + "epoch": 1.4261885648024262, + "grad_norm": 0.4873103201389313, + "learning_rate": 9.47935988269317e-05, + "loss": 0.5396, + "step": 7995 + }, + { + "epoch": 1.4263669610204264, + "grad_norm": 0.49392032623291016, + "learning_rate": 9.47386796887327e-05, + "loss": 0.5208, + "step": 7996 + }, + { + "epoch": 1.4265453572384266, + "grad_norm": 0.5058830380439758, + "learning_rate": 9.468377274469109e-05, + "loss": 0.5961, + "step": 7997 + }, + { + "epoch": 1.4267237534564268, + "grad_norm": 0.4766574800014496, + "learning_rate": 9.462887799911904e-05, + "loss": 0.3617, + "step": 7998 + }, + { + "epoch": 1.426902149674427, + "grad_norm": 0.44295012950897217, + "learning_rate": 9.457399545632814e-05, + "loss": 0.5054, + "step": 7999 + }, + { + "epoch": 1.427080545892427, + "grad_norm": 0.5294760465621948, + "learning_rate": 9.451912512062863e-05, + "loss": 0.6995, + "step": 8000 + }, + { + "epoch": 1.4272589421104271, + "grad_norm": 0.46700701117515564, + "learning_rate": 9.446426699633023e-05, + "loss": 0.613, + "step": 8001 + }, + { + "epoch": 1.4274373383284273, + "grad_norm": 0.5137273669242859, + "learning_rate": 9.44094210877413e-05, + "loss": 0.7029, + "step": 8002 + }, + { + "epoch": 1.4276157345464275, + "grad_norm": 0.5057253837585449, + "learning_rate": 9.435458739916946e-05, + "loss": 0.6471, + "step": 8003 + }, + { + "epoch": 1.4277941307644277, + "grad_norm": 0.5789628028869629, + "learning_rate": 9.429976593492146e-05, + "loss": 0.7758, + "step": 8004 + }, + { + "epoch": 1.427972526982428, + "grad_norm": 0.5334276556968689, + "learning_rate": 9.424495669930272e-05, + "loss": 0.6382, + "step": 8005 + }, + { + "epoch": 1.4281509232004281, + "grad_norm": 0.5051296949386597, + "learning_rate": 9.419015969661814e-05, + "loss": 0.6057, + "step": 8006 + }, + { + "epoch": 1.4283293194184283, + "grad_norm": 0.5126160979270935, + "learning_rate": 9.413537493117142e-05, + "loss": 0.7181, + "step": 8007 + }, + { + "epoch": 1.4285077156364285, + "grad_norm": 0.5015937089920044, + "learning_rate": 9.408060240726515e-05, + "loss": 0.5093, + "step": 8008 + }, + { + "epoch": 1.4286861118544287, + "grad_norm": 0.5069268941879272, + "learning_rate": 9.402584212920134e-05, + "loss": 0.5187, + "step": 8009 + }, + { + "epoch": 1.428864508072429, + "grad_norm": 0.5026482343673706, + "learning_rate": 9.397109410128071e-05, + "loss": 0.5594, + "step": 8010 + }, + { + "epoch": 1.4290429042904291, + "grad_norm": 0.4498440623283386, + "learning_rate": 9.391635832780329e-05, + "loss": 0.4379, + "step": 8011 + }, + { + "epoch": 1.4292213005084293, + "grad_norm": 0.494722843170166, + "learning_rate": 9.386163481306784e-05, + "loss": 0.5101, + "step": 8012 + }, + { + "epoch": 1.4293996967264295, + "grad_norm": 0.4928201138973236, + "learning_rate": 9.380692356137247e-05, + "loss": 0.5785, + "step": 8013 + }, + { + "epoch": 1.4295780929444295, + "grad_norm": 0.49480581283569336, + "learning_rate": 9.375222457701401e-05, + "loss": 0.6379, + "step": 8014 + }, + { + "epoch": 1.4297564891624297, + "grad_norm": 0.46592390537261963, + "learning_rate": 9.369753786428869e-05, + "loss": 0.5308, + "step": 8015 + }, + { + "epoch": 1.42993488538043, + "grad_norm": 0.5155026912689209, + "learning_rate": 9.364286342749151e-05, + "loss": 0.6286, + "step": 8016 + }, + { + "epoch": 1.43011328159843, + "grad_norm": 0.4269189238548279, + "learning_rate": 9.358820127091636e-05, + "loss": 0.4702, + "step": 8017 + }, + { + "epoch": 1.4302916778164303, + "grad_norm": 0.4979037344455719, + "learning_rate": 9.353355139885672e-05, + "loss": 0.5917, + "step": 8018 + }, + { + "epoch": 1.4304700740344305, + "grad_norm": 0.43645283579826355, + "learning_rate": 9.347891381560455e-05, + "loss": 0.5333, + "step": 8019 + }, + { + "epoch": 1.4306484702524307, + "grad_norm": 0.47989028692245483, + "learning_rate": 9.342428852545123e-05, + "loss": 0.6838, + "step": 8020 + }, + { + "epoch": 1.430826866470431, + "grad_norm": 0.41572171449661255, + "learning_rate": 9.336967553268691e-05, + "loss": 0.3637, + "step": 8021 + }, + { + "epoch": 1.4310052626884309, + "grad_norm": 0.5728147029876709, + "learning_rate": 9.33150748416008e-05, + "loss": 0.7151, + "step": 8022 + }, + { + "epoch": 1.431183658906431, + "grad_norm": 0.48712360858917236, + "learning_rate": 9.326048645648134e-05, + "loss": 0.5196, + "step": 8023 + }, + { + "epoch": 1.4313620551244313, + "grad_norm": 0.44054681062698364, + "learning_rate": 9.320591038161574e-05, + "loss": 0.4682, + "step": 8024 + }, + { + "epoch": 1.4315404513424315, + "grad_norm": 0.48739486932754517, + "learning_rate": 9.315134662129058e-05, + "loss": 0.5445, + "step": 8025 + }, + { + "epoch": 1.4317188475604317, + "grad_norm": 0.44029682874679565, + "learning_rate": 9.309679517979102e-05, + "loss": 0.4334, + "step": 8026 + }, + { + "epoch": 1.4318972437784319, + "grad_norm": 0.47833380103111267, + "learning_rate": 9.304225606140176e-05, + "loss": 0.4728, + "step": 8027 + }, + { + "epoch": 1.432075639996432, + "grad_norm": 0.5727766156196594, + "learning_rate": 9.298772927040618e-05, + "loss": 0.7175, + "step": 8028 + }, + { + "epoch": 1.4322540362144323, + "grad_norm": 0.4518079161643982, + "learning_rate": 9.293321481108668e-05, + "loss": 0.5195, + "step": 8029 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 0.526343584060669, + "learning_rate": 9.2878712687725e-05, + "loss": 0.588, + "step": 8030 + }, + { + "epoch": 1.4326108286504327, + "grad_norm": 0.4975213408470154, + "learning_rate": 9.282422290460149e-05, + "loss": 0.6872, + "step": 8031 + }, + { + "epoch": 1.4327892248684329, + "grad_norm": 0.5089072585105896, + "learning_rate": 9.276974546599599e-05, + "loss": 0.6198, + "step": 8032 + }, + { + "epoch": 1.432967621086433, + "grad_norm": 0.4496549665927887, + "learning_rate": 9.27152803761869e-05, + "loss": 0.501, + "step": 8033 + }, + { + "epoch": 1.4331460173044333, + "grad_norm": 0.5993954539299011, + "learning_rate": 9.266082763945202e-05, + "loss": 0.7204, + "step": 8034 + }, + { + "epoch": 1.4333244135224334, + "grad_norm": 0.5009292960166931, + "learning_rate": 9.260638726006812e-05, + "loss": 0.5112, + "step": 8035 + }, + { + "epoch": 1.4335028097404334, + "grad_norm": 0.49264848232269287, + "learning_rate": 9.255195924231075e-05, + "loss": 0.5595, + "step": 8036 + }, + { + "epoch": 1.4336812059584336, + "grad_norm": 0.481372594833374, + "learning_rate": 9.249754359045484e-05, + "loss": 0.5221, + "step": 8037 + }, + { + "epoch": 1.4338596021764338, + "grad_norm": 0.5572468638420105, + "learning_rate": 9.244314030877398e-05, + "loss": 0.7088, + "step": 8038 + }, + { + "epoch": 1.434037998394434, + "grad_norm": 0.4595695436000824, + "learning_rate": 9.238874940154116e-05, + "loss": 0.4761, + "step": 8039 + }, + { + "epoch": 1.4342163946124342, + "grad_norm": 0.43319255113601685, + "learning_rate": 9.233437087302806e-05, + "loss": 0.5051, + "step": 8040 + }, + { + "epoch": 1.4343947908304344, + "grad_norm": 0.4756569266319275, + "learning_rate": 9.228000472750569e-05, + "loss": 0.5404, + "step": 8041 + }, + { + "epoch": 1.4345731870484346, + "grad_norm": 0.5660195350646973, + "learning_rate": 9.22256509692439e-05, + "loss": 0.6584, + "step": 8042 + }, + { + "epoch": 1.4347515832664348, + "grad_norm": 0.5442212224006653, + "learning_rate": 9.21713096025115e-05, + "loss": 0.7664, + "step": 8043 + }, + { + "epoch": 1.4349299794844348, + "grad_norm": 0.4902845323085785, + "learning_rate": 9.211698063157659e-05, + "loss": 0.6131, + "step": 8044 + }, + { + "epoch": 1.435108375702435, + "grad_norm": 0.47630754113197327, + "learning_rate": 9.206266406070601e-05, + "loss": 0.5899, + "step": 8045 + }, + { + "epoch": 1.4352867719204352, + "grad_norm": 0.4589233696460724, + "learning_rate": 9.200835989416589e-05, + "loss": 0.5258, + "step": 8046 + }, + { + "epoch": 1.4354651681384354, + "grad_norm": 0.5233177542686462, + "learning_rate": 9.195406813622115e-05, + "loss": 0.6126, + "step": 8047 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.475238561630249, + "learning_rate": 9.189978879113587e-05, + "loss": 0.6048, + "step": 8048 + }, + { + "epoch": 1.4358219605744358, + "grad_norm": 0.47865399718284607, + "learning_rate": 9.184552186317321e-05, + "loss": 0.6042, + "step": 8049 + }, + { + "epoch": 1.436000356792436, + "grad_norm": 0.49072253704071045, + "learning_rate": 9.179126735659513e-05, + "loss": 0.4852, + "step": 8050 + }, + { + "epoch": 1.4361787530104362, + "grad_norm": 0.47413280606269836, + "learning_rate": 9.173702527566292e-05, + "loss": 0.4936, + "step": 8051 + }, + { + "epoch": 1.4363571492284364, + "grad_norm": 0.5141640901565552, + "learning_rate": 9.168279562463655e-05, + "loss": 0.6562, + "step": 8052 + }, + { + "epoch": 1.4365355454464366, + "grad_norm": 0.5093448758125305, + "learning_rate": 9.162857840777535e-05, + "loss": 0.7238, + "step": 8053 + }, + { + "epoch": 1.4367139416644368, + "grad_norm": 0.45524322986602783, + "learning_rate": 9.157437362933749e-05, + "loss": 0.5362, + "step": 8054 + }, + { + "epoch": 1.436892337882437, + "grad_norm": 0.479604572057724, + "learning_rate": 9.152018129358003e-05, + "loss": 0.4765, + "step": 8055 + }, + { + "epoch": 1.4370707341004372, + "grad_norm": 0.4629632532596588, + "learning_rate": 9.146600140475944e-05, + "loss": 0.5064, + "step": 8056 + }, + { + "epoch": 1.4372491303184374, + "grad_norm": 0.595065176486969, + "learning_rate": 9.141183396713077e-05, + "loss": 0.7567, + "step": 8057 + }, + { + "epoch": 1.4374275265364373, + "grad_norm": 0.47794783115386963, + "learning_rate": 9.13576789849485e-05, + "loss": 0.4937, + "step": 8058 + }, + { + "epoch": 1.4376059227544375, + "grad_norm": 0.5082597732543945, + "learning_rate": 9.130353646246578e-05, + "loss": 0.7144, + "step": 8059 + }, + { + "epoch": 1.4377843189724377, + "grad_norm": 0.46265289187431335, + "learning_rate": 9.124940640393512e-05, + "loss": 0.5559, + "step": 8060 + }, + { + "epoch": 1.437962715190438, + "grad_norm": 0.5391396880149841, + "learning_rate": 9.119528881360764e-05, + "loss": 0.4127, + "step": 8061 + }, + { + "epoch": 1.4381411114084381, + "grad_norm": 0.4582943618297577, + "learning_rate": 9.114118369573393e-05, + "loss": 0.3894, + "step": 8062 + }, + { + "epoch": 1.4383195076264383, + "grad_norm": 0.460657000541687, + "learning_rate": 9.108709105456323e-05, + "loss": 0.4666, + "step": 8063 + }, + { + "epoch": 1.4384979038444385, + "grad_norm": 0.5155767798423767, + "learning_rate": 9.103301089434399e-05, + "loss": 0.673, + "step": 8064 + }, + { + "epoch": 1.4386763000624387, + "grad_norm": 0.41279855370521545, + "learning_rate": 9.097894321932377e-05, + "loss": 0.3958, + "step": 8065 + }, + { + "epoch": 1.4388546962804387, + "grad_norm": 0.4912382662296295, + "learning_rate": 9.09248880337489e-05, + "loss": 0.6629, + "step": 8066 + }, + { + "epoch": 1.439033092498439, + "grad_norm": 0.5284416675567627, + "learning_rate": 9.087084534186476e-05, + "loss": 0.6543, + "step": 8067 + }, + { + "epoch": 1.4392114887164391, + "grad_norm": 0.49840492010116577, + "learning_rate": 9.081681514791609e-05, + "loss": 0.65, + "step": 8068 + }, + { + "epoch": 1.4393898849344393, + "grad_norm": 0.4940994679927826, + "learning_rate": 9.076279745614613e-05, + "loss": 0.5973, + "step": 8069 + }, + { + "epoch": 1.4395682811524395, + "grad_norm": 0.49154722690582275, + "learning_rate": 9.070879227079765e-05, + "loss": 0.6159, + "step": 8070 + }, + { + "epoch": 1.4397466773704397, + "grad_norm": 0.4630894660949707, + "learning_rate": 9.065479959611194e-05, + "loss": 0.5687, + "step": 8071 + }, + { + "epoch": 1.43992507358844, + "grad_norm": 0.7224597930908203, + "learning_rate": 9.060081943632983e-05, + "loss": 0.499, + "step": 8072 + }, + { + "epoch": 1.44010346980644, + "grad_norm": 0.47265711426734924, + "learning_rate": 9.054685179569066e-05, + "loss": 0.5876, + "step": 8073 + }, + { + "epoch": 1.4402818660244403, + "grad_norm": 0.5103265643119812, + "learning_rate": 9.049289667843325e-05, + "loss": 0.7843, + "step": 8074 + }, + { + "epoch": 1.4404602622424405, + "grad_norm": 0.5575939416885376, + "learning_rate": 9.043895408879505e-05, + "loss": 0.7539, + "step": 8075 + }, + { + "epoch": 1.4406386584604407, + "grad_norm": 0.46837523579597473, + "learning_rate": 9.038502403101268e-05, + "loss": 0.5082, + "step": 8076 + }, + { + "epoch": 1.440817054678441, + "grad_norm": 0.5041645169258118, + "learning_rate": 9.033110650932188e-05, + "loss": 0.635, + "step": 8077 + }, + { + "epoch": 1.440995450896441, + "grad_norm": 0.4759989380836487, + "learning_rate": 9.027720152795721e-05, + "loss": 0.4989, + "step": 8078 + }, + { + "epoch": 1.4411738471144413, + "grad_norm": 0.49040788412094116, + "learning_rate": 9.022330909115239e-05, + "loss": 0.5498, + "step": 8079 + }, + { + "epoch": 1.4413522433324413, + "grad_norm": 0.44871506094932556, + "learning_rate": 9.01694292031402e-05, + "loss": 0.5627, + "step": 8080 + }, + { + "epoch": 1.4415306395504415, + "grad_norm": 0.45151621103286743, + "learning_rate": 9.011556186815217e-05, + "loss": 0.4521, + "step": 8081 + }, + { + "epoch": 1.4417090357684417, + "grad_norm": 0.5003564953804016, + "learning_rate": 9.006170709041922e-05, + "loss": 0.5973, + "step": 8082 + }, + { + "epoch": 1.4418874319864419, + "grad_norm": 0.559359073638916, + "learning_rate": 9.000786487417084e-05, + "loss": 0.65, + "step": 8083 + }, + { + "epoch": 1.442065828204442, + "grad_norm": 0.48790860176086426, + "learning_rate": 8.995403522363602e-05, + "loss": 0.5723, + "step": 8084 + }, + { + "epoch": 1.4422442244224423, + "grad_norm": 0.5341598391532898, + "learning_rate": 8.99002181430423e-05, + "loss": 0.7464, + "step": 8085 + }, + { + "epoch": 1.4424226206404425, + "grad_norm": 0.5069512128829956, + "learning_rate": 8.984641363661666e-05, + "loss": 0.7237, + "step": 8086 + }, + { + "epoch": 1.4426010168584427, + "grad_norm": 0.5093790888786316, + "learning_rate": 8.979262170858474e-05, + "loss": 0.6849, + "step": 8087 + }, + { + "epoch": 1.4427794130764429, + "grad_norm": 0.4905139207839966, + "learning_rate": 8.973884236317131e-05, + "loss": 0.6719, + "step": 8088 + }, + { + "epoch": 1.4429578092944428, + "grad_norm": 0.5070201754570007, + "learning_rate": 8.968507560460029e-05, + "loss": 0.6181, + "step": 8089 + }, + { + "epoch": 1.443136205512443, + "grad_norm": 0.47273147106170654, + "learning_rate": 8.963132143709437e-05, + "loss": 0.5333, + "step": 8090 + }, + { + "epoch": 1.4433146017304432, + "grad_norm": 0.5406144261360168, + "learning_rate": 8.957757986487556e-05, + "loss": 0.6466, + "step": 8091 + }, + { + "epoch": 1.4434929979484434, + "grad_norm": 0.506829023361206, + "learning_rate": 8.95238508921645e-05, + "loss": 0.5386, + "step": 8092 + }, + { + "epoch": 1.4436713941664436, + "grad_norm": 0.515042781829834, + "learning_rate": 8.94701345231812e-05, + "loss": 0.6152, + "step": 8093 + }, + { + "epoch": 1.4438497903844438, + "grad_norm": 0.48475679755210876, + "learning_rate": 8.941643076214436e-05, + "loss": 0.6374, + "step": 8094 + }, + { + "epoch": 1.444028186602444, + "grad_norm": 0.5149679780006409, + "learning_rate": 8.936273961327198e-05, + "loss": 0.6776, + "step": 8095 + }, + { + "epoch": 1.4442065828204442, + "grad_norm": 0.610506534576416, + "learning_rate": 8.930906108078096e-05, + "loss": 0.5515, + "step": 8096 + }, + { + "epoch": 1.4443849790384444, + "grad_norm": 0.4772319793701172, + "learning_rate": 8.925539516888706e-05, + "loss": 0.5777, + "step": 8097 + }, + { + "epoch": 1.4445633752564446, + "grad_norm": 0.42948904633522034, + "learning_rate": 8.920174188180533e-05, + "loss": 0.4874, + "step": 8098 + }, + { + "epoch": 1.4447417714744448, + "grad_norm": 0.44335153698921204, + "learning_rate": 8.91481012237495e-05, + "loss": 0.3909, + "step": 8099 + }, + { + "epoch": 1.444920167692445, + "grad_norm": 0.5608864426612854, + "learning_rate": 8.909447319893269e-05, + "loss": 0.7402, + "step": 8100 + }, + { + "epoch": 1.4450985639104452, + "grad_norm": 0.5135005116462708, + "learning_rate": 8.904085781156671e-05, + "loss": 0.6809, + "step": 8101 + }, + { + "epoch": 1.4452769601284452, + "grad_norm": 0.5145747661590576, + "learning_rate": 8.898725506586239e-05, + "loss": 0.6553, + "step": 8102 + }, + { + "epoch": 1.4454553563464454, + "grad_norm": 0.46137893199920654, + "learning_rate": 8.893366496602984e-05, + "loss": 0.4671, + "step": 8103 + }, + { + "epoch": 1.4456337525644456, + "grad_norm": 0.47708660364151, + "learning_rate": 8.888008751627788e-05, + "loss": 0.4231, + "step": 8104 + }, + { + "epoch": 1.4458121487824458, + "grad_norm": 0.5265963077545166, + "learning_rate": 8.882652272081457e-05, + "loss": 0.6085, + "step": 8105 + }, + { + "epoch": 1.445990545000446, + "grad_norm": 0.5075244307518005, + "learning_rate": 8.877297058384673e-05, + "loss": 0.53, + "step": 8106 + }, + { + "epoch": 1.4461689412184462, + "grad_norm": 0.504497230052948, + "learning_rate": 8.871943110958048e-05, + "loss": 0.5873, + "step": 8107 + }, + { + "epoch": 1.4463473374364464, + "grad_norm": 0.5384894013404846, + "learning_rate": 8.866590430222072e-05, + "loss": 0.542, + "step": 8108 + }, + { + "epoch": 1.4465257336544466, + "grad_norm": 0.4829096496105194, + "learning_rate": 8.861239016597123e-05, + "loss": 0.5085, + "step": 8109 + }, + { + "epoch": 1.4467041298724468, + "grad_norm": 0.5008237361907959, + "learning_rate": 8.855888870503535e-05, + "loss": 0.5474, + "step": 8110 + }, + { + "epoch": 1.4468825260904468, + "grad_norm": 0.4817045331001282, + "learning_rate": 8.850539992361475e-05, + "loss": 0.4752, + "step": 8111 + }, + { + "epoch": 1.447060922308447, + "grad_norm": 0.5164246559143066, + "learning_rate": 8.845192382591067e-05, + "loss": 0.6815, + "step": 8112 + }, + { + "epoch": 1.4472393185264472, + "grad_norm": 0.4688229262828827, + "learning_rate": 8.839846041612295e-05, + "loss": 0.5217, + "step": 8113 + }, + { + "epoch": 1.4474177147444474, + "grad_norm": 0.4715273082256317, + "learning_rate": 8.834500969845052e-05, + "loss": 0.5708, + "step": 8114 + }, + { + "epoch": 1.4475961109624476, + "grad_norm": 0.42476388812065125, + "learning_rate": 8.829157167709157e-05, + "loss": 0.4669, + "step": 8115 + }, + { + "epoch": 1.4477745071804478, + "grad_norm": 0.5047100782394409, + "learning_rate": 8.823814635624288e-05, + "loss": 0.7172, + "step": 8116 + }, + { + "epoch": 1.447952903398448, + "grad_norm": 0.49228885769844055, + "learning_rate": 8.81847337401007e-05, + "loss": 0.5684, + "step": 8117 + }, + { + "epoch": 1.4481312996164482, + "grad_norm": 0.5029509663581848, + "learning_rate": 8.813133383285977e-05, + "loss": 0.6608, + "step": 8118 + }, + { + "epoch": 1.4483096958344484, + "grad_norm": 0.5024793148040771, + "learning_rate": 8.807794663871429e-05, + "loss": 0.6344, + "step": 8119 + }, + { + "epoch": 1.4484880920524486, + "grad_norm": 0.49262115359306335, + "learning_rate": 8.802457216185717e-05, + "loss": 0.6377, + "step": 8120 + }, + { + "epoch": 1.4486664882704487, + "grad_norm": 0.43594643473625183, + "learning_rate": 8.797121040648049e-05, + "loss": 0.527, + "step": 8121 + }, + { + "epoch": 1.448844884488449, + "grad_norm": 0.4278947114944458, + "learning_rate": 8.791786137677524e-05, + "loss": 0.5052, + "step": 8122 + }, + { + "epoch": 1.4490232807064491, + "grad_norm": 0.4548480212688446, + "learning_rate": 8.78645250769313e-05, + "loss": 0.4939, + "step": 8123 + }, + { + "epoch": 1.4492016769244491, + "grad_norm": 0.5257616639137268, + "learning_rate": 8.781120151113788e-05, + "loss": 0.6517, + "step": 8124 + }, + { + "epoch": 1.4493800731424493, + "grad_norm": 0.5141300559043884, + "learning_rate": 8.775789068358283e-05, + "loss": 0.5612, + "step": 8125 + }, + { + "epoch": 1.4495584693604495, + "grad_norm": 0.464578241109848, + "learning_rate": 8.770459259845323e-05, + "loss": 0.4524, + "step": 8126 + }, + { + "epoch": 1.4497368655784497, + "grad_norm": 0.48500585556030273, + "learning_rate": 8.765130725993514e-05, + "loss": 0.6593, + "step": 8127 + }, + { + "epoch": 1.44991526179645, + "grad_norm": 0.4844876825809479, + "learning_rate": 8.759803467221348e-05, + "loss": 0.589, + "step": 8128 + }, + { + "epoch": 1.4500936580144501, + "grad_norm": 0.5171689391136169, + "learning_rate": 8.754477483947232e-05, + "loss": 0.6696, + "step": 8129 + }, + { + "epoch": 1.4502720542324503, + "grad_norm": 0.5518413186073303, + "learning_rate": 8.749152776589459e-05, + "loss": 0.7856, + "step": 8130 + }, + { + "epoch": 1.4504504504504505, + "grad_norm": 0.4898933470249176, + "learning_rate": 8.74382934556624e-05, + "loss": 0.6053, + "step": 8131 + }, + { + "epoch": 1.4506288466684507, + "grad_norm": 0.4357226490974426, + "learning_rate": 8.738507191295658e-05, + "loss": 0.5202, + "step": 8132 + }, + { + "epoch": 1.4508072428864507, + "grad_norm": 0.5392679572105408, + "learning_rate": 8.733186314195734e-05, + "loss": 0.6614, + "step": 8133 + }, + { + "epoch": 1.4509856391044509, + "grad_norm": 0.566256582736969, + "learning_rate": 8.72786671468436e-05, + "loss": 0.7098, + "step": 8134 + }, + { + "epoch": 1.451164035322451, + "grad_norm": 0.45817068219184875, + "learning_rate": 8.722548393179319e-05, + "loss": 0.5082, + "step": 8135 + }, + { + "epoch": 1.4513424315404513, + "grad_norm": 0.4491539001464844, + "learning_rate": 8.717231350098331e-05, + "loss": 0.4243, + "step": 8136 + }, + { + "epoch": 1.4515208277584515, + "grad_norm": 0.5039847493171692, + "learning_rate": 8.711915585858979e-05, + "loss": 0.5085, + "step": 8137 + }, + { + "epoch": 1.4516992239764517, + "grad_norm": 0.5082625150680542, + "learning_rate": 8.706601100878778e-05, + "loss": 0.5393, + "step": 8138 + }, + { + "epoch": 1.4518776201944519, + "grad_norm": 0.5001698732376099, + "learning_rate": 8.701287895575102e-05, + "loss": 0.6006, + "step": 8139 + }, + { + "epoch": 1.452056016412452, + "grad_norm": 0.4417283535003662, + "learning_rate": 8.695975970365264e-05, + "loss": 0.6028, + "step": 8140 + }, + { + "epoch": 1.4522344126304523, + "grad_norm": 0.46480488777160645, + "learning_rate": 8.690665325666463e-05, + "loss": 0.5018, + "step": 8141 + }, + { + "epoch": 1.4524128088484525, + "grad_norm": 0.5661883354187012, + "learning_rate": 8.685355961895783e-05, + "loss": 0.7788, + "step": 8142 + }, + { + "epoch": 1.4525912050664527, + "grad_norm": 0.46579429507255554, + "learning_rate": 8.680047879470233e-05, + "loss": 0.5646, + "step": 8143 + }, + { + "epoch": 1.4527696012844529, + "grad_norm": 0.4672081768512726, + "learning_rate": 8.67474107880669e-05, + "loss": 0.5924, + "step": 8144 + }, + { + "epoch": 1.452947997502453, + "grad_norm": 0.4720962941646576, + "learning_rate": 8.669435560321968e-05, + "loss": 0.4921, + "step": 8145 + }, + { + "epoch": 1.453126393720453, + "grad_norm": 0.5264710783958435, + "learning_rate": 8.664131324432745e-05, + "loss": 0.6339, + "step": 8146 + }, + { + "epoch": 1.4533047899384532, + "grad_norm": 0.4442679286003113, + "learning_rate": 8.658828371555613e-05, + "loss": 0.4405, + "step": 8147 + }, + { + "epoch": 1.4534831861564534, + "grad_norm": 0.4961048364639282, + "learning_rate": 8.653526702107075e-05, + "loss": 0.5394, + "step": 8148 + }, + { + "epoch": 1.4536615823744536, + "grad_norm": 0.5059047341346741, + "learning_rate": 8.64822631650351e-05, + "loss": 0.5187, + "step": 8149 + }, + { + "epoch": 1.4538399785924538, + "grad_norm": 0.6599757671356201, + "learning_rate": 8.64292721516122e-05, + "loss": 0.6328, + "step": 8150 + }, + { + "epoch": 1.454018374810454, + "grad_norm": 0.48593562841415405, + "learning_rate": 8.637629398496377e-05, + "loss": 0.5988, + "step": 8151 + }, + { + "epoch": 1.4541967710284542, + "grad_norm": 0.5314595103263855, + "learning_rate": 8.632332866925091e-05, + "loss": 0.5746, + "step": 8152 + }, + { + "epoch": 1.4543751672464544, + "grad_norm": 0.4576443135738373, + "learning_rate": 8.627037620863328e-05, + "loss": 0.4491, + "step": 8153 + }, + { + "epoch": 1.4545535634644546, + "grad_norm": 0.4969932734966278, + "learning_rate": 8.621743660726994e-05, + "loss": 0.6862, + "step": 8154 + }, + { + "epoch": 1.4547319596824546, + "grad_norm": 0.5143519043922424, + "learning_rate": 8.616450986931857e-05, + "loss": 0.4361, + "step": 8155 + }, + { + "epoch": 1.4549103559004548, + "grad_norm": 0.498677134513855, + "learning_rate": 8.611159599893609e-05, + "loss": 0.5173, + "step": 8156 + }, + { + "epoch": 1.455088752118455, + "grad_norm": 0.47454512119293213, + "learning_rate": 8.605869500027838e-05, + "loss": 0.6723, + "step": 8157 + }, + { + "epoch": 1.4552671483364552, + "grad_norm": 0.46952202916145325, + "learning_rate": 8.600580687750017e-05, + "loss": 0.4888, + "step": 8158 + }, + { + "epoch": 1.4554455445544554, + "grad_norm": 0.6815268993377686, + "learning_rate": 8.59529316347554e-05, + "loss": 0.5794, + "step": 8159 + }, + { + "epoch": 1.4556239407724556, + "grad_norm": 0.5202351808547974, + "learning_rate": 8.590006927619676e-05, + "loss": 0.6447, + "step": 8160 + }, + { + "epoch": 1.4558023369904558, + "grad_norm": 0.4786362051963806, + "learning_rate": 8.584721980597599e-05, + "loss": 0.5809, + "step": 8161 + }, + { + "epoch": 1.455980733208456, + "grad_norm": 0.5115570425987244, + "learning_rate": 8.579438322824403e-05, + "loss": 0.6302, + "step": 8162 + }, + { + "epoch": 1.4561591294264562, + "grad_norm": 0.4989685118198395, + "learning_rate": 8.574155954715047e-05, + "loss": 0.5402, + "step": 8163 + }, + { + "epoch": 1.4563375256444564, + "grad_norm": 0.5098851919174194, + "learning_rate": 8.568874876684418e-05, + "loss": 0.6773, + "step": 8164 + }, + { + "epoch": 1.4565159218624566, + "grad_norm": 0.5459485650062561, + "learning_rate": 8.56359508914728e-05, + "loss": 0.7716, + "step": 8165 + }, + { + "epoch": 1.4566943180804568, + "grad_norm": 0.5095139145851135, + "learning_rate": 8.55831659251832e-05, + "loss": 0.7628, + "step": 8166 + }, + { + "epoch": 1.456872714298457, + "grad_norm": 0.5003282427787781, + "learning_rate": 8.553039387212097e-05, + "loss": 0.5369, + "step": 8167 + }, + { + "epoch": 1.457051110516457, + "grad_norm": 0.5022057890892029, + "learning_rate": 8.547763473643074e-05, + "loss": 0.5863, + "step": 8168 + }, + { + "epoch": 1.4572295067344572, + "grad_norm": 0.4498676061630249, + "learning_rate": 8.542488852225638e-05, + "loss": 0.4954, + "step": 8169 + }, + { + "epoch": 1.4574079029524574, + "grad_norm": 0.6011500358581543, + "learning_rate": 8.537215523374037e-05, + "loss": 0.7241, + "step": 8170 + }, + { + "epoch": 1.4575862991704576, + "grad_norm": 0.48579999804496765, + "learning_rate": 8.531943487502445e-05, + "loss": 0.4106, + "step": 8171 + }, + { + "epoch": 1.4577646953884578, + "grad_norm": 0.5279344320297241, + "learning_rate": 8.52667274502493e-05, + "loss": 0.5385, + "step": 8172 + }, + { + "epoch": 1.457943091606458, + "grad_norm": 0.5536198616027832, + "learning_rate": 8.521403296355443e-05, + "loss": 0.7606, + "step": 8173 + }, + { + "epoch": 1.4581214878244582, + "grad_norm": 0.5520446300506592, + "learning_rate": 8.516135141907858e-05, + "loss": 0.5819, + "step": 8174 + }, + { + "epoch": 1.4582998840424584, + "grad_norm": 0.49602624773979187, + "learning_rate": 8.510868282095916e-05, + "loss": 0.6204, + "step": 8175 + }, + { + "epoch": 1.4584782802604586, + "grad_norm": 0.4096802771091461, + "learning_rate": 8.505602717333291e-05, + "loss": 0.3461, + "step": 8176 + }, + { + "epoch": 1.4586566764784585, + "grad_norm": 0.5201975107192993, + "learning_rate": 8.500338448033524e-05, + "loss": 0.5463, + "step": 8177 + }, + { + "epoch": 1.4588350726964587, + "grad_norm": 0.45560964941978455, + "learning_rate": 8.495075474610081e-05, + "loss": 0.4585, + "step": 8178 + }, + { + "epoch": 1.459013468914459, + "grad_norm": 0.4644407629966736, + "learning_rate": 8.489813797476303e-05, + "loss": 0.5049, + "step": 8179 + }, + { + "epoch": 1.4591918651324591, + "grad_norm": 0.4632987678050995, + "learning_rate": 8.484553417045448e-05, + "loss": 0.4836, + "step": 8180 + }, + { + "epoch": 1.4593702613504593, + "grad_norm": 0.5432924628257751, + "learning_rate": 8.479294333730664e-05, + "loss": 0.6128, + "step": 8181 + }, + { + "epoch": 1.4595486575684595, + "grad_norm": 0.4162793755531311, + "learning_rate": 8.474036547944985e-05, + "loss": 0.4007, + "step": 8182 + }, + { + "epoch": 1.4597270537864597, + "grad_norm": 0.49239131808280945, + "learning_rate": 8.468780060101372e-05, + "loss": 0.6786, + "step": 8183 + }, + { + "epoch": 1.45990545000446, + "grad_norm": 0.5322994589805603, + "learning_rate": 8.463524870612649e-05, + "loss": 0.6764, + "step": 8184 + }, + { + "epoch": 1.4600838462224601, + "grad_norm": 0.5249794125556946, + "learning_rate": 8.458270979891578e-05, + "loss": 0.5932, + "step": 8185 + }, + { + "epoch": 1.4602622424404603, + "grad_norm": 0.5151904821395874, + "learning_rate": 8.453018388350772e-05, + "loss": 0.5464, + "step": 8186 + }, + { + "epoch": 1.4604406386584605, + "grad_norm": 0.5483924746513367, + "learning_rate": 8.447767096402787e-05, + "loss": 0.6301, + "step": 8187 + }, + { + "epoch": 1.4606190348764607, + "grad_norm": 0.4792023301124573, + "learning_rate": 8.442517104460057e-05, + "loss": 0.5793, + "step": 8188 + }, + { + "epoch": 1.460797431094461, + "grad_norm": 0.5382273197174072, + "learning_rate": 8.437268412934898e-05, + "loss": 0.7128, + "step": 8189 + }, + { + "epoch": 1.460975827312461, + "grad_norm": 0.5009669065475464, + "learning_rate": 8.432021022239561e-05, + "loss": 0.5538, + "step": 8190 + }, + { + "epoch": 1.461154223530461, + "grad_norm": 0.4763016998767853, + "learning_rate": 8.426774932786154e-05, + "loss": 0.5062, + "step": 8191 + }, + { + "epoch": 1.4613326197484613, + "grad_norm": 0.49051937460899353, + "learning_rate": 8.421530144986722e-05, + "loss": 0.6787, + "step": 8192 + }, + { + "epoch": 1.4615110159664615, + "grad_norm": 0.48328647017478943, + "learning_rate": 8.416286659253178e-05, + "loss": 0.5788, + "step": 8193 + }, + { + "epoch": 1.4616894121844617, + "grad_norm": 0.48622655868530273, + "learning_rate": 8.411044475997331e-05, + "loss": 0.6081, + "step": 8194 + }, + { + "epoch": 1.4618678084024619, + "grad_norm": 0.45523685216903687, + "learning_rate": 8.405803595630926e-05, + "loss": 0.5096, + "step": 8195 + }, + { + "epoch": 1.462046204620462, + "grad_norm": 0.5314157009124756, + "learning_rate": 8.400564018565554e-05, + "loss": 0.6395, + "step": 8196 + }, + { + "epoch": 1.4622246008384623, + "grad_norm": 0.48079803586006165, + "learning_rate": 8.395325745212747e-05, + "loss": 0.5513, + "step": 8197 + }, + { + "epoch": 1.4624029970564625, + "grad_norm": 0.4904074966907501, + "learning_rate": 8.390088775983906e-05, + "loss": 0.566, + "step": 8198 + }, + { + "epoch": 1.4625813932744625, + "grad_norm": 0.48788025975227356, + "learning_rate": 8.384853111290352e-05, + "loss": 0.6323, + "step": 8199 + }, + { + "epoch": 1.4627597894924627, + "grad_norm": 0.455021470785141, + "learning_rate": 8.379618751543274e-05, + "loss": 0.5147, + "step": 8200 + }, + { + "epoch": 1.4629381857104629, + "grad_norm": 0.47679826617240906, + "learning_rate": 8.374385697153791e-05, + "loss": 0.6236, + "step": 8201 + }, + { + "epoch": 1.463116581928463, + "grad_norm": 0.5282472968101501, + "learning_rate": 8.369153948532907e-05, + "loss": 0.6563, + "step": 8202 + }, + { + "epoch": 1.4632949781464633, + "grad_norm": 0.5126857757568359, + "learning_rate": 8.363923506091506e-05, + "loss": 0.5623, + "step": 8203 + }, + { + "epoch": 1.4634733743644635, + "grad_norm": 0.476513147354126, + "learning_rate": 8.358694370240402e-05, + "loss": 0.443, + "step": 8204 + }, + { + "epoch": 1.4636517705824637, + "grad_norm": 0.5799571871757507, + "learning_rate": 8.353466541390273e-05, + "loss": 0.5796, + "step": 8205 + }, + { + "epoch": 1.4638301668004639, + "grad_norm": 0.41350480914115906, + "learning_rate": 8.348240019951728e-05, + "loss": 0.3979, + "step": 8206 + }, + { + "epoch": 1.464008563018464, + "grad_norm": 0.47292932868003845, + "learning_rate": 8.343014806335245e-05, + "loss": 0.4637, + "step": 8207 + }, + { + "epoch": 1.4641869592364642, + "grad_norm": 0.5536729693412781, + "learning_rate": 8.3377909009512e-05, + "loss": 0.6503, + "step": 8208 + }, + { + "epoch": 1.4643653554544644, + "grad_norm": 0.5005719065666199, + "learning_rate": 8.3325683042099e-05, + "loss": 0.4994, + "step": 8209 + }, + { + "epoch": 1.4645437516724646, + "grad_norm": 0.47958654165267944, + "learning_rate": 8.327347016521503e-05, + "loss": 0.5329, + "step": 8210 + }, + { + "epoch": 1.4647221478904648, + "grad_norm": 0.5664541125297546, + "learning_rate": 8.322127038296104e-05, + "loss": 0.766, + "step": 8211 + }, + { + "epoch": 1.4649005441084648, + "grad_norm": 0.5163717865943909, + "learning_rate": 8.316908369943663e-05, + "loss": 0.5823, + "step": 8212 + }, + { + "epoch": 1.465078940326465, + "grad_norm": 0.4811464548110962, + "learning_rate": 8.311691011874067e-05, + "loss": 0.55, + "step": 8213 + }, + { + "epoch": 1.4652573365444652, + "grad_norm": 0.4442557990550995, + "learning_rate": 8.306474964497076e-05, + "loss": 0.4014, + "step": 8214 + }, + { + "epoch": 1.4654357327624654, + "grad_norm": 0.5611222982406616, + "learning_rate": 8.301260228222351e-05, + "loss": 0.7519, + "step": 8215 + }, + { + "epoch": 1.4656141289804656, + "grad_norm": 0.4814377427101135, + "learning_rate": 8.29604680345947e-05, + "loss": 0.5494, + "step": 8216 + }, + { + "epoch": 1.4657925251984658, + "grad_norm": 0.42738616466522217, + "learning_rate": 8.290834690617868e-05, + "loss": 0.5058, + "step": 8217 + }, + { + "epoch": 1.465970921416466, + "grad_norm": 0.6096622347831726, + "learning_rate": 8.285623890106936e-05, + "loss": 0.573, + "step": 8218 + }, + { + "epoch": 1.4661493176344662, + "grad_norm": 0.43758609890937805, + "learning_rate": 8.280414402335909e-05, + "loss": 0.4614, + "step": 8219 + }, + { + "epoch": 1.4663277138524664, + "grad_norm": 0.48316100239753723, + "learning_rate": 8.275206227713936e-05, + "loss": 0.5601, + "step": 8220 + }, + { + "epoch": 1.4665061100704664, + "grad_norm": 0.5487798452377319, + "learning_rate": 8.269999366650071e-05, + "loss": 0.7356, + "step": 8221 + }, + { + "epoch": 1.4666845062884666, + "grad_norm": 0.4505595862865448, + "learning_rate": 8.264793819553252e-05, + "loss": 0.4967, + "step": 8222 + }, + { + "epoch": 1.4668629025064668, + "grad_norm": 0.46539705991744995, + "learning_rate": 8.259589586832331e-05, + "loss": 0.4694, + "step": 8223 + }, + { + "epoch": 1.467041298724467, + "grad_norm": 0.5119168758392334, + "learning_rate": 8.254386668896033e-05, + "loss": 0.5304, + "step": 8224 + }, + { + "epoch": 1.4672196949424672, + "grad_norm": 0.494151771068573, + "learning_rate": 8.249185066153006e-05, + "loss": 0.5928, + "step": 8225 + }, + { + "epoch": 1.4673980911604674, + "grad_norm": 0.4593886137008667, + "learning_rate": 8.243984779011779e-05, + "loss": 0.5024, + "step": 8226 + }, + { + "epoch": 1.4675764873784676, + "grad_norm": 0.46240317821502686, + "learning_rate": 8.238785807880767e-05, + "loss": 0.534, + "step": 8227 + }, + { + "epoch": 1.4677548835964678, + "grad_norm": 0.49743345379829407, + "learning_rate": 8.233588153168312e-05, + "loss": 0.5577, + "step": 8228 + }, + { + "epoch": 1.467933279814468, + "grad_norm": 0.43200600147247314, + "learning_rate": 8.228391815282619e-05, + "loss": 0.3676, + "step": 8229 + }, + { + "epoch": 1.4681116760324682, + "grad_norm": 0.46943768858909607, + "learning_rate": 8.223196794631826e-05, + "loss": 0.4637, + "step": 8230 + }, + { + "epoch": 1.4682900722504684, + "grad_norm": 0.5156434178352356, + "learning_rate": 8.218003091623927e-05, + "loss": 0.6567, + "step": 8231 + }, + { + "epoch": 1.4684684684684686, + "grad_norm": 0.46780145168304443, + "learning_rate": 8.212810706666846e-05, + "loss": 0.5733, + "step": 8232 + }, + { + "epoch": 1.4686468646864688, + "grad_norm": 0.5769681334495544, + "learning_rate": 8.207619640168393e-05, + "loss": 0.8275, + "step": 8233 + }, + { + "epoch": 1.4688252609044687, + "grad_norm": 0.4969267249107361, + "learning_rate": 8.202429892536261e-05, + "loss": 0.5933, + "step": 8234 + }, + { + "epoch": 1.469003657122469, + "grad_norm": 0.5537621378898621, + "learning_rate": 8.197241464178065e-05, + "loss": 0.7404, + "step": 8235 + }, + { + "epoch": 1.4691820533404691, + "grad_norm": 0.5195968747138977, + "learning_rate": 8.192054355501282e-05, + "loss": 0.6725, + "step": 8236 + }, + { + "epoch": 1.4693604495584693, + "grad_norm": 0.47820261120796204, + "learning_rate": 8.18686856691333e-05, + "loss": 0.6093, + "step": 8237 + }, + { + "epoch": 1.4695388457764695, + "grad_norm": 0.45387136936187744, + "learning_rate": 8.181684098821474e-05, + "loss": 0.552, + "step": 8238 + }, + { + "epoch": 1.4697172419944697, + "grad_norm": 0.49222466349601746, + "learning_rate": 8.17650095163292e-05, + "loss": 0.6341, + "step": 8239 + }, + { + "epoch": 1.46989563821247, + "grad_norm": 0.47406187653541565, + "learning_rate": 8.171319125754745e-05, + "loss": 0.5789, + "step": 8240 + }, + { + "epoch": 1.4700740344304701, + "grad_norm": 0.49534159898757935, + "learning_rate": 8.16613862159391e-05, + "loss": 0.5961, + "step": 8241 + }, + { + "epoch": 1.4702524306484703, + "grad_norm": 0.5249592661857605, + "learning_rate": 8.160959439557316e-05, + "loss": 0.6454, + "step": 8242 + }, + { + "epoch": 1.4704308268664703, + "grad_norm": 0.4379402697086334, + "learning_rate": 8.155781580051714e-05, + "loss": 0.4289, + "step": 8243 + }, + { + "epoch": 1.4706092230844705, + "grad_norm": 0.4104413092136383, + "learning_rate": 8.150605043483783e-05, + "loss": 0.4201, + "step": 8244 + }, + { + "epoch": 1.4707876193024707, + "grad_norm": 0.5442726612091064, + "learning_rate": 8.145429830260073e-05, + "loss": 0.7512, + "step": 8245 + }, + { + "epoch": 1.470966015520471, + "grad_norm": 0.47776398062705994, + "learning_rate": 8.140255940787059e-05, + "loss": 0.5361, + "step": 8246 + }, + { + "epoch": 1.471144411738471, + "grad_norm": 0.49612924456596375, + "learning_rate": 8.13508337547108e-05, + "loss": 0.6154, + "step": 8247 + }, + { + "epoch": 1.4713228079564713, + "grad_norm": 0.45897966623306274, + "learning_rate": 8.129912134718398e-05, + "loss": 0.4906, + "step": 8248 + }, + { + "epoch": 1.4715012041744715, + "grad_norm": 0.5512779355049133, + "learning_rate": 8.124742218935164e-05, + "loss": 0.739, + "step": 8249 + }, + { + "epoch": 1.4716796003924717, + "grad_norm": 0.5072031617164612, + "learning_rate": 8.119573628527404e-05, + "loss": 0.5826, + "step": 8250 + }, + { + "epoch": 1.471857996610472, + "grad_norm": 0.47292739152908325, + "learning_rate": 8.114406363901078e-05, + "loss": 0.4861, + "step": 8251 + }, + { + "epoch": 1.472036392828472, + "grad_norm": 0.448589026927948, + "learning_rate": 8.109240425462008e-05, + "loss": 0.4831, + "step": 8252 + }, + { + "epoch": 1.4722147890464723, + "grad_norm": 0.4658273756504059, + "learning_rate": 8.104075813615918e-05, + "loss": 0.5328, + "step": 8253 + }, + { + "epoch": 1.4723931852644725, + "grad_norm": 0.5529447197914124, + "learning_rate": 8.098912528768452e-05, + "loss": 0.6661, + "step": 8254 + }, + { + "epoch": 1.4725715814824727, + "grad_norm": 0.46816307306289673, + "learning_rate": 8.093750571325112e-05, + "loss": 0.4844, + "step": 8255 + }, + { + "epoch": 1.4727499777004727, + "grad_norm": 0.5499048829078674, + "learning_rate": 8.088589941691338e-05, + "loss": 0.5624, + "step": 8256 + }, + { + "epoch": 1.4729283739184729, + "grad_norm": 0.5012791752815247, + "learning_rate": 8.083430640272424e-05, + "loss": 0.5128, + "step": 8257 + }, + { + "epoch": 1.473106770136473, + "grad_norm": 0.5420527458190918, + "learning_rate": 8.078272667473593e-05, + "loss": 0.5709, + "step": 8258 + }, + { + "epoch": 1.4732851663544733, + "grad_norm": 0.4874211549758911, + "learning_rate": 8.073116023699939e-05, + "loss": 0.5259, + "step": 8259 + }, + { + "epoch": 1.4734635625724735, + "grad_norm": 0.5653133988380432, + "learning_rate": 8.067960709356478e-05, + "loss": 0.6445, + "step": 8260 + }, + { + "epoch": 1.4736419587904737, + "grad_norm": 0.4977559745311737, + "learning_rate": 8.062806724848093e-05, + "loss": 0.5627, + "step": 8261 + }, + { + "epoch": 1.4738203550084739, + "grad_norm": 0.5231120586395264, + "learning_rate": 8.057654070579573e-05, + "loss": 0.5859, + "step": 8262 + }, + { + "epoch": 1.473998751226474, + "grad_norm": 0.5144016146659851, + "learning_rate": 8.052502746955612e-05, + "loss": 0.6625, + "step": 8263 + }, + { + "epoch": 1.4741771474444743, + "grad_norm": 0.5300917029380798, + "learning_rate": 8.04735275438079e-05, + "loss": 0.6414, + "step": 8264 + }, + { + "epoch": 1.4743555436624742, + "grad_norm": 0.506809651851654, + "learning_rate": 8.042204093259597e-05, + "loss": 0.5545, + "step": 8265 + }, + { + "epoch": 1.4745339398804744, + "grad_norm": 0.4742717742919922, + "learning_rate": 8.037056763996398e-05, + "loss": 0.5692, + "step": 8266 + }, + { + "epoch": 1.4747123360984746, + "grad_norm": 0.477117657661438, + "learning_rate": 8.031910766995451e-05, + "loss": 0.4557, + "step": 8267 + }, + { + "epoch": 1.4748907323164748, + "grad_norm": 0.4841883182525635, + "learning_rate": 8.02676610266094e-05, + "loss": 0.6635, + "step": 8268 + }, + { + "epoch": 1.475069128534475, + "grad_norm": 0.4822656810283661, + "learning_rate": 8.021622771396905e-05, + "loss": 0.557, + "step": 8269 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 0.49790987372398376, + "learning_rate": 8.01648077360732e-05, + "loss": 0.6972, + "step": 8270 + }, + { + "epoch": 1.4754259209704754, + "grad_norm": 0.5201764702796936, + "learning_rate": 8.01134010969602e-05, + "loss": 0.6559, + "step": 8271 + }, + { + "epoch": 1.4756043171884756, + "grad_norm": 0.4643043577671051, + "learning_rate": 8.006200780066763e-05, + "loss": 0.4562, + "step": 8272 + }, + { + "epoch": 1.4757827134064758, + "grad_norm": 0.4936201870441437, + "learning_rate": 8.001062785123184e-05, + "loss": 0.519, + "step": 8273 + }, + { + "epoch": 1.475961109624476, + "grad_norm": 0.5245823860168457, + "learning_rate": 7.995926125268813e-05, + "loss": 0.7527, + "step": 8274 + }, + { + "epoch": 1.4761395058424762, + "grad_norm": 0.45136356353759766, + "learning_rate": 7.99079080090709e-05, + "loss": 0.4744, + "step": 8275 + }, + { + "epoch": 1.4763179020604764, + "grad_norm": 0.5103075504302979, + "learning_rate": 7.985656812441336e-05, + "loss": 0.5456, + "step": 8276 + }, + { + "epoch": 1.4764962982784766, + "grad_norm": 0.5783809423446655, + "learning_rate": 7.980524160274776e-05, + "loss": 0.6718, + "step": 8277 + }, + { + "epoch": 1.4766746944964766, + "grad_norm": 0.41144779324531555, + "learning_rate": 7.975392844810523e-05, + "loss": 0.4706, + "step": 8278 + }, + { + "epoch": 1.4768530907144768, + "grad_norm": 0.5507053732872009, + "learning_rate": 7.970262866451583e-05, + "loss": 0.6462, + "step": 8279 + }, + { + "epoch": 1.477031486932477, + "grad_norm": 0.5084280967712402, + "learning_rate": 7.965134225600881e-05, + "loss": 0.5376, + "step": 8280 + }, + { + "epoch": 1.4772098831504772, + "grad_norm": 0.46495214104652405, + "learning_rate": 7.960006922661197e-05, + "loss": 0.4167, + "step": 8281 + }, + { + "epoch": 1.4773882793684774, + "grad_norm": 0.54592365026474, + "learning_rate": 7.954880958035245e-05, + "loss": 0.6447, + "step": 8282 + }, + { + "epoch": 1.4775666755864776, + "grad_norm": 0.588589608669281, + "learning_rate": 7.949756332125599e-05, + "loss": 0.7427, + "step": 8283 + }, + { + "epoch": 1.4777450718044778, + "grad_norm": 0.5428383350372314, + "learning_rate": 7.944633045334763e-05, + "loss": 0.628, + "step": 8284 + }, + { + "epoch": 1.477923468022478, + "grad_norm": 0.45825469493865967, + "learning_rate": 7.939511098065097e-05, + "loss": 0.4774, + "step": 8285 + }, + { + "epoch": 1.4781018642404782, + "grad_norm": 0.591995358467102, + "learning_rate": 7.934390490718898e-05, + "loss": 0.5751, + "step": 8286 + }, + { + "epoch": 1.4782802604584782, + "grad_norm": 0.5302790403366089, + "learning_rate": 7.929271223698326e-05, + "loss": 0.5787, + "step": 8287 + }, + { + "epoch": 1.4784586566764784, + "grad_norm": 0.5035462379455566, + "learning_rate": 7.924153297405437e-05, + "loss": 0.5414, + "step": 8288 + }, + { + "epoch": 1.4786370528944786, + "grad_norm": 0.4451696574687958, + "learning_rate": 7.919036712242205e-05, + "loss": 0.4648, + "step": 8289 + }, + { + "epoch": 1.4788154491124788, + "grad_norm": 0.5414475798606873, + "learning_rate": 7.913921468610477e-05, + "loss": 0.7727, + "step": 8290 + }, + { + "epoch": 1.478993845330479, + "grad_norm": 0.44653692841529846, + "learning_rate": 7.908807566912007e-05, + "loss": 0.5023, + "step": 8291 + }, + { + "epoch": 1.4791722415484791, + "grad_norm": 0.4529421925544739, + "learning_rate": 7.903695007548432e-05, + "loss": 0.534, + "step": 8292 + }, + { + "epoch": 1.4793506377664793, + "grad_norm": 0.5018486380577087, + "learning_rate": 7.898583790921291e-05, + "loss": 0.514, + "step": 8293 + }, + { + "epoch": 1.4795290339844795, + "grad_norm": 0.4705956280231476, + "learning_rate": 7.893473917432029e-05, + "loss": 0.5485, + "step": 8294 + }, + { + "epoch": 1.4797074302024797, + "grad_norm": 0.4740748107433319, + "learning_rate": 7.888365387481955e-05, + "loss": 0.5393, + "step": 8295 + }, + { + "epoch": 1.47988582642048, + "grad_norm": 0.5045658349990845, + "learning_rate": 7.88325820147231e-05, + "loss": 0.5581, + "step": 8296 + }, + { + "epoch": 1.4800642226384801, + "grad_norm": 0.48949524760246277, + "learning_rate": 7.87815235980419e-05, + "loss": 0.5983, + "step": 8297 + }, + { + "epoch": 1.4802426188564803, + "grad_norm": 0.47559529542922974, + "learning_rate": 7.873047862878624e-05, + "loss": 0.566, + "step": 8298 + }, + { + "epoch": 1.4804210150744805, + "grad_norm": 0.49631282687187195, + "learning_rate": 7.867944711096508e-05, + "loss": 0.5443, + "step": 8299 + }, + { + "epoch": 1.4805994112924805, + "grad_norm": 0.508677065372467, + "learning_rate": 7.862842904858633e-05, + "loss": 0.602, + "step": 8300 + }, + { + "epoch": 1.4807778075104807, + "grad_norm": 0.5043870806694031, + "learning_rate": 7.857742444565713e-05, + "loss": 0.5086, + "step": 8301 + }, + { + "epoch": 1.480956203728481, + "grad_norm": 0.5479679107666016, + "learning_rate": 7.852643330618314e-05, + "loss": 0.6697, + "step": 8302 + }, + { + "epoch": 1.4811345999464811, + "grad_norm": 0.5437744855880737, + "learning_rate": 7.847545563416936e-05, + "loss": 0.6367, + "step": 8303 + }, + { + "epoch": 1.4813129961644813, + "grad_norm": 0.5325936675071716, + "learning_rate": 7.842449143361943e-05, + "loss": 0.6722, + "step": 8304 + }, + { + "epoch": 1.4814913923824815, + "grad_norm": 0.48976537585258484, + "learning_rate": 7.837354070853616e-05, + "loss": 0.6325, + "step": 8305 + }, + { + "epoch": 1.4816697886004817, + "grad_norm": 0.5160368084907532, + "learning_rate": 7.832260346292117e-05, + "loss": 0.5519, + "step": 8306 + }, + { + "epoch": 1.481848184818482, + "grad_norm": 0.49977096915245056, + "learning_rate": 7.827167970077492e-05, + "loss": 0.5907, + "step": 8307 + }, + { + "epoch": 1.482026581036482, + "grad_norm": 0.47831645607948303, + "learning_rate": 7.822076942609707e-05, + "loss": 0.5587, + "step": 8308 + }, + { + "epoch": 1.482204977254482, + "grad_norm": 0.496364951133728, + "learning_rate": 7.816987264288606e-05, + "loss": 0.5853, + "step": 8309 + }, + { + "epoch": 1.4823833734724823, + "grad_norm": 0.45871299505233765, + "learning_rate": 7.811898935513936e-05, + "loss": 0.4607, + "step": 8310 + }, + { + "epoch": 1.4825617696904825, + "grad_norm": 0.4363771677017212, + "learning_rate": 7.806811956685331e-05, + "loss": 0.5019, + "step": 8311 + }, + { + "epoch": 1.4827401659084827, + "grad_norm": 0.5848436951637268, + "learning_rate": 7.801726328202305e-05, + "loss": 0.6789, + "step": 8312 + }, + { + "epoch": 1.4829185621264829, + "grad_norm": 0.4620121121406555, + "learning_rate": 7.796642050464303e-05, + "loss": 0.5479, + "step": 8313 + }, + { + "epoch": 1.483096958344483, + "grad_norm": 0.5338596701622009, + "learning_rate": 7.79155912387062e-05, + "loss": 0.5664, + "step": 8314 + }, + { + "epoch": 1.4832753545624833, + "grad_norm": 0.4988861382007599, + "learning_rate": 7.786477548820489e-05, + "loss": 0.5782, + "step": 8315 + }, + { + "epoch": 1.4834537507804835, + "grad_norm": 0.6170352101325989, + "learning_rate": 7.781397325712994e-05, + "loss": 0.5044, + "step": 8316 + }, + { + "epoch": 1.4836321469984837, + "grad_norm": 0.5399267077445984, + "learning_rate": 7.776318454947154e-05, + "loss": 0.6209, + "step": 8317 + }, + { + "epoch": 1.4838105432164839, + "grad_norm": 0.5562114715576172, + "learning_rate": 7.771240936921839e-05, + "loss": 0.7107, + "step": 8318 + }, + { + "epoch": 1.483988939434484, + "grad_norm": 0.46948572993278503, + "learning_rate": 7.766164772035855e-05, + "loss": 0.5113, + "step": 8319 + }, + { + "epoch": 1.4841673356524843, + "grad_norm": 0.5978415012359619, + "learning_rate": 7.761089960687876e-05, + "loss": 0.6767, + "step": 8320 + }, + { + "epoch": 1.4843457318704845, + "grad_norm": 0.4891349673271179, + "learning_rate": 7.756016503276464e-05, + "loss": 0.5338, + "step": 8321 + }, + { + "epoch": 1.4845241280884847, + "grad_norm": 0.45725369453430176, + "learning_rate": 7.750944400200102e-05, + "loss": 0.4976, + "step": 8322 + }, + { + "epoch": 1.4847025243064846, + "grad_norm": 0.42975348234176636, + "learning_rate": 7.745873651857138e-05, + "loss": 0.4602, + "step": 8323 + }, + { + "epoch": 1.4848809205244848, + "grad_norm": 0.4848777949810028, + "learning_rate": 7.740804258645831e-05, + "loss": 0.6199, + "step": 8324 + }, + { + "epoch": 1.485059316742485, + "grad_norm": 0.4301760792732239, + "learning_rate": 7.735736220964337e-05, + "loss": 0.4882, + "step": 8325 + }, + { + "epoch": 1.4852377129604852, + "grad_norm": 0.4948813021183014, + "learning_rate": 7.730669539210686e-05, + "loss": 0.6615, + "step": 8326 + }, + { + "epoch": 1.4854161091784854, + "grad_norm": 0.4712623059749603, + "learning_rate": 7.725604213782824e-05, + "loss": 0.5958, + "step": 8327 + }, + { + "epoch": 1.4855945053964856, + "grad_norm": 0.484377384185791, + "learning_rate": 7.720540245078567e-05, + "loss": 0.6291, + "step": 8328 + }, + { + "epoch": 1.4857729016144858, + "grad_norm": 0.5532923340797424, + "learning_rate": 7.715477633495649e-05, + "loss": 0.712, + "step": 8329 + }, + { + "epoch": 1.485951297832486, + "grad_norm": 0.4761124849319458, + "learning_rate": 7.71041637943167e-05, + "loss": 0.4219, + "step": 8330 + }, + { + "epoch": 1.486129694050486, + "grad_norm": 0.4717256724834442, + "learning_rate": 7.70535648328416e-05, + "loss": 0.5188, + "step": 8331 + }, + { + "epoch": 1.4863080902684862, + "grad_norm": 0.4593075215816498, + "learning_rate": 7.700297945450507e-05, + "loss": 0.4863, + "step": 8332 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.4873063266277313, + "learning_rate": 7.695240766328002e-05, + "loss": 0.5419, + "step": 8333 + }, + { + "epoch": 1.4866648827044866, + "grad_norm": 0.5233901143074036, + "learning_rate": 7.690184946313846e-05, + "loss": 0.5865, + "step": 8334 + }, + { + "epoch": 1.4868432789224868, + "grad_norm": 0.5119859576225281, + "learning_rate": 7.685130485805112e-05, + "loss": 0.5873, + "step": 8335 + }, + { + "epoch": 1.487021675140487, + "grad_norm": 0.4452389180660248, + "learning_rate": 7.680077385198783e-05, + "loss": 0.3941, + "step": 8336 + }, + { + "epoch": 1.4872000713584872, + "grad_norm": 0.49985215067863464, + "learning_rate": 7.675025644891714e-05, + "loss": 0.586, + "step": 8337 + }, + { + "epoch": 1.4873784675764874, + "grad_norm": 0.5451793074607849, + "learning_rate": 7.669975265280688e-05, + "loss": 0.597, + "step": 8338 + }, + { + "epoch": 1.4875568637944876, + "grad_norm": 0.47509869933128357, + "learning_rate": 7.664926246762335e-05, + "loss": 0.5232, + "step": 8339 + }, + { + "epoch": 1.4877352600124878, + "grad_norm": 0.44898343086242676, + "learning_rate": 7.659878589733216e-05, + "loss": 0.4391, + "step": 8340 + }, + { + "epoch": 1.487913656230488, + "grad_norm": 0.5049835443496704, + "learning_rate": 7.654832294589775e-05, + "loss": 0.5392, + "step": 8341 + }, + { + "epoch": 1.4880920524484882, + "grad_norm": 0.5432078242301941, + "learning_rate": 7.649787361728338e-05, + "loss": 0.694, + "step": 8342 + }, + { + "epoch": 1.4882704486664884, + "grad_norm": 0.4939239025115967, + "learning_rate": 7.644743791545141e-05, + "loss": 0.4921, + "step": 8343 + }, + { + "epoch": 1.4884488448844886, + "grad_norm": 0.43726587295532227, + "learning_rate": 7.639701584436292e-05, + "loss": 0.4466, + "step": 8344 + }, + { + "epoch": 1.4886272411024886, + "grad_norm": 0.48199325799942017, + "learning_rate": 7.634660740797817e-05, + "loss": 0.5736, + "step": 8345 + }, + { + "epoch": 1.4888056373204888, + "grad_norm": 0.4477761387825012, + "learning_rate": 7.629621261025613e-05, + "loss": 0.4244, + "step": 8346 + }, + { + "epoch": 1.488984033538489, + "grad_norm": 0.564702570438385, + "learning_rate": 7.624583145515474e-05, + "loss": 0.7607, + "step": 8347 + }, + { + "epoch": 1.4891624297564892, + "grad_norm": 0.4603510797023773, + "learning_rate": 7.619546394663104e-05, + "loss": 0.4789, + "step": 8348 + }, + { + "epoch": 1.4893408259744894, + "grad_norm": 0.4973684549331665, + "learning_rate": 7.614511008864073e-05, + "loss": 0.5349, + "step": 8349 + }, + { + "epoch": 1.4895192221924896, + "grad_norm": 0.4872196316719055, + "learning_rate": 7.609476988513875e-05, + "loss": 0.5265, + "step": 8350 + }, + { + "epoch": 1.4896976184104898, + "grad_norm": 0.5209683775901794, + "learning_rate": 7.604444334007862e-05, + "loss": 0.6186, + "step": 8351 + }, + { + "epoch": 1.48987601462849, + "grad_norm": 0.48873645067214966, + "learning_rate": 7.599413045741313e-05, + "loss": 0.5037, + "step": 8352 + }, + { + "epoch": 1.49005441084649, + "grad_norm": 0.45188388228416443, + "learning_rate": 7.594383124109375e-05, + "loss": 0.4037, + "step": 8353 + }, + { + "epoch": 1.4902328070644901, + "grad_norm": 0.5492420196533203, + "learning_rate": 7.589354569507081e-05, + "loss": 0.589, + "step": 8354 + }, + { + "epoch": 1.4904112032824903, + "grad_norm": 0.532741904258728, + "learning_rate": 7.584327382329401e-05, + "loss": 0.6725, + "step": 8355 + }, + { + "epoch": 1.4905895995004905, + "grad_norm": 0.5740191340446472, + "learning_rate": 7.579301562971147e-05, + "loss": 0.6417, + "step": 8356 + }, + { + "epoch": 1.4907679957184907, + "grad_norm": 0.4454120099544525, + "learning_rate": 7.57427711182706e-05, + "loss": 0.397, + "step": 8357 + }, + { + "epoch": 1.490946391936491, + "grad_norm": 0.49578696489334106, + "learning_rate": 7.56925402929175e-05, + "loss": 0.4834, + "step": 8358 + }, + { + "epoch": 1.4911247881544911, + "grad_norm": 0.5603286623954773, + "learning_rate": 7.564232315759718e-05, + "loss": 0.4489, + "step": 8359 + }, + { + "epoch": 1.4913031843724913, + "grad_norm": 0.5061764717102051, + "learning_rate": 7.559211971625385e-05, + "loss": 0.5227, + "step": 8360 + }, + { + "epoch": 1.4914815805904915, + "grad_norm": 0.48216429352760315, + "learning_rate": 7.554192997283033e-05, + "loss": 0.605, + "step": 8361 + }, + { + "epoch": 1.4916599768084917, + "grad_norm": 0.47625431418418884, + "learning_rate": 7.549175393126861e-05, + "loss": 0.4631, + "step": 8362 + }, + { + "epoch": 1.491838373026492, + "grad_norm": 0.5075172781944275, + "learning_rate": 7.544159159550937e-05, + "loss": 0.5409, + "step": 8363 + }, + { + "epoch": 1.4920167692444921, + "grad_norm": 0.5858179330825806, + "learning_rate": 7.539144296949246e-05, + "loss": 0.6648, + "step": 8364 + }, + { + "epoch": 1.4921951654624923, + "grad_norm": 0.47895708680152893, + "learning_rate": 7.534130805715644e-05, + "loss": 0.5248, + "step": 8365 + }, + { + "epoch": 1.4923735616804925, + "grad_norm": 0.4535577893257141, + "learning_rate": 7.529118686243897e-05, + "loss": 0.5158, + "step": 8366 + }, + { + "epoch": 1.4925519578984925, + "grad_norm": 0.4920910894870758, + "learning_rate": 7.524107938927652e-05, + "loss": 0.5183, + "step": 8367 + }, + { + "epoch": 1.4927303541164927, + "grad_norm": 0.49673569202423096, + "learning_rate": 7.51909856416044e-05, + "loss": 0.5476, + "step": 8368 + }, + { + "epoch": 1.4929087503344929, + "grad_norm": 0.5008490681648254, + "learning_rate": 7.514090562335712e-05, + "loss": 0.5373, + "step": 8369 + }, + { + "epoch": 1.493087146552493, + "grad_norm": 0.46472302079200745, + "learning_rate": 7.50908393384678e-05, + "loss": 0.4453, + "step": 8370 + }, + { + "epoch": 1.4932655427704933, + "grad_norm": 0.4681701362133026, + "learning_rate": 7.504078679086868e-05, + "loss": 0.5433, + "step": 8371 + }, + { + "epoch": 1.4934439389884935, + "grad_norm": 0.48634573817253113, + "learning_rate": 7.499074798449095e-05, + "loss": 0.4632, + "step": 8372 + }, + { + "epoch": 1.4936223352064937, + "grad_norm": 0.5008024573326111, + "learning_rate": 7.494072292326448e-05, + "loss": 0.5638, + "step": 8373 + }, + { + "epoch": 1.4938007314244939, + "grad_norm": 0.5322297811508179, + "learning_rate": 7.48907116111184e-05, + "loss": 0.642, + "step": 8374 + }, + { + "epoch": 1.4939791276424939, + "grad_norm": 0.5250205993652344, + "learning_rate": 7.484071405198037e-05, + "loss": 0.6071, + "step": 8375 + }, + { + "epoch": 1.494157523860494, + "grad_norm": 0.5197263360023499, + "learning_rate": 7.479073024977736e-05, + "loss": 0.6804, + "step": 8376 + }, + { + "epoch": 1.4943359200784943, + "grad_norm": 0.4836525022983551, + "learning_rate": 7.474076020843496e-05, + "loss": 0.5058, + "step": 8377 + }, + { + "epoch": 1.4945143162964944, + "grad_norm": 0.5223329663276672, + "learning_rate": 7.469080393187786e-05, + "loss": 0.6614, + "step": 8378 + }, + { + "epoch": 1.4946927125144946, + "grad_norm": 0.4857519865036011, + "learning_rate": 7.464086142402959e-05, + "loss": 0.5892, + "step": 8379 + }, + { + "epoch": 1.4948711087324948, + "grad_norm": 0.4820669889450073, + "learning_rate": 7.459093268881254e-05, + "loss": 0.5444, + "step": 8380 + }, + { + "epoch": 1.495049504950495, + "grad_norm": 0.4609326720237732, + "learning_rate": 7.45410177301482e-05, + "loss": 0.4923, + "step": 8381 + }, + { + "epoch": 1.4952279011684952, + "grad_norm": 0.47613245248794556, + "learning_rate": 7.449111655195678e-05, + "loss": 0.5083, + "step": 8382 + }, + { + "epoch": 1.4954062973864954, + "grad_norm": 0.496552973985672, + "learning_rate": 7.444122915815759e-05, + "loss": 0.5721, + "step": 8383 + }, + { + "epoch": 1.4955846936044956, + "grad_norm": 0.493670254945755, + "learning_rate": 7.439135555266866e-05, + "loss": 0.461, + "step": 8384 + }, + { + "epoch": 1.4957630898224958, + "grad_norm": 0.5373759269714355, + "learning_rate": 7.434149573940707e-05, + "loss": 0.5685, + "step": 8385 + }, + { + "epoch": 1.495941486040496, + "grad_norm": 0.5208349227905273, + "learning_rate": 7.429164972228891e-05, + "loss": 0.5965, + "step": 8386 + }, + { + "epoch": 1.4961198822584962, + "grad_norm": 0.5484915971755981, + "learning_rate": 7.424181750522887e-05, + "loss": 0.5944, + "step": 8387 + }, + { + "epoch": 1.4962982784764964, + "grad_norm": 0.5299747586250305, + "learning_rate": 7.419199909214095e-05, + "loss": 0.5318, + "step": 8388 + }, + { + "epoch": 1.4964766746944964, + "grad_norm": 0.5350375175476074, + "learning_rate": 7.414219448693769e-05, + "loss": 0.6534, + "step": 8389 + }, + { + "epoch": 1.4966550709124966, + "grad_norm": 0.5854753255844116, + "learning_rate": 7.409240369353084e-05, + "loss": 0.7582, + "step": 8390 + }, + { + "epoch": 1.4968334671304968, + "grad_norm": 0.5602213144302368, + "learning_rate": 7.404262671583092e-05, + "loss": 0.583, + "step": 8391 + }, + { + "epoch": 1.497011863348497, + "grad_norm": 0.4914671778678894, + "learning_rate": 7.399286355774732e-05, + "loss": 0.5397, + "step": 8392 + }, + { + "epoch": 1.4971902595664972, + "grad_norm": 0.47588059306144714, + "learning_rate": 7.394311422318853e-05, + "loss": 0.4886, + "step": 8393 + }, + { + "epoch": 1.4973686557844974, + "grad_norm": 0.5675820708274841, + "learning_rate": 7.389337871606172e-05, + "loss": 0.6568, + "step": 8394 + }, + { + "epoch": 1.4975470520024976, + "grad_norm": 0.508738100528717, + "learning_rate": 7.384365704027321e-05, + "loss": 0.5592, + "step": 8395 + }, + { + "epoch": 1.4977254482204978, + "grad_norm": 0.4801078140735626, + "learning_rate": 7.379394919972804e-05, + "loss": 0.4698, + "step": 8396 + }, + { + "epoch": 1.4979038444384978, + "grad_norm": 0.48387011885643005, + "learning_rate": 7.374425519833031e-05, + "loss": 0.551, + "step": 8397 + }, + { + "epoch": 1.498082240656498, + "grad_norm": 0.5585319399833679, + "learning_rate": 7.369457503998286e-05, + "loss": 0.7281, + "step": 8398 + }, + { + "epoch": 1.4982606368744982, + "grad_norm": 0.5239479541778564, + "learning_rate": 7.36449087285877e-05, + "loss": 0.6676, + "step": 8399 + }, + { + "epoch": 1.4984390330924984, + "grad_norm": 0.550262451171875, + "learning_rate": 7.359525626804544e-05, + "loss": 0.7478, + "step": 8400 + }, + { + "epoch": 1.4986174293104986, + "grad_norm": 0.45570600032806396, + "learning_rate": 7.354561766225584e-05, + "loss": 0.5162, + "step": 8401 + }, + { + "epoch": 1.4987958255284988, + "grad_norm": 0.43634846806526184, + "learning_rate": 7.349599291511757e-05, + "loss": 0.4103, + "step": 8402 + }, + { + "epoch": 1.498974221746499, + "grad_norm": 0.5149981379508972, + "learning_rate": 7.344638203052798e-05, + "loss": 0.594, + "step": 8403 + }, + { + "epoch": 1.4991526179644992, + "grad_norm": 0.4976111054420471, + "learning_rate": 7.339678501238364e-05, + "loss": 0.5622, + "step": 8404 + }, + { + "epoch": 1.4993310141824994, + "grad_norm": 0.5391610860824585, + "learning_rate": 7.334720186457982e-05, + "loss": 0.6742, + "step": 8405 + }, + { + "epoch": 1.4995094104004996, + "grad_norm": 0.4595610499382019, + "learning_rate": 7.329763259101069e-05, + "loss": 0.4807, + "step": 8406 + }, + { + "epoch": 1.4996878066184998, + "grad_norm": 0.49623727798461914, + "learning_rate": 7.32480771955695e-05, + "loss": 0.6094, + "step": 8407 + }, + { + "epoch": 1.4998662028365, + "grad_norm": 0.47364911437034607, + "learning_rate": 7.319853568214818e-05, + "loss": 0.5401, + "step": 8408 + }, + { + "epoch": 1.5000445990545002, + "grad_norm": 0.4699852764606476, + "learning_rate": 7.314900805463789e-05, + "loss": 0.4449, + "step": 8409 + }, + { + "epoch": 1.5002229952725004, + "grad_norm": 0.4462581276893616, + "learning_rate": 7.30994943169283e-05, + "loss": 0.4451, + "step": 8410 + }, + { + "epoch": 1.5004013914905006, + "grad_norm": 0.5866194367408752, + "learning_rate": 7.304999447290838e-05, + "loss": 0.6961, + "step": 8411 + }, + { + "epoch": 1.5005797877085005, + "grad_norm": 0.4362695515155792, + "learning_rate": 7.300050852646578e-05, + "loss": 0.3869, + "step": 8412 + }, + { + "epoch": 1.5007581839265007, + "grad_norm": 0.6117103099822998, + "learning_rate": 7.295103648148697e-05, + "loss": 0.6624, + "step": 8413 + }, + { + "epoch": 1.500936580144501, + "grad_norm": 0.49935221672058105, + "learning_rate": 7.290157834185763e-05, + "loss": 0.4923, + "step": 8414 + }, + { + "epoch": 1.5011149763625011, + "grad_norm": 0.5208079218864441, + "learning_rate": 7.285213411146205e-05, + "loss": 0.4946, + "step": 8415 + }, + { + "epoch": 1.5012933725805013, + "grad_norm": 0.6199221611022949, + "learning_rate": 7.280270379418363e-05, + "loss": 0.6581, + "step": 8416 + }, + { + "epoch": 1.5014717687985015, + "grad_norm": 0.49500852823257446, + "learning_rate": 7.275328739390465e-05, + "loss": 0.4407, + "step": 8417 + }, + { + "epoch": 1.5016501650165015, + "grad_norm": 0.4843553602695465, + "learning_rate": 7.270388491450616e-05, + "loss": 0.5689, + "step": 8418 + }, + { + "epoch": 1.5018285612345017, + "grad_norm": 0.5777263045310974, + "learning_rate": 7.265449635986831e-05, + "loss": 0.5386, + "step": 8419 + }, + { + "epoch": 1.502006957452502, + "grad_norm": 0.5656114220619202, + "learning_rate": 7.260512173386993e-05, + "loss": 0.6281, + "step": 8420 + }, + { + "epoch": 1.502185353670502, + "grad_norm": 0.48054200410842896, + "learning_rate": 7.255576104038902e-05, + "loss": 0.5009, + "step": 8421 + }, + { + "epoch": 1.5023637498885023, + "grad_norm": 0.47279128432273865, + "learning_rate": 7.25064142833022e-05, + "loss": 0.4902, + "step": 8422 + }, + { + "epoch": 1.5025421461065025, + "grad_norm": 0.5290525555610657, + "learning_rate": 7.24570814664853e-05, + "loss": 0.4631, + "step": 8423 + }, + { + "epoch": 1.5027205423245027, + "grad_norm": 0.46618005633354187, + "learning_rate": 7.240776259381276e-05, + "loss": 0.5708, + "step": 8424 + }, + { + "epoch": 1.502898938542503, + "grad_norm": 0.4757915139198303, + "learning_rate": 7.235845766915819e-05, + "loss": 0.499, + "step": 8425 + }, + { + "epoch": 1.503077334760503, + "grad_norm": 0.4191453754901886, + "learning_rate": 7.23091666963939e-05, + "loss": 0.3913, + "step": 8426 + }, + { + "epoch": 1.5032557309785033, + "grad_norm": 0.4610666036605835, + "learning_rate": 7.225988967939113e-05, + "loss": 0.3548, + "step": 8427 + }, + { + "epoch": 1.5034341271965035, + "grad_norm": 0.488862544298172, + "learning_rate": 7.221062662202018e-05, + "loss": 0.5241, + "step": 8428 + }, + { + "epoch": 1.5036125234145037, + "grad_norm": 0.5716550946235657, + "learning_rate": 7.216137752815005e-05, + "loss": 0.7528, + "step": 8429 + }, + { + "epoch": 1.5037909196325039, + "grad_norm": 0.5017661452293396, + "learning_rate": 7.211214240164887e-05, + "loss": 0.5129, + "step": 8430 + }, + { + "epoch": 1.503969315850504, + "grad_norm": 0.4955231547355652, + "learning_rate": 7.206292124638342e-05, + "loss": 0.5957, + "step": 8431 + }, + { + "epoch": 1.5041477120685043, + "grad_norm": 0.5126091837882996, + "learning_rate": 7.201371406621954e-05, + "loss": 0.6606, + "step": 8432 + }, + { + "epoch": 1.5043261082865045, + "grad_norm": 0.4719708561897278, + "learning_rate": 7.196452086502206e-05, + "loss": 0.4889, + "step": 8433 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 0.4715104401111603, + "learning_rate": 7.191534164665439e-05, + "loss": 0.4824, + "step": 8434 + }, + { + "epoch": 1.5046829007225047, + "grad_norm": 0.49331334233283997, + "learning_rate": 7.186617641497926e-05, + "loss": 0.5115, + "step": 8435 + }, + { + "epoch": 1.5048612969405049, + "grad_norm": 0.4969932436943054, + "learning_rate": 7.181702517385788e-05, + "loss": 0.5298, + "step": 8436 + }, + { + "epoch": 1.505039693158505, + "grad_norm": 0.5154744386672974, + "learning_rate": 7.176788792715074e-05, + "loss": 0.5574, + "step": 8437 + }, + { + "epoch": 1.5052180893765053, + "grad_norm": 0.5841989517211914, + "learning_rate": 7.171876467871699e-05, + "loss": 0.5517, + "step": 8438 + }, + { + "epoch": 1.5053964855945055, + "grad_norm": 0.5319039225578308, + "learning_rate": 7.166965543241466e-05, + "loss": 0.7089, + "step": 8439 + }, + { + "epoch": 1.5055748818125054, + "grad_norm": 0.5384335517883301, + "learning_rate": 7.162056019210095e-05, + "loss": 0.604, + "step": 8440 + }, + { + "epoch": 1.5057532780305056, + "grad_norm": 0.48112642765045166, + "learning_rate": 7.157147896163157e-05, + "loss": 0.464, + "step": 8441 + }, + { + "epoch": 1.5059316742485058, + "grad_norm": 0.519540011882782, + "learning_rate": 7.152241174486154e-05, + "loss": 0.5771, + "step": 8442 + }, + { + "epoch": 1.506110070466506, + "grad_norm": 0.49812787771224976, + "learning_rate": 7.147335854564444e-05, + "loss": 0.5615, + "step": 8443 + }, + { + "epoch": 1.5062884666845062, + "grad_norm": 0.4924754500389099, + "learning_rate": 7.142431936783297e-05, + "loss": 0.4846, + "step": 8444 + }, + { + "epoch": 1.5064668629025064, + "grad_norm": 0.5007926225662231, + "learning_rate": 7.137529421527852e-05, + "loss": 0.5509, + "step": 8445 + }, + { + "epoch": 1.5066452591205066, + "grad_norm": 0.4786141514778137, + "learning_rate": 7.132628309183165e-05, + "loss": 0.4272, + "step": 8446 + }, + { + "epoch": 1.5068236553385068, + "grad_norm": 0.4895467758178711, + "learning_rate": 7.127728600134164e-05, + "loss": 0.4545, + "step": 8447 + }, + { + "epoch": 1.507002051556507, + "grad_norm": 0.4716068506240845, + "learning_rate": 7.122830294765664e-05, + "loss": 0.4106, + "step": 8448 + }, + { + "epoch": 1.5071804477745072, + "grad_norm": 0.5841825008392334, + "learning_rate": 7.117933393462384e-05, + "loss": 0.7319, + "step": 8449 + }, + { + "epoch": 1.5073588439925074, + "grad_norm": 0.4899286925792694, + "learning_rate": 7.113037896608923e-05, + "loss": 0.4271, + "step": 8450 + }, + { + "epoch": 1.5075372402105076, + "grad_norm": 0.5296789407730103, + "learning_rate": 7.108143804589759e-05, + "loss": 0.5624, + "step": 8451 + }, + { + "epoch": 1.5077156364285078, + "grad_norm": 0.48504510521888733, + "learning_rate": 7.10325111778929e-05, + "loss": 0.5338, + "step": 8452 + }, + { + "epoch": 1.507894032646508, + "grad_norm": 0.5016950368881226, + "learning_rate": 7.098359836591764e-05, + "loss": 0.5561, + "step": 8453 + }, + { + "epoch": 1.5080724288645082, + "grad_norm": 0.5088602900505066, + "learning_rate": 7.093469961381365e-05, + "loss": 0.6215, + "step": 8454 + }, + { + "epoch": 1.5082508250825084, + "grad_norm": 0.6125203371047974, + "learning_rate": 7.088581492542121e-05, + "loss": 0.6588, + "step": 8455 + }, + { + "epoch": 1.5084292213005084, + "grad_norm": 0.8469102382659912, + "learning_rate": 7.083694430457988e-05, + "loss": 0.5289, + "step": 8456 + }, + { + "epoch": 1.5086076175185086, + "grad_norm": 0.49744218587875366, + "learning_rate": 7.078808775512774e-05, + "loss": 0.6164, + "step": 8457 + }, + { + "epoch": 1.5087860137365088, + "grad_norm": 0.5175665616989136, + "learning_rate": 7.073924528090214e-05, + "loss": 0.5147, + "step": 8458 + }, + { + "epoch": 1.508964409954509, + "grad_norm": 0.492880254983902, + "learning_rate": 7.06904168857391e-05, + "loss": 0.5535, + "step": 8459 + }, + { + "epoch": 1.5091428061725092, + "grad_norm": 0.5100347399711609, + "learning_rate": 7.064160257347346e-05, + "loss": 0.6079, + "step": 8460 + }, + { + "epoch": 1.5093212023905094, + "grad_norm": 0.4987751543521881, + "learning_rate": 7.059280234793927e-05, + "loss": 0.5412, + "step": 8461 + }, + { + "epoch": 1.5094995986085094, + "grad_norm": 0.4200221002101898, + "learning_rate": 7.054401621296899e-05, + "loss": 0.3935, + "step": 8462 + }, + { + "epoch": 1.5096779948265096, + "grad_norm": 0.5284053087234497, + "learning_rate": 7.049524417239465e-05, + "loss": 0.6021, + "step": 8463 + }, + { + "epoch": 1.5098563910445097, + "grad_norm": 0.5351274013519287, + "learning_rate": 7.044648623004654e-05, + "loss": 0.6754, + "step": 8464 + }, + { + "epoch": 1.51003478726251, + "grad_norm": 0.5108939409255981, + "learning_rate": 7.039774238975408e-05, + "loss": 0.6219, + "step": 8465 + }, + { + "epoch": 1.5102131834805101, + "grad_norm": 0.5333287119865417, + "learning_rate": 7.034901265534571e-05, + "loss": 0.581, + "step": 8466 + }, + { + "epoch": 1.5103915796985103, + "grad_norm": 0.6006919145584106, + "learning_rate": 7.030029703064849e-05, + "loss": 0.6058, + "step": 8467 + }, + { + "epoch": 1.5105699759165105, + "grad_norm": 0.4753846526145935, + "learning_rate": 7.02515955194887e-05, + "loss": 0.4262, + "step": 8468 + }, + { + "epoch": 1.5107483721345107, + "grad_norm": 0.48568055033683777, + "learning_rate": 7.020290812569119e-05, + "loss": 0.4365, + "step": 8469 + }, + { + "epoch": 1.510926768352511, + "grad_norm": 0.488115519285202, + "learning_rate": 7.015423485307996e-05, + "loss": 0.5195, + "step": 8470 + }, + { + "epoch": 1.5111051645705111, + "grad_norm": 0.5295347571372986, + "learning_rate": 7.010557570547774e-05, + "loss": 0.5334, + "step": 8471 + }, + { + "epoch": 1.5112835607885113, + "grad_norm": 0.49440649151802063, + "learning_rate": 7.00569306867061e-05, + "loss": 0.546, + "step": 8472 + }, + { + "epoch": 1.5114619570065115, + "grad_norm": 0.5544924736022949, + "learning_rate": 7.000829980058576e-05, + "loss": 0.5707, + "step": 8473 + }, + { + "epoch": 1.5116403532245117, + "grad_norm": 0.6297951936721802, + "learning_rate": 6.995968305093603e-05, + "loss": 0.5151, + "step": 8474 + }, + { + "epoch": 1.511818749442512, + "grad_norm": 0.4706231653690338, + "learning_rate": 6.991108044157537e-05, + "loss": 0.5719, + "step": 8475 + }, + { + "epoch": 1.5119971456605121, + "grad_norm": 0.5109243988990784, + "learning_rate": 6.986249197632092e-05, + "loss": 0.6104, + "step": 8476 + }, + { + "epoch": 1.5121755418785123, + "grad_norm": 0.5036676526069641, + "learning_rate": 6.981391765898881e-05, + "loss": 0.6026, + "step": 8477 + }, + { + "epoch": 1.5123539380965123, + "grad_norm": 0.5482192635536194, + "learning_rate": 6.976535749339413e-05, + "loss": 0.5634, + "step": 8478 + }, + { + "epoch": 1.5125323343145125, + "grad_norm": 0.47447332739830017, + "learning_rate": 6.971681148335066e-05, + "loss": 0.4998, + "step": 8479 + }, + { + "epoch": 1.5127107305325127, + "grad_norm": 0.46494314074516296, + "learning_rate": 6.96682796326713e-05, + "loss": 0.4863, + "step": 8480 + }, + { + "epoch": 1.512889126750513, + "grad_norm": 0.525435745716095, + "learning_rate": 6.961976194516759e-05, + "loss": 0.5581, + "step": 8481 + }, + { + "epoch": 1.513067522968513, + "grad_norm": 0.5048510432243347, + "learning_rate": 6.95712584246502e-05, + "loss": 0.5747, + "step": 8482 + }, + { + "epoch": 1.5132459191865133, + "grad_norm": 0.4954485297203064, + "learning_rate": 6.952276907492846e-05, + "loss": 0.504, + "step": 8483 + }, + { + "epoch": 1.5134243154045133, + "grad_norm": 0.5358535051345825, + "learning_rate": 6.947429389981085e-05, + "loss": 0.6579, + "step": 8484 + }, + { + "epoch": 1.5136027116225135, + "grad_norm": 0.4358263611793518, + "learning_rate": 6.942583290310453e-05, + "loss": 0.4034, + "step": 8485 + }, + { + "epoch": 1.5137811078405137, + "grad_norm": 0.43710726499557495, + "learning_rate": 6.937738608861552e-05, + "loss": 0.4148, + "step": 8486 + }, + { + "epoch": 1.5139595040585139, + "grad_norm": 0.4802948832511902, + "learning_rate": 6.932895346014893e-05, + "loss": 0.4326, + "step": 8487 + }, + { + "epoch": 1.514137900276514, + "grad_norm": 0.515270471572876, + "learning_rate": 6.928053502150849e-05, + "loss": 0.4963, + "step": 8488 + }, + { + "epoch": 1.5143162964945143, + "grad_norm": 0.5241411924362183, + "learning_rate": 6.923213077649718e-05, + "loss": 0.5903, + "step": 8489 + }, + { + "epoch": 1.5144946927125145, + "grad_norm": 0.5037186741828918, + "learning_rate": 6.918374072891643e-05, + "loss": 0.6109, + "step": 8490 + }, + { + "epoch": 1.5146730889305147, + "grad_norm": 0.586534857749939, + "learning_rate": 6.913536488256695e-05, + "loss": 0.6432, + "step": 8491 + }, + { + "epoch": 1.5148514851485149, + "grad_norm": 0.49035558104515076, + "learning_rate": 6.9087003241248e-05, + "loss": 0.4847, + "step": 8492 + }, + { + "epoch": 1.515029881366515, + "grad_norm": 0.5941229462623596, + "learning_rate": 6.903865580875795e-05, + "loss": 0.7171, + "step": 8493 + }, + { + "epoch": 1.5152082775845153, + "grad_norm": 0.4788690209388733, + "learning_rate": 6.899032258889409e-05, + "loss": 0.5057, + "step": 8494 + }, + { + "epoch": 1.5153866738025155, + "grad_norm": 0.6357868909835815, + "learning_rate": 6.894200358545233e-05, + "loss": 0.7302, + "step": 8495 + }, + { + "epoch": 1.5155650700205157, + "grad_norm": 0.5005565881729126, + "learning_rate": 6.889369880222776e-05, + "loss": 0.4858, + "step": 8496 + }, + { + "epoch": 1.5157434662385159, + "grad_norm": 0.5777773857116699, + "learning_rate": 6.884540824301416e-05, + "loss": 0.6461, + "step": 8497 + }, + { + "epoch": 1.515921862456516, + "grad_norm": 0.4599872827529907, + "learning_rate": 6.879713191160417e-05, + "loss": 0.5051, + "step": 8498 + }, + { + "epoch": 1.5161002586745163, + "grad_norm": 0.44417500495910645, + "learning_rate": 6.874886981178952e-05, + "loss": 0.4822, + "step": 8499 + }, + { + "epoch": 1.5162786548925162, + "grad_norm": 0.46541309356689453, + "learning_rate": 6.870062194736057e-05, + "loss": 0.4517, + "step": 8500 + }, + { + "epoch": 1.5164570511105164, + "grad_norm": 0.6577228903770447, + "learning_rate": 6.865238832210682e-05, + "loss": 0.5325, + "step": 8501 + }, + { + "epoch": 1.5166354473285166, + "grad_norm": 0.43800440430641174, + "learning_rate": 6.860416893981638e-05, + "loss": 0.4597, + "step": 8502 + }, + { + "epoch": 1.5168138435465168, + "grad_norm": 0.46744707226753235, + "learning_rate": 6.855596380427651e-05, + "loss": 0.5437, + "step": 8503 + }, + { + "epoch": 1.516992239764517, + "grad_norm": 0.48990002274513245, + "learning_rate": 6.85077729192731e-05, + "loss": 0.5202, + "step": 8504 + }, + { + "epoch": 1.5171706359825172, + "grad_norm": 0.49149778485298157, + "learning_rate": 6.845959628859119e-05, + "loss": 0.5856, + "step": 8505 + }, + { + "epoch": 1.5173490322005172, + "grad_norm": 0.5475661158561707, + "learning_rate": 6.841143391601445e-05, + "loss": 0.7378, + "step": 8506 + }, + { + "epoch": 1.5175274284185174, + "grad_norm": 0.5574973821640015, + "learning_rate": 6.836328580532547e-05, + "loss": 0.5967, + "step": 8507 + }, + { + "epoch": 1.5177058246365176, + "grad_norm": 0.522175133228302, + "learning_rate": 6.831515196030588e-05, + "loss": 0.6041, + "step": 8508 + }, + { + "epoch": 1.5178842208545178, + "grad_norm": 0.5970126390457153, + "learning_rate": 6.826703238473605e-05, + "loss": 0.8074, + "step": 8509 + }, + { + "epoch": 1.518062617072518, + "grad_norm": 0.5063888430595398, + "learning_rate": 6.821892708239535e-05, + "loss": 0.5915, + "step": 8510 + }, + { + "epoch": 1.5182410132905182, + "grad_norm": 0.5437260866165161, + "learning_rate": 6.817083605706193e-05, + "loss": 0.6038, + "step": 8511 + }, + { + "epoch": 1.5184194095085184, + "grad_norm": 0.49606233835220337, + "learning_rate": 6.812275931251268e-05, + "loss": 0.5352, + "step": 8512 + }, + { + "epoch": 1.5185978057265186, + "grad_norm": 0.509730339050293, + "learning_rate": 6.807469685252376e-05, + "loss": 0.5743, + "step": 8513 + }, + { + "epoch": 1.5187762019445188, + "grad_norm": 0.5043073296546936, + "learning_rate": 6.802664868086978e-05, + "loss": 0.496, + "step": 8514 + }, + { + "epoch": 1.518954598162519, + "grad_norm": 0.5181002020835876, + "learning_rate": 6.797861480132456e-05, + "loss": 0.5839, + "step": 8515 + }, + { + "epoch": 1.5191329943805192, + "grad_norm": 0.5447046756744385, + "learning_rate": 6.793059521766054e-05, + "loss": 0.6502, + "step": 8516 + }, + { + "epoch": 1.5193113905985194, + "grad_norm": 0.5003826022148132, + "learning_rate": 6.788258993364929e-05, + "loss": 0.5199, + "step": 8517 + }, + { + "epoch": 1.5194897868165196, + "grad_norm": 0.5281699299812317, + "learning_rate": 6.783459895306107e-05, + "loss": 0.6103, + "step": 8518 + }, + { + "epoch": 1.5196681830345198, + "grad_norm": 0.4539282023906708, + "learning_rate": 6.778662227966495e-05, + "loss": 0.4092, + "step": 8519 + }, + { + "epoch": 1.51984657925252, + "grad_norm": 0.49163827300071716, + "learning_rate": 6.773865991722921e-05, + "loss": 0.4445, + "step": 8520 + }, + { + "epoch": 1.5200249754705202, + "grad_norm": 0.47441521286964417, + "learning_rate": 6.76907118695206e-05, + "loss": 0.4945, + "step": 8521 + }, + { + "epoch": 1.5202033716885202, + "grad_norm": 0.6085272431373596, + "learning_rate": 6.76427781403051e-05, + "loss": 0.7252, + "step": 8522 + }, + { + "epoch": 1.5203817679065204, + "grad_norm": 0.5102150440216064, + "learning_rate": 6.759485873334725e-05, + "loss": 0.4936, + "step": 8523 + }, + { + "epoch": 1.5205601641245206, + "grad_norm": 0.48524045944213867, + "learning_rate": 6.754695365241071e-05, + "loss": 0.5631, + "step": 8524 + }, + { + "epoch": 1.5207385603425208, + "grad_norm": 0.433088481426239, + "learning_rate": 6.749906290125799e-05, + "loss": 0.4493, + "step": 8525 + }, + { + "epoch": 1.520916956560521, + "grad_norm": 0.5861080288887024, + "learning_rate": 6.745118648365026e-05, + "loss": 0.6544, + "step": 8526 + }, + { + "epoch": 1.5210953527785211, + "grad_norm": 0.4654527008533478, + "learning_rate": 6.740332440334784e-05, + "loss": 0.4399, + "step": 8527 + }, + { + "epoch": 1.5212737489965211, + "grad_norm": 0.5457956194877625, + "learning_rate": 6.735547666410968e-05, + "loss": 0.6563, + "step": 8528 + }, + { + "epoch": 1.5214521452145213, + "grad_norm": 0.6288897395133972, + "learning_rate": 6.730764326969388e-05, + "loss": 0.7027, + "step": 8529 + }, + { + "epoch": 1.5216305414325215, + "grad_norm": 0.5368294715881348, + "learning_rate": 6.725982422385715e-05, + "loss": 0.6109, + "step": 8530 + }, + { + "epoch": 1.5218089376505217, + "grad_norm": 0.5222885012626648, + "learning_rate": 6.721201953035511e-05, + "loss": 0.5362, + "step": 8531 + }, + { + "epoch": 1.521987333868522, + "grad_norm": 0.49758774042129517, + "learning_rate": 6.716422919294247e-05, + "loss": 0.4813, + "step": 8532 + }, + { + "epoch": 1.5221657300865221, + "grad_norm": 0.5573477745056152, + "learning_rate": 6.71164532153725e-05, + "loss": 0.7868, + "step": 8533 + }, + { + "epoch": 1.5223441263045223, + "grad_norm": 0.46816423535346985, + "learning_rate": 6.706869160139767e-05, + "loss": 0.4346, + "step": 8534 + }, + { + "epoch": 1.5225225225225225, + "grad_norm": 0.521041750907898, + "learning_rate": 6.702094435476902e-05, + "loss": 0.5432, + "step": 8535 + }, + { + "epoch": 1.5227009187405227, + "grad_norm": 0.49728789925575256, + "learning_rate": 6.697321147923671e-05, + "loss": 0.6175, + "step": 8536 + }, + { + "epoch": 1.522879314958523, + "grad_norm": 0.5513471364974976, + "learning_rate": 6.692549297854956e-05, + "loss": 0.6744, + "step": 8537 + }, + { + "epoch": 1.523057711176523, + "grad_norm": 1.1942397356033325, + "learning_rate": 6.68777888564554e-05, + "loss": 0.6059, + "step": 8538 + }, + { + "epoch": 1.5232361073945233, + "grad_norm": 0.5422153472900391, + "learning_rate": 6.683009911670095e-05, + "loss": 0.5448, + "step": 8539 + }, + { + "epoch": 1.5234145036125235, + "grad_norm": 0.5084717869758606, + "learning_rate": 6.678242376303165e-05, + "loss": 0.5318, + "step": 8540 + }, + { + "epoch": 1.5235928998305237, + "grad_norm": 0.4946412444114685, + "learning_rate": 6.673476279919202e-05, + "loss": 0.4886, + "step": 8541 + }, + { + "epoch": 1.523771296048524, + "grad_norm": 0.5950092673301697, + "learning_rate": 6.668711622892515e-05, + "loss": 0.7057, + "step": 8542 + }, + { + "epoch": 1.523949692266524, + "grad_norm": 0.45473888516426086, + "learning_rate": 6.663948405597339e-05, + "loss": 0.4004, + "step": 8543 + }, + { + "epoch": 1.524128088484524, + "grad_norm": 0.5750323534011841, + "learning_rate": 6.659186628407762e-05, + "loss": 0.6046, + "step": 8544 + }, + { + "epoch": 1.5243064847025243, + "grad_norm": 0.5322027802467346, + "learning_rate": 6.654426291697768e-05, + "loss": 0.5181, + "step": 8545 + }, + { + "epoch": 1.5244848809205245, + "grad_norm": 0.4996216893196106, + "learning_rate": 6.649667395841247e-05, + "loss": 0.5104, + "step": 8546 + }, + { + "epoch": 1.5246632771385247, + "grad_norm": 0.5934137105941772, + "learning_rate": 6.644909941211943e-05, + "loss": 0.7318, + "step": 8547 + }, + { + "epoch": 1.5248416733565249, + "grad_norm": 0.4300279915332794, + "learning_rate": 6.640153928183523e-05, + "loss": 0.3769, + "step": 8548 + }, + { + "epoch": 1.525020069574525, + "grad_norm": 0.5084002614021301, + "learning_rate": 6.635399357129501e-05, + "loss": 0.4071, + "step": 8549 + }, + { + "epoch": 1.525198465792525, + "grad_norm": 0.47641390562057495, + "learning_rate": 6.630646228423323e-05, + "loss": 0.6074, + "step": 8550 + }, + { + "epoch": 1.5253768620105252, + "grad_norm": 0.5083361268043518, + "learning_rate": 6.625894542438283e-05, + "loss": 0.657, + "step": 8551 + }, + { + "epoch": 1.5255552582285254, + "grad_norm": 0.5362805128097534, + "learning_rate": 6.621144299547572e-05, + "loss": 0.6543, + "step": 8552 + }, + { + "epoch": 1.5257336544465256, + "grad_norm": 0.46762344241142273, + "learning_rate": 6.616395500124276e-05, + "loss": 0.4705, + "step": 8553 + }, + { + "epoch": 1.5259120506645258, + "grad_norm": 0.41276419162750244, + "learning_rate": 6.611648144541369e-05, + "loss": 0.4374, + "step": 8554 + }, + { + "epoch": 1.526090446882526, + "grad_norm": 0.5008599758148193, + "learning_rate": 6.60690223317171e-05, + "loss": 0.5399, + "step": 8555 + }, + { + "epoch": 1.5262688431005262, + "grad_norm": 0.8646233677864075, + "learning_rate": 6.602157766388034e-05, + "loss": 0.4534, + "step": 8556 + }, + { + "epoch": 1.5264472393185264, + "grad_norm": 0.4763386845588684, + "learning_rate": 6.597414744562963e-05, + "loss": 0.5278, + "step": 8557 + }, + { + "epoch": 1.5266256355365266, + "grad_norm": 0.5339080691337585, + "learning_rate": 6.592673168069027e-05, + "loss": 0.555, + "step": 8558 + }, + { + "epoch": 1.5268040317545268, + "grad_norm": 0.5438008904457092, + "learning_rate": 6.587933037278609e-05, + "loss": 0.6286, + "step": 8559 + }, + { + "epoch": 1.526982427972527, + "grad_norm": 0.4510871469974518, + "learning_rate": 6.583194352564017e-05, + "loss": 0.4141, + "step": 8560 + }, + { + "epoch": 1.5271608241905272, + "grad_norm": 0.5599578619003296, + "learning_rate": 6.578457114297407e-05, + "loss": 0.619, + "step": 8561 + }, + { + "epoch": 1.5273392204085274, + "grad_norm": 0.5268929600715637, + "learning_rate": 6.573721322850854e-05, + "loss": 0.6176, + "step": 8562 + }, + { + "epoch": 1.5275176166265276, + "grad_norm": 0.5147603154182434, + "learning_rate": 6.568986978596291e-05, + "loss": 0.5238, + "step": 8563 + }, + { + "epoch": 1.5276960128445278, + "grad_norm": 0.5306607484817505, + "learning_rate": 6.564254081905571e-05, + "loss": 0.5801, + "step": 8564 + }, + { + "epoch": 1.527874409062528, + "grad_norm": 0.5111916065216064, + "learning_rate": 6.559522633150397e-05, + "loss": 0.5071, + "step": 8565 + }, + { + "epoch": 1.528052805280528, + "grad_norm": 0.48746898770332336, + "learning_rate": 6.554792632702376e-05, + "loss": 0.5161, + "step": 8566 + }, + { + "epoch": 1.5282312014985282, + "grad_norm": 0.48005223274230957, + "learning_rate": 6.55006408093301e-05, + "loss": 0.4149, + "step": 8567 + }, + { + "epoch": 1.5284095977165284, + "grad_norm": 0.5667544007301331, + "learning_rate": 6.545336978213664e-05, + "loss": 0.6735, + "step": 8568 + }, + { + "epoch": 1.5285879939345286, + "grad_norm": 0.5892688035964966, + "learning_rate": 6.54061132491561e-05, + "loss": 0.5779, + "step": 8569 + }, + { + "epoch": 1.5287663901525288, + "grad_norm": 0.4722636342048645, + "learning_rate": 6.535887121410006e-05, + "loss": 0.4264, + "step": 8570 + }, + { + "epoch": 1.528944786370529, + "grad_norm": 0.5367898941040039, + "learning_rate": 6.531164368067874e-05, + "loss": 0.5761, + "step": 8571 + }, + { + "epoch": 1.529123182588529, + "grad_norm": 0.5178304314613342, + "learning_rate": 6.526443065260154e-05, + "loss": 0.5354, + "step": 8572 + }, + { + "epoch": 1.5293015788065292, + "grad_norm": 0.4545149803161621, + "learning_rate": 6.521723213357635e-05, + "loss": 0.3896, + "step": 8573 + }, + { + "epoch": 1.5294799750245294, + "grad_norm": 0.5091972947120667, + "learning_rate": 6.517004812731034e-05, + "loss": 0.6283, + "step": 8574 + }, + { + "epoch": 1.5296583712425296, + "grad_norm": 0.5601230263710022, + "learning_rate": 6.512287863750912e-05, + "loss": 0.6376, + "step": 8575 + }, + { + "epoch": 1.5298367674605298, + "grad_norm": 0.44082170724868774, + "learning_rate": 6.507572366787753e-05, + "loss": 0.412, + "step": 8576 + }, + { + "epoch": 1.53001516367853, + "grad_norm": 0.6141606569290161, + "learning_rate": 6.502858322211902e-05, + "loss": 0.7171, + "step": 8577 + }, + { + "epoch": 1.5301935598965302, + "grad_norm": 0.5599898099899292, + "learning_rate": 6.498145730393592e-05, + "loss": 0.6881, + "step": 8578 + }, + { + "epoch": 1.5303719561145304, + "grad_norm": 0.5843207836151123, + "learning_rate": 6.493434591702962e-05, + "loss": 0.7688, + "step": 8579 + }, + { + "epoch": 1.5305503523325306, + "grad_norm": 0.4394163191318512, + "learning_rate": 6.488724906510008e-05, + "loss": 0.4879, + "step": 8580 + }, + { + "epoch": 1.5307287485505308, + "grad_norm": 0.49832600355148315, + "learning_rate": 6.484016675184639e-05, + "loss": 0.5856, + "step": 8581 + }, + { + "epoch": 1.530907144768531, + "grad_norm": 0.5916674137115479, + "learning_rate": 6.479309898096627e-05, + "loss": 0.5649, + "step": 8582 + }, + { + "epoch": 1.5310855409865312, + "grad_norm": 0.4655032455921173, + "learning_rate": 6.474604575615653e-05, + "loss": 0.5289, + "step": 8583 + }, + { + "epoch": 1.5312639372045314, + "grad_norm": 0.48161542415618896, + "learning_rate": 6.469900708111254e-05, + "loss": 0.4909, + "step": 8584 + }, + { + "epoch": 1.5314423334225316, + "grad_norm": 0.564775288105011, + "learning_rate": 6.465198295952881e-05, + "loss": 0.5441, + "step": 8585 + }, + { + "epoch": 1.5316207296405318, + "grad_norm": 0.4698559641838074, + "learning_rate": 6.460497339509864e-05, + "loss": 0.487, + "step": 8586 + }, + { + "epoch": 1.531799125858532, + "grad_norm": 0.45102497935295105, + "learning_rate": 6.4557978391514e-05, + "loss": 0.5289, + "step": 8587 + }, + { + "epoch": 1.531977522076532, + "grad_norm": 0.5399623513221741, + "learning_rate": 6.451099795246604e-05, + "loss": 0.7627, + "step": 8588 + }, + { + "epoch": 1.5321559182945321, + "grad_norm": 0.5812605023384094, + "learning_rate": 6.446403208164436e-05, + "loss": 0.7045, + "step": 8589 + }, + { + "epoch": 1.5323343145125323, + "grad_norm": 0.5008137822151184, + "learning_rate": 6.441708078273787e-05, + "loss": 0.4697, + "step": 8590 + }, + { + "epoch": 1.5325127107305325, + "grad_norm": 0.5091111660003662, + "learning_rate": 6.437014405943397e-05, + "loss": 0.4132, + "step": 8591 + }, + { + "epoch": 1.5326911069485327, + "grad_norm": 0.6219770312309265, + "learning_rate": 6.432322191541901e-05, + "loss": 0.5969, + "step": 8592 + }, + { + "epoch": 1.532869503166533, + "grad_norm": 0.5382397770881653, + "learning_rate": 6.427631435437836e-05, + "loss": 0.5745, + "step": 8593 + }, + { + "epoch": 1.533047899384533, + "grad_norm": 0.5575965642929077, + "learning_rate": 6.422942137999598e-05, + "loss": 0.4856, + "step": 8594 + }, + { + "epoch": 1.533226295602533, + "grad_norm": 0.5452744960784912, + "learning_rate": 6.418254299595499e-05, + "loss": 0.5125, + "step": 8595 + }, + { + "epoch": 1.5334046918205333, + "grad_norm": 0.5527557134628296, + "learning_rate": 6.413567920593705e-05, + "loss": 0.6013, + "step": 8596 + }, + { + "epoch": 1.5335830880385335, + "grad_norm": 0.47276362776756287, + "learning_rate": 6.408883001362292e-05, + "loss": 0.4906, + "step": 8597 + }, + { + "epoch": 1.5337614842565337, + "grad_norm": 0.5150408148765564, + "learning_rate": 6.404199542269213e-05, + "loss": 0.5248, + "step": 8598 + }, + { + "epoch": 1.533939880474534, + "grad_norm": 0.4958198070526123, + "learning_rate": 6.399517543682278e-05, + "loss": 0.5716, + "step": 8599 + }, + { + "epoch": 1.534118276692534, + "grad_norm": 0.49585381150245667, + "learning_rate": 6.39483700596925e-05, + "loss": 0.6034, + "step": 8600 + }, + { + "epoch": 1.5342966729105343, + "grad_norm": 0.5567946434020996, + "learning_rate": 6.390157929497708e-05, + "loss": 0.6809, + "step": 8601 + }, + { + "epoch": 1.5344750691285345, + "grad_norm": 0.541259765625, + "learning_rate": 6.385480314635162e-05, + "loss": 0.6461, + "step": 8602 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.5411567091941833, + "learning_rate": 6.380804161748982e-05, + "loss": 0.6509, + "step": 8603 + }, + { + "epoch": 1.5348318615645349, + "grad_norm": 0.4732004404067993, + "learning_rate": 6.376129471206422e-05, + "loss": 0.4766, + "step": 8604 + }, + { + "epoch": 1.535010257782535, + "grad_norm": 0.45597603917121887, + "learning_rate": 6.371456243374646e-05, + "loss": 0.5052, + "step": 8605 + }, + { + "epoch": 1.5351886540005353, + "grad_norm": 0.49953046441078186, + "learning_rate": 6.366784478620674e-05, + "loss": 0.6363, + "step": 8606 + }, + { + "epoch": 1.5353670502185355, + "grad_norm": 0.4946024715900421, + "learning_rate": 6.362114177311437e-05, + "loss": 0.5352, + "step": 8607 + }, + { + "epoch": 1.5355454464365357, + "grad_norm": 0.4868670403957367, + "learning_rate": 6.357445339813726e-05, + "loss": 0.4525, + "step": 8608 + }, + { + "epoch": 1.5357238426545359, + "grad_norm": 0.5404412150382996, + "learning_rate": 6.352777966494242e-05, + "loss": 0.6858, + "step": 8609 + }, + { + "epoch": 1.5359022388725359, + "grad_norm": 0.5547458529472351, + "learning_rate": 6.348112057719551e-05, + "loss": 0.6672, + "step": 8610 + }, + { + "epoch": 1.536080635090536, + "grad_norm": 0.5338977575302124, + "learning_rate": 6.343447613856108e-05, + "loss": 0.694, + "step": 8611 + }, + { + "epoch": 1.5362590313085362, + "grad_norm": 0.5286486744880676, + "learning_rate": 6.338784635270264e-05, + "loss": 0.6343, + "step": 8612 + }, + { + "epoch": 1.5364374275265364, + "grad_norm": 0.49329471588134766, + "learning_rate": 6.334123122328239e-05, + "loss": 0.5588, + "step": 8613 + }, + { + "epoch": 1.5366158237445366, + "grad_norm": 0.5088777542114258, + "learning_rate": 6.32946307539616e-05, + "loss": 0.5553, + "step": 8614 + }, + { + "epoch": 1.5367942199625368, + "grad_norm": 0.627946674823761, + "learning_rate": 6.324804494840008e-05, + "loss": 0.4839, + "step": 8615 + }, + { + "epoch": 1.5369726161805368, + "grad_norm": 0.5603138208389282, + "learning_rate": 6.320147381025673e-05, + "loss": 0.7505, + "step": 8616 + }, + { + "epoch": 1.537151012398537, + "grad_norm": 0.4293627440929413, + "learning_rate": 6.315491734318934e-05, + "loss": 0.3965, + "step": 8617 + }, + { + "epoch": 1.5373294086165372, + "grad_norm": 0.5595969557762146, + "learning_rate": 6.310837555085424e-05, + "loss": 0.716, + "step": 8618 + }, + { + "epoch": 1.5375078048345374, + "grad_norm": 0.4659646153450012, + "learning_rate": 6.306184843690699e-05, + "loss": 0.4463, + "step": 8619 + }, + { + "epoch": 1.5376862010525376, + "grad_norm": 0.5158681869506836, + "learning_rate": 6.301533600500165e-05, + "loss": 0.5118, + "step": 8620 + }, + { + "epoch": 1.5378645972705378, + "grad_norm": 0.4700511693954468, + "learning_rate": 6.296883825879141e-05, + "loss": 0.4758, + "step": 8621 + }, + { + "epoch": 1.538042993488538, + "grad_norm": 0.5160353183746338, + "learning_rate": 6.292235520192807e-05, + "loss": 0.6518, + "step": 8622 + }, + { + "epoch": 1.5382213897065382, + "grad_norm": 0.4958648085594177, + "learning_rate": 6.28758868380625e-05, + "loss": 0.4795, + "step": 8623 + }, + { + "epoch": 1.5383997859245384, + "grad_norm": 0.42343762516975403, + "learning_rate": 6.282943317084428e-05, + "loss": 0.3622, + "step": 8624 + }, + { + "epoch": 1.5385781821425386, + "grad_norm": 0.5087177753448486, + "learning_rate": 6.278299420392173e-05, + "loss": 0.4692, + "step": 8625 + }, + { + "epoch": 1.5387565783605388, + "grad_norm": 0.44395479559898376, + "learning_rate": 6.273656994094232e-05, + "loss": 0.4136, + "step": 8626 + }, + { + "epoch": 1.538934974578539, + "grad_norm": 0.5368765592575073, + "learning_rate": 6.269016038555206e-05, + "loss": 0.648, + "step": 8627 + }, + { + "epoch": 1.5391133707965392, + "grad_norm": 0.6212941408157349, + "learning_rate": 6.264376554139608e-05, + "loss": 0.7711, + "step": 8628 + }, + { + "epoch": 1.5392917670145394, + "grad_norm": 0.5054884552955627, + "learning_rate": 6.259738541211804e-05, + "loss": 0.5336, + "step": 8629 + }, + { + "epoch": 1.5394701632325396, + "grad_norm": 0.5286135673522949, + "learning_rate": 6.255102000136073e-05, + "loss": 0.5894, + "step": 8630 + }, + { + "epoch": 1.5396485594505398, + "grad_norm": 0.581322193145752, + "learning_rate": 6.250466931276569e-05, + "loss": 0.732, + "step": 8631 + }, + { + "epoch": 1.5398269556685398, + "grad_norm": 0.492714524269104, + "learning_rate": 6.245833334997317e-05, + "loss": 0.583, + "step": 8632 + }, + { + "epoch": 1.54000535188654, + "grad_norm": 0.49149906635284424, + "learning_rate": 6.241201211662254e-05, + "loss": 0.496, + "step": 8633 + }, + { + "epoch": 1.5401837481045402, + "grad_norm": 0.45817720890045166, + "learning_rate": 6.236570561635163e-05, + "loss": 0.5408, + "step": 8634 + }, + { + "epoch": 1.5403621443225404, + "grad_norm": 0.5533788204193115, + "learning_rate": 6.231941385279757e-05, + "loss": 0.6682, + "step": 8635 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.5434373021125793, + "learning_rate": 6.227313682959596e-05, + "loss": 0.6231, + "step": 8636 + }, + { + "epoch": 1.5407189367585408, + "grad_norm": 0.45322054624557495, + "learning_rate": 6.222687455038134e-05, + "loss": 0.3839, + "step": 8637 + }, + { + "epoch": 1.5408973329765407, + "grad_norm": 0.5391322374343872, + "learning_rate": 6.218062701878724e-05, + "loss": 0.4676, + "step": 8638 + }, + { + "epoch": 1.541075729194541, + "grad_norm": 0.4535292685031891, + "learning_rate": 6.213439423844583e-05, + "loss": 0.4573, + "step": 8639 + }, + { + "epoch": 1.5412541254125411, + "grad_norm": 0.46426570415496826, + "learning_rate": 6.208817621298829e-05, + "loss": 0.4326, + "step": 8640 + }, + { + "epoch": 1.5414325216305413, + "grad_norm": 0.6217679977416992, + "learning_rate": 6.204197294604446e-05, + "loss": 0.6326, + "step": 8641 + }, + { + "epoch": 1.5416109178485415, + "grad_norm": 0.4808856248855591, + "learning_rate": 6.199578444124329e-05, + "loss": 0.4614, + "step": 8642 + }, + { + "epoch": 1.5417893140665417, + "grad_norm": 0.5089523792266846, + "learning_rate": 6.194961070221219e-05, + "loss": 0.5047, + "step": 8643 + }, + { + "epoch": 1.541967710284542, + "grad_norm": 0.43264392018318176, + "learning_rate": 6.190345173257786e-05, + "loss": 0.3809, + "step": 8644 + }, + { + "epoch": 1.5421461065025421, + "grad_norm": 0.450720876455307, + "learning_rate": 6.185730753596539e-05, + "loss": 0.4279, + "step": 8645 + }, + { + "epoch": 1.5423245027205423, + "grad_norm": 0.4698851704597473, + "learning_rate": 6.181117811599901e-05, + "loss": 0.4536, + "step": 8646 + }, + { + "epoch": 1.5425028989385425, + "grad_norm": 0.46005532145500183, + "learning_rate": 6.176506347630181e-05, + "loss": 0.4388, + "step": 8647 + }, + { + "epoch": 1.5426812951565427, + "grad_norm": 0.5724323391914368, + "learning_rate": 6.171896362049542e-05, + "loss": 0.6823, + "step": 8648 + }, + { + "epoch": 1.542859691374543, + "grad_norm": 0.47760656476020813, + "learning_rate": 6.167287855220072e-05, + "loss": 0.4862, + "step": 8649 + }, + { + "epoch": 1.5430380875925431, + "grad_norm": 0.533450186252594, + "learning_rate": 6.162680827503705e-05, + "loss": 0.606, + "step": 8650 + }, + { + "epoch": 1.5432164838105433, + "grad_norm": 0.5100364089012146, + "learning_rate": 6.158075279262273e-05, + "loss": 0.5607, + "step": 8651 + }, + { + "epoch": 1.5433948800285435, + "grad_norm": 0.5601518154144287, + "learning_rate": 6.153471210857511e-05, + "loss": 0.6058, + "step": 8652 + }, + { + "epoch": 1.5435732762465437, + "grad_norm": 0.4649224579334259, + "learning_rate": 6.148868622650999e-05, + "loss": 0.461, + "step": 8653 + }, + { + "epoch": 1.5437516724645437, + "grad_norm": 0.510765016078949, + "learning_rate": 6.144267515004243e-05, + "loss": 0.5787, + "step": 8654 + }, + { + "epoch": 1.543930068682544, + "grad_norm": 0.5434867143630981, + "learning_rate": 6.139667888278594e-05, + "loss": 0.5901, + "step": 8655 + }, + { + "epoch": 1.544108464900544, + "grad_norm": 0.5194826722145081, + "learning_rate": 6.13506974283532e-05, + "loss": 0.5809, + "step": 8656 + }, + { + "epoch": 1.5442868611185443, + "grad_norm": 0.42716190218925476, + "learning_rate": 6.130473079035548e-05, + "loss": 0.4636, + "step": 8657 + }, + { + "epoch": 1.5444652573365445, + "grad_norm": 0.4728177785873413, + "learning_rate": 6.125877897240295e-05, + "loss": 0.4347, + "step": 8658 + }, + { + "epoch": 1.5446436535545447, + "grad_norm": 0.4928957223892212, + "learning_rate": 6.121284197810476e-05, + "loss": 0.5965, + "step": 8659 + }, + { + "epoch": 1.5448220497725447, + "grad_norm": 0.5190236568450928, + "learning_rate": 6.116691981106868e-05, + "loss": 0.5734, + "step": 8660 + }, + { + "epoch": 1.5450004459905449, + "grad_norm": 0.4811258018016815, + "learning_rate": 6.11210124749014e-05, + "loss": 0.5823, + "step": 8661 + }, + { + "epoch": 1.545178842208545, + "grad_norm": 0.5223371982574463, + "learning_rate": 6.107511997320863e-05, + "loss": 0.52, + "step": 8662 + }, + { + "epoch": 1.5453572384265453, + "grad_norm": 0.5431803464889526, + "learning_rate": 6.102924230959456e-05, + "loss": 0.6178, + "step": 8663 + }, + { + "epoch": 1.5455356346445455, + "grad_norm": 0.5262665152549744, + "learning_rate": 6.0983379487662555e-05, + "loss": 0.5756, + "step": 8664 + }, + { + "epoch": 1.5457140308625457, + "grad_norm": 0.5077388882637024, + "learning_rate": 6.09375315110145e-05, + "loss": 0.5741, + "step": 8665 + }, + { + "epoch": 1.5458924270805459, + "grad_norm": 0.5087509155273438, + "learning_rate": 6.089169838325143e-05, + "loss": 0.5392, + "step": 8666 + }, + { + "epoch": 1.546070823298546, + "grad_norm": 0.4039164185523987, + "learning_rate": 6.084588010797293e-05, + "loss": 0.3906, + "step": 8667 + }, + { + "epoch": 1.5462492195165463, + "grad_norm": 0.4832989573478699, + "learning_rate": 6.0800076688777684e-05, + "loss": 0.5393, + "step": 8668 + }, + { + "epoch": 1.5464276157345465, + "grad_norm": 0.4995843470096588, + "learning_rate": 6.0754288129262895e-05, + "loss": 0.6855, + "step": 8669 + }, + { + "epoch": 1.5466060119525467, + "grad_norm": 0.47281551361083984, + "learning_rate": 6.070851443302497e-05, + "loss": 0.489, + "step": 8670 + }, + { + "epoch": 1.5467844081705469, + "grad_norm": 0.4991108477115631, + "learning_rate": 6.066275560365886e-05, + "loss": 0.5685, + "step": 8671 + }, + { + "epoch": 1.546962804388547, + "grad_norm": 0.5392157435417175, + "learning_rate": 6.0617011644758385e-05, + "loss": 0.7395, + "step": 8672 + }, + { + "epoch": 1.5471412006065473, + "grad_norm": 0.6251504421234131, + "learning_rate": 6.057128255991637e-05, + "loss": 0.777, + "step": 8673 + }, + { + "epoch": 1.5473195968245474, + "grad_norm": 0.4212150573730469, + "learning_rate": 6.052556835272424e-05, + "loss": 0.3672, + "step": 8674 + }, + { + "epoch": 1.5474979930425476, + "grad_norm": 0.3978899419307709, + "learning_rate": 6.047986902677252e-05, + "loss": 0.352, + "step": 8675 + }, + { + "epoch": 1.5476763892605476, + "grad_norm": 0.5088796615600586, + "learning_rate": 6.0434184585650256e-05, + "loss": 0.5676, + "step": 8676 + }, + { + "epoch": 1.5478547854785478, + "grad_norm": 0.42953309416770935, + "learning_rate": 6.038851503294554e-05, + "loss": 0.3769, + "step": 8677 + }, + { + "epoch": 1.548033181696548, + "grad_norm": 0.49997836351394653, + "learning_rate": 6.0342860372245344e-05, + "loss": 0.6421, + "step": 8678 + }, + { + "epoch": 1.5482115779145482, + "grad_norm": 0.5206395387649536, + "learning_rate": 6.029722060713519e-05, + "loss": 0.521, + "step": 8679 + }, + { + "epoch": 1.5483899741325484, + "grad_norm": 0.4601840376853943, + "learning_rate": 6.025159574119979e-05, + "loss": 0.5562, + "step": 8680 + }, + { + "epoch": 1.5485683703505486, + "grad_norm": 0.5508368015289307, + "learning_rate": 6.0205985778022305e-05, + "loss": 0.5545, + "step": 8681 + }, + { + "epoch": 1.5487467665685486, + "grad_norm": 0.5534743070602417, + "learning_rate": 6.016039072118512e-05, + "loss": 0.5463, + "step": 8682 + }, + { + "epoch": 1.5489251627865488, + "grad_norm": 0.4522656798362732, + "learning_rate": 6.011481057426915e-05, + "loss": 0.4339, + "step": 8683 + }, + { + "epoch": 1.549103559004549, + "grad_norm": 0.44534674286842346, + "learning_rate": 6.006924534085414e-05, + "loss": 0.4096, + "step": 8684 + }, + { + "epoch": 1.5492819552225492, + "grad_norm": 0.4928872883319855, + "learning_rate": 6.002369502451899e-05, + "loss": 0.5002, + "step": 8685 + }, + { + "epoch": 1.5494603514405494, + "grad_norm": 0.509283185005188, + "learning_rate": 5.997815962884098e-05, + "loss": 0.4871, + "step": 8686 + }, + { + "epoch": 1.5496387476585496, + "grad_norm": 0.5575897097587585, + "learning_rate": 5.993263915739661e-05, + "loss": 0.4546, + "step": 8687 + }, + { + "epoch": 1.5498171438765498, + "grad_norm": 0.49267205595970154, + "learning_rate": 5.988713361376089e-05, + "loss": 0.5014, + "step": 8688 + }, + { + "epoch": 1.54999554009455, + "grad_norm": 0.4820170998573303, + "learning_rate": 5.984164300150796e-05, + "loss": 0.4648, + "step": 8689 + }, + { + "epoch": 1.5501739363125502, + "grad_norm": 0.5354559421539307, + "learning_rate": 5.9796167324210505e-05, + "loss": 0.6559, + "step": 8690 + }, + { + "epoch": 1.5503523325305504, + "grad_norm": 0.5515713095664978, + "learning_rate": 5.975070658544021e-05, + "loss": 0.5613, + "step": 8691 + }, + { + "epoch": 1.5505307287485506, + "grad_norm": 0.5345812439918518, + "learning_rate": 5.9705260788767594e-05, + "loss": 0.6878, + "step": 8692 + }, + { + "epoch": 1.5507091249665508, + "grad_norm": 0.48849523067474365, + "learning_rate": 5.9659829937761865e-05, + "loss": 0.4398, + "step": 8693 + }, + { + "epoch": 1.550887521184551, + "grad_norm": 0.5398678183555603, + "learning_rate": 5.9614414035991244e-05, + "loss": 0.6064, + "step": 8694 + }, + { + "epoch": 1.5510659174025512, + "grad_norm": 0.47150692343711853, + "learning_rate": 5.9569013087022614e-05, + "loss": 0.5028, + "step": 8695 + }, + { + "epoch": 1.5512443136205514, + "grad_norm": 0.4753498435020447, + "learning_rate": 5.9523627094421664e-05, + "loss": 0.4533, + "step": 8696 + }, + { + "epoch": 1.5514227098385516, + "grad_norm": 0.5937864184379578, + "learning_rate": 5.947825606175317e-05, + "loss": 0.6236, + "step": 8697 + }, + { + "epoch": 1.5516011060565515, + "grad_norm": 0.5065819025039673, + "learning_rate": 5.943289999258036e-05, + "loss": 0.5351, + "step": 8698 + }, + { + "epoch": 1.5517795022745517, + "grad_norm": 0.5110275149345398, + "learning_rate": 5.938755889046565e-05, + "loss": 0.5605, + "step": 8699 + }, + { + "epoch": 1.551957898492552, + "grad_norm": 0.5337404608726501, + "learning_rate": 5.934223275896999e-05, + "loss": 0.6949, + "step": 8700 + }, + { + "epoch": 1.5521362947105521, + "grad_norm": 0.5709247589111328, + "learning_rate": 5.92969216016534e-05, + "loss": 0.7602, + "step": 8701 + }, + { + "epoch": 1.5523146909285523, + "grad_norm": 0.4845641255378723, + "learning_rate": 5.9251625422074406e-05, + "loss": 0.4694, + "step": 8702 + }, + { + "epoch": 1.5524930871465525, + "grad_norm": 0.5533928871154785, + "learning_rate": 5.920634422379079e-05, + "loss": 0.6047, + "step": 8703 + }, + { + "epoch": 1.5526714833645525, + "grad_norm": 0.5261991024017334, + "learning_rate": 5.916107801035875e-05, + "loss": 0.6173, + "step": 8704 + }, + { + "epoch": 1.5528498795825527, + "grad_norm": 0.5172926783561707, + "learning_rate": 5.9115826785333473e-05, + "loss": 0.5984, + "step": 8705 + }, + { + "epoch": 1.553028275800553, + "grad_norm": 0.47893017530441284, + "learning_rate": 5.907059055226907e-05, + "loss": 0.4444, + "step": 8706 + }, + { + "epoch": 1.5532066720185531, + "grad_norm": 0.5027623772621155, + "learning_rate": 5.902536931471819e-05, + "loss": 0.4835, + "step": 8707 + }, + { + "epoch": 1.5533850682365533, + "grad_norm": 0.566230297088623, + "learning_rate": 5.898016307623275e-05, + "loss": 0.5732, + "step": 8708 + }, + { + "epoch": 1.5535634644545535, + "grad_norm": 0.49636390805244446, + "learning_rate": 5.8934971840363116e-05, + "loss": 0.5499, + "step": 8709 + }, + { + "epoch": 1.5537418606725537, + "grad_norm": 0.5405829548835754, + "learning_rate": 5.888979561065849e-05, + "loss": 0.5984, + "step": 8710 + }, + { + "epoch": 1.553920256890554, + "grad_norm": 0.4457230865955353, + "learning_rate": 5.8844634390667176e-05, + "loss": 0.4734, + "step": 8711 + }, + { + "epoch": 1.554098653108554, + "grad_norm": 0.5478752851486206, + "learning_rate": 5.879948818393591e-05, + "loss": 0.5332, + "step": 8712 + }, + { + "epoch": 1.5542770493265543, + "grad_norm": 0.49561017751693726, + "learning_rate": 5.8754356994010634e-05, + "loss": 0.5383, + "step": 8713 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 0.571239709854126, + "learning_rate": 5.8709240824435795e-05, + "loss": 0.4595, + "step": 8714 + }, + { + "epoch": 1.5546338417625547, + "grad_norm": 0.48616328835487366, + "learning_rate": 5.8664139678754944e-05, + "loss": 0.5232, + "step": 8715 + }, + { + "epoch": 1.554812237980555, + "grad_norm": 0.5562140345573425, + "learning_rate": 5.861905356051023e-05, + "loss": 0.6471, + "step": 8716 + }, + { + "epoch": 1.554990634198555, + "grad_norm": 0.516631007194519, + "learning_rate": 5.857398247324261e-05, + "loss": 0.572, + "step": 8717 + }, + { + "epoch": 1.5551690304165553, + "grad_norm": 0.4869072139263153, + "learning_rate": 5.85289264204921e-05, + "loss": 0.5075, + "step": 8718 + }, + { + "epoch": 1.5553474266345555, + "grad_norm": 0.5700942873954773, + "learning_rate": 5.8483885405797246e-05, + "loss": 0.6769, + "step": 8719 + }, + { + "epoch": 1.5555258228525555, + "grad_norm": 1.086318850517273, + "learning_rate": 5.843885943269567e-05, + "loss": 0.4651, + "step": 8720 + }, + { + "epoch": 1.5557042190705557, + "grad_norm": 0.45219793915748596, + "learning_rate": 5.83938485047236e-05, + "loss": 0.4308, + "step": 8721 + }, + { + "epoch": 1.5558826152885559, + "grad_norm": 0.5690205097198486, + "learning_rate": 5.834885262541617e-05, + "loss": 0.628, + "step": 8722 + }, + { + "epoch": 1.556061011506556, + "grad_norm": 0.5013719797134399, + "learning_rate": 5.830387179830748e-05, + "loss": 0.5243, + "step": 8723 + }, + { + "epoch": 1.5562394077245563, + "grad_norm": 0.4831756055355072, + "learning_rate": 5.825890602693013e-05, + "loss": 0.4328, + "step": 8724 + }, + { + "epoch": 1.5564178039425565, + "grad_norm": 0.511838436126709, + "learning_rate": 5.8213955314815853e-05, + "loss": 0.4934, + "step": 8725 + }, + { + "epoch": 1.5565962001605564, + "grad_norm": 0.5139269232749939, + "learning_rate": 5.816901966549495e-05, + "loss": 0.4621, + "step": 8726 + }, + { + "epoch": 1.5567745963785566, + "grad_norm": 0.4796489477157593, + "learning_rate": 5.8124099082496745e-05, + "loss": 0.5047, + "step": 8727 + }, + { + "epoch": 1.5569529925965568, + "grad_norm": 0.47193264961242676, + "learning_rate": 5.807919356934915e-05, + "loss": 0.467, + "step": 8728 + }, + { + "epoch": 1.557131388814557, + "grad_norm": 0.570898711681366, + "learning_rate": 5.8034303129579164e-05, + "loss": 0.7563, + "step": 8729 + }, + { + "epoch": 1.5573097850325572, + "grad_norm": 0.4879952073097229, + "learning_rate": 5.798942776671243e-05, + "loss": 0.4973, + "step": 8730 + }, + { + "epoch": 1.5574881812505574, + "grad_norm": 0.44711223244667053, + "learning_rate": 5.794456748427332e-05, + "loss": 0.3708, + "step": 8731 + }, + { + "epoch": 1.5576665774685576, + "grad_norm": 0.6115127801895142, + "learning_rate": 5.78997222857853e-05, + "loss": 0.6516, + "step": 8732 + }, + { + "epoch": 1.5578449736865578, + "grad_norm": 0.5244799852371216, + "learning_rate": 5.785489217477036e-05, + "loss": 0.507, + "step": 8733 + }, + { + "epoch": 1.558023369904558, + "grad_norm": 0.5613646507263184, + "learning_rate": 5.7810077154749566e-05, + "loss": 0.6944, + "step": 8734 + }, + { + "epoch": 1.5582017661225582, + "grad_norm": 0.49775010347366333, + "learning_rate": 5.7765277229242546e-05, + "loss": 0.4435, + "step": 8735 + }, + { + "epoch": 1.5583801623405584, + "grad_norm": 0.46436411142349243, + "learning_rate": 5.772049240176799e-05, + "loss": 0.4435, + "step": 8736 + }, + { + "epoch": 1.5585585585585586, + "grad_norm": 0.5458922982215881, + "learning_rate": 5.7675722675843144e-05, + "loss": 0.5705, + "step": 8737 + }, + { + "epoch": 1.5587369547765588, + "grad_norm": 0.5422109365463257, + "learning_rate": 5.763096805498427e-05, + "loss": 0.6135, + "step": 8738 + }, + { + "epoch": 1.558915350994559, + "grad_norm": 0.574105441570282, + "learning_rate": 5.758622854270648e-05, + "loss": 0.7073, + "step": 8739 + }, + { + "epoch": 1.5590937472125592, + "grad_norm": 0.5504136085510254, + "learning_rate": 5.7541504142523406e-05, + "loss": 0.6011, + "step": 8740 + }, + { + "epoch": 1.5592721434305594, + "grad_norm": 0.5172324180603027, + "learning_rate": 5.7496794857947846e-05, + "loss": 0.4303, + "step": 8741 + }, + { + "epoch": 1.5594505396485594, + "grad_norm": 0.5464860200881958, + "learning_rate": 5.745210069249118e-05, + "loss": 0.6365, + "step": 8742 + }, + { + "epoch": 1.5596289358665596, + "grad_norm": 0.47427821159362793, + "learning_rate": 5.740742164966362e-05, + "loss": 0.4361, + "step": 8743 + }, + { + "epoch": 1.5598073320845598, + "grad_norm": 1.116687536239624, + "learning_rate": 5.736275773297431e-05, + "loss": 0.699, + "step": 8744 + }, + { + "epoch": 1.55998572830256, + "grad_norm": 0.4698363244533539, + "learning_rate": 5.731810894593106e-05, + "loss": 0.5028, + "step": 8745 + }, + { + "epoch": 1.5601641245205602, + "grad_norm": 0.47105473279953003, + "learning_rate": 5.727347529204069e-05, + "loss": 0.5059, + "step": 8746 + }, + { + "epoch": 1.5603425207385604, + "grad_norm": 0.5197162628173828, + "learning_rate": 5.722885677480857e-05, + "loss": 0.5229, + "step": 8747 + }, + { + "epoch": 1.5605209169565604, + "grad_norm": 0.5358749032020569, + "learning_rate": 5.718425339773914e-05, + "loss": 0.5431, + "step": 8748 + }, + { + "epoch": 1.5606993131745606, + "grad_norm": 0.6025269627571106, + "learning_rate": 5.713966516433541e-05, + "loss": 0.791, + "step": 8749 + }, + { + "epoch": 1.5608777093925608, + "grad_norm": 0.502884030342102, + "learning_rate": 5.709509207809946e-05, + "loss": 0.5499, + "step": 8750 + }, + { + "epoch": 1.561056105610561, + "grad_norm": 0.49932992458343506, + "learning_rate": 5.705053414253195e-05, + "loss": 0.5878, + "step": 8751 + }, + { + "epoch": 1.5612345018285612, + "grad_norm": 0.4696747362613678, + "learning_rate": 5.700599136113238e-05, + "loss": 0.4294, + "step": 8752 + }, + { + "epoch": 1.5614128980465614, + "grad_norm": 0.4963851273059845, + "learning_rate": 5.6961463737399215e-05, + "loss": 0.5904, + "step": 8753 + }, + { + "epoch": 1.5615912942645616, + "grad_norm": 0.542914867401123, + "learning_rate": 5.691695127482968e-05, + "loss": 0.7059, + "step": 8754 + }, + { + "epoch": 1.5617696904825618, + "grad_norm": 0.44281649589538574, + "learning_rate": 5.687245397691962e-05, + "loss": 0.416, + "step": 8755 + }, + { + "epoch": 1.561948086700562, + "grad_norm": 0.48408737778663635, + "learning_rate": 5.682797184716401e-05, + "loss": 0.5626, + "step": 8756 + }, + { + "epoch": 1.5621264829185622, + "grad_norm": 0.4773915410041809, + "learning_rate": 5.6783504889056285e-05, + "loss": 0.5477, + "step": 8757 + }, + { + "epoch": 1.5623048791365624, + "grad_norm": 0.5725618600845337, + "learning_rate": 5.6739053106088984e-05, + "loss": 0.5617, + "step": 8758 + }, + { + "epoch": 1.5624832753545626, + "grad_norm": 0.4656837582588196, + "learning_rate": 5.6694616501753256e-05, + "loss": 0.4798, + "step": 8759 + }, + { + "epoch": 1.5626616715725627, + "grad_norm": 0.4770435094833374, + "learning_rate": 5.6650195079539194e-05, + "loss": 0.5398, + "step": 8760 + }, + { + "epoch": 1.562840067790563, + "grad_norm": 0.47299638390541077, + "learning_rate": 5.6605788842935544e-05, + "loss": 0.471, + "step": 8761 + }, + { + "epoch": 1.5630184640085631, + "grad_norm": 0.4295462369918823, + "learning_rate": 5.6561397795430096e-05, + "loss": 0.4549, + "step": 8762 + }, + { + "epoch": 1.5631968602265633, + "grad_norm": 0.5424062609672546, + "learning_rate": 5.6517021940509225e-05, + "loss": 0.649, + "step": 8763 + }, + { + "epoch": 1.5633752564445633, + "grad_norm": 0.4859166145324707, + "learning_rate": 5.6472661281658125e-05, + "loss": 0.5242, + "step": 8764 + }, + { + "epoch": 1.5635536526625635, + "grad_norm": 0.5274991393089294, + "learning_rate": 5.642831582236096e-05, + "loss": 0.5842, + "step": 8765 + }, + { + "epoch": 1.5637320488805637, + "grad_norm": 0.5063260197639465, + "learning_rate": 5.6383985566100525e-05, + "loss": 0.4953, + "step": 8766 + }, + { + "epoch": 1.563910445098564, + "grad_norm": 0.5385729074478149, + "learning_rate": 5.6339670516358633e-05, + "loss": 0.6145, + "step": 8767 + }, + { + "epoch": 1.5640888413165641, + "grad_norm": 0.429574579000473, + "learning_rate": 5.6295370676615584e-05, + "loss": 0.3678, + "step": 8768 + }, + { + "epoch": 1.5642672375345643, + "grad_norm": 0.45450228452682495, + "learning_rate": 5.625108605035076e-05, + "loss": 0.4379, + "step": 8769 + }, + { + "epoch": 1.5644456337525643, + "grad_norm": 0.5811430215835571, + "learning_rate": 5.620681664104235e-05, + "loss": 0.6731, + "step": 8770 + }, + { + "epoch": 1.5646240299705645, + "grad_norm": 0.5694655179977417, + "learning_rate": 5.6162562452167085e-05, + "loss": 0.6028, + "step": 8771 + }, + { + "epoch": 1.5648024261885647, + "grad_norm": 0.5351256728172302, + "learning_rate": 5.6118323487200806e-05, + "loss": 0.4978, + "step": 8772 + }, + { + "epoch": 1.5649808224065649, + "grad_norm": 0.43898749351501465, + "learning_rate": 5.6074099749617914e-05, + "loss": 0.4938, + "step": 8773 + }, + { + "epoch": 1.565159218624565, + "grad_norm": 0.5240178108215332, + "learning_rate": 5.602989124289185e-05, + "loss": 0.4156, + "step": 8774 + }, + { + "epoch": 1.5653376148425653, + "grad_norm": 0.4986781179904938, + "learning_rate": 5.598569797049466e-05, + "loss": 0.4061, + "step": 8775 + }, + { + "epoch": 1.5655160110605655, + "grad_norm": 0.6685777306556702, + "learning_rate": 5.5941519935897164e-05, + "loss": 0.8649, + "step": 8776 + }, + { + "epoch": 1.5656944072785657, + "grad_norm": 0.46365004777908325, + "learning_rate": 5.589735714256927e-05, + "loss": 0.4269, + "step": 8777 + }, + { + "epoch": 1.5658728034965659, + "grad_norm": 0.5603156089782715, + "learning_rate": 5.5853209593979354e-05, + "loss": 0.7361, + "step": 8778 + }, + { + "epoch": 1.566051199714566, + "grad_norm": 0.5568613409996033, + "learning_rate": 5.580907729359486e-05, + "loss": 0.5341, + "step": 8779 + }, + { + "epoch": 1.5662295959325663, + "grad_norm": 0.5259753465652466, + "learning_rate": 5.5764960244881815e-05, + "loss": 0.511, + "step": 8780 + }, + { + "epoch": 1.5664079921505665, + "grad_norm": 0.5009199976921082, + "learning_rate": 5.5720858451305255e-05, + "loss": 0.5754, + "step": 8781 + }, + { + "epoch": 1.5665863883685667, + "grad_norm": 0.48167309165000916, + "learning_rate": 5.567677191632883e-05, + "loss": 0.4789, + "step": 8782 + }, + { + "epoch": 1.5667647845865669, + "grad_norm": 0.5015170574188232, + "learning_rate": 5.563270064341508e-05, + "loss": 0.5136, + "step": 8783 + }, + { + "epoch": 1.566943180804567, + "grad_norm": 0.5342589616775513, + "learning_rate": 5.558864463602548e-05, + "loss": 0.6359, + "step": 8784 + }, + { + "epoch": 1.5671215770225673, + "grad_norm": 0.5268881916999817, + "learning_rate": 5.5544603897619976e-05, + "loss": 0.5012, + "step": 8785 + }, + { + "epoch": 1.5672999732405672, + "grad_norm": 0.5006667375564575, + "learning_rate": 5.5500578431657675e-05, + "loss": 0.5217, + "step": 8786 + }, + { + "epoch": 1.5674783694585674, + "grad_norm": 0.49677345156669617, + "learning_rate": 5.545656824159617e-05, + "loss": 0.5327, + "step": 8787 + }, + { + "epoch": 1.5676567656765676, + "grad_norm": 0.5768446326255798, + "learning_rate": 5.5412573330892165e-05, + "loss": 0.6573, + "step": 8788 + }, + { + "epoch": 1.5678351618945678, + "grad_norm": 0.48240020871162415, + "learning_rate": 5.53685937030009e-05, + "loss": 0.5009, + "step": 8789 + }, + { + "epoch": 1.568013558112568, + "grad_norm": 0.4898053705692291, + "learning_rate": 5.532462936137647e-05, + "loss": 0.5342, + "step": 8790 + }, + { + "epoch": 1.5681919543305682, + "grad_norm": 0.5353338122367859, + "learning_rate": 5.528068030947192e-05, + "loss": 0.5924, + "step": 8791 + }, + { + "epoch": 1.5683703505485682, + "grad_norm": 0.47058504819869995, + "learning_rate": 5.52367465507389e-05, + "loss": 0.4101, + "step": 8792 + }, + { + "epoch": 1.5685487467665684, + "grad_norm": 0.47491228580474854, + "learning_rate": 5.519282808862805e-05, + "loss": 0.4776, + "step": 8793 + }, + { + "epoch": 1.5687271429845686, + "grad_norm": 0.47631198167800903, + "learning_rate": 5.5148924926588574e-05, + "loss": 0.4497, + "step": 8794 + }, + { + "epoch": 1.5689055392025688, + "grad_norm": 0.4845799207687378, + "learning_rate": 5.510503706806877e-05, + "loss": 0.5219, + "step": 8795 + }, + { + "epoch": 1.569083935420569, + "grad_norm": 0.5517118573188782, + "learning_rate": 5.506116451651547e-05, + "loss": 0.5573, + "step": 8796 + }, + { + "epoch": 1.5692623316385692, + "grad_norm": 0.529208242893219, + "learning_rate": 5.501730727537435e-05, + "loss": 0.5191, + "step": 8797 + }, + { + "epoch": 1.5694407278565694, + "grad_norm": 0.48718565702438354, + "learning_rate": 5.497346534809011e-05, + "loss": 0.4791, + "step": 8798 + }, + { + "epoch": 1.5696191240745696, + "grad_norm": 0.5780670642852783, + "learning_rate": 5.4929638738105805e-05, + "loss": 0.5571, + "step": 8799 + }, + { + "epoch": 1.5697975202925698, + "grad_norm": 0.5497144460678101, + "learning_rate": 5.488582744886386e-05, + "loss": 0.6058, + "step": 8800 + }, + { + "epoch": 1.56997591651057, + "grad_norm": 0.5026035904884338, + "learning_rate": 5.484203148380509e-05, + "loss": 0.436, + "step": 8801 + }, + { + "epoch": 1.5701543127285702, + "grad_norm": 0.4526662528514862, + "learning_rate": 5.479825084636911e-05, + "loss": 0.4857, + "step": 8802 + }, + { + "epoch": 1.5703327089465704, + "grad_norm": 0.5049575567245483, + "learning_rate": 5.475448553999454e-05, + "loss": 0.5069, + "step": 8803 + }, + { + "epoch": 1.5705111051645706, + "grad_norm": 0.503604531288147, + "learning_rate": 5.471073556811862e-05, + "loss": 0.5236, + "step": 8804 + }, + { + "epoch": 1.5706895013825708, + "grad_norm": 0.5270532369613647, + "learning_rate": 5.4667000934177564e-05, + "loss": 0.5193, + "step": 8805 + }, + { + "epoch": 1.570867897600571, + "grad_norm": 0.49302002787590027, + "learning_rate": 5.4623281641606096e-05, + "loss": 0.4832, + "step": 8806 + }, + { + "epoch": 1.5710462938185712, + "grad_norm": 0.4588510990142822, + "learning_rate": 5.457957769383812e-05, + "loss": 0.4937, + "step": 8807 + }, + { + "epoch": 1.5712246900365712, + "grad_norm": 0.4889686703681946, + "learning_rate": 5.453588909430593e-05, + "loss": 0.4553, + "step": 8808 + }, + { + "epoch": 1.5714030862545714, + "grad_norm": 0.536016047000885, + "learning_rate": 5.4492215846440953e-05, + "loss": 0.6425, + "step": 8809 + }, + { + "epoch": 1.5715814824725716, + "grad_norm": 0.4554092288017273, + "learning_rate": 5.4448557953673204e-05, + "loss": 0.434, + "step": 8810 + }, + { + "epoch": 1.5717598786905718, + "grad_norm": 0.4832884669303894, + "learning_rate": 5.440491541943152e-05, + "loss": 0.4417, + "step": 8811 + }, + { + "epoch": 1.571938274908572, + "grad_norm": 0.5221408009529114, + "learning_rate": 5.4361288247143646e-05, + "loss": 0.5256, + "step": 8812 + }, + { + "epoch": 1.5721166711265722, + "grad_norm": 0.4202393889427185, + "learning_rate": 5.4317676440235967e-05, + "loss": 0.3462, + "step": 8813 + }, + { + "epoch": 1.5722950673445721, + "grad_norm": 0.4951157867908478, + "learning_rate": 5.427408000213374e-05, + "loss": 0.4871, + "step": 8814 + }, + { + "epoch": 1.5724734635625723, + "grad_norm": 0.4902489483356476, + "learning_rate": 5.423049893626114e-05, + "loss": 0.5418, + "step": 8815 + }, + { + "epoch": 1.5726518597805725, + "grad_norm": 0.5259934067726135, + "learning_rate": 5.418693324604082e-05, + "loss": 0.5426, + "step": 8816 + }, + { + "epoch": 1.5728302559985727, + "grad_norm": 0.5680667757987976, + "learning_rate": 5.414338293489457e-05, + "loss": 0.6104, + "step": 8817 + }, + { + "epoch": 1.573008652216573, + "grad_norm": 0.5012747645378113, + "learning_rate": 5.409984800624265e-05, + "loss": 0.5192, + "step": 8818 + }, + { + "epoch": 1.5731870484345731, + "grad_norm": 0.48108309507369995, + "learning_rate": 5.4056328463504475e-05, + "loss": 0.5373, + "step": 8819 + }, + { + "epoch": 1.5733654446525733, + "grad_norm": 0.5479810237884521, + "learning_rate": 5.401282431009785e-05, + "loss": 0.6286, + "step": 8820 + }, + { + "epoch": 1.5735438408705735, + "grad_norm": 0.5330629944801331, + "learning_rate": 5.3969335549439726e-05, + "loss": 0.5358, + "step": 8821 + }, + { + "epoch": 1.5737222370885737, + "grad_norm": 0.5190926194190979, + "learning_rate": 5.392586218494563e-05, + "loss": 0.6252, + "step": 8822 + }, + { + "epoch": 1.573900633306574, + "grad_norm": 0.5403081178665161, + "learning_rate": 5.388240422002991e-05, + "loss": 0.5921, + "step": 8823 + }, + { + "epoch": 1.5740790295245741, + "grad_norm": 0.5377727746963501, + "learning_rate": 5.383896165810578e-05, + "loss": 0.4856, + "step": 8824 + }, + { + "epoch": 1.5742574257425743, + "grad_norm": 0.513603925704956, + "learning_rate": 5.3795534502585146e-05, + "loss": 0.5591, + "step": 8825 + }, + { + "epoch": 1.5744358219605745, + "grad_norm": 0.5131911039352417, + "learning_rate": 5.375212275687888e-05, + "loss": 0.4422, + "step": 8826 + }, + { + "epoch": 1.5746142181785747, + "grad_norm": 0.519425630569458, + "learning_rate": 5.3708726424396365e-05, + "loss": 0.5824, + "step": 8827 + }, + { + "epoch": 1.574792614396575, + "grad_norm": 0.47717246413230896, + "learning_rate": 5.366534550854607e-05, + "loss": 0.4575, + "step": 8828 + }, + { + "epoch": 1.5749710106145751, + "grad_norm": 0.563754677772522, + "learning_rate": 5.3621980012734965e-05, + "loss": 0.5664, + "step": 8829 + }, + { + "epoch": 1.575149406832575, + "grad_norm": 0.5223338603973389, + "learning_rate": 5.357862994036905e-05, + "loss": 0.498, + "step": 8830 + }, + { + "epoch": 1.5753278030505753, + "grad_norm": 0.5984678268432617, + "learning_rate": 5.35352952948531e-05, + "loss": 0.6791, + "step": 8831 + }, + { + "epoch": 1.5755061992685755, + "grad_norm": 0.554014265537262, + "learning_rate": 5.349197607959042e-05, + "loss": 0.6039, + "step": 8832 + }, + { + "epoch": 1.5756845954865757, + "grad_norm": 0.561259388923645, + "learning_rate": 5.3448672297983445e-05, + "loss": 0.5535, + "step": 8833 + }, + { + "epoch": 1.5758629917045759, + "grad_norm": 0.49919480085372925, + "learning_rate": 5.3405383953433196e-05, + "loss": 0.4131, + "step": 8834 + }, + { + "epoch": 1.576041387922576, + "grad_norm": 0.5846698880195618, + "learning_rate": 5.336211104933938e-05, + "loss": 0.6158, + "step": 8835 + }, + { + "epoch": 1.576219784140576, + "grad_norm": 0.5364250540733337, + "learning_rate": 5.3318853589100824e-05, + "loss": 0.6105, + "step": 8836 + }, + { + "epoch": 1.5763981803585763, + "grad_norm": 0.5680550932884216, + "learning_rate": 5.3275611576114825e-05, + "loss": 0.7472, + "step": 8837 + }, + { + "epoch": 1.5765765765765765, + "grad_norm": 0.4823981821537018, + "learning_rate": 5.323238501377767e-05, + "loss": 0.4895, + "step": 8838 + }, + { + "epoch": 1.5767549727945767, + "grad_norm": 0.6050514578819275, + "learning_rate": 5.318917390548428e-05, + "loss": 0.7538, + "step": 8839 + }, + { + "epoch": 1.5769333690125769, + "grad_norm": 0.4770009517669678, + "learning_rate": 5.314597825462852e-05, + "loss": 0.4793, + "step": 8840 + }, + { + "epoch": 1.577111765230577, + "grad_norm": 0.4747209846973419, + "learning_rate": 5.310279806460286e-05, + "loss": 0.4359, + "step": 8841 + }, + { + "epoch": 1.5772901614485773, + "grad_norm": 0.44378408789634705, + "learning_rate": 5.305963333879879e-05, + "loss": 0.3823, + "step": 8842 + }, + { + "epoch": 1.5774685576665775, + "grad_norm": 0.4758981466293335, + "learning_rate": 5.301648408060633e-05, + "loss": 0.6459, + "step": 8843 + }, + { + "epoch": 1.5776469538845777, + "grad_norm": 0.5262596011161804, + "learning_rate": 5.297335029341433e-05, + "loss": 0.5195, + "step": 8844 + }, + { + "epoch": 1.5778253501025779, + "grad_norm": 0.4246023893356323, + "learning_rate": 5.2930231980610756e-05, + "loss": 0.3224, + "step": 8845 + }, + { + "epoch": 1.578003746320578, + "grad_norm": 0.502919614315033, + "learning_rate": 5.288712914558189e-05, + "loss": 0.5362, + "step": 8846 + }, + { + "epoch": 1.5781821425385782, + "grad_norm": 0.5143377184867859, + "learning_rate": 5.284404179171312e-05, + "loss": 0.5843, + "step": 8847 + }, + { + "epoch": 1.5783605387565784, + "grad_norm": 0.48670274019241333, + "learning_rate": 5.2800969922388474e-05, + "loss": 0.4592, + "step": 8848 + }, + { + "epoch": 1.5785389349745786, + "grad_norm": 0.45652881264686584, + "learning_rate": 5.2757913540990715e-05, + "loss": 0.42, + "step": 8849 + }, + { + "epoch": 1.5787173311925788, + "grad_norm": 0.5632809996604919, + "learning_rate": 5.271487265090164e-05, + "loss": 0.7437, + "step": 8850 + }, + { + "epoch": 1.578895727410579, + "grad_norm": 0.5193389654159546, + "learning_rate": 5.267184725550148e-05, + "loss": 0.6235, + "step": 8851 + }, + { + "epoch": 1.5790741236285792, + "grad_norm": 0.45982155203819275, + "learning_rate": 5.262883735816959e-05, + "loss": 0.4623, + "step": 8852 + }, + { + "epoch": 1.5792525198465792, + "grad_norm": 0.5154650211334229, + "learning_rate": 5.25858429622838e-05, + "loss": 0.537, + "step": 8853 + }, + { + "epoch": 1.5794309160645794, + "grad_norm": 0.46093055605888367, + "learning_rate": 5.254286407122103e-05, + "loss": 0.4579, + "step": 8854 + }, + { + "epoch": 1.5796093122825796, + "grad_norm": 0.4196965992450714, + "learning_rate": 5.249990068835675e-05, + "loss": 0.3861, + "step": 8855 + }, + { + "epoch": 1.5797877085005798, + "grad_norm": 0.5842517614364624, + "learning_rate": 5.24569528170652e-05, + "loss": 0.68, + "step": 8856 + }, + { + "epoch": 1.57996610471858, + "grad_norm": 0.5660757422447205, + "learning_rate": 5.2414020460719636e-05, + "loss": 0.5804, + "step": 8857 + }, + { + "epoch": 1.58014450093658, + "grad_norm": 0.49591630697250366, + "learning_rate": 5.237110362269182e-05, + "loss": 0.5518, + "step": 8858 + }, + { + "epoch": 1.5803228971545802, + "grad_norm": 0.5173941850662231, + "learning_rate": 5.232820230635255e-05, + "loss": 0.4721, + "step": 8859 + }, + { + "epoch": 1.5805012933725804, + "grad_norm": 0.538420557975769, + "learning_rate": 5.228531651507112e-05, + "loss": 0.5329, + "step": 8860 + }, + { + "epoch": 1.5806796895905806, + "grad_norm": 0.5405678749084473, + "learning_rate": 5.2242446252215856e-05, + "loss": 0.652, + "step": 8861 + }, + { + "epoch": 1.5808580858085808, + "grad_norm": 0.5772570967674255, + "learning_rate": 5.219959152115381e-05, + "loss": 0.459, + "step": 8862 + }, + { + "epoch": 1.581036482026581, + "grad_norm": 0.5803507566452026, + "learning_rate": 5.215675232525069e-05, + "loss": 0.6247, + "step": 8863 + }, + { + "epoch": 1.5812148782445812, + "grad_norm": 0.49514850974082947, + "learning_rate": 5.211392866787115e-05, + "loss": 0.4549, + "step": 8864 + }, + { + "epoch": 1.5813932744625814, + "grad_norm": 0.5379515290260315, + "learning_rate": 5.207112055237842e-05, + "loss": 0.6696, + "step": 8865 + }, + { + "epoch": 1.5815716706805816, + "grad_norm": 0.5178011059761047, + "learning_rate": 5.20283279821348e-05, + "loss": 0.5623, + "step": 8866 + }, + { + "epoch": 1.5817500668985818, + "grad_norm": 0.4703742563724518, + "learning_rate": 5.198555096050103e-05, + "loss": 0.5029, + "step": 8867 + }, + { + "epoch": 1.581928463116582, + "grad_norm": 0.5060920119285583, + "learning_rate": 5.194278949083695e-05, + "loss": 0.5898, + "step": 8868 + }, + { + "epoch": 1.5821068593345822, + "grad_norm": 0.4721207618713379, + "learning_rate": 5.1900043576500936e-05, + "loss": 0.3526, + "step": 8869 + }, + { + "epoch": 1.5822852555525824, + "grad_norm": 0.46023473143577576, + "learning_rate": 5.185731322085019e-05, + "loss": 0.5206, + "step": 8870 + }, + { + "epoch": 1.5824636517705826, + "grad_norm": 0.46701812744140625, + "learning_rate": 5.181459842724087e-05, + "loss": 0.4646, + "step": 8871 + }, + { + "epoch": 1.5826420479885828, + "grad_norm": 0.5196385383605957, + "learning_rate": 5.177189919902761e-05, + "loss": 0.4874, + "step": 8872 + }, + { + "epoch": 1.582820444206583, + "grad_norm": 0.5126696228981018, + "learning_rate": 5.172921553956417e-05, + "loss": 0.525, + "step": 8873 + }, + { + "epoch": 1.5829988404245832, + "grad_norm": 0.5253236889839172, + "learning_rate": 5.168654745220275e-05, + "loss": 0.5603, + "step": 8874 + }, + { + "epoch": 1.5831772366425831, + "grad_norm": 0.5543179512023926, + "learning_rate": 5.164389494029456e-05, + "loss": 0.6045, + "step": 8875 + }, + { + "epoch": 1.5833556328605833, + "grad_norm": 0.4786223769187927, + "learning_rate": 5.160125800718956e-05, + "loss": 0.4731, + "step": 8876 + }, + { + "epoch": 1.5835340290785835, + "grad_norm": 0.5591602921485901, + "learning_rate": 5.1558636656236306e-05, + "loss": 0.6183, + "step": 8877 + }, + { + "epoch": 1.5837124252965837, + "grad_norm": 0.6518511772155762, + "learning_rate": 5.151603089078241e-05, + "loss": 0.5388, + "step": 8878 + }, + { + "epoch": 1.583890821514584, + "grad_norm": 0.5040208101272583, + "learning_rate": 5.147344071417398e-05, + "loss": 0.4481, + "step": 8879 + }, + { + "epoch": 1.584069217732584, + "grad_norm": 0.533556342124939, + "learning_rate": 5.143086612975611e-05, + "loss": 0.5437, + "step": 8880 + }, + { + "epoch": 1.584247613950584, + "grad_norm": 0.4733051657676697, + "learning_rate": 5.1388307140872614e-05, + "loss": 0.51, + "step": 8881 + }, + { + "epoch": 1.5844260101685843, + "grad_norm": 0.5339987277984619, + "learning_rate": 5.134576375086591e-05, + "loss": 0.5314, + "step": 8882 + }, + { + "epoch": 1.5846044063865845, + "grad_norm": 0.477483332157135, + "learning_rate": 5.130323596307751e-05, + "loss": 0.3984, + "step": 8883 + }, + { + "epoch": 1.5847828026045847, + "grad_norm": 0.509962260723114, + "learning_rate": 5.126072378084737e-05, + "loss": 0.461, + "step": 8884 + }, + { + "epoch": 1.584961198822585, + "grad_norm": 0.5546766519546509, + "learning_rate": 5.121822720751454e-05, + "loss": 0.6239, + "step": 8885 + }, + { + "epoch": 1.585139595040585, + "grad_norm": 0.5097823739051819, + "learning_rate": 5.117574624641652e-05, + "loss": 0.4881, + "step": 8886 + }, + { + "epoch": 1.5853179912585853, + "grad_norm": 0.43369320034980774, + "learning_rate": 5.1133280900889924e-05, + "loss": 0.3753, + "step": 8887 + }, + { + "epoch": 1.5854963874765855, + "grad_norm": 0.523571252822876, + "learning_rate": 5.109083117426977e-05, + "loss": 0.6094, + "step": 8888 + }, + { + "epoch": 1.5856747836945857, + "grad_norm": 0.5631095170974731, + "learning_rate": 5.104839706989023e-05, + "loss": 0.5408, + "step": 8889 + }, + { + "epoch": 1.585853179912586, + "grad_norm": 0.4018095135688782, + "learning_rate": 5.100597859108388e-05, + "loss": 0.3599, + "step": 8890 + }, + { + "epoch": 1.586031576130586, + "grad_norm": 0.5422604084014893, + "learning_rate": 5.096357574118235e-05, + "loss": 0.4818, + "step": 8891 + }, + { + "epoch": 1.5862099723485863, + "grad_norm": 0.4900394082069397, + "learning_rate": 5.092118852351599e-05, + "loss": 0.4453, + "step": 8892 + }, + { + "epoch": 1.5863883685665865, + "grad_norm": 0.5061340928077698, + "learning_rate": 5.0878816941413744e-05, + "loss": 0.5332, + "step": 8893 + }, + { + "epoch": 1.5865667647845867, + "grad_norm": 0.4389035999774933, + "learning_rate": 5.0836460998203606e-05, + "loss": 0.361, + "step": 8894 + }, + { + "epoch": 1.586745161002587, + "grad_norm": 0.533852756023407, + "learning_rate": 5.0794120697212094e-05, + "loss": 0.5116, + "step": 8895 + }, + { + "epoch": 1.586923557220587, + "grad_norm": 0.5132311582565308, + "learning_rate": 5.075179604176458e-05, + "loss": 0.5874, + "step": 8896 + }, + { + "epoch": 1.587101953438587, + "grad_norm": 0.49478766322135925, + "learning_rate": 5.07094870351853e-05, + "loss": 0.4595, + "step": 8897 + }, + { + "epoch": 1.5872803496565873, + "grad_norm": 0.5392327904701233, + "learning_rate": 5.066719368079708e-05, + "loss": 0.7202, + "step": 8898 + }, + { + "epoch": 1.5874587458745875, + "grad_norm": 0.5095738768577576, + "learning_rate": 5.062491598192179e-05, + "loss": 0.5124, + "step": 8899 + }, + { + "epoch": 1.5876371420925877, + "grad_norm": 0.5276995301246643, + "learning_rate": 5.058265394187969e-05, + "loss": 0.6014, + "step": 8900 + }, + { + "epoch": 1.5878155383105879, + "grad_norm": 0.5482483506202698, + "learning_rate": 5.054040756399023e-05, + "loss": 0.7651, + "step": 8901 + }, + { + "epoch": 1.5879939345285878, + "grad_norm": 0.46668532490730286, + "learning_rate": 5.049817685157132e-05, + "loss": 0.4828, + "step": 8902 + }, + { + "epoch": 1.588172330746588, + "grad_norm": 0.6453546285629272, + "learning_rate": 5.045596180793968e-05, + "loss": 0.7255, + "step": 8903 + }, + { + "epoch": 1.5883507269645882, + "grad_norm": 0.47187116742134094, + "learning_rate": 5.041376243641099e-05, + "loss": 0.4884, + "step": 8904 + }, + { + "epoch": 1.5885291231825884, + "grad_norm": 0.4838643968105316, + "learning_rate": 5.037157874029946e-05, + "loss": 0.5405, + "step": 8905 + }, + { + "epoch": 1.5887075194005886, + "grad_norm": 0.5167852640151978, + "learning_rate": 5.032941072291822e-05, + "loss": 0.6482, + "step": 8906 + }, + { + "epoch": 1.5888859156185888, + "grad_norm": 0.44085460901260376, + "learning_rate": 5.028725838757919e-05, + "loss": 0.4037, + "step": 8907 + }, + { + "epoch": 1.589064311836589, + "grad_norm": 0.5068758726119995, + "learning_rate": 5.024512173759288e-05, + "loss": 0.5228, + "step": 8908 + }, + { + "epoch": 1.5892427080545892, + "grad_norm": 0.4763888418674469, + "learning_rate": 5.020300077626883e-05, + "loss": 0.4412, + "step": 8909 + }, + { + "epoch": 1.5894211042725894, + "grad_norm": 0.5120984315872192, + "learning_rate": 5.016089550691505e-05, + "loss": 0.5497, + "step": 8910 + }, + { + "epoch": 1.5895995004905896, + "grad_norm": 0.5709746479988098, + "learning_rate": 5.0118805932838604e-05, + "loss": 0.5747, + "step": 8911 + }, + { + "epoch": 1.5897778967085898, + "grad_norm": 0.5391095280647278, + "learning_rate": 5.0076732057345034e-05, + "loss": 0.547, + "step": 8912 + }, + { + "epoch": 1.58995629292659, + "grad_norm": 0.5848898887634277, + "learning_rate": 5.0034673883738974e-05, + "loss": 0.5088, + "step": 8913 + }, + { + "epoch": 1.5901346891445902, + "grad_norm": 0.4863363206386566, + "learning_rate": 4.999263141532359e-05, + "loss": 0.4024, + "step": 8914 + }, + { + "epoch": 1.5903130853625904, + "grad_norm": 0.5417057871818542, + "learning_rate": 4.99506046554008e-05, + "loss": 0.5766, + "step": 8915 + }, + { + "epoch": 1.5904914815805906, + "grad_norm": 0.5208100080490112, + "learning_rate": 4.990859360727148e-05, + "loss": 0.617, + "step": 8916 + }, + { + "epoch": 1.5906698777985908, + "grad_norm": 0.5628762245178223, + "learning_rate": 4.9866598274235064e-05, + "loss": 0.6443, + "step": 8917 + }, + { + "epoch": 1.590848274016591, + "grad_norm": 0.5478858351707458, + "learning_rate": 4.9824618659589957e-05, + "loss": 0.7009, + "step": 8918 + }, + { + "epoch": 1.591026670234591, + "grad_norm": 0.5053479075431824, + "learning_rate": 4.9782654766633076e-05, + "loss": 0.4551, + "step": 8919 + }, + { + "epoch": 1.5912050664525912, + "grad_norm": 0.46800491213798523, + "learning_rate": 4.9740706598660396e-05, + "loss": 0.4591, + "step": 8920 + }, + { + "epoch": 1.5913834626705914, + "grad_norm": 0.47018712759017944, + "learning_rate": 4.9698774158966395e-05, + "loss": 0.4372, + "step": 8921 + }, + { + "epoch": 1.5915618588885916, + "grad_norm": 0.4378306567668915, + "learning_rate": 4.965685745084447e-05, + "loss": 0.4064, + "step": 8922 + }, + { + "epoch": 1.5917402551065918, + "grad_norm": 0.4751421809196472, + "learning_rate": 4.961495647758679e-05, + "loss": 0.5386, + "step": 8923 + }, + { + "epoch": 1.5919186513245918, + "grad_norm": 0.4474140703678131, + "learning_rate": 4.957307124248417e-05, + "loss": 0.5067, + "step": 8924 + }, + { + "epoch": 1.592097047542592, + "grad_norm": 0.5166023969650269, + "learning_rate": 4.9531201748826335e-05, + "loss": 0.5903, + "step": 8925 + }, + { + "epoch": 1.5922754437605922, + "grad_norm": 0.5033789277076721, + "learning_rate": 4.9489347999901567e-05, + "loss": 0.5299, + "step": 8926 + }, + { + "epoch": 1.5924538399785924, + "grad_norm": 0.4894052743911743, + "learning_rate": 4.944750999899719e-05, + "loss": 0.4912, + "step": 8927 + }, + { + "epoch": 1.5926322361965926, + "grad_norm": 0.484478235244751, + "learning_rate": 4.9405687749399076e-05, + "loss": 0.5635, + "step": 8928 + }, + { + "epoch": 1.5928106324145928, + "grad_norm": 0.47258055210113525, + "learning_rate": 4.936388125439184e-05, + "loss": 0.4479, + "step": 8929 + }, + { + "epoch": 1.592989028632593, + "grad_norm": 0.4957214593887329, + "learning_rate": 4.932209051725914e-05, + "loss": 0.554, + "step": 8930 + }, + { + "epoch": 1.5931674248505932, + "grad_norm": 0.5196431279182434, + "learning_rate": 4.9280315541282985e-05, + "loss": 0.5595, + "step": 8931 + }, + { + "epoch": 1.5933458210685933, + "grad_norm": 0.4598284065723419, + "learning_rate": 4.923855632974455e-05, + "loss": 0.421, + "step": 8932 + }, + { + "epoch": 1.5935242172865935, + "grad_norm": 0.42510855197906494, + "learning_rate": 4.919681288592345e-05, + "loss": 0.3646, + "step": 8933 + }, + { + "epoch": 1.5937026135045937, + "grad_norm": 0.5351304411888123, + "learning_rate": 4.91550852130983e-05, + "loss": 0.5159, + "step": 8934 + }, + { + "epoch": 1.593881009722594, + "grad_norm": 0.499664306640625, + "learning_rate": 4.911337331454635e-05, + "loss": 0.4688, + "step": 8935 + }, + { + "epoch": 1.5940594059405941, + "grad_norm": 0.47697949409484863, + "learning_rate": 4.907167719354347e-05, + "loss": 0.4619, + "step": 8936 + }, + { + "epoch": 1.5942378021585943, + "grad_norm": 0.48516783118247986, + "learning_rate": 4.9029996853364734e-05, + "loss": 0.5222, + "step": 8937 + }, + { + "epoch": 1.5944161983765945, + "grad_norm": 0.4567139446735382, + "learning_rate": 4.898833229728347e-05, + "loss": 0.4139, + "step": 8938 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 0.5564625859260559, + "learning_rate": 4.894668352857218e-05, + "loss": 0.5797, + "step": 8939 + }, + { + "epoch": 1.594772990812595, + "grad_norm": 0.5010836124420166, + "learning_rate": 4.890505055050182e-05, + "loss": 0.5232, + "step": 8940 + }, + { + "epoch": 1.594951387030595, + "grad_norm": 0.5170525908470154, + "learning_rate": 4.886343336634222e-05, + "loss": 0.5822, + "step": 8941 + }, + { + "epoch": 1.5951297832485951, + "grad_norm": 0.5084660649299622, + "learning_rate": 4.8821831979362044e-05, + "loss": 0.5954, + "step": 8942 + }, + { + "epoch": 1.5953081794665953, + "grad_norm": 0.49296459555625916, + "learning_rate": 4.878024639282855e-05, + "loss": 0.5069, + "step": 8943 + }, + { + "epoch": 1.5954865756845955, + "grad_norm": 0.515155017375946, + "learning_rate": 4.8738676610008e-05, + "loss": 0.5101, + "step": 8944 + }, + { + "epoch": 1.5956649719025957, + "grad_norm": 0.3873237073421478, + "learning_rate": 4.869712263416509e-05, + "loss": 0.2959, + "step": 8945 + }, + { + "epoch": 1.5958433681205957, + "grad_norm": 0.4798237681388855, + "learning_rate": 4.865558446856361e-05, + "loss": 0.5004, + "step": 8946 + }, + { + "epoch": 1.5960217643385959, + "grad_norm": 0.5282852053642273, + "learning_rate": 4.8614062116465826e-05, + "loss": 0.5694, + "step": 8947 + }, + { + "epoch": 1.596200160556596, + "grad_norm": 0.43830955028533936, + "learning_rate": 4.8572555581132995e-05, + "loss": 0.4124, + "step": 8948 + }, + { + "epoch": 1.5963785567745963, + "grad_norm": 0.5239266753196716, + "learning_rate": 4.853106486582498e-05, + "loss": 0.6721, + "step": 8949 + }, + { + "epoch": 1.5965569529925965, + "grad_norm": 0.46557673811912537, + "learning_rate": 4.8489589973800346e-05, + "loss": 0.4868, + "step": 8950 + }, + { + "epoch": 1.5967353492105967, + "grad_norm": 0.5202271342277527, + "learning_rate": 4.844813090831668e-05, + "loss": 0.734, + "step": 8951 + }, + { + "epoch": 1.5969137454285969, + "grad_norm": 0.4756893515586853, + "learning_rate": 4.840668767262993e-05, + "loss": 0.5069, + "step": 8952 + }, + { + "epoch": 1.597092141646597, + "grad_norm": 0.40755748748779297, + "learning_rate": 4.836526026999532e-05, + "loss": 0.3434, + "step": 8953 + }, + { + "epoch": 1.5972705378645973, + "grad_norm": 0.5153047442436218, + "learning_rate": 4.8323848703666405e-05, + "loss": 0.4607, + "step": 8954 + }, + { + "epoch": 1.5974489340825975, + "grad_norm": 0.5465795397758484, + "learning_rate": 4.8282452976895566e-05, + "loss": 0.5989, + "step": 8955 + }, + { + "epoch": 1.5976273303005977, + "grad_norm": 0.47764524817466736, + "learning_rate": 4.8241073092934104e-05, + "loss": 0.4355, + "step": 8956 + }, + { + "epoch": 1.5978057265185979, + "grad_norm": 0.5183848142623901, + "learning_rate": 4.8199709055031876e-05, + "loss": 0.4979, + "step": 8957 + }, + { + "epoch": 1.597984122736598, + "grad_norm": 0.55250483751297, + "learning_rate": 4.815836086643774e-05, + "loss": 0.5273, + "step": 8958 + }, + { + "epoch": 1.5981625189545983, + "grad_norm": 0.4400150775909424, + "learning_rate": 4.811702853039901e-05, + "loss": 0.4164, + "step": 8959 + }, + { + "epoch": 1.5983409151725985, + "grad_norm": 0.5529983639717102, + "learning_rate": 4.807571205016206e-05, + "loss": 0.6202, + "step": 8960 + }, + { + "epoch": 1.5985193113905987, + "grad_norm": 0.5263171792030334, + "learning_rate": 4.803441142897178e-05, + "loss": 0.5444, + "step": 8961 + }, + { + "epoch": 1.5986977076085989, + "grad_norm": 0.4600481688976288, + "learning_rate": 4.799312667007183e-05, + "loss": 0.4249, + "step": 8962 + }, + { + "epoch": 1.5988761038265988, + "grad_norm": 0.5363233089447021, + "learning_rate": 4.795185777670485e-05, + "loss": 0.4851, + "step": 8963 + }, + { + "epoch": 1.599054500044599, + "grad_norm": 0.478712797164917, + "learning_rate": 4.791060475211198e-05, + "loss": 0.3852, + "step": 8964 + }, + { + "epoch": 1.5992328962625992, + "grad_norm": 0.5279228687286377, + "learning_rate": 4.7869367599533284e-05, + "loss": 0.6256, + "step": 8965 + }, + { + "epoch": 1.5994112924805994, + "grad_norm": 0.4477089047431946, + "learning_rate": 4.782814632220742e-05, + "loss": 0.4195, + "step": 8966 + }, + { + "epoch": 1.5995896886985996, + "grad_norm": 0.5009279847145081, + "learning_rate": 4.778694092337194e-05, + "loss": 0.5, + "step": 8967 + }, + { + "epoch": 1.5997680849165996, + "grad_norm": 0.47871533036231995, + "learning_rate": 4.7745751406263163e-05, + "loss": 0.5258, + "step": 8968 + }, + { + "epoch": 1.5999464811345998, + "grad_norm": 0.47330227494239807, + "learning_rate": 4.770457777411597e-05, + "loss": 0.4042, + "step": 8969 + }, + { + "epoch": 1.6001248773526, + "grad_norm": 0.4700087308883667, + "learning_rate": 4.766342003016424e-05, + "loss": 0.462, + "step": 8970 + }, + { + "epoch": 1.6003032735706002, + "grad_norm": 0.5046184659004211, + "learning_rate": 4.7622278177640366e-05, + "loss": 0.4632, + "step": 8971 + }, + { + "epoch": 1.6004816697886004, + "grad_norm": 0.5220001935958862, + "learning_rate": 4.758115221977574e-05, + "loss": 0.531, + "step": 8972 + }, + { + "epoch": 1.6006600660066006, + "grad_norm": 0.5016428232192993, + "learning_rate": 4.7540042159800264e-05, + "loss": 0.5008, + "step": 8973 + }, + { + "epoch": 1.6008384622246008, + "grad_norm": 0.4888211786746979, + "learning_rate": 4.7498948000942814e-05, + "loss": 0.4499, + "step": 8974 + }, + { + "epoch": 1.601016858442601, + "grad_norm": 0.5536308288574219, + "learning_rate": 4.745786974643082e-05, + "loss": 0.5715, + "step": 8975 + }, + { + "epoch": 1.6011952546606012, + "grad_norm": 0.48591530323028564, + "learning_rate": 4.741680739949053e-05, + "loss": 0.4089, + "step": 8976 + }, + { + "epoch": 1.6013736508786014, + "grad_norm": 0.6232868432998657, + "learning_rate": 4.7375760963347056e-05, + "loss": 0.5869, + "step": 8977 + }, + { + "epoch": 1.6015520470966016, + "grad_norm": 0.5616008043289185, + "learning_rate": 4.733473044122408e-05, + "loss": 0.5639, + "step": 8978 + }, + { + "epoch": 1.6017304433146018, + "grad_norm": 0.5132565498352051, + "learning_rate": 4.72937158363442e-05, + "loss": 0.53, + "step": 8979 + }, + { + "epoch": 1.601908839532602, + "grad_norm": 0.5778293609619141, + "learning_rate": 4.725271715192861e-05, + "loss": 0.7178, + "step": 8980 + }, + { + "epoch": 1.6020872357506022, + "grad_norm": 0.5367584228515625, + "learning_rate": 4.721173439119742e-05, + "loss": 0.5345, + "step": 8981 + }, + { + "epoch": 1.6022656319686024, + "grad_norm": 0.4915335476398468, + "learning_rate": 4.7170767557369264e-05, + "loss": 0.5284, + "step": 8982 + }, + { + "epoch": 1.6024440281866026, + "grad_norm": 0.524190366268158, + "learning_rate": 4.712981665366176e-05, + "loss": 0.5688, + "step": 8983 + }, + { + "epoch": 1.6026224244046028, + "grad_norm": 0.5471283793449402, + "learning_rate": 4.70888816832912e-05, + "loss": 0.5799, + "step": 8984 + }, + { + "epoch": 1.6028008206226028, + "grad_norm": 0.5151530504226685, + "learning_rate": 4.7047962649472504e-05, + "loss": 0.5424, + "step": 8985 + }, + { + "epoch": 1.602979216840603, + "grad_norm": 0.5410858988761902, + "learning_rate": 4.7007059555419535e-05, + "loss": 0.624, + "step": 8986 + }, + { + "epoch": 1.6031576130586032, + "grad_norm": 0.4905013144016266, + "learning_rate": 4.696617240434475e-05, + "loss": 0.4588, + "step": 8987 + }, + { + "epoch": 1.6033360092766034, + "grad_norm": 0.4888882637023926, + "learning_rate": 4.692530119945937e-05, + "loss": 0.4577, + "step": 8988 + }, + { + "epoch": 1.6035144054946036, + "grad_norm": 0.4958007335662842, + "learning_rate": 4.688444594397351e-05, + "loss": 0.519, + "step": 8989 + }, + { + "epoch": 1.6036928017126035, + "grad_norm": 0.5347129106521606, + "learning_rate": 4.684360664109577e-05, + "loss": 0.687, + "step": 8990 + }, + { + "epoch": 1.6038711979306037, + "grad_norm": 0.5105026364326477, + "learning_rate": 4.680278329403381e-05, + "loss": 0.5317, + "step": 8991 + }, + { + "epoch": 1.604049594148604, + "grad_norm": 0.4922098219394684, + "learning_rate": 4.676197590599377e-05, + "loss": 0.482, + "step": 8992 + }, + { + "epoch": 1.6042279903666041, + "grad_norm": 0.44692763686180115, + "learning_rate": 4.672118448018073e-05, + "loss": 0.4144, + "step": 8993 + }, + { + "epoch": 1.6044063865846043, + "grad_norm": 0.5374802947044373, + "learning_rate": 4.6680409019798364e-05, + "loss": 0.6081, + "step": 8994 + }, + { + "epoch": 1.6045847828026045, + "grad_norm": 0.54610675573349, + "learning_rate": 4.6639649528049135e-05, + "loss": 0.6023, + "step": 8995 + }, + { + "epoch": 1.6047631790206047, + "grad_norm": 0.4369504451751709, + "learning_rate": 4.659890600813438e-05, + "loss": 0.357, + "step": 8996 + }, + { + "epoch": 1.604941575238605, + "grad_norm": 0.47287872433662415, + "learning_rate": 4.6558178463253944e-05, + "loss": 0.4559, + "step": 8997 + }, + { + "epoch": 1.6051199714566051, + "grad_norm": 0.4916176497936249, + "learning_rate": 4.651746689660663e-05, + "loss": 0.5313, + "step": 8998 + }, + { + "epoch": 1.6052983676746053, + "grad_norm": 0.5396872758865356, + "learning_rate": 4.647677131138997e-05, + "loss": 0.5313, + "step": 8999 + }, + { + "epoch": 1.6054767638926055, + "grad_norm": 0.5110781788825989, + "learning_rate": 4.643609171080001e-05, + "loss": 0.5488, + "step": 9000 + }, + { + "epoch": 1.6056551601106057, + "grad_norm": 0.5322538018226624, + "learning_rate": 4.63954280980319e-05, + "loss": 0.5717, + "step": 9001 + }, + { + "epoch": 1.605833556328606, + "grad_norm": 0.49910300970077515, + "learning_rate": 4.63547804762792e-05, + "loss": 0.5186, + "step": 9002 + }, + { + "epoch": 1.6060119525466061, + "grad_norm": 0.48461076617240906, + "learning_rate": 4.631414884873444e-05, + "loss": 0.5344, + "step": 9003 + }, + { + "epoch": 1.6061903487646063, + "grad_norm": 0.626358151435852, + "learning_rate": 4.627353321858874e-05, + "loss": 0.7774, + "step": 9004 + }, + { + "epoch": 1.6063687449826065, + "grad_norm": 0.6041339039802551, + "learning_rate": 4.6232933589032105e-05, + "loss": 0.6707, + "step": 9005 + }, + { + "epoch": 1.6065471412006067, + "grad_norm": 0.4728602468967438, + "learning_rate": 4.619234996325314e-05, + "loss": 0.4132, + "step": 9006 + }, + { + "epoch": 1.6067255374186067, + "grad_norm": 0.5530493259429932, + "learning_rate": 4.6151782344439366e-05, + "loss": 0.5413, + "step": 9007 + }, + { + "epoch": 1.6069039336366069, + "grad_norm": 0.5874399542808533, + "learning_rate": 4.611123073577686e-05, + "loss": 0.7141, + "step": 9008 + }, + { + "epoch": 1.607082329854607, + "grad_norm": 0.5239754319190979, + "learning_rate": 4.607069514045051e-05, + "loss": 0.6276, + "step": 9009 + }, + { + "epoch": 1.6072607260726073, + "grad_norm": 0.5062006115913391, + "learning_rate": 4.603017556164407e-05, + "loss": 0.4942, + "step": 9010 + }, + { + "epoch": 1.6074391222906075, + "grad_norm": 0.49783483147621155, + "learning_rate": 4.5989672002539785e-05, + "loss": 0.4906, + "step": 9011 + }, + { + "epoch": 1.6076175185086075, + "grad_norm": 0.5406367182731628, + "learning_rate": 4.594918446631896e-05, + "loss": 0.6172, + "step": 9012 + }, + { + "epoch": 1.6077959147266077, + "grad_norm": 0.48416727781295776, + "learning_rate": 4.590871295616128e-05, + "loss": 0.5055, + "step": 9013 + }, + { + "epoch": 1.6079743109446079, + "grad_norm": 0.5911693572998047, + "learning_rate": 4.586825747524548e-05, + "loss": 0.7576, + "step": 9014 + }, + { + "epoch": 1.608152707162608, + "grad_norm": 0.5087306499481201, + "learning_rate": 4.582781802674896e-05, + "loss": 0.4797, + "step": 9015 + }, + { + "epoch": 1.6083311033806083, + "grad_norm": 0.4570036232471466, + "learning_rate": 4.578739461384765e-05, + "loss": 0.5285, + "step": 9016 + }, + { + "epoch": 1.6085094995986084, + "grad_norm": 0.46012383699417114, + "learning_rate": 4.57469872397166e-05, + "loss": 0.4309, + "step": 9017 + }, + { + "epoch": 1.6086878958166086, + "grad_norm": 0.4837871789932251, + "learning_rate": 4.570659590752918e-05, + "loss": 0.5151, + "step": 9018 + }, + { + "epoch": 1.6088662920346088, + "grad_norm": 0.5085347890853882, + "learning_rate": 4.566622062045786e-05, + "loss": 0.6172, + "step": 9019 + }, + { + "epoch": 1.609044688252609, + "grad_norm": 0.4480245113372803, + "learning_rate": 4.562586138167368e-05, + "loss": 0.3621, + "step": 9020 + }, + { + "epoch": 1.6092230844706092, + "grad_norm": 0.5031635761260986, + "learning_rate": 4.558551819434631e-05, + "loss": 0.5342, + "step": 9021 + }, + { + "epoch": 1.6094014806886094, + "grad_norm": 0.5782181024551392, + "learning_rate": 4.554519106164442e-05, + "loss": 0.551, + "step": 9022 + }, + { + "epoch": 1.6095798769066096, + "grad_norm": 0.5325855612754822, + "learning_rate": 4.550487998673519e-05, + "loss": 0.5286, + "step": 9023 + }, + { + "epoch": 1.6097582731246098, + "grad_norm": 0.4554942548274994, + "learning_rate": 4.5464584972784774e-05, + "loss": 0.453, + "step": 9024 + }, + { + "epoch": 1.60993666934261, + "grad_norm": 0.5228267312049866, + "learning_rate": 4.5424306022957745e-05, + "loss": 0.6083, + "step": 9025 + }, + { + "epoch": 1.6101150655606102, + "grad_norm": 0.5161296725273132, + "learning_rate": 4.538404314041775e-05, + "loss": 0.5022, + "step": 9026 + }, + { + "epoch": 1.6102934617786104, + "grad_norm": 0.5143990516662598, + "learning_rate": 4.534379632832691e-05, + "loss": 0.4937, + "step": 9027 + }, + { + "epoch": 1.6104718579966106, + "grad_norm": 0.483243852853775, + "learning_rate": 4.530356558984622e-05, + "loss": 0.4821, + "step": 9028 + }, + { + "epoch": 1.6106502542146106, + "grad_norm": 0.5501946210861206, + "learning_rate": 4.5263350928135465e-05, + "loss": 0.6569, + "step": 9029 + }, + { + "epoch": 1.6108286504326108, + "grad_norm": 0.5732588171958923, + "learning_rate": 4.5223152346352964e-05, + "loss": 0.7411, + "step": 9030 + }, + { + "epoch": 1.611007046650611, + "grad_norm": 0.4644021987915039, + "learning_rate": 4.518296984765599e-05, + "loss": 0.506, + "step": 9031 + }, + { + "epoch": 1.6111854428686112, + "grad_norm": 0.5346475839614868, + "learning_rate": 4.514280343520041e-05, + "loss": 0.5926, + "step": 9032 + }, + { + "epoch": 1.6113638390866114, + "grad_norm": 0.4996803104877472, + "learning_rate": 4.510265311214093e-05, + "loss": 0.4819, + "step": 9033 + }, + { + "epoch": 1.6115422353046114, + "grad_norm": 0.5373480319976807, + "learning_rate": 4.50625188816309e-05, + "loss": 0.5144, + "step": 9034 + }, + { + "epoch": 1.6117206315226116, + "grad_norm": 0.5674717426300049, + "learning_rate": 4.5022400746822374e-05, + "loss": 0.5336, + "step": 9035 + }, + { + "epoch": 1.6118990277406118, + "grad_norm": 0.531956672668457, + "learning_rate": 4.498229871086637e-05, + "loss": 0.4986, + "step": 9036 + }, + { + "epoch": 1.612077423958612, + "grad_norm": 0.5014679431915283, + "learning_rate": 4.4942212776912325e-05, + "loss": 0.5051, + "step": 9037 + }, + { + "epoch": 1.6122558201766122, + "grad_norm": 0.5485700368881226, + "learning_rate": 4.4902142948108684e-05, + "loss": 0.502, + "step": 9038 + }, + { + "epoch": 1.6124342163946124, + "grad_norm": 0.5426976084709167, + "learning_rate": 4.486208922760243e-05, + "loss": 0.5571, + "step": 9039 + }, + { + "epoch": 1.6126126126126126, + "grad_norm": 0.49919578433036804, + "learning_rate": 4.48220516185395e-05, + "loss": 0.4595, + "step": 9040 + }, + { + "epoch": 1.6127910088306128, + "grad_norm": 0.43158578872680664, + "learning_rate": 4.4782030124064314e-05, + "loss": 0.3117, + "step": 9041 + }, + { + "epoch": 1.612969405048613, + "grad_norm": 0.5657439231872559, + "learning_rate": 4.474202474732011e-05, + "loss": 0.5668, + "step": 9042 + }, + { + "epoch": 1.6131478012666132, + "grad_norm": 0.5453231334686279, + "learning_rate": 4.470203549144902e-05, + "loss": 0.6741, + "step": 9043 + }, + { + "epoch": 1.6133261974846134, + "grad_norm": 0.5499635338783264, + "learning_rate": 4.4662062359591585e-05, + "loss": 0.719, + "step": 9044 + }, + { + "epoch": 1.6135045937026136, + "grad_norm": 0.43844422698020935, + "learning_rate": 4.4622105354887534e-05, + "loss": 0.3209, + "step": 9045 + }, + { + "epoch": 1.6136829899206138, + "grad_norm": 0.5634275078773499, + "learning_rate": 4.458216448047494e-05, + "loss": 0.5132, + "step": 9046 + }, + { + "epoch": 1.613861386138614, + "grad_norm": 0.4690277874469757, + "learning_rate": 4.4542239739490705e-05, + "loss": 0.3584, + "step": 9047 + }, + { + "epoch": 1.6140397823566142, + "grad_norm": 0.4696029722690582, + "learning_rate": 4.45023311350706e-05, + "loss": 0.4606, + "step": 9048 + }, + { + "epoch": 1.6142181785746144, + "grad_norm": 0.4961758553981781, + "learning_rate": 4.446243867034891e-05, + "loss": 0.5177, + "step": 9049 + }, + { + "epoch": 1.6143965747926146, + "grad_norm": 0.5290566682815552, + "learning_rate": 4.44225623484589e-05, + "loss": 0.5366, + "step": 9050 + }, + { + "epoch": 1.6145749710106145, + "grad_norm": 0.5504355430603027, + "learning_rate": 4.438270217253232e-05, + "loss": 0.6723, + "step": 9051 + }, + { + "epoch": 1.6147533672286147, + "grad_norm": 0.49992886185646057, + "learning_rate": 4.434285814569988e-05, + "loss": 0.5295, + "step": 9052 + }, + { + "epoch": 1.614931763446615, + "grad_norm": 0.5225392580032349, + "learning_rate": 4.430303027109081e-05, + "loss": 0.575, + "step": 9053 + }, + { + "epoch": 1.6151101596646151, + "grad_norm": 0.5753360390663147, + "learning_rate": 4.4263218551833294e-05, + "loss": 0.6629, + "step": 9054 + }, + { + "epoch": 1.6152885558826153, + "grad_norm": 0.4710274040699005, + "learning_rate": 4.422342299105403e-05, + "loss": 0.5181, + "step": 9055 + }, + { + "epoch": 1.6154669521006153, + "grad_norm": 0.5554249286651611, + "learning_rate": 4.4183643591878515e-05, + "loss": 0.6106, + "step": 9056 + }, + { + "epoch": 1.6156453483186155, + "grad_norm": 0.5206986665725708, + "learning_rate": 4.414388035743114e-05, + "loss": 0.5476, + "step": 9057 + }, + { + "epoch": 1.6158237445366157, + "grad_norm": 0.5251834392547607, + "learning_rate": 4.410413329083473e-05, + "loss": 0.6463, + "step": 9058 + }, + { + "epoch": 1.616002140754616, + "grad_norm": 0.49586114287376404, + "learning_rate": 4.4064402395211116e-05, + "loss": 0.427, + "step": 9059 + }, + { + "epoch": 1.616180536972616, + "grad_norm": 0.4815883934497833, + "learning_rate": 4.402468767368076e-05, + "loss": 0.4655, + "step": 9060 + }, + { + "epoch": 1.6163589331906163, + "grad_norm": 0.5105904936790466, + "learning_rate": 4.3984989129362744e-05, + "loss": 0.4924, + "step": 9061 + }, + { + "epoch": 1.6165373294086165, + "grad_norm": 0.47116348147392273, + "learning_rate": 4.3945306765375086e-05, + "loss": 0.4197, + "step": 9062 + }, + { + "epoch": 1.6167157256266167, + "grad_norm": 0.516294538974762, + "learning_rate": 4.390564058483429e-05, + "loss": 0.5377, + "step": 9063 + }, + { + "epoch": 1.616894121844617, + "grad_norm": 0.4575921595096588, + "learning_rate": 4.3865990590855885e-05, + "loss": 0.5038, + "step": 9064 + }, + { + "epoch": 1.617072518062617, + "grad_norm": 0.5077314376831055, + "learning_rate": 4.3826356786553776e-05, + "loss": 0.4385, + "step": 9065 + }, + { + "epoch": 1.6172509142806173, + "grad_norm": 0.4583306908607483, + "learning_rate": 4.378673917504094e-05, + "loss": 0.4456, + "step": 9066 + }, + { + "epoch": 1.6174293104986175, + "grad_norm": 0.5505391955375671, + "learning_rate": 4.37471377594289e-05, + "loss": 0.5579, + "step": 9067 + }, + { + "epoch": 1.6176077067166177, + "grad_norm": 0.501794159412384, + "learning_rate": 4.3707552542827824e-05, + "loss": 0.4491, + "step": 9068 + }, + { + "epoch": 1.6177861029346179, + "grad_norm": 0.4820462465286255, + "learning_rate": 4.366798352834686e-05, + "loss": 0.4494, + "step": 9069 + }, + { + "epoch": 1.617964499152618, + "grad_norm": 0.6349626183509827, + "learning_rate": 4.3628430719093614e-05, + "loss": 0.6564, + "step": 9070 + }, + { + "epoch": 1.6181428953706183, + "grad_norm": 0.577896773815155, + "learning_rate": 4.3588894118174685e-05, + "loss": 0.6188, + "step": 9071 + }, + { + "epoch": 1.6183212915886185, + "grad_norm": 0.5295639634132385, + "learning_rate": 4.3549373728695105e-05, + "loss": 0.5548, + "step": 9072 + }, + { + "epoch": 1.6184996878066185, + "grad_norm": 0.5311222076416016, + "learning_rate": 4.350986955375893e-05, + "loss": 0.4302, + "step": 9073 + }, + { + "epoch": 1.6186780840246187, + "grad_norm": 0.4720657467842102, + "learning_rate": 4.3470381596468714e-05, + "loss": 0.4666, + "step": 9074 + }, + { + "epoch": 1.6188564802426189, + "grad_norm": 0.4858959913253784, + "learning_rate": 4.3430909859925814e-05, + "loss": 0.5341, + "step": 9075 + }, + { + "epoch": 1.619034876460619, + "grad_norm": 0.5353506207466125, + "learning_rate": 4.339145434723044e-05, + "loss": 0.5534, + "step": 9076 + }, + { + "epoch": 1.6192132726786193, + "grad_norm": 0.4330075681209564, + "learning_rate": 4.335201506148126e-05, + "loss": 0.363, + "step": 9077 + }, + { + "epoch": 1.6193916688966192, + "grad_norm": 0.5149824023246765, + "learning_rate": 4.3312592005775946e-05, + "loss": 0.5245, + "step": 9078 + }, + { + "epoch": 1.6195700651146194, + "grad_norm": 0.49177101254463196, + "learning_rate": 4.327318518321074e-05, + "loss": 0.5551, + "step": 9079 + }, + { + "epoch": 1.6197484613326196, + "grad_norm": 0.47272542119026184, + "learning_rate": 4.323379459688051e-05, + "loss": 0.4035, + "step": 9080 + }, + { + "epoch": 1.6199268575506198, + "grad_norm": 0.509996235370636, + "learning_rate": 4.319442024987916e-05, + "loss": 0.5317, + "step": 9081 + }, + { + "epoch": 1.62010525376862, + "grad_norm": 0.5650163888931274, + "learning_rate": 4.315506214529899e-05, + "loss": 0.7105, + "step": 9082 + }, + { + "epoch": 1.6202836499866202, + "grad_norm": 0.5097303986549377, + "learning_rate": 4.3115720286231257e-05, + "loss": 0.4972, + "step": 9083 + }, + { + "epoch": 1.6204620462046204, + "grad_norm": 0.5151503086090088, + "learning_rate": 4.3076394675765796e-05, + "loss": 0.5673, + "step": 9084 + }, + { + "epoch": 1.6206404424226206, + "grad_norm": 0.5100242495536804, + "learning_rate": 4.30370853169913e-05, + "loss": 0.4747, + "step": 9085 + }, + { + "epoch": 1.6208188386406208, + "grad_norm": 0.5815212726593018, + "learning_rate": 4.299779221299499e-05, + "loss": 0.6796, + "step": 9086 + }, + { + "epoch": 1.620997234858621, + "grad_norm": 0.5294939875602722, + "learning_rate": 4.2958515366863075e-05, + "loss": 0.5649, + "step": 9087 + }, + { + "epoch": 1.6211756310766212, + "grad_norm": 0.45899614691734314, + "learning_rate": 4.291925478168024e-05, + "loss": 0.4118, + "step": 9088 + }, + { + "epoch": 1.6213540272946214, + "grad_norm": 0.5030704736709595, + "learning_rate": 4.288001046052992e-05, + "loss": 0.5717, + "step": 9089 + }, + { + "epoch": 1.6215324235126216, + "grad_norm": 0.4515380561351776, + "learning_rate": 4.284078240649458e-05, + "loss": 0.4641, + "step": 9090 + }, + { + "epoch": 1.6217108197306218, + "grad_norm": 0.5059680938720703, + "learning_rate": 4.280157062265497e-05, + "loss": 0.4881, + "step": 9091 + }, + { + "epoch": 1.621889215948622, + "grad_norm": 0.4840905964374542, + "learning_rate": 4.2762375112090886e-05, + "loss": 0.4164, + "step": 9092 + }, + { + "epoch": 1.6220676121666222, + "grad_norm": 0.5307683348655701, + "learning_rate": 4.2723195877880706e-05, + "loss": 0.5169, + "step": 9093 + }, + { + "epoch": 1.6222460083846224, + "grad_norm": 0.6119624376296997, + "learning_rate": 4.268403292310144e-05, + "loss": 0.4207, + "step": 9094 + }, + { + "epoch": 1.6224244046026224, + "grad_norm": 0.4793750047683716, + "learning_rate": 4.264488625082907e-05, + "loss": 0.4304, + "step": 9095 + }, + { + "epoch": 1.6226028008206226, + "grad_norm": 0.5105946660041809, + "learning_rate": 4.260575586413806e-05, + "loss": 0.4994, + "step": 9096 + }, + { + "epoch": 1.6227811970386228, + "grad_norm": 0.5425340533256531, + "learning_rate": 4.256664176610178e-05, + "loss": 0.5135, + "step": 9097 + }, + { + "epoch": 1.622959593256623, + "grad_norm": 0.5028733611106873, + "learning_rate": 4.252754395979216e-05, + "loss": 0.4335, + "step": 9098 + }, + { + "epoch": 1.6231379894746232, + "grad_norm": 0.48356130719184875, + "learning_rate": 4.2488462448280005e-05, + "loss": 0.3819, + "step": 9099 + }, + { + "epoch": 1.6233163856926232, + "grad_norm": 0.5290387868881226, + "learning_rate": 4.244939723463467e-05, + "loss": 0.4932, + "step": 9100 + }, + { + "epoch": 1.6234947819106234, + "grad_norm": 0.4586488604545593, + "learning_rate": 4.241034832192434e-05, + "loss": 0.3835, + "step": 9101 + }, + { + "epoch": 1.6236731781286236, + "grad_norm": 0.5234144926071167, + "learning_rate": 4.237131571321598e-05, + "loss": 0.5544, + "step": 9102 + }, + { + "epoch": 1.6238515743466237, + "grad_norm": 0.46121421456336975, + "learning_rate": 4.233229941157504e-05, + "loss": 0.3899, + "step": 9103 + }, + { + "epoch": 1.624029970564624, + "grad_norm": 0.500857412815094, + "learning_rate": 4.229329942006604e-05, + "loss": 0.6343, + "step": 9104 + }, + { + "epoch": 1.6242083667826241, + "grad_norm": 0.47724878787994385, + "learning_rate": 4.225431574175184e-05, + "loss": 0.4782, + "step": 9105 + }, + { + "epoch": 1.6243867630006243, + "grad_norm": 0.46206358075141907, + "learning_rate": 4.221534837969429e-05, + "loss": 0.44, + "step": 9106 + }, + { + "epoch": 1.6245651592186245, + "grad_norm": 0.5365794897079468, + "learning_rate": 4.217639733695391e-05, + "loss": 0.5337, + "step": 9107 + }, + { + "epoch": 1.6247435554366247, + "grad_norm": 0.49346688389778137, + "learning_rate": 4.21374626165898e-05, + "loss": 0.4771, + "step": 9108 + }, + { + "epoch": 1.624921951654625, + "grad_norm": 0.5585914254188538, + "learning_rate": 4.209854422165998e-05, + "loss": 0.5715, + "step": 9109 + }, + { + "epoch": 1.6251003478726251, + "grad_norm": 0.49180254340171814, + "learning_rate": 4.205964215522096e-05, + "loss": 0.5383, + "step": 9110 + }, + { + "epoch": 1.6252787440906253, + "grad_norm": 0.5028634667396545, + "learning_rate": 4.202075642032824e-05, + "loss": 0.5635, + "step": 9111 + }, + { + "epoch": 1.6254571403086255, + "grad_norm": 0.583336353302002, + "learning_rate": 4.198188702003575e-05, + "loss": 0.6608, + "step": 9112 + }, + { + "epoch": 1.6256355365266257, + "grad_norm": 0.5647164583206177, + "learning_rate": 4.194303395739638e-05, + "loss": 0.7343, + "step": 9113 + }, + { + "epoch": 1.625813932744626, + "grad_norm": 0.4452681839466095, + "learning_rate": 4.19041972354616e-05, + "loss": 0.4388, + "step": 9114 + }, + { + "epoch": 1.6259923289626261, + "grad_norm": 0.3759744167327881, + "learning_rate": 4.186537685728156e-05, + "loss": 0.2556, + "step": 9115 + }, + { + "epoch": 1.6261707251806263, + "grad_norm": 0.6420390605926514, + "learning_rate": 4.1826572825905296e-05, + "loss": 0.8147, + "step": 9116 + }, + { + "epoch": 1.6263491213986263, + "grad_norm": 0.4719489812850952, + "learning_rate": 4.178778514438036e-05, + "loss": 0.501, + "step": 9117 + }, + { + "epoch": 1.6265275176166265, + "grad_norm": 0.5943500995635986, + "learning_rate": 4.174901381575327e-05, + "loss": 0.5954, + "step": 9118 + }, + { + "epoch": 1.6267059138346267, + "grad_norm": 0.45145153999328613, + "learning_rate": 4.171025884306892e-05, + "loss": 0.3752, + "step": 9119 + }, + { + "epoch": 1.626884310052627, + "grad_norm": 0.49264243245124817, + "learning_rate": 4.1671520229371234e-05, + "loss": 0.3958, + "step": 9120 + }, + { + "epoch": 1.627062706270627, + "grad_norm": 0.5612894296646118, + "learning_rate": 4.163279797770275e-05, + "loss": 0.6676, + "step": 9121 + }, + { + "epoch": 1.627241102488627, + "grad_norm": 0.6251209378242493, + "learning_rate": 4.1594092091104594e-05, + "loss": 0.7134, + "step": 9122 + }, + { + "epoch": 1.6274194987066273, + "grad_norm": 0.49562308192253113, + "learning_rate": 4.155540257261681e-05, + "loss": 0.5082, + "step": 9123 + }, + { + "epoch": 1.6275978949246275, + "grad_norm": 0.6067000031471252, + "learning_rate": 4.1516729425277924e-05, + "loss": 0.6371, + "step": 9124 + }, + { + "epoch": 1.6277762911426277, + "grad_norm": 0.46219685673713684, + "learning_rate": 4.14780726521255e-05, + "loss": 0.4726, + "step": 9125 + }, + { + "epoch": 1.6279546873606279, + "grad_norm": 0.5683661103248596, + "learning_rate": 4.143943225619548e-05, + "loss": 0.6561, + "step": 9126 + }, + { + "epoch": 1.628133083578628, + "grad_norm": 0.5201235413551331, + "learning_rate": 4.140080824052264e-05, + "loss": 0.6107, + "step": 9127 + }, + { + "epoch": 1.6283114797966283, + "grad_norm": 0.5444372296333313, + "learning_rate": 4.1362200608140635e-05, + "loss": 0.6235, + "step": 9128 + }, + { + "epoch": 1.6284898760146285, + "grad_norm": 0.5139498114585876, + "learning_rate": 4.132360936208154e-05, + "loss": 0.4917, + "step": 9129 + }, + { + "epoch": 1.6286682722326287, + "grad_norm": 0.43743810057640076, + "learning_rate": 4.1285034505376436e-05, + "loss": 0.4096, + "step": 9130 + }, + { + "epoch": 1.6288466684506289, + "grad_norm": 0.5461446046829224, + "learning_rate": 4.124647604105483e-05, + "loss": 0.6073, + "step": 9131 + }, + { + "epoch": 1.629025064668629, + "grad_norm": 0.5800282955169678, + "learning_rate": 4.120793397214523e-05, + "loss": 0.6349, + "step": 9132 + }, + { + "epoch": 1.6292034608866293, + "grad_norm": 0.6056262850761414, + "learning_rate": 4.1169408301674566e-05, + "loss": 0.5903, + "step": 9133 + }, + { + "epoch": 1.6293818571046295, + "grad_norm": 0.47324472665786743, + "learning_rate": 4.113089903266879e-05, + "loss": 0.4385, + "step": 9134 + }, + { + "epoch": 1.6295602533226297, + "grad_norm": 0.5741338133811951, + "learning_rate": 4.109240616815227e-05, + "loss": 0.6246, + "step": 9135 + }, + { + "epoch": 1.6297386495406299, + "grad_norm": 0.5018966794013977, + "learning_rate": 4.105392971114824e-05, + "loss": 0.5335, + "step": 9136 + }, + { + "epoch": 1.62991704575863, + "grad_norm": 0.5375863909721375, + "learning_rate": 4.101546966467873e-05, + "loss": 0.6031, + "step": 9137 + }, + { + "epoch": 1.6300954419766303, + "grad_norm": 0.5858981013298035, + "learning_rate": 4.0977026031764286e-05, + "loss": 0.4568, + "step": 9138 + }, + { + "epoch": 1.6302738381946302, + "grad_norm": 0.43643826246261597, + "learning_rate": 4.093859881542422e-05, + "loss": 0.3848, + "step": 9139 + }, + { + "epoch": 1.6304522344126304, + "grad_norm": 0.5104368329048157, + "learning_rate": 4.09001880186767e-05, + "loss": 0.5939, + "step": 9140 + }, + { + "epoch": 1.6306306306306306, + "grad_norm": 0.5159505605697632, + "learning_rate": 4.0861793644538374e-05, + "loss": 0.55, + "step": 9141 + }, + { + "epoch": 1.6308090268486308, + "grad_norm": 0.5293989777565002, + "learning_rate": 4.082341569602482e-05, + "loss": 0.5429, + "step": 9142 + }, + { + "epoch": 1.630987423066631, + "grad_norm": 0.42517781257629395, + "learning_rate": 4.0785054176150135e-05, + "loss": 0.4012, + "step": 9143 + }, + { + "epoch": 1.631165819284631, + "grad_norm": 0.5382733941078186, + "learning_rate": 4.07467090879273e-05, + "loss": 0.5455, + "step": 9144 + }, + { + "epoch": 1.6313442155026312, + "grad_norm": 0.503835141658783, + "learning_rate": 4.0708380434367864e-05, + "loss": 0.5749, + "step": 9145 + }, + { + "epoch": 1.6315226117206314, + "grad_norm": 0.44363367557525635, + "learning_rate": 4.067006821848218e-05, + "loss": 0.4532, + "step": 9146 + }, + { + "epoch": 1.6317010079386316, + "grad_norm": 0.4841151833534241, + "learning_rate": 4.063177244327929e-05, + "loss": 0.4812, + "step": 9147 + }, + { + "epoch": 1.6318794041566318, + "grad_norm": 0.4346253573894501, + "learning_rate": 4.059349311176683e-05, + "loss": 0.4208, + "step": 9148 + }, + { + "epoch": 1.632057800374632, + "grad_norm": 0.46320968866348267, + "learning_rate": 4.055523022695135e-05, + "loss": 0.5005, + "step": 9149 + }, + { + "epoch": 1.6322361965926322, + "grad_norm": 0.4925728440284729, + "learning_rate": 4.051698379183791e-05, + "loss": 0.5086, + "step": 9150 + }, + { + "epoch": 1.6324145928106324, + "grad_norm": 0.4870889484882355, + "learning_rate": 4.047875380943039e-05, + "loss": 0.4466, + "step": 9151 + }, + { + "epoch": 1.6325929890286326, + "grad_norm": 0.4800761342048645, + "learning_rate": 4.0440540282731476e-05, + "loss": 0.4516, + "step": 9152 + }, + { + "epoch": 1.6327713852466328, + "grad_norm": 0.5413798689842224, + "learning_rate": 4.040234321474226e-05, + "loss": 0.455, + "step": 9153 + }, + { + "epoch": 1.632949781464633, + "grad_norm": 0.4921496510505676, + "learning_rate": 4.0364162608462904e-05, + "loss": 0.4303, + "step": 9154 + }, + { + "epoch": 1.6331281776826332, + "grad_norm": 0.5225081443786621, + "learning_rate": 4.0325998466891914e-05, + "loss": 0.4553, + "step": 9155 + }, + { + "epoch": 1.6333065739006334, + "grad_norm": 0.4533441364765167, + "learning_rate": 4.0287850793026825e-05, + "loss": 0.4079, + "step": 9156 + }, + { + "epoch": 1.6334849701186336, + "grad_norm": 0.521223247051239, + "learning_rate": 4.024971958986365e-05, + "loss": 0.5097, + "step": 9157 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 0.4792344272136688, + "learning_rate": 4.0211604860397295e-05, + "loss": 0.4567, + "step": 9158 + }, + { + "epoch": 1.633841762554634, + "grad_norm": 0.5261356830596924, + "learning_rate": 4.0173506607621227e-05, + "loss": 0.5597, + "step": 9159 + }, + { + "epoch": 1.6340201587726342, + "grad_norm": 0.5096288919448853, + "learning_rate": 4.013542483452759e-05, + "loss": 0.3727, + "step": 9160 + }, + { + "epoch": 1.6341985549906342, + "grad_norm": 0.5453589558601379, + "learning_rate": 4.0097359544107424e-05, + "loss": 0.5976, + "step": 9161 + }, + { + "epoch": 1.6343769512086344, + "grad_norm": 0.5257166028022766, + "learning_rate": 4.005931073935024e-05, + "loss": 0.4802, + "step": 9162 + }, + { + "epoch": 1.6345553474266346, + "grad_norm": 0.4476458728313446, + "learning_rate": 4.002127842324452e-05, + "loss": 0.4914, + "step": 9163 + }, + { + "epoch": 1.6347337436446348, + "grad_norm": 0.4358140230178833, + "learning_rate": 3.998326259877716e-05, + "loss": 0.3802, + "step": 9164 + }, + { + "epoch": 1.634912139862635, + "grad_norm": 0.4720216989517212, + "learning_rate": 3.994526326893405e-05, + "loss": 0.4643, + "step": 9165 + }, + { + "epoch": 1.635090536080635, + "grad_norm": 0.5170959234237671, + "learning_rate": 3.990728043669953e-05, + "loss": 0.6022, + "step": 9166 + }, + { + "epoch": 1.6352689322986351, + "grad_norm": 0.49744245409965515, + "learning_rate": 3.986931410505676e-05, + "loss": 0.4991, + "step": 9167 + }, + { + "epoch": 1.6354473285166353, + "grad_norm": 0.5377562642097473, + "learning_rate": 3.9831364276987717e-05, + "loss": 0.5476, + "step": 9168 + }, + { + "epoch": 1.6356257247346355, + "grad_norm": 0.5449107885360718, + "learning_rate": 3.97934309554728e-05, + "loss": 0.5108, + "step": 9169 + }, + { + "epoch": 1.6358041209526357, + "grad_norm": 0.49150487780570984, + "learning_rate": 3.9755514143491434e-05, + "loss": 0.3777, + "step": 9170 + }, + { + "epoch": 1.635982517170636, + "grad_norm": 0.5345403552055359, + "learning_rate": 3.971761384402145e-05, + "loss": 0.5278, + "step": 9171 + }, + { + "epoch": 1.6361609133886361, + "grad_norm": 0.5451551079750061, + "learning_rate": 3.9679730060039634e-05, + "loss": 0.4898, + "step": 9172 + }, + { + "epoch": 1.6363393096066363, + "grad_norm": 0.5874769687652588, + "learning_rate": 3.964186279452131e-05, + "loss": 0.6324, + "step": 9173 + }, + { + "epoch": 1.6365177058246365, + "grad_norm": 0.47333958745002747, + "learning_rate": 3.960401205044051e-05, + "loss": 0.4932, + "step": 9174 + }, + { + "epoch": 1.6366961020426367, + "grad_norm": 0.4722568094730377, + "learning_rate": 3.95661778307701e-05, + "loss": 0.3842, + "step": 9175 + }, + { + "epoch": 1.636874498260637, + "grad_norm": 0.55815190076828, + "learning_rate": 3.952836013848149e-05, + "loss": 0.668, + "step": 9176 + }, + { + "epoch": 1.637052894478637, + "grad_norm": 0.5843459963798523, + "learning_rate": 3.9490558976544965e-05, + "loss": 0.6534, + "step": 9177 + }, + { + "epoch": 1.6372312906966373, + "grad_norm": 0.44996100664138794, + "learning_rate": 3.9452774347929264e-05, + "loss": 0.3837, + "step": 9178 + }, + { + "epoch": 1.6374096869146375, + "grad_norm": 0.46350741386413574, + "learning_rate": 3.9415006255602123e-05, + "loss": 0.4841, + "step": 9179 + }, + { + "epoch": 1.6375880831326377, + "grad_norm": 0.5201796889305115, + "learning_rate": 3.9377254702529784e-05, + "loss": 0.5497, + "step": 9180 + }, + { + "epoch": 1.637766479350638, + "grad_norm": 0.5969486236572266, + "learning_rate": 3.933951969167709e-05, + "loss": 0.6576, + "step": 9181 + }, + { + "epoch": 1.637944875568638, + "grad_norm": 0.5387462377548218, + "learning_rate": 3.9301801226008014e-05, + "loss": 0.7601, + "step": 9182 + }, + { + "epoch": 1.638123271786638, + "grad_norm": 0.6476037502288818, + "learning_rate": 3.926409930848471e-05, + "loss": 0.7146, + "step": 9183 + }, + { + "epoch": 1.6383016680046383, + "grad_norm": 0.48097339272499084, + "learning_rate": 3.922641394206844e-05, + "loss": 0.4778, + "step": 9184 + }, + { + "epoch": 1.6384800642226385, + "grad_norm": 0.46164533495903015, + "learning_rate": 3.91887451297189e-05, + "loss": 0.3921, + "step": 9185 + }, + { + "epoch": 1.6386584604406387, + "grad_norm": 0.4835575520992279, + "learning_rate": 3.915109287439453e-05, + "loss": 0.4748, + "step": 9186 + }, + { + "epoch": 1.6388368566586389, + "grad_norm": 0.48624852299690247, + "learning_rate": 3.911345717905268e-05, + "loss": 0.4619, + "step": 9187 + }, + { + "epoch": 1.6390152528766389, + "grad_norm": 0.5269492864608765, + "learning_rate": 3.907583804664908e-05, + "loss": 0.4905, + "step": 9188 + }, + { + "epoch": 1.639193649094639, + "grad_norm": 0.538076639175415, + "learning_rate": 3.9038235480138435e-05, + "loss": 0.6448, + "step": 9189 + }, + { + "epoch": 1.6393720453126392, + "grad_norm": 0.5943297147750854, + "learning_rate": 3.9000649482473946e-05, + "loss": 0.6616, + "step": 9190 + }, + { + "epoch": 1.6395504415306394, + "grad_norm": 0.509810745716095, + "learning_rate": 3.8963080056607705e-05, + "loss": 0.5005, + "step": 9191 + }, + { + "epoch": 1.6397288377486396, + "grad_norm": 0.5434547662734985, + "learning_rate": 3.892552720549028e-05, + "loss": 0.5415, + "step": 9192 + }, + { + "epoch": 1.6399072339666398, + "grad_norm": 0.47341305017471313, + "learning_rate": 3.8887990932071156e-05, + "loss": 0.4871, + "step": 9193 + }, + { + "epoch": 1.64008563018464, + "grad_norm": 0.5444488525390625, + "learning_rate": 3.885047123929841e-05, + "loss": 0.6654, + "step": 9194 + }, + { + "epoch": 1.6402640264026402, + "grad_norm": 0.49865496158599854, + "learning_rate": 3.8812968130118696e-05, + "loss": 0.5161, + "step": 9195 + }, + { + "epoch": 1.6404424226206404, + "grad_norm": 0.6822000741958618, + "learning_rate": 3.877548160747768e-05, + "loss": 0.7265, + "step": 9196 + }, + { + "epoch": 1.6406208188386406, + "grad_norm": 0.5185505151748657, + "learning_rate": 3.873801167431928e-05, + "loss": 0.4423, + "step": 9197 + }, + { + "epoch": 1.6407992150566408, + "grad_norm": 0.5268929600715637, + "learning_rate": 3.8700558333586686e-05, + "loss": 0.563, + "step": 9198 + }, + { + "epoch": 1.640977611274641, + "grad_norm": 0.5534122586250305, + "learning_rate": 3.86631215882213e-05, + "loss": 0.6052, + "step": 9199 + }, + { + "epoch": 1.6411560074926412, + "grad_norm": 0.4893801212310791, + "learning_rate": 3.862570144116334e-05, + "loss": 0.5551, + "step": 9200 + }, + { + "epoch": 1.6413344037106414, + "grad_norm": 0.5151475667953491, + "learning_rate": 3.858829789535187e-05, + "loss": 0.5263, + "step": 9201 + }, + { + "epoch": 1.6415127999286416, + "grad_norm": 0.5644826292991638, + "learning_rate": 3.8550910953724456e-05, + "loss": 0.5625, + "step": 9202 + }, + { + "epoch": 1.6416911961466418, + "grad_norm": 0.5256550312042236, + "learning_rate": 3.851354061921758e-05, + "loss": 0.6462, + "step": 9203 + }, + { + "epoch": 1.641869592364642, + "grad_norm": 0.47919219732284546, + "learning_rate": 3.847618689476612e-05, + "loss": 0.4722, + "step": 9204 + }, + { + "epoch": 1.642047988582642, + "grad_norm": 0.5013111233711243, + "learning_rate": 3.8438849783304e-05, + "loss": 0.5176, + "step": 9205 + }, + { + "epoch": 1.6422263848006422, + "grad_norm": 0.5357193350791931, + "learning_rate": 3.840152928776358e-05, + "loss": 0.6217, + "step": 9206 + }, + { + "epoch": 1.6424047810186424, + "grad_norm": 0.5302641987800598, + "learning_rate": 3.836422541107593e-05, + "loss": 0.5448, + "step": 9207 + }, + { + "epoch": 1.6425831772366426, + "grad_norm": 0.5999414324760437, + "learning_rate": 3.832693815617097e-05, + "loss": 0.5155, + "step": 9208 + }, + { + "epoch": 1.6427615734546428, + "grad_norm": 0.5149564146995544, + "learning_rate": 3.828966752597718e-05, + "loss": 0.5502, + "step": 9209 + }, + { + "epoch": 1.6429399696726428, + "grad_norm": 0.46984735131263733, + "learning_rate": 3.8252413523421816e-05, + "loss": 0.3823, + "step": 9210 + }, + { + "epoch": 1.643118365890643, + "grad_norm": 0.4093347191810608, + "learning_rate": 3.821517615143075e-05, + "loss": 0.3818, + "step": 9211 + }, + { + "epoch": 1.6432967621086432, + "grad_norm": 0.5009578466415405, + "learning_rate": 3.817795541292859e-05, + "loss": 0.5433, + "step": 9212 + }, + { + "epoch": 1.6434751583266434, + "grad_norm": 0.5088503360748291, + "learning_rate": 3.8140751310838715e-05, + "loss": 0.4853, + "step": 9213 + }, + { + "epoch": 1.6436535545446436, + "grad_norm": 0.42338132858276367, + "learning_rate": 3.810356384808303e-05, + "loss": 0.3726, + "step": 9214 + }, + { + "epoch": 1.6438319507626438, + "grad_norm": 0.5196266770362854, + "learning_rate": 3.806639302758227e-05, + "loss": 0.4542, + "step": 9215 + }, + { + "epoch": 1.644010346980644, + "grad_norm": 0.5105109214782715, + "learning_rate": 3.802923885225576e-05, + "loss": 0.4412, + "step": 9216 + }, + { + "epoch": 1.6441887431986442, + "grad_norm": 0.5296480655670166, + "learning_rate": 3.7992101325021674e-05, + "loss": 0.5869, + "step": 9217 + }, + { + "epoch": 1.6443671394166444, + "grad_norm": 0.5003146529197693, + "learning_rate": 3.7954980448796724e-05, + "loss": 0.4731, + "step": 9218 + }, + { + "epoch": 1.6445455356346446, + "grad_norm": 0.5123296976089478, + "learning_rate": 3.7917876226496284e-05, + "loss": 0.5295, + "step": 9219 + }, + { + "epoch": 1.6447239318526448, + "grad_norm": 0.45916229486465454, + "learning_rate": 3.788078866103467e-05, + "loss": 0.4446, + "step": 9220 + }, + { + "epoch": 1.644902328070645, + "grad_norm": 0.5705180168151855, + "learning_rate": 3.7843717755324525e-05, + "loss": 0.5378, + "step": 9221 + }, + { + "epoch": 1.6450807242886452, + "grad_norm": 0.5570235848426819, + "learning_rate": 3.78066635122776e-05, + "loss": 0.5277, + "step": 9222 + }, + { + "epoch": 1.6452591205066454, + "grad_norm": 0.5336431860923767, + "learning_rate": 3.7769625934803904e-05, + "loss": 0.5871, + "step": 9223 + }, + { + "epoch": 1.6454375167246456, + "grad_norm": 0.4929081201553345, + "learning_rate": 3.773260502581255e-05, + "loss": 0.5053, + "step": 9224 + }, + { + "epoch": 1.6456159129426458, + "grad_norm": 0.4710312783718109, + "learning_rate": 3.7695600788210967e-05, + "loss": 0.5122, + "step": 9225 + }, + { + "epoch": 1.645794309160646, + "grad_norm": 0.44707542657852173, + "learning_rate": 3.7658613224905606e-05, + "loss": 0.3666, + "step": 9226 + }, + { + "epoch": 1.645972705378646, + "grad_norm": 0.6010311245918274, + "learning_rate": 3.7621642338801335e-05, + "loss": 0.5016, + "step": 9227 + }, + { + "epoch": 1.6461511015966461, + "grad_norm": 0.5563137531280518, + "learning_rate": 3.758468813280186e-05, + "loss": 0.5181, + "step": 9228 + }, + { + "epoch": 1.6463294978146463, + "grad_norm": 0.4832812547683716, + "learning_rate": 3.754775060980964e-05, + "loss": 0.5077, + "step": 9229 + }, + { + "epoch": 1.6465078940326465, + "grad_norm": 0.5060386657714844, + "learning_rate": 3.75108297727256e-05, + "loss": 0.5059, + "step": 9230 + }, + { + "epoch": 1.6466862902506467, + "grad_norm": 0.4577668309211731, + "learning_rate": 3.7473925624449625e-05, + "loss": 0.4391, + "step": 9231 + }, + { + "epoch": 1.6468646864686467, + "grad_norm": 0.5205204486846924, + "learning_rate": 3.743703816788005e-05, + "loss": 0.614, + "step": 9232 + }, + { + "epoch": 1.647043082686647, + "grad_norm": 0.550428569316864, + "learning_rate": 3.740016740591398e-05, + "loss": 0.558, + "step": 9233 + }, + { + "epoch": 1.647221478904647, + "grad_norm": 0.5250325798988342, + "learning_rate": 3.736331334144733e-05, + "loss": 0.4774, + "step": 9234 + }, + { + "epoch": 1.6473998751226473, + "grad_norm": 0.5304805040359497, + "learning_rate": 3.7326475977374486e-05, + "loss": 0.5603, + "step": 9235 + }, + { + "epoch": 1.6475782713406475, + "grad_norm": 0.5167137384414673, + "learning_rate": 3.728965531658876e-05, + "loss": 0.5417, + "step": 9236 + }, + { + "epoch": 1.6477566675586477, + "grad_norm": 0.4665099084377289, + "learning_rate": 3.725285136198189e-05, + "loss": 0.5014, + "step": 9237 + }, + { + "epoch": 1.647935063776648, + "grad_norm": 0.5300410985946655, + "learning_rate": 3.72160641164446e-05, + "loss": 0.5706, + "step": 9238 + }, + { + "epoch": 1.648113459994648, + "grad_norm": 0.5009768009185791, + "learning_rate": 3.7179293582866064e-05, + "loss": 0.4469, + "step": 9239 + }, + { + "epoch": 1.6482918562126483, + "grad_norm": 0.4268481433391571, + "learning_rate": 3.714253976413418e-05, + "loss": 0.3472, + "step": 9240 + }, + { + "epoch": 1.6484702524306485, + "grad_norm": 0.5363745093345642, + "learning_rate": 3.710580266313565e-05, + "loss": 0.6022, + "step": 9241 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.5669705271720886, + "learning_rate": 3.706908228275571e-05, + "loss": 0.6687, + "step": 9242 + }, + { + "epoch": 1.6488270448666489, + "grad_norm": 0.5033018589019775, + "learning_rate": 3.703237862587844e-05, + "loss": 0.4968, + "step": 9243 + }, + { + "epoch": 1.649005441084649, + "grad_norm": 0.491277813911438, + "learning_rate": 3.699569169538655e-05, + "loss": 0.4729, + "step": 9244 + }, + { + "epoch": 1.6491838373026493, + "grad_norm": 0.43952322006225586, + "learning_rate": 3.695902149416133e-05, + "loss": 0.3538, + "step": 9245 + }, + { + "epoch": 1.6493622335206495, + "grad_norm": 0.6005352735519409, + "learning_rate": 3.692236802508292e-05, + "loss": 0.6145, + "step": 9246 + }, + { + "epoch": 1.6495406297386497, + "grad_norm": 0.566465437412262, + "learning_rate": 3.688573129102999e-05, + "loss": 0.5943, + "step": 9247 + }, + { + "epoch": 1.6497190259566499, + "grad_norm": 0.4471827745437622, + "learning_rate": 3.6849111294880056e-05, + "loss": 0.4295, + "step": 9248 + }, + { + "epoch": 1.6498974221746499, + "grad_norm": 0.4851609766483307, + "learning_rate": 3.681250803950914e-05, + "loss": 0.4421, + "step": 9249 + }, + { + "epoch": 1.65007581839265, + "grad_norm": 0.5344951152801514, + "learning_rate": 3.6775921527792164e-05, + "loss": 0.483, + "step": 9250 + }, + { + "epoch": 1.6502542146106502, + "grad_norm": 0.5168811082839966, + "learning_rate": 3.673935176260249e-05, + "loss": 0.4603, + "step": 9251 + }, + { + "epoch": 1.6504326108286504, + "grad_norm": 0.5323466062545776, + "learning_rate": 3.67027987468124e-05, + "loss": 0.5537, + "step": 9252 + }, + { + "epoch": 1.6506110070466506, + "grad_norm": 0.46962469816207886, + "learning_rate": 3.6666262483292715e-05, + "loss": 0.3963, + "step": 9253 + }, + { + "epoch": 1.6507894032646506, + "grad_norm": 0.5902261734008789, + "learning_rate": 3.662974297491292e-05, + "loss": 0.6236, + "step": 9254 + }, + { + "epoch": 1.6509677994826508, + "grad_norm": 0.5006431937217712, + "learning_rate": 3.6593240224541357e-05, + "loss": 0.4853, + "step": 9255 + }, + { + "epoch": 1.651146195700651, + "grad_norm": 0.6096543669700623, + "learning_rate": 3.6556754235044815e-05, + "loss": 0.7908, + "step": 9256 + }, + { + "epoch": 1.6513245919186512, + "grad_norm": 0.4698275625705719, + "learning_rate": 3.6520285009289e-05, + "loss": 0.4004, + "step": 9257 + }, + { + "epoch": 1.6515029881366514, + "grad_norm": 0.49396297335624695, + "learning_rate": 3.648383255013804e-05, + "loss": 0.5341, + "step": 9258 + }, + { + "epoch": 1.6516813843546516, + "grad_norm": 0.49175316095352173, + "learning_rate": 3.644739686045503e-05, + "loss": 0.4549, + "step": 9259 + }, + { + "epoch": 1.6518597805726518, + "grad_norm": 0.533931314945221, + "learning_rate": 3.6410977943101606e-05, + "loss": 0.5214, + "step": 9260 + }, + { + "epoch": 1.652038176790652, + "grad_norm": 0.5131736397743225, + "learning_rate": 3.6374575800938004e-05, + "loss": 0.546, + "step": 9261 + }, + { + "epoch": 1.6522165730086522, + "grad_norm": 0.4640941321849823, + "learning_rate": 3.633819043682338e-05, + "loss": 0.4511, + "step": 9262 + }, + { + "epoch": 1.6523949692266524, + "grad_norm": 0.506545901298523, + "learning_rate": 3.6301821853615216e-05, + "loss": 0.4355, + "step": 9263 + }, + { + "epoch": 1.6525733654446526, + "grad_norm": 0.5474511981010437, + "learning_rate": 3.6265470054170107e-05, + "loss": 0.5829, + "step": 9264 + }, + { + "epoch": 1.6527517616626528, + "grad_norm": 0.5812618732452393, + "learning_rate": 3.622913504134298e-05, + "loss": 0.5626, + "step": 9265 + }, + { + "epoch": 1.652930157880653, + "grad_norm": 0.5066866874694824, + "learning_rate": 3.619281681798756e-05, + "loss": 0.5741, + "step": 9266 + }, + { + "epoch": 1.6531085540986532, + "grad_norm": 0.48909926414489746, + "learning_rate": 3.615651538695633e-05, + "loss": 0.4439, + "step": 9267 + }, + { + "epoch": 1.6532869503166534, + "grad_norm": 0.5382171869277954, + "learning_rate": 3.6120230751100295e-05, + "loss": 0.5164, + "step": 9268 + }, + { + "epoch": 1.6534653465346536, + "grad_norm": 0.5178452730178833, + "learning_rate": 3.608396291326938e-05, + "loss": 0.5316, + "step": 9269 + }, + { + "epoch": 1.6536437427526538, + "grad_norm": 0.5068187713623047, + "learning_rate": 3.6047711876311895e-05, + "loss": 0.4757, + "step": 9270 + }, + { + "epoch": 1.6538221389706538, + "grad_norm": 0.5174327492713928, + "learning_rate": 3.601147764307511e-05, + "loss": 0.4613, + "step": 9271 + }, + { + "epoch": 1.654000535188654, + "grad_norm": 0.4983610212802887, + "learning_rate": 3.597526021640471e-05, + "loss": 0.4841, + "step": 9272 + }, + { + "epoch": 1.6541789314066542, + "grad_norm": 0.5520349740982056, + "learning_rate": 3.593905959914528e-05, + "loss": 0.6076, + "step": 9273 + }, + { + "epoch": 1.6543573276246544, + "grad_norm": 0.5307412147521973, + "learning_rate": 3.590287579414006e-05, + "loss": 0.5639, + "step": 9274 + }, + { + "epoch": 1.6545357238426546, + "grad_norm": 0.5179523825645447, + "learning_rate": 3.586670880423079e-05, + "loss": 0.5217, + "step": 9275 + }, + { + "epoch": 1.6547141200606548, + "grad_norm": 0.5160864591598511, + "learning_rate": 3.5830558632258095e-05, + "loss": 0.597, + "step": 9276 + }, + { + "epoch": 1.6548925162786547, + "grad_norm": 0.5735654830932617, + "learning_rate": 3.579442528106111e-05, + "loss": 0.6853, + "step": 9277 + }, + { + "epoch": 1.655070912496655, + "grad_norm": 0.5181892514228821, + "learning_rate": 3.5758308753477855e-05, + "loss": 0.4761, + "step": 9278 + }, + { + "epoch": 1.6552493087146551, + "grad_norm": 0.4839540421962738, + "learning_rate": 3.5722209052344826e-05, + "loss": 0.444, + "step": 9279 + }, + { + "epoch": 1.6554277049326553, + "grad_norm": 0.4774402678012848, + "learning_rate": 3.5686126180497214e-05, + "loss": 0.4561, + "step": 9280 + }, + { + "epoch": 1.6556061011506555, + "grad_norm": 0.5063000321388245, + "learning_rate": 3.56500601407691e-05, + "loss": 0.4304, + "step": 9281 + }, + { + "epoch": 1.6557844973686557, + "grad_norm": 0.39081576466560364, + "learning_rate": 3.561401093599295e-05, + "loss": 0.318, + "step": 9282 + }, + { + "epoch": 1.655962893586656, + "grad_norm": 0.5337099432945251, + "learning_rate": 3.557797856900022e-05, + "loss": 0.3636, + "step": 9283 + }, + { + "epoch": 1.6561412898046561, + "grad_norm": 0.5275738835334778, + "learning_rate": 3.554196304262067e-05, + "loss": 0.4887, + "step": 9284 + }, + { + "epoch": 1.6563196860226563, + "grad_norm": 0.5422784686088562, + "learning_rate": 3.5505964359683146e-05, + "loss": 0.5377, + "step": 9285 + }, + { + "epoch": 1.6564980822406565, + "grad_norm": 0.48426032066345215, + "learning_rate": 3.546998252301487e-05, + "loss": 0.4597, + "step": 9286 + }, + { + "epoch": 1.6566764784586567, + "grad_norm": 0.5601605176925659, + "learning_rate": 3.543401753544179e-05, + "loss": 0.6027, + "step": 9287 + }, + { + "epoch": 1.656854874676657, + "grad_norm": 0.4989088773727417, + "learning_rate": 3.539806939978868e-05, + "loss": 0.4151, + "step": 9288 + }, + { + "epoch": 1.6570332708946571, + "grad_norm": 0.5130565762519836, + "learning_rate": 3.536213811887876e-05, + "loss": 0.4985, + "step": 9289 + }, + { + "epoch": 1.6572116671126573, + "grad_norm": 0.4882428050041199, + "learning_rate": 3.532622369553423e-05, + "loss": 0.5041, + "step": 9290 + }, + { + "epoch": 1.6573900633306575, + "grad_norm": 0.5201172828674316, + "learning_rate": 3.529032613257574e-05, + "loss": 0.5467, + "step": 9291 + }, + { + "epoch": 1.6575684595486577, + "grad_norm": 0.5011335015296936, + "learning_rate": 3.525444543282255e-05, + "loss": 0.4953, + "step": 9292 + }, + { + "epoch": 1.6577468557666577, + "grad_norm": 0.4961317777633667, + "learning_rate": 3.521858159909289e-05, + "loss": 0.4933, + "step": 9293 + }, + { + "epoch": 1.657925251984658, + "grad_norm": 0.5875723958015442, + "learning_rate": 3.518273463420332e-05, + "loss": 0.5181, + "step": 9294 + }, + { + "epoch": 1.658103648202658, + "grad_norm": 0.4927757680416107, + "learning_rate": 3.5146904540969414e-05, + "loss": 0.3826, + "step": 9295 + }, + { + "epoch": 1.6582820444206583, + "grad_norm": 0.5020849108695984, + "learning_rate": 3.511109132220508e-05, + "loss": 0.4191, + "step": 9296 + }, + { + "epoch": 1.6584604406386585, + "grad_norm": 0.44826123118400574, + "learning_rate": 3.507529498072323e-05, + "loss": 0.3812, + "step": 9297 + }, + { + "epoch": 1.6586388368566587, + "grad_norm": 0.5366451144218445, + "learning_rate": 3.5039515519335236e-05, + "loss": 0.4824, + "step": 9298 + }, + { + "epoch": 1.6588172330746587, + "grad_norm": 0.5516796112060547, + "learning_rate": 3.500375294085112e-05, + "loss": 0.5474, + "step": 9299 + }, + { + "epoch": 1.6589956292926589, + "grad_norm": 0.5130575895309448, + "learning_rate": 3.4968007248079776e-05, + "loss": 0.4346, + "step": 9300 + }, + { + "epoch": 1.659174025510659, + "grad_norm": 0.6171131134033203, + "learning_rate": 3.493227844382857e-05, + "loss": 0.6627, + "step": 9301 + }, + { + "epoch": 1.6593524217286593, + "grad_norm": 0.5802091956138611, + "learning_rate": 3.48965665309037e-05, + "loss": 0.5415, + "step": 9302 + }, + { + "epoch": 1.6595308179466595, + "grad_norm": 0.5305059552192688, + "learning_rate": 3.48608715121099e-05, + "loss": 0.5481, + "step": 9303 + }, + { + "epoch": 1.6597092141646597, + "grad_norm": 0.5003054738044739, + "learning_rate": 3.4825193390250645e-05, + "loss": 0.511, + "step": 9304 + }, + { + "epoch": 1.6598876103826599, + "grad_norm": 0.5035673379898071, + "learning_rate": 3.478953216812816e-05, + "loss": 0.5759, + "step": 9305 + }, + { + "epoch": 1.66006600660066, + "grad_norm": 0.5683543682098389, + "learning_rate": 3.4753887848543163e-05, + "loss": 0.602, + "step": 9306 + }, + { + "epoch": 1.6602444028186603, + "grad_norm": 0.4327681362628937, + "learning_rate": 3.471826043429524e-05, + "loss": 0.3409, + "step": 9307 + }, + { + "epoch": 1.6604227990366605, + "grad_norm": 0.5470651388168335, + "learning_rate": 3.468264992818246e-05, + "loss": 0.5559, + "step": 9308 + }, + { + "epoch": 1.6606011952546607, + "grad_norm": 0.49540919065475464, + "learning_rate": 3.464705633300172e-05, + "loss": 0.4914, + "step": 9309 + }, + { + "epoch": 1.6607795914726609, + "grad_norm": 0.45050835609436035, + "learning_rate": 3.461147965154846e-05, + "loss": 0.4139, + "step": 9310 + }, + { + "epoch": 1.660957987690661, + "grad_norm": 0.5284720659255981, + "learning_rate": 3.457591988661696e-05, + "loss": 0.4919, + "step": 9311 + }, + { + "epoch": 1.6611363839086613, + "grad_norm": 0.4902994930744171, + "learning_rate": 3.4540377040999995e-05, + "loss": 0.4287, + "step": 9312 + }, + { + "epoch": 1.6613147801266615, + "grad_norm": 0.4877963364124298, + "learning_rate": 3.450485111748905e-05, + "loss": 0.4539, + "step": 9313 + }, + { + "epoch": 1.6614931763446616, + "grad_norm": 0.5532664656639099, + "learning_rate": 3.446934211887443e-05, + "loss": 0.4228, + "step": 9314 + }, + { + "epoch": 1.6616715725626616, + "grad_norm": 0.5521690249443054, + "learning_rate": 3.443385004794486e-05, + "loss": 0.5662, + "step": 9315 + }, + { + "epoch": 1.6618499687806618, + "grad_norm": 0.6033298969268799, + "learning_rate": 3.439837490748798e-05, + "loss": 0.6267, + "step": 9316 + }, + { + "epoch": 1.662028364998662, + "grad_norm": 0.5335450172424316, + "learning_rate": 3.436291670028993e-05, + "loss": 0.5272, + "step": 9317 + }, + { + "epoch": 1.6622067612166622, + "grad_norm": 0.5540075302124023, + "learning_rate": 3.432747542913564e-05, + "loss": 0.4624, + "step": 9318 + }, + { + "epoch": 1.6623851574346624, + "grad_norm": 0.5262585282325745, + "learning_rate": 3.429205109680858e-05, + "loss": 0.526, + "step": 9319 + }, + { + "epoch": 1.6625635536526626, + "grad_norm": 0.5248980522155762, + "learning_rate": 3.425664370609099e-05, + "loss": 0.5116, + "step": 9320 + }, + { + "epoch": 1.6627419498706626, + "grad_norm": 0.547280490398407, + "learning_rate": 3.422125325976383e-05, + "loss": 0.561, + "step": 9321 + }, + { + "epoch": 1.6629203460886628, + "grad_norm": 0.4935222268104553, + "learning_rate": 3.418587976060653e-05, + "loss": 0.4185, + "step": 9322 + }, + { + "epoch": 1.663098742306663, + "grad_norm": 0.5188893675804138, + "learning_rate": 3.415052321139739e-05, + "loss": 0.5019, + "step": 9323 + }, + { + "epoch": 1.6632771385246632, + "grad_norm": 0.4804520905017853, + "learning_rate": 3.411518361491328e-05, + "loss": 0.4503, + "step": 9324 + }, + { + "epoch": 1.6634555347426634, + "grad_norm": 0.5874333381652832, + "learning_rate": 3.407986097392971e-05, + "loss": 0.5774, + "step": 9325 + }, + { + "epoch": 1.6636339309606636, + "grad_norm": 0.49579691886901855, + "learning_rate": 3.404455529122097e-05, + "loss": 0.4188, + "step": 9326 + }, + { + "epoch": 1.6638123271786638, + "grad_norm": 0.45815184712409973, + "learning_rate": 3.400926656955988e-05, + "loss": 0.4121, + "step": 9327 + }, + { + "epoch": 1.663990723396664, + "grad_norm": 0.4780590534210205, + "learning_rate": 3.397399481171812e-05, + "loss": 0.4392, + "step": 9328 + }, + { + "epoch": 1.6641691196146642, + "grad_norm": 0.4702582359313965, + "learning_rate": 3.393874002046576e-05, + "loss": 0.4654, + "step": 9329 + }, + { + "epoch": 1.6643475158326644, + "grad_norm": 0.4759620428085327, + "learning_rate": 3.3903502198571855e-05, + "loss": 0.4035, + "step": 9330 + }, + { + "epoch": 1.6645259120506646, + "grad_norm": 0.5115009546279907, + "learning_rate": 3.386828134880382e-05, + "loss": 0.5284, + "step": 9331 + }, + { + "epoch": 1.6647043082686648, + "grad_norm": 0.5846856832504272, + "learning_rate": 3.383307747392802e-05, + "loss": 0.6325, + "step": 9332 + }, + { + "epoch": 1.664882704486665, + "grad_norm": 0.6280512809753418, + "learning_rate": 3.379789057670929e-05, + "loss": 0.5222, + "step": 9333 + }, + { + "epoch": 1.6650611007046652, + "grad_norm": 0.5682896375656128, + "learning_rate": 3.376272065991115e-05, + "loss": 0.5462, + "step": 9334 + }, + { + "epoch": 1.6652394969226654, + "grad_norm": 0.5903425216674805, + "learning_rate": 3.372756772629587e-05, + "loss": 0.8133, + "step": 9335 + }, + { + "epoch": 1.6654178931406656, + "grad_norm": 0.5490643978118896, + "learning_rate": 3.369243177862436e-05, + "loss": 0.6145, + "step": 9336 + }, + { + "epoch": 1.6655962893586655, + "grad_norm": 0.5370810031890869, + "learning_rate": 3.3657312819656226e-05, + "loss": 0.5671, + "step": 9337 + }, + { + "epoch": 1.6657746855766657, + "grad_norm": 0.5224997997283936, + "learning_rate": 3.362221085214964e-05, + "loss": 0.5352, + "step": 9338 + }, + { + "epoch": 1.665953081794666, + "grad_norm": 0.5418823957443237, + "learning_rate": 3.358712587886143e-05, + "loss": 0.5853, + "step": 9339 + }, + { + "epoch": 1.6661314780126661, + "grad_norm": 0.5162733197212219, + "learning_rate": 3.3552057902547286e-05, + "loss": 0.4823, + "step": 9340 + }, + { + "epoch": 1.6663098742306663, + "grad_norm": 0.4687744081020355, + "learning_rate": 3.351700692596132e-05, + "loss": 0.4868, + "step": 9341 + }, + { + "epoch": 1.6664882704486665, + "grad_norm": 0.5440129637718201, + "learning_rate": 3.348197295185654e-05, + "loss": 0.5426, + "step": 9342 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.4801231920719147, + "learning_rate": 3.344695598298436e-05, + "loss": 0.5322, + "step": 9343 + }, + { + "epoch": 1.6668450628846667, + "grad_norm": 0.4614768624305725, + "learning_rate": 3.341195602209512e-05, + "loss": 0.3709, + "step": 9344 + }, + { + "epoch": 1.667023459102667, + "grad_norm": 0.4808447062969208, + "learning_rate": 3.3376973071937656e-05, + "loss": 0.4343, + "step": 9345 + }, + { + "epoch": 1.6672018553206671, + "grad_norm": 0.42641395330429077, + "learning_rate": 3.3342007135259425e-05, + "loss": 0.442, + "step": 9346 + }, + { + "epoch": 1.6673802515386673, + "grad_norm": 0.4673963189125061, + "learning_rate": 3.3307058214806814e-05, + "loss": 0.4625, + "step": 9347 + }, + { + "epoch": 1.6675586477566675, + "grad_norm": 0.4446796178817749, + "learning_rate": 3.327212631332452e-05, + "loss": 0.3887, + "step": 9348 + }, + { + "epoch": 1.6677370439746677, + "grad_norm": 0.42202529311180115, + "learning_rate": 3.323721143355621e-05, + "loss": 0.3426, + "step": 9349 + }, + { + "epoch": 1.667915440192668, + "grad_norm": 0.5200551748275757, + "learning_rate": 3.320231357824399e-05, + "loss": 0.5275, + "step": 9350 + }, + { + "epoch": 1.668093836410668, + "grad_norm": 0.4585854411125183, + "learning_rate": 3.3167432750128764e-05, + "loss": 0.4564, + "step": 9351 + }, + { + "epoch": 1.6682722326286683, + "grad_norm": 0.5158877968788147, + "learning_rate": 3.313256895195013e-05, + "loss": 0.4869, + "step": 9352 + }, + { + "epoch": 1.6684506288466685, + "grad_norm": 0.4725387692451477, + "learning_rate": 3.3097722186446135e-05, + "loss": 0.5002, + "step": 9353 + }, + { + "epoch": 1.6686290250646687, + "grad_norm": 0.44371485710144043, + "learning_rate": 3.306289245635374e-05, + "loss": 0.398, + "step": 9354 + }, + { + "epoch": 1.668807421282669, + "grad_norm": 0.5774705410003662, + "learning_rate": 3.3028079764408386e-05, + "loss": 0.5725, + "step": 9355 + }, + { + "epoch": 1.668985817500669, + "grad_norm": 0.5633199214935303, + "learning_rate": 3.2993284113344315e-05, + "loss": 0.5249, + "step": 9356 + }, + { + "epoch": 1.6691642137186693, + "grad_norm": 0.4772298336029053, + "learning_rate": 3.295850550589427e-05, + "loss": 0.3945, + "step": 9357 + }, + { + "epoch": 1.6693426099366695, + "grad_norm": 0.4757455587387085, + "learning_rate": 3.292374394478986e-05, + "loss": 0.4001, + "step": 9358 + }, + { + "epoch": 1.6695210061546695, + "grad_norm": 0.49446776509284973, + "learning_rate": 3.288899943276119e-05, + "loss": 0.3805, + "step": 9359 + }, + { + "epoch": 1.6696994023726697, + "grad_norm": 0.4915049970149994, + "learning_rate": 3.285427197253704e-05, + "loss": 0.4954, + "step": 9360 + }, + { + "epoch": 1.6698777985906699, + "grad_norm": 0.5779103636741638, + "learning_rate": 3.281956156684496e-05, + "loss": 0.514, + "step": 9361 + }, + { + "epoch": 1.67005619480867, + "grad_norm": 0.5662655830383301, + "learning_rate": 3.278486821841098e-05, + "loss": 0.5022, + "step": 9362 + }, + { + "epoch": 1.6702345910266703, + "grad_norm": 0.5742835998535156, + "learning_rate": 3.275019192996004e-05, + "loss": 0.6153, + "step": 9363 + }, + { + "epoch": 1.6704129872446705, + "grad_norm": 0.517959713935852, + "learning_rate": 3.271553270421551e-05, + "loss": 0.4259, + "step": 9364 + }, + { + "epoch": 1.6705913834626704, + "grad_norm": 0.5644903779029846, + "learning_rate": 3.26808905438995e-05, + "loss": 0.6748, + "step": 9365 + }, + { + "epoch": 1.6707697796806706, + "grad_norm": 0.5745575428009033, + "learning_rate": 3.264626545173291e-05, + "loss": 0.6036, + "step": 9366 + }, + { + "epoch": 1.6709481758986708, + "grad_norm": 0.5247329473495483, + "learning_rate": 3.261165743043501e-05, + "loss": 0.5762, + "step": 9367 + }, + { + "epoch": 1.671126572116671, + "grad_norm": 0.5291442275047302, + "learning_rate": 3.2577066482724074e-05, + "loss": 0.5571, + "step": 9368 + }, + { + "epoch": 1.6713049683346712, + "grad_norm": 0.5165200233459473, + "learning_rate": 3.2542492611316696e-05, + "loss": 0.6014, + "step": 9369 + }, + { + "epoch": 1.6714833645526714, + "grad_norm": 0.5730751156806946, + "learning_rate": 3.250793581892844e-05, + "loss": 0.7261, + "step": 9370 + }, + { + "epoch": 1.6716617607706716, + "grad_norm": 0.5317413806915283, + "learning_rate": 3.2473396108273297e-05, + "loss": 0.4659, + "step": 9371 + }, + { + "epoch": 1.6718401569886718, + "grad_norm": 0.5281661152839661, + "learning_rate": 3.243887348206395e-05, + "loss": 0.5902, + "step": 9372 + }, + { + "epoch": 1.672018553206672, + "grad_norm": 0.4783737361431122, + "learning_rate": 3.240436794301194e-05, + "loss": 0.5077, + "step": 9373 + }, + { + "epoch": 1.6721969494246722, + "grad_norm": 0.5345422029495239, + "learning_rate": 3.2369879493827167e-05, + "loss": 0.6897, + "step": 9374 + }, + { + "epoch": 1.6723753456426724, + "grad_norm": 0.4720827639102936, + "learning_rate": 3.233540813721844e-05, + "loss": 0.3754, + "step": 9375 + }, + { + "epoch": 1.6725537418606726, + "grad_norm": 0.4707276225090027, + "learning_rate": 3.2300953875893046e-05, + "loss": 0.4113, + "step": 9376 + }, + { + "epoch": 1.6727321380786728, + "grad_norm": 0.5764208436012268, + "learning_rate": 3.226651671255712e-05, + "loss": 0.5561, + "step": 9377 + }, + { + "epoch": 1.672910534296673, + "grad_norm": 0.4676361680030823, + "learning_rate": 3.2232096649915196e-05, + "loss": 0.4113, + "step": 9378 + }, + { + "epoch": 1.6730889305146732, + "grad_norm": 0.5252630710601807, + "learning_rate": 3.219769369067077e-05, + "loss": 0.5728, + "step": 9379 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.4781394302845001, + "learning_rate": 3.216330783752569e-05, + "loss": 0.2909, + "step": 9380 + }, + { + "epoch": 1.6734457229506734, + "grad_norm": 0.47112560272216797, + "learning_rate": 3.2128939093180655e-05, + "loss": 0.4191, + "step": 9381 + }, + { + "epoch": 1.6736241191686736, + "grad_norm": 0.5391256809234619, + "learning_rate": 3.209458746033506e-05, + "loss": 0.7091, + "step": 9382 + }, + { + "epoch": 1.6738025153866738, + "grad_norm": 0.49783357977867126, + "learning_rate": 3.206025294168677e-05, + "loss": 0.5378, + "step": 9383 + }, + { + "epoch": 1.673980911604674, + "grad_norm": 0.5017059445381165, + "learning_rate": 3.202593553993238e-05, + "loss": 0.4537, + "step": 9384 + }, + { + "epoch": 1.6741593078226742, + "grad_norm": 0.5273604989051819, + "learning_rate": 3.1991635257767274e-05, + "loss": 0.4771, + "step": 9385 + }, + { + "epoch": 1.6743377040406744, + "grad_norm": 0.5304795503616333, + "learning_rate": 3.195735209788528e-05, + "loss": 0.3931, + "step": 9386 + }, + { + "epoch": 1.6745161002586744, + "grad_norm": 0.5918444395065308, + "learning_rate": 3.1923086062979056e-05, + "loss": 0.7699, + "step": 9387 + }, + { + "epoch": 1.6746944964766746, + "grad_norm": 0.5880241990089417, + "learning_rate": 3.188883715573976e-05, + "loss": 0.6155, + "step": 9388 + }, + { + "epoch": 1.6748728926946748, + "grad_norm": 0.4606783092021942, + "learning_rate": 3.185460537885737e-05, + "loss": 0.4407, + "step": 9389 + }, + { + "epoch": 1.675051288912675, + "grad_norm": 0.44340142607688904, + "learning_rate": 3.182039073502035e-05, + "loss": 0.3536, + "step": 9390 + }, + { + "epoch": 1.6752296851306752, + "grad_norm": 0.5192257761955261, + "learning_rate": 3.1786193226916005e-05, + "loss": 0.5362, + "step": 9391 + }, + { + "epoch": 1.6754080813486754, + "grad_norm": 0.6925177574157715, + "learning_rate": 3.175201285723017e-05, + "loss": 0.7919, + "step": 9392 + }, + { + "epoch": 1.6755864775666756, + "grad_norm": 0.557155966758728, + "learning_rate": 3.171784962864724e-05, + "loss": 0.4573, + "step": 9393 + }, + { + "epoch": 1.6757648737846758, + "grad_norm": 0.5083150267601013, + "learning_rate": 3.1683703543850526e-05, + "loss": 0.5892, + "step": 9394 + }, + { + "epoch": 1.675943270002676, + "grad_norm": 0.46054404973983765, + "learning_rate": 3.164957460552173e-05, + "loss": 0.3862, + "step": 9395 + }, + { + "epoch": 1.6761216662206762, + "grad_norm": 0.5175045728683472, + "learning_rate": 3.16154628163414e-05, + "loss": 0.5591, + "step": 9396 + }, + { + "epoch": 1.6763000624386764, + "grad_norm": 0.4765566885471344, + "learning_rate": 3.1581368178988654e-05, + "loss": 0.405, + "step": 9397 + }, + { + "epoch": 1.6764784586566766, + "grad_norm": 0.5212094783782959, + "learning_rate": 3.154729069614123e-05, + "loss": 0.4395, + "step": 9398 + }, + { + "epoch": 1.6766568548746767, + "grad_norm": 0.5103222727775574, + "learning_rate": 3.1513230370475654e-05, + "loss": 0.4392, + "step": 9399 + }, + { + "epoch": 1.676835251092677, + "grad_norm": 0.46660804748535156, + "learning_rate": 3.147918720466689e-05, + "loss": 0.4229, + "step": 9400 + }, + { + "epoch": 1.6770136473106771, + "grad_norm": 0.500568687915802, + "learning_rate": 3.1445161201388766e-05, + "loss": 0.4259, + "step": 9401 + }, + { + "epoch": 1.6771920435286773, + "grad_norm": 0.5917816162109375, + "learning_rate": 3.141115236331357e-05, + "loss": 0.7201, + "step": 9402 + }, + { + "epoch": 1.6773704397466773, + "grad_norm": 0.49264368414878845, + "learning_rate": 3.137716069311247e-05, + "loss": 0.4191, + "step": 9403 + }, + { + "epoch": 1.6775488359646775, + "grad_norm": 0.5691975355148315, + "learning_rate": 3.134318619345508e-05, + "loss": 0.6493, + "step": 9404 + }, + { + "epoch": 1.6777272321826777, + "grad_norm": 0.5436373353004456, + "learning_rate": 3.130922886700968e-05, + "loss": 0.5603, + "step": 9405 + }, + { + "epoch": 1.677905628400678, + "grad_norm": 0.47983697056770325, + "learning_rate": 3.127528871644342e-05, + "loss": 0.4834, + "step": 9406 + }, + { + "epoch": 1.6780840246186781, + "grad_norm": 0.5476971864700317, + "learning_rate": 3.12413657444218e-05, + "loss": 0.5386, + "step": 9407 + }, + { + "epoch": 1.6782624208366783, + "grad_norm": 0.5388808846473694, + "learning_rate": 3.120745995360921e-05, + "loss": 0.483, + "step": 9408 + }, + { + "epoch": 1.6784408170546783, + "grad_norm": 0.5438225865364075, + "learning_rate": 3.117357134666851e-05, + "loss": 0.5662, + "step": 9409 + }, + { + "epoch": 1.6786192132726785, + "grad_norm": 0.5238845348358154, + "learning_rate": 3.113969992626142e-05, + "loss": 0.5429, + "step": 9410 + }, + { + "epoch": 1.6787976094906787, + "grad_norm": 0.5313870906829834, + "learning_rate": 3.110584569504804e-05, + "loss": 0.5065, + "step": 9411 + }, + { + "epoch": 1.6789760057086789, + "grad_norm": 0.5582557916641235, + "learning_rate": 3.1072008655687376e-05, + "loss": 0.5529, + "step": 9412 + }, + { + "epoch": 1.679154401926679, + "grad_norm": 0.552903950214386, + "learning_rate": 3.103818881083695e-05, + "loss": 0.553, + "step": 9413 + }, + { + "epoch": 1.6793327981446793, + "grad_norm": 0.5650268793106079, + "learning_rate": 3.100438616315293e-05, + "loss": 0.6105, + "step": 9414 + }, + { + "epoch": 1.6795111943626795, + "grad_norm": 0.4516279995441437, + "learning_rate": 3.0970600715290204e-05, + "loss": 0.3535, + "step": 9415 + }, + { + "epoch": 1.6796895905806797, + "grad_norm": 0.4835292100906372, + "learning_rate": 3.0936832469902226e-05, + "loss": 0.4125, + "step": 9416 + }, + { + "epoch": 1.6798679867986799, + "grad_norm": 0.6102882623672485, + "learning_rate": 3.0903081429641186e-05, + "loss": 0.5305, + "step": 9417 + }, + { + "epoch": 1.68004638301668, + "grad_norm": 0.485422283411026, + "learning_rate": 3.086934759715784e-05, + "loss": 0.4507, + "step": 9418 + }, + { + "epoch": 1.6802247792346803, + "grad_norm": 0.4775411784648895, + "learning_rate": 3.0835630975101587e-05, + "loss": 0.4746, + "step": 9419 + }, + { + "epoch": 1.6804031754526805, + "grad_norm": 0.4786146283149719, + "learning_rate": 3.08019315661206e-05, + "loss": 0.5359, + "step": 9420 + }, + { + "epoch": 1.6805815716706807, + "grad_norm": 0.5550956130027771, + "learning_rate": 3.076824937286155e-05, + "loss": 0.4877, + "step": 9421 + }, + { + "epoch": 1.6807599678886809, + "grad_norm": 0.5382834076881409, + "learning_rate": 3.073458439796989e-05, + "loss": 0.6299, + "step": 9422 + }, + { + "epoch": 1.680938364106681, + "grad_norm": 0.5830110311508179, + "learning_rate": 3.0700936644089574e-05, + "loss": 0.7122, + "step": 9423 + }, + { + "epoch": 1.6811167603246813, + "grad_norm": 0.506211519241333, + "learning_rate": 3.0667306113863366e-05, + "loss": 0.3911, + "step": 9424 + }, + { + "epoch": 1.6812951565426812, + "grad_norm": 0.46619656682014465, + "learning_rate": 3.0633692809932554e-05, + "loss": 0.4391, + "step": 9425 + }, + { + "epoch": 1.6814735527606814, + "grad_norm": 0.4801982343196869, + "learning_rate": 3.060009673493702e-05, + "loss": 0.4515, + "step": 9426 + }, + { + "epoch": 1.6816519489786816, + "grad_norm": 0.5707588791847229, + "learning_rate": 3.0566517891515546e-05, + "loss": 0.645, + "step": 9427 + }, + { + "epoch": 1.6818303451966818, + "grad_norm": 0.5557790994644165, + "learning_rate": 3.0532956282305294e-05, + "loss": 0.6412, + "step": 9428 + }, + { + "epoch": 1.682008741414682, + "grad_norm": 0.5159487128257751, + "learning_rate": 3.0499411909942265e-05, + "loss": 0.5476, + "step": 9429 + }, + { + "epoch": 1.6821871376326822, + "grad_norm": 0.4673673212528229, + "learning_rate": 3.0465884777060943e-05, + "loss": 0.4216, + "step": 9430 + }, + { + "epoch": 1.6823655338506822, + "grad_norm": 0.484874963760376, + "learning_rate": 3.0432374886294523e-05, + "loss": 0.5434, + "step": 9431 + }, + { + "epoch": 1.6825439300686824, + "grad_norm": 0.7181788682937622, + "learning_rate": 3.0398882240274955e-05, + "loss": 0.8602, + "step": 9432 + }, + { + "epoch": 1.6827223262866826, + "grad_norm": 0.4546528458595276, + "learning_rate": 3.036540684163261e-05, + "loss": 0.3598, + "step": 9433 + }, + { + "epoch": 1.6829007225046828, + "grad_norm": 0.4792789816856384, + "learning_rate": 3.033194869299674e-05, + "loss": 0.4134, + "step": 9434 + }, + { + "epoch": 1.683079118722683, + "grad_norm": 0.4788531959056854, + "learning_rate": 3.0298507796995056e-05, + "loss": 0.4332, + "step": 9435 + }, + { + "epoch": 1.6832575149406832, + "grad_norm": 0.6334043145179749, + "learning_rate": 3.0265084156254064e-05, + "loss": 0.4675, + "step": 9436 + }, + { + "epoch": 1.6834359111586834, + "grad_norm": 0.48785632848739624, + "learning_rate": 3.0231677773398748e-05, + "loss": 0.4613, + "step": 9437 + }, + { + "epoch": 1.6836143073766836, + "grad_norm": 0.5417238473892212, + "learning_rate": 3.019828865105295e-05, + "loss": 0.6667, + "step": 9438 + }, + { + "epoch": 1.6837927035946838, + "grad_norm": 0.46144363284111023, + "learning_rate": 3.016491679183897e-05, + "loss": 0.3897, + "step": 9439 + }, + { + "epoch": 1.683971099812684, + "grad_norm": 0.4795248508453369, + "learning_rate": 3.013156219837776e-05, + "loss": 0.4471, + "step": 9440 + }, + { + "epoch": 1.6841494960306842, + "grad_norm": 0.5149518251419067, + "learning_rate": 3.0098224873289086e-05, + "loss": 0.5248, + "step": 9441 + }, + { + "epoch": 1.6843278922486844, + "grad_norm": 0.5572359561920166, + "learning_rate": 3.0064904819191162e-05, + "loss": 0.6678, + "step": 9442 + }, + { + "epoch": 1.6845062884666846, + "grad_norm": 0.5138469934463501, + "learning_rate": 3.003160203870095e-05, + "loss": 0.5152, + "step": 9443 + }, + { + "epoch": 1.6846846846846848, + "grad_norm": 0.5161378383636475, + "learning_rate": 2.9998316534434135e-05, + "loss": 0.4903, + "step": 9444 + }, + { + "epoch": 1.684863080902685, + "grad_norm": 0.49434447288513184, + "learning_rate": 2.996504830900479e-05, + "loss": 0.5285, + "step": 9445 + }, + { + "epoch": 1.6850414771206852, + "grad_norm": 0.5236574411392212, + "learning_rate": 2.9931797365025937e-05, + "loss": 0.5159, + "step": 9446 + }, + { + "epoch": 1.6852198733386852, + "grad_norm": 0.544776201248169, + "learning_rate": 2.9898563705108932e-05, + "loss": 0.6266, + "step": 9447 + }, + { + "epoch": 1.6853982695566854, + "grad_norm": 0.5385255217552185, + "learning_rate": 2.9865347331864106e-05, + "loss": 0.5478, + "step": 9448 + }, + { + "epoch": 1.6855766657746856, + "grad_norm": 0.5094165205955505, + "learning_rate": 2.9832148247900092e-05, + "loss": 0.5402, + "step": 9449 + }, + { + "epoch": 1.6857550619926858, + "grad_norm": 0.5184796452522278, + "learning_rate": 2.9798966455824473e-05, + "loss": 0.4279, + "step": 9450 + }, + { + "epoch": 1.685933458210686, + "grad_norm": 0.5272186398506165, + "learning_rate": 2.9765801958243245e-05, + "loss": 0.4309, + "step": 9451 + }, + { + "epoch": 1.6861118544286862, + "grad_norm": 0.49219128489494324, + "learning_rate": 2.973265475776113e-05, + "loss": 0.4853, + "step": 9452 + }, + { + "epoch": 1.6862902506466861, + "grad_norm": 0.46345192193984985, + "learning_rate": 2.9699524856981574e-05, + "loss": 0.5184, + "step": 9453 + }, + { + "epoch": 1.6864686468646863, + "grad_norm": 0.528198778629303, + "learning_rate": 2.966641225850647e-05, + "loss": 0.572, + "step": 9454 + }, + { + "epoch": 1.6866470430826865, + "grad_norm": 0.5292263627052307, + "learning_rate": 2.963331696493657e-05, + "loss": 0.4926, + "step": 9455 + }, + { + "epoch": 1.6868254393006867, + "grad_norm": 0.48872697353363037, + "learning_rate": 2.9600238978871097e-05, + "loss": 0.3942, + "step": 9456 + }, + { + "epoch": 1.687003835518687, + "grad_norm": 0.5367798805236816, + "learning_rate": 2.9567178302908005e-05, + "loss": 0.5765, + "step": 9457 + }, + { + "epoch": 1.6871822317366871, + "grad_norm": 0.48006823658943176, + "learning_rate": 2.953413493964391e-05, + "loss": 0.4903, + "step": 9458 + }, + { + "epoch": 1.6873606279546873, + "grad_norm": 0.4655410349369049, + "learning_rate": 2.9501108891673928e-05, + "loss": 0.3782, + "step": 9459 + }, + { + "epoch": 1.6875390241726875, + "grad_norm": 0.49426302313804626, + "learning_rate": 2.9468100161592043e-05, + "loss": 0.5576, + "step": 9460 + }, + { + "epoch": 1.6877174203906877, + "grad_norm": 0.4561796188354492, + "learning_rate": 2.9435108751990596e-05, + "loss": 0.4239, + "step": 9461 + }, + { + "epoch": 1.687895816608688, + "grad_norm": 0.4534534513950348, + "learning_rate": 2.940213466546085e-05, + "loss": 0.3932, + "step": 9462 + }, + { + "epoch": 1.6880742128266881, + "grad_norm": 0.5681625604629517, + "learning_rate": 2.936917790459251e-05, + "loss": 0.5814, + "step": 9463 + }, + { + "epoch": 1.6882526090446883, + "grad_norm": 0.5786123275756836, + "learning_rate": 2.9336238471973947e-05, + "loss": 0.7368, + "step": 9464 + }, + { + "epoch": 1.6884310052626885, + "grad_norm": 0.6435004472732544, + "learning_rate": 2.930331637019229e-05, + "loss": 0.6626, + "step": 9465 + }, + { + "epoch": 1.6886094014806887, + "grad_norm": 0.4304559528827667, + "learning_rate": 2.9270411601833162e-05, + "loss": 0.3071, + "step": 9466 + }, + { + "epoch": 1.688787797698689, + "grad_norm": 0.5037026405334473, + "learning_rate": 2.9237524169480974e-05, + "loss": 0.4809, + "step": 9467 + }, + { + "epoch": 1.6889661939166891, + "grad_norm": 0.5338348150253296, + "learning_rate": 2.9204654075718568e-05, + "loss": 0.5474, + "step": 9468 + }, + { + "epoch": 1.689144590134689, + "grad_norm": 0.44183140993118286, + "learning_rate": 2.917180132312766e-05, + "loss": 0.3869, + "step": 9469 + }, + { + "epoch": 1.6893229863526893, + "grad_norm": 0.5616282224655151, + "learning_rate": 2.913896591428841e-05, + "loss": 0.5424, + "step": 9470 + }, + { + "epoch": 1.6895013825706895, + "grad_norm": 0.5086612701416016, + "learning_rate": 2.9106147851779785e-05, + "loss": 0.5467, + "step": 9471 + }, + { + "epoch": 1.6896797787886897, + "grad_norm": 0.44916772842407227, + "learning_rate": 2.907334713817919e-05, + "loss": 0.3491, + "step": 9472 + }, + { + "epoch": 1.6898581750066899, + "grad_norm": 0.5420513153076172, + "learning_rate": 2.9040563776062845e-05, + "loss": 0.5353, + "step": 9473 + }, + { + "epoch": 1.69003657122469, + "grad_norm": 0.4886205196380615, + "learning_rate": 2.9007797768005606e-05, + "loss": 0.4478, + "step": 9474 + }, + { + "epoch": 1.69021496744269, + "grad_norm": 0.47686657309532166, + "learning_rate": 2.897504911658075e-05, + "loss": 0.4001, + "step": 9475 + }, + { + "epoch": 1.6903933636606903, + "grad_norm": 0.5385986566543579, + "learning_rate": 2.8942317824360493e-05, + "loss": 0.6524, + "step": 9476 + }, + { + "epoch": 1.6905717598786905, + "grad_norm": 0.5010984539985657, + "learning_rate": 2.890960389391545e-05, + "loss": 0.4801, + "step": 9477 + }, + { + "epoch": 1.6907501560966907, + "grad_norm": 0.6029075384140015, + "learning_rate": 2.887690732781492e-05, + "loss": 0.6388, + "step": 9478 + }, + { + "epoch": 1.6909285523146909, + "grad_norm": 0.47980138659477234, + "learning_rate": 2.8844228128627e-05, + "loss": 0.4344, + "step": 9479 + }, + { + "epoch": 1.691106948532691, + "grad_norm": 0.5094255208969116, + "learning_rate": 2.881156629891815e-05, + "loss": 0.5046, + "step": 9480 + }, + { + "epoch": 1.6912853447506913, + "grad_norm": 0.4894050061702728, + "learning_rate": 2.8778921841253774e-05, + "loss": 0.4561, + "step": 9481 + }, + { + "epoch": 1.6914637409686915, + "grad_norm": 0.531238317489624, + "learning_rate": 2.8746294758197622e-05, + "loss": 0.5447, + "step": 9482 + }, + { + "epoch": 1.6916421371866917, + "grad_norm": 0.5232047438621521, + "learning_rate": 2.871368505231234e-05, + "loss": 0.474, + "step": 9483 + }, + { + "epoch": 1.6918205334046919, + "grad_norm": 0.4739258885383606, + "learning_rate": 2.8681092726158957e-05, + "loss": 0.4283, + "step": 9484 + }, + { + "epoch": 1.691998929622692, + "grad_norm": 0.4757033884525299, + "learning_rate": 2.8648517782297294e-05, + "loss": 0.3738, + "step": 9485 + }, + { + "epoch": 1.6921773258406922, + "grad_norm": 0.5776329636573792, + "learning_rate": 2.861596022328583e-05, + "loss": 0.7618, + "step": 9486 + }, + { + "epoch": 1.6923557220586924, + "grad_norm": 0.5738844275474548, + "learning_rate": 2.8583420051681545e-05, + "loss": 0.678, + "step": 9487 + }, + { + "epoch": 1.6925341182766926, + "grad_norm": 0.5281713604927063, + "learning_rate": 2.8550897270040148e-05, + "loss": 0.4907, + "step": 9488 + }, + { + "epoch": 1.6927125144946928, + "grad_norm": 0.5512872338294983, + "learning_rate": 2.851839188091604e-05, + "loss": 0.6453, + "step": 9489 + }, + { + "epoch": 1.692890910712693, + "grad_norm": 0.5240409970283508, + "learning_rate": 2.8485903886862093e-05, + "loss": 0.443, + "step": 9490 + }, + { + "epoch": 1.693069306930693, + "grad_norm": 0.6053004860877991, + "learning_rate": 2.845343329042996e-05, + "loss": 0.568, + "step": 9491 + }, + { + "epoch": 1.6932477031486932, + "grad_norm": 0.6063686013221741, + "learning_rate": 2.842098009416977e-05, + "loss": 0.5555, + "step": 9492 + }, + { + "epoch": 1.6934260993666934, + "grad_norm": 0.5201789736747742, + "learning_rate": 2.8388544300630538e-05, + "loss": 0.515, + "step": 9493 + }, + { + "epoch": 1.6936044955846936, + "grad_norm": 0.47415268421173096, + "learning_rate": 2.8356125912359587e-05, + "loss": 0.3263, + "step": 9494 + }, + { + "epoch": 1.6937828918026938, + "grad_norm": 0.4585654139518738, + "learning_rate": 2.832372493190319e-05, + "loss": 0.3953, + "step": 9495 + }, + { + "epoch": 1.693961288020694, + "grad_norm": 0.552780032157898, + "learning_rate": 2.8291341361805973e-05, + "loss": 0.5802, + "step": 9496 + }, + { + "epoch": 1.694139684238694, + "grad_norm": 0.5424774885177612, + "learning_rate": 2.8258975204611488e-05, + "loss": 0.5196, + "step": 9497 + }, + { + "epoch": 1.6943180804566942, + "grad_norm": 0.5058227777481079, + "learning_rate": 2.8226626462861645e-05, + "loss": 0.4721, + "step": 9498 + }, + { + "epoch": 1.6944964766746944, + "grad_norm": 0.4840177297592163, + "learning_rate": 2.819429513909705e-05, + "loss": 0.5075, + "step": 9499 + }, + { + "epoch": 1.6946748728926946, + "grad_norm": 0.47189828753471375, + "learning_rate": 2.816198123585714e-05, + "loss": 0.3598, + "step": 9500 + }, + { + "epoch": 1.6948532691106948, + "grad_norm": 0.46200263500213623, + "learning_rate": 2.81296847556797e-05, + "loss": 0.4265, + "step": 9501 + }, + { + "epoch": 1.695031665328695, + "grad_norm": 0.5357261300086975, + "learning_rate": 2.809740570110142e-05, + "loss": 0.4951, + "step": 9502 + }, + { + "epoch": 1.6952100615466952, + "grad_norm": 0.5782269835472107, + "learning_rate": 2.8065144074657322e-05, + "loss": 0.586, + "step": 9503 + }, + { + "epoch": 1.6953884577646954, + "grad_norm": 0.5674153566360474, + "learning_rate": 2.8032899878881303e-05, + "loss": 0.5702, + "step": 9504 + }, + { + "epoch": 1.6955668539826956, + "grad_norm": 0.5033431649208069, + "learning_rate": 2.8000673116305857e-05, + "loss": 0.4803, + "step": 9505 + }, + { + "epoch": 1.6957452502006958, + "grad_norm": 0.5172526836395264, + "learning_rate": 2.7968463789461968e-05, + "loss": 0.5727, + "step": 9506 + }, + { + "epoch": 1.695923646418696, + "grad_norm": 0.5547386407852173, + "learning_rate": 2.793627190087944e-05, + "loss": 0.5522, + "step": 9507 + }, + { + "epoch": 1.6961020426366962, + "grad_norm": 0.5478581786155701, + "learning_rate": 2.7904097453086503e-05, + "loss": 0.5495, + "step": 9508 + }, + { + "epoch": 1.6962804388546964, + "grad_norm": 0.5385379791259766, + "learning_rate": 2.7871940448610244e-05, + "loss": 0.4659, + "step": 9509 + }, + { + "epoch": 1.6964588350726966, + "grad_norm": 0.4548676609992981, + "learning_rate": 2.7839800889976173e-05, + "loss": 0.4397, + "step": 9510 + }, + { + "epoch": 1.6966372312906968, + "grad_norm": 0.5475732684135437, + "learning_rate": 2.7807678779708466e-05, + "loss": 0.6873, + "step": 9511 + }, + { + "epoch": 1.696815627508697, + "grad_norm": 0.49754053354263306, + "learning_rate": 2.7775574120330132e-05, + "loss": 0.4393, + "step": 9512 + }, + { + "epoch": 1.696994023726697, + "grad_norm": 0.5146554708480835, + "learning_rate": 2.7743486914362513e-05, + "loss": 0.4761, + "step": 9513 + }, + { + "epoch": 1.6971724199446971, + "grad_norm": 0.5417225956916809, + "learning_rate": 2.771141716432585e-05, + "loss": 0.5846, + "step": 9514 + }, + { + "epoch": 1.6973508161626973, + "grad_norm": 0.5172662734985352, + "learning_rate": 2.7679364872738753e-05, + "loss": 0.4245, + "step": 9515 + }, + { + "epoch": 1.6975292123806975, + "grad_norm": 0.5381408929824829, + "learning_rate": 2.764733004211875e-05, + "loss": 0.4776, + "step": 9516 + }, + { + "epoch": 1.6977076085986977, + "grad_norm": 0.4274336099624634, + "learning_rate": 2.7615312674981686e-05, + "loss": 0.3263, + "step": 9517 + }, + { + "epoch": 1.697886004816698, + "grad_norm": 0.5179285407066345, + "learning_rate": 2.7583312773842267e-05, + "loss": 0.5205, + "step": 9518 + }, + { + "epoch": 1.698064401034698, + "grad_norm": 0.5593109726905823, + "learning_rate": 2.7551330341213793e-05, + "loss": 0.568, + "step": 9519 + }, + { + "epoch": 1.698242797252698, + "grad_norm": 0.6157004833221436, + "learning_rate": 2.7519365379608058e-05, + "loss": 0.7443, + "step": 9520 + }, + { + "epoch": 1.6984211934706983, + "grad_norm": 0.5052160620689392, + "learning_rate": 2.7487417891535693e-05, + "loss": 0.4443, + "step": 9521 + }, + { + "epoch": 1.6985995896886985, + "grad_norm": 0.569677472114563, + "learning_rate": 2.7455487879505747e-05, + "loss": 0.5609, + "step": 9522 + }, + { + "epoch": 1.6987779859066987, + "grad_norm": 0.45186012983322144, + "learning_rate": 2.742357534602599e-05, + "loss": 0.3359, + "step": 9523 + }, + { + "epoch": 1.698956382124699, + "grad_norm": 0.5323873162269592, + "learning_rate": 2.7391680293602866e-05, + "loss": 0.5906, + "step": 9524 + }, + { + "epoch": 1.699134778342699, + "grad_norm": 0.47437727451324463, + "learning_rate": 2.7359802724741367e-05, + "loss": 0.4004, + "step": 9525 + }, + { + "epoch": 1.6993131745606993, + "grad_norm": 0.47239458560943604, + "learning_rate": 2.7327942641945157e-05, + "loss": 0.4497, + "step": 9526 + }, + { + "epoch": 1.6994915707786995, + "grad_norm": 0.5352083444595337, + "learning_rate": 2.7296100047716488e-05, + "loss": 0.5579, + "step": 9527 + }, + { + "epoch": 1.6996699669966997, + "grad_norm": 0.530881404876709, + "learning_rate": 2.7264274944556328e-05, + "loss": 0.5424, + "step": 9528 + }, + { + "epoch": 1.6998483632147, + "grad_norm": 0.5889961123466492, + "learning_rate": 2.7232467334964095e-05, + "loss": 0.7217, + "step": 9529 + }, + { + "epoch": 1.7000267594327, + "grad_norm": 0.5244048833847046, + "learning_rate": 2.7200677221438098e-05, + "loss": 0.5383, + "step": 9530 + }, + { + "epoch": 1.7002051556507003, + "grad_norm": 0.5565990209579468, + "learning_rate": 2.7168904606475005e-05, + "loss": 0.642, + "step": 9531 + }, + { + "epoch": 1.7003835518687005, + "grad_norm": 0.4610227644443512, + "learning_rate": 2.7137149492570207e-05, + "loss": 0.3604, + "step": 9532 + }, + { + "epoch": 1.7005619480867007, + "grad_norm": 0.4633643329143524, + "learning_rate": 2.7105411882217852e-05, + "loss": 0.3937, + "step": 9533 + }, + { + "epoch": 1.700740344304701, + "grad_norm": 0.5488377213478088, + "learning_rate": 2.7073691777910387e-05, + "loss": 0.5148, + "step": 9534 + }, + { + "epoch": 1.7009187405227009, + "grad_norm": 0.5138922929763794, + "learning_rate": 2.7041989182139375e-05, + "loss": 0.4672, + "step": 9535 + }, + { + "epoch": 1.701097136740701, + "grad_norm": 0.5196233987808228, + "learning_rate": 2.7010304097394578e-05, + "loss": 0.4599, + "step": 9536 + }, + { + "epoch": 1.7012755329587013, + "grad_norm": 0.45630475878715515, + "learning_rate": 2.6978636526164474e-05, + "loss": 0.3845, + "step": 9537 + }, + { + "epoch": 1.7014539291767015, + "grad_norm": 0.5754191279411316, + "learning_rate": 2.6946986470936352e-05, + "loss": 0.7097, + "step": 9538 + }, + { + "epoch": 1.7016323253947017, + "grad_norm": 0.5594825148582458, + "learning_rate": 2.6915353934195864e-05, + "loss": 0.6275, + "step": 9539 + }, + { + "epoch": 1.7018107216127019, + "grad_norm": 0.5201573967933655, + "learning_rate": 2.688373891842752e-05, + "loss": 0.5501, + "step": 9540 + }, + { + "epoch": 1.7019891178307018, + "grad_norm": 0.5017893314361572, + "learning_rate": 2.685214142611428e-05, + "loss": 0.4429, + "step": 9541 + }, + { + "epoch": 1.702167514048702, + "grad_norm": 0.5738409161567688, + "learning_rate": 2.682056145973785e-05, + "loss": 0.6522, + "step": 9542 + }, + { + "epoch": 1.7023459102667022, + "grad_norm": 0.5462552905082703, + "learning_rate": 2.6788999021778506e-05, + "loss": 0.7407, + "step": 9543 + }, + { + "epoch": 1.7025243064847024, + "grad_norm": 0.5017895102500916, + "learning_rate": 2.6757454114715058e-05, + "loss": 0.4998, + "step": 9544 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 0.47052887082099915, + "learning_rate": 2.6725926741025143e-05, + "loss": 0.3548, + "step": 9545 + }, + { + "epoch": 1.7028810989207028, + "grad_norm": 0.5245179533958435, + "learning_rate": 2.6694416903184805e-05, + "loss": 0.5281, + "step": 9546 + }, + { + "epoch": 1.703059495138703, + "grad_norm": 0.4874159097671509, + "learning_rate": 2.6662924603668927e-05, + "loss": 0.443, + "step": 9547 + }, + { + "epoch": 1.7032378913567032, + "grad_norm": 0.5207834243774414, + "learning_rate": 2.6631449844950806e-05, + "loss": 0.4949, + "step": 9548 + }, + { + "epoch": 1.7034162875747034, + "grad_norm": 0.43551138043403625, + "learning_rate": 2.6599992629502466e-05, + "loss": 0.3538, + "step": 9549 + }, + { + "epoch": 1.7035946837927036, + "grad_norm": 0.49214231967926025, + "learning_rate": 2.656855295979463e-05, + "loss": 0.4541, + "step": 9550 + }, + { + "epoch": 1.7037730800107038, + "grad_norm": 0.4991733729839325, + "learning_rate": 2.6537130838296452e-05, + "loss": 0.3988, + "step": 9551 + }, + { + "epoch": 1.703951476228704, + "grad_norm": 0.5545747876167297, + "learning_rate": 2.650572626747588e-05, + "loss": 0.5933, + "step": 9552 + }, + { + "epoch": 1.7041298724467042, + "grad_norm": 0.542052686214447, + "learning_rate": 2.6474339249799362e-05, + "loss": 0.5359, + "step": 9553 + }, + { + "epoch": 1.7043082686647044, + "grad_norm": 0.5299772620201111, + "learning_rate": 2.6442969787732085e-05, + "loss": 0.5399, + "step": 9554 + }, + { + "epoch": 1.7044866648827046, + "grad_norm": 0.490618497133255, + "learning_rate": 2.641161788373772e-05, + "loss": 0.4699, + "step": 9555 + }, + { + "epoch": 1.7046650611007048, + "grad_norm": 0.5627853870391846, + "learning_rate": 2.6380283540278714e-05, + "loss": 0.6102, + "step": 9556 + }, + { + "epoch": 1.7048434573187048, + "grad_norm": 0.6248000860214233, + "learning_rate": 2.634896675981599e-05, + "loss": 0.5612, + "step": 9557 + }, + { + "epoch": 1.705021853536705, + "grad_norm": 0.5680274367332458, + "learning_rate": 2.6317667544809132e-05, + "loss": 0.5528, + "step": 9558 + }, + { + "epoch": 1.7052002497547052, + "grad_norm": 0.5207681059837341, + "learning_rate": 2.628638589771648e-05, + "loss": 0.526, + "step": 9559 + }, + { + "epoch": 1.7053786459727054, + "grad_norm": 0.4698508381843567, + "learning_rate": 2.6255121820994737e-05, + "loss": 0.4154, + "step": 9560 + }, + { + "epoch": 1.7055570421907056, + "grad_norm": 0.45155656337738037, + "learning_rate": 2.6223875317099492e-05, + "loss": 0.4199, + "step": 9561 + }, + { + "epoch": 1.7057354384087058, + "grad_norm": 0.5513625741004944, + "learning_rate": 2.6192646388484732e-05, + "loss": 0.5584, + "step": 9562 + }, + { + "epoch": 1.7059138346267058, + "grad_norm": 0.4611358642578125, + "learning_rate": 2.6161435037603264e-05, + "loss": 0.3435, + "step": 9563 + }, + { + "epoch": 1.706092230844706, + "grad_norm": 0.5059071183204651, + "learning_rate": 2.61302412669063e-05, + "loss": 0.4466, + "step": 9564 + }, + { + "epoch": 1.7062706270627062, + "grad_norm": 0.5149518251419067, + "learning_rate": 2.609906507884388e-05, + "loss": 0.4098, + "step": 9565 + }, + { + "epoch": 1.7064490232807064, + "grad_norm": 0.5157696604728699, + "learning_rate": 2.606790647586457e-05, + "loss": 0.4633, + "step": 9566 + }, + { + "epoch": 1.7066274194987066, + "grad_norm": 0.47338274121284485, + "learning_rate": 2.6036765460415447e-05, + "loss": 0.4056, + "step": 9567 + }, + { + "epoch": 1.7068058157167068, + "grad_norm": 0.49794062972068787, + "learning_rate": 2.6005642034942463e-05, + "loss": 0.5416, + "step": 9568 + }, + { + "epoch": 1.706984211934707, + "grad_norm": 0.6232036352157593, + "learning_rate": 2.597453620188997e-05, + "loss": 0.6416, + "step": 9569 + }, + { + "epoch": 1.7071626081527072, + "grad_norm": 0.5632308125495911, + "learning_rate": 2.5943447963700933e-05, + "loss": 0.5564, + "step": 9570 + }, + { + "epoch": 1.7073410043707073, + "grad_norm": 0.5408989191055298, + "learning_rate": 2.591237732281712e-05, + "loss": 0.5595, + "step": 9571 + }, + { + "epoch": 1.7075194005887075, + "grad_norm": 0.5331925749778748, + "learning_rate": 2.588132428167869e-05, + "loss": 0.567, + "step": 9572 + }, + { + "epoch": 1.7076977968067077, + "grad_norm": 0.4525754749774933, + "learning_rate": 2.5850288842724696e-05, + "loss": 0.3582, + "step": 9573 + }, + { + "epoch": 1.707876193024708, + "grad_norm": 0.4124913513660431, + "learning_rate": 2.5819271008392486e-05, + "loss": 0.3726, + "step": 9574 + }, + { + "epoch": 1.7080545892427081, + "grad_norm": 0.45310550928115845, + "learning_rate": 2.578827078111831e-05, + "loss": 0.4381, + "step": 9575 + }, + { + "epoch": 1.7082329854607083, + "grad_norm": 0.45527383685112, + "learning_rate": 2.5757288163336808e-05, + "loss": 0.4271, + "step": 9576 + }, + { + "epoch": 1.7084113816787085, + "grad_norm": 0.5173305869102478, + "learning_rate": 2.572632315748144e-05, + "loss": 0.5474, + "step": 9577 + }, + { + "epoch": 1.7085897778967087, + "grad_norm": 0.4721916913986206, + "learning_rate": 2.569537576598416e-05, + "loss": 0.37, + "step": 9578 + }, + { + "epoch": 1.7087681741147087, + "grad_norm": 0.503395676612854, + "learning_rate": 2.5664445991275486e-05, + "loss": 0.5476, + "step": 9579 + }, + { + "epoch": 1.708946570332709, + "grad_norm": 0.4126671254634857, + "learning_rate": 2.563353383578468e-05, + "loss": 0.3101, + "step": 9580 + }, + { + "epoch": 1.7091249665507091, + "grad_norm": 0.546122133731842, + "learning_rate": 2.560263930193957e-05, + "loss": 0.4748, + "step": 9581 + }, + { + "epoch": 1.7093033627687093, + "grad_norm": 0.4883154332637787, + "learning_rate": 2.557176239216666e-05, + "loss": 0.465, + "step": 9582 + }, + { + "epoch": 1.7094817589867095, + "grad_norm": 0.5338440537452698, + "learning_rate": 2.554090310889093e-05, + "loss": 0.4543, + "step": 9583 + }, + { + "epoch": 1.7096601552047097, + "grad_norm": 0.49619346857070923, + "learning_rate": 2.5510061454536048e-05, + "loss": 0.4506, + "step": 9584 + }, + { + "epoch": 1.7098385514227097, + "grad_norm": 0.5309063196182251, + "learning_rate": 2.5479237431524387e-05, + "loss": 0.6047, + "step": 9585 + }, + { + "epoch": 1.7100169476407099, + "grad_norm": 0.4647756516933441, + "learning_rate": 2.5448431042276732e-05, + "loss": 0.3435, + "step": 9586 + }, + { + "epoch": 1.71019534385871, + "grad_norm": 0.5021915435791016, + "learning_rate": 2.5417642289212756e-05, + "loss": 0.5529, + "step": 9587 + }, + { + "epoch": 1.7103737400767103, + "grad_norm": 0.5375409126281738, + "learning_rate": 2.5386871174750415e-05, + "loss": 0.5268, + "step": 9588 + }, + { + "epoch": 1.7105521362947105, + "grad_norm": 0.5530412793159485, + "learning_rate": 2.5356117701306637e-05, + "loss": 0.4953, + "step": 9589 + }, + { + "epoch": 1.7107305325127107, + "grad_norm": 0.5221925973892212, + "learning_rate": 2.532538187129668e-05, + "loss": 0.536, + "step": 9590 + }, + { + "epoch": 1.7109089287307109, + "grad_norm": 0.46529194712638855, + "learning_rate": 2.5294663687134512e-05, + "loss": 0.436, + "step": 9591 + }, + { + "epoch": 1.711087324948711, + "grad_norm": 0.5522665977478027, + "learning_rate": 2.5263963151232806e-05, + "loss": 0.4987, + "step": 9592 + }, + { + "epoch": 1.7112657211667113, + "grad_norm": 0.5286535024642944, + "learning_rate": 2.523328026600266e-05, + "loss": 0.5313, + "step": 9593 + }, + { + "epoch": 1.7114441173847115, + "grad_norm": 0.476859986782074, + "learning_rate": 2.5202615033854016e-05, + "loss": 0.4428, + "step": 9594 + }, + { + "epoch": 1.7116225136027117, + "grad_norm": 0.5317862629890442, + "learning_rate": 2.5171967457195217e-05, + "loss": 0.5632, + "step": 9595 + }, + { + "epoch": 1.7118009098207119, + "grad_norm": 0.5589151978492737, + "learning_rate": 2.5141337538433312e-05, + "loss": 0.6869, + "step": 9596 + }, + { + "epoch": 1.711979306038712, + "grad_norm": 0.4832451343536377, + "learning_rate": 2.5110725279974074e-05, + "loss": 0.4022, + "step": 9597 + }, + { + "epoch": 1.7121577022567123, + "grad_norm": 0.45813506841659546, + "learning_rate": 2.508013068422163e-05, + "loss": 0.3612, + "step": 9598 + }, + { + "epoch": 1.7123360984747125, + "grad_norm": 0.4622073471546173, + "learning_rate": 2.504955375357895e-05, + "loss": 0.3288, + "step": 9599 + }, + { + "epoch": 1.7125144946927127, + "grad_norm": 0.5384669899940491, + "learning_rate": 2.5018994490447505e-05, + "loss": 0.4338, + "step": 9600 + }, + { + "epoch": 1.7126928909107126, + "grad_norm": 0.5117572546005249, + "learning_rate": 2.4988452897227454e-05, + "loss": 0.4504, + "step": 9601 + }, + { + "epoch": 1.7128712871287128, + "grad_norm": 0.5058663487434387, + "learning_rate": 2.495792897631749e-05, + "loss": 0.41, + "step": 9602 + }, + { + "epoch": 1.713049683346713, + "grad_norm": 0.481413871049881, + "learning_rate": 2.492742273011489e-05, + "loss": 0.4433, + "step": 9603 + }, + { + "epoch": 1.7132280795647132, + "grad_norm": 0.5749334692955017, + "learning_rate": 2.4896934161015683e-05, + "loss": 0.542, + "step": 9604 + }, + { + "epoch": 1.7134064757827134, + "grad_norm": 0.4859156012535095, + "learning_rate": 2.486646327141437e-05, + "loss": 0.4481, + "step": 9605 + }, + { + "epoch": 1.7135848720007136, + "grad_norm": 0.5681634545326233, + "learning_rate": 2.4836010063704174e-05, + "loss": 0.5799, + "step": 9606 + }, + { + "epoch": 1.7137632682187136, + "grad_norm": 0.5673406720161438, + "learning_rate": 2.4805574540276822e-05, + "loss": 0.659, + "step": 9607 + }, + { + "epoch": 1.7139416644367138, + "grad_norm": 0.5422623753547668, + "learning_rate": 2.477515670352279e-05, + "loss": 0.4897, + "step": 9608 + }, + { + "epoch": 1.714120060654714, + "grad_norm": 0.4804922044277191, + "learning_rate": 2.4744756555830972e-05, + "loss": 0.4234, + "step": 9609 + }, + { + "epoch": 1.7142984568727142, + "grad_norm": 0.5365844368934631, + "learning_rate": 2.4714374099589042e-05, + "loss": 0.5374, + "step": 9610 + }, + { + "epoch": 1.7144768530907144, + "grad_norm": 0.5775295495986938, + "learning_rate": 2.468400933718326e-05, + "loss": 0.5003, + "step": 9611 + }, + { + "epoch": 1.7146552493087146, + "grad_norm": 0.5075268149375916, + "learning_rate": 2.4653662270998383e-05, + "loss": 0.4016, + "step": 9612 + }, + { + "epoch": 1.7148336455267148, + "grad_norm": 0.5659419298171997, + "learning_rate": 2.462333290341795e-05, + "loss": 0.6249, + "step": 9613 + }, + { + "epoch": 1.715012041744715, + "grad_norm": 0.5081014633178711, + "learning_rate": 2.4593021236823914e-05, + "loss": 0.394, + "step": 9614 + }, + { + "epoch": 1.7151904379627152, + "grad_norm": 0.5532371401786804, + "learning_rate": 2.456272727359704e-05, + "loss": 0.606, + "step": 9615 + }, + { + "epoch": 1.7153688341807154, + "grad_norm": 0.5273371934890747, + "learning_rate": 2.4532451016116535e-05, + "loss": 0.6084, + "step": 9616 + }, + { + "epoch": 1.7155472303987156, + "grad_norm": 0.5197193622589111, + "learning_rate": 2.4502192466760276e-05, + "loss": 0.5743, + "step": 9617 + }, + { + "epoch": 1.7157256266167158, + "grad_norm": 0.5154910683631897, + "learning_rate": 2.4471951627904804e-05, + "loss": 0.5972, + "step": 9618 + }, + { + "epoch": 1.715904022834716, + "grad_norm": 0.5434054732322693, + "learning_rate": 2.4441728501925165e-05, + "loss": 0.5537, + "step": 9619 + }, + { + "epoch": 1.7160824190527162, + "grad_norm": 0.5460578799247742, + "learning_rate": 2.4411523091195153e-05, + "loss": 0.6007, + "step": 9620 + }, + { + "epoch": 1.7162608152707164, + "grad_norm": 0.4608527421951294, + "learning_rate": 2.4381335398086985e-05, + "loss": 0.3695, + "step": 9621 + }, + { + "epoch": 1.7164392114887166, + "grad_norm": 0.6361474990844727, + "learning_rate": 2.4351165424971706e-05, + "loss": 0.7393, + "step": 9622 + }, + { + "epoch": 1.7166176077067166, + "grad_norm": 0.5027257204055786, + "learning_rate": 2.4321013174218782e-05, + "loss": 0.4491, + "step": 9623 + }, + { + "epoch": 1.7167960039247168, + "grad_norm": 0.6172974705696106, + "learning_rate": 2.4290878648196318e-05, + "loss": 0.6406, + "step": 9624 + }, + { + "epoch": 1.716974400142717, + "grad_norm": 0.48582372069358826, + "learning_rate": 2.4260761849271116e-05, + "loss": 0.5167, + "step": 9625 + }, + { + "epoch": 1.7171527963607172, + "grad_norm": 0.5107640624046326, + "learning_rate": 2.4230662779808538e-05, + "loss": 0.4779, + "step": 9626 + }, + { + "epoch": 1.7173311925787174, + "grad_norm": 0.591314971446991, + "learning_rate": 2.4200581442172576e-05, + "loss": 0.6189, + "step": 9627 + }, + { + "epoch": 1.7175095887967176, + "grad_norm": 0.5267913937568665, + "learning_rate": 2.4170517838725816e-05, + "loss": 0.4974, + "step": 9628 + }, + { + "epoch": 1.7176879850147175, + "grad_norm": 0.5801319479942322, + "learning_rate": 2.4140471971829338e-05, + "loss": 0.5212, + "step": 9629 + }, + { + "epoch": 1.7178663812327177, + "grad_norm": 0.5426841974258423, + "learning_rate": 2.4110443843843034e-05, + "loss": 0.4802, + "step": 9630 + }, + { + "epoch": 1.718044777450718, + "grad_norm": 0.5340462327003479, + "learning_rate": 2.408043345712521e-05, + "loss": 0.5782, + "step": 9631 + }, + { + "epoch": 1.7182231736687181, + "grad_norm": 0.5407845973968506, + "learning_rate": 2.4050440814032986e-05, + "loss": 0.5252, + "step": 9632 + }, + { + "epoch": 1.7184015698867183, + "grad_norm": 0.5170387625694275, + "learning_rate": 2.4020465916921862e-05, + "loss": 0.5455, + "step": 9633 + }, + { + "epoch": 1.7185799661047185, + "grad_norm": 0.45365792512893677, + "learning_rate": 2.3990508768146125e-05, + "loss": 0.3647, + "step": 9634 + }, + { + "epoch": 1.7187583623227187, + "grad_norm": 0.5068604946136475, + "learning_rate": 2.3960569370058528e-05, + "loss": 0.5216, + "step": 9635 + }, + { + "epoch": 1.718936758540719, + "grad_norm": 0.6078252792358398, + "learning_rate": 2.3930647725010607e-05, + "loss": 0.7029, + "step": 9636 + }, + { + "epoch": 1.7191151547587191, + "grad_norm": 0.4622660279273987, + "learning_rate": 2.3900743835352316e-05, + "loss": 0.4294, + "step": 9637 + }, + { + "epoch": 1.7192935509767193, + "grad_norm": 0.45116907358169556, + "learning_rate": 2.387085770343225e-05, + "loss": 0.3998, + "step": 9638 + }, + { + "epoch": 1.7194719471947195, + "grad_norm": 0.4865681231021881, + "learning_rate": 2.3840989331597757e-05, + "loss": 0.424, + "step": 9639 + }, + { + "epoch": 1.7196503434127197, + "grad_norm": 0.5520277619361877, + "learning_rate": 2.3811138722194593e-05, + "loss": 0.5527, + "step": 9640 + }, + { + "epoch": 1.71982873963072, + "grad_norm": 0.4993618130683899, + "learning_rate": 2.378130587756727e-05, + "loss": 0.4283, + "step": 9641 + }, + { + "epoch": 1.7200071358487201, + "grad_norm": 0.5111703276634216, + "learning_rate": 2.3751490800058865e-05, + "loss": 0.4524, + "step": 9642 + }, + { + "epoch": 1.7201855320667203, + "grad_norm": 0.554426372051239, + "learning_rate": 2.3721693492010977e-05, + "loss": 0.5797, + "step": 9643 + }, + { + "epoch": 1.7203639282847205, + "grad_norm": 0.4680947959423065, + "learning_rate": 2.369191395576395e-05, + "loss": 0.3494, + "step": 9644 + }, + { + "epoch": 1.7205423245027205, + "grad_norm": 0.5217585563659668, + "learning_rate": 2.3662152193656554e-05, + "loss": 0.6172, + "step": 9645 + }, + { + "epoch": 1.7207207207207207, + "grad_norm": 0.5365320444107056, + "learning_rate": 2.3632408208026395e-05, + "loss": 0.5242, + "step": 9646 + }, + { + "epoch": 1.7208991169387209, + "grad_norm": 0.6032186150550842, + "learning_rate": 2.3602682001209408e-05, + "loss": 0.6657, + "step": 9647 + }, + { + "epoch": 1.721077513156721, + "grad_norm": 0.5880352258682251, + "learning_rate": 2.3572973575540418e-05, + "loss": 0.665, + "step": 9648 + }, + { + "epoch": 1.7212559093747213, + "grad_norm": 0.5372354388237, + "learning_rate": 2.3543282933352645e-05, + "loss": 0.4893, + "step": 9649 + }, + { + "epoch": 1.7214343055927215, + "grad_norm": 0.44951343536376953, + "learning_rate": 2.3513610076977916e-05, + "loss": 0.3493, + "step": 9650 + }, + { + "epoch": 1.7216127018107215, + "grad_norm": 0.49766871333122253, + "learning_rate": 2.348395500874684e-05, + "loss": 0.4823, + "step": 9651 + }, + { + "epoch": 1.7217910980287217, + "grad_norm": 0.5321714282035828, + "learning_rate": 2.345431773098841e-05, + "loss": 0.5048, + "step": 9652 + }, + { + "epoch": 1.7219694942467219, + "grad_norm": 0.5095254182815552, + "learning_rate": 2.342469824603044e-05, + "loss": 0.4675, + "step": 9653 + }, + { + "epoch": 1.722147890464722, + "grad_norm": 0.48094990849494934, + "learning_rate": 2.339509655619909e-05, + "loss": 0.4484, + "step": 9654 + }, + { + "epoch": 1.7223262866827223, + "grad_norm": 0.5008235573768616, + "learning_rate": 2.336551266381942e-05, + "loss": 0.3872, + "step": 9655 + }, + { + "epoch": 1.7225046829007225, + "grad_norm": 0.552046537399292, + "learning_rate": 2.3335946571214795e-05, + "loss": 0.6718, + "step": 9656 + }, + { + "epoch": 1.7226830791187226, + "grad_norm": 0.5397679805755615, + "learning_rate": 2.330639828070738e-05, + "loss": 0.4931, + "step": 9657 + }, + { + "epoch": 1.7228614753367228, + "grad_norm": 0.5258644819259644, + "learning_rate": 2.3276867794617936e-05, + "loss": 0.3977, + "step": 9658 + }, + { + "epoch": 1.723039871554723, + "grad_norm": 0.5579936504364014, + "learning_rate": 2.3247355115265684e-05, + "loss": 0.5178, + "step": 9659 + }, + { + "epoch": 1.7232182677727232, + "grad_norm": 0.53934246301651, + "learning_rate": 2.3217860244968638e-05, + "loss": 0.5224, + "step": 9660 + }, + { + "epoch": 1.7233966639907234, + "grad_norm": 0.5465741157531738, + "learning_rate": 2.3188383186043187e-05, + "loss": 0.5329, + "step": 9661 + }, + { + "epoch": 1.7235750602087236, + "grad_norm": 0.6120946407318115, + "learning_rate": 2.3158923940804572e-05, + "loss": 0.7921, + "step": 9662 + }, + { + "epoch": 1.7237534564267238, + "grad_norm": 0.5330802798271179, + "learning_rate": 2.3129482511566463e-05, + "loss": 0.5436, + "step": 9663 + }, + { + "epoch": 1.723931852644724, + "grad_norm": 0.4863676428794861, + "learning_rate": 2.3100058900641125e-05, + "loss": 0.4046, + "step": 9664 + }, + { + "epoch": 1.7241102488627242, + "grad_norm": 0.43972915410995483, + "learning_rate": 2.3070653110339567e-05, + "loss": 0.4352, + "step": 9665 + }, + { + "epoch": 1.7242886450807244, + "grad_norm": 0.5199171304702759, + "learning_rate": 2.3041265142971196e-05, + "loss": 0.486, + "step": 9666 + }, + { + "epoch": 1.7244670412987244, + "grad_norm": 0.4947983920574188, + "learning_rate": 2.3011895000844247e-05, + "loss": 0.4544, + "step": 9667 + }, + { + "epoch": 1.7246454375167246, + "grad_norm": 0.565777599811554, + "learning_rate": 2.298254268626532e-05, + "loss": 0.5195, + "step": 9668 + }, + { + "epoch": 1.7248238337347248, + "grad_norm": 0.4736662805080414, + "learning_rate": 2.2953208201539873e-05, + "loss": 0.3791, + "step": 9669 + }, + { + "epoch": 1.725002229952725, + "grad_norm": 0.5035544633865356, + "learning_rate": 2.292389154897173e-05, + "loss": 0.488, + "step": 9670 + }, + { + "epoch": 1.7251806261707252, + "grad_norm": 0.6010121703147888, + "learning_rate": 2.2894592730863335e-05, + "loss": 0.6792, + "step": 9671 + }, + { + "epoch": 1.7253590223887254, + "grad_norm": 0.47702497243881226, + "learning_rate": 2.2865311749515978e-05, + "loss": 0.3775, + "step": 9672 + }, + { + "epoch": 1.7255374186067254, + "grad_norm": 0.5288291573524475, + "learning_rate": 2.2836048607229264e-05, + "loss": 0.4574, + "step": 9673 + }, + { + "epoch": 1.7257158148247256, + "grad_norm": 0.5164685845375061, + "learning_rate": 2.2806803306301583e-05, + "loss": 0.4143, + "step": 9674 + }, + { + "epoch": 1.7258942110427258, + "grad_norm": 0.5557328462600708, + "learning_rate": 2.2777575849029785e-05, + "loss": 0.5724, + "step": 9675 + }, + { + "epoch": 1.726072607260726, + "grad_norm": 0.5354199409484863, + "learning_rate": 2.2748366237709372e-05, + "loss": 0.499, + "step": 9676 + }, + { + "epoch": 1.7262510034787262, + "grad_norm": 0.5438181757926941, + "learning_rate": 2.271917447463451e-05, + "loss": 0.6275, + "step": 9677 + }, + { + "epoch": 1.7264293996967264, + "grad_norm": 0.5154425501823425, + "learning_rate": 2.2690000562097858e-05, + "loss": 0.4563, + "step": 9678 + }, + { + "epoch": 1.7266077959147266, + "grad_norm": 0.58868008852005, + "learning_rate": 2.2660844502390754e-05, + "loss": 0.632, + "step": 9679 + }, + { + "epoch": 1.7267861921327268, + "grad_norm": 0.541050136089325, + "learning_rate": 2.263170629780306e-05, + "loss": 0.59, + "step": 9680 + }, + { + "epoch": 1.726964588350727, + "grad_norm": 0.5016245245933533, + "learning_rate": 2.2602585950623367e-05, + "loss": 0.4542, + "step": 9681 + }, + { + "epoch": 1.7271429845687272, + "grad_norm": 0.5481529235839844, + "learning_rate": 2.25734834631387e-05, + "loss": 0.6274, + "step": 9682 + }, + { + "epoch": 1.7273213807867274, + "grad_norm": 0.5226790308952332, + "learning_rate": 2.2544398837634732e-05, + "loss": 0.4746, + "step": 9683 + }, + { + "epoch": 1.7274997770047276, + "grad_norm": 0.5335855484008789, + "learning_rate": 2.2515332076395862e-05, + "loss": 0.4868, + "step": 9684 + }, + { + "epoch": 1.7276781732227278, + "grad_norm": 0.5473877787590027, + "learning_rate": 2.2486283181704842e-05, + "loss": 0.4419, + "step": 9685 + }, + { + "epoch": 1.727856569440728, + "grad_norm": 0.5562229752540588, + "learning_rate": 2.2457252155843293e-05, + "loss": 0.6055, + "step": 9686 + }, + { + "epoch": 1.7280349656587282, + "grad_norm": 0.5625514984130859, + "learning_rate": 2.24282390010912e-05, + "loss": 0.5543, + "step": 9687 + }, + { + "epoch": 1.7282133618767284, + "grad_norm": 0.5255547165870667, + "learning_rate": 2.2399243719727265e-05, + "loss": 0.5099, + "step": 9688 + }, + { + "epoch": 1.7283917580947283, + "grad_norm": 0.6275233626365662, + "learning_rate": 2.237026631402883e-05, + "loss": 0.5615, + "step": 9689 + }, + { + "epoch": 1.7285701543127285, + "grad_norm": 0.5601621866226196, + "learning_rate": 2.234130678627169e-05, + "loss": 0.5324, + "step": 9690 + }, + { + "epoch": 1.7287485505307287, + "grad_norm": 0.5600411891937256, + "learning_rate": 2.231236513873039e-05, + "loss": 0.5877, + "step": 9691 + }, + { + "epoch": 1.728926946748729, + "grad_norm": 0.5259904861450195, + "learning_rate": 2.228344137367791e-05, + "loss": 0.5547, + "step": 9692 + }, + { + "epoch": 1.7291053429667291, + "grad_norm": 0.4963001012802124, + "learning_rate": 2.225453549338599e-05, + "loss": 0.4832, + "step": 9693 + }, + { + "epoch": 1.7292837391847293, + "grad_norm": 0.534983217716217, + "learning_rate": 2.222564750012479e-05, + "loss": 0.5112, + "step": 9694 + }, + { + "epoch": 1.7294621354027293, + "grad_norm": 0.47947463393211365, + "learning_rate": 2.219677739616327e-05, + "loss": 0.4251, + "step": 9695 + }, + { + "epoch": 1.7296405316207295, + "grad_norm": 0.5085344314575195, + "learning_rate": 2.216792518376884e-05, + "loss": 0.4215, + "step": 9696 + }, + { + "epoch": 1.7298189278387297, + "grad_norm": 0.48446446657180786, + "learning_rate": 2.213909086520746e-05, + "loss": 0.443, + "step": 9697 + }, + { + "epoch": 1.72999732405673, + "grad_norm": 0.4940386116504669, + "learning_rate": 2.2110274442743853e-05, + "loss": 0.4204, + "step": 9698 + }, + { + "epoch": 1.73017572027473, + "grad_norm": 0.5226745009422302, + "learning_rate": 2.2081475918641208e-05, + "loss": 0.5349, + "step": 9699 + }, + { + "epoch": 1.7303541164927303, + "grad_norm": 0.47551804780960083, + "learning_rate": 2.2052695295161407e-05, + "loss": 0.4946, + "step": 9700 + }, + { + "epoch": 1.7305325127107305, + "grad_norm": 0.5348621606826782, + "learning_rate": 2.2023932574564754e-05, + "loss": 0.5195, + "step": 9701 + }, + { + "epoch": 1.7307109089287307, + "grad_norm": 0.5494500398635864, + "learning_rate": 2.199518775911036e-05, + "loss": 0.5932, + "step": 9702 + }, + { + "epoch": 1.730889305146731, + "grad_norm": 0.5604888796806335, + "learning_rate": 2.196646085105583e-05, + "loss": 0.6247, + "step": 9703 + }, + { + "epoch": 1.731067701364731, + "grad_norm": 0.5602217316627502, + "learning_rate": 2.1937751852657285e-05, + "loss": 0.7553, + "step": 9704 + }, + { + "epoch": 1.7312460975827313, + "grad_norm": 0.5081519484519958, + "learning_rate": 2.190906076616961e-05, + "loss": 0.4189, + "step": 9705 + }, + { + "epoch": 1.7314244938007315, + "grad_norm": 0.5131904482841492, + "learning_rate": 2.188038759384611e-05, + "loss": 0.4722, + "step": 9706 + }, + { + "epoch": 1.7316028900187317, + "grad_norm": 0.47898441553115845, + "learning_rate": 2.1851732337938855e-05, + "loss": 0.393, + "step": 9707 + }, + { + "epoch": 1.7317812862367319, + "grad_norm": 0.4974140524864197, + "learning_rate": 2.1823095000698346e-05, + "loss": 0.5102, + "step": 9708 + }, + { + "epoch": 1.731959682454732, + "grad_norm": 0.5431427955627441, + "learning_rate": 2.1794475584373723e-05, + "loss": 0.4054, + "step": 9709 + }, + { + "epoch": 1.7321380786727323, + "grad_norm": 0.5118406414985657, + "learning_rate": 2.1765874091212834e-05, + "loss": 0.5844, + "step": 9710 + }, + { + "epoch": 1.7323164748907325, + "grad_norm": 0.5889133214950562, + "learning_rate": 2.1737290523461932e-05, + "loss": 0.662, + "step": 9711 + }, + { + "epoch": 1.7324948711087325, + "grad_norm": 0.5474311113357544, + "learning_rate": 2.170872488336606e-05, + "loss": 0.5537, + "step": 9712 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 0.5133411884307861, + "learning_rate": 2.1680177173168615e-05, + "loss": 0.5039, + "step": 9713 + }, + { + "epoch": 1.7328516635447329, + "grad_norm": 0.5092921257019043, + "learning_rate": 2.1651647395111884e-05, + "loss": 0.4727, + "step": 9714 + }, + { + "epoch": 1.733030059762733, + "grad_norm": 0.521981954574585, + "learning_rate": 2.1623135551436442e-05, + "loss": 0.5204, + "step": 9715 + }, + { + "epoch": 1.7332084559807333, + "grad_norm": 0.48075124621391296, + "learning_rate": 2.1594641644381684e-05, + "loss": 0.4316, + "step": 9716 + }, + { + "epoch": 1.7333868521987332, + "grad_norm": 0.5798913836479187, + "learning_rate": 2.156616567618544e-05, + "loss": 0.6439, + "step": 9717 + }, + { + "epoch": 1.7335652484167334, + "grad_norm": 0.42159613966941833, + "learning_rate": 2.153770764908425e-05, + "loss": 0.3031, + "step": 9718 + }, + { + "epoch": 1.7337436446347336, + "grad_norm": 0.7180142998695374, + "learning_rate": 2.150926756531324e-05, + "loss": 0.4069, + "step": 9719 + }, + { + "epoch": 1.7339220408527338, + "grad_norm": 0.4658437669277191, + "learning_rate": 2.148084542710596e-05, + "loss": 0.4876, + "step": 9720 + }, + { + "epoch": 1.734100437070734, + "grad_norm": 0.48459357023239136, + "learning_rate": 2.1452441236694792e-05, + "loss": 0.4056, + "step": 9721 + }, + { + "epoch": 1.7342788332887342, + "grad_norm": 0.5127313733100891, + "learning_rate": 2.142405499631056e-05, + "loss": 0.4503, + "step": 9722 + }, + { + "epoch": 1.7344572295067344, + "grad_norm": 0.47491639852523804, + "learning_rate": 2.139568670818262e-05, + "loss": 0.4279, + "step": 9723 + }, + { + "epoch": 1.7346356257247346, + "grad_norm": 0.5612772107124329, + "learning_rate": 2.136733637453911e-05, + "loss": 0.6575, + "step": 9724 + }, + { + "epoch": 1.7348140219427348, + "grad_norm": 0.566277027130127, + "learning_rate": 2.1339003997606577e-05, + "loss": 0.628, + "step": 9725 + }, + { + "epoch": 1.734992418160735, + "grad_norm": 0.5447231531143188, + "learning_rate": 2.1310689579610327e-05, + "loss": 0.5066, + "step": 9726 + }, + { + "epoch": 1.7351708143787352, + "grad_norm": 0.463008850812912, + "learning_rate": 2.1282393122774053e-05, + "loss": 0.3353, + "step": 9727 + }, + { + "epoch": 1.7353492105967354, + "grad_norm": 0.4631538689136505, + "learning_rate": 2.1254114629320227e-05, + "loss": 0.3234, + "step": 9728 + }, + { + "epoch": 1.7355276068147356, + "grad_norm": 0.5900436043739319, + "learning_rate": 2.1225854101469794e-05, + "loss": 0.5786, + "step": 9729 + }, + { + "epoch": 1.7357060030327358, + "grad_norm": 0.4738953113555908, + "learning_rate": 2.1197611541442313e-05, + "loss": 0.4288, + "step": 9730 + }, + { + "epoch": 1.735884399250736, + "grad_norm": 0.4944877028465271, + "learning_rate": 2.116938695145598e-05, + "loss": 0.3708, + "step": 9731 + }, + { + "epoch": 1.7360627954687362, + "grad_norm": 0.49892377853393555, + "learning_rate": 2.11411803337275e-05, + "loss": 0.4571, + "step": 9732 + }, + { + "epoch": 1.7362411916867364, + "grad_norm": 0.5799189209938049, + "learning_rate": 2.1112991690472234e-05, + "loss": 0.5227, + "step": 9733 + }, + { + "epoch": 1.7364195879047364, + "grad_norm": 0.5338386297225952, + "learning_rate": 2.1084821023904133e-05, + "loss": 0.4744, + "step": 9734 + }, + { + "epoch": 1.7365979841227366, + "grad_norm": 1.029298186302185, + "learning_rate": 2.1056668336235624e-05, + "loss": 0.5359, + "step": 9735 + }, + { + "epoch": 1.7367763803407368, + "grad_norm": 0.520614504814148, + "learning_rate": 2.1028533629677937e-05, + "loss": 0.5175, + "step": 9736 + }, + { + "epoch": 1.736954776558737, + "grad_norm": 0.5271267890930176, + "learning_rate": 2.1000416906440613e-05, + "loss": 0.5389, + "step": 9737 + }, + { + "epoch": 1.7371331727767372, + "grad_norm": 0.5082148313522339, + "learning_rate": 2.0972318168732048e-05, + "loss": 0.4963, + "step": 9738 + }, + { + "epoch": 1.7373115689947372, + "grad_norm": 0.42362144589424133, + "learning_rate": 2.094423741875903e-05, + "loss": 0.4004, + "step": 9739 + }, + { + "epoch": 1.7374899652127374, + "grad_norm": 0.5850268602371216, + "learning_rate": 2.0916174658727054e-05, + "loss": 0.5616, + "step": 9740 + }, + { + "epoch": 1.7376683614307376, + "grad_norm": 0.5065614581108093, + "learning_rate": 2.0888129890840102e-05, + "loss": 0.4137, + "step": 9741 + }, + { + "epoch": 1.7378467576487377, + "grad_norm": 0.535125732421875, + "learning_rate": 2.0860103117300882e-05, + "loss": 0.5246, + "step": 9742 + }, + { + "epoch": 1.738025153866738, + "grad_norm": 0.49829819798469543, + "learning_rate": 2.0832094340310554e-05, + "loss": 0.3795, + "step": 9743 + }, + { + "epoch": 1.7382035500847381, + "grad_norm": 0.5445170402526855, + "learning_rate": 2.0804103562068883e-05, + "loss": 0.5326, + "step": 9744 + }, + { + "epoch": 1.7383819463027383, + "grad_norm": 0.5501541495323181, + "learning_rate": 2.0776130784774333e-05, + "loss": 0.5399, + "step": 9745 + }, + { + "epoch": 1.7385603425207385, + "grad_norm": 0.48159241676330566, + "learning_rate": 2.0748176010623758e-05, + "loss": 0.4521, + "step": 9746 + }, + { + "epoch": 1.7387387387387387, + "grad_norm": 0.5003437399864197, + "learning_rate": 2.0720239241812848e-05, + "loss": 0.4789, + "step": 9747 + }, + { + "epoch": 1.738917134956739, + "grad_norm": 0.4868369400501251, + "learning_rate": 2.0692320480535624e-05, + "loss": 0.3965, + "step": 9748 + }, + { + "epoch": 1.7390955311747391, + "grad_norm": 0.5282782316207886, + "learning_rate": 2.066441972898489e-05, + "loss": 0.4808, + "step": 9749 + }, + { + "epoch": 1.7392739273927393, + "grad_norm": 0.4921809434890747, + "learning_rate": 2.0636536989351972e-05, + "loss": 0.458, + "step": 9750 + }, + { + "epoch": 1.7394523236107395, + "grad_norm": 0.5522050857543945, + "learning_rate": 2.0608672263826705e-05, + "loss": 0.5707, + "step": 9751 + }, + { + "epoch": 1.7396307198287397, + "grad_norm": 0.6109182834625244, + "learning_rate": 2.0580825554597644e-05, + "loss": 0.773, + "step": 9752 + }, + { + "epoch": 1.73980911604674, + "grad_norm": 0.5634390115737915, + "learning_rate": 2.055299686385176e-05, + "loss": 0.6855, + "step": 9753 + }, + { + "epoch": 1.7399875122647401, + "grad_norm": 0.47607871890068054, + "learning_rate": 2.0525186193774802e-05, + "loss": 0.4555, + "step": 9754 + }, + { + "epoch": 1.7401659084827403, + "grad_norm": 0.5108838677406311, + "learning_rate": 2.0497393546551003e-05, + "loss": 0.4981, + "step": 9755 + }, + { + "epoch": 1.7403443047007403, + "grad_norm": 0.522916316986084, + "learning_rate": 2.046961892436308e-05, + "loss": 0.5068, + "step": 9756 + }, + { + "epoch": 1.7405227009187405, + "grad_norm": 0.49157410860061646, + "learning_rate": 2.0441862329392547e-05, + "loss": 0.4636, + "step": 9757 + }, + { + "epoch": 1.7407010971367407, + "grad_norm": 0.5237782597541809, + "learning_rate": 2.0414123763819348e-05, + "loss": 0.4787, + "step": 9758 + }, + { + "epoch": 1.740879493354741, + "grad_norm": 0.521073043346405, + "learning_rate": 2.0386403229822102e-05, + "loss": 0.4384, + "step": 9759 + }, + { + "epoch": 1.741057889572741, + "grad_norm": 0.5777454376220703, + "learning_rate": 2.035870072957788e-05, + "loss": 0.6771, + "step": 9760 + }, + { + "epoch": 1.741236285790741, + "grad_norm": 0.5693924427032471, + "learning_rate": 2.0331016265262543e-05, + "loss": 0.6439, + "step": 9761 + }, + { + "epoch": 1.7414146820087413, + "grad_norm": 0.5041927099227905, + "learning_rate": 2.030334983905027e-05, + "loss": 0.4557, + "step": 9762 + }, + { + "epoch": 1.7415930782267415, + "grad_norm": 0.5025537014007568, + "learning_rate": 2.0275701453114108e-05, + "loss": 0.4614, + "step": 9763 + }, + { + "epoch": 1.7417714744447417, + "grad_norm": 0.5347465872764587, + "learning_rate": 2.024807110962551e-05, + "loss": 0.5502, + "step": 9764 + }, + { + "epoch": 1.7419498706627419, + "grad_norm": 0.4626553952693939, + "learning_rate": 2.0220458810754488e-05, + "loss": 0.3479, + "step": 9765 + }, + { + "epoch": 1.742128266880742, + "grad_norm": 0.4479665458202362, + "learning_rate": 2.019286455866981e-05, + "loss": 0.4113, + "step": 9766 + }, + { + "epoch": 1.7423066630987423, + "grad_norm": 0.5242836475372314, + "learning_rate": 2.0165288355538656e-05, + "loss": 0.522, + "step": 9767 + }, + { + "epoch": 1.7424850593167425, + "grad_norm": 0.4955342411994934, + "learning_rate": 2.01377302035268e-05, + "loss": 0.3913, + "step": 9768 + }, + { + "epoch": 1.7426634555347427, + "grad_norm": 0.5188047289848328, + "learning_rate": 2.0110190104798727e-05, + "loss": 0.4209, + "step": 9769 + }, + { + "epoch": 1.7428418517527429, + "grad_norm": 0.47911545634269714, + "learning_rate": 2.0082668061517373e-05, + "loss": 0.3811, + "step": 9770 + }, + { + "epoch": 1.743020247970743, + "grad_norm": 0.5450549125671387, + "learning_rate": 2.0055164075844345e-05, + "loss": 0.4722, + "step": 9771 + }, + { + "epoch": 1.7431986441887433, + "grad_norm": 0.5390350818634033, + "learning_rate": 2.0027678149939747e-05, + "loss": 0.558, + "step": 9772 + }, + { + "epoch": 1.7433770404067435, + "grad_norm": 0.5970989465713501, + "learning_rate": 2.0000210285962385e-05, + "loss": 0.5316, + "step": 9773 + }, + { + "epoch": 1.7435554366247437, + "grad_norm": 0.5830510258674622, + "learning_rate": 1.9972760486069498e-05, + "loss": 0.6375, + "step": 9774 + }, + { + "epoch": 1.7437338328427439, + "grad_norm": 0.5821729898452759, + "learning_rate": 1.9945328752417057e-05, + "loss": 0.5732, + "step": 9775 + }, + { + "epoch": 1.743912229060744, + "grad_norm": 0.5588756203651428, + "learning_rate": 1.9917915087159482e-05, + "loss": 0.5578, + "step": 9776 + }, + { + "epoch": 1.7440906252787443, + "grad_norm": 0.5491783618927002, + "learning_rate": 1.9890519492449798e-05, + "loss": 0.5814, + "step": 9777 + }, + { + "epoch": 1.7442690214967442, + "grad_norm": 0.5677632093429565, + "learning_rate": 1.9863141970439758e-05, + "loss": 0.5373, + "step": 9778 + }, + { + "epoch": 1.7444474177147444, + "grad_norm": 0.4712635576725006, + "learning_rate": 1.983578252327939e-05, + "loss": 0.3357, + "step": 9779 + }, + { + "epoch": 1.7446258139327446, + "grad_norm": 0.5437815189361572, + "learning_rate": 1.9808441153117723e-05, + "loss": 0.5909, + "step": 9780 + }, + { + "epoch": 1.7448042101507448, + "grad_norm": 0.49370652437210083, + "learning_rate": 1.9781117862102045e-05, + "loss": 0.5412, + "step": 9781 + }, + { + "epoch": 1.744982606368745, + "grad_norm": 0.5577530860900879, + "learning_rate": 1.9753812652378217e-05, + "loss": 0.672, + "step": 9782 + }, + { + "epoch": 1.745161002586745, + "grad_norm": 0.5543391108512878, + "learning_rate": 1.9726525526090917e-05, + "loss": 0.5612, + "step": 9783 + }, + { + "epoch": 1.7453393988047452, + "grad_norm": 0.4798511564731598, + "learning_rate": 1.9699256485383177e-05, + "loss": 0.4455, + "step": 9784 + }, + { + "epoch": 1.7455177950227454, + "grad_norm": 0.5388596057891846, + "learning_rate": 1.967200553239676e-05, + "loss": 0.4951, + "step": 9785 + }, + { + "epoch": 1.7456961912407456, + "grad_norm": 0.4370432496070862, + "learning_rate": 1.9644772669271894e-05, + "loss": 0.3437, + "step": 9786 + }, + { + "epoch": 1.7458745874587458, + "grad_norm": 0.5266339182853699, + "learning_rate": 1.9617557898147454e-05, + "loss": 0.5436, + "step": 9787 + }, + { + "epoch": 1.746052983676746, + "grad_norm": 0.4976480305194855, + "learning_rate": 1.9590361221160897e-05, + "loss": 0.396, + "step": 9788 + }, + { + "epoch": 1.7462313798947462, + "grad_norm": 0.5532608032226562, + "learning_rate": 1.956318264044818e-05, + "loss": 0.5835, + "step": 9789 + }, + { + "epoch": 1.7464097761127464, + "grad_norm": 0.45472416281700134, + "learning_rate": 1.9536022158143956e-05, + "loss": 0.3592, + "step": 9790 + }, + { + "epoch": 1.7465881723307466, + "grad_norm": 0.5193656086921692, + "learning_rate": 1.9508879776381355e-05, + "loss": 0.4665, + "step": 9791 + }, + { + "epoch": 1.7467665685487468, + "grad_norm": 0.5724708437919617, + "learning_rate": 1.948175549729217e-05, + "loss": 0.5152, + "step": 9792 + }, + { + "epoch": 1.746944964766747, + "grad_norm": 0.5208200216293335, + "learning_rate": 1.945464932300667e-05, + "loss": 0.6308, + "step": 9793 + }, + { + "epoch": 1.7471233609847472, + "grad_norm": 0.4895492494106293, + "learning_rate": 1.9427561255653815e-05, + "loss": 0.5369, + "step": 9794 + }, + { + "epoch": 1.7473017572027474, + "grad_norm": 0.46594980359077454, + "learning_rate": 1.94004912973611e-05, + "loss": 0.4012, + "step": 9795 + }, + { + "epoch": 1.7474801534207476, + "grad_norm": 0.5352393984794617, + "learning_rate": 1.937343945025455e-05, + "loss": 0.5697, + "step": 9796 + }, + { + "epoch": 1.7476585496387478, + "grad_norm": 0.5204436182975769, + "learning_rate": 1.934640571645882e-05, + "loss": 0.564, + "step": 9797 + }, + { + "epoch": 1.747836945856748, + "grad_norm": 0.5590188503265381, + "learning_rate": 1.9319390098097108e-05, + "loss": 0.5922, + "step": 9798 + }, + { + "epoch": 1.7480153420747482, + "grad_norm": 0.5261103510856628, + "learning_rate": 1.9292392597291293e-05, + "loss": 0.3612, + "step": 9799 + }, + { + "epoch": 1.7481937382927482, + "grad_norm": 0.4883411228656769, + "learning_rate": 1.9265413216161598e-05, + "loss": 0.4208, + "step": 9800 + }, + { + "epoch": 1.7483721345107484, + "grad_norm": 0.5407469272613525, + "learning_rate": 1.9238451956827135e-05, + "loss": 0.5281, + "step": 9801 + }, + { + "epoch": 1.7485505307287486, + "grad_norm": 0.5203365087509155, + "learning_rate": 1.921150882140532e-05, + "loss": 0.4411, + "step": 9802 + }, + { + "epoch": 1.7487289269467488, + "grad_norm": 0.5493040084838867, + "learning_rate": 1.9184583812012268e-05, + "loss": 0.5184, + "step": 9803 + }, + { + "epoch": 1.748907323164749, + "grad_norm": 0.41875502467155457, + "learning_rate": 1.9157676930762702e-05, + "loss": 0.3174, + "step": 9804 + }, + { + "epoch": 1.749085719382749, + "grad_norm": 0.5331047177314758, + "learning_rate": 1.913078817976982e-05, + "loss": 0.5325, + "step": 9805 + }, + { + "epoch": 1.7492641156007491, + "grad_norm": 0.5197538137435913, + "learning_rate": 1.9103917561145516e-05, + "loss": 0.4778, + "step": 9806 + }, + { + "epoch": 1.7494425118187493, + "grad_norm": 0.5139304995536804, + "learning_rate": 1.9077065077000104e-05, + "loss": 0.5117, + "step": 9807 + }, + { + "epoch": 1.7496209080367495, + "grad_norm": 0.540160596370697, + "learning_rate": 1.9050230729442702e-05, + "loss": 0.4415, + "step": 9808 + }, + { + "epoch": 1.7497993042547497, + "grad_norm": 0.5008589029312134, + "learning_rate": 1.9023414520580733e-05, + "loss": 0.4228, + "step": 9809 + }, + { + "epoch": 1.74997770047275, + "grad_norm": 0.5717454552650452, + "learning_rate": 1.8996616452520404e-05, + "loss": 0.5889, + "step": 9810 + }, + { + "epoch": 1.7501560966907501, + "grad_norm": 0.5014664530754089, + "learning_rate": 1.8969836527366448e-05, + "loss": 0.4555, + "step": 9811 + }, + { + "epoch": 1.7503344929087503, + "grad_norm": 0.5449042320251465, + "learning_rate": 1.8943074747222068e-05, + "loss": 0.5205, + "step": 9812 + }, + { + "epoch": 1.7505128891267505, + "grad_norm": 0.49682438373565674, + "learning_rate": 1.8916331114189195e-05, + "loss": 0.4651, + "step": 9813 + }, + { + "epoch": 1.7506912853447507, + "grad_norm": 0.5355203151702881, + "learning_rate": 1.8889605630368235e-05, + "loss": 0.5275, + "step": 9814 + }, + { + "epoch": 1.750869681562751, + "grad_norm": 0.5955283045768738, + "learning_rate": 1.8862898297858173e-05, + "loss": 0.6346, + "step": 9815 + }, + { + "epoch": 1.751048077780751, + "grad_norm": 0.5433852672576904, + "learning_rate": 1.8836209118756637e-05, + "loss": 0.5114, + "step": 9816 + }, + { + "epoch": 1.7512264739987513, + "grad_norm": 0.5149011611938477, + "learning_rate": 1.8809538095159727e-05, + "loss": 0.4793, + "step": 9817 + }, + { + "epoch": 1.7514048702167515, + "grad_norm": 0.4862879514694214, + "learning_rate": 1.8782885229162245e-05, + "loss": 0.3931, + "step": 9818 + }, + { + "epoch": 1.7515832664347517, + "grad_norm": 0.6665529012680054, + "learning_rate": 1.8756250522857397e-05, + "loss": 0.8526, + "step": 9819 + }, + { + "epoch": 1.751761662652752, + "grad_norm": 0.5120258927345276, + "learning_rate": 1.8729633978337184e-05, + "loss": 0.5787, + "step": 9820 + }, + { + "epoch": 1.751940058870752, + "grad_norm": 0.5091425776481628, + "learning_rate": 1.870303559769196e-05, + "loss": 0.4539, + "step": 9821 + }, + { + "epoch": 1.752118455088752, + "grad_norm": 0.5244202017784119, + "learning_rate": 1.86764553830108e-05, + "loss": 0.5742, + "step": 9822 + }, + { + "epoch": 1.7522968513067523, + "grad_norm": 0.4686015546321869, + "learning_rate": 1.8649893336381313e-05, + "loss": 0.4521, + "step": 9823 + }, + { + "epoch": 1.7524752475247525, + "grad_norm": 0.5084050893783569, + "learning_rate": 1.8623349459889582e-05, + "loss": 0.487, + "step": 9824 + }, + { + "epoch": 1.7526536437427527, + "grad_norm": 0.4738958477973938, + "learning_rate": 1.859682375562044e-05, + "loss": 0.3566, + "step": 9825 + }, + { + "epoch": 1.7528320399607529, + "grad_norm": 0.5699481964111328, + "learning_rate": 1.857031622565722e-05, + "loss": 0.5028, + "step": 9826 + }, + { + "epoch": 1.7530104361787529, + "grad_norm": 0.4751138985157013, + "learning_rate": 1.85438268720817e-05, + "loss": 0.3851, + "step": 9827 + }, + { + "epoch": 1.753188832396753, + "grad_norm": 0.5180573463439941, + "learning_rate": 1.851735569697449e-05, + "loss": 0.4933, + "step": 9828 + }, + { + "epoch": 1.7533672286147532, + "grad_norm": 0.5232348442077637, + "learning_rate": 1.849090270241449e-05, + "loss": 0.4913, + "step": 9829 + }, + { + "epoch": 1.7535456248327534, + "grad_norm": 0.43816691637039185, + "learning_rate": 1.8464467890479397e-05, + "loss": 0.3977, + "step": 9830 + }, + { + "epoch": 1.7537240210507536, + "grad_norm": 0.45382770895957947, + "learning_rate": 1.8438051263245326e-05, + "loss": 0.3, + "step": 9831 + }, + { + "epoch": 1.7539024172687538, + "grad_norm": 0.5431753396987915, + "learning_rate": 1.8411652822787118e-05, + "loss": 0.6459, + "step": 9832 + }, + { + "epoch": 1.754080813486754, + "grad_norm": 0.4964623749256134, + "learning_rate": 1.8385272571177974e-05, + "loss": 0.5321, + "step": 9833 + }, + { + "epoch": 1.7542592097047542, + "grad_norm": 0.5045402646064758, + "learning_rate": 1.8358910510489907e-05, + "loss": 0.4846, + "step": 9834 + }, + { + "epoch": 1.7544376059227544, + "grad_norm": 0.4275517761707306, + "learning_rate": 1.8332566642793312e-05, + "loss": 0.3162, + "step": 9835 + }, + { + "epoch": 1.7546160021407546, + "grad_norm": 0.5664304494857788, + "learning_rate": 1.8306240970157206e-05, + "loss": 0.5896, + "step": 9836 + }, + { + "epoch": 1.7547943983587548, + "grad_norm": 0.5230024456977844, + "learning_rate": 1.8279933494649265e-05, + "loss": 0.414, + "step": 9837 + }, + { + "epoch": 1.754972794576755, + "grad_norm": 0.5773944854736328, + "learning_rate": 1.8253644218335584e-05, + "loss": 0.557, + "step": 9838 + }, + { + "epoch": 1.7551511907947552, + "grad_norm": 0.5354689359664917, + "learning_rate": 1.8227373143281017e-05, + "loss": 0.516, + "step": 9839 + }, + { + "epoch": 1.7553295870127554, + "grad_norm": 0.47697752714157104, + "learning_rate": 1.820112027154877e-05, + "loss": 0.4222, + "step": 9840 + }, + { + "epoch": 1.7555079832307556, + "grad_norm": 0.5502009391784668, + "learning_rate": 1.8174885605200782e-05, + "loss": 0.4705, + "step": 9841 + }, + { + "epoch": 1.7556863794487558, + "grad_norm": 0.4850895404815674, + "learning_rate": 1.8148669146297565e-05, + "loss": 0.4318, + "step": 9842 + }, + { + "epoch": 1.755864775666756, + "grad_norm": 0.5271044969558716, + "learning_rate": 1.8122470896898057e-05, + "loss": 0.4357, + "step": 9843 + }, + { + "epoch": 1.756043171884756, + "grad_norm": 0.5360130071640015, + "learning_rate": 1.809629085905992e-05, + "loss": 0.4743, + "step": 9844 + }, + { + "epoch": 1.7562215681027562, + "grad_norm": 0.5907223224639893, + "learning_rate": 1.807012903483929e-05, + "loss": 0.4996, + "step": 9845 + }, + { + "epoch": 1.7563999643207564, + "grad_norm": 0.5198745131492615, + "learning_rate": 1.8043985426290958e-05, + "loss": 0.5376, + "step": 9846 + }, + { + "epoch": 1.7565783605387566, + "grad_norm": 0.7061692476272583, + "learning_rate": 1.801786003546818e-05, + "loss": 0.5927, + "step": 9847 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.4907400906085968, + "learning_rate": 1.799175286442281e-05, + "loss": 0.4041, + "step": 9848 + }, + { + "epoch": 1.7569351529747568, + "grad_norm": 0.4804750382900238, + "learning_rate": 1.7965663915205376e-05, + "loss": 0.4552, + "step": 9849 + }, + { + "epoch": 1.757113549192757, + "grad_norm": 0.5161197781562805, + "learning_rate": 1.7939593189864794e-05, + "loss": 0.4615, + "step": 9850 + }, + { + "epoch": 1.7572919454107572, + "grad_norm": 0.5448468327522278, + "learning_rate": 1.791354069044876e-05, + "loss": 0.4771, + "step": 9851 + }, + { + "epoch": 1.7574703416287574, + "grad_norm": 0.5089532136917114, + "learning_rate": 1.7887506419003303e-05, + "loss": 0.4607, + "step": 9852 + }, + { + "epoch": 1.7576487378467576, + "grad_norm": 0.49231788516044617, + "learning_rate": 1.786149037757326e-05, + "loss": 0.5082, + "step": 9853 + }, + { + "epoch": 1.7578271340647578, + "grad_norm": 0.5540265440940857, + "learning_rate": 1.78354925682018e-05, + "loss": 0.5348, + "step": 9854 + }, + { + "epoch": 1.758005530282758, + "grad_norm": 0.6183173060417175, + "learning_rate": 1.7809512992930875e-05, + "loss": 0.6396, + "step": 9855 + }, + { + "epoch": 1.7581839265007582, + "grad_norm": 0.4565494656562805, + "learning_rate": 1.778355165380091e-05, + "loss": 0.407, + "step": 9856 + }, + { + "epoch": 1.7583623227187584, + "grad_norm": 0.5190179347991943, + "learning_rate": 1.7757608552850828e-05, + "loss": 0.499, + "step": 9857 + }, + { + "epoch": 1.7585407189367586, + "grad_norm": 0.5426142811775208, + "learning_rate": 1.7731683692118277e-05, + "loss": 0.5268, + "step": 9858 + }, + { + "epoch": 1.7587191151547588, + "grad_norm": 0.5767265558242798, + "learning_rate": 1.770577707363927e-05, + "loss": 0.4803, + "step": 9859 + }, + { + "epoch": 1.758897511372759, + "grad_norm": 0.5167482495307922, + "learning_rate": 1.7679888699448644e-05, + "loss": 0.5015, + "step": 9860 + }, + { + "epoch": 1.7590759075907592, + "grad_norm": 0.4636717736721039, + "learning_rate": 1.7654018571579554e-05, + "loss": 0.3958, + "step": 9861 + }, + { + "epoch": 1.7592543038087594, + "grad_norm": 0.504102349281311, + "learning_rate": 1.7628166692063823e-05, + "loss": 0.3926, + "step": 9862 + }, + { + "epoch": 1.7594327000267596, + "grad_norm": 0.5384790301322937, + "learning_rate": 1.7602333062931935e-05, + "loss": 0.4652, + "step": 9863 + }, + { + "epoch": 1.7596110962447598, + "grad_norm": 0.48373332619667053, + "learning_rate": 1.757651768621274e-05, + "loss": 0.3906, + "step": 9864 + }, + { + "epoch": 1.75978949246276, + "grad_norm": 0.4555506110191345, + "learning_rate": 1.755072056393389e-05, + "loss": 0.3741, + "step": 9865 + }, + { + "epoch": 1.75996788868076, + "grad_norm": 0.6488031148910522, + "learning_rate": 1.7524941698121354e-05, + "loss": 0.7711, + "step": 9866 + }, + { + "epoch": 1.7601462848987601, + "grad_norm": 0.5170810222625732, + "learning_rate": 1.7499181090799928e-05, + "loss": 0.4957, + "step": 9867 + }, + { + "epoch": 1.7603246811167603, + "grad_norm": 0.5525889992713928, + "learning_rate": 1.7473438743992736e-05, + "loss": 0.6491, + "step": 9868 + }, + { + "epoch": 1.7605030773347605, + "grad_norm": 0.4711996018886566, + "learning_rate": 1.7447714659721586e-05, + "loss": 0.4518, + "step": 9869 + }, + { + "epoch": 1.7606814735527607, + "grad_norm": 0.43584319949150085, + "learning_rate": 1.742200884000686e-05, + "loss": 0.3699, + "step": 9870 + }, + { + "epoch": 1.7608598697707607, + "grad_norm": 0.49232321977615356, + "learning_rate": 1.7396321286867412e-05, + "loss": 0.4172, + "step": 9871 + }, + { + "epoch": 1.761038265988761, + "grad_norm": 0.6322119235992432, + "learning_rate": 1.737065200232088e-05, + "loss": 0.5614, + "step": 9872 + }, + { + "epoch": 1.761216662206761, + "grad_norm": 0.46853572130203247, + "learning_rate": 1.7345000988383208e-05, + "loss": 0.3119, + "step": 9873 + }, + { + "epoch": 1.7613950584247613, + "grad_norm": 0.5115934610366821, + "learning_rate": 1.7319368247069005e-05, + "loss": 0.5075, + "step": 9874 + }, + { + "epoch": 1.7615734546427615, + "grad_norm": 0.524381697177887, + "learning_rate": 1.729375378039155e-05, + "loss": 0.4831, + "step": 9875 + }, + { + "epoch": 1.7617518508607617, + "grad_norm": 0.5672167539596558, + "learning_rate": 1.7268157590362487e-05, + "loss": 0.5661, + "step": 9876 + }, + { + "epoch": 1.761930247078762, + "grad_norm": 0.5367761850357056, + "learning_rate": 1.7242579678992204e-05, + "loss": 0.5691, + "step": 9877 + }, + { + "epoch": 1.762108643296762, + "grad_norm": 0.5382869243621826, + "learning_rate": 1.721702004828951e-05, + "loss": 0.4331, + "step": 9878 + }, + { + "epoch": 1.7622870395147623, + "grad_norm": 0.4912665784358978, + "learning_rate": 1.719147870026194e-05, + "loss": 0.4589, + "step": 9879 + }, + { + "epoch": 1.7624654357327625, + "grad_norm": 0.45093002915382385, + "learning_rate": 1.716595563691539e-05, + "loss": 0.3703, + "step": 9880 + }, + { + "epoch": 1.7626438319507627, + "grad_norm": 0.614578127861023, + "learning_rate": 1.7140450860254535e-05, + "loss": 0.5906, + "step": 9881 + }, + { + "epoch": 1.7628222281687629, + "grad_norm": 0.45444604754447937, + "learning_rate": 1.7114964372282466e-05, + "loss": 0.4004, + "step": 9882 + }, + { + "epoch": 1.763000624386763, + "grad_norm": 0.5366213321685791, + "learning_rate": 1.708949617500083e-05, + "loss": 0.4764, + "step": 9883 + }, + { + "epoch": 1.7631790206047633, + "grad_norm": 0.48068365454673767, + "learning_rate": 1.7064046270409973e-05, + "loss": 0.447, + "step": 9884 + }, + { + "epoch": 1.7633574168227635, + "grad_norm": 0.5525119304656982, + "learning_rate": 1.7038614660508657e-05, + "loss": 0.6492, + "step": 9885 + }, + { + "epoch": 1.7635358130407637, + "grad_norm": 0.5292766094207764, + "learning_rate": 1.7013201347294284e-05, + "loss": 0.4177, + "step": 9886 + }, + { + "epoch": 1.7637142092587639, + "grad_norm": 0.48070111870765686, + "learning_rate": 1.698780633276284e-05, + "loss": 0.3513, + "step": 9887 + }, + { + "epoch": 1.7638926054767639, + "grad_norm": 0.4753524363040924, + "learning_rate": 1.6962429618908785e-05, + "loss": 0.4376, + "step": 9888 + }, + { + "epoch": 1.764071001694764, + "grad_norm": 0.5445675849914551, + "learning_rate": 1.693707120772528e-05, + "loss": 0.5026, + "step": 9889 + }, + { + "epoch": 1.7642493979127643, + "grad_norm": 0.5858935713768005, + "learning_rate": 1.6911731101203863e-05, + "loss": 0.7673, + "step": 9890 + }, + { + "epoch": 1.7644277941307644, + "grad_norm": 0.5799805521965027, + "learning_rate": 1.688640930133481e-05, + "loss": 0.6229, + "step": 9891 + }, + { + "epoch": 1.7646061903487646, + "grad_norm": 0.5168468952178955, + "learning_rate": 1.6861105810106804e-05, + "loss": 0.4652, + "step": 9892 + }, + { + "epoch": 1.7647845865667646, + "grad_norm": 0.5596296787261963, + "learning_rate": 1.683582062950728e-05, + "loss": 0.6326, + "step": 9893 + }, + { + "epoch": 1.7649629827847648, + "grad_norm": 0.5667998194694519, + "learning_rate": 1.6810553761522047e-05, + "loss": 0.5322, + "step": 9894 + }, + { + "epoch": 1.765141379002765, + "grad_norm": 0.5282207131385803, + "learning_rate": 1.6785305208135538e-05, + "loss": 0.4871, + "step": 9895 + }, + { + "epoch": 1.7653197752207652, + "grad_norm": 0.5011943578720093, + "learning_rate": 1.6760074971330863e-05, + "loss": 0.4561, + "step": 9896 + }, + { + "epoch": 1.7654981714387654, + "grad_norm": 0.49850714206695557, + "learning_rate": 1.6734863053089467e-05, + "loss": 0.4113, + "step": 9897 + }, + { + "epoch": 1.7656765676567656, + "grad_norm": 0.5289611220359802, + "learning_rate": 1.670966945539157e-05, + "loss": 0.5378, + "step": 9898 + }, + { + "epoch": 1.7658549638747658, + "grad_norm": 0.5059836506843567, + "learning_rate": 1.6684494180215837e-05, + "loss": 0.4601, + "step": 9899 + }, + { + "epoch": 1.766033360092766, + "grad_norm": 0.577877402305603, + "learning_rate": 1.6659337229539525e-05, + "loss": 0.6335, + "step": 9900 + }, + { + "epoch": 1.7662117563107662, + "grad_norm": 0.454922080039978, + "learning_rate": 1.6634198605338437e-05, + "loss": 0.3668, + "step": 9901 + }, + { + "epoch": 1.7663901525287664, + "grad_norm": 0.5340895652770996, + "learning_rate": 1.6609078309586967e-05, + "loss": 0.4897, + "step": 9902 + }, + { + "epoch": 1.7665685487467666, + "grad_norm": 0.5871651768684387, + "learning_rate": 1.6583976344258097e-05, + "loss": 0.7127, + "step": 9903 + }, + { + "epoch": 1.7667469449647668, + "grad_norm": 0.48758915066719055, + "learning_rate": 1.6558892711323215e-05, + "loss": 0.4231, + "step": 9904 + }, + { + "epoch": 1.766925341182767, + "grad_norm": 0.5873585343360901, + "learning_rate": 1.65338274127525e-05, + "loss": 0.6313, + "step": 9905 + }, + { + "epoch": 1.7671037374007672, + "grad_norm": 0.5112351775169373, + "learning_rate": 1.6508780450514516e-05, + "loss": 0.4986, + "step": 9906 + }, + { + "epoch": 1.7672821336187674, + "grad_norm": 0.5402355194091797, + "learning_rate": 1.6483751826576382e-05, + "loss": 0.6041, + "step": 9907 + }, + { + "epoch": 1.7674605298367676, + "grad_norm": 0.925947368144989, + "learning_rate": 1.6458741542903942e-05, + "loss": 0.4839, + "step": 9908 + }, + { + "epoch": 1.7676389260547678, + "grad_norm": 0.5026435256004333, + "learning_rate": 1.6433749601461378e-05, + "loss": 0.4032, + "step": 9909 + }, + { + "epoch": 1.7678173222727678, + "grad_norm": 0.5951597690582275, + "learning_rate": 1.6408776004211674e-05, + "loss": 0.5571, + "step": 9910 + }, + { + "epoch": 1.767995718490768, + "grad_norm": 0.4633539915084839, + "learning_rate": 1.6383820753116118e-05, + "loss": 0.4171, + "step": 9911 + }, + { + "epoch": 1.7681741147087682, + "grad_norm": 0.5063800811767578, + "learning_rate": 1.6358883850134816e-05, + "loss": 0.3949, + "step": 9912 + }, + { + "epoch": 1.7683525109267684, + "grad_norm": 0.5754553079605103, + "learning_rate": 1.6333965297226166e-05, + "loss": 0.6068, + "step": 9913 + }, + { + "epoch": 1.7685309071447686, + "grad_norm": 0.477913498878479, + "learning_rate": 1.6309065096347386e-05, + "loss": 0.4111, + "step": 9914 + }, + { + "epoch": 1.7687093033627685, + "grad_norm": 0.49996158480644226, + "learning_rate": 1.6284183249454048e-05, + "loss": 0.3918, + "step": 9915 + }, + { + "epoch": 1.7688876995807687, + "grad_norm": 0.6070882678031921, + "learning_rate": 1.6259319758500312e-05, + "loss": 0.6542, + "step": 9916 + }, + { + "epoch": 1.769066095798769, + "grad_norm": 0.5135297179222107, + "learning_rate": 1.6234474625439117e-05, + "loss": 0.4433, + "step": 9917 + }, + { + "epoch": 1.7692444920167691, + "grad_norm": 0.48473936319351196, + "learning_rate": 1.620964785222162e-05, + "loss": 0.5045, + "step": 9918 + }, + { + "epoch": 1.7694228882347693, + "grad_norm": 0.4841310679912567, + "learning_rate": 1.618483944079782e-05, + "loss": 0.3964, + "step": 9919 + }, + { + "epoch": 1.7696012844527695, + "grad_norm": 0.54714035987854, + "learning_rate": 1.6160049393116104e-05, + "loss": 0.5497, + "step": 9920 + }, + { + "epoch": 1.7697796806707697, + "grad_norm": 0.49543190002441406, + "learning_rate": 1.6135277711123443e-05, + "loss": 0.5597, + "step": 9921 + }, + { + "epoch": 1.76995807688877, + "grad_norm": 0.5189563035964966, + "learning_rate": 1.6110524396765496e-05, + "loss": 0.4857, + "step": 9922 + }, + { + "epoch": 1.7701364731067701, + "grad_norm": 0.5028755068778992, + "learning_rate": 1.6085789451986245e-05, + "loss": 0.4835, + "step": 9923 + }, + { + "epoch": 1.7703148693247703, + "grad_norm": 0.5071883201599121, + "learning_rate": 1.606107287872846e-05, + "loss": 0.5195, + "step": 9924 + }, + { + "epoch": 1.7704932655427705, + "grad_norm": 0.5900061130523682, + "learning_rate": 1.603637467893332e-05, + "loss": 0.6589, + "step": 9925 + }, + { + "epoch": 1.7706716617607707, + "grad_norm": 0.644641637802124, + "learning_rate": 1.6011694854540683e-05, + "loss": 0.4762, + "step": 9926 + }, + { + "epoch": 1.770850057978771, + "grad_norm": 0.543786883354187, + "learning_rate": 1.5987033407488806e-05, + "loss": 0.5394, + "step": 9927 + }, + { + "epoch": 1.7710284541967711, + "grad_norm": 0.5951870083808899, + "learning_rate": 1.5962390339714613e-05, + "loss": 0.6289, + "step": 9928 + }, + { + "epoch": 1.7712068504147713, + "grad_norm": 0.5071197748184204, + "learning_rate": 1.593776565315358e-05, + "loss": 0.4177, + "step": 9929 + }, + { + "epoch": 1.7713852466327715, + "grad_norm": 0.4966924786567688, + "learning_rate": 1.591315934973969e-05, + "loss": 0.4219, + "step": 9930 + }, + { + "epoch": 1.7715636428507717, + "grad_norm": 0.4919562339782715, + "learning_rate": 1.588857143140554e-05, + "loss": 0.4038, + "step": 9931 + }, + { + "epoch": 1.7717420390687717, + "grad_norm": 0.59256911277771, + "learning_rate": 1.5864001900082247e-05, + "loss": 0.5511, + "step": 9932 + }, + { + "epoch": 1.771920435286772, + "grad_norm": 0.5095023512840271, + "learning_rate": 1.5839450757699465e-05, + "loss": 0.5277, + "step": 9933 + }, + { + "epoch": 1.772098831504772, + "grad_norm": 0.5355534553527832, + "learning_rate": 1.581491800618548e-05, + "loss": 0.4272, + "step": 9934 + }, + { + "epoch": 1.7722772277227723, + "grad_norm": 0.5533571839332581, + "learning_rate": 1.5790403647467033e-05, + "loss": 0.5586, + "step": 9935 + }, + { + "epoch": 1.7724556239407725, + "grad_norm": 0.45834532380104065, + "learning_rate": 1.5765907683469527e-05, + "loss": 0.4001, + "step": 9936 + }, + { + "epoch": 1.7726340201587725, + "grad_norm": 0.5351871848106384, + "learning_rate": 1.5741430116116813e-05, + "loss": 0.5495, + "step": 9937 + }, + { + "epoch": 1.7728124163767727, + "grad_norm": 0.48931190371513367, + "learning_rate": 1.5716970947331376e-05, + "loss": 0.4277, + "step": 9938 + }, + { + "epoch": 1.7729908125947729, + "grad_norm": 0.48030897974967957, + "learning_rate": 1.5692530179034215e-05, + "loss": 0.4325, + "step": 9939 + }, + { + "epoch": 1.773169208812773, + "grad_norm": 0.5139449238777161, + "learning_rate": 1.5668107813144927e-05, + "loss": 0.5081, + "step": 9940 + }, + { + "epoch": 1.7733476050307733, + "grad_norm": 0.5078691840171814, + "learning_rate": 1.564370385158159e-05, + "loss": 0.3923, + "step": 9941 + }, + { + "epoch": 1.7735260012487735, + "grad_norm": 0.4742141664028168, + "learning_rate": 1.5619318296260897e-05, + "loss": 0.4197, + "step": 9942 + }, + { + "epoch": 1.7737043974667737, + "grad_norm": 0.5286515355110168, + "learning_rate": 1.5594951149098092e-05, + "loss": 0.4823, + "step": 9943 + }, + { + "epoch": 1.7738827936847739, + "grad_norm": 0.481815904378891, + "learning_rate": 1.5570602412006944e-05, + "loss": 0.4204, + "step": 9944 + }, + { + "epoch": 1.774061189902774, + "grad_norm": 0.6102718114852905, + "learning_rate": 1.554627208689982e-05, + "loss": 0.6736, + "step": 9945 + }, + { + "epoch": 1.7742395861207743, + "grad_norm": 0.534480094909668, + "learning_rate": 1.552196017568755e-05, + "loss": 0.4361, + "step": 9946 + }, + { + "epoch": 1.7744179823387745, + "grad_norm": 0.4984724521636963, + "learning_rate": 1.549766668027963e-05, + "loss": 0.4064, + "step": 9947 + }, + { + "epoch": 1.7745963785567747, + "grad_norm": 0.5009212493896484, + "learning_rate": 1.5473391602584096e-05, + "loss": 0.437, + "step": 9948 + }, + { + "epoch": 1.7747747747747749, + "grad_norm": 0.5126849412918091, + "learning_rate": 1.544913494450742e-05, + "loss": 0.4619, + "step": 9949 + }, + { + "epoch": 1.774953170992775, + "grad_norm": 0.5047542452812195, + "learning_rate": 1.5424896707954773e-05, + "loss": 0.451, + "step": 9950 + }, + { + "epoch": 1.7751315672107753, + "grad_norm": 0.5584752559661865, + "learning_rate": 1.5400676894829767e-05, + "loss": 0.5518, + "step": 9951 + }, + { + "epoch": 1.7753099634287755, + "grad_norm": 0.5159489512443542, + "learning_rate": 1.5376475507034694e-05, + "loss": 0.4518, + "step": 9952 + }, + { + "epoch": 1.7754883596467756, + "grad_norm": 0.589447021484375, + "learning_rate": 1.535229254647025e-05, + "loss": 0.5093, + "step": 9953 + }, + { + "epoch": 1.7756667558647756, + "grad_norm": 0.5014997720718384, + "learning_rate": 1.5328128015035746e-05, + "loss": 0.3893, + "step": 9954 + }, + { + "epoch": 1.7758451520827758, + "grad_norm": 0.5109811425209045, + "learning_rate": 1.5303981914629117e-05, + "loss": 0.4326, + "step": 9955 + }, + { + "epoch": 1.776023548300776, + "grad_norm": 0.5431187152862549, + "learning_rate": 1.52798542471467e-05, + "loss": 0.4706, + "step": 9956 + }, + { + "epoch": 1.7762019445187762, + "grad_norm": 0.6107049584388733, + "learning_rate": 1.5255745014483569e-05, + "loss": 0.4079, + "step": 9957 + }, + { + "epoch": 1.7763803407367764, + "grad_norm": 0.5987387299537659, + "learning_rate": 1.5231654218533175e-05, + "loss": 0.5319, + "step": 9958 + }, + { + "epoch": 1.7765587369547764, + "grad_norm": 0.5544418096542358, + "learning_rate": 1.5207581861187647e-05, + "loss": 0.4959, + "step": 9959 + }, + { + "epoch": 1.7767371331727766, + "grad_norm": 0.6012864112854004, + "learning_rate": 1.5183527944337583e-05, + "loss": 0.5722, + "step": 9960 + }, + { + "epoch": 1.7769155293907768, + "grad_norm": 0.5390208959579468, + "learning_rate": 1.5159492469872221e-05, + "loss": 0.6233, + "step": 9961 + }, + { + "epoch": 1.777093925608777, + "grad_norm": 0.5232547521591187, + "learning_rate": 1.513547543967922e-05, + "loss": 0.5397, + "step": 9962 + }, + { + "epoch": 1.7772723218267772, + "grad_norm": 0.508708655834198, + "learning_rate": 1.5111476855644901e-05, + "loss": 0.4217, + "step": 9963 + }, + { + "epoch": 1.7774507180447774, + "grad_norm": 0.49109214544296265, + "learning_rate": 1.5087496719654149e-05, + "loss": 0.4588, + "step": 9964 + }, + { + "epoch": 1.7776291142627776, + "grad_norm": 0.4775922894477844, + "learning_rate": 1.5063535033590287e-05, + "loss": 0.4091, + "step": 9965 + }, + { + "epoch": 1.7778075104807778, + "grad_norm": 0.4845198392868042, + "learning_rate": 1.5039591799335312e-05, + "loss": 0.374, + "step": 9966 + }, + { + "epoch": 1.777985906698778, + "grad_norm": 0.48541465401649475, + "learning_rate": 1.5015667018769692e-05, + "loss": 0.5168, + "step": 9967 + }, + { + "epoch": 1.7781643029167782, + "grad_norm": 0.5408685803413391, + "learning_rate": 1.4991760693772422e-05, + "loss": 0.617, + "step": 9968 + }, + { + "epoch": 1.7783426991347784, + "grad_norm": 0.48128771781921387, + "learning_rate": 1.4967872826221168e-05, + "loss": 0.4295, + "step": 9969 + }, + { + "epoch": 1.7785210953527786, + "grad_norm": 0.49828970432281494, + "learning_rate": 1.4944003417992014e-05, + "loss": 0.4724, + "step": 9970 + }, + { + "epoch": 1.7786994915707788, + "grad_norm": 0.487578809261322, + "learning_rate": 1.4920152470959707e-05, + "loss": 0.4936, + "step": 9971 + }, + { + "epoch": 1.778877887788779, + "grad_norm": 0.47483158111572266, + "learning_rate": 1.489631998699742e-05, + "loss": 0.4905, + "step": 9972 + }, + { + "epoch": 1.7790562840067792, + "grad_norm": 0.4644508957862854, + "learning_rate": 1.487250596797704e-05, + "loss": 0.4285, + "step": 9973 + }, + { + "epoch": 1.7792346802247794, + "grad_norm": 0.5382997989654541, + "learning_rate": 1.4848710415768824e-05, + "loss": 0.5722, + "step": 9974 + }, + { + "epoch": 1.7794130764427796, + "grad_norm": 0.5405296087265015, + "learning_rate": 1.4824933332241692e-05, + "loss": 0.4465, + "step": 9975 + }, + { + "epoch": 1.7795914726607795, + "grad_norm": 0.526763916015625, + "learning_rate": 1.4801174719263122e-05, + "loss": 0.5126, + "step": 9976 + }, + { + "epoch": 1.7797698688787797, + "grad_norm": 0.4779605567455292, + "learning_rate": 1.477743457869904e-05, + "loss": 0.3936, + "step": 9977 + }, + { + "epoch": 1.77994826509678, + "grad_norm": 0.5108462572097778, + "learning_rate": 1.4753712912414035e-05, + "loss": 0.4744, + "step": 9978 + }, + { + "epoch": 1.7801266613147801, + "grad_norm": 0.5295320749282837, + "learning_rate": 1.4730009722271204e-05, + "loss": 0.5429, + "step": 9979 + }, + { + "epoch": 1.7803050575327803, + "grad_norm": 0.5670889019966125, + "learning_rate": 1.4706325010132137e-05, + "loss": 0.5606, + "step": 9980 + }, + { + "epoch": 1.7804834537507803, + "grad_norm": 0.5170961022377014, + "learning_rate": 1.4682658777857072e-05, + "loss": 0.5119, + "step": 9981 + }, + { + "epoch": 1.7806618499687805, + "grad_norm": 0.4892038106918335, + "learning_rate": 1.4659011027304686e-05, + "loss": 0.4014, + "step": 9982 + }, + { + "epoch": 1.7808402461867807, + "grad_norm": 0.46245473623275757, + "learning_rate": 1.4635381760332356e-05, + "loss": 0.3665, + "step": 9983 + }, + { + "epoch": 1.781018642404781, + "grad_norm": 0.5037046074867249, + "learning_rate": 1.461177097879579e-05, + "loss": 0.388, + "step": 9984 + }, + { + "epoch": 1.7811970386227811, + "grad_norm": 0.4744492471218109, + "learning_rate": 1.458817868454948e-05, + "loss": 0.3254, + "step": 9985 + }, + { + "epoch": 1.7813754348407813, + "grad_norm": 0.4990524649620056, + "learning_rate": 1.456460487944633e-05, + "loss": 0.4834, + "step": 9986 + }, + { + "epoch": 1.7815538310587815, + "grad_norm": 0.631146252155304, + "learning_rate": 1.4541049565337749e-05, + "loss": 0.5804, + "step": 9987 + }, + { + "epoch": 1.7817322272767817, + "grad_norm": 0.5024986863136292, + "learning_rate": 1.451751274407384e-05, + "loss": 0.5254, + "step": 9988 + }, + { + "epoch": 1.781910623494782, + "grad_norm": 0.5565928816795349, + "learning_rate": 1.4493994417503127e-05, + "loss": 0.4387, + "step": 9989 + }, + { + "epoch": 1.782089019712782, + "grad_norm": 0.5144158005714417, + "learning_rate": 1.4470494587472765e-05, + "loss": 0.4451, + "step": 9990 + }, + { + "epoch": 1.7822674159307823, + "grad_norm": 0.5400242805480957, + "learning_rate": 1.4447013255828368e-05, + "loss": 0.5332, + "step": 9991 + }, + { + "epoch": 1.7824458121487825, + "grad_norm": 0.5203432440757751, + "learning_rate": 1.4423550424414234e-05, + "loss": 0.4738, + "step": 9992 + }, + { + "epoch": 1.7826242083667827, + "grad_norm": 0.5619713068008423, + "learning_rate": 1.4400106095073029e-05, + "loss": 0.5508, + "step": 9993 + }, + { + "epoch": 1.782802604584783, + "grad_norm": 0.5214311480522156, + "learning_rate": 1.4376680269646086e-05, + "loss": 0.4902, + "step": 9994 + }, + { + "epoch": 1.782981000802783, + "grad_norm": 0.4371115267276764, + "learning_rate": 1.4353272949973322e-05, + "loss": 0.3828, + "step": 9995 + }, + { + "epoch": 1.7831593970207833, + "grad_norm": 0.5262027382850647, + "learning_rate": 1.4329884137893074e-05, + "loss": 0.5594, + "step": 9996 + }, + { + "epoch": 1.7833377932387835, + "grad_norm": 0.5280055403709412, + "learning_rate": 1.430651383524234e-05, + "loss": 0.454, + "step": 9997 + }, + { + "epoch": 1.7835161894567835, + "grad_norm": 0.5150504112243652, + "learning_rate": 1.4283162043856546e-05, + "loss": 0.5101, + "step": 9998 + }, + { + "epoch": 1.7836945856747837, + "grad_norm": 0.5318194627761841, + "learning_rate": 1.4259828765569777e-05, + "loss": 0.4295, + "step": 9999 + }, + { + "epoch": 1.7838729818927839, + "grad_norm": 0.5350292921066284, + "learning_rate": 1.423651400221465e-05, + "loss": 0.4757, + "step": 10000 + }, + { + "epoch": 1.784051378110784, + "grad_norm": 0.5203887224197388, + "learning_rate": 1.4213217755622205e-05, + "loss": 0.5545, + "step": 10001 + }, + { + "epoch": 1.7842297743287843, + "grad_norm": 0.48459410667419434, + "learning_rate": 1.4189940027622194e-05, + "loss": 0.4402, + "step": 10002 + }, + { + "epoch": 1.7844081705467842, + "grad_norm": 0.5484018921852112, + "learning_rate": 1.41666808200428e-05, + "loss": 0.5066, + "step": 10003 + }, + { + "epoch": 1.7845865667647844, + "grad_norm": 0.6117169260978699, + "learning_rate": 1.4143440134710833e-05, + "loss": 0.7196, + "step": 10004 + }, + { + "epoch": 1.7847649629827846, + "grad_norm": 0.5282416939735413, + "learning_rate": 1.4120217973451533e-05, + "loss": 0.4336, + "step": 10005 + }, + { + "epoch": 1.7849433592007848, + "grad_norm": 0.5563318729400635, + "learning_rate": 1.4097014338088855e-05, + "loss": 0.5367, + "step": 10006 + }, + { + "epoch": 1.785121755418785, + "grad_norm": 0.6051769256591797, + "learning_rate": 1.4073829230445173e-05, + "loss": 0.6119, + "step": 10007 + }, + { + "epoch": 1.7853001516367852, + "grad_norm": 0.5828459858894348, + "learning_rate": 1.4050662652341312e-05, + "loss": 0.6749, + "step": 10008 + }, + { + "epoch": 1.7854785478547854, + "grad_norm": 0.5630053281784058, + "learning_rate": 1.4027514605596952e-05, + "loss": 0.5675, + "step": 10009 + }, + { + "epoch": 1.7856569440727856, + "grad_norm": 0.613758385181427, + "learning_rate": 1.4004385092030031e-05, + "loss": 0.5525, + "step": 10010 + }, + { + "epoch": 1.7858353402907858, + "grad_norm": 0.4936037063598633, + "learning_rate": 1.3981274113457148e-05, + "loss": 0.4094, + "step": 10011 + }, + { + "epoch": 1.786013736508786, + "grad_norm": 0.5002244114875793, + "learning_rate": 1.3958181671693466e-05, + "loss": 0.4153, + "step": 10012 + }, + { + "epoch": 1.7861921327267862, + "grad_norm": 0.5575555562973022, + "learning_rate": 1.3935107768552557e-05, + "loss": 0.6104, + "step": 10013 + }, + { + "epoch": 1.7863705289447864, + "grad_norm": 0.5280625224113464, + "learning_rate": 1.3912052405846754e-05, + "loss": 0.565, + "step": 10014 + }, + { + "epoch": 1.7865489251627866, + "grad_norm": 0.505584716796875, + "learning_rate": 1.3889015585386689e-05, + "loss": 0.4413, + "step": 10015 + }, + { + "epoch": 1.7867273213807868, + "grad_norm": 0.4564724862575531, + "learning_rate": 1.3865997308981804e-05, + "loss": 0.3544, + "step": 10016 + }, + { + "epoch": 1.786905717598787, + "grad_norm": 0.6348497271537781, + "learning_rate": 1.3842997578439819e-05, + "loss": 0.6875, + "step": 10017 + }, + { + "epoch": 1.7870841138167872, + "grad_norm": 0.5605414509773254, + "learning_rate": 1.3820016395567209e-05, + "loss": 0.5125, + "step": 10018 + }, + { + "epoch": 1.7872625100347874, + "grad_norm": 0.4736771285533905, + "learning_rate": 1.379705376216886e-05, + "loss": 0.4449, + "step": 10019 + }, + { + "epoch": 1.7874409062527874, + "grad_norm": 0.4950930178165436, + "learning_rate": 1.3774109680048274e-05, + "loss": 0.4283, + "step": 10020 + }, + { + "epoch": 1.7876193024707876, + "grad_norm": 0.4793688654899597, + "learning_rate": 1.3751184151007485e-05, + "loss": 0.4695, + "step": 10021 + }, + { + "epoch": 1.7877976986887878, + "grad_norm": 0.4778786599636078, + "learning_rate": 1.3728277176846965e-05, + "loss": 0.4909, + "step": 10022 + }, + { + "epoch": 1.787976094906788, + "grad_norm": 0.5141474008560181, + "learning_rate": 1.3705388759365945e-05, + "loss": 0.4603, + "step": 10023 + }, + { + "epoch": 1.7881544911247882, + "grad_norm": 0.4962654411792755, + "learning_rate": 1.3682518900361902e-05, + "loss": 0.4038, + "step": 10024 + }, + { + "epoch": 1.7883328873427882, + "grad_norm": 0.44499000906944275, + "learning_rate": 1.3659667601631231e-05, + "loss": 0.3354, + "step": 10025 + }, + { + "epoch": 1.7885112835607884, + "grad_norm": 0.5469255447387695, + "learning_rate": 1.3636834864968556e-05, + "loss": 0.5772, + "step": 10026 + }, + { + "epoch": 1.7886896797787886, + "grad_norm": 0.5044131278991699, + "learning_rate": 1.3614020692167107e-05, + "loss": 0.4198, + "step": 10027 + }, + { + "epoch": 1.7888680759967888, + "grad_norm": 0.5094908475875854, + "learning_rate": 1.3591225085018782e-05, + "loss": 0.5186, + "step": 10028 + }, + { + "epoch": 1.789046472214789, + "grad_norm": 0.4827946424484253, + "learning_rate": 1.3568448045313874e-05, + "loss": 0.4773, + "step": 10029 + }, + { + "epoch": 1.7892248684327892, + "grad_norm": 0.5509223341941833, + "learning_rate": 1.354568957484134e-05, + "loss": 0.5378, + "step": 10030 + }, + { + "epoch": 1.7894032646507894, + "grad_norm": 0.49218085408210754, + "learning_rate": 1.3522949675388557e-05, + "loss": 0.4328, + "step": 10031 + }, + { + "epoch": 1.7895816608687896, + "grad_norm": 0.5489743947982788, + "learning_rate": 1.3500228348741594e-05, + "loss": 0.5715, + "step": 10032 + }, + { + "epoch": 1.7897600570867898, + "grad_norm": 0.5286942720413208, + "learning_rate": 1.3477525596684914e-05, + "loss": 0.4155, + "step": 10033 + }, + { + "epoch": 1.78993845330479, + "grad_norm": 0.5898469090461731, + "learning_rate": 1.3454841421001562e-05, + "loss": 0.6253, + "step": 10034 + }, + { + "epoch": 1.7901168495227902, + "grad_norm": 0.4971984326839447, + "learning_rate": 1.3432175823473197e-05, + "loss": 0.401, + "step": 10035 + }, + { + "epoch": 1.7902952457407904, + "grad_norm": 0.5394257307052612, + "learning_rate": 1.3409528805879895e-05, + "loss": 0.5684, + "step": 10036 + }, + { + "epoch": 1.7904736419587906, + "grad_norm": 0.47669845819473267, + "learning_rate": 1.3386900370000455e-05, + "loss": 0.4986, + "step": 10037 + }, + { + "epoch": 1.7906520381767908, + "grad_norm": 0.4922144114971161, + "learning_rate": 1.3364290517611982e-05, + "loss": 0.4396, + "step": 10038 + }, + { + "epoch": 1.790830434394791, + "grad_norm": 0.5299371480941772, + "learning_rate": 1.334169925049028e-05, + "loss": 0.4327, + "step": 10039 + }, + { + "epoch": 1.7910088306127911, + "grad_norm": 0.5726783871650696, + "learning_rate": 1.3319126570409734e-05, + "loss": 0.5291, + "step": 10040 + }, + { + "epoch": 1.7911872268307913, + "grad_norm": 0.5141592621803284, + "learning_rate": 1.3296572479143093e-05, + "loss": 0.4491, + "step": 10041 + }, + { + "epoch": 1.7913656230487913, + "grad_norm": 0.5207017660140991, + "learning_rate": 1.3274036978461829e-05, + "loss": 0.4182, + "step": 10042 + }, + { + "epoch": 1.7915440192667915, + "grad_norm": 0.5985919833183289, + "learning_rate": 1.3251520070135803e-05, + "loss": 0.6607, + "step": 10043 + }, + { + "epoch": 1.7917224154847917, + "grad_norm": 0.5005565285682678, + "learning_rate": 1.3229021755933546e-05, + "loss": 0.4086, + "step": 10044 + }, + { + "epoch": 1.791900811702792, + "grad_norm": 0.5269946455955505, + "learning_rate": 1.3206542037621978e-05, + "loss": 0.4658, + "step": 10045 + }, + { + "epoch": 1.7920792079207921, + "grad_norm": 0.5867735147476196, + "learning_rate": 1.318408091696674e-05, + "loss": 0.6541, + "step": 10046 + }, + { + "epoch": 1.792257604138792, + "grad_norm": 0.491451621055603, + "learning_rate": 1.3161638395731867e-05, + "loss": 0.3777, + "step": 10047 + }, + { + "epoch": 1.7924360003567923, + "grad_norm": 0.5263445377349854, + "learning_rate": 1.3139214475679977e-05, + "loss": 0.4627, + "step": 10048 + }, + { + "epoch": 1.7926143965747925, + "grad_norm": 0.4960615038871765, + "learning_rate": 1.3116809158572273e-05, + "loss": 0.4039, + "step": 10049 + }, + { + "epoch": 1.7927927927927927, + "grad_norm": 0.5893881916999817, + "learning_rate": 1.3094422446168403e-05, + "loss": 0.6089, + "step": 10050 + }, + { + "epoch": 1.7929711890107929, + "grad_norm": 0.636078953742981, + "learning_rate": 1.3072054340226708e-05, + "loss": 0.5522, + "step": 10051 + }, + { + "epoch": 1.793149585228793, + "grad_norm": 0.6112774610519409, + "learning_rate": 1.304970484250384e-05, + "loss": 0.6133, + "step": 10052 + }, + { + "epoch": 1.7933279814467933, + "grad_norm": 0.5139406323432922, + "learning_rate": 1.3027373954755229e-05, + "loss": 0.3866, + "step": 10053 + }, + { + "epoch": 1.7935063776647935, + "grad_norm": 0.5027743577957153, + "learning_rate": 1.3005061678734665e-05, + "loss": 0.4824, + "step": 10054 + }, + { + "epoch": 1.7936847738827937, + "grad_norm": 0.45598676800727844, + "learning_rate": 1.2982768016194551e-05, + "loss": 0.3925, + "step": 10055 + }, + { + "epoch": 1.7938631701007939, + "grad_norm": 0.5171281099319458, + "learning_rate": 1.2960492968885906e-05, + "loss": 0.499, + "step": 10056 + }, + { + "epoch": 1.794041566318794, + "grad_norm": 0.5457966923713684, + "learning_rate": 1.2938236538558079e-05, + "loss": 0.4644, + "step": 10057 + }, + { + "epoch": 1.7942199625367943, + "grad_norm": 0.5165960788726807, + "learning_rate": 1.2915998726959172e-05, + "loss": 0.5193, + "step": 10058 + }, + { + "epoch": 1.7943983587547945, + "grad_norm": 0.5514568090438843, + "learning_rate": 1.2893779535835703e-05, + "loss": 0.4803, + "step": 10059 + }, + { + "epoch": 1.7945767549727947, + "grad_norm": 0.47774699330329895, + "learning_rate": 1.2871578966932723e-05, + "loss": 0.3729, + "step": 10060 + }, + { + "epoch": 1.7947551511907949, + "grad_norm": 0.5756377577781677, + "learning_rate": 1.2849397021993947e-05, + "loss": 0.554, + "step": 10061 + }, + { + "epoch": 1.794933547408795, + "grad_norm": 0.5572470426559448, + "learning_rate": 1.2827233702761398e-05, + "loss": 0.4625, + "step": 10062 + }, + { + "epoch": 1.7951119436267953, + "grad_norm": 0.508305549621582, + "learning_rate": 1.2805089010975906e-05, + "loss": 0.4027, + "step": 10063 + }, + { + "epoch": 1.7952903398447952, + "grad_norm": 0.5296847224235535, + "learning_rate": 1.2782962948376608e-05, + "loss": 0.5182, + "step": 10064 + }, + { + "epoch": 1.7954687360627954, + "grad_norm": 0.5482270121574402, + "learning_rate": 1.2760855516701364e-05, + "loss": 0.6118, + "step": 10065 + }, + { + "epoch": 1.7956471322807956, + "grad_norm": 0.6912332773208618, + "learning_rate": 1.2738766717686396e-05, + "loss": 0.5096, + "step": 10066 + }, + { + "epoch": 1.7958255284987958, + "grad_norm": 0.5811026692390442, + "learning_rate": 1.271669655306662e-05, + "loss": 0.6013, + "step": 10067 + }, + { + "epoch": 1.796003924716796, + "grad_norm": 0.5055823922157288, + "learning_rate": 1.2694645024575374e-05, + "loss": 0.4401, + "step": 10068 + }, + { + "epoch": 1.796182320934796, + "grad_norm": 0.6301665306091309, + "learning_rate": 1.2672612133944578e-05, + "loss": 0.5362, + "step": 10069 + }, + { + "epoch": 1.7963607171527962, + "grad_norm": 0.4803159534931183, + "learning_rate": 1.265059788290468e-05, + "loss": 0.3668, + "step": 10070 + }, + { + "epoch": 1.7965391133707964, + "grad_norm": 0.4969823360443115, + "learning_rate": 1.2628602273184714e-05, + "loss": 0.4755, + "step": 10071 + }, + { + "epoch": 1.7967175095887966, + "grad_norm": 0.5457087755203247, + "learning_rate": 1.2606625306512159e-05, + "loss": 0.5974, + "step": 10072 + }, + { + "epoch": 1.7968959058067968, + "grad_norm": 0.6440889835357666, + "learning_rate": 1.2584666984613107e-05, + "loss": 0.6619, + "step": 10073 + }, + { + "epoch": 1.797074302024797, + "grad_norm": 0.5694631338119507, + "learning_rate": 1.2562727309212125e-05, + "loss": 0.5553, + "step": 10074 + }, + { + "epoch": 1.7972526982427972, + "grad_norm": 0.582349956035614, + "learning_rate": 1.2540806282032385e-05, + "loss": 0.5732, + "step": 10075 + }, + { + "epoch": 1.7974310944607974, + "grad_norm": 0.539436936378479, + "learning_rate": 1.2518903904795515e-05, + "loss": 0.4747, + "step": 10076 + }, + { + "epoch": 1.7976094906787976, + "grad_norm": 0.5120388865470886, + "learning_rate": 1.2497020179221747e-05, + "loss": 0.5279, + "step": 10077 + }, + { + "epoch": 1.7977878868967978, + "grad_norm": 0.5143983364105225, + "learning_rate": 1.247515510702979e-05, + "loss": 0.453, + "step": 10078 + }, + { + "epoch": 1.797966283114798, + "grad_norm": 0.46396833658218384, + "learning_rate": 1.2453308689936965e-05, + "loss": 0.3753, + "step": 10079 + }, + { + "epoch": 1.7981446793327982, + "grad_norm": 0.5515454411506653, + "learning_rate": 1.2431480929659066e-05, + "loss": 0.5563, + "step": 10080 + }, + { + "epoch": 1.7983230755507984, + "grad_norm": 0.5875598788261414, + "learning_rate": 1.2409671827910363e-05, + "loss": 0.5579, + "step": 10081 + }, + { + "epoch": 1.7985014717687986, + "grad_norm": 0.43758925795555115, + "learning_rate": 1.2387881386403844e-05, + "loss": 0.3783, + "step": 10082 + }, + { + "epoch": 1.7986798679867988, + "grad_norm": 0.4410098195075989, + "learning_rate": 1.2366109606850834e-05, + "loss": 0.336, + "step": 10083 + }, + { + "epoch": 1.798858264204799, + "grad_norm": 0.5217193365097046, + "learning_rate": 1.2344356490961328e-05, + "loss": 0.4554, + "step": 10084 + }, + { + "epoch": 1.7990366604227992, + "grad_norm": 0.5786857604980469, + "learning_rate": 1.2322622040443793e-05, + "loss": 0.5736, + "step": 10085 + }, + { + "epoch": 1.7992150566407992, + "grad_norm": 0.5340781807899475, + "learning_rate": 1.2300906257005196e-05, + "loss": 0.5068, + "step": 10086 + }, + { + "epoch": 1.7993934528587994, + "grad_norm": 0.5103334784507751, + "learning_rate": 1.22792091423512e-05, + "loss": 0.4845, + "step": 10087 + }, + { + "epoch": 1.7995718490767996, + "grad_norm": 0.5319827795028687, + "learning_rate": 1.2257530698185776e-05, + "loss": 0.5371, + "step": 10088 + }, + { + "epoch": 1.7997502452947998, + "grad_norm": 0.5461127161979675, + "learning_rate": 1.2235870926211617e-05, + "loss": 0.5464, + "step": 10089 + }, + { + "epoch": 1.7999286415128, + "grad_norm": 0.6167702078819275, + "learning_rate": 1.2214229828129808e-05, + "loss": 0.5783, + "step": 10090 + }, + { + "epoch": 1.8001070377308, + "grad_norm": 0.538575291633606, + "learning_rate": 1.2192607405640072e-05, + "loss": 0.5085, + "step": 10091 + }, + { + "epoch": 1.8002854339488001, + "grad_norm": 0.4790734648704529, + "learning_rate": 1.2171003660440633e-05, + "loss": 0.3925, + "step": 10092 + }, + { + "epoch": 1.8004638301668003, + "grad_norm": 0.5150331854820251, + "learning_rate": 1.214941859422819e-05, + "loss": 0.5111, + "step": 10093 + }, + { + "epoch": 1.8006422263848005, + "grad_norm": 0.5167433023452759, + "learning_rate": 1.2127852208698081e-05, + "loss": 0.4739, + "step": 10094 + }, + { + "epoch": 1.8008206226028007, + "grad_norm": 0.48076915740966797, + "learning_rate": 1.2106304505544063e-05, + "loss": 0.402, + "step": 10095 + }, + { + "epoch": 1.800999018820801, + "grad_norm": 0.5094106197357178, + "learning_rate": 1.208477548645856e-05, + "loss": 0.562, + "step": 10096 + }, + { + "epoch": 1.8011774150388011, + "grad_norm": 0.4915674030780792, + "learning_rate": 1.2063265153132359e-05, + "loss": 0.4165, + "step": 10097 + }, + { + "epoch": 1.8013558112568013, + "grad_norm": 0.4733083248138428, + "learning_rate": 1.2041773507254966e-05, + "loss": 0.3459, + "step": 10098 + }, + { + "epoch": 1.8015342074748015, + "grad_norm": 0.5540547966957092, + "learning_rate": 1.202030055051423e-05, + "loss": 0.4423, + "step": 10099 + }, + { + "epoch": 1.8017126036928017, + "grad_norm": 0.5337905883789062, + "learning_rate": 1.1998846284596687e-05, + "loss": 0.5369, + "step": 10100 + }, + { + "epoch": 1.801890999910802, + "grad_norm": 0.5766735672950745, + "learning_rate": 1.1977410711187381e-05, + "loss": 0.574, + "step": 10101 + }, + { + "epoch": 1.8020693961288021, + "grad_norm": 0.6233359575271606, + "learning_rate": 1.1955993831969769e-05, + "loss": 0.6326, + "step": 10102 + }, + { + "epoch": 1.8022477923468023, + "grad_norm": 0.538557231426239, + "learning_rate": 1.1934595648625978e-05, + "loss": 0.51, + "step": 10103 + }, + { + "epoch": 1.8024261885648025, + "grad_norm": 0.5933203101158142, + "learning_rate": 1.1913216162836582e-05, + "loss": 0.6019, + "step": 10104 + }, + { + "epoch": 1.8026045847828027, + "grad_norm": 0.5655239224433899, + "learning_rate": 1.1891855376280764e-05, + "loss": 0.5871, + "step": 10105 + }, + { + "epoch": 1.802782981000803, + "grad_norm": 0.4605049788951874, + "learning_rate": 1.1870513290636154e-05, + "loss": 0.3936, + "step": 10106 + }, + { + "epoch": 1.8029613772188031, + "grad_norm": 0.4446168541908264, + "learning_rate": 1.1849189907578938e-05, + "loss": 0.3525, + "step": 10107 + }, + { + "epoch": 1.803139773436803, + "grad_norm": 0.5252434611320496, + "learning_rate": 1.1827885228783863e-05, + "loss": 0.5285, + "step": 10108 + }, + { + "epoch": 1.8033181696548033, + "grad_norm": 0.49297425150871277, + "learning_rate": 1.1806599255924172e-05, + "loss": 0.4882, + "step": 10109 + }, + { + "epoch": 1.8034965658728035, + "grad_norm": 0.5996000170707703, + "learning_rate": 1.1785331990671722e-05, + "loss": 0.5251, + "step": 10110 + }, + { + "epoch": 1.8036749620908037, + "grad_norm": 0.5449775457382202, + "learning_rate": 1.1764083434696732e-05, + "loss": 0.5504, + "step": 10111 + }, + { + "epoch": 1.803853358308804, + "grad_norm": 0.5374523997306824, + "learning_rate": 1.1742853589668145e-05, + "loss": 0.5502, + "step": 10112 + }, + { + "epoch": 1.8040317545268039, + "grad_norm": 0.5069290399551392, + "learning_rate": 1.1721642457253323e-05, + "loss": 0.475, + "step": 10113 + }, + { + "epoch": 1.804210150744804, + "grad_norm": 0.5023181438446045, + "learning_rate": 1.1700450039118127e-05, + "loss": 0.4728, + "step": 10114 + }, + { + "epoch": 1.8043885469628043, + "grad_norm": 0.44995036721229553, + "learning_rate": 1.1679276336927058e-05, + "loss": 0.3337, + "step": 10115 + }, + { + "epoch": 1.8045669431808045, + "grad_norm": 0.5543752312660217, + "learning_rate": 1.1658121352342982e-05, + "loss": 0.5126, + "step": 10116 + }, + { + "epoch": 1.8047453393988047, + "grad_norm": 0.5311068296432495, + "learning_rate": 1.1636985087027597e-05, + "loss": 0.5781, + "step": 10117 + }, + { + "epoch": 1.8049237356168049, + "grad_norm": 0.5337813496589661, + "learning_rate": 1.1615867542640795e-05, + "loss": 0.4699, + "step": 10118 + }, + { + "epoch": 1.805102131834805, + "grad_norm": 0.5538474321365356, + "learning_rate": 1.1594768720841142e-05, + "loss": 0.616, + "step": 10119 + }, + { + "epoch": 1.8052805280528053, + "grad_norm": 0.5138535499572754, + "learning_rate": 1.157368862328581e-05, + "loss": 0.5966, + "step": 10120 + }, + { + "epoch": 1.8054589242708055, + "grad_norm": 0.5369413495063782, + "learning_rate": 1.1552627251630338e-05, + "loss": 0.6007, + "step": 10121 + }, + { + "epoch": 1.8056373204888057, + "grad_norm": 0.4994553029537201, + "learning_rate": 1.1531584607528928e-05, + "loss": 0.3921, + "step": 10122 + }, + { + "epoch": 1.8058157167068059, + "grad_norm": 0.6017369031906128, + "learning_rate": 1.1510560692634203e-05, + "loss": 0.553, + "step": 10123 + }, + { + "epoch": 1.805994112924806, + "grad_norm": 0.5081920027732849, + "learning_rate": 1.1489555508597455e-05, + "loss": 0.3915, + "step": 10124 + }, + { + "epoch": 1.8061725091428062, + "grad_norm": 0.5011181235313416, + "learning_rate": 1.1468569057068363e-05, + "loss": 0.4489, + "step": 10125 + }, + { + "epoch": 1.8063509053608064, + "grad_norm": 0.5123955607414246, + "learning_rate": 1.1447601339695218e-05, + "loss": 0.4911, + "step": 10126 + }, + { + "epoch": 1.8065293015788066, + "grad_norm": 0.6243327856063843, + "learning_rate": 1.1426652358124817e-05, + "loss": 0.7168, + "step": 10127 + }, + { + "epoch": 1.8067076977968068, + "grad_norm": 0.525888204574585, + "learning_rate": 1.1405722114002425e-05, + "loss": 0.512, + "step": 10128 + }, + { + "epoch": 1.806886094014807, + "grad_norm": 0.5606545209884644, + "learning_rate": 1.1384810608972007e-05, + "loss": 0.5632, + "step": 10129 + }, + { + "epoch": 1.807064490232807, + "grad_norm": 0.5493068695068359, + "learning_rate": 1.1363917844675803e-05, + "loss": 0.4691, + "step": 10130 + }, + { + "epoch": 1.8072428864508072, + "grad_norm": 0.6075304746627808, + "learning_rate": 1.1343043822754834e-05, + "loss": 0.7009, + "step": 10131 + }, + { + "epoch": 1.8074212826688074, + "grad_norm": 0.49980399012565613, + "learning_rate": 1.132218854484851e-05, + "loss": 0.4691, + "step": 10132 + }, + { + "epoch": 1.8075996788868076, + "grad_norm": 0.5099700093269348, + "learning_rate": 1.1301352012594774e-05, + "loss": 0.4577, + "step": 10133 + }, + { + "epoch": 1.8077780751048078, + "grad_norm": 0.5869147777557373, + "learning_rate": 1.1280534227630173e-05, + "loss": 0.6752, + "step": 10134 + }, + { + "epoch": 1.807956471322808, + "grad_norm": 0.5993606448173523, + "learning_rate": 1.1259735191589626e-05, + "loss": 0.6543, + "step": 10135 + }, + { + "epoch": 1.808134867540808, + "grad_norm": 0.5113434195518494, + "learning_rate": 1.123895490610677e-05, + "loss": 0.5406, + "step": 10136 + }, + { + "epoch": 1.8083132637588082, + "grad_norm": 0.5298689007759094, + "learning_rate": 1.1218193372813628e-05, + "loss": 0.4952, + "step": 10137 + }, + { + "epoch": 1.8084916599768084, + "grad_norm": 0.5689482092857361, + "learning_rate": 1.1197450593340875e-05, + "loss": 0.543, + "step": 10138 + }, + { + "epoch": 1.8086700561948086, + "grad_norm": 0.46555694937705994, + "learning_rate": 1.1176726569317563e-05, + "loss": 0.4123, + "step": 10139 + }, + { + "epoch": 1.8088484524128088, + "grad_norm": 0.4243280589580536, + "learning_rate": 1.1156021302371338e-05, + "loss": 0.4059, + "step": 10140 + }, + { + "epoch": 1.809026848630809, + "grad_norm": 0.6901927590370178, + "learning_rate": 1.1135334794128455e-05, + "loss": 0.6079, + "step": 10141 + }, + { + "epoch": 1.8092052448488092, + "grad_norm": 0.5594136714935303, + "learning_rate": 1.1114667046213555e-05, + "loss": 0.5835, + "step": 10142 + }, + { + "epoch": 1.8093836410668094, + "grad_norm": 0.4830605983734131, + "learning_rate": 1.1094018060249928e-05, + "loss": 0.3979, + "step": 10143 + }, + { + "epoch": 1.8095620372848096, + "grad_norm": 0.5415348410606384, + "learning_rate": 1.10733878378593e-05, + "loss": 0.5458, + "step": 10144 + }, + { + "epoch": 1.8097404335028098, + "grad_norm": 0.5037901401519775, + "learning_rate": 1.1052776380661988e-05, + "loss": 0.4389, + "step": 10145 + }, + { + "epoch": 1.80991882972081, + "grad_norm": 0.47683677077293396, + "learning_rate": 1.1032183690276754e-05, + "loss": 0.4552, + "step": 10146 + }, + { + "epoch": 1.8100972259388102, + "grad_norm": 0.5881403684616089, + "learning_rate": 1.1011609768320995e-05, + "loss": 0.617, + "step": 10147 + }, + { + "epoch": 1.8102756221568104, + "grad_norm": 0.5397078394889832, + "learning_rate": 1.099105461641059e-05, + "loss": 0.5371, + "step": 10148 + }, + { + "epoch": 1.8104540183748106, + "grad_norm": 0.5096381306648254, + "learning_rate": 1.0970518236159882e-05, + "loss": 0.4908, + "step": 10149 + }, + { + "epoch": 1.8106324145928108, + "grad_norm": 0.5621026754379272, + "learning_rate": 1.0950000629181806e-05, + "loss": 0.5941, + "step": 10150 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.501204252243042, + "learning_rate": 1.0929501797087848e-05, + "loss": 0.4206, + "step": 10151 + }, + { + "epoch": 1.810989207028811, + "grad_norm": 0.48253610730171204, + "learning_rate": 1.0909021741487862e-05, + "loss": 0.4153, + "step": 10152 + }, + { + "epoch": 1.8111676032468111, + "grad_norm": 0.60355144739151, + "learning_rate": 1.0888560463990476e-05, + "loss": 0.518, + "step": 10153 + }, + { + "epoch": 1.8113459994648113, + "grad_norm": 0.4504523277282715, + "learning_rate": 1.086811796620263e-05, + "loss": 0.3618, + "step": 10154 + }, + { + "epoch": 1.8115243956828115, + "grad_norm": 0.5853896737098694, + "learning_rate": 1.0847694249729922e-05, + "loss": 0.6146, + "step": 10155 + }, + { + "epoch": 1.8117027919008117, + "grad_norm": 0.5694953203201294, + "learning_rate": 1.0827289316176353e-05, + "loss": 0.5908, + "step": 10156 + }, + { + "epoch": 1.811881188118812, + "grad_norm": 0.5596148371696472, + "learning_rate": 1.0806903167144583e-05, + "loss": 0.583, + "step": 10157 + }, + { + "epoch": 1.812059584336812, + "grad_norm": 0.4649914801120758, + "learning_rate": 1.0786535804235693e-05, + "loss": 0.4772, + "step": 10158 + }, + { + "epoch": 1.8122379805548121, + "grad_norm": 0.5577880144119263, + "learning_rate": 1.0766187229049345e-05, + "loss": 0.5872, + "step": 10159 + }, + { + "epoch": 1.8124163767728123, + "grad_norm": 0.6191621422767639, + "learning_rate": 1.0745857443183737e-05, + "loss": 0.6917, + "step": 10160 + }, + { + "epoch": 1.8125947729908125, + "grad_norm": 0.6169114112854004, + "learning_rate": 1.0725546448235424e-05, + "loss": 0.7141, + "step": 10161 + }, + { + "epoch": 1.8127731692088127, + "grad_norm": 0.5002092719078064, + "learning_rate": 1.0705254245799823e-05, + "loss": 0.435, + "step": 10162 + }, + { + "epoch": 1.812951565426813, + "grad_norm": 0.48867136240005493, + "learning_rate": 1.068498083747052e-05, + "loss": 0.4144, + "step": 10163 + }, + { + "epoch": 1.813129961644813, + "grad_norm": 0.5426896214485168, + "learning_rate": 1.0664726224839882e-05, + "loss": 0.4681, + "step": 10164 + }, + { + "epoch": 1.8133083578628133, + "grad_norm": 0.5177076458930969, + "learning_rate": 1.0644490409498636e-05, + "loss": 0.4917, + "step": 10165 + }, + { + "epoch": 1.8134867540808135, + "grad_norm": 0.5665025115013123, + "learning_rate": 1.0624273393036093e-05, + "loss": 0.5453, + "step": 10166 + }, + { + "epoch": 1.8136651502988137, + "grad_norm": 0.5852944254875183, + "learning_rate": 1.0604075177040151e-05, + "loss": 0.607, + "step": 10167 + }, + { + "epoch": 1.813843546516814, + "grad_norm": 0.5647523999214172, + "learning_rate": 1.0583895763097068e-05, + "loss": 0.6147, + "step": 10168 + }, + { + "epoch": 1.814021942734814, + "grad_norm": 0.4771578907966614, + "learning_rate": 1.0563735152791826e-05, + "loss": 0.3808, + "step": 10169 + }, + { + "epoch": 1.8142003389528143, + "grad_norm": 0.49404028058052063, + "learning_rate": 1.0543593347707742e-05, + "loss": 0.4938, + "step": 10170 + }, + { + "epoch": 1.8143787351708145, + "grad_norm": 0.5241032242774963, + "learning_rate": 1.0523470349426856e-05, + "loss": 0.4203, + "step": 10171 + }, + { + "epoch": 1.8145571313888147, + "grad_norm": 0.4687019884586334, + "learning_rate": 1.0503366159529515e-05, + "loss": 0.4542, + "step": 10172 + }, + { + "epoch": 1.814735527606815, + "grad_norm": 0.5116732120513916, + "learning_rate": 1.0483280779594707e-05, + "loss": 0.4266, + "step": 10173 + }, + { + "epoch": 1.8149139238248149, + "grad_norm": 0.593781054019928, + "learning_rate": 1.0463214211200001e-05, + "loss": 0.5538, + "step": 10174 + }, + { + "epoch": 1.815092320042815, + "grad_norm": 0.5247955918312073, + "learning_rate": 1.0443166455921332e-05, + "loss": 0.4487, + "step": 10175 + }, + { + "epoch": 1.8152707162608153, + "grad_norm": 0.5464117527008057, + "learning_rate": 1.042313751533333e-05, + "loss": 0.527, + "step": 10176 + }, + { + "epoch": 1.8154491124788155, + "grad_norm": 0.5567406415939331, + "learning_rate": 1.040312739100896e-05, + "loss": 0.6, + "step": 10177 + }, + { + "epoch": 1.8156275086968157, + "grad_norm": 0.5131311416625977, + "learning_rate": 1.038313608451985e-05, + "loss": 0.4397, + "step": 10178 + }, + { + "epoch": 1.8158059049148159, + "grad_norm": 0.45174816250801086, + "learning_rate": 1.0363163597436165e-05, + "loss": 0.389, + "step": 10179 + }, + { + "epoch": 1.8159843011328158, + "grad_norm": 0.47523677349090576, + "learning_rate": 1.0343209931326453e-05, + "loss": 0.3687, + "step": 10180 + }, + { + "epoch": 1.816162697350816, + "grad_norm": 0.49906423687934875, + "learning_rate": 1.0323275087757905e-05, + "loss": 0.4489, + "step": 10181 + }, + { + "epoch": 1.8163410935688162, + "grad_norm": 0.4328477382659912, + "learning_rate": 1.0303359068296187e-05, + "loss": 0.307, + "step": 10182 + }, + { + "epoch": 1.8165194897868164, + "grad_norm": 0.48864254355430603, + "learning_rate": 1.0283461874505545e-05, + "loss": 0.4578, + "step": 10183 + }, + { + "epoch": 1.8166978860048166, + "grad_norm": 0.5428241491317749, + "learning_rate": 1.0263583507948592e-05, + "loss": 0.5653, + "step": 10184 + }, + { + "epoch": 1.8168762822228168, + "grad_norm": 0.7537945508956909, + "learning_rate": 1.0243723970186663e-05, + "loss": 0.4563, + "step": 10185 + }, + { + "epoch": 1.817054678440817, + "grad_norm": 1.7523473501205444, + "learning_rate": 1.0223883262779455e-05, + "loss": 0.3763, + "step": 10186 + }, + { + "epoch": 1.8172330746588172, + "grad_norm": 0.5494785308837891, + "learning_rate": 1.0204061387285274e-05, + "loss": 0.5098, + "step": 10187 + }, + { + "epoch": 1.8174114708768174, + "grad_norm": 0.5195872187614441, + "learning_rate": 1.0184258345260933e-05, + "loss": 0.476, + "step": 10188 + }, + { + "epoch": 1.8175898670948176, + "grad_norm": 0.674954891204834, + "learning_rate": 1.0164474138261714e-05, + "loss": 0.447, + "step": 10189 + }, + { + "epoch": 1.8177682633128178, + "grad_norm": 0.5051981806755066, + "learning_rate": 1.0144708767841514e-05, + "loss": 0.4994, + "step": 10190 + }, + { + "epoch": 1.817946659530818, + "grad_norm": 0.5411693453788757, + "learning_rate": 1.0124962235552647e-05, + "loss": 0.504, + "step": 10191 + }, + { + "epoch": 1.8181250557488182, + "grad_norm": 0.5327993631362915, + "learning_rate": 1.0105234542946013e-05, + "loss": 0.5145, + "step": 10192 + }, + { + "epoch": 1.8183034519668184, + "grad_norm": 0.5137004256248474, + "learning_rate": 1.0085525691571063e-05, + "loss": 0.5566, + "step": 10193 + }, + { + "epoch": 1.8184818481848186, + "grad_norm": 0.4950316250324249, + "learning_rate": 1.0065835682975644e-05, + "loss": 0.4582, + "step": 10194 + }, + { + "epoch": 1.8186602444028188, + "grad_norm": 0.6043919324874878, + "learning_rate": 1.0046164518706269e-05, + "loss": 0.6262, + "step": 10195 + }, + { + "epoch": 1.8188386406208188, + "grad_norm": 0.48688915371894836, + "learning_rate": 1.0026512200307841e-05, + "loss": 0.4427, + "step": 10196 + }, + { + "epoch": 1.819017036838819, + "grad_norm": 0.5831645727157593, + "learning_rate": 1.0006878729323905e-05, + "loss": 0.6149, + "step": 10197 + }, + { + "epoch": 1.8191954330568192, + "grad_norm": 0.5436547994613647, + "learning_rate": 9.987264107296445e-06, + "loss": 0.541, + "step": 10198 + }, + { + "epoch": 1.8193738292748194, + "grad_norm": 0.4676104784011841, + "learning_rate": 9.967668335765927e-06, + "loss": 0.448, + "step": 10199 + }, + { + "epoch": 1.8195522254928196, + "grad_norm": 0.5124318599700928, + "learning_rate": 9.948091416271482e-06, + "loss": 0.4321, + "step": 10200 + }, + { + "epoch": 1.8197306217108198, + "grad_norm": 0.5362337231636047, + "learning_rate": 9.928533350350627e-06, + "loss": 0.4066, + "step": 10201 + }, + { + "epoch": 1.8199090179288198, + "grad_norm": 0.4841192066669464, + "learning_rate": 9.908994139539467e-06, + "loss": 0.4793, + "step": 10202 + }, + { + "epoch": 1.82008741414682, + "grad_norm": 0.577235996723175, + "learning_rate": 9.889473785372554e-06, + "loss": 0.5232, + "step": 10203 + }, + { + "epoch": 1.8202658103648202, + "grad_norm": 0.5391908288002014, + "learning_rate": 9.869972289383078e-06, + "loss": 0.4879, + "step": 10204 + }, + { + "epoch": 1.8204442065828204, + "grad_norm": 0.5783559679985046, + "learning_rate": 9.85048965310259e-06, + "loss": 0.5618, + "step": 10205 + }, + { + "epoch": 1.8206226028008206, + "grad_norm": 0.47223010659217834, + "learning_rate": 9.831025878061366e-06, + "loss": 0.4077, + "step": 10206 + }, + { + "epoch": 1.8208009990188208, + "grad_norm": 0.5232735872268677, + "learning_rate": 9.811580965787964e-06, + "loss": 0.5183, + "step": 10207 + }, + { + "epoch": 1.820979395236821, + "grad_norm": 0.5101804137229919, + "learning_rate": 9.792154917809631e-06, + "loss": 0.4681, + "step": 10208 + }, + { + "epoch": 1.8211577914548212, + "grad_norm": 0.5707786679267883, + "learning_rate": 9.772747735652122e-06, + "loss": 0.6249, + "step": 10209 + }, + { + "epoch": 1.8213361876728213, + "grad_norm": 0.5304009318351746, + "learning_rate": 9.753359420839631e-06, + "loss": 0.4669, + "step": 10210 + }, + { + "epoch": 1.8215145838908215, + "grad_norm": 0.555030107498169, + "learning_rate": 9.733989974894858e-06, + "loss": 0.5323, + "step": 10211 + }, + { + "epoch": 1.8216929801088217, + "grad_norm": 0.5161325931549072, + "learning_rate": 9.71463939933917e-06, + "loss": 0.4242, + "step": 10212 + }, + { + "epoch": 1.821871376326822, + "grad_norm": 0.4960889518260956, + "learning_rate": 9.69530769569224e-06, + "loss": 0.4857, + "step": 10213 + }, + { + "epoch": 1.8220497725448221, + "grad_norm": 0.5438405871391296, + "learning_rate": 9.675994865472492e-06, + "loss": 0.5828, + "step": 10214 + }, + { + "epoch": 1.8222281687628223, + "grad_norm": 0.5128703713417053, + "learning_rate": 9.656700910196631e-06, + "loss": 0.4462, + "step": 10215 + }, + { + "epoch": 1.8224065649808225, + "grad_norm": 0.506435751914978, + "learning_rate": 9.637425831380109e-06, + "loss": 0.4737, + "step": 10216 + }, + { + "epoch": 1.8225849611988227, + "grad_norm": 0.5082954168319702, + "learning_rate": 9.618169630536688e-06, + "loss": 0.5197, + "step": 10217 + }, + { + "epoch": 1.8227633574168227, + "grad_norm": 0.5157825946807861, + "learning_rate": 9.598932309178798e-06, + "loss": 0.4332, + "step": 10218 + }, + { + "epoch": 1.822941753634823, + "grad_norm": 0.5034956932067871, + "learning_rate": 9.579713868817313e-06, + "loss": 0.4036, + "step": 10219 + }, + { + "epoch": 1.8231201498528231, + "grad_norm": 0.566260576248169, + "learning_rate": 9.560514310961637e-06, + "loss": 0.6078, + "step": 10220 + }, + { + "epoch": 1.8232985460708233, + "grad_norm": 0.5446032881736755, + "learning_rate": 9.541333637119704e-06, + "loss": 0.449, + "step": 10221 + }, + { + "epoch": 1.8234769422888235, + "grad_norm": 0.5551034808158875, + "learning_rate": 9.522171848797917e-06, + "loss": 0.4385, + "step": 10222 + }, + { + "epoch": 1.8236553385068237, + "grad_norm": 0.5290305018424988, + "learning_rate": 9.503028947501269e-06, + "loss": 0.3873, + "step": 10223 + }, + { + "epoch": 1.8238337347248237, + "grad_norm": 0.5309109091758728, + "learning_rate": 9.483904934733278e-06, + "loss": 0.5638, + "step": 10224 + }, + { + "epoch": 1.8240121309428239, + "grad_norm": 0.5156473517417908, + "learning_rate": 9.464799811995855e-06, + "loss": 0.4597, + "step": 10225 + }, + { + "epoch": 1.824190527160824, + "grad_norm": 0.5108242630958557, + "learning_rate": 9.44571358078955e-06, + "loss": 0.4872, + "step": 10226 + }, + { + "epoch": 1.8243689233788243, + "grad_norm": 0.45744287967681885, + "learning_rate": 9.426646242613385e-06, + "loss": 0.3599, + "step": 10227 + }, + { + "epoch": 1.8245473195968245, + "grad_norm": 0.5380007028579712, + "learning_rate": 9.407597798964911e-06, + "loss": 0.4486, + "step": 10228 + }, + { + "epoch": 1.8247257158148247, + "grad_norm": 0.5029316544532776, + "learning_rate": 9.388568251340157e-06, + "loss": 0.4662, + "step": 10229 + }, + { + "epoch": 1.8249041120328249, + "grad_norm": 0.5693724751472473, + "learning_rate": 9.369557601233703e-06, + "loss": 0.4787, + "step": 10230 + }, + { + "epoch": 1.825082508250825, + "grad_norm": 0.49826717376708984, + "learning_rate": 9.350565850138688e-06, + "loss": 0.4471, + "step": 10231 + }, + { + "epoch": 1.8252609044688253, + "grad_norm": 0.6438023447990417, + "learning_rate": 9.331592999546612e-06, + "loss": 0.6015, + "step": 10232 + }, + { + "epoch": 1.8254393006868255, + "grad_norm": 0.6153572797775269, + "learning_rate": 9.312639050947702e-06, + "loss": 0.7253, + "step": 10233 + }, + { + "epoch": 1.8256176969048257, + "grad_norm": 0.508865237236023, + "learning_rate": 9.293704005830488e-06, + "loss": 0.4195, + "step": 10234 + }, + { + "epoch": 1.8257960931228259, + "grad_norm": 0.533497154712677, + "learning_rate": 9.274787865682227e-06, + "loss": 0.4233, + "step": 10235 + }, + { + "epoch": 1.825974489340826, + "grad_norm": 0.5289534330368042, + "learning_rate": 9.255890631988505e-06, + "loss": 0.5847, + "step": 10236 + }, + { + "epoch": 1.8261528855588263, + "grad_norm": 0.5981556177139282, + "learning_rate": 9.237012306233555e-06, + "loss": 0.7135, + "step": 10237 + }, + { + "epoch": 1.8263312817768265, + "grad_norm": 0.5175567269325256, + "learning_rate": 9.21815288990005e-06, + "loss": 0.4407, + "step": 10238 + }, + { + "epoch": 1.8265096779948267, + "grad_norm": 0.4687216877937317, + "learning_rate": 9.199312384469166e-06, + "loss": 0.408, + "step": 10239 + }, + { + "epoch": 1.8266880742128266, + "grad_norm": 0.5143327713012695, + "learning_rate": 9.180490791420693e-06, + "loss": 0.5345, + "step": 10240 + }, + { + "epoch": 1.8268664704308268, + "grad_norm": 0.5955470204353333, + "learning_rate": 9.161688112232836e-06, + "loss": 0.6327, + "step": 10241 + }, + { + "epoch": 1.827044866648827, + "grad_norm": 0.482693612575531, + "learning_rate": 9.142904348382359e-06, + "loss": 0.39, + "step": 10242 + }, + { + "epoch": 1.8272232628668272, + "grad_norm": 0.6863899230957031, + "learning_rate": 9.124139501344496e-06, + "loss": 0.6892, + "step": 10243 + }, + { + "epoch": 1.8274016590848274, + "grad_norm": 0.49588871002197266, + "learning_rate": 9.105393572593102e-06, + "loss": 0.3929, + "step": 10244 + }, + { + "epoch": 1.8275800553028276, + "grad_norm": 0.5905642509460449, + "learning_rate": 9.086666563600437e-06, + "loss": 0.7023, + "step": 10245 + }, + { + "epoch": 1.8277584515208276, + "grad_norm": 0.5300746560096741, + "learning_rate": 9.067958475837274e-06, + "loss": 0.5067, + "step": 10246 + }, + { + "epoch": 1.8279368477388278, + "grad_norm": 0.5370733141899109, + "learning_rate": 9.04926931077299e-06, + "loss": 0.5826, + "step": 10247 + }, + { + "epoch": 1.828115243956828, + "grad_norm": 0.5125030279159546, + "learning_rate": 9.030599069875383e-06, + "loss": 0.4693, + "step": 10248 + }, + { + "epoch": 1.8282936401748282, + "grad_norm": 0.48797720670700073, + "learning_rate": 9.011947754610839e-06, + "loss": 0.4299, + "step": 10249 + }, + { + "epoch": 1.8284720363928284, + "grad_norm": 0.5317216515541077, + "learning_rate": 8.99331536644421e-06, + "loss": 0.4746, + "step": 10250 + }, + { + "epoch": 1.8286504326108286, + "grad_norm": 0.5748763680458069, + "learning_rate": 8.974701906838884e-06, + "loss": 0.51, + "step": 10251 + }, + { + "epoch": 1.8288288288288288, + "grad_norm": 0.573381781578064, + "learning_rate": 8.956107377256772e-06, + "loss": 0.6049, + "step": 10252 + }, + { + "epoch": 1.829007225046829, + "grad_norm": 0.4922345280647278, + "learning_rate": 8.937531779158181e-06, + "loss": 0.371, + "step": 10253 + }, + { + "epoch": 1.8291856212648292, + "grad_norm": 0.494431734085083, + "learning_rate": 8.918975114002192e-06, + "loss": 0.4184, + "step": 10254 + }, + { + "epoch": 1.8293640174828294, + "grad_norm": 0.5176249146461487, + "learning_rate": 8.900437383246084e-06, + "loss": 0.45, + "step": 10255 + }, + { + "epoch": 1.8295424137008296, + "grad_norm": 0.4918590188026428, + "learning_rate": 8.881918588345917e-06, + "loss": 0.5091, + "step": 10256 + }, + { + "epoch": 1.8297208099188298, + "grad_norm": 0.5216599702835083, + "learning_rate": 8.863418730756106e-06, + "loss": 0.4921, + "step": 10257 + }, + { + "epoch": 1.82989920613683, + "grad_norm": 0.45257261395454407, + "learning_rate": 8.844937811929605e-06, + "loss": 0.3359, + "step": 10258 + }, + { + "epoch": 1.8300776023548302, + "grad_norm": 0.5172678828239441, + "learning_rate": 8.826475833317914e-06, + "loss": 0.5071, + "step": 10259 + }, + { + "epoch": 1.8302559985728304, + "grad_norm": 0.5505502223968506, + "learning_rate": 8.808032796371018e-06, + "loss": 0.6637, + "step": 10260 + }, + { + "epoch": 1.8304343947908306, + "grad_norm": 0.48044058680534363, + "learning_rate": 8.78960870253745e-06, + "loss": 0.4304, + "step": 10261 + }, + { + "epoch": 1.8306127910088306, + "grad_norm": 0.4884861707687378, + "learning_rate": 8.77120355326419e-06, + "loss": 0.3665, + "step": 10262 + }, + { + "epoch": 1.8307911872268308, + "grad_norm": 0.4926454424858093, + "learning_rate": 8.752817349996806e-06, + "loss": 0.3983, + "step": 10263 + }, + { + "epoch": 1.830969583444831, + "grad_norm": 0.5212196111679077, + "learning_rate": 8.734450094179309e-06, + "loss": 0.5036, + "step": 10264 + }, + { + "epoch": 1.8311479796628312, + "grad_norm": 0.4858270287513733, + "learning_rate": 8.716101787254321e-06, + "loss": 0.467, + "step": 10265 + }, + { + "epoch": 1.8313263758808314, + "grad_norm": 0.5602033138275146, + "learning_rate": 8.697772430662858e-06, + "loss": 0.5441, + "step": 10266 + }, + { + "epoch": 1.8315047720988316, + "grad_norm": 0.5834726095199585, + "learning_rate": 8.679462025844464e-06, + "loss": 0.6227, + "step": 10267 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 0.5684033632278442, + "learning_rate": 8.66117057423732e-06, + "loss": 0.57, + "step": 10268 + }, + { + "epoch": 1.8318615645348317, + "grad_norm": 0.4957300126552582, + "learning_rate": 8.642898077277944e-06, + "loss": 0.4303, + "step": 10269 + }, + { + "epoch": 1.832039960752832, + "grad_norm": 0.5206865668296814, + "learning_rate": 8.624644536401521e-06, + "loss": 0.5213, + "step": 10270 + }, + { + "epoch": 1.8322183569708321, + "grad_norm": 0.5535826683044434, + "learning_rate": 8.606409953041627e-06, + "loss": 0.5366, + "step": 10271 + }, + { + "epoch": 1.8323967531888323, + "grad_norm": 0.5583673715591431, + "learning_rate": 8.588194328630422e-06, + "loss": 0.5131, + "step": 10272 + }, + { + "epoch": 1.8325751494068325, + "grad_norm": 0.45444154739379883, + "learning_rate": 8.569997664598567e-06, + "loss": 0.2948, + "step": 10273 + }, + { + "epoch": 1.8327535456248327, + "grad_norm": 0.5591797232627869, + "learning_rate": 8.55181996237514e-06, + "loss": 0.546, + "step": 10274 + }, + { + "epoch": 1.832931941842833, + "grad_norm": 0.6100978255271912, + "learning_rate": 8.533661223387946e-06, + "loss": 0.4099, + "step": 10275 + }, + { + "epoch": 1.8331103380608331, + "grad_norm": 0.5809532403945923, + "learning_rate": 8.515521449063036e-06, + "loss": 0.5211, + "step": 10276 + }, + { + "epoch": 1.8332887342788333, + "grad_norm": 0.5336730480194092, + "learning_rate": 8.497400640825186e-06, + "loss": 0.5875, + "step": 10277 + }, + { + "epoch": 1.8334671304968335, + "grad_norm": 0.5325056910514832, + "learning_rate": 8.47929880009754e-06, + "loss": 0.4842, + "step": 10278 + }, + { + "epoch": 1.8336455267148337, + "grad_norm": 0.528160810470581, + "learning_rate": 8.461215928301819e-06, + "loss": 0.5506, + "step": 10279 + }, + { + "epoch": 1.833823922932834, + "grad_norm": 0.5754840970039368, + "learning_rate": 8.443152026858303e-06, + "loss": 0.4679, + "step": 10280 + }, + { + "epoch": 1.8340023191508341, + "grad_norm": 0.5175067782402039, + "learning_rate": 8.425107097185636e-06, + "loss": 0.5501, + "step": 10281 + }, + { + "epoch": 1.8341807153688343, + "grad_norm": 0.47843798995018005, + "learning_rate": 8.407081140701128e-06, + "loss": 0.3725, + "step": 10282 + }, + { + "epoch": 1.8343591115868345, + "grad_norm": 0.5563110709190369, + "learning_rate": 8.389074158820453e-06, + "loss": 0.5422, + "step": 10283 + }, + { + "epoch": 1.8345375078048345, + "grad_norm": 0.5997392535209656, + "learning_rate": 8.371086152957952e-06, + "loss": 0.6135, + "step": 10284 + }, + { + "epoch": 1.8347159040228347, + "grad_norm": 0.5224518775939941, + "learning_rate": 8.353117124526382e-06, + "loss": 0.4452, + "step": 10285 + }, + { + "epoch": 1.8348943002408349, + "grad_norm": 0.5603434443473816, + "learning_rate": 8.33516707493695e-06, + "loss": 0.5088, + "step": 10286 + }, + { + "epoch": 1.835072696458835, + "grad_norm": 0.5151491761207581, + "learning_rate": 8.317236005599554e-06, + "loss": 0.5069, + "step": 10287 + }, + { + "epoch": 1.8352510926768353, + "grad_norm": 0.46216633915901184, + "learning_rate": 8.299323917922402e-06, + "loss": 0.3475, + "step": 10288 + }, + { + "epoch": 1.8354294888948355, + "grad_norm": 0.5850088000297546, + "learning_rate": 8.281430813312368e-06, + "loss": 0.6099, + "step": 10289 + }, + { + "epoch": 1.8356078851128355, + "grad_norm": 0.510653018951416, + "learning_rate": 8.263556693174745e-06, + "loss": 0.4093, + "step": 10290 + }, + { + "epoch": 1.8357862813308357, + "grad_norm": 0.6367735862731934, + "learning_rate": 8.245701558913327e-06, + "loss": 0.5877, + "step": 10291 + }, + { + "epoch": 1.8359646775488359, + "grad_norm": 0.5245457887649536, + "learning_rate": 8.227865411930492e-06, + "loss": 0.4942, + "step": 10292 + }, + { + "epoch": 1.836143073766836, + "grad_norm": 0.5388858318328857, + "learning_rate": 8.210048253627034e-06, + "loss": 0.5077, + "step": 10293 + }, + { + "epoch": 1.8363214699848363, + "grad_norm": 0.4671160876750946, + "learning_rate": 8.192250085402364e-06, + "loss": 0.4215, + "step": 10294 + }, + { + "epoch": 1.8364998662028365, + "grad_norm": 0.5302553772926331, + "learning_rate": 8.174470908654309e-06, + "loss": 0.4563, + "step": 10295 + }, + { + "epoch": 1.8366782624208366, + "grad_norm": 0.5391623377799988, + "learning_rate": 8.156710724779249e-06, + "loss": 0.5735, + "step": 10296 + }, + { + "epoch": 1.8368566586388368, + "grad_norm": 0.5140558481216431, + "learning_rate": 8.138969535172014e-06, + "loss": 0.4466, + "step": 10297 + }, + { + "epoch": 1.837035054856837, + "grad_norm": 0.497837096452713, + "learning_rate": 8.121247341226074e-06, + "loss": 0.4269, + "step": 10298 + }, + { + "epoch": 1.8372134510748372, + "grad_norm": 0.49367204308509827, + "learning_rate": 8.103544144333259e-06, + "loss": 0.4433, + "step": 10299 + }, + { + "epoch": 1.8373918472928374, + "grad_norm": 0.5372571349143982, + "learning_rate": 8.085859945883984e-06, + "loss": 0.5503, + "step": 10300 + }, + { + "epoch": 1.8375702435108376, + "grad_norm": 0.5628436207771301, + "learning_rate": 8.068194747267193e-06, + "loss": 0.5568, + "step": 10301 + }, + { + "epoch": 1.8377486397288378, + "grad_norm": 0.5379055142402649, + "learning_rate": 8.050548549870252e-06, + "loss": 0.5478, + "step": 10302 + }, + { + "epoch": 1.837927035946838, + "grad_norm": 1.1105915307998657, + "learning_rate": 8.032921355079132e-06, + "loss": 0.4982, + "step": 10303 + }, + { + "epoch": 1.8381054321648382, + "grad_norm": 0.4970918893814087, + "learning_rate": 8.015313164278227e-06, + "loss": 0.5111, + "step": 10304 + }, + { + "epoch": 1.8382838283828384, + "grad_norm": 0.4896716773509979, + "learning_rate": 7.997723978850486e-06, + "loss": 0.3981, + "step": 10305 + }, + { + "epoch": 1.8384622246008384, + "grad_norm": 0.5600361824035645, + "learning_rate": 7.980153800177387e-06, + "loss": 0.4834, + "step": 10306 + }, + { + "epoch": 1.8386406208188386, + "grad_norm": 0.560856819152832, + "learning_rate": 7.962602629638827e-06, + "loss": 0.6468, + "step": 10307 + }, + { + "epoch": 1.8388190170368388, + "grad_norm": 0.46853670477867126, + "learning_rate": 7.945070468613313e-06, + "loss": 0.4358, + "step": 10308 + }, + { + "epoch": 1.838997413254839, + "grad_norm": 0.4796409606933594, + "learning_rate": 7.9275573184778e-06, + "loss": 0.4112, + "step": 10309 + }, + { + "epoch": 1.8391758094728392, + "grad_norm": 0.5527827143669128, + "learning_rate": 7.910063180607775e-06, + "loss": 0.63, + "step": 10310 + }, + { + "epoch": 1.8393542056908394, + "grad_norm": 0.5861801505088806, + "learning_rate": 7.892588056377214e-06, + "loss": 0.6435, + "step": 10311 + }, + { + "epoch": 1.8395326019088394, + "grad_norm": 0.5089249610900879, + "learning_rate": 7.875131947158554e-06, + "loss": 0.4791, + "step": 10312 + }, + { + "epoch": 1.8397109981268396, + "grad_norm": 0.5814892053604126, + "learning_rate": 7.857694854322888e-06, + "loss": 0.5377, + "step": 10313 + }, + { + "epoch": 1.8398893943448398, + "grad_norm": 0.516287088394165, + "learning_rate": 7.840276779239625e-06, + "loss": 0.3958, + "step": 10314 + }, + { + "epoch": 1.84006779056284, + "grad_norm": 0.5036550164222717, + "learning_rate": 7.822877723276834e-06, + "loss": 0.518, + "step": 10315 + }, + { + "epoch": 1.8402461867808402, + "grad_norm": 0.4933060109615326, + "learning_rate": 7.805497687801006e-06, + "loss": 0.4199, + "step": 10316 + }, + { + "epoch": 1.8404245829988404, + "grad_norm": 0.4610743522644043, + "learning_rate": 7.78813667417716e-06, + "loss": 0.3651, + "step": 10317 + }, + { + "epoch": 1.8406029792168406, + "grad_norm": 0.5514711737632751, + "learning_rate": 7.770794683768845e-06, + "loss": 0.4906, + "step": 10318 + }, + { + "epoch": 1.8407813754348408, + "grad_norm": 0.6051323413848877, + "learning_rate": 7.753471717938054e-06, + "loss": 0.6172, + "step": 10319 + }, + { + "epoch": 1.840959771652841, + "grad_norm": 0.4640166461467743, + "learning_rate": 7.736167778045367e-06, + "loss": 0.4343, + "step": 10320 + }, + { + "epoch": 1.8411381678708412, + "grad_norm": 0.4809337556362152, + "learning_rate": 7.718882865449806e-06, + "loss": 0.3689, + "step": 10321 + }, + { + "epoch": 1.8413165640888414, + "grad_norm": 0.5500307083129883, + "learning_rate": 7.701616981508924e-06, + "loss": 0.5312, + "step": 10322 + }, + { + "epoch": 1.8414949603068416, + "grad_norm": 0.5196850299835205, + "learning_rate": 7.684370127578749e-06, + "loss": 0.4428, + "step": 10323 + }, + { + "epoch": 1.8416733565248418, + "grad_norm": 0.5277876853942871, + "learning_rate": 7.66714230501392e-06, + "loss": 0.5147, + "step": 10324 + }, + { + "epoch": 1.841851752742842, + "grad_norm": 0.4931204915046692, + "learning_rate": 7.649933515167407e-06, + "loss": 0.4189, + "step": 10325 + }, + { + "epoch": 1.8420301489608422, + "grad_norm": 0.5108685493469238, + "learning_rate": 7.632743759390826e-06, + "loss": 0.5112, + "step": 10326 + }, + { + "epoch": 1.8422085451788424, + "grad_norm": 0.5715261697769165, + "learning_rate": 7.61557303903429e-06, + "loss": 0.5237, + "step": 10327 + }, + { + "epoch": 1.8423869413968423, + "grad_norm": 0.5731346607208252, + "learning_rate": 7.5984213554462775e-06, + "loss": 0.6463, + "step": 10328 + }, + { + "epoch": 1.8425653376148425, + "grad_norm": 0.4316110610961914, + "learning_rate": 7.581288709973988e-06, + "loss": 0.3569, + "step": 10329 + }, + { + "epoch": 1.8427437338328427, + "grad_norm": 0.512586772441864, + "learning_rate": 7.564175103962956e-06, + "loss": 0.4411, + "step": 10330 + }, + { + "epoch": 1.842922130050843, + "grad_norm": 0.5421055555343628, + "learning_rate": 7.547080538757245e-06, + "loss": 0.4933, + "step": 10331 + }, + { + "epoch": 1.8431005262688431, + "grad_norm": 0.5586483478546143, + "learning_rate": 7.5300050156995315e-06, + "loss": 0.5065, + "step": 10332 + }, + { + "epoch": 1.8432789224868433, + "grad_norm": 0.6223009824752808, + "learning_rate": 7.5129485361308534e-06, + "loss": 0.4358, + "step": 10333 + }, + { + "epoch": 1.8434573187048433, + "grad_norm": 0.5178797841072083, + "learning_rate": 7.49591110139089e-06, + "loss": 0.4661, + "step": 10334 + }, + { + "epoch": 1.8436357149228435, + "grad_norm": 0.5489305257797241, + "learning_rate": 7.478892712817681e-06, + "loss": 0.5131, + "step": 10335 + }, + { + "epoch": 1.8438141111408437, + "grad_norm": 0.5495460033416748, + "learning_rate": 7.4618933717478796e-06, + "loss": 0.6006, + "step": 10336 + }, + { + "epoch": 1.843992507358844, + "grad_norm": 0.4898929297924042, + "learning_rate": 7.444913079516613e-06, + "loss": 0.3961, + "step": 10337 + }, + { + "epoch": 1.844170903576844, + "grad_norm": 0.48516249656677246, + "learning_rate": 7.42795183745748e-06, + "loss": 0.3852, + "step": 10338 + }, + { + "epoch": 1.8443492997948443, + "grad_norm": 0.6237598061561584, + "learning_rate": 7.411009646902639e-06, + "loss": 0.6107, + "step": 10339 + }, + { + "epoch": 1.8445276960128445, + "grad_norm": 0.49778953194618225, + "learning_rate": 7.394086509182663e-06, + "loss": 0.3913, + "step": 10340 + }, + { + "epoch": 1.8447060922308447, + "grad_norm": 0.5452266931533813, + "learning_rate": 7.377182425626766e-06, + "loss": 0.5521, + "step": 10341 + }, + { + "epoch": 1.844884488448845, + "grad_norm": 0.6213571429252625, + "learning_rate": 7.360297397562527e-06, + "loss": 0.6084, + "step": 10342 + }, + { + "epoch": 1.845062884666845, + "grad_norm": 0.5283211469650269, + "learning_rate": 7.34343142631616e-06, + "loss": 0.5813, + "step": 10343 + }, + { + "epoch": 1.8452412808848453, + "grad_norm": 0.45486849546432495, + "learning_rate": 7.32658451321222e-06, + "loss": 0.2873, + "step": 10344 + }, + { + "epoch": 1.8454196771028455, + "grad_norm": 0.544691264629364, + "learning_rate": 7.3097566595738965e-06, + "loss": 0.5815, + "step": 10345 + }, + { + "epoch": 1.8455980733208457, + "grad_norm": 0.6090754866600037, + "learning_rate": 7.292947866722882e-06, + "loss": 0.567, + "step": 10346 + }, + { + "epoch": 1.8457764695388459, + "grad_norm": 0.5459246635437012, + "learning_rate": 7.276158135979288e-06, + "loss": 0.6544, + "step": 10347 + }, + { + "epoch": 1.845954865756846, + "grad_norm": 0.54819256067276, + "learning_rate": 7.259387468661782e-06, + "loss": 0.5531, + "step": 10348 + }, + { + "epoch": 1.8461332619748463, + "grad_norm": 0.5134831070899963, + "learning_rate": 7.242635866087505e-06, + "loss": 0.4531, + "step": 10349 + }, + { + "epoch": 1.8463116581928463, + "grad_norm": 0.5044652223587036, + "learning_rate": 7.225903329572181e-06, + "loss": 0.4553, + "step": 10350 + }, + { + "epoch": 1.8464900544108465, + "grad_norm": 0.4595658779144287, + "learning_rate": 7.209189860429899e-06, + "loss": 0.34, + "step": 10351 + }, + { + "epoch": 1.8466684506288467, + "grad_norm": 0.5554966330528259, + "learning_rate": 7.1924954599733864e-06, + "loss": 0.4627, + "step": 10352 + }, + { + "epoch": 1.8468468468468469, + "grad_norm": 0.49979686737060547, + "learning_rate": 7.175820129513788e-06, + "loss": 0.4567, + "step": 10353 + }, + { + "epoch": 1.847025243064847, + "grad_norm": 0.527464747428894, + "learning_rate": 7.159163870360752e-06, + "loss": 0.4319, + "step": 10354 + }, + { + "epoch": 1.8472036392828473, + "grad_norm": 0.5300561785697937, + "learning_rate": 7.142526683822537e-06, + "loss": 0.4145, + "step": 10355 + }, + { + "epoch": 1.8473820355008472, + "grad_norm": 0.5047762393951416, + "learning_rate": 7.125908571205708e-06, + "loss": 0.411, + "step": 10356 + }, + { + "epoch": 1.8475604317188474, + "grad_norm": 0.5059321522712708, + "learning_rate": 7.109309533815556e-06, + "loss": 0.4051, + "step": 10357 + }, + { + "epoch": 1.8477388279368476, + "grad_norm": 0.5237871408462524, + "learning_rate": 7.092729572955675e-06, + "loss": 0.4884, + "step": 10358 + }, + { + "epoch": 1.8479172241548478, + "grad_norm": 0.4973069131374359, + "learning_rate": 7.076168689928275e-06, + "loss": 0.3889, + "step": 10359 + }, + { + "epoch": 1.848095620372848, + "grad_norm": 0.5443868637084961, + "learning_rate": 7.059626886034093e-06, + "loss": 0.4669, + "step": 10360 + }, + { + "epoch": 1.8482740165908482, + "grad_norm": 0.5395627021789551, + "learning_rate": 7.0431041625722e-06, + "loss": 0.4496, + "step": 10361 + }, + { + "epoch": 1.8484524128088484, + "grad_norm": 0.5488491654396057, + "learning_rate": 7.026600520840393e-06, + "loss": 0.5293, + "step": 10362 + }, + { + "epoch": 1.8486308090268486, + "grad_norm": 0.5110378265380859, + "learning_rate": 7.010115962134855e-06, + "loss": 0.4904, + "step": 10363 + }, + { + "epoch": 1.8488092052448488, + "grad_norm": 0.5103933215141296, + "learning_rate": 6.993650487750192e-06, + "loss": 0.5069, + "step": 10364 + }, + { + "epoch": 1.848987601462849, + "grad_norm": 0.5104700326919556, + "learning_rate": 6.9772040989796725e-06, + "loss": 0.468, + "step": 10365 + }, + { + "epoch": 1.8491659976808492, + "grad_norm": 0.4703051745891571, + "learning_rate": 6.960776797114931e-06, + "loss": 0.4197, + "step": 10366 + }, + { + "epoch": 1.8493443938988494, + "grad_norm": 0.5158678889274597, + "learning_rate": 6.944368583446242e-06, + "loss": 0.4147, + "step": 10367 + }, + { + "epoch": 1.8495227901168496, + "grad_norm": 0.5142411589622498, + "learning_rate": 6.927979459262212e-06, + "loss": 0.4338, + "step": 10368 + }, + { + "epoch": 1.8497011863348498, + "grad_norm": 0.4859728515148163, + "learning_rate": 6.9116094258500905e-06, + "loss": 0.38, + "step": 10369 + }, + { + "epoch": 1.84987958255285, + "grad_norm": 0.5308949947357178, + "learning_rate": 6.895258484495515e-06, + "loss": 0.5024, + "step": 10370 + }, + { + "epoch": 1.8500579787708502, + "grad_norm": 0.46590912342071533, + "learning_rate": 6.878926636482791e-06, + "loss": 0.3446, + "step": 10371 + }, + { + "epoch": 1.8502363749888502, + "grad_norm": 0.5768349170684814, + "learning_rate": 6.862613883094504e-06, + "loss": 0.593, + "step": 10372 + }, + { + "epoch": 1.8504147712068504, + "grad_norm": 0.6099775433540344, + "learning_rate": 6.84632022561188e-06, + "loss": 0.5706, + "step": 10373 + }, + { + "epoch": 1.8505931674248506, + "grad_norm": 0.6040098071098328, + "learning_rate": 6.830045665314672e-06, + "loss": 0.6825, + "step": 10374 + }, + { + "epoch": 1.8507715636428508, + "grad_norm": 0.4853060841560364, + "learning_rate": 6.813790203480996e-06, + "loss": 0.4622, + "step": 10375 + }, + { + "epoch": 1.850949959860851, + "grad_norm": 0.5379960536956787, + "learning_rate": 6.7975538413875825e-06, + "loss": 0.5277, + "step": 10376 + }, + { + "epoch": 1.8511283560788512, + "grad_norm": 0.6292576193809509, + "learning_rate": 6.781336580309661e-06, + "loss": 0.6256, + "step": 10377 + }, + { + "epoch": 1.8513067522968512, + "grad_norm": 0.5415869355201721, + "learning_rate": 6.765138421520878e-06, + "loss": 0.3965, + "step": 10378 + }, + { + "epoch": 1.8514851485148514, + "grad_norm": 0.5575872659683228, + "learning_rate": 6.748959366293467e-06, + "loss": 0.5549, + "step": 10379 + }, + { + "epoch": 1.8516635447328516, + "grad_norm": 0.5051519870758057, + "learning_rate": 6.732799415898078e-06, + "loss": 0.4057, + "step": 10380 + }, + { + "epoch": 1.8518419409508518, + "grad_norm": 0.48078250885009766, + "learning_rate": 6.716658571603973e-06, + "loss": 0.3774, + "step": 10381 + }, + { + "epoch": 1.852020337168852, + "grad_norm": 0.6021243333816528, + "learning_rate": 6.7005368346787775e-06, + "loss": 0.5807, + "step": 10382 + }, + { + "epoch": 1.8521987333868521, + "grad_norm": 0.5509424209594727, + "learning_rate": 6.6844342063887565e-06, + "loss": 0.5441, + "step": 10383 + }, + { + "epoch": 1.8523771296048523, + "grad_norm": 0.5380014181137085, + "learning_rate": 6.6683506879985645e-06, + "loss": 0.583, + "step": 10384 + }, + { + "epoch": 1.8525555258228525, + "grad_norm": 0.5455968976020813, + "learning_rate": 6.652286280771358e-06, + "loss": 0.4776, + "step": 10385 + }, + { + "epoch": 1.8527339220408527, + "grad_norm": 0.5762505531311035, + "learning_rate": 6.636240985968906e-06, + "loss": 0.5963, + "step": 10386 + }, + { + "epoch": 1.852912318258853, + "grad_norm": 0.5499910712242126, + "learning_rate": 6.620214804851338e-06, + "loss": 0.5036, + "step": 10387 + }, + { + "epoch": 1.8530907144768531, + "grad_norm": 0.46007874608039856, + "learning_rate": 6.60420773867737e-06, + "loss": 0.3352, + "step": 10388 + }, + { + "epoch": 1.8532691106948533, + "grad_norm": 0.5644509792327881, + "learning_rate": 6.588219788704164e-06, + "loss": 0.5467, + "step": 10389 + }, + { + "epoch": 1.8534475069128535, + "grad_norm": 0.5105949640274048, + "learning_rate": 6.572250956187465e-06, + "loss": 0.4846, + "step": 10390 + }, + { + "epoch": 1.8536259031308537, + "grad_norm": 0.505388617515564, + "learning_rate": 6.556301242381379e-06, + "loss": 0.4123, + "step": 10391 + }, + { + "epoch": 1.853804299348854, + "grad_norm": 0.49062561988830566, + "learning_rate": 6.540370648538657e-06, + "loss": 0.3651, + "step": 10392 + }, + { + "epoch": 1.8539826955668541, + "grad_norm": 0.5342191457748413, + "learning_rate": 6.524459175910464e-06, + "loss": 0.5833, + "step": 10393 + }, + { + "epoch": 1.854161091784854, + "grad_norm": 0.5109807848930359, + "learning_rate": 6.508566825746437e-06, + "loss": 0.402, + "step": 10394 + }, + { + "epoch": 1.8543394880028543, + "grad_norm": 0.5528099536895752, + "learning_rate": 6.49269359929483e-06, + "loss": 0.5504, + "step": 10395 + }, + { + "epoch": 1.8545178842208545, + "grad_norm": 0.479790061712265, + "learning_rate": 6.476839497802256e-06, + "loss": 0.3927, + "step": 10396 + }, + { + "epoch": 1.8546962804388547, + "grad_norm": 0.5817665457725525, + "learning_rate": 6.461004522513913e-06, + "loss": 0.5102, + "step": 10397 + }, + { + "epoch": 1.854874676656855, + "grad_norm": 0.5208722352981567, + "learning_rate": 6.445188674673474e-06, + "loss": 0.489, + "step": 10398 + }, + { + "epoch": 1.855053072874855, + "grad_norm": 0.4927591383457184, + "learning_rate": 6.429391955523112e-06, + "loss": 0.469, + "step": 10399 + }, + { + "epoch": 1.855231469092855, + "grad_norm": 0.5992792844772339, + "learning_rate": 6.413614366303472e-06, + "loss": 0.5953, + "step": 10400 + }, + { + "epoch": 1.8554098653108553, + "grad_norm": 0.5083011984825134, + "learning_rate": 6.397855908253758e-06, + "loss": 0.5346, + "step": 10401 + }, + { + "epoch": 1.8555882615288555, + "grad_norm": 0.557491660118103, + "learning_rate": 6.382116582611591e-06, + "loss": 0.6326, + "step": 10402 + }, + { + "epoch": 1.8557666577468557, + "grad_norm": 0.5369499325752258, + "learning_rate": 6.36639639061315e-06, + "loss": 0.5539, + "step": 10403 + }, + { + "epoch": 1.8559450539648559, + "grad_norm": 0.5842743515968323, + "learning_rate": 6.350695333493112e-06, + "loss": 0.5387, + "step": 10404 + }, + { + "epoch": 1.856123450182856, + "grad_norm": 0.5030590295791626, + "learning_rate": 6.33501341248463e-06, + "loss": 0.4925, + "step": 10405 + }, + { + "epoch": 1.8563018464008563, + "grad_norm": 0.5017548203468323, + "learning_rate": 6.319350628819304e-06, + "loss": 0.4771, + "step": 10406 + }, + { + "epoch": 1.8564802426188565, + "grad_norm": 0.5642099380493164, + "learning_rate": 6.303706983727286e-06, + "loss": 0.5488, + "step": 10407 + }, + { + "epoch": 1.8566586388368567, + "grad_norm": 0.5220343470573425, + "learning_rate": 6.28808247843729e-06, + "loss": 0.5252, + "step": 10408 + }, + { + "epoch": 1.8568370350548569, + "grad_norm": 0.5058274865150452, + "learning_rate": 6.272477114176417e-06, + "loss": 0.4143, + "step": 10409 + }, + { + "epoch": 1.857015431272857, + "grad_norm": 0.5488356351852417, + "learning_rate": 6.2568908921703245e-06, + "loss": 0.5446, + "step": 10410 + }, + { + "epoch": 1.8571938274908573, + "grad_norm": 0.5424718856811523, + "learning_rate": 6.241323813643091e-06, + "loss": 0.4847, + "step": 10411 + }, + { + "epoch": 1.8573722237088575, + "grad_norm": 0.5151358246803284, + "learning_rate": 6.22577587981743e-06, + "loss": 0.4357, + "step": 10412 + }, + { + "epoch": 1.8575506199268577, + "grad_norm": 0.5469593405723572, + "learning_rate": 6.210247091914395e-06, + "loss": 0.4538, + "step": 10413 + }, + { + "epoch": 1.8577290161448579, + "grad_norm": 0.5348469614982605, + "learning_rate": 6.194737451153648e-06, + "loss": 0.5542, + "step": 10414 + }, + { + "epoch": 1.857907412362858, + "grad_norm": 0.6335954070091248, + "learning_rate": 6.179246958753298e-06, + "loss": 0.57, + "step": 10415 + }, + { + "epoch": 1.858085808580858, + "grad_norm": 0.5610979795455933, + "learning_rate": 6.163775615929984e-06, + "loss": 0.585, + "step": 10416 + }, + { + "epoch": 1.8582642047988582, + "grad_norm": 0.4940567910671234, + "learning_rate": 6.148323423898816e-06, + "loss": 0.4222, + "step": 10417 + }, + { + "epoch": 1.8584426010168584, + "grad_norm": 0.5531600117683411, + "learning_rate": 6.132890383873352e-06, + "loss": 0.4795, + "step": 10418 + }, + { + "epoch": 1.8586209972348586, + "grad_norm": 0.5417333841323853, + "learning_rate": 6.11747649706576e-06, + "loss": 0.5194, + "step": 10419 + }, + { + "epoch": 1.8587993934528588, + "grad_norm": 0.5340380668640137, + "learning_rate": 6.1020817646866014e-06, + "loss": 0.534, + "step": 10420 + }, + { + "epoch": 1.858977789670859, + "grad_norm": 0.5391452312469482, + "learning_rate": 6.08670618794499e-06, + "loss": 0.4972, + "step": 10421 + }, + { + "epoch": 1.859156185888859, + "grad_norm": 0.4881535470485687, + "learning_rate": 6.07134976804849e-06, + "loss": 0.3307, + "step": 10422 + }, + { + "epoch": 1.8593345821068592, + "grad_norm": 0.4824979305267334, + "learning_rate": 6.056012506203218e-06, + "loss": 0.4743, + "step": 10423 + }, + { + "epoch": 1.8595129783248594, + "grad_norm": 0.5851719975471497, + "learning_rate": 6.040694403613767e-06, + "loss": 0.5364, + "step": 10424 + }, + { + "epoch": 1.8596913745428596, + "grad_norm": 0.5611156821250916, + "learning_rate": 6.025395461483174e-06, + "loss": 0.5917, + "step": 10425 + }, + { + "epoch": 1.8598697707608598, + "grad_norm": 0.5232062935829163, + "learning_rate": 6.010115681013034e-06, + "loss": 0.381, + "step": 10426 + }, + { + "epoch": 1.86004816697886, + "grad_norm": 0.5068747401237488, + "learning_rate": 5.994855063403415e-06, + "loss": 0.453, + "step": 10427 + }, + { + "epoch": 1.8602265631968602, + "grad_norm": 0.5189545154571533, + "learning_rate": 5.979613609852885e-06, + "loss": 0.3813, + "step": 10428 + }, + { + "epoch": 1.8604049594148604, + "grad_norm": 0.5572920441627502, + "learning_rate": 5.96439132155846e-06, + "loss": 0.597, + "step": 10429 + }, + { + "epoch": 1.8605833556328606, + "grad_norm": 0.6158145666122437, + "learning_rate": 5.949188199715766e-06, + "loss": 0.6569, + "step": 10430 + }, + { + "epoch": 1.8607617518508608, + "grad_norm": 0.5191422700881958, + "learning_rate": 5.934004245518793e-06, + "loss": 0.401, + "step": 10431 + }, + { + "epoch": 1.860940148068861, + "grad_norm": 0.5226636528968811, + "learning_rate": 5.918839460160086e-06, + "loss": 0.4098, + "step": 10432 + }, + { + "epoch": 1.8611185442868612, + "grad_norm": 0.5896599888801575, + "learning_rate": 5.903693844830693e-06, + "loss": 0.634, + "step": 10433 + }, + { + "epoch": 1.8612969405048614, + "grad_norm": 0.5435173511505127, + "learning_rate": 5.888567400720135e-06, + "loss": 0.4192, + "step": 10434 + }, + { + "epoch": 1.8614753367228616, + "grad_norm": 0.5945151448249817, + "learning_rate": 5.8734601290164615e-06, + "loss": 0.5485, + "step": 10435 + }, + { + "epoch": 1.8616537329408618, + "grad_norm": 0.5320826172828674, + "learning_rate": 5.858372030906167e-06, + "loss": 0.5008, + "step": 10436 + }, + { + "epoch": 1.861832129158862, + "grad_norm": 0.569065272808075, + "learning_rate": 5.843303107574249e-06, + "loss": 0.5214, + "step": 10437 + }, + { + "epoch": 1.862010525376862, + "grad_norm": 0.5648977756500244, + "learning_rate": 5.828253360204261e-06, + "loss": 0.6008, + "step": 10438 + }, + { + "epoch": 1.8621889215948622, + "grad_norm": 0.5260228514671326, + "learning_rate": 5.813222789978173e-06, + "loss": 0.4568, + "step": 10439 + }, + { + "epoch": 1.8623673178128624, + "grad_norm": 0.4943072497844696, + "learning_rate": 5.798211398076486e-06, + "loss": 0.411, + "step": 10440 + }, + { + "epoch": 1.8625457140308626, + "grad_norm": 0.5590642094612122, + "learning_rate": 5.783219185678173e-06, + "loss": 0.4807, + "step": 10441 + }, + { + "epoch": 1.8627241102488628, + "grad_norm": 0.490158349275589, + "learning_rate": 5.768246153960766e-06, + "loss": 0.4316, + "step": 10442 + }, + { + "epoch": 1.862902506466863, + "grad_norm": 0.5113316774368286, + "learning_rate": 5.7532923041001825e-06, + "loss": 0.3966, + "step": 10443 + }, + { + "epoch": 1.863080902684863, + "grad_norm": 0.5623192191123962, + "learning_rate": 5.73835763727093e-06, + "loss": 0.5611, + "step": 10444 + }, + { + "epoch": 1.8632592989028631, + "grad_norm": 0.5246427059173584, + "learning_rate": 5.723442154645931e-06, + "loss": 0.4583, + "step": 10445 + }, + { + "epoch": 1.8634376951208633, + "grad_norm": 0.5438576936721802, + "learning_rate": 5.708545857396663e-06, + "loss": 0.4384, + "step": 10446 + }, + { + "epoch": 1.8636160913388635, + "grad_norm": 0.4891131818294525, + "learning_rate": 5.693668746693109e-06, + "loss": 0.4494, + "step": 10447 + }, + { + "epoch": 1.8637944875568637, + "grad_norm": 0.504887044429779, + "learning_rate": 5.678810823703639e-06, + "loss": 0.414, + "step": 10448 + }, + { + "epoch": 1.863972883774864, + "grad_norm": 0.5959895253181458, + "learning_rate": 5.663972089595265e-06, + "loss": 0.6671, + "step": 10449 + }, + { + "epoch": 1.8641512799928641, + "grad_norm": 0.45866143703460693, + "learning_rate": 5.649152545533331e-06, + "loss": 0.347, + "step": 10450 + }, + { + "epoch": 1.8643296762108643, + "grad_norm": 0.5574930310249329, + "learning_rate": 5.634352192681852e-06, + "loss": 0.6087, + "step": 10451 + }, + { + "epoch": 1.8645080724288645, + "grad_norm": 0.5660715103149414, + "learning_rate": 5.619571032203147e-06, + "loss": 0.5658, + "step": 10452 + }, + { + "epoch": 1.8646864686468647, + "grad_norm": 0.5469009280204773, + "learning_rate": 5.604809065258176e-06, + "loss": 0.5725, + "step": 10453 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.5283670425415039, + "learning_rate": 5.590066293006374e-06, + "loss": 0.4507, + "step": 10454 + }, + { + "epoch": 1.8650432610828651, + "grad_norm": 0.5461406111717224, + "learning_rate": 5.5753427166055635e-06, + "loss": 0.5125, + "step": 10455 + }, + { + "epoch": 1.8652216573008653, + "grad_norm": 0.5304574370384216, + "learning_rate": 5.560638337212126e-06, + "loss": 0.504, + "step": 10456 + }, + { + "epoch": 1.8654000535188655, + "grad_norm": 0.5879049301147461, + "learning_rate": 5.545953155980998e-06, + "loss": 0.5871, + "step": 10457 + }, + { + "epoch": 1.8655784497368657, + "grad_norm": 0.4767381250858307, + "learning_rate": 5.531287174065508e-06, + "loss": 0.4185, + "step": 10458 + }, + { + "epoch": 1.865756845954866, + "grad_norm": 0.5429925918579102, + "learning_rate": 5.516640392617511e-06, + "loss": 0.5435, + "step": 10459 + }, + { + "epoch": 1.8659352421728659, + "grad_norm": 0.5666899085044861, + "learning_rate": 5.502012812787366e-06, + "loss": 0.4597, + "step": 10460 + }, + { + "epoch": 1.866113638390866, + "grad_norm": 0.4842609167098999, + "learning_rate": 5.4874044357239305e-06, + "loss": 0.4877, + "step": 10461 + }, + { + "epoch": 1.8662920346088663, + "grad_norm": 0.5365511178970337, + "learning_rate": 5.4728152625745094e-06, + "loss": 0.5841, + "step": 10462 + }, + { + "epoch": 1.8664704308268665, + "grad_norm": 0.4799635112285614, + "learning_rate": 5.458245294484964e-06, + "loss": 0.4223, + "step": 10463 + }, + { + "epoch": 1.8666488270448667, + "grad_norm": 0.5694946050643921, + "learning_rate": 5.443694532599602e-06, + "loss": 0.6436, + "step": 10464 + }, + { + "epoch": 1.8668272232628669, + "grad_norm": 0.5620405673980713, + "learning_rate": 5.429162978061203e-06, + "loss": 0.5037, + "step": 10465 + }, + { + "epoch": 1.8670056194808669, + "grad_norm": 0.5125569105148315, + "learning_rate": 5.414650632011131e-06, + "loss": 0.5161, + "step": 10466 + }, + { + "epoch": 1.867184015698867, + "grad_norm": 0.5853519439697266, + "learning_rate": 5.400157495589114e-06, + "loss": 0.6433, + "step": 10467 + }, + { + "epoch": 1.8673624119168672, + "grad_norm": 0.46554625034332275, + "learning_rate": 5.385683569933464e-06, + "loss": 0.3893, + "step": 10468 + }, + { + "epoch": 1.8675408081348674, + "grad_norm": 0.547414243221283, + "learning_rate": 5.371228856180993e-06, + "loss": 0.4324, + "step": 10469 + }, + { + "epoch": 1.8677192043528676, + "grad_norm": 0.6247468590736389, + "learning_rate": 5.356793355466933e-06, + "loss": 0.729, + "step": 10470 + }, + { + "epoch": 1.8678976005708678, + "grad_norm": 0.5195764303207397, + "learning_rate": 5.342377068925041e-06, + "loss": 0.4048, + "step": 10471 + }, + { + "epoch": 1.868075996788868, + "grad_norm": 0.4851647615432739, + "learning_rate": 5.327979997687554e-06, + "loss": 0.342, + "step": 10472 + }, + { + "epoch": 1.8682543930068682, + "grad_norm": 0.5746437907218933, + "learning_rate": 5.313602142885232e-06, + "loss": 0.4786, + "step": 10473 + }, + { + "epoch": 1.8684327892248684, + "grad_norm": 0.6188861131668091, + "learning_rate": 5.299243505647283e-06, + "loss": 0.6486, + "step": 10474 + }, + { + "epoch": 1.8686111854428686, + "grad_norm": 0.48796316981315613, + "learning_rate": 5.2849040871015e-06, + "loss": 0.3936, + "step": 10475 + }, + { + "epoch": 1.8687895816608688, + "grad_norm": 0.5425949692726135, + "learning_rate": 5.270583888374009e-06, + "loss": 0.5362, + "step": 10476 + }, + { + "epoch": 1.868967977878869, + "grad_norm": 0.5946719646453857, + "learning_rate": 5.256282910589521e-06, + "loss": 0.5137, + "step": 10477 + }, + { + "epoch": 1.8691463740968692, + "grad_norm": 0.5937526822090149, + "learning_rate": 5.242001154871306e-06, + "loss": 0.5385, + "step": 10478 + }, + { + "epoch": 1.8693247703148694, + "grad_norm": 0.5041108727455139, + "learning_rate": 5.227738622340938e-06, + "loss": 0.4807, + "step": 10479 + }, + { + "epoch": 1.8695031665328696, + "grad_norm": 0.45467957854270935, + "learning_rate": 5.213495314118688e-06, + "loss": 0.3457, + "step": 10480 + }, + { + "epoch": 1.8696815627508698, + "grad_norm": 0.5164430141448975, + "learning_rate": 5.199271231323133e-06, + "loss": 0.4477, + "step": 10481 + }, + { + "epoch": 1.8698599589688698, + "grad_norm": 0.5357136726379395, + "learning_rate": 5.185066375071518e-06, + "loss": 0.4991, + "step": 10482 + }, + { + "epoch": 1.87003835518687, + "grad_norm": 0.4412253797054291, + "learning_rate": 5.170880746479395e-06, + "loss": 0.3304, + "step": 10483 + }, + { + "epoch": 1.8702167514048702, + "grad_norm": 0.4293851852416992, + "learning_rate": 5.156714346660957e-06, + "loss": 0.3241, + "step": 10484 + }, + { + "epoch": 1.8703951476228704, + "grad_norm": 0.5267345309257507, + "learning_rate": 5.142567176728813e-06, + "loss": 0.5351, + "step": 10485 + }, + { + "epoch": 1.8705735438408706, + "grad_norm": 0.5791170001029968, + "learning_rate": 5.128439237794047e-06, + "loss": 0.4566, + "step": 10486 + }, + { + "epoch": 1.8707519400588708, + "grad_norm": 0.4116140305995941, + "learning_rate": 5.114330530966326e-06, + "loss": 0.2923, + "step": 10487 + }, + { + "epoch": 1.8709303362768708, + "grad_norm": 0.4969407618045807, + "learning_rate": 5.100241057353683e-06, + "loss": 0.4083, + "step": 10488 + }, + { + "epoch": 1.871108732494871, + "grad_norm": 0.6181281208992004, + "learning_rate": 5.08617081806273e-06, + "loss": 0.6443, + "step": 10489 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.5408806204795837, + "learning_rate": 5.072119814198528e-06, + "loss": 0.4914, + "step": 10490 + }, + { + "epoch": 1.8714655249308714, + "grad_norm": 0.4671943783760071, + "learning_rate": 5.058088046864611e-06, + "loss": 0.3844, + "step": 10491 + }, + { + "epoch": 1.8716439211488716, + "grad_norm": 0.5404434204101562, + "learning_rate": 5.044075517163071e-06, + "loss": 0.4863, + "step": 10492 + }, + { + "epoch": 1.8718223173668718, + "grad_norm": 0.5988584756851196, + "learning_rate": 5.030082226194415e-06, + "loss": 0.5877, + "step": 10493 + }, + { + "epoch": 1.872000713584872, + "grad_norm": 0.5672785639762878, + "learning_rate": 5.01610817505771e-06, + "loss": 0.4992, + "step": 10494 + }, + { + "epoch": 1.8721791098028722, + "grad_norm": 0.5256212949752808, + "learning_rate": 5.002153364850409e-06, + "loss": 0.4614, + "step": 10495 + }, + { + "epoch": 1.8723575060208724, + "grad_norm": 0.47382408380508423, + "learning_rate": 4.9882177966685814e-06, + "loss": 0.3875, + "step": 10496 + }, + { + "epoch": 1.8725359022388726, + "grad_norm": 0.5471598505973816, + "learning_rate": 4.974301471606685e-06, + "loss": 0.4792, + "step": 10497 + }, + { + "epoch": 1.8727142984568728, + "grad_norm": 0.514196515083313, + "learning_rate": 4.96040439075765e-06, + "loss": 0.5545, + "step": 10498 + }, + { + "epoch": 1.872892694674873, + "grad_norm": 0.5053452849388123, + "learning_rate": 4.946526555213077e-06, + "loss": 0.4108, + "step": 10499 + }, + { + "epoch": 1.8730710908928732, + "grad_norm": 0.6003931164741516, + "learning_rate": 4.9326679660628145e-06, + "loss": 0.5834, + "step": 10500 + }, + { + "epoch": 1.8732494871108734, + "grad_norm": 0.559822678565979, + "learning_rate": 4.918828624395383e-06, + "loss": 0.517, + "step": 10501 + }, + { + "epoch": 1.8734278833288736, + "grad_norm": 0.5788659453392029, + "learning_rate": 4.905008531297661e-06, + "loss": 0.5528, + "step": 10502 + }, + { + "epoch": 1.8736062795468738, + "grad_norm": 0.4903022050857544, + "learning_rate": 4.891207687855115e-06, + "loss": 0.4657, + "step": 10503 + }, + { + "epoch": 1.8737846757648737, + "grad_norm": 0.529029369354248, + "learning_rate": 4.877426095151627e-06, + "loss": 0.4519, + "step": 10504 + }, + { + "epoch": 1.873963071982874, + "grad_norm": 0.48290225863456726, + "learning_rate": 4.863663754269609e-06, + "loss": 0.4803, + "step": 10505 + }, + { + "epoch": 1.8741414682008741, + "grad_norm": 0.5195373296737671, + "learning_rate": 4.849920666289947e-06, + "loss": 0.3987, + "step": 10506 + }, + { + "epoch": 1.8743198644188743, + "grad_norm": 0.537219226360321, + "learning_rate": 4.836196832292e-06, + "loss": 0.5085, + "step": 10507 + }, + { + "epoch": 1.8744982606368745, + "grad_norm": 0.4911697208881378, + "learning_rate": 4.8224922533536834e-06, + "loss": 0.4654, + "step": 10508 + }, + { + "epoch": 1.8746766568548747, + "grad_norm": 0.5637452006340027, + "learning_rate": 4.8088069305513015e-06, + "loss": 0.431, + "step": 10509 + }, + { + "epoch": 1.8748550530728747, + "grad_norm": 0.4910629391670227, + "learning_rate": 4.795140864959718e-06, + "loss": 0.3722, + "step": 10510 + }, + { + "epoch": 1.875033449290875, + "grad_norm": 0.5422062873840332, + "learning_rate": 4.781494057652269e-06, + "loss": 0.5239, + "step": 10511 + }, + { + "epoch": 1.875211845508875, + "grad_norm": 0.5688182711601257, + "learning_rate": 4.767866509700708e-06, + "loss": 0.4991, + "step": 10512 + }, + { + "epoch": 1.8753902417268753, + "grad_norm": 0.5716597437858582, + "learning_rate": 4.754258222175428e-06, + "loss": 0.5913, + "step": 10513 + }, + { + "epoch": 1.8755686379448755, + "grad_norm": 0.5582311749458313, + "learning_rate": 4.740669196145131e-06, + "loss": 0.5269, + "step": 10514 + }, + { + "epoch": 1.8757470341628757, + "grad_norm": 0.5111207365989685, + "learning_rate": 4.727099432677129e-06, + "loss": 0.4636, + "step": 10515 + }, + { + "epoch": 1.875925430380876, + "grad_norm": 0.5934797525405884, + "learning_rate": 4.713548932837208e-06, + "loss": 0.5653, + "step": 10516 + }, + { + "epoch": 1.876103826598876, + "grad_norm": 0.602021336555481, + "learning_rate": 4.700017697689574e-06, + "loss": 0.6561, + "step": 10517 + }, + { + "epoch": 1.8762822228168763, + "grad_norm": 0.5633701086044312, + "learning_rate": 4.686505728297013e-06, + "loss": 0.5126, + "step": 10518 + }, + { + "epoch": 1.8764606190348765, + "grad_norm": 0.5579110383987427, + "learning_rate": 4.6730130257207345e-06, + "loss": 0.5513, + "step": 10519 + }, + { + "epoch": 1.8766390152528767, + "grad_norm": 0.5543150901794434, + "learning_rate": 4.659539591020417e-06, + "loss": 0.5097, + "step": 10520 + }, + { + "epoch": 1.8768174114708769, + "grad_norm": 0.4759593904018402, + "learning_rate": 4.646085425254298e-06, + "loss": 0.4002, + "step": 10521 + }, + { + "epoch": 1.876995807688877, + "grad_norm": 0.6420212388038635, + "learning_rate": 4.632650529479032e-06, + "loss": 0.6743, + "step": 10522 + }, + { + "epoch": 1.8771742039068773, + "grad_norm": 0.483822762966156, + "learning_rate": 4.61923490474983e-06, + "loss": 0.4431, + "step": 10523 + }, + { + "epoch": 1.8773526001248775, + "grad_norm": 0.5354387760162354, + "learning_rate": 4.605838552120295e-06, + "loss": 0.4565, + "step": 10524 + }, + { + "epoch": 1.8775309963428777, + "grad_norm": 0.5774738192558289, + "learning_rate": 4.592461472642611e-06, + "loss": 0.635, + "step": 10525 + }, + { + "epoch": 1.8777093925608777, + "grad_norm": 0.4764508605003357, + "learning_rate": 4.579103667367385e-06, + "loss": 0.357, + "step": 10526 + }, + { + "epoch": 1.8778877887788779, + "grad_norm": 0.5291125774383545, + "learning_rate": 4.565765137343775e-06, + "loss": 0.4858, + "step": 10527 + }, + { + "epoch": 1.878066184996878, + "grad_norm": 0.5577571392059326, + "learning_rate": 4.552445883619305e-06, + "loss": 0.4481, + "step": 10528 + }, + { + "epoch": 1.8782445812148783, + "grad_norm": 0.489429771900177, + "learning_rate": 4.539145907240139e-06, + "loss": 0.4101, + "step": 10529 + }, + { + "epoch": 1.8784229774328784, + "grad_norm": 0.49116069078445435, + "learning_rate": 4.525865209250829e-06, + "loss": 0.3959, + "step": 10530 + }, + { + "epoch": 1.8786013736508786, + "grad_norm": 0.5530174970626831, + "learning_rate": 4.512603790694403e-06, + "loss": 0.6012, + "step": 10531 + }, + { + "epoch": 1.8787797698688786, + "grad_norm": 0.5700563788414001, + "learning_rate": 4.499361652612444e-06, + "loss": 0.5586, + "step": 10532 + }, + { + "epoch": 1.8789581660868788, + "grad_norm": 0.562254011631012, + "learning_rate": 4.486138796044981e-06, + "loss": 0.4436, + "step": 10533 + }, + { + "epoch": 1.879136562304879, + "grad_norm": 0.4984350800514221, + "learning_rate": 4.472935222030544e-06, + "loss": 0.4187, + "step": 10534 + }, + { + "epoch": 1.8793149585228792, + "grad_norm": 0.6067535281181335, + "learning_rate": 4.459750931606083e-06, + "loss": 0.6892, + "step": 10535 + }, + { + "epoch": 1.8794933547408794, + "grad_norm": 0.5302973389625549, + "learning_rate": 4.446585925807129e-06, + "loss": 0.4837, + "step": 10536 + }, + { + "epoch": 1.8796717509588796, + "grad_norm": 0.466941773891449, + "learning_rate": 4.43344020566766e-06, + "loss": 0.393, + "step": 10537 + }, + { + "epoch": 1.8798501471768798, + "grad_norm": 0.4711865186691284, + "learning_rate": 4.420313772220103e-06, + "loss": 0.4012, + "step": 10538 + }, + { + "epoch": 1.88002854339488, + "grad_norm": 0.574239194393158, + "learning_rate": 4.4072066264954355e-06, + "loss": 0.5279, + "step": 10539 + }, + { + "epoch": 1.8802069396128802, + "grad_norm": 0.5025227069854736, + "learning_rate": 4.394118769523059e-06, + "loss": 0.4958, + "step": 10540 + }, + { + "epoch": 1.8803853358308804, + "grad_norm": 0.47873345017433167, + "learning_rate": 4.381050202330927e-06, + "loss": 0.3807, + "step": 10541 + }, + { + "epoch": 1.8805637320488806, + "grad_norm": 0.5627789497375488, + "learning_rate": 4.368000925945386e-06, + "loss": 0.5274, + "step": 10542 + }, + { + "epoch": 1.8807421282668808, + "grad_norm": 0.5269836187362671, + "learning_rate": 4.3549709413913675e-06, + "loss": 0.4638, + "step": 10543 + }, + { + "epoch": 1.880920524484881, + "grad_norm": 0.5568525791168213, + "learning_rate": 4.34196024969219e-06, + "loss": 0.4697, + "step": 10544 + }, + { + "epoch": 1.8810989207028812, + "grad_norm": 0.4760674834251404, + "learning_rate": 4.328968851869758e-06, + "loss": 0.4065, + "step": 10545 + }, + { + "epoch": 1.8812773169208814, + "grad_norm": 0.5974029898643494, + "learning_rate": 4.3159967489443955e-06, + "loss": 0.6253, + "step": 10546 + }, + { + "epoch": 1.8814557131388816, + "grad_norm": 0.49886420369148254, + "learning_rate": 4.3030439419349255e-06, + "loss": 0.4455, + "step": 10547 + }, + { + "epoch": 1.8816341093568816, + "grad_norm": 0.5672736167907715, + "learning_rate": 4.290110431858646e-06, + "loss": 0.4771, + "step": 10548 + }, + { + "epoch": 1.8818125055748818, + "grad_norm": 0.5267834067344666, + "learning_rate": 4.277196219731383e-06, + "loss": 0.5392, + "step": 10549 + }, + { + "epoch": 1.881990901792882, + "grad_norm": 0.5531571507453918, + "learning_rate": 4.264301306567353e-06, + "loss": 0.6175, + "step": 10550 + }, + { + "epoch": 1.8821692980108822, + "grad_norm": 0.5343933701515198, + "learning_rate": 4.251425693379357e-06, + "loss": 0.5271, + "step": 10551 + }, + { + "epoch": 1.8823476942288824, + "grad_norm": 0.5798811316490173, + "learning_rate": 4.238569381178642e-06, + "loss": 0.6559, + "step": 10552 + }, + { + "epoch": 1.8825260904468826, + "grad_norm": 0.5240869522094727, + "learning_rate": 4.225732370974928e-06, + "loss": 0.4699, + "step": 10553 + }, + { + "epoch": 1.8827044866648825, + "grad_norm": 0.5387951731681824, + "learning_rate": 4.212914663776407e-06, + "loss": 0.4743, + "step": 10554 + }, + { + "epoch": 1.8828828828828827, + "grad_norm": 0.5581191778182983, + "learning_rate": 4.200116260589831e-06, + "loss": 0.4566, + "step": 10555 + }, + { + "epoch": 1.883061279100883, + "grad_norm": 0.5226394534111023, + "learning_rate": 4.1873371624203406e-06, + "loss": 0.4776, + "step": 10556 + }, + { + "epoch": 1.8832396753188831, + "grad_norm": 0.5839105844497681, + "learning_rate": 4.174577370271576e-06, + "loss": 0.5059, + "step": 10557 + }, + { + "epoch": 1.8834180715368833, + "grad_norm": 0.5159413814544678, + "learning_rate": 4.161836885145765e-06, + "loss": 0.4771, + "step": 10558 + }, + { + "epoch": 1.8835964677548835, + "grad_norm": 0.5594849586486816, + "learning_rate": 4.149115708043438e-06, + "loss": 0.5485, + "step": 10559 + }, + { + "epoch": 1.8837748639728837, + "grad_norm": 0.5460865497589111, + "learning_rate": 4.136413839963799e-06, + "loss": 0.4878, + "step": 10560 + }, + { + "epoch": 1.883953260190884, + "grad_norm": 0.5533373355865479, + "learning_rate": 4.123731281904408e-06, + "loss": 0.4856, + "step": 10561 + }, + { + "epoch": 1.8841316564088841, + "grad_norm": 0.6217470169067383, + "learning_rate": 4.111068034861359e-06, + "loss": 0.6534, + "step": 10562 + }, + { + "epoch": 1.8843100526268843, + "grad_norm": 0.5605418682098389, + "learning_rate": 4.098424099829218e-06, + "loss": 0.5772, + "step": 10563 + }, + { + "epoch": 1.8844884488448845, + "grad_norm": 0.6147893667221069, + "learning_rate": 4.085799477800995e-06, + "loss": 0.739, + "step": 10564 + }, + { + "epoch": 1.8846668450628847, + "grad_norm": 0.5503641963005066, + "learning_rate": 4.073194169768285e-06, + "loss": 0.5422, + "step": 10565 + }, + { + "epoch": 1.884845241280885, + "grad_norm": 0.5137640833854675, + "learning_rate": 4.0606081767210755e-06, + "loss": 0.417, + "step": 10566 + }, + { + "epoch": 1.8850236374988851, + "grad_norm": 0.46685469150543213, + "learning_rate": 4.048041499647853e-06, + "loss": 0.3761, + "step": 10567 + }, + { + "epoch": 1.8852020337168853, + "grad_norm": 0.4922786056995392, + "learning_rate": 4.035494139535606e-06, + "loss": 0.4437, + "step": 10568 + }, + { + "epoch": 1.8853804299348855, + "grad_norm": 0.5689695477485657, + "learning_rate": 4.0229660973698235e-06, + "loss": 0.5262, + "step": 10569 + }, + { + "epoch": 1.8855588261528857, + "grad_norm": 0.5004351139068604, + "learning_rate": 4.010457374134441e-06, + "loss": 0.389, + "step": 10570 + }, + { + "epoch": 1.8857372223708857, + "grad_norm": 0.5173558592796326, + "learning_rate": 3.997967970811839e-06, + "loss": 0.5562, + "step": 10571 + }, + { + "epoch": 1.885915618588886, + "grad_norm": 0.48343634605407715, + "learning_rate": 3.985497888382983e-06, + "loss": 0.4431, + "step": 10572 + }, + { + "epoch": 1.886094014806886, + "grad_norm": 0.5336623787879944, + "learning_rate": 3.973047127827256e-06, + "loss": 0.4717, + "step": 10573 + }, + { + "epoch": 1.8862724110248863, + "grad_norm": 0.5859199166297913, + "learning_rate": 3.960615690122543e-06, + "loss": 0.5706, + "step": 10574 + }, + { + "epoch": 1.8864508072428865, + "grad_norm": 0.5274314880371094, + "learning_rate": 3.948203576245174e-06, + "loss": 0.5479, + "step": 10575 + }, + { + "epoch": 1.8866292034608865, + "grad_norm": 0.5471269488334656, + "learning_rate": 3.935810787170036e-06, + "loss": 0.5647, + "step": 10576 + }, + { + "epoch": 1.8868075996788867, + "grad_norm": 0.48193615674972534, + "learning_rate": 3.923437323870405e-06, + "loss": 0.4555, + "step": 10577 + }, + { + "epoch": 1.8869859958968869, + "grad_norm": 0.574404776096344, + "learning_rate": 3.911083187318115e-06, + "loss": 0.6331, + "step": 10578 + }, + { + "epoch": 1.887164392114887, + "grad_norm": 0.5169478058815002, + "learning_rate": 3.898748378483474e-06, + "loss": 0.5231, + "step": 10579 + }, + { + "epoch": 1.8873427883328873, + "grad_norm": 0.56248939037323, + "learning_rate": 3.886432898335207e-06, + "loss": 0.6228, + "step": 10580 + }, + { + "epoch": 1.8875211845508875, + "grad_norm": 0.516521692276001, + "learning_rate": 3.874136747840623e-06, + "loss": 0.4946, + "step": 10581 + }, + { + "epoch": 1.8876995807688877, + "grad_norm": 0.5308520793914795, + "learning_rate": 3.861859927965394e-06, + "loss": 0.5038, + "step": 10582 + }, + { + "epoch": 1.8878779769868879, + "grad_norm": 0.5017712712287903, + "learning_rate": 3.849602439673749e-06, + "loss": 0.4414, + "step": 10583 + }, + { + "epoch": 1.888056373204888, + "grad_norm": 0.5355345010757446, + "learning_rate": 3.837364283928446e-06, + "loss": 0.4509, + "step": 10584 + }, + { + "epoch": 1.8882347694228883, + "grad_norm": 0.5393890142440796, + "learning_rate": 3.825145461690577e-06, + "loss": 0.6249, + "step": 10585 + }, + { + "epoch": 1.8884131656408885, + "grad_norm": 0.5255075693130493, + "learning_rate": 3.8129459739198737e-06, + "loss": 0.5318, + "step": 10586 + }, + { + "epoch": 1.8885915618588887, + "grad_norm": 0.5181835293769836, + "learning_rate": 3.800765821574431e-06, + "loss": 0.5787, + "step": 10587 + }, + { + "epoch": 1.8887699580768889, + "grad_norm": 0.5990747213363647, + "learning_rate": 3.7886050056109287e-06, + "loss": 0.5349, + "step": 10588 + }, + { + "epoch": 1.888948354294889, + "grad_norm": 0.49007025361061096, + "learning_rate": 3.7764635269843804e-06, + "loss": 0.3994, + "step": 10589 + }, + { + "epoch": 1.8891267505128893, + "grad_norm": 0.45775508880615234, + "learning_rate": 3.7643413866484678e-06, + "loss": 0.325, + "step": 10590 + }, + { + "epoch": 1.8893051467308895, + "grad_norm": 0.5210441946983337, + "learning_rate": 3.7522385855552067e-06, + "loss": 0.5073, + "step": 10591 + }, + { + "epoch": 1.8894835429488896, + "grad_norm": 0.5008862018585205, + "learning_rate": 3.7401551246551703e-06, + "loss": 0.4124, + "step": 10592 + }, + { + "epoch": 1.8896619391668896, + "grad_norm": 0.5803366899490356, + "learning_rate": 3.728091004897377e-06, + "loss": 0.5425, + "step": 10593 + }, + { + "epoch": 1.8898403353848898, + "grad_norm": 0.48922333121299744, + "learning_rate": 3.7160462272293195e-06, + "loss": 0.461, + "step": 10594 + }, + { + "epoch": 1.89001873160289, + "grad_norm": 0.6023316979408264, + "learning_rate": 3.704020792597018e-06, + "loss": 0.7902, + "step": 10595 + }, + { + "epoch": 1.8901971278208902, + "grad_norm": 0.5757283568382263, + "learning_rate": 3.69201470194494e-06, + "loss": 0.5757, + "step": 10596 + }, + { + "epoch": 1.8903755240388904, + "grad_norm": 0.5134090185165405, + "learning_rate": 3.6800279562160257e-06, + "loss": 0.5602, + "step": 10597 + }, + { + "epoch": 1.8905539202568904, + "grad_norm": 0.49674350023269653, + "learning_rate": 3.6680605563517153e-06, + "loss": 0.4284, + "step": 10598 + }, + { + "epoch": 1.8907323164748906, + "grad_norm": 0.5035196542739868, + "learning_rate": 3.6561125032918975e-06, + "loss": 0.4645, + "step": 10599 + }, + { + "epoch": 1.8909107126928908, + "grad_norm": 0.5700138807296753, + "learning_rate": 3.6441837979750427e-06, + "loss": 0.6076, + "step": 10600 + }, + { + "epoch": 1.891089108910891, + "grad_norm": 0.5219821333885193, + "learning_rate": 3.632274441337957e-06, + "loss": 0.4652, + "step": 10601 + }, + { + "epoch": 1.8912675051288912, + "grad_norm": 0.5027645826339722, + "learning_rate": 3.620384434316004e-06, + "loss": 0.4392, + "step": 10602 + }, + { + "epoch": 1.8914459013468914, + "grad_norm": 0.4853323996067047, + "learning_rate": 3.608513777843048e-06, + "loss": 0.4215, + "step": 10603 + }, + { + "epoch": 1.8916242975648916, + "grad_norm": 0.5268037915229797, + "learning_rate": 3.59666247285137e-06, + "loss": 0.5088, + "step": 10604 + }, + { + "epoch": 1.8918026937828918, + "grad_norm": 0.5432520508766174, + "learning_rate": 3.584830520271809e-06, + "loss": 0.4998, + "step": 10605 + }, + { + "epoch": 1.891981090000892, + "grad_norm": 0.6279560327529907, + "learning_rate": 3.5730179210335946e-06, + "loss": 0.5981, + "step": 10606 + }, + { + "epoch": 1.8921594862188922, + "grad_norm": 0.5216326713562012, + "learning_rate": 3.5612246760645118e-06, + "loss": 0.4698, + "step": 10607 + }, + { + "epoch": 1.8923378824368924, + "grad_norm": 0.5420396327972412, + "learning_rate": 3.54945078629082e-06, + "loss": 0.5827, + "step": 10608 + }, + { + "epoch": 1.8925162786548926, + "grad_norm": 0.5069664716720581, + "learning_rate": 3.5376962526371682e-06, + "loss": 0.4561, + "step": 10609 + }, + { + "epoch": 1.8926946748728928, + "grad_norm": 0.4788225293159485, + "learning_rate": 3.525961076026818e-06, + "loss": 0.4292, + "step": 10610 + }, + { + "epoch": 1.892873071090893, + "grad_norm": 0.5387194156646729, + "learning_rate": 3.514245257381421e-06, + "loss": 0.5415, + "step": 10611 + }, + { + "epoch": 1.8930514673088932, + "grad_norm": 0.5902103185653687, + "learning_rate": 3.50254879762113e-06, + "loss": 0.6707, + "step": 10612 + }, + { + "epoch": 1.8932298635268934, + "grad_norm": 0.4795367121696472, + "learning_rate": 3.490871697664544e-06, + "loss": 0.3785, + "step": 10613 + }, + { + "epoch": 1.8934082597448936, + "grad_norm": 0.5698316097259521, + "learning_rate": 3.4792139584288728e-06, + "loss": 0.5774, + "step": 10614 + }, + { + "epoch": 1.8935866559628936, + "grad_norm": 0.530458927154541, + "learning_rate": 3.4675755808296073e-06, + "loss": 0.5076, + "step": 10615 + }, + { + "epoch": 1.8937650521808937, + "grad_norm": 0.5088088512420654, + "learning_rate": 3.4559565657808766e-06, + "loss": 0.4608, + "step": 10616 + }, + { + "epoch": 1.893943448398894, + "grad_norm": 0.46197885274887085, + "learning_rate": 3.444356914195229e-06, + "loss": 0.355, + "step": 10617 + }, + { + "epoch": 1.8941218446168941, + "grad_norm": 0.5059583783149719, + "learning_rate": 3.432776626983658e-06, + "loss": 0.4677, + "step": 10618 + }, + { + "epoch": 1.8943002408348943, + "grad_norm": 0.519899308681488, + "learning_rate": 3.421215705055741e-06, + "loss": 0.485, + "step": 10619 + }, + { + "epoch": 1.8944786370528943, + "grad_norm": 0.544313907623291, + "learning_rate": 3.4096741493194194e-06, + "loss": 0.5031, + "step": 10620 + }, + { + "epoch": 1.8946570332708945, + "grad_norm": 0.5371688008308411, + "learning_rate": 3.3981519606811616e-06, + "loss": 0.5519, + "step": 10621 + }, + { + "epoch": 1.8948354294888947, + "grad_norm": 0.47906023263931274, + "learning_rate": 3.3866491400459387e-06, + "loss": 0.4269, + "step": 10622 + }, + { + "epoch": 1.895013825706895, + "grad_norm": 0.5047534704208374, + "learning_rate": 3.3751656883171668e-06, + "loss": 0.4814, + "step": 10623 + }, + { + "epoch": 1.8951922219248951, + "grad_norm": 0.5101923942565918, + "learning_rate": 3.363701606396735e-06, + "loss": 0.4778, + "step": 10624 + }, + { + "epoch": 1.8953706181428953, + "grad_norm": 0.46724262833595276, + "learning_rate": 3.352256895185063e-06, + "loss": 0.3659, + "step": 10625 + }, + { + "epoch": 1.8955490143608955, + "grad_norm": 0.5907710194587708, + "learning_rate": 3.3408315555809863e-06, + "loss": 0.4965, + "step": 10626 + }, + { + "epoch": 1.8957274105788957, + "grad_norm": 0.5587756037712097, + "learning_rate": 3.3294255884818435e-06, + "loss": 0.5713, + "step": 10627 + }, + { + "epoch": 1.895905806796896, + "grad_norm": 0.5773804783821106, + "learning_rate": 3.318038994783501e-06, + "loss": 0.5113, + "step": 10628 + }, + { + "epoch": 1.896084203014896, + "grad_norm": 0.5329833626747131, + "learning_rate": 3.306671775380188e-06, + "loss": 0.5319, + "step": 10629 + }, + { + "epoch": 1.8962625992328963, + "grad_norm": 0.541481077671051, + "learning_rate": 3.2953239311647175e-06, + "loss": 0.3925, + "step": 10630 + }, + { + "epoch": 1.8964409954508965, + "grad_norm": 0.5258289575576782, + "learning_rate": 3.2839954630283497e-06, + "loss": 0.4991, + "step": 10631 + }, + { + "epoch": 1.8966193916688967, + "grad_norm": 0.6551702618598938, + "learning_rate": 3.272686371860789e-06, + "loss": 0.6916, + "step": 10632 + }, + { + "epoch": 1.896797787886897, + "grad_norm": 0.5645005702972412, + "learning_rate": 3.2613966585502976e-06, + "loss": 0.5999, + "step": 10633 + }, + { + "epoch": 1.896976184104897, + "grad_norm": 0.7276445627212524, + "learning_rate": 3.2501263239834987e-06, + "loss": 0.4629, + "step": 10634 + }, + { + "epoch": 1.8971545803228973, + "grad_norm": 0.5128322839736938, + "learning_rate": 3.2388753690456296e-06, + "loss": 0.5149, + "step": 10635 + }, + { + "epoch": 1.8973329765408975, + "grad_norm": 0.4655255377292633, + "learning_rate": 3.2276437946202607e-06, + "loss": 0.3665, + "step": 10636 + }, + { + "epoch": 1.8975113727588975, + "grad_norm": 0.5978597402572632, + "learning_rate": 3.2164316015895476e-06, + "loss": 0.6666, + "step": 10637 + }, + { + "epoch": 1.8976897689768977, + "grad_norm": 0.5160049796104431, + "learning_rate": 3.205238790834147e-06, + "loss": 0.4178, + "step": 10638 + }, + { + "epoch": 1.8978681651948979, + "grad_norm": 0.5093833804130554, + "learning_rate": 3.194065363233051e-06, + "loss": 0.5388, + "step": 10639 + }, + { + "epoch": 1.898046561412898, + "grad_norm": 0.5614250898361206, + "learning_rate": 3.1829113196638614e-06, + "loss": 0.4887, + "step": 10640 + }, + { + "epoch": 1.8982249576308983, + "grad_norm": 0.7175207734107971, + "learning_rate": 3.171776661002601e-06, + "loss": 0.4711, + "step": 10641 + }, + { + "epoch": 1.8984033538488982, + "grad_norm": 0.5213533043861389, + "learning_rate": 3.1606613881237924e-06, + "loss": 0.4632, + "step": 10642 + }, + { + "epoch": 1.8985817500668984, + "grad_norm": 0.5014880895614624, + "learning_rate": 3.1495655019004032e-06, + "loss": 0.4144, + "step": 10643 + }, + { + "epoch": 1.8987601462848986, + "grad_norm": 0.5747901201248169, + "learning_rate": 3.138489003203904e-06, + "loss": 0.6008, + "step": 10644 + }, + { + "epoch": 1.8989385425028988, + "grad_norm": 0.5374071598052979, + "learning_rate": 3.1274318929042644e-06, + "loss": 0.435, + "step": 10645 + }, + { + "epoch": 1.899116938720899, + "grad_norm": 0.457996129989624, + "learning_rate": 3.116394171869874e-06, + "loss": 0.4792, + "step": 10646 + }, + { + "epoch": 1.8992953349388992, + "grad_norm": 0.4922611713409424, + "learning_rate": 3.105375840967678e-06, + "loss": 0.3421, + "step": 10647 + }, + { + "epoch": 1.8994737311568994, + "grad_norm": 0.5241499543190002, + "learning_rate": 3.0943769010629565e-06, + "loss": 0.5663, + "step": 10648 + }, + { + "epoch": 1.8996521273748996, + "grad_norm": 0.4872463345527649, + "learning_rate": 3.0833973530196846e-06, + "loss": 0.3615, + "step": 10649 + }, + { + "epoch": 1.8998305235928998, + "grad_norm": 0.5093347430229187, + "learning_rate": 3.07243719770009e-06, + "loss": 0.477, + "step": 10650 + }, + { + "epoch": 1.9000089198109, + "grad_norm": 0.48740455508232117, + "learning_rate": 3.0614964359650112e-06, + "loss": 0.4094, + "step": 10651 + }, + { + "epoch": 1.9001873160289002, + "grad_norm": 0.46468567848205566, + "learning_rate": 3.0505750686737332e-06, + "loss": 0.3435, + "step": 10652 + }, + { + "epoch": 1.9003657122469004, + "grad_norm": 0.49886494874954224, + "learning_rate": 3.0396730966840423e-06, + "loss": 0.4067, + "step": 10653 + }, + { + "epoch": 1.9005441084649006, + "grad_norm": 0.5205758213996887, + "learning_rate": 3.0287905208521427e-06, + "loss": 0.3478, + "step": 10654 + }, + { + "epoch": 1.9007225046829008, + "grad_norm": 0.5665127635002136, + "learning_rate": 3.017927342032767e-06, + "loss": 0.5246, + "step": 10655 + }, + { + "epoch": 1.900900900900901, + "grad_norm": 0.5088105797767639, + "learning_rate": 3.007083561079066e-06, + "loss": 0.3591, + "step": 10656 + }, + { + "epoch": 1.9010792971189012, + "grad_norm": 0.49926015734672546, + "learning_rate": 2.9962591788427473e-06, + "loss": 0.4791, + "step": 10657 + }, + { + "epoch": 1.9012576933369014, + "grad_norm": 0.5580708384513855, + "learning_rate": 2.985454196173937e-06, + "loss": 0.4928, + "step": 10658 + }, + { + "epoch": 1.9014360895549014, + "grad_norm": 0.5072367787361145, + "learning_rate": 2.974668613921261e-06, + "loss": 0.4386, + "step": 10659 + }, + { + "epoch": 1.9016144857729016, + "grad_norm": 0.5622830986976624, + "learning_rate": 2.963902432931792e-06, + "loss": 0.4739, + "step": 10660 + }, + { + "epoch": 1.9017928819909018, + "grad_norm": 0.5466598272323608, + "learning_rate": 2.9531556540511593e-06, + "loss": 0.5038, + "step": 10661 + }, + { + "epoch": 1.901971278208902, + "grad_norm": 0.5235751271247864, + "learning_rate": 2.942428278123327e-06, + "loss": 0.5349, + "step": 10662 + }, + { + "epoch": 1.9021496744269022, + "grad_norm": 0.5213757753372192, + "learning_rate": 2.9317203059908703e-06, + "loss": 0.4482, + "step": 10663 + }, + { + "epoch": 1.9023280706449022, + "grad_norm": 0.537200391292572, + "learning_rate": 2.921031738494784e-06, + "loss": 0.473, + "step": 10664 + }, + { + "epoch": 1.9025064668629024, + "grad_norm": 0.5032278895378113, + "learning_rate": 2.9103625764745344e-06, + "loss": 0.4604, + "step": 10665 + }, + { + "epoch": 1.9026848630809026, + "grad_norm": 0.5438551902770996, + "learning_rate": 2.899712820768091e-06, + "loss": 0.5475, + "step": 10666 + }, + { + "epoch": 1.9028632592989028, + "grad_norm": 0.5733838677406311, + "learning_rate": 2.8890824722118403e-06, + "loss": 0.4181, + "step": 10667 + }, + { + "epoch": 1.903041655516903, + "grad_norm": 0.5377230644226074, + "learning_rate": 2.878471531640725e-06, + "loss": 0.4601, + "step": 10668 + }, + { + "epoch": 1.9032200517349032, + "grad_norm": 0.5529724955558777, + "learning_rate": 2.8678799998881345e-06, + "loss": 0.4844, + "step": 10669 + }, + { + "epoch": 1.9033984479529034, + "grad_norm": 0.5532283186912537, + "learning_rate": 2.8573078777858753e-06, + "loss": 0.5625, + "step": 10670 + }, + { + "epoch": 1.9035768441709036, + "grad_norm": 0.5340506434440613, + "learning_rate": 2.846755166164311e-06, + "loss": 0.4384, + "step": 10671 + }, + { + "epoch": 1.9037552403889038, + "grad_norm": 0.520977258682251, + "learning_rate": 2.836221865852223e-06, + "loss": 0.4364, + "step": 10672 + }, + { + "epoch": 1.903933636606904, + "grad_norm": 0.47905829548835754, + "learning_rate": 2.8257079776769224e-06, + "loss": 0.415, + "step": 10673 + }, + { + "epoch": 1.9041120328249042, + "grad_norm": 0.540252149105072, + "learning_rate": 2.8152135024641366e-06, + "loss": 0.5036, + "step": 10674 + }, + { + "epoch": 1.9042904290429044, + "grad_norm": 0.4911253750324249, + "learning_rate": 2.8047384410381237e-06, + "loss": 0.4359, + "step": 10675 + }, + { + "epoch": 1.9044688252609046, + "grad_norm": 0.5184650421142578, + "learning_rate": 2.7942827942215585e-06, + "loss": 0.4715, + "step": 10676 + }, + { + "epoch": 1.9046472214789048, + "grad_norm": 0.4889717102050781, + "learning_rate": 2.783846562835646e-06, + "loss": 0.3955, + "step": 10677 + }, + { + "epoch": 1.904825617696905, + "grad_norm": 0.5702592134475708, + "learning_rate": 2.7734297477000626e-06, + "loss": 0.5709, + "step": 10678 + }, + { + "epoch": 1.9050040139149051, + "grad_norm": 0.5835392475128174, + "learning_rate": 2.7630323496328767e-06, + "loss": 0.4535, + "step": 10679 + }, + { + "epoch": 1.9051824101329053, + "grad_norm": 0.5429273843765259, + "learning_rate": 2.752654369450769e-06, + "loss": 0.5521, + "step": 10680 + }, + { + "epoch": 1.9053608063509053, + "grad_norm": 0.5422603487968445, + "learning_rate": 2.742295807968781e-06, + "loss": 0.5759, + "step": 10681 + }, + { + "epoch": 1.9055392025689055, + "grad_norm": 0.5456951856613159, + "learning_rate": 2.731956666000457e-06, + "loss": 0.4278, + "step": 10682 + }, + { + "epoch": 1.9057175987869057, + "grad_norm": 0.5333420038223267, + "learning_rate": 2.7216369443578694e-06, + "loss": 0.5113, + "step": 10683 + }, + { + "epoch": 1.905895995004906, + "grad_norm": 0.6626531481742859, + "learning_rate": 2.7113366438515085e-06, + "loss": 0.6753, + "step": 10684 + }, + { + "epoch": 1.9060743912229061, + "grad_norm": 0.5112046003341675, + "learning_rate": 2.701055765290339e-06, + "loss": 0.4424, + "step": 10685 + }, + { + "epoch": 1.906252787440906, + "grad_norm": 0.48551252484321594, + "learning_rate": 2.6907943094818255e-06, + "loss": 0.4418, + "step": 10686 + }, + { + "epoch": 1.9064311836589063, + "grad_norm": 0.5245196223258972, + "learning_rate": 2.680552277231907e-06, + "loss": 0.4654, + "step": 10687 + }, + { + "epoch": 1.9066095798769065, + "grad_norm": 0.5276603102684021, + "learning_rate": 2.6703296693449953e-06, + "loss": 0.4482, + "step": 10688 + }, + { + "epoch": 1.9067879760949067, + "grad_norm": 0.5473247766494751, + "learning_rate": 2.6601264866239485e-06, + "loss": 0.5329, + "step": 10689 + }, + { + "epoch": 1.9069663723129069, + "grad_norm": 0.6117315292358398, + "learning_rate": 2.649942729870125e-06, + "loss": 0.5775, + "step": 10690 + }, + { + "epoch": 1.907144768530907, + "grad_norm": 0.5208797454833984, + "learning_rate": 2.639778399883358e-06, + "loss": 0.5727, + "step": 10691 + }, + { + "epoch": 1.9073231647489073, + "grad_norm": 0.5800744295120239, + "learning_rate": 2.6296334974619528e-06, + "loss": 0.6938, + "step": 10692 + }, + { + "epoch": 1.9075015609669075, + "grad_norm": 0.49815651774406433, + "learning_rate": 2.6195080234026613e-06, + "loss": 0.4241, + "step": 10693 + }, + { + "epoch": 1.9076799571849077, + "grad_norm": 0.515434980392456, + "learning_rate": 2.6094019785007914e-06, + "loss": 0.4804, + "step": 10694 + }, + { + "epoch": 1.9078583534029079, + "grad_norm": 0.5178319215774536, + "learning_rate": 2.5993153635500143e-06, + "loss": 0.3885, + "step": 10695 + }, + { + "epoch": 1.908036749620908, + "grad_norm": 0.48950934410095215, + "learning_rate": 2.589248179342529e-06, + "loss": 0.395, + "step": 10696 + }, + { + "epoch": 1.9082151458389083, + "grad_norm": 0.5800197124481201, + "learning_rate": 2.5792004266690094e-06, + "loss": 0.6178, + "step": 10697 + }, + { + "epoch": 1.9083935420569085, + "grad_norm": 0.6045670509338379, + "learning_rate": 2.569172106318629e-06, + "loss": 0.511, + "step": 10698 + }, + { + "epoch": 1.9085719382749087, + "grad_norm": 0.4140417277812958, + "learning_rate": 2.5591632190790084e-06, + "loss": 0.2933, + "step": 10699 + }, + { + "epoch": 1.9087503344929089, + "grad_norm": 0.5823186635971069, + "learning_rate": 2.5491737657362123e-06, + "loss": 0.4683, + "step": 10700 + }, + { + "epoch": 1.908928730710909, + "grad_norm": 0.5681852102279663, + "learning_rate": 2.5392037470748365e-06, + "loss": 0.5934, + "step": 10701 + }, + { + "epoch": 1.9091071269289093, + "grad_norm": 0.5418094992637634, + "learning_rate": 2.5292531638778926e-06, + "loss": 0.5774, + "step": 10702 + }, + { + "epoch": 1.9092855231469092, + "grad_norm": 0.47225964069366455, + "learning_rate": 2.519322016926895e-06, + "loss": 0.3773, + "step": 10703 + }, + { + "epoch": 1.9094639193649094, + "grad_norm": 0.5559024810791016, + "learning_rate": 2.5094103070018858e-06, + "loss": 0.3676, + "step": 10704 + }, + { + "epoch": 1.9096423155829096, + "grad_norm": 0.5408554077148438, + "learning_rate": 2.499518034881271e-06, + "loss": 0.3963, + "step": 10705 + }, + { + "epoch": 1.9098207118009098, + "grad_norm": 0.4998513162136078, + "learning_rate": 2.4896452013420114e-06, + "loss": 0.3837, + "step": 10706 + }, + { + "epoch": 1.90999910801891, + "grad_norm": 0.5326787233352661, + "learning_rate": 2.4797918071594594e-06, + "loss": 0.5206, + "step": 10707 + }, + { + "epoch": 1.91017750423691, + "grad_norm": 0.5712464451789856, + "learning_rate": 2.469957853107607e-06, + "loss": 0.5556, + "step": 10708 + }, + { + "epoch": 1.9103559004549102, + "grad_norm": 0.5410659909248352, + "learning_rate": 2.460143339958726e-06, + "loss": 0.4366, + "step": 10709 + }, + { + "epoch": 1.9105342966729104, + "grad_norm": 0.5795556902885437, + "learning_rate": 2.4503482684836154e-06, + "loss": 0.5793, + "step": 10710 + }, + { + "epoch": 1.9107126928909106, + "grad_norm": 0.6724327802658081, + "learning_rate": 2.440572639451688e-06, + "loss": 0.6335, + "step": 10711 + }, + { + "epoch": 1.9108910891089108, + "grad_norm": 0.44440463185310364, + "learning_rate": 2.4308164536306075e-06, + "loss": 0.3103, + "step": 10712 + }, + { + "epoch": 1.911069485326911, + "grad_norm": 0.49097147583961487, + "learning_rate": 2.421079711786678e-06, + "loss": 0.3918, + "step": 10713 + }, + { + "epoch": 1.9112478815449112, + "grad_norm": 0.5756399035453796, + "learning_rate": 2.4113624146846205e-06, + "loss": 0.669, + "step": 10714 + }, + { + "epoch": 1.9114262777629114, + "grad_norm": 0.5272496938705444, + "learning_rate": 2.4016645630875744e-06, + "loss": 0.4483, + "step": 10715 + }, + { + "epoch": 1.9116046739809116, + "grad_norm": 0.561396062374115, + "learning_rate": 2.391986157757292e-06, + "loss": 0.6147, + "step": 10716 + }, + { + "epoch": 1.9117830701989118, + "grad_norm": 0.6622545123100281, + "learning_rate": 2.382327199453832e-06, + "loss": 0.4827, + "step": 10717 + }, + { + "epoch": 1.911961466416912, + "grad_norm": 0.5634920597076416, + "learning_rate": 2.372687688935837e-06, + "loss": 0.5104, + "step": 10718 + }, + { + "epoch": 1.9121398626349122, + "grad_norm": 0.5393690466880798, + "learning_rate": 2.363067626960397e-06, + "loss": 0.4723, + "step": 10719 + }, + { + "epoch": 1.9123182588529124, + "grad_norm": 0.501996636390686, + "learning_rate": 2.353467014283073e-06, + "loss": 0.4674, + "step": 10720 + }, + { + "epoch": 1.9124966550709126, + "grad_norm": 0.5242270827293396, + "learning_rate": 2.3438858516578733e-06, + "loss": 0.4293, + "step": 10721 + }, + { + "epoch": 1.9126750512889128, + "grad_norm": 0.5785874128341675, + "learning_rate": 2.3343241398372796e-06, + "loss": 0.5556, + "step": 10722 + }, + { + "epoch": 1.912853447506913, + "grad_norm": 0.49174314737319946, + "learning_rate": 2.324781879572302e-06, + "loss": 0.3852, + "step": 10723 + }, + { + "epoch": 1.9130318437249132, + "grad_norm": 0.56098473072052, + "learning_rate": 2.3152590716123688e-06, + "loss": 0.5528, + "step": 10724 + }, + { + "epoch": 1.9132102399429132, + "grad_norm": 0.5013071894645691, + "learning_rate": 2.3057557167054377e-06, + "loss": 0.4798, + "step": 10725 + }, + { + "epoch": 1.9133886361609134, + "grad_norm": 0.5687433481216431, + "learning_rate": 2.2962718155978e-06, + "loss": 0.5398, + "step": 10726 + }, + { + "epoch": 1.9135670323789136, + "grad_norm": 0.5144028067588806, + "learning_rate": 2.286807369034416e-06, + "loss": 0.3893, + "step": 10727 + }, + { + "epoch": 1.9137454285969138, + "grad_norm": 0.48423799872398376, + "learning_rate": 2.2773623777585796e-06, + "loss": 0.5236, + "step": 10728 + }, + { + "epoch": 1.913923824814914, + "grad_norm": 0.5465452671051025, + "learning_rate": 2.267936842512086e-06, + "loss": 0.5458, + "step": 10729 + }, + { + "epoch": 1.914102221032914, + "grad_norm": 0.4663107991218567, + "learning_rate": 2.2585307640352326e-06, + "loss": 0.4132, + "step": 10730 + }, + { + "epoch": 1.9142806172509141, + "grad_norm": 0.5530097484588623, + "learning_rate": 2.2491441430667614e-06, + "loss": 0.6104, + "step": 10731 + }, + { + "epoch": 1.9144590134689143, + "grad_norm": 0.5279607176780701, + "learning_rate": 2.239776980343916e-06, + "loss": 0.4833, + "step": 10732 + }, + { + "epoch": 1.9146374096869145, + "grad_norm": 0.554326057434082, + "learning_rate": 2.2304292766023304e-06, + "loss": 0.5667, + "step": 10733 + }, + { + "epoch": 1.9148158059049147, + "grad_norm": 0.5720328688621521, + "learning_rate": 2.2211010325762227e-06, + "loss": 0.5472, + "step": 10734 + }, + { + "epoch": 1.914994202122915, + "grad_norm": 0.6049355864524841, + "learning_rate": 2.211792248998229e-06, + "loss": 0.7378, + "step": 10735 + }, + { + "epoch": 1.9151725983409151, + "grad_norm": 0.5384811758995056, + "learning_rate": 2.2025029265994036e-06, + "loss": 0.588, + "step": 10736 + }, + { + "epoch": 1.9153509945589153, + "grad_norm": 0.5090281963348389, + "learning_rate": 2.1932330661093845e-06, + "loss": 0.5302, + "step": 10737 + }, + { + "epoch": 1.9155293907769155, + "grad_norm": 0.5153109431266785, + "learning_rate": 2.183982668256201e-06, + "loss": 0.5014, + "step": 10738 + }, + { + "epoch": 1.9157077869949157, + "grad_norm": 0.5636624693870544, + "learning_rate": 2.1747517337663836e-06, + "loss": 0.6512, + "step": 10739 + }, + { + "epoch": 1.915886183212916, + "grad_norm": 0.5594334006309509, + "learning_rate": 2.1655402633648792e-06, + "loss": 0.5636, + "step": 10740 + }, + { + "epoch": 1.9160645794309161, + "grad_norm": 0.5760837197303772, + "learning_rate": 2.1563482577752482e-06, + "loss": 0.5173, + "step": 10741 + }, + { + "epoch": 1.9162429756489163, + "grad_norm": 0.5558215379714966, + "learning_rate": 2.1471757177193295e-06, + "loss": 0.486, + "step": 10742 + }, + { + "epoch": 1.9164213718669165, + "grad_norm": 0.4847973883152008, + "learning_rate": 2.1380226439175478e-06, + "loss": 0.3862, + "step": 10743 + }, + { + "epoch": 1.9165997680849167, + "grad_norm": 0.548984944820404, + "learning_rate": 2.1288890370888546e-06, + "loss": 0.546, + "step": 10744 + }, + { + "epoch": 1.916778164302917, + "grad_norm": 0.5168630480766296, + "learning_rate": 2.119774897950538e-06, + "loss": 0.5107, + "step": 10745 + }, + { + "epoch": 1.9169565605209171, + "grad_norm": 0.5281267762184143, + "learning_rate": 2.1106802272184146e-06, + "loss": 0.4927, + "step": 10746 + }, + { + "epoch": 1.917134956738917, + "grad_norm": 0.5126060843467712, + "learning_rate": 2.1016050256068007e-06, + "loss": 0.3642, + "step": 10747 + }, + { + "epoch": 1.9173133529569173, + "grad_norm": 0.5802842378616333, + "learning_rate": 2.092549293828433e-06, + "loss": 0.5518, + "step": 10748 + }, + { + "epoch": 1.9174917491749175, + "grad_norm": 0.5918566584587097, + "learning_rate": 2.0835130325946026e-06, + "loss": 0.5455, + "step": 10749 + }, + { + "epoch": 1.9176701453929177, + "grad_norm": 0.5723239779472351, + "learning_rate": 2.0744962426149374e-06, + "loss": 0.5907, + "step": 10750 + }, + { + "epoch": 1.917848541610918, + "grad_norm": 0.4773622751235962, + "learning_rate": 2.0654989245976473e-06, + "loss": 0.4396, + "step": 10751 + }, + { + "epoch": 1.9180269378289179, + "grad_norm": 0.5784128308296204, + "learning_rate": 2.0565210792493903e-06, + "loss": 0.6173, + "step": 10752 + }, + { + "epoch": 1.918205334046918, + "grad_norm": 0.5458529591560364, + "learning_rate": 2.047562707275269e-06, + "loss": 0.5314, + "step": 10753 + }, + { + "epoch": 1.9183837302649183, + "grad_norm": 0.4943179488182068, + "learning_rate": 2.038623809378859e-06, + "loss": 0.3984, + "step": 10754 + }, + { + "epoch": 1.9185621264829185, + "grad_norm": 0.6198917031288147, + "learning_rate": 2.029704386262238e-06, + "loss": 0.7195, + "step": 10755 + }, + { + "epoch": 1.9187405227009187, + "grad_norm": 0.5691092610359192, + "learning_rate": 2.020804438625928e-06, + "loss": 0.6608, + "step": 10756 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 0.6087114214897156, + "learning_rate": 2.011923967168899e-06, + "loss": 0.7876, + "step": 10757 + }, + { + "epoch": 1.919097315136919, + "grad_norm": 0.5224511027336121, + "learning_rate": 2.0030629725886763e-06, + "loss": 0.418, + "step": 10758 + }, + { + "epoch": 1.9192757113549193, + "grad_norm": 0.5266249775886536, + "learning_rate": 1.9942214555811187e-06, + "loss": 0.4646, + "step": 10759 + }, + { + "epoch": 1.9194541075729195, + "grad_norm": 0.5271503925323486, + "learning_rate": 1.9853994168407273e-06, + "loss": 0.4795, + "step": 10760 + }, + { + "epoch": 1.9196325037909197, + "grad_norm": 0.5206458568572998, + "learning_rate": 1.9765968570603365e-06, + "loss": 0.449, + "step": 10761 + }, + { + "epoch": 1.9198109000089199, + "grad_norm": 0.5074752569198608, + "learning_rate": 1.9678137769312543e-06, + "loss": 0.4446, + "step": 10762 + }, + { + "epoch": 1.91998929622692, + "grad_norm": 0.4914679229259491, + "learning_rate": 1.9590501771433735e-06, + "loss": 0.4738, + "step": 10763 + }, + { + "epoch": 1.9201676924449202, + "grad_norm": 0.5547556281089783, + "learning_rate": 1.9503060583849485e-06, + "loss": 0.5181, + "step": 10764 + }, + { + "epoch": 1.9203460886629204, + "grad_norm": 0.45473459362983704, + "learning_rate": 1.941581421342764e-06, + "loss": 0.3413, + "step": 10765 + }, + { + "epoch": 1.9205244848809206, + "grad_norm": 0.5488914251327515, + "learning_rate": 1.932876266701994e-06, + "loss": 0.5253, + "step": 10766 + }, + { + "epoch": 1.9207028810989208, + "grad_norm": 0.5307526588439941, + "learning_rate": 1.924190595146369e-06, + "loss": 0.5097, + "step": 10767 + }, + { + "epoch": 1.920881277316921, + "grad_norm": 0.530610203742981, + "learning_rate": 1.9155244073580936e-06, + "loss": 0.4198, + "step": 10768 + }, + { + "epoch": 1.921059673534921, + "grad_norm": 0.48655280470848083, + "learning_rate": 1.9068777040177342e-06, + "loss": 0.4815, + "step": 10769 + }, + { + "epoch": 1.9212380697529212, + "grad_norm": 0.5858240127563477, + "learning_rate": 1.8982504858044703e-06, + "loss": 0.5908, + "step": 10770 + }, + { + "epoch": 1.9214164659709214, + "grad_norm": 0.5390949249267578, + "learning_rate": 1.889642753395815e-06, + "loss": 0.5308, + "step": 10771 + }, + { + "epoch": 1.9215948621889216, + "grad_norm": 0.4654117524623871, + "learning_rate": 1.8810545074678664e-06, + "loss": 0.3985, + "step": 10772 + }, + { + "epoch": 1.9217732584069218, + "grad_norm": 0.570939838886261, + "learning_rate": 1.872485748695113e-06, + "loss": 0.5217, + "step": 10773 + }, + { + "epoch": 1.9219516546249218, + "grad_norm": 0.5508378148078918, + "learning_rate": 1.8639364777505442e-06, + "loss": 0.5657, + "step": 10774 + }, + { + "epoch": 1.922130050842922, + "grad_norm": 0.49652281403541565, + "learning_rate": 1.8554066953056502e-06, + "loss": 0.4462, + "step": 10775 + }, + { + "epoch": 1.9223084470609222, + "grad_norm": 0.5410851836204529, + "learning_rate": 1.8468964020302847e-06, + "loss": 0.5473, + "step": 10776 + }, + { + "epoch": 1.9224868432789224, + "grad_norm": 0.5557335615158081, + "learning_rate": 1.8384055985929405e-06, + "loss": 0.6211, + "step": 10777 + }, + { + "epoch": 1.9226652394969226, + "grad_norm": 0.49317964911460876, + "learning_rate": 1.8299342856603895e-06, + "loss": 0.4219, + "step": 10778 + }, + { + "epoch": 1.9228436357149228, + "grad_norm": 0.5330275297164917, + "learning_rate": 1.8214824638980166e-06, + "loss": 0.4602, + "step": 10779 + }, + { + "epoch": 1.923022031932923, + "grad_norm": 0.5371294617652893, + "learning_rate": 1.8130501339696237e-06, + "loss": 0.4941, + "step": 10780 + }, + { + "epoch": 1.9232004281509232, + "grad_norm": 0.5647792220115662, + "learning_rate": 1.8046372965374592e-06, + "loss": 0.5879, + "step": 10781 + }, + { + "epoch": 1.9233788243689234, + "grad_norm": 0.5888603329658508, + "learning_rate": 1.796243952262272e-06, + "loss": 0.6028, + "step": 10782 + }, + { + "epoch": 1.9235572205869236, + "grad_norm": 0.5580531358718872, + "learning_rate": 1.787870101803285e-06, + "loss": 0.5612, + "step": 10783 + }, + { + "epoch": 1.9237356168049238, + "grad_norm": 0.5175449848175049, + "learning_rate": 1.7795157458181389e-06, + "loss": 0.4524, + "step": 10784 + }, + { + "epoch": 1.923914013022924, + "grad_norm": 0.5534098148345947, + "learning_rate": 1.7711808849630307e-06, + "loss": 0.5132, + "step": 10785 + }, + { + "epoch": 1.9240924092409242, + "grad_norm": 0.5773023962974548, + "learning_rate": 1.7628655198925481e-06, + "loss": 0.5995, + "step": 10786 + }, + { + "epoch": 1.9242708054589244, + "grad_norm": 0.4896835386753082, + "learning_rate": 1.7545696512597797e-06, + "loss": 0.4349, + "step": 10787 + }, + { + "epoch": 1.9244492016769246, + "grad_norm": 0.4547155797481537, + "learning_rate": 1.7462932797163156e-06, + "loss": 0.3644, + "step": 10788 + }, + { + "epoch": 1.9246275978949248, + "grad_norm": 0.5199649333953857, + "learning_rate": 1.738036405912108e-06, + "loss": 0.5188, + "step": 10789 + }, + { + "epoch": 1.924805994112925, + "grad_norm": 0.5165232419967651, + "learning_rate": 1.7297990304956934e-06, + "loss": 0.4517, + "step": 10790 + }, + { + "epoch": 1.924984390330925, + "grad_norm": 0.5304819345474243, + "learning_rate": 1.721581154114027e-06, + "loss": 0.4473, + "step": 10791 + }, + { + "epoch": 1.9251627865489251, + "grad_norm": 0.582391619682312, + "learning_rate": 1.7133827774125365e-06, + "loss": 0.5413, + "step": 10792 + }, + { + "epoch": 1.9253411827669253, + "grad_norm": 0.5489014983177185, + "learning_rate": 1.7052039010350962e-06, + "loss": 0.5362, + "step": 10793 + }, + { + "epoch": 1.9255195789849255, + "grad_norm": 0.560737669467926, + "learning_rate": 1.6970445256241363e-06, + "loss": 0.5321, + "step": 10794 + }, + { + "epoch": 1.9256979752029257, + "grad_norm": 0.5768745541572571, + "learning_rate": 1.6889046518203943e-06, + "loss": 0.5531, + "step": 10795 + }, + { + "epoch": 1.9258763714209257, + "grad_norm": 0.4823215901851654, + "learning_rate": 1.6807842802632756e-06, + "loss": 0.4796, + "step": 10796 + }, + { + "epoch": 1.926054767638926, + "grad_norm": 0.5584346055984497, + "learning_rate": 1.6726834115904643e-06, + "loss": 0.5695, + "step": 10797 + }, + { + "epoch": 1.9262331638569261, + "grad_norm": 0.6224119067192078, + "learning_rate": 1.6646020464382294e-06, + "loss": 0.59, + "step": 10798 + }, + { + "epoch": 1.9264115600749263, + "grad_norm": 0.5279660820960999, + "learning_rate": 1.6565401854413132e-06, + "loss": 0.5258, + "step": 10799 + }, + { + "epoch": 1.9265899562929265, + "grad_norm": 0.529108464717865, + "learning_rate": 1.648497829232848e-06, + "loss": 0.4812, + "step": 10800 + }, + { + "epoch": 1.9267683525109267, + "grad_norm": 0.6457714438438416, + "learning_rate": 1.6404749784444673e-06, + "loss": 0.5662, + "step": 10801 + }, + { + "epoch": 1.926946748728927, + "grad_norm": 0.5989267230033875, + "learning_rate": 1.632471633706334e-06, + "loss": 0.5611, + "step": 10802 + }, + { + "epoch": 1.927125144946927, + "grad_norm": 0.4550947844982147, + "learning_rate": 1.6244877956469728e-06, + "loss": 0.3234, + "step": 10803 + }, + { + "epoch": 1.9273035411649273, + "grad_norm": 0.49629008769989014, + "learning_rate": 1.6165234648934657e-06, + "loss": 0.4037, + "step": 10804 + }, + { + "epoch": 1.9274819373829275, + "grad_norm": 0.44870060682296753, + "learning_rate": 1.6085786420713123e-06, + "loss": 0.2973, + "step": 10805 + }, + { + "epoch": 1.9276603336009277, + "grad_norm": 0.5849354267120361, + "learning_rate": 1.6006533278045131e-06, + "loss": 0.6132, + "step": 10806 + }, + { + "epoch": 1.927838729818928, + "grad_norm": 0.4785587787628174, + "learning_rate": 1.5927475227155152e-06, + "loss": 0.3464, + "step": 10807 + }, + { + "epoch": 1.928017126036928, + "grad_norm": 0.4874267876148224, + "learning_rate": 1.5848612274252105e-06, + "loss": 0.4023, + "step": 10808 + }, + { + "epoch": 1.9281955222549283, + "grad_norm": 0.48776915669441223, + "learning_rate": 1.5769944425530202e-06, + "loss": 0.3752, + "step": 10809 + }, + { + "epoch": 1.9283739184729285, + "grad_norm": 0.612381100654602, + "learning_rate": 1.5691471687167558e-06, + "loss": 0.5973, + "step": 10810 + }, + { + "epoch": 1.9285523146909287, + "grad_norm": 0.5721031427383423, + "learning_rate": 1.561319406532785e-06, + "loss": 0.5869, + "step": 10811 + }, + { + "epoch": 1.928730710908929, + "grad_norm": 0.524873673915863, + "learning_rate": 1.5535111566158667e-06, + "loss": 0.4592, + "step": 10812 + }, + { + "epoch": 1.9289091071269289, + "grad_norm": 0.5512157678604126, + "learning_rate": 1.5457224195792873e-06, + "loss": 0.5546, + "step": 10813 + }, + { + "epoch": 1.929087503344929, + "grad_norm": 0.562592625617981, + "learning_rate": 1.5379531960347247e-06, + "loss": 0.4041, + "step": 10814 + }, + { + "epoch": 1.9292658995629293, + "grad_norm": 0.5566669702529907, + "learning_rate": 1.530203486592413e-06, + "loss": 0.4858, + "step": 10815 + }, + { + "epoch": 1.9294442957809295, + "grad_norm": 0.5376592874526978, + "learning_rate": 1.5224732918609762e-06, + "loss": 0.5448, + "step": 10816 + }, + { + "epoch": 1.9296226919989297, + "grad_norm": 0.46925434470176697, + "learning_rate": 1.5147626124475955e-06, + "loss": 0.4111, + "step": 10817 + }, + { + "epoch": 1.9298010882169296, + "grad_norm": 0.5781516432762146, + "learning_rate": 1.5070714489577864e-06, + "loss": 0.471, + "step": 10818 + }, + { + "epoch": 1.9299794844349298, + "grad_norm": 0.7423487305641174, + "learning_rate": 1.4993998019956767e-06, + "loss": 0.4972, + "step": 10819 + }, + { + "epoch": 1.93015788065293, + "grad_norm": 0.487798273563385, + "learning_rate": 1.4917476721637569e-06, + "loss": 0.4267, + "step": 10820 + }, + { + "epoch": 1.9303362768709302, + "grad_norm": 0.47426673769950867, + "learning_rate": 1.4841150600630183e-06, + "loss": 0.3958, + "step": 10821 + }, + { + "epoch": 1.9305146730889304, + "grad_norm": 0.4360140562057495, + "learning_rate": 1.4765019662929812e-06, + "loss": 0.3328, + "step": 10822 + }, + { + "epoch": 1.9306930693069306, + "grad_norm": 0.49829891324043274, + "learning_rate": 1.468908391451501e-06, + "loss": 0.3546, + "step": 10823 + }, + { + "epoch": 1.9308714655249308, + "grad_norm": 0.5402979254722595, + "learning_rate": 1.4613343361349897e-06, + "loss": 0.5048, + "step": 10824 + }, + { + "epoch": 1.931049861742931, + "grad_norm": 0.49174410104751587, + "learning_rate": 1.4537798009383596e-06, + "loss": 0.4204, + "step": 10825 + }, + { + "epoch": 1.9312282579609312, + "grad_norm": 0.5174482464790344, + "learning_rate": 1.4462447864548866e-06, + "loss": 0.4889, + "step": 10826 + }, + { + "epoch": 1.9314066541789314, + "grad_norm": 0.5101271271705627, + "learning_rate": 1.4387292932764029e-06, + "loss": 0.447, + "step": 10827 + }, + { + "epoch": 1.9315850503969316, + "grad_norm": 0.5471371412277222, + "learning_rate": 1.4312333219931307e-06, + "loss": 0.4726, + "step": 10828 + }, + { + "epoch": 1.9317634466149318, + "grad_norm": 0.5995847582817078, + "learning_rate": 1.4237568731938488e-06, + "loss": 0.5217, + "step": 10829 + }, + { + "epoch": 1.931941842832932, + "grad_norm": 0.5153140425682068, + "learning_rate": 1.4162999474657267e-06, + "loss": 0.4917, + "step": 10830 + }, + { + "epoch": 1.9321202390509322, + "grad_norm": 0.4703426957130432, + "learning_rate": 1.4088625453944348e-06, + "loss": 0.4425, + "step": 10831 + }, + { + "epoch": 1.9322986352689324, + "grad_norm": 0.5764087438583374, + "learning_rate": 1.401444667564089e-06, + "loss": 0.553, + "step": 10832 + }, + { + "epoch": 1.9324770314869326, + "grad_norm": 0.7190420627593994, + "learning_rate": 1.3940463145573068e-06, + "loss": 0.5883, + "step": 10833 + }, + { + "epoch": 1.9326554277049328, + "grad_norm": 0.5590774416923523, + "learning_rate": 1.3866674869551232e-06, + "loss": 0.5026, + "step": 10834 + }, + { + "epoch": 1.9328338239229328, + "grad_norm": 0.5439345240592957, + "learning_rate": 1.3793081853371026e-06, + "loss": 0.5682, + "step": 10835 + }, + { + "epoch": 1.933012220140933, + "grad_norm": 0.52024245262146, + "learning_rate": 1.3719684102812547e-06, + "loss": 0.428, + "step": 10836 + }, + { + "epoch": 1.9331906163589332, + "grad_norm": 0.5542047023773193, + "learning_rate": 1.3646481623639794e-06, + "loss": 0.5323, + "step": 10837 + }, + { + "epoch": 1.9333690125769334, + "grad_norm": 0.5421310663223267, + "learning_rate": 1.3573474421602617e-06, + "loss": 0.4396, + "step": 10838 + }, + { + "epoch": 1.9335474087949336, + "grad_norm": 0.4774705767631531, + "learning_rate": 1.3500662502434758e-06, + "loss": 0.4457, + "step": 10839 + }, + { + "epoch": 1.9337258050129336, + "grad_norm": 0.5488468408584595, + "learning_rate": 1.34280458718547e-06, + "loss": 0.5401, + "step": 10840 + }, + { + "epoch": 1.9339042012309338, + "grad_norm": 0.4551815986633301, + "learning_rate": 1.3355624535565936e-06, + "loss": 0.3325, + "step": 10841 + }, + { + "epoch": 1.934082597448934, + "grad_norm": 0.5352870225906372, + "learning_rate": 1.3283398499256138e-06, + "loss": 0.5896, + "step": 10842 + }, + { + "epoch": 1.9342609936669342, + "grad_norm": 0.5666049718856812, + "learning_rate": 1.3211367768598548e-06, + "loss": 0.5675, + "step": 10843 + }, + { + "epoch": 1.9344393898849344, + "grad_norm": 0.5233065485954285, + "learning_rate": 1.3139532349249473e-06, + "loss": 0.5271, + "step": 10844 + }, + { + "epoch": 1.9346177861029346, + "grad_norm": 0.47093701362609863, + "learning_rate": 1.3067892246851897e-06, + "loss": 0.3446, + "step": 10845 + }, + { + "epoch": 1.9347961823209348, + "grad_norm": 0.5583943724632263, + "learning_rate": 1.2996447467031326e-06, + "loss": 0.5945, + "step": 10846 + }, + { + "epoch": 1.934974578538935, + "grad_norm": 0.4825393259525299, + "learning_rate": 1.292519801539993e-06, + "loss": 0.369, + "step": 10847 + }, + { + "epoch": 1.9351529747569352, + "grad_norm": 0.6124469637870789, + "learning_rate": 1.2854143897552961e-06, + "loss": 0.5456, + "step": 10848 + }, + { + "epoch": 1.9353313709749353, + "grad_norm": 0.4597680866718292, + "learning_rate": 1.278328511907123e-06, + "loss": 0.3464, + "step": 10849 + }, + { + "epoch": 1.9355097671929355, + "grad_norm": 0.529517650604248, + "learning_rate": 1.2712621685520287e-06, + "loss": 0.4432, + "step": 10850 + }, + { + "epoch": 1.9356881634109357, + "grad_norm": 0.52036052942276, + "learning_rate": 1.2642153602449303e-06, + "loss": 0.488, + "step": 10851 + }, + { + "epoch": 1.935866559628936, + "grad_norm": 0.5084417462348938, + "learning_rate": 1.2571880875393293e-06, + "loss": 0.4408, + "step": 10852 + }, + { + "epoch": 1.9360449558469361, + "grad_norm": 0.5051507353782654, + "learning_rate": 1.2501803509871457e-06, + "loss": 0.4353, + "step": 10853 + }, + { + "epoch": 1.9362233520649363, + "grad_norm": 0.4483596980571747, + "learning_rate": 1.2431921511387167e-06, + "loss": 0.3902, + "step": 10854 + }, + { + "epoch": 1.9364017482829365, + "grad_norm": 0.5327636003494263, + "learning_rate": 1.2362234885429646e-06, + "loss": 0.5355, + "step": 10855 + }, + { + "epoch": 1.9365801445009367, + "grad_norm": 0.5151218175888062, + "learning_rate": 1.229274363747146e-06, + "loss": 0.5091, + "step": 10856 + }, + { + "epoch": 1.9367585407189367, + "grad_norm": 0.4956924319267273, + "learning_rate": 1.2223447772970742e-06, + "loss": 0.4205, + "step": 10857 + }, + { + "epoch": 1.936936936936937, + "grad_norm": 0.4500170052051544, + "learning_rate": 1.2154347297369806e-06, + "loss": 0.3676, + "step": 10858 + }, + { + "epoch": 1.9371153331549371, + "grad_norm": 0.44707950949668884, + "learning_rate": 1.2085442216095977e-06, + "loss": 0.3637, + "step": 10859 + }, + { + "epoch": 1.9372937293729373, + "grad_norm": 0.5490984916687012, + "learning_rate": 1.201673253456076e-06, + "loss": 0.498, + "step": 10860 + }, + { + "epoch": 1.9374721255909375, + "grad_norm": 0.5158944725990295, + "learning_rate": 1.194821825816067e-06, + "loss": 0.4, + "step": 10861 + }, + { + "epoch": 1.9376505218089375, + "grad_norm": 0.519051194190979, + "learning_rate": 1.1879899392276961e-06, + "loss": 0.4514, + "step": 10862 + }, + { + "epoch": 1.9378289180269377, + "grad_norm": 0.5193673968315125, + "learning_rate": 1.1811775942275061e-06, + "loss": 0.5199, + "step": 10863 + }, + { + "epoch": 1.9380073142449379, + "grad_norm": 0.600447952747345, + "learning_rate": 1.174384791350569e-06, + "loss": 0.6023, + "step": 10864 + }, + { + "epoch": 1.938185710462938, + "grad_norm": 0.5528981685638428, + "learning_rate": 1.1676115311303747e-06, + "loss": 0.4367, + "step": 10865 + }, + { + "epoch": 1.9383641066809383, + "grad_norm": 0.5125691890716553, + "learning_rate": 1.160857814098859e-06, + "loss": 0.4637, + "step": 10866 + }, + { + "epoch": 1.9385425028989385, + "grad_norm": 0.5695079565048218, + "learning_rate": 1.154123640786514e-06, + "loss": 0.573, + "step": 10867 + }, + { + "epoch": 1.9387208991169387, + "grad_norm": 0.5509238243103027, + "learning_rate": 1.1474090117221947e-06, + "loss": 0.5835, + "step": 10868 + }, + { + "epoch": 1.9388992953349389, + "grad_norm": 0.5293681025505066, + "learning_rate": 1.1407139274333124e-06, + "loss": 0.5052, + "step": 10869 + }, + { + "epoch": 1.939077691552939, + "grad_norm": 0.5039731860160828, + "learning_rate": 1.1340383884456407e-06, + "loss": 0.4013, + "step": 10870 + }, + { + "epoch": 1.9392560877709393, + "grad_norm": 0.5404682159423828, + "learning_rate": 1.1273823952835106e-06, + "loss": 0.5339, + "step": 10871 + }, + { + "epoch": 1.9394344839889395, + "grad_norm": 0.49914366006851196, + "learning_rate": 1.1207459484696424e-06, + "loss": 0.5104, + "step": 10872 + }, + { + "epoch": 1.9396128802069397, + "grad_norm": 0.5963449478149414, + "learning_rate": 1.1141290485253141e-06, + "loss": 0.5313, + "step": 10873 + }, + { + "epoch": 1.9397912764249399, + "grad_norm": 0.5142577886581421, + "learning_rate": 1.107531695970193e-06, + "loss": 0.4771, + "step": 10874 + }, + { + "epoch": 1.93996967264294, + "grad_norm": 0.49809107184410095, + "learning_rate": 1.1009538913223927e-06, + "loss": 0.4144, + "step": 10875 + }, + { + "epoch": 1.9401480688609403, + "grad_norm": 0.5837324261665344, + "learning_rate": 1.0943956350985828e-06, + "loss": 0.6171, + "step": 10876 + }, + { + "epoch": 1.9403264650789405, + "grad_norm": 0.48720839619636536, + "learning_rate": 1.0878569278138239e-06, + "loss": 0.331, + "step": 10877 + }, + { + "epoch": 1.9405048612969407, + "grad_norm": 0.5243699550628662, + "learning_rate": 1.0813377699816773e-06, + "loss": 0.4904, + "step": 10878 + }, + { + "epoch": 1.9406832575149406, + "grad_norm": 0.564947783946991, + "learning_rate": 1.07483816211415e-06, + "loss": 0.6612, + "step": 10879 + }, + { + "epoch": 1.9408616537329408, + "grad_norm": 0.5145286321640015, + "learning_rate": 1.068358104721695e-06, + "loss": 0.5127, + "step": 10880 + }, + { + "epoch": 1.941040049950941, + "grad_norm": 0.5018771290779114, + "learning_rate": 1.0618975983132662e-06, + "loss": 0.4624, + "step": 10881 + }, + { + "epoch": 1.9412184461689412, + "grad_norm": 0.490432471036911, + "learning_rate": 1.055456643396291e-06, + "loss": 0.4464, + "step": 10882 + }, + { + "epoch": 1.9413968423869414, + "grad_norm": 0.45111989974975586, + "learning_rate": 1.0490352404766146e-06, + "loss": 0.3621, + "step": 10883 + }, + { + "epoch": 1.9415752386049414, + "grad_norm": 0.5172296762466431, + "learning_rate": 1.042633390058556e-06, + "loss": 0.4011, + "step": 10884 + }, + { + "epoch": 1.9417536348229416, + "grad_norm": 0.5376213192939758, + "learning_rate": 1.036251092644963e-06, + "loss": 0.5768, + "step": 10885 + }, + { + "epoch": 1.9419320310409418, + "grad_norm": 0.5566000938415527, + "learning_rate": 1.0298883487370736e-06, + "loss": 0.5342, + "step": 10886 + }, + { + "epoch": 1.942110427258942, + "grad_norm": 0.5264571309089661, + "learning_rate": 1.023545158834599e-06, + "loss": 0.4918, + "step": 10887 + }, + { + "epoch": 1.9422888234769422, + "grad_norm": 0.4640972316265106, + "learning_rate": 1.0172215234357519e-06, + "loss": 0.3828, + "step": 10888 + }, + { + "epoch": 1.9424672196949424, + "grad_norm": 0.5521480441093445, + "learning_rate": 1.0109174430371905e-06, + "loss": 0.4931, + "step": 10889 + }, + { + "epoch": 1.9426456159129426, + "grad_norm": 0.48122310638427734, + "learning_rate": 1.0046329181340187e-06, + "loss": 0.3782, + "step": 10890 + }, + { + "epoch": 1.9428240121309428, + "grad_norm": 0.5191644430160522, + "learning_rate": 9.983679492198139e-07, + "loss": 0.4283, + "step": 10891 + }, + { + "epoch": 1.943002408348943, + "grad_norm": 0.48529642820358276, + "learning_rate": 9.92122536786627e-07, + "loss": 0.3826, + "step": 10892 + }, + { + "epoch": 1.9431808045669432, + "grad_norm": 0.5107811689376831, + "learning_rate": 9.858966813250102e-07, + "loss": 0.497, + "step": 10893 + }, + { + "epoch": 1.9433592007849434, + "grad_norm": 0.4362351596355438, + "learning_rate": 9.796903833239057e-07, + "loss": 0.3437, + "step": 10894 + }, + { + "epoch": 1.9435375970029436, + "grad_norm": 0.5843741297721863, + "learning_rate": 9.73503643270729e-07, + "loss": 0.6999, + "step": 10895 + }, + { + "epoch": 1.9437159932209438, + "grad_norm": 0.49152183532714844, + "learning_rate": 9.673364616514247e-07, + "loss": 0.4538, + "step": 10896 + }, + { + "epoch": 1.943894389438944, + "grad_norm": 0.6454296112060547, + "learning_rate": 9.611888389503553e-07, + "loss": 0.5617, + "step": 10897 + }, + { + "epoch": 1.9440727856569442, + "grad_norm": 0.5631362795829773, + "learning_rate": 9.55060775650357e-07, + "loss": 0.502, + "step": 10898 + }, + { + "epoch": 1.9442511818749444, + "grad_norm": 0.5519803762435913, + "learning_rate": 9.489522722326838e-07, + "loss": 0.6406, + "step": 10899 + }, + { + "epoch": 1.9444295780929446, + "grad_norm": 0.5168196558952332, + "learning_rate": 9.42863329177146e-07, + "loss": 0.3918, + "step": 10900 + }, + { + "epoch": 1.9446079743109446, + "grad_norm": 0.5088583827018738, + "learning_rate": 9.367939469619169e-07, + "loss": 0.3789, + "step": 10901 + }, + { + "epoch": 1.9447863705289448, + "grad_norm": 0.5261327624320984, + "learning_rate": 9.307441260637261e-07, + "loss": 0.5678, + "step": 10902 + }, + { + "epoch": 1.944964766746945, + "grad_norm": 0.5280705094337463, + "learning_rate": 9.247138669577215e-07, + "loss": 0.4663, + "step": 10903 + }, + { + "epoch": 1.9451431629649452, + "grad_norm": 0.5464169383049011, + "learning_rate": 9.187031701174963e-07, + "loss": 0.5453, + "step": 10904 + }, + { + "epoch": 1.9453215591829454, + "grad_norm": 0.587669312953949, + "learning_rate": 9.127120360151175e-07, + "loss": 0.5612, + "step": 10905 + }, + { + "epoch": 1.9454999554009453, + "grad_norm": 0.512403130531311, + "learning_rate": 9.067404651211808e-07, + "loss": 0.4129, + "step": 10906 + }, + { + "epoch": 1.9456783516189455, + "grad_norm": 0.6009140014648438, + "learning_rate": 9.007884579046444e-07, + "loss": 0.5581, + "step": 10907 + }, + { + "epoch": 1.9458567478369457, + "grad_norm": 0.5417647957801819, + "learning_rate": 8.948560148329676e-07, + "loss": 0.5139, + "step": 10908 + }, + { + "epoch": 1.946035144054946, + "grad_norm": 0.5130847692489624, + "learning_rate": 8.889431363721112e-07, + "loss": 0.4498, + "step": 10909 + }, + { + "epoch": 1.9462135402729461, + "grad_norm": 0.5233637094497681, + "learning_rate": 8.830498229864537e-07, + "loss": 0.4592, + "step": 10910 + }, + { + "epoch": 1.9463919364909463, + "grad_norm": 0.5232832431793213, + "learning_rate": 8.771760751388746e-07, + "loss": 0.44, + "step": 10911 + }, + { + "epoch": 1.9465703327089465, + "grad_norm": 0.47777479887008667, + "learning_rate": 8.713218932906719e-07, + "loss": 0.4183, + "step": 10912 + }, + { + "epoch": 1.9467487289269467, + "grad_norm": 0.5267250537872314, + "learning_rate": 8.654872779016443e-07, + "loss": 0.4499, + "step": 10913 + }, + { + "epoch": 1.946927125144947, + "grad_norm": 0.5199578404426575, + "learning_rate": 8.596722294300364e-07, + "loss": 0.4039, + "step": 10914 + }, + { + "epoch": 1.9471055213629471, + "grad_norm": 0.4668145477771759, + "learning_rate": 8.538767483325383e-07, + "loss": 0.3787, + "step": 10915 + }, + { + "epoch": 1.9472839175809473, + "grad_norm": 0.5497430562973022, + "learning_rate": 8.481008350643693e-07, + "loss": 0.5505, + "step": 10916 + }, + { + "epoch": 1.9474623137989475, + "grad_norm": 0.5314509272575378, + "learning_rate": 8.423444900791111e-07, + "loss": 0.4944, + "step": 10917 + }, + { + "epoch": 1.9476407100169477, + "grad_norm": 0.46878379583358765, + "learning_rate": 8.366077138289296e-07, + "loss": 0.3517, + "step": 10918 + }, + { + "epoch": 1.947819106234948, + "grad_norm": 0.4314865469932556, + "learning_rate": 8.308905067643536e-07, + "loss": 0.3136, + "step": 10919 + }, + { + "epoch": 1.9479975024529481, + "grad_norm": 0.5057112574577332, + "learning_rate": 8.251928693343846e-07, + "loss": 0.5543, + "step": 10920 + }, + { + "epoch": 1.9481758986709483, + "grad_norm": 0.5008751153945923, + "learning_rate": 8.19514801986554e-07, + "loss": 0.4436, + "step": 10921 + }, + { + "epoch": 1.9483542948889485, + "grad_norm": 0.49664172530174255, + "learning_rate": 8.138563051667824e-07, + "loss": 0.3823, + "step": 10922 + }, + { + "epoch": 1.9485326911069485, + "grad_norm": 0.9543033838272095, + "learning_rate": 8.082173793195479e-07, + "loss": 0.448, + "step": 10923 + }, + { + "epoch": 1.9487110873249487, + "grad_norm": 0.49950093030929565, + "learning_rate": 8.025980248876352e-07, + "loss": 0.3344, + "step": 10924 + }, + { + "epoch": 1.9488894835429489, + "grad_norm": 0.5399156808853149, + "learning_rate": 7.969982423124689e-07, + "loss": 0.5778, + "step": 10925 + }, + { + "epoch": 1.949067879760949, + "grad_norm": 0.5570793151855469, + "learning_rate": 7.914180320338082e-07, + "loss": 0.511, + "step": 10926 + }, + { + "epoch": 1.9492462759789493, + "grad_norm": 0.5615115761756897, + "learning_rate": 7.858573944899139e-07, + "loss": 0.5364, + "step": 10927 + }, + { + "epoch": 1.9494246721969493, + "grad_norm": 0.5255836844444275, + "learning_rate": 7.803163301175753e-07, + "loss": 0.5906, + "step": 10928 + }, + { + "epoch": 1.9496030684149495, + "grad_norm": 0.5021457672119141, + "learning_rate": 7.747948393519167e-07, + "loss": 0.4357, + "step": 10929 + }, + { + "epoch": 1.9497814646329497, + "grad_norm": 0.5015770196914673, + "learning_rate": 7.692929226266188e-07, + "loss": 0.4276, + "step": 10930 + }, + { + "epoch": 1.9499598608509499, + "grad_norm": 0.5243127346038818, + "learning_rate": 7.638105803738083e-07, + "loss": 0.5425, + "step": 10931 + }, + { + "epoch": 1.95013825706895, + "grad_norm": 0.47330746054649353, + "learning_rate": 7.583478130240851e-07, + "loss": 0.3797, + "step": 10932 + }, + { + "epoch": 1.9503166532869503, + "grad_norm": 0.49246537685394287, + "learning_rate": 7.529046210064394e-07, + "loss": 0.435, + "step": 10933 + }, + { + "epoch": 1.9504950495049505, + "grad_norm": 0.5126248598098755, + "learning_rate": 7.474810047484182e-07, + "loss": 0.4749, + "step": 10934 + }, + { + "epoch": 1.9506734457229506, + "grad_norm": 0.5344600081443787, + "learning_rate": 7.420769646759584e-07, + "loss": 0.4556, + "step": 10935 + }, + { + "epoch": 1.9508518419409508, + "grad_norm": 0.5256651043891907, + "learning_rate": 7.366925012135262e-07, + "loss": 0.5272, + "step": 10936 + }, + { + "epoch": 1.951030238158951, + "grad_norm": 0.49008166790008545, + "learning_rate": 7.313276147840053e-07, + "loss": 0.4495, + "step": 10937 + }, + { + "epoch": 1.9512086343769512, + "grad_norm": 0.4859989285469055, + "learning_rate": 7.259823058087256e-07, + "loss": 0.3527, + "step": 10938 + }, + { + "epoch": 1.9513870305949514, + "grad_norm": 0.4771299958229065, + "learning_rate": 7.206565747075178e-07, + "loss": 0.4565, + "step": 10939 + }, + { + "epoch": 1.9515654268129516, + "grad_norm": 0.49878621101379395, + "learning_rate": 7.153504218986862e-07, + "loss": 0.39, + "step": 10940 + }, + { + "epoch": 1.9517438230309518, + "grad_norm": 0.6032863855361938, + "learning_rate": 7.10063847798953e-07, + "loss": 0.4377, + "step": 10941 + }, + { + "epoch": 1.951922219248952, + "grad_norm": 0.5695745348930359, + "learning_rate": 7.047968528235416e-07, + "loss": 0.5126, + "step": 10942 + }, + { + "epoch": 1.9521006154669522, + "grad_norm": 0.567709743976593, + "learning_rate": 6.995494373860656e-07, + "loss": 0.5469, + "step": 10943 + }, + { + "epoch": 1.9522790116849524, + "grad_norm": 0.516247034072876, + "learning_rate": 6.943216018987508e-07, + "loss": 0.437, + "step": 10944 + }, + { + "epoch": 1.9524574079029524, + "grad_norm": 0.5342845916748047, + "learning_rate": 6.891133467721022e-07, + "loss": 0.4906, + "step": 10945 + }, + { + "epoch": 1.9526358041209526, + "grad_norm": 0.5445780158042908, + "learning_rate": 6.839246724151815e-07, + "loss": 0.4014, + "step": 10946 + }, + { + "epoch": 1.9528142003389528, + "grad_norm": 0.5402600169181824, + "learning_rate": 6.787555792355793e-07, + "loss": 0.5584, + "step": 10947 + }, + { + "epoch": 1.952992596556953, + "grad_norm": 0.5336621403694153, + "learning_rate": 6.736060676391653e-07, + "loss": 0.435, + "step": 10948 + }, + { + "epoch": 1.9531709927749532, + "grad_norm": 0.6429981589317322, + "learning_rate": 6.684761380304772e-07, + "loss": 0.4705, + "step": 10949 + }, + { + "epoch": 1.9533493889929532, + "grad_norm": 0.4981113374233246, + "learning_rate": 6.633657908123592e-07, + "loss": 0.484, + "step": 10950 + }, + { + "epoch": 1.9535277852109534, + "grad_norm": 0.5144602656364441, + "learning_rate": 6.582750263862125e-07, + "loss": 0.5402, + "step": 10951 + }, + { + "epoch": 1.9537061814289536, + "grad_norm": 0.5602140426635742, + "learning_rate": 6.532038451518286e-07, + "loss": 0.498, + "step": 10952 + }, + { + "epoch": 1.9538845776469538, + "grad_norm": 0.5775460600852966, + "learning_rate": 6.481522475075274e-07, + "loss": 0.5634, + "step": 10953 + }, + { + "epoch": 1.954062973864954, + "grad_norm": 0.4967274069786072, + "learning_rate": 6.431202338500475e-07, + "loss": 0.3654, + "step": 10954 + }, + { + "epoch": 1.9542413700829542, + "grad_norm": 0.5827997326850891, + "learning_rate": 6.381078045745725e-07, + "loss": 0.6665, + "step": 10955 + }, + { + "epoch": 1.9544197663009544, + "grad_norm": 0.5111951231956482, + "learning_rate": 6.331149600748154e-07, + "loss": 0.4386, + "step": 10956 + }, + { + "epoch": 1.9545981625189546, + "grad_norm": 0.4768127202987671, + "learning_rate": 6.281417007429069e-07, + "loss": 0.4033, + "step": 10957 + }, + { + "epoch": 1.9547765587369548, + "grad_norm": 0.5813576579093933, + "learning_rate": 6.231880269694235e-07, + "loss": 0.7205, + "step": 10958 + }, + { + "epoch": 1.954954954954955, + "grad_norm": 0.4979632496833801, + "learning_rate": 6.182539391434428e-07, + "loss": 0.3926, + "step": 10959 + }, + { + "epoch": 1.9551333511729552, + "grad_norm": 0.4942147433757782, + "learning_rate": 6.133394376524604e-07, + "loss": 0.4401, + "step": 10960 + }, + { + "epoch": 1.9553117473909554, + "grad_norm": 0.5690099000930786, + "learning_rate": 6.084445228825009e-07, + "loss": 0.5989, + "step": 10961 + }, + { + "epoch": 1.9554901436089556, + "grad_norm": 0.5280899405479431, + "learning_rate": 6.035691952179789e-07, + "loss": 0.4661, + "step": 10962 + }, + { + "epoch": 1.9556685398269558, + "grad_norm": 0.49601706862449646, + "learning_rate": 5.987134550418105e-07, + "loss": 0.4591, + "step": 10963 + }, + { + "epoch": 1.955846936044956, + "grad_norm": 0.5549279451370239, + "learning_rate": 5.938773027353572e-07, + "loss": 0.4277, + "step": 10964 + }, + { + "epoch": 1.9560253322629562, + "grad_norm": 0.6416307687759399, + "learning_rate": 5.890607386784818e-07, + "loss": 0.6958, + "step": 10965 + }, + { + "epoch": 1.9562037284809564, + "grad_norm": 0.5070635080337524, + "learning_rate": 5.842637632494097e-07, + "loss": 0.4569, + "step": 10966 + }, + { + "epoch": 1.9563821246989563, + "grad_norm": 0.5254069566726685, + "learning_rate": 5.794863768249503e-07, + "loss": 0.5185, + "step": 10967 + }, + { + "epoch": 1.9565605209169565, + "grad_norm": 0.48455336689949036, + "learning_rate": 5.747285797802759e-07, + "loss": 0.403, + "step": 10968 + }, + { + "epoch": 1.9567389171349567, + "grad_norm": 0.574587345123291, + "learning_rate": 5.699903724891153e-07, + "loss": 0.7152, + "step": 10969 + }, + { + "epoch": 1.956917313352957, + "grad_norm": 0.5534584522247314, + "learning_rate": 5.652717553235597e-07, + "loss": 0.5475, + "step": 10970 + }, + { + "epoch": 1.9570957095709571, + "grad_norm": 0.5564422011375427, + "learning_rate": 5.605727286542017e-07, + "loss": 0.544, + "step": 10971 + }, + { + "epoch": 1.957274105788957, + "grad_norm": 0.5110240578651428, + "learning_rate": 5.558932928501347e-07, + "loss": 0.4253, + "step": 10972 + }, + { + "epoch": 1.9574525020069573, + "grad_norm": 0.5717257857322693, + "learning_rate": 5.512334482788428e-07, + "loss": 0.685, + "step": 10973 + }, + { + "epoch": 1.9576308982249575, + "grad_norm": 0.5493577718734741, + "learning_rate": 5.465931953063663e-07, + "loss": 0.459, + "step": 10974 + }, + { + "epoch": 1.9578092944429577, + "grad_norm": 0.5325579643249512, + "learning_rate": 5.419725342970805e-07, + "loss": 0.4834, + "step": 10975 + }, + { + "epoch": 1.957987690660958, + "grad_norm": 0.5637785792350769, + "learning_rate": 5.37371465613945e-07, + "loss": 0.5612, + "step": 10976 + }, + { + "epoch": 1.958166086878958, + "grad_norm": 0.5901339054107666, + "learning_rate": 5.32789989618282e-07, + "loss": 0.6958, + "step": 10977 + }, + { + "epoch": 1.9583444830969583, + "grad_norm": 0.49308058619499207, + "learning_rate": 5.282281066699701e-07, + "loss": 0.4053, + "step": 10978 + }, + { + "epoch": 1.9585228793149585, + "grad_norm": 0.5078552961349487, + "learning_rate": 5.236858171272229e-07, + "loss": 0.433, + "step": 10979 + }, + { + "epoch": 1.9587012755329587, + "grad_norm": 0.5317704677581787, + "learning_rate": 5.191631213468661e-07, + "loss": 0.4935, + "step": 10980 + }, + { + "epoch": 1.958879671750959, + "grad_norm": 0.606121301651001, + "learning_rate": 5.1466001968406e-07, + "loss": 0.6044, + "step": 10981 + }, + { + "epoch": 1.959058067968959, + "grad_norm": 0.49278080463409424, + "learning_rate": 5.101765124925217e-07, + "loss": 0.3807, + "step": 10982 + }, + { + "epoch": 1.9592364641869593, + "grad_norm": 0.5795280933380127, + "learning_rate": 5.057126001243306e-07, + "loss": 0.5853, + "step": 10983 + }, + { + "epoch": 1.9594148604049595, + "grad_norm": 0.5039269924163818, + "learning_rate": 5.01268282930123e-07, + "loss": 0.4857, + "step": 10984 + }, + { + "epoch": 1.9595932566229597, + "grad_norm": 0.5751108527183533, + "learning_rate": 4.968435612588973e-07, + "loss": 0.5788, + "step": 10985 + }, + { + "epoch": 1.9597716528409599, + "grad_norm": 0.5224045515060425, + "learning_rate": 4.924384354582645e-07, + "loss": 0.505, + "step": 10986 + }, + { + "epoch": 1.95995004905896, + "grad_norm": 0.6626994609832764, + "learning_rate": 4.880529058741146e-07, + "loss": 0.586, + "step": 10987 + }, + { + "epoch": 1.9601284452769603, + "grad_norm": 0.571682333946228, + "learning_rate": 4.836869728508941e-07, + "loss": 0.4711, + "step": 10988 + }, + { + "epoch": 1.9603068414949603, + "grad_norm": 0.5564180016517639, + "learning_rate": 4.793406367315512e-07, + "loss": 0.5446, + "step": 10989 + }, + { + "epoch": 1.9604852377129605, + "grad_norm": 0.5148413777351379, + "learning_rate": 4.750138978574237e-07, + "loss": 0.5736, + "step": 10990 + }, + { + "epoch": 1.9606636339309607, + "grad_norm": 0.5848867297172546, + "learning_rate": 4.7070675656832316e-07, + "loss": 0.5588, + "step": 10991 + }, + { + "epoch": 1.9608420301489609, + "grad_norm": 0.5551292896270752, + "learning_rate": 4.6641921320253465e-07, + "loss": 0.5043, + "step": 10992 + }, + { + "epoch": 1.961020426366961, + "grad_norm": 0.543438732624054, + "learning_rate": 4.6215126809678875e-07, + "loss": 0.4619, + "step": 10993 + }, + { + "epoch": 1.961198822584961, + "grad_norm": 0.4963773190975189, + "learning_rate": 4.579029215862895e-07, + "loss": 0.4392, + "step": 10994 + }, + { + "epoch": 1.9613772188029612, + "grad_norm": 0.5400619506835938, + "learning_rate": 4.5367417400471454e-07, + "loss": 0.6237, + "step": 10995 + }, + { + "epoch": 1.9615556150209614, + "grad_norm": 0.5189566612243652, + "learning_rate": 4.49465025684187e-07, + "loss": 0.416, + "step": 10996 + }, + { + "epoch": 1.9617340112389616, + "grad_norm": 0.5119401216506958, + "learning_rate": 4.452754769553036e-07, + "loss": 0.4979, + "step": 10997 + }, + { + "epoch": 1.9619124074569618, + "grad_norm": 0.4972620904445648, + "learning_rate": 4.4110552814707884e-07, + "loss": 0.4132, + "step": 10998 + }, + { + "epoch": 1.962090803674962, + "grad_norm": 0.6130009889602661, + "learning_rate": 4.3695517958702856e-07, + "loss": 0.6817, + "step": 10999 + }, + { + "epoch": 1.9622691998929622, + "grad_norm": 0.5110456347465515, + "learning_rate": 4.328244316011143e-07, + "loss": 0.3981, + "step": 11000 + }, + { + "epoch": 1.9624475961109624, + "grad_norm": 0.5126025676727295, + "learning_rate": 4.287132845137709e-07, + "loss": 0.4441, + "step": 11001 + }, + { + "epoch": 1.9626259923289626, + "grad_norm": 0.5288227796554565, + "learning_rate": 4.246217386479068e-07, + "loss": 0.4437, + "step": 11002 + }, + { + "epoch": 1.9628043885469628, + "grad_norm": 0.5382393002510071, + "learning_rate": 4.2054979432482044e-07, + "loss": 0.571, + "step": 11003 + }, + { + "epoch": 1.962982784764963, + "grad_norm": 0.4760775566101074, + "learning_rate": 4.164974518643672e-07, + "loss": 0.4336, + "step": 11004 + }, + { + "epoch": 1.9631611809829632, + "grad_norm": 0.5154813528060913, + "learning_rate": 4.1246471158482015e-07, + "loss": 0.4456, + "step": 11005 + }, + { + "epoch": 1.9633395772009634, + "grad_norm": 0.5530751943588257, + "learning_rate": 4.0845157380287047e-07, + "loss": 0.5216, + "step": 11006 + }, + { + "epoch": 1.9635179734189636, + "grad_norm": 0.5821408033370972, + "learning_rate": 4.044580388337105e-07, + "loss": 0.6407, + "step": 11007 + }, + { + "epoch": 1.9636963696369638, + "grad_norm": 0.5298261642456055, + "learning_rate": 4.0048410699103365e-07, + "loss": 0.5755, + "step": 11008 + }, + { + "epoch": 1.963874765854964, + "grad_norm": 0.5816518068313599, + "learning_rate": 3.9652977858692375e-07, + "loss": 0.6441, + "step": 11009 + }, + { + "epoch": 1.9640531620729642, + "grad_norm": 0.5004158020019531, + "learning_rate": 3.9259505393193785e-07, + "loss": 0.4389, + "step": 11010 + }, + { + "epoch": 1.9642315582909642, + "grad_norm": 0.4865829348564148, + "learning_rate": 3.886799333351343e-07, + "loss": 0.4019, + "step": 11011 + }, + { + "epoch": 1.9644099545089644, + "grad_norm": 0.48600196838378906, + "learning_rate": 3.847844171039616e-07, + "loss": 0.3581, + "step": 11012 + }, + { + "epoch": 1.9645883507269646, + "grad_norm": 0.4642302393913269, + "learning_rate": 3.809085055444528e-07, + "loss": 0.3521, + "step": 11013 + }, + { + "epoch": 1.9647667469449648, + "grad_norm": 0.45133063197135925, + "learning_rate": 3.770521989609199e-07, + "loss": 0.3936, + "step": 11014 + }, + { + "epoch": 1.964945143162965, + "grad_norm": 0.4991499185562134, + "learning_rate": 3.7321549765631514e-07, + "loss": 0.3983, + "step": 11015 + }, + { + "epoch": 1.9651235393809652, + "grad_norm": 0.526542067527771, + "learning_rate": 3.6939840193195296e-07, + "loss": 0.4563, + "step": 11016 + }, + { + "epoch": 1.9653019355989652, + "grad_norm": 0.5464054346084595, + "learning_rate": 3.656009120875936e-07, + "loss": 0.4337, + "step": 11017 + }, + { + "epoch": 1.9654803318169654, + "grad_norm": 0.43672722578048706, + "learning_rate": 3.618230284215263e-07, + "loss": 0.2778, + "step": 11018 + }, + { + "epoch": 1.9656587280349656, + "grad_norm": 0.6000391244888306, + "learning_rate": 3.580647512304303e-07, + "loss": 0.5887, + "step": 11019 + }, + { + "epoch": 1.9658371242529658, + "grad_norm": 0.5648049712181091, + "learning_rate": 3.543260808095139e-07, + "loss": 0.623, + "step": 11020 + }, + { + "epoch": 1.966015520470966, + "grad_norm": 0.4601522386074066, + "learning_rate": 3.5060701745240344e-07, + "loss": 0.3363, + "step": 11021 + }, + { + "epoch": 1.9661939166889661, + "grad_norm": 0.48688170313835144, + "learning_rate": 3.4690756145117074e-07, + "loss": 0.431, + "step": 11022 + }, + { + "epoch": 1.9663723129069663, + "grad_norm": 0.5293704271316528, + "learning_rate": 3.43227713096389e-07, + "loss": 0.5696, + "step": 11023 + }, + { + "epoch": 1.9665507091249665, + "grad_norm": 0.511298418045044, + "learning_rate": 3.39567472677077e-07, + "loss": 0.4419, + "step": 11024 + }, + { + "epoch": 1.9667291053429667, + "grad_norm": 0.580195963382721, + "learning_rate": 3.3592684048067144e-07, + "loss": 0.6206, + "step": 11025 + }, + { + "epoch": 1.966907501560967, + "grad_norm": 0.5301962494850159, + "learning_rate": 3.3230581679316584e-07, + "loss": 0.4893, + "step": 11026 + }, + { + "epoch": 1.9670858977789671, + "grad_norm": 0.48305174708366394, + "learning_rate": 3.287044018988883e-07, + "loss": 0.4104, + "step": 11027 + }, + { + "epoch": 1.9672642939969673, + "grad_norm": 0.5521623492240906, + "learning_rate": 3.2512259608075134e-07, + "loss": 0.5381, + "step": 11028 + }, + { + "epoch": 1.9674426902149675, + "grad_norm": 0.5551265478134155, + "learning_rate": 3.2156039962003e-07, + "loss": 0.5689, + "step": 11029 + }, + { + "epoch": 1.9676210864329677, + "grad_norm": 0.5386922359466553, + "learning_rate": 3.180178127965283e-07, + "loss": 0.5749, + "step": 11030 + }, + { + "epoch": 1.967799482650968, + "grad_norm": 0.5408132672309875, + "learning_rate": 3.144948358884403e-07, + "loss": 0.4842, + "step": 11031 + }, + { + "epoch": 1.9679778788689681, + "grad_norm": 0.5893284678459167, + "learning_rate": 3.109914691724891e-07, + "loss": 0.6408, + "step": 11032 + }, + { + "epoch": 1.968156275086968, + "grad_norm": 0.5585829019546509, + "learning_rate": 3.0750771292381573e-07, + "loss": 0.5564, + "step": 11033 + }, + { + "epoch": 1.9683346713049683, + "grad_norm": 0.544394850730896, + "learning_rate": 3.0404356741603467e-07, + "loss": 0.4695, + "step": 11034 + }, + { + "epoch": 1.9685130675229685, + "grad_norm": 0.4653702676296234, + "learning_rate": 3.0059903292120605e-07, + "loss": 0.3943, + "step": 11035 + }, + { + "epoch": 1.9686914637409687, + "grad_norm": 0.5404059886932373, + "learning_rate": 2.971741097098912e-07, + "loss": 0.5084, + "step": 11036 + }, + { + "epoch": 1.968869859958969, + "grad_norm": 0.5264713168144226, + "learning_rate": 2.9376879805106947e-07, + "loss": 0.4465, + "step": 11037 + }, + { + "epoch": 1.969048256176969, + "grad_norm": 0.48295858502388, + "learning_rate": 2.903830982121658e-07, + "loss": 0.3769, + "step": 11038 + }, + { + "epoch": 1.969226652394969, + "grad_norm": 0.5343292355537415, + "learning_rate": 2.870170104591341e-07, + "loss": 0.533, + "step": 11039 + }, + { + "epoch": 1.9694050486129693, + "grad_norm": 0.5259873270988464, + "learning_rate": 2.8367053505631846e-07, + "loss": 0.4229, + "step": 11040 + }, + { + "epoch": 1.9695834448309695, + "grad_norm": 0.46989312767982483, + "learning_rate": 2.803436722665642e-07, + "loss": 0.4079, + "step": 11041 + }, + { + "epoch": 1.9697618410489697, + "grad_norm": 0.50926274061203, + "learning_rate": 2.770364223511623e-07, + "loss": 0.494, + "step": 11042 + }, + { + "epoch": 1.9699402372669699, + "grad_norm": 0.756600022315979, + "learning_rate": 2.737487855698495e-07, + "loss": 0.5463, + "step": 11043 + }, + { + "epoch": 1.97011863348497, + "grad_norm": 0.47964367270469666, + "learning_rate": 2.7048076218083585e-07, + "loss": 0.3886, + "step": 11044 + }, + { + "epoch": 1.9702970297029703, + "grad_norm": 0.5303892493247986, + "learning_rate": 2.672323524408049e-07, + "loss": 0.5722, + "step": 11045 + }, + { + "epoch": 1.9704754259209705, + "grad_norm": 0.4234614670276642, + "learning_rate": 2.6400355660488594e-07, + "loss": 0.2525, + "step": 11046 + }, + { + "epoch": 1.9706538221389707, + "grad_norm": 0.5104755163192749, + "learning_rate": 2.607943749266262e-07, + "loss": 0.4572, + "step": 11047 + }, + { + "epoch": 1.9708322183569709, + "grad_norm": 0.5379624366760254, + "learning_rate": 2.5760480765812946e-07, + "loss": 0.5121, + "step": 11048 + }, + { + "epoch": 1.971010614574971, + "grad_norm": 0.5435539484024048, + "learning_rate": 2.544348550498621e-07, + "loss": 0.5189, + "step": 11049 + }, + { + "epoch": 1.9711890107929713, + "grad_norm": 0.6087327599525452, + "learning_rate": 2.512845173508194e-07, + "loss": 0.5903, + "step": 11050 + }, + { + "epoch": 1.9713674070109715, + "grad_norm": 0.5233466029167175, + "learning_rate": 2.481537948084145e-07, + "loss": 0.5574, + "step": 11051 + }, + { + "epoch": 1.9715458032289717, + "grad_norm": 0.4774353802204132, + "learning_rate": 2.4504268766853413e-07, + "loss": 0.4658, + "step": 11052 + }, + { + "epoch": 1.9717241994469719, + "grad_norm": 0.5861150622367859, + "learning_rate": 2.4195119617551054e-07, + "loss": 0.4947, + "step": 11053 + }, + { + "epoch": 1.971902595664972, + "grad_norm": 0.4522158205509186, + "learning_rate": 2.388793205721773e-07, + "loss": 0.2857, + "step": 11054 + }, + { + "epoch": 1.972080991882972, + "grad_norm": 0.4695200026035309, + "learning_rate": 2.358270610997859e-07, + "loss": 0.4349, + "step": 11055 + }, + { + "epoch": 1.9722593881009722, + "grad_norm": 0.5388724207878113, + "learning_rate": 2.3279441799803347e-07, + "loss": 0.5289, + "step": 11056 + }, + { + "epoch": 1.9724377843189724, + "grad_norm": 0.5425492525100708, + "learning_rate": 2.297813915051461e-07, + "loss": 0.4849, + "step": 11057 + }, + { + "epoch": 1.9726161805369726, + "grad_norm": 0.5144857168197632, + "learning_rate": 2.2678798185771233e-07, + "loss": 0.4728, + "step": 11058 + }, + { + "epoch": 1.9727945767549728, + "grad_norm": 0.5156108736991882, + "learning_rate": 2.2381418929090515e-07, + "loss": 0.4917, + "step": 11059 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.4618896543979645, + "learning_rate": 2.208600140382322e-07, + "loss": 0.3896, + "step": 11060 + }, + { + "epoch": 1.973151369190973, + "grad_norm": 0.484020471572876, + "learning_rate": 2.1792545633170236e-07, + "loss": 0.4211, + "step": 11061 + }, + { + "epoch": 1.9733297654089732, + "grad_norm": 0.5049816370010376, + "learning_rate": 2.1501051640182566e-07, + "loss": 0.4561, + "step": 11062 + }, + { + "epoch": 1.9735081616269734, + "grad_norm": 0.5276315808296204, + "learning_rate": 2.121151944775579e-07, + "loss": 0.4653, + "step": 11063 + }, + { + "epoch": 1.9736865578449736, + "grad_norm": 0.45448240637779236, + "learning_rate": 2.0923949078624493e-07, + "loss": 0.3645, + "step": 11064 + }, + { + "epoch": 1.9738649540629738, + "grad_norm": 0.5729450583457947, + "learning_rate": 2.0638340555376168e-07, + "loss": 0.5265, + "step": 11065 + }, + { + "epoch": 1.974043350280974, + "grad_norm": 0.47857052087783813, + "learning_rate": 2.0354693900445643e-07, + "loss": 0.3919, + "step": 11066 + }, + { + "epoch": 1.9742217464989742, + "grad_norm": 0.5773352980613708, + "learning_rate": 2.0073009136106768e-07, + "loss": 0.4904, + "step": 11067 + }, + { + "epoch": 1.9744001427169744, + "grad_norm": 0.47996997833251953, + "learning_rate": 1.9793286284483514e-07, + "loss": 0.4176, + "step": 11068 + }, + { + "epoch": 1.9745785389349746, + "grad_norm": 0.45277294516563416, + "learning_rate": 1.9515525367547192e-07, + "loss": 0.3085, + "step": 11069 + }, + { + "epoch": 1.9747569351529748, + "grad_norm": 0.49644288420677185, + "learning_rate": 1.9239726407110913e-07, + "loss": 0.4873, + "step": 11070 + }, + { + "epoch": 1.974935331370975, + "grad_norm": 0.5908966064453125, + "learning_rate": 1.8965889424835126e-07, + "loss": 0.6073, + "step": 11071 + }, + { + "epoch": 1.9751137275889752, + "grad_norm": 0.487192839384079, + "learning_rate": 1.86940144422304e-07, + "loss": 0.4247, + "step": 11072 + }, + { + "epoch": 1.9752921238069754, + "grad_norm": 0.5491554141044617, + "learning_rate": 1.8424101480646326e-07, + "loss": 0.4814, + "step": 11073 + }, + { + "epoch": 1.9754705200249756, + "grad_norm": 0.5391243696212769, + "learning_rate": 1.8156150561282614e-07, + "loss": 0.4927, + "step": 11074 + }, + { + "epoch": 1.9756489162429758, + "grad_norm": 0.5700384974479675, + "learning_rate": 1.7890161705183538e-07, + "loss": 0.6383, + "step": 11075 + }, + { + "epoch": 1.975827312460976, + "grad_norm": 0.5709267258644104, + "learning_rate": 1.7626134933243498e-07, + "loss": 0.5058, + "step": 11076 + }, + { + "epoch": 1.976005708678976, + "grad_norm": 0.5328453779220581, + "learning_rate": 1.7364070266193133e-07, + "loss": 0.4764, + "step": 11077 + }, + { + "epoch": 1.9761841048969762, + "grad_norm": 0.5492943525314331, + "learning_rate": 1.7103967724618753e-07, + "loss": 0.5105, + "step": 11078 + }, + { + "epoch": 1.9763625011149764, + "grad_norm": 0.6113517880439758, + "learning_rate": 1.684582732894846e-07, + "loss": 0.53, + "step": 11079 + }, + { + "epoch": 1.9765408973329766, + "grad_norm": 0.6604119539260864, + "learning_rate": 1.658964909945493e-07, + "loss": 0.5217, + "step": 11080 + }, + { + "epoch": 1.9767192935509768, + "grad_norm": 0.5008206367492676, + "learning_rate": 1.6335433056258176e-07, + "loss": 0.4225, + "step": 11081 + }, + { + "epoch": 1.976897689768977, + "grad_norm": 0.5335331559181213, + "learning_rate": 1.608317921932556e-07, + "loss": 0.4722, + "step": 11082 + }, + { + "epoch": 1.977076085986977, + "grad_norm": 0.5418149828910828, + "learning_rate": 1.5832887608471792e-07, + "loss": 0.4254, + "step": 11083 + }, + { + "epoch": 1.9772544822049771, + "grad_norm": 0.5159653425216675, + "learning_rate": 1.5584558243347813e-07, + "loss": 0.4687, + "step": 11084 + }, + { + "epoch": 1.9774328784229773, + "grad_norm": 0.5415779948234558, + "learning_rate": 1.5338191143463022e-07, + "loss": 0.4405, + "step": 11085 + }, + { + "epoch": 1.9776112746409775, + "grad_norm": 0.6081345677375793, + "learning_rate": 1.5093786328163052e-07, + "loss": 0.4643, + "step": 11086 + }, + { + "epoch": 1.9777896708589777, + "grad_norm": 0.5560696721076965, + "learning_rate": 1.4851343816646434e-07, + "loss": 0.6136, + "step": 11087 + }, + { + "epoch": 1.977968067076978, + "grad_norm": 0.521943986415863, + "learning_rate": 1.4610863627953496e-07, + "loss": 0.5582, + "step": 11088 + }, + { + "epoch": 1.9781464632949781, + "grad_norm": 0.5156327486038208, + "learning_rate": 1.4372345780971907e-07, + "loss": 0.4443, + "step": 11089 + }, + { + "epoch": 1.9783248595129783, + "grad_norm": 0.5138960480690002, + "learning_rate": 1.4135790294433904e-07, + "loss": 0.4079, + "step": 11090 + }, + { + "epoch": 1.9785032557309785, + "grad_norm": 0.503109335899353, + "learning_rate": 1.3901197186919067e-07, + "loss": 0.4661, + "step": 11091 + }, + { + "epoch": 1.9786816519489787, + "grad_norm": 0.601678192615509, + "learning_rate": 1.3668566476848775e-07, + "loss": 0.6345, + "step": 11092 + }, + { + "epoch": 1.978860048166979, + "grad_norm": 0.5237042903900146, + "learning_rate": 1.3437898182500075e-07, + "loss": 0.504, + "step": 11093 + }, + { + "epoch": 1.9790384443849791, + "grad_norm": 0.4751412868499756, + "learning_rate": 1.3209192321986252e-07, + "loss": 0.3427, + "step": 11094 + }, + { + "epoch": 1.9792168406029793, + "grad_norm": 0.4530685245990753, + "learning_rate": 1.298244891326794e-07, + "loss": 0.3887, + "step": 11095 + }, + { + "epoch": 1.9793952368209795, + "grad_norm": 0.5451707243919373, + "learning_rate": 1.2757667974155895e-07, + "loss": 0.627, + "step": 11096 + }, + { + "epoch": 1.9795736330389797, + "grad_norm": 0.5313323140144348, + "learning_rate": 1.253484952230266e-07, + "loss": 0.4812, + "step": 11097 + }, + { + "epoch": 1.97975202925698, + "grad_norm": 0.5562189221382141, + "learning_rate": 1.2313993575210901e-07, + "loss": 0.5047, + "step": 11098 + }, + { + "epoch": 1.9799304254749799, + "grad_norm": 0.622951865196228, + "learning_rate": 1.209510015022508e-07, + "loss": 0.8287, + "step": 11099 + }, + { + "epoch": 1.98010882169298, + "grad_norm": 0.5174034833908081, + "learning_rate": 1.1878169264536997e-07, + "loss": 0.4743, + "step": 11100 + }, + { + "epoch": 1.9802872179109803, + "grad_norm": 0.5302353501319885, + "learning_rate": 1.1663200935183028e-07, + "loss": 0.5506, + "step": 11101 + }, + { + "epoch": 1.9804656141289805, + "grad_norm": 0.6083629727363586, + "learning_rate": 1.145019517904966e-07, + "loss": 0.6897, + "step": 11102 + }, + { + "epoch": 1.9806440103469807, + "grad_norm": 0.591913640499115, + "learning_rate": 1.1239152012865183e-07, + "loss": 0.6142, + "step": 11103 + }, + { + "epoch": 1.9808224065649809, + "grad_norm": 0.49568477272987366, + "learning_rate": 1.103007145320245e-07, + "loss": 0.4795, + "step": 11104 + }, + { + "epoch": 1.9810008027829809, + "grad_norm": 0.6029313802719116, + "learning_rate": 1.0822953516484436e-07, + "loss": 0.627, + "step": 11105 + }, + { + "epoch": 1.981179199000981, + "grad_norm": 0.5010586977005005, + "learning_rate": 1.061779821897868e-07, + "loss": 0.4972, + "step": 11106 + }, + { + "epoch": 1.9813575952189812, + "grad_norm": 0.4987967312335968, + "learning_rate": 1.0414605576797298e-07, + "loss": 0.4968, + "step": 11107 + }, + { + "epoch": 1.9815359914369814, + "grad_norm": 0.6345359683036804, + "learning_rate": 1.0213375605896969e-07, + "loss": 0.6357, + "step": 11108 + }, + { + "epoch": 1.9817143876549816, + "grad_norm": 0.5932630896568298, + "learning_rate": 1.0014108322084492e-07, + "loss": 0.6181, + "step": 11109 + }, + { + "epoch": 1.9818927838729818, + "grad_norm": 0.48465976119041443, + "learning_rate": 9.816803741011237e-08, + "loss": 0.4411, + "step": 11110 + }, + { + "epoch": 1.982071180090982, + "grad_norm": 0.5137790441513062, + "learning_rate": 9.621461878173143e-08, + "loss": 0.4103, + "step": 11111 + }, + { + "epoch": 1.9822495763089822, + "grad_norm": 0.5033047199249268, + "learning_rate": 9.428082748910716e-08, + "loss": 0.4768, + "step": 11112 + }, + { + "epoch": 1.9824279725269824, + "grad_norm": 0.713068425655365, + "learning_rate": 9.236666368411806e-08, + "loss": 0.4846, + "step": 11113 + }, + { + "epoch": 1.9826063687449826, + "grad_norm": 0.5444625616073608, + "learning_rate": 9.047212751708834e-08, + "loss": 0.5477, + "step": 11114 + }, + { + "epoch": 1.9827847649629828, + "grad_norm": 0.49971529841423035, + "learning_rate": 8.859721913684337e-08, + "loss": 0.5182, + "step": 11115 + }, + { + "epoch": 1.982963161180983, + "grad_norm": 0.5162731409072876, + "learning_rate": 8.674193869065428e-08, + "loss": 0.4779, + "step": 11116 + }, + { + "epoch": 1.9831415573989832, + "grad_norm": 0.5822873115539551, + "learning_rate": 8.49062863241823e-08, + "loss": 0.6038, + "step": 11117 + }, + { + "epoch": 1.9833199536169834, + "grad_norm": 0.5523039698600769, + "learning_rate": 8.309026218161764e-08, + "loss": 0.5589, + "step": 11118 + }, + { + "epoch": 1.9834983498349836, + "grad_norm": 0.49146732687950134, + "learning_rate": 8.129386640562398e-08, + "loss": 0.4082, + "step": 11119 + }, + { + "epoch": 1.9836767460529838, + "grad_norm": 0.5370675325393677, + "learning_rate": 7.951709913722738e-08, + "loss": 0.5361, + "step": 11120 + }, + { + "epoch": 1.9838551422709838, + "grad_norm": 0.5307615399360657, + "learning_rate": 7.77599605160384e-08, + "loss": 0.4153, + "step": 11121 + }, + { + "epoch": 1.984033538488984, + "grad_norm": 0.49418723583221436, + "learning_rate": 7.602245068003e-08, + "loss": 0.4469, + "step": 11122 + }, + { + "epoch": 1.9842119347069842, + "grad_norm": 0.4878632724285126, + "learning_rate": 7.430456976564859e-08, + "loss": 0.4843, + "step": 11123 + }, + { + "epoch": 1.9843903309249844, + "grad_norm": 0.45123180747032166, + "learning_rate": 7.260631790784178e-08, + "loss": 0.4106, + "step": 11124 + }, + { + "epoch": 1.9845687271429846, + "grad_norm": 0.4762474000453949, + "learning_rate": 7.09276952399751e-08, + "loss": 0.349, + "step": 11125 + }, + { + "epoch": 1.9847471233609848, + "grad_norm": 0.547519862651825, + "learning_rate": 6.926870189391532e-08, + "loss": 0.4515, + "step": 11126 + }, + { + "epoch": 1.9849255195789848, + "grad_norm": 0.6240761876106262, + "learning_rate": 6.762933799991933e-08, + "loss": 0.6499, + "step": 11127 + }, + { + "epoch": 1.985103915796985, + "grad_norm": 0.5219815969467163, + "learning_rate": 6.6009603686773e-08, + "loss": 0.5382, + "step": 11128 + }, + { + "epoch": 1.9852823120149852, + "grad_norm": 0.5049837231636047, + "learning_rate": 6.440949908168014e-08, + "loss": 0.4419, + "step": 11129 + }, + { + "epoch": 1.9854607082329854, + "grad_norm": 0.5972961783409119, + "learning_rate": 6.282902431029025e-08, + "loss": 0.6477, + "step": 11130 + }, + { + "epoch": 1.9856391044509856, + "grad_norm": 0.5097864270210266, + "learning_rate": 6.126817949678176e-08, + "loss": 0.4747, + "step": 11131 + }, + { + "epoch": 1.9858175006689858, + "grad_norm": 0.5179688930511475, + "learning_rate": 5.972696476369555e-08, + "loss": 0.4006, + "step": 11132 + }, + { + "epoch": 1.985995896886986, + "grad_norm": 0.5777585506439209, + "learning_rate": 5.8205380232073666e-08, + "loss": 0.6213, + "step": 11133 + }, + { + "epoch": 1.9861742931049862, + "grad_norm": 0.5305327773094177, + "learning_rate": 5.6703426021487146e-08, + "loss": 0.4677, + "step": 11134 + }, + { + "epoch": 1.9863526893229864, + "grad_norm": 0.5374904870986938, + "learning_rate": 5.522110224981391e-08, + "loss": 0.492, + "step": 11135 + }, + { + "epoch": 1.9865310855409866, + "grad_norm": 0.5351753830909729, + "learning_rate": 5.375840903354412e-08, + "loss": 0.5088, + "step": 11136 + }, + { + "epoch": 1.9867094817589868, + "grad_norm": 0.513270378112793, + "learning_rate": 5.2315346487530334e-08, + "loss": 0.4432, + "step": 11137 + }, + { + "epoch": 1.986887877976987, + "grad_norm": 0.4735656678676605, + "learning_rate": 5.089191472507082e-08, + "loss": 0.4919, + "step": 11138 + }, + { + "epoch": 1.9870662741949872, + "grad_norm": 0.5290320515632629, + "learning_rate": 4.94881138580483e-08, + "loss": 0.4908, + "step": 11139 + }, + { + "epoch": 1.9872446704129874, + "grad_norm": 0.5589138865470886, + "learning_rate": 4.8103943996624654e-08, + "loss": 0.5107, + "step": 11140 + }, + { + "epoch": 1.9874230666309876, + "grad_norm": 0.5663526058197021, + "learning_rate": 4.673940524957398e-08, + "loss": 0.5412, + "step": 11141 + }, + { + "epoch": 1.9876014628489878, + "grad_norm": 0.5641487240791321, + "learning_rate": 4.539449772406057e-08, + "loss": 0.5338, + "step": 11142 + }, + { + "epoch": 1.9877798590669877, + "grad_norm": 0.5815507769584656, + "learning_rate": 4.406922152566661e-08, + "loss": 0.6252, + "step": 11143 + }, + { + "epoch": 1.987958255284988, + "grad_norm": 0.49604350328445435, + "learning_rate": 4.276357675853104e-08, + "loss": 0.4181, + "step": 11144 + }, + { + "epoch": 1.9881366515029881, + "grad_norm": 0.5443685054779053, + "learning_rate": 4.1477563525182945e-08, + "loss": 0.4321, + "step": 11145 + }, + { + "epoch": 1.9883150477209883, + "grad_norm": 0.532589316368103, + "learning_rate": 4.021118192662487e-08, + "loss": 0.5099, + "step": 11146 + }, + { + "epoch": 1.9884934439389885, + "grad_norm": 0.5429589152336121, + "learning_rate": 3.8964432062305046e-08, + "loss": 0.5191, + "step": 11147 + }, + { + "epoch": 1.9886718401569887, + "grad_norm": 0.6189258098602295, + "learning_rate": 3.773731403014513e-08, + "loss": 0.4195, + "step": 11148 + }, + { + "epoch": 1.9888502363749887, + "grad_norm": 0.5345398187637329, + "learning_rate": 3.652982792654025e-08, + "loss": 0.4932, + "step": 11149 + }, + { + "epoch": 1.989028632592989, + "grad_norm": 0.5087316036224365, + "learning_rate": 3.534197384630344e-08, + "loss": 0.4493, + "step": 11150 + }, + { + "epoch": 1.989207028810989, + "grad_norm": 0.7846596240997314, + "learning_rate": 3.417375188274896e-08, + "loss": 0.4459, + "step": 11151 + }, + { + "epoch": 1.9893854250289893, + "grad_norm": 0.5246078968048096, + "learning_rate": 3.302516212763673e-08, + "loss": 0.4156, + "step": 11152 + }, + { + "epoch": 1.9895638212469895, + "grad_norm": 0.5263677835464478, + "learning_rate": 3.1896204671144625e-08, + "loss": 0.5612, + "step": 11153 + }, + { + "epoch": 1.9897422174649897, + "grad_norm": 0.5278795957565308, + "learning_rate": 3.07868796019517e-08, + "loss": 0.474, + "step": 11154 + }, + { + "epoch": 1.98992061368299, + "grad_norm": 0.5510560274124146, + "learning_rate": 2.9697187007182715e-08, + "loss": 0.5469, + "step": 11155 + }, + { + "epoch": 1.99009900990099, + "grad_norm": 0.47594988346099854, + "learning_rate": 2.8627126972435857e-08, + "loss": 0.4399, + "step": 11156 + }, + { + "epoch": 1.9902774061189903, + "grad_norm": 0.5883272290229797, + "learning_rate": 2.757669958172726e-08, + "loss": 0.6997, + "step": 11157 + }, + { + "epoch": 1.9904558023369905, + "grad_norm": 0.5753530859947205, + "learning_rate": 2.654590491757425e-08, + "loss": 0.5808, + "step": 11158 + }, + { + "epoch": 1.9906341985549907, + "grad_norm": 0.45445308089256287, + "learning_rate": 2.5534743060939836e-08, + "loss": 0.3831, + "step": 11159 + }, + { + "epoch": 1.9908125947729909, + "grad_norm": 0.5948703289031982, + "learning_rate": 2.4543214091232723e-08, + "loss": 0.5709, + "step": 11160 + }, + { + "epoch": 1.990990990990991, + "grad_norm": 0.4452953338623047, + "learning_rate": 2.357131808633506e-08, + "loss": 0.3258, + "step": 11161 + }, + { + "epoch": 1.9911693872089913, + "grad_norm": 0.485113263130188, + "learning_rate": 2.2619055122574674e-08, + "loss": 0.3959, + "step": 11162 + }, + { + "epoch": 1.9913477834269915, + "grad_norm": 0.48504024744033813, + "learning_rate": 2.168642527475284e-08, + "loss": 0.4657, + "step": 11163 + }, + { + "epoch": 1.9915261796449917, + "grad_norm": 0.557052731513977, + "learning_rate": 2.0773428616088773e-08, + "loss": 0.5872, + "step": 11164 + }, + { + "epoch": 1.9917045758629917, + "grad_norm": 0.5224217176437378, + "learning_rate": 1.9880065218302877e-08, + "loss": 0.5756, + "step": 11165 + }, + { + "epoch": 1.9918829720809919, + "grad_norm": 0.5543699264526367, + "learning_rate": 1.900633515156125e-08, + "loss": 0.5897, + "step": 11166 + }, + { + "epoch": 1.992061368298992, + "grad_norm": 0.5769318342208862, + "learning_rate": 1.815223848447567e-08, + "loss": 0.5921, + "step": 11167 + }, + { + "epoch": 1.9922397645169923, + "grad_norm": 0.519289493560791, + "learning_rate": 1.731777528415912e-08, + "loss": 0.5332, + "step": 11168 + }, + { + "epoch": 1.9924181607349924, + "grad_norm": 0.4625517725944519, + "learning_rate": 1.650294561611476e-08, + "loss": 0.3986, + "step": 11169 + }, + { + "epoch": 1.9925965569529926, + "grad_norm": 0.7756746411323547, + "learning_rate": 1.5707749544374705e-08, + "loss": 0.4403, + "step": 11170 + }, + { + "epoch": 1.9927749531709926, + "grad_norm": 0.5089825987815857, + "learning_rate": 1.4932187131333485e-08, + "loss": 0.4378, + "step": 11171 + }, + { + "epoch": 1.9929533493889928, + "grad_norm": 0.5764268040657043, + "learning_rate": 1.4176258437970102e-08, + "loss": 0.5419, + "step": 11172 + }, + { + "epoch": 1.993131745606993, + "grad_norm": 0.5734716653823853, + "learning_rate": 1.3439963523625976e-08, + "loss": 0.5719, + "step": 11173 + }, + { + "epoch": 1.9933101418249932, + "grad_norm": 0.5470873117446899, + "learning_rate": 1.2723302446115969e-08, + "loss": 0.4679, + "step": 11174 + }, + { + "epoch": 1.9934885380429934, + "grad_norm": 0.5397417545318604, + "learning_rate": 1.2026275261756148e-08, + "loss": 0.6085, + "step": 11175 + }, + { + "epoch": 1.9936669342609936, + "grad_norm": 0.5251044034957886, + "learning_rate": 1.1348882025252749e-08, + "loss": 0.4125, + "step": 11176 + }, + { + "epoch": 1.9938453304789938, + "grad_norm": 0.5922536253929138, + "learning_rate": 1.0691122789840969e-08, + "loss": 0.5207, + "step": 11177 + }, + { + "epoch": 1.994023726696994, + "grad_norm": 0.5680334568023682, + "learning_rate": 1.005299760717393e-08, + "loss": 0.5015, + "step": 11178 + }, + { + "epoch": 1.9942021229149942, + "grad_norm": 0.4970249831676483, + "learning_rate": 9.434506527378206e-09, + "loss": 0.3887, + "step": 11179 + }, + { + "epoch": 1.9943805191329944, + "grad_norm": 0.46873077750205994, + "learning_rate": 8.835649598998297e-09, + "loss": 0.4312, + "step": 11180 + }, + { + "epoch": 1.9945589153509946, + "grad_norm": 0.5385526418685913, + "learning_rate": 8.256426869079903e-09, + "loss": 0.4843, + "step": 11181 + }, + { + "epoch": 1.9947373115689948, + "grad_norm": 0.520590603351593, + "learning_rate": 7.696838383114412e-09, + "loss": 0.5141, + "step": 11182 + }, + { + "epoch": 1.994915707786995, + "grad_norm": 0.568450927734375, + "learning_rate": 7.156884185094414e-09, + "loss": 0.5097, + "step": 11183 + }, + { + "epoch": 1.9950941040049952, + "grad_norm": 0.5293620228767395, + "learning_rate": 6.636564317374916e-09, + "loss": 0.4605, + "step": 11184 + }, + { + "epoch": 1.9952725002229954, + "grad_norm": 0.5584262013435364, + "learning_rate": 6.13587882083988e-09, + "loss": 0.5948, + "step": 11185 + }, + { + "epoch": 1.9954508964409956, + "grad_norm": 0.5276932120323181, + "learning_rate": 5.654827734791201e-09, + "loss": 0.4695, + "step": 11186 + }, + { + "epoch": 1.9956292926589956, + "grad_norm": 0.5315849781036377, + "learning_rate": 5.19341109705973e-09, + "loss": 0.4228, + "step": 11187 + }, + { + "epoch": 1.9958076888769958, + "grad_norm": 0.5869420170783997, + "learning_rate": 4.751628943838737e-09, + "loss": 0.5691, + "step": 11188 + }, + { + "epoch": 1.995986085094996, + "grad_norm": 0.5161817669868469, + "learning_rate": 4.329481309850447e-09, + "loss": 0.4109, + "step": 11189 + }, + { + "epoch": 1.9961644813129962, + "grad_norm": 0.5489330887794495, + "learning_rate": 3.926968228262773e-09, + "loss": 0.5284, + "step": 11190 + }, + { + "epoch": 1.9963428775309964, + "grad_norm": 0.49122124910354614, + "learning_rate": 3.5440897306338037e-09, + "loss": 0.386, + "step": 11191 + }, + { + "epoch": 1.9965212737489966, + "grad_norm": 0.4992600679397583, + "learning_rate": 3.1808458470783395e-09, + "loss": 0.5033, + "step": 11192 + }, + { + "epoch": 1.9966996699669965, + "grad_norm": 0.4739280045032501, + "learning_rate": 2.837236606129112e-09, + "loss": 0.3841, + "step": 11193 + }, + { + "epoch": 1.9968780661849967, + "grad_norm": 0.5642136931419373, + "learning_rate": 2.5132620347645407e-09, + "loss": 0.5123, + "step": 11194 + }, + { + "epoch": 1.997056462402997, + "grad_norm": 0.4581596553325653, + "learning_rate": 2.2089221584087328e-09, + "loss": 0.3491, + "step": 11195 + }, + { + "epoch": 1.9972348586209971, + "grad_norm": 0.5008403062820435, + "learning_rate": 1.9242170010147497e-09, + "loss": 0.4724, + "step": 11196 + }, + { + "epoch": 1.9974132548389973, + "grad_norm": 0.5869413614273071, + "learning_rate": 1.6591465848703192e-09, + "loss": 0.5462, + "step": 11197 + }, + { + "epoch": 1.9975916510569975, + "grad_norm": 0.48155996203422546, + "learning_rate": 1.4137109308476338e-09, + "loss": 0.3609, + "step": 11198 + }, + { + "epoch": 1.9977700472749977, + "grad_norm": 0.5589842796325684, + "learning_rate": 1.187910058209063e-09, + "loss": 0.5697, + "step": 11199 + }, + { + "epoch": 1.997948443492998, + "grad_norm": 0.5098150968551636, + "learning_rate": 9.817439847181753e-10, + "loss": 0.3743, + "step": 11200 + }, + { + "epoch": 1.9981268397109981, + "grad_norm": 0.47274699807167053, + "learning_rate": 7.952127265009601e-10, + "loss": 0.2905, + "step": 11201 + }, + { + "epoch": 1.9983052359289983, + "grad_norm": 0.48699966073036194, + "learning_rate": 6.283162982678725e-10, + "loss": 0.5163, + "step": 11202 + }, + { + "epoch": 1.9984836321469985, + "grad_norm": 0.5384899973869324, + "learning_rate": 4.810547130917886e-10, + "loss": 0.5131, + "step": 11203 + }, + { + "epoch": 1.9986620283649987, + "grad_norm": 0.5598852038383484, + "learning_rate": 3.534279825467834e-10, + "loss": 0.6017, + "step": 11204 + }, + { + "epoch": 1.998840424582999, + "grad_norm": 0.513744592666626, + "learning_rate": 2.454361166526198e-10, + "loss": 0.4185, + "step": 11205 + }, + { + "epoch": 1.9990188208009991, + "grad_norm": 0.5312053561210632, + "learning_rate": 1.570791239025038e-10, + "loss": 0.4817, + "step": 11206 + }, + { + "epoch": 1.9991972170189993, + "grad_norm": 0.48965707421302795, + "learning_rate": 8.835701123532935e-11, + "loss": 0.4257, + "step": 11207 + }, + { + "epoch": 1.9993756132369995, + "grad_norm": 0.6015014052391052, + "learning_rate": 3.9269784063433736e-11, + "loss": 0.7296, + "step": 11208 + }, + { + "epoch": 1.9995540094549995, + "grad_norm": 0.4663368761539459, + "learning_rate": 9.817446217086356e-12, + "loss": 0.3784, + "step": 11209 + }, + { + "epoch": 1.9997324056729997, + "grad_norm": 0.471098393201828, + "learning_rate": 0.0, + "loss": 0.4405, + "step": 11210 + } + ], + "logging_steps": 1, + "max_steps": 11210, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.680882782008115e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}