{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999258581914342, "eval_steps": 500, "global_step": 10115, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.885574475446704e-05, "grad_norm": 30.475161450255904, "learning_rate": 6.578947368421052e-10, "loss": 1.3543, "step": 1 }, { "epoch": 0.00019771148950893408, "grad_norm": 25.63627551623113, "learning_rate": 1.3157894736842103e-09, "loss": 1.2477, "step": 2 }, { "epoch": 0.0002965672342634011, "grad_norm": 26.880189530771116, "learning_rate": 1.9736842105263157e-09, "loss": 1.2169, "step": 3 }, { "epoch": 0.00039542297901786817, "grad_norm": 29.204642281541645, "learning_rate": 2.6315789473684206e-09, "loss": 1.2682, "step": 4 }, { "epoch": 0.0004942787237723353, "grad_norm": 20.69177911235117, "learning_rate": 3.289473684210526e-09, "loss": 1.2487, "step": 5 }, { "epoch": 0.0005931344685268023, "grad_norm": 33.62965161068742, "learning_rate": 3.947368421052631e-09, "loss": 1.1952, "step": 6 }, { "epoch": 0.0006919902132812693, "grad_norm": 25.916269777673786, "learning_rate": 4.605263157894737e-09, "loss": 1.364, "step": 7 }, { "epoch": 0.0007908459580357363, "grad_norm": 27.53094707131892, "learning_rate": 5.263157894736841e-09, "loss": 1.4426, "step": 8 }, { "epoch": 0.0008897017027902034, "grad_norm": 22.741627054268797, "learning_rate": 5.921052631578947e-09, "loss": 1.258, "step": 9 }, { "epoch": 0.0009885574475446705, "grad_norm": 24.524525686982866, "learning_rate": 6.578947368421052e-09, "loss": 1.2817, "step": 10 }, { "epoch": 0.0010874131922991375, "grad_norm": 19.618340096500052, "learning_rate": 7.236842105263158e-09, "loss": 1.2174, "step": 11 }, { "epoch": 0.0011862689370536045, "grad_norm": 30.421092565807395, "learning_rate": 7.894736842105263e-09, "loss": 1.3101, "step": 12 }, { "epoch": 0.0012851246818080715, "grad_norm": 23.581419126592415, "learning_rate": 8.552631578947369e-09, "loss": 1.3247, "step": 13 }, { "epoch": 0.0013839804265625387, "grad_norm": 23.626275912522168, "learning_rate": 9.210526315789473e-09, "loss": 1.2552, "step": 14 }, { "epoch": 0.0014828361713170057, "grad_norm": 21.477523157020613, "learning_rate": 9.868421052631578e-09, "loss": 1.2507, "step": 15 }, { "epoch": 0.0015816919160714727, "grad_norm": 30.820945395036834, "learning_rate": 1.0526315789473683e-08, "loss": 1.2954, "step": 16 }, { "epoch": 0.0016805476608259397, "grad_norm": 23.824876326760428, "learning_rate": 1.1184210526315789e-08, "loss": 1.3248, "step": 17 }, { "epoch": 0.0017794034055804069, "grad_norm": 36.066258295720075, "learning_rate": 1.1842105263157893e-08, "loss": 1.233, "step": 18 }, { "epoch": 0.0018782591503348739, "grad_norm": 21.94762414426342, "learning_rate": 1.25e-08, "loss": 1.2854, "step": 19 }, { "epoch": 0.001977114895089341, "grad_norm": 21.681910498793197, "learning_rate": 1.3157894736842104e-08, "loss": 1.3429, "step": 20 }, { "epoch": 0.002075970639843808, "grad_norm": 16.688943321500336, "learning_rate": 1.3815789473684212e-08, "loss": 1.1988, "step": 21 }, { "epoch": 0.002174826384598275, "grad_norm": 36.416719109873455, "learning_rate": 1.4473684210526316e-08, "loss": 1.2642, "step": 22 }, { "epoch": 0.002273682129352742, "grad_norm": 34.54437953061398, "learning_rate": 1.513157894736842e-08, "loss": 1.3295, "step": 23 }, { "epoch": 0.002372537874107209, "grad_norm": 26.632002477591357, "learning_rate": 1.5789473684210525e-08, "loss": 1.3022, "step": 24 }, { "epoch": 0.002471393618861676, "grad_norm": 29.71648252728109, "learning_rate": 1.6447368421052633e-08, "loss": 1.3483, "step": 25 }, { "epoch": 0.002570249363616143, "grad_norm": 23.699256345176092, "learning_rate": 1.7105263157894738e-08, "loss": 1.2494, "step": 26 }, { "epoch": 0.00266910510837061, "grad_norm": 94.40521841909921, "learning_rate": 1.7763157894736842e-08, "loss": 1.3368, "step": 27 }, { "epoch": 0.0027679608531250774, "grad_norm": 22.203142185465968, "learning_rate": 1.8421052631578947e-08, "loss": 1.259, "step": 28 }, { "epoch": 0.0028668165978795444, "grad_norm": 19.614181402952656, "learning_rate": 1.907894736842105e-08, "loss": 1.1969, "step": 29 }, { "epoch": 0.0029656723426340114, "grad_norm": 26.376973408013956, "learning_rate": 1.9736842105263156e-08, "loss": 1.279, "step": 30 }, { "epoch": 0.0030645280873884784, "grad_norm": 21.422194350149937, "learning_rate": 2.039473684210526e-08, "loss": 1.301, "step": 31 }, { "epoch": 0.0031633838321429453, "grad_norm": 29.58679833523307, "learning_rate": 2.1052631578947365e-08, "loss": 1.2699, "step": 32 }, { "epoch": 0.0032622395768974123, "grad_norm": 25.45352992934416, "learning_rate": 2.1710526315789473e-08, "loss": 1.1979, "step": 33 }, { "epoch": 0.0033610953216518793, "grad_norm": 27.033596859445943, "learning_rate": 2.2368421052631577e-08, "loss": 1.1732, "step": 34 }, { "epoch": 0.0034599510664063467, "grad_norm": 23.589954718316708, "learning_rate": 2.3026315789473682e-08, "loss": 1.3077, "step": 35 }, { "epoch": 0.0035588068111608137, "grad_norm": 19.28129884303676, "learning_rate": 2.3684210526315786e-08, "loss": 1.2949, "step": 36 }, { "epoch": 0.0036576625559152807, "grad_norm": 30.998092460764106, "learning_rate": 2.4342105263157894e-08, "loss": 1.3412, "step": 37 }, { "epoch": 0.0037565183006697477, "grad_norm": 24.41733409673652, "learning_rate": 2.5e-08, "loss": 1.334, "step": 38 }, { "epoch": 0.0038553740454242147, "grad_norm": 18.129556875001967, "learning_rate": 2.5657894736842107e-08, "loss": 1.2206, "step": 39 }, { "epoch": 0.003954229790178682, "grad_norm": 27.09188453575059, "learning_rate": 2.6315789473684208e-08, "loss": 1.3589, "step": 40 }, { "epoch": 0.004053085534933149, "grad_norm": 23.938554293056164, "learning_rate": 2.6973684210526316e-08, "loss": 1.2469, "step": 41 }, { "epoch": 0.004151941279687616, "grad_norm": 38.17807070933581, "learning_rate": 2.7631578947368424e-08, "loss": 1.2433, "step": 42 }, { "epoch": 0.004250797024442083, "grad_norm": 18.963305645131506, "learning_rate": 2.8289473684210525e-08, "loss": 1.2614, "step": 43 }, { "epoch": 0.00434965276919655, "grad_norm": 25.80391305450501, "learning_rate": 2.8947368421052633e-08, "loss": 1.1906, "step": 44 }, { "epoch": 0.004448508513951017, "grad_norm": 28.059919956982725, "learning_rate": 2.9605263157894734e-08, "loss": 1.3289, "step": 45 }, { "epoch": 0.004547364258705484, "grad_norm": 30.02594751552414, "learning_rate": 3.026315789473684e-08, "loss": 1.2145, "step": 46 }, { "epoch": 0.0046462200034599515, "grad_norm": 34.83838131155323, "learning_rate": 3.0921052631578946e-08, "loss": 1.3216, "step": 47 }, { "epoch": 0.004745075748214418, "grad_norm": 27.90915268612838, "learning_rate": 3.157894736842105e-08, "loss": 1.1785, "step": 48 }, { "epoch": 0.0048439314929688854, "grad_norm": 24.931782308267127, "learning_rate": 3.2236842105263155e-08, "loss": 1.2273, "step": 49 }, { "epoch": 0.004942787237723352, "grad_norm": 32.21006520601201, "learning_rate": 3.2894736842105267e-08, "loss": 1.2628, "step": 50 }, { "epoch": 0.005041642982477819, "grad_norm": 26.164724322584906, "learning_rate": 3.3552631578947364e-08, "loss": 1.2355, "step": 51 }, { "epoch": 0.005140498727232286, "grad_norm": 28.95837805454712, "learning_rate": 3.4210526315789476e-08, "loss": 1.3627, "step": 52 }, { "epoch": 0.005239354471986753, "grad_norm": 20.649015924877492, "learning_rate": 3.4868421052631574e-08, "loss": 1.2143, "step": 53 }, { "epoch": 0.00533821021674122, "grad_norm": 17.220123821205775, "learning_rate": 3.5526315789473685e-08, "loss": 1.2758, "step": 54 }, { "epoch": 0.005437065961495687, "grad_norm": 36.53416030748279, "learning_rate": 3.618421052631579e-08, "loss": 1.2415, "step": 55 }, { "epoch": 0.005535921706250155, "grad_norm": 23.336546800404633, "learning_rate": 3.6842105263157894e-08, "loss": 1.3166, "step": 56 }, { "epoch": 0.005634777451004621, "grad_norm": 20.190739753364692, "learning_rate": 3.75e-08, "loss": 1.213, "step": 57 }, { "epoch": 0.005733633195759089, "grad_norm": 22.638034053855648, "learning_rate": 3.81578947368421e-08, "loss": 1.1663, "step": 58 }, { "epoch": 0.005832488940513555, "grad_norm": 28.52408410319144, "learning_rate": 3.881578947368421e-08, "loss": 1.1402, "step": 59 }, { "epoch": 0.005931344685268023, "grad_norm": 21.198308066537084, "learning_rate": 3.947368421052631e-08, "loss": 1.3457, "step": 60 }, { "epoch": 0.006030200430022489, "grad_norm": 27.24335295144261, "learning_rate": 4.013157894736842e-08, "loss": 1.3207, "step": 61 }, { "epoch": 0.006129056174776957, "grad_norm": 18.59161301136332, "learning_rate": 4.078947368421052e-08, "loss": 1.1665, "step": 62 }, { "epoch": 0.006227911919531424, "grad_norm": 15.292867095245674, "learning_rate": 4.144736842105263e-08, "loss": 1.173, "step": 63 }, { "epoch": 0.006326767664285891, "grad_norm": 22.11688269338932, "learning_rate": 4.210526315789473e-08, "loss": 1.2131, "step": 64 }, { "epoch": 0.006425623409040358, "grad_norm": 27.506214801818913, "learning_rate": 4.276315789473684e-08, "loss": 1.2134, "step": 65 }, { "epoch": 0.006524479153794825, "grad_norm": 26.107873774741005, "learning_rate": 4.3421052631578946e-08, "loss": 1.2446, "step": 66 }, { "epoch": 0.006623334898549292, "grad_norm": 22.452661046804067, "learning_rate": 4.407894736842105e-08, "loss": 1.2146, "step": 67 }, { "epoch": 0.006722190643303759, "grad_norm": 36.52498015327471, "learning_rate": 4.4736842105263155e-08, "loss": 1.2417, "step": 68 }, { "epoch": 0.006821046388058226, "grad_norm": 20.18679038264168, "learning_rate": 4.5394736842105266e-08, "loss": 1.0271, "step": 69 }, { "epoch": 0.0069199021328126935, "grad_norm": 33.1648312130604, "learning_rate": 4.6052631578947364e-08, "loss": 1.1842, "step": 70 }, { "epoch": 0.00701875787756716, "grad_norm": 32.75296826907222, "learning_rate": 4.6710526315789475e-08, "loss": 1.2243, "step": 71 }, { "epoch": 0.0071176136223216275, "grad_norm": 20.59350005334737, "learning_rate": 4.736842105263157e-08, "loss": 1.1486, "step": 72 }, { "epoch": 0.007216469367076094, "grad_norm": 21.834523901628476, "learning_rate": 4.8026315789473684e-08, "loss": 1.3099, "step": 73 }, { "epoch": 0.007315325111830561, "grad_norm": 28.44933168118671, "learning_rate": 4.868421052631579e-08, "loss": 1.1638, "step": 74 }, { "epoch": 0.007414180856585028, "grad_norm": 24.847112761036428, "learning_rate": 4.934210526315789e-08, "loss": 1.1478, "step": 75 }, { "epoch": 0.007513036601339495, "grad_norm": 21.335765633115127, "learning_rate": 5e-08, "loss": 1.0836, "step": 76 }, { "epoch": 0.007611892346093962, "grad_norm": 21.40429709802549, "learning_rate": 5.06578947368421e-08, "loss": 1.1951, "step": 77 }, { "epoch": 0.007710748090848429, "grad_norm": 21.6013249534994, "learning_rate": 5.1315789473684213e-08, "loss": 1.1567, "step": 78 }, { "epoch": 0.007809603835602897, "grad_norm": 19.529375441496942, "learning_rate": 5.197368421052631e-08, "loss": 1.2136, "step": 79 }, { "epoch": 0.007908459580357364, "grad_norm": 21.363164829302917, "learning_rate": 5.2631578947368416e-08, "loss": 1.2903, "step": 80 }, { "epoch": 0.00800731532511183, "grad_norm": 19.92070819296784, "learning_rate": 5.328947368421053e-08, "loss": 1.1887, "step": 81 }, { "epoch": 0.008106171069866297, "grad_norm": 22.045875357228017, "learning_rate": 5.394736842105263e-08, "loss": 1.1822, "step": 82 }, { "epoch": 0.008205026814620764, "grad_norm": 18.353684377040466, "learning_rate": 5.460526315789473e-08, "loss": 1.1455, "step": 83 }, { "epoch": 0.008303882559375232, "grad_norm": 20.06687356284966, "learning_rate": 5.526315789473685e-08, "loss": 1.1625, "step": 84 }, { "epoch": 0.008402738304129699, "grad_norm": 20.4056699355096, "learning_rate": 5.5921052631578945e-08, "loss": 1.1528, "step": 85 }, { "epoch": 0.008501594048884165, "grad_norm": 19.782015490514596, "learning_rate": 5.657894736842105e-08, "loss": 1.168, "step": 86 }, { "epoch": 0.008600449793638634, "grad_norm": 19.80399703853161, "learning_rate": 5.723684210526315e-08, "loss": 1.1928, "step": 87 }, { "epoch": 0.0086993055383931, "grad_norm": 19.14520180869539, "learning_rate": 5.7894736842105265e-08, "loss": 1.1129, "step": 88 }, { "epoch": 0.008798161283147567, "grad_norm": 25.11034976296009, "learning_rate": 5.8552631578947363e-08, "loss": 1.2428, "step": 89 }, { "epoch": 0.008897017027902033, "grad_norm": 22.554201467008006, "learning_rate": 5.921052631578947e-08, "loss": 1.1356, "step": 90 }, { "epoch": 0.008995872772656502, "grad_norm": 19.431948238616226, "learning_rate": 5.986842105263158e-08, "loss": 1.0743, "step": 91 }, { "epoch": 0.009094728517410968, "grad_norm": 32.468351431823194, "learning_rate": 6.052631578947368e-08, "loss": 1.1539, "step": 92 }, { "epoch": 0.009193584262165435, "grad_norm": 12.786904270549623, "learning_rate": 6.118421052631579e-08, "loss": 1.0592, "step": 93 }, { "epoch": 0.009292440006919903, "grad_norm": 18.29961861137409, "learning_rate": 6.184210526315789e-08, "loss": 1.1845, "step": 94 }, { "epoch": 0.00939129575167437, "grad_norm": 13.181401174381381, "learning_rate": 6.25e-08, "loss": 1.116, "step": 95 }, { "epoch": 0.009490151496428836, "grad_norm": 17.144426090808107, "learning_rate": 6.31578947368421e-08, "loss": 1.1364, "step": 96 }, { "epoch": 0.009589007241183303, "grad_norm": 25.228401337522772, "learning_rate": 6.38157894736842e-08, "loss": 1.1139, "step": 97 }, { "epoch": 0.009687862985937771, "grad_norm": 17.768356712511643, "learning_rate": 6.447368421052631e-08, "loss": 1.1066, "step": 98 }, { "epoch": 0.009786718730692237, "grad_norm": 16.849595142890006, "learning_rate": 6.513157894736842e-08, "loss": 1.071, "step": 99 }, { "epoch": 0.009885574475446704, "grad_norm": 12.668678548805174, "learning_rate": 6.578947368421053e-08, "loss": 1.0526, "step": 100 }, { "epoch": 0.009984430220201172, "grad_norm": 16.20538694127326, "learning_rate": 6.644736842105262e-08, "loss": 1.1871, "step": 101 }, { "epoch": 0.010083285964955639, "grad_norm": 38.05575626758174, "learning_rate": 6.710526315789473e-08, "loss": 1.1158, "step": 102 }, { "epoch": 0.010182141709710105, "grad_norm": 13.064750354178049, "learning_rate": 6.776315789473685e-08, "loss": 1.0432, "step": 103 }, { "epoch": 0.010280997454464572, "grad_norm": 14.03702608522195, "learning_rate": 6.842105263157895e-08, "loss": 1.1229, "step": 104 }, { "epoch": 0.01037985319921904, "grad_norm": 22.644509574103132, "learning_rate": 6.907894736842104e-08, "loss": 1.128, "step": 105 }, { "epoch": 0.010478708943973507, "grad_norm": 39.490779033555945, "learning_rate": 6.973684210526315e-08, "loss": 1.1456, "step": 106 }, { "epoch": 0.010577564688727973, "grad_norm": 12.934347664611838, "learning_rate": 7.039473684210526e-08, "loss": 1.0575, "step": 107 }, { "epoch": 0.01067642043348244, "grad_norm": 13.727672650380955, "learning_rate": 7.105263157894737e-08, "loss": 1.0456, "step": 108 }, { "epoch": 0.010775276178236908, "grad_norm": 18.287886771273506, "learning_rate": 7.171052631578946e-08, "loss": 1.0733, "step": 109 }, { "epoch": 0.010874131922991375, "grad_norm": 18.510818251501302, "learning_rate": 7.236842105263158e-08, "loss": 1.1613, "step": 110 }, { "epoch": 0.010972987667745841, "grad_norm": 25.108705441896266, "learning_rate": 7.302631578947368e-08, "loss": 1.0179, "step": 111 }, { "epoch": 0.01107184341250031, "grad_norm": 13.091098494263713, "learning_rate": 7.368421052631579e-08, "loss": 1.0004, "step": 112 }, { "epoch": 0.011170699157254776, "grad_norm": 10.93754505557004, "learning_rate": 7.434210526315789e-08, "loss": 1.0607, "step": 113 }, { "epoch": 0.011269554902009243, "grad_norm": 10.694229988651172, "learning_rate": 7.5e-08, "loss": 1.0775, "step": 114 }, { "epoch": 0.01136841064676371, "grad_norm": 11.588821552314744, "learning_rate": 7.56578947368421e-08, "loss": 1.1043, "step": 115 }, { "epoch": 0.011467266391518178, "grad_norm": 10.213282833720534, "learning_rate": 7.63157894736842e-08, "loss": 1.0575, "step": 116 }, { "epoch": 0.011566122136272644, "grad_norm": 24.529333519075678, "learning_rate": 7.697368421052631e-08, "loss": 1.0857, "step": 117 }, { "epoch": 0.01166497788102711, "grad_norm": 20.887484558964072, "learning_rate": 7.763157894736841e-08, "loss": 1.0521, "step": 118 }, { "epoch": 0.011763833625781579, "grad_norm": 13.900651669621821, "learning_rate": 7.828947368421053e-08, "loss": 1.0863, "step": 119 }, { "epoch": 0.011862689370536045, "grad_norm": 9.876241253694515, "learning_rate": 7.894736842105262e-08, "loss": 1.1065, "step": 120 }, { "epoch": 0.011961545115290512, "grad_norm": 17.44896731924779, "learning_rate": 7.960526315789473e-08, "loss": 1.0366, "step": 121 }, { "epoch": 0.012060400860044979, "grad_norm": 8.629138629726691, "learning_rate": 8.026315789473685e-08, "loss": 1.1312, "step": 122 }, { "epoch": 0.012159256604799447, "grad_norm": 8.70651247663794, "learning_rate": 8.092105263157895e-08, "loss": 1.0206, "step": 123 }, { "epoch": 0.012258112349553913, "grad_norm": 17.311173406077977, "learning_rate": 8.157894736842104e-08, "loss": 1.0678, "step": 124 }, { "epoch": 0.01235696809430838, "grad_norm": 13.257121528064598, "learning_rate": 8.223684210526315e-08, "loss": 0.9714, "step": 125 }, { "epoch": 0.012455823839062848, "grad_norm": 9.384450323703225, "learning_rate": 8.289473684210526e-08, "loss": 1.0012, "step": 126 }, { "epoch": 0.012554679583817315, "grad_norm": 8.932883850126371, "learning_rate": 8.355263157894737e-08, "loss": 0.9687, "step": 127 }, { "epoch": 0.012653535328571781, "grad_norm": 12.800751030963688, "learning_rate": 8.421052631578946e-08, "loss": 1.0067, "step": 128 }, { "epoch": 0.012752391073326248, "grad_norm": 9.220450048295941, "learning_rate": 8.486842105263158e-08, "loss": 1.0789, "step": 129 }, { "epoch": 0.012851246818080716, "grad_norm": 22.67630905748596, "learning_rate": 8.552631578947368e-08, "loss": 0.962, "step": 130 }, { "epoch": 0.012950102562835183, "grad_norm": 10.49845358818206, "learning_rate": 8.618421052631579e-08, "loss": 1.0665, "step": 131 }, { "epoch": 0.01304895830758965, "grad_norm": 15.38429172795839, "learning_rate": 8.684210526315789e-08, "loss": 0.9817, "step": 132 }, { "epoch": 0.013147814052344118, "grad_norm": 13.459637925702838, "learning_rate": 8.75e-08, "loss": 1.0242, "step": 133 }, { "epoch": 0.013246669797098584, "grad_norm": 8.105136238594314, "learning_rate": 8.81578947368421e-08, "loss": 0.9525, "step": 134 }, { "epoch": 0.01334552554185305, "grad_norm": 45.20154563248894, "learning_rate": 8.88157894736842e-08, "loss": 0.9923, "step": 135 }, { "epoch": 0.013444381286607517, "grad_norm": 9.795013178084414, "learning_rate": 8.947368421052631e-08, "loss": 1.0543, "step": 136 }, { "epoch": 0.013543237031361986, "grad_norm": 11.344645171644183, "learning_rate": 9.013157894736841e-08, "loss": 0.9959, "step": 137 }, { "epoch": 0.013642092776116452, "grad_norm": 30.49206720065832, "learning_rate": 9.078947368421053e-08, "loss": 1.015, "step": 138 }, { "epoch": 0.013740948520870919, "grad_norm": 10.73647581982021, "learning_rate": 9.144736842105262e-08, "loss": 1.0105, "step": 139 }, { "epoch": 0.013839804265625387, "grad_norm": 10.426677844741391, "learning_rate": 9.210526315789473e-08, "loss": 1.0127, "step": 140 }, { "epoch": 0.013938660010379854, "grad_norm": 9.93998020191762, "learning_rate": 9.276315789473685e-08, "loss": 1.0682, "step": 141 }, { "epoch": 0.01403751575513432, "grad_norm": 84.9191220115612, "learning_rate": 9.342105263157895e-08, "loss": 0.9885, "step": 142 }, { "epoch": 0.014136371499888787, "grad_norm": 9.60801124579524, "learning_rate": 9.407894736842104e-08, "loss": 1.0754, "step": 143 }, { "epoch": 0.014235227244643255, "grad_norm": 7.352246015543652, "learning_rate": 9.473684210526315e-08, "loss": 0.9727, "step": 144 }, { "epoch": 0.014334082989397721, "grad_norm": 37.867138735653675, "learning_rate": 9.539473684210526e-08, "loss": 1.0598, "step": 145 }, { "epoch": 0.014432938734152188, "grad_norm": 8.738463381307774, "learning_rate": 9.605263157894737e-08, "loss": 0.9525, "step": 146 }, { "epoch": 0.014531794478906656, "grad_norm": 154.06000804647067, "learning_rate": 9.671052631578946e-08, "loss": 0.989, "step": 147 }, { "epoch": 0.014630650223661123, "grad_norm": 18.822958567470394, "learning_rate": 9.736842105263158e-08, "loss": 1.0833, "step": 148 }, { "epoch": 0.01472950596841559, "grad_norm": 8.804414684626433, "learning_rate": 9.802631578947368e-08, "loss": 1.0046, "step": 149 }, { "epoch": 0.014828361713170056, "grad_norm": 15.858054663189922, "learning_rate": 9.868421052631579e-08, "loss": 1.0, "step": 150 }, { "epoch": 0.014927217457924524, "grad_norm": 8.002709504902622, "learning_rate": 9.934210526315789e-08, "loss": 0.9236, "step": 151 }, { "epoch": 0.01502607320267899, "grad_norm": 14.553093668522887, "learning_rate": 1e-07, "loss": 0.9369, "step": 152 }, { "epoch": 0.015124928947433457, "grad_norm": 9.28585115225019, "learning_rate": 1.006578947368421e-07, "loss": 1.117, "step": 153 }, { "epoch": 0.015223784692187924, "grad_norm": 8.54560759406047, "learning_rate": 1.013157894736842e-07, "loss": 1.0503, "step": 154 }, { "epoch": 0.015322640436942392, "grad_norm": 7.67773918625784, "learning_rate": 1.019736842105263e-07, "loss": 1.0919, "step": 155 }, { "epoch": 0.015421496181696859, "grad_norm": 10.42466209953423, "learning_rate": 1.0263157894736843e-07, "loss": 1.018, "step": 156 }, { "epoch": 0.015520351926451325, "grad_norm": 13.379321545328466, "learning_rate": 1.0328947368421053e-07, "loss": 0.9797, "step": 157 }, { "epoch": 0.015619207671205794, "grad_norm": 8.046068101481339, "learning_rate": 1.0394736842105262e-07, "loss": 0.9958, "step": 158 }, { "epoch": 0.01571806341596026, "grad_norm": 7.254624033301213, "learning_rate": 1.0460526315789473e-07, "loss": 1.1204, "step": 159 }, { "epoch": 0.01581691916071473, "grad_norm": 8.036053744601878, "learning_rate": 1.0526315789473683e-07, "loss": 1.078, "step": 160 }, { "epoch": 0.015915774905469193, "grad_norm": 10.96173017141793, "learning_rate": 1.0592105263157894e-07, "loss": 1.1073, "step": 161 }, { "epoch": 0.01601463065022366, "grad_norm": 6.227885279321283, "learning_rate": 1.0657894736842105e-07, "loss": 0.9296, "step": 162 }, { "epoch": 0.01611348639497813, "grad_norm": 6.87831428471293, "learning_rate": 1.0723684210526316e-07, "loss": 0.969, "step": 163 }, { "epoch": 0.016212342139732595, "grad_norm": 8.7835846647056, "learning_rate": 1.0789473684210526e-07, "loss": 0.9994, "step": 164 }, { "epoch": 0.016311197884487063, "grad_norm": 9.585784133415094, "learning_rate": 1.0855263157894737e-07, "loss": 1.0493, "step": 165 }, { "epoch": 0.016410053629241528, "grad_norm": 8.586580508648508, "learning_rate": 1.0921052631578946e-07, "loss": 1.0069, "step": 166 }, { "epoch": 0.016508909373995996, "grad_norm": 7.839914197423952, "learning_rate": 1.0986842105263156e-07, "loss": 1.0349, "step": 167 }, { "epoch": 0.016607765118750464, "grad_norm": 6.214837014620745, "learning_rate": 1.105263157894737e-07, "loss": 0.948, "step": 168 }, { "epoch": 0.01670662086350493, "grad_norm": 8.672882714510084, "learning_rate": 1.1118421052631579e-07, "loss": 0.9165, "step": 169 }, { "epoch": 0.016805476608259397, "grad_norm": 9.19483184084849, "learning_rate": 1.1184210526315789e-07, "loss": 0.9568, "step": 170 }, { "epoch": 0.016904332353013866, "grad_norm": 7.211727957865169, "learning_rate": 1.125e-07, "loss": 1.0107, "step": 171 }, { "epoch": 0.01700318809776833, "grad_norm": 7.000621559410127, "learning_rate": 1.131578947368421e-07, "loss": 0.9805, "step": 172 }, { "epoch": 0.0171020438425228, "grad_norm": 6.776582260245124, "learning_rate": 1.138157894736842e-07, "loss": 0.9136, "step": 173 }, { "epoch": 0.017200899587277267, "grad_norm": 48.80703781971459, "learning_rate": 1.144736842105263e-07, "loss": 1.0061, "step": 174 }, { "epoch": 0.017299755332031732, "grad_norm": 9.899510148202532, "learning_rate": 1.1513157894736843e-07, "loss": 0.9793, "step": 175 }, { "epoch": 0.0173986110767862, "grad_norm": 6.807484060976695, "learning_rate": 1.1578947368421053e-07, "loss": 0.9438, "step": 176 }, { "epoch": 0.017497466821540665, "grad_norm": 9.501054529202943, "learning_rate": 1.1644736842105262e-07, "loss": 0.9263, "step": 177 }, { "epoch": 0.017596322566295133, "grad_norm": 12.684440239429364, "learning_rate": 1.1710526315789473e-07, "loss": 1.0145, "step": 178 }, { "epoch": 0.0176951783110496, "grad_norm": 10.238266556413308, "learning_rate": 1.1776315789473683e-07, "loss": 0.9481, "step": 179 }, { "epoch": 0.017794034055804066, "grad_norm": 11.388249978188357, "learning_rate": 1.1842105263157894e-07, "loss": 1.0565, "step": 180 }, { "epoch": 0.017892889800558535, "grad_norm": 8.638444349599615, "learning_rate": 1.1907894736842105e-07, "loss": 0.9731, "step": 181 }, { "epoch": 0.017991745545313003, "grad_norm": 7.600743407147101, "learning_rate": 1.1973684210526316e-07, "loss": 0.9954, "step": 182 }, { "epoch": 0.018090601290067468, "grad_norm": 8.062910417564103, "learning_rate": 1.2039473684210526e-07, "loss": 1.0336, "step": 183 }, { "epoch": 0.018189457034821936, "grad_norm": 6.845585087062875, "learning_rate": 1.2105263157894737e-07, "loss": 0.8703, "step": 184 }, { "epoch": 0.018288312779576404, "grad_norm": 5.500591921602448, "learning_rate": 1.2171052631578947e-07, "loss": 0.946, "step": 185 }, { "epoch": 0.01838716852433087, "grad_norm": 5.792695121885709, "learning_rate": 1.2236842105263158e-07, "loss": 0.9055, "step": 186 }, { "epoch": 0.018486024269085338, "grad_norm": 6.750291369804461, "learning_rate": 1.2302631578947368e-07, "loss": 0.9828, "step": 187 }, { "epoch": 0.018584880013839806, "grad_norm": 5.66225015799794, "learning_rate": 1.2368421052631579e-07, "loss": 0.9379, "step": 188 }, { "epoch": 0.01868373575859427, "grad_norm": 7.627144143639098, "learning_rate": 1.243421052631579e-07, "loss": 0.9354, "step": 189 }, { "epoch": 0.01878259150334874, "grad_norm": 5.6705022383636265, "learning_rate": 1.25e-07, "loss": 0.9458, "step": 190 }, { "epoch": 0.018881447248103204, "grad_norm": 8.81749430986215, "learning_rate": 1.256578947368421e-07, "loss": 0.9417, "step": 191 }, { "epoch": 0.018980302992857672, "grad_norm": 8.304119562253083, "learning_rate": 1.263157894736842e-07, "loss": 0.8957, "step": 192 }, { "epoch": 0.01907915873761214, "grad_norm": 11.067849371349094, "learning_rate": 1.269736842105263e-07, "loss": 0.8583, "step": 193 }, { "epoch": 0.019178014482366605, "grad_norm": 14.691139380694542, "learning_rate": 1.276315789473684e-07, "loss": 0.9321, "step": 194 }, { "epoch": 0.019276870227121073, "grad_norm": 6.033635379255785, "learning_rate": 1.2828947368421052e-07, "loss": 0.942, "step": 195 }, { "epoch": 0.019375725971875542, "grad_norm": 44.40632926098656, "learning_rate": 1.2894736842105262e-07, "loss": 1.0129, "step": 196 }, { "epoch": 0.019474581716630007, "grad_norm": 5.032443622461142, "learning_rate": 1.2960526315789473e-07, "loss": 0.9297, "step": 197 }, { "epoch": 0.019573437461384475, "grad_norm": 6.505173085220018, "learning_rate": 1.3026315789473683e-07, "loss": 0.8746, "step": 198 }, { "epoch": 0.019672293206138943, "grad_norm": 5.349635522929612, "learning_rate": 1.3092105263157894e-07, "loss": 0.9155, "step": 199 }, { "epoch": 0.019771148950893408, "grad_norm": 50.60483552817676, "learning_rate": 1.3157894736842107e-07, "loss": 0.9024, "step": 200 }, { "epoch": 0.019870004695647876, "grad_norm": 5.695287837213799, "learning_rate": 1.3223684210526317e-07, "loss": 0.9828, "step": 201 }, { "epoch": 0.019968860440402345, "grad_norm": 6.222308626549295, "learning_rate": 1.3289473684210525e-07, "loss": 0.8506, "step": 202 }, { "epoch": 0.02006771618515681, "grad_norm": 6.975139667568593, "learning_rate": 1.3355263157894735e-07, "loss": 0.9418, "step": 203 }, { "epoch": 0.020166571929911278, "grad_norm": 13.365042956743633, "learning_rate": 1.3421052631578946e-07, "loss": 0.9018, "step": 204 }, { "epoch": 0.020265427674665742, "grad_norm": 7.788559187107898, "learning_rate": 1.3486842105263156e-07, "loss": 0.9469, "step": 205 }, { "epoch": 0.02036428341942021, "grad_norm": 4.8050129828736035, "learning_rate": 1.355263157894737e-07, "loss": 0.9741, "step": 206 }, { "epoch": 0.02046313916417468, "grad_norm": 7.160654119075421, "learning_rate": 1.361842105263158e-07, "loss": 0.9262, "step": 207 }, { "epoch": 0.020561994908929144, "grad_norm": 6.100421251572534, "learning_rate": 1.368421052631579e-07, "loss": 0.9218, "step": 208 }, { "epoch": 0.020660850653683612, "grad_norm": 6.737953849181826, "learning_rate": 1.375e-07, "loss": 0.918, "step": 209 }, { "epoch": 0.02075970639843808, "grad_norm": 7.92374117191971, "learning_rate": 1.3815789473684209e-07, "loss": 0.9205, "step": 210 }, { "epoch": 0.020858562143192545, "grad_norm": 5.4721842622805035, "learning_rate": 1.388157894736842e-07, "loss": 1.0177, "step": 211 }, { "epoch": 0.020957417887947014, "grad_norm": 14.430571833713174, "learning_rate": 1.394736842105263e-07, "loss": 0.8871, "step": 212 }, { "epoch": 0.021056273632701482, "grad_norm": 6.42911338349966, "learning_rate": 1.4013157894736843e-07, "loss": 0.876, "step": 213 }, { "epoch": 0.021155129377455947, "grad_norm": 5.88556656635028, "learning_rate": 1.4078947368421053e-07, "loss": 0.8746, "step": 214 }, { "epoch": 0.021253985122210415, "grad_norm": 14.045518583665917, "learning_rate": 1.4144736842105263e-07, "loss": 0.981, "step": 215 }, { "epoch": 0.02135284086696488, "grad_norm": 14.989803515257334, "learning_rate": 1.4210526315789474e-07, "loss": 0.8946, "step": 216 }, { "epoch": 0.021451696611719348, "grad_norm": 4.184925185507607, "learning_rate": 1.4276315789473682e-07, "loss": 0.9389, "step": 217 }, { "epoch": 0.021550552356473816, "grad_norm": 6.687260028808132, "learning_rate": 1.4342105263157892e-07, "loss": 0.883, "step": 218 }, { "epoch": 0.02164940810122828, "grad_norm": 5.630434184179349, "learning_rate": 1.4407894736842105e-07, "loss": 0.8768, "step": 219 }, { "epoch": 0.02174826384598275, "grad_norm": 5.19053635806534, "learning_rate": 1.4473684210526316e-07, "loss": 0.8423, "step": 220 }, { "epoch": 0.021847119590737218, "grad_norm": 5.861033794277367, "learning_rate": 1.4539473684210526e-07, "loss": 0.9897, "step": 221 }, { "epoch": 0.021945975335491683, "grad_norm": 6.0708170823934475, "learning_rate": 1.4605263157894737e-07, "loss": 0.9804, "step": 222 }, { "epoch": 0.02204483108024615, "grad_norm": 7.0080960560379255, "learning_rate": 1.4671052631578947e-07, "loss": 0.9715, "step": 223 }, { "epoch": 0.02214368682500062, "grad_norm": 5.600333528334201, "learning_rate": 1.4736842105263158e-07, "loss": 0.8283, "step": 224 }, { "epoch": 0.022242542569755084, "grad_norm": 8.34310915369233, "learning_rate": 1.4802631578947368e-07, "loss": 0.7616, "step": 225 }, { "epoch": 0.022341398314509552, "grad_norm": 5.157518897388644, "learning_rate": 1.4868421052631578e-07, "loss": 0.891, "step": 226 }, { "epoch": 0.02244025405926402, "grad_norm": 5.9524881787097215, "learning_rate": 1.493421052631579e-07, "loss": 0.8792, "step": 227 }, { "epoch": 0.022539109804018485, "grad_norm": 7.514775598803251, "learning_rate": 1.5e-07, "loss": 0.9301, "step": 228 }, { "epoch": 0.022637965548772954, "grad_norm": 14.046518614465036, "learning_rate": 1.506578947368421e-07, "loss": 0.9474, "step": 229 }, { "epoch": 0.02273682129352742, "grad_norm": 6.192233454523717, "learning_rate": 1.513157894736842e-07, "loss": 0.8696, "step": 230 }, { "epoch": 0.022835677038281887, "grad_norm": 5.978207230614067, "learning_rate": 1.519736842105263e-07, "loss": 0.9174, "step": 231 }, { "epoch": 0.022934532783036355, "grad_norm": 6.578420824460962, "learning_rate": 1.526315789473684e-07, "loss": 0.9253, "step": 232 }, { "epoch": 0.02303338852779082, "grad_norm": 7.009076585244014, "learning_rate": 1.5328947368421052e-07, "loss": 0.8757, "step": 233 }, { "epoch": 0.023132244272545288, "grad_norm": 4.127479188865312, "learning_rate": 1.5394736842105262e-07, "loss": 0.9192, "step": 234 }, { "epoch": 0.023231100017299756, "grad_norm": 17.260180936889498, "learning_rate": 1.5460526315789472e-07, "loss": 0.9029, "step": 235 }, { "epoch": 0.02332995576205422, "grad_norm": 5.665725666958535, "learning_rate": 1.5526315789473683e-07, "loss": 0.945, "step": 236 }, { "epoch": 0.02342881150680869, "grad_norm": 5.4796015900415584, "learning_rate": 1.5592105263157893e-07, "loss": 0.9806, "step": 237 }, { "epoch": 0.023527667251563158, "grad_norm": 6.387678925959827, "learning_rate": 1.5657894736842107e-07, "loss": 0.9551, "step": 238 }, { "epoch": 0.023626522996317623, "grad_norm": 5.534777241888097, "learning_rate": 1.5723684210526317e-07, "loss": 0.8345, "step": 239 }, { "epoch": 0.02372537874107209, "grad_norm": 5.412055751863232, "learning_rate": 1.5789473684210525e-07, "loss": 0.9603, "step": 240 }, { "epoch": 0.02382423448582656, "grad_norm": 4.743483162983015, "learning_rate": 1.5855263157894735e-07, "loss": 0.8826, "step": 241 }, { "epoch": 0.023923090230581024, "grad_norm": 5.762764154527077, "learning_rate": 1.5921052631578946e-07, "loss": 0.8683, "step": 242 }, { "epoch": 0.024021945975335492, "grad_norm": 8.730391086925994, "learning_rate": 1.5986842105263156e-07, "loss": 0.8318, "step": 243 }, { "epoch": 0.024120801720089957, "grad_norm": 5.742619182540114, "learning_rate": 1.605263157894737e-07, "loss": 0.8868, "step": 244 }, { "epoch": 0.024219657464844425, "grad_norm": 8.534223110897912, "learning_rate": 1.611842105263158e-07, "loss": 0.8559, "step": 245 }, { "epoch": 0.024318513209598894, "grad_norm": 32.46803214043959, "learning_rate": 1.618421052631579e-07, "loss": 0.9414, "step": 246 }, { "epoch": 0.02441736895435336, "grad_norm": 6.375318276638526, "learning_rate": 1.6249999999999998e-07, "loss": 0.9299, "step": 247 }, { "epoch": 0.024516224699107827, "grad_norm": 5.979823353380745, "learning_rate": 1.6315789473684208e-07, "loss": 0.8036, "step": 248 }, { "epoch": 0.024615080443862295, "grad_norm": 6.087671317981926, "learning_rate": 1.638157894736842e-07, "loss": 0.8693, "step": 249 }, { "epoch": 0.02471393618861676, "grad_norm": 6.915152298729674, "learning_rate": 1.644736842105263e-07, "loss": 0.8972, "step": 250 }, { "epoch": 0.024812791933371228, "grad_norm": 10.247492978180086, "learning_rate": 1.6513157894736842e-07, "loss": 0.8578, "step": 251 }, { "epoch": 0.024911647678125697, "grad_norm": 9.431763370350696, "learning_rate": 1.6578947368421053e-07, "loss": 0.8925, "step": 252 }, { "epoch": 0.02501050342288016, "grad_norm": 7.194837800136805, "learning_rate": 1.6644736842105263e-07, "loss": 0.9089, "step": 253 }, { "epoch": 0.02510935916763463, "grad_norm": 6.58907781938773, "learning_rate": 1.6710526315789474e-07, "loss": 0.898, "step": 254 }, { "epoch": 0.025208214912389098, "grad_norm": 4.292600996418018, "learning_rate": 1.6776315789473682e-07, "loss": 0.8772, "step": 255 }, { "epoch": 0.025307070657143563, "grad_norm": 11.905136070935857, "learning_rate": 1.6842105263157892e-07, "loss": 0.8715, "step": 256 }, { "epoch": 0.02540592640189803, "grad_norm": 4.699543582800261, "learning_rate": 1.6907894736842105e-07, "loss": 0.7948, "step": 257 }, { "epoch": 0.025504782146652496, "grad_norm": 6.130909306230232, "learning_rate": 1.6973684210526316e-07, "loss": 0.8896, "step": 258 }, { "epoch": 0.025603637891406964, "grad_norm": 7.395915028686098, "learning_rate": 1.7039473684210526e-07, "loss": 0.892, "step": 259 }, { "epoch": 0.025702493636161432, "grad_norm": 34.6930542988766, "learning_rate": 1.7105263157894736e-07, "loss": 0.8237, "step": 260 }, { "epoch": 0.025801349380915897, "grad_norm": 10.254149909100025, "learning_rate": 1.7171052631578947e-07, "loss": 0.8613, "step": 261 }, { "epoch": 0.025900205125670366, "grad_norm": 5.083506746674676, "learning_rate": 1.7236842105263157e-07, "loss": 0.8866, "step": 262 }, { "epoch": 0.025999060870424834, "grad_norm": 4.836952013765102, "learning_rate": 1.7302631578947368e-07, "loss": 0.8888, "step": 263 }, { "epoch": 0.0260979166151793, "grad_norm": 11.414818959540721, "learning_rate": 1.7368421052631578e-07, "loss": 0.8184, "step": 264 }, { "epoch": 0.026196772359933767, "grad_norm": 7.102826819721976, "learning_rate": 1.743421052631579e-07, "loss": 0.9272, "step": 265 }, { "epoch": 0.026295628104688235, "grad_norm": 4.305526545612905, "learning_rate": 1.75e-07, "loss": 0.9278, "step": 266 }, { "epoch": 0.0263944838494427, "grad_norm": 4.136733915711035, "learning_rate": 1.756578947368421e-07, "loss": 0.7844, "step": 267 }, { "epoch": 0.02649333959419717, "grad_norm": 4.096048932106808, "learning_rate": 1.763157894736842e-07, "loss": 0.9174, "step": 268 }, { "epoch": 0.026592195338951633, "grad_norm": 5.937952086656196, "learning_rate": 1.769736842105263e-07, "loss": 0.8535, "step": 269 }, { "epoch": 0.0266910510837061, "grad_norm": 14.045522971307724, "learning_rate": 1.776315789473684e-07, "loss": 0.8854, "step": 270 }, { "epoch": 0.02678990682846057, "grad_norm": 5.691906363431568, "learning_rate": 1.7828947368421051e-07, "loss": 0.9041, "step": 271 }, { "epoch": 0.026888762573215035, "grad_norm": 6.0728266785093155, "learning_rate": 1.7894736842105262e-07, "loss": 0.8814, "step": 272 }, { "epoch": 0.026987618317969503, "grad_norm": 7.196579720309739, "learning_rate": 1.7960526315789472e-07, "loss": 0.8354, "step": 273 }, { "epoch": 0.02708647406272397, "grad_norm": 6.841013900237215, "learning_rate": 1.8026315789473683e-07, "loss": 0.8896, "step": 274 }, { "epoch": 0.027185329807478436, "grad_norm": 4.664238625859023, "learning_rate": 1.8092105263157893e-07, "loss": 0.9236, "step": 275 }, { "epoch": 0.027284185552232904, "grad_norm": 5.890687750366183, "learning_rate": 1.8157894736842106e-07, "loss": 0.8317, "step": 276 }, { "epoch": 0.027383041296987373, "grad_norm": 4.162270080326169, "learning_rate": 1.8223684210526317e-07, "loss": 0.8367, "step": 277 }, { "epoch": 0.027481897041741837, "grad_norm": 6.1790375030473745, "learning_rate": 1.8289473684210525e-07, "loss": 0.9113, "step": 278 }, { "epoch": 0.027580752786496306, "grad_norm": 5.140351816938926, "learning_rate": 1.8355263157894735e-07, "loss": 0.8163, "step": 279 }, { "epoch": 0.027679608531250774, "grad_norm": 9.482714168289796, "learning_rate": 1.8421052631578946e-07, "loss": 0.9143, "step": 280 }, { "epoch": 0.02777846427600524, "grad_norm": 7.179291769102827, "learning_rate": 1.8486842105263156e-07, "loss": 0.9207, "step": 281 }, { "epoch": 0.027877320020759707, "grad_norm": 7.595006199026464, "learning_rate": 1.855263157894737e-07, "loss": 0.8438, "step": 282 }, { "epoch": 0.027976175765514172, "grad_norm": 4.479918478157686, "learning_rate": 1.861842105263158e-07, "loss": 0.8697, "step": 283 }, { "epoch": 0.02807503151026864, "grad_norm": 6.524122071899191, "learning_rate": 1.868421052631579e-07, "loss": 0.8941, "step": 284 }, { "epoch": 0.02817388725502311, "grad_norm": 7.300543531941686, "learning_rate": 1.875e-07, "loss": 0.7524, "step": 285 }, { "epoch": 0.028272742999777573, "grad_norm": 8.132327592430405, "learning_rate": 1.8815789473684208e-07, "loss": 0.8157, "step": 286 }, { "epoch": 0.02837159874453204, "grad_norm": 11.705346345672435, "learning_rate": 1.888157894736842e-07, "loss": 0.8662, "step": 287 }, { "epoch": 0.02847045448928651, "grad_norm": 5.3530130770076445, "learning_rate": 1.894736842105263e-07, "loss": 0.8341, "step": 288 }, { "epoch": 0.028569310234040975, "grad_norm": 6.5029793213155225, "learning_rate": 1.9013157894736842e-07, "loss": 0.8981, "step": 289 }, { "epoch": 0.028668165978795443, "grad_norm": 8.365708325740078, "learning_rate": 1.9078947368421053e-07, "loss": 0.8404, "step": 290 }, { "epoch": 0.02876702172354991, "grad_norm": 5.082720662307608, "learning_rate": 1.9144736842105263e-07, "loss": 0.8874, "step": 291 }, { "epoch": 0.028865877468304376, "grad_norm": 6.541338043696397, "learning_rate": 1.9210526315789474e-07, "loss": 0.9247, "step": 292 }, { "epoch": 0.028964733213058844, "grad_norm": 4.255655761637827, "learning_rate": 1.9276315789473681e-07, "loss": 0.8499, "step": 293 }, { "epoch": 0.029063588957813313, "grad_norm": 8.038414636102766, "learning_rate": 1.9342105263157892e-07, "loss": 0.7558, "step": 294 }, { "epoch": 0.029162444702567777, "grad_norm": 4.06354472740116, "learning_rate": 1.9407894736842105e-07, "loss": 0.8431, "step": 295 }, { "epoch": 0.029261300447322246, "grad_norm": 8.03754489269599, "learning_rate": 1.9473684210526315e-07, "loss": 0.8378, "step": 296 }, { "epoch": 0.02936015619207671, "grad_norm": 10.985117531903805, "learning_rate": 1.9539473684210526e-07, "loss": 0.9433, "step": 297 }, { "epoch": 0.02945901193683118, "grad_norm": 5.196650218876621, "learning_rate": 1.9605263157894736e-07, "loss": 0.901, "step": 298 }, { "epoch": 0.029557867681585647, "grad_norm": 6.141495419134231, "learning_rate": 1.9671052631578947e-07, "loss": 0.8812, "step": 299 }, { "epoch": 0.029656723426340112, "grad_norm": 4.7059109300415445, "learning_rate": 1.9736842105263157e-07, "loss": 0.8957, "step": 300 }, { "epoch": 0.02975557917109458, "grad_norm": 12.684093658926217, "learning_rate": 1.9802631578947368e-07, "loss": 0.8535, "step": 301 }, { "epoch": 0.02985443491584905, "grad_norm": 3.5528956101403706, "learning_rate": 1.9868421052631578e-07, "loss": 0.768, "step": 302 }, { "epoch": 0.029953290660603513, "grad_norm": 11.92314178068368, "learning_rate": 1.9934210526315789e-07, "loss": 0.8062, "step": 303 }, { "epoch": 0.03005214640535798, "grad_norm": 3.7151086410731793, "learning_rate": 2e-07, "loss": 0.8422, "step": 304 }, { "epoch": 0.03015100215011245, "grad_norm": 5.266364512168245, "learning_rate": 1.9999999487323755e-07, "loss": 0.9344, "step": 305 }, { "epoch": 0.030249857894866915, "grad_norm": 6.384002459161972, "learning_rate": 1.9999997949295072e-07, "loss": 0.832, "step": 306 }, { "epoch": 0.030348713639621383, "grad_norm": 4.672920472534386, "learning_rate": 1.9999995385914113e-07, "loss": 0.7971, "step": 307 }, { "epoch": 0.030447569384375848, "grad_norm": 4.788246342785124, "learning_rate": 1.9999991797181134e-07, "loss": 0.7648, "step": 308 }, { "epoch": 0.030546425129130316, "grad_norm": 5.249308298706671, "learning_rate": 1.9999987183096508e-07, "loss": 0.8745, "step": 309 }, { "epoch": 0.030645280873884784, "grad_norm": 26.80512696800146, "learning_rate": 1.9999981543660706e-07, "loss": 0.8147, "step": 310 }, { "epoch": 0.03074413661863925, "grad_norm": 3.4260569534034615, "learning_rate": 1.9999974878874305e-07, "loss": 0.8392, "step": 311 }, { "epoch": 0.030842992363393718, "grad_norm": 3.90712815438934, "learning_rate": 1.9999967188737995e-07, "loss": 0.8453, "step": 312 }, { "epoch": 0.030941848108148186, "grad_norm": 7.835493126132616, "learning_rate": 1.9999958473252557e-07, "loss": 0.7857, "step": 313 }, { "epoch": 0.03104070385290265, "grad_norm": 4.889129625411227, "learning_rate": 1.9999948732418885e-07, "loss": 0.7823, "step": 314 }, { "epoch": 0.03113955959765712, "grad_norm": 22.55418392914248, "learning_rate": 1.9999937966237983e-07, "loss": 0.8466, "step": 315 }, { "epoch": 0.031238415342411587, "grad_norm": 8.018760679336987, "learning_rate": 1.9999926174710954e-07, "loss": 0.7594, "step": 316 }, { "epoch": 0.03133727108716605, "grad_norm": 10.44961557573718, "learning_rate": 1.9999913357839e-07, "loss": 0.7775, "step": 317 }, { "epoch": 0.03143612683192052, "grad_norm": 4.3998926640318965, "learning_rate": 1.9999899515623443e-07, "loss": 0.8891, "step": 318 }, { "epoch": 0.03153498257667499, "grad_norm": 4.035369799976758, "learning_rate": 1.9999884648065699e-07, "loss": 0.881, "step": 319 }, { "epoch": 0.03163383832142946, "grad_norm": 5.334145767204725, "learning_rate": 1.9999868755167295e-07, "loss": 0.7735, "step": 320 }, { "epoch": 0.03173269406618392, "grad_norm": 4.308574675131147, "learning_rate": 1.9999851836929857e-07, "loss": 0.8619, "step": 321 }, { "epoch": 0.03183154981093839, "grad_norm": 5.610229127457844, "learning_rate": 1.999983389335512e-07, "loss": 0.7464, "step": 322 }, { "epoch": 0.031930405555692855, "grad_norm": 8.076606759530424, "learning_rate": 1.9999814924444926e-07, "loss": 0.831, "step": 323 }, { "epoch": 0.03202926130044732, "grad_norm": 3.9371867728901284, "learning_rate": 1.999979493020122e-07, "loss": 0.8347, "step": 324 }, { "epoch": 0.03212811704520179, "grad_norm": 6.7604657597178965, "learning_rate": 1.9999773910626052e-07, "loss": 0.8846, "step": 325 }, { "epoch": 0.03222697278995626, "grad_norm": 5.044381436362942, "learning_rate": 1.9999751865721574e-07, "loss": 0.9535, "step": 326 }, { "epoch": 0.03232582853471072, "grad_norm": 4.025716575067825, "learning_rate": 1.9999728795490052e-07, "loss": 0.8322, "step": 327 }, { "epoch": 0.03242468427946519, "grad_norm": 7.2894766020034805, "learning_rate": 1.9999704699933847e-07, "loss": 0.8434, "step": 328 }, { "epoch": 0.03252354002421966, "grad_norm": 5.177011522310207, "learning_rate": 1.999967957905543e-07, "loss": 0.8951, "step": 329 }, { "epoch": 0.032622395768974126, "grad_norm": 7.085312951981446, "learning_rate": 1.9999653432857379e-07, "loss": 0.8164, "step": 330 }, { "epoch": 0.032721251513728594, "grad_norm": 4.7538627531621165, "learning_rate": 1.9999626261342373e-07, "loss": 0.8931, "step": 331 }, { "epoch": 0.032820107258483056, "grad_norm": 5.408924907598474, "learning_rate": 1.99995980645132e-07, "loss": 0.841, "step": 332 }, { "epoch": 0.032918963003237524, "grad_norm": 14.449733296281659, "learning_rate": 1.9999568842372748e-07, "loss": 0.9255, "step": 333 }, { "epoch": 0.03301781874799199, "grad_norm": 4.422665063018142, "learning_rate": 1.9999538594924016e-07, "loss": 0.792, "step": 334 }, { "epoch": 0.03311667449274646, "grad_norm": 4.613792173053604, "learning_rate": 1.9999507322170104e-07, "loss": 0.864, "step": 335 }, { "epoch": 0.03321553023750093, "grad_norm": 6.887211411703311, "learning_rate": 1.9999475024114219e-07, "loss": 0.8341, "step": 336 }, { "epoch": 0.0333143859822554, "grad_norm": 7.7204027978785845, "learning_rate": 1.9999441700759676e-07, "loss": 0.9416, "step": 337 }, { "epoch": 0.03341324172700986, "grad_norm": 5.6965610314041815, "learning_rate": 1.9999407352109886e-07, "loss": 0.8506, "step": 338 }, { "epoch": 0.03351209747176433, "grad_norm": 11.934550181444044, "learning_rate": 1.9999371978168374e-07, "loss": 0.8064, "step": 339 }, { "epoch": 0.033610953216518795, "grad_norm": 5.789253230698797, "learning_rate": 1.9999335578938767e-07, "loss": 0.934, "step": 340 }, { "epoch": 0.03370980896127326, "grad_norm": 6.230886370004745, "learning_rate": 1.9999298154424796e-07, "loss": 0.8545, "step": 341 }, { "epoch": 0.03380866470602773, "grad_norm": 6.399403280360268, "learning_rate": 1.99992597046303e-07, "loss": 0.8393, "step": 342 }, { "epoch": 0.03390752045078219, "grad_norm": 3.611734081030316, "learning_rate": 1.999922022955922e-07, "loss": 0.8417, "step": 343 }, { "epoch": 0.03400637619553666, "grad_norm": 6.901492076279038, "learning_rate": 1.999917972921561e-07, "loss": 0.7153, "step": 344 }, { "epoch": 0.03410523194029113, "grad_norm": 5.667731123821929, "learning_rate": 1.999913820360361e-07, "loss": 0.771, "step": 345 }, { "epoch": 0.0342040876850456, "grad_norm": 4.086026152003282, "learning_rate": 1.9999095652727487e-07, "loss": 0.8265, "step": 346 }, { "epoch": 0.034302943429800066, "grad_norm": 5.691119881385631, "learning_rate": 1.9999052076591606e-07, "loss": 0.8483, "step": 347 }, { "epoch": 0.034401799174554534, "grad_norm": 12.438329093933309, "learning_rate": 1.9999007475200425e-07, "loss": 0.8122, "step": 348 }, { "epoch": 0.034500654919308996, "grad_norm": 4.057495729207768, "learning_rate": 1.9998961848558524e-07, "loss": 0.8218, "step": 349 }, { "epoch": 0.034599510664063464, "grad_norm": 4.4885159107755355, "learning_rate": 1.9998915196670584e-07, "loss": 0.8406, "step": 350 }, { "epoch": 0.03469836640881793, "grad_norm": 15.238497547228697, "learning_rate": 1.999886751954138e-07, "loss": 0.858, "step": 351 }, { "epoch": 0.0347972221535724, "grad_norm": 8.765867990730682, "learning_rate": 1.9998818817175808e-07, "loss": 0.7987, "step": 352 }, { "epoch": 0.03489607789832687, "grad_norm": 20.71816332235506, "learning_rate": 1.9998769089578863e-07, "loss": 0.7851, "step": 353 }, { "epoch": 0.03499493364308133, "grad_norm": 4.539548707016545, "learning_rate": 1.9998718336755635e-07, "loss": 0.8783, "step": 354 }, { "epoch": 0.0350937893878358, "grad_norm": 5.160358072525824, "learning_rate": 1.9998666558711338e-07, "loss": 0.8797, "step": 355 }, { "epoch": 0.03519264513259027, "grad_norm": 8.032174110957932, "learning_rate": 1.999861375545127e-07, "loss": 0.8274, "step": 356 }, { "epoch": 0.035291500877344735, "grad_norm": 5.719100658732963, "learning_rate": 1.9998559926980856e-07, "loss": 0.8271, "step": 357 }, { "epoch": 0.0353903566220992, "grad_norm": 3.5258946125174924, "learning_rate": 1.9998505073305606e-07, "loss": 0.7885, "step": 358 }, { "epoch": 0.03548921236685367, "grad_norm": 5.324561683955382, "learning_rate": 1.9998449194431154e-07, "loss": 0.85, "step": 359 }, { "epoch": 0.03558806811160813, "grad_norm": 5.679128420602625, "learning_rate": 1.9998392290363223e-07, "loss": 0.7947, "step": 360 }, { "epoch": 0.0356869238563626, "grad_norm": 4.67555845171299, "learning_rate": 1.9998334361107646e-07, "loss": 0.8389, "step": 361 }, { "epoch": 0.03578577960111707, "grad_norm": 4.068869759218887, "learning_rate": 1.9998275406670372e-07, "loss": 0.7432, "step": 362 }, { "epoch": 0.03588463534587154, "grad_norm": 3.9129955108062515, "learning_rate": 1.9998215427057435e-07, "loss": 0.7722, "step": 363 }, { "epoch": 0.035983491090626006, "grad_norm": 14.655470022543033, "learning_rate": 1.9998154422274993e-07, "loss": 0.8562, "step": 364 }, { "epoch": 0.036082346835380474, "grad_norm": 7.1942008439858265, "learning_rate": 1.9998092392329295e-07, "loss": 0.7988, "step": 365 }, { "epoch": 0.036181202580134936, "grad_norm": 5.111128699112608, "learning_rate": 1.9998029337226706e-07, "loss": 0.8902, "step": 366 }, { "epoch": 0.036280058324889404, "grad_norm": 5.630858305543143, "learning_rate": 1.999796525697369e-07, "loss": 0.8966, "step": 367 }, { "epoch": 0.03637891406964387, "grad_norm": 6.040433997532994, "learning_rate": 1.9997900151576818e-07, "loss": 0.7762, "step": 368 }, { "epoch": 0.03647776981439834, "grad_norm": 4.183838129731422, "learning_rate": 1.9997834021042764e-07, "loss": 0.8513, "step": 369 }, { "epoch": 0.03657662555915281, "grad_norm": 9.358697870269305, "learning_rate": 1.9997766865378313e-07, "loss": 0.8595, "step": 370 }, { "epoch": 0.03667548130390727, "grad_norm": 18.3748233760883, "learning_rate": 1.999769868459034e-07, "loss": 0.7567, "step": 371 }, { "epoch": 0.03677433704866174, "grad_norm": 6.182458126295749, "learning_rate": 1.9997629478685848e-07, "loss": 0.8599, "step": 372 }, { "epoch": 0.03687319279341621, "grad_norm": 5.99717192059847, "learning_rate": 1.999755924767193e-07, "loss": 0.7251, "step": 373 }, { "epoch": 0.036972048538170675, "grad_norm": 4.320412630204419, "learning_rate": 1.9997487991555783e-07, "loss": 0.868, "step": 374 }, { "epoch": 0.03707090428292514, "grad_norm": 5.80599906162185, "learning_rate": 1.9997415710344716e-07, "loss": 0.8735, "step": 375 }, { "epoch": 0.03716976002767961, "grad_norm": 9.266422696089464, "learning_rate": 1.999734240404614e-07, "loss": 0.8892, "step": 376 }, { "epoch": 0.03726861577243407, "grad_norm": 4.813529822223293, "learning_rate": 1.9997268072667572e-07, "loss": 0.7663, "step": 377 }, { "epoch": 0.03736747151718854, "grad_norm": 15.687710450264023, "learning_rate": 1.9997192716216635e-07, "loss": 0.8878, "step": 378 }, { "epoch": 0.03746632726194301, "grad_norm": 5.664425049239898, "learning_rate": 1.9997116334701052e-07, "loss": 0.8761, "step": 379 }, { "epoch": 0.03756518300669748, "grad_norm": 4.346012986911804, "learning_rate": 1.9997038928128656e-07, "loss": 0.8236, "step": 380 }, { "epoch": 0.037664038751451946, "grad_norm": 4.095818649520275, "learning_rate": 1.9996960496507387e-07, "loss": 0.6989, "step": 381 }, { "epoch": 0.03776289449620641, "grad_norm": 6.726619669774511, "learning_rate": 1.999688103984528e-07, "loss": 0.8213, "step": 382 }, { "epoch": 0.037861750240960876, "grad_norm": 4.923925961446826, "learning_rate": 1.999680055815049e-07, "loss": 0.8308, "step": 383 }, { "epoch": 0.037960605985715344, "grad_norm": 4.985612472471738, "learning_rate": 1.9996719051431267e-07, "loss": 0.8373, "step": 384 }, { "epoch": 0.03805946173046981, "grad_norm": 8.082388320822929, "learning_rate": 1.9996636519695965e-07, "loss": 0.7695, "step": 385 }, { "epoch": 0.03815831747522428, "grad_norm": 11.255138644968893, "learning_rate": 1.9996552962953052e-07, "loss": 0.8885, "step": 386 }, { "epoch": 0.03825717321997875, "grad_norm": 7.45857009927108, "learning_rate": 1.999646838121109e-07, "loss": 0.8612, "step": 387 }, { "epoch": 0.03835602896473321, "grad_norm": 4.660663038832222, "learning_rate": 1.9996382774478753e-07, "loss": 0.839, "step": 388 }, { "epoch": 0.03845488470948768, "grad_norm": 4.343319707601303, "learning_rate": 1.9996296142764821e-07, "loss": 0.7547, "step": 389 }, { "epoch": 0.03855374045424215, "grad_norm": 4.305769211448222, "learning_rate": 1.9996208486078174e-07, "loss": 0.8076, "step": 390 }, { "epoch": 0.038652596198996615, "grad_norm": 10.864278383580485, "learning_rate": 1.9996119804427802e-07, "loss": 0.7821, "step": 391 }, { "epoch": 0.038751451943751083, "grad_norm": 6.575855235990588, "learning_rate": 1.9996030097822798e-07, "loss": 0.8118, "step": 392 }, { "epoch": 0.038850307688505545, "grad_norm": 4.926962019448554, "learning_rate": 1.9995939366272357e-07, "loss": 0.7367, "step": 393 }, { "epoch": 0.03894916343326001, "grad_norm": 8.776714876815573, "learning_rate": 1.9995847609785788e-07, "loss": 0.7832, "step": 394 }, { "epoch": 0.03904801917801448, "grad_norm": 4.480226018772917, "learning_rate": 1.9995754828372493e-07, "loss": 0.7809, "step": 395 }, { "epoch": 0.03914687492276895, "grad_norm": 4.157006260320855, "learning_rate": 1.9995661022041987e-07, "loss": 0.779, "step": 396 }, { "epoch": 0.03924573066752342, "grad_norm": 3.378999916148288, "learning_rate": 1.9995566190803892e-07, "loss": 0.8089, "step": 397 }, { "epoch": 0.039344586412277886, "grad_norm": 4.710117957876718, "learning_rate": 1.9995470334667928e-07, "loss": 0.7844, "step": 398 }, { "epoch": 0.03944344215703235, "grad_norm": 3.26494476411337, "learning_rate": 1.9995373453643923e-07, "loss": 0.674, "step": 399 }, { "epoch": 0.039542297901786816, "grad_norm": 11.412128052415037, "learning_rate": 1.9995275547741816e-07, "loss": 0.763, "step": 400 }, { "epoch": 0.039641153646541284, "grad_norm": 4.450091174077025, "learning_rate": 1.999517661697164e-07, "loss": 0.8026, "step": 401 }, { "epoch": 0.03974000939129575, "grad_norm": 5.224552408444077, "learning_rate": 1.999507666134354e-07, "loss": 0.8021, "step": 402 }, { "epoch": 0.03983886513605022, "grad_norm": 10.230362602292555, "learning_rate": 1.9994975680867768e-07, "loss": 0.8656, "step": 403 }, { "epoch": 0.03993772088080469, "grad_norm": 15.460096400102703, "learning_rate": 1.999487367555468e-07, "loss": 0.716, "step": 404 }, { "epoch": 0.04003657662555915, "grad_norm": 3.5136394629567844, "learning_rate": 1.9994770645414728e-07, "loss": 0.8477, "step": 405 }, { "epoch": 0.04013543237031362, "grad_norm": 4.192756040070386, "learning_rate": 1.9994666590458477e-07, "loss": 0.721, "step": 406 }, { "epoch": 0.04023428811506809, "grad_norm": 4.2283514119369014, "learning_rate": 1.99945615106966e-07, "loss": 0.7956, "step": 407 }, { "epoch": 0.040333143859822555, "grad_norm": 6.312766155035111, "learning_rate": 1.9994455406139875e-07, "loss": 0.7579, "step": 408 }, { "epoch": 0.040431999604577024, "grad_norm": 5.525523972845212, "learning_rate": 1.9994348276799168e-07, "loss": 0.8388, "step": 409 }, { "epoch": 0.040530855349331485, "grad_norm": 5.507709095515701, "learning_rate": 1.999424012268548e-07, "loss": 0.7933, "step": 410 }, { "epoch": 0.04062971109408595, "grad_norm": 3.7403961158486907, "learning_rate": 1.9994130943809887e-07, "loss": 0.8482, "step": 411 }, { "epoch": 0.04072856683884042, "grad_norm": 5.611186069925248, "learning_rate": 1.9994020740183594e-07, "loss": 0.7101, "step": 412 }, { "epoch": 0.04082742258359489, "grad_norm": 9.28560206635102, "learning_rate": 1.9993909511817896e-07, "loss": 0.7582, "step": 413 }, { "epoch": 0.04092627832834936, "grad_norm": 17.711117570151426, "learning_rate": 1.9993797258724196e-07, "loss": 0.8654, "step": 414 }, { "epoch": 0.041025134073103826, "grad_norm": 5.5302807433352745, "learning_rate": 1.9993683980914008e-07, "loss": 0.7833, "step": 415 }, { "epoch": 0.04112398981785829, "grad_norm": 14.932613924003414, "learning_rate": 1.9993569678398943e-07, "loss": 0.7905, "step": 416 }, { "epoch": 0.041222845562612756, "grad_norm": 18.364873674608894, "learning_rate": 1.9993454351190723e-07, "loss": 0.8167, "step": 417 }, { "epoch": 0.041321701307367224, "grad_norm": 5.599083189015394, "learning_rate": 1.9993337999301176e-07, "loss": 0.9052, "step": 418 }, { "epoch": 0.04142055705212169, "grad_norm": 4.855817914066184, "learning_rate": 1.9993220622742226e-07, "loss": 0.781, "step": 419 }, { "epoch": 0.04151941279687616, "grad_norm": 4.396621048217355, "learning_rate": 1.9993102221525912e-07, "loss": 0.8649, "step": 420 }, { "epoch": 0.04161826854163062, "grad_norm": 4.303637387538735, "learning_rate": 1.9992982795664375e-07, "loss": 0.7832, "step": 421 }, { "epoch": 0.04171712428638509, "grad_norm": 13.425544054353267, "learning_rate": 1.9992862345169858e-07, "loss": 0.7953, "step": 422 }, { "epoch": 0.04181598003113956, "grad_norm": 9.003501221875005, "learning_rate": 1.9992740870054713e-07, "loss": 0.7927, "step": 423 }, { "epoch": 0.04191483577589403, "grad_norm": 15.221501817381794, "learning_rate": 1.9992618370331397e-07, "loss": 0.7623, "step": 424 }, { "epoch": 0.042013691520648495, "grad_norm": 6.174005704483763, "learning_rate": 1.9992494846012466e-07, "loss": 0.7659, "step": 425 }, { "epoch": 0.042112547265402964, "grad_norm": 32.25792275726504, "learning_rate": 1.999237029711059e-07, "loss": 0.7666, "step": 426 }, { "epoch": 0.042211403010157425, "grad_norm": 7.070717533882822, "learning_rate": 1.9992244723638536e-07, "loss": 0.8148, "step": 427 }, { "epoch": 0.04231025875491189, "grad_norm": 4.003858667861757, "learning_rate": 1.9992118125609184e-07, "loss": 0.8473, "step": 428 }, { "epoch": 0.04240911449966636, "grad_norm": 8.259202006210145, "learning_rate": 1.999199050303551e-07, "loss": 0.7927, "step": 429 }, { "epoch": 0.04250797024442083, "grad_norm": 3.8126595523371356, "learning_rate": 1.9991861855930604e-07, "loss": 0.8087, "step": 430 }, { "epoch": 0.0426068259891753, "grad_norm": 3.766792519005445, "learning_rate": 1.9991732184307654e-07, "loss": 0.833, "step": 431 }, { "epoch": 0.04270568173392976, "grad_norm": 9.38053135747968, "learning_rate": 1.9991601488179957e-07, "loss": 0.8239, "step": 432 }, { "epoch": 0.04280453747868423, "grad_norm": 3.357235436597659, "learning_rate": 1.9991469767560914e-07, "loss": 0.8866, "step": 433 }, { "epoch": 0.042903393223438696, "grad_norm": 5.944776713804741, "learning_rate": 1.9991337022464033e-07, "loss": 0.7594, "step": 434 }, { "epoch": 0.043002248968193164, "grad_norm": 3.9399995967574872, "learning_rate": 1.999120325290292e-07, "loss": 0.8278, "step": 435 }, { "epoch": 0.04310110471294763, "grad_norm": 3.7662243910024853, "learning_rate": 1.999106845889129e-07, "loss": 0.7511, "step": 436 }, { "epoch": 0.0431999604577021, "grad_norm": 5.9009578029556495, "learning_rate": 1.9990932640442976e-07, "loss": 0.7971, "step": 437 }, { "epoch": 0.04329881620245656, "grad_norm": 7.317625052744907, "learning_rate": 1.9990795797571893e-07, "loss": 0.7689, "step": 438 }, { "epoch": 0.04339767194721103, "grad_norm": 8.387711716615467, "learning_rate": 1.9990657930292077e-07, "loss": 0.718, "step": 439 }, { "epoch": 0.0434965276919655, "grad_norm": 4.055861802351913, "learning_rate": 1.999051903861766e-07, "loss": 0.8252, "step": 440 }, { "epoch": 0.04359538343671997, "grad_norm": 3.4134345369414887, "learning_rate": 1.9990379122562888e-07, "loss": 0.8018, "step": 441 }, { "epoch": 0.043694239181474435, "grad_norm": 5.1746524425174645, "learning_rate": 1.9990238182142107e-07, "loss": 0.7318, "step": 442 }, { "epoch": 0.043793094926228904, "grad_norm": 5.063514977037955, "learning_rate": 1.9990096217369766e-07, "loss": 0.9258, "step": 443 }, { "epoch": 0.043891950670983365, "grad_norm": 4.542862596562116, "learning_rate": 1.9989953228260418e-07, "loss": 0.7844, "step": 444 }, { "epoch": 0.04399080641573783, "grad_norm": 8.966816768548158, "learning_rate": 1.9989809214828736e-07, "loss": 0.7806, "step": 445 }, { "epoch": 0.0440896621604923, "grad_norm": 3.906974327784337, "learning_rate": 1.9989664177089472e-07, "loss": 0.8423, "step": 446 }, { "epoch": 0.04418851790524677, "grad_norm": 7.853685824907073, "learning_rate": 1.998951811505751e-07, "loss": 0.8135, "step": 447 }, { "epoch": 0.04428737365000124, "grad_norm": 5.384518855964276, "learning_rate": 1.9989371028747817e-07, "loss": 0.7961, "step": 448 }, { "epoch": 0.0443862293947557, "grad_norm": 6.336490624882754, "learning_rate": 1.9989222918175478e-07, "loss": 0.7797, "step": 449 }, { "epoch": 0.04448508513951017, "grad_norm": 5.738542241917747, "learning_rate": 1.9989073783355684e-07, "loss": 0.8043, "step": 450 }, { "epoch": 0.044583940884264636, "grad_norm": 12.707916032636668, "learning_rate": 1.9988923624303718e-07, "loss": 0.8174, "step": 451 }, { "epoch": 0.044682796629019104, "grad_norm": 5.321506103710058, "learning_rate": 1.9988772441034988e-07, "loss": 0.9339, "step": 452 }, { "epoch": 0.04478165237377357, "grad_norm": 5.861776095011742, "learning_rate": 1.9988620233564984e-07, "loss": 0.8142, "step": 453 }, { "epoch": 0.04488050811852804, "grad_norm": 4.721179738793206, "learning_rate": 1.998846700190932e-07, "loss": 0.745, "step": 454 }, { "epoch": 0.0449793638632825, "grad_norm": 4.2427011803511085, "learning_rate": 1.99883127460837e-07, "loss": 0.788, "step": 455 }, { "epoch": 0.04507821960803697, "grad_norm": 10.458101930384597, "learning_rate": 1.9988157466103953e-07, "loss": 0.8246, "step": 456 }, { "epoch": 0.04517707535279144, "grad_norm": 3.748182656422571, "learning_rate": 1.998800116198599e-07, "loss": 0.8637, "step": 457 }, { "epoch": 0.04527593109754591, "grad_norm": 6.658952763185419, "learning_rate": 1.9987843833745841e-07, "loss": 0.7919, "step": 458 }, { "epoch": 0.045374786842300376, "grad_norm": 6.900471611676066, "learning_rate": 1.9987685481399642e-07, "loss": 0.9194, "step": 459 }, { "epoch": 0.04547364258705484, "grad_norm": 4.901462337331058, "learning_rate": 1.9987526104963619e-07, "loss": 0.7935, "step": 460 }, { "epoch": 0.045572498331809305, "grad_norm": 15.655914151588954, "learning_rate": 1.9987365704454123e-07, "loss": 0.7235, "step": 461 }, { "epoch": 0.045671354076563773, "grad_norm": 4.131477921193643, "learning_rate": 1.99872042798876e-07, "loss": 0.6886, "step": 462 }, { "epoch": 0.04577020982131824, "grad_norm": 4.027212923721047, "learning_rate": 1.9987041831280597e-07, "loss": 0.7966, "step": 463 }, { "epoch": 0.04586906556607271, "grad_norm": 4.711726780055351, "learning_rate": 1.9986878358649774e-07, "loss": 0.8675, "step": 464 }, { "epoch": 0.04596792131082718, "grad_norm": 4.730429086413092, "learning_rate": 1.9986713862011896e-07, "loss": 0.776, "step": 465 }, { "epoch": 0.04606677705558164, "grad_norm": 4.0128621231391355, "learning_rate": 1.998654834138382e-07, "loss": 0.777, "step": 466 }, { "epoch": 0.04616563280033611, "grad_norm": 9.381545453598022, "learning_rate": 1.9986381796782527e-07, "loss": 0.7446, "step": 467 }, { "epoch": 0.046264488545090576, "grad_norm": 4.597355506639818, "learning_rate": 1.998621422822509e-07, "loss": 0.809, "step": 468 }, { "epoch": 0.046363344289845045, "grad_norm": 3.754386598963816, "learning_rate": 1.998604563572869e-07, "loss": 0.7365, "step": 469 }, { "epoch": 0.04646220003459951, "grad_norm": 5.058433019133744, "learning_rate": 1.9985876019310614e-07, "loss": 0.6917, "step": 470 }, { "epoch": 0.04656105577935398, "grad_norm": 3.919436101021462, "learning_rate": 1.9985705378988257e-07, "loss": 0.7461, "step": 471 }, { "epoch": 0.04665991152410844, "grad_norm": 4.434462610030953, "learning_rate": 1.998553371477911e-07, "loss": 0.7533, "step": 472 }, { "epoch": 0.04675876726886291, "grad_norm": 6.6046225316529075, "learning_rate": 1.9985361026700779e-07, "loss": 0.83, "step": 473 }, { "epoch": 0.04685762301361738, "grad_norm": 6.388539912069082, "learning_rate": 1.9985187314770968e-07, "loss": 0.8052, "step": 474 }, { "epoch": 0.04695647875837185, "grad_norm": 11.41393496393874, "learning_rate": 1.9985012579007491e-07, "loss": 0.7331, "step": 475 }, { "epoch": 0.047055334503126316, "grad_norm": 8.626758156377393, "learning_rate": 1.9984836819428264e-07, "loss": 0.6719, "step": 476 }, { "epoch": 0.04715419024788078, "grad_norm": 4.065204659167926, "learning_rate": 1.9984660036051306e-07, "loss": 0.7464, "step": 477 }, { "epoch": 0.047253045992635245, "grad_norm": 5.510067861728492, "learning_rate": 1.9984482228894747e-07, "loss": 0.8867, "step": 478 }, { "epoch": 0.047351901737389714, "grad_norm": 10.175246884586214, "learning_rate": 1.9984303397976816e-07, "loss": 0.8034, "step": 479 }, { "epoch": 0.04745075748214418, "grad_norm": 4.269998288611957, "learning_rate": 1.998412354331585e-07, "loss": 0.7339, "step": 480 }, { "epoch": 0.04754961322689865, "grad_norm": 5.267560544804125, "learning_rate": 1.998394266493029e-07, "loss": 0.7914, "step": 481 }, { "epoch": 0.04764846897165312, "grad_norm": 8.594420200713202, "learning_rate": 1.9983760762838687e-07, "loss": 0.7174, "step": 482 }, { "epoch": 0.04774732471640758, "grad_norm": 3.696091337560013, "learning_rate": 1.9983577837059685e-07, "loss": 0.7635, "step": 483 }, { "epoch": 0.04784618046116205, "grad_norm": 3.703476016689107, "learning_rate": 1.9983393887612046e-07, "loss": 0.817, "step": 484 }, { "epoch": 0.047945036205916516, "grad_norm": 7.503945564129945, "learning_rate": 1.9983208914514629e-07, "loss": 0.7699, "step": 485 }, { "epoch": 0.048043891950670985, "grad_norm": 3.698256953092467, "learning_rate": 1.99830229177864e-07, "loss": 0.7252, "step": 486 }, { "epoch": 0.04814274769542545, "grad_norm": 5.640795718727372, "learning_rate": 1.9982835897446427e-07, "loss": 0.7718, "step": 487 }, { "epoch": 0.048241603440179914, "grad_norm": 4.966435240807712, "learning_rate": 1.9982647853513894e-07, "loss": 0.8087, "step": 488 }, { "epoch": 0.04834045918493438, "grad_norm": 3.164752233697765, "learning_rate": 1.998245878600808e-07, "loss": 0.8634, "step": 489 }, { "epoch": 0.04843931492968885, "grad_norm": 5.0940183942065955, "learning_rate": 1.9982268694948365e-07, "loss": 0.7244, "step": 490 }, { "epoch": 0.04853817067444332, "grad_norm": 4.876365362960067, "learning_rate": 1.9982077580354247e-07, "loss": 0.7371, "step": 491 }, { "epoch": 0.04863702641919779, "grad_norm": 22.80252668595278, "learning_rate": 1.9981885442245315e-07, "loss": 0.7156, "step": 492 }, { "epoch": 0.048735882163952256, "grad_norm": 6.76950883906046, "learning_rate": 1.9981692280641277e-07, "loss": 0.8417, "step": 493 }, { "epoch": 0.04883473790870672, "grad_norm": 4.159066054549585, "learning_rate": 1.9981498095561937e-07, "loss": 0.849, "step": 494 }, { "epoch": 0.048933593653461185, "grad_norm": 3.9077094431015045, "learning_rate": 1.9981302887027205e-07, "loss": 0.7607, "step": 495 }, { "epoch": 0.049032449398215654, "grad_norm": 3.7492417530136146, "learning_rate": 1.9981106655057096e-07, "loss": 0.7815, "step": 496 }, { "epoch": 0.04913130514297012, "grad_norm": 7.488461103359557, "learning_rate": 1.9980909399671728e-07, "loss": 0.7452, "step": 497 }, { "epoch": 0.04923016088772459, "grad_norm": 9.058752920828953, "learning_rate": 1.9980711120891333e-07, "loss": 0.7781, "step": 498 }, { "epoch": 0.04932901663247905, "grad_norm": 6.51558292977401, "learning_rate": 1.9980511818736238e-07, "loss": 0.7415, "step": 499 }, { "epoch": 0.04942787237723352, "grad_norm": 3.9958327707302996, "learning_rate": 1.9980311493226878e-07, "loss": 0.9181, "step": 500 }, { "epoch": 0.04952672812198799, "grad_norm": 4.878511089224303, "learning_rate": 1.9980110144383796e-07, "loss": 0.8493, "step": 501 }, { "epoch": 0.049625583866742456, "grad_norm": 12.433887239823019, "learning_rate": 1.9979907772227634e-07, "loss": 0.7992, "step": 502 }, { "epoch": 0.049724439611496925, "grad_norm": 5.584226186237488, "learning_rate": 1.9979704376779142e-07, "loss": 0.7808, "step": 503 }, { "epoch": 0.04982329535625139, "grad_norm": 12.399411454924568, "learning_rate": 1.9979499958059182e-07, "loss": 0.8029, "step": 504 }, { "epoch": 0.049922151101005854, "grad_norm": 3.257168379805648, "learning_rate": 1.9979294516088706e-07, "loss": 0.744, "step": 505 }, { "epoch": 0.05002100684576032, "grad_norm": 4.134701304991991, "learning_rate": 1.9979088050888785e-07, "loss": 0.7814, "step": 506 }, { "epoch": 0.05011986259051479, "grad_norm": 11.931233682131015, "learning_rate": 1.9978880562480583e-07, "loss": 0.8345, "step": 507 }, { "epoch": 0.05021871833526926, "grad_norm": 8.86114964567862, "learning_rate": 1.9978672050885378e-07, "loss": 0.8853, "step": 508 }, { "epoch": 0.05031757408002373, "grad_norm": 9.954132493610551, "learning_rate": 1.9978462516124553e-07, "loss": 0.8393, "step": 509 }, { "epoch": 0.050416429824778196, "grad_norm": 5.291354925913319, "learning_rate": 1.9978251958219588e-07, "loss": 0.8716, "step": 510 }, { "epoch": 0.05051528556953266, "grad_norm": 6.747534177003873, "learning_rate": 1.9978040377192075e-07, "loss": 0.7181, "step": 511 }, { "epoch": 0.050614141314287125, "grad_norm": 8.322192074892827, "learning_rate": 1.9977827773063707e-07, "loss": 0.6897, "step": 512 }, { "epoch": 0.050712997059041594, "grad_norm": 4.2905864401934855, "learning_rate": 1.9977614145856284e-07, "loss": 0.7452, "step": 513 }, { "epoch": 0.05081185280379606, "grad_norm": 6.262494662534928, "learning_rate": 1.9977399495591713e-07, "loss": 0.8059, "step": 514 }, { "epoch": 0.05091070854855053, "grad_norm": 3.9009630828697293, "learning_rate": 1.9977183822291998e-07, "loss": 0.7179, "step": 515 }, { "epoch": 0.05100956429330499, "grad_norm": 2.9077842959893707, "learning_rate": 1.997696712597926e-07, "loss": 0.789, "step": 516 }, { "epoch": 0.05110842003805946, "grad_norm": 5.719037759261635, "learning_rate": 1.9976749406675709e-07, "loss": 0.8164, "step": 517 }, { "epoch": 0.05120727578281393, "grad_norm": 4.091389196901534, "learning_rate": 1.9976530664403675e-07, "loss": 0.7506, "step": 518 }, { "epoch": 0.0513061315275684, "grad_norm": 5.022957864443125, "learning_rate": 1.9976310899185584e-07, "loss": 0.8671, "step": 519 }, { "epoch": 0.051404987272322865, "grad_norm": 8.798367885087051, "learning_rate": 1.9976090111043973e-07, "loss": 0.7709, "step": 520 }, { "epoch": 0.05150384301707733, "grad_norm": 5.936090413495646, "learning_rate": 1.997586830000148e-07, "loss": 0.7173, "step": 521 }, { "epoch": 0.051602698761831794, "grad_norm": 6.21000128848077, "learning_rate": 1.9975645466080845e-07, "loss": 0.8219, "step": 522 }, { "epoch": 0.05170155450658626, "grad_norm": 9.538415774632162, "learning_rate": 1.997542160930492e-07, "loss": 0.8722, "step": 523 }, { "epoch": 0.05180041025134073, "grad_norm": 4.463319927114203, "learning_rate": 1.9975196729696656e-07, "loss": 0.7206, "step": 524 }, { "epoch": 0.0518992659960952, "grad_norm": 8.67454765119347, "learning_rate": 1.9974970827279111e-07, "loss": 0.8521, "step": 525 }, { "epoch": 0.05199812174084967, "grad_norm": 8.377243385090626, "learning_rate": 1.9974743902075453e-07, "loss": 0.8201, "step": 526 }, { "epoch": 0.05209697748560413, "grad_norm": 6.246276768881138, "learning_rate": 1.9974515954108943e-07, "loss": 0.6447, "step": 527 }, { "epoch": 0.0521958332303586, "grad_norm": 5.362805647620259, "learning_rate": 1.9974286983402955e-07, "loss": 0.7369, "step": 528 }, { "epoch": 0.052294688975113066, "grad_norm": 4.4514513739921115, "learning_rate": 1.9974056989980968e-07, "loss": 0.8543, "step": 529 }, { "epoch": 0.052393544719867534, "grad_norm": 4.770550658376816, "learning_rate": 1.9973825973866564e-07, "loss": 0.752, "step": 530 }, { "epoch": 0.052492400464622, "grad_norm": 8.835892173275344, "learning_rate": 1.9973593935083435e-07, "loss": 0.883, "step": 531 }, { "epoch": 0.05259125620937647, "grad_norm": 4.08375576083483, "learning_rate": 1.9973360873655366e-07, "loss": 0.7774, "step": 532 }, { "epoch": 0.05269011195413093, "grad_norm": 9.977815375993414, "learning_rate": 1.9973126789606254e-07, "loss": 0.7743, "step": 533 }, { "epoch": 0.0527889676988854, "grad_norm": 5.598171039920406, "learning_rate": 1.9972891682960108e-07, "loss": 0.8274, "step": 534 }, { "epoch": 0.05288782344363987, "grad_norm": 4.28862547125591, "learning_rate": 1.997265555374103e-07, "loss": 0.9384, "step": 535 }, { "epoch": 0.05298667918839434, "grad_norm": 5.834062887410753, "learning_rate": 1.997241840197323e-07, "loss": 0.752, "step": 536 }, { "epoch": 0.053085534933148805, "grad_norm": 3.9039154556378004, "learning_rate": 1.9972180227681025e-07, "loss": 0.7705, "step": 537 }, { "epoch": 0.053184390677903266, "grad_norm": 5.935253694093829, "learning_rate": 1.9971941030888843e-07, "loss": 0.8063, "step": 538 }, { "epoch": 0.053283246422657735, "grad_norm": 3.2227217592716646, "learning_rate": 1.99717008116212e-07, "loss": 0.8059, "step": 539 }, { "epoch": 0.0533821021674122, "grad_norm": 5.450449662583103, "learning_rate": 1.9971459569902733e-07, "loss": 0.7929, "step": 540 }, { "epoch": 0.05348095791216667, "grad_norm": 3.7246751094365202, "learning_rate": 1.9971217305758178e-07, "loss": 0.7557, "step": 541 }, { "epoch": 0.05357981365692114, "grad_norm": 6.453846399486815, "learning_rate": 1.997097401921237e-07, "loss": 0.8986, "step": 542 }, { "epoch": 0.05367866940167561, "grad_norm": 5.804207991789014, "learning_rate": 1.9970729710290264e-07, "loss": 0.7101, "step": 543 }, { "epoch": 0.05377752514643007, "grad_norm": 5.2838025283155385, "learning_rate": 1.9970484379016902e-07, "loss": 0.8211, "step": 544 }, { "epoch": 0.05387638089118454, "grad_norm": 3.9761968932275904, "learning_rate": 1.9970238025417442e-07, "loss": 0.7843, "step": 545 }, { "epoch": 0.053975236635939006, "grad_norm": 5.311875424897549, "learning_rate": 1.9969990649517144e-07, "loss": 0.7806, "step": 546 }, { "epoch": 0.054074092380693474, "grad_norm": 4.75912330094027, "learning_rate": 1.9969742251341373e-07, "loss": 0.7949, "step": 547 }, { "epoch": 0.05417294812544794, "grad_norm": 4.72231071871465, "learning_rate": 1.9969492830915598e-07, "loss": 0.7815, "step": 548 }, { "epoch": 0.05427180387020241, "grad_norm": 8.997717916815803, "learning_rate": 1.9969242388265392e-07, "loss": 0.8302, "step": 549 }, { "epoch": 0.05437065961495687, "grad_norm": 9.20612542983796, "learning_rate": 1.996899092341644e-07, "loss": 0.7585, "step": 550 }, { "epoch": 0.05446951535971134, "grad_norm": 6.463546073650438, "learning_rate": 1.996873843639452e-07, "loss": 0.8169, "step": 551 }, { "epoch": 0.05456837110446581, "grad_norm": 4.9679609855771725, "learning_rate": 1.996848492722552e-07, "loss": 0.8339, "step": 552 }, { "epoch": 0.05466722684922028, "grad_norm": 11.819676562227507, "learning_rate": 1.9968230395935437e-07, "loss": 0.7215, "step": 553 }, { "epoch": 0.054766082593974745, "grad_norm": 4.710563877393096, "learning_rate": 1.996797484255037e-07, "loss": 0.7768, "step": 554 }, { "epoch": 0.054864938338729206, "grad_norm": 3.627944342853949, "learning_rate": 1.996771826709652e-07, "loss": 0.7502, "step": 555 }, { "epoch": 0.054963794083483675, "grad_norm": 5.129998740293697, "learning_rate": 1.9967460669600196e-07, "loss": 0.8111, "step": 556 }, { "epoch": 0.05506264982823814, "grad_norm": 8.082798011288501, "learning_rate": 1.9967202050087812e-07, "loss": 0.793, "step": 557 }, { "epoch": 0.05516150557299261, "grad_norm": 5.60444285644719, "learning_rate": 1.9966942408585882e-07, "loss": 0.7875, "step": 558 }, { "epoch": 0.05526036131774708, "grad_norm": 4.67849293655465, "learning_rate": 1.9966681745121033e-07, "loss": 0.844, "step": 559 }, { "epoch": 0.05535921706250155, "grad_norm": 3.6781896715539504, "learning_rate": 1.9966420059719988e-07, "loss": 0.7517, "step": 560 }, { "epoch": 0.05545807280725601, "grad_norm": 4.06316282473506, "learning_rate": 1.9966157352409583e-07, "loss": 0.7607, "step": 561 }, { "epoch": 0.05555692855201048, "grad_norm": 4.967578424876608, "learning_rate": 1.996589362321675e-07, "loss": 0.7942, "step": 562 }, { "epoch": 0.055655784296764946, "grad_norm": 3.6258564455888327, "learning_rate": 1.9965628872168537e-07, "loss": 0.7543, "step": 563 }, { "epoch": 0.055754640041519414, "grad_norm": 3.849656379576122, "learning_rate": 1.9965363099292082e-07, "loss": 0.7587, "step": 564 }, { "epoch": 0.05585349578627388, "grad_norm": 3.6637604691149943, "learning_rate": 1.9965096304614642e-07, "loss": 0.6645, "step": 565 }, { "epoch": 0.055952351531028344, "grad_norm": 5.360051375440951, "learning_rate": 1.9964828488163574e-07, "loss": 0.7071, "step": 566 }, { "epoch": 0.05605120727578281, "grad_norm": 3.345687429320808, "learning_rate": 1.9964559649966334e-07, "loss": 0.8106, "step": 567 }, { "epoch": 0.05615006302053728, "grad_norm": 28.340580580250098, "learning_rate": 1.996428979005049e-07, "loss": 0.8606, "step": 568 }, { "epoch": 0.05624891876529175, "grad_norm": 8.118026925873485, "learning_rate": 1.9964018908443712e-07, "loss": 0.8248, "step": 569 }, { "epoch": 0.05634777451004622, "grad_norm": 8.775371762225875, "learning_rate": 1.9963747005173774e-07, "loss": 0.7204, "step": 570 }, { "epoch": 0.056446630254800685, "grad_norm": 4.654273518498881, "learning_rate": 1.9963474080268557e-07, "loss": 0.7796, "step": 571 }, { "epoch": 0.056545485999555146, "grad_norm": 4.629830796688502, "learning_rate": 1.9963200133756044e-07, "loss": 0.8423, "step": 572 }, { "epoch": 0.056644341744309615, "grad_norm": 7.188741104822034, "learning_rate": 1.9962925165664324e-07, "loss": 0.782, "step": 573 }, { "epoch": 0.05674319748906408, "grad_norm": 4.935874132525405, "learning_rate": 1.9962649176021593e-07, "loss": 0.7979, "step": 574 }, { "epoch": 0.05684205323381855, "grad_norm": 4.183425875542571, "learning_rate": 1.996237216485615e-07, "loss": 0.6732, "step": 575 }, { "epoch": 0.05694090897857302, "grad_norm": 4.163474122412592, "learning_rate": 1.9962094132196393e-07, "loss": 0.7941, "step": 576 }, { "epoch": 0.05703976472332748, "grad_norm": 3.627186731015763, "learning_rate": 1.9961815078070836e-07, "loss": 0.8357, "step": 577 }, { "epoch": 0.05713862046808195, "grad_norm": 9.303576093524484, "learning_rate": 1.996153500250809e-07, "loss": 0.79, "step": 578 }, { "epoch": 0.05723747621283642, "grad_norm": 8.328395885993935, "learning_rate": 1.9961253905536871e-07, "loss": 0.7688, "step": 579 }, { "epoch": 0.057336331957590886, "grad_norm": 5.186268338753882, "learning_rate": 1.9960971787186004e-07, "loss": 0.7931, "step": 580 }, { "epoch": 0.057435187702345354, "grad_norm": 3.8716703742571745, "learning_rate": 1.9960688647484413e-07, "loss": 0.7904, "step": 581 }, { "epoch": 0.05753404344709982, "grad_norm": 4.44536045499621, "learning_rate": 1.9960404486461136e-07, "loss": 0.718, "step": 582 }, { "epoch": 0.057632899191854284, "grad_norm": 8.810447328417162, "learning_rate": 1.9960119304145305e-07, "loss": 0.8017, "step": 583 }, { "epoch": 0.05773175493660875, "grad_norm": 6.3291666385324845, "learning_rate": 1.9959833100566156e-07, "loss": 0.7878, "step": 584 }, { "epoch": 0.05783061068136322, "grad_norm": 4.2180688597399865, "learning_rate": 1.9959545875753045e-07, "loss": 0.7297, "step": 585 }, { "epoch": 0.05792946642611769, "grad_norm": 14.23868236573787, "learning_rate": 1.9959257629735418e-07, "loss": 0.8153, "step": 586 }, { "epoch": 0.05802832217087216, "grad_norm": 6.302976070727713, "learning_rate": 1.9958968362542826e-07, "loss": 0.789, "step": 587 }, { "epoch": 0.058127177915626625, "grad_norm": 4.210520190750532, "learning_rate": 1.995867807420494e-07, "loss": 0.8542, "step": 588 }, { "epoch": 0.05822603366038109, "grad_norm": 3.1024841417657267, "learning_rate": 1.9958386764751515e-07, "loss": 0.6535, "step": 589 }, { "epoch": 0.058324889405135555, "grad_norm": 3.158443208099288, "learning_rate": 1.9958094434212423e-07, "loss": 0.7538, "step": 590 }, { "epoch": 0.05842374514989002, "grad_norm": 4.356934053596402, "learning_rate": 1.9957801082617637e-07, "loss": 0.8416, "step": 591 }, { "epoch": 0.05852260089464449, "grad_norm": 7.50093203784475, "learning_rate": 1.9957506709997243e-07, "loss": 0.8185, "step": 592 }, { "epoch": 0.05862145663939896, "grad_norm": 4.175917853831336, "learning_rate": 1.9957211316381418e-07, "loss": 0.8628, "step": 593 }, { "epoch": 0.05872031238415342, "grad_norm": 4.457940489279119, "learning_rate": 1.995691490180045e-07, "loss": 0.7124, "step": 594 }, { "epoch": 0.05881916812890789, "grad_norm": 14.728033479268223, "learning_rate": 1.9956617466284732e-07, "loss": 0.689, "step": 595 }, { "epoch": 0.05891802387366236, "grad_norm": 11.856245814360035, "learning_rate": 1.9956319009864768e-07, "loss": 0.713, "step": 596 }, { "epoch": 0.059016879618416826, "grad_norm": 5.017404840825539, "learning_rate": 1.9956019532571153e-07, "loss": 0.7474, "step": 597 }, { "epoch": 0.059115735363171294, "grad_norm": 13.030117726828792, "learning_rate": 1.9955719034434596e-07, "loss": 0.7832, "step": 598 }, { "epoch": 0.05921459110792576, "grad_norm": 6.4253291347651444, "learning_rate": 1.995541751548591e-07, "loss": 0.8043, "step": 599 }, { "epoch": 0.059313446852680224, "grad_norm": 5.10778417019394, "learning_rate": 1.995511497575601e-07, "loss": 0.7389, "step": 600 }, { "epoch": 0.05941230259743469, "grad_norm": 5.526231585134557, "learning_rate": 1.9954811415275921e-07, "loss": 0.8539, "step": 601 }, { "epoch": 0.05951115834218916, "grad_norm": 21.8768788559293, "learning_rate": 1.9954506834076761e-07, "loss": 0.8014, "step": 602 }, { "epoch": 0.05961001408694363, "grad_norm": 4.010757730135512, "learning_rate": 1.9954201232189766e-07, "loss": 0.8794, "step": 603 }, { "epoch": 0.0597088698316981, "grad_norm": 3.484538389165416, "learning_rate": 1.9953894609646272e-07, "loss": 0.7805, "step": 604 }, { "epoch": 0.05980772557645256, "grad_norm": 4.440352691583752, "learning_rate": 1.995358696647771e-07, "loss": 0.8571, "step": 605 }, { "epoch": 0.05990658132120703, "grad_norm": 7.032420462493198, "learning_rate": 1.9953278302715635e-07, "loss": 0.7065, "step": 606 }, { "epoch": 0.060005437065961495, "grad_norm": 3.812710806972093, "learning_rate": 1.995296861839169e-07, "loss": 0.7068, "step": 607 }, { "epoch": 0.06010429281071596, "grad_norm": 6.1595051747188, "learning_rate": 1.9952657913537632e-07, "loss": 0.7021, "step": 608 }, { "epoch": 0.06020314855547043, "grad_norm": 5.022461644219375, "learning_rate": 1.9952346188185313e-07, "loss": 0.7742, "step": 609 }, { "epoch": 0.0603020043002249, "grad_norm": 14.668415177089749, "learning_rate": 1.9952033442366703e-07, "loss": 0.6975, "step": 610 }, { "epoch": 0.06040086004497936, "grad_norm": 5.933759916083074, "learning_rate": 1.9951719676113864e-07, "loss": 0.7489, "step": 611 }, { "epoch": 0.06049971578973383, "grad_norm": 3.1866079560632445, "learning_rate": 1.995140488945897e-07, "loss": 0.6679, "step": 612 }, { "epoch": 0.0605985715344883, "grad_norm": 4.280786151924749, "learning_rate": 1.9951089082434302e-07, "loss": 0.9005, "step": 613 }, { "epoch": 0.060697427279242766, "grad_norm": 5.50939212040624, "learning_rate": 1.9950772255072233e-07, "loss": 0.8538, "step": 614 }, { "epoch": 0.060796283023997234, "grad_norm": 15.066566346118774, "learning_rate": 1.9950454407405254e-07, "loss": 0.8126, "step": 615 }, { "epoch": 0.060895138768751696, "grad_norm": 3.4439412127846047, "learning_rate": 1.9950135539465957e-07, "loss": 0.7629, "step": 616 }, { "epoch": 0.060993994513506164, "grad_norm": 5.825167031224391, "learning_rate": 1.9949815651287034e-07, "loss": 0.7718, "step": 617 }, { "epoch": 0.06109285025826063, "grad_norm": 3.90172213318952, "learning_rate": 1.994949474290129e-07, "loss": 0.7553, "step": 618 }, { "epoch": 0.0611917060030151, "grad_norm": 3.514512547153918, "learning_rate": 1.9949172814341622e-07, "loss": 0.7976, "step": 619 }, { "epoch": 0.06129056174776957, "grad_norm": 3.970479559516452, "learning_rate": 1.994884986564104e-07, "loss": 0.6856, "step": 620 }, { "epoch": 0.06138941749252404, "grad_norm": 70.4010028852345, "learning_rate": 1.9948525896832663e-07, "loss": 0.7019, "step": 621 }, { "epoch": 0.0614882732372785, "grad_norm": 12.414491369358108, "learning_rate": 1.9948200907949708e-07, "loss": 0.7817, "step": 622 }, { "epoch": 0.06158712898203297, "grad_norm": 5.184155447406912, "learning_rate": 1.9947874899025494e-07, "loss": 0.6618, "step": 623 }, { "epoch": 0.061685984726787435, "grad_norm": 4.6088433248204845, "learning_rate": 1.994754787009345e-07, "loss": 0.7924, "step": 624 }, { "epoch": 0.0617848404715419, "grad_norm": 3.9693478561502684, "learning_rate": 1.9947219821187108e-07, "loss": 0.7984, "step": 625 }, { "epoch": 0.06188369621629637, "grad_norm": 32.2803123742957, "learning_rate": 1.994689075234011e-07, "loss": 0.8223, "step": 626 }, { "epoch": 0.06198255196105084, "grad_norm": 3.1820784083814893, "learning_rate": 1.9946560663586188e-07, "loss": 0.7401, "step": 627 }, { "epoch": 0.0620814077058053, "grad_norm": 3.5986410593181137, "learning_rate": 1.9946229554959193e-07, "loss": 0.7495, "step": 628 }, { "epoch": 0.06218026345055977, "grad_norm": 4.687726427257642, "learning_rate": 1.9945897426493075e-07, "loss": 0.8244, "step": 629 }, { "epoch": 0.06227911919531424, "grad_norm": 3.0907456771802253, "learning_rate": 1.9945564278221887e-07, "loss": 0.7553, "step": 630 }, { "epoch": 0.062377974940068706, "grad_norm": 3.7255512052420983, "learning_rate": 1.994523011017979e-07, "loss": 0.7302, "step": 631 }, { "epoch": 0.062476830684823174, "grad_norm": 5.272383098590543, "learning_rate": 1.9944894922401046e-07, "loss": 0.7688, "step": 632 }, { "epoch": 0.06257568642957764, "grad_norm": 5.75235275058399, "learning_rate": 1.994455871492003e-07, "loss": 0.7722, "step": 633 }, { "epoch": 0.0626745421743321, "grad_norm": 4.55774155520791, "learning_rate": 1.9944221487771209e-07, "loss": 0.7168, "step": 634 }, { "epoch": 0.06277339791908658, "grad_norm": 3.4794790875591657, "learning_rate": 1.994388324098916e-07, "loss": 0.771, "step": 635 }, { "epoch": 0.06287225366384104, "grad_norm": 6.291936548838725, "learning_rate": 1.9943543974608564e-07, "loss": 0.7667, "step": 636 }, { "epoch": 0.0629711094085955, "grad_norm": 7.771698903118231, "learning_rate": 1.9943203688664217e-07, "loss": 0.6848, "step": 637 }, { "epoch": 0.06306996515334998, "grad_norm": 27.328515965799834, "learning_rate": 1.9942862383191e-07, "loss": 0.7659, "step": 638 }, { "epoch": 0.06316882089810444, "grad_norm": 4.1937092196964025, "learning_rate": 1.9942520058223918e-07, "loss": 0.7268, "step": 639 }, { "epoch": 0.06326767664285891, "grad_norm": 11.373640171137183, "learning_rate": 1.9942176713798063e-07, "loss": 0.7306, "step": 640 }, { "epoch": 0.06336653238761338, "grad_norm": 5.345114298656406, "learning_rate": 1.9941832349948643e-07, "loss": 0.6483, "step": 641 }, { "epoch": 0.06346538813236784, "grad_norm": 3.8407595679524307, "learning_rate": 1.9941486966710972e-07, "loss": 0.8045, "step": 642 }, { "epoch": 0.06356424387712231, "grad_norm": 4.30973568003605, "learning_rate": 1.9941140564120457e-07, "loss": 0.7329, "step": 643 }, { "epoch": 0.06366309962187677, "grad_norm": 3.66491375611255, "learning_rate": 1.994079314221262e-07, "loss": 0.7248, "step": 644 }, { "epoch": 0.06376195536663125, "grad_norm": 3.7657527531925097, "learning_rate": 1.994044470102308e-07, "loss": 0.7249, "step": 645 }, { "epoch": 0.06386081111138571, "grad_norm": 25.55088229929469, "learning_rate": 1.994009524058757e-07, "loss": 0.6385, "step": 646 }, { "epoch": 0.06395966685614017, "grad_norm": 5.531713167280419, "learning_rate": 1.9939744760941922e-07, "loss": 0.7285, "step": 647 }, { "epoch": 0.06405852260089465, "grad_norm": 4.400238289529198, "learning_rate": 1.9939393262122067e-07, "loss": 0.7689, "step": 648 }, { "epoch": 0.06415737834564911, "grad_norm": 4.858059268299443, "learning_rate": 1.993904074416405e-07, "loss": 0.7799, "step": 649 }, { "epoch": 0.06425623409040358, "grad_norm": 7.258924284412419, "learning_rate": 1.993868720710402e-07, "loss": 0.8285, "step": 650 }, { "epoch": 0.06435508983515804, "grad_norm": 5.060143218946353, "learning_rate": 1.9938332650978217e-07, "loss": 0.7457, "step": 651 }, { "epoch": 0.06445394557991252, "grad_norm": 5.72119992213571, "learning_rate": 1.9937977075823005e-07, "loss": 0.7443, "step": 652 }, { "epoch": 0.06455280132466698, "grad_norm": 4.890847574554586, "learning_rate": 1.9937620481674837e-07, "loss": 0.811, "step": 653 }, { "epoch": 0.06465165706942144, "grad_norm": 7.7869397889051095, "learning_rate": 1.9937262868570279e-07, "loss": 0.679, "step": 654 }, { "epoch": 0.06475051281417592, "grad_norm": 7.229676300629243, "learning_rate": 1.9936904236546002e-07, "loss": 0.7437, "step": 655 }, { "epoch": 0.06484936855893038, "grad_norm": 15.38591407667973, "learning_rate": 1.9936544585638772e-07, "loss": 0.7672, "step": 656 }, { "epoch": 0.06494822430368485, "grad_norm": 6.745946086128724, "learning_rate": 1.9936183915885468e-07, "loss": 0.7958, "step": 657 }, { "epoch": 0.06504708004843932, "grad_norm": 3.947889297936475, "learning_rate": 1.9935822227323077e-07, "loss": 0.8445, "step": 658 }, { "epoch": 0.06514593579319378, "grad_norm": 7.155655628276617, "learning_rate": 1.9935459519988677e-07, "loss": 0.7605, "step": 659 }, { "epoch": 0.06524479153794825, "grad_norm": 3.517338056352976, "learning_rate": 1.9935095793919463e-07, "loss": 0.7676, "step": 660 }, { "epoch": 0.06534364728270271, "grad_norm": 3.469106887177477, "learning_rate": 1.9934731049152727e-07, "loss": 0.7434, "step": 661 }, { "epoch": 0.06544250302745719, "grad_norm": 5.588840309280119, "learning_rate": 1.993436528572587e-07, "loss": 0.7828, "step": 662 }, { "epoch": 0.06554135877221165, "grad_norm": 3.922214188420278, "learning_rate": 1.9933998503676394e-07, "loss": 0.7857, "step": 663 }, { "epoch": 0.06564021451696611, "grad_norm": 10.07131497745927, "learning_rate": 1.993363070304191e-07, "loss": 0.7161, "step": 664 }, { "epoch": 0.06573907026172059, "grad_norm": 3.617634166245376, "learning_rate": 1.993326188386013e-07, "loss": 0.7259, "step": 665 }, { "epoch": 0.06583792600647505, "grad_norm": 9.172056109781693, "learning_rate": 1.9932892046168867e-07, "loss": 0.7009, "step": 666 }, { "epoch": 0.06593678175122952, "grad_norm": 5.047590614155565, "learning_rate": 1.9932521190006046e-07, "loss": 0.7824, "step": 667 }, { "epoch": 0.06603563749598398, "grad_norm": 3.10977448498011, "learning_rate": 1.9932149315409692e-07, "loss": 0.6725, "step": 668 }, { "epoch": 0.06613449324073845, "grad_norm": 24.761525998573045, "learning_rate": 1.9931776422417937e-07, "loss": 0.8204, "step": 669 }, { "epoch": 0.06623334898549292, "grad_norm": 3.942792193979351, "learning_rate": 1.993140251106901e-07, "loss": 0.7908, "step": 670 }, { "epoch": 0.06633220473024738, "grad_norm": 8.355570942070676, "learning_rate": 1.9931027581401255e-07, "loss": 0.7062, "step": 671 }, { "epoch": 0.06643106047500186, "grad_norm": 4.6033483075338895, "learning_rate": 1.993065163345312e-07, "loss": 0.7105, "step": 672 }, { "epoch": 0.06652991621975632, "grad_norm": 9.67939959813092, "learning_rate": 1.993027466726314e-07, "loss": 0.7591, "step": 673 }, { "epoch": 0.0666287719645108, "grad_norm": 7.083669208638802, "learning_rate": 1.9929896682869977e-07, "loss": 0.6727, "step": 674 }, { "epoch": 0.06672762770926526, "grad_norm": 3.058717258565138, "learning_rate": 1.9929517680312386e-07, "loss": 0.6755, "step": 675 }, { "epoch": 0.06682648345401972, "grad_norm": 4.516076094137312, "learning_rate": 1.992913765962923e-07, "loss": 0.6971, "step": 676 }, { "epoch": 0.06692533919877419, "grad_norm": 5.745370500179706, "learning_rate": 1.992875662085947e-07, "loss": 0.825, "step": 677 }, { "epoch": 0.06702419494352865, "grad_norm": 18.566326875176973, "learning_rate": 1.9928374564042176e-07, "loss": 0.8062, "step": 678 }, { "epoch": 0.06712305068828313, "grad_norm": 6.8621971288259935, "learning_rate": 1.992799148921653e-07, "loss": 0.8035, "step": 679 }, { "epoch": 0.06722190643303759, "grad_norm": 3.739600262633109, "learning_rate": 1.99276073964218e-07, "loss": 0.6959, "step": 680 }, { "epoch": 0.06732076217779205, "grad_norm": 3.663244751054349, "learning_rate": 1.9927222285697375e-07, "loss": 0.7894, "step": 681 }, { "epoch": 0.06741961792254653, "grad_norm": 4.417755463361492, "learning_rate": 1.992683615708274e-07, "loss": 0.7265, "step": 682 }, { "epoch": 0.06751847366730099, "grad_norm": 4.362436476373061, "learning_rate": 1.992644901061749e-07, "loss": 0.7141, "step": 683 }, { "epoch": 0.06761732941205546, "grad_norm": 8.292734662051044, "learning_rate": 1.992606084634132e-07, "loss": 0.7415, "step": 684 }, { "epoch": 0.06771618515680992, "grad_norm": 5.489698564883644, "learning_rate": 1.9925671664294028e-07, "loss": 0.8587, "step": 685 }, { "epoch": 0.06781504090156439, "grad_norm": 3.596805019634159, "learning_rate": 1.9925281464515522e-07, "loss": 0.7083, "step": 686 }, { "epoch": 0.06791389664631886, "grad_norm": 8.57288755803437, "learning_rate": 1.9924890247045811e-07, "loss": 0.7754, "step": 687 }, { "epoch": 0.06801275239107332, "grad_norm": 4.806516791011535, "learning_rate": 1.9924498011925008e-07, "loss": 0.7973, "step": 688 }, { "epoch": 0.0681116081358278, "grad_norm": 3.9078803043106527, "learning_rate": 1.9924104759193327e-07, "loss": 0.7071, "step": 689 }, { "epoch": 0.06821046388058226, "grad_norm": 7.114712903518572, "learning_rate": 1.9923710488891098e-07, "loss": 0.7833, "step": 690 }, { "epoch": 0.06830931962533673, "grad_norm": 4.053358353645886, "learning_rate": 1.992331520105874e-07, "loss": 0.8274, "step": 691 }, { "epoch": 0.0684081753700912, "grad_norm": 22.38140021424328, "learning_rate": 1.992291889573679e-07, "loss": 0.7256, "step": 692 }, { "epoch": 0.06850703111484566, "grad_norm": 3.2279270111388048, "learning_rate": 1.9922521572965877e-07, "loss": 0.7996, "step": 693 }, { "epoch": 0.06860588685960013, "grad_norm": 5.8828313406716095, "learning_rate": 1.9922123232786748e-07, "loss": 0.7702, "step": 694 }, { "epoch": 0.0687047426043546, "grad_norm": 3.952389997780548, "learning_rate": 1.992172387524024e-07, "loss": 0.7501, "step": 695 }, { "epoch": 0.06880359834910907, "grad_norm": 5.059293401018895, "learning_rate": 1.9921323500367306e-07, "loss": 0.7629, "step": 696 }, { "epoch": 0.06890245409386353, "grad_norm": 6.029913322129757, "learning_rate": 1.9920922108208996e-07, "loss": 0.8431, "step": 697 }, { "epoch": 0.06900130983861799, "grad_norm": 3.7737552386342594, "learning_rate": 1.9920519698806469e-07, "loss": 0.6893, "step": 698 }, { "epoch": 0.06910016558337247, "grad_norm": 4.231710088102548, "learning_rate": 1.9920116272200987e-07, "loss": 0.7673, "step": 699 }, { "epoch": 0.06919902132812693, "grad_norm": 14.754044814347832, "learning_rate": 1.991971182843391e-07, "loss": 0.7091, "step": 700 }, { "epoch": 0.0692978770728814, "grad_norm": 5.4321874193895345, "learning_rate": 1.991930636754671e-07, "loss": 0.7892, "step": 701 }, { "epoch": 0.06939673281763586, "grad_norm": 5.012149811902219, "learning_rate": 1.9918899889580962e-07, "loss": 0.769, "step": 702 }, { "epoch": 0.06949558856239033, "grad_norm": 3.4596001879756217, "learning_rate": 1.9918492394578347e-07, "loss": 0.6569, "step": 703 }, { "epoch": 0.0695944443071448, "grad_norm": 3.2515720722496755, "learning_rate": 1.9918083882580644e-07, "loss": 0.6982, "step": 704 }, { "epoch": 0.06969330005189926, "grad_norm": 3.1027587528774276, "learning_rate": 1.9917674353629743e-07, "loss": 0.7664, "step": 705 }, { "epoch": 0.06979215579665374, "grad_norm": 3.7072685611372616, "learning_rate": 1.9917263807767627e-07, "loss": 0.806, "step": 706 }, { "epoch": 0.0698910115414082, "grad_norm": 26.561335475692584, "learning_rate": 1.9916852245036404e-07, "loss": 0.8222, "step": 707 }, { "epoch": 0.06998986728616266, "grad_norm": 7.824107075299489, "learning_rate": 1.9916439665478265e-07, "loss": 0.783, "step": 708 }, { "epoch": 0.07008872303091714, "grad_norm": 7.776914185685562, "learning_rate": 1.9916026069135516e-07, "loss": 0.7119, "step": 709 }, { "epoch": 0.0701875787756716, "grad_norm": 3.622366111082512, "learning_rate": 1.9915611456050563e-07, "loss": 0.7765, "step": 710 }, { "epoch": 0.07028643452042607, "grad_norm": 3.7378901464724184, "learning_rate": 1.9915195826265924e-07, "loss": 0.7432, "step": 711 }, { "epoch": 0.07038529026518053, "grad_norm": 6.189146855791468, "learning_rate": 1.9914779179824212e-07, "loss": 0.6783, "step": 712 }, { "epoch": 0.07048414600993501, "grad_norm": 7.550242476675408, "learning_rate": 1.9914361516768145e-07, "loss": 0.6958, "step": 713 }, { "epoch": 0.07058300175468947, "grad_norm": 5.027182750871855, "learning_rate": 1.9913942837140554e-07, "loss": 0.848, "step": 714 }, { "epoch": 0.07068185749944393, "grad_norm": 3.8080429553909614, "learning_rate": 1.9913523140984367e-07, "loss": 0.7291, "step": 715 }, { "epoch": 0.0707807132441984, "grad_norm": 4.058155457610422, "learning_rate": 1.9913102428342614e-07, "loss": 0.8742, "step": 716 }, { "epoch": 0.07087956898895287, "grad_norm": 4.71909905231287, "learning_rate": 1.991268069925844e-07, "loss": 0.7738, "step": 717 }, { "epoch": 0.07097842473370734, "grad_norm": 7.7816740597923175, "learning_rate": 1.991225795377508e-07, "loss": 0.7115, "step": 718 }, { "epoch": 0.0710772804784618, "grad_norm": 3.594697127816907, "learning_rate": 1.991183419193588e-07, "loss": 0.7341, "step": 719 }, { "epoch": 0.07117613622321627, "grad_norm": 3.6824583745598085, "learning_rate": 1.9911409413784298e-07, "loss": 0.771, "step": 720 }, { "epoch": 0.07127499196797074, "grad_norm": 5.7014248291136225, "learning_rate": 1.991098361936388e-07, "loss": 0.8336, "step": 721 }, { "epoch": 0.0713738477127252, "grad_norm": 5.763057597012828, "learning_rate": 1.9910556808718293e-07, "loss": 0.795, "step": 722 }, { "epoch": 0.07147270345747968, "grad_norm": 24.317252156960127, "learning_rate": 1.9910128981891296e-07, "loss": 0.7262, "step": 723 }, { "epoch": 0.07157155920223414, "grad_norm": 4.145075407351751, "learning_rate": 1.9909700138926755e-07, "loss": 0.8862, "step": 724 }, { "epoch": 0.0716704149469886, "grad_norm": 4.640838633475026, "learning_rate": 1.9909270279868645e-07, "loss": 0.7328, "step": 725 }, { "epoch": 0.07176927069174308, "grad_norm": 10.418647182247435, "learning_rate": 1.990883940476104e-07, "loss": 0.7135, "step": 726 }, { "epoch": 0.07186812643649754, "grad_norm": 4.395141068248769, "learning_rate": 1.9908407513648117e-07, "loss": 0.7018, "step": 727 }, { "epoch": 0.07196698218125201, "grad_norm": 4.329061315465672, "learning_rate": 1.9907974606574163e-07, "loss": 0.7926, "step": 728 }, { "epoch": 0.07206583792600647, "grad_norm": 4.516061543655133, "learning_rate": 1.9907540683583567e-07, "loss": 0.8152, "step": 729 }, { "epoch": 0.07216469367076095, "grad_norm": 4.339204628915288, "learning_rate": 1.990710574472082e-07, "loss": 0.798, "step": 730 }, { "epoch": 0.07226354941551541, "grad_norm": 3.836271426299221, "learning_rate": 1.9906669790030516e-07, "loss": 0.7091, "step": 731 }, { "epoch": 0.07236240516026987, "grad_norm": 4.304068222738715, "learning_rate": 1.9906232819557362e-07, "loss": 0.7338, "step": 732 }, { "epoch": 0.07246126090502435, "grad_norm": 6.434619047274208, "learning_rate": 1.990579483334616e-07, "loss": 0.6505, "step": 733 }, { "epoch": 0.07256011664977881, "grad_norm": 7.030358176849566, "learning_rate": 1.990535583144182e-07, "loss": 0.7025, "step": 734 }, { "epoch": 0.07265897239453328, "grad_norm": 6.167883454262384, "learning_rate": 1.990491581388935e-07, "loss": 0.7471, "step": 735 }, { "epoch": 0.07275782813928774, "grad_norm": 6.2570900528319875, "learning_rate": 1.9904474780733874e-07, "loss": 0.7548, "step": 736 }, { "epoch": 0.0728566838840422, "grad_norm": 4.039410039137964, "learning_rate": 1.990403273202061e-07, "loss": 0.6529, "step": 737 }, { "epoch": 0.07295553962879668, "grad_norm": 3.795692911038379, "learning_rate": 1.9903589667794882e-07, "loss": 0.7442, "step": 738 }, { "epoch": 0.07305439537355114, "grad_norm": 4.080900896114658, "learning_rate": 1.9903145588102124e-07, "loss": 0.8195, "step": 739 }, { "epoch": 0.07315325111830562, "grad_norm": 3.171736275523173, "learning_rate": 1.9902700492987866e-07, "loss": 0.7191, "step": 740 }, { "epoch": 0.07325210686306008, "grad_norm": 4.305999994181826, "learning_rate": 1.9902254382497748e-07, "loss": 0.6409, "step": 741 }, { "epoch": 0.07335096260781454, "grad_norm": 5.608596448884323, "learning_rate": 1.9901807256677512e-07, "loss": 0.7246, "step": 742 }, { "epoch": 0.07344981835256902, "grad_norm": 4.501493468624949, "learning_rate": 1.9901359115573006e-07, "loss": 0.6875, "step": 743 }, { "epoch": 0.07354867409732348, "grad_norm": 10.209197672187054, "learning_rate": 1.9900909959230173e-07, "loss": 0.7652, "step": 744 }, { "epoch": 0.07364752984207795, "grad_norm": 3.0972966120161596, "learning_rate": 1.9900459787695078e-07, "loss": 0.6976, "step": 745 }, { "epoch": 0.07374638558683241, "grad_norm": 39.16403054221962, "learning_rate": 1.990000860101387e-07, "loss": 0.7273, "step": 746 }, { "epoch": 0.07384524133158687, "grad_norm": 7.2527880937637965, "learning_rate": 1.9899556399232816e-07, "loss": 0.6912, "step": 747 }, { "epoch": 0.07394409707634135, "grad_norm": 3.607023924978217, "learning_rate": 1.9899103182398282e-07, "loss": 0.7301, "step": 748 }, { "epoch": 0.07404295282109581, "grad_norm": 3.749955276091892, "learning_rate": 1.989864895055674e-07, "loss": 0.7225, "step": 749 }, { "epoch": 0.07414180856585029, "grad_norm": 12.680948495817658, "learning_rate": 1.9898193703754763e-07, "loss": 0.7612, "step": 750 }, { "epoch": 0.07424066431060475, "grad_norm": 7.764726194697529, "learning_rate": 1.989773744203903e-07, "loss": 0.8334, "step": 751 }, { "epoch": 0.07433952005535922, "grad_norm": 3.4362378962387345, "learning_rate": 1.9897280165456322e-07, "loss": 0.7719, "step": 752 }, { "epoch": 0.07443837580011368, "grad_norm": 3.5780340402541704, "learning_rate": 1.9896821874053532e-07, "loss": 0.7446, "step": 753 }, { "epoch": 0.07453723154486815, "grad_norm": 3.2078953717286662, "learning_rate": 1.9896362567877644e-07, "loss": 0.7293, "step": 754 }, { "epoch": 0.07463608728962262, "grad_norm": 7.78350898021283, "learning_rate": 1.9895902246975757e-07, "loss": 0.6874, "step": 755 }, { "epoch": 0.07473494303437708, "grad_norm": 5.352410268779308, "learning_rate": 1.9895440911395073e-07, "loss": 0.7125, "step": 756 }, { "epoch": 0.07483379877913156, "grad_norm": 3.7735431196452227, "learning_rate": 1.9894978561182887e-07, "loss": 0.862, "step": 757 }, { "epoch": 0.07493265452388602, "grad_norm": 8.809300514529598, "learning_rate": 1.989451519638661e-07, "loss": 0.7905, "step": 758 }, { "epoch": 0.07503151026864048, "grad_norm": 4.884794697963846, "learning_rate": 1.989405081705376e-07, "loss": 0.8184, "step": 759 }, { "epoch": 0.07513036601339496, "grad_norm": 8.456171736946942, "learning_rate": 1.9893585423231943e-07, "loss": 0.7935, "step": 760 }, { "epoch": 0.07522922175814942, "grad_norm": 8.555494271273227, "learning_rate": 1.989311901496888e-07, "loss": 0.7945, "step": 761 }, { "epoch": 0.07532807750290389, "grad_norm": 4.325461156044643, "learning_rate": 1.9892651592312396e-07, "loss": 0.8131, "step": 762 }, { "epoch": 0.07542693324765835, "grad_norm": 5.466562070992862, "learning_rate": 1.989218315531042e-07, "loss": 0.7042, "step": 763 }, { "epoch": 0.07552578899241282, "grad_norm": 3.6685325391509176, "learning_rate": 1.9891713704010982e-07, "loss": 0.727, "step": 764 }, { "epoch": 0.07562464473716729, "grad_norm": 7.232233865782656, "learning_rate": 1.9891243238462216e-07, "loss": 0.743, "step": 765 }, { "epoch": 0.07572350048192175, "grad_norm": 10.13390524854547, "learning_rate": 1.9890771758712362e-07, "loss": 0.6879, "step": 766 }, { "epoch": 0.07582235622667623, "grad_norm": 7.944449062723659, "learning_rate": 1.989029926480976e-07, "loss": 0.6923, "step": 767 }, { "epoch": 0.07592121197143069, "grad_norm": 3.829357682804349, "learning_rate": 1.9889825756802865e-07, "loss": 0.7916, "step": 768 }, { "epoch": 0.07602006771618516, "grad_norm": 3.851113104447836, "learning_rate": 1.988935123474022e-07, "loss": 0.7054, "step": 769 }, { "epoch": 0.07611892346093962, "grad_norm": 3.0377050449586904, "learning_rate": 1.9888875698670486e-07, "loss": 0.7536, "step": 770 }, { "epoch": 0.07621777920569409, "grad_norm": 3.5991429727313333, "learning_rate": 1.988839914864242e-07, "loss": 0.7309, "step": 771 }, { "epoch": 0.07631663495044856, "grad_norm": 4.521781137218234, "learning_rate": 1.9887921584704888e-07, "loss": 0.8344, "step": 772 }, { "epoch": 0.07641549069520302, "grad_norm": 7.279719438255686, "learning_rate": 1.9887443006906853e-07, "loss": 0.7766, "step": 773 }, { "epoch": 0.0765143464399575, "grad_norm": 9.11961298703795, "learning_rate": 1.9886963415297384e-07, "loss": 0.7536, "step": 774 }, { "epoch": 0.07661320218471196, "grad_norm": 5.225782649350661, "learning_rate": 1.9886482809925663e-07, "loss": 0.7133, "step": 775 }, { "epoch": 0.07671205792946642, "grad_norm": 3.484893864137296, "learning_rate": 1.9886001190840965e-07, "loss": 0.8302, "step": 776 }, { "epoch": 0.0768109136742209, "grad_norm": 3.5253487712773746, "learning_rate": 1.9885518558092675e-07, "loss": 0.7662, "step": 777 }, { "epoch": 0.07690976941897536, "grad_norm": 4.296758766405044, "learning_rate": 1.9885034911730277e-07, "loss": 0.7891, "step": 778 }, { "epoch": 0.07700862516372983, "grad_norm": 4.962800844289576, "learning_rate": 1.9884550251803365e-07, "loss": 0.8553, "step": 779 }, { "epoch": 0.0771074809084843, "grad_norm": 6.025076616658453, "learning_rate": 1.988406457836163e-07, "loss": 0.639, "step": 780 }, { "epoch": 0.07720633665323876, "grad_norm": 4.75313492543018, "learning_rate": 1.9883577891454874e-07, "loss": 0.8041, "step": 781 }, { "epoch": 0.07730519239799323, "grad_norm": 3.7474338993119094, "learning_rate": 1.9883090191132997e-07, "loss": 0.7953, "step": 782 }, { "epoch": 0.07740404814274769, "grad_norm": 4.852853186612177, "learning_rate": 1.9882601477446007e-07, "loss": 0.7452, "step": 783 }, { "epoch": 0.07750290388750217, "grad_norm": 4.413149669292896, "learning_rate": 1.9882111750444018e-07, "loss": 0.8106, "step": 784 }, { "epoch": 0.07760175963225663, "grad_norm": 3.721686318953266, "learning_rate": 1.9881621010177235e-07, "loss": 0.7316, "step": 785 }, { "epoch": 0.07770061537701109, "grad_norm": 5.2104866879326845, "learning_rate": 1.9881129256695983e-07, "loss": 0.8006, "step": 786 }, { "epoch": 0.07779947112176556, "grad_norm": 3.891899037836648, "learning_rate": 1.9880636490050686e-07, "loss": 0.7691, "step": 787 }, { "epoch": 0.07789832686652003, "grad_norm": 5.005587609141315, "learning_rate": 1.9880142710291863e-07, "loss": 0.7833, "step": 788 }, { "epoch": 0.0779971826112745, "grad_norm": 4.155246434173738, "learning_rate": 1.987964791747015e-07, "loss": 0.8192, "step": 789 }, { "epoch": 0.07809603835602896, "grad_norm": 4.185286011202474, "learning_rate": 1.9879152111636275e-07, "loss": 0.7309, "step": 790 }, { "epoch": 0.07819489410078344, "grad_norm": 4.662798597717031, "learning_rate": 1.987865529284108e-07, "loss": 0.7921, "step": 791 }, { "epoch": 0.0782937498455379, "grad_norm": 5.658409162286452, "learning_rate": 1.9878157461135509e-07, "loss": 0.7284, "step": 792 }, { "epoch": 0.07839260559029236, "grad_norm": 9.44546993410144, "learning_rate": 1.9877658616570597e-07, "loss": 0.7405, "step": 793 }, { "epoch": 0.07849146133504684, "grad_norm": 3.4992306426165904, "learning_rate": 1.98771587591975e-07, "loss": 0.6944, "step": 794 }, { "epoch": 0.0785903170798013, "grad_norm": 4.104978737836563, "learning_rate": 1.9876657889067476e-07, "loss": 0.7596, "step": 795 }, { "epoch": 0.07868917282455577, "grad_norm": 5.214548542970481, "learning_rate": 1.987615600623187e-07, "loss": 0.8185, "step": 796 }, { "epoch": 0.07878802856931023, "grad_norm": 17.965256797520407, "learning_rate": 1.9875653110742157e-07, "loss": 0.8118, "step": 797 }, { "epoch": 0.0788868843140647, "grad_norm": 4.829172213887559, "learning_rate": 1.9875149202649886e-07, "loss": 0.6847, "step": 798 }, { "epoch": 0.07898574005881917, "grad_norm": 4.105287199321695, "learning_rate": 1.9874644282006735e-07, "loss": 0.7341, "step": 799 }, { "epoch": 0.07908459580357363, "grad_norm": 4.226025231830703, "learning_rate": 1.9874138348864476e-07, "loss": 0.7034, "step": 800 }, { "epoch": 0.07918345154832811, "grad_norm": 3.630380700965607, "learning_rate": 1.9873631403274983e-07, "loss": 0.8017, "step": 801 }, { "epoch": 0.07928230729308257, "grad_norm": 6.659541247636427, "learning_rate": 1.9873123445290236e-07, "loss": 0.7424, "step": 802 }, { "epoch": 0.07938116303783703, "grad_norm": 19.23650281561679, "learning_rate": 1.9872614474962317e-07, "loss": 0.7158, "step": 803 }, { "epoch": 0.0794800187825915, "grad_norm": 3.929424724418909, "learning_rate": 1.9872104492343418e-07, "loss": 0.6706, "step": 804 }, { "epoch": 0.07957887452734597, "grad_norm": 6.905009982565782, "learning_rate": 1.9871593497485823e-07, "loss": 0.7558, "step": 805 }, { "epoch": 0.07967773027210044, "grad_norm": 12.255420033554843, "learning_rate": 1.9871081490441935e-07, "loss": 0.7736, "step": 806 }, { "epoch": 0.0797765860168549, "grad_norm": 5.250536386257707, "learning_rate": 1.9870568471264243e-07, "loss": 0.8035, "step": 807 }, { "epoch": 0.07987544176160938, "grad_norm": 5.354749942689716, "learning_rate": 1.9870054440005362e-07, "loss": 0.7937, "step": 808 }, { "epoch": 0.07997429750636384, "grad_norm": 8.837205278518203, "learning_rate": 1.9869539396717988e-07, "loss": 0.7456, "step": 809 }, { "epoch": 0.0800731532511183, "grad_norm": 5.3864153004375295, "learning_rate": 1.9869023341454935e-07, "loss": 0.8031, "step": 810 }, { "epoch": 0.08017200899587278, "grad_norm": 6.15435767117148, "learning_rate": 1.986850627426912e-07, "loss": 0.8313, "step": 811 }, { "epoch": 0.08027086474062724, "grad_norm": 4.245939297558537, "learning_rate": 1.9867988195213554e-07, "loss": 0.7846, "step": 812 }, { "epoch": 0.08036972048538171, "grad_norm": 6.239636917418423, "learning_rate": 1.986746910434136e-07, "loss": 0.8954, "step": 813 }, { "epoch": 0.08046857623013617, "grad_norm": 3.976873909313196, "learning_rate": 1.9866949001705768e-07, "loss": 0.8021, "step": 814 }, { "epoch": 0.08056743197489064, "grad_norm": 3.7468659372137556, "learning_rate": 1.9866427887360105e-07, "loss": 0.6576, "step": 815 }, { "epoch": 0.08066628771964511, "grad_norm": 30.690961391913937, "learning_rate": 1.98659057613578e-07, "loss": 0.6978, "step": 816 }, { "epoch": 0.08076514346439957, "grad_norm": 14.407851009278248, "learning_rate": 1.986538262375239e-07, "loss": 0.7896, "step": 817 }, { "epoch": 0.08086399920915405, "grad_norm": 3.2539214509653256, "learning_rate": 1.9864858474597516e-07, "loss": 0.6667, "step": 818 }, { "epoch": 0.08096285495390851, "grad_norm": 3.5579486011855552, "learning_rate": 1.9864333313946927e-07, "loss": 0.6281, "step": 819 }, { "epoch": 0.08106171069866297, "grad_norm": 7.384169343550695, "learning_rate": 1.9863807141854463e-07, "loss": 0.7666, "step": 820 }, { "epoch": 0.08116056644341745, "grad_norm": 3.8644177584333566, "learning_rate": 1.9863279958374076e-07, "loss": 0.6997, "step": 821 }, { "epoch": 0.0812594221881719, "grad_norm": 4.991519416288131, "learning_rate": 1.9862751763559827e-07, "loss": 0.7833, "step": 822 }, { "epoch": 0.08135827793292638, "grad_norm": 3.6610485188644057, "learning_rate": 1.9862222557465866e-07, "loss": 0.8443, "step": 823 }, { "epoch": 0.08145713367768084, "grad_norm": 4.922354990484463, "learning_rate": 1.9861692340146463e-07, "loss": 0.787, "step": 824 }, { "epoch": 0.0815559894224353, "grad_norm": 3.259339898108427, "learning_rate": 1.986116111165598e-07, "loss": 0.7704, "step": 825 }, { "epoch": 0.08165484516718978, "grad_norm": 5.328174756434093, "learning_rate": 1.9860628872048888e-07, "loss": 0.7362, "step": 826 }, { "epoch": 0.08175370091194424, "grad_norm": 4.372367691286817, "learning_rate": 1.9860095621379757e-07, "loss": 0.7859, "step": 827 }, { "epoch": 0.08185255665669872, "grad_norm": 10.38007253378599, "learning_rate": 1.9859561359703272e-07, "loss": 0.6434, "step": 828 }, { "epoch": 0.08195141240145318, "grad_norm": 4.415345800072433, "learning_rate": 1.9859026087074203e-07, "loss": 0.859, "step": 829 }, { "epoch": 0.08205026814620765, "grad_norm": 3.6221072347778955, "learning_rate": 1.9858489803547443e-07, "loss": 0.805, "step": 830 }, { "epoch": 0.08214912389096211, "grad_norm": 11.912724116335559, "learning_rate": 1.9857952509177974e-07, "loss": 0.7603, "step": 831 }, { "epoch": 0.08224797963571658, "grad_norm": 3.8282029364525885, "learning_rate": 1.985741420402089e-07, "loss": 0.8448, "step": 832 }, { "epoch": 0.08234683538047105, "grad_norm": 9.48714494587464, "learning_rate": 1.985687488813139e-07, "loss": 0.8396, "step": 833 }, { "epoch": 0.08244569112522551, "grad_norm": 4.27357580429121, "learning_rate": 1.9856334561564767e-07, "loss": 0.8055, "step": 834 }, { "epoch": 0.08254454686997999, "grad_norm": 4.294978182834366, "learning_rate": 1.9855793224376427e-07, "loss": 0.7707, "step": 835 }, { "epoch": 0.08264340261473445, "grad_norm": 3.87519284546518, "learning_rate": 1.985525087662187e-07, "loss": 0.7649, "step": 836 }, { "epoch": 0.08274225835948891, "grad_norm": 3.2009063024540763, "learning_rate": 1.9854707518356717e-07, "loss": 0.8032, "step": 837 }, { "epoch": 0.08284111410424339, "grad_norm": 5.419668170208262, "learning_rate": 1.9854163149636672e-07, "loss": 0.7129, "step": 838 }, { "epoch": 0.08293996984899785, "grad_norm": 5.322405729929826, "learning_rate": 1.9853617770517558e-07, "loss": 0.7338, "step": 839 }, { "epoch": 0.08303882559375232, "grad_norm": 3.522059614559362, "learning_rate": 1.9853071381055292e-07, "loss": 0.7693, "step": 840 }, { "epoch": 0.08313768133850678, "grad_norm": 4.8160918372272725, "learning_rate": 1.9852523981305896e-07, "loss": 0.737, "step": 841 }, { "epoch": 0.08323653708326124, "grad_norm": 3.610621735537937, "learning_rate": 1.9851975571325504e-07, "loss": 0.7458, "step": 842 }, { "epoch": 0.08333539282801572, "grad_norm": 4.988577800410283, "learning_rate": 1.9851426151170342e-07, "loss": 0.6967, "step": 843 }, { "epoch": 0.08343424857277018, "grad_norm": 2.89792044905663, "learning_rate": 1.9850875720896746e-07, "loss": 0.7069, "step": 844 }, { "epoch": 0.08353310431752466, "grad_norm": 6.99272429488204, "learning_rate": 1.985032428056116e-07, "loss": 0.7559, "step": 845 }, { "epoch": 0.08363196006227912, "grad_norm": 8.991531595643592, "learning_rate": 1.9849771830220118e-07, "loss": 0.8234, "step": 846 }, { "epoch": 0.08373081580703359, "grad_norm": 4.484469749634898, "learning_rate": 1.9849218369930268e-07, "loss": 0.7822, "step": 847 }, { "epoch": 0.08382967155178805, "grad_norm": 5.372239032979066, "learning_rate": 1.9848663899748363e-07, "loss": 0.6474, "step": 848 }, { "epoch": 0.08392852729654252, "grad_norm": 3.7097462411279647, "learning_rate": 1.9848108419731254e-07, "loss": 0.7021, "step": 849 }, { "epoch": 0.08402738304129699, "grad_norm": 3.7672815213882713, "learning_rate": 1.9847551929935893e-07, "loss": 0.7664, "step": 850 }, { "epoch": 0.08412623878605145, "grad_norm": 5.112768690470133, "learning_rate": 1.9846994430419347e-07, "loss": 0.6364, "step": 851 }, { "epoch": 0.08422509453080593, "grad_norm": 9.283184836367477, "learning_rate": 1.9846435921238772e-07, "loss": 0.8053, "step": 852 }, { "epoch": 0.08432395027556039, "grad_norm": 4.641899073408122, "learning_rate": 1.9845876402451439e-07, "loss": 0.7857, "step": 853 }, { "epoch": 0.08442280602031485, "grad_norm": 7.844723835520541, "learning_rate": 1.9845315874114719e-07, "loss": 0.6722, "step": 854 }, { "epoch": 0.08452166176506933, "grad_norm": 11.89001883872346, "learning_rate": 1.9844754336286083e-07, "loss": 0.7465, "step": 855 }, { "epoch": 0.08462051750982379, "grad_norm": 11.862602767462036, "learning_rate": 1.984419178902311e-07, "loss": 0.6613, "step": 856 }, { "epoch": 0.08471937325457826, "grad_norm": 4.486796269006515, "learning_rate": 1.9843628232383484e-07, "loss": 0.7362, "step": 857 }, { "epoch": 0.08481822899933272, "grad_norm": 4.14493865745818, "learning_rate": 1.9843063666424987e-07, "loss": 0.7703, "step": 858 }, { "epoch": 0.08491708474408718, "grad_norm": 52.992928550541514, "learning_rate": 1.9842498091205502e-07, "loss": 0.7132, "step": 859 }, { "epoch": 0.08501594048884166, "grad_norm": 6.535440644055109, "learning_rate": 1.9841931506783027e-07, "loss": 0.7032, "step": 860 }, { "epoch": 0.08511479623359612, "grad_norm": 5.624438777712978, "learning_rate": 1.9841363913215658e-07, "loss": 0.8004, "step": 861 }, { "epoch": 0.0852136519783506, "grad_norm": 6.287734676271625, "learning_rate": 1.9840795310561585e-07, "loss": 0.8073, "step": 862 }, { "epoch": 0.08531250772310506, "grad_norm": 3.930633126665939, "learning_rate": 1.9840225698879117e-07, "loss": 0.8182, "step": 863 }, { "epoch": 0.08541136346785952, "grad_norm": 3.5212968957296, "learning_rate": 1.9839655078226657e-07, "loss": 0.7972, "step": 864 }, { "epoch": 0.085510219212614, "grad_norm": 11.084579392991461, "learning_rate": 1.9839083448662712e-07, "loss": 0.7326, "step": 865 }, { "epoch": 0.08560907495736846, "grad_norm": 8.09834389690182, "learning_rate": 1.98385108102459e-07, "loss": 0.6885, "step": 866 }, { "epoch": 0.08570793070212293, "grad_norm": 9.462335183684687, "learning_rate": 1.9837937163034927e-07, "loss": 0.8104, "step": 867 }, { "epoch": 0.08580678644687739, "grad_norm": 3.8763047047789927, "learning_rate": 1.9837362507088624e-07, "loss": 0.7138, "step": 868 }, { "epoch": 0.08590564219163187, "grad_norm": 5.430271804166304, "learning_rate": 1.9836786842465903e-07, "loss": 0.678, "step": 869 }, { "epoch": 0.08600449793638633, "grad_norm": 4.0562925411616115, "learning_rate": 1.9836210169225796e-07, "loss": 0.6687, "step": 870 }, { "epoch": 0.08610335368114079, "grad_norm": 4.986835142943566, "learning_rate": 1.9835632487427428e-07, "loss": 0.7112, "step": 871 }, { "epoch": 0.08620220942589527, "grad_norm": 13.365202470400366, "learning_rate": 1.9835053797130037e-07, "loss": 0.7808, "step": 872 }, { "epoch": 0.08630106517064973, "grad_norm": 4.778659956489834, "learning_rate": 1.9834474098392957e-07, "loss": 0.7104, "step": 873 }, { "epoch": 0.0863999209154042, "grad_norm": 5.768771104939935, "learning_rate": 1.9833893391275625e-07, "loss": 0.7597, "step": 874 }, { "epoch": 0.08649877666015866, "grad_norm": 7.567129429976021, "learning_rate": 1.9833311675837584e-07, "loss": 0.7378, "step": 875 }, { "epoch": 0.08659763240491312, "grad_norm": 4.455226815361179, "learning_rate": 1.9832728952138487e-07, "loss": 0.7451, "step": 876 }, { "epoch": 0.0866964881496676, "grad_norm": 4.381278016031531, "learning_rate": 1.9832145220238075e-07, "loss": 0.7645, "step": 877 }, { "epoch": 0.08679534389442206, "grad_norm": 3.7943218022766114, "learning_rate": 1.9831560480196207e-07, "loss": 0.7358, "step": 878 }, { "epoch": 0.08689419963917654, "grad_norm": 3.696921007763004, "learning_rate": 1.9830974732072838e-07, "loss": 0.7855, "step": 879 }, { "epoch": 0.086993055383931, "grad_norm": 3.1371834363680806, "learning_rate": 1.9830387975928024e-07, "loss": 0.7547, "step": 880 }, { "epoch": 0.08709191112868546, "grad_norm": 4.64452160841938, "learning_rate": 1.9829800211821936e-07, "loss": 0.7795, "step": 881 }, { "epoch": 0.08719076687343993, "grad_norm": 4.0040460958056565, "learning_rate": 1.9829211439814832e-07, "loss": 0.8118, "step": 882 }, { "epoch": 0.0872896226181944, "grad_norm": 6.366119570437248, "learning_rate": 1.982862165996709e-07, "loss": 0.8181, "step": 883 }, { "epoch": 0.08738847836294887, "grad_norm": 4.515983980850584, "learning_rate": 1.9828030872339174e-07, "loss": 0.7759, "step": 884 }, { "epoch": 0.08748733410770333, "grad_norm": 3.902417988633249, "learning_rate": 1.982743907699167e-07, "loss": 0.6248, "step": 885 }, { "epoch": 0.08758618985245781, "grad_norm": 8.65243180095402, "learning_rate": 1.9826846273985252e-07, "loss": 0.8386, "step": 886 }, { "epoch": 0.08768504559721227, "grad_norm": 3.092683472698625, "learning_rate": 1.9826252463380704e-07, "loss": 0.8499, "step": 887 }, { "epoch": 0.08778390134196673, "grad_norm": 4.421673007825054, "learning_rate": 1.9825657645238912e-07, "loss": 0.7746, "step": 888 }, { "epoch": 0.0878827570867212, "grad_norm": 9.058862356024104, "learning_rate": 1.982506181962087e-07, "loss": 0.7161, "step": 889 }, { "epoch": 0.08798161283147567, "grad_norm": 5.978148399638466, "learning_rate": 1.9824464986587671e-07, "loss": 0.7142, "step": 890 }, { "epoch": 0.08808046857623014, "grad_norm": 7.241222338192638, "learning_rate": 1.9823867146200505e-07, "loss": 0.7484, "step": 891 }, { "epoch": 0.0881793243209846, "grad_norm": 5.250237427444657, "learning_rate": 1.9823268298520674e-07, "loss": 0.7677, "step": 892 }, { "epoch": 0.08827818006573906, "grad_norm": 15.90843219663121, "learning_rate": 1.9822668443609585e-07, "loss": 0.7517, "step": 893 }, { "epoch": 0.08837703581049354, "grad_norm": 11.376828907266571, "learning_rate": 1.982206758152874e-07, "loss": 0.6686, "step": 894 }, { "epoch": 0.088475891555248, "grad_norm": 5.809925732898762, "learning_rate": 1.982146571233975e-07, "loss": 0.7382, "step": 895 }, { "epoch": 0.08857474730000248, "grad_norm": 4.921938729652527, "learning_rate": 1.9820862836104332e-07, "loss": 0.7787, "step": 896 }, { "epoch": 0.08867360304475694, "grad_norm": 4.54734638410769, "learning_rate": 1.982025895288429e-07, "loss": 0.8104, "step": 897 }, { "epoch": 0.0887724587895114, "grad_norm": 5.196461481925448, "learning_rate": 1.9819654062741558e-07, "loss": 0.7759, "step": 898 }, { "epoch": 0.08887131453426587, "grad_norm": 3.6519244294956525, "learning_rate": 1.9819048165738153e-07, "loss": 0.8168, "step": 899 }, { "epoch": 0.08897017027902034, "grad_norm": 5.241362548827772, "learning_rate": 1.9818441261936196e-07, "loss": 0.6995, "step": 900 }, { "epoch": 0.08906902602377481, "grad_norm": 4.557603654576993, "learning_rate": 1.981783335139792e-07, "loss": 0.8512, "step": 901 }, { "epoch": 0.08916788176852927, "grad_norm": 3.208950772220537, "learning_rate": 1.981722443418566e-07, "loss": 0.7318, "step": 902 }, { "epoch": 0.08926673751328375, "grad_norm": 5.33583082029189, "learning_rate": 1.9816614510361844e-07, "loss": 0.7552, "step": 903 }, { "epoch": 0.08936559325803821, "grad_norm": 4.07701830814837, "learning_rate": 1.9816003579989018e-07, "loss": 0.8403, "step": 904 }, { "epoch": 0.08946444900279267, "grad_norm": 5.716591169589403, "learning_rate": 1.981539164312982e-07, "loss": 0.7406, "step": 905 }, { "epoch": 0.08956330474754715, "grad_norm": 5.552818519185844, "learning_rate": 1.9814778699846997e-07, "loss": 0.768, "step": 906 }, { "epoch": 0.0896621604923016, "grad_norm": 7.003601100450262, "learning_rate": 1.9814164750203396e-07, "loss": 0.7827, "step": 907 }, { "epoch": 0.08976101623705608, "grad_norm": 4.371753709971432, "learning_rate": 1.981354979426197e-07, "loss": 0.6607, "step": 908 }, { "epoch": 0.08985987198181054, "grad_norm": 3.8385239013421795, "learning_rate": 1.9812933832085772e-07, "loss": 0.8206, "step": 909 }, { "epoch": 0.089958727726565, "grad_norm": 7.402589332585354, "learning_rate": 1.9812316863737962e-07, "loss": 0.7459, "step": 910 }, { "epoch": 0.09005758347131948, "grad_norm": 2.707285399914594, "learning_rate": 1.9811698889281797e-07, "loss": 0.6618, "step": 911 }, { "epoch": 0.09015643921607394, "grad_norm": 6.550472888640813, "learning_rate": 1.9811079908780646e-07, "loss": 0.7571, "step": 912 }, { "epoch": 0.09025529496082842, "grad_norm": 7.373598063489139, "learning_rate": 1.9810459922297972e-07, "loss": 0.7319, "step": 913 }, { "epoch": 0.09035415070558288, "grad_norm": 4.341690460005673, "learning_rate": 1.9809838929897347e-07, "loss": 0.6778, "step": 914 }, { "epoch": 0.09045300645033734, "grad_norm": 5.548671858522059, "learning_rate": 1.9809216931642446e-07, "loss": 0.8277, "step": 915 }, { "epoch": 0.09055186219509181, "grad_norm": 4.761726693405643, "learning_rate": 1.9808593927597046e-07, "loss": 0.6672, "step": 916 }, { "epoch": 0.09065071793984628, "grad_norm": 3.916484533329558, "learning_rate": 1.9807969917825025e-07, "loss": 0.9155, "step": 917 }, { "epoch": 0.09074957368460075, "grad_norm": 4.3301571325649775, "learning_rate": 1.9807344902390366e-07, "loss": 0.7297, "step": 918 }, { "epoch": 0.09084842942935521, "grad_norm": 4.672323203137857, "learning_rate": 1.9806718881357156e-07, "loss": 0.7816, "step": 919 }, { "epoch": 0.09094728517410967, "grad_norm": 5.98096678247886, "learning_rate": 1.9806091854789583e-07, "loss": 0.731, "step": 920 }, { "epoch": 0.09104614091886415, "grad_norm": 3.7668782708425805, "learning_rate": 1.9805463822751942e-07, "loss": 0.6752, "step": 921 }, { "epoch": 0.09114499666361861, "grad_norm": 4.016819716439341, "learning_rate": 1.9804834785308626e-07, "loss": 0.8044, "step": 922 }, { "epoch": 0.09124385240837309, "grad_norm": 4.757797269891826, "learning_rate": 1.9804204742524133e-07, "loss": 0.7248, "step": 923 }, { "epoch": 0.09134270815312755, "grad_norm": 4.607458782010897, "learning_rate": 1.9803573694463069e-07, "loss": 0.6414, "step": 924 }, { "epoch": 0.09144156389788202, "grad_norm": 4.081323920089845, "learning_rate": 1.9802941641190133e-07, "loss": 0.7212, "step": 925 }, { "epoch": 0.09154041964263648, "grad_norm": 3.2899023716749776, "learning_rate": 1.9802308582770134e-07, "loss": 0.8306, "step": 926 }, { "epoch": 0.09163927538739094, "grad_norm": 4.8897356054291645, "learning_rate": 1.9801674519267987e-07, "loss": 0.8189, "step": 927 }, { "epoch": 0.09173813113214542, "grad_norm": 7.770108855179524, "learning_rate": 1.9801039450748704e-07, "loss": 0.754, "step": 928 }, { "epoch": 0.09183698687689988, "grad_norm": 5.238870306300288, "learning_rate": 1.9800403377277398e-07, "loss": 0.7439, "step": 929 }, { "epoch": 0.09193584262165436, "grad_norm": 20.473114733729616, "learning_rate": 1.9799766298919294e-07, "loss": 0.7405, "step": 930 }, { "epoch": 0.09203469836640882, "grad_norm": 3.010421290904927, "learning_rate": 1.979912821573971e-07, "loss": 0.7703, "step": 931 }, { "epoch": 0.09213355411116328, "grad_norm": 3.905348440801245, "learning_rate": 1.9798489127804077e-07, "loss": 0.7894, "step": 932 }, { "epoch": 0.09223240985591775, "grad_norm": 4.2854173947543535, "learning_rate": 1.979784903517792e-07, "loss": 0.7619, "step": 933 }, { "epoch": 0.09233126560067222, "grad_norm": 3.6045291253242695, "learning_rate": 1.9797207937926873e-07, "loss": 0.815, "step": 934 }, { "epoch": 0.09243012134542669, "grad_norm": 3.022906754604075, "learning_rate": 1.9796565836116671e-07, "loss": 0.7989, "step": 935 }, { "epoch": 0.09252897709018115, "grad_norm": 5.081533686869524, "learning_rate": 1.9795922729813155e-07, "loss": 0.7203, "step": 936 }, { "epoch": 0.09262783283493561, "grad_norm": 4.827018422233551, "learning_rate": 1.9795278619082262e-07, "loss": 0.7713, "step": 937 }, { "epoch": 0.09272668857969009, "grad_norm": 3.2845832350857243, "learning_rate": 1.9794633503990035e-07, "loss": 0.8243, "step": 938 }, { "epoch": 0.09282554432444455, "grad_norm": 6.175907729606399, "learning_rate": 1.9793987384602625e-07, "loss": 0.7173, "step": 939 }, { "epoch": 0.09292440006919903, "grad_norm": 3.198463070244492, "learning_rate": 1.9793340260986277e-07, "loss": 0.7715, "step": 940 }, { "epoch": 0.09302325581395349, "grad_norm": 4.192722546034901, "learning_rate": 1.9792692133207352e-07, "loss": 0.6863, "step": 941 }, { "epoch": 0.09312211155870796, "grad_norm": 4.108045458234193, "learning_rate": 1.97920430013323e-07, "loss": 0.7532, "step": 942 }, { "epoch": 0.09322096730346242, "grad_norm": 4.304530946797021, "learning_rate": 1.9791392865427677e-07, "loss": 0.7793, "step": 943 }, { "epoch": 0.09331982304821689, "grad_norm": 4.691091631388994, "learning_rate": 1.979074172556015e-07, "loss": 0.6875, "step": 944 }, { "epoch": 0.09341867879297136, "grad_norm": 3.6303162997585563, "learning_rate": 1.9790089581796488e-07, "loss": 0.6835, "step": 945 }, { "epoch": 0.09351753453772582, "grad_norm": 4.353557897553249, "learning_rate": 1.9789436434203549e-07, "loss": 0.7362, "step": 946 }, { "epoch": 0.0936163902824803, "grad_norm": 6.087437903762105, "learning_rate": 1.9788782282848308e-07, "loss": 0.9038, "step": 947 }, { "epoch": 0.09371524602723476, "grad_norm": 4.031163558251467, "learning_rate": 1.978812712779784e-07, "loss": 0.8409, "step": 948 }, { "epoch": 0.09381410177198922, "grad_norm": 5.43922746258757, "learning_rate": 1.9787470969119317e-07, "loss": 0.7912, "step": 949 }, { "epoch": 0.0939129575167437, "grad_norm": 3.621040438012115, "learning_rate": 1.9786813806880023e-07, "loss": 0.8181, "step": 950 }, { "epoch": 0.09401181326149816, "grad_norm": 4.241873340293866, "learning_rate": 1.978615564114734e-07, "loss": 0.8378, "step": 951 }, { "epoch": 0.09411066900625263, "grad_norm": 4.035819500055859, "learning_rate": 1.9785496471988755e-07, "loss": 0.7106, "step": 952 }, { "epoch": 0.09420952475100709, "grad_norm": 20.701732290784598, "learning_rate": 1.9784836299471846e-07, "loss": 0.7705, "step": 953 }, { "epoch": 0.09430838049576155, "grad_norm": 3.874751164266666, "learning_rate": 1.9784175123664314e-07, "loss": 0.6765, "step": 954 }, { "epoch": 0.09440723624051603, "grad_norm": 6.588364848570739, "learning_rate": 1.9783512944633953e-07, "loss": 0.7609, "step": 955 }, { "epoch": 0.09450609198527049, "grad_norm": 7.42533836140291, "learning_rate": 1.9782849762448652e-07, "loss": 0.6578, "step": 956 }, { "epoch": 0.09460494773002497, "grad_norm": 3.5094067020374844, "learning_rate": 1.9782185577176418e-07, "loss": 0.8069, "step": 957 }, { "epoch": 0.09470380347477943, "grad_norm": 4.346961361797709, "learning_rate": 1.9781520388885348e-07, "loss": 0.6762, "step": 958 }, { "epoch": 0.09480265921953389, "grad_norm": 9.331837964571536, "learning_rate": 1.978085419764365e-07, "loss": 0.7363, "step": 959 }, { "epoch": 0.09490151496428836, "grad_norm": 5.8697024580756265, "learning_rate": 1.9780187003519635e-07, "loss": 0.7576, "step": 960 }, { "epoch": 0.09500037070904283, "grad_norm": 3.72559139490969, "learning_rate": 1.9779518806581708e-07, "loss": 0.8156, "step": 961 }, { "epoch": 0.0950992264537973, "grad_norm": 7.395256131794705, "learning_rate": 1.9778849606898387e-07, "loss": 0.7383, "step": 962 }, { "epoch": 0.09519808219855176, "grad_norm": 4.948538900812303, "learning_rate": 1.9778179404538285e-07, "loss": 0.8069, "step": 963 }, { "epoch": 0.09529693794330624, "grad_norm": 17.95977751087989, "learning_rate": 1.9777508199570124e-07, "loss": 0.6726, "step": 964 }, { "epoch": 0.0953957936880607, "grad_norm": 5.094379940164772, "learning_rate": 1.9776835992062725e-07, "loss": 0.811, "step": 965 }, { "epoch": 0.09549464943281516, "grad_norm": 5.623995858805992, "learning_rate": 1.9776162782085014e-07, "loss": 0.726, "step": 966 }, { "epoch": 0.09559350517756963, "grad_norm": 4.590338505016363, "learning_rate": 1.977548856970602e-07, "loss": 0.662, "step": 967 }, { "epoch": 0.0956923609223241, "grad_norm": 6.486950103382593, "learning_rate": 1.9774813354994867e-07, "loss": 0.7609, "step": 968 }, { "epoch": 0.09579121666707857, "grad_norm": 3.589315737214571, "learning_rate": 1.97741371380208e-07, "loss": 0.8192, "step": 969 }, { "epoch": 0.09589007241183303, "grad_norm": 3.102030899765386, "learning_rate": 1.9773459918853145e-07, "loss": 0.7638, "step": 970 }, { "epoch": 0.0959889281565875, "grad_norm": 5.934393260945401, "learning_rate": 1.9772781697561343e-07, "loss": 0.7565, "step": 971 }, { "epoch": 0.09608778390134197, "grad_norm": 5.9666292665219505, "learning_rate": 1.9772102474214937e-07, "loss": 0.833, "step": 972 }, { "epoch": 0.09618663964609643, "grad_norm": 3.357070708740453, "learning_rate": 1.977142224888357e-07, "loss": 0.8521, "step": 973 }, { "epoch": 0.0962854953908509, "grad_norm": 10.075695851945303, "learning_rate": 1.9770741021636992e-07, "loss": 0.7401, "step": 974 }, { "epoch": 0.09638435113560537, "grad_norm": 4.373683497144296, "learning_rate": 1.9770058792545048e-07, "loss": 0.6434, "step": 975 }, { "epoch": 0.09648320688035983, "grad_norm": 3.15898742945996, "learning_rate": 1.9769375561677697e-07, "loss": 0.7243, "step": 976 }, { "epoch": 0.0965820626251143, "grad_norm": 4.372271071293006, "learning_rate": 1.9768691329104988e-07, "loss": 0.7819, "step": 977 }, { "epoch": 0.09668091836986877, "grad_norm": 3.739274707747004, "learning_rate": 1.9768006094897083e-07, "loss": 0.7167, "step": 978 }, { "epoch": 0.09677977411462324, "grad_norm": 23.963350151486566, "learning_rate": 1.9767319859124242e-07, "loss": 0.7779, "step": 979 }, { "epoch": 0.0968786298593777, "grad_norm": 4.450571302225443, "learning_rate": 1.9766632621856826e-07, "loss": 0.6823, "step": 980 }, { "epoch": 0.09697748560413218, "grad_norm": 13.873177812407274, "learning_rate": 1.97659443831653e-07, "loss": 0.8032, "step": 981 }, { "epoch": 0.09707634134888664, "grad_norm": 3.051291589998188, "learning_rate": 1.9765255143120239e-07, "loss": 0.7391, "step": 982 }, { "epoch": 0.0971751970936411, "grad_norm": 4.3085709875783555, "learning_rate": 1.976456490179231e-07, "loss": 0.7842, "step": 983 }, { "epoch": 0.09727405283839557, "grad_norm": 6.3181135205734655, "learning_rate": 1.9763873659252285e-07, "loss": 0.744, "step": 984 }, { "epoch": 0.09737290858315004, "grad_norm": 5.805975665942823, "learning_rate": 1.9763181415571046e-07, "loss": 0.6678, "step": 985 }, { "epoch": 0.09747176432790451, "grad_norm": 5.081865261638517, "learning_rate": 1.9762488170819571e-07, "loss": 0.6042, "step": 986 }, { "epoch": 0.09757062007265897, "grad_norm": 4.844748896886957, "learning_rate": 1.976179392506894e-07, "loss": 0.6653, "step": 987 }, { "epoch": 0.09766947581741343, "grad_norm": 9.529115173620644, "learning_rate": 1.976109867839034e-07, "loss": 0.8133, "step": 988 }, { "epoch": 0.09776833156216791, "grad_norm": 2.8890740282775558, "learning_rate": 1.9760402430855055e-07, "loss": 0.6894, "step": 989 }, { "epoch": 0.09786718730692237, "grad_norm": 4.971949484883432, "learning_rate": 1.9759705182534477e-07, "loss": 0.7774, "step": 990 }, { "epoch": 0.09796604305167685, "grad_norm": 4.667810418562867, "learning_rate": 1.9759006933500097e-07, "loss": 0.7886, "step": 991 }, { "epoch": 0.09806489879643131, "grad_norm": 6.427228375156375, "learning_rate": 1.9758307683823515e-07, "loss": 0.6729, "step": 992 }, { "epoch": 0.09816375454118577, "grad_norm": 3.6509465735128206, "learning_rate": 1.9757607433576423e-07, "loss": 0.7593, "step": 993 }, { "epoch": 0.09826261028594024, "grad_norm": 4.230309612816775, "learning_rate": 1.975690618283062e-07, "loss": 0.7646, "step": 994 }, { "epoch": 0.0983614660306947, "grad_norm": 5.906958817561689, "learning_rate": 1.9756203931658017e-07, "loss": 0.7371, "step": 995 }, { "epoch": 0.09846032177544918, "grad_norm": 3.379544638620219, "learning_rate": 1.9755500680130616e-07, "loss": 0.7032, "step": 996 }, { "epoch": 0.09855917752020364, "grad_norm": 3.4021128793021287, "learning_rate": 1.975479642832052e-07, "loss": 0.7351, "step": 997 }, { "epoch": 0.0986580332649581, "grad_norm": 5.234007072288429, "learning_rate": 1.9754091176299945e-07, "loss": 0.8133, "step": 998 }, { "epoch": 0.09875688900971258, "grad_norm": 6.317727173916338, "learning_rate": 1.9753384924141205e-07, "loss": 0.7669, "step": 999 }, { "epoch": 0.09885574475446704, "grad_norm": 6.791241934411815, "learning_rate": 1.975267767191671e-07, "loss": 0.7618, "step": 1000 }, { "epoch": 0.09895460049922152, "grad_norm": 4.089582814858532, "learning_rate": 1.9751969419698986e-07, "loss": 0.7951, "step": 1001 }, { "epoch": 0.09905345624397598, "grad_norm": 3.8297635481480015, "learning_rate": 1.975126016756065e-07, "loss": 0.715, "step": 1002 }, { "epoch": 0.09915231198873045, "grad_norm": 8.339206921821702, "learning_rate": 1.975054991557442e-07, "loss": 0.7943, "step": 1003 }, { "epoch": 0.09925116773348491, "grad_norm": 10.760578096980979, "learning_rate": 1.9749838663813129e-07, "loss": 0.7175, "step": 1004 }, { "epoch": 0.09935002347823937, "grad_norm": 5.979719600999058, "learning_rate": 1.9749126412349704e-07, "loss": 0.8145, "step": 1005 }, { "epoch": 0.09944887922299385, "grad_norm": 16.13788492016263, "learning_rate": 1.9748413161257176e-07, "loss": 0.7253, "step": 1006 }, { "epoch": 0.09954773496774831, "grad_norm": 3.5307258714803593, "learning_rate": 1.9747698910608675e-07, "loss": 0.7169, "step": 1007 }, { "epoch": 0.09964659071250279, "grad_norm": 4.481765994485093, "learning_rate": 1.974698366047744e-07, "loss": 0.7776, "step": 1008 }, { "epoch": 0.09974544645725725, "grad_norm": 8.083205743494124, "learning_rate": 1.9746267410936813e-07, "loss": 0.8545, "step": 1009 }, { "epoch": 0.09984430220201171, "grad_norm": 3.7992316756178535, "learning_rate": 1.9745550162060225e-07, "loss": 0.7355, "step": 1010 }, { "epoch": 0.09994315794676618, "grad_norm": 4.197308047337933, "learning_rate": 1.9744831913921227e-07, "loss": 0.7161, "step": 1011 }, { "epoch": 0.10004201369152065, "grad_norm": 3.783430405720231, "learning_rate": 1.974411266659346e-07, "loss": 0.7179, "step": 1012 }, { "epoch": 0.10014086943627512, "grad_norm": 3.7409822124565397, "learning_rate": 1.974339242015068e-07, "loss": 0.7114, "step": 1013 }, { "epoch": 0.10023972518102958, "grad_norm": 5.843017495574706, "learning_rate": 1.974267117466673e-07, "loss": 0.7258, "step": 1014 }, { "epoch": 0.10033858092578404, "grad_norm": 5.48415231388352, "learning_rate": 1.9741948930215563e-07, "loss": 0.7598, "step": 1015 }, { "epoch": 0.10043743667053852, "grad_norm": 8.916798361030231, "learning_rate": 1.974122568687124e-07, "loss": 0.6695, "step": 1016 }, { "epoch": 0.10053629241529298, "grad_norm": 4.187343714602581, "learning_rate": 1.9740501444707917e-07, "loss": 0.8011, "step": 1017 }, { "epoch": 0.10063514816004746, "grad_norm": 12.0113550947875, "learning_rate": 1.9739776203799852e-07, "loss": 0.7311, "step": 1018 }, { "epoch": 0.10073400390480192, "grad_norm": 12.237979607480492, "learning_rate": 1.973904996422141e-07, "loss": 0.7165, "step": 1019 }, { "epoch": 0.10083285964955639, "grad_norm": 4.433247551889724, "learning_rate": 1.973832272604705e-07, "loss": 0.795, "step": 1020 }, { "epoch": 0.10093171539431085, "grad_norm": 3.640821147666539, "learning_rate": 1.973759448935135e-07, "loss": 0.7325, "step": 1021 }, { "epoch": 0.10103057113906531, "grad_norm": 4.434911511956639, "learning_rate": 1.9736865254208976e-07, "loss": 0.7679, "step": 1022 }, { "epoch": 0.10112942688381979, "grad_norm": 8.195345823076845, "learning_rate": 1.9736135020694697e-07, "loss": 0.6896, "step": 1023 }, { "epoch": 0.10122828262857425, "grad_norm": 4.14381987341804, "learning_rate": 1.973540378888339e-07, "loss": 0.7249, "step": 1024 }, { "epoch": 0.10132713837332873, "grad_norm": 4.445419156526462, "learning_rate": 1.9734671558850028e-07, "loss": 0.838, "step": 1025 }, { "epoch": 0.10142599411808319, "grad_norm": 3.706931421041756, "learning_rate": 1.97339383306697e-07, "loss": 0.7931, "step": 1026 }, { "epoch": 0.10152484986283765, "grad_norm": 4.328332031171043, "learning_rate": 1.973320410441758e-07, "loss": 0.8096, "step": 1027 }, { "epoch": 0.10162370560759212, "grad_norm": 3.702947727662279, "learning_rate": 1.973246888016895e-07, "loss": 0.7362, "step": 1028 }, { "epoch": 0.10172256135234659, "grad_norm": 5.560187698580335, "learning_rate": 1.9731732657999208e-07, "loss": 0.7376, "step": 1029 }, { "epoch": 0.10182141709710106, "grad_norm": 4.331835544794006, "learning_rate": 1.973099543798383e-07, "loss": 0.7909, "step": 1030 }, { "epoch": 0.10192027284185552, "grad_norm": 4.2138603990037495, "learning_rate": 1.973025722019841e-07, "loss": 0.7804, "step": 1031 }, { "epoch": 0.10201912858660998, "grad_norm": 20.163209344267734, "learning_rate": 1.9729518004718647e-07, "loss": 0.8203, "step": 1032 }, { "epoch": 0.10211798433136446, "grad_norm": 5.862920688811912, "learning_rate": 1.9728777791620332e-07, "loss": 0.6981, "step": 1033 }, { "epoch": 0.10221684007611892, "grad_norm": 4.523560242993291, "learning_rate": 1.9728036580979362e-07, "loss": 0.7058, "step": 1034 }, { "epoch": 0.1023156958208734, "grad_norm": 2.980192314030771, "learning_rate": 1.9727294372871745e-07, "loss": 0.7241, "step": 1035 }, { "epoch": 0.10241455156562786, "grad_norm": 4.764962331364637, "learning_rate": 1.9726551167373576e-07, "loss": 0.7144, "step": 1036 }, { "epoch": 0.10251340731038232, "grad_norm": 27.796379601881302, "learning_rate": 1.9725806964561057e-07, "loss": 0.8143, "step": 1037 }, { "epoch": 0.1026122630551368, "grad_norm": 5.138934897207487, "learning_rate": 1.9725061764510502e-07, "loss": 0.7758, "step": 1038 }, { "epoch": 0.10271111879989125, "grad_norm": 9.310032774161082, "learning_rate": 1.972431556729832e-07, "loss": 0.7223, "step": 1039 }, { "epoch": 0.10280997454464573, "grad_norm": 6.657170197698489, "learning_rate": 1.9723568373001018e-07, "loss": 0.6629, "step": 1040 }, { "epoch": 0.10290883028940019, "grad_norm": 2.7418742863363406, "learning_rate": 1.9722820181695212e-07, "loss": 0.831, "step": 1041 }, { "epoch": 0.10300768603415467, "grad_norm": 2.8046324244086795, "learning_rate": 1.972207099345762e-07, "loss": 0.6286, "step": 1042 }, { "epoch": 0.10310654177890913, "grad_norm": 3.513262408718687, "learning_rate": 1.9721320808365056e-07, "loss": 0.6932, "step": 1043 }, { "epoch": 0.10320539752366359, "grad_norm": 3.1478488611088387, "learning_rate": 1.9720569626494442e-07, "loss": 0.6618, "step": 1044 }, { "epoch": 0.10330425326841806, "grad_norm": 6.71008368007407, "learning_rate": 1.9719817447922807e-07, "loss": 0.6924, "step": 1045 }, { "epoch": 0.10340310901317253, "grad_norm": 4.976275748455934, "learning_rate": 1.9719064272727266e-07, "loss": 0.7761, "step": 1046 }, { "epoch": 0.103501964757927, "grad_norm": 4.835436894878733, "learning_rate": 1.9718310100985049e-07, "loss": 0.7491, "step": 1047 }, { "epoch": 0.10360082050268146, "grad_norm": 5.505738276932341, "learning_rate": 1.9717554932773486e-07, "loss": 0.7947, "step": 1048 }, { "epoch": 0.10369967624743592, "grad_norm": 4.650673059137605, "learning_rate": 1.971679876817001e-07, "loss": 0.7621, "step": 1049 }, { "epoch": 0.1037985319921904, "grad_norm": 9.0132398437828, "learning_rate": 1.9716041607252153e-07, "loss": 0.6717, "step": 1050 }, { "epoch": 0.10389738773694486, "grad_norm": 7.210296106423745, "learning_rate": 1.9715283450097552e-07, "loss": 0.6264, "step": 1051 }, { "epoch": 0.10399624348169934, "grad_norm": 2.8208429835220326, "learning_rate": 1.9714524296783942e-07, "loss": 0.6776, "step": 1052 }, { "epoch": 0.1040950992264538, "grad_norm": 4.965030213933701, "learning_rate": 1.9713764147389166e-07, "loss": 0.692, "step": 1053 }, { "epoch": 0.10419395497120826, "grad_norm": 4.266548644806464, "learning_rate": 1.9713003001991168e-07, "loss": 0.7079, "step": 1054 }, { "epoch": 0.10429281071596273, "grad_norm": 3.3866909579638116, "learning_rate": 1.9712240860667984e-07, "loss": 0.701, "step": 1055 }, { "epoch": 0.1043916664607172, "grad_norm": 3.4095031223740775, "learning_rate": 1.9711477723497767e-07, "loss": 0.7018, "step": 1056 }, { "epoch": 0.10449052220547167, "grad_norm": 4.7434332735427605, "learning_rate": 1.9710713590558763e-07, "loss": 0.7954, "step": 1057 }, { "epoch": 0.10458937795022613, "grad_norm": 4.575346164681996, "learning_rate": 1.9709948461929326e-07, "loss": 0.6915, "step": 1058 }, { "epoch": 0.1046882336949806, "grad_norm": 5.294386608138551, "learning_rate": 1.9709182337687902e-07, "loss": 0.6664, "step": 1059 }, { "epoch": 0.10478708943973507, "grad_norm": 5.264900973190541, "learning_rate": 1.9708415217913052e-07, "loss": 0.7531, "step": 1060 }, { "epoch": 0.10488594518448953, "grad_norm": 7.294654766218854, "learning_rate": 1.9707647102683432e-07, "loss": 0.7676, "step": 1061 }, { "epoch": 0.104984800929244, "grad_norm": 3.372747187081616, "learning_rate": 1.9706877992077796e-07, "loss": 0.6362, "step": 1062 }, { "epoch": 0.10508365667399847, "grad_norm": 3.615211759187662, "learning_rate": 1.9706107886175013e-07, "loss": 0.7794, "step": 1063 }, { "epoch": 0.10518251241875294, "grad_norm": 4.326398793083993, "learning_rate": 1.9705336785054037e-07, "loss": 0.7747, "step": 1064 }, { "epoch": 0.1052813681635074, "grad_norm": 6.538676043835103, "learning_rate": 1.970456468879394e-07, "loss": 0.7274, "step": 1065 }, { "epoch": 0.10538022390826186, "grad_norm": 4.547453850034342, "learning_rate": 1.9703791597473884e-07, "loss": 0.6926, "step": 1066 }, { "epoch": 0.10547907965301634, "grad_norm": 3.1263192275847858, "learning_rate": 1.9703017511173144e-07, "loss": 0.7459, "step": 1067 }, { "epoch": 0.1055779353977708, "grad_norm": 3.881137195441936, "learning_rate": 1.9702242429971086e-07, "loss": 0.8003, "step": 1068 }, { "epoch": 0.10567679114252528, "grad_norm": 5.630079794462404, "learning_rate": 1.9701466353947186e-07, "loss": 0.6097, "step": 1069 }, { "epoch": 0.10577564688727974, "grad_norm": 5.949046138484859, "learning_rate": 1.9700689283181017e-07, "loss": 0.6775, "step": 1070 }, { "epoch": 0.1058745026320342, "grad_norm": 3.503929647219076, "learning_rate": 1.9699911217752255e-07, "loss": 0.7771, "step": 1071 }, { "epoch": 0.10597335837678867, "grad_norm": 5.682027771800958, "learning_rate": 1.9699132157740684e-07, "loss": 0.7099, "step": 1072 }, { "epoch": 0.10607221412154313, "grad_norm": 11.623976142033273, "learning_rate": 1.9698352103226187e-07, "loss": 0.6727, "step": 1073 }, { "epoch": 0.10617106986629761, "grad_norm": 6.787581016457171, "learning_rate": 1.9697571054288735e-07, "loss": 0.7595, "step": 1074 }, { "epoch": 0.10626992561105207, "grad_norm": 7.113529088133563, "learning_rate": 1.9696789011008423e-07, "loss": 0.8299, "step": 1075 }, { "epoch": 0.10636878135580653, "grad_norm": 4.674931834142927, "learning_rate": 1.969600597346544e-07, "loss": 0.7377, "step": 1076 }, { "epoch": 0.10646763710056101, "grad_norm": 26.37932469893689, "learning_rate": 1.9695221941740066e-07, "loss": 0.6845, "step": 1077 }, { "epoch": 0.10656649284531547, "grad_norm": 5.917148411684509, "learning_rate": 1.9694436915912695e-07, "loss": 0.8323, "step": 1078 }, { "epoch": 0.10666534859006994, "grad_norm": 3.29157486205495, "learning_rate": 1.9693650896063825e-07, "loss": 0.6998, "step": 1079 }, { "epoch": 0.1067642043348244, "grad_norm": 4.07341156003961, "learning_rate": 1.9692863882274048e-07, "loss": 0.8016, "step": 1080 }, { "epoch": 0.10686306007957888, "grad_norm": 10.674640491187617, "learning_rate": 1.9692075874624054e-07, "loss": 0.7617, "step": 1081 }, { "epoch": 0.10696191582433334, "grad_norm": 14.411437567503732, "learning_rate": 1.9691286873194654e-07, "loss": 0.6477, "step": 1082 }, { "epoch": 0.1070607715690878, "grad_norm": 3.817854280659467, "learning_rate": 1.9690496878066735e-07, "loss": 0.8388, "step": 1083 }, { "epoch": 0.10715962731384228, "grad_norm": 3.787193130907739, "learning_rate": 1.968970588932131e-07, "loss": 0.7989, "step": 1084 }, { "epoch": 0.10725848305859674, "grad_norm": 7.6332037165015, "learning_rate": 1.968891390703948e-07, "loss": 0.812, "step": 1085 }, { "epoch": 0.10735733880335122, "grad_norm": 6.063861616237909, "learning_rate": 1.9688120931302448e-07, "loss": 0.7984, "step": 1086 }, { "epoch": 0.10745619454810568, "grad_norm": 3.3226248446759685, "learning_rate": 1.9687326962191527e-07, "loss": 0.6695, "step": 1087 }, { "epoch": 0.10755505029286014, "grad_norm": 6.291851977946078, "learning_rate": 1.968653199978812e-07, "loss": 0.7984, "step": 1088 }, { "epoch": 0.10765390603761461, "grad_norm": 5.3831950239290824, "learning_rate": 1.9685736044173743e-07, "loss": 0.8153, "step": 1089 }, { "epoch": 0.10775276178236907, "grad_norm": 9.272983922789042, "learning_rate": 1.9684939095430014e-07, "loss": 0.7326, "step": 1090 }, { "epoch": 0.10785161752712355, "grad_norm": 4.655731506191009, "learning_rate": 1.9684141153638644e-07, "loss": 0.6039, "step": 1091 }, { "epoch": 0.10795047327187801, "grad_norm": 4.828917617121873, "learning_rate": 1.9683342218881444e-07, "loss": 0.805, "step": 1092 }, { "epoch": 0.10804932901663247, "grad_norm": 15.430383165763907, "learning_rate": 1.9682542291240342e-07, "loss": 0.6797, "step": 1093 }, { "epoch": 0.10814818476138695, "grad_norm": 4.571745453317567, "learning_rate": 1.9681741370797356e-07, "loss": 0.8237, "step": 1094 }, { "epoch": 0.10824704050614141, "grad_norm": 15.835594030467476, "learning_rate": 1.968093945763461e-07, "loss": 0.7219, "step": 1095 }, { "epoch": 0.10834589625089588, "grad_norm": 3.33590869730009, "learning_rate": 1.9680136551834323e-07, "loss": 0.8101, "step": 1096 }, { "epoch": 0.10844475199565035, "grad_norm": 3.5105968629894577, "learning_rate": 1.9679332653478825e-07, "loss": 0.6862, "step": 1097 }, { "epoch": 0.10854360774040482, "grad_norm": 2.9814034247680836, "learning_rate": 1.9678527762650545e-07, "loss": 0.6918, "step": 1098 }, { "epoch": 0.10864246348515928, "grad_norm": 4.016729235695684, "learning_rate": 1.967772187943201e-07, "loss": 0.692, "step": 1099 }, { "epoch": 0.10874131922991374, "grad_norm": 4.670344606243124, "learning_rate": 1.967691500390585e-07, "loss": 0.7523, "step": 1100 }, { "epoch": 0.10884017497466822, "grad_norm": 3.128074070781782, "learning_rate": 1.9676107136154806e-07, "loss": 0.6762, "step": 1101 }, { "epoch": 0.10893903071942268, "grad_norm": 3.4313740603166893, "learning_rate": 1.967529827626171e-07, "loss": 0.6412, "step": 1102 }, { "epoch": 0.10903788646417716, "grad_norm": 6.4972151871219985, "learning_rate": 1.967448842430949e-07, "loss": 0.6447, "step": 1103 }, { "epoch": 0.10913674220893162, "grad_norm": 4.54849499935191, "learning_rate": 1.9673677580381193e-07, "loss": 0.763, "step": 1104 }, { "epoch": 0.10923559795368608, "grad_norm": 2.867658574634454, "learning_rate": 1.9672865744559956e-07, "loss": 0.6693, "step": 1105 }, { "epoch": 0.10933445369844055, "grad_norm": 3.6273336915466596, "learning_rate": 1.9672052916929025e-07, "loss": 0.709, "step": 1106 }, { "epoch": 0.10943330944319501, "grad_norm": 5.5670507438997765, "learning_rate": 1.967123909757174e-07, "loss": 0.7317, "step": 1107 }, { "epoch": 0.10953216518794949, "grad_norm": 6.583102806759151, "learning_rate": 1.9670424286571544e-07, "loss": 0.7847, "step": 1108 }, { "epoch": 0.10963102093270395, "grad_norm": 3.8170210838420515, "learning_rate": 1.9669608484011985e-07, "loss": 0.7841, "step": 1109 }, { "epoch": 0.10972987667745841, "grad_norm": 4.934154269714187, "learning_rate": 1.9668791689976713e-07, "loss": 0.7749, "step": 1110 }, { "epoch": 0.10982873242221289, "grad_norm": 4.849229288240025, "learning_rate": 1.9667973904549482e-07, "loss": 0.6902, "step": 1111 }, { "epoch": 0.10992758816696735, "grad_norm": 4.461236967449306, "learning_rate": 1.9667155127814136e-07, "loss": 0.7947, "step": 1112 }, { "epoch": 0.11002644391172182, "grad_norm": 4.053317864905526, "learning_rate": 1.9666335359854633e-07, "loss": 0.8277, "step": 1113 }, { "epoch": 0.11012529965647629, "grad_norm": 7.934024393932383, "learning_rate": 1.9665514600755028e-07, "loss": 0.8569, "step": 1114 }, { "epoch": 0.11022415540123075, "grad_norm": 3.5147136229295883, "learning_rate": 1.9664692850599478e-07, "loss": 0.7587, "step": 1115 }, { "epoch": 0.11032301114598522, "grad_norm": 4.480598341900733, "learning_rate": 1.9663870109472242e-07, "loss": 0.723, "step": 1116 }, { "epoch": 0.11042186689073968, "grad_norm": 2.5140330503222277, "learning_rate": 1.9663046377457674e-07, "loss": 0.8137, "step": 1117 }, { "epoch": 0.11052072263549416, "grad_norm": 5.069882928382326, "learning_rate": 1.966222165464024e-07, "loss": 0.8157, "step": 1118 }, { "epoch": 0.11061957838024862, "grad_norm": 6.268607336288659, "learning_rate": 1.9661395941104505e-07, "loss": 0.7637, "step": 1119 }, { "epoch": 0.1107184341250031, "grad_norm": 3.5806802613882396, "learning_rate": 1.966056923693513e-07, "loss": 0.7799, "step": 1120 }, { "epoch": 0.11081728986975756, "grad_norm": 24.2998326666818, "learning_rate": 1.9659741542216886e-07, "loss": 0.753, "step": 1121 }, { "epoch": 0.11091614561451202, "grad_norm": 4.668212902193974, "learning_rate": 1.9658912857034635e-07, "loss": 0.7939, "step": 1122 }, { "epoch": 0.1110150013592665, "grad_norm": 6.077830709044338, "learning_rate": 1.9658083181473352e-07, "loss": 0.6345, "step": 1123 }, { "epoch": 0.11111385710402095, "grad_norm": 11.538505877466205, "learning_rate": 1.9657252515618106e-07, "loss": 0.8116, "step": 1124 }, { "epoch": 0.11121271284877543, "grad_norm": 3.2752053098944023, "learning_rate": 1.9656420859554066e-07, "loss": 0.6048, "step": 1125 }, { "epoch": 0.11131156859352989, "grad_norm": 26.945483711685743, "learning_rate": 1.965558821336651e-07, "loss": 0.7621, "step": 1126 }, { "epoch": 0.11141042433828435, "grad_norm": 3.923797973036838, "learning_rate": 1.9654754577140816e-07, "loss": 0.7232, "step": 1127 }, { "epoch": 0.11150928008303883, "grad_norm": 4.702446306488891, "learning_rate": 1.9653919950962454e-07, "loss": 0.5601, "step": 1128 }, { "epoch": 0.11160813582779329, "grad_norm": 4.083872239507477, "learning_rate": 1.965308433491701e-07, "loss": 0.6972, "step": 1129 }, { "epoch": 0.11170699157254776, "grad_norm": 3.9862494110315883, "learning_rate": 1.9652247729090156e-07, "loss": 0.7132, "step": 1130 }, { "epoch": 0.11180584731730223, "grad_norm": 6.173695704281635, "learning_rate": 1.965141013356768e-07, "loss": 0.8349, "step": 1131 }, { "epoch": 0.11190470306205669, "grad_norm": 12.834819820350553, "learning_rate": 1.9650571548435464e-07, "loss": 0.729, "step": 1132 }, { "epoch": 0.11200355880681116, "grad_norm": 3.7334168286670217, "learning_rate": 1.964973197377949e-07, "loss": 0.7109, "step": 1133 }, { "epoch": 0.11210241455156562, "grad_norm": 3.3256479558716654, "learning_rate": 1.9648891409685847e-07, "loss": 0.7381, "step": 1134 }, { "epoch": 0.1122012702963201, "grad_norm": 3.6595892573805955, "learning_rate": 1.964804985624072e-07, "loss": 0.5925, "step": 1135 }, { "epoch": 0.11230012604107456, "grad_norm": 6.761103959706202, "learning_rate": 1.96472073135304e-07, "loss": 0.7941, "step": 1136 }, { "epoch": 0.11239898178582904, "grad_norm": 4.059086064032219, "learning_rate": 1.9646363781641274e-07, "loss": 0.7364, "step": 1137 }, { "epoch": 0.1124978375305835, "grad_norm": 5.0497379196121726, "learning_rate": 1.9645519260659839e-07, "loss": 0.7695, "step": 1138 }, { "epoch": 0.11259669327533796, "grad_norm": 4.089581892064416, "learning_rate": 1.9644673750672686e-07, "loss": 0.7558, "step": 1139 }, { "epoch": 0.11269554902009243, "grad_norm": 8.131012161746074, "learning_rate": 1.9643827251766507e-07, "loss": 0.7793, "step": 1140 }, { "epoch": 0.1127944047648469, "grad_norm": 5.888324807609808, "learning_rate": 1.96429797640281e-07, "loss": 0.6362, "step": 1141 }, { "epoch": 0.11289326050960137, "grad_norm": 2.552170674412662, "learning_rate": 1.9642131287544363e-07, "loss": 0.6943, "step": 1142 }, { "epoch": 0.11299211625435583, "grad_norm": 6.760296543329767, "learning_rate": 1.964128182240229e-07, "loss": 0.732, "step": 1143 }, { "epoch": 0.11309097199911029, "grad_norm": 6.392655919170905, "learning_rate": 1.964043136868899e-07, "loss": 0.6789, "step": 1144 }, { "epoch": 0.11318982774386477, "grad_norm": 5.2410735783499165, "learning_rate": 1.9639579926491656e-07, "loss": 0.7987, "step": 1145 }, { "epoch": 0.11328868348861923, "grad_norm": 10.23231614927342, "learning_rate": 1.9638727495897599e-07, "loss": 0.8954, "step": 1146 }, { "epoch": 0.1133875392333737, "grad_norm": 3.2265123050826405, "learning_rate": 1.9637874076994215e-07, "loss": 0.7466, "step": 1147 }, { "epoch": 0.11348639497812817, "grad_norm": 5.351436761266222, "learning_rate": 1.9637019669869014e-07, "loss": 0.851, "step": 1148 }, { "epoch": 0.11358525072288263, "grad_norm": 3.2548096976373686, "learning_rate": 1.96361642746096e-07, "loss": 0.7805, "step": 1149 }, { "epoch": 0.1136841064676371, "grad_norm": 6.349500022495833, "learning_rate": 1.9635307891303685e-07, "loss": 0.6832, "step": 1150 }, { "epoch": 0.11378296221239156, "grad_norm": 3.3732504099341245, "learning_rate": 1.9634450520039074e-07, "loss": 0.8051, "step": 1151 }, { "epoch": 0.11388181795714604, "grad_norm": 8.431809111718307, "learning_rate": 1.9633592160903683e-07, "loss": 0.7565, "step": 1152 }, { "epoch": 0.1139806737019005, "grad_norm": 4.046361396728659, "learning_rate": 1.963273281398552e-07, "loss": 0.7323, "step": 1153 }, { "epoch": 0.11407952944665496, "grad_norm": 4.763758884306997, "learning_rate": 1.9631872479372702e-07, "loss": 0.8714, "step": 1154 }, { "epoch": 0.11417838519140944, "grad_norm": 4.297751085871059, "learning_rate": 1.963101115715344e-07, "loss": 0.7141, "step": 1155 }, { "epoch": 0.1142772409361639, "grad_norm": 4.020862265663989, "learning_rate": 1.963014884741605e-07, "loss": 0.781, "step": 1156 }, { "epoch": 0.11437609668091837, "grad_norm": 3.832556709664541, "learning_rate": 1.962928555024895e-07, "loss": 0.6506, "step": 1157 }, { "epoch": 0.11447495242567284, "grad_norm": 4.307343782368354, "learning_rate": 1.962842126574066e-07, "loss": 0.6564, "step": 1158 }, { "epoch": 0.11457380817042731, "grad_norm": 5.2540407056750675, "learning_rate": 1.96275559939798e-07, "loss": 0.7478, "step": 1159 }, { "epoch": 0.11467266391518177, "grad_norm": 15.500103201837033, "learning_rate": 1.9626689735055088e-07, "loss": 0.7886, "step": 1160 }, { "epoch": 0.11477151965993623, "grad_norm": 4.7581293093390356, "learning_rate": 1.9625822489055345e-07, "loss": 0.8076, "step": 1161 }, { "epoch": 0.11487037540469071, "grad_norm": 6.685121039232671, "learning_rate": 1.96249542560695e-07, "loss": 0.7979, "step": 1162 }, { "epoch": 0.11496923114944517, "grad_norm": 26.34103516111553, "learning_rate": 1.9624085036186574e-07, "loss": 0.7069, "step": 1163 }, { "epoch": 0.11506808689419964, "grad_norm": 5.135120025828368, "learning_rate": 1.9623214829495692e-07, "loss": 0.7518, "step": 1164 }, { "epoch": 0.1151669426389541, "grad_norm": 3.823081028889439, "learning_rate": 1.962234363608608e-07, "loss": 0.7241, "step": 1165 }, { "epoch": 0.11526579838370857, "grad_norm": 5.725249390062637, "learning_rate": 1.962147145604707e-07, "loss": 0.8034, "step": 1166 }, { "epoch": 0.11536465412846304, "grad_norm": 4.489484866435147, "learning_rate": 1.962059828946809e-07, "loss": 0.8066, "step": 1167 }, { "epoch": 0.1154635098732175, "grad_norm": 5.279773803730768, "learning_rate": 1.9619724136438663e-07, "loss": 0.7972, "step": 1168 }, { "epoch": 0.11556236561797198, "grad_norm": 7.9119799787410345, "learning_rate": 1.9618848997048433e-07, "loss": 0.8289, "step": 1169 }, { "epoch": 0.11566122136272644, "grad_norm": 5.289202047687702, "learning_rate": 1.9617972871387126e-07, "loss": 0.6869, "step": 1170 }, { "epoch": 0.1157600771074809, "grad_norm": 3.2877272079389375, "learning_rate": 1.9617095759544573e-07, "loss": 0.7334, "step": 1171 }, { "epoch": 0.11585893285223538, "grad_norm": 4.490103626492425, "learning_rate": 1.9616217661610713e-07, "loss": 0.8483, "step": 1172 }, { "epoch": 0.11595778859698984, "grad_norm": 4.656653462676062, "learning_rate": 1.961533857767558e-07, "loss": 0.6926, "step": 1173 }, { "epoch": 0.11605664434174431, "grad_norm": 8.61064428661033, "learning_rate": 1.9614458507829315e-07, "loss": 0.7868, "step": 1174 }, { "epoch": 0.11615550008649878, "grad_norm": 7.2963973902716734, "learning_rate": 1.961357745216215e-07, "loss": 0.7808, "step": 1175 }, { "epoch": 0.11625435583125325, "grad_norm": 5.755811897626709, "learning_rate": 1.961269541076443e-07, "loss": 0.6962, "step": 1176 }, { "epoch": 0.11635321157600771, "grad_norm": 7.9948885433264225, "learning_rate": 1.9611812383726592e-07, "loss": 0.8393, "step": 1177 }, { "epoch": 0.11645206732076217, "grad_norm": 12.525050946062107, "learning_rate": 1.9610928371139178e-07, "loss": 0.5462, "step": 1178 }, { "epoch": 0.11655092306551665, "grad_norm": 3.789658205770461, "learning_rate": 1.961004337309283e-07, "loss": 0.6946, "step": 1179 }, { "epoch": 0.11664977881027111, "grad_norm": 8.036777000989021, "learning_rate": 1.9609157389678294e-07, "loss": 0.7911, "step": 1180 }, { "epoch": 0.11674863455502558, "grad_norm": 4.109307745292568, "learning_rate": 1.960827042098641e-07, "loss": 0.7662, "step": 1181 }, { "epoch": 0.11684749029978005, "grad_norm": 3.4578480041785746, "learning_rate": 1.9607382467108132e-07, "loss": 0.6511, "step": 1182 }, { "epoch": 0.11694634604453451, "grad_norm": 5.209534382648805, "learning_rate": 1.9606493528134496e-07, "loss": 0.62, "step": 1183 }, { "epoch": 0.11704520178928898, "grad_norm": 4.572993872537506, "learning_rate": 1.9605603604156652e-07, "loss": 0.6923, "step": 1184 }, { "epoch": 0.11714405753404344, "grad_norm": 4.026154171424179, "learning_rate": 1.9604712695265858e-07, "loss": 0.7478, "step": 1185 }, { "epoch": 0.11724291327879792, "grad_norm": 5.319915188704291, "learning_rate": 1.9603820801553452e-07, "loss": 0.771, "step": 1186 }, { "epoch": 0.11734176902355238, "grad_norm": 11.436323995444692, "learning_rate": 1.960292792311089e-07, "loss": 0.8073, "step": 1187 }, { "epoch": 0.11744062476830684, "grad_norm": 6.571150718533046, "learning_rate": 1.9602034060029725e-07, "loss": 0.6192, "step": 1188 }, { "epoch": 0.11753948051306132, "grad_norm": 31.254039055780368, "learning_rate": 1.9601139212401604e-07, "loss": 0.8308, "step": 1189 }, { "epoch": 0.11763833625781578, "grad_norm": 3.143021111290879, "learning_rate": 1.9600243380318284e-07, "loss": 0.7133, "step": 1190 }, { "epoch": 0.11773719200257025, "grad_norm": 4.466601954082947, "learning_rate": 1.959934656387162e-07, "loss": 0.7804, "step": 1191 }, { "epoch": 0.11783604774732472, "grad_norm": 3.907789020401191, "learning_rate": 1.9598448763153565e-07, "loss": 0.7534, "step": 1192 }, { "epoch": 0.11793490349207918, "grad_norm": 5.873474795959918, "learning_rate": 1.959754997825618e-07, "loss": 0.7646, "step": 1193 }, { "epoch": 0.11803375923683365, "grad_norm": 5.639852245172317, "learning_rate": 1.959665020927161e-07, "loss": 0.7175, "step": 1194 }, { "epoch": 0.11813261498158811, "grad_norm": 7.206126669642875, "learning_rate": 1.959574945629213e-07, "loss": 0.7889, "step": 1195 }, { "epoch": 0.11823147072634259, "grad_norm": 8.967908879496926, "learning_rate": 1.959484771941009e-07, "loss": 0.7406, "step": 1196 }, { "epoch": 0.11833032647109705, "grad_norm": 11.540403948175356, "learning_rate": 1.9593944998717945e-07, "loss": 0.6326, "step": 1197 }, { "epoch": 0.11842918221585153, "grad_norm": 3.505811829692756, "learning_rate": 1.9593041294308264e-07, "loss": 0.674, "step": 1198 }, { "epoch": 0.11852803796060599, "grad_norm": 8.749859277628525, "learning_rate": 1.9592136606273706e-07, "loss": 0.8097, "step": 1199 }, { "epoch": 0.11862689370536045, "grad_norm": 3.882724256173352, "learning_rate": 1.9591230934707032e-07, "loss": 0.7141, "step": 1200 }, { "epoch": 0.11872574945011492, "grad_norm": 3.5060938007521707, "learning_rate": 1.9590324279701108e-07, "loss": 0.7364, "step": 1201 }, { "epoch": 0.11882460519486938, "grad_norm": 3.605655663471504, "learning_rate": 1.9589416641348894e-07, "loss": 0.7474, "step": 1202 }, { "epoch": 0.11892346093962386, "grad_norm": 6.681771021555283, "learning_rate": 1.958850801974346e-07, "loss": 0.7417, "step": 1203 }, { "epoch": 0.11902231668437832, "grad_norm": 9.673119432920508, "learning_rate": 1.958759841497797e-07, "loss": 0.7792, "step": 1204 }, { "epoch": 0.11912117242913278, "grad_norm": 3.3188697785959342, "learning_rate": 1.9586687827145685e-07, "loss": 0.6227, "step": 1205 }, { "epoch": 0.11922002817388726, "grad_norm": 5.3966291114283615, "learning_rate": 1.958577625633998e-07, "loss": 0.6442, "step": 1206 }, { "epoch": 0.11931888391864172, "grad_norm": 6.217554867578828, "learning_rate": 1.9584863702654318e-07, "loss": 0.8291, "step": 1207 }, { "epoch": 0.1194177396633962, "grad_norm": 13.672791203285747, "learning_rate": 1.958395016618227e-07, "loss": 0.7068, "step": 1208 }, { "epoch": 0.11951659540815066, "grad_norm": 5.09596439942259, "learning_rate": 1.9583035647017507e-07, "loss": 0.6982, "step": 1209 }, { "epoch": 0.11961545115290512, "grad_norm": 4.3332757811370755, "learning_rate": 1.9582120145253797e-07, "loss": 0.9263, "step": 1210 }, { "epoch": 0.11971430689765959, "grad_norm": 5.825728404093789, "learning_rate": 1.9581203660985012e-07, "loss": 0.7653, "step": 1211 }, { "epoch": 0.11981316264241405, "grad_norm": 4.000236758457947, "learning_rate": 1.9580286194305127e-07, "loss": 0.8053, "step": 1212 }, { "epoch": 0.11991201838716853, "grad_norm": 5.650188215307143, "learning_rate": 1.9579367745308214e-07, "loss": 0.6577, "step": 1213 }, { "epoch": 0.12001087413192299, "grad_norm": 5.737539255059414, "learning_rate": 1.9578448314088438e-07, "loss": 0.7967, "step": 1214 }, { "epoch": 0.12010972987667747, "grad_norm": 4.000045582858667, "learning_rate": 1.9577527900740082e-07, "loss": 0.7507, "step": 1215 }, { "epoch": 0.12020858562143193, "grad_norm": 4.333244652692537, "learning_rate": 1.957660650535752e-07, "loss": 0.7736, "step": 1216 }, { "epoch": 0.12030744136618639, "grad_norm": 18.244089493775345, "learning_rate": 1.957568412803522e-07, "loss": 0.7372, "step": 1217 }, { "epoch": 0.12040629711094086, "grad_norm": 4.9138681935681054, "learning_rate": 1.9574760768867772e-07, "loss": 0.783, "step": 1218 }, { "epoch": 0.12050515285569532, "grad_norm": 5.601676307151677, "learning_rate": 1.957383642794984e-07, "loss": 0.7929, "step": 1219 }, { "epoch": 0.1206040086004498, "grad_norm": 12.00338297048234, "learning_rate": 1.9572911105376208e-07, "loss": 0.6742, "step": 1220 }, { "epoch": 0.12070286434520426, "grad_norm": 3.6006908146041647, "learning_rate": 1.957198480124175e-07, "loss": 0.7406, "step": 1221 }, { "epoch": 0.12080172008995872, "grad_norm": 7.016600601553458, "learning_rate": 1.957105751564145e-07, "loss": 0.6779, "step": 1222 }, { "epoch": 0.1209005758347132, "grad_norm": 8.445968365442841, "learning_rate": 1.9570129248670385e-07, "loss": 0.7915, "step": 1223 }, { "epoch": 0.12099943157946766, "grad_norm": 7.743229078307025, "learning_rate": 1.9569200000423734e-07, "loss": 0.7078, "step": 1224 }, { "epoch": 0.12109828732422213, "grad_norm": 5.186175670859474, "learning_rate": 1.956826977099678e-07, "loss": 0.7497, "step": 1225 }, { "epoch": 0.1211971430689766, "grad_norm": 5.832923622225965, "learning_rate": 1.95673385604849e-07, "loss": 0.7432, "step": 1226 }, { "epoch": 0.12129599881373106, "grad_norm": 4.30809855952609, "learning_rate": 1.956640636898358e-07, "loss": 0.6764, "step": 1227 }, { "epoch": 0.12139485455848553, "grad_norm": 4.05714902381824, "learning_rate": 1.9565473196588402e-07, "loss": 0.766, "step": 1228 }, { "epoch": 0.12149371030324, "grad_norm": 16.056362468266308, "learning_rate": 1.9564539043395048e-07, "loss": 0.6746, "step": 1229 }, { "epoch": 0.12159256604799447, "grad_norm": 7.451086901454275, "learning_rate": 1.95636039094993e-07, "loss": 0.7156, "step": 1230 }, { "epoch": 0.12169142179274893, "grad_norm": 2.889269086294448, "learning_rate": 1.9562667794997045e-07, "loss": 0.6457, "step": 1231 }, { "epoch": 0.12179027753750339, "grad_norm": 10.493801474266144, "learning_rate": 1.956173069998427e-07, "loss": 0.754, "step": 1232 }, { "epoch": 0.12188913328225787, "grad_norm": 17.031949300848748, "learning_rate": 1.9560792624557053e-07, "loss": 0.6569, "step": 1233 }, { "epoch": 0.12198798902701233, "grad_norm": 43.709640459001356, "learning_rate": 1.955985356881159e-07, "loss": 0.6453, "step": 1234 }, { "epoch": 0.1220868447717668, "grad_norm": 3.901935284874946, "learning_rate": 1.9558913532844156e-07, "loss": 0.7836, "step": 1235 }, { "epoch": 0.12218570051652126, "grad_norm": 4.378586997239479, "learning_rate": 1.9557972516751142e-07, "loss": 0.7568, "step": 1236 }, { "epoch": 0.12228455626127574, "grad_norm": 4.803563065932561, "learning_rate": 1.955703052062904e-07, "loss": 0.7519, "step": 1237 }, { "epoch": 0.1223834120060302, "grad_norm": 4.247082092429813, "learning_rate": 1.9556087544574432e-07, "loss": 0.7779, "step": 1238 }, { "epoch": 0.12248226775078466, "grad_norm": 4.317158197230976, "learning_rate": 1.955514358868401e-07, "loss": 0.7735, "step": 1239 }, { "epoch": 0.12258112349553914, "grad_norm": 4.209967533409742, "learning_rate": 1.9554198653054562e-07, "loss": 0.8249, "step": 1240 }, { "epoch": 0.1226799792402936, "grad_norm": 6.411781425356489, "learning_rate": 1.9553252737782975e-07, "loss": 0.8235, "step": 1241 }, { "epoch": 0.12277883498504807, "grad_norm": 4.056401182675957, "learning_rate": 1.9552305842966238e-07, "loss": 0.6938, "step": 1242 }, { "epoch": 0.12287769072980254, "grad_norm": 4.901489205430493, "learning_rate": 1.9551357968701447e-07, "loss": 0.7586, "step": 1243 }, { "epoch": 0.122976546474557, "grad_norm": 4.929723099586568, "learning_rate": 1.9550409115085785e-07, "loss": 0.81, "step": 1244 }, { "epoch": 0.12307540221931147, "grad_norm": 4.387900679367985, "learning_rate": 1.954945928221655e-07, "loss": 0.8074, "step": 1245 }, { "epoch": 0.12317425796406593, "grad_norm": 4.701965355419206, "learning_rate": 1.954850847019113e-07, "loss": 0.6603, "step": 1246 }, { "epoch": 0.12327311370882041, "grad_norm": 4.446984597354013, "learning_rate": 1.9547556679107018e-07, "loss": 0.7001, "step": 1247 }, { "epoch": 0.12337196945357487, "grad_norm": 3.6657419025938442, "learning_rate": 1.9546603909061801e-07, "loss": 0.766, "step": 1248 }, { "epoch": 0.12347082519832933, "grad_norm": 4.166094618692891, "learning_rate": 1.9545650160153177e-07, "loss": 0.7549, "step": 1249 }, { "epoch": 0.1235696809430838, "grad_norm": 4.042033462675203, "learning_rate": 1.9544695432478937e-07, "loss": 0.7188, "step": 1250 }, { "epoch": 0.12366853668783827, "grad_norm": 7.930801012743697, "learning_rate": 1.9543739726136977e-07, "loss": 0.7216, "step": 1251 }, { "epoch": 0.12376739243259274, "grad_norm": 3.920476062203573, "learning_rate": 1.9542783041225285e-07, "loss": 0.6874, "step": 1252 }, { "epoch": 0.1238662481773472, "grad_norm": 3.5741685138491532, "learning_rate": 1.954182537784196e-07, "loss": 0.6991, "step": 1253 }, { "epoch": 0.12396510392210168, "grad_norm": 7.031236541691503, "learning_rate": 1.9540866736085193e-07, "loss": 0.7818, "step": 1254 }, { "epoch": 0.12406395966685614, "grad_norm": 5.1624735502316135, "learning_rate": 1.9539907116053282e-07, "loss": 0.6713, "step": 1255 }, { "epoch": 0.1241628154116106, "grad_norm": 3.528913892861073, "learning_rate": 1.953894651784462e-07, "loss": 0.7318, "step": 1256 }, { "epoch": 0.12426167115636508, "grad_norm": 4.1709411560180225, "learning_rate": 1.95379849415577e-07, "loss": 0.8074, "step": 1257 }, { "epoch": 0.12436052690111954, "grad_norm": 4.780207591888047, "learning_rate": 1.9537022387291118e-07, "loss": 0.7489, "step": 1258 }, { "epoch": 0.12445938264587401, "grad_norm": 3.6478405339856987, "learning_rate": 1.9536058855143575e-07, "loss": 0.6573, "step": 1259 }, { "epoch": 0.12455823839062848, "grad_norm": 7.2511622322238365, "learning_rate": 1.9535094345213858e-07, "loss": 0.7967, "step": 1260 }, { "epoch": 0.12465709413538294, "grad_norm": 4.827253777819403, "learning_rate": 1.953412885760087e-07, "loss": 0.6464, "step": 1261 }, { "epoch": 0.12475594988013741, "grad_norm": 5.420568613399631, "learning_rate": 1.953316239240361e-07, "loss": 0.7883, "step": 1262 }, { "epoch": 0.12485480562489187, "grad_norm": 5.02760004568445, "learning_rate": 1.9532194949721168e-07, "loss": 0.8018, "step": 1263 }, { "epoch": 0.12495366136964635, "grad_norm": 8.287705619050639, "learning_rate": 1.9531226529652742e-07, "loss": 0.6981, "step": 1264 }, { "epoch": 0.12505251711440082, "grad_norm": 7.298016156977976, "learning_rate": 1.9530257132297633e-07, "loss": 0.7505, "step": 1265 }, { "epoch": 0.12515137285915529, "grad_norm": 7.29132583981489, "learning_rate": 1.952928675775524e-07, "loss": 0.6865, "step": 1266 }, { "epoch": 0.12525022860390975, "grad_norm": 6.413961032623333, "learning_rate": 1.952831540612505e-07, "loss": 0.7335, "step": 1267 }, { "epoch": 0.1253490843486642, "grad_norm": 3.8741187117853695, "learning_rate": 1.952734307750667e-07, "loss": 0.7474, "step": 1268 }, { "epoch": 0.12544794009341867, "grad_norm": 8.743839635210694, "learning_rate": 1.9526369771999794e-07, "loss": 0.7789, "step": 1269 }, { "epoch": 0.12554679583817316, "grad_norm": 3.448467471337922, "learning_rate": 1.9525395489704225e-07, "loss": 0.8067, "step": 1270 }, { "epoch": 0.12564565158292762, "grad_norm": 3.3729542827377608, "learning_rate": 1.952442023071986e-07, "loss": 0.6878, "step": 1271 }, { "epoch": 0.12574450732768208, "grad_norm": 9.28311289390572, "learning_rate": 1.952344399514669e-07, "loss": 0.747, "step": 1272 }, { "epoch": 0.12584336307243654, "grad_norm": 4.224489912819047, "learning_rate": 1.9522466783084822e-07, "loss": 0.7407, "step": 1273 }, { "epoch": 0.125942218817191, "grad_norm": 7.233095731757951, "learning_rate": 1.952148859463445e-07, "loss": 0.772, "step": 1274 }, { "epoch": 0.1260410745619455, "grad_norm": 7.444100979964925, "learning_rate": 1.9520509429895875e-07, "loss": 0.6453, "step": 1275 }, { "epoch": 0.12613993030669995, "grad_norm": 4.580101184426206, "learning_rate": 1.9519529288969497e-07, "loss": 0.7189, "step": 1276 }, { "epoch": 0.12623878605145442, "grad_norm": 4.401794505660196, "learning_rate": 1.9518548171955812e-07, "loss": 0.7429, "step": 1277 }, { "epoch": 0.12633764179620888, "grad_norm": 4.358764896796247, "learning_rate": 1.951756607895542e-07, "loss": 0.7925, "step": 1278 }, { "epoch": 0.12643649754096334, "grad_norm": 13.740068262295237, "learning_rate": 1.951658301006902e-07, "loss": 0.7283, "step": 1279 }, { "epoch": 0.12653535328571783, "grad_norm": 8.23562103457597, "learning_rate": 1.9515598965397414e-07, "loss": 0.7603, "step": 1280 }, { "epoch": 0.1266342090304723, "grad_norm": 3.542544199280855, "learning_rate": 1.9514613945041498e-07, "loss": 0.7438, "step": 1281 }, { "epoch": 0.12673306477522675, "grad_norm": 9.481413761723882, "learning_rate": 1.9513627949102274e-07, "loss": 0.7024, "step": 1282 }, { "epoch": 0.1268319205199812, "grad_norm": 3.7241120862635064, "learning_rate": 1.9512640977680838e-07, "loss": 0.8008, "step": 1283 }, { "epoch": 0.12693077626473567, "grad_norm": 4.568118807965242, "learning_rate": 1.951165303087839e-07, "loss": 0.7146, "step": 1284 }, { "epoch": 0.12702963200949016, "grad_norm": 7.024757136288451, "learning_rate": 1.951066410879623e-07, "loss": 0.7448, "step": 1285 }, { "epoch": 0.12712848775424462, "grad_norm": 12.191197392329478, "learning_rate": 1.9509674211535764e-07, "loss": 0.738, "step": 1286 }, { "epoch": 0.12722734349899908, "grad_norm": 7.947132890685516, "learning_rate": 1.9508683339198478e-07, "loss": 0.6649, "step": 1287 }, { "epoch": 0.12732619924375355, "grad_norm": 7.606792519329367, "learning_rate": 1.950769149188598e-07, "loss": 0.8307, "step": 1288 }, { "epoch": 0.127425054988508, "grad_norm": 15.465219342300303, "learning_rate": 1.950669866969997e-07, "loss": 0.7658, "step": 1289 }, { "epoch": 0.1275239107332625, "grad_norm": 5.7248459454171785, "learning_rate": 1.9505704872742243e-07, "loss": 0.8787, "step": 1290 }, { "epoch": 0.12762276647801696, "grad_norm": 7.820474639484991, "learning_rate": 1.9504710101114702e-07, "loss": 0.788, "step": 1291 }, { "epoch": 0.12772162222277142, "grad_norm": 7.184047667138028, "learning_rate": 1.9503714354919344e-07, "loss": 0.6762, "step": 1292 }, { "epoch": 0.12782047796752588, "grad_norm": 14.44474925276693, "learning_rate": 1.9502717634258268e-07, "loss": 0.7012, "step": 1293 }, { "epoch": 0.12791933371228034, "grad_norm": 5.897747577191385, "learning_rate": 1.9501719939233673e-07, "loss": 0.7559, "step": 1294 }, { "epoch": 0.12801818945703483, "grad_norm": 4.667608603366718, "learning_rate": 1.9500721269947859e-07, "loss": 0.6939, "step": 1295 }, { "epoch": 0.1281170452017893, "grad_norm": 3.248058468807248, "learning_rate": 1.9499721626503224e-07, "loss": 0.6804, "step": 1296 }, { "epoch": 0.12821590094654375, "grad_norm": 3.934328935106518, "learning_rate": 1.9498721009002268e-07, "loss": 0.8116, "step": 1297 }, { "epoch": 0.12831475669129822, "grad_norm": 6.735125656589037, "learning_rate": 1.949771941754759e-07, "loss": 0.7175, "step": 1298 }, { "epoch": 0.12841361243605268, "grad_norm": 3.259338332639882, "learning_rate": 1.9496716852241882e-07, "loss": 0.7276, "step": 1299 }, { "epoch": 0.12851246818080717, "grad_norm": 6.795060294357565, "learning_rate": 1.949571331318795e-07, "loss": 0.785, "step": 1300 }, { "epoch": 0.12861132392556163, "grad_norm": 4.924199099151521, "learning_rate": 1.949470880048869e-07, "loss": 0.6504, "step": 1301 }, { "epoch": 0.1287101796703161, "grad_norm": 5.795101902559786, "learning_rate": 1.94937033142471e-07, "loss": 0.7422, "step": 1302 }, { "epoch": 0.12880903541507055, "grad_norm": 5.807133416286141, "learning_rate": 1.9492696854566275e-07, "loss": 0.8002, "step": 1303 }, { "epoch": 0.12890789115982504, "grad_norm": 3.088621678409238, "learning_rate": 1.9491689421549417e-07, "loss": 0.7295, "step": 1304 }, { "epoch": 0.1290067469045795, "grad_norm": 9.41964476544067, "learning_rate": 1.9490681015299822e-07, "loss": 0.7674, "step": 1305 }, { "epoch": 0.12910560264933396, "grad_norm": 3.5569244825065462, "learning_rate": 1.9489671635920884e-07, "loss": 0.675, "step": 1306 }, { "epoch": 0.12920445839408842, "grad_norm": 7.4171468871707775, "learning_rate": 1.94886612835161e-07, "loss": 0.7885, "step": 1307 }, { "epoch": 0.12930331413884288, "grad_norm": 9.286194158144225, "learning_rate": 1.948764995818907e-07, "loss": 0.7028, "step": 1308 }, { "epoch": 0.12940216988359737, "grad_norm": 4.175889275207011, "learning_rate": 1.9486637660043493e-07, "loss": 0.8338, "step": 1309 }, { "epoch": 0.12950102562835183, "grad_norm": 18.35857279811028, "learning_rate": 1.9485624389183158e-07, "loss": 0.8069, "step": 1310 }, { "epoch": 0.1295998813731063, "grad_norm": 4.5818825508176255, "learning_rate": 1.9484610145711967e-07, "loss": 0.7754, "step": 1311 }, { "epoch": 0.12969873711786076, "grad_norm": 4.697021104375583, "learning_rate": 1.9483594929733912e-07, "loss": 0.6822, "step": 1312 }, { "epoch": 0.12979759286261522, "grad_norm": 7.372009666775526, "learning_rate": 1.9482578741353093e-07, "loss": 0.7255, "step": 1313 }, { "epoch": 0.1298964486073697, "grad_norm": 5.221355706547645, "learning_rate": 1.9481561580673696e-07, "loss": 0.8832, "step": 1314 }, { "epoch": 0.12999530435212417, "grad_norm": 5.399428139333168, "learning_rate": 1.9480543447800027e-07, "loss": 0.7226, "step": 1315 }, { "epoch": 0.13009416009687863, "grad_norm": 4.513574027608461, "learning_rate": 1.9479524342836473e-07, "loss": 0.8458, "step": 1316 }, { "epoch": 0.1301930158416331, "grad_norm": 5.108713985120417, "learning_rate": 1.947850426588753e-07, "loss": 0.684, "step": 1317 }, { "epoch": 0.13029187158638755, "grad_norm": 5.775529312253205, "learning_rate": 1.9477483217057795e-07, "loss": 0.7664, "step": 1318 }, { "epoch": 0.13039072733114204, "grad_norm": 4.794664882056166, "learning_rate": 1.9476461196451957e-07, "loss": 0.7883, "step": 1319 }, { "epoch": 0.1304895830758965, "grad_norm": 3.8488207081371977, "learning_rate": 1.947543820417481e-07, "loss": 0.7113, "step": 1320 }, { "epoch": 0.13058843882065096, "grad_norm": 3.559217896659162, "learning_rate": 1.947441424033125e-07, "loss": 0.6441, "step": 1321 }, { "epoch": 0.13068729456540543, "grad_norm": 4.149228042199806, "learning_rate": 1.9473389305026265e-07, "loss": 0.7678, "step": 1322 }, { "epoch": 0.1307861503101599, "grad_norm": 4.012179470123882, "learning_rate": 1.947236339836495e-07, "loss": 0.743, "step": 1323 }, { "epoch": 0.13088500605491438, "grad_norm": 3.776544306964287, "learning_rate": 1.9471336520452495e-07, "loss": 0.774, "step": 1324 }, { "epoch": 0.13098386179966884, "grad_norm": 3.720322182859924, "learning_rate": 1.947030867139419e-07, "loss": 0.7474, "step": 1325 }, { "epoch": 0.1310827175444233, "grad_norm": 3.424130844307389, "learning_rate": 1.9469279851295433e-07, "loss": 0.7309, "step": 1326 }, { "epoch": 0.13118157328917776, "grad_norm": 3.0943503359180076, "learning_rate": 1.9468250060261707e-07, "loss": 0.8612, "step": 1327 }, { "epoch": 0.13128042903393222, "grad_norm": 4.377012185510415, "learning_rate": 1.9467219298398602e-07, "loss": 0.6473, "step": 1328 }, { "epoch": 0.1313792847786867, "grad_norm": 23.455664801101783, "learning_rate": 1.946618756581181e-07, "loss": 0.8614, "step": 1329 }, { "epoch": 0.13147814052344117, "grad_norm": 4.235341487808135, "learning_rate": 1.9465154862607118e-07, "loss": 0.819, "step": 1330 }, { "epoch": 0.13157699626819563, "grad_norm": 3.8407259304836727, "learning_rate": 1.9464121188890417e-07, "loss": 0.6649, "step": 1331 }, { "epoch": 0.1316758520129501, "grad_norm": 36.76002447001568, "learning_rate": 1.9463086544767693e-07, "loss": 0.7403, "step": 1332 }, { "epoch": 0.13177470775770456, "grad_norm": 4.624763226917082, "learning_rate": 1.9462050930345035e-07, "loss": 0.8033, "step": 1333 }, { "epoch": 0.13187356350245905, "grad_norm": 4.048662439547007, "learning_rate": 1.946101434572863e-07, "loss": 0.7541, "step": 1334 }, { "epoch": 0.1319724192472135, "grad_norm": 7.5842227498787524, "learning_rate": 1.945997679102476e-07, "loss": 0.7668, "step": 1335 }, { "epoch": 0.13207127499196797, "grad_norm": 3.587197425998407, "learning_rate": 1.945893826633982e-07, "loss": 0.8553, "step": 1336 }, { "epoch": 0.13217013073672243, "grad_norm": 3.7058427650000274, "learning_rate": 1.9457898771780287e-07, "loss": 0.7371, "step": 1337 }, { "epoch": 0.1322689864814769, "grad_norm": 7.291052153888168, "learning_rate": 1.945685830745275e-07, "loss": 0.7215, "step": 1338 }, { "epoch": 0.13236784222623138, "grad_norm": 5.482840831478433, "learning_rate": 1.9455816873463892e-07, "loss": 0.7551, "step": 1339 }, { "epoch": 0.13246669797098584, "grad_norm": 10.130599407742041, "learning_rate": 1.9454774469920495e-07, "loss": 0.7321, "step": 1340 }, { "epoch": 0.1325655537157403, "grad_norm": 4.930751449113531, "learning_rate": 1.9453731096929445e-07, "loss": 0.6529, "step": 1341 }, { "epoch": 0.13266440946049476, "grad_norm": 4.539557532683274, "learning_rate": 1.9452686754597721e-07, "loss": 0.7042, "step": 1342 }, { "epoch": 0.13276326520524925, "grad_norm": 4.280443475694957, "learning_rate": 1.945164144303241e-07, "loss": 0.8114, "step": 1343 }, { "epoch": 0.13286212095000371, "grad_norm": 3.6569330611824262, "learning_rate": 1.945059516234069e-07, "loss": 0.6973, "step": 1344 }, { "epoch": 0.13296097669475818, "grad_norm": 6.584574166707988, "learning_rate": 1.944954791262984e-07, "loss": 0.7838, "step": 1345 }, { "epoch": 0.13305983243951264, "grad_norm": 29.24873544355493, "learning_rate": 1.9448499694007245e-07, "loss": 0.6982, "step": 1346 }, { "epoch": 0.1331586881842671, "grad_norm": 4.75546331336948, "learning_rate": 1.944745050658038e-07, "loss": 0.7532, "step": 1347 }, { "epoch": 0.1332575439290216, "grad_norm": 7.023118645785252, "learning_rate": 1.9446400350456829e-07, "loss": 0.7718, "step": 1348 }, { "epoch": 0.13335639967377605, "grad_norm": 3.7200800511128365, "learning_rate": 1.944534922574426e-07, "loss": 0.7079, "step": 1349 }, { "epoch": 0.1334552554185305, "grad_norm": 3.0702870115940817, "learning_rate": 1.9444297132550461e-07, "loss": 0.6964, "step": 1350 }, { "epoch": 0.13355411116328497, "grad_norm": 4.1721816190056975, "learning_rate": 1.9443244070983305e-07, "loss": 0.7392, "step": 1351 }, { "epoch": 0.13365296690803943, "grad_norm": 3.777911982959137, "learning_rate": 1.9442190041150764e-07, "loss": 0.7297, "step": 1352 }, { "epoch": 0.13375182265279392, "grad_norm": 5.648955675901979, "learning_rate": 1.9441135043160916e-07, "loss": 0.7971, "step": 1353 }, { "epoch": 0.13385067839754838, "grad_norm": 6.236030497699051, "learning_rate": 1.9440079077121937e-07, "loss": 0.7811, "step": 1354 }, { "epoch": 0.13394953414230285, "grad_norm": 3.615837981281405, "learning_rate": 1.9439022143142103e-07, "loss": 0.6834, "step": 1355 }, { "epoch": 0.1340483898870573, "grad_norm": 5.560580172868369, "learning_rate": 1.9437964241329778e-07, "loss": 0.7275, "step": 1356 }, { "epoch": 0.13414724563181177, "grad_norm": 3.621762819330606, "learning_rate": 1.9436905371793445e-07, "loss": 0.7102, "step": 1357 }, { "epoch": 0.13424610137656626, "grad_norm": 4.660293085117622, "learning_rate": 1.9435845534641666e-07, "loss": 0.7777, "step": 1358 }, { "epoch": 0.13434495712132072, "grad_norm": 7.63569243960629, "learning_rate": 1.9434784729983118e-07, "loss": 0.779, "step": 1359 }, { "epoch": 0.13444381286607518, "grad_norm": 3.3268529024897826, "learning_rate": 1.9433722957926567e-07, "loss": 0.7834, "step": 1360 }, { "epoch": 0.13454266861082964, "grad_norm": 29.68526888014553, "learning_rate": 1.9432660218580886e-07, "loss": 0.793, "step": 1361 }, { "epoch": 0.1346415243555841, "grad_norm": 4.506316873453273, "learning_rate": 1.9431596512055043e-07, "loss": 0.7301, "step": 1362 }, { "epoch": 0.1347403801003386, "grad_norm": 6.193933803275632, "learning_rate": 1.94305318384581e-07, "loss": 0.7319, "step": 1363 }, { "epoch": 0.13483923584509305, "grad_norm": 9.47378067408924, "learning_rate": 1.9429466197899228e-07, "loss": 0.7472, "step": 1364 }, { "epoch": 0.13493809158984751, "grad_norm": 4.180190812584948, "learning_rate": 1.942839959048769e-07, "loss": 0.7279, "step": 1365 }, { "epoch": 0.13503694733460198, "grad_norm": 21.98513867260318, "learning_rate": 1.9427332016332855e-07, "loss": 0.6663, "step": 1366 }, { "epoch": 0.13513580307935644, "grad_norm": 11.708253059193414, "learning_rate": 1.9426263475544186e-07, "loss": 0.7114, "step": 1367 }, { "epoch": 0.13523465882411093, "grad_norm": 4.573183701005416, "learning_rate": 1.9425193968231244e-07, "loss": 0.7566, "step": 1368 }, { "epoch": 0.1353335145688654, "grad_norm": 3.8387281049553135, "learning_rate": 1.942412349450369e-07, "loss": 0.7859, "step": 1369 }, { "epoch": 0.13543237031361985, "grad_norm": 4.8122996739465815, "learning_rate": 1.9423052054471288e-07, "loss": 0.7044, "step": 1370 }, { "epoch": 0.1355312260583743, "grad_norm": 22.411899130703777, "learning_rate": 1.9421979648243897e-07, "loss": 0.8109, "step": 1371 }, { "epoch": 0.13563008180312877, "grad_norm": 4.3781717748279885, "learning_rate": 1.942090627593148e-07, "loss": 0.7535, "step": 1372 }, { "epoch": 0.13572893754788326, "grad_norm": 5.687653360861449, "learning_rate": 1.941983193764409e-07, "loss": 0.7418, "step": 1373 }, { "epoch": 0.13582779329263772, "grad_norm": 4.35297259949213, "learning_rate": 1.9418756633491887e-07, "loss": 0.7388, "step": 1374 }, { "epoch": 0.13592664903739218, "grad_norm": 6.152921087589322, "learning_rate": 1.9417680363585128e-07, "loss": 0.8501, "step": 1375 }, { "epoch": 0.13602550478214664, "grad_norm": 2.92710831127592, "learning_rate": 1.9416603128034167e-07, "loss": 0.864, "step": 1376 }, { "epoch": 0.1361243605269011, "grad_norm": 4.941607333415102, "learning_rate": 1.9415524926949463e-07, "loss": 0.7149, "step": 1377 }, { "epoch": 0.1362232162716556, "grad_norm": 3.9649950358114263, "learning_rate": 1.9414445760441564e-07, "loss": 0.6256, "step": 1378 }, { "epoch": 0.13632207201641006, "grad_norm": 4.650852239190141, "learning_rate": 1.9413365628621125e-07, "loss": 0.6982, "step": 1379 }, { "epoch": 0.13642092776116452, "grad_norm": 4.219773324319354, "learning_rate": 1.94122845315989e-07, "loss": 0.7725, "step": 1380 }, { "epoch": 0.13651978350591898, "grad_norm": 3.1555974278550107, "learning_rate": 1.9411202469485736e-07, "loss": 0.7742, "step": 1381 }, { "epoch": 0.13661863925067347, "grad_norm": 4.738023842130687, "learning_rate": 1.9410119442392582e-07, "loss": 0.7889, "step": 1382 }, { "epoch": 0.13671749499542793, "grad_norm": 3.944258295157563, "learning_rate": 1.9409035450430491e-07, "loss": 0.8146, "step": 1383 }, { "epoch": 0.1368163507401824, "grad_norm": 4.38271913121012, "learning_rate": 1.9407950493710606e-07, "loss": 0.7929, "step": 1384 }, { "epoch": 0.13691520648493685, "grad_norm": 4.2414662942654315, "learning_rate": 1.9406864572344176e-07, "loss": 0.8052, "step": 1385 }, { "epoch": 0.1370140622296913, "grad_norm": 4.252615987101041, "learning_rate": 1.9405777686442546e-07, "loss": 0.7845, "step": 1386 }, { "epoch": 0.1371129179744458, "grad_norm": 18.199539468330546, "learning_rate": 1.9404689836117156e-07, "loss": 0.7206, "step": 1387 }, { "epoch": 0.13721177371920026, "grad_norm": 6.907778440197796, "learning_rate": 1.9403601021479555e-07, "loss": 0.6843, "step": 1388 }, { "epoch": 0.13731062946395473, "grad_norm": 5.896600317169673, "learning_rate": 1.940251124264138e-07, "loss": 0.7344, "step": 1389 }, { "epoch": 0.1374094852087092, "grad_norm": 10.008726338069106, "learning_rate": 1.9401420499714376e-07, "loss": 0.821, "step": 1390 }, { "epoch": 0.13750834095346365, "grad_norm": 3.0136658479596257, "learning_rate": 1.940032879281038e-07, "loss": 0.7144, "step": 1391 }, { "epoch": 0.13760719669821814, "grad_norm": 5.456440668372255, "learning_rate": 1.939923612204133e-07, "loss": 0.7217, "step": 1392 }, { "epoch": 0.1377060524429726, "grad_norm": 6.020398618263541, "learning_rate": 1.9398142487519263e-07, "loss": 0.6423, "step": 1393 }, { "epoch": 0.13780490818772706, "grad_norm": 5.960336313331038, "learning_rate": 1.9397047889356316e-07, "loss": 0.7137, "step": 1394 }, { "epoch": 0.13790376393248152, "grad_norm": 17.50382780907154, "learning_rate": 1.9395952327664727e-07, "loss": 0.7425, "step": 1395 }, { "epoch": 0.13800261967723598, "grad_norm": 3.780528294448486, "learning_rate": 1.939485580255683e-07, "loss": 0.673, "step": 1396 }, { "epoch": 0.13810147542199047, "grad_norm": 13.140642357908101, "learning_rate": 1.9393758314145047e-07, "loss": 0.7024, "step": 1397 }, { "epoch": 0.13820033116674493, "grad_norm": 3.777844836239173, "learning_rate": 1.939265986254192e-07, "loss": 0.7478, "step": 1398 }, { "epoch": 0.1382991869114994, "grad_norm": 15.98678841525936, "learning_rate": 1.9391560447860077e-07, "loss": 0.74, "step": 1399 }, { "epoch": 0.13839804265625386, "grad_norm": 5.041189944991649, "learning_rate": 1.939046007021224e-07, "loss": 0.7324, "step": 1400 }, { "epoch": 0.13849689840100832, "grad_norm": 3.4559382821560436, "learning_rate": 1.9389358729711246e-07, "loss": 0.7291, "step": 1401 }, { "epoch": 0.1385957541457628, "grad_norm": 11.125760399271332, "learning_rate": 1.9388256426470017e-07, "loss": 0.7538, "step": 1402 }, { "epoch": 0.13869460989051727, "grad_norm": 3.430411631490619, "learning_rate": 1.9387153160601578e-07, "loss": 0.7266, "step": 1403 }, { "epoch": 0.13879346563527173, "grad_norm": 11.993985952886646, "learning_rate": 1.9386048932219052e-07, "loss": 0.8122, "step": 1404 }, { "epoch": 0.1388923213800262, "grad_norm": 15.290296896549313, "learning_rate": 1.938494374143566e-07, "loss": 0.8529, "step": 1405 }, { "epoch": 0.13899117712478065, "grad_norm": 19.109306753259112, "learning_rate": 1.938383758836473e-07, "loss": 0.7385, "step": 1406 }, { "epoch": 0.13909003286953514, "grad_norm": 3.615065507727475, "learning_rate": 1.9382730473119672e-07, "loss": 0.6506, "step": 1407 }, { "epoch": 0.1391888886142896, "grad_norm": 9.528671558582328, "learning_rate": 1.9381622395814012e-07, "loss": 0.6654, "step": 1408 }, { "epoch": 0.13928774435904406, "grad_norm": 13.771324013398806, "learning_rate": 1.9380513356561362e-07, "loss": 0.7969, "step": 1409 }, { "epoch": 0.13938660010379852, "grad_norm": 4.0698850042679195, "learning_rate": 1.9379403355475442e-07, "loss": 0.7935, "step": 1410 }, { "epoch": 0.13948545584855299, "grad_norm": 6.2220305786063035, "learning_rate": 1.9378292392670063e-07, "loss": 0.7663, "step": 1411 }, { "epoch": 0.13958431159330748, "grad_norm": 3.37158315674799, "learning_rate": 1.9377180468259135e-07, "loss": 0.8319, "step": 1412 }, { "epoch": 0.13968316733806194, "grad_norm": 13.23424261209968, "learning_rate": 1.9376067582356677e-07, "loss": 0.7455, "step": 1413 }, { "epoch": 0.1397820230828164, "grad_norm": 6.0262709041883875, "learning_rate": 1.9374953735076795e-07, "loss": 0.7564, "step": 1414 }, { "epoch": 0.13988087882757086, "grad_norm": 5.440792351084578, "learning_rate": 1.9373838926533695e-07, "loss": 0.6601, "step": 1415 }, { "epoch": 0.13997973457232532, "grad_norm": 15.160546657396278, "learning_rate": 1.937272315684169e-07, "loss": 0.6886, "step": 1416 }, { "epoch": 0.1400785903170798, "grad_norm": 3.340986150286348, "learning_rate": 1.937160642611518e-07, "loss": 0.6758, "step": 1417 }, { "epoch": 0.14017744606183427, "grad_norm": 6.371982665642472, "learning_rate": 1.9370488734468676e-07, "loss": 0.7167, "step": 1418 }, { "epoch": 0.14027630180658873, "grad_norm": 5.038146212194647, "learning_rate": 1.9369370082016775e-07, "loss": 0.7466, "step": 1419 }, { "epoch": 0.1403751575513432, "grad_norm": 4.100900260420734, "learning_rate": 1.9368250468874176e-07, "loss": 0.8302, "step": 1420 }, { "epoch": 0.14047401329609768, "grad_norm": 5.450668542794107, "learning_rate": 1.9367129895155685e-07, "loss": 0.8322, "step": 1421 }, { "epoch": 0.14057286904085214, "grad_norm": 4.662044682651495, "learning_rate": 1.93660083609762e-07, "loss": 0.7804, "step": 1422 }, { "epoch": 0.1406717247856066, "grad_norm": 4.505313939666244, "learning_rate": 1.9364885866450714e-07, "loss": 0.7709, "step": 1423 }, { "epoch": 0.14077058053036107, "grad_norm": 4.722296370589035, "learning_rate": 1.9363762411694326e-07, "loss": 0.8197, "step": 1424 }, { "epoch": 0.14086943627511553, "grad_norm": 5.457639767685148, "learning_rate": 1.9362637996822228e-07, "loss": 0.6688, "step": 1425 }, { "epoch": 0.14096829201987002, "grad_norm": 3.4443641149801154, "learning_rate": 1.936151262194971e-07, "loss": 0.736, "step": 1426 }, { "epoch": 0.14106714776462448, "grad_norm": 5.124536878684874, "learning_rate": 1.9360386287192165e-07, "loss": 0.7431, "step": 1427 }, { "epoch": 0.14116600350937894, "grad_norm": 7.542218355017893, "learning_rate": 1.9359258992665084e-07, "loss": 0.8147, "step": 1428 }, { "epoch": 0.1412648592541334, "grad_norm": 3.9881518022981357, "learning_rate": 1.9358130738484052e-07, "loss": 0.7174, "step": 1429 }, { "epoch": 0.14136371499888786, "grad_norm": 6.720187345525028, "learning_rate": 1.935700152476475e-07, "loss": 0.7843, "step": 1430 }, { "epoch": 0.14146257074364235, "grad_norm": 4.6374020964250375, "learning_rate": 1.9355871351622972e-07, "loss": 0.7268, "step": 1431 }, { "epoch": 0.1415614264883968, "grad_norm": 4.37549976521285, "learning_rate": 1.9354740219174595e-07, "loss": 0.7189, "step": 1432 }, { "epoch": 0.14166028223315127, "grad_norm": 5.254152100489806, "learning_rate": 1.93536081275356e-07, "loss": 0.6686, "step": 1433 }, { "epoch": 0.14175913797790574, "grad_norm": 4.319821875227706, "learning_rate": 1.935247507682207e-07, "loss": 0.7172, "step": 1434 }, { "epoch": 0.1418579937226602, "grad_norm": 4.358075376224312, "learning_rate": 1.9351341067150173e-07, "loss": 0.8069, "step": 1435 }, { "epoch": 0.1419568494674147, "grad_norm": 6.293810939166785, "learning_rate": 1.9350206098636194e-07, "loss": 0.7603, "step": 1436 }, { "epoch": 0.14205570521216915, "grad_norm": 11.54639304270749, "learning_rate": 1.9349070171396507e-07, "loss": 0.6856, "step": 1437 }, { "epoch": 0.1421545609569236, "grad_norm": 3.3547288186254423, "learning_rate": 1.934793328554758e-07, "loss": 0.6903, "step": 1438 }, { "epoch": 0.14225341670167807, "grad_norm": 4.728455184453007, "learning_rate": 1.9346795441205986e-07, "loss": 0.8374, "step": 1439 }, { "epoch": 0.14235227244643253, "grad_norm": 4.084892295794369, "learning_rate": 1.9345656638488399e-07, "loss": 0.6927, "step": 1440 }, { "epoch": 0.14245112819118702, "grad_norm": 8.311491093754027, "learning_rate": 1.934451687751158e-07, "loss": 0.7855, "step": 1441 }, { "epoch": 0.14254998393594148, "grad_norm": 8.293275287486674, "learning_rate": 1.9343376158392394e-07, "loss": 0.7438, "step": 1442 }, { "epoch": 0.14264883968069594, "grad_norm": 7.223429305970848, "learning_rate": 1.934223448124781e-07, "loss": 0.7357, "step": 1443 }, { "epoch": 0.1427476954254504, "grad_norm": 3.0730419976838195, "learning_rate": 1.9341091846194885e-07, "loss": 0.7995, "step": 1444 }, { "epoch": 0.14284655117020487, "grad_norm": 8.761012232056142, "learning_rate": 1.9339948253350783e-07, "loss": 0.7657, "step": 1445 }, { "epoch": 0.14294540691495936, "grad_norm": 3.6141645075230033, "learning_rate": 1.9338803702832763e-07, "loss": 0.755, "step": 1446 }, { "epoch": 0.14304426265971382, "grad_norm": 3.415139603183968, "learning_rate": 1.933765819475818e-07, "loss": 0.6291, "step": 1447 }, { "epoch": 0.14314311840446828, "grad_norm": 6.521962319653034, "learning_rate": 1.9336511729244486e-07, "loss": 0.7549, "step": 1448 }, { "epoch": 0.14324197414922274, "grad_norm": 4.339468239543043, "learning_rate": 1.9335364306409242e-07, "loss": 0.757, "step": 1449 }, { "epoch": 0.1433408298939772, "grad_norm": 4.558189990777735, "learning_rate": 1.9334215926370092e-07, "loss": 0.8108, "step": 1450 }, { "epoch": 0.1434396856387317, "grad_norm": 6.979782870282327, "learning_rate": 1.933306658924479e-07, "loss": 0.828, "step": 1451 }, { "epoch": 0.14353854138348615, "grad_norm": 3.9402975878148365, "learning_rate": 1.933191629515118e-07, "loss": 0.6948, "step": 1452 }, { "epoch": 0.1436373971282406, "grad_norm": 3.696961671807679, "learning_rate": 1.9330765044207208e-07, "loss": 0.724, "step": 1453 }, { "epoch": 0.14373625287299507, "grad_norm": 3.879991912958697, "learning_rate": 1.9329612836530922e-07, "loss": 0.8277, "step": 1454 }, { "epoch": 0.14383510861774954, "grad_norm": 4.932418438866404, "learning_rate": 1.932845967224046e-07, "loss": 0.7401, "step": 1455 }, { "epoch": 0.14393396436250402, "grad_norm": 5.101493830846892, "learning_rate": 1.9327305551454062e-07, "loss": 0.7159, "step": 1456 }, { "epoch": 0.14403282010725849, "grad_norm": 3.837683939972634, "learning_rate": 1.932615047429007e-07, "loss": 0.7344, "step": 1457 }, { "epoch": 0.14413167585201295, "grad_norm": 6.568224750773238, "learning_rate": 1.9324994440866915e-07, "loss": 0.7199, "step": 1458 }, { "epoch": 0.1442305315967674, "grad_norm": 8.17601519218967, "learning_rate": 1.9323837451303135e-07, "loss": 0.5969, "step": 1459 }, { "epoch": 0.1443293873415219, "grad_norm": 4.019112032465348, "learning_rate": 1.932267950571736e-07, "loss": 0.8197, "step": 1460 }, { "epoch": 0.14442824308627636, "grad_norm": 3.65502413426619, "learning_rate": 1.9321520604228318e-07, "loss": 0.7397, "step": 1461 }, { "epoch": 0.14452709883103082, "grad_norm": 3.3740961649192167, "learning_rate": 1.9320360746954844e-07, "loss": 0.7374, "step": 1462 }, { "epoch": 0.14462595457578528, "grad_norm": 3.3393584905667497, "learning_rate": 1.931919993401586e-07, "loss": 0.6669, "step": 1463 }, { "epoch": 0.14472481032053974, "grad_norm": 4.82517060544772, "learning_rate": 1.9318038165530388e-07, "loss": 0.6037, "step": 1464 }, { "epoch": 0.14482366606529423, "grad_norm": 2.9205808105180444, "learning_rate": 1.9316875441617554e-07, "loss": 0.7483, "step": 1465 }, { "epoch": 0.1449225218100487, "grad_norm": 3.1539630004781527, "learning_rate": 1.9315711762396578e-07, "loss": 0.8731, "step": 1466 }, { "epoch": 0.14502137755480315, "grad_norm": 5.034377152047869, "learning_rate": 1.9314547127986777e-07, "loss": 0.726, "step": 1467 }, { "epoch": 0.14512023329955762, "grad_norm": 13.836598060302233, "learning_rate": 1.9313381538507566e-07, "loss": 0.7733, "step": 1468 }, { "epoch": 0.14521908904431208, "grad_norm": 5.585324995373478, "learning_rate": 1.9312214994078463e-07, "loss": 0.6232, "step": 1469 }, { "epoch": 0.14531794478906657, "grad_norm": 3.383043496983211, "learning_rate": 1.9311047494819074e-07, "loss": 0.7256, "step": 1470 }, { "epoch": 0.14541680053382103, "grad_norm": 3.489694879061595, "learning_rate": 1.9309879040849115e-07, "loss": 0.8566, "step": 1471 }, { "epoch": 0.1455156562785755, "grad_norm": 3.6263275502866494, "learning_rate": 1.930870963228839e-07, "loss": 0.6735, "step": 1472 }, { "epoch": 0.14561451202332995, "grad_norm": 3.553057924550956, "learning_rate": 1.9307539269256804e-07, "loss": 0.7887, "step": 1473 }, { "epoch": 0.1457133677680844, "grad_norm": 123.46346679141459, "learning_rate": 1.9306367951874364e-07, "loss": 0.6818, "step": 1474 }, { "epoch": 0.1458122235128389, "grad_norm": 3.0945519996571327, "learning_rate": 1.9305195680261165e-07, "loss": 0.7427, "step": 1475 }, { "epoch": 0.14591107925759336, "grad_norm": 4.051772402279348, "learning_rate": 1.9304022454537413e-07, "loss": 0.7294, "step": 1476 }, { "epoch": 0.14600993500234782, "grad_norm": 9.35186747196895, "learning_rate": 1.9302848274823402e-07, "loss": 0.7408, "step": 1477 }, { "epoch": 0.14610879074710229, "grad_norm": 15.35706687457826, "learning_rate": 1.930167314123953e-07, "loss": 0.6714, "step": 1478 }, { "epoch": 0.14620764649185675, "grad_norm": 11.86036047702726, "learning_rate": 1.9300497053906283e-07, "loss": 0.6643, "step": 1479 }, { "epoch": 0.14630650223661124, "grad_norm": 6.749478052614757, "learning_rate": 1.9299320012944257e-07, "loss": 0.6805, "step": 1480 }, { "epoch": 0.1464053579813657, "grad_norm": 3.283058326494834, "learning_rate": 1.9298142018474137e-07, "loss": 0.7011, "step": 1481 }, { "epoch": 0.14650421372612016, "grad_norm": 3.412368370680551, "learning_rate": 1.929696307061671e-07, "loss": 0.8198, "step": 1482 }, { "epoch": 0.14660306947087462, "grad_norm": 4.289625409739477, "learning_rate": 1.9295783169492862e-07, "loss": 0.6152, "step": 1483 }, { "epoch": 0.14670192521562908, "grad_norm": 5.013028713005321, "learning_rate": 1.9294602315223569e-07, "loss": 0.6591, "step": 1484 }, { "epoch": 0.14680078096038357, "grad_norm": 4.962834750584621, "learning_rate": 1.9293420507929917e-07, "loss": 0.7536, "step": 1485 }, { "epoch": 0.14689963670513803, "grad_norm": 7.863441490118767, "learning_rate": 1.9292237747733076e-07, "loss": 0.7886, "step": 1486 }, { "epoch": 0.1469984924498925, "grad_norm": 4.451697087553442, "learning_rate": 1.9291054034754325e-07, "loss": 0.6923, "step": 1487 }, { "epoch": 0.14709734819464695, "grad_norm": 14.3244214071126, "learning_rate": 1.928986936911504e-07, "loss": 0.7917, "step": 1488 }, { "epoch": 0.14719620393940142, "grad_norm": 4.285114122989727, "learning_rate": 1.928868375093668e-07, "loss": 0.7854, "step": 1489 }, { "epoch": 0.1472950596841559, "grad_norm": 3.8766697100583647, "learning_rate": 1.9287497180340825e-07, "loss": 0.7583, "step": 1490 }, { "epoch": 0.14739391542891037, "grad_norm": 4.544588849791243, "learning_rate": 1.9286309657449128e-07, "loss": 0.7193, "step": 1491 }, { "epoch": 0.14749277117366483, "grad_norm": 3.51320475948321, "learning_rate": 1.9285121182383365e-07, "loss": 0.7396, "step": 1492 }, { "epoch": 0.1475916269184193, "grad_norm": 5.222921275181644, "learning_rate": 1.9283931755265383e-07, "loss": 0.6992, "step": 1493 }, { "epoch": 0.14769048266317375, "grad_norm": 5.5037023775055856, "learning_rate": 1.9282741376217152e-07, "loss": 0.6968, "step": 1494 }, { "epoch": 0.14778933840792824, "grad_norm": 4.752937492371211, "learning_rate": 1.9281550045360722e-07, "loss": 0.7051, "step": 1495 }, { "epoch": 0.1478881941526827, "grad_norm": 5.757977579453367, "learning_rate": 1.928035776281825e-07, "loss": 0.7373, "step": 1496 }, { "epoch": 0.14798704989743716, "grad_norm": 4.95884198986748, "learning_rate": 1.927916452871198e-07, "loss": 0.7491, "step": 1497 }, { "epoch": 0.14808590564219162, "grad_norm": 4.769487633549527, "learning_rate": 1.9277970343164266e-07, "loss": 0.7371, "step": 1498 }, { "epoch": 0.1481847613869461, "grad_norm": 7.233728705859902, "learning_rate": 1.9276775206297552e-07, "loss": 0.74, "step": 1499 }, { "epoch": 0.14828361713170057, "grad_norm": 5.446996892682441, "learning_rate": 1.9275579118234387e-07, "loss": 0.7054, "step": 1500 }, { "epoch": 0.14838247287645503, "grad_norm": 3.758225553458617, "learning_rate": 1.9274382079097403e-07, "loss": 0.69, "step": 1501 }, { "epoch": 0.1484813286212095, "grad_norm": 8.33796855242717, "learning_rate": 1.9273184089009344e-07, "loss": 0.6461, "step": 1502 }, { "epoch": 0.14858018436596396, "grad_norm": 4.742184176616951, "learning_rate": 1.927198514809305e-07, "loss": 0.7053, "step": 1503 }, { "epoch": 0.14867904011071845, "grad_norm": 4.817529328880994, "learning_rate": 1.9270785256471446e-07, "loss": 0.7569, "step": 1504 }, { "epoch": 0.1487778958554729, "grad_norm": 25.59893032923366, "learning_rate": 1.9269584414267567e-07, "loss": 0.6292, "step": 1505 }, { "epoch": 0.14887675160022737, "grad_norm": 6.168159052469597, "learning_rate": 1.9268382621604545e-07, "loss": 0.7788, "step": 1506 }, { "epoch": 0.14897560734498183, "grad_norm": 8.692739381097766, "learning_rate": 1.9267179878605603e-07, "loss": 0.7796, "step": 1507 }, { "epoch": 0.1490744630897363, "grad_norm": 3.4118822461261553, "learning_rate": 1.9265976185394067e-07, "loss": 0.856, "step": 1508 }, { "epoch": 0.14917331883449078, "grad_norm": 5.427039541739763, "learning_rate": 1.9264771542093354e-07, "loss": 0.8505, "step": 1509 }, { "epoch": 0.14927217457924524, "grad_norm": 4.782109353730399, "learning_rate": 1.9263565948826983e-07, "loss": 0.792, "step": 1510 }, { "epoch": 0.1493710303239997, "grad_norm": 10.940986123839895, "learning_rate": 1.9262359405718574e-07, "loss": 0.712, "step": 1511 }, { "epoch": 0.14946988606875417, "grad_norm": 3.3706384552712527, "learning_rate": 1.9261151912891834e-07, "loss": 0.7021, "step": 1512 }, { "epoch": 0.14956874181350863, "grad_norm": 4.445894250883976, "learning_rate": 1.925994347047058e-07, "loss": 0.7552, "step": 1513 }, { "epoch": 0.14966759755826312, "grad_norm": 4.126808956205197, "learning_rate": 1.9258734078578716e-07, "loss": 0.8587, "step": 1514 }, { "epoch": 0.14976645330301758, "grad_norm": 3.8608640421775666, "learning_rate": 1.9257523737340246e-07, "loss": 0.7259, "step": 1515 }, { "epoch": 0.14986530904777204, "grad_norm": 7.999588573156191, "learning_rate": 1.9256312446879277e-07, "loss": 0.6297, "step": 1516 }, { "epoch": 0.1499641647925265, "grad_norm": 3.728283848292913, "learning_rate": 1.9255100207320008e-07, "loss": 0.821, "step": 1517 }, { "epoch": 0.15006302053728096, "grad_norm": 6.998036941205037, "learning_rate": 1.925388701878673e-07, "loss": 0.592, "step": 1518 }, { "epoch": 0.15016187628203545, "grad_norm": 3.3605830413765383, "learning_rate": 1.925267288140385e-07, "loss": 0.7186, "step": 1519 }, { "epoch": 0.1502607320267899, "grad_norm": 30.00060491495836, "learning_rate": 1.925145779529585e-07, "loss": 0.6854, "step": 1520 }, { "epoch": 0.15035958777154437, "grad_norm": 6.970179356784423, "learning_rate": 1.925024176058732e-07, "loss": 0.8279, "step": 1521 }, { "epoch": 0.15045844351629883, "grad_norm": 4.775717585131605, "learning_rate": 1.9249024777402947e-07, "loss": 0.7929, "step": 1522 }, { "epoch": 0.1505572992610533, "grad_norm": 4.841735468332748, "learning_rate": 1.924780684586752e-07, "loss": 0.6897, "step": 1523 }, { "epoch": 0.15065615500580778, "grad_norm": 10.12166633802942, "learning_rate": 1.9246587966105912e-07, "loss": 0.8085, "step": 1524 }, { "epoch": 0.15075501075056225, "grad_norm": 3.076304912750501, "learning_rate": 1.9245368138243103e-07, "loss": 0.7482, "step": 1525 }, { "epoch": 0.1508538664953167, "grad_norm": 5.192137743877308, "learning_rate": 1.9244147362404175e-07, "loss": 0.6607, "step": 1526 }, { "epoch": 0.15095272224007117, "grad_norm": 3.714455601447199, "learning_rate": 1.9242925638714293e-07, "loss": 0.7035, "step": 1527 }, { "epoch": 0.15105157798482563, "grad_norm": 8.812304574468612, "learning_rate": 1.924170296729873e-07, "loss": 0.7764, "step": 1528 }, { "epoch": 0.15115043372958012, "grad_norm": 19.51234441335062, "learning_rate": 1.9240479348282852e-07, "loss": 0.6953, "step": 1529 }, { "epoch": 0.15124928947433458, "grad_norm": 9.573115054523573, "learning_rate": 1.9239254781792126e-07, "loss": 0.6851, "step": 1530 }, { "epoch": 0.15134814521908904, "grad_norm": 4.528486059217281, "learning_rate": 1.923802926795211e-07, "loss": 0.7348, "step": 1531 }, { "epoch": 0.1514470009638435, "grad_norm": 3.6997505126599233, "learning_rate": 1.923680280688846e-07, "loss": 0.6958, "step": 1532 }, { "epoch": 0.15154585670859796, "grad_norm": 6.591169756644878, "learning_rate": 1.9235575398726937e-07, "loss": 0.8258, "step": 1533 }, { "epoch": 0.15164471245335245, "grad_norm": 4.495197279001063, "learning_rate": 1.923434704359339e-07, "loss": 0.7023, "step": 1534 }, { "epoch": 0.15174356819810692, "grad_norm": 28.017614527403055, "learning_rate": 1.9233117741613772e-07, "loss": 0.8394, "step": 1535 }, { "epoch": 0.15184242394286138, "grad_norm": 3.4415132322601054, "learning_rate": 1.9231887492914126e-07, "loss": 0.8196, "step": 1536 }, { "epoch": 0.15194127968761584, "grad_norm": 3.739787890220457, "learning_rate": 1.9230656297620598e-07, "loss": 0.8061, "step": 1537 }, { "epoch": 0.15204013543237033, "grad_norm": 4.915846055808366, "learning_rate": 1.922942415585943e-07, "loss": 0.7406, "step": 1538 }, { "epoch": 0.1521389911771248, "grad_norm": 4.676451066090141, "learning_rate": 1.9228191067756955e-07, "loss": 0.8032, "step": 1539 }, { "epoch": 0.15223784692187925, "grad_norm": 3.2699820157528845, "learning_rate": 1.9226957033439613e-07, "loss": 0.6667, "step": 1540 }, { "epoch": 0.1523367026666337, "grad_norm": 8.0037158709571, "learning_rate": 1.9225722053033935e-07, "loss": 0.7365, "step": 1541 }, { "epoch": 0.15243555841138817, "grad_norm": 5.200785865474845, "learning_rate": 1.922448612666655e-07, "loss": 0.7661, "step": 1542 }, { "epoch": 0.15253441415614266, "grad_norm": 10.832139319705773, "learning_rate": 1.9223249254464182e-07, "loss": 0.6156, "step": 1543 }, { "epoch": 0.15263326990089712, "grad_norm": 26.746517272549674, "learning_rate": 1.9222011436553657e-07, "loss": 0.8328, "step": 1544 }, { "epoch": 0.15273212564565158, "grad_norm": 8.07201581679297, "learning_rate": 1.922077267306189e-07, "loss": 0.7071, "step": 1545 }, { "epoch": 0.15283098139040605, "grad_norm": 4.093842586030663, "learning_rate": 1.9219532964115903e-07, "loss": 0.7305, "step": 1546 }, { "epoch": 0.1529298371351605, "grad_norm": 4.04528849417612, "learning_rate": 1.921829230984281e-07, "loss": 0.7083, "step": 1547 }, { "epoch": 0.153028692879915, "grad_norm": 11.103611031027844, "learning_rate": 1.9217050710369819e-07, "loss": 0.5863, "step": 1548 }, { "epoch": 0.15312754862466946, "grad_norm": 4.510210195277667, "learning_rate": 1.921580816582424e-07, "loss": 0.8053, "step": 1549 }, { "epoch": 0.15322640436942392, "grad_norm": 3.6354685042425436, "learning_rate": 1.9214564676333473e-07, "loss": 0.7931, "step": 1550 }, { "epoch": 0.15332526011417838, "grad_norm": 5.2470931357245565, "learning_rate": 1.9213320242025024e-07, "loss": 0.7342, "step": 1551 }, { "epoch": 0.15342411585893284, "grad_norm": 4.4800205466208975, "learning_rate": 1.921207486302649e-07, "loss": 0.7264, "step": 1552 }, { "epoch": 0.15352297160368733, "grad_norm": 4.638358945962189, "learning_rate": 1.921082853946557e-07, "loss": 0.7836, "step": 1553 }, { "epoch": 0.1536218273484418, "grad_norm": 4.953240349748287, "learning_rate": 1.920958127147005e-07, "loss": 0.7612, "step": 1554 }, { "epoch": 0.15372068309319625, "grad_norm": 7.3873423954196245, "learning_rate": 1.920833305916782e-07, "loss": 0.7931, "step": 1555 }, { "epoch": 0.15381953883795071, "grad_norm": 6.4053236563326275, "learning_rate": 1.920708390268687e-07, "loss": 0.7211, "step": 1556 }, { "epoch": 0.15391839458270518, "grad_norm": 13.711925204296382, "learning_rate": 1.9205833802155277e-07, "loss": 0.7521, "step": 1557 }, { "epoch": 0.15401725032745966, "grad_norm": 3.8896622519805866, "learning_rate": 1.9204582757701227e-07, "loss": 0.7583, "step": 1558 }, { "epoch": 0.15411610607221413, "grad_norm": 3.9378413737509548, "learning_rate": 1.920333076945299e-07, "loss": 0.7857, "step": 1559 }, { "epoch": 0.1542149618169686, "grad_norm": 6.230453223577076, "learning_rate": 1.9202077837538942e-07, "loss": 0.7138, "step": 1560 }, { "epoch": 0.15431381756172305, "grad_norm": 5.595915633650135, "learning_rate": 1.920082396208755e-07, "loss": 0.8164, "step": 1561 }, { "epoch": 0.1544126733064775, "grad_norm": 4.335010447520082, "learning_rate": 1.9199569143227383e-07, "loss": 0.6981, "step": 1562 }, { "epoch": 0.154511529051232, "grad_norm": 4.837310489335801, "learning_rate": 1.9198313381087106e-07, "loss": 0.7022, "step": 1563 }, { "epoch": 0.15461038479598646, "grad_norm": 3.723694717252675, "learning_rate": 1.9197056675795473e-07, "loss": 0.8161, "step": 1564 }, { "epoch": 0.15470924054074092, "grad_norm": 3.0896086055418004, "learning_rate": 1.9195799027481345e-07, "loss": 0.6625, "step": 1565 }, { "epoch": 0.15480809628549538, "grad_norm": 5.706000179632502, "learning_rate": 1.9194540436273675e-07, "loss": 0.7478, "step": 1566 }, { "epoch": 0.15490695203024984, "grad_norm": 4.757753899528588, "learning_rate": 1.9193280902301515e-07, "loss": 0.6686, "step": 1567 }, { "epoch": 0.15500580777500433, "grad_norm": 7.555489303026925, "learning_rate": 1.9192020425694005e-07, "loss": 0.7547, "step": 1568 }, { "epoch": 0.1551046635197588, "grad_norm": 5.913143870141827, "learning_rate": 1.9190759006580392e-07, "loss": 0.6286, "step": 1569 }, { "epoch": 0.15520351926451326, "grad_norm": 3.9572313883594523, "learning_rate": 1.918949664509002e-07, "loss": 0.7156, "step": 1570 }, { "epoch": 0.15530237500926772, "grad_norm": 4.595135162380234, "learning_rate": 1.9188233341352317e-07, "loss": 0.6948, "step": 1571 }, { "epoch": 0.15540123075402218, "grad_norm": 3.5878831274198566, "learning_rate": 1.9186969095496824e-07, "loss": 0.7328, "step": 1572 }, { "epoch": 0.15550008649877667, "grad_norm": 3.5136528565110976, "learning_rate": 1.9185703907653165e-07, "loss": 0.7224, "step": 1573 }, { "epoch": 0.15559894224353113, "grad_norm": 6.66415885347148, "learning_rate": 1.9184437777951073e-07, "loss": 0.6755, "step": 1574 }, { "epoch": 0.1556977979882856, "grad_norm": 25.6448053053244, "learning_rate": 1.918317070652036e-07, "loss": 0.7521, "step": 1575 }, { "epoch": 0.15579665373304005, "grad_norm": 5.882500317835107, "learning_rate": 1.918190269349096e-07, "loss": 0.6761, "step": 1576 }, { "epoch": 0.15589550947779454, "grad_norm": 3.382166802221458, "learning_rate": 1.9180633738992878e-07, "loss": 0.7754, "step": 1577 }, { "epoch": 0.155994365222549, "grad_norm": 5.114046320581695, "learning_rate": 1.917936384315623e-07, "loss": 0.6556, "step": 1578 }, { "epoch": 0.15609322096730346, "grad_norm": 3.1270215643475088, "learning_rate": 1.9178093006111224e-07, "loss": 0.7537, "step": 1579 }, { "epoch": 0.15619207671205793, "grad_norm": 3.8883365808600017, "learning_rate": 1.9176821227988168e-07, "loss": 0.7954, "step": 1580 }, { "epoch": 0.1562909324568124, "grad_norm": 15.712779667378834, "learning_rate": 1.9175548508917462e-07, "loss": 0.6948, "step": 1581 }, { "epoch": 0.15638978820156688, "grad_norm": 6.563607788816714, "learning_rate": 1.9174274849029609e-07, "loss": 0.7526, "step": 1582 }, { "epoch": 0.15648864394632134, "grad_norm": 7.311172566957364, "learning_rate": 1.9173000248455194e-07, "loss": 0.7823, "step": 1583 }, { "epoch": 0.1565874996910758, "grad_norm": 3.7912846543923515, "learning_rate": 1.9171724707324918e-07, "loss": 0.7991, "step": 1584 }, { "epoch": 0.15668635543583026, "grad_norm": 6.994261486897858, "learning_rate": 1.9170448225769565e-07, "loss": 0.6608, "step": 1585 }, { "epoch": 0.15678521118058472, "grad_norm": 11.569582566961419, "learning_rate": 1.916917080392002e-07, "loss": 0.6759, "step": 1586 }, { "epoch": 0.1568840669253392, "grad_norm": 3.8445801879424657, "learning_rate": 1.9167892441907267e-07, "loss": 0.7497, "step": 1587 }, { "epoch": 0.15698292267009367, "grad_norm": 5.184131630721037, "learning_rate": 1.9166613139862378e-07, "loss": 0.735, "step": 1588 }, { "epoch": 0.15708177841484813, "grad_norm": 3.918117482127153, "learning_rate": 1.9165332897916527e-07, "loss": 0.7695, "step": 1589 }, { "epoch": 0.1571806341596026, "grad_norm": 3.8725308640037426, "learning_rate": 1.9164051716200986e-07, "loss": 0.7177, "step": 1590 }, { "epoch": 0.15727948990435706, "grad_norm": 8.465956009455192, "learning_rate": 1.916276959484712e-07, "loss": 0.7368, "step": 1591 }, { "epoch": 0.15737834564911155, "grad_norm": 10.21865027205561, "learning_rate": 1.9161486533986395e-07, "loss": 0.6694, "step": 1592 }, { "epoch": 0.157477201393866, "grad_norm": 3.5848187313447126, "learning_rate": 1.9160202533750365e-07, "loss": 0.6904, "step": 1593 }, { "epoch": 0.15757605713862047, "grad_norm": 4.13282462410451, "learning_rate": 1.915891759427069e-07, "loss": 0.6973, "step": 1594 }, { "epoch": 0.15767491288337493, "grad_norm": 3.2674453884883508, "learning_rate": 1.9157631715679118e-07, "loss": 0.6815, "step": 1595 }, { "epoch": 0.1577737686281294, "grad_norm": 9.801519868663762, "learning_rate": 1.91563448981075e-07, "loss": 0.6501, "step": 1596 }, { "epoch": 0.15787262437288388, "grad_norm": 12.151408104070974, "learning_rate": 1.9155057141687775e-07, "loss": 0.663, "step": 1597 }, { "epoch": 0.15797148011763834, "grad_norm": 4.795163950429591, "learning_rate": 1.9153768446551988e-07, "loss": 0.7831, "step": 1598 }, { "epoch": 0.1580703358623928, "grad_norm": 3.755113794429627, "learning_rate": 1.9152478812832275e-07, "loss": 0.6873, "step": 1599 }, { "epoch": 0.15816919160714726, "grad_norm": 3.2739515091938864, "learning_rate": 1.9151188240660866e-07, "loss": 0.7983, "step": 1600 }, { "epoch": 0.15826804735190173, "grad_norm": 3.902641688922389, "learning_rate": 1.9149896730170093e-07, "loss": 0.7201, "step": 1601 }, { "epoch": 0.15836690309665621, "grad_norm": 6.579513457620932, "learning_rate": 1.9148604281492385e-07, "loss": 0.6073, "step": 1602 }, { "epoch": 0.15846575884141068, "grad_norm": 4.172848937097197, "learning_rate": 1.9147310894760255e-07, "loss": 0.7022, "step": 1603 }, { "epoch": 0.15856461458616514, "grad_norm": 8.265687288687849, "learning_rate": 1.9146016570106327e-07, "loss": 0.6278, "step": 1604 }, { "epoch": 0.1586634703309196, "grad_norm": 5.051956218935498, "learning_rate": 1.9144721307663312e-07, "loss": 0.7233, "step": 1605 }, { "epoch": 0.15876232607567406, "grad_norm": 6.259930465096014, "learning_rate": 1.9143425107564023e-07, "loss": 0.7696, "step": 1606 }, { "epoch": 0.15886118182042855, "grad_norm": 2.7816100900528777, "learning_rate": 1.9142127969941362e-07, "loss": 0.6445, "step": 1607 }, { "epoch": 0.158960037565183, "grad_norm": 5.267354432560429, "learning_rate": 1.9140829894928333e-07, "loss": 0.79, "step": 1608 }, { "epoch": 0.15905889330993747, "grad_norm": 4.9039292217091575, "learning_rate": 1.9139530882658036e-07, "loss": 0.7976, "step": 1609 }, { "epoch": 0.15915774905469193, "grad_norm": 6.234482401868283, "learning_rate": 1.9138230933263666e-07, "loss": 0.8292, "step": 1610 }, { "epoch": 0.1592566047994464, "grad_norm": 9.876803248541771, "learning_rate": 1.9136930046878512e-07, "loss": 0.7732, "step": 1611 }, { "epoch": 0.15935546054420088, "grad_norm": 11.311176703503378, "learning_rate": 1.913562822363596e-07, "loss": 0.7693, "step": 1612 }, { "epoch": 0.15945431628895534, "grad_norm": 3.829842522592581, "learning_rate": 1.913432546366949e-07, "loss": 0.7361, "step": 1613 }, { "epoch": 0.1595531720337098, "grad_norm": 10.03421192101163, "learning_rate": 1.913302176711269e-07, "loss": 0.729, "step": 1614 }, { "epoch": 0.15965202777846427, "grad_norm": 3.3359775978053126, "learning_rate": 1.9131717134099228e-07, "loss": 0.7775, "step": 1615 }, { "epoch": 0.15975088352321876, "grad_norm": 3.4509300400139495, "learning_rate": 1.9130411564762876e-07, "loss": 0.7704, "step": 1616 }, { "epoch": 0.15984973926797322, "grad_norm": 24.308432247845968, "learning_rate": 1.91291050592375e-07, "loss": 0.6567, "step": 1617 }, { "epoch": 0.15994859501272768, "grad_norm": 4.877329188251267, "learning_rate": 1.9127797617657065e-07, "loss": 0.7461, "step": 1618 }, { "epoch": 0.16004745075748214, "grad_norm": 6.982641347396236, "learning_rate": 1.912648924015563e-07, "loss": 0.7273, "step": 1619 }, { "epoch": 0.1601463065022366, "grad_norm": 3.405945151404724, "learning_rate": 1.9125179926867346e-07, "loss": 0.6058, "step": 1620 }, { "epoch": 0.1602451622469911, "grad_norm": 4.023510961444016, "learning_rate": 1.9123869677926467e-07, "loss": 0.6563, "step": 1621 }, { "epoch": 0.16034401799174555, "grad_norm": 3.006902563397172, "learning_rate": 1.9122558493467338e-07, "loss": 0.7405, "step": 1622 }, { "epoch": 0.1604428737365, "grad_norm": 4.585723275294254, "learning_rate": 1.9121246373624405e-07, "loss": 0.8422, "step": 1623 }, { "epoch": 0.16054172948125447, "grad_norm": 4.109780986344828, "learning_rate": 1.9119933318532203e-07, "loss": 0.7552, "step": 1624 }, { "epoch": 0.16064058522600894, "grad_norm": 4.577524366477109, "learning_rate": 1.9118619328325368e-07, "loss": 0.6955, "step": 1625 }, { "epoch": 0.16073944097076343, "grad_norm": 2.989832816105517, "learning_rate": 1.911730440313863e-07, "loss": 0.7245, "step": 1626 }, { "epoch": 0.1608382967155179, "grad_norm": 5.093778944072761, "learning_rate": 1.9115988543106815e-07, "loss": 0.7401, "step": 1627 }, { "epoch": 0.16093715246027235, "grad_norm": 6.3798775989768455, "learning_rate": 1.9114671748364847e-07, "loss": 0.7389, "step": 1628 }, { "epoch": 0.1610360082050268, "grad_norm": 4.733875121126953, "learning_rate": 1.911335401904774e-07, "loss": 0.66, "step": 1629 }, { "epoch": 0.16113486394978127, "grad_norm": 5.829348903436602, "learning_rate": 1.911203535529061e-07, "loss": 0.6849, "step": 1630 }, { "epoch": 0.16123371969453576, "grad_norm": 3.404188709889018, "learning_rate": 1.9110715757228667e-07, "loss": 0.6923, "step": 1631 }, { "epoch": 0.16133257543929022, "grad_norm": 7.433723893558098, "learning_rate": 1.9109395224997217e-07, "loss": 0.653, "step": 1632 }, { "epoch": 0.16143143118404468, "grad_norm": 4.175206053479436, "learning_rate": 1.9108073758731657e-07, "loss": 0.6795, "step": 1633 }, { "epoch": 0.16153028692879914, "grad_norm": 3.5049837418927137, "learning_rate": 1.9106751358567488e-07, "loss": 0.8686, "step": 1634 }, { "epoch": 0.1616291426735536, "grad_norm": 3.54641786603234, "learning_rate": 1.9105428024640296e-07, "loss": 0.7185, "step": 1635 }, { "epoch": 0.1617279984183081, "grad_norm": 11.119653407264087, "learning_rate": 1.910410375708578e-07, "loss": 0.7349, "step": 1636 }, { "epoch": 0.16182685416306256, "grad_norm": 4.516333182072062, "learning_rate": 1.910277855603972e-07, "loss": 0.7188, "step": 1637 }, { "epoch": 0.16192570990781702, "grad_norm": 4.791356374985866, "learning_rate": 1.910145242163799e-07, "loss": 0.6664, "step": 1638 }, { "epoch": 0.16202456565257148, "grad_norm": 6.141424162927891, "learning_rate": 1.9100125354016577e-07, "loss": 0.7269, "step": 1639 }, { "epoch": 0.16212342139732594, "grad_norm": 3.910480494693417, "learning_rate": 1.909879735331154e-07, "loss": 0.7225, "step": 1640 }, { "epoch": 0.16222227714208043, "grad_norm": 3.367096064404229, "learning_rate": 1.9097468419659052e-07, "loss": 0.778, "step": 1641 }, { "epoch": 0.1623211328868349, "grad_norm": 5.080913062272009, "learning_rate": 1.9096138553195374e-07, "loss": 0.7466, "step": 1642 }, { "epoch": 0.16241998863158935, "grad_norm": 31.212206669511, "learning_rate": 1.9094807754056866e-07, "loss": 0.8658, "step": 1643 }, { "epoch": 0.1625188443763438, "grad_norm": 7.527090381762391, "learning_rate": 1.909347602237998e-07, "loss": 0.6421, "step": 1644 }, { "epoch": 0.16261770012109827, "grad_norm": 3.5186531049315066, "learning_rate": 1.9092143358301265e-07, "loss": 0.7286, "step": 1645 }, { "epoch": 0.16271655586585276, "grad_norm": 4.066301537608991, "learning_rate": 1.909080976195737e-07, "loss": 0.8669, "step": 1646 }, { "epoch": 0.16281541161060722, "grad_norm": 3.375247822900512, "learning_rate": 1.908947523348503e-07, "loss": 0.7377, "step": 1647 }, { "epoch": 0.16291426735536169, "grad_norm": 3.8126304271095477, "learning_rate": 1.9088139773021084e-07, "loss": 0.8071, "step": 1648 }, { "epoch": 0.16301312310011615, "grad_norm": 3.5703729009065226, "learning_rate": 1.9086803380702464e-07, "loss": 0.6445, "step": 1649 }, { "epoch": 0.1631119788448706, "grad_norm": 4.705917066318424, "learning_rate": 1.90854660566662e-07, "loss": 0.7642, "step": 1650 }, { "epoch": 0.1632108345896251, "grad_norm": 3.5353365352669788, "learning_rate": 1.908412780104941e-07, "loss": 0.7656, "step": 1651 }, { "epoch": 0.16330969033437956, "grad_norm": 4.916287267960891, "learning_rate": 1.9082788613989313e-07, "loss": 0.7828, "step": 1652 }, { "epoch": 0.16340854607913402, "grad_norm": 3.724085070875087, "learning_rate": 1.9081448495623226e-07, "loss": 0.6242, "step": 1653 }, { "epoch": 0.16350740182388848, "grad_norm": 4.4813730664795814, "learning_rate": 1.9080107446088555e-07, "loss": 0.7808, "step": 1654 }, { "epoch": 0.16360625756864297, "grad_norm": 3.8439357535012357, "learning_rate": 1.9078765465522807e-07, "loss": 0.6864, "step": 1655 }, { "epoch": 0.16370511331339743, "grad_norm": 3.1149160924169705, "learning_rate": 1.9077422554063586e-07, "loss": 0.7858, "step": 1656 }, { "epoch": 0.1638039690581519, "grad_norm": 53.43499734830409, "learning_rate": 1.9076078711848577e-07, "loss": 0.7309, "step": 1657 }, { "epoch": 0.16390282480290636, "grad_norm": 3.0515389376220394, "learning_rate": 1.9074733939015578e-07, "loss": 0.6818, "step": 1658 }, { "epoch": 0.16400168054766082, "grad_norm": 4.096741442756604, "learning_rate": 1.907338823570248e-07, "loss": 0.7046, "step": 1659 }, { "epoch": 0.1641005362924153, "grad_norm": 4.028177595552836, "learning_rate": 1.9072041602047258e-07, "loss": 0.6924, "step": 1660 }, { "epoch": 0.16419939203716977, "grad_norm": 3.8554340442166803, "learning_rate": 1.9070694038187992e-07, "loss": 0.8177, "step": 1661 }, { "epoch": 0.16429824778192423, "grad_norm": 4.496679320512058, "learning_rate": 1.9069345544262854e-07, "loss": 0.6496, "step": 1662 }, { "epoch": 0.1643971035266787, "grad_norm": 7.816519393586867, "learning_rate": 1.9067996120410111e-07, "loss": 0.7417, "step": 1663 }, { "epoch": 0.16449595927143315, "grad_norm": 10.384589476659524, "learning_rate": 1.906664576676813e-07, "loss": 0.7162, "step": 1664 }, { "epoch": 0.16459481501618764, "grad_norm": 4.361680122063604, "learning_rate": 1.906529448347537e-07, "loss": 0.7125, "step": 1665 }, { "epoch": 0.1646936707609421, "grad_norm": 4.043313096527939, "learning_rate": 1.9063942270670382e-07, "loss": 0.7197, "step": 1666 }, { "epoch": 0.16479252650569656, "grad_norm": 4.406479883673326, "learning_rate": 1.9062589128491814e-07, "loss": 0.7829, "step": 1667 }, { "epoch": 0.16489138225045102, "grad_norm": 2.871680622149349, "learning_rate": 1.9061235057078418e-07, "loss": 0.6711, "step": 1668 }, { "epoch": 0.16499023799520549, "grad_norm": 10.393191466386375, "learning_rate": 1.9059880056569025e-07, "loss": 0.7484, "step": 1669 }, { "epoch": 0.16508909373995997, "grad_norm": 2.981156093786772, "learning_rate": 1.9058524127102578e-07, "loss": 0.762, "step": 1670 }, { "epoch": 0.16518794948471444, "grad_norm": 7.815663874880071, "learning_rate": 1.9057167268818102e-07, "loss": 0.8046, "step": 1671 }, { "epoch": 0.1652868052294689, "grad_norm": 4.022937268972966, "learning_rate": 1.9055809481854728e-07, "loss": 0.7215, "step": 1672 }, { "epoch": 0.16538566097422336, "grad_norm": 6.090162475797414, "learning_rate": 1.9054450766351672e-07, "loss": 0.6999, "step": 1673 }, { "epoch": 0.16548451671897782, "grad_norm": 5.238496981247826, "learning_rate": 1.9053091122448252e-07, "loss": 0.7162, "step": 1674 }, { "epoch": 0.1655833724637323, "grad_norm": 7.481448856154265, "learning_rate": 1.905173055028388e-07, "loss": 0.7096, "step": 1675 }, { "epoch": 0.16568222820848677, "grad_norm": 4.129954325612517, "learning_rate": 1.9050369049998066e-07, "loss": 0.7445, "step": 1676 }, { "epoch": 0.16578108395324123, "grad_norm": 4.706673828854298, "learning_rate": 1.9049006621730406e-07, "loss": 0.7204, "step": 1677 }, { "epoch": 0.1658799396979957, "grad_norm": 3.065198480265075, "learning_rate": 1.9047643265620598e-07, "loss": 0.6656, "step": 1678 }, { "epoch": 0.16597879544275015, "grad_norm": 3.4783977923125278, "learning_rate": 1.9046278981808436e-07, "loss": 0.7648, "step": 1679 }, { "epoch": 0.16607765118750464, "grad_norm": 4.958889508187133, "learning_rate": 1.9044913770433808e-07, "loss": 0.8153, "step": 1680 }, { "epoch": 0.1661765069322591, "grad_norm": 3.614700163121188, "learning_rate": 1.9043547631636694e-07, "loss": 0.8014, "step": 1681 }, { "epoch": 0.16627536267701357, "grad_norm": 11.926934499565979, "learning_rate": 1.904218056555717e-07, "loss": 0.6898, "step": 1682 }, { "epoch": 0.16637421842176803, "grad_norm": 4.73634351378157, "learning_rate": 1.9040812572335412e-07, "loss": 0.6701, "step": 1683 }, { "epoch": 0.1664730741665225, "grad_norm": 6.727120221114989, "learning_rate": 1.9039443652111687e-07, "loss": 0.5657, "step": 1684 }, { "epoch": 0.16657192991127698, "grad_norm": 5.018045517313606, "learning_rate": 1.9038073805026356e-07, "loss": 0.7971, "step": 1685 }, { "epoch": 0.16667078565603144, "grad_norm": 14.667402363682845, "learning_rate": 1.9036703031219877e-07, "loss": 0.6563, "step": 1686 }, { "epoch": 0.1667696414007859, "grad_norm": 4.215522359262666, "learning_rate": 1.9035331330832802e-07, "loss": 0.6984, "step": 1687 }, { "epoch": 0.16686849714554036, "grad_norm": 4.5584260587053755, "learning_rate": 1.9033958704005778e-07, "loss": 0.718, "step": 1688 }, { "epoch": 0.16696735289029482, "grad_norm": 5.023904749410402, "learning_rate": 1.9032585150879553e-07, "loss": 0.7272, "step": 1689 }, { "epoch": 0.1670662086350493, "grad_norm": 6.0462743504777645, "learning_rate": 1.9031210671594963e-07, "loss": 0.6987, "step": 1690 }, { "epoch": 0.16716506437980377, "grad_norm": 10.444076777045005, "learning_rate": 1.9029835266292934e-07, "loss": 0.7779, "step": 1691 }, { "epoch": 0.16726392012455824, "grad_norm": 142.38434698405595, "learning_rate": 1.9028458935114498e-07, "loss": 0.7066, "step": 1692 }, { "epoch": 0.1673627758693127, "grad_norm": 3.706237944886247, "learning_rate": 1.9027081678200778e-07, "loss": 0.8256, "step": 1693 }, { "epoch": 0.16746163161406719, "grad_norm": 3.5426956297858885, "learning_rate": 1.902570349569299e-07, "loss": 0.6673, "step": 1694 }, { "epoch": 0.16756048735882165, "grad_norm": 5.418971558482785, "learning_rate": 1.9024324387732449e-07, "loss": 0.8023, "step": 1695 }, { "epoch": 0.1676593431035761, "grad_norm": 4.342970067889736, "learning_rate": 1.9022944354460557e-07, "loss": 0.7473, "step": 1696 }, { "epoch": 0.16775819884833057, "grad_norm": 2.876493019350703, "learning_rate": 1.9021563396018822e-07, "loss": 0.8146, "step": 1697 }, { "epoch": 0.16785705459308503, "grad_norm": 3.4083648875047876, "learning_rate": 1.9020181512548837e-07, "loss": 0.6976, "step": 1698 }, { "epoch": 0.16795591033783952, "grad_norm": 6.628650821173191, "learning_rate": 1.9018798704192296e-07, "loss": 0.6328, "step": 1699 }, { "epoch": 0.16805476608259398, "grad_norm": 3.643165201023311, "learning_rate": 1.9017414971090983e-07, "loss": 0.7643, "step": 1700 }, { "epoch": 0.16815362182734844, "grad_norm": 3.851485684197734, "learning_rate": 1.901603031338678e-07, "loss": 0.8556, "step": 1701 }, { "epoch": 0.1682524775721029, "grad_norm": 4.447041682571627, "learning_rate": 1.9014644731221668e-07, "loss": 0.7687, "step": 1702 }, { "epoch": 0.16835133331685737, "grad_norm": 4.1854428411498885, "learning_rate": 1.9013258224737713e-07, "loss": 0.702, "step": 1703 }, { "epoch": 0.16845018906161185, "grad_norm": 3.9803373773992363, "learning_rate": 1.901187079407708e-07, "loss": 0.6336, "step": 1704 }, { "epoch": 0.16854904480636632, "grad_norm": 4.2805448524660505, "learning_rate": 1.901048243938203e-07, "loss": 0.7376, "step": 1705 }, { "epoch": 0.16864790055112078, "grad_norm": 6.631879612240396, "learning_rate": 1.9009093160794923e-07, "loss": 0.7085, "step": 1706 }, { "epoch": 0.16874675629587524, "grad_norm": 6.679649169622515, "learning_rate": 1.9007702958458208e-07, "loss": 0.821, "step": 1707 }, { "epoch": 0.1688456120406297, "grad_norm": 4.168141591967967, "learning_rate": 1.9006311832514424e-07, "loss": 0.6918, "step": 1708 }, { "epoch": 0.1689444677853842, "grad_norm": 4.4554496357926086, "learning_rate": 1.9004919783106215e-07, "loss": 0.6644, "step": 1709 }, { "epoch": 0.16904332353013865, "grad_norm": 3.2773324556179135, "learning_rate": 1.9003526810376314e-07, "loss": 0.6859, "step": 1710 }, { "epoch": 0.1691421792748931, "grad_norm": 4.435898621621379, "learning_rate": 1.900213291446755e-07, "loss": 0.7286, "step": 1711 }, { "epoch": 0.16924103501964757, "grad_norm": 4.600776536740985, "learning_rate": 1.9000738095522845e-07, "loss": 0.7366, "step": 1712 }, { "epoch": 0.16933989076440203, "grad_norm": 3.000091884373794, "learning_rate": 1.8999342353685218e-07, "loss": 0.7813, "step": 1713 }, { "epoch": 0.16943874650915652, "grad_norm": 4.535331652119621, "learning_rate": 1.8997945689097789e-07, "loss": 0.8168, "step": 1714 }, { "epoch": 0.16953760225391099, "grad_norm": 7.068950893937681, "learning_rate": 1.899654810190375e-07, "loss": 0.6461, "step": 1715 }, { "epoch": 0.16963645799866545, "grad_norm": 3.120734608640814, "learning_rate": 1.8995149592246417e-07, "loss": 0.7773, "step": 1716 }, { "epoch": 0.1697353137434199, "grad_norm": 4.412998530041257, "learning_rate": 1.899375016026918e-07, "loss": 0.9191, "step": 1717 }, { "epoch": 0.16983416948817437, "grad_norm": 3.43073580285405, "learning_rate": 1.899234980611553e-07, "loss": 0.7317, "step": 1718 }, { "epoch": 0.16993302523292886, "grad_norm": 4.233550258650561, "learning_rate": 1.8990948529929054e-07, "loss": 0.6977, "step": 1719 }, { "epoch": 0.17003188097768332, "grad_norm": 4.907745067852664, "learning_rate": 1.8989546331853434e-07, "loss": 0.7754, "step": 1720 }, { "epoch": 0.17013073672243778, "grad_norm": 6.51290228630269, "learning_rate": 1.898814321203244e-07, "loss": 0.832, "step": 1721 }, { "epoch": 0.17022959246719224, "grad_norm": 3.839650625038715, "learning_rate": 1.8986739170609945e-07, "loss": 0.782, "step": 1722 }, { "epoch": 0.1703284482119467, "grad_norm": 4.930498419663581, "learning_rate": 1.8985334207729914e-07, "loss": 0.7026, "step": 1723 }, { "epoch": 0.1704273039567012, "grad_norm": 3.955689719599107, "learning_rate": 1.89839283235364e-07, "loss": 0.7171, "step": 1724 }, { "epoch": 0.17052615970145565, "grad_norm": 2.8878718030475246, "learning_rate": 1.898252151817356e-07, "loss": 0.6674, "step": 1725 }, { "epoch": 0.17062501544621012, "grad_norm": 5.489335130573389, "learning_rate": 1.8981113791785641e-07, "loss": 0.6934, "step": 1726 }, { "epoch": 0.17072387119096458, "grad_norm": 7.928674863071406, "learning_rate": 1.897970514451698e-07, "loss": 0.7331, "step": 1727 }, { "epoch": 0.17082272693571904, "grad_norm": 4.353054313942994, "learning_rate": 1.8978295576512017e-07, "loss": 0.7606, "step": 1728 }, { "epoch": 0.17092158268047353, "grad_norm": 5.006464359075778, "learning_rate": 1.8976885087915286e-07, "loss": 0.786, "step": 1729 }, { "epoch": 0.171020438425228, "grad_norm": 4.357417179087538, "learning_rate": 1.8975473678871406e-07, "loss": 0.7185, "step": 1730 }, { "epoch": 0.17111929416998245, "grad_norm": 10.81384220176505, "learning_rate": 1.8974061349525096e-07, "loss": 0.7901, "step": 1731 }, { "epoch": 0.1712181499147369, "grad_norm": 4.406026358238695, "learning_rate": 1.8972648100021172e-07, "loss": 0.7734, "step": 1732 }, { "epoch": 0.1713170056594914, "grad_norm": 16.202336011277236, "learning_rate": 1.897123393050454e-07, "loss": 0.8214, "step": 1733 }, { "epoch": 0.17141586140424586, "grad_norm": 4.080143971755174, "learning_rate": 1.8969818841120207e-07, "loss": 0.7314, "step": 1734 }, { "epoch": 0.17151471714900032, "grad_norm": 4.171856302880728, "learning_rate": 1.8968402832013265e-07, "loss": 0.6863, "step": 1735 }, { "epoch": 0.17161357289375478, "grad_norm": 4.84946915449895, "learning_rate": 1.8966985903328903e-07, "loss": 0.7238, "step": 1736 }, { "epoch": 0.17171242863850925, "grad_norm": 6.441182281391494, "learning_rate": 1.896556805521241e-07, "loss": 0.7789, "step": 1737 }, { "epoch": 0.17181128438326373, "grad_norm": 8.291743822086435, "learning_rate": 1.8964149287809167e-07, "loss": 0.8129, "step": 1738 }, { "epoch": 0.1719101401280182, "grad_norm": 4.70383052545174, "learning_rate": 1.896272960126464e-07, "loss": 0.7125, "step": 1739 }, { "epoch": 0.17200899587277266, "grad_norm": 5.9811051893954374, "learning_rate": 1.8961308995724405e-07, "loss": 0.7708, "step": 1740 }, { "epoch": 0.17210785161752712, "grad_norm": 13.069501563222797, "learning_rate": 1.895988747133412e-07, "loss": 0.6855, "step": 1741 }, { "epoch": 0.17220670736228158, "grad_norm": 3.4705187303408316, "learning_rate": 1.8958465028239545e-07, "loss": 0.7314, "step": 1742 }, { "epoch": 0.17230556310703607, "grad_norm": 4.901384492946249, "learning_rate": 1.8957041666586528e-07, "loss": 0.7463, "step": 1743 }, { "epoch": 0.17240441885179053, "grad_norm": 6.536802538689493, "learning_rate": 1.895561738652101e-07, "loss": 0.6924, "step": 1744 }, { "epoch": 0.172503274596545, "grad_norm": 3.974540928351649, "learning_rate": 1.8954192188189039e-07, "loss": 0.7854, "step": 1745 }, { "epoch": 0.17260213034129945, "grad_norm": 4.336895499611506, "learning_rate": 1.8952766071736736e-07, "loss": 0.6423, "step": 1746 }, { "epoch": 0.17270098608605391, "grad_norm": 6.373389861461393, "learning_rate": 1.895133903731034e-07, "loss": 0.6428, "step": 1747 }, { "epoch": 0.1727998418308084, "grad_norm": 8.206593870199931, "learning_rate": 1.8949911085056166e-07, "loss": 0.665, "step": 1748 }, { "epoch": 0.17289869757556287, "grad_norm": 3.165081106294251, "learning_rate": 1.8948482215120628e-07, "loss": 0.7252, "step": 1749 }, { "epoch": 0.17299755332031733, "grad_norm": 3.578501599162503, "learning_rate": 1.894705242765024e-07, "loss": 0.7664, "step": 1750 }, { "epoch": 0.1730964090650718, "grad_norm": 3.402998652649381, "learning_rate": 1.8945621722791607e-07, "loss": 0.7624, "step": 1751 }, { "epoch": 0.17319526480982625, "grad_norm": 4.638525594784221, "learning_rate": 1.894419010069142e-07, "loss": 0.7252, "step": 1752 }, { "epoch": 0.17329412055458074, "grad_norm": 3.042258569106757, "learning_rate": 1.8942757561496475e-07, "loss": 0.7808, "step": 1753 }, { "epoch": 0.1733929762993352, "grad_norm": 7.542922716157105, "learning_rate": 1.894132410535366e-07, "loss": 0.7284, "step": 1754 }, { "epoch": 0.17349183204408966, "grad_norm": 4.685950028445863, "learning_rate": 1.8939889732409949e-07, "loss": 0.7875, "step": 1755 }, { "epoch": 0.17359068778884412, "grad_norm": 5.658840237884323, "learning_rate": 1.8938454442812416e-07, "loss": 0.8001, "step": 1756 }, { "epoch": 0.17368954353359858, "grad_norm": 8.07060851496577, "learning_rate": 1.8937018236708235e-07, "loss": 0.6387, "step": 1757 }, { "epoch": 0.17378839927835307, "grad_norm": 8.830843705495276, "learning_rate": 1.8935581114244666e-07, "loss": 0.6546, "step": 1758 }, { "epoch": 0.17388725502310753, "grad_norm": 4.272025947799766, "learning_rate": 1.893414307556906e-07, "loss": 0.786, "step": 1759 }, { "epoch": 0.173986110767862, "grad_norm": 4.61647122642708, "learning_rate": 1.8932704120828869e-07, "loss": 0.7276, "step": 1760 }, { "epoch": 0.17408496651261646, "grad_norm": 3.9374874255778627, "learning_rate": 1.8931264250171638e-07, "loss": 0.6783, "step": 1761 }, { "epoch": 0.17418382225737092, "grad_norm": 4.048144042566202, "learning_rate": 1.8929823463745003e-07, "loss": 0.6552, "step": 1762 }, { "epoch": 0.1742826780021254, "grad_norm": 4.2539613749152885, "learning_rate": 1.8928381761696698e-07, "loss": 0.6568, "step": 1763 }, { "epoch": 0.17438153374687987, "grad_norm": 4.458867409695586, "learning_rate": 1.892693914417454e-07, "loss": 0.6735, "step": 1764 }, { "epoch": 0.17448038949163433, "grad_norm": 3.010595195350182, "learning_rate": 1.8925495611326462e-07, "loss": 0.6897, "step": 1765 }, { "epoch": 0.1745792452363888, "grad_norm": 22.852024670608735, "learning_rate": 1.8924051163300466e-07, "loss": 0.771, "step": 1766 }, { "epoch": 0.17467810098114328, "grad_norm": 4.016872586373533, "learning_rate": 1.892260580024466e-07, "loss": 0.8302, "step": 1767 }, { "epoch": 0.17477695672589774, "grad_norm": 4.500413307954438, "learning_rate": 1.892115952230725e-07, "loss": 0.7506, "step": 1768 }, { "epoch": 0.1748758124706522, "grad_norm": 3.8781098857454186, "learning_rate": 1.8919712329636528e-07, "loss": 0.6943, "step": 1769 }, { "epoch": 0.17497466821540666, "grad_norm": 5.629554095566065, "learning_rate": 1.891826422238088e-07, "loss": 0.7075, "step": 1770 }, { "epoch": 0.17507352396016113, "grad_norm": 11.310222685857072, "learning_rate": 1.891681520068879e-07, "loss": 0.712, "step": 1771 }, { "epoch": 0.17517237970491562, "grad_norm": 4.060853691527991, "learning_rate": 1.8915365264708831e-07, "loss": 0.7665, "step": 1772 }, { "epoch": 0.17527123544967008, "grad_norm": 4.62303655426697, "learning_rate": 1.891391441458968e-07, "loss": 0.6926, "step": 1773 }, { "epoch": 0.17537009119442454, "grad_norm": 3.8642200552047528, "learning_rate": 1.8912462650480092e-07, "loss": 0.7449, "step": 1774 }, { "epoch": 0.175468946939179, "grad_norm": 3.901746644457528, "learning_rate": 1.891100997252893e-07, "loss": 0.6979, "step": 1775 }, { "epoch": 0.17556780268393346, "grad_norm": 3.8068463817214844, "learning_rate": 1.890955638088514e-07, "loss": 0.8209, "step": 1776 }, { "epoch": 0.17566665842868795, "grad_norm": 3.2587877828065173, "learning_rate": 1.8908101875697766e-07, "loss": 0.7189, "step": 1777 }, { "epoch": 0.1757655141734424, "grad_norm": 3.9959476291139544, "learning_rate": 1.890664645711595e-07, "loss": 0.7277, "step": 1778 }, { "epoch": 0.17586436991819687, "grad_norm": 5.5307517045140155, "learning_rate": 1.8905190125288923e-07, "loss": 0.8285, "step": 1779 }, { "epoch": 0.17596322566295133, "grad_norm": 4.399177816817838, "learning_rate": 1.890373288036601e-07, "loss": 0.8217, "step": 1780 }, { "epoch": 0.1760620814077058, "grad_norm": 6.839882407150985, "learning_rate": 1.8902274722496628e-07, "loss": 0.8515, "step": 1781 }, { "epoch": 0.17616093715246028, "grad_norm": 3.836977304198438, "learning_rate": 1.890081565183029e-07, "loss": 0.7943, "step": 1782 }, { "epoch": 0.17625979289721475, "grad_norm": 3.555806399108246, "learning_rate": 1.8899355668516605e-07, "loss": 0.6454, "step": 1783 }, { "epoch": 0.1763586486419692, "grad_norm": 3.8563238025197486, "learning_rate": 1.8897894772705272e-07, "loss": 0.6754, "step": 1784 }, { "epoch": 0.17645750438672367, "grad_norm": 6.27192475326568, "learning_rate": 1.8896432964546084e-07, "loss": 0.7594, "step": 1785 }, { "epoch": 0.17655636013147813, "grad_norm": 3.8325504702393802, "learning_rate": 1.889497024418892e-07, "loss": 0.7975, "step": 1786 }, { "epoch": 0.17665521587623262, "grad_norm": 4.365220680776323, "learning_rate": 1.8893506611783776e-07, "loss": 0.8145, "step": 1787 }, { "epoch": 0.17675407162098708, "grad_norm": 7.20686772467932, "learning_rate": 1.8892042067480713e-07, "loss": 0.6858, "step": 1788 }, { "epoch": 0.17685292736574154, "grad_norm": 4.279943063461015, "learning_rate": 1.8890576611429906e-07, "loss": 0.7093, "step": 1789 }, { "epoch": 0.176951783110496, "grad_norm": 4.731685967918578, "learning_rate": 1.888911024378161e-07, "loss": 0.7318, "step": 1790 }, { "epoch": 0.17705063885525046, "grad_norm": 11.9445551856101, "learning_rate": 1.888764296468618e-07, "loss": 0.7626, "step": 1791 }, { "epoch": 0.17714949460000495, "grad_norm": 3.966015145645517, "learning_rate": 1.888617477429407e-07, "loss": 0.7392, "step": 1792 }, { "epoch": 0.17724835034475941, "grad_norm": 5.053992575307353, "learning_rate": 1.8884705672755817e-07, "loss": 0.863, "step": 1793 }, { "epoch": 0.17734720608951388, "grad_norm": 3.629841627651401, "learning_rate": 1.8883235660222052e-07, "loss": 0.7258, "step": 1794 }, { "epoch": 0.17744606183426834, "grad_norm": 4.023209819926416, "learning_rate": 1.8881764736843512e-07, "loss": 0.6932, "step": 1795 }, { "epoch": 0.1775449175790228, "grad_norm": 4.577415029841412, "learning_rate": 1.8880292902771008e-07, "loss": 0.7128, "step": 1796 }, { "epoch": 0.1776437733237773, "grad_norm": 3.451597529321051, "learning_rate": 1.8878820158155463e-07, "loss": 0.6791, "step": 1797 }, { "epoch": 0.17774262906853175, "grad_norm": 3.8990301245707992, "learning_rate": 1.8877346503147885e-07, "loss": 0.6368, "step": 1798 }, { "epoch": 0.1778414848132862, "grad_norm": 77.30844159490094, "learning_rate": 1.8875871937899374e-07, "loss": 0.8033, "step": 1799 }, { "epoch": 0.17794034055804067, "grad_norm": 6.176577443754129, "learning_rate": 1.887439646256112e-07, "loss": 0.7062, "step": 1800 }, { "epoch": 0.17803919630279513, "grad_norm": 4.475411286539406, "learning_rate": 1.8872920077284414e-07, "loss": 0.7526, "step": 1801 }, { "epoch": 0.17813805204754962, "grad_norm": 3.2811333797559197, "learning_rate": 1.8871442782220643e-07, "loss": 0.8072, "step": 1802 }, { "epoch": 0.17823690779230408, "grad_norm": 3.3281516515946965, "learning_rate": 1.8869964577521276e-07, "loss": 0.7376, "step": 1803 }, { "epoch": 0.17833576353705854, "grad_norm": 5.941068734113968, "learning_rate": 1.8868485463337884e-07, "loss": 0.7736, "step": 1804 }, { "epoch": 0.178434619281813, "grad_norm": 34.95799558733265, "learning_rate": 1.8867005439822126e-07, "loss": 0.7185, "step": 1805 }, { "epoch": 0.1785334750265675, "grad_norm": 4.76487394551764, "learning_rate": 1.8865524507125755e-07, "loss": 0.7055, "step": 1806 }, { "epoch": 0.17863233077132196, "grad_norm": 3.9363867184476793, "learning_rate": 1.8864042665400625e-07, "loss": 0.7474, "step": 1807 }, { "epoch": 0.17873118651607642, "grad_norm": 5.202205334626098, "learning_rate": 1.8862559914798673e-07, "loss": 0.6999, "step": 1808 }, { "epoch": 0.17883004226083088, "grad_norm": 4.5455299664929845, "learning_rate": 1.8861076255471934e-07, "loss": 0.6991, "step": 1809 }, { "epoch": 0.17892889800558534, "grad_norm": 4.643668243486312, "learning_rate": 1.8859591687572536e-07, "loss": 0.7752, "step": 1810 }, { "epoch": 0.17902775375033983, "grad_norm": 8.444676119069486, "learning_rate": 1.8858106211252695e-07, "loss": 0.7251, "step": 1811 }, { "epoch": 0.1791266094950943, "grad_norm": 3.82870412967721, "learning_rate": 1.8856619826664732e-07, "loss": 0.7802, "step": 1812 }, { "epoch": 0.17922546523984875, "grad_norm": 15.079608015041147, "learning_rate": 1.885513253396105e-07, "loss": 0.7686, "step": 1813 }, { "epoch": 0.1793243209846032, "grad_norm": 4.205803605995836, "learning_rate": 1.8853644333294145e-07, "loss": 0.7702, "step": 1814 }, { "epoch": 0.17942317672935768, "grad_norm": 2.743874180639262, "learning_rate": 1.8852155224816616e-07, "loss": 0.7473, "step": 1815 }, { "epoch": 0.17952203247411216, "grad_norm": 30.951166140010553, "learning_rate": 1.885066520868115e-07, "loss": 0.7655, "step": 1816 }, { "epoch": 0.17962088821886663, "grad_norm": 4.519911135954376, "learning_rate": 1.8849174285040518e-07, "loss": 0.7192, "step": 1817 }, { "epoch": 0.1797197439636211, "grad_norm": 4.323111743427903, "learning_rate": 1.8847682454047602e-07, "loss": 0.6746, "step": 1818 }, { "epoch": 0.17981859970837555, "grad_norm": 3.437457255062977, "learning_rate": 1.8846189715855362e-07, "loss": 0.6972, "step": 1819 }, { "epoch": 0.17991745545313, "grad_norm": 3.216207640147576, "learning_rate": 1.8844696070616857e-07, "loss": 0.7215, "step": 1820 }, { "epoch": 0.1800163111978845, "grad_norm": 4.650357954327682, "learning_rate": 1.8843201518485237e-07, "loss": 0.7258, "step": 1821 }, { "epoch": 0.18011516694263896, "grad_norm": 8.463705440406722, "learning_rate": 1.884170605961375e-07, "loss": 0.7728, "step": 1822 }, { "epoch": 0.18021402268739342, "grad_norm": 3.32009383004289, "learning_rate": 1.884020969415573e-07, "loss": 0.7168, "step": 1823 }, { "epoch": 0.18031287843214788, "grad_norm": 3.8647142381400856, "learning_rate": 1.8838712422264606e-07, "loss": 0.8023, "step": 1824 }, { "epoch": 0.18041173417690234, "grad_norm": 4.262500882953969, "learning_rate": 1.8837214244093907e-07, "loss": 0.6391, "step": 1825 }, { "epoch": 0.18051058992165683, "grad_norm": 4.545539652406747, "learning_rate": 1.883571515979724e-07, "loss": 0.6546, "step": 1826 }, { "epoch": 0.1806094456664113, "grad_norm": 4.215530451054043, "learning_rate": 1.8834215169528323e-07, "loss": 0.7225, "step": 1827 }, { "epoch": 0.18070830141116576, "grad_norm": 4.4946764209759404, "learning_rate": 1.8832714273440957e-07, "loss": 0.7314, "step": 1828 }, { "epoch": 0.18080715715592022, "grad_norm": 18.95783094880516, "learning_rate": 1.883121247168903e-07, "loss": 0.7093, "step": 1829 }, { "epoch": 0.18090601290067468, "grad_norm": 3.0818185579694557, "learning_rate": 1.8829709764426535e-07, "loss": 0.756, "step": 1830 }, { "epoch": 0.18100486864542917, "grad_norm": 3.9336023794880077, "learning_rate": 1.8828206151807547e-07, "loss": 0.7297, "step": 1831 }, { "epoch": 0.18110372439018363, "grad_norm": 4.6207629100820675, "learning_rate": 1.8826701633986247e-07, "loss": 0.7402, "step": 1832 }, { "epoch": 0.1812025801349381, "grad_norm": 5.9592450990500465, "learning_rate": 1.88251962111169e-07, "loss": 0.7302, "step": 1833 }, { "epoch": 0.18130143587969255, "grad_norm": 4.720060689042958, "learning_rate": 1.882368988335386e-07, "loss": 0.6564, "step": 1834 }, { "epoch": 0.181400291624447, "grad_norm": 15.209966017243241, "learning_rate": 1.8822182650851581e-07, "loss": 0.6643, "step": 1835 }, { "epoch": 0.1814991473692015, "grad_norm": 4.445402013094883, "learning_rate": 1.8820674513764607e-07, "loss": 0.6571, "step": 1836 }, { "epoch": 0.18159800311395596, "grad_norm": 3.8778550577600885, "learning_rate": 1.8819165472247576e-07, "loss": 0.7792, "step": 1837 }, { "epoch": 0.18169685885871042, "grad_norm": 4.03354195572501, "learning_rate": 1.881765552645522e-07, "loss": 0.6878, "step": 1838 }, { "epoch": 0.1817957146034649, "grad_norm": 3.900934242724237, "learning_rate": 1.8816144676542356e-07, "loss": 0.8178, "step": 1839 }, { "epoch": 0.18189457034821935, "grad_norm": 3.578923415300182, "learning_rate": 1.8814632922663902e-07, "loss": 0.7317, "step": 1840 }, { "epoch": 0.18199342609297384, "grad_norm": 6.598521219653272, "learning_rate": 1.881312026497487e-07, "loss": 0.639, "step": 1841 }, { "epoch": 0.1820922818377283, "grad_norm": 4.3452935955708165, "learning_rate": 1.8811606703630353e-07, "loss": 0.6151, "step": 1842 }, { "epoch": 0.18219113758248276, "grad_norm": 3.8910122840411434, "learning_rate": 1.8810092238785552e-07, "loss": 0.7168, "step": 1843 }, { "epoch": 0.18228999332723722, "grad_norm": 2.866194611743589, "learning_rate": 1.880857687059575e-07, "loss": 0.803, "step": 1844 }, { "epoch": 0.1823888490719917, "grad_norm": 4.802179268857267, "learning_rate": 1.8807060599216324e-07, "loss": 0.883, "step": 1845 }, { "epoch": 0.18248770481674617, "grad_norm": 3.6272125313623023, "learning_rate": 1.880554342480275e-07, "loss": 0.7024, "step": 1846 }, { "epoch": 0.18258656056150063, "grad_norm": 3.448906467400841, "learning_rate": 1.8804025347510586e-07, "loss": 0.658, "step": 1847 }, { "epoch": 0.1826854163062551, "grad_norm": 4.524931398519262, "learning_rate": 1.8802506367495493e-07, "loss": 0.7029, "step": 1848 }, { "epoch": 0.18278427205100956, "grad_norm": 4.5672967374315885, "learning_rate": 1.8800986484913217e-07, "loss": 0.8162, "step": 1849 }, { "epoch": 0.18288312779576404, "grad_norm": 4.574513430638343, "learning_rate": 1.8799465699919602e-07, "loss": 0.7142, "step": 1850 }, { "epoch": 0.1829819835405185, "grad_norm": 5.878244969011009, "learning_rate": 1.8797944012670578e-07, "loss": 0.8426, "step": 1851 }, { "epoch": 0.18308083928527297, "grad_norm": 4.231494737065659, "learning_rate": 1.8796421423322174e-07, "loss": 0.8209, "step": 1852 }, { "epoch": 0.18317969503002743, "grad_norm": 3.4652217495660533, "learning_rate": 1.8794897932030513e-07, "loss": 0.7012, "step": 1853 }, { "epoch": 0.1832785507747819, "grad_norm": 3.9181472581296686, "learning_rate": 1.87933735389518e-07, "loss": 0.6576, "step": 1854 }, { "epoch": 0.18337740651953638, "grad_norm": 4.050154772169212, "learning_rate": 1.8791848244242344e-07, "loss": 0.7323, "step": 1855 }, { "epoch": 0.18347626226429084, "grad_norm": 3.558667355033466, "learning_rate": 1.8790322048058541e-07, "loss": 0.7516, "step": 1856 }, { "epoch": 0.1835751180090453, "grad_norm": 4.117433311008633, "learning_rate": 1.8788794950556874e-07, "loss": 0.8548, "step": 1857 }, { "epoch": 0.18367397375379976, "grad_norm": 105.21866878898952, "learning_rate": 1.878726695189393e-07, "loss": 0.8643, "step": 1858 }, { "epoch": 0.18377282949855422, "grad_norm": 7.094828338186231, "learning_rate": 1.878573805222638e-07, "loss": 0.7679, "step": 1859 }, { "epoch": 0.1838716852433087, "grad_norm": 4.658572665766819, "learning_rate": 1.8784208251710996e-07, "loss": 0.7895, "step": 1860 }, { "epoch": 0.18397054098806317, "grad_norm": 4.947935419646001, "learning_rate": 1.878267755050463e-07, "loss": 0.7476, "step": 1861 }, { "epoch": 0.18406939673281764, "grad_norm": 5.916255601364905, "learning_rate": 1.8781145948764233e-07, "loss": 0.6732, "step": 1862 }, { "epoch": 0.1841682524775721, "grad_norm": 3.682682444582462, "learning_rate": 1.8779613446646851e-07, "loss": 0.7406, "step": 1863 }, { "epoch": 0.18426710822232656, "grad_norm": 4.45693514640527, "learning_rate": 1.877808004430962e-07, "loss": 0.7681, "step": 1864 }, { "epoch": 0.18436596396708105, "grad_norm": 8.042125278281219, "learning_rate": 1.8776545741909764e-07, "loss": 0.6658, "step": 1865 }, { "epoch": 0.1844648197118355, "grad_norm": 3.6591392297887224, "learning_rate": 1.8775010539604606e-07, "loss": 0.7218, "step": 1866 }, { "epoch": 0.18456367545658997, "grad_norm": 3.917209878717619, "learning_rate": 1.8773474437551557e-07, "loss": 0.7321, "step": 1867 }, { "epoch": 0.18466253120134443, "grad_norm": 6.442848237061841, "learning_rate": 1.8771937435908122e-07, "loss": 0.7925, "step": 1868 }, { "epoch": 0.1847613869460989, "grad_norm": 40.479942240295, "learning_rate": 1.87703995348319e-07, "loss": 0.7677, "step": 1869 }, { "epoch": 0.18486024269085338, "grad_norm": 4.848335356501039, "learning_rate": 1.876886073448058e-07, "loss": 0.7047, "step": 1870 }, { "epoch": 0.18495909843560784, "grad_norm": 6.818397165275487, "learning_rate": 1.8767321035011938e-07, "loss": 0.7149, "step": 1871 }, { "epoch": 0.1850579541803623, "grad_norm": 9.346820435549832, "learning_rate": 1.876578043658385e-07, "loss": 0.6335, "step": 1872 }, { "epoch": 0.18515680992511677, "grad_norm": 4.263900221463838, "learning_rate": 1.8764238939354288e-07, "loss": 0.734, "step": 1873 }, { "epoch": 0.18525566566987123, "grad_norm": 4.336061474534013, "learning_rate": 1.87626965434813e-07, "loss": 0.7549, "step": 1874 }, { "epoch": 0.18535452141462572, "grad_norm": 3.3656806316016934, "learning_rate": 1.876115324912304e-07, "loss": 0.757, "step": 1875 }, { "epoch": 0.18545337715938018, "grad_norm": 3.901975381835877, "learning_rate": 1.8759609056437751e-07, "loss": 0.647, "step": 1876 }, { "epoch": 0.18555223290413464, "grad_norm": 19.389664288223855, "learning_rate": 1.8758063965583766e-07, "loss": 0.7356, "step": 1877 }, { "epoch": 0.1856510886488891, "grad_norm": 6.770302235559567, "learning_rate": 1.8756517976719516e-07, "loss": 0.6859, "step": 1878 }, { "epoch": 0.18574994439364356, "grad_norm": 3.9133105150520837, "learning_rate": 1.875497109000351e-07, "loss": 0.7713, "step": 1879 }, { "epoch": 0.18584880013839805, "grad_norm": 10.78513007135395, "learning_rate": 1.8753423305594366e-07, "loss": 0.7126, "step": 1880 }, { "epoch": 0.1859476558831525, "grad_norm": 4.7028035085464674, "learning_rate": 1.8751874623650786e-07, "loss": 0.7267, "step": 1881 }, { "epoch": 0.18604651162790697, "grad_norm": 5.349512460822471, "learning_rate": 1.875032504433156e-07, "loss": 0.7985, "step": 1882 }, { "epoch": 0.18614536737266144, "grad_norm": 5.874298178455858, "learning_rate": 1.8748774567795577e-07, "loss": 0.7717, "step": 1883 }, { "epoch": 0.18624422311741592, "grad_norm": 3.742046928033859, "learning_rate": 1.8747223194201815e-07, "loss": 0.7402, "step": 1884 }, { "epoch": 0.18634307886217039, "grad_norm": 4.0533616968828525, "learning_rate": 1.8745670923709348e-07, "loss": 0.7952, "step": 1885 }, { "epoch": 0.18644193460692485, "grad_norm": 4.217258366941031, "learning_rate": 1.8744117756477332e-07, "loss": 0.7459, "step": 1886 }, { "epoch": 0.1865407903516793, "grad_norm": 5.344972462545019, "learning_rate": 1.874256369266503e-07, "loss": 0.7263, "step": 1887 }, { "epoch": 0.18663964609643377, "grad_norm": 8.574753898603118, "learning_rate": 1.8741008732431778e-07, "loss": 0.6801, "step": 1888 }, { "epoch": 0.18673850184118826, "grad_norm": 31.076856296542843, "learning_rate": 1.873945287593702e-07, "loss": 0.8197, "step": 1889 }, { "epoch": 0.18683735758594272, "grad_norm": 3.5117957311870778, "learning_rate": 1.8737896123340286e-07, "loss": 0.7142, "step": 1890 }, { "epoch": 0.18693621333069718, "grad_norm": 4.82712285754925, "learning_rate": 1.8736338474801201e-07, "loss": 0.8138, "step": 1891 }, { "epoch": 0.18703506907545164, "grad_norm": 6.744625155113649, "learning_rate": 1.8734779930479472e-07, "loss": 0.7457, "step": 1892 }, { "epoch": 0.1871339248202061, "grad_norm": 3.7288790158498593, "learning_rate": 1.873322049053491e-07, "loss": 0.719, "step": 1893 }, { "epoch": 0.1872327805649606, "grad_norm": 4.50784502630179, "learning_rate": 1.873166015512741e-07, "loss": 0.7124, "step": 1894 }, { "epoch": 0.18733163630971505, "grad_norm": 5.732874961730996, "learning_rate": 1.873009892441696e-07, "loss": 0.7532, "step": 1895 }, { "epoch": 0.18743049205446952, "grad_norm": 3.2969103847205723, "learning_rate": 1.8728536798563645e-07, "loss": 0.7395, "step": 1896 }, { "epoch": 0.18752934779922398, "grad_norm": 3.3422283968473323, "learning_rate": 1.8726973777727638e-07, "loss": 0.6764, "step": 1897 }, { "epoch": 0.18762820354397844, "grad_norm": 3.2836247088860167, "learning_rate": 1.8725409862069198e-07, "loss": 0.7615, "step": 1898 }, { "epoch": 0.18772705928873293, "grad_norm": 4.482358254849221, "learning_rate": 1.8723845051748693e-07, "loss": 0.6663, "step": 1899 }, { "epoch": 0.1878259150334874, "grad_norm": 4.500240968314746, "learning_rate": 1.8722279346926557e-07, "loss": 0.7412, "step": 1900 }, { "epoch": 0.18792477077824185, "grad_norm": 3.63285317766633, "learning_rate": 1.8720712747763338e-07, "loss": 0.7828, "step": 1901 }, { "epoch": 0.1880236265229963, "grad_norm": 3.0464095686219603, "learning_rate": 1.8719145254419667e-07, "loss": 0.6592, "step": 1902 }, { "epoch": 0.18812248226775077, "grad_norm": 4.991750217735482, "learning_rate": 1.8717576867056267e-07, "loss": 0.7257, "step": 1903 }, { "epoch": 0.18822133801250526, "grad_norm": 4.045140642123365, "learning_rate": 1.871600758583395e-07, "loss": 0.7731, "step": 1904 }, { "epoch": 0.18832019375725972, "grad_norm": 8.689155234185737, "learning_rate": 1.871443741091363e-07, "loss": 0.8706, "step": 1905 }, { "epoch": 0.18841904950201419, "grad_norm": 5.803533385050469, "learning_rate": 1.8712866342456296e-07, "loss": 0.74, "step": 1906 }, { "epoch": 0.18851790524676865, "grad_norm": 5.016204710358484, "learning_rate": 1.8711294380623044e-07, "loss": 0.7538, "step": 1907 }, { "epoch": 0.1886167609915231, "grad_norm": 3.6876257342722556, "learning_rate": 1.8709721525575054e-07, "loss": 0.8652, "step": 1908 }, { "epoch": 0.1887156167362776, "grad_norm": 3.483514718884918, "learning_rate": 1.87081477774736e-07, "loss": 0.746, "step": 1909 }, { "epoch": 0.18881447248103206, "grad_norm": 3.809410292563556, "learning_rate": 1.8706573136480043e-07, "loss": 0.7447, "step": 1910 }, { "epoch": 0.18891332822578652, "grad_norm": 3.620845125978442, "learning_rate": 1.8704997602755842e-07, "loss": 0.8005, "step": 1911 }, { "epoch": 0.18901218397054098, "grad_norm": 3.501943072052065, "learning_rate": 1.8703421176462549e-07, "loss": 0.6845, "step": 1912 }, { "epoch": 0.18911103971529544, "grad_norm": 4.899741223158862, "learning_rate": 1.8701843857761796e-07, "loss": 0.6851, "step": 1913 }, { "epoch": 0.18920989546004993, "grad_norm": 7.057631324938306, "learning_rate": 1.8700265646815317e-07, "loss": 0.848, "step": 1914 }, { "epoch": 0.1893087512048044, "grad_norm": 5.148111888918845, "learning_rate": 1.8698686543784935e-07, "loss": 0.7295, "step": 1915 }, { "epoch": 0.18940760694955885, "grad_norm": 4.108260700235145, "learning_rate": 1.869710654883256e-07, "loss": 0.6357, "step": 1916 }, { "epoch": 0.18950646269431332, "grad_norm": 7.051704098816246, "learning_rate": 1.86955256621202e-07, "loss": 0.7892, "step": 1917 }, { "epoch": 0.18960531843906778, "grad_norm": 4.202887880591141, "learning_rate": 1.8693943883809953e-07, "loss": 0.803, "step": 1918 }, { "epoch": 0.18970417418382227, "grad_norm": 7.799982007759803, "learning_rate": 1.8692361214064004e-07, "loss": 0.6871, "step": 1919 }, { "epoch": 0.18980302992857673, "grad_norm": 4.983445244764989, "learning_rate": 1.8690777653044634e-07, "loss": 0.6644, "step": 1920 }, { "epoch": 0.1899018856733312, "grad_norm": 3.5499396652532353, "learning_rate": 1.8689193200914215e-07, "loss": 0.731, "step": 1921 }, { "epoch": 0.19000074141808565, "grad_norm": 8.074545821666383, "learning_rate": 1.8687607857835206e-07, "loss": 0.7884, "step": 1922 }, { "epoch": 0.19009959716284014, "grad_norm": 21.766083737473622, "learning_rate": 1.8686021623970163e-07, "loss": 0.757, "step": 1923 }, { "epoch": 0.1901984529075946, "grad_norm": 12.684671453147544, "learning_rate": 1.868443449948173e-07, "loss": 0.7505, "step": 1924 }, { "epoch": 0.19029730865234906, "grad_norm": 19.674597938538255, "learning_rate": 1.8682846484532644e-07, "loss": 0.727, "step": 1925 }, { "epoch": 0.19039616439710352, "grad_norm": 6.1356617369851305, "learning_rate": 1.8681257579285734e-07, "loss": 0.7033, "step": 1926 }, { "epoch": 0.19049502014185798, "grad_norm": 3.341396678996001, "learning_rate": 1.8679667783903917e-07, "loss": 0.6992, "step": 1927 }, { "epoch": 0.19059387588661247, "grad_norm": 3.074389685482824, "learning_rate": 1.8678077098550197e-07, "loss": 0.6631, "step": 1928 }, { "epoch": 0.19069273163136694, "grad_norm": 4.238446523740354, "learning_rate": 1.8676485523387686e-07, "loss": 0.7505, "step": 1929 }, { "epoch": 0.1907915873761214, "grad_norm": 3.783729317151618, "learning_rate": 1.8674893058579572e-07, "loss": 0.6987, "step": 1930 }, { "epoch": 0.19089044312087586, "grad_norm": 3.0395851080727287, "learning_rate": 1.8673299704289136e-07, "loss": 0.6899, "step": 1931 }, { "epoch": 0.19098929886563032, "grad_norm": 6.138223006987396, "learning_rate": 1.867170546067976e-07, "loss": 0.7533, "step": 1932 }, { "epoch": 0.1910881546103848, "grad_norm": 4.2118464175427235, "learning_rate": 1.86701103279149e-07, "loss": 0.7461, "step": 1933 }, { "epoch": 0.19118701035513927, "grad_norm": 9.11331141470458, "learning_rate": 1.866851430615812e-07, "loss": 0.6615, "step": 1934 }, { "epoch": 0.19128586609989373, "grad_norm": 9.654146727942761, "learning_rate": 1.866691739557307e-07, "loss": 0.7581, "step": 1935 }, { "epoch": 0.1913847218446482, "grad_norm": 12.265088307108782, "learning_rate": 1.8665319596323487e-07, "loss": 0.7688, "step": 1936 }, { "epoch": 0.19148357758940265, "grad_norm": 4.849643320705063, "learning_rate": 1.8663720908573199e-07, "loss": 0.623, "step": 1937 }, { "epoch": 0.19158243333415714, "grad_norm": 36.85656350290554, "learning_rate": 1.8662121332486135e-07, "loss": 0.7656, "step": 1938 }, { "epoch": 0.1916812890789116, "grad_norm": 21.62825651259929, "learning_rate": 1.8660520868226303e-07, "loss": 0.7854, "step": 1939 }, { "epoch": 0.19178014482366607, "grad_norm": 5.372182705836245, "learning_rate": 1.8658919515957804e-07, "loss": 0.6756, "step": 1940 }, { "epoch": 0.19187900056842053, "grad_norm": 4.017895334617562, "learning_rate": 1.8657317275844836e-07, "loss": 0.7287, "step": 1941 }, { "epoch": 0.191977856313175, "grad_norm": 4.21043627975122, "learning_rate": 1.865571414805169e-07, "loss": 0.7624, "step": 1942 }, { "epoch": 0.19207671205792948, "grad_norm": 9.389874270421053, "learning_rate": 1.865411013274274e-07, "loss": 0.6689, "step": 1943 }, { "epoch": 0.19217556780268394, "grad_norm": 6.9484055105973175, "learning_rate": 1.8652505230082448e-07, "loss": 0.8289, "step": 1944 }, { "epoch": 0.1922744235474384, "grad_norm": 9.64788591882521, "learning_rate": 1.865089944023538e-07, "loss": 0.76, "step": 1945 }, { "epoch": 0.19237327929219286, "grad_norm": 3.780502316695749, "learning_rate": 1.8649292763366185e-07, "loss": 0.8278, "step": 1946 }, { "epoch": 0.19247213503694732, "grad_norm": 5.222946234151323, "learning_rate": 1.8647685199639604e-07, "loss": 0.7218, "step": 1947 }, { "epoch": 0.1925709907817018, "grad_norm": 4.0293218875811085, "learning_rate": 1.8646076749220464e-07, "loss": 0.7352, "step": 1948 }, { "epoch": 0.19266984652645627, "grad_norm": 4.187860621454855, "learning_rate": 1.8644467412273696e-07, "loss": 0.669, "step": 1949 }, { "epoch": 0.19276870227121073, "grad_norm": 3.7720366068084745, "learning_rate": 1.8642857188964306e-07, "loss": 0.6443, "step": 1950 }, { "epoch": 0.1928675580159652, "grad_norm": 5.777924010325393, "learning_rate": 1.8641246079457407e-07, "loss": 0.6816, "step": 1951 }, { "epoch": 0.19296641376071966, "grad_norm": 5.836200292135517, "learning_rate": 1.8639634083918186e-07, "loss": 0.6988, "step": 1952 }, { "epoch": 0.19306526950547415, "grad_norm": 3.756071286363221, "learning_rate": 1.8638021202511935e-07, "loss": 0.6478, "step": 1953 }, { "epoch": 0.1931641252502286, "grad_norm": 4.355592454079265, "learning_rate": 1.863640743540403e-07, "loss": 0.7816, "step": 1954 }, { "epoch": 0.19326298099498307, "grad_norm": 5.051697416123819, "learning_rate": 1.8634792782759939e-07, "loss": 0.6211, "step": 1955 }, { "epoch": 0.19336183673973753, "grad_norm": 3.2188495907151133, "learning_rate": 1.863317724474522e-07, "loss": 0.7853, "step": 1956 }, { "epoch": 0.193460692484492, "grad_norm": 4.156959307316423, "learning_rate": 1.8631560821525523e-07, "loss": 0.7482, "step": 1957 }, { "epoch": 0.19355954822924648, "grad_norm": 3.662095110274648, "learning_rate": 1.8629943513266586e-07, "loss": 0.6833, "step": 1958 }, { "epoch": 0.19365840397400094, "grad_norm": 4.177477282943034, "learning_rate": 1.8628325320134245e-07, "loss": 0.7474, "step": 1959 }, { "epoch": 0.1937572597187554, "grad_norm": 8.999719021418775, "learning_rate": 1.8626706242294414e-07, "loss": 0.7092, "step": 1960 }, { "epoch": 0.19385611546350986, "grad_norm": 3.863179878292873, "learning_rate": 1.862508627991312e-07, "loss": 0.694, "step": 1961 }, { "epoch": 0.19395497120826435, "grad_norm": 7.25389030221628, "learning_rate": 1.862346543315645e-07, "loss": 0.6656, "step": 1962 }, { "epoch": 0.19405382695301882, "grad_norm": 3.011368458648499, "learning_rate": 1.8621843702190605e-07, "loss": 0.808, "step": 1963 }, { "epoch": 0.19415268269777328, "grad_norm": 5.078148150117684, "learning_rate": 1.8620221087181868e-07, "loss": 0.6292, "step": 1964 }, { "epoch": 0.19425153844252774, "grad_norm": 3.6537949752011603, "learning_rate": 1.8618597588296621e-07, "loss": 0.7502, "step": 1965 }, { "epoch": 0.1943503941872822, "grad_norm": 3.5930459606781397, "learning_rate": 1.8616973205701322e-07, "loss": 0.7732, "step": 1966 }, { "epoch": 0.1944492499320367, "grad_norm": 3.6873448276760135, "learning_rate": 1.861534793956253e-07, "loss": 0.6476, "step": 1967 }, { "epoch": 0.19454810567679115, "grad_norm": 3.5309316161982793, "learning_rate": 1.8613721790046893e-07, "loss": 0.8254, "step": 1968 }, { "epoch": 0.1946469614215456, "grad_norm": 5.0617123642870885, "learning_rate": 1.8612094757321146e-07, "loss": 0.7454, "step": 1969 }, { "epoch": 0.19474581716630007, "grad_norm": 21.50718598654768, "learning_rate": 1.8610466841552122e-07, "loss": 0.762, "step": 1970 }, { "epoch": 0.19484467291105453, "grad_norm": 5.682328387896488, "learning_rate": 1.8608838042906734e-07, "loss": 0.7942, "step": 1971 }, { "epoch": 0.19494352865580902, "grad_norm": 3.4986147379014816, "learning_rate": 1.8607208361551993e-07, "loss": 0.7663, "step": 1972 }, { "epoch": 0.19504238440056348, "grad_norm": 5.411974138464323, "learning_rate": 1.8605577797655002e-07, "loss": 0.5895, "step": 1973 }, { "epoch": 0.19514124014531795, "grad_norm": 8.903525364175954, "learning_rate": 1.860394635138295e-07, "loss": 0.7109, "step": 1974 }, { "epoch": 0.1952400958900724, "grad_norm": 3.478387702105189, "learning_rate": 1.8602314022903117e-07, "loss": 0.7293, "step": 1975 }, { "epoch": 0.19533895163482687, "grad_norm": 4.086198379982533, "learning_rate": 1.8600680812382872e-07, "loss": 0.7607, "step": 1976 }, { "epoch": 0.19543780737958136, "grad_norm": 3.33171134362502, "learning_rate": 1.8599046719989677e-07, "loss": 0.718, "step": 1977 }, { "epoch": 0.19553666312433582, "grad_norm": 3.9085603799658375, "learning_rate": 1.859741174589109e-07, "loss": 0.7634, "step": 1978 }, { "epoch": 0.19563551886909028, "grad_norm": 3.222983510646508, "learning_rate": 1.8595775890254749e-07, "loss": 0.717, "step": 1979 }, { "epoch": 0.19573437461384474, "grad_norm": 9.33255233547826, "learning_rate": 1.8594139153248382e-07, "loss": 0.7527, "step": 1980 }, { "epoch": 0.1958332303585992, "grad_norm": 3.564222216714175, "learning_rate": 1.8592501535039823e-07, "loss": 0.7394, "step": 1981 }, { "epoch": 0.1959320861033537, "grad_norm": 3.118912220548284, "learning_rate": 1.8590863035796976e-07, "loss": 0.8251, "step": 1982 }, { "epoch": 0.19603094184810815, "grad_norm": 4.926386829837075, "learning_rate": 1.8589223655687852e-07, "loss": 0.7798, "step": 1983 }, { "epoch": 0.19612979759286261, "grad_norm": 3.875115983833439, "learning_rate": 1.858758339488054e-07, "loss": 0.7311, "step": 1984 }, { "epoch": 0.19622865333761708, "grad_norm": 4.277079821104643, "learning_rate": 1.8585942253543226e-07, "loss": 0.7129, "step": 1985 }, { "epoch": 0.19632750908237154, "grad_norm": 4.202060299357318, "learning_rate": 1.8584300231844186e-07, "loss": 0.6502, "step": 1986 }, { "epoch": 0.19642636482712603, "grad_norm": 4.421674473782117, "learning_rate": 1.8582657329951786e-07, "loss": 0.6394, "step": 1987 }, { "epoch": 0.1965252205718805, "grad_norm": 4.549215536605262, "learning_rate": 1.8581013548034477e-07, "loss": 0.7185, "step": 1988 }, { "epoch": 0.19662407631663495, "grad_norm": 5.123613862132663, "learning_rate": 1.857936888626081e-07, "loss": 0.7048, "step": 1989 }, { "epoch": 0.1967229320613894, "grad_norm": 4.129078078543203, "learning_rate": 1.8577723344799417e-07, "loss": 0.7498, "step": 1990 }, { "epoch": 0.19682178780614387, "grad_norm": 4.655640894864318, "learning_rate": 1.8576076923819027e-07, "loss": 0.6881, "step": 1991 }, { "epoch": 0.19692064355089836, "grad_norm": 3.3859907568126655, "learning_rate": 1.8574429623488453e-07, "loss": 0.7849, "step": 1992 }, { "epoch": 0.19701949929565282, "grad_norm": 8.159999385393213, "learning_rate": 1.8572781443976603e-07, "loss": 0.7758, "step": 1993 }, { "epoch": 0.19711835504040728, "grad_norm": 3.231858111683805, "learning_rate": 1.8571132385452474e-07, "loss": 0.692, "step": 1994 }, { "epoch": 0.19721721078516175, "grad_norm": 3.666922421322966, "learning_rate": 1.8569482448085152e-07, "loss": 0.6934, "step": 1995 }, { "epoch": 0.1973160665299162, "grad_norm": 5.7330239538606165, "learning_rate": 1.856783163204381e-07, "loss": 0.7089, "step": 1996 }, { "epoch": 0.1974149222746707, "grad_norm": 5.456843455235043, "learning_rate": 1.8566179937497725e-07, "loss": 0.8004, "step": 1997 }, { "epoch": 0.19751377801942516, "grad_norm": 7.596041030306734, "learning_rate": 1.8564527364616245e-07, "loss": 0.743, "step": 1998 }, { "epoch": 0.19761263376417962, "grad_norm": 4.39477424418152, "learning_rate": 1.8562873913568817e-07, "loss": 0.7342, "step": 1999 }, { "epoch": 0.19771148950893408, "grad_norm": 4.908385848226004, "learning_rate": 1.856121958452498e-07, "loss": 0.7569, "step": 2000 }, { "epoch": 0.19781034525368857, "grad_norm": 5.459821876791214, "learning_rate": 1.8559564377654365e-07, "loss": 0.7871, "step": 2001 }, { "epoch": 0.19790920099844303, "grad_norm": 3.7794298240922912, "learning_rate": 1.8557908293126686e-07, "loss": 0.7838, "step": 2002 }, { "epoch": 0.1980080567431975, "grad_norm": 5.536526902226849, "learning_rate": 1.8556251331111746e-07, "loss": 0.715, "step": 2003 }, { "epoch": 0.19810691248795195, "grad_norm": 3.7771766620918386, "learning_rate": 1.855459349177945e-07, "loss": 0.617, "step": 2004 }, { "epoch": 0.19820576823270641, "grad_norm": 3.158765925520094, "learning_rate": 1.8552934775299777e-07, "loss": 0.6715, "step": 2005 }, { "epoch": 0.1983046239774609, "grad_norm": 4.031318975815699, "learning_rate": 1.8551275181842806e-07, "loss": 0.7263, "step": 2006 }, { "epoch": 0.19840347972221536, "grad_norm": 5.807040972288272, "learning_rate": 1.854961471157871e-07, "loss": 0.6296, "step": 2007 }, { "epoch": 0.19850233546696983, "grad_norm": 4.744984302603516, "learning_rate": 1.8547953364677738e-07, "loss": 0.7239, "step": 2008 }, { "epoch": 0.1986011912117243, "grad_norm": 8.198867899753633, "learning_rate": 1.854629114131024e-07, "loss": 0.7271, "step": 2009 }, { "epoch": 0.19870004695647875, "grad_norm": 4.316456517769032, "learning_rate": 1.8544628041646653e-07, "loss": 0.7086, "step": 2010 }, { "epoch": 0.19879890270123324, "grad_norm": 6.89464877790718, "learning_rate": 1.85429640658575e-07, "loss": 0.7465, "step": 2011 }, { "epoch": 0.1988977584459877, "grad_norm": 4.854001987489522, "learning_rate": 1.8541299214113405e-07, "loss": 0.795, "step": 2012 }, { "epoch": 0.19899661419074216, "grad_norm": 7.521661854532481, "learning_rate": 1.8539633486585063e-07, "loss": 0.7936, "step": 2013 }, { "epoch": 0.19909546993549662, "grad_norm": 4.234939822727864, "learning_rate": 1.8537966883443277e-07, "loss": 0.6846, "step": 2014 }, { "epoch": 0.19919432568025108, "grad_norm": 115.80350118368003, "learning_rate": 1.8536299404858933e-07, "loss": 0.6982, "step": 2015 }, { "epoch": 0.19929318142500557, "grad_norm": 2.677720720621416, "learning_rate": 1.8534631051003002e-07, "loss": 0.6713, "step": 2016 }, { "epoch": 0.19939203716976003, "grad_norm": 5.747486046378795, "learning_rate": 1.8532961822046552e-07, "loss": 0.6139, "step": 2017 }, { "epoch": 0.1994908929145145, "grad_norm": 3.8757676307567266, "learning_rate": 1.853129171816074e-07, "loss": 0.7421, "step": 2018 }, { "epoch": 0.19958974865926896, "grad_norm": 17.221514102630028, "learning_rate": 1.8529620739516806e-07, "loss": 0.7648, "step": 2019 }, { "epoch": 0.19968860440402342, "grad_norm": 4.257730799713445, "learning_rate": 1.8527948886286081e-07, "loss": 0.7028, "step": 2020 }, { "epoch": 0.1997874601487779, "grad_norm": 4.773551643875445, "learning_rate": 1.8526276158639997e-07, "loss": 0.747, "step": 2021 }, { "epoch": 0.19988631589353237, "grad_norm": 5.458688489622943, "learning_rate": 1.8524602556750067e-07, "loss": 0.6962, "step": 2022 }, { "epoch": 0.19998517163828683, "grad_norm": 5.381793198970702, "learning_rate": 1.8522928080787888e-07, "loss": 0.7362, "step": 2023 }, { "epoch": 0.2000840273830413, "grad_norm": 4.048573073555702, "learning_rate": 1.852125273092516e-07, "loss": 0.6398, "step": 2024 }, { "epoch": 0.20018288312779575, "grad_norm": 14.49924567761937, "learning_rate": 1.851957650733366e-07, "loss": 0.683, "step": 2025 }, { "epoch": 0.20028173887255024, "grad_norm": 7.416574550730637, "learning_rate": 1.851789941018526e-07, "loss": 0.789, "step": 2026 }, { "epoch": 0.2003805946173047, "grad_norm": 3.630509307082584, "learning_rate": 1.8516221439651927e-07, "loss": 0.7117, "step": 2027 }, { "epoch": 0.20047945036205916, "grad_norm": 15.585682846142296, "learning_rate": 1.8514542595905707e-07, "loss": 0.792, "step": 2028 }, { "epoch": 0.20057830610681363, "grad_norm": 2.7644625792571107, "learning_rate": 1.8512862879118742e-07, "loss": 0.628, "step": 2029 }, { "epoch": 0.2006771618515681, "grad_norm": 3.9225088005558213, "learning_rate": 1.851118228946326e-07, "loss": 0.7364, "step": 2030 }, { "epoch": 0.20077601759632258, "grad_norm": 4.363391086571473, "learning_rate": 1.8509500827111585e-07, "loss": 0.7564, "step": 2031 }, { "epoch": 0.20087487334107704, "grad_norm": 3.6919457556012456, "learning_rate": 1.8507818492236125e-07, "loss": 0.6936, "step": 2032 }, { "epoch": 0.2009737290858315, "grad_norm": 3.3602267770233554, "learning_rate": 1.8506135285009378e-07, "loss": 0.6135, "step": 2033 }, { "epoch": 0.20107258483058596, "grad_norm": 7.385664038669656, "learning_rate": 1.8504451205603928e-07, "loss": 0.7465, "step": 2034 }, { "epoch": 0.20117144057534042, "grad_norm": 3.9188951468791826, "learning_rate": 1.850276625419246e-07, "loss": 0.6843, "step": 2035 }, { "epoch": 0.2012702963200949, "grad_norm": 5.017566962052129, "learning_rate": 1.8501080430947735e-07, "loss": 0.7245, "step": 2036 }, { "epoch": 0.20136915206484937, "grad_norm": 4.628907637624656, "learning_rate": 1.8499393736042616e-07, "loss": 0.7355, "step": 2037 }, { "epoch": 0.20146800780960383, "grad_norm": 4.530533025640185, "learning_rate": 1.8497706169650038e-07, "loss": 0.8286, "step": 2038 }, { "epoch": 0.2015668635543583, "grad_norm": 3.7062807705538923, "learning_rate": 1.849601773194305e-07, "loss": 0.7194, "step": 2039 }, { "epoch": 0.20166571929911278, "grad_norm": 3.5619644661959073, "learning_rate": 1.849432842309476e-07, "loss": 0.7154, "step": 2040 }, { "epoch": 0.20176457504386724, "grad_norm": 5.554081810371467, "learning_rate": 1.8492638243278395e-07, "loss": 0.7267, "step": 2041 }, { "epoch": 0.2018634307886217, "grad_norm": 3.3862874961756493, "learning_rate": 1.849094719266725e-07, "loss": 0.7968, "step": 2042 }, { "epoch": 0.20196228653337617, "grad_norm": 11.152754608173318, "learning_rate": 1.8489255271434728e-07, "loss": 0.7983, "step": 2043 }, { "epoch": 0.20206114227813063, "grad_norm": 3.4220570371602004, "learning_rate": 1.8487562479754296e-07, "loss": 0.726, "step": 2044 }, { "epoch": 0.20215999802288512, "grad_norm": 4.314969863895004, "learning_rate": 1.8485868817799535e-07, "loss": 0.6704, "step": 2045 }, { "epoch": 0.20225885376763958, "grad_norm": 4.071755654413249, "learning_rate": 1.8484174285744102e-07, "loss": 0.8868, "step": 2046 }, { "epoch": 0.20235770951239404, "grad_norm": 6.439666685979803, "learning_rate": 1.8482478883761748e-07, "loss": 0.8431, "step": 2047 }, { "epoch": 0.2024565652571485, "grad_norm": 9.751228204612607, "learning_rate": 1.8480782612026306e-07, "loss": 0.7837, "step": 2048 }, { "epoch": 0.20255542100190296, "grad_norm": 3.5880327492709276, "learning_rate": 1.8479085470711714e-07, "loss": 0.8187, "step": 2049 }, { "epoch": 0.20265427674665745, "grad_norm": 4.055449412623008, "learning_rate": 1.8477387459991974e-07, "loss": 0.7319, "step": 2050 }, { "epoch": 0.2027531324914119, "grad_norm": 6.833578040595688, "learning_rate": 1.8475688580041204e-07, "loss": 0.7508, "step": 2051 }, { "epoch": 0.20285198823616638, "grad_norm": 4.344836122301642, "learning_rate": 1.8473988831033595e-07, "loss": 0.796, "step": 2052 }, { "epoch": 0.20295084398092084, "grad_norm": 4.925981567297614, "learning_rate": 1.8472288213143434e-07, "loss": 0.6863, "step": 2053 }, { "epoch": 0.2030496997256753, "grad_norm": 3.5608217943702996, "learning_rate": 1.847058672654509e-07, "loss": 0.7233, "step": 2054 }, { "epoch": 0.2031485554704298, "grad_norm": 3.563368266088548, "learning_rate": 1.8468884371413026e-07, "loss": 0.6431, "step": 2055 }, { "epoch": 0.20324741121518425, "grad_norm": 3.592625770041143, "learning_rate": 1.8467181147921795e-07, "loss": 0.745, "step": 2056 }, { "epoch": 0.2033462669599387, "grad_norm": 4.33495726175727, "learning_rate": 1.8465477056246036e-07, "loss": 0.8441, "step": 2057 }, { "epoch": 0.20344512270469317, "grad_norm": 4.953149981510225, "learning_rate": 1.8463772096560483e-07, "loss": 0.7599, "step": 2058 }, { "epoch": 0.20354397844944763, "grad_norm": 8.463998441983403, "learning_rate": 1.846206626903995e-07, "loss": 0.6967, "step": 2059 }, { "epoch": 0.20364283419420212, "grad_norm": 3.405542817435583, "learning_rate": 1.846035957385934e-07, "loss": 0.7417, "step": 2060 }, { "epoch": 0.20374168993895658, "grad_norm": 7.838212717808324, "learning_rate": 1.8458652011193664e-07, "loss": 0.7302, "step": 2061 }, { "epoch": 0.20384054568371104, "grad_norm": 4.915235171303165, "learning_rate": 1.8456943581217993e-07, "loss": 0.8382, "step": 2062 }, { "epoch": 0.2039394014284655, "grad_norm": 3.767450173957523, "learning_rate": 1.8455234284107504e-07, "loss": 0.6793, "step": 2063 }, { "epoch": 0.20403825717321997, "grad_norm": 5.223157479598452, "learning_rate": 1.8453524120037467e-07, "loss": 0.6745, "step": 2064 }, { "epoch": 0.20413711291797446, "grad_norm": 3.774702897314655, "learning_rate": 1.8451813089183226e-07, "loss": 0.7461, "step": 2065 }, { "epoch": 0.20423596866272892, "grad_norm": 19.412836236720153, "learning_rate": 1.8450101191720227e-07, "loss": 0.7119, "step": 2066 }, { "epoch": 0.20433482440748338, "grad_norm": 3.402853657796023, "learning_rate": 1.8448388427824e-07, "loss": 0.7962, "step": 2067 }, { "epoch": 0.20443368015223784, "grad_norm": 11.967001619721819, "learning_rate": 1.844667479767016e-07, "loss": 0.7935, "step": 2068 }, { "epoch": 0.2045325358969923, "grad_norm": 3.0368270693781247, "learning_rate": 1.844496030143442e-07, "loss": 0.7266, "step": 2069 }, { "epoch": 0.2046313916417468, "grad_norm": 18.462100944743057, "learning_rate": 1.844324493929257e-07, "loss": 0.8014, "step": 2070 }, { "epoch": 0.20473024738650125, "grad_norm": 4.75736371013612, "learning_rate": 1.8441528711420499e-07, "loss": 0.8248, "step": 2071 }, { "epoch": 0.2048291031312557, "grad_norm": 3.618245171947734, "learning_rate": 1.843981161799418e-07, "loss": 0.6217, "step": 2072 }, { "epoch": 0.20492795887601017, "grad_norm": 3.487991315167392, "learning_rate": 1.8438093659189677e-07, "loss": 0.7709, "step": 2073 }, { "epoch": 0.20502681462076464, "grad_norm": 4.352386651774669, "learning_rate": 1.843637483518314e-07, "loss": 0.6945, "step": 2074 }, { "epoch": 0.20512567036551912, "grad_norm": 6.420154417782353, "learning_rate": 1.8434655146150806e-07, "loss": 0.6558, "step": 2075 }, { "epoch": 0.2052245261102736, "grad_norm": 4.7312805085286485, "learning_rate": 1.843293459226901e-07, "loss": 0.7636, "step": 2076 }, { "epoch": 0.20532338185502805, "grad_norm": 3.659408825577001, "learning_rate": 1.843121317371416e-07, "loss": 0.7459, "step": 2077 }, { "epoch": 0.2054222375997825, "grad_norm": 5.559727190030357, "learning_rate": 1.8429490890662776e-07, "loss": 0.707, "step": 2078 }, { "epoch": 0.205521093344537, "grad_norm": 3.5705899234134786, "learning_rate": 1.842776774329144e-07, "loss": 0.7974, "step": 2079 }, { "epoch": 0.20561994908929146, "grad_norm": 3.2588898234167933, "learning_rate": 1.8426043731776844e-07, "loss": 0.7841, "step": 2080 }, { "epoch": 0.20571880483404592, "grad_norm": 4.492173142495916, "learning_rate": 1.8424318856295754e-07, "loss": 0.8034, "step": 2081 }, { "epoch": 0.20581766057880038, "grad_norm": 4.3932208529625845, "learning_rate": 1.8422593117025033e-07, "loss": 0.7288, "step": 2082 }, { "epoch": 0.20591651632355484, "grad_norm": 5.21394630006988, "learning_rate": 1.8420866514141626e-07, "loss": 0.6397, "step": 2083 }, { "epoch": 0.20601537206830933, "grad_norm": 3.5215545693706574, "learning_rate": 1.8419139047822577e-07, "loss": 0.6099, "step": 2084 }, { "epoch": 0.2061142278130638, "grad_norm": 18.00847661944974, "learning_rate": 1.841741071824501e-07, "loss": 0.7183, "step": 2085 }, { "epoch": 0.20621308355781826, "grad_norm": 6.8528321739097775, "learning_rate": 1.8415681525586143e-07, "loss": 0.7877, "step": 2086 }, { "epoch": 0.20631193930257272, "grad_norm": 5.111669500790089, "learning_rate": 1.8413951470023273e-07, "loss": 0.7644, "step": 2087 }, { "epoch": 0.20641079504732718, "grad_norm": 6.258121625823441, "learning_rate": 1.8412220551733793e-07, "loss": 0.8078, "step": 2088 }, { "epoch": 0.20650965079208167, "grad_norm": 8.997569899170418, "learning_rate": 1.8410488770895184e-07, "loss": 0.7153, "step": 2089 }, { "epoch": 0.20660850653683613, "grad_norm": 10.729357812280155, "learning_rate": 1.840875612768502e-07, "loss": 0.7624, "step": 2090 }, { "epoch": 0.2067073622815906, "grad_norm": 6.566868326808714, "learning_rate": 1.8407022622280948e-07, "loss": 0.7515, "step": 2091 }, { "epoch": 0.20680621802634505, "grad_norm": 45.55189711159325, "learning_rate": 1.8405288254860718e-07, "loss": 0.6676, "step": 2092 }, { "epoch": 0.2069050737710995, "grad_norm": 3.793282011424986, "learning_rate": 1.8403553025602167e-07, "loss": 0.7879, "step": 2093 }, { "epoch": 0.207003929515854, "grad_norm": 5.457850851607358, "learning_rate": 1.8401816934683212e-07, "loss": 0.7482, "step": 2094 }, { "epoch": 0.20710278526060846, "grad_norm": 3.3799164583540238, "learning_rate": 1.8400079982281868e-07, "loss": 0.7373, "step": 2095 }, { "epoch": 0.20720164100536292, "grad_norm": 4.767220049395196, "learning_rate": 1.839834216857623e-07, "loss": 0.8869, "step": 2096 }, { "epoch": 0.20730049675011739, "grad_norm": 4.142270179894253, "learning_rate": 1.8396603493744489e-07, "loss": 0.7879, "step": 2097 }, { "epoch": 0.20739935249487185, "grad_norm": 5.714830123396406, "learning_rate": 1.8394863957964917e-07, "loss": 0.7503, "step": 2098 }, { "epoch": 0.20749820823962634, "grad_norm": 3.5871396891535703, "learning_rate": 1.8393123561415877e-07, "loss": 0.797, "step": 2099 }, { "epoch": 0.2075970639843808, "grad_norm": 6.308185656596125, "learning_rate": 1.8391382304275826e-07, "loss": 0.7072, "step": 2100 }, { "epoch": 0.20769591972913526, "grad_norm": 5.525101712979395, "learning_rate": 1.83896401867233e-07, "loss": 0.764, "step": 2101 }, { "epoch": 0.20779477547388972, "grad_norm": 11.75651874126839, "learning_rate": 1.8387897208936931e-07, "loss": 0.6815, "step": 2102 }, { "epoch": 0.20789363121864418, "grad_norm": 6.980919786922329, "learning_rate": 1.838615337109543e-07, "loss": 0.7162, "step": 2103 }, { "epoch": 0.20799248696339867, "grad_norm": 3.5346411460993683, "learning_rate": 1.838440867337761e-07, "loss": 0.6865, "step": 2104 }, { "epoch": 0.20809134270815313, "grad_norm": 9.921758683914227, "learning_rate": 1.8382663115962352e-07, "loss": 0.7827, "step": 2105 }, { "epoch": 0.2081901984529076, "grad_norm": 4.6396282860501925, "learning_rate": 1.838091669902865e-07, "loss": 0.6941, "step": 2106 }, { "epoch": 0.20828905419766205, "grad_norm": 3.1688828878373907, "learning_rate": 1.8379169422755568e-07, "loss": 0.673, "step": 2107 }, { "epoch": 0.20838790994241652, "grad_norm": 4.756508313635189, "learning_rate": 1.8377421287322261e-07, "loss": 0.6044, "step": 2108 }, { "epoch": 0.208486765687171, "grad_norm": 6.79528214883216, "learning_rate": 1.8375672292907978e-07, "loss": 0.8208, "step": 2109 }, { "epoch": 0.20858562143192547, "grad_norm": 3.7616754942296913, "learning_rate": 1.8373922439692048e-07, "loss": 0.6928, "step": 2110 }, { "epoch": 0.20868447717667993, "grad_norm": 20.056356484600997, "learning_rate": 1.83721717278539e-07, "loss": 0.7198, "step": 2111 }, { "epoch": 0.2087833329214344, "grad_norm": 6.185995567145533, "learning_rate": 1.8370420157573036e-07, "loss": 0.7831, "step": 2112 }, { "epoch": 0.20888218866618885, "grad_norm": 12.977941978979313, "learning_rate": 1.836866772902906e-07, "loss": 0.7699, "step": 2113 }, { "epoch": 0.20898104441094334, "grad_norm": 3.675452620697515, "learning_rate": 1.8366914442401654e-07, "loss": 0.6464, "step": 2114 }, { "epoch": 0.2090799001556978, "grad_norm": 15.794609978215982, "learning_rate": 1.8365160297870593e-07, "loss": 0.6497, "step": 2115 }, { "epoch": 0.20917875590045226, "grad_norm": 5.76524704391002, "learning_rate": 1.8363405295615736e-07, "loss": 0.792, "step": 2116 }, { "epoch": 0.20927761164520672, "grad_norm": 3.7204594931732387, "learning_rate": 1.8361649435817038e-07, "loss": 0.7349, "step": 2117 }, { "epoch": 0.2093764673899612, "grad_norm": 4.199165174262191, "learning_rate": 1.835989271865453e-07, "loss": 0.7948, "step": 2118 }, { "epoch": 0.20947532313471567, "grad_norm": 4.517813469477539, "learning_rate": 1.8358135144308345e-07, "loss": 0.7161, "step": 2119 }, { "epoch": 0.20957417887947014, "grad_norm": 4.444002612523335, "learning_rate": 1.8356376712958687e-07, "loss": 0.7794, "step": 2120 }, { "epoch": 0.2096730346242246, "grad_norm": 4.641434700604214, "learning_rate": 1.8354617424785867e-07, "loss": 0.5881, "step": 2121 }, { "epoch": 0.20977189036897906, "grad_norm": 3.340730291696587, "learning_rate": 1.835285727997027e-07, "loss": 0.5884, "step": 2122 }, { "epoch": 0.20987074611373355, "grad_norm": 4.6119961131275335, "learning_rate": 1.8351096278692365e-07, "loss": 0.7077, "step": 2123 }, { "epoch": 0.209969601858488, "grad_norm": 3.2498345588957993, "learning_rate": 1.834933442113273e-07, "loss": 0.7009, "step": 2124 }, { "epoch": 0.21006845760324247, "grad_norm": 3.9087821897893225, "learning_rate": 1.834757170747201e-07, "loss": 0.7216, "step": 2125 }, { "epoch": 0.21016731334799693, "grad_norm": 3.8096058853236943, "learning_rate": 1.8345808137890947e-07, "loss": 0.6644, "step": 2126 }, { "epoch": 0.2102661690927514, "grad_norm": 3.1156212801168297, "learning_rate": 1.834404371257037e-07, "loss": 0.6929, "step": 2127 }, { "epoch": 0.21036502483750588, "grad_norm": 5.424580706755206, "learning_rate": 1.8342278431691192e-07, "loss": 0.7093, "step": 2128 }, { "epoch": 0.21046388058226034, "grad_norm": 14.967898638424883, "learning_rate": 1.8340512295434417e-07, "loss": 0.8553, "step": 2129 }, { "epoch": 0.2105627363270148, "grad_norm": 4.258055042342696, "learning_rate": 1.833874530398114e-07, "loss": 0.7685, "step": 2130 }, { "epoch": 0.21066159207176927, "grad_norm": 3.2329403046462635, "learning_rate": 1.8336977457512535e-07, "loss": 0.807, "step": 2131 }, { "epoch": 0.21076044781652373, "grad_norm": 4.793296715070051, "learning_rate": 1.8335208756209872e-07, "loss": 0.7893, "step": 2132 }, { "epoch": 0.21085930356127822, "grad_norm": 4.758018451015908, "learning_rate": 1.8333439200254505e-07, "loss": 0.7211, "step": 2133 }, { "epoch": 0.21095815930603268, "grad_norm": 2.96568219236192, "learning_rate": 1.8331668789827873e-07, "loss": 0.6547, "step": 2134 }, { "epoch": 0.21105701505078714, "grad_norm": 7.804179668992488, "learning_rate": 1.832989752511151e-07, "loss": 0.7053, "step": 2135 }, { "epoch": 0.2111558707955416, "grad_norm": 5.008373272229768, "learning_rate": 1.8328125406287026e-07, "loss": 0.7609, "step": 2136 }, { "epoch": 0.21125472654029606, "grad_norm": 3.5562725664885133, "learning_rate": 1.8326352433536135e-07, "loss": 0.6547, "step": 2137 }, { "epoch": 0.21135358228505055, "grad_norm": 2.482367584789509, "learning_rate": 1.832457860704062e-07, "loss": 0.6352, "step": 2138 }, { "epoch": 0.211452438029805, "grad_norm": 3.842954393673078, "learning_rate": 1.8322803926982368e-07, "loss": 0.6497, "step": 2139 }, { "epoch": 0.21155129377455947, "grad_norm": 3.6948440040270722, "learning_rate": 1.8321028393543344e-07, "loss": 0.8204, "step": 2140 }, { "epoch": 0.21165014951931393, "grad_norm": 3.655054804571866, "learning_rate": 1.8319252006905597e-07, "loss": 0.7908, "step": 2141 }, { "epoch": 0.2117490052640684, "grad_norm": 4.41506955968452, "learning_rate": 1.8317474767251277e-07, "loss": 0.6334, "step": 2142 }, { "epoch": 0.21184786100882289, "grad_norm": 6.528677333345299, "learning_rate": 1.8315696674762613e-07, "loss": 0.6998, "step": 2143 }, { "epoch": 0.21194671675357735, "grad_norm": 3.6893863536195544, "learning_rate": 1.8313917729621916e-07, "loss": 0.7459, "step": 2144 }, { "epoch": 0.2120455724983318, "grad_norm": 6.0064499056825165, "learning_rate": 1.83121379320116e-07, "loss": 0.7912, "step": 2145 }, { "epoch": 0.21214442824308627, "grad_norm": 7.23661761547919, "learning_rate": 1.8310357282114148e-07, "loss": 0.7834, "step": 2146 }, { "epoch": 0.21224328398784073, "grad_norm": 6.3072089601577, "learning_rate": 1.8308575780112143e-07, "loss": 0.7302, "step": 2147 }, { "epoch": 0.21234213973259522, "grad_norm": 3.44486679680075, "learning_rate": 1.8306793426188256e-07, "loss": 0.7526, "step": 2148 }, { "epoch": 0.21244099547734968, "grad_norm": 10.936906715756319, "learning_rate": 1.8305010220525232e-07, "loss": 0.7555, "step": 2149 }, { "epoch": 0.21253985122210414, "grad_norm": 3.937312233048026, "learning_rate": 1.830322616330592e-07, "loss": 0.7109, "step": 2150 }, { "epoch": 0.2126387069668586, "grad_norm": 78.99223907164522, "learning_rate": 1.8301441254713245e-07, "loss": 0.6832, "step": 2151 }, { "epoch": 0.21273756271161307, "grad_norm": 5.174156483408895, "learning_rate": 1.8299655494930225e-07, "loss": 0.693, "step": 2152 }, { "epoch": 0.21283641845636755, "grad_norm": 7.239649263197708, "learning_rate": 1.8297868884139964e-07, "loss": 0.5962, "step": 2153 }, { "epoch": 0.21293527420112202, "grad_norm": 5.7272916451380365, "learning_rate": 1.8296081422525648e-07, "loss": 0.655, "step": 2154 }, { "epoch": 0.21303412994587648, "grad_norm": 3.177520083211639, "learning_rate": 1.829429311027056e-07, "loss": 0.8213, "step": 2155 }, { "epoch": 0.21313298569063094, "grad_norm": 3.770164547246557, "learning_rate": 1.829250394755806e-07, "loss": 0.8228, "step": 2156 }, { "epoch": 0.21323184143538543, "grad_norm": 61.419029841970634, "learning_rate": 1.8290713934571609e-07, "loss": 0.7918, "step": 2157 }, { "epoch": 0.2133306971801399, "grad_norm": 5.344440704361083, "learning_rate": 1.8288923071494735e-07, "loss": 0.7851, "step": 2158 }, { "epoch": 0.21342955292489435, "grad_norm": 3.3540363263139494, "learning_rate": 1.8287131358511074e-07, "loss": 0.7482, "step": 2159 }, { "epoch": 0.2135284086696488, "grad_norm": 3.7330411823188365, "learning_rate": 1.8285338795804334e-07, "loss": 0.7143, "step": 2160 }, { "epoch": 0.21362726441440327, "grad_norm": 4.4114012223554555, "learning_rate": 1.828354538355832e-07, "loss": 0.6923, "step": 2161 }, { "epoch": 0.21372612015915776, "grad_norm": 6.746280681019429, "learning_rate": 1.8281751121956917e-07, "loss": 0.7905, "step": 2162 }, { "epoch": 0.21382497590391222, "grad_norm": 4.020923849067699, "learning_rate": 1.8279956011184097e-07, "loss": 0.7712, "step": 2163 }, { "epoch": 0.21392383164866668, "grad_norm": 7.997368122310902, "learning_rate": 1.827816005142393e-07, "loss": 0.7001, "step": 2164 }, { "epoch": 0.21402268739342115, "grad_norm": 4.631155727425905, "learning_rate": 1.827636324286056e-07, "loss": 0.7746, "step": 2165 }, { "epoch": 0.2141215431381756, "grad_norm": 8.748761337423502, "learning_rate": 1.8274565585678222e-07, "loss": 0.7391, "step": 2166 }, { "epoch": 0.2142203988829301, "grad_norm": 12.734442089521497, "learning_rate": 1.8272767080061246e-07, "loss": 0.7347, "step": 2167 }, { "epoch": 0.21431925462768456, "grad_norm": 6.428052270470054, "learning_rate": 1.8270967726194035e-07, "loss": 0.7977, "step": 2168 }, { "epoch": 0.21441811037243902, "grad_norm": 8.618949323725866, "learning_rate": 1.826916752426109e-07, "loss": 0.696, "step": 2169 }, { "epoch": 0.21451696611719348, "grad_norm": 3.2692778231826822, "learning_rate": 1.8267366474446995e-07, "loss": 0.6723, "step": 2170 }, { "epoch": 0.21461582186194794, "grad_norm": 23.356634351924736, "learning_rate": 1.826556457693642e-07, "loss": 0.8346, "step": 2171 }, { "epoch": 0.21471467760670243, "grad_norm": 4.906812233683138, "learning_rate": 1.8263761831914125e-07, "loss": 0.7472, "step": 2172 }, { "epoch": 0.2148135333514569, "grad_norm": 6.023416865971916, "learning_rate": 1.8261958239564948e-07, "loss": 0.7463, "step": 2173 }, { "epoch": 0.21491238909621135, "grad_norm": 2.7618035799401945, "learning_rate": 1.826015380007383e-07, "loss": 0.6531, "step": 2174 }, { "epoch": 0.21501124484096582, "grad_norm": 3.490987219972063, "learning_rate": 1.8258348513625785e-07, "loss": 0.7879, "step": 2175 }, { "epoch": 0.21511010058572028, "grad_norm": 3.3025358742403643, "learning_rate": 1.825654238040592e-07, "loss": 0.6393, "step": 2176 }, { "epoch": 0.21520895633047477, "grad_norm": 3.587798195825661, "learning_rate": 1.8254735400599423e-07, "loss": 0.7019, "step": 2177 }, { "epoch": 0.21530781207522923, "grad_norm": 6.84285102318608, "learning_rate": 1.8252927574391582e-07, "loss": 0.8288, "step": 2178 }, { "epoch": 0.2154066678199837, "grad_norm": 4.911645001686645, "learning_rate": 1.825111890196775e-07, "loss": 0.6884, "step": 2179 }, { "epoch": 0.21550552356473815, "grad_norm": 7.494434164130699, "learning_rate": 1.8249309383513395e-07, "loss": 0.6932, "step": 2180 }, { "epoch": 0.2156043793094926, "grad_norm": 3.943736510728258, "learning_rate": 1.8247499019214044e-07, "loss": 0.7381, "step": 2181 }, { "epoch": 0.2157032350542471, "grad_norm": 3.529616431930521, "learning_rate": 1.824568780925533e-07, "loss": 0.7876, "step": 2182 }, { "epoch": 0.21580209079900156, "grad_norm": 3.431223497160719, "learning_rate": 1.8243875753822962e-07, "loss": 0.6867, "step": 2183 }, { "epoch": 0.21590094654375602, "grad_norm": 33.86237858007758, "learning_rate": 1.8242062853102742e-07, "loss": 0.8048, "step": 2184 }, { "epoch": 0.21599980228851048, "grad_norm": 4.823276676023819, "learning_rate": 1.8240249107280555e-07, "loss": 0.7602, "step": 2185 }, { "epoch": 0.21609865803326495, "grad_norm": 3.5494244317799404, "learning_rate": 1.8238434516542372e-07, "loss": 0.7636, "step": 2186 }, { "epoch": 0.21619751377801943, "grad_norm": 5.038376640663892, "learning_rate": 1.8236619081074257e-07, "loss": 0.7625, "step": 2187 }, { "epoch": 0.2162963695227739, "grad_norm": 5.1634319073152914, "learning_rate": 1.8234802801062355e-07, "loss": 0.6188, "step": 2188 }, { "epoch": 0.21639522526752836, "grad_norm": 4.884125998810155, "learning_rate": 1.8232985676692896e-07, "loss": 0.7342, "step": 2189 }, { "epoch": 0.21649408101228282, "grad_norm": 22.110463935697503, "learning_rate": 1.8231167708152202e-07, "loss": 0.7291, "step": 2190 }, { "epoch": 0.21659293675703728, "grad_norm": 3.863379214738135, "learning_rate": 1.8229348895626677e-07, "loss": 0.7727, "step": 2191 }, { "epoch": 0.21669179250179177, "grad_norm": 25.23031394306092, "learning_rate": 1.8227529239302814e-07, "loss": 0.7705, "step": 2192 }, { "epoch": 0.21679064824654623, "grad_norm": 4.4999397945867505, "learning_rate": 1.8225708739367194e-07, "loss": 0.7869, "step": 2193 }, { "epoch": 0.2168895039913007, "grad_norm": 4.108292662260369, "learning_rate": 1.822388739600648e-07, "loss": 0.7164, "step": 2194 }, { "epoch": 0.21698835973605515, "grad_norm": 6.043449906829299, "learning_rate": 1.8222065209407421e-07, "loss": 0.744, "step": 2195 }, { "epoch": 0.21708721548080964, "grad_norm": 22.235348832982133, "learning_rate": 1.8220242179756863e-07, "loss": 0.8166, "step": 2196 }, { "epoch": 0.2171860712255641, "grad_norm": 4.339269132697207, "learning_rate": 1.8218418307241725e-07, "loss": 0.8546, "step": 2197 }, { "epoch": 0.21728492697031856, "grad_norm": 5.526685920232019, "learning_rate": 1.821659359204902e-07, "loss": 0.7148, "step": 2198 }, { "epoch": 0.21738378271507303, "grad_norm": 4.018299684782868, "learning_rate": 1.8214768034365845e-07, "loss": 0.7311, "step": 2199 }, { "epoch": 0.2174826384598275, "grad_norm": 4.841124887835498, "learning_rate": 1.8212941634379384e-07, "loss": 0.6494, "step": 2200 }, { "epoch": 0.21758149420458198, "grad_norm": 8.70689107260504, "learning_rate": 1.8211114392276907e-07, "loss": 0.7098, "step": 2201 }, { "epoch": 0.21768034994933644, "grad_norm": 4.740666919833462, "learning_rate": 1.8209286308245775e-07, "loss": 0.7426, "step": 2202 }, { "epoch": 0.2177792056940909, "grad_norm": 8.29916457451471, "learning_rate": 1.8207457382473426e-07, "loss": 0.6981, "step": 2203 }, { "epoch": 0.21787806143884536, "grad_norm": 3.199727396806589, "learning_rate": 1.820562761514739e-07, "loss": 0.6757, "step": 2204 }, { "epoch": 0.21797691718359982, "grad_norm": 4.331528777962173, "learning_rate": 1.8203797006455283e-07, "loss": 0.8289, "step": 2205 }, { "epoch": 0.2180757729283543, "grad_norm": 4.58579796110199, "learning_rate": 1.820196555658481e-07, "loss": 0.7684, "step": 2206 }, { "epoch": 0.21817462867310877, "grad_norm": 72.18523980245836, "learning_rate": 1.8200133265723756e-07, "loss": 0.7199, "step": 2207 }, { "epoch": 0.21827348441786323, "grad_norm": 2.9385078096013064, "learning_rate": 1.8198300134059994e-07, "loss": 0.8161, "step": 2208 }, { "epoch": 0.2183723401626177, "grad_norm": 8.5177614265311, "learning_rate": 1.8196466161781486e-07, "loss": 0.686, "step": 2209 }, { "epoch": 0.21847119590737216, "grad_norm": 4.580667717352341, "learning_rate": 1.8194631349076285e-07, "loss": 0.695, "step": 2210 }, { "epoch": 0.21857005165212665, "grad_norm": 3.56322058790244, "learning_rate": 1.8192795696132515e-07, "loss": 0.7411, "step": 2211 }, { "epoch": 0.2186689073968811, "grad_norm": 3.6400923621150367, "learning_rate": 1.8190959203138397e-07, "loss": 0.6811, "step": 2212 }, { "epoch": 0.21876776314163557, "grad_norm": 4.218950695708814, "learning_rate": 1.818912187028224e-07, "loss": 0.7433, "step": 2213 }, { "epoch": 0.21886661888639003, "grad_norm": 4.851248346687532, "learning_rate": 1.8187283697752432e-07, "loss": 0.721, "step": 2214 }, { "epoch": 0.2189654746311445, "grad_norm": 5.813175843892063, "learning_rate": 1.8185444685737452e-07, "loss": 0.7779, "step": 2215 }, { "epoch": 0.21906433037589898, "grad_norm": 6.5804617842054425, "learning_rate": 1.8183604834425865e-07, "loss": 0.6459, "step": 2216 }, { "epoch": 0.21916318612065344, "grad_norm": 6.433011453456284, "learning_rate": 1.8181764144006317e-07, "loss": 0.7164, "step": 2217 }, { "epoch": 0.2192620418654079, "grad_norm": 3.162140485708682, "learning_rate": 1.8179922614667548e-07, "loss": 0.8184, "step": 2218 }, { "epoch": 0.21936089761016236, "grad_norm": 4.0400889202512555, "learning_rate": 1.8178080246598372e-07, "loss": 0.7182, "step": 2219 }, { "epoch": 0.21945975335491683, "grad_norm": 5.040201357782888, "learning_rate": 1.8176237039987706e-07, "loss": 0.7525, "step": 2220 }, { "epoch": 0.21955860909967131, "grad_norm": 2.843395922223714, "learning_rate": 1.817439299502454e-07, "loss": 0.6454, "step": 2221 }, { "epoch": 0.21965746484442578, "grad_norm": 3.7689775570715414, "learning_rate": 1.817254811189795e-07, "loss": 0.7218, "step": 2222 }, { "epoch": 0.21975632058918024, "grad_norm": 3.4067846840258413, "learning_rate": 1.8170702390797107e-07, "loss": 0.6857, "step": 2223 }, { "epoch": 0.2198551763339347, "grad_norm": 4.411899866147998, "learning_rate": 1.8168855831911261e-07, "loss": 0.7113, "step": 2224 }, { "epoch": 0.21995403207868916, "grad_norm": 4.294685321888325, "learning_rate": 1.8167008435429747e-07, "loss": 0.7546, "step": 2225 }, { "epoch": 0.22005288782344365, "grad_norm": 7.83924496395221, "learning_rate": 1.816516020154199e-07, "loss": 0.6298, "step": 2226 }, { "epoch": 0.2201517435681981, "grad_norm": 5.0500929246539785, "learning_rate": 1.8163311130437496e-07, "loss": 0.783, "step": 2227 }, { "epoch": 0.22025059931295257, "grad_norm": 4.414900404650592, "learning_rate": 1.8161461222305866e-07, "loss": 0.6177, "step": 2228 }, { "epoch": 0.22034945505770703, "grad_norm": 6.038395985833523, "learning_rate": 1.815961047733678e-07, "loss": 0.7146, "step": 2229 }, { "epoch": 0.2204483108024615, "grad_norm": 5.794213359705727, "learning_rate": 1.8157758895719999e-07, "loss": 0.6268, "step": 2230 }, { "epoch": 0.22054716654721598, "grad_norm": 4.667437999322638, "learning_rate": 1.815590647764538e-07, "loss": 0.8057, "step": 2231 }, { "epoch": 0.22064602229197045, "grad_norm": 7.6866328362622, "learning_rate": 1.815405322330286e-07, "loss": 0.7478, "step": 2232 }, { "epoch": 0.2207448780367249, "grad_norm": 7.046859814249418, "learning_rate": 1.8152199132882462e-07, "loss": 0.796, "step": 2233 }, { "epoch": 0.22084373378147937, "grad_norm": 5.13464041961843, "learning_rate": 1.8150344206574296e-07, "loss": 0.834, "step": 2234 }, { "epoch": 0.22094258952623386, "grad_norm": 3.6557481382908774, "learning_rate": 1.8148488444568557e-07, "loss": 0.6793, "step": 2235 }, { "epoch": 0.22104144527098832, "grad_norm": 4.943340127729641, "learning_rate": 1.814663184705553e-07, "loss": 0.758, "step": 2236 }, { "epoch": 0.22114030101574278, "grad_norm": 9.404996615559199, "learning_rate": 1.8144774414225574e-07, "loss": 0.8042, "step": 2237 }, { "epoch": 0.22123915676049724, "grad_norm": 3.4347086295708706, "learning_rate": 1.814291614626915e-07, "loss": 0.7619, "step": 2238 }, { "epoch": 0.2213380125052517, "grad_norm": 4.723037044603319, "learning_rate": 1.8141057043376789e-07, "loss": 0.7029, "step": 2239 }, { "epoch": 0.2214368682500062, "grad_norm": 3.9063199651559506, "learning_rate": 1.8139197105739118e-07, "loss": 0.6983, "step": 2240 }, { "epoch": 0.22153572399476065, "grad_norm": 3.3986448765300294, "learning_rate": 1.8137336333546842e-07, "loss": 0.8021, "step": 2241 }, { "epoch": 0.22163457973951511, "grad_norm": 4.430915242472711, "learning_rate": 1.8135474726990764e-07, "loss": 0.6895, "step": 2242 }, { "epoch": 0.22173343548426958, "grad_norm": 3.583820231953911, "learning_rate": 1.8133612286261758e-07, "loss": 0.7229, "step": 2243 }, { "epoch": 0.22183229122902404, "grad_norm": 5.420435602322517, "learning_rate": 1.813174901155079e-07, "loss": 0.6538, "step": 2244 }, { "epoch": 0.22193114697377853, "grad_norm": 7.461733121188919, "learning_rate": 1.8129884903048913e-07, "loss": 0.713, "step": 2245 }, { "epoch": 0.222030002718533, "grad_norm": 7.302142482531213, "learning_rate": 1.8128019960947266e-07, "loss": 0.6695, "step": 2246 }, { "epoch": 0.22212885846328745, "grad_norm": 7.512341595921433, "learning_rate": 1.8126154185437066e-07, "loss": 0.6403, "step": 2247 }, { "epoch": 0.2222277142080419, "grad_norm": 11.401539791865352, "learning_rate": 1.8124287576709622e-07, "loss": 0.7208, "step": 2248 }, { "epoch": 0.22232656995279637, "grad_norm": 4.602699245826749, "learning_rate": 1.8122420134956332e-07, "loss": 0.677, "step": 2249 }, { "epoch": 0.22242542569755086, "grad_norm": 6.5364598443905715, "learning_rate": 1.8120551860368672e-07, "loss": 0.6142, "step": 2250 }, { "epoch": 0.22252428144230532, "grad_norm": 4.855845394062281, "learning_rate": 1.8118682753138202e-07, "loss": 0.7401, "step": 2251 }, { "epoch": 0.22262313718705978, "grad_norm": 3.7295662275728017, "learning_rate": 1.8116812813456575e-07, "loss": 0.6516, "step": 2252 }, { "epoch": 0.22272199293181424, "grad_norm": 11.614116434831002, "learning_rate": 1.8114942041515527e-07, "loss": 0.7183, "step": 2253 }, { "epoch": 0.2228208486765687, "grad_norm": 4.0489110019320735, "learning_rate": 1.8113070437506878e-07, "loss": 0.7919, "step": 2254 }, { "epoch": 0.2229197044213232, "grad_norm": 16.538539406264523, "learning_rate": 1.811119800162253e-07, "loss": 0.7205, "step": 2255 }, { "epoch": 0.22301856016607766, "grad_norm": 3.908051445386847, "learning_rate": 1.8109324734054474e-07, "loss": 0.7822, "step": 2256 }, { "epoch": 0.22311741591083212, "grad_norm": 29.725503523381, "learning_rate": 1.8107450634994788e-07, "loss": 0.6996, "step": 2257 }, { "epoch": 0.22321627165558658, "grad_norm": 4.326472687580419, "learning_rate": 1.8105575704635634e-07, "loss": 0.7383, "step": 2258 }, { "epoch": 0.22331512740034104, "grad_norm": 3.4411171326858963, "learning_rate": 1.8103699943169257e-07, "loss": 0.7789, "step": 2259 }, { "epoch": 0.22341398314509553, "grad_norm": 4.066362488064526, "learning_rate": 1.8101823350787988e-07, "loss": 0.7288, "step": 2260 }, { "epoch": 0.22351283888985, "grad_norm": 6.469017007670019, "learning_rate": 1.8099945927684246e-07, "loss": 0.6703, "step": 2261 }, { "epoch": 0.22361169463460445, "grad_norm": 4.047195437170868, "learning_rate": 1.809806767405053e-07, "loss": 0.7092, "step": 2262 }, { "epoch": 0.2237105503793589, "grad_norm": 7.197360406271074, "learning_rate": 1.809618859007943e-07, "loss": 0.6652, "step": 2263 }, { "epoch": 0.22380940612411337, "grad_norm": 5.703925639337483, "learning_rate": 1.8094308675963617e-07, "loss": 0.7526, "step": 2264 }, { "epoch": 0.22390826186886786, "grad_norm": 6.763971887388284, "learning_rate": 1.8092427931895848e-07, "loss": 0.706, "step": 2265 }, { "epoch": 0.22400711761362233, "grad_norm": 3.6348701586371193, "learning_rate": 1.809054635806897e-07, "loss": 0.7149, "step": 2266 }, { "epoch": 0.2241059733583768, "grad_norm": 5.826371786519916, "learning_rate": 1.80886639546759e-07, "loss": 0.6991, "step": 2267 }, { "epoch": 0.22420482910313125, "grad_norm": 4.788298193014567, "learning_rate": 1.8086780721909663e-07, "loss": 0.6116, "step": 2268 }, { "epoch": 0.2243036848478857, "grad_norm": 3.3951348734656994, "learning_rate": 1.808489665996335e-07, "loss": 0.6954, "step": 2269 }, { "epoch": 0.2244025405926402, "grad_norm": 3.0269984561207397, "learning_rate": 1.8083011769030145e-07, "loss": 0.6668, "step": 2270 }, { "epoch": 0.22450139633739466, "grad_norm": 5.399258655302236, "learning_rate": 1.8081126049303314e-07, "loss": 0.742, "step": 2271 }, { "epoch": 0.22460025208214912, "grad_norm": 4.375264209829578, "learning_rate": 1.8079239500976216e-07, "loss": 0.6423, "step": 2272 }, { "epoch": 0.22469910782690358, "grad_norm": 4.838705580265416, "learning_rate": 1.8077352124242283e-07, "loss": 0.714, "step": 2273 }, { "epoch": 0.22479796357165807, "grad_norm": 3.165851657635201, "learning_rate": 1.8075463919295037e-07, "loss": 0.7254, "step": 2274 }, { "epoch": 0.22489681931641253, "grad_norm": 4.710963170588682, "learning_rate": 1.807357488632809e-07, "loss": 0.6338, "step": 2275 }, { "epoch": 0.224995675061167, "grad_norm": 5.178372708029737, "learning_rate": 1.807168502553513e-07, "loss": 0.701, "step": 2276 }, { "epoch": 0.22509453080592146, "grad_norm": 4.107801873224287, "learning_rate": 1.8069794337109935e-07, "loss": 0.7084, "step": 2277 }, { "epoch": 0.22519338655067592, "grad_norm": 5.679366561806041, "learning_rate": 1.8067902821246373e-07, "loss": 0.6844, "step": 2278 }, { "epoch": 0.2252922422954304, "grad_norm": 6.59816162929532, "learning_rate": 1.8066010478138383e-07, "loss": 0.8134, "step": 2279 }, { "epoch": 0.22539109804018487, "grad_norm": 3.3273584325408856, "learning_rate": 1.806411730798e-07, "loss": 0.6035, "step": 2280 }, { "epoch": 0.22548995378493933, "grad_norm": 4.929975644030452, "learning_rate": 1.8062223310965346e-07, "loss": 0.7126, "step": 2281 }, { "epoch": 0.2255888095296938, "grad_norm": 4.837395406386643, "learning_rate": 1.8060328487288613e-07, "loss": 0.7736, "step": 2282 }, { "epoch": 0.22568766527444825, "grad_norm": 6.744421903709185, "learning_rate": 1.8058432837144094e-07, "loss": 0.7378, "step": 2283 }, { "epoch": 0.22578652101920274, "grad_norm": 11.331873234553592, "learning_rate": 1.8056536360726155e-07, "loss": 0.836, "step": 2284 }, { "epoch": 0.2258853767639572, "grad_norm": 7.034215524006916, "learning_rate": 1.8054639058229258e-07, "loss": 0.6975, "step": 2285 }, { "epoch": 0.22598423250871166, "grad_norm": 3.697899115301812, "learning_rate": 1.805274092984794e-07, "loss": 0.7234, "step": 2286 }, { "epoch": 0.22608308825346612, "grad_norm": 4.423837218013055, "learning_rate": 1.8050841975776822e-07, "loss": 0.7686, "step": 2287 }, { "epoch": 0.22618194399822059, "grad_norm": 17.753626327998095, "learning_rate": 1.804894219621062e-07, "loss": 0.7287, "step": 2288 }, { "epoch": 0.22628079974297508, "grad_norm": 3.9442940828468473, "learning_rate": 1.8047041591344128e-07, "loss": 0.8078, "step": 2289 }, { "epoch": 0.22637965548772954, "grad_norm": 6.0976288445848, "learning_rate": 1.804514016137222e-07, "loss": 0.6417, "step": 2290 }, { "epoch": 0.226478511232484, "grad_norm": 6.562205855032461, "learning_rate": 1.8043237906489864e-07, "loss": 0.7282, "step": 2291 }, { "epoch": 0.22657736697723846, "grad_norm": 17.434091864506833, "learning_rate": 1.8041334826892106e-07, "loss": 0.8158, "step": 2292 }, { "epoch": 0.22667622272199292, "grad_norm": 4.908122533680318, "learning_rate": 1.803943092277408e-07, "loss": 0.7187, "step": 2293 }, { "epoch": 0.2267750784667474, "grad_norm": 3.482336274221947, "learning_rate": 1.8037526194331001e-07, "loss": 0.7715, "step": 2294 }, { "epoch": 0.22687393421150187, "grad_norm": 3.9402111758658602, "learning_rate": 1.8035620641758175e-07, "loss": 0.6528, "step": 2295 }, { "epoch": 0.22697278995625633, "grad_norm": 5.392592682095414, "learning_rate": 1.8033714265250987e-07, "loss": 0.7167, "step": 2296 }, { "epoch": 0.2270716457010108, "grad_norm": 4.812434834058206, "learning_rate": 1.8031807065004906e-07, "loss": 0.7223, "step": 2297 }, { "epoch": 0.22717050144576525, "grad_norm": 5.080801626880967, "learning_rate": 1.8029899041215483e-07, "loss": 0.6515, "step": 2298 }, { "epoch": 0.22726935719051974, "grad_norm": 3.462825250797019, "learning_rate": 1.8027990194078367e-07, "loss": 0.7081, "step": 2299 }, { "epoch": 0.2273682129352742, "grad_norm": 3.671139786738081, "learning_rate": 1.8026080523789279e-07, "loss": 0.8255, "step": 2300 }, { "epoch": 0.22746706868002867, "grad_norm": 33.89805121425542, "learning_rate": 1.8024170030544027e-07, "loss": 0.5625, "step": 2301 }, { "epoch": 0.22756592442478313, "grad_norm": 3.526006402270299, "learning_rate": 1.80222587145385e-07, "loss": 0.7136, "step": 2302 }, { "epoch": 0.2276647801695376, "grad_norm": 3.3260083601149515, "learning_rate": 1.8020346575968679e-07, "loss": 0.6723, "step": 2303 }, { "epoch": 0.22776363591429208, "grad_norm": 3.14714515282056, "learning_rate": 1.8018433615030626e-07, "loss": 0.6868, "step": 2304 }, { "epoch": 0.22786249165904654, "grad_norm": 3.5529878995590622, "learning_rate": 1.8016519831920485e-07, "loss": 0.6026, "step": 2305 }, { "epoch": 0.227961347403801, "grad_norm": 5.320884115235474, "learning_rate": 1.801460522683449e-07, "loss": 0.7869, "step": 2306 }, { "epoch": 0.22806020314855546, "grad_norm": 4.165275900460274, "learning_rate": 1.801268979996895e-07, "loss": 0.7834, "step": 2307 }, { "epoch": 0.22815905889330992, "grad_norm": 3.2649217933485604, "learning_rate": 1.8010773551520266e-07, "loss": 0.8007, "step": 2308 }, { "epoch": 0.2282579146380644, "grad_norm": 2.8281277234448914, "learning_rate": 1.8008856481684922e-07, "loss": 0.8039, "step": 2309 }, { "epoch": 0.22835677038281887, "grad_norm": 3.7238205726417104, "learning_rate": 1.8006938590659484e-07, "loss": 0.7633, "step": 2310 }, { "epoch": 0.22845562612757334, "grad_norm": 3.256446732220314, "learning_rate": 1.800501987864061e-07, "loss": 0.7141, "step": 2311 }, { "epoch": 0.2285544818723278, "grad_norm": 5.506007748468183, "learning_rate": 1.8003100345825021e-07, "loss": 0.7768, "step": 2312 }, { "epoch": 0.2286533376170823, "grad_norm": 11.065627113870244, "learning_rate": 1.8001179992409552e-07, "loss": 0.7111, "step": 2313 }, { "epoch": 0.22875219336183675, "grad_norm": 31.425612251565877, "learning_rate": 1.7999258818591098e-07, "loss": 0.8612, "step": 2314 }, { "epoch": 0.2288510491065912, "grad_norm": 4.775154158123451, "learning_rate": 1.7997336824566648e-07, "loss": 0.7068, "step": 2315 }, { "epoch": 0.22894990485134567, "grad_norm": 6.043512788077851, "learning_rate": 1.7995414010533279e-07, "loss": 0.6667, "step": 2316 }, { "epoch": 0.22904876059610013, "grad_norm": 3.2202504544162602, "learning_rate": 1.7993490376688142e-07, "loss": 0.7319, "step": 2317 }, { "epoch": 0.22914761634085462, "grad_norm": 3.1033158319487675, "learning_rate": 1.799156592322848e-07, "loss": 0.6512, "step": 2318 }, { "epoch": 0.22924647208560908, "grad_norm": 2.7598582404283043, "learning_rate": 1.7989640650351617e-07, "loss": 0.6186, "step": 2319 }, { "epoch": 0.22934532783036354, "grad_norm": 4.966225277743665, "learning_rate": 1.798771455825496e-07, "loss": 0.7385, "step": 2320 }, { "epoch": 0.229444183575118, "grad_norm": 3.9072878009258236, "learning_rate": 1.7985787647135999e-07, "loss": 0.703, "step": 2321 }, { "epoch": 0.22954303931987247, "grad_norm": 3.6469386914853854, "learning_rate": 1.7983859917192318e-07, "loss": 0.71, "step": 2322 }, { "epoch": 0.22964189506462696, "grad_norm": 3.6043384478092997, "learning_rate": 1.798193136862157e-07, "loss": 0.724, "step": 2323 }, { "epoch": 0.22974075080938142, "grad_norm": 3.0602221544184474, "learning_rate": 1.7980002001621504e-07, "loss": 0.7766, "step": 2324 }, { "epoch": 0.22983960655413588, "grad_norm": 4.01702872145419, "learning_rate": 1.7978071816389947e-07, "loss": 0.6897, "step": 2325 }, { "epoch": 0.22993846229889034, "grad_norm": 13.66378925802652, "learning_rate": 1.7976140813124806e-07, "loss": 0.7134, "step": 2326 }, { "epoch": 0.2300373180436448, "grad_norm": 4.6438545677081455, "learning_rate": 1.7974208992024081e-07, "loss": 0.8275, "step": 2327 }, { "epoch": 0.2301361737883993, "grad_norm": 6.7419453227795385, "learning_rate": 1.7972276353285857e-07, "loss": 0.7333, "step": 2328 }, { "epoch": 0.23023502953315375, "grad_norm": 18.36546642927546, "learning_rate": 1.797034289710829e-07, "loss": 0.8672, "step": 2329 }, { "epoch": 0.2303338852779082, "grad_norm": 4.956906007056394, "learning_rate": 1.7968408623689627e-07, "loss": 0.7237, "step": 2330 }, { "epoch": 0.23043274102266267, "grad_norm": 3.0788188220679324, "learning_rate": 1.7966473533228203e-07, "loss": 0.6961, "step": 2331 }, { "epoch": 0.23053159676741714, "grad_norm": 7.094401340209455, "learning_rate": 1.7964537625922433e-07, "loss": 0.6539, "step": 2332 }, { "epoch": 0.23063045251217162, "grad_norm": 7.459589007777191, "learning_rate": 1.7962600901970816e-07, "loss": 0.7682, "step": 2333 }, { "epoch": 0.23072930825692609, "grad_norm": 3.3939079431698427, "learning_rate": 1.7960663361571933e-07, "loss": 0.7493, "step": 2334 }, { "epoch": 0.23082816400168055, "grad_norm": 4.226424166004542, "learning_rate": 1.795872500492445e-07, "loss": 0.7842, "step": 2335 }, { "epoch": 0.230927019746435, "grad_norm": 6.6913751353105635, "learning_rate": 1.7956785832227113e-07, "loss": 0.7799, "step": 2336 }, { "epoch": 0.23102587549118947, "grad_norm": 4.635867380265187, "learning_rate": 1.7954845843678763e-07, "loss": 0.6994, "step": 2337 }, { "epoch": 0.23112473123594396, "grad_norm": 3.504023533105126, "learning_rate": 1.7952905039478315e-07, "loss": 0.7593, "step": 2338 }, { "epoch": 0.23122358698069842, "grad_norm": 4.7814818653627045, "learning_rate": 1.7950963419824768e-07, "loss": 0.7856, "step": 2339 }, { "epoch": 0.23132244272545288, "grad_norm": 5.507026970149726, "learning_rate": 1.7949020984917204e-07, "loss": 0.7714, "step": 2340 }, { "epoch": 0.23142129847020734, "grad_norm": 4.654546938506628, "learning_rate": 1.79470777349548e-07, "loss": 0.7142, "step": 2341 }, { "epoch": 0.2315201542149618, "grad_norm": 4.898484545939503, "learning_rate": 1.7945133670136796e-07, "loss": 0.8338, "step": 2342 }, { "epoch": 0.2316190099597163, "grad_norm": 3.658022149572809, "learning_rate": 1.7943188790662537e-07, "loss": 0.7558, "step": 2343 }, { "epoch": 0.23171786570447075, "grad_norm": 3.9307864321389063, "learning_rate": 1.7941243096731437e-07, "loss": 0.6032, "step": 2344 }, { "epoch": 0.23181672144922522, "grad_norm": 7.731591997601774, "learning_rate": 1.7939296588542999e-07, "loss": 0.8005, "step": 2345 }, { "epoch": 0.23191557719397968, "grad_norm": 3.85219343279815, "learning_rate": 1.793734926629681e-07, "loss": 0.6628, "step": 2346 }, { "epoch": 0.23201443293873414, "grad_norm": 6.633923508841117, "learning_rate": 1.7935401130192536e-07, "loss": 0.7895, "step": 2347 }, { "epoch": 0.23211328868348863, "grad_norm": 5.36892087512545, "learning_rate": 1.7933452180429933e-07, "loss": 0.7296, "step": 2348 }, { "epoch": 0.2322121444282431, "grad_norm": 37.54816536924243, "learning_rate": 1.7931502417208834e-07, "loss": 0.6484, "step": 2349 }, { "epoch": 0.23231100017299755, "grad_norm": 5.0667391259538945, "learning_rate": 1.792955184072916e-07, "loss": 0.7387, "step": 2350 }, { "epoch": 0.232409855917752, "grad_norm": 3.1678264686625455, "learning_rate": 1.7927600451190914e-07, "loss": 0.7885, "step": 2351 }, { "epoch": 0.2325087116625065, "grad_norm": 3.7693286044633396, "learning_rate": 1.7925648248794185e-07, "loss": 0.7022, "step": 2352 }, { "epoch": 0.23260756740726096, "grad_norm": 4.289063185685198, "learning_rate": 1.7923695233739134e-07, "loss": 0.731, "step": 2353 }, { "epoch": 0.23270642315201542, "grad_norm": 3.906410943401662, "learning_rate": 1.7921741406226026e-07, "loss": 0.7678, "step": 2354 }, { "epoch": 0.23280527889676988, "grad_norm": 23.97322630909973, "learning_rate": 1.7919786766455186e-07, "loss": 0.6156, "step": 2355 }, { "epoch": 0.23290413464152435, "grad_norm": 8.939167568433776, "learning_rate": 1.7917831314627038e-07, "loss": 0.6808, "step": 2356 }, { "epoch": 0.23300299038627884, "grad_norm": 3.589132537500845, "learning_rate": 1.7915875050942088e-07, "loss": 0.6517, "step": 2357 }, { "epoch": 0.2331018461310333, "grad_norm": 4.162590781641654, "learning_rate": 1.7913917975600916e-07, "loss": 0.7593, "step": 2358 }, { "epoch": 0.23320070187578776, "grad_norm": 3.153506456844686, "learning_rate": 1.7911960088804198e-07, "loss": 0.7614, "step": 2359 }, { "epoch": 0.23329955762054222, "grad_norm": 4.068563217201816, "learning_rate": 1.791000139075268e-07, "loss": 0.7418, "step": 2360 }, { "epoch": 0.23339841336529668, "grad_norm": 11.211266619277687, "learning_rate": 1.7908041881647197e-07, "loss": 0.7118, "step": 2361 }, { "epoch": 0.23349726911005117, "grad_norm": 5.808212732113187, "learning_rate": 1.7906081561688675e-07, "loss": 0.7331, "step": 2362 }, { "epoch": 0.23359612485480563, "grad_norm": 3.6482394240581324, "learning_rate": 1.7904120431078112e-07, "loss": 0.6094, "step": 2363 }, { "epoch": 0.2336949805995601, "grad_norm": 5.829953011390809, "learning_rate": 1.790215849001659e-07, "loss": 0.7617, "step": 2364 }, { "epoch": 0.23379383634431455, "grad_norm": 3.731916947125293, "learning_rate": 1.7900195738705285e-07, "loss": 0.8435, "step": 2365 }, { "epoch": 0.23389269208906902, "grad_norm": 4.369194243787675, "learning_rate": 1.7898232177345438e-07, "loss": 0.7434, "step": 2366 }, { "epoch": 0.2339915478338235, "grad_norm": 4.052027796294193, "learning_rate": 1.789626780613839e-07, "loss": 0.6862, "step": 2367 }, { "epoch": 0.23409040357857797, "grad_norm": 3.01400142277909, "learning_rate": 1.7894302625285557e-07, "loss": 0.6983, "step": 2368 }, { "epoch": 0.23418925932333243, "grad_norm": 6.284158009203555, "learning_rate": 1.7892336634988437e-07, "loss": 0.6354, "step": 2369 }, { "epoch": 0.2342881150680869, "grad_norm": 7.339184832533638, "learning_rate": 1.789036983544862e-07, "loss": 0.762, "step": 2370 }, { "epoch": 0.23438697081284135, "grad_norm": 3.8723245608623165, "learning_rate": 1.788840222686776e-07, "loss": 0.78, "step": 2371 }, { "epoch": 0.23448582655759584, "grad_norm": 3.7307895594635796, "learning_rate": 1.788643380944762e-07, "loss": 0.736, "step": 2372 }, { "epoch": 0.2345846823023503, "grad_norm": 3.687147335894984, "learning_rate": 1.7884464583390022e-07, "loss": 0.8017, "step": 2373 }, { "epoch": 0.23468353804710476, "grad_norm": 13.830887503214711, "learning_rate": 1.7882494548896888e-07, "loss": 0.6776, "step": 2374 }, { "epoch": 0.23478239379185922, "grad_norm": 5.661337264896834, "learning_rate": 1.788052370617021e-07, "loss": 0.7208, "step": 2375 }, { "epoch": 0.23488124953661368, "grad_norm": 5.197337976405989, "learning_rate": 1.7878552055412075e-07, "loss": 0.6372, "step": 2376 }, { "epoch": 0.23498010528136817, "grad_norm": 3.9043732366999406, "learning_rate": 1.787657959682464e-07, "loss": 0.6534, "step": 2377 }, { "epoch": 0.23507896102612263, "grad_norm": 4.0907893114550005, "learning_rate": 1.7874606330610158e-07, "loss": 0.6868, "step": 2378 }, { "epoch": 0.2351778167708771, "grad_norm": 3.6462265429114384, "learning_rate": 1.7872632256970955e-07, "loss": 0.5791, "step": 2379 }, { "epoch": 0.23527667251563156, "grad_norm": 4.951975658320117, "learning_rate": 1.7870657376109444e-07, "loss": 0.6944, "step": 2380 }, { "epoch": 0.23537552826038602, "grad_norm": 2.942556561569832, "learning_rate": 1.7868681688228118e-07, "loss": 0.6583, "step": 2381 }, { "epoch": 0.2354743840051405, "grad_norm": 3.431049890053801, "learning_rate": 1.7866705193529557e-07, "loss": 0.7448, "step": 2382 }, { "epoch": 0.23557323974989497, "grad_norm": 9.564637596281061, "learning_rate": 1.7864727892216421e-07, "loss": 0.7544, "step": 2383 }, { "epoch": 0.23567209549464943, "grad_norm": 3.0651565406088457, "learning_rate": 1.7862749784491452e-07, "loss": 0.6529, "step": 2384 }, { "epoch": 0.2357709512394039, "grad_norm": 4.581361277514506, "learning_rate": 1.7860770870557475e-07, "loss": 0.7145, "step": 2385 }, { "epoch": 0.23586980698415835, "grad_norm": 6.469513367778448, "learning_rate": 1.78587911506174e-07, "loss": 0.7533, "step": 2386 }, { "epoch": 0.23596866272891284, "grad_norm": 3.850491056571681, "learning_rate": 1.7856810624874219e-07, "loss": 0.8474, "step": 2387 }, { "epoch": 0.2360675184736673, "grad_norm": 11.584442039658784, "learning_rate": 1.7854829293531005e-07, "loss": 0.7001, "step": 2388 }, { "epoch": 0.23616637421842177, "grad_norm": 11.027745221703094, "learning_rate": 1.7852847156790913e-07, "loss": 0.6828, "step": 2389 }, { "epoch": 0.23626522996317623, "grad_norm": 4.64465382324378, "learning_rate": 1.7850864214857183e-07, "loss": 0.6506, "step": 2390 }, { "epoch": 0.23636408570793072, "grad_norm": 3.1749441463210393, "learning_rate": 1.7848880467933136e-07, "loss": 0.722, "step": 2391 }, { "epoch": 0.23646294145268518, "grad_norm": 61.646353164000786, "learning_rate": 1.7846895916222176e-07, "loss": 0.767, "step": 2392 }, { "epoch": 0.23656179719743964, "grad_norm": 11.018486115030829, "learning_rate": 1.7844910559927787e-07, "loss": 0.7252, "step": 2393 }, { "epoch": 0.2366606529421941, "grad_norm": 4.284640556283989, "learning_rate": 1.7842924399253544e-07, "loss": 0.6773, "step": 2394 }, { "epoch": 0.23675950868694856, "grad_norm": 7.13147733163538, "learning_rate": 1.7840937434403095e-07, "loss": 0.721, "step": 2395 }, { "epoch": 0.23685836443170305, "grad_norm": 5.642915635528565, "learning_rate": 1.7838949665580176e-07, "loss": 0.6723, "step": 2396 }, { "epoch": 0.2369572201764575, "grad_norm": 4.1914069601372725, "learning_rate": 1.78369610929886e-07, "loss": 0.6639, "step": 2397 }, { "epoch": 0.23705607592121197, "grad_norm": 9.1931913207487, "learning_rate": 1.7834971716832264e-07, "loss": 0.5895, "step": 2398 }, { "epoch": 0.23715493166596643, "grad_norm": 9.85802401041135, "learning_rate": 1.7832981537315155e-07, "loss": 0.7286, "step": 2399 }, { "epoch": 0.2372537874107209, "grad_norm": 4.0438282666943595, "learning_rate": 1.7830990554641332e-07, "loss": 0.7264, "step": 2400 }, { "epoch": 0.23735264315547538, "grad_norm": 4.586859428760961, "learning_rate": 1.7828998769014944e-07, "loss": 0.7917, "step": 2401 }, { "epoch": 0.23745149890022985, "grad_norm": 3.58116537807087, "learning_rate": 1.7827006180640218e-07, "loss": 0.698, "step": 2402 }, { "epoch": 0.2375503546449843, "grad_norm": 18.235971692408402, "learning_rate": 1.7825012789721468e-07, "loss": 0.7142, "step": 2403 }, { "epoch": 0.23764921038973877, "grad_norm": 6.129467137853977, "learning_rate": 1.7823018596463083e-07, "loss": 0.7401, "step": 2404 }, { "epoch": 0.23774806613449323, "grad_norm": 6.113230941695234, "learning_rate": 1.7821023601069534e-07, "loss": 0.6702, "step": 2405 }, { "epoch": 0.23784692187924772, "grad_norm": 3.593330461486022, "learning_rate": 1.7819027803745382e-07, "loss": 0.6313, "step": 2406 }, { "epoch": 0.23794577762400218, "grad_norm": 4.848387348189897, "learning_rate": 1.7817031204695274e-07, "loss": 0.7, "step": 2407 }, { "epoch": 0.23804463336875664, "grad_norm": 4.165376422912614, "learning_rate": 1.781503380412392e-07, "loss": 0.7617, "step": 2408 }, { "epoch": 0.2381434891135111, "grad_norm": 3.207873659842536, "learning_rate": 1.7813035602236134e-07, "loss": 0.6821, "step": 2409 }, { "epoch": 0.23824234485826556, "grad_norm": 9.297991517936739, "learning_rate": 1.7811036599236792e-07, "loss": 0.7983, "step": 2410 }, { "epoch": 0.23834120060302005, "grad_norm": 9.434388126478582, "learning_rate": 1.780903679533087e-07, "loss": 0.6749, "step": 2411 }, { "epoch": 0.23844005634777451, "grad_norm": 4.578553844885686, "learning_rate": 1.7807036190723416e-07, "loss": 0.7396, "step": 2412 }, { "epoch": 0.23853891209252898, "grad_norm": 4.770548581920049, "learning_rate": 1.7805034785619567e-07, "loss": 0.7183, "step": 2413 }, { "epoch": 0.23863776783728344, "grad_norm": 5.508829134442424, "learning_rate": 1.7803032580224526e-07, "loss": 0.6187, "step": 2414 }, { "epoch": 0.2387366235820379, "grad_norm": 5.452310724083021, "learning_rate": 1.78010295747436e-07, "loss": 0.6974, "step": 2415 }, { "epoch": 0.2388354793267924, "grad_norm": 4.433225706780586, "learning_rate": 1.7799025769382163e-07, "loss": 0.7491, "step": 2416 }, { "epoch": 0.23893433507154685, "grad_norm": 4.895244287104808, "learning_rate": 1.779702116434568e-07, "loss": 0.8103, "step": 2417 }, { "epoch": 0.2390331908163013, "grad_norm": 3.317456281218132, "learning_rate": 1.7795015759839687e-07, "loss": 0.7614, "step": 2418 }, { "epoch": 0.23913204656105577, "grad_norm": 5.929703900740601, "learning_rate": 1.7793009556069815e-07, "loss": 0.7896, "step": 2419 }, { "epoch": 0.23923090230581023, "grad_norm": 3.748422486996989, "learning_rate": 1.7791002553241764e-07, "loss": 0.8026, "step": 2420 }, { "epoch": 0.23932975805056472, "grad_norm": 5.378306172576823, "learning_rate": 1.7788994751561331e-07, "loss": 0.8009, "step": 2421 }, { "epoch": 0.23942861379531918, "grad_norm": 3.143932157198784, "learning_rate": 1.7786986151234383e-07, "loss": 0.6877, "step": 2422 }, { "epoch": 0.23952746954007365, "grad_norm": 4.719441353736738, "learning_rate": 1.778497675246687e-07, "loss": 0.7498, "step": 2423 }, { "epoch": 0.2396263252848281, "grad_norm": 4.461989297266671, "learning_rate": 1.7782966555464824e-07, "loss": 0.7741, "step": 2424 }, { "epoch": 0.23972518102958257, "grad_norm": 2.7239360153355197, "learning_rate": 1.778095556043437e-07, "loss": 0.7333, "step": 2425 }, { "epoch": 0.23982403677433706, "grad_norm": 8.343779775782885, "learning_rate": 1.7778943767581696e-07, "loss": 0.7103, "step": 2426 }, { "epoch": 0.23992289251909152, "grad_norm": 3.494465602430989, "learning_rate": 1.7776931177113088e-07, "loss": 0.7738, "step": 2427 }, { "epoch": 0.24002174826384598, "grad_norm": 4.910352088982755, "learning_rate": 1.7774917789234904e-07, "loss": 0.7845, "step": 2428 }, { "epoch": 0.24012060400860044, "grad_norm": 6.657978956847763, "learning_rate": 1.7772903604153592e-07, "loss": 0.6877, "step": 2429 }, { "epoch": 0.24021945975335493, "grad_norm": 9.401235725437902, "learning_rate": 1.777088862207567e-07, "loss": 0.7322, "step": 2430 }, { "epoch": 0.2403183154981094, "grad_norm": 6.20288001572692, "learning_rate": 1.776887284320775e-07, "loss": 0.7323, "step": 2431 }, { "epoch": 0.24041717124286385, "grad_norm": 8.846944423796538, "learning_rate": 1.776685626775652e-07, "loss": 0.6497, "step": 2432 }, { "epoch": 0.24051602698761831, "grad_norm": 8.760498628095192, "learning_rate": 1.7764838895928748e-07, "loss": 0.6188, "step": 2433 }, { "epoch": 0.24061488273237278, "grad_norm": 5.550861172900912, "learning_rate": 1.776282072793129e-07, "loss": 0.7082, "step": 2434 }, { "epoch": 0.24071373847712726, "grad_norm": 12.843868488854449, "learning_rate": 1.7760801763971072e-07, "loss": 0.6888, "step": 2435 }, { "epoch": 0.24081259422188173, "grad_norm": 6.212730071033395, "learning_rate": 1.7758782004255112e-07, "loss": 0.8031, "step": 2436 }, { "epoch": 0.2409114499666362, "grad_norm": 3.68952862099971, "learning_rate": 1.775676144899051e-07, "loss": 0.6585, "step": 2437 }, { "epoch": 0.24101030571139065, "grad_norm": 4.278970502269807, "learning_rate": 1.7754740098384443e-07, "loss": 0.7585, "step": 2438 }, { "epoch": 0.2411091614561451, "grad_norm": 3.6180778406406846, "learning_rate": 1.7752717952644166e-07, "loss": 0.7455, "step": 2439 }, { "epoch": 0.2412080172008996, "grad_norm": 5.74139252973786, "learning_rate": 1.7750695011977025e-07, "loss": 0.657, "step": 2440 }, { "epoch": 0.24130687294565406, "grad_norm": 4.296857508256599, "learning_rate": 1.7748671276590442e-07, "loss": 0.6487, "step": 2441 }, { "epoch": 0.24140572869040852, "grad_norm": 3.3868148381179894, "learning_rate": 1.774664674669192e-07, "loss": 0.802, "step": 2442 }, { "epoch": 0.24150458443516298, "grad_norm": 3.414264655151745, "learning_rate": 1.7744621422489045e-07, "loss": 0.7315, "step": 2443 }, { "epoch": 0.24160344017991744, "grad_norm": 4.118589148267608, "learning_rate": 1.7742595304189484e-07, "loss": 0.6053, "step": 2444 }, { "epoch": 0.24170229592467193, "grad_norm": 3.138085187201764, "learning_rate": 1.7740568392000987e-07, "loss": 0.6863, "step": 2445 }, { "epoch": 0.2418011516694264, "grad_norm": 4.705837508606869, "learning_rate": 1.7738540686131383e-07, "loss": 0.6122, "step": 2446 }, { "epoch": 0.24190000741418086, "grad_norm": 3.897090698443874, "learning_rate": 1.773651218678858e-07, "loss": 0.6448, "step": 2447 }, { "epoch": 0.24199886315893532, "grad_norm": 3.878911662684798, "learning_rate": 1.7734482894180579e-07, "loss": 0.6736, "step": 2448 }, { "epoch": 0.24209771890368978, "grad_norm": 3.995836069155526, "learning_rate": 1.7732452808515444e-07, "loss": 0.7111, "step": 2449 }, { "epoch": 0.24219657464844427, "grad_norm": 6.906842004693163, "learning_rate": 1.773042193000134e-07, "loss": 0.7113, "step": 2450 }, { "epoch": 0.24229543039319873, "grad_norm": 7.173488825434043, "learning_rate": 1.7728390258846492e-07, "loss": 0.6916, "step": 2451 }, { "epoch": 0.2423942861379532, "grad_norm": 8.342601355042884, "learning_rate": 1.772635779525923e-07, "loss": 0.7219, "step": 2452 }, { "epoch": 0.24249314188270765, "grad_norm": 3.724555406583542, "learning_rate": 1.7724324539447948e-07, "loss": 0.8007, "step": 2453 }, { "epoch": 0.2425919976274621, "grad_norm": 3.7456644729109363, "learning_rate": 1.7722290491621123e-07, "loss": 0.6959, "step": 2454 }, { "epoch": 0.2426908533722166, "grad_norm": 5.078708952891566, "learning_rate": 1.7720255651987323e-07, "loss": 0.6996, "step": 2455 }, { "epoch": 0.24278970911697106, "grad_norm": 4.744083328166485, "learning_rate": 1.7718220020755186e-07, "loss": 0.6636, "step": 2456 }, { "epoch": 0.24288856486172553, "grad_norm": 4.978765425272639, "learning_rate": 1.7716183598133438e-07, "loss": 0.7324, "step": 2457 }, { "epoch": 0.24298742060648, "grad_norm": 2.944275704610409, "learning_rate": 1.7714146384330886e-07, "loss": 0.6906, "step": 2458 }, { "epoch": 0.24308627635123445, "grad_norm": 3.8385708481468694, "learning_rate": 1.771210837955641e-07, "loss": 0.6235, "step": 2459 }, { "epoch": 0.24318513209598894, "grad_norm": 4.758588526650373, "learning_rate": 1.7710069584018982e-07, "loss": 0.7568, "step": 2460 }, { "epoch": 0.2432839878407434, "grad_norm": 3.4259530743689446, "learning_rate": 1.7708029997927652e-07, "loss": 0.8023, "step": 2461 }, { "epoch": 0.24338284358549786, "grad_norm": 4.061480294838327, "learning_rate": 1.7705989621491546e-07, "loss": 0.7084, "step": 2462 }, { "epoch": 0.24348169933025232, "grad_norm": 8.376766382646403, "learning_rate": 1.7703948454919876e-07, "loss": 0.7411, "step": 2463 }, { "epoch": 0.24358055507500678, "grad_norm": 3.65852939562364, "learning_rate": 1.7701906498421933e-07, "loss": 0.6834, "step": 2464 }, { "epoch": 0.24367941081976127, "grad_norm": 6.140092843402329, "learning_rate": 1.769986375220709e-07, "loss": 0.6963, "step": 2465 }, { "epoch": 0.24377826656451573, "grad_norm": 4.230968968577057, "learning_rate": 1.7697820216484797e-07, "loss": 0.6845, "step": 2466 }, { "epoch": 0.2438771223092702, "grad_norm": 5.737229580318362, "learning_rate": 1.7695775891464596e-07, "loss": 0.7439, "step": 2467 }, { "epoch": 0.24397597805402466, "grad_norm": 3.982041588088474, "learning_rate": 1.7693730777356096e-07, "loss": 0.8045, "step": 2468 }, { "epoch": 0.24407483379877914, "grad_norm": 4.329869812743968, "learning_rate": 1.7691684874368995e-07, "loss": 0.7624, "step": 2469 }, { "epoch": 0.2441736895435336, "grad_norm": 9.156272479026446, "learning_rate": 1.7689638182713072e-07, "loss": 0.6536, "step": 2470 }, { "epoch": 0.24427254528828807, "grad_norm": 4.581996162099595, "learning_rate": 1.7687590702598182e-07, "loss": 0.7165, "step": 2471 }, { "epoch": 0.24437140103304253, "grad_norm": 3.4468341883897367, "learning_rate": 1.7685542434234267e-07, "loss": 0.6788, "step": 2472 }, { "epoch": 0.244470256777797, "grad_norm": 3.612944149748399, "learning_rate": 1.7683493377831343e-07, "loss": 0.7927, "step": 2473 }, { "epoch": 0.24456911252255148, "grad_norm": 3.2185652520044927, "learning_rate": 1.7681443533599513e-07, "loss": 0.7547, "step": 2474 }, { "epoch": 0.24466796826730594, "grad_norm": 3.602920615066137, "learning_rate": 1.7679392901748958e-07, "loss": 0.8225, "step": 2475 }, { "epoch": 0.2447668240120604, "grad_norm": 4.282765241008579, "learning_rate": 1.7677341482489943e-07, "loss": 0.6793, "step": 2476 }, { "epoch": 0.24486567975681486, "grad_norm": 4.398788648639304, "learning_rate": 1.7675289276032803e-07, "loss": 0.7702, "step": 2477 }, { "epoch": 0.24496453550156932, "grad_norm": 7.839821873873161, "learning_rate": 1.7673236282587972e-07, "loss": 0.7816, "step": 2478 }, { "epoch": 0.24506339124632381, "grad_norm": 10.692733943743454, "learning_rate": 1.7671182502365943e-07, "loss": 0.6693, "step": 2479 }, { "epoch": 0.24516224699107828, "grad_norm": 3.4116748290941445, "learning_rate": 1.766912793557731e-07, "loss": 0.7076, "step": 2480 }, { "epoch": 0.24526110273583274, "grad_norm": 19.053769537014283, "learning_rate": 1.7667072582432732e-07, "loss": 0.7422, "step": 2481 }, { "epoch": 0.2453599584805872, "grad_norm": 3.4981970208898945, "learning_rate": 1.7665016443142958e-07, "loss": 0.6751, "step": 2482 }, { "epoch": 0.24545881422534166, "grad_norm": 3.1412536625789573, "learning_rate": 1.7662959517918815e-07, "loss": 0.6647, "step": 2483 }, { "epoch": 0.24555766997009615, "grad_norm": 3.2647934372260257, "learning_rate": 1.766090180697121e-07, "loss": 0.8186, "step": 2484 }, { "epoch": 0.2456565257148506, "grad_norm": 6.946925242548534, "learning_rate": 1.7658843310511134e-07, "loss": 0.7262, "step": 2485 }, { "epoch": 0.24575538145960507, "grad_norm": 8.889455891869993, "learning_rate": 1.7656784028749648e-07, "loss": 0.6679, "step": 2486 }, { "epoch": 0.24585423720435953, "grad_norm": 11.410057643375843, "learning_rate": 1.7654723961897909e-07, "loss": 0.715, "step": 2487 }, { "epoch": 0.245953092949114, "grad_norm": 3.320585726937451, "learning_rate": 1.7652663110167142e-07, "loss": 0.6114, "step": 2488 }, { "epoch": 0.24605194869386848, "grad_norm": 5.625502555218986, "learning_rate": 1.7650601473768653e-07, "loss": 0.6662, "step": 2489 }, { "epoch": 0.24615080443862294, "grad_norm": 4.336177650344499, "learning_rate": 1.7648539052913843e-07, "loss": 0.738, "step": 2490 }, { "epoch": 0.2462496601833774, "grad_norm": 3.671404396096694, "learning_rate": 1.7646475847814174e-07, "loss": 0.777, "step": 2491 }, { "epoch": 0.24634851592813187, "grad_norm": 4.7663528149013255, "learning_rate": 1.7644411858681203e-07, "loss": 0.7318, "step": 2492 }, { "epoch": 0.24644737167288633, "grad_norm": 3.5098593616931124, "learning_rate": 1.7642347085726556e-07, "loss": 0.7514, "step": 2493 }, { "epoch": 0.24654622741764082, "grad_norm": 7.9922739981760955, "learning_rate": 1.7640281529161948e-07, "loss": 0.6086, "step": 2494 }, { "epoch": 0.24664508316239528, "grad_norm": 5.38458925551755, "learning_rate": 1.7638215189199175e-07, "loss": 0.5735, "step": 2495 }, { "epoch": 0.24674393890714974, "grad_norm": 4.183484775840916, "learning_rate": 1.7636148066050104e-07, "loss": 0.68, "step": 2496 }, { "epoch": 0.2468427946519042, "grad_norm": 8.62483509107993, "learning_rate": 1.7634080159926689e-07, "loss": 0.7281, "step": 2497 }, { "epoch": 0.24694165039665866, "grad_norm": 5.1621187931689985, "learning_rate": 1.7632011471040965e-07, "loss": 0.7217, "step": 2498 }, { "epoch": 0.24704050614141315, "grad_norm": 9.356829092901002, "learning_rate": 1.7629941999605044e-07, "loss": 0.7168, "step": 2499 }, { "epoch": 0.2471393618861676, "grad_norm": 4.5730549708997295, "learning_rate": 1.7627871745831123e-07, "loss": 0.7306, "step": 2500 }, { "epoch": 0.24723821763092207, "grad_norm": 4.2724531875057545, "learning_rate": 1.7625800709931468e-07, "loss": 0.6544, "step": 2501 }, { "epoch": 0.24733707337567654, "grad_norm": 3.794592541830703, "learning_rate": 1.7623728892118444e-07, "loss": 0.696, "step": 2502 }, { "epoch": 0.247435929120431, "grad_norm": 3.498996806633802, "learning_rate": 1.7621656292604476e-07, "loss": 0.7138, "step": 2503 }, { "epoch": 0.2475347848651855, "grad_norm": 10.443949698391775, "learning_rate": 1.7619582911602087e-07, "loss": 0.7766, "step": 2504 }, { "epoch": 0.24763364060993995, "grad_norm": 3.4102472791754033, "learning_rate": 1.7617508749323864e-07, "loss": 0.6973, "step": 2505 }, { "epoch": 0.2477324963546944, "grad_norm": 3.2018984068622625, "learning_rate": 1.7615433805982484e-07, "loss": 0.6959, "step": 2506 }, { "epoch": 0.24783135209944887, "grad_norm": 4.892942300996585, "learning_rate": 1.7613358081790705e-07, "loss": 0.7664, "step": 2507 }, { "epoch": 0.24793020784420336, "grad_norm": 9.93184581581607, "learning_rate": 1.7611281576961358e-07, "loss": 0.7035, "step": 2508 }, { "epoch": 0.24802906358895782, "grad_norm": 4.7808847259358584, "learning_rate": 1.7609204291707363e-07, "loss": 0.6357, "step": 2509 }, { "epoch": 0.24812791933371228, "grad_norm": 8.089359093380752, "learning_rate": 1.760712622624171e-07, "loss": 0.724, "step": 2510 }, { "epoch": 0.24822677507846674, "grad_norm": 4.168674566110212, "learning_rate": 1.7605047380777473e-07, "loss": 0.7836, "step": 2511 }, { "epoch": 0.2483256308232212, "grad_norm": 9.360843840944087, "learning_rate": 1.760296775552781e-07, "loss": 0.6153, "step": 2512 }, { "epoch": 0.2484244865679757, "grad_norm": 5.454715932020585, "learning_rate": 1.7600887350705959e-07, "loss": 0.7085, "step": 2513 }, { "epoch": 0.24852334231273016, "grad_norm": 3.748058744017857, "learning_rate": 1.7598806166525228e-07, "loss": 0.6329, "step": 2514 }, { "epoch": 0.24862219805748462, "grad_norm": 3.636136424326512, "learning_rate": 1.7596724203199017e-07, "loss": 0.6148, "step": 2515 }, { "epoch": 0.24872105380223908, "grad_norm": 13.715925881131213, "learning_rate": 1.7594641460940794e-07, "loss": 0.7218, "step": 2516 }, { "epoch": 0.24881990954699354, "grad_norm": 8.988378303470881, "learning_rate": 1.759255793996412e-07, "loss": 0.7762, "step": 2517 }, { "epoch": 0.24891876529174803, "grad_norm": 4.13650552473767, "learning_rate": 1.7590473640482633e-07, "loss": 0.6962, "step": 2518 }, { "epoch": 0.2490176210365025, "grad_norm": 3.8377917263110746, "learning_rate": 1.7588388562710036e-07, "loss": 0.7977, "step": 2519 }, { "epoch": 0.24911647678125695, "grad_norm": 3.793851245889237, "learning_rate": 1.758630270686013e-07, "loss": 0.6946, "step": 2520 }, { "epoch": 0.2492153325260114, "grad_norm": 3.6573226902860996, "learning_rate": 1.7584216073146784e-07, "loss": 0.7414, "step": 2521 }, { "epoch": 0.24931418827076587, "grad_norm": 9.162345359558287, "learning_rate": 1.758212866178396e-07, "loss": 0.8288, "step": 2522 }, { "epoch": 0.24941304401552036, "grad_norm": 6.657945839598378, "learning_rate": 1.7580040472985683e-07, "loss": 0.7862, "step": 2523 }, { "epoch": 0.24951189976027482, "grad_norm": 4.942357749035868, "learning_rate": 1.757795150696607e-07, "loss": 0.5859, "step": 2524 }, { "epoch": 0.24961075550502929, "grad_norm": 5.394180287800419, "learning_rate": 1.757586176393931e-07, "loss": 0.7195, "step": 2525 }, { "epoch": 0.24970961124978375, "grad_norm": 43.9773010585701, "learning_rate": 1.7573771244119683e-07, "loss": 0.7369, "step": 2526 }, { "epoch": 0.2498084669945382, "grad_norm": 4.2205964718940825, "learning_rate": 1.757167994772153e-07, "loss": 0.7298, "step": 2527 }, { "epoch": 0.2499073227392927, "grad_norm": 5.70351166541157, "learning_rate": 1.7569587874959292e-07, "loss": 0.7665, "step": 2528 }, { "epoch": 0.25000617848404716, "grad_norm": 3.23737141804287, "learning_rate": 1.7567495026047477e-07, "loss": 0.7298, "step": 2529 }, { "epoch": 0.25010503422880165, "grad_norm": 3.3806513715060884, "learning_rate": 1.7565401401200674e-07, "loss": 0.704, "step": 2530 }, { "epoch": 0.2502038899735561, "grad_norm": 3.3885555044601947, "learning_rate": 1.7563307000633555e-07, "loss": 0.7181, "step": 2531 }, { "epoch": 0.25030274571831057, "grad_norm": 3.792444764732508, "learning_rate": 1.756121182456087e-07, "loss": 0.6499, "step": 2532 }, { "epoch": 0.250401601463065, "grad_norm": 4.856788120683121, "learning_rate": 1.755911587319745e-07, "loss": 0.725, "step": 2533 }, { "epoch": 0.2505004572078195, "grad_norm": 8.579585706017015, "learning_rate": 1.75570191467582e-07, "loss": 0.714, "step": 2534 }, { "epoch": 0.250599312952574, "grad_norm": 5.535268583082957, "learning_rate": 1.7554921645458108e-07, "loss": 0.6537, "step": 2535 }, { "epoch": 0.2506981686973284, "grad_norm": 3.5267650147920886, "learning_rate": 1.755282336951225e-07, "loss": 0.8029, "step": 2536 }, { "epoch": 0.2507970244420829, "grad_norm": 5.424893068152502, "learning_rate": 1.755072431913576e-07, "loss": 0.64, "step": 2537 }, { "epoch": 0.25089588018683734, "grad_norm": 11.56611810511707, "learning_rate": 1.754862449454388e-07, "loss": 0.7468, "step": 2538 }, { "epoch": 0.25099473593159183, "grad_norm": 25.88718748593189, "learning_rate": 1.7546523895951903e-07, "loss": 0.6794, "step": 2539 }, { "epoch": 0.2510935916763463, "grad_norm": 3.9695773040182134, "learning_rate": 1.754442252357522e-07, "loss": 0.8229, "step": 2540 }, { "epoch": 0.25119244742110075, "grad_norm": 3.5143341011768867, "learning_rate": 1.7542320377629295e-07, "loss": 0.6254, "step": 2541 }, { "epoch": 0.25129130316585524, "grad_norm": 3.8495303661309634, "learning_rate": 1.7540217458329677e-07, "loss": 0.6262, "step": 2542 }, { "epoch": 0.2513901589106097, "grad_norm": 3.5593099644802355, "learning_rate": 1.753811376589198e-07, "loss": 0.6988, "step": 2543 }, { "epoch": 0.25148901465536416, "grad_norm": 4.158512862002003, "learning_rate": 1.7536009300531913e-07, "loss": 0.7698, "step": 2544 }, { "epoch": 0.25158787040011865, "grad_norm": 14.136582755812698, "learning_rate": 1.7533904062465257e-07, "loss": 0.734, "step": 2545 }, { "epoch": 0.2516867261448731, "grad_norm": 5.514154460949664, "learning_rate": 1.7531798051907872e-07, "loss": 0.7633, "step": 2546 }, { "epoch": 0.2517855818896276, "grad_norm": 5.115326242639329, "learning_rate": 1.7529691269075694e-07, "loss": 0.7784, "step": 2547 }, { "epoch": 0.251884437634382, "grad_norm": 6.145716270484772, "learning_rate": 1.7527583714184753e-07, "loss": 0.7014, "step": 2548 }, { "epoch": 0.2519832933791365, "grad_norm": 3.047543441126006, "learning_rate": 1.752547538745114e-07, "loss": 0.7166, "step": 2549 }, { "epoch": 0.252082149123891, "grad_norm": 5.586113701172232, "learning_rate": 1.7523366289091035e-07, "loss": 0.6718, "step": 2550 }, { "epoch": 0.2521810048686454, "grad_norm": 4.885472839868687, "learning_rate": 1.7521256419320692e-07, "loss": 0.6802, "step": 2551 }, { "epoch": 0.2522798606133999, "grad_norm": 7.826482265874667, "learning_rate": 1.7519145778356452e-07, "loss": 0.7708, "step": 2552 }, { "epoch": 0.25237871635815434, "grad_norm": 5.958178735698945, "learning_rate": 1.7517034366414726e-07, "loss": 0.7668, "step": 2553 }, { "epoch": 0.25247757210290883, "grad_norm": 4.086636375152883, "learning_rate": 1.751492218371201e-07, "loss": 0.7085, "step": 2554 }, { "epoch": 0.2525764278476633, "grad_norm": 22.792613738618396, "learning_rate": 1.751280923046488e-07, "loss": 0.827, "step": 2555 }, { "epoch": 0.25267528359241775, "grad_norm": 3.6925920407315105, "learning_rate": 1.751069550688998e-07, "loss": 0.5968, "step": 2556 }, { "epoch": 0.25277413933717224, "grad_norm": 6.553517637441141, "learning_rate": 1.750858101320405e-07, "loss": 0.8419, "step": 2557 }, { "epoch": 0.2528729950819267, "grad_norm": 4.432554748077511, "learning_rate": 1.7506465749623898e-07, "loss": 0.709, "step": 2558 }, { "epoch": 0.25297185082668117, "grad_norm": 5.3852153749394605, "learning_rate": 1.7504349716366407e-07, "loss": 0.7895, "step": 2559 }, { "epoch": 0.25307070657143566, "grad_norm": 14.443490714717976, "learning_rate": 1.7502232913648554e-07, "loss": 0.7514, "step": 2560 }, { "epoch": 0.2531695623161901, "grad_norm": 5.555020684815613, "learning_rate": 1.7500115341687377e-07, "loss": 0.7208, "step": 2561 }, { "epoch": 0.2532684180609446, "grad_norm": 7.225553851688573, "learning_rate": 1.7497997000700008e-07, "loss": 0.7861, "step": 2562 }, { "epoch": 0.253367273805699, "grad_norm": 4.8137115689304775, "learning_rate": 1.749587789090365e-07, "loss": 0.787, "step": 2563 }, { "epoch": 0.2534661295504535, "grad_norm": 4.82780296394149, "learning_rate": 1.7493758012515588e-07, "loss": 0.6047, "step": 2564 }, { "epoch": 0.253564985295208, "grad_norm": 4.258589274278392, "learning_rate": 1.7491637365753178e-07, "loss": 0.6346, "step": 2565 }, { "epoch": 0.2536638410399624, "grad_norm": 3.5468784879298054, "learning_rate": 1.748951595083387e-07, "loss": 0.6608, "step": 2566 }, { "epoch": 0.2537626967847169, "grad_norm": 5.477811882757876, "learning_rate": 1.748739376797518e-07, "loss": 0.8272, "step": 2567 }, { "epoch": 0.25386155252947135, "grad_norm": 3.501439849147315, "learning_rate": 1.7485270817394704e-07, "loss": 0.8043, "step": 2568 }, { "epoch": 0.25396040827422584, "grad_norm": 6.967249219877242, "learning_rate": 1.7483147099310118e-07, "loss": 0.7524, "step": 2569 }, { "epoch": 0.2540592640189803, "grad_norm": 3.3410928137378457, "learning_rate": 1.7481022613939186e-07, "loss": 0.5104, "step": 2570 }, { "epoch": 0.25415811976373476, "grad_norm": 4.11969303504438, "learning_rate": 1.7478897361499735e-07, "loss": 0.6687, "step": 2571 }, { "epoch": 0.25425697550848925, "grad_norm": 4.878248876566138, "learning_rate": 1.7476771342209683e-07, "loss": 0.6857, "step": 2572 }, { "epoch": 0.2543558312532437, "grad_norm": 5.395236963535395, "learning_rate": 1.7474644556287017e-07, "loss": 0.7044, "step": 2573 }, { "epoch": 0.25445468699799817, "grad_norm": 13.660917410263362, "learning_rate": 1.7472517003949813e-07, "loss": 0.7021, "step": 2574 }, { "epoch": 0.25455354274275266, "grad_norm": 3.809978207545114, "learning_rate": 1.7470388685416217e-07, "loss": 0.6392, "step": 2575 }, { "epoch": 0.2546523984875071, "grad_norm": 3.1759207254770936, "learning_rate": 1.7468259600904458e-07, "loss": 0.6506, "step": 2576 }, { "epoch": 0.2547512542322616, "grad_norm": 4.394458490649577, "learning_rate": 1.7466129750632842e-07, "loss": 0.71, "step": 2577 }, { "epoch": 0.254850109977016, "grad_norm": 5.326604839835025, "learning_rate": 1.746399913481975e-07, "loss": 0.7149, "step": 2578 }, { "epoch": 0.2549489657217705, "grad_norm": 3.7356530750302666, "learning_rate": 1.7461867753683651e-07, "loss": 0.8131, "step": 2579 }, { "epoch": 0.255047821466525, "grad_norm": 5.103292922619978, "learning_rate": 1.7459735607443086e-07, "loss": 0.6932, "step": 2580 }, { "epoch": 0.2551466772112794, "grad_norm": 4.593642833846963, "learning_rate": 1.745760269631667e-07, "loss": 0.621, "step": 2581 }, { "epoch": 0.2552455329560339, "grad_norm": 3.9308917513932067, "learning_rate": 1.7455469020523106e-07, "loss": 0.7009, "step": 2582 }, { "epoch": 0.25534438870078835, "grad_norm": 4.853226896317163, "learning_rate": 1.7453334580281166e-07, "loss": 0.7247, "step": 2583 }, { "epoch": 0.25544324444554284, "grad_norm": 4.841370552271901, "learning_rate": 1.7451199375809718e-07, "loss": 0.7723, "step": 2584 }, { "epoch": 0.2555421001902973, "grad_norm": 4.448589158346291, "learning_rate": 1.744906340732768e-07, "loss": 0.7668, "step": 2585 }, { "epoch": 0.25564095593505176, "grad_norm": 3.624675363661294, "learning_rate": 1.7446926675054074e-07, "loss": 0.7008, "step": 2586 }, { "epoch": 0.25573981167980625, "grad_norm": 3.5013149745444596, "learning_rate": 1.7444789179207983e-07, "loss": 0.6981, "step": 2587 }, { "epoch": 0.2558386674245607, "grad_norm": 10.351704041559495, "learning_rate": 1.7442650920008585e-07, "loss": 0.7115, "step": 2588 }, { "epoch": 0.2559375231693152, "grad_norm": 8.483091007225129, "learning_rate": 1.744051189767512e-07, "loss": 0.634, "step": 2589 }, { "epoch": 0.25603637891406966, "grad_norm": 3.5885496473003955, "learning_rate": 1.7438372112426919e-07, "loss": 0.5632, "step": 2590 }, { "epoch": 0.2561352346588241, "grad_norm": 3.497578399064761, "learning_rate": 1.7436231564483376e-07, "loss": 0.7095, "step": 2591 }, { "epoch": 0.2562340904035786, "grad_norm": 3.688603429035982, "learning_rate": 1.743409025406398e-07, "loss": 0.7295, "step": 2592 }, { "epoch": 0.256332946148333, "grad_norm": 3.768977578229503, "learning_rate": 1.743194818138829e-07, "loss": 0.7285, "step": 2593 }, { "epoch": 0.2564318018930875, "grad_norm": 4.712294050693658, "learning_rate": 1.742980534667594e-07, "loss": 0.7723, "step": 2594 }, { "epoch": 0.256530657637842, "grad_norm": 4.797464312145532, "learning_rate": 1.7427661750146654e-07, "loss": 0.6319, "step": 2595 }, { "epoch": 0.25662951338259643, "grad_norm": 3.6908362441407268, "learning_rate": 1.7425517392020215e-07, "loss": 0.7309, "step": 2596 }, { "epoch": 0.2567283691273509, "grad_norm": 3.8501315625658, "learning_rate": 1.7423372272516507e-07, "loss": 0.736, "step": 2597 }, { "epoch": 0.25682722487210535, "grad_norm": 8.697286336874848, "learning_rate": 1.7421226391855471e-07, "loss": 0.7217, "step": 2598 }, { "epoch": 0.25692608061685984, "grad_norm": 2.8978683470043536, "learning_rate": 1.7419079750257141e-07, "loss": 0.6761, "step": 2599 }, { "epoch": 0.25702493636161433, "grad_norm": 4.322022387896223, "learning_rate": 1.7416932347941624e-07, "loss": 0.6336, "step": 2600 }, { "epoch": 0.25712379210636876, "grad_norm": 3.6066343237292067, "learning_rate": 1.74147841851291e-07, "loss": 0.8463, "step": 2601 }, { "epoch": 0.25722264785112325, "grad_norm": 4.135812261543081, "learning_rate": 1.7412635262039834e-07, "loss": 0.7642, "step": 2602 }, { "epoch": 0.25732150359587774, "grad_norm": 3.1280831529841957, "learning_rate": 1.7410485578894164e-07, "loss": 0.7085, "step": 2603 }, { "epoch": 0.2574203593406322, "grad_norm": 6.8858715409845965, "learning_rate": 1.7408335135912512e-07, "loss": 0.7356, "step": 2604 }, { "epoch": 0.25751921508538667, "grad_norm": 3.7029362919694084, "learning_rate": 1.7406183933315371e-07, "loss": 0.6994, "step": 2605 }, { "epoch": 0.2576180708301411, "grad_norm": 6.182826578957922, "learning_rate": 1.740403197132332e-07, "loss": 0.6721, "step": 2606 }, { "epoch": 0.2577169265748956, "grad_norm": 8.894863279889158, "learning_rate": 1.7401879250157006e-07, "loss": 0.7417, "step": 2607 }, { "epoch": 0.2578157823196501, "grad_norm": 3.62953261263379, "learning_rate": 1.739972577003716e-07, "loss": 0.7378, "step": 2608 }, { "epoch": 0.2579146380644045, "grad_norm": 4.807859443073986, "learning_rate": 1.7397571531184587e-07, "loss": 0.73, "step": 2609 }, { "epoch": 0.258013493809159, "grad_norm": 12.985849348791023, "learning_rate": 1.739541653382018e-07, "loss": 0.6791, "step": 2610 }, { "epoch": 0.25811234955391343, "grad_norm": 2.9348041243124516, "learning_rate": 1.7393260778164894e-07, "loss": 0.655, "step": 2611 }, { "epoch": 0.2582112052986679, "grad_norm": 3.8368424121236084, "learning_rate": 1.7391104264439775e-07, "loss": 0.746, "step": 2612 }, { "epoch": 0.2583100610434224, "grad_norm": 4.97911318533631, "learning_rate": 1.738894699286594e-07, "loss": 0.6371, "step": 2613 }, { "epoch": 0.25840891678817685, "grad_norm": 8.896766517343393, "learning_rate": 1.7386788963664587e-07, "loss": 0.7509, "step": 2614 }, { "epoch": 0.25850777253293133, "grad_norm": 5.48965729056475, "learning_rate": 1.738463017705699e-07, "loss": 0.735, "step": 2615 }, { "epoch": 0.25860662827768577, "grad_norm": 4.970798072265461, "learning_rate": 1.7382470633264493e-07, "loss": 0.7343, "step": 2616 }, { "epoch": 0.25870548402244026, "grad_norm": 5.136390429075031, "learning_rate": 1.7380310332508533e-07, "loss": 0.7656, "step": 2617 }, { "epoch": 0.25880433976719475, "grad_norm": 5.822043653378741, "learning_rate": 1.737814927501062e-07, "loss": 0.6614, "step": 2618 }, { "epoch": 0.2589031955119492, "grad_norm": 26.764213329308824, "learning_rate": 1.7375987460992331e-07, "loss": 0.7416, "step": 2619 }, { "epoch": 0.25900205125670367, "grad_norm": 4.994614069200326, "learning_rate": 1.7373824890675334e-07, "loss": 0.7057, "step": 2620 }, { "epoch": 0.2591009070014581, "grad_norm": 6.737518394462079, "learning_rate": 1.7371661564281365e-07, "loss": 0.8052, "step": 2621 }, { "epoch": 0.2591997627462126, "grad_norm": 2.6138975694348408, "learning_rate": 1.736949748203224e-07, "loss": 0.6464, "step": 2622 }, { "epoch": 0.2592986184909671, "grad_norm": 3.550778200999612, "learning_rate": 1.736733264414986e-07, "loss": 0.7328, "step": 2623 }, { "epoch": 0.2593974742357215, "grad_norm": 17.70772412089997, "learning_rate": 1.736516705085619e-07, "loss": 0.6884, "step": 2624 }, { "epoch": 0.259496329980476, "grad_norm": 4.434915052820346, "learning_rate": 1.7363000702373283e-07, "loss": 0.7044, "step": 2625 }, { "epoch": 0.25959518572523044, "grad_norm": 15.345038158908627, "learning_rate": 1.7360833598923265e-07, "loss": 0.7497, "step": 2626 }, { "epoch": 0.2596940414699849, "grad_norm": 14.807401051540316, "learning_rate": 1.7358665740728344e-07, "loss": 0.6828, "step": 2627 }, { "epoch": 0.2597928972147394, "grad_norm": 4.620533412149072, "learning_rate": 1.7356497128010798e-07, "loss": 0.7361, "step": 2628 }, { "epoch": 0.25989175295949385, "grad_norm": 5.021193712104873, "learning_rate": 1.7354327760992988e-07, "loss": 0.6963, "step": 2629 }, { "epoch": 0.25999060870424834, "grad_norm": 4.7564961143060245, "learning_rate": 1.735215763989735e-07, "loss": 0.7376, "step": 2630 }, { "epoch": 0.26008946444900277, "grad_norm": 4.112645269630967, "learning_rate": 1.7349986764946395e-07, "loss": 0.7682, "step": 2631 }, { "epoch": 0.26018832019375726, "grad_norm": 3.7346349649161863, "learning_rate": 1.734781513636272e-07, "loss": 0.6742, "step": 2632 }, { "epoch": 0.26028717593851175, "grad_norm": 4.606703462831743, "learning_rate": 1.734564275436899e-07, "loss": 0.713, "step": 2633 }, { "epoch": 0.2603860316832662, "grad_norm": 4.7472223618483005, "learning_rate": 1.7343469619187954e-07, "loss": 0.6362, "step": 2634 }, { "epoch": 0.2604848874280207, "grad_norm": 4.018166403523397, "learning_rate": 1.7341295731042428e-07, "loss": 0.7657, "step": 2635 }, { "epoch": 0.2605837431727751, "grad_norm": 4.759837265957269, "learning_rate": 1.733912109015532e-07, "loss": 0.6945, "step": 2636 }, { "epoch": 0.2606825989175296, "grad_norm": 3.4492680476832036, "learning_rate": 1.7336945696749602e-07, "loss": 0.6996, "step": 2637 }, { "epoch": 0.2607814546622841, "grad_norm": 3.5237982642190198, "learning_rate": 1.7334769551048328e-07, "loss": 0.7095, "step": 2638 }, { "epoch": 0.2608803104070385, "grad_norm": 5.601059903563019, "learning_rate": 1.7332592653274637e-07, "loss": 0.7558, "step": 2639 }, { "epoch": 0.260979166151793, "grad_norm": 3.6636456101104904, "learning_rate": 1.7330415003651733e-07, "loss": 0.735, "step": 2640 }, { "epoch": 0.26107802189654744, "grad_norm": 12.803390046774275, "learning_rate": 1.7328236602402898e-07, "loss": 0.6236, "step": 2641 }, { "epoch": 0.26117687764130193, "grad_norm": 3.66787765414936, "learning_rate": 1.73260574497515e-07, "loss": 0.7887, "step": 2642 }, { "epoch": 0.2612757333860564, "grad_norm": 3.970944152972017, "learning_rate": 1.7323877545920977e-07, "loss": 0.8116, "step": 2643 }, { "epoch": 0.26137458913081085, "grad_norm": 14.763060745929776, "learning_rate": 1.7321696891134847e-07, "loss": 0.7117, "step": 2644 }, { "epoch": 0.26147344487556534, "grad_norm": 9.749440150341105, "learning_rate": 1.7319515485616702e-07, "loss": 0.6928, "step": 2645 }, { "epoch": 0.2615723006203198, "grad_norm": 4.180740344376574, "learning_rate": 1.7317333329590217e-07, "loss": 0.726, "step": 2646 }, { "epoch": 0.26167115636507426, "grad_norm": 3.7262429250804003, "learning_rate": 1.7315150423279135e-07, "loss": 0.8468, "step": 2647 }, { "epoch": 0.26177001210982875, "grad_norm": 4.88127956969713, "learning_rate": 1.7312966766907283e-07, "loss": 0.858, "step": 2648 }, { "epoch": 0.2618688678545832, "grad_norm": 7.888114437666803, "learning_rate": 1.7310782360698565e-07, "loss": 0.7806, "step": 2649 }, { "epoch": 0.2619677235993377, "grad_norm": 10.189201481307917, "learning_rate": 1.7308597204876959e-07, "loss": 0.7432, "step": 2650 }, { "epoch": 0.2620665793440921, "grad_norm": 5.402701603631917, "learning_rate": 1.7306411299666516e-07, "loss": 0.7283, "step": 2651 }, { "epoch": 0.2621654350888466, "grad_norm": 3.4802971113431536, "learning_rate": 1.730422464529137e-07, "loss": 0.71, "step": 2652 }, { "epoch": 0.2622642908336011, "grad_norm": 2.8613607328180253, "learning_rate": 1.7302037241975734e-07, "loss": 0.7194, "step": 2653 }, { "epoch": 0.2623631465783555, "grad_norm": 4.208893593328062, "learning_rate": 1.729984908994389e-07, "loss": 0.7518, "step": 2654 }, { "epoch": 0.26246200232311, "grad_norm": 4.4884253532834855, "learning_rate": 1.7297660189420205e-07, "loss": 0.74, "step": 2655 }, { "epoch": 0.26256085806786444, "grad_norm": 16.546122331381905, "learning_rate": 1.7295470540629112e-07, "loss": 0.7648, "step": 2656 }, { "epoch": 0.26265971381261893, "grad_norm": 3.341909227031189, "learning_rate": 1.7293280143795131e-07, "loss": 0.6008, "step": 2657 }, { "epoch": 0.2627585695573734, "grad_norm": 3.1300278122615843, "learning_rate": 1.7291088999142859e-07, "loss": 0.8031, "step": 2658 }, { "epoch": 0.26285742530212786, "grad_norm": 3.709792149743311, "learning_rate": 1.7288897106896956e-07, "loss": 0.7415, "step": 2659 }, { "epoch": 0.26295628104688235, "grad_norm": 3.2591827875433976, "learning_rate": 1.7286704467282176e-07, "loss": 0.7641, "step": 2660 }, { "epoch": 0.2630551367916368, "grad_norm": 3.7786186986847654, "learning_rate": 1.728451108052334e-07, "loss": 0.7612, "step": 2661 }, { "epoch": 0.26315399253639127, "grad_norm": 3.6271066329749453, "learning_rate": 1.7282316946845347e-07, "loss": 0.819, "step": 2662 }, { "epoch": 0.26325284828114576, "grad_norm": 3.414955300272365, "learning_rate": 1.728012206647317e-07, "loss": 0.7077, "step": 2663 }, { "epoch": 0.2633517040259002, "grad_norm": 319.9984134019195, "learning_rate": 1.7277926439631866e-07, "loss": 0.6744, "step": 2664 }, { "epoch": 0.2634505597706547, "grad_norm": 5.03937928907127, "learning_rate": 1.7275730066546563e-07, "loss": 0.6655, "step": 2665 }, { "epoch": 0.2635494155154091, "grad_norm": 4.248578237555021, "learning_rate": 1.7273532947442466e-07, "loss": 0.6928, "step": 2666 }, { "epoch": 0.2636482712601636, "grad_norm": 5.068226798374296, "learning_rate": 1.7271335082544858e-07, "loss": 0.824, "step": 2667 }, { "epoch": 0.2637471270049181, "grad_norm": 3.96359665467839, "learning_rate": 1.72691364720791e-07, "loss": 0.6795, "step": 2668 }, { "epoch": 0.2638459827496725, "grad_norm": 3.324405264743262, "learning_rate": 1.7266937116270618e-07, "loss": 0.7028, "step": 2669 }, { "epoch": 0.263944838494427, "grad_norm": 4.795453918704021, "learning_rate": 1.7264737015344933e-07, "loss": 0.7729, "step": 2670 }, { "epoch": 0.26404369423918145, "grad_norm": 3.372399072594103, "learning_rate": 1.7262536169527629e-07, "loss": 0.8507, "step": 2671 }, { "epoch": 0.26414254998393594, "grad_norm": 5.518100165847274, "learning_rate": 1.7260334579044372e-07, "loss": 0.8385, "step": 2672 }, { "epoch": 0.2642414057286904, "grad_norm": 5.157689787915596, "learning_rate": 1.7258132244120899e-07, "loss": 0.7875, "step": 2673 }, { "epoch": 0.26434026147344486, "grad_norm": 3.7256774351640862, "learning_rate": 1.7255929164983028e-07, "loss": 0.7103, "step": 2674 }, { "epoch": 0.26443911721819935, "grad_norm": 3.6069213424878686, "learning_rate": 1.7253725341856654e-07, "loss": 0.7952, "step": 2675 }, { "epoch": 0.2645379729629538, "grad_norm": 2.7879353602583112, "learning_rate": 1.725152077496775e-07, "loss": 0.7253, "step": 2676 }, { "epoch": 0.26463682870770827, "grad_norm": 5.9389715641056755, "learning_rate": 1.7249315464542358e-07, "loss": 0.7899, "step": 2677 }, { "epoch": 0.26473568445246276, "grad_norm": 3.5535183317217145, "learning_rate": 1.7247109410806598e-07, "loss": 0.6564, "step": 2678 }, { "epoch": 0.2648345401972172, "grad_norm": 4.00066731254155, "learning_rate": 1.7244902613986669e-07, "loss": 0.6602, "step": 2679 }, { "epoch": 0.2649333959419717, "grad_norm": 4.532286930600595, "learning_rate": 1.724269507430885e-07, "loss": 0.8024, "step": 2680 }, { "epoch": 0.2650322516867262, "grad_norm": 14.237854914680526, "learning_rate": 1.7240486791999484e-07, "loss": 0.6932, "step": 2681 }, { "epoch": 0.2651311074314806, "grad_norm": 8.737687547522533, "learning_rate": 1.7238277767285005e-07, "loss": 0.7213, "step": 2682 }, { "epoch": 0.2652299631762351, "grad_norm": 5.268082798280748, "learning_rate": 1.7236068000391914e-07, "loss": 0.7229, "step": 2683 }, { "epoch": 0.26532881892098953, "grad_norm": 3.5678429355501318, "learning_rate": 1.7233857491546787e-07, "loss": 0.7374, "step": 2684 }, { "epoch": 0.265427674665744, "grad_norm": 7.414483735275459, "learning_rate": 1.7231646240976283e-07, "loss": 0.6713, "step": 2685 }, { "epoch": 0.2655265304104985, "grad_norm": 7.787999242025072, "learning_rate": 1.722943424890713e-07, "loss": 0.7745, "step": 2686 }, { "epoch": 0.26562538615525294, "grad_norm": 4.523955830252212, "learning_rate": 1.7227221515566137e-07, "loss": 0.6894, "step": 2687 }, { "epoch": 0.26572424190000743, "grad_norm": 7.0225782833062365, "learning_rate": 1.7225008041180185e-07, "loss": 0.7893, "step": 2688 }, { "epoch": 0.26582309764476186, "grad_norm": 6.60583685843071, "learning_rate": 1.7222793825976236e-07, "loss": 0.7677, "step": 2689 }, { "epoch": 0.26592195338951635, "grad_norm": 5.930441967518548, "learning_rate": 1.7220578870181325e-07, "loss": 0.7012, "step": 2690 }, { "epoch": 0.26602080913427084, "grad_norm": 6.587046904087079, "learning_rate": 1.721836317402256e-07, "loss": 0.6668, "step": 2691 }, { "epoch": 0.2661196648790253, "grad_norm": 5.96238644071406, "learning_rate": 1.721614673772713e-07, "loss": 0.6208, "step": 2692 }, { "epoch": 0.26621852062377976, "grad_norm": 3.8429411676577603, "learning_rate": 1.72139295615223e-07, "loss": 0.7585, "step": 2693 }, { "epoch": 0.2663173763685342, "grad_norm": 3.126805205616945, "learning_rate": 1.7211711645635404e-07, "loss": 0.6213, "step": 2694 }, { "epoch": 0.2664162321132887, "grad_norm": 4.2934639786529685, "learning_rate": 1.7209492990293862e-07, "loss": 0.6095, "step": 2695 }, { "epoch": 0.2665150878580432, "grad_norm": 4.520292397005222, "learning_rate": 1.7207273595725157e-07, "loss": 0.7099, "step": 2696 }, { "epoch": 0.2666139436027976, "grad_norm": 3.8172081669623465, "learning_rate": 1.720505346215686e-07, "loss": 0.6163, "step": 2697 }, { "epoch": 0.2667127993475521, "grad_norm": 3.88547183495742, "learning_rate": 1.7202832589816615e-07, "loss": 0.7033, "step": 2698 }, { "epoch": 0.26681165509230653, "grad_norm": 4.973127253909407, "learning_rate": 1.7200610978932137e-07, "loss": 0.8837, "step": 2699 }, { "epoch": 0.266910510837061, "grad_norm": 13.515075718147092, "learning_rate": 1.719838862973122e-07, "loss": 0.779, "step": 2700 }, { "epoch": 0.2670093665818155, "grad_norm": 3.3211256723621294, "learning_rate": 1.719616554244173e-07, "loss": 0.7111, "step": 2701 }, { "epoch": 0.26710822232656994, "grad_norm": 4.38572314928293, "learning_rate": 1.7193941717291618e-07, "loss": 0.7166, "step": 2702 }, { "epoch": 0.26720707807132443, "grad_norm": 5.186600974771468, "learning_rate": 1.7191717154508897e-07, "loss": 0.8215, "step": 2703 }, { "epoch": 0.26730593381607887, "grad_norm": 3.72848570156967, "learning_rate": 1.7189491854321672e-07, "loss": 0.7429, "step": 2704 }, { "epoch": 0.26740478956083336, "grad_norm": 3.6077111490470184, "learning_rate": 1.7187265816958105e-07, "loss": 0.5842, "step": 2705 }, { "epoch": 0.26750364530558784, "grad_norm": 5.057305919298859, "learning_rate": 1.718503904264645e-07, "loss": 0.7476, "step": 2706 }, { "epoch": 0.2676025010503423, "grad_norm": 3.4203649368439204, "learning_rate": 1.718281153161503e-07, "loss": 0.6563, "step": 2707 }, { "epoch": 0.26770135679509677, "grad_norm": 3.7845360973386364, "learning_rate": 1.718058328409224e-07, "loss": 0.7985, "step": 2708 }, { "epoch": 0.2678002125398512, "grad_norm": 2.7978239614305207, "learning_rate": 1.7178354300306555e-07, "loss": 0.742, "step": 2709 }, { "epoch": 0.2678990682846057, "grad_norm": 3.2886283354133634, "learning_rate": 1.7176124580486523e-07, "loss": 0.7314, "step": 2710 }, { "epoch": 0.2679979240293602, "grad_norm": 5.0870432459155515, "learning_rate": 1.7173894124860771e-07, "loss": 0.7277, "step": 2711 }, { "epoch": 0.2680967797741146, "grad_norm": 5.1474967420976085, "learning_rate": 1.7171662933658e-07, "loss": 0.8155, "step": 2712 }, { "epoch": 0.2681956355188691, "grad_norm": 6.411846710363167, "learning_rate": 1.7169431007106984e-07, "loss": 0.7462, "step": 2713 }, { "epoch": 0.26829449126362354, "grad_norm": 4.373310845647385, "learning_rate": 1.7167198345436575e-07, "loss": 0.7847, "step": 2714 }, { "epoch": 0.268393347008378, "grad_norm": 4.406336033052503, "learning_rate": 1.71649649488757e-07, "loss": 0.6869, "step": 2715 }, { "epoch": 0.2684922027531325, "grad_norm": 15.459789719604203, "learning_rate": 1.716273081765336e-07, "loss": 0.6823, "step": 2716 }, { "epoch": 0.26859105849788695, "grad_norm": 5.114592238506369, "learning_rate": 1.716049595199863e-07, "loss": 0.6557, "step": 2717 }, { "epoch": 0.26868991424264144, "grad_norm": 4.276011660410829, "learning_rate": 1.715826035214067e-07, "loss": 0.792, "step": 2718 }, { "epoch": 0.26878876998739587, "grad_norm": 6.317679090142235, "learning_rate": 1.71560240183087e-07, "loss": 0.7087, "step": 2719 }, { "epoch": 0.26888762573215036, "grad_norm": 11.149777039212214, "learning_rate": 1.7153786950732023e-07, "loss": 0.7259, "step": 2720 }, { "epoch": 0.26898648147690485, "grad_norm": 5.640768744686316, "learning_rate": 1.7151549149640024e-07, "loss": 0.7417, "step": 2721 }, { "epoch": 0.2690853372216593, "grad_norm": 3.804775695626346, "learning_rate": 1.714931061526215e-07, "loss": 0.6923, "step": 2722 }, { "epoch": 0.26918419296641377, "grad_norm": 4.174490303535939, "learning_rate": 1.7147071347827933e-07, "loss": 0.738, "step": 2723 }, { "epoch": 0.2692830487111682, "grad_norm": 3.8418018878860845, "learning_rate": 1.7144831347566976e-07, "loss": 0.7336, "step": 2724 }, { "epoch": 0.2693819044559227, "grad_norm": 5.695029858028046, "learning_rate": 1.7142590614708957e-07, "loss": 0.7512, "step": 2725 }, { "epoch": 0.2694807602006772, "grad_norm": 3.231158971327436, "learning_rate": 1.714034914948363e-07, "loss": 0.7184, "step": 2726 }, { "epoch": 0.2695796159454316, "grad_norm": 7.91830619774103, "learning_rate": 1.713810695212083e-07, "loss": 0.7815, "step": 2727 }, { "epoch": 0.2696784716901861, "grad_norm": 4.0589960880831875, "learning_rate": 1.713586402285045e-07, "loss": 0.7903, "step": 2728 }, { "epoch": 0.26977732743494054, "grad_norm": 3.9100993896595972, "learning_rate": 1.7133620361902482e-07, "loss": 0.7514, "step": 2729 }, { "epoch": 0.26987618317969503, "grad_norm": 8.026697077426341, "learning_rate": 1.7131375969506972e-07, "loss": 0.6745, "step": 2730 }, { "epoch": 0.2699750389244495, "grad_norm": 3.4843343589437725, "learning_rate": 1.7129130845894053e-07, "loss": 0.7383, "step": 2731 }, { "epoch": 0.27007389466920395, "grad_norm": 6.677193416839262, "learning_rate": 1.7126884991293924e-07, "loss": 0.6941, "step": 2732 }, { "epoch": 0.27017275041395844, "grad_norm": 6.579620412921381, "learning_rate": 1.712463840593687e-07, "loss": 0.7634, "step": 2733 }, { "epoch": 0.2702716061587129, "grad_norm": 3.372900121240285, "learning_rate": 1.7122391090053246e-07, "loss": 0.6974, "step": 2734 }, { "epoch": 0.27037046190346736, "grad_norm": 3.849039439915618, "learning_rate": 1.7120143043873477e-07, "loss": 0.6682, "step": 2735 }, { "epoch": 0.27046931764822185, "grad_norm": 14.236431164391774, "learning_rate": 1.7117894267628066e-07, "loss": 0.7373, "step": 2736 }, { "epoch": 0.2705681733929763, "grad_norm": 4.983920512019094, "learning_rate": 1.7115644761547596e-07, "loss": 0.7421, "step": 2737 }, { "epoch": 0.2706670291377308, "grad_norm": 5.766959578690897, "learning_rate": 1.7113394525862718e-07, "loss": 0.7103, "step": 2738 }, { "epoch": 0.2707658848824852, "grad_norm": 3.5123996283938177, "learning_rate": 1.711114356080416e-07, "loss": 0.6569, "step": 2739 }, { "epoch": 0.2708647406272397, "grad_norm": 3.641046838805623, "learning_rate": 1.7108891866602727e-07, "loss": 0.6919, "step": 2740 }, { "epoch": 0.2709635963719942, "grad_norm": 7.379567825751909, "learning_rate": 1.71066394434893e-07, "loss": 0.7081, "step": 2741 }, { "epoch": 0.2710624521167486, "grad_norm": 5.721743710596797, "learning_rate": 1.7104386291694827e-07, "loss": 0.6656, "step": 2742 }, { "epoch": 0.2711613078615031, "grad_norm": 11.809436160498016, "learning_rate": 1.7102132411450336e-07, "loss": 0.6572, "step": 2743 }, { "epoch": 0.27126016360625754, "grad_norm": 3.6298938671609124, "learning_rate": 1.7099877802986932e-07, "loss": 0.5259, "step": 2744 }, { "epoch": 0.27135901935101203, "grad_norm": 3.4756043981456175, "learning_rate": 1.7097622466535785e-07, "loss": 0.6858, "step": 2745 }, { "epoch": 0.2714578750957665, "grad_norm": 4.268714582796012, "learning_rate": 1.7095366402328156e-07, "loss": 0.7996, "step": 2746 }, { "epoch": 0.27155673084052095, "grad_norm": 5.110685425511531, "learning_rate": 1.7093109610595365e-07, "loss": 0.7577, "step": 2747 }, { "epoch": 0.27165558658527544, "grad_norm": 3.180075781983478, "learning_rate": 1.7090852091568814e-07, "loss": 0.7166, "step": 2748 }, { "epoch": 0.2717544423300299, "grad_norm": 4.555187870547729, "learning_rate": 1.7088593845479978e-07, "loss": 0.7601, "step": 2749 }, { "epoch": 0.27185329807478437, "grad_norm": 3.8379359357874585, "learning_rate": 1.7086334872560406e-07, "loss": 0.6451, "step": 2750 }, { "epoch": 0.27195215381953886, "grad_norm": 5.588082387607627, "learning_rate": 1.7084075173041727e-07, "loss": 0.7403, "step": 2751 }, { "epoch": 0.2720510095642933, "grad_norm": 11.662015857809356, "learning_rate": 1.7081814747155634e-07, "loss": 0.6834, "step": 2752 }, { "epoch": 0.2721498653090478, "grad_norm": 3.0972144542479514, "learning_rate": 1.7079553595133904e-07, "loss": 0.7007, "step": 2753 }, { "epoch": 0.2722487210538022, "grad_norm": 6.484561990930114, "learning_rate": 1.7077291717208383e-07, "loss": 0.6974, "step": 2754 }, { "epoch": 0.2723475767985567, "grad_norm": 3.069365584711649, "learning_rate": 1.7075029113610992e-07, "loss": 0.7119, "step": 2755 }, { "epoch": 0.2724464325433112, "grad_norm": 6.326860751064532, "learning_rate": 1.7072765784573732e-07, "loss": 0.7133, "step": 2756 }, { "epoch": 0.2725452882880656, "grad_norm": 19.095871113892567, "learning_rate": 1.707050173032867e-07, "loss": 0.6668, "step": 2757 }, { "epoch": 0.2726441440328201, "grad_norm": 3.265472457129888, "learning_rate": 1.7068236951107954e-07, "loss": 0.7157, "step": 2758 }, { "epoch": 0.2727429997775746, "grad_norm": 4.878611144433243, "learning_rate": 1.7065971447143804e-07, "loss": 0.785, "step": 2759 }, { "epoch": 0.27284185552232904, "grad_norm": 3.6253477542031542, "learning_rate": 1.706370521866851e-07, "loss": 0.7549, "step": 2760 }, { "epoch": 0.2729407112670835, "grad_norm": 4.715531098562006, "learning_rate": 1.7061438265914445e-07, "loss": 0.6667, "step": 2761 }, { "epoch": 0.27303956701183796, "grad_norm": 8.055872437049178, "learning_rate": 1.7059170589114048e-07, "loss": 0.704, "step": 2762 }, { "epoch": 0.27313842275659245, "grad_norm": 7.276697355399356, "learning_rate": 1.7056902188499836e-07, "loss": 0.666, "step": 2763 }, { "epoch": 0.27323727850134694, "grad_norm": 4.139905557877823, "learning_rate": 1.70546330643044e-07, "loss": 0.6967, "step": 2764 }, { "epoch": 0.27333613424610137, "grad_norm": 3.1560459948303916, "learning_rate": 1.705236321676041e-07, "loss": 0.6864, "step": 2765 }, { "epoch": 0.27343498999085586, "grad_norm": 4.122840556752035, "learning_rate": 1.70500926461006e-07, "loss": 0.6452, "step": 2766 }, { "epoch": 0.2735338457356103, "grad_norm": 12.754842171734808, "learning_rate": 1.704782135255779e-07, "loss": 0.8194, "step": 2767 }, { "epoch": 0.2736327014803648, "grad_norm": 3.667547230244823, "learning_rate": 1.7045549336364854e-07, "loss": 0.7546, "step": 2768 }, { "epoch": 0.27373155722511927, "grad_norm": 4.768286744570924, "learning_rate": 1.704327659775477e-07, "loss": 0.6623, "step": 2769 }, { "epoch": 0.2738304129698737, "grad_norm": 3.407922886015219, "learning_rate": 1.7041003136960564e-07, "loss": 0.7402, "step": 2770 }, { "epoch": 0.2739292687146282, "grad_norm": 3.8010381477705804, "learning_rate": 1.7038728954215346e-07, "loss": 0.7771, "step": 2771 }, { "epoch": 0.2740281244593826, "grad_norm": 10.353262258327643, "learning_rate": 1.7036454049752304e-07, "loss": 0.7542, "step": 2772 }, { "epoch": 0.2741269802041371, "grad_norm": 3.4756256338326987, "learning_rate": 1.7034178423804698e-07, "loss": 0.7522, "step": 2773 }, { "epoch": 0.2742258359488916, "grad_norm": 8.270971235605833, "learning_rate": 1.7031902076605853e-07, "loss": 0.6332, "step": 2774 }, { "epoch": 0.27432469169364604, "grad_norm": 9.11170275799054, "learning_rate": 1.7029625008389177e-07, "loss": 0.7218, "step": 2775 }, { "epoch": 0.27442354743840053, "grad_norm": 4.026906649270887, "learning_rate": 1.702734721938815e-07, "loss": 0.5586, "step": 2776 }, { "epoch": 0.27452240318315496, "grad_norm": 4.070619481102734, "learning_rate": 1.7025068709836327e-07, "loss": 0.7469, "step": 2777 }, { "epoch": 0.27462125892790945, "grad_norm": 3.738105023272579, "learning_rate": 1.7022789479967337e-07, "loss": 0.6428, "step": 2778 }, { "epoch": 0.27472011467266394, "grad_norm": 3.870121556443452, "learning_rate": 1.7020509530014878e-07, "loss": 0.798, "step": 2779 }, { "epoch": 0.2748189704174184, "grad_norm": 3.6017344301138077, "learning_rate": 1.7018228860212726e-07, "loss": 0.5284, "step": 2780 }, { "epoch": 0.27491782616217286, "grad_norm": 5.704918520965602, "learning_rate": 1.701594747079473e-07, "loss": 0.7337, "step": 2781 }, { "epoch": 0.2750166819069273, "grad_norm": 9.983047148869666, "learning_rate": 1.7013665361994815e-07, "loss": 0.7453, "step": 2782 }, { "epoch": 0.2751155376516818, "grad_norm": 3.373031994405885, "learning_rate": 1.7011382534046977e-07, "loss": 0.8295, "step": 2783 }, { "epoch": 0.2752143933964363, "grad_norm": 4.48474411181987, "learning_rate": 1.700909898718528e-07, "loss": 0.7813, "step": 2784 }, { "epoch": 0.2753132491411907, "grad_norm": 4.138888399143323, "learning_rate": 1.700681472164388e-07, "loss": 0.6591, "step": 2785 }, { "epoch": 0.2754121048859452, "grad_norm": 7.719928914748751, "learning_rate": 1.7004529737656986e-07, "loss": 0.6932, "step": 2786 }, { "epoch": 0.27551096063069963, "grad_norm": 3.5853207966446896, "learning_rate": 1.700224403545889e-07, "loss": 0.6036, "step": 2787 }, { "epoch": 0.2756098163754541, "grad_norm": 4.70423189644711, "learning_rate": 1.6999957615283956e-07, "loss": 0.6995, "step": 2788 }, { "epoch": 0.2757086721202086, "grad_norm": 5.756835304811161, "learning_rate": 1.6997670477366631e-07, "loss": 0.7856, "step": 2789 }, { "epoch": 0.27580752786496304, "grad_norm": 3.1137238895000956, "learning_rate": 1.699538262194142e-07, "loss": 0.7606, "step": 2790 }, { "epoch": 0.27590638360971753, "grad_norm": 4.016064938163057, "learning_rate": 1.699309404924291e-07, "loss": 0.7817, "step": 2791 }, { "epoch": 0.27600523935447197, "grad_norm": 7.78352844587789, "learning_rate": 1.6990804759505756e-07, "loss": 0.7192, "step": 2792 }, { "epoch": 0.27610409509922645, "grad_norm": 5.059689749454026, "learning_rate": 1.69885147529647e-07, "loss": 0.8114, "step": 2793 }, { "epoch": 0.27620295084398094, "grad_norm": 3.9391395393723148, "learning_rate": 1.6986224029854543e-07, "loss": 0.7772, "step": 2794 }, { "epoch": 0.2763018065887354, "grad_norm": 3.4095566878137404, "learning_rate": 1.698393259041017e-07, "loss": 0.7723, "step": 2795 }, { "epoch": 0.27640066233348987, "grad_norm": 3.2045375296487695, "learning_rate": 1.6981640434866523e-07, "loss": 0.741, "step": 2796 }, { "epoch": 0.2764995180782443, "grad_norm": 3.5179637813241955, "learning_rate": 1.6979347563458636e-07, "loss": 0.6722, "step": 2797 }, { "epoch": 0.2765983738229988, "grad_norm": 5.311132005149992, "learning_rate": 1.6977053976421614e-07, "loss": 0.6788, "step": 2798 }, { "epoch": 0.2766972295677533, "grad_norm": 4.868701827805657, "learning_rate": 1.6974759673990624e-07, "loss": 0.6363, "step": 2799 }, { "epoch": 0.2767960853125077, "grad_norm": 9.703872098721078, "learning_rate": 1.6972464656400912e-07, "loss": 0.7397, "step": 2800 }, { "epoch": 0.2768949410572622, "grad_norm": 4.732192263472022, "learning_rate": 1.69701689238878e-07, "loss": 0.5856, "step": 2801 }, { "epoch": 0.27699379680201663, "grad_norm": 13.997146616948957, "learning_rate": 1.6967872476686686e-07, "loss": 0.6972, "step": 2802 }, { "epoch": 0.2770926525467711, "grad_norm": 12.40888459050881, "learning_rate": 1.6965575315033035e-07, "loss": 0.7141, "step": 2803 }, { "epoch": 0.2771915082915256, "grad_norm": 6.9588597584764935, "learning_rate": 1.696327743916238e-07, "loss": 0.8439, "step": 2804 }, { "epoch": 0.27729036403628005, "grad_norm": 4.189039861994579, "learning_rate": 1.696097884931034e-07, "loss": 0.7188, "step": 2805 }, { "epoch": 0.27738921978103454, "grad_norm": 3.051323762707604, "learning_rate": 1.69586795457126e-07, "loss": 0.7341, "step": 2806 }, { "epoch": 0.27748807552578897, "grad_norm": 3.4573748698540574, "learning_rate": 1.6956379528604924e-07, "loss": 0.6919, "step": 2807 }, { "epoch": 0.27758693127054346, "grad_norm": 3.228416792717548, "learning_rate": 1.6954078798223136e-07, "loss": 0.753, "step": 2808 }, { "epoch": 0.27768578701529795, "grad_norm": 4.282133470970379, "learning_rate": 1.695177735480315e-07, "loss": 0.7471, "step": 2809 }, { "epoch": 0.2777846427600524, "grad_norm": 4.6729284497532495, "learning_rate": 1.6949475198580942e-07, "loss": 0.7314, "step": 2810 }, { "epoch": 0.27788349850480687, "grad_norm": 3.167353788249365, "learning_rate": 1.6947172329792565e-07, "loss": 0.7288, "step": 2811 }, { "epoch": 0.2779823542495613, "grad_norm": 4.23408905054495, "learning_rate": 1.6944868748674142e-07, "loss": 0.6659, "step": 2812 }, { "epoch": 0.2780812099943158, "grad_norm": 3.7231378919294595, "learning_rate": 1.6942564455461873e-07, "loss": 0.6045, "step": 2813 }, { "epoch": 0.2781800657390703, "grad_norm": 13.448174180378265, "learning_rate": 1.6940259450392032e-07, "loss": 0.777, "step": 2814 }, { "epoch": 0.2782789214838247, "grad_norm": 4.873261974364326, "learning_rate": 1.6937953733700954e-07, "loss": 0.7609, "step": 2815 }, { "epoch": 0.2783777772285792, "grad_norm": 5.907146252544327, "learning_rate": 1.6935647305625068e-07, "loss": 0.7629, "step": 2816 }, { "epoch": 0.27847663297333364, "grad_norm": 8.734817881861092, "learning_rate": 1.6933340166400858e-07, "loss": 0.7878, "step": 2817 }, { "epoch": 0.2785754887180881, "grad_norm": 4.158507642036664, "learning_rate": 1.693103231626489e-07, "loss": 0.7645, "step": 2818 }, { "epoch": 0.2786743444628426, "grad_norm": 4.200664521405948, "learning_rate": 1.692872375545379e-07, "loss": 0.7476, "step": 2819 }, { "epoch": 0.27877320020759705, "grad_norm": 3.5485701523357136, "learning_rate": 1.6926414484204283e-07, "loss": 0.6651, "step": 2820 }, { "epoch": 0.27887205595235154, "grad_norm": 4.324023948982937, "learning_rate": 1.692410450275314e-07, "loss": 0.6624, "step": 2821 }, { "epoch": 0.27897091169710597, "grad_norm": 4.333034138510861, "learning_rate": 1.6921793811337217e-07, "loss": 0.6685, "step": 2822 }, { "epoch": 0.27906976744186046, "grad_norm": 9.068404695842725, "learning_rate": 1.6919482410193443e-07, "loss": 0.6782, "step": 2823 }, { "epoch": 0.27916862318661495, "grad_norm": 3.2363748983731937, "learning_rate": 1.6917170299558818e-07, "loss": 0.5884, "step": 2824 }, { "epoch": 0.2792674789313694, "grad_norm": 3.665437285190474, "learning_rate": 1.6914857479670412e-07, "loss": 0.6785, "step": 2825 }, { "epoch": 0.2793663346761239, "grad_norm": 4.855405481919817, "learning_rate": 1.6912543950765374e-07, "loss": 0.7524, "step": 2826 }, { "epoch": 0.2794651904208783, "grad_norm": 6.211779238103908, "learning_rate": 1.6910229713080922e-07, "loss": 0.7208, "step": 2827 }, { "epoch": 0.2795640461656328, "grad_norm": 4.155043109512756, "learning_rate": 1.6907914766854344e-07, "loss": 0.7273, "step": 2828 }, { "epoch": 0.2796629019103873, "grad_norm": 3.969927702184816, "learning_rate": 1.6905599112323008e-07, "loss": 0.7038, "step": 2829 }, { "epoch": 0.2797617576551417, "grad_norm": 4.833828568473684, "learning_rate": 1.6903282749724345e-07, "loss": 0.7982, "step": 2830 }, { "epoch": 0.2798606133998962, "grad_norm": 5.781415210383675, "learning_rate": 1.6900965679295868e-07, "loss": 0.6931, "step": 2831 }, { "epoch": 0.27995946914465064, "grad_norm": 4.383609742721973, "learning_rate": 1.6898647901275157e-07, "loss": 0.7466, "step": 2832 }, { "epoch": 0.28005832488940513, "grad_norm": 5.016381818609073, "learning_rate": 1.6896329415899868e-07, "loss": 0.7556, "step": 2833 }, { "epoch": 0.2801571806341596, "grad_norm": 3.682293338678717, "learning_rate": 1.6894010223407724e-07, "loss": 0.7194, "step": 2834 }, { "epoch": 0.28025603637891405, "grad_norm": 17.37940688854949, "learning_rate": 1.6891690324036526e-07, "loss": 0.7496, "step": 2835 }, { "epoch": 0.28035489212366854, "grad_norm": 5.225918800480516, "learning_rate": 1.6889369718024143e-07, "loss": 0.7189, "step": 2836 }, { "epoch": 0.28045374786842303, "grad_norm": 2.971087388548522, "learning_rate": 1.6887048405608519e-07, "loss": 0.7428, "step": 2837 }, { "epoch": 0.28055260361317746, "grad_norm": 3.7216510252264103, "learning_rate": 1.6884726387027678e-07, "loss": 0.8743, "step": 2838 }, { "epoch": 0.28065145935793195, "grad_norm": 3.237759571123034, "learning_rate": 1.68824036625197e-07, "loss": 0.6934, "step": 2839 }, { "epoch": 0.2807503151026864, "grad_norm": 4.211631839018718, "learning_rate": 1.6880080232322746e-07, "loss": 0.6126, "step": 2840 }, { "epoch": 0.2808491708474409, "grad_norm": 6.430570881994642, "learning_rate": 1.6877756096675054e-07, "loss": 0.6701, "step": 2841 }, { "epoch": 0.28094802659219537, "grad_norm": 4.086316188650863, "learning_rate": 1.687543125581493e-07, "loss": 0.6913, "step": 2842 }, { "epoch": 0.2810468823369498, "grad_norm": 4.714319876249902, "learning_rate": 1.687310570998075e-07, "loss": 0.7544, "step": 2843 }, { "epoch": 0.2811457380817043, "grad_norm": 10.870422498915262, "learning_rate": 1.6870779459410965e-07, "loss": 0.6762, "step": 2844 }, { "epoch": 0.2812445938264587, "grad_norm": 5.722773257808273, "learning_rate": 1.6868452504344096e-07, "loss": 0.6675, "step": 2845 }, { "epoch": 0.2813434495712132, "grad_norm": 5.084053881943481, "learning_rate": 1.6866124845018743e-07, "loss": 0.8293, "step": 2846 }, { "epoch": 0.2814423053159677, "grad_norm": 3.616338770554812, "learning_rate": 1.6863796481673567e-07, "loss": 0.7257, "step": 2847 }, { "epoch": 0.28154116106072213, "grad_norm": 6.414030303976801, "learning_rate": 1.686146741454731e-07, "loss": 0.7652, "step": 2848 }, { "epoch": 0.2816400168054766, "grad_norm": 6.649939923045427, "learning_rate": 1.6859137643878783e-07, "loss": 0.7246, "step": 2849 }, { "epoch": 0.28173887255023106, "grad_norm": 3.516095853527276, "learning_rate": 1.6856807169906873e-07, "loss": 0.7709, "step": 2850 }, { "epoch": 0.28183772829498555, "grad_norm": 3.9424212774849736, "learning_rate": 1.685447599287053e-07, "loss": 0.7738, "step": 2851 }, { "epoch": 0.28193658403974003, "grad_norm": 11.92224343241575, "learning_rate": 1.6852144113008784e-07, "loss": 0.6406, "step": 2852 }, { "epoch": 0.28203543978449447, "grad_norm": 3.196653867262117, "learning_rate": 1.6849811530560737e-07, "loss": 0.8113, "step": 2853 }, { "epoch": 0.28213429552924896, "grad_norm": 4.951119183837967, "learning_rate": 1.6847478245765562e-07, "loss": 0.6373, "step": 2854 }, { "epoch": 0.2822331512740034, "grad_norm": 5.919404959664721, "learning_rate": 1.6845144258862495e-07, "loss": 0.7213, "step": 2855 }, { "epoch": 0.2823320070187579, "grad_norm": 3.2095734786175063, "learning_rate": 1.684280957009086e-07, "loss": 0.6944, "step": 2856 }, { "epoch": 0.28243086276351237, "grad_norm": 4.069845856582812, "learning_rate": 1.684047417969004e-07, "loss": 0.6189, "step": 2857 }, { "epoch": 0.2825297185082668, "grad_norm": 11.518702082261028, "learning_rate": 1.6838138087899497e-07, "loss": 0.6965, "step": 2858 }, { "epoch": 0.2826285742530213, "grad_norm": 4.766719028162733, "learning_rate": 1.6835801294958764e-07, "loss": 0.7674, "step": 2859 }, { "epoch": 0.2827274299977757, "grad_norm": 5.515020459840256, "learning_rate": 1.6833463801107447e-07, "loss": 0.6982, "step": 2860 }, { "epoch": 0.2828262857425302, "grad_norm": 6.928467974207027, "learning_rate": 1.6831125606585213e-07, "loss": 0.7151, "step": 2861 }, { "epoch": 0.2829251414872847, "grad_norm": 5.051392298917339, "learning_rate": 1.6828786711631816e-07, "loss": 0.7204, "step": 2862 }, { "epoch": 0.28302399723203914, "grad_norm": 4.862576416082069, "learning_rate": 1.6826447116487072e-07, "loss": 0.7921, "step": 2863 }, { "epoch": 0.2831228529767936, "grad_norm": 3.4006399713670428, "learning_rate": 1.6824106821390873e-07, "loss": 0.7188, "step": 2864 }, { "epoch": 0.28322170872154806, "grad_norm": 4.348481050486068, "learning_rate": 1.6821765826583185e-07, "loss": 0.6761, "step": 2865 }, { "epoch": 0.28332056446630255, "grad_norm": 3.636123668405624, "learning_rate": 1.681942413230404e-07, "loss": 0.6679, "step": 2866 }, { "epoch": 0.28341942021105704, "grad_norm": 3.996002290216521, "learning_rate": 1.681708173879354e-07, "loss": 0.6948, "step": 2867 }, { "epoch": 0.28351827595581147, "grad_norm": 4.59692825259003, "learning_rate": 1.6814738646291867e-07, "loss": 0.759, "step": 2868 }, { "epoch": 0.28361713170056596, "grad_norm": 3.6556817858709327, "learning_rate": 1.6812394855039275e-07, "loss": 0.7179, "step": 2869 }, { "epoch": 0.2837159874453204, "grad_norm": 18.985045924265723, "learning_rate": 1.6810050365276078e-07, "loss": 0.6925, "step": 2870 }, { "epoch": 0.2838148431900749, "grad_norm": 3.6442241566462314, "learning_rate": 1.680770517724267e-07, "loss": 0.7531, "step": 2871 }, { "epoch": 0.2839136989348294, "grad_norm": 5.341634637513418, "learning_rate": 1.6805359291179516e-07, "loss": 0.6976, "step": 2872 }, { "epoch": 0.2840125546795838, "grad_norm": 3.9254379925815375, "learning_rate": 1.6803012707327157e-07, "loss": 0.7132, "step": 2873 }, { "epoch": 0.2841114104243383, "grad_norm": 6.264937793549122, "learning_rate": 1.6800665425926193e-07, "loss": 0.8026, "step": 2874 }, { "epoch": 0.28421026616909273, "grad_norm": 4.232157567778203, "learning_rate": 1.6798317447217306e-07, "loss": 0.6403, "step": 2875 }, { "epoch": 0.2843091219138472, "grad_norm": 3.9715828648934934, "learning_rate": 1.679596877144125e-07, "loss": 0.7674, "step": 2876 }, { "epoch": 0.2844079776586017, "grad_norm": 10.06476920334696, "learning_rate": 1.6793619398838841e-07, "loss": 0.6966, "step": 2877 }, { "epoch": 0.28450683340335614, "grad_norm": 4.683602055792839, "learning_rate": 1.679126932965098e-07, "loss": 0.7873, "step": 2878 }, { "epoch": 0.28460568914811063, "grad_norm": 21.737212338572675, "learning_rate": 1.6788918564118624e-07, "loss": 0.7248, "step": 2879 }, { "epoch": 0.28470454489286506, "grad_norm": 3.6208041225978596, "learning_rate": 1.6786567102482812e-07, "loss": 0.7503, "step": 2880 }, { "epoch": 0.28480340063761955, "grad_norm": 4.063916175889555, "learning_rate": 1.6784214944984653e-07, "loss": 0.7492, "step": 2881 }, { "epoch": 0.28490225638237404, "grad_norm": 4.771983223129476, "learning_rate": 1.678186209186533e-07, "loss": 0.7795, "step": 2882 }, { "epoch": 0.2850011121271285, "grad_norm": 3.478213936421685, "learning_rate": 1.6779508543366085e-07, "loss": 0.7681, "step": 2883 }, { "epoch": 0.28509996787188296, "grad_norm": 5.214815118640428, "learning_rate": 1.6777154299728244e-07, "loss": 0.8241, "step": 2884 }, { "epoch": 0.2851988236166374, "grad_norm": 5.3560928952892555, "learning_rate": 1.6774799361193197e-07, "loss": 0.7107, "step": 2885 }, { "epoch": 0.2852976793613919, "grad_norm": 4.816028933181752, "learning_rate": 1.6772443728002414e-07, "loss": 0.6983, "step": 2886 }, { "epoch": 0.2853965351061464, "grad_norm": 6.814466816749082, "learning_rate": 1.677008740039743e-07, "loss": 0.6199, "step": 2887 }, { "epoch": 0.2854953908509008, "grad_norm": 10.708775733874663, "learning_rate": 1.6767730378619845e-07, "loss": 0.6948, "step": 2888 }, { "epoch": 0.2855942465956553, "grad_norm": 3.3777068158072825, "learning_rate": 1.676537266291134e-07, "loss": 0.7592, "step": 2889 }, { "epoch": 0.28569310234040973, "grad_norm": 4.280452404553901, "learning_rate": 1.6763014253513666e-07, "loss": 0.6916, "step": 2890 }, { "epoch": 0.2857919580851642, "grad_norm": 3.6163916616914062, "learning_rate": 1.6760655150668644e-07, "loss": 0.6249, "step": 2891 }, { "epoch": 0.2858908138299187, "grad_norm": 4.932592597327992, "learning_rate": 1.675829535461816e-07, "loss": 0.806, "step": 2892 }, { "epoch": 0.28598966957467314, "grad_norm": 5.705966311411102, "learning_rate": 1.675593486560418e-07, "loss": 0.6412, "step": 2893 }, { "epoch": 0.28608852531942763, "grad_norm": 4.431458251651551, "learning_rate": 1.675357368386874e-07, "loss": 0.7001, "step": 2894 }, { "epoch": 0.28618738106418207, "grad_norm": 3.1674056628654914, "learning_rate": 1.6751211809653937e-07, "loss": 0.7109, "step": 2895 }, { "epoch": 0.28628623680893656, "grad_norm": 3.350369820753518, "learning_rate": 1.6748849243201948e-07, "loss": 0.7188, "step": 2896 }, { "epoch": 0.28638509255369105, "grad_norm": 4.254320612478171, "learning_rate": 1.6746485984755026e-07, "loss": 0.7023, "step": 2897 }, { "epoch": 0.2864839482984455, "grad_norm": 3.7109889419325794, "learning_rate": 1.6744122034555482e-07, "loss": 0.8068, "step": 2898 }, { "epoch": 0.28658280404319997, "grad_norm": 5.706799763509911, "learning_rate": 1.6741757392845707e-07, "loss": 0.7974, "step": 2899 }, { "epoch": 0.2866816597879544, "grad_norm": 5.869183973296711, "learning_rate": 1.673939205986816e-07, "loss": 0.7768, "step": 2900 }, { "epoch": 0.2867805155327089, "grad_norm": 3.2361292670326733, "learning_rate": 1.6737026035865367e-07, "loss": 0.6772, "step": 2901 }, { "epoch": 0.2868793712774634, "grad_norm": 3.462386269100872, "learning_rate": 1.6734659321079937e-07, "loss": 0.6761, "step": 2902 }, { "epoch": 0.2869782270222178, "grad_norm": 4.243407352340233, "learning_rate": 1.6732291915754532e-07, "loss": 0.7636, "step": 2903 }, { "epoch": 0.2870770827669723, "grad_norm": 4.082366359224146, "learning_rate": 1.6729923820131902e-07, "loss": 0.6824, "step": 2904 }, { "epoch": 0.28717593851172674, "grad_norm": 4.0235471054253305, "learning_rate": 1.6727555034454858e-07, "loss": 0.6207, "step": 2905 }, { "epoch": 0.2872747942564812, "grad_norm": 50.42877646547694, "learning_rate": 1.6725185558966283e-07, "loss": 0.6896, "step": 2906 }, { "epoch": 0.2873736500012357, "grad_norm": 24.766436414665236, "learning_rate": 1.672281539390913e-07, "loss": 0.7342, "step": 2907 }, { "epoch": 0.28747250574599015, "grad_norm": 6.078040851964941, "learning_rate": 1.672044453952643e-07, "loss": 0.7116, "step": 2908 }, { "epoch": 0.28757136149074464, "grad_norm": 3.697732134158651, "learning_rate": 1.6718072996061278e-07, "loss": 0.6656, "step": 2909 }, { "epoch": 0.28767021723549907, "grad_norm": 3.7794213993260284, "learning_rate": 1.6715700763756833e-07, "loss": 0.7212, "step": 2910 }, { "epoch": 0.28776907298025356, "grad_norm": 4.452822381337555, "learning_rate": 1.671332784285634e-07, "loss": 0.7014, "step": 2911 }, { "epoch": 0.28786792872500805, "grad_norm": 5.330463295720958, "learning_rate": 1.671095423360311e-07, "loss": 0.7317, "step": 2912 }, { "epoch": 0.2879667844697625, "grad_norm": 5.5598647708252535, "learning_rate": 1.6708579936240511e-07, "loss": 0.6969, "step": 2913 }, { "epoch": 0.28806564021451697, "grad_norm": 14.046334380035546, "learning_rate": 1.6706204951012002e-07, "loss": 0.6786, "step": 2914 }, { "epoch": 0.28816449595927146, "grad_norm": 6.977130181158292, "learning_rate": 1.6703829278161097e-07, "loss": 0.6661, "step": 2915 }, { "epoch": 0.2882633517040259, "grad_norm": 6.275243500853885, "learning_rate": 1.6701452917931389e-07, "loss": 0.68, "step": 2916 }, { "epoch": 0.2883622074487804, "grad_norm": 3.3461338033201344, "learning_rate": 1.6699075870566536e-07, "loss": 0.6708, "step": 2917 }, { "epoch": 0.2884610631935348, "grad_norm": 4.741526704915874, "learning_rate": 1.6696698136310272e-07, "loss": 0.6326, "step": 2918 }, { "epoch": 0.2885599189382893, "grad_norm": 2.9948196367244, "learning_rate": 1.6694319715406398e-07, "loss": 0.7366, "step": 2919 }, { "epoch": 0.2886587746830438, "grad_norm": 4.477561455645616, "learning_rate": 1.6691940608098784e-07, "loss": 0.7147, "step": 2920 }, { "epoch": 0.28875763042779823, "grad_norm": 4.009977902128466, "learning_rate": 1.6689560814631374e-07, "loss": 0.749, "step": 2921 }, { "epoch": 0.2888564861725527, "grad_norm": 9.988636877376875, "learning_rate": 1.6687180335248182e-07, "loss": 0.7416, "step": 2922 }, { "epoch": 0.28895534191730715, "grad_norm": 8.075965488731708, "learning_rate": 1.6684799170193287e-07, "loss": 0.66, "step": 2923 }, { "epoch": 0.28905419766206164, "grad_norm": 4.660852732184505, "learning_rate": 1.6682417319710846e-07, "loss": 0.7228, "step": 2924 }, { "epoch": 0.28915305340681613, "grad_norm": 3.568649443570103, "learning_rate": 1.668003478404508e-07, "loss": 0.7191, "step": 2925 }, { "epoch": 0.28925190915157056, "grad_norm": 6.416431315859174, "learning_rate": 1.6677651563440285e-07, "loss": 0.7531, "step": 2926 }, { "epoch": 0.28935076489632505, "grad_norm": 10.93267692091047, "learning_rate": 1.6675267658140826e-07, "loss": 0.6913, "step": 2927 }, { "epoch": 0.2894496206410795, "grad_norm": 3.0879162301742618, "learning_rate": 1.6672883068391132e-07, "loss": 0.7046, "step": 2928 }, { "epoch": 0.289548476385834, "grad_norm": 5.983675047586937, "learning_rate": 1.6670497794435717e-07, "loss": 0.8554, "step": 2929 }, { "epoch": 0.28964733213058846, "grad_norm": 4.349238384685525, "learning_rate": 1.6668111836519147e-07, "loss": 0.686, "step": 2930 }, { "epoch": 0.2897461878753429, "grad_norm": 4.511126269322746, "learning_rate": 1.666572519488607e-07, "loss": 0.7194, "step": 2931 }, { "epoch": 0.2898450436200974, "grad_norm": 7.16640082927601, "learning_rate": 1.6663337869781198e-07, "loss": 0.5864, "step": 2932 }, { "epoch": 0.2899438993648518, "grad_norm": 4.966485168665288, "learning_rate": 1.6660949861449322e-07, "loss": 0.6871, "step": 2933 }, { "epoch": 0.2900427551096063, "grad_norm": 5.7887026562655395, "learning_rate": 1.6658561170135291e-07, "loss": 0.6512, "step": 2934 }, { "epoch": 0.2901416108543608, "grad_norm": 5.166783202317499, "learning_rate": 1.6656171796084039e-07, "loss": 0.7067, "step": 2935 }, { "epoch": 0.29024046659911523, "grad_norm": 3.9151767244014373, "learning_rate": 1.6653781739540547e-07, "loss": 0.7325, "step": 2936 }, { "epoch": 0.2903393223438697, "grad_norm": 3.8487066115986357, "learning_rate": 1.665139100074989e-07, "loss": 0.6924, "step": 2937 }, { "epoch": 0.29043817808862415, "grad_norm": 3.7980176192593036, "learning_rate": 1.6648999579957204e-07, "loss": 0.8204, "step": 2938 }, { "epoch": 0.29053703383337864, "grad_norm": 3.200030341089195, "learning_rate": 1.6646607477407687e-07, "loss": 0.747, "step": 2939 }, { "epoch": 0.29063588957813313, "grad_norm": 6.182722955579262, "learning_rate": 1.664421469334662e-07, "loss": 0.7084, "step": 2940 }, { "epoch": 0.29073474532288757, "grad_norm": 3.8641692473348304, "learning_rate": 1.6641821228019345e-07, "loss": 0.7844, "step": 2941 }, { "epoch": 0.29083360106764206, "grad_norm": 4.942923102788017, "learning_rate": 1.6639427081671277e-07, "loss": 0.6582, "step": 2942 }, { "epoch": 0.2909324568123965, "grad_norm": 8.762598659639725, "learning_rate": 1.6637032254547898e-07, "loss": 0.6555, "step": 2943 }, { "epoch": 0.291031312557151, "grad_norm": 6.508385031944667, "learning_rate": 1.6634636746894762e-07, "loss": 0.6483, "step": 2944 }, { "epoch": 0.29113016830190547, "grad_norm": 8.7970375321157, "learning_rate": 1.6632240558957502e-07, "loss": 0.7103, "step": 2945 }, { "epoch": 0.2912290240466599, "grad_norm": 5.671693045505708, "learning_rate": 1.66298436909818e-07, "loss": 0.8462, "step": 2946 }, { "epoch": 0.2913278797914144, "grad_norm": 3.6232265698574357, "learning_rate": 1.6627446143213427e-07, "loss": 0.7414, "step": 2947 }, { "epoch": 0.2914267355361688, "grad_norm": 8.020449977608214, "learning_rate": 1.6625047915898214e-07, "loss": 0.7268, "step": 2948 }, { "epoch": 0.2915255912809233, "grad_norm": 7.2283264676661325, "learning_rate": 1.6622649009282065e-07, "loss": 0.6989, "step": 2949 }, { "epoch": 0.2916244470256778, "grad_norm": 5.220455960744127, "learning_rate": 1.6620249423610947e-07, "loss": 0.7514, "step": 2950 }, { "epoch": 0.29172330277043224, "grad_norm": 3.7334273700753213, "learning_rate": 1.661784915913091e-07, "loss": 0.758, "step": 2951 }, { "epoch": 0.2918221585151867, "grad_norm": 18.19610169147317, "learning_rate": 1.661544821608806e-07, "loss": 0.7377, "step": 2952 }, { "epoch": 0.29192101425994116, "grad_norm": 4.36827086675971, "learning_rate": 1.661304659472858e-07, "loss": 0.6868, "step": 2953 }, { "epoch": 0.29201987000469565, "grad_norm": 48.63254532072366, "learning_rate": 1.6610644295298722e-07, "loss": 0.6601, "step": 2954 }, { "epoch": 0.29211872574945014, "grad_norm": 4.443718576682958, "learning_rate": 1.6608241318044804e-07, "loss": 0.7658, "step": 2955 }, { "epoch": 0.29221758149420457, "grad_norm": 4.005970576124661, "learning_rate": 1.6605837663213217e-07, "loss": 0.695, "step": 2956 }, { "epoch": 0.29231643723895906, "grad_norm": 4.598474590093278, "learning_rate": 1.6603433331050423e-07, "loss": 0.7404, "step": 2957 }, { "epoch": 0.2924152929837135, "grad_norm": 7.327568675665927, "learning_rate": 1.6601028321802947e-07, "loss": 0.6888, "step": 2958 }, { "epoch": 0.292514148728468, "grad_norm": 4.587068098689788, "learning_rate": 1.6598622635717388e-07, "loss": 0.7501, "step": 2959 }, { "epoch": 0.29261300447322247, "grad_norm": 4.333076312326571, "learning_rate": 1.6596216273040415e-07, "loss": 0.6223, "step": 2960 }, { "epoch": 0.2927118602179769, "grad_norm": 8.160101619169971, "learning_rate": 1.6593809234018763e-07, "loss": 0.6468, "step": 2961 }, { "epoch": 0.2928107159627314, "grad_norm": 4.433412237394138, "learning_rate": 1.659140151889924e-07, "loss": 0.7401, "step": 2962 }, { "epoch": 0.2929095717074858, "grad_norm": 5.372283018107874, "learning_rate": 1.6588993127928722e-07, "loss": 0.6307, "step": 2963 }, { "epoch": 0.2930084274522403, "grad_norm": 3.127936750932502, "learning_rate": 1.6586584061354155e-07, "loss": 0.8461, "step": 2964 }, { "epoch": 0.2931072831969948, "grad_norm": 3.194941310910834, "learning_rate": 1.6584174319422548e-07, "loss": 0.6365, "step": 2965 }, { "epoch": 0.29320613894174924, "grad_norm": 5.56381485861516, "learning_rate": 1.658176390238099e-07, "loss": 0.7335, "step": 2966 }, { "epoch": 0.29330499468650373, "grad_norm": 19.961263715217665, "learning_rate": 1.657935281047663e-07, "loss": 0.6592, "step": 2967 }, { "epoch": 0.29340385043125816, "grad_norm": 9.051526679705852, "learning_rate": 1.6576941043956694e-07, "loss": 0.8002, "step": 2968 }, { "epoch": 0.29350270617601265, "grad_norm": 3.152604692712811, "learning_rate": 1.6574528603068468e-07, "loss": 0.6581, "step": 2969 }, { "epoch": 0.29360156192076714, "grad_norm": 5.272157188295804, "learning_rate": 1.6572115488059317e-07, "loss": 0.6278, "step": 2970 }, { "epoch": 0.2937004176655216, "grad_norm": 7.168307922641315, "learning_rate": 1.6569701699176667e-07, "loss": 0.7023, "step": 2971 }, { "epoch": 0.29379927341027606, "grad_norm": 4.093847027078792, "learning_rate": 1.6567287236668018e-07, "loss": 0.7866, "step": 2972 }, { "epoch": 0.2938981291550305, "grad_norm": 3.5160967723592917, "learning_rate": 1.6564872100780938e-07, "loss": 0.705, "step": 2973 }, { "epoch": 0.293996984899785, "grad_norm": 3.9093644109658814, "learning_rate": 1.656245629176306e-07, "loss": 0.7184, "step": 2974 }, { "epoch": 0.2940958406445395, "grad_norm": 3.4994053117086885, "learning_rate": 1.6560039809862096e-07, "loss": 0.7201, "step": 2975 }, { "epoch": 0.2941946963892939, "grad_norm": 5.46941639257022, "learning_rate": 1.6557622655325812e-07, "loss": 0.7074, "step": 2976 }, { "epoch": 0.2942935521340484, "grad_norm": 8.625950544010237, "learning_rate": 1.6555204828402062e-07, "loss": 0.7551, "step": 2977 }, { "epoch": 0.29439240787880283, "grad_norm": 3.44053630922227, "learning_rate": 1.655278632933875e-07, "loss": 0.7024, "step": 2978 }, { "epoch": 0.2944912636235573, "grad_norm": 3.2769510141931586, "learning_rate": 1.6550367158383863e-07, "loss": 0.6996, "step": 2979 }, { "epoch": 0.2945901193683118, "grad_norm": 5.087418971183405, "learning_rate": 1.6547947315785445e-07, "loss": 0.7046, "step": 2980 }, { "epoch": 0.29468897511306624, "grad_norm": 4.603317339518199, "learning_rate": 1.6545526801791622e-07, "loss": 0.6772, "step": 2981 }, { "epoch": 0.29478783085782073, "grad_norm": 4.200853302529868, "learning_rate": 1.6543105616650578e-07, "loss": 0.7141, "step": 2982 }, { "epoch": 0.29488668660257517, "grad_norm": 4.358310546584248, "learning_rate": 1.6540683760610569e-07, "loss": 0.681, "step": 2983 }, { "epoch": 0.29498554234732965, "grad_norm": 3.443361249799756, "learning_rate": 1.653826123391992e-07, "loss": 0.6801, "step": 2984 }, { "epoch": 0.29508439809208414, "grad_norm": 4.0707045879363655, "learning_rate": 1.6535838036827033e-07, "loss": 0.7006, "step": 2985 }, { "epoch": 0.2951832538368386, "grad_norm": 3.4866961306645767, "learning_rate": 1.6533414169580363e-07, "loss": 0.6722, "step": 2986 }, { "epoch": 0.29528210958159307, "grad_norm": 4.334466789087875, "learning_rate": 1.6530989632428446e-07, "loss": 0.72, "step": 2987 }, { "epoch": 0.2953809653263475, "grad_norm": 5.9272484422229645, "learning_rate": 1.6528564425619877e-07, "loss": 0.7018, "step": 2988 }, { "epoch": 0.295479821071102, "grad_norm": 3.35716496417885, "learning_rate": 1.6526138549403332e-07, "loss": 0.6778, "step": 2989 }, { "epoch": 0.2955786768158565, "grad_norm": 4.141959897175426, "learning_rate": 1.652371200402755e-07, "loss": 0.6012, "step": 2990 }, { "epoch": 0.2956775325606109, "grad_norm": 3.8420179003946893, "learning_rate": 1.652128478974133e-07, "loss": 0.7247, "step": 2991 }, { "epoch": 0.2957763883053654, "grad_norm": 3.475362912219321, "learning_rate": 1.6518856906793547e-07, "loss": 0.6828, "step": 2992 }, { "epoch": 0.2958752440501199, "grad_norm": 6.585532581813327, "learning_rate": 1.6516428355433153e-07, "loss": 0.7764, "step": 2993 }, { "epoch": 0.2959740997948743, "grad_norm": 4.389184830521142, "learning_rate": 1.6513999135909152e-07, "loss": 0.7403, "step": 2994 }, { "epoch": 0.2960729555396288, "grad_norm": 5.106313007504171, "learning_rate": 1.651156924847063e-07, "loss": 0.7402, "step": 2995 }, { "epoch": 0.29617181128438325, "grad_norm": 3.877936240751328, "learning_rate": 1.6509138693366732e-07, "loss": 0.761, "step": 2996 }, { "epoch": 0.29627066702913774, "grad_norm": 3.078411657447238, "learning_rate": 1.650670747084668e-07, "loss": 0.7418, "step": 2997 }, { "epoch": 0.2963695227738922, "grad_norm": 3.9771299924636927, "learning_rate": 1.6504275581159758e-07, "loss": 0.7953, "step": 2998 }, { "epoch": 0.29646837851864666, "grad_norm": 6.894003844094074, "learning_rate": 1.6501843024555318e-07, "loss": 0.6874, "step": 2999 }, { "epoch": 0.29656723426340115, "grad_norm": 3.5467093215905994, "learning_rate": 1.6499409801282785e-07, "loss": 0.7513, "step": 3000 }, { "epoch": 0.2966660900081556, "grad_norm": 12.18314006526318, "learning_rate": 1.649697591159165e-07, "loss": 0.7785, "step": 3001 }, { "epoch": 0.29676494575291007, "grad_norm": 4.301698889518849, "learning_rate": 1.6494541355731474e-07, "loss": 0.7476, "step": 3002 }, { "epoch": 0.29686380149766456, "grad_norm": 3.096021783492918, "learning_rate": 1.649210613395188e-07, "loss": 0.6774, "step": 3003 }, { "epoch": 0.296962657242419, "grad_norm": 13.242118369689427, "learning_rate": 1.648967024650257e-07, "loss": 0.8341, "step": 3004 }, { "epoch": 0.2970615129871735, "grad_norm": 3.1352313424120792, "learning_rate": 1.6487233693633303e-07, "loss": 0.706, "step": 3005 }, { "epoch": 0.2971603687319279, "grad_norm": 3.8988976560114414, "learning_rate": 1.6484796475593918e-07, "loss": 0.6745, "step": 3006 }, { "epoch": 0.2972592244766824, "grad_norm": 3.9653422286768474, "learning_rate": 1.648235859263431e-07, "loss": 0.678, "step": 3007 }, { "epoch": 0.2973580802214369, "grad_norm": 6.022121126067576, "learning_rate": 1.6479920045004446e-07, "loss": 0.8432, "step": 3008 }, { "epoch": 0.2974569359661913, "grad_norm": 4.468648450264459, "learning_rate": 1.647748083295437e-07, "loss": 0.8277, "step": 3009 }, { "epoch": 0.2975557917109458, "grad_norm": 3.6150772843579895, "learning_rate": 1.6475040956734186e-07, "loss": 0.7898, "step": 3010 }, { "epoch": 0.29765464745570025, "grad_norm": 3.276258235841093, "learning_rate": 1.6472600416594067e-07, "loss": 0.7221, "step": 3011 }, { "epoch": 0.29775350320045474, "grad_norm": 3.5035569586161253, "learning_rate": 1.647015921278425e-07, "loss": 0.7533, "step": 3012 }, { "epoch": 0.29785235894520923, "grad_norm": 4.3866409792970416, "learning_rate": 1.6467717345555044e-07, "loss": 0.7357, "step": 3013 }, { "epoch": 0.29795121468996366, "grad_norm": 3.2447351712563095, "learning_rate": 1.6465274815156836e-07, "loss": 0.6759, "step": 3014 }, { "epoch": 0.29805007043471815, "grad_norm": 3.276743809408426, "learning_rate": 1.646283162184006e-07, "loss": 0.7715, "step": 3015 }, { "epoch": 0.2981489261794726, "grad_norm": 8.38757623437377, "learning_rate": 1.6460387765855238e-07, "loss": 0.7909, "step": 3016 }, { "epoch": 0.2982477819242271, "grad_norm": 20.162705488559553, "learning_rate": 1.6457943247452944e-07, "loss": 0.8305, "step": 3017 }, { "epoch": 0.29834663766898156, "grad_norm": 3.832009024872091, "learning_rate": 1.6455498066883838e-07, "loss": 0.6396, "step": 3018 }, { "epoch": 0.298445493413736, "grad_norm": 2.8746847241118942, "learning_rate": 1.6453052224398624e-07, "loss": 0.8247, "step": 3019 }, { "epoch": 0.2985443491584905, "grad_norm": 7.952288280138214, "learning_rate": 1.6450605720248098e-07, "loss": 0.6961, "step": 3020 }, { "epoch": 0.2986432049032449, "grad_norm": 4.3646281982968365, "learning_rate": 1.6448158554683106e-07, "loss": 0.8318, "step": 3021 }, { "epoch": 0.2987420606479994, "grad_norm": 4.191030397469099, "learning_rate": 1.6445710727954572e-07, "loss": 0.6939, "step": 3022 }, { "epoch": 0.2988409163927539, "grad_norm": 19.31873245798296, "learning_rate": 1.6443262240313486e-07, "loss": 0.7921, "step": 3023 }, { "epoch": 0.29893977213750833, "grad_norm": 4.7396718496914625, "learning_rate": 1.64408130920109e-07, "loss": 0.6267, "step": 3024 }, { "epoch": 0.2990386278822628, "grad_norm": 4.417926287717722, "learning_rate": 1.643836328329794e-07, "loss": 0.7204, "step": 3025 }, { "epoch": 0.29913748362701725, "grad_norm": 3.367857407363134, "learning_rate": 1.6435912814425795e-07, "loss": 0.796, "step": 3026 }, { "epoch": 0.29923633937177174, "grad_norm": 4.088557351052837, "learning_rate": 1.643346168564573e-07, "loss": 0.8114, "step": 3027 }, { "epoch": 0.29933519511652623, "grad_norm": 4.122775506905657, "learning_rate": 1.6431009897209072e-07, "loss": 0.66, "step": 3028 }, { "epoch": 0.29943405086128067, "grad_norm": 4.55890112487925, "learning_rate": 1.6428557449367212e-07, "loss": 0.7867, "step": 3029 }, { "epoch": 0.29953290660603515, "grad_norm": 3.2382730956525854, "learning_rate": 1.6426104342371612e-07, "loss": 0.6183, "step": 3030 }, { "epoch": 0.2996317623507896, "grad_norm": 2.872709031471928, "learning_rate": 1.6423650576473803e-07, "loss": 0.7287, "step": 3031 }, { "epoch": 0.2997306180955441, "grad_norm": 5.044942029246941, "learning_rate": 1.6421196151925383e-07, "loss": 0.7432, "step": 3032 }, { "epoch": 0.29982947384029857, "grad_norm": 3.2838876070869003, "learning_rate": 1.6418741068978016e-07, "loss": 0.7317, "step": 3033 }, { "epoch": 0.299928329585053, "grad_norm": 7.504296292192675, "learning_rate": 1.641628532788344e-07, "loss": 0.643, "step": 3034 }, { "epoch": 0.3000271853298075, "grad_norm": 3.883738129807063, "learning_rate": 1.6413828928893445e-07, "loss": 0.826, "step": 3035 }, { "epoch": 0.3001260410745619, "grad_norm": 3.599532546755983, "learning_rate": 1.6411371872259906e-07, "loss": 0.7351, "step": 3036 }, { "epoch": 0.3002248968193164, "grad_norm": 3.2939757214087373, "learning_rate": 1.6408914158234756e-07, "loss": 0.7792, "step": 3037 }, { "epoch": 0.3003237525640709, "grad_norm": 5.85486341694874, "learning_rate": 1.6406455787069996e-07, "loss": 0.711, "step": 3038 }, { "epoch": 0.30042260830882533, "grad_norm": 31.16229070700125, "learning_rate": 1.64039967590177e-07, "loss": 0.6939, "step": 3039 }, { "epoch": 0.3005214640535798, "grad_norm": 4.099422928038333, "learning_rate": 1.6401537074330002e-07, "loss": 0.7602, "step": 3040 }, { "epoch": 0.30062031979833426, "grad_norm": 3.8823670954870972, "learning_rate": 1.6399076733259104e-07, "loss": 0.7163, "step": 3041 }, { "epoch": 0.30071917554308875, "grad_norm": 4.018243214502019, "learning_rate": 1.639661573605728e-07, "loss": 0.7302, "step": 3042 }, { "epoch": 0.30081803128784323, "grad_norm": 10.349509187919443, "learning_rate": 1.6394154082976867e-07, "loss": 0.675, "step": 3043 }, { "epoch": 0.30091688703259767, "grad_norm": 4.411477420919039, "learning_rate": 1.639169177427028e-07, "loss": 0.6924, "step": 3044 }, { "epoch": 0.30101574277735216, "grad_norm": 6.011067600226859, "learning_rate": 1.6389228810189982e-07, "loss": 0.6971, "step": 3045 }, { "epoch": 0.3011145985221066, "grad_norm": 46.06342986631917, "learning_rate": 1.6386765190988515e-07, "loss": 0.7748, "step": 3046 }, { "epoch": 0.3012134542668611, "grad_norm": 3.438218271289163, "learning_rate": 1.638430091691849e-07, "loss": 0.6768, "step": 3047 }, { "epoch": 0.30131231001161557, "grad_norm": 5.623255561522635, "learning_rate": 1.6381835988232582e-07, "loss": 0.6231, "step": 3048 }, { "epoch": 0.30141116575637, "grad_norm": 5.505622036068598, "learning_rate": 1.637937040518353e-07, "loss": 0.7659, "step": 3049 }, { "epoch": 0.3015100215011245, "grad_norm": 3.8456712975785012, "learning_rate": 1.6376904168024147e-07, "loss": 0.6824, "step": 3050 }, { "epoch": 0.3016088772458789, "grad_norm": 4.789170039776482, "learning_rate": 1.6374437277007308e-07, "loss": 0.7449, "step": 3051 }, { "epoch": 0.3017077329906334, "grad_norm": 3.634712860281757, "learning_rate": 1.6371969732385958e-07, "loss": 0.7803, "step": 3052 }, { "epoch": 0.3018065887353879, "grad_norm": 3.8623462783507336, "learning_rate": 1.6369501534413105e-07, "loss": 0.7092, "step": 3053 }, { "epoch": 0.30190544448014234, "grad_norm": 4.554782506176385, "learning_rate": 1.6367032683341822e-07, "loss": 0.7383, "step": 3054 }, { "epoch": 0.3020043002248968, "grad_norm": 4.317877561165182, "learning_rate": 1.636456317942526e-07, "loss": 0.6486, "step": 3055 }, { "epoch": 0.30210315596965126, "grad_norm": 5.7475177264748245, "learning_rate": 1.6362093022916626e-07, "loss": 0.6804, "step": 3056 }, { "epoch": 0.30220201171440575, "grad_norm": 4.181151380699082, "learning_rate": 1.6359622214069203e-07, "loss": 0.7304, "step": 3057 }, { "epoch": 0.30230086745916024, "grad_norm": 3.813265967235769, "learning_rate": 1.6357150753136332e-07, "loss": 0.7328, "step": 3058 }, { "epoch": 0.30239972320391467, "grad_norm": 2.900921414808093, "learning_rate": 1.6354678640371424e-07, "loss": 0.644, "step": 3059 }, { "epoch": 0.30249857894866916, "grad_norm": 11.882840495746054, "learning_rate": 1.635220587602796e-07, "loss": 0.6538, "step": 3060 }, { "epoch": 0.3025974346934236, "grad_norm": 3.812887930455186, "learning_rate": 1.6349732460359487e-07, "loss": 0.7588, "step": 3061 }, { "epoch": 0.3026962904381781, "grad_norm": 3.1695359462393453, "learning_rate": 1.6347258393619616e-07, "loss": 0.6601, "step": 3062 }, { "epoch": 0.3027951461829326, "grad_norm": 3.3679458251671837, "learning_rate": 1.6344783676062024e-07, "loss": 0.7461, "step": 3063 }, { "epoch": 0.302894001927687, "grad_norm": 3.4548765309086904, "learning_rate": 1.6342308307940455e-07, "loss": 0.706, "step": 3064 }, { "epoch": 0.3029928576724415, "grad_norm": 3.7249640849836463, "learning_rate": 1.633983228950873e-07, "loss": 0.6068, "step": 3065 }, { "epoch": 0.30309171341719593, "grad_norm": 3.545096209465632, "learning_rate": 1.633735562102072e-07, "loss": 0.8371, "step": 3066 }, { "epoch": 0.3031905691619504, "grad_norm": 4.059800417703488, "learning_rate": 1.6334878302730376e-07, "loss": 0.6302, "step": 3067 }, { "epoch": 0.3032894249067049, "grad_norm": 3.997277477922249, "learning_rate": 1.6332400334891704e-07, "loss": 0.705, "step": 3068 }, { "epoch": 0.30338828065145934, "grad_norm": 3.8249612035917617, "learning_rate": 1.632992171775879e-07, "loss": 0.6465, "step": 3069 }, { "epoch": 0.30348713639621383, "grad_norm": 8.228644771238056, "learning_rate": 1.6327442451585777e-07, "loss": 0.7486, "step": 3070 }, { "epoch": 0.3035859921409683, "grad_norm": 3.9682034763635508, "learning_rate": 1.6324962536626874e-07, "loss": 0.7942, "step": 3071 }, { "epoch": 0.30368484788572275, "grad_norm": 6.419239084606046, "learning_rate": 1.6322481973136365e-07, "loss": 0.7425, "step": 3072 }, { "epoch": 0.30378370363047724, "grad_norm": 3.218656513466064, "learning_rate": 1.632000076136859e-07, "loss": 0.7533, "step": 3073 }, { "epoch": 0.3038825593752317, "grad_norm": 16.17444489663314, "learning_rate": 1.6317518901577967e-07, "loss": 0.7352, "step": 3074 }, { "epoch": 0.30398141511998616, "grad_norm": 3.7105015264543235, "learning_rate": 1.631503639401897e-07, "loss": 0.7001, "step": 3075 }, { "epoch": 0.30408027086474065, "grad_norm": 3.0039992783569485, "learning_rate": 1.6312553238946143e-07, "loss": 0.7638, "step": 3076 }, { "epoch": 0.3041791266094951, "grad_norm": 3.686136821310322, "learning_rate": 1.63100694366141e-07, "loss": 0.6989, "step": 3077 }, { "epoch": 0.3042779823542496, "grad_norm": 7.693469946487675, "learning_rate": 1.6307584987277513e-07, "loss": 0.7557, "step": 3078 }, { "epoch": 0.304376838099004, "grad_norm": 3.383620416517909, "learning_rate": 1.6305099891191132e-07, "loss": 0.6734, "step": 3079 }, { "epoch": 0.3044756938437585, "grad_norm": 3.9735014332275944, "learning_rate": 1.6302614148609763e-07, "loss": 0.6798, "step": 3080 }, { "epoch": 0.304574549588513, "grad_norm": 4.7230406496389215, "learning_rate": 1.6300127759788286e-07, "loss": 0.6551, "step": 3081 }, { "epoch": 0.3046734053332674, "grad_norm": 3.0836621663492108, "learning_rate": 1.6297640724981634e-07, "loss": 0.7666, "step": 3082 }, { "epoch": 0.3047722610780219, "grad_norm": 2.8621892965253704, "learning_rate": 1.6295153044444825e-07, "loss": 0.7399, "step": 3083 }, { "epoch": 0.30487111682277634, "grad_norm": 9.902262882864472, "learning_rate": 1.6292664718432935e-07, "loss": 0.6928, "step": 3084 }, { "epoch": 0.30496997256753083, "grad_norm": 4.29046327545885, "learning_rate": 1.62901757472011e-07, "loss": 0.7393, "step": 3085 }, { "epoch": 0.3050688283122853, "grad_norm": 6.681251786043108, "learning_rate": 1.6287686131004525e-07, "loss": 0.7935, "step": 3086 }, { "epoch": 0.30516768405703976, "grad_norm": 43.60897253496351, "learning_rate": 1.628519587009849e-07, "loss": 0.6452, "step": 3087 }, { "epoch": 0.30526653980179425, "grad_norm": 4.655354265437068, "learning_rate": 1.628270496473833e-07, "loss": 0.7226, "step": 3088 }, { "epoch": 0.3053653955465487, "grad_norm": 6.792481195253978, "learning_rate": 1.628021341517946e-07, "loss": 0.7551, "step": 3089 }, { "epoch": 0.30546425129130317, "grad_norm": 6.477207765613272, "learning_rate": 1.6277721221677334e-07, "loss": 0.7089, "step": 3090 }, { "epoch": 0.30556310703605766, "grad_norm": 17.36732032557593, "learning_rate": 1.6275228384487502e-07, "loss": 0.6438, "step": 3091 }, { "epoch": 0.3056619627808121, "grad_norm": 3.3424784109244765, "learning_rate": 1.627273490386557e-07, "loss": 0.6121, "step": 3092 }, { "epoch": 0.3057608185255666, "grad_norm": 4.817878930465507, "learning_rate": 1.6270240780067197e-07, "loss": 0.7918, "step": 3093 }, { "epoch": 0.305859674270321, "grad_norm": 4.006770884183399, "learning_rate": 1.6267746013348127e-07, "loss": 0.6872, "step": 3094 }, { "epoch": 0.3059585300150755, "grad_norm": 11.105386747893618, "learning_rate": 1.6265250603964157e-07, "loss": 0.7505, "step": 3095 }, { "epoch": 0.30605738575983, "grad_norm": 3.883662354681814, "learning_rate": 1.626275455217116e-07, "loss": 0.7667, "step": 3096 }, { "epoch": 0.3061562415045844, "grad_norm": 3.322560467894458, "learning_rate": 1.6260257858225065e-07, "loss": 0.6633, "step": 3097 }, { "epoch": 0.3062550972493389, "grad_norm": 4.041459531337794, "learning_rate": 1.6257760522381873e-07, "loss": 0.6675, "step": 3098 }, { "epoch": 0.30635395299409335, "grad_norm": 4.081085214255873, "learning_rate": 1.6255262544897646e-07, "loss": 0.6213, "step": 3099 }, { "epoch": 0.30645280873884784, "grad_norm": 3.999896342127683, "learning_rate": 1.6252763926028518e-07, "loss": 0.659, "step": 3100 }, { "epoch": 0.3065516644836023, "grad_norm": 4.975146389722468, "learning_rate": 1.625026466603068e-07, "loss": 0.7932, "step": 3101 }, { "epoch": 0.30665052022835676, "grad_norm": 3.2615672270887925, "learning_rate": 1.6247764765160405e-07, "loss": 0.7203, "step": 3102 }, { "epoch": 0.30674937597311125, "grad_norm": 6.504723545341909, "learning_rate": 1.624526422367401e-07, "loss": 0.6436, "step": 3103 }, { "epoch": 0.3068482317178657, "grad_norm": 5.686391892395115, "learning_rate": 1.624276304182789e-07, "loss": 0.735, "step": 3104 }, { "epoch": 0.30694708746262017, "grad_norm": 77.54356413436516, "learning_rate": 1.6240261219878516e-07, "loss": 0.6867, "step": 3105 }, { "epoch": 0.30704594320737466, "grad_norm": 3.6204672124544923, "learning_rate": 1.6237758758082398e-07, "loss": 0.722, "step": 3106 }, { "epoch": 0.3071447989521291, "grad_norm": 4.405275220901767, "learning_rate": 1.6235255656696137e-07, "loss": 0.7698, "step": 3107 }, { "epoch": 0.3072436546968836, "grad_norm": 4.885301823578195, "learning_rate": 1.623275191597638e-07, "loss": 0.7335, "step": 3108 }, { "epoch": 0.307342510441638, "grad_norm": 4.046505686855976, "learning_rate": 1.6230247536179855e-07, "loss": 0.8159, "step": 3109 }, { "epoch": 0.3074413661863925, "grad_norm": 4.04904326947199, "learning_rate": 1.622774251756335e-07, "loss": 0.7644, "step": 3110 }, { "epoch": 0.307540221931147, "grad_norm": 3.722910512330455, "learning_rate": 1.6225236860383714e-07, "loss": 0.7631, "step": 3111 }, { "epoch": 0.30763907767590143, "grad_norm": 3.7322877934288488, "learning_rate": 1.6222730564897865e-07, "loss": 0.7862, "step": 3112 }, { "epoch": 0.3077379334206559, "grad_norm": 4.688069516978413, "learning_rate": 1.622022363136279e-07, "loss": 0.8245, "step": 3113 }, { "epoch": 0.30783678916541035, "grad_norm": 3.1744215706657553, "learning_rate": 1.621771606003554e-07, "loss": 0.667, "step": 3114 }, { "epoch": 0.30793564491016484, "grad_norm": 4.946242855402599, "learning_rate": 1.6215207851173218e-07, "loss": 0.6667, "step": 3115 }, { "epoch": 0.30803450065491933, "grad_norm": 4.019576444901938, "learning_rate": 1.6212699005033014e-07, "loss": 0.6986, "step": 3116 }, { "epoch": 0.30813335639967376, "grad_norm": 5.148462238495362, "learning_rate": 1.6210189521872174e-07, "loss": 0.7098, "step": 3117 }, { "epoch": 0.30823221214442825, "grad_norm": 6.276412413498243, "learning_rate": 1.6207679401948003e-07, "loss": 0.7543, "step": 3118 }, { "epoch": 0.3083310678891827, "grad_norm": 9.429717909344458, "learning_rate": 1.620516864551788e-07, "loss": 0.7389, "step": 3119 }, { "epoch": 0.3084299236339372, "grad_norm": 3.1539515089912125, "learning_rate": 1.6202657252839247e-07, "loss": 0.7858, "step": 3120 }, { "epoch": 0.30852877937869166, "grad_norm": 7.346566015854159, "learning_rate": 1.6200145224169605e-07, "loss": 0.7377, "step": 3121 }, { "epoch": 0.3086276351234461, "grad_norm": 51.557128284741296, "learning_rate": 1.6197632559766532e-07, "loss": 0.7272, "step": 3122 }, { "epoch": 0.3087264908682006, "grad_norm": 5.632638187754077, "learning_rate": 1.6195119259887657e-07, "loss": 0.7416, "step": 3123 }, { "epoch": 0.308825346612955, "grad_norm": 4.479740172583247, "learning_rate": 1.619260532479069e-07, "loss": 0.7225, "step": 3124 }, { "epoch": 0.3089242023577095, "grad_norm": 4.182762574714818, "learning_rate": 1.6190090754733395e-07, "loss": 0.8104, "step": 3125 }, { "epoch": 0.309023058102464, "grad_norm": 6.075811160977129, "learning_rate": 1.61875755499736e-07, "loss": 0.6996, "step": 3126 }, { "epoch": 0.30912191384721843, "grad_norm": 5.055769736571222, "learning_rate": 1.6185059710769207e-07, "loss": 0.8077, "step": 3127 }, { "epoch": 0.3092207695919729, "grad_norm": 6.81880936309117, "learning_rate": 1.6182543237378176e-07, "loss": 0.6308, "step": 3128 }, { "epoch": 0.30931962533672736, "grad_norm": 3.5561933807617674, "learning_rate": 1.6180026130058536e-07, "loss": 0.6644, "step": 3129 }, { "epoch": 0.30941848108148184, "grad_norm": 14.142626665939046, "learning_rate": 1.6177508389068377e-07, "loss": 0.7415, "step": 3130 }, { "epoch": 0.30951733682623633, "grad_norm": 2.8932617346199563, "learning_rate": 1.617499001466586e-07, "loss": 0.7474, "step": 3131 }, { "epoch": 0.30961619257099077, "grad_norm": 3.6265659075909618, "learning_rate": 1.6172471007109204e-07, "loss": 0.7566, "step": 3132 }, { "epoch": 0.30971504831574526, "grad_norm": 5.347121636340912, "learning_rate": 1.6169951366656695e-07, "loss": 0.7944, "step": 3133 }, { "epoch": 0.3098139040604997, "grad_norm": 7.489733646882159, "learning_rate": 1.6167431093566685e-07, "loss": 0.7771, "step": 3134 }, { "epoch": 0.3099127598052542, "grad_norm": 5.625544578481589, "learning_rate": 1.6164910188097596e-07, "loss": 0.725, "step": 3135 }, { "epoch": 0.31001161555000867, "grad_norm": 3.676137599360571, "learning_rate": 1.6162388650507905e-07, "loss": 0.668, "step": 3136 }, { "epoch": 0.3101104712947631, "grad_norm": 3.8171910189358997, "learning_rate": 1.6159866481056158e-07, "loss": 0.8045, "step": 3137 }, { "epoch": 0.3102093270395176, "grad_norm": 3.5032258381961574, "learning_rate": 1.6157343680000968e-07, "loss": 0.7261, "step": 3138 }, { "epoch": 0.310308182784272, "grad_norm": 9.32078367004573, "learning_rate": 1.615482024760101e-07, "loss": 0.6973, "step": 3139 }, { "epoch": 0.3104070385290265, "grad_norm": 2.8226795804959965, "learning_rate": 1.615229618411503e-07, "loss": 0.687, "step": 3140 }, { "epoch": 0.310505894273781, "grad_norm": 2.7367426174195026, "learning_rate": 1.6149771489801825e-07, "loss": 0.7427, "step": 3141 }, { "epoch": 0.31060475001853544, "grad_norm": 3.620513083135988, "learning_rate": 1.614724616492027e-07, "loss": 0.731, "step": 3142 }, { "epoch": 0.3107036057632899, "grad_norm": 4.577007027789433, "learning_rate": 1.61447202097293e-07, "loss": 0.7957, "step": 3143 }, { "epoch": 0.31080246150804436, "grad_norm": 7.64932389687859, "learning_rate": 1.6142193624487914e-07, "loss": 0.6617, "step": 3144 }, { "epoch": 0.31090131725279885, "grad_norm": 4.23692058370569, "learning_rate": 1.6139666409455174e-07, "loss": 0.6345, "step": 3145 }, { "epoch": 0.31100017299755334, "grad_norm": 3.7380716683853303, "learning_rate": 1.613713856489021e-07, "loss": 0.7873, "step": 3146 }, { "epoch": 0.31109902874230777, "grad_norm": 6.32370188144456, "learning_rate": 1.6134610091052216e-07, "loss": 0.686, "step": 3147 }, { "epoch": 0.31119788448706226, "grad_norm": 3.6400341999642416, "learning_rate": 1.6132080988200446e-07, "loss": 0.7636, "step": 3148 }, { "epoch": 0.31129674023181675, "grad_norm": 6.864143868059582, "learning_rate": 1.612955125659423e-07, "loss": 0.7901, "step": 3149 }, { "epoch": 0.3113955959765712, "grad_norm": 8.77562304276397, "learning_rate": 1.612702089649295e-07, "loss": 0.6709, "step": 3150 }, { "epoch": 0.31149445172132567, "grad_norm": 4.004634208967606, "learning_rate": 1.6124489908156055e-07, "loss": 0.7254, "step": 3151 }, { "epoch": 0.3115933074660801, "grad_norm": 8.349117394932485, "learning_rate": 1.6121958291843063e-07, "loss": 0.727, "step": 3152 }, { "epoch": 0.3116921632108346, "grad_norm": 5.683772574907308, "learning_rate": 1.611942604781355e-07, "loss": 0.7883, "step": 3153 }, { "epoch": 0.3117910189555891, "grad_norm": 5.062255380590402, "learning_rate": 1.6116893176327168e-07, "loss": 0.6857, "step": 3154 }, { "epoch": 0.3118898747003435, "grad_norm": 4.029030996424644, "learning_rate": 1.611435967764362e-07, "loss": 0.7107, "step": 3155 }, { "epoch": 0.311988730445098, "grad_norm": 5.071430499035809, "learning_rate": 1.611182555202268e-07, "loss": 0.6461, "step": 3156 }, { "epoch": 0.31208758618985244, "grad_norm": 3.9898575992657093, "learning_rate": 1.6109290799724186e-07, "loss": 0.704, "step": 3157 }, { "epoch": 0.31218644193460693, "grad_norm": 4.454628765161279, "learning_rate": 1.610675542100804e-07, "loss": 0.7211, "step": 3158 }, { "epoch": 0.3122852976793614, "grad_norm": 5.112426171889325, "learning_rate": 1.6104219416134202e-07, "loss": 0.5837, "step": 3159 }, { "epoch": 0.31238415342411585, "grad_norm": 3.2648024811619045, "learning_rate": 1.6101682785362712e-07, "loss": 0.6798, "step": 3160 }, { "epoch": 0.31248300916887034, "grad_norm": 3.0869118010867376, "learning_rate": 1.6099145528953656e-07, "loss": 0.653, "step": 3161 }, { "epoch": 0.3125818649136248, "grad_norm": 6.661095180353757, "learning_rate": 1.6096607647167196e-07, "loss": 0.6696, "step": 3162 }, { "epoch": 0.31268072065837926, "grad_norm": 4.197494382504308, "learning_rate": 1.609406914026355e-07, "loss": 0.6642, "step": 3163 }, { "epoch": 0.31277957640313375, "grad_norm": 3.5932311508629686, "learning_rate": 1.609153000850301e-07, "loss": 0.7463, "step": 3164 }, { "epoch": 0.3128784321478882, "grad_norm": 3.30962981900364, "learning_rate": 1.6088990252145923e-07, "loss": 0.8362, "step": 3165 }, { "epoch": 0.3129772878926427, "grad_norm": 10.861086236325606, "learning_rate": 1.6086449871452704e-07, "loss": 0.6519, "step": 3166 }, { "epoch": 0.3130761436373971, "grad_norm": 4.303069842395143, "learning_rate": 1.6083908866683834e-07, "loss": 0.6781, "step": 3167 }, { "epoch": 0.3131749993821516, "grad_norm": 5.554174316808517, "learning_rate": 1.6081367238099852e-07, "loss": 0.7323, "step": 3168 }, { "epoch": 0.3132738551269061, "grad_norm": 3.2988579472245783, "learning_rate": 1.607882498596137e-07, "loss": 0.8069, "step": 3169 }, { "epoch": 0.3133727108716605, "grad_norm": 3.640981945624258, "learning_rate": 1.6076282110529046e-07, "loss": 0.7426, "step": 3170 }, { "epoch": 0.313471566616415, "grad_norm": 19.283408134741215, "learning_rate": 1.607373861206363e-07, "loss": 0.6706, "step": 3171 }, { "epoch": 0.31357042236116944, "grad_norm": 3.967406664133984, "learning_rate": 1.6071194490825913e-07, "loss": 0.7237, "step": 3172 }, { "epoch": 0.31366927810592393, "grad_norm": 3.041059016278564, "learning_rate": 1.6068649747076756e-07, "loss": 0.7485, "step": 3173 }, { "epoch": 0.3137681338506784, "grad_norm": 3.975934224978301, "learning_rate": 1.6066104381077087e-07, "loss": 0.6835, "step": 3174 }, { "epoch": 0.31386698959543285, "grad_norm": 214.0147667857848, "learning_rate": 1.6063558393087892e-07, "loss": 0.7995, "step": 3175 }, { "epoch": 0.31396584534018734, "grad_norm": 3.7810348249133727, "learning_rate": 1.606101178337023e-07, "loss": 0.7785, "step": 3176 }, { "epoch": 0.3140647010849418, "grad_norm": 3.1995701341288103, "learning_rate": 1.6058464552185217e-07, "loss": 0.7558, "step": 3177 }, { "epoch": 0.31416355682969627, "grad_norm": 3.2938762659084846, "learning_rate": 1.6055916699794033e-07, "loss": 0.7111, "step": 3178 }, { "epoch": 0.31426241257445076, "grad_norm": 5.170885538008125, "learning_rate": 1.6053368226457922e-07, "loss": 0.7419, "step": 3179 }, { "epoch": 0.3143612683192052, "grad_norm": 3.4035493663473035, "learning_rate": 1.6050819132438194e-07, "loss": 0.64, "step": 3180 }, { "epoch": 0.3144601240639597, "grad_norm": 2.9431462433655846, "learning_rate": 1.6048269417996215e-07, "loss": 0.6461, "step": 3181 }, { "epoch": 0.3145589798087141, "grad_norm": 5.697668733984508, "learning_rate": 1.604571908339343e-07, "loss": 0.6602, "step": 3182 }, { "epoch": 0.3146578355534686, "grad_norm": 3.8022249590138246, "learning_rate": 1.604316812889133e-07, "loss": 0.7232, "step": 3183 }, { "epoch": 0.3147566912982231, "grad_norm": 3.2408982735915655, "learning_rate": 1.6040616554751485e-07, "loss": 0.7702, "step": 3184 }, { "epoch": 0.3148555470429775, "grad_norm": 3.7245093889576193, "learning_rate": 1.6038064361235515e-07, "loss": 0.6727, "step": 3185 }, { "epoch": 0.314954402787732, "grad_norm": 3.455059421803336, "learning_rate": 1.6035511548605115e-07, "loss": 0.7233, "step": 3186 }, { "epoch": 0.31505325853248645, "grad_norm": 4.12028273821115, "learning_rate": 1.6032958117122032e-07, "loss": 0.726, "step": 3187 }, { "epoch": 0.31515211427724094, "grad_norm": 2.839390399656588, "learning_rate": 1.6030404067048088e-07, "loss": 0.7496, "step": 3188 }, { "epoch": 0.3152509700219954, "grad_norm": 4.994014576264601, "learning_rate": 1.6027849398645165e-07, "loss": 0.7332, "step": 3189 }, { "epoch": 0.31534982576674986, "grad_norm": 5.308066043736463, "learning_rate": 1.60252941121752e-07, "loss": 0.643, "step": 3190 }, { "epoch": 0.31544868151150435, "grad_norm": 4.011855770291665, "learning_rate": 1.6022738207900205e-07, "loss": 0.7381, "step": 3191 }, { "epoch": 0.3155475372562588, "grad_norm": 3.87209508073509, "learning_rate": 1.6020181686082247e-07, "loss": 0.7361, "step": 3192 }, { "epoch": 0.31564639300101327, "grad_norm": 3.828731423154275, "learning_rate": 1.6017624546983462e-07, "loss": 0.7446, "step": 3193 }, { "epoch": 0.31574524874576776, "grad_norm": 4.913120092669052, "learning_rate": 1.6015066790866044e-07, "loss": 0.6323, "step": 3194 }, { "epoch": 0.3158441044905222, "grad_norm": 4.241669540612553, "learning_rate": 1.6012508417992256e-07, "loss": 0.6474, "step": 3195 }, { "epoch": 0.3159429602352767, "grad_norm": 4.249215223461972, "learning_rate": 1.600994942862442e-07, "loss": 0.8365, "step": 3196 }, { "epoch": 0.3160418159800311, "grad_norm": 8.692851959569614, "learning_rate": 1.6007389823024924e-07, "loss": 0.6972, "step": 3197 }, { "epoch": 0.3161406717247856, "grad_norm": 5.420239597274601, "learning_rate": 1.6004829601456218e-07, "loss": 0.7164, "step": 3198 }, { "epoch": 0.3162395274695401, "grad_norm": 5.987386444863469, "learning_rate": 1.6002268764180812e-07, "loss": 0.7295, "step": 3199 }, { "epoch": 0.3163383832142945, "grad_norm": 3.7468435316284725, "learning_rate": 1.5999707311461282e-07, "loss": 0.7183, "step": 3200 }, { "epoch": 0.316437238959049, "grad_norm": 3.6774283248548794, "learning_rate": 1.5997145243560276e-07, "loss": 0.7605, "step": 3201 }, { "epoch": 0.31653609470380345, "grad_norm": 10.208711565209999, "learning_rate": 1.5994582560740483e-07, "loss": 0.7468, "step": 3202 }, { "epoch": 0.31663495044855794, "grad_norm": 6.474217852897381, "learning_rate": 1.5992019263264676e-07, "loss": 0.7062, "step": 3203 }, { "epoch": 0.31673380619331243, "grad_norm": 4.438314121705886, "learning_rate": 1.5989455351395684e-07, "loss": 0.73, "step": 3204 }, { "epoch": 0.31683266193806686, "grad_norm": 4.278386419944885, "learning_rate": 1.5986890825396395e-07, "loss": 0.7483, "step": 3205 }, { "epoch": 0.31693151768282135, "grad_norm": 11.298225774965173, "learning_rate": 1.5984325685529765e-07, "loss": 0.6724, "step": 3206 }, { "epoch": 0.3170303734275758, "grad_norm": 6.541117063538867, "learning_rate": 1.598175993205881e-07, "loss": 0.6766, "step": 3207 }, { "epoch": 0.3171292291723303, "grad_norm": 4.680971859262622, "learning_rate": 1.5979193565246613e-07, "loss": 0.7143, "step": 3208 }, { "epoch": 0.31722808491708476, "grad_norm": 5.675154195106667, "learning_rate": 1.5976626585356313e-07, "loss": 0.7018, "step": 3209 }, { "epoch": 0.3173269406618392, "grad_norm": 3.250745011627665, "learning_rate": 1.597405899265112e-07, "loss": 0.7742, "step": 3210 }, { "epoch": 0.3174257964065937, "grad_norm": 5.294920687347966, "learning_rate": 1.59714907873943e-07, "loss": 0.7247, "step": 3211 }, { "epoch": 0.3175246521513481, "grad_norm": 4.691377611805262, "learning_rate": 1.5968921969849187e-07, "loss": 0.6947, "step": 3212 }, { "epoch": 0.3176235078961026, "grad_norm": 6.137665697664341, "learning_rate": 1.5966352540279174e-07, "loss": 0.7765, "step": 3213 }, { "epoch": 0.3177223636408571, "grad_norm": 10.424550195467136, "learning_rate": 1.5963782498947714e-07, "loss": 0.7624, "step": 3214 }, { "epoch": 0.31782121938561153, "grad_norm": 5.743434311174949, "learning_rate": 1.5961211846118333e-07, "loss": 0.6861, "step": 3215 }, { "epoch": 0.317920075130366, "grad_norm": 5.264398509385489, "learning_rate": 1.5958640582054612e-07, "loss": 0.7471, "step": 3216 }, { "epoch": 0.31801893087512045, "grad_norm": 3.2766981444127965, "learning_rate": 1.5956068707020194e-07, "loss": 0.6773, "step": 3217 }, { "epoch": 0.31811778661987494, "grad_norm": 17.080313206362586, "learning_rate": 1.5953496221278788e-07, "loss": 0.7384, "step": 3218 }, { "epoch": 0.31821664236462943, "grad_norm": 3.0119809076017527, "learning_rate": 1.5950923125094166e-07, "loss": 0.6913, "step": 3219 }, { "epoch": 0.31831549810938387, "grad_norm": 10.366661359735955, "learning_rate": 1.594834941873016e-07, "loss": 0.7666, "step": 3220 }, { "epoch": 0.31841435385413835, "grad_norm": 9.04798457571723, "learning_rate": 1.5945775102450664e-07, "loss": 0.6769, "step": 3221 }, { "epoch": 0.3185132095988928, "grad_norm": 3.5476084306984017, "learning_rate": 1.594320017651964e-07, "loss": 0.7274, "step": 3222 }, { "epoch": 0.3186120653436473, "grad_norm": 8.263748107072734, "learning_rate": 1.5940624641201102e-07, "loss": 0.7301, "step": 3223 }, { "epoch": 0.31871092108840177, "grad_norm": 3.8743575849753946, "learning_rate": 1.5938048496759144e-07, "loss": 0.7405, "step": 3224 }, { "epoch": 0.3188097768331562, "grad_norm": 4.351028533895859, "learning_rate": 1.59354717434579e-07, "loss": 0.7204, "step": 3225 }, { "epoch": 0.3189086325779107, "grad_norm": 3.428073248116463, "learning_rate": 1.5932894381561582e-07, "loss": 0.6701, "step": 3226 }, { "epoch": 0.3190074883226652, "grad_norm": 18.939481114517278, "learning_rate": 1.5930316411334465e-07, "loss": 0.7979, "step": 3227 }, { "epoch": 0.3191063440674196, "grad_norm": 9.179237422879316, "learning_rate": 1.5927737833040878e-07, "loss": 0.5994, "step": 3228 }, { "epoch": 0.3192051998121741, "grad_norm": 4.430295974747974, "learning_rate": 1.5925158646945216e-07, "loss": 0.7063, "step": 3229 }, { "epoch": 0.31930405555692853, "grad_norm": 8.350042120720248, "learning_rate": 1.5922578853311936e-07, "loss": 0.7313, "step": 3230 }, { "epoch": 0.319402911301683, "grad_norm": 4.084326850136773, "learning_rate": 1.591999845240556e-07, "loss": 0.7653, "step": 3231 }, { "epoch": 0.3195017670464375, "grad_norm": 5.1566576173428125, "learning_rate": 1.591741744449067e-07, "loss": 0.6125, "step": 3232 }, { "epoch": 0.31960062279119195, "grad_norm": 9.298098450828299, "learning_rate": 1.5914835829831907e-07, "loss": 0.7074, "step": 3233 }, { "epoch": 0.31969947853594644, "grad_norm": 3.4232915237369417, "learning_rate": 1.591225360869398e-07, "loss": 0.6838, "step": 3234 }, { "epoch": 0.31979833428070087, "grad_norm": 4.075727554046275, "learning_rate": 1.5909670781341657e-07, "loss": 0.6962, "step": 3235 }, { "epoch": 0.31989719002545536, "grad_norm": 13.650805214901233, "learning_rate": 1.590708734803977e-07, "loss": 0.779, "step": 3236 }, { "epoch": 0.31999604577020985, "grad_norm": 2.992145289536845, "learning_rate": 1.590450330905321e-07, "loss": 0.7561, "step": 3237 }, { "epoch": 0.3200949015149643, "grad_norm": 7.815708447200665, "learning_rate": 1.5901918664646935e-07, "loss": 0.6734, "step": 3238 }, { "epoch": 0.32019375725971877, "grad_norm": 3.693271731527079, "learning_rate": 1.5899333415085958e-07, "loss": 0.706, "step": 3239 }, { "epoch": 0.3202926130044732, "grad_norm": 45.133257041910824, "learning_rate": 1.5896747560635362e-07, "loss": 0.7468, "step": 3240 }, { "epoch": 0.3203914687492277, "grad_norm": 5.853807340624508, "learning_rate": 1.5894161101560286e-07, "loss": 0.6782, "step": 3241 }, { "epoch": 0.3204903244939822, "grad_norm": 7.918295518645881, "learning_rate": 1.5891574038125938e-07, "loss": 0.7622, "step": 3242 }, { "epoch": 0.3205891802387366, "grad_norm": 4.527723605429583, "learning_rate": 1.5888986370597572e-07, "loss": 0.7506, "step": 3243 }, { "epoch": 0.3206880359834911, "grad_norm": 34.794240690429476, "learning_rate": 1.588639809924053e-07, "loss": 0.748, "step": 3244 }, { "epoch": 0.32078689172824554, "grad_norm": 3.9132682569057837, "learning_rate": 1.5883809224320187e-07, "loss": 0.6817, "step": 3245 }, { "epoch": 0.320885747473, "grad_norm": 2.9667834096065255, "learning_rate": 1.5881219746102003e-07, "loss": 0.7089, "step": 3246 }, { "epoch": 0.3209846032177545, "grad_norm": 4.501879590115043, "learning_rate": 1.5878629664851486e-07, "loss": 0.7265, "step": 3247 }, { "epoch": 0.32108345896250895, "grad_norm": 3.9920481731631745, "learning_rate": 1.5876038980834216e-07, "loss": 0.7419, "step": 3248 }, { "epoch": 0.32118231470726344, "grad_norm": 3.371668959683131, "learning_rate": 1.5873447694315827e-07, "loss": 0.6052, "step": 3249 }, { "epoch": 0.3212811704520179, "grad_norm": 4.948979880715994, "learning_rate": 1.5870855805562013e-07, "loss": 0.6153, "step": 3250 }, { "epoch": 0.32138002619677236, "grad_norm": 3.413678509643568, "learning_rate": 1.5868263314838539e-07, "loss": 0.722, "step": 3251 }, { "epoch": 0.32147888194152685, "grad_norm": 6.6200596374525995, "learning_rate": 1.5865670222411226e-07, "loss": 0.7094, "step": 3252 }, { "epoch": 0.3215777376862813, "grad_norm": 4.92235774706373, "learning_rate": 1.5863076528545953e-07, "loss": 0.7893, "step": 3253 }, { "epoch": 0.3216765934310358, "grad_norm": 6.77376879340371, "learning_rate": 1.586048223350867e-07, "loss": 0.6305, "step": 3254 }, { "epoch": 0.3217754491757902, "grad_norm": 3.844167344818066, "learning_rate": 1.5857887337565387e-07, "loss": 0.782, "step": 3255 }, { "epoch": 0.3218743049205447, "grad_norm": 3.339732617141909, "learning_rate": 1.5855291840982162e-07, "loss": 0.7342, "step": 3256 }, { "epoch": 0.3219731606652992, "grad_norm": 4.0052517159322605, "learning_rate": 1.5852695744025133e-07, "loss": 0.6825, "step": 3257 }, { "epoch": 0.3220720164100536, "grad_norm": 3.8381130444890617, "learning_rate": 1.5850099046960485e-07, "loss": 0.6352, "step": 3258 }, { "epoch": 0.3221708721548081, "grad_norm": 4.678706163791063, "learning_rate": 1.584750175005448e-07, "loss": 0.7168, "step": 3259 }, { "epoch": 0.32226972789956254, "grad_norm": 4.211278075144235, "learning_rate": 1.584490385357342e-07, "loss": 0.7934, "step": 3260 }, { "epoch": 0.32236858364431703, "grad_norm": 5.609145869275169, "learning_rate": 1.5842305357783695e-07, "loss": 0.8129, "step": 3261 }, { "epoch": 0.3224674393890715, "grad_norm": 6.813974164403523, "learning_rate": 1.5839706262951732e-07, "loss": 0.7061, "step": 3262 }, { "epoch": 0.32256629513382595, "grad_norm": 10.777347512710701, "learning_rate": 1.5837106569344036e-07, "loss": 0.6095, "step": 3263 }, { "epoch": 0.32266515087858044, "grad_norm": 3.73424080498466, "learning_rate": 1.5834506277227164e-07, "loss": 0.8041, "step": 3264 }, { "epoch": 0.3227640066233349, "grad_norm": 5.383686860916343, "learning_rate": 1.5831905386867738e-07, "loss": 0.6672, "step": 3265 }, { "epoch": 0.32286286236808937, "grad_norm": 12.684281288637656, "learning_rate": 1.582930389853244e-07, "loss": 0.7279, "step": 3266 }, { "epoch": 0.32296171811284385, "grad_norm": 4.164795099679318, "learning_rate": 1.5826701812488014e-07, "loss": 0.7224, "step": 3267 }, { "epoch": 0.3230605738575983, "grad_norm": 4.923132750305054, "learning_rate": 1.5824099129001271e-07, "loss": 0.7151, "step": 3268 }, { "epoch": 0.3231594296023528, "grad_norm": 8.919002025372901, "learning_rate": 1.5821495848339072e-07, "loss": 0.6535, "step": 3269 }, { "epoch": 0.3232582853471072, "grad_norm": 3.408671584960044, "learning_rate": 1.581889197076835e-07, "loss": 0.6788, "step": 3270 }, { "epoch": 0.3233571410918617, "grad_norm": 3.41354058805903, "learning_rate": 1.5816287496556087e-07, "loss": 0.6361, "step": 3271 }, { "epoch": 0.3234559968366162, "grad_norm": 3.6212506895110166, "learning_rate": 1.581368242596934e-07, "loss": 0.7694, "step": 3272 }, { "epoch": 0.3235548525813706, "grad_norm": 5.401243850504713, "learning_rate": 1.5811076759275215e-07, "loss": 0.7908, "step": 3273 }, { "epoch": 0.3236537083261251, "grad_norm": 2.935439136367023, "learning_rate": 1.5808470496740892e-07, "loss": 0.6568, "step": 3274 }, { "epoch": 0.32375256407087954, "grad_norm": 3.2935266094445583, "learning_rate": 1.5805863638633596e-07, "loss": 0.729, "step": 3275 }, { "epoch": 0.32385141981563403, "grad_norm": 4.276882910631688, "learning_rate": 1.5803256185220627e-07, "loss": 0.6386, "step": 3276 }, { "epoch": 0.3239502755603885, "grad_norm": 10.732901115070991, "learning_rate": 1.5800648136769345e-07, "loss": 0.7099, "step": 3277 }, { "epoch": 0.32404913130514296, "grad_norm": 4.786979018600005, "learning_rate": 1.5798039493547157e-07, "loss": 0.7182, "step": 3278 }, { "epoch": 0.32414798704989745, "grad_norm": 3.5086606362182544, "learning_rate": 1.579543025582155e-07, "loss": 0.7293, "step": 3279 }, { "epoch": 0.3242468427946519, "grad_norm": 4.193260455146662, "learning_rate": 1.5792820423860053e-07, "loss": 0.7239, "step": 3280 }, { "epoch": 0.32434569853940637, "grad_norm": 3.5910158506782506, "learning_rate": 1.5790209997930278e-07, "loss": 0.6155, "step": 3281 }, { "epoch": 0.32444455428416086, "grad_norm": 3.664837721649474, "learning_rate": 1.5787598978299878e-07, "loss": 0.7887, "step": 3282 }, { "epoch": 0.3245434100289153, "grad_norm": 3.29893564642089, "learning_rate": 1.5784987365236573e-07, "loss": 0.6183, "step": 3283 }, { "epoch": 0.3246422657736698, "grad_norm": 6.707336026665019, "learning_rate": 1.578237515900815e-07, "loss": 0.7238, "step": 3284 }, { "epoch": 0.3247411215184242, "grad_norm": 3.2866442687654933, "learning_rate": 1.577976235988245e-07, "loss": 0.7048, "step": 3285 }, { "epoch": 0.3248399772631787, "grad_norm": 6.158798360283989, "learning_rate": 1.577714896812738e-07, "loss": 0.6621, "step": 3286 }, { "epoch": 0.3249388330079332, "grad_norm": 5.5317353590887555, "learning_rate": 1.57745349840109e-07, "loss": 0.6863, "step": 3287 }, { "epoch": 0.3250376887526876, "grad_norm": 3.354995744646344, "learning_rate": 1.5771920407801037e-07, "loss": 0.7176, "step": 3288 }, { "epoch": 0.3251365444974421, "grad_norm": 16.49388902397358, "learning_rate": 1.5769305239765877e-07, "loss": 0.7032, "step": 3289 }, { "epoch": 0.32523540024219655, "grad_norm": 5.388823537841306, "learning_rate": 1.576668948017357e-07, "loss": 0.7279, "step": 3290 }, { "epoch": 0.32533425598695104, "grad_norm": 11.610477465415848, "learning_rate": 1.5764073129292324e-07, "loss": 0.6574, "step": 3291 }, { "epoch": 0.3254331117317055, "grad_norm": 3.152045175576492, "learning_rate": 1.57614561873904e-07, "loss": 0.6332, "step": 3292 }, { "epoch": 0.32553196747645996, "grad_norm": 3.8582105554225543, "learning_rate": 1.5758838654736134e-07, "loss": 0.7964, "step": 3293 }, { "epoch": 0.32563082322121445, "grad_norm": 3.75121100409366, "learning_rate": 1.5756220531597912e-07, "loss": 0.7699, "step": 3294 }, { "epoch": 0.3257296789659689, "grad_norm": 16.809456365538537, "learning_rate": 1.5753601818244185e-07, "loss": 0.6738, "step": 3295 }, { "epoch": 0.32582853471072337, "grad_norm": 3.645619644794589, "learning_rate": 1.5750982514943462e-07, "loss": 0.7102, "step": 3296 }, { "epoch": 0.32592739045547786, "grad_norm": 3.869170248644768, "learning_rate": 1.5748362621964316e-07, "loss": 0.7707, "step": 3297 }, { "epoch": 0.3260262462002323, "grad_norm": 8.414091394110764, "learning_rate": 1.574574213957538e-07, "loss": 0.6492, "step": 3298 }, { "epoch": 0.3261251019449868, "grad_norm": 10.214319721148106, "learning_rate": 1.574312106804534e-07, "loss": 0.6722, "step": 3299 }, { "epoch": 0.3262239576897412, "grad_norm": 3.2351268944392486, "learning_rate": 1.5740499407642953e-07, "loss": 0.8297, "step": 3300 }, { "epoch": 0.3263228134344957, "grad_norm": 3.904766714083447, "learning_rate": 1.5737877158637027e-07, "loss": 0.6326, "step": 3301 }, { "epoch": 0.3264216691792502, "grad_norm": 3.563684441492177, "learning_rate": 1.5735254321296441e-07, "loss": 0.7174, "step": 3302 }, { "epoch": 0.32652052492400463, "grad_norm": 16.596705530665872, "learning_rate": 1.5732630895890124e-07, "loss": 0.7618, "step": 3303 }, { "epoch": 0.3266193806687591, "grad_norm": 3.1715554333618186, "learning_rate": 1.5730006882687073e-07, "loss": 0.7184, "step": 3304 }, { "epoch": 0.3267182364135136, "grad_norm": 19.282022588893298, "learning_rate": 1.5727382281956338e-07, "loss": 0.7401, "step": 3305 }, { "epoch": 0.32681709215826804, "grad_norm": 3.0952677153969073, "learning_rate": 1.5724757093967033e-07, "loss": 0.6612, "step": 3306 }, { "epoch": 0.32691594790302253, "grad_norm": 4.269383011691742, "learning_rate": 1.5722131318988337e-07, "loss": 0.6776, "step": 3307 }, { "epoch": 0.32701480364777696, "grad_norm": 6.140797151154471, "learning_rate": 1.5719504957289483e-07, "loss": 0.7865, "step": 3308 }, { "epoch": 0.32711365939253145, "grad_norm": 3.4068476244268378, "learning_rate": 1.571687800913976e-07, "loss": 0.7308, "step": 3309 }, { "epoch": 0.32721251513728594, "grad_norm": 3.4679012689203925, "learning_rate": 1.571425047480853e-07, "loss": 0.6744, "step": 3310 }, { "epoch": 0.3273113708820404, "grad_norm": 4.014563913942624, "learning_rate": 1.5711622354565204e-07, "loss": 0.6831, "step": 3311 }, { "epoch": 0.32741022662679486, "grad_norm": 4.379640409102196, "learning_rate": 1.5708993648679258e-07, "loss": 0.7193, "step": 3312 }, { "epoch": 0.3275090823715493, "grad_norm": 4.797074808143279, "learning_rate": 1.5706364357420227e-07, "loss": 0.667, "step": 3313 }, { "epoch": 0.3276079381163038, "grad_norm": 4.311187677523124, "learning_rate": 1.5703734481057706e-07, "loss": 0.759, "step": 3314 }, { "epoch": 0.3277067938610583, "grad_norm": 3.921918732401354, "learning_rate": 1.5701104019861352e-07, "loss": 0.7953, "step": 3315 }, { "epoch": 0.3278056496058127, "grad_norm": 3.704703585499704, "learning_rate": 1.5698472974100874e-07, "loss": 0.6367, "step": 3316 }, { "epoch": 0.3279045053505672, "grad_norm": 4.6074218029380045, "learning_rate": 1.5695841344046053e-07, "loss": 0.6563, "step": 3317 }, { "epoch": 0.32800336109532163, "grad_norm": 4.11326626040705, "learning_rate": 1.569320912996672e-07, "loss": 0.8535, "step": 3318 }, { "epoch": 0.3281022168400761, "grad_norm": 5.848390995911136, "learning_rate": 1.5690576332132775e-07, "loss": 0.6387, "step": 3319 }, { "epoch": 0.3282010725848306, "grad_norm": 9.145338698934648, "learning_rate": 1.5687942950814166e-07, "loss": 0.7215, "step": 3320 }, { "epoch": 0.32829992832958504, "grad_norm": 6.218805856468651, "learning_rate": 1.568530898628091e-07, "loss": 0.6596, "step": 3321 }, { "epoch": 0.32839878407433953, "grad_norm": 3.369089538713358, "learning_rate": 1.5682674438803085e-07, "loss": 0.7627, "step": 3322 }, { "epoch": 0.32849763981909397, "grad_norm": 4.632724212475782, "learning_rate": 1.568003930865082e-07, "loss": 0.7297, "step": 3323 }, { "epoch": 0.32859649556384846, "grad_norm": 3.6062146484630855, "learning_rate": 1.5677403596094307e-07, "loss": 0.781, "step": 3324 }, { "epoch": 0.32869535130860295, "grad_norm": 4.9251460654501935, "learning_rate": 1.5674767301403806e-07, "loss": 0.651, "step": 3325 }, { "epoch": 0.3287942070533574, "grad_norm": 5.022244771229125, "learning_rate": 1.567213042484963e-07, "loss": 0.6499, "step": 3326 }, { "epoch": 0.32889306279811187, "grad_norm": 14.560985645520223, "learning_rate": 1.5669492966702145e-07, "loss": 0.7457, "step": 3327 }, { "epoch": 0.3289919185428663, "grad_norm": 4.025316919491248, "learning_rate": 1.5666854927231788e-07, "loss": 0.7219, "step": 3328 }, { "epoch": 0.3290907742876208, "grad_norm": 16.4329466844937, "learning_rate": 1.5664216306709048e-07, "loss": 0.6901, "step": 3329 }, { "epoch": 0.3291896300323753, "grad_norm": 3.508676997279705, "learning_rate": 1.5661577105404483e-07, "loss": 0.7182, "step": 3330 }, { "epoch": 0.3292884857771297, "grad_norm": 4.111932589610013, "learning_rate": 1.5658937323588695e-07, "loss": 0.6978, "step": 3331 }, { "epoch": 0.3293873415218842, "grad_norm": 3.830670009301702, "learning_rate": 1.5656296961532363e-07, "loss": 0.7398, "step": 3332 }, { "epoch": 0.32948619726663864, "grad_norm": 7.116548265214187, "learning_rate": 1.5653656019506214e-07, "loss": 0.7183, "step": 3333 }, { "epoch": 0.3295850530113931, "grad_norm": 2.5596687783418974, "learning_rate": 1.5651014497781037e-07, "loss": 0.6528, "step": 3334 }, { "epoch": 0.3296839087561476, "grad_norm": 17.294416088881754, "learning_rate": 1.564837239662768e-07, "loss": 0.6425, "step": 3335 }, { "epoch": 0.32978276450090205, "grad_norm": 5.220942969197681, "learning_rate": 1.5645729716317052e-07, "loss": 0.7277, "step": 3336 }, { "epoch": 0.32988162024565654, "grad_norm": 5.24058718233169, "learning_rate": 1.5643086457120126e-07, "loss": 0.673, "step": 3337 }, { "epoch": 0.32998047599041097, "grad_norm": 3.6452757121169452, "learning_rate": 1.5640442619307924e-07, "loss": 0.7989, "step": 3338 }, { "epoch": 0.33007933173516546, "grad_norm": 3.955271879529515, "learning_rate": 1.5637798203151533e-07, "loss": 0.7437, "step": 3339 }, { "epoch": 0.33017818747991995, "grad_norm": 5.512147896226781, "learning_rate": 1.5635153208922102e-07, "loss": 0.7089, "step": 3340 }, { "epoch": 0.3302770432246744, "grad_norm": 4.413564604184577, "learning_rate": 1.5632507636890832e-07, "loss": 0.7037, "step": 3341 }, { "epoch": 0.33037589896942887, "grad_norm": 4.401560364734806, "learning_rate": 1.5629861487328987e-07, "loss": 0.6526, "step": 3342 }, { "epoch": 0.3304747547141833, "grad_norm": 7.835097899917708, "learning_rate": 1.5627214760507895e-07, "loss": 0.7652, "step": 3343 }, { "epoch": 0.3305736104589378, "grad_norm": 2.881351065295805, "learning_rate": 1.562456745669894e-07, "loss": 0.7648, "step": 3344 }, { "epoch": 0.3306724662036923, "grad_norm": 5.084410762411646, "learning_rate": 1.5621919576173554e-07, "loss": 0.7608, "step": 3345 }, { "epoch": 0.3307713219484467, "grad_norm": 4.727347757804229, "learning_rate": 1.5619271119203247e-07, "loss": 0.5968, "step": 3346 }, { "epoch": 0.3308701776932012, "grad_norm": 5.320998479591818, "learning_rate": 1.561662208605958e-07, "loss": 0.6748, "step": 3347 }, { "epoch": 0.33096903343795564, "grad_norm": 3.9777929377082866, "learning_rate": 1.5613972477014165e-07, "loss": 0.6999, "step": 3348 }, { "epoch": 0.33106788918271013, "grad_norm": 3.734666444773565, "learning_rate": 1.5611322292338685e-07, "loss": 0.7181, "step": 3349 }, { "epoch": 0.3311667449274646, "grad_norm": 5.725695504267395, "learning_rate": 1.5608671532304877e-07, "loss": 0.6503, "step": 3350 }, { "epoch": 0.33126560067221905, "grad_norm": 4.495864913159523, "learning_rate": 1.5606020197184537e-07, "loss": 0.7584, "step": 3351 }, { "epoch": 0.33136445641697354, "grad_norm": 5.249674158532663, "learning_rate": 1.560336828724952e-07, "loss": 0.6859, "step": 3352 }, { "epoch": 0.331463312161728, "grad_norm": 3.4608185232923034, "learning_rate": 1.5600715802771743e-07, "loss": 0.7523, "step": 3353 }, { "epoch": 0.33156216790648246, "grad_norm": 3.0839283700561286, "learning_rate": 1.5598062744023174e-07, "loss": 0.7432, "step": 3354 }, { "epoch": 0.33166102365123695, "grad_norm": 3.2511545819493275, "learning_rate": 1.559540911127585e-07, "loss": 0.6602, "step": 3355 }, { "epoch": 0.3317598793959914, "grad_norm": 7.362585979341072, "learning_rate": 1.5592754904801859e-07, "loss": 0.7257, "step": 3356 }, { "epoch": 0.3318587351407459, "grad_norm": 3.9068456498679915, "learning_rate": 1.5590100124873352e-07, "loss": 0.7225, "step": 3357 }, { "epoch": 0.3319575908855003, "grad_norm": 3.008765401939737, "learning_rate": 1.5587444771762533e-07, "loss": 0.7161, "step": 3358 }, { "epoch": 0.3320564466302548, "grad_norm": 5.055740983924254, "learning_rate": 1.5584788845741682e-07, "loss": 0.708, "step": 3359 }, { "epoch": 0.3321553023750093, "grad_norm": 4.177986259959426, "learning_rate": 1.5582132347083108e-07, "loss": 0.8116, "step": 3360 }, { "epoch": 0.3322541581197637, "grad_norm": 3.470784503939456, "learning_rate": 1.5579475276059207e-07, "loss": 0.6615, "step": 3361 }, { "epoch": 0.3323530138645182, "grad_norm": 3.7690855699685253, "learning_rate": 1.5576817632942421e-07, "loss": 0.6938, "step": 3362 }, { "epoch": 0.33245186960927264, "grad_norm": 2.92040859835785, "learning_rate": 1.5574159418005247e-07, "loss": 0.7825, "step": 3363 }, { "epoch": 0.33255072535402713, "grad_norm": 3.481749690836404, "learning_rate": 1.5571500631520254e-07, "loss": 0.6949, "step": 3364 }, { "epoch": 0.3326495810987816, "grad_norm": 2.6802031859849444, "learning_rate": 1.5568841273760055e-07, "loss": 0.6565, "step": 3365 }, { "epoch": 0.33274843684353606, "grad_norm": 5.830619743517246, "learning_rate": 1.5566181344997327e-07, "loss": 0.6615, "step": 3366 }, { "epoch": 0.33284729258829054, "grad_norm": 3.708599753443995, "learning_rate": 1.556352084550481e-07, "loss": 0.7361, "step": 3367 }, { "epoch": 0.332946148333045, "grad_norm": 7.950403461506017, "learning_rate": 1.5560859775555296e-07, "loss": 0.7878, "step": 3368 }, { "epoch": 0.33304500407779947, "grad_norm": 4.100171477492787, "learning_rate": 1.5558198135421648e-07, "loss": 0.8139, "step": 3369 }, { "epoch": 0.33314385982255396, "grad_norm": 4.501477811299288, "learning_rate": 1.5555535925376763e-07, "loss": 0.7256, "step": 3370 }, { "epoch": 0.3332427155673084, "grad_norm": 4.826852806293034, "learning_rate": 1.5552873145693623e-07, "loss": 0.7287, "step": 3371 }, { "epoch": 0.3333415713120629, "grad_norm": 9.384279243863439, "learning_rate": 1.555020979664525e-07, "loss": 0.7215, "step": 3372 }, { "epoch": 0.3334404270568173, "grad_norm": 13.36268725850793, "learning_rate": 1.5547545878504734e-07, "loss": 0.6717, "step": 3373 }, { "epoch": 0.3335392828015718, "grad_norm": 14.993033765111912, "learning_rate": 1.554488139154522e-07, "loss": 0.6099, "step": 3374 }, { "epoch": 0.3336381385463263, "grad_norm": 3.355874761285684, "learning_rate": 1.5542216336039913e-07, "loss": 0.7319, "step": 3375 }, { "epoch": 0.3337369942910807, "grad_norm": 3.441567396634726, "learning_rate": 1.5539550712262073e-07, "loss": 0.5852, "step": 3376 }, { "epoch": 0.3338358500358352, "grad_norm": 7.304128548857031, "learning_rate": 1.5536884520485022e-07, "loss": 0.7953, "step": 3377 }, { "epoch": 0.33393470578058965, "grad_norm": 6.372795311992978, "learning_rate": 1.5534217760982142e-07, "loss": 0.7305, "step": 3378 }, { "epoch": 0.33403356152534414, "grad_norm": 3.169429001516498, "learning_rate": 1.553155043402686e-07, "loss": 0.6968, "step": 3379 }, { "epoch": 0.3341324172700986, "grad_norm": 3.2082653451911254, "learning_rate": 1.552888253989268e-07, "loss": 0.7367, "step": 3380 }, { "epoch": 0.33423127301485306, "grad_norm": 4.14001201681523, "learning_rate": 1.5526214078853154e-07, "loss": 0.7292, "step": 3381 }, { "epoch": 0.33433012875960755, "grad_norm": 4.748658978370739, "learning_rate": 1.552354505118189e-07, "loss": 0.7058, "step": 3382 }, { "epoch": 0.33442898450436204, "grad_norm": 2.675636335241841, "learning_rate": 1.5520875457152559e-07, "loss": 0.7534, "step": 3383 }, { "epoch": 0.33452784024911647, "grad_norm": 3.8858371288736384, "learning_rate": 1.551820529703889e-07, "loss": 0.6956, "step": 3384 }, { "epoch": 0.33462669599387096, "grad_norm": 7.193825715201345, "learning_rate": 1.5515534571114662e-07, "loss": 0.7028, "step": 3385 }, { "epoch": 0.3347255517386254, "grad_norm": 3.20656162922396, "learning_rate": 1.5512863279653726e-07, "loss": 0.6846, "step": 3386 }, { "epoch": 0.3348244074833799, "grad_norm": 3.284387709445011, "learning_rate": 1.5510191422929983e-07, "loss": 0.6777, "step": 3387 }, { "epoch": 0.33492326322813437, "grad_norm": 3.7232391760940535, "learning_rate": 1.5507519001217393e-07, "loss": 0.7124, "step": 3388 }, { "epoch": 0.3350221189728888, "grad_norm": 6.312293890642153, "learning_rate": 1.5504846014789968e-07, "loss": 0.7225, "step": 3389 }, { "epoch": 0.3351209747176433, "grad_norm": 3.7344802949567684, "learning_rate": 1.5502172463921784e-07, "loss": 0.6989, "step": 3390 }, { "epoch": 0.3352198304623977, "grad_norm": 4.1657204438288975, "learning_rate": 1.549949834888698e-07, "loss": 0.7075, "step": 3391 }, { "epoch": 0.3353186862071522, "grad_norm": 9.246200845789376, "learning_rate": 1.549682366995974e-07, "loss": 0.7064, "step": 3392 }, { "epoch": 0.3354175419519067, "grad_norm": 3.4690785631806187, "learning_rate": 1.549414842741432e-07, "loss": 0.7355, "step": 3393 }, { "epoch": 0.33551639769666114, "grad_norm": 6.219510268129855, "learning_rate": 1.5491472621525023e-07, "loss": 0.664, "step": 3394 }, { "epoch": 0.33561525344141563, "grad_norm": 16.236229142668194, "learning_rate": 1.548879625256621e-07, "loss": 0.8051, "step": 3395 }, { "epoch": 0.33571410918617006, "grad_norm": 3.3754943253426317, "learning_rate": 1.5486119320812315e-07, "loss": 0.7154, "step": 3396 }, { "epoch": 0.33581296493092455, "grad_norm": 8.104192442493066, "learning_rate": 1.54834418265378e-07, "loss": 0.6722, "step": 3397 }, { "epoch": 0.33591182067567904, "grad_norm": 4.351751319896158, "learning_rate": 1.5480763770017218e-07, "loss": 0.7202, "step": 3398 }, { "epoch": 0.3360106764204335, "grad_norm": 3.1659015905809085, "learning_rate": 1.5478085151525155e-07, "loss": 0.7809, "step": 3399 }, { "epoch": 0.33610953216518796, "grad_norm": 22.931191725149997, "learning_rate": 1.5475405971336272e-07, "loss": 0.6823, "step": 3400 }, { "epoch": 0.3362083879099424, "grad_norm": 4.179182046622785, "learning_rate": 1.547272622972527e-07, "loss": 0.7477, "step": 3401 }, { "epoch": 0.3363072436546969, "grad_norm": 10.377681879359452, "learning_rate": 1.547004592696692e-07, "loss": 0.6915, "step": 3402 }, { "epoch": 0.3364060993994514, "grad_norm": 4.181482891849487, "learning_rate": 1.5467365063336052e-07, "loss": 0.7485, "step": 3403 }, { "epoch": 0.3365049551442058, "grad_norm": 6.205950005754268, "learning_rate": 1.5464683639107546e-07, "loss": 0.7404, "step": 3404 }, { "epoch": 0.3366038108889603, "grad_norm": 5.089735939931679, "learning_rate": 1.5462001654556344e-07, "loss": 0.7644, "step": 3405 }, { "epoch": 0.33670266663371473, "grad_norm": 9.920190067126605, "learning_rate": 1.545931910995744e-07, "loss": 0.7, "step": 3406 }, { "epoch": 0.3368015223784692, "grad_norm": 4.378507390051743, "learning_rate": 1.5456636005585893e-07, "loss": 0.7498, "step": 3407 }, { "epoch": 0.3369003781232237, "grad_norm": 5.287062516677671, "learning_rate": 1.5453952341716814e-07, "loss": 0.6652, "step": 3408 }, { "epoch": 0.33699923386797814, "grad_norm": 3.125418452623746, "learning_rate": 1.5451268118625374e-07, "loss": 0.7545, "step": 3409 }, { "epoch": 0.33709808961273263, "grad_norm": 3.7213114229095403, "learning_rate": 1.5448583336586802e-07, "loss": 0.8438, "step": 3410 }, { "epoch": 0.33719694535748707, "grad_norm": 4.6221567691655885, "learning_rate": 1.544589799587638e-07, "loss": 0.7401, "step": 3411 }, { "epoch": 0.33729580110224155, "grad_norm": 4.407400476487539, "learning_rate": 1.5443212096769449e-07, "loss": 0.8883, "step": 3412 }, { "epoch": 0.33739465684699604, "grad_norm": 3.5633952549105166, "learning_rate": 1.5440525639541413e-07, "loss": 0.6873, "step": 3413 }, { "epoch": 0.3374935125917505, "grad_norm": 3.374152459307575, "learning_rate": 1.5437838624467725e-07, "loss": 0.5983, "step": 3414 }, { "epoch": 0.33759236833650497, "grad_norm": 5.815609012281859, "learning_rate": 1.54351510518239e-07, "loss": 0.672, "step": 3415 }, { "epoch": 0.3376912240812594, "grad_norm": 3.9096887089050862, "learning_rate": 1.543246292188551e-07, "loss": 0.7057, "step": 3416 }, { "epoch": 0.3377900798260139, "grad_norm": 7.252330308690125, "learning_rate": 1.542977423492818e-07, "loss": 0.7466, "step": 3417 }, { "epoch": 0.3378889355707684, "grad_norm": 3.741137031911654, "learning_rate": 1.5427084991227598e-07, "loss": 0.7057, "step": 3418 }, { "epoch": 0.3379877913155228, "grad_norm": 5.403564310987992, "learning_rate": 1.5424395191059504e-07, "loss": 0.7762, "step": 3419 }, { "epoch": 0.3380866470602773, "grad_norm": 4.6238689582853345, "learning_rate": 1.5421704834699703e-07, "loss": 0.7073, "step": 3420 }, { "epoch": 0.33818550280503173, "grad_norm": 5.574016534854883, "learning_rate": 1.541901392242404e-07, "loss": 0.7049, "step": 3421 }, { "epoch": 0.3382843585497862, "grad_norm": 7.423189065037691, "learning_rate": 1.541632245450844e-07, "loss": 0.7122, "step": 3422 }, { "epoch": 0.3383832142945407, "grad_norm": 3.93400199944179, "learning_rate": 1.541363043122887e-07, "loss": 0.6675, "step": 3423 }, { "epoch": 0.33848207003929515, "grad_norm": 4.505967003032372, "learning_rate": 1.5410937852861353e-07, "loss": 0.8308, "step": 3424 }, { "epoch": 0.33858092578404964, "grad_norm": 4.142049694910113, "learning_rate": 1.5408244719681975e-07, "loss": 0.7395, "step": 3425 }, { "epoch": 0.33867978152880407, "grad_norm": 2.952409713569996, "learning_rate": 1.5405551031966884e-07, "loss": 0.5934, "step": 3426 }, { "epoch": 0.33877863727355856, "grad_norm": 3.494110302691782, "learning_rate": 1.5402856789992264e-07, "loss": 0.8214, "step": 3427 }, { "epoch": 0.33887749301831305, "grad_norm": 4.049408079991959, "learning_rate": 1.5400161994034384e-07, "loss": 0.7321, "step": 3428 }, { "epoch": 0.3389763487630675, "grad_norm": 14.900677526181031, "learning_rate": 1.5397466644369548e-07, "loss": 0.7159, "step": 3429 }, { "epoch": 0.33907520450782197, "grad_norm": 3.7496756464084626, "learning_rate": 1.5394770741274123e-07, "loss": 0.7717, "step": 3430 }, { "epoch": 0.3391740602525764, "grad_norm": 10.113740079840348, "learning_rate": 1.539207428502454e-07, "loss": 0.7211, "step": 3431 }, { "epoch": 0.3392729159973309, "grad_norm": 6.786960486767994, "learning_rate": 1.5389377275897276e-07, "loss": 0.7201, "step": 3432 }, { "epoch": 0.3393717717420854, "grad_norm": 7.521776711038403, "learning_rate": 1.5386679714168875e-07, "loss": 0.7717, "step": 3433 }, { "epoch": 0.3394706274868398, "grad_norm": 6.281606183573816, "learning_rate": 1.5383981600115927e-07, "loss": 0.7144, "step": 3434 }, { "epoch": 0.3395694832315943, "grad_norm": 6.7710367670826574, "learning_rate": 1.5381282934015082e-07, "loss": 0.7436, "step": 3435 }, { "epoch": 0.33966833897634874, "grad_norm": 3.840386609343554, "learning_rate": 1.5378583716143055e-07, "loss": 0.7122, "step": 3436 }, { "epoch": 0.3397671947211032, "grad_norm": 2.903791529528234, "learning_rate": 1.5375883946776608e-07, "loss": 0.72, "step": 3437 }, { "epoch": 0.3398660504658577, "grad_norm": 3.347402257706789, "learning_rate": 1.537318362619256e-07, "loss": 0.8048, "step": 3438 }, { "epoch": 0.33996490621061215, "grad_norm": 4.65261636066201, "learning_rate": 1.5370482754667792e-07, "loss": 0.7432, "step": 3439 }, { "epoch": 0.34006376195536664, "grad_norm": 3.73523971825845, "learning_rate": 1.5367781332479237e-07, "loss": 0.6098, "step": 3440 }, { "epoch": 0.3401626177001211, "grad_norm": 4.52246404304519, "learning_rate": 1.5365079359903885e-07, "loss": 0.7002, "step": 3441 }, { "epoch": 0.34026147344487556, "grad_norm": 3.1328148000937754, "learning_rate": 1.536237683721879e-07, "loss": 0.8115, "step": 3442 }, { "epoch": 0.34036032918963005, "grad_norm": 5.167569637842267, "learning_rate": 1.5359673764701048e-07, "loss": 0.7556, "step": 3443 }, { "epoch": 0.3404591849343845, "grad_norm": 3.450659395525906, "learning_rate": 1.5356970142627824e-07, "loss": 0.6362, "step": 3444 }, { "epoch": 0.340558040679139, "grad_norm": 20.160413183637818, "learning_rate": 1.535426597127633e-07, "loss": 0.7412, "step": 3445 }, { "epoch": 0.3406568964238934, "grad_norm": 3.618393746075291, "learning_rate": 1.535156125092384e-07, "loss": 0.6917, "step": 3446 }, { "epoch": 0.3407557521686479, "grad_norm": 3.8795819479009883, "learning_rate": 1.534885598184769e-07, "loss": 0.7324, "step": 3447 }, { "epoch": 0.3408546079134024, "grad_norm": 5.715136243513716, "learning_rate": 1.5346150164325256e-07, "loss": 0.7664, "step": 3448 }, { "epoch": 0.3409534636581568, "grad_norm": 6.8743957814530985, "learning_rate": 1.5343443798633984e-07, "loss": 0.7779, "step": 3449 }, { "epoch": 0.3410523194029113, "grad_norm": 3.4703042988085278, "learning_rate": 1.5340736885051375e-07, "loss": 0.7263, "step": 3450 }, { "epoch": 0.34115117514766574, "grad_norm": 3.596781608477162, "learning_rate": 1.533802942385498e-07, "loss": 0.7726, "step": 3451 }, { "epoch": 0.34125003089242023, "grad_norm": 3.0185521258379326, "learning_rate": 1.5335321415322407e-07, "loss": 0.6822, "step": 3452 }, { "epoch": 0.3413488866371747, "grad_norm": 3.1745446662853043, "learning_rate": 1.5332612859731323e-07, "loss": 0.7563, "step": 3453 }, { "epoch": 0.34144774238192915, "grad_norm": 2.918064530234702, "learning_rate": 1.5329903757359454e-07, "loss": 0.7457, "step": 3454 }, { "epoch": 0.34154659812668364, "grad_norm": 4.173601190597856, "learning_rate": 1.5327194108484575e-07, "loss": 0.6952, "step": 3455 }, { "epoch": 0.3416454538714381, "grad_norm": 17.88578398999503, "learning_rate": 1.5324483913384523e-07, "loss": 0.7426, "step": 3456 }, { "epoch": 0.34174430961619257, "grad_norm": 6.984622317110158, "learning_rate": 1.5321773172337187e-07, "loss": 0.8306, "step": 3457 }, { "epoch": 0.34184316536094705, "grad_norm": 13.027294432977849, "learning_rate": 1.5319061885620515e-07, "loss": 0.685, "step": 3458 }, { "epoch": 0.3419420211057015, "grad_norm": 5.351129861616586, "learning_rate": 1.5316350053512505e-07, "loss": 0.6603, "step": 3459 }, { "epoch": 0.342040876850456, "grad_norm": 3.5787038712614767, "learning_rate": 1.5313637676291222e-07, "loss": 0.7466, "step": 3460 }, { "epoch": 0.34213973259521047, "grad_norm": 3.711529990152059, "learning_rate": 1.5310924754234775e-07, "loss": 0.6843, "step": 3461 }, { "epoch": 0.3422385883399649, "grad_norm": 3.935626977595704, "learning_rate": 1.5308211287621337e-07, "loss": 0.7082, "step": 3462 }, { "epoch": 0.3423374440847194, "grad_norm": 3.0005123599210655, "learning_rate": 1.5305497276729133e-07, "loss": 0.6987, "step": 3463 }, { "epoch": 0.3424362998294738, "grad_norm": 9.946484957189817, "learning_rate": 1.5302782721836445e-07, "loss": 0.6379, "step": 3464 }, { "epoch": 0.3425351555742283, "grad_norm": 4.642564100319465, "learning_rate": 1.5300067623221612e-07, "loss": 0.7444, "step": 3465 }, { "epoch": 0.3426340113189828, "grad_norm": 5.371474910118832, "learning_rate": 1.5297351981163026e-07, "loss": 0.6839, "step": 3466 }, { "epoch": 0.34273286706373723, "grad_norm": 8.444037158874908, "learning_rate": 1.529463579593913e-07, "loss": 0.7859, "step": 3467 }, { "epoch": 0.3428317228084917, "grad_norm": 7.626211949736986, "learning_rate": 1.529191906782844e-07, "loss": 0.6871, "step": 3468 }, { "epoch": 0.34293057855324616, "grad_norm": 2.806900203121799, "learning_rate": 1.528920179710951e-07, "loss": 0.692, "step": 3469 }, { "epoch": 0.34302943429800065, "grad_norm": 3.0707822878338944, "learning_rate": 1.5286483984060956e-07, "loss": 0.6969, "step": 3470 }, { "epoch": 0.34312829004275514, "grad_norm": 3.13576791570386, "learning_rate": 1.528376562896145e-07, "loss": 0.6532, "step": 3471 }, { "epoch": 0.34322714578750957, "grad_norm": 7.816720306346298, "learning_rate": 1.528104673208972e-07, "loss": 0.7698, "step": 3472 }, { "epoch": 0.34332600153226406, "grad_norm": 4.096292936747229, "learning_rate": 1.5278327293724548e-07, "loss": 0.6498, "step": 3473 }, { "epoch": 0.3434248572770185, "grad_norm": 3.5978119615729685, "learning_rate": 1.5275607314144775e-07, "loss": 0.6859, "step": 3474 }, { "epoch": 0.343523713021773, "grad_norm": 3.672440759945241, "learning_rate": 1.527288679362929e-07, "loss": 0.7381, "step": 3475 }, { "epoch": 0.34362256876652747, "grad_norm": 6.602873425372868, "learning_rate": 1.5270165732457047e-07, "loss": 0.6881, "step": 3476 }, { "epoch": 0.3437214245112819, "grad_norm": 3.8893793149300824, "learning_rate": 1.526744413090705e-07, "loss": 0.734, "step": 3477 }, { "epoch": 0.3438202802560364, "grad_norm": 5.657276115624566, "learning_rate": 1.5264721989258352e-07, "loss": 0.7321, "step": 3478 }, { "epoch": 0.3439191360007908, "grad_norm": 3.1849025446724135, "learning_rate": 1.526199930779008e-07, "loss": 0.724, "step": 3479 }, { "epoch": 0.3440179917455453, "grad_norm": 2.983018431230655, "learning_rate": 1.5259276086781397e-07, "loss": 0.695, "step": 3480 }, { "epoch": 0.3441168474902998, "grad_norm": 7.763802298955474, "learning_rate": 1.5256552326511529e-07, "loss": 0.6063, "step": 3481 }, { "epoch": 0.34421570323505424, "grad_norm": 11.148054589894054, "learning_rate": 1.5253828027259762e-07, "loss": 0.7037, "step": 3482 }, { "epoch": 0.3443145589798087, "grad_norm": 4.375930007180887, "learning_rate": 1.525110318930543e-07, "loss": 0.662, "step": 3483 }, { "epoch": 0.34441341472456316, "grad_norm": 20.892056465673637, "learning_rate": 1.5248377812927924e-07, "loss": 0.7612, "step": 3484 }, { "epoch": 0.34451227046931765, "grad_norm": 4.434538545730614, "learning_rate": 1.5245651898406694e-07, "loss": 0.6684, "step": 3485 }, { "epoch": 0.34461112621407214, "grad_norm": 4.4008864879589975, "learning_rate": 1.524292544602124e-07, "loss": 0.7273, "step": 3486 }, { "epoch": 0.3447099819588266, "grad_norm": 17.81289332047644, "learning_rate": 1.524019845605112e-07, "loss": 0.603, "step": 3487 }, { "epoch": 0.34480883770358106, "grad_norm": 3.7967736071444858, "learning_rate": 1.5237470928775948e-07, "loss": 0.739, "step": 3488 }, { "epoch": 0.3449076934483355, "grad_norm": 4.037451805521626, "learning_rate": 1.5234742864475385e-07, "loss": 0.7925, "step": 3489 }, { "epoch": 0.34500654919309, "grad_norm": 7.085736869079253, "learning_rate": 1.5232014263429164e-07, "loss": 0.7605, "step": 3490 }, { "epoch": 0.3451054049378445, "grad_norm": 4.289994202369924, "learning_rate": 1.5229285125917058e-07, "loss": 0.8461, "step": 3491 }, { "epoch": 0.3452042606825989, "grad_norm": 5.667258014621529, "learning_rate": 1.5226555452218897e-07, "loss": 0.7336, "step": 3492 }, { "epoch": 0.3453031164273534, "grad_norm": 5.8837231558996015, "learning_rate": 1.5223825242614572e-07, "loss": 0.7481, "step": 3493 }, { "epoch": 0.34540197217210783, "grad_norm": 3.5363562203119914, "learning_rate": 1.5221094497384026e-07, "loss": 0.6776, "step": 3494 }, { "epoch": 0.3455008279168623, "grad_norm": 2.9308737999293206, "learning_rate": 1.521836321680726e-07, "loss": 0.7538, "step": 3495 }, { "epoch": 0.3455996836616168, "grad_norm": 4.943695216104501, "learning_rate": 1.5215631401164316e-07, "loss": 0.8277, "step": 3496 }, { "epoch": 0.34569853940637124, "grad_norm": 4.152638068581815, "learning_rate": 1.521289905073531e-07, "loss": 0.6888, "step": 3497 }, { "epoch": 0.34579739515112573, "grad_norm": 5.131385668278917, "learning_rate": 1.5210166165800403e-07, "loss": 0.7036, "step": 3498 }, { "epoch": 0.34589625089588016, "grad_norm": 9.210620189593723, "learning_rate": 1.520743274663981e-07, "loss": 0.6877, "step": 3499 }, { "epoch": 0.34599510664063465, "grad_norm": 5.50061700114168, "learning_rate": 1.52046987935338e-07, "loss": 0.7255, "step": 3500 }, { "epoch": 0.34609396238538914, "grad_norm": 4.846706679376976, "learning_rate": 1.520196430676271e-07, "loss": 0.748, "step": 3501 }, { "epoch": 0.3461928181301436, "grad_norm": 8.168753102690593, "learning_rate": 1.5199229286606912e-07, "loss": 0.698, "step": 3502 }, { "epoch": 0.34629167387489806, "grad_norm": 4.581332839265094, "learning_rate": 1.5196493733346846e-07, "loss": 0.702, "step": 3503 }, { "epoch": 0.3463905296196525, "grad_norm": 22.286580967054995, "learning_rate": 1.5193757647263e-07, "loss": 0.7059, "step": 3504 }, { "epoch": 0.346489385364407, "grad_norm": 5.617006624423454, "learning_rate": 1.5191021028635922e-07, "loss": 0.7186, "step": 3505 }, { "epoch": 0.3465882411091615, "grad_norm": 3.8745316417796087, "learning_rate": 1.518828387774621e-07, "loss": 0.6037, "step": 3506 }, { "epoch": 0.3466870968539159, "grad_norm": 5.876061715600381, "learning_rate": 1.5185546194874518e-07, "loss": 0.7474, "step": 3507 }, { "epoch": 0.3467859525986704, "grad_norm": 3.196815890671076, "learning_rate": 1.5182807980301555e-07, "loss": 0.7624, "step": 3508 }, { "epoch": 0.34688480834342483, "grad_norm": 3.1725137931540943, "learning_rate": 1.518006923430809e-07, "loss": 0.7217, "step": 3509 }, { "epoch": 0.3469836640881793, "grad_norm": 2.7157228586467146, "learning_rate": 1.517732995717493e-07, "loss": 0.6387, "step": 3510 }, { "epoch": 0.3470825198329338, "grad_norm": 9.923734573065127, "learning_rate": 1.5174590149182957e-07, "loss": 0.7095, "step": 3511 }, { "epoch": 0.34718137557768824, "grad_norm": 4.365618426693073, "learning_rate": 1.5171849810613097e-07, "loss": 0.7301, "step": 3512 }, { "epoch": 0.34728023132244273, "grad_norm": 4.514788388920906, "learning_rate": 1.5169108941746327e-07, "loss": 0.6812, "step": 3513 }, { "epoch": 0.34737908706719717, "grad_norm": 3.384827947043089, "learning_rate": 1.516636754286368e-07, "loss": 0.7315, "step": 3514 }, { "epoch": 0.34747794281195166, "grad_norm": 3.8093610004456173, "learning_rate": 1.5163625614246257e-07, "loss": 0.7798, "step": 3515 }, { "epoch": 0.34757679855670615, "grad_norm": 5.027058402465436, "learning_rate": 1.5160883156175195e-07, "loss": 0.675, "step": 3516 }, { "epoch": 0.3476756543014606, "grad_norm": 11.575420165225765, "learning_rate": 1.5158140168931693e-07, "loss": 0.6189, "step": 3517 }, { "epoch": 0.34777451004621507, "grad_norm": 3.3807262487667677, "learning_rate": 1.5155396652797003e-07, "loss": 0.683, "step": 3518 }, { "epoch": 0.3478733657909695, "grad_norm": 4.426345777794725, "learning_rate": 1.5152652608052433e-07, "loss": 0.6542, "step": 3519 }, { "epoch": 0.347972221535724, "grad_norm": 3.957028460298404, "learning_rate": 1.514990803497935e-07, "loss": 0.6858, "step": 3520 }, { "epoch": 0.3480710772804785, "grad_norm": 3.1117063771388302, "learning_rate": 1.5147162933859163e-07, "loss": 0.7622, "step": 3521 }, { "epoch": 0.3481699330252329, "grad_norm": 3.9932157247200095, "learning_rate": 1.5144417304973342e-07, "loss": 0.7966, "step": 3522 }, { "epoch": 0.3482687887699874, "grad_norm": 5.009970884138738, "learning_rate": 1.5141671148603412e-07, "loss": 0.7224, "step": 3523 }, { "epoch": 0.34836764451474184, "grad_norm": 3.3642018627306913, "learning_rate": 1.513892446503095e-07, "loss": 0.7312, "step": 3524 }, { "epoch": 0.3484665002594963, "grad_norm": 3.1270127627541346, "learning_rate": 1.5136177254537587e-07, "loss": 0.739, "step": 3525 }, { "epoch": 0.3485653560042508, "grad_norm": 3.432870835645691, "learning_rate": 1.5133429517405016e-07, "loss": 0.6256, "step": 3526 }, { "epoch": 0.34866421174900525, "grad_norm": 7.695780904259153, "learning_rate": 1.5130681253914968e-07, "loss": 0.7474, "step": 3527 }, { "epoch": 0.34876306749375974, "grad_norm": 3.604866464597523, "learning_rate": 1.512793246434924e-07, "loss": 0.8163, "step": 3528 }, { "epoch": 0.34886192323851417, "grad_norm": 3.244802464975086, "learning_rate": 1.512518314898968e-07, "loss": 0.7144, "step": 3529 }, { "epoch": 0.34896077898326866, "grad_norm": 3.3156638485057526, "learning_rate": 1.512243330811819e-07, "loss": 0.6428, "step": 3530 }, { "epoch": 0.34905963472802315, "grad_norm": 3.0231793072423607, "learning_rate": 1.5119682942016727e-07, "loss": 0.5858, "step": 3531 }, { "epoch": 0.3491584904727776, "grad_norm": 5.298337297851877, "learning_rate": 1.51169320509673e-07, "loss": 0.7473, "step": 3532 }, { "epoch": 0.34925734621753207, "grad_norm": 10.946866392807316, "learning_rate": 1.5114180635251966e-07, "loss": 0.7283, "step": 3533 }, { "epoch": 0.34935620196228656, "grad_norm": 4.311552878663955, "learning_rate": 1.511142869515285e-07, "loss": 0.6813, "step": 3534 }, { "epoch": 0.349455057707041, "grad_norm": 3.83280635864606, "learning_rate": 1.510867623095212e-07, "loss": 0.6871, "step": 3535 }, { "epoch": 0.3495539134517955, "grad_norm": 3.4529997755190625, "learning_rate": 1.5105923242932002e-07, "loss": 0.7064, "step": 3536 }, { "epoch": 0.3496527691965499, "grad_norm": 6.53175765617147, "learning_rate": 1.510316973137477e-07, "loss": 0.6994, "step": 3537 }, { "epoch": 0.3497516249413044, "grad_norm": 3.5075250024517857, "learning_rate": 1.5100415696562765e-07, "loss": 0.6663, "step": 3538 }, { "epoch": 0.3498504806860589, "grad_norm": 2.7801210002071963, "learning_rate": 1.5097661138778363e-07, "loss": 0.8319, "step": 3539 }, { "epoch": 0.34994933643081333, "grad_norm": 3.8992182146229912, "learning_rate": 1.5094906058304003e-07, "loss": 0.8037, "step": 3540 }, { "epoch": 0.3500481921755678, "grad_norm": 3.189224845181829, "learning_rate": 1.509215045542219e-07, "loss": 0.6482, "step": 3541 }, { "epoch": 0.35014704792032225, "grad_norm": 3.9117892633324836, "learning_rate": 1.5089394330415458e-07, "loss": 0.7433, "step": 3542 }, { "epoch": 0.35024590366507674, "grad_norm": 6.787832456304181, "learning_rate": 1.5086637683566413e-07, "loss": 0.6908, "step": 3543 }, { "epoch": 0.35034475940983123, "grad_norm": 7.889532236288772, "learning_rate": 1.5083880515157702e-07, "loss": 0.755, "step": 3544 }, { "epoch": 0.35044361515458566, "grad_norm": 4.702098459409698, "learning_rate": 1.508112282547204e-07, "loss": 0.7293, "step": 3545 }, { "epoch": 0.35054247089934015, "grad_norm": 3.7583379708936158, "learning_rate": 1.5078364614792186e-07, "loss": 0.7062, "step": 3546 }, { "epoch": 0.3506413266440946, "grad_norm": 3.0226732365585796, "learning_rate": 1.507560588340095e-07, "loss": 0.6973, "step": 3547 }, { "epoch": 0.3507401823888491, "grad_norm": 6.457467073729727, "learning_rate": 1.5072846631581202e-07, "loss": 0.7191, "step": 3548 }, { "epoch": 0.35083903813360356, "grad_norm": 7.927066686907352, "learning_rate": 1.5070086859615861e-07, "loss": 0.7585, "step": 3549 }, { "epoch": 0.350937893878358, "grad_norm": 5.723467655921544, "learning_rate": 1.50673265677879e-07, "loss": 0.6529, "step": 3550 }, { "epoch": 0.3510367496231125, "grad_norm": 3.743349590358212, "learning_rate": 1.5064565756380348e-07, "loss": 0.6308, "step": 3551 }, { "epoch": 0.3511356053678669, "grad_norm": 7.322723653496503, "learning_rate": 1.5061804425676288e-07, "loss": 0.7316, "step": 3552 }, { "epoch": 0.3512344611126214, "grad_norm": 12.300069279967863, "learning_rate": 1.505904257595885e-07, "loss": 0.7123, "step": 3553 }, { "epoch": 0.3513333168573759, "grad_norm": 3.2817336495573994, "learning_rate": 1.5056280207511223e-07, "loss": 0.6803, "step": 3554 }, { "epoch": 0.35143217260213033, "grad_norm": 5.343286822319338, "learning_rate": 1.5053517320616643e-07, "loss": 0.6529, "step": 3555 }, { "epoch": 0.3515310283468848, "grad_norm": 12.70158932641803, "learning_rate": 1.5050753915558408e-07, "loss": 0.7863, "step": 3556 }, { "epoch": 0.35162988409163926, "grad_norm": 3.126182924564837, "learning_rate": 1.5047989992619864e-07, "loss": 0.7702, "step": 3557 }, { "epoch": 0.35172873983639374, "grad_norm": 3.885258177897647, "learning_rate": 1.5045225552084405e-07, "loss": 0.6827, "step": 3558 }, { "epoch": 0.35182759558114823, "grad_norm": 4.142647420107966, "learning_rate": 1.5042460594235493e-07, "loss": 0.7477, "step": 3559 }, { "epoch": 0.35192645132590267, "grad_norm": 3.0291495262740726, "learning_rate": 1.5039695119356627e-07, "loss": 0.721, "step": 3560 }, { "epoch": 0.35202530707065716, "grad_norm": 4.539818305723451, "learning_rate": 1.5036929127731365e-07, "loss": 0.7561, "step": 3561 }, { "epoch": 0.3521241628154116, "grad_norm": 4.503891756808077, "learning_rate": 1.5034162619643324e-07, "loss": 0.7512, "step": 3562 }, { "epoch": 0.3522230185601661, "grad_norm": 3.7661209936815596, "learning_rate": 1.5031395595376164e-07, "loss": 0.7763, "step": 3563 }, { "epoch": 0.35232187430492057, "grad_norm": 3.615777300922611, "learning_rate": 1.5028628055213605e-07, "loss": 0.7896, "step": 3564 }, { "epoch": 0.352420730049675, "grad_norm": 4.328058368408435, "learning_rate": 1.5025859999439415e-07, "loss": 0.6687, "step": 3565 }, { "epoch": 0.3525195857944295, "grad_norm": 7.636062885849978, "learning_rate": 1.502309142833742e-07, "loss": 0.7688, "step": 3566 }, { "epoch": 0.3526184415391839, "grad_norm": 7.227419613475218, "learning_rate": 1.5020322342191497e-07, "loss": 0.672, "step": 3567 }, { "epoch": 0.3527172972839384, "grad_norm": 6.459968792122518, "learning_rate": 1.5017552741285568e-07, "loss": 0.6789, "step": 3568 }, { "epoch": 0.3528161530286929, "grad_norm": 4.3900211430987905, "learning_rate": 1.5014782625903618e-07, "loss": 0.6491, "step": 3569 }, { "epoch": 0.35291500877344734, "grad_norm": 3.7753090841119445, "learning_rate": 1.5012011996329688e-07, "loss": 0.6839, "step": 3570 }, { "epoch": 0.3530138645182018, "grad_norm": 3.094466311507995, "learning_rate": 1.500924085284786e-07, "loss": 0.7132, "step": 3571 }, { "epoch": 0.35311272026295626, "grad_norm": 3.0592766818150476, "learning_rate": 1.5006469195742273e-07, "loss": 0.6905, "step": 3572 }, { "epoch": 0.35321157600771075, "grad_norm": 2.997323221332699, "learning_rate": 1.500369702529712e-07, "loss": 0.6528, "step": 3573 }, { "epoch": 0.35331043175246524, "grad_norm": 2.7041979859823897, "learning_rate": 1.500092434179665e-07, "loss": 0.7827, "step": 3574 }, { "epoch": 0.35340928749721967, "grad_norm": 7.9111499071354325, "learning_rate": 1.4998151145525154e-07, "loss": 0.7306, "step": 3575 }, { "epoch": 0.35350814324197416, "grad_norm": 5.228559768965661, "learning_rate": 1.4995377436766984e-07, "loss": 0.7739, "step": 3576 }, { "epoch": 0.3536069989867286, "grad_norm": 3.559793581806373, "learning_rate": 1.4992603215806548e-07, "loss": 0.7239, "step": 3577 }, { "epoch": 0.3537058547314831, "grad_norm": 10.761271739465293, "learning_rate": 1.4989828482928298e-07, "loss": 0.6949, "step": 3578 }, { "epoch": 0.35380471047623757, "grad_norm": 4.962133754698089, "learning_rate": 1.4987053238416743e-07, "loss": 0.6694, "step": 3579 }, { "epoch": 0.353903566220992, "grad_norm": 6.990741266969024, "learning_rate": 1.498427748255644e-07, "loss": 0.7174, "step": 3580 }, { "epoch": 0.3540024219657465, "grad_norm": 15.97666795643509, "learning_rate": 1.4981501215632008e-07, "loss": 0.7755, "step": 3581 }, { "epoch": 0.35410127771050093, "grad_norm": 2.852960808113818, "learning_rate": 1.4978724437928105e-07, "loss": 0.6581, "step": 3582 }, { "epoch": 0.3542001334552554, "grad_norm": 3.1520382918086525, "learning_rate": 1.4975947149729455e-07, "loss": 0.7381, "step": 3583 }, { "epoch": 0.3542989892000099, "grad_norm": 10.162958080614407, "learning_rate": 1.4973169351320824e-07, "loss": 0.6536, "step": 3584 }, { "epoch": 0.35439784494476434, "grad_norm": 4.038614161194025, "learning_rate": 1.4970391042987035e-07, "loss": 0.7599, "step": 3585 }, { "epoch": 0.35449670068951883, "grad_norm": 4.26496554030443, "learning_rate": 1.4967612225012964e-07, "loss": 0.7883, "step": 3586 }, { "epoch": 0.35459555643427326, "grad_norm": 18.720833719002044, "learning_rate": 1.4964832897683536e-07, "loss": 0.6689, "step": 3587 }, { "epoch": 0.35469441217902775, "grad_norm": 4.880920090188901, "learning_rate": 1.496205306128373e-07, "loss": 0.7435, "step": 3588 }, { "epoch": 0.35479326792378224, "grad_norm": 4.396380744804757, "learning_rate": 1.4959272716098578e-07, "loss": 0.6598, "step": 3589 }, { "epoch": 0.3548921236685367, "grad_norm": 3.9117014656584606, "learning_rate": 1.4956491862413164e-07, "loss": 0.8134, "step": 3590 }, { "epoch": 0.35499097941329116, "grad_norm": 6.494599286210102, "learning_rate": 1.4953710500512625e-07, "loss": 0.7639, "step": 3591 }, { "epoch": 0.3550898351580456, "grad_norm": 5.1758771497641645, "learning_rate": 1.4950928630682146e-07, "loss": 0.7392, "step": 3592 }, { "epoch": 0.3551886909028001, "grad_norm": 4.621941366646833, "learning_rate": 1.4948146253206967e-07, "loss": 0.7206, "step": 3593 }, { "epoch": 0.3552875466475546, "grad_norm": 57.392796566108395, "learning_rate": 1.4945363368372378e-07, "loss": 0.7321, "step": 3594 }, { "epoch": 0.355386402392309, "grad_norm": 4.906146898594353, "learning_rate": 1.4942579976463725e-07, "loss": 0.6247, "step": 3595 }, { "epoch": 0.3554852581370635, "grad_norm": 3.181409189999283, "learning_rate": 1.4939796077766403e-07, "loss": 0.6626, "step": 3596 }, { "epoch": 0.35558411388181793, "grad_norm": 2.9463108480031392, "learning_rate": 1.4937011672565865e-07, "loss": 0.7061, "step": 3597 }, { "epoch": 0.3556829696265724, "grad_norm": 5.391663339219531, "learning_rate": 1.4934226761147603e-07, "loss": 0.7289, "step": 3598 }, { "epoch": 0.3557818253713269, "grad_norm": 6.034349354070085, "learning_rate": 1.493144134379717e-07, "loss": 0.7978, "step": 3599 }, { "epoch": 0.35588068111608134, "grad_norm": 3.533853007792903, "learning_rate": 1.4928655420800173e-07, "loss": 0.6773, "step": 3600 }, { "epoch": 0.35597953686083583, "grad_norm": 6.2053060906735675, "learning_rate": 1.4925868992442268e-07, "loss": 0.4986, "step": 3601 }, { "epoch": 0.35607839260559027, "grad_norm": 5.323765181127638, "learning_rate": 1.4923082059009154e-07, "loss": 0.7335, "step": 3602 }, { "epoch": 0.35617724835034476, "grad_norm": 5.529427976407117, "learning_rate": 1.4920294620786597e-07, "loss": 0.6965, "step": 3603 }, { "epoch": 0.35627610409509924, "grad_norm": 3.4663870057435733, "learning_rate": 1.4917506678060408e-07, "loss": 0.8293, "step": 3604 }, { "epoch": 0.3563749598398537, "grad_norm": 4.533449667026421, "learning_rate": 1.4914718231116447e-07, "loss": 0.6456, "step": 3605 }, { "epoch": 0.35647381558460817, "grad_norm": 7.3341347480888555, "learning_rate": 1.4911929280240628e-07, "loss": 0.67, "step": 3606 }, { "epoch": 0.3565726713293626, "grad_norm": 3.0837285792238145, "learning_rate": 1.4909139825718918e-07, "loss": 0.7892, "step": 3607 }, { "epoch": 0.3566715270741171, "grad_norm": 3.9440466081806087, "learning_rate": 1.490634986783733e-07, "loss": 0.7695, "step": 3608 }, { "epoch": 0.3567703828188716, "grad_norm": 3.5951081335379684, "learning_rate": 1.4903559406881944e-07, "loss": 0.7185, "step": 3609 }, { "epoch": 0.356869238563626, "grad_norm": 16.749967233475115, "learning_rate": 1.490076844313887e-07, "loss": 0.7642, "step": 3610 }, { "epoch": 0.3569680943083805, "grad_norm": 4.691976232644765, "learning_rate": 1.4897976976894282e-07, "loss": 0.699, "step": 3611 }, { "epoch": 0.357066950053135, "grad_norm": 3.6628060708962615, "learning_rate": 1.4895185008434405e-07, "loss": 0.6847, "step": 3612 }, { "epoch": 0.3571658057978894, "grad_norm": 12.632188543529255, "learning_rate": 1.4892392538045515e-07, "loss": 0.716, "step": 3613 }, { "epoch": 0.3572646615426439, "grad_norm": 4.06672354863936, "learning_rate": 1.4889599566013938e-07, "loss": 0.6801, "step": 3614 }, { "epoch": 0.35736351728739835, "grad_norm": 4.294167648926604, "learning_rate": 1.4886806092626053e-07, "loss": 0.695, "step": 3615 }, { "epoch": 0.35746237303215284, "grad_norm": 6.809606100436298, "learning_rate": 1.488401211816829e-07, "loss": 0.7019, "step": 3616 }, { "epoch": 0.3575612287769073, "grad_norm": 3.498010553802556, "learning_rate": 1.488121764292713e-07, "loss": 0.7476, "step": 3617 }, { "epoch": 0.35766008452166176, "grad_norm": 3.538001316796065, "learning_rate": 1.4878422667189098e-07, "loss": 0.6762, "step": 3618 }, { "epoch": 0.35775894026641625, "grad_norm": 9.350832690884783, "learning_rate": 1.487562719124079e-07, "loss": 0.6522, "step": 3619 }, { "epoch": 0.3578577960111707, "grad_norm": 3.265537145861484, "learning_rate": 1.4872831215368828e-07, "loss": 0.7538, "step": 3620 }, { "epoch": 0.35795665175592517, "grad_norm": 2.7792225725585746, "learning_rate": 1.4870034739859906e-07, "loss": 0.737, "step": 3621 }, { "epoch": 0.35805550750067966, "grad_norm": 6.896852167330569, "learning_rate": 1.486723776500076e-07, "loss": 0.7553, "step": 3622 }, { "epoch": 0.3581543632454341, "grad_norm": 3.842412874140129, "learning_rate": 1.486444029107818e-07, "loss": 0.7743, "step": 3623 }, { "epoch": 0.3582532189901886, "grad_norm": 3.8920428978117547, "learning_rate": 1.4861642318379e-07, "loss": 0.6526, "step": 3624 }, { "epoch": 0.358352074734943, "grad_norm": 2.843723893493013, "learning_rate": 1.4858843847190115e-07, "loss": 0.6369, "step": 3625 }, { "epoch": 0.3584509304796975, "grad_norm": 3.9943138418972746, "learning_rate": 1.485604487779847e-07, "loss": 0.6621, "step": 3626 }, { "epoch": 0.358549786224452, "grad_norm": 23.574058654771576, "learning_rate": 1.4853245410491054e-07, "loss": 0.6664, "step": 3627 }, { "epoch": 0.3586486419692064, "grad_norm": 3.5105090727833472, "learning_rate": 1.4850445445554906e-07, "loss": 0.736, "step": 3628 }, { "epoch": 0.3587474977139609, "grad_norm": 4.52796256428317, "learning_rate": 1.484764498327713e-07, "loss": 0.8814, "step": 3629 }, { "epoch": 0.35884635345871535, "grad_norm": 3.61307472157643, "learning_rate": 1.484484402394487e-07, "loss": 0.7433, "step": 3630 }, { "epoch": 0.35894520920346984, "grad_norm": 3.1323685544251885, "learning_rate": 1.4842042567845323e-07, "loss": 0.6954, "step": 3631 }, { "epoch": 0.35904406494822433, "grad_norm": 4.778258149592983, "learning_rate": 1.4839240615265732e-07, "loss": 0.8673, "step": 3632 }, { "epoch": 0.35914292069297876, "grad_norm": 3.1459659546138, "learning_rate": 1.4836438166493402e-07, "loss": 0.7535, "step": 3633 }, { "epoch": 0.35924177643773325, "grad_norm": 3.5388816162076795, "learning_rate": 1.483363522181568e-07, "loss": 0.631, "step": 3634 }, { "epoch": 0.3593406321824877, "grad_norm": 2.709089793298291, "learning_rate": 1.4830831781519967e-07, "loss": 0.7561, "step": 3635 }, { "epoch": 0.3594394879272422, "grad_norm": 4.585001470495791, "learning_rate": 1.4828027845893715e-07, "loss": 0.7911, "step": 3636 }, { "epoch": 0.35953834367199666, "grad_norm": 3.010205364624134, "learning_rate": 1.4825223415224427e-07, "loss": 0.6452, "step": 3637 }, { "epoch": 0.3596371994167511, "grad_norm": 2.9008046211295153, "learning_rate": 1.4822418489799653e-07, "loss": 0.7033, "step": 3638 }, { "epoch": 0.3597360551615056, "grad_norm": 3.250226240587601, "learning_rate": 1.4819613069906998e-07, "loss": 0.8759, "step": 3639 }, { "epoch": 0.35983491090626, "grad_norm": 7.43852164708562, "learning_rate": 1.4816807155834117e-07, "loss": 0.6842, "step": 3640 }, { "epoch": 0.3599337666510145, "grad_norm": 4.768673449359528, "learning_rate": 1.4814000747868714e-07, "loss": 0.7781, "step": 3641 }, { "epoch": 0.360032622395769, "grad_norm": 3.2487134246053677, "learning_rate": 1.4811193846298552e-07, "loss": 0.6069, "step": 3642 }, { "epoch": 0.36013147814052343, "grad_norm": 3.810051852484412, "learning_rate": 1.4808386451411424e-07, "loss": 0.8084, "step": 3643 }, { "epoch": 0.3602303338852779, "grad_norm": 2.8724611653101144, "learning_rate": 1.48055785634952e-07, "loss": 0.7148, "step": 3644 }, { "epoch": 0.36032918963003235, "grad_norm": 3.41696190279382, "learning_rate": 1.480277018283778e-07, "loss": 0.7433, "step": 3645 }, { "epoch": 0.36042804537478684, "grad_norm": 4.0214063949870775, "learning_rate": 1.479996130972712e-07, "loss": 0.6876, "step": 3646 }, { "epoch": 0.36052690111954133, "grad_norm": 3.4810808439349312, "learning_rate": 1.4797151944451233e-07, "loss": 0.7872, "step": 3647 }, { "epoch": 0.36062575686429577, "grad_norm": 5.881220253017299, "learning_rate": 1.479434208729818e-07, "loss": 0.6805, "step": 3648 }, { "epoch": 0.36072461260905025, "grad_norm": 4.042773929620973, "learning_rate": 1.4791531738556066e-07, "loss": 0.7676, "step": 3649 }, { "epoch": 0.3608234683538047, "grad_norm": 35.35422784286995, "learning_rate": 1.478872089851305e-07, "loss": 0.721, "step": 3650 }, { "epoch": 0.3609223240985592, "grad_norm": 5.753701013298731, "learning_rate": 1.4785909567457347e-07, "loss": 0.6904, "step": 3651 }, { "epoch": 0.36102117984331367, "grad_norm": 4.242589158252777, "learning_rate": 1.4783097745677214e-07, "loss": 0.7583, "step": 3652 }, { "epoch": 0.3611200355880681, "grad_norm": 3.3178323967603536, "learning_rate": 1.4780285433460966e-07, "loss": 0.762, "step": 3653 }, { "epoch": 0.3612188913328226, "grad_norm": 5.109321238883726, "learning_rate": 1.477747263109696e-07, "loss": 0.7061, "step": 3654 }, { "epoch": 0.361317747077577, "grad_norm": 3.694369670782072, "learning_rate": 1.4774659338873604e-07, "loss": 0.6455, "step": 3655 }, { "epoch": 0.3614166028223315, "grad_norm": 9.00655024315941, "learning_rate": 1.4771845557079367e-07, "loss": 0.6764, "step": 3656 }, { "epoch": 0.361515458567086, "grad_norm": 4.252860320051604, "learning_rate": 1.4769031286002759e-07, "loss": 0.7111, "step": 3657 }, { "epoch": 0.36161431431184043, "grad_norm": 3.0528342298512583, "learning_rate": 1.4766216525932338e-07, "loss": 0.6207, "step": 3658 }, { "epoch": 0.3617131700565949, "grad_norm": 3.49128515906311, "learning_rate": 1.476340127715672e-07, "loss": 0.6417, "step": 3659 }, { "epoch": 0.36181202580134936, "grad_norm": 10.130568883581747, "learning_rate": 1.4760585539964566e-07, "loss": 0.6862, "step": 3660 }, { "epoch": 0.36191088154610385, "grad_norm": 3.7979188734732765, "learning_rate": 1.475776931464459e-07, "loss": 0.6852, "step": 3661 }, { "epoch": 0.36200973729085834, "grad_norm": 4.826450125747866, "learning_rate": 1.4754952601485552e-07, "loss": 0.7537, "step": 3662 }, { "epoch": 0.36210859303561277, "grad_norm": 4.73201016657891, "learning_rate": 1.4752135400776266e-07, "loss": 0.7248, "step": 3663 }, { "epoch": 0.36220744878036726, "grad_norm": 4.769561134201383, "learning_rate": 1.474931771280559e-07, "loss": 0.811, "step": 3664 }, { "epoch": 0.3623063045251217, "grad_norm": 4.2894055857568905, "learning_rate": 1.4746499537862445e-07, "loss": 0.7021, "step": 3665 }, { "epoch": 0.3624051602698762, "grad_norm": 3.8654516305296367, "learning_rate": 1.4743680876235783e-07, "loss": 0.8171, "step": 3666 }, { "epoch": 0.36250401601463067, "grad_norm": 3.222910537016421, "learning_rate": 1.4740861728214626e-07, "loss": 0.7587, "step": 3667 }, { "epoch": 0.3626028717593851, "grad_norm": 6.647395269469236, "learning_rate": 1.4738042094088028e-07, "loss": 0.7387, "step": 3668 }, { "epoch": 0.3627017275041396, "grad_norm": 5.24899423135289, "learning_rate": 1.4735221974145104e-07, "loss": 0.719, "step": 3669 }, { "epoch": 0.362800583248894, "grad_norm": 4.552162839509807, "learning_rate": 1.473240136867502e-07, "loss": 0.6894, "step": 3670 }, { "epoch": 0.3628994389936485, "grad_norm": 3.706143857815008, "learning_rate": 1.4729580277966981e-07, "loss": 0.6868, "step": 3671 }, { "epoch": 0.362998294738403, "grad_norm": 3.767087111052779, "learning_rate": 1.472675870231025e-07, "loss": 0.6068, "step": 3672 }, { "epoch": 0.36309715048315744, "grad_norm": 4.055451608309152, "learning_rate": 1.472393664199414e-07, "loss": 0.7999, "step": 3673 }, { "epoch": 0.3631960062279119, "grad_norm": 3.9857492064063615, "learning_rate": 1.4721114097308012e-07, "loss": 0.7349, "step": 3674 }, { "epoch": 0.36329486197266636, "grad_norm": 15.498486021687006, "learning_rate": 1.471829106854127e-07, "loss": 0.7611, "step": 3675 }, { "epoch": 0.36339371771742085, "grad_norm": 3.3546242230637873, "learning_rate": 1.471546755598338e-07, "loss": 0.7916, "step": 3676 }, { "epoch": 0.36349257346217534, "grad_norm": 4.32823231744246, "learning_rate": 1.4712643559923852e-07, "loss": 0.7383, "step": 3677 }, { "epoch": 0.3635914292069298, "grad_norm": 3.942507469591325, "learning_rate": 1.4709819080652244e-07, "loss": 0.7639, "step": 3678 }, { "epoch": 0.36369028495168426, "grad_norm": 6.20192126794547, "learning_rate": 1.470699411845816e-07, "loss": 0.7336, "step": 3679 }, { "epoch": 0.3637891406964387, "grad_norm": 2.847975375953498, "learning_rate": 1.4704168673631266e-07, "loss": 0.7267, "step": 3680 }, { "epoch": 0.3638879964411932, "grad_norm": 3.3847220707475967, "learning_rate": 1.4701342746461264e-07, "loss": 0.6306, "step": 3681 }, { "epoch": 0.3639868521859477, "grad_norm": 5.227086473188239, "learning_rate": 1.4698516337237916e-07, "loss": 0.7493, "step": 3682 }, { "epoch": 0.3640857079307021, "grad_norm": 4.0903294212781365, "learning_rate": 1.4695689446251024e-07, "loss": 0.7549, "step": 3683 }, { "epoch": 0.3641845636754566, "grad_norm": 6.391138634465423, "learning_rate": 1.4692862073790443e-07, "loss": 0.7116, "step": 3684 }, { "epoch": 0.36428341942021103, "grad_norm": 5.962367896957634, "learning_rate": 1.4690034220146083e-07, "loss": 0.6957, "step": 3685 }, { "epoch": 0.3643822751649655, "grad_norm": 3.2352943494835076, "learning_rate": 1.4687205885607896e-07, "loss": 0.6195, "step": 3686 }, { "epoch": 0.36448113090972, "grad_norm": 6.05635398614733, "learning_rate": 1.4684377070465889e-07, "loss": 0.7303, "step": 3687 }, { "epoch": 0.36457998665447444, "grad_norm": 5.8495066657352, "learning_rate": 1.468154777501011e-07, "loss": 0.6651, "step": 3688 }, { "epoch": 0.36467884239922893, "grad_norm": 3.8648590556467637, "learning_rate": 1.4678717999530666e-07, "loss": 0.688, "step": 3689 }, { "epoch": 0.3647776981439834, "grad_norm": 9.445869636338283, "learning_rate": 1.4675887744317706e-07, "loss": 0.7142, "step": 3690 }, { "epoch": 0.36487655388873785, "grad_norm": 2.979750569128963, "learning_rate": 1.4673057009661432e-07, "loss": 0.7772, "step": 3691 }, { "epoch": 0.36497540963349234, "grad_norm": 3.1053265448806355, "learning_rate": 1.4670225795852092e-07, "loss": 0.8139, "step": 3692 }, { "epoch": 0.3650742653782468, "grad_norm": 3.3995116526538998, "learning_rate": 1.466739410317999e-07, "loss": 0.644, "step": 3693 }, { "epoch": 0.36517312112300127, "grad_norm": 4.452139069508782, "learning_rate": 1.4664561931935474e-07, "loss": 0.7543, "step": 3694 }, { "epoch": 0.36527197686775575, "grad_norm": 4.643532483875229, "learning_rate": 1.4661729282408934e-07, "loss": 0.62, "step": 3695 }, { "epoch": 0.3653708326125102, "grad_norm": 3.7135638795007195, "learning_rate": 1.4658896154890825e-07, "loss": 0.7669, "step": 3696 }, { "epoch": 0.3654696883572647, "grad_norm": 5.290097389846457, "learning_rate": 1.465606254967164e-07, "loss": 0.7396, "step": 3697 }, { "epoch": 0.3655685441020191, "grad_norm": 3.2385668684521938, "learning_rate": 1.4653228467041917e-07, "loss": 0.6711, "step": 3698 }, { "epoch": 0.3656673998467736, "grad_norm": 3.9086768518953248, "learning_rate": 1.4650393907292257e-07, "loss": 0.6945, "step": 3699 }, { "epoch": 0.3657662555915281, "grad_norm": 3.592808297693197, "learning_rate": 1.4647558870713302e-07, "loss": 0.6037, "step": 3700 }, { "epoch": 0.3658651113362825, "grad_norm": 3.881343659392955, "learning_rate": 1.4644723357595738e-07, "loss": 0.6651, "step": 3701 }, { "epoch": 0.365963967081037, "grad_norm": 3.5552836221358506, "learning_rate": 1.464188736823031e-07, "loss": 0.6501, "step": 3702 }, { "epoch": 0.36606282282579145, "grad_norm": 4.663740398167636, "learning_rate": 1.4639050902907805e-07, "loss": 0.8298, "step": 3703 }, { "epoch": 0.36616167857054593, "grad_norm": 4.59516646973444, "learning_rate": 1.463621396191906e-07, "loss": 0.7064, "step": 3704 }, { "epoch": 0.3662605343153004, "grad_norm": 13.197558070435921, "learning_rate": 1.4633376545554966e-07, "loss": 0.5933, "step": 3705 }, { "epoch": 0.36635939006005486, "grad_norm": 5.528350042398596, "learning_rate": 1.463053865410645e-07, "loss": 0.7566, "step": 3706 }, { "epoch": 0.36645824580480935, "grad_norm": 3.628773762901117, "learning_rate": 1.4627700287864502e-07, "loss": 0.6146, "step": 3707 }, { "epoch": 0.3665571015495638, "grad_norm": 26.980812548083414, "learning_rate": 1.4624861447120155e-07, "loss": 0.7897, "step": 3708 }, { "epoch": 0.36665595729431827, "grad_norm": 4.226615883760169, "learning_rate": 1.4622022132164483e-07, "loss": 0.7244, "step": 3709 }, { "epoch": 0.36675481303907276, "grad_norm": 2.6952402148837833, "learning_rate": 1.4619182343288623e-07, "loss": 0.6694, "step": 3710 }, { "epoch": 0.3668536687838272, "grad_norm": 3.0884267528749385, "learning_rate": 1.461634208078375e-07, "loss": 0.8529, "step": 3711 }, { "epoch": 0.3669525245285817, "grad_norm": 5.629629300356241, "learning_rate": 1.4613501344941096e-07, "loss": 0.7982, "step": 3712 }, { "epoch": 0.3670513802733361, "grad_norm": 6.421780163941448, "learning_rate": 1.4610660136051928e-07, "loss": 0.7947, "step": 3713 }, { "epoch": 0.3671502360180906, "grad_norm": 3.436041828849022, "learning_rate": 1.460781845440758e-07, "loss": 0.7406, "step": 3714 }, { "epoch": 0.3672490917628451, "grad_norm": 4.642129919224962, "learning_rate": 1.460497630029942e-07, "loss": 0.6823, "step": 3715 }, { "epoch": 0.3673479475075995, "grad_norm": 3.9816058868343105, "learning_rate": 1.4602133674018867e-07, "loss": 0.736, "step": 3716 }, { "epoch": 0.367446803252354, "grad_norm": 4.643835763120923, "learning_rate": 1.459929057585739e-07, "loss": 0.819, "step": 3717 }, { "epoch": 0.36754565899710845, "grad_norm": 5.540647424418481, "learning_rate": 1.4596447006106513e-07, "loss": 0.7242, "step": 3718 }, { "epoch": 0.36764451474186294, "grad_norm": 4.668644764956431, "learning_rate": 1.459360296505779e-07, "loss": 0.6353, "step": 3719 }, { "epoch": 0.3677433704866174, "grad_norm": 15.938455990022064, "learning_rate": 1.459075845300285e-07, "loss": 0.6798, "step": 3720 }, { "epoch": 0.36784222623137186, "grad_norm": 6.259922330575343, "learning_rate": 1.4587913470233348e-07, "loss": 0.7163, "step": 3721 }, { "epoch": 0.36794108197612635, "grad_norm": 5.611970573918138, "learning_rate": 1.4585068017040996e-07, "loss": 0.7232, "step": 3722 }, { "epoch": 0.3680399377208808, "grad_norm": 6.892965548170632, "learning_rate": 1.4582222093717554e-07, "loss": 0.7947, "step": 3723 }, { "epoch": 0.3681387934656353, "grad_norm": 3.0431060098310576, "learning_rate": 1.4579375700554828e-07, "loss": 0.6657, "step": 3724 }, { "epoch": 0.36823764921038976, "grad_norm": 6.403457597244105, "learning_rate": 1.4576528837844674e-07, "loss": 0.7812, "step": 3725 }, { "epoch": 0.3683365049551442, "grad_norm": 4.5561572090310785, "learning_rate": 1.4573681505878996e-07, "loss": 0.6348, "step": 3726 }, { "epoch": 0.3684353606998987, "grad_norm": 8.894566503408681, "learning_rate": 1.4570833704949746e-07, "loss": 0.8652, "step": 3727 }, { "epoch": 0.3685342164446531, "grad_norm": 22.63536318870003, "learning_rate": 1.4567985435348926e-07, "loss": 0.7191, "step": 3728 }, { "epoch": 0.3686330721894076, "grad_norm": 3.477222386098271, "learning_rate": 1.456513669736858e-07, "loss": 0.8078, "step": 3729 }, { "epoch": 0.3687319279341621, "grad_norm": 4.324272999738785, "learning_rate": 1.4562287491300808e-07, "loss": 0.7225, "step": 3730 }, { "epoch": 0.36883078367891653, "grad_norm": 3.174075100210025, "learning_rate": 1.4559437817437753e-07, "loss": 0.6384, "step": 3731 }, { "epoch": 0.368929639423671, "grad_norm": 4.764392974913229, "learning_rate": 1.4556587676071602e-07, "loss": 0.7169, "step": 3732 }, { "epoch": 0.36902849516842545, "grad_norm": 4.448229127269209, "learning_rate": 1.4553737067494603e-07, "loss": 0.6438, "step": 3733 }, { "epoch": 0.36912735091317994, "grad_norm": 8.154185467828954, "learning_rate": 1.4550885991999038e-07, "loss": 0.6426, "step": 3734 }, { "epoch": 0.36922620665793443, "grad_norm": 6.012363866815673, "learning_rate": 1.4548034449877245e-07, "loss": 0.6403, "step": 3735 }, { "epoch": 0.36932506240268886, "grad_norm": 3.452356229150554, "learning_rate": 1.4545182441421607e-07, "loss": 0.782, "step": 3736 }, { "epoch": 0.36942391814744335, "grad_norm": 4.698745747368808, "learning_rate": 1.454232996692456e-07, "loss": 0.6974, "step": 3737 }, { "epoch": 0.3695227738921978, "grad_norm": 4.141339333448688, "learning_rate": 1.4539477026678576e-07, "loss": 0.7588, "step": 3738 }, { "epoch": 0.3696216296369523, "grad_norm": 7.588497593051641, "learning_rate": 1.4536623620976182e-07, "loss": 0.7688, "step": 3739 }, { "epoch": 0.36972048538170676, "grad_norm": 4.881478311060784, "learning_rate": 1.453376975010996e-07, "loss": 0.6645, "step": 3740 }, { "epoch": 0.3698193411264612, "grad_norm": 10.351039531887343, "learning_rate": 1.4530915414372526e-07, "loss": 0.7843, "step": 3741 }, { "epoch": 0.3699181968712157, "grad_norm": 8.734386296133778, "learning_rate": 1.4528060614056553e-07, "loss": 0.6563, "step": 3742 }, { "epoch": 0.3700170526159701, "grad_norm": 4.506233399890511, "learning_rate": 1.4525205349454756e-07, "loss": 0.6733, "step": 3743 }, { "epoch": 0.3701159083607246, "grad_norm": 3.8233482002636396, "learning_rate": 1.4522349620859904e-07, "loss": 0.7501, "step": 3744 }, { "epoch": 0.3702147641054791, "grad_norm": 3.058382078745008, "learning_rate": 1.4519493428564804e-07, "loss": 0.8108, "step": 3745 }, { "epoch": 0.37031361985023353, "grad_norm": 5.844453116367016, "learning_rate": 1.4516636772862322e-07, "loss": 0.7017, "step": 3746 }, { "epoch": 0.370412475594988, "grad_norm": 5.347106019110983, "learning_rate": 1.4513779654045367e-07, "loss": 0.7757, "step": 3747 }, { "epoch": 0.37051133133974246, "grad_norm": 9.375756622696139, "learning_rate": 1.451092207240689e-07, "loss": 0.695, "step": 3748 }, { "epoch": 0.37061018708449694, "grad_norm": 3.6499124050116927, "learning_rate": 1.4508064028239892e-07, "loss": 0.7319, "step": 3749 }, { "epoch": 0.37070904282925143, "grad_norm": 3.341110004299616, "learning_rate": 1.4505205521837431e-07, "loss": 0.6793, "step": 3750 }, { "epoch": 0.37080789857400587, "grad_norm": 3.3624991550569128, "learning_rate": 1.4502346553492598e-07, "loss": 0.6767, "step": 3751 }, { "epoch": 0.37090675431876036, "grad_norm": 15.549667202913941, "learning_rate": 1.449948712349854e-07, "loss": 0.6741, "step": 3752 }, { "epoch": 0.3710056100635148, "grad_norm": 2.93119167222858, "learning_rate": 1.4496627232148448e-07, "loss": 0.7107, "step": 3753 }, { "epoch": 0.3711044658082693, "grad_norm": 3.910226743062934, "learning_rate": 1.4493766879735567e-07, "loss": 0.8058, "step": 3754 }, { "epoch": 0.37120332155302377, "grad_norm": 4.6729889080985085, "learning_rate": 1.4490906066553178e-07, "loss": 0.6704, "step": 3755 }, { "epoch": 0.3713021772977782, "grad_norm": 4.693994670370568, "learning_rate": 1.4488044792894616e-07, "loss": 0.7837, "step": 3756 }, { "epoch": 0.3714010330425327, "grad_norm": 7.645498796116748, "learning_rate": 1.4485183059053268e-07, "loss": 0.7126, "step": 3757 }, { "epoch": 0.3714998887872871, "grad_norm": 3.7355431191151087, "learning_rate": 1.4482320865322555e-07, "loss": 0.8309, "step": 3758 }, { "epoch": 0.3715987445320416, "grad_norm": 4.263149997398231, "learning_rate": 1.447945821199596e-07, "loss": 0.7383, "step": 3759 }, { "epoch": 0.3716976002767961, "grad_norm": 4.945275260844403, "learning_rate": 1.4476595099366996e-07, "loss": 0.6729, "step": 3760 }, { "epoch": 0.37179645602155054, "grad_norm": 2.965647778574568, "learning_rate": 1.4473731527729242e-07, "loss": 0.7692, "step": 3761 }, { "epoch": 0.371895311766305, "grad_norm": 3.589069200099517, "learning_rate": 1.4470867497376314e-07, "loss": 0.6518, "step": 3762 }, { "epoch": 0.37199416751105946, "grad_norm": 2.8176045052626906, "learning_rate": 1.446800300860187e-07, "loss": 0.6101, "step": 3763 }, { "epoch": 0.37209302325581395, "grad_norm": 4.775632209173637, "learning_rate": 1.4465138061699626e-07, "loss": 0.679, "step": 3764 }, { "epoch": 0.37219187900056844, "grad_norm": 6.11156094451087, "learning_rate": 1.446227265696334e-07, "loss": 0.7105, "step": 3765 }, { "epoch": 0.37229073474532287, "grad_norm": 3.681850942029265, "learning_rate": 1.445940679468682e-07, "loss": 0.6859, "step": 3766 }, { "epoch": 0.37238959049007736, "grad_norm": 8.294940893741206, "learning_rate": 1.445654047516391e-07, "loss": 0.8124, "step": 3767 }, { "epoch": 0.37248844623483185, "grad_norm": 8.448794220248097, "learning_rate": 1.4453673698688514e-07, "loss": 0.667, "step": 3768 }, { "epoch": 0.3725873019795863, "grad_norm": 3.5678598371590344, "learning_rate": 1.4450806465554578e-07, "loss": 0.8021, "step": 3769 }, { "epoch": 0.37268615772434077, "grad_norm": 12.27858298265465, "learning_rate": 1.444793877605609e-07, "loss": 0.738, "step": 3770 }, { "epoch": 0.3727850134690952, "grad_norm": 4.403874511710134, "learning_rate": 1.444507063048709e-07, "loss": 0.8104, "step": 3771 }, { "epoch": 0.3728838692138497, "grad_norm": 4.882687293341145, "learning_rate": 1.4442202029141672e-07, "loss": 0.706, "step": 3772 }, { "epoch": 0.3729827249586042, "grad_norm": 6.787625511297325, "learning_rate": 1.443933297231396e-07, "loss": 0.7099, "step": 3773 }, { "epoch": 0.3730815807033586, "grad_norm": 7.967015031593697, "learning_rate": 1.4436463460298137e-07, "loss": 0.6684, "step": 3774 }, { "epoch": 0.3731804364481131, "grad_norm": 4.069003265275342, "learning_rate": 1.443359349338843e-07, "loss": 0.5601, "step": 3775 }, { "epoch": 0.37327929219286754, "grad_norm": 4.097953672965826, "learning_rate": 1.4430723071879108e-07, "loss": 0.6401, "step": 3776 }, { "epoch": 0.37337814793762203, "grad_norm": 4.791118977843389, "learning_rate": 1.4427852196064497e-07, "loss": 0.6876, "step": 3777 }, { "epoch": 0.3734770036823765, "grad_norm": 4.290826416435919, "learning_rate": 1.442498086623895e-07, "loss": 0.7287, "step": 3778 }, { "epoch": 0.37357585942713095, "grad_norm": 5.940333148719334, "learning_rate": 1.4422109082696897e-07, "loss": 0.7537, "step": 3779 }, { "epoch": 0.37367471517188544, "grad_norm": 3.094113599599272, "learning_rate": 1.4419236845732784e-07, "loss": 0.6326, "step": 3780 }, { "epoch": 0.3737735709166399, "grad_norm": 5.330764104455922, "learning_rate": 1.4416364155641122e-07, "loss": 0.7224, "step": 3781 }, { "epoch": 0.37387242666139436, "grad_norm": 3.5990684127346055, "learning_rate": 1.4413491012716458e-07, "loss": 0.6924, "step": 3782 }, { "epoch": 0.37397128240614885, "grad_norm": 5.406752879912292, "learning_rate": 1.4410617417253402e-07, "loss": 0.6825, "step": 3783 }, { "epoch": 0.3740701381509033, "grad_norm": 5.420243330807034, "learning_rate": 1.440774336954659e-07, "loss": 0.6311, "step": 3784 }, { "epoch": 0.3741689938956578, "grad_norm": 5.5675126859551085, "learning_rate": 1.4404868869890708e-07, "loss": 0.7887, "step": 3785 }, { "epoch": 0.3742678496404122, "grad_norm": 2.9830545145334755, "learning_rate": 1.4401993918580502e-07, "loss": 0.6898, "step": 3786 }, { "epoch": 0.3743667053851667, "grad_norm": 7.870631468791329, "learning_rate": 1.4399118515910756e-07, "loss": 0.6379, "step": 3787 }, { "epoch": 0.3744655611299212, "grad_norm": 4.431501225348336, "learning_rate": 1.4396242662176293e-07, "loss": 0.616, "step": 3788 }, { "epoch": 0.3745644168746756, "grad_norm": 5.975140565413247, "learning_rate": 1.4393366357671998e-07, "loss": 0.6488, "step": 3789 }, { "epoch": 0.3746632726194301, "grad_norm": 63.10675229816202, "learning_rate": 1.439048960269279e-07, "loss": 0.6558, "step": 3790 }, { "epoch": 0.37476212836418454, "grad_norm": 5.286250300182555, "learning_rate": 1.4387612397533637e-07, "loss": 0.7036, "step": 3791 }, { "epoch": 0.37486098410893903, "grad_norm": 2.909815980190188, "learning_rate": 1.4384734742489552e-07, "loss": 0.7135, "step": 3792 }, { "epoch": 0.3749598398536935, "grad_norm": 4.425359406861969, "learning_rate": 1.4381856637855603e-07, "loss": 0.6967, "step": 3793 }, { "epoch": 0.37505869559844796, "grad_norm": 5.1839098216375685, "learning_rate": 1.4378978083926893e-07, "loss": 0.6665, "step": 3794 }, { "epoch": 0.37515755134320244, "grad_norm": 3.3778017450646924, "learning_rate": 1.437609908099857e-07, "loss": 0.712, "step": 3795 }, { "epoch": 0.3752564070879569, "grad_norm": 3.8425475184270863, "learning_rate": 1.437321962936584e-07, "loss": 0.7607, "step": 3796 }, { "epoch": 0.37535526283271137, "grad_norm": 4.01863783061278, "learning_rate": 1.4370339729323946e-07, "loss": 0.7005, "step": 3797 }, { "epoch": 0.37545411857746586, "grad_norm": 7.890354412861929, "learning_rate": 1.4367459381168182e-07, "loss": 0.8143, "step": 3798 }, { "epoch": 0.3755529743222203, "grad_norm": 4.110045165196139, "learning_rate": 1.4364578585193883e-07, "loss": 0.812, "step": 3799 }, { "epoch": 0.3756518300669748, "grad_norm": 27.235236490852454, "learning_rate": 1.436169734169643e-07, "loss": 0.7559, "step": 3800 }, { "epoch": 0.3757506858117292, "grad_norm": 3.6101353361572754, "learning_rate": 1.4358815650971257e-07, "loss": 0.6582, "step": 3801 }, { "epoch": 0.3758495415564837, "grad_norm": 3.391699666442935, "learning_rate": 1.4355933513313833e-07, "loss": 0.6901, "step": 3802 }, { "epoch": 0.3759483973012382, "grad_norm": 4.048960003955002, "learning_rate": 1.4353050929019683e-07, "loss": 0.7612, "step": 3803 }, { "epoch": 0.3760472530459926, "grad_norm": 15.161078818077417, "learning_rate": 1.435016789838437e-07, "loss": 0.6033, "step": 3804 }, { "epoch": 0.3761461087907471, "grad_norm": 5.775847519567743, "learning_rate": 1.4347284421703511e-07, "loss": 0.771, "step": 3805 }, { "epoch": 0.37624496453550155, "grad_norm": 4.092839562617768, "learning_rate": 1.4344400499272763e-07, "loss": 0.728, "step": 3806 }, { "epoch": 0.37634382028025604, "grad_norm": 5.619419211931012, "learning_rate": 1.4341516131387824e-07, "loss": 0.7355, "step": 3807 }, { "epoch": 0.3764426760250105, "grad_norm": 3.5534246430558625, "learning_rate": 1.4338631318344453e-07, "loss": 0.6202, "step": 3808 }, { "epoch": 0.37654153176976496, "grad_norm": 3.858523324137024, "learning_rate": 1.4335746060438435e-07, "loss": 0.6827, "step": 3809 }, { "epoch": 0.37664038751451945, "grad_norm": 4.460448716039743, "learning_rate": 1.4332860357965619e-07, "loss": 0.7385, "step": 3810 }, { "epoch": 0.3767392432592739, "grad_norm": 3.4059069343345425, "learning_rate": 1.4329974211221885e-07, "loss": 0.7945, "step": 3811 }, { "epoch": 0.37683809900402837, "grad_norm": 3.1883169024925575, "learning_rate": 1.432708762050317e-07, "loss": 0.7677, "step": 3812 }, { "epoch": 0.37693695474878286, "grad_norm": 13.597756300761619, "learning_rate": 1.4324200586105444e-07, "loss": 0.7181, "step": 3813 }, { "epoch": 0.3770358104935373, "grad_norm": 7.344170018859479, "learning_rate": 1.4321313108324738e-07, "loss": 0.7277, "step": 3814 }, { "epoch": 0.3771346662382918, "grad_norm": 3.8690125318804376, "learning_rate": 1.4318425187457113e-07, "loss": 0.8051, "step": 3815 }, { "epoch": 0.3772335219830462, "grad_norm": 6.8656182320992825, "learning_rate": 1.4315536823798688e-07, "loss": 0.645, "step": 3816 }, { "epoch": 0.3773323777278007, "grad_norm": 6.167892756188442, "learning_rate": 1.4312648017645623e-07, "loss": 0.7771, "step": 3817 }, { "epoch": 0.3774312334725552, "grad_norm": 2.9291366154317644, "learning_rate": 1.4309758769294118e-07, "loss": 0.8183, "step": 3818 }, { "epoch": 0.37753008921730963, "grad_norm": 4.940510717736193, "learning_rate": 1.4306869079040425e-07, "loss": 0.7444, "step": 3819 }, { "epoch": 0.3776289449620641, "grad_norm": 3.909903448154188, "learning_rate": 1.4303978947180838e-07, "loss": 0.8454, "step": 3820 }, { "epoch": 0.37772780070681855, "grad_norm": 5.798104209610484, "learning_rate": 1.43010883740117e-07, "loss": 0.6903, "step": 3821 }, { "epoch": 0.37782665645157304, "grad_norm": 5.0191492644571, "learning_rate": 1.429819735982939e-07, "loss": 0.7647, "step": 3822 }, { "epoch": 0.37792551219632753, "grad_norm": 5.567695285222633, "learning_rate": 1.4295305904930346e-07, "loss": 0.743, "step": 3823 }, { "epoch": 0.37802436794108196, "grad_norm": 13.61106204393925, "learning_rate": 1.4292414009611041e-07, "loss": 0.7008, "step": 3824 }, { "epoch": 0.37812322368583645, "grad_norm": 4.848277654323795, "learning_rate": 1.4289521674168002e-07, "loss": 0.7036, "step": 3825 }, { "epoch": 0.3782220794305909, "grad_norm": 5.559587479072044, "learning_rate": 1.4286628898897784e-07, "loss": 0.8188, "step": 3826 }, { "epoch": 0.3783209351753454, "grad_norm": 5.93350931400336, "learning_rate": 1.4283735684097006e-07, "loss": 0.6989, "step": 3827 }, { "epoch": 0.37841979092009986, "grad_norm": 5.167998164283585, "learning_rate": 1.4280842030062327e-07, "loss": 0.6228, "step": 3828 }, { "epoch": 0.3785186466648543, "grad_norm": 7.604535306903161, "learning_rate": 1.427794793709044e-07, "loss": 0.6167, "step": 3829 }, { "epoch": 0.3786175024096088, "grad_norm": 3.8850834831101686, "learning_rate": 1.42750534054781e-07, "loss": 0.6653, "step": 3830 }, { "epoch": 0.3787163581543632, "grad_norm": 4.94632752981876, "learning_rate": 1.427215843552209e-07, "loss": 0.8099, "step": 3831 }, { "epoch": 0.3788152138991177, "grad_norm": 5.148228607709109, "learning_rate": 1.4269263027519256e-07, "loss": 0.7997, "step": 3832 }, { "epoch": 0.3789140696438722, "grad_norm": 3.4266682944272144, "learning_rate": 1.426636718176647e-07, "loss": 0.6893, "step": 3833 }, { "epoch": 0.37901292538862663, "grad_norm": 3.578283355351983, "learning_rate": 1.4263470898560664e-07, "loss": 0.7252, "step": 3834 }, { "epoch": 0.3791117811333811, "grad_norm": 4.165948236262646, "learning_rate": 1.426057417819881e-07, "loss": 0.7352, "step": 3835 }, { "epoch": 0.37921063687813555, "grad_norm": 6.488945808433723, "learning_rate": 1.4257677020977923e-07, "loss": 0.6822, "step": 3836 }, { "epoch": 0.37930949262289004, "grad_norm": 5.869491630806906, "learning_rate": 1.4254779427195063e-07, "loss": 0.7428, "step": 3837 }, { "epoch": 0.37940834836764453, "grad_norm": 6.442818607011678, "learning_rate": 1.4251881397147333e-07, "loss": 0.6585, "step": 3838 }, { "epoch": 0.37950720411239897, "grad_norm": 9.751769721421384, "learning_rate": 1.4248982931131885e-07, "loss": 0.6731, "step": 3839 }, { "epoch": 0.37960605985715346, "grad_norm": 4.618944956068225, "learning_rate": 1.4246084029445912e-07, "loss": 0.7327, "step": 3840 }, { "epoch": 0.3797049156019079, "grad_norm": 3.41188228045909, "learning_rate": 1.424318469238666e-07, "loss": 0.626, "step": 3841 }, { "epoch": 0.3798037713466624, "grad_norm": 3.852348390250874, "learning_rate": 1.4240284920251408e-07, "loss": 0.7286, "step": 3842 }, { "epoch": 0.37990262709141687, "grad_norm": 2.6976706310354697, "learning_rate": 1.4237384713337487e-07, "loss": 0.6587, "step": 3843 }, { "epoch": 0.3800014828361713, "grad_norm": 9.913093535044265, "learning_rate": 1.4234484071942267e-07, "loss": 0.8326, "step": 3844 }, { "epoch": 0.3801003385809258, "grad_norm": 4.670423523445183, "learning_rate": 1.4231582996363172e-07, "loss": 0.7012, "step": 3845 }, { "epoch": 0.3801991943256803, "grad_norm": 4.175514023360776, "learning_rate": 1.422868148689766e-07, "loss": 0.8026, "step": 3846 }, { "epoch": 0.3802980500704347, "grad_norm": 7.0416949971028835, "learning_rate": 1.4225779543843238e-07, "loss": 0.6571, "step": 3847 }, { "epoch": 0.3803969058151892, "grad_norm": 12.12374785406397, "learning_rate": 1.4222877167497455e-07, "loss": 0.7351, "step": 3848 }, { "epoch": 0.38049576155994363, "grad_norm": 4.7629493375642635, "learning_rate": 1.4219974358157916e-07, "loss": 0.7598, "step": 3849 }, { "epoch": 0.3805946173046981, "grad_norm": 4.5845521145546435, "learning_rate": 1.4217071116122252e-07, "loss": 0.7479, "step": 3850 }, { "epoch": 0.3806934730494526, "grad_norm": 5.154982854945801, "learning_rate": 1.421416744168815e-07, "loss": 0.6295, "step": 3851 }, { "epoch": 0.38079232879420705, "grad_norm": 6.2234535929846295, "learning_rate": 1.4211263335153343e-07, "loss": 0.7651, "step": 3852 }, { "epoch": 0.38089118453896154, "grad_norm": 4.994264675027666, "learning_rate": 1.4208358796815604e-07, "loss": 0.7437, "step": 3853 }, { "epoch": 0.38099004028371597, "grad_norm": 4.264777765053465, "learning_rate": 1.4205453826972745e-07, "loss": 0.7204, "step": 3854 }, { "epoch": 0.38108889602847046, "grad_norm": 3.687976008759733, "learning_rate": 1.420254842592263e-07, "loss": 0.5996, "step": 3855 }, { "epoch": 0.38118775177322495, "grad_norm": 4.029233603643152, "learning_rate": 1.4199642593963165e-07, "loss": 0.6611, "step": 3856 }, { "epoch": 0.3812866075179794, "grad_norm": 3.7569657525175657, "learning_rate": 1.4196736331392305e-07, "loss": 0.7859, "step": 3857 }, { "epoch": 0.38138546326273387, "grad_norm": 6.142873067158661, "learning_rate": 1.4193829638508037e-07, "loss": 0.7227, "step": 3858 }, { "epoch": 0.3814843190074883, "grad_norm": 5.03968909949774, "learning_rate": 1.4190922515608403e-07, "loss": 0.7299, "step": 3859 }, { "epoch": 0.3815831747522428, "grad_norm": 3.088592517274486, "learning_rate": 1.4188014962991488e-07, "loss": 0.7936, "step": 3860 }, { "epoch": 0.3816820304969973, "grad_norm": 3.6823715092175604, "learning_rate": 1.4185106980955415e-07, "loss": 0.6891, "step": 3861 }, { "epoch": 0.3817808862417517, "grad_norm": 4.523899212077057, "learning_rate": 1.4182198569798356e-07, "loss": 0.6506, "step": 3862 }, { "epoch": 0.3818797419865062, "grad_norm": 16.59667579842722, "learning_rate": 1.4179289729818525e-07, "loss": 0.7576, "step": 3863 }, { "epoch": 0.38197859773126064, "grad_norm": 3.825091570713382, "learning_rate": 1.4176380461314182e-07, "loss": 0.8613, "step": 3864 }, { "epoch": 0.3820774534760151, "grad_norm": 4.744488221725254, "learning_rate": 1.4173470764583627e-07, "loss": 0.7042, "step": 3865 }, { "epoch": 0.3821763092207696, "grad_norm": 6.452028072293079, "learning_rate": 1.4170560639925206e-07, "loss": 0.795, "step": 3866 }, { "epoch": 0.38227516496552405, "grad_norm": 4.170030550702709, "learning_rate": 1.4167650087637315e-07, "loss": 0.6372, "step": 3867 }, { "epoch": 0.38237402071027854, "grad_norm": 3.485443253154045, "learning_rate": 1.4164739108018386e-07, "loss": 0.6965, "step": 3868 }, { "epoch": 0.382472876455033, "grad_norm": 4.390355101053253, "learning_rate": 1.4161827701366893e-07, "loss": 0.7278, "step": 3869 }, { "epoch": 0.38257173219978746, "grad_norm": 5.295728553849626, "learning_rate": 1.415891586798136e-07, "loss": 0.688, "step": 3870 }, { "epoch": 0.38267058794454195, "grad_norm": 4.361576217936964, "learning_rate": 1.4156003608160353e-07, "loss": 0.8103, "step": 3871 }, { "epoch": 0.3827694436892964, "grad_norm": 4.679320363031833, "learning_rate": 1.4153090922202487e-07, "loss": 0.6994, "step": 3872 }, { "epoch": 0.3828682994340509, "grad_norm": 4.571448121916102, "learning_rate": 1.4150177810406404e-07, "loss": 0.7441, "step": 3873 }, { "epoch": 0.3829671551788053, "grad_norm": 7.111254056398886, "learning_rate": 1.4147264273070807e-07, "loss": 0.6669, "step": 3874 }, { "epoch": 0.3830660109235598, "grad_norm": 7.424086766704459, "learning_rate": 1.4144350310494434e-07, "loss": 0.7524, "step": 3875 }, { "epoch": 0.3831648666683143, "grad_norm": 3.439820533549137, "learning_rate": 1.414143592297607e-07, "loss": 0.8642, "step": 3876 }, { "epoch": 0.3832637224130687, "grad_norm": 6.693289072750498, "learning_rate": 1.4138521110814542e-07, "loss": 0.7834, "step": 3877 }, { "epoch": 0.3833625781578232, "grad_norm": 6.924196556687613, "learning_rate": 1.4135605874308725e-07, "loss": 0.7548, "step": 3878 }, { "epoch": 0.38346143390257764, "grad_norm": 2.672586463706325, "learning_rate": 1.413269021375753e-07, "loss": 0.6576, "step": 3879 }, { "epoch": 0.38356028964733213, "grad_norm": 5.085894950379209, "learning_rate": 1.4129774129459913e-07, "loss": 0.6437, "step": 3880 }, { "epoch": 0.3836591453920866, "grad_norm": 5.232923287911695, "learning_rate": 1.4126857621714877e-07, "loss": 0.7026, "step": 3881 }, { "epoch": 0.38375800113684105, "grad_norm": 3.6845415196924596, "learning_rate": 1.412394069082147e-07, "loss": 0.694, "step": 3882 }, { "epoch": 0.38385685688159554, "grad_norm": 4.140800181908264, "learning_rate": 1.4121023337078778e-07, "loss": 0.707, "step": 3883 }, { "epoch": 0.38395571262635, "grad_norm": 3.2287612543673543, "learning_rate": 1.4118105560785926e-07, "loss": 0.6521, "step": 3884 }, { "epoch": 0.38405456837110447, "grad_norm": 3.4739187737755106, "learning_rate": 1.41151873622421e-07, "loss": 0.7371, "step": 3885 }, { "epoch": 0.38415342411585895, "grad_norm": 3.84977434803961, "learning_rate": 1.4112268741746513e-07, "loss": 0.7405, "step": 3886 }, { "epoch": 0.3842522798606134, "grad_norm": 3.9681373614406827, "learning_rate": 1.4109349699598428e-07, "loss": 0.6825, "step": 3887 }, { "epoch": 0.3843511356053679, "grad_norm": 4.884763885652891, "learning_rate": 1.4106430236097144e-07, "loss": 0.7312, "step": 3888 }, { "epoch": 0.3844499913501223, "grad_norm": 3.5926853310490996, "learning_rate": 1.4103510351542018e-07, "loss": 0.7544, "step": 3889 }, { "epoch": 0.3845488470948768, "grad_norm": 4.677172552829473, "learning_rate": 1.4100590046232434e-07, "loss": 0.68, "step": 3890 }, { "epoch": 0.3846477028396313, "grad_norm": 7.634700466142064, "learning_rate": 1.4097669320467832e-07, "loss": 0.8012, "step": 3891 }, { "epoch": 0.3847465585843857, "grad_norm": 3.0767658583021844, "learning_rate": 1.409474817454768e-07, "loss": 0.6991, "step": 3892 }, { "epoch": 0.3848454143291402, "grad_norm": 6.974124381910248, "learning_rate": 1.4091826608771506e-07, "loss": 0.8201, "step": 3893 }, { "epoch": 0.38494427007389465, "grad_norm": 6.349902884995338, "learning_rate": 1.4088904623438875e-07, "loss": 0.6555, "step": 3894 }, { "epoch": 0.38504312581864913, "grad_norm": 9.851481321510652, "learning_rate": 1.408598221884939e-07, "loss": 0.7219, "step": 3895 }, { "epoch": 0.3851419815634036, "grad_norm": 4.702174022580679, "learning_rate": 1.4083059395302698e-07, "loss": 0.7198, "step": 3896 }, { "epoch": 0.38524083730815806, "grad_norm": 4.7163923415536155, "learning_rate": 1.4080136153098497e-07, "loss": 0.7643, "step": 3897 }, { "epoch": 0.38533969305291255, "grad_norm": 5.507335081019142, "learning_rate": 1.4077212492536522e-07, "loss": 0.7426, "step": 3898 }, { "epoch": 0.385438548797667, "grad_norm": 4.56634335411123, "learning_rate": 1.4074288413916543e-07, "loss": 0.7708, "step": 3899 }, { "epoch": 0.38553740454242147, "grad_norm": 3.125313607395516, "learning_rate": 1.4071363917538393e-07, "loss": 0.6577, "step": 3900 }, { "epoch": 0.38563626028717596, "grad_norm": 3.445990557425034, "learning_rate": 1.4068439003701929e-07, "loss": 0.7158, "step": 3901 }, { "epoch": 0.3857351160319304, "grad_norm": 4.546979720419309, "learning_rate": 1.4065513672707056e-07, "loss": 0.7741, "step": 3902 }, { "epoch": 0.3858339717766849, "grad_norm": 3.8098091416924875, "learning_rate": 1.4062587924853727e-07, "loss": 0.7902, "step": 3903 }, { "epoch": 0.3859328275214393, "grad_norm": 3.7955486472633813, "learning_rate": 1.4059661760441935e-07, "loss": 0.7642, "step": 3904 }, { "epoch": 0.3860316832661938, "grad_norm": 3.762734986400821, "learning_rate": 1.4056735179771712e-07, "loss": 0.6855, "step": 3905 }, { "epoch": 0.3861305390109483, "grad_norm": 3.5483592976486995, "learning_rate": 1.405380818314314e-07, "loss": 0.7162, "step": 3906 }, { "epoch": 0.3862293947557027, "grad_norm": 3.3902630681461012, "learning_rate": 1.4050880770856336e-07, "loss": 0.6429, "step": 3907 }, { "epoch": 0.3863282505004572, "grad_norm": 5.675720849112474, "learning_rate": 1.4047952943211464e-07, "loss": 0.6679, "step": 3908 }, { "epoch": 0.38642710624521165, "grad_norm": 6.069390012547412, "learning_rate": 1.4045024700508727e-07, "loss": 0.7184, "step": 3909 }, { "epoch": 0.38652596198996614, "grad_norm": 12.039207879403753, "learning_rate": 1.4042096043048374e-07, "loss": 0.7248, "step": 3910 }, { "epoch": 0.3866248177347206, "grad_norm": 3.9312315834379707, "learning_rate": 1.40391669711307e-07, "loss": 0.7291, "step": 3911 }, { "epoch": 0.38672367347947506, "grad_norm": 6.990248611373263, "learning_rate": 1.4036237485056034e-07, "loss": 0.6484, "step": 3912 }, { "epoch": 0.38682252922422955, "grad_norm": 3.3698788327852003, "learning_rate": 1.4033307585124754e-07, "loss": 0.7032, "step": 3913 }, { "epoch": 0.386921384968984, "grad_norm": 3.7472316664794145, "learning_rate": 1.4030377271637272e-07, "loss": 0.7491, "step": 3914 }, { "epoch": 0.3870202407137385, "grad_norm": 3.8988200847559122, "learning_rate": 1.4027446544894059e-07, "loss": 0.7178, "step": 3915 }, { "epoch": 0.38711909645849296, "grad_norm": 4.032716587987853, "learning_rate": 1.4024515405195613e-07, "loss": 0.7225, "step": 3916 }, { "epoch": 0.3872179522032474, "grad_norm": 9.150499973970511, "learning_rate": 1.4021583852842472e-07, "loss": 0.7027, "step": 3917 }, { "epoch": 0.3873168079480019, "grad_norm": 7.444709494176636, "learning_rate": 1.4018651888135232e-07, "loss": 0.7695, "step": 3918 }, { "epoch": 0.3874156636927563, "grad_norm": 4.799775726204708, "learning_rate": 1.401571951137452e-07, "loss": 0.7079, "step": 3919 }, { "epoch": 0.3875145194375108, "grad_norm": 3.3752479496614063, "learning_rate": 1.4012786722861008e-07, "loss": 0.6128, "step": 3920 }, { "epoch": 0.3876133751822653, "grad_norm": 5.675649348640341, "learning_rate": 1.4009853522895406e-07, "loss": 0.6008, "step": 3921 }, { "epoch": 0.38771223092701973, "grad_norm": 4.001744964228146, "learning_rate": 1.4006919911778482e-07, "loss": 0.7433, "step": 3922 }, { "epoch": 0.3878110866717742, "grad_norm": 4.183878413131544, "learning_rate": 1.4003985889811024e-07, "loss": 0.7659, "step": 3923 }, { "epoch": 0.3879099424165287, "grad_norm": 3.687098778421048, "learning_rate": 1.4001051457293878e-07, "loss": 0.7972, "step": 3924 }, { "epoch": 0.38800879816128314, "grad_norm": 4.35359288643428, "learning_rate": 1.399811661452792e-07, "loss": 0.7175, "step": 3925 }, { "epoch": 0.38810765390603763, "grad_norm": 3.953724201032734, "learning_rate": 1.3995181361814087e-07, "loss": 0.6272, "step": 3926 }, { "epoch": 0.38820650965079206, "grad_norm": 5.584745759158036, "learning_rate": 1.3992245699453335e-07, "loss": 0.7085, "step": 3927 }, { "epoch": 0.38830536539554655, "grad_norm": 4.516091129561202, "learning_rate": 1.398930962774667e-07, "loss": 0.7604, "step": 3928 }, { "epoch": 0.38840422114030104, "grad_norm": 100.33700115759795, "learning_rate": 1.3986373146995155e-07, "loss": 0.8592, "step": 3929 }, { "epoch": 0.3885030768850555, "grad_norm": 3.7630701700223685, "learning_rate": 1.3983436257499878e-07, "loss": 0.7146, "step": 3930 }, { "epoch": 0.38860193262980997, "grad_norm": 11.40197766691148, "learning_rate": 1.398049895956197e-07, "loss": 0.8082, "step": 3931 }, { "epoch": 0.3887007883745644, "grad_norm": 4.502894994983002, "learning_rate": 1.397756125348261e-07, "loss": 0.5928, "step": 3932 }, { "epoch": 0.3887996441193189, "grad_norm": 3.0854630212166447, "learning_rate": 1.397462313956302e-07, "loss": 0.7407, "step": 3933 }, { "epoch": 0.3888984998640734, "grad_norm": 3.536696016464864, "learning_rate": 1.3971684618104456e-07, "loss": 0.8516, "step": 3934 }, { "epoch": 0.3889973556088278, "grad_norm": 3.448537403137319, "learning_rate": 1.396874568940822e-07, "loss": 0.7414, "step": 3935 }, { "epoch": 0.3890962113535823, "grad_norm": 3.614189885706666, "learning_rate": 1.3965806353775653e-07, "loss": 0.6138, "step": 3936 }, { "epoch": 0.38919506709833673, "grad_norm": 22.539773619085196, "learning_rate": 1.396286661150815e-07, "loss": 0.8686, "step": 3937 }, { "epoch": 0.3892939228430912, "grad_norm": 3.4897583064533597, "learning_rate": 1.395992646290713e-07, "loss": 0.7075, "step": 3938 }, { "epoch": 0.3893927785878457, "grad_norm": 5.3200450108196975, "learning_rate": 1.395698590827406e-07, "loss": 0.6451, "step": 3939 }, { "epoch": 0.38949163433260015, "grad_norm": 4.036642057361084, "learning_rate": 1.395404494791046e-07, "loss": 0.7224, "step": 3940 }, { "epoch": 0.38959049007735463, "grad_norm": 16.502682297018513, "learning_rate": 1.3951103582117875e-07, "loss": 0.8234, "step": 3941 }, { "epoch": 0.38968934582210907, "grad_norm": 3.5072238205148256, "learning_rate": 1.3948161811197902e-07, "loss": 0.7998, "step": 3942 }, { "epoch": 0.38978820156686356, "grad_norm": 3.9304980286823037, "learning_rate": 1.3945219635452174e-07, "loss": 0.6395, "step": 3943 }, { "epoch": 0.38988705731161805, "grad_norm": 3.3061010208115356, "learning_rate": 1.394227705518237e-07, "loss": 0.7203, "step": 3944 }, { "epoch": 0.3899859130563725, "grad_norm": 3.365491894419096, "learning_rate": 1.3939334070690204e-07, "loss": 0.7977, "step": 3945 }, { "epoch": 0.39008476880112697, "grad_norm": 3.5060786997470266, "learning_rate": 1.393639068227744e-07, "loss": 0.657, "step": 3946 }, { "epoch": 0.3901836245458814, "grad_norm": 4.078831332801902, "learning_rate": 1.3933446890245876e-07, "loss": 0.6437, "step": 3947 }, { "epoch": 0.3902824802906359, "grad_norm": 10.377797885712024, "learning_rate": 1.3930502694897358e-07, "loss": 0.7615, "step": 3948 }, { "epoch": 0.3903813360353904, "grad_norm": 8.11413733549631, "learning_rate": 1.3927558096533764e-07, "loss": 0.7592, "step": 3949 }, { "epoch": 0.3904801917801448, "grad_norm": 9.152498073936858, "learning_rate": 1.3924613095457026e-07, "loss": 0.6687, "step": 3950 }, { "epoch": 0.3905790475248993, "grad_norm": 2.9845396965966198, "learning_rate": 1.3921667691969105e-07, "loss": 0.7113, "step": 3951 }, { "epoch": 0.39067790326965374, "grad_norm": 42.97942356262272, "learning_rate": 1.3918721886372014e-07, "loss": 0.7648, "step": 3952 }, { "epoch": 0.3907767590144082, "grad_norm": 3.0252759197155648, "learning_rate": 1.3915775678967796e-07, "loss": 0.7353, "step": 3953 }, { "epoch": 0.3908756147591627, "grad_norm": 3.236379253890259, "learning_rate": 1.3912829070058544e-07, "loss": 0.7718, "step": 3954 }, { "epoch": 0.39097447050391715, "grad_norm": 4.935773505157943, "learning_rate": 1.390988205994639e-07, "loss": 0.7087, "step": 3955 }, { "epoch": 0.39107332624867164, "grad_norm": 3.5016825679238086, "learning_rate": 1.3906934648933506e-07, "loss": 0.7452, "step": 3956 }, { "epoch": 0.39117218199342607, "grad_norm": 10.021434670579003, "learning_rate": 1.3903986837322106e-07, "loss": 0.6824, "step": 3957 }, { "epoch": 0.39127103773818056, "grad_norm": 3.6357160874283, "learning_rate": 1.390103862541444e-07, "loss": 0.7224, "step": 3958 }, { "epoch": 0.39136989348293505, "grad_norm": 6.162404678703686, "learning_rate": 1.389809001351281e-07, "loss": 0.6583, "step": 3959 }, { "epoch": 0.3914687492276895, "grad_norm": 3.4819435228178945, "learning_rate": 1.389514100191955e-07, "loss": 0.7436, "step": 3960 }, { "epoch": 0.39156760497244397, "grad_norm": 3.260804084695177, "learning_rate": 1.3892191590937038e-07, "loss": 0.6883, "step": 3961 }, { "epoch": 0.3916664607171984, "grad_norm": 4.5596200940155605, "learning_rate": 1.3889241780867691e-07, "loss": 0.6893, "step": 3962 }, { "epoch": 0.3917653164619529, "grad_norm": 3.87831445921358, "learning_rate": 1.388629157201397e-07, "loss": 0.6883, "step": 3963 }, { "epoch": 0.3918641722067074, "grad_norm": 7.221312059468625, "learning_rate": 1.3883340964678376e-07, "loss": 0.6418, "step": 3964 }, { "epoch": 0.3919630279514618, "grad_norm": 3.5948307170376257, "learning_rate": 1.3880389959163444e-07, "loss": 0.7604, "step": 3965 }, { "epoch": 0.3920618836962163, "grad_norm": 3.3372034685319263, "learning_rate": 1.387743855577177e-07, "loss": 0.8553, "step": 3966 }, { "epoch": 0.39216073944097074, "grad_norm": 2.843255974140833, "learning_rate": 1.3874486754805963e-07, "loss": 0.7611, "step": 3967 }, { "epoch": 0.39225959518572523, "grad_norm": 3.7761235293642277, "learning_rate": 1.3871534556568695e-07, "loss": 0.8032, "step": 3968 }, { "epoch": 0.3923584509304797, "grad_norm": 14.213313193347721, "learning_rate": 1.3868581961362665e-07, "loss": 0.6968, "step": 3969 }, { "epoch": 0.39245730667523415, "grad_norm": 6.757362479401202, "learning_rate": 1.386562896949062e-07, "loss": 0.6831, "step": 3970 }, { "epoch": 0.39255616241998864, "grad_norm": 15.427731334849755, "learning_rate": 1.3862675581255346e-07, "loss": 0.7974, "step": 3971 }, { "epoch": 0.3926550181647431, "grad_norm": 6.753785678257378, "learning_rate": 1.385972179695967e-07, "loss": 0.6699, "step": 3972 }, { "epoch": 0.39275387390949756, "grad_norm": 3.118284784028652, "learning_rate": 1.3856767616906462e-07, "loss": 0.7338, "step": 3973 }, { "epoch": 0.39285272965425205, "grad_norm": 8.566998372639103, "learning_rate": 1.3853813041398623e-07, "loss": 0.7152, "step": 3974 }, { "epoch": 0.3929515853990065, "grad_norm": 3.3351316661012604, "learning_rate": 1.3850858070739107e-07, "loss": 0.8157, "step": 3975 }, { "epoch": 0.393050441143761, "grad_norm": 5.219607904892859, "learning_rate": 1.38479027052309e-07, "loss": 0.6135, "step": 3976 }, { "epoch": 0.3931492968885154, "grad_norm": 3.604949472845347, "learning_rate": 1.3844946945177027e-07, "loss": 0.6143, "step": 3977 }, { "epoch": 0.3932481526332699, "grad_norm": 8.020503338586627, "learning_rate": 1.3841990790880566e-07, "loss": 0.6776, "step": 3978 }, { "epoch": 0.3933470083780244, "grad_norm": 3.630927745504876, "learning_rate": 1.383903424264462e-07, "loss": 0.7986, "step": 3979 }, { "epoch": 0.3934458641227788, "grad_norm": 3.1814130525572066, "learning_rate": 1.383607730077234e-07, "loss": 0.7367, "step": 3980 }, { "epoch": 0.3935447198675333, "grad_norm": 3.3592782351311667, "learning_rate": 1.3833119965566923e-07, "loss": 0.6911, "step": 3981 }, { "epoch": 0.39364357561228774, "grad_norm": 9.981549653414024, "learning_rate": 1.3830162237331595e-07, "loss": 0.7729, "step": 3982 }, { "epoch": 0.39374243135704223, "grad_norm": 4.355782260320361, "learning_rate": 1.382720411636963e-07, "loss": 0.7506, "step": 3983 }, { "epoch": 0.3938412871017967, "grad_norm": 6.914409616613802, "learning_rate": 1.3824245602984334e-07, "loss": 0.784, "step": 3984 }, { "epoch": 0.39394014284655116, "grad_norm": 12.074656431495715, "learning_rate": 1.3821286697479067e-07, "loss": 0.7456, "step": 3985 }, { "epoch": 0.39403899859130564, "grad_norm": 25.201140030482893, "learning_rate": 1.3818327400157215e-07, "loss": 0.7682, "step": 3986 }, { "epoch": 0.3941378543360601, "grad_norm": 3.4357751250433433, "learning_rate": 1.3815367711322215e-07, "loss": 0.7077, "step": 3987 }, { "epoch": 0.39423671008081457, "grad_norm": 20.46404228847062, "learning_rate": 1.3812407631277534e-07, "loss": 0.7113, "step": 3988 }, { "epoch": 0.39433556582556906, "grad_norm": 4.257612710271797, "learning_rate": 1.3809447160326688e-07, "loss": 0.6631, "step": 3989 }, { "epoch": 0.3944344215703235, "grad_norm": 3.8656963509398987, "learning_rate": 1.3806486298773228e-07, "loss": 0.653, "step": 3990 }, { "epoch": 0.394533277315078, "grad_norm": 3.751104890093559, "learning_rate": 1.380352504692075e-07, "loss": 0.7484, "step": 3991 }, { "epoch": 0.3946321330598324, "grad_norm": 4.740090446677816, "learning_rate": 1.3800563405072882e-07, "loss": 0.6134, "step": 3992 }, { "epoch": 0.3947309888045869, "grad_norm": 4.869731284667213, "learning_rate": 1.37976013735333e-07, "loss": 0.7987, "step": 3993 }, { "epoch": 0.3948298445493414, "grad_norm": 10.245722370497834, "learning_rate": 1.3794638952605716e-07, "loss": 0.8081, "step": 3994 }, { "epoch": 0.3949287002940958, "grad_norm": 3.6480969213401706, "learning_rate": 1.3791676142593884e-07, "loss": 0.753, "step": 3995 }, { "epoch": 0.3950275560388503, "grad_norm": 3.45310251226282, "learning_rate": 1.3788712943801594e-07, "loss": 0.7191, "step": 3996 }, { "epoch": 0.39512641178360475, "grad_norm": 7.080359996086882, "learning_rate": 1.3785749356532679e-07, "loss": 0.7597, "step": 3997 }, { "epoch": 0.39522526752835924, "grad_norm": 5.7720015767131425, "learning_rate": 1.378278538109101e-07, "loss": 0.8329, "step": 3998 }, { "epoch": 0.3953241232731137, "grad_norm": 3.8972610892052835, "learning_rate": 1.37798210177805e-07, "loss": 0.7919, "step": 3999 }, { "epoch": 0.39542297901786816, "grad_norm": 3.7600148325388005, "learning_rate": 1.3776856266905102e-07, "loss": 0.7614, "step": 4000 }, { "epoch": 0.39552183476262265, "grad_norm": 3.83611771663307, "learning_rate": 1.3773891128768803e-07, "loss": 0.7161, "step": 4001 }, { "epoch": 0.39562069050737714, "grad_norm": 5.4779545353004915, "learning_rate": 1.377092560367564e-07, "loss": 0.7545, "step": 4002 }, { "epoch": 0.39571954625213157, "grad_norm": 3.9287786967075475, "learning_rate": 1.3767959691929684e-07, "loss": 0.7248, "step": 4003 }, { "epoch": 0.39581840199688606, "grad_norm": 2.474771019161882, "learning_rate": 1.3764993393835042e-07, "loss": 0.7199, "step": 4004 }, { "epoch": 0.3959172577416405, "grad_norm": 3.782543037957009, "learning_rate": 1.3762026709695862e-07, "loss": 0.6962, "step": 4005 }, { "epoch": 0.396016113486395, "grad_norm": 7.107575656085507, "learning_rate": 1.375905963981634e-07, "loss": 0.6905, "step": 4006 }, { "epoch": 0.39611496923114947, "grad_norm": 4.404630015438051, "learning_rate": 1.3756092184500698e-07, "loss": 0.7111, "step": 4007 }, { "epoch": 0.3962138249759039, "grad_norm": 3.4650878363545603, "learning_rate": 1.3753124344053212e-07, "loss": 0.6405, "step": 4008 }, { "epoch": 0.3963126807206584, "grad_norm": 7.227359937431192, "learning_rate": 1.3750156118778184e-07, "loss": 0.7142, "step": 4009 }, { "epoch": 0.39641153646541283, "grad_norm": 5.7591457990245285, "learning_rate": 1.3747187508979968e-07, "loss": 0.7305, "step": 4010 }, { "epoch": 0.3965103922101673, "grad_norm": 3.6305039026454726, "learning_rate": 1.3744218514962946e-07, "loss": 0.6313, "step": 4011 }, { "epoch": 0.3966092479549218, "grad_norm": 4.206663973658014, "learning_rate": 1.374124913703155e-07, "loss": 0.7875, "step": 4012 }, { "epoch": 0.39670810369967624, "grad_norm": 5.397436446944782, "learning_rate": 1.373827937549024e-07, "loss": 0.7624, "step": 4013 }, { "epoch": 0.39680695944443073, "grad_norm": 3.321113422746335, "learning_rate": 1.3735309230643523e-07, "loss": 0.7111, "step": 4014 }, { "epoch": 0.39690581518918516, "grad_norm": 3.1702087209170395, "learning_rate": 1.3732338702795944e-07, "loss": 0.7555, "step": 4015 }, { "epoch": 0.39700467093393965, "grad_norm": 3.8134528210713254, "learning_rate": 1.3729367792252085e-07, "loss": 0.73, "step": 4016 }, { "epoch": 0.39710352667869414, "grad_norm": 2.960023740250979, "learning_rate": 1.3726396499316572e-07, "loss": 0.7241, "step": 4017 }, { "epoch": 0.3972023824234486, "grad_norm": 11.6207427374832, "learning_rate": 1.3723424824294067e-07, "loss": 0.7334, "step": 4018 }, { "epoch": 0.39730123816820306, "grad_norm": 4.626121052753708, "learning_rate": 1.372045276748927e-07, "loss": 0.6166, "step": 4019 }, { "epoch": 0.3974000939129575, "grad_norm": 8.614831745943947, "learning_rate": 1.3717480329206925e-07, "loss": 0.6984, "step": 4020 }, { "epoch": 0.397498949657712, "grad_norm": 10.262874150053017, "learning_rate": 1.371450750975181e-07, "loss": 0.7176, "step": 4021 }, { "epoch": 0.3975978054024665, "grad_norm": 4.137836423560795, "learning_rate": 1.371153430942874e-07, "loss": 0.7064, "step": 4022 }, { "epoch": 0.3976966611472209, "grad_norm": 4.588145825341124, "learning_rate": 1.3708560728542572e-07, "loss": 0.7565, "step": 4023 }, { "epoch": 0.3977955168919754, "grad_norm": 4.0464427444575835, "learning_rate": 1.370558676739821e-07, "loss": 0.706, "step": 4024 }, { "epoch": 0.39789437263672983, "grad_norm": 4.732773842949324, "learning_rate": 1.3702612426300587e-07, "loss": 0.748, "step": 4025 }, { "epoch": 0.3979932283814843, "grad_norm": 7.278629800390711, "learning_rate": 1.3699637705554674e-07, "loss": 0.6705, "step": 4026 }, { "epoch": 0.3980920841262388, "grad_norm": 4.246837129154639, "learning_rate": 1.3696662605465489e-07, "loss": 0.7201, "step": 4027 }, { "epoch": 0.39819093987099324, "grad_norm": 4.344085865751359, "learning_rate": 1.3693687126338086e-07, "loss": 0.7068, "step": 4028 }, { "epoch": 0.39828979561574773, "grad_norm": 3.1712231808989078, "learning_rate": 1.369071126847755e-07, "loss": 0.5849, "step": 4029 }, { "epoch": 0.39838865136050217, "grad_norm": 6.578610152910971, "learning_rate": 1.3687735032189022e-07, "loss": 0.5786, "step": 4030 }, { "epoch": 0.39848750710525666, "grad_norm": 8.284407053384417, "learning_rate": 1.3684758417777655e-07, "loss": 0.6598, "step": 4031 }, { "epoch": 0.39858636285001114, "grad_norm": 4.523601075982932, "learning_rate": 1.3681781425548672e-07, "loss": 0.6711, "step": 4032 }, { "epoch": 0.3986852185947656, "grad_norm": 3.914255274973054, "learning_rate": 1.3678804055807314e-07, "loss": 0.7347, "step": 4033 }, { "epoch": 0.39878407433952007, "grad_norm": 5.629800474897567, "learning_rate": 1.3675826308858862e-07, "loss": 0.6336, "step": 4034 }, { "epoch": 0.3988829300842745, "grad_norm": 3.6712606184518557, "learning_rate": 1.367284818500865e-07, "loss": 0.7386, "step": 4035 }, { "epoch": 0.398981785829029, "grad_norm": 3.64098127686032, "learning_rate": 1.3669869684562032e-07, "loss": 0.6257, "step": 4036 }, { "epoch": 0.3990806415737835, "grad_norm": 4.869767129871034, "learning_rate": 1.3666890807824411e-07, "loss": 0.6784, "step": 4037 }, { "epoch": 0.3991794973185379, "grad_norm": 5.796957005290569, "learning_rate": 1.366391155510123e-07, "loss": 0.7015, "step": 4038 }, { "epoch": 0.3992783530632924, "grad_norm": 2.8894908798086756, "learning_rate": 1.3660931926697967e-07, "loss": 0.643, "step": 4039 }, { "epoch": 0.39937720880804684, "grad_norm": 10.104397118841547, "learning_rate": 1.3657951922920137e-07, "loss": 0.761, "step": 4040 }, { "epoch": 0.3994760645528013, "grad_norm": 3.339828871181091, "learning_rate": 1.3654971544073293e-07, "loss": 0.6703, "step": 4041 }, { "epoch": 0.3995749202975558, "grad_norm": 3.4697652639366945, "learning_rate": 1.3651990790463033e-07, "loss": 0.7497, "step": 4042 }, { "epoch": 0.39967377604231025, "grad_norm": 3.915813820660208, "learning_rate": 1.3649009662394988e-07, "loss": 0.6328, "step": 4043 }, { "epoch": 0.39977263178706474, "grad_norm": 7.040194932655014, "learning_rate": 1.364602816017483e-07, "loss": 0.6508, "step": 4044 }, { "epoch": 0.39987148753181917, "grad_norm": 4.980836509271625, "learning_rate": 1.3643046284108264e-07, "loss": 0.6762, "step": 4045 }, { "epoch": 0.39997034327657366, "grad_norm": 2.8818179378940694, "learning_rate": 1.364006403450104e-07, "loss": 0.5983, "step": 4046 }, { "epoch": 0.40006919902132815, "grad_norm": 23.09596715590435, "learning_rate": 1.3637081411658952e-07, "loss": 0.7155, "step": 4047 }, { "epoch": 0.4001680547660826, "grad_norm": 8.286993696904812, "learning_rate": 1.363409841588781e-07, "loss": 0.8009, "step": 4048 }, { "epoch": 0.40026691051083707, "grad_norm": 4.652889838141533, "learning_rate": 1.363111504749348e-07, "loss": 0.7487, "step": 4049 }, { "epoch": 0.4003657662555915, "grad_norm": 6.903087202743209, "learning_rate": 1.3628131306781867e-07, "loss": 0.7661, "step": 4050 }, { "epoch": 0.400464622000346, "grad_norm": 5.200728879720409, "learning_rate": 1.3625147194058907e-07, "loss": 0.6334, "step": 4051 }, { "epoch": 0.4005634777451005, "grad_norm": 3.757789092384292, "learning_rate": 1.3622162709630578e-07, "loss": 0.7543, "step": 4052 }, { "epoch": 0.4006623334898549, "grad_norm": 3.2770684079798107, "learning_rate": 1.3619177853802892e-07, "loss": 0.7857, "step": 4053 }, { "epoch": 0.4007611892346094, "grad_norm": 7.21133612661471, "learning_rate": 1.3616192626881903e-07, "loss": 0.6842, "step": 4054 }, { "epoch": 0.40086004497936384, "grad_norm": 4.8346287612410315, "learning_rate": 1.3613207029173708e-07, "loss": 0.8077, "step": 4055 }, { "epoch": 0.40095890072411833, "grad_norm": 3.8399678086675073, "learning_rate": 1.3610221060984427e-07, "loss": 0.6734, "step": 4056 }, { "epoch": 0.4010577564688728, "grad_norm": 4.1244589780258964, "learning_rate": 1.360723472262023e-07, "loss": 0.7141, "step": 4057 }, { "epoch": 0.40115661221362725, "grad_norm": 5.697798411548719, "learning_rate": 1.3604248014387325e-07, "loss": 0.6845, "step": 4058 }, { "epoch": 0.40125546795838174, "grad_norm": 4.19376393613447, "learning_rate": 1.3601260936591953e-07, "loss": 0.8519, "step": 4059 }, { "epoch": 0.4013543237031362, "grad_norm": 7.1413700997838125, "learning_rate": 1.3598273489540387e-07, "loss": 0.7024, "step": 4060 }, { "epoch": 0.40145317944789066, "grad_norm": 3.5292365905340652, "learning_rate": 1.359528567353896e-07, "loss": 0.6432, "step": 4061 }, { "epoch": 0.40155203519264515, "grad_norm": 3.639278164546402, "learning_rate": 1.359229748889402e-07, "loss": 0.646, "step": 4062 }, { "epoch": 0.4016508909373996, "grad_norm": 4.058273282652866, "learning_rate": 1.3589308935911962e-07, "loss": 0.7576, "step": 4063 }, { "epoch": 0.4017497466821541, "grad_norm": 5.452381484054536, "learning_rate": 1.3586320014899218e-07, "loss": 0.7471, "step": 4064 }, { "epoch": 0.4018486024269085, "grad_norm": 3.3475648135181864, "learning_rate": 1.3583330726162261e-07, "loss": 0.7348, "step": 4065 }, { "epoch": 0.401947458171663, "grad_norm": 3.4670129777800502, "learning_rate": 1.3580341070007595e-07, "loss": 0.6516, "step": 4066 }, { "epoch": 0.4020463139164175, "grad_norm": 4.129596210117518, "learning_rate": 1.3577351046741764e-07, "loss": 0.7515, "step": 4067 }, { "epoch": 0.4021451696611719, "grad_norm": 6.543429753887918, "learning_rate": 1.3574360656671354e-07, "loss": 0.7206, "step": 4068 }, { "epoch": 0.4022440254059264, "grad_norm": 8.77206435341458, "learning_rate": 1.3571369900102985e-07, "loss": 0.6811, "step": 4069 }, { "epoch": 0.40234288115068084, "grad_norm": 7.86506050713617, "learning_rate": 1.3568378777343312e-07, "loss": 0.6892, "step": 4070 }, { "epoch": 0.40244173689543533, "grad_norm": 4.201738300196104, "learning_rate": 1.3565387288699034e-07, "loss": 0.8339, "step": 4071 }, { "epoch": 0.4025405926401898, "grad_norm": 16.745038618492913, "learning_rate": 1.3562395434476882e-07, "loss": 0.6584, "step": 4072 }, { "epoch": 0.40263944838494425, "grad_norm": 4.952864354211317, "learning_rate": 1.355940321498363e-07, "loss": 0.8108, "step": 4073 }, { "epoch": 0.40273830412969874, "grad_norm": 7.873730708754862, "learning_rate": 1.3556410630526082e-07, "loss": 0.8158, "step": 4074 }, { "epoch": 0.4028371598744532, "grad_norm": 4.270952692994384, "learning_rate": 1.3553417681411082e-07, "loss": 0.6584, "step": 4075 }, { "epoch": 0.40293601561920767, "grad_norm": 6.700923925057391, "learning_rate": 1.3550424367945516e-07, "loss": 0.7669, "step": 4076 }, { "epoch": 0.40303487136396215, "grad_norm": 3.61847847510114, "learning_rate": 1.3547430690436306e-07, "loss": 0.8079, "step": 4077 }, { "epoch": 0.4031337271087166, "grad_norm": 4.238468069781882, "learning_rate": 1.3544436649190404e-07, "loss": 0.7151, "step": 4078 }, { "epoch": 0.4032325828534711, "grad_norm": 9.920337783018105, "learning_rate": 1.354144224451481e-07, "loss": 0.7161, "step": 4079 }, { "epoch": 0.40333143859822557, "grad_norm": 5.006999764593267, "learning_rate": 1.353844747671655e-07, "loss": 0.6878, "step": 4080 }, { "epoch": 0.40343029434298, "grad_norm": 3.707862341029652, "learning_rate": 1.3535452346102704e-07, "loss": 0.634, "step": 4081 }, { "epoch": 0.4035291500877345, "grad_norm": 4.544735110547847, "learning_rate": 1.3532456852980367e-07, "loss": 0.7088, "step": 4082 }, { "epoch": 0.4036280058324889, "grad_norm": 3.684116725939991, "learning_rate": 1.3529460997656687e-07, "loss": 0.6868, "step": 4083 }, { "epoch": 0.4037268615772434, "grad_norm": 4.8156656307100585, "learning_rate": 1.3526464780438847e-07, "loss": 0.6944, "step": 4084 }, { "epoch": 0.4038257173219979, "grad_norm": 4.524529139159837, "learning_rate": 1.3523468201634063e-07, "loss": 0.7991, "step": 4085 }, { "epoch": 0.40392457306675233, "grad_norm": 4.479873756069666, "learning_rate": 1.3520471261549587e-07, "loss": 0.6985, "step": 4086 }, { "epoch": 0.4040234288115068, "grad_norm": 3.8440799451718974, "learning_rate": 1.3517473960492713e-07, "loss": 0.7253, "step": 4087 }, { "epoch": 0.40412228455626126, "grad_norm": 6.466647349400415, "learning_rate": 1.3514476298770775e-07, "loss": 0.7151, "step": 4088 }, { "epoch": 0.40422114030101575, "grad_norm": 4.40723970182832, "learning_rate": 1.3511478276691129e-07, "loss": 0.7045, "step": 4089 }, { "epoch": 0.40431999604577024, "grad_norm": 6.855370722068972, "learning_rate": 1.3508479894561186e-07, "loss": 0.6716, "step": 4090 }, { "epoch": 0.40441885179052467, "grad_norm": 3.817123356586155, "learning_rate": 1.350548115268839e-07, "loss": 0.6711, "step": 4091 }, { "epoch": 0.40451770753527916, "grad_norm": 5.378601781530161, "learning_rate": 1.3502482051380205e-07, "loss": 0.7868, "step": 4092 }, { "epoch": 0.4046165632800336, "grad_norm": 4.0811280150210125, "learning_rate": 1.3499482590944151e-07, "loss": 0.7287, "step": 4093 }, { "epoch": 0.4047154190247881, "grad_norm": 3.0062025758898603, "learning_rate": 1.3496482771687778e-07, "loss": 0.7027, "step": 4094 }, { "epoch": 0.40481427476954257, "grad_norm": 6.154310808808824, "learning_rate": 1.3493482593918675e-07, "loss": 0.723, "step": 4095 }, { "epoch": 0.404913130514297, "grad_norm": 10.423995926458753, "learning_rate": 1.3490482057944466e-07, "loss": 0.6942, "step": 4096 }, { "epoch": 0.4050119862590515, "grad_norm": 4.048513288377183, "learning_rate": 1.3487481164072804e-07, "loss": 0.6855, "step": 4097 }, { "epoch": 0.4051108420038059, "grad_norm": 11.404783134408174, "learning_rate": 1.34844799126114e-07, "loss": 0.7447, "step": 4098 }, { "epoch": 0.4052096977485604, "grad_norm": 3.392279360141884, "learning_rate": 1.3481478303867977e-07, "loss": 0.8255, "step": 4099 }, { "epoch": 0.4053085534933149, "grad_norm": 8.122762073996684, "learning_rate": 1.347847633815031e-07, "loss": 0.7659, "step": 4100 }, { "epoch": 0.40540740923806934, "grad_norm": 5.649219548964488, "learning_rate": 1.3475474015766203e-07, "loss": 0.8197, "step": 4101 }, { "epoch": 0.4055062649828238, "grad_norm": 13.352487300658492, "learning_rate": 1.3472471337023509e-07, "loss": 0.678, "step": 4102 }, { "epoch": 0.40560512072757826, "grad_norm": 4.240576012296398, "learning_rate": 1.34694683022301e-07, "loss": 0.7466, "step": 4103 }, { "epoch": 0.40570397647233275, "grad_norm": 4.677013368174527, "learning_rate": 1.346646491169389e-07, "loss": 0.6441, "step": 4104 }, { "epoch": 0.40580283221708724, "grad_norm": 7.575260837194892, "learning_rate": 1.3463461165722838e-07, "loss": 0.6424, "step": 4105 }, { "epoch": 0.4059016879618417, "grad_norm": 6.4835821572479375, "learning_rate": 1.3460457064624935e-07, "loss": 0.8052, "step": 4106 }, { "epoch": 0.40600054370659616, "grad_norm": 5.130605481987227, "learning_rate": 1.3457452608708205e-07, "loss": 0.8062, "step": 4107 }, { "epoch": 0.4060993994513506, "grad_norm": 4.810238829632351, "learning_rate": 1.3454447798280714e-07, "loss": 0.6795, "step": 4108 }, { "epoch": 0.4061982551961051, "grad_norm": 5.136336157373226, "learning_rate": 1.3451442633650556e-07, "loss": 0.6812, "step": 4109 }, { "epoch": 0.4062971109408596, "grad_norm": 4.271319149052712, "learning_rate": 1.3448437115125867e-07, "loss": 0.663, "step": 4110 }, { "epoch": 0.406395966685614, "grad_norm": 5.700904386598441, "learning_rate": 1.344543124301482e-07, "loss": 0.7088, "step": 4111 }, { "epoch": 0.4064948224303685, "grad_norm": 4.362687832835221, "learning_rate": 1.3442425017625625e-07, "loss": 0.6827, "step": 4112 }, { "epoch": 0.40659367817512293, "grad_norm": 5.08607591473625, "learning_rate": 1.3439418439266522e-07, "loss": 0.7235, "step": 4113 }, { "epoch": 0.4066925339198774, "grad_norm": 4.155751201111846, "learning_rate": 1.3436411508245793e-07, "loss": 0.6815, "step": 4114 }, { "epoch": 0.4067913896646319, "grad_norm": 4.563187309179184, "learning_rate": 1.3433404224871755e-07, "loss": 0.7036, "step": 4115 }, { "epoch": 0.40689024540938634, "grad_norm": 4.577381662375172, "learning_rate": 1.3430396589452758e-07, "loss": 0.658, "step": 4116 }, { "epoch": 0.40698910115414083, "grad_norm": 11.547868482356213, "learning_rate": 1.3427388602297194e-07, "loss": 0.7641, "step": 4117 }, { "epoch": 0.40708795689889526, "grad_norm": 4.662895221904019, "learning_rate": 1.3424380263713492e-07, "loss": 0.6339, "step": 4118 }, { "epoch": 0.40718681264364975, "grad_norm": 6.004413533624086, "learning_rate": 1.3421371574010095e-07, "loss": 0.6937, "step": 4119 }, { "epoch": 0.40728566838840424, "grad_norm": 4.788561945038308, "learning_rate": 1.3418362533495523e-07, "loss": 0.8257, "step": 4120 }, { "epoch": 0.4073845241331587, "grad_norm": 7.3666903479737655, "learning_rate": 1.341535314247829e-07, "loss": 0.7065, "step": 4121 }, { "epoch": 0.40748337987791317, "grad_norm": 3.315968921654255, "learning_rate": 1.3412343401266976e-07, "loss": 0.7846, "step": 4122 }, { "epoch": 0.4075822356226676, "grad_norm": 4.154920525651394, "learning_rate": 1.3409333310170182e-07, "loss": 0.7613, "step": 4123 }, { "epoch": 0.4076810913674221, "grad_norm": 5.482760758365575, "learning_rate": 1.340632286949655e-07, "loss": 0.7424, "step": 4124 }, { "epoch": 0.4077799471121766, "grad_norm": 3.2306197677240664, "learning_rate": 1.3403312079554748e-07, "loss": 0.644, "step": 4125 }, { "epoch": 0.407878802856931, "grad_norm": 3.657253134504103, "learning_rate": 1.3400300940653502e-07, "loss": 0.6532, "step": 4126 }, { "epoch": 0.4079776586016855, "grad_norm": 3.7194324967367365, "learning_rate": 1.339728945310155e-07, "loss": 0.6315, "step": 4127 }, { "epoch": 0.40807651434643993, "grad_norm": 3.416240958479376, "learning_rate": 1.3394277617207676e-07, "loss": 0.6678, "step": 4128 }, { "epoch": 0.4081753700911944, "grad_norm": 4.311140956824454, "learning_rate": 1.3391265433280703e-07, "loss": 0.7883, "step": 4129 }, { "epoch": 0.4082742258359489, "grad_norm": 4.033071796817395, "learning_rate": 1.3388252901629484e-07, "loss": 0.6322, "step": 4130 }, { "epoch": 0.40837308158070335, "grad_norm": 4.828479300138106, "learning_rate": 1.338524002256291e-07, "loss": 0.7148, "step": 4131 }, { "epoch": 0.40847193732545783, "grad_norm": 5.0262323223595855, "learning_rate": 1.3382226796389907e-07, "loss": 0.6838, "step": 4132 }, { "epoch": 0.40857079307021227, "grad_norm": 10.372692847096229, "learning_rate": 1.3379213223419438e-07, "loss": 0.7517, "step": 4133 }, { "epoch": 0.40866964881496676, "grad_norm": 4.7193611174939845, "learning_rate": 1.3376199303960502e-07, "loss": 0.7886, "step": 4134 }, { "epoch": 0.40876850455972125, "grad_norm": 9.388815139161405, "learning_rate": 1.337318503832213e-07, "loss": 0.6944, "step": 4135 }, { "epoch": 0.4088673603044757, "grad_norm": 5.296882716483853, "learning_rate": 1.3370170426813386e-07, "loss": 0.7117, "step": 4136 }, { "epoch": 0.40896621604923017, "grad_norm": 3.7576753793180364, "learning_rate": 1.336715546974338e-07, "loss": 0.6974, "step": 4137 }, { "epoch": 0.4090650717939846, "grad_norm": 3.4600104326380148, "learning_rate": 1.3364140167421252e-07, "loss": 0.6996, "step": 4138 }, { "epoch": 0.4091639275387391, "grad_norm": 3.578149951697119, "learning_rate": 1.3361124520156174e-07, "loss": 0.6494, "step": 4139 }, { "epoch": 0.4092627832834936, "grad_norm": 4.124512486794874, "learning_rate": 1.3358108528257355e-07, "loss": 0.7094, "step": 4140 }, { "epoch": 0.409361639028248, "grad_norm": 3.7909296040839564, "learning_rate": 1.3355092192034037e-07, "loss": 0.6693, "step": 4141 }, { "epoch": 0.4094604947730025, "grad_norm": 10.027111051379237, "learning_rate": 1.3352075511795514e-07, "loss": 0.672, "step": 4142 }, { "epoch": 0.40955935051775694, "grad_norm": 4.184900457188137, "learning_rate": 1.3349058487851093e-07, "loss": 0.6926, "step": 4143 }, { "epoch": 0.4096582062625114, "grad_norm": 3.816461379806068, "learning_rate": 1.3346041120510123e-07, "loss": 0.6854, "step": 4144 }, { "epoch": 0.4097570620072659, "grad_norm": 3.138289445685196, "learning_rate": 1.3343023410081995e-07, "loss": 0.7398, "step": 4145 }, { "epoch": 0.40985591775202035, "grad_norm": 6.667259519513072, "learning_rate": 1.3340005356876129e-07, "loss": 0.6979, "step": 4146 }, { "epoch": 0.40995477349677484, "grad_norm": 5.063314824366961, "learning_rate": 1.3336986961201983e-07, "loss": 0.7192, "step": 4147 }, { "epoch": 0.41005362924152927, "grad_norm": 6.25006471242061, "learning_rate": 1.3333968223369045e-07, "loss": 0.7995, "step": 4148 }, { "epoch": 0.41015248498628376, "grad_norm": 4.682739822101903, "learning_rate": 1.333094914368685e-07, "loss": 0.6172, "step": 4149 }, { "epoch": 0.41025134073103825, "grad_norm": 4.904357566647226, "learning_rate": 1.3327929722464953e-07, "loss": 0.6698, "step": 4150 }, { "epoch": 0.4103501964757927, "grad_norm": 3.7512662160052304, "learning_rate": 1.332490996001295e-07, "loss": 0.7246, "step": 4151 }, { "epoch": 0.4104490522205472, "grad_norm": 4.1059405588388245, "learning_rate": 1.3321889856640481e-07, "loss": 0.7276, "step": 4152 }, { "epoch": 0.4105479079653016, "grad_norm": 5.289667843108652, "learning_rate": 1.3318869412657207e-07, "loss": 0.7082, "step": 4153 }, { "epoch": 0.4106467637100561, "grad_norm": 4.42016271758641, "learning_rate": 1.3315848628372835e-07, "loss": 0.7207, "step": 4154 }, { "epoch": 0.4107456194548106, "grad_norm": 4.226248768050254, "learning_rate": 1.3312827504097092e-07, "loss": 0.732, "step": 4155 }, { "epoch": 0.410844475199565, "grad_norm": 16.689637415270777, "learning_rate": 1.3309806040139758e-07, "loss": 0.6785, "step": 4156 }, { "epoch": 0.4109433309443195, "grad_norm": 3.528625719977437, "learning_rate": 1.3306784236810643e-07, "loss": 0.666, "step": 4157 }, { "epoch": 0.411042186689074, "grad_norm": 4.289495554823182, "learning_rate": 1.3303762094419578e-07, "loss": 0.7018, "step": 4158 }, { "epoch": 0.41114104243382843, "grad_norm": 3.4444168190781594, "learning_rate": 1.3300739613276442e-07, "loss": 0.7227, "step": 4159 }, { "epoch": 0.4112398981785829, "grad_norm": 8.51425251109211, "learning_rate": 1.3297716793691152e-07, "loss": 0.6491, "step": 4160 }, { "epoch": 0.41133875392333735, "grad_norm": 3.8218078989572, "learning_rate": 1.329469363597365e-07, "loss": 0.6747, "step": 4161 }, { "epoch": 0.41143760966809184, "grad_norm": 5.8392546526973375, "learning_rate": 1.3291670140433914e-07, "loss": 0.7654, "step": 4162 }, { "epoch": 0.41153646541284633, "grad_norm": 4.05451166130992, "learning_rate": 1.328864630738196e-07, "loss": 0.685, "step": 4163 }, { "epoch": 0.41163532115760076, "grad_norm": 2.8385978683656403, "learning_rate": 1.328562213712784e-07, "loss": 0.8128, "step": 4164 }, { "epoch": 0.41173417690235525, "grad_norm": 5.305264485637704, "learning_rate": 1.3282597629981636e-07, "loss": 0.6463, "step": 4165 }, { "epoch": 0.4118330326471097, "grad_norm": 6.0807564828619025, "learning_rate": 1.3279572786253464e-07, "loss": 0.6528, "step": 4166 }, { "epoch": 0.4119318883918642, "grad_norm": 4.411072547377337, "learning_rate": 1.3276547606253482e-07, "loss": 0.6391, "step": 4167 }, { "epoch": 0.41203074413661867, "grad_norm": 8.576258871781818, "learning_rate": 1.3273522090291876e-07, "loss": 0.6986, "step": 4168 }, { "epoch": 0.4121295998813731, "grad_norm": 6.326283823466673, "learning_rate": 1.3270496238678867e-07, "loss": 0.7625, "step": 4169 }, { "epoch": 0.4122284556261276, "grad_norm": 4.0330553043248125, "learning_rate": 1.3267470051724714e-07, "loss": 0.8122, "step": 4170 }, { "epoch": 0.412327311370882, "grad_norm": 5.547031900172805, "learning_rate": 1.3264443529739705e-07, "loss": 0.7777, "step": 4171 }, { "epoch": 0.4124261671156365, "grad_norm": 6.1020691703148024, "learning_rate": 1.3261416673034163e-07, "loss": 0.6888, "step": 4172 }, { "epoch": 0.412525022860391, "grad_norm": 4.762960742516245, "learning_rate": 1.325838948191845e-07, "loss": 0.7273, "step": 4173 }, { "epoch": 0.41262387860514543, "grad_norm": 3.468085628334079, "learning_rate": 1.3255361956702961e-07, "loss": 0.7207, "step": 4174 }, { "epoch": 0.4127227343498999, "grad_norm": 4.139633296415921, "learning_rate": 1.3252334097698127e-07, "loss": 0.7633, "step": 4175 }, { "epoch": 0.41282159009465436, "grad_norm": 3.1265741365353943, "learning_rate": 1.3249305905214403e-07, "loss": 0.6937, "step": 4176 }, { "epoch": 0.41292044583940885, "grad_norm": 3.663736771192603, "learning_rate": 1.324627737956229e-07, "loss": 0.7116, "step": 4177 }, { "epoch": 0.41301930158416333, "grad_norm": 2.8907151168411334, "learning_rate": 1.3243248521052317e-07, "loss": 0.6855, "step": 4178 }, { "epoch": 0.41311815732891777, "grad_norm": 5.49325597981901, "learning_rate": 1.3240219329995052e-07, "loss": 0.8764, "step": 4179 }, { "epoch": 0.41321701307367226, "grad_norm": 5.914429364586734, "learning_rate": 1.323718980670109e-07, "loss": 0.7913, "step": 4180 }, { "epoch": 0.4133158688184267, "grad_norm": 7.4166513009622, "learning_rate": 1.3234159951481062e-07, "loss": 0.6933, "step": 4181 }, { "epoch": 0.4134147245631812, "grad_norm": 10.497114907352046, "learning_rate": 1.3231129764645644e-07, "loss": 0.7808, "step": 4182 }, { "epoch": 0.41351358030793567, "grad_norm": 26.73464326126239, "learning_rate": 1.3228099246505528e-07, "loss": 0.6892, "step": 4183 }, { "epoch": 0.4136124360526901, "grad_norm": 6.454655450237664, "learning_rate": 1.3225068397371453e-07, "loss": 0.8097, "step": 4184 }, { "epoch": 0.4137112917974446, "grad_norm": 4.089267807025319, "learning_rate": 1.3222037217554186e-07, "loss": 0.8105, "step": 4185 }, { "epoch": 0.413810147542199, "grad_norm": 7.137514667883646, "learning_rate": 1.321900570736453e-07, "loss": 0.6659, "step": 4186 }, { "epoch": 0.4139090032869535, "grad_norm": 4.535880225065229, "learning_rate": 1.3215973867113323e-07, "loss": 0.6687, "step": 4187 }, { "epoch": 0.414007859031708, "grad_norm": 13.072613652095777, "learning_rate": 1.3212941697111436e-07, "loss": 0.8001, "step": 4188 }, { "epoch": 0.41410671477646244, "grad_norm": 3.8115259848286596, "learning_rate": 1.3209909197669774e-07, "loss": 0.7345, "step": 4189 }, { "epoch": 0.4142055705212169, "grad_norm": 4.1744050896259415, "learning_rate": 1.320687636909927e-07, "loss": 0.7406, "step": 4190 }, { "epoch": 0.41430442626597136, "grad_norm": 5.0448130177705375, "learning_rate": 1.32038432117109e-07, "loss": 0.6781, "step": 4191 }, { "epoch": 0.41440328201072585, "grad_norm": 3.712726308212724, "learning_rate": 1.3200809725815667e-07, "loss": 0.7302, "step": 4192 }, { "epoch": 0.41450213775548034, "grad_norm": 4.775312234267417, "learning_rate": 1.3197775911724613e-07, "loss": 0.8619, "step": 4193 }, { "epoch": 0.41460099350023477, "grad_norm": 7.283231885923636, "learning_rate": 1.319474176974881e-07, "loss": 0.7221, "step": 4194 }, { "epoch": 0.41469984924498926, "grad_norm": 3.9150880884735115, "learning_rate": 1.3191707300199365e-07, "loss": 0.7346, "step": 4195 }, { "epoch": 0.4147987049897437, "grad_norm": 4.254786955188688, "learning_rate": 1.3188672503387417e-07, "loss": 0.639, "step": 4196 }, { "epoch": 0.4148975607344982, "grad_norm": 7.187899887693538, "learning_rate": 1.318563737962414e-07, "loss": 0.6822, "step": 4197 }, { "epoch": 0.41499641647925267, "grad_norm": 6.00280896675638, "learning_rate": 1.3182601929220744e-07, "loss": 0.6223, "step": 4198 }, { "epoch": 0.4150952722240071, "grad_norm": 9.567787542558415, "learning_rate": 1.317956615248846e-07, "loss": 0.5771, "step": 4199 }, { "epoch": 0.4151941279687616, "grad_norm": 2.943451369839274, "learning_rate": 1.3176530049738573e-07, "loss": 0.6585, "step": 4200 }, { "epoch": 0.41529298371351603, "grad_norm": 6.951195446865218, "learning_rate": 1.3173493621282384e-07, "loss": 0.645, "step": 4201 }, { "epoch": 0.4153918394582705, "grad_norm": 5.100460019764596, "learning_rate": 1.3170456867431242e-07, "loss": 0.748, "step": 4202 }, { "epoch": 0.415490695203025, "grad_norm": 3.815386210307499, "learning_rate": 1.3167419788496509e-07, "loss": 0.6644, "step": 4203 }, { "epoch": 0.41558955094777944, "grad_norm": 4.762921805977451, "learning_rate": 1.3164382384789602e-07, "loss": 0.6948, "step": 4204 }, { "epoch": 0.41568840669253393, "grad_norm": 4.655185678526731, "learning_rate": 1.3161344656621961e-07, "loss": 0.766, "step": 4205 }, { "epoch": 0.41578726243728836, "grad_norm": 4.379285576381063, "learning_rate": 1.3158306604305063e-07, "loss": 0.626, "step": 4206 }, { "epoch": 0.41588611818204285, "grad_norm": 3.985568600716228, "learning_rate": 1.31552682281504e-07, "loss": 0.6875, "step": 4207 }, { "epoch": 0.41598497392679734, "grad_norm": 2.8127696626830323, "learning_rate": 1.315222952846953e-07, "loss": 0.7688, "step": 4208 }, { "epoch": 0.4160838296715518, "grad_norm": 7.922990145200747, "learning_rate": 1.3149190505574024e-07, "loss": 0.7248, "step": 4209 }, { "epoch": 0.41618268541630626, "grad_norm": 26.79635756871759, "learning_rate": 1.314615115977548e-07, "loss": 0.8011, "step": 4210 }, { "epoch": 0.4162815411610607, "grad_norm": 4.453661912482545, "learning_rate": 1.3143111491385543e-07, "loss": 0.8224, "step": 4211 }, { "epoch": 0.4163803969058152, "grad_norm": 3.743084456260384, "learning_rate": 1.314007150071589e-07, "loss": 0.6336, "step": 4212 }, { "epoch": 0.4164792526505697, "grad_norm": 11.380289602556273, "learning_rate": 1.3137031188078224e-07, "loss": 0.7087, "step": 4213 }, { "epoch": 0.4165781083953241, "grad_norm": 3.4164025087991585, "learning_rate": 1.3133990553784285e-07, "loss": 0.7264, "step": 4214 }, { "epoch": 0.4166769641400786, "grad_norm": 4.627037006155441, "learning_rate": 1.3130949598145842e-07, "loss": 0.7336, "step": 4215 }, { "epoch": 0.41677581988483303, "grad_norm": 5.799516705035995, "learning_rate": 1.3127908321474702e-07, "loss": 0.6855, "step": 4216 }, { "epoch": 0.4168746756295875, "grad_norm": 3.150824890690916, "learning_rate": 1.3124866724082703e-07, "loss": 0.6744, "step": 4217 }, { "epoch": 0.416973531374342, "grad_norm": 3.598199714599413, "learning_rate": 1.3121824806281718e-07, "loss": 0.7392, "step": 4218 }, { "epoch": 0.41707238711909644, "grad_norm": 4.851393469064133, "learning_rate": 1.3118782568383648e-07, "loss": 0.7758, "step": 4219 }, { "epoch": 0.41717124286385093, "grad_norm": 3.8413086362031366, "learning_rate": 1.3115740010700433e-07, "loss": 0.705, "step": 4220 }, { "epoch": 0.41727009860860537, "grad_norm": 4.322342856411626, "learning_rate": 1.3112697133544038e-07, "loss": 0.6818, "step": 4221 }, { "epoch": 0.41736895435335986, "grad_norm": 4.343681241028514, "learning_rate": 1.310965393722647e-07, "loss": 0.7412, "step": 4222 }, { "epoch": 0.41746781009811434, "grad_norm": 6.512687985129534, "learning_rate": 1.310661042205976e-07, "loss": 0.7373, "step": 4223 }, { "epoch": 0.4175666658428688, "grad_norm": 3.1981218064096093, "learning_rate": 1.3103566588355977e-07, "loss": 0.6552, "step": 4224 }, { "epoch": 0.41766552158762327, "grad_norm": 3.7922130021152864, "learning_rate": 1.310052243642722e-07, "loss": 0.7402, "step": 4225 }, { "epoch": 0.4177643773323777, "grad_norm": 3.9471658645217627, "learning_rate": 1.3097477966585624e-07, "loss": 0.6651, "step": 4226 }, { "epoch": 0.4178632330771322, "grad_norm": 3.957191536892536, "learning_rate": 1.3094433179143357e-07, "loss": 0.7377, "step": 4227 }, { "epoch": 0.4179620888218867, "grad_norm": 3.1708347473949985, "learning_rate": 1.3091388074412609e-07, "loss": 0.7678, "step": 4228 }, { "epoch": 0.4180609445666411, "grad_norm": 4.013879059795465, "learning_rate": 1.3088342652705614e-07, "loss": 0.646, "step": 4229 }, { "epoch": 0.4181598003113956, "grad_norm": 16.187526344087836, "learning_rate": 1.3085296914334638e-07, "loss": 0.6569, "step": 4230 }, { "epoch": 0.41825865605615004, "grad_norm": 10.896850463452301, "learning_rate": 1.3082250859611976e-07, "loss": 0.7035, "step": 4231 }, { "epoch": 0.4183575118009045, "grad_norm": 5.075156988991124, "learning_rate": 1.3079204488849955e-07, "loss": 0.6554, "step": 4232 }, { "epoch": 0.418456367545659, "grad_norm": 4.2394394907333135, "learning_rate": 1.3076157802360934e-07, "loss": 0.7127, "step": 4233 }, { "epoch": 0.41855522329041345, "grad_norm": 3.182869860010819, "learning_rate": 1.3073110800457308e-07, "loss": 0.7491, "step": 4234 }, { "epoch": 0.41865407903516794, "grad_norm": 3.018938095120049, "learning_rate": 1.3070063483451501e-07, "loss": 0.6439, "step": 4235 }, { "epoch": 0.4187529347799224, "grad_norm": 7.716857285602844, "learning_rate": 1.3067015851655966e-07, "loss": 0.7135, "step": 4236 }, { "epoch": 0.41885179052467686, "grad_norm": 4.135321382879602, "learning_rate": 1.3063967905383205e-07, "loss": 0.7429, "step": 4237 }, { "epoch": 0.41895064626943135, "grad_norm": 5.635047715864612, "learning_rate": 1.3060919644945725e-07, "loss": 0.6875, "step": 4238 }, { "epoch": 0.4190495020141858, "grad_norm": 3.117496205847984, "learning_rate": 1.3057871070656092e-07, "loss": 0.7285, "step": 4239 }, { "epoch": 0.41914835775894027, "grad_norm": 3.4137128878124012, "learning_rate": 1.3054822182826886e-07, "loss": 0.7565, "step": 4240 }, { "epoch": 0.41924721350369476, "grad_norm": 3.110118256946252, "learning_rate": 1.3051772981770726e-07, "loss": 0.6797, "step": 4241 }, { "epoch": 0.4193460692484492, "grad_norm": 3.854698494167551, "learning_rate": 1.3048723467800265e-07, "loss": 0.6703, "step": 4242 }, { "epoch": 0.4194449249932037, "grad_norm": 4.602089899332918, "learning_rate": 1.304567364122818e-07, "loss": 0.7728, "step": 4243 }, { "epoch": 0.4195437807379581, "grad_norm": 3.6437942450514775, "learning_rate": 1.3042623502367194e-07, "loss": 0.7124, "step": 4244 }, { "epoch": 0.4196426364827126, "grad_norm": 10.005784764682327, "learning_rate": 1.3039573051530049e-07, "loss": 0.6175, "step": 4245 }, { "epoch": 0.4197414922274671, "grad_norm": 3.7561039271630148, "learning_rate": 1.3036522289029524e-07, "loss": 0.7024, "step": 4246 }, { "epoch": 0.41984034797222153, "grad_norm": 5.623028762994784, "learning_rate": 1.3033471215178428e-07, "loss": 0.7756, "step": 4247 }, { "epoch": 0.419939203716976, "grad_norm": 7.165027583210982, "learning_rate": 1.3030419830289607e-07, "loss": 0.7428, "step": 4248 }, { "epoch": 0.42003805946173045, "grad_norm": 11.345036131321374, "learning_rate": 1.3027368134675935e-07, "loss": 0.8126, "step": 4249 }, { "epoch": 0.42013691520648494, "grad_norm": 3.419486302995425, "learning_rate": 1.3024316128650322e-07, "loss": 0.7802, "step": 4250 }, { "epoch": 0.42023577095123943, "grad_norm": 3.148351811230768, "learning_rate": 1.3021263812525697e-07, "loss": 0.6944, "step": 4251 }, { "epoch": 0.42033462669599386, "grad_norm": 3.41944531883536, "learning_rate": 1.3018211186615035e-07, "loss": 0.6684, "step": 4252 }, { "epoch": 0.42043348244074835, "grad_norm": 5.054559649598608, "learning_rate": 1.3015158251231338e-07, "loss": 0.6941, "step": 4253 }, { "epoch": 0.4205323381855028, "grad_norm": 10.627621477921144, "learning_rate": 1.3012105006687637e-07, "loss": 0.7439, "step": 4254 }, { "epoch": 0.4206311939302573, "grad_norm": 3.5446811718602493, "learning_rate": 1.3009051453297003e-07, "loss": 0.7525, "step": 4255 }, { "epoch": 0.42073004967501176, "grad_norm": 3.7799938017053294, "learning_rate": 1.3005997591372525e-07, "loss": 0.7815, "step": 4256 }, { "epoch": 0.4208289054197662, "grad_norm": 3.8022865208711343, "learning_rate": 1.3002943421227338e-07, "loss": 0.6318, "step": 4257 }, { "epoch": 0.4209277611645207, "grad_norm": 2.805054642441229, "learning_rate": 1.2999888943174603e-07, "loss": 0.6555, "step": 4258 }, { "epoch": 0.4210266169092751, "grad_norm": 4.98767407367027, "learning_rate": 1.2996834157527503e-07, "loss": 0.7475, "step": 4259 }, { "epoch": 0.4211254726540296, "grad_norm": 5.867577277486626, "learning_rate": 1.2993779064599268e-07, "loss": 0.7701, "step": 4260 }, { "epoch": 0.4212243283987841, "grad_norm": 3.2108946998775796, "learning_rate": 1.299072366470315e-07, "loss": 0.6929, "step": 4261 }, { "epoch": 0.42132318414353853, "grad_norm": 11.397387089225367, "learning_rate": 1.298766795815244e-07, "loss": 0.7414, "step": 4262 }, { "epoch": 0.421422039888293, "grad_norm": 6.792582028031758, "learning_rate": 1.298461194526045e-07, "loss": 0.7658, "step": 4263 }, { "epoch": 0.42152089563304745, "grad_norm": 3.739186657137695, "learning_rate": 1.2981555626340534e-07, "loss": 0.7977, "step": 4264 }, { "epoch": 0.42161975137780194, "grad_norm": 9.2855546379833, "learning_rate": 1.2978499001706063e-07, "loss": 0.7081, "step": 4265 }, { "epoch": 0.42171860712255643, "grad_norm": 2.9010126193510684, "learning_rate": 1.2975442071670465e-07, "loss": 0.7139, "step": 4266 }, { "epoch": 0.42181746286731087, "grad_norm": 2.889527640368366, "learning_rate": 1.2972384836547164e-07, "loss": 0.6306, "step": 4267 }, { "epoch": 0.42191631861206536, "grad_norm": 5.050864523488164, "learning_rate": 1.296932729664965e-07, "loss": 0.645, "step": 4268 }, { "epoch": 0.4220151743568198, "grad_norm": 6.054781258281637, "learning_rate": 1.296626945229142e-07, "loss": 0.7705, "step": 4269 }, { "epoch": 0.4221140301015743, "grad_norm": 4.603335207205996, "learning_rate": 1.2963211303786011e-07, "loss": 0.7142, "step": 4270 }, { "epoch": 0.42221288584632877, "grad_norm": 3.428583261872119, "learning_rate": 1.2960152851446995e-07, "loss": 0.7827, "step": 4271 }, { "epoch": 0.4223117415910832, "grad_norm": 5.626756198225381, "learning_rate": 1.2957094095587968e-07, "loss": 0.747, "step": 4272 }, { "epoch": 0.4224105973358377, "grad_norm": 4.277068149133427, "learning_rate": 1.2954035036522566e-07, "loss": 0.7015, "step": 4273 }, { "epoch": 0.4225094530805921, "grad_norm": 6.513191653666133, "learning_rate": 1.2950975674564442e-07, "loss": 0.6498, "step": 4274 }, { "epoch": 0.4226083088253466, "grad_norm": 6.080438836459712, "learning_rate": 1.2947916010027297e-07, "loss": 0.6813, "step": 4275 }, { "epoch": 0.4227071645701011, "grad_norm": 6.485488214474651, "learning_rate": 1.2944856043224846e-07, "loss": 0.7228, "step": 4276 }, { "epoch": 0.42280602031485554, "grad_norm": 9.171123004234422, "learning_rate": 1.2941795774470849e-07, "loss": 0.7065, "step": 4277 }, { "epoch": 0.42290487605961, "grad_norm": 6.462563499714309, "learning_rate": 1.293873520407909e-07, "loss": 0.7655, "step": 4278 }, { "epoch": 0.42300373180436446, "grad_norm": 3.9263814117833156, "learning_rate": 1.2935674332363385e-07, "loss": 0.7119, "step": 4279 }, { "epoch": 0.42310258754911895, "grad_norm": 3.2757300384786188, "learning_rate": 1.293261315963758e-07, "loss": 0.6252, "step": 4280 }, { "epoch": 0.42320144329387344, "grad_norm": 5.2941345996013025, "learning_rate": 1.2929551686215554e-07, "loss": 0.7082, "step": 4281 }, { "epoch": 0.42330029903862787, "grad_norm": 3.8906088967964045, "learning_rate": 1.2926489912411218e-07, "loss": 0.6788, "step": 4282 }, { "epoch": 0.42339915478338236, "grad_norm": 9.789232333710155, "learning_rate": 1.2923427838538512e-07, "loss": 0.6802, "step": 4283 }, { "epoch": 0.4234980105281368, "grad_norm": 4.632933159698568, "learning_rate": 1.2920365464911402e-07, "loss": 0.7385, "step": 4284 }, { "epoch": 0.4235968662728913, "grad_norm": 3.2194541455200154, "learning_rate": 1.2917302791843897e-07, "loss": 0.5911, "step": 4285 }, { "epoch": 0.42369572201764577, "grad_norm": 3.300277627698636, "learning_rate": 1.291423981965002e-07, "loss": 0.6389, "step": 4286 }, { "epoch": 0.4237945777624002, "grad_norm": 6.781442155054954, "learning_rate": 1.2911176548643834e-07, "loss": 0.7463, "step": 4287 }, { "epoch": 0.4238934335071547, "grad_norm": 9.184575955142646, "learning_rate": 1.290811297913944e-07, "loss": 0.7388, "step": 4288 }, { "epoch": 0.4239922892519091, "grad_norm": 4.476161662953344, "learning_rate": 1.2905049111450958e-07, "loss": 0.7657, "step": 4289 }, { "epoch": 0.4240911449966636, "grad_norm": 6.979539947746087, "learning_rate": 1.2901984945892542e-07, "loss": 0.6962, "step": 4290 }, { "epoch": 0.4241900007414181, "grad_norm": 3.993502749854961, "learning_rate": 1.2898920482778372e-07, "loss": 0.7106, "step": 4291 }, { "epoch": 0.42428885648617254, "grad_norm": 3.8241512412751395, "learning_rate": 1.2895855722422674e-07, "loss": 0.7487, "step": 4292 }, { "epoch": 0.42438771223092703, "grad_norm": 3.115888402122379, "learning_rate": 1.289279066513969e-07, "loss": 0.5497, "step": 4293 }, { "epoch": 0.42448656797568146, "grad_norm": 2.988733158512786, "learning_rate": 1.288972531124369e-07, "loss": 0.6752, "step": 4294 }, { "epoch": 0.42458542372043595, "grad_norm": 4.960556726980964, "learning_rate": 1.2886659661048992e-07, "loss": 0.7067, "step": 4295 }, { "epoch": 0.42468427946519044, "grad_norm": 4.871374304429121, "learning_rate": 1.2883593714869921e-07, "loss": 0.7291, "step": 4296 }, { "epoch": 0.4247831352099449, "grad_norm": 2.6948418375720182, "learning_rate": 1.2880527473020856e-07, "loss": 0.7158, "step": 4297 }, { "epoch": 0.42488199095469936, "grad_norm": 7.66664006996376, "learning_rate": 1.2877460935816182e-07, "loss": 0.7468, "step": 4298 }, { "epoch": 0.4249808466994538, "grad_norm": 3.2013752995905507, "learning_rate": 1.2874394103570342e-07, "loss": 0.619, "step": 4299 }, { "epoch": 0.4250797024442083, "grad_norm": 5.552470757576194, "learning_rate": 1.2871326976597782e-07, "loss": 0.8107, "step": 4300 }, { "epoch": 0.4251785581889628, "grad_norm": 6.7049379654740315, "learning_rate": 1.2868259555212996e-07, "loss": 0.8039, "step": 4301 }, { "epoch": 0.4252774139337172, "grad_norm": 3.6763348531618134, "learning_rate": 1.2865191839730504e-07, "loss": 0.7498, "step": 4302 }, { "epoch": 0.4253762696784717, "grad_norm": 3.5952447053413583, "learning_rate": 1.2862123830464854e-07, "loss": 0.7337, "step": 4303 }, { "epoch": 0.42547512542322613, "grad_norm": 3.4833497833747322, "learning_rate": 1.2859055527730624e-07, "loss": 0.8011, "step": 4304 }, { "epoch": 0.4255739811679806, "grad_norm": 4.136442942795607, "learning_rate": 1.2855986931842423e-07, "loss": 0.635, "step": 4305 }, { "epoch": 0.4256728369127351, "grad_norm": 3.2665450327074006, "learning_rate": 1.2852918043114892e-07, "loss": 0.6927, "step": 4306 }, { "epoch": 0.42577169265748954, "grad_norm": 4.4937892486060536, "learning_rate": 1.2849848861862698e-07, "loss": 0.6611, "step": 4307 }, { "epoch": 0.42587054840224403, "grad_norm": 3.1221564948742877, "learning_rate": 1.284677938840054e-07, "loss": 0.7629, "step": 4308 }, { "epoch": 0.42596940414699846, "grad_norm": 11.67751252323262, "learning_rate": 1.284370962304315e-07, "loss": 0.698, "step": 4309 }, { "epoch": 0.42606825989175295, "grad_norm": 8.077474830286286, "learning_rate": 1.2840639566105288e-07, "loss": 0.7271, "step": 4310 }, { "epoch": 0.42616711563650744, "grad_norm": 11.984633035072333, "learning_rate": 1.283756921790174e-07, "loss": 0.7015, "step": 4311 }, { "epoch": 0.4262659713812619, "grad_norm": 3.7213064225905312, "learning_rate": 1.2834498578747327e-07, "loss": 0.6355, "step": 4312 }, { "epoch": 0.42636482712601637, "grad_norm": 5.351540137863119, "learning_rate": 1.283142764895689e-07, "loss": 0.6761, "step": 4313 }, { "epoch": 0.42646368287077085, "grad_norm": 8.474271054067831, "learning_rate": 1.2828356428845322e-07, "loss": 0.707, "step": 4314 }, { "epoch": 0.4265625386155253, "grad_norm": 4.421087814022167, "learning_rate": 1.2825284918727518e-07, "loss": 0.7386, "step": 4315 }, { "epoch": 0.4266613943602798, "grad_norm": 6.509729067225355, "learning_rate": 1.2822213118918425e-07, "loss": 0.7923, "step": 4316 }, { "epoch": 0.4267602501050342, "grad_norm": 4.603889426587649, "learning_rate": 1.2819141029733005e-07, "loss": 0.6768, "step": 4317 }, { "epoch": 0.4268591058497887, "grad_norm": 37.99671174094154, "learning_rate": 1.2816068651486255e-07, "loss": 0.7193, "step": 4318 }, { "epoch": 0.4269579615945432, "grad_norm": 5.808324892064933, "learning_rate": 1.281299598449321e-07, "loss": 0.6644, "step": 4319 }, { "epoch": 0.4270568173392976, "grad_norm": 3.610514385721219, "learning_rate": 1.2809923029068922e-07, "loss": 0.7383, "step": 4320 }, { "epoch": 0.4271556730840521, "grad_norm": 3.8690291386311797, "learning_rate": 1.2806849785528472e-07, "loss": 0.6994, "step": 4321 }, { "epoch": 0.42725452882880655, "grad_norm": 4.9720881066044775, "learning_rate": 1.2803776254186985e-07, "loss": 0.696, "step": 4322 }, { "epoch": 0.42735338457356103, "grad_norm": 10.489705384118094, "learning_rate": 1.28007024353596e-07, "loss": 0.7063, "step": 4323 }, { "epoch": 0.4274522403183155, "grad_norm": 10.156889144823271, "learning_rate": 1.279762832936149e-07, "loss": 0.7111, "step": 4324 }, { "epoch": 0.42755109606306996, "grad_norm": 4.064546666181333, "learning_rate": 1.2794553936507866e-07, "loss": 0.6213, "step": 4325 }, { "epoch": 0.42764995180782445, "grad_norm": 4.5495291288938455, "learning_rate": 1.2791479257113958e-07, "loss": 0.7411, "step": 4326 }, { "epoch": 0.4277488075525789, "grad_norm": 6.008356395336652, "learning_rate": 1.278840429149503e-07, "loss": 0.6886, "step": 4327 }, { "epoch": 0.42784766329733337, "grad_norm": 4.897801670254142, "learning_rate": 1.2785329039966373e-07, "loss": 0.715, "step": 4328 }, { "epoch": 0.42794651904208786, "grad_norm": 3.0442160985529068, "learning_rate": 1.278225350284331e-07, "loss": 0.642, "step": 4329 }, { "epoch": 0.4280453747868423, "grad_norm": 4.057452925170052, "learning_rate": 1.277917768044119e-07, "loss": 0.7051, "step": 4330 }, { "epoch": 0.4281442305315968, "grad_norm": 3.444549949648897, "learning_rate": 1.2776101573075396e-07, "loss": 0.6496, "step": 4331 }, { "epoch": 0.4282430862763512, "grad_norm": 2.9525505451994336, "learning_rate": 1.2773025181061337e-07, "loss": 0.6661, "step": 4332 }, { "epoch": 0.4283419420211057, "grad_norm": 5.04625991089581, "learning_rate": 1.276994850471445e-07, "loss": 0.6904, "step": 4333 }, { "epoch": 0.4284407977658602, "grad_norm": 3.103642275491389, "learning_rate": 1.27668715443502e-07, "loss": 0.6139, "step": 4334 }, { "epoch": 0.4285396535106146, "grad_norm": 3.2801239862414975, "learning_rate": 1.276379430028409e-07, "loss": 0.8062, "step": 4335 }, { "epoch": 0.4286385092553691, "grad_norm": 6.89741587957365, "learning_rate": 1.2760716772831645e-07, "loss": 0.781, "step": 4336 }, { "epoch": 0.42873736500012355, "grad_norm": 5.124431694699376, "learning_rate": 1.275763896230842e-07, "loss": 0.753, "step": 4337 }, { "epoch": 0.42883622074487804, "grad_norm": 3.5043506639887028, "learning_rate": 1.275456086902999e-07, "loss": 0.7129, "step": 4338 }, { "epoch": 0.4289350764896325, "grad_norm": 6.725192764441534, "learning_rate": 1.275148249331198e-07, "loss": 0.7289, "step": 4339 }, { "epoch": 0.42903393223438696, "grad_norm": 4.289084601167294, "learning_rate": 1.2748403835470027e-07, "loss": 0.6991, "step": 4340 }, { "epoch": 0.42913278797914145, "grad_norm": 3.412954652627056, "learning_rate": 1.2745324895819805e-07, "loss": 0.6345, "step": 4341 }, { "epoch": 0.4292316437238959, "grad_norm": 4.359660170908633, "learning_rate": 1.2742245674677008e-07, "loss": 0.6353, "step": 4342 }, { "epoch": 0.4293304994686504, "grad_norm": 6.004726049690692, "learning_rate": 1.273916617235737e-07, "loss": 0.7129, "step": 4343 }, { "epoch": 0.42942935521340486, "grad_norm": 5.30434329102087, "learning_rate": 1.2736086389176646e-07, "loss": 0.7029, "step": 4344 }, { "epoch": 0.4295282109581593, "grad_norm": 3.3467146890743518, "learning_rate": 1.2733006325450622e-07, "loss": 0.6204, "step": 4345 }, { "epoch": 0.4296270667029138, "grad_norm": 6.308991652462591, "learning_rate": 1.2729925981495114e-07, "loss": 0.6439, "step": 4346 }, { "epoch": 0.4297259224476682, "grad_norm": 4.22619870188735, "learning_rate": 1.2726845357625968e-07, "loss": 0.7821, "step": 4347 }, { "epoch": 0.4298247781924227, "grad_norm": 18.602160604572163, "learning_rate": 1.2723764454159053e-07, "loss": 0.6534, "step": 4348 }, { "epoch": 0.4299236339371772, "grad_norm": 4.644661111120811, "learning_rate": 1.272068327141027e-07, "loss": 0.711, "step": 4349 }, { "epoch": 0.43002248968193163, "grad_norm": 4.660188510926421, "learning_rate": 1.2717601809695553e-07, "loss": 0.6934, "step": 4350 }, { "epoch": 0.4301213454266861, "grad_norm": 5.3953222748973975, "learning_rate": 1.2714520069330857e-07, "loss": 0.7623, "step": 4351 }, { "epoch": 0.43022020117144055, "grad_norm": 4.22370307234915, "learning_rate": 1.2711438050632168e-07, "loss": 0.6401, "step": 4352 }, { "epoch": 0.43031905691619504, "grad_norm": 7.588591804898984, "learning_rate": 1.2708355753915506e-07, "loss": 0.7086, "step": 4353 }, { "epoch": 0.43041791266094953, "grad_norm": 3.0145203162230825, "learning_rate": 1.2705273179496914e-07, "loss": 0.6347, "step": 4354 }, { "epoch": 0.43051676840570396, "grad_norm": 3.4919977204765695, "learning_rate": 1.270219032769246e-07, "loss": 0.7106, "step": 4355 }, { "epoch": 0.43061562415045845, "grad_norm": 5.285295345279206, "learning_rate": 1.2699107198818249e-07, "loss": 0.7076, "step": 4356 }, { "epoch": 0.4307144798952129, "grad_norm": 4.604014830179218, "learning_rate": 1.2696023793190407e-07, "loss": 0.6514, "step": 4357 }, { "epoch": 0.4308133356399674, "grad_norm": 6.4999279368531955, "learning_rate": 1.2692940111125096e-07, "loss": 0.7201, "step": 4358 }, { "epoch": 0.43091219138472187, "grad_norm": 30.077677883730775, "learning_rate": 1.2689856152938503e-07, "loss": 0.7752, "step": 4359 }, { "epoch": 0.4310110471294763, "grad_norm": 17.40373359824735, "learning_rate": 1.2686771918946833e-07, "loss": 0.7012, "step": 4360 }, { "epoch": 0.4311099028742308, "grad_norm": 2.7744416624335084, "learning_rate": 1.2683687409466343e-07, "loss": 0.6608, "step": 4361 }, { "epoch": 0.4312087586189852, "grad_norm": 4.951734798684461, "learning_rate": 1.268060262481329e-07, "loss": 0.7203, "step": 4362 }, { "epoch": 0.4313076143637397, "grad_norm": 3.7613180023067767, "learning_rate": 1.2677517565303984e-07, "loss": 0.6602, "step": 4363 }, { "epoch": 0.4314064701084942, "grad_norm": 4.799613210065259, "learning_rate": 1.2674432231254744e-07, "loss": 0.8099, "step": 4364 }, { "epoch": 0.43150532585324863, "grad_norm": 6.769777888943804, "learning_rate": 1.2671346622981936e-07, "loss": 0.6769, "step": 4365 }, { "epoch": 0.4316041815980031, "grad_norm": 4.394752711791347, "learning_rate": 1.266826074080193e-07, "loss": 0.6921, "step": 4366 }, { "epoch": 0.43170303734275756, "grad_norm": 7.29462833045304, "learning_rate": 1.2665174585031148e-07, "loss": 0.6879, "step": 4367 }, { "epoch": 0.43180189308751205, "grad_norm": 5.386676072898909, "learning_rate": 1.2662088155986024e-07, "loss": 0.6425, "step": 4368 }, { "epoch": 0.43190074883226653, "grad_norm": 5.091814726763433, "learning_rate": 1.265900145398303e-07, "loss": 0.7103, "step": 4369 }, { "epoch": 0.43199960457702097, "grad_norm": 8.75232455834141, "learning_rate": 1.265591447933866e-07, "loss": 0.649, "step": 4370 }, { "epoch": 0.43209846032177546, "grad_norm": 4.667663277345671, "learning_rate": 1.265282723236944e-07, "loss": 0.7344, "step": 4371 }, { "epoch": 0.4321973160665299, "grad_norm": 3.4294750830664427, "learning_rate": 1.2649739713391915e-07, "loss": 0.7673, "step": 4372 }, { "epoch": 0.4322961718112844, "grad_norm": 8.262375070432439, "learning_rate": 1.2646651922722672e-07, "loss": 0.7281, "step": 4373 }, { "epoch": 0.43239502755603887, "grad_norm": 4.5168429854814125, "learning_rate": 1.2643563860678313e-07, "loss": 0.7319, "step": 4374 }, { "epoch": 0.4324938833007933, "grad_norm": 3.5130406942244248, "learning_rate": 1.2640475527575474e-07, "loss": 0.7099, "step": 4375 }, { "epoch": 0.4325927390455478, "grad_norm": 4.997817930470844, "learning_rate": 1.263738692373082e-07, "loss": 0.8305, "step": 4376 }, { "epoch": 0.4326915947903022, "grad_norm": 4.400517902086195, "learning_rate": 1.2634298049461043e-07, "loss": 0.6829, "step": 4377 }, { "epoch": 0.4327904505350567, "grad_norm": 5.3044053964218225, "learning_rate": 1.263120890508286e-07, "loss": 0.7341, "step": 4378 }, { "epoch": 0.4328893062798112, "grad_norm": 5.849384098437953, "learning_rate": 1.2628119490913013e-07, "loss": 0.6987, "step": 4379 }, { "epoch": 0.43298816202456564, "grad_norm": 5.492677717683169, "learning_rate": 1.2625029807268287e-07, "loss": 0.6919, "step": 4380 }, { "epoch": 0.4330870177693201, "grad_norm": 6.8367239311903605, "learning_rate": 1.2621939854465472e-07, "loss": 0.6621, "step": 4381 }, { "epoch": 0.43318587351407456, "grad_norm": 4.466951493405875, "learning_rate": 1.26188496328214e-07, "loss": 0.6858, "step": 4382 }, { "epoch": 0.43328472925882905, "grad_norm": 4.538675015767456, "learning_rate": 1.2615759142652927e-07, "loss": 0.7062, "step": 4383 }, { "epoch": 0.43338358500358354, "grad_norm": 8.83538273287972, "learning_rate": 1.2612668384276945e-07, "loss": 0.5815, "step": 4384 }, { "epoch": 0.43348244074833797, "grad_norm": 2.744002988402256, "learning_rate": 1.2609577358010358e-07, "loss": 0.7475, "step": 4385 }, { "epoch": 0.43358129649309246, "grad_norm": 6.486717477469218, "learning_rate": 1.2606486064170102e-07, "loss": 0.7697, "step": 4386 }, { "epoch": 0.4336801522378469, "grad_norm": 4.070926237751014, "learning_rate": 1.2603394503073156e-07, "loss": 0.8002, "step": 4387 }, { "epoch": 0.4337790079826014, "grad_norm": 4.206049778302832, "learning_rate": 1.26003026750365e-07, "loss": 0.6518, "step": 4388 }, { "epoch": 0.4338778637273559, "grad_norm": 3.865201676393821, "learning_rate": 1.2597210580377166e-07, "loss": 0.6699, "step": 4389 }, { "epoch": 0.4339767194721103, "grad_norm": 3.8879430037055176, "learning_rate": 1.2594118219412198e-07, "loss": 0.7246, "step": 4390 }, { "epoch": 0.4340755752168648, "grad_norm": 11.197237523245033, "learning_rate": 1.2591025592458672e-07, "loss": 0.7116, "step": 4391 }, { "epoch": 0.4341744309616193, "grad_norm": 4.605759831314139, "learning_rate": 1.258793269983369e-07, "loss": 0.7276, "step": 4392 }, { "epoch": 0.4342732867063737, "grad_norm": 3.5941489036446708, "learning_rate": 1.2584839541854387e-07, "loss": 0.7143, "step": 4393 }, { "epoch": 0.4343721424511282, "grad_norm": 5.184211638820875, "learning_rate": 1.2581746118837915e-07, "loss": 0.8074, "step": 4394 }, { "epoch": 0.43447099819588264, "grad_norm": 6.022482001726597, "learning_rate": 1.2578652431101465e-07, "loss": 0.7228, "step": 4395 }, { "epoch": 0.43456985394063713, "grad_norm": 2.749775118282979, "learning_rate": 1.2575558478962246e-07, "loss": 0.7878, "step": 4396 }, { "epoch": 0.4346687096853916, "grad_norm": 3.283533919143582, "learning_rate": 1.2572464262737493e-07, "loss": 0.7669, "step": 4397 }, { "epoch": 0.43476756543014605, "grad_norm": 4.422052847420881, "learning_rate": 1.256936978274448e-07, "loss": 0.622, "step": 4398 }, { "epoch": 0.43486642117490054, "grad_norm": 3.458444463914515, "learning_rate": 1.2566275039300497e-07, "loss": 0.6919, "step": 4399 }, { "epoch": 0.434965276919655, "grad_norm": 3.2184056055563612, "learning_rate": 1.2563180032722864e-07, "loss": 0.7163, "step": 4400 }, { "epoch": 0.43506413266440946, "grad_norm": 5.577293245462467, "learning_rate": 1.2560084763328925e-07, "loss": 0.7796, "step": 4401 }, { "epoch": 0.43516298840916395, "grad_norm": 4.979559465331439, "learning_rate": 1.2556989231436062e-07, "loss": 0.7381, "step": 4402 }, { "epoch": 0.4352618441539184, "grad_norm": 4.8979629830041445, "learning_rate": 1.255389343736167e-07, "loss": 0.773, "step": 4403 }, { "epoch": 0.4353606998986729, "grad_norm": 4.96966890020378, "learning_rate": 1.2550797381423175e-07, "loss": 0.7652, "step": 4404 }, { "epoch": 0.4354595556434273, "grad_norm": 10.278301396615241, "learning_rate": 1.2547701063938037e-07, "loss": 0.7334, "step": 4405 }, { "epoch": 0.4355584113881818, "grad_norm": 3.0367457589605977, "learning_rate": 1.2544604485223738e-07, "loss": 0.6099, "step": 4406 }, { "epoch": 0.4356572671329363, "grad_norm": 4.239197505564717, "learning_rate": 1.2541507645597785e-07, "loss": 0.734, "step": 4407 }, { "epoch": 0.4357561228776907, "grad_norm": 8.290412218083139, "learning_rate": 1.253841054537771e-07, "loss": 0.6962, "step": 4408 }, { "epoch": 0.4358549786224452, "grad_norm": 3.9414910788371187, "learning_rate": 1.253531318488108e-07, "loss": 0.7629, "step": 4409 }, { "epoch": 0.43595383436719964, "grad_norm": 4.228695858547385, "learning_rate": 1.2532215564425484e-07, "loss": 0.6836, "step": 4410 }, { "epoch": 0.43605269011195413, "grad_norm": 4.342870926769546, "learning_rate": 1.2529117684328529e-07, "loss": 0.6812, "step": 4411 }, { "epoch": 0.4361515458567086, "grad_norm": 8.621313403882251, "learning_rate": 1.2526019544907865e-07, "loss": 0.6475, "step": 4412 }, { "epoch": 0.43625040160146306, "grad_norm": 3.2162120902575064, "learning_rate": 1.2522921146481157e-07, "loss": 0.7607, "step": 4413 }, { "epoch": 0.43634925734621755, "grad_norm": 24.510036779593257, "learning_rate": 1.2519822489366102e-07, "loss": 0.6977, "step": 4414 }, { "epoch": 0.436448113090972, "grad_norm": 4.100377435496341, "learning_rate": 1.2516723573880424e-07, "loss": 0.6785, "step": 4415 }, { "epoch": 0.43654696883572647, "grad_norm": 3.640739682116871, "learning_rate": 1.2513624400341865e-07, "loss": 0.7524, "step": 4416 }, { "epoch": 0.43664582458048096, "grad_norm": 3.110513398577446, "learning_rate": 1.25105249690682e-07, "loss": 0.6609, "step": 4417 }, { "epoch": 0.4367446803252354, "grad_norm": 8.1246429261824, "learning_rate": 1.2507425280377237e-07, "loss": 0.6467, "step": 4418 }, { "epoch": 0.4368435360699899, "grad_norm": 3.8303794823742066, "learning_rate": 1.2504325334586794e-07, "loss": 0.6186, "step": 4419 }, { "epoch": 0.4369423918147443, "grad_norm": 5.064371279810979, "learning_rate": 1.2501225132014732e-07, "loss": 0.7728, "step": 4420 }, { "epoch": 0.4370412475594988, "grad_norm": 4.406913377961118, "learning_rate": 1.2498124672978926e-07, "loss": 0.694, "step": 4421 }, { "epoch": 0.4371401033042533, "grad_norm": 8.664113637523895, "learning_rate": 1.249502395779729e-07, "loss": 0.7107, "step": 4422 }, { "epoch": 0.4372389590490077, "grad_norm": 3.413037520587641, "learning_rate": 1.2491922986787744e-07, "loss": 0.6398, "step": 4423 }, { "epoch": 0.4373378147937622, "grad_norm": 3.1169176984800404, "learning_rate": 1.2488821760268257e-07, "loss": 0.7062, "step": 4424 }, { "epoch": 0.43743667053851665, "grad_norm": 7.6481769376041955, "learning_rate": 1.2485720278556817e-07, "loss": 0.7161, "step": 4425 }, { "epoch": 0.43753552628327114, "grad_norm": 3.7604710815942957, "learning_rate": 1.2482618541971421e-07, "loss": 0.6928, "step": 4426 }, { "epoch": 0.4376343820280256, "grad_norm": 4.666300995906099, "learning_rate": 1.247951655083012e-07, "loss": 0.7924, "step": 4427 }, { "epoch": 0.43773323777278006, "grad_norm": 5.155820412847689, "learning_rate": 1.247641430545097e-07, "loss": 0.6049, "step": 4428 }, { "epoch": 0.43783209351753455, "grad_norm": 4.287211360862099, "learning_rate": 1.2473311806152063e-07, "loss": 0.8135, "step": 4429 }, { "epoch": 0.437930949262289, "grad_norm": 5.266649361322913, "learning_rate": 1.2470209053251512e-07, "loss": 0.7076, "step": 4430 }, { "epoch": 0.43802980500704347, "grad_norm": 15.456491604262355, "learning_rate": 1.2467106047067463e-07, "loss": 0.7349, "step": 4431 }, { "epoch": 0.43812866075179796, "grad_norm": 6.295204589479301, "learning_rate": 1.2464002787918081e-07, "loss": 0.746, "step": 4432 }, { "epoch": 0.4382275164965524, "grad_norm": 4.7418785713357785, "learning_rate": 1.2460899276121562e-07, "loss": 0.7396, "step": 4433 }, { "epoch": 0.4383263722413069, "grad_norm": 3.419410270782434, "learning_rate": 1.2457795511996121e-07, "loss": 0.66, "step": 4434 }, { "epoch": 0.4384252279860613, "grad_norm": 3.9940326336933087, "learning_rate": 1.2454691495860007e-07, "loss": 0.7172, "step": 4435 }, { "epoch": 0.4385240837308158, "grad_norm": 5.622727285437912, "learning_rate": 1.2451587228031487e-07, "loss": 0.6655, "step": 4436 }, { "epoch": 0.4386229394755703, "grad_norm": 5.53886105315408, "learning_rate": 1.244848270882886e-07, "loss": 0.6859, "step": 4437 }, { "epoch": 0.43872179522032473, "grad_norm": 3.9429385985391225, "learning_rate": 1.244537793857045e-07, "loss": 0.6745, "step": 4438 }, { "epoch": 0.4388206509650792, "grad_norm": 5.9124074971423966, "learning_rate": 1.2442272917574602e-07, "loss": 0.784, "step": 4439 }, { "epoch": 0.43891950670983365, "grad_norm": 4.870215060491875, "learning_rate": 1.2439167646159697e-07, "loss": 0.6839, "step": 4440 }, { "epoch": 0.43901836245458814, "grad_norm": 4.816482647965505, "learning_rate": 1.2436062124644126e-07, "loss": 0.6599, "step": 4441 }, { "epoch": 0.43911721819934263, "grad_norm": 8.201739083063268, "learning_rate": 1.2432956353346323e-07, "loss": 0.6821, "step": 4442 }, { "epoch": 0.43921607394409706, "grad_norm": 4.299313184498597, "learning_rate": 1.242985033258473e-07, "loss": 0.731, "step": 4443 }, { "epoch": 0.43931492968885155, "grad_norm": 5.105167828321615, "learning_rate": 1.242674406267783e-07, "loss": 0.7029, "step": 4444 }, { "epoch": 0.439413785433606, "grad_norm": 7.920638463598698, "learning_rate": 1.242363754394412e-07, "loss": 0.7742, "step": 4445 }, { "epoch": 0.4395126411783605, "grad_norm": 9.59679063558055, "learning_rate": 1.2420530776702136e-07, "loss": 0.6812, "step": 4446 }, { "epoch": 0.43961149692311496, "grad_norm": 8.216441295033869, "learning_rate": 1.2417423761270424e-07, "loss": 0.8131, "step": 4447 }, { "epoch": 0.4397103526678694, "grad_norm": 6.585632555768356, "learning_rate": 1.2414316497967565e-07, "loss": 0.7246, "step": 4448 }, { "epoch": 0.4398092084126239, "grad_norm": 17.450940592278886, "learning_rate": 1.2411208987112164e-07, "loss": 0.7396, "step": 4449 }, { "epoch": 0.4399080641573783, "grad_norm": 3.6426803671272485, "learning_rate": 1.2408101229022848e-07, "loss": 0.7118, "step": 4450 }, { "epoch": 0.4400069199021328, "grad_norm": 7.878755847522421, "learning_rate": 1.2404993224018272e-07, "loss": 0.7433, "step": 4451 }, { "epoch": 0.4401057756468873, "grad_norm": 4.024591844987845, "learning_rate": 1.2401884972417122e-07, "loss": 0.755, "step": 4452 }, { "epoch": 0.44020463139164173, "grad_norm": 4.380316553528367, "learning_rate": 1.2398776474538092e-07, "loss": 0.6621, "step": 4453 }, { "epoch": 0.4403034871363962, "grad_norm": 3.4616190007493297, "learning_rate": 1.2395667730699923e-07, "loss": 0.5833, "step": 4454 }, { "epoch": 0.44040234288115065, "grad_norm": 5.212395189633108, "learning_rate": 1.2392558741221364e-07, "loss": 0.7293, "step": 4455 }, { "epoch": 0.44050119862590514, "grad_norm": 4.823604756138332, "learning_rate": 1.2389449506421199e-07, "loss": 0.696, "step": 4456 }, { "epoch": 0.44060005437065963, "grad_norm": 3.7280142651962924, "learning_rate": 1.2386340026618235e-07, "loss": 0.7675, "step": 4457 }, { "epoch": 0.44069891011541407, "grad_norm": 6.458838267961823, "learning_rate": 1.2383230302131305e-07, "loss": 0.6742, "step": 4458 }, { "epoch": 0.44079776586016856, "grad_norm": 4.45332170936143, "learning_rate": 1.2380120333279258e-07, "loss": 0.6878, "step": 4459 }, { "epoch": 0.440896621604923, "grad_norm": 6.764293359217026, "learning_rate": 1.2377010120380984e-07, "loss": 0.78, "step": 4460 }, { "epoch": 0.4409954773496775, "grad_norm": 4.89660273082407, "learning_rate": 1.2373899663755382e-07, "loss": 0.7385, "step": 4461 }, { "epoch": 0.44109433309443197, "grad_norm": 16.090459728839996, "learning_rate": 1.237078896372139e-07, "loss": 0.6731, "step": 4462 }, { "epoch": 0.4411931888391864, "grad_norm": 5.6724518325023965, "learning_rate": 1.236767802059796e-07, "loss": 0.6869, "step": 4463 }, { "epoch": 0.4412920445839409, "grad_norm": 5.453777922367036, "learning_rate": 1.2364566834704074e-07, "loss": 0.7756, "step": 4464 }, { "epoch": 0.4413909003286953, "grad_norm": 2.9047569178020227, "learning_rate": 1.2361455406358738e-07, "loss": 0.6542, "step": 4465 }, { "epoch": 0.4414897560734498, "grad_norm": 4.968953442224472, "learning_rate": 1.2358343735880988e-07, "loss": 0.7105, "step": 4466 }, { "epoch": 0.4415886118182043, "grad_norm": 10.090350201393843, "learning_rate": 1.235523182358987e-07, "loss": 0.7127, "step": 4467 }, { "epoch": 0.44168746756295874, "grad_norm": 4.329821831988494, "learning_rate": 1.2352119669804476e-07, "loss": 0.7074, "step": 4468 }, { "epoch": 0.4417863233077132, "grad_norm": 7.269967251474778, "learning_rate": 1.2349007274843905e-07, "loss": 0.7071, "step": 4469 }, { "epoch": 0.4418851790524677, "grad_norm": 4.239048571706389, "learning_rate": 1.2345894639027285e-07, "loss": 0.6525, "step": 4470 }, { "epoch": 0.44198403479722215, "grad_norm": 5.445068191429476, "learning_rate": 1.2342781762673777e-07, "loss": 0.8434, "step": 4471 }, { "epoch": 0.44208289054197664, "grad_norm": 8.43378628827867, "learning_rate": 1.2339668646102558e-07, "loss": 0.6469, "step": 4472 }, { "epoch": 0.44218174628673107, "grad_norm": 4.462925830105942, "learning_rate": 1.2336555289632832e-07, "loss": 0.7454, "step": 4473 }, { "epoch": 0.44228060203148556, "grad_norm": 4.1501502401287915, "learning_rate": 1.2333441693583825e-07, "loss": 0.712, "step": 4474 }, { "epoch": 0.44237945777624005, "grad_norm": 5.080971815573156, "learning_rate": 1.2330327858274795e-07, "loss": 0.6216, "step": 4475 }, { "epoch": 0.4424783135209945, "grad_norm": 3.5525067456010584, "learning_rate": 1.2327213784025019e-07, "loss": 0.7316, "step": 4476 }, { "epoch": 0.44257716926574897, "grad_norm": 3.7901045466999794, "learning_rate": 1.2324099471153797e-07, "loss": 0.8385, "step": 4477 }, { "epoch": 0.4426760250105034, "grad_norm": 4.829630274722525, "learning_rate": 1.2320984919980457e-07, "loss": 0.7073, "step": 4478 }, { "epoch": 0.4427748807552579, "grad_norm": 6.838625149691489, "learning_rate": 1.2317870130824352e-07, "loss": 0.6746, "step": 4479 }, { "epoch": 0.4428737365000124, "grad_norm": 3.752519521585151, "learning_rate": 1.2314755104004854e-07, "loss": 0.7264, "step": 4480 }, { "epoch": 0.4429725922447668, "grad_norm": 5.3986834053287005, "learning_rate": 1.2311639839841364e-07, "loss": 0.6827, "step": 4481 }, { "epoch": 0.4430714479895213, "grad_norm": 4.752032787902562, "learning_rate": 1.230852433865331e-07, "loss": 0.7504, "step": 4482 }, { "epoch": 0.44317030373427574, "grad_norm": 3.233549496532395, "learning_rate": 1.2305408600760136e-07, "loss": 0.6811, "step": 4483 }, { "epoch": 0.44326915947903023, "grad_norm": 3.3578525709958003, "learning_rate": 1.230229262648132e-07, "loss": 0.816, "step": 4484 }, { "epoch": 0.4433680152237847, "grad_norm": 4.017067039483065, "learning_rate": 1.2299176416136355e-07, "loss": 0.6925, "step": 4485 }, { "epoch": 0.44346687096853915, "grad_norm": 4.069350476945201, "learning_rate": 1.2296059970044766e-07, "loss": 0.6968, "step": 4486 }, { "epoch": 0.44356572671329364, "grad_norm": 3.103115312144985, "learning_rate": 1.2292943288526093e-07, "loss": 0.7901, "step": 4487 }, { "epoch": 0.4436645824580481, "grad_norm": 4.215047374448014, "learning_rate": 1.228982637189991e-07, "loss": 0.6818, "step": 4488 }, { "epoch": 0.44376343820280256, "grad_norm": 6.348230034396298, "learning_rate": 1.228670922048581e-07, "loss": 0.7571, "step": 4489 }, { "epoch": 0.44386229394755705, "grad_norm": 5.590526277076746, "learning_rate": 1.2283591834603408e-07, "loss": 0.7182, "step": 4490 }, { "epoch": 0.4439611496923115, "grad_norm": 4.273379647594977, "learning_rate": 1.2280474214572353e-07, "loss": 0.569, "step": 4491 }, { "epoch": 0.444060005437066, "grad_norm": 3.428113293724736, "learning_rate": 1.2277356360712303e-07, "loss": 0.6703, "step": 4492 }, { "epoch": 0.4441588611818204, "grad_norm": 5.759277166479194, "learning_rate": 1.2274238273342953e-07, "loss": 0.7263, "step": 4493 }, { "epoch": 0.4442577169265749, "grad_norm": 3.9582515686031257, "learning_rate": 1.2271119952784017e-07, "loss": 0.6925, "step": 4494 }, { "epoch": 0.4443565726713294, "grad_norm": 5.324509422002788, "learning_rate": 1.2268001399355231e-07, "loss": 0.6739, "step": 4495 }, { "epoch": 0.4444554284160838, "grad_norm": 4.695072901320314, "learning_rate": 1.2264882613376358e-07, "loss": 0.7827, "step": 4496 }, { "epoch": 0.4445542841608383, "grad_norm": 5.140840930603065, "learning_rate": 1.2261763595167182e-07, "loss": 0.7352, "step": 4497 }, { "epoch": 0.44465313990559274, "grad_norm": 6.9231287155565076, "learning_rate": 1.2258644345047507e-07, "loss": 0.6598, "step": 4498 }, { "epoch": 0.44475199565034723, "grad_norm": 3.56066444947844, "learning_rate": 1.225552486333718e-07, "loss": 0.7522, "step": 4499 }, { "epoch": 0.4448508513951017, "grad_norm": 5.901764905112503, "learning_rate": 1.2252405150356042e-07, "loss": 0.701, "step": 4500 }, { "epoch": 0.44494970713985615, "grad_norm": 4.074530756891591, "learning_rate": 1.2249285206423986e-07, "loss": 0.7118, "step": 4501 }, { "epoch": 0.44504856288461064, "grad_norm": 3.0701122737352353, "learning_rate": 1.2246165031860912e-07, "loss": 0.7684, "step": 4502 }, { "epoch": 0.4451474186293651, "grad_norm": 4.945021383246274, "learning_rate": 1.2243044626986744e-07, "loss": 0.7291, "step": 4503 }, { "epoch": 0.44524627437411957, "grad_norm": 3.6497231508580543, "learning_rate": 1.223992399212144e-07, "loss": 0.7627, "step": 4504 }, { "epoch": 0.44534513011887406, "grad_norm": 25.357354651331907, "learning_rate": 1.2236803127584968e-07, "loss": 0.7924, "step": 4505 }, { "epoch": 0.4454439858636285, "grad_norm": 4.18909074711267, "learning_rate": 1.2233682033697332e-07, "loss": 0.6177, "step": 4506 }, { "epoch": 0.445542841608383, "grad_norm": 3.660823984173389, "learning_rate": 1.223056071077855e-07, "loss": 0.6863, "step": 4507 }, { "epoch": 0.4456416973531374, "grad_norm": 3.5894453950617353, "learning_rate": 1.2227439159148675e-07, "loss": 0.6828, "step": 4508 }, { "epoch": 0.4457405530978919, "grad_norm": 6.312791248000489, "learning_rate": 1.222431737912777e-07, "loss": 0.6865, "step": 4509 }, { "epoch": 0.4458394088426464, "grad_norm": 11.299661389221923, "learning_rate": 1.2221195371035928e-07, "loss": 0.6815, "step": 4510 }, { "epoch": 0.4459382645874008, "grad_norm": 3.0943843287373647, "learning_rate": 1.2218073135193266e-07, "loss": 0.7197, "step": 4511 }, { "epoch": 0.4460371203321553, "grad_norm": 11.010272172576105, "learning_rate": 1.2214950671919924e-07, "loss": 0.6673, "step": 4512 }, { "epoch": 0.44613597607690975, "grad_norm": 4.146332121678589, "learning_rate": 1.2211827981536063e-07, "loss": 0.6509, "step": 4513 }, { "epoch": 0.44623483182166424, "grad_norm": 6.116564283184518, "learning_rate": 1.2208705064361867e-07, "loss": 0.6971, "step": 4514 }, { "epoch": 0.4463336875664187, "grad_norm": 3.906039076014608, "learning_rate": 1.220558192071755e-07, "loss": 0.6471, "step": 4515 }, { "epoch": 0.44643254331117316, "grad_norm": 3.4786839706551143, "learning_rate": 1.2202458550923341e-07, "loss": 0.737, "step": 4516 }, { "epoch": 0.44653139905592765, "grad_norm": 8.249372936557851, "learning_rate": 1.2199334955299493e-07, "loss": 0.7722, "step": 4517 }, { "epoch": 0.4466302548006821, "grad_norm": 16.98135624043647, "learning_rate": 1.219621113416629e-07, "loss": 0.7517, "step": 4518 }, { "epoch": 0.44672911054543657, "grad_norm": 5.345593157287908, "learning_rate": 1.219308708784403e-07, "loss": 0.7465, "step": 4519 }, { "epoch": 0.44682796629019106, "grad_norm": 3.6997305097740543, "learning_rate": 1.2189962816653044e-07, "loss": 0.7379, "step": 4520 }, { "epoch": 0.4469268220349455, "grad_norm": 5.59564178855951, "learning_rate": 1.2186838320913673e-07, "loss": 0.7738, "step": 4521 }, { "epoch": 0.4470256777797, "grad_norm": 4.913077706077855, "learning_rate": 1.218371360094629e-07, "loss": 0.6352, "step": 4522 }, { "epoch": 0.4471245335244544, "grad_norm": 4.751800492601646, "learning_rate": 1.218058865707129e-07, "loss": 0.7764, "step": 4523 }, { "epoch": 0.4472233892692089, "grad_norm": 4.640952698944985, "learning_rate": 1.2177463489609088e-07, "loss": 0.7188, "step": 4524 }, { "epoch": 0.4473222450139634, "grad_norm": 5.94554685699758, "learning_rate": 1.2174338098880123e-07, "loss": 0.7682, "step": 4525 }, { "epoch": 0.4474211007587178, "grad_norm": 3.2151806624893844, "learning_rate": 1.2171212485204862e-07, "loss": 0.7857, "step": 4526 }, { "epoch": 0.4475199565034723, "grad_norm": 4.497718292386858, "learning_rate": 1.216808664890379e-07, "loss": 0.7493, "step": 4527 }, { "epoch": 0.44761881224822675, "grad_norm": 4.801821268629588, "learning_rate": 1.2164960590297413e-07, "loss": 0.6905, "step": 4528 }, { "epoch": 0.44771766799298124, "grad_norm": 2.7608439931811604, "learning_rate": 1.2161834309706261e-07, "loss": 0.6767, "step": 4529 }, { "epoch": 0.4478165237377357, "grad_norm": 3.2188912957496405, "learning_rate": 1.2158707807450892e-07, "loss": 0.6571, "step": 4530 }, { "epoch": 0.44791537948249016, "grad_norm": 14.21015777898158, "learning_rate": 1.215558108385188e-07, "loss": 0.6582, "step": 4531 }, { "epoch": 0.44801423522724465, "grad_norm": 41.42054385999666, "learning_rate": 1.2152454139229823e-07, "loss": 0.6809, "step": 4532 }, { "epoch": 0.4481130909719991, "grad_norm": 2.881791183891332, "learning_rate": 1.2149326973905346e-07, "loss": 0.6467, "step": 4533 }, { "epoch": 0.4482119467167536, "grad_norm": 2.9942151798850047, "learning_rate": 1.2146199588199094e-07, "loss": 0.6783, "step": 4534 }, { "epoch": 0.44831080246150806, "grad_norm": 4.9079634483442, "learning_rate": 1.214307198243173e-07, "loss": 0.715, "step": 4535 }, { "epoch": 0.4484096582062625, "grad_norm": 14.526484974452057, "learning_rate": 1.2139944156923946e-07, "loss": 0.685, "step": 4536 }, { "epoch": 0.448508513951017, "grad_norm": 4.034172284609909, "learning_rate": 1.213681611199646e-07, "loss": 0.7095, "step": 4537 }, { "epoch": 0.4486073696957714, "grad_norm": 3.6479634060739947, "learning_rate": 1.213368784797e-07, "loss": 0.6842, "step": 4538 }, { "epoch": 0.4487062254405259, "grad_norm": 9.189295409900012, "learning_rate": 1.2130559365165328e-07, "loss": 0.696, "step": 4539 }, { "epoch": 0.4488050811852804, "grad_norm": 5.892721199056226, "learning_rate": 1.2127430663903215e-07, "loss": 0.7006, "step": 4540 }, { "epoch": 0.44890393693003483, "grad_norm": 4.371711302071998, "learning_rate": 1.2124301744504472e-07, "loss": 0.6916, "step": 4541 }, { "epoch": 0.4490027926747893, "grad_norm": 3.4784343338603314, "learning_rate": 1.2121172607289923e-07, "loss": 0.6284, "step": 4542 }, { "epoch": 0.44910164841954375, "grad_norm": 3.8110529640568935, "learning_rate": 1.2118043252580408e-07, "loss": 0.6914, "step": 4543 }, { "epoch": 0.44920050416429824, "grad_norm": 3.172560418074783, "learning_rate": 1.2114913680696804e-07, "loss": 0.7162, "step": 4544 }, { "epoch": 0.44929935990905273, "grad_norm": 5.610711696262875, "learning_rate": 1.2111783891959996e-07, "loss": 0.7484, "step": 4545 }, { "epoch": 0.44939821565380716, "grad_norm": 5.254378421284161, "learning_rate": 1.2108653886690905e-07, "loss": 0.6584, "step": 4546 }, { "epoch": 0.44949707139856165, "grad_norm": 6.136010070680098, "learning_rate": 1.2105523665210462e-07, "loss": 0.8016, "step": 4547 }, { "epoch": 0.44959592714331614, "grad_norm": 5.130816913143164, "learning_rate": 1.2102393227839625e-07, "loss": 0.7674, "step": 4548 }, { "epoch": 0.4496947828880706, "grad_norm": 3.228551260776028, "learning_rate": 1.2099262574899375e-07, "loss": 0.7629, "step": 4549 }, { "epoch": 0.44979363863282507, "grad_norm": 3.408691378054981, "learning_rate": 1.2096131706710714e-07, "loss": 0.6306, "step": 4550 }, { "epoch": 0.4498924943775795, "grad_norm": 4.5897149865060465, "learning_rate": 1.2093000623594667e-07, "loss": 0.8141, "step": 4551 }, { "epoch": 0.449991350122334, "grad_norm": 17.836618983725316, "learning_rate": 1.208986932587228e-07, "loss": 0.7886, "step": 4552 }, { "epoch": 0.4500902058670885, "grad_norm": 7.362429932192727, "learning_rate": 1.208673781386462e-07, "loss": 0.7546, "step": 4553 }, { "epoch": 0.4501890616118429, "grad_norm": 8.47689853549441, "learning_rate": 1.208360608789278e-07, "loss": 0.8327, "step": 4554 }, { "epoch": 0.4502879173565974, "grad_norm": 3.49563267163135, "learning_rate": 1.208047414827787e-07, "loss": 0.7024, "step": 4555 }, { "epoch": 0.45038677310135183, "grad_norm": 5.937133594674317, "learning_rate": 1.2077341995341028e-07, "loss": 0.7006, "step": 4556 }, { "epoch": 0.4504856288461063, "grad_norm": 5.930105528725069, "learning_rate": 1.2074209629403406e-07, "loss": 0.6942, "step": 4557 }, { "epoch": 0.4505844845908608, "grad_norm": 5.976429960786998, "learning_rate": 1.207107705078618e-07, "loss": 0.6788, "step": 4558 }, { "epoch": 0.45068334033561525, "grad_norm": 3.4803268648109347, "learning_rate": 1.2067944259810556e-07, "loss": 0.6649, "step": 4559 }, { "epoch": 0.45078219608036973, "grad_norm": 4.685447139675545, "learning_rate": 1.2064811256797754e-07, "loss": 0.7955, "step": 4560 }, { "epoch": 0.45088105182512417, "grad_norm": 3.6730342355862935, "learning_rate": 1.2061678042069013e-07, "loss": 0.7432, "step": 4561 }, { "epoch": 0.45097990756987866, "grad_norm": 6.016444148595582, "learning_rate": 1.20585446159456e-07, "loss": 0.7102, "step": 4562 }, { "epoch": 0.45107876331463315, "grad_norm": 3.7047181913074168, "learning_rate": 1.2055410978748805e-07, "loss": 0.5793, "step": 4563 }, { "epoch": 0.4511776190593876, "grad_norm": 5.275003338565461, "learning_rate": 1.2052277130799933e-07, "loss": 0.693, "step": 4564 }, { "epoch": 0.45127647480414207, "grad_norm": 6.697750758089831, "learning_rate": 1.2049143072420313e-07, "loss": 0.7331, "step": 4565 }, { "epoch": 0.4513753305488965, "grad_norm": 8.751222439336527, "learning_rate": 1.20460088039313e-07, "loss": 0.8122, "step": 4566 }, { "epoch": 0.451474186293651, "grad_norm": 14.816459236312584, "learning_rate": 1.2042874325654264e-07, "loss": 0.6501, "step": 4567 }, { "epoch": 0.4515730420384055, "grad_norm": 3.2785204461265542, "learning_rate": 1.2039739637910603e-07, "loss": 0.5907, "step": 4568 }, { "epoch": 0.4516718977831599, "grad_norm": 3.4539430209643864, "learning_rate": 1.2036604741021723e-07, "loss": 0.7262, "step": 4569 }, { "epoch": 0.4517707535279144, "grad_norm": 10.550135756384824, "learning_rate": 1.2033469635309075e-07, "loss": 0.7679, "step": 4570 }, { "epoch": 0.45186960927266884, "grad_norm": 3.915627998302014, "learning_rate": 1.2030334321094108e-07, "loss": 0.8026, "step": 4571 }, { "epoch": 0.4519684650174233, "grad_norm": 3.4131574715448774, "learning_rate": 1.202719879869831e-07, "loss": 0.6477, "step": 4572 }, { "epoch": 0.4520673207621778, "grad_norm": 3.141244801962473, "learning_rate": 1.2024063068443177e-07, "loss": 0.674, "step": 4573 }, { "epoch": 0.45216617650693225, "grad_norm": 3.614573880719883, "learning_rate": 1.2020927130650233e-07, "loss": 0.6978, "step": 4574 }, { "epoch": 0.45226503225168674, "grad_norm": 9.724336633908523, "learning_rate": 1.2017790985641023e-07, "loss": 0.8219, "step": 4575 }, { "epoch": 0.45236388799644117, "grad_norm": 6.601043032691873, "learning_rate": 1.201465463373711e-07, "loss": 0.8092, "step": 4576 }, { "epoch": 0.45246274374119566, "grad_norm": 8.635980617435617, "learning_rate": 1.2011518075260082e-07, "loss": 0.6653, "step": 4577 }, { "epoch": 0.45256159948595015, "grad_norm": 11.299400332656445, "learning_rate": 1.200838131053155e-07, "loss": 0.6089, "step": 4578 }, { "epoch": 0.4526604552307046, "grad_norm": 3.9505272191270815, "learning_rate": 1.2005244339873143e-07, "loss": 0.6578, "step": 4579 }, { "epoch": 0.4527593109754591, "grad_norm": 3.7730488992274687, "learning_rate": 1.20021071636065e-07, "loss": 0.6204, "step": 4580 }, { "epoch": 0.4528581667202135, "grad_norm": 28.647370673967924, "learning_rate": 1.199896978205331e-07, "loss": 0.7644, "step": 4581 }, { "epoch": 0.452957022464968, "grad_norm": 3.5440750881373666, "learning_rate": 1.199583219553525e-07, "loss": 0.7213, "step": 4582 }, { "epoch": 0.4530558782097225, "grad_norm": 7.096377642374201, "learning_rate": 1.1992694404374043e-07, "loss": 0.7145, "step": 4583 }, { "epoch": 0.4531547339544769, "grad_norm": 3.6802297303889686, "learning_rate": 1.1989556408891412e-07, "loss": 0.7604, "step": 4584 }, { "epoch": 0.4532535896992314, "grad_norm": 4.0278272454309105, "learning_rate": 1.1986418209409124e-07, "loss": 0.752, "step": 4585 }, { "epoch": 0.45335244544398584, "grad_norm": 3.8418144310224744, "learning_rate": 1.198327980624895e-07, "loss": 0.7673, "step": 4586 }, { "epoch": 0.45345130118874033, "grad_norm": 3.92102209897504, "learning_rate": 1.1980141199732688e-07, "loss": 0.7214, "step": 4587 }, { "epoch": 0.4535501569334948, "grad_norm": 3.9674124813499976, "learning_rate": 1.1977002390182155e-07, "loss": 0.7375, "step": 4588 }, { "epoch": 0.45364901267824925, "grad_norm": 3.528700138324538, "learning_rate": 1.1973863377919186e-07, "loss": 0.6958, "step": 4589 }, { "epoch": 0.45374786842300374, "grad_norm": 3.1044076872644277, "learning_rate": 1.1970724163265648e-07, "loss": 0.6193, "step": 4590 }, { "epoch": 0.4538467241677582, "grad_norm": 27.6871821716826, "learning_rate": 1.1967584746543417e-07, "loss": 0.6438, "step": 4591 }, { "epoch": 0.45394557991251266, "grad_norm": 14.786972188457668, "learning_rate": 1.1964445128074394e-07, "loss": 0.6226, "step": 4592 }, { "epoch": 0.45404443565726715, "grad_norm": 5.2853616059093165, "learning_rate": 1.19613053081805e-07, "loss": 0.6831, "step": 4593 }, { "epoch": 0.4541432914020216, "grad_norm": 4.845051184201071, "learning_rate": 1.195816528718368e-07, "loss": 0.743, "step": 4594 }, { "epoch": 0.4542421471467761, "grad_norm": 5.012754119941879, "learning_rate": 1.195502506540589e-07, "loss": 0.6853, "step": 4595 }, { "epoch": 0.4543410028915305, "grad_norm": 12.764780735693048, "learning_rate": 1.1951884643169124e-07, "loss": 0.7892, "step": 4596 }, { "epoch": 0.454439858636285, "grad_norm": 6.09271412284658, "learning_rate": 1.1948744020795376e-07, "loss": 0.6677, "step": 4597 }, { "epoch": 0.4545387143810395, "grad_norm": 3.989347893709507, "learning_rate": 1.1945603198606675e-07, "loss": 0.7573, "step": 4598 }, { "epoch": 0.4546375701257939, "grad_norm": 3.4152608338685235, "learning_rate": 1.1942462176925068e-07, "loss": 0.6939, "step": 4599 }, { "epoch": 0.4547364258705484, "grad_norm": 5.498366042000261, "learning_rate": 1.1939320956072618e-07, "loss": 0.7263, "step": 4600 }, { "epoch": 0.45483528161530284, "grad_norm": 3.3929465033737736, "learning_rate": 1.193617953637141e-07, "loss": 0.768, "step": 4601 }, { "epoch": 0.45493413736005733, "grad_norm": 3.778416689658618, "learning_rate": 1.1933037918143549e-07, "loss": 0.7386, "step": 4602 }, { "epoch": 0.4550329931048118, "grad_norm": 4.082904268829284, "learning_rate": 1.1929896101711166e-07, "loss": 0.6219, "step": 4603 }, { "epoch": 0.45513184884956626, "grad_norm": 3.2315080196937855, "learning_rate": 1.1926754087396407e-07, "loss": 0.6956, "step": 4604 }, { "epoch": 0.45523070459432075, "grad_norm": 5.851598970621131, "learning_rate": 1.1923611875521437e-07, "loss": 0.6969, "step": 4605 }, { "epoch": 0.4553295603390752, "grad_norm": 3.4232129752488762, "learning_rate": 1.1920469466408441e-07, "loss": 0.7825, "step": 4606 }, { "epoch": 0.45542841608382967, "grad_norm": 3.590420034995068, "learning_rate": 1.1917326860379633e-07, "loss": 0.6751, "step": 4607 }, { "epoch": 0.45552727182858416, "grad_norm": 5.7855401702308225, "learning_rate": 1.1914184057757239e-07, "loss": 0.6886, "step": 4608 }, { "epoch": 0.4556261275733386, "grad_norm": 4.2818653453716795, "learning_rate": 1.1911041058863504e-07, "loss": 0.7755, "step": 4609 }, { "epoch": 0.4557249833180931, "grad_norm": 3.9186367535425735, "learning_rate": 1.1907897864020701e-07, "loss": 0.7954, "step": 4610 }, { "epoch": 0.4558238390628475, "grad_norm": 3.808290157360293, "learning_rate": 1.1904754473551116e-07, "loss": 0.7262, "step": 4611 }, { "epoch": 0.455922694807602, "grad_norm": 6.996252744032158, "learning_rate": 1.1901610887777054e-07, "loss": 0.8001, "step": 4612 }, { "epoch": 0.4560215505523565, "grad_norm": 3.681890634758645, "learning_rate": 1.1898467107020844e-07, "loss": 0.6396, "step": 4613 }, { "epoch": 0.4561204062971109, "grad_norm": 4.325822456743447, "learning_rate": 1.189532313160484e-07, "loss": 0.7736, "step": 4614 }, { "epoch": 0.4562192620418654, "grad_norm": 10.453009376026047, "learning_rate": 1.1892178961851407e-07, "loss": 0.7022, "step": 4615 }, { "epoch": 0.45631811778661985, "grad_norm": 2.898088069437076, "learning_rate": 1.1889034598082933e-07, "loss": 0.6342, "step": 4616 }, { "epoch": 0.45641697353137434, "grad_norm": 3.7583073972783856, "learning_rate": 1.1885890040621826e-07, "loss": 0.6708, "step": 4617 }, { "epoch": 0.4565158292761288, "grad_norm": 3.955420137703679, "learning_rate": 1.1882745289790514e-07, "loss": 0.654, "step": 4618 }, { "epoch": 0.45661468502088326, "grad_norm": 3.5779500549970877, "learning_rate": 1.1879600345911445e-07, "loss": 0.6767, "step": 4619 }, { "epoch": 0.45671354076563775, "grad_norm": 3.916301513297369, "learning_rate": 1.1876455209307086e-07, "loss": 0.7384, "step": 4620 }, { "epoch": 0.4568123965103922, "grad_norm": 6.507337440728105, "learning_rate": 1.1873309880299924e-07, "loss": 0.6897, "step": 4621 }, { "epoch": 0.45691125225514667, "grad_norm": 10.425992368475319, "learning_rate": 1.1870164359212468e-07, "loss": 0.7366, "step": 4622 }, { "epoch": 0.45701010799990116, "grad_norm": 4.663593769406385, "learning_rate": 1.1867018646367246e-07, "loss": 0.7409, "step": 4623 }, { "epoch": 0.4571089637446556, "grad_norm": 4.373052096860973, "learning_rate": 1.1863872742086798e-07, "loss": 0.711, "step": 4624 }, { "epoch": 0.4572078194894101, "grad_norm": 3.8927789876540717, "learning_rate": 1.1860726646693697e-07, "loss": 0.6951, "step": 4625 }, { "epoch": 0.4573066752341646, "grad_norm": 3.2092514675880235, "learning_rate": 1.1857580360510527e-07, "loss": 0.7611, "step": 4626 }, { "epoch": 0.457405530978919, "grad_norm": 4.0947723905829605, "learning_rate": 1.1854433883859894e-07, "loss": 0.734, "step": 4627 }, { "epoch": 0.4575043867236735, "grad_norm": 3.503449877220396, "learning_rate": 1.1851287217064414e-07, "loss": 0.6434, "step": 4628 }, { "epoch": 0.45760324246842793, "grad_norm": 20.035457869414834, "learning_rate": 1.1848140360446743e-07, "loss": 0.6888, "step": 4629 }, { "epoch": 0.4577020982131824, "grad_norm": 6.236922508272508, "learning_rate": 1.1844993314329541e-07, "loss": 0.6366, "step": 4630 }, { "epoch": 0.4578009539579369, "grad_norm": 14.72188144394107, "learning_rate": 1.1841846079035487e-07, "loss": 0.6729, "step": 4631 }, { "epoch": 0.45789980970269134, "grad_norm": 3.8204088593688055, "learning_rate": 1.1838698654887287e-07, "loss": 0.6545, "step": 4632 }, { "epoch": 0.45799866544744583, "grad_norm": 2.795099865429774, "learning_rate": 1.1835551042207665e-07, "loss": 0.6779, "step": 4633 }, { "epoch": 0.45809752119220026, "grad_norm": 10.711559894723669, "learning_rate": 1.183240324131936e-07, "loss": 0.6823, "step": 4634 }, { "epoch": 0.45819637693695475, "grad_norm": 3.5246733324598747, "learning_rate": 1.1829255252545132e-07, "loss": 0.6329, "step": 4635 }, { "epoch": 0.45829523268170924, "grad_norm": 5.157053081345313, "learning_rate": 1.1826107076207761e-07, "loss": 0.6977, "step": 4636 }, { "epoch": 0.4583940884264637, "grad_norm": 8.491817674752408, "learning_rate": 1.1822958712630047e-07, "loss": 0.6184, "step": 4637 }, { "epoch": 0.45849294417121816, "grad_norm": 8.31485096379298, "learning_rate": 1.1819810162134804e-07, "loss": 0.7409, "step": 4638 }, { "epoch": 0.4585917999159726, "grad_norm": 4.201103259407058, "learning_rate": 1.1816661425044876e-07, "loss": 0.6651, "step": 4639 }, { "epoch": 0.4586906556607271, "grad_norm": 3.0060358503292166, "learning_rate": 1.1813512501683116e-07, "loss": 0.728, "step": 4640 }, { "epoch": 0.4587895114054816, "grad_norm": 8.804458737932189, "learning_rate": 1.18103633923724e-07, "loss": 0.714, "step": 4641 }, { "epoch": 0.458888367150236, "grad_norm": 6.9604376842802855, "learning_rate": 1.180721409743562e-07, "loss": 0.6234, "step": 4642 }, { "epoch": 0.4589872228949905, "grad_norm": 11.346181740783173, "learning_rate": 1.1804064617195697e-07, "loss": 0.6969, "step": 4643 }, { "epoch": 0.45908607863974493, "grad_norm": 7.896040164038983, "learning_rate": 1.180091495197556e-07, "loss": 0.6288, "step": 4644 }, { "epoch": 0.4591849343844994, "grad_norm": 4.432438131744009, "learning_rate": 1.1797765102098158e-07, "loss": 0.7576, "step": 4645 }, { "epoch": 0.4592837901292539, "grad_norm": 3.816758736867726, "learning_rate": 1.1794615067886462e-07, "loss": 0.6632, "step": 4646 }, { "epoch": 0.45938264587400834, "grad_norm": 2.267710691897828, "learning_rate": 1.1791464849663466e-07, "loss": 0.5286, "step": 4647 }, { "epoch": 0.45948150161876283, "grad_norm": 5.1641920899183384, "learning_rate": 1.1788314447752175e-07, "loss": 0.7829, "step": 4648 }, { "epoch": 0.45958035736351727, "grad_norm": 3.7744862640296892, "learning_rate": 1.1785163862475616e-07, "loss": 0.6891, "step": 4649 }, { "epoch": 0.45967921310827176, "grad_norm": 3.0755400356752034, "learning_rate": 1.1782013094156835e-07, "loss": 0.7477, "step": 4650 }, { "epoch": 0.45977806885302624, "grad_norm": 9.077743933110929, "learning_rate": 1.1778862143118899e-07, "loss": 0.7725, "step": 4651 }, { "epoch": 0.4598769245977807, "grad_norm": 2.912147389523053, "learning_rate": 1.177571100968489e-07, "loss": 0.6833, "step": 4652 }, { "epoch": 0.45997578034253517, "grad_norm": 3.4453638112579648, "learning_rate": 1.1772559694177911e-07, "loss": 0.6483, "step": 4653 }, { "epoch": 0.4600746360872896, "grad_norm": 4.293927106449199, "learning_rate": 1.1769408196921082e-07, "loss": 0.6469, "step": 4654 }, { "epoch": 0.4601734918320441, "grad_norm": 12.590819462184028, "learning_rate": 1.1766256518237545e-07, "loss": 0.7062, "step": 4655 }, { "epoch": 0.4602723475767986, "grad_norm": 4.039522575073549, "learning_rate": 1.1763104658450453e-07, "loss": 0.6753, "step": 4656 }, { "epoch": 0.460371203321553, "grad_norm": 4.950722352279942, "learning_rate": 1.1759952617882987e-07, "loss": 0.7893, "step": 4657 }, { "epoch": 0.4604700590663075, "grad_norm": 4.2828131086538175, "learning_rate": 1.1756800396858341e-07, "loss": 0.6674, "step": 4658 }, { "epoch": 0.46056891481106194, "grad_norm": 6.606696125613661, "learning_rate": 1.1753647995699731e-07, "loss": 0.6599, "step": 4659 }, { "epoch": 0.4606677705558164, "grad_norm": 3.203459937806646, "learning_rate": 1.1750495414730386e-07, "loss": 0.7089, "step": 4660 }, { "epoch": 0.4607666263005709, "grad_norm": 8.774166201620362, "learning_rate": 1.1747342654273557e-07, "loss": 0.6957, "step": 4661 }, { "epoch": 0.46086548204532535, "grad_norm": 4.495767098772353, "learning_rate": 1.1744189714652514e-07, "loss": 0.678, "step": 4662 }, { "epoch": 0.46096433779007984, "grad_norm": 5.635033539579608, "learning_rate": 1.1741036596190546e-07, "loss": 0.7815, "step": 4663 }, { "epoch": 0.46106319353483427, "grad_norm": 5.042411270691449, "learning_rate": 1.1737883299210953e-07, "loss": 0.682, "step": 4664 }, { "epoch": 0.46116204927958876, "grad_norm": 3.692571613269367, "learning_rate": 1.1734729824037069e-07, "loss": 0.7163, "step": 4665 }, { "epoch": 0.46126090502434325, "grad_norm": 3.8647548789410897, "learning_rate": 1.1731576170992225e-07, "loss": 0.6965, "step": 4666 }, { "epoch": 0.4613597607690977, "grad_norm": 7.039948623611418, "learning_rate": 1.1728422340399793e-07, "loss": 0.6622, "step": 4667 }, { "epoch": 0.46145861651385217, "grad_norm": 4.252241589524265, "learning_rate": 1.172526833258314e-07, "loss": 0.6806, "step": 4668 }, { "epoch": 0.4615574722586066, "grad_norm": 6.611354818948827, "learning_rate": 1.1722114147865671e-07, "loss": 0.7299, "step": 4669 }, { "epoch": 0.4616563280033611, "grad_norm": 9.058539266225598, "learning_rate": 1.1718959786570801e-07, "loss": 0.7538, "step": 4670 }, { "epoch": 0.4617551837481156, "grad_norm": 3.378186138624088, "learning_rate": 1.1715805249021964e-07, "loss": 0.6563, "step": 4671 }, { "epoch": 0.46185403949287, "grad_norm": 3.565069229451203, "learning_rate": 1.1712650535542603e-07, "loss": 0.6354, "step": 4672 }, { "epoch": 0.4619528952376245, "grad_norm": 4.844047598409716, "learning_rate": 1.1709495646456196e-07, "loss": 0.7444, "step": 4673 }, { "epoch": 0.46205175098237894, "grad_norm": 3.2531833346651844, "learning_rate": 1.1706340582086227e-07, "loss": 0.7827, "step": 4674 }, { "epoch": 0.46215060672713343, "grad_norm": 10.568667644395836, "learning_rate": 1.1703185342756199e-07, "loss": 0.7442, "step": 4675 }, { "epoch": 0.4622494624718879, "grad_norm": 3.463765321618339, "learning_rate": 1.1700029928789639e-07, "loss": 0.6823, "step": 4676 }, { "epoch": 0.46234831821664235, "grad_norm": 3.9990672789499135, "learning_rate": 1.1696874340510088e-07, "loss": 0.7682, "step": 4677 }, { "epoch": 0.46244717396139684, "grad_norm": 3.536414998583463, "learning_rate": 1.1693718578241105e-07, "loss": 0.6907, "step": 4678 }, { "epoch": 0.4625460297061513, "grad_norm": 3.290131956037447, "learning_rate": 1.1690562642306266e-07, "loss": 0.7729, "step": 4679 }, { "epoch": 0.46264488545090576, "grad_norm": 5.149613395297949, "learning_rate": 1.1687406533029163e-07, "loss": 0.6595, "step": 4680 }, { "epoch": 0.46274374119566025, "grad_norm": 3.128533817498692, "learning_rate": 1.1684250250733412e-07, "loss": 0.7791, "step": 4681 }, { "epoch": 0.4628425969404147, "grad_norm": 3.570379796147385, "learning_rate": 1.1681093795742644e-07, "loss": 0.769, "step": 4682 }, { "epoch": 0.4629414526851692, "grad_norm": 4.088388059245153, "learning_rate": 1.1677937168380503e-07, "loss": 0.6796, "step": 4683 }, { "epoch": 0.4630403084299236, "grad_norm": 3.6741413775561185, "learning_rate": 1.1674780368970656e-07, "loss": 0.6659, "step": 4684 }, { "epoch": 0.4631391641746781, "grad_norm": 3.735292716456063, "learning_rate": 1.1671623397836789e-07, "loss": 0.791, "step": 4685 }, { "epoch": 0.4632380199194326, "grad_norm": 4.008793165795117, "learning_rate": 1.1668466255302598e-07, "loss": 0.5466, "step": 4686 }, { "epoch": 0.463336875664187, "grad_norm": 3.1534999885384485, "learning_rate": 1.1665308941691807e-07, "loss": 0.7017, "step": 4687 }, { "epoch": 0.4634357314089415, "grad_norm": 4.242041194185755, "learning_rate": 1.1662151457328151e-07, "loss": 0.8476, "step": 4688 }, { "epoch": 0.46353458715369594, "grad_norm": 5.349901967760987, "learning_rate": 1.165899380253538e-07, "loss": 0.6427, "step": 4689 }, { "epoch": 0.46363344289845043, "grad_norm": 3.740536799551845, "learning_rate": 1.1655835977637263e-07, "loss": 0.7943, "step": 4690 }, { "epoch": 0.4637322986432049, "grad_norm": 5.445941783149553, "learning_rate": 1.1652677982957593e-07, "loss": 0.6684, "step": 4691 }, { "epoch": 0.46383115438795935, "grad_norm": 3.3083071740962686, "learning_rate": 1.1649519818820175e-07, "loss": 0.687, "step": 4692 }, { "epoch": 0.46393001013271384, "grad_norm": 3.4590813114977434, "learning_rate": 1.1646361485548833e-07, "loss": 0.8196, "step": 4693 }, { "epoch": 0.4640288658774683, "grad_norm": 4.365700732306801, "learning_rate": 1.1643202983467404e-07, "loss": 0.6632, "step": 4694 }, { "epoch": 0.46412772162222277, "grad_norm": 5.749723334242027, "learning_rate": 1.1640044312899751e-07, "loss": 0.7067, "step": 4695 }, { "epoch": 0.46422657736697726, "grad_norm": 11.519415353869658, "learning_rate": 1.1636885474169744e-07, "loss": 0.7793, "step": 4696 }, { "epoch": 0.4643254331117317, "grad_norm": 9.678608984391213, "learning_rate": 1.1633726467601277e-07, "loss": 0.6656, "step": 4697 }, { "epoch": 0.4644242888564862, "grad_norm": 2.7417267428040075, "learning_rate": 1.163056729351826e-07, "loss": 0.679, "step": 4698 }, { "epoch": 0.46452314460124067, "grad_norm": 5.982984539032504, "learning_rate": 1.1627407952244624e-07, "loss": 0.6859, "step": 4699 }, { "epoch": 0.4646220003459951, "grad_norm": 5.555793290449128, "learning_rate": 1.1624248444104303e-07, "loss": 0.7489, "step": 4700 }, { "epoch": 0.4647208560907496, "grad_norm": 9.621755606373215, "learning_rate": 1.1621088769421263e-07, "loss": 0.669, "step": 4701 }, { "epoch": 0.464819711835504, "grad_norm": 5.7329061174706455, "learning_rate": 1.1617928928519487e-07, "loss": 0.752, "step": 4702 }, { "epoch": 0.4649185675802585, "grad_norm": 3.6604994378405573, "learning_rate": 1.1614768921722965e-07, "loss": 0.6597, "step": 4703 }, { "epoch": 0.465017423325013, "grad_norm": 8.609836965523133, "learning_rate": 1.1611608749355706e-07, "loss": 0.7116, "step": 4704 }, { "epoch": 0.46511627906976744, "grad_norm": 17.111637428732216, "learning_rate": 1.1608448411741745e-07, "loss": 0.5876, "step": 4705 }, { "epoch": 0.4652151348145219, "grad_norm": 4.060235593766756, "learning_rate": 1.1605287909205126e-07, "loss": 0.6775, "step": 4706 }, { "epoch": 0.46531399055927636, "grad_norm": 3.2910948919967082, "learning_rate": 1.1602127242069911e-07, "loss": 0.6543, "step": 4707 }, { "epoch": 0.46541284630403085, "grad_norm": 3.7808516791336397, "learning_rate": 1.1598966410660178e-07, "loss": 0.7264, "step": 4708 }, { "epoch": 0.46551170204878534, "grad_norm": 9.427986524593207, "learning_rate": 1.1595805415300029e-07, "loss": 0.5736, "step": 4709 }, { "epoch": 0.46561055779353977, "grad_norm": 5.976630673309605, "learning_rate": 1.1592644256313572e-07, "loss": 0.7481, "step": 4710 }, { "epoch": 0.46570941353829426, "grad_norm": 4.624927364809713, "learning_rate": 1.1589482934024941e-07, "loss": 0.7149, "step": 4711 }, { "epoch": 0.4658082692830487, "grad_norm": 3.4419574358635963, "learning_rate": 1.1586321448758281e-07, "loss": 0.7398, "step": 4712 }, { "epoch": 0.4659071250278032, "grad_norm": 3.120717223096803, "learning_rate": 1.1583159800837758e-07, "loss": 0.5969, "step": 4713 }, { "epoch": 0.46600598077255767, "grad_norm": 4.8453948273489305, "learning_rate": 1.1579997990587549e-07, "loss": 0.6516, "step": 4714 }, { "epoch": 0.4661048365173121, "grad_norm": 3.0943879641097056, "learning_rate": 1.1576836018331857e-07, "loss": 0.729, "step": 4715 }, { "epoch": 0.4662036922620666, "grad_norm": 3.9193846330208775, "learning_rate": 1.1573673884394885e-07, "loss": 0.875, "step": 4716 }, { "epoch": 0.466302548006821, "grad_norm": 6.834003181684879, "learning_rate": 1.1570511589100874e-07, "loss": 0.7031, "step": 4717 }, { "epoch": 0.4664014037515755, "grad_norm": 5.252000842016032, "learning_rate": 1.1567349132774062e-07, "loss": 0.7122, "step": 4718 }, { "epoch": 0.46650025949633, "grad_norm": 4.3698348109257905, "learning_rate": 1.1564186515738719e-07, "loss": 0.678, "step": 4719 }, { "epoch": 0.46659911524108444, "grad_norm": 4.544744341755708, "learning_rate": 1.1561023738319122e-07, "loss": 0.6869, "step": 4720 }, { "epoch": 0.46669797098583893, "grad_norm": 3.9455358074527456, "learning_rate": 1.1557860800839567e-07, "loss": 0.7238, "step": 4721 }, { "epoch": 0.46679682673059336, "grad_norm": 3.0692528455587156, "learning_rate": 1.155469770362437e-07, "loss": 0.67, "step": 4722 }, { "epoch": 0.46689568247534785, "grad_norm": 4.3596911248609445, "learning_rate": 1.1551534446997852e-07, "loss": 0.6617, "step": 4723 }, { "epoch": 0.46699453822010234, "grad_norm": 3.8898571289552053, "learning_rate": 1.1548371031284368e-07, "loss": 0.7197, "step": 4724 }, { "epoch": 0.4670933939648568, "grad_norm": 3.687045836318085, "learning_rate": 1.1545207456808272e-07, "loss": 0.6699, "step": 4725 }, { "epoch": 0.46719224970961126, "grad_norm": 15.0945003114756, "learning_rate": 1.1542043723893944e-07, "loss": 0.8255, "step": 4726 }, { "epoch": 0.4672911054543657, "grad_norm": 6.63233313877866, "learning_rate": 1.153887983286578e-07, "loss": 0.6651, "step": 4727 }, { "epoch": 0.4673899611991202, "grad_norm": 6.088459862920595, "learning_rate": 1.1535715784048188e-07, "loss": 0.5999, "step": 4728 }, { "epoch": 0.4674888169438747, "grad_norm": 46.09721081772401, "learning_rate": 1.1532551577765597e-07, "loss": 0.6397, "step": 4729 }, { "epoch": 0.4675876726886291, "grad_norm": 3.4992431270471203, "learning_rate": 1.1529387214342447e-07, "loss": 0.8203, "step": 4730 }, { "epoch": 0.4676865284333836, "grad_norm": 5.759420848194705, "learning_rate": 1.1526222694103202e-07, "loss": 0.6325, "step": 4731 }, { "epoch": 0.46778538417813803, "grad_norm": 5.698135216003437, "learning_rate": 1.1523058017372331e-07, "loss": 0.6558, "step": 4732 }, { "epoch": 0.4678842399228925, "grad_norm": 4.41892910247054, "learning_rate": 1.1519893184474327e-07, "loss": 0.5974, "step": 4733 }, { "epoch": 0.467983095667647, "grad_norm": 4.127738154456137, "learning_rate": 1.1516728195733695e-07, "loss": 0.6639, "step": 4734 }, { "epoch": 0.46808195141240144, "grad_norm": 4.589122011658872, "learning_rate": 1.1513563051474961e-07, "loss": 0.6775, "step": 4735 }, { "epoch": 0.46818080715715593, "grad_norm": 12.503323495320583, "learning_rate": 1.1510397752022665e-07, "loss": 0.7309, "step": 4736 }, { "epoch": 0.46827966290191037, "grad_norm": 4.046731133438157, "learning_rate": 1.1507232297701357e-07, "loss": 0.6989, "step": 4737 }, { "epoch": 0.46837851864666485, "grad_norm": 4.116497819421549, "learning_rate": 1.1504066688835607e-07, "loss": 0.7187, "step": 4738 }, { "epoch": 0.46847737439141934, "grad_norm": 3.3660008058564177, "learning_rate": 1.150090092575001e-07, "loss": 0.7176, "step": 4739 }, { "epoch": 0.4685762301361738, "grad_norm": 4.967269189763686, "learning_rate": 1.1497735008769161e-07, "loss": 0.7224, "step": 4740 }, { "epoch": 0.46867508588092827, "grad_norm": 4.115200445846632, "learning_rate": 1.1494568938217682e-07, "loss": 0.7072, "step": 4741 }, { "epoch": 0.4687739416256827, "grad_norm": 3.5369347910871918, "learning_rate": 1.1491402714420202e-07, "loss": 0.7283, "step": 4742 }, { "epoch": 0.4688727973704372, "grad_norm": 3.7275759695320736, "learning_rate": 1.1488236337701376e-07, "loss": 0.6797, "step": 4743 }, { "epoch": 0.4689716531151917, "grad_norm": 11.993252242219121, "learning_rate": 1.1485069808385866e-07, "loss": 0.7723, "step": 4744 }, { "epoch": 0.4690705088599461, "grad_norm": 4.145050439710796, "learning_rate": 1.1481903126798349e-07, "loss": 0.7302, "step": 4745 }, { "epoch": 0.4691693646047006, "grad_norm": 7.082357114593416, "learning_rate": 1.1478736293263529e-07, "loss": 0.6925, "step": 4746 }, { "epoch": 0.46926822034945503, "grad_norm": 3.8466547753478246, "learning_rate": 1.1475569308106117e-07, "loss": 0.6449, "step": 4747 }, { "epoch": 0.4693670760942095, "grad_norm": 4.029119842199248, "learning_rate": 1.1472402171650837e-07, "loss": 0.7002, "step": 4748 }, { "epoch": 0.469465931838964, "grad_norm": 3.4612435528820535, "learning_rate": 1.1469234884222434e-07, "loss": 0.6831, "step": 4749 }, { "epoch": 0.46956478758371845, "grad_norm": 3.0351195562618227, "learning_rate": 1.1466067446145666e-07, "loss": 0.6727, "step": 4750 }, { "epoch": 0.46966364332847294, "grad_norm": 6.237232336046948, "learning_rate": 1.1462899857745307e-07, "loss": 0.8575, "step": 4751 }, { "epoch": 0.46976249907322737, "grad_norm": 3.656110778328478, "learning_rate": 1.1459732119346146e-07, "loss": 0.7421, "step": 4752 }, { "epoch": 0.46986135481798186, "grad_norm": 6.0778065533405385, "learning_rate": 1.1456564231272992e-07, "loss": 0.7809, "step": 4753 }, { "epoch": 0.46996021056273635, "grad_norm": 3.2879756432519347, "learning_rate": 1.1453396193850659e-07, "loss": 0.6001, "step": 4754 }, { "epoch": 0.4700590663074908, "grad_norm": 4.345443234668562, "learning_rate": 1.1450228007403988e-07, "loss": 0.7303, "step": 4755 }, { "epoch": 0.47015792205224527, "grad_norm": 5.888325107309177, "learning_rate": 1.1447059672257823e-07, "loss": 0.6582, "step": 4756 }, { "epoch": 0.4702567777969997, "grad_norm": 8.653022346998403, "learning_rate": 1.1443891188737038e-07, "loss": 0.7547, "step": 4757 }, { "epoch": 0.4703556335417542, "grad_norm": 3.6413584568839705, "learning_rate": 1.1440722557166511e-07, "loss": 0.727, "step": 4758 }, { "epoch": 0.4704544892865087, "grad_norm": 5.0862550364480965, "learning_rate": 1.1437553777871138e-07, "loss": 0.6694, "step": 4759 }, { "epoch": 0.4705533450312631, "grad_norm": 4.686032588845638, "learning_rate": 1.1434384851175826e-07, "loss": 0.6295, "step": 4760 }, { "epoch": 0.4706522007760176, "grad_norm": 12.294816050316133, "learning_rate": 1.143121577740551e-07, "loss": 0.6007, "step": 4761 }, { "epoch": 0.47075105652077204, "grad_norm": 5.013159575825405, "learning_rate": 1.1428046556885127e-07, "loss": 0.7146, "step": 4762 }, { "epoch": 0.4708499122655265, "grad_norm": 3.8525977166211627, "learning_rate": 1.1424877189939632e-07, "loss": 0.6922, "step": 4763 }, { "epoch": 0.470948768010281, "grad_norm": 4.642034119818238, "learning_rate": 1.1421707676894004e-07, "loss": 0.8061, "step": 4764 }, { "epoch": 0.47104762375503545, "grad_norm": 3.8282134631909974, "learning_rate": 1.1418538018073226e-07, "loss": 0.6999, "step": 4765 }, { "epoch": 0.47114647949978994, "grad_norm": 4.173515301928192, "learning_rate": 1.1415368213802295e-07, "loss": 0.6954, "step": 4766 }, { "epoch": 0.47124533524454437, "grad_norm": 3.796953713890127, "learning_rate": 1.1412198264406234e-07, "loss": 0.6877, "step": 4767 }, { "epoch": 0.47134419098929886, "grad_norm": 4.250623861562799, "learning_rate": 1.1409028170210074e-07, "loss": 0.7364, "step": 4768 }, { "epoch": 0.47144304673405335, "grad_norm": 8.128875259125175, "learning_rate": 1.1405857931538856e-07, "loss": 0.8536, "step": 4769 }, { "epoch": 0.4715419024788078, "grad_norm": 4.421817449054516, "learning_rate": 1.140268754871765e-07, "loss": 0.6887, "step": 4770 }, { "epoch": 0.4716407582235623, "grad_norm": 3.521137018769241, "learning_rate": 1.1399517022071519e-07, "loss": 0.721, "step": 4771 }, { "epoch": 0.4717396139683167, "grad_norm": 2.8038154966074096, "learning_rate": 1.1396346351925568e-07, "loss": 0.6235, "step": 4772 }, { "epoch": 0.4718384697130712, "grad_norm": 4.066169650778574, "learning_rate": 1.1393175538604895e-07, "loss": 0.7841, "step": 4773 }, { "epoch": 0.4719373254578257, "grad_norm": 3.6796767458230772, "learning_rate": 1.1390004582434617e-07, "loss": 0.7959, "step": 4774 }, { "epoch": 0.4720361812025801, "grad_norm": 3.912029722415188, "learning_rate": 1.1386833483739881e-07, "loss": 0.6601, "step": 4775 }, { "epoch": 0.4721350369473346, "grad_norm": 6.775346766966058, "learning_rate": 1.1383662242845826e-07, "loss": 0.6692, "step": 4776 }, { "epoch": 0.4722338926920891, "grad_norm": 3.1135649797503215, "learning_rate": 1.1380490860077616e-07, "loss": 0.6993, "step": 4777 }, { "epoch": 0.47233274843684353, "grad_norm": 4.684035762143831, "learning_rate": 1.1377319335760431e-07, "loss": 0.7153, "step": 4778 }, { "epoch": 0.472431604181598, "grad_norm": 3.5865089801424945, "learning_rate": 1.1374147670219467e-07, "loss": 0.7265, "step": 4779 }, { "epoch": 0.47253045992635245, "grad_norm": 4.831593982873998, "learning_rate": 1.137097586377993e-07, "loss": 0.7061, "step": 4780 }, { "epoch": 0.47262931567110694, "grad_norm": 3.8743394390008654, "learning_rate": 1.1367803916767043e-07, "loss": 0.6044, "step": 4781 }, { "epoch": 0.47272817141586143, "grad_norm": 6.090812780367724, "learning_rate": 1.1364631829506037e-07, "loss": 0.6497, "step": 4782 }, { "epoch": 0.47282702716061586, "grad_norm": 5.346108340936592, "learning_rate": 1.1361459602322168e-07, "loss": 0.7767, "step": 4783 }, { "epoch": 0.47292588290537035, "grad_norm": 4.37422065164614, "learning_rate": 1.13582872355407e-07, "loss": 0.7884, "step": 4784 }, { "epoch": 0.4730247386501248, "grad_norm": 3.6019420207804043, "learning_rate": 1.1355114729486915e-07, "loss": 0.734, "step": 4785 }, { "epoch": 0.4731235943948793, "grad_norm": 4.540844869309253, "learning_rate": 1.13519420844861e-07, "loss": 0.7172, "step": 4786 }, { "epoch": 0.47322245013963377, "grad_norm": 4.464854434756222, "learning_rate": 1.1348769300863567e-07, "loss": 0.67, "step": 4787 }, { "epoch": 0.4733213058843882, "grad_norm": 4.534591024449965, "learning_rate": 1.134559637894464e-07, "loss": 0.7273, "step": 4788 }, { "epoch": 0.4734201616291427, "grad_norm": 4.97740350991135, "learning_rate": 1.134242331905465e-07, "loss": 0.7346, "step": 4789 }, { "epoch": 0.4735190173738971, "grad_norm": 3.9399808379737777, "learning_rate": 1.1339250121518955e-07, "loss": 0.6016, "step": 4790 }, { "epoch": 0.4736178731186516, "grad_norm": 7.115401611468567, "learning_rate": 1.1336076786662914e-07, "loss": 0.6615, "step": 4791 }, { "epoch": 0.4737167288634061, "grad_norm": 4.217901948350837, "learning_rate": 1.1332903314811906e-07, "loss": 0.6223, "step": 4792 }, { "epoch": 0.47381558460816053, "grad_norm": 6.5004319458787165, "learning_rate": 1.1329729706291324e-07, "loss": 0.7242, "step": 4793 }, { "epoch": 0.473914440352915, "grad_norm": 7.48493753883289, "learning_rate": 1.1326555961426578e-07, "loss": 0.6623, "step": 4794 }, { "epoch": 0.47401329609766946, "grad_norm": 6.371928638039179, "learning_rate": 1.1323382080543085e-07, "loss": 0.693, "step": 4795 }, { "epoch": 0.47411215184242395, "grad_norm": 6.076585785333862, "learning_rate": 1.1320208063966279e-07, "loss": 0.717, "step": 4796 }, { "epoch": 0.47421100758717843, "grad_norm": 3.3954887025293403, "learning_rate": 1.1317033912021612e-07, "loss": 0.749, "step": 4797 }, { "epoch": 0.47430986333193287, "grad_norm": 5.799272460384484, "learning_rate": 1.1313859625034545e-07, "loss": 0.6789, "step": 4798 }, { "epoch": 0.47440871907668736, "grad_norm": 4.21886273307908, "learning_rate": 1.1310685203330556e-07, "loss": 0.8423, "step": 4799 }, { "epoch": 0.4745075748214418, "grad_norm": 12.337340627004302, "learning_rate": 1.130751064723513e-07, "loss": 0.7518, "step": 4800 }, { "epoch": 0.4746064305661963, "grad_norm": 5.262698770432406, "learning_rate": 1.1304335957073776e-07, "loss": 0.7164, "step": 4801 }, { "epoch": 0.47470528631095077, "grad_norm": 4.26689992209808, "learning_rate": 1.1301161133172012e-07, "loss": 0.6662, "step": 4802 }, { "epoch": 0.4748041420557052, "grad_norm": 7.039276245213629, "learning_rate": 1.1297986175855365e-07, "loss": 0.6439, "step": 4803 }, { "epoch": 0.4749029978004597, "grad_norm": 7.910952725458966, "learning_rate": 1.1294811085449383e-07, "loss": 0.7488, "step": 4804 }, { "epoch": 0.4750018535452141, "grad_norm": 4.585138380142986, "learning_rate": 1.1291635862279623e-07, "loss": 0.6983, "step": 4805 }, { "epoch": 0.4751007092899686, "grad_norm": 4.774946028813517, "learning_rate": 1.128846050667166e-07, "loss": 0.6798, "step": 4806 }, { "epoch": 0.4751995650347231, "grad_norm": 5.0624522952712345, "learning_rate": 1.1285285018951074e-07, "loss": 0.8252, "step": 4807 }, { "epoch": 0.47529842077947754, "grad_norm": 4.956299902408893, "learning_rate": 1.1282109399443474e-07, "loss": 0.6432, "step": 4808 }, { "epoch": 0.475397276524232, "grad_norm": 5.3385094409519445, "learning_rate": 1.1278933648474464e-07, "loss": 0.6993, "step": 4809 }, { "epoch": 0.47549613226898646, "grad_norm": 3.2067952421989583, "learning_rate": 1.1275757766369675e-07, "loss": 0.5856, "step": 4810 }, { "epoch": 0.47559498801374095, "grad_norm": 7.636609818464929, "learning_rate": 1.1272581753454747e-07, "loss": 0.7802, "step": 4811 }, { "epoch": 0.47569384375849544, "grad_norm": 4.388299091644086, "learning_rate": 1.1269405610055331e-07, "loss": 0.7627, "step": 4812 }, { "epoch": 0.47579269950324987, "grad_norm": 3.4479448005176576, "learning_rate": 1.1266229336497096e-07, "loss": 0.6996, "step": 4813 }, { "epoch": 0.47589155524800436, "grad_norm": 4.3080875408764685, "learning_rate": 1.1263052933105717e-07, "loss": 0.8197, "step": 4814 }, { "epoch": 0.4759904109927588, "grad_norm": 4.471056154693649, "learning_rate": 1.1259876400206894e-07, "loss": 0.7308, "step": 4815 }, { "epoch": 0.4760892667375133, "grad_norm": 3.848082640712577, "learning_rate": 1.125669973812633e-07, "loss": 0.7682, "step": 4816 }, { "epoch": 0.4761881224822678, "grad_norm": 3.147038818538059, "learning_rate": 1.1253522947189747e-07, "loss": 0.6963, "step": 4817 }, { "epoch": 0.4762869782270222, "grad_norm": 3.535078839851762, "learning_rate": 1.1250346027722872e-07, "loss": 0.7354, "step": 4818 }, { "epoch": 0.4763858339717767, "grad_norm": 9.890132115370285, "learning_rate": 1.1247168980051463e-07, "loss": 0.6767, "step": 4819 }, { "epoch": 0.47648468971653113, "grad_norm": 3.2037953418363094, "learning_rate": 1.1243991804501267e-07, "loss": 0.7104, "step": 4820 }, { "epoch": 0.4765835454612856, "grad_norm": 5.202070579947606, "learning_rate": 1.1240814501398064e-07, "loss": 0.8042, "step": 4821 }, { "epoch": 0.4766824012060401, "grad_norm": 23.71745865829343, "learning_rate": 1.1237637071067634e-07, "loss": 0.7019, "step": 4822 }, { "epoch": 0.47678125695079454, "grad_norm": 9.399607428764302, "learning_rate": 1.1234459513835781e-07, "loss": 0.6735, "step": 4823 }, { "epoch": 0.47688011269554903, "grad_norm": 3.582615826921775, "learning_rate": 1.1231281830028314e-07, "loss": 0.731, "step": 4824 }, { "epoch": 0.47697896844030346, "grad_norm": 3.677347538629893, "learning_rate": 1.1228104019971057e-07, "loss": 0.6394, "step": 4825 }, { "epoch": 0.47707782418505795, "grad_norm": 3.1797647087728604, "learning_rate": 1.122492608398985e-07, "loss": 0.6892, "step": 4826 }, { "epoch": 0.47717667992981244, "grad_norm": 4.035150629111296, "learning_rate": 1.1221748022410542e-07, "loss": 0.8077, "step": 4827 }, { "epoch": 0.4772755356745669, "grad_norm": 3.680488951032811, "learning_rate": 1.1218569835558995e-07, "loss": 0.7462, "step": 4828 }, { "epoch": 0.47737439141932136, "grad_norm": 4.935144470291025, "learning_rate": 1.1215391523761088e-07, "loss": 0.7609, "step": 4829 }, { "epoch": 0.4774732471640758, "grad_norm": 3.6819595425598637, "learning_rate": 1.121221308734271e-07, "loss": 0.6772, "step": 4830 }, { "epoch": 0.4775721029088303, "grad_norm": 4.1167397740147615, "learning_rate": 1.1209034526629759e-07, "loss": 0.6039, "step": 4831 }, { "epoch": 0.4776709586535848, "grad_norm": 25.84147200352015, "learning_rate": 1.1205855841948154e-07, "loss": 0.809, "step": 4832 }, { "epoch": 0.4777698143983392, "grad_norm": 3.5539630088529615, "learning_rate": 1.1202677033623815e-07, "loss": 0.7654, "step": 4833 }, { "epoch": 0.4778686701430937, "grad_norm": 3.5436509469854256, "learning_rate": 1.119949810198269e-07, "loss": 0.7418, "step": 4834 }, { "epoch": 0.47796752588784813, "grad_norm": 4.482298063598885, "learning_rate": 1.1196319047350728e-07, "loss": 0.7581, "step": 4835 }, { "epoch": 0.4780663816326026, "grad_norm": 3.6674359756353145, "learning_rate": 1.1193139870053898e-07, "loss": 0.6866, "step": 4836 }, { "epoch": 0.4781652373773571, "grad_norm": 6.962563511880813, "learning_rate": 1.1189960570418171e-07, "loss": 0.8009, "step": 4837 }, { "epoch": 0.47826409312211154, "grad_norm": 4.785340720588053, "learning_rate": 1.1186781148769541e-07, "loss": 0.7426, "step": 4838 }, { "epoch": 0.47836294886686603, "grad_norm": 3.9149436740723944, "learning_rate": 1.1183601605434012e-07, "loss": 0.6507, "step": 4839 }, { "epoch": 0.47846180461162047, "grad_norm": 4.985419092151062, "learning_rate": 1.1180421940737594e-07, "loss": 0.7343, "step": 4840 }, { "epoch": 0.47856066035637496, "grad_norm": 3.2885346449293382, "learning_rate": 1.1177242155006322e-07, "loss": 0.6948, "step": 4841 }, { "epoch": 0.47865951610112945, "grad_norm": 4.268916446902121, "learning_rate": 1.1174062248566231e-07, "loss": 0.7457, "step": 4842 }, { "epoch": 0.4787583718458839, "grad_norm": 3.6470551769381734, "learning_rate": 1.1170882221743377e-07, "loss": 0.6651, "step": 4843 }, { "epoch": 0.47885722759063837, "grad_norm": 8.10325163583803, "learning_rate": 1.1167702074863818e-07, "loss": 0.7827, "step": 4844 }, { "epoch": 0.4789560833353928, "grad_norm": 4.138027673662975, "learning_rate": 1.1164521808253641e-07, "loss": 0.6729, "step": 4845 }, { "epoch": 0.4790549390801473, "grad_norm": 4.856152200566382, "learning_rate": 1.1161341422238932e-07, "loss": 0.7171, "step": 4846 }, { "epoch": 0.4791537948249018, "grad_norm": 4.264553747395361, "learning_rate": 1.1158160917145785e-07, "loss": 0.7344, "step": 4847 }, { "epoch": 0.4792526505696562, "grad_norm": 7.44916434482811, "learning_rate": 1.1154980293300324e-07, "loss": 0.7674, "step": 4848 }, { "epoch": 0.4793515063144107, "grad_norm": 6.61599031701135, "learning_rate": 1.1151799551028672e-07, "loss": 0.7685, "step": 4849 }, { "epoch": 0.47945036205916514, "grad_norm": 4.780726837919409, "learning_rate": 1.1148618690656963e-07, "loss": 0.7175, "step": 4850 }, { "epoch": 0.4795492178039196, "grad_norm": 7.378049036465865, "learning_rate": 1.114543771251135e-07, "loss": 0.7261, "step": 4851 }, { "epoch": 0.4796480735486741, "grad_norm": 4.116607169199086, "learning_rate": 1.1142256616917997e-07, "loss": 0.7532, "step": 4852 }, { "epoch": 0.47974692929342855, "grad_norm": 3.4624542346270544, "learning_rate": 1.113907540420308e-07, "loss": 0.7046, "step": 4853 }, { "epoch": 0.47984578503818304, "grad_norm": 2.8371761074191064, "learning_rate": 1.1135894074692782e-07, "loss": 0.6423, "step": 4854 }, { "epoch": 0.4799446407829375, "grad_norm": 4.8682443234018145, "learning_rate": 1.1132712628713301e-07, "loss": 0.6299, "step": 4855 }, { "epoch": 0.48004349652769196, "grad_norm": 6.1767399923675805, "learning_rate": 1.1129531066590851e-07, "loss": 0.7411, "step": 4856 }, { "epoch": 0.48014235227244645, "grad_norm": 16.89503162892244, "learning_rate": 1.112634938865165e-07, "loss": 0.7584, "step": 4857 }, { "epoch": 0.4802412080172009, "grad_norm": 4.792935880981809, "learning_rate": 1.1123167595221931e-07, "loss": 0.6388, "step": 4858 }, { "epoch": 0.48034006376195537, "grad_norm": 5.501212260840952, "learning_rate": 1.1119985686627948e-07, "loss": 0.7277, "step": 4859 }, { "epoch": 0.48043891950670986, "grad_norm": 5.092174238310617, "learning_rate": 1.1116803663195951e-07, "loss": 0.6602, "step": 4860 }, { "epoch": 0.4805377752514643, "grad_norm": 4.843152833414339, "learning_rate": 1.1113621525252214e-07, "loss": 0.6915, "step": 4861 }, { "epoch": 0.4806366309962188, "grad_norm": 6.012568691979855, "learning_rate": 1.1110439273123013e-07, "loss": 0.7156, "step": 4862 }, { "epoch": 0.4807354867409732, "grad_norm": 4.347330702886589, "learning_rate": 1.110725690713465e-07, "loss": 0.7467, "step": 4863 }, { "epoch": 0.4808343424857277, "grad_norm": 3.585039754199605, "learning_rate": 1.1104074427613423e-07, "loss": 0.6921, "step": 4864 }, { "epoch": 0.4809331982304822, "grad_norm": 6.447538620949876, "learning_rate": 1.1100891834885648e-07, "loss": 0.6029, "step": 4865 }, { "epoch": 0.48103205397523663, "grad_norm": 4.635661748257955, "learning_rate": 1.1097709129277654e-07, "loss": 0.7376, "step": 4866 }, { "epoch": 0.4811309097199911, "grad_norm": 4.696447370792104, "learning_rate": 1.1094526311115783e-07, "loss": 0.6939, "step": 4867 }, { "epoch": 0.48122976546474555, "grad_norm": 5.107637324728658, "learning_rate": 1.1091343380726383e-07, "loss": 0.7985, "step": 4868 }, { "epoch": 0.48132862120950004, "grad_norm": 5.544975864518503, "learning_rate": 1.1088160338435817e-07, "loss": 0.6573, "step": 4869 }, { "epoch": 0.48142747695425453, "grad_norm": 5.431725763529287, "learning_rate": 1.1084977184570461e-07, "loss": 0.6657, "step": 4870 }, { "epoch": 0.48152633269900896, "grad_norm": 3.4985533351354148, "learning_rate": 1.1081793919456701e-07, "loss": 0.7258, "step": 4871 }, { "epoch": 0.48162518844376345, "grad_norm": 5.023255670594426, "learning_rate": 1.1078610543420929e-07, "loss": 0.683, "step": 4872 }, { "epoch": 0.4817240441885179, "grad_norm": 3.586634101102196, "learning_rate": 1.107542705678956e-07, "loss": 0.6842, "step": 4873 }, { "epoch": 0.4818228999332724, "grad_norm": 3.6448472291203573, "learning_rate": 1.1072243459889008e-07, "loss": 0.7298, "step": 4874 }, { "epoch": 0.48192175567802686, "grad_norm": 4.829019537947762, "learning_rate": 1.1069059753045706e-07, "loss": 0.719, "step": 4875 }, { "epoch": 0.4820206114227813, "grad_norm": 3.1138484941934124, "learning_rate": 1.1065875936586095e-07, "loss": 0.7371, "step": 4876 }, { "epoch": 0.4821194671675358, "grad_norm": 11.40634640585387, "learning_rate": 1.106269201083663e-07, "loss": 0.7791, "step": 4877 }, { "epoch": 0.4822183229122902, "grad_norm": 2.9205769620197928, "learning_rate": 1.1059507976123774e-07, "loss": 0.6838, "step": 4878 }, { "epoch": 0.4823171786570447, "grad_norm": 3.6816004166090917, "learning_rate": 1.1056323832774006e-07, "loss": 0.6252, "step": 4879 }, { "epoch": 0.4824160344017992, "grad_norm": 4.735532988650914, "learning_rate": 1.105313958111381e-07, "loss": 0.7852, "step": 4880 }, { "epoch": 0.48251489014655363, "grad_norm": 3.1617779835221924, "learning_rate": 1.1049955221469687e-07, "loss": 0.6897, "step": 4881 }, { "epoch": 0.4826137458913081, "grad_norm": 3.046302848193647, "learning_rate": 1.1046770754168142e-07, "loss": 0.663, "step": 4882 }, { "epoch": 0.48271260163606255, "grad_norm": 6.099039869597704, "learning_rate": 1.1043586179535697e-07, "loss": 0.6886, "step": 4883 }, { "epoch": 0.48281145738081704, "grad_norm": 4.337379858521869, "learning_rate": 1.1040401497898881e-07, "loss": 0.5963, "step": 4884 }, { "epoch": 0.48291031312557153, "grad_norm": 3.6174672942509276, "learning_rate": 1.103721670958424e-07, "loss": 0.6593, "step": 4885 }, { "epoch": 0.48300916887032597, "grad_norm": 4.224917200277799, "learning_rate": 1.1034031814918328e-07, "loss": 0.7961, "step": 4886 }, { "epoch": 0.48310802461508046, "grad_norm": 4.492040764180408, "learning_rate": 1.1030846814227705e-07, "loss": 0.7349, "step": 4887 }, { "epoch": 0.4832068803598349, "grad_norm": 3.447323592006722, "learning_rate": 1.1027661707838945e-07, "loss": 0.632, "step": 4888 }, { "epoch": 0.4833057361045894, "grad_norm": 12.614376681060467, "learning_rate": 1.1024476496078638e-07, "loss": 0.7729, "step": 4889 }, { "epoch": 0.48340459184934387, "grad_norm": 12.601489905707208, "learning_rate": 1.102129117927338e-07, "loss": 0.6169, "step": 4890 }, { "epoch": 0.4835034475940983, "grad_norm": 3.570453380771824, "learning_rate": 1.1018105757749774e-07, "loss": 0.682, "step": 4891 }, { "epoch": 0.4836023033388528, "grad_norm": 4.990232075839659, "learning_rate": 1.1014920231834441e-07, "loss": 0.6476, "step": 4892 }, { "epoch": 0.4837011590836072, "grad_norm": 4.725634761681587, "learning_rate": 1.101173460185401e-07, "loss": 0.684, "step": 4893 }, { "epoch": 0.4838000148283617, "grad_norm": 3.3958394553069198, "learning_rate": 1.1008548868135122e-07, "loss": 0.7597, "step": 4894 }, { "epoch": 0.4838988705731162, "grad_norm": 4.3408337071728305, "learning_rate": 1.100536303100442e-07, "loss": 0.6889, "step": 4895 }, { "epoch": 0.48399772631787064, "grad_norm": 14.485135727220992, "learning_rate": 1.1002177090788572e-07, "loss": 0.7882, "step": 4896 }, { "epoch": 0.4840965820626251, "grad_norm": 6.648612373067033, "learning_rate": 1.0998991047814246e-07, "loss": 0.6048, "step": 4897 }, { "epoch": 0.48419543780737956, "grad_norm": 3.432258795261783, "learning_rate": 1.0995804902408123e-07, "loss": 0.7526, "step": 4898 }, { "epoch": 0.48429429355213405, "grad_norm": 8.12721884864471, "learning_rate": 1.0992618654896899e-07, "loss": 0.7138, "step": 4899 }, { "epoch": 0.48439314929688854, "grad_norm": 4.333881865905653, "learning_rate": 1.0989432305607273e-07, "loss": 0.6797, "step": 4900 }, { "epoch": 0.48449200504164297, "grad_norm": 5.115473361220506, "learning_rate": 1.0986245854865961e-07, "loss": 0.6705, "step": 4901 }, { "epoch": 0.48459086078639746, "grad_norm": 3.3935865984658773, "learning_rate": 1.098305930299968e-07, "loss": 0.5724, "step": 4902 }, { "epoch": 0.4846897165311519, "grad_norm": 4.504836598243756, "learning_rate": 1.0979872650335171e-07, "loss": 0.7025, "step": 4903 }, { "epoch": 0.4847885722759064, "grad_norm": 4.511563342818421, "learning_rate": 1.0976685897199175e-07, "loss": 0.6407, "step": 4904 }, { "epoch": 0.48488742802066087, "grad_norm": 2.9988238088928827, "learning_rate": 1.097349904391845e-07, "loss": 0.6047, "step": 4905 }, { "epoch": 0.4849862837654153, "grad_norm": 8.927154429048263, "learning_rate": 1.0970312090819755e-07, "loss": 0.6352, "step": 4906 }, { "epoch": 0.4850851395101698, "grad_norm": 8.711643319759613, "learning_rate": 1.0967125038229874e-07, "loss": 0.6844, "step": 4907 }, { "epoch": 0.4851839952549242, "grad_norm": 5.782719561135157, "learning_rate": 1.0963937886475583e-07, "loss": 0.7042, "step": 4908 }, { "epoch": 0.4852828509996787, "grad_norm": 2.8923411606330376, "learning_rate": 1.0960750635883679e-07, "loss": 0.7207, "step": 4909 }, { "epoch": 0.4853817067444332, "grad_norm": 3.3605990073025356, "learning_rate": 1.0957563286780969e-07, "loss": 0.6807, "step": 4910 }, { "epoch": 0.48548056248918764, "grad_norm": 4.218151862823981, "learning_rate": 1.0954375839494272e-07, "loss": 0.7573, "step": 4911 }, { "epoch": 0.48557941823394213, "grad_norm": 3.3294122183202997, "learning_rate": 1.0951188294350409e-07, "loss": 0.7068, "step": 4912 }, { "epoch": 0.48567827397869656, "grad_norm": 8.178559215236952, "learning_rate": 1.0948000651676217e-07, "loss": 0.7101, "step": 4913 }, { "epoch": 0.48577712972345105, "grad_norm": 7.862273245276272, "learning_rate": 1.0944812911798542e-07, "loss": 0.7458, "step": 4914 }, { "epoch": 0.48587598546820554, "grad_norm": 25.986679876859643, "learning_rate": 1.094162507504424e-07, "loss": 0.776, "step": 4915 }, { "epoch": 0.48597484121296, "grad_norm": 3.5897489126669315, "learning_rate": 1.0938437141740177e-07, "loss": 0.6998, "step": 4916 }, { "epoch": 0.48607369695771446, "grad_norm": 3.2189081744016774, "learning_rate": 1.093524911221323e-07, "loss": 0.6713, "step": 4917 }, { "epoch": 0.4861725527024689, "grad_norm": 3.574269575948743, "learning_rate": 1.0932060986790281e-07, "loss": 0.6786, "step": 4918 }, { "epoch": 0.4862714084472234, "grad_norm": 3.459697969171859, "learning_rate": 1.0928872765798227e-07, "loss": 0.726, "step": 4919 }, { "epoch": 0.4863702641919779, "grad_norm": 4.520662088588456, "learning_rate": 1.0925684449563973e-07, "loss": 0.6619, "step": 4920 }, { "epoch": 0.4864691199367323, "grad_norm": 6.164768573204754, "learning_rate": 1.092249603841443e-07, "loss": 0.7914, "step": 4921 }, { "epoch": 0.4865679756814868, "grad_norm": 6.43478289522857, "learning_rate": 1.091930753267653e-07, "loss": 0.6005, "step": 4922 }, { "epoch": 0.48666683142624123, "grad_norm": 5.958186682462018, "learning_rate": 1.09161189326772e-07, "loss": 0.6968, "step": 4923 }, { "epoch": 0.4867656871709957, "grad_norm": 7.180773082882422, "learning_rate": 1.0912930238743392e-07, "loss": 0.7897, "step": 4924 }, { "epoch": 0.4868645429157502, "grad_norm": 6.2747857603568935, "learning_rate": 1.090974145120205e-07, "loss": 0.6621, "step": 4925 }, { "epoch": 0.48696339866050464, "grad_norm": 3.021712135056737, "learning_rate": 1.0906552570380145e-07, "loss": 0.8071, "step": 4926 }, { "epoch": 0.48706225440525913, "grad_norm": 3.9489475081622256, "learning_rate": 1.0903363596604644e-07, "loss": 0.6973, "step": 4927 }, { "epoch": 0.48716111015001357, "grad_norm": 4.760470168740779, "learning_rate": 1.0900174530202533e-07, "loss": 0.6806, "step": 4928 }, { "epoch": 0.48725996589476805, "grad_norm": 3.454604103633635, "learning_rate": 1.0896985371500802e-07, "loss": 0.7747, "step": 4929 }, { "epoch": 0.48735882163952254, "grad_norm": 3.8421558969954632, "learning_rate": 1.0893796120826453e-07, "loss": 0.6332, "step": 4930 }, { "epoch": 0.487457677384277, "grad_norm": 15.913924841022988, "learning_rate": 1.0890606778506497e-07, "loss": 0.6284, "step": 4931 }, { "epoch": 0.48755653312903147, "grad_norm": 7.6035743143298165, "learning_rate": 1.0887417344867951e-07, "loss": 0.5953, "step": 4932 }, { "epoch": 0.48765538887378596, "grad_norm": 3.2006596639042377, "learning_rate": 1.0884227820237848e-07, "loss": 0.735, "step": 4933 }, { "epoch": 0.4877542446185404, "grad_norm": 9.258333040284953, "learning_rate": 1.0881038204943228e-07, "loss": 0.7165, "step": 4934 }, { "epoch": 0.4878531003632949, "grad_norm": 5.556418887789064, "learning_rate": 1.087784849931113e-07, "loss": 0.7434, "step": 4935 }, { "epoch": 0.4879519561080493, "grad_norm": 3.6102378113474303, "learning_rate": 1.0874658703668623e-07, "loss": 0.5925, "step": 4936 }, { "epoch": 0.4880508118528038, "grad_norm": 3.1599544907409, "learning_rate": 1.0871468818342766e-07, "loss": 0.8631, "step": 4937 }, { "epoch": 0.4881496675975583, "grad_norm": 3.598948492907296, "learning_rate": 1.0868278843660637e-07, "loss": 0.6245, "step": 4938 }, { "epoch": 0.4882485233423127, "grad_norm": 3.145963938068849, "learning_rate": 1.0865088779949318e-07, "loss": 0.6839, "step": 4939 }, { "epoch": 0.4883473790870672, "grad_norm": 5.163289995708715, "learning_rate": 1.0861898627535909e-07, "loss": 0.7454, "step": 4940 }, { "epoch": 0.48844623483182165, "grad_norm": 3.6404858164882836, "learning_rate": 1.0858708386747508e-07, "loss": 0.6903, "step": 4941 }, { "epoch": 0.48854509057657614, "grad_norm": 5.276805873085752, "learning_rate": 1.0855518057911231e-07, "loss": 0.7433, "step": 4942 }, { "epoch": 0.4886439463213306, "grad_norm": 6.819631877960541, "learning_rate": 1.0852327641354195e-07, "loss": 0.6382, "step": 4943 }, { "epoch": 0.48874280206608506, "grad_norm": 5.518998993843801, "learning_rate": 1.0849137137403532e-07, "loss": 0.6651, "step": 4944 }, { "epoch": 0.48884165781083955, "grad_norm": 3.4046267506747077, "learning_rate": 1.0845946546386384e-07, "loss": 0.6908, "step": 4945 }, { "epoch": 0.488940513555594, "grad_norm": 3.6975439907321097, "learning_rate": 1.0842755868629891e-07, "loss": 0.6344, "step": 4946 }, { "epoch": 0.48903936930034847, "grad_norm": 4.329024447910644, "learning_rate": 1.0839565104461218e-07, "loss": 0.5819, "step": 4947 }, { "epoch": 0.48913822504510296, "grad_norm": 4.392438971097348, "learning_rate": 1.083637425420753e-07, "loss": 0.7464, "step": 4948 }, { "epoch": 0.4892370807898574, "grad_norm": 3.599142782017634, "learning_rate": 1.0833183318195997e-07, "loss": 0.6971, "step": 4949 }, { "epoch": 0.4893359365346119, "grad_norm": 6.727671288477943, "learning_rate": 1.0829992296753807e-07, "loss": 0.7277, "step": 4950 }, { "epoch": 0.4894347922793663, "grad_norm": 5.2901096652897115, "learning_rate": 1.082680119020815e-07, "loss": 0.6519, "step": 4951 }, { "epoch": 0.4895336480241208, "grad_norm": 5.102324251878221, "learning_rate": 1.0823609998886225e-07, "loss": 0.7061, "step": 4952 }, { "epoch": 0.4896325037688753, "grad_norm": 4.148032543498175, "learning_rate": 1.0820418723115246e-07, "loss": 0.657, "step": 4953 }, { "epoch": 0.4897313595136297, "grad_norm": 9.660494305659068, "learning_rate": 1.0817227363222425e-07, "loss": 0.6937, "step": 4954 }, { "epoch": 0.4898302152583842, "grad_norm": 9.145800112237929, "learning_rate": 1.0814035919534998e-07, "loss": 0.6701, "step": 4955 }, { "epoch": 0.48992907100313865, "grad_norm": 3.2385170266232337, "learning_rate": 1.0810844392380192e-07, "loss": 0.6434, "step": 4956 }, { "epoch": 0.49002792674789314, "grad_norm": 6.687435237480198, "learning_rate": 1.0807652782085255e-07, "loss": 0.5718, "step": 4957 }, { "epoch": 0.49012678249264763, "grad_norm": 3.975918914329794, "learning_rate": 1.0804461088977439e-07, "loss": 0.7477, "step": 4958 }, { "epoch": 0.49022563823740206, "grad_norm": 5.3922727118795875, "learning_rate": 1.0801269313384005e-07, "loss": 0.6425, "step": 4959 }, { "epoch": 0.49032449398215655, "grad_norm": 7.26258367513739, "learning_rate": 1.0798077455632225e-07, "loss": 0.8142, "step": 4960 }, { "epoch": 0.490423349726911, "grad_norm": 4.520364987380895, "learning_rate": 1.079488551604937e-07, "loss": 0.664, "step": 4961 }, { "epoch": 0.4905222054716655, "grad_norm": 3.8095858960602627, "learning_rate": 1.0791693494962735e-07, "loss": 0.7486, "step": 4962 }, { "epoch": 0.49062106121641996, "grad_norm": 3.477944933816795, "learning_rate": 1.0788501392699607e-07, "loss": 0.7324, "step": 4963 }, { "epoch": 0.4907199169611744, "grad_norm": 3.841160323580473, "learning_rate": 1.0785309209587294e-07, "loss": 0.7805, "step": 4964 }, { "epoch": 0.4908187727059289, "grad_norm": 4.137282273502954, "learning_rate": 1.0782116945953103e-07, "loss": 0.7728, "step": 4965 }, { "epoch": 0.4909176284506833, "grad_norm": 4.690748766502305, "learning_rate": 1.0778924602124359e-07, "loss": 0.815, "step": 4966 }, { "epoch": 0.4910164841954378, "grad_norm": 4.143948678023414, "learning_rate": 1.0775732178428388e-07, "loss": 0.6759, "step": 4967 }, { "epoch": 0.4911153399401923, "grad_norm": 3.897769804714911, "learning_rate": 1.0772539675192524e-07, "loss": 0.7087, "step": 4968 }, { "epoch": 0.49121419568494673, "grad_norm": 37.72190598504822, "learning_rate": 1.0769347092744112e-07, "loss": 0.6965, "step": 4969 }, { "epoch": 0.4913130514297012, "grad_norm": 6.371711871081384, "learning_rate": 1.0766154431410503e-07, "loss": 0.6356, "step": 4970 }, { "epoch": 0.49141190717445565, "grad_norm": 6.903172273037336, "learning_rate": 1.0762961691519062e-07, "loss": 0.6844, "step": 4971 }, { "epoch": 0.49151076291921014, "grad_norm": 3.691618512246146, "learning_rate": 1.0759768873397148e-07, "loss": 0.7027, "step": 4972 }, { "epoch": 0.49160961866396463, "grad_norm": 4.573614105269382, "learning_rate": 1.0756575977372149e-07, "loss": 0.6994, "step": 4973 }, { "epoch": 0.49170847440871907, "grad_norm": 7.11723595424024, "learning_rate": 1.0753383003771441e-07, "loss": 0.7093, "step": 4974 }, { "epoch": 0.49180733015347355, "grad_norm": 8.43079941025531, "learning_rate": 1.075018995292242e-07, "loss": 0.631, "step": 4975 }, { "epoch": 0.491906185898228, "grad_norm": 5.5401386696437935, "learning_rate": 1.0746996825152483e-07, "loss": 0.7699, "step": 4976 }, { "epoch": 0.4920050416429825, "grad_norm": 3.298834216814719, "learning_rate": 1.0743803620789047e-07, "loss": 0.7692, "step": 4977 }, { "epoch": 0.49210389738773697, "grad_norm": 3.585881000433193, "learning_rate": 1.0740610340159517e-07, "loss": 0.7808, "step": 4978 }, { "epoch": 0.4922027531324914, "grad_norm": 3.696715570267291, "learning_rate": 1.0737416983591321e-07, "loss": 0.6954, "step": 4979 }, { "epoch": 0.4923016088772459, "grad_norm": 9.625967380537853, "learning_rate": 1.0734223551411892e-07, "loss": 0.6648, "step": 4980 }, { "epoch": 0.4924004646220003, "grad_norm": 6.30308593212552, "learning_rate": 1.0731030043948667e-07, "loss": 0.6922, "step": 4981 }, { "epoch": 0.4924993203667548, "grad_norm": 14.112451421758925, "learning_rate": 1.0727836461529097e-07, "loss": 0.7646, "step": 4982 }, { "epoch": 0.4925981761115093, "grad_norm": 4.582849062488963, "learning_rate": 1.072464280448063e-07, "loss": 0.7249, "step": 4983 }, { "epoch": 0.49269703185626373, "grad_norm": 7.677843819211311, "learning_rate": 1.0721449073130738e-07, "loss": 0.7603, "step": 4984 }, { "epoch": 0.4927958876010182, "grad_norm": 3.190630291649785, "learning_rate": 1.0718255267806882e-07, "loss": 0.7758, "step": 4985 }, { "epoch": 0.49289474334577266, "grad_norm": 3.7498270934065485, "learning_rate": 1.0715061388836546e-07, "loss": 0.66, "step": 4986 }, { "epoch": 0.49299359909052715, "grad_norm": 3.735043621174924, "learning_rate": 1.071186743654721e-07, "loss": 0.681, "step": 4987 }, { "epoch": 0.49309245483528164, "grad_norm": 2.888198110448533, "learning_rate": 1.070867341126637e-07, "loss": 0.6898, "step": 4988 }, { "epoch": 0.49319131058003607, "grad_norm": 3.730553329203329, "learning_rate": 1.0705479313321526e-07, "loss": 0.7512, "step": 4989 }, { "epoch": 0.49329016632479056, "grad_norm": 4.905389476979263, "learning_rate": 1.0702285143040181e-07, "loss": 0.641, "step": 4990 }, { "epoch": 0.493389022069545, "grad_norm": 3.470931005840382, "learning_rate": 1.0699090900749858e-07, "loss": 0.6645, "step": 4991 }, { "epoch": 0.4934878778142995, "grad_norm": 4.595086793982326, "learning_rate": 1.0695896586778077e-07, "loss": 0.8125, "step": 4992 }, { "epoch": 0.49358673355905397, "grad_norm": 3.823470447683234, "learning_rate": 1.0692702201452364e-07, "loss": 0.8686, "step": 4993 }, { "epoch": 0.4936855893038084, "grad_norm": 2.9844375428344243, "learning_rate": 1.068950774510026e-07, "loss": 0.7339, "step": 4994 }, { "epoch": 0.4937844450485629, "grad_norm": 8.880984788533603, "learning_rate": 1.0686313218049306e-07, "loss": 0.7127, "step": 4995 }, { "epoch": 0.4938833007933173, "grad_norm": 5.8180761791677105, "learning_rate": 1.0683118620627056e-07, "loss": 0.6894, "step": 4996 }, { "epoch": 0.4939821565380718, "grad_norm": 4.591844241287601, "learning_rate": 1.0679923953161067e-07, "loss": 0.7175, "step": 4997 }, { "epoch": 0.4940810122828263, "grad_norm": 6.530912476683678, "learning_rate": 1.0676729215978905e-07, "loss": 0.7191, "step": 4998 }, { "epoch": 0.49417986802758074, "grad_norm": 3.9683884036815313, "learning_rate": 1.0673534409408147e-07, "loss": 0.7803, "step": 4999 }, { "epoch": 0.4942787237723352, "grad_norm": 4.197272000757149, "learning_rate": 1.067033953377637e-07, "loss": 0.5637, "step": 5000 }, { "epoch": 0.49437757951708966, "grad_norm": 2.684990338410304, "learning_rate": 1.0667144589411158e-07, "loss": 0.6712, "step": 5001 }, { "epoch": 0.49447643526184415, "grad_norm": 7.762974912945271, "learning_rate": 1.0663949576640114e-07, "loss": 0.7853, "step": 5002 }, { "epoch": 0.49457529100659864, "grad_norm": 3.394081531813035, "learning_rate": 1.0660754495790835e-07, "loss": 0.7058, "step": 5003 }, { "epoch": 0.49467414675135307, "grad_norm": 7.196613795039451, "learning_rate": 1.065755934719093e-07, "loss": 0.7531, "step": 5004 }, { "epoch": 0.49477300249610756, "grad_norm": 54.37138027758901, "learning_rate": 1.0654364131168012e-07, "loss": 0.7367, "step": 5005 }, { "epoch": 0.494871858240862, "grad_norm": 4.5279112488389295, "learning_rate": 1.0651168848049706e-07, "loss": 0.6614, "step": 5006 }, { "epoch": 0.4949707139856165, "grad_norm": 4.9012278128282905, "learning_rate": 1.0647973498163638e-07, "loss": 0.7591, "step": 5007 }, { "epoch": 0.495069569730371, "grad_norm": 3.007142921183577, "learning_rate": 1.0644778081837447e-07, "loss": 0.7288, "step": 5008 }, { "epoch": 0.4951684254751254, "grad_norm": 4.391830645113759, "learning_rate": 1.0641582599398773e-07, "loss": 0.7173, "step": 5009 }, { "epoch": 0.4952672812198799, "grad_norm": 4.309477122914594, "learning_rate": 1.063838705117527e-07, "loss": 0.6826, "step": 5010 }, { "epoch": 0.4953661369646344, "grad_norm": 3.1310574498744463, "learning_rate": 1.063519143749459e-07, "loss": 0.6513, "step": 5011 }, { "epoch": 0.4954649927093888, "grad_norm": 3.3234414419344906, "learning_rate": 1.06319957586844e-07, "loss": 0.7001, "step": 5012 }, { "epoch": 0.4955638484541433, "grad_norm": 4.0428888167506445, "learning_rate": 1.0628800015072368e-07, "loss": 0.6596, "step": 5013 }, { "epoch": 0.49566270419889774, "grad_norm": 3.3302362107812153, "learning_rate": 1.0625604206986168e-07, "loss": 0.642, "step": 5014 }, { "epoch": 0.49576155994365223, "grad_norm": 4.496326321937332, "learning_rate": 1.0622408334753485e-07, "loss": 0.6662, "step": 5015 }, { "epoch": 0.4958604156884067, "grad_norm": 3.070089463176683, "learning_rate": 1.0619212398702007e-07, "loss": 0.6216, "step": 5016 }, { "epoch": 0.49595927143316115, "grad_norm": 3.020486485168461, "learning_rate": 1.0616016399159433e-07, "loss": 0.62, "step": 5017 }, { "epoch": 0.49605812717791564, "grad_norm": 20.22838156412443, "learning_rate": 1.0612820336453465e-07, "loss": 0.677, "step": 5018 }, { "epoch": 0.4961569829226701, "grad_norm": 3.9063623596569204, "learning_rate": 1.0609624210911812e-07, "loss": 0.7841, "step": 5019 }, { "epoch": 0.49625583866742456, "grad_norm": 4.011213146817881, "learning_rate": 1.0606428022862184e-07, "loss": 0.5877, "step": 5020 }, { "epoch": 0.49635469441217905, "grad_norm": 3.703255820278696, "learning_rate": 1.0603231772632315e-07, "loss": 0.706, "step": 5021 }, { "epoch": 0.4964535501569335, "grad_norm": 7.28321698606604, "learning_rate": 1.0600035460549921e-07, "loss": 0.6038, "step": 5022 }, { "epoch": 0.496552405901688, "grad_norm": 4.195299232698053, "learning_rate": 1.0596839086942741e-07, "loss": 0.7447, "step": 5023 }, { "epoch": 0.4966512616464424, "grad_norm": 6.76027746362631, "learning_rate": 1.0593642652138519e-07, "loss": 0.6998, "step": 5024 }, { "epoch": 0.4967501173911969, "grad_norm": 4.100007129441382, "learning_rate": 1.0590446156465e-07, "loss": 0.6783, "step": 5025 }, { "epoch": 0.4968489731359514, "grad_norm": 4.6506920134143, "learning_rate": 1.0587249600249937e-07, "loss": 0.7802, "step": 5026 }, { "epoch": 0.4969478288807058, "grad_norm": 7.231279095645681, "learning_rate": 1.0584052983821087e-07, "loss": 0.7239, "step": 5027 }, { "epoch": 0.4970466846254603, "grad_norm": 4.560547802296419, "learning_rate": 1.0580856307506223e-07, "loss": 0.692, "step": 5028 }, { "epoch": 0.49714554037021474, "grad_norm": 4.038279041149098, "learning_rate": 1.0577659571633111e-07, "loss": 0.6925, "step": 5029 }, { "epoch": 0.49724439611496923, "grad_norm": 3.7659357234343815, "learning_rate": 1.0574462776529531e-07, "loss": 0.7079, "step": 5030 }, { "epoch": 0.4973432518597237, "grad_norm": 3.639560907065725, "learning_rate": 1.0571265922523269e-07, "loss": 0.6966, "step": 5031 }, { "epoch": 0.49744210760447816, "grad_norm": 6.320479606522326, "learning_rate": 1.056806900994211e-07, "loss": 0.7409, "step": 5032 }, { "epoch": 0.49754096334923265, "grad_norm": 3.357759264460438, "learning_rate": 1.0564872039113857e-07, "loss": 0.7769, "step": 5033 }, { "epoch": 0.4976398190939871, "grad_norm": 3.493563400968946, "learning_rate": 1.0561675010366306e-07, "loss": 0.8383, "step": 5034 }, { "epoch": 0.49773867483874157, "grad_norm": 4.596751370131817, "learning_rate": 1.055847792402727e-07, "loss": 0.7593, "step": 5035 }, { "epoch": 0.49783753058349606, "grad_norm": 3.261957459812754, "learning_rate": 1.055528078042456e-07, "loss": 0.6537, "step": 5036 }, { "epoch": 0.4979363863282505, "grad_norm": 3.8465278989835383, "learning_rate": 1.0552083579885998e-07, "loss": 0.6029, "step": 5037 }, { "epoch": 0.498035242073005, "grad_norm": 3.2451251957141123, "learning_rate": 1.054888632273941e-07, "loss": 0.7566, "step": 5038 }, { "epoch": 0.4981340978177594, "grad_norm": 4.768612960692695, "learning_rate": 1.0545689009312624e-07, "loss": 0.7796, "step": 5039 }, { "epoch": 0.4982329535625139, "grad_norm": 3.8758635074619776, "learning_rate": 1.054249163993348e-07, "loss": 0.7251, "step": 5040 }, { "epoch": 0.4983318093072684, "grad_norm": 3.7518217612703477, "learning_rate": 1.0539294214929821e-07, "loss": 0.6249, "step": 5041 }, { "epoch": 0.4984306650520228, "grad_norm": 6.265295092139124, "learning_rate": 1.0536096734629494e-07, "loss": 0.7092, "step": 5042 }, { "epoch": 0.4985295207967773, "grad_norm": 4.052293808920071, "learning_rate": 1.0532899199360354e-07, "loss": 0.6427, "step": 5043 }, { "epoch": 0.49862837654153175, "grad_norm": 2.5715405623678405, "learning_rate": 1.0529701609450266e-07, "loss": 0.663, "step": 5044 }, { "epoch": 0.49872723228628624, "grad_norm": 4.38825780774269, "learning_rate": 1.0526503965227087e-07, "loss": 0.726, "step": 5045 }, { "epoch": 0.4988260880310407, "grad_norm": 2.9601955626616734, "learning_rate": 1.0523306267018695e-07, "loss": 0.7776, "step": 5046 }, { "epoch": 0.49892494377579516, "grad_norm": 6.246606028877477, "learning_rate": 1.0520108515152966e-07, "loss": 0.6923, "step": 5047 }, { "epoch": 0.49902379952054965, "grad_norm": 5.14330628594759, "learning_rate": 1.051691070995778e-07, "loss": 0.7717, "step": 5048 }, { "epoch": 0.4991226552653041, "grad_norm": 3.4823069337695833, "learning_rate": 1.0513712851761025e-07, "loss": 0.7207, "step": 5049 }, { "epoch": 0.49922151101005857, "grad_norm": 3.26302499917406, "learning_rate": 1.0510514940890596e-07, "loss": 0.7716, "step": 5050 }, { "epoch": 0.49932036675481306, "grad_norm": 4.3728703416083015, "learning_rate": 1.050731697767439e-07, "loss": 0.706, "step": 5051 }, { "epoch": 0.4994192224995675, "grad_norm": 5.4408352585657855, "learning_rate": 1.0504118962440311e-07, "loss": 0.6704, "step": 5052 }, { "epoch": 0.499518078244322, "grad_norm": 3.4812510528833402, "learning_rate": 1.0500920895516266e-07, "loss": 0.7085, "step": 5053 }, { "epoch": 0.4996169339890764, "grad_norm": 3.232223468575376, "learning_rate": 1.0497722777230176e-07, "loss": 0.7111, "step": 5054 }, { "epoch": 0.4997157897338309, "grad_norm": 5.548592464449253, "learning_rate": 1.0494524607909956e-07, "loss": 0.7049, "step": 5055 }, { "epoch": 0.4998146454785854, "grad_norm": 3.785803594530723, "learning_rate": 1.0491326387883532e-07, "loss": 0.6959, "step": 5056 }, { "epoch": 0.49991350122333983, "grad_norm": 3.2230721962357, "learning_rate": 1.0488128117478836e-07, "loss": 0.79, "step": 5057 }, { "epoch": 0.5000123569680943, "grad_norm": 23.531308969988828, "learning_rate": 1.04849297970238e-07, "loss": 0.5968, "step": 5058 }, { "epoch": 0.5001112127128488, "grad_norm": 4.85752578815118, "learning_rate": 1.048173142684637e-07, "loss": 0.6882, "step": 5059 }, { "epoch": 0.5002100684576033, "grad_norm": 3.3964328554239462, "learning_rate": 1.0478533007274482e-07, "loss": 0.7655, "step": 5060 }, { "epoch": 0.5003089242023577, "grad_norm": 4.600841991442908, "learning_rate": 1.0475334538636098e-07, "loss": 0.6631, "step": 5061 }, { "epoch": 0.5004077799471122, "grad_norm": 5.802837513349761, "learning_rate": 1.0472136021259166e-07, "loss": 0.7694, "step": 5062 }, { "epoch": 0.5005066356918666, "grad_norm": 3.483054735334268, "learning_rate": 1.0468937455471652e-07, "loss": 0.6191, "step": 5063 }, { "epoch": 0.5006054914366211, "grad_norm": 3.777220123432716, "learning_rate": 1.0465738841601516e-07, "loss": 0.6588, "step": 5064 }, { "epoch": 0.5007043471813756, "grad_norm": 3.259472870371891, "learning_rate": 1.0462540179976735e-07, "loss": 0.742, "step": 5065 }, { "epoch": 0.50080320292613, "grad_norm": 3.6318339877154178, "learning_rate": 1.0459341470925283e-07, "loss": 0.7148, "step": 5066 }, { "epoch": 0.5009020586708846, "grad_norm": 7.768127885232397, "learning_rate": 1.0456142714775134e-07, "loss": 0.7385, "step": 5067 }, { "epoch": 0.501000914415639, "grad_norm": 2.7609873819795667, "learning_rate": 1.0452943911854279e-07, "loss": 0.7288, "step": 5068 }, { "epoch": 0.5010997701603934, "grad_norm": 4.685291127054411, "learning_rate": 1.0449745062490711e-07, "loss": 0.6148, "step": 5069 }, { "epoch": 0.501198625905148, "grad_norm": 5.067410968390232, "learning_rate": 1.0446546167012418e-07, "loss": 0.8727, "step": 5070 }, { "epoch": 0.5012974816499024, "grad_norm": 4.6172892146910804, "learning_rate": 1.0443347225747403e-07, "loss": 0.6946, "step": 5071 }, { "epoch": 0.5013963373946568, "grad_norm": 3.327521992282824, "learning_rate": 1.0440148239023669e-07, "loss": 0.7325, "step": 5072 }, { "epoch": 0.5014951931394113, "grad_norm": 5.040498064472465, "learning_rate": 1.0436949207169228e-07, "loss": 0.6842, "step": 5073 }, { "epoch": 0.5015940488841658, "grad_norm": 6.435603453996979, "learning_rate": 1.0433750130512092e-07, "loss": 0.737, "step": 5074 }, { "epoch": 0.5016929046289202, "grad_norm": 3.864804608093426, "learning_rate": 1.0430551009380275e-07, "loss": 0.7834, "step": 5075 }, { "epoch": 0.5017917603736747, "grad_norm": 3.7715096531100034, "learning_rate": 1.0427351844101806e-07, "loss": 0.6934, "step": 5076 }, { "epoch": 0.5018906161184292, "grad_norm": 28.95598901390921, "learning_rate": 1.0424152635004709e-07, "loss": 0.7093, "step": 5077 }, { "epoch": 0.5019894718631837, "grad_norm": 3.022504717229382, "learning_rate": 1.0420953382417013e-07, "loss": 0.7055, "step": 5078 }, { "epoch": 0.5020883276079381, "grad_norm": 3.7343294773832527, "learning_rate": 1.0417754086666759e-07, "loss": 0.7308, "step": 5079 }, { "epoch": 0.5021871833526926, "grad_norm": 3.367538565609187, "learning_rate": 1.0414554748081985e-07, "loss": 0.7023, "step": 5080 }, { "epoch": 0.5022860390974471, "grad_norm": 5.035334503354703, "learning_rate": 1.0411355366990737e-07, "loss": 0.6035, "step": 5081 }, { "epoch": 0.5023848948422015, "grad_norm": 13.296338822586444, "learning_rate": 1.0408155943721066e-07, "loss": 0.6405, "step": 5082 }, { "epoch": 0.5024837505869559, "grad_norm": 4.454165977119614, "learning_rate": 1.0404956478601021e-07, "loss": 0.7537, "step": 5083 }, { "epoch": 0.5025826063317105, "grad_norm": 5.273263540296186, "learning_rate": 1.0401756971958666e-07, "loss": 0.6619, "step": 5084 }, { "epoch": 0.5026814620764649, "grad_norm": 4.803211778154168, "learning_rate": 1.0398557424122054e-07, "loss": 0.7319, "step": 5085 }, { "epoch": 0.5027803178212193, "grad_norm": 5.5100420831309025, "learning_rate": 1.0395357835419263e-07, "loss": 0.6834, "step": 5086 }, { "epoch": 0.5028791735659739, "grad_norm": 3.3279158132159896, "learning_rate": 1.0392158206178356e-07, "loss": 0.6641, "step": 5087 }, { "epoch": 0.5029780293107283, "grad_norm": 3.408262926431073, "learning_rate": 1.038895853672741e-07, "loss": 0.5861, "step": 5088 }, { "epoch": 0.5030768850554828, "grad_norm": 5.997607109953472, "learning_rate": 1.0385758827394502e-07, "loss": 0.7022, "step": 5089 }, { "epoch": 0.5031757408002373, "grad_norm": 8.552085199645681, "learning_rate": 1.0382559078507717e-07, "loss": 0.7537, "step": 5090 }, { "epoch": 0.5032745965449917, "grad_norm": 18.24595993812914, "learning_rate": 1.0379359290395142e-07, "loss": 0.7635, "step": 5091 }, { "epoch": 0.5033734522897462, "grad_norm": 4.829693449294128, "learning_rate": 1.037615946338487e-07, "loss": 0.6548, "step": 5092 }, { "epoch": 0.5034723080345006, "grad_norm": 3.16054147862091, "learning_rate": 1.0372959597804992e-07, "loss": 0.739, "step": 5093 }, { "epoch": 0.5035711637792551, "grad_norm": 4.8708693283791495, "learning_rate": 1.0369759693983608e-07, "loss": 0.6941, "step": 5094 }, { "epoch": 0.5036700195240096, "grad_norm": 3.4371241105043517, "learning_rate": 1.0366559752248823e-07, "loss": 0.7047, "step": 5095 }, { "epoch": 0.503768875268764, "grad_norm": 3.494896385057394, "learning_rate": 1.036335977292874e-07, "loss": 0.6932, "step": 5096 }, { "epoch": 0.5038677310135186, "grad_norm": 8.062785604954087, "learning_rate": 1.0360159756351474e-07, "loss": 0.7593, "step": 5097 }, { "epoch": 0.503966586758273, "grad_norm": 3.6887712257974026, "learning_rate": 1.0356959702845139e-07, "loss": 0.7505, "step": 5098 }, { "epoch": 0.5040654425030274, "grad_norm": 4.493360868755047, "learning_rate": 1.0353759612737851e-07, "loss": 0.7897, "step": 5099 }, { "epoch": 0.504164298247782, "grad_norm": 3.2044107262557455, "learning_rate": 1.0350559486357732e-07, "loss": 0.7438, "step": 5100 }, { "epoch": 0.5042631539925364, "grad_norm": 2.7455519238809543, "learning_rate": 1.0347359324032912e-07, "loss": 0.6502, "step": 5101 }, { "epoch": 0.5043620097372908, "grad_norm": 3.223588495610112, "learning_rate": 1.0344159126091514e-07, "loss": 0.5932, "step": 5102 }, { "epoch": 0.5044608654820453, "grad_norm": 8.64157431519327, "learning_rate": 1.0340958892861675e-07, "loss": 0.6908, "step": 5103 }, { "epoch": 0.5045597212267998, "grad_norm": 3.918983925152668, "learning_rate": 1.0337758624671531e-07, "loss": 0.6875, "step": 5104 }, { "epoch": 0.5046585769715543, "grad_norm": 3.8080882943896426, "learning_rate": 1.0334558321849223e-07, "loss": 0.6938, "step": 5105 }, { "epoch": 0.5047574327163087, "grad_norm": 3.4184332150044354, "learning_rate": 1.0331357984722893e-07, "loss": 0.7184, "step": 5106 }, { "epoch": 0.5048562884610632, "grad_norm": 3.056512970246195, "learning_rate": 1.0328157613620689e-07, "loss": 0.7281, "step": 5107 }, { "epoch": 0.5049551442058177, "grad_norm": 14.220366318296442, "learning_rate": 1.0324957208870764e-07, "loss": 0.7566, "step": 5108 }, { "epoch": 0.5050539999505721, "grad_norm": 9.071063719628482, "learning_rate": 1.0321756770801273e-07, "loss": 0.8569, "step": 5109 }, { "epoch": 0.5051528556953266, "grad_norm": 22.874070004169347, "learning_rate": 1.0318556299740368e-07, "loss": 0.6875, "step": 5110 }, { "epoch": 0.5052517114400811, "grad_norm": 21.982008593108702, "learning_rate": 1.0315355796016214e-07, "loss": 0.7047, "step": 5111 }, { "epoch": 0.5053505671848355, "grad_norm": 5.534161950524919, "learning_rate": 1.0312155259956976e-07, "loss": 0.7331, "step": 5112 }, { "epoch": 0.5054494229295899, "grad_norm": 5.679967048738447, "learning_rate": 1.0308954691890819e-07, "loss": 0.7543, "step": 5113 }, { "epoch": 0.5055482786743445, "grad_norm": 3.8671769368750684, "learning_rate": 1.0305754092145917e-07, "loss": 0.7414, "step": 5114 }, { "epoch": 0.5056471344190989, "grad_norm": 5.2003656977615895, "learning_rate": 1.030255346105044e-07, "loss": 0.6228, "step": 5115 }, { "epoch": 0.5057459901638534, "grad_norm": 5.218802482180568, "learning_rate": 1.0299352798932573e-07, "loss": 0.7447, "step": 5116 }, { "epoch": 0.5058448459086079, "grad_norm": 3.2587440320237966, "learning_rate": 1.0296152106120491e-07, "loss": 0.7435, "step": 5117 }, { "epoch": 0.5059437016533623, "grad_norm": 6.088887244130914, "learning_rate": 1.0292951382942378e-07, "loss": 0.7588, "step": 5118 }, { "epoch": 0.5060425573981168, "grad_norm": 4.5291697676512666, "learning_rate": 1.0289750629726423e-07, "loss": 0.799, "step": 5119 }, { "epoch": 0.5061414131428713, "grad_norm": 4.163832501718261, "learning_rate": 1.0286549846800816e-07, "loss": 0.7405, "step": 5120 }, { "epoch": 0.5062402688876257, "grad_norm": 3.3796633520404247, "learning_rate": 1.0283349034493749e-07, "loss": 0.7056, "step": 5121 }, { "epoch": 0.5063391246323802, "grad_norm": 16.73013739419581, "learning_rate": 1.0280148193133413e-07, "loss": 0.7281, "step": 5122 }, { "epoch": 0.5064379803771346, "grad_norm": 9.975423516502135, "learning_rate": 1.0276947323048017e-07, "loss": 0.6973, "step": 5123 }, { "epoch": 0.5065368361218892, "grad_norm": 3.836962076531748, "learning_rate": 1.0273746424565758e-07, "loss": 0.7428, "step": 5124 }, { "epoch": 0.5066356918666436, "grad_norm": 3.8179497709529238, "learning_rate": 1.0270545498014842e-07, "loss": 0.7078, "step": 5125 }, { "epoch": 0.506734547611398, "grad_norm": 4.762315556206525, "learning_rate": 1.0267344543723475e-07, "loss": 0.7376, "step": 5126 }, { "epoch": 0.5068334033561526, "grad_norm": 23.804401027496468, "learning_rate": 1.0264143562019869e-07, "loss": 0.755, "step": 5127 }, { "epoch": 0.506932259100907, "grad_norm": 3.1423667171901974, "learning_rate": 1.0260942553232236e-07, "loss": 0.8391, "step": 5128 }, { "epoch": 0.5070311148456614, "grad_norm": 3.448777551216671, "learning_rate": 1.0257741517688792e-07, "loss": 0.7023, "step": 5129 }, { "epoch": 0.507129970590416, "grad_norm": 5.8102254002219995, "learning_rate": 1.0254540455717759e-07, "loss": 0.7307, "step": 5130 }, { "epoch": 0.5072288263351704, "grad_norm": 4.713276967394988, "learning_rate": 1.0251339367647358e-07, "loss": 0.7176, "step": 5131 }, { "epoch": 0.5073276820799248, "grad_norm": 4.993035647270986, "learning_rate": 1.0248138253805812e-07, "loss": 0.718, "step": 5132 }, { "epoch": 0.5074265378246794, "grad_norm": 5.434236201759911, "learning_rate": 1.0244937114521346e-07, "loss": 0.5812, "step": 5133 }, { "epoch": 0.5075253935694338, "grad_norm": 6.219552462818052, "learning_rate": 1.0241735950122194e-07, "loss": 0.6319, "step": 5134 }, { "epoch": 0.5076242493141883, "grad_norm": 2.852682538350435, "learning_rate": 1.0238534760936588e-07, "loss": 0.6421, "step": 5135 }, { "epoch": 0.5077231050589427, "grad_norm": 5.097951143512171, "learning_rate": 1.0235333547292761e-07, "loss": 0.6886, "step": 5136 }, { "epoch": 0.5078219608036972, "grad_norm": 3.772337278373112, "learning_rate": 1.0232132309518946e-07, "loss": 0.6044, "step": 5137 }, { "epoch": 0.5079208165484517, "grad_norm": 3.9054236547256176, "learning_rate": 1.0228931047943389e-07, "loss": 0.6452, "step": 5138 }, { "epoch": 0.5080196722932061, "grad_norm": 3.8508755939247865, "learning_rate": 1.022572976289433e-07, "loss": 0.61, "step": 5139 }, { "epoch": 0.5081185280379606, "grad_norm": 8.499286326453216, "learning_rate": 1.0222528454700013e-07, "loss": 0.7948, "step": 5140 }, { "epoch": 0.5082173837827151, "grad_norm": 4.164683368315368, "learning_rate": 1.0219327123688686e-07, "loss": 0.8173, "step": 5141 }, { "epoch": 0.5083162395274695, "grad_norm": 4.8170643666537, "learning_rate": 1.0216125770188597e-07, "loss": 0.7723, "step": 5142 }, { "epoch": 0.5084150952722241, "grad_norm": 7.572281133703752, "learning_rate": 1.0212924394528e-07, "loss": 0.6941, "step": 5143 }, { "epoch": 0.5085139510169785, "grad_norm": 4.555360599937588, "learning_rate": 1.0209722997035146e-07, "loss": 0.6208, "step": 5144 }, { "epoch": 0.5086128067617329, "grad_norm": 5.179980797423835, "learning_rate": 1.0206521578038294e-07, "loss": 0.6981, "step": 5145 }, { "epoch": 0.5087116625064874, "grad_norm": 3.1993187337342985, "learning_rate": 1.0203320137865699e-07, "loss": 0.6813, "step": 5146 }, { "epoch": 0.5088105182512419, "grad_norm": 3.936348817172037, "learning_rate": 1.0200118676845624e-07, "loss": 0.7492, "step": 5147 }, { "epoch": 0.5089093739959963, "grad_norm": 12.927839544668123, "learning_rate": 1.0196917195306328e-07, "loss": 0.686, "step": 5148 }, { "epoch": 0.5090082297407508, "grad_norm": 4.195562827286109, "learning_rate": 1.0193715693576079e-07, "loss": 0.8012, "step": 5149 }, { "epoch": 0.5091070854855053, "grad_norm": 5.18228485381953, "learning_rate": 1.0190514171983146e-07, "loss": 0.7079, "step": 5150 }, { "epoch": 0.5092059412302598, "grad_norm": 4.3051513160659765, "learning_rate": 1.0187312630855791e-07, "loss": 0.6558, "step": 5151 }, { "epoch": 0.5093047969750142, "grad_norm": 3.669228064002108, "learning_rate": 1.0184111070522292e-07, "loss": 0.712, "step": 5152 }, { "epoch": 0.5094036527197687, "grad_norm": 2.8312284064662188, "learning_rate": 1.0180909491310921e-07, "loss": 0.7698, "step": 5153 }, { "epoch": 0.5095025084645232, "grad_norm": 4.678167712805343, "learning_rate": 1.0177707893549947e-07, "loss": 0.6712, "step": 5154 }, { "epoch": 0.5096013642092776, "grad_norm": 3.9397437605349435, "learning_rate": 1.0174506277567647e-07, "loss": 0.7273, "step": 5155 }, { "epoch": 0.509700219954032, "grad_norm": 5.550400268314377, "learning_rate": 1.0171304643692305e-07, "loss": 0.686, "step": 5156 }, { "epoch": 0.5097990756987866, "grad_norm": 5.976739558639031, "learning_rate": 1.0168102992252199e-07, "loss": 0.748, "step": 5157 }, { "epoch": 0.509897931443541, "grad_norm": 142.2819580157784, "learning_rate": 1.0164901323575612e-07, "loss": 0.712, "step": 5158 }, { "epoch": 0.5099967871882954, "grad_norm": 3.6344098239381113, "learning_rate": 1.0161699637990824e-07, "loss": 0.7775, "step": 5159 }, { "epoch": 0.51009564293305, "grad_norm": 3.8497019485257304, "learning_rate": 1.0158497935826125e-07, "loss": 0.7661, "step": 5160 }, { "epoch": 0.5101944986778044, "grad_norm": 5.204497622541665, "learning_rate": 1.0155296217409802e-07, "loss": 0.6897, "step": 5161 }, { "epoch": 0.5102933544225589, "grad_norm": 4.673098396692905, "learning_rate": 1.0152094483070139e-07, "loss": 0.7807, "step": 5162 }, { "epoch": 0.5103922101673134, "grad_norm": 5.693224136089495, "learning_rate": 1.0148892733135434e-07, "loss": 0.7649, "step": 5163 }, { "epoch": 0.5104910659120678, "grad_norm": 4.24875032997248, "learning_rate": 1.0145690967933973e-07, "loss": 0.6796, "step": 5164 }, { "epoch": 0.5105899216568223, "grad_norm": 3.7453503495465967, "learning_rate": 1.0142489187794054e-07, "loss": 0.8031, "step": 5165 }, { "epoch": 0.5106887774015767, "grad_norm": 9.061420243019416, "learning_rate": 1.0139287393043969e-07, "loss": 0.7696, "step": 5166 }, { "epoch": 0.5107876331463312, "grad_norm": 3.8839209691581367, "learning_rate": 1.0136085584012017e-07, "loss": 0.9186, "step": 5167 }, { "epoch": 0.5108864888910857, "grad_norm": 4.920805585373029, "learning_rate": 1.0132883761026499e-07, "loss": 0.8102, "step": 5168 }, { "epoch": 0.5109853446358401, "grad_norm": 3.476681553549399, "learning_rate": 1.0129681924415708e-07, "loss": 0.747, "step": 5169 }, { "epoch": 0.5110842003805947, "grad_norm": 3.164388232449787, "learning_rate": 1.0126480074507949e-07, "loss": 0.8058, "step": 5170 }, { "epoch": 0.5111830561253491, "grad_norm": 4.306732297698357, "learning_rate": 1.0123278211631525e-07, "loss": 0.636, "step": 5171 }, { "epoch": 0.5112819118701035, "grad_norm": 3.370488024708446, "learning_rate": 1.0120076336114738e-07, "loss": 0.6589, "step": 5172 }, { "epoch": 0.5113807676148581, "grad_norm": 3.415411654343872, "learning_rate": 1.0116874448285892e-07, "loss": 0.6243, "step": 5173 }, { "epoch": 0.5114796233596125, "grad_norm": 3.5450332726854112, "learning_rate": 1.0113672548473297e-07, "loss": 0.7661, "step": 5174 }, { "epoch": 0.5115784791043669, "grad_norm": 5.171405128140053, "learning_rate": 1.0110470637005261e-07, "loss": 0.6753, "step": 5175 }, { "epoch": 0.5116773348491214, "grad_norm": 4.166783663456934, "learning_rate": 1.010726871421009e-07, "loss": 0.6717, "step": 5176 }, { "epoch": 0.5117761905938759, "grad_norm": 6.567135829836197, "learning_rate": 1.0104066780416091e-07, "loss": 0.6553, "step": 5177 }, { "epoch": 0.5118750463386303, "grad_norm": 3.57172831700378, "learning_rate": 1.0100864835951584e-07, "loss": 0.7242, "step": 5178 }, { "epoch": 0.5119739020833848, "grad_norm": 3.8908196605736904, "learning_rate": 1.0097662881144873e-07, "loss": 0.7761, "step": 5179 }, { "epoch": 0.5120727578281393, "grad_norm": 3.9559701276201094, "learning_rate": 1.0094460916324279e-07, "loss": 0.5855, "step": 5180 }, { "epoch": 0.5121716135728938, "grad_norm": 3.237798191440199, "learning_rate": 1.0091258941818104e-07, "loss": 0.7348, "step": 5181 }, { "epoch": 0.5122704693176482, "grad_norm": 3.3036582043503078, "learning_rate": 1.0088056957954673e-07, "loss": 0.7364, "step": 5182 }, { "epoch": 0.5123693250624027, "grad_norm": 4.089944748045106, "learning_rate": 1.0084854965062301e-07, "loss": 0.6559, "step": 5183 }, { "epoch": 0.5124681808071572, "grad_norm": 7.548530594243388, "learning_rate": 1.0081652963469303e-07, "loss": 0.8397, "step": 5184 }, { "epoch": 0.5125670365519116, "grad_norm": 4.623602792953504, "learning_rate": 1.0078450953504001e-07, "loss": 0.7855, "step": 5185 }, { "epoch": 0.512665892296666, "grad_norm": 2.907186252271077, "learning_rate": 1.007524893549471e-07, "loss": 0.7495, "step": 5186 }, { "epoch": 0.5127647480414206, "grad_norm": 9.920193790916882, "learning_rate": 1.0072046909769747e-07, "loss": 0.638, "step": 5187 }, { "epoch": 0.512863603786175, "grad_norm": 3.9355850675224477, "learning_rate": 1.006884487665744e-07, "loss": 0.774, "step": 5188 }, { "epoch": 0.5129624595309294, "grad_norm": 3.448045786761459, "learning_rate": 1.0065642836486106e-07, "loss": 0.6504, "step": 5189 }, { "epoch": 0.513061315275684, "grad_norm": 6.552323919328628, "learning_rate": 1.0062440789584066e-07, "loss": 0.6768, "step": 5190 }, { "epoch": 0.5131601710204384, "grad_norm": 6.083481870527904, "learning_rate": 1.0059238736279645e-07, "loss": 0.6908, "step": 5191 }, { "epoch": 0.5132590267651929, "grad_norm": 5.792277355776187, "learning_rate": 1.0056036676901163e-07, "loss": 0.6785, "step": 5192 }, { "epoch": 0.5133578825099474, "grad_norm": 7.41000081424419, "learning_rate": 1.005283461177695e-07, "loss": 0.676, "step": 5193 }, { "epoch": 0.5134567382547018, "grad_norm": 3.8747432307773657, "learning_rate": 1.0049632541235324e-07, "loss": 0.7829, "step": 5194 }, { "epoch": 0.5135555939994563, "grad_norm": 4.7933927384914545, "learning_rate": 1.0046430465604614e-07, "loss": 0.5304, "step": 5195 }, { "epoch": 0.5136544497442107, "grad_norm": 3.3367437924634484, "learning_rate": 1.0043228385213147e-07, "loss": 0.7934, "step": 5196 }, { "epoch": 0.5137533054889653, "grad_norm": 3.8873580859976236, "learning_rate": 1.0040026300389246e-07, "loss": 0.6835, "step": 5197 }, { "epoch": 0.5138521612337197, "grad_norm": 4.4507919105265525, "learning_rate": 1.0036824211461237e-07, "loss": 0.6893, "step": 5198 }, { "epoch": 0.5139510169784741, "grad_norm": 4.017058438683735, "learning_rate": 1.0033622118757448e-07, "loss": 0.7206, "step": 5199 }, { "epoch": 0.5140498727232287, "grad_norm": 3.9534355430569166, "learning_rate": 1.0030420022606205e-07, "loss": 0.6959, "step": 5200 }, { "epoch": 0.5141487284679831, "grad_norm": 4.616934756735788, "learning_rate": 1.002721792333584e-07, "loss": 0.6958, "step": 5201 }, { "epoch": 0.5142475842127375, "grad_norm": 4.861821462753415, "learning_rate": 1.0024015821274678e-07, "loss": 0.754, "step": 5202 }, { "epoch": 0.5143464399574921, "grad_norm": 6.612771220690631, "learning_rate": 1.0020813716751047e-07, "loss": 0.6958, "step": 5203 }, { "epoch": 0.5144452957022465, "grad_norm": 8.715288422021418, "learning_rate": 1.0017611610093277e-07, "loss": 0.7077, "step": 5204 }, { "epoch": 0.5145441514470009, "grad_norm": 4.112484682583029, "learning_rate": 1.0014409501629697e-07, "loss": 0.6087, "step": 5205 }, { "epoch": 0.5146430071917555, "grad_norm": 4.520008954717863, "learning_rate": 1.0011207391688637e-07, "loss": 0.644, "step": 5206 }, { "epoch": 0.5147418629365099, "grad_norm": 3.2701410613526014, "learning_rate": 1.0008005280598419e-07, "loss": 0.6765, "step": 5207 }, { "epoch": 0.5148407186812644, "grad_norm": 6.455400598492269, "learning_rate": 1.0004803168687382e-07, "loss": 0.7496, "step": 5208 }, { "epoch": 0.5149395744260188, "grad_norm": 4.809831205254909, "learning_rate": 1.000160105628385e-07, "loss": 0.8063, "step": 5209 }, { "epoch": 0.5150384301707733, "grad_norm": 3.3885573019784947, "learning_rate": 9.99839894371615e-08, "loss": 0.6612, "step": 5210 }, { "epoch": 0.5151372859155278, "grad_norm": 6.7911551954643254, "learning_rate": 9.99519683131262e-08, "loss": 0.615, "step": 5211 }, { "epoch": 0.5152361416602822, "grad_norm": 6.176796459895119, "learning_rate": 9.99199471940158e-08, "loss": 0.7829, "step": 5212 }, { "epoch": 0.5153349974050367, "grad_norm": 4.372200004356536, "learning_rate": 9.988792608311365e-08, "loss": 0.7035, "step": 5213 }, { "epoch": 0.5154338531497912, "grad_norm": 6.069113498501078, "learning_rate": 9.985590498370304e-08, "loss": 0.709, "step": 5214 }, { "epoch": 0.5155327088945456, "grad_norm": 4.298654150422346, "learning_rate": 9.982388389906723e-08, "loss": 0.6624, "step": 5215 }, { "epoch": 0.5156315646393002, "grad_norm": 3.2326061885799406, "learning_rate": 9.97918628324895e-08, "loss": 0.7043, "step": 5216 }, { "epoch": 0.5157304203840546, "grad_norm": 5.146232945356326, "learning_rate": 9.975984178725323e-08, "loss": 0.7432, "step": 5217 }, { "epoch": 0.515829276128809, "grad_norm": 3.864810953749978, "learning_rate": 9.972782076664159e-08, "loss": 0.7482, "step": 5218 }, { "epoch": 0.5159281318735635, "grad_norm": 4.9190639591151895, "learning_rate": 9.969579977393792e-08, "loss": 0.7433, "step": 5219 }, { "epoch": 0.516026987618318, "grad_norm": 3.8908218099868113, "learning_rate": 9.966377881242554e-08, "loss": 0.7312, "step": 5220 }, { "epoch": 0.5161258433630724, "grad_norm": 3.2843813635241794, "learning_rate": 9.963175788538764e-08, "loss": 0.726, "step": 5221 }, { "epoch": 0.5162246991078269, "grad_norm": 11.448661674785312, "learning_rate": 9.959973699610757e-08, "loss": 0.6648, "step": 5222 }, { "epoch": 0.5163235548525814, "grad_norm": 13.606729043089802, "learning_rate": 9.956771614786852e-08, "loss": 0.8351, "step": 5223 }, { "epoch": 0.5164224105973358, "grad_norm": 25.953744511506006, "learning_rate": 9.953569534395384e-08, "loss": 0.8038, "step": 5224 }, { "epoch": 0.5165212663420903, "grad_norm": 3.514073206219885, "learning_rate": 9.950367458764677e-08, "loss": 0.6976, "step": 5225 }, { "epoch": 0.5166201220868448, "grad_norm": 3.601173306142156, "learning_rate": 9.94716538822305e-08, "loss": 0.7966, "step": 5226 }, { "epoch": 0.5167189778315993, "grad_norm": 4.123896239181222, "learning_rate": 9.943963323098833e-08, "loss": 0.765, "step": 5227 }, { "epoch": 0.5168178335763537, "grad_norm": 3.8432940011626617, "learning_rate": 9.940761263720357e-08, "loss": 0.5997, "step": 5228 }, { "epoch": 0.5169166893211081, "grad_norm": 30.719505042538515, "learning_rate": 9.937559210415933e-08, "loss": 0.6984, "step": 5229 }, { "epoch": 0.5170155450658627, "grad_norm": 4.0367666101635225, "learning_rate": 9.934357163513895e-08, "loss": 0.6536, "step": 5230 }, { "epoch": 0.5171144008106171, "grad_norm": 4.176012707811109, "learning_rate": 9.93115512334256e-08, "loss": 0.6907, "step": 5231 }, { "epoch": 0.5172132565553715, "grad_norm": 3.4289638096813775, "learning_rate": 9.927953090230252e-08, "loss": 0.614, "step": 5232 }, { "epoch": 0.5173121123001261, "grad_norm": 4.333031644724551, "learning_rate": 9.924751064505292e-08, "loss": 0.6901, "step": 5233 }, { "epoch": 0.5174109680448805, "grad_norm": 13.055056372965046, "learning_rate": 9.921549046496e-08, "loss": 0.6229, "step": 5234 }, { "epoch": 0.517509823789635, "grad_norm": 5.498379447409048, "learning_rate": 9.918347036530695e-08, "loss": 0.6551, "step": 5235 }, { "epoch": 0.5176086795343895, "grad_norm": 6.089428710759652, "learning_rate": 9.9151450349377e-08, "loss": 0.782, "step": 5236 }, { "epoch": 0.5177075352791439, "grad_norm": 4.303092302017949, "learning_rate": 9.911943042045326e-08, "loss": 0.7237, "step": 5237 }, { "epoch": 0.5178063910238984, "grad_norm": 3.372905109299711, "learning_rate": 9.908741058181895e-08, "loss": 0.5261, "step": 5238 }, { "epoch": 0.5179052467686528, "grad_norm": 5.554612226759997, "learning_rate": 9.905539083675724e-08, "loss": 0.6823, "step": 5239 }, { "epoch": 0.5180041025134073, "grad_norm": 3.174415449306823, "learning_rate": 9.902337118855128e-08, "loss": 0.7487, "step": 5240 }, { "epoch": 0.5181029582581618, "grad_norm": 3.8473932083703644, "learning_rate": 9.899135164048414e-08, "loss": 0.7666, "step": 5241 }, { "epoch": 0.5182018140029162, "grad_norm": 3.6183568430264894, "learning_rate": 9.895933219583908e-08, "loss": 0.6741, "step": 5242 }, { "epoch": 0.5183006697476708, "grad_norm": 4.673341645996973, "learning_rate": 9.89273128578991e-08, "loss": 0.6127, "step": 5243 }, { "epoch": 0.5183995254924252, "grad_norm": 2.951573839659583, "learning_rate": 9.88952936299474e-08, "loss": 0.7249, "step": 5244 }, { "epoch": 0.5184983812371796, "grad_norm": 5.778532143835413, "learning_rate": 9.886327451526702e-08, "loss": 0.6712, "step": 5245 }, { "epoch": 0.5185972369819342, "grad_norm": 3.3155600664817926, "learning_rate": 9.883125551714106e-08, "loss": 0.7173, "step": 5246 }, { "epoch": 0.5186960927266886, "grad_norm": 7.4333772082145035, "learning_rate": 9.879923663885265e-08, "loss": 0.7007, "step": 5247 }, { "epoch": 0.518794948471443, "grad_norm": 11.2828630746386, "learning_rate": 9.876721788368474e-08, "loss": 0.6314, "step": 5248 }, { "epoch": 0.5188938042161975, "grad_norm": 3.412410516030076, "learning_rate": 9.87351992549205e-08, "loss": 0.801, "step": 5249 }, { "epoch": 0.518992659960952, "grad_norm": 3.890896643775121, "learning_rate": 9.870318075584293e-08, "loss": 0.7304, "step": 5250 }, { "epoch": 0.5190915157057064, "grad_norm": 3.1257103693668484, "learning_rate": 9.867116238973503e-08, "loss": 0.6252, "step": 5251 }, { "epoch": 0.5191903714504609, "grad_norm": 21.35795565687115, "learning_rate": 9.86391441598798e-08, "loss": 0.6884, "step": 5252 }, { "epoch": 0.5192892271952154, "grad_norm": 3.1865135867723864, "learning_rate": 9.860712606956032e-08, "loss": 0.6601, "step": 5253 }, { "epoch": 0.5193880829399699, "grad_norm": 5.038067798009723, "learning_rate": 9.857510812205946e-08, "loss": 0.7356, "step": 5254 }, { "epoch": 0.5194869386847243, "grad_norm": 7.09118023591621, "learning_rate": 9.854309032066028e-08, "loss": 0.7758, "step": 5255 }, { "epoch": 0.5195857944294788, "grad_norm": 4.395564396480461, "learning_rate": 9.851107266864566e-08, "loss": 0.7167, "step": 5256 }, { "epoch": 0.5196846501742333, "grad_norm": 4.67886310070672, "learning_rate": 9.847905516929861e-08, "loss": 0.6517, "step": 5257 }, { "epoch": 0.5197835059189877, "grad_norm": 3.150371172776044, "learning_rate": 9.8447037825902e-08, "loss": 0.703, "step": 5258 }, { "epoch": 0.5198823616637421, "grad_norm": 4.065840212229686, "learning_rate": 9.841502064173875e-08, "loss": 0.5704, "step": 5259 }, { "epoch": 0.5199812174084967, "grad_norm": 4.7053417205395025, "learning_rate": 9.838300362009175e-08, "loss": 0.6417, "step": 5260 }, { "epoch": 0.5200800731532511, "grad_norm": 4.811618252193097, "learning_rate": 9.83509867642439e-08, "loss": 0.7635, "step": 5261 }, { "epoch": 0.5201789288980055, "grad_norm": 3.5557017912260878, "learning_rate": 9.8318970077478e-08, "loss": 0.7389, "step": 5262 }, { "epoch": 0.5202777846427601, "grad_norm": 4.852927795474712, "learning_rate": 9.828695356307692e-08, "loss": 0.6832, "step": 5263 }, { "epoch": 0.5203766403875145, "grad_norm": 3.8331197446808134, "learning_rate": 9.825493722432353e-08, "loss": 0.7171, "step": 5264 }, { "epoch": 0.520475496132269, "grad_norm": 3.2829345013563924, "learning_rate": 9.822292106450054e-08, "loss": 0.6038, "step": 5265 }, { "epoch": 0.5205743518770235, "grad_norm": 3.7848737299129644, "learning_rate": 9.819090508689083e-08, "loss": 0.7253, "step": 5266 }, { "epoch": 0.5206732076217779, "grad_norm": 5.299228562629602, "learning_rate": 9.81588892947771e-08, "loss": 0.7639, "step": 5267 }, { "epoch": 0.5207720633665324, "grad_norm": 10.091681620437388, "learning_rate": 9.812687369144207e-08, "loss": 0.6627, "step": 5268 }, { "epoch": 0.5208709191112868, "grad_norm": 4.346398286078482, "learning_rate": 9.809485828016857e-08, "loss": 0.6428, "step": 5269 }, { "epoch": 0.5209697748560413, "grad_norm": 3.1023145932430767, "learning_rate": 9.80628430642392e-08, "loss": 0.7347, "step": 5270 }, { "epoch": 0.5210686306007958, "grad_norm": 3.5102422033259137, "learning_rate": 9.803082804693671e-08, "loss": 0.7415, "step": 5271 }, { "epoch": 0.5211674863455502, "grad_norm": 5.01719246158629, "learning_rate": 9.799881323154379e-08, "loss": 0.7559, "step": 5272 }, { "epoch": 0.5212663420903048, "grad_norm": 3.983272583258675, "learning_rate": 9.796679862134302e-08, "loss": 0.6663, "step": 5273 }, { "epoch": 0.5213651978350592, "grad_norm": 6.311605848936584, "learning_rate": 9.793478421961707e-08, "loss": 0.6654, "step": 5274 }, { "epoch": 0.5214640535798136, "grad_norm": 4.762220336186127, "learning_rate": 9.790277002964853e-08, "loss": 0.707, "step": 5275 }, { "epoch": 0.5215629093245682, "grad_norm": 5.2960363403934645, "learning_rate": 9.787075605472e-08, "loss": 0.7728, "step": 5276 }, { "epoch": 0.5216617650693226, "grad_norm": 13.739321268757857, "learning_rate": 9.783874229811402e-08, "loss": 0.7873, "step": 5277 }, { "epoch": 0.521760620814077, "grad_norm": 9.501204247564678, "learning_rate": 9.780672876311316e-08, "loss": 0.7855, "step": 5278 }, { "epoch": 0.5218594765588315, "grad_norm": 3.5818092975818843, "learning_rate": 9.777471545299986e-08, "loss": 0.6771, "step": 5279 }, { "epoch": 0.521958332303586, "grad_norm": 3.8212807758113945, "learning_rate": 9.774270237105671e-08, "loss": 0.7305, "step": 5280 }, { "epoch": 0.5220571880483404, "grad_norm": 3.6429981389132666, "learning_rate": 9.771068952056611e-08, "loss": 0.7702, "step": 5281 }, { "epoch": 0.5221560437930949, "grad_norm": 7.985525678494868, "learning_rate": 9.767867690481052e-08, "loss": 0.7942, "step": 5282 }, { "epoch": 0.5222548995378494, "grad_norm": 32.33185405477215, "learning_rate": 9.764666452707242e-08, "loss": 0.8347, "step": 5283 }, { "epoch": 0.5223537552826039, "grad_norm": 3.347470362218499, "learning_rate": 9.761465239063414e-08, "loss": 0.6238, "step": 5284 }, { "epoch": 0.5224526110273583, "grad_norm": 3.4686541078437, "learning_rate": 9.758264049877804e-08, "loss": 0.7496, "step": 5285 }, { "epoch": 0.5225514667721128, "grad_norm": 5.162626535625287, "learning_rate": 9.755062885478654e-08, "loss": 0.6892, "step": 5286 }, { "epoch": 0.5226503225168673, "grad_norm": 3.8243612211473073, "learning_rate": 9.751861746194187e-08, "loss": 0.716, "step": 5287 }, { "epoch": 0.5227491782616217, "grad_norm": 4.526422719290918, "learning_rate": 9.748660632352645e-08, "loss": 0.6562, "step": 5288 }, { "epoch": 0.5228480340063762, "grad_norm": 7.206696790624657, "learning_rate": 9.74545954428224e-08, "loss": 0.6695, "step": 5289 }, { "epoch": 0.5229468897511307, "grad_norm": 4.386047485729767, "learning_rate": 9.742258482311205e-08, "loss": 0.7809, "step": 5290 }, { "epoch": 0.5230457454958851, "grad_norm": 3.3481502561278758, "learning_rate": 9.739057446767765e-08, "loss": 0.7167, "step": 5291 }, { "epoch": 0.5231446012406396, "grad_norm": 3.686013103652321, "learning_rate": 9.735856437980131e-08, "loss": 0.6786, "step": 5292 }, { "epoch": 0.5232434569853941, "grad_norm": 9.876831332956012, "learning_rate": 9.732655456276525e-08, "loss": 0.6447, "step": 5293 }, { "epoch": 0.5233423127301485, "grad_norm": 5.773988367260863, "learning_rate": 9.729454501985161e-08, "loss": 0.6395, "step": 5294 }, { "epoch": 0.523441168474903, "grad_norm": 4.647334979150752, "learning_rate": 9.72625357543424e-08, "loss": 0.772, "step": 5295 }, { "epoch": 0.5235400242196575, "grad_norm": 5.804408970521308, "learning_rate": 9.72305267695198e-08, "loss": 0.7193, "step": 5296 }, { "epoch": 0.5236388799644119, "grad_norm": 5.482655754507211, "learning_rate": 9.719851806866586e-08, "loss": 0.7366, "step": 5297 }, { "epoch": 0.5237377357091664, "grad_norm": 4.152915387587098, "learning_rate": 9.716650965506252e-08, "loss": 0.7396, "step": 5298 }, { "epoch": 0.5238365914539209, "grad_norm": 4.221233859165618, "learning_rate": 9.713450153199186e-08, "loss": 0.6486, "step": 5299 }, { "epoch": 0.5239354471986754, "grad_norm": 5.080527180483363, "learning_rate": 9.710249370273577e-08, "loss": 0.7544, "step": 5300 }, { "epoch": 0.5240343029434298, "grad_norm": 3.0696626920100094, "learning_rate": 9.707048617057622e-08, "loss": 0.6822, "step": 5301 }, { "epoch": 0.5241331586881842, "grad_norm": 5.208476985391087, "learning_rate": 9.703847893879508e-08, "loss": 0.725, "step": 5302 }, { "epoch": 0.5242320144329388, "grad_norm": 3.808271160397599, "learning_rate": 9.700647201067426e-08, "loss": 0.6428, "step": 5303 }, { "epoch": 0.5243308701776932, "grad_norm": 4.631353678306793, "learning_rate": 9.697446538949557e-08, "loss": 0.6309, "step": 5304 }, { "epoch": 0.5244297259224476, "grad_norm": 2.9671363513437994, "learning_rate": 9.694245907854085e-08, "loss": 0.6725, "step": 5305 }, { "epoch": 0.5245285816672022, "grad_norm": 3.5522472313030002, "learning_rate": 9.69104530810918e-08, "loss": 0.7153, "step": 5306 }, { "epoch": 0.5246274374119566, "grad_norm": 3.7366827826358993, "learning_rate": 9.687844740043023e-08, "loss": 0.6499, "step": 5307 }, { "epoch": 0.524726293156711, "grad_norm": 10.523144344247834, "learning_rate": 9.684644203983786e-08, "loss": 0.8197, "step": 5308 }, { "epoch": 0.5248251489014656, "grad_norm": 3.816403970613758, "learning_rate": 9.681443700259631e-08, "loss": 0.6736, "step": 5309 }, { "epoch": 0.52492400464622, "grad_norm": 4.069107452649294, "learning_rate": 9.67824322919873e-08, "loss": 0.7402, "step": 5310 }, { "epoch": 0.5250228603909745, "grad_norm": 3.56526041250723, "learning_rate": 9.675042791129236e-08, "loss": 0.7701, "step": 5311 }, { "epoch": 0.5251217161357289, "grad_norm": 3.3363081306379314, "learning_rate": 9.671842386379309e-08, "loss": 0.659, "step": 5312 }, { "epoch": 0.5252205718804834, "grad_norm": 7.488569722620966, "learning_rate": 9.668642015277109e-08, "loss": 0.7315, "step": 5313 }, { "epoch": 0.5253194276252379, "grad_norm": 3.6500107143571987, "learning_rate": 9.665441678150778e-08, "loss": 0.737, "step": 5314 }, { "epoch": 0.5254182833699923, "grad_norm": 3.056116706714036, "learning_rate": 9.662241375328467e-08, "loss": 0.6721, "step": 5315 }, { "epoch": 0.5255171391147468, "grad_norm": 3.5935793310225765, "learning_rate": 9.659041107138325e-08, "loss": 0.6698, "step": 5316 }, { "epoch": 0.5256159948595013, "grad_norm": 3.279032468034479, "learning_rate": 9.655840873908485e-08, "loss": 0.7844, "step": 5317 }, { "epoch": 0.5257148506042557, "grad_norm": 2.932466634038758, "learning_rate": 9.652640675967089e-08, "loss": 0.786, "step": 5318 }, { "epoch": 0.5258137063490103, "grad_norm": 4.398059859207706, "learning_rate": 9.649440513642268e-08, "loss": 0.6293, "step": 5319 }, { "epoch": 0.5259125620937647, "grad_norm": 4.238561609352924, "learning_rate": 9.64624038726215e-08, "loss": 0.6738, "step": 5320 }, { "epoch": 0.5260114178385191, "grad_norm": 2.6896932666634266, "learning_rate": 9.643040297154864e-08, "loss": 0.7658, "step": 5321 }, { "epoch": 0.5261102735832736, "grad_norm": 2.9443598200672048, "learning_rate": 9.639840243648527e-08, "loss": 0.6289, "step": 5322 }, { "epoch": 0.5262091293280281, "grad_norm": 2.8673342998474127, "learning_rate": 9.636640227071257e-08, "loss": 0.7656, "step": 5323 }, { "epoch": 0.5263079850727825, "grad_norm": 4.451367431285134, "learning_rate": 9.633440247751179e-08, "loss": 0.6611, "step": 5324 }, { "epoch": 0.526406840817537, "grad_norm": 6.502169529009352, "learning_rate": 9.630240306016392e-08, "loss": 0.6667, "step": 5325 }, { "epoch": 0.5265056965622915, "grad_norm": 2.971161455676223, "learning_rate": 9.627040402195008e-08, "loss": 0.645, "step": 5326 }, { "epoch": 0.526604552307046, "grad_norm": 13.147015131273006, "learning_rate": 9.62384053661513e-08, "loss": 0.747, "step": 5327 }, { "epoch": 0.5267034080518004, "grad_norm": 6.820267516720407, "learning_rate": 9.620640709604858e-08, "loss": 0.7424, "step": 5328 }, { "epoch": 0.5268022637965549, "grad_norm": 3.6567919476206927, "learning_rate": 9.617440921492282e-08, "loss": 0.7003, "step": 5329 }, { "epoch": 0.5269011195413094, "grad_norm": 3.6609980853752484, "learning_rate": 9.6142411726055e-08, "loss": 0.7127, "step": 5330 }, { "epoch": 0.5269999752860638, "grad_norm": 9.665047500114843, "learning_rate": 9.611041463272591e-08, "loss": 0.6106, "step": 5331 }, { "epoch": 0.5270988310308182, "grad_norm": 3.6079352534226716, "learning_rate": 9.607841793821647e-08, "loss": 0.6217, "step": 5332 }, { "epoch": 0.5271976867755728, "grad_norm": 6.495209991188831, "learning_rate": 9.604642164580738e-08, "loss": 0.7479, "step": 5333 }, { "epoch": 0.5272965425203272, "grad_norm": 4.488740246574695, "learning_rate": 9.601442575877943e-08, "loss": 0.7018, "step": 5334 }, { "epoch": 0.5273953982650816, "grad_norm": 3.192376243232391, "learning_rate": 9.598243028041337e-08, "loss": 0.6869, "step": 5335 }, { "epoch": 0.5274942540098362, "grad_norm": 3.87773430843339, "learning_rate": 9.595043521398978e-08, "loss": 0.6924, "step": 5336 }, { "epoch": 0.5275931097545906, "grad_norm": 3.523983480639423, "learning_rate": 9.591844056278934e-08, "loss": 0.6664, "step": 5337 }, { "epoch": 0.527691965499345, "grad_norm": 4.091210219873373, "learning_rate": 9.588644633009265e-08, "loss": 0.7058, "step": 5338 }, { "epoch": 0.5277908212440996, "grad_norm": 5.088197778296859, "learning_rate": 9.585445251918015e-08, "loss": 0.6311, "step": 5339 }, { "epoch": 0.527889676988854, "grad_norm": 5.5225793835332215, "learning_rate": 9.58224591333324e-08, "loss": 0.7114, "step": 5340 }, { "epoch": 0.5279885327336085, "grad_norm": 3.396246553776375, "learning_rate": 9.579046617582989e-08, "loss": 0.7227, "step": 5341 }, { "epoch": 0.5280873884783629, "grad_norm": 3.400920482041242, "learning_rate": 9.575847364995293e-08, "loss": 0.7585, "step": 5342 }, { "epoch": 0.5281862442231174, "grad_norm": 3.828138069714206, "learning_rate": 9.572648155898198e-08, "loss": 0.7595, "step": 5343 }, { "epoch": 0.5282850999678719, "grad_norm": 3.490815359043999, "learning_rate": 9.569448990619726e-08, "loss": 0.7322, "step": 5344 }, { "epoch": 0.5283839557126263, "grad_norm": 8.799359519922534, "learning_rate": 9.56624986948791e-08, "loss": 0.7737, "step": 5345 }, { "epoch": 0.5284828114573809, "grad_norm": 2.683491622840961, "learning_rate": 9.563050792830772e-08, "loss": 0.6959, "step": 5346 }, { "epoch": 0.5285816672021353, "grad_norm": 5.416065061226504, "learning_rate": 9.559851760976331e-08, "loss": 0.672, "step": 5347 }, { "epoch": 0.5286805229468897, "grad_norm": 3.9718374698671264, "learning_rate": 9.556652774252596e-08, "loss": 0.7409, "step": 5348 }, { "epoch": 0.5287793786916443, "grad_norm": 2.9909945049759994, "learning_rate": 9.553453832987584e-08, "loss": 0.7111, "step": 5349 }, { "epoch": 0.5288782344363987, "grad_norm": 3.9146887178079526, "learning_rate": 9.55025493750929e-08, "loss": 0.7451, "step": 5350 }, { "epoch": 0.5289770901811531, "grad_norm": 3.8022655853677447, "learning_rate": 9.547056088145717e-08, "loss": 0.6056, "step": 5351 }, { "epoch": 0.5290759459259076, "grad_norm": 12.85163268573417, "learning_rate": 9.543857285224867e-08, "loss": 0.6606, "step": 5352 }, { "epoch": 0.5291748016706621, "grad_norm": 3.3205172076614917, "learning_rate": 9.540658529074717e-08, "loss": 0.7463, "step": 5353 }, { "epoch": 0.5292736574154165, "grad_norm": 3.806466081490698, "learning_rate": 9.537459820023266e-08, "loss": 0.7112, "step": 5354 }, { "epoch": 0.529372513160171, "grad_norm": 3.629168844336948, "learning_rate": 9.534261158398484e-08, "loss": 0.7028, "step": 5355 }, { "epoch": 0.5294713689049255, "grad_norm": 10.64505156612048, "learning_rate": 9.531062544528349e-08, "loss": 0.6611, "step": 5356 }, { "epoch": 0.52957022464968, "grad_norm": 3.9796107803835605, "learning_rate": 9.527863978740836e-08, "loss": 0.6597, "step": 5357 }, { "epoch": 0.5296690803944344, "grad_norm": 7.0406201582893075, "learning_rate": 9.524665461363904e-08, "loss": 0.6836, "step": 5358 }, { "epoch": 0.5297679361391889, "grad_norm": 4.166047574829191, "learning_rate": 9.521466992725516e-08, "loss": 0.6966, "step": 5359 }, { "epoch": 0.5298667918839434, "grad_norm": 3.5387040508142804, "learning_rate": 9.518268573153633e-08, "loss": 0.6876, "step": 5360 }, { "epoch": 0.5299656476286978, "grad_norm": 4.384376731333225, "learning_rate": 9.515070202976199e-08, "loss": 0.9271, "step": 5361 }, { "epoch": 0.5300645033734523, "grad_norm": 6.250510697432791, "learning_rate": 9.511871882521165e-08, "loss": 0.717, "step": 5362 }, { "epoch": 0.5301633591182068, "grad_norm": 4.170966517180185, "learning_rate": 9.508673612116468e-08, "loss": 0.6382, "step": 5363 }, { "epoch": 0.5302622148629612, "grad_norm": 4.51495853240118, "learning_rate": 9.505475392090044e-08, "loss": 0.5835, "step": 5364 }, { "epoch": 0.5303610706077156, "grad_norm": 6.691991298913472, "learning_rate": 9.502277222769827e-08, "loss": 0.7499, "step": 5365 }, { "epoch": 0.5304599263524702, "grad_norm": 13.227388407564307, "learning_rate": 9.499079104483733e-08, "loss": 0.6764, "step": 5366 }, { "epoch": 0.5305587820972246, "grad_norm": 3.166354705811398, "learning_rate": 9.49588103755969e-08, "loss": 0.7161, "step": 5367 }, { "epoch": 0.5306576378419791, "grad_norm": 4.999643311901486, "learning_rate": 9.492683022325613e-08, "loss": 0.723, "step": 5368 }, { "epoch": 0.5307564935867336, "grad_norm": 3.3255525415319993, "learning_rate": 9.489485059109406e-08, "loss": 0.7341, "step": 5369 }, { "epoch": 0.530855349331488, "grad_norm": 6.821401561891237, "learning_rate": 9.486287148238974e-08, "loss": 0.724, "step": 5370 }, { "epoch": 0.5309542050762425, "grad_norm": 3.877730353188762, "learning_rate": 9.48308929004222e-08, "loss": 0.787, "step": 5371 }, { "epoch": 0.531053060820997, "grad_norm": 5.531131715160006, "learning_rate": 9.479891484847036e-08, "loss": 0.6524, "step": 5372 }, { "epoch": 0.5311519165657514, "grad_norm": 5.4062351054504, "learning_rate": 9.476693732981303e-08, "loss": 0.7314, "step": 5373 }, { "epoch": 0.5312507723105059, "grad_norm": 6.544957477039126, "learning_rate": 9.473496034772915e-08, "loss": 0.6593, "step": 5374 }, { "epoch": 0.5313496280552603, "grad_norm": 4.549033763162557, "learning_rate": 9.470298390549736e-08, "loss": 0.7478, "step": 5375 }, { "epoch": 0.5314484838000149, "grad_norm": 3.276611138826865, "learning_rate": 9.467100800639648e-08, "loss": 0.6976, "step": 5376 }, { "epoch": 0.5315473395447693, "grad_norm": 24.242688782387553, "learning_rate": 9.463903265370508e-08, "loss": 0.6755, "step": 5377 }, { "epoch": 0.5316461952895237, "grad_norm": 5.967936369543118, "learning_rate": 9.46070578507018e-08, "loss": 0.7069, "step": 5378 }, { "epoch": 0.5317450510342783, "grad_norm": 4.629084668988543, "learning_rate": 9.457508360066523e-08, "loss": 0.7378, "step": 5379 }, { "epoch": 0.5318439067790327, "grad_norm": 4.667877970336129, "learning_rate": 9.454310990687376e-08, "loss": 0.7371, "step": 5380 }, { "epoch": 0.5319427625237871, "grad_norm": 5.647855461205118, "learning_rate": 9.451113677260592e-08, "loss": 0.6785, "step": 5381 }, { "epoch": 0.5320416182685417, "grad_norm": 4.824479466797406, "learning_rate": 9.447916420114004e-08, "loss": 0.8157, "step": 5382 }, { "epoch": 0.5321404740132961, "grad_norm": 3.4861476964037412, "learning_rate": 9.44471921957544e-08, "loss": 0.7541, "step": 5383 }, { "epoch": 0.5322393297580506, "grad_norm": 3.820436937681865, "learning_rate": 9.441522075972728e-08, "loss": 0.7067, "step": 5384 }, { "epoch": 0.532338185502805, "grad_norm": 2.8852400013278325, "learning_rate": 9.438324989633693e-08, "loss": 0.8021, "step": 5385 }, { "epoch": 0.5324370412475595, "grad_norm": 4.405682455622722, "learning_rate": 9.435127960886142e-08, "loss": 0.791, "step": 5386 }, { "epoch": 0.532535896992314, "grad_norm": 3.755991563757569, "learning_rate": 9.43193099005789e-08, "loss": 0.7159, "step": 5387 }, { "epoch": 0.5326347527370684, "grad_norm": 44.06448313599135, "learning_rate": 9.428734077476732e-08, "loss": 0.7759, "step": 5388 }, { "epoch": 0.5327336084818229, "grad_norm": 6.108174380713787, "learning_rate": 9.425537223470468e-08, "loss": 0.6942, "step": 5389 }, { "epoch": 0.5328324642265774, "grad_norm": 4.336660140179777, "learning_rate": 9.422340428366889e-08, "loss": 0.7215, "step": 5390 }, { "epoch": 0.5329313199713318, "grad_norm": 122.95643173275614, "learning_rate": 9.419143692493779e-08, "loss": 0.7261, "step": 5391 }, { "epoch": 0.5330301757160864, "grad_norm": 3.477675484616408, "learning_rate": 9.415947016178912e-08, "loss": 0.7731, "step": 5392 }, { "epoch": 0.5331290314608408, "grad_norm": 6.479670018244905, "learning_rate": 9.412750399750065e-08, "loss": 0.7065, "step": 5393 }, { "epoch": 0.5332278872055952, "grad_norm": 5.24269137261282, "learning_rate": 9.409553843535e-08, "loss": 0.752, "step": 5394 }, { "epoch": 0.5333267429503497, "grad_norm": 18.366372080502995, "learning_rate": 9.406357347861479e-08, "loss": 0.7551, "step": 5395 }, { "epoch": 0.5334255986951042, "grad_norm": 10.86944982050237, "learning_rate": 9.403160913057259e-08, "loss": 0.7366, "step": 5396 }, { "epoch": 0.5335244544398586, "grad_norm": 3.221393003541163, "learning_rate": 9.399964539450078e-08, "loss": 0.7223, "step": 5397 }, { "epoch": 0.5336233101846131, "grad_norm": 3.7905873330595337, "learning_rate": 9.396768227367688e-08, "loss": 0.8052, "step": 5398 }, { "epoch": 0.5337221659293676, "grad_norm": 3.5299465325264157, "learning_rate": 9.393571977137815e-08, "loss": 0.6819, "step": 5399 }, { "epoch": 0.533821021674122, "grad_norm": 3.045356334802583, "learning_rate": 9.39037578908819e-08, "loss": 0.7397, "step": 5400 }, { "epoch": 0.5339198774188765, "grad_norm": 4.1317070515040974, "learning_rate": 9.387179663546537e-08, "loss": 0.6599, "step": 5401 }, { "epoch": 0.534018733163631, "grad_norm": 3.635136931127015, "learning_rate": 9.383983600840568e-08, "loss": 0.7313, "step": 5402 }, { "epoch": 0.5341175889083855, "grad_norm": 2.9470844160740346, "learning_rate": 9.380787601297991e-08, "loss": 0.7115, "step": 5403 }, { "epoch": 0.5342164446531399, "grad_norm": 6.4388871085024135, "learning_rate": 9.377591665246517e-08, "loss": 0.719, "step": 5404 }, { "epoch": 0.5343153003978943, "grad_norm": 3.7349838835342273, "learning_rate": 9.374395793013833e-08, "loss": 0.6901, "step": 5405 }, { "epoch": 0.5344141561426489, "grad_norm": 3.465854549560405, "learning_rate": 9.371199984927634e-08, "loss": 0.6989, "step": 5406 }, { "epoch": 0.5345130118874033, "grad_norm": 6.287125722855138, "learning_rate": 9.3680042413156e-08, "loss": 0.6392, "step": 5407 }, { "epoch": 0.5346118676321577, "grad_norm": 3.799814547584937, "learning_rate": 9.364808562505409e-08, "loss": 0.8046, "step": 5408 }, { "epoch": 0.5347107233769123, "grad_norm": 7.259538871202132, "learning_rate": 9.361612948824733e-08, "loss": 0.7523, "step": 5409 }, { "epoch": 0.5348095791216667, "grad_norm": 3.4053315275678164, "learning_rate": 9.358417400601227e-08, "loss": 0.8123, "step": 5410 }, { "epoch": 0.5349084348664211, "grad_norm": 3.9450130142257303, "learning_rate": 9.355221918162552e-08, "loss": 0.7188, "step": 5411 }, { "epoch": 0.5350072906111757, "grad_norm": 5.60294983752288, "learning_rate": 9.352026501836364e-08, "loss": 0.7964, "step": 5412 }, { "epoch": 0.5351061463559301, "grad_norm": 9.129685295911456, "learning_rate": 9.348831151950296e-08, "loss": 0.6908, "step": 5413 }, { "epoch": 0.5352050021006846, "grad_norm": 3.461508842941914, "learning_rate": 9.345635868831987e-08, "loss": 0.6712, "step": 5414 }, { "epoch": 0.535303857845439, "grad_norm": 11.427661546302343, "learning_rate": 9.342440652809072e-08, "loss": 0.7372, "step": 5415 }, { "epoch": 0.5354027135901935, "grad_norm": 6.558547358740465, "learning_rate": 9.339245504209165e-08, "loss": 0.7489, "step": 5416 }, { "epoch": 0.535501569334948, "grad_norm": 3.4982618005043267, "learning_rate": 9.336050423359882e-08, "loss": 0.6131, "step": 5417 }, { "epoch": 0.5356004250797024, "grad_norm": 4.141737327688224, "learning_rate": 9.332855410588841e-08, "loss": 0.6798, "step": 5418 }, { "epoch": 0.535699280824457, "grad_norm": 15.541883029369082, "learning_rate": 9.329660466223631e-08, "loss": 0.6631, "step": 5419 }, { "epoch": 0.5357981365692114, "grad_norm": 2.7629508110540577, "learning_rate": 9.326465590591856e-08, "loss": 0.6797, "step": 5420 }, { "epoch": 0.5358969923139658, "grad_norm": 6.015384766635216, "learning_rate": 9.323270784021095e-08, "loss": 0.6836, "step": 5421 }, { "epoch": 0.5359958480587204, "grad_norm": 3.77724641498081, "learning_rate": 9.320076046838933e-08, "loss": 0.6129, "step": 5422 }, { "epoch": 0.5360947038034748, "grad_norm": 3.92501865516041, "learning_rate": 9.316881379372947e-08, "loss": 0.6538, "step": 5423 }, { "epoch": 0.5361935595482292, "grad_norm": 24.14731406779913, "learning_rate": 9.313686781950695e-08, "loss": 0.8144, "step": 5424 }, { "epoch": 0.5362924152929837, "grad_norm": 4.640915148908676, "learning_rate": 9.310492254899741e-08, "loss": 0.7327, "step": 5425 }, { "epoch": 0.5363912710377382, "grad_norm": 3.7917096698758592, "learning_rate": 9.307297798547639e-08, "loss": 0.6747, "step": 5426 }, { "epoch": 0.5364901267824926, "grad_norm": 7.3582079381203025, "learning_rate": 9.304103413221923e-08, "loss": 0.7534, "step": 5427 }, { "epoch": 0.5365889825272471, "grad_norm": 5.352069519658265, "learning_rate": 9.300909099250138e-08, "loss": 0.6636, "step": 5428 }, { "epoch": 0.5366878382720016, "grad_norm": 3.4019803496724914, "learning_rate": 9.297714856959818e-08, "loss": 0.619, "step": 5429 }, { "epoch": 0.536786694016756, "grad_norm": 6.646057306170499, "learning_rate": 9.294520686678475e-08, "loss": 0.7193, "step": 5430 }, { "epoch": 0.5368855497615105, "grad_norm": 4.84817283366918, "learning_rate": 9.291326588733632e-08, "loss": 0.6845, "step": 5431 }, { "epoch": 0.536984405506265, "grad_norm": 4.195807621039848, "learning_rate": 9.28813256345279e-08, "loss": 0.6944, "step": 5432 }, { "epoch": 0.5370832612510195, "grad_norm": 3.7680529711326063, "learning_rate": 9.284938611163456e-08, "loss": 0.8095, "step": 5433 }, { "epoch": 0.5371821169957739, "grad_norm": 3.0677126365669762, "learning_rate": 9.281744732193119e-08, "loss": 0.8169, "step": 5434 }, { "epoch": 0.5372809727405283, "grad_norm": 4.979035118053513, "learning_rate": 9.278550926869264e-08, "loss": 0.7716, "step": 5435 }, { "epoch": 0.5373798284852829, "grad_norm": 4.3320888027646145, "learning_rate": 9.275357195519367e-08, "loss": 0.7856, "step": 5436 }, { "epoch": 0.5374786842300373, "grad_norm": 2.906186518219438, "learning_rate": 9.272163538470905e-08, "loss": 0.7361, "step": 5437 }, { "epoch": 0.5375775399747917, "grad_norm": 5.425923335394132, "learning_rate": 9.268969956051332e-08, "loss": 0.7046, "step": 5438 }, { "epoch": 0.5376763957195463, "grad_norm": 3.7786532735895935, "learning_rate": 9.265776448588107e-08, "loss": 0.6129, "step": 5439 }, { "epoch": 0.5377752514643007, "grad_norm": 6.146759373051676, "learning_rate": 9.26258301640868e-08, "loss": 0.5892, "step": 5440 }, { "epoch": 0.5378741072090552, "grad_norm": 6.016780851816374, "learning_rate": 9.259389659840483e-08, "loss": 0.7555, "step": 5441 }, { "epoch": 0.5379729629538097, "grad_norm": 4.369671957464884, "learning_rate": 9.256196379210956e-08, "loss": 0.6824, "step": 5442 }, { "epoch": 0.5380718186985641, "grad_norm": 4.180072298792183, "learning_rate": 9.253003174847517e-08, "loss": 0.7196, "step": 5443 }, { "epoch": 0.5381706744433186, "grad_norm": 3.172646302371047, "learning_rate": 9.24981004707758e-08, "loss": 0.7795, "step": 5444 }, { "epoch": 0.5382695301880731, "grad_norm": 3.8121429345468942, "learning_rate": 9.246616996228562e-08, "loss": 0.7335, "step": 5445 }, { "epoch": 0.5383683859328275, "grad_norm": 5.814330052843449, "learning_rate": 9.243424022627853e-08, "loss": 0.682, "step": 5446 }, { "epoch": 0.538467241677582, "grad_norm": 3.135967788104405, "learning_rate": 9.240231126602848e-08, "loss": 0.7809, "step": 5447 }, { "epoch": 0.5385660974223364, "grad_norm": 3.831745689131576, "learning_rate": 9.237038308480941e-08, "loss": 0.6559, "step": 5448 }, { "epoch": 0.538664953167091, "grad_norm": 3.379669930861098, "learning_rate": 9.233845568589496e-08, "loss": 0.6808, "step": 5449 }, { "epoch": 0.5387638089118454, "grad_norm": 5.838964225784685, "learning_rate": 9.230652907255889e-08, "loss": 0.7132, "step": 5450 }, { "epoch": 0.5388626646565998, "grad_norm": 10.812413360891448, "learning_rate": 9.227460324807476e-08, "loss": 0.713, "step": 5451 }, { "epoch": 0.5389615204013544, "grad_norm": 4.4399873177177165, "learning_rate": 9.224267821571611e-08, "loss": 0.6447, "step": 5452 }, { "epoch": 0.5390603761461088, "grad_norm": 10.238154447067211, "learning_rate": 9.221075397875641e-08, "loss": 0.8021, "step": 5453 }, { "epoch": 0.5391592318908632, "grad_norm": 11.71938049989512, "learning_rate": 9.217883054046896e-08, "loss": 0.6848, "step": 5454 }, { "epoch": 0.5392580876356178, "grad_norm": 3.4484475280824403, "learning_rate": 9.214690790412706e-08, "loss": 0.6501, "step": 5455 }, { "epoch": 0.5393569433803722, "grad_norm": 3.4700852686455685, "learning_rate": 9.211498607300395e-08, "loss": 0.7701, "step": 5456 }, { "epoch": 0.5394557991251266, "grad_norm": 3.877856330953515, "learning_rate": 9.208306505037267e-08, "loss": 0.6394, "step": 5457 }, { "epoch": 0.5395546548698811, "grad_norm": 3.8812918534125274, "learning_rate": 9.205114483950627e-08, "loss": 0.7444, "step": 5458 }, { "epoch": 0.5396535106146356, "grad_norm": 4.722495655866669, "learning_rate": 9.201922544367777e-08, "loss": 0.6711, "step": 5459 }, { "epoch": 0.5397523663593901, "grad_norm": 5.468362658950029, "learning_rate": 9.198730686615996e-08, "loss": 0.6901, "step": 5460 }, { "epoch": 0.5398512221041445, "grad_norm": 4.140087006798514, "learning_rate": 9.195538911022559e-08, "loss": 0.71, "step": 5461 }, { "epoch": 0.539950077848899, "grad_norm": 4.9502626964559395, "learning_rate": 9.192347217914747e-08, "loss": 0.7086, "step": 5462 }, { "epoch": 0.5400489335936535, "grad_norm": 28.13863413661896, "learning_rate": 9.189155607619807e-08, "loss": 0.6439, "step": 5463 }, { "epoch": 0.5401477893384079, "grad_norm": 3.5687479489289764, "learning_rate": 9.185964080465005e-08, "loss": 0.6022, "step": 5464 }, { "epoch": 0.5402466450831624, "grad_norm": 4.469302045759572, "learning_rate": 9.182772636777574e-08, "loss": 0.8433, "step": 5465 }, { "epoch": 0.5403455008279169, "grad_norm": 3.6061423738737255, "learning_rate": 9.179581276884753e-08, "loss": 0.6161, "step": 5466 }, { "epoch": 0.5404443565726713, "grad_norm": 4.217293793177038, "learning_rate": 9.176390001113776e-08, "loss": 0.7618, "step": 5467 }, { "epoch": 0.5405432123174257, "grad_norm": 3.4447405742853854, "learning_rate": 9.173198809791852e-08, "loss": 0.6941, "step": 5468 }, { "epoch": 0.5406420680621803, "grad_norm": 3.984047793844125, "learning_rate": 9.170007703246194e-08, "loss": 0.7831, "step": 5469 }, { "epoch": 0.5407409238069347, "grad_norm": 3.625239199551596, "learning_rate": 9.166816681804005e-08, "loss": 0.6621, "step": 5470 }, { "epoch": 0.5408397795516892, "grad_norm": 3.9580308841676186, "learning_rate": 9.16362574579247e-08, "loss": 0.7031, "step": 5471 }, { "epoch": 0.5409386352964437, "grad_norm": 77.87778527285352, "learning_rate": 9.16043489553878e-08, "loss": 0.6442, "step": 5472 }, { "epoch": 0.5410374910411981, "grad_norm": 6.147148953454198, "learning_rate": 9.157244131370109e-08, "loss": 0.7195, "step": 5473 }, { "epoch": 0.5411363467859526, "grad_norm": 3.387613009112442, "learning_rate": 9.154053453613618e-08, "loss": 0.6763, "step": 5474 }, { "epoch": 0.5412352025307071, "grad_norm": 10.765708384847265, "learning_rate": 9.15086286259647e-08, "loss": 0.7472, "step": 5475 }, { "epoch": 0.5413340582754615, "grad_norm": 11.90628827032786, "learning_rate": 9.147672358645805e-08, "loss": 0.5985, "step": 5476 }, { "epoch": 0.541432914020216, "grad_norm": 17.701121105499567, "learning_rate": 9.14448194208877e-08, "loss": 0.614, "step": 5477 }, { "epoch": 0.5415317697649704, "grad_norm": 5.5408404872502866, "learning_rate": 9.141291613252492e-08, "loss": 0.6497, "step": 5478 }, { "epoch": 0.541630625509725, "grad_norm": 6.456424132930539, "learning_rate": 9.138101372464092e-08, "loss": 0.7164, "step": 5479 }, { "epoch": 0.5417294812544794, "grad_norm": 2.9932764938827376, "learning_rate": 9.13491122005068e-08, "loss": 0.7791, "step": 5480 }, { "epoch": 0.5418283369992338, "grad_norm": 3.4287656798764634, "learning_rate": 9.131721156339366e-08, "loss": 0.6824, "step": 5481 }, { "epoch": 0.5419271927439884, "grad_norm": 3.7522588328017665, "learning_rate": 9.128531181657235e-08, "loss": 0.8041, "step": 5482 }, { "epoch": 0.5420260484887428, "grad_norm": 3.785335328560544, "learning_rate": 9.125341296331376e-08, "loss": 0.7039, "step": 5483 }, { "epoch": 0.5421249042334972, "grad_norm": 3.3990959331531987, "learning_rate": 9.12215150068887e-08, "loss": 0.6522, "step": 5484 }, { "epoch": 0.5422237599782518, "grad_norm": 3.561372601686652, "learning_rate": 9.118961795056773e-08, "loss": 0.6945, "step": 5485 }, { "epoch": 0.5423226157230062, "grad_norm": 4.370453426565204, "learning_rate": 9.115772179762151e-08, "loss": 0.6528, "step": 5486 }, { "epoch": 0.5424214714677607, "grad_norm": 4.1211165041085085, "learning_rate": 9.11258265513205e-08, "loss": 0.7766, "step": 5487 }, { "epoch": 0.5425203272125151, "grad_norm": 4.395433877439904, "learning_rate": 9.109393221493505e-08, "loss": 0.6875, "step": 5488 }, { "epoch": 0.5426191829572696, "grad_norm": 12.252475542161267, "learning_rate": 9.106203879173549e-08, "loss": 0.7579, "step": 5489 }, { "epoch": 0.5427180387020241, "grad_norm": 4.291697436872045, "learning_rate": 9.103014628499198e-08, "loss": 0.5939, "step": 5490 }, { "epoch": 0.5428168944467785, "grad_norm": 3.929743092804351, "learning_rate": 9.099825469797466e-08, "loss": 0.7201, "step": 5491 }, { "epoch": 0.542915750191533, "grad_norm": 6.218055650145727, "learning_rate": 9.096636403395357e-08, "loss": 0.5193, "step": 5492 }, { "epoch": 0.5430146059362875, "grad_norm": 11.65905241281121, "learning_rate": 9.093447429619857e-08, "loss": 0.7929, "step": 5493 }, { "epoch": 0.5431134616810419, "grad_norm": 2.9171501520951364, "learning_rate": 9.090258548797949e-08, "loss": 0.793, "step": 5494 }, { "epoch": 0.5432123174257965, "grad_norm": 4.024531347082412, "learning_rate": 9.08706976125661e-08, "loss": 0.6122, "step": 5495 }, { "epoch": 0.5433111731705509, "grad_norm": 4.500235280890116, "learning_rate": 9.083881067322799e-08, "loss": 0.7868, "step": 5496 }, { "epoch": 0.5434100289153053, "grad_norm": 3.451843386523784, "learning_rate": 9.080692467323468e-08, "loss": 0.7211, "step": 5497 }, { "epoch": 0.5435088846600598, "grad_norm": 4.258854199488862, "learning_rate": 9.07750396158557e-08, "loss": 0.6672, "step": 5498 }, { "epoch": 0.5436077404048143, "grad_norm": 6.527749091116677, "learning_rate": 9.074315550436028e-08, "loss": 0.6746, "step": 5499 }, { "epoch": 0.5437065961495687, "grad_norm": 5.213404965435421, "learning_rate": 9.071127234201775e-08, "loss": 0.7045, "step": 5500 }, { "epoch": 0.5438054518943232, "grad_norm": 3.3700677049826706, "learning_rate": 9.067939013209718e-08, "loss": 0.5861, "step": 5501 }, { "epoch": 0.5439043076390777, "grad_norm": 4.0810181874671745, "learning_rate": 9.064750887786767e-08, "loss": 0.5966, "step": 5502 }, { "epoch": 0.5440031633838321, "grad_norm": 6.726793792058383, "learning_rate": 9.061562858259822e-08, "loss": 0.6797, "step": 5503 }, { "epoch": 0.5441020191285866, "grad_norm": 3.717451241545791, "learning_rate": 9.05837492495576e-08, "loss": 0.8343, "step": 5504 }, { "epoch": 0.5442008748733411, "grad_norm": 4.697313225424097, "learning_rate": 9.055187088201456e-08, "loss": 0.7736, "step": 5505 }, { "epoch": 0.5442997306180956, "grad_norm": 4.770680363697563, "learning_rate": 9.051999348323785e-08, "loss": 0.7161, "step": 5506 }, { "epoch": 0.54439858636285, "grad_norm": 5.429225776648173, "learning_rate": 9.048811705649592e-08, "loss": 0.6712, "step": 5507 }, { "epoch": 0.5444974421076044, "grad_norm": 3.2051581223127035, "learning_rate": 9.045624160505727e-08, "loss": 0.6766, "step": 5508 }, { "epoch": 0.544596297852359, "grad_norm": 4.510738233401222, "learning_rate": 9.04243671321903e-08, "loss": 0.6538, "step": 5509 }, { "epoch": 0.5446951535971134, "grad_norm": 6.197542378045775, "learning_rate": 9.03924936411632e-08, "loss": 0.6705, "step": 5510 }, { "epoch": 0.5447940093418678, "grad_norm": 3.7752484665053005, "learning_rate": 9.036062113524421e-08, "loss": 0.6498, "step": 5511 }, { "epoch": 0.5448928650866224, "grad_norm": 4.497341250314914, "learning_rate": 9.032874961770127e-08, "loss": 0.6511, "step": 5512 }, { "epoch": 0.5449917208313768, "grad_norm": 4.064564426114251, "learning_rate": 9.029687909180243e-08, "loss": 0.699, "step": 5513 }, { "epoch": 0.5450905765761312, "grad_norm": 4.754041374336784, "learning_rate": 9.026500956081553e-08, "loss": 0.6613, "step": 5514 }, { "epoch": 0.5451894323208858, "grad_norm": 3.489472220578984, "learning_rate": 9.023314102800824e-08, "loss": 0.6853, "step": 5515 }, { "epoch": 0.5452882880656402, "grad_norm": 7.891031451872883, "learning_rate": 9.020127349664827e-08, "loss": 0.6302, "step": 5516 }, { "epoch": 0.5453871438103947, "grad_norm": 7.625379786793269, "learning_rate": 9.01694069700032e-08, "loss": 0.7476, "step": 5517 }, { "epoch": 0.5454859995551492, "grad_norm": 3.107728661132936, "learning_rate": 9.01375414513404e-08, "loss": 0.6054, "step": 5518 }, { "epoch": 0.5455848552999036, "grad_norm": 3.562197078015887, "learning_rate": 9.010567694392725e-08, "loss": 0.7907, "step": 5519 }, { "epoch": 0.5456837110446581, "grad_norm": 5.872470714060215, "learning_rate": 9.007381345103102e-08, "loss": 0.7386, "step": 5520 }, { "epoch": 0.5457825667894125, "grad_norm": 3.7441498914686613, "learning_rate": 9.004195097591876e-08, "loss": 0.8025, "step": 5521 }, { "epoch": 0.545881422534167, "grad_norm": 6.930792501001246, "learning_rate": 9.001008952185754e-08, "loss": 0.7006, "step": 5522 }, { "epoch": 0.5459802782789215, "grad_norm": 4.022299129630744, "learning_rate": 8.997822909211429e-08, "loss": 0.6925, "step": 5523 }, { "epoch": 0.5460791340236759, "grad_norm": 3.5553617555203125, "learning_rate": 8.994636968995579e-08, "loss": 0.7307, "step": 5524 }, { "epoch": 0.5461779897684305, "grad_norm": 4.561391687189453, "learning_rate": 8.991451131864883e-08, "loss": 0.801, "step": 5525 }, { "epoch": 0.5462768455131849, "grad_norm": 3.364673516426641, "learning_rate": 8.98826539814599e-08, "loss": 0.6042, "step": 5526 }, { "epoch": 0.5463757012579393, "grad_norm": 10.16617145769568, "learning_rate": 8.985079768165557e-08, "loss": 0.7708, "step": 5527 }, { "epoch": 0.5464745570026939, "grad_norm": 4.269358608322347, "learning_rate": 8.981894242250226e-08, "loss": 0.7701, "step": 5528 }, { "epoch": 0.5465734127474483, "grad_norm": 4.359602381881918, "learning_rate": 8.978708820726619e-08, "loss": 0.6446, "step": 5529 }, { "epoch": 0.5466722684922027, "grad_norm": 10.462931881144845, "learning_rate": 8.97552350392136e-08, "loss": 0.7056, "step": 5530 }, { "epoch": 0.5467711242369572, "grad_norm": 6.047998358609549, "learning_rate": 8.972338292161056e-08, "loss": 0.7066, "step": 5531 }, { "epoch": 0.5468699799817117, "grad_norm": 4.826188245403201, "learning_rate": 8.969153185772295e-08, "loss": 0.7207, "step": 5532 }, { "epoch": 0.5469688357264662, "grad_norm": 4.095565506216744, "learning_rate": 8.965968185081675e-08, "loss": 0.7136, "step": 5533 }, { "epoch": 0.5470676914712206, "grad_norm": 4.02221765645325, "learning_rate": 8.962783290415759e-08, "loss": 0.6305, "step": 5534 }, { "epoch": 0.5471665472159751, "grad_norm": 7.824484349227778, "learning_rate": 8.959598502101117e-08, "loss": 0.7115, "step": 5535 }, { "epoch": 0.5472654029607296, "grad_norm": 4.256196260529801, "learning_rate": 8.956413820464306e-08, "loss": 0.782, "step": 5536 }, { "epoch": 0.547364258705484, "grad_norm": 5.898916078685272, "learning_rate": 8.95322924583186e-08, "loss": 0.6099, "step": 5537 }, { "epoch": 0.5474631144502385, "grad_norm": 3.6753305870723922, "learning_rate": 8.950044778530315e-08, "loss": 0.6328, "step": 5538 }, { "epoch": 0.547561970194993, "grad_norm": 4.015635708726488, "learning_rate": 8.94686041888619e-08, "loss": 0.7294, "step": 5539 }, { "epoch": 0.5476608259397474, "grad_norm": 5.091307193355761, "learning_rate": 8.943676167225994e-08, "loss": 0.6792, "step": 5540 }, { "epoch": 0.5477596816845018, "grad_norm": 4.520374372522441, "learning_rate": 8.940492023876223e-08, "loss": 0.7164, "step": 5541 }, { "epoch": 0.5478585374292564, "grad_norm": 3.320176358254382, "learning_rate": 8.93730798916337e-08, "loss": 0.6684, "step": 5542 }, { "epoch": 0.5479573931740108, "grad_norm": 2.9887273669938574, "learning_rate": 8.934124063413905e-08, "loss": 0.6916, "step": 5543 }, { "epoch": 0.5480562489187653, "grad_norm": 7.2280716962757126, "learning_rate": 8.930940246954297e-08, "loss": 0.689, "step": 5544 }, { "epoch": 0.5481551046635198, "grad_norm": 5.390700883764027, "learning_rate": 8.927756540110993e-08, "loss": 0.7536, "step": 5545 }, { "epoch": 0.5482539604082742, "grad_norm": 3.4384781030555343, "learning_rate": 8.924572943210438e-08, "loss": 0.7548, "step": 5546 }, { "epoch": 0.5483528161530287, "grad_norm": 4.7911688066834675, "learning_rate": 8.92138945657907e-08, "loss": 0.6943, "step": 5547 }, { "epoch": 0.5484516718977832, "grad_norm": 3.883674084984668, "learning_rate": 8.9182060805433e-08, "loss": 0.6815, "step": 5548 }, { "epoch": 0.5485505276425376, "grad_norm": 4.04316114344802, "learning_rate": 8.915022815429536e-08, "loss": 0.6381, "step": 5549 }, { "epoch": 0.5486493833872921, "grad_norm": 3.380245677679543, "learning_rate": 8.911839661564183e-08, "loss": 0.677, "step": 5550 }, { "epoch": 0.5487482391320465, "grad_norm": 9.0081302799177, "learning_rate": 8.908656619273616e-08, "loss": 0.6371, "step": 5551 }, { "epoch": 0.5488470948768011, "grad_norm": 3.678860272685736, "learning_rate": 8.905473688884216e-08, "loss": 0.7552, "step": 5552 }, { "epoch": 0.5489459506215555, "grad_norm": 3.0605748564990107, "learning_rate": 8.902290870722346e-08, "loss": 0.7265, "step": 5553 }, { "epoch": 0.5490448063663099, "grad_norm": 5.365106057964961, "learning_rate": 8.899108165114351e-08, "loss": 0.6148, "step": 5554 }, { "epoch": 0.5491436621110645, "grad_norm": 4.145758757317412, "learning_rate": 8.895925572386579e-08, "loss": 0.7616, "step": 5555 }, { "epoch": 0.5492425178558189, "grad_norm": 4.936720118853012, "learning_rate": 8.892743092865349e-08, "loss": 0.6904, "step": 5556 }, { "epoch": 0.5493413736005733, "grad_norm": 8.430388842468727, "learning_rate": 8.889560726876985e-08, "loss": 0.7691, "step": 5557 }, { "epoch": 0.5494402293453279, "grad_norm": 10.548933633059514, "learning_rate": 8.886378474747789e-08, "loss": 0.5702, "step": 5558 }, { "epoch": 0.5495390850900823, "grad_norm": 5.319103864985203, "learning_rate": 8.883196336804048e-08, "loss": 0.7897, "step": 5559 }, { "epoch": 0.5496379408348367, "grad_norm": 4.704144139448874, "learning_rate": 8.880014313372051e-08, "loss": 0.6232, "step": 5560 }, { "epoch": 0.5497367965795912, "grad_norm": 4.2014624318402225, "learning_rate": 8.876832404778068e-08, "loss": 0.715, "step": 5561 }, { "epoch": 0.5498356523243457, "grad_norm": 4.022355809340892, "learning_rate": 8.87365061134835e-08, "loss": 0.7218, "step": 5562 }, { "epoch": 0.5499345080691002, "grad_norm": 4.022430124720475, "learning_rate": 8.870468933409148e-08, "loss": 0.6734, "step": 5563 }, { "epoch": 0.5500333638138546, "grad_norm": 12.845749564657876, "learning_rate": 8.867287371286698e-08, "loss": 0.7575, "step": 5564 }, { "epoch": 0.5501322195586091, "grad_norm": 4.482773976108219, "learning_rate": 8.864105925307218e-08, "loss": 0.6879, "step": 5565 }, { "epoch": 0.5502310753033636, "grad_norm": 3.3647568782221255, "learning_rate": 8.86092459579692e-08, "loss": 0.7875, "step": 5566 }, { "epoch": 0.550329931048118, "grad_norm": 4.405467519958067, "learning_rate": 8.857743383082002e-08, "loss": 0.6725, "step": 5567 }, { "epoch": 0.5504287867928725, "grad_norm": 3.5587018102698527, "learning_rate": 8.854562287488647e-08, "loss": 0.6429, "step": 5568 }, { "epoch": 0.550527642537627, "grad_norm": 9.293468479140465, "learning_rate": 8.851381309343039e-08, "loss": 0.6273, "step": 5569 }, { "epoch": 0.5506264982823814, "grad_norm": 3.6197614265710714, "learning_rate": 8.84820044897133e-08, "loss": 0.7144, "step": 5570 }, { "epoch": 0.5507253540271358, "grad_norm": 13.700183084632403, "learning_rate": 8.845019706699675e-08, "loss": 0.73, "step": 5571 }, { "epoch": 0.5508242097718904, "grad_norm": 4.024269740711445, "learning_rate": 8.841839082854214e-08, "loss": 0.6517, "step": 5572 }, { "epoch": 0.5509230655166448, "grad_norm": 4.036703423631276, "learning_rate": 8.83865857776107e-08, "loss": 0.734, "step": 5573 }, { "epoch": 0.5510219212613993, "grad_norm": 3.7322799783295735, "learning_rate": 8.835478191746358e-08, "loss": 0.6152, "step": 5574 }, { "epoch": 0.5511207770061538, "grad_norm": 5.093130662247495, "learning_rate": 8.832297925136181e-08, "loss": 0.8138, "step": 5575 }, { "epoch": 0.5512196327509082, "grad_norm": 4.07743722182365, "learning_rate": 8.829117778256624e-08, "loss": 0.6701, "step": 5576 }, { "epoch": 0.5513184884956627, "grad_norm": 3.9983810804591964, "learning_rate": 8.82593775143377e-08, "loss": 0.7474, "step": 5577 }, { "epoch": 0.5514173442404172, "grad_norm": 3.689715046446761, "learning_rate": 8.822757844993677e-08, "loss": 0.7203, "step": 5578 }, { "epoch": 0.5515161999851717, "grad_norm": 16.619696254095807, "learning_rate": 8.819578059262403e-08, "loss": 0.7204, "step": 5579 }, { "epoch": 0.5516150557299261, "grad_norm": 11.475064704466122, "learning_rate": 8.81639839456599e-08, "loss": 0.6753, "step": 5580 }, { "epoch": 0.5517139114746805, "grad_norm": 4.429756993004492, "learning_rate": 8.813218851230459e-08, "loss": 0.7233, "step": 5581 }, { "epoch": 0.5518127672194351, "grad_norm": 11.026386988486845, "learning_rate": 8.810039429581829e-08, "loss": 0.7162, "step": 5582 }, { "epoch": 0.5519116229641895, "grad_norm": 5.766134192222263, "learning_rate": 8.806860129946104e-08, "loss": 0.7425, "step": 5583 }, { "epoch": 0.5520104787089439, "grad_norm": 5.229851152812705, "learning_rate": 8.803680952649271e-08, "loss": 0.6962, "step": 5584 }, { "epoch": 0.5521093344536985, "grad_norm": 3.325035193411012, "learning_rate": 8.800501898017308e-08, "loss": 0.6467, "step": 5585 }, { "epoch": 0.5522081901984529, "grad_norm": 3.5516167228520223, "learning_rate": 8.797322966376184e-08, "loss": 0.7307, "step": 5586 }, { "epoch": 0.5523070459432073, "grad_norm": 4.160764597801965, "learning_rate": 8.794144158051848e-08, "loss": 0.6707, "step": 5587 }, { "epoch": 0.5524059016879619, "grad_norm": 4.0513193892289365, "learning_rate": 8.790965473370243e-08, "loss": 0.6727, "step": 5588 }, { "epoch": 0.5525047574327163, "grad_norm": 5.078600711242933, "learning_rate": 8.787786912657292e-08, "loss": 0.7337, "step": 5589 }, { "epoch": 0.5526036131774708, "grad_norm": 7.488991274585162, "learning_rate": 8.78460847623891e-08, "loss": 0.7325, "step": 5590 }, { "epoch": 0.5527024689222252, "grad_norm": 5.001535437570028, "learning_rate": 8.781430164441004e-08, "loss": 0.6524, "step": 5591 }, { "epoch": 0.5528013246669797, "grad_norm": 4.075585777545015, "learning_rate": 8.77825197758946e-08, "loss": 0.6832, "step": 5592 }, { "epoch": 0.5529001804117342, "grad_norm": 3.915397765113006, "learning_rate": 8.775073916010148e-08, "loss": 0.6551, "step": 5593 }, { "epoch": 0.5529990361564886, "grad_norm": 5.247256671257444, "learning_rate": 8.771895980028944e-08, "loss": 0.8286, "step": 5594 }, { "epoch": 0.5530978919012431, "grad_norm": 2.8912981759923704, "learning_rate": 8.768718169971686e-08, "loss": 0.6929, "step": 5595 }, { "epoch": 0.5531967476459976, "grad_norm": 4.713692562528132, "learning_rate": 8.765540486164217e-08, "loss": 0.7746, "step": 5596 }, { "epoch": 0.553295603390752, "grad_norm": 5.517764174430145, "learning_rate": 8.762362928932366e-08, "loss": 0.7283, "step": 5597 }, { "epoch": 0.5533944591355066, "grad_norm": 3.9426679883651867, "learning_rate": 8.759185498601936e-08, "loss": 0.8107, "step": 5598 }, { "epoch": 0.553493314880261, "grad_norm": 3.339600969238521, "learning_rate": 8.756008195498734e-08, "loss": 0.6555, "step": 5599 }, { "epoch": 0.5535921706250154, "grad_norm": 3.681021496188008, "learning_rate": 8.752831019948538e-08, "loss": 0.7137, "step": 5600 }, { "epoch": 0.55369102636977, "grad_norm": 4.035826165918055, "learning_rate": 8.749653972277126e-08, "loss": 0.6833, "step": 5601 }, { "epoch": 0.5537898821145244, "grad_norm": 2.8805914260434404, "learning_rate": 8.746477052810257e-08, "loss": 0.719, "step": 5602 }, { "epoch": 0.5538887378592788, "grad_norm": 10.526796026594154, "learning_rate": 8.743300261873669e-08, "loss": 0.6611, "step": 5603 }, { "epoch": 0.5539875936040333, "grad_norm": 5.205519395114601, "learning_rate": 8.740123599793104e-08, "loss": 0.7559, "step": 5604 }, { "epoch": 0.5540864493487878, "grad_norm": 4.776950210953872, "learning_rate": 8.736947066894283e-08, "loss": 0.6861, "step": 5605 }, { "epoch": 0.5541853050935422, "grad_norm": 40.714080449565955, "learning_rate": 8.733770663502905e-08, "loss": 0.6564, "step": 5606 }, { "epoch": 0.5542841608382967, "grad_norm": 4.9489954288138325, "learning_rate": 8.730594389944669e-08, "loss": 0.6733, "step": 5607 }, { "epoch": 0.5543830165830512, "grad_norm": 5.882334733277827, "learning_rate": 8.727418246545252e-08, "loss": 0.6653, "step": 5608 }, { "epoch": 0.5544818723278057, "grad_norm": 8.268548555348994, "learning_rate": 8.724242233630325e-08, "loss": 0.7213, "step": 5609 }, { "epoch": 0.5545807280725601, "grad_norm": 3.159330222643178, "learning_rate": 8.721066351525536e-08, "loss": 0.6389, "step": 5610 }, { "epoch": 0.5546795838173146, "grad_norm": 4.09652386713119, "learning_rate": 8.717890600556528e-08, "loss": 0.6606, "step": 5611 }, { "epoch": 0.5547784395620691, "grad_norm": 2.9026272103396407, "learning_rate": 8.714714981048922e-08, "loss": 0.5965, "step": 5612 }, { "epoch": 0.5548772953068235, "grad_norm": 3.2673117426611644, "learning_rate": 8.711539493328342e-08, "loss": 0.6459, "step": 5613 }, { "epoch": 0.5549761510515779, "grad_norm": 4.044456456768989, "learning_rate": 8.708364137720376e-08, "loss": 0.6917, "step": 5614 }, { "epoch": 0.5550750067963325, "grad_norm": 3.8506962865232865, "learning_rate": 8.705188914550616e-08, "loss": 0.6235, "step": 5615 }, { "epoch": 0.5551738625410869, "grad_norm": 3.686930830953817, "learning_rate": 8.702013824144636e-08, "loss": 0.7465, "step": 5616 }, { "epoch": 0.5552727182858413, "grad_norm": 5.660576549895734, "learning_rate": 8.698838866827987e-08, "loss": 0.7145, "step": 5617 }, { "epoch": 0.5553715740305959, "grad_norm": 5.497481043501608, "learning_rate": 8.695664042926222e-08, "loss": 0.6305, "step": 5618 }, { "epoch": 0.5554704297753503, "grad_norm": 2.996520461814899, "learning_rate": 8.692489352764871e-08, "loss": 0.6817, "step": 5619 }, { "epoch": 0.5555692855201048, "grad_norm": 3.1167523001211666, "learning_rate": 8.689314796669444e-08, "loss": 0.7615, "step": 5620 }, { "epoch": 0.5556681412648593, "grad_norm": 3.794812792088693, "learning_rate": 8.686140374965456e-08, "loss": 0.6658, "step": 5621 }, { "epoch": 0.5557669970096137, "grad_norm": 4.892222622837848, "learning_rate": 8.682966087978387e-08, "loss": 0.7006, "step": 5622 }, { "epoch": 0.5558658527543682, "grad_norm": 3.7294096728945525, "learning_rate": 8.679791936033719e-08, "loss": 0.6517, "step": 5623 }, { "epoch": 0.5559647084991226, "grad_norm": 9.319425853681834, "learning_rate": 8.676617919456917e-08, "loss": 0.7767, "step": 5624 }, { "epoch": 0.5560635642438772, "grad_norm": 3.7417975030093347, "learning_rate": 8.673444038573422e-08, "loss": 0.6976, "step": 5625 }, { "epoch": 0.5561624199886316, "grad_norm": 3.8820702178797655, "learning_rate": 8.670270293708675e-08, "loss": 0.6207, "step": 5626 }, { "epoch": 0.556261275733386, "grad_norm": 4.5674981519659745, "learning_rate": 8.667096685188095e-08, "loss": 0.6688, "step": 5627 }, { "epoch": 0.5563601314781406, "grad_norm": 2.682461763165966, "learning_rate": 8.663923213337088e-08, "loss": 0.7809, "step": 5628 }, { "epoch": 0.556458987222895, "grad_norm": 17.547632814610964, "learning_rate": 8.660749878481045e-08, "loss": 0.7059, "step": 5629 }, { "epoch": 0.5565578429676494, "grad_norm": 9.21040078190634, "learning_rate": 8.65757668094535e-08, "loss": 0.7484, "step": 5630 }, { "epoch": 0.556656698712404, "grad_norm": 3.081651563735899, "learning_rate": 8.654403621055359e-08, "loss": 0.729, "step": 5631 }, { "epoch": 0.5567555544571584, "grad_norm": 9.043719464648992, "learning_rate": 8.651230699136434e-08, "loss": 0.6884, "step": 5632 }, { "epoch": 0.5568544102019128, "grad_norm": 3.5039319159445577, "learning_rate": 8.6480579155139e-08, "loss": 0.7293, "step": 5633 }, { "epoch": 0.5569532659466673, "grad_norm": 5.382759753518157, "learning_rate": 8.644885270513084e-08, "loss": 0.6456, "step": 5634 }, { "epoch": 0.5570521216914218, "grad_norm": 3.542371790811809, "learning_rate": 8.6417127644593e-08, "loss": 0.6355, "step": 5635 }, { "epoch": 0.5571509774361763, "grad_norm": 3.074498441481339, "learning_rate": 8.638540397677831e-08, "loss": 0.6593, "step": 5636 }, { "epoch": 0.5572498331809307, "grad_norm": 5.912201559428354, "learning_rate": 8.635368170493962e-08, "loss": 0.682, "step": 5637 }, { "epoch": 0.5573486889256852, "grad_norm": 4.7916428868133165, "learning_rate": 8.63219608323296e-08, "loss": 0.7132, "step": 5638 }, { "epoch": 0.5574475446704397, "grad_norm": 4.282830867407417, "learning_rate": 8.629024136220069e-08, "loss": 0.7362, "step": 5639 }, { "epoch": 0.5575464004151941, "grad_norm": 3.7155949810912245, "learning_rate": 8.625852329780529e-08, "loss": 0.7362, "step": 5640 }, { "epoch": 0.5576452561599486, "grad_norm": 6.522620965312519, "learning_rate": 8.622680664239568e-08, "loss": 0.7041, "step": 5641 }, { "epoch": 0.5577441119047031, "grad_norm": 3.1046785464144238, "learning_rate": 8.619509139922383e-08, "loss": 0.7153, "step": 5642 }, { "epoch": 0.5578429676494575, "grad_norm": 4.477754923460198, "learning_rate": 8.616337757154177e-08, "loss": 0.7843, "step": 5643 }, { "epoch": 0.5579418233942119, "grad_norm": 4.294438785967592, "learning_rate": 8.613166516260119e-08, "loss": 0.5985, "step": 5644 }, { "epoch": 0.5580406791389665, "grad_norm": 2.783312081115992, "learning_rate": 8.609995417565379e-08, "loss": 0.7085, "step": 5645 }, { "epoch": 0.5581395348837209, "grad_norm": 3.4675851284163444, "learning_rate": 8.606824461395108e-08, "loss": 0.7965, "step": 5646 }, { "epoch": 0.5582383906284754, "grad_norm": 7.998306474484964, "learning_rate": 8.603653648074432e-08, "loss": 0.6634, "step": 5647 }, { "epoch": 0.5583372463732299, "grad_norm": 3.948815768332269, "learning_rate": 8.600482977928479e-08, "loss": 0.7197, "step": 5648 }, { "epoch": 0.5584361021179843, "grad_norm": 5.056535199583191, "learning_rate": 8.597312451282355e-08, "loss": 0.6733, "step": 5649 }, { "epoch": 0.5585349578627388, "grad_norm": 3.4944567647291787, "learning_rate": 8.594142068461143e-08, "loss": 0.7506, "step": 5650 }, { "epoch": 0.5586338136074933, "grad_norm": 4.201141738962651, "learning_rate": 8.590971829789928e-08, "loss": 0.6408, "step": 5651 }, { "epoch": 0.5587326693522477, "grad_norm": 6.6069574961053545, "learning_rate": 8.587801735593766e-08, "loss": 0.7145, "step": 5652 }, { "epoch": 0.5588315250970022, "grad_norm": 6.730359927484027, "learning_rate": 8.584631786197704e-08, "loss": 0.6921, "step": 5653 }, { "epoch": 0.5589303808417566, "grad_norm": 4.192976022981044, "learning_rate": 8.581461981926776e-08, "loss": 0.681, "step": 5654 }, { "epoch": 0.5590292365865112, "grad_norm": 3.993661664637694, "learning_rate": 8.578292323105997e-08, "loss": 0.6607, "step": 5655 }, { "epoch": 0.5591280923312656, "grad_norm": 6.485782892393973, "learning_rate": 8.575122810060364e-08, "loss": 0.6805, "step": 5656 }, { "epoch": 0.55922694807602, "grad_norm": 7.279940174237071, "learning_rate": 8.571953443114875e-08, "loss": 0.7109, "step": 5657 }, { "epoch": 0.5593258038207746, "grad_norm": 3.6874575802399803, "learning_rate": 8.56878422259449e-08, "loss": 0.6594, "step": 5658 }, { "epoch": 0.559424659565529, "grad_norm": 4.284744610763881, "learning_rate": 8.565615148824172e-08, "loss": 0.7074, "step": 5659 }, { "epoch": 0.5595235153102834, "grad_norm": 2.981343561633971, "learning_rate": 8.562446222128865e-08, "loss": 0.7233, "step": 5660 }, { "epoch": 0.559622371055038, "grad_norm": 6.281605853069698, "learning_rate": 8.559277442833488e-08, "loss": 0.6764, "step": 5661 }, { "epoch": 0.5597212267997924, "grad_norm": 4.253184774471703, "learning_rate": 8.556108811262961e-08, "loss": 0.7146, "step": 5662 }, { "epoch": 0.5598200825445468, "grad_norm": 3.782897517519226, "learning_rate": 8.552940327742177e-08, "loss": 0.6333, "step": 5663 }, { "epoch": 0.5599189382893013, "grad_norm": 7.478670004556808, "learning_rate": 8.549771992596013e-08, "loss": 0.5912, "step": 5664 }, { "epoch": 0.5600177940340558, "grad_norm": 7.153195481384384, "learning_rate": 8.546603806149343e-08, "loss": 0.7423, "step": 5665 }, { "epoch": 0.5601166497788103, "grad_norm": 5.870151004078986, "learning_rate": 8.543435768727009e-08, "loss": 0.7131, "step": 5666 }, { "epoch": 0.5602155055235647, "grad_norm": 3.753953592829209, "learning_rate": 8.540267880653851e-08, "loss": 0.6095, "step": 5667 }, { "epoch": 0.5603143612683192, "grad_norm": 3.5666826515263876, "learning_rate": 8.537100142254695e-08, "loss": 0.7677, "step": 5668 }, { "epoch": 0.5604132170130737, "grad_norm": 10.378312931503434, "learning_rate": 8.533932553854334e-08, "loss": 0.7615, "step": 5669 }, { "epoch": 0.5605120727578281, "grad_norm": 5.759047416319832, "learning_rate": 8.530765115777567e-08, "loss": 0.6372, "step": 5670 }, { "epoch": 0.5606109285025827, "grad_norm": 3.218960966161888, "learning_rate": 8.527597828349163e-08, "loss": 0.7038, "step": 5671 }, { "epoch": 0.5607097842473371, "grad_norm": 4.676239404422223, "learning_rate": 8.524430691893885e-08, "loss": 0.709, "step": 5672 }, { "epoch": 0.5608086399920915, "grad_norm": 4.628160729360212, "learning_rate": 8.521263706736468e-08, "loss": 0.8096, "step": 5673 }, { "epoch": 0.5609074957368461, "grad_norm": 4.676249055778843, "learning_rate": 8.518096873201652e-08, "loss": 0.7634, "step": 5674 }, { "epoch": 0.5610063514816005, "grad_norm": 4.363687119587278, "learning_rate": 8.514930191614136e-08, "loss": 0.7115, "step": 5675 }, { "epoch": 0.5611052072263549, "grad_norm": 9.275169337031349, "learning_rate": 8.511763662298627e-08, "loss": 0.8771, "step": 5676 }, { "epoch": 0.5612040629711094, "grad_norm": 8.89765068140308, "learning_rate": 8.508597285579798e-08, "loss": 0.7237, "step": 5677 }, { "epoch": 0.5613029187158639, "grad_norm": 4.685988277965803, "learning_rate": 8.50543106178232e-08, "loss": 0.684, "step": 5678 }, { "epoch": 0.5614017744606183, "grad_norm": 4.008633401180562, "learning_rate": 8.502264991230838e-08, "loss": 0.6854, "step": 5679 }, { "epoch": 0.5615006302053728, "grad_norm": 3.9393636152286007, "learning_rate": 8.49909907424999e-08, "loss": 0.6368, "step": 5680 }, { "epoch": 0.5615994859501273, "grad_norm": 3.7254231563734903, "learning_rate": 8.49593331116439e-08, "loss": 0.6293, "step": 5681 }, { "epoch": 0.5616983416948818, "grad_norm": 12.626517413198266, "learning_rate": 8.492767702298646e-08, "loss": 0.6836, "step": 5682 }, { "epoch": 0.5617971974396362, "grad_norm": 3.699984331521919, "learning_rate": 8.489602247977337e-08, "loss": 0.6336, "step": 5683 }, { "epoch": 0.5618960531843907, "grad_norm": 3.2258209229275545, "learning_rate": 8.486436948525037e-08, "loss": 0.7019, "step": 5684 }, { "epoch": 0.5619949089291452, "grad_norm": 4.064638538781205, "learning_rate": 8.483271804266305e-08, "loss": 0.6548, "step": 5685 }, { "epoch": 0.5620937646738996, "grad_norm": 3.5857249936384044, "learning_rate": 8.480106815525673e-08, "loss": 0.7256, "step": 5686 }, { "epoch": 0.562192620418654, "grad_norm": 4.556952758334905, "learning_rate": 8.476941982627672e-08, "loss": 0.7208, "step": 5687 }, { "epoch": 0.5622914761634086, "grad_norm": 6.709768272261749, "learning_rate": 8.473777305896798e-08, "loss": 0.7914, "step": 5688 }, { "epoch": 0.562390331908163, "grad_norm": 4.151950919762593, "learning_rate": 8.470612785657552e-08, "loss": 0.6732, "step": 5689 }, { "epoch": 0.5624891876529174, "grad_norm": 16.41292721404762, "learning_rate": 8.467448422234405e-08, "loss": 0.7063, "step": 5690 }, { "epoch": 0.562588043397672, "grad_norm": 3.723090133650851, "learning_rate": 8.464284215951811e-08, "loss": 0.7595, "step": 5691 }, { "epoch": 0.5626868991424264, "grad_norm": 3.9791937756092346, "learning_rate": 8.46112016713422e-08, "loss": 0.6581, "step": 5692 }, { "epoch": 0.5627857548871809, "grad_norm": 7.8539308560782874, "learning_rate": 8.457956276106058e-08, "loss": 0.6756, "step": 5693 }, { "epoch": 0.5628846106319354, "grad_norm": 3.627789598143021, "learning_rate": 8.454792543191728e-08, "loss": 0.7719, "step": 5694 }, { "epoch": 0.5629834663766898, "grad_norm": 3.809734508939976, "learning_rate": 8.451628968715634e-08, "loss": 0.6491, "step": 5695 }, { "epoch": 0.5630823221214443, "grad_norm": 3.2717409487970355, "learning_rate": 8.448465553002147e-08, "loss": 0.6866, "step": 5696 }, { "epoch": 0.5631811778661987, "grad_norm": 4.703541519248384, "learning_rate": 8.445302296375632e-08, "loss": 0.7219, "step": 5697 }, { "epoch": 0.5632800336109532, "grad_norm": 4.549821166626662, "learning_rate": 8.442139199160432e-08, "loss": 0.6065, "step": 5698 }, { "epoch": 0.5633788893557077, "grad_norm": 80.68119080243555, "learning_rate": 8.438976261680878e-08, "loss": 0.6911, "step": 5699 }, { "epoch": 0.5634777451004621, "grad_norm": 7.1312155505191335, "learning_rate": 8.435813484261279e-08, "loss": 0.6474, "step": 5700 }, { "epoch": 0.5635766008452167, "grad_norm": 6.724437770527778, "learning_rate": 8.432650867225938e-08, "loss": 0.6257, "step": 5701 }, { "epoch": 0.5636754565899711, "grad_norm": 2.5008685272942572, "learning_rate": 8.429488410899127e-08, "loss": 0.7585, "step": 5702 }, { "epoch": 0.5637743123347255, "grad_norm": 9.666956877987843, "learning_rate": 8.426326115605113e-08, "loss": 0.7435, "step": 5703 }, { "epoch": 0.5638731680794801, "grad_norm": 8.832196501962256, "learning_rate": 8.423163981668146e-08, "loss": 0.7361, "step": 5704 }, { "epoch": 0.5639720238242345, "grad_norm": 7.140635472763243, "learning_rate": 8.420002009412452e-08, "loss": 0.7347, "step": 5705 }, { "epoch": 0.5640708795689889, "grad_norm": 5.229333175956623, "learning_rate": 8.416840199162241e-08, "loss": 0.763, "step": 5706 }, { "epoch": 0.5641697353137434, "grad_norm": 8.153786396146925, "learning_rate": 8.41367855124172e-08, "loss": 0.7684, "step": 5707 }, { "epoch": 0.5642685910584979, "grad_norm": 4.713407812171167, "learning_rate": 8.410517065975058e-08, "loss": 0.8002, "step": 5708 }, { "epoch": 0.5643674468032523, "grad_norm": 3.5788265525199647, "learning_rate": 8.407355743686431e-08, "loss": 0.6956, "step": 5709 }, { "epoch": 0.5644663025480068, "grad_norm": 4.099951119870306, "learning_rate": 8.404194584699973e-08, "loss": 0.6819, "step": 5710 }, { "epoch": 0.5645651582927613, "grad_norm": 3.768054094521578, "learning_rate": 8.401033589339821e-08, "loss": 0.74, "step": 5711 }, { "epoch": 0.5646640140375158, "grad_norm": 7.508807184604626, "learning_rate": 8.397872757930092e-08, "loss": 0.7271, "step": 5712 }, { "epoch": 0.5647628697822702, "grad_norm": 4.956585813835168, "learning_rate": 8.394712090794875e-08, "loss": 0.8154, "step": 5713 }, { "epoch": 0.5648617255270247, "grad_norm": 4.822708481057138, "learning_rate": 8.391551588258256e-08, "loss": 0.7151, "step": 5714 }, { "epoch": 0.5649605812717792, "grad_norm": 3.939179407295999, "learning_rate": 8.388391250644294e-08, "loss": 0.7422, "step": 5715 }, { "epoch": 0.5650594370165336, "grad_norm": 4.315366935907443, "learning_rate": 8.385231078277037e-08, "loss": 0.7205, "step": 5716 }, { "epoch": 0.565158292761288, "grad_norm": 2.972167877981754, "learning_rate": 8.382071071480512e-08, "loss": 0.6552, "step": 5717 }, { "epoch": 0.5652571485060426, "grad_norm": 3.4979336182699026, "learning_rate": 8.378911230578736e-08, "loss": 0.6368, "step": 5718 }, { "epoch": 0.565356004250797, "grad_norm": 4.041805627082899, "learning_rate": 8.375751555895696e-08, "loss": 0.6603, "step": 5719 }, { "epoch": 0.5654548599955515, "grad_norm": 2.761764977345551, "learning_rate": 8.37259204775538e-08, "loss": 0.6331, "step": 5720 }, { "epoch": 0.565553715740306, "grad_norm": 9.010982556321155, "learning_rate": 8.369432706481739e-08, "loss": 0.7272, "step": 5721 }, { "epoch": 0.5656525714850604, "grad_norm": 3.5572630199836808, "learning_rate": 8.366273532398722e-08, "loss": 0.6002, "step": 5722 }, { "epoch": 0.5657514272298149, "grad_norm": 3.7754466083540943, "learning_rate": 8.363114525830258e-08, "loss": 0.7224, "step": 5723 }, { "epoch": 0.5658502829745694, "grad_norm": 4.098463553745817, "learning_rate": 8.359955687100251e-08, "loss": 0.69, "step": 5724 }, { "epoch": 0.5659491387193238, "grad_norm": 3.0640059259359265, "learning_rate": 8.356797016532594e-08, "loss": 0.719, "step": 5725 }, { "epoch": 0.5660479944640783, "grad_norm": 5.282919925044142, "learning_rate": 8.353638514451169e-08, "loss": 0.7294, "step": 5726 }, { "epoch": 0.5661468502088327, "grad_norm": 4.544609554361328, "learning_rate": 8.350480181179824e-08, "loss": 0.7844, "step": 5727 }, { "epoch": 0.5662457059535873, "grad_norm": 4.862583421981987, "learning_rate": 8.347322017042405e-08, "loss": 0.6996, "step": 5728 }, { "epoch": 0.5663445616983417, "grad_norm": 3.8002182466109384, "learning_rate": 8.344164022362738e-08, "loss": 0.6518, "step": 5729 }, { "epoch": 0.5664434174430961, "grad_norm": 6.825539751138623, "learning_rate": 8.341006197464622e-08, "loss": 0.6999, "step": 5730 }, { "epoch": 0.5665422731878507, "grad_norm": 18.932724092348842, "learning_rate": 8.337848542671853e-08, "loss": 0.6666, "step": 5731 }, { "epoch": 0.5666411289326051, "grad_norm": 4.7699768253783095, "learning_rate": 8.334691058308195e-08, "loss": 0.7087, "step": 5732 }, { "epoch": 0.5667399846773595, "grad_norm": 4.756790664870883, "learning_rate": 8.3315337446974e-08, "loss": 0.7634, "step": 5733 }, { "epoch": 0.5668388404221141, "grad_norm": 6.588416923534919, "learning_rate": 8.328376602163214e-08, "loss": 0.73, "step": 5734 }, { "epoch": 0.5669376961668685, "grad_norm": 3.371678969811359, "learning_rate": 8.325219631029344e-08, "loss": 0.6783, "step": 5735 }, { "epoch": 0.5670365519116229, "grad_norm": 22.220386229484344, "learning_rate": 8.322062831619497e-08, "loss": 0.6524, "step": 5736 }, { "epoch": 0.5671354076563774, "grad_norm": 8.536178910381503, "learning_rate": 8.318906204257359e-08, "loss": 0.6484, "step": 5737 }, { "epoch": 0.5672342634011319, "grad_norm": 11.011969830819462, "learning_rate": 8.315749749266587e-08, "loss": 0.7472, "step": 5738 }, { "epoch": 0.5673331191458864, "grad_norm": 5.710677824791938, "learning_rate": 8.312593466970836e-08, "loss": 0.7241, "step": 5739 }, { "epoch": 0.5674319748906408, "grad_norm": 4.4132678849852445, "learning_rate": 8.309437357693736e-08, "loss": 0.6213, "step": 5740 }, { "epoch": 0.5675308306353953, "grad_norm": 3.1534352488636395, "learning_rate": 8.306281421758897e-08, "loss": 0.7506, "step": 5741 }, { "epoch": 0.5676296863801498, "grad_norm": 4.918406098672621, "learning_rate": 8.303125659489911e-08, "loss": 0.7598, "step": 5742 }, { "epoch": 0.5677285421249042, "grad_norm": 7.631878667265885, "learning_rate": 8.29997007121036e-08, "loss": 0.8237, "step": 5743 }, { "epoch": 0.5678273978696587, "grad_norm": 4.14068433601075, "learning_rate": 8.2968146572438e-08, "loss": 0.7765, "step": 5744 }, { "epoch": 0.5679262536144132, "grad_norm": 45.05423416723282, "learning_rate": 8.293659417913776e-08, "loss": 0.6744, "step": 5745 }, { "epoch": 0.5680251093591676, "grad_norm": 4.7279335330810275, "learning_rate": 8.290504353543804e-08, "loss": 0.6541, "step": 5746 }, { "epoch": 0.568123965103922, "grad_norm": 3.390051848068928, "learning_rate": 8.287349464457395e-08, "loss": 0.6931, "step": 5747 }, { "epoch": 0.5682228208486766, "grad_norm": 2.9129505963200684, "learning_rate": 8.284194750978039e-08, "loss": 0.6718, "step": 5748 }, { "epoch": 0.568321676593431, "grad_norm": 3.6593708627000523, "learning_rate": 8.281040213429198e-08, "loss": 0.6695, "step": 5749 }, { "epoch": 0.5684205323381855, "grad_norm": 2.8688822870472324, "learning_rate": 8.277885852134325e-08, "loss": 0.7183, "step": 5750 }, { "epoch": 0.56851938808294, "grad_norm": 16.833099758020104, "learning_rate": 8.27473166741686e-08, "loss": 0.7057, "step": 5751 }, { "epoch": 0.5686182438276944, "grad_norm": 2.6682612779261, "learning_rate": 8.271577659600209e-08, "loss": 0.6512, "step": 5752 }, { "epoch": 0.5687170995724489, "grad_norm": 3.52474110491925, "learning_rate": 8.268423829007775e-08, "loss": 0.7473, "step": 5753 }, { "epoch": 0.5688159553172034, "grad_norm": 4.986351505263934, "learning_rate": 8.265270175962933e-08, "loss": 0.6192, "step": 5754 }, { "epoch": 0.5689148110619578, "grad_norm": 5.1967259930388385, "learning_rate": 8.262116700789044e-08, "loss": 0.6308, "step": 5755 }, { "epoch": 0.5690136668067123, "grad_norm": 2.9213192393116567, "learning_rate": 8.258963403809456e-08, "loss": 0.5351, "step": 5756 }, { "epoch": 0.5691125225514668, "grad_norm": 4.262134562506784, "learning_rate": 8.255810285347485e-08, "loss": 0.7139, "step": 5757 }, { "epoch": 0.5692113782962213, "grad_norm": 7.068833415909573, "learning_rate": 8.252657345726443e-08, "loss": 0.8013, "step": 5758 }, { "epoch": 0.5693102340409757, "grad_norm": 3.9343458423149316, "learning_rate": 8.249504585269617e-08, "loss": 0.6747, "step": 5759 }, { "epoch": 0.5694090897857301, "grad_norm": 2.763257187458842, "learning_rate": 8.24635200430027e-08, "loss": 0.6999, "step": 5760 }, { "epoch": 0.5695079455304847, "grad_norm": 5.76451974249321, "learning_rate": 8.243199603141656e-08, "loss": 0.732, "step": 5761 }, { "epoch": 0.5696068012752391, "grad_norm": 3.4904799917522493, "learning_rate": 8.240047382117014e-08, "loss": 0.7845, "step": 5762 }, { "epoch": 0.5697056570199935, "grad_norm": 8.26485778888831, "learning_rate": 8.236895341549546e-08, "loss": 0.7382, "step": 5763 }, { "epoch": 0.5698045127647481, "grad_norm": 3.6040753777628094, "learning_rate": 8.233743481762459e-08, "loss": 0.6768, "step": 5764 }, { "epoch": 0.5699033685095025, "grad_norm": 5.468959215410337, "learning_rate": 8.230591803078917e-08, "loss": 0.7257, "step": 5765 }, { "epoch": 0.570002224254257, "grad_norm": 8.053436255268032, "learning_rate": 8.227440305822088e-08, "loss": 0.816, "step": 5766 }, { "epoch": 0.5701010799990115, "grad_norm": 6.11959781746441, "learning_rate": 8.22428899031511e-08, "loss": 0.7844, "step": 5767 }, { "epoch": 0.5701999357437659, "grad_norm": 4.089909421988143, "learning_rate": 8.221137856881102e-08, "loss": 0.6849, "step": 5768 }, { "epoch": 0.5702987914885204, "grad_norm": 3.4137107573537637, "learning_rate": 8.217986905843164e-08, "loss": 0.7379, "step": 5769 }, { "epoch": 0.5703976472332748, "grad_norm": 4.551681195752091, "learning_rate": 8.214836137524387e-08, "loss": 0.7564, "step": 5770 }, { "epoch": 0.5704965029780293, "grad_norm": 3.433775326095088, "learning_rate": 8.211685552247827e-08, "loss": 0.6907, "step": 5771 }, { "epoch": 0.5705953587227838, "grad_norm": 2.6980100066663906, "learning_rate": 8.208535150336533e-08, "loss": 0.6782, "step": 5772 }, { "epoch": 0.5706942144675382, "grad_norm": 5.468379290895837, "learning_rate": 8.205384932113539e-08, "loss": 0.733, "step": 5773 }, { "epoch": 0.5707930702122928, "grad_norm": 12.673819014864709, "learning_rate": 8.202234897901843e-08, "loss": 0.72, "step": 5774 }, { "epoch": 0.5708919259570472, "grad_norm": 4.269312301786664, "learning_rate": 8.199085048024442e-08, "loss": 0.7221, "step": 5775 }, { "epoch": 0.5709907817018016, "grad_norm": 5.233617616599987, "learning_rate": 8.195935382804303e-08, "loss": 0.73, "step": 5776 }, { "epoch": 0.5710896374465562, "grad_norm": 3.6306908923904055, "learning_rate": 8.192785902564377e-08, "loss": 0.7306, "step": 5777 }, { "epoch": 0.5711884931913106, "grad_norm": 3.667700951061173, "learning_rate": 8.189636607627603e-08, "loss": 0.8471, "step": 5778 }, { "epoch": 0.571287348936065, "grad_norm": 7.601872591540931, "learning_rate": 8.186487498316886e-08, "loss": 0.7407, "step": 5779 }, { "epoch": 0.5713862046808195, "grad_norm": 4.052302064846763, "learning_rate": 8.183338574955123e-08, "loss": 0.7702, "step": 5780 }, { "epoch": 0.571485060425574, "grad_norm": 7.850648924733983, "learning_rate": 8.180189837865196e-08, "loss": 0.83, "step": 5781 }, { "epoch": 0.5715839161703284, "grad_norm": 5.900913768119201, "learning_rate": 8.177041287369955e-08, "loss": 0.6382, "step": 5782 }, { "epoch": 0.5716827719150829, "grad_norm": 10.103702306148817, "learning_rate": 8.17389292379224e-08, "loss": 0.699, "step": 5783 }, { "epoch": 0.5717816276598374, "grad_norm": 2.9188908710032555, "learning_rate": 8.17074474745487e-08, "loss": 0.8353, "step": 5784 }, { "epoch": 0.5718804834045919, "grad_norm": 4.598168268928201, "learning_rate": 8.167596758680641e-08, "loss": 0.6587, "step": 5785 }, { "epoch": 0.5719793391493463, "grad_norm": 6.095519085801619, "learning_rate": 8.164448957792335e-08, "loss": 0.7825, "step": 5786 }, { "epoch": 0.5720781948941008, "grad_norm": 3.1011564605333715, "learning_rate": 8.161301345112713e-08, "loss": 0.5837, "step": 5787 }, { "epoch": 0.5721770506388553, "grad_norm": 4.480807429685703, "learning_rate": 8.158153920964512e-08, "loss": 0.8083, "step": 5788 }, { "epoch": 0.5722759063836097, "grad_norm": 75.89713442947463, "learning_rate": 8.155006685670462e-08, "loss": 0.7313, "step": 5789 }, { "epoch": 0.5723747621283641, "grad_norm": 3.380624603865087, "learning_rate": 8.151859639553256e-08, "loss": 0.6522, "step": 5790 }, { "epoch": 0.5724736178731187, "grad_norm": 19.833349387044496, "learning_rate": 8.148712782935582e-08, "loss": 0.72, "step": 5791 }, { "epoch": 0.5725724736178731, "grad_norm": 4.63398151607536, "learning_rate": 8.14556611614011e-08, "loss": 0.6154, "step": 5792 }, { "epoch": 0.5726713293626275, "grad_norm": 4.866717584760495, "learning_rate": 8.142419639489474e-08, "loss": 0.7425, "step": 5793 }, { "epoch": 0.5727701851073821, "grad_norm": 6.663186184438289, "learning_rate": 8.139273353306301e-08, "loss": 0.6653, "step": 5794 }, { "epoch": 0.5728690408521365, "grad_norm": 12.075860888574542, "learning_rate": 8.136127257913202e-08, "loss": 0.7437, "step": 5795 }, { "epoch": 0.572967896596891, "grad_norm": 10.802707858363302, "learning_rate": 8.132981353632755e-08, "loss": 0.7488, "step": 5796 }, { "epoch": 0.5730667523416455, "grad_norm": 8.520515363367952, "learning_rate": 8.129835640787533e-08, "loss": 0.8038, "step": 5797 }, { "epoch": 0.5731656080863999, "grad_norm": 4.511429937538754, "learning_rate": 8.126690119700076e-08, "loss": 0.6427, "step": 5798 }, { "epoch": 0.5732644638311544, "grad_norm": 3.059444873125183, "learning_rate": 8.123544790692913e-08, "loss": 0.7931, "step": 5799 }, { "epoch": 0.5733633195759088, "grad_norm": 3.505868420706949, "learning_rate": 8.120399654088558e-08, "loss": 0.61, "step": 5800 }, { "epoch": 0.5734621753206633, "grad_norm": 3.3058881131662763, "learning_rate": 8.117254710209487e-08, "loss": 0.6278, "step": 5801 }, { "epoch": 0.5735610310654178, "grad_norm": 5.045298905863333, "learning_rate": 8.114109959378174e-08, "loss": 0.8382, "step": 5802 }, { "epoch": 0.5736598868101722, "grad_norm": 13.815477797537483, "learning_rate": 8.110965401917069e-08, "loss": 0.7029, "step": 5803 }, { "epoch": 0.5737587425549268, "grad_norm": 3.705503463916965, "learning_rate": 8.107821038148593e-08, "loss": 0.6941, "step": 5804 }, { "epoch": 0.5738575982996812, "grad_norm": 6.449322587320779, "learning_rate": 8.104676868395157e-08, "loss": 0.7907, "step": 5805 }, { "epoch": 0.5739564540444356, "grad_norm": 4.0116850747215125, "learning_rate": 8.101532892979156e-08, "loss": 0.6839, "step": 5806 }, { "epoch": 0.5740553097891902, "grad_norm": 3.7561531816151077, "learning_rate": 8.098389112222947e-08, "loss": 0.704, "step": 5807 }, { "epoch": 0.5741541655339446, "grad_norm": 3.6036409982736486, "learning_rate": 8.095245526448889e-08, "loss": 0.557, "step": 5808 }, { "epoch": 0.574253021278699, "grad_norm": 3.0827418430911724, "learning_rate": 8.092102135979299e-08, "loss": 0.6956, "step": 5809 }, { "epoch": 0.5743518770234535, "grad_norm": 3.566026886930881, "learning_rate": 8.088958941136495e-08, "loss": 0.7153, "step": 5810 }, { "epoch": 0.574450732768208, "grad_norm": 4.732488932605444, "learning_rate": 8.085815942242762e-08, "loss": 0.7856, "step": 5811 }, { "epoch": 0.5745495885129625, "grad_norm": 3.724780661292933, "learning_rate": 8.082673139620366e-08, "loss": 0.7896, "step": 5812 }, { "epoch": 0.5746484442577169, "grad_norm": 3.900739667497971, "learning_rate": 8.079530533591557e-08, "loss": 0.6779, "step": 5813 }, { "epoch": 0.5747473000024714, "grad_norm": 7.730751549577519, "learning_rate": 8.076388124478565e-08, "loss": 0.799, "step": 5814 }, { "epoch": 0.5748461557472259, "grad_norm": 4.622447362041101, "learning_rate": 8.073245912603594e-08, "loss": 0.6887, "step": 5815 }, { "epoch": 0.5749450114919803, "grad_norm": 9.943640137080605, "learning_rate": 8.070103898288832e-08, "loss": 0.7301, "step": 5816 }, { "epoch": 0.5750438672367348, "grad_norm": 4.490399283427174, "learning_rate": 8.06696208185645e-08, "loss": 0.6978, "step": 5817 }, { "epoch": 0.5751427229814893, "grad_norm": 3.5603857530213685, "learning_rate": 8.06382046362859e-08, "loss": 0.7077, "step": 5818 }, { "epoch": 0.5752415787262437, "grad_norm": 5.877840407618834, "learning_rate": 8.060679043927385e-08, "loss": 0.6192, "step": 5819 }, { "epoch": 0.5753404344709981, "grad_norm": 8.995545527066994, "learning_rate": 8.057537823074934e-08, "loss": 0.7211, "step": 5820 }, { "epoch": 0.5754392902157527, "grad_norm": 4.781175743001937, "learning_rate": 8.054396801393324e-08, "loss": 0.6954, "step": 5821 }, { "epoch": 0.5755381459605071, "grad_norm": 4.674748784270626, "learning_rate": 8.051255979204626e-08, "loss": 0.7397, "step": 5822 }, { "epoch": 0.5756370017052616, "grad_norm": 3.9335288319339066, "learning_rate": 8.048115356830878e-08, "loss": 0.682, "step": 5823 }, { "epoch": 0.5757358574500161, "grad_norm": 4.799503056442607, "learning_rate": 8.044974934594107e-08, "loss": 0.7865, "step": 5824 }, { "epoch": 0.5758347131947705, "grad_norm": 3.4969583882819197, "learning_rate": 8.041834712816322e-08, "loss": 0.7952, "step": 5825 }, { "epoch": 0.575933568939525, "grad_norm": 26.713203978176885, "learning_rate": 8.038694691819498e-08, "loss": 0.7202, "step": 5826 }, { "epoch": 0.5760324246842795, "grad_norm": 7.925842485413404, "learning_rate": 8.035554871925606e-08, "loss": 0.6686, "step": 5827 }, { "epoch": 0.5761312804290339, "grad_norm": 5.293275657336656, "learning_rate": 8.032415253456582e-08, "loss": 0.7347, "step": 5828 }, { "epoch": 0.5762301361737884, "grad_norm": 19.569068067883787, "learning_rate": 8.029275836734351e-08, "loss": 0.6881, "step": 5829 }, { "epoch": 0.5763289919185429, "grad_norm": 19.479621137583376, "learning_rate": 8.026136622080815e-08, "loss": 0.7339, "step": 5830 }, { "epoch": 0.5764278476632974, "grad_norm": 4.993737931599202, "learning_rate": 8.022997609817848e-08, "loss": 0.8138, "step": 5831 }, { "epoch": 0.5765267034080518, "grad_norm": 3.626587976341799, "learning_rate": 8.019858800267312e-08, "loss": 0.7079, "step": 5832 }, { "epoch": 0.5766255591528062, "grad_norm": 6.351841017192694, "learning_rate": 8.016720193751053e-08, "loss": 0.7097, "step": 5833 }, { "epoch": 0.5767244148975608, "grad_norm": 3.107258168791699, "learning_rate": 8.013581790590876e-08, "loss": 0.7429, "step": 5834 }, { "epoch": 0.5768232706423152, "grad_norm": 4.587197958447509, "learning_rate": 8.010443591108586e-08, "loss": 0.6762, "step": 5835 }, { "epoch": 0.5769221263870696, "grad_norm": 4.023453381044657, "learning_rate": 8.007305595625961e-08, "loss": 0.7179, "step": 5836 }, { "epoch": 0.5770209821318242, "grad_norm": 4.089297316147371, "learning_rate": 8.004167804464752e-08, "loss": 0.6711, "step": 5837 }, { "epoch": 0.5771198378765786, "grad_norm": 41.539799620816986, "learning_rate": 8.001030217946692e-08, "loss": 0.7855, "step": 5838 }, { "epoch": 0.577218693621333, "grad_norm": 7.122310685310781, "learning_rate": 7.997892836393499e-08, "loss": 0.695, "step": 5839 }, { "epoch": 0.5773175493660876, "grad_norm": 11.18500330772825, "learning_rate": 7.99475566012686e-08, "loss": 0.7644, "step": 5840 }, { "epoch": 0.577416405110842, "grad_norm": 3.9949963628443492, "learning_rate": 7.991618689468452e-08, "loss": 0.7114, "step": 5841 }, { "epoch": 0.5775152608555965, "grad_norm": 5.279655696806209, "learning_rate": 7.988481924739916e-08, "loss": 0.7425, "step": 5842 }, { "epoch": 0.5776141166003509, "grad_norm": 3.439545550586066, "learning_rate": 7.98534536626289e-08, "loss": 0.7286, "step": 5843 }, { "epoch": 0.5777129723451054, "grad_norm": 5.635034799693416, "learning_rate": 7.982209014358979e-08, "loss": 0.6863, "step": 5844 }, { "epoch": 0.5778118280898599, "grad_norm": 6.931549644456772, "learning_rate": 7.979072869349767e-08, "loss": 0.6724, "step": 5845 }, { "epoch": 0.5779106838346143, "grad_norm": 3.499675700286754, "learning_rate": 7.975936931556823e-08, "loss": 0.6907, "step": 5846 }, { "epoch": 0.5780095395793688, "grad_norm": 4.2641770854892735, "learning_rate": 7.972801201301693e-08, "loss": 0.7617, "step": 5847 }, { "epoch": 0.5781083953241233, "grad_norm": 4.429592683293635, "learning_rate": 7.96966567890589e-08, "loss": 0.6937, "step": 5848 }, { "epoch": 0.5782072510688777, "grad_norm": 4.8511688524800585, "learning_rate": 7.966530364690924e-08, "loss": 0.6573, "step": 5849 }, { "epoch": 0.5783061068136323, "grad_norm": 3.625076061371386, "learning_rate": 7.963395258978276e-08, "loss": 0.6143, "step": 5850 }, { "epoch": 0.5784049625583867, "grad_norm": 4.067307136870742, "learning_rate": 7.960260362089399e-08, "loss": 0.6956, "step": 5851 }, { "epoch": 0.5785038183031411, "grad_norm": 2.9773974338626017, "learning_rate": 7.957125674345738e-08, "loss": 0.6935, "step": 5852 }, { "epoch": 0.5786026740478956, "grad_norm": 3.788201303457232, "learning_rate": 7.9539911960687e-08, "loss": 0.6761, "step": 5853 }, { "epoch": 0.5787015297926501, "grad_norm": 4.9975764811263215, "learning_rate": 7.950856927579685e-08, "loss": 0.7872, "step": 5854 }, { "epoch": 0.5788003855374045, "grad_norm": 3.489363421165812, "learning_rate": 7.947722869200068e-08, "loss": 0.7368, "step": 5855 }, { "epoch": 0.578899241282159, "grad_norm": 9.156578215396106, "learning_rate": 7.944589021251196e-08, "loss": 0.652, "step": 5856 }, { "epoch": 0.5789980970269135, "grad_norm": 3.4162520693209815, "learning_rate": 7.941455384054398e-08, "loss": 0.6855, "step": 5857 }, { "epoch": 0.579096952771668, "grad_norm": 3.6104453864047676, "learning_rate": 7.938321957930988e-08, "loss": 0.7325, "step": 5858 }, { "epoch": 0.5791958085164224, "grad_norm": 3.0774551108812074, "learning_rate": 7.935188743202247e-08, "loss": 0.7036, "step": 5859 }, { "epoch": 0.5792946642611769, "grad_norm": 3.9428916589763525, "learning_rate": 7.93205574018944e-08, "loss": 0.7453, "step": 5860 }, { "epoch": 0.5793935200059314, "grad_norm": 4.487331526368109, "learning_rate": 7.928922949213819e-08, "loss": 0.7228, "step": 5861 }, { "epoch": 0.5794923757506858, "grad_norm": 6.491521534954926, "learning_rate": 7.925790370596595e-08, "loss": 0.7891, "step": 5862 }, { "epoch": 0.5795912314954402, "grad_norm": 3.1847051536956634, "learning_rate": 7.922658004658975e-08, "loss": 0.6549, "step": 5863 }, { "epoch": 0.5796900872401948, "grad_norm": 6.213625417149403, "learning_rate": 7.91952585172213e-08, "loss": 0.7271, "step": 5864 }, { "epoch": 0.5797889429849492, "grad_norm": 4.920221040352242, "learning_rate": 7.91639391210722e-08, "loss": 0.6466, "step": 5865 }, { "epoch": 0.5798877987297036, "grad_norm": 4.365108961695499, "learning_rate": 7.913262186135383e-08, "loss": 0.6079, "step": 5866 }, { "epoch": 0.5799866544744582, "grad_norm": 3.1329846597671778, "learning_rate": 7.910130674127722e-08, "loss": 0.7357, "step": 5867 }, { "epoch": 0.5800855102192126, "grad_norm": 3.7368679480368963, "learning_rate": 7.906999376405332e-08, "loss": 0.7441, "step": 5868 }, { "epoch": 0.580184365963967, "grad_norm": 15.680182285561171, "learning_rate": 7.903868293289287e-08, "loss": 0.6966, "step": 5869 }, { "epoch": 0.5802832217087216, "grad_norm": 6.748491855622522, "learning_rate": 7.900737425100626e-08, "loss": 0.5918, "step": 5870 }, { "epoch": 0.580382077453476, "grad_norm": 5.61974014949513, "learning_rate": 7.897606772160375e-08, "loss": 0.6816, "step": 5871 }, { "epoch": 0.5804809331982305, "grad_norm": 4.893340026241116, "learning_rate": 7.894476334789538e-08, "loss": 0.7071, "step": 5872 }, { "epoch": 0.5805797889429849, "grad_norm": 3.071633172245995, "learning_rate": 7.891346113309094e-08, "loss": 0.7655, "step": 5873 }, { "epoch": 0.5806786446877394, "grad_norm": 14.960560787808406, "learning_rate": 7.888216108040004e-08, "loss": 0.6689, "step": 5874 }, { "epoch": 0.5807775004324939, "grad_norm": 3.2063728889731915, "learning_rate": 7.885086319303198e-08, "loss": 0.6905, "step": 5875 }, { "epoch": 0.5808763561772483, "grad_norm": 4.1815904743522, "learning_rate": 7.881956747419592e-08, "loss": 0.675, "step": 5876 }, { "epoch": 0.5809752119220029, "grad_norm": 4.179430979779995, "learning_rate": 7.878827392710082e-08, "loss": 0.5979, "step": 5877 }, { "epoch": 0.5810740676667573, "grad_norm": 4.509452329597974, "learning_rate": 7.875698255495528e-08, "loss": 0.8314, "step": 5878 }, { "epoch": 0.5811729234115117, "grad_norm": 3.5921645477455244, "learning_rate": 7.872569336096784e-08, "loss": 0.6134, "step": 5879 }, { "epoch": 0.5812717791562663, "grad_norm": 3.8676706377328394, "learning_rate": 7.869440634834677e-08, "loss": 0.7895, "step": 5880 }, { "epoch": 0.5813706349010207, "grad_norm": 4.287131517670594, "learning_rate": 7.866312152030002e-08, "loss": 0.7106, "step": 5881 }, { "epoch": 0.5814694906457751, "grad_norm": 4.550275263469946, "learning_rate": 7.863183888003538e-08, "loss": 0.6836, "step": 5882 }, { "epoch": 0.5815683463905296, "grad_norm": 6.071150373206913, "learning_rate": 7.860055843076053e-08, "loss": 0.6145, "step": 5883 }, { "epoch": 0.5816672021352841, "grad_norm": 5.009452125649452, "learning_rate": 7.85692801756827e-08, "loss": 0.7332, "step": 5884 }, { "epoch": 0.5817660578800385, "grad_norm": 190.7721169486765, "learning_rate": 7.85380041180091e-08, "loss": 0.792, "step": 5885 }, { "epoch": 0.581864913624793, "grad_norm": 3.574231089733683, "learning_rate": 7.850673026094655e-08, "loss": 0.6461, "step": 5886 }, { "epoch": 0.5819637693695475, "grad_norm": 3.3558854893815253, "learning_rate": 7.847545860770176e-08, "loss": 0.6365, "step": 5887 }, { "epoch": 0.582062625114302, "grad_norm": 6.134268333001915, "learning_rate": 7.844418916148123e-08, "loss": 0.7239, "step": 5888 }, { "epoch": 0.5821614808590564, "grad_norm": 3.625876845667662, "learning_rate": 7.841292192549109e-08, "loss": 0.718, "step": 5889 }, { "epoch": 0.5822603366038109, "grad_norm": 3.3105921229242603, "learning_rate": 7.838165690293739e-08, "loss": 0.7009, "step": 5890 }, { "epoch": 0.5823591923485654, "grad_norm": 4.693272614967511, "learning_rate": 7.835039409702591e-08, "loss": 0.7295, "step": 5891 }, { "epoch": 0.5824580480933198, "grad_norm": 10.730359160486268, "learning_rate": 7.831913351096211e-08, "loss": 0.7511, "step": 5892 }, { "epoch": 0.5825569038380742, "grad_norm": 9.4897000996809, "learning_rate": 7.828787514795135e-08, "loss": 0.7829, "step": 5893 }, { "epoch": 0.5826557595828288, "grad_norm": 4.585429833122537, "learning_rate": 7.825661901119876e-08, "loss": 0.711, "step": 5894 }, { "epoch": 0.5827546153275832, "grad_norm": 7.602459208086165, "learning_rate": 7.822536510390912e-08, "loss": 0.7149, "step": 5895 }, { "epoch": 0.5828534710723376, "grad_norm": 3.068522981919128, "learning_rate": 7.819411342928713e-08, "loss": 0.7085, "step": 5896 }, { "epoch": 0.5829523268170922, "grad_norm": 3.177925283313242, "learning_rate": 7.816286399053711e-08, "loss": 0.664, "step": 5897 }, { "epoch": 0.5830511825618466, "grad_norm": 3.318800879096164, "learning_rate": 7.813161679086329e-08, "loss": 0.7627, "step": 5898 }, { "epoch": 0.5831500383066011, "grad_norm": 3.2592079472103577, "learning_rate": 7.810037183346957e-08, "loss": 0.6802, "step": 5899 }, { "epoch": 0.5832488940513556, "grad_norm": 9.289225150229415, "learning_rate": 7.806912912155968e-08, "loss": 0.6414, "step": 5900 }, { "epoch": 0.58334774979611, "grad_norm": 5.1661077381618945, "learning_rate": 7.803788865833709e-08, "loss": 0.6592, "step": 5901 }, { "epoch": 0.5834466055408645, "grad_norm": 7.17108156046824, "learning_rate": 7.800665044700509e-08, "loss": 0.6081, "step": 5902 }, { "epoch": 0.583545461285619, "grad_norm": 5.175224019261036, "learning_rate": 7.797541449076662e-08, "loss": 0.7604, "step": 5903 }, { "epoch": 0.5836443170303734, "grad_norm": 5.05016768437379, "learning_rate": 7.79441807928245e-08, "loss": 0.7028, "step": 5904 }, { "epoch": 0.5837431727751279, "grad_norm": 3.970636555410633, "learning_rate": 7.791294935638133e-08, "loss": 0.7006, "step": 5905 }, { "epoch": 0.5838420285198823, "grad_norm": 2.9235381406080476, "learning_rate": 7.788172018463938e-08, "loss": 0.7101, "step": 5906 }, { "epoch": 0.5839408842646369, "grad_norm": 2.9442897939573704, "learning_rate": 7.785049328080078e-08, "loss": 0.6299, "step": 5907 }, { "epoch": 0.5840397400093913, "grad_norm": 8.562840833666016, "learning_rate": 7.781926864806735e-08, "loss": 0.7022, "step": 5908 }, { "epoch": 0.5841385957541457, "grad_norm": 2.9099019011260703, "learning_rate": 7.778804628964071e-08, "loss": 0.684, "step": 5909 }, { "epoch": 0.5842374514989003, "grad_norm": 9.192077885254628, "learning_rate": 7.775682620872232e-08, "loss": 0.8509, "step": 5910 }, { "epoch": 0.5843363072436547, "grad_norm": 4.02258664859726, "learning_rate": 7.772560840851324e-08, "loss": 0.7837, "step": 5911 }, { "epoch": 0.5844351629884091, "grad_norm": 4.561139569023554, "learning_rate": 7.769439289221445e-08, "loss": 0.6759, "step": 5912 }, { "epoch": 0.5845340187331637, "grad_norm": 4.0637088517959175, "learning_rate": 7.766317966302669e-08, "loss": 0.643, "step": 5913 }, { "epoch": 0.5846328744779181, "grad_norm": 3.277485881564006, "learning_rate": 7.763196872415032e-08, "loss": 0.7116, "step": 5914 }, { "epoch": 0.5847317302226726, "grad_norm": 3.92710865429683, "learning_rate": 7.760076007878562e-08, "loss": 0.8483, "step": 5915 }, { "epoch": 0.584830585967427, "grad_norm": 5.559333763087896, "learning_rate": 7.756955373013256e-08, "loss": 0.7733, "step": 5916 }, { "epoch": 0.5849294417121815, "grad_norm": 4.496787787577906, "learning_rate": 7.75383496813909e-08, "loss": 0.7259, "step": 5917 }, { "epoch": 0.585028297456936, "grad_norm": 10.503370124217925, "learning_rate": 7.750714793576016e-08, "loss": 0.6947, "step": 5918 }, { "epoch": 0.5851271532016904, "grad_norm": 4.026571300324471, "learning_rate": 7.747594849643957e-08, "loss": 0.741, "step": 5919 }, { "epoch": 0.5852260089464449, "grad_norm": 4.415241206868575, "learning_rate": 7.744475136662822e-08, "loss": 0.6282, "step": 5920 }, { "epoch": 0.5853248646911994, "grad_norm": 5.982679706718438, "learning_rate": 7.741355654952493e-08, "loss": 0.7073, "step": 5921 }, { "epoch": 0.5854237204359538, "grad_norm": 11.96229749384239, "learning_rate": 7.738236404832821e-08, "loss": 0.6712, "step": 5922 }, { "epoch": 0.5855225761807084, "grad_norm": 5.387699713455965, "learning_rate": 7.735117386623641e-08, "loss": 0.7243, "step": 5923 }, { "epoch": 0.5856214319254628, "grad_norm": 3.536856985786053, "learning_rate": 7.731998600644769e-08, "loss": 0.6683, "step": 5924 }, { "epoch": 0.5857202876702172, "grad_norm": 4.384844992152021, "learning_rate": 7.728880047215982e-08, "loss": 0.8397, "step": 5925 }, { "epoch": 0.5858191434149717, "grad_norm": 4.496139682364623, "learning_rate": 7.725761726657044e-08, "loss": 0.7454, "step": 5926 }, { "epoch": 0.5859179991597262, "grad_norm": 4.725576037748162, "learning_rate": 7.722643639287698e-08, "loss": 0.7541, "step": 5927 }, { "epoch": 0.5860168549044806, "grad_norm": 3.679826199351501, "learning_rate": 7.719525785427647e-08, "loss": 0.7234, "step": 5928 }, { "epoch": 0.5861157106492351, "grad_norm": 4.880051339244408, "learning_rate": 7.716408165396593e-08, "loss": 0.7361, "step": 5929 }, { "epoch": 0.5862145663939896, "grad_norm": 3.063055984799968, "learning_rate": 7.713290779514192e-08, "loss": 0.68, "step": 5930 }, { "epoch": 0.586313422138744, "grad_norm": 10.723015931617267, "learning_rate": 7.71017362810009e-08, "loss": 0.6356, "step": 5931 }, { "epoch": 0.5864122778834985, "grad_norm": 4.330015750058091, "learning_rate": 7.707056711473909e-08, "loss": 0.6989, "step": 5932 }, { "epoch": 0.586511133628253, "grad_norm": 4.733451645488777, "learning_rate": 7.703940029955236e-08, "loss": 0.6727, "step": 5933 }, { "epoch": 0.5866099893730075, "grad_norm": 3.3419036738634706, "learning_rate": 7.700823583863644e-08, "loss": 0.6886, "step": 5934 }, { "epoch": 0.5867088451177619, "grad_norm": 4.831015775667677, "learning_rate": 7.697707373518683e-08, "loss": 0.8237, "step": 5935 }, { "epoch": 0.5868077008625163, "grad_norm": 2.641889115851129, "learning_rate": 7.694591399239862e-08, "loss": 0.6404, "step": 5936 }, { "epoch": 0.5869065566072709, "grad_norm": 3.9799969432940308, "learning_rate": 7.69147566134669e-08, "loss": 0.7062, "step": 5937 }, { "epoch": 0.5870054123520253, "grad_norm": 4.045138190323958, "learning_rate": 7.688360160158635e-08, "loss": 0.7893, "step": 5938 }, { "epoch": 0.5871042680967797, "grad_norm": 5.702411303256432, "learning_rate": 7.685244895995147e-08, "loss": 0.7391, "step": 5939 }, { "epoch": 0.5872031238415343, "grad_norm": 6.700297380896003, "learning_rate": 7.682129869175651e-08, "loss": 0.7123, "step": 5940 }, { "epoch": 0.5873019795862887, "grad_norm": 9.020802825149772, "learning_rate": 7.679015080019543e-08, "loss": 0.7272, "step": 5941 }, { "epoch": 0.5874008353310431, "grad_norm": 4.300161118822472, "learning_rate": 7.675900528846203e-08, "loss": 0.7019, "step": 5942 }, { "epoch": 0.5874996910757977, "grad_norm": 3.105683134889722, "learning_rate": 7.672786215974982e-08, "loss": 0.6221, "step": 5943 }, { "epoch": 0.5875985468205521, "grad_norm": 4.028657347614808, "learning_rate": 7.669672141725205e-08, "loss": 0.6677, "step": 5944 }, { "epoch": 0.5876974025653066, "grad_norm": 5.894485685237364, "learning_rate": 7.666558306416173e-08, "loss": 0.6233, "step": 5945 }, { "epoch": 0.587796258310061, "grad_norm": 6.225182708962404, "learning_rate": 7.66344471036717e-08, "loss": 0.7306, "step": 5946 }, { "epoch": 0.5878951140548155, "grad_norm": 3.196215573795415, "learning_rate": 7.660331353897441e-08, "loss": 0.6812, "step": 5947 }, { "epoch": 0.58799396979957, "grad_norm": 2.933671636110435, "learning_rate": 7.65721823732622e-08, "loss": 0.6708, "step": 5948 }, { "epoch": 0.5880928255443244, "grad_norm": 3.508713392903397, "learning_rate": 7.654105360972714e-08, "loss": 0.8536, "step": 5949 }, { "epoch": 0.588191681289079, "grad_norm": 3.4772505466392833, "learning_rate": 7.650992725156095e-08, "loss": 0.7136, "step": 5950 }, { "epoch": 0.5882905370338334, "grad_norm": 3.1537476026561944, "learning_rate": 7.647880330195525e-08, "loss": 0.6939, "step": 5951 }, { "epoch": 0.5883893927785878, "grad_norm": 3.1256520660546063, "learning_rate": 7.64476817641013e-08, "loss": 0.7236, "step": 5952 }, { "epoch": 0.5884882485233424, "grad_norm": 4.7457617222063515, "learning_rate": 7.641656264119013e-08, "loss": 0.5965, "step": 5953 }, { "epoch": 0.5885871042680968, "grad_norm": 3.5011957111357597, "learning_rate": 7.638544593641262e-08, "loss": 0.6852, "step": 5954 }, { "epoch": 0.5886859600128512, "grad_norm": 6.842915958176698, "learning_rate": 7.635433165295927e-08, "loss": 0.5828, "step": 5955 }, { "epoch": 0.5887848157576057, "grad_norm": 3.143601467512425, "learning_rate": 7.63232197940204e-08, "loss": 0.72, "step": 5956 }, { "epoch": 0.5888836715023602, "grad_norm": 9.6878131069622, "learning_rate": 7.629211036278612e-08, "loss": 0.7056, "step": 5957 }, { "epoch": 0.5889825272471146, "grad_norm": 21.119867018598832, "learning_rate": 7.626100336244617e-08, "loss": 0.7668, "step": 5958 }, { "epoch": 0.5890813829918691, "grad_norm": 4.86522905840004, "learning_rate": 7.622989879619017e-08, "loss": 0.7268, "step": 5959 }, { "epoch": 0.5891802387366236, "grad_norm": 3.746260723606131, "learning_rate": 7.619879666720741e-08, "loss": 0.8577, "step": 5960 }, { "epoch": 0.589279094481378, "grad_norm": 4.538439354328951, "learning_rate": 7.616769697868697e-08, "loss": 0.7946, "step": 5961 }, { "epoch": 0.5893779502261325, "grad_norm": 3.7909250546936484, "learning_rate": 7.613659973381768e-08, "loss": 0.709, "step": 5962 }, { "epoch": 0.589476805970887, "grad_norm": 3.727142886436844, "learning_rate": 7.610550493578802e-08, "loss": 0.7223, "step": 5963 }, { "epoch": 0.5895756617156415, "grad_norm": 3.7323233324712755, "learning_rate": 7.607441258778635e-08, "loss": 0.7481, "step": 5964 }, { "epoch": 0.5896745174603959, "grad_norm": 6.502522920904786, "learning_rate": 7.60433226930008e-08, "loss": 0.6534, "step": 5965 }, { "epoch": 0.5897733732051503, "grad_norm": 6.205588024023494, "learning_rate": 7.601223525461908e-08, "loss": 0.6618, "step": 5966 }, { "epoch": 0.5898722289499049, "grad_norm": 3.613633261360325, "learning_rate": 7.598115027582879e-08, "loss": 0.6338, "step": 5967 }, { "epoch": 0.5899710846946593, "grad_norm": 5.074091465929371, "learning_rate": 7.595006775981727e-08, "loss": 0.7512, "step": 5968 }, { "epoch": 0.5900699404394137, "grad_norm": 4.354136987998412, "learning_rate": 7.591898770977153e-08, "loss": 0.6967, "step": 5969 }, { "epoch": 0.5901687961841683, "grad_norm": 4.053908132522949, "learning_rate": 7.588791012887835e-08, "loss": 0.771, "step": 5970 }, { "epoch": 0.5902676519289227, "grad_norm": 3.2175591927237868, "learning_rate": 7.585683502032436e-08, "loss": 0.6931, "step": 5971 }, { "epoch": 0.5903665076736772, "grad_norm": 6.484221934247272, "learning_rate": 7.582576238729576e-08, "loss": 0.6761, "step": 5972 }, { "epoch": 0.5904653634184317, "grad_norm": 4.258239134911527, "learning_rate": 7.579469223297867e-08, "loss": 0.747, "step": 5973 }, { "epoch": 0.5905642191631861, "grad_norm": 4.72034245351533, "learning_rate": 7.576362456055878e-08, "loss": 0.7118, "step": 5974 }, { "epoch": 0.5906630749079406, "grad_norm": 6.336388888693745, "learning_rate": 7.573255937322169e-08, "loss": 0.814, "step": 5975 }, { "epoch": 0.590761930652695, "grad_norm": 5.02855401409818, "learning_rate": 7.570149667415273e-08, "loss": 0.6643, "step": 5976 }, { "epoch": 0.5908607863974495, "grad_norm": 4.39796329716832, "learning_rate": 7.56704364665368e-08, "loss": 0.6095, "step": 5977 }, { "epoch": 0.590959642142204, "grad_norm": 5.608328426287897, "learning_rate": 7.563937875355873e-08, "loss": 0.7317, "step": 5978 }, { "epoch": 0.5910584978869584, "grad_norm": 3.469008190547643, "learning_rate": 7.560832353840306e-08, "loss": 0.7053, "step": 5979 }, { "epoch": 0.591157353631713, "grad_norm": 5.366534365916716, "learning_rate": 7.557727082425397e-08, "loss": 0.7029, "step": 5980 }, { "epoch": 0.5912562093764674, "grad_norm": 4.610105854821648, "learning_rate": 7.55462206142955e-08, "loss": 0.7367, "step": 5981 }, { "epoch": 0.5913550651212218, "grad_norm": 3.8365163859550773, "learning_rate": 7.551517291171142e-08, "loss": 0.7192, "step": 5982 }, { "epoch": 0.5914539208659764, "grad_norm": 4.366245778155662, "learning_rate": 7.548412771968513e-08, "loss": 0.7876, "step": 5983 }, { "epoch": 0.5915527766107308, "grad_norm": 4.0856823760413326, "learning_rate": 7.545308504139996e-08, "loss": 0.7759, "step": 5984 }, { "epoch": 0.5916516323554852, "grad_norm": 4.780251662975842, "learning_rate": 7.542204488003878e-08, "loss": 0.754, "step": 5985 }, { "epoch": 0.5917504881002398, "grad_norm": 2.6386679171344887, "learning_rate": 7.539100723878437e-08, "loss": 0.6029, "step": 5986 }, { "epoch": 0.5918493438449942, "grad_norm": 3.1868580309186982, "learning_rate": 7.535997212081918e-08, "loss": 0.6794, "step": 5987 }, { "epoch": 0.5919481995897486, "grad_norm": 6.70439307677369, "learning_rate": 7.532893952932536e-08, "loss": 0.7758, "step": 5988 }, { "epoch": 0.5920470553345031, "grad_norm": 4.567660245960072, "learning_rate": 7.529790946748486e-08, "loss": 0.6809, "step": 5989 }, { "epoch": 0.5921459110792576, "grad_norm": 3.892820659011896, "learning_rate": 7.526688193847939e-08, "loss": 0.6416, "step": 5990 }, { "epoch": 0.5922447668240121, "grad_norm": 4.293167158478321, "learning_rate": 7.52358569454903e-08, "loss": 0.7329, "step": 5991 }, { "epoch": 0.5923436225687665, "grad_norm": 3.960476994924039, "learning_rate": 7.520483449169879e-08, "loss": 0.7759, "step": 5992 }, { "epoch": 0.592442478313521, "grad_norm": 40.77267531544019, "learning_rate": 7.517381458028578e-08, "loss": 0.6951, "step": 5993 }, { "epoch": 0.5925413340582755, "grad_norm": 11.454351494669874, "learning_rate": 7.514279721443183e-08, "loss": 0.7291, "step": 5994 }, { "epoch": 0.5926401898030299, "grad_norm": 3.3031246225380806, "learning_rate": 7.511178239731741e-08, "loss": 0.6393, "step": 5995 }, { "epoch": 0.5927390455477844, "grad_norm": 3.6377100536063467, "learning_rate": 7.508077013212256e-08, "loss": 0.7855, "step": 5996 }, { "epoch": 0.5928379012925389, "grad_norm": 7.980435991727292, "learning_rate": 7.504976042202712e-08, "loss": 0.6373, "step": 5997 }, { "epoch": 0.5929367570372933, "grad_norm": 3.992715987286459, "learning_rate": 7.501875327021075e-08, "loss": 0.6206, "step": 5998 }, { "epoch": 0.5930356127820477, "grad_norm": 13.002390362681936, "learning_rate": 7.498774867985269e-08, "loss": 0.7444, "step": 5999 }, { "epoch": 0.5931344685268023, "grad_norm": 3.678034102743296, "learning_rate": 7.495674665413204e-08, "loss": 0.7966, "step": 6000 }, { "epoch": 0.5932333242715567, "grad_norm": 2.958795622155242, "learning_rate": 7.492574719622766e-08, "loss": 0.7263, "step": 6001 }, { "epoch": 0.5933321800163112, "grad_norm": 6.6265387940336575, "learning_rate": 7.489475030931797e-08, "loss": 0.7132, "step": 6002 }, { "epoch": 0.5934310357610657, "grad_norm": 5.179298312076587, "learning_rate": 7.486375599658135e-08, "loss": 0.6492, "step": 6003 }, { "epoch": 0.5935298915058201, "grad_norm": 5.077244939220825, "learning_rate": 7.483276426119577e-08, "loss": 0.7077, "step": 6004 }, { "epoch": 0.5936287472505746, "grad_norm": 3.634273306199684, "learning_rate": 7.480177510633897e-08, "loss": 0.6865, "step": 6005 }, { "epoch": 0.5937276029953291, "grad_norm": 3.9029252169169286, "learning_rate": 7.477078853518844e-08, "loss": 0.6881, "step": 6006 }, { "epoch": 0.5938264587400836, "grad_norm": 4.843816493424362, "learning_rate": 7.473980455092134e-08, "loss": 0.69, "step": 6007 }, { "epoch": 0.593925314484838, "grad_norm": 3.124920780123082, "learning_rate": 7.470882315671469e-08, "loss": 0.7757, "step": 6008 }, { "epoch": 0.5940241702295924, "grad_norm": 3.6662147510433343, "learning_rate": 7.46778443557452e-08, "loss": 0.6135, "step": 6009 }, { "epoch": 0.594123025974347, "grad_norm": 4.831140780655563, "learning_rate": 7.464686815118919e-08, "loss": 0.6818, "step": 6010 }, { "epoch": 0.5942218817191014, "grad_norm": 8.16578223054908, "learning_rate": 7.461589454622285e-08, "loss": 0.6626, "step": 6011 }, { "epoch": 0.5943207374638558, "grad_norm": 2.702353838837334, "learning_rate": 7.458492354402214e-08, "loss": 0.6677, "step": 6012 }, { "epoch": 0.5944195932086104, "grad_norm": 5.101901862370919, "learning_rate": 7.455395514776261e-08, "loss": 0.7578, "step": 6013 }, { "epoch": 0.5945184489533648, "grad_norm": 2.9987383287034324, "learning_rate": 7.45229893606196e-08, "loss": 0.6367, "step": 6014 }, { "epoch": 0.5946173046981192, "grad_norm": 4.0480747926596115, "learning_rate": 7.449202618576827e-08, "loss": 0.6138, "step": 6015 }, { "epoch": 0.5947161604428738, "grad_norm": 4.593445423636252, "learning_rate": 7.446106562638332e-08, "loss": 0.6688, "step": 6016 }, { "epoch": 0.5948150161876282, "grad_norm": 3.0311974093111504, "learning_rate": 7.443010768563942e-08, "loss": 0.7457, "step": 6017 }, { "epoch": 0.5949138719323827, "grad_norm": 17.06387839503038, "learning_rate": 7.439915236671075e-08, "loss": 0.6977, "step": 6018 }, { "epoch": 0.5950127276771371, "grad_norm": 3.5923556848685863, "learning_rate": 7.436819967277135e-08, "loss": 0.7306, "step": 6019 }, { "epoch": 0.5951115834218916, "grad_norm": 3.574045115704137, "learning_rate": 7.433724960699504e-08, "loss": 0.6723, "step": 6020 }, { "epoch": 0.5952104391666461, "grad_norm": 5.1793874012375625, "learning_rate": 7.430630217255518e-08, "loss": 0.6239, "step": 6021 }, { "epoch": 0.5953092949114005, "grad_norm": 3.355606893562129, "learning_rate": 7.427535737262505e-08, "loss": 0.7383, "step": 6022 }, { "epoch": 0.595408150656155, "grad_norm": 2.9126303612879925, "learning_rate": 7.424441521037757e-08, "loss": 0.7774, "step": 6023 }, { "epoch": 0.5955070064009095, "grad_norm": 4.088861054070277, "learning_rate": 7.421347568898535e-08, "loss": 0.7289, "step": 6024 }, { "epoch": 0.5956058621456639, "grad_norm": 4.261131713838084, "learning_rate": 7.418253881162082e-08, "loss": 0.6978, "step": 6025 }, { "epoch": 0.5957047178904185, "grad_norm": 3.8383997988007614, "learning_rate": 7.415160458145615e-08, "loss": 0.5824, "step": 6026 }, { "epoch": 0.5958035736351729, "grad_norm": 3.632572858001046, "learning_rate": 7.41206730016631e-08, "loss": 0.772, "step": 6027 }, { "epoch": 0.5959024293799273, "grad_norm": 4.073657314247237, "learning_rate": 7.408974407541326e-08, "loss": 0.7002, "step": 6028 }, { "epoch": 0.5960012851246818, "grad_norm": 3.486117115901315, "learning_rate": 7.405881780587802e-08, "loss": 0.6797, "step": 6029 }, { "epoch": 0.5961001408694363, "grad_norm": 31.092973767989847, "learning_rate": 7.402789419622834e-08, "loss": 0.7706, "step": 6030 }, { "epoch": 0.5961989966141907, "grad_norm": 3.332056725205519, "learning_rate": 7.399697324963499e-08, "loss": 0.6799, "step": 6031 }, { "epoch": 0.5962978523589452, "grad_norm": 6.689025615370689, "learning_rate": 7.396605496926847e-08, "loss": 0.6381, "step": 6032 }, { "epoch": 0.5963967081036997, "grad_norm": 4.318722212419483, "learning_rate": 7.393513935829894e-08, "loss": 0.633, "step": 6033 }, { "epoch": 0.5964955638484541, "grad_norm": 3.796785721419977, "learning_rate": 7.390422641989644e-08, "loss": 0.7607, "step": 6034 }, { "epoch": 0.5965944195932086, "grad_norm": 3.8894743573572494, "learning_rate": 7.387331615723054e-08, "loss": 0.7246, "step": 6035 }, { "epoch": 0.5966932753379631, "grad_norm": 5.40320474344557, "learning_rate": 7.384240857347068e-08, "loss": 0.6774, "step": 6036 }, { "epoch": 0.5967921310827176, "grad_norm": 6.688019270873335, "learning_rate": 7.381150367178601e-08, "loss": 0.632, "step": 6037 }, { "epoch": 0.596890986827472, "grad_norm": 8.552557628329735, "learning_rate": 7.378060145534528e-08, "loss": 0.659, "step": 6038 }, { "epoch": 0.5969898425722264, "grad_norm": 3.2469217460299755, "learning_rate": 7.374970192731713e-08, "loss": 0.7284, "step": 6039 }, { "epoch": 0.597088698316981, "grad_norm": 6.832405342256464, "learning_rate": 7.371880509086985e-08, "loss": 0.6967, "step": 6040 }, { "epoch": 0.5971875540617354, "grad_norm": 5.025383214870353, "learning_rate": 7.368791094917139e-08, "loss": 0.7521, "step": 6041 }, { "epoch": 0.5972864098064898, "grad_norm": 4.842876624402265, "learning_rate": 7.365701950538958e-08, "loss": 0.719, "step": 6042 }, { "epoch": 0.5973852655512444, "grad_norm": 3.7835024575051865, "learning_rate": 7.36261307626918e-08, "loss": 0.5807, "step": 6043 }, { "epoch": 0.5974841212959988, "grad_norm": 3.899651666319529, "learning_rate": 7.359524472424525e-08, "loss": 0.7377, "step": 6044 }, { "epoch": 0.5975829770407532, "grad_norm": 4.61916050442917, "learning_rate": 7.35643613932169e-08, "loss": 0.6896, "step": 6045 }, { "epoch": 0.5976818327855078, "grad_norm": 6.6308882872472745, "learning_rate": 7.35334807727733e-08, "loss": 0.7051, "step": 6046 }, { "epoch": 0.5977806885302622, "grad_norm": 3.1473779658040635, "learning_rate": 7.350260286608086e-08, "loss": 0.7668, "step": 6047 }, { "epoch": 0.5978795442750167, "grad_norm": 4.984498897589573, "learning_rate": 7.347172767630562e-08, "loss": 0.6651, "step": 6048 }, { "epoch": 0.5979784000197711, "grad_norm": 7.775846736170326, "learning_rate": 7.34408552066134e-08, "loss": 0.6327, "step": 6049 }, { "epoch": 0.5980772557645256, "grad_norm": 7.0297045816479615, "learning_rate": 7.340998546016967e-08, "loss": 0.6666, "step": 6050 }, { "epoch": 0.5981761115092801, "grad_norm": 2.892086370735846, "learning_rate": 7.337911844013975e-08, "loss": 0.7016, "step": 6051 }, { "epoch": 0.5982749672540345, "grad_norm": 7.5094717632948536, "learning_rate": 7.334825414968852e-08, "loss": 0.8374, "step": 6052 }, { "epoch": 0.598373822998789, "grad_norm": 5.726785843702308, "learning_rate": 7.331739259198071e-08, "loss": 0.7381, "step": 6053 }, { "epoch": 0.5984726787435435, "grad_norm": 3.930157864628389, "learning_rate": 7.328653377018066e-08, "loss": 0.7291, "step": 6054 }, { "epoch": 0.5985715344882979, "grad_norm": 6.819135986973994, "learning_rate": 7.32556776874525e-08, "loss": 0.7123, "step": 6055 }, { "epoch": 0.5986703902330525, "grad_norm": 3.227003923682198, "learning_rate": 7.322482434696017e-08, "loss": 0.6966, "step": 6056 }, { "epoch": 0.5987692459778069, "grad_norm": 3.265060805047526, "learning_rate": 7.319397375186708e-08, "loss": 0.6709, "step": 6057 }, { "epoch": 0.5988681017225613, "grad_norm": 5.405244423477094, "learning_rate": 7.316312590533658e-08, "loss": 0.6975, "step": 6058 }, { "epoch": 0.5989669574673159, "grad_norm": 2.920350681249926, "learning_rate": 7.313228081053167e-08, "loss": 0.7587, "step": 6059 }, { "epoch": 0.5990658132120703, "grad_norm": 3.438678371445836, "learning_rate": 7.310143847061499e-08, "loss": 0.6443, "step": 6060 }, { "epoch": 0.5991646689568247, "grad_norm": 6.315888802415208, "learning_rate": 7.307059888874902e-08, "loss": 0.7457, "step": 6061 }, { "epoch": 0.5992635247015792, "grad_norm": 3.3929897080859712, "learning_rate": 7.303976206809594e-08, "loss": 0.7628, "step": 6062 }, { "epoch": 0.5993623804463337, "grad_norm": 4.98229535966313, "learning_rate": 7.30089280118175e-08, "loss": 0.6978, "step": 6063 }, { "epoch": 0.5994612361910882, "grad_norm": 3.240583751948194, "learning_rate": 7.297809672307542e-08, "loss": 0.7849, "step": 6064 }, { "epoch": 0.5995600919358426, "grad_norm": 4.195874643109401, "learning_rate": 7.294726820503088e-08, "loss": 0.7559, "step": 6065 }, { "epoch": 0.5996589476805971, "grad_norm": 12.692466151883247, "learning_rate": 7.291644246084494e-08, "loss": 0.6588, "step": 6066 }, { "epoch": 0.5997578034253516, "grad_norm": 3.4396118437224246, "learning_rate": 7.288561949367833e-08, "loss": 0.7803, "step": 6067 }, { "epoch": 0.599856659170106, "grad_norm": 5.256114037161938, "learning_rate": 7.285479930669144e-08, "loss": 0.8087, "step": 6068 }, { "epoch": 0.5999555149148605, "grad_norm": 3.5339806913882286, "learning_rate": 7.282398190304446e-08, "loss": 0.7731, "step": 6069 }, { "epoch": 0.600054370659615, "grad_norm": 3.285794909785628, "learning_rate": 7.27931672858973e-08, "loss": 0.6149, "step": 6070 }, { "epoch": 0.6001532264043694, "grad_norm": 3.726336648155763, "learning_rate": 7.276235545840947e-08, "loss": 0.7443, "step": 6071 }, { "epoch": 0.6002520821491238, "grad_norm": 2.8162038205518853, "learning_rate": 7.273154642374032e-08, "loss": 0.7624, "step": 6072 }, { "epoch": 0.6003509378938784, "grad_norm": 4.067303009874905, "learning_rate": 7.270074018504886e-08, "loss": 0.6704, "step": 6073 }, { "epoch": 0.6004497936386328, "grad_norm": 3.49053605977276, "learning_rate": 7.266993674549378e-08, "loss": 0.6122, "step": 6074 }, { "epoch": 0.6005486493833873, "grad_norm": 3.3485262962771953, "learning_rate": 7.263913610823356e-08, "loss": 0.6824, "step": 6075 }, { "epoch": 0.6006475051281418, "grad_norm": 3.654108547114296, "learning_rate": 7.260833827642632e-08, "loss": 0.7043, "step": 6076 }, { "epoch": 0.6007463608728962, "grad_norm": 3.7112913621750745, "learning_rate": 7.257754325322991e-08, "loss": 0.6876, "step": 6077 }, { "epoch": 0.6008452166176507, "grad_norm": 2.912053091827616, "learning_rate": 7.254675104180197e-08, "loss": 0.6353, "step": 6078 }, { "epoch": 0.6009440723624052, "grad_norm": 3.5679808672419577, "learning_rate": 7.251596164529972e-08, "loss": 0.6586, "step": 6079 }, { "epoch": 0.6010429281071596, "grad_norm": 3.2678992817610526, "learning_rate": 7.248517506688016e-08, "loss": 0.6901, "step": 6080 }, { "epoch": 0.6011417838519141, "grad_norm": 4.405056440417408, "learning_rate": 7.24543913097001e-08, "loss": 0.7062, "step": 6081 }, { "epoch": 0.6012406395966685, "grad_norm": 7.323425730250339, "learning_rate": 7.242361037691582e-08, "loss": 0.7787, "step": 6082 }, { "epoch": 0.6013394953414231, "grad_norm": 6.79452911100599, "learning_rate": 7.239283227168355e-08, "loss": 0.6341, "step": 6083 }, { "epoch": 0.6014383510861775, "grad_norm": 5.561736958860742, "learning_rate": 7.236205699715911e-08, "loss": 0.5904, "step": 6084 }, { "epoch": 0.6015372068309319, "grad_norm": 3.22113669895406, "learning_rate": 7.2331284556498e-08, "loss": 0.6263, "step": 6085 }, { "epoch": 0.6016360625756865, "grad_norm": 4.40371186059166, "learning_rate": 7.230051495285554e-08, "loss": 0.6635, "step": 6086 }, { "epoch": 0.6017349183204409, "grad_norm": 6.9142193971938335, "learning_rate": 7.226974818938665e-08, "loss": 0.8103, "step": 6087 }, { "epoch": 0.6018337740651953, "grad_norm": 4.3490551901122645, "learning_rate": 7.223898426924603e-08, "loss": 0.7195, "step": 6088 }, { "epoch": 0.6019326298099499, "grad_norm": 5.581394427827968, "learning_rate": 7.220822319558811e-08, "loss": 0.6964, "step": 6089 }, { "epoch": 0.6020314855547043, "grad_norm": 3.408776865121762, "learning_rate": 7.21774649715669e-08, "loss": 0.5861, "step": 6090 }, { "epoch": 0.6021303412994587, "grad_norm": 4.858221083729485, "learning_rate": 7.214670960033626e-08, "loss": 0.7265, "step": 6091 }, { "epoch": 0.6022291970442132, "grad_norm": 3.143802770360347, "learning_rate": 7.21159570850497e-08, "loss": 0.749, "step": 6092 }, { "epoch": 0.6023280527889677, "grad_norm": 6.191496082532079, "learning_rate": 7.208520742886041e-08, "loss": 0.6746, "step": 6093 }, { "epoch": 0.6024269085337222, "grad_norm": 3.6751360032162403, "learning_rate": 7.205446063492133e-08, "loss": 0.6748, "step": 6094 }, { "epoch": 0.6025257642784766, "grad_norm": 7.100666803967072, "learning_rate": 7.202371670638511e-08, "loss": 0.7121, "step": 6095 }, { "epoch": 0.6026246200232311, "grad_norm": 4.749101269619705, "learning_rate": 7.199297564640401e-08, "loss": 0.6581, "step": 6096 }, { "epoch": 0.6027234757679856, "grad_norm": 3.8125619985215, "learning_rate": 7.196223745813018e-08, "loss": 0.6857, "step": 6097 }, { "epoch": 0.60282233151274, "grad_norm": 6.779419157432558, "learning_rate": 7.193150214471527e-08, "loss": 0.6956, "step": 6098 }, { "epoch": 0.6029211872574946, "grad_norm": 12.142110300253409, "learning_rate": 7.190076970931079e-08, "loss": 0.6761, "step": 6099 }, { "epoch": 0.603020043002249, "grad_norm": 4.36426237816225, "learning_rate": 7.18700401550679e-08, "loss": 0.7272, "step": 6100 }, { "epoch": 0.6031188987470034, "grad_norm": 4.564992063919095, "learning_rate": 7.183931348513743e-08, "loss": 0.7627, "step": 6101 }, { "epoch": 0.6032177544917579, "grad_norm": 4.93146704427161, "learning_rate": 7.180858970266994e-08, "loss": 0.7398, "step": 6102 }, { "epoch": 0.6033166102365124, "grad_norm": 3.550226324272571, "learning_rate": 7.177786881081578e-08, "loss": 0.707, "step": 6103 }, { "epoch": 0.6034154659812668, "grad_norm": 3.183475386004774, "learning_rate": 7.174715081272482e-08, "loss": 0.7346, "step": 6104 }, { "epoch": 0.6035143217260213, "grad_norm": 4.779806380283616, "learning_rate": 7.171643571154678e-08, "loss": 0.7834, "step": 6105 }, { "epoch": 0.6036131774707758, "grad_norm": 4.951433934543384, "learning_rate": 7.168572351043109e-08, "loss": 0.6414, "step": 6106 }, { "epoch": 0.6037120332155302, "grad_norm": 10.327166515549669, "learning_rate": 7.165501421252675e-08, "loss": 0.7062, "step": 6107 }, { "epoch": 0.6038108889602847, "grad_norm": 4.056658359667362, "learning_rate": 7.162430782098262e-08, "loss": 0.6627, "step": 6108 }, { "epoch": 0.6039097447050392, "grad_norm": 7.9760866068268825, "learning_rate": 7.159360433894711e-08, "loss": 0.7664, "step": 6109 }, { "epoch": 0.6040086004497937, "grad_norm": 7.309268465093565, "learning_rate": 7.156290376956848e-08, "loss": 0.7712, "step": 6110 }, { "epoch": 0.6041074561945481, "grad_norm": 4.1349689919889, "learning_rate": 7.153220611599461e-08, "loss": 0.6894, "step": 6111 }, { "epoch": 0.6042063119393025, "grad_norm": 8.005353554299786, "learning_rate": 7.150151138137304e-08, "loss": 0.7137, "step": 6112 }, { "epoch": 0.6043051676840571, "grad_norm": 3.6781717545585058, "learning_rate": 7.147081956885109e-08, "loss": 0.7527, "step": 6113 }, { "epoch": 0.6044040234288115, "grad_norm": 3.638303787436359, "learning_rate": 7.144013068157579e-08, "loss": 0.6572, "step": 6114 }, { "epoch": 0.6045028791735659, "grad_norm": 5.377356417554445, "learning_rate": 7.140944472269376e-08, "loss": 0.842, "step": 6115 }, { "epoch": 0.6046017349183205, "grad_norm": 7.240689908400418, "learning_rate": 7.137876169535146e-08, "loss": 0.7056, "step": 6116 }, { "epoch": 0.6047005906630749, "grad_norm": 13.217449244704335, "learning_rate": 7.134808160269494e-08, "loss": 0.7177, "step": 6117 }, { "epoch": 0.6047994464078293, "grad_norm": 6.293442975708147, "learning_rate": 7.131740444787003e-08, "loss": 0.6685, "step": 6118 }, { "epoch": 0.6048983021525839, "grad_norm": 3.9279272260711604, "learning_rate": 7.128673023402219e-08, "loss": 0.74, "step": 6119 }, { "epoch": 0.6049971578973383, "grad_norm": 5.7837658422224925, "learning_rate": 7.125605896429661e-08, "loss": 0.7253, "step": 6120 }, { "epoch": 0.6050960136420928, "grad_norm": 7.679579766788226, "learning_rate": 7.122539064183816e-08, "loss": 0.7588, "step": 6121 }, { "epoch": 0.6051948693868472, "grad_norm": 4.3001770986102335, "learning_rate": 7.119472526979149e-08, "loss": 0.6279, "step": 6122 }, { "epoch": 0.6052937251316017, "grad_norm": 3.014875805161474, "learning_rate": 7.116406285130079e-08, "loss": 0.6052, "step": 6123 }, { "epoch": 0.6053925808763562, "grad_norm": 7.9074998916701595, "learning_rate": 7.113340338951008e-08, "loss": 0.7348, "step": 6124 }, { "epoch": 0.6054914366211106, "grad_norm": 8.708431587075943, "learning_rate": 7.110274688756308e-08, "loss": 0.6498, "step": 6125 }, { "epoch": 0.6055902923658651, "grad_norm": 2.8975018779483053, "learning_rate": 7.107209334860309e-08, "loss": 0.7577, "step": 6126 }, { "epoch": 0.6056891481106196, "grad_norm": 4.482538609304724, "learning_rate": 7.104144277577323e-08, "loss": 0.765, "step": 6127 }, { "epoch": 0.605788003855374, "grad_norm": 3.6435330986647565, "learning_rate": 7.101079517221626e-08, "loss": 0.8837, "step": 6128 }, { "epoch": 0.6058868596001286, "grad_norm": 3.446152494247545, "learning_rate": 7.09801505410746e-08, "loss": 0.7771, "step": 6129 }, { "epoch": 0.605985715344883, "grad_norm": 6.3843022772507485, "learning_rate": 7.094950888549045e-08, "loss": 0.7145, "step": 6130 }, { "epoch": 0.6060845710896374, "grad_norm": 8.279078926085116, "learning_rate": 7.091887020860561e-08, "loss": 0.6372, "step": 6131 }, { "epoch": 0.6061834268343919, "grad_norm": 41.07874655386906, "learning_rate": 7.088823451356163e-08, "loss": 0.7807, "step": 6132 }, { "epoch": 0.6062822825791464, "grad_norm": 16.05306619691092, "learning_rate": 7.085760180349984e-08, "loss": 0.7045, "step": 6133 }, { "epoch": 0.6063811383239008, "grad_norm": 2.9571142301152604, "learning_rate": 7.082697208156105e-08, "loss": 0.7697, "step": 6134 }, { "epoch": 0.6064799940686553, "grad_norm": 8.846272447427229, "learning_rate": 7.079634535088597e-08, "loss": 0.78, "step": 6135 }, { "epoch": 0.6065788498134098, "grad_norm": 4.289469846473578, "learning_rate": 7.076572161461488e-08, "loss": 0.7307, "step": 6136 }, { "epoch": 0.6066777055581642, "grad_norm": 6.136987673804083, "learning_rate": 7.073510087588781e-08, "loss": 0.7275, "step": 6137 }, { "epoch": 0.6067765613029187, "grad_norm": 4.437671437956001, "learning_rate": 7.070448313784442e-08, "loss": 0.8118, "step": 6138 }, { "epoch": 0.6068754170476732, "grad_norm": 4.0954426429409985, "learning_rate": 7.067386840362422e-08, "loss": 0.6747, "step": 6139 }, { "epoch": 0.6069742727924277, "grad_norm": 6.663765971038575, "learning_rate": 7.064325667636615e-08, "loss": 0.7144, "step": 6140 }, { "epoch": 0.6070731285371821, "grad_norm": 7.032556211999101, "learning_rate": 7.061264795920914e-08, "loss": 0.6316, "step": 6141 }, { "epoch": 0.6071719842819366, "grad_norm": 2.8222037034359424, "learning_rate": 7.058204225529152e-08, "loss": 0.6705, "step": 6142 }, { "epoch": 0.6072708400266911, "grad_norm": 5.29213207767586, "learning_rate": 7.055143956775155e-08, "loss": 0.5493, "step": 6143 }, { "epoch": 0.6073696957714455, "grad_norm": 4.443996388712738, "learning_rate": 7.052083989972706e-08, "loss": 0.7234, "step": 6144 }, { "epoch": 0.6074685515161999, "grad_norm": 6.38134241668575, "learning_rate": 7.049024325435558e-08, "loss": 0.7197, "step": 6145 }, { "epoch": 0.6075674072609545, "grad_norm": 4.125585818492529, "learning_rate": 7.045964963477433e-08, "loss": 0.6818, "step": 6146 }, { "epoch": 0.6076662630057089, "grad_norm": 5.460499115585014, "learning_rate": 7.042905904412031e-08, "loss": 0.6702, "step": 6147 }, { "epoch": 0.6077651187504634, "grad_norm": 3.185490521620197, "learning_rate": 7.039847148553004e-08, "loss": 0.6157, "step": 6148 }, { "epoch": 0.6078639744952179, "grad_norm": 8.996580345814403, "learning_rate": 7.036788696213986e-08, "loss": 0.5816, "step": 6149 }, { "epoch": 0.6079628302399723, "grad_norm": 7.563929909720708, "learning_rate": 7.033730547708583e-08, "loss": 0.8294, "step": 6150 }, { "epoch": 0.6080616859847268, "grad_norm": 5.334148136496572, "learning_rate": 7.030672703350351e-08, "loss": 0.6902, "step": 6151 }, { "epoch": 0.6081605417294813, "grad_norm": 3.2007011358107382, "learning_rate": 7.027615163452837e-08, "loss": 0.6602, "step": 6152 }, { "epoch": 0.6082593974742357, "grad_norm": 5.706528930385931, "learning_rate": 7.024557928329537e-08, "loss": 0.6738, "step": 6153 }, { "epoch": 0.6083582532189902, "grad_norm": 3.4503841791674277, "learning_rate": 7.021500998293935e-08, "loss": 0.6479, "step": 6154 }, { "epoch": 0.6084571089637446, "grad_norm": 2.8930884688935015, "learning_rate": 7.01844437365947e-08, "loss": 0.6789, "step": 6155 }, { "epoch": 0.6085559647084992, "grad_norm": 5.0808361526697015, "learning_rate": 7.015388054739549e-08, "loss": 0.7366, "step": 6156 }, { "epoch": 0.6086548204532536, "grad_norm": 3.5930953616493264, "learning_rate": 7.012332041847558e-08, "loss": 0.6205, "step": 6157 }, { "epoch": 0.608753676198008, "grad_norm": 4.217386192704726, "learning_rate": 7.009276335296848e-08, "loss": 0.618, "step": 6158 }, { "epoch": 0.6088525319427626, "grad_norm": 3.637088119307671, "learning_rate": 7.006220935400731e-08, "loss": 0.6864, "step": 6159 }, { "epoch": 0.608951387687517, "grad_norm": 3.208677652873969, "learning_rate": 7.003165842472496e-08, "loss": 0.7245, "step": 6160 }, { "epoch": 0.6090502434322714, "grad_norm": 3.3939628477862995, "learning_rate": 7.000111056825399e-08, "loss": 0.7726, "step": 6161 }, { "epoch": 0.609149099177026, "grad_norm": 8.322265139819734, "learning_rate": 6.997056578772661e-08, "loss": 0.6964, "step": 6162 }, { "epoch": 0.6092479549217804, "grad_norm": 2.990147228259665, "learning_rate": 6.994002408627474e-08, "loss": 0.5972, "step": 6163 }, { "epoch": 0.6093468106665348, "grad_norm": 16.36466765194963, "learning_rate": 6.990948546702999e-08, "loss": 0.7732, "step": 6164 }, { "epoch": 0.6094456664112893, "grad_norm": 5.656507031910142, "learning_rate": 6.987894993312362e-08, "loss": 0.7719, "step": 6165 }, { "epoch": 0.6095445221560438, "grad_norm": 4.387993127505781, "learning_rate": 6.984841748768664e-08, "loss": 0.7302, "step": 6166 }, { "epoch": 0.6096433779007983, "grad_norm": 7.039956825767253, "learning_rate": 6.981788813384966e-08, "loss": 0.6351, "step": 6167 }, { "epoch": 0.6097422336455527, "grad_norm": 4.659087742014038, "learning_rate": 6.978736187474302e-08, "loss": 0.6614, "step": 6168 }, { "epoch": 0.6098410893903072, "grad_norm": 13.172710758951316, "learning_rate": 6.97568387134968e-08, "loss": 0.6577, "step": 6169 }, { "epoch": 0.6099399451350617, "grad_norm": 5.538785065813369, "learning_rate": 6.972631865324064e-08, "loss": 0.7592, "step": 6170 }, { "epoch": 0.6100388008798161, "grad_norm": 2.98933881866553, "learning_rate": 6.969580169710389e-08, "loss": 0.5631, "step": 6171 }, { "epoch": 0.6101376566245706, "grad_norm": 2.8057294563027684, "learning_rate": 6.966528784821573e-08, "loss": 0.6713, "step": 6172 }, { "epoch": 0.6102365123693251, "grad_norm": 4.184486291097229, "learning_rate": 6.963477710970477e-08, "loss": 0.7819, "step": 6173 }, { "epoch": 0.6103353681140795, "grad_norm": 4.14598973494095, "learning_rate": 6.960426948469954e-08, "loss": 0.6659, "step": 6174 }, { "epoch": 0.610434223858834, "grad_norm": 4.442005608966007, "learning_rate": 6.957376497632807e-08, "loss": 0.7784, "step": 6175 }, { "epoch": 0.6105330796035885, "grad_norm": 3.7976578954273865, "learning_rate": 6.954326358771818e-08, "loss": 0.7583, "step": 6176 }, { "epoch": 0.6106319353483429, "grad_norm": 4.50317746368112, "learning_rate": 6.95127653219974e-08, "loss": 0.7364, "step": 6177 }, { "epoch": 0.6107307910930974, "grad_norm": 4.856367276441941, "learning_rate": 6.948227018229274e-08, "loss": 0.7946, "step": 6178 }, { "epoch": 0.6108296468378519, "grad_norm": 3.723820114024291, "learning_rate": 6.945177817173116e-08, "loss": 0.643, "step": 6179 }, { "epoch": 0.6109285025826063, "grad_norm": 6.012898570669482, "learning_rate": 6.942128929343909e-08, "loss": 0.7586, "step": 6180 }, { "epoch": 0.6110273583273608, "grad_norm": 20.252771162776323, "learning_rate": 6.939080355054274e-08, "loss": 0.651, "step": 6181 }, { "epoch": 0.6111262140721153, "grad_norm": 3.7952188613666586, "learning_rate": 6.936032094616795e-08, "loss": 0.7832, "step": 6182 }, { "epoch": 0.6112250698168697, "grad_norm": 4.178351858036433, "learning_rate": 6.932984148344033e-08, "loss": 0.6827, "step": 6183 }, { "epoch": 0.6113239255616242, "grad_norm": 3.0291990160402147, "learning_rate": 6.929936516548499e-08, "loss": 0.7678, "step": 6184 }, { "epoch": 0.6114227813063786, "grad_norm": 4.12973257057786, "learning_rate": 6.926889199542694e-08, "loss": 0.6995, "step": 6185 }, { "epoch": 0.6115216370511332, "grad_norm": 5.107059444909494, "learning_rate": 6.923842197639065e-08, "loss": 0.6293, "step": 6186 }, { "epoch": 0.6116204927958876, "grad_norm": 2.8863484693149295, "learning_rate": 6.920795511150045e-08, "loss": 0.7457, "step": 6187 }, { "epoch": 0.611719348540642, "grad_norm": 7.862364312192056, "learning_rate": 6.917749140388023e-08, "loss": 0.6754, "step": 6188 }, { "epoch": 0.6118182042853966, "grad_norm": 6.156064191160388, "learning_rate": 6.914703085665361e-08, "loss": 0.6481, "step": 6189 }, { "epoch": 0.611917060030151, "grad_norm": 5.336506090112736, "learning_rate": 6.911657347294385e-08, "loss": 0.6923, "step": 6190 }, { "epoch": 0.6120159157749054, "grad_norm": 2.944909938351846, "learning_rate": 6.908611925587395e-08, "loss": 0.6998, "step": 6191 }, { "epoch": 0.61211477151966, "grad_norm": 4.419652773412835, "learning_rate": 6.905566820856645e-08, "loss": 0.7413, "step": 6192 }, { "epoch": 0.6122136272644144, "grad_norm": 3.241371241130809, "learning_rate": 6.902522033414373e-08, "loss": 0.632, "step": 6193 }, { "epoch": 0.6123124830091689, "grad_norm": 4.996806842911084, "learning_rate": 6.89947756357278e-08, "loss": 0.7204, "step": 6194 }, { "epoch": 0.6124113387539233, "grad_norm": 5.205737629638206, "learning_rate": 6.896433411644023e-08, "loss": 0.7839, "step": 6195 }, { "epoch": 0.6125101944986778, "grad_norm": 4.554328849292869, "learning_rate": 6.893389577940241e-08, "loss": 0.7235, "step": 6196 }, { "epoch": 0.6126090502434323, "grad_norm": 9.007461067052628, "learning_rate": 6.89034606277353e-08, "loss": 0.7391, "step": 6197 }, { "epoch": 0.6127079059881867, "grad_norm": 6.962756218713341, "learning_rate": 6.887302866455962e-08, "loss": 0.6661, "step": 6198 }, { "epoch": 0.6128067617329412, "grad_norm": 4.274171958089775, "learning_rate": 6.88425998929957e-08, "loss": 0.6883, "step": 6199 }, { "epoch": 0.6129056174776957, "grad_norm": 11.931845265985965, "learning_rate": 6.881217431616352e-08, "loss": 0.7206, "step": 6200 }, { "epoch": 0.6130044732224501, "grad_norm": 3.092277314735298, "learning_rate": 6.87817519371828e-08, "loss": 0.7209, "step": 6201 }, { "epoch": 0.6131033289672047, "grad_norm": 3.0687625419703397, "learning_rate": 6.875133275917298e-08, "loss": 0.6177, "step": 6202 }, { "epoch": 0.6132021847119591, "grad_norm": 4.062303595073752, "learning_rate": 6.872091678525298e-08, "loss": 0.7045, "step": 6203 }, { "epoch": 0.6133010404567135, "grad_norm": 4.891724917850189, "learning_rate": 6.86905040185416e-08, "loss": 0.7055, "step": 6204 }, { "epoch": 0.613399896201468, "grad_norm": 3.883859304661967, "learning_rate": 6.866009446215717e-08, "loss": 0.7309, "step": 6205 }, { "epoch": 0.6134987519462225, "grad_norm": 3.1274452959711163, "learning_rate": 6.862968811921776e-08, "loss": 0.7119, "step": 6206 }, { "epoch": 0.6135976076909769, "grad_norm": 6.631919120302921, "learning_rate": 6.85992849928411e-08, "loss": 0.6683, "step": 6207 }, { "epoch": 0.6136964634357314, "grad_norm": 3.9032229404521264, "learning_rate": 6.856888508614455e-08, "loss": 0.6638, "step": 6208 }, { "epoch": 0.6137953191804859, "grad_norm": 5.370073232677507, "learning_rate": 6.853848840224518e-08, "loss": 0.7109, "step": 6209 }, { "epoch": 0.6138941749252403, "grad_norm": 4.847396970437173, "learning_rate": 6.850809494425979e-08, "loss": 0.7659, "step": 6210 }, { "epoch": 0.6139930306699948, "grad_norm": 6.606907931608661, "learning_rate": 6.847770471530467e-08, "loss": 0.6095, "step": 6211 }, { "epoch": 0.6140918864147493, "grad_norm": 4.5997332259545844, "learning_rate": 6.844731771849595e-08, "loss": 0.7539, "step": 6212 }, { "epoch": 0.6141907421595038, "grad_norm": 3.5505427503314193, "learning_rate": 6.841693395694941e-08, "loss": 0.6893, "step": 6213 }, { "epoch": 0.6142895979042582, "grad_norm": 4.644674789489748, "learning_rate": 6.838655343378036e-08, "loss": 0.7729, "step": 6214 }, { "epoch": 0.6143884536490127, "grad_norm": 4.92344174143465, "learning_rate": 6.835617615210394e-08, "loss": 0.7081, "step": 6215 }, { "epoch": 0.6144873093937672, "grad_norm": 5.23042078824137, "learning_rate": 6.83258021150349e-08, "loss": 0.8037, "step": 6216 }, { "epoch": 0.6145861651385216, "grad_norm": 9.242251788918047, "learning_rate": 6.829543132568759e-08, "loss": 0.784, "step": 6217 }, { "epoch": 0.614685020883276, "grad_norm": 8.69679550091551, "learning_rate": 6.826506378717616e-08, "loss": 0.7367, "step": 6218 }, { "epoch": 0.6147838766280306, "grad_norm": 3.153364076718097, "learning_rate": 6.823469950261428e-08, "loss": 0.6835, "step": 6219 }, { "epoch": 0.614882732372785, "grad_norm": 2.9668624931413228, "learning_rate": 6.820433847511538e-08, "loss": 0.7645, "step": 6220 }, { "epoch": 0.6149815881175394, "grad_norm": 10.935259864044317, "learning_rate": 6.817398070779261e-08, "loss": 0.8057, "step": 6221 }, { "epoch": 0.615080443862294, "grad_norm": 7.093704448137936, "learning_rate": 6.814362620375859e-08, "loss": 0.7025, "step": 6222 }, { "epoch": 0.6151792996070484, "grad_norm": 4.384723473528429, "learning_rate": 6.811327496612583e-08, "loss": 0.8146, "step": 6223 }, { "epoch": 0.6152781553518029, "grad_norm": 3.0471806316304217, "learning_rate": 6.808292699800635e-08, "loss": 0.7135, "step": 6224 }, { "epoch": 0.6153770110965574, "grad_norm": 7.279625831306129, "learning_rate": 6.80525823025119e-08, "loss": 0.63, "step": 6225 }, { "epoch": 0.6154758668413118, "grad_norm": 3.3237403938810837, "learning_rate": 6.802224088275386e-08, "loss": 0.7492, "step": 6226 }, { "epoch": 0.6155747225860663, "grad_norm": 8.18870448427928, "learning_rate": 6.799190274184335e-08, "loss": 0.7847, "step": 6227 }, { "epoch": 0.6156735783308207, "grad_norm": 3.7910888733726735, "learning_rate": 6.796156788289102e-08, "loss": 0.6959, "step": 6228 }, { "epoch": 0.6157724340755752, "grad_norm": 4.294180527062127, "learning_rate": 6.793123630900733e-08, "loss": 0.7288, "step": 6229 }, { "epoch": 0.6158712898203297, "grad_norm": 3.6922329874747533, "learning_rate": 6.790090802330228e-08, "loss": 0.7438, "step": 6230 }, { "epoch": 0.6159701455650841, "grad_norm": 4.81162728027057, "learning_rate": 6.787058302888565e-08, "loss": 0.7136, "step": 6231 }, { "epoch": 0.6160690013098387, "grad_norm": 3.274706760392854, "learning_rate": 6.784026132886676e-08, "loss": 0.7057, "step": 6232 }, { "epoch": 0.6161678570545931, "grad_norm": 4.751856226945673, "learning_rate": 6.78099429263547e-08, "loss": 0.5817, "step": 6233 }, { "epoch": 0.6162667127993475, "grad_norm": 4.193783336666513, "learning_rate": 6.777962782445814e-08, "loss": 0.6874, "step": 6234 }, { "epoch": 0.6163655685441021, "grad_norm": 4.896208182100557, "learning_rate": 6.774931602628549e-08, "loss": 0.7392, "step": 6235 }, { "epoch": 0.6164644242888565, "grad_norm": 3.378470669279497, "learning_rate": 6.771900753494473e-08, "loss": 0.7553, "step": 6236 }, { "epoch": 0.6165632800336109, "grad_norm": 11.927256860100345, "learning_rate": 6.768870235354354e-08, "loss": 0.6117, "step": 6237 }, { "epoch": 0.6166621357783654, "grad_norm": 3.8377416024393622, "learning_rate": 6.765840048518936e-08, "loss": 0.5667, "step": 6238 }, { "epoch": 0.6167609915231199, "grad_norm": 4.535338977268666, "learning_rate": 6.76281019329891e-08, "loss": 0.6744, "step": 6239 }, { "epoch": 0.6168598472678744, "grad_norm": 8.272872411950482, "learning_rate": 6.759780670004948e-08, "loss": 0.7106, "step": 6240 }, { "epoch": 0.6169587030126288, "grad_norm": 3.6726678398962593, "learning_rate": 6.756751478947682e-08, "loss": 0.7435, "step": 6241 }, { "epoch": 0.6170575587573833, "grad_norm": 5.960883474467487, "learning_rate": 6.753722620437709e-08, "loss": 0.758, "step": 6242 }, { "epoch": 0.6171564145021378, "grad_norm": 20.128041102672576, "learning_rate": 6.750694094785598e-08, "loss": 0.785, "step": 6243 }, { "epoch": 0.6172552702468922, "grad_norm": 3.138104728846168, "learning_rate": 6.747665902301874e-08, "loss": 0.8189, "step": 6244 }, { "epoch": 0.6173541259916467, "grad_norm": 13.82120287358253, "learning_rate": 6.744638043297035e-08, "loss": 0.7481, "step": 6245 }, { "epoch": 0.6174529817364012, "grad_norm": 3.575192087922712, "learning_rate": 6.741610518081551e-08, "loss": 0.7439, "step": 6246 }, { "epoch": 0.6175518374811556, "grad_norm": 19.427179705203343, "learning_rate": 6.738583326965837e-08, "loss": 0.7524, "step": 6247 }, { "epoch": 0.61765069322591, "grad_norm": 3.625597361531161, "learning_rate": 6.735556470260297e-08, "loss": 0.7847, "step": 6248 }, { "epoch": 0.6177495489706646, "grad_norm": 2.9780986087858965, "learning_rate": 6.732529948275288e-08, "loss": 0.5668, "step": 6249 }, { "epoch": 0.617848404715419, "grad_norm": 13.471357455358863, "learning_rate": 6.729503761321133e-08, "loss": 0.6893, "step": 6250 }, { "epoch": 0.6179472604601735, "grad_norm": 3.3916893170123283, "learning_rate": 6.726477909708123e-08, "loss": 0.6807, "step": 6251 }, { "epoch": 0.618046116204928, "grad_norm": 3.6340040080200624, "learning_rate": 6.723452393746519e-08, "loss": 0.6653, "step": 6252 }, { "epoch": 0.6181449719496824, "grad_norm": 4.0448200679631885, "learning_rate": 6.720427213746534e-08, "loss": 0.6453, "step": 6253 }, { "epoch": 0.6182438276944369, "grad_norm": 10.226989720994846, "learning_rate": 6.717402370018366e-08, "loss": 0.6821, "step": 6254 }, { "epoch": 0.6183426834391914, "grad_norm": 3.957763001026486, "learning_rate": 6.71437786287216e-08, "loss": 0.7745, "step": 6255 }, { "epoch": 0.6184415391839458, "grad_norm": 4.731305437730604, "learning_rate": 6.711353692618037e-08, "loss": 0.8482, "step": 6256 }, { "epoch": 0.6185403949287003, "grad_norm": 7.4669956353077875, "learning_rate": 6.708329859566086e-08, "loss": 0.7425, "step": 6257 }, { "epoch": 0.6186392506734547, "grad_norm": 3.6020710698928804, "learning_rate": 6.705306364026351e-08, "loss": 0.6738, "step": 6258 }, { "epoch": 0.6187381064182093, "grad_norm": 5.910326640846438, "learning_rate": 6.702283206308846e-08, "loss": 0.6654, "step": 6259 }, { "epoch": 0.6188369621629637, "grad_norm": 4.933082495467321, "learning_rate": 6.699260386723558e-08, "loss": 0.6579, "step": 6260 }, { "epoch": 0.6189358179077181, "grad_norm": 4.607648434896453, "learning_rate": 6.696237905580422e-08, "loss": 0.6754, "step": 6261 }, { "epoch": 0.6190346736524727, "grad_norm": 4.896115761004341, "learning_rate": 6.693215763189361e-08, "loss": 0.783, "step": 6262 }, { "epoch": 0.6191335293972271, "grad_norm": 4.181545759128898, "learning_rate": 6.69019395986024e-08, "loss": 0.7463, "step": 6263 }, { "epoch": 0.6192323851419815, "grad_norm": 3.5392519789266585, "learning_rate": 6.687172495902907e-08, "loss": 0.673, "step": 6264 }, { "epoch": 0.6193312408867361, "grad_norm": 3.1205880188763606, "learning_rate": 6.684151371627168e-08, "loss": 0.6422, "step": 6265 }, { "epoch": 0.6194300966314905, "grad_norm": 3.779961040697167, "learning_rate": 6.681130587342792e-08, "loss": 0.7022, "step": 6266 }, { "epoch": 0.619528952376245, "grad_norm": 4.222559511640989, "learning_rate": 6.678110143359518e-08, "loss": 0.6806, "step": 6267 }, { "epoch": 0.6196278081209994, "grad_norm": 4.2906768866299325, "learning_rate": 6.67509003998705e-08, "loss": 0.6306, "step": 6268 }, { "epoch": 0.6197266638657539, "grad_norm": 17.117394372937067, "learning_rate": 6.672070277535049e-08, "loss": 0.6796, "step": 6269 }, { "epoch": 0.6198255196105084, "grad_norm": 3.655166234031778, "learning_rate": 6.66905085631315e-08, "loss": 0.7317, "step": 6270 }, { "epoch": 0.6199243753552628, "grad_norm": 15.254431853727558, "learning_rate": 6.666031776630954e-08, "loss": 0.5562, "step": 6271 }, { "epoch": 0.6200232311000173, "grad_norm": 6.5981761334403, "learning_rate": 6.663013038798018e-08, "loss": 0.7478, "step": 6272 }, { "epoch": 0.6201220868447718, "grad_norm": 3.696566535617549, "learning_rate": 6.659994643123873e-08, "loss": 0.7006, "step": 6273 }, { "epoch": 0.6202209425895262, "grad_norm": 2.8523028331358944, "learning_rate": 6.656976589918006e-08, "loss": 0.6843, "step": 6274 }, { "epoch": 0.6203197983342807, "grad_norm": 3.1620233517000274, "learning_rate": 6.653958879489877e-08, "loss": 0.6463, "step": 6275 }, { "epoch": 0.6204186540790352, "grad_norm": 6.043312077238663, "learning_rate": 6.650941512148909e-08, "loss": 0.665, "step": 6276 }, { "epoch": 0.6205175098237896, "grad_norm": 7.3017135629872865, "learning_rate": 6.647924488204486e-08, "loss": 0.6789, "step": 6277 }, { "epoch": 0.620616365568544, "grad_norm": 3.208461430279885, "learning_rate": 6.644907807965958e-08, "loss": 0.6787, "step": 6278 }, { "epoch": 0.6207152213132986, "grad_norm": 6.249949642126891, "learning_rate": 6.641891471742648e-08, "loss": 0.7854, "step": 6279 }, { "epoch": 0.620814077058053, "grad_norm": 3.8993490003725872, "learning_rate": 6.638875479843828e-08, "loss": 0.7629, "step": 6280 }, { "epoch": 0.6209129328028075, "grad_norm": 7.663960225524915, "learning_rate": 6.635859832578746e-08, "loss": 0.716, "step": 6281 }, { "epoch": 0.621011788547562, "grad_norm": 8.208653431460247, "learning_rate": 6.632844530256619e-08, "loss": 0.6555, "step": 6282 }, { "epoch": 0.6211106442923164, "grad_norm": 5.85834194043953, "learning_rate": 6.629829573186611e-08, "loss": 0.6702, "step": 6283 }, { "epoch": 0.6212095000370709, "grad_norm": 3.2591961981432402, "learning_rate": 6.626814961677872e-08, "loss": 0.74, "step": 6284 }, { "epoch": 0.6213083557818254, "grad_norm": 4.632411144714355, "learning_rate": 6.623800696039499e-08, "loss": 0.6905, "step": 6285 }, { "epoch": 0.6214072115265799, "grad_norm": 14.725198427679178, "learning_rate": 6.62078677658056e-08, "loss": 0.6295, "step": 6286 }, { "epoch": 0.6215060672713343, "grad_norm": 4.433767577658281, "learning_rate": 6.617773203610093e-08, "loss": 0.705, "step": 6287 }, { "epoch": 0.6216049230160887, "grad_norm": 3.635158821672549, "learning_rate": 6.614759977437089e-08, "loss": 0.8058, "step": 6288 }, { "epoch": 0.6217037787608433, "grad_norm": 3.353846745683276, "learning_rate": 6.611747098370515e-08, "loss": 0.6826, "step": 6289 }, { "epoch": 0.6218026345055977, "grad_norm": 3.030404340737694, "learning_rate": 6.608734566719299e-08, "loss": 0.7408, "step": 6290 }, { "epoch": 0.6219014902503521, "grad_norm": 3.2809743341731634, "learning_rate": 6.605722382792323e-08, "loss": 0.6824, "step": 6291 }, { "epoch": 0.6220003459951067, "grad_norm": 3.870936100330101, "learning_rate": 6.602710546898451e-08, "loss": 0.7945, "step": 6292 }, { "epoch": 0.6220992017398611, "grad_norm": 8.799497247674406, "learning_rate": 6.5996990593465e-08, "loss": 0.7427, "step": 6293 }, { "epoch": 0.6221980574846155, "grad_norm": 4.875790614137491, "learning_rate": 6.59668792044525e-08, "loss": 0.712, "step": 6294 }, { "epoch": 0.6222969132293701, "grad_norm": 6.046370302826926, "learning_rate": 6.593677130503454e-08, "loss": 0.7385, "step": 6295 }, { "epoch": 0.6223957689741245, "grad_norm": 9.058245861545682, "learning_rate": 6.590666689829818e-08, "loss": 0.5422, "step": 6296 }, { "epoch": 0.622494624718879, "grad_norm": 6.798626020353305, "learning_rate": 6.587656598733023e-08, "loss": 0.7778, "step": 6297 }, { "epoch": 0.6225934804636335, "grad_norm": 3.7924407395037307, "learning_rate": 6.584646857521709e-08, "loss": 0.7461, "step": 6298 }, { "epoch": 0.6226923362083879, "grad_norm": 7.079361794716562, "learning_rate": 6.581637466504479e-08, "loss": 0.7278, "step": 6299 }, { "epoch": 0.6227911919531424, "grad_norm": 3.916469769110348, "learning_rate": 6.5786284259899e-08, "loss": 0.7994, "step": 6300 }, { "epoch": 0.6228900476978968, "grad_norm": 3.462061171856143, "learning_rate": 6.575619736286513e-08, "loss": 0.7018, "step": 6301 }, { "epoch": 0.6229889034426513, "grad_norm": 10.5058302339052, "learning_rate": 6.572611397702805e-08, "loss": 0.7438, "step": 6302 }, { "epoch": 0.6230877591874058, "grad_norm": 10.79715971253462, "learning_rate": 6.56960341054724e-08, "loss": 0.7847, "step": 6303 }, { "epoch": 0.6231866149321602, "grad_norm": 5.775058360699875, "learning_rate": 6.566595775128248e-08, "loss": 0.6894, "step": 6304 }, { "epoch": 0.6232854706769148, "grad_norm": 3.44836020208474, "learning_rate": 6.563588491754208e-08, "loss": 0.6476, "step": 6305 }, { "epoch": 0.6233843264216692, "grad_norm": 15.55859993490643, "learning_rate": 6.560581560733482e-08, "loss": 0.7717, "step": 6306 }, { "epoch": 0.6234831821664236, "grad_norm": 3.7422368606297396, "learning_rate": 6.557574982374377e-08, "loss": 0.6557, "step": 6307 }, { "epoch": 0.6235820379111782, "grad_norm": 2.9501495090671117, "learning_rate": 6.554568756985178e-08, "loss": 0.5925, "step": 6308 }, { "epoch": 0.6236808936559326, "grad_norm": 3.7825402619566737, "learning_rate": 6.551562884874135e-08, "loss": 0.8043, "step": 6309 }, { "epoch": 0.623779749400687, "grad_norm": 7.968455970047168, "learning_rate": 6.548557366349445e-08, "loss": 0.7606, "step": 6310 }, { "epoch": 0.6238786051454415, "grad_norm": 4.46391660858993, "learning_rate": 6.545552201719286e-08, "loss": 0.7418, "step": 6311 }, { "epoch": 0.623977460890196, "grad_norm": 8.49460961185926, "learning_rate": 6.542547391291795e-08, "loss": 0.6551, "step": 6312 }, { "epoch": 0.6240763166349504, "grad_norm": 2.9851707034607324, "learning_rate": 6.539542935375064e-08, "loss": 0.6861, "step": 6313 }, { "epoch": 0.6241751723797049, "grad_norm": 3.696139506804978, "learning_rate": 6.53653883427716e-08, "loss": 0.632, "step": 6314 }, { "epoch": 0.6242740281244594, "grad_norm": 5.781119761688448, "learning_rate": 6.533535088306112e-08, "loss": 0.6851, "step": 6315 }, { "epoch": 0.6243728838692139, "grad_norm": 11.880790847848965, "learning_rate": 6.530531697769904e-08, "loss": 0.8123, "step": 6316 }, { "epoch": 0.6244717396139683, "grad_norm": 25.15244698735989, "learning_rate": 6.527528662976495e-08, "loss": 0.7507, "step": 6317 }, { "epoch": 0.6245705953587228, "grad_norm": 4.565999530092545, "learning_rate": 6.524525984233795e-08, "loss": 0.636, "step": 6318 }, { "epoch": 0.6246694511034773, "grad_norm": 3.9391003907729507, "learning_rate": 6.521523661849691e-08, "loss": 0.674, "step": 6319 }, { "epoch": 0.6247683068482317, "grad_norm": 4.3503919391850925, "learning_rate": 6.518521696132024e-08, "loss": 0.7646, "step": 6320 }, { "epoch": 0.6248671625929861, "grad_norm": 5.039283039436438, "learning_rate": 6.515520087388602e-08, "loss": 0.6825, "step": 6321 }, { "epoch": 0.6249660183377407, "grad_norm": 4.712927891922407, "learning_rate": 6.512518835927193e-08, "loss": 0.683, "step": 6322 }, { "epoch": 0.6250648740824951, "grad_norm": 3.2027899931866592, "learning_rate": 6.509517942055538e-08, "loss": 0.7349, "step": 6323 }, { "epoch": 0.6251637298272495, "grad_norm": 3.4141505945147643, "learning_rate": 6.506517406081325e-08, "loss": 0.5947, "step": 6324 }, { "epoch": 0.6252625855720041, "grad_norm": 5.469997325423968, "learning_rate": 6.50351722831222e-08, "loss": 0.6565, "step": 6325 }, { "epoch": 0.6253614413167585, "grad_norm": 8.439113255546191, "learning_rate": 6.50051740905585e-08, "loss": 0.7422, "step": 6326 }, { "epoch": 0.625460297061513, "grad_norm": 4.177085746007796, "learning_rate": 6.497517948619796e-08, "loss": 0.8066, "step": 6327 }, { "epoch": 0.6255591528062675, "grad_norm": 2.9306928778316848, "learning_rate": 6.494518847311614e-08, "loss": 0.7816, "step": 6328 }, { "epoch": 0.6256580085510219, "grad_norm": 7.959703243777141, "learning_rate": 6.491520105438813e-08, "loss": 0.6277, "step": 6329 }, { "epoch": 0.6257568642957764, "grad_norm": 4.765365558257992, "learning_rate": 6.488521723308868e-08, "loss": 0.7469, "step": 6330 }, { "epoch": 0.6258557200405308, "grad_norm": 5.055629753000996, "learning_rate": 6.485523701229228e-08, "loss": 0.6932, "step": 6331 }, { "epoch": 0.6259545757852853, "grad_norm": 5.8648072800843085, "learning_rate": 6.482526039507287e-08, "loss": 0.786, "step": 6332 }, { "epoch": 0.6260534315300398, "grad_norm": 4.024240802144581, "learning_rate": 6.479528738450412e-08, "loss": 0.7225, "step": 6333 }, { "epoch": 0.6261522872747942, "grad_norm": 3.1805352633292174, "learning_rate": 6.476531798365941e-08, "loss": 0.6877, "step": 6334 }, { "epoch": 0.6262511430195488, "grad_norm": 20.35932820289275, "learning_rate": 6.473535219561152e-08, "loss": 0.7368, "step": 6335 }, { "epoch": 0.6263499987643032, "grad_norm": 5.55151726059079, "learning_rate": 6.470539002343312e-08, "loss": 0.666, "step": 6336 }, { "epoch": 0.6264488545090576, "grad_norm": 2.736333447523643, "learning_rate": 6.467543147019633e-08, "loss": 0.5854, "step": 6337 }, { "epoch": 0.6265477102538122, "grad_norm": 3.7030818840743946, "learning_rate": 6.464547653897297e-08, "loss": 0.6504, "step": 6338 }, { "epoch": 0.6266465659985666, "grad_norm": 4.260082788578561, "learning_rate": 6.461552523283449e-08, "loss": 0.7211, "step": 6339 }, { "epoch": 0.626745421743321, "grad_norm": 4.071906303752496, "learning_rate": 6.458557755485192e-08, "loss": 0.7423, "step": 6340 }, { "epoch": 0.6268442774880755, "grad_norm": 5.564647488815131, "learning_rate": 6.455563350809595e-08, "loss": 0.6378, "step": 6341 }, { "epoch": 0.62694313323283, "grad_norm": 3.0921984573223344, "learning_rate": 6.452569309563696e-08, "loss": 0.6712, "step": 6342 }, { "epoch": 0.6270419889775845, "grad_norm": 6.707406854142768, "learning_rate": 6.449575632054482e-08, "loss": 0.6994, "step": 6343 }, { "epoch": 0.6271408447223389, "grad_norm": 4.036187087791546, "learning_rate": 6.446582318588917e-08, "loss": 0.7527, "step": 6344 }, { "epoch": 0.6272397004670934, "grad_norm": 3.362909720957736, "learning_rate": 6.443589369473921e-08, "loss": 0.8145, "step": 6345 }, { "epoch": 0.6273385562118479, "grad_norm": 7.006676537878311, "learning_rate": 6.440596785016373e-08, "loss": 0.6488, "step": 6346 }, { "epoch": 0.6274374119566023, "grad_norm": 4.836704986110418, "learning_rate": 6.437604565523116e-08, "loss": 0.7151, "step": 6347 }, { "epoch": 0.6275362677013568, "grad_norm": 3.4880233746656923, "learning_rate": 6.434612711300968e-08, "loss": 0.6539, "step": 6348 }, { "epoch": 0.6276351234461113, "grad_norm": 7.272032443269752, "learning_rate": 6.431621222656688e-08, "loss": 0.7508, "step": 6349 }, { "epoch": 0.6277339791908657, "grad_norm": 4.801087082220287, "learning_rate": 6.428630099897018e-08, "loss": 0.7032, "step": 6350 }, { "epoch": 0.6278328349356201, "grad_norm": 6.652784198425823, "learning_rate": 6.425639343328647e-08, "loss": 0.6764, "step": 6351 }, { "epoch": 0.6279316906803747, "grad_norm": 3.515211984210982, "learning_rate": 6.422648953258235e-08, "loss": 0.6086, "step": 6352 }, { "epoch": 0.6280305464251291, "grad_norm": 3.269088046660099, "learning_rate": 6.419658929992408e-08, "loss": 0.6704, "step": 6353 }, { "epoch": 0.6281294021698836, "grad_norm": 3.777496552214907, "learning_rate": 6.416669273837739e-08, "loss": 0.7838, "step": 6354 }, { "epoch": 0.6282282579146381, "grad_norm": 4.148701788678798, "learning_rate": 6.413679985100781e-08, "loss": 0.7843, "step": 6355 }, { "epoch": 0.6283271136593925, "grad_norm": 3.995997001977764, "learning_rate": 6.410691064088041e-08, "loss": 0.7326, "step": 6356 }, { "epoch": 0.628425969404147, "grad_norm": 5.184056174537193, "learning_rate": 6.407702511105981e-08, "loss": 0.6243, "step": 6357 }, { "epoch": 0.6285248251489015, "grad_norm": 6.789385749779301, "learning_rate": 6.404714326461039e-08, "loss": 0.741, "step": 6358 }, { "epoch": 0.628623680893656, "grad_norm": 3.4951273803051324, "learning_rate": 6.401726510459612e-08, "loss": 0.6736, "step": 6359 }, { "epoch": 0.6287225366384104, "grad_norm": 5.177298856205879, "learning_rate": 6.39873906340805e-08, "loss": 0.5941, "step": 6360 }, { "epoch": 0.6288213923831648, "grad_norm": 7.255165100817646, "learning_rate": 6.395751985612678e-08, "loss": 0.7812, "step": 6361 }, { "epoch": 0.6289202481279194, "grad_norm": 3.74722379532626, "learning_rate": 6.392765277379771e-08, "loss": 0.5796, "step": 6362 }, { "epoch": 0.6290191038726738, "grad_norm": 3.533143545489133, "learning_rate": 6.389778939015573e-08, "loss": 0.7553, "step": 6363 }, { "epoch": 0.6291179596174282, "grad_norm": 3.718738738194319, "learning_rate": 6.386792970826294e-08, "loss": 0.651, "step": 6364 }, { "epoch": 0.6292168153621828, "grad_norm": 4.095190293126384, "learning_rate": 6.383807373118095e-08, "loss": 0.7629, "step": 6365 }, { "epoch": 0.6293156711069372, "grad_norm": 3.8849259685379414, "learning_rate": 6.380822146197107e-08, "loss": 0.6974, "step": 6366 }, { "epoch": 0.6294145268516916, "grad_norm": 4.993614323674291, "learning_rate": 6.377837290369423e-08, "loss": 0.6708, "step": 6367 }, { "epoch": 0.6295133825964462, "grad_norm": 3.3156951811996147, "learning_rate": 6.374852805941091e-08, "loss": 0.6292, "step": 6368 }, { "epoch": 0.6296122383412006, "grad_norm": 4.4837953452362465, "learning_rate": 6.37186869321813e-08, "loss": 0.7018, "step": 6369 }, { "epoch": 0.629711094085955, "grad_norm": 9.254188847604313, "learning_rate": 6.368884952506519e-08, "loss": 0.7675, "step": 6370 }, { "epoch": 0.6298099498307096, "grad_norm": 3.7962807462777555, "learning_rate": 6.36590158411219e-08, "loss": 0.6598, "step": 6371 }, { "epoch": 0.629908805575464, "grad_norm": 3.1013880379869287, "learning_rate": 6.362918588341051e-08, "loss": 0.6387, "step": 6372 }, { "epoch": 0.6300076613202185, "grad_norm": 6.406959164884269, "learning_rate": 6.359935965498958e-08, "loss": 0.69, "step": 6373 }, { "epoch": 0.6301065170649729, "grad_norm": 4.261266028022718, "learning_rate": 6.356953715891735e-08, "loss": 0.6752, "step": 6374 }, { "epoch": 0.6302053728097274, "grad_norm": 3.3171578548134293, "learning_rate": 6.353971839825173e-08, "loss": 0.7442, "step": 6375 }, { "epoch": 0.6303042285544819, "grad_norm": 7.413550029893226, "learning_rate": 6.350990337605012e-08, "loss": 0.6789, "step": 6376 }, { "epoch": 0.6304030842992363, "grad_norm": 3.2517768343000277, "learning_rate": 6.348009209536966e-08, "loss": 0.6849, "step": 6377 }, { "epoch": 0.6305019400439908, "grad_norm": 3.9850479709004536, "learning_rate": 6.345028455926708e-08, "loss": 0.6859, "step": 6378 }, { "epoch": 0.6306007957887453, "grad_norm": 3.8055348206382047, "learning_rate": 6.342048077079865e-08, "loss": 0.6039, "step": 6379 }, { "epoch": 0.6306996515334997, "grad_norm": 4.340607776792594, "learning_rate": 6.339068073302035e-08, "loss": 0.672, "step": 6380 }, { "epoch": 0.6307985072782543, "grad_norm": 3.366983103331821, "learning_rate": 6.336088444898768e-08, "loss": 0.6622, "step": 6381 }, { "epoch": 0.6308973630230087, "grad_norm": 3.885755580649514, "learning_rate": 6.333109192175588e-08, "loss": 0.7064, "step": 6382 }, { "epoch": 0.6309962187677631, "grad_norm": 3.6295313569291126, "learning_rate": 6.330130315437971e-08, "loss": 0.6758, "step": 6383 }, { "epoch": 0.6310950745125176, "grad_norm": 3.386109506699371, "learning_rate": 6.327151814991351e-08, "loss": 0.6923, "step": 6384 }, { "epoch": 0.6311939302572721, "grad_norm": 5.482291727613971, "learning_rate": 6.324173691141134e-08, "loss": 0.6564, "step": 6385 }, { "epoch": 0.6312927860020265, "grad_norm": 4.65870897632115, "learning_rate": 6.321195944192689e-08, "loss": 0.5785, "step": 6386 }, { "epoch": 0.631391641746781, "grad_norm": 3.5783784494065483, "learning_rate": 6.318218574451328e-08, "loss": 0.7012, "step": 6387 }, { "epoch": 0.6314904974915355, "grad_norm": 10.067256954978951, "learning_rate": 6.315241582222341e-08, "loss": 0.7222, "step": 6388 }, { "epoch": 0.63158935323629, "grad_norm": 5.305422470266035, "learning_rate": 6.312264967810981e-08, "loss": 0.6525, "step": 6389 }, { "epoch": 0.6316882089810444, "grad_norm": 3.4693765037419038, "learning_rate": 6.30928873152245e-08, "loss": 0.6519, "step": 6390 }, { "epoch": 0.6317870647257989, "grad_norm": 3.741554464418401, "learning_rate": 6.306312873661915e-08, "loss": 0.6987, "step": 6391 }, { "epoch": 0.6318859204705534, "grad_norm": 4.373886036227352, "learning_rate": 6.303337394534512e-08, "loss": 0.7349, "step": 6392 }, { "epoch": 0.6319847762153078, "grad_norm": 5.803830460322083, "learning_rate": 6.300362294445326e-08, "loss": 0.7379, "step": 6393 }, { "epoch": 0.6320836319600622, "grad_norm": 5.166470407993537, "learning_rate": 6.297387573699417e-08, "loss": 0.6896, "step": 6394 }, { "epoch": 0.6321824877048168, "grad_norm": 3.710213265058296, "learning_rate": 6.294413232601791e-08, "loss": 0.7322, "step": 6395 }, { "epoch": 0.6322813434495712, "grad_norm": 9.519209874902531, "learning_rate": 6.291439271457426e-08, "loss": 0.7498, "step": 6396 }, { "epoch": 0.6323801991943256, "grad_norm": 7.2123473719816955, "learning_rate": 6.288465690571266e-08, "loss": 0.5397, "step": 6397 }, { "epoch": 0.6324790549390802, "grad_norm": 4.699541872063325, "learning_rate": 6.285492490248193e-08, "loss": 0.6603, "step": 6398 }, { "epoch": 0.6325779106838346, "grad_norm": 3.050316220011718, "learning_rate": 6.282519670793074e-08, "loss": 0.7436, "step": 6399 }, { "epoch": 0.632676766428589, "grad_norm": 3.932108235035159, "learning_rate": 6.27954723251073e-08, "loss": 0.7477, "step": 6400 }, { "epoch": 0.6327756221733436, "grad_norm": 13.247102237347239, "learning_rate": 6.276575175705931e-08, "loss": 0.7172, "step": 6401 }, { "epoch": 0.632874477918098, "grad_norm": 16.52936903615708, "learning_rate": 6.273603500683425e-08, "loss": 0.8389, "step": 6402 }, { "epoch": 0.6329733336628525, "grad_norm": 4.329529652296655, "learning_rate": 6.270632207747916e-08, "loss": 0.6625, "step": 6403 }, { "epoch": 0.6330721894076069, "grad_norm": 4.329015821616079, "learning_rate": 6.267661297204057e-08, "loss": 0.765, "step": 6404 }, { "epoch": 0.6331710451523614, "grad_norm": 4.683682862613541, "learning_rate": 6.26469076935648e-08, "loss": 0.733, "step": 6405 }, { "epoch": 0.6332699008971159, "grad_norm": 4.455716016356109, "learning_rate": 6.26172062450976e-08, "loss": 0.8027, "step": 6406 }, { "epoch": 0.6333687566418703, "grad_norm": 3.922959500703809, "learning_rate": 6.258750862968451e-08, "loss": 0.7216, "step": 6407 }, { "epoch": 0.6334676123866249, "grad_norm": 3.8248469162398226, "learning_rate": 6.255781485037053e-08, "loss": 0.6948, "step": 6408 }, { "epoch": 0.6335664681313793, "grad_norm": 3.3977091392623864, "learning_rate": 6.252812491020033e-08, "loss": 0.6763, "step": 6409 }, { "epoch": 0.6336653238761337, "grad_norm": 3.425729098705427, "learning_rate": 6.249843881221813e-08, "loss": 0.6409, "step": 6410 }, { "epoch": 0.6337641796208883, "grad_norm": 4.909279605984304, "learning_rate": 6.24687565594679e-08, "loss": 0.6834, "step": 6411 }, { "epoch": 0.6338630353656427, "grad_norm": 4.274723882313248, "learning_rate": 6.243907815499301e-08, "loss": 0.6851, "step": 6412 }, { "epoch": 0.6339618911103971, "grad_norm": 5.517001292376823, "learning_rate": 6.24094036018366e-08, "loss": 0.6527, "step": 6413 }, { "epoch": 0.6340607468551516, "grad_norm": 4.561284733199471, "learning_rate": 6.237973290304139e-08, "loss": 0.7897, "step": 6414 }, { "epoch": 0.6341596025999061, "grad_norm": 6.563418355972411, "learning_rate": 6.235006606164959e-08, "loss": 0.5021, "step": 6415 }, { "epoch": 0.6342584583446605, "grad_norm": 4.995332333435272, "learning_rate": 6.232040308070318e-08, "loss": 0.6667, "step": 6416 }, { "epoch": 0.634357314089415, "grad_norm": 4.959069253611251, "learning_rate": 6.22907439632436e-08, "loss": 0.7101, "step": 6417 }, { "epoch": 0.6344561698341695, "grad_norm": 183.99518658654262, "learning_rate": 6.226108871231195e-08, "loss": 0.6665, "step": 6418 }, { "epoch": 0.634555025578924, "grad_norm": 5.285171832611314, "learning_rate": 6.223143733094903e-08, "loss": 0.6145, "step": 6419 }, { "epoch": 0.6346538813236784, "grad_norm": 4.2811709182488205, "learning_rate": 6.220178982219502e-08, "loss": 0.698, "step": 6420 }, { "epoch": 0.6347527370684329, "grad_norm": 4.779629064138447, "learning_rate": 6.21721461890899e-08, "loss": 0.7109, "step": 6421 }, { "epoch": 0.6348515928131874, "grad_norm": 4.804619680648377, "learning_rate": 6.214250643467323e-08, "loss": 0.5505, "step": 6422 }, { "epoch": 0.6349504485579418, "grad_norm": 3.055946114153866, "learning_rate": 6.211287056198406e-08, "loss": 0.726, "step": 6423 }, { "epoch": 0.6350493043026962, "grad_norm": 3.798059534515849, "learning_rate": 6.208323857406117e-08, "loss": 0.6324, "step": 6424 }, { "epoch": 0.6351481600474508, "grad_norm": 4.675098904405104, "learning_rate": 6.205361047394282e-08, "loss": 0.7247, "step": 6425 }, { "epoch": 0.6352470157922052, "grad_norm": 3.4660828922508014, "learning_rate": 6.202398626466699e-08, "loss": 0.7267, "step": 6426 }, { "epoch": 0.6353458715369596, "grad_norm": 4.606541723310948, "learning_rate": 6.19943659492712e-08, "loss": 0.7622, "step": 6427 }, { "epoch": 0.6354447272817142, "grad_norm": 3.281979557746655, "learning_rate": 6.196474953079251e-08, "loss": 0.6551, "step": 6428 }, { "epoch": 0.6355435830264686, "grad_norm": 5.807771707080433, "learning_rate": 6.19351370122677e-08, "loss": 0.7426, "step": 6429 }, { "epoch": 0.6356424387712231, "grad_norm": 4.099257708226999, "learning_rate": 6.190552839673313e-08, "loss": 0.6929, "step": 6430 }, { "epoch": 0.6357412945159776, "grad_norm": 2.9157953122739, "learning_rate": 6.187592368722466e-08, "loss": 0.767, "step": 6431 }, { "epoch": 0.635840150260732, "grad_norm": 5.601730544955432, "learning_rate": 6.184632288677785e-08, "loss": 0.7772, "step": 6432 }, { "epoch": 0.6359390060054865, "grad_norm": 4.233916674747636, "learning_rate": 6.181672599842784e-08, "loss": 0.641, "step": 6433 }, { "epoch": 0.6360378617502409, "grad_norm": 6.359670868870838, "learning_rate": 6.178713302520933e-08, "loss": 0.7219, "step": 6434 }, { "epoch": 0.6361367174949955, "grad_norm": 4.1221599136619, "learning_rate": 6.175754397015664e-08, "loss": 0.5986, "step": 6435 }, { "epoch": 0.6362355732397499, "grad_norm": 4.207712853628502, "learning_rate": 6.172795883630373e-08, "loss": 0.6706, "step": 6436 }, { "epoch": 0.6363344289845043, "grad_norm": 3.6549585745719773, "learning_rate": 6.169837762668404e-08, "loss": 0.6968, "step": 6437 }, { "epoch": 0.6364332847292589, "grad_norm": 9.903965092876502, "learning_rate": 6.166880034433079e-08, "loss": 0.7994, "step": 6438 }, { "epoch": 0.6365321404740133, "grad_norm": 4.715480007295255, "learning_rate": 6.163922699227658e-08, "loss": 0.6873, "step": 6439 }, { "epoch": 0.6366309962187677, "grad_norm": 4.074777482781056, "learning_rate": 6.16096575735538e-08, "loss": 0.6592, "step": 6440 }, { "epoch": 0.6367298519635223, "grad_norm": 3.780192160664811, "learning_rate": 6.158009209119438e-08, "loss": 0.7361, "step": 6441 }, { "epoch": 0.6368287077082767, "grad_norm": 4.180092635925593, "learning_rate": 6.155053054822972e-08, "loss": 0.633, "step": 6442 }, { "epoch": 0.6369275634530311, "grad_norm": 14.138339763712994, "learning_rate": 6.152097294769102e-08, "loss": 0.6339, "step": 6443 }, { "epoch": 0.6370264191977856, "grad_norm": 4.106240941027661, "learning_rate": 6.149141929260895e-08, "loss": 0.7474, "step": 6444 }, { "epoch": 0.6371252749425401, "grad_norm": 4.371595411096042, "learning_rate": 6.146186958601376e-08, "loss": 0.6846, "step": 6445 }, { "epoch": 0.6372241306872946, "grad_norm": 3.3410156213849374, "learning_rate": 6.143232383093536e-08, "loss": 0.7447, "step": 6446 }, { "epoch": 0.637322986432049, "grad_norm": 3.9305835910831837, "learning_rate": 6.140278203040328e-08, "loss": 0.7485, "step": 6447 }, { "epoch": 0.6374218421768035, "grad_norm": 6.24582263324381, "learning_rate": 6.137324418744651e-08, "loss": 0.708, "step": 6448 }, { "epoch": 0.637520697921558, "grad_norm": 3.2660072546939802, "learning_rate": 6.134371030509382e-08, "loss": 0.6815, "step": 6449 }, { "epoch": 0.6376195536663124, "grad_norm": 4.7776864307728735, "learning_rate": 6.131418038637336e-08, "loss": 0.7963, "step": 6450 }, { "epoch": 0.6377184094110669, "grad_norm": 6.793380576282161, "learning_rate": 6.128465443431307e-08, "loss": 0.7273, "step": 6451 }, { "epoch": 0.6378172651558214, "grad_norm": 3.005588763310597, "learning_rate": 6.125513245194039e-08, "loss": 0.6509, "step": 6452 }, { "epoch": 0.6379161209005758, "grad_norm": 4.337393023650969, "learning_rate": 6.122561444228233e-08, "loss": 0.7434, "step": 6453 }, { "epoch": 0.6380149766453304, "grad_norm": 18.213656547481868, "learning_rate": 6.119610040836552e-08, "loss": 0.6535, "step": 6454 }, { "epoch": 0.6381138323900848, "grad_norm": 6.635999922179362, "learning_rate": 6.116659035321628e-08, "loss": 0.8318, "step": 6455 }, { "epoch": 0.6382126881348392, "grad_norm": 4.836673146338819, "learning_rate": 6.113708427986031e-08, "loss": 0.7117, "step": 6456 }, { "epoch": 0.6383115438795937, "grad_norm": 3.4886505931882454, "learning_rate": 6.110758219132308e-08, "loss": 0.6077, "step": 6457 }, { "epoch": 0.6384103996243482, "grad_norm": 5.538933251306919, "learning_rate": 6.107808409062964e-08, "loss": 0.7678, "step": 6458 }, { "epoch": 0.6385092553691026, "grad_norm": 5.037187018889392, "learning_rate": 6.104858998080449e-08, "loss": 0.7508, "step": 6459 }, { "epoch": 0.6386081111138571, "grad_norm": 3.086058778695983, "learning_rate": 6.101909986487191e-08, "loss": 0.746, "step": 6460 }, { "epoch": 0.6387069668586116, "grad_norm": 8.02715026300322, "learning_rate": 6.098961374585561e-08, "loss": 0.7226, "step": 6461 }, { "epoch": 0.638805822603366, "grad_norm": 3.9477957911240065, "learning_rate": 6.096013162677895e-08, "loss": 0.7607, "step": 6462 }, { "epoch": 0.6389046783481205, "grad_norm": 4.916301663619931, "learning_rate": 6.093065351066497e-08, "loss": 0.6259, "step": 6463 }, { "epoch": 0.639003534092875, "grad_norm": 5.160944499534483, "learning_rate": 6.09011794005361e-08, "loss": 0.6998, "step": 6464 }, { "epoch": 0.6391023898376295, "grad_norm": 3.514360932095559, "learning_rate": 6.087170929941455e-08, "loss": 0.7765, "step": 6465 }, { "epoch": 0.6392012455823839, "grad_norm": 6.059806639276277, "learning_rate": 6.084224321032206e-08, "loss": 0.8323, "step": 6466 }, { "epoch": 0.6393001013271383, "grad_norm": 3.915271063468134, "learning_rate": 6.081278113627986e-08, "loss": 0.6702, "step": 6467 }, { "epoch": 0.6393989570718929, "grad_norm": 6.659257243700594, "learning_rate": 6.078332308030893e-08, "loss": 0.7459, "step": 6468 }, { "epoch": 0.6394978128166473, "grad_norm": 5.965686232431402, "learning_rate": 6.075386904542974e-08, "loss": 0.7705, "step": 6469 }, { "epoch": 0.6395966685614017, "grad_norm": 14.274941194643295, "learning_rate": 6.072441903466234e-08, "loss": 0.6254, "step": 6470 }, { "epoch": 0.6396955243061563, "grad_norm": 6.571029157661409, "learning_rate": 6.069497305102647e-08, "loss": 0.6683, "step": 6471 }, { "epoch": 0.6397943800509107, "grad_norm": 4.225812972082855, "learning_rate": 6.066553109754125e-08, "loss": 0.7721, "step": 6472 }, { "epoch": 0.6398932357956651, "grad_norm": 6.747128004536368, "learning_rate": 6.06360931772256e-08, "loss": 0.6024, "step": 6473 }, { "epoch": 0.6399920915404197, "grad_norm": 20.37159778891613, "learning_rate": 6.060665929309798e-08, "loss": 0.7785, "step": 6474 }, { "epoch": 0.6400909472851741, "grad_norm": 9.771142085052215, "learning_rate": 6.057722944817631e-08, "loss": 0.6946, "step": 6475 }, { "epoch": 0.6401898030299286, "grad_norm": 4.658965509952915, "learning_rate": 6.054780364547824e-08, "loss": 0.7596, "step": 6476 }, { "epoch": 0.640288658774683, "grad_norm": 3.503737103939055, "learning_rate": 6.051838188802097e-08, "loss": 0.5524, "step": 6477 }, { "epoch": 0.6403875145194375, "grad_norm": 3.809439751149928, "learning_rate": 6.048896417882124e-08, "loss": 0.8028, "step": 6478 }, { "epoch": 0.640486370264192, "grad_norm": 3.464375024342769, "learning_rate": 6.045955052089537e-08, "loss": 0.8145, "step": 6479 }, { "epoch": 0.6405852260089464, "grad_norm": 6.737056168960275, "learning_rate": 6.04301409172594e-08, "loss": 0.7215, "step": 6480 }, { "epoch": 0.640684081753701, "grad_norm": 4.361605899929606, "learning_rate": 6.040073537092872e-08, "loss": 0.5987, "step": 6481 }, { "epoch": 0.6407829374984554, "grad_norm": 4.652411574008578, "learning_rate": 6.037133388491853e-08, "loss": 0.7768, "step": 6482 }, { "epoch": 0.6408817932432098, "grad_norm": 5.189576475388783, "learning_rate": 6.034193646224346e-08, "loss": 0.7198, "step": 6483 }, { "epoch": 0.6409806489879644, "grad_norm": 3.235423135241254, "learning_rate": 6.03125431059178e-08, "loss": 0.6668, "step": 6484 }, { "epoch": 0.6410795047327188, "grad_norm": 2.900350989080107, "learning_rate": 6.028315381895547e-08, "loss": 0.6931, "step": 6485 }, { "epoch": 0.6411783604774732, "grad_norm": 9.048006058078702, "learning_rate": 6.02537686043698e-08, "loss": 0.7022, "step": 6486 }, { "epoch": 0.6412772162222277, "grad_norm": 5.937724555154702, "learning_rate": 6.022438746517387e-08, "loss": 0.6618, "step": 6487 }, { "epoch": 0.6413760719669822, "grad_norm": 3.457013827258602, "learning_rate": 6.01950104043803e-08, "loss": 0.7273, "step": 6488 }, { "epoch": 0.6414749277117366, "grad_norm": 3.0295083835818284, "learning_rate": 6.016563742500122e-08, "loss": 0.6576, "step": 6489 }, { "epoch": 0.6415737834564911, "grad_norm": 3.0521374704073594, "learning_rate": 6.013626853004843e-08, "loss": 0.7102, "step": 6490 }, { "epoch": 0.6416726392012456, "grad_norm": 3.5952714126726097, "learning_rate": 6.010690372253328e-08, "loss": 0.7697, "step": 6491 }, { "epoch": 0.641771494946, "grad_norm": 4.312572611664246, "learning_rate": 6.007754300546668e-08, "loss": 0.7521, "step": 6492 }, { "epoch": 0.6418703506907545, "grad_norm": 6.4130060161791365, "learning_rate": 6.004818638185916e-08, "loss": 0.6598, "step": 6493 }, { "epoch": 0.641969206435509, "grad_norm": 4.060788060503483, "learning_rate": 6.001883385472077e-08, "loss": 0.6304, "step": 6494 }, { "epoch": 0.6420680621802635, "grad_norm": 4.392635727553592, "learning_rate": 5.998948542706123e-08, "loss": 0.618, "step": 6495 }, { "epoch": 0.6421669179250179, "grad_norm": 6.186133045217443, "learning_rate": 5.996014110188976e-08, "loss": 0.6552, "step": 6496 }, { "epoch": 0.6422657736697723, "grad_norm": 5.237087279533955, "learning_rate": 5.993080088221519e-08, "loss": 0.6513, "step": 6497 }, { "epoch": 0.6423646294145269, "grad_norm": 4.562725888709792, "learning_rate": 5.990146477104589e-08, "loss": 0.7975, "step": 6498 }, { "epoch": 0.6424634851592813, "grad_norm": 24.965414716769097, "learning_rate": 5.987213277138995e-08, "loss": 0.9008, "step": 6499 }, { "epoch": 0.6425623409040357, "grad_norm": 4.763959095270247, "learning_rate": 5.984280488625481e-08, "loss": 0.6259, "step": 6500 }, { "epoch": 0.6426611966487903, "grad_norm": 4.183606661318391, "learning_rate": 5.981348111864768e-08, "loss": 0.7465, "step": 6501 }, { "epoch": 0.6427600523935447, "grad_norm": 5.08239547546349, "learning_rate": 5.97841614715753e-08, "loss": 0.7009, "step": 6502 }, { "epoch": 0.6428589081382992, "grad_norm": 3.659542967389106, "learning_rate": 5.975484594804389e-08, "loss": 0.6974, "step": 6503 }, { "epoch": 0.6429577638830537, "grad_norm": 3.2933307662050715, "learning_rate": 5.972553455105942e-08, "loss": 0.7659, "step": 6504 }, { "epoch": 0.6430566196278081, "grad_norm": 4.055417574415319, "learning_rate": 5.969622728362726e-08, "loss": 0.7187, "step": 6505 }, { "epoch": 0.6431554753725626, "grad_norm": 4.277061550373825, "learning_rate": 5.966692414875246e-08, "loss": 0.6716, "step": 6506 }, { "epoch": 0.643254331117317, "grad_norm": 10.626899589241658, "learning_rate": 5.963762514943968e-08, "loss": 0.6978, "step": 6507 }, { "epoch": 0.6433531868620715, "grad_norm": 4.27106916509404, "learning_rate": 5.960833028869301e-08, "loss": 0.7462, "step": 6508 }, { "epoch": 0.643452042606826, "grad_norm": 3.497387967647567, "learning_rate": 5.9579039569516243e-08, "loss": 0.7545, "step": 6509 }, { "epoch": 0.6435508983515804, "grad_norm": 3.4922645794280784, "learning_rate": 5.954975299491276e-08, "loss": 0.7219, "step": 6510 }, { "epoch": 0.643649754096335, "grad_norm": 3.7741248696686407, "learning_rate": 5.952047056788538e-08, "loss": 0.6097, "step": 6511 }, { "epoch": 0.6437486098410894, "grad_norm": 3.5478487575501068, "learning_rate": 5.949119229143664e-08, "loss": 0.6645, "step": 6512 }, { "epoch": 0.6438474655858438, "grad_norm": 5.167513937746826, "learning_rate": 5.94619181685686e-08, "loss": 0.7129, "step": 6513 }, { "epoch": 0.6439463213305984, "grad_norm": 4.896478760074855, "learning_rate": 5.943264820228288e-08, "loss": 0.7461, "step": 6514 }, { "epoch": 0.6440451770753528, "grad_norm": 6.903734947605831, "learning_rate": 5.940338239558068e-08, "loss": 0.778, "step": 6515 }, { "epoch": 0.6441440328201072, "grad_norm": 6.167444476321172, "learning_rate": 5.937412075146273e-08, "loss": 0.6209, "step": 6516 }, { "epoch": 0.6442428885648617, "grad_norm": 9.17797202093982, "learning_rate": 5.9344863272929447e-08, "loss": 0.6478, "step": 6517 }, { "epoch": 0.6443417443096162, "grad_norm": 3.106626199676358, "learning_rate": 5.931560996298075e-08, "loss": 0.6689, "step": 6518 }, { "epoch": 0.6444406000543706, "grad_norm": 4.881725634296227, "learning_rate": 5.928636082461607e-08, "loss": 0.6944, "step": 6519 }, { "epoch": 0.6445394557991251, "grad_norm": 42.438642246419235, "learning_rate": 5.925711586083453e-08, "loss": 0.6626, "step": 6520 }, { "epoch": 0.6446383115438796, "grad_norm": 5.772256418439717, "learning_rate": 5.92278750746348e-08, "loss": 0.733, "step": 6521 }, { "epoch": 0.6447371672886341, "grad_norm": 4.882392707672826, "learning_rate": 5.919863846901503e-08, "loss": 0.6341, "step": 6522 }, { "epoch": 0.6448360230333885, "grad_norm": 4.843882959753123, "learning_rate": 5.916940604697299e-08, "loss": 0.7826, "step": 6523 }, { "epoch": 0.644934878778143, "grad_norm": 3.6826392875528233, "learning_rate": 5.914017781150611e-08, "loss": 0.5958, "step": 6524 }, { "epoch": 0.6450337345228975, "grad_norm": 3.539009557564371, "learning_rate": 5.911095376561124e-08, "loss": 0.6038, "step": 6525 }, { "epoch": 0.6451325902676519, "grad_norm": 3.8883727627772564, "learning_rate": 5.908173391228495e-08, "loss": 0.6077, "step": 6526 }, { "epoch": 0.6452314460124065, "grad_norm": 4.442433504110512, "learning_rate": 5.90525182545232e-08, "loss": 0.7237, "step": 6527 }, { "epoch": 0.6453303017571609, "grad_norm": 11.230025241999561, "learning_rate": 5.90233067953217e-08, "loss": 0.7279, "step": 6528 }, { "epoch": 0.6454291575019153, "grad_norm": 3.2253951532499125, "learning_rate": 5.899409953767568e-08, "loss": 0.827, "step": 6529 }, { "epoch": 0.6455280132466698, "grad_norm": 4.734233169503264, "learning_rate": 5.896489648457983e-08, "loss": 0.8131, "step": 6530 }, { "epoch": 0.6456268689914243, "grad_norm": 5.070455986855912, "learning_rate": 5.893569763902855e-08, "loss": 0.7049, "step": 6531 }, { "epoch": 0.6457257247361787, "grad_norm": 10.658132984072129, "learning_rate": 5.890650300401576e-08, "loss": 0.6156, "step": 6532 }, { "epoch": 0.6458245804809332, "grad_norm": 3.5558353668687714, "learning_rate": 5.887731258253488e-08, "loss": 0.5295, "step": 6533 }, { "epoch": 0.6459234362256877, "grad_norm": 9.435107515436814, "learning_rate": 5.884812637757899e-08, "loss": 0.7724, "step": 6534 }, { "epoch": 0.6460222919704421, "grad_norm": 5.105519594396979, "learning_rate": 5.881894439214073e-08, "loss": 0.6887, "step": 6535 }, { "epoch": 0.6461211477151966, "grad_norm": 4.399163402350806, "learning_rate": 5.878976662921225e-08, "loss": 0.7257, "step": 6536 }, { "epoch": 0.6462200034599511, "grad_norm": 6.00198354905455, "learning_rate": 5.8760593091785316e-08, "loss": 0.8015, "step": 6537 }, { "epoch": 0.6463188592047056, "grad_norm": 5.6869742446663745, "learning_rate": 5.873142378285122e-08, "loss": 0.6633, "step": 6538 }, { "epoch": 0.64641771494946, "grad_norm": 3.6159910542217824, "learning_rate": 5.870225870540087e-08, "loss": 0.7019, "step": 6539 }, { "epoch": 0.6465165706942144, "grad_norm": 3.57621084615235, "learning_rate": 5.8673097862424714e-08, "loss": 0.7272, "step": 6540 }, { "epoch": 0.646615426438969, "grad_norm": 3.84838087065922, "learning_rate": 5.8643941256912746e-08, "loss": 0.7566, "step": 6541 }, { "epoch": 0.6467142821837234, "grad_norm": 5.464650255363838, "learning_rate": 5.861478889185454e-08, "loss": 0.7003, "step": 6542 }, { "epoch": 0.6468131379284778, "grad_norm": 4.020733908662698, "learning_rate": 5.858564077023931e-08, "loss": 0.7082, "step": 6543 }, { "epoch": 0.6469119936732324, "grad_norm": 4.413259127028339, "learning_rate": 5.855649689505566e-08, "loss": 0.6288, "step": 6544 }, { "epoch": 0.6470108494179868, "grad_norm": 3.90848317509244, "learning_rate": 5.852735726929192e-08, "loss": 0.7784, "step": 6545 }, { "epoch": 0.6471097051627412, "grad_norm": 10.152985217822845, "learning_rate": 5.8498221895935984e-08, "loss": 0.7694, "step": 6546 }, { "epoch": 0.6472085609074958, "grad_norm": 9.866717954247566, "learning_rate": 5.846909077797514e-08, "loss": 0.69, "step": 6547 }, { "epoch": 0.6473074166522502, "grad_norm": 3.2543750685513713, "learning_rate": 5.843996391839646e-08, "loss": 0.6884, "step": 6548 }, { "epoch": 0.6474062723970047, "grad_norm": 3.447756892105157, "learning_rate": 5.841084132018641e-08, "loss": 0.6745, "step": 6549 }, { "epoch": 0.6475051281417591, "grad_norm": 2.967498192277288, "learning_rate": 5.838172298633107e-08, "loss": 0.7136, "step": 6550 }, { "epoch": 0.6476039838865136, "grad_norm": 5.7043387246543436, "learning_rate": 5.835260891981617e-08, "loss": 0.7134, "step": 6551 }, { "epoch": 0.6477028396312681, "grad_norm": 9.005153262854412, "learning_rate": 5.8323499123626843e-08, "loss": 0.8361, "step": 6552 }, { "epoch": 0.6478016953760225, "grad_norm": 4.4971677920183675, "learning_rate": 5.82943936007479e-08, "loss": 0.5962, "step": 6553 }, { "epoch": 0.647900551120777, "grad_norm": 3.224244434024139, "learning_rate": 5.826529235416375e-08, "loss": 0.8004, "step": 6554 }, { "epoch": 0.6479994068655315, "grad_norm": 5.05232567846715, "learning_rate": 5.823619538685819e-08, "loss": 0.8029, "step": 6555 }, { "epoch": 0.6480982626102859, "grad_norm": 3.8761261469191397, "learning_rate": 5.820710270181474e-08, "loss": 0.7354, "step": 6556 }, { "epoch": 0.6481971183550405, "grad_norm": 3.079727984980904, "learning_rate": 5.817801430201645e-08, "loss": 0.7491, "step": 6557 }, { "epoch": 0.6482959740997949, "grad_norm": 4.485350384868977, "learning_rate": 5.8148930190445847e-08, "loss": 0.738, "step": 6558 }, { "epoch": 0.6483948298445493, "grad_norm": 3.569395701197582, "learning_rate": 5.811985037008513e-08, "loss": 0.6325, "step": 6559 }, { "epoch": 0.6484936855893038, "grad_norm": 3.740286026969137, "learning_rate": 5.8090774843915947e-08, "loss": 0.7882, "step": 6560 }, { "epoch": 0.6485925413340583, "grad_norm": 5.038231161119399, "learning_rate": 5.8061703614919644e-08, "loss": 0.6525, "step": 6561 }, { "epoch": 0.6486913970788127, "grad_norm": 7.707653065973685, "learning_rate": 5.8032636686076965e-08, "loss": 0.7401, "step": 6562 }, { "epoch": 0.6487902528235672, "grad_norm": 3.5317569171205982, "learning_rate": 5.8003574060368353e-08, "loss": 0.6512, "step": 6563 }, { "epoch": 0.6488891085683217, "grad_norm": 3.1759835404646166, "learning_rate": 5.797451574077371e-08, "loss": 0.5716, "step": 6564 }, { "epoch": 0.6489879643130761, "grad_norm": 4.844690624423004, "learning_rate": 5.794546173027259e-08, "loss": 0.6852, "step": 6565 }, { "epoch": 0.6490868200578306, "grad_norm": 8.126028900642929, "learning_rate": 5.7916412031843985e-08, "loss": 0.7283, "step": 6566 }, { "epoch": 0.6491856758025851, "grad_norm": 3.24610222691711, "learning_rate": 5.788736664846655e-08, "loss": 0.7402, "step": 6567 }, { "epoch": 0.6492845315473396, "grad_norm": 7.17288530206866, "learning_rate": 5.785832558311849e-08, "loss": 0.7901, "step": 6568 }, { "epoch": 0.649383387292094, "grad_norm": 6.420706113658787, "learning_rate": 5.782928883877749e-08, "loss": 0.6779, "step": 6569 }, { "epoch": 0.6494822430368484, "grad_norm": 6.252471719352112, "learning_rate": 5.780025641842088e-08, "loss": 0.6122, "step": 6570 }, { "epoch": 0.649581098781603, "grad_norm": 9.832590061246103, "learning_rate": 5.777122832502544e-08, "loss": 0.7687, "step": 6571 }, { "epoch": 0.6496799545263574, "grad_norm": 2.807221496512729, "learning_rate": 5.774220456156763e-08, "loss": 0.7633, "step": 6572 }, { "epoch": 0.6497788102711118, "grad_norm": 4.062038456568383, "learning_rate": 5.7713185131023435e-08, "loss": 0.7225, "step": 6573 }, { "epoch": 0.6498776660158664, "grad_norm": 5.7771650079473496, "learning_rate": 5.768417003636828e-08, "loss": 0.675, "step": 6574 }, { "epoch": 0.6499765217606208, "grad_norm": 4.266881192923484, "learning_rate": 5.765515928057729e-08, "loss": 0.651, "step": 6575 }, { "epoch": 0.6500753775053753, "grad_norm": 3.664575109830586, "learning_rate": 5.7626152866625135e-08, "loss": 0.7167, "step": 6576 }, { "epoch": 0.6501742332501298, "grad_norm": 4.110501624296853, "learning_rate": 5.7597150797485905e-08, "loss": 0.7971, "step": 6577 }, { "epoch": 0.6502730889948842, "grad_norm": 4.378608983609914, "learning_rate": 5.756815307613341e-08, "loss": 0.642, "step": 6578 }, { "epoch": 0.6503719447396387, "grad_norm": 4.819508230011718, "learning_rate": 5.753915970554085e-08, "loss": 0.7512, "step": 6579 }, { "epoch": 0.6504708004843931, "grad_norm": 3.182412232648986, "learning_rate": 5.751017068868118e-08, "loss": 0.7639, "step": 6580 }, { "epoch": 0.6505696562291476, "grad_norm": 8.213687734841196, "learning_rate": 5.748118602852668e-08, "loss": 0.759, "step": 6581 }, { "epoch": 0.6506685119739021, "grad_norm": 3.7369547214854157, "learning_rate": 5.745220572804941e-08, "loss": 0.6639, "step": 6582 }, { "epoch": 0.6507673677186565, "grad_norm": 6.041745222547543, "learning_rate": 5.742322979022076e-08, "loss": 0.7029, "step": 6583 }, { "epoch": 0.650866223463411, "grad_norm": 3.0137967526597707, "learning_rate": 5.73942582180119e-08, "loss": 0.6633, "step": 6584 }, { "epoch": 0.6509650792081655, "grad_norm": 3.695073792941132, "learning_rate": 5.7365291014393334e-08, "loss": 0.693, "step": 6585 }, { "epoch": 0.6510639349529199, "grad_norm": 4.540839714056835, "learning_rate": 5.733632818233528e-08, "loss": 0.7535, "step": 6586 }, { "epoch": 0.6511627906976745, "grad_norm": 5.7605482579052545, "learning_rate": 5.7307369724807476e-08, "loss": 0.6559, "step": 6587 }, { "epoch": 0.6512616464424289, "grad_norm": 3.1298386030881393, "learning_rate": 5.727841564477909e-08, "loss": 0.7257, "step": 6588 }, { "epoch": 0.6513605021871833, "grad_norm": 13.857883985502385, "learning_rate": 5.7249465945219e-08, "loss": 0.7683, "step": 6589 }, { "epoch": 0.6514593579319378, "grad_norm": 3.9873735287582246, "learning_rate": 5.7220520629095604e-08, "loss": 0.7648, "step": 6590 }, { "epoch": 0.6515582136766923, "grad_norm": 3.6633109353701303, "learning_rate": 5.7191579699376736e-08, "loss": 0.73, "step": 6591 }, { "epoch": 0.6516570694214467, "grad_norm": 4.07605565177372, "learning_rate": 5.71626431590299e-08, "loss": 0.7466, "step": 6592 }, { "epoch": 0.6517559251662012, "grad_norm": 2.8545320572611503, "learning_rate": 5.713371101102216e-08, "loss": 0.6904, "step": 6593 }, { "epoch": 0.6518547809109557, "grad_norm": 3.927951716471381, "learning_rate": 5.7104783258319977e-08, "loss": 0.6475, "step": 6594 }, { "epoch": 0.6519536366557102, "grad_norm": 4.179723301917675, "learning_rate": 5.707585990388957e-08, "loss": 0.6155, "step": 6595 }, { "epoch": 0.6520524924004646, "grad_norm": 3.140564880290087, "learning_rate": 5.704694095069652e-08, "loss": 0.7917, "step": 6596 }, { "epoch": 0.6521513481452191, "grad_norm": 3.4615098731525826, "learning_rate": 5.701802640170611e-08, "loss": 0.6747, "step": 6597 }, { "epoch": 0.6522502038899736, "grad_norm": 24.18937743896222, "learning_rate": 5.698911625988306e-08, "loss": 0.7949, "step": 6598 }, { "epoch": 0.652349059634728, "grad_norm": 5.800519546772479, "learning_rate": 5.696021052819164e-08, "loss": 0.6025, "step": 6599 }, { "epoch": 0.6524479153794824, "grad_norm": 3.8920093315256983, "learning_rate": 5.6931309209595766e-08, "loss": 0.7015, "step": 6600 }, { "epoch": 0.652546771124237, "grad_norm": 6.350962104727854, "learning_rate": 5.6902412307058855e-08, "loss": 0.7892, "step": 6601 }, { "epoch": 0.6526456268689914, "grad_norm": 3.755972410226422, "learning_rate": 5.687351982354378e-08, "loss": 0.7197, "step": 6602 }, { "epoch": 0.6527444826137458, "grad_norm": 3.423339335265188, "learning_rate": 5.684463176201308e-08, "loss": 0.599, "step": 6603 }, { "epoch": 0.6528433383585004, "grad_norm": 4.290524552626333, "learning_rate": 5.681574812542887e-08, "loss": 0.6615, "step": 6604 }, { "epoch": 0.6529421941032548, "grad_norm": 3.1246216251968852, "learning_rate": 5.6786868916752615e-08, "loss": 0.712, "step": 6605 }, { "epoch": 0.6530410498480093, "grad_norm": 4.6768083002398075, "learning_rate": 5.675799413894557e-08, "loss": 0.7698, "step": 6606 }, { "epoch": 0.6531399055927638, "grad_norm": 3.8541974917265636, "learning_rate": 5.672912379496833e-08, "loss": 0.7396, "step": 6607 }, { "epoch": 0.6532387613375182, "grad_norm": 3.9902344358404096, "learning_rate": 5.670025788778113e-08, "loss": 0.7225, "step": 6608 }, { "epoch": 0.6533376170822727, "grad_norm": 5.675479736375696, "learning_rate": 5.667139642034382e-08, "loss": 0.672, "step": 6609 }, { "epoch": 0.6534364728270272, "grad_norm": 5.507322172615904, "learning_rate": 5.664253939561563e-08, "loss": 0.6401, "step": 6610 }, { "epoch": 0.6535353285717816, "grad_norm": 6.681148729906989, "learning_rate": 5.661368681655545e-08, "loss": 0.8024, "step": 6611 }, { "epoch": 0.6536341843165361, "grad_norm": 4.241971000295251, "learning_rate": 5.658483868612173e-08, "loss": 0.7631, "step": 6612 }, { "epoch": 0.6537330400612905, "grad_norm": 3.6680550140206827, "learning_rate": 5.655599500727235e-08, "loss": 0.7187, "step": 6613 }, { "epoch": 0.6538318958060451, "grad_norm": 6.675880275448662, "learning_rate": 5.652715578296488e-08, "loss": 0.7161, "step": 6614 }, { "epoch": 0.6539307515507995, "grad_norm": 5.001396819240162, "learning_rate": 5.64983210161563e-08, "loss": 0.6874, "step": 6615 }, { "epoch": 0.6540296072955539, "grad_norm": 4.294958081435149, "learning_rate": 5.646949070980318e-08, "loss": 0.8579, "step": 6616 }, { "epoch": 0.6541284630403085, "grad_norm": 4.409366906265974, "learning_rate": 5.64406648668617e-08, "loss": 0.6468, "step": 6617 }, { "epoch": 0.6542273187850629, "grad_norm": 3.368601110768223, "learning_rate": 5.641184349028747e-08, "loss": 0.6216, "step": 6618 }, { "epoch": 0.6543261745298173, "grad_norm": 3.205988444732904, "learning_rate": 5.63830265830357e-08, "loss": 0.7164, "step": 6619 }, { "epoch": 0.6544250302745719, "grad_norm": 3.6998914405323635, "learning_rate": 5.6354214148061205e-08, "loss": 0.7293, "step": 6620 }, { "epoch": 0.6545238860193263, "grad_norm": 4.328171058643871, "learning_rate": 5.632540618831818e-08, "loss": 0.7744, "step": 6621 }, { "epoch": 0.6546227417640808, "grad_norm": 3.3848451908408737, "learning_rate": 5.629660270676051e-08, "loss": 0.709, "step": 6622 }, { "epoch": 0.6547215975088352, "grad_norm": 3.804742458976016, "learning_rate": 5.626780370634161e-08, "loss": 0.7115, "step": 6623 }, { "epoch": 0.6548204532535897, "grad_norm": 4.635032443572569, "learning_rate": 5.6239009190014297e-08, "loss": 0.6999, "step": 6624 }, { "epoch": 0.6549193089983442, "grad_norm": 7.269119753247966, "learning_rate": 5.621021916073108e-08, "loss": 0.8301, "step": 6625 }, { "epoch": 0.6550181647430986, "grad_norm": 5.094045449840713, "learning_rate": 5.6181433621443974e-08, "loss": 0.7268, "step": 6626 }, { "epoch": 0.6551170204878531, "grad_norm": 3.859522329814548, "learning_rate": 5.615265257510444e-08, "loss": 0.7998, "step": 6627 }, { "epoch": 0.6552158762326076, "grad_norm": 12.222383852521206, "learning_rate": 5.612387602466364e-08, "loss": 0.7243, "step": 6628 }, { "epoch": 0.655314731977362, "grad_norm": 4.6575014951615525, "learning_rate": 5.609510397307209e-08, "loss": 0.6645, "step": 6629 }, { "epoch": 0.6554135877221166, "grad_norm": 4.7088107413095095, "learning_rate": 5.606633642328003e-08, "loss": 0.6557, "step": 6630 }, { "epoch": 0.655512443466871, "grad_norm": 3.6332271986886187, "learning_rate": 5.6037573378237045e-08, "loss": 0.6921, "step": 6631 }, { "epoch": 0.6556112992116254, "grad_norm": 3.509381485555218, "learning_rate": 5.6008814840892484e-08, "loss": 0.6068, "step": 6632 }, { "epoch": 0.6557101549563799, "grad_norm": 6.80605661812052, "learning_rate": 5.598006081419498e-08, "loss": 0.7931, "step": 6633 }, { "epoch": 0.6558090107011344, "grad_norm": 4.34009183505453, "learning_rate": 5.5951311301092955e-08, "loss": 0.629, "step": 6634 }, { "epoch": 0.6559078664458888, "grad_norm": 145.63758619303948, "learning_rate": 5.5922566304534144e-08, "loss": 0.7253, "step": 6635 }, { "epoch": 0.6560067221906433, "grad_norm": 3.079590707019999, "learning_rate": 5.589382582746598e-08, "loss": 0.666, "step": 6636 }, { "epoch": 0.6561055779353978, "grad_norm": 3.92505507503517, "learning_rate": 5.58650898728354e-08, "loss": 0.6275, "step": 6637 }, { "epoch": 0.6562044336801522, "grad_norm": 4.020464408465863, "learning_rate": 5.5836358443588785e-08, "loss": 0.7323, "step": 6638 }, { "epoch": 0.6563032894249067, "grad_norm": 5.572547388924002, "learning_rate": 5.580763154267218e-08, "loss": 0.7013, "step": 6639 }, { "epoch": 0.6564021451696612, "grad_norm": 5.0633553194692755, "learning_rate": 5.5778909173031044e-08, "loss": 0.6215, "step": 6640 }, { "epoch": 0.6565010009144157, "grad_norm": 7.348870561947552, "learning_rate": 5.575019133761046e-08, "loss": 0.7301, "step": 6641 }, { "epoch": 0.6565998566591701, "grad_norm": 3.679799119860612, "learning_rate": 5.5721478039355076e-08, "loss": 0.7391, "step": 6642 }, { "epoch": 0.6566987124039245, "grad_norm": 3.500588295984293, "learning_rate": 5.569276928120892e-08, "loss": 0.7143, "step": 6643 }, { "epoch": 0.6567975681486791, "grad_norm": 4.1070943119419105, "learning_rate": 5.5664065066115685e-08, "loss": 0.6597, "step": 6644 }, { "epoch": 0.6568964238934335, "grad_norm": 3.3475519987480165, "learning_rate": 5.563536539701863e-08, "loss": 0.5269, "step": 6645 }, { "epoch": 0.6569952796381879, "grad_norm": 5.339517316091903, "learning_rate": 5.560667027686038e-08, "loss": 0.7679, "step": 6646 }, { "epoch": 0.6570941353829425, "grad_norm": 4.645189692176482, "learning_rate": 5.557797970858329e-08, "loss": 0.7332, "step": 6647 }, { "epoch": 0.6571929911276969, "grad_norm": 4.685681982763314, "learning_rate": 5.5549293695129064e-08, "loss": 0.7926, "step": 6648 }, { "epoch": 0.6572918468724513, "grad_norm": 4.637700469772344, "learning_rate": 5.552061223943912e-08, "loss": 0.8, "step": 6649 }, { "epoch": 0.6573907026172059, "grad_norm": 7.454173026709327, "learning_rate": 5.549193534445425e-08, "loss": 0.5874, "step": 6650 }, { "epoch": 0.6574895583619603, "grad_norm": 3.361228536934839, "learning_rate": 5.546326301311488e-08, "loss": 0.8492, "step": 6651 }, { "epoch": 0.6575884141067148, "grad_norm": 9.171976025493011, "learning_rate": 5.5434595248360906e-08, "loss": 0.6859, "step": 6652 }, { "epoch": 0.6576872698514692, "grad_norm": 3.8410310624189696, "learning_rate": 5.5405932053131834e-08, "loss": 0.6456, "step": 6653 }, { "epoch": 0.6577861255962237, "grad_norm": 3.049508060402196, "learning_rate": 5.537727343036658e-08, "loss": 0.6938, "step": 6654 }, { "epoch": 0.6578849813409782, "grad_norm": 4.837043752267119, "learning_rate": 5.534861938300371e-08, "loss": 0.7803, "step": 6655 }, { "epoch": 0.6579838370857326, "grad_norm": 3.652155587438829, "learning_rate": 5.5319969913981315e-08, "loss": 0.7641, "step": 6656 }, { "epoch": 0.6580826928304871, "grad_norm": 4.507283987901102, "learning_rate": 5.529132502623688e-08, "loss": 0.5167, "step": 6657 }, { "epoch": 0.6581815485752416, "grad_norm": 3.630754632681864, "learning_rate": 5.5262684722707564e-08, "loss": 0.6465, "step": 6658 }, { "epoch": 0.658280404319996, "grad_norm": 4.415447095278395, "learning_rate": 5.523404900633004e-08, "loss": 0.718, "step": 6659 }, { "epoch": 0.6583792600647506, "grad_norm": 3.8189958706613614, "learning_rate": 5.520541788004042e-08, "loss": 0.6904, "step": 6660 }, { "epoch": 0.658478115809505, "grad_norm": 4.534146437697531, "learning_rate": 5.517679134677446e-08, "loss": 0.7339, "step": 6661 }, { "epoch": 0.6585769715542594, "grad_norm": 3.6673094293964223, "learning_rate": 5.5148169409467317e-08, "loss": 0.7711, "step": 6662 }, { "epoch": 0.6586758272990139, "grad_norm": 8.827378519442334, "learning_rate": 5.51195520710538e-08, "loss": 0.6305, "step": 6663 }, { "epoch": 0.6587746830437684, "grad_norm": 5.465121191151299, "learning_rate": 5.509093933446822e-08, "loss": 0.7247, "step": 6664 }, { "epoch": 0.6588735387885228, "grad_norm": 3.1212084137016882, "learning_rate": 5.506233120264432e-08, "loss": 0.7097, "step": 6665 }, { "epoch": 0.6589723945332773, "grad_norm": 4.2926051055695345, "learning_rate": 5.503372767851551e-08, "loss": 0.7539, "step": 6666 }, { "epoch": 0.6590712502780318, "grad_norm": 13.87567208627923, "learning_rate": 5.50051287650146e-08, "loss": 0.605, "step": 6667 }, { "epoch": 0.6591701060227863, "grad_norm": 5.046959270333449, "learning_rate": 5.497653446507404e-08, "loss": 0.8018, "step": 6668 }, { "epoch": 0.6592689617675407, "grad_norm": 4.281588619876789, "learning_rate": 5.49479447816257e-08, "loss": 0.707, "step": 6669 }, { "epoch": 0.6593678175122952, "grad_norm": 3.5107722721360424, "learning_rate": 5.491935971760109e-08, "loss": 0.7047, "step": 6670 }, { "epoch": 0.6594666732570497, "grad_norm": 5.721236485242851, "learning_rate": 5.489077927593111e-08, "loss": 0.719, "step": 6671 }, { "epoch": 0.6595655290018041, "grad_norm": 5.280059719979246, "learning_rate": 5.486220345954635e-08, "loss": 0.6649, "step": 6672 }, { "epoch": 0.6596643847465585, "grad_norm": 6.331957336630566, "learning_rate": 5.4833632271376765e-08, "loss": 0.6673, "step": 6673 }, { "epoch": 0.6597632404913131, "grad_norm": 7.689295036137141, "learning_rate": 5.480506571435193e-08, "loss": 0.6193, "step": 6674 }, { "epoch": 0.6598620962360675, "grad_norm": 4.000480239150988, "learning_rate": 5.477650379140097e-08, "loss": 0.7635, "step": 6675 }, { "epoch": 0.6599609519808219, "grad_norm": 17.081719789805117, "learning_rate": 5.4747946505452437e-08, "loss": 0.648, "step": 6676 }, { "epoch": 0.6600598077255765, "grad_norm": 2.9995817272118956, "learning_rate": 5.471939385943445e-08, "loss": 0.6871, "step": 6677 }, { "epoch": 0.6601586634703309, "grad_norm": 3.610094409180275, "learning_rate": 5.469084585627475e-08, "loss": 0.7708, "step": 6678 }, { "epoch": 0.6602575192150854, "grad_norm": 4.722887626359667, "learning_rate": 5.466230249890038e-08, "loss": 0.6968, "step": 6679 }, { "epoch": 0.6603563749598399, "grad_norm": 3.3820850579170694, "learning_rate": 5.463376379023814e-08, "loss": 0.7486, "step": 6680 }, { "epoch": 0.6604552307045943, "grad_norm": 4.779615838499907, "learning_rate": 5.460522973321424e-08, "loss": 0.733, "step": 6681 }, { "epoch": 0.6605540864493488, "grad_norm": 4.673893277640158, "learning_rate": 5.457670033075439e-08, "loss": 0.7158, "step": 6682 }, { "epoch": 0.6606529421941033, "grad_norm": 2.831506343490153, "learning_rate": 5.4548175585783905e-08, "loss": 0.6226, "step": 6683 }, { "epoch": 0.6607517979388577, "grad_norm": 4.223456994983366, "learning_rate": 5.4519655501227525e-08, "loss": 0.6282, "step": 6684 }, { "epoch": 0.6608506536836122, "grad_norm": 2.927438881574786, "learning_rate": 5.449114008000962e-08, "loss": 0.6375, "step": 6685 }, { "epoch": 0.6609495094283666, "grad_norm": 3.6075153838735092, "learning_rate": 5.446262932505401e-08, "loss": 0.5984, "step": 6686 }, { "epoch": 0.6610483651731212, "grad_norm": 5.765833421856698, "learning_rate": 5.443412323928398e-08, "loss": 0.7899, "step": 6687 }, { "epoch": 0.6611472209178756, "grad_norm": 11.710287352056758, "learning_rate": 5.4405621825622495e-08, "loss": 0.5728, "step": 6688 }, { "epoch": 0.66124607666263, "grad_norm": 5.1658009294261085, "learning_rate": 5.4377125086991946e-08, "loss": 0.825, "step": 6689 }, { "epoch": 0.6613449324073846, "grad_norm": 4.186356871553078, "learning_rate": 5.434863302631421e-08, "loss": 0.7078, "step": 6690 }, { "epoch": 0.661443788152139, "grad_norm": 3.9542103824062655, "learning_rate": 5.432014564651073e-08, "loss": 0.7571, "step": 6691 }, { "epoch": 0.6615426438968934, "grad_norm": 5.3962978519305835, "learning_rate": 5.4291662950502534e-08, "loss": 0.7389, "step": 6692 }, { "epoch": 0.661641499641648, "grad_norm": 3.204008098484777, "learning_rate": 5.426318494121003e-08, "loss": 0.6519, "step": 6693 }, { "epoch": 0.6617403553864024, "grad_norm": 5.37393861788962, "learning_rate": 5.4234711621553286e-08, "loss": 0.7114, "step": 6694 }, { "epoch": 0.6618392111311568, "grad_norm": 5.178694733917559, "learning_rate": 5.420624299445173e-08, "loss": 0.687, "step": 6695 }, { "epoch": 0.6619380668759113, "grad_norm": 3.6982257106986474, "learning_rate": 5.417777906282445e-08, "loss": 0.6679, "step": 6696 }, { "epoch": 0.6620369226206658, "grad_norm": 2.936303312010156, "learning_rate": 5.414931982959005e-08, "loss": 0.6741, "step": 6697 }, { "epoch": 0.6621357783654203, "grad_norm": 8.295375689509793, "learning_rate": 5.412086529766651e-08, "loss": 0.7356, "step": 6698 }, { "epoch": 0.6622346341101747, "grad_norm": 8.230119346607147, "learning_rate": 5.409241546997146e-08, "loss": 0.6095, "step": 6699 }, { "epoch": 0.6623334898549292, "grad_norm": 4.7780774120361045, "learning_rate": 5.406397034942207e-08, "loss": 0.6397, "step": 6700 }, { "epoch": 0.6624323455996837, "grad_norm": 10.90626277701858, "learning_rate": 5.4035529938934924e-08, "loss": 0.7011, "step": 6701 }, { "epoch": 0.6625312013444381, "grad_norm": 13.079605774736509, "learning_rate": 5.40070942414261e-08, "loss": 0.7079, "step": 6702 }, { "epoch": 0.6626300570891926, "grad_norm": 3.2431121305150006, "learning_rate": 5.3978663259811374e-08, "loss": 0.7387, "step": 6703 }, { "epoch": 0.6627289128339471, "grad_norm": 5.103224978675987, "learning_rate": 5.3950236997005814e-08, "loss": 0.7099, "step": 6704 }, { "epoch": 0.6628277685787015, "grad_norm": 6.245254496651759, "learning_rate": 5.392181545592423e-08, "loss": 0.7672, "step": 6705 }, { "epoch": 0.662926624323456, "grad_norm": 3.7899643123841336, "learning_rate": 5.3893398639480705e-08, "loss": 0.7761, "step": 6706 }, { "epoch": 0.6630254800682105, "grad_norm": 3.923130698650747, "learning_rate": 5.386498655058904e-08, "loss": 0.6512, "step": 6707 }, { "epoch": 0.6631243358129649, "grad_norm": 5.271545825101287, "learning_rate": 5.383657919216251e-08, "loss": 0.6748, "step": 6708 }, { "epoch": 0.6632231915577194, "grad_norm": 6.51764594875741, "learning_rate": 5.380817656711378e-08, "loss": 0.7825, "step": 6709 }, { "epoch": 0.6633220473024739, "grad_norm": 7.103833863614909, "learning_rate": 5.377977867835517e-08, "loss": 0.6775, "step": 6710 }, { "epoch": 0.6634209030472283, "grad_norm": 5.294886270756204, "learning_rate": 5.3751385528798496e-08, "loss": 0.6566, "step": 6711 }, { "epoch": 0.6635197587919828, "grad_norm": 7.625777983818856, "learning_rate": 5.372299712135497e-08, "loss": 0.6932, "step": 6712 }, { "epoch": 0.6636186145367373, "grad_norm": 3.900991956430983, "learning_rate": 5.369461345893548e-08, "loss": 0.6964, "step": 6713 }, { "epoch": 0.6637174702814918, "grad_norm": 4.832581465427084, "learning_rate": 5.366623454445035e-08, "loss": 0.6388, "step": 6714 }, { "epoch": 0.6638163260262462, "grad_norm": 3.0584650737084154, "learning_rate": 5.3637860380809366e-08, "loss": 0.7081, "step": 6715 }, { "epoch": 0.6639151817710006, "grad_norm": 3.2095551979055537, "learning_rate": 5.3609490970921936e-08, "loss": 0.695, "step": 6716 }, { "epoch": 0.6640140375157552, "grad_norm": 7.956383656781262, "learning_rate": 5.358112631769687e-08, "loss": 0.7015, "step": 6717 }, { "epoch": 0.6641128932605096, "grad_norm": 3.2993593768481793, "learning_rate": 5.3552766424042615e-08, "loss": 0.751, "step": 6718 }, { "epoch": 0.664211749005264, "grad_norm": 9.22959414390837, "learning_rate": 5.352441129286698e-08, "loss": 0.6258, "step": 6719 }, { "epoch": 0.6643106047500186, "grad_norm": 7.462663106592477, "learning_rate": 5.349606092707743e-08, "loss": 0.7183, "step": 6720 }, { "epoch": 0.664409460494773, "grad_norm": 3.4798346352741154, "learning_rate": 5.346771532958083e-08, "loss": 0.6721, "step": 6721 }, { "epoch": 0.6645083162395274, "grad_norm": 3.9204322294949647, "learning_rate": 5.3439374503283644e-08, "loss": 0.7589, "step": 6722 }, { "epoch": 0.664607171984282, "grad_norm": 7.778321217831362, "learning_rate": 5.3411038451091763e-08, "loss": 0.7063, "step": 6723 }, { "epoch": 0.6647060277290364, "grad_norm": 23.901329207195484, "learning_rate": 5.3382707175910636e-08, "loss": 0.6886, "step": 6724 }, { "epoch": 0.6648048834737909, "grad_norm": 5.58819900601528, "learning_rate": 5.335438068064528e-08, "loss": 0.6985, "step": 6725 }, { "epoch": 0.6649037392185453, "grad_norm": 4.606073810574221, "learning_rate": 5.332605896820008e-08, "loss": 0.7136, "step": 6726 }, { "epoch": 0.6650025949632998, "grad_norm": 3.5518976583201183, "learning_rate": 5.329774204147908e-08, "loss": 0.6674, "step": 6727 }, { "epoch": 0.6651014507080543, "grad_norm": 4.448883084877687, "learning_rate": 5.3269429903385685e-08, "loss": 0.6509, "step": 6728 }, { "epoch": 0.6652003064528087, "grad_norm": 4.757073410811336, "learning_rate": 5.3241122556822935e-08, "loss": 0.643, "step": 6729 }, { "epoch": 0.6652991621975632, "grad_norm": 6.115019030258822, "learning_rate": 5.321282000469336e-08, "loss": 0.6429, "step": 6730 }, { "epoch": 0.6653980179423177, "grad_norm": 4.405555246533661, "learning_rate": 5.318452224989889e-08, "loss": 0.6952, "step": 6731 }, { "epoch": 0.6654968736870721, "grad_norm": 2.8612379345927823, "learning_rate": 5.315622929534109e-08, "loss": 0.7867, "step": 6732 }, { "epoch": 0.6655957294318267, "grad_norm": 5.794354467530149, "learning_rate": 5.312794114392103e-08, "loss": 0.5582, "step": 6733 }, { "epoch": 0.6656945851765811, "grad_norm": 6.819754001104054, "learning_rate": 5.309965779853915e-08, "loss": 0.6935, "step": 6734 }, { "epoch": 0.6657934409213355, "grad_norm": 7.585053969541475, "learning_rate": 5.307137926209557e-08, "loss": 0.7151, "step": 6735 }, { "epoch": 0.66589229666609, "grad_norm": 3.383240652383676, "learning_rate": 5.3043105537489764e-08, "loss": 0.635, "step": 6736 }, { "epoch": 0.6659911524108445, "grad_norm": 4.958498991872869, "learning_rate": 5.301483662762085e-08, "loss": 0.6732, "step": 6737 }, { "epoch": 0.6660900081555989, "grad_norm": 5.470041455548959, "learning_rate": 5.2986572535387343e-08, "loss": 0.8272, "step": 6738 }, { "epoch": 0.6661888639003534, "grad_norm": 35.15042824935367, "learning_rate": 5.2958313263687346e-08, "loss": 0.5697, "step": 6739 }, { "epoch": 0.6662877196451079, "grad_norm": 8.500426293830461, "learning_rate": 5.2930058815418386e-08, "loss": 0.6636, "step": 6740 }, { "epoch": 0.6663865753898623, "grad_norm": 3.3516527176877906, "learning_rate": 5.2901809193477586e-08, "loss": 0.7141, "step": 6741 }, { "epoch": 0.6664854311346168, "grad_norm": 4.9408431966948525, "learning_rate": 5.287356440076147e-08, "loss": 0.724, "step": 6742 }, { "epoch": 0.6665842868793713, "grad_norm": 5.552864779419335, "learning_rate": 5.2845324440166175e-08, "loss": 0.7145, "step": 6743 }, { "epoch": 0.6666831426241258, "grad_norm": 5.832433254921617, "learning_rate": 5.28170893145873e-08, "loss": 0.7185, "step": 6744 }, { "epoch": 0.6667819983688802, "grad_norm": 56.326360388161504, "learning_rate": 5.2788859026919893e-08, "loss": 0.6774, "step": 6745 }, { "epoch": 0.6668808541136346, "grad_norm": 4.149143964798494, "learning_rate": 5.2760633580058575e-08, "loss": 0.6959, "step": 6746 }, { "epoch": 0.6669797098583892, "grad_norm": 3.888407298099877, "learning_rate": 5.27324129768975e-08, "loss": 0.6002, "step": 6747 }, { "epoch": 0.6670785656031436, "grad_norm": 6.129426366986402, "learning_rate": 5.270419722033017e-08, "loss": 0.7233, "step": 6748 }, { "epoch": 0.667177421347898, "grad_norm": 3.058165519691487, "learning_rate": 5.267598631324981e-08, "loss": 0.66, "step": 6749 }, { "epoch": 0.6672762770926526, "grad_norm": 3.5812140948908184, "learning_rate": 5.2647780258548924e-08, "loss": 0.7314, "step": 6750 }, { "epoch": 0.667375132837407, "grad_norm": 6.030730302087922, "learning_rate": 5.261957905911969e-08, "loss": 0.7036, "step": 6751 }, { "epoch": 0.6674739885821614, "grad_norm": 5.032620753796471, "learning_rate": 5.259138271785375e-08, "loss": 0.715, "step": 6752 }, { "epoch": 0.667572844326916, "grad_norm": 6.398874442337878, "learning_rate": 5.2563191237642144e-08, "loss": 0.6756, "step": 6753 }, { "epoch": 0.6676717000716704, "grad_norm": 3.3258611670214577, "learning_rate": 5.253500462137557e-08, "loss": 0.6802, "step": 6754 }, { "epoch": 0.6677705558164249, "grad_norm": 3.054905637578774, "learning_rate": 5.2506822871944124e-08, "loss": 0.6874, "step": 6755 }, { "epoch": 0.6678694115611793, "grad_norm": 6.411033781430989, "learning_rate": 5.247864599223737e-08, "loss": 0.6923, "step": 6756 }, { "epoch": 0.6679682673059338, "grad_norm": 3.491075919522455, "learning_rate": 5.245047398514448e-08, "loss": 0.7605, "step": 6757 }, { "epoch": 0.6680671230506883, "grad_norm": 3.732582222289584, "learning_rate": 5.242230685355411e-08, "loss": 0.6821, "step": 6758 }, { "epoch": 0.6681659787954427, "grad_norm": 4.185982750832962, "learning_rate": 5.239414460035433e-08, "loss": 0.7959, "step": 6759 }, { "epoch": 0.6682648345401973, "grad_norm": 6.68517367613169, "learning_rate": 5.2365987228432815e-08, "loss": 0.7958, "step": 6760 }, { "epoch": 0.6683636902849517, "grad_norm": 3.4601949386836526, "learning_rate": 5.233783474067662e-08, "loss": 0.7638, "step": 6761 }, { "epoch": 0.6684625460297061, "grad_norm": 2.964957852677751, "learning_rate": 5.230968713997241e-08, "loss": 0.7627, "step": 6762 }, { "epoch": 0.6685614017744607, "grad_norm": 14.975749292150468, "learning_rate": 5.2281544429206336e-08, "loss": 0.7089, "step": 6763 }, { "epoch": 0.6686602575192151, "grad_norm": 4.170970821570884, "learning_rate": 5.225340661126395e-08, "loss": 0.6983, "step": 6764 }, { "epoch": 0.6687591132639695, "grad_norm": 7.5475294264630985, "learning_rate": 5.22252736890304e-08, "loss": 0.6863, "step": 6765 }, { "epoch": 0.6688579690087241, "grad_norm": 3.2996723429130395, "learning_rate": 5.219714566539035e-08, "loss": 0.6533, "step": 6766 }, { "epoch": 0.6689568247534785, "grad_norm": 3.9458950498699434, "learning_rate": 5.216902254322783e-08, "loss": 0.6915, "step": 6767 }, { "epoch": 0.6690556804982329, "grad_norm": 3.100522704483637, "learning_rate": 5.2140904325426484e-08, "loss": 0.5747, "step": 6768 }, { "epoch": 0.6691545362429874, "grad_norm": 3.8668015605349835, "learning_rate": 5.211279101486947e-08, "loss": 0.7475, "step": 6769 }, { "epoch": 0.6692533919877419, "grad_norm": 3.2739451753962068, "learning_rate": 5.2084682614439325e-08, "loss": 0.6408, "step": 6770 }, { "epoch": 0.6693522477324964, "grad_norm": 4.89288785563659, "learning_rate": 5.20565791270182e-08, "loss": 0.8086, "step": 6771 }, { "epoch": 0.6694511034772508, "grad_norm": 6.072262667347912, "learning_rate": 5.2028480555487676e-08, "loss": 0.7817, "step": 6772 }, { "epoch": 0.6695499592220053, "grad_norm": 3.409823859514897, "learning_rate": 5.200038690272881e-08, "loss": 0.7101, "step": 6773 }, { "epoch": 0.6696488149667598, "grad_norm": 4.6192354139885685, "learning_rate": 5.1972298171622255e-08, "loss": 0.7608, "step": 6774 }, { "epoch": 0.6697476707115142, "grad_norm": 8.771578910975975, "learning_rate": 5.194421436504802e-08, "loss": 0.6094, "step": 6775 }, { "epoch": 0.6698465264562687, "grad_norm": 6.997581682260295, "learning_rate": 5.191613548588574e-08, "loss": 0.6895, "step": 6776 }, { "epoch": 0.6699453822010232, "grad_norm": 4.909476160013606, "learning_rate": 5.1888061537014505e-08, "loss": 0.6297, "step": 6777 }, { "epoch": 0.6700442379457776, "grad_norm": 2.8885665044087894, "learning_rate": 5.185999252131283e-08, "loss": 0.674, "step": 6778 }, { "epoch": 0.670143093690532, "grad_norm": 11.126686014382956, "learning_rate": 5.18319284416588e-08, "loss": 0.7662, "step": 6779 }, { "epoch": 0.6702419494352866, "grad_norm": 9.39467778636712, "learning_rate": 5.1803869300930034e-08, "loss": 0.6784, "step": 6780 }, { "epoch": 0.670340805180041, "grad_norm": 4.989638768165759, "learning_rate": 5.177581510200347e-08, "loss": 0.6454, "step": 6781 }, { "epoch": 0.6704396609247955, "grad_norm": 5.263844535098252, "learning_rate": 5.1747765847755753e-08, "loss": 0.74, "step": 6782 }, { "epoch": 0.67053851666955, "grad_norm": 16.203609667606223, "learning_rate": 5.171972154106286e-08, "loss": 0.6408, "step": 6783 }, { "epoch": 0.6706373724143044, "grad_norm": 3.6255576136962, "learning_rate": 5.169168218480031e-08, "loss": 0.7453, "step": 6784 }, { "epoch": 0.6707362281590589, "grad_norm": 4.615145572903434, "learning_rate": 5.16636477818432e-08, "loss": 0.7234, "step": 6785 }, { "epoch": 0.6708350839038134, "grad_norm": 5.333011558170849, "learning_rate": 5.1635618335065975e-08, "loss": 0.721, "step": 6786 }, { "epoch": 0.6709339396485678, "grad_norm": 5.451128806865163, "learning_rate": 5.160759384734266e-08, "loss": 0.7154, "step": 6787 }, { "epoch": 0.6710327953933223, "grad_norm": 3.60497325291686, "learning_rate": 5.157957432154678e-08, "loss": 0.7974, "step": 6788 }, { "epoch": 0.6711316511380767, "grad_norm": 6.451600684100504, "learning_rate": 5.155155976055131e-08, "loss": 0.6748, "step": 6789 }, { "epoch": 0.6712305068828313, "grad_norm": 4.0501398428670115, "learning_rate": 5.152355016722869e-08, "loss": 0.7143, "step": 6790 }, { "epoch": 0.6713293626275857, "grad_norm": 5.334725346163968, "learning_rate": 5.149554554445096e-08, "loss": 0.6662, "step": 6791 }, { "epoch": 0.6714282183723401, "grad_norm": 15.989212688595666, "learning_rate": 5.1467545895089506e-08, "loss": 0.776, "step": 6792 }, { "epoch": 0.6715270741170947, "grad_norm": 2.9152107773792846, "learning_rate": 5.1439551222015344e-08, "loss": 0.6135, "step": 6793 }, { "epoch": 0.6716259298618491, "grad_norm": 3.098640885898533, "learning_rate": 5.1411561528098844e-08, "loss": 0.7281, "step": 6794 }, { "epoch": 0.6717247856066035, "grad_norm": 5.056906080072996, "learning_rate": 5.1383576816209997e-08, "loss": 0.5936, "step": 6795 }, { "epoch": 0.6718236413513581, "grad_norm": 6.999161458538162, "learning_rate": 5.135559708921824e-08, "loss": 0.707, "step": 6796 }, { "epoch": 0.6719224970961125, "grad_norm": 4.71023108554262, "learning_rate": 5.132762234999239e-08, "loss": 0.7689, "step": 6797 }, { "epoch": 0.672021352840867, "grad_norm": 3.8900876620592997, "learning_rate": 5.129965260140092e-08, "loss": 0.6388, "step": 6798 }, { "epoch": 0.6721202085856214, "grad_norm": 3.2319729226906646, "learning_rate": 5.1271687846311725e-08, "loss": 0.7253, "step": 6799 }, { "epoch": 0.6722190643303759, "grad_norm": 4.028619592893613, "learning_rate": 5.124372808759212e-08, "loss": 0.7008, "step": 6800 }, { "epoch": 0.6723179200751304, "grad_norm": 4.284638804453745, "learning_rate": 5.121577332810898e-08, "loss": 0.7569, "step": 6801 }, { "epoch": 0.6724167758198848, "grad_norm": 10.763042515320947, "learning_rate": 5.118782357072873e-08, "loss": 0.7668, "step": 6802 }, { "epoch": 0.6725156315646393, "grad_norm": 7.555716307197169, "learning_rate": 5.115987881831707e-08, "loss": 0.7564, "step": 6803 }, { "epoch": 0.6726144873093938, "grad_norm": 11.384482112843473, "learning_rate": 5.113193907373945e-08, "loss": 0.7009, "step": 6804 }, { "epoch": 0.6727133430541482, "grad_norm": 3.5196919143119922, "learning_rate": 5.1104004339860595e-08, "loss": 0.654, "step": 6805 }, { "epoch": 0.6728121987989027, "grad_norm": 4.616694375059566, "learning_rate": 5.107607461954485e-08, "loss": 0.8057, "step": 6806 }, { "epoch": 0.6729110545436572, "grad_norm": 3.613801782890328, "learning_rate": 5.1048149915655936e-08, "loss": 0.7178, "step": 6807 }, { "epoch": 0.6730099102884116, "grad_norm": 3.0903112702303352, "learning_rate": 5.102023023105722e-08, "loss": 0.6991, "step": 6808 }, { "epoch": 0.673108766033166, "grad_norm": 6.471821239669036, "learning_rate": 5.099231556861132e-08, "loss": 0.6626, "step": 6809 }, { "epoch": 0.6732076217779206, "grad_norm": 5.539924235755165, "learning_rate": 5.09644059311806e-08, "loss": 0.7433, "step": 6810 }, { "epoch": 0.673306477522675, "grad_norm": 8.132154354317967, "learning_rate": 5.093650132162668e-08, "loss": 0.7217, "step": 6811 }, { "epoch": 0.6734053332674295, "grad_norm": 2.979673044484443, "learning_rate": 5.090860174281082e-08, "loss": 0.7369, "step": 6812 }, { "epoch": 0.673504189012184, "grad_norm": 6.274236372168946, "learning_rate": 5.088070719759374e-08, "loss": 0.7522, "step": 6813 }, { "epoch": 0.6736030447569384, "grad_norm": 4.3796850184779315, "learning_rate": 5.085281768883554e-08, "loss": 0.7813, "step": 6814 }, { "epoch": 0.6737019005016929, "grad_norm": 12.85910083790196, "learning_rate": 5.0824933219395935e-08, "loss": 0.7435, "step": 6815 }, { "epoch": 0.6738007562464474, "grad_norm": 4.8747433467524655, "learning_rate": 5.0797053792134024e-08, "loss": 0.7218, "step": 6816 }, { "epoch": 0.6738996119912019, "grad_norm": 8.19871153019696, "learning_rate": 5.076917940990845e-08, "loss": 0.6886, "step": 6817 }, { "epoch": 0.6739984677359563, "grad_norm": 8.566417385688627, "learning_rate": 5.074131007557736e-08, "loss": 0.6285, "step": 6818 }, { "epoch": 0.6740973234807107, "grad_norm": 19.29585175460254, "learning_rate": 5.071344579199826e-08, "loss": 0.6677, "step": 6819 }, { "epoch": 0.6741961792254653, "grad_norm": 61.89689824451084, "learning_rate": 5.068558656202827e-08, "loss": 0.6092, "step": 6820 }, { "epoch": 0.6742950349702197, "grad_norm": 3.040544692548938, "learning_rate": 5.065773238852398e-08, "loss": 0.6731, "step": 6821 }, { "epoch": 0.6743938907149741, "grad_norm": 3.469502390996233, "learning_rate": 5.062988327434134e-08, "loss": 0.5938, "step": 6822 }, { "epoch": 0.6744927464597287, "grad_norm": 5.000464681759872, "learning_rate": 5.0602039222335946e-08, "loss": 0.6663, "step": 6823 }, { "epoch": 0.6745916022044831, "grad_norm": 3.759747599126241, "learning_rate": 5.0574200235362727e-08, "loss": 0.7535, "step": 6824 }, { "epoch": 0.6746904579492375, "grad_norm": 3.9344217141345372, "learning_rate": 5.054636631627623e-08, "loss": 0.6653, "step": 6825 }, { "epoch": 0.6747893136939921, "grad_norm": 6.182891817215699, "learning_rate": 5.051853746793038e-08, "loss": 0.6951, "step": 6826 }, { "epoch": 0.6748881694387465, "grad_norm": 4.167358944270966, "learning_rate": 5.049071369317857e-08, "loss": 0.6887, "step": 6827 }, { "epoch": 0.674987025183501, "grad_norm": 3.298263388817007, "learning_rate": 5.046289499487375e-08, "loss": 0.692, "step": 6828 }, { "epoch": 0.6750858809282554, "grad_norm": 10.923487124478394, "learning_rate": 5.043508137586836e-08, "loss": 0.6413, "step": 6829 }, { "epoch": 0.6751847366730099, "grad_norm": 4.48850383240122, "learning_rate": 5.04072728390142e-08, "loss": 0.6433, "step": 6830 }, { "epoch": 0.6752835924177644, "grad_norm": 4.073508453341519, "learning_rate": 5.0379469387162684e-08, "loss": 0.7421, "step": 6831 }, { "epoch": 0.6753824481625188, "grad_norm": 3.2358847113009737, "learning_rate": 5.0351671023164655e-08, "loss": 0.6871, "step": 6832 }, { "epoch": 0.6754813039072733, "grad_norm": 8.50330815808087, "learning_rate": 5.032387774987037e-08, "loss": 0.7038, "step": 6833 }, { "epoch": 0.6755801596520278, "grad_norm": 3.767945333046059, "learning_rate": 5.029608957012963e-08, "loss": 0.722, "step": 6834 }, { "epoch": 0.6756790153967822, "grad_norm": 8.67454522483573, "learning_rate": 5.026830648679177e-08, "loss": 0.7402, "step": 6835 }, { "epoch": 0.6757778711415368, "grad_norm": 10.761263278714258, "learning_rate": 5.024052850270544e-08, "loss": 0.7885, "step": 6836 }, { "epoch": 0.6758767268862912, "grad_norm": 30.00726554487649, "learning_rate": 5.021275562071896e-08, "loss": 0.6368, "step": 6837 }, { "epoch": 0.6759755826310456, "grad_norm": 3.5222781661145404, "learning_rate": 5.018498784367993e-08, "loss": 0.6619, "step": 6838 }, { "epoch": 0.6760744383758002, "grad_norm": 6.598657633384732, "learning_rate": 5.015722517443557e-08, "loss": 0.6796, "step": 6839 }, { "epoch": 0.6761732941205546, "grad_norm": 7.257881222039123, "learning_rate": 5.0129467615832576e-08, "loss": 0.8054, "step": 6840 }, { "epoch": 0.676272149865309, "grad_norm": 4.410335598514433, "learning_rate": 5.0101715170716995e-08, "loss": 0.7087, "step": 6841 }, { "epoch": 0.6763710056100635, "grad_norm": 5.229515771855248, "learning_rate": 5.007396784193452e-08, "loss": 0.5995, "step": 6842 }, { "epoch": 0.676469861354818, "grad_norm": 6.7163826864507845, "learning_rate": 5.0046225632330185e-08, "loss": 0.7135, "step": 6843 }, { "epoch": 0.6765687170995724, "grad_norm": 3.6096788154150077, "learning_rate": 5.001848854474848e-08, "loss": 0.6781, "step": 6844 }, { "epoch": 0.6766675728443269, "grad_norm": 5.144818572672796, "learning_rate": 4.9990756582033524e-08, "loss": 0.646, "step": 6845 }, { "epoch": 0.6767664285890814, "grad_norm": 45.91528018723797, "learning_rate": 4.996302974702882e-08, "loss": 0.6881, "step": 6846 }, { "epoch": 0.6768652843338359, "grad_norm": 6.214543403062526, "learning_rate": 4.993530804257727e-08, "loss": 0.7315, "step": 6847 }, { "epoch": 0.6769641400785903, "grad_norm": 8.519939838457285, "learning_rate": 4.9907591471521415e-08, "loss": 0.7692, "step": 6848 }, { "epoch": 0.6770629958233448, "grad_norm": 6.852349806020006, "learning_rate": 4.987988003670312e-08, "loss": 0.7436, "step": 6849 }, { "epoch": 0.6771618515680993, "grad_norm": 3.5236694116561154, "learning_rate": 4.985217374096378e-08, "loss": 0.6579, "step": 6850 }, { "epoch": 0.6772607073128537, "grad_norm": 7.860086317894921, "learning_rate": 4.982447258714434e-08, "loss": 0.8831, "step": 6851 }, { "epoch": 0.6773595630576081, "grad_norm": 10.290153666143267, "learning_rate": 4.979677657808505e-08, "loss": 0.7464, "step": 6852 }, { "epoch": 0.6774584188023627, "grad_norm": 2.9129561228148875, "learning_rate": 4.976908571662578e-08, "loss": 0.6691, "step": 6853 }, { "epoch": 0.6775572745471171, "grad_norm": 3.6517710707466478, "learning_rate": 4.9741400005605846e-08, "loss": 0.7698, "step": 6854 }, { "epoch": 0.6776561302918716, "grad_norm": 43.883086607895045, "learning_rate": 4.971371944786393e-08, "loss": 0.676, "step": 6855 }, { "epoch": 0.6777549860366261, "grad_norm": 5.053330528651903, "learning_rate": 4.968604404623832e-08, "loss": 0.5802, "step": 6856 }, { "epoch": 0.6778538417813805, "grad_norm": 24.160997486178267, "learning_rate": 4.965837380356675e-08, "loss": 0.7803, "step": 6857 }, { "epoch": 0.677952697526135, "grad_norm": 4.055121047593691, "learning_rate": 4.963070872268631e-08, "loss": 0.6842, "step": 6858 }, { "epoch": 0.6780515532708895, "grad_norm": 6.456773685263812, "learning_rate": 4.960304880643373e-08, "loss": 0.6539, "step": 6859 }, { "epoch": 0.6781504090156439, "grad_norm": 3.9703818043639645, "learning_rate": 4.957539405764509e-08, "loss": 0.7285, "step": 6860 }, { "epoch": 0.6782492647603984, "grad_norm": 3.7865751470777367, "learning_rate": 4.9547744479155936e-08, "loss": 0.7777, "step": 6861 }, { "epoch": 0.6783481205051528, "grad_norm": 10.317836805555624, "learning_rate": 4.952010007380139e-08, "loss": 0.7056, "step": 6862 }, { "epoch": 0.6784469762499074, "grad_norm": 4.328398420605287, "learning_rate": 4.949246084441593e-08, "loss": 0.7625, "step": 6863 }, { "epoch": 0.6785458319946618, "grad_norm": 4.581452250896427, "learning_rate": 4.946482679383357e-08, "loss": 0.7496, "step": 6864 }, { "epoch": 0.6786446877394162, "grad_norm": 3.584586257395639, "learning_rate": 4.943719792488781e-08, "loss": 0.5998, "step": 6865 }, { "epoch": 0.6787435434841708, "grad_norm": 9.224589967636696, "learning_rate": 4.94095742404115e-08, "loss": 0.6916, "step": 6866 }, { "epoch": 0.6788423992289252, "grad_norm": 4.975142700250369, "learning_rate": 4.93819557432371e-08, "loss": 0.675, "step": 6867 }, { "epoch": 0.6789412549736796, "grad_norm": 4.143003162179376, "learning_rate": 4.935434243619651e-08, "loss": 0.7365, "step": 6868 }, { "epoch": 0.6790401107184342, "grad_norm": 3.6014852945241134, "learning_rate": 4.932673432212099e-08, "loss": 0.6873, "step": 6869 }, { "epoch": 0.6791389664631886, "grad_norm": 4.119182194047706, "learning_rate": 4.929913140384141e-08, "loss": 0.7649, "step": 6870 }, { "epoch": 0.679237822207943, "grad_norm": 3.01901960597528, "learning_rate": 4.9271533684187984e-08, "loss": 0.642, "step": 6871 }, { "epoch": 0.6793366779526975, "grad_norm": 7.008676264886851, "learning_rate": 4.924394116599048e-08, "loss": 0.6502, "step": 6872 }, { "epoch": 0.679435533697452, "grad_norm": 28.44345425358551, "learning_rate": 4.921635385207814e-08, "loss": 0.6515, "step": 6873 }, { "epoch": 0.6795343894422065, "grad_norm": 7.773255624415793, "learning_rate": 4.918877174527957e-08, "loss": 0.6238, "step": 6874 }, { "epoch": 0.6796332451869609, "grad_norm": 5.709703784199952, "learning_rate": 4.9161194848422937e-08, "loss": 0.6795, "step": 6875 }, { "epoch": 0.6797321009317154, "grad_norm": 3.6449057839784182, "learning_rate": 4.913362316433588e-08, "loss": 0.7266, "step": 6876 }, { "epoch": 0.6798309566764699, "grad_norm": 4.54525678008131, "learning_rate": 4.910605669584544e-08, "loss": 0.642, "step": 6877 }, { "epoch": 0.6799298124212243, "grad_norm": 6.453322957820563, "learning_rate": 4.9078495445778114e-08, "loss": 0.7973, "step": 6878 }, { "epoch": 0.6800286681659788, "grad_norm": 4.505215899254946, "learning_rate": 4.905093941695996e-08, "loss": 0.7725, "step": 6879 }, { "epoch": 0.6801275239107333, "grad_norm": 3.2175590221620216, "learning_rate": 4.902338861221639e-08, "loss": 0.5864, "step": 6880 }, { "epoch": 0.6802263796554877, "grad_norm": 3.674000116246416, "learning_rate": 4.899584303437239e-08, "loss": 0.6796, "step": 6881 }, { "epoch": 0.6803252354002421, "grad_norm": 5.223100096392552, "learning_rate": 4.896830268625229e-08, "loss": 0.6873, "step": 6882 }, { "epoch": 0.6804240911449967, "grad_norm": 9.847794039806638, "learning_rate": 4.8940767570679974e-08, "loss": 0.698, "step": 6883 }, { "epoch": 0.6805229468897511, "grad_norm": 3.4539713799061236, "learning_rate": 4.8913237690478814e-08, "loss": 0.6823, "step": 6884 }, { "epoch": 0.6806218026345056, "grad_norm": 2.68701786093155, "learning_rate": 4.8885713048471496e-08, "loss": 0.6888, "step": 6885 }, { "epoch": 0.6807206583792601, "grad_norm": 5.38268592247178, "learning_rate": 4.885819364748033e-08, "loss": 0.6765, "step": 6886 }, { "epoch": 0.6808195141240145, "grad_norm": 5.199286414193881, "learning_rate": 4.883067949032704e-08, "loss": 0.702, "step": 6887 }, { "epoch": 0.680918369868769, "grad_norm": 4.7331600785469625, "learning_rate": 4.880317057983273e-08, "loss": 0.7339, "step": 6888 }, { "epoch": 0.6810172256135235, "grad_norm": 3.925974233253034, "learning_rate": 4.877566691881807e-08, "loss": 0.6814, "step": 6889 }, { "epoch": 0.681116081358278, "grad_norm": 3.4475610866722883, "learning_rate": 4.8748168510103194e-08, "loss": 0.7374, "step": 6890 }, { "epoch": 0.6812149371030324, "grad_norm": 8.18605286398184, "learning_rate": 4.872067535650758e-08, "loss": 0.7174, "step": 6891 }, { "epoch": 0.6813137928477868, "grad_norm": 3.4315461713930357, "learning_rate": 4.8693187460850326e-08, "loss": 0.7225, "step": 6892 }, { "epoch": 0.6814126485925414, "grad_norm": 2.652718045116078, "learning_rate": 4.866570482594984e-08, "loss": 0.6098, "step": 6893 }, { "epoch": 0.6815115043372958, "grad_norm": 4.700895924114779, "learning_rate": 4.8638227454624114e-08, "loss": 0.6696, "step": 6894 }, { "epoch": 0.6816103600820502, "grad_norm": 2.9915036690600734, "learning_rate": 4.8610755349690494e-08, "loss": 0.7632, "step": 6895 }, { "epoch": 0.6817092158268048, "grad_norm": 9.432653009692261, "learning_rate": 4.858328851396589e-08, "loss": 0.6526, "step": 6896 }, { "epoch": 0.6818080715715592, "grad_norm": 4.269756703680442, "learning_rate": 4.8555826950266576e-08, "loss": 0.7509, "step": 6897 }, { "epoch": 0.6819069273163136, "grad_norm": 10.586998629231845, "learning_rate": 4.852837066140839e-08, "loss": 0.68, "step": 6898 }, { "epoch": 0.6820057830610682, "grad_norm": 5.633736726901227, "learning_rate": 4.8500919650206494e-08, "loss": 0.7223, "step": 6899 }, { "epoch": 0.6821046388058226, "grad_norm": 4.672784187019227, "learning_rate": 4.847347391947563e-08, "loss": 0.787, "step": 6900 }, { "epoch": 0.682203494550577, "grad_norm": 3.698741175343985, "learning_rate": 4.844603347202997e-08, "loss": 0.7541, "step": 6901 }, { "epoch": 0.6823023502953315, "grad_norm": 3.5927796368160463, "learning_rate": 4.8418598310683065e-08, "loss": 0.7384, "step": 6902 }, { "epoch": 0.682401206040086, "grad_norm": 3.27263871494874, "learning_rate": 4.839116843824808e-08, "loss": 0.6882, "step": 6903 }, { "epoch": 0.6825000617848405, "grad_norm": 14.331237130589473, "learning_rate": 4.836374385753743e-08, "loss": 0.7054, "step": 6904 }, { "epoch": 0.6825989175295949, "grad_norm": 4.226825488965418, "learning_rate": 4.8336324571363164e-08, "loss": 0.7509, "step": 6905 }, { "epoch": 0.6826977732743494, "grad_norm": 6.912528144207288, "learning_rate": 4.830891058253677e-08, "loss": 0.6692, "step": 6906 }, { "epoch": 0.6827966290191039, "grad_norm": 3.647287556415683, "learning_rate": 4.828150189386905e-08, "loss": 0.7148, "step": 6907 }, { "epoch": 0.6828954847638583, "grad_norm": 14.819176302837613, "learning_rate": 4.8254098508170404e-08, "loss": 0.7197, "step": 6908 }, { "epoch": 0.6829943405086129, "grad_norm": 3.0249221226345964, "learning_rate": 4.82267004282507e-08, "loss": 0.6955, "step": 6909 }, { "epoch": 0.6830931962533673, "grad_norm": 3.721584503499986, "learning_rate": 4.819930765691911e-08, "loss": 0.7759, "step": 6910 }, { "epoch": 0.6831920519981217, "grad_norm": 3.3445865519529927, "learning_rate": 4.817192019698445e-08, "loss": 0.6797, "step": 6911 }, { "epoch": 0.6832909077428762, "grad_norm": 4.898585075805994, "learning_rate": 4.8144538051254814e-08, "loss": 0.7505, "step": 6912 }, { "epoch": 0.6833897634876307, "grad_norm": 3.4087692565303898, "learning_rate": 4.811716122253793e-08, "loss": 0.7699, "step": 6913 }, { "epoch": 0.6834886192323851, "grad_norm": 6.097220545003893, "learning_rate": 4.8089789713640814e-08, "loss": 0.7399, "step": 6914 }, { "epoch": 0.6835874749771396, "grad_norm": 3.6922918135559613, "learning_rate": 4.806242352737001e-08, "loss": 0.6545, "step": 6915 }, { "epoch": 0.6836863307218941, "grad_norm": 4.044641687068706, "learning_rate": 4.8035062666531536e-08, "loss": 0.742, "step": 6916 }, { "epoch": 0.6837851864666485, "grad_norm": 2.6762695537413697, "learning_rate": 4.800770713393089e-08, "loss": 0.6043, "step": 6917 }, { "epoch": 0.683884042211403, "grad_norm": 4.883396393599119, "learning_rate": 4.79803569323729e-08, "loss": 0.7531, "step": 6918 }, { "epoch": 0.6839828979561575, "grad_norm": 17.44980841798662, "learning_rate": 4.7953012064661956e-08, "loss": 0.704, "step": 6919 }, { "epoch": 0.684081753700912, "grad_norm": 18.456757305404814, "learning_rate": 4.792567253360192e-08, "loss": 0.7214, "step": 6920 }, { "epoch": 0.6841806094456664, "grad_norm": 6.103534579673057, "learning_rate": 4.789833834199598e-08, "loss": 0.7503, "step": 6921 }, { "epoch": 0.6842794651904209, "grad_norm": 10.77700498482542, "learning_rate": 4.787100949264689e-08, "loss": 0.5429, "step": 6922 }, { "epoch": 0.6843783209351754, "grad_norm": 3.857301112120819, "learning_rate": 4.784368598835685e-08, "loss": 0.7199, "step": 6923 }, { "epoch": 0.6844771766799298, "grad_norm": 3.776227179824211, "learning_rate": 4.7816367831927416e-08, "loss": 0.7694, "step": 6924 }, { "epoch": 0.6845760324246842, "grad_norm": 3.75230882991863, "learning_rate": 4.778905502615974e-08, "loss": 0.6948, "step": 6925 }, { "epoch": 0.6846748881694388, "grad_norm": 7.558730876775452, "learning_rate": 4.776174757385427e-08, "loss": 0.7334, "step": 6926 }, { "epoch": 0.6847737439141932, "grad_norm": 4.984377059430879, "learning_rate": 4.7734445477811016e-08, "loss": 0.8301, "step": 6927 }, { "epoch": 0.6848725996589476, "grad_norm": 4.796051509063145, "learning_rate": 4.770714874082944e-08, "loss": 0.7197, "step": 6928 }, { "epoch": 0.6849714554037022, "grad_norm": 2.9130885477803625, "learning_rate": 4.767985736570835e-08, "loss": 0.6725, "step": 6929 }, { "epoch": 0.6850703111484566, "grad_norm": 2.8242580284319945, "learning_rate": 4.765257135524615e-08, "loss": 0.6228, "step": 6930 }, { "epoch": 0.6851691668932111, "grad_norm": 3.1362185780204217, "learning_rate": 4.7625290712240575e-08, "loss": 0.6882, "step": 6931 }, { "epoch": 0.6852680226379656, "grad_norm": 3.925733239970925, "learning_rate": 4.7598015439488824e-08, "loss": 0.7749, "step": 6932 }, { "epoch": 0.68536687838272, "grad_norm": 4.190017270568966, "learning_rate": 4.7570745539787605e-08, "loss": 0.7653, "step": 6933 }, { "epoch": 0.6854657341274745, "grad_norm": 7.197463900046511, "learning_rate": 4.754348101593308e-08, "loss": 0.6995, "step": 6934 }, { "epoch": 0.6855645898722289, "grad_norm": 3.0534948004950624, "learning_rate": 4.7516221870720754e-08, "loss": 0.7125, "step": 6935 }, { "epoch": 0.6856634456169834, "grad_norm": 5.702806688323771, "learning_rate": 4.7488968106945724e-08, "loss": 0.7914, "step": 6936 }, { "epoch": 0.6857623013617379, "grad_norm": 33.78031274474961, "learning_rate": 4.7461719727402386e-08, "loss": 0.7491, "step": 6937 }, { "epoch": 0.6858611571064923, "grad_norm": 5.539549152347562, "learning_rate": 4.7434476734884686e-08, "loss": 0.6893, "step": 6938 }, { "epoch": 0.6859600128512469, "grad_norm": 6.615388740482905, "learning_rate": 4.740723913218605e-08, "loss": 0.6089, "step": 6939 }, { "epoch": 0.6860588685960013, "grad_norm": 6.924555318259605, "learning_rate": 4.738000692209919e-08, "loss": 0.6385, "step": 6940 }, { "epoch": 0.6861577243407557, "grad_norm": 3.138866659763953, "learning_rate": 4.7352780107416433e-08, "loss": 0.6036, "step": 6941 }, { "epoch": 0.6862565800855103, "grad_norm": 3.4513439957517504, "learning_rate": 4.7325558690929514e-08, "loss": 0.6677, "step": 6942 }, { "epoch": 0.6863554358302647, "grad_norm": 3.253709566020095, "learning_rate": 4.7298342675429506e-08, "loss": 0.6209, "step": 6943 }, { "epoch": 0.6864542915750191, "grad_norm": 5.799598788322272, "learning_rate": 4.727113206370705e-08, "loss": 0.7176, "step": 6944 }, { "epoch": 0.6865531473197736, "grad_norm": 5.365056362103453, "learning_rate": 4.7243926858552244e-08, "loss": 0.7836, "step": 6945 }, { "epoch": 0.6866520030645281, "grad_norm": 4.778812516991259, "learning_rate": 4.721672706275448e-08, "loss": 0.7239, "step": 6946 }, { "epoch": 0.6867508588092825, "grad_norm": 4.629921746670702, "learning_rate": 4.718953267910279e-08, "loss": 0.6773, "step": 6947 }, { "epoch": 0.686849714554037, "grad_norm": 5.579109019554846, "learning_rate": 4.716234371038552e-08, "loss": 0.7541, "step": 6948 }, { "epoch": 0.6869485702987915, "grad_norm": 4.758957705585035, "learning_rate": 4.7135160159390454e-08, "loss": 0.6683, "step": 6949 }, { "epoch": 0.687047426043546, "grad_norm": 4.195006991169889, "learning_rate": 4.7107982028904935e-08, "loss": 0.7253, "step": 6950 }, { "epoch": 0.6871462817883004, "grad_norm": 5.32580584084987, "learning_rate": 4.708080932171562e-08, "loss": 0.6512, "step": 6951 }, { "epoch": 0.6872451375330549, "grad_norm": 4.272632542991704, "learning_rate": 4.705364204060869e-08, "loss": 0.6579, "step": 6952 }, { "epoch": 0.6873439932778094, "grad_norm": 10.81736615931638, "learning_rate": 4.7026480188369786e-08, "loss": 0.6936, "step": 6953 }, { "epoch": 0.6874428490225638, "grad_norm": 4.152721692515138, "learning_rate": 4.699932376778388e-08, "loss": 0.5855, "step": 6954 }, { "epoch": 0.6875417047673182, "grad_norm": 3.0762782592733324, "learning_rate": 4.697217278163552e-08, "loss": 0.7149, "step": 6955 }, { "epoch": 0.6876405605120728, "grad_norm": 9.30964837927263, "learning_rate": 4.694502723270867e-08, "loss": 0.6871, "step": 6956 }, { "epoch": 0.6877394162568272, "grad_norm": 4.990785873297677, "learning_rate": 4.6917887123786617e-08, "loss": 0.7463, "step": 6957 }, { "epoch": 0.6878382720015817, "grad_norm": 6.7067265457758065, "learning_rate": 4.689075245765225e-08, "loss": 0.6029, "step": 6958 }, { "epoch": 0.6879371277463362, "grad_norm": 3.311523944426087, "learning_rate": 4.686362323708778e-08, "loss": 0.7121, "step": 6959 }, { "epoch": 0.6880359834910906, "grad_norm": 6.001790907195006, "learning_rate": 4.6836499464874916e-08, "loss": 0.7343, "step": 6960 }, { "epoch": 0.6881348392358451, "grad_norm": 4.1790906378052215, "learning_rate": 4.680938114379486e-08, "loss": 0.6271, "step": 6961 }, { "epoch": 0.6882336949805996, "grad_norm": 3.7741752379456477, "learning_rate": 4.678226827662811e-08, "loss": 0.7145, "step": 6962 }, { "epoch": 0.688332550725354, "grad_norm": 2.7902857485756316, "learning_rate": 4.6755160866154745e-08, "loss": 0.5493, "step": 6963 }, { "epoch": 0.6884314064701085, "grad_norm": 3.2281486018893077, "learning_rate": 4.6728058915154234e-08, "loss": 0.6889, "step": 6964 }, { "epoch": 0.6885302622148629, "grad_norm": 15.02017488018892, "learning_rate": 4.6700962426405475e-08, "loss": 0.534, "step": 6965 }, { "epoch": 0.6886291179596175, "grad_norm": 3.6835119299155443, "learning_rate": 4.667387140268677e-08, "loss": 0.6278, "step": 6966 }, { "epoch": 0.6887279737043719, "grad_norm": 21.539915563172084, "learning_rate": 4.6646785846775974e-08, "loss": 0.7097, "step": 6967 }, { "epoch": 0.6888268294491263, "grad_norm": 3.670108027498525, "learning_rate": 4.6619705761450226e-08, "loss": 0.6639, "step": 6968 }, { "epoch": 0.6889256851938809, "grad_norm": 10.321352344782213, "learning_rate": 4.6592631149486284e-08, "loss": 0.696, "step": 6969 }, { "epoch": 0.6890245409386353, "grad_norm": 3.0426274862385534, "learning_rate": 4.6565562013660154e-08, "loss": 0.6781, "step": 6970 }, { "epoch": 0.6891233966833897, "grad_norm": 7.990407110684896, "learning_rate": 4.653849835674744e-08, "loss": 0.6438, "step": 6971 }, { "epoch": 0.6892222524281443, "grad_norm": 3.7343883732729393, "learning_rate": 4.6511440181523144e-08, "loss": 0.664, "step": 6972 }, { "epoch": 0.6893211081728987, "grad_norm": 22.007635310269958, "learning_rate": 4.64843874907616e-08, "loss": 0.7313, "step": 6973 }, { "epoch": 0.6894199639176531, "grad_norm": 18.079633471455384, "learning_rate": 4.6457340287236714e-08, "loss": 0.7012, "step": 6974 }, { "epoch": 0.6895188196624076, "grad_norm": 2.9944582516664386, "learning_rate": 4.643029857372181e-08, "loss": 0.6674, "step": 6975 }, { "epoch": 0.6896176754071621, "grad_norm": 4.500741074965972, "learning_rate": 4.6403262352989525e-08, "loss": 0.6701, "step": 6976 }, { "epoch": 0.6897165311519166, "grad_norm": 2.548954140982369, "learning_rate": 4.637623162781209e-08, "loss": 0.6563, "step": 6977 }, { "epoch": 0.689815386896671, "grad_norm": 3.413288798767727, "learning_rate": 4.634920640096114e-08, "loss": 0.6268, "step": 6978 }, { "epoch": 0.6899142426414255, "grad_norm": 3.860836915974056, "learning_rate": 4.632218667520762e-08, "loss": 0.6269, "step": 6979 }, { "epoch": 0.69001309838618, "grad_norm": 3.910412860339552, "learning_rate": 4.6295172453322094e-08, "loss": 0.672, "step": 6980 }, { "epoch": 0.6901119541309344, "grad_norm": 2.9692310329764164, "learning_rate": 4.62681637380744e-08, "loss": 0.7331, "step": 6981 }, { "epoch": 0.690210809875689, "grad_norm": 4.051498392425163, "learning_rate": 4.624116053223395e-08, "loss": 0.7156, "step": 6982 }, { "epoch": 0.6903096656204434, "grad_norm": 5.926826218201472, "learning_rate": 4.6214162838569445e-08, "loss": 0.6409, "step": 6983 }, { "epoch": 0.6904085213651978, "grad_norm": 3.6555988559722015, "learning_rate": 4.6187170659849184e-08, "loss": 0.7848, "step": 6984 }, { "epoch": 0.6905073771099522, "grad_norm": 4.442348514567032, "learning_rate": 4.616018399884074e-08, "loss": 0.668, "step": 6985 }, { "epoch": 0.6906062328547068, "grad_norm": 5.600797610747027, "learning_rate": 4.613320285831127e-08, "loss": 0.7223, "step": 6986 }, { "epoch": 0.6907050885994612, "grad_norm": 2.3467663777227483, "learning_rate": 4.610622724102722e-08, "loss": 0.6415, "step": 6987 }, { "epoch": 0.6908039443442157, "grad_norm": 5.62466816179407, "learning_rate": 4.607925714975458e-08, "loss": 0.6499, "step": 6988 }, { "epoch": 0.6909028000889702, "grad_norm": 9.752393621424634, "learning_rate": 4.605229258725876e-08, "loss": 0.6266, "step": 6989 }, { "epoch": 0.6910016558337246, "grad_norm": 4.653689361003493, "learning_rate": 4.6025333556304526e-08, "loss": 0.7372, "step": 6990 }, { "epoch": 0.6911005115784791, "grad_norm": 3.877412010196789, "learning_rate": 4.599838005965618e-08, "loss": 0.8346, "step": 6991 }, { "epoch": 0.6911993673232336, "grad_norm": 3.405936658631345, "learning_rate": 4.597143210007735e-08, "loss": 0.7511, "step": 6992 }, { "epoch": 0.691298223067988, "grad_norm": 4.112167731098671, "learning_rate": 4.5944489680331164e-08, "loss": 0.6474, "step": 6993 }, { "epoch": 0.6913970788127425, "grad_norm": 3.55792547605359, "learning_rate": 4.591755280318025e-08, "loss": 0.7022, "step": 6994 }, { "epoch": 0.691495934557497, "grad_norm": 3.783431799249201, "learning_rate": 4.589062147138647e-08, "loss": 0.7833, "step": 6995 }, { "epoch": 0.6915947903022515, "grad_norm": 4.399061423112457, "learning_rate": 4.586369568771129e-08, "loss": 0.7456, "step": 6996 }, { "epoch": 0.6916936460470059, "grad_norm": 4.172590673669197, "learning_rate": 4.583677545491559e-08, "loss": 0.6749, "step": 6997 }, { "epoch": 0.6917925017917603, "grad_norm": 3.170618759000232, "learning_rate": 4.580986077575956e-08, "loss": 0.6892, "step": 6998 }, { "epoch": 0.6918913575365149, "grad_norm": 4.082652316355774, "learning_rate": 4.5782951653002986e-08, "loss": 0.706, "step": 6999 }, { "epoch": 0.6919902132812693, "grad_norm": 4.147069141668596, "learning_rate": 4.575604808940493e-08, "loss": 0.6585, "step": 7000 }, { "epoch": 0.6920890690260237, "grad_norm": 3.4024669367827287, "learning_rate": 4.572915008772402e-08, "loss": 0.7437, "step": 7001 }, { "epoch": 0.6921879247707783, "grad_norm": 9.821484566344866, "learning_rate": 4.5702257650718226e-08, "loss": 0.7625, "step": 7002 }, { "epoch": 0.6922867805155327, "grad_norm": 2.8417534691742126, "learning_rate": 4.567537078114492e-08, "loss": 0.6546, "step": 7003 }, { "epoch": 0.6923856362602872, "grad_norm": 3.3909536373141513, "learning_rate": 4.564848948176099e-08, "loss": 0.6391, "step": 7004 }, { "epoch": 0.6924844920050417, "grad_norm": 3.662450905195977, "learning_rate": 4.562161375532276e-08, "loss": 0.6067, "step": 7005 }, { "epoch": 0.6925833477497961, "grad_norm": 5.217868580566663, "learning_rate": 4.559474360458587e-08, "loss": 0.7441, "step": 7006 }, { "epoch": 0.6926822034945506, "grad_norm": 3.797889082697075, "learning_rate": 4.556787903230549e-08, "loss": 0.7772, "step": 7007 }, { "epoch": 0.692781059239305, "grad_norm": 5.981689063556762, "learning_rate": 4.554102004123622e-08, "loss": 0.7744, "step": 7008 }, { "epoch": 0.6928799149840595, "grad_norm": 14.40533530994454, "learning_rate": 4.551416663413199e-08, "loss": 0.6073, "step": 7009 }, { "epoch": 0.692978770728814, "grad_norm": 4.811375091166377, "learning_rate": 4.548731881374623e-08, "loss": 0.7217, "step": 7010 }, { "epoch": 0.6930776264735684, "grad_norm": 13.01851273427871, "learning_rate": 4.546047658283186e-08, "loss": 0.6341, "step": 7011 }, { "epoch": 0.693176482218323, "grad_norm": 2.8729040704522197, "learning_rate": 4.5433639944141055e-08, "loss": 0.6061, "step": 7012 }, { "epoch": 0.6932753379630774, "grad_norm": 4.240744459015942, "learning_rate": 4.5406808900425605e-08, "loss": 0.7378, "step": 7013 }, { "epoch": 0.6933741937078318, "grad_norm": 5.6521093090528485, "learning_rate": 4.537998345443657e-08, "loss": 0.613, "step": 7014 }, { "epoch": 0.6934730494525864, "grad_norm": 4.485684184463186, "learning_rate": 4.535316360892451e-08, "loss": 0.7241, "step": 7015 }, { "epoch": 0.6935719051973408, "grad_norm": 3.3234221227069014, "learning_rate": 4.532634936663947e-08, "loss": 0.6685, "step": 7016 }, { "epoch": 0.6936707609420952, "grad_norm": 4.984493110485975, "learning_rate": 4.5299540730330774e-08, "loss": 0.6965, "step": 7017 }, { "epoch": 0.6937696166868497, "grad_norm": 4.515608463647462, "learning_rate": 4.527273770274732e-08, "loss": 0.7289, "step": 7018 }, { "epoch": 0.6938684724316042, "grad_norm": 5.292856165379726, "learning_rate": 4.524594028663734e-08, "loss": 0.6921, "step": 7019 }, { "epoch": 0.6939673281763586, "grad_norm": 3.0354016105813764, "learning_rate": 4.521914848474846e-08, "loss": 0.779, "step": 7020 }, { "epoch": 0.6940661839211131, "grad_norm": 14.324095130019321, "learning_rate": 4.519236229982783e-08, "loss": 0.7788, "step": 7021 }, { "epoch": 0.6941650396658676, "grad_norm": 4.204408069559874, "learning_rate": 4.516558173462202e-08, "loss": 0.6728, "step": 7022 }, { "epoch": 0.6942638954106221, "grad_norm": 7.110813126564625, "learning_rate": 4.5138806791876884e-08, "loss": 0.6098, "step": 7023 }, { "epoch": 0.6943627511553765, "grad_norm": 4.146005771157704, "learning_rate": 4.5112037474337904e-08, "loss": 0.7019, "step": 7024 }, { "epoch": 0.694461606900131, "grad_norm": 5.935757855069707, "learning_rate": 4.5085273784749775e-08, "loss": 0.745, "step": 7025 }, { "epoch": 0.6945604626448855, "grad_norm": 11.45314793580284, "learning_rate": 4.5058515725856785e-08, "loss": 0.7465, "step": 7026 }, { "epoch": 0.6946593183896399, "grad_norm": 3.9245967108397006, "learning_rate": 4.5031763300402594e-08, "loss": 0.6676, "step": 7027 }, { "epoch": 0.6947581741343943, "grad_norm": 3.8409359679008572, "learning_rate": 4.5005016511130203e-08, "loss": 0.6195, "step": 7028 }, { "epoch": 0.6948570298791489, "grad_norm": 5.150819451275333, "learning_rate": 4.4978275360782136e-08, "loss": 0.7264, "step": 7029 }, { "epoch": 0.6949558856239033, "grad_norm": 3.16579865386107, "learning_rate": 4.495153985210034e-08, "loss": 0.68, "step": 7030 }, { "epoch": 0.6950547413686577, "grad_norm": 3.421796146852053, "learning_rate": 4.492480998782607e-08, "loss": 0.7373, "step": 7031 }, { "epoch": 0.6951535971134123, "grad_norm": 3.666440588925384, "learning_rate": 4.4898085770700125e-08, "loss": 0.7259, "step": 7032 }, { "epoch": 0.6952524528581667, "grad_norm": 4.465128853745658, "learning_rate": 4.487136720346271e-08, "loss": 0.7244, "step": 7033 }, { "epoch": 0.6953513086029212, "grad_norm": 5.78350564710398, "learning_rate": 4.484465428885335e-08, "loss": 0.7129, "step": 7034 }, { "epoch": 0.6954501643476757, "grad_norm": 3.7295906491059756, "learning_rate": 4.4817947029611125e-08, "loss": 0.7558, "step": 7035 }, { "epoch": 0.6955490200924301, "grad_norm": 3.7811358501318884, "learning_rate": 4.479124542847443e-08, "loss": 0.6949, "step": 7036 }, { "epoch": 0.6956478758371846, "grad_norm": 4.639442056557572, "learning_rate": 4.4764549488181116e-08, "loss": 0.7043, "step": 7037 }, { "epoch": 0.695746731581939, "grad_norm": 4.075285133289127, "learning_rate": 4.473785921146849e-08, "loss": 0.7531, "step": 7038 }, { "epoch": 0.6958455873266935, "grad_norm": 5.4340791258996015, "learning_rate": 4.47111746010732e-08, "loss": 0.6943, "step": 7039 }, { "epoch": 0.695944443071448, "grad_norm": 4.074630723378779, "learning_rate": 4.468449565973138e-08, "loss": 0.5914, "step": 7040 }, { "epoch": 0.6960432988162024, "grad_norm": 3.8061809138914238, "learning_rate": 4.465782239017861e-08, "loss": 0.7755, "step": 7041 }, { "epoch": 0.696142154560957, "grad_norm": 5.691660826362966, "learning_rate": 4.4631154795149764e-08, "loss": 0.7417, "step": 7042 }, { "epoch": 0.6962410103057114, "grad_norm": 15.537355902641137, "learning_rate": 4.460449287737924e-08, "loss": 0.772, "step": 7043 }, { "epoch": 0.6963398660504658, "grad_norm": 4.206928426433293, "learning_rate": 4.4577836639600876e-08, "loss": 0.7213, "step": 7044 }, { "epoch": 0.6964387217952204, "grad_norm": 4.336555186801764, "learning_rate": 4.4551186084547784e-08, "loss": 0.6845, "step": 7045 }, { "epoch": 0.6965375775399748, "grad_norm": 3.2330924777576646, "learning_rate": 4.452454121495267e-08, "loss": 0.7612, "step": 7046 }, { "epoch": 0.6966364332847292, "grad_norm": 136.36339910817512, "learning_rate": 4.449790203354751e-08, "loss": 0.6485, "step": 7047 }, { "epoch": 0.6967352890294837, "grad_norm": 2.977048798206112, "learning_rate": 4.447126854306377e-08, "loss": 0.6409, "step": 7048 }, { "epoch": 0.6968341447742382, "grad_norm": 4.937909238594574, "learning_rate": 4.4444640746232377e-08, "loss": 0.7296, "step": 7049 }, { "epoch": 0.6969330005189927, "grad_norm": 3.516311597379479, "learning_rate": 4.441801864578353e-08, "loss": 0.6764, "step": 7050 }, { "epoch": 0.6970318562637471, "grad_norm": 11.572678323382982, "learning_rate": 4.4391402244446985e-08, "loss": 0.8202, "step": 7051 }, { "epoch": 0.6971307120085016, "grad_norm": 22.22791551057962, "learning_rate": 4.43647915449519e-08, "loss": 0.6876, "step": 7052 }, { "epoch": 0.6972295677532561, "grad_norm": 4.723386750588535, "learning_rate": 4.4338186550026756e-08, "loss": 0.6855, "step": 7053 }, { "epoch": 0.6973284234980105, "grad_norm": 3.9965873336255804, "learning_rate": 4.4311587262399475e-08, "loss": 0.7287, "step": 7054 }, { "epoch": 0.697427279242765, "grad_norm": 5.216397754384862, "learning_rate": 4.42849936847975e-08, "loss": 0.7498, "step": 7055 }, { "epoch": 0.6975261349875195, "grad_norm": 5.551847374625398, "learning_rate": 4.425840581994752e-08, "loss": 0.715, "step": 7056 }, { "epoch": 0.6976249907322739, "grad_norm": 3.7092672429705607, "learning_rate": 4.423182367057583e-08, "loss": 0.7424, "step": 7057 }, { "epoch": 0.6977238464770283, "grad_norm": 4.080197852458536, "learning_rate": 4.420524723940794e-08, "loss": 0.795, "step": 7058 }, { "epoch": 0.6978227022217829, "grad_norm": 3.923023988000037, "learning_rate": 4.4178676529168915e-08, "loss": 0.7831, "step": 7059 }, { "epoch": 0.6979215579665373, "grad_norm": 3.3750621450988905, "learning_rate": 4.4152111542583216e-08, "loss": 0.7331, "step": 7060 }, { "epoch": 0.6980204137112918, "grad_norm": 4.972138888702306, "learning_rate": 4.412555228237464e-08, "loss": 0.6147, "step": 7061 }, { "epoch": 0.6981192694560463, "grad_norm": 3.4778845510959324, "learning_rate": 4.409899875126647e-08, "loss": 0.7695, "step": 7062 }, { "epoch": 0.6982181252008007, "grad_norm": 5.190071433585254, "learning_rate": 4.407245095198141e-08, "loss": 0.6722, "step": 7063 }, { "epoch": 0.6983169809455552, "grad_norm": 5.0893367855025184, "learning_rate": 4.404590888724149e-08, "loss": 0.6719, "step": 7064 }, { "epoch": 0.6984158366903097, "grad_norm": 13.689314178992209, "learning_rate": 4.401937255976822e-08, "loss": 0.7426, "step": 7065 }, { "epoch": 0.6985146924350641, "grad_norm": 3.886432452230138, "learning_rate": 4.3992841972282566e-08, "loss": 0.7488, "step": 7066 }, { "epoch": 0.6986135481798186, "grad_norm": 4.29462273567633, "learning_rate": 4.396631712750477e-08, "loss": 0.6614, "step": 7067 }, { "epoch": 0.6987124039245731, "grad_norm": 3.6147096058032897, "learning_rate": 4.393979802815463e-08, "loss": 0.7249, "step": 7068 }, { "epoch": 0.6988112596693276, "grad_norm": 13.362529953846478, "learning_rate": 4.391328467695122e-08, "loss": 0.6867, "step": 7069 }, { "epoch": 0.698910115414082, "grad_norm": 4.805445614567799, "learning_rate": 4.388677707661317e-08, "loss": 0.7476, "step": 7070 }, { "epoch": 0.6990089711588364, "grad_norm": 4.002959406047479, "learning_rate": 4.3860275229858353e-08, "loss": 0.694, "step": 7071 }, { "epoch": 0.699107826903591, "grad_norm": 3.634920035259162, "learning_rate": 4.3833779139404245e-08, "loss": 0.718, "step": 7072 }, { "epoch": 0.6992066826483454, "grad_norm": 4.463016932780265, "learning_rate": 4.3807288807967525e-08, "loss": 0.8144, "step": 7073 }, { "epoch": 0.6993055383930998, "grad_norm": 4.443265671485834, "learning_rate": 4.378080423826449e-08, "loss": 0.7437, "step": 7074 }, { "epoch": 0.6994043941378544, "grad_norm": 5.115215769795058, "learning_rate": 4.3754325433010645e-08, "loss": 0.8109, "step": 7075 }, { "epoch": 0.6995032498826088, "grad_norm": 6.609490572834012, "learning_rate": 4.372785239492104e-08, "loss": 0.5913, "step": 7076 }, { "epoch": 0.6996021056273632, "grad_norm": 3.556834520991442, "learning_rate": 4.3701385126710136e-08, "loss": 0.6607, "step": 7077 }, { "epoch": 0.6997009613721178, "grad_norm": 3.503267377482586, "learning_rate": 4.367492363109169e-08, "loss": 0.7208, "step": 7078 }, { "epoch": 0.6997998171168722, "grad_norm": 3.1412317102400085, "learning_rate": 4.364846791077901e-08, "loss": 0.6978, "step": 7079 }, { "epoch": 0.6998986728616267, "grad_norm": 4.63413506192923, "learning_rate": 4.362201796848466e-08, "loss": 0.7937, "step": 7080 }, { "epoch": 0.6999975286063811, "grad_norm": 4.597505101723923, "learning_rate": 4.359557380692074e-08, "loss": 0.6991, "step": 7081 }, { "epoch": 0.7000963843511356, "grad_norm": 3.365196526002669, "learning_rate": 4.3569135428798734e-08, "loss": 0.6017, "step": 7082 }, { "epoch": 0.7001952400958901, "grad_norm": 9.478094752797892, "learning_rate": 4.354270283682945e-08, "loss": 0.7241, "step": 7083 }, { "epoch": 0.7002940958406445, "grad_norm": 6.457823557372031, "learning_rate": 4.3516276033723174e-08, "loss": 0.6037, "step": 7084 }, { "epoch": 0.700392951585399, "grad_norm": 3.9380422882965562, "learning_rate": 4.3489855022189634e-08, "loss": 0.5678, "step": 7085 }, { "epoch": 0.7004918073301535, "grad_norm": 3.367728219530565, "learning_rate": 4.346343980493785e-08, "loss": 0.6664, "step": 7086 }, { "epoch": 0.7005906630749079, "grad_norm": 4.40610125053866, "learning_rate": 4.343703038467638e-08, "loss": 0.7944, "step": 7087 }, { "epoch": 0.7006895188196625, "grad_norm": 3.1311042868791215, "learning_rate": 4.341062676411303e-08, "loss": 0.725, "step": 7088 }, { "epoch": 0.7007883745644169, "grad_norm": 3.485293479105385, "learning_rate": 4.3384228945955205e-08, "loss": 0.6667, "step": 7089 }, { "epoch": 0.7008872303091713, "grad_norm": 20.987759454480397, "learning_rate": 4.335783693290955e-08, "loss": 0.7054, "step": 7090 }, { "epoch": 0.7009860860539258, "grad_norm": 5.051812844773426, "learning_rate": 4.333145072768215e-08, "loss": 0.7161, "step": 7091 }, { "epoch": 0.7010849417986803, "grad_norm": 4.8723660761958945, "learning_rate": 4.3305070332978563e-08, "loss": 0.7809, "step": 7092 }, { "epoch": 0.7011837975434347, "grad_norm": 6.653228636101896, "learning_rate": 4.3278695751503736e-08, "loss": 0.6777, "step": 7093 }, { "epoch": 0.7012826532881892, "grad_norm": 3.7535631666686946, "learning_rate": 4.325232698596193e-08, "loss": 0.7351, "step": 7094 }, { "epoch": 0.7013815090329437, "grad_norm": 3.75088931722641, "learning_rate": 4.322596403905691e-08, "loss": 0.7055, "step": 7095 }, { "epoch": 0.7014803647776982, "grad_norm": 3.232997524104842, "learning_rate": 4.3199606913491824e-08, "loss": 0.6897, "step": 7096 }, { "epoch": 0.7015792205224526, "grad_norm": 12.092553304005346, "learning_rate": 4.317325561196916e-08, "loss": 0.689, "step": 7097 }, { "epoch": 0.7016780762672071, "grad_norm": 5.726537049770308, "learning_rate": 4.3146910137190876e-08, "loss": 0.7249, "step": 7098 }, { "epoch": 0.7017769320119616, "grad_norm": 3.555417878662625, "learning_rate": 4.312057049185834e-08, "loss": 0.7228, "step": 7099 }, { "epoch": 0.701875787756716, "grad_norm": 9.247607530788189, "learning_rate": 4.3094236678672256e-08, "loss": 0.7404, "step": 7100 }, { "epoch": 0.7019746435014704, "grad_norm": 2.933228533227474, "learning_rate": 4.30679087003328e-08, "loss": 0.6708, "step": 7101 }, { "epoch": 0.702073499246225, "grad_norm": 3.245974214667151, "learning_rate": 4.3041586559539466e-08, "loss": 0.6951, "step": 7102 }, { "epoch": 0.7021723549909794, "grad_norm": 3.961343344308368, "learning_rate": 4.301527025899124e-08, "loss": 0.7106, "step": 7103 }, { "epoch": 0.7022712107357338, "grad_norm": 3.4102726668129093, "learning_rate": 4.2988959801386506e-08, "loss": 0.6328, "step": 7104 }, { "epoch": 0.7023700664804884, "grad_norm": 3.0778986151463923, "learning_rate": 4.296265518942292e-08, "loss": 0.6228, "step": 7105 }, { "epoch": 0.7024689222252428, "grad_norm": 6.765650497131367, "learning_rate": 4.2936356425797734e-08, "loss": 0.6811, "step": 7106 }, { "epoch": 0.7025677779699973, "grad_norm": 13.977110989176442, "learning_rate": 4.291006351320744e-08, "loss": 0.7232, "step": 7107 }, { "epoch": 0.7026666337147518, "grad_norm": 57.596676762599046, "learning_rate": 4.288377645434797e-08, "loss": 0.7356, "step": 7108 }, { "epoch": 0.7027654894595062, "grad_norm": 4.290878052886105, "learning_rate": 4.2857495251914686e-08, "loss": 0.7631, "step": 7109 }, { "epoch": 0.7028643452042607, "grad_norm": 5.53437944069947, "learning_rate": 4.28312199086024e-08, "loss": 0.6355, "step": 7110 }, { "epoch": 0.7029632009490151, "grad_norm": 3.616164757006522, "learning_rate": 4.280495042710518e-08, "loss": 0.6466, "step": 7111 }, { "epoch": 0.7030620566937696, "grad_norm": 2.946669126898014, "learning_rate": 4.277868681011664e-08, "loss": 0.6664, "step": 7112 }, { "epoch": 0.7031609124385241, "grad_norm": 6.7022489805688314, "learning_rate": 4.2752429060329656e-08, "loss": 0.7145, "step": 7113 }, { "epoch": 0.7032597681832785, "grad_norm": 12.275474720841915, "learning_rate": 4.272617718043661e-08, "loss": 0.7119, "step": 7114 }, { "epoch": 0.703358623928033, "grad_norm": 4.513289535796356, "learning_rate": 4.2699931173129286e-08, "loss": 0.6515, "step": 7115 }, { "epoch": 0.7034574796727875, "grad_norm": 40.80113905035267, "learning_rate": 4.2673691041098746e-08, "loss": 0.6651, "step": 7116 }, { "epoch": 0.7035563354175419, "grad_norm": 4.552578260540014, "learning_rate": 4.264745678703556e-08, "loss": 0.7741, "step": 7117 }, { "epoch": 0.7036551911622965, "grad_norm": 4.001574193412799, "learning_rate": 4.2621228413629715e-08, "loss": 0.7352, "step": 7118 }, { "epoch": 0.7037540469070509, "grad_norm": 12.000232701614044, "learning_rate": 4.259500592357047e-08, "loss": 0.7251, "step": 7119 }, { "epoch": 0.7038529026518053, "grad_norm": 7.09439459186691, "learning_rate": 4.256878931954657e-08, "loss": 0.617, "step": 7120 }, { "epoch": 0.7039517583965598, "grad_norm": 4.089826996412454, "learning_rate": 4.25425786042462e-08, "loss": 0.8402, "step": 7121 }, { "epoch": 0.7040506141413143, "grad_norm": 23.906156185720793, "learning_rate": 4.251637378035681e-08, "loss": 0.7931, "step": 7122 }, { "epoch": 0.7041494698860687, "grad_norm": 3.1789710913045193, "learning_rate": 4.249017485056536e-08, "loss": 0.7957, "step": 7123 }, { "epoch": 0.7042483256308232, "grad_norm": 11.08731434494615, "learning_rate": 4.246398181755817e-08, "loss": 0.7068, "step": 7124 }, { "epoch": 0.7043471813755777, "grad_norm": 10.92260393985129, "learning_rate": 4.243779468402089e-08, "loss": 0.7054, "step": 7125 }, { "epoch": 0.7044460371203322, "grad_norm": 5.098222294923528, "learning_rate": 4.2411613452638686e-08, "loss": 0.7201, "step": 7126 }, { "epoch": 0.7045448928650866, "grad_norm": 4.23680631877255, "learning_rate": 4.238543812609601e-08, "loss": 0.7509, "step": 7127 }, { "epoch": 0.7046437486098411, "grad_norm": 4.27494069813554, "learning_rate": 4.235926870707677e-08, "loss": 0.7784, "step": 7128 }, { "epoch": 0.7047426043545956, "grad_norm": 5.23603896608559, "learning_rate": 4.233310519826431e-08, "loss": 0.686, "step": 7129 }, { "epoch": 0.70484146009935, "grad_norm": 3.176026712239949, "learning_rate": 4.230694760234121e-08, "loss": 0.6351, "step": 7130 }, { "epoch": 0.7049403158441044, "grad_norm": 3.3142590296602465, "learning_rate": 4.228079592198962e-08, "loss": 0.6233, "step": 7131 }, { "epoch": 0.705039171588859, "grad_norm": 4.9303057926105325, "learning_rate": 4.2254650159891026e-08, "loss": 0.7797, "step": 7132 }, { "epoch": 0.7051380273336134, "grad_norm": 4.5423075325528846, "learning_rate": 4.2228510318726206e-08, "loss": 0.7655, "step": 7133 }, { "epoch": 0.7052368830783678, "grad_norm": 3.7352274381409383, "learning_rate": 4.2202376401175474e-08, "loss": 0.719, "step": 7134 }, { "epoch": 0.7053357388231224, "grad_norm": 4.5461994146137465, "learning_rate": 4.2176248409918504e-08, "loss": 0.6398, "step": 7135 }, { "epoch": 0.7054345945678768, "grad_norm": 3.79267438194552, "learning_rate": 4.215012634763425e-08, "loss": 0.6724, "step": 7136 }, { "epoch": 0.7055334503126313, "grad_norm": 6.9236159618297775, "learning_rate": 4.2124010217001236e-08, "loss": 0.6627, "step": 7137 }, { "epoch": 0.7056323060573858, "grad_norm": 7.424118220359549, "learning_rate": 4.209790002069721e-08, "loss": 0.7533, "step": 7138 }, { "epoch": 0.7057311618021402, "grad_norm": 3.3115339338439447, "learning_rate": 4.207179576139945e-08, "loss": 0.7038, "step": 7139 }, { "epoch": 0.7058300175468947, "grad_norm": 4.885036892760342, "learning_rate": 4.204569744178451e-08, "loss": 0.7131, "step": 7140 }, { "epoch": 0.7059288732916491, "grad_norm": 3.3687687444852186, "learning_rate": 4.2019605064528444e-08, "loss": 0.631, "step": 7141 }, { "epoch": 0.7060277290364037, "grad_norm": 3.949452937226992, "learning_rate": 4.1993518632306565e-08, "loss": 0.8222, "step": 7142 }, { "epoch": 0.7061265847811581, "grad_norm": 4.061439592842218, "learning_rate": 4.196743814779373e-08, "loss": 0.7126, "step": 7143 }, { "epoch": 0.7062254405259125, "grad_norm": 3.3878597093092684, "learning_rate": 4.1941363613664046e-08, "loss": 0.7683, "step": 7144 }, { "epoch": 0.7063242962706671, "grad_norm": 4.233334140149833, "learning_rate": 4.1915295032591095e-08, "loss": 0.783, "step": 7145 }, { "epoch": 0.7064231520154215, "grad_norm": 6.098846126414787, "learning_rate": 4.1889232407247856e-08, "loss": 0.6928, "step": 7146 }, { "epoch": 0.7065220077601759, "grad_norm": 3.6644696303036803, "learning_rate": 4.186317574030661e-08, "loss": 0.7197, "step": 7147 }, { "epoch": 0.7066208635049305, "grad_norm": 3.5804933897021014, "learning_rate": 4.1837125034439145e-08, "loss": 0.6973, "step": 7148 }, { "epoch": 0.7067197192496849, "grad_norm": 3.666759106170043, "learning_rate": 4.181108029231651e-08, "loss": 0.6663, "step": 7149 }, { "epoch": 0.7068185749944393, "grad_norm": 3.761212372800903, "learning_rate": 4.1785041516609256e-08, "loss": 0.7063, "step": 7150 }, { "epoch": 0.7069174307391939, "grad_norm": 6.367274296889707, "learning_rate": 4.1759008709987284e-08, "loss": 0.6299, "step": 7151 }, { "epoch": 0.7070162864839483, "grad_norm": 3.5149682323889277, "learning_rate": 4.173298187511983e-08, "loss": 0.686, "step": 7152 }, { "epoch": 0.7071151422287028, "grad_norm": 6.376683677952883, "learning_rate": 4.170696101467558e-08, "loss": 0.697, "step": 7153 }, { "epoch": 0.7072139979734572, "grad_norm": 3.960716441720981, "learning_rate": 4.168094613132262e-08, "loss": 0.6653, "step": 7154 }, { "epoch": 0.7073128537182117, "grad_norm": 4.92844524538202, "learning_rate": 4.165493722772835e-08, "loss": 0.7083, "step": 7155 }, { "epoch": 0.7074117094629662, "grad_norm": 8.287568434528591, "learning_rate": 4.162893430655965e-08, "loss": 0.8366, "step": 7156 }, { "epoch": 0.7075105652077206, "grad_norm": 3.304248141586816, "learning_rate": 4.160293737048266e-08, "loss": 0.7186, "step": 7157 }, { "epoch": 0.7076094209524751, "grad_norm": 8.129703731912658, "learning_rate": 4.157694642216306e-08, "loss": 0.6908, "step": 7158 }, { "epoch": 0.7077082766972296, "grad_norm": 3.4565816665807527, "learning_rate": 4.1550961464265776e-08, "loss": 0.8496, "step": 7159 }, { "epoch": 0.707807132441984, "grad_norm": 3.2320154504039333, "learning_rate": 4.1524982499455244e-08, "loss": 0.6824, "step": 7160 }, { "epoch": 0.7079059881867386, "grad_norm": 4.812802317050291, "learning_rate": 4.1499009530395143e-08, "loss": 0.738, "step": 7161 }, { "epoch": 0.708004843931493, "grad_norm": 5.513297795468437, "learning_rate": 4.147304255974872e-08, "loss": 0.6456, "step": 7162 }, { "epoch": 0.7081036996762474, "grad_norm": 3.8000332682859796, "learning_rate": 4.144708159017839e-08, "loss": 0.8356, "step": 7163 }, { "epoch": 0.7082025554210019, "grad_norm": 5.1156434128363415, "learning_rate": 4.142112662434615e-08, "loss": 0.6468, "step": 7164 }, { "epoch": 0.7083014111657564, "grad_norm": 6.3081443432214375, "learning_rate": 4.1395177664913294e-08, "loss": 0.7103, "step": 7165 }, { "epoch": 0.7084002669105108, "grad_norm": 3.5735351468605776, "learning_rate": 4.136923471454046e-08, "loss": 0.7784, "step": 7166 }, { "epoch": 0.7084991226552653, "grad_norm": 2.8034085704790086, "learning_rate": 4.134329777588773e-08, "loss": 0.6378, "step": 7167 }, { "epoch": 0.7085979784000198, "grad_norm": 7.397070278415529, "learning_rate": 4.131736685161461e-08, "loss": 0.7487, "step": 7168 }, { "epoch": 0.7086968341447742, "grad_norm": 10.016487416425305, "learning_rate": 4.129144194437986e-08, "loss": 0.6969, "step": 7169 }, { "epoch": 0.7087956898895287, "grad_norm": 4.4050566382880865, "learning_rate": 4.1265523056841745e-08, "loss": 0.837, "step": 7170 }, { "epoch": 0.7088945456342832, "grad_norm": 3.4584459064716753, "learning_rate": 4.1239610191657826e-08, "loss": 0.6528, "step": 7171 }, { "epoch": 0.7089934013790377, "grad_norm": 3.4440750620595773, "learning_rate": 4.12137033514851e-08, "loss": 0.6651, "step": 7172 }, { "epoch": 0.7090922571237921, "grad_norm": 7.10471008497355, "learning_rate": 4.118780253897996e-08, "loss": 0.7433, "step": 7173 }, { "epoch": 0.7091911128685465, "grad_norm": 3.1352884422694647, "learning_rate": 4.1161907756798106e-08, "loss": 0.71, "step": 7174 }, { "epoch": 0.7092899686133011, "grad_norm": 5.843182262736252, "learning_rate": 4.113601900759474e-08, "loss": 0.6737, "step": 7175 }, { "epoch": 0.7093888243580555, "grad_norm": 6.110978461766818, "learning_rate": 4.111013629402425e-08, "loss": 0.6147, "step": 7176 }, { "epoch": 0.7094876801028099, "grad_norm": 4.253386340574145, "learning_rate": 4.108425961874066e-08, "loss": 0.7472, "step": 7177 }, { "epoch": 0.7095865358475645, "grad_norm": 6.872792306168577, "learning_rate": 4.105838898439713e-08, "loss": 0.7651, "step": 7178 }, { "epoch": 0.7096853915923189, "grad_norm": 2.7194275676275987, "learning_rate": 4.1032524393646394e-08, "loss": 0.6783, "step": 7179 }, { "epoch": 0.7097842473370733, "grad_norm": 8.709779837841921, "learning_rate": 4.100666584914042e-08, "loss": 0.8754, "step": 7180 }, { "epoch": 0.7098831030818279, "grad_norm": 9.511570800402716, "learning_rate": 4.0980813353530686e-08, "loss": 0.8333, "step": 7181 }, { "epoch": 0.7099819588265823, "grad_norm": 4.9829555573695545, "learning_rate": 4.0954966909467905e-08, "loss": 0.738, "step": 7182 }, { "epoch": 0.7100808145713368, "grad_norm": 5.479746071182573, "learning_rate": 4.09291265196023e-08, "loss": 0.701, "step": 7183 }, { "epoch": 0.7101796703160912, "grad_norm": 3.8583318393224286, "learning_rate": 4.090329218658345e-08, "loss": 0.7691, "step": 7184 }, { "epoch": 0.7102785260608457, "grad_norm": 3.4396114594089515, "learning_rate": 4.0877463913060206e-08, "loss": 0.6874, "step": 7185 }, { "epoch": 0.7103773818056002, "grad_norm": 11.375985972930724, "learning_rate": 4.085164170168092e-08, "loss": 0.6774, "step": 7186 }, { "epoch": 0.7104762375503546, "grad_norm": 3.7087432308194836, "learning_rate": 4.082582555509332e-08, "loss": 0.7291, "step": 7187 }, { "epoch": 0.7105750932951092, "grad_norm": 7.933637303930125, "learning_rate": 4.080001547594438e-08, "loss": 0.7768, "step": 7188 }, { "epoch": 0.7106739490398636, "grad_norm": 7.417532054182681, "learning_rate": 4.0774211466880606e-08, "loss": 0.7672, "step": 7189 }, { "epoch": 0.710772804784618, "grad_norm": 5.807702054391788, "learning_rate": 4.074841353054783e-08, "loss": 0.83, "step": 7190 }, { "epoch": 0.7108716605293726, "grad_norm": 3.9388409534754185, "learning_rate": 4.072262166959119e-08, "loss": 0.7362, "step": 7191 }, { "epoch": 0.710970516274127, "grad_norm": 4.336892855348351, "learning_rate": 4.0696835886655336e-08, "loss": 0.7666, "step": 7192 }, { "epoch": 0.7110693720188814, "grad_norm": 2.884561757033427, "learning_rate": 4.0671056184384175e-08, "loss": 0.6788, "step": 7193 }, { "epoch": 0.7111682277636359, "grad_norm": 8.111279396184418, "learning_rate": 4.0645282565421003e-08, "loss": 0.763, "step": 7194 }, { "epoch": 0.7112670835083904, "grad_norm": 3.69549837187306, "learning_rate": 4.06195150324086e-08, "loss": 0.7813, "step": 7195 }, { "epoch": 0.7113659392531448, "grad_norm": 4.090766688691093, "learning_rate": 4.059375358798897e-08, "loss": 0.6716, "step": 7196 }, { "epoch": 0.7114647949978993, "grad_norm": 3.3916770785395047, "learning_rate": 4.0567998234803604e-08, "loss": 0.7262, "step": 7197 }, { "epoch": 0.7115636507426538, "grad_norm": 4.453203661399148, "learning_rate": 4.054224897549337e-08, "loss": 0.6416, "step": 7198 }, { "epoch": 0.7116625064874083, "grad_norm": 3.6957868711703417, "learning_rate": 4.0516505812698396e-08, "loss": 0.6856, "step": 7199 }, { "epoch": 0.7117613622321627, "grad_norm": 4.741333338158835, "learning_rate": 4.049076874905832e-08, "loss": 0.715, "step": 7200 }, { "epoch": 0.7118602179769172, "grad_norm": 6.061234757732551, "learning_rate": 4.046503778721212e-08, "loss": 0.7746, "step": 7201 }, { "epoch": 0.7119590737216717, "grad_norm": 3.4095660697350634, "learning_rate": 4.0439312929798054e-08, "loss": 0.7549, "step": 7202 }, { "epoch": 0.7120579294664261, "grad_norm": 9.39283891751759, "learning_rate": 4.0413594179453906e-08, "loss": 0.6824, "step": 7203 }, { "epoch": 0.7121567852111805, "grad_norm": 3.7628073693443937, "learning_rate": 4.038788153881667e-08, "loss": 0.6742, "step": 7204 }, { "epoch": 0.7122556409559351, "grad_norm": 12.731873638290553, "learning_rate": 4.0362175010522836e-08, "loss": 0.7373, "step": 7205 }, { "epoch": 0.7123544967006895, "grad_norm": 4.279034018972368, "learning_rate": 4.033647459720828e-08, "loss": 0.6737, "step": 7206 }, { "epoch": 0.7124533524454439, "grad_norm": 3.8841529272756734, "learning_rate": 4.031078030150812e-08, "loss": 0.6266, "step": 7207 }, { "epoch": 0.7125522081901985, "grad_norm": 3.6332887966725425, "learning_rate": 4.028509212605696e-08, "loss": 0.6009, "step": 7208 }, { "epoch": 0.7126510639349529, "grad_norm": 4.313504831501446, "learning_rate": 4.0259410073488785e-08, "loss": 0.7247, "step": 7209 }, { "epoch": 0.7127499196797074, "grad_norm": 7.394639836883826, "learning_rate": 4.023373414643687e-08, "loss": 0.6785, "step": 7210 }, { "epoch": 0.7128487754244619, "grad_norm": 6.622793036804556, "learning_rate": 4.0208064347533865e-08, "loss": 0.6954, "step": 7211 }, { "epoch": 0.7129476311692163, "grad_norm": 49.12174230278148, "learning_rate": 4.018240067941191e-08, "loss": 0.8061, "step": 7212 }, { "epoch": 0.7130464869139708, "grad_norm": 5.4962449605388555, "learning_rate": 4.015674314470235e-08, "loss": 0.6688, "step": 7213 }, { "epoch": 0.7131453426587252, "grad_norm": 4.100387572989624, "learning_rate": 4.013109174603607e-08, "loss": 0.6952, "step": 7214 }, { "epoch": 0.7132441984034797, "grad_norm": 4.714319862904531, "learning_rate": 4.010544648604317e-08, "loss": 0.6099, "step": 7215 }, { "epoch": 0.7133430541482342, "grad_norm": 9.418752542828472, "learning_rate": 4.0079807367353226e-08, "loss": 0.6704, "step": 7216 }, { "epoch": 0.7134419098929886, "grad_norm": 7.441707733515792, "learning_rate": 4.005417439259518e-08, "loss": 0.679, "step": 7217 }, { "epoch": 0.7135407656377432, "grad_norm": 4.550048217578439, "learning_rate": 4.0028547564397255e-08, "loss": 0.7649, "step": 7218 }, { "epoch": 0.7136396213824976, "grad_norm": 3.902056909234297, "learning_rate": 4.0002926885387136e-08, "loss": 0.8116, "step": 7219 }, { "epoch": 0.713738477127252, "grad_norm": 5.2151751478334285, "learning_rate": 3.9977312358191885e-08, "loss": 0.6541, "step": 7220 }, { "epoch": 0.7138373328720066, "grad_norm": 4.385114318994977, "learning_rate": 3.9951703985437814e-08, "loss": 0.69, "step": 7221 }, { "epoch": 0.713936188616761, "grad_norm": 2.9428628789085147, "learning_rate": 3.992610176975072e-08, "loss": 0.6998, "step": 7222 }, { "epoch": 0.7140350443615154, "grad_norm": 3.8243811699042607, "learning_rate": 3.9900505713755786e-08, "loss": 0.7899, "step": 7223 }, { "epoch": 0.71413390010627, "grad_norm": 5.232589690573017, "learning_rate": 3.9874915820077415e-08, "loss": 0.6005, "step": 7224 }, { "epoch": 0.7142327558510244, "grad_norm": 4.500795495086504, "learning_rate": 3.984933209133956e-08, "loss": 0.7156, "step": 7225 }, { "epoch": 0.7143316115957788, "grad_norm": 5.752989649198968, "learning_rate": 3.9823754530165374e-08, "loss": 0.6963, "step": 7226 }, { "epoch": 0.7144304673405333, "grad_norm": 3.6456077733059917, "learning_rate": 3.979818313917753e-08, "loss": 0.7257, "step": 7227 }, { "epoch": 0.7145293230852878, "grad_norm": 3.275052817329671, "learning_rate": 3.9772617920997944e-08, "loss": 0.783, "step": 7228 }, { "epoch": 0.7146281788300423, "grad_norm": 6.041836661263248, "learning_rate": 3.974705887824801e-08, "loss": 0.8157, "step": 7229 }, { "epoch": 0.7147270345747967, "grad_norm": 2.757069223990188, "learning_rate": 3.972150601354834e-08, "loss": 0.7483, "step": 7230 }, { "epoch": 0.7148258903195512, "grad_norm": 4.801815665471596, "learning_rate": 3.9695959329519114e-08, "loss": 0.7638, "step": 7231 }, { "epoch": 0.7149247460643057, "grad_norm": 4.803006410234423, "learning_rate": 3.967041882877966e-08, "loss": 0.8053, "step": 7232 }, { "epoch": 0.7150236018090601, "grad_norm": 3.9527041072027846, "learning_rate": 3.964488451394884e-08, "loss": 0.7576, "step": 7233 }, { "epoch": 0.7151224575538146, "grad_norm": 5.069433693207704, "learning_rate": 3.961935638764485e-08, "loss": 0.7557, "step": 7234 }, { "epoch": 0.7152213132985691, "grad_norm": 4.603975333410448, "learning_rate": 3.959383445248514e-08, "loss": 0.6804, "step": 7235 }, { "epoch": 0.7153201690433235, "grad_norm": 5.1960754221607655, "learning_rate": 3.95683187110867e-08, "loss": 0.6072, "step": 7236 }, { "epoch": 0.715419024788078, "grad_norm": 6.190205584124171, "learning_rate": 3.954280916606571e-08, "loss": 0.6431, "step": 7237 }, { "epoch": 0.7155178805328325, "grad_norm": 4.574662210280579, "learning_rate": 3.9517305820037826e-08, "loss": 0.6776, "step": 7238 }, { "epoch": 0.7156167362775869, "grad_norm": 7.189538630389273, "learning_rate": 3.949180867561809e-08, "loss": 0.6903, "step": 7239 }, { "epoch": 0.7157155920223414, "grad_norm": 6.259109202330865, "learning_rate": 3.9466317735420775e-08, "loss": 0.6476, "step": 7240 }, { "epoch": 0.7158144477670959, "grad_norm": 3.4771908472219413, "learning_rate": 3.9440833002059645e-08, "loss": 0.668, "step": 7241 }, { "epoch": 0.7159133035118503, "grad_norm": 7.219237989135649, "learning_rate": 3.941535447814781e-08, "loss": 0.6529, "step": 7242 }, { "epoch": 0.7160121592566048, "grad_norm": 4.1864913838083115, "learning_rate": 3.938988216629766e-08, "loss": 0.7093, "step": 7243 }, { "epoch": 0.7161110150013593, "grad_norm": 4.968633008795248, "learning_rate": 3.9364416069121066e-08, "loss": 0.6016, "step": 7244 }, { "epoch": 0.7162098707461138, "grad_norm": 10.059162178520529, "learning_rate": 3.9338956189229124e-08, "loss": 0.5587, "step": 7245 }, { "epoch": 0.7163087264908682, "grad_norm": 8.016012225073071, "learning_rate": 3.931350252923244e-08, "loss": 0.7628, "step": 7246 }, { "epoch": 0.7164075822356226, "grad_norm": 6.153936816751572, "learning_rate": 3.928805509174087e-08, "loss": 0.6703, "step": 7247 }, { "epoch": 0.7165064379803772, "grad_norm": 9.743808958443637, "learning_rate": 3.92626138793637e-08, "loss": 0.6951, "step": 7248 }, { "epoch": 0.7166052937251316, "grad_norm": 3.3218129792986484, "learning_rate": 3.923717889470952e-08, "loss": 0.6526, "step": 7249 }, { "epoch": 0.716704149469886, "grad_norm": 3.828314231317914, "learning_rate": 3.9211750140386354e-08, "loss": 0.7478, "step": 7250 }, { "epoch": 0.7168030052146406, "grad_norm": 4.622625743351136, "learning_rate": 3.918632761900149e-08, "loss": 0.7768, "step": 7251 }, { "epoch": 0.716901860959395, "grad_norm": 4.032919517063897, "learning_rate": 3.916091133316166e-08, "loss": 0.7106, "step": 7252 }, { "epoch": 0.7170007167041494, "grad_norm": 3.614769658136436, "learning_rate": 3.913550128547297e-08, "loss": 0.6679, "step": 7253 }, { "epoch": 0.717099572448904, "grad_norm": 4.9103931899585636, "learning_rate": 3.911009747854077e-08, "loss": 0.8231, "step": 7254 }, { "epoch": 0.7171984281936584, "grad_norm": 4.415993719952776, "learning_rate": 3.908469991496989e-08, "loss": 0.7668, "step": 7255 }, { "epoch": 0.7172972839384129, "grad_norm": 4.1812657452159305, "learning_rate": 3.9059308597364505e-08, "loss": 0.8071, "step": 7256 }, { "epoch": 0.7173961396831673, "grad_norm": 7.876578653183208, "learning_rate": 3.903392352832805e-08, "loss": 0.6671, "step": 7257 }, { "epoch": 0.7174949954279218, "grad_norm": 3.4212707603754158, "learning_rate": 3.900854471046345e-08, "loss": 0.709, "step": 7258 }, { "epoch": 0.7175938511726763, "grad_norm": 10.48144188768974, "learning_rate": 3.898317214637288e-08, "loss": 0.6852, "step": 7259 }, { "epoch": 0.7176927069174307, "grad_norm": 4.191491797066778, "learning_rate": 3.895780583865794e-08, "loss": 0.62, "step": 7260 }, { "epoch": 0.7177915626621852, "grad_norm": 3.154361399329377, "learning_rate": 3.893244578991961e-08, "loss": 0.7197, "step": 7261 }, { "epoch": 0.7178904184069397, "grad_norm": 3.8207507775030733, "learning_rate": 3.8907092002758126e-08, "loss": 0.7889, "step": 7262 }, { "epoch": 0.7179892741516941, "grad_norm": 4.809156188299764, "learning_rate": 3.8881744479773194e-08, "loss": 0.7377, "step": 7263 }, { "epoch": 0.7180881298964487, "grad_norm": 7.4247787976058754, "learning_rate": 3.885640322356382e-08, "loss": 0.7052, "step": 7264 }, { "epoch": 0.7181869856412031, "grad_norm": 6.740803316011307, "learning_rate": 3.8831068236728325e-08, "loss": 0.6873, "step": 7265 }, { "epoch": 0.7182858413859575, "grad_norm": 3.517649558469712, "learning_rate": 3.880573952186448e-08, "loss": 0.7278, "step": 7266 }, { "epoch": 0.718384697130712, "grad_norm": 5.684897771899062, "learning_rate": 3.8780417081569396e-08, "loss": 0.6787, "step": 7267 }, { "epoch": 0.7184835528754665, "grad_norm": 4.044140096900547, "learning_rate": 3.8755100918439457e-08, "loss": 0.6703, "step": 7268 }, { "epoch": 0.7185824086202209, "grad_norm": 3.534216382410133, "learning_rate": 3.8729791035070525e-08, "loss": 0.5907, "step": 7269 }, { "epoch": 0.7186812643649754, "grad_norm": 4.564932700707634, "learning_rate": 3.87044874340577e-08, "loss": 0.6249, "step": 7270 }, { "epoch": 0.7187801201097299, "grad_norm": 4.1463939580754, "learning_rate": 3.8679190117995494e-08, "loss": 0.757, "step": 7271 }, { "epoch": 0.7188789758544843, "grad_norm": 8.963493258081316, "learning_rate": 3.865389908947785e-08, "loss": 0.6969, "step": 7272 }, { "epoch": 0.7189778315992388, "grad_norm": 7.96203486952326, "learning_rate": 3.86286143510979e-08, "loss": 0.7054, "step": 7273 }, { "epoch": 0.7190766873439933, "grad_norm": 4.992801524848335, "learning_rate": 3.860333590544824e-08, "loss": 0.702, "step": 7274 }, { "epoch": 0.7191755430887478, "grad_norm": 7.7715004477342555, "learning_rate": 3.857806375512087e-08, "loss": 0.7474, "step": 7275 }, { "epoch": 0.7192743988335022, "grad_norm": 2.758760578435779, "learning_rate": 3.855279790270699e-08, "loss": 0.696, "step": 7276 }, { "epoch": 0.7193732545782566, "grad_norm": 4.572787141803022, "learning_rate": 3.8527538350797263e-08, "loss": 0.7434, "step": 7277 }, { "epoch": 0.7194721103230112, "grad_norm": 4.158001076278279, "learning_rate": 3.850228510198173e-08, "loss": 0.7634, "step": 7278 }, { "epoch": 0.7195709660677656, "grad_norm": 3.4832393659395984, "learning_rate": 3.847703815884969e-08, "loss": 0.5925, "step": 7279 }, { "epoch": 0.71966982181252, "grad_norm": 6.049605336973558, "learning_rate": 3.8451797523989885e-08, "loss": 0.6403, "step": 7280 }, { "epoch": 0.7197686775572746, "grad_norm": 4.602179710988769, "learning_rate": 3.8426563199990336e-08, "loss": 0.7629, "step": 7281 }, { "epoch": 0.719867533302029, "grad_norm": 3.1811931328295073, "learning_rate": 3.840133518943842e-08, "loss": 0.5972, "step": 7282 }, { "epoch": 0.7199663890467835, "grad_norm": 3.1099179976933313, "learning_rate": 3.837611349492098e-08, "loss": 0.6779, "step": 7283 }, { "epoch": 0.720065244791538, "grad_norm": 15.871684100894305, "learning_rate": 3.8350898119024055e-08, "loss": 0.6108, "step": 7284 }, { "epoch": 0.7201641005362924, "grad_norm": 3.002651036758441, "learning_rate": 3.832568906433313e-08, "loss": 0.6653, "step": 7285 }, { "epoch": 0.7202629562810469, "grad_norm": 19.20528807672984, "learning_rate": 3.8300486333433065e-08, "loss": 0.7197, "step": 7286 }, { "epoch": 0.7203618120258013, "grad_norm": 6.091391702347325, "learning_rate": 3.827528992890797e-08, "loss": 0.7082, "step": 7287 }, { "epoch": 0.7204606677705558, "grad_norm": 8.07608116927132, "learning_rate": 3.825009985334138e-08, "loss": 0.6642, "step": 7288 }, { "epoch": 0.7205595235153103, "grad_norm": 3.2803409873650478, "learning_rate": 3.822491610931622e-08, "loss": 0.8123, "step": 7289 }, { "epoch": 0.7206583792600647, "grad_norm": 4.622622419804162, "learning_rate": 3.819973869941462e-08, "loss": 0.6422, "step": 7290 }, { "epoch": 0.7207572350048193, "grad_norm": 6.058913313544786, "learning_rate": 3.8174567626218234e-08, "loss": 0.6294, "step": 7291 }, { "epoch": 0.7208560907495737, "grad_norm": 6.164019313859351, "learning_rate": 3.8149402892307926e-08, "loss": 0.7133, "step": 7292 }, { "epoch": 0.7209549464943281, "grad_norm": 8.406567617267314, "learning_rate": 3.812424450026398e-08, "loss": 0.7779, "step": 7293 }, { "epoch": 0.7210538022390827, "grad_norm": 4.179730782252114, "learning_rate": 3.809909245266607e-08, "loss": 0.7808, "step": 7294 }, { "epoch": 0.7211526579838371, "grad_norm": 14.40445112674945, "learning_rate": 3.807394675209309e-08, "loss": 0.6324, "step": 7295 }, { "epoch": 0.7212515137285915, "grad_norm": 3.118803719832981, "learning_rate": 3.80488074011234e-08, "loss": 0.7631, "step": 7296 }, { "epoch": 0.721350369473346, "grad_norm": 3.068769986251991, "learning_rate": 3.802367440233468e-08, "loss": 0.7202, "step": 7297 }, { "epoch": 0.7214492252181005, "grad_norm": 3.607349233601858, "learning_rate": 3.7998547758303956e-08, "loss": 0.818, "step": 7298 }, { "epoch": 0.7215480809628549, "grad_norm": 4.375728144249166, "learning_rate": 3.7973427471607536e-08, "loss": 0.7065, "step": 7299 }, { "epoch": 0.7216469367076094, "grad_norm": 4.188191294007096, "learning_rate": 3.794831354482121e-08, "loss": 0.5747, "step": 7300 }, { "epoch": 0.7217457924523639, "grad_norm": 3.746743819838766, "learning_rate": 3.792320598051996e-08, "loss": 0.6863, "step": 7301 }, { "epoch": 0.7218446481971184, "grad_norm": 4.007815714558189, "learning_rate": 3.7898104781278276e-08, "loss": 0.6992, "step": 7302 }, { "epoch": 0.7219435039418728, "grad_norm": 3.7910130943118197, "learning_rate": 3.7873009949669846e-08, "loss": 0.7794, "step": 7303 }, { "epoch": 0.7220423596866273, "grad_norm": 4.762449606599511, "learning_rate": 3.784792148826781e-08, "loss": 0.6732, "step": 7304 }, { "epoch": 0.7221412154313818, "grad_norm": 3.4203276834292975, "learning_rate": 3.7822839399644655e-08, "loss": 0.6884, "step": 7305 }, { "epoch": 0.7222400711761362, "grad_norm": 3.9950114656588585, "learning_rate": 3.77977636863721e-08, "loss": 0.8591, "step": 7306 }, { "epoch": 0.7223389269208907, "grad_norm": 3.686630045279255, "learning_rate": 3.777269435102134e-08, "loss": 0.8105, "step": 7307 }, { "epoch": 0.7224377826656452, "grad_norm": 3.130114398638946, "learning_rate": 3.774763139616288e-08, "loss": 0.621, "step": 7308 }, { "epoch": 0.7225366384103996, "grad_norm": 5.383661646167647, "learning_rate": 3.772257482436651e-08, "loss": 0.7834, "step": 7309 }, { "epoch": 0.722635494155154, "grad_norm": 3.532396032832338, "learning_rate": 3.7697524638201426e-08, "loss": 0.7656, "step": 7310 }, { "epoch": 0.7227343498999086, "grad_norm": 3.1655790533912977, "learning_rate": 3.767248084023621e-08, "loss": 0.7025, "step": 7311 }, { "epoch": 0.722833205644663, "grad_norm": 7.743606568238336, "learning_rate": 3.764744343303864e-08, "loss": 0.6332, "step": 7312 }, { "epoch": 0.7229320613894175, "grad_norm": 5.695455196302267, "learning_rate": 3.762241241917602e-08, "loss": 0.7623, "step": 7313 }, { "epoch": 0.723030917134172, "grad_norm": 3.3090528178229404, "learning_rate": 3.759738780121484e-08, "loss": 0.6543, "step": 7314 }, { "epoch": 0.7231297728789264, "grad_norm": 3.282117749879745, "learning_rate": 3.7572369581721075e-08, "loss": 0.7321, "step": 7315 }, { "epoch": 0.7232286286236809, "grad_norm": 5.968436571668573, "learning_rate": 3.75473577632599e-08, "loss": 0.7813, "step": 7316 }, { "epoch": 0.7233274843684354, "grad_norm": 5.6898750542813605, "learning_rate": 3.7522352348395983e-08, "loss": 0.6998, "step": 7317 }, { "epoch": 0.7234263401131898, "grad_norm": 8.636242442216014, "learning_rate": 3.7497353339693185e-08, "loss": 0.6337, "step": 7318 }, { "epoch": 0.7235251958579443, "grad_norm": 5.018692664621684, "learning_rate": 3.747236073971485e-08, "loss": 0.7249, "step": 7319 }, { "epoch": 0.7236240516026987, "grad_norm": 6.990117444758741, "learning_rate": 3.744737455102355e-08, "loss": 0.6772, "step": 7320 }, { "epoch": 0.7237229073474533, "grad_norm": 3.029985714054099, "learning_rate": 3.7422394776181264e-08, "loss": 0.629, "step": 7321 }, { "epoch": 0.7238217630922077, "grad_norm": 4.519916165996968, "learning_rate": 3.739742141774935e-08, "loss": 0.7856, "step": 7322 }, { "epoch": 0.7239206188369621, "grad_norm": 12.572969578201336, "learning_rate": 3.737245447828838e-08, "loss": 0.6027, "step": 7323 }, { "epoch": 0.7240194745817167, "grad_norm": 3.050312762739581, "learning_rate": 3.7347493960358423e-08, "loss": 0.7497, "step": 7324 }, { "epoch": 0.7241183303264711, "grad_norm": 5.674091492504297, "learning_rate": 3.732253986651873e-08, "loss": 0.6783, "step": 7325 }, { "epoch": 0.7242171860712255, "grad_norm": 4.170333137661759, "learning_rate": 3.729759219932802e-08, "loss": 0.6514, "step": 7326 }, { "epoch": 0.7243160418159801, "grad_norm": 4.364500947319158, "learning_rate": 3.727265096134433e-08, "loss": 0.6797, "step": 7327 }, { "epoch": 0.7244148975607345, "grad_norm": 5.255068144104749, "learning_rate": 3.7247716155124965e-08, "loss": 0.748, "step": 7328 }, { "epoch": 0.724513753305489, "grad_norm": 4.051218545819326, "learning_rate": 3.722278778322664e-08, "loss": 0.7683, "step": 7329 }, { "epoch": 0.7246126090502434, "grad_norm": 10.835198070045639, "learning_rate": 3.7197865848205446e-08, "loss": 0.8365, "step": 7330 }, { "epoch": 0.7247114647949979, "grad_norm": 4.4655424090683224, "learning_rate": 3.717295035261666e-08, "loss": 0.7806, "step": 7331 }, { "epoch": 0.7248103205397524, "grad_norm": 4.532474818486525, "learning_rate": 3.71480412990151e-08, "loss": 0.7371, "step": 7332 }, { "epoch": 0.7249091762845068, "grad_norm": 3.5373599779345843, "learning_rate": 3.7123138689954736e-08, "loss": 0.5911, "step": 7333 }, { "epoch": 0.7250080320292613, "grad_norm": 2.959868965400439, "learning_rate": 3.709824252798903e-08, "loss": 0.576, "step": 7334 }, { "epoch": 0.7251068877740158, "grad_norm": 8.083983988183228, "learning_rate": 3.707335281567069e-08, "loss": 0.7413, "step": 7335 }, { "epoch": 0.7252057435187702, "grad_norm": 4.220084962807107, "learning_rate": 3.7048469555551754e-08, "loss": 0.7713, "step": 7336 }, { "epoch": 0.7253045992635248, "grad_norm": 3.7259260405457337, "learning_rate": 3.7023592750183654e-08, "loss": 0.6555, "step": 7337 }, { "epoch": 0.7254034550082792, "grad_norm": 6.19648662530911, "learning_rate": 3.699872240211719e-08, "loss": 0.6701, "step": 7338 }, { "epoch": 0.7255023107530336, "grad_norm": 3.8911637241846724, "learning_rate": 3.697385851390238e-08, "loss": 0.7364, "step": 7339 }, { "epoch": 0.725601166497788, "grad_norm": 3.6743817295025574, "learning_rate": 3.694900108808867e-08, "loss": 0.7429, "step": 7340 }, { "epoch": 0.7257000222425426, "grad_norm": 6.531392739973485, "learning_rate": 3.692415012722487e-08, "loss": 0.7291, "step": 7341 }, { "epoch": 0.725798877987297, "grad_norm": 5.183875581091765, "learning_rate": 3.6899305633859014e-08, "loss": 0.7612, "step": 7342 }, { "epoch": 0.7258977337320515, "grad_norm": 3.6868071948126078, "learning_rate": 3.687446761053855e-08, "loss": 0.6757, "step": 7343 }, { "epoch": 0.725996589476806, "grad_norm": 5.74727703121191, "learning_rate": 3.6849636059810306e-08, "loss": 0.7899, "step": 7344 }, { "epoch": 0.7260954452215604, "grad_norm": 6.185501433262192, "learning_rate": 3.682481098422031e-08, "loss": 0.7072, "step": 7345 }, { "epoch": 0.7261943009663149, "grad_norm": 3.2652266885916497, "learning_rate": 3.6799992386314094e-08, "loss": 0.7171, "step": 7346 }, { "epoch": 0.7262931567110694, "grad_norm": 4.067004127937666, "learning_rate": 3.6775180268636355e-08, "loss": 0.7374, "step": 7347 }, { "epoch": 0.7263920124558239, "grad_norm": 10.161158229326492, "learning_rate": 3.675037463373124e-08, "loss": 0.7398, "step": 7348 }, { "epoch": 0.7264908682005783, "grad_norm": 5.42528044095739, "learning_rate": 3.6725575484142256e-08, "loss": 0.7313, "step": 7349 }, { "epoch": 0.7265897239453327, "grad_norm": 5.748117432053352, "learning_rate": 3.6700782822412104e-08, "loss": 0.6771, "step": 7350 }, { "epoch": 0.7266885796900873, "grad_norm": 6.430398489218253, "learning_rate": 3.6675996651082975e-08, "loss": 0.7826, "step": 7351 }, { "epoch": 0.7267874354348417, "grad_norm": 3.197896845969707, "learning_rate": 3.6651216972696285e-08, "loss": 0.6759, "step": 7352 }, { "epoch": 0.7268862911795961, "grad_norm": 5.170561135574947, "learning_rate": 3.6626443789792815e-08, "loss": 0.7392, "step": 7353 }, { "epoch": 0.7269851469243507, "grad_norm": 3.9338373131172957, "learning_rate": 3.660167710491271e-08, "loss": 0.7352, "step": 7354 }, { "epoch": 0.7270840026691051, "grad_norm": 5.266194256581534, "learning_rate": 3.6576916920595446e-08, "loss": 0.8256, "step": 7355 }, { "epoch": 0.7271828584138595, "grad_norm": 3.4700814323499625, "learning_rate": 3.6552163239379774e-08, "loss": 0.5898, "step": 7356 }, { "epoch": 0.7272817141586141, "grad_norm": 3.428373436054677, "learning_rate": 3.6527416063803875e-08, "loss": 0.7144, "step": 7357 }, { "epoch": 0.7273805699033685, "grad_norm": 3.444372794521405, "learning_rate": 3.650267539640514e-08, "loss": 0.7893, "step": 7358 }, { "epoch": 0.727479425648123, "grad_norm": 2.834926209772839, "learning_rate": 3.647794123972038e-08, "loss": 0.7409, "step": 7359 }, { "epoch": 0.7275782813928774, "grad_norm": 5.69860784710094, "learning_rate": 3.645321359628577e-08, "loss": 0.719, "step": 7360 }, { "epoch": 0.7276771371376319, "grad_norm": 3.9545425434718733, "learning_rate": 3.6428492468636686e-08, "loss": 0.6888, "step": 7361 }, { "epoch": 0.7277759928823864, "grad_norm": 5.823664045510305, "learning_rate": 3.640377785930796e-08, "loss": 0.6834, "step": 7362 }, { "epoch": 0.7278748486271408, "grad_norm": 3.107355404784063, "learning_rate": 3.6379069770833734e-08, "loss": 0.6758, "step": 7363 }, { "epoch": 0.7279737043718953, "grad_norm": 12.19219174770848, "learning_rate": 3.63543682057474e-08, "loss": 0.7382, "step": 7364 }, { "epoch": 0.7280725601166498, "grad_norm": 3.7154345634182415, "learning_rate": 3.6329673166581754e-08, "loss": 0.6769, "step": 7365 }, { "epoch": 0.7281714158614042, "grad_norm": 13.191955904067434, "learning_rate": 3.630498465586896e-08, "loss": 0.7036, "step": 7366 }, { "epoch": 0.7282702716061588, "grad_norm": 2.5622003678921765, "learning_rate": 3.6280302676140406e-08, "loss": 0.5699, "step": 7367 }, { "epoch": 0.7283691273509132, "grad_norm": 2.8140690293596475, "learning_rate": 3.62556272299269e-08, "loss": 0.7317, "step": 7368 }, { "epoch": 0.7284679830956676, "grad_norm": 5.962688836187505, "learning_rate": 3.623095831975852e-08, "loss": 0.7149, "step": 7369 }, { "epoch": 0.7285668388404221, "grad_norm": 3.521524220954424, "learning_rate": 3.6206295948164676e-08, "loss": 0.7064, "step": 7370 }, { "epoch": 0.7286656945851766, "grad_norm": 3.885849768376188, "learning_rate": 3.618164011767421e-08, "loss": 0.7111, "step": 7371 }, { "epoch": 0.728764550329931, "grad_norm": 3.608310023303222, "learning_rate": 3.6156990830815106e-08, "loss": 0.7299, "step": 7372 }, { "epoch": 0.7288634060746855, "grad_norm": 3.879972226997316, "learning_rate": 3.613234809011485e-08, "loss": 0.7228, "step": 7373 }, { "epoch": 0.72896226181944, "grad_norm": 4.750971679774283, "learning_rate": 3.610771189810021e-08, "loss": 0.7265, "step": 7374 }, { "epoch": 0.7290611175641944, "grad_norm": 77.13859105497136, "learning_rate": 3.6083082257297215e-08, "loss": 0.8042, "step": 7375 }, { "epoch": 0.7291599733089489, "grad_norm": 4.988003077417005, "learning_rate": 3.605845917023129e-08, "loss": 0.6589, "step": 7376 }, { "epoch": 0.7292588290537034, "grad_norm": 7.849302359150328, "learning_rate": 3.60338426394272e-08, "loss": 0.7589, "step": 7377 }, { "epoch": 0.7293576847984579, "grad_norm": 3.61602888774024, "learning_rate": 3.600923266740896e-08, "loss": 0.6609, "step": 7378 }, { "epoch": 0.7294565405432123, "grad_norm": 3.8838186209892296, "learning_rate": 3.5984629256700004e-08, "loss": 0.7452, "step": 7379 }, { "epoch": 0.7295553962879668, "grad_norm": 3.3779892351436933, "learning_rate": 3.596003240982299e-08, "loss": 0.6061, "step": 7380 }, { "epoch": 0.7296542520327213, "grad_norm": 4.66135140867551, "learning_rate": 3.593544212930001e-08, "loss": 0.745, "step": 7381 }, { "epoch": 0.7297531077774757, "grad_norm": 7.2549223743340185, "learning_rate": 3.5910858417652436e-08, "loss": 0.6487, "step": 7382 }, { "epoch": 0.7298519635222301, "grad_norm": 10.862855724991713, "learning_rate": 3.588628127740092e-08, "loss": 0.6936, "step": 7383 }, { "epoch": 0.7299508192669847, "grad_norm": 4.45758219101402, "learning_rate": 3.586171071106553e-08, "loss": 0.636, "step": 7384 }, { "epoch": 0.7300496750117391, "grad_norm": 3.144296742316466, "learning_rate": 3.583714672116561e-08, "loss": 0.578, "step": 7385 }, { "epoch": 0.7301485307564936, "grad_norm": 16.03708348386177, "learning_rate": 3.5812589310219844e-08, "loss": 0.7896, "step": 7386 }, { "epoch": 0.7302473865012481, "grad_norm": 6.05445394716261, "learning_rate": 3.578803848074618e-08, "loss": 0.7093, "step": 7387 }, { "epoch": 0.7303462422460025, "grad_norm": 3.238085886840995, "learning_rate": 3.5763494235262004e-08, "loss": 0.7693, "step": 7388 }, { "epoch": 0.730445097990757, "grad_norm": 3.3500755218596883, "learning_rate": 3.5738956576283894e-08, "loss": 0.718, "step": 7389 }, { "epoch": 0.7305439537355115, "grad_norm": 4.309545326748535, "learning_rate": 3.571442550632792e-08, "loss": 0.6553, "step": 7390 }, { "epoch": 0.7306428094802659, "grad_norm": 3.837182821225649, "learning_rate": 3.5689901027909294e-08, "loss": 0.7566, "step": 7391 }, { "epoch": 0.7307416652250204, "grad_norm": 10.343614384707289, "learning_rate": 3.566538314354267e-08, "loss": 0.6249, "step": 7392 }, { "epoch": 0.7308405209697748, "grad_norm": 3.169657567828645, "learning_rate": 3.564087185574205e-08, "loss": 0.749, "step": 7393 }, { "epoch": 0.7309393767145294, "grad_norm": 12.508530284009275, "learning_rate": 3.561636716702061e-08, "loss": 0.6439, "step": 7394 }, { "epoch": 0.7310382324592838, "grad_norm": 3.9293743800319487, "learning_rate": 3.5591869079891e-08, "loss": 0.7854, "step": 7395 }, { "epoch": 0.7311370882040382, "grad_norm": 3.048484397765153, "learning_rate": 3.5567377596865156e-08, "loss": 0.6835, "step": 7396 }, { "epoch": 0.7312359439487928, "grad_norm": 5.1085240418533, "learning_rate": 3.5542892720454264e-08, "loss": 0.7112, "step": 7397 }, { "epoch": 0.7313347996935472, "grad_norm": 3.53750653493388, "learning_rate": 3.551841445316891e-08, "loss": 0.7539, "step": 7398 }, { "epoch": 0.7314336554383016, "grad_norm": 5.569889898845101, "learning_rate": 3.5493942797519016e-08, "loss": 0.6989, "step": 7399 }, { "epoch": 0.7315325111830562, "grad_norm": 3.5302841587831693, "learning_rate": 3.546947775601373e-08, "loss": 0.7302, "step": 7400 }, { "epoch": 0.7316313669278106, "grad_norm": 4.282807525550946, "learning_rate": 3.544501933116164e-08, "loss": 0.6692, "step": 7401 }, { "epoch": 0.731730222672565, "grad_norm": 5.222107790246693, "learning_rate": 3.542056752547052e-08, "loss": 0.6724, "step": 7402 }, { "epoch": 0.7318290784173195, "grad_norm": 7.618750843075519, "learning_rate": 3.539612234144763e-08, "loss": 0.7275, "step": 7403 }, { "epoch": 0.731927934162074, "grad_norm": 4.33156286328052, "learning_rate": 3.537168378159938e-08, "loss": 0.7415, "step": 7404 }, { "epoch": 0.7320267899068285, "grad_norm": 10.578897364390052, "learning_rate": 3.534725184843167e-08, "loss": 0.6608, "step": 7405 }, { "epoch": 0.7321256456515829, "grad_norm": 4.071904674769014, "learning_rate": 3.532282654444955e-08, "loss": 0.693, "step": 7406 }, { "epoch": 0.7322245013963374, "grad_norm": 3.7362948624904964, "learning_rate": 3.529840787215753e-08, "loss": 0.683, "step": 7407 }, { "epoch": 0.7323233571410919, "grad_norm": 5.20605071246516, "learning_rate": 3.527399583405936e-08, "loss": 0.6591, "step": 7408 }, { "epoch": 0.7324222128858463, "grad_norm": 4.077748064387112, "learning_rate": 3.524959043265812e-08, "loss": 0.6578, "step": 7409 }, { "epoch": 0.7325210686306008, "grad_norm": 5.207410011729742, "learning_rate": 3.522519167045629e-08, "loss": 0.7288, "step": 7410 }, { "epoch": 0.7326199243753553, "grad_norm": 3.130241474950181, "learning_rate": 3.520079954995552e-08, "loss": 0.6408, "step": 7411 }, { "epoch": 0.7327187801201097, "grad_norm": 4.657718302513017, "learning_rate": 3.5176414073656936e-08, "loss": 0.7478, "step": 7412 }, { "epoch": 0.7328176358648641, "grad_norm": 5.2845170041554566, "learning_rate": 3.515203524406083e-08, "loss": 0.7429, "step": 7413 }, { "epoch": 0.7329164916096187, "grad_norm": 3.660652662868905, "learning_rate": 3.5127663063666944e-08, "loss": 0.6832, "step": 7414 }, { "epoch": 0.7330153473543731, "grad_norm": 3.1643111940130297, "learning_rate": 3.510329753497431e-08, "loss": 0.7629, "step": 7415 }, { "epoch": 0.7331142030991276, "grad_norm": 3.688840719368677, "learning_rate": 3.507893866048119e-08, "loss": 0.6935, "step": 7416 }, { "epoch": 0.7332130588438821, "grad_norm": 4.5977069067342375, "learning_rate": 3.505458644268525e-08, "loss": 0.7176, "step": 7417 }, { "epoch": 0.7333119145886365, "grad_norm": 4.171768732989724, "learning_rate": 3.503024088408349e-08, "loss": 0.5957, "step": 7418 }, { "epoch": 0.733410770333391, "grad_norm": 3.4564311213355543, "learning_rate": 3.500590198717214e-08, "loss": 0.6309, "step": 7419 }, { "epoch": 0.7335096260781455, "grad_norm": 9.668442513248033, "learning_rate": 3.4981569754446827e-08, "loss": 0.7048, "step": 7420 }, { "epoch": 0.7336084818229, "grad_norm": 6.564653996910882, "learning_rate": 3.4957244188402427e-08, "loss": 0.7521, "step": 7421 }, { "epoch": 0.7337073375676544, "grad_norm": 3.4707177873445847, "learning_rate": 3.49329252915332e-08, "loss": 0.6263, "step": 7422 }, { "epoch": 0.7338061933124088, "grad_norm": 6.089389960152548, "learning_rate": 3.490861306633269e-08, "loss": 0.597, "step": 7423 }, { "epoch": 0.7339050490571634, "grad_norm": 9.698114871013251, "learning_rate": 3.488430751529371e-08, "loss": 0.6695, "step": 7424 }, { "epoch": 0.7340039048019178, "grad_norm": 3.2133813969935896, "learning_rate": 3.486000864090848e-08, "loss": 0.8143, "step": 7425 }, { "epoch": 0.7341027605466722, "grad_norm": 3.746452946329229, "learning_rate": 3.4835716445668493e-08, "loss": 0.7585, "step": 7426 }, { "epoch": 0.7342016162914268, "grad_norm": 15.15664327618828, "learning_rate": 3.481143093206453e-08, "loss": 0.7051, "step": 7427 }, { "epoch": 0.7343004720361812, "grad_norm": 3.9686172649529046, "learning_rate": 3.4787152102586716e-08, "loss": 0.6238, "step": 7428 }, { "epoch": 0.7343993277809356, "grad_norm": 5.288245161563879, "learning_rate": 3.4762879959724536e-08, "loss": 0.6759, "step": 7429 }, { "epoch": 0.7344981835256902, "grad_norm": 6.26369992211701, "learning_rate": 3.473861450596667e-08, "loss": 0.7862, "step": 7430 }, { "epoch": 0.7345970392704446, "grad_norm": 5.926038086944689, "learning_rate": 3.4714355743801204e-08, "loss": 0.6769, "step": 7431 }, { "epoch": 0.734695895015199, "grad_norm": 3.0493275912835607, "learning_rate": 3.4690103675715555e-08, "loss": 0.7187, "step": 7432 }, { "epoch": 0.7347947507599535, "grad_norm": 5.966557963654479, "learning_rate": 3.466585830419636e-08, "loss": 0.844, "step": 7433 }, { "epoch": 0.734893606504708, "grad_norm": 2.527838323379806, "learning_rate": 3.464161963172968e-08, "loss": 0.7475, "step": 7434 }, { "epoch": 0.7349924622494625, "grad_norm": 2.9538887941305774, "learning_rate": 3.4617387660800775e-08, "loss": 0.5617, "step": 7435 }, { "epoch": 0.7350913179942169, "grad_norm": 3.65972430758589, "learning_rate": 3.4593162393894305e-08, "loss": 0.6494, "step": 7436 }, { "epoch": 0.7351901737389714, "grad_norm": 3.9812011265417993, "learning_rate": 3.456894383349423e-08, "loss": 0.7168, "step": 7437 }, { "epoch": 0.7352890294837259, "grad_norm": 6.408497090075123, "learning_rate": 3.4544731982083774e-08, "loss": 0.6426, "step": 7438 }, { "epoch": 0.7353878852284803, "grad_norm": 3.348056741846032, "learning_rate": 3.452052684214555e-08, "loss": 0.775, "step": 7439 }, { "epoch": 0.7354867409732349, "grad_norm": 3.351691308474385, "learning_rate": 3.44963284161614e-08, "loss": 0.8251, "step": 7440 }, { "epoch": 0.7355855967179893, "grad_norm": 4.718423311176304, "learning_rate": 3.4472136706612497e-08, "loss": 0.7707, "step": 7441 }, { "epoch": 0.7356844524627437, "grad_norm": 9.656976219531279, "learning_rate": 3.4447951715979374e-08, "loss": 0.7533, "step": 7442 }, { "epoch": 0.7357833082074982, "grad_norm": 3.649301639696371, "learning_rate": 3.442377344674187e-08, "loss": 0.6623, "step": 7443 }, { "epoch": 0.7358821639522527, "grad_norm": 5.216764482873255, "learning_rate": 3.4399601901379037e-08, "loss": 0.691, "step": 7444 }, { "epoch": 0.7359810196970071, "grad_norm": 3.7328444233246794, "learning_rate": 3.437543708236941e-08, "loss": 0.8226, "step": 7445 }, { "epoch": 0.7360798754417616, "grad_norm": 10.616242155327802, "learning_rate": 3.4351278992190635e-08, "loss": 0.7679, "step": 7446 }, { "epoch": 0.7361787311865161, "grad_norm": 4.326893341252115, "learning_rate": 3.432712763331981e-08, "loss": 0.6688, "step": 7447 }, { "epoch": 0.7362775869312705, "grad_norm": 3.402816403531725, "learning_rate": 3.430298300823334e-08, "loss": 0.8084, "step": 7448 }, { "epoch": 0.736376442676025, "grad_norm": 14.04830186478802, "learning_rate": 3.427884511940683e-08, "loss": 0.6927, "step": 7449 }, { "epoch": 0.7364752984207795, "grad_norm": 3.820032192165113, "learning_rate": 3.42547139693153e-08, "loss": 0.707, "step": 7450 }, { "epoch": 0.736574154165534, "grad_norm": 14.497299173102338, "learning_rate": 3.423058956043308e-08, "loss": 0.7596, "step": 7451 }, { "epoch": 0.7366730099102884, "grad_norm": 4.79596473455302, "learning_rate": 3.420647189523368e-08, "loss": 0.643, "step": 7452 }, { "epoch": 0.7367718656550428, "grad_norm": 15.291532850769846, "learning_rate": 3.418236097619008e-08, "loss": 0.7619, "step": 7453 }, { "epoch": 0.7368707213997974, "grad_norm": 4.535128543046381, "learning_rate": 3.415825680577451e-08, "loss": 0.6996, "step": 7454 }, { "epoch": 0.7369695771445518, "grad_norm": 4.8829535995249795, "learning_rate": 3.413415938645845e-08, "loss": 0.6912, "step": 7455 }, { "epoch": 0.7370684328893062, "grad_norm": 4.8063321433293815, "learning_rate": 3.411006872071278e-08, "loss": 0.681, "step": 7456 }, { "epoch": 0.7371672886340608, "grad_norm": 4.283904851799877, "learning_rate": 3.408598481100762e-08, "loss": 0.675, "step": 7457 }, { "epoch": 0.7372661443788152, "grad_norm": 10.732677706087673, "learning_rate": 3.406190765981237e-08, "loss": 0.7358, "step": 7458 }, { "epoch": 0.7373650001235696, "grad_norm": 3.467159315191169, "learning_rate": 3.4037837269595884e-08, "loss": 0.6777, "step": 7459 }, { "epoch": 0.7374638558683242, "grad_norm": 4.612059522722652, "learning_rate": 3.4013773642826135e-08, "loss": 0.6609, "step": 7460 }, { "epoch": 0.7375627116130786, "grad_norm": 6.4345756656274915, "learning_rate": 3.398971678197054e-08, "loss": 0.5928, "step": 7461 }, { "epoch": 0.7376615673578331, "grad_norm": 7.0267489669080945, "learning_rate": 3.39656666894958e-08, "loss": 0.7737, "step": 7462 }, { "epoch": 0.7377604231025876, "grad_norm": 3.711157073551067, "learning_rate": 3.3941623367867834e-08, "loss": 0.7443, "step": 7463 }, { "epoch": 0.737859278847342, "grad_norm": 4.039456920421458, "learning_rate": 3.3917586819551956e-08, "loss": 0.7938, "step": 7464 }, { "epoch": 0.7379581345920965, "grad_norm": 3.9185570716532485, "learning_rate": 3.38935570470128e-08, "loss": 0.7398, "step": 7465 }, { "epoch": 0.7380569903368509, "grad_norm": 3.5457364592839062, "learning_rate": 3.38695340527142e-08, "loss": 0.6958, "step": 7466 }, { "epoch": 0.7381558460816054, "grad_norm": 3.985931689054105, "learning_rate": 3.384551783911942e-08, "loss": 0.7548, "step": 7467 }, { "epoch": 0.7382547018263599, "grad_norm": 4.195000795328834, "learning_rate": 3.38215084086909e-08, "loss": 0.8091, "step": 7468 }, { "epoch": 0.7383535575711143, "grad_norm": 3.6047676468813283, "learning_rate": 3.3797505763890496e-08, "loss": 0.7372, "step": 7469 }, { "epoch": 0.7384524133158689, "grad_norm": 3.884104293650104, "learning_rate": 3.377350990717937e-08, "loss": 0.7076, "step": 7470 }, { "epoch": 0.7385512690606233, "grad_norm": 3.4601903397084315, "learning_rate": 3.374952084101784e-08, "loss": 0.6649, "step": 7471 }, { "epoch": 0.7386501248053777, "grad_norm": 3.246253025659364, "learning_rate": 3.372553856786568e-08, "loss": 0.7519, "step": 7472 }, { "epoch": 0.7387489805501323, "grad_norm": 6.48670276579539, "learning_rate": 3.370156309018197e-08, "loss": 0.7127, "step": 7473 }, { "epoch": 0.7388478362948867, "grad_norm": 3.692430004284158, "learning_rate": 3.367759441042499e-08, "loss": 0.702, "step": 7474 }, { "epoch": 0.7389466920396411, "grad_norm": 2.938270482676433, "learning_rate": 3.3653632531052355e-08, "loss": 0.6557, "step": 7475 }, { "epoch": 0.7390455477843956, "grad_norm": 6.671302130464745, "learning_rate": 3.3629677454521046e-08, "loss": 0.687, "step": 7476 }, { "epoch": 0.7391444035291501, "grad_norm": 3.4246277615526513, "learning_rate": 3.3605729183287255e-08, "loss": 0.7015, "step": 7477 }, { "epoch": 0.7392432592739046, "grad_norm": 4.843294902284229, "learning_rate": 3.3581787719806584e-08, "loss": 0.7273, "step": 7478 }, { "epoch": 0.739342115018659, "grad_norm": 4.4962345853495345, "learning_rate": 3.3557853066533814e-08, "loss": 0.6026, "step": 7479 }, { "epoch": 0.7394409707634135, "grad_norm": 5.194759417673226, "learning_rate": 3.353392522592312e-08, "loss": 0.7779, "step": 7480 }, { "epoch": 0.739539826508168, "grad_norm": 4.150761894124218, "learning_rate": 3.351000420042799e-08, "loss": 0.7449, "step": 7481 }, { "epoch": 0.7396386822529224, "grad_norm": 4.247644289883783, "learning_rate": 3.348608999250109e-08, "loss": 0.7355, "step": 7482 }, { "epoch": 0.7397375379976769, "grad_norm": 6.114185266738917, "learning_rate": 3.346218260459451e-08, "loss": 0.6938, "step": 7483 }, { "epoch": 0.7398363937424314, "grad_norm": 6.024452805623244, "learning_rate": 3.343828203915964e-08, "loss": 0.8236, "step": 7484 }, { "epoch": 0.7399352494871858, "grad_norm": 4.277147396842752, "learning_rate": 3.341438829864706e-08, "loss": 0.676, "step": 7485 }, { "epoch": 0.7400341052319402, "grad_norm": 4.358411680979199, "learning_rate": 3.339050138550675e-08, "loss": 0.7116, "step": 7486 }, { "epoch": 0.7401329609766948, "grad_norm": 3.131069958100941, "learning_rate": 3.3366621302188e-08, "loss": 0.7787, "step": 7487 }, { "epoch": 0.7402318167214492, "grad_norm": 4.219166293295592, "learning_rate": 3.334274805113929e-08, "loss": 0.6615, "step": 7488 }, { "epoch": 0.7403306724662037, "grad_norm": 11.76015559344868, "learning_rate": 3.331888163480854e-08, "loss": 0.6778, "step": 7489 }, { "epoch": 0.7404295282109582, "grad_norm": 2.940301211066793, "learning_rate": 3.329502205564283e-08, "loss": 0.6576, "step": 7490 }, { "epoch": 0.7405283839557126, "grad_norm": 5.9557824125201755, "learning_rate": 3.3271169316088666e-08, "loss": 0.6629, "step": 7491 }, { "epoch": 0.7406272397004671, "grad_norm": 7.479078663073239, "learning_rate": 3.3247323418591745e-08, "loss": 0.6543, "step": 7492 }, { "epoch": 0.7407260954452216, "grad_norm": 3.871716133540677, "learning_rate": 3.3223484365597156e-08, "loss": 0.6554, "step": 7493 }, { "epoch": 0.740824951189976, "grad_norm": 3.3047005620098164, "learning_rate": 3.31996521595492e-08, "loss": 0.7767, "step": 7494 }, { "epoch": 0.7409238069347305, "grad_norm": 4.863619856391789, "learning_rate": 3.317582680289157e-08, "loss": 0.7587, "step": 7495 }, { "epoch": 0.7410226626794849, "grad_norm": 4.656195908166571, "learning_rate": 3.315200829806715e-08, "loss": 0.7242, "step": 7496 }, { "epoch": 0.7411215184242395, "grad_norm": 5.823183182204489, "learning_rate": 3.3128196647518195e-08, "loss": 0.7826, "step": 7497 }, { "epoch": 0.7412203741689939, "grad_norm": 3.3017556920923252, "learning_rate": 3.310439185368628e-08, "loss": 0.6362, "step": 7498 }, { "epoch": 0.7413192299137483, "grad_norm": 4.996396511974805, "learning_rate": 3.308059391901217e-08, "loss": 0.7026, "step": 7499 }, { "epoch": 0.7414180856585029, "grad_norm": 3.4512218373346766, "learning_rate": 3.305680284593605e-08, "loss": 0.6185, "step": 7500 }, { "epoch": 0.7415169414032573, "grad_norm": 3.491391912611703, "learning_rate": 3.303301863689728e-08, "loss": 0.7414, "step": 7501 }, { "epoch": 0.7416157971480117, "grad_norm": 2.6936380913724474, "learning_rate": 3.300924129433462e-08, "loss": 0.6916, "step": 7502 }, { "epoch": 0.7417146528927663, "grad_norm": 2.9639088650564815, "learning_rate": 3.2985470820686125e-08, "loss": 0.6918, "step": 7503 }, { "epoch": 0.7418135086375207, "grad_norm": 3.7812181474771873, "learning_rate": 3.2961707218389026e-08, "loss": 0.6846, "step": 7504 }, { "epoch": 0.7419123643822751, "grad_norm": 3.124146906163775, "learning_rate": 3.2937950489879954e-08, "loss": 0.7388, "step": 7505 }, { "epoch": 0.7420112201270296, "grad_norm": 3.8928821379137095, "learning_rate": 3.291420063759487e-08, "loss": 0.6494, "step": 7506 }, { "epoch": 0.7421100758717841, "grad_norm": 3.244671882801789, "learning_rate": 3.289045766396889e-08, "loss": 0.7179, "step": 7507 }, { "epoch": 0.7422089316165386, "grad_norm": 5.647353298111852, "learning_rate": 3.2866721571436576e-08, "loss": 0.6881, "step": 7508 }, { "epoch": 0.742307787361293, "grad_norm": 5.0777622618652165, "learning_rate": 3.284299236243164e-08, "loss": 0.6931, "step": 7509 }, { "epoch": 0.7424066431060475, "grad_norm": 3.669598681610694, "learning_rate": 3.2819270039387235e-08, "loss": 0.672, "step": 7510 }, { "epoch": 0.742505498850802, "grad_norm": 3.8498272856680824, "learning_rate": 3.2795554604735707e-08, "loss": 0.7126, "step": 7511 }, { "epoch": 0.7426043545955564, "grad_norm": 4.974522607844476, "learning_rate": 3.277184606090869e-08, "loss": 0.7628, "step": 7512 }, { "epoch": 0.742703210340311, "grad_norm": 3.4197673591410176, "learning_rate": 3.274814441033717e-08, "loss": 0.723, "step": 7513 }, { "epoch": 0.7428020660850654, "grad_norm": 6.6841552348849556, "learning_rate": 3.2724449655451436e-08, "loss": 0.6452, "step": 7514 }, { "epoch": 0.7429009218298198, "grad_norm": 37.50461298870969, "learning_rate": 3.2700761798680974e-08, "loss": 0.677, "step": 7515 }, { "epoch": 0.7429997775745742, "grad_norm": 3.336381493861423, "learning_rate": 3.267708084245466e-08, "loss": 0.5924, "step": 7516 }, { "epoch": 0.7430986333193288, "grad_norm": 8.175023120990426, "learning_rate": 3.265340678920065e-08, "loss": 0.6593, "step": 7517 }, { "epoch": 0.7431974890640832, "grad_norm": 3.1124807729926323, "learning_rate": 3.262973964134631e-08, "loss": 0.6896, "step": 7518 }, { "epoch": 0.7432963448088377, "grad_norm": 4.715274375769304, "learning_rate": 3.2606079401318397e-08, "loss": 0.7812, "step": 7519 }, { "epoch": 0.7433952005535922, "grad_norm": 5.563966947058339, "learning_rate": 3.2582426071542934e-08, "loss": 0.6724, "step": 7520 }, { "epoch": 0.7434940562983466, "grad_norm": 3.0916158573643746, "learning_rate": 3.255877965444517e-08, "loss": 0.6848, "step": 7521 }, { "epoch": 0.7435929120431011, "grad_norm": 3.4498607062025597, "learning_rate": 3.2535140152449756e-08, "loss": 0.7962, "step": 7522 }, { "epoch": 0.7436917677878556, "grad_norm": 4.251054653977877, "learning_rate": 3.251150756798051e-08, "loss": 0.7039, "step": 7523 }, { "epoch": 0.74379062353261, "grad_norm": 3.820494400976635, "learning_rate": 3.248788190346063e-08, "loss": 0.7117, "step": 7524 }, { "epoch": 0.7438894792773645, "grad_norm": 10.158271157058255, "learning_rate": 3.2464263161312634e-08, "loss": 0.6874, "step": 7525 }, { "epoch": 0.7439883350221189, "grad_norm": 3.4339986996974763, "learning_rate": 3.244065134395818e-08, "loss": 0.6403, "step": 7526 }, { "epoch": 0.7440871907668735, "grad_norm": 6.250392913773462, "learning_rate": 3.241704645381841e-08, "loss": 0.6694, "step": 7527 }, { "epoch": 0.7441860465116279, "grad_norm": 40.069697539147015, "learning_rate": 3.239344849331359e-08, "loss": 0.7268, "step": 7528 }, { "epoch": 0.7442849022563823, "grad_norm": 4.5001561972213056, "learning_rate": 3.236985746486334e-08, "loss": 0.7044, "step": 7529 }, { "epoch": 0.7443837580011369, "grad_norm": 4.765233090866179, "learning_rate": 3.234627337088659e-08, "loss": 0.7201, "step": 7530 }, { "epoch": 0.7444826137458913, "grad_norm": 3.6030402177594008, "learning_rate": 3.232269621380157e-08, "loss": 0.7252, "step": 7531 }, { "epoch": 0.7445814694906457, "grad_norm": 2.885407503996708, "learning_rate": 3.2299125996025725e-08, "loss": 0.6703, "step": 7532 }, { "epoch": 0.7446803252354003, "grad_norm": 3.93632964298045, "learning_rate": 3.227556271997587e-08, "loss": 0.6251, "step": 7533 }, { "epoch": 0.7447791809801547, "grad_norm": 15.506032478083085, "learning_rate": 3.225200638806802e-08, "loss": 0.7991, "step": 7534 }, { "epoch": 0.7448780367249092, "grad_norm": 3.1995950453074706, "learning_rate": 3.2228457002717565e-08, "loss": 0.69, "step": 7535 }, { "epoch": 0.7449768924696637, "grad_norm": 6.094519671929715, "learning_rate": 3.220491456633918e-08, "loss": 0.7143, "step": 7536 }, { "epoch": 0.7450757482144181, "grad_norm": 4.411457028449421, "learning_rate": 3.2181379081346715e-08, "loss": 0.6915, "step": 7537 }, { "epoch": 0.7451746039591726, "grad_norm": 4.198057732484937, "learning_rate": 3.2157850550153446e-08, "loss": 0.7126, "step": 7538 }, { "epoch": 0.745273459703927, "grad_norm": 2.642248402645812, "learning_rate": 3.2134328975171874e-08, "loss": 0.6747, "step": 7539 }, { "epoch": 0.7453723154486815, "grad_norm": 3.466041020297305, "learning_rate": 3.2110814358813753e-08, "loss": 0.6811, "step": 7540 }, { "epoch": 0.745471171193436, "grad_norm": 4.225773658899789, "learning_rate": 3.208730670349019e-08, "loss": 0.6641, "step": 7541 }, { "epoch": 0.7455700269381904, "grad_norm": 5.087014704836251, "learning_rate": 3.206380601161157e-08, "loss": 0.7423, "step": 7542 }, { "epoch": 0.745668882682945, "grad_norm": 3.194945851842056, "learning_rate": 3.2040312285587476e-08, "loss": 0.6523, "step": 7543 }, { "epoch": 0.7457677384276994, "grad_norm": 6.646152331621697, "learning_rate": 3.2016825527826915e-08, "loss": 0.7354, "step": 7544 }, { "epoch": 0.7458665941724538, "grad_norm": 4.9553211840018205, "learning_rate": 3.199334574073809e-08, "loss": 0.7793, "step": 7545 }, { "epoch": 0.7459654499172084, "grad_norm": 3.649095304650544, "learning_rate": 3.196987292672844e-08, "loss": 0.7187, "step": 7546 }, { "epoch": 0.7460643056619628, "grad_norm": 4.42804683747053, "learning_rate": 3.1946407088204844e-08, "loss": 0.7069, "step": 7547 }, { "epoch": 0.7461631614067172, "grad_norm": 3.744736765242146, "learning_rate": 3.192294822757331e-08, "loss": 0.6693, "step": 7548 }, { "epoch": 0.7462620171514717, "grad_norm": 11.85350000871913, "learning_rate": 3.189949634723922e-08, "loss": 0.7628, "step": 7549 }, { "epoch": 0.7463608728962262, "grad_norm": 2.8669724743242027, "learning_rate": 3.1876051449607266e-08, "loss": 0.6829, "step": 7550 }, { "epoch": 0.7464597286409806, "grad_norm": 2.9606963134347577, "learning_rate": 3.185261353708131e-08, "loss": 0.6824, "step": 7551 }, { "epoch": 0.7465585843857351, "grad_norm": 4.811613672546457, "learning_rate": 3.182918261206458e-08, "loss": 0.7793, "step": 7552 }, { "epoch": 0.7466574401304896, "grad_norm": 10.425931725719128, "learning_rate": 3.180575867695961e-08, "loss": 0.6692, "step": 7553 }, { "epoch": 0.7467562958752441, "grad_norm": 4.944996106518451, "learning_rate": 3.1782341734168136e-08, "loss": 0.7202, "step": 7554 }, { "epoch": 0.7468551516199985, "grad_norm": 3.7307109629518727, "learning_rate": 3.175893178609126e-08, "loss": 0.7451, "step": 7555 }, { "epoch": 0.746954007364753, "grad_norm": 5.704717656699929, "learning_rate": 3.173552883512928e-08, "loss": 0.6037, "step": 7556 }, { "epoch": 0.7470528631095075, "grad_norm": 3.5100716183645404, "learning_rate": 3.171213288368183e-08, "loss": 0.6567, "step": 7557 }, { "epoch": 0.7471517188542619, "grad_norm": 4.211413326169119, "learning_rate": 3.168874393414788e-08, "loss": 0.7525, "step": 7558 }, { "epoch": 0.7472505745990163, "grad_norm": 3.2815990606841803, "learning_rate": 3.166536198892554e-08, "loss": 0.7684, "step": 7559 }, { "epoch": 0.7473494303437709, "grad_norm": 4.549860088121821, "learning_rate": 3.164198705041232e-08, "loss": 0.7612, "step": 7560 }, { "epoch": 0.7474482860885253, "grad_norm": 5.013148713135209, "learning_rate": 3.161861912100501e-08, "loss": 0.6231, "step": 7561 }, { "epoch": 0.7475471418332797, "grad_norm": 10.557824861092357, "learning_rate": 3.159525820309962e-08, "loss": 0.7189, "step": 7562 }, { "epoch": 0.7476459975780343, "grad_norm": 4.547605941498288, "learning_rate": 3.157190429909141e-08, "loss": 0.651, "step": 7563 }, { "epoch": 0.7477448533227887, "grad_norm": 5.051960179199116, "learning_rate": 3.154855741137508e-08, "loss": 0.579, "step": 7564 }, { "epoch": 0.7478437090675432, "grad_norm": 8.816172932591353, "learning_rate": 3.1525217542344415e-08, "loss": 0.7034, "step": 7565 }, { "epoch": 0.7479425648122977, "grad_norm": 3.5677867270234063, "learning_rate": 3.1501884694392656e-08, "loss": 0.7279, "step": 7566 }, { "epoch": 0.7480414205570521, "grad_norm": 4.26761128960305, "learning_rate": 3.1478558869912166e-08, "loss": 0.7695, "step": 7567 }, { "epoch": 0.7481402763018066, "grad_norm": 6.388390131092705, "learning_rate": 3.145524007129471e-08, "loss": 0.7196, "step": 7568 }, { "epoch": 0.748239132046561, "grad_norm": 3.8785166414070575, "learning_rate": 3.14319283009313e-08, "loss": 0.7601, "step": 7569 }, { "epoch": 0.7483379877913156, "grad_norm": 4.298214663975996, "learning_rate": 3.140862356121216e-08, "loss": 0.7499, "step": 7570 }, { "epoch": 0.74843684353607, "grad_norm": 6.564641383143187, "learning_rate": 3.1385325854526886e-08, "loss": 0.7464, "step": 7571 }, { "epoch": 0.7485356992808244, "grad_norm": 4.861326696109598, "learning_rate": 3.1362035183264334e-08, "loss": 0.7472, "step": 7572 }, { "epoch": 0.748634555025579, "grad_norm": 5.177711266123443, "learning_rate": 3.133875154981257e-08, "loss": 0.6286, "step": 7573 }, { "epoch": 0.7487334107703334, "grad_norm": 4.927023145411169, "learning_rate": 3.1315474956559e-08, "loss": 0.7271, "step": 7574 }, { "epoch": 0.7488322665150878, "grad_norm": 3.102062848522213, "learning_rate": 3.1292205405890336e-08, "loss": 0.6931, "step": 7575 }, { "epoch": 0.7489311222598424, "grad_norm": 5.25024901896606, "learning_rate": 3.126894290019247e-08, "loss": 0.794, "step": 7576 }, { "epoch": 0.7490299780045968, "grad_norm": 4.330657898888768, "learning_rate": 3.124568744185069e-08, "loss": 0.7135, "step": 7577 }, { "epoch": 0.7491288337493512, "grad_norm": 8.417444250963834, "learning_rate": 3.1222439033249436e-08, "loss": 0.7461, "step": 7578 }, { "epoch": 0.7492276894941057, "grad_norm": 4.110628120461992, "learning_rate": 3.1199197676772536e-08, "loss": 0.7517, "step": 7579 }, { "epoch": 0.7493265452388602, "grad_norm": 3.5259042857512846, "learning_rate": 3.117596337480302e-08, "loss": 0.8562, "step": 7580 }, { "epoch": 0.7494254009836147, "grad_norm": 3.8598780458136197, "learning_rate": 3.1152736129723244e-08, "loss": 0.6969, "step": 7581 }, { "epoch": 0.7495242567283691, "grad_norm": 9.601406180114708, "learning_rate": 3.1129515943914785e-08, "loss": 0.7234, "step": 7582 }, { "epoch": 0.7496231124731236, "grad_norm": 3.701599179813755, "learning_rate": 3.110630281975859e-08, "loss": 0.7948, "step": 7583 }, { "epoch": 0.7497219682178781, "grad_norm": 4.7212248590396255, "learning_rate": 3.108309675963476e-08, "loss": 0.712, "step": 7584 }, { "epoch": 0.7498208239626325, "grad_norm": 3.160289454822699, "learning_rate": 3.105989776592276e-08, "loss": 0.6953, "step": 7585 }, { "epoch": 0.749919679707387, "grad_norm": 6.448333417557734, "learning_rate": 3.103670584100133e-08, "loss": 0.678, "step": 7586 }, { "epoch": 0.7500185354521415, "grad_norm": 19.64414399660531, "learning_rate": 3.101352098724841e-08, "loss": 0.7792, "step": 7587 }, { "epoch": 0.7501173911968959, "grad_norm": 3.8198719607161125, "learning_rate": 3.099034320704132e-08, "loss": 0.7195, "step": 7588 }, { "epoch": 0.7502162469416503, "grad_norm": 4.54128178353477, "learning_rate": 3.0967172502756535e-08, "loss": 0.583, "step": 7589 }, { "epoch": 0.7503151026864049, "grad_norm": 3.8614357225200626, "learning_rate": 3.094400887676991e-08, "loss": 0.8195, "step": 7590 }, { "epoch": 0.7504139584311593, "grad_norm": 5.997000531300967, "learning_rate": 3.092085233145655e-08, "loss": 0.728, "step": 7591 }, { "epoch": 0.7505128141759138, "grad_norm": 3.5223468355689542, "learning_rate": 3.089770286919077e-08, "loss": 0.7709, "step": 7592 }, { "epoch": 0.7506116699206683, "grad_norm": 3.7654595803767754, "learning_rate": 3.0874560492346234e-08, "loss": 0.6903, "step": 7593 }, { "epoch": 0.7507105256654227, "grad_norm": 5.49042060768461, "learning_rate": 3.0851425203295864e-08, "loss": 0.748, "step": 7594 }, { "epoch": 0.7508093814101772, "grad_norm": 6.357842269584003, "learning_rate": 3.0828297004411806e-08, "loss": 0.6789, "step": 7595 }, { "epoch": 0.7509082371549317, "grad_norm": 2.762026000679715, "learning_rate": 3.0805175898065574e-08, "loss": 0.5489, "step": 7596 }, { "epoch": 0.7510070928996861, "grad_norm": 2.7650324838509266, "learning_rate": 3.078206188662782e-08, "loss": 0.7846, "step": 7597 }, { "epoch": 0.7511059486444406, "grad_norm": 6.731361765768484, "learning_rate": 3.075895497246861e-08, "loss": 0.676, "step": 7598 }, { "epoch": 0.751204804389195, "grad_norm": 3.252938789585685, "learning_rate": 3.0735855157957194e-08, "loss": 0.6772, "step": 7599 }, { "epoch": 0.7513036601339496, "grad_norm": 3.8466541096028264, "learning_rate": 3.071276244546208e-08, "loss": 0.7268, "step": 7600 }, { "epoch": 0.751402515878704, "grad_norm": 8.876294300129304, "learning_rate": 3.068967683735112e-08, "loss": 0.768, "step": 7601 }, { "epoch": 0.7515013716234584, "grad_norm": 5.133172219064097, "learning_rate": 3.066659833599143e-08, "loss": 0.7081, "step": 7602 }, { "epoch": 0.751600227368213, "grad_norm": 3.6446865683356964, "learning_rate": 3.064352694374932e-08, "loss": 0.714, "step": 7603 }, { "epoch": 0.7516990831129674, "grad_norm": 6.7785281047169175, "learning_rate": 3.062046266299043e-08, "loss": 0.7539, "step": 7604 }, { "epoch": 0.7517979388577218, "grad_norm": 5.226082016453731, "learning_rate": 3.059740549607971e-08, "loss": 0.7072, "step": 7605 }, { "epoch": 0.7518967946024764, "grad_norm": 4.428488219731953, "learning_rate": 3.057435544538126e-08, "loss": 0.6951, "step": 7606 }, { "epoch": 0.7519956503472308, "grad_norm": 4.0093504163253355, "learning_rate": 3.055131251325856e-08, "loss": 0.8544, "step": 7607 }, { "epoch": 0.7520945060919852, "grad_norm": 3.144548631767095, "learning_rate": 3.0528276702074365e-08, "loss": 0.7219, "step": 7608 }, { "epoch": 0.7521933618367397, "grad_norm": 21.654758165657338, "learning_rate": 3.0505248014190575e-08, "loss": 0.6313, "step": 7609 }, { "epoch": 0.7522922175814942, "grad_norm": 4.854278205297361, "learning_rate": 3.048222645196851e-08, "loss": 0.7703, "step": 7610 }, { "epoch": 0.7523910733262487, "grad_norm": 4.4911386655841845, "learning_rate": 3.045921201776863e-08, "loss": 0.7155, "step": 7611 }, { "epoch": 0.7524899290710031, "grad_norm": 3.9972892338599864, "learning_rate": 3.0436204713950764e-08, "loss": 0.5707, "step": 7612 }, { "epoch": 0.7525887848157576, "grad_norm": 3.1527256262220815, "learning_rate": 3.041320454287399e-08, "loss": 0.7749, "step": 7613 }, { "epoch": 0.7526876405605121, "grad_norm": 4.281559184225426, "learning_rate": 3.039021150689659e-08, "loss": 0.5895, "step": 7614 }, { "epoch": 0.7527864963052665, "grad_norm": 4.513850337351555, "learning_rate": 3.036722560837621e-08, "loss": 0.7441, "step": 7615 }, { "epoch": 0.752885352050021, "grad_norm": 3.5447537714549884, "learning_rate": 3.03442468496697e-08, "loss": 0.6761, "step": 7616 }, { "epoch": 0.7529842077947755, "grad_norm": 5.981874218367335, "learning_rate": 3.0321275233133135e-08, "loss": 0.7139, "step": 7617 }, { "epoch": 0.7530830635395299, "grad_norm": 4.053591214227134, "learning_rate": 3.029831076112196e-08, "loss": 0.6669, "step": 7618 }, { "epoch": 0.7531819192842845, "grad_norm": 4.291712301310414, "learning_rate": 3.027535343599088e-08, "loss": 0.6544, "step": 7619 }, { "epoch": 0.7532807750290389, "grad_norm": 8.08695496618772, "learning_rate": 3.025240326009377e-08, "loss": 0.6541, "step": 7620 }, { "epoch": 0.7533796307737933, "grad_norm": 4.578253429428077, "learning_rate": 3.0229460235783876e-08, "loss": 0.7135, "step": 7621 }, { "epoch": 0.7534784865185478, "grad_norm": 3.300532165733539, "learning_rate": 3.020652436541362e-08, "loss": 0.6129, "step": 7622 }, { "epoch": 0.7535773422633023, "grad_norm": 4.632658334756227, "learning_rate": 3.018359565133477e-08, "loss": 0.7979, "step": 7623 }, { "epoch": 0.7536761980080567, "grad_norm": 3.822684754234831, "learning_rate": 3.0160674095898354e-08, "loss": 0.7521, "step": 7624 }, { "epoch": 0.7537750537528112, "grad_norm": 5.332499679257379, "learning_rate": 3.013775970145457e-08, "loss": 0.8523, "step": 7625 }, { "epoch": 0.7538739094975657, "grad_norm": 7.9448143301130285, "learning_rate": 3.011485247035298e-08, "loss": 0.7366, "step": 7626 }, { "epoch": 0.7539727652423202, "grad_norm": 2.9517272271484947, "learning_rate": 3.009195240494243e-08, "loss": 0.6801, "step": 7627 }, { "epoch": 0.7540716209870746, "grad_norm": 13.574500234877924, "learning_rate": 3.006905950757091e-08, "loss": 0.7818, "step": 7628 }, { "epoch": 0.7541704767318291, "grad_norm": 4.749344133539396, "learning_rate": 3.0046173780585795e-08, "loss": 0.7214, "step": 7629 }, { "epoch": 0.7542693324765836, "grad_norm": 4.819825525615457, "learning_rate": 3.002329522633369e-08, "loss": 0.805, "step": 7630 }, { "epoch": 0.754368188221338, "grad_norm": 3.172837068096982, "learning_rate": 3.0000423847160396e-08, "loss": 0.8271, "step": 7631 }, { "epoch": 0.7544670439660924, "grad_norm": 3.9827485512455967, "learning_rate": 2.997755964541111e-08, "loss": 0.7309, "step": 7632 }, { "epoch": 0.754565899710847, "grad_norm": 11.141780044594084, "learning_rate": 2.9954702623430176e-08, "loss": 0.661, "step": 7633 }, { "epoch": 0.7546647554556014, "grad_norm": 14.177583971520994, "learning_rate": 2.993185278356121e-08, "loss": 0.6946, "step": 7634 }, { "epoch": 0.7547636112003558, "grad_norm": 6.964024633322112, "learning_rate": 2.9909010128147194e-08, "loss": 0.685, "step": 7635 }, { "epoch": 0.7548624669451104, "grad_norm": 3.5164858403286448, "learning_rate": 2.988617465953025e-08, "loss": 0.6585, "step": 7636 }, { "epoch": 0.7549613226898648, "grad_norm": 4.6922376332229305, "learning_rate": 2.986334638005184e-08, "loss": 0.5989, "step": 7637 }, { "epoch": 0.7550601784346193, "grad_norm": 3.1940524416039793, "learning_rate": 2.98405252920527e-08, "loss": 0.6475, "step": 7638 }, { "epoch": 0.7551590341793738, "grad_norm": 5.863109394148172, "learning_rate": 2.9817711397872736e-08, "loss": 0.6708, "step": 7639 }, { "epoch": 0.7552578899241282, "grad_norm": 6.466243476237676, "learning_rate": 2.9794904699851208e-08, "loss": 0.7649, "step": 7640 }, { "epoch": 0.7553567456688827, "grad_norm": 17.631525824941374, "learning_rate": 2.9772105200326634e-08, "loss": 0.6843, "step": 7641 }, { "epoch": 0.7554556014136371, "grad_norm": 3.0845093219429778, "learning_rate": 2.9749312901636702e-08, "loss": 0.7494, "step": 7642 }, { "epoch": 0.7555544571583916, "grad_norm": 13.56661224639578, "learning_rate": 2.9726527806118507e-08, "loss": 0.7162, "step": 7643 }, { "epoch": 0.7556533129031461, "grad_norm": 4.098600995888041, "learning_rate": 2.9703749916108235e-08, "loss": 0.6318, "step": 7644 }, { "epoch": 0.7557521686479005, "grad_norm": 3.161626137802655, "learning_rate": 2.9680979233941473e-08, "loss": 0.6732, "step": 7645 }, { "epoch": 0.7558510243926551, "grad_norm": 3.401617237544605, "learning_rate": 2.9658215761953043e-08, "loss": 0.7721, "step": 7646 }, { "epoch": 0.7559498801374095, "grad_norm": 5.497506938042624, "learning_rate": 2.9635459502476932e-08, "loss": 0.6904, "step": 7647 }, { "epoch": 0.7560487358821639, "grad_norm": 8.079116392493239, "learning_rate": 2.9612710457846536e-08, "loss": 0.7648, "step": 7648 }, { "epoch": 0.7561475916269185, "grad_norm": 3.2601357850847954, "learning_rate": 2.958996863039437e-08, "loss": 0.7132, "step": 7649 }, { "epoch": 0.7562464473716729, "grad_norm": 3.938109496194402, "learning_rate": 2.9567234022452327e-08, "loss": 0.7422, "step": 7650 }, { "epoch": 0.7563453031164273, "grad_norm": 3.0883033466711893, "learning_rate": 2.954450663635145e-08, "loss": 0.6978, "step": 7651 }, { "epoch": 0.7564441588611818, "grad_norm": 6.157603532709804, "learning_rate": 2.952178647442215e-08, "loss": 0.752, "step": 7652 }, { "epoch": 0.7565430146059363, "grad_norm": 6.392721749118564, "learning_rate": 2.949907353899399e-08, "loss": 0.786, "step": 7653 }, { "epoch": 0.7566418703506907, "grad_norm": 4.533814007253662, "learning_rate": 2.947636783239592e-08, "loss": 0.5945, "step": 7654 }, { "epoch": 0.7567407260954452, "grad_norm": 3.8136378819051746, "learning_rate": 2.945366935695599e-08, "loss": 0.7114, "step": 7655 }, { "epoch": 0.7568395818401997, "grad_norm": 4.931013072461832, "learning_rate": 2.9430978115001637e-08, "loss": 0.7068, "step": 7656 }, { "epoch": 0.7569384375849542, "grad_norm": 3.398335400953236, "learning_rate": 2.9408294108859545e-08, "loss": 0.6603, "step": 7657 }, { "epoch": 0.7570372933297086, "grad_norm": 4.651788099368189, "learning_rate": 2.9385617340855564e-08, "loss": 0.6241, "step": 7658 }, { "epoch": 0.7571361490744631, "grad_norm": 8.723901212436036, "learning_rate": 2.9362947813314888e-08, "loss": 0.7441, "step": 7659 }, { "epoch": 0.7572350048192176, "grad_norm": 4.495480156561079, "learning_rate": 2.934028552856197e-08, "loss": 0.7671, "step": 7660 }, { "epoch": 0.757333860563972, "grad_norm": 3.4667212737553226, "learning_rate": 2.931763048892043e-08, "loss": 0.7624, "step": 7661 }, { "epoch": 0.7574327163087264, "grad_norm": 14.171305990381178, "learning_rate": 2.9294982696713255e-08, "loss": 0.7268, "step": 7662 }, { "epoch": 0.757531572053481, "grad_norm": 3.9784887737511805, "learning_rate": 2.9272342154262665e-08, "loss": 0.7863, "step": 7663 }, { "epoch": 0.7576304277982354, "grad_norm": 5.041139432814118, "learning_rate": 2.9249708863890045e-08, "loss": 0.7224, "step": 7664 }, { "epoch": 0.7577292835429899, "grad_norm": 3.7086496032860947, "learning_rate": 2.9227082827916172e-08, "loss": 0.7458, "step": 7665 }, { "epoch": 0.7578281392877444, "grad_norm": 3.4328869468934315, "learning_rate": 2.9204464048660948e-08, "loss": 0.7127, "step": 7666 }, { "epoch": 0.7579269950324988, "grad_norm": 4.413863503587194, "learning_rate": 2.918185252844366e-08, "loss": 0.7592, "step": 7667 }, { "epoch": 0.7580258507772533, "grad_norm": 4.260350919773249, "learning_rate": 2.9159248269582727e-08, "loss": 0.6613, "step": 7668 }, { "epoch": 0.7581247065220078, "grad_norm": 3.695428028317312, "learning_rate": 2.913665127439594e-08, "loss": 0.7319, "step": 7669 }, { "epoch": 0.7582235622667622, "grad_norm": 3.2041656363461577, "learning_rate": 2.911406154520022e-08, "loss": 0.6522, "step": 7670 }, { "epoch": 0.7583224180115167, "grad_norm": 4.058804338982078, "learning_rate": 2.909147908431189e-08, "loss": 0.6691, "step": 7671 }, { "epoch": 0.7584212737562711, "grad_norm": 4.646514625418928, "learning_rate": 2.906890389404636e-08, "loss": 0.7118, "step": 7672 }, { "epoch": 0.7585201295010257, "grad_norm": 4.2605858989752035, "learning_rate": 2.9046335976718438e-08, "loss": 0.6933, "step": 7673 }, { "epoch": 0.7586189852457801, "grad_norm": 3.970395726046271, "learning_rate": 2.9023775334642154e-08, "loss": 0.6264, "step": 7674 }, { "epoch": 0.7587178409905345, "grad_norm": 7.0836151930196465, "learning_rate": 2.9001221970130697e-08, "loss": 0.742, "step": 7675 }, { "epoch": 0.7588166967352891, "grad_norm": 10.376454545053726, "learning_rate": 2.8978675885496627e-08, "loss": 0.7233, "step": 7676 }, { "epoch": 0.7589155524800435, "grad_norm": 7.494289143163814, "learning_rate": 2.8956137083051735e-08, "loss": 0.6788, "step": 7677 }, { "epoch": 0.7590144082247979, "grad_norm": 3.2950742928235286, "learning_rate": 2.893360556510699e-08, "loss": 0.716, "step": 7678 }, { "epoch": 0.7591132639695525, "grad_norm": 9.203779922109412, "learning_rate": 2.891108133397271e-08, "loss": 0.7455, "step": 7679 }, { "epoch": 0.7592121197143069, "grad_norm": 2.8546459484307123, "learning_rate": 2.888856439195838e-08, "loss": 0.6707, "step": 7680 }, { "epoch": 0.7593109754590613, "grad_norm": 5.634179656389038, "learning_rate": 2.8866054741372803e-08, "loss": 0.8216, "step": 7681 }, { "epoch": 0.7594098312038158, "grad_norm": 4.339289165860522, "learning_rate": 2.884355238452405e-08, "loss": 0.5192, "step": 7682 }, { "epoch": 0.7595086869485703, "grad_norm": 3.3165759561772665, "learning_rate": 2.8821057323719322e-08, "loss": 0.7386, "step": 7683 }, { "epoch": 0.7596075426933248, "grad_norm": 2.9066172590634536, "learning_rate": 2.879856956126525e-08, "loss": 0.7795, "step": 7684 }, { "epoch": 0.7597063984380792, "grad_norm": 5.719469561115347, "learning_rate": 2.8776089099467537e-08, "loss": 0.7622, "step": 7685 }, { "epoch": 0.7598052541828337, "grad_norm": 3.4361212754804447, "learning_rate": 2.8753615940631293e-08, "loss": 0.7718, "step": 7686 }, { "epoch": 0.7599041099275882, "grad_norm": 3.854583665844336, "learning_rate": 2.873115008706074e-08, "loss": 0.7329, "step": 7687 }, { "epoch": 0.7600029656723426, "grad_norm": 5.670550314712351, "learning_rate": 2.8708691541059494e-08, "loss": 0.7531, "step": 7688 }, { "epoch": 0.7601018214170971, "grad_norm": 3.768510966317342, "learning_rate": 2.8686240304930266e-08, "loss": 0.7482, "step": 7689 }, { "epoch": 0.7602006771618516, "grad_norm": 4.165613402830636, "learning_rate": 2.8663796380975192e-08, "loss": 0.7258, "step": 7690 }, { "epoch": 0.760299532906606, "grad_norm": 4.886643853622551, "learning_rate": 2.8641359771495466e-08, "loss": 0.7764, "step": 7691 }, { "epoch": 0.7603983886513606, "grad_norm": 3.470613586958174, "learning_rate": 2.861893047879169e-08, "loss": 0.787, "step": 7692 }, { "epoch": 0.760497244396115, "grad_norm": 5.013998272962857, "learning_rate": 2.8596508505163687e-08, "loss": 0.7366, "step": 7693 }, { "epoch": 0.7605961001408694, "grad_norm": 5.866121311604234, "learning_rate": 2.8574093852910418e-08, "loss": 0.7451, "step": 7694 }, { "epoch": 0.7606949558856239, "grad_norm": 4.099549992024438, "learning_rate": 2.855168652433023e-08, "loss": 0.7777, "step": 7695 }, { "epoch": 0.7607938116303784, "grad_norm": 3.786202661499008, "learning_rate": 2.8529286521720675e-08, "loss": 0.8161, "step": 7696 }, { "epoch": 0.7608926673751328, "grad_norm": 3.638618476916516, "learning_rate": 2.8506893847378488e-08, "loss": 0.6963, "step": 7697 }, { "epoch": 0.7609915231198873, "grad_norm": 3.260468197090366, "learning_rate": 2.8484508503599748e-08, "loss": 0.6518, "step": 7698 }, { "epoch": 0.7610903788646418, "grad_norm": 3.9428210456253376, "learning_rate": 2.846213049267976e-08, "loss": 0.7618, "step": 7699 }, { "epoch": 0.7611892346093962, "grad_norm": 4.729381769570667, "learning_rate": 2.8439759816913e-08, "loss": 0.592, "step": 7700 }, { "epoch": 0.7612880903541507, "grad_norm": 6.319555541027446, "learning_rate": 2.8417396478593315e-08, "loss": 0.6484, "step": 7701 }, { "epoch": 0.7613869460989052, "grad_norm": 5.42687326584172, "learning_rate": 2.83950404800137e-08, "loss": 0.6854, "step": 7702 }, { "epoch": 0.7614858018436597, "grad_norm": 6.967365256549571, "learning_rate": 2.837269182346641e-08, "loss": 0.7738, "step": 7703 }, { "epoch": 0.7615846575884141, "grad_norm": 4.149131907356359, "learning_rate": 2.8350350511243016e-08, "loss": 0.6562, "step": 7704 }, { "epoch": 0.7616835133331685, "grad_norm": 4.822271234033115, "learning_rate": 2.832801654563425e-08, "loss": 0.7774, "step": 7705 }, { "epoch": 0.7617823690779231, "grad_norm": 4.9508556882021155, "learning_rate": 2.8305689928930153e-08, "loss": 0.7201, "step": 7706 }, { "epoch": 0.7618812248226775, "grad_norm": 3.493834358485, "learning_rate": 2.8283370663420015e-08, "loss": 0.6808, "step": 7707 }, { "epoch": 0.7619800805674319, "grad_norm": 3.316985897985715, "learning_rate": 2.8261058751392286e-08, "loss": 0.6998, "step": 7708 }, { "epoch": 0.7620789363121865, "grad_norm": 3.643452157574719, "learning_rate": 2.8238754195134762e-08, "loss": 0.7914, "step": 7709 }, { "epoch": 0.7621777920569409, "grad_norm": 4.771487245857684, "learning_rate": 2.8216456996934478e-08, "loss": 0.713, "step": 7710 }, { "epoch": 0.7622766478016954, "grad_norm": 26.200209005438687, "learning_rate": 2.819416715907761e-08, "loss": 0.7206, "step": 7711 }, { "epoch": 0.7623755035464499, "grad_norm": 4.13997449967982, "learning_rate": 2.8171884683849722e-08, "loss": 0.7292, "step": 7712 }, { "epoch": 0.7624743592912043, "grad_norm": 3.095377883871744, "learning_rate": 2.8149609573535493e-08, "loss": 0.6719, "step": 7713 }, { "epoch": 0.7625732150359588, "grad_norm": 3.1089972088474718, "learning_rate": 2.8127341830418926e-08, "loss": 0.6477, "step": 7714 }, { "epoch": 0.7626720707807132, "grad_norm": 8.412194103305481, "learning_rate": 2.8105081456783297e-08, "loss": 0.6128, "step": 7715 }, { "epoch": 0.7627709265254677, "grad_norm": 3.984004545033115, "learning_rate": 2.808282845491101e-08, "loss": 0.6586, "step": 7716 }, { "epoch": 0.7628697822702222, "grad_norm": 3.3132265995412555, "learning_rate": 2.806058282708381e-08, "loss": 0.7717, "step": 7717 }, { "epoch": 0.7629686380149766, "grad_norm": 5.061407620579918, "learning_rate": 2.8038344575582684e-08, "loss": 0.6649, "step": 7718 }, { "epoch": 0.7630674937597312, "grad_norm": 3.3754566312456973, "learning_rate": 2.8016113702687817e-08, "loss": 0.8219, "step": 7719 }, { "epoch": 0.7631663495044856, "grad_norm": 3.598952798592271, "learning_rate": 2.7993890210678628e-08, "loss": 0.7785, "step": 7720 }, { "epoch": 0.76326520524924, "grad_norm": 10.817164405416433, "learning_rate": 2.7971674101833854e-08, "loss": 0.7852, "step": 7721 }, { "epoch": 0.7633640609939946, "grad_norm": 12.316263732399053, "learning_rate": 2.7949465378431382e-08, "loss": 0.6152, "step": 7722 }, { "epoch": 0.763462916738749, "grad_norm": 15.199118233567981, "learning_rate": 2.7927264042748443e-08, "loss": 0.7112, "step": 7723 }, { "epoch": 0.7635617724835034, "grad_norm": 4.061778410765547, "learning_rate": 2.7905070097061413e-08, "loss": 0.6377, "step": 7724 }, { "epoch": 0.7636606282282579, "grad_norm": 11.745589284977505, "learning_rate": 2.7882883543645953e-08, "loss": 0.7218, "step": 7725 }, { "epoch": 0.7637594839730124, "grad_norm": 3.477080117477137, "learning_rate": 2.7860704384777023e-08, "loss": 0.7437, "step": 7726 }, { "epoch": 0.7638583397177668, "grad_norm": 4.552923198138568, "learning_rate": 2.7838532622728682e-08, "loss": 0.7012, "step": 7727 }, { "epoch": 0.7639571954625213, "grad_norm": 3.8808959990857024, "learning_rate": 2.781636825977438e-08, "loss": 0.669, "step": 7728 }, { "epoch": 0.7640560512072758, "grad_norm": 5.650939051106397, "learning_rate": 2.7794211298186754e-08, "loss": 0.6271, "step": 7729 }, { "epoch": 0.7641549069520303, "grad_norm": 4.180632579747479, "learning_rate": 2.777206174023763e-08, "loss": 0.6882, "step": 7730 }, { "epoch": 0.7642537626967847, "grad_norm": 3.0241712401143235, "learning_rate": 2.7749919588198122e-08, "loss": 0.6835, "step": 7731 }, { "epoch": 0.7643526184415392, "grad_norm": 3.993316534072559, "learning_rate": 2.772778484433863e-08, "loss": 0.798, "step": 7732 }, { "epoch": 0.7644514741862937, "grad_norm": 6.436699875066889, "learning_rate": 2.7705657510928692e-08, "loss": 0.6836, "step": 7733 }, { "epoch": 0.7645503299310481, "grad_norm": 16.933401219670642, "learning_rate": 2.7683537590237184e-08, "loss": 0.6698, "step": 7734 }, { "epoch": 0.7646491856758025, "grad_norm": 3.8586673930188837, "learning_rate": 2.766142508453212e-08, "loss": 0.6222, "step": 7735 }, { "epoch": 0.7647480414205571, "grad_norm": 5.91160829565221, "learning_rate": 2.7639319996080868e-08, "loss": 0.6491, "step": 7736 }, { "epoch": 0.7648468971653115, "grad_norm": 11.119945612413623, "learning_rate": 2.7617222327149935e-08, "loss": 0.6415, "step": 7737 }, { "epoch": 0.764945752910066, "grad_norm": 4.704230677614593, "learning_rate": 2.759513208000517e-08, "loss": 0.7527, "step": 7738 }, { "epoch": 0.7650446086548205, "grad_norm": 3.5558337515050247, "learning_rate": 2.757304925691152e-08, "loss": 0.6404, "step": 7739 }, { "epoch": 0.7651434643995749, "grad_norm": 4.0060590444869995, "learning_rate": 2.755097386013332e-08, "loss": 0.5807, "step": 7740 }, { "epoch": 0.7652423201443294, "grad_norm": 5.81216463833762, "learning_rate": 2.752890589193404e-08, "loss": 0.5835, "step": 7741 }, { "epoch": 0.7653411758890839, "grad_norm": 4.7544752237139, "learning_rate": 2.7506845354576425e-08, "loss": 0.697, "step": 7742 }, { "epoch": 0.7654400316338383, "grad_norm": 3.4941906818371975, "learning_rate": 2.7484792250322496e-08, "loss": 0.8299, "step": 7743 }, { "epoch": 0.7655388873785928, "grad_norm": 13.917772410335527, "learning_rate": 2.7462746581433426e-08, "loss": 0.7745, "step": 7744 }, { "epoch": 0.7656377431233472, "grad_norm": 3.249852155137751, "learning_rate": 2.7440708350169727e-08, "loss": 0.6978, "step": 7745 }, { "epoch": 0.7657365988681017, "grad_norm": 11.314630652254577, "learning_rate": 2.7418677558791025e-08, "loss": 0.6757, "step": 7746 }, { "epoch": 0.7658354546128562, "grad_norm": 13.68896732251147, "learning_rate": 2.739665420955629e-08, "loss": 0.7617, "step": 7747 }, { "epoch": 0.7659343103576106, "grad_norm": 3.4474684717881168, "learning_rate": 2.7374638304723728e-08, "loss": 0.7689, "step": 7748 }, { "epoch": 0.7660331661023652, "grad_norm": 4.263329517579261, "learning_rate": 2.735262984655067e-08, "loss": 0.8261, "step": 7749 }, { "epoch": 0.7661320218471196, "grad_norm": 2.775954664006627, "learning_rate": 2.73306288372938e-08, "loss": 0.6442, "step": 7750 }, { "epoch": 0.766230877591874, "grad_norm": 5.419677290077693, "learning_rate": 2.7308635279209025e-08, "loss": 0.7991, "step": 7751 }, { "epoch": 0.7663297333366286, "grad_norm": 3.798322933612664, "learning_rate": 2.72866491745514e-08, "loss": 0.6729, "step": 7752 }, { "epoch": 0.766428589081383, "grad_norm": 5.185541687952851, "learning_rate": 2.7264670525575328e-08, "loss": 0.7384, "step": 7753 }, { "epoch": 0.7665274448261374, "grad_norm": 3.1205196068877954, "learning_rate": 2.7242699334534347e-08, "loss": 0.6821, "step": 7754 }, { "epoch": 0.7666263005708919, "grad_norm": 4.056640492876918, "learning_rate": 2.7220735603681332e-08, "loss": 0.5666, "step": 7755 }, { "epoch": 0.7667251563156464, "grad_norm": 22.334397871861015, "learning_rate": 2.719877933526832e-08, "loss": 0.5935, "step": 7756 }, { "epoch": 0.7668240120604009, "grad_norm": 5.949645476294345, "learning_rate": 2.7176830531546557e-08, "loss": 0.6794, "step": 7757 }, { "epoch": 0.7669228678051553, "grad_norm": 3.9239446342427033, "learning_rate": 2.7154889194766595e-08, "loss": 0.6975, "step": 7758 }, { "epoch": 0.7670217235499098, "grad_norm": 5.492743512117824, "learning_rate": 2.7132955327178253e-08, "loss": 0.7964, "step": 7759 }, { "epoch": 0.7671205792946643, "grad_norm": 3.8698749328447675, "learning_rate": 2.7111028931030435e-08, "loss": 0.8151, "step": 7760 }, { "epoch": 0.7672194350394187, "grad_norm": 3.362204455027206, "learning_rate": 2.7089110008571415e-08, "loss": 0.6523, "step": 7761 }, { "epoch": 0.7673182907841732, "grad_norm": 10.792713286716948, "learning_rate": 2.7067198562048686e-08, "loss": 0.7384, "step": 7762 }, { "epoch": 0.7674171465289277, "grad_norm": 4.352181853603256, "learning_rate": 2.7045294593708877e-08, "loss": 0.719, "step": 7763 }, { "epoch": 0.7675160022736821, "grad_norm": 4.010174820326327, "learning_rate": 2.7023398105797956e-08, "loss": 0.7616, "step": 7764 }, { "epoch": 0.7676148580184365, "grad_norm": 3.1718757532126567, "learning_rate": 2.7001509100561103e-08, "loss": 0.6704, "step": 7765 }, { "epoch": 0.7677137137631911, "grad_norm": 3.899917485517286, "learning_rate": 2.697962758024266e-08, "loss": 0.7817, "step": 7766 }, { "epoch": 0.7678125695079455, "grad_norm": 6.5566046730775245, "learning_rate": 2.6957753547086305e-08, "loss": 0.6248, "step": 7767 }, { "epoch": 0.7679114252527, "grad_norm": 33.42092510007329, "learning_rate": 2.6935887003334856e-08, "loss": 0.6785, "step": 7768 }, { "epoch": 0.7680102809974545, "grad_norm": 4.17861467745345, "learning_rate": 2.691402795123041e-08, "loss": 0.7551, "step": 7769 }, { "epoch": 0.7681091367422089, "grad_norm": 4.48541131935433, "learning_rate": 2.6892176393014344e-08, "loss": 0.6944, "step": 7770 }, { "epoch": 0.7682079924869634, "grad_norm": 8.49018939585132, "learning_rate": 2.6870332330927148e-08, "loss": 0.697, "step": 7771 }, { "epoch": 0.7683068482317179, "grad_norm": 4.699940903760272, "learning_rate": 2.6848495767208657e-08, "loss": 0.6344, "step": 7772 }, { "epoch": 0.7684057039764723, "grad_norm": 7.60990929239871, "learning_rate": 2.6826666704097857e-08, "loss": 0.7217, "step": 7773 }, { "epoch": 0.7685045597212268, "grad_norm": 3.9679978372658513, "learning_rate": 2.680484514383298e-08, "loss": 0.7022, "step": 7774 }, { "epoch": 0.7686034154659813, "grad_norm": 3.340331177863032, "learning_rate": 2.678303108865153e-08, "loss": 0.6926, "step": 7775 }, { "epoch": 0.7687022712107358, "grad_norm": 4.818799757456171, "learning_rate": 2.676122454079024e-08, "loss": 0.7474, "step": 7776 }, { "epoch": 0.7688011269554902, "grad_norm": 5.010739229758358, "learning_rate": 2.6739425502485003e-08, "loss": 0.6385, "step": 7777 }, { "epoch": 0.7688999827002446, "grad_norm": 4.231616556863292, "learning_rate": 2.6717633975971032e-08, "loss": 0.6535, "step": 7778 }, { "epoch": 0.7689988384449992, "grad_norm": 8.299982756642985, "learning_rate": 2.6695849963482687e-08, "loss": 0.7788, "step": 7779 }, { "epoch": 0.7690976941897536, "grad_norm": 5.255707822625191, "learning_rate": 2.667407346725361e-08, "loss": 0.7507, "step": 7780 }, { "epoch": 0.769196549934508, "grad_norm": 4.1920709379414145, "learning_rate": 2.6652304489516696e-08, "loss": 0.7026, "step": 7781 }, { "epoch": 0.7692954056792626, "grad_norm": 3.824001847060637, "learning_rate": 2.663054303250397e-08, "loss": 0.6327, "step": 7782 }, { "epoch": 0.769394261424017, "grad_norm": 10.250700256341972, "learning_rate": 2.660878909844678e-08, "loss": 0.6404, "step": 7783 }, { "epoch": 0.7694931171687714, "grad_norm": 73.13139297712725, "learning_rate": 2.658704268957571e-08, "loss": 0.6649, "step": 7784 }, { "epoch": 0.769591972913526, "grad_norm": 3.99899882767821, "learning_rate": 2.6565303808120453e-08, "loss": 0.6891, "step": 7785 }, { "epoch": 0.7696908286582804, "grad_norm": 3.3974851859886357, "learning_rate": 2.654357245631006e-08, "loss": 0.6823, "step": 7786 }, { "epoch": 0.7697896844030349, "grad_norm": 8.404689328926327, "learning_rate": 2.6521848636372778e-08, "loss": 0.7519, "step": 7787 }, { "epoch": 0.7698885401477893, "grad_norm": 5.973949684267329, "learning_rate": 2.650013235053602e-08, "loss": 0.6163, "step": 7788 }, { "epoch": 0.7699873958925438, "grad_norm": 5.972775361704657, "learning_rate": 2.647842360102651e-08, "loss": 0.6249, "step": 7789 }, { "epoch": 0.7700862516372983, "grad_norm": 3.2950315646023762, "learning_rate": 2.645672239007014e-08, "loss": 0.7241, "step": 7790 }, { "epoch": 0.7701851073820527, "grad_norm": 8.36952578944059, "learning_rate": 2.643502871989203e-08, "loss": 0.7098, "step": 7791 }, { "epoch": 0.7702839631268072, "grad_norm": 3.4740483916749896, "learning_rate": 2.6413342592716582e-08, "loss": 0.7469, "step": 7792 }, { "epoch": 0.7703828188715617, "grad_norm": 6.768872898735627, "learning_rate": 2.6391664010767345e-08, "loss": 0.7554, "step": 7793 }, { "epoch": 0.7704816746163161, "grad_norm": 11.111822640368167, "learning_rate": 2.6369992976267164e-08, "loss": 0.8389, "step": 7794 }, { "epoch": 0.7705805303610707, "grad_norm": 40.11663348099736, "learning_rate": 2.6348329491438124e-08, "loss": 0.6296, "step": 7795 }, { "epoch": 0.7706793861058251, "grad_norm": 4.008511875473146, "learning_rate": 2.632667355850141e-08, "loss": 0.7142, "step": 7796 }, { "epoch": 0.7707782418505795, "grad_norm": 3.7868138865566947, "learning_rate": 2.6305025179677576e-08, "loss": 0.7762, "step": 7797 }, { "epoch": 0.770877097595334, "grad_norm": 3.56291595561482, "learning_rate": 2.6283384357186367e-08, "loss": 0.6486, "step": 7798 }, { "epoch": 0.7709759533400885, "grad_norm": 3.780496904388255, "learning_rate": 2.6261751093246655e-08, "loss": 0.6145, "step": 7799 }, { "epoch": 0.7710748090848429, "grad_norm": 3.967801893055563, "learning_rate": 2.6240125390076684e-08, "loss": 0.7541, "step": 7800 }, { "epoch": 0.7711736648295974, "grad_norm": 4.972618896221208, "learning_rate": 2.6218507249893794e-08, "loss": 0.6587, "step": 7801 }, { "epoch": 0.7712725205743519, "grad_norm": 5.47735140435406, "learning_rate": 2.6196896674914624e-08, "loss": 0.6386, "step": 7802 }, { "epoch": 0.7713713763191063, "grad_norm": 4.7975108484846825, "learning_rate": 2.617529366735507e-08, "loss": 0.7187, "step": 7803 }, { "epoch": 0.7714702320638608, "grad_norm": 5.87551946125205, "learning_rate": 2.615369822943012e-08, "loss": 0.8296, "step": 7804 }, { "epoch": 0.7715690878086153, "grad_norm": 3.4354087194927767, "learning_rate": 2.613211036335411e-08, "loss": 0.6193, "step": 7805 }, { "epoch": 0.7716679435533698, "grad_norm": 9.134694197069546, "learning_rate": 2.6110530071340576e-08, "loss": 0.6854, "step": 7806 }, { "epoch": 0.7717667992981242, "grad_norm": 3.116167539522937, "learning_rate": 2.6088957355602248e-08, "loss": 0.6728, "step": 7807 }, { "epoch": 0.7718656550428786, "grad_norm": 3.104692206613377, "learning_rate": 2.606739221835105e-08, "loss": 0.6906, "step": 7808 }, { "epoch": 0.7719645107876332, "grad_norm": 13.449777594554032, "learning_rate": 2.6045834661798218e-08, "loss": 0.6655, "step": 7809 }, { "epoch": 0.7720633665323876, "grad_norm": 5.294268431202817, "learning_rate": 2.6024284688154107e-08, "loss": 0.7741, "step": 7810 }, { "epoch": 0.772162222277142, "grad_norm": 4.120823294699228, "learning_rate": 2.600274229962842e-08, "loss": 0.8111, "step": 7811 }, { "epoch": 0.7722610780218966, "grad_norm": 3.698124093794932, "learning_rate": 2.5981207498429946e-08, "loss": 0.7976, "step": 7812 }, { "epoch": 0.772359933766651, "grad_norm": 7.690019920759459, "learning_rate": 2.5959680286766796e-08, "loss": 0.5981, "step": 7813 }, { "epoch": 0.7724587895114055, "grad_norm": 5.278427886910685, "learning_rate": 2.5938160666846277e-08, "loss": 0.7954, "step": 7814 }, { "epoch": 0.77255764525616, "grad_norm": 3.7137211111589794, "learning_rate": 2.5916648640874872e-08, "loss": 0.5925, "step": 7815 }, { "epoch": 0.7726565010009144, "grad_norm": 3.5673840619424166, "learning_rate": 2.5895144211058327e-08, "loss": 0.6599, "step": 7816 }, { "epoch": 0.7727553567456689, "grad_norm": 4.756944285952017, "learning_rate": 2.587364737960167e-08, "loss": 0.7658, "step": 7817 }, { "epoch": 0.7728542124904233, "grad_norm": 12.920488899905608, "learning_rate": 2.585215814870899e-08, "loss": 0.6735, "step": 7818 }, { "epoch": 0.7729530682351778, "grad_norm": 3.193698076410193, "learning_rate": 2.5830676520583737e-08, "loss": 0.6738, "step": 7819 }, { "epoch": 0.7730519239799323, "grad_norm": 15.33694145620271, "learning_rate": 2.5809202497428573e-08, "loss": 0.706, "step": 7820 }, { "epoch": 0.7731507797246867, "grad_norm": 3.4167809834507694, "learning_rate": 2.5787736081445256e-08, "loss": 0.794, "step": 7821 }, { "epoch": 0.7732496354694413, "grad_norm": 3.62114800737765, "learning_rate": 2.5766277274834937e-08, "loss": 0.603, "step": 7822 }, { "epoch": 0.7733484912141957, "grad_norm": 3.7659756095915027, "learning_rate": 2.574482607979782e-08, "loss": 0.7573, "step": 7823 }, { "epoch": 0.7734473469589501, "grad_norm": 4.037636062084781, "learning_rate": 2.5723382498533484e-08, "loss": 0.7389, "step": 7824 }, { "epoch": 0.7735462027037047, "grad_norm": 4.555404502891533, "learning_rate": 2.5701946533240582e-08, "loss": 0.7518, "step": 7825 }, { "epoch": 0.7736450584484591, "grad_norm": 9.05611203869871, "learning_rate": 2.568051818611712e-08, "loss": 0.7707, "step": 7826 }, { "epoch": 0.7737439141932135, "grad_norm": 12.019151131639116, "learning_rate": 2.56590974593602e-08, "loss": 0.7421, "step": 7827 }, { "epoch": 0.773842769937968, "grad_norm": 3.6326228063213075, "learning_rate": 2.5637684355166268e-08, "loss": 0.6279, "step": 7828 }, { "epoch": 0.7739416256827225, "grad_norm": 4.687242757218988, "learning_rate": 2.5616278875730846e-08, "loss": 0.6011, "step": 7829 }, { "epoch": 0.774040481427477, "grad_norm": 3.108509908321275, "learning_rate": 2.559488102324878e-08, "loss": 0.6954, "step": 7830 }, { "epoch": 0.7741393371722314, "grad_norm": 3.368255466405175, "learning_rate": 2.5573490799914156e-08, "loss": 0.7082, "step": 7831 }, { "epoch": 0.7742381929169859, "grad_norm": 3.2315958395553506, "learning_rate": 2.5552108207920143e-08, "loss": 0.7823, "step": 7832 }, { "epoch": 0.7743370486617404, "grad_norm": 4.050550965030356, "learning_rate": 2.553073324945928e-08, "loss": 0.7997, "step": 7833 }, { "epoch": 0.7744359044064948, "grad_norm": 6.618567190019175, "learning_rate": 2.5509365926723204e-08, "loss": 0.6491, "step": 7834 }, { "epoch": 0.7745347601512493, "grad_norm": 3.5003407542799936, "learning_rate": 2.5488006241902827e-08, "loss": 0.6545, "step": 7835 }, { "epoch": 0.7746336158960038, "grad_norm": 4.949596426640748, "learning_rate": 2.546665419718832e-08, "loss": 0.7222, "step": 7836 }, { "epoch": 0.7747324716407582, "grad_norm": 3.600510288365261, "learning_rate": 2.5445309794768943e-08, "loss": 0.7661, "step": 7837 }, { "epoch": 0.7748313273855126, "grad_norm": 4.065068672310413, "learning_rate": 2.5423973036833292e-08, "loss": 0.7256, "step": 7838 }, { "epoch": 0.7749301831302672, "grad_norm": 7.8984641923595085, "learning_rate": 2.5402643925569155e-08, "loss": 0.6948, "step": 7839 }, { "epoch": 0.7750290388750216, "grad_norm": 4.067442692286579, "learning_rate": 2.538132246316347e-08, "loss": 0.6678, "step": 7840 }, { "epoch": 0.775127894619776, "grad_norm": 3.280842068791394, "learning_rate": 2.536000865180249e-08, "loss": 0.657, "step": 7841 }, { "epoch": 0.7752267503645306, "grad_norm": 4.582771439075677, "learning_rate": 2.533870249367158e-08, "loss": 0.8016, "step": 7842 }, { "epoch": 0.775325606109285, "grad_norm": 4.107048169349126, "learning_rate": 2.5317403990955433e-08, "loss": 0.6851, "step": 7843 }, { "epoch": 0.7754244618540395, "grad_norm": 15.914086903796616, "learning_rate": 2.5296113145837848e-08, "loss": 0.6078, "step": 7844 }, { "epoch": 0.775523317598794, "grad_norm": 3.1379085250894208, "learning_rate": 2.5274829960501876e-08, "loss": 0.807, "step": 7845 }, { "epoch": 0.7756221733435484, "grad_norm": 5.689082901435306, "learning_rate": 2.525355443712982e-08, "loss": 0.6493, "step": 7846 }, { "epoch": 0.7757210290883029, "grad_norm": 3.1499860768384913, "learning_rate": 2.5232286577903196e-08, "loss": 0.7222, "step": 7847 }, { "epoch": 0.7758198848330574, "grad_norm": 4.823590016474512, "learning_rate": 2.5211026385002655e-08, "loss": 0.6952, "step": 7848 }, { "epoch": 0.7759187405778118, "grad_norm": 5.027458951369968, "learning_rate": 2.5189773860608142e-08, "loss": 0.6623, "step": 7849 }, { "epoch": 0.7760175963225663, "grad_norm": 11.202331662735812, "learning_rate": 2.516852900689882e-08, "loss": 0.7407, "step": 7850 }, { "epoch": 0.7761164520673207, "grad_norm": 9.87498330880613, "learning_rate": 2.5147291826052974e-08, "loss": 0.6608, "step": 7851 }, { "epoch": 0.7762153078120753, "grad_norm": 4.304316812843773, "learning_rate": 2.5126062320248198e-08, "loss": 0.6151, "step": 7852 }, { "epoch": 0.7763141635568297, "grad_norm": 4.678076872263316, "learning_rate": 2.5104840491661294e-08, "loss": 0.679, "step": 7853 }, { "epoch": 0.7764130193015841, "grad_norm": 3.1820867182128465, "learning_rate": 2.508362634246819e-08, "loss": 0.57, "step": 7854 }, { "epoch": 0.7765118750463387, "grad_norm": 4.893697264970483, "learning_rate": 2.506241987484413e-08, "loss": 0.6696, "step": 7855 }, { "epoch": 0.7766107307910931, "grad_norm": 4.721493917207291, "learning_rate": 2.5041221090963484e-08, "loss": 0.6965, "step": 7856 }, { "epoch": 0.7767095865358475, "grad_norm": 4.609969200912756, "learning_rate": 2.5020029992999893e-08, "loss": 0.7247, "step": 7857 }, { "epoch": 0.7768084422806021, "grad_norm": 20.289897832421474, "learning_rate": 2.4998846583126222e-08, "loss": 0.5889, "step": 7858 }, { "epoch": 0.7769072980253565, "grad_norm": 6.718793825307793, "learning_rate": 2.4977670863514466e-08, "loss": 0.5897, "step": 7859 }, { "epoch": 0.777006153770111, "grad_norm": 5.03242359189194, "learning_rate": 2.4956502836335923e-08, "loss": 0.8359, "step": 7860 }, { "epoch": 0.7771050095148654, "grad_norm": 5.320287356015045, "learning_rate": 2.4935342503761057e-08, "loss": 0.8093, "step": 7861 }, { "epoch": 0.7772038652596199, "grad_norm": 5.123208578651246, "learning_rate": 2.49141898679595e-08, "loss": 0.8577, "step": 7862 }, { "epoch": 0.7773027210043744, "grad_norm": 3.576465095909544, "learning_rate": 2.4893044931100184e-08, "loss": 0.6591, "step": 7863 }, { "epoch": 0.7774015767491288, "grad_norm": 3.161508179896915, "learning_rate": 2.4871907695351223e-08, "loss": 0.7164, "step": 7864 }, { "epoch": 0.7775004324938833, "grad_norm": 5.63194555365071, "learning_rate": 2.485077816287988e-08, "loss": 0.8067, "step": 7865 }, { "epoch": 0.7775992882386378, "grad_norm": 5.0081160182814015, "learning_rate": 2.4829656335852757e-08, "loss": 0.7283, "step": 7866 }, { "epoch": 0.7776981439833922, "grad_norm": 4.922390743381536, "learning_rate": 2.480854221643549e-08, "loss": 0.7226, "step": 7867 }, { "epoch": 0.7777969997281468, "grad_norm": 3.921028153008956, "learning_rate": 2.4787435806793067e-08, "loss": 0.6976, "step": 7868 }, { "epoch": 0.7778958554729012, "grad_norm": 6.405015568662818, "learning_rate": 2.4766337109089674e-08, "loss": 0.7753, "step": 7869 }, { "epoch": 0.7779947112176556, "grad_norm": 5.548867722173285, "learning_rate": 2.4745246125488606e-08, "loss": 0.7417, "step": 7870 }, { "epoch": 0.77809356696241, "grad_norm": 5.175124487425344, "learning_rate": 2.4724162858152454e-08, "loss": 0.6572, "step": 7871 }, { "epoch": 0.7781924227071646, "grad_norm": 5.088730738339677, "learning_rate": 2.470308730924304e-08, "loss": 0.8145, "step": 7872 }, { "epoch": 0.778291278451919, "grad_norm": 5.3897524131874235, "learning_rate": 2.468201948092129e-08, "loss": 0.6558, "step": 7873 }, { "epoch": 0.7783901341966735, "grad_norm": 4.354893623262532, "learning_rate": 2.4660959375347412e-08, "loss": 0.6953, "step": 7874 }, { "epoch": 0.778488989941428, "grad_norm": 4.959656743126704, "learning_rate": 2.463990699468086e-08, "loss": 0.7365, "step": 7875 }, { "epoch": 0.7785878456861824, "grad_norm": 5.729973811774531, "learning_rate": 2.4618862341080173e-08, "loss": 0.6843, "step": 7876 }, { "epoch": 0.7786867014309369, "grad_norm": 5.1103458409325535, "learning_rate": 2.4597825416703232e-08, "loss": 0.6905, "step": 7877 }, { "epoch": 0.7787855571756914, "grad_norm": 4.329918194066023, "learning_rate": 2.4576796223707043e-08, "loss": 0.833, "step": 7878 }, { "epoch": 0.7788844129204459, "grad_norm": 4.415346141883247, "learning_rate": 2.4555774764247794e-08, "loss": 0.647, "step": 7879 }, { "epoch": 0.7789832686652003, "grad_norm": 4.373319218030685, "learning_rate": 2.453476104048099e-08, "loss": 0.7205, "step": 7880 }, { "epoch": 0.7790821244099547, "grad_norm": 4.895082542938074, "learning_rate": 2.4513755054561215e-08, "loss": 0.6694, "step": 7881 }, { "epoch": 0.7791809801547093, "grad_norm": 3.5007784331611718, "learning_rate": 2.4492756808642368e-08, "loss": 0.7823, "step": 7882 }, { "epoch": 0.7792798358994637, "grad_norm": 12.033646083348511, "learning_rate": 2.447176630487753e-08, "loss": 0.7201, "step": 7883 }, { "epoch": 0.7793786916442181, "grad_norm": 4.189358711577295, "learning_rate": 2.4450783545418906e-08, "loss": 0.7126, "step": 7884 }, { "epoch": 0.7794775473889727, "grad_norm": 15.371167473047045, "learning_rate": 2.4429808532418007e-08, "loss": 0.6917, "step": 7885 }, { "epoch": 0.7795764031337271, "grad_norm": 4.091382019648271, "learning_rate": 2.4408841268025516e-08, "loss": 0.6573, "step": 7886 }, { "epoch": 0.7796752588784815, "grad_norm": 3.7722525361623194, "learning_rate": 2.4387881754391294e-08, "loss": 0.6972, "step": 7887 }, { "epoch": 0.7797741146232361, "grad_norm": 6.886480454037972, "learning_rate": 2.4366929993664455e-08, "loss": 0.7049, "step": 7888 }, { "epoch": 0.7798729703679905, "grad_norm": 14.724240358170256, "learning_rate": 2.4345985987993256e-08, "loss": 0.7445, "step": 7889 }, { "epoch": 0.779971826112745, "grad_norm": 3.5660519230763787, "learning_rate": 2.4325049739525226e-08, "loss": 0.7818, "step": 7890 }, { "epoch": 0.7800706818574994, "grad_norm": 3.285684590489847, "learning_rate": 2.4304121250407082e-08, "loss": 0.5649, "step": 7891 }, { "epoch": 0.7801695376022539, "grad_norm": 3.3685643436292048, "learning_rate": 2.4283200522784686e-08, "loss": 0.5813, "step": 7892 }, { "epoch": 0.7802683933470084, "grad_norm": 5.510563386249811, "learning_rate": 2.4262287558803174e-08, "loss": 0.6165, "step": 7893 }, { "epoch": 0.7803672490917628, "grad_norm": 3.995089416160853, "learning_rate": 2.4241382360606888e-08, "loss": 0.7725, "step": 7894 }, { "epoch": 0.7804661048365173, "grad_norm": 4.774022732811174, "learning_rate": 2.4220484930339326e-08, "loss": 0.7543, "step": 7895 }, { "epoch": 0.7805649605812718, "grad_norm": 3.8650770983945653, "learning_rate": 2.419959527014318e-08, "loss": 0.7446, "step": 7896 }, { "epoch": 0.7806638163260262, "grad_norm": 5.476286244833428, "learning_rate": 2.4178713382160433e-08, "loss": 0.6131, "step": 7897 }, { "epoch": 0.7807626720707808, "grad_norm": 5.0621497585986415, "learning_rate": 2.4157839268532154e-08, "loss": 0.703, "step": 7898 }, { "epoch": 0.7808615278155352, "grad_norm": 5.741148338724125, "learning_rate": 2.4136972931398736e-08, "loss": 0.8045, "step": 7899 }, { "epoch": 0.7809603835602896, "grad_norm": 4.458241664363763, "learning_rate": 2.4116114372899666e-08, "loss": 0.6747, "step": 7900 }, { "epoch": 0.7810592393050441, "grad_norm": 3.5361974923975277, "learning_rate": 2.409526359517369e-08, "loss": 0.7751, "step": 7901 }, { "epoch": 0.7811580950497986, "grad_norm": 6.824568871400273, "learning_rate": 2.407442060035879e-08, "loss": 0.6868, "step": 7902 }, { "epoch": 0.781256950794553, "grad_norm": 8.140043075304199, "learning_rate": 2.4053585390592046e-08, "loss": 0.609, "step": 7903 }, { "epoch": 0.7813558065393075, "grad_norm": 3.039565769862241, "learning_rate": 2.4032757968009842e-08, "loss": 0.6627, "step": 7904 }, { "epoch": 0.781454662284062, "grad_norm": 3.434873646152947, "learning_rate": 2.401193833474773e-08, "loss": 0.7198, "step": 7905 }, { "epoch": 0.7815535180288165, "grad_norm": 3.4310713585531722, "learning_rate": 2.3991126492940416e-08, "loss": 0.7228, "step": 7906 }, { "epoch": 0.7816523737735709, "grad_norm": 4.63224542081652, "learning_rate": 2.397032244472187e-08, "loss": 0.7034, "step": 7907 }, { "epoch": 0.7817512295183254, "grad_norm": 5.193377121493787, "learning_rate": 2.3949526192225267e-08, "loss": 0.6743, "step": 7908 }, { "epoch": 0.7818500852630799, "grad_norm": 15.873124967105285, "learning_rate": 2.3928737737582904e-08, "loss": 0.6583, "step": 7909 }, { "epoch": 0.7819489410078343, "grad_norm": 28.859846596895796, "learning_rate": 2.390795708292638e-08, "loss": 0.6525, "step": 7910 }, { "epoch": 0.7820477967525887, "grad_norm": 2.5298926352566906, "learning_rate": 2.3887184230386394e-08, "loss": 0.6051, "step": 7911 }, { "epoch": 0.7821466524973433, "grad_norm": 3.3718660496167887, "learning_rate": 2.3866419182092944e-08, "loss": 0.6883, "step": 7912 }, { "epoch": 0.7822455082420977, "grad_norm": 3.934728351151892, "learning_rate": 2.3845661940175144e-08, "loss": 0.6053, "step": 7913 }, { "epoch": 0.7823443639868521, "grad_norm": 3.8666302131467436, "learning_rate": 2.3824912506761374e-08, "loss": 0.6473, "step": 7914 }, { "epoch": 0.7824432197316067, "grad_norm": 7.342190123868332, "learning_rate": 2.3804170883979135e-08, "loss": 0.7657, "step": 7915 }, { "epoch": 0.7825420754763611, "grad_norm": 2.9422973762010156, "learning_rate": 2.378343707395525e-08, "loss": 0.7167, "step": 7916 }, { "epoch": 0.7826409312211156, "grad_norm": 3.8855545171358656, "learning_rate": 2.3762711078815568e-08, "loss": 0.777, "step": 7917 }, { "epoch": 0.7827397869658701, "grad_norm": 6.730717492546894, "learning_rate": 2.3741992900685304e-08, "loss": 0.7544, "step": 7918 }, { "epoch": 0.7828386427106245, "grad_norm": 4.851802296442411, "learning_rate": 2.3721282541688805e-08, "loss": 0.8233, "step": 7919 }, { "epoch": 0.782937498455379, "grad_norm": 2.75529166195502, "learning_rate": 2.3700580003949556e-08, "loss": 0.7013, "step": 7920 }, { "epoch": 0.7830363542001334, "grad_norm": 4.3353416403159715, "learning_rate": 2.367988528959036e-08, "loss": 0.7325, "step": 7921 }, { "epoch": 0.7831352099448879, "grad_norm": 11.16760617367665, "learning_rate": 2.3659198400733114e-08, "loss": 0.7297, "step": 7922 }, { "epoch": 0.7832340656896424, "grad_norm": 4.722837574271411, "learning_rate": 2.363851933949895e-08, "loss": 0.7152, "step": 7923 }, { "epoch": 0.7833329214343968, "grad_norm": 5.543837318559596, "learning_rate": 2.3617848108008253e-08, "loss": 0.6866, "step": 7924 }, { "epoch": 0.7834317771791514, "grad_norm": 3.4161797423835827, "learning_rate": 2.359718470838049e-08, "loss": 0.6271, "step": 7925 }, { "epoch": 0.7835306329239058, "grad_norm": 5.888002783240124, "learning_rate": 2.3576529142734404e-08, "loss": 0.8256, "step": 7926 }, { "epoch": 0.7836294886686602, "grad_norm": 4.587234413420677, "learning_rate": 2.3555881413187973e-08, "loss": 0.6735, "step": 7927 }, { "epoch": 0.7837283444134148, "grad_norm": 3.8163570026212597, "learning_rate": 2.353524152185823e-08, "loss": 0.6536, "step": 7928 }, { "epoch": 0.7838272001581692, "grad_norm": 4.120597087554894, "learning_rate": 2.351460947086157e-08, "loss": 0.6932, "step": 7929 }, { "epoch": 0.7839260559029236, "grad_norm": 4.141673464485594, "learning_rate": 2.349398526231344e-08, "loss": 0.7183, "step": 7930 }, { "epoch": 0.7840249116476782, "grad_norm": 5.32607780005227, "learning_rate": 2.3473368898328603e-08, "loss": 0.8215, "step": 7931 }, { "epoch": 0.7841237673924326, "grad_norm": 5.842244627963679, "learning_rate": 2.345276038102094e-08, "loss": 0.6964, "step": 7932 }, { "epoch": 0.784222623137187, "grad_norm": 7.742630230415506, "learning_rate": 2.343215971250353e-08, "loss": 0.7151, "step": 7933 }, { "epoch": 0.7843214788819415, "grad_norm": 68.40132664667709, "learning_rate": 2.3411566894888666e-08, "loss": 0.8121, "step": 7934 }, { "epoch": 0.784420334626696, "grad_norm": 4.456467583694892, "learning_rate": 2.33909819302879e-08, "loss": 0.7447, "step": 7935 }, { "epoch": 0.7845191903714505, "grad_norm": 10.864087984207856, "learning_rate": 2.337040482081185e-08, "loss": 0.6607, "step": 7936 }, { "epoch": 0.7846180461162049, "grad_norm": 6.510730612787214, "learning_rate": 2.334983556857042e-08, "loss": 0.7296, "step": 7937 }, { "epoch": 0.7847169018609594, "grad_norm": 4.307067143286345, "learning_rate": 2.33292741756727e-08, "loss": 0.7123, "step": 7938 }, { "epoch": 0.7848157576057139, "grad_norm": 3.9713333485682383, "learning_rate": 2.330872064422692e-08, "loss": 0.6539, "step": 7939 }, { "epoch": 0.7849146133504683, "grad_norm": 4.574166985472579, "learning_rate": 2.328817497634056e-08, "loss": 0.6858, "step": 7940 }, { "epoch": 0.7850134690952228, "grad_norm": 3.5062007763974705, "learning_rate": 2.3267637174120302e-08, "loss": 0.6881, "step": 7941 }, { "epoch": 0.7851123248399773, "grad_norm": 3.834903599709114, "learning_rate": 2.3247107239671947e-08, "loss": 0.6369, "step": 7942 }, { "epoch": 0.7852111805847317, "grad_norm": 3.334683755610701, "learning_rate": 2.3226585175100587e-08, "loss": 0.6692, "step": 7943 }, { "epoch": 0.7853100363294861, "grad_norm": 10.88139317589456, "learning_rate": 2.3206070982510396e-08, "loss": 0.7643, "step": 7944 }, { "epoch": 0.7854088920742407, "grad_norm": 6.373125928747028, "learning_rate": 2.318556466400484e-08, "loss": 0.6868, "step": 7945 }, { "epoch": 0.7855077478189951, "grad_norm": 3.408507971062486, "learning_rate": 2.316506622168657e-08, "loss": 0.7236, "step": 7946 }, { "epoch": 0.7856066035637496, "grad_norm": 5.928232301649691, "learning_rate": 2.3144575657657328e-08, "loss": 0.6597, "step": 7947 }, { "epoch": 0.7857054593085041, "grad_norm": 3.514562112733798, "learning_rate": 2.3124092974018183e-08, "loss": 0.7336, "step": 7948 }, { "epoch": 0.7858043150532585, "grad_norm": 3.6303565368409, "learning_rate": 2.31036181728693e-08, "loss": 0.8211, "step": 7949 }, { "epoch": 0.785903170798013, "grad_norm": 3.4470937067307714, "learning_rate": 2.3083151256310053e-08, "loss": 0.7089, "step": 7950 }, { "epoch": 0.7860020265427675, "grad_norm": 4.236232256129904, "learning_rate": 2.306269222643904e-08, "loss": 0.6852, "step": 7951 }, { "epoch": 0.786100882287522, "grad_norm": 5.342598569692525, "learning_rate": 2.3042241085354054e-08, "loss": 0.7738, "step": 7952 }, { "epoch": 0.7861997380322764, "grad_norm": 3.998224217778554, "learning_rate": 2.3021797835152014e-08, "loss": 0.7597, "step": 7953 }, { "epoch": 0.7862985937770308, "grad_norm": 2.9681612838234797, "learning_rate": 2.300136247792913e-08, "loss": 0.5823, "step": 7954 }, { "epoch": 0.7863974495217854, "grad_norm": 5.625524409358926, "learning_rate": 2.298093501578068e-08, "loss": 0.6348, "step": 7955 }, { "epoch": 0.7864963052665398, "grad_norm": 4.000476782769655, "learning_rate": 2.2960515450801242e-08, "loss": 0.67, "step": 7956 }, { "epoch": 0.7865951610112942, "grad_norm": 27.433428244556023, "learning_rate": 2.2940103785084553e-08, "loss": 0.7078, "step": 7957 }, { "epoch": 0.7866940167560488, "grad_norm": 4.41186366678544, "learning_rate": 2.2919700020723475e-08, "loss": 0.8276, "step": 7958 }, { "epoch": 0.7867928725008032, "grad_norm": 4.341287447159085, "learning_rate": 2.2899304159810153e-08, "loss": 0.7159, "step": 7959 }, { "epoch": 0.7868917282455576, "grad_norm": 10.948784268806152, "learning_rate": 2.2878916204435904e-08, "loss": 0.7844, "step": 7960 }, { "epoch": 0.7869905839903122, "grad_norm": 3.2389869778020453, "learning_rate": 2.2858536156691143e-08, "loss": 0.7001, "step": 7961 }, { "epoch": 0.7870894397350666, "grad_norm": 3.33751696309188, "learning_rate": 2.2838164018665597e-08, "loss": 0.6745, "step": 7962 }, { "epoch": 0.787188295479821, "grad_norm": 4.115244278942007, "learning_rate": 2.2817799792448134e-08, "loss": 0.5759, "step": 7963 }, { "epoch": 0.7872871512245755, "grad_norm": 3.1788366146536347, "learning_rate": 2.2797443480126754e-08, "loss": 0.7119, "step": 7964 }, { "epoch": 0.78738600696933, "grad_norm": 16.756997493328615, "learning_rate": 2.2777095083788754e-08, "loss": 0.6871, "step": 7965 }, { "epoch": 0.7874848627140845, "grad_norm": 3.188214031370651, "learning_rate": 2.2756754605520545e-08, "loss": 0.6604, "step": 7966 }, { "epoch": 0.7875837184588389, "grad_norm": 3.8184620351545555, "learning_rate": 2.27364220474077e-08, "loss": 0.732, "step": 7967 }, { "epoch": 0.7876825742035934, "grad_norm": 3.855626198529875, "learning_rate": 2.2716097411535084e-08, "loss": 0.7148, "step": 7968 }, { "epoch": 0.7877814299483479, "grad_norm": 4.078361034788952, "learning_rate": 2.2695780699986634e-08, "loss": 0.5817, "step": 7969 }, { "epoch": 0.7878802856931023, "grad_norm": 8.010814655795262, "learning_rate": 2.2675471914845555e-08, "loss": 0.7712, "step": 7970 }, { "epoch": 0.7879791414378569, "grad_norm": 3.758006897735093, "learning_rate": 2.2655171058194232e-08, "loss": 0.715, "step": 7971 }, { "epoch": 0.7880779971826113, "grad_norm": 6.823619081569287, "learning_rate": 2.263487813211419e-08, "loss": 0.6903, "step": 7972 }, { "epoch": 0.7881768529273657, "grad_norm": 4.611349601855551, "learning_rate": 2.2614593138686167e-08, "loss": 0.7746, "step": 7973 }, { "epoch": 0.7882757086721202, "grad_norm": 5.571814339614893, "learning_rate": 2.259431607999014e-08, "loss": 0.6791, "step": 7974 }, { "epoch": 0.7883745644168747, "grad_norm": 7.199250654338662, "learning_rate": 2.2574046958105154e-08, "loss": 0.6552, "step": 7975 }, { "epoch": 0.7884734201616291, "grad_norm": 3.6545987502297104, "learning_rate": 2.255378577510956e-08, "loss": 0.7622, "step": 7976 }, { "epoch": 0.7885722759063836, "grad_norm": 4.811301527382787, "learning_rate": 2.2533532533080802e-08, "loss": 0.6054, "step": 7977 }, { "epoch": 0.7886711316511381, "grad_norm": 4.8765034076708575, "learning_rate": 2.2513287234095568e-08, "loss": 0.6655, "step": 7978 }, { "epoch": 0.7887699873958925, "grad_norm": 3.741553624909421, "learning_rate": 2.2493049880229754e-08, "loss": 0.6305, "step": 7979 }, { "epoch": 0.788868843140647, "grad_norm": 15.733397774190555, "learning_rate": 2.247282047355833e-08, "loss": 0.7941, "step": 7980 }, { "epoch": 0.7889676988854015, "grad_norm": 20.5054392399232, "learning_rate": 2.245259901615556e-08, "loss": 0.6917, "step": 7981 }, { "epoch": 0.789066554630156, "grad_norm": 2.923224560262284, "learning_rate": 2.2432385510094886e-08, "loss": 0.6864, "step": 7982 }, { "epoch": 0.7891654103749104, "grad_norm": 4.319365478203326, "learning_rate": 2.2412179957448873e-08, "loss": 0.7587, "step": 7983 }, { "epoch": 0.7892642661196648, "grad_norm": 4.0825688442557055, "learning_rate": 2.2391982360289286e-08, "loss": 0.6798, "step": 7984 }, { "epoch": 0.7893631218644194, "grad_norm": 3.7058223886823964, "learning_rate": 2.2371792720687134e-08, "loss": 0.6221, "step": 7985 }, { "epoch": 0.7894619776091738, "grad_norm": 4.697623782339271, "learning_rate": 2.2351611040712502e-08, "loss": 0.7228, "step": 7986 }, { "epoch": 0.7895608333539282, "grad_norm": 30.67986443614289, "learning_rate": 2.233143732243481e-08, "loss": 0.5827, "step": 7987 }, { "epoch": 0.7896596890986828, "grad_norm": 5.224985591619126, "learning_rate": 2.2311271567922486e-08, "loss": 0.7818, "step": 7988 }, { "epoch": 0.7897585448434372, "grad_norm": 3.8320025054831097, "learning_rate": 2.2291113779243276e-08, "loss": 0.671, "step": 7989 }, { "epoch": 0.7898574005881916, "grad_norm": 37.16870983121721, "learning_rate": 2.2270963958464095e-08, "loss": 0.7273, "step": 7990 }, { "epoch": 0.7899562563329462, "grad_norm": 4.2490256535518265, "learning_rate": 2.2250822107650947e-08, "loss": 0.7054, "step": 7991 }, { "epoch": 0.7900551120777006, "grad_norm": 5.355763337700417, "learning_rate": 2.223068822886911e-08, "loss": 0.7593, "step": 7992 }, { "epoch": 0.7901539678224551, "grad_norm": 8.449043213625671, "learning_rate": 2.2210562324183046e-08, "loss": 0.8008, "step": 7993 }, { "epoch": 0.7902528235672095, "grad_norm": 3.1969202069638922, "learning_rate": 2.2190444395656304e-08, "loss": 0.7921, "step": 7994 }, { "epoch": 0.790351679311964, "grad_norm": 3.560845240894026, "learning_rate": 2.2170334445351722e-08, "loss": 0.8001, "step": 7995 }, { "epoch": 0.7904505350567185, "grad_norm": 6.1252900465398525, "learning_rate": 2.215023247533131e-08, "loss": 0.7569, "step": 7996 }, { "epoch": 0.7905493908014729, "grad_norm": 2.90522990305727, "learning_rate": 2.213013848765616e-08, "loss": 0.7044, "step": 7997 }, { "epoch": 0.7906482465462275, "grad_norm": 4.984670803789874, "learning_rate": 2.2110052484386677e-08, "loss": 0.7538, "step": 7998 }, { "epoch": 0.7907471022909819, "grad_norm": 11.744281145178316, "learning_rate": 2.2089974467582317e-08, "loss": 0.6442, "step": 7999 }, { "epoch": 0.7908459580357363, "grad_norm": 4.175585473705624, "learning_rate": 2.2069904439301857e-08, "loss": 0.6857, "step": 8000 }, { "epoch": 0.7909448137804909, "grad_norm": 6.671109255896544, "learning_rate": 2.2049842401603114e-08, "loss": 0.6932, "step": 8001 }, { "epoch": 0.7910436695252453, "grad_norm": 5.018828823924566, "learning_rate": 2.202978835654322e-08, "loss": 0.7712, "step": 8002 }, { "epoch": 0.7911425252699997, "grad_norm": 4.374935048770241, "learning_rate": 2.200974230617836e-08, "loss": 0.6862, "step": 8003 }, { "epoch": 0.7912413810147543, "grad_norm": 4.683133690992875, "learning_rate": 2.198970425256401e-08, "loss": 0.6377, "step": 8004 }, { "epoch": 0.7913402367595087, "grad_norm": 6.027326559440247, "learning_rate": 2.196967419775474e-08, "loss": 0.6484, "step": 8005 }, { "epoch": 0.7914390925042631, "grad_norm": 13.081244321430484, "learning_rate": 2.1949652143804353e-08, "loss": 0.7065, "step": 8006 }, { "epoch": 0.7915379482490176, "grad_norm": 3.6743067014600075, "learning_rate": 2.1929638092765833e-08, "loss": 0.6428, "step": 8007 }, { "epoch": 0.7916368039937721, "grad_norm": 6.662122603116459, "learning_rate": 2.1909632046691286e-08, "loss": 0.7678, "step": 8008 }, { "epoch": 0.7917356597385266, "grad_norm": 4.004078300757308, "learning_rate": 2.1889634007632086e-08, "loss": 0.6403, "step": 8009 }, { "epoch": 0.791834515483281, "grad_norm": 7.335553535180043, "learning_rate": 2.1869643977638673e-08, "loss": 0.6983, "step": 8010 }, { "epoch": 0.7919333712280355, "grad_norm": 6.744684942020077, "learning_rate": 2.184966195876078e-08, "loss": 0.6596, "step": 8011 }, { "epoch": 0.79203222697279, "grad_norm": 4.89132616212019, "learning_rate": 2.182968795304727e-08, "loss": 0.7001, "step": 8012 }, { "epoch": 0.7921310827175444, "grad_norm": 4.884987268045727, "learning_rate": 2.1809721962546147e-08, "loss": 0.6913, "step": 8013 }, { "epoch": 0.7922299384622989, "grad_norm": 10.45320160068239, "learning_rate": 2.1789763989304643e-08, "loss": 0.7488, "step": 8014 }, { "epoch": 0.7923287942070534, "grad_norm": 6.647304119100279, "learning_rate": 2.1769814035369193e-08, "loss": 0.7222, "step": 8015 }, { "epoch": 0.7924276499518078, "grad_norm": 6.219794983380087, "learning_rate": 2.1749872102785316e-08, "loss": 0.5973, "step": 8016 }, { "epoch": 0.7925265056965622, "grad_norm": 9.390846267667538, "learning_rate": 2.1729938193597798e-08, "loss": 0.6158, "step": 8017 }, { "epoch": 0.7926253614413168, "grad_norm": 4.23259132486889, "learning_rate": 2.1710012309850533e-08, "loss": 0.5986, "step": 8018 }, { "epoch": 0.7927242171860712, "grad_norm": 4.428209106563973, "learning_rate": 2.169009445358667e-08, "loss": 0.6677, "step": 8019 }, { "epoch": 0.7928230729308257, "grad_norm": 3.782525782249729, "learning_rate": 2.1670184626848477e-08, "loss": 0.7392, "step": 8020 }, { "epoch": 0.7929219286755802, "grad_norm": 3.3784302891183473, "learning_rate": 2.1650282831677367e-08, "loss": 0.6603, "step": 8021 }, { "epoch": 0.7930207844203346, "grad_norm": 5.772061568954741, "learning_rate": 2.163038907011402e-08, "loss": 0.5832, "step": 8022 }, { "epoch": 0.7931196401650891, "grad_norm": 5.244454350414224, "learning_rate": 2.161050334419827e-08, "loss": 0.5931, "step": 8023 }, { "epoch": 0.7932184959098436, "grad_norm": 5.773776029106545, "learning_rate": 2.1590625655969043e-08, "loss": 0.7465, "step": 8024 }, { "epoch": 0.793317351654598, "grad_norm": 3.2557435661991287, "learning_rate": 2.157075600746453e-08, "loss": 0.6085, "step": 8025 }, { "epoch": 0.7934162073993525, "grad_norm": 5.225229002193835, "learning_rate": 2.155089440072212e-08, "loss": 0.6335, "step": 8026 }, { "epoch": 0.7935150631441069, "grad_norm": 4.152833505454176, "learning_rate": 2.1531040837778234e-08, "loss": 0.7487, "step": 8027 }, { "epoch": 0.7936139188888615, "grad_norm": 6.424456164882012, "learning_rate": 2.151119532066863e-08, "loss": 0.7194, "step": 8028 }, { "epoch": 0.7937127746336159, "grad_norm": 4.008157264099073, "learning_rate": 2.149135785142817e-08, "loss": 0.6901, "step": 8029 }, { "epoch": 0.7938116303783703, "grad_norm": 2.9733637686769843, "learning_rate": 2.1471528432090856e-08, "loss": 0.7102, "step": 8030 }, { "epoch": 0.7939104861231249, "grad_norm": 3.875086120823478, "learning_rate": 2.1451707064689963e-08, "loss": 0.7404, "step": 8031 }, { "epoch": 0.7940093418678793, "grad_norm": 3.309873499276737, "learning_rate": 2.1431893751257802e-08, "loss": 0.7221, "step": 8032 }, { "epoch": 0.7941081976126337, "grad_norm": 5.727981786135744, "learning_rate": 2.1412088493825974e-08, "loss": 0.7539, "step": 8033 }, { "epoch": 0.7942070533573883, "grad_norm": 38.26110321206332, "learning_rate": 2.1392291294425245e-08, "loss": 0.6302, "step": 8034 }, { "epoch": 0.7943059091021427, "grad_norm": 3.9731071033035574, "learning_rate": 2.137250215508548e-08, "loss": 0.6897, "step": 8035 }, { "epoch": 0.7944047648468971, "grad_norm": 2.901708803360247, "learning_rate": 2.13527210778358e-08, "loss": 0.6879, "step": 8036 }, { "epoch": 0.7945036205916516, "grad_norm": 3.530732500312674, "learning_rate": 2.133294806470445e-08, "loss": 0.6599, "step": 8037 }, { "epoch": 0.7946024763364061, "grad_norm": 4.444139872083399, "learning_rate": 2.1313183117718825e-08, "loss": 0.8117, "step": 8038 }, { "epoch": 0.7947013320811606, "grad_norm": 5.049486215810018, "learning_rate": 2.1293426238905554e-08, "loss": 0.793, "step": 8039 }, { "epoch": 0.794800187825915, "grad_norm": 3.3138905766336193, "learning_rate": 2.127367743029046e-08, "loss": 0.7655, "step": 8040 }, { "epoch": 0.7948990435706695, "grad_norm": 3.9109465963994645, "learning_rate": 2.1253936693898412e-08, "loss": 0.7563, "step": 8041 }, { "epoch": 0.794997899315424, "grad_norm": 67.3712578921352, "learning_rate": 2.12342040317536e-08, "loss": 0.653, "step": 8042 }, { "epoch": 0.7950967550601784, "grad_norm": 14.243264535549752, "learning_rate": 2.1214479445879263e-08, "loss": 0.7728, "step": 8043 }, { "epoch": 0.795195610804933, "grad_norm": 6.049394722691088, "learning_rate": 2.1194762938297884e-08, "loss": 0.8142, "step": 8044 }, { "epoch": 0.7952944665496874, "grad_norm": 9.18411710478427, "learning_rate": 2.1175054511031144e-08, "loss": 0.7855, "step": 8045 }, { "epoch": 0.7953933222944418, "grad_norm": 3.5867285444547616, "learning_rate": 2.1155354166099783e-08, "loss": 0.7274, "step": 8046 }, { "epoch": 0.7954921780391963, "grad_norm": 5.828503104307514, "learning_rate": 2.1135661905523804e-08, "loss": 0.7112, "step": 8047 }, { "epoch": 0.7955910337839508, "grad_norm": 6.116947221754191, "learning_rate": 2.1115977731322398e-08, "loss": 0.6881, "step": 8048 }, { "epoch": 0.7956898895287052, "grad_norm": 4.662552406288064, "learning_rate": 2.1096301645513826e-08, "loss": 0.7266, "step": 8049 }, { "epoch": 0.7957887452734597, "grad_norm": 4.314948749707543, "learning_rate": 2.1076633650115617e-08, "loss": 0.5832, "step": 8050 }, { "epoch": 0.7958876010182142, "grad_norm": 3.2581541910745324, "learning_rate": 2.1056973747144445e-08, "loss": 0.6591, "step": 8051 }, { "epoch": 0.7959864567629686, "grad_norm": 19.010837937080357, "learning_rate": 2.1037321938616093e-08, "loss": 0.8007, "step": 8052 }, { "epoch": 0.7960853125077231, "grad_norm": 12.149160451127367, "learning_rate": 2.1017678226545632e-08, "loss": 0.6793, "step": 8053 }, { "epoch": 0.7961841682524776, "grad_norm": 3.595750101527138, "learning_rate": 2.0998042612947187e-08, "loss": 0.6897, "step": 8054 }, { "epoch": 0.796283023997232, "grad_norm": 3.5330965799129967, "learning_rate": 2.097841509983409e-08, "loss": 0.7404, "step": 8055 }, { "epoch": 0.7963818797419865, "grad_norm": 3.6692030257104937, "learning_rate": 2.09587956892189e-08, "loss": 0.7822, "step": 8056 }, { "epoch": 0.7964807354867409, "grad_norm": 8.942758370329562, "learning_rate": 2.0939184383113252e-08, "loss": 0.726, "step": 8057 }, { "epoch": 0.7965795912314955, "grad_norm": 11.722889513971337, "learning_rate": 2.0919581183528012e-08, "loss": 0.7415, "step": 8058 }, { "epoch": 0.7966784469762499, "grad_norm": 34.11785209571026, "learning_rate": 2.089998609247322e-08, "loss": 0.7142, "step": 8059 }, { "epoch": 0.7967773027210043, "grad_norm": 4.408576341054688, "learning_rate": 2.0880399111958036e-08, "loss": 0.666, "step": 8060 }, { "epoch": 0.7968761584657589, "grad_norm": 3.3745661865778627, "learning_rate": 2.0860820243990818e-08, "loss": 0.6888, "step": 8061 }, { "epoch": 0.7969750142105133, "grad_norm": 4.319421893654931, "learning_rate": 2.0841249490579127e-08, "loss": 0.73, "step": 8062 }, { "epoch": 0.7970738699552677, "grad_norm": 4.41272352931148, "learning_rate": 2.08216868537296e-08, "loss": 0.7473, "step": 8063 }, { "epoch": 0.7971727257000223, "grad_norm": 13.401357850818037, "learning_rate": 2.0802132335448154e-08, "loss": 0.6473, "step": 8064 }, { "epoch": 0.7972715814447767, "grad_norm": 3.929366975149527, "learning_rate": 2.078258593773975e-08, "loss": 0.5866, "step": 8065 }, { "epoch": 0.7973704371895312, "grad_norm": 3.722015365338939, "learning_rate": 2.0763047662608623e-08, "loss": 0.6099, "step": 8066 }, { "epoch": 0.7974692929342856, "grad_norm": 5.199326734944557, "learning_rate": 2.0743517512058163e-08, "loss": 0.7046, "step": 8067 }, { "epoch": 0.7975681486790401, "grad_norm": 3.6675757830585813, "learning_rate": 2.072399548809084e-08, "loss": 0.6662, "step": 8068 }, { "epoch": 0.7976670044237946, "grad_norm": 8.062979898629724, "learning_rate": 2.070448159270838e-08, "loss": 0.6765, "step": 8069 }, { "epoch": 0.797765860168549, "grad_norm": 5.0576434118092655, "learning_rate": 2.0684975827911656e-08, "loss": 0.6662, "step": 8070 }, { "epoch": 0.7978647159133035, "grad_norm": 5.180636850437206, "learning_rate": 2.0665478195700693e-08, "loss": 0.8115, "step": 8071 }, { "epoch": 0.797963571658058, "grad_norm": 4.4458296156617525, "learning_rate": 2.0645988698074634e-08, "loss": 0.7213, "step": 8072 }, { "epoch": 0.7980624274028124, "grad_norm": 3.528763287094208, "learning_rate": 2.0626507337031917e-08, "loss": 0.6119, "step": 8073 }, { "epoch": 0.798161283147567, "grad_norm": 3.274028460644096, "learning_rate": 2.060703411457001e-08, "loss": 0.6698, "step": 8074 }, { "epoch": 0.7982601388923214, "grad_norm": 30.180095009728458, "learning_rate": 2.0587569032685635e-08, "loss": 0.6384, "step": 8075 }, { "epoch": 0.7983589946370758, "grad_norm": 3.670248011518941, "learning_rate": 2.0568112093374634e-08, "loss": 0.8001, "step": 8076 }, { "epoch": 0.7984578503818303, "grad_norm": 2.7472876690875454, "learning_rate": 2.054866329863202e-08, "loss": 0.7215, "step": 8077 }, { "epoch": 0.7985567061265848, "grad_norm": 2.9660745758187725, "learning_rate": 2.0529222650452028e-08, "loss": 0.7297, "step": 8078 }, { "epoch": 0.7986555618713392, "grad_norm": 4.195381394223327, "learning_rate": 2.0509790150827943e-08, "loss": 0.7175, "step": 8079 }, { "epoch": 0.7987544176160937, "grad_norm": 4.17238074415259, "learning_rate": 2.0490365801752317e-08, "loss": 0.7221, "step": 8080 }, { "epoch": 0.7988532733608482, "grad_norm": 5.37700508777497, "learning_rate": 2.0470949605216868e-08, "loss": 0.7411, "step": 8081 }, { "epoch": 0.7989521291056026, "grad_norm": 4.118258419236742, "learning_rate": 2.045154156321236e-08, "loss": 0.7819, "step": 8082 }, { "epoch": 0.7990509848503571, "grad_norm": 3.643008844472692, "learning_rate": 2.0432141677728855e-08, "loss": 0.6822, "step": 8083 }, { "epoch": 0.7991498405951116, "grad_norm": 4.274867072538961, "learning_rate": 2.0412749950755525e-08, "loss": 0.8026, "step": 8084 }, { "epoch": 0.7992486963398661, "grad_norm": 3.544116648031486, "learning_rate": 2.0393366384280674e-08, "loss": 0.7935, "step": 8085 }, { "epoch": 0.7993475520846205, "grad_norm": 4.304452531980691, "learning_rate": 2.0373990980291844e-08, "loss": 0.6773, "step": 8086 }, { "epoch": 0.799446407829375, "grad_norm": 4.01816693830789, "learning_rate": 2.0354623740775643e-08, "loss": 0.644, "step": 8087 }, { "epoch": 0.7995452635741295, "grad_norm": 3.555294405147759, "learning_rate": 2.0335264667717956e-08, "loss": 0.7897, "step": 8088 }, { "epoch": 0.7996441193188839, "grad_norm": 3.279926600650593, "learning_rate": 2.0315913763103722e-08, "loss": 0.7821, "step": 8089 }, { "epoch": 0.7997429750636383, "grad_norm": 6.233369017568878, "learning_rate": 2.029657102891712e-08, "loss": 0.6589, "step": 8090 }, { "epoch": 0.7998418308083929, "grad_norm": 19.2216354020714, "learning_rate": 2.0277236467141436e-08, "loss": 0.6563, "step": 8091 }, { "epoch": 0.7999406865531473, "grad_norm": 6.511785824731293, "learning_rate": 2.0257910079759176e-08, "loss": 0.7167, "step": 8092 }, { "epoch": 0.8000395422979018, "grad_norm": 6.030238248282697, "learning_rate": 2.023859186875193e-08, "loss": 0.6585, "step": 8093 }, { "epoch": 0.8001383980426563, "grad_norm": 7.466910954072572, "learning_rate": 2.0219281836100532e-08, "loss": 0.7082, "step": 8094 }, { "epoch": 0.8002372537874107, "grad_norm": 3.5697836165587757, "learning_rate": 2.0199979983784966e-08, "loss": 0.7237, "step": 8095 }, { "epoch": 0.8003361095321652, "grad_norm": 4.426020902102568, "learning_rate": 2.0180686313784278e-08, "loss": 0.6752, "step": 8096 }, { "epoch": 0.8004349652769197, "grad_norm": 3.5160122410106163, "learning_rate": 2.0161400828076826e-08, "loss": 0.5937, "step": 8097 }, { "epoch": 0.8005338210216741, "grad_norm": 4.762634656978886, "learning_rate": 2.0142123528639988e-08, "loss": 0.7138, "step": 8098 }, { "epoch": 0.8006326767664286, "grad_norm": 4.09055842820674, "learning_rate": 2.0122854417450398e-08, "loss": 0.747, "step": 8099 }, { "epoch": 0.800731532511183, "grad_norm": 4.189456518534083, "learning_rate": 2.010359349648384e-08, "loss": 0.7289, "step": 8100 }, { "epoch": 0.8008303882559376, "grad_norm": 3.857438340912207, "learning_rate": 2.008434076771519e-08, "loss": 0.6649, "step": 8101 }, { "epoch": 0.800929244000692, "grad_norm": 4.625002840569425, "learning_rate": 2.006509623311856e-08, "loss": 0.6313, "step": 8102 }, { "epoch": 0.8010280997454464, "grad_norm": 3.108123766184425, "learning_rate": 2.0045859894667206e-08, "loss": 0.6706, "step": 8103 }, { "epoch": 0.801126955490201, "grad_norm": 3.005111633933565, "learning_rate": 2.002663175433349e-08, "loss": 0.6397, "step": 8104 }, { "epoch": 0.8012258112349554, "grad_norm": 3.9238905838901523, "learning_rate": 2.0007411814089027e-08, "loss": 0.756, "step": 8105 }, { "epoch": 0.8013246669797098, "grad_norm": 4.996409609996933, "learning_rate": 1.9988200075904483e-08, "loss": 0.7001, "step": 8106 }, { "epoch": 0.8014235227244644, "grad_norm": 2.6206739923307087, "learning_rate": 1.9968996541749783e-08, "loss": 0.6713, "step": 8107 }, { "epoch": 0.8015223784692188, "grad_norm": 5.005501281655616, "learning_rate": 1.9949801213593952e-08, "loss": 0.8405, "step": 8108 }, { "epoch": 0.8016212342139732, "grad_norm": 5.714842396372804, "learning_rate": 1.9930614093405153e-08, "loss": 0.6252, "step": 8109 }, { "epoch": 0.8017200899587277, "grad_norm": 3.2964230152860847, "learning_rate": 1.991143518315077e-08, "loss": 0.796, "step": 8110 }, { "epoch": 0.8018189457034822, "grad_norm": 4.509164692423234, "learning_rate": 1.989226448479736e-08, "loss": 0.7527, "step": 8111 }, { "epoch": 0.8019178014482367, "grad_norm": 17.58630934442817, "learning_rate": 1.9873102000310515e-08, "loss": 0.6999, "step": 8112 }, { "epoch": 0.8020166571929911, "grad_norm": 2.89555627043851, "learning_rate": 1.985394773165511e-08, "loss": 0.7395, "step": 8113 }, { "epoch": 0.8021155129377456, "grad_norm": 5.208982929334602, "learning_rate": 1.9834801680795156e-08, "loss": 0.8391, "step": 8114 }, { "epoch": 0.8022143686825001, "grad_norm": 6.857070521786657, "learning_rate": 1.981566384969374e-08, "loss": 0.8851, "step": 8115 }, { "epoch": 0.8023132244272545, "grad_norm": 4.747328370063339, "learning_rate": 1.979653424031319e-08, "loss": 0.7525, "step": 8116 }, { "epoch": 0.802412080172009, "grad_norm": 3.166068649909162, "learning_rate": 1.9777412854615005e-08, "loss": 0.6254, "step": 8117 }, { "epoch": 0.8025109359167635, "grad_norm": 3.6454930191087884, "learning_rate": 1.975829969455973e-08, "loss": 0.7109, "step": 8118 }, { "epoch": 0.8026097916615179, "grad_norm": 6.320274982021155, "learning_rate": 1.9739194762107202e-08, "loss": 0.6239, "step": 8119 }, { "epoch": 0.8027086474062723, "grad_norm": 8.968933846069138, "learning_rate": 1.9720098059216307e-08, "loss": 0.6738, "step": 8120 }, { "epoch": 0.8028075031510269, "grad_norm": 5.536111403605802, "learning_rate": 1.970100958784513e-08, "loss": 0.7811, "step": 8121 }, { "epoch": 0.8029063588957813, "grad_norm": 3.3590779422663264, "learning_rate": 1.9681929349950955e-08, "loss": 0.676, "step": 8122 }, { "epoch": 0.8030052146405358, "grad_norm": 4.470301323728995, "learning_rate": 1.9662857347490124e-08, "loss": 0.6292, "step": 8123 }, { "epoch": 0.8031040703852903, "grad_norm": 4.597226970557335, "learning_rate": 1.9643793582418244e-08, "loss": 0.7593, "step": 8124 }, { "epoch": 0.8032029261300447, "grad_norm": 3.2212151211836693, "learning_rate": 1.9624738056689994e-08, "loss": 0.757, "step": 8125 }, { "epoch": 0.8033017818747992, "grad_norm": 9.927715332984322, "learning_rate": 1.960569077225921e-08, "loss": 0.7206, "step": 8126 }, { "epoch": 0.8034006376195537, "grad_norm": 10.386715619595421, "learning_rate": 1.958665173107894e-08, "loss": 0.6704, "step": 8127 }, { "epoch": 0.8034994933643081, "grad_norm": 4.490359632699761, "learning_rate": 1.956762093510138e-08, "loss": 0.6756, "step": 8128 }, { "epoch": 0.8035983491090626, "grad_norm": 5.533786778150065, "learning_rate": 1.9548598386277804e-08, "loss": 0.7621, "step": 8129 }, { "epoch": 0.803697204853817, "grad_norm": 4.937049909336295, "learning_rate": 1.9529584086558747e-08, "loss": 0.6913, "step": 8130 }, { "epoch": 0.8037960605985716, "grad_norm": 4.284610871927281, "learning_rate": 1.95105780378938e-08, "loss": 0.777, "step": 8131 }, { "epoch": 0.803894916343326, "grad_norm": 2.8599050318870924, "learning_rate": 1.9491580242231764e-08, "loss": 0.7054, "step": 8132 }, { "epoch": 0.8039937720880804, "grad_norm": 5.659455328997459, "learning_rate": 1.947259070152063e-08, "loss": 0.7568, "step": 8133 }, { "epoch": 0.804092627832835, "grad_norm": 3.6799102402691752, "learning_rate": 1.9453609417707417e-08, "loss": 0.6872, "step": 8134 }, { "epoch": 0.8041914835775894, "grad_norm": 3.49143131373834, "learning_rate": 1.9434636392738424e-08, "loss": 0.6436, "step": 8135 }, { "epoch": 0.8042903393223438, "grad_norm": 3.13834011693585, "learning_rate": 1.941567162855907e-08, "loss": 0.6957, "step": 8136 }, { "epoch": 0.8043891950670984, "grad_norm": 4.853966902434128, "learning_rate": 1.939671512711386e-08, "loss": 0.6408, "step": 8137 }, { "epoch": 0.8044880508118528, "grad_norm": 4.791482849456619, "learning_rate": 1.9377766890346538e-08, "loss": 0.7154, "step": 8138 }, { "epoch": 0.8045869065566073, "grad_norm": 4.126254003317854, "learning_rate": 1.9358826920199976e-08, "loss": 0.6496, "step": 8139 }, { "epoch": 0.8046857623013617, "grad_norm": 13.911987775063885, "learning_rate": 1.9339895218616186e-08, "loss": 0.6047, "step": 8140 }, { "epoch": 0.8047846180461162, "grad_norm": 3.4689454434307425, "learning_rate": 1.9320971787536276e-08, "loss": 0.8412, "step": 8141 }, { "epoch": 0.8048834737908707, "grad_norm": 3.345980641525808, "learning_rate": 1.9302056628900653e-08, "loss": 0.6865, "step": 8142 }, { "epoch": 0.8049823295356251, "grad_norm": 3.0140554749876562, "learning_rate": 1.928314974464871e-08, "loss": 0.7132, "step": 8143 }, { "epoch": 0.8050811852803796, "grad_norm": 3.450757274384056, "learning_rate": 1.9264251136719135e-08, "loss": 0.6339, "step": 8144 }, { "epoch": 0.8051800410251341, "grad_norm": 3.91145694498434, "learning_rate": 1.9245360807049637e-08, "loss": 0.747, "step": 8145 }, { "epoch": 0.8052788967698885, "grad_norm": 4.077156308594188, "learning_rate": 1.922647875757718e-08, "loss": 0.6584, "step": 8146 }, { "epoch": 0.805377752514643, "grad_norm": 3.435731444071695, "learning_rate": 1.9207604990237858e-08, "loss": 0.6942, "step": 8147 }, { "epoch": 0.8054766082593975, "grad_norm": 4.03687652878411, "learning_rate": 1.9188739506966835e-08, "loss": 0.7265, "step": 8148 }, { "epoch": 0.8055754640041519, "grad_norm": 3.9164971357718, "learning_rate": 1.9169882309698537e-08, "loss": 0.7718, "step": 8149 }, { "epoch": 0.8056743197489064, "grad_norm": 2.6830858016985486, "learning_rate": 1.9151033400366502e-08, "loss": 0.6243, "step": 8150 }, { "epoch": 0.8057731754936609, "grad_norm": 5.916403410851465, "learning_rate": 1.913219278090337e-08, "loss": 0.7133, "step": 8151 }, { "epoch": 0.8058720312384153, "grad_norm": 3.6043494264536546, "learning_rate": 1.9113360453241e-08, "loss": 0.7657, "step": 8152 }, { "epoch": 0.8059708869831698, "grad_norm": 4.320696767191986, "learning_rate": 1.9094536419310324e-08, "loss": 0.6846, "step": 8153 }, { "epoch": 0.8060697427279243, "grad_norm": 3.1294090047000545, "learning_rate": 1.9075720681041507e-08, "loss": 0.6783, "step": 8154 }, { "epoch": 0.8061685984726787, "grad_norm": 4.020975441216155, "learning_rate": 1.9056913240363835e-08, "loss": 0.6656, "step": 8155 }, { "epoch": 0.8062674542174332, "grad_norm": 4.885802947011346, "learning_rate": 1.9038114099205695e-08, "loss": 0.7113, "step": 8156 }, { "epoch": 0.8063663099621877, "grad_norm": 8.555111694530508, "learning_rate": 1.9019323259494714e-08, "loss": 0.6834, "step": 8157 }, { "epoch": 0.8064651657069422, "grad_norm": 4.856333314508178, "learning_rate": 1.9000540723157543e-08, "loss": 0.7044, "step": 8158 }, { "epoch": 0.8065640214516966, "grad_norm": 4.113343164492327, "learning_rate": 1.8981766492120133e-08, "loss": 0.6448, "step": 8159 }, { "epoch": 0.8066628771964511, "grad_norm": 18.911948160779964, "learning_rate": 1.896300056830744e-08, "loss": 0.7058, "step": 8160 }, { "epoch": 0.8067617329412056, "grad_norm": 9.360276347502339, "learning_rate": 1.8944242953643676e-08, "loss": 0.6714, "step": 8161 }, { "epoch": 0.80686058868596, "grad_norm": 3.8378025395492132, "learning_rate": 1.8925493650052127e-08, "loss": 0.7446, "step": 8162 }, { "epoch": 0.8069594444307144, "grad_norm": 3.577522924752322, "learning_rate": 1.8906752659455283e-08, "loss": 0.7303, "step": 8163 }, { "epoch": 0.807058300175469, "grad_norm": 3.2082284026510948, "learning_rate": 1.8888019983774726e-08, "loss": 0.6839, "step": 8164 }, { "epoch": 0.8071571559202234, "grad_norm": 5.704733146418734, "learning_rate": 1.8869295624931237e-08, "loss": 0.7363, "step": 8165 }, { "epoch": 0.8072560116649778, "grad_norm": 4.334902989990931, "learning_rate": 1.885057958484474e-08, "loss": 0.7036, "step": 8166 }, { "epoch": 0.8073548674097324, "grad_norm": 4.75450110571059, "learning_rate": 1.8831871865434246e-08, "loss": 0.7063, "step": 8167 }, { "epoch": 0.8074537231544868, "grad_norm": 4.324332530902224, "learning_rate": 1.8813172468617967e-08, "loss": 0.8492, "step": 8168 }, { "epoch": 0.8075525788992413, "grad_norm": 8.002785807368168, "learning_rate": 1.87944813963133e-08, "loss": 0.7189, "step": 8169 }, { "epoch": 0.8076514346439958, "grad_norm": 3.896823401794053, "learning_rate": 1.877579865043667e-08, "loss": 0.7236, "step": 8170 }, { "epoch": 0.8077502903887502, "grad_norm": 3.4020044430335328, "learning_rate": 1.8757124232903743e-08, "loss": 0.7651, "step": 8171 }, { "epoch": 0.8078491461335047, "grad_norm": 3.2813559967400243, "learning_rate": 1.8738458145629345e-08, "loss": 0.6374, "step": 8172 }, { "epoch": 0.8079480018782591, "grad_norm": 24.987478659834885, "learning_rate": 1.871980039052734e-08, "loss": 0.7074, "step": 8173 }, { "epoch": 0.8080468576230136, "grad_norm": 9.452432362472065, "learning_rate": 1.870115096951086e-08, "loss": 0.8397, "step": 8174 }, { "epoch": 0.8081457133677681, "grad_norm": 4.693398145572609, "learning_rate": 1.868250988449209e-08, "loss": 0.6917, "step": 8175 }, { "epoch": 0.8082445691125225, "grad_norm": 27.419923313480933, "learning_rate": 1.8663877137382432e-08, "loss": 0.7055, "step": 8176 }, { "epoch": 0.8083434248572771, "grad_norm": 5.093839624193209, "learning_rate": 1.8645252730092353e-08, "loss": 0.7145, "step": 8177 }, { "epoch": 0.8084422806020315, "grad_norm": 5.76758467169519, "learning_rate": 1.862663666453157e-08, "loss": 0.7011, "step": 8178 }, { "epoch": 0.8085411363467859, "grad_norm": 7.1236026205168725, "learning_rate": 1.8608028942608832e-08, "loss": 0.7308, "step": 8179 }, { "epoch": 0.8086399920915405, "grad_norm": 4.601693834473134, "learning_rate": 1.858942956623214e-08, "loss": 0.8292, "step": 8180 }, { "epoch": 0.8087388478362949, "grad_norm": 4.170198767104453, "learning_rate": 1.857083853730852e-08, "loss": 0.7412, "step": 8181 }, { "epoch": 0.8088377035810493, "grad_norm": 4.783223108454432, "learning_rate": 1.855225585774425e-08, "loss": 0.7081, "step": 8182 }, { "epoch": 0.8089365593258038, "grad_norm": 10.260101586026657, "learning_rate": 1.8533681529444723e-08, "loss": 0.7088, "step": 8183 }, { "epoch": 0.8090354150705583, "grad_norm": 4.012702342966552, "learning_rate": 1.851511555431442e-08, "loss": 0.6145, "step": 8184 }, { "epoch": 0.8091342708153128, "grad_norm": 4.776286707360679, "learning_rate": 1.849655793425705e-08, "loss": 0.6851, "step": 8185 }, { "epoch": 0.8092331265600672, "grad_norm": 4.2249096739713945, "learning_rate": 1.8478008671175382e-08, "loss": 0.7273, "step": 8186 }, { "epoch": 0.8093319823048217, "grad_norm": 8.850385820498555, "learning_rate": 1.8459467766971393e-08, "loss": 0.6715, "step": 8187 }, { "epoch": 0.8094308380495762, "grad_norm": 3.9476191124175593, "learning_rate": 1.8440935223546194e-08, "loss": 0.6839, "step": 8188 }, { "epoch": 0.8095296937943306, "grad_norm": 3.1798306506150094, "learning_rate": 1.842241104279999e-08, "loss": 0.865, "step": 8189 }, { "epoch": 0.8096285495390851, "grad_norm": 3.0677553873650503, "learning_rate": 1.8403895226632182e-08, "loss": 0.6936, "step": 8190 }, { "epoch": 0.8097274052838396, "grad_norm": 4.473809637646809, "learning_rate": 1.8385387776941308e-08, "loss": 0.6968, "step": 8191 }, { "epoch": 0.809826261028594, "grad_norm": 8.203366040248381, "learning_rate": 1.8366888695625006e-08, "loss": 0.7318, "step": 8192 }, { "epoch": 0.8099251167733484, "grad_norm": 4.907774632496555, "learning_rate": 1.834839798458011e-08, "loss": 0.7082, "step": 8193 }, { "epoch": 0.810023972518103, "grad_norm": 3.413933087269006, "learning_rate": 1.8329915645702522e-08, "loss": 0.6947, "step": 8194 }, { "epoch": 0.8101228282628574, "grad_norm": 4.627868986948393, "learning_rate": 1.8311441680887407e-08, "loss": 0.8207, "step": 8195 }, { "epoch": 0.8102216840076119, "grad_norm": 3.548140764944573, "learning_rate": 1.8292976092028945e-08, "loss": 0.6936, "step": 8196 }, { "epoch": 0.8103205397523664, "grad_norm": 4.238567579393971, "learning_rate": 1.8274518881020507e-08, "loss": 0.6912, "step": 8197 }, { "epoch": 0.8104193954971208, "grad_norm": 5.016609537819754, "learning_rate": 1.8256070049754613e-08, "loss": 0.7587, "step": 8198 }, { "epoch": 0.8105182512418753, "grad_norm": 3.4734301664061333, "learning_rate": 1.823762960012295e-08, "loss": 0.718, "step": 8199 }, { "epoch": 0.8106171069866298, "grad_norm": 3.0597498150441313, "learning_rate": 1.8219197534016272e-08, "loss": 0.6586, "step": 8200 }, { "epoch": 0.8107159627313842, "grad_norm": 5.027853165180677, "learning_rate": 1.8200773853324535e-08, "loss": 0.6562, "step": 8201 }, { "epoch": 0.8108148184761387, "grad_norm": 2.920335786429377, "learning_rate": 1.818235855993684e-08, "loss": 0.6463, "step": 8202 }, { "epoch": 0.8109136742208931, "grad_norm": 10.040753312854289, "learning_rate": 1.816395165574136e-08, "loss": 0.6952, "step": 8203 }, { "epoch": 0.8110125299656477, "grad_norm": 3.18723536190724, "learning_rate": 1.8145553142625458e-08, "loss": 0.7675, "step": 8204 }, { "epoch": 0.8111113857104021, "grad_norm": 3.1496356897801183, "learning_rate": 1.8127163022475678e-08, "loss": 0.7491, "step": 8205 }, { "epoch": 0.8112102414551565, "grad_norm": 3.5359247867680454, "learning_rate": 1.81087812971776e-08, "loss": 0.6448, "step": 8206 }, { "epoch": 0.8113090971999111, "grad_norm": 4.708420359075111, "learning_rate": 1.809040796861604e-08, "loss": 0.7781, "step": 8207 }, { "epoch": 0.8114079529446655, "grad_norm": 3.415514179578994, "learning_rate": 1.8072043038674868e-08, "loss": 0.6977, "step": 8208 }, { "epoch": 0.8115068086894199, "grad_norm": 6.925374243937216, "learning_rate": 1.8053686509237154e-08, "loss": 0.7203, "step": 8209 }, { "epoch": 0.8116056644341745, "grad_norm": 6.911373662620026, "learning_rate": 1.8035338382185118e-08, "loss": 0.7213, "step": 8210 }, { "epoch": 0.8117045201789289, "grad_norm": 7.645235528449383, "learning_rate": 1.8016998659400073e-08, "loss": 0.629, "step": 8211 }, { "epoch": 0.8118033759236833, "grad_norm": 3.4842729849002185, "learning_rate": 1.7998667342762464e-08, "loss": 0.6894, "step": 8212 }, { "epoch": 0.8119022316684378, "grad_norm": 2.6477565026968275, "learning_rate": 1.798034443415193e-08, "loss": 0.6941, "step": 8213 }, { "epoch": 0.8120010874131923, "grad_norm": 4.224898656359294, "learning_rate": 1.7962029935447164e-08, "loss": 0.7863, "step": 8214 }, { "epoch": 0.8120999431579468, "grad_norm": 3.859186241181576, "learning_rate": 1.7943723848526104e-08, "loss": 0.6992, "step": 8215 }, { "epoch": 0.8121987989027012, "grad_norm": 4.863523930984556, "learning_rate": 1.7925426175265758e-08, "loss": 0.674, "step": 8216 }, { "epoch": 0.8122976546474557, "grad_norm": 3.3832295739100022, "learning_rate": 1.7907136917542253e-08, "loss": 0.6082, "step": 8217 }, { "epoch": 0.8123965103922102, "grad_norm": 11.726975414823167, "learning_rate": 1.788885607723091e-08, "loss": 0.577, "step": 8218 }, { "epoch": 0.8124953661369646, "grad_norm": 4.252033247864316, "learning_rate": 1.7870583656206163e-08, "loss": 0.6609, "step": 8219 }, { "epoch": 0.8125942218817191, "grad_norm": 6.70813638768911, "learning_rate": 1.785231965634154e-08, "loss": 0.6596, "step": 8220 }, { "epoch": 0.8126930776264736, "grad_norm": 4.11505250816376, "learning_rate": 1.7834064079509815e-08, "loss": 0.7548, "step": 8221 }, { "epoch": 0.812791933371228, "grad_norm": 4.520543684170736, "learning_rate": 1.781581692758275e-08, "loss": 0.7039, "step": 8222 }, { "epoch": 0.8128907891159824, "grad_norm": 2.9931530564415216, "learning_rate": 1.779757820243136e-08, "loss": 0.6816, "step": 8223 }, { "epoch": 0.812989644860737, "grad_norm": 3.009717395528903, "learning_rate": 1.777934790592578e-08, "loss": 0.7389, "step": 8224 }, { "epoch": 0.8130885006054914, "grad_norm": 6.244645921569853, "learning_rate": 1.7761126039935204e-08, "loss": 0.6736, "step": 8225 }, { "epoch": 0.8131873563502459, "grad_norm": 4.75447451498345, "learning_rate": 1.774291260632804e-08, "loss": 0.7427, "step": 8226 }, { "epoch": 0.8132862120950004, "grad_norm": 5.636699321971508, "learning_rate": 1.7724707606971846e-08, "loss": 0.7537, "step": 8227 }, { "epoch": 0.8133850678397548, "grad_norm": 3.122017304945608, "learning_rate": 1.770651104373323e-08, "loss": 0.6136, "step": 8228 }, { "epoch": 0.8134839235845093, "grad_norm": 5.078289691533819, "learning_rate": 1.7688322918477982e-08, "loss": 0.7013, "step": 8229 }, { "epoch": 0.8135827793292638, "grad_norm": 3.686192421778246, "learning_rate": 1.7670143233071043e-08, "loss": 0.6481, "step": 8230 }, { "epoch": 0.8136816350740183, "grad_norm": 4.731440608192188, "learning_rate": 1.765197198937646e-08, "loss": 0.7546, "step": 8231 }, { "epoch": 0.8137804908187727, "grad_norm": 4.4472733813265, "learning_rate": 1.763380918925744e-08, "loss": 0.605, "step": 8232 }, { "epoch": 0.8138793465635272, "grad_norm": 3.4940563734216274, "learning_rate": 1.7615654834576278e-08, "loss": 0.6619, "step": 8233 }, { "epoch": 0.8139782023082817, "grad_norm": 7.900678883149771, "learning_rate": 1.759750892719446e-08, "loss": 0.6459, "step": 8234 }, { "epoch": 0.8140770580530361, "grad_norm": 21.397274551094846, "learning_rate": 1.7579371468972603e-08, "loss": 0.6797, "step": 8235 }, { "epoch": 0.8141759137977905, "grad_norm": 7.528602648478857, "learning_rate": 1.7561242461770386e-08, "loss": 0.7786, "step": 8236 }, { "epoch": 0.8142747695425451, "grad_norm": 3.9842856462379084, "learning_rate": 1.75431219074467e-08, "loss": 0.6509, "step": 8237 }, { "epoch": 0.8143736252872995, "grad_norm": 3.201332350645333, "learning_rate": 1.752500980785957e-08, "loss": 0.7668, "step": 8238 }, { "epoch": 0.8144724810320539, "grad_norm": 4.524756083342483, "learning_rate": 1.7506906164866052e-08, "loss": 0.7083, "step": 8239 }, { "epoch": 0.8145713367768085, "grad_norm": 3.9634568385791717, "learning_rate": 1.7488810980322466e-08, "loss": 0.7308, "step": 8240 }, { "epoch": 0.8146701925215629, "grad_norm": 6.133669712075953, "learning_rate": 1.74707242560842e-08, "loss": 0.8066, "step": 8241 }, { "epoch": 0.8147690482663174, "grad_norm": 4.47751430175296, "learning_rate": 1.7452645994005744e-08, "loss": 0.7421, "step": 8242 }, { "epoch": 0.8148679040110719, "grad_norm": 5.1936843210131425, "learning_rate": 1.743457619594082e-08, "loss": 0.8076, "step": 8243 }, { "epoch": 0.8149667597558263, "grad_norm": 4.031384021215409, "learning_rate": 1.7416514863742138e-08, "loss": 0.756, "step": 8244 }, { "epoch": 0.8150656155005808, "grad_norm": 11.59836546382563, "learning_rate": 1.7398461999261705e-08, "loss": 0.8364, "step": 8245 }, { "epoch": 0.8151644712453352, "grad_norm": 4.29904940252144, "learning_rate": 1.7380417604350505e-08, "loss": 0.6336, "step": 8246 }, { "epoch": 0.8152633269900897, "grad_norm": 3.8169772336844954, "learning_rate": 1.7362381680858774e-08, "loss": 0.739, "step": 8247 }, { "epoch": 0.8153621827348442, "grad_norm": 6.61068990424553, "learning_rate": 1.7344354230635792e-08, "loss": 0.706, "step": 8248 }, { "epoch": 0.8154610384795986, "grad_norm": 4.646341179504866, "learning_rate": 1.7326335255530057e-08, "loss": 0.7144, "step": 8249 }, { "epoch": 0.8155598942243532, "grad_norm": 5.111739598268035, "learning_rate": 1.730832475738908e-08, "loss": 0.625, "step": 8250 }, { "epoch": 0.8156587499691076, "grad_norm": 4.674203153356449, "learning_rate": 1.7290322738059625e-08, "loss": 0.7047, "step": 8251 }, { "epoch": 0.815757605713862, "grad_norm": 2.7935455080211127, "learning_rate": 1.7272329199387535e-08, "loss": 0.5972, "step": 8252 }, { "epoch": 0.8158564614586166, "grad_norm": 3.3819912286115494, "learning_rate": 1.7254344143217748e-08, "loss": 0.7222, "step": 8253 }, { "epoch": 0.815955317203371, "grad_norm": 4.593922459927797, "learning_rate": 1.7236367571394418e-08, "loss": 0.656, "step": 8254 }, { "epoch": 0.8160541729481254, "grad_norm": 6.222984830531296, "learning_rate": 1.7218399485760704e-08, "loss": 0.7111, "step": 8255 }, { "epoch": 0.8161530286928799, "grad_norm": 3.58629320365369, "learning_rate": 1.7200439888159014e-08, "loss": 0.6331, "step": 8256 }, { "epoch": 0.8162518844376344, "grad_norm": 5.075534190745787, "learning_rate": 1.718248878043086e-08, "loss": 0.7241, "step": 8257 }, { "epoch": 0.8163507401823888, "grad_norm": 5.4473328675178205, "learning_rate": 1.7164546164416804e-08, "loss": 0.7723, "step": 8258 }, { "epoch": 0.8164495959271433, "grad_norm": 3.0122881662567416, "learning_rate": 1.7146612041956642e-08, "loss": 0.7104, "step": 8259 }, { "epoch": 0.8165484516718978, "grad_norm": 3.5677235976378423, "learning_rate": 1.712868641488926e-08, "loss": 0.7709, "step": 8260 }, { "epoch": 0.8166473074166523, "grad_norm": 4.699049151007746, "learning_rate": 1.711076928505262e-08, "loss": 0.5824, "step": 8261 }, { "epoch": 0.8167461631614067, "grad_norm": 8.601399317553106, "learning_rate": 1.7092860654283914e-08, "loss": 0.7879, "step": 8262 }, { "epoch": 0.8168450189061612, "grad_norm": 2.897728248092048, "learning_rate": 1.707496052441937e-08, "loss": 0.5682, "step": 8263 }, { "epoch": 0.8169438746509157, "grad_norm": 4.059608378273966, "learning_rate": 1.7057068897294404e-08, "loss": 0.691, "step": 8264 }, { "epoch": 0.8170427303956701, "grad_norm": 3.254323209476913, "learning_rate": 1.7039185774743536e-08, "loss": 0.7245, "step": 8265 }, { "epoch": 0.8171415861404245, "grad_norm": 4.164441407328137, "learning_rate": 1.7021311158600372e-08, "loss": 0.6555, "step": 8266 }, { "epoch": 0.8172404418851791, "grad_norm": 4.523652875761072, "learning_rate": 1.700344505069774e-08, "loss": 0.7107, "step": 8267 }, { "epoch": 0.8173392976299335, "grad_norm": 5.598328129388018, "learning_rate": 1.6985587452867556e-08, "loss": 0.6664, "step": 8268 }, { "epoch": 0.817438153374688, "grad_norm": 6.140245733243421, "learning_rate": 1.6967738366940797e-08, "loss": 0.7225, "step": 8269 }, { "epoch": 0.8175370091194425, "grad_norm": 4.567549527795139, "learning_rate": 1.6949897794747658e-08, "loss": 0.7086, "step": 8270 }, { "epoch": 0.8176358648641969, "grad_norm": 23.811263962604535, "learning_rate": 1.693206573811745e-08, "loss": 0.7385, "step": 8271 }, { "epoch": 0.8177347206089514, "grad_norm": 5.5288924747831425, "learning_rate": 1.6914242198878548e-08, "loss": 0.6963, "step": 8272 }, { "epoch": 0.8178335763537059, "grad_norm": 3.6813253789228373, "learning_rate": 1.6896427178858496e-08, "loss": 0.7195, "step": 8273 }, { "epoch": 0.8179324320984603, "grad_norm": 2.7909725424180114, "learning_rate": 1.6878620679884005e-08, "loss": 0.7113, "step": 8274 }, { "epoch": 0.8180312878432148, "grad_norm": 3.45831358542465, "learning_rate": 1.6860822703780807e-08, "loss": 0.6751, "step": 8275 }, { "epoch": 0.8181301435879692, "grad_norm": 4.054286922209515, "learning_rate": 1.6843033252373885e-08, "loss": 0.7092, "step": 8276 }, { "epoch": 0.8182289993327237, "grad_norm": 4.825193486260379, "learning_rate": 1.6825252327487215e-08, "loss": 0.7521, "step": 8277 }, { "epoch": 0.8183278550774782, "grad_norm": 4.001320099516522, "learning_rate": 1.680747993094401e-08, "loss": 0.7399, "step": 8278 }, { "epoch": 0.8184267108222326, "grad_norm": 5.0033345084824195, "learning_rate": 1.6789716064566584e-08, "loss": 0.7666, "step": 8279 }, { "epoch": 0.8185255665669872, "grad_norm": 5.559744975744832, "learning_rate": 1.6771960730176315e-08, "loss": 0.7571, "step": 8280 }, { "epoch": 0.8186244223117416, "grad_norm": 3.3275998935342797, "learning_rate": 1.6754213929593795e-08, "loss": 0.6216, "step": 8281 }, { "epoch": 0.818723278056496, "grad_norm": 5.6785801638946385, "learning_rate": 1.6736475664638683e-08, "loss": 0.6617, "step": 8282 }, { "epoch": 0.8188221338012506, "grad_norm": 4.92166843824104, "learning_rate": 1.671874593712973e-08, "loss": 0.6288, "step": 8283 }, { "epoch": 0.818920989546005, "grad_norm": 4.089705936751735, "learning_rate": 1.6701024748884907e-08, "loss": 0.5966, "step": 8284 }, { "epoch": 0.8190198452907594, "grad_norm": 5.712583100205691, "learning_rate": 1.668331210172127e-08, "loss": 0.7065, "step": 8285 }, { "epoch": 0.8191187010355139, "grad_norm": 7.664507508837457, "learning_rate": 1.6665607997454956e-08, "loss": 0.6599, "step": 8286 }, { "epoch": 0.8192175567802684, "grad_norm": 4.503768432042703, "learning_rate": 1.664791243790128e-08, "loss": 0.6524, "step": 8287 }, { "epoch": 0.8193164125250229, "grad_norm": 3.49595098592785, "learning_rate": 1.6630225424874644e-08, "loss": 0.7348, "step": 8288 }, { "epoch": 0.8194152682697773, "grad_norm": 11.670657499005333, "learning_rate": 1.661254696018859e-08, "loss": 0.661, "step": 8289 }, { "epoch": 0.8195141240145318, "grad_norm": 4.888578689794759, "learning_rate": 1.6594877045655818e-08, "loss": 0.8014, "step": 8290 }, { "epoch": 0.8196129797592863, "grad_norm": 8.49400272653561, "learning_rate": 1.6577215683088075e-08, "loss": 0.7111, "step": 8291 }, { "epoch": 0.8197118355040407, "grad_norm": 3.593352216267742, "learning_rate": 1.6559562874296284e-08, "loss": 0.7232, "step": 8292 }, { "epoch": 0.8198106912487952, "grad_norm": 5.004505436285176, "learning_rate": 1.6541918621090522e-08, "loss": 0.7659, "step": 8293 }, { "epoch": 0.8199095469935497, "grad_norm": 6.1196477879901465, "learning_rate": 1.6524282925279876e-08, "loss": 0.7334, "step": 8294 }, { "epoch": 0.8200084027383041, "grad_norm": 4.677397523924521, "learning_rate": 1.6506655788672674e-08, "loss": 0.613, "step": 8295 }, { "epoch": 0.8201072584830585, "grad_norm": 4.078897000627077, "learning_rate": 1.6489037213076318e-08, "loss": 0.6557, "step": 8296 }, { "epoch": 0.8202061142278131, "grad_norm": 4.7253345790109025, "learning_rate": 1.6471427200297305e-08, "loss": 0.7185, "step": 8297 }, { "epoch": 0.8203049699725675, "grad_norm": 3.7938464536301946, "learning_rate": 1.6453825752141325e-08, "loss": 0.6978, "step": 8298 }, { "epoch": 0.820403825717322, "grad_norm": 3.214738810396406, "learning_rate": 1.6436232870413125e-08, "loss": 0.6793, "step": 8299 }, { "epoch": 0.8205026814620765, "grad_norm": 4.968759587456315, "learning_rate": 1.641864855691656e-08, "loss": 0.6695, "step": 8300 }, { "epoch": 0.8206015372068309, "grad_norm": 2.928517898900211, "learning_rate": 1.6401072813454697e-08, "loss": 0.5499, "step": 8301 }, { "epoch": 0.8207003929515854, "grad_norm": 2.821924426880065, "learning_rate": 1.638350564182963e-08, "loss": 0.7541, "step": 8302 }, { "epoch": 0.8207992486963399, "grad_norm": 6.00140054162403, "learning_rate": 1.636594704384262e-08, "loss": 0.6658, "step": 8303 }, { "epoch": 0.8208981044410943, "grad_norm": 4.035540182030818, "learning_rate": 1.634839702129409e-08, "loss": 0.703, "step": 8304 }, { "epoch": 0.8209969601858488, "grad_norm": 2.605790675300495, "learning_rate": 1.633085557598346e-08, "loss": 0.6488, "step": 8305 }, { "epoch": 0.8210958159306032, "grad_norm": 4.763399059015804, "learning_rate": 1.6313322709709387e-08, "loss": 0.6941, "step": 8306 }, { "epoch": 0.8211946716753578, "grad_norm": 3.077832850955196, "learning_rate": 1.6295798424269636e-08, "loss": 0.7497, "step": 8307 }, { "epoch": 0.8212935274201122, "grad_norm": 4.63172512895805, "learning_rate": 1.6278282721461e-08, "loss": 0.8086, "step": 8308 }, { "epoch": 0.8213923831648666, "grad_norm": 3.491385239515395, "learning_rate": 1.6260775603079514e-08, "loss": 0.77, "step": 8309 }, { "epoch": 0.8214912389096212, "grad_norm": 3.0345941601615403, "learning_rate": 1.6243277070920223e-08, "loss": 0.6368, "step": 8310 }, { "epoch": 0.8215900946543756, "grad_norm": 5.550387968562197, "learning_rate": 1.622578712677738e-08, "loss": 0.7537, "step": 8311 }, { "epoch": 0.82168895039913, "grad_norm": 7.594189685548211, "learning_rate": 1.620830577244433e-08, "loss": 0.7361, "step": 8312 }, { "epoch": 0.8217878061438846, "grad_norm": 9.036354056799702, "learning_rate": 1.6190833009713478e-08, "loss": 0.6386, "step": 8313 }, { "epoch": 0.821886661888639, "grad_norm": 16.87266436104744, "learning_rate": 1.6173368840376433e-08, "loss": 0.6561, "step": 8314 }, { "epoch": 0.8219855176333934, "grad_norm": 4.265718735573408, "learning_rate": 1.615591326622391e-08, "loss": 0.6674, "step": 8315 }, { "epoch": 0.822084373378148, "grad_norm": 4.045967832081505, "learning_rate": 1.6138466289045694e-08, "loss": 0.7716, "step": 8316 }, { "epoch": 0.8221832291229024, "grad_norm": 6.411548971302622, "learning_rate": 1.612102791063069e-08, "loss": 0.6994, "step": 8317 }, { "epoch": 0.8222820848676569, "grad_norm": 4.005056481892624, "learning_rate": 1.6103598132766992e-08, "loss": 0.7419, "step": 8318 }, { "epoch": 0.8223809406124113, "grad_norm": 4.6074434271142435, "learning_rate": 1.6086176957241727e-08, "loss": 0.7675, "step": 8319 }, { "epoch": 0.8224797963571658, "grad_norm": 4.451022912902485, "learning_rate": 1.6068764385841226e-08, "loss": 0.6247, "step": 8320 }, { "epoch": 0.8225786521019203, "grad_norm": 4.578728126771792, "learning_rate": 1.6051360420350835e-08, "loss": 0.813, "step": 8321 }, { "epoch": 0.8226775078466747, "grad_norm": 4.320378280614996, "learning_rate": 1.603396506255511e-08, "loss": 0.6343, "step": 8322 }, { "epoch": 0.8227763635914292, "grad_norm": 7.475883019237751, "learning_rate": 1.6016578314237694e-08, "loss": 0.712, "step": 8323 }, { "epoch": 0.8228752193361837, "grad_norm": 3.4114281323850046, "learning_rate": 1.5999200177181317e-08, "loss": 0.7037, "step": 8324 }, { "epoch": 0.8229740750809381, "grad_norm": 4.527512916740641, "learning_rate": 1.598183065316785e-08, "loss": 0.6333, "step": 8325 }, { "epoch": 0.8230729308256927, "grad_norm": 17.07600038139287, "learning_rate": 1.5964469743978327e-08, "loss": 0.5808, "step": 8326 }, { "epoch": 0.8231717865704471, "grad_norm": 10.155003574549776, "learning_rate": 1.59471174513928e-08, "loss": 0.7144, "step": 8327 }, { "epoch": 0.8232706423152015, "grad_norm": 5.575450733186684, "learning_rate": 1.5929773777190503e-08, "loss": 0.7074, "step": 8328 }, { "epoch": 0.823369498059956, "grad_norm": 4.374997845633731, "learning_rate": 1.5912438723149814e-08, "loss": 0.6833, "step": 8329 }, { "epoch": 0.8234683538047105, "grad_norm": 3.7089454696365203, "learning_rate": 1.589511229104813e-08, "loss": 0.7028, "step": 8330 }, { "epoch": 0.8235672095494649, "grad_norm": 4.477289867045588, "learning_rate": 1.5877794482662067e-08, "loss": 0.6475, "step": 8331 }, { "epoch": 0.8236660652942194, "grad_norm": 19.03225158055497, "learning_rate": 1.5860485299767256e-08, "loss": 0.7136, "step": 8332 }, { "epoch": 0.8237649210389739, "grad_norm": 30.791059398983332, "learning_rate": 1.5843184744138572e-08, "loss": 0.7807, "step": 8333 }, { "epoch": 0.8238637767837284, "grad_norm": 5.085049640488362, "learning_rate": 1.582589281754986e-08, "loss": 0.6963, "step": 8334 }, { "epoch": 0.8239626325284828, "grad_norm": 3.545174176970526, "learning_rate": 1.580860952177421e-08, "loss": 0.7309, "step": 8335 }, { "epoch": 0.8240614882732373, "grad_norm": 4.005614185889951, "learning_rate": 1.5791334858583727e-08, "loss": 0.7004, "step": 8336 }, { "epoch": 0.8241603440179918, "grad_norm": 9.263872204438943, "learning_rate": 1.5774068829749698e-08, "loss": 0.6254, "step": 8337 }, { "epoch": 0.8242591997627462, "grad_norm": 12.32613124659327, "learning_rate": 1.575681143704247e-08, "loss": 0.7532, "step": 8338 }, { "epoch": 0.8243580555075006, "grad_norm": 7.958827132984919, "learning_rate": 1.5739562682231567e-08, "loss": 0.6851, "step": 8339 }, { "epoch": 0.8244569112522552, "grad_norm": 3.544651006025782, "learning_rate": 1.57223225670856e-08, "loss": 0.758, "step": 8340 }, { "epoch": 0.8245557669970096, "grad_norm": 3.289423408132008, "learning_rate": 1.5705091093372237e-08, "loss": 0.7944, "step": 8341 }, { "epoch": 0.824654622741764, "grad_norm": 2.700903622355619, "learning_rate": 1.5687868262858383e-08, "loss": 0.7648, "step": 8342 }, { "epoch": 0.8247534784865186, "grad_norm": 10.058877745505415, "learning_rate": 1.567065407730991e-08, "loss": 0.6756, "step": 8343 }, { "epoch": 0.824852334231273, "grad_norm": 3.944522090652046, "learning_rate": 1.565344853849193e-08, "loss": 0.6761, "step": 8344 }, { "epoch": 0.8249511899760275, "grad_norm": 3.1750631698464358, "learning_rate": 1.563625164816862e-08, "loss": 0.7694, "step": 8345 }, { "epoch": 0.825050045720782, "grad_norm": 3.686615443054929, "learning_rate": 1.561906340810323e-08, "loss": 0.6833, "step": 8346 }, { "epoch": 0.8251489014655364, "grad_norm": 4.448221986604397, "learning_rate": 1.560188382005817e-08, "loss": 0.5606, "step": 8347 }, { "epoch": 0.8252477572102909, "grad_norm": 17.426064647210772, "learning_rate": 1.5584712885794993e-08, "loss": 0.8086, "step": 8348 }, { "epoch": 0.8253466129550453, "grad_norm": 73.01560356175342, "learning_rate": 1.5567550607074277e-08, "loss": 0.7518, "step": 8349 }, { "epoch": 0.8254454686997998, "grad_norm": 9.990590851631255, "learning_rate": 1.55503969856558e-08, "loss": 0.7298, "step": 8350 }, { "epoch": 0.8255443244445543, "grad_norm": 5.0362370024237775, "learning_rate": 1.553325202329837e-08, "loss": 0.8066, "step": 8351 }, { "epoch": 0.8256431801893087, "grad_norm": 4.229847563330237, "learning_rate": 1.5516115721760002e-08, "loss": 0.7195, "step": 8352 }, { "epoch": 0.8257420359340633, "grad_norm": 3.9793701011856935, "learning_rate": 1.5498988082797737e-08, "loss": 0.7396, "step": 8353 }, { "epoch": 0.8258408916788177, "grad_norm": 3.4198261592247463, "learning_rate": 1.5481869108167746e-08, "loss": 0.7162, "step": 8354 }, { "epoch": 0.8259397474235721, "grad_norm": 7.051106559053553, "learning_rate": 1.5464758799625342e-08, "loss": 0.7242, "step": 8355 }, { "epoch": 0.8260386031683267, "grad_norm": 4.325617709016498, "learning_rate": 1.5447657158924964e-08, "loss": 0.6406, "step": 8356 }, { "epoch": 0.8261374589130811, "grad_norm": 3.0296782343908575, "learning_rate": 1.5430564187820094e-08, "loss": 0.7492, "step": 8357 }, { "epoch": 0.8262363146578355, "grad_norm": 4.138735893466102, "learning_rate": 1.5413479888063373e-08, "loss": 0.6108, "step": 8358 }, { "epoch": 0.82633517040259, "grad_norm": 6.1708453314380245, "learning_rate": 1.539640426140658e-08, "loss": 0.6979, "step": 8359 }, { "epoch": 0.8264340261473445, "grad_norm": 2.663752452815474, "learning_rate": 1.5379337309600515e-08, "loss": 0.5624, "step": 8360 }, { "epoch": 0.826532881892099, "grad_norm": 10.83706099551059, "learning_rate": 1.5362279034395154e-08, "loss": 0.7461, "step": 8361 }, { "epoch": 0.8266317376368534, "grad_norm": 5.688496479767749, "learning_rate": 1.534522943753962e-08, "loss": 0.7326, "step": 8362 }, { "epoch": 0.8267305933816079, "grad_norm": 3.7716473451494035, "learning_rate": 1.5328188520782038e-08, "loss": 0.7161, "step": 8363 }, { "epoch": 0.8268294491263624, "grad_norm": 6.747578791013639, "learning_rate": 1.5311156285869743e-08, "loss": 0.7032, "step": 8364 }, { "epoch": 0.8269283048711168, "grad_norm": 2.9329721875188666, "learning_rate": 1.5294132734549103e-08, "loss": 0.6501, "step": 8365 }, { "epoch": 0.8270271606158713, "grad_norm": 5.787881881999662, "learning_rate": 1.527711786856565e-08, "loss": 0.6189, "step": 8366 }, { "epoch": 0.8271260163606258, "grad_norm": 3.0423283381581516, "learning_rate": 1.5260111689664034e-08, "loss": 0.6611, "step": 8367 }, { "epoch": 0.8272248721053802, "grad_norm": 4.186371233228861, "learning_rate": 1.5243114199587938e-08, "loss": 0.7501, "step": 8368 }, { "epoch": 0.8273237278501346, "grad_norm": 4.020584042988217, "learning_rate": 1.5226125400080247e-08, "loss": 0.6927, "step": 8369 }, { "epoch": 0.8274225835948892, "grad_norm": 5.2396084939466, "learning_rate": 1.5209145292882907e-08, "loss": 0.6241, "step": 8370 }, { "epoch": 0.8275214393396436, "grad_norm": 3.176471290028604, "learning_rate": 1.5192173879736926e-08, "loss": 0.7076, "step": 8371 }, { "epoch": 0.827620295084398, "grad_norm": 2.983634030241156, "learning_rate": 1.517521116238252e-08, "loss": 0.5683, "step": 8372 }, { "epoch": 0.8277191508291526, "grad_norm": 3.6476545727788388, "learning_rate": 1.5158257142558972e-08, "loss": 0.7798, "step": 8373 }, { "epoch": 0.827818006573907, "grad_norm": 4.42617828954565, "learning_rate": 1.5141311822004632e-08, "loss": 0.7395, "step": 8374 }, { "epoch": 0.8279168623186615, "grad_norm": 7.300808652890452, "learning_rate": 1.512437520245704e-08, "loss": 0.7628, "step": 8375 }, { "epoch": 0.828015718063416, "grad_norm": 5.76394435579745, "learning_rate": 1.510744728565273e-08, "loss": 0.6916, "step": 8376 }, { "epoch": 0.8281145738081704, "grad_norm": 7.771273040953953, "learning_rate": 1.5090528073327458e-08, "loss": 0.7155, "step": 8377 }, { "epoch": 0.8282134295529249, "grad_norm": 6.975691809674111, "learning_rate": 1.5073617567216047e-08, "loss": 0.7576, "step": 8378 }, { "epoch": 0.8283122852976793, "grad_norm": 3.872725970533153, "learning_rate": 1.5056715769052387e-08, "loss": 0.8019, "step": 8379 }, { "epoch": 0.8284111410424339, "grad_norm": 3.9969505814254487, "learning_rate": 1.5039822680569513e-08, "loss": 0.6189, "step": 8380 }, { "epoch": 0.8285099967871883, "grad_norm": 3.2316004106672915, "learning_rate": 1.50229383034996e-08, "loss": 0.8106, "step": 8381 }, { "epoch": 0.8286088525319427, "grad_norm": 5.038566813300291, "learning_rate": 1.5006062639573848e-08, "loss": 0.6772, "step": 8382 }, { "epoch": 0.8287077082766973, "grad_norm": 5.589243653836192, "learning_rate": 1.4989195690522615e-08, "loss": 0.7383, "step": 8383 }, { "epoch": 0.8288065640214517, "grad_norm": 4.963385631749689, "learning_rate": 1.497233745807539e-08, "loss": 0.5963, "step": 8384 }, { "epoch": 0.8289054197662061, "grad_norm": 6.666860866649288, "learning_rate": 1.4955487943960688e-08, "loss": 0.5939, "step": 8385 }, { "epoch": 0.8290042755109607, "grad_norm": 6.69735952552464, "learning_rate": 1.4938647149906226e-08, "loss": 0.715, "step": 8386 }, { "epoch": 0.8291031312557151, "grad_norm": 3.9429314085133567, "learning_rate": 1.4921815077638756e-08, "loss": 0.6739, "step": 8387 }, { "epoch": 0.8292019870004695, "grad_norm": 17.843405463186627, "learning_rate": 1.4904991728884144e-08, "loss": 0.7374, "step": 8388 }, { "epoch": 0.8293008427452241, "grad_norm": 4.110535600608065, "learning_rate": 1.4888177105367406e-08, "loss": 0.6628, "step": 8389 }, { "epoch": 0.8293996984899785, "grad_norm": 3.6985991626488466, "learning_rate": 1.487137120881259e-08, "loss": 0.7026, "step": 8390 }, { "epoch": 0.829498554234733, "grad_norm": 7.590234269957879, "learning_rate": 1.4854574040942924e-08, "loss": 0.6785, "step": 8391 }, { "epoch": 0.8295974099794874, "grad_norm": 3.891130443134417, "learning_rate": 1.4837785603480734e-08, "loss": 0.6326, "step": 8392 }, { "epoch": 0.8296962657242419, "grad_norm": 4.210015559180781, "learning_rate": 1.4821005898147376e-08, "loss": 0.7574, "step": 8393 }, { "epoch": 0.8297951214689964, "grad_norm": 4.519118160840881, "learning_rate": 1.4804234926663383e-08, "loss": 0.6916, "step": 8394 }, { "epoch": 0.8298939772137508, "grad_norm": 3.2651750564988906, "learning_rate": 1.4787472690748392e-08, "loss": 0.7135, "step": 8395 }, { "epoch": 0.8299928329585053, "grad_norm": 13.11160032545962, "learning_rate": 1.4770719192121083e-08, "loss": 0.6197, "step": 8396 }, { "epoch": 0.8300916887032598, "grad_norm": 5.80355515277913, "learning_rate": 1.4753974432499327e-08, "loss": 0.6078, "step": 8397 }, { "epoch": 0.8301905444480142, "grad_norm": 3.4331403665267994, "learning_rate": 1.4737238413600005e-08, "loss": 0.7504, "step": 8398 }, { "epoch": 0.8302894001927688, "grad_norm": 6.265011323639912, "learning_rate": 1.4720511137139158e-08, "loss": 0.7032, "step": 8399 }, { "epoch": 0.8303882559375232, "grad_norm": 4.022749769966276, "learning_rate": 1.4703792604831966e-08, "loss": 0.7627, "step": 8400 }, { "epoch": 0.8304871116822776, "grad_norm": 4.426030682826308, "learning_rate": 1.4687082818392605e-08, "loss": 0.747, "step": 8401 }, { "epoch": 0.8305859674270321, "grad_norm": 3.3110957601359727, "learning_rate": 1.4670381779534446e-08, "loss": 0.6878, "step": 8402 }, { "epoch": 0.8306848231717866, "grad_norm": 5.862627453235098, "learning_rate": 1.4653689489969967e-08, "loss": 0.748, "step": 8403 }, { "epoch": 0.830783678916541, "grad_norm": 9.109290056583887, "learning_rate": 1.4637005951410676e-08, "loss": 0.7425, "step": 8404 }, { "epoch": 0.8308825346612955, "grad_norm": 6.373231684124402, "learning_rate": 1.4620331165567212e-08, "loss": 0.7299, "step": 8405 }, { "epoch": 0.83098139040605, "grad_norm": 5.007916551345476, "learning_rate": 1.460366513414938e-08, "loss": 0.7297, "step": 8406 }, { "epoch": 0.8310802461508044, "grad_norm": 3.4168897557232074, "learning_rate": 1.4587007858865964e-08, "loss": 0.6993, "step": 8407 }, { "epoch": 0.8311791018955589, "grad_norm": 3.24849974299769, "learning_rate": 1.4570359341424998e-08, "loss": 0.6877, "step": 8408 }, { "epoch": 0.8312779576403134, "grad_norm": 4.243389647969769, "learning_rate": 1.4553719583533475e-08, "loss": 0.76, "step": 8409 }, { "epoch": 0.8313768133850679, "grad_norm": 7.769668518136011, "learning_rate": 1.4537088586897583e-08, "loss": 0.7191, "step": 8410 }, { "epoch": 0.8314756691298223, "grad_norm": 3.3502197490534975, "learning_rate": 1.4520466353222627e-08, "loss": 0.674, "step": 8411 }, { "epoch": 0.8315745248745767, "grad_norm": 3.8990861851489402, "learning_rate": 1.4503852884212897e-08, "loss": 0.738, "step": 8412 }, { "epoch": 0.8316733806193313, "grad_norm": 6.440867400141936, "learning_rate": 1.4487248181571909e-08, "loss": 0.7199, "step": 8413 }, { "epoch": 0.8317722363640857, "grad_norm": 5.954434549071798, "learning_rate": 1.4470652247002235e-08, "loss": 0.5838, "step": 8414 }, { "epoch": 0.8318710921088401, "grad_norm": 9.354101418231958, "learning_rate": 1.4454065082205503e-08, "loss": 0.6712, "step": 8415 }, { "epoch": 0.8319699478535947, "grad_norm": 3.253206329678569, "learning_rate": 1.4437486688882506e-08, "loss": 0.7023, "step": 8416 }, { "epoch": 0.8320688035983491, "grad_norm": 4.183063972718322, "learning_rate": 1.4420917068733141e-08, "loss": 0.6843, "step": 8417 }, { "epoch": 0.8321676593431035, "grad_norm": 19.711341637973977, "learning_rate": 1.4404356223456326e-08, "loss": 0.7626, "step": 8418 }, { "epoch": 0.8322665150878581, "grad_norm": 5.2312665384497, "learning_rate": 1.4387804154750171e-08, "loss": 0.6991, "step": 8419 }, { "epoch": 0.8323653708326125, "grad_norm": 4.398129961166469, "learning_rate": 1.4371260864311817e-08, "loss": 0.6639, "step": 8420 }, { "epoch": 0.832464226577367, "grad_norm": 3.4987046129129, "learning_rate": 1.4354726353837565e-08, "loss": 0.6492, "step": 8421 }, { "epoch": 0.8325630823221214, "grad_norm": 4.032073529190712, "learning_rate": 1.4338200625022756e-08, "loss": 0.7109, "step": 8422 }, { "epoch": 0.8326619380668759, "grad_norm": 2.772261643102233, "learning_rate": 1.432168367956188e-08, "loss": 0.6592, "step": 8423 }, { "epoch": 0.8327607938116304, "grad_norm": 5.289960931090339, "learning_rate": 1.4305175519148483e-08, "loss": 0.7121, "step": 8424 }, { "epoch": 0.8328596495563848, "grad_norm": 4.8570080365121475, "learning_rate": 1.4288676145475276e-08, "loss": 0.7978, "step": 8425 }, { "epoch": 0.8329585053011394, "grad_norm": 5.5494524855144975, "learning_rate": 1.4272185560233974e-08, "loss": 0.7546, "step": 8426 }, { "epoch": 0.8330573610458938, "grad_norm": 3.4205557381827205, "learning_rate": 1.4255703765115468e-08, "loss": 0.6435, "step": 8427 }, { "epoch": 0.8331562167906482, "grad_norm": 3.435215908598894, "learning_rate": 1.423923076180974e-08, "loss": 0.7006, "step": 8428 }, { "epoch": 0.8332550725354028, "grad_norm": 5.20846584385242, "learning_rate": 1.4222766552005827e-08, "loss": 0.6863, "step": 8429 }, { "epoch": 0.8333539282801572, "grad_norm": 3.0200734504686393, "learning_rate": 1.4206311137391914e-08, "loss": 0.5699, "step": 8430 }, { "epoch": 0.8334527840249116, "grad_norm": 4.579210645914143, "learning_rate": 1.4189864519655226e-08, "loss": 0.7581, "step": 8431 }, { "epoch": 0.8335516397696661, "grad_norm": 3.166561061995313, "learning_rate": 1.417342670048215e-08, "loss": 0.5814, "step": 8432 }, { "epoch": 0.8336504955144206, "grad_norm": 8.14588502728149, "learning_rate": 1.4156997681558147e-08, "loss": 0.7271, "step": 8433 }, { "epoch": 0.833749351259175, "grad_norm": 3.1790204380794087, "learning_rate": 1.4140577464567738e-08, "loss": 0.6831, "step": 8434 }, { "epoch": 0.8338482070039295, "grad_norm": 4.382773083521978, "learning_rate": 1.4124166051194608e-08, "loss": 0.6821, "step": 8435 }, { "epoch": 0.833947062748684, "grad_norm": 4.101192551030484, "learning_rate": 1.4107763443121501e-08, "loss": 0.5987, "step": 8436 }, { "epoch": 0.8340459184934385, "grad_norm": 3.8061989433986, "learning_rate": 1.4091369642030227e-08, "loss": 0.728, "step": 8437 }, { "epoch": 0.8341447742381929, "grad_norm": 22.40776214194871, "learning_rate": 1.4074984649601785e-08, "loss": 0.6833, "step": 8438 }, { "epoch": 0.8342436299829474, "grad_norm": 3.9745329802884553, "learning_rate": 1.4058608467516164e-08, "loss": 0.6765, "step": 8439 }, { "epoch": 0.8343424857277019, "grad_norm": 4.499406959795461, "learning_rate": 1.4042241097452534e-08, "loss": 0.6447, "step": 8440 }, { "epoch": 0.8344413414724563, "grad_norm": 2.6459490105881374, "learning_rate": 1.4025882541089118e-08, "loss": 0.7376, "step": 8441 }, { "epoch": 0.8345401972172107, "grad_norm": 3.4076972058650727, "learning_rate": 1.4009532800103218e-08, "loss": 0.6952, "step": 8442 }, { "epoch": 0.8346390529619653, "grad_norm": 2.70093654578028, "learning_rate": 1.3993191876171284e-08, "loss": 0.6357, "step": 8443 }, { "epoch": 0.8347379087067197, "grad_norm": 3.2404982251436962, "learning_rate": 1.3976859770968852e-08, "loss": 0.662, "step": 8444 }, { "epoch": 0.8348367644514741, "grad_norm": 3.5799945216846085, "learning_rate": 1.3960536486170493e-08, "loss": 0.6198, "step": 8445 }, { "epoch": 0.8349356201962287, "grad_norm": 4.776214668550081, "learning_rate": 1.3944222023449959e-08, "loss": 0.6956, "step": 8446 }, { "epoch": 0.8350344759409831, "grad_norm": 4.0648306536869265, "learning_rate": 1.3927916384480054e-08, "loss": 0.7457, "step": 8447 }, { "epoch": 0.8351333316857376, "grad_norm": 5.449152846135534, "learning_rate": 1.3911619570932654e-08, "loss": 0.6345, "step": 8448 }, { "epoch": 0.8352321874304921, "grad_norm": 7.983470980996166, "learning_rate": 1.3895331584478776e-08, "loss": 0.5954, "step": 8449 }, { "epoch": 0.8353310431752465, "grad_norm": 6.242531433773275, "learning_rate": 1.3879052426788529e-08, "loss": 0.6594, "step": 8450 }, { "epoch": 0.835429898920001, "grad_norm": 4.095958505138432, "learning_rate": 1.3862782099531056e-08, "loss": 0.7421, "step": 8451 }, { "epoch": 0.8355287546647554, "grad_norm": 6.12624439355586, "learning_rate": 1.3846520604374701e-08, "loss": 0.8037, "step": 8452 }, { "epoch": 0.83562761040951, "grad_norm": 3.674892353793679, "learning_rate": 1.3830267942986773e-08, "loss": 0.7862, "step": 8453 }, { "epoch": 0.8357264661542644, "grad_norm": 2.662325493976984, "learning_rate": 1.3814024117033773e-08, "loss": 0.6265, "step": 8454 }, { "epoch": 0.8358253218990188, "grad_norm": 4.339811366529225, "learning_rate": 1.3797789128181292e-08, "loss": 0.6834, "step": 8455 }, { "epoch": 0.8359241776437734, "grad_norm": 6.49488273797302, "learning_rate": 1.3781562978093942e-08, "loss": 0.6891, "step": 8456 }, { "epoch": 0.8360230333885278, "grad_norm": 3.3915200866585966, "learning_rate": 1.3765345668435525e-08, "loss": 0.657, "step": 8457 }, { "epoch": 0.8361218891332822, "grad_norm": 2.9120639006544327, "learning_rate": 1.3749137200868843e-08, "loss": 0.6626, "step": 8458 }, { "epoch": 0.8362207448780368, "grad_norm": 4.769168923177625, "learning_rate": 1.3732937577055847e-08, "loss": 0.8156, "step": 8459 }, { "epoch": 0.8363196006227912, "grad_norm": 3.625457161607391, "learning_rate": 1.3716746798657564e-08, "loss": 0.6153, "step": 8460 }, { "epoch": 0.8364184563675456, "grad_norm": 6.365248432688433, "learning_rate": 1.3700564867334153e-08, "loss": 0.7382, "step": 8461 }, { "epoch": 0.8365173121123001, "grad_norm": 3.3760188542107685, "learning_rate": 1.3684391784744787e-08, "loss": 0.6985, "step": 8462 }, { "epoch": 0.8366161678570546, "grad_norm": 3.343502248958111, "learning_rate": 1.3668227552547818e-08, "loss": 0.661, "step": 8463 }, { "epoch": 0.836715023601809, "grad_norm": 3.6543215439529813, "learning_rate": 1.3652072172400608e-08, "loss": 0.6541, "step": 8464 }, { "epoch": 0.8368138793465635, "grad_norm": 5.145635721245211, "learning_rate": 1.3635925645959678e-08, "loss": 0.6433, "step": 8465 }, { "epoch": 0.836912735091318, "grad_norm": 3.1973631932779685, "learning_rate": 1.3619787974880637e-08, "loss": 0.7136, "step": 8466 }, { "epoch": 0.8370115908360725, "grad_norm": 4.698718547369085, "learning_rate": 1.3603659160818126e-08, "loss": 0.8074, "step": 8467 }, { "epoch": 0.8371104465808269, "grad_norm": 3.8499600304409385, "learning_rate": 1.3587539205425924e-08, "loss": 0.733, "step": 8468 }, { "epoch": 0.8372093023255814, "grad_norm": 4.45241522097495, "learning_rate": 1.3571428110356919e-08, "loss": 0.652, "step": 8469 }, { "epoch": 0.8373081580703359, "grad_norm": 3.506235294495939, "learning_rate": 1.3555325877263036e-08, "loss": 0.7324, "step": 8470 }, { "epoch": 0.8374070138150903, "grad_norm": 4.830960847165661, "learning_rate": 1.3539232507795328e-08, "loss": 0.6316, "step": 8471 }, { "epoch": 0.8375058695598449, "grad_norm": 3.8574745584025547, "learning_rate": 1.3523148003603979e-08, "loss": 0.5847, "step": 8472 }, { "epoch": 0.8376047253045993, "grad_norm": 5.205343142494865, "learning_rate": 1.3507072366338146e-08, "loss": 0.7532, "step": 8473 }, { "epoch": 0.8377035810493537, "grad_norm": 3.9034086923723796, "learning_rate": 1.34910055976462e-08, "loss": 0.7088, "step": 8474 }, { "epoch": 0.8378024367941082, "grad_norm": 17.015037699502475, "learning_rate": 1.3474947699175532e-08, "loss": 0.6922, "step": 8475 }, { "epoch": 0.8379012925388627, "grad_norm": 3.5478827209781145, "learning_rate": 1.3458898672572627e-08, "loss": 0.7484, "step": 8476 }, { "epoch": 0.8380001482836171, "grad_norm": 6.2084265076394765, "learning_rate": 1.3442858519483114e-08, "loss": 0.8831, "step": 8477 }, { "epoch": 0.8380990040283716, "grad_norm": 4.606604779187436, "learning_rate": 1.3426827241551631e-08, "loss": 0.712, "step": 8478 }, { "epoch": 0.8381978597731261, "grad_norm": 7.05882036168716, "learning_rate": 1.3410804840421963e-08, "loss": 0.7525, "step": 8479 }, { "epoch": 0.8382967155178805, "grad_norm": 3.7082707181826007, "learning_rate": 1.3394791317737009e-08, "loss": 0.7107, "step": 8480 }, { "epoch": 0.838395571262635, "grad_norm": 3.939137037292204, "learning_rate": 1.3378786675138653e-08, "loss": 0.6796, "step": 8481 }, { "epoch": 0.8384944270073895, "grad_norm": 3.379979743229636, "learning_rate": 1.3362790914267985e-08, "loss": 0.7316, "step": 8482 }, { "epoch": 0.838593282752144, "grad_norm": 3.4161665274230186, "learning_rate": 1.3346804036765137e-08, "loss": 0.739, "step": 8483 }, { "epoch": 0.8386921384968984, "grad_norm": 5.53846879180623, "learning_rate": 1.3330826044269295e-08, "loss": 0.6174, "step": 8484 }, { "epoch": 0.8387909942416528, "grad_norm": 2.9454603068462815, "learning_rate": 1.3314856938418795e-08, "loss": 0.7308, "step": 8485 }, { "epoch": 0.8388898499864074, "grad_norm": 5.081424152232738, "learning_rate": 1.3298896720850994e-08, "loss": 0.7607, "step": 8486 }, { "epoch": 0.8389887057311618, "grad_norm": 3.9489857938144985, "learning_rate": 1.3282945393202416e-08, "loss": 0.6531, "step": 8487 }, { "epoch": 0.8390875614759162, "grad_norm": 4.270815222393693, "learning_rate": 1.326700295710863e-08, "loss": 0.7123, "step": 8488 }, { "epoch": 0.8391864172206708, "grad_norm": 4.275710869665043, "learning_rate": 1.3251069414204274e-08, "loss": 0.7499, "step": 8489 }, { "epoch": 0.8392852729654252, "grad_norm": 4.538386486686956, "learning_rate": 1.3235144766123118e-08, "loss": 0.7852, "step": 8490 }, { "epoch": 0.8393841287101796, "grad_norm": 8.379906984987116, "learning_rate": 1.3219229014498012e-08, "loss": 0.7224, "step": 8491 }, { "epoch": 0.8394829844549342, "grad_norm": 5.512766459256944, "learning_rate": 1.3203322160960861e-08, "loss": 0.6629, "step": 8492 }, { "epoch": 0.8395818401996886, "grad_norm": 4.580272921466746, "learning_rate": 1.318742420714266e-08, "loss": 0.8011, "step": 8493 }, { "epoch": 0.8396806959444431, "grad_norm": 3.270600316469669, "learning_rate": 1.3171535154673553e-08, "loss": 0.6766, "step": 8494 }, { "epoch": 0.8397795516891975, "grad_norm": 4.590350668459391, "learning_rate": 1.3155655005182687e-08, "loss": 0.7174, "step": 8495 }, { "epoch": 0.839878407433952, "grad_norm": 4.540651697909307, "learning_rate": 1.3139783760298372e-08, "loss": 0.752, "step": 8496 }, { "epoch": 0.8399772631787065, "grad_norm": 4.947158340172083, "learning_rate": 1.3123921421647944e-08, "loss": 0.7698, "step": 8497 }, { "epoch": 0.8400761189234609, "grad_norm": 3.8723219195540612, "learning_rate": 1.3108067990857852e-08, "loss": 0.6403, "step": 8498 }, { "epoch": 0.8401749746682154, "grad_norm": 7.316772989313302, "learning_rate": 1.3092223469553665e-08, "loss": 0.7771, "step": 8499 }, { "epoch": 0.8402738304129699, "grad_norm": 4.793960901709731, "learning_rate": 1.307638785935996e-08, "loss": 0.7454, "step": 8500 }, { "epoch": 0.8403726861577243, "grad_norm": 4.996272945539198, "learning_rate": 1.3060561161900462e-08, "loss": 0.7175, "step": 8501 }, { "epoch": 0.8404715419024789, "grad_norm": 5.955794121651958, "learning_rate": 1.3044743378798007e-08, "loss": 0.7072, "step": 8502 }, { "epoch": 0.8405703976472333, "grad_norm": 5.220372225578878, "learning_rate": 1.3028934511674405e-08, "loss": 0.5918, "step": 8503 }, { "epoch": 0.8406692533919877, "grad_norm": 5.654852020065491, "learning_rate": 1.3013134562150652e-08, "loss": 0.7096, "step": 8504 }, { "epoch": 0.8407681091367422, "grad_norm": 3.3924819294309105, "learning_rate": 1.2997343531846827e-08, "loss": 0.7123, "step": 8505 }, { "epoch": 0.8408669648814967, "grad_norm": 4.1706080685615765, "learning_rate": 1.2981561422382026e-08, "loss": 0.6254, "step": 8506 }, { "epoch": 0.8409658206262511, "grad_norm": 4.823449455314663, "learning_rate": 1.2965788235374508e-08, "loss": 0.7037, "step": 8507 }, { "epoch": 0.8410646763710056, "grad_norm": 3.764854631961377, "learning_rate": 1.2950023972441538e-08, "loss": 0.7639, "step": 8508 }, { "epoch": 0.8411635321157601, "grad_norm": 4.163932951807543, "learning_rate": 1.2934268635199552e-08, "loss": 0.5572, "step": 8509 }, { "epoch": 0.8412623878605145, "grad_norm": 3.1377234721187564, "learning_rate": 1.2918522225263995e-08, "loss": 0.8255, "step": 8510 }, { "epoch": 0.841361243605269, "grad_norm": 4.194937521687677, "learning_rate": 1.2902784744249462e-08, "loss": 0.7952, "step": 8511 }, { "epoch": 0.8414600993500235, "grad_norm": 2.8859037235084193, "learning_rate": 1.2887056193769553e-08, "loss": 0.7061, "step": 8512 }, { "epoch": 0.841558955094778, "grad_norm": 4.0230617530772, "learning_rate": 1.2871336575437042e-08, "loss": 0.634, "step": 8513 }, { "epoch": 0.8416578108395324, "grad_norm": 4.350890707939826, "learning_rate": 1.2855625890863708e-08, "loss": 0.7305, "step": 8514 }, { "epoch": 0.8417566665842868, "grad_norm": 4.279562089389925, "learning_rate": 1.2839924141660474e-08, "loss": 0.7091, "step": 8515 }, { "epoch": 0.8418555223290414, "grad_norm": 3.094083782710582, "learning_rate": 1.282423132943734e-08, "loss": 0.7007, "step": 8516 }, { "epoch": 0.8419543780737958, "grad_norm": 10.00209648917891, "learning_rate": 1.280854745580332e-08, "loss": 0.7592, "step": 8517 }, { "epoch": 0.8420532338185502, "grad_norm": 4.691680850694479, "learning_rate": 1.2792872522366626e-08, "loss": 0.6592, "step": 8518 }, { "epoch": 0.8421520895633048, "grad_norm": 4.4568290333283835, "learning_rate": 1.277720653073443e-08, "loss": 0.8316, "step": 8519 }, { "epoch": 0.8422509453080592, "grad_norm": 4.839182414328947, "learning_rate": 1.2761549482513078e-08, "loss": 0.7016, "step": 8520 }, { "epoch": 0.8423498010528137, "grad_norm": 3.4799222506666845, "learning_rate": 1.2745901379307988e-08, "loss": 0.5869, "step": 8521 }, { "epoch": 0.8424486567975682, "grad_norm": 57.20596102085202, "learning_rate": 1.2730262222723609e-08, "loss": 0.7077, "step": 8522 }, { "epoch": 0.8425475125423226, "grad_norm": 7.221651336927589, "learning_rate": 1.2714632014363513e-08, "loss": 0.696, "step": 8523 }, { "epoch": 0.8426463682870771, "grad_norm": 4.122182628922068, "learning_rate": 1.2699010755830385e-08, "loss": 0.648, "step": 8524 }, { "epoch": 0.8427452240318315, "grad_norm": 4.925281220369655, "learning_rate": 1.2683398448725891e-08, "loss": 0.7535, "step": 8525 }, { "epoch": 0.842844079776586, "grad_norm": 6.640910939568281, "learning_rate": 1.2667795094650901e-08, "loss": 0.6975, "step": 8526 }, { "epoch": 0.8429429355213405, "grad_norm": 5.006651811746319, "learning_rate": 1.2652200695205273e-08, "loss": 0.7749, "step": 8527 }, { "epoch": 0.8430417912660949, "grad_norm": 5.701928785931822, "learning_rate": 1.2636615251988003e-08, "loss": 0.793, "step": 8528 }, { "epoch": 0.8431406470108495, "grad_norm": 4.999523507117406, "learning_rate": 1.2621038766597137e-08, "loss": 0.7035, "step": 8529 }, { "epoch": 0.8432395027556039, "grad_norm": 3.2234117566514855, "learning_rate": 1.2605471240629796e-08, "loss": 0.6736, "step": 8530 }, { "epoch": 0.8433383585003583, "grad_norm": 4.620812787634261, "learning_rate": 1.2589912675682224e-08, "loss": 0.6973, "step": 8531 }, { "epoch": 0.8434372142451129, "grad_norm": 4.466336685202424, "learning_rate": 1.2574363073349737e-08, "loss": 0.6481, "step": 8532 }, { "epoch": 0.8435360699898673, "grad_norm": 5.167082200396749, "learning_rate": 1.2558822435226667e-08, "loss": 0.7195, "step": 8533 }, { "epoch": 0.8436349257346217, "grad_norm": 4.798241891567873, "learning_rate": 1.2543290762906522e-08, "loss": 0.6207, "step": 8534 }, { "epoch": 0.8437337814793762, "grad_norm": 3.373334173747111, "learning_rate": 1.2527768057981847e-08, "loss": 0.7724, "step": 8535 }, { "epoch": 0.8438326372241307, "grad_norm": 5.255831231422743, "learning_rate": 1.2512254322044224e-08, "loss": 0.6442, "step": 8536 }, { "epoch": 0.8439314929688851, "grad_norm": 4.3807112490341735, "learning_rate": 1.2496749556684393e-08, "loss": 0.6671, "step": 8537 }, { "epoch": 0.8440303487136396, "grad_norm": 3.6376313653245447, "learning_rate": 1.248125376349215e-08, "loss": 0.6761, "step": 8538 }, { "epoch": 0.8441292044583941, "grad_norm": 8.431564354772107, "learning_rate": 1.2465766944056322e-08, "loss": 0.6509, "step": 8539 }, { "epoch": 0.8442280602031486, "grad_norm": 3.3603642987362212, "learning_rate": 1.2450289099964895e-08, "loss": 0.715, "step": 8540 }, { "epoch": 0.844326915947903, "grad_norm": 3.804290592171711, "learning_rate": 1.2434820232804844e-08, "loss": 0.71, "step": 8541 }, { "epoch": 0.8444257716926575, "grad_norm": 4.205808905865243, "learning_rate": 1.241936034416231e-08, "loss": 0.7089, "step": 8542 }, { "epoch": 0.844524627437412, "grad_norm": 3.031134677061193, "learning_rate": 1.2403909435622484e-08, "loss": 0.7164, "step": 8543 }, { "epoch": 0.8446234831821664, "grad_norm": 49.90387611291004, "learning_rate": 1.2388467508769595e-08, "loss": 0.6565, "step": 8544 }, { "epoch": 0.844722338926921, "grad_norm": 13.074588240419608, "learning_rate": 1.2373034565187013e-08, "loss": 0.7075, "step": 8545 }, { "epoch": 0.8448211946716754, "grad_norm": 3.321670878155703, "learning_rate": 1.2357610606457158e-08, "loss": 0.6846, "step": 8546 }, { "epoch": 0.8449200504164298, "grad_norm": 3.6414581273701243, "learning_rate": 1.2342195634161489e-08, "loss": 0.7506, "step": 8547 }, { "epoch": 0.8450189061611842, "grad_norm": 2.9716924480151454, "learning_rate": 1.232678964988062e-08, "loss": 0.7168, "step": 8548 }, { "epoch": 0.8451177619059388, "grad_norm": 6.9095744220983555, "learning_rate": 1.231139265519422e-08, "loss": 0.7318, "step": 8549 }, { "epoch": 0.8452166176506932, "grad_norm": 7.8671277712383025, "learning_rate": 1.2296004651680992e-08, "loss": 0.6489, "step": 8550 }, { "epoch": 0.8453154733954477, "grad_norm": 13.662821910573875, "learning_rate": 1.2280625640918774e-08, "loss": 0.6608, "step": 8551 }, { "epoch": 0.8454143291402022, "grad_norm": 15.794476588832703, "learning_rate": 1.2265255624484427e-08, "loss": 0.677, "step": 8552 }, { "epoch": 0.8455131848849566, "grad_norm": 6.246189058887154, "learning_rate": 1.2249894603953937e-08, "loss": 0.7408, "step": 8553 }, { "epoch": 0.8456120406297111, "grad_norm": 3.553400025242204, "learning_rate": 1.2234542580902373e-08, "loss": 0.7701, "step": 8554 }, { "epoch": 0.8457108963744656, "grad_norm": 3.4843861516162917, "learning_rate": 1.2219199556903815e-08, "loss": 0.6451, "step": 8555 }, { "epoch": 0.84580975211922, "grad_norm": 3.5882770897965024, "learning_rate": 1.2203865533531477e-08, "loss": 0.6034, "step": 8556 }, { "epoch": 0.8459086078639745, "grad_norm": 3.4719375369650294, "learning_rate": 1.218854051235767e-08, "loss": 0.7745, "step": 8557 }, { "epoch": 0.8460074636087289, "grad_norm": 3.962421733725962, "learning_rate": 1.2173224494953705e-08, "loss": 0.7876, "step": 8558 }, { "epoch": 0.8461063193534835, "grad_norm": 8.44009361247056, "learning_rate": 1.2157917482890023e-08, "loss": 0.6846, "step": 8559 }, { "epoch": 0.8462051750982379, "grad_norm": 3.4554090703811045, "learning_rate": 1.2142619477736172e-08, "loss": 0.6856, "step": 8560 }, { "epoch": 0.8463040308429923, "grad_norm": 3.931842630883542, "learning_rate": 1.2127330481060682e-08, "loss": 0.767, "step": 8561 }, { "epoch": 0.8464028865877469, "grad_norm": 3.0103973945070504, "learning_rate": 1.2112050494431259e-08, "loss": 0.7861, "step": 8562 }, { "epoch": 0.8465017423325013, "grad_norm": 3.782932433127624, "learning_rate": 1.2096779519414624e-08, "loss": 0.8176, "step": 8563 }, { "epoch": 0.8466005980772557, "grad_norm": 8.897153196433312, "learning_rate": 1.2081517557576559e-08, "loss": 0.6339, "step": 8564 }, { "epoch": 0.8466994538220103, "grad_norm": 20.42593798197291, "learning_rate": 1.2066264610482002e-08, "loss": 0.7768, "step": 8565 }, { "epoch": 0.8467983095667647, "grad_norm": 14.78339119240879, "learning_rate": 1.2051020679694868e-08, "loss": 0.6685, "step": 8566 }, { "epoch": 0.8468971653115192, "grad_norm": 4.5203314724947194, "learning_rate": 1.2035785766778238e-08, "loss": 0.7724, "step": 8567 }, { "epoch": 0.8469960210562736, "grad_norm": 3.473537978596026, "learning_rate": 1.2020559873294223e-08, "loss": 0.6616, "step": 8568 }, { "epoch": 0.8470948768010281, "grad_norm": 7.097008051439177, "learning_rate": 1.200534300080399e-08, "loss": 0.7305, "step": 8569 }, { "epoch": 0.8471937325457826, "grad_norm": 14.791652840034502, "learning_rate": 1.199013515086782e-08, "loss": 0.7885, "step": 8570 }, { "epoch": 0.847292588290537, "grad_norm": 3.176442937947911, "learning_rate": 1.1974936325045071e-08, "loss": 0.73, "step": 8571 }, { "epoch": 0.8473914440352915, "grad_norm": 5.630353787687965, "learning_rate": 1.195974652489412e-08, "loss": 0.7074, "step": 8572 }, { "epoch": 0.847490299780046, "grad_norm": 4.857311683132986, "learning_rate": 1.1944565751972502e-08, "loss": 0.6683, "step": 8573 }, { "epoch": 0.8475891555248004, "grad_norm": 3.730590729990339, "learning_rate": 1.1929394007836734e-08, "loss": 0.6062, "step": 8574 }, { "epoch": 0.847688011269555, "grad_norm": 5.1433800661819165, "learning_rate": 1.1914231294042476e-08, "loss": 0.7404, "step": 8575 }, { "epoch": 0.8477868670143094, "grad_norm": 3.644935348443862, "learning_rate": 1.189907761214447e-08, "loss": 0.6446, "step": 8576 }, { "epoch": 0.8478857227590638, "grad_norm": 3.828507428914757, "learning_rate": 1.1883932963696452e-08, "loss": 0.6128, "step": 8577 }, { "epoch": 0.8479845785038183, "grad_norm": 3.8150320860946976, "learning_rate": 1.1868797350251313e-08, "loss": 0.7264, "step": 8578 }, { "epoch": 0.8480834342485728, "grad_norm": 3.1862804013382777, "learning_rate": 1.1853670773360968e-08, "loss": 0.7476, "step": 8579 }, { "epoch": 0.8481822899933272, "grad_norm": 2.9545089345263453, "learning_rate": 1.1838553234576465e-08, "loss": 0.743, "step": 8580 }, { "epoch": 0.8482811457380817, "grad_norm": 3.72739591912932, "learning_rate": 1.1823444735447818e-08, "loss": 0.5657, "step": 8581 }, { "epoch": 0.8483800014828362, "grad_norm": 3.220837083445289, "learning_rate": 1.1808345277524257e-08, "loss": 0.6392, "step": 8582 }, { "epoch": 0.8484788572275906, "grad_norm": 3.773286984151342, "learning_rate": 1.179325486235393e-08, "loss": 0.6494, "step": 8583 }, { "epoch": 0.8485777129723451, "grad_norm": 4.925270975620759, "learning_rate": 1.177817349148421e-08, "loss": 0.6951, "step": 8584 }, { "epoch": 0.8486765687170996, "grad_norm": 3.0739840171724744, "learning_rate": 1.176310116646141e-08, "loss": 0.6527, "step": 8585 }, { "epoch": 0.8487754244618541, "grad_norm": 4.418622425277918, "learning_rate": 1.1748037888830998e-08, "loss": 0.6535, "step": 8586 }, { "epoch": 0.8488742802066085, "grad_norm": 3.5878343045202077, "learning_rate": 1.1732983660137508e-08, "loss": 0.7181, "step": 8587 }, { "epoch": 0.8489731359513629, "grad_norm": 3.324750742044803, "learning_rate": 1.1717938481924494e-08, "loss": 0.8038, "step": 8588 }, { "epoch": 0.8490719916961175, "grad_norm": 6.758118662821149, "learning_rate": 1.1702902355734645e-08, "loss": 0.7342, "step": 8589 }, { "epoch": 0.8491708474408719, "grad_norm": 4.233508942378558, "learning_rate": 1.1687875283109705e-08, "loss": 0.6805, "step": 8590 }, { "epoch": 0.8492697031856263, "grad_norm": 4.846403369794062, "learning_rate": 1.167285726559043e-08, "loss": 0.6969, "step": 8591 }, { "epoch": 0.8493685589303809, "grad_norm": 6.805190738379136, "learning_rate": 1.1657848304716733e-08, "loss": 0.6977, "step": 8592 }, { "epoch": 0.8494674146751353, "grad_norm": 4.513462623217717, "learning_rate": 1.1642848402027572e-08, "loss": 0.7345, "step": 8593 }, { "epoch": 0.8495662704198897, "grad_norm": 3.129360204356596, "learning_rate": 1.1627857559060927e-08, "loss": 0.7411, "step": 8594 }, { "epoch": 0.8496651261646443, "grad_norm": 5.513847916834339, "learning_rate": 1.1612875777353925e-08, "loss": 0.7225, "step": 8595 }, { "epoch": 0.8497639819093987, "grad_norm": 4.173027093027577, "learning_rate": 1.1597903058442704e-08, "loss": 0.6668, "step": 8596 }, { "epoch": 0.8498628376541532, "grad_norm": 5.69921067371925, "learning_rate": 1.1582939403862512e-08, "loss": 0.7004, "step": 8597 }, { "epoch": 0.8499616933989076, "grad_norm": 3.7406242782703405, "learning_rate": 1.1567984815147614e-08, "loss": 0.7045, "step": 8598 }, { "epoch": 0.8500605491436621, "grad_norm": 3.530349804704448, "learning_rate": 1.1553039293831435e-08, "loss": 0.7452, "step": 8599 }, { "epoch": 0.8501594048884166, "grad_norm": 16.55668336937637, "learning_rate": 1.1538102841446373e-08, "loss": 0.6558, "step": 8600 }, { "epoch": 0.850258260633171, "grad_norm": 3.2796493944414578, "learning_rate": 1.1523175459523981e-08, "loss": 0.7255, "step": 8601 }, { "epoch": 0.8503571163779255, "grad_norm": 3.90340212671732, "learning_rate": 1.1508257149594802e-08, "loss": 0.7148, "step": 8602 }, { "epoch": 0.85045597212268, "grad_norm": 5.59878708680492, "learning_rate": 1.14933479131885e-08, "loss": 0.6661, "step": 8603 }, { "epoch": 0.8505548278674344, "grad_norm": 3.428531348637977, "learning_rate": 1.1478447751833831e-08, "loss": 0.6818, "step": 8604 }, { "epoch": 0.850653683612189, "grad_norm": 5.738782412316438, "learning_rate": 1.1463556667058538e-08, "loss": 0.6314, "step": 8605 }, { "epoch": 0.8507525393569434, "grad_norm": 5.637009291767376, "learning_rate": 1.1448674660389523e-08, "loss": 0.767, "step": 8606 }, { "epoch": 0.8508513951016978, "grad_norm": 6.404340845261244, "learning_rate": 1.1433801733352688e-08, "loss": 0.6912, "step": 8607 }, { "epoch": 0.8509502508464523, "grad_norm": 8.829665197552877, "learning_rate": 1.1418937887473024e-08, "loss": 0.6911, "step": 8608 }, { "epoch": 0.8510491065912068, "grad_norm": 3.4425234094167108, "learning_rate": 1.1404083124274655e-08, "loss": 0.7076, "step": 8609 }, { "epoch": 0.8511479623359612, "grad_norm": 3.904077261974068, "learning_rate": 1.1389237445280652e-08, "loss": 0.7606, "step": 8610 }, { "epoch": 0.8512468180807157, "grad_norm": 3.2837413601800676, "learning_rate": 1.1374400852013244e-08, "loss": 0.8117, "step": 8611 }, { "epoch": 0.8513456738254702, "grad_norm": 4.116250777135752, "learning_rate": 1.1359573345993734e-08, "loss": 0.6509, "step": 8612 }, { "epoch": 0.8514445295702247, "grad_norm": 9.60842227449942, "learning_rate": 1.1344754928742417e-08, "loss": 0.6632, "step": 8613 }, { "epoch": 0.8515433853149791, "grad_norm": 11.44916089302077, "learning_rate": 1.1329945601778745e-08, "loss": 0.7044, "step": 8614 }, { "epoch": 0.8516422410597336, "grad_norm": 4.6621759953591795, "learning_rate": 1.1315145366621159e-08, "loss": 0.6589, "step": 8615 }, { "epoch": 0.8517410968044881, "grad_norm": 3.939092411665556, "learning_rate": 1.1300354224787234e-08, "loss": 0.7315, "step": 8616 }, { "epoch": 0.8518399525492425, "grad_norm": 3.993345063548697, "learning_rate": 1.1285572177793579e-08, "loss": 0.6842, "step": 8617 }, { "epoch": 0.8519388082939969, "grad_norm": 17.178478847529327, "learning_rate": 1.1270799227155847e-08, "loss": 0.7377, "step": 8618 }, { "epoch": 0.8520376640387515, "grad_norm": 4.166605608921774, "learning_rate": 1.1256035374388805e-08, "loss": 0.6778, "step": 8619 }, { "epoch": 0.8521365197835059, "grad_norm": 3.0084052009049773, "learning_rate": 1.1241280621006288e-08, "loss": 0.6333, "step": 8620 }, { "epoch": 0.8522353755282603, "grad_norm": 7.911728728347471, "learning_rate": 1.122653496852114e-08, "loss": 0.6219, "step": 8621 }, { "epoch": 0.8523342312730149, "grad_norm": 4.438244012375608, "learning_rate": 1.121179841844534e-08, "loss": 0.7634, "step": 8622 }, { "epoch": 0.8524330870177693, "grad_norm": 5.170051539577789, "learning_rate": 1.1197070972289902e-08, "loss": 0.7558, "step": 8623 }, { "epoch": 0.8525319427625238, "grad_norm": 4.841628033527822, "learning_rate": 1.1182352631564885e-08, "loss": 0.8701, "step": 8624 }, { "epoch": 0.8526307985072783, "grad_norm": 2.7671423091890093, "learning_rate": 1.1167643397779458e-08, "loss": 0.7993, "step": 8625 }, { "epoch": 0.8527296542520327, "grad_norm": 4.93382937269188, "learning_rate": 1.115294327244184e-08, "loss": 0.753, "step": 8626 }, { "epoch": 0.8528285099967872, "grad_norm": 2.9870770133848277, "learning_rate": 1.1138252257059289e-08, "loss": 0.7439, "step": 8627 }, { "epoch": 0.8529273657415417, "grad_norm": 4.878601645057894, "learning_rate": 1.112357035313819e-08, "loss": 0.8117, "step": 8628 }, { "epoch": 0.8530262214862961, "grad_norm": 9.527334246902315, "learning_rate": 1.1108897562183906e-08, "loss": 0.6226, "step": 8629 }, { "epoch": 0.8531250772310506, "grad_norm": 29.186506763431137, "learning_rate": 1.1094233885700942e-08, "loss": 0.749, "step": 8630 }, { "epoch": 0.853223932975805, "grad_norm": 2.9975181593125733, "learning_rate": 1.1079579325192856e-08, "loss": 0.668, "step": 8631 }, { "epoch": 0.8533227887205596, "grad_norm": 3.1361771833850334, "learning_rate": 1.1064933882162231e-08, "loss": 0.7028, "step": 8632 }, { "epoch": 0.853421644465314, "grad_norm": 3.7025041790769886, "learning_rate": 1.1050297558110767e-08, "loss": 0.7708, "step": 8633 }, { "epoch": 0.8535205002100684, "grad_norm": 9.237076297921618, "learning_rate": 1.1035670354539195e-08, "loss": 0.7108, "step": 8634 }, { "epoch": 0.853619355954823, "grad_norm": 2.8953670503357842, "learning_rate": 1.1021052272947285e-08, "loss": 0.7656, "step": 8635 }, { "epoch": 0.8537182116995774, "grad_norm": 4.671584040102387, "learning_rate": 1.1006443314833936e-08, "loss": 0.6948, "step": 8636 }, { "epoch": 0.8538170674443318, "grad_norm": 3.840536118442229, "learning_rate": 1.0991843481697094e-08, "loss": 0.7001, "step": 8637 }, { "epoch": 0.8539159231890864, "grad_norm": 3.1399487003276088, "learning_rate": 1.0977252775033718e-08, "loss": 0.6816, "step": 8638 }, { "epoch": 0.8540147789338408, "grad_norm": 6.4489357880989955, "learning_rate": 1.0962671196339912e-08, "loss": 0.6689, "step": 8639 }, { "epoch": 0.8541136346785952, "grad_norm": 3.658538642877313, "learning_rate": 1.0948098747110768e-08, "loss": 0.7454, "step": 8640 }, { "epoch": 0.8542124904233497, "grad_norm": 6.049345703170962, "learning_rate": 1.093353542884049e-08, "loss": 0.7826, "step": 8641 }, { "epoch": 0.8543113461681042, "grad_norm": 12.073461188219037, "learning_rate": 1.0918981243022341e-08, "loss": 0.7132, "step": 8642 }, { "epoch": 0.8544102019128587, "grad_norm": 4.2754238788651415, "learning_rate": 1.0904436191148615e-08, "loss": 0.815, "step": 8643 }, { "epoch": 0.8545090576576131, "grad_norm": 4.469116543362615, "learning_rate": 1.0889900274710706e-08, "loss": 0.5443, "step": 8644 }, { "epoch": 0.8546079134023676, "grad_norm": 4.206560662009468, "learning_rate": 1.087537349519908e-08, "loss": 0.73, "step": 8645 }, { "epoch": 0.8547067691471221, "grad_norm": 3.743486051482117, "learning_rate": 1.0860855854103201e-08, "loss": 0.6457, "step": 8646 }, { "epoch": 0.8548056248918765, "grad_norm": 27.120450452041894, "learning_rate": 1.0846347352911655e-08, "loss": 0.6371, "step": 8647 }, { "epoch": 0.854904480636631, "grad_norm": 3.410331616972023, "learning_rate": 1.0831847993112097e-08, "loss": 0.6486, "step": 8648 }, { "epoch": 0.8550033363813855, "grad_norm": 21.247585100504924, "learning_rate": 1.0817357776191204e-08, "loss": 0.6255, "step": 8649 }, { "epoch": 0.8551021921261399, "grad_norm": 3.074339590690305, "learning_rate": 1.0802876703634722e-08, "loss": 0.6622, "step": 8650 }, { "epoch": 0.8552010478708943, "grad_norm": 5.387176652551241, "learning_rate": 1.0788404776927496e-08, "loss": 0.7492, "step": 8651 }, { "epoch": 0.8552999036156489, "grad_norm": 4.401914340613718, "learning_rate": 1.0773941997553382e-08, "loss": 0.6447, "step": 8652 }, { "epoch": 0.8553987593604033, "grad_norm": 3.0100601385974866, "learning_rate": 1.0759488366995362e-08, "loss": 0.7657, "step": 8653 }, { "epoch": 0.8554976151051578, "grad_norm": 3.486497545314362, "learning_rate": 1.0745043886735395e-08, "loss": 0.6473, "step": 8654 }, { "epoch": 0.8555964708499123, "grad_norm": 7.4827516360350925, "learning_rate": 1.0730608558254573e-08, "loss": 0.6849, "step": 8655 }, { "epoch": 0.8556953265946667, "grad_norm": 7.014862449870262, "learning_rate": 1.0716182383033045e-08, "loss": 0.6562, "step": 8656 }, { "epoch": 0.8557941823394212, "grad_norm": 3.2979276119402154, "learning_rate": 1.0701765362549975e-08, "loss": 0.7241, "step": 8657 }, { "epoch": 0.8558930380841757, "grad_norm": 14.119133956554663, "learning_rate": 1.0687357498283622e-08, "loss": 0.7519, "step": 8658 }, { "epoch": 0.8559918938289302, "grad_norm": 4.025196643932552, "learning_rate": 1.0672958791711317e-08, "loss": 0.7066, "step": 8659 }, { "epoch": 0.8560907495736846, "grad_norm": 3.872896239619561, "learning_rate": 1.0658569244309402e-08, "loss": 0.7395, "step": 8660 }, { "epoch": 0.856189605318439, "grad_norm": 2.913936092828878, "learning_rate": 1.064418885755336e-08, "loss": 0.735, "step": 8661 }, { "epoch": 0.8562884610631936, "grad_norm": 13.34846610700837, "learning_rate": 1.0629817632917637e-08, "loss": 0.6949, "step": 8662 }, { "epoch": 0.856387316807948, "grad_norm": 4.825349082215248, "learning_rate": 1.0615455571875809e-08, "loss": 0.6531, "step": 8663 }, { "epoch": 0.8564861725527024, "grad_norm": 4.243511534169478, "learning_rate": 1.060110267590052e-08, "loss": 0.7663, "step": 8664 }, { "epoch": 0.856585028297457, "grad_norm": 4.667084369448501, "learning_rate": 1.0586758946463403e-08, "loss": 0.7193, "step": 8665 }, { "epoch": 0.8566838840422114, "grad_norm": 4.682010805104338, "learning_rate": 1.0572424385035239e-08, "loss": 0.7389, "step": 8666 }, { "epoch": 0.8567827397869658, "grad_norm": 5.5331189247847625, "learning_rate": 1.0558098993085784e-08, "loss": 0.6884, "step": 8667 }, { "epoch": 0.8568815955317204, "grad_norm": 6.784267804794987, "learning_rate": 1.0543782772083932e-08, "loss": 0.7238, "step": 8668 }, { "epoch": 0.8569804512764748, "grad_norm": 4.22099583991144, "learning_rate": 1.0529475723497571e-08, "loss": 0.6956, "step": 8669 }, { "epoch": 0.8570793070212293, "grad_norm": 37.992527271641315, "learning_rate": 1.051517784879371e-08, "loss": 0.6799, "step": 8670 }, { "epoch": 0.8571781627659837, "grad_norm": 13.041609321636171, "learning_rate": 1.050088914943834e-08, "loss": 0.5856, "step": 8671 }, { "epoch": 0.8572770185107382, "grad_norm": 3.5033801180941797, "learning_rate": 1.0486609626896614e-08, "loss": 0.6064, "step": 8672 }, { "epoch": 0.8573758742554927, "grad_norm": 4.435807298808124, "learning_rate": 1.0472339282632637e-08, "loss": 0.8445, "step": 8673 }, { "epoch": 0.8574747300002471, "grad_norm": 4.469625416832599, "learning_rate": 1.0458078118109626e-08, "loss": 0.719, "step": 8674 }, { "epoch": 0.8575735857450016, "grad_norm": 4.4191596656962835, "learning_rate": 1.0443826134789901e-08, "loss": 0.7365, "step": 8675 }, { "epoch": 0.8576724414897561, "grad_norm": 3.3422166818409393, "learning_rate": 1.0429583334134739e-08, "loss": 0.7232, "step": 8676 }, { "epoch": 0.8577712972345105, "grad_norm": 3.414458561839941, "learning_rate": 1.0415349717604537e-08, "loss": 0.6365, "step": 8677 }, { "epoch": 0.857870152979265, "grad_norm": 2.718274859849102, "learning_rate": 1.0401125286658785e-08, "loss": 0.6888, "step": 8678 }, { "epoch": 0.8579690087240195, "grad_norm": 5.551944598949903, "learning_rate": 1.0386910042755937e-08, "loss": 0.7565, "step": 8679 }, { "epoch": 0.8580678644687739, "grad_norm": 4.4692023378354815, "learning_rate": 1.0372703987353581e-08, "loss": 0.8306, "step": 8680 }, { "epoch": 0.8581667202135284, "grad_norm": 6.454068231664784, "learning_rate": 1.0358507121908344e-08, "loss": 0.8514, "step": 8681 }, { "epoch": 0.8582655759582829, "grad_norm": 3.3796120106747316, "learning_rate": 1.0344319447875893e-08, "loss": 0.7532, "step": 8682 }, { "epoch": 0.8583644317030373, "grad_norm": 7.750928948459797, "learning_rate": 1.0330140966710976e-08, "loss": 0.6494, "step": 8683 }, { "epoch": 0.8584632874477918, "grad_norm": 8.252272232864279, "learning_rate": 1.0315971679867364e-08, "loss": 0.6477, "step": 8684 }, { "epoch": 0.8585621431925463, "grad_norm": 3.3608185956131633, "learning_rate": 1.0301811588797938e-08, "loss": 0.696, "step": 8685 }, { "epoch": 0.8586609989373007, "grad_norm": 3.8909509331029137, "learning_rate": 1.028766069495458e-08, "loss": 0.6406, "step": 8686 }, { "epoch": 0.8587598546820552, "grad_norm": 4.039903810264723, "learning_rate": 1.0273518999788288e-08, "loss": 0.6789, "step": 8687 }, { "epoch": 0.8588587104268097, "grad_norm": 5.380218236606968, "learning_rate": 1.0259386504749034e-08, "loss": 0.7093, "step": 8688 }, { "epoch": 0.8589575661715642, "grad_norm": 3.616318053299666, "learning_rate": 1.024526321128596e-08, "loss": 0.6764, "step": 8689 }, { "epoch": 0.8590564219163186, "grad_norm": 5.862041209264524, "learning_rate": 1.0231149120847137e-08, "loss": 0.7559, "step": 8690 }, { "epoch": 0.859155277661073, "grad_norm": 3.9265757929371294, "learning_rate": 1.021704423487979e-08, "loss": 0.7172, "step": 8691 }, { "epoch": 0.8592541334058276, "grad_norm": 5.3528392899950665, "learning_rate": 1.0202948554830194e-08, "loss": 0.7398, "step": 8692 }, { "epoch": 0.859352989150582, "grad_norm": 3.679831300628519, "learning_rate": 1.018886208214359e-08, "loss": 0.6784, "step": 8693 }, { "epoch": 0.8594518448953364, "grad_norm": 3.396465730842341, "learning_rate": 1.0174784818264404e-08, "loss": 0.6711, "step": 8694 }, { "epoch": 0.859550700640091, "grad_norm": 4.3053866972303, "learning_rate": 1.016071676463599e-08, "loss": 0.7732, "step": 8695 }, { "epoch": 0.8596495563848454, "grad_norm": 5.0484321464408, "learning_rate": 1.0146657922700863e-08, "loss": 0.7396, "step": 8696 }, { "epoch": 0.8597484121295998, "grad_norm": 3.8234309458579894, "learning_rate": 1.0132608293900547e-08, "loss": 0.6966, "step": 8697 }, { "epoch": 0.8598472678743544, "grad_norm": 3.5043980519445923, "learning_rate": 1.0118567879675587e-08, "loss": 0.722, "step": 8698 }, { "epoch": 0.8599461236191088, "grad_norm": 4.184632423052862, "learning_rate": 1.0104536681465658e-08, "loss": 0.6837, "step": 8699 }, { "epoch": 0.8600449793638633, "grad_norm": 11.263220299322613, "learning_rate": 1.0090514700709451e-08, "loss": 0.6881, "step": 8700 }, { "epoch": 0.8601438351086178, "grad_norm": 4.123417074001255, "learning_rate": 1.0076501938844694e-08, "loss": 0.6434, "step": 8701 }, { "epoch": 0.8602426908533722, "grad_norm": 3.9585163142615323, "learning_rate": 1.0062498397308205e-08, "loss": 0.7401, "step": 8702 }, { "epoch": 0.8603415465981267, "grad_norm": 3.415464844862139, "learning_rate": 1.0048504077535846e-08, "loss": 0.6605, "step": 8703 }, { "epoch": 0.8604404023428811, "grad_norm": 4.31684379522864, "learning_rate": 1.0034518980962492e-08, "loss": 0.8306, "step": 8704 }, { "epoch": 0.8605392580876357, "grad_norm": 3.352121276016661, "learning_rate": 1.002054310902215e-08, "loss": 0.7975, "step": 8705 }, { "epoch": 0.8606381138323901, "grad_norm": 3.7630931849961566, "learning_rate": 1.0006576463147797e-08, "loss": 0.6384, "step": 8706 }, { "epoch": 0.8607369695771445, "grad_norm": 3.5353133748254275, "learning_rate": 9.992619044771544e-09, "loss": 0.6089, "step": 8707 }, { "epoch": 0.8608358253218991, "grad_norm": 6.099731053276534, "learning_rate": 9.978670855324512e-09, "loss": 0.645, "step": 8708 }, { "epoch": 0.8609346810666535, "grad_norm": 4.116758070430057, "learning_rate": 9.964731896236867e-09, "loss": 0.6998, "step": 8709 }, { "epoch": 0.8610335368114079, "grad_norm": 9.020869106518614, "learning_rate": 9.950802168937843e-09, "loss": 0.6967, "step": 8710 }, { "epoch": 0.8611323925561625, "grad_norm": 10.97234212201642, "learning_rate": 9.936881674855768e-09, "loss": 0.7295, "step": 8711 }, { "epoch": 0.8612312483009169, "grad_norm": 2.9971556706620093, "learning_rate": 9.922970415417931e-09, "loss": 0.6494, "step": 8712 }, { "epoch": 0.8613301040456713, "grad_norm": 4.444234511067111, "learning_rate": 9.909068392050735e-09, "loss": 0.6235, "step": 8713 }, { "epoch": 0.8614289597904258, "grad_norm": 3.25232576475982, "learning_rate": 9.895175606179673e-09, "loss": 0.5969, "step": 8714 }, { "epoch": 0.8615278155351803, "grad_norm": 4.011636697464486, "learning_rate": 9.881292059229195e-09, "loss": 0.6862, "step": 8715 }, { "epoch": 0.8616266712799348, "grad_norm": 2.861597406889946, "learning_rate": 9.867417752622887e-09, "loss": 0.7134, "step": 8716 }, { "epoch": 0.8617255270246892, "grad_norm": 3.3820591773412403, "learning_rate": 9.853552687783318e-09, "loss": 0.8082, "step": 8717 }, { "epoch": 0.8618243827694437, "grad_norm": 9.800371565050407, "learning_rate": 9.839696866132164e-09, "loss": 0.6929, "step": 8718 }, { "epoch": 0.8619232385141982, "grad_norm": 5.255403067722871, "learning_rate": 9.825850289090164e-09, "loss": 0.6419, "step": 8719 }, { "epoch": 0.8620220942589526, "grad_norm": 7.219805833879519, "learning_rate": 9.812012958077054e-09, "loss": 0.7689, "step": 8720 }, { "epoch": 0.8621209500037071, "grad_norm": 3.7362987953898084, "learning_rate": 9.798184874511628e-09, "loss": 0.7278, "step": 8721 }, { "epoch": 0.8622198057484616, "grad_norm": 4.359118244053497, "learning_rate": 9.784366039811787e-09, "loss": 0.6545, "step": 8722 }, { "epoch": 0.862318661493216, "grad_norm": 2.958740673628993, "learning_rate": 9.770556455394429e-09, "loss": 0.645, "step": 8723 }, { "epoch": 0.8624175172379704, "grad_norm": 18.027789762760783, "learning_rate": 9.756756122675513e-09, "loss": 0.646, "step": 8724 }, { "epoch": 0.862516372982725, "grad_norm": 6.308876288170628, "learning_rate": 9.742965043070105e-09, "loss": 0.7124, "step": 8725 }, { "epoch": 0.8626152287274794, "grad_norm": 15.19638233938541, "learning_rate": 9.72918321799222e-09, "loss": 0.7611, "step": 8726 }, { "epoch": 0.8627140844722339, "grad_norm": 7.5366789564379415, "learning_rate": 9.715410648855039e-09, "loss": 0.6626, "step": 8727 }, { "epoch": 0.8628129402169884, "grad_norm": 4.096083467022603, "learning_rate": 9.701647337070673e-09, "loss": 0.743, "step": 8728 }, { "epoch": 0.8629117959617428, "grad_norm": 9.282828992279972, "learning_rate": 9.687893284050386e-09, "loss": 0.707, "step": 8729 }, { "epoch": 0.8630106517064973, "grad_norm": 4.587995607161568, "learning_rate": 9.674148491204458e-09, "loss": 0.7394, "step": 8730 }, { "epoch": 0.8631095074512518, "grad_norm": 4.646003949284046, "learning_rate": 9.660412959942188e-09, "loss": 0.7486, "step": 8731 }, { "epoch": 0.8632083631960062, "grad_norm": 2.7706079324301793, "learning_rate": 9.646686691671957e-09, "loss": 0.7123, "step": 8732 }, { "epoch": 0.8633072189407607, "grad_norm": 3.9719214145392843, "learning_rate": 9.63296968780123e-09, "loss": 0.7309, "step": 8733 }, { "epoch": 0.8634060746855151, "grad_norm": 40.58037114601579, "learning_rate": 9.619261949736435e-09, "loss": 0.6698, "step": 8734 }, { "epoch": 0.8635049304302697, "grad_norm": 4.139113532335888, "learning_rate": 9.605563478883116e-09, "loss": 0.7145, "step": 8735 }, { "epoch": 0.8636037861750241, "grad_norm": 4.337579719788171, "learning_rate": 9.591874276645861e-09, "loss": 0.6127, "step": 8736 }, { "epoch": 0.8637026419197785, "grad_norm": 4.020608605895523, "learning_rate": 9.578194344428291e-09, "loss": 0.8, "step": 8737 }, { "epoch": 0.8638014976645331, "grad_norm": 4.017405657528764, "learning_rate": 9.564523683633064e-09, "loss": 0.65, "step": 8738 }, { "epoch": 0.8639003534092875, "grad_norm": 8.535704077831664, "learning_rate": 9.550862295661932e-09, "loss": 0.6012, "step": 8739 }, { "epoch": 0.8639992091540419, "grad_norm": 3.1060104318065824, "learning_rate": 9.537210181915622e-09, "loss": 0.7005, "step": 8740 }, { "epoch": 0.8640980648987965, "grad_norm": 4.11184168751312, "learning_rate": 9.523567343794015e-09, "loss": 0.6689, "step": 8741 }, { "epoch": 0.8641969206435509, "grad_norm": 9.750668818116093, "learning_rate": 9.509933782695945e-09, "loss": 0.7788, "step": 8742 }, { "epoch": 0.8642957763883053, "grad_norm": 2.6941895629981114, "learning_rate": 9.496309500019328e-09, "loss": 0.8048, "step": 8743 }, { "epoch": 0.8643946321330598, "grad_norm": 3.438409769153508, "learning_rate": 9.48269449716118e-09, "loss": 0.5649, "step": 8744 }, { "epoch": 0.8644934878778143, "grad_norm": 2.8700570042368088, "learning_rate": 9.469088775517464e-09, "loss": 0.774, "step": 8745 }, { "epoch": 0.8645923436225688, "grad_norm": 3.5363338756664673, "learning_rate": 9.455492336483262e-09, "loss": 0.7024, "step": 8746 }, { "epoch": 0.8646911993673232, "grad_norm": 4.178623315967932, "learning_rate": 9.441905181452725e-09, "loss": 0.6993, "step": 8747 }, { "epoch": 0.8647900551120777, "grad_norm": 7.943577629810506, "learning_rate": 9.428327311818962e-09, "loss": 0.7559, "step": 8748 }, { "epoch": 0.8648889108568322, "grad_norm": 7.170393042105103, "learning_rate": 9.414758728974226e-09, "loss": 0.6667, "step": 8749 }, { "epoch": 0.8649877666015866, "grad_norm": 4.066462510351933, "learning_rate": 9.401199434309736e-09, "loss": 0.6335, "step": 8750 }, { "epoch": 0.8650866223463411, "grad_norm": 7.663538720962882, "learning_rate": 9.387649429215816e-09, "loss": 0.6389, "step": 8751 }, { "epoch": 0.8651854780910956, "grad_norm": 3.7810132512284307, "learning_rate": 9.37410871508184e-09, "loss": 0.7214, "step": 8752 }, { "epoch": 0.86528433383585, "grad_norm": 4.758669795015115, "learning_rate": 9.360577293296179e-09, "loss": 0.6519, "step": 8753 }, { "epoch": 0.8653831895806045, "grad_norm": 4.694440930957621, "learning_rate": 9.347055165246298e-09, "loss": 0.6909, "step": 8754 }, { "epoch": 0.865482045325359, "grad_norm": 4.284354105466366, "learning_rate": 9.333542332318679e-09, "loss": 0.7918, "step": 8755 }, { "epoch": 0.8655809010701134, "grad_norm": 5.37075965527498, "learning_rate": 9.320038795898877e-09, "loss": 0.7313, "step": 8756 }, { "epoch": 0.8656797568148679, "grad_norm": 4.96768697370037, "learning_rate": 9.306544557371455e-09, "loss": 0.6409, "step": 8757 }, { "epoch": 0.8657786125596224, "grad_norm": 4.314959971111847, "learning_rate": 9.293059618120091e-09, "loss": 0.6652, "step": 8758 }, { "epoch": 0.8658774683043768, "grad_norm": 3.103779569760233, "learning_rate": 9.279583979527416e-09, "loss": 0.6904, "step": 8759 }, { "epoch": 0.8659763240491313, "grad_norm": 6.039707627980897, "learning_rate": 9.26611764297519e-09, "loss": 0.7365, "step": 8760 }, { "epoch": 0.8660751797938858, "grad_norm": 4.6155515016381825, "learning_rate": 9.252660609844198e-09, "loss": 0.6787, "step": 8761 }, { "epoch": 0.8661740355386403, "grad_norm": 6.401217728674372, "learning_rate": 9.239212881514225e-09, "loss": 0.6253, "step": 8762 }, { "epoch": 0.8662728912833947, "grad_norm": 2.575455841388774, "learning_rate": 9.225774459364165e-09, "loss": 0.6793, "step": 8763 }, { "epoch": 0.8663717470281491, "grad_norm": 4.02595379304599, "learning_rate": 9.212345344771911e-09, "loss": 0.5624, "step": 8764 }, { "epoch": 0.8664706027729037, "grad_norm": 3.6220420957585606, "learning_rate": 9.198925539114433e-09, "loss": 0.7972, "step": 8765 }, { "epoch": 0.8665694585176581, "grad_norm": 3.5239043292179253, "learning_rate": 9.185515043767744e-09, "loss": 0.8094, "step": 8766 }, { "epoch": 0.8666683142624125, "grad_norm": 3.140265580951762, "learning_rate": 9.172113860106857e-09, "loss": 0.7566, "step": 8767 }, { "epoch": 0.8667671700071671, "grad_norm": 3.8531861516271753, "learning_rate": 9.15872198950589e-09, "loss": 0.7778, "step": 8768 }, { "epoch": 0.8668660257519215, "grad_norm": 5.4854655676393795, "learning_rate": 9.145339433338007e-09, "loss": 0.6816, "step": 8769 }, { "epoch": 0.8669648814966759, "grad_norm": 3.5775275515620515, "learning_rate": 9.131966192975339e-09, "loss": 0.6705, "step": 8770 }, { "epoch": 0.8670637372414305, "grad_norm": 6.231422615973247, "learning_rate": 9.118602269789155e-09, "loss": 0.6907, "step": 8771 }, { "epoch": 0.8671625929861849, "grad_norm": 30.713183053254085, "learning_rate": 9.105247665149695e-09, "loss": 0.7279, "step": 8772 }, { "epoch": 0.8672614487309394, "grad_norm": 3.578011624569336, "learning_rate": 9.09190238042632e-09, "loss": 0.7667, "step": 8773 }, { "epoch": 0.8673603044756938, "grad_norm": 4.217074998894786, "learning_rate": 9.078566416987365e-09, "loss": 0.6238, "step": 8774 }, { "epoch": 0.8674591602204483, "grad_norm": 3.493544210134473, "learning_rate": 9.065239776200218e-09, "loss": 0.6417, "step": 8775 }, { "epoch": 0.8675580159652028, "grad_norm": 3.9647127310128765, "learning_rate": 9.05192245943135e-09, "loss": 0.7099, "step": 8776 }, { "epoch": 0.8676568717099572, "grad_norm": 3.2106871687970835, "learning_rate": 9.03861446804628e-09, "loss": 0.7429, "step": 8777 }, { "epoch": 0.8677557274547117, "grad_norm": 3.802958822444391, "learning_rate": 9.025315803409494e-09, "loss": 0.6912, "step": 8778 }, { "epoch": 0.8678545831994662, "grad_norm": 4.008206496407164, "learning_rate": 9.012026466884603e-09, "loss": 0.741, "step": 8779 }, { "epoch": 0.8679534389442206, "grad_norm": 13.166805462948453, "learning_rate": 8.99874645983425e-09, "loss": 0.6348, "step": 8780 }, { "epoch": 0.8680522946889752, "grad_norm": 3.9159703691045418, "learning_rate": 8.985475783620067e-09, "loss": 0.7127, "step": 8781 }, { "epoch": 0.8681511504337296, "grad_norm": 6.970481654423302, "learning_rate": 8.972214439602787e-09, "loss": 0.7168, "step": 8782 }, { "epoch": 0.868250006178484, "grad_norm": 4.086676083428383, "learning_rate": 8.95896242914218e-09, "loss": 0.7148, "step": 8783 }, { "epoch": 0.8683488619232386, "grad_norm": 9.614858089711998, "learning_rate": 8.945719753597003e-09, "loss": 0.7437, "step": 8784 }, { "epoch": 0.868447717667993, "grad_norm": 4.843403893866844, "learning_rate": 8.932486414325135e-09, "loss": 0.7095, "step": 8785 }, { "epoch": 0.8685465734127474, "grad_norm": 3.1394924937619413, "learning_rate": 8.919262412683438e-09, "loss": 0.7096, "step": 8786 }, { "epoch": 0.8686454291575019, "grad_norm": 4.70879146619149, "learning_rate": 8.906047750027835e-09, "loss": 0.7117, "step": 8787 }, { "epoch": 0.8687442849022564, "grad_norm": 6.613484872043874, "learning_rate": 8.892842427713332e-09, "loss": 0.7425, "step": 8788 }, { "epoch": 0.8688431406470108, "grad_norm": 4.6207079801743784, "learning_rate": 8.879646447093891e-09, "loss": 0.7303, "step": 8789 }, { "epoch": 0.8689419963917653, "grad_norm": 2.9109905623665164, "learning_rate": 8.866459809522597e-09, "loss": 0.7092, "step": 8790 }, { "epoch": 0.8690408521365198, "grad_norm": 5.817113843715692, "learning_rate": 8.853282516351545e-09, "loss": 0.694, "step": 8791 }, { "epoch": 0.8691397078812743, "grad_norm": 4.708245533546307, "learning_rate": 8.840114568931844e-09, "loss": 0.6874, "step": 8792 }, { "epoch": 0.8692385636260287, "grad_norm": 12.40603062572837, "learning_rate": 8.82695596861368e-09, "loss": 0.7788, "step": 8793 }, { "epoch": 0.8693374193707832, "grad_norm": 7.334026383500333, "learning_rate": 8.81380671674632e-09, "loss": 0.746, "step": 8794 }, { "epoch": 0.8694362751155377, "grad_norm": 4.459729483034317, "learning_rate": 8.80066681467796e-09, "loss": 0.7191, "step": 8795 }, { "epoch": 0.8695351308602921, "grad_norm": 3.4785068130384103, "learning_rate": 8.787536263755957e-09, "loss": 0.5721, "step": 8796 }, { "epoch": 0.8696339866050465, "grad_norm": 5.286976065430261, "learning_rate": 8.774415065326612e-09, "loss": 0.6168, "step": 8797 }, { "epoch": 0.8697328423498011, "grad_norm": 4.672510348038304, "learning_rate": 8.761303220735327e-09, "loss": 0.8147, "step": 8798 }, { "epoch": 0.8698316980945555, "grad_norm": 6.841114483944898, "learning_rate": 8.74820073132655e-09, "loss": 0.6292, "step": 8799 }, { "epoch": 0.86993055383931, "grad_norm": 4.175597029087264, "learning_rate": 8.735107598443714e-09, "loss": 0.6357, "step": 8800 }, { "epoch": 0.8700294095840645, "grad_norm": 4.0422014245309414, "learning_rate": 8.722023823429336e-09, "loss": 0.7369, "step": 8801 }, { "epoch": 0.8701282653288189, "grad_norm": 6.833759323875495, "learning_rate": 8.708949407625e-09, "loss": 0.78, "step": 8802 }, { "epoch": 0.8702271210735734, "grad_norm": 4.334623353241719, "learning_rate": 8.69588435237123e-09, "loss": 0.6466, "step": 8803 }, { "epoch": 0.8703259768183279, "grad_norm": 4.46321018833137, "learning_rate": 8.682828659007702e-09, "loss": 0.742, "step": 8804 }, { "epoch": 0.8704248325630823, "grad_norm": 4.333773582324982, "learning_rate": 8.669782328873088e-09, "loss": 0.6972, "step": 8805 }, { "epoch": 0.8705236883078368, "grad_norm": 18.070184541070912, "learning_rate": 8.656745363305063e-09, "loss": 0.7177, "step": 8806 }, { "epoch": 0.8706225440525912, "grad_norm": 5.405760863997132, "learning_rate": 8.643717763640402e-09, "loss": 0.6293, "step": 8807 }, { "epoch": 0.8707213997973458, "grad_norm": 5.4032829592342795, "learning_rate": 8.630699531214891e-09, "loss": 0.7003, "step": 8808 }, { "epoch": 0.8708202555421002, "grad_norm": 4.5019953875097904, "learning_rate": 8.617690667363341e-09, "loss": 0.7776, "step": 8809 }, { "epoch": 0.8709191112868546, "grad_norm": 3.0542023268082583, "learning_rate": 8.604691173419632e-09, "loss": 0.7032, "step": 8810 }, { "epoch": 0.8710179670316092, "grad_norm": 5.945890298010638, "learning_rate": 8.59170105071666e-09, "loss": 0.7741, "step": 8811 }, { "epoch": 0.8711168227763636, "grad_norm": 2.599215169875295, "learning_rate": 8.578720300586372e-09, "loss": 0.6619, "step": 8812 }, { "epoch": 0.871215678521118, "grad_norm": 5.07575178367205, "learning_rate": 8.565748924359784e-09, "loss": 0.686, "step": 8813 }, { "epoch": 0.8713145342658726, "grad_norm": 3.427103039030657, "learning_rate": 8.552786923366873e-09, "loss": 0.6469, "step": 8814 }, { "epoch": 0.871413390010627, "grad_norm": 5.07544055264377, "learning_rate": 8.539834298936721e-09, "loss": 0.7119, "step": 8815 }, { "epoch": 0.8715122457553814, "grad_norm": 4.190692101773349, "learning_rate": 8.526891052397444e-09, "loss": 0.7857, "step": 8816 }, { "epoch": 0.8716111015001359, "grad_norm": 4.297389385715255, "learning_rate": 8.513957185076149e-09, "loss": 0.6742, "step": 8817 }, { "epoch": 0.8717099572448904, "grad_norm": 14.78482327539833, "learning_rate": 8.501032698299049e-09, "loss": 0.5984, "step": 8818 }, { "epoch": 0.8718088129896449, "grad_norm": 5.639983779835924, "learning_rate": 8.48811759339133e-09, "loss": 0.7374, "step": 8819 }, { "epoch": 0.8719076687343993, "grad_norm": 9.064821884091982, "learning_rate": 8.475211871677246e-09, "loss": 0.754, "step": 8820 }, { "epoch": 0.8720065244791538, "grad_norm": 3.3134792391944394, "learning_rate": 8.462315534480125e-09, "loss": 0.8118, "step": 8821 }, { "epoch": 0.8721053802239083, "grad_norm": 4.230887399798083, "learning_rate": 8.449428583122243e-09, "loss": 0.7354, "step": 8822 }, { "epoch": 0.8722042359686627, "grad_norm": 5.971152371658918, "learning_rate": 8.436551018924998e-09, "loss": 0.6762, "step": 8823 }, { "epoch": 0.8723030917134172, "grad_norm": 3.2149714448155673, "learning_rate": 8.423682843208813e-09, "loss": 0.6183, "step": 8824 }, { "epoch": 0.8724019474581717, "grad_norm": 4.817781222030279, "learning_rate": 8.410824057293109e-09, "loss": 0.6362, "step": 8825 }, { "epoch": 0.8725008032029261, "grad_norm": 5.632104365569159, "learning_rate": 8.397974662496333e-09, "loss": 0.7215, "step": 8826 }, { "epoch": 0.8725996589476805, "grad_norm": 4.130796738869813, "learning_rate": 8.38513466013605e-09, "loss": 0.7495, "step": 8827 }, { "epoch": 0.8726985146924351, "grad_norm": 3.0788245405419286, "learning_rate": 8.37230405152879e-09, "loss": 0.7063, "step": 8828 }, { "epoch": 0.8727973704371895, "grad_norm": 4.771919826041508, "learning_rate": 8.359482837990151e-09, "loss": 0.7861, "step": 8829 }, { "epoch": 0.872896226181944, "grad_norm": 5.979634236704562, "learning_rate": 8.346671020834739e-09, "loss": 0.7692, "step": 8830 }, { "epoch": 0.8729950819266985, "grad_norm": 4.587234776925188, "learning_rate": 8.333868601376227e-09, "loss": 0.7559, "step": 8831 }, { "epoch": 0.8730939376714529, "grad_norm": 3.421663615815811, "learning_rate": 8.321075580927339e-09, "loss": 0.7465, "step": 8832 }, { "epoch": 0.8731927934162074, "grad_norm": 3.4528025059520826, "learning_rate": 8.308291960799784e-09, "loss": 0.6328, "step": 8833 }, { "epoch": 0.8732916491609619, "grad_norm": 4.182213352395917, "learning_rate": 8.295517742304325e-09, "loss": 0.6173, "step": 8834 }, { "epoch": 0.8733905049057163, "grad_norm": 4.472295396157893, "learning_rate": 8.282752926750814e-09, "loss": 0.7132, "step": 8835 }, { "epoch": 0.8734893606504708, "grad_norm": 9.341332686639797, "learning_rate": 8.269997515448046e-09, "loss": 0.7109, "step": 8836 }, { "epoch": 0.8735882163952252, "grad_norm": 8.870826630065606, "learning_rate": 8.257251509703922e-09, "loss": 0.776, "step": 8837 }, { "epoch": 0.8736870721399798, "grad_norm": 4.056565299027339, "learning_rate": 8.244514910825362e-09, "loss": 0.8221, "step": 8838 }, { "epoch": 0.8737859278847342, "grad_norm": 2.6749054789888103, "learning_rate": 8.231787720118299e-09, "loss": 0.668, "step": 8839 }, { "epoch": 0.8738847836294886, "grad_norm": 3.877747239903266, "learning_rate": 8.219069938887746e-09, "loss": 0.7595, "step": 8840 }, { "epoch": 0.8739836393742432, "grad_norm": 3.7617029893790703, "learning_rate": 8.206361568437692e-09, "loss": 0.7245, "step": 8841 }, { "epoch": 0.8740824951189976, "grad_norm": 3.5178296382624006, "learning_rate": 8.193662610071217e-09, "loss": 0.7328, "step": 8842 }, { "epoch": 0.874181350863752, "grad_norm": 4.481060663685702, "learning_rate": 8.180973065090391e-09, "loss": 0.6896, "step": 8843 }, { "epoch": 0.8742802066085066, "grad_norm": 7.922486321279734, "learning_rate": 8.168292934796373e-09, "loss": 0.7297, "step": 8844 }, { "epoch": 0.874379062353261, "grad_norm": 5.932750578757979, "learning_rate": 8.155622220489278e-09, "loss": 0.6879, "step": 8845 }, { "epoch": 0.8744779180980154, "grad_norm": 3.9820155846097856, "learning_rate": 8.142960923468344e-09, "loss": 0.7177, "step": 8846 }, { "epoch": 0.8745767738427699, "grad_norm": 9.324340365018982, "learning_rate": 8.130309045031769e-09, "loss": 0.7442, "step": 8847 }, { "epoch": 0.8746756295875244, "grad_norm": 4.733139769251487, "learning_rate": 8.117666586476813e-09, "loss": 0.6973, "step": 8848 }, { "epoch": 0.8747744853322789, "grad_norm": 6.63913694688246, "learning_rate": 8.105033549099816e-09, "loss": 0.739, "step": 8849 }, { "epoch": 0.8748733410770333, "grad_norm": 15.872995709645496, "learning_rate": 8.092409934196066e-09, "loss": 0.7236, "step": 8850 }, { "epoch": 0.8749721968217878, "grad_norm": 7.104618523080822, "learning_rate": 8.079795743059958e-09, "loss": 0.6462, "step": 8851 }, { "epoch": 0.8750710525665423, "grad_norm": 3.8142001969236294, "learning_rate": 8.067190976984861e-09, "loss": 0.7069, "step": 8852 }, { "epoch": 0.8751699083112967, "grad_norm": 4.801325547490255, "learning_rate": 8.054595637263228e-09, "loss": 0.6321, "step": 8853 }, { "epoch": 0.8752687640560513, "grad_norm": 6.89401319557989, "learning_rate": 8.042009725186538e-09, "loss": 0.5614, "step": 8854 }, { "epoch": 0.8753676198008057, "grad_norm": 3.2565245071384714, "learning_rate": 8.029433242045257e-09, "loss": 0.7698, "step": 8855 }, { "epoch": 0.8754664755455601, "grad_norm": 4.728283309120825, "learning_rate": 8.016866189128946e-09, "loss": 0.7417, "step": 8856 }, { "epoch": 0.8755653312903147, "grad_norm": 4.618824904378138, "learning_rate": 8.00430856772617e-09, "loss": 0.6428, "step": 8857 }, { "epoch": 0.8756641870350691, "grad_norm": 8.181547616326991, "learning_rate": 7.99176037912449e-09, "loss": 0.7087, "step": 8858 }, { "epoch": 0.8757630427798235, "grad_norm": 7.740776331666418, "learning_rate": 7.9792216246106e-09, "loss": 0.8545, "step": 8859 }, { "epoch": 0.875861898524578, "grad_norm": 3.714420205282046, "learning_rate": 7.966692305470101e-09, "loss": 0.6253, "step": 8860 }, { "epoch": 0.8759607542693325, "grad_norm": 3.5638677312526332, "learning_rate": 7.954172422987748e-09, "loss": 0.7765, "step": 8861 }, { "epoch": 0.8760596100140869, "grad_norm": 3.6526212870591768, "learning_rate": 7.941661978447234e-09, "loss": 0.6246, "step": 8862 }, { "epoch": 0.8761584657588414, "grad_norm": 4.997595734378032, "learning_rate": 7.92916097313131e-09, "loss": 0.6886, "step": 8863 }, { "epoch": 0.8762573215035959, "grad_norm": 4.045775710049452, "learning_rate": 7.916669408321797e-09, "loss": 0.7328, "step": 8864 }, { "epoch": 0.8763561772483504, "grad_norm": 11.916422211000354, "learning_rate": 7.904187285299513e-09, "loss": 0.7616, "step": 8865 }, { "epoch": 0.8764550329931048, "grad_norm": 9.565784346361918, "learning_rate": 7.891714605344313e-09, "loss": 0.7001, "step": 8866 }, { "epoch": 0.8765538887378593, "grad_norm": 4.284172212468677, "learning_rate": 7.879251369735074e-09, "loss": 0.6539, "step": 8867 }, { "epoch": 0.8766527444826138, "grad_norm": 4.3152017623025944, "learning_rate": 7.86679757974975e-09, "loss": 0.7438, "step": 8868 }, { "epoch": 0.8767516002273682, "grad_norm": 3.1837607463527053, "learning_rate": 7.854353236665268e-09, "loss": 0.6375, "step": 8869 }, { "epoch": 0.8768504559721226, "grad_norm": 4.223069809184284, "learning_rate": 7.841918341757603e-09, "loss": 0.6695, "step": 8870 }, { "epoch": 0.8769493117168772, "grad_norm": 2.6906068177400377, "learning_rate": 7.829492896301815e-09, "loss": 0.7215, "step": 8871 }, { "epoch": 0.8770481674616316, "grad_norm": 4.1433598538754355, "learning_rate": 7.817076901571896e-09, "loss": 0.6574, "step": 8872 }, { "epoch": 0.877147023206386, "grad_norm": 3.243791163915268, "learning_rate": 7.804670358840959e-09, "loss": 0.6408, "step": 8873 }, { "epoch": 0.8772458789511406, "grad_norm": 3.6627357929379225, "learning_rate": 7.792273269381088e-09, "loss": 0.693, "step": 8874 }, { "epoch": 0.877344734695895, "grad_norm": 5.880631597536817, "learning_rate": 7.779885634463434e-09, "loss": 0.7728, "step": 8875 }, { "epoch": 0.8774435904406495, "grad_norm": 3.85653198442103, "learning_rate": 7.767507455358191e-09, "loss": 0.631, "step": 8876 }, { "epoch": 0.877542446185404, "grad_norm": 5.012050768160655, "learning_rate": 7.755138733334499e-09, "loss": 0.6924, "step": 8877 }, { "epoch": 0.8776413019301584, "grad_norm": 3.0414707581509233, "learning_rate": 7.742779469660654e-09, "loss": 0.7231, "step": 8878 }, { "epoch": 0.8777401576749129, "grad_norm": 4.464666771081131, "learning_rate": 7.730429665603876e-09, "loss": 0.6047, "step": 8879 }, { "epoch": 0.8778390134196673, "grad_norm": 3.3877946695342755, "learning_rate": 7.718089322430455e-09, "loss": 0.6815, "step": 8880 }, { "epoch": 0.8779378691644218, "grad_norm": 3.1018432624247194, "learning_rate": 7.705758441405707e-09, "loss": 0.6, "step": 8881 }, { "epoch": 0.8780367249091763, "grad_norm": 7.129200808710328, "learning_rate": 7.693437023794024e-09, "loss": 0.7385, "step": 8882 }, { "epoch": 0.8781355806539307, "grad_norm": 3.833043605505092, "learning_rate": 7.681125070858741e-09, "loss": 0.6696, "step": 8883 }, { "epoch": 0.8782344363986853, "grad_norm": 6.289033461064958, "learning_rate": 7.668822583862289e-09, "loss": 0.6499, "step": 8884 }, { "epoch": 0.8783332921434397, "grad_norm": 3.6999890812253944, "learning_rate": 7.656529564066094e-09, "loss": 0.6633, "step": 8885 }, { "epoch": 0.8784321478881941, "grad_norm": 3.692471935404897, "learning_rate": 7.644246012730627e-09, "loss": 0.6271, "step": 8886 }, { "epoch": 0.8785310036329487, "grad_norm": 4.496493740058102, "learning_rate": 7.631971931115399e-09, "loss": 0.7407, "step": 8887 }, { "epoch": 0.8786298593777031, "grad_norm": 8.970234708943824, "learning_rate": 7.619707320478908e-09, "loss": 0.6495, "step": 8888 }, { "epoch": 0.8787287151224575, "grad_norm": 6.107617586757581, "learning_rate": 7.607452182078733e-09, "loss": 0.6601, "step": 8889 }, { "epoch": 0.878827570867212, "grad_norm": 3.8579464745080894, "learning_rate": 7.59520651717146e-09, "loss": 0.6758, "step": 8890 }, { "epoch": 0.8789264266119665, "grad_norm": 4.037929991638243, "learning_rate": 7.582970327012683e-09, "loss": 0.5317, "step": 8891 }, { "epoch": 0.879025282356721, "grad_norm": 14.817822721398237, "learning_rate": 7.570743612857055e-09, "loss": 0.661, "step": 8892 }, { "epoch": 0.8791241381014754, "grad_norm": 7.892104073234358, "learning_rate": 7.558526375958252e-09, "loss": 0.6404, "step": 8893 }, { "epoch": 0.8792229938462299, "grad_norm": 3.3529088777209926, "learning_rate": 7.546318617568937e-09, "loss": 0.6172, "step": 8894 }, { "epoch": 0.8793218495909844, "grad_norm": 4.4804534628956665, "learning_rate": 7.534120338940886e-09, "loss": 0.7499, "step": 8895 }, { "epoch": 0.8794207053357388, "grad_norm": 3.098713483706549, "learning_rate": 7.521931541324832e-09, "loss": 0.6397, "step": 8896 }, { "epoch": 0.8795195610804933, "grad_norm": 5.165933630827799, "learning_rate": 7.509752225970523e-09, "loss": 0.698, "step": 8897 }, { "epoch": 0.8796184168252478, "grad_norm": 3.664141066295289, "learning_rate": 7.497582394126822e-09, "loss": 0.8364, "step": 8898 }, { "epoch": 0.8797172725700022, "grad_norm": 3.1545925551739296, "learning_rate": 7.485422047041523e-09, "loss": 0.7857, "step": 8899 }, { "epoch": 0.8798161283147566, "grad_norm": 3.5645049183033537, "learning_rate": 7.473271185961493e-09, "loss": 0.6608, "step": 8900 }, { "epoch": 0.8799149840595112, "grad_norm": 8.179113785884658, "learning_rate": 7.461129812132671e-09, "loss": 0.6741, "step": 8901 }, { "epoch": 0.8800138398042656, "grad_norm": 4.375366211842156, "learning_rate": 7.448997926799916e-09, "loss": 0.7284, "step": 8902 }, { "epoch": 0.88011269554902, "grad_norm": 4.269435602402366, "learning_rate": 7.43687553120721e-09, "loss": 0.6676, "step": 8903 }, { "epoch": 0.8802115512937746, "grad_norm": 5.253248961095636, "learning_rate": 7.424762626597525e-09, "loss": 0.7035, "step": 8904 }, { "epoch": 0.880310407038529, "grad_norm": 3.1922609650759366, "learning_rate": 7.412659214212835e-09, "loss": 0.7131, "step": 8905 }, { "epoch": 0.8804092627832835, "grad_norm": 5.234084422524008, "learning_rate": 7.400565295294192e-09, "loss": 0.7327, "step": 8906 }, { "epoch": 0.880508118528038, "grad_norm": 3.595250690560991, "learning_rate": 7.388480871081637e-09, "loss": 0.6601, "step": 8907 }, { "epoch": 0.8806069742727924, "grad_norm": 3.8163446480149834, "learning_rate": 7.376405942814245e-09, "loss": 0.6738, "step": 8908 }, { "epoch": 0.8807058300175469, "grad_norm": 6.73393045959171, "learning_rate": 7.364340511730149e-09, "loss": 0.6926, "step": 8909 }, { "epoch": 0.8808046857623013, "grad_norm": 3.9960969394956964, "learning_rate": 7.3522845790664476e-09, "loss": 0.6187, "step": 8910 }, { "epoch": 0.8809035415070559, "grad_norm": 2.8333849529140975, "learning_rate": 7.340238146059308e-09, "loss": 0.7299, "step": 8911 }, { "epoch": 0.8810023972518103, "grad_norm": 4.222306028682343, "learning_rate": 7.328201213943941e-09, "loss": 0.7028, "step": 8912 }, { "epoch": 0.8811012529965647, "grad_norm": 4.455751639541688, "learning_rate": 7.316173783954527e-09, "loss": 0.6933, "step": 8913 }, { "epoch": 0.8812001087413193, "grad_norm": 5.735798550190392, "learning_rate": 7.304155857324301e-09, "loss": 0.5745, "step": 8914 }, { "epoch": 0.8812989644860737, "grad_norm": 3.551166599457233, "learning_rate": 7.292147435285545e-09, "loss": 0.7322, "step": 8915 }, { "epoch": 0.8813978202308281, "grad_norm": 2.9066843139899388, "learning_rate": 7.2801485190695066e-09, "loss": 0.717, "step": 8916 }, { "epoch": 0.8814966759755827, "grad_norm": 3.4969125259322653, "learning_rate": 7.268159109906558e-09, "loss": 0.7309, "step": 8917 }, { "epoch": 0.8815955317203371, "grad_norm": 6.171568072620089, "learning_rate": 7.25617920902597e-09, "loss": 0.7495, "step": 8918 }, { "epoch": 0.8816943874650915, "grad_norm": 4.238434861586912, "learning_rate": 7.2442088176561385e-09, "loss": 0.7496, "step": 8919 }, { "epoch": 0.881793243209846, "grad_norm": 4.438327534461717, "learning_rate": 7.232247937024471e-09, "loss": 0.6449, "step": 8920 }, { "epoch": 0.8818920989546005, "grad_norm": 3.583685052209322, "learning_rate": 7.220296568357343e-09, "loss": 0.7508, "step": 8921 }, { "epoch": 0.881990954699355, "grad_norm": 3.43386795687596, "learning_rate": 7.208354712880194e-09, "loss": 0.7178, "step": 8922 }, { "epoch": 0.8820898104441094, "grad_norm": 3.4553004608937985, "learning_rate": 7.196422371817523e-09, "loss": 0.7438, "step": 8923 }, { "epoch": 0.8821886661888639, "grad_norm": 4.683864535543013, "learning_rate": 7.184499546392763e-09, "loss": 0.607, "step": 8924 }, { "epoch": 0.8822875219336184, "grad_norm": 5.333158442826294, "learning_rate": 7.1725862378284595e-09, "loss": 0.5755, "step": 8925 }, { "epoch": 0.8823863776783728, "grad_norm": 3.2599067773094395, "learning_rate": 7.160682447346145e-09, "loss": 0.6524, "step": 8926 }, { "epoch": 0.8824852334231273, "grad_norm": 3.477099238836639, "learning_rate": 7.1487881761663535e-09, "loss": 0.6713, "step": 8927 }, { "epoch": 0.8825840891678818, "grad_norm": 18.36335269819047, "learning_rate": 7.1369034255087e-09, "loss": 0.6216, "step": 8928 }, { "epoch": 0.8826829449126362, "grad_norm": 5.069234897230882, "learning_rate": 7.125028196591754e-09, "loss": 0.6981, "step": 8929 }, { "epoch": 0.8827818006573906, "grad_norm": 3.133220326217673, "learning_rate": 7.113162490633184e-09, "loss": 0.7839, "step": 8930 }, { "epoch": 0.8828806564021452, "grad_norm": 12.668323792089929, "learning_rate": 7.1013063088496085e-09, "loss": 0.5464, "step": 8931 }, { "epoch": 0.8829795121468996, "grad_norm": 5.748200553569565, "learning_rate": 7.089459652456742e-09, "loss": 0.5639, "step": 8932 }, { "epoch": 0.8830783678916541, "grad_norm": 9.33867054769068, "learning_rate": 7.077622522669236e-09, "loss": 0.567, "step": 8933 }, { "epoch": 0.8831772236364086, "grad_norm": 3.01929452863069, "learning_rate": 7.065794920700852e-09, "loss": 0.5769, "step": 8934 }, { "epoch": 0.883276079381163, "grad_norm": 7.602731212368465, "learning_rate": 7.053976847764309e-09, "loss": 0.8101, "step": 8935 }, { "epoch": 0.8833749351259175, "grad_norm": 6.366955070378368, "learning_rate": 7.042168305071394e-09, "loss": 0.7681, "step": 8936 }, { "epoch": 0.883473790870672, "grad_norm": 3.396994110657782, "learning_rate": 7.0303692938329055e-09, "loss": 0.859, "step": 8937 }, { "epoch": 0.8835726466154264, "grad_norm": 10.030898648430062, "learning_rate": 7.018579815258641e-09, "loss": 0.6341, "step": 8938 }, { "epoch": 0.8836715023601809, "grad_norm": 5.473360144159395, "learning_rate": 7.006799870557445e-09, "loss": 0.775, "step": 8939 }, { "epoch": 0.8837703581049354, "grad_norm": 2.9508525553181184, "learning_rate": 6.995029460937174e-09, "loss": 0.6195, "step": 8940 }, { "epoch": 0.8838692138496899, "grad_norm": 4.171083004447715, "learning_rate": 6.983268587604707e-09, "loss": 0.7373, "step": 8941 }, { "epoch": 0.8839680695944443, "grad_norm": 4.778084407363342, "learning_rate": 6.971517251765968e-09, "loss": 0.7034, "step": 8942 }, { "epoch": 0.8840669253391987, "grad_norm": 4.516531278754341, "learning_rate": 6.959775454625849e-09, "loss": 0.7477, "step": 8943 }, { "epoch": 0.8841657810839533, "grad_norm": 8.842039742015581, "learning_rate": 6.9480431973883204e-09, "loss": 0.7059, "step": 8944 }, { "epoch": 0.8842646368287077, "grad_norm": 4.285733672363656, "learning_rate": 6.936320481256363e-09, "loss": 0.6366, "step": 8945 }, { "epoch": 0.8843634925734621, "grad_norm": 9.926416001866258, "learning_rate": 6.92460730743194e-09, "loss": 0.7045, "step": 8946 }, { "epoch": 0.8844623483182167, "grad_norm": 4.147065608976252, "learning_rate": 6.9129036771161e-09, "loss": 0.6409, "step": 8947 }, { "epoch": 0.8845612040629711, "grad_norm": 2.520575679660517, "learning_rate": 6.901209591508839e-09, "loss": 0.7045, "step": 8948 }, { "epoch": 0.8846600598077256, "grad_norm": 3.7903656249601734, "learning_rate": 6.889525051809242e-09, "loss": 0.7247, "step": 8949 }, { "epoch": 0.8847589155524801, "grad_norm": 3.1128611611619186, "learning_rate": 6.877850059215373e-09, "loss": 0.6842, "step": 8950 }, { "epoch": 0.8848577712972345, "grad_norm": 8.087260785410317, "learning_rate": 6.8661846149243195e-09, "loss": 0.6359, "step": 8951 }, { "epoch": 0.884956627041989, "grad_norm": 3.2153643687804934, "learning_rate": 6.854528720132213e-09, "loss": 0.6939, "step": 8952 }, { "epoch": 0.8850554827867434, "grad_norm": 2.908186717435241, "learning_rate": 6.842882376034209e-09, "loss": 0.745, "step": 8953 }, { "epoch": 0.8851543385314979, "grad_norm": 2.956046452326921, "learning_rate": 6.831245583824441e-09, "loss": 0.7485, "step": 8954 }, { "epoch": 0.8852531942762524, "grad_norm": 3.2760740178402186, "learning_rate": 6.819618344696099e-09, "loss": 0.7091, "step": 8955 }, { "epoch": 0.8853520500210068, "grad_norm": 16.015330113239326, "learning_rate": 6.808000659841406e-09, "loss": 0.7451, "step": 8956 }, { "epoch": 0.8854509057657614, "grad_norm": 5.6723996434187525, "learning_rate": 6.796392530451556e-09, "loss": 0.654, "step": 8957 }, { "epoch": 0.8855497615105158, "grad_norm": 4.974980975513896, "learning_rate": 6.784793957716795e-09, "loss": 0.6556, "step": 8958 }, { "epoch": 0.8856486172552702, "grad_norm": 6.157163165419612, "learning_rate": 6.773204942826405e-09, "loss": 0.8166, "step": 8959 }, { "epoch": 0.8857474730000248, "grad_norm": 4.180858284192593, "learning_rate": 6.761625486968647e-09, "loss": 0.779, "step": 8960 }, { "epoch": 0.8858463287447792, "grad_norm": 4.3450298005889945, "learning_rate": 6.75005559133085e-09, "loss": 0.766, "step": 8961 }, { "epoch": 0.8859451844895336, "grad_norm": 3.1553343552652136, "learning_rate": 6.738495257099297e-09, "loss": 0.7613, "step": 8962 }, { "epoch": 0.8860440402342881, "grad_norm": 4.350488321705772, "learning_rate": 6.726944485459352e-09, "loss": 0.6, "step": 8963 }, { "epoch": 0.8861428959790426, "grad_norm": 4.16114426122731, "learning_rate": 6.715403277595399e-09, "loss": 0.6613, "step": 8964 }, { "epoch": 0.886241751723797, "grad_norm": 5.7414927995208584, "learning_rate": 6.703871634690772e-09, "loss": 0.744, "step": 8965 }, { "epoch": 0.8863406074685515, "grad_norm": 4.28154950492256, "learning_rate": 6.692349557927912e-09, "loss": 0.6286, "step": 8966 }, { "epoch": 0.886439463213306, "grad_norm": 9.682879403320705, "learning_rate": 6.68083704848822e-09, "loss": 0.8129, "step": 8967 }, { "epoch": 0.8865383189580605, "grad_norm": 8.135506802537659, "learning_rate": 6.669334107552116e-09, "loss": 0.7295, "step": 8968 }, { "epoch": 0.8866371747028149, "grad_norm": 6.071801768121945, "learning_rate": 6.65784073629907e-09, "loss": 0.6906, "step": 8969 }, { "epoch": 0.8867360304475694, "grad_norm": 5.094894620707953, "learning_rate": 6.646356935907594e-09, "loss": 0.7817, "step": 8970 }, { "epoch": 0.8868348861923239, "grad_norm": 3.19983089758021, "learning_rate": 6.634882707555123e-09, "loss": 0.6953, "step": 8971 }, { "epoch": 0.8869337419370783, "grad_norm": 6.822290209337686, "learning_rate": 6.623418052418217e-09, "loss": 0.8176, "step": 8972 }, { "epoch": 0.8870325976818327, "grad_norm": 4.147547261845449, "learning_rate": 6.611962971672369e-09, "loss": 0.9011, "step": 8973 }, { "epoch": 0.8871314534265873, "grad_norm": 7.535114354216261, "learning_rate": 6.600517466492151e-09, "loss": 0.644, "step": 8974 }, { "epoch": 0.8872303091713417, "grad_norm": 4.79003518255959, "learning_rate": 6.589081538051144e-09, "loss": 0.7822, "step": 8975 }, { "epoch": 0.8873291649160961, "grad_norm": 5.562793045303385, "learning_rate": 6.577655187521902e-09, "loss": 0.6674, "step": 8976 }, { "epoch": 0.8874280206608507, "grad_norm": 3.09788781676729, "learning_rate": 6.566238416076053e-09, "loss": 0.6766, "step": 8977 }, { "epoch": 0.8875268764056051, "grad_norm": 2.968956097039168, "learning_rate": 6.554831224884206e-09, "loss": 0.702, "step": 8978 }, { "epoch": 0.8876257321503596, "grad_norm": 6.9303927536382925, "learning_rate": 6.543433615116001e-09, "loss": 0.7222, "step": 8979 }, { "epoch": 0.8877245878951141, "grad_norm": 7.8917198025548485, "learning_rate": 6.532045587940094e-09, "loss": 0.7229, "step": 8980 }, { "epoch": 0.8878234436398685, "grad_norm": 4.835954142192795, "learning_rate": 6.520667144524183e-09, "loss": 0.7248, "step": 8981 }, { "epoch": 0.887922299384623, "grad_norm": 40.49474806392048, "learning_rate": 6.5092982860349146e-09, "loss": 0.5968, "step": 8982 }, { "epoch": 0.8880211551293774, "grad_norm": 4.190641662101274, "learning_rate": 6.497939013638043e-09, "loss": 0.7244, "step": 8983 }, { "epoch": 0.888120010874132, "grad_norm": 4.533030723035096, "learning_rate": 6.486589328498271e-09, "loss": 0.7445, "step": 8984 }, { "epoch": 0.8882188666188864, "grad_norm": 2.7907242597095587, "learning_rate": 6.475249231779334e-09, "loss": 0.6589, "step": 8985 }, { "epoch": 0.8883177223636408, "grad_norm": 3.190521678225836, "learning_rate": 6.463918724644013e-09, "loss": 0.6019, "step": 8986 }, { "epoch": 0.8884165781083954, "grad_norm": 3.8617659041462575, "learning_rate": 6.452597808254046e-09, "loss": 0.6229, "step": 8987 }, { "epoch": 0.8885154338531498, "grad_norm": 4.428409455016847, "learning_rate": 6.44128648377027e-09, "loss": 0.6376, "step": 8988 }, { "epoch": 0.8886142895979042, "grad_norm": 4.540746245508736, "learning_rate": 6.42998475235248e-09, "loss": 0.6683, "step": 8989 }, { "epoch": 0.8887131453426588, "grad_norm": 3.6543412219064075, "learning_rate": 6.418692615159493e-09, "loss": 0.7449, "step": 8990 }, { "epoch": 0.8888120010874132, "grad_norm": 3.893454205360025, "learning_rate": 6.407410073349151e-09, "loss": 0.6743, "step": 8991 }, { "epoch": 0.8889108568321676, "grad_norm": 14.483955770154605, "learning_rate": 6.396137128078327e-09, "loss": 0.6984, "step": 8992 }, { "epoch": 0.8890097125769221, "grad_norm": 3.7805706005587876, "learning_rate": 6.3848737805028865e-09, "loss": 0.703, "step": 8993 }, { "epoch": 0.8891085683216766, "grad_norm": 5.058099376722726, "learning_rate": 6.373620031777727e-09, "loss": 0.6807, "step": 8994 }, { "epoch": 0.889207424066431, "grad_norm": 4.647019801950886, "learning_rate": 6.3623758830567256e-09, "loss": 0.622, "step": 8995 }, { "epoch": 0.8893062798111855, "grad_norm": 6.4047763710172525, "learning_rate": 6.351141335492827e-09, "loss": 0.6561, "step": 8996 }, { "epoch": 0.88940513555594, "grad_norm": 2.7613180098149552, "learning_rate": 6.339916390237987e-09, "loss": 0.6959, "step": 8997 }, { "epoch": 0.8895039913006945, "grad_norm": 4.957547271908797, "learning_rate": 6.32870104844313e-09, "loss": 0.686, "step": 8998 }, { "epoch": 0.8896028470454489, "grad_norm": 3.8640531531698707, "learning_rate": 6.317495311258214e-09, "loss": 0.6848, "step": 8999 }, { "epoch": 0.8897017027902034, "grad_norm": 13.329761460473481, "learning_rate": 6.3062991798322644e-09, "loss": 0.6856, "step": 9000 }, { "epoch": 0.8898005585349579, "grad_norm": 5.135860137562225, "learning_rate": 6.295112655313261e-09, "loss": 0.666, "step": 9001 }, { "epoch": 0.8898994142797123, "grad_norm": 4.181638158252477, "learning_rate": 6.28393573884819e-09, "loss": 0.6913, "step": 9002 }, { "epoch": 0.8899982700244667, "grad_norm": 2.946539925146132, "learning_rate": 6.272768431583109e-09, "loss": 0.6394, "step": 9003 }, { "epoch": 0.8900971257692213, "grad_norm": 7.307136132147955, "learning_rate": 6.261610734663037e-09, "loss": 0.7523, "step": 9004 }, { "epoch": 0.8901959815139757, "grad_norm": 2.96374021312273, "learning_rate": 6.250462649232069e-09, "loss": 0.7002, "step": 9005 }, { "epoch": 0.8902948372587302, "grad_norm": 6.976320102927197, "learning_rate": 6.239324176433236e-09, "loss": 0.7786, "step": 9006 }, { "epoch": 0.8903936930034847, "grad_norm": 4.312248087019854, "learning_rate": 6.228195317408635e-09, "loss": 0.7042, "step": 9007 }, { "epoch": 0.8904925487482391, "grad_norm": 3.4778481871968325, "learning_rate": 6.217076073299388e-09, "loss": 0.6438, "step": 9008 }, { "epoch": 0.8905914044929936, "grad_norm": 3.8717620898674387, "learning_rate": 6.2059664452455804e-09, "loss": 0.7497, "step": 9009 }, { "epoch": 0.8906902602377481, "grad_norm": 2.9582186991739685, "learning_rate": 6.194866434386359e-09, "loss": 0.7707, "step": 9010 }, { "epoch": 0.8907891159825025, "grad_norm": 3.414150571634223, "learning_rate": 6.1837760418598784e-09, "loss": 0.713, "step": 9011 }, { "epoch": 0.890887971727257, "grad_norm": 12.10026033582786, "learning_rate": 6.172695268803263e-09, "loss": 0.7788, "step": 9012 }, { "epoch": 0.8909868274720115, "grad_norm": 4.218361531726069, "learning_rate": 6.1616241163526925e-09, "loss": 0.6538, "step": 9013 }, { "epoch": 0.891085683216766, "grad_norm": 3.0889679845656173, "learning_rate": 6.150562585643371e-09, "loss": 0.7331, "step": 9014 }, { "epoch": 0.8911845389615204, "grad_norm": 3.490195572766684, "learning_rate": 6.1395106778094675e-09, "loss": 0.7725, "step": 9015 }, { "epoch": 0.8912833947062748, "grad_norm": 3.3485301183015426, "learning_rate": 6.12846839398421e-09, "loss": 0.6797, "step": 9016 }, { "epoch": 0.8913822504510294, "grad_norm": 4.212954671576331, "learning_rate": 6.117435735299814e-09, "loss": 0.752, "step": 9017 }, { "epoch": 0.8914811061957838, "grad_norm": 6.929240985820221, "learning_rate": 6.10641270288752e-09, "loss": 0.8735, "step": 9018 }, { "epoch": 0.8915799619405382, "grad_norm": 5.083297109324743, "learning_rate": 6.095399297877568e-09, "loss": 0.7884, "step": 9019 }, { "epoch": 0.8916788176852928, "grad_norm": 6.989054297516516, "learning_rate": 6.084395521399244e-09, "loss": 0.7774, "step": 9020 }, { "epoch": 0.8917776734300472, "grad_norm": 4.653964383788346, "learning_rate": 6.073401374580789e-09, "loss": 0.6881, "step": 9021 }, { "epoch": 0.8918765291748016, "grad_norm": 4.165552676218666, "learning_rate": 6.062416858549524e-09, "loss": 0.7449, "step": 9022 }, { "epoch": 0.8919753849195562, "grad_norm": 11.479109517702655, "learning_rate": 6.051441974431726e-09, "loss": 0.7049, "step": 9023 }, { "epoch": 0.8920742406643106, "grad_norm": 3.4793859662421665, "learning_rate": 6.040476723352705e-09, "loss": 0.8017, "step": 9024 }, { "epoch": 0.8921730964090651, "grad_norm": 3.4095329537012713, "learning_rate": 6.029521106436819e-09, "loss": 0.6643, "step": 9025 }, { "epoch": 0.8922719521538195, "grad_norm": 5.068537630488904, "learning_rate": 6.018575124807357e-09, "loss": 0.6505, "step": 9026 }, { "epoch": 0.892370807898574, "grad_norm": 3.805180777231529, "learning_rate": 6.00763877958671e-09, "loss": 0.6463, "step": 9027 }, { "epoch": 0.8924696636433285, "grad_norm": 8.495149118695764, "learning_rate": 5.996712071896204e-09, "loss": 0.709, "step": 9028 }, { "epoch": 0.8925685193880829, "grad_norm": 5.255575198595314, "learning_rate": 5.985795002856231e-09, "loss": 0.7306, "step": 9029 }, { "epoch": 0.8926673751328374, "grad_norm": 13.868492499827381, "learning_rate": 5.974887573586196e-09, "loss": 0.6884, "step": 9030 }, { "epoch": 0.8927662308775919, "grad_norm": 4.086260077727874, "learning_rate": 5.963989785204448e-09, "loss": 0.6899, "step": 9031 }, { "epoch": 0.8928650866223463, "grad_norm": 4.83524296422827, "learning_rate": 5.953101638828417e-09, "loss": 0.6506, "step": 9032 }, { "epoch": 0.8929639423671009, "grad_norm": 3.524309093097231, "learning_rate": 5.942223135574554e-09, "loss": 0.6956, "step": 9033 }, { "epoch": 0.8930627981118553, "grad_norm": 3.061242048582986, "learning_rate": 5.9313542765582316e-09, "loss": 0.8005, "step": 9034 }, { "epoch": 0.8931616538566097, "grad_norm": 5.033840740380672, "learning_rate": 5.920495062893927e-09, "loss": 0.6631, "step": 9035 }, { "epoch": 0.8932605096013642, "grad_norm": 7.325986510286813, "learning_rate": 5.9096454956950816e-09, "loss": 0.7263, "step": 9036 }, { "epoch": 0.8933593653461187, "grad_norm": 2.916857839480505, "learning_rate": 5.898805576074162e-09, "loss": 0.6897, "step": 9037 }, { "epoch": 0.8934582210908731, "grad_norm": 3.9232604715027417, "learning_rate": 5.887975305142656e-09, "loss": 0.6747, "step": 9038 }, { "epoch": 0.8935570768356276, "grad_norm": 5.587357331909816, "learning_rate": 5.877154684010999e-09, "loss": 0.6924, "step": 9039 }, { "epoch": 0.8936559325803821, "grad_norm": 3.416957362285262, "learning_rate": 5.866343713788735e-09, "loss": 0.6609, "step": 9040 }, { "epoch": 0.8937547883251366, "grad_norm": 3.599630724364669, "learning_rate": 5.855542395584368e-09, "loss": 0.563, "step": 9041 }, { "epoch": 0.893853644069891, "grad_norm": 6.825961874497692, "learning_rate": 5.844750730505377e-09, "loss": 0.7298, "step": 9042 }, { "epoch": 0.8939524998146455, "grad_norm": 6.6110321370393, "learning_rate": 5.833968719658311e-09, "loss": 0.6394, "step": 9043 }, { "epoch": 0.8940513555594, "grad_norm": 11.720121781431233, "learning_rate": 5.82319636414873e-09, "loss": 0.6136, "step": 9044 }, { "epoch": 0.8941502113041544, "grad_norm": 9.3804149128615, "learning_rate": 5.812433665081129e-09, "loss": 0.7575, "step": 9045 }, { "epoch": 0.8942490670489088, "grad_norm": 4.823477261196422, "learning_rate": 5.801680623559102e-09, "loss": 0.7094, "step": 9046 }, { "epoch": 0.8943479227936634, "grad_norm": 8.46788510396276, "learning_rate": 5.790937240685212e-09, "loss": 0.6917, "step": 9047 }, { "epoch": 0.8944467785384178, "grad_norm": 11.54979699663987, "learning_rate": 5.780203517561011e-09, "loss": 0.7065, "step": 9048 }, { "epoch": 0.8945456342831722, "grad_norm": 2.912727532031011, "learning_rate": 5.7694794552871185e-09, "loss": 0.6422, "step": 9049 }, { "epoch": 0.8946444900279268, "grad_norm": 5.304295995793127, "learning_rate": 5.7587650549630994e-09, "loss": 0.733, "step": 9050 }, { "epoch": 0.8947433457726812, "grad_norm": 2.661527394536616, "learning_rate": 5.748060317687564e-09, "loss": 0.5528, "step": 9051 }, { "epoch": 0.8948422015174357, "grad_norm": 3.183297272318141, "learning_rate": 5.737365244558145e-09, "loss": 0.7162, "step": 9052 }, { "epoch": 0.8949410572621902, "grad_norm": 4.6871734251818795, "learning_rate": 5.726679836671422e-09, "loss": 0.8039, "step": 9053 }, { "epoch": 0.8950399130069446, "grad_norm": 3.3112091256687264, "learning_rate": 5.716004095123084e-09, "loss": 0.6498, "step": 9054 }, { "epoch": 0.8951387687516991, "grad_norm": 19.79664047289547, "learning_rate": 5.705338021007733e-09, "loss": 0.7042, "step": 9055 }, { "epoch": 0.8952376244964535, "grad_norm": 3.1102504157905333, "learning_rate": 5.694681615419006e-09, "loss": 0.6338, "step": 9056 }, { "epoch": 0.895336480241208, "grad_norm": 6.993121660563989, "learning_rate": 5.684034879449573e-09, "loss": 0.699, "step": 9057 }, { "epoch": 0.8954353359859625, "grad_norm": 5.430977169890235, "learning_rate": 5.673397814191128e-09, "loss": 0.5812, "step": 9058 }, { "epoch": 0.8955341917307169, "grad_norm": 4.482412218566147, "learning_rate": 5.662770420734308e-09, "loss": 0.7535, "step": 9059 }, { "epoch": 0.8956330474754715, "grad_norm": 5.1549890919692025, "learning_rate": 5.6521527001688216e-09, "loss": 0.7584, "step": 9060 }, { "epoch": 0.8957319032202259, "grad_norm": 5.1675340776152, "learning_rate": 5.64154465358333e-09, "loss": 0.8233, "step": 9061 }, { "epoch": 0.8958307589649803, "grad_norm": 3.850055254095426, "learning_rate": 5.630946282065552e-09, "loss": 0.764, "step": 9062 }, { "epoch": 0.8959296147097349, "grad_norm": 5.703398948245286, "learning_rate": 5.620357586702207e-09, "loss": 0.6532, "step": 9063 }, { "epoch": 0.8960284704544893, "grad_norm": 5.740486459681621, "learning_rate": 5.609778568578982e-09, "loss": 0.6895, "step": 9064 }, { "epoch": 0.8961273261992437, "grad_norm": 4.112036684430276, "learning_rate": 5.5992092287805995e-09, "loss": 0.7801, "step": 9065 }, { "epoch": 0.8962261819439982, "grad_norm": 5.867129017590787, "learning_rate": 5.5886495683908244e-09, "loss": 0.6395, "step": 9066 }, { "epoch": 0.8963250376887527, "grad_norm": 3.9431844818610378, "learning_rate": 5.5780995884923575e-09, "loss": 0.6978, "step": 9067 }, { "epoch": 0.8964238934335071, "grad_norm": 3.4360856451310138, "learning_rate": 5.5675592901669566e-09, "loss": 0.7497, "step": 9068 }, { "epoch": 0.8965227491782616, "grad_norm": 9.984321583756216, "learning_rate": 5.557028674495379e-09, "loss": 0.7033, "step": 9069 }, { "epoch": 0.8966216049230161, "grad_norm": 21.866246961997962, "learning_rate": 5.5465077425573715e-09, "loss": 0.6281, "step": 9070 }, { "epoch": 0.8967204606677706, "grad_norm": 3.349559942110378, "learning_rate": 5.535996495431728e-09, "loss": 0.7403, "step": 9071 }, { "epoch": 0.896819316412525, "grad_norm": 4.8741040817045596, "learning_rate": 5.525494934196195e-09, "loss": 0.7504, "step": 9072 }, { "epoch": 0.8969181721572795, "grad_norm": 8.522772128530024, "learning_rate": 5.5150030599275366e-09, "loss": 0.7928, "step": 9073 }, { "epoch": 0.897017027902034, "grad_norm": 5.691304670056707, "learning_rate": 5.5045208737015905e-09, "loss": 0.6849, "step": 9074 }, { "epoch": 0.8971158836467884, "grad_norm": 4.883295442914156, "learning_rate": 5.494048376593108e-09, "loss": 0.7323, "step": 9075 }, { "epoch": 0.8972147393915428, "grad_norm": 3.6902118230138177, "learning_rate": 5.483585569675897e-09, "loss": 0.7778, "step": 9076 }, { "epoch": 0.8973135951362974, "grad_norm": 3.3539238296993075, "learning_rate": 5.473132454022788e-09, "loss": 0.675, "step": 9077 }, { "epoch": 0.8974124508810518, "grad_norm": 3.119658709639869, "learning_rate": 5.462689030705558e-09, "loss": 0.6749, "step": 9078 }, { "epoch": 0.8975113066258062, "grad_norm": 2.999884114519488, "learning_rate": 5.45225530079505e-09, "loss": 0.648, "step": 9079 }, { "epoch": 0.8976101623705608, "grad_norm": 4.160503732036003, "learning_rate": 5.441831265361096e-09, "loss": 0.7173, "step": 9080 }, { "epoch": 0.8977090181153152, "grad_norm": 4.167825264338345, "learning_rate": 5.431416925472498e-09, "loss": 0.6897, "step": 9081 }, { "epoch": 0.8978078738600697, "grad_norm": 7.392268336783591, "learning_rate": 5.4210122821971235e-09, "loss": 0.6878, "step": 9082 }, { "epoch": 0.8979067296048242, "grad_norm": 3.4941309848222066, "learning_rate": 5.410617336601797e-09, "loss": 0.7539, "step": 9083 }, { "epoch": 0.8980055853495786, "grad_norm": 8.375166878139579, "learning_rate": 5.400232089752355e-09, "loss": 0.6155, "step": 9084 }, { "epoch": 0.8981044410943331, "grad_norm": 3.1230967650450787, "learning_rate": 5.389856542713689e-09, "loss": 0.6892, "step": 9085 }, { "epoch": 0.8982032968390875, "grad_norm": 3.995060930152873, "learning_rate": 5.379490696549627e-09, "loss": 0.7315, "step": 9086 }, { "epoch": 0.898302152583842, "grad_norm": 3.639293663231565, "learning_rate": 5.369134552323051e-09, "loss": 0.7155, "step": 9087 }, { "epoch": 0.8984010083285965, "grad_norm": 3.610226268734547, "learning_rate": 5.358788111095814e-09, "loss": 0.754, "step": 9088 }, { "epoch": 0.8984998640733509, "grad_norm": 3.57608861158042, "learning_rate": 5.348451373928808e-09, "loss": 0.7054, "step": 9089 }, { "epoch": 0.8985987198181055, "grad_norm": 5.857492662603637, "learning_rate": 5.3381243418819e-09, "loss": 0.787, "step": 9090 }, { "epoch": 0.8986975755628599, "grad_norm": 16.150956912896042, "learning_rate": 5.327807016013985e-09, "loss": 0.659, "step": 9091 }, { "epoch": 0.8987964313076143, "grad_norm": 6.72392288990043, "learning_rate": 5.3174993973829404e-09, "loss": 0.8339, "step": 9092 }, { "epoch": 0.8988952870523689, "grad_norm": 8.20118051089745, "learning_rate": 5.307201487045676e-09, "loss": 0.6369, "step": 9093 }, { "epoch": 0.8989941427971233, "grad_norm": 8.282513268639477, "learning_rate": 5.296913286058069e-09, "loss": 0.6453, "step": 9094 }, { "epoch": 0.8990929985418777, "grad_norm": 3.348955257899067, "learning_rate": 5.286634795475042e-09, "loss": 0.6654, "step": 9095 }, { "epoch": 0.8991918542866323, "grad_norm": 6.222665655154139, "learning_rate": 5.276366016350508e-09, "loss": 0.7257, "step": 9096 }, { "epoch": 0.8992907100313867, "grad_norm": 4.21566590862493, "learning_rate": 5.266106949737348e-09, "loss": 0.7653, "step": 9097 }, { "epoch": 0.8993895657761412, "grad_norm": 4.900619409465956, "learning_rate": 5.255857596687497e-09, "loss": 0.6568, "step": 9098 }, { "epoch": 0.8994884215208956, "grad_norm": 3.6187592824639436, "learning_rate": 5.245617958251891e-09, "loss": 0.6318, "step": 9099 }, { "epoch": 0.8995872772656501, "grad_norm": 10.901697593603277, "learning_rate": 5.235388035480426e-09, "loss": 0.782, "step": 9100 }, { "epoch": 0.8996861330104046, "grad_norm": 5.750625274174728, "learning_rate": 5.2251678294220395e-09, "loss": 0.6619, "step": 9101 }, { "epoch": 0.899784988755159, "grad_norm": 5.183019381626044, "learning_rate": 5.214957341124682e-09, "loss": 0.5876, "step": 9102 }, { "epoch": 0.8998838444999135, "grad_norm": 3.0465757329157794, "learning_rate": 5.204756571635261e-09, "loss": 0.7354, "step": 9103 }, { "epoch": 0.899982700244668, "grad_norm": 3.4343915947460513, "learning_rate": 5.194565521999728e-09, "loss": 0.7344, "step": 9104 }, { "epoch": 0.9000815559894224, "grad_norm": 9.604479466289362, "learning_rate": 5.184384193263014e-09, "loss": 0.6272, "step": 9105 }, { "epoch": 0.900180411734177, "grad_norm": 5.078502020966605, "learning_rate": 5.174212586469084e-09, "loss": 0.6087, "step": 9106 }, { "epoch": 0.9002792674789314, "grad_norm": 8.839361932451537, "learning_rate": 5.16405070266086e-09, "loss": 0.7852, "step": 9107 }, { "epoch": 0.9003781232236858, "grad_norm": 16.291408825058742, "learning_rate": 5.153898542880331e-09, "loss": 0.6661, "step": 9108 }, { "epoch": 0.9004769789684403, "grad_norm": 9.988033833899815, "learning_rate": 5.1437561081684065e-09, "loss": 0.6419, "step": 9109 }, { "epoch": 0.9005758347131948, "grad_norm": 8.327939792923141, "learning_rate": 5.133623399565079e-09, "loss": 0.6527, "step": 9110 }, { "epoch": 0.9006746904579492, "grad_norm": 3.5837198660212928, "learning_rate": 5.123500418109272e-09, "loss": 0.658, "step": 9111 }, { "epoch": 0.9007735462027037, "grad_norm": 12.63504232241865, "learning_rate": 5.113387164838989e-09, "loss": 0.7589, "step": 9112 }, { "epoch": 0.9008724019474582, "grad_norm": 7.094367936253062, "learning_rate": 5.103283640791178e-09, "loss": 0.6842, "step": 9113 }, { "epoch": 0.9009712576922126, "grad_norm": 3.267531718388435, "learning_rate": 5.0931898470017886e-09, "loss": 0.662, "step": 9114 }, { "epoch": 0.9010701134369671, "grad_norm": 4.368424453260849, "learning_rate": 5.083105784505826e-09, "loss": 0.7322, "step": 9115 }, { "epoch": 0.9011689691817216, "grad_norm": 4.0387402343416205, "learning_rate": 5.073031454337229e-09, "loss": 0.5428, "step": 9116 }, { "epoch": 0.9012678249264761, "grad_norm": 5.515160745670795, "learning_rate": 5.062966857528983e-09, "loss": 0.6834, "step": 9117 }, { "epoch": 0.9013666806712305, "grad_norm": 7.6455174936224815, "learning_rate": 5.052911995113074e-09, "loss": 0.6217, "step": 9118 }, { "epoch": 0.9014655364159849, "grad_norm": 4.54992581097053, "learning_rate": 5.042866868120466e-09, "loss": 0.7917, "step": 9119 }, { "epoch": 0.9015643921607395, "grad_norm": 4.1676280825882, "learning_rate": 5.032831477581145e-09, "loss": 0.7805, "step": 9120 }, { "epoch": 0.9016632479054939, "grad_norm": 2.726208934793631, "learning_rate": 5.022805824524101e-09, "loss": 0.5852, "step": 9121 }, { "epoch": 0.9017621036502483, "grad_norm": 3.3024583625912673, "learning_rate": 5.0127899099772995e-09, "loss": 0.7409, "step": 9122 }, { "epoch": 0.9018609593950029, "grad_norm": 3.0840779595691874, "learning_rate": 5.002783734967742e-09, "loss": 0.5984, "step": 9123 }, { "epoch": 0.9019598151397573, "grad_norm": 3.952418716489158, "learning_rate": 4.992787300521395e-09, "loss": 0.7413, "step": 9124 }, { "epoch": 0.9020586708845117, "grad_norm": 5.984436501574509, "learning_rate": 4.9828006076632626e-09, "loss": 0.6936, "step": 9125 }, { "epoch": 0.9021575266292663, "grad_norm": 6.678080411569986, "learning_rate": 4.9728236574173245e-09, "loss": 0.6778, "step": 9126 }, { "epoch": 0.9022563823740207, "grad_norm": 4.137278296645323, "learning_rate": 4.962856450806563e-09, "loss": 0.7594, "step": 9127 }, { "epoch": 0.9023552381187752, "grad_norm": 3.09541510390098, "learning_rate": 4.952898988852972e-09, "loss": 0.5969, "step": 9128 }, { "epoch": 0.9024540938635296, "grad_norm": 3.298622031148859, "learning_rate": 4.942951272577567e-09, "loss": 0.6166, "step": 9129 }, { "epoch": 0.9025529496082841, "grad_norm": 2.9604188938755382, "learning_rate": 4.933013303000299e-09, "loss": 0.7895, "step": 9130 }, { "epoch": 0.9026518053530386, "grad_norm": 3.3278858436309444, "learning_rate": 4.923085081140188e-09, "loss": 0.7391, "step": 9131 }, { "epoch": 0.902750661097793, "grad_norm": 4.693182347030312, "learning_rate": 4.913166608015218e-09, "loss": 0.7931, "step": 9132 }, { "epoch": 0.9028495168425476, "grad_norm": 3.40290267741298, "learning_rate": 4.9032578846423865e-09, "loss": 0.7497, "step": 9133 }, { "epoch": 0.902948372587302, "grad_norm": 4.445475895764255, "learning_rate": 4.8933589120376706e-09, "loss": 0.7445, "step": 9134 }, { "epoch": 0.9030472283320564, "grad_norm": 27.10899936477761, "learning_rate": 4.883469691216091e-09, "loss": 0.7498, "step": 9135 }, { "epoch": 0.903146084076811, "grad_norm": 4.828366005176653, "learning_rate": 4.8735902231916144e-09, "loss": 0.7948, "step": 9136 }, { "epoch": 0.9032449398215654, "grad_norm": 4.520922345392458, "learning_rate": 4.863720508977265e-09, "loss": 0.7133, "step": 9137 }, { "epoch": 0.9033437955663198, "grad_norm": 3.9366300668740357, "learning_rate": 4.853860549584998e-09, "loss": 0.6592, "step": 9138 }, { "epoch": 0.9034426513110743, "grad_norm": 5.082017815111666, "learning_rate": 4.84401034602584e-09, "loss": 0.6666, "step": 9139 }, { "epoch": 0.9035415070558288, "grad_norm": 3.659387955031945, "learning_rate": 4.83416989930977e-09, "loss": 0.7275, "step": 9140 }, { "epoch": 0.9036403628005832, "grad_norm": 6.527314847258016, "learning_rate": 4.824339210445794e-09, "loss": 0.6639, "step": 9141 }, { "epoch": 0.9037392185453377, "grad_norm": 6.170357481234975, "learning_rate": 4.81451828044187e-09, "loss": 0.666, "step": 9142 }, { "epoch": 0.9038380742900922, "grad_norm": 4.15418579377171, "learning_rate": 4.8047071103050284e-09, "loss": 0.672, "step": 9143 }, { "epoch": 0.9039369300348467, "grad_norm": 5.700091675504564, "learning_rate": 4.79490570104123e-09, "loss": 0.7578, "step": 9144 }, { "epoch": 0.9040357857796011, "grad_norm": 6.306017738963321, "learning_rate": 4.785114053655481e-09, "loss": 0.7114, "step": 9145 }, { "epoch": 0.9041346415243556, "grad_norm": 4.353419037464043, "learning_rate": 4.77533216915178e-09, "loss": 0.649, "step": 9146 }, { "epoch": 0.9042334972691101, "grad_norm": 5.196233454332332, "learning_rate": 4.765560048533079e-09, "loss": 0.7439, "step": 9147 }, { "epoch": 0.9043323530138645, "grad_norm": 3.229132138747151, "learning_rate": 4.755797692801411e-09, "loss": 0.732, "step": 9148 }, { "epoch": 0.9044312087586189, "grad_norm": 3.430926671494994, "learning_rate": 4.74604510295773e-09, "loss": 0.7313, "step": 9149 }, { "epoch": 0.9045300645033735, "grad_norm": 4.993021636451876, "learning_rate": 4.736302280002025e-09, "loss": 0.6791, "step": 9150 }, { "epoch": 0.9046289202481279, "grad_norm": 3.7545931791677902, "learning_rate": 4.726569224933286e-09, "loss": 0.7077, "step": 9151 }, { "epoch": 0.9047277759928823, "grad_norm": 4.492517168270949, "learning_rate": 4.7168459387494915e-09, "loss": 0.7858, "step": 9152 }, { "epoch": 0.9048266317376369, "grad_norm": 4.8936173155249865, "learning_rate": 4.707132422447613e-09, "loss": 0.6661, "step": 9153 }, { "epoch": 0.9049254874823913, "grad_norm": 8.0483465117594, "learning_rate": 4.6974286770236514e-09, "loss": 0.7807, "step": 9154 }, { "epoch": 0.9050243432271458, "grad_norm": 5.0692161784939, "learning_rate": 4.687734703472546e-09, "loss": 0.6535, "step": 9155 }, { "epoch": 0.9051231989719003, "grad_norm": 7.857257734350887, "learning_rate": 4.6780505027883e-09, "loss": 0.6921, "step": 9156 }, { "epoch": 0.9052220547166547, "grad_norm": 3.641741634881134, "learning_rate": 4.668376075963898e-09, "loss": 0.7254, "step": 9157 }, { "epoch": 0.9053209104614092, "grad_norm": 7.366780785575918, "learning_rate": 4.65871142399128e-09, "loss": 0.8213, "step": 9158 }, { "epoch": 0.9054197662061636, "grad_norm": 3.667250024952076, "learning_rate": 4.649056547861407e-09, "loss": 0.6443, "step": 9159 }, { "epoch": 0.9055186219509181, "grad_norm": 3.8927745258362236, "learning_rate": 4.639411448564279e-09, "loss": 0.7763, "step": 9160 }, { "epoch": 0.9056174776956726, "grad_norm": 6.286606537532318, "learning_rate": 4.629776127088814e-09, "loss": 0.761, "step": 9161 }, { "epoch": 0.905716333440427, "grad_norm": 4.462630520304676, "learning_rate": 4.620150584423021e-09, "loss": 0.7915, "step": 9162 }, { "epoch": 0.9058151891851816, "grad_norm": 12.323341772066613, "learning_rate": 4.610534821553824e-09, "loss": 0.6968, "step": 9163 }, { "epoch": 0.905914044929936, "grad_norm": 3.3455902280067558, "learning_rate": 4.600928839467177e-09, "loss": 0.7132, "step": 9164 }, { "epoch": 0.9060129006746904, "grad_norm": 8.660503840222322, "learning_rate": 4.59133263914806e-09, "loss": 0.723, "step": 9165 }, { "epoch": 0.906111756419445, "grad_norm": 7.178735333852051, "learning_rate": 4.581746221580396e-09, "loss": 0.6066, "step": 9166 }, { "epoch": 0.9062106121641994, "grad_norm": 4.793942003078857, "learning_rate": 4.572169587747132e-09, "loss": 0.8233, "step": 9167 }, { "epoch": 0.9063094679089538, "grad_norm": 4.1479795756526805, "learning_rate": 4.562602738630239e-09, "loss": 0.6846, "step": 9168 }, { "epoch": 0.9064083236537084, "grad_norm": 12.328306075205111, "learning_rate": 4.55304567521061e-09, "loss": 0.6102, "step": 9169 }, { "epoch": 0.9065071793984628, "grad_norm": 7.18462582334617, "learning_rate": 4.543498398468226e-09, "loss": 0.5514, "step": 9170 }, { "epoch": 0.9066060351432172, "grad_norm": 4.7173500363385505, "learning_rate": 4.5339609093819845e-09, "loss": 0.8319, "step": 9171 }, { "epoch": 0.9067048908879717, "grad_norm": 3.4431375693037576, "learning_rate": 4.524433208929823e-09, "loss": 0.7881, "step": 9172 }, { "epoch": 0.9068037466327262, "grad_norm": 8.525410044459592, "learning_rate": 4.5149152980886955e-09, "loss": 0.7845, "step": 9173 }, { "epoch": 0.9069026023774807, "grad_norm": 17.56127226046748, "learning_rate": 4.505407177834475e-09, "loss": 0.7543, "step": 9174 }, { "epoch": 0.9070014581222351, "grad_norm": 6.332380267674635, "learning_rate": 4.495908849142127e-09, "loss": 0.7574, "step": 9175 }, { "epoch": 0.9071003138669896, "grad_norm": 4.970590289637656, "learning_rate": 4.486420312985528e-09, "loss": 0.6779, "step": 9176 }, { "epoch": 0.9071991696117441, "grad_norm": 3.3821020093764353, "learning_rate": 4.4769415703376115e-09, "loss": 0.7232, "step": 9177 }, { "epoch": 0.9072980253564985, "grad_norm": 8.77429492204542, "learning_rate": 4.467472622170254e-09, "loss": 0.7148, "step": 9178 }, { "epoch": 0.907396881101253, "grad_norm": 3.300041622122075, "learning_rate": 4.458013469454392e-09, "loss": 0.7613, "step": 9179 }, { "epoch": 0.9074957368460075, "grad_norm": 3.1294584695264915, "learning_rate": 4.4485641131599024e-09, "loss": 0.6238, "step": 9180 }, { "epoch": 0.9075945925907619, "grad_norm": 5.842354043094671, "learning_rate": 4.43912455425568e-09, "loss": 0.7426, "step": 9181 }, { "epoch": 0.9076934483355164, "grad_norm": 3.448425872612245, "learning_rate": 4.429694793709604e-09, "loss": 0.7205, "step": 9182 }, { "epoch": 0.9077923040802709, "grad_norm": 3.6422889285752795, "learning_rate": 4.420274832488568e-09, "loss": 0.7602, "step": 9183 }, { "epoch": 0.9078911598250253, "grad_norm": 5.1073963048987245, "learning_rate": 4.410864671558456e-09, "loss": 0.7249, "step": 9184 }, { "epoch": 0.9079900155697798, "grad_norm": 7.05371822605644, "learning_rate": 4.401464311884129e-09, "loss": 0.6958, "step": 9185 }, { "epoch": 0.9080888713145343, "grad_norm": 8.144070996780728, "learning_rate": 4.392073754429448e-09, "loss": 0.7487, "step": 9186 }, { "epoch": 0.9081877270592887, "grad_norm": 3.3683058358430435, "learning_rate": 4.3826930001573116e-09, "loss": 0.6726, "step": 9187 }, { "epoch": 0.9082865828040432, "grad_norm": 3.746071205742553, "learning_rate": 4.373322050029527e-09, "loss": 0.6821, "step": 9188 }, { "epoch": 0.9083854385487977, "grad_norm": 4.462880768512415, "learning_rate": 4.363960905006981e-09, "loss": 0.711, "step": 9189 }, { "epoch": 0.9084842942935522, "grad_norm": 3.510071019385119, "learning_rate": 4.354609566049528e-09, "loss": 0.6632, "step": 9190 }, { "epoch": 0.9085831500383066, "grad_norm": 3.3664182204481996, "learning_rate": 4.345268034115979e-09, "loss": 0.6679, "step": 9191 }, { "epoch": 0.908682005783061, "grad_norm": 3.6209106998243064, "learning_rate": 4.335936310164201e-09, "loss": 0.6491, "step": 9192 }, { "epoch": 0.9087808615278156, "grad_norm": 5.080519302283076, "learning_rate": 4.326614395150996e-09, "loss": 0.6247, "step": 9193 }, { "epoch": 0.90887971727257, "grad_norm": 5.930758502776924, "learning_rate": 4.317302290032221e-09, "loss": 0.676, "step": 9194 }, { "epoch": 0.9089785730173244, "grad_norm": 2.969010412379885, "learning_rate": 4.307999995762657e-09, "loss": 0.7604, "step": 9195 }, { "epoch": 0.909077428762079, "grad_norm": 5.204428468281225, "learning_rate": 4.298707513296151e-09, "loss": 0.729, "step": 9196 }, { "epoch": 0.9091762845068334, "grad_norm": 3.792727533705053, "learning_rate": 4.2894248435854875e-09, "loss": 0.6376, "step": 9197 }, { "epoch": 0.9092751402515878, "grad_norm": 15.76575439831711, "learning_rate": 4.280151987582492e-09, "loss": 0.7183, "step": 9198 }, { "epoch": 0.9093739959963424, "grad_norm": 11.475887921892777, "learning_rate": 4.270888946237927e-09, "loss": 0.7504, "step": 9199 }, { "epoch": 0.9094728517410968, "grad_norm": 4.171933796855026, "learning_rate": 4.261635720501589e-09, "loss": 0.682, "step": 9200 }, { "epoch": 0.9095717074858513, "grad_norm": 3.0870335314420028, "learning_rate": 4.2523923113222836e-09, "loss": 0.7382, "step": 9201 }, { "epoch": 0.9096705632306057, "grad_norm": 2.869006747061063, "learning_rate": 4.243158719647766e-09, "loss": 0.7723, "step": 9202 }, { "epoch": 0.9097694189753602, "grad_norm": 6.873986863749269, "learning_rate": 4.233934946424811e-09, "loss": 0.7515, "step": 9203 }, { "epoch": 0.9098682747201147, "grad_norm": 8.666341949724863, "learning_rate": 4.224720992599173e-09, "loss": 0.6836, "step": 9204 }, { "epoch": 0.9099671304648691, "grad_norm": 4.382439665689499, "learning_rate": 4.215516859115609e-09, "loss": 0.6563, "step": 9205 }, { "epoch": 0.9100659862096236, "grad_norm": 4.114693030871112, "learning_rate": 4.2063225469178845e-09, "loss": 0.4812, "step": 9206 }, { "epoch": 0.9101648419543781, "grad_norm": 4.053021948548916, "learning_rate": 4.197138056948712e-09, "loss": 0.7028, "step": 9207 }, { "epoch": 0.9102636976991325, "grad_norm": 3.735670011177251, "learning_rate": 4.18796339014984e-09, "loss": 0.7408, "step": 9208 }, { "epoch": 0.9103625534438871, "grad_norm": 3.4970588299049457, "learning_rate": 4.178798547462015e-09, "loss": 0.7466, "step": 9209 }, { "epoch": 0.9104614091886415, "grad_norm": 3.865972687879547, "learning_rate": 4.169643529824918e-09, "loss": 0.624, "step": 9210 }, { "epoch": 0.9105602649333959, "grad_norm": 2.974082807789901, "learning_rate": 4.1604983381772876e-09, "loss": 0.6793, "step": 9211 }, { "epoch": 0.9106591206781504, "grad_norm": 10.04376664598997, "learning_rate": 4.151362973456829e-09, "loss": 0.8244, "step": 9212 }, { "epoch": 0.9107579764229049, "grad_norm": 5.208614615696838, "learning_rate": 4.1422374366002154e-09, "loss": 0.8731, "step": 9213 }, { "epoch": 0.9108568321676593, "grad_norm": 8.014865117176841, "learning_rate": 4.133121728543165e-09, "loss": 0.7424, "step": 9214 }, { "epoch": 0.9109556879124138, "grad_norm": 3.6980153682093166, "learning_rate": 4.124015850220331e-09, "loss": 0.6738, "step": 9215 }, { "epoch": 0.9110545436571683, "grad_norm": 2.665152801681194, "learning_rate": 4.114919802565397e-09, "loss": 0.7255, "step": 9216 }, { "epoch": 0.9111533994019227, "grad_norm": 6.702429372608779, "learning_rate": 4.105833586511054e-09, "loss": 0.68, "step": 9217 }, { "epoch": 0.9112522551466772, "grad_norm": 6.196124464351072, "learning_rate": 4.096757202988921e-09, "loss": 0.6501, "step": 9218 }, { "epoch": 0.9113511108914317, "grad_norm": 19.54025269125227, "learning_rate": 4.087690652929665e-09, "loss": 0.6889, "step": 9219 }, { "epoch": 0.9114499666361862, "grad_norm": 3.7811333890956678, "learning_rate": 4.078633937262932e-09, "loss": 0.6077, "step": 9220 }, { "epoch": 0.9115488223809406, "grad_norm": 3.02598678349501, "learning_rate": 4.069587056917345e-09, "loss": 0.6386, "step": 9221 }, { "epoch": 0.911647678125695, "grad_norm": 9.812229757611037, "learning_rate": 4.06055001282053e-09, "loss": 0.6498, "step": 9222 }, { "epoch": 0.9117465338704496, "grad_norm": 3.435322394449184, "learning_rate": 4.051522805899121e-09, "loss": 0.6909, "step": 9223 }, { "epoch": 0.911845389615204, "grad_norm": 5.093478416736397, "learning_rate": 4.042505437078692e-09, "loss": 0.7712, "step": 9224 }, { "epoch": 0.9119442453599584, "grad_norm": 4.229582456985856, "learning_rate": 4.0334979072838674e-09, "loss": 0.753, "step": 9225 }, { "epoch": 0.912043101104713, "grad_norm": 3.469554935949177, "learning_rate": 4.0245002174382205e-09, "loss": 0.6988, "step": 9226 }, { "epoch": 0.9121419568494674, "grad_norm": 7.365487132079871, "learning_rate": 4.015512368464336e-09, "loss": 0.6537, "step": 9227 }, { "epoch": 0.9122408125942219, "grad_norm": 13.479612707147696, "learning_rate": 4.006534361283798e-09, "loss": 0.6855, "step": 9228 }, { "epoch": 0.9123396683389764, "grad_norm": 6.415872931415979, "learning_rate": 3.997566196817159e-09, "loss": 0.7538, "step": 9229 }, { "epoch": 0.9124385240837308, "grad_norm": 4.3242681933872245, "learning_rate": 3.9886078759839624e-09, "loss": 0.7651, "step": 9230 }, { "epoch": 0.9125373798284853, "grad_norm": 3.531653815952403, "learning_rate": 3.979659399702773e-09, "loss": 0.6288, "step": 9231 }, { "epoch": 0.9126362355732397, "grad_norm": 3.702377417090328, "learning_rate": 3.9707207688910896e-09, "loss": 0.7389, "step": 9232 }, { "epoch": 0.9127350913179942, "grad_norm": 5.076857106069224, "learning_rate": 3.961791984465468e-09, "loss": 0.6269, "step": 9233 }, { "epoch": 0.9128339470627487, "grad_norm": 4.478393465335068, "learning_rate": 3.9528730473414315e-09, "loss": 0.6887, "step": 9234 }, { "epoch": 0.9129328028075031, "grad_norm": 4.443941183243416, "learning_rate": 3.943963958433449e-09, "loss": 0.6495, "step": 9235 }, { "epoch": 0.9130316585522577, "grad_norm": 4.3111805380406585, "learning_rate": 3.9350647186550455e-09, "loss": 0.7296, "step": 9236 }, { "epoch": 0.9131305142970121, "grad_norm": 3.4652639181472518, "learning_rate": 3.92617532891869e-09, "loss": 0.6418, "step": 9237 }, { "epoch": 0.9132293700417665, "grad_norm": 3.2594762134861095, "learning_rate": 3.917295790135877e-09, "loss": 0.615, "step": 9238 }, { "epoch": 0.9133282257865211, "grad_norm": 4.002180559581622, "learning_rate": 3.9084261032170554e-09, "loss": 0.6916, "step": 9239 }, { "epoch": 0.9134270815312755, "grad_norm": 3.754785391911426, "learning_rate": 3.8995662690716855e-09, "loss": 0.648, "step": 9240 }, { "epoch": 0.9135259372760299, "grad_norm": 4.723355654959425, "learning_rate": 3.890716288608209e-09, "loss": 0.639, "step": 9241 }, { "epoch": 0.9136247930207844, "grad_norm": 6.7448813757071715, "learning_rate": 3.8818761627340765e-09, "loss": 0.6397, "step": 9242 }, { "epoch": 0.9137236487655389, "grad_norm": 5.5422574450015905, "learning_rate": 3.873045892355697e-09, "loss": 0.6734, "step": 9243 }, { "epoch": 0.9138225045102933, "grad_norm": 4.323750224883757, "learning_rate": 3.86422547837848e-09, "loss": 0.7015, "step": 9244 }, { "epoch": 0.9139213602550478, "grad_norm": 13.249319446498447, "learning_rate": 3.855414921706857e-09, "loss": 0.6403, "step": 9245 }, { "epoch": 0.9140202159998023, "grad_norm": 4.686824255956182, "learning_rate": 3.846614223244194e-09, "loss": 0.6921, "step": 9246 }, { "epoch": 0.9141190717445568, "grad_norm": 14.813762574906415, "learning_rate": 3.837823383892869e-09, "loss": 0.5978, "step": 9247 }, { "epoch": 0.9142179274893112, "grad_norm": 10.254013874212204, "learning_rate": 3.829042404554284e-09, "loss": 0.6234, "step": 9248 }, { "epoch": 0.9143167832340657, "grad_norm": 3.0245895592387373, "learning_rate": 3.820271286128762e-09, "loss": 0.6068, "step": 9249 }, { "epoch": 0.9144156389788202, "grad_norm": 4.931793303757395, "learning_rate": 3.811510029515674e-09, "loss": 0.6695, "step": 9250 }, { "epoch": 0.9145144947235746, "grad_norm": 3.4863823098720927, "learning_rate": 3.8027586356133565e-09, "loss": 0.7004, "step": 9251 }, { "epoch": 0.9146133504683291, "grad_norm": 18.34838853790831, "learning_rate": 3.794017105319114e-09, "loss": 0.735, "step": 9252 }, { "epoch": 0.9147122062130836, "grad_norm": 3.631121291380736, "learning_rate": 3.785285439529307e-09, "loss": 0.8391, "step": 9253 }, { "epoch": 0.914811061957838, "grad_norm": 6.177329566689876, "learning_rate": 3.776563639139185e-09, "loss": 0.6358, "step": 9254 }, { "epoch": 0.9149099177025924, "grad_norm": 5.002852930401152, "learning_rate": 3.767851705043079e-09, "loss": 0.8186, "step": 9255 }, { "epoch": 0.915008773447347, "grad_norm": 14.268079960323641, "learning_rate": 3.7591496381342625e-09, "loss": 0.6095, "step": 9256 }, { "epoch": 0.9151076291921014, "grad_norm": 8.84783672554617, "learning_rate": 3.750457439304977e-09, "loss": 0.7628, "step": 9257 }, { "epoch": 0.9152064849368559, "grad_norm": 3.7536427377578168, "learning_rate": 3.7417751094465324e-09, "loss": 0.6243, "step": 9258 }, { "epoch": 0.9153053406816104, "grad_norm": 3.4165679123794037, "learning_rate": 3.733102649449116e-09, "loss": 0.6619, "step": 9259 }, { "epoch": 0.9154041964263648, "grad_norm": 10.790409931795745, "learning_rate": 3.7244400602019833e-09, "loss": 0.6562, "step": 9260 }, { "epoch": 0.9155030521711193, "grad_norm": 9.511954699671238, "learning_rate": 3.7157873425933793e-09, "loss": 0.692, "step": 9261 }, { "epoch": 0.9156019079158738, "grad_norm": 4.746753216219439, "learning_rate": 3.707144497510473e-09, "loss": 0.6563, "step": 9262 }, { "epoch": 0.9157007636606282, "grad_norm": 3.1887944054248973, "learning_rate": 3.6985115258394985e-09, "loss": 0.714, "step": 9263 }, { "epoch": 0.9157996194053827, "grad_norm": 3.5806343645462015, "learning_rate": 3.689888428465604e-09, "loss": 0.6365, "step": 9264 }, { "epoch": 0.9158984751501371, "grad_norm": 2.8613683524194844, "learning_rate": 3.6812752062729825e-09, "loss": 0.6933, "step": 9265 }, { "epoch": 0.9159973308948917, "grad_norm": 14.147453600226305, "learning_rate": 3.6726718601447826e-09, "loss": 0.6575, "step": 9266 }, { "epoch": 0.9160961866396461, "grad_norm": 4.932456020074218, "learning_rate": 3.6640783909631655e-09, "loss": 0.7341, "step": 9267 }, { "epoch": 0.9161950423844005, "grad_norm": 14.08405230592423, "learning_rate": 3.655494799609249e-09, "loss": 0.7562, "step": 9268 }, { "epoch": 0.9162938981291551, "grad_norm": 2.5207725685209077, "learning_rate": 3.6469210869631616e-09, "loss": 0.6925, "step": 9269 }, { "epoch": 0.9163927538739095, "grad_norm": 2.7895886588558927, "learning_rate": 3.6383572539040007e-09, "loss": 0.7789, "step": 9270 }, { "epoch": 0.9164916096186639, "grad_norm": 4.180408280119502, "learning_rate": 3.629803301309864e-09, "loss": 0.6589, "step": 9271 }, { "epoch": 0.9165904653634185, "grad_norm": 3.279077684369699, "learning_rate": 3.6212592300578605e-09, "loss": 0.6125, "step": 9272 }, { "epoch": 0.9166893211081729, "grad_norm": 4.348384887735285, "learning_rate": 3.6127250410240227e-09, "loss": 0.7766, "step": 9273 }, { "epoch": 0.9167881768529274, "grad_norm": 7.174700801978917, "learning_rate": 3.6042007350834068e-09, "loss": 0.8456, "step": 9274 }, { "epoch": 0.9168870325976818, "grad_norm": 5.065336606947412, "learning_rate": 3.595686313110091e-09, "loss": 0.7408, "step": 9275 }, { "epoch": 0.9169858883424363, "grad_norm": 7.297817717048018, "learning_rate": 3.5871817759770663e-09, "loss": 0.6759, "step": 9276 }, { "epoch": 0.9170847440871908, "grad_norm": 12.877861501682393, "learning_rate": 3.578687124556368e-09, "loss": 0.6697, "step": 9277 }, { "epoch": 0.9171835998319452, "grad_norm": 3.863183044266501, "learning_rate": 3.570202359718988e-09, "loss": 0.6834, "step": 9278 }, { "epoch": 0.9172824555766997, "grad_norm": 3.297513831922445, "learning_rate": 3.5617274823349197e-09, "loss": 0.6968, "step": 9279 }, { "epoch": 0.9173813113214542, "grad_norm": 5.283775161918684, "learning_rate": 3.5532624932731346e-09, "loss": 0.762, "step": 9280 }, { "epoch": 0.9174801670662086, "grad_norm": 3.8312405740651867, "learning_rate": 3.544807393401583e-09, "loss": 0.7967, "step": 9281 }, { "epoch": 0.9175790228109632, "grad_norm": 3.846415732086695, "learning_rate": 3.536362183587227e-09, "loss": 0.7381, "step": 9282 }, { "epoch": 0.9176778785557176, "grad_norm": 3.142275481552717, "learning_rate": 3.527926864695996e-09, "loss": 0.7532, "step": 9283 }, { "epoch": 0.917776734300472, "grad_norm": 8.977727317151844, "learning_rate": 3.519501437592787e-09, "loss": 0.7018, "step": 9284 }, { "epoch": 0.9178755900452265, "grad_norm": 7.712045329907277, "learning_rate": 3.5110859031415195e-09, "loss": 0.696, "step": 9285 }, { "epoch": 0.917974445789981, "grad_norm": 4.026643944497605, "learning_rate": 3.502680262205093e-09, "loss": 0.7642, "step": 9286 }, { "epoch": 0.9180733015347354, "grad_norm": 5.265587045453004, "learning_rate": 3.4942845156453514e-09, "loss": 0.6893, "step": 9287 }, { "epoch": 0.9181721572794899, "grad_norm": 4.614930117669901, "learning_rate": 3.4858986643231837e-09, "loss": 0.7295, "step": 9288 }, { "epoch": 0.9182710130242444, "grad_norm": 3.5308943032486058, "learning_rate": 3.477522709098435e-09, "loss": 0.6546, "step": 9289 }, { "epoch": 0.9183698687689988, "grad_norm": 4.1031178338106375, "learning_rate": 3.4691566508299076e-09, "loss": 0.7398, "step": 9290 }, { "epoch": 0.9184687245137533, "grad_norm": 7.906841691014189, "learning_rate": 3.4608004903754484e-09, "loss": 0.6606, "step": 9291 }, { "epoch": 0.9185675802585078, "grad_norm": 5.015895906866056, "learning_rate": 3.452454228591839e-09, "loss": 0.6061, "step": 9292 }, { "epoch": 0.9186664360032623, "grad_norm": 4.205587771748279, "learning_rate": 3.4441178663348724e-09, "loss": 0.7299, "step": 9293 }, { "epoch": 0.9187652917480167, "grad_norm": 4.064622495677099, "learning_rate": 3.4357914044593206e-09, "loss": 0.6653, "step": 9294 }, { "epoch": 0.9188641474927711, "grad_norm": 4.1130542331187, "learning_rate": 3.4274748438189337e-09, "loss": 0.7347, "step": 9295 }, { "epoch": 0.9189630032375257, "grad_norm": 6.944756316850829, "learning_rate": 3.4191681852664634e-09, "loss": 0.6149, "step": 9296 }, { "epoch": 0.9190618589822801, "grad_norm": 3.5775546432048198, "learning_rate": 3.410871429653628e-09, "loss": 0.7713, "step": 9297 }, { "epoch": 0.9191607147270345, "grad_norm": 7.149491108958584, "learning_rate": 3.402584577831136e-09, "loss": 0.7458, "step": 9298 }, { "epoch": 0.9192595704717891, "grad_norm": 3.9132493925469567, "learning_rate": 3.394307630648685e-09, "loss": 0.7426, "step": 9299 }, { "epoch": 0.9193584262165435, "grad_norm": 4.485575432264101, "learning_rate": 3.3860405889549524e-09, "loss": 0.6547, "step": 9300 }, { "epoch": 0.919457281961298, "grad_norm": 4.073806245277227, "learning_rate": 3.3777834535975935e-09, "loss": 0.643, "step": 9301 }, { "epoch": 0.9195561377060525, "grad_norm": 6.01288467258024, "learning_rate": 3.369536225423264e-09, "loss": 0.7406, "step": 9302 }, { "epoch": 0.9196549934508069, "grad_norm": 3.403492438776547, "learning_rate": 3.3612989052776096e-09, "loss": 0.7126, "step": 9303 }, { "epoch": 0.9197538491955614, "grad_norm": 10.38213259386531, "learning_rate": 3.353071494005222e-09, "loss": 0.7638, "step": 9304 }, { "epoch": 0.9198527049403158, "grad_norm": 5.153302318722467, "learning_rate": 3.3448539924497143e-09, "loss": 0.6701, "step": 9305 }, { "epoch": 0.9199515606850703, "grad_norm": 4.142047486665847, "learning_rate": 3.3366464014536576e-09, "loss": 0.6411, "step": 9306 }, { "epoch": 0.9200504164298248, "grad_norm": 3.187386041508717, "learning_rate": 3.3284487218586232e-09, "loss": 0.7027, "step": 9307 }, { "epoch": 0.9201492721745792, "grad_norm": 36.98745708881937, "learning_rate": 3.320260954505183e-09, "loss": 0.7075, "step": 9308 }, { "epoch": 0.9202481279193337, "grad_norm": 4.86778079152197, "learning_rate": 3.3120831002328432e-09, "loss": 0.6838, "step": 9309 }, { "epoch": 0.9203469836640882, "grad_norm": 4.233662728046249, "learning_rate": 3.303915159880133e-09, "loss": 0.7888, "step": 9310 }, { "epoch": 0.9204458394088426, "grad_norm": 2.912088899226901, "learning_rate": 3.2957571342845714e-09, "loss": 0.6928, "step": 9311 }, { "epoch": 0.9205446951535972, "grad_norm": 3.459586873641062, "learning_rate": 3.2876090242826116e-09, "loss": 0.685, "step": 9312 }, { "epoch": 0.9206435508983516, "grad_norm": 2.6919002240260395, "learning_rate": 3.2794708307097295e-09, "loss": 0.7568, "step": 9313 }, { "epoch": 0.920742406643106, "grad_norm": 6.695894651811346, "learning_rate": 3.2713425544004135e-09, "loss": 0.7455, "step": 9314 }, { "epoch": 0.9208412623878605, "grad_norm": 3.558689756961042, "learning_rate": 3.2632241961880526e-09, "loss": 0.6676, "step": 9315 }, { "epoch": 0.920940118132615, "grad_norm": 4.113663625824068, "learning_rate": 3.2551157569050913e-09, "loss": 0.6648, "step": 9316 }, { "epoch": 0.9210389738773694, "grad_norm": 4.989929516170186, "learning_rate": 3.247017237382921e-09, "loss": 0.6709, "step": 9317 }, { "epoch": 0.9211378296221239, "grad_norm": 3.4873563938638585, "learning_rate": 3.238928638451921e-09, "loss": 0.602, "step": 9318 }, { "epoch": 0.9212366853668784, "grad_norm": 4.648467174141105, "learning_rate": 3.230849960941473e-09, "loss": 0.6664, "step": 9319 }, { "epoch": 0.9213355411116328, "grad_norm": 3.8844632783056845, "learning_rate": 3.222781205679903e-09, "loss": 0.7318, "step": 9320 }, { "epoch": 0.9214343968563873, "grad_norm": 6.634637809478282, "learning_rate": 3.2147223734945492e-09, "loss": 0.6645, "step": 9321 }, { "epoch": 0.9215332526011418, "grad_norm": 3.580990785175267, "learning_rate": 3.2066734652117513e-09, "loss": 0.728, "step": 9322 }, { "epoch": 0.9216321083458963, "grad_norm": 4.7338847431403925, "learning_rate": 3.1986344816567702e-09, "loss": 0.7033, "step": 9323 }, { "epoch": 0.9217309640906507, "grad_norm": 3.7239724216181918, "learning_rate": 3.190605423653914e-09, "loss": 0.6349, "step": 9324 }, { "epoch": 0.9218298198354052, "grad_norm": 5.966533523860844, "learning_rate": 3.1825862920264346e-09, "loss": 0.6941, "step": 9325 }, { "epoch": 0.9219286755801597, "grad_norm": 3.9106414412488557, "learning_rate": 3.174577087596564e-09, "loss": 0.7158, "step": 9326 }, { "epoch": 0.9220275313249141, "grad_norm": 10.846930399777913, "learning_rate": 3.1665778111855558e-09, "loss": 0.7787, "step": 9327 }, { "epoch": 0.9221263870696685, "grad_norm": 16.080221713526157, "learning_rate": 3.1585884636135763e-09, "loss": 0.6454, "step": 9328 }, { "epoch": 0.9222252428144231, "grad_norm": 18.17945257134747, "learning_rate": 3.150609045699848e-09, "loss": 0.7735, "step": 9329 }, { "epoch": 0.9223240985591775, "grad_norm": 3.043931480291416, "learning_rate": 3.142639558262539e-09, "loss": 0.8213, "step": 9330 }, { "epoch": 0.922422954303932, "grad_norm": 2.9904184356574537, "learning_rate": 3.1346800021187836e-09, "loss": 0.7201, "step": 9331 }, { "epoch": 0.9225218100486865, "grad_norm": 4.199251100845723, "learning_rate": 3.126730378084741e-09, "loss": 0.6277, "step": 9332 }, { "epoch": 0.9226206657934409, "grad_norm": 6.393834087256833, "learning_rate": 3.1187906869755143e-09, "loss": 0.7687, "step": 9333 }, { "epoch": 0.9227195215381954, "grad_norm": 5.01027992516318, "learning_rate": 3.110860929605208e-09, "loss": 0.709, "step": 9334 }, { "epoch": 0.9228183772829499, "grad_norm": 4.306987487303191, "learning_rate": 3.1029411067868937e-09, "loss": 0.8013, "step": 9335 }, { "epoch": 0.9229172330277043, "grad_norm": 3.318404859315426, "learning_rate": 3.0950312193326446e-09, "loss": 0.6495, "step": 9336 }, { "epoch": 0.9230160887724588, "grad_norm": 3.682240879878511, "learning_rate": 3.0871312680534777e-09, "loss": 0.5861, "step": 9337 }, { "epoch": 0.9231149445172132, "grad_norm": 3.7775644234870556, "learning_rate": 3.079241253759457e-09, "loss": 0.6343, "step": 9338 }, { "epoch": 0.9232138002619678, "grad_norm": 4.566987191967971, "learning_rate": 3.071361177259546e-09, "loss": 0.7209, "step": 9339 }, { "epoch": 0.9233126560067222, "grad_norm": 3.9411048211406445, "learning_rate": 3.063491039361743e-09, "loss": 0.7872, "step": 9340 }, { "epoch": 0.9234115117514766, "grad_norm": 3.106358780521815, "learning_rate": 3.055630840873036e-09, "loss": 0.7232, "step": 9341 }, { "epoch": 0.9235103674962312, "grad_norm": 4.540832367721259, "learning_rate": 3.0477805825993465e-09, "loss": 0.6803, "step": 9342 }, { "epoch": 0.9236092232409856, "grad_norm": 3.8892334200676006, "learning_rate": 3.0399402653456085e-09, "loss": 0.7845, "step": 9343 }, { "epoch": 0.92370807898574, "grad_norm": 2.939295721859762, "learning_rate": 3.032109889915757e-09, "loss": 0.6812, "step": 9344 }, { "epoch": 0.9238069347304946, "grad_norm": 3.838101453770588, "learning_rate": 3.0242894571126386e-09, "loss": 0.7119, "step": 9345 }, { "epoch": 0.923905790475249, "grad_norm": 4.554007451484778, "learning_rate": 3.0164789677381454e-09, "loss": 0.753, "step": 9346 }, { "epoch": 0.9240046462200034, "grad_norm": 5.435994435554091, "learning_rate": 3.008678422593136e-09, "loss": 0.6185, "step": 9347 }, { "epoch": 0.9241035019647579, "grad_norm": 4.689923960756403, "learning_rate": 3.000887822477416e-09, "loss": 0.6833, "step": 9348 }, { "epoch": 0.9242023577095124, "grad_norm": 3.2971144935308963, "learning_rate": 2.9931071681898346e-09, "loss": 0.7211, "step": 9349 }, { "epoch": 0.9243012134542669, "grad_norm": 3.664468757470984, "learning_rate": 2.9853364605281427e-09, "loss": 0.6896, "step": 9350 }, { "epoch": 0.9244000691990213, "grad_norm": 4.992906990496116, "learning_rate": 2.9775757002891367e-09, "loss": 0.7195, "step": 9351 }, { "epoch": 0.9244989249437758, "grad_norm": 9.08647283858197, "learning_rate": 2.969824888268557e-09, "loss": 0.5418, "step": 9352 }, { "epoch": 0.9245977806885303, "grad_norm": 21.20081088746383, "learning_rate": 2.962084025261147e-09, "loss": 0.6779, "step": 9353 }, { "epoch": 0.9246966364332847, "grad_norm": 2.8015583577199306, "learning_rate": 2.9543531120606035e-09, "loss": 0.689, "step": 9354 }, { "epoch": 0.9247954921780392, "grad_norm": 3.9555241200022992, "learning_rate": 2.946632149459638e-09, "loss": 0.6692, "step": 9355 }, { "epoch": 0.9248943479227937, "grad_norm": 4.786341344169829, "learning_rate": 2.9389211382498837e-09, "loss": 0.6918, "step": 9356 }, { "epoch": 0.9249932036675481, "grad_norm": 12.028999663525427, "learning_rate": 2.9312200792220297e-09, "loss": 0.7418, "step": 9357 }, { "epoch": 0.9250920594123025, "grad_norm": 4.395978551505031, "learning_rate": 2.92352897316569e-09, "loss": 0.8512, "step": 9358 }, { "epoch": 0.9251909151570571, "grad_norm": 9.657148888111719, "learning_rate": 2.915847820869477e-09, "loss": 0.742, "step": 9359 }, { "epoch": 0.9252897709018115, "grad_norm": 4.035575254472802, "learning_rate": 2.9081766231209835e-09, "loss": 0.6243, "step": 9360 }, { "epoch": 0.925388626646566, "grad_norm": 7.507516496453883, "learning_rate": 2.9005153807067583e-09, "loss": 0.7906, "step": 9361 }, { "epoch": 0.9254874823913205, "grad_norm": 6.68664104615038, "learning_rate": 2.892864094412362e-09, "loss": 0.8031, "step": 9362 }, { "epoch": 0.9255863381360749, "grad_norm": 6.469033671826198, "learning_rate": 2.8852227650223325e-09, "loss": 0.6852, "step": 9363 }, { "epoch": 0.9256851938808294, "grad_norm": 3.6380734967598047, "learning_rate": 2.8775913933201557e-09, "loss": 0.7174, "step": 9364 }, { "epoch": 0.9257840496255839, "grad_norm": 6.380407976344252, "learning_rate": 2.869969980088327e-09, "loss": 0.6802, "step": 9365 }, { "epoch": 0.9258829053703383, "grad_norm": 3.406803227240417, "learning_rate": 2.8623585261083215e-09, "loss": 0.7045, "step": 9366 }, { "epoch": 0.9259817611150928, "grad_norm": 6.78639386395157, "learning_rate": 2.8547570321605486e-09, "loss": 0.6455, "step": 9367 }, { "epoch": 0.9260806168598472, "grad_norm": 11.109920539495475, "learning_rate": 2.847165499024462e-09, "loss": 0.7462, "step": 9368 }, { "epoch": 0.9261794726046018, "grad_norm": 5.3357597227266425, "learning_rate": 2.8395839274784393e-09, "loss": 0.7119, "step": 9369 }, { "epoch": 0.9262783283493562, "grad_norm": 3.997242949195111, "learning_rate": 2.8320123182998812e-09, "loss": 0.7481, "step": 9370 }, { "epoch": 0.9263771840941106, "grad_norm": 8.486579571912982, "learning_rate": 2.8244506722651328e-09, "loss": 0.7081, "step": 9371 }, { "epoch": 0.9264760398388652, "grad_norm": 3.0740515071455365, "learning_rate": 2.816898990149508e-09, "loss": 0.79, "step": 9372 }, { "epoch": 0.9265748955836196, "grad_norm": 3.025121469832653, "learning_rate": 2.809357272727342e-09, "loss": 0.6885, "step": 9373 }, { "epoch": 0.926673751328374, "grad_norm": 8.704721371587047, "learning_rate": 2.801825520771939e-09, "loss": 0.7993, "step": 9374 }, { "epoch": 0.9267726070731286, "grad_norm": 2.879266407771542, "learning_rate": 2.7943037350555476e-09, "loss": 0.7086, "step": 9375 }, { "epoch": 0.926871462817883, "grad_norm": 3.705636804522886, "learning_rate": 2.786791916349418e-09, "loss": 0.6692, "step": 9376 }, { "epoch": 0.9269703185626375, "grad_norm": 5.6892899932397105, "learning_rate": 2.7792900654238006e-09, "loss": 0.6528, "step": 9377 }, { "epoch": 0.9270691743073919, "grad_norm": 7.07633172841689, "learning_rate": 2.7717981830478574e-09, "loss": 0.7401, "step": 9378 }, { "epoch": 0.9271680300521464, "grad_norm": 4.585924105168714, "learning_rate": 2.764316269989797e-09, "loss": 0.6124, "step": 9379 }, { "epoch": 0.9272668857969009, "grad_norm": 5.097629093145771, "learning_rate": 2.756844327016794e-09, "loss": 0.6841, "step": 9380 }, { "epoch": 0.9273657415416553, "grad_norm": 5.296407382228247, "learning_rate": 2.749382354894958e-09, "loss": 0.6634, "step": 9381 }, { "epoch": 0.9274645972864098, "grad_norm": 6.918524000070982, "learning_rate": 2.7419303543894103e-09, "loss": 0.7557, "step": 9382 }, { "epoch": 0.9275634530311643, "grad_norm": 4.796107143404165, "learning_rate": 2.7344883262642505e-09, "loss": 0.7556, "step": 9383 }, { "epoch": 0.9276623087759187, "grad_norm": 7.707362434429936, "learning_rate": 2.7270562712825353e-09, "loss": 0.7298, "step": 9384 }, { "epoch": 0.9277611645206733, "grad_norm": 4.960323823213689, "learning_rate": 2.7196341902063434e-09, "loss": 0.6123, "step": 9385 }, { "epoch": 0.9278600202654277, "grad_norm": 5.596142570356802, "learning_rate": 2.712222083796667e-09, "loss": 0.7159, "step": 9386 }, { "epoch": 0.9279588760101821, "grad_norm": 3.413153455118787, "learning_rate": 2.7048199528135198e-09, "loss": 0.6095, "step": 9387 }, { "epoch": 0.9280577317549366, "grad_norm": 4.661464675753066, "learning_rate": 2.6974277980158944e-09, "loss": 0.7721, "step": 9388 }, { "epoch": 0.9281565874996911, "grad_norm": 6.486070713051215, "learning_rate": 2.6900456201617185e-09, "loss": 0.6431, "step": 9389 }, { "epoch": 0.9282554432444455, "grad_norm": 3.4431390329539657, "learning_rate": 2.6826734200079415e-09, "loss": 0.7744, "step": 9390 }, { "epoch": 0.9283542989892, "grad_norm": 3.326627314887195, "learning_rate": 2.6753111983104814e-09, "loss": 0.6834, "step": 9391 }, { "epoch": 0.9284531547339545, "grad_norm": 4.262504116837925, "learning_rate": 2.6679589558242122e-09, "loss": 0.7311, "step": 9392 }, { "epoch": 0.928552010478709, "grad_norm": 4.600521171275992, "learning_rate": 2.6606166933030085e-09, "loss": 0.6625, "step": 9393 }, { "epoch": 0.9286508662234634, "grad_norm": 5.304312392399939, "learning_rate": 2.653284411499701e-09, "loss": 0.6693, "step": 9394 }, { "epoch": 0.9287497219682179, "grad_norm": 3.245661315694154, "learning_rate": 2.6459621111661e-09, "loss": 0.6952, "step": 9395 }, { "epoch": 0.9288485777129724, "grad_norm": 13.093851123708832, "learning_rate": 2.6386497930530383e-09, "loss": 0.6293, "step": 9396 }, { "epoch": 0.9289474334577268, "grad_norm": 4.271081861875373, "learning_rate": 2.631347457910238e-09, "loss": 0.7339, "step": 9397 }, { "epoch": 0.9290462892024813, "grad_norm": 4.241356741790104, "learning_rate": 2.624055106486467e-09, "loss": 0.5985, "step": 9398 }, { "epoch": 0.9291451449472358, "grad_norm": 3.7755902256872886, "learning_rate": 2.6167727395294716e-09, "loss": 0.6597, "step": 9399 }, { "epoch": 0.9292440006919902, "grad_norm": 15.788884203214172, "learning_rate": 2.6095003577859097e-09, "loss": 0.6631, "step": 9400 }, { "epoch": 0.9293428564367446, "grad_norm": 4.545762585552154, "learning_rate": 2.6022379620014747e-09, "loss": 0.6923, "step": 9401 }, { "epoch": 0.9294417121814992, "grad_norm": 3.231907348676443, "learning_rate": 2.5949855529208254e-09, "loss": 0.7057, "step": 9402 }, { "epoch": 0.9295405679262536, "grad_norm": 5.349823730124508, "learning_rate": 2.5877431312875787e-09, "loss": 0.7413, "step": 9403 }, { "epoch": 0.929639423671008, "grad_norm": 14.676549099703486, "learning_rate": 2.5805106978443405e-09, "loss": 0.7496, "step": 9404 }, { "epoch": 0.9297382794157626, "grad_norm": 6.219080022335714, "learning_rate": 2.573288253332706e-09, "loss": 0.7594, "step": 9405 }, { "epoch": 0.929837135160517, "grad_norm": 4.993296810225713, "learning_rate": 2.5660757984931948e-09, "loss": 0.7607, "step": 9406 }, { "epoch": 0.9299359909052715, "grad_norm": 6.145423765878271, "learning_rate": 2.5588733340653814e-09, "loss": 0.7466, "step": 9407 }, { "epoch": 0.930034846650026, "grad_norm": 3.531956482823787, "learning_rate": 2.5516808607877304e-09, "loss": 0.7864, "step": 9408 }, { "epoch": 0.9301337023947804, "grad_norm": 13.951159133353551, "learning_rate": 2.5444983793977417e-09, "loss": 0.7719, "step": 9409 }, { "epoch": 0.9302325581395349, "grad_norm": 3.210603597837471, "learning_rate": 2.5373258906318916e-09, "loss": 0.647, "step": 9410 }, { "epoch": 0.9303314138842893, "grad_norm": 5.210853408432664, "learning_rate": 2.530163395225582e-09, "loss": 0.7094, "step": 9411 }, { "epoch": 0.9304302696290438, "grad_norm": 4.279214675589479, "learning_rate": 2.5230108939132355e-09, "loss": 0.5599, "step": 9412 }, { "epoch": 0.9305291253737983, "grad_norm": 3.456512921085227, "learning_rate": 2.5158683874282438e-09, "loss": 0.676, "step": 9413 }, { "epoch": 0.9306279811185527, "grad_norm": 13.73664052500378, "learning_rate": 2.508735876502954e-09, "loss": 0.7074, "step": 9414 }, { "epoch": 0.9307268368633073, "grad_norm": 10.144969870262862, "learning_rate": 2.501613361868704e-09, "loss": 0.688, "step": 9415 }, { "epoch": 0.9308256926080617, "grad_norm": 3.5550203266616207, "learning_rate": 2.494500844255798e-09, "loss": 0.691, "step": 9416 }, { "epoch": 0.9309245483528161, "grad_norm": 5.279824921322023, "learning_rate": 2.4873983243935192e-09, "loss": 0.7186, "step": 9417 }, { "epoch": 0.9310234040975707, "grad_norm": 4.17567267809548, "learning_rate": 2.480305803010141e-09, "loss": 0.8118, "step": 9418 }, { "epoch": 0.9311222598423251, "grad_norm": 3.5491342312346714, "learning_rate": 2.47322328083287e-09, "loss": 0.7583, "step": 9419 }, { "epoch": 0.9312211155870795, "grad_norm": 3.154140956805501, "learning_rate": 2.4661507585879372e-09, "loss": 0.8024, "step": 9420 }, { "epoch": 0.931319971331834, "grad_norm": 4.680718601170543, "learning_rate": 2.459088237000528e-09, "loss": 0.6817, "step": 9421 }, { "epoch": 0.9314188270765885, "grad_norm": 5.34507780494192, "learning_rate": 2.452035716794787e-09, "loss": 0.8004, "step": 9422 }, { "epoch": 0.931517682821343, "grad_norm": 5.795434304280505, "learning_rate": 2.4449931986938455e-09, "loss": 0.6597, "step": 9423 }, { "epoch": 0.9316165385660974, "grad_norm": 3.0892500217355683, "learning_rate": 2.437960683419815e-09, "loss": 0.6936, "step": 9424 }, { "epoch": 0.9317153943108519, "grad_norm": 4.2714302165973095, "learning_rate": 2.430938171693775e-09, "loss": 0.7193, "step": 9425 }, { "epoch": 0.9318142500556064, "grad_norm": 5.4945809116354445, "learning_rate": 2.423925664235782e-09, "loss": 0.795, "step": 9426 }, { "epoch": 0.9319131058003608, "grad_norm": 3.860428218279135, "learning_rate": 2.416923161764861e-09, "loss": 0.6972, "step": 9427 }, { "epoch": 0.9320119615451153, "grad_norm": 3.238415999273486, "learning_rate": 2.409930664999016e-09, "loss": 0.7454, "step": 9428 }, { "epoch": 0.9321108172898698, "grad_norm": 3.2989982802315074, "learning_rate": 2.4029481746552283e-09, "loss": 0.7824, "step": 9429 }, { "epoch": 0.9322096730346242, "grad_norm": 16.82300292630578, "learning_rate": 2.3959756914494478e-09, "loss": 0.706, "step": 9430 }, { "epoch": 0.9323085287793786, "grad_norm": 5.19254925230508, "learning_rate": 2.3890132160966026e-09, "loss": 0.7129, "step": 9431 }, { "epoch": 0.9324073845241332, "grad_norm": 4.059691334429038, "learning_rate": 2.382060749310588e-09, "loss": 0.6811, "step": 9432 }, { "epoch": 0.9325062402688876, "grad_norm": 4.615312574408365, "learning_rate": 2.375118291804268e-09, "loss": 0.7357, "step": 9433 }, { "epoch": 0.932605096013642, "grad_norm": 4.445173247207517, "learning_rate": 2.3681858442895055e-09, "loss": 0.6976, "step": 9434 }, { "epoch": 0.9327039517583966, "grad_norm": 3.9776128280467233, "learning_rate": 2.361263407477121e-09, "loss": 0.6747, "step": 9435 }, { "epoch": 0.932802807503151, "grad_norm": 22.62467826351968, "learning_rate": 2.3543509820768914e-09, "loss": 0.686, "step": 9436 }, { "epoch": 0.9329016632479055, "grad_norm": 11.893786216687706, "learning_rate": 2.347448568797605e-09, "loss": 0.655, "step": 9437 }, { "epoch": 0.93300051899266, "grad_norm": 4.768994886656905, "learning_rate": 2.3405561683469856e-09, "loss": 0.6388, "step": 9438 }, { "epoch": 0.9330993747374144, "grad_norm": 4.447149780193799, "learning_rate": 2.3336737814317555e-09, "loss": 0.706, "step": 9439 }, { "epoch": 0.9331982304821689, "grad_norm": 4.381167304650915, "learning_rate": 2.3268014087575947e-09, "loss": 0.707, "step": 9440 }, { "epoch": 0.9332970862269233, "grad_norm": 3.9319400823924786, "learning_rate": 2.319939051029174e-09, "loss": 0.6648, "step": 9441 }, { "epoch": 0.9333959419716779, "grad_norm": 3.061105258824254, "learning_rate": 2.313086708950107e-09, "loss": 0.6602, "step": 9442 }, { "epoch": 0.9334947977164323, "grad_norm": 4.094735733124143, "learning_rate": 2.306244383223033e-09, "loss": 0.7352, "step": 9443 }, { "epoch": 0.9335936534611867, "grad_norm": 4.390781015786918, "learning_rate": 2.2994120745495003e-09, "loss": 0.7472, "step": 9444 }, { "epoch": 0.9336925092059413, "grad_norm": 3.6443007264815517, "learning_rate": 2.2925897836300723e-09, "loss": 0.6338, "step": 9445 }, { "epoch": 0.9337913649506957, "grad_norm": 11.324004293916774, "learning_rate": 2.2857775111642884e-09, "loss": 0.6953, "step": 9446 }, { "epoch": 0.9338902206954501, "grad_norm": 5.210198916517, "learning_rate": 2.278975257850624e-09, "loss": 0.6849, "step": 9447 }, { "epoch": 0.9339890764402047, "grad_norm": 2.859410714836566, "learning_rate": 2.2721830243865757e-09, "loss": 0.7012, "step": 9448 }, { "epoch": 0.9340879321849591, "grad_norm": 3.906393462896743, "learning_rate": 2.2654008114685543e-09, "loss": 0.68, "step": 9449 }, { "epoch": 0.9341867879297135, "grad_norm": 9.746065070084937, "learning_rate": 2.2586286197920023e-09, "loss": 0.64, "step": 9450 }, { "epoch": 0.934285643674468, "grad_norm": 2.9001646340466585, "learning_rate": 2.2518664500512985e-09, "loss": 0.6532, "step": 9451 }, { "epoch": 0.9343844994192225, "grad_norm": 6.082909802346714, "learning_rate": 2.245114302939799e-09, "loss": 0.7301, "step": 9452 }, { "epoch": 0.934483355163977, "grad_norm": 3.8992260906642433, "learning_rate": 2.238372179149839e-09, "loss": 0.7729, "step": 9453 }, { "epoch": 0.9345822109087314, "grad_norm": 5.362699412546064, "learning_rate": 2.231640079372732e-09, "loss": 0.7257, "step": 9454 }, { "epoch": 0.9346810666534859, "grad_norm": 4.344396929479076, "learning_rate": 2.224918004298748e-09, "loss": 0.693, "step": 9455 }, { "epoch": 0.9347799223982404, "grad_norm": 3.9623458891185765, "learning_rate": 2.218205954617147e-09, "loss": 0.6778, "step": 9456 }, { "epoch": 0.9348787781429948, "grad_norm": 7.584680015343473, "learning_rate": 2.2115039310161343e-09, "loss": 0.7909, "step": 9457 }, { "epoch": 0.9349776338877493, "grad_norm": 4.915494994874523, "learning_rate": 2.204811934182915e-09, "loss": 0.6907, "step": 9458 }, { "epoch": 0.9350764896325038, "grad_norm": 8.704286000576328, "learning_rate": 2.198129964803652e-09, "loss": 0.6431, "step": 9459 }, { "epoch": 0.9351753453772582, "grad_norm": 4.6482714561707965, "learning_rate": 2.191458023563486e-09, "loss": 0.7197, "step": 9460 }, { "epoch": 0.9352742011220126, "grad_norm": 3.32249016507946, "learning_rate": 2.184796111146503e-09, "loss": 0.6231, "step": 9461 }, { "epoch": 0.9353730568667672, "grad_norm": 5.774286974729944, "learning_rate": 2.1781442282358232e-09, "loss": 0.7401, "step": 9462 }, { "epoch": 0.9354719126115216, "grad_norm": 3.8058705511329145, "learning_rate": 2.1715023755134675e-09, "loss": 0.7252, "step": 9463 }, { "epoch": 0.9355707683562761, "grad_norm": 5.637458940336325, "learning_rate": 2.1648705536604694e-09, "loss": 0.6851, "step": 9464 }, { "epoch": 0.9356696241010306, "grad_norm": 3.518999157990596, "learning_rate": 2.1582487633568403e-09, "loss": 0.7118, "step": 9465 }, { "epoch": 0.935768479845785, "grad_norm": 8.463993457413432, "learning_rate": 2.151637005281526e-09, "loss": 0.7213, "step": 9466 }, { "epoch": 0.9358673355905395, "grad_norm": 5.340538822994558, "learning_rate": 2.1450352801124618e-09, "loss": 0.7097, "step": 9467 }, { "epoch": 0.935966191335294, "grad_norm": 7.484612627412143, "learning_rate": 2.138443588526584e-09, "loss": 0.7609, "step": 9468 }, { "epoch": 0.9360650470800485, "grad_norm": 5.200703103508244, "learning_rate": 2.131861931199741e-09, "loss": 0.5201, "step": 9469 }, { "epoch": 0.9361639028248029, "grad_norm": 4.491616172682409, "learning_rate": 2.1252903088068153e-09, "loss": 0.6586, "step": 9470 }, { "epoch": 0.9362627585695573, "grad_norm": 3.5065925640953335, "learning_rate": 2.1187287220216123e-09, "loss": 0.6327, "step": 9471 }, { "epoch": 0.9363616143143119, "grad_norm": 4.803224605484602, "learning_rate": 2.1121771715169157e-09, "loss": 0.7321, "step": 9472 }, { "epoch": 0.9364604700590663, "grad_norm": 3.0880079005376926, "learning_rate": 2.105635657964522e-09, "loss": 0.7694, "step": 9473 }, { "epoch": 0.9365593258038207, "grad_norm": 47.38564803886269, "learning_rate": 2.0991041820351273e-09, "loss": 0.7044, "step": 9474 }, { "epoch": 0.9366581815485753, "grad_norm": 7.223594611681838, "learning_rate": 2.0925827443984746e-09, "loss": 0.7684, "step": 9475 }, { "epoch": 0.9367570372933297, "grad_norm": 4.30390666061517, "learning_rate": 2.086071345723228e-09, "loss": 0.7593, "step": 9476 }, { "epoch": 0.9368558930380841, "grad_norm": 4.937063302113026, "learning_rate": 2.079569986677021e-09, "loss": 0.7374, "step": 9477 }, { "epoch": 0.9369547487828387, "grad_norm": 3.3452245412954342, "learning_rate": 2.0730786679264866e-09, "loss": 0.6575, "step": 9478 }, { "epoch": 0.9370536045275931, "grad_norm": 4.054223306944417, "learning_rate": 2.066597390137215e-09, "loss": 0.7345, "step": 9479 }, { "epoch": 0.9371524602723476, "grad_norm": 8.150040826733962, "learning_rate": 2.0601261539737513e-09, "loss": 0.6282, "step": 9480 }, { "epoch": 0.9372513160171021, "grad_norm": 3.4134266091742, "learning_rate": 2.053664960099655e-09, "loss": 0.7141, "step": 9481 }, { "epoch": 0.9373501717618565, "grad_norm": 3.344018582080203, "learning_rate": 2.047213809177384e-09, "loss": 0.6747, "step": 9482 }, { "epoch": 0.937449027506611, "grad_norm": 6.082404859024422, "learning_rate": 2.040772701868443e-09, "loss": 0.6916, "step": 9483 }, { "epoch": 0.9375478832513654, "grad_norm": 5.579251691797863, "learning_rate": 2.03434163883327e-09, "loss": 0.6345, "step": 9484 }, { "epoch": 0.9376467389961199, "grad_norm": 5.099846693445276, "learning_rate": 2.027920620731249e-09, "loss": 0.7108, "step": 9485 }, { "epoch": 0.9377455947408744, "grad_norm": 6.8962614783778635, "learning_rate": 2.0215096482207873e-09, "loss": 0.7121, "step": 9486 }, { "epoch": 0.9378444504856288, "grad_norm": 3.0314534851749824, "learning_rate": 2.0151087219592354e-09, "loss": 0.6382, "step": 9487 }, { "epoch": 0.9379433062303834, "grad_norm": 11.116323960196492, "learning_rate": 2.0087178426028917e-09, "loss": 0.7046, "step": 9488 }, { "epoch": 0.9380421619751378, "grad_norm": 6.770631732536594, "learning_rate": 2.0023370108070537e-09, "loss": 0.6302, "step": 9489 }, { "epoch": 0.9381410177198922, "grad_norm": 3.7757503215476764, "learning_rate": 1.9959662272260093e-09, "loss": 0.6815, "step": 9490 }, { "epoch": 0.9382398734646468, "grad_norm": 5.637119343984888, "learning_rate": 1.989605492512958e-09, "loss": 0.6814, "step": 9491 }, { "epoch": 0.9383387292094012, "grad_norm": 5.000635018735602, "learning_rate": 1.9832548073201117e-09, "loss": 0.7011, "step": 9492 }, { "epoch": 0.9384375849541556, "grad_norm": 7.513236565366982, "learning_rate": 1.9769141722986383e-09, "loss": 0.6583, "step": 9493 }, { "epoch": 0.9385364406989101, "grad_norm": 3.3954197170877314, "learning_rate": 1.970583588098662e-09, "loss": 0.6947, "step": 9494 }, { "epoch": 0.9386352964436646, "grad_norm": 4.261247997682572, "learning_rate": 1.9642630553693085e-09, "loss": 0.6283, "step": 9495 }, { "epoch": 0.938734152188419, "grad_norm": 6.607808117113459, "learning_rate": 1.957952574758648e-09, "loss": 0.8332, "step": 9496 }, { "epoch": 0.9388330079331735, "grad_norm": 3.3222110248573795, "learning_rate": 1.9516521469137294e-09, "loss": 0.6823, "step": 9497 }, { "epoch": 0.938931863677928, "grad_norm": 3.303349851751872, "learning_rate": 1.9453617724805805e-09, "loss": 0.7286, "step": 9498 }, { "epoch": 0.9390307194226825, "grad_norm": 3.442410999705263, "learning_rate": 1.939081452104152e-09, "loss": 0.6639, "step": 9499 }, { "epoch": 0.9391295751674369, "grad_norm": 3.258662389114907, "learning_rate": 1.932811186428429e-09, "loss": 0.6113, "step": 9500 }, { "epoch": 0.9392284309121914, "grad_norm": 4.453182285188058, "learning_rate": 1.9265509760963415e-09, "loss": 0.7437, "step": 9501 }, { "epoch": 0.9393272866569459, "grad_norm": 3.2506802505446637, "learning_rate": 1.920300821749743e-09, "loss": 0.697, "step": 9502 }, { "epoch": 0.9394261424017003, "grad_norm": 4.884906440355357, "learning_rate": 1.9140607240295427e-09, "loss": 0.8005, "step": 9503 }, { "epoch": 0.9395249981464547, "grad_norm": 9.32451080478913, "learning_rate": 1.907830683575529e-09, "loss": 0.7036, "step": 9504 }, { "epoch": 0.9396238538912093, "grad_norm": 5.610917000034162, "learning_rate": 1.9016107010265124e-09, "loss": 0.6402, "step": 9505 }, { "epoch": 0.9397227096359637, "grad_norm": 5.61151799539278, "learning_rate": 1.895400777020273e-09, "loss": 0.7526, "step": 9506 }, { "epoch": 0.9398215653807181, "grad_norm": 4.2011025372962925, "learning_rate": 1.889200912193545e-09, "loss": 0.602, "step": 9507 }, { "epoch": 0.9399204211254727, "grad_norm": 4.505273502431266, "learning_rate": 1.8830111071820085e-09, "loss": 0.8132, "step": 9508 }, { "epoch": 0.9400192768702271, "grad_norm": 4.485959205093944, "learning_rate": 1.876831362620379e-09, "loss": 0.6671, "step": 9509 }, { "epoch": 0.9401181326149816, "grad_norm": 3.7240502530218462, "learning_rate": 1.8706616791422715e-09, "loss": 0.6899, "step": 9510 }, { "epoch": 0.9402169883597361, "grad_norm": 3.7184910265387696, "learning_rate": 1.8645020573802905e-09, "loss": 0.5886, "step": 9511 }, { "epoch": 0.9403158441044905, "grad_norm": 4.818279616103093, "learning_rate": 1.8583524979660314e-09, "loss": 0.7816, "step": 9512 }, { "epoch": 0.940414699849245, "grad_norm": 4.442129241017797, "learning_rate": 1.8522130015300119e-09, "loss": 0.8389, "step": 9513 }, { "epoch": 0.9405135555939994, "grad_norm": 6.303731532760152, "learning_rate": 1.8460835687017951e-09, "loss": 0.6037, "step": 9514 }, { "epoch": 0.940612411338754, "grad_norm": 7.707398075323764, "learning_rate": 1.8399642001098116e-09, "loss": 0.7242, "step": 9515 }, { "epoch": 0.9407112670835084, "grad_norm": 4.490662999858469, "learning_rate": 1.8338548963815482e-09, "loss": 0.6895, "step": 9516 }, { "epoch": 0.9408101228282628, "grad_norm": 9.035661648967672, "learning_rate": 1.827755658143415e-09, "loss": 0.7628, "step": 9517 }, { "epoch": 0.9409089785730174, "grad_norm": 3.324822260282589, "learning_rate": 1.8216664860207898e-09, "loss": 0.7586, "step": 9518 }, { "epoch": 0.9410078343177718, "grad_norm": 2.883821582969836, "learning_rate": 1.8155873806380396e-09, "loss": 0.7082, "step": 9519 }, { "epoch": 0.9411066900625262, "grad_norm": 3.2885143907397887, "learning_rate": 1.8095183426184878e-09, "loss": 0.7967, "step": 9520 }, { "epoch": 0.9412055458072808, "grad_norm": 4.016651746323613, "learning_rate": 1.8034593725844038e-09, "loss": 0.761, "step": 9521 }, { "epoch": 0.9413044015520352, "grad_norm": 37.94754153041741, "learning_rate": 1.797410471157057e-09, "loss": 0.7407, "step": 9522 }, { "epoch": 0.9414032572967896, "grad_norm": 4.781137485705265, "learning_rate": 1.7913716389566958e-09, "loss": 0.702, "step": 9523 }, { "epoch": 0.9415021130415441, "grad_norm": 3.675469303494617, "learning_rate": 1.7853428766024803e-09, "loss": 0.7073, "step": 9524 }, { "epoch": 0.9416009687862986, "grad_norm": 4.928609094733691, "learning_rate": 1.779324184712594e-09, "loss": 0.5802, "step": 9525 }, { "epoch": 0.941699824531053, "grad_norm": 3.996639579107412, "learning_rate": 1.773315563904143e-09, "loss": 0.8325, "step": 9526 }, { "epoch": 0.9417986802758075, "grad_norm": 3.637963527474457, "learning_rate": 1.767317014793257e-09, "loss": 0.7845, "step": 9527 }, { "epoch": 0.941897536020562, "grad_norm": 4.180321532776557, "learning_rate": 1.761328537994955e-09, "loss": 0.7203, "step": 9528 }, { "epoch": 0.9419963917653165, "grad_norm": 3.4498714140904005, "learning_rate": 1.7553501341233013e-09, "loss": 0.7422, "step": 9529 }, { "epoch": 0.9420952475100709, "grad_norm": 3.1662044692711513, "learning_rate": 1.7493818037912723e-09, "loss": 0.6721, "step": 9530 }, { "epoch": 0.9421941032548254, "grad_norm": 4.096140524567607, "learning_rate": 1.743423547610845e-09, "loss": 0.6697, "step": 9531 }, { "epoch": 0.9422929589995799, "grad_norm": 5.641133485488107, "learning_rate": 1.737475366192953e-09, "loss": 0.6811, "step": 9532 }, { "epoch": 0.9423918147443343, "grad_norm": 3.2898073327490946, "learning_rate": 1.7315372601474753e-09, "loss": 0.6675, "step": 9533 }, { "epoch": 0.9424906704890887, "grad_norm": 3.546898110060117, "learning_rate": 1.7256092300833025e-09, "loss": 0.5832, "step": 9534 }, { "epoch": 0.9425895262338433, "grad_norm": 10.202019965684796, "learning_rate": 1.7196912766082373e-09, "loss": 0.6834, "step": 9535 }, { "epoch": 0.9426883819785977, "grad_norm": 3.4272911735767244, "learning_rate": 1.7137834003291162e-09, "loss": 0.6664, "step": 9536 }, { "epoch": 0.9427872377233522, "grad_norm": 2.7459119684544544, "learning_rate": 1.7078856018516664e-09, "loss": 0.7507, "step": 9537 }, { "epoch": 0.9428860934681067, "grad_norm": 3.8630611789509635, "learning_rate": 1.701997881780637e-09, "loss": 0.7495, "step": 9538 }, { "epoch": 0.9429849492128611, "grad_norm": 5.384312523499959, "learning_rate": 1.696120240719734e-09, "loss": 0.7234, "step": 9539 }, { "epoch": 0.9430838049576156, "grad_norm": 2.889369447358657, "learning_rate": 1.6902526792716199e-09, "loss": 0.6785, "step": 9540 }, { "epoch": 0.9431826607023701, "grad_norm": 4.834195663686083, "learning_rate": 1.6843951980379133e-09, "loss": 0.6519, "step": 9541 }, { "epoch": 0.9432815164471245, "grad_norm": 4.055440550535125, "learning_rate": 1.678547797619234e-09, "loss": 0.6583, "step": 9542 }, { "epoch": 0.943380372191879, "grad_norm": 4.554359241228796, "learning_rate": 1.6727104786151247e-09, "loss": 0.7572, "step": 9543 }, { "epoch": 0.9434792279366334, "grad_norm": 4.543998028730801, "learning_rate": 1.6668832416241285e-09, "loss": 0.6764, "step": 9544 }, { "epoch": 0.943578083681388, "grad_norm": 6.375545131358971, "learning_rate": 1.6610660872437454e-09, "loss": 0.732, "step": 9545 }, { "epoch": 0.9436769394261424, "grad_norm": 4.5446417771247205, "learning_rate": 1.6552590160704317e-09, "loss": 0.6697, "step": 9546 }, { "epoch": 0.9437757951708968, "grad_norm": 2.8446867103316693, "learning_rate": 1.649462028699622e-09, "loss": 0.7065, "step": 9547 }, { "epoch": 0.9438746509156514, "grad_norm": 14.12016688933478, "learning_rate": 1.6436751257256965e-09, "loss": 0.7156, "step": 9548 }, { "epoch": 0.9439735066604058, "grad_norm": 4.9885228255671175, "learning_rate": 1.637898307742036e-09, "loss": 0.7318, "step": 9549 }, { "epoch": 0.9440723624051602, "grad_norm": 4.921791159261515, "learning_rate": 1.632131575340967e-09, "loss": 0.6344, "step": 9550 }, { "epoch": 0.9441712181499148, "grad_norm": 5.358949474243263, "learning_rate": 1.6263749291137606e-09, "loss": 0.7132, "step": 9551 }, { "epoch": 0.9442700738946692, "grad_norm": 10.314000415825845, "learning_rate": 1.6206283696507006e-09, "loss": 0.7748, "step": 9552 }, { "epoch": 0.9443689296394236, "grad_norm": 2.9639729766388063, "learning_rate": 1.6148918975410154e-09, "loss": 0.6851, "step": 9553 }, { "epoch": 0.9444677853841782, "grad_norm": 13.78915318982032, "learning_rate": 1.6091655133728676e-09, "loss": 0.7048, "step": 9554 }, { "epoch": 0.9445666411289326, "grad_norm": 3.1420632635632955, "learning_rate": 1.6034492177334324e-09, "loss": 0.5917, "step": 9555 }, { "epoch": 0.9446654968736871, "grad_norm": 4.228712703902489, "learning_rate": 1.5977430112088296e-09, "loss": 0.7274, "step": 9556 }, { "epoch": 0.9447643526184415, "grad_norm": 3.6848647723822903, "learning_rate": 1.5920468943841469e-09, "loss": 0.6859, "step": 9557 }, { "epoch": 0.944863208363196, "grad_norm": 2.8463442592894563, "learning_rate": 1.5863608678434393e-09, "loss": 0.6775, "step": 9558 }, { "epoch": 0.9449620641079505, "grad_norm": 14.16641554404359, "learning_rate": 1.580684932169718e-09, "loss": 0.7362, "step": 9559 }, { "epoch": 0.9450609198527049, "grad_norm": 5.7874088781158415, "learning_rate": 1.575019087944962e-09, "loss": 0.7285, "step": 9560 }, { "epoch": 0.9451597755974595, "grad_norm": 5.8754845106660945, "learning_rate": 1.56936333575014e-09, "loss": 0.6576, "step": 9561 }, { "epoch": 0.9452586313422139, "grad_norm": 4.368819259455327, "learning_rate": 1.563717676165155e-09, "loss": 0.667, "step": 9562 }, { "epoch": 0.9453574870869683, "grad_norm": 5.533860586705769, "learning_rate": 1.5580821097688768e-09, "loss": 0.6598, "step": 9563 }, { "epoch": 0.9454563428317229, "grad_norm": 5.076910941421249, "learning_rate": 1.5524566371391656e-09, "loss": 0.6482, "step": 9564 }, { "epoch": 0.9455551985764773, "grad_norm": 3.461274933528676, "learning_rate": 1.5468412588528157e-09, "loss": 0.7712, "step": 9565 }, { "epoch": 0.9456540543212317, "grad_norm": 3.2905133723351443, "learning_rate": 1.5412359754856108e-09, "loss": 0.7098, "step": 9566 }, { "epoch": 0.9457529100659862, "grad_norm": 11.135119342609995, "learning_rate": 1.5356407876122911e-09, "loss": 0.7017, "step": 9567 }, { "epoch": 0.9458517658107407, "grad_norm": 4.6191519086631985, "learning_rate": 1.5300556958065536e-09, "loss": 0.7163, "step": 9568 }, { "epoch": 0.9459506215554951, "grad_norm": 4.370946682008554, "learning_rate": 1.5244807006410732e-09, "loss": 0.675, "step": 9569 }, { "epoch": 0.9460494773002496, "grad_norm": 4.436988320092492, "learning_rate": 1.5189158026874704e-09, "loss": 0.7369, "step": 9570 }, { "epoch": 0.9461483330450041, "grad_norm": 18.427922074618525, "learning_rate": 1.5133610025163557e-09, "loss": 0.7057, "step": 9571 }, { "epoch": 0.9462471887897586, "grad_norm": 6.36627222240923, "learning_rate": 1.5078163006973066e-09, "loss": 0.6836, "step": 9572 }, { "epoch": 0.946346044534513, "grad_norm": 10.70946063942642, "learning_rate": 1.502281697798824e-09, "loss": 0.6454, "step": 9573 }, { "epoch": 0.9464449002792675, "grad_norm": 7.818420657876119, "learning_rate": 1.4967571943883983e-09, "loss": 0.8043, "step": 9574 }, { "epoch": 0.946543756024022, "grad_norm": 3.406175339000333, "learning_rate": 1.4912427910325209e-09, "loss": 0.7003, "step": 9575 }, { "epoch": 0.9466426117687764, "grad_norm": 6.243815321159405, "learning_rate": 1.4857384882965729e-09, "loss": 0.7077, "step": 9576 }, { "epoch": 0.9467414675135308, "grad_norm": 8.738423005550182, "learning_rate": 1.4802442867449472e-09, "loss": 0.5686, "step": 9577 }, { "epoch": 0.9468403232582854, "grad_norm": 9.123164796795818, "learning_rate": 1.4747601869410154e-09, "loss": 0.7058, "step": 9578 }, { "epoch": 0.9469391790030398, "grad_norm": 5.809920538056159, "learning_rate": 1.4692861894470721e-09, "loss": 0.7757, "step": 9579 }, { "epoch": 0.9470380347477942, "grad_norm": 7.965921667950453, "learning_rate": 1.4638222948244127e-09, "loss": 0.6984, "step": 9580 }, { "epoch": 0.9471368904925488, "grad_norm": 13.969528876775147, "learning_rate": 1.458368503633256e-09, "loss": 0.781, "step": 9581 }, { "epoch": 0.9472357462373032, "grad_norm": 4.957091867756254, "learning_rate": 1.45292481643281e-09, "loss": 0.644, "step": 9582 }, { "epoch": 0.9473346019820577, "grad_norm": 3.4013317612039975, "learning_rate": 1.4474912337812616e-09, "loss": 0.6661, "step": 9583 }, { "epoch": 0.9474334577268122, "grad_norm": 6.493526394811664, "learning_rate": 1.442067756235732e-09, "loss": 0.6709, "step": 9584 }, { "epoch": 0.9475323134715666, "grad_norm": 2.9247587221022617, "learning_rate": 1.4366543843523205e-09, "loss": 0.7275, "step": 9585 }, { "epoch": 0.9476311692163211, "grad_norm": 3.7720568998272364, "learning_rate": 1.4312511186861053e-09, "loss": 0.7638, "step": 9586 }, { "epoch": 0.9477300249610755, "grad_norm": 4.0993808269323235, "learning_rate": 1.425857959791077e-09, "loss": 0.7151, "step": 9587 }, { "epoch": 0.94782888070583, "grad_norm": 3.995536861087634, "learning_rate": 1.4204749082202482e-09, "loss": 0.7474, "step": 9588 }, { "epoch": 0.9479277364505845, "grad_norm": 3.3457523059123266, "learning_rate": 1.415101964525578e-09, "loss": 0.6676, "step": 9589 }, { "epoch": 0.9480265921953389, "grad_norm": 5.458175249248427, "learning_rate": 1.4097391292579586e-09, "loss": 0.6722, "step": 9590 }, { "epoch": 0.9481254479400935, "grad_norm": 15.708911263237425, "learning_rate": 1.4043864029672948e-09, "loss": 0.6667, "step": 9591 }, { "epoch": 0.9482243036848479, "grad_norm": 2.731033598470992, "learning_rate": 1.399043786202403e-09, "loss": 0.7365, "step": 9592 }, { "epoch": 0.9483231594296023, "grad_norm": 5.586102167529366, "learning_rate": 1.3937112795111117e-09, "loss": 0.6179, "step": 9593 }, { "epoch": 0.9484220151743569, "grad_norm": 3.190846058589971, "learning_rate": 1.3883888834401836e-09, "loss": 0.7002, "step": 9594 }, { "epoch": 0.9485208709191113, "grad_norm": 7.143162335583241, "learning_rate": 1.3830765985353488e-09, "loss": 0.6477, "step": 9595 }, { "epoch": 0.9486197266638657, "grad_norm": 4.003028787628363, "learning_rate": 1.3777744253413159e-09, "loss": 0.7631, "step": 9596 }, { "epoch": 0.9487185824086202, "grad_norm": 13.211707019449607, "learning_rate": 1.372482364401728e-09, "loss": 0.6653, "step": 9597 }, { "epoch": 0.9488174381533747, "grad_norm": 3.266825494737795, "learning_rate": 1.3672004162592287e-09, "loss": 0.7338, "step": 9598 }, { "epoch": 0.9489162938981291, "grad_norm": 3.202834326360478, "learning_rate": 1.3619285814553738e-09, "loss": 0.7086, "step": 9599 }, { "epoch": 0.9490151496428836, "grad_norm": 4.083126667197546, "learning_rate": 1.356666860530742e-09, "loss": 0.8052, "step": 9600 }, { "epoch": 0.9491140053876381, "grad_norm": 5.352618424847311, "learning_rate": 1.3514152540248237e-09, "loss": 0.7569, "step": 9601 }, { "epoch": 0.9492128611323926, "grad_norm": 3.451007196577972, "learning_rate": 1.3461737624760993e-09, "loss": 0.6259, "step": 9602 }, { "epoch": 0.949311716877147, "grad_norm": 3.8648539121877, "learning_rate": 1.340942386422017e-09, "loss": 0.7411, "step": 9603 }, { "epoch": 0.9494105726219015, "grad_norm": 3.8074213702915505, "learning_rate": 1.3357211263989588e-09, "loss": 0.7252, "step": 9604 }, { "epoch": 0.949509428366656, "grad_norm": 13.916537197939082, "learning_rate": 1.3305099829423072e-09, "loss": 0.6702, "step": 9605 }, { "epoch": 0.9496082841114104, "grad_norm": 3.822858963432504, "learning_rate": 1.3253089565863794e-09, "loss": 0.6091, "step": 9606 }, { "epoch": 0.9497071398561648, "grad_norm": 4.665566657321777, "learning_rate": 1.3201180478644601e-09, "loss": 0.7278, "step": 9607 }, { "epoch": 0.9498059956009194, "grad_norm": 5.033409044133929, "learning_rate": 1.3149372573088123e-09, "loss": 0.7568, "step": 9608 }, { "epoch": 0.9499048513456738, "grad_norm": 4.320330014231467, "learning_rate": 1.3097665854506334e-09, "loss": 0.6726, "step": 9609 }, { "epoch": 0.9500037070904283, "grad_norm": 4.935807217177442, "learning_rate": 1.3046060328200992e-09, "loss": 0.8174, "step": 9610 }, { "epoch": 0.9501025628351828, "grad_norm": 3.720533022372357, "learning_rate": 1.2994555999463751e-09, "loss": 0.723, "step": 9611 }, { "epoch": 0.9502014185799372, "grad_norm": 8.625374400364402, "learning_rate": 1.2943152873575391e-09, "loss": 0.6938, "step": 9612 }, { "epoch": 0.9503002743246917, "grad_norm": 4.266800230286825, "learning_rate": 1.2891850955806582e-09, "loss": 0.6431, "step": 9613 }, { "epoch": 0.9503991300694462, "grad_norm": 5.13796154773431, "learning_rate": 1.2840650251417562e-09, "loss": 0.6069, "step": 9614 }, { "epoch": 0.9504979858142006, "grad_norm": 3.8725358868236706, "learning_rate": 1.2789550765658241e-09, "loss": 0.6704, "step": 9615 }, { "epoch": 0.9505968415589551, "grad_norm": 4.105359328341502, "learning_rate": 1.2738552503768096e-09, "loss": 0.7355, "step": 9616 }, { "epoch": 0.9506956973037095, "grad_norm": 3.088569221513314, "learning_rate": 1.2687655470976277e-09, "loss": 0.7426, "step": 9617 }, { "epoch": 0.950794553048464, "grad_norm": 4.319713397637834, "learning_rate": 1.2636859672501498e-09, "loss": 0.7625, "step": 9618 }, { "epoch": 0.9508934087932185, "grad_norm": 3.821154319114971, "learning_rate": 1.258616511355215e-09, "loss": 0.6724, "step": 9619 }, { "epoch": 0.9509922645379729, "grad_norm": 4.032760426388844, "learning_rate": 1.2535571799326183e-09, "loss": 0.7275, "step": 9620 }, { "epoch": 0.9510911202827275, "grad_norm": 4.198799957790205, "learning_rate": 1.2485079735011227e-09, "loss": 0.6757, "step": 9621 }, { "epoch": 0.9511899760274819, "grad_norm": 2.918365892761069, "learning_rate": 1.2434688925784476e-09, "loss": 0.6602, "step": 9622 }, { "epoch": 0.9512888317722363, "grad_norm": 5.697747485084655, "learning_rate": 1.2384399376812683e-09, "loss": 0.6641, "step": 9623 }, { "epoch": 0.9513876875169909, "grad_norm": 3.4429000935753216, "learning_rate": 1.23342110932525e-09, "loss": 0.7265, "step": 9624 }, { "epoch": 0.9514865432617453, "grad_norm": 3.382926476464799, "learning_rate": 1.2284124080249703e-09, "loss": 0.654, "step": 9625 }, { "epoch": 0.9515853990064997, "grad_norm": 3.7131804057062703, "learning_rate": 1.2234138342940293e-09, "loss": 0.5713, "step": 9626 }, { "epoch": 0.9516842547512542, "grad_norm": 7.390561081347685, "learning_rate": 1.2184253886449392e-09, "loss": 0.6979, "step": 9627 }, { "epoch": 0.9517831104960087, "grad_norm": 5.243705524115942, "learning_rate": 1.213447071589191e-09, "loss": 0.7429, "step": 9628 }, { "epoch": 0.9518819662407632, "grad_norm": 3.55169440754598, "learning_rate": 1.2084788836372318e-09, "loss": 0.754, "step": 9629 }, { "epoch": 0.9519808219855176, "grad_norm": 7.556704159227265, "learning_rate": 1.2035208252985097e-09, "loss": 0.7159, "step": 9630 }, { "epoch": 0.9520796777302721, "grad_norm": 4.888665254885543, "learning_rate": 1.1985728970813625e-09, "loss": 0.6682, "step": 9631 }, { "epoch": 0.9521785334750266, "grad_norm": 10.85766643603529, "learning_rate": 1.1936350994931398e-09, "loss": 0.6057, "step": 9632 }, { "epoch": 0.952277389219781, "grad_norm": 7.950734072601201, "learning_rate": 1.1887074330401481e-09, "loss": 0.7753, "step": 9633 }, { "epoch": 0.9523762449645355, "grad_norm": 20.72581477301955, "learning_rate": 1.1837898982276384e-09, "loss": 0.7647, "step": 9634 }, { "epoch": 0.95247510070929, "grad_norm": 4.554731291558747, "learning_rate": 1.1788824955598299e-09, "loss": 0.6497, "step": 9635 }, { "epoch": 0.9525739564540444, "grad_norm": 4.52733011973087, "learning_rate": 1.1739852255399086e-09, "loss": 0.7165, "step": 9636 }, { "epoch": 0.952672812198799, "grad_norm": 3.9655019303795602, "learning_rate": 1.1690980886700063e-09, "loss": 0.6843, "step": 9637 }, { "epoch": 0.9527716679435534, "grad_norm": 5.967078803704163, "learning_rate": 1.1642210854512557e-09, "loss": 0.7442, "step": 9638 }, { "epoch": 0.9528705236883078, "grad_norm": 2.9626798270821686, "learning_rate": 1.15935421638369e-09, "loss": 0.6834, "step": 9639 }, { "epoch": 0.9529693794330623, "grad_norm": 4.304890334308182, "learning_rate": 1.1544974819663434e-09, "loss": 0.7134, "step": 9640 }, { "epoch": 0.9530682351778168, "grad_norm": 4.210812933699883, "learning_rate": 1.1496508826972172e-09, "loss": 0.7285, "step": 9641 }, { "epoch": 0.9531670909225712, "grad_norm": 22.28098723576354, "learning_rate": 1.1448144190732479e-09, "loss": 0.6296, "step": 9642 }, { "epoch": 0.9532659466673257, "grad_norm": 3.6553961273250697, "learning_rate": 1.139988091590327e-09, "loss": 0.5938, "step": 9643 }, { "epoch": 0.9533648024120802, "grad_norm": 3.498547217589503, "learning_rate": 1.1351719007433592e-09, "loss": 0.5882, "step": 9644 }, { "epoch": 0.9534636581568346, "grad_norm": 4.03815831675684, "learning_rate": 1.130365847026149e-09, "loss": 0.7056, "step": 9645 }, { "epoch": 0.9535625139015891, "grad_norm": 3.5073943887801713, "learning_rate": 1.1255699309314914e-09, "loss": 0.7356, "step": 9646 }, { "epoch": 0.9536613696463436, "grad_norm": 2.9851538930781296, "learning_rate": 1.1207841529511263e-09, "loss": 0.6836, "step": 9647 }, { "epoch": 0.9537602253910981, "grad_norm": 20.80705722156873, "learning_rate": 1.1160085135757834e-09, "loss": 0.7815, "step": 9648 }, { "epoch": 0.9538590811358525, "grad_norm": 7.188801675477845, "learning_rate": 1.1112430132951266e-09, "loss": 0.6002, "step": 9649 }, { "epoch": 0.9539579368806069, "grad_norm": 4.615957562552767, "learning_rate": 1.1064876525977872e-09, "loss": 0.6842, "step": 9650 }, { "epoch": 0.9540567926253615, "grad_norm": 4.029867573310436, "learning_rate": 1.101742431971353e-09, "loss": 0.6653, "step": 9651 }, { "epoch": 0.9541556483701159, "grad_norm": 8.724206220828663, "learning_rate": 1.0970073519023904e-09, "loss": 0.7537, "step": 9652 }, { "epoch": 0.9542545041148703, "grad_norm": 3.6550534375922896, "learning_rate": 1.0922824128763997e-09, "loss": 0.7431, "step": 9653 }, { "epoch": 0.9543533598596249, "grad_norm": 4.142880988521536, "learning_rate": 1.087567615377849e-09, "loss": 0.7114, "step": 9654 }, { "epoch": 0.9544522156043793, "grad_norm": 3.4525522389574332, "learning_rate": 1.0828629598901739e-09, "loss": 0.6682, "step": 9655 }, { "epoch": 0.9545510713491338, "grad_norm": 32.59812766707639, "learning_rate": 1.0781684468957774e-09, "loss": 0.71, "step": 9656 }, { "epoch": 0.9546499270938883, "grad_norm": 13.05438644738811, "learning_rate": 1.073484076876019e-09, "loss": 0.6863, "step": 9657 }, { "epoch": 0.9547487828386427, "grad_norm": 3.7740075508938067, "learning_rate": 1.068809850311181e-09, "loss": 0.6884, "step": 9658 }, { "epoch": 0.9548476385833972, "grad_norm": 2.911363290074344, "learning_rate": 1.0641457676805577e-09, "loss": 0.6908, "step": 9659 }, { "epoch": 0.9549464943281516, "grad_norm": 4.5614308785149555, "learning_rate": 1.059491829462389e-09, "loss": 0.7117, "step": 9660 }, { "epoch": 0.9550453500729061, "grad_norm": 4.1285527033913985, "learning_rate": 1.0548480361338597e-09, "loss": 0.7176, "step": 9661 }, { "epoch": 0.9551442058176606, "grad_norm": 26.878823688206424, "learning_rate": 1.0502143881711113e-09, "loss": 0.5754, "step": 9662 }, { "epoch": 0.955243061562415, "grad_norm": 4.536673748063369, "learning_rate": 1.0455908860492745e-09, "loss": 0.6253, "step": 9663 }, { "epoch": 0.9553419173071696, "grad_norm": 5.417885611029824, "learning_rate": 1.040977530242404e-09, "loss": 0.7872, "step": 9664 }, { "epoch": 0.955440773051924, "grad_norm": 5.711958789984218, "learning_rate": 1.036374321223532e-09, "loss": 0.6848, "step": 9665 }, { "epoch": 0.9555396287966784, "grad_norm": 3.1785964713292216, "learning_rate": 1.0317812594646702e-09, "loss": 0.6607, "step": 9666 }, { "epoch": 0.955638484541433, "grad_norm": 3.3837373478797166, "learning_rate": 1.0271983454367639e-09, "loss": 0.6939, "step": 9667 }, { "epoch": 0.9557373402861874, "grad_norm": 18.033654552987084, "learning_rate": 1.0226255796096927e-09, "loss": 0.7108, "step": 9668 }, { "epoch": 0.9558361960309418, "grad_norm": 3.420800981305091, "learning_rate": 1.0180629624523707e-09, "loss": 0.7955, "step": 9669 }, { "epoch": 0.9559350517756963, "grad_norm": 8.328212262374233, "learning_rate": 1.01351049443259e-09, "loss": 0.7239, "step": 9670 }, { "epoch": 0.9560339075204508, "grad_norm": 5.201204712180959, "learning_rate": 1.0089681760171663e-09, "loss": 0.7447, "step": 9671 }, { "epoch": 0.9561327632652052, "grad_norm": 4.387677888756311, "learning_rate": 1.0044360076718273e-09, "loss": 0.7292, "step": 9672 }, { "epoch": 0.9562316190099597, "grad_norm": 4.500534657618348, "learning_rate": 9.999139898612896e-10, "loss": 0.7185, "step": 9673 }, { "epoch": 0.9563304747547142, "grad_norm": 3.9323838181360937, "learning_rate": 9.95402123049227e-10, "loss": 0.8698, "step": 9674 }, { "epoch": 0.9564293304994687, "grad_norm": 3.0976638983995906, "learning_rate": 9.909004076982474e-10, "loss": 0.6414, "step": 9675 }, { "epoch": 0.9565281862442231, "grad_norm": 4.022168224939847, "learning_rate": 9.86408844269937e-10, "loss": 0.7131, "step": 9676 }, { "epoch": 0.9566270419889776, "grad_norm": 2.7652593911582346, "learning_rate": 9.819274332248607e-10, "loss": 0.663, "step": 9677 }, { "epoch": 0.9567258977337321, "grad_norm": 3.585534024343479, "learning_rate": 9.774561750225064e-10, "loss": 0.7385, "step": 9678 }, { "epoch": 0.9568247534784865, "grad_norm": 3.856384147511755, "learning_rate": 9.729950701213296e-10, "loss": 0.655, "step": 9679 }, { "epoch": 0.9569236092232409, "grad_norm": 5.169840256261319, "learning_rate": 9.685441189787535e-10, "loss": 0.5754, "step": 9680 }, { "epoch": 0.9570224649679955, "grad_norm": 8.898940549930296, "learning_rate": 9.641033220511686e-10, "loss": 0.7557, "step": 9681 }, { "epoch": 0.9571213207127499, "grad_norm": 3.256256487517087, "learning_rate": 9.596726797939102e-10, "loss": 0.7442, "step": 9682 }, { "epoch": 0.9572201764575043, "grad_norm": 3.9139738065965393, "learning_rate": 9.5525219266126e-10, "loss": 0.6913, "step": 9683 }, { "epoch": 0.9573190322022589, "grad_norm": 3.5272098876913303, "learning_rate": 9.508418611064883e-10, "loss": 0.8209, "step": 9684 }, { "epoch": 0.9574178879470133, "grad_norm": 5.083151727401354, "learning_rate": 9.464416855818002e-10, "loss": 0.6329, "step": 9685 }, { "epoch": 0.9575167436917678, "grad_norm": 9.721560612665419, "learning_rate": 9.420516665383904e-10, "loss": 0.7256, "step": 9686 }, { "epoch": 0.9576155994365223, "grad_norm": 4.267563678627701, "learning_rate": 9.376718044263653e-10, "loss": 0.7852, "step": 9687 }, { "epoch": 0.9577144551812767, "grad_norm": 6.773626415891948, "learning_rate": 9.333020996948216e-10, "loss": 0.5887, "step": 9688 }, { "epoch": 0.9578133109260312, "grad_norm": 4.236723764927924, "learning_rate": 9.289425527918116e-10, "loss": 0.7548, "step": 9689 }, { "epoch": 0.9579121666707856, "grad_norm": 5.593399423076282, "learning_rate": 9.245931641643446e-10, "loss": 0.681, "step": 9690 }, { "epoch": 0.9580110224155401, "grad_norm": 3.3661947112741206, "learning_rate": 9.202539342583859e-10, "loss": 0.7419, "step": 9691 }, { "epoch": 0.9581098781602946, "grad_norm": 3.1598945524679385, "learning_rate": 9.159248635188466e-10, "loss": 0.6697, "step": 9692 }, { "epoch": 0.958208733905049, "grad_norm": 4.269866843534518, "learning_rate": 9.116059523896269e-10, "loss": 0.7978, "step": 9693 }, { "epoch": 0.9583075896498036, "grad_norm": 4.3204954154925765, "learning_rate": 9.072972013135616e-10, "loss": 0.7042, "step": 9694 }, { "epoch": 0.958406445394558, "grad_norm": 3.158918891628624, "learning_rate": 9.029986107324416e-10, "loss": 0.66, "step": 9695 }, { "epoch": 0.9585053011393124, "grad_norm": 3.0862266905690876, "learning_rate": 8.987101810870478e-10, "loss": 0.5958, "step": 9696 }, { "epoch": 0.958604156884067, "grad_norm": 3.9873507596323416, "learning_rate": 8.944319128170619e-10, "loss": 0.6888, "step": 9697 }, { "epoch": 0.9587030126288214, "grad_norm": 3.0080664566583724, "learning_rate": 8.901638063611661e-10, "loss": 0.6494, "step": 9698 }, { "epoch": 0.9588018683735758, "grad_norm": 3.4987754836462437, "learning_rate": 8.859058621570215e-10, "loss": 0.6369, "step": 9699 }, { "epoch": 0.9589007241183303, "grad_norm": 12.494324803650857, "learning_rate": 8.816580806411789e-10, "loss": 0.7379, "step": 9700 }, { "epoch": 0.9589995798630848, "grad_norm": 8.995062488748955, "learning_rate": 8.774204622492121e-10, "loss": 0.7173, "step": 9701 }, { "epoch": 0.9590984356078393, "grad_norm": 11.053461650189186, "learning_rate": 8.731930074156069e-10, "loss": 0.6636, "step": 9702 }, { "epoch": 0.9591972913525937, "grad_norm": 4.026287265933042, "learning_rate": 8.689757165738387e-10, "loss": 0.6949, "step": 9703 }, { "epoch": 0.9592961470973482, "grad_norm": 5.2627006179694575, "learning_rate": 8.647685901563284e-10, "loss": 0.6509, "step": 9704 }, { "epoch": 0.9593950028421027, "grad_norm": 3.651827585355856, "learning_rate": 8.60571628594442e-10, "loss": 0.7442, "step": 9705 }, { "epoch": 0.9594938585868571, "grad_norm": 4.194697476578764, "learning_rate": 8.563848323185353e-10, "loss": 0.7904, "step": 9706 }, { "epoch": 0.9595927143316116, "grad_norm": 13.33652523740163, "learning_rate": 8.522082017578869e-10, "loss": 0.8419, "step": 9707 }, { "epoch": 0.9596915700763661, "grad_norm": 4.820101658623336, "learning_rate": 8.480417373407656e-10, "loss": 0.6705, "step": 9708 }, { "epoch": 0.9597904258211205, "grad_norm": 6.728002230520101, "learning_rate": 8.438854394943518e-10, "loss": 0.7492, "step": 9709 }, { "epoch": 0.959889281565875, "grad_norm": 3.5587468298612297, "learning_rate": 8.39739308644849e-10, "loss": 0.7615, "step": 9710 }, { "epoch": 0.9599881373106295, "grad_norm": 3.1144952089965647, "learning_rate": 8.356033452173505e-10, "loss": 0.7201, "step": 9711 }, { "epoch": 0.9600869930553839, "grad_norm": 6.799729312686852, "learning_rate": 8.314775496359616e-10, "loss": 0.7051, "step": 9712 }, { "epoch": 0.9601858488001384, "grad_norm": 3.1806485460535963, "learning_rate": 8.273619223236994e-10, "loss": 0.684, "step": 9713 }, { "epoch": 0.9602847045448929, "grad_norm": 4.980913827814384, "learning_rate": 8.232564637025707e-10, "loss": 0.7499, "step": 9714 }, { "epoch": 0.9603835602896473, "grad_norm": 5.738533061745901, "learning_rate": 8.1916117419355e-10, "loss": 0.6237, "step": 9715 }, { "epoch": 0.9604824160344018, "grad_norm": 4.332679892084157, "learning_rate": 8.150760542165125e-10, "loss": 0.7221, "step": 9716 }, { "epoch": 0.9605812717791563, "grad_norm": 4.473401907434002, "learning_rate": 8.110011041903564e-10, "loss": 0.6932, "step": 9717 }, { "epoch": 0.9606801275239107, "grad_norm": 7.415137561464604, "learning_rate": 8.069363245328919e-10, "loss": 0.7354, "step": 9718 }, { "epoch": 0.9607789832686652, "grad_norm": 4.90114092837828, "learning_rate": 8.02881715660908e-10, "loss": 0.7159, "step": 9719 }, { "epoch": 0.9608778390134197, "grad_norm": 4.186600454680899, "learning_rate": 7.988372779901387e-10, "loss": 0.7927, "step": 9720 }, { "epoch": 0.9609766947581742, "grad_norm": 3.7126562603143123, "learning_rate": 7.948030119352966e-10, "loss": 0.7742, "step": 9721 }, { "epoch": 0.9610755505029286, "grad_norm": 3.3351796314899347, "learning_rate": 7.907789179100177e-10, "loss": 0.7884, "step": 9722 }, { "epoch": 0.961174406247683, "grad_norm": 3.612729035356766, "learning_rate": 7.867649963269274e-10, "loss": 0.6505, "step": 9723 }, { "epoch": 0.9612732619924376, "grad_norm": 5.102026893976018, "learning_rate": 7.827612475975853e-10, "loss": 0.6761, "step": 9724 }, { "epoch": 0.961372117737192, "grad_norm": 3.240145263389014, "learning_rate": 7.787676721325187e-10, "loss": 0.6334, "step": 9725 }, { "epoch": 0.9614709734819464, "grad_norm": 2.8857992231216625, "learning_rate": 7.747842703412111e-10, "loss": 0.6471, "step": 9726 }, { "epoch": 0.961569829226701, "grad_norm": 11.087193659402446, "learning_rate": 7.708110426321024e-10, "loss": 0.7482, "step": 9727 }, { "epoch": 0.9616686849714554, "grad_norm": 4.395452736301807, "learning_rate": 7.66847989412589e-10, "loss": 0.6584, "step": 9728 }, { "epoch": 0.9617675407162098, "grad_norm": 5.473843599944854, "learning_rate": 7.628951110890236e-10, "loss": 0.7508, "step": 9729 }, { "epoch": 0.9618663964609644, "grad_norm": 4.3477825167187705, "learning_rate": 7.589524080667154e-10, "loss": 0.677, "step": 9730 }, { "epoch": 0.9619652522057188, "grad_norm": 3.6123872421783236, "learning_rate": 7.550198807499187e-10, "loss": 0.6595, "step": 9731 }, { "epoch": 0.9620641079504733, "grad_norm": 2.6982750489933647, "learning_rate": 7.510975295418776e-10, "loss": 0.7506, "step": 9732 }, { "epoch": 0.9621629636952277, "grad_norm": 5.267785490279878, "learning_rate": 7.471853548447593e-10, "loss": 0.6672, "step": 9733 }, { "epoch": 0.9622618194399822, "grad_norm": 4.360093215967715, "learning_rate": 7.432833570596985e-10, "loss": 0.6338, "step": 9734 }, { "epoch": 0.9623606751847367, "grad_norm": 3.1019696241552093, "learning_rate": 7.393915365867864e-10, "loss": 0.6757, "step": 9735 }, { "epoch": 0.9624595309294911, "grad_norm": 3.3082399215520937, "learning_rate": 7.355098938250815e-10, "loss": 0.7658, "step": 9736 }, { "epoch": 0.9625583866742456, "grad_norm": 5.742720731818882, "learning_rate": 7.316384291725763e-10, "loss": 0.7226, "step": 9737 }, { "epoch": 0.9626572424190001, "grad_norm": 5.126850206228592, "learning_rate": 7.277771430262536e-10, "loss": 0.6803, "step": 9738 }, { "epoch": 0.9627560981637545, "grad_norm": 9.714562543482561, "learning_rate": 7.239260357819965e-10, "loss": 0.6812, "step": 9739 }, { "epoch": 0.9628549539085091, "grad_norm": 3.8644242410869176, "learning_rate": 7.200851078347114e-10, "loss": 0.7827, "step": 9740 }, { "epoch": 0.9629538096532635, "grad_norm": 4.071164485270337, "learning_rate": 7.162543595782166e-10, "loss": 0.7898, "step": 9741 }, { "epoch": 0.9630526653980179, "grad_norm": 6.625017485617194, "learning_rate": 7.124337914052869e-10, "loss": 0.7369, "step": 9742 }, { "epoch": 0.9631515211427724, "grad_norm": 4.758917132208141, "learning_rate": 7.086234037076976e-10, "loss": 0.6858, "step": 9743 }, { "epoch": 0.9632503768875269, "grad_norm": 3.725849334856043, "learning_rate": 7.048231968761142e-10, "loss": 0.6787, "step": 9744 }, { "epoch": 0.9633492326322813, "grad_norm": 3.1488583994301824, "learning_rate": 7.010331713002027e-10, "loss": 0.6303, "step": 9745 }, { "epoch": 0.9634480883770358, "grad_norm": 8.119405740304217, "learning_rate": 6.972533273685854e-10, "loss": 0.7227, "step": 9746 }, { "epoch": 0.9635469441217903, "grad_norm": 4.186050938117485, "learning_rate": 6.934836654688081e-10, "loss": 0.7752, "step": 9747 }, { "epoch": 0.9636457998665447, "grad_norm": 4.115467115259784, "learning_rate": 6.89724185987417e-10, "loss": 0.6861, "step": 9748 }, { "epoch": 0.9637446556112992, "grad_norm": 6.16089539141914, "learning_rate": 6.859748893098816e-10, "loss": 0.615, "step": 9749 }, { "epoch": 0.9638435113560537, "grad_norm": 4.725833263509995, "learning_rate": 6.822357758206276e-10, "loss": 0.6372, "step": 9750 }, { "epoch": 0.9639423671008082, "grad_norm": 4.5118571425752485, "learning_rate": 6.785068459030707e-10, "loss": 0.7269, "step": 9751 }, { "epoch": 0.9640412228455626, "grad_norm": 10.53675692264601, "learning_rate": 6.747880999395272e-10, "loss": 0.7397, "step": 9752 }, { "epoch": 0.964140078590317, "grad_norm": 23.27990148023194, "learning_rate": 6.710795383113143e-10, "loss": 0.6905, "step": 9753 }, { "epoch": 0.9642389343350716, "grad_norm": 4.432709126082438, "learning_rate": 6.673811613986946e-10, "loss": 0.7574, "step": 9754 }, { "epoch": 0.964337790079826, "grad_norm": 4.841921083451247, "learning_rate": 6.636929695808868e-10, "loss": 0.7212, "step": 9755 }, { "epoch": 0.9644366458245804, "grad_norm": 5.069606632487082, "learning_rate": 6.60014963236033e-10, "loss": 0.6933, "step": 9756 }, { "epoch": 0.964535501569335, "grad_norm": 6.3082976381566365, "learning_rate": 6.56347142741298e-10, "loss": 0.6713, "step": 9757 }, { "epoch": 0.9646343573140894, "grad_norm": 3.3573668685547418, "learning_rate": 6.526895084727257e-10, "loss": 0.7103, "step": 9758 }, { "epoch": 0.9647332130588439, "grad_norm": 3.076213164864413, "learning_rate": 6.490420608053715e-10, "loss": 0.7103, "step": 9759 }, { "epoch": 0.9648320688035984, "grad_norm": 3.1470329026729127, "learning_rate": 6.45404800113225e-10, "loss": 0.7622, "step": 9760 }, { "epoch": 0.9649309245483528, "grad_norm": 3.419263046341245, "learning_rate": 6.417777267692326e-10, "loss": 0.7406, "step": 9761 }, { "epoch": 0.9650297802931073, "grad_norm": 8.01218112882822, "learning_rate": 6.381608411452965e-10, "loss": 0.6728, "step": 9762 }, { "epoch": 0.9651286360378617, "grad_norm": 4.410672966082493, "learning_rate": 6.345541436122759e-10, "loss": 0.7442, "step": 9763 }, { "epoch": 0.9652274917826162, "grad_norm": 4.696448536849743, "learning_rate": 6.309576345399858e-10, "loss": 0.6381, "step": 9764 }, { "epoch": 0.9653263475273707, "grad_norm": 3.170348971763446, "learning_rate": 6.273713142971871e-10, "loss": 0.7322, "step": 9765 }, { "epoch": 0.9654252032721251, "grad_norm": 7.433237395538701, "learning_rate": 6.237951832516186e-10, "loss": 0.68, "step": 9766 }, { "epoch": 0.9655240590168797, "grad_norm": 3.875400779386129, "learning_rate": 6.20229241769954e-10, "loss": 0.7307, "step": 9767 }, { "epoch": 0.9656229147616341, "grad_norm": 16.586924224126516, "learning_rate": 6.166734902178228e-10, "loss": 0.7675, "step": 9768 }, { "epoch": 0.9657217705063885, "grad_norm": 2.814432549697897, "learning_rate": 6.131279289598113e-10, "loss": 0.6566, "step": 9769 }, { "epoch": 0.9658206262511431, "grad_norm": 3.524509665070472, "learning_rate": 6.095925583594841e-10, "loss": 0.7798, "step": 9770 }, { "epoch": 0.9659194819958975, "grad_norm": 3.6488139135331203, "learning_rate": 6.060673787793181e-10, "loss": 0.7088, "step": 9771 }, { "epoch": 0.9660183377406519, "grad_norm": 5.01039292669606, "learning_rate": 6.025523905807795e-10, "loss": 0.672, "step": 9772 }, { "epoch": 0.9661171934854064, "grad_norm": 8.93765463919214, "learning_rate": 5.990475941242801e-10, "loss": 0.8412, "step": 9773 }, { "epoch": 0.9662160492301609, "grad_norm": 3.0111872846821623, "learning_rate": 5.95552989769188e-10, "loss": 0.7112, "step": 9774 }, { "epoch": 0.9663149049749153, "grad_norm": 4.980707458375923, "learning_rate": 5.920685778738055e-10, "loss": 0.7043, "step": 9775 }, { "epoch": 0.9664137607196698, "grad_norm": 6.119793841855568, "learning_rate": 5.885943587954356e-10, "loss": 0.7859, "step": 9776 }, { "epoch": 0.9665126164644243, "grad_norm": 3.584889669981746, "learning_rate": 5.851303328902823e-10, "loss": 0.6501, "step": 9777 }, { "epoch": 0.9666114722091788, "grad_norm": 4.7611567316222985, "learning_rate": 5.816765005135393e-10, "loss": 0.6904, "step": 9778 }, { "epoch": 0.9667103279539332, "grad_norm": 3.280101140946791, "learning_rate": 5.782328620193566e-10, "loss": 0.7019, "step": 9779 }, { "epoch": 0.9668091836986877, "grad_norm": 3.4807381079820012, "learning_rate": 5.747994177608073e-10, "loss": 0.7081, "step": 9780 }, { "epoch": 0.9669080394434422, "grad_norm": 5.463363369625085, "learning_rate": 5.713761680899654e-10, "loss": 0.5903, "step": 9781 }, { "epoch": 0.9670068951881966, "grad_norm": 6.110272995259872, "learning_rate": 5.679631133578166e-10, "loss": 0.807, "step": 9782 }, { "epoch": 0.967105750932951, "grad_norm": 6.232554247048842, "learning_rate": 5.645602539143257e-10, "loss": 0.7717, "step": 9783 }, { "epoch": 0.9672046066777056, "grad_norm": 4.768974985616243, "learning_rate": 5.611675901084023e-10, "loss": 0.677, "step": 9784 }, { "epoch": 0.96730346242246, "grad_norm": 3.6904399764117266, "learning_rate": 5.577851222879127e-10, "loss": 0.7649, "step": 9785 }, { "epoch": 0.9674023181672144, "grad_norm": 3.227719385733215, "learning_rate": 5.544128507996903e-10, "loss": 0.6948, "step": 9786 }, { "epoch": 0.967501173911969, "grad_norm": 3.5532518998844864, "learning_rate": 5.510507759895033e-10, "loss": 0.7744, "step": 9787 }, { "epoch": 0.9676000296567234, "grad_norm": 7.312322990578629, "learning_rate": 5.476988982020869e-10, "loss": 0.7747, "step": 9788 }, { "epoch": 0.9676988854014779, "grad_norm": 3.9065111260856957, "learning_rate": 5.443572177811217e-10, "loss": 0.6107, "step": 9789 }, { "epoch": 0.9677977411462324, "grad_norm": 5.849672187547683, "learning_rate": 5.410257350692449e-10, "loss": 0.7302, "step": 9790 }, { "epoch": 0.9678965968909868, "grad_norm": 2.8837299740660804, "learning_rate": 5.377044504080608e-10, "loss": 0.612, "step": 9791 }, { "epoch": 0.9679954526357413, "grad_norm": 3.8240600878818936, "learning_rate": 5.343933641381193e-10, "loss": 0.6592, "step": 9792 }, { "epoch": 0.9680943083804958, "grad_norm": 6.130189209584632, "learning_rate": 5.310924765989044e-10, "loss": 0.6922, "step": 9793 }, { "epoch": 0.9681931641252502, "grad_norm": 3.81312987172568, "learning_rate": 5.278017881288898e-10, "loss": 0.6982, "step": 9794 }, { "epoch": 0.9682920198700047, "grad_norm": 43.36402181810251, "learning_rate": 5.245212990654835e-10, "loss": 0.6802, "step": 9795 }, { "epoch": 0.9683908756147591, "grad_norm": 11.284023388337026, "learning_rate": 5.212510097450606e-10, "loss": 0.5642, "step": 9796 }, { "epoch": 0.9684897313595137, "grad_norm": 5.984420379265393, "learning_rate": 5.179909205029198e-10, "loss": 0.7121, "step": 9797 }, { "epoch": 0.9685885871042681, "grad_norm": 5.519638998020378, "learning_rate": 5.147410316733491e-10, "loss": 0.6762, "step": 9798 }, { "epoch": 0.9686874428490225, "grad_norm": 3.9828941993472444, "learning_rate": 5.115013435895821e-10, "loss": 0.679, "step": 9799 }, { "epoch": 0.9687862985937771, "grad_norm": 7.862648797220195, "learning_rate": 5.082718565837862e-10, "loss": 0.7183, "step": 9800 }, { "epoch": 0.9688851543385315, "grad_norm": 5.496884303112868, "learning_rate": 5.050525709871078e-10, "loss": 0.7439, "step": 9801 }, { "epoch": 0.9689840100832859, "grad_norm": 3.3413601803040383, "learning_rate": 5.018434871296384e-10, "loss": 0.5929, "step": 9802 }, { "epoch": 0.9690828658280405, "grad_norm": 3.602073932968947, "learning_rate": 4.986446053404148e-10, "loss": 0.7776, "step": 9803 }, { "epoch": 0.9691817215727949, "grad_norm": 21.958791351568347, "learning_rate": 4.954559259474411e-10, "loss": 0.6789, "step": 9804 }, { "epoch": 0.9692805773175494, "grad_norm": 6.777421932263544, "learning_rate": 4.92277449277656e-10, "loss": 0.7388, "step": 9805 }, { "epoch": 0.9693794330623038, "grad_norm": 3.893636829701767, "learning_rate": 4.891091756569876e-10, "loss": 0.6374, "step": 9806 }, { "epoch": 0.9694782888070583, "grad_norm": 12.890525529240557, "learning_rate": 4.859511054102872e-10, "loss": 0.77, "step": 9807 }, { "epoch": 0.9695771445518128, "grad_norm": 3.466698809403156, "learning_rate": 4.828032388613623e-10, "loss": 0.6431, "step": 9808 }, { "epoch": 0.9696760002965672, "grad_norm": 3.5365022998460818, "learning_rate": 4.796655763329771e-10, "loss": 0.6638, "step": 9809 }, { "epoch": 0.9697748560413217, "grad_norm": 4.105668325236833, "learning_rate": 4.76538118146863e-10, "loss": 0.7199, "step": 9810 }, { "epoch": 0.9698737117860762, "grad_norm": 2.736198003256814, "learning_rate": 4.734208646236971e-10, "loss": 0.6361, "step": 9811 }, { "epoch": 0.9699725675308306, "grad_norm": 4.3015621516780636, "learning_rate": 4.7031381608309e-10, "loss": 0.7504, "step": 9812 }, { "epoch": 0.9700714232755852, "grad_norm": 5.431033522245993, "learning_rate": 4.672169728436425e-10, "loss": 0.684, "step": 9813 }, { "epoch": 0.9701702790203396, "grad_norm": 4.297093631581839, "learning_rate": 4.641303352228787e-10, "loss": 0.6638, "step": 9814 }, { "epoch": 0.970269134765094, "grad_norm": 4.18150798110243, "learning_rate": 4.6105390353728954e-10, "loss": 0.783, "step": 9815 }, { "epoch": 0.9703679905098485, "grad_norm": 7.59770805003625, "learning_rate": 4.5798767810232287e-10, "loss": 0.6932, "step": 9816 }, { "epoch": 0.970466846254603, "grad_norm": 14.334301412268056, "learning_rate": 4.5493165923237153e-10, "loss": 0.7922, "step": 9817 }, { "epoch": 0.9705657019993574, "grad_norm": 3.868625049813854, "learning_rate": 4.5188584724078495e-10, "loss": 0.6097, "step": 9818 }, { "epoch": 0.9706645577441119, "grad_norm": 3.1787901625067962, "learning_rate": 4.488502424398688e-10, "loss": 0.781, "step": 9819 }, { "epoch": 0.9707634134888664, "grad_norm": 3.989400451892565, "learning_rate": 4.4582484514087417e-10, "loss": 0.7462, "step": 9820 }, { "epoch": 0.9708622692336208, "grad_norm": 6.539807280750128, "learning_rate": 4.428096556540084e-10, "loss": 0.7481, "step": 9821 }, { "epoch": 0.9709611249783753, "grad_norm": 3.1922652785589998, "learning_rate": 4.398046742884576e-10, "loss": 0.7372, "step": 9822 }, { "epoch": 0.9710599807231298, "grad_norm": 4.6887046279928635, "learning_rate": 4.3680990135230856e-10, "loss": 0.7521, "step": 9823 }, { "epoch": 0.9711588364678843, "grad_norm": 4.39018015083259, "learning_rate": 4.3382533715264903e-10, "loss": 0.6379, "step": 9824 }, { "epoch": 0.9712576922126387, "grad_norm": 3.5213994865775917, "learning_rate": 4.3085098199548977e-10, "loss": 0.8004, "step": 9825 }, { "epoch": 0.9713565479573931, "grad_norm": 3.5237488431662536, "learning_rate": 4.278868361858201e-10, "loss": 0.7018, "step": 9826 }, { "epoch": 0.9714554037021477, "grad_norm": 9.717385674846772, "learning_rate": 4.2493290002756367e-10, "loss": 0.7299, "step": 9827 }, { "epoch": 0.9715542594469021, "grad_norm": 3.9083619720909426, "learning_rate": 4.2198917382360033e-10, "loss": 0.6253, "step": 9828 }, { "epoch": 0.9716531151916565, "grad_norm": 5.013508086628578, "learning_rate": 4.190556578757665e-10, "loss": 0.7451, "step": 9829 }, { "epoch": 0.9717519709364111, "grad_norm": 4.524913204463662, "learning_rate": 4.1613235248485476e-10, "loss": 0.881, "step": 9830 }, { "epoch": 0.9718508266811655, "grad_norm": 3.432922706559564, "learning_rate": 4.132192579506144e-10, "loss": 0.7332, "step": 9831 }, { "epoch": 0.97194968242592, "grad_norm": 6.726804365772089, "learning_rate": 4.1031637457171754e-10, "loss": 0.7111, "step": 9832 }, { "epoch": 0.9720485381706745, "grad_norm": 4.431726548872343, "learning_rate": 4.07423702645826e-10, "loss": 0.6306, "step": 9833 }, { "epoch": 0.9721473939154289, "grad_norm": 5.290738578075236, "learning_rate": 4.045412424695471e-10, "loss": 0.7735, "step": 9834 }, { "epoch": 0.9722462496601834, "grad_norm": 3.7088054856265247, "learning_rate": 4.016689943384222e-10, "loss": 0.7313, "step": 9835 }, { "epoch": 0.9723451054049378, "grad_norm": 3.057922273226034, "learning_rate": 3.988069585469711e-10, "loss": 0.652, "step": 9836 }, { "epoch": 0.9724439611496923, "grad_norm": 10.725433282676205, "learning_rate": 3.95955135388637e-10, "loss": 0.6984, "step": 9837 }, { "epoch": 0.9725428168944468, "grad_norm": 3.873926827278017, "learning_rate": 3.931135251558415e-10, "loss": 0.7975, "step": 9838 }, { "epoch": 0.9726416726392012, "grad_norm": 10.654237845121067, "learning_rate": 3.902821281399515e-10, "loss": 0.6359, "step": 9839 }, { "epoch": 0.9727405283839557, "grad_norm": 8.382597783813386, "learning_rate": 3.874609446312793e-10, "loss": 0.8106, "step": 9840 }, { "epoch": 0.9728393841287102, "grad_norm": 2.993188768524057, "learning_rate": 3.8464997491909347e-10, "loss": 0.6845, "step": 9841 }, { "epoch": 0.9729382398734646, "grad_norm": 8.047100018416605, "learning_rate": 3.8184921929163005e-10, "loss": 0.7647, "step": 9842 }, { "epoch": 0.9730370956182192, "grad_norm": 3.5632866677879442, "learning_rate": 3.790586780360705e-10, "loss": 0.6946, "step": 9843 }, { "epoch": 0.9731359513629736, "grad_norm": 4.01233934497106, "learning_rate": 3.7627835143850814e-10, "loss": 0.6746, "step": 9844 }, { "epoch": 0.973234807107728, "grad_norm": 9.718228466762687, "learning_rate": 3.7350823978405944e-10, "loss": 0.7411, "step": 9845 }, { "epoch": 0.9733336628524825, "grad_norm": 9.445175571161732, "learning_rate": 3.707483433567526e-10, "loss": 0.7534, "step": 9846 }, { "epoch": 0.973432518597237, "grad_norm": 3.293166124551131, "learning_rate": 3.6799866243956144e-10, "loss": 0.6839, "step": 9847 }, { "epoch": 0.9735313743419914, "grad_norm": 3.7910276177122264, "learning_rate": 3.65259197314427e-10, "loss": 0.7231, "step": 9848 }, { "epoch": 0.9736302300867459, "grad_norm": 4.087673153428777, "learning_rate": 3.625299482622468e-10, "loss": 0.7625, "step": 9849 }, { "epoch": 0.9737290858315004, "grad_norm": 3.600363712852235, "learning_rate": 3.598109155628748e-10, "loss": 0.732, "step": 9850 }, { "epoch": 0.9738279415762549, "grad_norm": 2.866469683521575, "learning_rate": 3.5710209949508796e-10, "loss": 0.7544, "step": 9851 }, { "epoch": 0.9739267973210093, "grad_norm": 3.0487963334165307, "learning_rate": 3.544035003366419e-10, "loss": 0.6315, "step": 9852 }, { "epoch": 0.9740256530657638, "grad_norm": 4.156559151766531, "learning_rate": 3.517151183642486e-10, "loss": 0.8085, "step": 9853 }, { "epoch": 0.9741245088105183, "grad_norm": 22.842367884742874, "learning_rate": 3.4903695385355424e-10, "loss": 0.6553, "step": 9854 }, { "epoch": 0.9742233645552727, "grad_norm": 7.214943387830145, "learning_rate": 3.463690070791614e-10, "loss": 0.7119, "step": 9855 }, { "epoch": 0.9743222203000271, "grad_norm": 3.7418582447586535, "learning_rate": 3.437112783146401e-10, "loss": 0.7107, "step": 9856 }, { "epoch": 0.9744210760447817, "grad_norm": 4.302395281430418, "learning_rate": 3.410637678324835e-10, "loss": 0.7328, "step": 9857 }, { "epoch": 0.9745199317895361, "grad_norm": 5.366360172707314, "learning_rate": 3.384264759041744e-10, "loss": 0.6542, "step": 9858 }, { "epoch": 0.9746187875342905, "grad_norm": 4.336013585816708, "learning_rate": 3.3579940280010763e-10, "loss": 0.6488, "step": 9859 }, { "epoch": 0.9747176432790451, "grad_norm": 3.3833596872229617, "learning_rate": 3.331825487896678e-10, "loss": 0.6197, "step": 9860 }, { "epoch": 0.9748164990237995, "grad_norm": 2.9435571400162948, "learning_rate": 3.3057591414116237e-10, "loss": 0.6591, "step": 9861 }, { "epoch": 0.974915354768554, "grad_norm": 3.4856193449371244, "learning_rate": 3.279794991218776e-10, "loss": 0.7431, "step": 9862 }, { "epoch": 0.9750142105133085, "grad_norm": 4.65297764388433, "learning_rate": 3.253933039980339e-10, "loss": 0.8518, "step": 9863 }, { "epoch": 0.9751130662580629, "grad_norm": 3.611510734468384, "learning_rate": 3.228173290347969e-10, "loss": 0.6759, "step": 9864 }, { "epoch": 0.9752119220028174, "grad_norm": 4.396228649727912, "learning_rate": 3.202515744962886e-10, "loss": 0.7519, "step": 9865 }, { "epoch": 0.9753107777475719, "grad_norm": 14.82489325607934, "learning_rate": 3.176960406456208e-10, "loss": 0.741, "step": 9866 }, { "epoch": 0.9754096334923263, "grad_norm": 5.341015611270225, "learning_rate": 3.1515072774479513e-10, "loss": 0.7356, "step": 9867 }, { "epoch": 0.9755084892370808, "grad_norm": 3.5268333061095847, "learning_rate": 3.126156360548138e-10, "loss": 0.7241, "step": 9868 }, { "epoch": 0.9756073449818352, "grad_norm": 4.405675457532796, "learning_rate": 3.1009076583560225e-10, "loss": 0.7032, "step": 9869 }, { "epoch": 0.9757062007265898, "grad_norm": 3.2748381471452204, "learning_rate": 3.075761173460534e-10, "loss": 0.6133, "step": 9870 }, { "epoch": 0.9758050564713442, "grad_norm": 4.147903530018896, "learning_rate": 3.050716908440054e-10, "loss": 0.7325, "step": 9871 }, { "epoch": 0.9759039122160986, "grad_norm": 4.956861501530702, "learning_rate": 3.0257748658625295e-10, "loss": 0.7907, "step": 9872 }, { "epoch": 0.9760027679608532, "grad_norm": 3.986120343335268, "learning_rate": 3.000935048285469e-10, "loss": 0.6754, "step": 9873 }, { "epoch": 0.9761016237056076, "grad_norm": 6.364793656683157, "learning_rate": 2.9761974582556136e-10, "loss": 0.7633, "step": 9874 }, { "epoch": 0.976200479450362, "grad_norm": 3.6333559581309927, "learning_rate": 2.9515620983097124e-10, "loss": 0.7138, "step": 9875 }, { "epoch": 0.9762993351951166, "grad_norm": 3.491369699818869, "learning_rate": 2.9270289709735217e-10, "loss": 0.6814, "step": 9876 }, { "epoch": 0.976398190939871, "grad_norm": 9.03366985906941, "learning_rate": 2.902598078762697e-10, "loss": 0.7035, "step": 9877 }, { "epoch": 0.9764970466846254, "grad_norm": 5.319529687433717, "learning_rate": 2.8782694241821226e-10, "loss": 0.7387, "step": 9878 }, { "epoch": 0.9765959024293799, "grad_norm": 3.2612937519412237, "learning_rate": 2.854043009726581e-10, "loss": 0.7056, "step": 9879 }, { "epoch": 0.9766947581741344, "grad_norm": 4.177403945793153, "learning_rate": 2.829918837879863e-10, "loss": 0.6132, "step": 9880 }, { "epoch": 0.9767936139188889, "grad_norm": 2.6820373346316226, "learning_rate": 2.805896911115768e-10, "loss": 0.6527, "step": 9881 }, { "epoch": 0.9768924696636433, "grad_norm": 32.13629786149372, "learning_rate": 2.7819772318972146e-10, "loss": 0.7369, "step": 9882 }, { "epoch": 0.9769913254083978, "grad_norm": 3.899100037701286, "learning_rate": 2.758159802677018e-10, "loss": 0.7851, "step": 9883 }, { "epoch": 0.9770901811531523, "grad_norm": 4.07674298736393, "learning_rate": 2.734444625897003e-10, "loss": 0.8357, "step": 9884 }, { "epoch": 0.9771890368979067, "grad_norm": 3.430923453811778, "learning_rate": 2.710831703989114e-10, "loss": 0.6811, "step": 9885 }, { "epoch": 0.9772878926426612, "grad_norm": 8.666058120189678, "learning_rate": 2.687321039374413e-10, "loss": 0.7351, "step": 9886 }, { "epoch": 0.9773867483874157, "grad_norm": 4.4659607270315504, "learning_rate": 2.6639126344634165e-10, "loss": 0.6445, "step": 9887 }, { "epoch": 0.9774856041321701, "grad_norm": 3.6939422184739454, "learning_rate": 2.6406064916565385e-10, "loss": 0.6638, "step": 9888 }, { "epoch": 0.9775844598769245, "grad_norm": 3.813327268858609, "learning_rate": 2.6174026133433113e-10, "loss": 0.7485, "step": 9889 }, { "epoch": 0.9776833156216791, "grad_norm": 3.728782927727464, "learning_rate": 2.5943010019030545e-10, "loss": 0.7076, "step": 9890 }, { "epoch": 0.9777821713664335, "grad_norm": 2.958500823953866, "learning_rate": 2.571301659704539e-10, "loss": 0.6435, "step": 9891 }, { "epoch": 0.977881027111188, "grad_norm": 3.013869448883406, "learning_rate": 2.548404589105768e-10, "loss": 0.5983, "step": 9892 }, { "epoch": 0.9779798828559425, "grad_norm": 5.302610464484311, "learning_rate": 2.5256097924547524e-10, "loss": 0.7132, "step": 9893 }, { "epoch": 0.9780787386006969, "grad_norm": 5.380830364862367, "learning_rate": 2.502917272088734e-10, "loss": 0.6642, "step": 9894 }, { "epoch": 0.9781775943454514, "grad_norm": 3.0368158527641045, "learning_rate": 2.480327030334295e-10, "loss": 0.6186, "step": 9895 }, { "epoch": 0.9782764500902059, "grad_norm": 4.9952298386964795, "learning_rate": 2.4578390695079166e-10, "loss": 0.7029, "step": 9896 }, { "epoch": 0.9783753058349604, "grad_norm": 4.742092684310476, "learning_rate": 2.435453391915421e-10, "loss": 0.7646, "step": 9897 }, { "epoch": 0.9784741615797148, "grad_norm": 5.886400038443044, "learning_rate": 2.413169999852083e-10, "loss": 0.7026, "step": 9898 }, { "epoch": 0.9785730173244692, "grad_norm": 4.797896750842494, "learning_rate": 2.3909888956026303e-10, "loss": 0.679, "step": 9899 }, { "epoch": 0.9786718730692238, "grad_norm": 3.348508879087131, "learning_rate": 2.368910081441466e-10, "loss": 0.6838, "step": 9900 }, { "epoch": 0.9787707288139782, "grad_norm": 4.609385267644916, "learning_rate": 2.346933559632558e-10, "loss": 0.7948, "step": 9901 }, { "epoch": 0.9788695845587326, "grad_norm": 4.118060742094916, "learning_rate": 2.3250593324292134e-10, "loss": 0.7459, "step": 9902 }, { "epoch": 0.9789684403034872, "grad_norm": 3.017253387567331, "learning_rate": 2.3032874020741943e-10, "loss": 0.6872, "step": 9903 }, { "epoch": 0.9790672960482416, "grad_norm": 3.6385323344191933, "learning_rate": 2.2816177708000483e-10, "loss": 0.6439, "step": 9904 }, { "epoch": 0.979166151792996, "grad_norm": 12.847527852372831, "learning_rate": 2.2600504408286647e-10, "loss": 0.7226, "step": 9905 }, { "epoch": 0.9792650075377506, "grad_norm": 3.5140447449746777, "learning_rate": 2.2385854143713856e-10, "loss": 0.7603, "step": 9906 }, { "epoch": 0.979363863282505, "grad_norm": 40.77871241287877, "learning_rate": 2.217222693629117e-10, "loss": 0.7148, "step": 9907 }, { "epoch": 0.9794627190272595, "grad_norm": 29.265393620751098, "learning_rate": 2.1959622807923296e-10, "loss": 0.6797, "step": 9908 }, { "epoch": 0.9795615747720139, "grad_norm": 4.524540230881037, "learning_rate": 2.1748041780409453e-10, "loss": 0.6265, "step": 9909 }, { "epoch": 0.9796604305167684, "grad_norm": 3.708832330757567, "learning_rate": 2.1537483875445628e-10, "loss": 0.6664, "step": 9910 }, { "epoch": 0.9797592862615229, "grad_norm": 3.8988257818651904, "learning_rate": 2.1327949114618994e-10, "loss": 0.7057, "step": 9911 }, { "epoch": 0.9798581420062773, "grad_norm": 3.321139168944259, "learning_rate": 2.1119437519415694e-10, "loss": 0.6951, "step": 9912 }, { "epoch": 0.9799569977510318, "grad_norm": 4.609928489862205, "learning_rate": 2.09119491112153e-10, "loss": 0.6407, "step": 9913 }, { "epoch": 0.9800558534957863, "grad_norm": 5.1431567885094065, "learning_rate": 2.070548391129301e-10, "loss": 0.678, "step": 9914 }, { "epoch": 0.9801547092405407, "grad_norm": 16.883590195279165, "learning_rate": 2.0500041940817448e-10, "loss": 0.7148, "step": 9915 }, { "epoch": 0.9802535649852953, "grad_norm": 3.552474609872056, "learning_rate": 2.0295623220855096e-10, "loss": 0.6009, "step": 9916 }, { "epoch": 0.9803524207300497, "grad_norm": 3.2013360023016935, "learning_rate": 2.0092227772365855e-10, "loss": 0.7687, "step": 9917 }, { "epoch": 0.9804512764748041, "grad_norm": 4.368264705397335, "learning_rate": 1.9889855616204155e-10, "loss": 0.7012, "step": 9918 }, { "epoch": 0.9805501322195586, "grad_norm": 3.20721544737697, "learning_rate": 1.9688506773121172e-10, "loss": 0.7571, "step": 9919 }, { "epoch": 0.9806489879643131, "grad_norm": 3.917994691789507, "learning_rate": 1.9488181263761505e-10, "loss": 0.7051, "step": 9920 }, { "epoch": 0.9807478437090675, "grad_norm": 4.62219505336901, "learning_rate": 1.92888791086665e-10, "loss": 0.665, "step": 9921 }, { "epoch": 0.980846699453822, "grad_norm": 3.5945281060062126, "learning_rate": 1.9090600328269811e-10, "loss": 0.6604, "step": 9922 }, { "epoch": 0.9809455551985765, "grad_norm": 3.5046324714024997, "learning_rate": 1.8893344942904065e-10, "loss": 0.7153, "step": 9923 }, { "epoch": 0.981044410943331, "grad_norm": 4.4433401946259234, "learning_rate": 1.8697112972795304e-10, "loss": 0.7825, "step": 9924 }, { "epoch": 0.9811432666880854, "grad_norm": 5.968315767392347, "learning_rate": 1.8501904438061877e-10, "loss": 0.699, "step": 9925 }, { "epoch": 0.9812421224328399, "grad_norm": 2.9966015408695634, "learning_rate": 1.8307719358719999e-10, "loss": 0.5807, "step": 9926 }, { "epoch": 0.9813409781775944, "grad_norm": 4.1649764387096955, "learning_rate": 1.8114557754682624e-10, "loss": 0.6301, "step": 9927 }, { "epoch": 0.9814398339223488, "grad_norm": 4.373360869171952, "learning_rate": 1.7922419645752807e-10, "loss": 0.6505, "step": 9928 }, { "epoch": 0.9815386896671032, "grad_norm": 4.449797143040474, "learning_rate": 1.773130505163367e-10, "loss": 0.7694, "step": 9929 }, { "epoch": 0.9816375454118578, "grad_norm": 4.832628783887782, "learning_rate": 1.7541213991920657e-10, "loss": 0.6792, "step": 9930 }, { "epoch": 0.9817364011566122, "grad_norm": 3.4562825537883204, "learning_rate": 1.7352146486103725e-10, "loss": 0.6455, "step": 9931 }, { "epoch": 0.9818352569013666, "grad_norm": 6.4940467465641385, "learning_rate": 1.7164102553570704e-10, "loss": 0.6946, "step": 9932 }, { "epoch": 0.9819341126461212, "grad_norm": 3.118820495656103, "learning_rate": 1.6977082213601723e-10, "loss": 0.7063, "step": 9933 }, { "epoch": 0.9820329683908756, "grad_norm": 3.0809508490163076, "learning_rate": 1.6791085485371448e-10, "loss": 0.689, "step": 9934 }, { "epoch": 0.98213182413563, "grad_norm": 2.707978513673834, "learning_rate": 1.660611238795462e-10, "loss": 0.5611, "step": 9935 }, { "epoch": 0.9822306798803846, "grad_norm": 2.8474787613651498, "learning_rate": 1.6422162940313845e-10, "loss": 0.754, "step": 9936 }, { "epoch": 0.982329535625139, "grad_norm": 4.952379964250739, "learning_rate": 1.6239237161312925e-10, "loss": 0.6998, "step": 9937 }, { "epoch": 0.9824283913698935, "grad_norm": 6.069667226078943, "learning_rate": 1.6057335069707966e-10, "loss": 0.6542, "step": 9938 }, { "epoch": 0.9825272471146479, "grad_norm": 5.39797377651625, "learning_rate": 1.5876456684149607e-10, "loss": 0.7748, "step": 9939 }, { "epoch": 0.9826261028594024, "grad_norm": 8.567652336502583, "learning_rate": 1.5696602023183014e-10, "loss": 0.6973, "step": 9940 }, { "epoch": 0.9827249586041569, "grad_norm": 4.129489446355094, "learning_rate": 1.551777110525232e-10, "loss": 0.844, "step": 9941 }, { "epoch": 0.9828238143489113, "grad_norm": 3.6931308247752823, "learning_rate": 1.5339963948692857e-10, "loss": 0.6902, "step": 9942 }, { "epoch": 0.9829226700936659, "grad_norm": 16.615405599373148, "learning_rate": 1.51631805717356e-10, "loss": 0.6141, "step": 9943 }, { "epoch": 0.9830215258384203, "grad_norm": 4.009538573329675, "learning_rate": 1.498742099250716e-10, "loss": 0.6958, "step": 9944 }, { "epoch": 0.9831203815831747, "grad_norm": 18.807016268334888, "learning_rate": 1.4812685229029787e-10, "loss": 0.6796, "step": 9945 }, { "epoch": 0.9832192373279293, "grad_norm": 4.706884180195693, "learning_rate": 1.4638973299220257e-10, "loss": 0.6463, "step": 9946 }, { "epoch": 0.9833180930726837, "grad_norm": 3.858215772783094, "learning_rate": 1.4466285220888775e-10, "loss": 0.7486, "step": 9947 }, { "epoch": 0.9834169488174381, "grad_norm": 18.819162273409713, "learning_rate": 1.4294621011742282e-10, "loss": 0.6732, "step": 9948 }, { "epoch": 0.9835158045621927, "grad_norm": 8.341471659853797, "learning_rate": 1.4123980689383364e-10, "loss": 0.7147, "step": 9949 }, { "epoch": 0.9836146603069471, "grad_norm": 5.144863771086624, "learning_rate": 1.3954364271309138e-10, "loss": 0.7498, "step": 9950 }, { "epoch": 0.9837135160517015, "grad_norm": 6.454512955081075, "learning_rate": 1.3785771774909026e-10, "loss": 0.8006, "step": 9951 }, { "epoch": 0.983812371796456, "grad_norm": 4.9022639051854435, "learning_rate": 1.3618203217471424e-10, "loss": 0.5668, "step": 9952 }, { "epoch": 0.9839112275412105, "grad_norm": 3.801050549529694, "learning_rate": 1.345165861617814e-10, "loss": 0.6802, "step": 9953 }, { "epoch": 0.984010083285965, "grad_norm": 4.90729300365936, "learning_rate": 1.3286137988104408e-10, "loss": 0.7377, "step": 9954 }, { "epoch": 0.9841089390307194, "grad_norm": 10.32943859126779, "learning_rate": 1.312164135022331e-10, "loss": 0.6448, "step": 9955 }, { "epoch": 0.9842077947754739, "grad_norm": 20.4578840912686, "learning_rate": 1.2958168719401364e-10, "loss": 0.6545, "step": 9956 }, { "epoch": 0.9843066505202284, "grad_norm": 5.851436624348358, "learning_rate": 1.27957201123996e-10, "loss": 0.6392, "step": 9957 }, { "epoch": 0.9844055062649828, "grad_norm": 4.133461163069233, "learning_rate": 1.2634295545874696e-10, "loss": 0.736, "step": 9958 }, { "epoch": 0.9845043620097373, "grad_norm": 27.950272901779933, "learning_rate": 1.2473895036378967e-10, "loss": 0.7065, "step": 9959 }, { "epoch": 0.9846032177544918, "grad_norm": 3.8626218321452694, "learning_rate": 1.2314518600359258e-10, "loss": 0.6702, "step": 9960 }, { "epoch": 0.9847020734992462, "grad_norm": 4.084255439715281, "learning_rate": 1.215616625415694e-10, "loss": 0.7354, "step": 9961 }, { "epoch": 0.9848009292440006, "grad_norm": 3.803808906054133, "learning_rate": 1.1998838014009026e-10, "loss": 0.6672, "step": 9962 }, { "epoch": 0.9848997849887552, "grad_norm": 4.534919876011162, "learning_rate": 1.1842533896045948e-10, "loss": 0.5804, "step": 9963 }, { "epoch": 0.9849986407335096, "grad_norm": 4.409387728588799, "learning_rate": 1.168725391629599e-10, "loss": 0.6469, "step": 9964 }, { "epoch": 0.9850974964782641, "grad_norm": 4.047139214508962, "learning_rate": 1.1532998090679758e-10, "loss": 0.7143, "step": 9965 }, { "epoch": 0.9851963522230186, "grad_norm": 4.1717455182270635, "learning_rate": 1.1379766435014593e-10, "loss": 0.6903, "step": 9966 }, { "epoch": 0.985295207967773, "grad_norm": 3.1196441973007065, "learning_rate": 1.1227558965012373e-10, "loss": 0.7123, "step": 9967 }, { "epoch": 0.9853940637125275, "grad_norm": 6.041824512948329, "learning_rate": 1.1076375696278395e-10, "loss": 0.6363, "step": 9968 }, { "epoch": 0.985492919457282, "grad_norm": 5.7478154367933785, "learning_rate": 1.0926216644315811e-10, "loss": 0.6774, "step": 9969 }, { "epoch": 0.9855917752020364, "grad_norm": 3.9755313559566954, "learning_rate": 1.0777081824520084e-10, "loss": 0.6642, "step": 9970 }, { "epoch": 0.9856906309467909, "grad_norm": 3.6597063824236042, "learning_rate": 1.0628971252183427e-10, "loss": 0.7258, "step": 9971 }, { "epoch": 0.9857894866915453, "grad_norm": 4.409509577091096, "learning_rate": 1.048188494249147e-10, "loss": 0.7138, "step": 9972 }, { "epoch": 0.9858883424362999, "grad_norm": 2.714569549268208, "learning_rate": 1.0335822910526593e-10, "loss": 0.6178, "step": 9973 }, { "epoch": 0.9859871981810543, "grad_norm": 4.201549001128713, "learning_rate": 1.0190785171265704e-10, "loss": 0.6367, "step": 9974 }, { "epoch": 0.9860860539258087, "grad_norm": 4.91811784438016, "learning_rate": 1.004677173958024e-10, "loss": 0.6393, "step": 9975 }, { "epoch": 0.9861849096705633, "grad_norm": 4.810910429479335, "learning_rate": 9.903782630235058e-11, "loss": 0.6492, "step": 9976 }, { "epoch": 0.9862837654153177, "grad_norm": 4.2504483467825525, "learning_rate": 9.761817857892873e-11, "loss": 0.6977, "step": 9977 }, { "epoch": 0.9863826211600721, "grad_norm": 4.2561207971180375, "learning_rate": 9.620877437110931e-11, "loss": 0.7868, "step": 9978 }, { "epoch": 0.9864814769048267, "grad_norm": 4.426917421579262, "learning_rate": 9.480961382338782e-11, "loss": 0.695, "step": 9979 }, { "epoch": 0.9865803326495811, "grad_norm": 5.324591250320784, "learning_rate": 9.342069707922729e-11, "loss": 0.636, "step": 9980 }, { "epoch": 0.9866791883943355, "grad_norm": 4.289974497887985, "learning_rate": 9.204202428105823e-11, "loss": 0.7109, "step": 9981 }, { "epoch": 0.98677804413909, "grad_norm": 4.1888420754500615, "learning_rate": 9.067359557023424e-11, "loss": 0.7514, "step": 9982 }, { "epoch": 0.9868768998838445, "grad_norm": 3.892203932230894, "learning_rate": 8.931541108705421e-11, "loss": 0.7324, "step": 9983 }, { "epoch": 0.986975755628599, "grad_norm": 5.3225450905975595, "learning_rate": 8.79674709708067e-11, "loss": 0.8322, "step": 9984 }, { "epoch": 0.9870746113733534, "grad_norm": 4.09339809361668, "learning_rate": 8.66297753596923e-11, "loss": 0.6508, "step": 9985 }, { "epoch": 0.9871734671181079, "grad_norm": 4.353615897795182, "learning_rate": 8.530232439085683e-11, "loss": 0.7173, "step": 9986 }, { "epoch": 0.9872723228628624, "grad_norm": 4.877910742704973, "learning_rate": 8.398511820042475e-11, "loss": 0.6991, "step": 9987 }, { "epoch": 0.9873711786076168, "grad_norm": 3.6874270013839983, "learning_rate": 8.26781569234547e-11, "loss": 0.6511, "step": 9988 }, { "epoch": 0.9874700343523714, "grad_norm": 5.122228699353762, "learning_rate": 8.138144069395058e-11, "loss": 0.7521, "step": 9989 }, { "epoch": 0.9875688900971258, "grad_norm": 3.9592800079501003, "learning_rate": 8.009496964488382e-11, "loss": 0.6783, "step": 9990 }, { "epoch": 0.9876677458418802, "grad_norm": 8.847261761070852, "learning_rate": 7.881874390816001e-11, "loss": 0.6572, "step": 9991 }, { "epoch": 0.9877666015866347, "grad_norm": 7.203053987586899, "learning_rate": 7.755276361461893e-11, "loss": 0.7016, "step": 9992 }, { "epoch": 0.9878654573313892, "grad_norm": 3.08811316611876, "learning_rate": 7.629702889409006e-11, "loss": 0.7255, "step": 9993 }, { "epoch": 0.9879643130761436, "grad_norm": 4.080404630571386, "learning_rate": 7.505153987532598e-11, "loss": 0.6619, "step": 9994 }, { "epoch": 0.9880631688208981, "grad_norm": 4.251324219728391, "learning_rate": 7.381629668602452e-11, "loss": 0.6056, "step": 9995 }, { "epoch": 0.9881620245656526, "grad_norm": 4.383310435659365, "learning_rate": 7.259129945285102e-11, "loss": 0.8156, "step": 9996 }, { "epoch": 0.988260880310407, "grad_norm": 3.8436919945017998, "learning_rate": 7.137654830140505e-11, "loss": 0.699, "step": 9997 }, { "epoch": 0.9883597360551615, "grad_norm": 5.947776600003691, "learning_rate": 7.017204335623139e-11, "loss": 0.6669, "step": 9998 }, { "epoch": 0.988458591799916, "grad_norm": 6.194668081288551, "learning_rate": 6.897778474086458e-11, "loss": 0.7522, "step": 9999 }, { "epoch": 0.9885574475446705, "grad_norm": 3.7058306150967733, "learning_rate": 6.779377257772889e-11, "loss": 0.6414, "step": 10000 }, { "epoch": 0.9886563032894249, "grad_norm": 8.590374700074914, "learning_rate": 6.662000698823833e-11, "loss": 0.732, "step": 10001 }, { "epoch": 0.9887551590341793, "grad_norm": 3.1665087349288323, "learning_rate": 6.545648809275218e-11, "loss": 0.7691, "step": 10002 }, { "epoch": 0.9888540147789339, "grad_norm": 6.200114195771567, "learning_rate": 6.430321601056388e-11, "loss": 0.7826, "step": 10003 }, { "epoch": 0.9889528705236883, "grad_norm": 6.266211800291764, "learning_rate": 6.31601908599233e-11, "loss": 0.656, "step": 10004 }, { "epoch": 0.9890517262684427, "grad_norm": 4.1992588520632905, "learning_rate": 6.202741275802558e-11, "loss": 0.8186, "step": 10005 }, { "epoch": 0.9891505820131973, "grad_norm": 8.553364970265939, "learning_rate": 6.090488182103337e-11, "loss": 0.7621, "step": 10006 }, { "epoch": 0.9892494377579517, "grad_norm": 18.116611480455088, "learning_rate": 5.979259816404347e-11, "loss": 0.7442, "step": 10007 }, { "epoch": 0.9893482935027061, "grad_norm": 3.6112898399444835, "learning_rate": 5.8690561901098e-11, "loss": 0.7449, "step": 10008 }, { "epoch": 0.9894471492474607, "grad_norm": 3.309982523423621, "learning_rate": 5.7598773145195455e-11, "loss": 0.6635, "step": 10009 }, { "epoch": 0.9895460049922151, "grad_norm": 3.7701211608509007, "learning_rate": 5.651723200829073e-11, "loss": 0.687, "step": 10010 }, { "epoch": 0.9896448607369696, "grad_norm": 8.601007196658097, "learning_rate": 5.544593860126179e-11, "loss": 0.7536, "step": 10011 }, { "epoch": 0.989743716481724, "grad_norm": 3.8821972748491054, "learning_rate": 5.438489303397631e-11, "loss": 0.6934, "step": 10012 }, { "epoch": 0.9898425722264785, "grad_norm": 3.4428153403931008, "learning_rate": 5.333409541521394e-11, "loss": 0.8656, "step": 10013 }, { "epoch": 0.989941427971233, "grad_norm": 5.096028362861823, "learning_rate": 5.229354585272183e-11, "loss": 0.7541, "step": 10014 }, { "epoch": 0.9900402837159874, "grad_norm": 2.610724580438784, "learning_rate": 5.126324445320351e-11, "loss": 0.6679, "step": 10015 }, { "epoch": 0.990139139460742, "grad_norm": 3.8456089108832354, "learning_rate": 5.0243191322285606e-11, "loss": 0.7534, "step": 10016 }, { "epoch": 0.9902379952054964, "grad_norm": 5.777160598567235, "learning_rate": 4.9233386564562217e-11, "loss": 0.6506, "step": 10017 }, { "epoch": 0.9903368509502508, "grad_norm": 4.076652240473739, "learning_rate": 4.823383028358385e-11, "loss": 0.682, "step": 10018 }, { "epoch": 0.9904357066950054, "grad_norm": 13.07678329562823, "learning_rate": 4.72445225818352e-11, "loss": 0.672, "step": 10019 }, { "epoch": 0.9905345624397598, "grad_norm": 6.286169184971684, "learning_rate": 4.626546356074623e-11, "loss": 0.6964, "step": 10020 }, { "epoch": 0.9906334181845142, "grad_norm": 33.62452340618132, "learning_rate": 4.529665332071442e-11, "loss": 0.6862, "step": 10021 }, { "epoch": 0.9907322739292688, "grad_norm": 3.2429492739181898, "learning_rate": 4.433809196108251e-11, "loss": 0.7006, "step": 10022 }, { "epoch": 0.9908311296740232, "grad_norm": 4.955199825057955, "learning_rate": 4.338977958011636e-11, "loss": 0.6715, "step": 10023 }, { "epoch": 0.9909299854187776, "grad_norm": 9.725605010256109, "learning_rate": 4.245171627507149e-11, "loss": 0.6936, "step": 10024 }, { "epoch": 0.9910288411635321, "grad_norm": 3.1505703113627836, "learning_rate": 4.1523902142126534e-11, "loss": 0.6118, "step": 10025 }, { "epoch": 0.9911276969082866, "grad_norm": 3.822181808685845, "learning_rate": 4.060633727641649e-11, "loss": 0.7161, "step": 10026 }, { "epoch": 0.991226552653041, "grad_norm": 4.049983523293202, "learning_rate": 3.969902177202167e-11, "loss": 0.6425, "step": 10027 }, { "epoch": 0.9913254083977955, "grad_norm": 5.796519034576253, "learning_rate": 3.880195572196765e-11, "loss": 0.7112, "step": 10028 }, { "epoch": 0.99142426414255, "grad_norm": 3.7939990623302458, "learning_rate": 3.7915139218247516e-11, "loss": 0.7745, "step": 10029 }, { "epoch": 0.9915231198873045, "grad_norm": 3.9943597395753736, "learning_rate": 3.703857235178853e-11, "loss": 0.6666, "step": 10030 }, { "epoch": 0.9916219756320589, "grad_norm": 6.259165740555087, "learning_rate": 3.617225521245215e-11, "loss": 0.6515, "step": 10031 }, { "epoch": 0.9917208313768134, "grad_norm": 7.834823501481987, "learning_rate": 3.531618788908952e-11, "loss": 0.7266, "step": 10032 }, { "epoch": 0.9918196871215679, "grad_norm": 5.04823730878093, "learning_rate": 3.447037046947487e-11, "loss": 0.6575, "step": 10033 }, { "epoch": 0.9919185428663223, "grad_norm": 3.881578798452637, "learning_rate": 3.363480304032773e-11, "loss": 0.7325, "step": 10034 }, { "epoch": 0.9920173986110767, "grad_norm": 6.95826052971369, "learning_rate": 3.2809485687312896e-11, "loss": 0.7633, "step": 10035 }, { "epoch": 0.9921162543558313, "grad_norm": 3.670076387614518, "learning_rate": 3.199441849507378e-11, "loss": 0.7828, "step": 10036 }, { "epoch": 0.9922151101005857, "grad_norm": 6.925535944950219, "learning_rate": 3.118960154717687e-11, "loss": 0.7452, "step": 10037 }, { "epoch": 0.9923139658453402, "grad_norm": 3.9562502499797585, "learning_rate": 3.039503492613393e-11, "loss": 0.6999, "step": 10038 }, { "epoch": 0.9924128215900947, "grad_norm": 3.0754543441414866, "learning_rate": 2.961071871342424e-11, "loss": 0.7571, "step": 10039 }, { "epoch": 0.9925116773348491, "grad_norm": 6.442723774692946, "learning_rate": 2.8836652989483456e-11, "loss": 0.7953, "step": 10040 }, { "epoch": 0.9926105330796036, "grad_norm": 3.595260920921651, "learning_rate": 2.807283783364811e-11, "loss": 0.7118, "step": 10041 }, { "epoch": 0.9927093888243581, "grad_norm": 3.653731524595311, "learning_rate": 2.7319273324266645e-11, "loss": 0.6934, "step": 10042 }, { "epoch": 0.9928082445691125, "grad_norm": 4.914944002414366, "learning_rate": 2.6575959538588377e-11, "loss": 0.7095, "step": 10043 }, { "epoch": 0.992907100313867, "grad_norm": 4.652113137789867, "learning_rate": 2.584289655283012e-11, "loss": 0.7061, "step": 10044 }, { "epoch": 0.9930059560586214, "grad_norm": 4.246194880014553, "learning_rate": 2.512008444216507e-11, "loss": 0.723, "step": 10045 }, { "epoch": 0.993104811803376, "grad_norm": 3.662838257210592, "learning_rate": 2.4407523280700614e-11, "loss": 0.7043, "step": 10046 }, { "epoch": 0.9932036675481304, "grad_norm": 28.292698575198894, "learning_rate": 2.3705213141500534e-11, "loss": 0.7052, "step": 10047 }, { "epoch": 0.9933025232928848, "grad_norm": 6.059770002362138, "learning_rate": 2.3013154096573894e-11, "loss": 0.6367, "step": 10048 }, { "epoch": 0.9934013790376394, "grad_norm": 4.661791045854803, "learning_rate": 2.233134621688615e-11, "loss": 0.7564, "step": 10049 }, { "epoch": 0.9935002347823938, "grad_norm": 5.262218364186442, "learning_rate": 2.165978957234804e-11, "loss": 0.6881, "step": 10050 }, { "epoch": 0.9935990905271482, "grad_norm": 4.525379456955892, "learning_rate": 2.0998484231804502e-11, "loss": 0.7056, "step": 10051 }, { "epoch": 0.9936979462719028, "grad_norm": 3.3615027974112497, "learning_rate": 2.0347430263079058e-11, "loss": 0.6282, "step": 10052 }, { "epoch": 0.9937968020166572, "grad_norm": 4.6494461083073375, "learning_rate": 1.9706627732918312e-11, "loss": 0.768, "step": 10053 }, { "epoch": 0.9938956577614116, "grad_norm": 3.4094258522168444, "learning_rate": 1.907607670703637e-11, "loss": 0.764, "step": 10054 }, { "epoch": 0.9939945135061661, "grad_norm": 6.886558040421821, "learning_rate": 1.8455777250070414e-11, "loss": 0.7426, "step": 10055 }, { "epoch": 0.9940933692509206, "grad_norm": 4.9332961533608755, "learning_rate": 1.7845729425647328e-11, "loss": 0.6931, "step": 10056 }, { "epoch": 0.9941922249956751, "grad_norm": 3.865952656555199, "learning_rate": 1.7245933296294867e-11, "loss": 0.7055, "step": 10057 }, { "epoch": 0.9942910807404295, "grad_norm": 4.175999167513154, "learning_rate": 1.6656388923519394e-11, "loss": 0.7318, "step": 10058 }, { "epoch": 0.994389936485184, "grad_norm": 4.8090703829565635, "learning_rate": 1.607709636778365e-11, "loss": 0.6848, "step": 10059 }, { "epoch": 0.9944887922299385, "grad_norm": 3.7310018250751, "learning_rate": 1.5508055688462364e-11, "loss": 0.7862, "step": 10060 }, { "epoch": 0.9945876479746929, "grad_norm": 5.503969161740724, "learning_rate": 1.4949266943919957e-11, "loss": 0.5905, "step": 10061 }, { "epoch": 0.9946865037194474, "grad_norm": 3.570452962276279, "learning_rate": 1.4400730191443944e-11, "loss": 0.7825, "step": 10062 }, { "epoch": 0.9947853594642019, "grad_norm": 3.5266760251606604, "learning_rate": 1.386244548728932e-11, "loss": 0.7146, "step": 10063 }, { "epoch": 0.9948842152089563, "grad_norm": 3.881392244544323, "learning_rate": 1.3334412886634172e-11, "loss": 0.6663, "step": 10064 }, { "epoch": 0.9949830709537107, "grad_norm": 19.405826166838725, "learning_rate": 1.2816632443635178e-11, "loss": 0.792, "step": 10065 }, { "epoch": 0.9950819266984653, "grad_norm": 5.983361769928404, "learning_rate": 1.23091042113721e-11, "loss": 0.6076, "step": 10066 }, { "epoch": 0.9951807824432197, "grad_norm": 5.24729335323364, "learning_rate": 1.1811828241892197e-11, "loss": 0.647, "step": 10067 }, { "epoch": 0.9952796381879742, "grad_norm": 4.179667870302554, "learning_rate": 1.1324804586176906e-11, "loss": 0.6401, "step": 10068 }, { "epoch": 0.9953784939327287, "grad_norm": 4.452015655357441, "learning_rate": 1.084803329416406e-11, "loss": 0.7658, "step": 10069 }, { "epoch": 0.9954773496774831, "grad_norm": 5.543867896527644, "learning_rate": 1.038151441473678e-11, "loss": 0.8326, "step": 10070 }, { "epoch": 0.9955762054222376, "grad_norm": 4.445258506951427, "learning_rate": 9.925247995745678e-12, "loss": 0.6965, "step": 10071 }, { "epoch": 0.9956750611669921, "grad_norm": 2.798483195999004, "learning_rate": 9.479234083953347e-12, "loss": 0.7135, "step": 10072 }, { "epoch": 0.9957739169117465, "grad_norm": 7.796110985897896, "learning_rate": 9.043472725112077e-12, "loss": 0.6553, "step": 10073 }, { "epoch": 0.995872772656501, "grad_norm": 4.024172767242262, "learning_rate": 8.617963963886143e-12, "loss": 0.6722, "step": 10074 }, { "epoch": 0.9959716284012554, "grad_norm": 4.213195866364077, "learning_rate": 8.202707843907308e-12, "loss": 0.7463, "step": 10075 }, { "epoch": 0.99607048414601, "grad_norm": 3.32949548312114, "learning_rate": 7.797704407763728e-12, "loss": 0.6087, "step": 10076 }, { "epoch": 0.9961693398907644, "grad_norm": 3.4689819195532703, "learning_rate": 7.402953696977743e-12, "loss": 0.6622, "step": 10077 }, { "epoch": 0.9962681956355188, "grad_norm": 2.941661083220348, "learning_rate": 7.018455752016983e-12, "loss": 0.6532, "step": 10078 }, { "epoch": 0.9963670513802734, "grad_norm": 2.9509758907018853, "learning_rate": 6.64421061231657e-12, "loss": 0.6302, "step": 10079 }, { "epoch": 0.9964659071250278, "grad_norm": 2.8195316907000225, "learning_rate": 6.280218316245811e-12, "loss": 0.7757, "step": 10080 }, { "epoch": 0.9965647628697822, "grad_norm": 5.756266518651886, "learning_rate": 5.926478901130405e-12, "loss": 0.6931, "step": 10081 }, { "epoch": 0.9966636186145368, "grad_norm": 3.8854233089112142, "learning_rate": 5.5829924032413375e-12, "loss": 0.7207, "step": 10082 }, { "epoch": 0.9967624743592912, "grad_norm": 3.7524955750325533, "learning_rate": 5.249758857794883e-12, "loss": 0.6798, "step": 10083 }, { "epoch": 0.9968613301040457, "grad_norm": 6.772098783919843, "learning_rate": 4.926778298952605e-12, "loss": 0.7572, "step": 10084 }, { "epoch": 0.9969601858488001, "grad_norm": 5.271145250423485, "learning_rate": 4.614050759843557e-12, "loss": 0.7143, "step": 10085 }, { "epoch": 0.9970590415935546, "grad_norm": 4.530367238002224, "learning_rate": 4.311576272519879e-12, "loss": 0.7126, "step": 10086 }, { "epoch": 0.9971578973383091, "grad_norm": 3.155896177827421, "learning_rate": 4.019354868012303e-12, "loss": 0.7533, "step": 10087 }, { "epoch": 0.9972567530830635, "grad_norm": 3.610583272152024, "learning_rate": 3.737386576274648e-12, "loss": 0.6648, "step": 10088 }, { "epoch": 0.997355608827818, "grad_norm": 5.296881977324083, "learning_rate": 3.4656714262171205e-12, "loss": 0.6955, "step": 10089 }, { "epoch": 0.9974544645725725, "grad_norm": 4.1446852602063915, "learning_rate": 3.2042094456952162e-12, "loss": 0.7763, "step": 10090 }, { "epoch": 0.9975533203173269, "grad_norm": 7.070971169380547, "learning_rate": 2.9530006615319236e-12, "loss": 0.6391, "step": 10091 }, { "epoch": 0.9976521760620815, "grad_norm": 6.875305478952838, "learning_rate": 2.712045099484417e-12, "loss": 0.8553, "step": 10092 }, { "epoch": 0.9977510318068359, "grad_norm": 3.9182128481002474, "learning_rate": 2.481342784244056e-12, "loss": 0.8486, "step": 10093 }, { "epoch": 0.9978498875515903, "grad_norm": 7.209460973752709, "learning_rate": 2.260893739480796e-12, "loss": 0.6622, "step": 10094 }, { "epoch": 0.9979487432963448, "grad_norm": 8.157860160842715, "learning_rate": 2.0506979877876753e-12, "loss": 0.7175, "step": 10095 }, { "epoch": 0.9980475990410993, "grad_norm": 4.104448229580695, "learning_rate": 1.850755550725225e-12, "loss": 0.6578, "step": 10096 }, { "epoch": 0.9981464547858537, "grad_norm": 4.552771378402265, "learning_rate": 1.6610664487881621e-12, "loss": 0.712, "step": 10097 }, { "epoch": 0.9982453105306082, "grad_norm": 4.508228463095843, "learning_rate": 1.4816307014275942e-12, "loss": 0.6569, "step": 10098 }, { "epoch": 0.9983441662753627, "grad_norm": 4.524135856727574, "learning_rate": 1.312448327051019e-12, "loss": 0.7496, "step": 10099 }, { "epoch": 0.9984430220201171, "grad_norm": 3.605315444377779, "learning_rate": 1.15351934300012e-12, "loss": 0.7303, "step": 10100 }, { "epoch": 0.9985418777648716, "grad_norm": 2.722257441947089, "learning_rate": 1.0048437655618692e-12, "loss": 0.6223, "step": 10101 }, { "epoch": 0.9986407335096261, "grad_norm": 89.37083624230522, "learning_rate": 8.664216099907306e-13, "loss": 0.6111, "step": 10102 }, { "epoch": 0.9987395892543806, "grad_norm": 4.616934768141887, "learning_rate": 7.382528904753549e-13, "loss": 0.7225, "step": 10103 }, { "epoch": 0.998838444999135, "grad_norm": 4.765481648396158, "learning_rate": 6.203376201607824e-13, "loss": 0.7538, "step": 10104 }, { "epoch": 0.9989373007438895, "grad_norm": 4.0134690215814715, "learning_rate": 5.126758111262397e-13, "loss": 0.7196, "step": 10105 }, { "epoch": 0.999036156488644, "grad_norm": 3.353449523485935, "learning_rate": 4.152674744295481e-13, "loss": 0.6381, "step": 10106 }, { "epoch": 0.9991350122333984, "grad_norm": 6.8539268312854364, "learning_rate": 3.2811262005161267e-13, "loss": 0.6561, "step": 10107 }, { "epoch": 0.9992338679781528, "grad_norm": 22.667845996164242, "learning_rate": 2.5121125692972867e-13, "loss": 0.5774, "step": 10108 }, { "epoch": 0.9993327237229074, "grad_norm": 5.17299561125041, "learning_rate": 1.8456339293537736e-13, "loss": 0.6462, "step": 10109 }, { "epoch": 0.9994315794676618, "grad_norm": 5.270354985489154, "learning_rate": 1.2816903491863484e-13, "loss": 0.7205, "step": 10110 }, { "epoch": 0.9995304352124162, "grad_norm": 5.855207465036062, "learning_rate": 8.202818865266082e-14, "loss": 0.5983, "step": 10111 }, { "epoch": 0.9996292909571708, "grad_norm": 4.619373120439512, "learning_rate": 4.614085887810759e-14, "loss": 0.6319, "step": 10112 }, { "epoch": 0.9997281467019252, "grad_norm": 4.850602544743707, "learning_rate": 2.0507049269813392e-14, "loss": 0.7138, "step": 10113 }, { "epoch": 0.9998270024466797, "grad_norm": 4.650663586725914, "learning_rate": 5.1267624479045536e-15, "loss": 0.7126, "step": 10114 }, { "epoch": 0.9999258581914342, "grad_norm": 5.069951464751154, "learning_rate": 0.0, "loss": 0.672, "step": 10115 }, { "epoch": 0.9999258581914342, "step": 10115, "total_flos": 4197411658629120.0, "train_loss": 0.7261561949301403, "train_runtime": 64868.5677, "train_samples_per_second": 19.96, "train_steps_per_second": 0.156 } ], "logging_steps": 1.0, "max_steps": 10115, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4197411658629120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }