diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4613 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2002136426064398, + "eval_steps": 500, + "global_step": 656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003052037234854265, + "grad_norm": 19.476922880741295, + "learning_rate": 1.0101010101010103e-07, + "loss": 1.1728, + "step": 1 + }, + { + "epoch": 0.000610407446970853, + "grad_norm": 29.879020388476594, + "learning_rate": 2.0202020202020205e-07, + "loss": 1.0955, + "step": 2 + }, + { + "epoch": 0.0009156111704562796, + "grad_norm": 24.931945947136526, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9541, + "step": 3 + }, + { + "epoch": 0.001220814893941706, + "grad_norm": 27.83214939667906, + "learning_rate": 4.040404040404041e-07, + "loss": 1.0735, + "step": 4 + }, + { + "epoch": 0.0015260186174271325, + "grad_norm": 21.219233961021736, + "learning_rate": 5.05050505050505e-07, + "loss": 1.0455, + "step": 5 + }, + { + "epoch": 0.0018312223409125592, + "grad_norm": 20.022707446211225, + "learning_rate": 6.060606060606061e-07, + "loss": 0.9675, + "step": 6 + }, + { + "epoch": 0.0021364260643979855, + "grad_norm": 26.532427830157193, + "learning_rate": 7.070707070707071e-07, + "loss": 1.1393, + "step": 7 + }, + { + "epoch": 0.002441629787883412, + "grad_norm": 27.89728780710031, + "learning_rate": 8.080808080808082e-07, + "loss": 1.0952, + "step": 8 + }, + { + "epoch": 0.0027468335113688385, + "grad_norm": 20.346264005570532, + "learning_rate": 9.090909090909091e-07, + "loss": 0.9626, + "step": 9 + }, + { + "epoch": 0.003052037234854265, + "grad_norm": 18.804489508720884, + "learning_rate": 1.01010101010101e-06, + "loss": 1.0255, + "step": 10 + }, + { + "epoch": 0.003357240958339692, + "grad_norm": 19.776534785573535, + "learning_rate": 1.111111111111111e-06, + "loss": 0.7399, + "step": 11 + }, + { + "epoch": 0.0036624446818251184, + "grad_norm": 21.16130386460154, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.5413, + "step": 12 + }, + { + "epoch": 0.0039676484053105445, + "grad_norm": 16.482713371526263, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5773, + "step": 13 + }, + { + "epoch": 0.004272852128795971, + "grad_norm": 10.780528168770594, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.6782, + "step": 14 + }, + { + "epoch": 0.0045780558522813975, + "grad_norm": 7.0900135030469915, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.9153, + "step": 15 + }, + { + "epoch": 0.004883259575766824, + "grad_norm": 8.490445320662754, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.4798, + "step": 16 + }, + { + "epoch": 0.0051884632992522505, + "grad_norm": 6.677142812986669, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.4782, + "step": 17 + }, + { + "epoch": 0.005493667022737677, + "grad_norm": 5.9204247946017485, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.3191, + "step": 18 + }, + { + "epoch": 0.0057988707462231035, + "grad_norm": 5.012462343754674, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.4115, + "step": 19 + }, + { + "epoch": 0.00610407446970853, + "grad_norm": 3.9095937836899113, + "learning_rate": 2.02020202020202e-06, + "loss": 0.6158, + "step": 20 + }, + { + "epoch": 0.006409278193193957, + "grad_norm": 4.438163815129716, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.7388, + "step": 21 + }, + { + "epoch": 0.006714481916679384, + "grad_norm": 3.62875198348435, + "learning_rate": 2.222222222222222e-06, + "loss": 0.2875, + "step": 22 + }, + { + "epoch": 0.00701968564016481, + "grad_norm": 4.963543929599541, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.4662, + "step": 23 + }, + { + "epoch": 0.007324889363650237, + "grad_norm": 4.274904100558248, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.5171, + "step": 24 + }, + { + "epoch": 0.007630093087135663, + "grad_norm": 2.670885047669819, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.4488, + "step": 25 + }, + { + "epoch": 0.007935296810621089, + "grad_norm": 2.6864388610994014, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.372, + "step": 26 + }, + { + "epoch": 0.008240500534106516, + "grad_norm": 3.804357369452407, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.2646, + "step": 27 + }, + { + "epoch": 0.008545704257591942, + "grad_norm": 4.059008227452532, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5907, + "step": 28 + }, + { + "epoch": 0.008850907981077369, + "grad_norm": 4.9062443629918855, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.2972, + "step": 29 + }, + { + "epoch": 0.009156111704562795, + "grad_norm": 3.5391495380267064, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.3821, + "step": 30 + }, + { + "epoch": 0.009461315428048222, + "grad_norm": 2.5896920322264854, + "learning_rate": 3.131313131313132e-06, + "loss": 0.4164, + "step": 31 + }, + { + "epoch": 0.009766519151533648, + "grad_norm": 3.0230775761822937, + "learning_rate": 3.232323232323233e-06, + "loss": 0.4237, + "step": 32 + }, + { + "epoch": 0.010071722875019075, + "grad_norm": 2.8417717057519423, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3353, + "step": 33 + }, + { + "epoch": 0.010376926598504501, + "grad_norm": 2.5789157463945878, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.3769, + "step": 34 + }, + { + "epoch": 0.010682130321989928, + "grad_norm": 2.5222241581850096, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.519, + "step": 35 + }, + { + "epoch": 0.010987334045475354, + "grad_norm": 2.8704682168269127, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.2829, + "step": 36 + }, + { + "epoch": 0.01129253776896078, + "grad_norm": 3.24684532820184, + "learning_rate": 3.737373737373738e-06, + "loss": 0.3586, + "step": 37 + }, + { + "epoch": 0.011597741492446207, + "grad_norm": 5.24792475783676, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.402, + "step": 38 + }, + { + "epoch": 0.011902945215931634, + "grad_norm": 3.111184671834165, + "learning_rate": 3.93939393939394e-06, + "loss": 0.466, + "step": 39 + }, + { + "epoch": 0.01220814893941706, + "grad_norm": 3.165565566985893, + "learning_rate": 4.04040404040404e-06, + "loss": 0.2678, + "step": 40 + }, + { + "epoch": 0.012513352662902488, + "grad_norm": 2.5486933296193257, + "learning_rate": 4.141414141414142e-06, + "loss": 0.5457, + "step": 41 + }, + { + "epoch": 0.012818556386387915, + "grad_norm": 3.4373721012250438, + "learning_rate": 4.242424242424243e-06, + "loss": 0.3862, + "step": 42 + }, + { + "epoch": 0.013123760109873341, + "grad_norm": 2.863317221380458, + "learning_rate": 4.343434343434344e-06, + "loss": 0.3601, + "step": 43 + }, + { + "epoch": 0.013428963833358768, + "grad_norm": 2.1041128573446035, + "learning_rate": 4.444444444444444e-06, + "loss": 0.3693, + "step": 44 + }, + { + "epoch": 0.013734167556844194, + "grad_norm": 2.286990324679626, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.2513, + "step": 45 + }, + { + "epoch": 0.01403937128032962, + "grad_norm": 8.793466778432636, + "learning_rate": 4.646464646464647e-06, + "loss": 0.4343, + "step": 46 + }, + { + "epoch": 0.014344575003815047, + "grad_norm": 1.8648737533834159, + "learning_rate": 4.747474747474748e-06, + "loss": 0.2631, + "step": 47 + }, + { + "epoch": 0.014649778727300474, + "grad_norm": 2.3081781364995324, + "learning_rate": 4.848484848484849e-06, + "loss": 0.2755, + "step": 48 + }, + { + "epoch": 0.0149549824507859, + "grad_norm": 2.284005369243557, + "learning_rate": 4.94949494949495e-06, + "loss": 0.4186, + "step": 49 + }, + { + "epoch": 0.015260186174271327, + "grad_norm": 2.6759709423238096, + "learning_rate": 5.0505050505050515e-06, + "loss": 0.6459, + "step": 50 + }, + { + "epoch": 0.015565389897756753, + "grad_norm": 2.8773749120652523, + "learning_rate": 5.151515151515152e-06, + "loss": 0.3324, + "step": 51 + }, + { + "epoch": 0.015870593621242178, + "grad_norm": 2.8060164424498786, + "learning_rate": 5.252525252525253e-06, + "loss": 0.3608, + "step": 52 + }, + { + "epoch": 0.016175797344727606, + "grad_norm": 2.3060494229726793, + "learning_rate": 5.353535353535354e-06, + "loss": 0.3818, + "step": 53 + }, + { + "epoch": 0.01648100106821303, + "grad_norm": 2.073464811557714, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.2667, + "step": 54 + }, + { + "epoch": 0.01678620479169846, + "grad_norm": 2.3474749655399245, + "learning_rate": 5.555555555555557e-06, + "loss": 0.35, + "step": 55 + }, + { + "epoch": 0.017091408515183884, + "grad_norm": 3.6988890036672086, + "learning_rate": 5.656565656565657e-06, + "loss": 0.284, + "step": 56 + }, + { + "epoch": 0.017396612238669312, + "grad_norm": 2.313501192849839, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.3308, + "step": 57 + }, + { + "epoch": 0.017701815962154737, + "grad_norm": 2.411936098122121, + "learning_rate": 5.858585858585859e-06, + "loss": 0.3982, + "step": 58 + }, + { + "epoch": 0.018007019685640165, + "grad_norm": 2.724660127775508, + "learning_rate": 5.95959595959596e-06, + "loss": 0.3587, + "step": 59 + }, + { + "epoch": 0.01831222340912559, + "grad_norm": 3.130895013540925, + "learning_rate": 6.060606060606061e-06, + "loss": 0.3427, + "step": 60 + }, + { + "epoch": 0.01861742713261102, + "grad_norm": 3.4261489723004614, + "learning_rate": 6.1616161616161615e-06, + "loss": 0.4578, + "step": 61 + }, + { + "epoch": 0.018922630856096443, + "grad_norm": 2.413871881063889, + "learning_rate": 6.262626262626264e-06, + "loss": 0.2067, + "step": 62 + }, + { + "epoch": 0.01922783457958187, + "grad_norm": 2.0941348505038366, + "learning_rate": 6.363636363636364e-06, + "loss": 0.27, + "step": 63 + }, + { + "epoch": 0.019533038303067296, + "grad_norm": 2.2153240133926153, + "learning_rate": 6.464646464646466e-06, + "loss": 0.3298, + "step": 64 + }, + { + "epoch": 0.019838242026552724, + "grad_norm": 2.422022070572305, + "learning_rate": 6.565656565656566e-06, + "loss": 0.4894, + "step": 65 + }, + { + "epoch": 0.02014344575003815, + "grad_norm": 2.45442660843552, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3684, + "step": 66 + }, + { + "epoch": 0.020448649473523577, + "grad_norm": 3.5398238081108304, + "learning_rate": 6.767676767676769e-06, + "loss": 0.4233, + "step": 67 + }, + { + "epoch": 0.020753853197009002, + "grad_norm": 2.530397719080883, + "learning_rate": 6.868686868686869e-06, + "loss": 0.2676, + "step": 68 + }, + { + "epoch": 0.02105905692049443, + "grad_norm": 2.259346305696615, + "learning_rate": 6.969696969696971e-06, + "loss": 0.4409, + "step": 69 + }, + { + "epoch": 0.021364260643979855, + "grad_norm": 2.3339543424453764, + "learning_rate": 7.070707070707071e-06, + "loss": 0.3882, + "step": 70 + }, + { + "epoch": 0.021669464367465283, + "grad_norm": 2.348843038116063, + "learning_rate": 7.171717171717172e-06, + "loss": 0.3904, + "step": 71 + }, + { + "epoch": 0.021974668090950708, + "grad_norm": 2.7011363922899965, + "learning_rate": 7.272727272727273e-06, + "loss": 0.3586, + "step": 72 + }, + { + "epoch": 0.022279871814436136, + "grad_norm": 2.6923381814173486, + "learning_rate": 7.373737373737374e-06, + "loss": 0.4331, + "step": 73 + }, + { + "epoch": 0.02258507553792156, + "grad_norm": 2.0435337430530924, + "learning_rate": 7.474747474747476e-06, + "loss": 0.2739, + "step": 74 + }, + { + "epoch": 0.02289027926140699, + "grad_norm": 2.257183264462076, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.4554, + "step": 75 + }, + { + "epoch": 0.023195482984892414, + "grad_norm": 2.5384248372961626, + "learning_rate": 7.676767676767677e-06, + "loss": 0.4934, + "step": 76 + }, + { + "epoch": 0.023500686708377842, + "grad_norm": 2.1578730127908488, + "learning_rate": 7.77777777777778e-06, + "loss": 0.3519, + "step": 77 + }, + { + "epoch": 0.023805890431863267, + "grad_norm": 2.1316764516757476, + "learning_rate": 7.87878787878788e-06, + "loss": 0.3268, + "step": 78 + }, + { + "epoch": 0.024111094155348695, + "grad_norm": 2.095996278024237, + "learning_rate": 7.97979797979798e-06, + "loss": 0.3318, + "step": 79 + }, + { + "epoch": 0.02441629787883412, + "grad_norm": 1.9985574049541877, + "learning_rate": 8.08080808080808e-06, + "loss": 0.1852, + "step": 80 + }, + { + "epoch": 0.02472150160231955, + "grad_norm": 1.7092921737326583, + "learning_rate": 8.181818181818183e-06, + "loss": 0.2412, + "step": 81 + }, + { + "epoch": 0.025026705325804977, + "grad_norm": 1.9609482601524066, + "learning_rate": 8.282828282828283e-06, + "loss": 0.3349, + "step": 82 + }, + { + "epoch": 0.0253319090492904, + "grad_norm": 2.5619254980161412, + "learning_rate": 8.383838383838384e-06, + "loss": 0.3327, + "step": 83 + }, + { + "epoch": 0.02563711277277583, + "grad_norm": 2.1734116421771827, + "learning_rate": 8.484848484848486e-06, + "loss": 0.5005, + "step": 84 + }, + { + "epoch": 0.025942316496261254, + "grad_norm": 2.4612836321871785, + "learning_rate": 8.585858585858587e-06, + "loss": 0.5919, + "step": 85 + }, + { + "epoch": 0.026247520219746683, + "grad_norm": 2.050264187978962, + "learning_rate": 8.686868686868687e-06, + "loss": 0.2654, + "step": 86 + }, + { + "epoch": 0.026552723943232107, + "grad_norm": 1.7466792206761999, + "learning_rate": 8.787878787878788e-06, + "loss": 0.2875, + "step": 87 + }, + { + "epoch": 0.026857927666717536, + "grad_norm": 1.9114055019911376, + "learning_rate": 8.888888888888888e-06, + "loss": 0.3317, + "step": 88 + }, + { + "epoch": 0.02716313139020296, + "grad_norm": 2.136028617695754, + "learning_rate": 8.98989898989899e-06, + "loss": 0.4322, + "step": 89 + }, + { + "epoch": 0.02746833511368839, + "grad_norm": 2.0559196693817303, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3372, + "step": 90 + }, + { + "epoch": 0.027773538837173813, + "grad_norm": 1.6053810559753854, + "learning_rate": 9.191919191919193e-06, + "loss": 0.2833, + "step": 91 + }, + { + "epoch": 0.02807874256065924, + "grad_norm": 1.9190338968500587, + "learning_rate": 9.292929292929294e-06, + "loss": 0.2358, + "step": 92 + }, + { + "epoch": 0.028383946284144666, + "grad_norm": 1.7424429804531956, + "learning_rate": 9.393939393939396e-06, + "loss": 0.2805, + "step": 93 + }, + { + "epoch": 0.028689150007630095, + "grad_norm": 1.5616301594921251, + "learning_rate": 9.494949494949497e-06, + "loss": 0.326, + "step": 94 + }, + { + "epoch": 0.02899435373111552, + "grad_norm": 2.6517363851490297, + "learning_rate": 9.595959595959597e-06, + "loss": 0.5839, + "step": 95 + }, + { + "epoch": 0.029299557454600948, + "grad_norm": 1.9068377479857994, + "learning_rate": 9.696969696969698e-06, + "loss": 0.4213, + "step": 96 + }, + { + "epoch": 0.029604761178086372, + "grad_norm": 2.147263972819766, + "learning_rate": 9.797979797979798e-06, + "loss": 0.3776, + "step": 97 + }, + { + "epoch": 0.0299099649015718, + "grad_norm": 2.3466004395170685, + "learning_rate": 9.8989898989899e-06, + "loss": 0.4828, + "step": 98 + }, + { + "epoch": 0.030215168625057225, + "grad_norm": 1.9328188798162316, + "learning_rate": 1e-05, + "loss": 0.3816, + "step": 99 + }, + { + "epoch": 0.030520372348542654, + "grad_norm": 2.120656679761712, + "learning_rate": 9.999997555414177e-06, + "loss": 0.287, + "step": 100 + }, + { + "epoch": 0.03082557607202808, + "grad_norm": 1.8272767014289886, + "learning_rate": 9.999990221659095e-06, + "loss": 0.2529, + "step": 101 + }, + { + "epoch": 0.031130779795513507, + "grad_norm": 2.108876035097533, + "learning_rate": 9.999977998741925e-06, + "loss": 0.4, + "step": 102 + }, + { + "epoch": 0.031435983518998935, + "grad_norm": 2.611227326027621, + "learning_rate": 9.999960886674623e-06, + "loss": 0.5577, + "step": 103 + }, + { + "epoch": 0.031741187242484356, + "grad_norm": 2.012760226088087, + "learning_rate": 9.999938885473916e-06, + "loss": 0.2397, + "step": 104 + }, + { + "epoch": 0.032046390965969784, + "grad_norm": 3.4069313977643088, + "learning_rate": 9.999911995161323e-06, + "loss": 0.3074, + "step": 105 + }, + { + "epoch": 0.03235159468945521, + "grad_norm": 1.5281487804348939, + "learning_rate": 9.999880215763133e-06, + "loss": 0.306, + "step": 106 + }, + { + "epoch": 0.03265679841294064, + "grad_norm": 1.5733903167529437, + "learning_rate": 9.999843547310427e-06, + "loss": 0.3123, + "step": 107 + }, + { + "epoch": 0.03296200213642606, + "grad_norm": 2.2084260837102776, + "learning_rate": 9.999801989839055e-06, + "loss": 0.2686, + "step": 108 + }, + { + "epoch": 0.03326720585991149, + "grad_norm": 2.0235527329790477, + "learning_rate": 9.999755543389658e-06, + "loss": 0.362, + "step": 109 + }, + { + "epoch": 0.03357240958339692, + "grad_norm": 1.4126246608311444, + "learning_rate": 9.999704208007647e-06, + "loss": 0.1868, + "step": 110 + }, + { + "epoch": 0.03387761330688235, + "grad_norm": 1.9363750145032863, + "learning_rate": 9.999647983743227e-06, + "loss": 0.4674, + "step": 111 + }, + { + "epoch": 0.03418281703036777, + "grad_norm": 2.306492812857686, + "learning_rate": 9.999586870651372e-06, + "loss": 0.7454, + "step": 112 + }, + { + "epoch": 0.034488020753853196, + "grad_norm": 1.9927578577114744, + "learning_rate": 9.999520868791839e-06, + "loss": 0.2964, + "step": 113 + }, + { + "epoch": 0.034793224477338625, + "grad_norm": 2.897230200199283, + "learning_rate": 9.99944997822917e-06, + "loss": 0.3507, + "step": 114 + }, + { + "epoch": 0.03509842820082405, + "grad_norm": 1.7040567211820554, + "learning_rate": 9.999374199032682e-06, + "loss": 0.358, + "step": 115 + }, + { + "epoch": 0.035403631924309474, + "grad_norm": 1.7684725864001616, + "learning_rate": 9.999293531276475e-06, + "loss": 0.469, + "step": 116 + }, + { + "epoch": 0.0357088356477949, + "grad_norm": 2.151331613378997, + "learning_rate": 9.999207975039429e-06, + "loss": 0.4007, + "step": 117 + }, + { + "epoch": 0.03601403937128033, + "grad_norm": 2.1827006415812678, + "learning_rate": 9.999117530405205e-06, + "loss": 0.373, + "step": 118 + }, + { + "epoch": 0.03631924309476576, + "grad_norm": 2.0424756244526283, + "learning_rate": 9.99902219746224e-06, + "loss": 0.4664, + "step": 119 + }, + { + "epoch": 0.03662444681825118, + "grad_norm": 2.4438750213097014, + "learning_rate": 9.998921976303757e-06, + "loss": 0.5884, + "step": 120 + }, + { + "epoch": 0.03692965054173661, + "grad_norm": 1.6168805259489245, + "learning_rate": 9.998816867027753e-06, + "loss": 0.3874, + "step": 121 + }, + { + "epoch": 0.03723485426522204, + "grad_norm": 2.4836564854380914, + "learning_rate": 9.99870686973701e-06, + "loss": 0.3865, + "step": 122 + }, + { + "epoch": 0.037540057988707465, + "grad_norm": 2.187549263535683, + "learning_rate": 9.998591984539085e-06, + "loss": 0.4419, + "step": 123 + }, + { + "epoch": 0.037845261712192886, + "grad_norm": 2.3145724108896366, + "learning_rate": 9.998472211546317e-06, + "loss": 0.5048, + "step": 124 + }, + { + "epoch": 0.038150465435678314, + "grad_norm": 2.6043824271784377, + "learning_rate": 9.998347550875825e-06, + "loss": 0.4323, + "step": 125 + }, + { + "epoch": 0.03845566915916374, + "grad_norm": 1.7266964407358079, + "learning_rate": 9.998218002649507e-06, + "loss": 0.3093, + "step": 126 + }, + { + "epoch": 0.03876087288264917, + "grad_norm": 2.3091863655820397, + "learning_rate": 9.99808356699404e-06, + "loss": 0.5394, + "step": 127 + }, + { + "epoch": 0.03906607660613459, + "grad_norm": 2.178584103245907, + "learning_rate": 9.997944244040877e-06, + "loss": 0.562, + "step": 128 + }, + { + "epoch": 0.03937128032962002, + "grad_norm": 1.4762803065381216, + "learning_rate": 9.997800033926252e-06, + "loss": 0.3012, + "step": 129 + }, + { + "epoch": 0.03967648405310545, + "grad_norm": 1.6768704233807339, + "learning_rate": 9.997650936791183e-06, + "loss": 0.3314, + "step": 130 + }, + { + "epoch": 0.03998168777659088, + "grad_norm": 1.8423584681568375, + "learning_rate": 9.997496952781461e-06, + "loss": 0.5373, + "step": 131 + }, + { + "epoch": 0.0402868915000763, + "grad_norm": 1.4926628434179245, + "learning_rate": 9.997338082047656e-06, + "loss": 0.1992, + "step": 132 + }, + { + "epoch": 0.040592095223561726, + "grad_norm": 1.6323074947028773, + "learning_rate": 9.997174324745117e-06, + "loss": 0.4872, + "step": 133 + }, + { + "epoch": 0.040897298947047155, + "grad_norm": 2.159688005520465, + "learning_rate": 9.997005681033973e-06, + "loss": 0.5076, + "step": 134 + }, + { + "epoch": 0.04120250267053258, + "grad_norm": 2.207163038792008, + "learning_rate": 9.996832151079127e-06, + "loss": 0.2677, + "step": 135 + }, + { + "epoch": 0.041507706394018004, + "grad_norm": 1.3990677420334965, + "learning_rate": 9.996653735050265e-06, + "loss": 0.2526, + "step": 136 + }, + { + "epoch": 0.04181291011750343, + "grad_norm": 1.7368886105229604, + "learning_rate": 9.996470433121847e-06, + "loss": 0.2874, + "step": 137 + }, + { + "epoch": 0.04211811384098886, + "grad_norm": 1.8138446424045762, + "learning_rate": 9.996282245473113e-06, + "loss": 0.2986, + "step": 138 + }, + { + "epoch": 0.04242331756447429, + "grad_norm": 1.8564789601928355, + "learning_rate": 9.996089172288078e-06, + "loss": 0.3954, + "step": 139 + }, + { + "epoch": 0.04272852128795971, + "grad_norm": 1.9085920361180522, + "learning_rate": 9.995891213755536e-06, + "loss": 0.2739, + "step": 140 + }, + { + "epoch": 0.04303372501144514, + "grad_norm": 1.8924678931794556, + "learning_rate": 9.99568837006906e-06, + "loss": 0.2766, + "step": 141 + }, + { + "epoch": 0.04333892873493057, + "grad_norm": 1.8418836037208652, + "learning_rate": 9.995480641426992e-06, + "loss": 0.488, + "step": 142 + }, + { + "epoch": 0.043644132458415995, + "grad_norm": 1.6305125707231247, + "learning_rate": 9.99526802803246e-06, + "loss": 0.3045, + "step": 143 + }, + { + "epoch": 0.043949336181901416, + "grad_norm": 2.143051665423358, + "learning_rate": 9.995050530093366e-06, + "loss": 0.3567, + "step": 144 + }, + { + "epoch": 0.044254539905386844, + "grad_norm": 1.994194545633334, + "learning_rate": 9.994828147822387e-06, + "loss": 0.3655, + "step": 145 + }, + { + "epoch": 0.04455974362887227, + "grad_norm": 1.8553346605537173, + "learning_rate": 9.994600881436972e-06, + "loss": 0.3249, + "step": 146 + }, + { + "epoch": 0.0448649473523577, + "grad_norm": 2.1613773805709857, + "learning_rate": 9.994368731159351e-06, + "loss": 0.4863, + "step": 147 + }, + { + "epoch": 0.04517015107584312, + "grad_norm": 2.199571706523493, + "learning_rate": 9.99413169721653e-06, + "loss": 0.465, + "step": 148 + }, + { + "epoch": 0.04547535479932855, + "grad_norm": 1.681707967900651, + "learning_rate": 9.99388977984029e-06, + "loss": 0.3472, + "step": 149 + }, + { + "epoch": 0.04578055852281398, + "grad_norm": 1.6586587053140593, + "learning_rate": 9.993642979267184e-06, + "loss": 0.3626, + "step": 150 + }, + { + "epoch": 0.04608576224629941, + "grad_norm": 2.12592721793332, + "learning_rate": 9.993391295738542e-06, + "loss": 0.3218, + "step": 151 + }, + { + "epoch": 0.04639096596978483, + "grad_norm": 1.6765944279655143, + "learning_rate": 9.99313472950047e-06, + "loss": 0.3402, + "step": 152 + }, + { + "epoch": 0.046696169693270256, + "grad_norm": 1.6019038139070678, + "learning_rate": 9.992873280803848e-06, + "loss": 0.4554, + "step": 153 + }, + { + "epoch": 0.047001373416755685, + "grad_norm": 1.6429860881882794, + "learning_rate": 9.99260694990433e-06, + "loss": 0.4086, + "step": 154 + }, + { + "epoch": 0.04730657714024111, + "grad_norm": 1.98592334325083, + "learning_rate": 9.992335737062338e-06, + "loss": 0.5733, + "step": 155 + }, + { + "epoch": 0.047611780863726534, + "grad_norm": 1.5624846648417388, + "learning_rate": 9.992059642543076e-06, + "loss": 0.2524, + "step": 156 + }, + { + "epoch": 0.04791698458721196, + "grad_norm": 1.4438198320418865, + "learning_rate": 9.991778666616523e-06, + "loss": 0.1756, + "step": 157 + }, + { + "epoch": 0.04822218831069739, + "grad_norm": 1.6284817295660008, + "learning_rate": 9.991492809557424e-06, + "loss": 0.4144, + "step": 158 + }, + { + "epoch": 0.04852739203418282, + "grad_norm": 1.2236340789910145, + "learning_rate": 9.991202071645298e-06, + "loss": 0.1664, + "step": 159 + }, + { + "epoch": 0.04883259575766824, + "grad_norm": 1.4874398163232816, + "learning_rate": 9.99090645316444e-06, + "loss": 0.3323, + "step": 160 + }, + { + "epoch": 0.04913779948115367, + "grad_norm": 2.5394515927833403, + "learning_rate": 9.990605954403917e-06, + "loss": 0.27, + "step": 161 + }, + { + "epoch": 0.0494430032046391, + "grad_norm": 1.7966332314422868, + "learning_rate": 9.990300575657565e-06, + "loss": 0.4453, + "step": 162 + }, + { + "epoch": 0.049748206928124525, + "grad_norm": 1.825976682624809, + "learning_rate": 9.989990317223995e-06, + "loss": 0.2646, + "step": 163 + }, + { + "epoch": 0.05005341065160995, + "grad_norm": 1.6554541925183588, + "learning_rate": 9.989675179406588e-06, + "loss": 0.445, + "step": 164 + }, + { + "epoch": 0.050358614375095374, + "grad_norm": 1.6711133844293076, + "learning_rate": 9.989355162513496e-06, + "loss": 0.3685, + "step": 165 + }, + { + "epoch": 0.0506638180985808, + "grad_norm": 1.8033315345252203, + "learning_rate": 9.989030266857644e-06, + "loss": 0.2566, + "step": 166 + }, + { + "epoch": 0.05096902182206623, + "grad_norm": 1.6879852444966537, + "learning_rate": 9.988700492756726e-06, + "loss": 0.4086, + "step": 167 + }, + { + "epoch": 0.05127422554555166, + "grad_norm": 1.6855038740169574, + "learning_rate": 9.988365840533204e-06, + "loss": 0.3081, + "step": 168 + }, + { + "epoch": 0.05157942926903708, + "grad_norm": 2.245121010490438, + "learning_rate": 9.988026310514316e-06, + "loss": 0.5646, + "step": 169 + }, + { + "epoch": 0.05188463299252251, + "grad_norm": 1.531117336209479, + "learning_rate": 9.987681903032065e-06, + "loss": 0.3598, + "step": 170 + }, + { + "epoch": 0.05218983671600794, + "grad_norm": 1.4368727600956301, + "learning_rate": 9.987332618423221e-06, + "loss": 0.3864, + "step": 171 + }, + { + "epoch": 0.052495040439493365, + "grad_norm": 2.039026486601271, + "learning_rate": 9.98697845702933e-06, + "loss": 0.2728, + "step": 172 + }, + { + "epoch": 0.052800244162978786, + "grad_norm": 1.5481974795842472, + "learning_rate": 9.986619419196704e-06, + "loss": 0.2376, + "step": 173 + }, + { + "epoch": 0.053105447886464215, + "grad_norm": 1.583025735121783, + "learning_rate": 9.986255505276418e-06, + "loss": 0.3941, + "step": 174 + }, + { + "epoch": 0.05341065160994964, + "grad_norm": 2.025610033619695, + "learning_rate": 9.985886715624326e-06, + "loss": 0.432, + "step": 175 + }, + { + "epoch": 0.05371585533343507, + "grad_norm": 1.9370365819159912, + "learning_rate": 9.985513050601037e-06, + "loss": 0.3311, + "step": 176 + }, + { + "epoch": 0.05402105905692049, + "grad_norm": 1.534591376747653, + "learning_rate": 9.985134510571936e-06, + "loss": 0.3804, + "step": 177 + }, + { + "epoch": 0.05432626278040592, + "grad_norm": 1.5627980520171343, + "learning_rate": 9.984751095907175e-06, + "loss": 0.3991, + "step": 178 + }, + { + "epoch": 0.05463146650389135, + "grad_norm": 1.858760828475349, + "learning_rate": 9.984362806981665e-06, + "loss": 0.4124, + "step": 179 + }, + { + "epoch": 0.05493667022737678, + "grad_norm": 1.4922057145689682, + "learning_rate": 9.983969644175092e-06, + "loss": 0.2571, + "step": 180 + }, + { + "epoch": 0.0552418739508622, + "grad_norm": 1.4358215484460224, + "learning_rate": 9.983571607871903e-06, + "loss": 0.3351, + "step": 181 + }, + { + "epoch": 0.05554707767434763, + "grad_norm": 1.7105120125454414, + "learning_rate": 9.983168698461312e-06, + "loss": 0.4374, + "step": 182 + }, + { + "epoch": 0.055852281397833055, + "grad_norm": 1.4100459259074987, + "learning_rate": 9.982760916337296e-06, + "loss": 0.3958, + "step": 183 + }, + { + "epoch": 0.05615748512131848, + "grad_norm": 1.667173817085955, + "learning_rate": 9.982348261898598e-06, + "loss": 0.2867, + "step": 184 + }, + { + "epoch": 0.056462688844803904, + "grad_norm": 1.8278737995984025, + "learning_rate": 9.981930735548731e-06, + "loss": 0.3738, + "step": 185 + }, + { + "epoch": 0.05676789256828933, + "grad_norm": 1.806852289121097, + "learning_rate": 9.98150833769596e-06, + "loss": 0.5608, + "step": 186 + }, + { + "epoch": 0.05707309629177476, + "grad_norm": 1.6986308867720055, + "learning_rate": 9.981081068753324e-06, + "loss": 0.4253, + "step": 187 + }, + { + "epoch": 0.05737830001526019, + "grad_norm": 1.6392088091109513, + "learning_rate": 9.98064892913862e-06, + "loss": 0.2444, + "step": 188 + }, + { + "epoch": 0.05768350373874561, + "grad_norm": 1.7762995408711126, + "learning_rate": 9.980211919274407e-06, + "loss": 0.3866, + "step": 189 + }, + { + "epoch": 0.05798870746223104, + "grad_norm": 1.7144647062044762, + "learning_rate": 9.979770039588013e-06, + "loss": 0.4504, + "step": 190 + }, + { + "epoch": 0.05829391118571647, + "grad_norm": 1.9069269572943617, + "learning_rate": 9.979323290511517e-06, + "loss": 0.4972, + "step": 191 + }, + { + "epoch": 0.058599114909201895, + "grad_norm": 1.831943664409223, + "learning_rate": 9.978871672481774e-06, + "loss": 0.3884, + "step": 192 + }, + { + "epoch": 0.058904318632687316, + "grad_norm": 1.60483584957947, + "learning_rate": 9.978415185940383e-06, + "loss": 0.3366, + "step": 193 + }, + { + "epoch": 0.059209522356172745, + "grad_norm": 2.041633475935638, + "learning_rate": 9.977953831333718e-06, + "loss": 0.4928, + "step": 194 + }, + { + "epoch": 0.05951472607965817, + "grad_norm": 2.1574861604284243, + "learning_rate": 9.977487609112904e-06, + "loss": 0.7092, + "step": 195 + }, + { + "epoch": 0.0598199298031436, + "grad_norm": 1.5382345073334531, + "learning_rate": 9.97701651973383e-06, + "loss": 0.2236, + "step": 196 + }, + { + "epoch": 0.06012513352662902, + "grad_norm": 2.1479787995768014, + "learning_rate": 9.976540563657143e-06, + "loss": 0.5182, + "step": 197 + }, + { + "epoch": 0.06043033725011445, + "grad_norm": 1.8579437774142544, + "learning_rate": 9.976059741348252e-06, + "loss": 0.3093, + "step": 198 + }, + { + "epoch": 0.06073554097359988, + "grad_norm": 1.5409701380525285, + "learning_rate": 9.975574053277317e-06, + "loss": 0.2877, + "step": 199 + }, + { + "epoch": 0.06104074469708531, + "grad_norm": 1.5474598097011698, + "learning_rate": 9.975083499919264e-06, + "loss": 0.2981, + "step": 200 + }, + { + "epoch": 0.06134594842057073, + "grad_norm": 1.9202152932180157, + "learning_rate": 9.974588081753773e-06, + "loss": 0.5369, + "step": 201 + }, + { + "epoch": 0.06165115214405616, + "grad_norm": 1.4598442515817716, + "learning_rate": 9.974087799265279e-06, + "loss": 0.3696, + "step": 202 + }, + { + "epoch": 0.061956355867541585, + "grad_norm": 1.48078814360119, + "learning_rate": 9.973582652942975e-06, + "loss": 0.284, + "step": 203 + }, + { + "epoch": 0.06226155959102701, + "grad_norm": 2.100326004155181, + "learning_rate": 9.973072643280813e-06, + "loss": 0.5681, + "step": 204 + }, + { + "epoch": 0.06256676331451244, + "grad_norm": 1.976128330719915, + "learning_rate": 9.972557770777496e-06, + "loss": 0.3655, + "step": 205 + }, + { + "epoch": 0.06287196703799787, + "grad_norm": 1.2103730393566896, + "learning_rate": 9.972038035936483e-06, + "loss": 0.2471, + "step": 206 + }, + { + "epoch": 0.06317717076148328, + "grad_norm": 1.670449906238349, + "learning_rate": 9.971513439265992e-06, + "loss": 0.2184, + "step": 207 + }, + { + "epoch": 0.06348237448496871, + "grad_norm": 1.5020544764497652, + "learning_rate": 9.970983981278989e-06, + "loss": 0.3196, + "step": 208 + }, + { + "epoch": 0.06378757820845414, + "grad_norm": 1.7833251911345853, + "learning_rate": 9.970449662493195e-06, + "loss": 0.4122, + "step": 209 + }, + { + "epoch": 0.06409278193193957, + "grad_norm": 1.4149595334362772, + "learning_rate": 9.96991048343109e-06, + "loss": 0.2947, + "step": 210 + }, + { + "epoch": 0.064397985655425, + "grad_norm": 1.5991867680932033, + "learning_rate": 9.969366444619898e-06, + "loss": 0.1902, + "step": 211 + }, + { + "epoch": 0.06470318937891043, + "grad_norm": 1.4132064841734169, + "learning_rate": 9.968817546591601e-06, + "loss": 0.3389, + "step": 212 + }, + { + "epoch": 0.06500839310239585, + "grad_norm": 1.7671902900221814, + "learning_rate": 9.968263789882926e-06, + "loss": 0.4294, + "step": 213 + }, + { + "epoch": 0.06531359682588128, + "grad_norm": 1.5709821497329826, + "learning_rate": 9.96770517503536e-06, + "loss": 0.2765, + "step": 214 + }, + { + "epoch": 0.0656188005493667, + "grad_norm": 1.5211731343844295, + "learning_rate": 9.967141702595134e-06, + "loss": 0.387, + "step": 215 + }, + { + "epoch": 0.06592400427285212, + "grad_norm": 1.5499265222668686, + "learning_rate": 9.96657337311323e-06, + "loss": 0.4535, + "step": 216 + }, + { + "epoch": 0.06622920799633755, + "grad_norm": 1.4736546539447488, + "learning_rate": 9.966000187145383e-06, + "loss": 0.3834, + "step": 217 + }, + { + "epoch": 0.06653441171982298, + "grad_norm": 1.3306288958233108, + "learning_rate": 9.965422145252072e-06, + "loss": 0.3172, + "step": 218 + }, + { + "epoch": 0.06683961544330841, + "grad_norm": 1.5745937005003143, + "learning_rate": 9.964839247998524e-06, + "loss": 0.2725, + "step": 219 + }, + { + "epoch": 0.06714481916679384, + "grad_norm": 1.7546511557153388, + "learning_rate": 9.96425149595472e-06, + "loss": 0.3577, + "step": 220 + }, + { + "epoch": 0.06745002289027927, + "grad_norm": 2.0422588449754286, + "learning_rate": 9.96365888969538e-06, + "loss": 0.4976, + "step": 221 + }, + { + "epoch": 0.0677552266137647, + "grad_norm": 1.4661824124133862, + "learning_rate": 9.963061429799979e-06, + "loss": 0.3672, + "step": 222 + }, + { + "epoch": 0.06806043033725011, + "grad_norm": 2.0959067552369666, + "learning_rate": 9.96245911685273e-06, + "loss": 0.5381, + "step": 223 + }, + { + "epoch": 0.06836563406073554, + "grad_norm": 1.3296813372997014, + "learning_rate": 9.961851951442599e-06, + "loss": 0.2799, + "step": 224 + }, + { + "epoch": 0.06867083778422096, + "grad_norm": 1.7385807765114274, + "learning_rate": 9.96123993416329e-06, + "loss": 0.5183, + "step": 225 + }, + { + "epoch": 0.06897604150770639, + "grad_norm": 1.5190119701865645, + "learning_rate": 9.960623065613254e-06, + "loss": 0.4608, + "step": 226 + }, + { + "epoch": 0.06928124523119182, + "grad_norm": 1.4393894383331207, + "learning_rate": 9.96000134639569e-06, + "loss": 0.3455, + "step": 227 + }, + { + "epoch": 0.06958644895467725, + "grad_norm": 1.7132863682619555, + "learning_rate": 9.959374777118533e-06, + "loss": 0.316, + "step": 228 + }, + { + "epoch": 0.06989165267816268, + "grad_norm": 1.3227120889592454, + "learning_rate": 9.958743358394464e-06, + "loss": 0.2467, + "step": 229 + }, + { + "epoch": 0.0701968564016481, + "grad_norm": 1.5331153407144422, + "learning_rate": 9.95810709084091e-06, + "loss": 0.3138, + "step": 230 + }, + { + "epoch": 0.07050206012513352, + "grad_norm": 1.7990748995190806, + "learning_rate": 9.957465975080031e-06, + "loss": 0.4747, + "step": 231 + }, + { + "epoch": 0.07080726384861895, + "grad_norm": 1.1638981235859056, + "learning_rate": 9.956820011738736e-06, + "loss": 0.2265, + "step": 232 + }, + { + "epoch": 0.07111246757210438, + "grad_norm": 1.5739388418179414, + "learning_rate": 9.956169201448665e-06, + "loss": 0.5066, + "step": 233 + }, + { + "epoch": 0.0714176712955898, + "grad_norm": 1.6803933013620869, + "learning_rate": 9.955513544846205e-06, + "loss": 0.4415, + "step": 234 + }, + { + "epoch": 0.07172287501907523, + "grad_norm": 1.4014872110785643, + "learning_rate": 9.954853042572479e-06, + "loss": 0.3271, + "step": 235 + }, + { + "epoch": 0.07202807874256066, + "grad_norm": 1.5310222689941932, + "learning_rate": 9.954187695273352e-06, + "loss": 0.3289, + "step": 236 + }, + { + "epoch": 0.07233328246604609, + "grad_norm": 2.166268226472017, + "learning_rate": 9.953517503599419e-06, + "loss": 0.622, + "step": 237 + }, + { + "epoch": 0.07263848618953152, + "grad_norm": 2.258081862277545, + "learning_rate": 9.952842468206019e-06, + "loss": 0.5071, + "step": 238 + }, + { + "epoch": 0.07294368991301693, + "grad_norm": 1.7322119894263104, + "learning_rate": 9.952162589753224e-06, + "loss": 0.5097, + "step": 239 + }, + { + "epoch": 0.07324889363650236, + "grad_norm": 1.9966284228033864, + "learning_rate": 9.951477868905843e-06, + "loss": 0.2263, + "step": 240 + }, + { + "epoch": 0.07355409735998779, + "grad_norm": 1.6793267860774614, + "learning_rate": 9.95078830633342e-06, + "loss": 0.2065, + "step": 241 + }, + { + "epoch": 0.07385930108347322, + "grad_norm": 2.122564153881175, + "learning_rate": 9.95009390271023e-06, + "loss": 0.2665, + "step": 242 + }, + { + "epoch": 0.07416450480695864, + "grad_norm": 1.5852282963187305, + "learning_rate": 9.949394658715289e-06, + "loss": 0.4453, + "step": 243 + }, + { + "epoch": 0.07446970853044407, + "grad_norm": 1.7534712016120517, + "learning_rate": 9.948690575032338e-06, + "loss": 0.3628, + "step": 244 + }, + { + "epoch": 0.0747749122539295, + "grad_norm": 1.351810586905304, + "learning_rate": 9.947981652349854e-06, + "loss": 0.3984, + "step": 245 + }, + { + "epoch": 0.07508011597741493, + "grad_norm": 1.8377506474408298, + "learning_rate": 9.947267891361051e-06, + "loss": 0.3677, + "step": 246 + }, + { + "epoch": 0.07538531970090036, + "grad_norm": 1.4655632998364951, + "learning_rate": 9.946549292763865e-06, + "loss": 0.3516, + "step": 247 + }, + { + "epoch": 0.07569052342438577, + "grad_norm": 3.240838121636416, + "learning_rate": 9.945825857260967e-06, + "loss": 0.2627, + "step": 248 + }, + { + "epoch": 0.0759957271478712, + "grad_norm": 1.4085823215183912, + "learning_rate": 9.945097585559757e-06, + "loss": 0.2716, + "step": 249 + }, + { + "epoch": 0.07630093087135663, + "grad_norm": 1.6361471921651585, + "learning_rate": 9.944364478372364e-06, + "loss": 0.3595, + "step": 250 + }, + { + "epoch": 0.07660613459484206, + "grad_norm": 1.0912978886499554, + "learning_rate": 9.943626536415647e-06, + "loss": 0.1968, + "step": 251 + }, + { + "epoch": 0.07691133831832749, + "grad_norm": 1.9515717700893849, + "learning_rate": 9.942883760411188e-06, + "loss": 0.374, + "step": 252 + }, + { + "epoch": 0.07721654204181291, + "grad_norm": 1.5560755068838334, + "learning_rate": 9.942136151085302e-06, + "loss": 0.44, + "step": 253 + }, + { + "epoch": 0.07752174576529834, + "grad_norm": 1.4843235207715992, + "learning_rate": 9.941383709169024e-06, + "loss": 0.3175, + "step": 254 + }, + { + "epoch": 0.07782694948878377, + "grad_norm": 1.5210960196158274, + "learning_rate": 9.94062643539812e-06, + "loss": 0.3722, + "step": 255 + }, + { + "epoch": 0.07813215321226918, + "grad_norm": 1.6656094376801425, + "learning_rate": 9.939864330513079e-06, + "loss": 0.3511, + "step": 256 + }, + { + "epoch": 0.07843735693575461, + "grad_norm": 1.2732857455769802, + "learning_rate": 9.939097395259108e-06, + "loss": 0.2619, + "step": 257 + }, + { + "epoch": 0.07874256065924004, + "grad_norm": 1.8947301386622588, + "learning_rate": 9.938325630386149e-06, + "loss": 0.3933, + "step": 258 + }, + { + "epoch": 0.07904776438272547, + "grad_norm": 1.5625416559388712, + "learning_rate": 9.937549036648857e-06, + "loss": 0.4491, + "step": 259 + }, + { + "epoch": 0.0793529681062109, + "grad_norm": 1.5125179888703784, + "learning_rate": 9.936767614806612e-06, + "loss": 0.3674, + "step": 260 + }, + { + "epoch": 0.07965817182969633, + "grad_norm": 1.5026525250547669, + "learning_rate": 9.935981365623516e-06, + "loss": 0.4103, + "step": 261 + }, + { + "epoch": 0.07996337555318175, + "grad_norm": 2.3948536293362115, + "learning_rate": 9.93519028986839e-06, + "loss": 0.4009, + "step": 262 + }, + { + "epoch": 0.08026857927666718, + "grad_norm": 2.416554371647352, + "learning_rate": 9.934394388314775e-06, + "loss": 0.4265, + "step": 263 + }, + { + "epoch": 0.0805737830001526, + "grad_norm": 1.560923734953618, + "learning_rate": 9.933593661740933e-06, + "loss": 0.303, + "step": 264 + }, + { + "epoch": 0.08087898672363802, + "grad_norm": 1.6053945705234087, + "learning_rate": 9.932788110929837e-06, + "loss": 0.3295, + "step": 265 + }, + { + "epoch": 0.08118419044712345, + "grad_norm": 1.7775437462596928, + "learning_rate": 9.931977736669185e-06, + "loss": 0.2197, + "step": 266 + }, + { + "epoch": 0.08148939417060888, + "grad_norm": 1.701318325041301, + "learning_rate": 9.931162539751392e-06, + "loss": 0.3581, + "step": 267 + }, + { + "epoch": 0.08179459789409431, + "grad_norm": 1.5974548511363529, + "learning_rate": 9.93034252097358e-06, + "loss": 0.3432, + "step": 268 + }, + { + "epoch": 0.08209980161757974, + "grad_norm": 1.8669593065073864, + "learning_rate": 9.929517681137594e-06, + "loss": 0.4133, + "step": 269 + }, + { + "epoch": 0.08240500534106517, + "grad_norm": 1.4895827642408586, + "learning_rate": 9.928688021049991e-06, + "loss": 0.3111, + "step": 270 + }, + { + "epoch": 0.0827102090645506, + "grad_norm": 1.4317804244871846, + "learning_rate": 9.927853541522041e-06, + "loss": 0.2915, + "step": 271 + }, + { + "epoch": 0.08301541278803601, + "grad_norm": 1.252478145781798, + "learning_rate": 9.927014243369727e-06, + "loss": 0.2794, + "step": 272 + }, + { + "epoch": 0.08332061651152144, + "grad_norm": 1.6973954865497314, + "learning_rate": 9.926170127413743e-06, + "loss": 0.6183, + "step": 273 + }, + { + "epoch": 0.08362582023500686, + "grad_norm": 1.4723277244112698, + "learning_rate": 9.925321194479494e-06, + "loss": 0.2815, + "step": 274 + }, + { + "epoch": 0.08393102395849229, + "grad_norm": 1.7075555550514414, + "learning_rate": 9.924467445397097e-06, + "loss": 0.4178, + "step": 275 + }, + { + "epoch": 0.08423622768197772, + "grad_norm": 1.5354808046910606, + "learning_rate": 9.923608881001377e-06, + "loss": 0.2355, + "step": 276 + }, + { + "epoch": 0.08454143140546315, + "grad_norm": 1.1795750747565834, + "learning_rate": 9.922745502131865e-06, + "loss": 0.3404, + "step": 277 + }, + { + "epoch": 0.08484663512894858, + "grad_norm": 1.427067758888222, + "learning_rate": 9.921877309632805e-06, + "loss": 0.3141, + "step": 278 + }, + { + "epoch": 0.085151838852434, + "grad_norm": 1.3691564278772157, + "learning_rate": 9.921004304353147e-06, + "loss": 0.287, + "step": 279 + }, + { + "epoch": 0.08545704257591942, + "grad_norm": 1.9220775714586407, + "learning_rate": 9.920126487146544e-06, + "loss": 0.6617, + "step": 280 + }, + { + "epoch": 0.08576224629940485, + "grad_norm": 1.6761030408371134, + "learning_rate": 9.919243858871355e-06, + "loss": 0.466, + "step": 281 + }, + { + "epoch": 0.08606745002289028, + "grad_norm": 1.6120747264173168, + "learning_rate": 9.918356420390645e-06, + "loss": 0.5351, + "step": 282 + }, + { + "epoch": 0.0863726537463757, + "grad_norm": 1.5236961732014556, + "learning_rate": 9.91746417257218e-06, + "loss": 0.33, + "step": 283 + }, + { + "epoch": 0.08667785746986113, + "grad_norm": 1.6328635321860312, + "learning_rate": 9.916567116288434e-06, + "loss": 0.4301, + "step": 284 + }, + { + "epoch": 0.08698306119334656, + "grad_norm": 1.4120804188821041, + "learning_rate": 9.915665252416577e-06, + "loss": 0.3025, + "step": 285 + }, + { + "epoch": 0.08728826491683199, + "grad_norm": 1.8410843798908767, + "learning_rate": 9.914758581838482e-06, + "loss": 0.5415, + "step": 286 + }, + { + "epoch": 0.08759346864031742, + "grad_norm": 1.1807475096034001, + "learning_rate": 9.913847105440725e-06, + "loss": 0.3184, + "step": 287 + }, + { + "epoch": 0.08789867236380283, + "grad_norm": 1.52681276111022, + "learning_rate": 9.912930824114577e-06, + "loss": 0.4266, + "step": 288 + }, + { + "epoch": 0.08820387608728826, + "grad_norm": 1.4904538614169496, + "learning_rate": 9.91200973875601e-06, + "loss": 0.3404, + "step": 289 + }, + { + "epoch": 0.08850907981077369, + "grad_norm": 1.7385111110311349, + "learning_rate": 9.911083850265692e-06, + "loss": 0.3371, + "step": 290 + }, + { + "epoch": 0.08881428353425912, + "grad_norm": 1.6013762575114376, + "learning_rate": 9.91015315954899e-06, + "loss": 0.4475, + "step": 291 + }, + { + "epoch": 0.08911948725774455, + "grad_norm": 1.5474202900018152, + "learning_rate": 9.909217667515964e-06, + "loss": 0.4162, + "step": 292 + }, + { + "epoch": 0.08942469098122997, + "grad_norm": 1.875769203080621, + "learning_rate": 9.908277375081371e-06, + "loss": 0.4446, + "step": 293 + }, + { + "epoch": 0.0897298947047154, + "grad_norm": 1.4914731218024286, + "learning_rate": 9.907332283164663e-06, + "loss": 0.4274, + "step": 294 + }, + { + "epoch": 0.09003509842820083, + "grad_norm": 1.6551811079983538, + "learning_rate": 9.90638239268998e-06, + "loss": 0.4883, + "step": 295 + }, + { + "epoch": 0.09034030215168624, + "grad_norm": 1.645510927644492, + "learning_rate": 9.905427704586158e-06, + "loss": 0.4885, + "step": 296 + }, + { + "epoch": 0.09064550587517167, + "grad_norm": 1.6759165462483547, + "learning_rate": 9.904468219786727e-06, + "loss": 0.3878, + "step": 297 + }, + { + "epoch": 0.0909507095986571, + "grad_norm": 1.596800484010474, + "learning_rate": 9.903503939229901e-06, + "loss": 0.2725, + "step": 298 + }, + { + "epoch": 0.09125591332214253, + "grad_norm": 1.4035704196730787, + "learning_rate": 9.902534863858588e-06, + "loss": 0.2147, + "step": 299 + }, + { + "epoch": 0.09156111704562796, + "grad_norm": 1.7460761357385464, + "learning_rate": 9.90156099462038e-06, + "loss": 0.3495, + "step": 300 + }, + { + "epoch": 0.09186632076911339, + "grad_norm": 1.3373562156184522, + "learning_rate": 9.900582332467566e-06, + "loss": 0.342, + "step": 301 + }, + { + "epoch": 0.09217152449259881, + "grad_norm": 1.1466755748188362, + "learning_rate": 9.89959887835711e-06, + "loss": 0.1737, + "step": 302 + }, + { + "epoch": 0.09247672821608424, + "grad_norm": 1.8078659273922337, + "learning_rate": 9.898610633250669e-06, + "loss": 0.3111, + "step": 303 + }, + { + "epoch": 0.09278193193956966, + "grad_norm": 1.5400638324339648, + "learning_rate": 9.897617598114584e-06, + "loss": 0.4746, + "step": 304 + }, + { + "epoch": 0.09308713566305508, + "grad_norm": 1.558728128630052, + "learning_rate": 9.896619773919878e-06, + "loss": 0.3085, + "step": 305 + }, + { + "epoch": 0.09339233938654051, + "grad_norm": 4.094736926672729, + "learning_rate": 9.895617161642257e-06, + "loss": 0.4664, + "step": 306 + }, + { + "epoch": 0.09369754311002594, + "grad_norm": 1.63116898024897, + "learning_rate": 9.89460976226211e-06, + "loss": 0.3878, + "step": 307 + }, + { + "epoch": 0.09400274683351137, + "grad_norm": 1.7238364123731507, + "learning_rate": 9.893597576764508e-06, + "loss": 0.2989, + "step": 308 + }, + { + "epoch": 0.0943079505569968, + "grad_norm": 1.2496662648050174, + "learning_rate": 9.8925806061392e-06, + "loss": 0.3054, + "step": 309 + }, + { + "epoch": 0.09461315428048223, + "grad_norm": 0.8807197003313585, + "learning_rate": 9.891558851380614e-06, + "loss": 0.1904, + "step": 310 + }, + { + "epoch": 0.09491835800396765, + "grad_norm": 1.5076918479598347, + "learning_rate": 9.890532313487858e-06, + "loss": 0.2679, + "step": 311 + }, + { + "epoch": 0.09522356172745307, + "grad_norm": 1.8465691043660122, + "learning_rate": 9.889500993464716e-06, + "loss": 0.5002, + "step": 312 + }, + { + "epoch": 0.0955287654509385, + "grad_norm": 1.9183643810942494, + "learning_rate": 9.888464892319647e-06, + "loss": 0.4869, + "step": 313 + }, + { + "epoch": 0.09583396917442392, + "grad_norm": 1.6515373264151805, + "learning_rate": 9.887424011065788e-06, + "loss": 0.4507, + "step": 314 + }, + { + "epoch": 0.09613917289790935, + "grad_norm": 1.6223391241834122, + "learning_rate": 9.886378350720945e-06, + "loss": 0.3445, + "step": 315 + }, + { + "epoch": 0.09644437662139478, + "grad_norm": 1.4416645097808285, + "learning_rate": 9.885327912307604e-06, + "loss": 0.2808, + "step": 316 + }, + { + "epoch": 0.09674958034488021, + "grad_norm": 1.4777192121308136, + "learning_rate": 9.88427269685292e-06, + "loss": 0.4335, + "step": 317 + }, + { + "epoch": 0.09705478406836564, + "grad_norm": 1.6934694740555867, + "learning_rate": 9.883212705388715e-06, + "loss": 0.4299, + "step": 318 + }, + { + "epoch": 0.09735998779185107, + "grad_norm": 1.9031284601590377, + "learning_rate": 9.882147938951489e-06, + "loss": 0.5364, + "step": 319 + }, + { + "epoch": 0.09766519151533648, + "grad_norm": 1.990035566558448, + "learning_rate": 9.881078398582406e-06, + "loss": 0.6476, + "step": 320 + }, + { + "epoch": 0.09797039523882191, + "grad_norm": 1.4458600630840748, + "learning_rate": 9.8800040853273e-06, + "loss": 0.268, + "step": 321 + }, + { + "epoch": 0.09827559896230734, + "grad_norm": 1.473557254783057, + "learning_rate": 9.878925000236667e-06, + "loss": 0.3889, + "step": 322 + }, + { + "epoch": 0.09858080268579276, + "grad_norm": 1.429462352597184, + "learning_rate": 9.877841144365681e-06, + "loss": 0.3348, + "step": 323 + }, + { + "epoch": 0.0988860064092782, + "grad_norm": 1.9126483909533352, + "learning_rate": 9.876752518774167e-06, + "loss": 0.5004, + "step": 324 + }, + { + "epoch": 0.09919121013276362, + "grad_norm": 1.528278815830415, + "learning_rate": 9.875659124526622e-06, + "loss": 0.1931, + "step": 325 + }, + { + "epoch": 0.09949641385624905, + "grad_norm": 1.6064809314060318, + "learning_rate": 9.874560962692207e-06, + "loss": 0.2627, + "step": 326 + }, + { + "epoch": 0.09980161757973448, + "grad_norm": 1.8583002911468363, + "learning_rate": 9.873458034344741e-06, + "loss": 0.4795, + "step": 327 + }, + { + "epoch": 0.1001068213032199, + "grad_norm": 2.180040993961252, + "learning_rate": 9.872350340562704e-06, + "loss": 0.3502, + "step": 328 + }, + { + "epoch": 0.10041202502670532, + "grad_norm": 2.2760944374886334, + "learning_rate": 9.871237882429237e-06, + "loss": 0.5504, + "step": 329 + }, + { + "epoch": 0.10071722875019075, + "grad_norm": 1.599604903553732, + "learning_rate": 9.87012066103214e-06, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.10102243247367618, + "grad_norm": 1.165592902920284, + "learning_rate": 9.868998677463874e-06, + "loss": 0.2118, + "step": 331 + }, + { + "epoch": 0.1013276361971616, + "grad_norm": 1.562687671834839, + "learning_rate": 9.867871932821549e-06, + "loss": 0.3389, + "step": 332 + }, + { + "epoch": 0.10163283992064703, + "grad_norm": 1.5690938291630006, + "learning_rate": 9.866740428206935e-06, + "loss": 0.3084, + "step": 333 + }, + { + "epoch": 0.10193804364413246, + "grad_norm": 1.6392469885959746, + "learning_rate": 9.865604164726456e-06, + "loss": 0.3935, + "step": 334 + }, + { + "epoch": 0.10224324736761789, + "grad_norm": 1.4807871775535164, + "learning_rate": 9.864463143491192e-06, + "loss": 0.4445, + "step": 335 + }, + { + "epoch": 0.10254845109110332, + "grad_norm": 1.1926217304533164, + "learning_rate": 9.86331736561687e-06, + "loss": 0.2623, + "step": 336 + }, + { + "epoch": 0.10285365481458873, + "grad_norm": 1.5461262133304665, + "learning_rate": 9.862166832223871e-06, + "loss": 0.4161, + "step": 337 + }, + { + "epoch": 0.10315885853807416, + "grad_norm": 1.5135662608063911, + "learning_rate": 9.861011544437226e-06, + "loss": 0.2864, + "step": 338 + }, + { + "epoch": 0.10346406226155959, + "grad_norm": 1.2723458882899108, + "learning_rate": 9.85985150338662e-06, + "loss": 0.3208, + "step": 339 + }, + { + "epoch": 0.10376926598504502, + "grad_norm": 1.5555768560283307, + "learning_rate": 9.858686710206373e-06, + "loss": 0.3341, + "step": 340 + }, + { + "epoch": 0.10407446970853045, + "grad_norm": 1.7562240544367693, + "learning_rate": 9.857517166035466e-06, + "loss": 0.4637, + "step": 341 + }, + { + "epoch": 0.10437967343201587, + "grad_norm": 3.9759170817857044, + "learning_rate": 9.856342872017515e-06, + "loss": 0.6559, + "step": 342 + }, + { + "epoch": 0.1046848771555013, + "grad_norm": 1.6864581247305628, + "learning_rate": 9.855163829300789e-06, + "loss": 0.4889, + "step": 343 + }, + { + "epoch": 0.10499008087898673, + "grad_norm": 1.5660173118774432, + "learning_rate": 9.853980039038193e-06, + "loss": 0.2512, + "step": 344 + }, + { + "epoch": 0.10529528460247214, + "grad_norm": 1.747919617181712, + "learning_rate": 9.85279150238728e-06, + "loss": 0.4796, + "step": 345 + }, + { + "epoch": 0.10560048832595757, + "grad_norm": 2.0064830609007496, + "learning_rate": 9.85159822051024e-06, + "loss": 0.6876, + "step": 346 + }, + { + "epoch": 0.105905692049443, + "grad_norm": 3.232705832740987, + "learning_rate": 9.850400194573908e-06, + "loss": 0.5043, + "step": 347 + }, + { + "epoch": 0.10621089577292843, + "grad_norm": 1.6750365694523466, + "learning_rate": 9.849197425749753e-06, + "loss": 0.4426, + "step": 348 + }, + { + "epoch": 0.10651609949641386, + "grad_norm": 1.2407353388150502, + "learning_rate": 9.847989915213883e-06, + "loss": 0.2728, + "step": 349 + }, + { + "epoch": 0.10682130321989929, + "grad_norm": 1.145770915094197, + "learning_rate": 9.846777664147046e-06, + "loss": 0.1828, + "step": 350 + }, + { + "epoch": 0.10712650694338471, + "grad_norm": 1.0259526919339048, + "learning_rate": 9.845560673734617e-06, + "loss": 0.1815, + "step": 351 + }, + { + "epoch": 0.10743171066687014, + "grad_norm": 1.5927868344920564, + "learning_rate": 9.844338945166619e-06, + "loss": 0.4563, + "step": 352 + }, + { + "epoch": 0.10773691439035556, + "grad_norm": 1.862511817843432, + "learning_rate": 9.843112479637692e-06, + "loss": 0.4873, + "step": 353 + }, + { + "epoch": 0.10804211811384098, + "grad_norm": 1.3346380863318066, + "learning_rate": 9.841881278347122e-06, + "loss": 0.344, + "step": 354 + }, + { + "epoch": 0.10834732183732641, + "grad_norm": 1.2558059799685006, + "learning_rate": 9.840645342498817e-06, + "loss": 0.3489, + "step": 355 + }, + { + "epoch": 0.10865252556081184, + "grad_norm": 1.734496310003891, + "learning_rate": 9.839404673301318e-06, + "loss": 0.4161, + "step": 356 + }, + { + "epoch": 0.10895772928429727, + "grad_norm": 1.5538063811233147, + "learning_rate": 9.838159271967795e-06, + "loss": 0.4971, + "step": 357 + }, + { + "epoch": 0.1092629330077827, + "grad_norm": 1.51967394631939, + "learning_rate": 9.836909139716044e-06, + "loss": 0.3422, + "step": 358 + }, + { + "epoch": 0.10956813673126813, + "grad_norm": 1.8153898943745974, + "learning_rate": 9.835654277768487e-06, + "loss": 0.5826, + "step": 359 + }, + { + "epoch": 0.10987334045475355, + "grad_norm": 1.3956832615282058, + "learning_rate": 9.834394687352168e-06, + "loss": 0.3444, + "step": 360 + }, + { + "epoch": 0.11017854417823897, + "grad_norm": 1.7028097158357791, + "learning_rate": 9.833130369698764e-06, + "loss": 0.4653, + "step": 361 + }, + { + "epoch": 0.1104837479017244, + "grad_norm": 1.984086744142622, + "learning_rate": 9.831861326044564e-06, + "loss": 0.7268, + "step": 362 + }, + { + "epoch": 0.11078895162520982, + "grad_norm": 1.6010962575202456, + "learning_rate": 9.830587557630481e-06, + "loss": 0.4979, + "step": 363 + }, + { + "epoch": 0.11109415534869525, + "grad_norm": 1.163967768763629, + "learning_rate": 9.829309065702054e-06, + "loss": 0.2721, + "step": 364 + }, + { + "epoch": 0.11139935907218068, + "grad_norm": 1.5116526665647354, + "learning_rate": 9.828025851509433e-06, + "loss": 0.4504, + "step": 365 + }, + { + "epoch": 0.11170456279566611, + "grad_norm": 1.3126613426949747, + "learning_rate": 9.82673791630739e-06, + "loss": 0.2915, + "step": 366 + }, + { + "epoch": 0.11200976651915154, + "grad_norm": 1.2919896962667308, + "learning_rate": 9.825445261355313e-06, + "loss": 0.2862, + "step": 367 + }, + { + "epoch": 0.11231497024263697, + "grad_norm": 1.7465438774619029, + "learning_rate": 9.824147887917201e-06, + "loss": 0.5347, + "step": 368 + }, + { + "epoch": 0.11262017396612238, + "grad_norm": 1.5226131458552274, + "learning_rate": 9.822845797261676e-06, + "loss": 0.4764, + "step": 369 + }, + { + "epoch": 0.11292537768960781, + "grad_norm": 1.157991129368488, + "learning_rate": 9.82153899066196e-06, + "loss": 0.2002, + "step": 370 + }, + { + "epoch": 0.11323058141309324, + "grad_norm": 0.9301400886155634, + "learning_rate": 9.8202274693959e-06, + "loss": 0.1684, + "step": 371 + }, + { + "epoch": 0.11353578513657867, + "grad_norm": 1.502487900950231, + "learning_rate": 9.818911234745942e-06, + "loss": 0.3093, + "step": 372 + }, + { + "epoch": 0.1138409888600641, + "grad_norm": 1.9784385010881411, + "learning_rate": 9.817590287999149e-06, + "loss": 0.4803, + "step": 373 + }, + { + "epoch": 0.11414619258354952, + "grad_norm": 0.9195545540370847, + "learning_rate": 9.816264630447186e-06, + "loss": 0.2159, + "step": 374 + }, + { + "epoch": 0.11445139630703495, + "grad_norm": 1.9332444000616873, + "learning_rate": 9.814934263386324e-06, + "loss": 0.704, + "step": 375 + }, + { + "epoch": 0.11475660003052038, + "grad_norm": 1.2982271338494382, + "learning_rate": 9.813599188117447e-06, + "loss": 0.2579, + "step": 376 + }, + { + "epoch": 0.11506180375400579, + "grad_norm": 1.4400207145843447, + "learning_rate": 9.812259405946033e-06, + "loss": 0.2581, + "step": 377 + }, + { + "epoch": 0.11536700747749122, + "grad_norm": 1.6214016909138647, + "learning_rate": 9.810914918182168e-06, + "loss": 0.3348, + "step": 378 + }, + { + "epoch": 0.11567221120097665, + "grad_norm": 1.3037284021243603, + "learning_rate": 9.80956572614054e-06, + "loss": 0.2009, + "step": 379 + }, + { + "epoch": 0.11597741492446208, + "grad_norm": 1.4597999101205241, + "learning_rate": 9.808211831140434e-06, + "loss": 0.4417, + "step": 380 + }, + { + "epoch": 0.1162826186479475, + "grad_norm": 1.3133108268461724, + "learning_rate": 9.806853234505736e-06, + "loss": 0.3305, + "step": 381 + }, + { + "epoch": 0.11658782237143293, + "grad_norm": 1.455145466929522, + "learning_rate": 9.805489937564926e-06, + "loss": 0.4611, + "step": 382 + }, + { + "epoch": 0.11689302609491836, + "grad_norm": 1.3560358010182432, + "learning_rate": 9.804121941651085e-06, + "loss": 0.2624, + "step": 383 + }, + { + "epoch": 0.11719822981840379, + "grad_norm": 1.5730489544680661, + "learning_rate": 9.802749248101885e-06, + "loss": 0.5959, + "step": 384 + }, + { + "epoch": 0.1175034335418892, + "grad_norm": 1.6080011966995038, + "learning_rate": 9.801371858259594e-06, + "loss": 0.3077, + "step": 385 + }, + { + "epoch": 0.11780863726537463, + "grad_norm": 1.3452073998773173, + "learning_rate": 9.799989773471071e-06, + "loss": 0.3877, + "step": 386 + }, + { + "epoch": 0.11811384098886006, + "grad_norm": 1.521531541360139, + "learning_rate": 9.798602995087764e-06, + "loss": 0.2978, + "step": 387 + }, + { + "epoch": 0.11841904471234549, + "grad_norm": 1.705542362457564, + "learning_rate": 9.797211524465715e-06, + "loss": 0.4298, + "step": 388 + }, + { + "epoch": 0.11872424843583092, + "grad_norm": 2.110762528312395, + "learning_rate": 9.79581536296555e-06, + "loss": 0.4766, + "step": 389 + }, + { + "epoch": 0.11902945215931635, + "grad_norm": 1.8442245131411212, + "learning_rate": 9.794414511952484e-06, + "loss": 0.2988, + "step": 390 + }, + { + "epoch": 0.11933465588280177, + "grad_norm": 1.2950053698748747, + "learning_rate": 9.793008972796318e-06, + "loss": 0.2907, + "step": 391 + }, + { + "epoch": 0.1196398596062872, + "grad_norm": 1.1692504994324242, + "learning_rate": 9.791598746871438e-06, + "loss": 0.2584, + "step": 392 + }, + { + "epoch": 0.11994506332977262, + "grad_norm": 1.7847769770487698, + "learning_rate": 9.790183835556806e-06, + "loss": 0.4874, + "step": 393 + }, + { + "epoch": 0.12025026705325804, + "grad_norm": 1.2562325547558533, + "learning_rate": 9.788764240235976e-06, + "loss": 0.3739, + "step": 394 + }, + { + "epoch": 0.12055547077674347, + "grad_norm": 1.340465345857484, + "learning_rate": 9.787339962297076e-06, + "loss": 0.1911, + "step": 395 + }, + { + "epoch": 0.1208606745002289, + "grad_norm": 1.5667496682899116, + "learning_rate": 9.785911003132811e-06, + "loss": 0.4243, + "step": 396 + }, + { + "epoch": 0.12116587822371433, + "grad_norm": 1.81118055527733, + "learning_rate": 9.78447736414047e-06, + "loss": 0.5246, + "step": 397 + }, + { + "epoch": 0.12147108194719976, + "grad_norm": 1.4053388398772566, + "learning_rate": 9.783039046721912e-06, + "loss": 0.2964, + "step": 398 + }, + { + "epoch": 0.12177628567068519, + "grad_norm": 1.5253960954907508, + "learning_rate": 9.781596052283573e-06, + "loss": 0.3939, + "step": 399 + }, + { + "epoch": 0.12208148939417061, + "grad_norm": 1.4250712259795966, + "learning_rate": 9.780148382236465e-06, + "loss": 0.2338, + "step": 400 + }, + { + "epoch": 0.12238669311765603, + "grad_norm": 1.6445234212439845, + "learning_rate": 9.778696037996167e-06, + "loss": 0.6218, + "step": 401 + }, + { + "epoch": 0.12269189684114146, + "grad_norm": 1.625655408880631, + "learning_rate": 9.777239020982834e-06, + "loss": 0.4565, + "step": 402 + }, + { + "epoch": 0.12299710056462689, + "grad_norm": 1.2734470484088918, + "learning_rate": 9.775777332621184e-06, + "loss": 0.3673, + "step": 403 + }, + { + "epoch": 0.12330230428811231, + "grad_norm": 1.5775928166525761, + "learning_rate": 9.774310974340506e-06, + "loss": 0.3673, + "step": 404 + }, + { + "epoch": 0.12360750801159774, + "grad_norm": 1.2252627733176171, + "learning_rate": 9.772839947574658e-06, + "loss": 0.4064, + "step": 405 + }, + { + "epoch": 0.12391271173508317, + "grad_norm": 1.7189364245488037, + "learning_rate": 9.77136425376206e-06, + "loss": 0.4633, + "step": 406 + }, + { + "epoch": 0.1242179154585686, + "grad_norm": 1.7558994988767551, + "learning_rate": 9.769883894345693e-06, + "loss": 0.5655, + "step": 407 + }, + { + "epoch": 0.12452311918205403, + "grad_norm": 1.4802133948901528, + "learning_rate": 9.768398870773108e-06, + "loss": 0.4175, + "step": 408 + }, + { + "epoch": 0.12482832290553945, + "grad_norm": 1.3302751078782091, + "learning_rate": 9.766909184496408e-06, + "loss": 0.3468, + "step": 409 + }, + { + "epoch": 0.12513352662902488, + "grad_norm": 1.704721594870751, + "learning_rate": 9.765414836972262e-06, + "loss": 0.4898, + "step": 410 + }, + { + "epoch": 0.1254387303525103, + "grad_norm": 1.2131375907547226, + "learning_rate": 9.763915829661891e-06, + "loss": 0.3886, + "step": 411 + }, + { + "epoch": 0.12574393407599574, + "grad_norm": 1.3401723742285034, + "learning_rate": 9.76241216403108e-06, + "loss": 0.4102, + "step": 412 + }, + { + "epoch": 0.12604913779948115, + "grad_norm": 1.6513004120286505, + "learning_rate": 9.760903841550164e-06, + "loss": 0.433, + "step": 413 + }, + { + "epoch": 0.12635434152296657, + "grad_norm": 2.2390646637016327, + "learning_rate": 9.75939086369403e-06, + "loss": 0.4213, + "step": 414 + }, + { + "epoch": 0.126659545246452, + "grad_norm": 1.585281461881457, + "learning_rate": 9.757873231942122e-06, + "loss": 0.2145, + "step": 415 + }, + { + "epoch": 0.12696474896993742, + "grad_norm": 1.616278056338578, + "learning_rate": 9.756350947778431e-06, + "loss": 0.3786, + "step": 416 + }, + { + "epoch": 0.12726995269342287, + "grad_norm": 1.3896314522086528, + "learning_rate": 9.754824012691499e-06, + "loss": 0.3886, + "step": 417 + }, + { + "epoch": 0.12757515641690828, + "grad_norm": 1.5388381717065547, + "learning_rate": 9.753292428174416e-06, + "loss": 0.4901, + "step": 418 + }, + { + "epoch": 0.12788036014039372, + "grad_norm": 1.2099556172325527, + "learning_rate": 9.75175619572482e-06, + "loss": 0.2379, + "step": 419 + }, + { + "epoch": 0.12818556386387914, + "grad_norm": 2.578673148403812, + "learning_rate": 9.750215316844886e-06, + "loss": 0.3093, + "step": 420 + }, + { + "epoch": 0.12849076758736458, + "grad_norm": 1.3557950817043143, + "learning_rate": 9.748669793041345e-06, + "loss": 0.2901, + "step": 421 + }, + { + "epoch": 0.12879597131085, + "grad_norm": 1.8058846206435177, + "learning_rate": 9.747119625825459e-06, + "loss": 0.5222, + "step": 422 + }, + { + "epoch": 0.1291011750343354, + "grad_norm": 1.449090093366816, + "learning_rate": 9.745564816713034e-06, + "loss": 0.3738, + "step": 423 + }, + { + "epoch": 0.12940637875782085, + "grad_norm": 1.3181509799847857, + "learning_rate": 9.74400536722442e-06, + "loss": 0.2624, + "step": 424 + }, + { + "epoch": 0.12971158248130626, + "grad_norm": 2.1599699357377675, + "learning_rate": 9.742441278884496e-06, + "loss": 0.4838, + "step": 425 + }, + { + "epoch": 0.1300167862047917, + "grad_norm": 1.6258096878519581, + "learning_rate": 9.740872553222685e-06, + "loss": 0.4999, + "step": 426 + }, + { + "epoch": 0.13032198992827712, + "grad_norm": 1.5046387312958875, + "learning_rate": 9.739299191772937e-06, + "loss": 0.3095, + "step": 427 + }, + { + "epoch": 0.13062719365176256, + "grad_norm": 3.2565177704876653, + "learning_rate": 9.737721196073742e-06, + "loss": 0.4886, + "step": 428 + }, + { + "epoch": 0.13093239737524798, + "grad_norm": 5.072257540244327, + "learning_rate": 9.736138567668117e-06, + "loss": 0.2433, + "step": 429 + }, + { + "epoch": 0.1312376010987334, + "grad_norm": 1.5984597272623309, + "learning_rate": 9.734551308103607e-06, + "loss": 0.3274, + "step": 430 + }, + { + "epoch": 0.13154280482221883, + "grad_norm": 1.9766310728941525, + "learning_rate": 9.732959418932297e-06, + "loss": 0.5601, + "step": 431 + }, + { + "epoch": 0.13184800854570425, + "grad_norm": 1.8500535183975242, + "learning_rate": 9.731362901710783e-06, + "loss": 0.8055, + "step": 432 + }, + { + "epoch": 0.1321532122691897, + "grad_norm": 1.5082490121163843, + "learning_rate": 9.7297617580002e-06, + "loss": 0.3715, + "step": 433 + }, + { + "epoch": 0.1324584159926751, + "grad_norm": 1.3484742708817405, + "learning_rate": 9.728155989366198e-06, + "loss": 0.2316, + "step": 434 + }, + { + "epoch": 0.13276361971616055, + "grad_norm": 1.4064281774599339, + "learning_rate": 9.726545597378953e-06, + "loss": 0.2529, + "step": 435 + }, + { + "epoch": 0.13306882343964596, + "grad_norm": 1.3619428913058238, + "learning_rate": 9.724930583613164e-06, + "loss": 0.2675, + "step": 436 + }, + { + "epoch": 0.1333740271631314, + "grad_norm": 1.8072201350967787, + "learning_rate": 9.723310949648044e-06, + "loss": 0.4832, + "step": 437 + }, + { + "epoch": 0.13367923088661682, + "grad_norm": 1.376837369774298, + "learning_rate": 9.721686697067328e-06, + "loss": 0.3855, + "step": 438 + }, + { + "epoch": 0.13398443461010223, + "grad_norm": 1.577084491012061, + "learning_rate": 9.720057827459264e-06, + "loss": 0.3307, + "step": 439 + }, + { + "epoch": 0.13428963833358767, + "grad_norm": 1.7335930513549882, + "learning_rate": 9.718424342416619e-06, + "loss": 0.3794, + "step": 440 + }, + { + "epoch": 0.1345948420570731, + "grad_norm": 1.3979759520207362, + "learning_rate": 9.716786243536672e-06, + "loss": 0.17, + "step": 441 + }, + { + "epoch": 0.13490004578055853, + "grad_norm": 1.5166020341173607, + "learning_rate": 9.715143532421208e-06, + "loss": 0.2782, + "step": 442 + }, + { + "epoch": 0.13520524950404395, + "grad_norm": 1.5547284451560954, + "learning_rate": 9.71349621067653e-06, + "loss": 0.2398, + "step": 443 + }, + { + "epoch": 0.1355104532275294, + "grad_norm": 1.4706416022210265, + "learning_rate": 9.711844279913443e-06, + "loss": 0.3486, + "step": 444 + }, + { + "epoch": 0.1358156569510148, + "grad_norm": 1.7726813901729983, + "learning_rate": 9.710187741747264e-06, + "loss": 0.4938, + "step": 445 + }, + { + "epoch": 0.13612086067450022, + "grad_norm": 1.3682846898358798, + "learning_rate": 9.708526597797812e-06, + "loss": 0.3399, + "step": 446 + }, + { + "epoch": 0.13642606439798566, + "grad_norm": 1.173206742476641, + "learning_rate": 9.70686084968941e-06, + "loss": 0.209, + "step": 447 + }, + { + "epoch": 0.13673126812147107, + "grad_norm": 1.5006667880365117, + "learning_rate": 9.705190499050885e-06, + "loss": 0.3918, + "step": 448 + }, + { + "epoch": 0.13703647184495651, + "grad_norm": 1.4081281623516984, + "learning_rate": 9.70351554751556e-06, + "loss": 0.3478, + "step": 449 + }, + { + "epoch": 0.13734167556844193, + "grad_norm": 1.3521857616183288, + "learning_rate": 9.701835996721267e-06, + "loss": 0.2527, + "step": 450 + }, + { + "epoch": 0.13764687929192737, + "grad_norm": 1.609701105059192, + "learning_rate": 9.70015184831032e-06, + "loss": 0.4019, + "step": 451 + }, + { + "epoch": 0.13795208301541279, + "grad_norm": 1.5247656094886828, + "learning_rate": 9.698463103929542e-06, + "loss": 0.3393, + "step": 452 + }, + { + "epoch": 0.13825728673889823, + "grad_norm": 1.7521263555747508, + "learning_rate": 9.696769765230244e-06, + "loss": 0.4711, + "step": 453 + }, + { + "epoch": 0.13856249046238364, + "grad_norm": 1.1750952661731386, + "learning_rate": 9.695071833868233e-06, + "loss": 0.3209, + "step": 454 + }, + { + "epoch": 0.13886769418586906, + "grad_norm": 1.1097718724387464, + "learning_rate": 9.693369311503801e-06, + "loss": 0.2077, + "step": 455 + }, + { + "epoch": 0.1391728979093545, + "grad_norm": 1.3477077308441543, + "learning_rate": 9.691662199801735e-06, + "loss": 0.3387, + "step": 456 + }, + { + "epoch": 0.1394781016328399, + "grad_norm": 1.5926509134118427, + "learning_rate": 9.689950500431306e-06, + "loss": 0.4737, + "step": 457 + }, + { + "epoch": 0.13978330535632535, + "grad_norm": 1.238821976588628, + "learning_rate": 9.688234215066274e-06, + "loss": 0.2941, + "step": 458 + }, + { + "epoch": 0.14008850907981077, + "grad_norm": 1.283103486116252, + "learning_rate": 9.68651334538488e-06, + "loss": 0.2298, + "step": 459 + }, + { + "epoch": 0.1403937128032962, + "grad_norm": 1.3964480095243228, + "learning_rate": 9.684787893069852e-06, + "loss": 0.2755, + "step": 460 + }, + { + "epoch": 0.14069891652678163, + "grad_norm": 4.5768637107742185, + "learning_rate": 9.683057859808394e-06, + "loss": 0.3969, + "step": 461 + }, + { + "epoch": 0.14100412025026704, + "grad_norm": 1.238974584705195, + "learning_rate": 9.681323247292193e-06, + "loss": 0.302, + "step": 462 + }, + { + "epoch": 0.14130932397375248, + "grad_norm": 1.484367852444757, + "learning_rate": 9.679584057217412e-06, + "loss": 0.3771, + "step": 463 + }, + { + "epoch": 0.1416145276972379, + "grad_norm": 1.8892302609803566, + "learning_rate": 9.677840291284693e-06, + "loss": 0.4296, + "step": 464 + }, + { + "epoch": 0.14191973142072334, + "grad_norm": 1.9264735588835349, + "learning_rate": 9.676091951199147e-06, + "loss": 0.1789, + "step": 465 + }, + { + "epoch": 0.14222493514420875, + "grad_norm": 1.6779527799580054, + "learning_rate": 9.674339038670362e-06, + "loss": 0.3394, + "step": 466 + }, + { + "epoch": 0.1425301388676942, + "grad_norm": 1.4269803792009585, + "learning_rate": 9.672581555412396e-06, + "loss": 0.3436, + "step": 467 + }, + { + "epoch": 0.1428353425911796, + "grad_norm": 1.6997561953129157, + "learning_rate": 9.67081950314378e-06, + "loss": 0.4244, + "step": 468 + }, + { + "epoch": 0.14314054631466505, + "grad_norm": 1.5847658235861504, + "learning_rate": 9.669052883587503e-06, + "loss": 0.4061, + "step": 469 + }, + { + "epoch": 0.14344575003815047, + "grad_norm": 1.5000132719088555, + "learning_rate": 9.667281698471032e-06, + "loss": 0.2349, + "step": 470 + }, + { + "epoch": 0.14375095376163588, + "grad_norm": 1.4700398564287758, + "learning_rate": 9.665505949526288e-06, + "loss": 0.4265, + "step": 471 + }, + { + "epoch": 0.14405615748512132, + "grad_norm": 1.1929108671224367, + "learning_rate": 9.663725638489662e-06, + "loss": 0.3107, + "step": 472 + }, + { + "epoch": 0.14436136120860674, + "grad_norm": 1.1577689014622543, + "learning_rate": 9.661940767102001e-06, + "loss": 0.1722, + "step": 473 + }, + { + "epoch": 0.14466656493209218, + "grad_norm": 2.093772204836488, + "learning_rate": 9.660151337108617e-06, + "loss": 0.4278, + "step": 474 + }, + { + "epoch": 0.1449717686555776, + "grad_norm": 1.6654665758895533, + "learning_rate": 9.658357350259274e-06, + "loss": 0.3394, + "step": 475 + }, + { + "epoch": 0.14527697237906304, + "grad_norm": 1.2202451289258887, + "learning_rate": 9.656558808308193e-06, + "loss": 0.3433, + "step": 476 + }, + { + "epoch": 0.14558217610254845, + "grad_norm": 1.5418473816537024, + "learning_rate": 9.654755713014052e-06, + "loss": 0.4099, + "step": 477 + }, + { + "epoch": 0.14588737982603386, + "grad_norm": 1.867172943441559, + "learning_rate": 9.652948066139978e-06, + "loss": 0.4445, + "step": 478 + }, + { + "epoch": 0.1461925835495193, + "grad_norm": 1.2274320399473075, + "learning_rate": 9.651135869453552e-06, + "loss": 0.3687, + "step": 479 + }, + { + "epoch": 0.14649778727300472, + "grad_norm": 1.3632570280967384, + "learning_rate": 9.649319124726799e-06, + "loss": 0.3554, + "step": 480 + }, + { + "epoch": 0.14680299099649016, + "grad_norm": 1.9368328668689925, + "learning_rate": 9.647497833736197e-06, + "loss": 0.433, + "step": 481 + }, + { + "epoch": 0.14710819471997558, + "grad_norm": 1.7750440538339176, + "learning_rate": 9.645671998262668e-06, + "loss": 0.4275, + "step": 482 + }, + { + "epoch": 0.14741339844346102, + "grad_norm": 1.7620981827052555, + "learning_rate": 9.643841620091572e-06, + "loss": 0.4327, + "step": 483 + }, + { + "epoch": 0.14771860216694643, + "grad_norm": 1.705808013569081, + "learning_rate": 9.642006701012719e-06, + "loss": 0.2914, + "step": 484 + }, + { + "epoch": 0.14802380589043188, + "grad_norm": 1.4490666476731855, + "learning_rate": 9.640167242820356e-06, + "loss": 0.4171, + "step": 485 + }, + { + "epoch": 0.1483290096139173, + "grad_norm": 1.4017898188106575, + "learning_rate": 9.638323247313167e-06, + "loss": 0.3609, + "step": 486 + }, + { + "epoch": 0.1486342133374027, + "grad_norm": 1.273585773097182, + "learning_rate": 9.636474716294275e-06, + "loss": 0.2781, + "step": 487 + }, + { + "epoch": 0.14893941706088815, + "grad_norm": 1.4426438722313946, + "learning_rate": 9.634621651571235e-06, + "loss": 0.3446, + "step": 488 + }, + { + "epoch": 0.14924462078437356, + "grad_norm": 1.3396568501630033, + "learning_rate": 9.632764054956042e-06, + "loss": 0.3209, + "step": 489 + }, + { + "epoch": 0.149549824507859, + "grad_norm": 1.869987463373602, + "learning_rate": 9.630901928265113e-06, + "loss": 0.6676, + "step": 490 + }, + { + "epoch": 0.14985502823134442, + "grad_norm": 1.3995020559881168, + "learning_rate": 9.6290352733193e-06, + "loss": 0.443, + "step": 491 + }, + { + "epoch": 0.15016023195482986, + "grad_norm": 1.5646801044524754, + "learning_rate": 9.627164091943886e-06, + "loss": 0.4808, + "step": 492 + }, + { + "epoch": 0.15046543567831527, + "grad_norm": 1.4064811516565765, + "learning_rate": 9.625288385968572e-06, + "loss": 0.3281, + "step": 493 + }, + { + "epoch": 0.15077063940180072, + "grad_norm": 1.3603824959747954, + "learning_rate": 9.623408157227493e-06, + "loss": 0.4203, + "step": 494 + }, + { + "epoch": 0.15107584312528613, + "grad_norm": 1.4648922518012057, + "learning_rate": 9.621523407559193e-06, + "loss": 0.4691, + "step": 495 + }, + { + "epoch": 0.15138104684877154, + "grad_norm": 1.2898880598011702, + "learning_rate": 9.619634138806653e-06, + "loss": 0.22, + "step": 496 + }, + { + "epoch": 0.151686250572257, + "grad_norm": 1.629312149082348, + "learning_rate": 9.61774035281726e-06, + "loss": 0.3852, + "step": 497 + }, + { + "epoch": 0.1519914542957424, + "grad_norm": 1.4209940770003642, + "learning_rate": 9.615842051442825e-06, + "loss": 0.3434, + "step": 498 + }, + { + "epoch": 0.15229665801922784, + "grad_norm": 1.5981471419786573, + "learning_rate": 9.613939236539571e-06, + "loss": 0.4405, + "step": 499 + }, + { + "epoch": 0.15260186174271326, + "grad_norm": 1.6345273007096384, + "learning_rate": 9.612031909968138e-06, + "loss": 0.4621, + "step": 500 + }, + { + "epoch": 0.1529070654661987, + "grad_norm": 1.7946527422515466, + "learning_rate": 9.610120073593574e-06, + "loss": 0.4215, + "step": 501 + }, + { + "epoch": 0.1532122691896841, + "grad_norm": 1.9696007964079152, + "learning_rate": 9.608203729285337e-06, + "loss": 0.2416, + "step": 502 + }, + { + "epoch": 0.15351747291316953, + "grad_norm": 1.186483550480542, + "learning_rate": 9.606282878917296e-06, + "loss": 0.1656, + "step": 503 + }, + { + "epoch": 0.15382267663665497, + "grad_norm": 1.3709781521921298, + "learning_rate": 9.604357524367723e-06, + "loss": 0.3374, + "step": 504 + }, + { + "epoch": 0.15412788036014038, + "grad_norm": 1.4744363645402312, + "learning_rate": 9.602427667519297e-06, + "loss": 0.3472, + "step": 505 + }, + { + "epoch": 0.15443308408362583, + "grad_norm": 1.7032963377834875, + "learning_rate": 9.600493310259098e-06, + "loss": 0.4352, + "step": 506 + }, + { + "epoch": 0.15473828780711124, + "grad_norm": 1.487020684601837, + "learning_rate": 9.598554454478608e-06, + "loss": 0.2661, + "step": 507 + }, + { + "epoch": 0.15504349153059668, + "grad_norm": 1.2499312381905126, + "learning_rate": 9.596611102073703e-06, + "loss": 0.2785, + "step": 508 + }, + { + "epoch": 0.1553486952540821, + "grad_norm": 1.529878897767237, + "learning_rate": 9.594663254944664e-06, + "loss": 0.3768, + "step": 509 + }, + { + "epoch": 0.15565389897756754, + "grad_norm": 1.5214931502474798, + "learning_rate": 9.592710914996162e-06, + "loss": 0.5126, + "step": 510 + }, + { + "epoch": 0.15595910270105295, + "grad_norm": 1.3836939529329817, + "learning_rate": 9.590754084137259e-06, + "loss": 0.3011, + "step": 511 + }, + { + "epoch": 0.15626430642453837, + "grad_norm": 1.4833094737816435, + "learning_rate": 9.588792764281413e-06, + "loss": 0.4761, + "step": 512 + }, + { + "epoch": 0.1565695101480238, + "grad_norm": 1.2346664760598378, + "learning_rate": 9.586826957346473e-06, + "loss": 0.2454, + "step": 513 + }, + { + "epoch": 0.15687471387150922, + "grad_norm": 1.6476636760719772, + "learning_rate": 9.584856665254667e-06, + "loss": 0.2867, + "step": 514 + }, + { + "epoch": 0.15717991759499467, + "grad_norm": 1.5637210652382973, + "learning_rate": 9.58288188993262e-06, + "loss": 0.2899, + "step": 515 + }, + { + "epoch": 0.15748512131848008, + "grad_norm": 1.3205463270086828, + "learning_rate": 9.580902633311331e-06, + "loss": 0.3756, + "step": 516 + }, + { + "epoch": 0.15779032504196552, + "grad_norm": 1.3975127161911243, + "learning_rate": 9.578918897326186e-06, + "loss": 0.5111, + "step": 517 + }, + { + "epoch": 0.15809552876545094, + "grad_norm": 1.4459383872321914, + "learning_rate": 9.57693068391695e-06, + "loss": 0.4283, + "step": 518 + }, + { + "epoch": 0.15840073248893635, + "grad_norm": 1.7061785001760192, + "learning_rate": 9.574937995027767e-06, + "loss": 0.3702, + "step": 519 + }, + { + "epoch": 0.1587059362124218, + "grad_norm": 1.822247438656905, + "learning_rate": 9.572940832607157e-06, + "loss": 0.3406, + "step": 520 + }, + { + "epoch": 0.1590111399359072, + "grad_norm": 2.44932543751886, + "learning_rate": 9.570939198608013e-06, + "loss": 0.3118, + "step": 521 + }, + { + "epoch": 0.15931634365939265, + "grad_norm": 1.6119202421775476, + "learning_rate": 9.5689330949876e-06, + "loss": 0.3903, + "step": 522 + }, + { + "epoch": 0.15962154738287807, + "grad_norm": 1.4848330945324375, + "learning_rate": 9.56692252370756e-06, + "loss": 0.2336, + "step": 523 + }, + { + "epoch": 0.1599267511063635, + "grad_norm": 1.410632362194396, + "learning_rate": 9.564907486733893e-06, + "loss": 0.2749, + "step": 524 + }, + { + "epoch": 0.16023195482984892, + "grad_norm": 1.596052638125191, + "learning_rate": 9.562887986036975e-06, + "loss": 0.4752, + "step": 525 + }, + { + "epoch": 0.16053715855333436, + "grad_norm": 1.7668740909494465, + "learning_rate": 9.560864023591541e-06, + "loss": 0.4457, + "step": 526 + }, + { + "epoch": 0.16084236227681978, + "grad_norm": 1.4009268145182425, + "learning_rate": 9.558835601376692e-06, + "loss": 0.2615, + "step": 527 + }, + { + "epoch": 0.1611475660003052, + "grad_norm": 1.7299333351168085, + "learning_rate": 9.55680272137589e-06, + "loss": 0.5216, + "step": 528 + }, + { + "epoch": 0.16145276972379063, + "grad_norm": 1.398003196407042, + "learning_rate": 9.554765385576951e-06, + "loss": 0.2917, + "step": 529 + }, + { + "epoch": 0.16175797344727605, + "grad_norm": 1.4037115710357768, + "learning_rate": 9.552723595972055e-06, + "loss": 0.2794, + "step": 530 + }, + { + "epoch": 0.1620631771707615, + "grad_norm": 1.4104804936912443, + "learning_rate": 9.550677354557734e-06, + "loss": 0.3294, + "step": 531 + }, + { + "epoch": 0.1623683808942469, + "grad_norm": 1.3043707731550427, + "learning_rate": 9.548626663334872e-06, + "loss": 0.3542, + "step": 532 + }, + { + "epoch": 0.16267358461773235, + "grad_norm": 1.4523817232860987, + "learning_rate": 9.546571524308707e-06, + "loss": 0.4394, + "step": 533 + }, + { + "epoch": 0.16297878834121776, + "grad_norm": 1.2378417959119585, + "learning_rate": 9.544511939488823e-06, + "loss": 0.2859, + "step": 534 + }, + { + "epoch": 0.16328399206470318, + "grad_norm": 1.1623856416463947, + "learning_rate": 9.542447910889152e-06, + "loss": 0.2682, + "step": 535 + }, + { + "epoch": 0.16358919578818862, + "grad_norm": 1.503292443987416, + "learning_rate": 9.540379440527974e-06, + "loss": 0.4513, + "step": 536 + }, + { + "epoch": 0.16389439951167403, + "grad_norm": 1.278183220840744, + "learning_rate": 9.538306530427908e-06, + "loss": 0.2486, + "step": 537 + }, + { + "epoch": 0.16419960323515947, + "grad_norm": 1.477438530587252, + "learning_rate": 9.536229182615919e-06, + "loss": 0.4748, + "step": 538 + }, + { + "epoch": 0.1645048069586449, + "grad_norm": 1.161000468008389, + "learning_rate": 9.534147399123308e-06, + "loss": 0.3166, + "step": 539 + }, + { + "epoch": 0.16481001068213033, + "grad_norm": 1.3151690275104762, + "learning_rate": 9.532061181985713e-06, + "loss": 0.3547, + "step": 540 + }, + { + "epoch": 0.16511521440561575, + "grad_norm": 1.750297725419665, + "learning_rate": 9.529970533243112e-06, + "loss": 0.4156, + "step": 541 + }, + { + "epoch": 0.1654204181291012, + "grad_norm": 1.3465531852012238, + "learning_rate": 9.52787545493981e-06, + "loss": 0.3366, + "step": 542 + }, + { + "epoch": 0.1657256218525866, + "grad_norm": 1.5440141830188223, + "learning_rate": 9.525775949124447e-06, + "loss": 0.3376, + "step": 543 + }, + { + "epoch": 0.16603082557607202, + "grad_norm": 1.5415130315298482, + "learning_rate": 9.523672017849995e-06, + "loss": 0.541, + "step": 544 + }, + { + "epoch": 0.16633602929955746, + "grad_norm": 1.7579856956776627, + "learning_rate": 9.521563663173746e-06, + "loss": 0.4806, + "step": 545 + }, + { + "epoch": 0.16664123302304287, + "grad_norm": 1.7818866729621587, + "learning_rate": 9.519450887157324e-06, + "loss": 0.5464, + "step": 546 + }, + { + "epoch": 0.16694643674652831, + "grad_norm": 1.6064088023016758, + "learning_rate": 9.517333691866672e-06, + "loss": 0.43, + "step": 547 + }, + { + "epoch": 0.16725164047001373, + "grad_norm": 1.4778043177300115, + "learning_rate": 9.515212079372059e-06, + "loss": 0.4399, + "step": 548 + }, + { + "epoch": 0.16755684419349917, + "grad_norm": 2.027608100840915, + "learning_rate": 9.513086051748069e-06, + "loss": 0.4069, + "step": 549 + }, + { + "epoch": 0.16786204791698459, + "grad_norm": 1.5850802204517391, + "learning_rate": 9.510955611073605e-06, + "loss": 0.3827, + "step": 550 + }, + { + "epoch": 0.16816725164047, + "grad_norm": 1.293889481401633, + "learning_rate": 9.508820759431883e-06, + "loss": 0.2572, + "step": 551 + }, + { + "epoch": 0.16847245536395544, + "grad_norm": 1.842230801320139, + "learning_rate": 9.506681498910437e-06, + "loss": 0.5275, + "step": 552 + }, + { + "epoch": 0.16877765908744086, + "grad_norm": 1.1873629641518748, + "learning_rate": 9.50453783160111e-06, + "loss": 0.3282, + "step": 553 + }, + { + "epoch": 0.1690828628109263, + "grad_norm": 1.6093453710106354, + "learning_rate": 9.50238975960005e-06, + "loss": 0.5784, + "step": 554 + }, + { + "epoch": 0.1693880665344117, + "grad_norm": 1.8900657646403543, + "learning_rate": 9.500237285007719e-06, + "loss": 0.5224, + "step": 555 + }, + { + "epoch": 0.16969327025789716, + "grad_norm": 1.4803576264157936, + "learning_rate": 9.498080409928878e-06, + "loss": 0.3726, + "step": 556 + }, + { + "epoch": 0.16999847398138257, + "grad_norm": 1.7675886459458987, + "learning_rate": 9.495919136472595e-06, + "loss": 0.4656, + "step": 557 + }, + { + "epoch": 0.170303677704868, + "grad_norm": 1.7910944298366285, + "learning_rate": 9.493753466752236e-06, + "loss": 0.8076, + "step": 558 + }, + { + "epoch": 0.17060888142835343, + "grad_norm": 1.4678825901279975, + "learning_rate": 9.49158340288547e-06, + "loss": 0.3575, + "step": 559 + }, + { + "epoch": 0.17091408515183884, + "grad_norm": 1.411596350121475, + "learning_rate": 9.489408946994256e-06, + "loss": 0.3316, + "step": 560 + }, + { + "epoch": 0.17121928887532428, + "grad_norm": 1.2961064094904746, + "learning_rate": 9.487230101204855e-06, + "loss": 0.3634, + "step": 561 + }, + { + "epoch": 0.1715244925988097, + "grad_norm": 1.2525457208629842, + "learning_rate": 9.485046867647816e-06, + "loss": 0.368, + "step": 562 + }, + { + "epoch": 0.17182969632229514, + "grad_norm": 1.5857071732762902, + "learning_rate": 9.48285924845798e-06, + "loss": 0.4546, + "step": 563 + }, + { + "epoch": 0.17213490004578055, + "grad_norm": 1.4510716523054648, + "learning_rate": 9.480667245774474e-06, + "loss": 0.2739, + "step": 564 + }, + { + "epoch": 0.172440103769266, + "grad_norm": 1.7890268513821783, + "learning_rate": 9.478470861740716e-06, + "loss": 0.4085, + "step": 565 + }, + { + "epoch": 0.1727453074927514, + "grad_norm": 1.2143129127325427, + "learning_rate": 9.476270098504405e-06, + "loss": 0.2669, + "step": 566 + }, + { + "epoch": 0.17305051121623685, + "grad_norm": 1.4725539275626134, + "learning_rate": 9.474064958217524e-06, + "loss": 0.3474, + "step": 567 + }, + { + "epoch": 0.17335571493972227, + "grad_norm": 1.6648347880329453, + "learning_rate": 9.471855443036333e-06, + "loss": 0.3059, + "step": 568 + }, + { + "epoch": 0.17366091866320768, + "grad_norm": 1.4826208089202084, + "learning_rate": 9.469641555121372e-06, + "loss": 0.3309, + "step": 569 + }, + { + "epoch": 0.17396612238669312, + "grad_norm": 1.2087588985869038, + "learning_rate": 9.467423296637458e-06, + "loss": 0.2765, + "step": 570 + }, + { + "epoch": 0.17427132611017854, + "grad_norm": 1.5271503296745377, + "learning_rate": 9.465200669753678e-06, + "loss": 0.4388, + "step": 571 + }, + { + "epoch": 0.17457652983366398, + "grad_norm": 1.720167996940521, + "learning_rate": 9.462973676643395e-06, + "loss": 0.4693, + "step": 572 + }, + { + "epoch": 0.1748817335571494, + "grad_norm": 1.4666930059033054, + "learning_rate": 9.46074231948424e-06, + "loss": 0.354, + "step": 573 + }, + { + "epoch": 0.17518693728063484, + "grad_norm": 1.8597430337784902, + "learning_rate": 9.458506600458106e-06, + "loss": 0.3892, + "step": 574 + }, + { + "epoch": 0.17549214100412025, + "grad_norm": 0.9696512337091734, + "learning_rate": 9.456266521751162e-06, + "loss": 0.2294, + "step": 575 + }, + { + "epoch": 0.17579734472760566, + "grad_norm": 1.781498807963985, + "learning_rate": 9.454022085553829e-06, + "loss": 0.4873, + "step": 576 + }, + { + "epoch": 0.1761025484510911, + "grad_norm": 1.3337716733106453, + "learning_rate": 9.451773294060797e-06, + "loss": 0.3031, + "step": 577 + }, + { + "epoch": 0.17640775217457652, + "grad_norm": 1.7055986970891146, + "learning_rate": 9.449520149471008e-06, + "loss": 0.6405, + "step": 578 + }, + { + "epoch": 0.17671295589806196, + "grad_norm": 1.5188407211523098, + "learning_rate": 9.447262653987668e-06, + "loss": 0.3739, + "step": 579 + }, + { + "epoch": 0.17701815962154738, + "grad_norm": 0.9308569789252137, + "learning_rate": 9.445000809818231e-06, + "loss": 0.2505, + "step": 580 + }, + { + "epoch": 0.17732336334503282, + "grad_norm": 1.5000700139554115, + "learning_rate": 9.442734619174408e-06, + "loss": 0.4158, + "step": 581 + }, + { + "epoch": 0.17762856706851823, + "grad_norm": 1.3548714588751447, + "learning_rate": 9.440464084272157e-06, + "loss": 0.3911, + "step": 582 + }, + { + "epoch": 0.17793377079200368, + "grad_norm": 1.3241166157002833, + "learning_rate": 9.438189207331684e-06, + "loss": 0.371, + "step": 583 + }, + { + "epoch": 0.1782389745154891, + "grad_norm": 1.251287417238196, + "learning_rate": 9.435909990577442e-06, + "loss": 0.4543, + "step": 584 + }, + { + "epoch": 0.1785441782389745, + "grad_norm": 1.6243898273186124, + "learning_rate": 9.433626436238128e-06, + "loss": 0.3955, + "step": 585 + }, + { + "epoch": 0.17884938196245995, + "grad_norm": 1.5933520250274826, + "learning_rate": 9.43133854654668e-06, + "loss": 0.4232, + "step": 586 + }, + { + "epoch": 0.17915458568594536, + "grad_norm": 1.271604048902552, + "learning_rate": 9.429046323740275e-06, + "loss": 0.186, + "step": 587 + }, + { + "epoch": 0.1794597894094308, + "grad_norm": 1.7480757897399084, + "learning_rate": 9.426749770060325e-06, + "loss": 0.3198, + "step": 588 + }, + { + "epoch": 0.17976499313291622, + "grad_norm": 1.6582908292655634, + "learning_rate": 9.424448887752485e-06, + "loss": 0.4497, + "step": 589 + }, + { + "epoch": 0.18007019685640166, + "grad_norm": 1.514808198729056, + "learning_rate": 9.42214367906663e-06, + "loss": 0.3135, + "step": 590 + }, + { + "epoch": 0.18037540057988707, + "grad_norm": 1.33735933544563, + "learning_rate": 9.419834146256875e-06, + "loss": 0.1512, + "step": 591 + }, + { + "epoch": 0.1806806043033725, + "grad_norm": 1.7983955915325747, + "learning_rate": 9.417520291581562e-06, + "loss": 0.397, + "step": 592 + }, + { + "epoch": 0.18098580802685793, + "grad_norm": 1.8357965942853254, + "learning_rate": 9.415202117303253e-06, + "loss": 0.3479, + "step": 593 + }, + { + "epoch": 0.18129101175034334, + "grad_norm": 2.1098502294891084, + "learning_rate": 9.412879625688742e-06, + "loss": 0.6081, + "step": 594 + }, + { + "epoch": 0.1815962154738288, + "grad_norm": 1.7002717361934219, + "learning_rate": 9.410552819009041e-06, + "loss": 0.2335, + "step": 595 + }, + { + "epoch": 0.1819014191973142, + "grad_norm": 1.6858392243118179, + "learning_rate": 9.408221699539381e-06, + "loss": 0.3502, + "step": 596 + }, + { + "epoch": 0.18220662292079964, + "grad_norm": 1.9779389304442994, + "learning_rate": 9.40588626955921e-06, + "loss": 0.5023, + "step": 597 + }, + { + "epoch": 0.18251182664428506, + "grad_norm": 1.984831269441273, + "learning_rate": 9.403546531352192e-06, + "loss": 0.1808, + "step": 598 + }, + { + "epoch": 0.1828170303677705, + "grad_norm": 1.7825668553552305, + "learning_rate": 9.401202487206205e-06, + "loss": 0.2451, + "step": 599 + }, + { + "epoch": 0.18312223409125591, + "grad_norm": 1.7395479880147604, + "learning_rate": 9.398854139413332e-06, + "loss": 0.4586, + "step": 600 + }, + { + "epoch": 0.18342743781474133, + "grad_norm": 1.7910877075998561, + "learning_rate": 9.396501490269871e-06, + "loss": 0.4334, + "step": 601 + }, + { + "epoch": 0.18373264153822677, + "grad_norm": 1.671153260357796, + "learning_rate": 9.394144542076321e-06, + "loss": 0.3457, + "step": 602 + }, + { + "epoch": 0.18403784526171219, + "grad_norm": 2.2927735747628057, + "learning_rate": 9.391783297137392e-06, + "loss": 0.5006, + "step": 603 + }, + { + "epoch": 0.18434304898519763, + "grad_norm": 1.4375155904173251, + "learning_rate": 9.389417757761983e-06, + "loss": 0.3412, + "step": 604 + }, + { + "epoch": 0.18464825270868304, + "grad_norm": 1.0876679937459988, + "learning_rate": 9.387047926263205e-06, + "loss": 0.2323, + "step": 605 + }, + { + "epoch": 0.18495345643216848, + "grad_norm": 1.5691166969698962, + "learning_rate": 9.384673804958357e-06, + "loss": 0.3929, + "step": 606 + }, + { + "epoch": 0.1852586601556539, + "grad_norm": 1.2508041656129554, + "learning_rate": 9.38229539616894e-06, + "loss": 0.2123, + "step": 607 + }, + { + "epoch": 0.1855638638791393, + "grad_norm": 1.6014009719897135, + "learning_rate": 9.379912702220641e-06, + "loss": 0.234, + "step": 608 + }, + { + "epoch": 0.18586906760262475, + "grad_norm": 1.4804611004553776, + "learning_rate": 9.377525725443341e-06, + "loss": 0.3951, + "step": 609 + }, + { + "epoch": 0.18617427132611017, + "grad_norm": 1.4589508747304376, + "learning_rate": 9.375134468171108e-06, + "loss": 0.2887, + "step": 610 + }, + { + "epoch": 0.1864794750495956, + "grad_norm": 1.9199770728371568, + "learning_rate": 9.372738932742193e-06, + "loss": 0.5627, + "step": 611 + }, + { + "epoch": 0.18678467877308103, + "grad_norm": 1.5903295576095668, + "learning_rate": 9.370339121499039e-06, + "loss": 0.4379, + "step": 612 + }, + { + "epoch": 0.18708988249656647, + "grad_norm": 1.6986262549100166, + "learning_rate": 9.367935036788257e-06, + "loss": 0.4873, + "step": 613 + }, + { + "epoch": 0.18739508622005188, + "grad_norm": 1.2194659257752518, + "learning_rate": 9.365526680960645e-06, + "loss": 0.3571, + "step": 614 + }, + { + "epoch": 0.18770028994353732, + "grad_norm": 1.3049138038493902, + "learning_rate": 9.363114056371178e-06, + "loss": 0.3114, + "step": 615 + }, + { + "epoch": 0.18800549366702274, + "grad_norm": 1.468685879220778, + "learning_rate": 9.360697165379004e-06, + "loss": 0.5043, + "step": 616 + }, + { + "epoch": 0.18831069739050815, + "grad_norm": 2.131454426646245, + "learning_rate": 9.35827601034744e-06, + "loss": 0.5165, + "step": 617 + }, + { + "epoch": 0.1886159011139936, + "grad_norm": 1.5563065422847613, + "learning_rate": 9.355850593643974e-06, + "loss": 0.4707, + "step": 618 + }, + { + "epoch": 0.188921104837479, + "grad_norm": 1.3992788671446874, + "learning_rate": 9.353420917640264e-06, + "loss": 0.3905, + "step": 619 + }, + { + "epoch": 0.18922630856096445, + "grad_norm": 1.2635210704880713, + "learning_rate": 9.350986984712128e-06, + "loss": 0.2399, + "step": 620 + }, + { + "epoch": 0.18953151228444987, + "grad_norm": 1.4071894580574145, + "learning_rate": 9.348548797239551e-06, + "loss": 0.3689, + "step": 621 + }, + { + "epoch": 0.1898367160079353, + "grad_norm": 1.8460324916935194, + "learning_rate": 9.346106357606675e-06, + "loss": 0.3337, + "step": 622 + }, + { + "epoch": 0.19014191973142072, + "grad_norm": 1.2199756132877848, + "learning_rate": 9.343659668201803e-06, + "loss": 0.3707, + "step": 623 + }, + { + "epoch": 0.19044712345490614, + "grad_norm": 1.3352934416971625, + "learning_rate": 9.34120873141739e-06, + "loss": 0.3332, + "step": 624 + }, + { + "epoch": 0.19075232717839158, + "grad_norm": 1.5847999740161538, + "learning_rate": 9.33875354965005e-06, + "loss": 0.4658, + "step": 625 + }, + { + "epoch": 0.191057530901877, + "grad_norm": 1.59267718540602, + "learning_rate": 9.336294125300538e-06, + "loss": 0.5138, + "step": 626 + }, + { + "epoch": 0.19136273462536244, + "grad_norm": 1.0558131110089597, + "learning_rate": 9.333830460773767e-06, + "loss": 0.2512, + "step": 627 + }, + { + "epoch": 0.19166793834884785, + "grad_norm": 1.6912848096424418, + "learning_rate": 9.331362558478793e-06, + "loss": 0.3129, + "step": 628 + }, + { + "epoch": 0.1919731420723333, + "grad_norm": 1.7565199783626735, + "learning_rate": 9.328890420828817e-06, + "loss": 0.2625, + "step": 629 + }, + { + "epoch": 0.1922783457958187, + "grad_norm": 1.6287483120468187, + "learning_rate": 9.326414050241176e-06, + "loss": 0.4631, + "step": 630 + }, + { + "epoch": 0.19258354951930415, + "grad_norm": 1.5343127080699748, + "learning_rate": 9.323933449137353e-06, + "loss": 0.35, + "step": 631 + }, + { + "epoch": 0.19288875324278956, + "grad_norm": 1.2901421851525343, + "learning_rate": 9.321448619942963e-06, + "loss": 0.3191, + "step": 632 + }, + { + "epoch": 0.19319395696627498, + "grad_norm": 1.3651871352420322, + "learning_rate": 9.318959565087761e-06, + "loss": 0.3063, + "step": 633 + }, + { + "epoch": 0.19349916068976042, + "grad_norm": 1.4900191765389657, + "learning_rate": 9.316466287005625e-06, + "loss": 0.3621, + "step": 634 + }, + { + "epoch": 0.19380436441324583, + "grad_norm": 1.836926327897149, + "learning_rate": 9.313968788134572e-06, + "loss": 0.6273, + "step": 635 + }, + { + "epoch": 0.19410956813673128, + "grad_norm": 1.6283757285797815, + "learning_rate": 9.311467070916743e-06, + "loss": 0.3191, + "step": 636 + }, + { + "epoch": 0.1944147718602167, + "grad_norm": 1.7047955780313857, + "learning_rate": 9.308961137798398e-06, + "loss": 0.5581, + "step": 637 + }, + { + "epoch": 0.19471997558370213, + "grad_norm": 1.4826549000090183, + "learning_rate": 9.306450991229927e-06, + "loss": 0.3157, + "step": 638 + }, + { + "epoch": 0.19502517930718755, + "grad_norm": 1.435361017145943, + "learning_rate": 9.30393663366584e-06, + "loss": 0.3084, + "step": 639 + }, + { + "epoch": 0.19533038303067296, + "grad_norm": 1.402358583674702, + "learning_rate": 9.301418067564758e-06, + "loss": 0.3351, + "step": 640 + }, + { + "epoch": 0.1956355867541584, + "grad_norm": 1.2933654176691274, + "learning_rate": 9.298895295389423e-06, + "loss": 0.2585, + "step": 641 + }, + { + "epoch": 0.19594079047764382, + "grad_norm": 1.5346090103364156, + "learning_rate": 9.29636831960669e-06, + "loss": 0.33, + "step": 642 + }, + { + "epoch": 0.19624599420112926, + "grad_norm": 1.3251725534327445, + "learning_rate": 9.293837142687522e-06, + "loss": 0.2104, + "step": 643 + }, + { + "epoch": 0.19655119792461467, + "grad_norm": 1.583285740923444, + "learning_rate": 9.291301767106986e-06, + "loss": 0.4326, + "step": 644 + }, + { + "epoch": 0.19685640164810012, + "grad_norm": 1.0905371753559963, + "learning_rate": 9.288762195344266e-06, + "loss": 0.274, + "step": 645 + }, + { + "epoch": 0.19716160537158553, + "grad_norm": 1.7263838041187525, + "learning_rate": 9.28621842988264e-06, + "loss": 0.5011, + "step": 646 + }, + { + "epoch": 0.19746680909507097, + "grad_norm": 1.4838510492072716, + "learning_rate": 9.283670473209488e-06, + "loss": 0.1956, + "step": 647 + }, + { + "epoch": 0.1977720128185564, + "grad_norm": 1.2036114489558822, + "learning_rate": 9.28111832781629e-06, + "loss": 0.2346, + "step": 648 + }, + { + "epoch": 0.1980772165420418, + "grad_norm": 1.3853687861302435, + "learning_rate": 9.278561996198622e-06, + "loss": 0.2313, + "step": 649 + }, + { + "epoch": 0.19838242026552724, + "grad_norm": 1.2708592907281826, + "learning_rate": 9.276001480856152e-06, + "loss": 0.3717, + "step": 650 + }, + { + "epoch": 0.19868762398901266, + "grad_norm": 1.6407943638167721, + "learning_rate": 9.273436784292641e-06, + "loss": 0.5533, + "step": 651 + }, + { + "epoch": 0.1989928277124981, + "grad_norm": 1.9614104046455092, + "learning_rate": 9.270867909015936e-06, + "loss": 0.4552, + "step": 652 + }, + { + "epoch": 0.1992980314359835, + "grad_norm": 1.6038724290911757, + "learning_rate": 9.268294857537973e-06, + "loss": 0.477, + "step": 653 + }, + { + "epoch": 0.19960323515946896, + "grad_norm": 1.9838192379267598, + "learning_rate": 9.26571763237477e-06, + "loss": 0.7118, + "step": 654 + }, + { + "epoch": 0.19990843888295437, + "grad_norm": 1.4066170955047037, + "learning_rate": 9.263136236046422e-06, + "loss": 0.4222, + "step": 655 + }, + { + "epoch": 0.2002136426064398, + "grad_norm": 1.800111592330467, + "learning_rate": 9.260550671077113e-06, + "loss": 0.4969, + "step": 656 + } + ], + "logging_steps": 1.0, + "max_steps": 3276, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 328, + "total_flos": 81687288602624.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}