{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1001068213032199, "eval_steps": 500, "global_step": 328, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003052037234854265, "grad_norm": 19.476922880741295, "learning_rate": 1.0101010101010103e-07, "loss": 1.1728, "step": 1 }, { "epoch": 0.000610407446970853, "grad_norm": 29.879020388476594, "learning_rate": 2.0202020202020205e-07, "loss": 1.0955, "step": 2 }, { "epoch": 0.0009156111704562796, "grad_norm": 24.931945947136526, "learning_rate": 3.0303030303030305e-07, "loss": 0.9541, "step": 3 }, { "epoch": 0.001220814893941706, "grad_norm": 27.83214939667906, "learning_rate": 4.040404040404041e-07, "loss": 1.0735, "step": 4 }, { "epoch": 0.0015260186174271325, "grad_norm": 21.219233961021736, "learning_rate": 5.05050505050505e-07, "loss": 1.0455, "step": 5 }, { "epoch": 0.0018312223409125592, "grad_norm": 20.022707446211225, "learning_rate": 6.060606060606061e-07, "loss": 0.9675, "step": 6 }, { "epoch": 0.0021364260643979855, "grad_norm": 26.532427830157193, "learning_rate": 7.070707070707071e-07, "loss": 1.1393, "step": 7 }, { "epoch": 0.002441629787883412, "grad_norm": 27.89728780710031, "learning_rate": 8.080808080808082e-07, "loss": 1.0952, "step": 8 }, { "epoch": 0.0027468335113688385, "grad_norm": 20.346264005570532, "learning_rate": 9.090909090909091e-07, "loss": 0.9626, "step": 9 }, { "epoch": 0.003052037234854265, "grad_norm": 18.804489508720884, "learning_rate": 1.01010101010101e-06, "loss": 1.0255, "step": 10 }, { "epoch": 0.003357240958339692, "grad_norm": 19.776534785573535, "learning_rate": 1.111111111111111e-06, "loss": 0.7399, "step": 11 }, { "epoch": 0.0036624446818251184, "grad_norm": 21.16130386460154, "learning_rate": 1.2121212121212122e-06, "loss": 0.5413, "step": 12 }, { "epoch": 0.0039676484053105445, "grad_norm": 16.482713371526263, "learning_rate": 1.3131313131313134e-06, "loss": 0.5773, "step": 13 }, { "epoch": 0.004272852128795971, "grad_norm": 10.780528168770594, "learning_rate": 1.4141414141414143e-06, "loss": 0.6782, "step": 14 }, { "epoch": 0.0045780558522813975, "grad_norm": 7.0900135030469915, "learning_rate": 1.5151515151515152e-06, "loss": 0.9153, "step": 15 }, { "epoch": 0.004883259575766824, "grad_norm": 8.490445320662754, "learning_rate": 1.6161616161616164e-06, "loss": 0.4798, "step": 16 }, { "epoch": 0.0051884632992522505, "grad_norm": 6.677142812986669, "learning_rate": 1.7171717171717173e-06, "loss": 0.4782, "step": 17 }, { "epoch": 0.005493667022737677, "grad_norm": 5.9204247946017485, "learning_rate": 1.8181818181818183e-06, "loss": 0.3191, "step": 18 }, { "epoch": 0.0057988707462231035, "grad_norm": 5.012462343754674, "learning_rate": 1.9191919191919192e-06, "loss": 0.4115, "step": 19 }, { "epoch": 0.00610407446970853, "grad_norm": 3.9095937836899113, "learning_rate": 2.02020202020202e-06, "loss": 0.6158, "step": 20 }, { "epoch": 0.006409278193193957, "grad_norm": 4.438163815129716, "learning_rate": 2.1212121212121216e-06, "loss": 0.7388, "step": 21 }, { "epoch": 0.006714481916679384, "grad_norm": 3.62875198348435, "learning_rate": 2.222222222222222e-06, "loss": 0.2875, "step": 22 }, { "epoch": 0.00701968564016481, "grad_norm": 4.963543929599541, "learning_rate": 2.3232323232323234e-06, "loss": 0.4662, "step": 23 }, { "epoch": 0.007324889363650237, "grad_norm": 4.274904100558248, "learning_rate": 2.4242424242424244e-06, "loss": 0.5171, "step": 24 }, { "epoch": 0.007630093087135663, "grad_norm": 2.670885047669819, "learning_rate": 2.5252525252525258e-06, "loss": 0.4488, "step": 25 }, { "epoch": 0.007935296810621089, "grad_norm": 2.6864388610994014, "learning_rate": 2.6262626262626267e-06, "loss": 0.372, "step": 26 }, { "epoch": 0.008240500534106516, "grad_norm": 3.804357369452407, "learning_rate": 2.7272727272727272e-06, "loss": 0.2646, "step": 27 }, { "epoch": 0.008545704257591942, "grad_norm": 4.059008227452532, "learning_rate": 2.8282828282828286e-06, "loss": 0.5907, "step": 28 }, { "epoch": 0.008850907981077369, "grad_norm": 4.9062443629918855, "learning_rate": 2.9292929292929295e-06, "loss": 0.2972, "step": 29 }, { "epoch": 0.009156111704562795, "grad_norm": 3.5391495380267064, "learning_rate": 3.0303030303030305e-06, "loss": 0.3821, "step": 30 }, { "epoch": 0.009461315428048222, "grad_norm": 2.5896920322264854, "learning_rate": 3.131313131313132e-06, "loss": 0.4164, "step": 31 }, { "epoch": 0.009766519151533648, "grad_norm": 3.0230775761822937, "learning_rate": 3.232323232323233e-06, "loss": 0.4237, "step": 32 }, { "epoch": 0.010071722875019075, "grad_norm": 2.8417717057519423, "learning_rate": 3.3333333333333333e-06, "loss": 0.3353, "step": 33 }, { "epoch": 0.010376926598504501, "grad_norm": 2.5789157463945878, "learning_rate": 3.4343434343434347e-06, "loss": 0.3769, "step": 34 }, { "epoch": 0.010682130321989928, "grad_norm": 2.5222241581850096, "learning_rate": 3.5353535353535356e-06, "loss": 0.519, "step": 35 }, { "epoch": 0.010987334045475354, "grad_norm": 2.8704682168269127, "learning_rate": 3.6363636363636366e-06, "loss": 0.2829, "step": 36 }, { "epoch": 0.01129253776896078, "grad_norm": 3.24684532820184, "learning_rate": 3.737373737373738e-06, "loss": 0.3586, "step": 37 }, { "epoch": 0.011597741492446207, "grad_norm": 5.24792475783676, "learning_rate": 3.8383838383838385e-06, "loss": 0.402, "step": 38 }, { "epoch": 0.011902945215931634, "grad_norm": 3.111184671834165, "learning_rate": 3.93939393939394e-06, "loss": 0.466, "step": 39 }, { "epoch": 0.01220814893941706, "grad_norm": 3.165565566985893, "learning_rate": 4.04040404040404e-06, "loss": 0.2678, "step": 40 }, { "epoch": 0.012513352662902488, "grad_norm": 2.5486933296193257, "learning_rate": 4.141414141414142e-06, "loss": 0.5457, "step": 41 }, { "epoch": 0.012818556386387915, "grad_norm": 3.4373721012250438, "learning_rate": 4.242424242424243e-06, "loss": 0.3862, "step": 42 }, { "epoch": 0.013123760109873341, "grad_norm": 2.863317221380458, "learning_rate": 4.343434343434344e-06, "loss": 0.3601, "step": 43 }, { "epoch": 0.013428963833358768, "grad_norm": 2.1041128573446035, "learning_rate": 4.444444444444444e-06, "loss": 0.3693, "step": 44 }, { "epoch": 0.013734167556844194, "grad_norm": 2.286990324679626, "learning_rate": 4.5454545454545455e-06, "loss": 0.2513, "step": 45 }, { "epoch": 0.01403937128032962, "grad_norm": 8.793466778432636, "learning_rate": 4.646464646464647e-06, "loss": 0.4343, "step": 46 }, { "epoch": 0.014344575003815047, "grad_norm": 1.8648737533834159, "learning_rate": 4.747474747474748e-06, "loss": 0.2631, "step": 47 }, { "epoch": 0.014649778727300474, "grad_norm": 2.3081781364995324, "learning_rate": 4.848484848484849e-06, "loss": 0.2755, "step": 48 }, { "epoch": 0.0149549824507859, "grad_norm": 2.284005369243557, "learning_rate": 4.94949494949495e-06, "loss": 0.4186, "step": 49 }, { "epoch": 0.015260186174271327, "grad_norm": 2.6759709423238096, "learning_rate": 5.0505050505050515e-06, "loss": 0.6459, "step": 50 }, { "epoch": 0.015565389897756753, "grad_norm": 2.8773749120652523, "learning_rate": 5.151515151515152e-06, "loss": 0.3324, "step": 51 }, { "epoch": 0.015870593621242178, "grad_norm": 2.8060164424498786, "learning_rate": 5.252525252525253e-06, "loss": 0.3608, "step": 52 }, { "epoch": 0.016175797344727606, "grad_norm": 2.3060494229726793, "learning_rate": 5.353535353535354e-06, "loss": 0.3818, "step": 53 }, { "epoch": 0.01648100106821303, "grad_norm": 2.073464811557714, "learning_rate": 5.4545454545454545e-06, "loss": 0.2667, "step": 54 }, { "epoch": 0.01678620479169846, "grad_norm": 2.3474749655399245, "learning_rate": 5.555555555555557e-06, "loss": 0.35, "step": 55 }, { "epoch": 0.017091408515183884, "grad_norm": 3.6988890036672086, "learning_rate": 5.656565656565657e-06, "loss": 0.284, "step": 56 }, { "epoch": 0.017396612238669312, "grad_norm": 2.313501192849839, "learning_rate": 5.7575757575757586e-06, "loss": 0.3308, "step": 57 }, { "epoch": 0.017701815962154737, "grad_norm": 2.411936098122121, "learning_rate": 5.858585858585859e-06, "loss": 0.3982, "step": 58 }, { "epoch": 0.018007019685640165, "grad_norm": 2.724660127775508, "learning_rate": 5.95959595959596e-06, "loss": 0.3587, "step": 59 }, { "epoch": 0.01831222340912559, "grad_norm": 3.130895013540925, "learning_rate": 6.060606060606061e-06, "loss": 0.3427, "step": 60 }, { "epoch": 0.01861742713261102, "grad_norm": 3.4261489723004614, "learning_rate": 6.1616161616161615e-06, "loss": 0.4578, "step": 61 }, { "epoch": 0.018922630856096443, "grad_norm": 2.413871881063889, "learning_rate": 6.262626262626264e-06, "loss": 0.2067, "step": 62 }, { "epoch": 0.01922783457958187, "grad_norm": 2.0941348505038366, "learning_rate": 6.363636363636364e-06, "loss": 0.27, "step": 63 }, { "epoch": 0.019533038303067296, "grad_norm": 2.2153240133926153, "learning_rate": 6.464646464646466e-06, "loss": 0.3298, "step": 64 }, { "epoch": 0.019838242026552724, "grad_norm": 2.422022070572305, "learning_rate": 6.565656565656566e-06, "loss": 0.4894, "step": 65 }, { "epoch": 0.02014344575003815, "grad_norm": 2.45442660843552, "learning_rate": 6.666666666666667e-06, "loss": 0.3684, "step": 66 }, { "epoch": 0.020448649473523577, "grad_norm": 3.5398238081108304, "learning_rate": 6.767676767676769e-06, "loss": 0.4233, "step": 67 }, { "epoch": 0.020753853197009002, "grad_norm": 2.530397719080883, "learning_rate": 6.868686868686869e-06, "loss": 0.2676, "step": 68 }, { "epoch": 0.02105905692049443, "grad_norm": 2.259346305696615, "learning_rate": 6.969696969696971e-06, "loss": 0.4409, "step": 69 }, { "epoch": 0.021364260643979855, "grad_norm": 2.3339543424453764, "learning_rate": 7.070707070707071e-06, "loss": 0.3882, "step": 70 }, { "epoch": 0.021669464367465283, "grad_norm": 2.348843038116063, "learning_rate": 7.171717171717172e-06, "loss": 0.3904, "step": 71 }, { "epoch": 0.021974668090950708, "grad_norm": 2.7011363922899965, "learning_rate": 7.272727272727273e-06, "loss": 0.3586, "step": 72 }, { "epoch": 0.022279871814436136, "grad_norm": 2.6923381814173486, "learning_rate": 7.373737373737374e-06, "loss": 0.4331, "step": 73 }, { "epoch": 0.02258507553792156, "grad_norm": 2.0435337430530924, "learning_rate": 7.474747474747476e-06, "loss": 0.2739, "step": 74 }, { "epoch": 0.02289027926140699, "grad_norm": 2.257183264462076, "learning_rate": 7.5757575757575764e-06, "loss": 0.4554, "step": 75 }, { "epoch": 0.023195482984892414, "grad_norm": 2.5384248372961626, "learning_rate": 7.676767676767677e-06, "loss": 0.4934, "step": 76 }, { "epoch": 0.023500686708377842, "grad_norm": 2.1578730127908488, "learning_rate": 7.77777777777778e-06, "loss": 0.3519, "step": 77 }, { "epoch": 0.023805890431863267, "grad_norm": 2.1316764516757476, "learning_rate": 7.87878787878788e-06, "loss": 0.3268, "step": 78 }, { "epoch": 0.024111094155348695, "grad_norm": 2.095996278024237, "learning_rate": 7.97979797979798e-06, "loss": 0.3318, "step": 79 }, { "epoch": 0.02441629787883412, "grad_norm": 1.9985574049541877, "learning_rate": 8.08080808080808e-06, "loss": 0.1852, "step": 80 }, { "epoch": 0.02472150160231955, "grad_norm": 1.7092921737326583, "learning_rate": 8.181818181818183e-06, "loss": 0.2412, "step": 81 }, { "epoch": 0.025026705325804977, "grad_norm": 1.9609482601524066, "learning_rate": 8.282828282828283e-06, "loss": 0.3349, "step": 82 }, { "epoch": 0.0253319090492904, "grad_norm": 2.5619254980161412, "learning_rate": 8.383838383838384e-06, "loss": 0.3327, "step": 83 }, { "epoch": 0.02563711277277583, "grad_norm": 2.1734116421771827, "learning_rate": 8.484848484848486e-06, "loss": 0.5005, "step": 84 }, { "epoch": 0.025942316496261254, "grad_norm": 2.4612836321871785, "learning_rate": 8.585858585858587e-06, "loss": 0.5919, "step": 85 }, { "epoch": 0.026247520219746683, "grad_norm": 2.050264187978962, "learning_rate": 8.686868686868687e-06, "loss": 0.2654, "step": 86 }, { "epoch": 0.026552723943232107, "grad_norm": 1.7466792206761999, "learning_rate": 8.787878787878788e-06, "loss": 0.2875, "step": 87 }, { "epoch": 0.026857927666717536, "grad_norm": 1.9114055019911376, "learning_rate": 8.888888888888888e-06, "loss": 0.3317, "step": 88 }, { "epoch": 0.02716313139020296, "grad_norm": 2.136028617695754, "learning_rate": 8.98989898989899e-06, "loss": 0.4322, "step": 89 }, { "epoch": 0.02746833511368839, "grad_norm": 2.0559196693817303, "learning_rate": 9.090909090909091e-06, "loss": 0.3372, "step": 90 }, { "epoch": 0.027773538837173813, "grad_norm": 1.6053810559753854, "learning_rate": 9.191919191919193e-06, "loss": 0.2833, "step": 91 }, { "epoch": 0.02807874256065924, "grad_norm": 1.9190338968500587, "learning_rate": 9.292929292929294e-06, "loss": 0.2358, "step": 92 }, { "epoch": 0.028383946284144666, "grad_norm": 1.7424429804531956, "learning_rate": 9.393939393939396e-06, "loss": 0.2805, "step": 93 }, { "epoch": 0.028689150007630095, "grad_norm": 1.5616301594921251, "learning_rate": 9.494949494949497e-06, "loss": 0.326, "step": 94 }, { "epoch": 0.02899435373111552, "grad_norm": 2.6517363851490297, "learning_rate": 9.595959595959597e-06, "loss": 0.5839, "step": 95 }, { "epoch": 0.029299557454600948, "grad_norm": 1.9068377479857994, "learning_rate": 9.696969696969698e-06, "loss": 0.4213, "step": 96 }, { "epoch": 0.029604761178086372, "grad_norm": 2.147263972819766, "learning_rate": 9.797979797979798e-06, "loss": 0.3776, "step": 97 }, { "epoch": 0.0299099649015718, "grad_norm": 2.3466004395170685, "learning_rate": 9.8989898989899e-06, "loss": 0.4828, "step": 98 }, { "epoch": 0.030215168625057225, "grad_norm": 1.9328188798162316, "learning_rate": 1e-05, "loss": 0.3816, "step": 99 }, { "epoch": 0.030520372348542654, "grad_norm": 2.120656679761712, "learning_rate": 9.999997555414177e-06, "loss": 0.287, "step": 100 }, { "epoch": 0.03082557607202808, "grad_norm": 1.8272767014289886, "learning_rate": 9.999990221659095e-06, "loss": 0.2529, "step": 101 }, { "epoch": 0.031130779795513507, "grad_norm": 2.108876035097533, "learning_rate": 9.999977998741925e-06, "loss": 0.4, "step": 102 }, { "epoch": 0.031435983518998935, "grad_norm": 2.611227326027621, "learning_rate": 9.999960886674623e-06, "loss": 0.5577, "step": 103 }, { "epoch": 0.031741187242484356, "grad_norm": 2.012760226088087, "learning_rate": 9.999938885473916e-06, "loss": 0.2397, "step": 104 }, { "epoch": 0.032046390965969784, "grad_norm": 3.4069313977643088, "learning_rate": 9.999911995161323e-06, "loss": 0.3074, "step": 105 }, { "epoch": 0.03235159468945521, "grad_norm": 1.5281487804348939, "learning_rate": 9.999880215763133e-06, "loss": 0.306, "step": 106 }, { "epoch": 0.03265679841294064, "grad_norm": 1.5733903167529437, "learning_rate": 9.999843547310427e-06, "loss": 0.3123, "step": 107 }, { "epoch": 0.03296200213642606, "grad_norm": 2.2084260837102776, "learning_rate": 9.999801989839055e-06, "loss": 0.2686, "step": 108 }, { "epoch": 0.03326720585991149, "grad_norm": 2.0235527329790477, "learning_rate": 9.999755543389658e-06, "loss": 0.362, "step": 109 }, { "epoch": 0.03357240958339692, "grad_norm": 1.4126246608311444, "learning_rate": 9.999704208007647e-06, "loss": 0.1868, "step": 110 }, { "epoch": 0.03387761330688235, "grad_norm": 1.9363750145032863, "learning_rate": 9.999647983743227e-06, "loss": 0.4674, "step": 111 }, { "epoch": 0.03418281703036777, "grad_norm": 2.306492812857686, "learning_rate": 9.999586870651372e-06, "loss": 0.7454, "step": 112 }, { "epoch": 0.034488020753853196, "grad_norm": 1.9927578577114744, "learning_rate": 9.999520868791839e-06, "loss": 0.2964, "step": 113 }, { "epoch": 0.034793224477338625, "grad_norm": 2.897230200199283, "learning_rate": 9.99944997822917e-06, "loss": 0.3507, "step": 114 }, { "epoch": 0.03509842820082405, "grad_norm": 1.7040567211820554, "learning_rate": 9.999374199032682e-06, "loss": 0.358, "step": 115 }, { "epoch": 0.035403631924309474, "grad_norm": 1.7684725864001616, "learning_rate": 9.999293531276475e-06, "loss": 0.469, "step": 116 }, { "epoch": 0.0357088356477949, "grad_norm": 2.151331613378997, "learning_rate": 9.999207975039429e-06, "loss": 0.4007, "step": 117 }, { "epoch": 0.03601403937128033, "grad_norm": 2.1827006415812678, "learning_rate": 9.999117530405205e-06, "loss": 0.373, "step": 118 }, { "epoch": 0.03631924309476576, "grad_norm": 2.0424756244526283, "learning_rate": 9.99902219746224e-06, "loss": 0.4664, "step": 119 }, { "epoch": 0.03662444681825118, "grad_norm": 2.4438750213097014, "learning_rate": 9.998921976303757e-06, "loss": 0.5884, "step": 120 }, { "epoch": 0.03692965054173661, "grad_norm": 1.6168805259489245, "learning_rate": 9.998816867027753e-06, "loss": 0.3874, "step": 121 }, { "epoch": 0.03723485426522204, "grad_norm": 2.4836564854380914, "learning_rate": 9.99870686973701e-06, "loss": 0.3865, "step": 122 }, { "epoch": 0.037540057988707465, "grad_norm": 2.187549263535683, "learning_rate": 9.998591984539085e-06, "loss": 0.4419, "step": 123 }, { "epoch": 0.037845261712192886, "grad_norm": 2.3145724108896366, "learning_rate": 9.998472211546317e-06, "loss": 0.5048, "step": 124 }, { "epoch": 0.038150465435678314, "grad_norm": 2.6043824271784377, "learning_rate": 9.998347550875825e-06, "loss": 0.4323, "step": 125 }, { "epoch": 0.03845566915916374, "grad_norm": 1.7266964407358079, "learning_rate": 9.998218002649507e-06, "loss": 0.3093, "step": 126 }, { "epoch": 0.03876087288264917, "grad_norm": 2.3091863655820397, "learning_rate": 9.99808356699404e-06, "loss": 0.5394, "step": 127 }, { "epoch": 0.03906607660613459, "grad_norm": 2.178584103245907, "learning_rate": 9.997944244040877e-06, "loss": 0.562, "step": 128 }, { "epoch": 0.03937128032962002, "grad_norm": 1.4762803065381216, "learning_rate": 9.997800033926252e-06, "loss": 0.3012, "step": 129 }, { "epoch": 0.03967648405310545, "grad_norm": 1.6768704233807339, "learning_rate": 9.997650936791183e-06, "loss": 0.3314, "step": 130 }, { "epoch": 0.03998168777659088, "grad_norm": 1.8423584681568375, "learning_rate": 9.997496952781461e-06, "loss": 0.5373, "step": 131 }, { "epoch": 0.0402868915000763, "grad_norm": 1.4926628434179245, "learning_rate": 9.997338082047656e-06, "loss": 0.1992, "step": 132 }, { "epoch": 0.040592095223561726, "grad_norm": 1.6323074947028773, "learning_rate": 9.997174324745117e-06, "loss": 0.4872, "step": 133 }, { "epoch": 0.040897298947047155, "grad_norm": 2.159688005520465, "learning_rate": 9.997005681033973e-06, "loss": 0.5076, "step": 134 }, { "epoch": 0.04120250267053258, "grad_norm": 2.207163038792008, "learning_rate": 9.996832151079127e-06, "loss": 0.2677, "step": 135 }, { "epoch": 0.041507706394018004, "grad_norm": 1.3990677420334965, "learning_rate": 9.996653735050265e-06, "loss": 0.2526, "step": 136 }, { "epoch": 0.04181291011750343, "grad_norm": 1.7368886105229604, "learning_rate": 9.996470433121847e-06, "loss": 0.2874, "step": 137 }, { "epoch": 0.04211811384098886, "grad_norm": 1.8138446424045762, "learning_rate": 9.996282245473113e-06, "loss": 0.2986, "step": 138 }, { "epoch": 0.04242331756447429, "grad_norm": 1.8564789601928355, "learning_rate": 9.996089172288078e-06, "loss": 0.3954, "step": 139 }, { "epoch": 0.04272852128795971, "grad_norm": 1.9085920361180522, "learning_rate": 9.995891213755536e-06, "loss": 0.2739, "step": 140 }, { "epoch": 0.04303372501144514, "grad_norm": 1.8924678931794556, "learning_rate": 9.99568837006906e-06, "loss": 0.2766, "step": 141 }, { "epoch": 0.04333892873493057, "grad_norm": 1.8418836037208652, "learning_rate": 9.995480641426992e-06, "loss": 0.488, "step": 142 }, { "epoch": 0.043644132458415995, "grad_norm": 1.6305125707231247, "learning_rate": 9.99526802803246e-06, "loss": 0.3045, "step": 143 }, { "epoch": 0.043949336181901416, "grad_norm": 2.143051665423358, "learning_rate": 9.995050530093366e-06, "loss": 0.3567, "step": 144 }, { "epoch": 0.044254539905386844, "grad_norm": 1.994194545633334, "learning_rate": 9.994828147822387e-06, "loss": 0.3655, "step": 145 }, { "epoch": 0.04455974362887227, "grad_norm": 1.8553346605537173, "learning_rate": 9.994600881436972e-06, "loss": 0.3249, "step": 146 }, { "epoch": 0.0448649473523577, "grad_norm": 2.1613773805709857, "learning_rate": 9.994368731159351e-06, "loss": 0.4863, "step": 147 }, { "epoch": 0.04517015107584312, "grad_norm": 2.199571706523493, "learning_rate": 9.99413169721653e-06, "loss": 0.465, "step": 148 }, { "epoch": 0.04547535479932855, "grad_norm": 1.681707967900651, "learning_rate": 9.99388977984029e-06, "loss": 0.3472, "step": 149 }, { "epoch": 0.04578055852281398, "grad_norm": 1.6586587053140593, "learning_rate": 9.993642979267184e-06, "loss": 0.3626, "step": 150 }, { "epoch": 0.04608576224629941, "grad_norm": 2.12592721793332, "learning_rate": 9.993391295738542e-06, "loss": 0.3218, "step": 151 }, { "epoch": 0.04639096596978483, "grad_norm": 1.6765944279655143, "learning_rate": 9.99313472950047e-06, "loss": 0.3402, "step": 152 }, { "epoch": 0.046696169693270256, "grad_norm": 1.6019038139070678, "learning_rate": 9.992873280803848e-06, "loss": 0.4554, "step": 153 }, { "epoch": 0.047001373416755685, "grad_norm": 1.6429860881882794, "learning_rate": 9.99260694990433e-06, "loss": 0.4086, "step": 154 }, { "epoch": 0.04730657714024111, "grad_norm": 1.98592334325083, "learning_rate": 9.992335737062338e-06, "loss": 0.5733, "step": 155 }, { "epoch": 0.047611780863726534, "grad_norm": 1.5624846648417388, "learning_rate": 9.992059642543076e-06, "loss": 0.2524, "step": 156 }, { "epoch": 0.04791698458721196, "grad_norm": 1.4438198320418865, "learning_rate": 9.991778666616523e-06, "loss": 0.1756, "step": 157 }, { "epoch": 0.04822218831069739, "grad_norm": 1.6284817295660008, "learning_rate": 9.991492809557424e-06, "loss": 0.4144, "step": 158 }, { "epoch": 0.04852739203418282, "grad_norm": 1.2236340789910145, "learning_rate": 9.991202071645298e-06, "loss": 0.1664, "step": 159 }, { "epoch": 0.04883259575766824, "grad_norm": 1.4874398163232816, "learning_rate": 9.99090645316444e-06, "loss": 0.3323, "step": 160 }, { "epoch": 0.04913779948115367, "grad_norm": 2.5394515927833403, "learning_rate": 9.990605954403917e-06, "loss": 0.27, "step": 161 }, { "epoch": 0.0494430032046391, "grad_norm": 1.7966332314422868, "learning_rate": 9.990300575657565e-06, "loss": 0.4453, "step": 162 }, { "epoch": 0.049748206928124525, "grad_norm": 1.825976682624809, "learning_rate": 9.989990317223995e-06, "loss": 0.2646, "step": 163 }, { "epoch": 0.05005341065160995, "grad_norm": 1.6554541925183588, "learning_rate": 9.989675179406588e-06, "loss": 0.445, "step": 164 }, { "epoch": 0.050358614375095374, "grad_norm": 1.6711133844293076, "learning_rate": 9.989355162513496e-06, "loss": 0.3685, "step": 165 }, { "epoch": 0.0506638180985808, "grad_norm": 1.8033315345252203, "learning_rate": 9.989030266857644e-06, "loss": 0.2566, "step": 166 }, { "epoch": 0.05096902182206623, "grad_norm": 1.6879852444966537, "learning_rate": 9.988700492756726e-06, "loss": 0.4086, "step": 167 }, { "epoch": 0.05127422554555166, "grad_norm": 1.6855038740169574, "learning_rate": 9.988365840533204e-06, "loss": 0.3081, "step": 168 }, { "epoch": 0.05157942926903708, "grad_norm": 2.245121010490438, "learning_rate": 9.988026310514316e-06, "loss": 0.5646, "step": 169 }, { "epoch": 0.05188463299252251, "grad_norm": 1.531117336209479, "learning_rate": 9.987681903032065e-06, "loss": 0.3598, "step": 170 }, { "epoch": 0.05218983671600794, "grad_norm": 1.4368727600956301, "learning_rate": 9.987332618423221e-06, "loss": 0.3864, "step": 171 }, { "epoch": 0.052495040439493365, "grad_norm": 2.039026486601271, "learning_rate": 9.98697845702933e-06, "loss": 0.2728, "step": 172 }, { "epoch": 0.052800244162978786, "grad_norm": 1.5481974795842472, "learning_rate": 9.986619419196704e-06, "loss": 0.2376, "step": 173 }, { "epoch": 0.053105447886464215, "grad_norm": 1.583025735121783, "learning_rate": 9.986255505276418e-06, "loss": 0.3941, "step": 174 }, { "epoch": 0.05341065160994964, "grad_norm": 2.025610033619695, "learning_rate": 9.985886715624326e-06, "loss": 0.432, "step": 175 }, { "epoch": 0.05371585533343507, "grad_norm": 1.9370365819159912, "learning_rate": 9.985513050601037e-06, "loss": 0.3311, "step": 176 }, { "epoch": 0.05402105905692049, "grad_norm": 1.534591376747653, "learning_rate": 9.985134510571936e-06, "loss": 0.3804, "step": 177 }, { "epoch": 0.05432626278040592, "grad_norm": 1.5627980520171343, "learning_rate": 9.984751095907175e-06, "loss": 0.3991, "step": 178 }, { "epoch": 0.05463146650389135, "grad_norm": 1.858760828475349, "learning_rate": 9.984362806981665e-06, "loss": 0.4124, "step": 179 }, { "epoch": 0.05493667022737678, "grad_norm": 1.4922057145689682, "learning_rate": 9.983969644175092e-06, "loss": 0.2571, "step": 180 }, { "epoch": 0.0552418739508622, "grad_norm": 1.4358215484460224, "learning_rate": 9.983571607871903e-06, "loss": 0.3351, "step": 181 }, { "epoch": 0.05554707767434763, "grad_norm": 1.7105120125454414, "learning_rate": 9.983168698461312e-06, "loss": 0.4374, "step": 182 }, { "epoch": 0.055852281397833055, "grad_norm": 1.4100459259074987, "learning_rate": 9.982760916337296e-06, "loss": 0.3958, "step": 183 }, { "epoch": 0.05615748512131848, "grad_norm": 1.667173817085955, "learning_rate": 9.982348261898598e-06, "loss": 0.2867, "step": 184 }, { "epoch": 0.056462688844803904, "grad_norm": 1.8278737995984025, "learning_rate": 9.981930735548731e-06, "loss": 0.3738, "step": 185 }, { "epoch": 0.05676789256828933, "grad_norm": 1.806852289121097, "learning_rate": 9.98150833769596e-06, "loss": 0.5608, "step": 186 }, { "epoch": 0.05707309629177476, "grad_norm": 1.6986308867720055, "learning_rate": 9.981081068753324e-06, "loss": 0.4253, "step": 187 }, { "epoch": 0.05737830001526019, "grad_norm": 1.6392088091109513, "learning_rate": 9.98064892913862e-06, "loss": 0.2444, "step": 188 }, { "epoch": 0.05768350373874561, "grad_norm": 1.7762995408711126, "learning_rate": 9.980211919274407e-06, "loss": 0.3866, "step": 189 }, { "epoch": 0.05798870746223104, "grad_norm": 1.7144647062044762, "learning_rate": 9.979770039588013e-06, "loss": 0.4504, "step": 190 }, { "epoch": 0.05829391118571647, "grad_norm": 1.9069269572943617, "learning_rate": 9.979323290511517e-06, "loss": 0.4972, "step": 191 }, { "epoch": 0.058599114909201895, "grad_norm": 1.831943664409223, "learning_rate": 9.978871672481774e-06, "loss": 0.3884, "step": 192 }, { "epoch": 0.058904318632687316, "grad_norm": 1.60483584957947, "learning_rate": 9.978415185940383e-06, "loss": 0.3366, "step": 193 }, { "epoch": 0.059209522356172745, "grad_norm": 2.041633475935638, "learning_rate": 9.977953831333718e-06, "loss": 0.4928, "step": 194 }, { "epoch": 0.05951472607965817, "grad_norm": 2.1574861604284243, "learning_rate": 9.977487609112904e-06, "loss": 0.7092, "step": 195 }, { "epoch": 0.0598199298031436, "grad_norm": 1.5382345073334531, "learning_rate": 9.97701651973383e-06, "loss": 0.2236, "step": 196 }, { "epoch": 0.06012513352662902, "grad_norm": 2.1479787995768014, "learning_rate": 9.976540563657143e-06, "loss": 0.5182, "step": 197 }, { "epoch": 0.06043033725011445, "grad_norm": 1.8579437774142544, "learning_rate": 9.976059741348252e-06, "loss": 0.3093, "step": 198 }, { "epoch": 0.06073554097359988, "grad_norm": 1.5409701380525285, "learning_rate": 9.975574053277317e-06, "loss": 0.2877, "step": 199 }, { "epoch": 0.06104074469708531, "grad_norm": 1.5474598097011698, "learning_rate": 9.975083499919264e-06, "loss": 0.2981, "step": 200 }, { "epoch": 0.06134594842057073, "grad_norm": 1.9202152932180157, "learning_rate": 9.974588081753773e-06, "loss": 0.5369, "step": 201 }, { "epoch": 0.06165115214405616, "grad_norm": 1.4598442515817716, "learning_rate": 9.974087799265279e-06, "loss": 0.3696, "step": 202 }, { "epoch": 0.061956355867541585, "grad_norm": 1.48078814360119, "learning_rate": 9.973582652942975e-06, "loss": 0.284, "step": 203 }, { "epoch": 0.06226155959102701, "grad_norm": 2.100326004155181, "learning_rate": 9.973072643280813e-06, "loss": 0.5681, "step": 204 }, { "epoch": 0.06256676331451244, "grad_norm": 1.976128330719915, "learning_rate": 9.972557770777496e-06, "loss": 0.3655, "step": 205 }, { "epoch": 0.06287196703799787, "grad_norm": 1.2103730393566896, "learning_rate": 9.972038035936483e-06, "loss": 0.2471, "step": 206 }, { "epoch": 0.06317717076148328, "grad_norm": 1.670449906238349, "learning_rate": 9.971513439265992e-06, "loss": 0.2184, "step": 207 }, { "epoch": 0.06348237448496871, "grad_norm": 1.5020544764497652, "learning_rate": 9.970983981278989e-06, "loss": 0.3196, "step": 208 }, { "epoch": 0.06378757820845414, "grad_norm": 1.7833251911345853, "learning_rate": 9.970449662493195e-06, "loss": 0.4122, "step": 209 }, { "epoch": 0.06409278193193957, "grad_norm": 1.4149595334362772, "learning_rate": 9.96991048343109e-06, "loss": 0.2947, "step": 210 }, { "epoch": 0.064397985655425, "grad_norm": 1.5991867680932033, "learning_rate": 9.969366444619898e-06, "loss": 0.1902, "step": 211 }, { "epoch": 0.06470318937891043, "grad_norm": 1.4132064841734169, "learning_rate": 9.968817546591601e-06, "loss": 0.3389, "step": 212 }, { "epoch": 0.06500839310239585, "grad_norm": 1.7671902900221814, "learning_rate": 9.968263789882926e-06, "loss": 0.4294, "step": 213 }, { "epoch": 0.06531359682588128, "grad_norm": 1.5709821497329826, "learning_rate": 9.96770517503536e-06, "loss": 0.2765, "step": 214 }, { "epoch": 0.0656188005493667, "grad_norm": 1.5211731343844295, "learning_rate": 9.967141702595134e-06, "loss": 0.387, "step": 215 }, { "epoch": 0.06592400427285212, "grad_norm": 1.5499265222668686, "learning_rate": 9.96657337311323e-06, "loss": 0.4535, "step": 216 }, { "epoch": 0.06622920799633755, "grad_norm": 1.4736546539447488, "learning_rate": 9.966000187145383e-06, "loss": 0.3834, "step": 217 }, { "epoch": 0.06653441171982298, "grad_norm": 1.3306288958233108, "learning_rate": 9.965422145252072e-06, "loss": 0.3172, "step": 218 }, { "epoch": 0.06683961544330841, "grad_norm": 1.5745937005003143, "learning_rate": 9.964839247998524e-06, "loss": 0.2725, "step": 219 }, { "epoch": 0.06714481916679384, "grad_norm": 1.7546511557153388, "learning_rate": 9.96425149595472e-06, "loss": 0.3577, "step": 220 }, { "epoch": 0.06745002289027927, "grad_norm": 2.0422588449754286, "learning_rate": 9.96365888969538e-06, "loss": 0.4976, "step": 221 }, { "epoch": 0.0677552266137647, "grad_norm": 1.4661824124133862, "learning_rate": 9.963061429799979e-06, "loss": 0.3672, "step": 222 }, { "epoch": 0.06806043033725011, "grad_norm": 2.0959067552369666, "learning_rate": 9.96245911685273e-06, "loss": 0.5381, "step": 223 }, { "epoch": 0.06836563406073554, "grad_norm": 1.3296813372997014, "learning_rate": 9.961851951442599e-06, "loss": 0.2799, "step": 224 }, { "epoch": 0.06867083778422096, "grad_norm": 1.7385807765114274, "learning_rate": 9.96123993416329e-06, "loss": 0.5183, "step": 225 }, { "epoch": 0.06897604150770639, "grad_norm": 1.5190119701865645, "learning_rate": 9.960623065613254e-06, "loss": 0.4608, "step": 226 }, { "epoch": 0.06928124523119182, "grad_norm": 1.4393894383331207, "learning_rate": 9.96000134639569e-06, "loss": 0.3455, "step": 227 }, { "epoch": 0.06958644895467725, "grad_norm": 1.7132863682619555, "learning_rate": 9.959374777118533e-06, "loss": 0.316, "step": 228 }, { "epoch": 0.06989165267816268, "grad_norm": 1.3227120889592454, "learning_rate": 9.958743358394464e-06, "loss": 0.2467, "step": 229 }, { "epoch": 0.0701968564016481, "grad_norm": 1.5331153407144422, "learning_rate": 9.95810709084091e-06, "loss": 0.3138, "step": 230 }, { "epoch": 0.07050206012513352, "grad_norm": 1.7990748995190806, "learning_rate": 9.957465975080031e-06, "loss": 0.4747, "step": 231 }, { "epoch": 0.07080726384861895, "grad_norm": 1.1638981235859056, "learning_rate": 9.956820011738736e-06, "loss": 0.2265, "step": 232 }, { "epoch": 0.07111246757210438, "grad_norm": 1.5739388418179414, "learning_rate": 9.956169201448665e-06, "loss": 0.5066, "step": 233 }, { "epoch": 0.0714176712955898, "grad_norm": 1.6803933013620869, "learning_rate": 9.955513544846205e-06, "loss": 0.4415, "step": 234 }, { "epoch": 0.07172287501907523, "grad_norm": 1.4014872110785643, "learning_rate": 9.954853042572479e-06, "loss": 0.3271, "step": 235 }, { "epoch": 0.07202807874256066, "grad_norm": 1.5310222689941932, "learning_rate": 9.954187695273352e-06, "loss": 0.3289, "step": 236 }, { "epoch": 0.07233328246604609, "grad_norm": 2.166268226472017, "learning_rate": 9.953517503599419e-06, "loss": 0.622, "step": 237 }, { "epoch": 0.07263848618953152, "grad_norm": 2.258081862277545, "learning_rate": 9.952842468206019e-06, "loss": 0.5071, "step": 238 }, { "epoch": 0.07294368991301693, "grad_norm": 1.7322119894263104, "learning_rate": 9.952162589753224e-06, "loss": 0.5097, "step": 239 }, { "epoch": 0.07324889363650236, "grad_norm": 1.9966284228033864, "learning_rate": 9.951477868905843e-06, "loss": 0.2263, "step": 240 }, { "epoch": 0.07355409735998779, "grad_norm": 1.6793267860774614, "learning_rate": 9.95078830633342e-06, "loss": 0.2065, "step": 241 }, { "epoch": 0.07385930108347322, "grad_norm": 2.122564153881175, "learning_rate": 9.95009390271023e-06, "loss": 0.2665, "step": 242 }, { "epoch": 0.07416450480695864, "grad_norm": 1.5852282963187305, "learning_rate": 9.949394658715289e-06, "loss": 0.4453, "step": 243 }, { "epoch": 0.07446970853044407, "grad_norm": 1.7534712016120517, "learning_rate": 9.948690575032338e-06, "loss": 0.3628, "step": 244 }, { "epoch": 0.0747749122539295, "grad_norm": 1.351810586905304, "learning_rate": 9.947981652349854e-06, "loss": 0.3984, "step": 245 }, { "epoch": 0.07508011597741493, "grad_norm": 1.8377506474408298, "learning_rate": 9.947267891361051e-06, "loss": 0.3677, "step": 246 }, { "epoch": 0.07538531970090036, "grad_norm": 1.4655632998364951, "learning_rate": 9.946549292763865e-06, "loss": 0.3516, "step": 247 }, { "epoch": 0.07569052342438577, "grad_norm": 3.240838121636416, "learning_rate": 9.945825857260967e-06, "loss": 0.2627, "step": 248 }, { "epoch": 0.0759957271478712, "grad_norm": 1.4085823215183912, "learning_rate": 9.945097585559757e-06, "loss": 0.2716, "step": 249 }, { "epoch": 0.07630093087135663, "grad_norm": 1.6361471921651585, "learning_rate": 9.944364478372364e-06, "loss": 0.3595, "step": 250 }, { "epoch": 0.07660613459484206, "grad_norm": 1.0912978886499554, "learning_rate": 9.943626536415647e-06, "loss": 0.1968, "step": 251 }, { "epoch": 0.07691133831832749, "grad_norm": 1.9515717700893849, "learning_rate": 9.942883760411188e-06, "loss": 0.374, "step": 252 }, { "epoch": 0.07721654204181291, "grad_norm": 1.5560755068838334, "learning_rate": 9.942136151085302e-06, "loss": 0.44, "step": 253 }, { "epoch": 0.07752174576529834, "grad_norm": 1.4843235207715992, "learning_rate": 9.941383709169024e-06, "loss": 0.3175, "step": 254 }, { "epoch": 0.07782694948878377, "grad_norm": 1.5210960196158274, "learning_rate": 9.94062643539812e-06, "loss": 0.3722, "step": 255 }, { "epoch": 0.07813215321226918, "grad_norm": 1.6656094376801425, "learning_rate": 9.939864330513079e-06, "loss": 0.3511, "step": 256 }, { "epoch": 0.07843735693575461, "grad_norm": 1.2732857455769802, "learning_rate": 9.939097395259108e-06, "loss": 0.2619, "step": 257 }, { "epoch": 0.07874256065924004, "grad_norm": 1.8947301386622588, "learning_rate": 9.938325630386149e-06, "loss": 0.3933, "step": 258 }, { "epoch": 0.07904776438272547, "grad_norm": 1.5625416559388712, "learning_rate": 9.937549036648857e-06, "loss": 0.4491, "step": 259 }, { "epoch": 0.0793529681062109, "grad_norm": 1.5125179888703784, "learning_rate": 9.936767614806612e-06, "loss": 0.3674, "step": 260 }, { "epoch": 0.07965817182969633, "grad_norm": 1.5026525250547669, "learning_rate": 9.935981365623516e-06, "loss": 0.4103, "step": 261 }, { "epoch": 0.07996337555318175, "grad_norm": 2.3948536293362115, "learning_rate": 9.93519028986839e-06, "loss": 0.4009, "step": 262 }, { "epoch": 0.08026857927666718, "grad_norm": 2.416554371647352, "learning_rate": 9.934394388314775e-06, "loss": 0.4265, "step": 263 }, { "epoch": 0.0805737830001526, "grad_norm": 1.560923734953618, "learning_rate": 9.933593661740933e-06, "loss": 0.303, "step": 264 }, { "epoch": 0.08087898672363802, "grad_norm": 1.6053945705234087, "learning_rate": 9.932788110929837e-06, "loss": 0.3295, "step": 265 }, { "epoch": 0.08118419044712345, "grad_norm": 1.7775437462596928, "learning_rate": 9.931977736669185e-06, "loss": 0.2197, "step": 266 }, { "epoch": 0.08148939417060888, "grad_norm": 1.701318325041301, "learning_rate": 9.931162539751392e-06, "loss": 0.3581, "step": 267 }, { "epoch": 0.08179459789409431, "grad_norm": 1.5974548511363529, "learning_rate": 9.93034252097358e-06, "loss": 0.3432, "step": 268 }, { "epoch": 0.08209980161757974, "grad_norm": 1.8669593065073864, "learning_rate": 9.929517681137594e-06, "loss": 0.4133, "step": 269 }, { "epoch": 0.08240500534106517, "grad_norm": 1.4895827642408586, "learning_rate": 9.928688021049991e-06, "loss": 0.3111, "step": 270 }, { "epoch": 0.0827102090645506, "grad_norm": 1.4317804244871846, "learning_rate": 9.927853541522041e-06, "loss": 0.2915, "step": 271 }, { "epoch": 0.08301541278803601, "grad_norm": 1.252478145781798, "learning_rate": 9.927014243369727e-06, "loss": 0.2794, "step": 272 }, { "epoch": 0.08332061651152144, "grad_norm": 1.6973954865497314, "learning_rate": 9.926170127413743e-06, "loss": 0.6183, "step": 273 }, { "epoch": 0.08362582023500686, "grad_norm": 1.4723277244112698, "learning_rate": 9.925321194479494e-06, "loss": 0.2815, "step": 274 }, { "epoch": 0.08393102395849229, "grad_norm": 1.7075555550514414, "learning_rate": 9.924467445397097e-06, "loss": 0.4178, "step": 275 }, { "epoch": 0.08423622768197772, "grad_norm": 1.5354808046910606, "learning_rate": 9.923608881001377e-06, "loss": 0.2355, "step": 276 }, { "epoch": 0.08454143140546315, "grad_norm": 1.1795750747565834, "learning_rate": 9.922745502131865e-06, "loss": 0.3404, "step": 277 }, { "epoch": 0.08484663512894858, "grad_norm": 1.427067758888222, "learning_rate": 9.921877309632805e-06, "loss": 0.3141, "step": 278 }, { "epoch": 0.085151838852434, "grad_norm": 1.3691564278772157, "learning_rate": 9.921004304353147e-06, "loss": 0.287, "step": 279 }, { "epoch": 0.08545704257591942, "grad_norm": 1.9220775714586407, "learning_rate": 9.920126487146544e-06, "loss": 0.6617, "step": 280 }, { "epoch": 0.08576224629940485, "grad_norm": 1.6761030408371134, "learning_rate": 9.919243858871355e-06, "loss": 0.466, "step": 281 }, { "epoch": 0.08606745002289028, "grad_norm": 1.6120747264173168, "learning_rate": 9.918356420390645e-06, "loss": 0.5351, "step": 282 }, { "epoch": 0.0863726537463757, "grad_norm": 1.5236961732014556, "learning_rate": 9.91746417257218e-06, "loss": 0.33, "step": 283 }, { "epoch": 0.08667785746986113, "grad_norm": 1.6328635321860312, "learning_rate": 9.916567116288434e-06, "loss": 0.4301, "step": 284 }, { "epoch": 0.08698306119334656, "grad_norm": 1.4120804188821041, "learning_rate": 9.915665252416577e-06, "loss": 0.3025, "step": 285 }, { "epoch": 0.08728826491683199, "grad_norm": 1.8410843798908767, "learning_rate": 9.914758581838482e-06, "loss": 0.5415, "step": 286 }, { "epoch": 0.08759346864031742, "grad_norm": 1.1807475096034001, "learning_rate": 9.913847105440725e-06, "loss": 0.3184, "step": 287 }, { "epoch": 0.08789867236380283, "grad_norm": 1.52681276111022, "learning_rate": 9.912930824114577e-06, "loss": 0.4266, "step": 288 }, { "epoch": 0.08820387608728826, "grad_norm": 1.4904538614169496, "learning_rate": 9.91200973875601e-06, "loss": 0.3404, "step": 289 }, { "epoch": 0.08850907981077369, "grad_norm": 1.7385111110311349, "learning_rate": 9.911083850265692e-06, "loss": 0.3371, "step": 290 }, { "epoch": 0.08881428353425912, "grad_norm": 1.6013762575114376, "learning_rate": 9.91015315954899e-06, "loss": 0.4475, "step": 291 }, { "epoch": 0.08911948725774455, "grad_norm": 1.5474202900018152, "learning_rate": 9.909217667515964e-06, "loss": 0.4162, "step": 292 }, { "epoch": 0.08942469098122997, "grad_norm": 1.875769203080621, "learning_rate": 9.908277375081371e-06, "loss": 0.4446, "step": 293 }, { "epoch": 0.0897298947047154, "grad_norm": 1.4914731218024286, "learning_rate": 9.907332283164663e-06, "loss": 0.4274, "step": 294 }, { "epoch": 0.09003509842820083, "grad_norm": 1.6551811079983538, "learning_rate": 9.90638239268998e-06, "loss": 0.4883, "step": 295 }, { "epoch": 0.09034030215168624, "grad_norm": 1.645510927644492, "learning_rate": 9.905427704586158e-06, "loss": 0.4885, "step": 296 }, { "epoch": 0.09064550587517167, "grad_norm": 1.6759165462483547, "learning_rate": 9.904468219786727e-06, "loss": 0.3878, "step": 297 }, { "epoch": 0.0909507095986571, "grad_norm": 1.596800484010474, "learning_rate": 9.903503939229901e-06, "loss": 0.2725, "step": 298 }, { "epoch": 0.09125591332214253, "grad_norm": 1.4035704196730787, "learning_rate": 9.902534863858588e-06, "loss": 0.2147, "step": 299 }, { "epoch": 0.09156111704562796, "grad_norm": 1.7460761357385464, "learning_rate": 9.90156099462038e-06, "loss": 0.3495, "step": 300 }, { "epoch": 0.09186632076911339, "grad_norm": 1.3373562156184522, "learning_rate": 9.900582332467566e-06, "loss": 0.342, "step": 301 }, { "epoch": 0.09217152449259881, "grad_norm": 1.1466755748188362, "learning_rate": 9.89959887835711e-06, "loss": 0.1737, "step": 302 }, { "epoch": 0.09247672821608424, "grad_norm": 1.8078659273922337, "learning_rate": 9.898610633250669e-06, "loss": 0.3111, "step": 303 }, { "epoch": 0.09278193193956966, "grad_norm": 1.5400638324339648, "learning_rate": 9.897617598114584e-06, "loss": 0.4746, "step": 304 }, { "epoch": 0.09308713566305508, "grad_norm": 1.558728128630052, "learning_rate": 9.896619773919878e-06, "loss": 0.3085, "step": 305 }, { "epoch": 0.09339233938654051, "grad_norm": 4.094736926672729, "learning_rate": 9.895617161642257e-06, "loss": 0.4664, "step": 306 }, { "epoch": 0.09369754311002594, "grad_norm": 1.63116898024897, "learning_rate": 9.89460976226211e-06, "loss": 0.3878, "step": 307 }, { "epoch": 0.09400274683351137, "grad_norm": 1.7238364123731507, "learning_rate": 9.893597576764508e-06, "loss": 0.2989, "step": 308 }, { "epoch": 0.0943079505569968, "grad_norm": 1.2496662648050174, "learning_rate": 9.8925806061392e-06, "loss": 0.3054, "step": 309 }, { "epoch": 0.09461315428048223, "grad_norm": 0.8807197003313585, "learning_rate": 9.891558851380614e-06, "loss": 0.1904, "step": 310 }, { "epoch": 0.09491835800396765, "grad_norm": 1.5076918479598347, "learning_rate": 9.890532313487858e-06, "loss": 0.2679, "step": 311 }, { "epoch": 0.09522356172745307, "grad_norm": 1.8465691043660122, "learning_rate": 9.889500993464716e-06, "loss": 0.5002, "step": 312 }, { "epoch": 0.0955287654509385, "grad_norm": 1.9183643810942494, "learning_rate": 9.888464892319647e-06, "loss": 0.4869, "step": 313 }, { "epoch": 0.09583396917442392, "grad_norm": 1.6515373264151805, "learning_rate": 9.887424011065788e-06, "loss": 0.4507, "step": 314 }, { "epoch": 0.09613917289790935, "grad_norm": 1.6223391241834122, "learning_rate": 9.886378350720945e-06, "loss": 0.3445, "step": 315 }, { "epoch": 0.09644437662139478, "grad_norm": 1.4416645097808285, "learning_rate": 9.885327912307604e-06, "loss": 0.2808, "step": 316 }, { "epoch": 0.09674958034488021, "grad_norm": 1.4777192121308136, "learning_rate": 9.88427269685292e-06, "loss": 0.4335, "step": 317 }, { "epoch": 0.09705478406836564, "grad_norm": 1.6934694740555867, "learning_rate": 9.883212705388715e-06, "loss": 0.4299, "step": 318 }, { "epoch": 0.09735998779185107, "grad_norm": 1.9031284601590377, "learning_rate": 9.882147938951489e-06, "loss": 0.5364, "step": 319 }, { "epoch": 0.09766519151533648, "grad_norm": 1.990035566558448, "learning_rate": 9.881078398582406e-06, "loss": 0.6476, "step": 320 }, { "epoch": 0.09797039523882191, "grad_norm": 1.4458600630840748, "learning_rate": 9.8800040853273e-06, "loss": 0.268, "step": 321 }, { "epoch": 0.09827559896230734, "grad_norm": 1.473557254783057, "learning_rate": 9.878925000236667e-06, "loss": 0.3889, "step": 322 }, { "epoch": 0.09858080268579276, "grad_norm": 1.429462352597184, "learning_rate": 9.877841144365681e-06, "loss": 0.3348, "step": 323 }, { "epoch": 0.0988860064092782, "grad_norm": 1.9126483909533352, "learning_rate": 9.876752518774167e-06, "loss": 0.5004, "step": 324 }, { "epoch": 0.09919121013276362, "grad_norm": 1.528278815830415, "learning_rate": 9.875659124526622e-06, "loss": 0.1931, "step": 325 }, { "epoch": 0.09949641385624905, "grad_norm": 1.6064809314060318, "learning_rate": 9.874560962692207e-06, "loss": 0.2627, "step": 326 }, { "epoch": 0.09980161757973448, "grad_norm": 1.8583002911468363, "learning_rate": 9.873458034344741e-06, "loss": 0.4795, "step": 327 }, { "epoch": 0.1001068213032199, "grad_norm": 2.180040993961252, "learning_rate": 9.872350340562704e-06, "loss": 0.3502, "step": 328 } ], "logging_steps": 1.0, "max_steps": 3276, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 328, "total_flos": 40670334410752.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }