{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.997830802603037, "eval_steps": 500, "global_step": 691, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028922631959508315, "grad_norm": 48.71332931518555, "learning_rate": 0.0, "loss": 3.3684, "step": 1 }, { "epoch": 0.005784526391901663, "grad_norm": 45.838565826416016, "learning_rate": 1.4285714285714286e-06, "loss": 3.2845, "step": 2 }, { "epoch": 0.008676789587852495, "grad_norm": 56.195335388183594, "learning_rate": 2.8571428571428573e-06, "loss": 3.5006, "step": 3 }, { "epoch": 0.011569052783803326, "grad_norm": 21.180103302001953, "learning_rate": 4.285714285714286e-06, "loss": 3.0654, "step": 4 }, { "epoch": 0.014461315979754157, "grad_norm": 21.839435577392578, "learning_rate": 5.7142857142857145e-06, "loss": 2.9674, "step": 5 }, { "epoch": 0.01735357917570499, "grad_norm": 9.650607109069824, "learning_rate": 7.142857142857143e-06, "loss": 2.9594, "step": 6 }, { "epoch": 0.02024584237165582, "grad_norm": 6.312131881713867, "learning_rate": 8.571428571428573e-06, "loss": 2.8589, "step": 7 }, { "epoch": 0.023138105567606652, "grad_norm": 6.745247840881348, "learning_rate": 1e-05, "loss": 2.6967, "step": 8 }, { "epoch": 0.026030368763557483, "grad_norm": 5.504076957702637, "learning_rate": 1.1428571428571429e-05, "loss": 2.446, "step": 9 }, { "epoch": 0.028922631959508314, "grad_norm": 4.069777011871338, "learning_rate": 1.2857142857142857e-05, "loss": 2.4143, "step": 10 }, { "epoch": 0.03181489515545915, "grad_norm": 3.7189438343048096, "learning_rate": 1.4285714285714285e-05, "loss": 2.1749, "step": 11 }, { "epoch": 0.03470715835140998, "grad_norm": 4.501105308532715, "learning_rate": 1.5714285714285715e-05, "loss": 2.1738, "step": 12 }, { "epoch": 0.03759942154736081, "grad_norm": 5.211763858795166, "learning_rate": 1.7142857142857145e-05, "loss": 2.0335, "step": 13 }, { "epoch": 0.04049168474331164, "grad_norm": 2.67036509513855, "learning_rate": 1.8571428571428572e-05, "loss": 1.9522, "step": 14 }, { "epoch": 0.04338394793926247, "grad_norm": 2.7940988540649414, "learning_rate": 2e-05, "loss": 1.9142, "step": 15 }, { "epoch": 0.046276211135213303, "grad_norm": 3.4890847206115723, "learning_rate": 2.1428571428571428e-05, "loss": 1.8412, "step": 16 }, { "epoch": 0.049168474331164135, "grad_norm": 5.002918720245361, "learning_rate": 2.2857142857142858e-05, "loss": 1.8031, "step": 17 }, { "epoch": 0.052060737527114966, "grad_norm": 4.0725226402282715, "learning_rate": 2.4285714285714288e-05, "loss": 1.8587, "step": 18 }, { "epoch": 0.0549530007230658, "grad_norm": 2.988891124725342, "learning_rate": 2.5714285714285714e-05, "loss": 1.6742, "step": 19 }, { "epoch": 0.05784526391901663, "grad_norm": 2.679062843322754, "learning_rate": 2.714285714285714e-05, "loss": 1.563, "step": 20 }, { "epoch": 0.06073752711496746, "grad_norm": 1.9652676582336426, "learning_rate": 2.857142857142857e-05, "loss": 1.6146, "step": 21 }, { "epoch": 0.0636297903109183, "grad_norm": 3.5487523078918457, "learning_rate": 3e-05, "loss": 1.6091, "step": 22 }, { "epoch": 0.06652205350686913, "grad_norm": 3.5734827518463135, "learning_rate": 3.142857142857143e-05, "loss": 1.5872, "step": 23 }, { "epoch": 0.06941431670281996, "grad_norm": 2.6711552143096924, "learning_rate": 3.285714285714286e-05, "loss": 1.5964, "step": 24 }, { "epoch": 0.07230657989877079, "grad_norm": 2.6824355125427246, "learning_rate": 3.428571428571429e-05, "loss": 1.6661, "step": 25 }, { "epoch": 0.07519884309472162, "grad_norm": 2.8385238647460938, "learning_rate": 3.571428571428572e-05, "loss": 1.6069, "step": 26 }, { "epoch": 0.07809110629067245, "grad_norm": 2.863154172897339, "learning_rate": 3.7142857142857143e-05, "loss": 1.6074, "step": 27 }, { "epoch": 0.08098336948662328, "grad_norm": 2.5264947414398193, "learning_rate": 3.857142857142858e-05, "loss": 1.5442, "step": 28 }, { "epoch": 0.08387563268257411, "grad_norm": 2.4073829650878906, "learning_rate": 4e-05, "loss": 1.4696, "step": 29 }, { "epoch": 0.08676789587852494, "grad_norm": 1.2896760702133179, "learning_rate": 4.1428571428571437e-05, "loss": 1.4876, "step": 30 }, { "epoch": 0.08966015907447578, "grad_norm": 1.3128914833068848, "learning_rate": 4.2857142857142856e-05, "loss": 1.5028, "step": 31 }, { "epoch": 0.09255242227042661, "grad_norm": 1.6972280740737915, "learning_rate": 4.428571428571428e-05, "loss": 1.5156, "step": 32 }, { "epoch": 0.09544468546637744, "grad_norm": 1.735119104385376, "learning_rate": 4.5714285714285716e-05, "loss": 1.3899, "step": 33 }, { "epoch": 0.09833694866232827, "grad_norm": 1.6684017181396484, "learning_rate": 4.714285714285714e-05, "loss": 1.4309, "step": 34 }, { "epoch": 0.1012292118582791, "grad_norm": 1.60593581199646, "learning_rate": 4.8571428571428576e-05, "loss": 1.4821, "step": 35 }, { "epoch": 0.10412147505422993, "grad_norm": 1.369852066040039, "learning_rate": 5e-05, "loss": 1.4606, "step": 36 }, { "epoch": 0.10701373825018076, "grad_norm": 1.0815776586532593, "learning_rate": 4.992378048780488e-05, "loss": 1.3166, "step": 37 }, { "epoch": 0.1099060014461316, "grad_norm": 1.1393859386444092, "learning_rate": 4.984756097560976e-05, "loss": 1.3596, "step": 38 }, { "epoch": 0.11279826464208242, "grad_norm": 1.293410062789917, "learning_rate": 4.977134146341464e-05, "loss": 1.4997, "step": 39 }, { "epoch": 0.11569052783803326, "grad_norm": 1.5961534976959229, "learning_rate": 4.969512195121951e-05, "loss": 1.3908, "step": 40 }, { "epoch": 0.11858279103398409, "grad_norm": 1.6708180904388428, "learning_rate": 4.961890243902439e-05, "loss": 1.4434, "step": 41 }, { "epoch": 0.12147505422993492, "grad_norm": 1.3898653984069824, "learning_rate": 4.954268292682927e-05, "loss": 1.3746, "step": 42 }, { "epoch": 0.12436731742588576, "grad_norm": 1.1497617959976196, "learning_rate": 4.946646341463415e-05, "loss": 1.3683, "step": 43 }, { "epoch": 0.1272595806218366, "grad_norm": 0.9966000318527222, "learning_rate": 4.9390243902439024e-05, "loss": 1.4588, "step": 44 }, { "epoch": 0.1301518438177874, "grad_norm": 1.0601434707641602, "learning_rate": 4.931402439024391e-05, "loss": 1.3757, "step": 45 }, { "epoch": 0.13304410701373826, "grad_norm": 1.2142244577407837, "learning_rate": 4.923780487804878e-05, "loss": 1.419, "step": 46 }, { "epoch": 0.13593637020968907, "grad_norm": 1.2789775133132935, "learning_rate": 4.916158536585366e-05, "loss": 1.3221, "step": 47 }, { "epoch": 0.13882863340563992, "grad_norm": 1.2200745344161987, "learning_rate": 4.908536585365854e-05, "loss": 1.4087, "step": 48 }, { "epoch": 0.14172089660159073, "grad_norm": 1.0769251585006714, "learning_rate": 4.900914634146342e-05, "loss": 1.3794, "step": 49 }, { "epoch": 0.14461315979754158, "grad_norm": 0.9566358923912048, "learning_rate": 4.893292682926829e-05, "loss": 1.3159, "step": 50 }, { "epoch": 0.1475054229934924, "grad_norm": 1.0282989740371704, "learning_rate": 4.885670731707317e-05, "loss": 1.3803, "step": 51 }, { "epoch": 0.15039768618944324, "grad_norm": 1.0863200426101685, "learning_rate": 4.878048780487805e-05, "loss": 1.3548, "step": 52 }, { "epoch": 0.15328994938539406, "grad_norm": 1.0302592515945435, "learning_rate": 4.870426829268293e-05, "loss": 1.4204, "step": 53 }, { "epoch": 0.1561822125813449, "grad_norm": 1.0147430896759033, "learning_rate": 4.86280487804878e-05, "loss": 1.4, "step": 54 }, { "epoch": 0.15907447577729572, "grad_norm": 0.9125880599021912, "learning_rate": 4.855182926829269e-05, "loss": 1.3568, "step": 55 }, { "epoch": 0.16196673897324657, "grad_norm": 0.8917691707611084, "learning_rate": 4.847560975609756e-05, "loss": 1.3547, "step": 56 }, { "epoch": 0.1648590021691974, "grad_norm": 1.03391432762146, "learning_rate": 4.839939024390244e-05, "loss": 1.2731, "step": 57 }, { "epoch": 0.16775126536514823, "grad_norm": 1.0735812187194824, "learning_rate": 4.832317073170732e-05, "loss": 1.2944, "step": 58 }, { "epoch": 0.17064352856109907, "grad_norm": 1.028361439704895, "learning_rate": 4.82469512195122e-05, "loss": 1.2617, "step": 59 }, { "epoch": 0.1735357917570499, "grad_norm": 0.9899557828903198, "learning_rate": 4.817073170731707e-05, "loss": 1.3891, "step": 60 }, { "epoch": 0.17642805495300073, "grad_norm": 6.040285110473633, "learning_rate": 4.809451219512195e-05, "loss": 1.3886, "step": 61 }, { "epoch": 0.17932031814895155, "grad_norm": 1.1661717891693115, "learning_rate": 4.801829268292683e-05, "loss": 1.2352, "step": 62 }, { "epoch": 0.1822125813449024, "grad_norm": 3.124387502670288, "learning_rate": 4.794207317073171e-05, "loss": 1.2965, "step": 63 }, { "epoch": 0.18510484454085321, "grad_norm": 1.1827131509780884, "learning_rate": 4.786585365853658e-05, "loss": 1.349, "step": 64 }, { "epoch": 0.18799710773680406, "grad_norm": 1.027674674987793, "learning_rate": 4.778963414634147e-05, "loss": 1.2165, "step": 65 }, { "epoch": 0.19088937093275488, "grad_norm": 0.9438247084617615, "learning_rate": 4.771341463414634e-05, "loss": 1.2538, "step": 66 }, { "epoch": 0.19378163412870572, "grad_norm": 0.9163101315498352, "learning_rate": 4.763719512195122e-05, "loss": 1.2914, "step": 67 }, { "epoch": 0.19667389732465654, "grad_norm": 0.9787700176239014, "learning_rate": 4.75609756097561e-05, "loss": 1.2013, "step": 68 }, { "epoch": 0.19956616052060738, "grad_norm": 0.9685674905776978, "learning_rate": 4.748475609756098e-05, "loss": 1.2933, "step": 69 }, { "epoch": 0.2024584237165582, "grad_norm": 0.8412639498710632, "learning_rate": 4.740853658536585e-05, "loss": 1.262, "step": 70 }, { "epoch": 0.20535068691250905, "grad_norm": 0.9766181707382202, "learning_rate": 4.733231707317073e-05, "loss": 1.225, "step": 71 }, { "epoch": 0.20824295010845986, "grad_norm": 0.990614116191864, "learning_rate": 4.725609756097561e-05, "loss": 1.192, "step": 72 }, { "epoch": 0.2111352133044107, "grad_norm": 0.8069394826889038, "learning_rate": 4.717987804878049e-05, "loss": 1.2127, "step": 73 }, { "epoch": 0.21402747650036152, "grad_norm": 1.022425889968872, "learning_rate": 4.710365853658536e-05, "loss": 1.1593, "step": 74 }, { "epoch": 0.21691973969631237, "grad_norm": 0.9153020977973938, "learning_rate": 4.702743902439025e-05, "loss": 1.1794, "step": 75 }, { "epoch": 0.2198120028922632, "grad_norm": 0.7978305816650391, "learning_rate": 4.695121951219512e-05, "loss": 1.2791, "step": 76 }, { "epoch": 0.22270426608821403, "grad_norm": 0.8948712348937988, "learning_rate": 4.6875e-05, "loss": 1.2227, "step": 77 }, { "epoch": 0.22559652928416485, "grad_norm": 0.9704264998435974, "learning_rate": 4.679878048780488e-05, "loss": 1.186, "step": 78 }, { "epoch": 0.2284887924801157, "grad_norm": 0.8205945491790771, "learning_rate": 4.672256097560976e-05, "loss": 1.1719, "step": 79 }, { "epoch": 0.2313810556760665, "grad_norm": 0.9167234897613525, "learning_rate": 4.664634146341464e-05, "loss": 1.2479, "step": 80 }, { "epoch": 0.23427331887201736, "grad_norm": 0.8766996264457703, "learning_rate": 4.657012195121951e-05, "loss": 1.2073, "step": 81 }, { "epoch": 0.23716558206796817, "grad_norm": 0.8327258229255676, "learning_rate": 4.64939024390244e-05, "loss": 1.2963, "step": 82 }, { "epoch": 0.24005784526391902, "grad_norm": 0.9994452595710754, "learning_rate": 4.641768292682927e-05, "loss": 1.1831, "step": 83 }, { "epoch": 0.24295010845986983, "grad_norm": 0.7853651642799377, "learning_rate": 4.634146341463415e-05, "loss": 1.2727, "step": 84 }, { "epoch": 0.24584237165582068, "grad_norm": 0.783089816570282, "learning_rate": 4.626524390243903e-05, "loss": 1.2149, "step": 85 }, { "epoch": 0.24873463485177152, "grad_norm": 0.9224200248718262, "learning_rate": 4.618902439024391e-05, "loss": 1.1902, "step": 86 }, { "epoch": 0.25162689804772237, "grad_norm": 0.7504012584686279, "learning_rate": 4.611280487804878e-05, "loss": 1.2568, "step": 87 }, { "epoch": 0.2545191612436732, "grad_norm": 0.8345561027526855, "learning_rate": 4.603658536585366e-05, "loss": 1.1642, "step": 88 }, { "epoch": 0.257411424439624, "grad_norm": 0.8287318348884583, "learning_rate": 4.596036585365854e-05, "loss": 1.2115, "step": 89 }, { "epoch": 0.2603036876355748, "grad_norm": 0.7950981259346008, "learning_rate": 4.588414634146342e-05, "loss": 1.1439, "step": 90 }, { "epoch": 0.2631959508315257, "grad_norm": 0.8269981741905212, "learning_rate": 4.580792682926829e-05, "loss": 1.226, "step": 91 }, { "epoch": 0.2660882140274765, "grad_norm": 0.7990830540657043, "learning_rate": 4.573170731707318e-05, "loss": 1.2059, "step": 92 }, { "epoch": 0.26898047722342733, "grad_norm": 0.746702253818512, "learning_rate": 4.565548780487805e-05, "loss": 1.272, "step": 93 }, { "epoch": 0.27187274041937814, "grad_norm": 0.7808762192726135, "learning_rate": 4.557926829268293e-05, "loss": 1.2135, "step": 94 }, { "epoch": 0.274765003615329, "grad_norm": 0.8141624331474304, "learning_rate": 4.550304878048781e-05, "loss": 1.1549, "step": 95 }, { "epoch": 0.27765726681127983, "grad_norm": 0.7702810168266296, "learning_rate": 4.542682926829269e-05, "loss": 1.1471, "step": 96 }, { "epoch": 0.28054953000723065, "grad_norm": 0.7874007821083069, "learning_rate": 4.535060975609756e-05, "loss": 1.2672, "step": 97 }, { "epoch": 0.28344179320318147, "grad_norm": 0.7983161211013794, "learning_rate": 4.527439024390244e-05, "loss": 1.099, "step": 98 }, { "epoch": 0.28633405639913234, "grad_norm": 0.8033881783485413, "learning_rate": 4.519817073170732e-05, "loss": 1.1549, "step": 99 }, { "epoch": 0.28922631959508316, "grad_norm": 0.8222156167030334, "learning_rate": 4.51219512195122e-05, "loss": 1.1527, "step": 100 }, { "epoch": 0.292118582791034, "grad_norm": 0.7592807412147522, "learning_rate": 4.504573170731707e-05, "loss": 1.2142, "step": 101 }, { "epoch": 0.2950108459869848, "grad_norm": 0.7466637492179871, "learning_rate": 4.496951219512196e-05, "loss": 1.2232, "step": 102 }, { "epoch": 0.29790310918293567, "grad_norm": 0.7532088756561279, "learning_rate": 4.489329268292683e-05, "loss": 1.1717, "step": 103 }, { "epoch": 0.3007953723788865, "grad_norm": 0.766828715801239, "learning_rate": 4.481707317073171e-05, "loss": 1.2218, "step": 104 }, { "epoch": 0.3036876355748373, "grad_norm": 0.6948519349098206, "learning_rate": 4.474085365853659e-05, "loss": 1.1116, "step": 105 }, { "epoch": 0.3065798987707881, "grad_norm": 0.7532397508621216, "learning_rate": 4.466463414634147e-05, "loss": 1.1451, "step": 106 }, { "epoch": 0.309472161966739, "grad_norm": 0.7384987473487854, "learning_rate": 4.458841463414634e-05, "loss": 1.2043, "step": 107 }, { "epoch": 0.3123644251626898, "grad_norm": 0.7876350283622742, "learning_rate": 4.451219512195122e-05, "loss": 1.3315, "step": 108 }, { "epoch": 0.3152566883586406, "grad_norm": 0.7799772024154663, "learning_rate": 4.44359756097561e-05, "loss": 1.2367, "step": 109 }, { "epoch": 0.31814895155459144, "grad_norm": 0.802836537361145, "learning_rate": 4.435975609756098e-05, "loss": 1.1859, "step": 110 }, { "epoch": 0.3210412147505423, "grad_norm": 0.7658648490905762, "learning_rate": 4.428353658536585e-05, "loss": 1.1554, "step": 111 }, { "epoch": 0.32393347794649313, "grad_norm": 0.7552660703659058, "learning_rate": 4.420731707317074e-05, "loss": 1.1773, "step": 112 }, { "epoch": 0.32682574114244395, "grad_norm": 0.7944100499153137, "learning_rate": 4.413109756097561e-05, "loss": 1.1369, "step": 113 }, { "epoch": 0.3297180043383948, "grad_norm": 0.79727703332901, "learning_rate": 4.405487804878049e-05, "loss": 1.1515, "step": 114 }, { "epoch": 0.33261026753434564, "grad_norm": 0.7767285704612732, "learning_rate": 4.397865853658537e-05, "loss": 1.2823, "step": 115 }, { "epoch": 0.33550253073029646, "grad_norm": 0.8018892407417297, "learning_rate": 4.390243902439025e-05, "loss": 1.1792, "step": 116 }, { "epoch": 0.3383947939262473, "grad_norm": 0.7893505692481995, "learning_rate": 4.382621951219512e-05, "loss": 1.2078, "step": 117 }, { "epoch": 0.34128705712219815, "grad_norm": 0.7643678784370422, "learning_rate": 4.375e-05, "loss": 1.1172, "step": 118 }, { "epoch": 0.34417932031814896, "grad_norm": 0.7227766513824463, "learning_rate": 4.3673780487804886e-05, "loss": 1.2424, "step": 119 }, { "epoch": 0.3470715835140998, "grad_norm": 0.7557047009468079, "learning_rate": 4.359756097560976e-05, "loss": 1.1996, "step": 120 }, { "epoch": 0.3499638467100506, "grad_norm": 0.75395667552948, "learning_rate": 4.352134146341464e-05, "loss": 1.2023, "step": 121 }, { "epoch": 0.35285610990600147, "grad_norm": 0.7078515291213989, "learning_rate": 4.344512195121952e-05, "loss": 1.2086, "step": 122 }, { "epoch": 0.3557483731019523, "grad_norm": 0.7395102381706238, "learning_rate": 4.3368902439024396e-05, "loss": 1.1106, "step": 123 }, { "epoch": 0.3586406362979031, "grad_norm": 0.819173276424408, "learning_rate": 4.329268292682927e-05, "loss": 1.1037, "step": 124 }, { "epoch": 0.3615328994938539, "grad_norm": 0.7435188889503479, "learning_rate": 4.321646341463415e-05, "loss": 1.1914, "step": 125 }, { "epoch": 0.3644251626898048, "grad_norm": 0.8237520456314087, "learning_rate": 4.314024390243903e-05, "loss": 1.1724, "step": 126 }, { "epoch": 0.3673174258857556, "grad_norm": 0.7931056022644043, "learning_rate": 4.306402439024391e-05, "loss": 1.1706, "step": 127 }, { "epoch": 0.37020968908170643, "grad_norm": 0.7253796458244324, "learning_rate": 4.298780487804878e-05, "loss": 1.1297, "step": 128 }, { "epoch": 0.37310195227765725, "grad_norm": 0.7788090705871582, "learning_rate": 4.2911585365853665e-05, "loss": 1.1685, "step": 129 }, { "epoch": 0.3759942154736081, "grad_norm": 0.7236787676811218, "learning_rate": 4.283536585365854e-05, "loss": 1.2329, "step": 130 }, { "epoch": 0.37888647866955893, "grad_norm": 0.7436123490333557, "learning_rate": 4.275914634146342e-05, "loss": 1.0825, "step": 131 }, { "epoch": 0.38177874186550975, "grad_norm": 0.7631476521492004, "learning_rate": 4.26829268292683e-05, "loss": 1.1648, "step": 132 }, { "epoch": 0.38467100506146057, "grad_norm": 0.7813283801078796, "learning_rate": 4.2606707317073176e-05, "loss": 1.1475, "step": 133 }, { "epoch": 0.38756326825741144, "grad_norm": 0.7633726000785828, "learning_rate": 4.253048780487805e-05, "loss": 1.1771, "step": 134 }, { "epoch": 0.39045553145336226, "grad_norm": 0.7443217039108276, "learning_rate": 4.245426829268293e-05, "loss": 1.0879, "step": 135 }, { "epoch": 0.3933477946493131, "grad_norm": 0.7620945572853088, "learning_rate": 4.237804878048781e-05, "loss": 1.1515, "step": 136 }, { "epoch": 0.3962400578452639, "grad_norm": 0.7569906711578369, "learning_rate": 4.230182926829269e-05, "loss": 1.1857, "step": 137 }, { "epoch": 0.39913232104121477, "grad_norm": 0.754265546798706, "learning_rate": 4.222560975609756e-05, "loss": 1.2235, "step": 138 }, { "epoch": 0.4020245842371656, "grad_norm": 0.8115909695625305, "learning_rate": 4.2149390243902445e-05, "loss": 1.1533, "step": 139 }, { "epoch": 0.4049168474331164, "grad_norm": 0.7119144201278687, "learning_rate": 4.207317073170732e-05, "loss": 1.0566, "step": 140 }, { "epoch": 0.4078091106290672, "grad_norm": 0.745745062828064, "learning_rate": 4.19969512195122e-05, "loss": 1.1801, "step": 141 }, { "epoch": 0.4107013738250181, "grad_norm": 0.7318696975708008, "learning_rate": 4.1920731707317077e-05, "loss": 1.0448, "step": 142 }, { "epoch": 0.4135936370209689, "grad_norm": 0.691558837890625, "learning_rate": 4.1844512195121956e-05, "loss": 1.118, "step": 143 }, { "epoch": 0.4164859002169197, "grad_norm": 0.7404938340187073, "learning_rate": 4.176829268292683e-05, "loss": 1.0795, "step": 144 }, { "epoch": 0.4193781634128706, "grad_norm": 0.7128071188926697, "learning_rate": 4.169207317073171e-05, "loss": 1.1663, "step": 145 }, { "epoch": 0.4222704266088214, "grad_norm": 0.8010504245758057, "learning_rate": 4.161585365853659e-05, "loss": 1.2375, "step": 146 }, { "epoch": 0.42516268980477223, "grad_norm": 0.7428746819496155, "learning_rate": 4.1539634146341466e-05, "loss": 1.0991, "step": 147 }, { "epoch": 0.42805495300072305, "grad_norm": 0.7510153651237488, "learning_rate": 4.146341463414634e-05, "loss": 1.1386, "step": 148 }, { "epoch": 0.4309472161966739, "grad_norm": 0.7697402834892273, "learning_rate": 4.1387195121951225e-05, "loss": 1.06, "step": 149 }, { "epoch": 0.43383947939262474, "grad_norm": 0.7100762128829956, "learning_rate": 4.13109756097561e-05, "loss": 1.1578, "step": 150 }, { "epoch": 0.43673174258857556, "grad_norm": 0.7327350974082947, "learning_rate": 4.123475609756098e-05, "loss": 1.1994, "step": 151 }, { "epoch": 0.4396240057845264, "grad_norm": 0.7481423020362854, "learning_rate": 4.1158536585365856e-05, "loss": 1.1554, "step": 152 }, { "epoch": 0.44251626898047725, "grad_norm": 0.7060924768447876, "learning_rate": 4.1082317073170736e-05, "loss": 1.1712, "step": 153 }, { "epoch": 0.44540853217642806, "grad_norm": 0.7289426326751709, "learning_rate": 4.100609756097561e-05, "loss": 1.0854, "step": 154 }, { "epoch": 0.4483007953723789, "grad_norm": 0.7729988694190979, "learning_rate": 4.092987804878049e-05, "loss": 1.1535, "step": 155 }, { "epoch": 0.4511930585683297, "grad_norm": 0.7460820078849792, "learning_rate": 4.085365853658537e-05, "loss": 1.1083, "step": 156 }, { "epoch": 0.45408532176428057, "grad_norm": 0.7617100477218628, "learning_rate": 4.0777439024390246e-05, "loss": 1.0703, "step": 157 }, { "epoch": 0.4569775849602314, "grad_norm": 0.7420201897621155, "learning_rate": 4.070121951219512e-05, "loss": 1.1499, "step": 158 }, { "epoch": 0.4598698481561822, "grad_norm": 0.7645936608314514, "learning_rate": 4.0625000000000005e-05, "loss": 1.1024, "step": 159 }, { "epoch": 0.462762111352133, "grad_norm": 0.7603924870491028, "learning_rate": 4.0548780487804884e-05, "loss": 1.0113, "step": 160 }, { "epoch": 0.4656543745480839, "grad_norm": 0.7942943572998047, "learning_rate": 4.047256097560976e-05, "loss": 1.1814, "step": 161 }, { "epoch": 0.4685466377440347, "grad_norm": 0.7691872715950012, "learning_rate": 4.0396341463414636e-05, "loss": 1.1274, "step": 162 }, { "epoch": 0.47143890093998553, "grad_norm": 0.7765952348709106, "learning_rate": 4.0320121951219515e-05, "loss": 1.1215, "step": 163 }, { "epoch": 0.47433116413593635, "grad_norm": 0.7291862368583679, "learning_rate": 4.0243902439024395e-05, "loss": 1.0973, "step": 164 }, { "epoch": 0.4772234273318872, "grad_norm": 0.7589432597160339, "learning_rate": 4.016768292682927e-05, "loss": 1.1347, "step": 165 }, { "epoch": 0.48011569052783803, "grad_norm": 0.7447579503059387, "learning_rate": 4.0091463414634153e-05, "loss": 1.2361, "step": 166 }, { "epoch": 0.48300795372378885, "grad_norm": 0.7255765199661255, "learning_rate": 4.0015243902439026e-05, "loss": 1.1495, "step": 167 }, { "epoch": 0.48590021691973967, "grad_norm": 0.7621276378631592, "learning_rate": 3.9939024390243905e-05, "loss": 1.1568, "step": 168 }, { "epoch": 0.48879248011569054, "grad_norm": 0.7537471055984497, "learning_rate": 3.9862804878048785e-05, "loss": 1.1004, "step": 169 }, { "epoch": 0.49168474331164136, "grad_norm": 0.7859211564064026, "learning_rate": 3.9786585365853664e-05, "loss": 1.1401, "step": 170 }, { "epoch": 0.4945770065075922, "grad_norm": 0.7351391911506653, "learning_rate": 3.971036585365854e-05, "loss": 1.1076, "step": 171 }, { "epoch": 0.49746926970354305, "grad_norm": 0.7664011716842651, "learning_rate": 3.9634146341463416e-05, "loss": 1.0421, "step": 172 }, { "epoch": 0.5003615328994938, "grad_norm": 0.7682709693908691, "learning_rate": 3.9557926829268295e-05, "loss": 1.1002, "step": 173 }, { "epoch": 0.5032537960954447, "grad_norm": 0.7599637508392334, "learning_rate": 3.9481707317073175e-05, "loss": 1.1453, "step": 174 }, { "epoch": 0.5061460592913956, "grad_norm": 0.8105545043945312, "learning_rate": 3.940548780487805e-05, "loss": 1.1733, "step": 175 }, { "epoch": 0.5090383224873464, "grad_norm": 0.7692773938179016, "learning_rate": 3.932926829268293e-05, "loss": 1.1658, "step": 176 }, { "epoch": 0.5119305856832972, "grad_norm": 0.7400121092796326, "learning_rate": 3.9253048780487806e-05, "loss": 1.1037, "step": 177 }, { "epoch": 0.514822848879248, "grad_norm": 0.7246294021606445, "learning_rate": 3.9176829268292685e-05, "loss": 1.1829, "step": 178 }, { "epoch": 0.5177151120751988, "grad_norm": 0.7318651676177979, "learning_rate": 3.9100609756097565e-05, "loss": 0.9872, "step": 179 }, { "epoch": 0.5206073752711496, "grad_norm": 0.7589302659034729, "learning_rate": 3.9024390243902444e-05, "loss": 1.1624, "step": 180 }, { "epoch": 0.5234996384671005, "grad_norm": 0.7625978589057922, "learning_rate": 3.8948170731707316e-05, "loss": 1.1147, "step": 181 }, { "epoch": 0.5263919016630514, "grad_norm": 0.7786478400230408, "learning_rate": 3.8871951219512196e-05, "loss": 1.0524, "step": 182 }, { "epoch": 0.5292841648590022, "grad_norm": 0.7591277956962585, "learning_rate": 3.8795731707317075e-05, "loss": 1.0672, "step": 183 }, { "epoch": 0.532176428054953, "grad_norm": 0.806042492389679, "learning_rate": 3.8719512195121954e-05, "loss": 1.0742, "step": 184 }, { "epoch": 0.5350686912509038, "grad_norm": 0.7718027830123901, "learning_rate": 3.864329268292683e-05, "loss": 1.1326, "step": 185 }, { "epoch": 0.5379609544468547, "grad_norm": 0.7538328766822815, "learning_rate": 3.856707317073171e-05, "loss": 1.15, "step": 186 }, { "epoch": 0.5408532176428055, "grad_norm": 0.7316940426826477, "learning_rate": 3.8490853658536586e-05, "loss": 1.0463, "step": 187 }, { "epoch": 0.5437454808387563, "grad_norm": 0.7699999809265137, "learning_rate": 3.8414634146341465e-05, "loss": 1.183, "step": 188 }, { "epoch": 0.5466377440347071, "grad_norm": 0.7050356268882751, "learning_rate": 3.8338414634146344e-05, "loss": 1.1208, "step": 189 }, { "epoch": 0.549530007230658, "grad_norm": 0.7819121479988098, "learning_rate": 3.8262195121951224e-05, "loss": 1.1622, "step": 190 }, { "epoch": 0.5524222704266089, "grad_norm": 0.700554370880127, "learning_rate": 3.8185975609756096e-05, "loss": 1.1104, "step": 191 }, { "epoch": 0.5553145336225597, "grad_norm": 0.7335946559906006, "learning_rate": 3.8109756097560976e-05, "loss": 1.0856, "step": 192 }, { "epoch": 0.5582067968185105, "grad_norm": 0.7291987538337708, "learning_rate": 3.8033536585365855e-05, "loss": 1.1158, "step": 193 }, { "epoch": 0.5610990600144613, "grad_norm": 0.7313510775566101, "learning_rate": 3.7957317073170734e-05, "loss": 1.1907, "step": 194 }, { "epoch": 0.5639913232104121, "grad_norm": 0.7727324366569519, "learning_rate": 3.788109756097561e-05, "loss": 1.1681, "step": 195 }, { "epoch": 0.5668835864063629, "grad_norm": 0.7505455613136292, "learning_rate": 3.780487804878049e-05, "loss": 1.0712, "step": 196 }, { "epoch": 0.5697758496023138, "grad_norm": 0.7288169860839844, "learning_rate": 3.7728658536585365e-05, "loss": 1.113, "step": 197 }, { "epoch": 0.5726681127982647, "grad_norm": 0.8041896820068359, "learning_rate": 3.7652439024390245e-05, "loss": 1.056, "step": 198 }, { "epoch": 0.5755603759942155, "grad_norm": 0.7612701058387756, "learning_rate": 3.7576219512195124e-05, "loss": 1.0862, "step": 199 }, { "epoch": 0.5784526391901663, "grad_norm": 0.7867717742919922, "learning_rate": 3.7500000000000003e-05, "loss": 1.1025, "step": 200 }, { "epoch": 0.5813449023861171, "grad_norm": 0.7869510054588318, "learning_rate": 3.742378048780488e-05, "loss": 1.1034, "step": 201 }, { "epoch": 0.584237165582068, "grad_norm": 0.7608320713043213, "learning_rate": 3.7347560975609755e-05, "loss": 1.1001, "step": 202 }, { "epoch": 0.5871294287780188, "grad_norm": 0.7015742063522339, "learning_rate": 3.727134146341464e-05, "loss": 1.1404, "step": 203 }, { "epoch": 0.5900216919739696, "grad_norm": 0.7741048336029053, "learning_rate": 3.7195121951219514e-05, "loss": 1.1069, "step": 204 }, { "epoch": 0.5929139551699205, "grad_norm": 0.7461472749710083, "learning_rate": 3.7118902439024393e-05, "loss": 1.1364, "step": 205 }, { "epoch": 0.5958062183658713, "grad_norm": 0.7453926205635071, "learning_rate": 3.704268292682927e-05, "loss": 1.1352, "step": 206 }, { "epoch": 0.5986984815618221, "grad_norm": 0.7385006546974182, "learning_rate": 3.696646341463415e-05, "loss": 1.1277, "step": 207 }, { "epoch": 0.601590744757773, "grad_norm": 0.755822479724884, "learning_rate": 3.6890243902439025e-05, "loss": 1.2321, "step": 208 }, { "epoch": 0.6044830079537238, "grad_norm": 0.7634955048561096, "learning_rate": 3.6814024390243904e-05, "loss": 1.0884, "step": 209 }, { "epoch": 0.6073752711496746, "grad_norm": 0.739771842956543, "learning_rate": 3.673780487804878e-05, "loss": 1.0753, "step": 210 }, { "epoch": 0.6102675343456254, "grad_norm": 0.7629417777061462, "learning_rate": 3.666158536585366e-05, "loss": 1.1352, "step": 211 }, { "epoch": 0.6131597975415762, "grad_norm": 0.7024405002593994, "learning_rate": 3.6585365853658535e-05, "loss": 1.0872, "step": 212 }, { "epoch": 0.6160520607375272, "grad_norm": 0.778109610080719, "learning_rate": 3.650914634146342e-05, "loss": 1.1214, "step": 213 }, { "epoch": 0.618944323933478, "grad_norm": 0.8042979836463928, "learning_rate": 3.6432926829268294e-05, "loss": 1.0782, "step": 214 }, { "epoch": 0.6218365871294288, "grad_norm": 0.7753491997718811, "learning_rate": 3.635670731707317e-05, "loss": 1.1299, "step": 215 }, { "epoch": 0.6247288503253796, "grad_norm": 0.7622751593589783, "learning_rate": 3.628048780487805e-05, "loss": 1.1382, "step": 216 }, { "epoch": 0.6276211135213304, "grad_norm": 0.7174673080444336, "learning_rate": 3.620426829268293e-05, "loss": 1.0331, "step": 217 }, { "epoch": 0.6305133767172812, "grad_norm": 0.7472246885299683, "learning_rate": 3.6128048780487804e-05, "loss": 1.1305, "step": 218 }, { "epoch": 0.6334056399132321, "grad_norm": 0.7711846232414246, "learning_rate": 3.6051829268292684e-05, "loss": 1.0612, "step": 219 }, { "epoch": 0.6362979031091829, "grad_norm": 0.6961559653282166, "learning_rate": 3.597560975609756e-05, "loss": 1.1416, "step": 220 }, { "epoch": 0.6391901663051338, "grad_norm": 0.7391098141670227, "learning_rate": 3.589939024390244e-05, "loss": 1.1794, "step": 221 }, { "epoch": 0.6420824295010846, "grad_norm": 0.7802613973617554, "learning_rate": 3.5823170731707315e-05, "loss": 1.1799, "step": 222 }, { "epoch": 0.6449746926970354, "grad_norm": 0.7498157620429993, "learning_rate": 3.57469512195122e-05, "loss": 1.0472, "step": 223 }, { "epoch": 0.6478669558929863, "grad_norm": 0.7516718506813049, "learning_rate": 3.5670731707317074e-05, "loss": 1.1124, "step": 224 }, { "epoch": 0.6507592190889371, "grad_norm": 0.7159478068351746, "learning_rate": 3.559451219512195e-05, "loss": 1.1552, "step": 225 }, { "epoch": 0.6536514822848879, "grad_norm": 0.7362671494483948, "learning_rate": 3.551829268292683e-05, "loss": 1.0227, "step": 226 }, { "epoch": 0.6565437454808387, "grad_norm": 0.7803052663803101, "learning_rate": 3.544207317073171e-05, "loss": 1.0371, "step": 227 }, { "epoch": 0.6594360086767896, "grad_norm": 0.7725502252578735, "learning_rate": 3.5365853658536584e-05, "loss": 1.0399, "step": 228 }, { "epoch": 0.6623282718727405, "grad_norm": 0.7670521140098572, "learning_rate": 3.5289634146341464e-05, "loss": 1.1018, "step": 229 }, { "epoch": 0.6652205350686913, "grad_norm": 0.8205684423446655, "learning_rate": 3.521341463414634e-05, "loss": 1.0543, "step": 230 }, { "epoch": 0.6681127982646421, "grad_norm": 0.7324901223182678, "learning_rate": 3.513719512195122e-05, "loss": 1.036, "step": 231 }, { "epoch": 0.6710050614605929, "grad_norm": 0.7967138886451721, "learning_rate": 3.5060975609756095e-05, "loss": 1.1261, "step": 232 }, { "epoch": 0.6738973246565437, "grad_norm": 0.7588431239128113, "learning_rate": 3.498475609756098e-05, "loss": 1.14, "step": 233 }, { "epoch": 0.6767895878524945, "grad_norm": 0.778581440448761, "learning_rate": 3.4908536585365853e-05, "loss": 1.1164, "step": 234 }, { "epoch": 0.6796818510484454, "grad_norm": 0.7511733174324036, "learning_rate": 3.483231707317073e-05, "loss": 1.0948, "step": 235 }, { "epoch": 0.6825741142443963, "grad_norm": 0.7637711763381958, "learning_rate": 3.475609756097561e-05, "loss": 1.0335, "step": 236 }, { "epoch": 0.6854663774403471, "grad_norm": 0.7194728851318359, "learning_rate": 3.467987804878049e-05, "loss": 1.1091, "step": 237 }, { "epoch": 0.6883586406362979, "grad_norm": 1.0010840892791748, "learning_rate": 3.4603658536585364e-05, "loss": 1.1292, "step": 238 }, { "epoch": 0.6912509038322487, "grad_norm": 0.767515242099762, "learning_rate": 3.4527439024390243e-05, "loss": 1.0865, "step": 239 }, { "epoch": 0.6941431670281996, "grad_norm": 0.7898090481758118, "learning_rate": 3.445121951219512e-05, "loss": 1.0899, "step": 240 }, { "epoch": 0.6970354302241504, "grad_norm": 0.7335265874862671, "learning_rate": 3.4375e-05, "loss": 1.033, "step": 241 }, { "epoch": 0.6999276934201012, "grad_norm": 0.8223006129264832, "learning_rate": 3.429878048780488e-05, "loss": 1.195, "step": 242 }, { "epoch": 0.702819956616052, "grad_norm": 0.8035436868667603, "learning_rate": 3.422256097560976e-05, "loss": 1.0006, "step": 243 }, { "epoch": 0.7057122198120029, "grad_norm": 0.7428767681121826, "learning_rate": 3.414634146341464e-05, "loss": 1.092, "step": 244 }, { "epoch": 0.7086044830079538, "grad_norm": 0.7584668397903442, "learning_rate": 3.407012195121951e-05, "loss": 1.1246, "step": 245 }, { "epoch": 0.7114967462039046, "grad_norm": 0.7379582524299622, "learning_rate": 3.399390243902439e-05, "loss": 1.1107, "step": 246 }, { "epoch": 0.7143890093998554, "grad_norm": 0.7565631866455078, "learning_rate": 3.391768292682927e-05, "loss": 1.1014, "step": 247 }, { "epoch": 0.7172812725958062, "grad_norm": 0.78312748670578, "learning_rate": 3.384146341463415e-05, "loss": 1.0298, "step": 248 }, { "epoch": 0.720173535791757, "grad_norm": 0.7658934593200684, "learning_rate": 3.376524390243902e-05, "loss": 1.0314, "step": 249 }, { "epoch": 0.7230657989877078, "grad_norm": 0.7525564432144165, "learning_rate": 3.368902439024391e-05, "loss": 1.0405, "step": 250 }, { "epoch": 0.7259580621836587, "grad_norm": 0.7480136752128601, "learning_rate": 3.361280487804878e-05, "loss": 1.0912, "step": 251 }, { "epoch": 0.7288503253796096, "grad_norm": 0.7331277132034302, "learning_rate": 3.353658536585366e-05, "loss": 1.102, "step": 252 }, { "epoch": 0.7317425885755604, "grad_norm": 0.7302331924438477, "learning_rate": 3.346036585365854e-05, "loss": 1.0178, "step": 253 }, { "epoch": 0.7346348517715112, "grad_norm": 0.7028475999832153, "learning_rate": 3.338414634146342e-05, "loss": 0.997, "step": 254 }, { "epoch": 0.737527114967462, "grad_norm": 0.7154017090797424, "learning_rate": 3.330792682926829e-05, "loss": 1.0404, "step": 255 }, { "epoch": 0.7404193781634129, "grad_norm": 0.7640696167945862, "learning_rate": 3.323170731707317e-05, "loss": 1.1071, "step": 256 }, { "epoch": 0.7433116413593637, "grad_norm": 0.7853246331214905, "learning_rate": 3.315548780487805e-05, "loss": 1.0511, "step": 257 }, { "epoch": 0.7462039045553145, "grad_norm": 0.7854739427566528, "learning_rate": 3.307926829268293e-05, "loss": 1.0859, "step": 258 }, { "epoch": 0.7490961677512654, "grad_norm": 0.7378141283988953, "learning_rate": 3.30030487804878e-05, "loss": 1.0932, "step": 259 }, { "epoch": 0.7519884309472162, "grad_norm": 0.7881212830543518, "learning_rate": 3.292682926829269e-05, "loss": 1.0923, "step": 260 }, { "epoch": 0.754880694143167, "grad_norm": 0.7434545755386353, "learning_rate": 3.285060975609756e-05, "loss": 1.0612, "step": 261 }, { "epoch": 0.7577729573391179, "grad_norm": 0.7590733766555786, "learning_rate": 3.277439024390244e-05, "loss": 1.099, "step": 262 }, { "epoch": 0.7606652205350687, "grad_norm": 0.809688925743103, "learning_rate": 3.269817073170732e-05, "loss": 1.1674, "step": 263 }, { "epoch": 0.7635574837310195, "grad_norm": 0.7180957198143005, "learning_rate": 3.26219512195122e-05, "loss": 1.1196, "step": 264 }, { "epoch": 0.7664497469269703, "grad_norm": 0.7526130676269531, "learning_rate": 3.254573170731707e-05, "loss": 0.9961, "step": 265 }, { "epoch": 0.7693420101229211, "grad_norm": 0.8099539279937744, "learning_rate": 3.246951219512195e-05, "loss": 1.0742, "step": 266 }, { "epoch": 0.7722342733188721, "grad_norm": 0.7374089360237122, "learning_rate": 3.239329268292683e-05, "loss": 1.0845, "step": 267 }, { "epoch": 0.7751265365148229, "grad_norm": 0.6704961061477661, "learning_rate": 3.231707317073171e-05, "loss": 0.9631, "step": 268 }, { "epoch": 0.7780187997107737, "grad_norm": 0.7654604315757751, "learning_rate": 3.224085365853658e-05, "loss": 1.0663, "step": 269 }, { "epoch": 0.7809110629067245, "grad_norm": 0.7672616243362427, "learning_rate": 3.216463414634147e-05, "loss": 1.0802, "step": 270 }, { "epoch": 0.7838033261026753, "grad_norm": 0.7247093915939331, "learning_rate": 3.208841463414634e-05, "loss": 1.0921, "step": 271 }, { "epoch": 0.7866955892986262, "grad_norm": 0.75218266248703, "learning_rate": 3.201219512195122e-05, "loss": 1.1062, "step": 272 }, { "epoch": 0.789587852494577, "grad_norm": 0.7745797038078308, "learning_rate": 3.19359756097561e-05, "loss": 1.1105, "step": 273 }, { "epoch": 0.7924801156905278, "grad_norm": 0.7872446179389954, "learning_rate": 3.185975609756098e-05, "loss": 1.0644, "step": 274 }, { "epoch": 0.7953723788864787, "grad_norm": 0.8333762884140015, "learning_rate": 3.178353658536585e-05, "loss": 1.065, "step": 275 }, { "epoch": 0.7982646420824295, "grad_norm": 0.7147220969200134, "learning_rate": 3.170731707317073e-05, "loss": 1.1217, "step": 276 }, { "epoch": 0.8011569052783803, "grad_norm": 0.7681723237037659, "learning_rate": 3.163109756097561e-05, "loss": 1.0033, "step": 277 }, { "epoch": 0.8040491684743312, "grad_norm": 0.7502139210700989, "learning_rate": 3.155487804878049e-05, "loss": 1.0245, "step": 278 }, { "epoch": 0.806941431670282, "grad_norm": 0.7371497750282288, "learning_rate": 3.147865853658536e-05, "loss": 0.9599, "step": 279 }, { "epoch": 0.8098336948662328, "grad_norm": 0.7861061692237854, "learning_rate": 3.140243902439025e-05, "loss": 1.0698, "step": 280 }, { "epoch": 0.8127259580621836, "grad_norm": 0.7982838749885559, "learning_rate": 3.132621951219512e-05, "loss": 1.093, "step": 281 }, { "epoch": 0.8156182212581344, "grad_norm": 0.7698132991790771, "learning_rate": 3.125e-05, "loss": 0.9996, "step": 282 }, { "epoch": 0.8185104844540854, "grad_norm": 0.7293528914451599, "learning_rate": 3.117378048780488e-05, "loss": 1.0981, "step": 283 }, { "epoch": 0.8214027476500362, "grad_norm": 0.7758128643035889, "learning_rate": 3.109756097560976e-05, "loss": 1.1089, "step": 284 }, { "epoch": 0.824295010845987, "grad_norm": 0.7410516738891602, "learning_rate": 3.102134146341464e-05, "loss": 1.0829, "step": 285 }, { "epoch": 0.8271872740419378, "grad_norm": 0.7614254355430603, "learning_rate": 3.094512195121951e-05, "loss": 1.0397, "step": 286 }, { "epoch": 0.8300795372378886, "grad_norm": 0.7554497718811035, "learning_rate": 3.08689024390244e-05, "loss": 1.0749, "step": 287 }, { "epoch": 0.8329718004338394, "grad_norm": 0.7554106116294861, "learning_rate": 3.079268292682927e-05, "loss": 1.0298, "step": 288 }, { "epoch": 0.8358640636297903, "grad_norm": 0.7850284576416016, "learning_rate": 3.071646341463415e-05, "loss": 1.0809, "step": 289 }, { "epoch": 0.8387563268257412, "grad_norm": 0.7142320275306702, "learning_rate": 3.064024390243903e-05, "loss": 1.0664, "step": 290 }, { "epoch": 0.841648590021692, "grad_norm": 0.7595747113227844, "learning_rate": 3.056402439024391e-05, "loss": 1.0581, "step": 291 }, { "epoch": 0.8445408532176428, "grad_norm": 0.8003636598587036, "learning_rate": 3.048780487804878e-05, "loss": 1.0977, "step": 292 }, { "epoch": 0.8474331164135936, "grad_norm": 0.7981911301612854, "learning_rate": 3.0411585365853663e-05, "loss": 1.0425, "step": 293 }, { "epoch": 0.8503253796095445, "grad_norm": 0.7293020486831665, "learning_rate": 3.0335365853658536e-05, "loss": 1.086, "step": 294 }, { "epoch": 0.8532176428054953, "grad_norm": 0.7135725617408752, "learning_rate": 3.025914634146342e-05, "loss": 1.1027, "step": 295 }, { "epoch": 0.8561099060014461, "grad_norm": 0.7151292562484741, "learning_rate": 3.0182926829268294e-05, "loss": 1.1234, "step": 296 }, { "epoch": 0.8590021691973969, "grad_norm": 0.7805321216583252, "learning_rate": 3.0106707317073174e-05, "loss": 1.0833, "step": 297 }, { "epoch": 0.8618944323933478, "grad_norm": 0.7318261861801147, "learning_rate": 3.003048780487805e-05, "loss": 1.0742, "step": 298 }, { "epoch": 0.8647866955892987, "grad_norm": 0.7618130445480347, "learning_rate": 2.995426829268293e-05, "loss": 1.0974, "step": 299 }, { "epoch": 0.8676789587852495, "grad_norm": 0.7759801745414734, "learning_rate": 2.9878048780487805e-05, "loss": 1.0236, "step": 300 }, { "epoch": 0.8705712219812003, "grad_norm": 0.7935881614685059, "learning_rate": 2.9801829268292684e-05, "loss": 1.0511, "step": 301 }, { "epoch": 0.8734634851771511, "grad_norm": 0.7859032154083252, "learning_rate": 2.972560975609756e-05, "loss": 1.0469, "step": 302 }, { "epoch": 0.8763557483731019, "grad_norm": 0.7812406420707703, "learning_rate": 2.9649390243902443e-05, "loss": 1.0289, "step": 303 }, { "epoch": 0.8792480115690527, "grad_norm": 0.7637215256690979, "learning_rate": 2.9573170731707316e-05, "loss": 0.9902, "step": 304 }, { "epoch": 0.8821402747650036, "grad_norm": 0.7497740983963013, "learning_rate": 2.9496951219512198e-05, "loss": 1.0487, "step": 305 }, { "epoch": 0.8850325379609545, "grad_norm": 0.7327484488487244, "learning_rate": 2.9420731707317074e-05, "loss": 1.1966, "step": 306 }, { "epoch": 0.8879248011569053, "grad_norm": 0.7829355597496033, "learning_rate": 2.9344512195121954e-05, "loss": 1.0982, "step": 307 }, { "epoch": 0.8908170643528561, "grad_norm": 0.7765836119651794, "learning_rate": 2.926829268292683e-05, "loss": 0.9476, "step": 308 }, { "epoch": 0.8937093275488069, "grad_norm": 0.7646698951721191, "learning_rate": 2.919207317073171e-05, "loss": 1.1214, "step": 309 }, { "epoch": 0.8966015907447578, "grad_norm": 0.7531141638755798, "learning_rate": 2.9115853658536585e-05, "loss": 1.0438, "step": 310 }, { "epoch": 0.8994938539407086, "grad_norm": 0.7788392305374146, "learning_rate": 2.9039634146341464e-05, "loss": 1.0591, "step": 311 }, { "epoch": 0.9023861171366594, "grad_norm": 0.7006287574768066, "learning_rate": 2.896341463414634e-05, "loss": 1.0351, "step": 312 }, { "epoch": 0.9052783803326103, "grad_norm": 0.8054205775260925, "learning_rate": 2.8887195121951223e-05, "loss": 1.1357, "step": 313 }, { "epoch": 0.9081706435285611, "grad_norm": 0.7643339037895203, "learning_rate": 2.8810975609756095e-05, "loss": 1.073, "step": 314 }, { "epoch": 0.911062906724512, "grad_norm": 0.7552357316017151, "learning_rate": 2.8734756097560978e-05, "loss": 1.0199, "step": 315 }, { "epoch": 0.9139551699204628, "grad_norm": 0.7398456931114197, "learning_rate": 2.8658536585365854e-05, "loss": 1.0532, "step": 316 }, { "epoch": 0.9168474331164136, "grad_norm": 0.7522266507148743, "learning_rate": 2.8582317073170733e-05, "loss": 1.0824, "step": 317 }, { "epoch": 0.9197396963123644, "grad_norm": 0.7729273438453674, "learning_rate": 2.850609756097561e-05, "loss": 1.0282, "step": 318 }, { "epoch": 0.9226319595083152, "grad_norm": 0.7700569033622742, "learning_rate": 2.842987804878049e-05, "loss": 1.0654, "step": 319 }, { "epoch": 0.925524222704266, "grad_norm": 0.7540171146392822, "learning_rate": 2.8353658536585365e-05, "loss": 1.0615, "step": 320 }, { "epoch": 0.928416485900217, "grad_norm": 0.7484927773475647, "learning_rate": 2.8277439024390244e-05, "loss": 1.0276, "step": 321 }, { "epoch": 0.9313087490961678, "grad_norm": 0.793731153011322, "learning_rate": 2.820121951219512e-05, "loss": 1.0536, "step": 322 }, { "epoch": 0.9342010122921186, "grad_norm": 0.7182806134223938, "learning_rate": 2.8125000000000003e-05, "loss": 1.0135, "step": 323 }, { "epoch": 0.9370932754880694, "grad_norm": 0.7177212834358215, "learning_rate": 2.8048780487804882e-05, "loss": 1.02, "step": 324 }, { "epoch": 0.9399855386840202, "grad_norm": 0.7477127909660339, "learning_rate": 2.7972560975609758e-05, "loss": 1.09, "step": 325 }, { "epoch": 0.9428778018799711, "grad_norm": 0.7824453115463257, "learning_rate": 2.7896341463414637e-05, "loss": 0.9806, "step": 326 }, { "epoch": 0.9457700650759219, "grad_norm": 0.7952285408973694, "learning_rate": 2.7820121951219513e-05, "loss": 1.0417, "step": 327 }, { "epoch": 0.9486623282718727, "grad_norm": 0.8422231674194336, "learning_rate": 2.7743902439024393e-05, "loss": 1.0552, "step": 328 }, { "epoch": 0.9515545914678236, "grad_norm": 0.8023759722709656, "learning_rate": 2.766768292682927e-05, "loss": 1.0695, "step": 329 }, { "epoch": 0.9544468546637744, "grad_norm": 0.7767244577407837, "learning_rate": 2.759146341463415e-05, "loss": 1.1414, "step": 330 }, { "epoch": 0.9573391178597253, "grad_norm": 0.7687296271324158, "learning_rate": 2.7515243902439024e-05, "loss": 1.0518, "step": 331 }, { "epoch": 0.9602313810556761, "grad_norm": 0.76921147108078, "learning_rate": 2.7439024390243906e-05, "loss": 1.0616, "step": 332 }, { "epoch": 0.9631236442516269, "grad_norm": 0.7176332473754883, "learning_rate": 2.7362804878048782e-05, "loss": 1.0969, "step": 333 }, { "epoch": 0.9660159074475777, "grad_norm": 0.7853028774261475, "learning_rate": 2.7286585365853662e-05, "loss": 1.0375, "step": 334 }, { "epoch": 0.9689081706435285, "grad_norm": 0.7683706879615784, "learning_rate": 2.7210365853658538e-05, "loss": 0.9734, "step": 335 }, { "epoch": 0.9718004338394793, "grad_norm": 0.8103812336921692, "learning_rate": 2.7134146341463417e-05, "loss": 1.0579, "step": 336 }, { "epoch": 0.9746926970354303, "grad_norm": 0.7865802049636841, "learning_rate": 2.7057926829268293e-05, "loss": 1.019, "step": 337 }, { "epoch": 0.9775849602313811, "grad_norm": 0.7285350561141968, "learning_rate": 2.6981707317073172e-05, "loss": 1.0886, "step": 338 }, { "epoch": 0.9804772234273319, "grad_norm": 0.7790278196334839, "learning_rate": 2.6905487804878048e-05, "loss": 1.03, "step": 339 }, { "epoch": 0.9833694866232827, "grad_norm": 0.8020289540290833, "learning_rate": 2.682926829268293e-05, "loss": 1.0997, "step": 340 }, { "epoch": 0.9862617498192335, "grad_norm": 0.7671722173690796, "learning_rate": 2.6753048780487804e-05, "loss": 1.1381, "step": 341 }, { "epoch": 0.9891540130151844, "grad_norm": 0.8592469096183777, "learning_rate": 2.6676829268292686e-05, "loss": 1.0825, "step": 342 }, { "epoch": 0.9920462762111352, "grad_norm": 0.7508606910705566, "learning_rate": 2.6600609756097562e-05, "loss": 1.0808, "step": 343 }, { "epoch": 0.9949385394070861, "grad_norm": 0.7976868152618408, "learning_rate": 2.652439024390244e-05, "loss": 1.0345, "step": 344 }, { "epoch": 0.9978308026030369, "grad_norm": 0.7527894973754883, "learning_rate": 2.6448170731707318e-05, "loss": 0.9788, "step": 345 }, { "epoch": 1.0, "grad_norm": 0.7162013053894043, "learning_rate": 2.6371951219512197e-05, "loss": 0.6875, "step": 346 }, { "epoch": 1.002892263195951, "grad_norm": 0.7367959022521973, "learning_rate": 2.6295731707317073e-05, "loss": 0.8764, "step": 347 }, { "epoch": 1.0057845263919016, "grad_norm": 0.7669069170951843, "learning_rate": 2.6219512195121952e-05, "loss": 0.8188, "step": 348 }, { "epoch": 1.0086767895878526, "grad_norm": 0.7791001200675964, "learning_rate": 2.6143292682926828e-05, "loss": 0.9138, "step": 349 }, { "epoch": 1.0115690527838033, "grad_norm": 0.7576078772544861, "learning_rate": 2.606707317073171e-05, "loss": 0.7908, "step": 350 }, { "epoch": 1.0144613159797542, "grad_norm": 0.7850218415260315, "learning_rate": 2.5990853658536583e-05, "loss": 0.9152, "step": 351 }, { "epoch": 1.017353579175705, "grad_norm": 0.9033083319664001, "learning_rate": 2.5914634146341466e-05, "loss": 0.8214, "step": 352 }, { "epoch": 1.0202458423716558, "grad_norm": 0.91056889295578, "learning_rate": 2.5838414634146342e-05, "loss": 0.873, "step": 353 }, { "epoch": 1.0231381055676068, "grad_norm": 0.9178743958473206, "learning_rate": 2.576219512195122e-05, "loss": 0.8357, "step": 354 }, { "epoch": 1.0260303687635575, "grad_norm": 0.9112760424613953, "learning_rate": 2.5685975609756097e-05, "loss": 0.8381, "step": 355 }, { "epoch": 1.0289226319595084, "grad_norm": 0.874699056148529, "learning_rate": 2.5609756097560977e-05, "loss": 0.8443, "step": 356 }, { "epoch": 1.031814895155459, "grad_norm": 0.866185188293457, "learning_rate": 2.5533536585365853e-05, "loss": 0.8651, "step": 357 }, { "epoch": 1.03470715835141, "grad_norm": 0.8335126042366028, "learning_rate": 2.5457317073170732e-05, "loss": 0.7468, "step": 358 }, { "epoch": 1.0375994215473607, "grad_norm": 0.8364746570587158, "learning_rate": 2.5381097560975608e-05, "loss": 0.8368, "step": 359 }, { "epoch": 1.0404916847433117, "grad_norm": 0.887727677822113, "learning_rate": 2.530487804878049e-05, "loss": 0.8161, "step": 360 }, { "epoch": 1.0433839479392624, "grad_norm": 0.8570895791053772, "learning_rate": 2.5228658536585363e-05, "loss": 0.7743, "step": 361 }, { "epoch": 1.0462762111352133, "grad_norm": 0.8758525252342224, "learning_rate": 2.5152439024390246e-05, "loss": 0.7668, "step": 362 }, { "epoch": 1.0491684743311642, "grad_norm": 0.9433422088623047, "learning_rate": 2.5076219512195122e-05, "loss": 0.8556, "step": 363 }, { "epoch": 1.052060737527115, "grad_norm": 0.957084596157074, "learning_rate": 2.5e-05, "loss": 0.859, "step": 364 }, { "epoch": 1.0549530007230659, "grad_norm": 0.9015299677848816, "learning_rate": 2.492378048780488e-05, "loss": 0.7513, "step": 365 }, { "epoch": 1.0578452639190166, "grad_norm": 0.8645225763320923, "learning_rate": 2.4847560975609756e-05, "loss": 0.7758, "step": 366 }, { "epoch": 1.0607375271149675, "grad_norm": 0.8781758546829224, "learning_rate": 2.4771341463414636e-05, "loss": 0.7608, "step": 367 }, { "epoch": 1.0636297903109182, "grad_norm": 0.9088943600654602, "learning_rate": 2.4695121951219512e-05, "loss": 0.8187, "step": 368 }, { "epoch": 1.0665220535068691, "grad_norm": 0.8699431419372559, "learning_rate": 2.461890243902439e-05, "loss": 0.885, "step": 369 }, { "epoch": 1.06941431670282, "grad_norm": 0.8766498565673828, "learning_rate": 2.454268292682927e-05, "loss": 0.8439, "step": 370 }, { "epoch": 1.0723065798987708, "grad_norm": 0.9093021154403687, "learning_rate": 2.4466463414634146e-05, "loss": 0.8731, "step": 371 }, { "epoch": 1.0751988430947217, "grad_norm": 0.9020785689353943, "learning_rate": 2.4390243902439026e-05, "loss": 0.8291, "step": 372 }, { "epoch": 1.0780911062906724, "grad_norm": 0.8650471568107605, "learning_rate": 2.43140243902439e-05, "loss": 0.8439, "step": 373 }, { "epoch": 1.0809833694866233, "grad_norm": 0.9382796883583069, "learning_rate": 2.423780487804878e-05, "loss": 0.8312, "step": 374 }, { "epoch": 1.083875632682574, "grad_norm": 0.8890308737754822, "learning_rate": 2.416158536585366e-05, "loss": 0.8552, "step": 375 }, { "epoch": 1.086767895878525, "grad_norm": 0.9097614884376526, "learning_rate": 2.4085365853658536e-05, "loss": 0.8513, "step": 376 }, { "epoch": 1.0896601590744757, "grad_norm": 0.9238763451576233, "learning_rate": 2.4009146341463416e-05, "loss": 0.7782, "step": 377 }, { "epoch": 1.0925524222704266, "grad_norm": 0.917517364025116, "learning_rate": 2.393292682926829e-05, "loss": 0.7853, "step": 378 }, { "epoch": 1.0954446854663775, "grad_norm": 0.954457700252533, "learning_rate": 2.385670731707317e-05, "loss": 0.8102, "step": 379 }, { "epoch": 1.0983369486623282, "grad_norm": 0.9540069699287415, "learning_rate": 2.378048780487805e-05, "loss": 0.8117, "step": 380 }, { "epoch": 1.1012292118582792, "grad_norm": 0.8629953265190125, "learning_rate": 2.3704268292682926e-05, "loss": 0.8483, "step": 381 }, { "epoch": 1.1041214750542299, "grad_norm": 0.9152767658233643, "learning_rate": 2.3628048780487806e-05, "loss": 0.7391, "step": 382 }, { "epoch": 1.1070137382501808, "grad_norm": 0.9119929671287537, "learning_rate": 2.355182926829268e-05, "loss": 0.8084, "step": 383 }, { "epoch": 1.1099060014461315, "grad_norm": 0.9688836932182312, "learning_rate": 2.347560975609756e-05, "loss": 0.8794, "step": 384 }, { "epoch": 1.1127982646420824, "grad_norm": 0.8734216094017029, "learning_rate": 2.339939024390244e-05, "loss": 0.771, "step": 385 }, { "epoch": 1.1156905278380334, "grad_norm": 0.936385452747345, "learning_rate": 2.332317073170732e-05, "loss": 0.843, "step": 386 }, { "epoch": 1.118582791033984, "grad_norm": 0.8708637356758118, "learning_rate": 2.32469512195122e-05, "loss": 0.8005, "step": 387 }, { "epoch": 1.121475054229935, "grad_norm": 0.9174913167953491, "learning_rate": 2.3170731707317075e-05, "loss": 0.7858, "step": 388 }, { "epoch": 1.1243673174258857, "grad_norm": 0.8793891668319702, "learning_rate": 2.3094512195121954e-05, "loss": 0.7827, "step": 389 }, { "epoch": 1.1272595806218366, "grad_norm": 0.9375653266906738, "learning_rate": 2.301829268292683e-05, "loss": 0.8587, "step": 390 }, { "epoch": 1.1301518438177873, "grad_norm": 0.9476063251495361, "learning_rate": 2.294207317073171e-05, "loss": 0.8222, "step": 391 }, { "epoch": 1.1330441070137383, "grad_norm": 0.8776272535324097, "learning_rate": 2.286585365853659e-05, "loss": 0.8089, "step": 392 }, { "epoch": 1.1359363702096892, "grad_norm": 0.8908610343933105, "learning_rate": 2.2789634146341465e-05, "loss": 0.8531, "step": 393 }, { "epoch": 1.13882863340564, "grad_norm": 0.9270078539848328, "learning_rate": 2.2713414634146344e-05, "loss": 0.8842, "step": 394 }, { "epoch": 1.1417208966015908, "grad_norm": 0.9019871354103088, "learning_rate": 2.263719512195122e-05, "loss": 0.7006, "step": 395 }, { "epoch": 1.1446131597975415, "grad_norm": 0.9170034527778625, "learning_rate": 2.25609756097561e-05, "loss": 0.8055, "step": 396 }, { "epoch": 1.1475054229934925, "grad_norm": 0.9285536408424377, "learning_rate": 2.248475609756098e-05, "loss": 0.8192, "step": 397 }, { "epoch": 1.1503976861894432, "grad_norm": 0.9291247725486755, "learning_rate": 2.2408536585365855e-05, "loss": 0.7733, "step": 398 }, { "epoch": 1.153289949385394, "grad_norm": 0.893548846244812, "learning_rate": 2.2332317073170734e-05, "loss": 0.8112, "step": 399 }, { "epoch": 1.1561822125813448, "grad_norm": 0.933894693851471, "learning_rate": 2.225609756097561e-05, "loss": 0.8244, "step": 400 }, { "epoch": 1.1590744757772957, "grad_norm": 0.8933086395263672, "learning_rate": 2.217987804878049e-05, "loss": 0.799, "step": 401 }, { "epoch": 1.1619667389732466, "grad_norm": 0.8862596750259399, "learning_rate": 2.210365853658537e-05, "loss": 0.7522, "step": 402 }, { "epoch": 1.1648590021691974, "grad_norm": 0.9892849922180176, "learning_rate": 2.2027439024390244e-05, "loss": 0.8144, "step": 403 }, { "epoch": 1.1677512653651483, "grad_norm": 0.8950841426849365, "learning_rate": 2.1951219512195124e-05, "loss": 0.8498, "step": 404 }, { "epoch": 1.170643528561099, "grad_norm": 0.9264621734619141, "learning_rate": 2.1875e-05, "loss": 0.8619, "step": 405 }, { "epoch": 1.17353579175705, "grad_norm": 0.9350318908691406, "learning_rate": 2.179878048780488e-05, "loss": 0.901, "step": 406 }, { "epoch": 1.1764280549530008, "grad_norm": 0.8909422755241394, "learning_rate": 2.172256097560976e-05, "loss": 0.7969, "step": 407 }, { "epoch": 1.1793203181489516, "grad_norm": 0.9076801538467407, "learning_rate": 2.1646341463414634e-05, "loss": 0.8102, "step": 408 }, { "epoch": 1.1822125813449025, "grad_norm": 0.9365906715393066, "learning_rate": 2.1570121951219514e-05, "loss": 0.8216, "step": 409 }, { "epoch": 1.1851048445408532, "grad_norm": 0.9423839449882507, "learning_rate": 2.149390243902439e-05, "loss": 0.8007, "step": 410 }, { "epoch": 1.1879971077368041, "grad_norm": 0.9760177135467529, "learning_rate": 2.141768292682927e-05, "loss": 0.7394, "step": 411 }, { "epoch": 1.1908893709327548, "grad_norm": 0.9895643591880798, "learning_rate": 2.134146341463415e-05, "loss": 0.8613, "step": 412 }, { "epoch": 1.1937816341287057, "grad_norm": 0.9074323177337646, "learning_rate": 2.1265243902439024e-05, "loss": 0.7996, "step": 413 }, { "epoch": 1.1966738973246565, "grad_norm": 0.9774613380432129, "learning_rate": 2.1189024390243904e-05, "loss": 0.7982, "step": 414 }, { "epoch": 1.1995661605206074, "grad_norm": 0.9536191821098328, "learning_rate": 2.111280487804878e-05, "loss": 0.8498, "step": 415 }, { "epoch": 1.2024584237165583, "grad_norm": 0.9640031456947327, "learning_rate": 2.103658536585366e-05, "loss": 0.7995, "step": 416 }, { "epoch": 1.205350686912509, "grad_norm": 0.9486613869667053, "learning_rate": 2.0960365853658538e-05, "loss": 0.8277, "step": 417 }, { "epoch": 1.20824295010846, "grad_norm": 0.9539316296577454, "learning_rate": 2.0884146341463414e-05, "loss": 0.8163, "step": 418 }, { "epoch": 1.2111352133044107, "grad_norm": 0.9421859383583069, "learning_rate": 2.0807926829268294e-05, "loss": 0.8645, "step": 419 }, { "epoch": 1.2140274765003616, "grad_norm": 0.9420467615127563, "learning_rate": 2.073170731707317e-05, "loss": 0.7646, "step": 420 }, { "epoch": 1.2169197396963123, "grad_norm": 0.8715965151786804, "learning_rate": 2.065548780487805e-05, "loss": 0.819, "step": 421 }, { "epoch": 1.2198120028922632, "grad_norm": 0.8634954690933228, "learning_rate": 2.0579268292682928e-05, "loss": 0.8478, "step": 422 }, { "epoch": 1.222704266088214, "grad_norm": 0.9214886426925659, "learning_rate": 2.0503048780487804e-05, "loss": 0.8249, "step": 423 }, { "epoch": 1.2255965292841648, "grad_norm": 0.9319393634796143, "learning_rate": 2.0426829268292683e-05, "loss": 0.8251, "step": 424 }, { "epoch": 1.2284887924801158, "grad_norm": 0.9580456018447876, "learning_rate": 2.035060975609756e-05, "loss": 0.8139, "step": 425 }, { "epoch": 1.2313810556760665, "grad_norm": 0.9004295468330383, "learning_rate": 2.0274390243902442e-05, "loss": 0.7768, "step": 426 }, { "epoch": 1.2342733188720174, "grad_norm": 0.9250595569610596, "learning_rate": 2.0198170731707318e-05, "loss": 0.7709, "step": 427 }, { "epoch": 1.2371655820679681, "grad_norm": 0.9740453362464905, "learning_rate": 2.0121951219512197e-05, "loss": 0.8407, "step": 428 }, { "epoch": 1.240057845263919, "grad_norm": 0.9681423306465149, "learning_rate": 2.0045731707317077e-05, "loss": 0.7929, "step": 429 }, { "epoch": 1.2429501084598698, "grad_norm": 0.9964022040367126, "learning_rate": 1.9969512195121953e-05, "loss": 0.7823, "step": 430 }, { "epoch": 1.2458423716558207, "grad_norm": 1.0318474769592285, "learning_rate": 1.9893292682926832e-05, "loss": 0.8579, "step": 431 }, { "epoch": 1.2487346348517716, "grad_norm": 0.9292550086975098, "learning_rate": 1.9817073170731708e-05, "loss": 0.815, "step": 432 }, { "epoch": 1.2516268980477223, "grad_norm": 0.9619131088256836, "learning_rate": 1.9740853658536587e-05, "loss": 0.8136, "step": 433 }, { "epoch": 1.2545191612436732, "grad_norm": 0.9113368391990662, "learning_rate": 1.9664634146341467e-05, "loss": 0.7857, "step": 434 }, { "epoch": 1.257411424439624, "grad_norm": 0.9458669424057007, "learning_rate": 1.9588414634146343e-05, "loss": 0.8051, "step": 435 }, { "epoch": 1.2603036876355749, "grad_norm": 0.9174255132675171, "learning_rate": 1.9512195121951222e-05, "loss": 0.8014, "step": 436 }, { "epoch": 1.2631959508315256, "grad_norm": 0.961124837398529, "learning_rate": 1.9435975609756098e-05, "loss": 0.8441, "step": 437 }, { "epoch": 1.2660882140274765, "grad_norm": 1.0305391550064087, "learning_rate": 1.9359756097560977e-05, "loss": 0.8183, "step": 438 }, { "epoch": 1.2689804772234274, "grad_norm": 0.939954936504364, "learning_rate": 1.9283536585365857e-05, "loss": 0.7894, "step": 439 }, { "epoch": 1.2718727404193781, "grad_norm": 0.921103835105896, "learning_rate": 1.9207317073170733e-05, "loss": 0.7405, "step": 440 }, { "epoch": 1.274765003615329, "grad_norm": 0.926176130771637, "learning_rate": 1.9131097560975612e-05, "loss": 0.7853, "step": 441 }, { "epoch": 1.2776572668112798, "grad_norm": 0.9235204458236694, "learning_rate": 1.9054878048780488e-05, "loss": 0.8532, "step": 442 }, { "epoch": 1.2805495300072307, "grad_norm": 0.9539816975593567, "learning_rate": 1.8978658536585367e-05, "loss": 0.7904, "step": 443 }, { "epoch": 1.2834417932031814, "grad_norm": 0.9811721444129944, "learning_rate": 1.8902439024390246e-05, "loss": 0.824, "step": 444 }, { "epoch": 1.2863340563991323, "grad_norm": 0.900104284286499, "learning_rate": 1.8826219512195122e-05, "loss": 0.762, "step": 445 }, { "epoch": 1.289226319595083, "grad_norm": 0.9972739815711975, "learning_rate": 1.8750000000000002e-05, "loss": 0.8043, "step": 446 }, { "epoch": 1.292118582791034, "grad_norm": 0.9787886738777161, "learning_rate": 1.8673780487804878e-05, "loss": 0.8379, "step": 447 }, { "epoch": 1.295010845986985, "grad_norm": 1.0129365921020508, "learning_rate": 1.8597560975609757e-05, "loss": 0.8211, "step": 448 }, { "epoch": 1.2979031091829356, "grad_norm": 0.9614445567131042, "learning_rate": 1.8521341463414636e-05, "loss": 0.811, "step": 449 }, { "epoch": 1.3007953723788865, "grad_norm": 0.9432827830314636, "learning_rate": 1.8445121951219512e-05, "loss": 0.8049, "step": 450 }, { "epoch": 1.3036876355748372, "grad_norm": 0.9323035478591919, "learning_rate": 1.836890243902439e-05, "loss": 0.8285, "step": 451 }, { "epoch": 1.3065798987707882, "grad_norm": 0.979387640953064, "learning_rate": 1.8292682926829268e-05, "loss": 0.833, "step": 452 }, { "epoch": 1.309472161966739, "grad_norm": 0.9406694173812866, "learning_rate": 1.8216463414634147e-05, "loss": 0.823, "step": 453 }, { "epoch": 1.3123644251626898, "grad_norm": 0.9428540468215942, "learning_rate": 1.8140243902439026e-05, "loss": 0.7691, "step": 454 }, { "epoch": 1.3152566883586405, "grad_norm": 0.9734871983528137, "learning_rate": 1.8064024390243902e-05, "loss": 0.7952, "step": 455 }, { "epoch": 1.3181489515545914, "grad_norm": 0.9358460307121277, "learning_rate": 1.798780487804878e-05, "loss": 0.7799, "step": 456 }, { "epoch": 1.3210412147505424, "grad_norm": 0.9847381711006165, "learning_rate": 1.7911585365853658e-05, "loss": 0.8272, "step": 457 }, { "epoch": 1.323933477946493, "grad_norm": 1.0185282230377197, "learning_rate": 1.7835365853658537e-05, "loss": 0.7397, "step": 458 }, { "epoch": 1.326825741142444, "grad_norm": 1.019514560699463, "learning_rate": 1.7759146341463416e-05, "loss": 0.8922, "step": 459 }, { "epoch": 1.3297180043383947, "grad_norm": 1.0088555812835693, "learning_rate": 1.7682926829268292e-05, "loss": 0.8657, "step": 460 }, { "epoch": 1.3326102675343456, "grad_norm": 0.9719268679618835, "learning_rate": 1.760670731707317e-05, "loss": 0.8074, "step": 461 }, { "epoch": 1.3355025307302966, "grad_norm": 0.9707063436508179, "learning_rate": 1.7530487804878047e-05, "loss": 0.7983, "step": 462 }, { "epoch": 1.3383947939262473, "grad_norm": 1.0087740421295166, "learning_rate": 1.7454268292682927e-05, "loss": 0.8205, "step": 463 }, { "epoch": 1.3412870571221982, "grad_norm": 0.957075297832489, "learning_rate": 1.7378048780487806e-05, "loss": 0.8248, "step": 464 }, { "epoch": 1.344179320318149, "grad_norm": 0.9987917542457581, "learning_rate": 1.7301829268292682e-05, "loss": 0.8194, "step": 465 }, { "epoch": 1.3470715835140998, "grad_norm": 0.959826648235321, "learning_rate": 1.722560975609756e-05, "loss": 0.754, "step": 466 }, { "epoch": 1.3499638467100505, "grad_norm": 0.9746386408805847, "learning_rate": 1.714939024390244e-05, "loss": 0.7998, "step": 467 }, { "epoch": 1.3528561099060015, "grad_norm": 0.9507508873939514, "learning_rate": 1.707317073170732e-05, "loss": 0.7447, "step": 468 }, { "epoch": 1.3557483731019522, "grad_norm": 1.0092105865478516, "learning_rate": 1.6996951219512196e-05, "loss": 0.8063, "step": 469 }, { "epoch": 1.358640636297903, "grad_norm": 0.973320484161377, "learning_rate": 1.6920731707317075e-05, "loss": 0.7818, "step": 470 }, { "epoch": 1.361532899493854, "grad_norm": 0.9913963675498962, "learning_rate": 1.6844512195121955e-05, "loss": 0.8006, "step": 471 }, { "epoch": 1.3644251626898047, "grad_norm": 1.0580593347549438, "learning_rate": 1.676829268292683e-05, "loss": 0.8488, "step": 472 }, { "epoch": 1.3673174258857557, "grad_norm": 0.9785270094871521, "learning_rate": 1.669207317073171e-05, "loss": 0.8249, "step": 473 }, { "epoch": 1.3702096890817064, "grad_norm": 0.981171727180481, "learning_rate": 1.6615853658536586e-05, "loss": 0.7762, "step": 474 }, { "epoch": 1.3731019522776573, "grad_norm": 1.0523923635482788, "learning_rate": 1.6539634146341465e-05, "loss": 0.7582, "step": 475 }, { "epoch": 1.3759942154736082, "grad_norm": 1.0290507078170776, "learning_rate": 1.6463414634146345e-05, "loss": 0.7927, "step": 476 }, { "epoch": 1.378886478669559, "grad_norm": 0.9900729060173035, "learning_rate": 1.638719512195122e-05, "loss": 0.7436, "step": 477 }, { "epoch": 1.3817787418655096, "grad_norm": 0.9794175028800964, "learning_rate": 1.63109756097561e-05, "loss": 0.7744, "step": 478 }, { "epoch": 1.3846710050614606, "grad_norm": 1.0114864110946655, "learning_rate": 1.6234756097560976e-05, "loss": 0.8683, "step": 479 }, { "epoch": 1.3875632682574115, "grad_norm": 1.026435375213623, "learning_rate": 1.6158536585365855e-05, "loss": 0.8049, "step": 480 }, { "epoch": 1.3904555314533622, "grad_norm": 1.0069879293441772, "learning_rate": 1.6082317073170734e-05, "loss": 0.9052, "step": 481 }, { "epoch": 1.3933477946493131, "grad_norm": 0.9856945276260376, "learning_rate": 1.600609756097561e-05, "loss": 0.8129, "step": 482 }, { "epoch": 1.3962400578452638, "grad_norm": 0.9632019400596619, "learning_rate": 1.592987804878049e-05, "loss": 0.7651, "step": 483 }, { "epoch": 1.3991323210412148, "grad_norm": 0.9180967807769775, "learning_rate": 1.5853658536585366e-05, "loss": 0.7798, "step": 484 }, { "epoch": 1.4020245842371657, "grad_norm": 0.9854956269264221, "learning_rate": 1.5777439024390245e-05, "loss": 0.7869, "step": 485 }, { "epoch": 1.4049168474331164, "grad_norm": 0.9699094891548157, "learning_rate": 1.5701219512195124e-05, "loss": 0.7424, "step": 486 }, { "epoch": 1.407809110629067, "grad_norm": 1.0167737007141113, "learning_rate": 1.5625e-05, "loss": 0.8369, "step": 487 }, { "epoch": 1.410701373825018, "grad_norm": 0.9676855802536011, "learning_rate": 1.554878048780488e-05, "loss": 0.8397, "step": 488 }, { "epoch": 1.413593637020969, "grad_norm": 0.974721372127533, "learning_rate": 1.5472560975609756e-05, "loss": 0.7772, "step": 489 }, { "epoch": 1.4164859002169197, "grad_norm": 0.981971800327301, "learning_rate": 1.5396341463414635e-05, "loss": 0.8626, "step": 490 }, { "epoch": 1.4193781634128706, "grad_norm": 1.004634976387024, "learning_rate": 1.5320121951219514e-05, "loss": 0.7482, "step": 491 }, { "epoch": 1.4222704266088213, "grad_norm": 0.995227575302124, "learning_rate": 1.524390243902439e-05, "loss": 0.7898, "step": 492 }, { "epoch": 1.4251626898047722, "grad_norm": 0.9808421730995178, "learning_rate": 1.5167682926829268e-05, "loss": 0.7677, "step": 493 }, { "epoch": 1.4280549530007232, "grad_norm": 0.9480452537536621, "learning_rate": 1.5091463414634147e-05, "loss": 0.7852, "step": 494 }, { "epoch": 1.4309472161966739, "grad_norm": 0.9107538461685181, "learning_rate": 1.5015243902439025e-05, "loss": 0.8798, "step": 495 }, { "epoch": 1.4338394793926248, "grad_norm": 0.9696621894836426, "learning_rate": 1.4939024390243902e-05, "loss": 0.8056, "step": 496 }, { "epoch": 1.4367317425885755, "grad_norm": 1.025511384010315, "learning_rate": 1.486280487804878e-05, "loss": 0.8319, "step": 497 }, { "epoch": 1.4396240057845264, "grad_norm": 0.9872826337814331, "learning_rate": 1.4786585365853658e-05, "loss": 0.7518, "step": 498 }, { "epoch": 1.4425162689804774, "grad_norm": 0.9867232441902161, "learning_rate": 1.4710365853658537e-05, "loss": 0.7372, "step": 499 }, { "epoch": 1.445408532176428, "grad_norm": 1.0221909284591675, "learning_rate": 1.4634146341463415e-05, "loss": 0.764, "step": 500 }, { "epoch": 1.4483007953723788, "grad_norm": 0.9744577407836914, "learning_rate": 1.4557926829268292e-05, "loss": 0.7816, "step": 501 }, { "epoch": 1.4511930585683297, "grad_norm": 0.9650794267654419, "learning_rate": 1.448170731707317e-05, "loss": 0.7687, "step": 502 }, { "epoch": 1.4540853217642806, "grad_norm": 1.067771077156067, "learning_rate": 1.4405487804878048e-05, "loss": 0.7803, "step": 503 }, { "epoch": 1.4569775849602313, "grad_norm": 1.0217148065567017, "learning_rate": 1.4329268292682927e-05, "loss": 0.8766, "step": 504 }, { "epoch": 1.4598698481561823, "grad_norm": 0.9869562983512878, "learning_rate": 1.4253048780487805e-05, "loss": 0.7447, "step": 505 }, { "epoch": 1.462762111352133, "grad_norm": 1.004603385925293, "learning_rate": 1.4176829268292682e-05, "loss": 0.7725, "step": 506 }, { "epoch": 1.465654374548084, "grad_norm": 1.0009071826934814, "learning_rate": 1.410060975609756e-05, "loss": 0.8871, "step": 507 }, { "epoch": 1.4685466377440348, "grad_norm": 1.0561660528182983, "learning_rate": 1.4024390243902441e-05, "loss": 0.7484, "step": 508 }, { "epoch": 1.4714389009399855, "grad_norm": 0.9575408101081848, "learning_rate": 1.3948170731707319e-05, "loss": 0.7578, "step": 509 }, { "epoch": 1.4743311641359362, "grad_norm": 1.0391199588775635, "learning_rate": 1.3871951219512196e-05, "loss": 0.8065, "step": 510 }, { "epoch": 1.4772234273318872, "grad_norm": 1.00908625125885, "learning_rate": 1.3795731707317076e-05, "loss": 0.7456, "step": 511 }, { "epoch": 1.480115690527838, "grad_norm": 0.9751247763633728, "learning_rate": 1.3719512195121953e-05, "loss": 0.6815, "step": 512 }, { "epoch": 1.4830079537237888, "grad_norm": 1.007405161857605, "learning_rate": 1.3643292682926831e-05, "loss": 0.7728, "step": 513 }, { "epoch": 1.4859002169197397, "grad_norm": 0.9923568964004517, "learning_rate": 1.3567073170731709e-05, "loss": 0.7887, "step": 514 }, { "epoch": 1.4887924801156904, "grad_norm": 0.9783514142036438, "learning_rate": 1.3490853658536586e-05, "loss": 0.8677, "step": 515 }, { "epoch": 1.4916847433116414, "grad_norm": 0.9877396821975708, "learning_rate": 1.3414634146341466e-05, "loss": 0.8264, "step": 516 }, { "epoch": 1.4945770065075923, "grad_norm": 0.973827600479126, "learning_rate": 1.3338414634146343e-05, "loss": 0.8344, "step": 517 }, { "epoch": 1.497469269703543, "grad_norm": 0.9245984554290771, "learning_rate": 1.326219512195122e-05, "loss": 0.7671, "step": 518 }, { "epoch": 1.5003615328994937, "grad_norm": 1.0020720958709717, "learning_rate": 1.3185975609756098e-05, "loss": 0.794, "step": 519 }, { "epoch": 1.5032537960954446, "grad_norm": 0.9446883797645569, "learning_rate": 1.3109756097560976e-05, "loss": 0.7783, "step": 520 }, { "epoch": 1.5061460592913956, "grad_norm": 0.9875244498252869, "learning_rate": 1.3033536585365855e-05, "loss": 0.8469, "step": 521 }, { "epoch": 1.5090383224873465, "grad_norm": 1.0033190250396729, "learning_rate": 1.2957317073170733e-05, "loss": 0.8749, "step": 522 }, { "epoch": 1.5119305856832972, "grad_norm": 0.9534813165664673, "learning_rate": 1.288109756097561e-05, "loss": 0.8684, "step": 523 }, { "epoch": 1.514822848879248, "grad_norm": 0.9435486793518066, "learning_rate": 1.2804878048780488e-05, "loss": 0.8012, "step": 524 }, { "epoch": 1.5177151120751988, "grad_norm": 1.0029319524765015, "learning_rate": 1.2728658536585366e-05, "loss": 0.762, "step": 525 }, { "epoch": 1.5206073752711498, "grad_norm": 1.0000132322311401, "learning_rate": 1.2652439024390245e-05, "loss": 0.7812, "step": 526 }, { "epoch": 1.5234996384671005, "grad_norm": 0.9410236477851868, "learning_rate": 1.2576219512195123e-05, "loss": 0.775, "step": 527 }, { "epoch": 1.5263919016630514, "grad_norm": 0.9614347815513611, "learning_rate": 1.25e-05, "loss": 0.7783, "step": 528 }, { "epoch": 1.529284164859002, "grad_norm": 0.9015387296676636, "learning_rate": 1.2423780487804878e-05, "loss": 0.7767, "step": 529 }, { "epoch": 1.532176428054953, "grad_norm": 0.9506531357765198, "learning_rate": 1.2347560975609756e-05, "loss": 0.7928, "step": 530 }, { "epoch": 1.535068691250904, "grad_norm": 1.0034101009368896, "learning_rate": 1.2271341463414635e-05, "loss": 0.794, "step": 531 }, { "epoch": 1.5379609544468547, "grad_norm": 1.0089356899261475, "learning_rate": 1.2195121951219513e-05, "loss": 0.7306, "step": 532 }, { "epoch": 1.5408532176428054, "grad_norm": 1.0234556198120117, "learning_rate": 1.211890243902439e-05, "loss": 0.7613, "step": 533 }, { "epoch": 1.5437454808387563, "grad_norm": 0.9771298170089722, "learning_rate": 1.2042682926829268e-05, "loss": 0.7869, "step": 534 }, { "epoch": 1.5466377440347072, "grad_norm": 1.019014835357666, "learning_rate": 1.1966463414634146e-05, "loss": 0.8096, "step": 535 }, { "epoch": 1.5495300072306581, "grad_norm": 0.95261150598526, "learning_rate": 1.1890243902439025e-05, "loss": 0.843, "step": 536 }, { "epoch": 1.5524222704266089, "grad_norm": 0.9801099300384521, "learning_rate": 1.1814024390243903e-05, "loss": 0.7219, "step": 537 }, { "epoch": 1.5553145336225596, "grad_norm": 1.0174713134765625, "learning_rate": 1.173780487804878e-05, "loss": 0.787, "step": 538 }, { "epoch": 1.5582067968185105, "grad_norm": 1.119850754737854, "learning_rate": 1.166158536585366e-05, "loss": 0.8341, "step": 539 }, { "epoch": 1.5610990600144614, "grad_norm": 0.996792733669281, "learning_rate": 1.1585365853658537e-05, "loss": 0.8291, "step": 540 }, { "epoch": 1.5639913232104121, "grad_norm": 1.0276952981948853, "learning_rate": 1.1509146341463415e-05, "loss": 0.7911, "step": 541 }, { "epoch": 1.5668835864063628, "grad_norm": 0.9893227815628052, "learning_rate": 1.1432926829268294e-05, "loss": 0.8017, "step": 542 }, { "epoch": 1.5697758496023138, "grad_norm": 1.0083463191986084, "learning_rate": 1.1356707317073172e-05, "loss": 0.8681, "step": 543 }, { "epoch": 1.5726681127982647, "grad_norm": 1.0352839231491089, "learning_rate": 1.128048780487805e-05, "loss": 0.7451, "step": 544 }, { "epoch": 1.5755603759942156, "grad_norm": 1.0231815576553345, "learning_rate": 1.1204268292682927e-05, "loss": 0.7971, "step": 545 }, { "epoch": 1.5784526391901663, "grad_norm": 0.9740004539489746, "learning_rate": 1.1128048780487805e-05, "loss": 0.7174, "step": 546 }, { "epoch": 1.581344902386117, "grad_norm": 0.9921448826789856, "learning_rate": 1.1051829268292684e-05, "loss": 0.7669, "step": 547 }, { "epoch": 1.584237165582068, "grad_norm": 0.9635536670684814, "learning_rate": 1.0975609756097562e-05, "loss": 0.7851, "step": 548 }, { "epoch": 1.5871294287780189, "grad_norm": 0.9930370450019836, "learning_rate": 1.089939024390244e-05, "loss": 0.749, "step": 549 }, { "epoch": 1.5900216919739696, "grad_norm": 1.0188409090042114, "learning_rate": 1.0823170731707317e-05, "loss": 0.8287, "step": 550 }, { "epoch": 1.5929139551699205, "grad_norm": 0.9855648875236511, "learning_rate": 1.0746951219512195e-05, "loss": 0.7985, "step": 551 }, { "epoch": 1.5958062183658712, "grad_norm": 1.0312644243240356, "learning_rate": 1.0670731707317074e-05, "loss": 0.824, "step": 552 }, { "epoch": 1.5986984815618221, "grad_norm": 0.9914786219596863, "learning_rate": 1.0594512195121952e-05, "loss": 0.8491, "step": 553 }, { "epoch": 1.601590744757773, "grad_norm": 1.0038225650787354, "learning_rate": 1.051829268292683e-05, "loss": 0.8882, "step": 554 }, { "epoch": 1.6044830079537238, "grad_norm": 1.0336111783981323, "learning_rate": 1.0442073170731707e-05, "loss": 0.7973, "step": 555 }, { "epoch": 1.6073752711496745, "grad_norm": 0.9833325743675232, "learning_rate": 1.0365853658536585e-05, "loss": 0.7918, "step": 556 }, { "epoch": 1.6102675343456254, "grad_norm": 1.0113708972930908, "learning_rate": 1.0289634146341464e-05, "loss": 0.803, "step": 557 }, { "epoch": 1.6131597975415763, "grad_norm": 1.0248537063598633, "learning_rate": 1.0213414634146342e-05, "loss": 0.8015, "step": 558 }, { "epoch": 1.6160520607375273, "grad_norm": 0.9835037589073181, "learning_rate": 1.0137195121951221e-05, "loss": 0.7493, "step": 559 }, { "epoch": 1.618944323933478, "grad_norm": 0.9587700963020325, "learning_rate": 1.0060975609756099e-05, "loss": 0.7041, "step": 560 }, { "epoch": 1.6218365871294287, "grad_norm": 1.0020424127578735, "learning_rate": 9.984756097560976e-06, "loss": 0.7743, "step": 561 }, { "epoch": 1.6247288503253796, "grad_norm": 1.0215778350830078, "learning_rate": 9.908536585365854e-06, "loss": 0.9143, "step": 562 }, { "epoch": 1.6276211135213305, "grad_norm": 1.05181086063385, "learning_rate": 9.832317073170733e-06, "loss": 0.7612, "step": 563 }, { "epoch": 1.6305133767172812, "grad_norm": 0.9703447222709656, "learning_rate": 9.756097560975611e-06, "loss": 0.7819, "step": 564 }, { "epoch": 1.633405639913232, "grad_norm": 1.0287517309188843, "learning_rate": 9.679878048780489e-06, "loss": 0.8443, "step": 565 }, { "epoch": 1.6362979031091829, "grad_norm": 1.0159296989440918, "learning_rate": 9.603658536585366e-06, "loss": 0.7781, "step": 566 }, { "epoch": 1.6391901663051338, "grad_norm": 1.0067027807235718, "learning_rate": 9.527439024390244e-06, "loss": 0.7417, "step": 567 }, { "epoch": 1.6420824295010847, "grad_norm": 1.067325472831726, "learning_rate": 9.451219512195123e-06, "loss": 0.856, "step": 568 }, { "epoch": 1.6449746926970354, "grad_norm": 1.0160930156707764, "learning_rate": 9.375000000000001e-06, "loss": 0.856, "step": 569 }, { "epoch": 1.6478669558929862, "grad_norm": 0.9937707781791687, "learning_rate": 9.298780487804879e-06, "loss": 0.7341, "step": 570 }, { "epoch": 1.650759219088937, "grad_norm": 1.0597978830337524, "learning_rate": 9.222560975609756e-06, "loss": 0.7363, "step": 571 }, { "epoch": 1.653651482284888, "grad_norm": 1.0080229043960571, "learning_rate": 9.146341463414634e-06, "loss": 0.7734, "step": 572 }, { "epoch": 1.6565437454808387, "grad_norm": 1.0394561290740967, "learning_rate": 9.070121951219513e-06, "loss": 0.8179, "step": 573 }, { "epoch": 1.6594360086767896, "grad_norm": 1.0613329410552979, "learning_rate": 8.99390243902439e-06, "loss": 0.8376, "step": 574 }, { "epoch": 1.6623282718727403, "grad_norm": 1.0188164710998535, "learning_rate": 8.917682926829268e-06, "loss": 0.7931, "step": 575 }, { "epoch": 1.6652205350686913, "grad_norm": 0.9689257740974426, "learning_rate": 8.841463414634146e-06, "loss": 0.8066, "step": 576 }, { "epoch": 1.6681127982646422, "grad_norm": 0.9878205060958862, "learning_rate": 8.765243902439024e-06, "loss": 0.7386, "step": 577 }, { "epoch": 1.671005061460593, "grad_norm": 0.9607040286064148, "learning_rate": 8.689024390243903e-06, "loss": 0.7762, "step": 578 }, { "epoch": 1.6738973246565436, "grad_norm": 0.934492290019989, "learning_rate": 8.61280487804878e-06, "loss": 0.8317, "step": 579 }, { "epoch": 1.6767895878524945, "grad_norm": 1.0009124279022217, "learning_rate": 8.53658536585366e-06, "loss": 0.7755, "step": 580 }, { "epoch": 1.6796818510484455, "grad_norm": 0.9868451952934265, "learning_rate": 8.460365853658538e-06, "loss": 0.7688, "step": 581 }, { "epoch": 1.6825741142443964, "grad_norm": 1.0356996059417725, "learning_rate": 8.384146341463415e-06, "loss": 0.7601, "step": 582 }, { "epoch": 1.685466377440347, "grad_norm": 1.0577391386032104, "learning_rate": 8.307926829268293e-06, "loss": 0.7847, "step": 583 }, { "epoch": 1.6883586406362978, "grad_norm": 1.0306715965270996, "learning_rate": 8.231707317073172e-06, "loss": 0.8193, "step": 584 }, { "epoch": 1.6912509038322487, "grad_norm": 1.04917311668396, "learning_rate": 8.15548780487805e-06, "loss": 0.7714, "step": 585 }, { "epoch": 1.6941431670281997, "grad_norm": 0.9596878290176392, "learning_rate": 8.079268292682928e-06, "loss": 0.8267, "step": 586 }, { "epoch": 1.6970354302241504, "grad_norm": 1.041686773300171, "learning_rate": 8.003048780487805e-06, "loss": 0.7706, "step": 587 }, { "epoch": 1.699927693420101, "grad_norm": 1.0023382902145386, "learning_rate": 7.926829268292683e-06, "loss": 0.8456, "step": 588 }, { "epoch": 1.702819956616052, "grad_norm": 1.009926438331604, "learning_rate": 7.850609756097562e-06, "loss": 0.7796, "step": 589 }, { "epoch": 1.705712219812003, "grad_norm": 1.0054479837417603, "learning_rate": 7.77439024390244e-06, "loss": 0.7221, "step": 590 }, { "epoch": 1.7086044830079539, "grad_norm": 0.9531407952308655, "learning_rate": 7.698170731707317e-06, "loss": 0.7801, "step": 591 }, { "epoch": 1.7114967462039046, "grad_norm": 1.0707489252090454, "learning_rate": 7.621951219512195e-06, "loss": 0.8474, "step": 592 }, { "epoch": 1.7143890093998553, "grad_norm": 1.0391806364059448, "learning_rate": 7.545731707317074e-06, "loss": 0.8122, "step": 593 }, { "epoch": 1.7172812725958062, "grad_norm": 0.9896015524864197, "learning_rate": 7.469512195121951e-06, "loss": 0.8505, "step": 594 }, { "epoch": 1.7201735357917571, "grad_norm": 1.122521162033081, "learning_rate": 7.393292682926829e-06, "loss": 0.878, "step": 595 }, { "epoch": 1.7230657989877078, "grad_norm": 1.0091516971588135, "learning_rate": 7.317073170731707e-06, "loss": 0.7846, "step": 596 }, { "epoch": 1.7259580621836585, "grad_norm": 0.9725529551506042, "learning_rate": 7.240853658536585e-06, "loss": 0.8274, "step": 597 }, { "epoch": 1.7288503253796095, "grad_norm": 1.0169364213943481, "learning_rate": 7.1646341463414635e-06, "loss": 0.9092, "step": 598 }, { "epoch": 1.7317425885755604, "grad_norm": 0.9752337336540222, "learning_rate": 7.088414634146341e-06, "loss": 0.7489, "step": 599 }, { "epoch": 1.7346348517715113, "grad_norm": 1.0482772588729858, "learning_rate": 7.0121951219512205e-06, "loss": 0.7379, "step": 600 }, { "epoch": 1.737527114967462, "grad_norm": 0.9847067594528198, "learning_rate": 6.935975609756098e-06, "loss": 0.7102, "step": 601 }, { "epoch": 1.7404193781634127, "grad_norm": 0.9766717553138733, "learning_rate": 6.859756097560977e-06, "loss": 0.8012, "step": 602 }, { "epoch": 1.7433116413593637, "grad_norm": 0.9498171806335449, "learning_rate": 6.783536585365854e-06, "loss": 0.7409, "step": 603 }, { "epoch": 1.7462039045553146, "grad_norm": 1.0003339052200317, "learning_rate": 6.707317073170733e-06, "loss": 0.7585, "step": 604 }, { "epoch": 1.7490961677512655, "grad_norm": 1.0416187047958374, "learning_rate": 6.63109756097561e-06, "loss": 0.7591, "step": 605 }, { "epoch": 1.7519884309472162, "grad_norm": 0.9981351494789124, "learning_rate": 6.554878048780488e-06, "loss": 0.741, "step": 606 }, { "epoch": 1.754880694143167, "grad_norm": 0.998756468296051, "learning_rate": 6.4786585365853665e-06, "loss": 0.8408, "step": 607 }, { "epoch": 1.7577729573391179, "grad_norm": 1.0053471326828003, "learning_rate": 6.402439024390244e-06, "loss": 0.7636, "step": 608 }, { "epoch": 1.7606652205350688, "grad_norm": 1.0228371620178223, "learning_rate": 6.326219512195123e-06, "loss": 0.7811, "step": 609 }, { "epoch": 1.7635574837310195, "grad_norm": 1.0302461385726929, "learning_rate": 6.25e-06, "loss": 0.7339, "step": 610 }, { "epoch": 1.7664497469269702, "grad_norm": 1.0541510581970215, "learning_rate": 6.173780487804878e-06, "loss": 0.7718, "step": 611 }, { "epoch": 1.7693420101229211, "grad_norm": 0.9746615290641785, "learning_rate": 6.0975609756097564e-06, "loss": 0.849, "step": 612 }, { "epoch": 1.772234273318872, "grad_norm": 0.9652546048164368, "learning_rate": 6.021341463414634e-06, "loss": 0.8287, "step": 613 }, { "epoch": 1.775126536514823, "grad_norm": 1.0296525955200195, "learning_rate": 5.9451219512195126e-06, "loss": 0.7493, "step": 614 }, { "epoch": 1.7780187997107737, "grad_norm": 1.045018196105957, "learning_rate": 5.86890243902439e-06, "loss": 0.7284, "step": 615 }, { "epoch": 1.7809110629067244, "grad_norm": 1.0308400392532349, "learning_rate": 5.792682926829269e-06, "loss": 0.8641, "step": 616 }, { "epoch": 1.7838033261026753, "grad_norm": 1.0580596923828125, "learning_rate": 5.716463414634147e-06, "loss": 0.8282, "step": 617 }, { "epoch": 1.7866955892986263, "grad_norm": 1.0240721702575684, "learning_rate": 5.640243902439025e-06, "loss": 0.765, "step": 618 }, { "epoch": 1.789587852494577, "grad_norm": 1.0127959251403809, "learning_rate": 5.5640243902439025e-06, "loss": 0.7923, "step": 619 }, { "epoch": 1.7924801156905277, "grad_norm": 1.1011825799942017, "learning_rate": 5.487804878048781e-06, "loss": 0.7251, "step": 620 }, { "epoch": 1.7953723788864786, "grad_norm": 1.0520384311676025, "learning_rate": 5.411585365853659e-06, "loss": 0.7217, "step": 621 }, { "epoch": 1.7982646420824295, "grad_norm": 1.0805737972259521, "learning_rate": 5.335365853658537e-06, "loss": 0.8411, "step": 622 }, { "epoch": 1.8011569052783805, "grad_norm": 1.0442290306091309, "learning_rate": 5.259146341463415e-06, "loss": 0.7386, "step": 623 }, { "epoch": 1.8040491684743312, "grad_norm": 1.0919840335845947, "learning_rate": 5.182926829268292e-06, "loss": 0.7858, "step": 624 }, { "epoch": 1.8069414316702819, "grad_norm": 0.9759023785591125, "learning_rate": 5.106707317073171e-06, "loss": 0.697, "step": 625 }, { "epoch": 1.8098336948662328, "grad_norm": 1.017999291419983, "learning_rate": 5.030487804878049e-06, "loss": 0.8095, "step": 626 }, { "epoch": 1.8127259580621837, "grad_norm": 1.0746080875396729, "learning_rate": 4.954268292682927e-06, "loss": 0.7828, "step": 627 }, { "epoch": 1.8156182212581344, "grad_norm": 1.0229034423828125, "learning_rate": 4.8780487804878055e-06, "loss": 0.8028, "step": 628 }, { "epoch": 1.8185104844540854, "grad_norm": 1.0520620346069336, "learning_rate": 4.801829268292683e-06, "loss": 0.7629, "step": 629 }, { "epoch": 1.821402747650036, "grad_norm": 1.0495305061340332, "learning_rate": 4.725609756097562e-06, "loss": 0.7609, "step": 630 }, { "epoch": 1.824295010845987, "grad_norm": 0.9548224806785583, "learning_rate": 4.649390243902439e-06, "loss": 0.752, "step": 631 }, { "epoch": 1.827187274041938, "grad_norm": 1.0313746929168701, "learning_rate": 4.573170731707317e-06, "loss": 0.8528, "step": 632 }, { "epoch": 1.8300795372378886, "grad_norm": 1.0014350414276123, "learning_rate": 4.496951219512195e-06, "loss": 0.7587, "step": 633 }, { "epoch": 1.8329718004338393, "grad_norm": 1.069353461265564, "learning_rate": 4.420731707317073e-06, "loss": 0.8193, "step": 634 }, { "epoch": 1.8358640636297903, "grad_norm": 1.085693120956421, "learning_rate": 4.3445121951219515e-06, "loss": 0.799, "step": 635 }, { "epoch": 1.8387563268257412, "grad_norm": 0.97664475440979, "learning_rate": 4.26829268292683e-06, "loss": 0.7018, "step": 636 }, { "epoch": 1.8416485900216921, "grad_norm": 1.0830881595611572, "learning_rate": 4.192073170731708e-06, "loss": 0.7851, "step": 637 }, { "epoch": 1.8445408532176428, "grad_norm": 0.9672832489013672, "learning_rate": 4.115853658536586e-06, "loss": 0.7542, "step": 638 }, { "epoch": 1.8474331164135935, "grad_norm": 1.0837608575820923, "learning_rate": 4.039634146341464e-06, "loss": 0.8329, "step": 639 }, { "epoch": 1.8503253796095445, "grad_norm": 1.0772196054458618, "learning_rate": 3.9634146341463414e-06, "loss": 0.7884, "step": 640 }, { "epoch": 1.8532176428054954, "grad_norm": 1.1313399076461792, "learning_rate": 3.88719512195122e-06, "loss": 0.7771, "step": 641 }, { "epoch": 1.856109906001446, "grad_norm": 1.0799105167388916, "learning_rate": 3.8109756097560976e-06, "loss": 0.8173, "step": 642 }, { "epoch": 1.8590021691973968, "grad_norm": 1.035786509513855, "learning_rate": 3.7347560975609756e-06, "loss": 0.7445, "step": 643 }, { "epoch": 1.8618944323933477, "grad_norm": 1.0022109746932983, "learning_rate": 3.6585365853658537e-06, "loss": 0.7441, "step": 644 }, { "epoch": 1.8647866955892987, "grad_norm": 1.0012871026992798, "learning_rate": 3.5823170731707318e-06, "loss": 0.7731, "step": 645 }, { "epoch": 1.8676789587852496, "grad_norm": 1.0303922891616821, "learning_rate": 3.5060975609756102e-06, "loss": 0.7432, "step": 646 }, { "epoch": 1.8705712219812003, "grad_norm": 0.9990852475166321, "learning_rate": 3.4298780487804883e-06, "loss": 0.7346, "step": 647 }, { "epoch": 1.873463485177151, "grad_norm": 1.0499917268753052, "learning_rate": 3.3536585365853664e-06, "loss": 0.8286, "step": 648 }, { "epoch": 1.876355748373102, "grad_norm": 0.9858948588371277, "learning_rate": 3.277439024390244e-06, "loss": 0.7513, "step": 649 }, { "epoch": 1.8792480115690529, "grad_norm": 1.020816445350647, "learning_rate": 3.201219512195122e-06, "loss": 0.7283, "step": 650 }, { "epoch": 1.8821402747650036, "grad_norm": 1.0142725706100464, "learning_rate": 3.125e-06, "loss": 0.8384, "step": 651 }, { "epoch": 1.8850325379609545, "grad_norm": 1.0734213590621948, "learning_rate": 3.0487804878048782e-06, "loss": 0.7657, "step": 652 }, { "epoch": 1.8879248011569052, "grad_norm": 0.9841848611831665, "learning_rate": 2.9725609756097563e-06, "loss": 0.7097, "step": 653 }, { "epoch": 1.8908170643528561, "grad_norm": 1.4696120023727417, "learning_rate": 2.8963414634146343e-06, "loss": 0.6966, "step": 654 }, { "epoch": 1.893709327548807, "grad_norm": 1.0753856897354126, "learning_rate": 2.8201219512195124e-06, "loss": 0.7836, "step": 655 }, { "epoch": 1.8966015907447578, "grad_norm": 1.058305025100708, "learning_rate": 2.7439024390243905e-06, "loss": 0.7982, "step": 656 }, { "epoch": 1.8994938539407085, "grad_norm": 1.0660943984985352, "learning_rate": 2.6676829268292685e-06, "loss": 0.7404, "step": 657 }, { "epoch": 1.9023861171366594, "grad_norm": 1.0167231559753418, "learning_rate": 2.591463414634146e-06, "loss": 0.6959, "step": 658 }, { "epoch": 1.9052783803326103, "grad_norm": 0.9782930016517639, "learning_rate": 2.5152439024390247e-06, "loss": 0.7038, "step": 659 }, { "epoch": 1.9081706435285612, "grad_norm": 1.0442514419555664, "learning_rate": 2.4390243902439027e-06, "loss": 0.8573, "step": 660 }, { "epoch": 1.911062906724512, "grad_norm": 1.0171256065368652, "learning_rate": 2.362804878048781e-06, "loss": 0.7684, "step": 661 }, { "epoch": 1.9139551699204627, "grad_norm": 1.020768165588379, "learning_rate": 2.2865853658536584e-06, "loss": 0.8061, "step": 662 }, { "epoch": 1.9168474331164136, "grad_norm": 0.9942306876182556, "learning_rate": 2.2103658536585365e-06, "loss": 0.7691, "step": 663 }, { "epoch": 1.9197396963123645, "grad_norm": 0.9986061453819275, "learning_rate": 2.134146341463415e-06, "loss": 0.7012, "step": 664 }, { "epoch": 1.9226319595083152, "grad_norm": 1.0474562644958496, "learning_rate": 2.057926829268293e-06, "loss": 0.728, "step": 665 }, { "epoch": 1.925524222704266, "grad_norm": 1.0567129850387573, "learning_rate": 1.9817073170731707e-06, "loss": 0.7762, "step": 666 }, { "epoch": 1.9284164859002169, "grad_norm": 1.0257785320281982, "learning_rate": 1.9054878048780488e-06, "loss": 0.7986, "step": 667 }, { "epoch": 1.9313087490961678, "grad_norm": 0.9999968409538269, "learning_rate": 1.8292682926829268e-06, "loss": 0.7539, "step": 668 }, { "epoch": 1.9342010122921187, "grad_norm": 1.082047462463379, "learning_rate": 1.7530487804878051e-06, "loss": 0.7971, "step": 669 }, { "epoch": 1.9370932754880694, "grad_norm": 0.994654655456543, "learning_rate": 1.6768292682926832e-06, "loss": 0.7363, "step": 670 }, { "epoch": 1.9399855386840201, "grad_norm": 1.0056068897247314, "learning_rate": 1.600609756097561e-06, "loss": 0.7643, "step": 671 }, { "epoch": 1.942877801879971, "grad_norm": 1.015271782875061, "learning_rate": 1.5243902439024391e-06, "loss": 0.7108, "step": 672 }, { "epoch": 1.945770065075922, "grad_norm": 0.9946292042732239, "learning_rate": 1.4481707317073172e-06, "loss": 0.8213, "step": 673 }, { "epoch": 1.9486623282718727, "grad_norm": 0.9914453625679016, "learning_rate": 1.3719512195121952e-06, "loss": 0.7917, "step": 674 }, { "epoch": 1.9515545914678236, "grad_norm": 1.062779426574707, "learning_rate": 1.295731707317073e-06, "loss": 0.725, "step": 675 }, { "epoch": 1.9544468546637743, "grad_norm": 1.0502513647079468, "learning_rate": 1.2195121951219514e-06, "loss": 0.7978, "step": 676 }, { "epoch": 1.9573391178597253, "grad_norm": 1.0494405031204224, "learning_rate": 1.1432926829268292e-06, "loss": 0.7927, "step": 677 }, { "epoch": 1.9602313810556762, "grad_norm": 1.054677128791809, "learning_rate": 1.0670731707317075e-06, "loss": 0.7595, "step": 678 }, { "epoch": 1.9631236442516269, "grad_norm": 1.0292917490005493, "learning_rate": 9.908536585365854e-07, "loss": 0.8302, "step": 679 }, { "epoch": 1.9660159074475776, "grad_norm": 1.1083894968032837, "learning_rate": 9.146341463414634e-07, "loss": 0.8153, "step": 680 }, { "epoch": 1.9689081706435285, "grad_norm": 1.086378574371338, "learning_rate": 8.384146341463416e-07, "loss": 0.7676, "step": 681 }, { "epoch": 1.9718004338394794, "grad_norm": 1.0098559856414795, "learning_rate": 7.621951219512196e-07, "loss": 0.7764, "step": 682 }, { "epoch": 1.9746926970354304, "grad_norm": 1.0091646909713745, "learning_rate": 6.859756097560976e-07, "loss": 0.8242, "step": 683 }, { "epoch": 1.977584960231381, "grad_norm": 1.0496336221694946, "learning_rate": 6.097560975609757e-07, "loss": 0.7758, "step": 684 }, { "epoch": 1.9804772234273318, "grad_norm": 1.0282728672027588, "learning_rate": 5.335365853658538e-07, "loss": 0.7421, "step": 685 }, { "epoch": 1.9833694866232827, "grad_norm": 1.0808695554733276, "learning_rate": 4.573170731707317e-07, "loss": 0.7813, "step": 686 }, { "epoch": 1.9862617498192336, "grad_norm": 1.0309821367263794, "learning_rate": 3.810975609756098e-07, "loss": 0.7839, "step": 687 }, { "epoch": 1.9891540130151844, "grad_norm": 1.0294197797775269, "learning_rate": 3.0487804878048784e-07, "loss": 0.697, "step": 688 }, { "epoch": 1.992046276211135, "grad_norm": 1.0775706768035889, "learning_rate": 2.2865853658536586e-07, "loss": 0.7508, "step": 689 }, { "epoch": 1.994938539407086, "grad_norm": 1.0518558025360107, "learning_rate": 1.5243902439024392e-07, "loss": 0.7384, "step": 690 }, { "epoch": 1.997830802603037, "grad_norm": 1.0389012098312378, "learning_rate": 7.621951219512196e-08, "loss": 0.7942, "step": 691 } ], "logging_steps": 1, "max_steps": 691, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.687294393884475e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }